diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index cc8e8398..b64d416c 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -21,7 +21,7 @@ jobs: - os: windows-latest build_command: make build ARCH=x64 COMP=gcc OS=windows - os: macos-latest - build_command: make build ARCH=x64-modern COMP=gcc OS=osx + build_command: make build ARCH=arm COMP=gcc OS=osx steps: - uses: actions/checkout@v2 diff --git a/.gitignore b/.gitignore index 001704be..fb9fa71c 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,5 @@ +<<<<<<< HEAD +<<<<<<< HEAD bin/ data/ doc/ @@ -5,3 +7,38 @@ doc/ *.7z .DS_Store +======= +*.o +*.s +*.exe +*~ +pgopti* +*.dyn +all.gc* +generate_flip.* +generate_count_flip.* +bcnttest.* +*.utf8.c +bin/ +doc/ +<<<<<<< HEAD +>>>>>>> dd6b636 (Bcc32 friendly and minor improvement on Flip_32.) +======= +problem/ +>>>>>>> 48873fa (calc opponent_feature once in eval_open) +======= +*.o +*.s +*.exe +*~ +pgopti* +*.dyn +all.gc* +generate_flip.* +generate_count_flip.* +bcnttest.* +*.utf8.c +bin/ +doc/ +problem/ +>>>>>>> 3e1ed4f (fix cr/lf in repository to lf) diff --git a/README.md b/README.md index def41a7a..22b96b04 100644 --- a/README.md +++ b/README.md @@ -51,3 +51,111 @@ cd src doxygen open ../doc/html/index.html ``` +======= +# edax-reversi-AVX +Automatically exported from code.google.com/p/okuharaandroid-edax-reversi + +======= +# edax-reversi-AVX +Automatically exported from code.google.com/p/okuharaandroid-edax-reversi + +Edax is a strong othello program. Its main features are: + + fast bitboard based & multithreaded engine. + accurate midgame-evaluation function. + opening book learning capability. + text based rich interface. + multi-protocol support to connect to graphical interfaces or play on Internet (GGS). + multi-OS support to run under MS-Windows, Linux and Mac OS X. + +>>>>>>> 81dec96 (Kindergarten last flip for arm32; MSVC arm Windows build (not tested)) +This is SSE/AVX optimized version of Edax 4.4.0. 
Functionally equivalent to the parent project, provided no bugs are introduced. + +Thanks to AVX2, x64-modern build solves fforum-40-59.obf 60% faster than official edax-4.4 on Haswell, and runs level 30 autoplay 80% faster. + +See http://www.amy.hi-ho.ne.jp/okuhara/bitboard.htm and http://www.amy.hi-ho.ne.jp/okuhara/edaxopt.htm for optimization details in Japanese. + +## 1. Mobility (board_sse.c, board_mmx.c) + +### 1.1 new SSE2 version of get_moves +Diagonals are SIMD'd using vertical mirroring by bswap. + + Athlon -get_moves_sse + problem\fforum-20-39.obf: 111349635 nodes in 0:07.998 (13922185 nodes/s). + mobility: 81.10 < 81.28 +/- 0.17 < 82.03 + Athlon +get_moves_sse + problem\fforum-20-39.obf: 111349635 nodes in 0:07.889 (14114544 nodes/s). + mobility: 71.08 < 71.72 +/- 0.34 < 73.53 + Core2 -get_moves_sse + problem/fforum-20-39.obf: 111349635 nodes in 0:10.180 (10938078 nodes/s). + mobility: 78.06 < 78.18 +/- 0.08 < 78.41 + Core2 +get_moves_sse + problem/fforum-20-39.obf: 111349635 nodes in 0:09.978 (11159514 nodes/s). + mobility: 60.84 < 61.19 +/- 0.13 < 61.47 + +### 1.2 can_move +Now calls SIMD'd get_moves for x86/x64 build. + +## 2. Stability (board.c, board_sse.c, board_mmx.c) + +### 2.1 get_full_lines_h, get_full_lines_v +get_full_lines for horizontal and vertical are simplified. The latter is compiled into a rotation instruction. + +### 2.2 rearranged loop +The last while loop is rearranged not to call bit_count in case stable == 0. + +### 2.3 new SSE2 version with bswap and pcmpeqb + Athlon -get_stability_sse + stability: 90.10 < 90.28 +/- 0.24 < 91.20 + Athlon +get_stability_sse + stability: 81.59 < 81.93 +/- 0.73 < 86.25 + Core2 -get_stability_sse + stability: 79.24 < 79.39 +/- 0.15 < 79.93 + Core2 +get_stability_sse + stability: 71.80 < 71.85 +/- 0.06 < 72.07 + +### 2.4 get_corner_stability +Kindergarten version eliminates bit_count call. + +### 2.5 find_edge_stable +Loop optimization and flip using carry propagation. 
One-time execution, but it affects total solving time. + +## 3. eval.c (4.4.5) +Eval feature calculation using SSE2 / AVX2 (now in eval_sse.c) improves midgame by 15-30% and endgame by 8-12%. +Restoring eval from backup instead of rewinding. +eval_open (one time execution) is also optimized. + +## 4. hash.c +I think hash->data.move[0] on line 677 should be hash->data.move[1]. + +## 5. board_symetry, board_unique (board.c, board_sse.c) +SSE optimization and mirroring reduction. (Not used in solving game) + +## 6. endgame_sse.c (4.4.7) +Keep more variables in SSE registers. SSE optimized count_last_flip. Parity sort by shuffle. + +## 7. board_get_hash_code (4.5.0) +Changed to use CRC32c. This enables hardware acceleration on modern build. + +## 8. AVX2 versions (x64-modern build only) +In many cases AVX2 version is simplest, thanks to variable shift instructions (although they are 3 micro-op instructions). + +Benchmarks are on Core i5-4260U (Haswell) 1.4GHz (TB 2.7GHz) single thread. + + 4.4.0 original x64-modern clang + problem/fforum-20-39.obf: 111349635 nodes in 0:05.726 (19446321 nodes/s). + +optimizations 1-5 above, no-avx2 + problem/fforum-20-39.obf: 111349635 nodes in 0:05.342 (20844185 nodes/s). + +get_moves (board_sse.c) + problem/fforum-20-39.obf: 111349635 nodes in 0:05.142 (21654927 nodes/s). + +flip_avx.c + problem/fforum-20-39.obf: 111349635 nodes in 0:04.946 (22513068 nodes/s). + +count_last_flip_sse.c + problem/fforum-20-39.obf: 111349635 nodes in 0:04.906 (22696624 nodes/s). + +## 9. makefile +gcc-old, x86 build should be -m32, not -m64. Some flags and defines added for optimization. 
+<<<<<<< HEAD +>>>>>>> b9d48c1 (Create README.md) +======= +>>>>>>> 81dec96 (Kindergarten last flip for arm32; MSVC arm Windows build (not tested)) diff --git a/src/Android.mk b/src/Android.mk new file mode 100644 index 00000000..4a54afdc --- /dev/null +++ b/src/Android.mk @@ -0,0 +1,8 @@ +LOCAL_PATH := $(call my-dir) +include $(CLEAR_VARS) +LOCAL_MODULE := aEdax # should be renamed to lib..aEdax..so afterwords +LOCAL_CFLAGS += -DUNICODE +LOCAL_SRC_FILES := all.c board_sse.c.neon eval_sse.c.neon flip_neon_bitscan.c.neon android/cpu-features.c +LOCAL_ARM_NEON := false +# cmd-strip := +include $(BUILD_EXECUTABLE) diff --git a/src/Application.mk b/src/Application.mk new file mode 100644 index 00000000..6a59b5f1 --- /dev/null +++ b/src/Application.mk @@ -0,0 +1,3 @@ +APP_ABI := armeabi-v7a arm64-v8a x86 x86_64 +APP_PLATFORM := android-14 +APP_BUILD_SCRIPT := Android.mk diff --git a/src/Doxyfile b/src/Doxyfile index 8a539572..db85cfe5 100644 --- a/src/Doxyfile +++ b/src/Doxyfile @@ -687,7 +687,7 @@ RECURSIVE = NO # Note that relative paths are relative to the directory from which doxygen is # run. -EXCLUDE = +EXCLUDE = _*.c # The EXCLUDE_SYMLINKS tag can be used to select whether or not files or # directories that are symbolic links (a Unix file system feature) are excluded @@ -1599,7 +1599,7 @@ HIDE_UNDOC_RELATIONS = YES # toolkit from AT&T and Lucent Bell Labs. The other options in this section # have no effect if this option is set to NO (the default) -HAVE_DOT = YES +HAVE_DOT = NO # The DOT_NUM_THREADS specifies the number of dot invocations doxygen is # allowed to run in parallel. When set to 0 (the default) doxygen will @@ -1688,7 +1688,7 @@ INCLUDED_BY_GRAPH = YES # the time of a run. So in most cases it will be better to enable call graphs # for selected functions only using the \callgraph command. 
-CALL_GRAPH = YES +CALL_GRAPH = NO # If the CALLER_GRAPH and HAVE_DOT tags are set to YES then # doxygen will generate a caller dependency graph for every global function @@ -1696,7 +1696,7 @@ CALL_GRAPH = YES # the time of a run. So in most cases it will be better to enable caller # graphs for selected functions only using the \callergraph command. -CALLER_GRAPH = YES +CALLER_GRAPH = NO # If the GRAPHICAL_HIERARCHY and HAVE_DOT tags are set to YES then doxygen # will generate a graphical hierarchy of all classes instead of a textual one. diff --git a/src/Makefile b/src/Makefile index 87974154..4f6f51c3 100644 --- a/src/Makefile +++ b/src/Makefile @@ -1,9 +1,9 @@ # # makefile to Compile Edax # -# Copyright 1998 - 2012 +# Copyright 1998 - 2024 # Richard Delorme -# Version 4.3 +# Version 4.5 # # Default settings @@ -38,51 +38,89 @@ endif # gcc 4.x (x >= 7) ifeq ($(COMP),gcc) - CFLAGS = -std=c99 -pedantic -W -Wall -Wextra -pipe -D_GNU_SOURCE=1 + CFLAGS = -std=c99 -pedantic -W -Wall -Wextra -pipe -D_GNU_SOURCE=1 -DUNICODE PGO_GEN = -fprofile-generate PGO_USE = -fprofile-correction -fprofile-use - + ifeq ($(BUILD),optimize) - CFLAGS += -Ofast -fwhole-program -flto -DNDEBUG + CFLAGS += -Ofast -fwhole-program -DNDEBUG + LTOFLAG = -flto else - CFLAGS += -O0 -g -DDEBUG + CFLAGS += -O1 -g -DDEBUG endif ifeq ($(ARCH),x64-modern) - CFLAGS += -m64 -march=native -DUSE_GAS_X64 -DPOPCOUNT + CFLAGS += -m64 -march=core-avx2 -DUSE_GAS_X64 -DPOPCOUNT -DLASTFLIP_HIGHCUT + endif + ifeq ($(ARCH),x64-avx512) + CFLAGS += -m64 -march=skylake-avx512 -DUSE_GAS_X64 -DPOPCOUNT -DLASTFLIP_HIGHCUT + endif + ifeq ($(ARCH),x64-popcnt) + CFLAGS += -m64 -mpopcnt -mtune=generic -DUSE_GAS_X64 -DPOPCOUNT + endif + ifeq ($(ARCH),x64-k10) + CFLAGS += -m64 -march=amdfam10 -DUSE_GAS_X64 -DPOPCOUNT -DMOVE_GENERATOR=MOVE_GENERATOR_BITSCAN endif ifeq ($(ARCH),x32-modern) - CFLAGS += -mx32 -march=native -DUSE_GAS_X64 -DPOPCOUNT + CFLAGS += -mx32 -march=core-avx2 -DUSE_GAS_X64 -DPOPCOUNT endif ifeq ($(ARCH),x64) 
- CFLAGS += -m64 -mtune=generic -DUSE_GAS_X64 + CFLAGS += -m64 -mtune=generic -DUSE_GAS_X64 endif ifeq ($(ARCH),x32) - CFLAGS += -mx32 -march=native -DUSE_GAS_X64 + CFLAGS += -mx32 -mtune=generic -DUSE_GAS_X64 + endif + ifeq ($(ARCH),x86-sse) + CFLAGS += -m32 -march=pentium-m -mfpmath=sse -DUSE_GAS_X86 -DUSE_GAS_MMX -DhasSSE2 + ifeq ($(BUILD),optimize) + CFLAGS += -fomit-frame-pointer + endif endif ifeq ($(ARCH),x86) - CFLAGS += -m32 -mtune=generic -DUSE_GAS_X86 + CFLAGS += -m32 -march=i386 -mtune=generic -DUSE_GAS_X86 -DUSE_GAS_MMX ifeq ($(BUILD),optimize) CFLAGS += -fomit-frame-pointer endif endif - ifeq ($(ARCH),ARM) + ifeq ($(ARCH),arm) ifeq ($(BUILD),optimize) - CFLAGS += -fomit-frame-pointer -DUSE_GCC_ARM + CFLAGS += -fomit-frame-pointer endif endif - ifeq ($(ARCH),ARMv7) + ifeq ($(ARCH),armv7) + CFLAGS += -march=armv7-a ifeq ($(BUILD),optimize) - CFLAGS += -fomit-frame-pointer -march=armv7-a -mfpu=neon -DUSE_GCC_ARM + CFLAGS += -fomit-frame-pointer + endif + endif + ifeq ($(ARCH),arm-neon) + CFLAGS += -march=armv7-a+simd -mfloat-abi=softfp + ifeq ($(BUILD),optimize) + CFLAGS += -fomit-frame-pointer + endif + endif + ifeq ($(ARCH),arm-sve) + CFLAGS += -march=armv8.2-a+sve + ifeq ($(BUILD),optimize) + CFLAGS += -fomit-frame-pointer endif endif ifeq ($(OS),osx) CFLAGS += -mmacosx-version-min=10.4 + ifeq ($(ARCH),x86) + CFLAGS += -msse2 -DhasSSE2 + endif + ifeq ($(ARCH),arm) + CFLAGS += -march=armv8.3-a + endif + endif + ifeq ($(OS),android) + CFLAGS += -DANDROID=1 endif ifeq ($(OS),windows) - CFLAGS += -D__USE_MINGW_ANSI_STDIO -DWINVER=0x501 - ifeq ($(ARCH),x86) + CFLAGS += -D__USE_MINGW_ANSI_STDIO -DWINVER=0x0501 + ifneq (,$(findstring x86,$(ARCH))) CFLAGS += -DUSE_PTHREAD endif endif @@ -90,8 +128,9 @@ ifeq ($(COMP),gcc) endif ifeq ($(COMP),gcc-old) - CFLAGS = -std=c99 -pedantic -W -Wall -Wextra -pipe -D_GNU_SOURCE=1 - + CC = gcc + CFLAGS = -std=c99 -pedantic -W -Wall -Wextra -pipe -D_GNU_SOURCE=1 -DUNICODE + ifeq ($(BUILD),optimize) CFLAGS += -O3 
-fwhole-program -DNDEBUG else @@ -99,30 +138,33 @@ ifeq ($(COMP),gcc-old) endif ifeq ($(ARCH),x64-modern) - CFLAGS += -m64 -march=native -DUSE_GAS_X64 -DPOPCOUNT + CFLAGS += -m64 -march=native -DUSE_GAS_X64 -DPOPCOUNT -DLASTFLIP_HIGHCUT endif ifeq ($(ARCH),x64) CFLAGS += -m64 -mtune=generic -DUSE_GAS_X64 endif ifeq ($(ARCH),x86) - CFLAGS += -m64 -mtune=generic -DUSE_GAS_X86 + CFLAGS += -m32 -mtune=generic -DUSE_GAS_X86 -DUSE_GAS_MMX ifeq ($(BUILD),optimize) CFLAGS += -fomit-frame-pointer endif endif - ifeq ($(ARCH),ARM) + ifeq ($(ARCH),arm) ifeq ($(BUILD),optimize) CFLAGS += -fomit-frame-pointer -DUSE_GCC_ARM endif endif - ifeq ($(ARCH),ARMv7) + ifeq ($(ARCH),armv7) ifeq ($(BUILD),optimize) CFLAGS += -fomit-frame-pointer -march=armv7-a -mfpu=neon -DUSE_GCC_ARM endif endif ifeq ($(OS),osx) - CFLAGS += -mmacosx-version-min=10.4 + CFLAGS += -mmacosx-version-min=10.4 -mdynamic-no-pic + ifeq ($(ARCH),x86) + CFLAGS += -msse2 -DhasSSE2 + endif endif ifeq ($(OS),android) CFLAGS += -DANDROID=1 @@ -138,24 +180,25 @@ endif # g++ ifeq ($(COMP),g++) - CFLAGS = -x c++ -std=c++11 -pedantic -W -Wall -Wextra -pipe -D_GNU_SOURCE=1 + CFLAGS = -x c++ -std=c++11 -pedantic -W -Wall -Wextra -pipe -D_GNU_SOURCE=1 -DUNICODE PGO_GEN = -fprofile-generate PGO_USE = -fprofile-correction -fprofile-use ifeq ($(BUILD),optimize) - CFLAGS += -Ofast -fwhole-program -flto -DNDEBUG + CFLAGS += -Ofast -fwhole-program -DNDEBUG + LTOFLAG = -flto else CFLAGS += -O0 -g -DDEBUG endif ifeq ($(ARCH),x64-modern) - CFLAGS += -m64 -march=native -DUSE_GAS_X64 -DPOPCOUNT + CFLAGS += -m64 -march=native -DUSE_GAS_X64 -DPOPCOUNT -DLASTFLIP_HIGHCUT endif ifeq ($(ARCH),x64) CFLAGS += -m64 -mtune=generic -DUSE_GAS_X64 endif ifeq ($(ARCH),x86) - CFLAGS += -m32 -mtune=generic -DUSE_GAS_X86 + CFLAGS += -m32 -mtune=generic -DUSE_GAS_X86 -DUSE_GAS_MMX ifeq ($(BUILD),optimize) CFLAGS += -fomit-frame-pointer endif @@ -163,6 +206,9 @@ ifeq ($(COMP),g++) ifeq ($(OS),osx) CFLAGS += -mmacosx-version-min=10.4 -mdynamic-no-pic + 
ifeq ($(ARCH),x86) + CFLAGS += -msse2 -DhasSSE2 + endif endif ifeq ($(OS),windows) CFLAGS += -D__USE_MINGW_ANSI_STDIO -DWINVER=0x501 @@ -175,7 +221,7 @@ endif #icc ifeq ($(COMP),icc) - CFLAGS = -std=c99 -Wall -Wcheck -wd2259 -wd913 -D_GNU_SOURCE=1 + CFLAGS = -std=c99 -Wall -Wcheck -wd2259 -D_GNU_SOURCE=1 -DUNICODE -Qoption,cpp,--unicode_source_kind,"UTF-8" PGO_GEN = -prof_gen PGO_USE = -prof_use -wd11505 @@ -186,13 +232,16 @@ ifeq ($(COMP),icc) endif ifeq ($(ARCH),x64-modern) - CFLAGS += -m64 -xHOST -DUSE_GAS_X64 -DPOPCOUNT + CFLAGS += -m64 -march=core-avx2 -DUSE_GAS_X64 -DPOPCOUNT + endif + ifeq ($(ARCH),x64-avx512) + CFLAGS += -m64 -march=skylake-avx512 -DUSE_GAS_X64 -DPOPCOUNT -DLASTFLIP_HIGHCUT endif ifeq ($(ARCH),x64) - CFLAGS += -m64 -xHost -DUSE_GAS_X64 + CFLAGS += -m64 -DUSE_GAS_X64 endif ifeq ($(ARCH),x32-modern) - CFLAGS += -m64 -xHost -ipo -auto-ilp32 -DUSE_GAS_X64 -DPOPCOUNT + CFLAGS += -m64 -ipo -auto-ilp32 -DUSE_GAS_X64 -DPOPCOUNT endif ifeq ($(ARCH),x86) CFLAGS += -m32 -DUSE_GAS_X86 @@ -201,7 +250,7 @@ endif #pcc ifeq ($(COMP),pcc) - CFLAGS = - -D_GNU_SOURCE=1 + CFLAGS = - -D_GNU_SOURCE=1 -DUNICODE ifeq ($(BUILD),optimize) CFLAGS += -O4 -DNDEBUG @@ -210,7 +259,7 @@ ifeq ($(COMP),pcc) endif ifeq ($(ARCH),x64-modern) - CFLAGS += -DUSE_GAS_X64 -DPOPCOUNT + CFLAGS += -DUSE_GAS_X64 -DPOPCOUNT -DLASTFLIP_HIGHCUT endif ifeq ($(ARCH),x64) CFLAGS += -DUSE_GAS_X64 @@ -222,29 +271,39 @@ endif #clang ifeq ($(COMP),clang) - CFLAGS = -std=c99 -pedantic -W -Wall -D_GNU_SOURCE=1 -Wno-invalid-source-encoding + CFLAGS = -std=c99 -pedantic -W -Wall -D_GNU_SOURCE=1 -DUNICODE PGO_GEN = -fprofile-instr-generate PGO_USE = -fprofile-instr-use=edax.profdata PGO = llvm-profdata merge -output=edax.profdata $(BIN)/*.profraw ifeq ($(BUILD),optimize) - CFLAGS += -O3 -flto -ffast-math -fomit-frame-pointer -DNDEBUG -fuse-ld=gold + CFLAGS += -O3 -ffast-math -fomit-frame-pointer -DNDEBUG + LTOFLAG = -flto else CFLAGS += -O0 -g -DDEBUG endif ifeq ($(ARCH),x64-modern) - CFLAGS += -m64 
-march=native -DUSE_GAS_X64 -DPOPCOUNT + CFLAGS += -m64 -march=core-avx2 -DUSE_GAS_X64 -DPOPCOUNT -DLASTFLIP_HIGHCUT + endif + ifeq ($(ARCH),x64-avx512) + CFLAGS += -m64 -march=skylake-avx512 -DUSE_GAS_X64 -DPOPCOUNT endif ifeq ($(ARCH),x64) CFLAGS += -m64 -DUSE_GAS_X64 endif ifeq ($(ARCH),x86) - CFLAGS += -m32 -DUSE_GAS_X86 + CFLAGS += -m32 -DUSE_GAS_X86 -DUSE_GAS_MMX endif ifeq ($(OS),osx) CFLAGS += -mmacosx-version-min=10.4 -mdynamic-no-pic + ifeq ($(ARCH),x86) + CFLAGS += -msse2 -DhasSSE2 + endif + ifeq ($(ARCH),arm) + CFLAGS += -march=armv8.3-a + endif endif endif @@ -259,25 +318,16 @@ endif ifeq ($(OS),windows) EXE = wEdax-$(ARCH).exe LIBS += -lws2_32 - ifeq ($(ARCH),x86) - LIBS += -lpthread + ifneq (,$(findstring x86,$(ARCH))) + LIBS += -Bstatic -Wl,-Bstatic,-lpthread endif endif ifeq ($(OS),osx) - EXE = mEdax + EXE = mEdax-$(ARCH) LIBS += -lpthread endif -ifeq ($(ARCH),x64) - CFLAGS += -DHAS_CPU_64 -endif -ifeq ($(ARCH),x64-modern) - CFLAGS += -DHAS_CPU_64 -endif -ifeq ($(ARCH),x32) - CFLAGS += -DHAS_CPU_64 -endif -ifeq ($(ARCH),x32-modern) +ifneq (,$(findstring x64,$(ARCH))$(findstring x32,$(ARCH))) CFLAGS += -DHAS_CPU_64 endif @@ -308,8 +358,8 @@ help: @echo " x32-modern x32 with popcount" @echo " x32 x32" @echo " x86 x86" - @echo " ARM ARM v5 & up" - @echo " ARMv7 ARM v7-a" + @echo " arm arm v5 & up" + @echo " armv7 arm v7-a" @echo "" @echo "Compilers:" @echo " gcc GNU C compiler version >= 4.6" @@ -328,50 +378,59 @@ help: build: @echo "building edax..." - $(CC) $(CFLAGS) all.c -o $(BIN)/$(EXE) $(LIBS) + $(CC) $(CFLAGS) $(LTOFLAG) all.c -s -o $(BIN)/$(EXE) $(LIBS) + +source: + $(CC) $(CFLAGS) -S all.c pgo-build: @echo "building edax with pgo..." 
$(MAKE) clean - $(CC) $(CFLAGS) $(PGO_GEN) all.c -o $(BIN)/$(EXE) $(LIBS) + $(CC) $(CFLAGS) $(LTOFLAG) $(PGO_GEN) all.c -o $(BIN)/$(EXE) $(LIBS) cd $(BIN); echo -e 'count games 10\ncount positions 9\n' | ./$(EXE) - cd $(BIN); ./$(EXE) -l 60 -solve problem/fforum-20-39.obf + cd $(BIN); ./$(EXE) -l 60 -solve ../problem/fforum-20-39.obf cd $(BIN); ./$(EXE) -l 18 -auto-store on -auto-start on -repeat 2 -auto-quit on -mode 2 -book-file book.pgo rm -f ../bin/book.pgo ../bin/book.pgo.store $(PGO) - $(CC) $(CFLAGS) $(PGO_USE) all.c -s -o $(BIN)/$(EXE) $(LIBS) + $(CC) $(CFLAGS) $(LTOFLAG) $(PGO_USE) all.c -s -o $(BIN)/$(EXE) $(LIBS) pgo-rebuild: @echo "rebuilding edax with pgo..." - $(CC) $(CFLAGS) $(PGO_USE) all.c -s -o $(BIN)/$(EXE) $(LIBS) + $(CC) $(CFLAGS) $(LTOFLAG) $(PGO_USE) all.c -s -o $(BIN)/$(EXE) $(LIBS) prof: @echo "rebuilding edax for profiling..." - $(CC) $(CFLAGS) all.c -g -inline-level=0 -o $(BIN)/$(EXE) $(LIBS) -lprofiler + $(CC) $(CFLAGS) $(LTOFLAG) all.c -pg -o $(BIN)/$(EXE) $(LIBS) # -inline-level=0 -lprofiler debug: - make ARCH=$(ARCH) COMP=$(COMP) OS=$(OS) BUILD=debug build + make ARCH=$(ARCH) COMP=$(COMP) OS=$(OS) BUILD=debug prof release: $(MAKE) pgo-build ARCH=x64 OS=linux COMP=gcc $(MAKE) build ARCH=x64 OS=windows COMP=gcc CC='x86_64-w64-mingw32-gcc' $(MAKE) build ARCH=x86 OS=windows COMP=gcc CC='i686-w64-mingw32-gcc' - $(MAKE) build ARCH=ARMv7 OS=android COMP=gcc-old CC='arm-linux-androideabi-gcc' + $(MAKE) build ARCH=armv7 OS=android COMP=gcc CC='arm-linux-androideabi-gcc --sysroot=$SYSROOT' $(MAKE) clean $(MAKE) build ARCH=x64 OS=osx COMP=gcc-old CC=i686-apple-darwin10-gcc android: - $(MAKE) build ARCH=ARMv7 OS=android COMP=gcc-old CC='arm-linux-androideabi-gcc' + ndk-build NDK_PROJECT_PATH=. 
NDK_APPLICATION_MK=./Application.mk # NDK_DEBUG=1 + +macuniversal: + $(MAKE) build ARCH=x86 OS=osx COMP=clang + $(MAKE) build ARCH=x64 OS=osx COMP=clang + lipo -create -arch i686 ../bin/mEdax-x86 -arch x86_64 ../bin/mEdax-x64 -arch arm64 ../bin/mEdax-arm -output ../bin/mEdax + rm -f ../bin/mEdax-x86 ../bin/mEdax-x64 clean: rm -f pgopti* *.dyn all.gc* *~ *.o generate_flip generate_count_flip *.prof* noip: - $(CC) -g $(CFLAGS) $(SRC) -o $(BIN)/$(EXE) $(LIBS) + $(CC) -g $(CFLAGS) $(LTOFLAG) $(SRC) -o $(BIN)/$(EXE) $(LIBS) code: - $(CC) $(CFLAGS) generate_flip.c -o generate_flip - $(CC) $(CFLAGS) generate_count_flip.c -o generate_count_flip + $(CC) $(CFLAGS) $(LTOFLAG) generate_flip.c -o generate_flip + $(CC) $(CFLAGS) $(LTOFLAG) generate_count_flip.c -o generate_count_flip generate_flip generate_count_flip diff --git a/src/NMakefile b/src/NMakefile index 73a9ab40..ba1a68ef 100644 --- a/src/NMakefile +++ b/src/NMakefile @@ -4,35 +4,112 @@ # Compilation options for Microsoft Visual C++ & nmake. # # If you have a CPU supporting popcount (Intel Nehalem (i7) or AMD Barcelona or greater), -# add /D "POPCOUNT" to the compiler option set. +# add /D POPCOUNT to the compiler option set. # -# Microsoft Visual C++ 2008 or better for Windows (7 or Vista) 64bits. -W64_VC_FLAGS = /I"..\include" /O2 /GL /fp:fast /favor:INTEL64 /arch:AVX\ - /D "NDEBUG" /D "inline=__inline" /D "__func__=__FUNCTION__" /D "USE_MSVC_X64" /D "POPCOUNT"\ - /MT +# Microsoft Visual C++ 2008 or better for Windows (7 or Vista). +# VC_FLAGS = /source-charset:.1252 /execution-charset:.1252\ +VC_FLAGS = /D UNICODE /utf-8 /D _CRT_SECURE_NO_DEPRECATE /I"..\include" /O2 /fp:fast /GS- /D NDEBUG /MT -# Microsoft Visual C++ 2008 of better for Windows (7 or Vista or xp) 32bits. 
-W32_VC_FLAGS = /I"..\include" /O2 /GL /fp:fast\ - /D "NDEBUG" /D "inline=__inline" /D "__func__=__FUNCTION__" /D "USE_MASM_X86"\ - /MT /L ws2_32 +vc-w64-modern: + cl $(VC_FLAGS) /GL /D HAS_CPU_64 /arch:AVX2 /D POPCOUNT /D LASTFLIP_HIGHCUT all.c ws2_32.lib /Fe..\bin\wEdax-w64-modern.exe /link /VERSION:4.5 -win-vc-x64: - cl $(W64_VC_FLAGS) all.c ws2_32.lib /Fe..\bin\wEdax-w64.exe /link /machine:x64 /VERSION:4.4 +vc-w64-avx512: + cl $(VC_FLAGS) /GL /D HAS_CPU_64 /arch:AVX512 /D POPCOUNT /D LASTFLIP_HIGHCUT all.c ws2_32.lib /Fe..\bin\wEdax-w64-avx512.exe /link /VERSION:4.5 +# cl $(VC_FLAGS) /GL /D HAS_CPU_64 /arch:AVX2 /D __AVX512VL__ /D POPCOUNT /D LASTFLIP_HIGHCUT all.c ws2_32.lib /Fe..\bin\wEdax-w64-avx512.exe /link /VERSION:4.5 -win-vc-x86: - cl $(W32_VC_FLAGS) all.c /Fe..\bin\wEdax-w32.exe +vc-w64-popcnt: + cl $(VC_FLAGS) /GL /D HAS_CPU_64 /D POPCOUNT all.c ws2_32.lib /Fe..\bin\wEdax-w64-popcnt.exe /link /VERSION:4.5 -win-vc-pgo-x64: +vc-w64-k10: + cl $(VC_FLAGS) /GL /D HAS_CPU_64 /D POPCOUNT /D __LZCNT__ /D MOVE_GENERATOR=MOVE_GENERATOR_BITSCAN /favor:AMD64 all.c ws2_32.lib /Fe..\bin\wEdax-w64-k10.exe /link /VERSION:4.5 + +vc-w64: + cl $(VC_FLAGS) /GL /D HAS_CPU_64 all.c ws2_32.lib /Fe..\bin\wEdax-w64.exe /link /VERSION:4.5 + +vc-w32-modern: + cl $(VC_FLAGS) /GL /D hasSSE2 /arch:AVX2 /D POPCOUNT all.c ws2_32.lib /Fe..\bin\wEdax-w32-modern.exe + +vc-w32-sse: + cl $(VC_FLAGS) /GL /D hasSSE2 all.c ws2_32.lib /Fe..\bin\wEdax-w32-sse.exe + +vc-w32-mmx: + cl $(VC_FLAGS) /GL /arch:IA32 /D hasMMX all.c ws2_32.lib /Fe..\bin\wEdax-w32-mmx.exe + +vc-w32: + cl $(VC_FLAGS) /GL /arch:IA32 all.c ws2_32.lib /Fe..\bin\wEdax-w32.exe + +vc-a64: +# vcvarsamd64_arm64.bat + cl $(VC_FLAGS) /GL /D HAS_CPU_64 all.c ws2_32.lib /Fe..\bin\wEdax-a64.exe /link /VERSION:4.5 + +vc-a32: +# vcvarsamd64_arm.bat + cl $(VC_FLAGS) /GL all.c ws2_32.lib /Fe..\bin\wEdax-a32.exe + +icc-w64-modern: + icl $(VC_FLAGS) /GL /D HAS_CPU_64 /arch:AVX2 /D POPCOUNT /D LASTFLIP_HIGHCUT all.c ws2_32.lib 
/Fe..\bin\wEdax-w64-modern.exe /link /VERSION:4.5 + +icc-w64-avx512: + icl $(VC_FLAGS) /GL /D HAS_CPU_64 /arch:CORE-AVX512 /D POPCOUNT /D LASTFLIP_HIGHCUT all.c ws2_32.lib /Fe..\bin\wEdax-w64-avx512.exe /link /VERSION:4.5 + +icc-w64: + icl $(VC_FLAGS) /GL /D HAS_CPU_64 all.c ws2_32.lib /Fe..\bin\wEdax-w64.exe /link /VERSION:4.5 + +icc-w32: + icl $(VC_FLAGS) /GL /arch:IA32 all.c ws2_32.lib /Fe..\bin\wEdax-w32.exe + +clang-w64-modern: + clang-cl $(VC_FLAGS) /D HAS_CPU_64 /arch:AVX2 /D POPCOUNT all.c ws2_32.lib /Fe..\bin\wEdax-w64-modern.exe /link /VERSION:4.5 + +clang-w64: + clang-cl $(VC_FLAGS) /D HAS_CPU_64 all.c ws2_32.lib /Fe..\bin\wEdax-w64.exe /link /VERSION:4.5 + +clang-w32: + clang-cl $(VC_FLAGS) /arch:IA32 all.c ws2_32.lib /Fe..\bin\wEdax-w32.exe + +vc-pgo-w64-modern: set VCPROFILE_PATH=..\src - cl $(W64_VC_FLAGS) all.c ws2_32.lib /Fe..\bin\wEdax-x64.exe /link /ltcg:pgi /machine:x64 /VERSION:4.4 + cl $(VC_FLAGS) /GL /arch:AVX2 /D POPCOUNT /D LASTFLIP_HIGHCUT all.c ws2_32.lib /Fe..\bin\wEdax-w64-modern.exe /link /ltcg:pgi /VERSION:4.5 + cd ..\bin + wEdax-w64-modern -l 60 -solve ..\problem\fforum-20-39.obf + wEdax-w64-modern -l 18 -auto-store on -auto-start on -repeat 2 -auto-quit on -mode 2 -book-file book.pgo + del book.pgo book.pgo.store + cd ..\src + link all.obj ws2_32.lib /out:..\bin\wEdax-w64-modern.exe /ltcg:pgo /VERSION:4.5 + del *.pgc ..\bin\*.pgd + +vc-pgo-w64-k10: + set VCPROFILE_PATH=..\src + cl $(VC_FLAGS) /GL /D POPCOUNT /D __LZCNT__ /D MOVE_GENERATOR=MOVE_GENERATOR_BITSCAN all.c ws2_32.lib /Fe..\bin\wEdax-w64-k10.exe /link /ltcg:pgi /VERSION:4.5 + cd ..\bin + wEdax-w64-k10 -l 60 -solve ..\problem\fforum-20-39.obf + wEdax-w64-k10 -l 18 -auto-store on -auto-start on -repeat 2 -auto-quit on -mode 2 -book-file book.pgo + del book.pgo book.pgo.store + cd ..\src + link all.obj ws2_32.lib /out:..\bin\wEdax-w64-k10.exe /ltcg:pgo /VERSION:4.5 + del *.pgc ..\bin\*.pgd + +vc-pgo-w64: + set VCPROFILE_PATH=..\src + cl $(VC_FLAGS) /GL all.c ws2_32.lib 
/Fe..\bin\wEdax-w64.exe /link /ltcg:pgi /VERSION:4.5 + cd ..\bin + wEdax-w64 -l 60 -solve ..\problem\fforum-20-39.obf + wEdax-w64 -l 18 -auto-store on -auto-start on -repeat 2 -auto-quit on -mode 2 -book-file book.pgo + del book.pgo book.pgo.store + cd ..\src + link all.obj ws2_32.lib /out:..\bin\wEdax-w64.exe /ltcg:pgo /VERSION:4.5 + del *.pgc ..\bin\*.pgd + +icc-pgo-w64-modern: + icl $(VC_FLAGS) /arch:AVX2 /D POPCOUNT /D LASTFLIP_HIGHCUT /Qprof-gen all.c ws2_32.lib /Fe..\bin\wEdax-w64-modern.exe /link /VERSION:4.5 cd ..\bin - wEdax-x64.exe -l 60 -solve problem\fforum-20-39.obf - wEdax-x64.exe -l 21 -auto-store on -auto-start on -repeat 2 -auto-quit on -mode 2 -book-file book.pgo + wEdax-w64-modern -l 60 -solve ..\problem\fforum-20-39.obf + wEdax-w64-modern -l 18 -auto-store on -auto-start on -repeat 2 -auto-quit on -mode 2 -book-file book.pgo + del book.pgo book.pgo.store cd ..\src - link all.obj ws2_32.lib /out:..\bin\wEdax-x64.exe /ltcg:pgo /machine:x64 /VERSION:4.4 + icl $(VC_FLAGS) /GL /arch:AVX2 /D POPCOUNT /D LASTFLIP_HIGHCUT /Qprof-use /Qip all.c ws2_32.lib /Fe..\bin\wEdax-w64-modern.exe /link /VERSION:4.5 - clean: del -f pgopti* *.dyn all.gc* *~ *.p* *.obj diff --git a/src/base.c b/src/base.c index 29a295e9..aef0f865 100644 --- a/src/base.c +++ b/src/base.c @@ -3,7 +3,7 @@ * * Header file for game base management. 
* - * @date 1998 - 2017 + * @date 1998 - 2020 * @author Richard Delorme * @version 4.4 */ @@ -131,7 +131,7 @@ static void wthor_players_init(WthorBase *base) static void wthor_players_load(WthorBase *base, const char *file) { FILE *f; - WthorHeader header[1]; + WthorHeader header; int i, r; r = base->n_players = 0; @@ -143,9 +143,9 @@ static void wthor_players_load(WthorBase *base, const char *file) return; } - if (wthor_header_read(header, f)) {; + if (wthor_header_read(&header, f)) {; - base->n_players = header->n; + base->n_players = header.n; base->player = (char (*)[20]) malloc(base->n_players * sizeof (*base->player)); if (base->player) { @@ -174,19 +174,19 @@ static void wthor_players_load(WthorBase *base, const char *file) static void wthor_players_save(WthorBase *base, const char *file) { FILE *f; - WthorHeader header[1]; + WthorHeader header; int i, r; r = 0; - wthor_header_set(header, 0, base->n_players, 0); + wthor_header_set(&header, 0, base->n_players, 0); if ((f = fopen(file, "wb")) == NULL) { warn("Cannot open Wthor players' file %s\n", file); return; } - if (wthor_header_write(header, f)) { + if (wthor_header_write(&header, f)) { for (i = 0; i < base->n_players; ++i) { r += fwrite(base->player[i], 20, 1, f); @@ -216,16 +216,15 @@ int wthor_player_get(WthorBase *base, const char *name) assert(base->player != NULL && base->n_players > 0); for (i = 0; i < base->n_players; ++i) { - if (strcmp(name, base->player[i]) == 0) return i; + if (strcmp(name, base->player[i]) == 0) return i; } n = base->n_players + 1; player = (char (*)[20]) realloc(base->player, n * sizeof (*base->player)); if (player) { - base->player = player; - base->n_players = n; - strncpy(base->player[i], name, 20); // used on purpose, as strncpy fills with '\0' the field name - base->player[i][19] = '\0'; // force null terminated string + base->player = player; + base->n_players = n; + sprintf(base->player[i], "%-.19s", name); // force null terminated string } else { warn("Cannot 
allocate Wthor players' array\n"); i = 0; @@ -243,7 +242,7 @@ int wthor_player_get(WthorBase *base, const char *name) static void wthor_tournaments_load(WthorBase *base, const char *file) { FILE *f; - WthorHeader header[1]; + WthorHeader header; int i, r; r = 0; @@ -254,8 +253,8 @@ static void wthor_tournaments_load(WthorBase *base, const char *file) return; } - if (wthor_header_read(header, f)) { - base->n_tournaments = header->n; + if (wthor_header_read(&header, f)) { + base->n_tournaments = header.n; base->tournament = (char (*)[26]) malloc(base->n_tournaments * sizeof (*base->tournament)); if (base->tournament) { @@ -285,7 +284,7 @@ static void wthor_tournaments_load(WthorBase *base, const char *file) static void wthor_tournaments_save(WthorBase *base, const char *file) { FILE *f; - WthorHeader header[1]; + WthorHeader header; int i, r; r = 0; @@ -294,8 +293,8 @@ static void wthor_tournaments_save(WthorBase *base, const char *file) warn("Cannot open Wthor tournaments' file %s\n", file); return; } - wthor_header_set(header, 0, base->n_tournaments, 0); - if (wthor_header_write(header, f)) { + wthor_header_set(&header, 0, base->n_tournaments, 0); + if (wthor_header_write(&header, f)) { for (i = r = 0; i < base->n_tournaments; ++i) { r += fwrite(base->tournament[i], 26, 1, f); } @@ -344,8 +343,8 @@ bool wthor_load(WthorBase *base, const char *file) wthor_players_load(base, path); if ((f = fopen(file, "rb")) != NULL) { - if (wthor_header_read(base->header, f) && base->header->board_size == 8) { - base->n_games = base->header->n_games; + if (wthor_header_read(&base->header, f) && base->header.board_size == 8) { + base->n_games = base->header.n_games; base->game = (WthorGame*) malloc(base->n_games * sizeof (WthorGame)); if (base->game) { @@ -394,8 +393,8 @@ bool wthor_save(WthorBase *base, const char *file) wthor_players_save(base, path); if ((f = fopen(file, "wb")) != NULL) { - wthor_header_set(base->header, base->n_games, 0, 0); - r = 
wthor_header_write(base->header, f); + wthor_header_set(&base->header, base->n_games, 0, 0); + r = wthor_header_write(&base->header, f); if (base->game) { r += (fwrite(base->game, sizeof (WthorGame), base->n_games, f) == (unsigned) base->n_games); } @@ -445,20 +444,20 @@ bool base_to_wthor(const Base *base, WthorBase *wthor) */ void wthor_print_game(WthorBase *base, int i, FILE *f) { - Game game[1]; + Game game; if (0 <= i && i < base->n_games) { fprintf(f, "Game #%d: %s: %4d - %s vs. %s: ", i, base->tournament[base->game[i].tournament], - base->header->game_year, + base->header.game_year, base->player[base->game[i].black], base->player[base->game[i].white]); - wthor_to_game(base->game + i, game); - game_export_text(game, f); + wthor_to_game(base->game + i, &game); + game_export_text(&game, f); - fprintf(f, "Theoric score %d empties : %+02d, ", base->header->depth, base->game[i].theoric_score); + fprintf(f, "Theoric score %d empties : %+02d, ", base->header.depth, base->game[i].theoric_score); fprintf(f, "Score final : %+02d (as black disc count.)\n", base->game[i].score); } } @@ -474,7 +473,7 @@ void wthor_print_game(WthorBase *base, int i, FILE *f) static void wthorgame_get_board(WthorGame *game, const int n_empties, Board *board, int *player) { int i; - Move move[1]; + Move move; char s_move[4]; *player = BLACK; board_init(board); @@ -482,11 +481,11 @@ static void wthorgame_get_board(WthorGame *game, const int n_empties, Board *boa if (board_is_pass(board)) { board_pass(board); *player ^= 1; } - board_get_move(board, move_from_wthor(game->x[i]), move); - if (board_check_move(board, move)) { - board_update(board, move); *player ^= 1; + board_get_move_flip(board, move_from_wthor(game->x[i]), &move); + if (board_check_move(board, &move)) { + board_update(board, &move); *player ^= 1; } else { - warn("Illegal move %s\n", move_to_string(move->x, *player, s_move)); + warn("Illegal move %s\n", move_to_string(move.x, *player, s_move)); break; } } @@ -501,10 +500,10 @@ 
static void wthorgame_get_board(WthorGame *game, const int n_empties, Board *boa */ int pv_check(const Board *init_board, Line *pv, Search *search) { - Game game[1]; + Game game; - line_to_game(init_board, pv, game); - return game_analyze(game, search, board_count_empties(init_board), false); + line_to_game(init_board, pv, &game); + return game_analyze(&game, search, board_count_empties(init_board), false); } /** @@ -515,9 +514,9 @@ int pv_check(const Board *init_board, Line *pv, Search *search) */ void wthor_test(const char *file, Search *search) { - WthorBase base[1]; + WthorBase base; WthorGame *wthor; - Board board[1]; + Board board; int player; int score; int n_empties; @@ -526,7 +525,7 @@ void wthor_test(const char *file, Search *search) long long t; int n_err; - if (wthor_load(base, file)) { + if (wthor_load(&base, file)) { if (search->options.verbosity == 1) { if (search->options.header) puts(search->options.header); @@ -538,11 +537,11 @@ void wthor_test(const char *file, Search *search) t = 0; foreach_wthorgame(wthor, base) { - wthorgame_get_board(wthor, base->header->depth, board, &player); - n_empties = board_count_empties(board); - if (n_empties != base->header->depth && !board_is_game_over(board)) { + wthorgame_get_board(wthor, base.header.depth, &board, &player); + n_empties = board_count_empties(&board); + if (n_empties != base.header.depth && !board_is_game_over(&board)) { warn("Incomplete or Illegal game: %d empties\n", n_empties); - wthor_print_game(base, wthor - base->game, stderr); + wthor_print_game(&base, wthor - base.game, stderr); continue; } @@ -550,33 +549,33 @@ void wthor_test(const char *file, Search *search) else score = 2 * wthor->theoric_score - 64; if (abs(score) > 64) { warn("Impossible theoric score:\n"); - wthor_print_game(base, wthor - base->game, stderr); + wthor_print_game(&base, wthor - base.game, stderr); continue; } search_cleanup(search); - search_set_board(search, board, player); - search_set_level(search, 60, 
base->header->depth); + search_set_board(search, &board, player); + search_set_level(search, 60, base.header.depth); search_run(search); if (search->options.verbosity) putchar('\n'); n_nodes += search->result->n_nodes; t += search->result->time; if (score != search->result->score) { warn("Wrong theoric score: %+d (Wthor) instead of %+d (Edax)\n", score, search->result->score); - wthor_print_game(base, wthor - base->game, stderr); + wthor_print_game(&base, wthor - base.game, stderr); ++n_failure; assert(false); // stop here when debug is on } if (options.pv_check) { Line pv; - line_copy(&pv, search->result->pv, 0); - n_err = pv_check(board, &pv, search); + line_copy(&pv, &search->result->pv, 0); + n_err = pv_check(&board, &pv, search); if (n_err) { char s[80]; warn("Wrong pv:\n"); - board_print(board, player, stderr); - fprintf(stderr, "setboard %s\nplay ", board_to_string(board, player, s)); + board_print(&board, player, stderr); + fprintf(stderr, "setboard %s\nplay ", board_to_string(&board, player, s)); line_print(&pv, 200, " ", stderr); putc('\n', stderr); putc('\n', stderr); assert(false); // stop here when debug is on @@ -584,7 +583,7 @@ void wthor_test(const char *file, Search *search) } if (search->options.verbosity == 0) { - printf("%s game: %4d, error: %2d ; ", file, (int)(wthor - base->game) + 1, n_failure); + printf("%s game: %4d, error: %2d ; ", file, (int)(wthor - base.game) + 1, n_failure); printf("%lld n, ", n_nodes); time_print(t, false, stdout); putchar('\r'); fflush(stdout); } @@ -594,7 +593,7 @@ void wthor_test(const char *file, Search *search) } putchar('\n'); - wthor_free(base); + wthor_free(&base); } return; } @@ -610,18 +609,18 @@ void wthor_test(const char *file, Search *search) */ void wthor_eval(const char *file, Search *search, unsigned long long histogram[129][65]) { - WthorBase base[1]; + WthorBase base; WthorGame *wthor; - Board board[1]; + Board board; int player; int score; int n_empties; - if (wthor_load(base, file)) { + if 
(wthor_load(&base, file)) { foreach_wthorgame(wthor, base) { - wthorgame_get_board(wthor, base->header->depth, board, &player); - n_empties = board_count_empties(board); - if (n_empties != base->header->depth && !board_is_game_over(board)) { + wthorgame_get_board(wthor, base.header.depth, &board, &player); + n_empties = board_count_empties(&board); + if (n_empties != base.header.depth && !board_is_game_over(&board)) { continue; } @@ -632,12 +631,12 @@ void wthor_eval(const char *file, Search *search, unsigned long long histogram[1 } search_cleanup(search); - search_set_board(search, board, player); - search_set_level(search, options.level, base->header->depth); + search_set_board(search, &board, player); + search_set_level(search, options.level, base.header.depth); search_run(search); ++histogram[search->result->score + 64][(score + 64) / 2]; } - wthor_free(base); + wthor_free(&base); } return; } @@ -650,17 +649,17 @@ void wthor_eval(const char *file, Search *search, unsigned long long histogram[1 */ void wthor_edaxify(const char *file) { - WthorBase base[1]; + WthorBase base; WthorGame *wthor; - if (wthor_load(base, file)) { + if (wthor_load(&base, file)) { foreach_wthorgame(wthor, base) { wthor->black = 1368; // "Edax (delorme)" wthor->white = 1368; // "Edax (delorme)" wthor->tournament = 157; // "Etudes" } - wthor_save(base, file); - wthor_free(base); + wthor_save(&base, file); + wthor_free(&base); } } @@ -740,11 +739,11 @@ void base_unique(Base *base) bool base_load(Base *base, const char *file) { void (*load)(Game*, FILE*) = game_import_text; - Game game[1]; + Game game; FILE *f; char ext[8]; int l; - WthorHeader header[1]; + WthorHeader header; l = strlen(file); strcpy(ext, file + l - 4); string_to_lowercase(ext); if (strcmp(ext, ".txt") == 0) load = game_import_text; @@ -766,11 +765,11 @@ bool base_load(Base *base, const char *file) } info("loading games..."); - if (load == game_import_wthor) wthor_header_read(header, f); + if (load == game_import_wthor) 
wthor_header_read(&header, f); for (;;) { - load(game, f); + load(&game, f); if (ferror(f) || feof(f)) break; - base_append(base, game); + base_append(base, &game); } info("done (%d games loaded)\n", base->n_games); @@ -841,15 +840,15 @@ void base_save(const Base *base, const char *file) void base_to_problem(Base *base, const int n_empties, const char *problem) { int i; - Board board[1]; + Board board; char s[80]; FILE *f; f = fopen(problem, "w"); for (i = 0; i < base->n_games; ++i) { - if (game_get_board(base->game + i, 60 - n_empties, board)) { - board_to_string(board, n_empties & 1, s); + if (game_get_board(base->game + i, 60 - n_empties, &board)) { + board_to_string(&board, n_empties & 1, s); fprintf(f, "%s\n", s); } } @@ -867,14 +866,14 @@ void base_to_problem(Base *base, const int n_empties, const char *problem) void base_to_FEN(Base *base, const int n_empties, const char *problem) { int i; - Board board[1]; + Board board; FILE *f; f = fopen(problem, "w"); for (i = 0; i < base->n_games; ++i) { - if (game_get_board(base->game + i, 60 - n_empties, board)) { - board_print_FEN(board, n_empties & 1, f); + if (game_get_board(base->game + i, 60 - n_empties, &board)) { + board_print_FEN(&board, n_empties & 1, f); putc('\n', f); } } @@ -943,8 +942,8 @@ void base_complete(Base *base, Search *search) void base_compare(const char *file_1, const char *file_2) { Base base_1[1], base_2[2]; - PositionHash hash[1]; - Board board[1]; + PositionHash hash; + Board board; int i, j; long long n_1, n_2, n_2_only; @@ -955,13 +954,13 @@ void base_compare(const char *file_1, const char *file_2) n_2_only = 0; base_load(base_1, file_1); - positionhash_init(hash, options.hash_table_size); + positionhash_init(&hash, options.hash_table_size); for (i = 0; i < base_1->n_games; ++i) { Game *game = base_1->game + i; - *board = *game->initial_board; + board = game->initial_board; for (j = 0; j < 60 && game->move[j] != NOMOVE; ++j) { - if (!game_update_board(board, game->move[j])) break; // BAD 
MOVE -> end of game - if (positionhash_append(hash, board)) { + if (!game_update_board(&board, game->move[j])) break; // BAD MOVE -> end of game + if (positionhash_append(&hash, &board)) { ++n_1; } } @@ -971,30 +970,30 @@ void base_compare(const char *file_1, const char *file_2) base_load(base_2, file_2); for (i = 0; i < base_2->n_games; ++i) { Game *game = base_2->game + i; - *board = *game->initial_board; + board = game->initial_board; for (j = 0; j < 60 && game->move[j] != NOMOVE; ++j) { - if (!game_update_board(board, game->move[j])) break; // BAD MOVE -> end of game - if (positionhash_append(hash, board)) { + if (!game_update_board(&board, game->move[j])) break; // BAD MOVE -> end of game + if (positionhash_append(&hash, &board)) { ++n_2_only; } } } - positionhash_delete(hash); - positionhash_init(hash, options.hash_table_size); + positionhash_delete(&hash); + positionhash_init(&hash, options.hash_table_size); for (i = 0; i < base_2->n_games; ++i) { Game *game = base_2->game + i; - *board = *game->initial_board; + board = game->initial_board; for (j = 0; j < 60 && game->move[j] != NOMOVE; ++j) { - if (!game_update_board(board, game->move[j])) break; // BAD MOVE -> end of game - if (positionhash_append(hash, board)) { + if (!game_update_board(&board, game->move[j])) break; // BAD MOVE -> end of game + if (positionhash_append(&hash, &board)) { ++n_2; } } } base_free(base_2); - positionhash_delete(hash); + positionhash_delete(&hash); printf("%s : %lld positions - %lld original positions\n", file_1, n_1, n_1 - (n_2- n_2_only)); printf("%s : %lld positions - %lld original positions\n", file_2, n_2, n_2_only); diff --git a/src/base.h b/src/base.h index 15ce13f7..0c019b84 100644 --- a/src/base.h +++ b/src/base.h @@ -36,7 +36,7 @@ typedef struct WthorHeader { } WthorHeader; typedef struct WthorBase { - WthorHeader header[1]; /** Header */ + WthorHeader header; /** Header */ char (*tournament)[26]; /** tournaments */ int n_tournaments; /** tournament number */ char 
(*player)[20]; /** players */ @@ -60,7 +60,7 @@ void wthor_eval(const char*, struct Search*, unsigned long long histogram[129][6 void wthor_edaxify(const char*); #define foreach_wthorgame(wgame, wbase) \ - for ((wgame) = (wbase)->game ; (wgame) < (wbase)->game + (wbase)->header->n_games; ++(wgame)) + for ((wgame) = (wbase).game ; (wgame) < (wbase).game + (wbase).header.n_games; ++(wgame)) void base_init(Base*); void base_free(Base*); diff --git a/src/bench.c b/src/bench.c index 073ab174..7b6fad96 100644 --- a/src/bench.c +++ b/src/bench.c @@ -1,9 +1,9 @@ /** * @file bench.c * - * @date 1998 - 2017 + * @date 1998 - 2023 * @author Richard Delorme - * @version 4.4 + * @version 4.5 */ #include "bit.h" @@ -20,7 +20,7 @@ * * @return a CPU clock tick. */ -static unsigned long long click() +static unsigned long long click(void) { #if defined(USE_GAS_X64) @@ -34,7 +34,7 @@ static unsigned long long click() __asm__ volatile ( "rdtsc" : "=A" (a)); return a; -#elif defined(_WIN32) +#elif defined(_WIN32) && (defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64)) return __rdtsc(); #else return cpu_clock(); @@ -44,25 +44,29 @@ static unsigned long long click() /* * @brief Move generator performance test. 
*/ -static void bench_move_generator() +static void bench_move_generator(void) { const char *b = "OOOOOOOOOXXXXXXOOXXXXXXOOXXXXXXOOXXXXXXOOXXXXXXOOXXXXXXOOOOOOOOO O"; char m[4]; Board board; Move move; int i, x; + volatile int v; const int N_WARMUP = 1000; const int N_REPEAT = 1000000; unsigned long long c, overhead; double t, t_mean, t_var, t_min, t_max; + v = 0; c = -click(); for (i = 0; i < N_WARMUP; ++i) { + v += i; } c += click(); c = -click(); for (i = 0; i < N_REPEAT; ++i) { + v += i; } c += click(); overhead = c; @@ -78,13 +82,13 @@ static void bench_move_generator() c = -click(); for (i = 0; i < N_WARMUP; ++i) { - board_get_move(&board, x, &move); + v += board_get_move_flip(&board, x, &move); } c += click(); c = -click(); for (i = 0; i < N_REPEAT; ++i) { - board_get_move(&board, x, &move); + v += board_get_move_flip(&board, x, &move); } c += click(); @@ -94,37 +98,42 @@ static void bench_move_generator() if (t < t_min) t_min = t; if (t > t_max) t_max = t; - if (options.verbosity >= 2) printf("board_get_move: %s %.1f clicks;\n", move_to_string(x, WHITE, m), t); + if (options.verbosity >= 2) printf("board_get_move_flip: %s %.1f clicks;\n", move_to_string(x, WHITE, m), t); } t_mean /= x; t_var = t_var / x - (t_mean * t_mean); - - printf("board_get_move: %.2f < %.2f +/- %.2f < %.2f\n", t_min, t_mean, sqrt(t_var), t_max); + + printf("board_get_move_flip: %.2f < %.2f +/- %.2f < %.2f\n", t_min, t_mean, sqrt(t_var), t_max); } /* * @brief Last Move performance test. 
*/ -static void bench_count_last_flip() +static void bench_count_last_flip(void) { const char *b = "OOOOOOOOOXXXXXXOOXXXXXXOOXXXXXXOOXXXXXXOOXXXXXXOOXXXXXXOOOOOOOOO O"; char m[4]; Board board; int i, x; + volatile int v; const int N_WARMUP = 1000; const int N_REPEAT = 1000000; unsigned long long c, overhead; double t, t_mean, t_var, t_min, t_max; + v = 0; + c = -click(); for (i = 0; i < N_WARMUP; ++i) { + v += i; } c += click(); c = -click(); for (i = 0; i < N_REPEAT; ++i) { + v += i; } c += click(); overhead = c; @@ -136,17 +145,17 @@ static void bench_count_last_flip() for (x = A1; x < PASS; ++x) { board_set(&board, b); board.player &= ~x_to_bit(x); - board.opponent &= ~x_to_bit(x); - + // board.opponent &= ~x_to_bit(x); + c = -click(); for (i = 0; i < N_WARMUP; ++i) { - count_last_flip(x, board.player); + v += last_flip(x, board.player & ~i); } c += click(); c = -click(); for (i = 0; i < N_REPEAT; ++i) { - count_last_flip(x, board.player); + v += last_flip(x, board.player& ~i); } c += click(); @@ -162,32 +171,37 @@ static void bench_count_last_flip() t_mean /= x; t_var = t_var / x - (t_mean * t_mean); - + printf("count_last_flip: %.2f < %.2f +/- %.2f < %.2f\n", t_min, t_mean, sqrt(t_var), t_max); } /* * @brief Scoring performance test. 
*/ -static void bench_board_score_1() +static void bench_board_score_1(void) { const char *b = "OOOOOOOOOXXXXXXOOXXXXXXOOXXXXXXOOXXXXXXOOXXXXXXOOXXXXXXOOOOOOOOO O"; char m[4]; Board board; int i, x; + volatile int v; const int N_WARMUP = 1000; const int N_REPEAT = 1000000; unsigned long long c, overhead; double t, t_mean, t_var, t_min, t_max; + board_set(&board, b); + v = 0; c = -click(); for (i = 0; i < N_WARMUP; ++i) { + v += i; } c += click(); c = -click(); for (i = 0; i < N_REPEAT; ++i) { + v += i; } c += click(); overhead = c; @@ -200,16 +214,16 @@ static void bench_board_score_1() board_set(&board, b); board.player &= ~x_to_bit(x); board.opponent &= ~x_to_bit(x); - + c = -click(); for (i = 0; i < N_WARMUP; ++i) { - board_score_1(&board, SCORE_MAX, x); + v += board_score_1(board.player, SCORE_MAX - 1, x); } c += click(); c = -click(); for (i = 0; i < N_REPEAT; ++i) { - board_score_1(&board, SCORE_MAX, x); + v += board_score_1(board.player, SCORE_MAX - 1, x); } c += click(); @@ -225,38 +239,41 @@ static void bench_board_score_1() t_mean /= x; t_var = t_var / x - (t_mean * t_mean); - + printf("board_score_1: %.2f < %.2f +/- %.2f < %.2f\n", t_min, t_mean, sqrt(t_var), t_max); } /* * @brief Mobility performance test. 
*/ -static void bench_mobility() +static void bench_mobility(void) { const char *b = "OOOOOOOOOXXXXXXOOXXXXXXOOXXXXXXOOXXXXXXOOXXXXXXOOXXXXXXOOOOOOOOO O"; char m[4]; Board board; - int i, x, v; + int i, x; + volatile int v; const int N_WARMUP = 1000; const int N_REPEAT = 1000000; unsigned long long c, overhead; double t, t_mean, t_var, t_min, t_max; board_set(&board, b); - - x = A1; + v = 0; c = -click(); for (i = 0; i < N_WARMUP; ++i) { - board.player &= ~x_to_bit(x); - board.opponent &= ~x_to_bit(x); + board.player &= ~i; + board.opponent &= ~i; + v += i; } c += click(); + board_set(&board, b); c = -click(); for (i = 0; i < N_REPEAT; ++i) { - board.player &= ~x_to_bit(x); - board.opponent &= ~x_to_bit(x); + board.player &= ~i; + board.opponent &= ~i; + v += i; } c += click(); overhead = 0; @@ -267,21 +284,22 @@ static void bench_mobility() for (x = A1; x < PASS; ++x) { board_set(&board, b); - + v = 0; c = -click(); for (i = 0; i < N_WARMUP; ++i) { - board.player &= ~x_to_bit(x); - board.opponent &= ~x_to_bit(x); + board.player &= ~i; + board.opponent &= ~i; v += get_mobility(board.player, board.opponent); v -= get_mobility(board.opponent, board.player); } c += click(); + board_set(&board, b); c = -click(); for (i = 0; i < N_REPEAT; ++i) { - board.player &= ~x_to_bit(x); - board.opponent &= ~x_to_bit(x); + board.player &= ~i; + board.opponent &= ~i; v += get_mobility(board.player, board.opponent); v -= get_mobility(board.opponent, board.player); } @@ -299,26 +317,28 @@ static void bench_mobility() t_mean /= x; t_var = t_var / x - (t_mean * t_mean); - + printf("mobility: %.2f < %.2f +/- %.2f < %.2f\n", t_min, t_mean, sqrt(t_var), t_max); } /* * @brief Stability performance test. 
*/ -static void bench_stability() +static void bench_stability(void) { const char *b = "OOOOOOOOOXXXXXXOOXXXXXXOOXXXXXXOOXXXXXXOOXXXXXXOOXXXXXXOOOOOOOOO O"; char m[4]; Board board; - int i, x, v; + int i, x; + volatile int v; const int N_WARMUP = 1000; const int N_REPEAT = 1000000; unsigned long long c, overhead; double t, t_mean, t_var, t_min, t_max; board_init(&board); - + + v = 0; x = A1; c = -click(); for (i = 0; i < N_WARMUP; ++i) { @@ -327,6 +347,7 @@ static void bench_stability() } c += click(); + board_set(&board, b); c = -click(); for (i = 0; i < N_REPEAT; ++i) { board.player &= ~x_to_bit(x); @@ -341,7 +362,7 @@ static void bench_stability() for (x = A1; x < PASS; ++x) { board_set(&board, b); - + v = 0; c = -click(); for (i = 0; i < N_WARMUP; ++i) { @@ -351,6 +372,7 @@ static void bench_stability() } c += click(); + board_set(&board, b); c = -click(); for (i = 0; i < N_REPEAT; ++i) { board.player &= ~x_to_bit(x); diff --git a/src/bit.c b/src/bit.c index 32f6e243..de9f903e 100644 --- a/src/bit.c +++ b/src/bit.c @@ -6,53 +6,42 @@ * a macro needs to be defined to chose between different flavors of the * algorithm. * - * @date 1998 - 2017 + * @date 1998 - 2023 * @author Richard Delorme - * @version 4.4 + * @version 4.5 */ #include "bit.h" #include "util.h" +/** Table for a 32-bits-at-a-time software CRC-32C calculation. + * This tablehas built into it the pre and post bit inversion of the CRC. 
*/ +#ifndef crc32c_u64 +static unsigned int crc32c_table[4][256]; +#endif + /** coordinate to bit table converter */ -const unsigned long long X_TO_BIT[] = { - 0x0000000000000001ULL, 0x0000000000000002ULL, 0x0000000000000004ULL, 0x0000000000000008ULL, - 0x0000000000000010ULL, 0x0000000000000020ULL, 0x0000000000000040ULL, 0x0000000000000080ULL, - 0x0000000000000100ULL, 0x0000000000000200ULL, 0x0000000000000400ULL, 0x0000000000000800ULL, - 0x0000000000001000ULL, 0x0000000000002000ULL, 0x0000000000004000ULL, 0x0000000000008000ULL, - 0x0000000000010000ULL, 0x0000000000020000ULL, 0x0000000000040000ULL, 0x0000000000080000ULL, - 0x0000000000100000ULL, 0x0000000000200000ULL, 0x0000000000400000ULL, 0x0000000000800000ULL, - 0x0000000001000000ULL, 0x0000000002000000ULL, 0x0000000004000000ULL, 0x0000000008000000ULL, - 0x0000000010000000ULL, 0x0000000020000000ULL, 0x0000000040000000ULL, 0x0000000080000000ULL, - 0x0000000100000000ULL, 0x0000000200000000ULL, 0x0000000400000000ULL, 0x0000000800000000ULL, - 0x0000001000000000ULL, 0x0000002000000000ULL, 0x0000004000000000ULL, 0x0000008000000000ULL, - 0x0000010000000000ULL, 0x0000020000000000ULL, 0x0000040000000000ULL, 0x0000080000000000ULL, - 0x0000100000000000ULL, 0x0000200000000000ULL, 0x0000400000000000ULL, 0x0000800000000000ULL, - 0x0001000000000000ULL, 0x0002000000000000ULL, 0x0004000000000000ULL, 0x0008000000000000ULL, - 0x0010000000000000ULL, 0x0020000000000000ULL, 0x0040000000000000ULL, 0x0080000000000000ULL, - 0x0100000000000000ULL, 0x0200000000000000ULL, 0x0400000000000000ULL, 0x0800000000000000ULL, - 0x1000000000000000ULL, 0x2000000000000000ULL, 0x4000000000000000ULL, 0x8000000000000000ULL, - 0, 0 // <- hack for passing move & nomove -}; +unsigned long long X_TO_BIT[66]; -/** Conversion array: neighbour bits */ +/** Conversion array: flippable neighbour bits */ +// https://eukaryote.hateblo.jp/entry/2020/04/26/031246 const unsigned long long NEIGHBOUR[] = { - 0x0000000000000302ULL, 0x0000000000000705ULL, 
0x0000000000000e0aULL, 0x0000000000001c14ULL, - 0x0000000000003828ULL, 0x0000000000007050ULL, 0x000000000000e0a0ULL, 0x000000000000c040ULL, - 0x0000000000030203ULL, 0x0000000000070507ULL, 0x00000000000e0a0eULL, 0x00000000001c141cULL, - 0x0000000000382838ULL, 0x0000000000705070ULL, 0x0000000000e0a0e0ULL, 0x0000000000c040c0ULL, - 0x0000000003020300ULL, 0x0000000007050700ULL, 0x000000000e0a0e00ULL, 0x000000001c141c00ULL, - 0x0000000038283800ULL, 0x0000000070507000ULL, 0x00000000e0a0e000ULL, 0x00000000c040c000ULL, - 0x0000000302030000ULL, 0x0000000705070000ULL, 0x0000000e0a0e0000ULL, 0x0000001c141c0000ULL, - 0x0000003828380000ULL, 0x0000007050700000ULL, 0x000000e0a0e00000ULL, 0x000000c040c00000ULL, - 0x0000030203000000ULL, 0x0000070507000000ULL, 0x00000e0a0e000000ULL, 0x00001c141c000000ULL, - 0x0000382838000000ULL, 0x0000705070000000ULL, 0x0000e0a0e0000000ULL, 0x0000c040c0000000ULL, - 0x0003020300000000ULL, 0x0007050700000000ULL, 0x000e0a0e00000000ULL, 0x001c141c00000000ULL, - 0x0038283800000000ULL, 0x0070507000000000ULL, 0x00e0a0e000000000ULL, 0x00c040c000000000ULL, - 0x0302030000000000ULL, 0x0705070000000000ULL, 0x0e0a0e0000000000ULL, 0x1c141c0000000000ULL, - 0x3828380000000000ULL, 0x7050700000000000ULL, 0xe0a0e00000000000ULL, 0xc040c00000000000ULL, - 0x0203000000000000ULL, 0x0507000000000000ULL, 0x0a0e000000000000ULL, 0x141c000000000000ULL, - 0x2838000000000000ULL, 0x5070000000000000ULL, 0xa0e0000000000000ULL, 0x40c0000000000000ULL, + 0x0000000000000302ULL, 0x0000000000000604ULL, 0x0000000000000e0aULL, 0x0000000000001c14ULL, + 0x0000000000003828ULL, 0x0000000000007050ULL, 0x0000000000006020ULL, 0x000000000000c040ULL, + 0x0000000000030200ULL, 0x0000000000060400ULL, 0x00000000000e0a00ULL, 0x00000000001c1400ULL, + 0x0000000000382800ULL, 0x0000000000705000ULL, 0x0000000000602000ULL, 0x0000000000c04000ULL, + 0x0000000003020300ULL, 0x0000000006040600ULL, 0x000000000e0a0e00ULL, 0x000000001c141c00ULL, + 0x0000000038283800ULL, 0x0000000070507000ULL, 0x0000000060206000ULL, 
0x00000000c040c000ULL, + 0x0000000302030000ULL, 0x0000000604060000ULL, 0x0000000e0a0e0000ULL, 0x0000001c141c0000ULL, + 0x0000003828380000ULL, 0x0000007050700000ULL, 0x0000006020600000ULL, 0x000000c040c00000ULL, + 0x0000030203000000ULL, 0x0000060406000000ULL, 0x00000e0a0e000000ULL, 0x00001c141c000000ULL, + 0x0000382838000000ULL, 0x0000705070000000ULL, 0x0000602060000000ULL, 0x0000c040c0000000ULL, + 0x0003020300000000ULL, 0x0006040600000000ULL, 0x000e0a0e00000000ULL, 0x001c141c00000000ULL, + 0x0038283800000000ULL, 0x0070507000000000ULL, 0x0060206000000000ULL, 0x00c040c000000000ULL, + 0x0002030000000000ULL, 0x0004060000000000ULL, 0x000a0e0000000000ULL, 0x00141c0000000000ULL, + 0x0028380000000000ULL, 0x0050700000000000ULL, 0x0020600000000000ULL, 0x0040c00000000000ULL, + 0x0203000000000000ULL, 0x0406000000000000ULL, 0x0a0e000000000000ULL, 0x141c000000000000ULL, + 0x2838000000000000ULL, 0x5070000000000000ULL, 0x2060000000000000ULL, 0x40c0000000000000ULL, 0, 0 // <- hack for passing move & nomove }; @@ -69,95 +58,146 @@ const unsigned long long NEIGHBOUR[] = { * @param b 64-bit integer to count bits of. * @return the number of bits set. 
*/ + +#ifndef POPCOUNT + #if 0 int bit_count(unsigned long long b) { -#if defined(POPCOUNT) - - #if defined(USE_GAS_X64) - __asm__("popcntq %1,%0" :"=r" (b) :"rm" (b)); - return (int) b; - #elif defined(USE_MSVC_X64) - return __popcnt64(b); - #elif defined(USE_GCC_X64) - return __builtin_popcountll(b); - #endif - -// MMX does not help much here :-( -#elif defined (USE_GAS_MMX) - - const unsigned long long M55 = 0x5555555555555555ULL; - const unsigned long long M33 = 0x3333333333333333ULL; - const unsigned long long M0F = 0x0F0F0F0F0F0F0F0FULL; - int count; - - __asm__ volatile( - "movq %1, %%mm1\n\t" - "pxor %%mm2, %%mm2\n\t" - - "movq %%mm1, %%mm0\n\t" - "psrlq $1, %%mm1\n\t" - "pand %2, %%mm1\n\t" - "psubd %%mm1, %%mm0\n\t" - - "movq %%mm0, %%mm1\n\t" - "psrlq $2, %%mm0\n\t" - "pand %3, %%mm1\n\t" - "pand %3, %%mm0\n\t" - "paddd %%mm1, %%mm0\n\t" - - "movq %%mm0, %%mm1\n\t" - "psrlq $4, %%mm0\n\t" - "paddd %%mm1, %%mm0\n\t" - "pand %4, %%mm0\n\t" - - "psadbw %%mm2, %%mm0\n\t" - "movd %%mm0, %0\n\t" - "emms\n\t" - : "=a" (count) : "m" (b), "m" (M55), "m" (M33), "m" (M0F)); - - return count; + int c; -#else + b = b - ((b >> 1) & 0x5555555555555555ULL); + b = ((b >> 2) & 0x3333333333333333ULL) + (b & 0x3333333333333333ULL); + #ifdef HAS_CPU_64 + b = (b + (b >> 4)) & 0x0F0F0F0F0F0F0F0FULL; + c = (b * 0x0101010101010101ULL) >> 56; + #else + c = (b >> 32) + b; + c = (c & 0x0F0F0F0F) + ((c >> 4) & 0x0F0F0F0F); + c = (c * 0x01010101) >> 24; + #endif + return c; +} - register unsigned long long c = b - - ((b >> 1) & 0x7777777777777777ULL) - - ((b >> 2) & 0x3333333333333333ULL) - - ((b >> 3) & 0x1111111111111111ULL); - c = ((c + (c >> 4)) & 0x0F0F0F0F0F0F0F0FULL) * 0x0101010101010101ULL; + #else +// https://github.com/official-stockfish/Stockfish/pull/620/files +// 2% faster than SWAR bit_count for 32 & 64 non-POPCOUNT build +unsigned char PopCnt16[1 << 16]; - return (int)(c >> 56); +static int bit_count_32_SWAR(unsigned int b) +{ + b = b - ((b >> 1) & 0x55555555); + b = 
((b >> 2) & 0x333333333) + (b & 0x33333333); + b = ((b >> 4) + b) & 0x0F0F0F0F; + return (b * 0x01010101) >> 24; +} + #endif +#endif +/** + * @brief initialize PopCnt16 table and check MMX/SSE availability. + */ +void bit_init(void) +{ + unsigned int n; + unsigned long long ll; +#ifndef crc32c_u64 + unsigned int k, crc; + + // http://stackoverflow.com/a/17646775/1821055 + // https://github.com/baruch/crcbench + // Generate byte-wise table. + for (n = 0; n < 256; n++) { + crc = ~n; + for (k = 0; k < 8; k++) + crc = (crc >> 1) ^ (-(int)(crc & 1) & 0x82f63b78); + crc32c_table[0][n] = ~crc; + } + // Use byte-wise table to generate word-wise table. + for (n = 0; n < 256; n++) { + crc = ~crc32c_table[0][n]; + for (k = 1; k < 4; k++) { + crc = crc32c_table[0][crc & 0xff] ^ (crc >> 8); + crc32c_table[k][n] = ~crc; + } + } #endif -} + ll = 1; + for (n = 0; n < 66; ++n) { // X_TO_BIT[64] = X_TO_BIT[65] = 0 for passing move & nomove + X_TO_BIT[n] = ll; + ll <<= 1; + } + +#ifndef POPCOUNT + for (n = 0; n < (1 << 16); ++n) + PopCnt16[n] = bit_count_32_SWAR(n); +#endif + +#if (defined(USE_GAS_MMX) || defined(USE_MSVC_X86)) && !defined(hasSSE2) + init_mmx(); +#endif +#if defined(ANDROID) && !defined(__ARM_NEON) && !defined(hasSSE2) + init_neon(); +#endif +} /** * @brief count the number of discs, counting the corners twice. * * This is a variation of the above algorithm to count the mobility and favour * the corners. This function is useful for move sorting. + * (SSE/Neon version caliculates 2 elements in parallel.) * * @param v 64-bit integer to count bits of. * @return the number of bit set, counting the corners twice. 
*/ -int bit_weighted_count(const unsigned long long v) +#if !defined(__AVX2__) && defined(hasSSE2) && !defined(POPCOUNT) +__m128i bit_weighted_count_sse(unsigned long long Q0, unsigned long long Q1) { -#if defined(POPCOUNT) - - return bit_count(v) + bit_count(v & 0x8100000000000081ULL); - -#else + static const V2DI mask15 = {{ 0x1555555555555515, 0x1555555555555515 }}; + static const V2DI mask01 = {{ 0x0100000000000001, 0x0100000000000001 }}; + static const V2DI mask33 = {{ 0x3333333333333333, 0x3333333333333333 }}; + static const V2DI mask0F = {{ 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F }}; + + __m128i v = _mm_set_epi64x(Q1, Q0); + v = _mm_add_epi64(_mm_sub_epi64(v, _mm_and_si128(_mm_srli_epi64(v, 1), mask15.v2)), _mm_and_si128(v, mask01.v2)); + v = _mm_add_epi64(_mm_and_si128(v, mask33.v2), _mm_and_si128(_mm_srli_epi64(v, 2), mask33.v2)); + v = _mm_and_si128(_mm_add_epi64(v, _mm_srli_epi64(v, 4)), mask0F.v2); + return _mm_sad_epu8(v, _mm_setzero_si128()); +} - register unsigned long long b; - b = v - ((v >> 1) & 0x1555555555555515ULL) + (v & 0x0100000000000001ULL); - b = ((b >> 2) & 0x3333333333333333ULL) + (b & 0x3333333333333333ULL); - b = ((b >> 4) + b) & 0x0f0f0f0f0f0f0f0fULL; - b *= 0x0101010101010101ULL; +#elif defined(__ARM_NEON) +uint64x2_t bit_weighted_count_neon(unsigned long long Q0, unsigned long long Q1) +{ + uint64x2_t v = vcombine_u64(vcreate_u64(Q0), vcreate_u64(Q1)); + return vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(vaddq_u8(vcntq_u8(vreinterpretq_u8_u64(v)), + vcntq_u8(vreinterpretq_u8_u64(vandq_u64(v, vdupq_n_u64(0x8100000000000081)))))))); +} - return (int)(b >> 56); +#elif 0 // SWAR, for record +int bit_weighted_count(unsigned long long v) +{ + int c; + + v = v - ((v >> 1) & 0x1555555555555515) + (v & 0x0100000000000001); + v = ((v >> 2) & 0x3333333333333333) + (v & 0x3333333333333333); + c = (v >> 32) + v; + c = (c & 0x0F0F0F0F) + ((c >> 4) & 0x0F0F0F0F); + c = (c * 0x01010101) >> 24; + return c; +} -#endif +#else +int bit_weighted_count(unsigned 
long long v) +{ + unsigned int AH18 = ((v >> 56) | (v << 8)) & 0x8181; // ror 56 + #ifdef POPCOUNT + return bit_count(v) + bit_count_32(AH18); + #else + return bit_count(v) + PopCnt16[AH18]; + #endif } +#endif /** * @@ -168,60 +208,71 @@ int bit_weighted_count(const unsigned long long v) * magic numbers is provided. * * @param b 64-bit integer. - * @return the index of the first bit set. + * @return the index of the first bit set. (undefined if b = 0) */ -int first_bit(unsigned long long b) +#if !defined(first_bit_32) && !defined(HAS_CPU_64) +int first_bit_32(unsigned int b) { -#if defined(USE_GAS_X64) + #if defined(_MSC_VER) + unsigned long index; + _BitScanForward(&index, b); + return (int) index; - __asm__("bsfq %1,%0" : "=r" (b) : "rm" (b)); + #elif defined(USE_GAS_X64) || defined(USE_GAS_X86) + __asm__("rep; bsf %1, %0" : "=r" (b) : "rm" (b)); // tzcnt on BMI CPUs, bsf otherwise return (int) b; -#elif defined(USE_GAS_X86) + #elif defined(USE_MSVC_X86) + __asm { + bsf eax, word ptr b + } + + #elif defined(USE_GCC_ARM) + return __builtin_clz(b & -b) ^ 31; + + #else + static const unsigned char magic[32] = { + 0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8, + 31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9 + }; - int x1, x2; - __asm__ ("bsf %0,%0\n" - "jnz 1f\n" - "bsf %1,%0\n" - "jz 1f\n" - "addl $32,%0\n" - "1:": "=&q" (x1), "=&q" (x2):"1" ((int) (b >> 32)), "0" ((int) b)); - return x1; + return magic[((b & (-b)) * 0x077CB531U) >> 27]; + #endif +} +#endif // first_bit_32 -#elif defined(USE_MSVC_X64) +#ifndef first_bit +int first_bit(unsigned long long b) +{ + #if defined(USE_GAS_X64) + __asm__("rep; bsfq %1, %0" : "=r" (b) : "rm" (b)); // tzcntq on BMI CPUs + return (int) b; + #elif defined(USE_GAS_X86) + int x; + __asm__ ("bsf %2, %0\n\t" // (ZF differs from tzcnt) + "jnz 1f\n\t" + "bsf %1, %0\n\t" + "addl $32, %0\n" + "1:" : "=&q" (x) : "g" ((int) (b >> 32)), "g" ((int) b)); + return x; + + #elif defined(_MSC_VER) && (defined(_M_X64) || 
defined(_M_ARM64)) unsigned long index; _BitScanForward64(&index, b); return (int) index; -#elif defined(USE_GCC_X64) - - return __builtin_ctzll(b); - -#elif defined(USE_MASM_X86) + #elif defined(USE_MSVC_X86) __asm { - xor eax, eax - bsf edx, dword ptr b - jnz l1 - bsf edx, dword ptr b+4 - mov eax, 32 - jnz l1 - mov edx, -32 - l1: add eax, edx + bsf eax, dword ptr b + jnz l1 + bsf eax, dword ptr b+4 + add eax, 32 + l1: } -#elif defined(USE_GCC_ARM) - const unsigned int lb = (unsigned int)b; - if (lb) { - return __builtin_clz(lb & -lb) ^ 31; - } else { - const unsigned int hb = b >> 32; - return 32 + (__builtin_clz(hb & -hb) ^ 31); - } - -#else - - const int magic[64] = { + #elif defined(HAS_CPU_64) + static const unsigned char magic[64] = { 63, 0, 58, 1, 59, 47, 53, 2, 60, 39, 48, 27, 54, 33, 42, 3, 61, 51, 37, 40, 49, 18, 28, 20, @@ -234,23 +285,34 @@ int first_bit(unsigned long long b) return magic[((b & (-b)) * 0x07EDD5E59A4E28C2ULL) >> 58]; -#endif + #else + const unsigned int lb = (unsigned int) b; + if (lb) { + return first_bit_32(lb); + } else { + return 32 + first_bit_32(b >> 32); + } + #endif } +#endif // first_bit +#if 0 /** * @brief Search the next bit set. * * In practice, clear the first bit set and search the next one. * * @param b 64-bit integer. - * @return the index of the next bit set. + * @return the index of the next bit set. */ int next_bit(unsigned long long *b) { *b &= *b - 1; return first_bit(*b); } +#endif +#ifndef last_bit /** * @brief Search the last bit set (same as log2()). 
* @@ -263,33 +325,25 @@ int next_bit(unsigned long long *b) */ int last_bit(unsigned long long b) { -#if defined(USE_GAS_X64) - - __asm__("bsrq %1,%0" :"=r" (b) :"rm" (b)); + #if defined(USE_GAS_X64) + __asm__("bsrq %1, %0" :"=r" (b) :"rm" (b)); return b; -#elif defined(USE_MSVC_X64) - + #elif defined(_MSC_VER) && (defined(_M_X64) || defined(_M_ARM64)) unsigned long index; _BitScanReverse64(&index, b); return (int) index; -#elif defined(USE_GCC_X64) - - return 63 - __builtin_clzll(b); - -#elif defined(USE_GAS_X86) + #elif defined(USE_GAS_X86) + int x; + __asm__ ("bsr %1, %0\n\t" + "leal 32(%0), %0\n\t" + "jnz 1f\n\t" + "bsr %2, %0\n\t" + "1:" : "=&q" (x) : "g" ((int) (b >> 32)), "g" ((int) b)); + return x; - int x1, x2; - __asm__ ("bsr %1,%0\n" - "jnz 1f\n" - "bsr %0,%0\n" - "subl $32,%0\n" - "1: addl $32,%0\n" : "=&q" (x1), "=&q" (x2) : "1" ((int) (b >> 32)), "0" ((int) b)); - return x1; - - -#elif defined(USE_GCC_ARM) + #elif 0 // defined(USE_GCC_ARM) const unsigned int hb = b >> 32; if (hb) { return 63 - __builtin_clz(hb); @@ -297,31 +351,26 @@ int last_bit(unsigned long long b) return 31 - __builtin_clz((int) b); } - -#elif defined(USE_MASM_X86) + #elif defined(USE_MSVC_X86) __asm { - xor eax, eax - bsr edx, dword ptr b+4 - jnz l1 - bsr edx, dword ptr b - mov eax, 32 - jnz l1 - mov edx, -32 - l1: add eax, edx + bsr eax, dword ptr b+4 + lea eax, [eax+32] + jnz l1 + bsr eax, dword ptr b + l1: } - -#else - - const int magic[64] = { - 63, 0, 58, 1, 59, 47, 53, 2, - 60, 39, 48, 27, 54, 33, 42, 3, - 61, 51, 37, 40, 49, 18, 28, 20, - 55, 30, 34, 11, 43, 14, 22, 4, - 62, 57, 46, 52, 38, 26, 32, 41, - 50, 36, 17, 19, 29, 10, 13, 21, - 56, 45, 25, 31, 35, 16, 9, 12, - 44, 24, 15, 8, 23, 7, 6, 5 + #elif defined(HAS_CPU_64) + // https://www.chessprogramming.org/BitScan#De_Bruijn_Multiplication_2 + static const unsigned char magic[64] = { + 0, 47, 1, 56, 48, 27, 2, 60, + 57, 49, 41, 37, 28, 16, 3, 61, + 54, 58, 35, 52, 50, 42, 21, 44, + 38, 32, 29, 23, 17, 11, 4, 62, + 46, 
55, 26, 59, 40, 36, 15, 53, + 34, 51, 20, 43, 31, 22, 10, 45, + 25, 39, 14, 33, 19, 30, 9, 24, + 13, 18, 8, 12, 7, 6, 5, 63 }; b |= b >> 1; @@ -330,30 +379,36 @@ int last_bit(unsigned long long b) b |= b >> 8; b |= b >> 16; b |= b >> 32; - b = (b >> 1) + 1; - - return magic[(b * 0x07EDD5E59A4E28C2ULL) >> 58]; -#endif + return magic[(b * 0x03f79d71b4cb0a89) >> 58]; + + #else + static const unsigned char clz_table_4bit[16] = { 4, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0 }; + int n = 63; + unsigned int x; + + x = b >> 32; + if (x == 0) { n = 31; x = (unsigned int) b; } + if ((x & 0xFFFF0000) == 0) { n -= 16; x <<= 16; } + if ((x & 0xFF000000) == 0) { n -= 8; x <<= 8; } + if ((x & 0xF0000000) == 0) { n -= 4; x <<= 4; } + n -= clz_table_4bit[x >> (32 - 4)]; + return n; + #endif } +#endif // last_bit +#ifndef bswap_int /** - * @brief Transpose the unsigned long long (symetry % A1-H8 diagonal). - * @param b An unsigned long long - * @return The transposed unsigned long long. + * @brief Mirror the unsigned int (little <-> big endian). + * @param i An unsigned int. + * @return The mirrored int. 
*/ -unsigned long long transpose(unsigned long long b) +unsigned int bswap_int(unsigned int i) { - unsigned long long t; - - t = (b ^ (b >> 7)) & 0x00aa00aa00aa00aaULL; - b = b ^ t ^ (t << 7); - t = (b ^ (b >> 14)) & 0x0000cccc0000ccccULL; - b = b ^ t ^ (t << 14); - t = (b ^ (b >> 28)) & 0x00000000f0f0f0f0ULL; - b = b ^ t ^ (t << 28); - - return b; + i = ((i >> 8) & 0x00FF00FFU) | ((i & 0x00FF00FFU) << 8); + i = (i >> 16) | (i << 16); + return i; } /** @@ -363,47 +418,101 @@ unsigned long long transpose(unsigned long long b) */ unsigned long long vertical_mirror(unsigned long long b) { - b = ((b >> 8) & 0x00FF00FF00FF00FFULL) | ((b << 8) & 0xFF00FF00FF00FF00ULL); - b = ((b >> 16) & 0x0000FFFF0000FFFFULL) | ((b << 16) & 0xFFFF0000FFFF0000ULL); - b = ((b >> 32) & 0x00000000FFFFFFFFULL) | ((b << 32) & 0xFFFFFFFF00000000ULL); - return b; + return bswap_int((unsigned int)(b >> 32)) | ((unsigned long long) bswap_int((unsigned int) b) << 32); } +#endif // bswap_int /** * @brief Mirror the unsigned long long (exchange the line 1 - 8, 2 - 7, 3 - 6 & 4 - 5). * @param b An unsigned long long. * @return The mirrored unsigned long long. 
*/ +unsigned int horizontal_mirror_32(unsigned int b) +{ +#ifdef __ARM_ACLE + return __rev(__rbit(b)); +#else + b = ((b >> 1) & 0x55555555U) + 2 * (b & 0x55555555U); + b = ((b >> 2) & 0x33333333U) + 4 * (b & 0x33333333U); + b = ((b >> 4) & 0x0F0F0F0FU) + 16 * (b & 0x0F0F0F0FU); + return b; +#endif +} + unsigned long long horizontal_mirror(unsigned long long b) { - b = ((b >> 1) & 0x5555555555555555ULL) | ((b << 1) & 0xAAAAAAAAAAAAAAAAULL); - b = ((b >> 2) & 0x3333333333333333ULL) | ((b << 2) & 0xCCCCCCCCCCCCCCCCULL); - b = ((b >> 4) & 0x0F0F0F0F0F0F0F0FULL) | ((b << 4) & 0xF0F0F0F0F0F0F0F0ULL); +#if defined(HAS_CPU_64) && !defined(__ARM_ACLE) + b = ((b >> 1) & 0x5555555555555555ULL) | ((b & 0x5555555555555555ULL) << 1); + b = ((b >> 2) & 0x3333333333333333ULL) | ((b & 0x3333333333333333ULL) << 2); + b = ((b >> 4) & 0x0F0F0F0F0F0F0F0FULL) | ((b & 0x0F0F0F0F0F0F0F0FULL) << 4); + return b; +#else + return ((unsigned long long) horizontal_mirror_32(b >> 32) << 32) + | horizontal_mirror_32((unsigned int) b); +#endif +} - return b; +/** + * @brief Transpose the unsigned long long (symmetry % A1-H8 diagonal, or swap axes). + * @param b An unsigned long long + * @return The transposed unsigned long long. + */ +#ifdef __AVX2__ +unsigned long long transpose(unsigned long long b) +{ + __m256i v = _mm256_sllv_epi64(_mm256_broadcastq_epi64(_mm_cvtsi64_si128(b)), _mm256_set_epi64x(0, 1, 2, 3)); + return ((unsigned long long) _mm256_movemask_epi8(v) << 32) + | (unsigned int) _mm256_movemask_epi8(_mm256_slli_epi64(v, 4)); } +#else +unsigned long long transpose(unsigned long long b) +{ + unsigned long long t; + + t = (b ^ (b >> 7)) & 0x00aa00aa00aa00aaULL; + b = b ^ t ^ (t << 7); + t = (b ^ (b >> 14)) & 0x0000cccc0000ccccULL; + b = b ^ t ^ (t << 14); + t = (b ^ (b >> 28)) & 0x00000000f0f0f0f0ULL; + b = b ^ t ^ (t << 28); + + return b; +} +#endif // __AVX2__ + +#ifndef crc32c_u64 /** - * @brief Swap bytes of a short (little <-> big endian). - * @param s An unsigned short. 
- * @return The mirrored short. + * @brief Calculate crc32c checksum for 8 bytes data + * @param crc Initial crc from previous data. + * @param data Data to accumulate. + * @return Resulting crc. */ -unsigned short bswap_short(unsigned short s) +unsigned int crc32c_u64(unsigned int crc, unsigned long long data) { - return (unsigned short) ((s >> 8) & 0x00FF) | ((s << 8) & 0xFF00); + crc ^= (unsigned int) data; + crc = crc32c_table[3][crc & 0xff] ^ + crc32c_table[2][(crc >> 8) & 0xff] ^ + crc32c_table[1][(crc >> 16) & 0xff] ^ + crc32c_table[0][crc >> 24]; + crc ^= (unsigned int) (data >> 32); + return crc32c_table[3][crc & 0xff] ^ + crc32c_table[2][(crc >> 8) & 0xff] ^ + crc32c_table[1][(crc >> 16) & 0xff] ^ + crc32c_table[0][crc >> 24]; } /** - * @brief Swap bytes of a short (little <-> big endian). - * @param s An unsigned short. - * @return The mirrored short. + * @brief Calculate crc32c checksum for a byte + * @param crc Initial crc from previous data. + * @param data Data to accumulate. + * @return Resulting crc. */ -unsigned int bswap_int(unsigned int i) +unsigned int crc32c_u8(unsigned int crc, unsigned int data) { - i = ((i >> 8) & 0x00FF00FFU) | ((i << 8) & 0xFF00FF00U); - i = ((i >> 16) & 0x0000FFFFU) | ((i << 16) & 0xFFFF0000U); - return i; + return crc32c_table[0][(crc ^ data) & 0xff] ^ (crc >> 8); } +#endif /** * @brief Get a random set bit index. @@ -432,27 +541,22 @@ int get_rand_bit(unsigned long long b, Random *r) * @param b The unsigned long long. * @param f Output stream. 
*/ -void bitboard_write(const unsigned long long b, FILE *f) +void bitboard_write(unsigned long long b, FILE *f) { - int i, j, x; - const char *color = ".X"; + int i, j; + static const char color[2] = ".X"; fputs(" A B C D E F G H\n", f); for (i = 0; i < 8; ++i) { fputc(i + '1', f); fputc(' ', f); for (j = 0; j < 8; ++j) { - x = i * 8 + j; - fputc(color[((b >> (unsigned)x) & 1)], f); + fputc(color[b & 1], f); fputc(' ', f); + b >>= 1; } fputc(i + '1', f); fputc('\n', f); } fputs(" A B C D E F G H\n", f); } - - - - - diff --git a/src/bit.h b/src/bit.h index e850ae1b..c08243ea 100644 --- a/src/bit.h +++ b/src/bit.h @@ -3,40 +3,229 @@ * * Bitwise operations header file. * - * @date 1998 - 2017 + * @date 1998 - 2023 * @author Richard Delorme - * @version 4.4 + * @version 4.5 */ #ifndef EDAX_BIT_H #define EDAX_BIT_H #include +#include +#include +#include + +#include "bit_intrinsics.h" struct Random; /* declaration */ -int bit_count(unsigned long long); -int bit_weighted_count(const unsigned long long); -int first_bit(unsigned long long); -int next_bit(unsigned long long*); -int last_bit(unsigned long long); -void bitboard_write(const unsigned long long, FILE*); +void bit_init(void); +// int next_bit(unsigned long long*); +void bitboard_write(unsigned long long, FILE*); unsigned long long transpose(unsigned long long); -unsigned long long vertical_mirror(unsigned long long); +unsigned int horizontal_mirror_32(unsigned int b); unsigned long long horizontal_mirror(unsigned long long); -unsigned int bswap_int(unsigned int); -unsigned short bswap_short(unsigned short); int get_rand_bit(unsigned long long, struct Random*); -/** Loop over each bit set. 
*/ -#define foreach_bit(i, b) for (i = first_bit(b); b; i = next_bit(&b)) +#if !defined(__AVX2__) && defined(hasSSE2) && !defined(POPCOUNT) + __m128i bit_weighted_count_sse(unsigned long long, unsigned long long); +#elif defined (__ARM_NEON) + uint64x2_t bit_weighted_count_neon(unsigned long long, unsigned long long); +#else + int bit_weighted_count(unsigned long long); +#endif + +extern unsigned long long X_TO_BIT[]; +extern const unsigned long long NEIGHBOUR[]; -extern const unsigned long long X_TO_BIT[]; /** Return a bitboard with bit x set. */ -#define x_to_bit(x) X_TO_BIT[x] +// https://eukaryote.hateblo.jp/entry/2020/04/12/054905 +#ifdef HAS_CPU_64 // 1% slower on Sandy Bridge + #define x_to_bit(x) (1ULL << (x)) +#else + #define x_to_bit(x) X_TO_BIT[x] +#endif + +/** Loop over each bit set. */ +#if defined(tzcnt_u64) + #define first_bit(x) tzcnt_u64(x) + #define last_bit(x) (63 - lzcnt_u64(x)) +#elif ((defined(__GNUC__) && (__GNUC__ >= 4)) || __has_builtin(__builtin_ctzll)) && !defined(__INTEL_COMPILER) + #define first_bit(x) __builtin_ctzll(x) + #define last_bit(x) (63 - __builtin_clzll(x)) +#else + int first_bit(unsigned long long); + int last_bit(unsigned long long); +#endif + +#if defined(HAS_CPU_64) || !defined(__STDC_HOSTED__) // __STDC_HOSTED__ (C99) to declare var in for statement + #define foreach_bit(i, b) for (i = first_bit(b); b; i = first_bit(b &= (b - 1))) +#else + #ifdef tzcnt_u32 + #define first_bit_32(x) tzcnt_u32(x) + #else + int first_bit_32(unsigned int); + #endif + #define foreach_bit(i, b) (void) i; for (unsigned int _j = 0; _j < sizeof(b) * CHAR_BIT; _j += sizeof(int) * CHAR_BIT) \ + for (int _r = (b >> _j), i = first_bit_32(_r) + _j; _r; i = first_bit_32(_r &= (_r - 1)) + _j) +#endif + +// popcount +#ifdef __ARM_NEON + #ifdef HAS_CPU_64 + #define bit_count(x) vaddv_u8(vcnt_u8(vcreate_u8(x))) + #define bit_count_32(x) vaddv_u8(vcnt_u8(vcreate_u8((unsigned int) x))) + #else + #define bit_count(x) 
vget_lane_u32(vreinterpret_u32_u64(vpaddl_u32(vpaddl_u16(vpaddl_u8(vcnt_u8(vcreate_u8(x)))))), 0) + #define bit_count_32(x) vget_lane_u32(vpaddl_u16(vpaddl_u8(vcnt_u8(vcreate_u8(x)))), 0) + #endif + +#elif defined(POPCOUNT) + /* + #if defined (USE_GAS_X64) + static inline int bit_count (unsigned long long x) { + long long y; + __asm__ ( "popcntq %1,%0" : "=r" (y) : "rm" (x)); + return y; + } + #elif defined (USE_GAS_X86) + static inline int bit_count (unsigned long long x) { + unsigned int y0, y1; + __asm__ ( "popcntl %2,%0\n\t" + "popcntl %3,%1" + : "=&r" (y0), "=&r" (y1) + : "rm" ((unsigned int) x), "rm" ((unsigned int) (x >> 32))); + return y0 + y1; + } + */ + #ifdef _MSC_VER + #if defined(_M_ARM) || defined(_M_ARM64) + #define bit_count(x) _CountOneBits64(x) + #define bit_count_32(x) _CountOneBits(x) + #elif defined(_M_X64) + #define bit_count(x) ((int) __popcnt64(x)) + #define bit_count_32(x) __popcnt(x) + #else + #define bit_count(x) (__popcnt((unsigned int) (x)) + __popcnt((unsigned int) ((x) >> 32))) + #define bit_count_32(x) __popcnt(x) + #endif + #else + #define bit_count(x) __builtin_popcountll(x) + #define bit_count_32(x) __builtin_popcount(x) + #endif + #define bit_count_si64(x) bit_count(_mm_cvtsi128_si64(x)) + +#else + extern unsigned char PopCnt16[1 << 16]; + static inline int bit_count(unsigned long long b) { + union { unsigned long long bb; unsigned short u[4]; } v = { b }; + return (unsigned char)(PopCnt16[v.u[0]] + PopCnt16[v.u[1]] + PopCnt16[v.u[2]] + PopCnt16[v.u[3]]); + } + static inline int bit_count_32(unsigned int b) { + union { unsigned int bb; unsigned short u[2]; } v = { b }; + return (unsigned char)(PopCnt16[v.u[0]] + PopCnt16[v.u[1]]); + } + #define bit_count_si64(x) ((unsigned char)(PopCnt16[_mm_extract_epi16((x), 0)] + PopCnt16[_mm_extract_epi16((x), 1)] + PopCnt16[_mm_extract_epi16((x), 2)] + PopCnt16[_mm_extract_epi16((x), 3)])) +#endif + +#if defined(USE_GAS_MMX) || defined(USE_MSVC_X86) + #ifndef hasSSE2 + extern bool hasSSE2; + 
#endif + #ifndef hasMMX + extern bool hasMMX; + #endif +#endif + +#if defined(ANDROID) && ((defined(__arm__) && !defined(__ARM_NEON)) || (defined(__i386__) && !defined(hasSSE2))) +extern bool hasSSE2; +#endif + +/** Board : board representation */ +typedef struct Board { + unsigned long long player, opponent; /**< bitboard representation */ +} Board; + +typedef union { + unsigned long long ull[2]; + Board board; // for vboard optimization in search + #ifdef __ARM_NEON + uint64x2_t v2; + #elif defined(hasSSE2) || defined(USE_MSVC_X86) + __m128i v2; + __m128d d2; // used in flip_carry_sse_32.c + #endif +} +#if defined(__GNUC__) && !defined(hasSSE2) +__attribute__ ((aligned (16))) +#endif +V2DI; + +typedef union { + unsigned long long ull[4]; + #ifdef __AVX2__ + __m256i v4; + #endif + #ifdef hasSSE2 + __m128i v2[2]; + #endif + #ifdef USE_MSVC_X86 + __m64 v1[4]; + #endif +} V4DI; + +typedef union { + unsigned long long ull[8]; + #ifdef __AVX512VL__ + __m512i v8; + #endif + #ifdef __AVX2__ + __m256i v4[2]; + #endif +} V8DI; + +/* Define function attributes directive when available */ + +#if (defined(_MSC_VER) || defined(__clang__)) && defined(hasSSE2) + #define vectorcall __vectorcall +#elif defined(__GNUC__) && defined(__i386__) + #define vectorcall __attribute__((sseregparm)) +#elif 0 // defined(__GNUC__) // erroneous result on pgo-build + #define vectorcall __attribute__((sysv_abi)) +#else + #define vectorcall +#endif + +// X64 compatibility shims for X86 +#if !defined(HAS_CPU_64) && (defined(hasSSE2) || defined(USE_MSVC_X86)) + // static inline __m128i _mm_cvtsi64_si128(const unsigned long long x) { + // return _mm_unpacklo_epi32(_mm_cvtsi32_si128(x), _mm_cvtsi32_si128(x >> 32)); + // } + // better code but requires lvalue + #define _mm_cvtsi64_si128(x) _mm_loadl_epi64((__m128i *) &(x)) + static inline unsigned long long vectorcall _mm_cvtsi128_si64(__m128i x) { + return *(unsigned long long *) &x; + } + static inline unsigned long long vectorcall 
_mm_extract_epi64(__m128i x, int i) { + return ((unsigned long long *) &x)[i]; + } -//#define x_to_bit(x) (1ULL << (x)) // 1% slower on Sandy Bridge + #if defined(_MSC_VER) && _MSC_VER<1900 + static inline __m128i _mm_set_epi64x(unsigned long long b, unsigned long long a) { + return _mm_unpacklo_epi64(_mm_cvtsi64_si128(b), _mm_cvtsi64_si128(a)); + } + static inline __m128i _mm_set1_epi64x(unsigned long long x) { + __m128i t = _mm_cvtsi64_si128(x); + return _mm_unpacklo_epi64(t, t); + } + #endif +#endif // !HAS_CPU_64 +#if __clang_major__ == 3 // undefined reference to `llvm.x86.avx.storeu.dq.256' + #define _mm_storeu_si128(a,b) *(__m128i *)(a) = (b) + #define _mm256_storeu_si256(a,b) *(__m256i *)(a) = (b) #endif +#endif // EDAX_BIT_H diff --git a/src/bit_intrinsics.h b/src/bit_intrinsics.h new file mode 100644 index 00000000..3ddd4834 --- /dev/null +++ b/src/bit_intrinsics.h @@ -0,0 +1,221 @@ +/** + * @file bit_intrinsics.h + * + * CPU dependent bit operation intrinsics. + * + * @date 2020 - 2024 + * @author Richard Delorme + * @author Toshihiko Okuhara + * @version 4.5 + */ + +#ifndef EDAX_BIT_INTRINSICS_H +#define EDAX_BIT_INTRINSICS_H + +#if defined(__x86_64__) || defined(_M_X64) || defined(__aarch64__) || defined(_M_ARM64) + #define HAS_CPU_64 1 +#endif + +#if defined(__SSE2__) || defined(__AVX__) || defined(_M_X64) + #define hasSSE2 1 +#endif + +#ifdef hasSSE2 + #define hasMMX 1 +#endif + +#if defined(ANDROID) && defined(__arm__) + #if __ANDROID_API__ < 21 + #define DISPATCH_NEON 1 + #else + #define __ARM_NEON 1 + #endif +#elif defined(__ARM_NEON__) || defined(__aarch64__) || defined(_M_ARM) || defined(_M_ARM64) + #define __ARM_NEON 1 +#endif +#ifdef __ARM_NEON + #include "arm_neon.h" +#endif + +#ifdef _MSC_VER + #include + #ifdef _M_IX86 + #define USE_MSVC_X86 1 + #endif +#elif defined(hasSSE2) + #include +#endif + +#ifndef __has_builtin // Compatibility with non-clang compilers. 
+ #define __has_builtin(x) 0 +#endif + +// mirror byte +#if defined(_M_ARM) // || (defined(_M_ARM64) && _MSC_VER >= 1922) // https://developercommunity.visualstudio.com/t/ARM64-still-missing-RBIT-intrinsics/10547420 + #define mirror_byte(b) (_arm_rbit(b) >> 24) +#elif defined(__ARM_ACLE) + #include + #define mirror_byte(b) (__rbit(b) >> 24) +#elif defined(HAS_CPU_64) + // http://graphics.stanford.edu/~seander/bithacks.html + #define mirror_byte(b) (unsigned char)((((b) * 0x80200802ULL) & 0x0884422110ULL) * 0x0101010101ULL >> 32) +#else + static inline unsigned char mirror_byte(unsigned int b) { return ((((b * 0x200802) & 0x4422110) + ((b << 7) & 0x880)) * 0x01010101 >> 24); } +#endif + +// rotl8 +#if __has_builtin(__builtin_rotateleft8) + #define rotl8(x,y) __builtin_rotateleft8((x),(y)) +#elif defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 5)) && (defined(__x86_64__) || defined(__i386__)) + #define rotl8(x,y) __builtin_ia32_rolqi((x),(y)) +#elif defined(_MSC_VER) + #define rotl8(x,y) _rotl8((x),(y)) +#else // may not compile into 8-bit rotate + #define rotl8(x,y) ((unsigned char)(((x)<<(y))|((unsigned char)(x)>>(8-(y))))) +#endif + +// bswap +#ifdef _MSC_VER + #define bswap_short(x) _byteswap_ushort(x) + #define bswap_int(x) _byteswap_ulong(x) + #define vertical_mirror(x) _byteswap_uint64(x) +#else + #if (defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8))) || __has_builtin(__builtin_bswap16) + #define bswap_short(x) __builtin_bswap16(x) + #else + #define bswap_short(x) (((unsigned short) (x) >> 8) | ((unsigned short) (x) << 8)) + #endif + #if (defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3))) || __has_builtin(__builtin_bswap64) + #define bswap_int(x) __builtin_bswap32(x) + #define vertical_mirror(x) __builtin_bswap64(x) + #else + unsigned int bswap_int(unsigned int); + unsigned long long vertical_mirror(unsigned long long); + #endif +#endif + +// lzcnt / tzcnt (0 allowed) + +#ifdef 
USE_GAS_X86 +#ifdef __LZCNT__ +static inline int _lzcnt_u64(unsigned long long x) { + int y; + __asm__ ( + "lzcntl %1, %0\n\t" + "lzcntl %2, %2\n\t" + "leal (%0, %2), %0\n\t" + "cmovnc %2, %0" + : "=&r" (y) : "0" ((unsigned int) x), "r" ((unsigned int) (x >> 32)) ); + return y; +} +#endif +#ifdef __BMI__ +static inline int _tzcnt_u64(unsigned long long x) { + int y; + __asm__ ( + "tzcntl %1, %0\n\t" + "tzcntl %2, %2\n\t" + "leal (%0, %2), %0\n\t" + "cmovnc %2, %0" + : "=&r" (y) : "0" ((unsigned int) (x >> 32)), "r" ((unsigned int) x) ); + return y; +} +#endif +#elif defined(USE_MSVC_X86) && (defined(__AVX2__) || defined(__LZCNT__)) +static inline int _lzcnt_u64(unsigned long long x) { + __asm { + lzcnt eax, dword ptr x + lzcnt edx, dword ptr x+4 + lea eax, [eax+edx] + cmovnc eax, edx + } +} + +static inline int _tzcnt_u64(unsigned long long x) { + __asm { + tzcnt eax, dword ptr x+4 + tzcnt edx, dword ptr x + lea eax, [eax+edx] + cmovnc eax, edx + } +} +#endif + +#if defined(__AVX2__) || defined(__LZCNT__) + #define lzcnt_u32(x) _lzcnt_u32(x) + #define lzcnt_u64(x) _lzcnt_u64(x) + +#elif defined(_M_ARM) || defined(_M_ARM64) + #define lzcnt_u32(x) _CountLeadingZeros(x) + #define lzcnt_u64(x) _CountLeadingZeros64(x) + +#elif defined(_MSC_VER) + static inline int lzcnt_u32(unsigned int n) { + unsigned long i; + if (!_BitScanReverse(&i, n)) + i = 32 ^ 31; + return i ^ 31; + } + #ifdef _M_X64 + static inline int lzcnt_u64(unsigned long long n) { + unsigned long i; + if (!_BitScanReverse64(&i, n)) + i = 64 ^ 63; + return i ^ 63; + } + #else + static inline int lzcnt_u64(unsigned long long n) { + unsigned long i; + if (_BitScanReverse(&i, n >> 32)) + return i ^ 31; + if (!_BitScanReverse(&i, (unsigned int) n)) + i = 64 ^ 63; + return i ^ 63; + } + #endif + +#elif defined(__ARM_FEATURE_CLZ) + #if __ARM_ACLE >= 110 + #define lzcnt_u32(x) __clz(x) + #define lzcnt_u64(x) __clzll(x) + #else // strictly-incorrect patch + #define lzcnt_u32(x) __builtin_clz(x) + #define 
lzcnt_u64(x) __builtin_clzll(x) + #endif + +#else + static inline int lzcnt_u32(unsigned long x) { return (x ? __builtin_clz(x) : 32); } + static inline int lzcnt_u64(unsigned long x) { return (x ? __builtin_clzll(x) : 64); } +#endif + +#if defined(__BMI__) || defined(__AVX2__) + #define tzcnt_u32(x) _tzcnt_u32(x) + #define tzcnt_u64(x) _tzcnt_u64(x) + +#elif defined(__ARM_FEATURE_CLZ) + #ifdef _M_ARM + #define tzcnt_u32(x) _arm_clz(_arm_rbit(x)) + #elif __has_builtin(__rbit) // (__ARM_ARCH >= 6 && __ARM_ISA_THUMB >= 2) || __ARM_ARCH >= 7 // not for gcc + #define tzcnt_u32(x) __clz(__rbit(x)) + #endif +#endif + +#if defined(__SSE4_2__) || defined(__AVX__) + #ifdef HAS_CPU_64 + #define crc32c_u64(crc,d) _mm_crc32_u64((crc),(d)) + #else + #define crc32c_u64(crc,d) _mm_crc32_u32(_mm_crc32_u32((crc),(d)),((d)>>32)) + #endif + #define crc32c_u8(crc,d) _mm_crc32_u8((crc),(d)) + +#elif defined(__ARM_FEATURE_CRC32) + #include "arm_acle.h" + #define crc32c_u64(crc,d) __crc32cd((crc),(d)) + #define crc32c_u8(crc,d) __crc32cb((crc),(d)) + +#else + unsigned int crc32c_u64(unsigned int crc, unsigned long long data); + unsigned int crc32c_u8(unsigned int crc, unsigned int data); +#endif + +#endif // EDAX_BIT_INTRINSICS_H diff --git a/src/board.c b/src/board.c index 4ffaf446..e4f31fb9 100644 --- a/src/board.c +++ b/src/board.c @@ -11,9 +11,10 @@ * some board properties. Most of the functions are optimized to be as fast as * possible, while remaining readable. 
* - * @date 1998 - 2017 + * @date 1998 - 2024 * @author Richard Delorme - * @version 4.4 + * @author Toshihiko Okuhara + * @version 4.5 */ #include "board.h" @@ -21,7 +22,6 @@ #include "bit.h" #include "hash.h" #include "move.h" -#include "settings.h" #include "util.h" #include @@ -30,102 +30,49 @@ #if MOVE_GENERATOR == MOVE_GENERATOR_CARRY - #ifdef HAS_CPU_64 - #include "flip_carry_64.c" - #include "count_last_flip_carry_64.c" - #else - #include "flip_carry_32.c" - #include "count_last_flip_carry_32.c" - #endif + #include "flip_carry_64.c" #elif MOVE_GENERATOR == MOVE_GENERATOR_SSE #include "flip_sse.c" - #include "count_last_flip_kindergarten.c" #elif MOVE_GENERATOR == MOVE_GENERATOR_BITSCAN + #ifdef __ARM_NEON + #define flip_neon flip + #include "flip_neon_bitscan.c" + #else #include "flip_bitscan.c" - #include "count_last_flip_bitscan.c" + #endif #elif MOVE_GENERATOR == MOVE_GENERATOR_ROXANE #include "flip_roxane.c" - #include "count_last_flip_kindergarten.c" +#elif MOVE_GENERATOR == MOVE_GENERATOR_32 + #include "flip_carry_sse_32.c" +#elif MOVE_GENERATOR == MOVE_GENERATOR_SSE_BSWAP + #include "flip_sse_bswap.c" +#elif MOVE_GENERATOR == MOVE_GENERATOR_AVX + #include "flip_avx_ppfill.c" +#elif MOVE_GENERATOR == MOVE_GENERATOR_AVX512 + #include "flip_avx512cd.c" +#elif MOVE_GENERATOR == MOVE_GENERATOR_NEON + #ifdef __aarch64__ + #include "flip_neon_rbit.c" + #else + #include "flip_neon_lzcnt.c" + #endif +#elif MOVE_GENERATOR == MOVE_GENERATOR_SVE + #include "flip_sve_lzcnt.c" #else // MOVE_GENERATOR == MOVE_GENERATOR_KINDERGARTEN #include "flip_kindergarten.c" - #include "count_last_flip_kindergarten.c" #endif /** edge stability global data */ -static unsigned char edge_stability[256][256]; - -/** conversion from an 8-bit line to the A1-A8 line */ -static const unsigned long long A1_A8[256] = { - 0x0000000000000000ULL, 0x0000000000000001ULL, 0x0000000000000100ULL, 0x0000000000000101ULL, 0x0000000000010000ULL, 0x0000000000010001ULL, 0x0000000000010100ULL, 
0x0000000000010101ULL, - 0x0000000001000000ULL, 0x0000000001000001ULL, 0x0000000001000100ULL, 0x0000000001000101ULL, 0x0000000001010000ULL, 0x0000000001010001ULL, 0x0000000001010100ULL, 0x0000000001010101ULL, - 0x0000000100000000ULL, 0x0000000100000001ULL, 0x0000000100000100ULL, 0x0000000100000101ULL, 0x0000000100010000ULL, 0x0000000100010001ULL, 0x0000000100010100ULL, 0x0000000100010101ULL, - 0x0000000101000000ULL, 0x0000000101000001ULL, 0x0000000101000100ULL, 0x0000000101000101ULL, 0x0000000101010000ULL, 0x0000000101010001ULL, 0x0000000101010100ULL, 0x0000000101010101ULL, - 0x0000010000000000ULL, 0x0000010000000001ULL, 0x0000010000000100ULL, 0x0000010000000101ULL, 0x0000010000010000ULL, 0x0000010000010001ULL, 0x0000010000010100ULL, 0x0000010000010101ULL, - 0x0000010001000000ULL, 0x0000010001000001ULL, 0x0000010001000100ULL, 0x0000010001000101ULL, 0x0000010001010000ULL, 0x0000010001010001ULL, 0x0000010001010100ULL, 0x0000010001010101ULL, - 0x0000010100000000ULL, 0x0000010100000001ULL, 0x0000010100000100ULL, 0x0000010100000101ULL, 0x0000010100010000ULL, 0x0000010100010001ULL, 0x0000010100010100ULL, 0x0000010100010101ULL, - 0x0000010101000000ULL, 0x0000010101000001ULL, 0x0000010101000100ULL, 0x0000010101000101ULL, 0x0000010101010000ULL, 0x0000010101010001ULL, 0x0000010101010100ULL, 0x0000010101010101ULL, - 0x0001000000000000ULL, 0x0001000000000001ULL, 0x0001000000000100ULL, 0x0001000000000101ULL, 0x0001000000010000ULL, 0x0001000000010001ULL, 0x0001000000010100ULL, 0x0001000000010101ULL, - 0x0001000001000000ULL, 0x0001000001000001ULL, 0x0001000001000100ULL, 0x0001000001000101ULL, 0x0001000001010000ULL, 0x0001000001010001ULL, 0x0001000001010100ULL, 0x0001000001010101ULL, - 0x0001000100000000ULL, 0x0001000100000001ULL, 0x0001000100000100ULL, 0x0001000100000101ULL, 0x0001000100010000ULL, 0x0001000100010001ULL, 0x0001000100010100ULL, 0x0001000100010101ULL, - 0x0001000101000000ULL, 0x0001000101000001ULL, 0x0001000101000100ULL, 0x0001000101000101ULL, 0x0001000101010000ULL, 
0x0001000101010001ULL, 0x0001000101010100ULL, 0x0001000101010101ULL, - 0x0001010000000000ULL, 0x0001010000000001ULL, 0x0001010000000100ULL, 0x0001010000000101ULL, 0x0001010000010000ULL, 0x0001010000010001ULL, 0x0001010000010100ULL, 0x0001010000010101ULL, - 0x0001010001000000ULL, 0x0001010001000001ULL, 0x0001010001000100ULL, 0x0001010001000101ULL, 0x0001010001010000ULL, 0x0001010001010001ULL, 0x0001010001010100ULL, 0x0001010001010101ULL, - 0x0001010100000000ULL, 0x0001010100000001ULL, 0x0001010100000100ULL, 0x0001010100000101ULL, 0x0001010100010000ULL, 0x0001010100010001ULL, 0x0001010100010100ULL, 0x0001010100010101ULL, - 0x0001010101000000ULL, 0x0001010101000001ULL, 0x0001010101000100ULL, 0x0001010101000101ULL, 0x0001010101010000ULL, 0x0001010101010001ULL, 0x0001010101010100ULL, 0x0001010101010101ULL, - 0x0100000000000000ULL, 0x0100000000000001ULL, 0x0100000000000100ULL, 0x0100000000000101ULL, 0x0100000000010000ULL, 0x0100000000010001ULL, 0x0100000000010100ULL, 0x0100000000010101ULL, - 0x0100000001000000ULL, 0x0100000001000001ULL, 0x0100000001000100ULL, 0x0100000001000101ULL, 0x0100000001010000ULL, 0x0100000001010001ULL, 0x0100000001010100ULL, 0x0100000001010101ULL, - 0x0100000100000000ULL, 0x0100000100000001ULL, 0x0100000100000100ULL, 0x0100000100000101ULL, 0x0100000100010000ULL, 0x0100000100010001ULL, 0x0100000100010100ULL, 0x0100000100010101ULL, - 0x0100000101000000ULL, 0x0100000101000001ULL, 0x0100000101000100ULL, 0x0100000101000101ULL, 0x0100000101010000ULL, 0x0100000101010001ULL, 0x0100000101010100ULL, 0x0100000101010101ULL, - 0x0100010000000000ULL, 0x0100010000000001ULL, 0x0100010000000100ULL, 0x0100010000000101ULL, 0x0100010000010000ULL, 0x0100010000010001ULL, 0x0100010000010100ULL, 0x0100010000010101ULL, - 0x0100010001000000ULL, 0x0100010001000001ULL, 0x0100010001000100ULL, 0x0100010001000101ULL, 0x0100010001010000ULL, 0x0100010001010001ULL, 0x0100010001010100ULL, 0x0100010001010101ULL, - 0x0100010100000000ULL, 0x0100010100000001ULL, 0x0100010100000100ULL, 
0x0100010100000101ULL, 0x0100010100010000ULL, 0x0100010100010001ULL, 0x0100010100010100ULL, 0x0100010100010101ULL, - 0x0100010101000000ULL, 0x0100010101000001ULL, 0x0100010101000100ULL, 0x0100010101000101ULL, 0x0100010101010000ULL, 0x0100010101010001ULL, 0x0100010101010100ULL, 0x0100010101010101ULL, - 0x0101000000000000ULL, 0x0101000000000001ULL, 0x0101000000000100ULL, 0x0101000000000101ULL, 0x0101000000010000ULL, 0x0101000000010001ULL, 0x0101000000010100ULL, 0x0101000000010101ULL, - 0x0101000001000000ULL, 0x0101000001000001ULL, 0x0101000001000100ULL, 0x0101000001000101ULL, 0x0101000001010000ULL, 0x0101000001010001ULL, 0x0101000001010100ULL, 0x0101000001010101ULL, - 0x0101000100000000ULL, 0x0101000100000001ULL, 0x0101000100000100ULL, 0x0101000100000101ULL, 0x0101000100010000ULL, 0x0101000100010001ULL, 0x0101000100010100ULL, 0x0101000100010101ULL, - 0x0101000101000000ULL, 0x0101000101000001ULL, 0x0101000101000100ULL, 0x0101000101000101ULL, 0x0101000101010000ULL, 0x0101000101010001ULL, 0x0101000101010100ULL, 0x0101000101010101ULL, - 0x0101010000000000ULL, 0x0101010000000001ULL, 0x0101010000000100ULL, 0x0101010000000101ULL, 0x0101010000010000ULL, 0x0101010000010001ULL, 0x0101010000010100ULL, 0x0101010000010101ULL, - 0x0101010001000000ULL, 0x0101010001000001ULL, 0x0101010001000100ULL, 0x0101010001000101ULL, 0x0101010001010000ULL, 0x0101010001010001ULL, 0x0101010001010100ULL, 0x0101010001010101ULL, - 0x0101010100000000ULL, 0x0101010100000001ULL, 0x0101010100000100ULL, 0x0101010100000101ULL, 0x0101010100010000ULL, 0x0101010100010001ULL, 0x0101010100010100ULL, 0x0101010100010101ULL, - 0x0101010101000000ULL, 0x0101010101000001ULL, 0x0101010101000100ULL, 0x0101010101000101ULL, 0x0101010101010000ULL, 0x0101010101010001ULL, 0x0101010101010100ULL, 0x0101010101010101ULL, -}; - -/** conversion from an 8-bit line to the H1-H8 line */ -static const unsigned long long H1_H8[256] = { - 0x0000000000000000ULL, 0x0000000000000080ULL, 0x0000000000008000ULL, 0x0000000000008080ULL, 
0x0000000000800000ULL, 0x0000000000800080ULL, 0x0000000000808000ULL, 0x0000000000808080ULL, - 0x0000000080000000ULL, 0x0000000080000080ULL, 0x0000000080008000ULL, 0x0000000080008080ULL, 0x0000000080800000ULL, 0x0000000080800080ULL, 0x0000000080808000ULL, 0x0000000080808080ULL, - 0x0000008000000000ULL, 0x0000008000000080ULL, 0x0000008000008000ULL, 0x0000008000008080ULL, 0x0000008000800000ULL, 0x0000008000800080ULL, 0x0000008000808000ULL, 0x0000008000808080ULL, - 0x0000008080000000ULL, 0x0000008080000080ULL, 0x0000008080008000ULL, 0x0000008080008080ULL, 0x0000008080800000ULL, 0x0000008080800080ULL, 0x0000008080808000ULL, 0x0000008080808080ULL, - 0x0000800000000000ULL, 0x0000800000000080ULL, 0x0000800000008000ULL, 0x0000800000008080ULL, 0x0000800000800000ULL, 0x0000800000800080ULL, 0x0000800000808000ULL, 0x0000800000808080ULL, - 0x0000800080000000ULL, 0x0000800080000080ULL, 0x0000800080008000ULL, 0x0000800080008080ULL, 0x0000800080800000ULL, 0x0000800080800080ULL, 0x0000800080808000ULL, 0x0000800080808080ULL, - 0x0000808000000000ULL, 0x0000808000000080ULL, 0x0000808000008000ULL, 0x0000808000008080ULL, 0x0000808000800000ULL, 0x0000808000800080ULL, 0x0000808000808000ULL, 0x0000808000808080ULL, - 0x0000808080000000ULL, 0x0000808080000080ULL, 0x0000808080008000ULL, 0x0000808080008080ULL, 0x0000808080800000ULL, 0x0000808080800080ULL, 0x0000808080808000ULL, 0x0000808080808080ULL, - 0x0080000000000000ULL, 0x0080000000000080ULL, 0x0080000000008000ULL, 0x0080000000008080ULL, 0x0080000000800000ULL, 0x0080000000800080ULL, 0x0080000000808000ULL, 0x0080000000808080ULL, - 0x0080000080000000ULL, 0x0080000080000080ULL, 0x0080000080008000ULL, 0x0080000080008080ULL, 0x0080000080800000ULL, 0x0080000080800080ULL, 0x0080000080808000ULL, 0x0080000080808080ULL, - 0x0080008000000000ULL, 0x0080008000000080ULL, 0x0080008000008000ULL, 0x0080008000008080ULL, 0x0080008000800000ULL, 0x0080008000800080ULL, 0x0080008000808000ULL, 0x0080008000808080ULL, - 0x0080008080000000ULL, 0x0080008080000080ULL, 
0x0080008080008000ULL, 0x0080008080008080ULL, 0x0080008080800000ULL, 0x0080008080800080ULL, 0x0080008080808000ULL, 0x0080008080808080ULL, - 0x0080800000000000ULL, 0x0080800000000080ULL, 0x0080800000008000ULL, 0x0080800000008080ULL, 0x0080800000800000ULL, 0x0080800000800080ULL, 0x0080800000808000ULL, 0x0080800000808080ULL, - 0x0080800080000000ULL, 0x0080800080000080ULL, 0x0080800080008000ULL, 0x0080800080008080ULL, 0x0080800080800000ULL, 0x0080800080800080ULL, 0x0080800080808000ULL, 0x0080800080808080ULL, - 0x0080808000000000ULL, 0x0080808000000080ULL, 0x0080808000008000ULL, 0x0080808000008080ULL, 0x0080808000800000ULL, 0x0080808000800080ULL, 0x0080808000808000ULL, 0x0080808000808080ULL, - 0x0080808080000000ULL, 0x0080808080000080ULL, 0x0080808080008000ULL, 0x0080808080008080ULL, 0x0080808080800000ULL, 0x0080808080800080ULL, 0x0080808080808000ULL, 0x0080808080808080ULL, - 0x8000000000000000ULL, 0x8000000000000080ULL, 0x8000000000008000ULL, 0x8000000000008080ULL, 0x8000000000800000ULL, 0x8000000000800080ULL, 0x8000000000808000ULL, 0x8000000000808080ULL, - 0x8000000080000000ULL, 0x8000000080000080ULL, 0x8000000080008000ULL, 0x8000000080008080ULL, 0x8000000080800000ULL, 0x8000000080800080ULL, 0x8000000080808000ULL, 0x8000000080808080ULL, - 0x8000008000000000ULL, 0x8000008000000080ULL, 0x8000008000008000ULL, 0x8000008000008080ULL, 0x8000008000800000ULL, 0x8000008000800080ULL, 0x8000008000808000ULL, 0x8000008000808080ULL, - 0x8000008080000000ULL, 0x8000008080000080ULL, 0x8000008080008000ULL, 0x8000008080008080ULL, 0x8000008080800000ULL, 0x8000008080800080ULL, 0x8000008080808000ULL, 0x8000008080808080ULL, - 0x8000800000000000ULL, 0x8000800000000080ULL, 0x8000800000008000ULL, 0x8000800000008080ULL, 0x8000800000800000ULL, 0x8000800000800080ULL, 0x8000800000808000ULL, 0x8000800000808080ULL, - 0x8000800080000000ULL, 0x8000800080000080ULL, 0x8000800080008000ULL, 0x8000800080008080ULL, 0x8000800080800000ULL, 0x8000800080800080ULL, 0x8000800080808000ULL, 0x8000800080808080ULL, - 
0x8000808000000000ULL, 0x8000808000000080ULL, 0x8000808000008000ULL, 0x8000808000008080ULL, 0x8000808000800000ULL, 0x8000808000800080ULL, 0x8000808000808000ULL, 0x8000808000808080ULL, - 0x8000808080000000ULL, 0x8000808080000080ULL, 0x8000808080008000ULL, 0x8000808080008080ULL, 0x8000808080800000ULL, 0x8000808080800080ULL, 0x8000808080808000ULL, 0x8000808080808080ULL, - 0x8080000000000000ULL, 0x8080000000000080ULL, 0x8080000000008000ULL, 0x8080000000008080ULL, 0x8080000000800000ULL, 0x8080000000800080ULL, 0x8080000000808000ULL, 0x8080000000808080ULL, - 0x8080000080000000ULL, 0x8080000080000080ULL, 0x8080000080008000ULL, 0x8080000080008080ULL, 0x8080000080800000ULL, 0x8080000080800080ULL, 0x8080000080808000ULL, 0x8080000080808080ULL, - 0x8080008000000000ULL, 0x8080008000000080ULL, 0x8080008000008000ULL, 0x8080008000008080ULL, 0x8080008000800000ULL, 0x8080008000800080ULL, 0x8080008000808000ULL, 0x8080008000808080ULL, - 0x8080008080000000ULL, 0x8080008080000080ULL, 0x8080008080008000ULL, 0x8080008080008080ULL, 0x8080008080800000ULL, 0x8080008080800080ULL, 0x8080008080808000ULL, 0x8080008080808080ULL, - 0x8080800000000000ULL, 0x8080800000000080ULL, 0x8080800000008000ULL, 0x8080800000008080ULL, 0x8080800000800000ULL, 0x8080800000800080ULL, 0x8080800000808000ULL, 0x8080800000808080ULL, - 0x8080800080000000ULL, 0x8080800080000080ULL, 0x8080800080008000ULL, 0x8080800080008080ULL, 0x8080800080800000ULL, 0x8080800080800080ULL, 0x8080800080808000ULL, 0x8080800080808080ULL, - 0x8080808000000000ULL, 0x8080808000000080ULL, 0x8080808000008000ULL, 0x8080808000008080ULL, 0x8080808000800000ULL, 0x8080808000800080ULL, 0x8080808000808000ULL, 0x8080808000808080ULL, - 0x8080808080000000ULL, 0x8080808080000080ULL, 0x8080808080008000ULL, 0x8080808080008080ULL, 0x8080808080800000ULL, 0x8080808080800080ULL, 0x8080808080808000ULL, 0x8080808080808080ULL, -}; +unsigned char edge_stability[256 * 256]; + +#if (defined(USE_GAS_MMX) || defined(USE_MSVC_X86)) && !defined(hasSSE2) + #include 
"board_mmx.c" +#endif +#if (defined(USE_GAS_MMX) || defined(USE_MSVC_X86) || defined(hasSSE2) || defined(__ARM_NEON)) && !defined(ANDROID) + #include "board_sse.c" +#endif + /** * @brief Swap players. @@ -151,36 +98,35 @@ void board_swap_players(Board *board) * @param string string describing the board * @return turn's color. */ -int board_set(Board *board, const char *string) +int board_set(Board *board, const char *s) { int i; - const char *s = string; + unsigned long long b = 1; board->player = board->opponent = 0; - for (i = A1; i <= H8; ++i) { - if (*s == '\0') break; + for (i = A1; (i <= H8) && (*s != '\0'); ++s) { switch (tolower(*s)) { case 'b': case 'x': case '*': - board->player |= x_to_bit(i); + board->player |= b; break; case 'o': case 'w': - board->opponent |= x_to_bit(i); + board->opponent |= b; break; case '-': case '.': break; default: - i--; - break; + continue; } - ++s; + ++i; + b <<= 1; } board_check(board); - for (;*s != '\0'; ++s) { + for (; *s != '\0'; ++s) { switch (tolower(*s)) { case 'b': case 'x': @@ -229,7 +175,7 @@ int board_from_FEN(Board *board, const char *string) board->opponent |= x_to_bit(i); ++i; } else { - return EMPTY; + return EMPTY; } } @@ -251,8 +197,8 @@ int board_from_FEN(Board *board, const char *string) */ void board_init(Board *board) { - board->player = 0x0000000810000000ULL; // BLACK - board->opponent = 0x0000001008000000ULL; // WHITE + board->player = 0x0000000810000000; // BLACK + board->opponent = 0x0000001008000000; // WHITE } /** @@ -272,7 +218,7 @@ void board_check(const Board *board) } // empty center ? 
- if (((board->player|board->opponent) & 0x0000001818000000ULL) != 0x0000001818000000ULL) { + if (~(board->player|board->opponent) & 0x0000001818000000) { error("Empty center?\n"); board_print(board, BLACK, stderr); } @@ -286,27 +232,13 @@ void board_check(const Board *board) * * @param b1 first board * @param b2 second board - * @return -1, 0, 1 - */ -int board_compare(const Board *b1, const Board *b2) -{ - if (b1->player > b2->player) return 1; - else if (b1->player < b2->player) return -1; - else if (b1->opponent > b2->opponent) return 1; - else if (b1->opponent < b2->opponent) return -1; - else return 0; -} - -/** - * @brief Compare two board for equality - * - * @param b1 first board - * @param b2 second board - * @return true if both board are equal + * @return true if b1 is lesser than b2 */ -bool board_equal(const Board *b1, const Board *b2) +bool board_lesser(const Board *b1, const Board *b2) { - return (b1->player == b2->player && b1->opponent == b2->opponent); + if (b1->player != b2->player) + return (b1->player < b2->player); + else return (b1->opponent < b2->opponent); } /** @@ -316,30 +248,40 @@ bool board_equal(const Board *b1, const Board *b2) * @param s symetry * @param sym symetric output board */ -void board_symetry(const Board *board, const int s, Board *sym) +#if !defined(hasSSE2) && !defined(__ARM_NEON) // SSE version in board_sse.c +void board_horizontal_mirror(const Board *board, Board *sym) { - register unsigned long long player = board->player; - register unsigned long long opponent = board->opponent; + sym->player = horizontal_mirror(board->player); + sym->opponent = horizontal_mirror(board->opponent); +} - if (s & 1) { - player = horizontal_mirror(player); - opponent = horizontal_mirror(opponent); - } - if (s & 2) { - player = vertical_mirror(player); - opponent = vertical_mirror(opponent); - } - if (s & 4) { - player = transpose(player); - opponent = transpose(opponent); - } +void board_vertical_mirror(const Board *board, Board *sym) +{ 
+ sym->player = vertical_mirror(board->player); + sym->opponent = vertical_mirror(board->opponent); +} - sym->player = player; - sym->opponent = opponent; +void board_transpose(const Board *board, Board *sym) +{ + sym->player = transpose(board->player); + sym->opponent = transpose(board->opponent); +} + +void board_symetry(const Board *board, const int s, Board *sym) +{ + *sym = *board; + if (s & 1) + board_horizontal_mirror(sym, sym); + if (s & 2) + board_vertical_mirror(sym, sym); + if (s & 4) + board_transpose(sym, sym); board_check(sym); } +#endif +#ifndef __AVX2__ // AVX2 version in board_sse.c /** * @brief unique board * @@ -350,16 +292,22 @@ void board_symetry(const Board *board, const int s, Board *sym) */ int board_unique(const Board *board, Board *unique) { - Board sym; + Board sym[8]; int i, s = 0; - assert(board != unique); + board_horizontal_mirror(board, &sym[1]); + board_vertical_mirror(board, &sym[2]); + board_vertical_mirror(&sym[1], &sym[3]); + board_transpose(board, &sym[4]); + board_vertical_mirror(&sym[4], &sym[5]); // v-h reverted + board_horizontal_mirror(&sym[4], &sym[6]); + board_vertical_mirror(&sym[6], &sym[7]); *unique = *board; for (i = 1; i < 8; ++i) { - board_symetry(board, i, &sym); - if (board_compare(&sym, unique) < 0) { - *unique = sym; + // board_symetry(board, i, &sym); // moved to before loop to minimize symetry ops + if (board_lesser(&sym[i], unique)) { + *unique = sym[i]; s = i; } } @@ -367,6 +315,7 @@ int board_unique(const Board *board, Board *unique) board_check(unique); return s; } +#endif /** * @brief Get a random board by playing random moves. 
@@ -377,22 +326,22 @@ int board_unique(const Board *board, Board *unique) */ void board_rand(Board *board, int n_ply, Random *r) { - Move move[1]; + Move move; unsigned long long moves; int ply; board_init(board); for (ply = 0; ply < n_ply; ply++) { - moves = get_moves(board->player, board->opponent); + moves = board_get_moves(board); if (!moves) { board_pass(board); - moves = get_moves(board->player, board->opponent); + moves = board_get_moves(board); if (!moves) { break; } } - board_get_move(board, get_rand_bit(moves, r), move); - board_update(board, move); + board_get_move_flip(board, get_rand_bit(moves, r), &move); + board_update(board, &move); } } @@ -407,10 +356,10 @@ void board_rand(Board *board, int n_ply, Random *r) * @param move a Move structure remembering the modification. * @return the flipped discs. */ -unsigned long long board_get_move(const Board *board, const int x, Move *move) +unsigned long long board_get_move_flip(const Board *board, const int x, Move *move) { - move->flipped = flip[x](board->player, board->opponent); move->x = x; + move->flipped = board_flip(board, x); return move->flipped; } @@ -424,8 +373,8 @@ unsigned long long board_get_move(const Board *board, const int x, Move *move) bool board_check_move(const Board *board, Move *move) { if (move->x == PASS) return !can_move(board->player, board->opponent); - else if ((x_to_bit(move->x) & ~(board->player|board->opponent)) == 0) return false; - else if (move->flipped != flip[move->x](board->player, board->opponent)) return false; + else if (x_to_bit(move->x) & (board->player | board->opponent)) return false; + else if (move->flipped != board_flip(board, move->x)) return false; else return true; } @@ -436,14 +385,28 @@ bool board_check_move(const Board *board, Move *move) * according to the 'move' description. * * @param board the board to modify - * @param move A Move structure describing the modification. + * @param move A Move structure describing the modification (may be PASS). 
*/ void board_update(Board *board, const Move *move) { - board->player ^= (move->flipped | x_to_bit(move->x)); - board->opponent ^= move->flipped; - board_swap_players(board); +#if defined(hasSSE2) && (defined(HAS_CPU_64) || !defined(__3dNOW__)) // 3DNow CPU has fast emms, and possibly slow SSE + __m128i OP = _mm_loadu_si128((__m128i *) board); + OP = _mm_xor_si128(OP, _mm_or_si128(_mm_set1_epi64x(move->flipped), _mm_loadl_epi64((__m128i *) &X_TO_BIT[move->x]))); + _mm_storeu_si128((__m128i *) board, _mm_shuffle_epi32(OP, 0x4e)); + +#elif defined(hasMMX) + __m64 F = *(__m64 *) &move->flipped; + __m64 P = _m_pxor(*(__m64 *) &board->player, _m_por(F, *(__m64 *) &X_TO_BIT[move->x])); + __m64 O = _m_pxor(*(__m64 *) &board->opponent, F); + *(__m64 *) &board->player = O; + *(__m64 *) &board->opponent = P; + _mm_empty(); +#else + unsigned long long O = board->opponent; + board->opponent = board->player ^ (move->flipped | X_TO_BIT[move->x]); + board->player = O ^ move->flipped; +#endif board_check(board); } @@ -458,10 +421,24 @@ void board_update(Board *board, const Move *move) */ void board_restore(Board *board, const Move *move) { - board_swap_players(board); - board->player ^= (move->flipped | x_to_bit(move->x)); - board->opponent ^= move->flipped; +#if defined(hasSSE2) && (defined(HAS_CPU_64) || !defined(__3dNOW__)) + __m128i OP = _mm_shuffle_epi32(_mm_loadu_si128((__m128i *) board), 0x4e); + OP = _mm_xor_si128(OP, _mm_or_si128(_mm_set1_epi64x(move->flipped), _mm_loadl_epi64((__m128i *) &X_TO_BIT[move->x]))); + _mm_storeu_si128((__m128i *) board, OP); + +#elif defined(hasMMX) + __m64 F = *(__m64 *) &move->flipped; + __m64 P = *(__m64 *) &board->opponent; + __m64 O = *(__m64 *) &board->player; + *(__m64 *) &board->player = _m_pxor(P, _m_por(F, *(__m64 *) &X_TO_BIT[move->x])); + *(__m64 *) &board->opponent = _m_pxor(O, F); + _mm_empty(); +#else + unsigned long long P = board->player; + board->player = board->opponent ^ (move->flipped | X_TO_BIT[move->x]); + 
board->opponent = P ^ move->flipped; +#endif board_check(board); } @@ -475,48 +452,32 @@ void board_restore(Board *board, const Move *move) void board_pass(Board *board) { board_swap_players(board); + board_check(board); } +#if (MOVE_GENERATOR != MOVE_GENERATOR_AVX) && (MOVE_GENERATOR != MOVE_GENERATOR_AVX512) && (MOVE_GENERATOR != MOVE_GENERATOR_SSE) && (MOVE_GENERATOR != MOVE_GENERATOR_NEON) // SSE version in board_sse.c /** * @brief Compute a board resulting of a move played on a previous board. * * @param board board to play the move on. - * @param x move to play. + * @param x move to play (may be PASS). * @param next resulting board. * @return flipped discs. */ unsigned long long board_next(const Board *board, const int x, Board *next) { - const unsigned long long flipped = flip[x](board->player, board->opponent); + const unsigned long long flipped = board_flip(board, x); const unsigned long long player = board->opponent ^ flipped; - next->opponent = board->player ^ (flipped | x_to_bit(x)); + next->opponent = board->player ^ (flipped | X_TO_BIT[x]); next->player = player; return flipped; } +#endif -/** - * @brief Compute a board resulting of an opponent move played on a previous board. - * - * Compute the board after passing and playing a move. - * - * @param board board to play the move on. - * @param x opponent move to play. - * @param next resulting board. - * @return flipped discs. - */ -unsigned long long board_pass_next(const Board *board, const int x, Board *next) -{ - const unsigned long long flipped = flip[x](board->opponent, board->player); - - next->opponent = board->opponent ^ (flipped | x_to_bit(x)); - next->player = board->player ^ flipped; - - return flipped; -} - +#if !defined(hasSSE2) && !defined(__ARM_NEON) // SSE version in board_sse.c /** * @brief Get a part of the moves. * @@ -531,44 +492,50 @@ unsigned long long board_pass_next(const Board *board, const int x, Board *next) * @return some legal moves in a 64-bit unsigned integer. 
*/ static inline unsigned long long get_some_moves(const unsigned long long P, const unsigned long long mask, const int dir) +// x86 build will use helper for long long shift unless inlined { -#if PARALLEL_PREFIX & 1 +#if KOGGE_STONE & 1 + // kogge-stone algorithm + // 6 << + 6 >> + 12 & + 7 | + // + better instruction independency + unsigned long long flip_l, flip_r; + unsigned long long mask_l, mask_r; + int d; + + flip_l = flip_r = P; + mask_l = mask_r = mask; + d = dir; + + flip_l |= mask_l & (flip_l << d); flip_r |= mask_r & (flip_r >> d); + mask_l &= (mask_l << d); mask_r &= (mask_r >> d); + d <<= 1; + flip_l |= mask_l & (flip_l << d); flip_r |= mask_r & (flip_r >> d); + mask_l &= (mask_l << d); mask_r &= (mask_r >> d); + d <<= 1; + flip_l |= mask_l & (flip_l << d); flip_r |= mask_r & (flip_r >> d); + + return ((flip_l & mask) << dir) | ((flip_r & mask) >> dir); + +#elif PARALLEL_PREFIX & 1 // 1-stage Parallel Prefix (intermediate between kogge stone & sequential) // 6 << + 6 >> + 7 | + 10 & - register unsigned long long flip_l, flip_r; - register unsigned long long mask_l, mask_r; + unsigned long long flip_l, flip_r; + unsigned long long mask_l, mask_r; const int dir2 = dir + dir; flip_l = mask & (P << dir); flip_r = mask & (P >> dir); flip_l |= mask & (flip_l << dir); flip_r |= mask & (flip_r >> dir); - mask_l = mask & (mask << dir); mask_r = mask & (mask >> dir); + mask_l = mask & (mask << dir); mask_r = mask_l >> dir; flip_l |= mask_l & (flip_l << dir2); flip_r |= mask_r & (flip_r >> dir2); flip_l |= mask_l & (flip_l << dir2); flip_r |= mask_r & (flip_r >> dir2); return (flip_l << dir) | (flip_r >> dir); -#elif KOGGE_STONE & 1 - // kogge-stone algorithm - // 6 << + 6 >> + 12 & + 7 | - // + better instruction independency - register unsigned long long flip_l, flip_r; - register unsigned long long mask_l, mask_r; - const int dir2 = dir << 1; - const int dir4 = dir << 2; - - flip_l = P | (mask & (P << dir)); flip_r = P | (mask & (P >> dir)); - mask_l = mask 
& (mask << dir); mask_r = mask & (mask >> dir); - flip_l |= mask_l & (flip_l << dir2); flip_r |= mask_r & (flip_r >> dir2); - mask_l &= (mask_l << dir2); mask_r &= (mask_r >> dir2); - flip_l |= mask_l & (flip_l << dir4); flip_r |= mask_r & (flip_r >> dir4); - - return ((flip_l & mask) << dir) | ((flip_r & mask) >> dir); - #else // sequential algorithm // 7 << + 7 >> + 6 & + 12 | - register unsigned long long flip; + unsigned long long flip; flip = (((P << dir) | (P >> dir)) & mask); flip |= (((flip << dir) | (flip >> dir)) & mask); @@ -592,152 +559,26 @@ static inline unsigned long long get_some_moves(const unsigned long long P, cons */ unsigned long long get_moves(const unsigned long long P, const unsigned long long O) { -#if defined(USE_GAS_MMX) - /* mm7: P, mm6: O */ - const unsigned long long mask_7e = 0x7e7e7e7e7e7e7e7eULL; - - __asm__ volatile( - "movl %3, %%esi\n\t" "movq %1, %%mm7\n\t" - "movl %4, %%edi\n\t" "movq %2, %%mm6\n\t" - /* shift=+1 */ /* shift=+8 */ - "movl %%esi, %%eax\n\t" "movq %%mm7, %%mm0\n\t" - "movq %5, %%mm5\n\t" - "shrl $1, %%eax\n\t" "psrlq $8, %%mm0\n\t" - "andl $2122219134, %%edi\n\t" "pand %%mm6, %%mm5\n\t" - "andl %%edi, %%eax\n\t" "pand %%mm6, %%mm0\n\t" /* 0 m7&o6 m6&o5 .. m1&o0 */ - "movl %%eax, %%edx\n\t" "movq %%mm0, %%mm1\n\t" - "shrl $1, %%eax\n\t" "psrlq $8, %%mm0\n\t" - "movl %%edi, %%ecx\n\t" "movq %%mm6, %%mm3\n\t" - "andl %%edi, %%eax\n\t" "pand %%mm6, %%mm0\n\t" /* 0 0 m7&o6&o5 .. m2&o1&o0 */ - "shrl $1, %%ecx\n\t" "psrlq $8, %%mm3\n\t" - "orl %%edx, %%eax\n\t" "por %%mm1, %%mm0\n\t" /* 0 m7&o6 (m6&o5)|(m7&o6&o5) .. (m1&o0) */ - "andl %%edi, %%ecx\n\t" "pand %%mm6, %%mm3\n\t" /* 0 o7&o6 o6&o5 o5&o4 o4&o3 .. */ - "movl %%eax, %%edx\n\t" "movq %%mm0, %%mm4\n\t" - "shrl $2, %%eax\n\t" "psrlq $16, %%mm0\n\t" - "andl %%ecx, %%eax\n\t" "pand %%mm3, %%mm0\n\t" /* 0 0 0 m7&o6&o5&o4 (m6&o5&o4&o3)|(m7&o6&o5&o4&o3) .. 
*/ - "orl %%eax, %%edx\n\t" "por %%mm0, %%mm4\n\t" - "shrl $2, %%eax\n\t" "psrlq $16, %%mm0\n\t" - "andl %%ecx, %%eax\n\t" "pand %%mm3, %%mm0\n\t" /* 0 0 0 0 0 m7&o6&..&o2 (m6&o5&..&o1)|(m7&o6&..&o1) .. */ - "orl %%edx, %%eax\n\t" "por %%mm0, %%mm4\n\t" - "shrl $1, %%eax\n\t" "psrlq $8, %%mm4\n\t" /* result of +8 */ - /* shift=-1 */ /* shift=-8 */ - "movq %%mm7, %%mm0\n\t" - "addl %%esi, %%esi\n\t" "psllq $8, %%mm0\n\t" - "andl %%edi, %%esi\n\t" "pand %%mm6, %%mm0\n\t" - "movl %%esi, %%edx\n\t" "movq %%mm0, %%mm1\n\t" - "addl %%esi, %%esi\n\t" "psllq $8, %%mm0\n\t" - "andl %%edi, %%esi\n\t" "pand %%mm6, %%mm0\n\t" - "orl %%esi, %%edx\n\t" "por %%mm1, %%mm0\n\t" - "addl %%ecx, %%ecx\n\t" "psllq $8, %%mm3\n\t" - "movq %%mm0, %%mm1\n\t" - "leal (,%%edx,4), %%esi\n\t" "psllq $16, %%mm0\n\t" - "andl %%ecx, %%esi\n\t" "pand %%mm3, %%mm0\n\t" - "orl %%esi, %%edx\n\t" "por %%mm0, %%mm1\n\t" - "shll $2, %%esi\n\t" "psllq $16, %%mm0\n\t" - "andl %%ecx, %%esi\n\t" "pand %%mm3, %%mm0\n\t" - "orl %%edx, %%esi\n\t" "por %%mm1, %%mm0\n\t" - "addl %%esi, %%esi\n\t" "psllq $8, %%mm0\n\t" - "orl %%eax, %%esi\n\t" "por %%mm0, %%mm4\n\t" - /* Serialize */ /* shift=+7 */ - "movq %%mm7, %%mm0\n\t" - "movd %%esi, %%mm1\n\t" - "psrlq $7, %%mm0\n\t" - "psllq $32, %%mm1\n\t" - "pand %%mm5, %%mm0\n\t" - "por %%mm1, %%mm4\n\t" - "movq %%mm0, %%mm1\n\t" - "psrlq $7, %%mm0\n\t" - "pand %%mm5, %%mm0\n\t" - "movq %%mm5, %%mm3\n\t" - "por %%mm1, %%mm0\n\t" - "psrlq $7, %%mm3\n\t" - "movq %%mm0, %%mm1\n\t" - "pand %%mm5, %%mm3\n\t" - "psrlq $14, %%mm0\n\t" - "pand %%mm3, %%mm0\n\t" - "movl %1, %%esi\n\t" "por %%mm0, %%mm1\n\t" - "movl %2, %%edi\n\t" "psrlq $14, %%mm0\n\t" - "andl $2122219134, %%edi\n\t" "pand %%mm3, %%mm0\n\t" - "movl %%edi, %%ecx\n\t" "por %%mm1, %%mm0\n\t" - "shrl $1, %%ecx\n\t" "psrlq $7, %%mm0\n\t" - "andl %%edi, %%ecx\n\t" "por %%mm0, %%mm4\n\t" - /* shift=+1 */ /* shift=-7 */ - "movl %%esi, %%eax\n\t" "movq %%mm7, %%mm0\n\t" - "shrl $1, %%eax\n\t" "psllq $7, %%mm0\n\t" - 
"andl %%edi, %%eax\n\t" "pand %%mm5, %%mm0\n\t" - "movl %%eax, %%edx\n\t" "movq %%mm0, %%mm1\n\t" - "shrl $1, %%eax\n\t" "psllq $7, %%mm0\n\t" - "andl %%edi, %%eax\n\t" "pand %%mm5, %%mm0\n\t" - "orl %%edx, %%eax\n\t" "por %%mm1, %%mm0\n\t" - "psllq $7, %%mm3\n\t" - "movl %%eax, %%edx\n\t" "movq %%mm0, %%mm1\n\t" - "shrl $2, %%eax\n\t" "psllq $14, %%mm0\n\t" - "andl %%ecx, %%eax\n\t" "pand %%mm3, %%mm0\n\t" - "orl %%eax, %%edx\n\t" "por %%mm0, %%mm1\n\t" - "shrl $2, %%eax\n\t" "psllq $14, %%mm0\n\t" - "andl %%ecx, %%eax\n\t" "pand %%mm3, %%mm0\n\t" - "orl %%edx, %%eax\n\t" "por %%mm1, %%mm0\n\t" - "shrl $1, %%eax\n\t" "psllq $7, %%mm0\n\t" - "por %%mm0, %%mm4\n\t" - /* shift=-1 */ /* shift=+9 */ - "movq %%mm7, %%mm0\n\t" - "addl %%esi, %%esi\n\t" "psrlq $9, %%mm0\n\t" - "andl %%edi, %%esi\n\t" "pand %%mm5, %%mm0\n\t" - "movl %%esi, %%edx\n\t" "movq %%mm0, %%mm1\n\t" - "addl %%esi, %%esi\n\t" "psrlq $9, %%mm0\n\t" - "andl %%edi, %%esi\n\t" "pand %%mm5, %%mm0\n\t" - "movq %%mm5, %%mm3\n\t" - "orl %%esi, %%edx\n\t" "por %%mm1, %%mm0\n\t" - "psrlq $9, %%mm3\n\t" - "movq %%mm0, %%mm1\n\t" - "addl %%ecx, %%ecx\n\t" "pand %%mm5, %%mm3\n\t" - "leal (,%%edx,4), %%esi\n\t" "psrlq $18, %%mm0\n\t" - "andl %%ecx, %%esi\n\t" "pand %%mm3, %%mm0\n\t" - "orl %%esi, %%edx\n\t" "por %%mm0, %%mm1\n\t" - "shll $2, %%esi\n\t" "psrlq $18, %%mm0\n\t" - "andl %%ecx, %%esi\n\t" "pand %%mm3, %%mm0\n\t" - "orl %%edx, %%esi\n\t" "por %%mm1, %%mm0\n\t" - "addl %%esi, %%esi\n\t" "psrlq $9, %%mm0\n\t" - "orl %%eax, %%esi\n\t" "por %%mm0, %%mm4\n\t" - /* Serialize */ /* shift=-9 */ - "movq %%mm7, %%mm0\n\t" - "movd %%esi, %%mm1\n\t" - "psllq $9, %%mm0\n\t" - "por %%mm1, %%mm4\n\t" - "pand %%mm5, %%mm0\n\t" - "movq %%mm0, %%mm1\n\t" - "psllq $9, %%mm0\n\t" - "pand %%mm5, %%mm0\n\t" - "por %%mm1, %%mm0\n\t" - "psllq $9, %%mm3\n\t" - "movq %%mm0, %%mm1\n\t" - "psllq $18, %%mm0\n\t" - "pand %%mm3, %%mm0\n\t" - "por %%mm0, %%mm1\n\t" - "psllq $18, %%mm0\n\t" - "pand %%mm3, %%mm0\n\t" - "por %%mm1, 
%%mm0\n\t" - "psllq $9, %%mm0\n\t" - "por %%mm0, %%mm4\n\t" - /* mm4 is the pseudo-feasible moves at this point. */ - /* Let mm7 be the feasible moves, i.e., mm4 restricted to empty squares. */ - "por %%mm6, %%mm7\n\t" - "pandn %%mm4, %%mm7\n\t" - "movq %%mm7, %0\n\t" - "emms" /* Reset the FP/MMX unit. */ - : "=g" (moves) : "m" (P), "m" (O), "g" (P >> 32), "g" (O >> 32), "m" (mask_7e) : "eax", "edx", "ecx", "esi", "edi" ); + unsigned long long moves, OM; -#else - const unsigned long long mask = O & 0x7E7E7E7E7E7E7E7Eull; + #if defined(USE_GAS_MMX) || defined(USE_MSVC_X86) || defined(DISPATCH_NEON) + if (hasSSE2) + return get_moves_sse(P, O); + #endif + #if defined(USE_GAS_MMX) || defined(USE_MSVC_X86) + if (hasMMX) + return get_moves_mmx(P, O); + #endif - return (get_some_moves(P, mask, 1) // horizontal + OM = O & 0x7e7e7e7e7e7e7e7e; + moves = ( get_some_moves(P, OM, 1) // horizontal | get_some_moves(P, O, 8) // vertical - | get_some_moves(P, mask, 7) // diagonals - | get_some_moves(P, mask, 9)) - & ~(P|O); // mask with empties + | get_some_moves(P, OM, 7) // diagonals + | get_some_moves(P, OM, 9)); -#endif + return moves & ~(P|O); // mask with empties } +#endif // hasSSE2/__ARM_NEON /** * @brief Get legal moves on a 6x6 board. 
@@ -750,13 +591,9 @@ unsigned long long get_moves(const unsigned long long P, const unsigned long lon */ unsigned long long get_moves_6x6(const unsigned long long P, const unsigned long long O) { - const unsigned long long E = (~(P|O) & 0x007E7E7E7E7E7E00ull); // empties - - return ((get_some_moves(P, O & 0x003C3C3C3C3C3C00ull, 1) // horizontal - | get_some_moves(P, O & 0x00007E7E7E7E0000ull, 8) // vertical - | get_some_moves(P, O & 0x00003C3C3C3C0000ull, 7) // diagonals - | get_some_moves(P, O & 0x00003C3C3C3C0000ull, 9)) - & E); // mask with empties + unsigned long long PM = P & 0x007E7E7E7E7E7E00; + unsigned long long OM = O & 0x007E7E7E7E7E7E00; + return get_moves(PM, OM) & 0x007E7E7E7E7E7E00; } /** @@ -768,12 +605,18 @@ unsigned long long get_moves_6x6(const unsigned long long P, const unsigned long */ bool can_move(const unsigned long long P, const unsigned long long O) { +#if defined(hasMMX) || defined(__ARM_NEON) + return get_moves(P, O) != 0; + +#else const unsigned long long E = ~(P|O); // empties + const unsigned long long OM = O & 0x7E7E7E7E7E7E7E7E; - return (get_some_moves(P, O & 0x007E7E7E7E7E7E00ull, 7) & E) // diagonals - || (get_some_moves(P, O & 0x007E7E7E7E7E7E00ull, 9) & E) - || (get_some_moves(P, O & 0x7E7E7E7E7E7E7E7Eull, 1) & E) // horizontal - || (get_some_moves(P, O & 0x00FFFFFFFFFFFF00ull, 8) & E); // vertical + return (get_some_moves(P, OM, 7) & E) // diagonals + || (get_some_moves(P, OM, 9) & E) + || (get_some_moves(P, OM, 1) & E) // horizontal + || (get_some_moves(P, O, 8) & E); // vertical +#endif } /** @@ -785,12 +628,7 @@ bool can_move(const unsigned long long P, const unsigned long long O) */ bool can_move_6x6(const unsigned long long P, const unsigned long long O) { - const unsigned long long E = (~(P|O) & 0x007E7E7E7E7E7E00ull); // empties - - return (get_some_moves(P, O & 0x00003C3C3C3C0000ull, 7) & E) // diagonals - || (get_some_moves(P, O & 0x00003C3C3C3C0000ull, 9) & E) - || (get_some_moves(P, O & 0x003C3C3C3C3C3C00ull, 1) & 
E) // horizontal - || (get_some_moves(P, O & 0x00007E7E7E7E0000ull, 8) & E); // vertical + return get_moves_6x6(P, O) != 0; } /** @@ -807,21 +645,17 @@ int get_mobility(const unsigned long long P, const unsigned long long O) return bit_count(get_moves(P, O)); } -int get_weighted_mobility(const unsigned long long P, const unsigned long long O) -{ - return bit_weighted_count(get_moves(P, O)); -} - +#ifndef __AVX2__ // AVX2 version in board_sse.c /** * @brief Get some potential moves. * - * @param P bitboard with player's discs. + * @param O bitboard with opponent's discs. * @param dir flipping direction. * @return some potential moves in a 64-bit unsigned integer. */ -static inline unsigned long long get_some_potential_moves(const unsigned long long P, const int dir) +static inline unsigned long long get_some_potential_moves(const unsigned long long O, const int dir) { - return (P << dir | P >> dir); + return (O << dir | O >> dir); } /** @@ -833,28 +667,15 @@ static inline unsigned long long get_some_potential_moves(const unsigned long lo * @param O bitboard with opponent's discs. * @return all potential moves in a 64-bit unsigned integer. */ -static unsigned long long get_potential_moves(const unsigned long long P, const unsigned long long O) +unsigned long long get_potential_moves(const unsigned long long P, const unsigned long long O) { - return (get_some_potential_moves(O & 0x7E7E7E7E7E7E7E7Eull, 1) // horizontal - | get_some_potential_moves(O & 0x00FFFFFFFFFFFF00ull, 8) // vertical - | get_some_potential_moves(O & 0x007E7E7E7E7E7E00ull, 7) // diagonals - | get_some_potential_moves(O & 0x007E7E7E7E7E7E00ull, 9)) + return (get_some_potential_moves(O & 0x7E7E7E7E7E7E7E7E, 1) // horizontal + | get_some_potential_moves(O & 0x00FFFFFFFFFFFF00, 8) // vertical + | get_some_potential_moves(O & 0x007E7E7E7E7E7E00, 7) // diagonals + | get_some_potential_moves(O & 0x007E7E7E7E7E7E00, 9)) & ~(P|O); // mask with empties } - -/** - * @brief Get potential mobility. 
- * - * Count the list of empty squares in contact of a player square. - * - * @param P bitboard with player's discs. - * @param O bitboard with opponent's discs. - * @return a count of potential moves. - */ -int get_potential_mobility(const unsigned long long P, const unsigned long long O) -{ - return bit_weighted_count(get_potential_moves(P, O)); -} +#endif // AVX2 /** * @brief search stable edge patterns. @@ -867,53 +688,53 @@ int get_potential_mobility(const unsigned long long P, const unsigned long long */ static int find_edge_stable(const int old_P, const int old_O, int stable) { - register int P, O, x, y; + int P, O, O2, X, F; const int E = ~(old_P | old_O); // empties stable &= old_P; // mask stable squares with remaining player squares. if (!stable || E == 0) return stable; - for (x = 0; x < 8; ++x) { - if (E & x_to_bit(x)) { //is x an empty square ? + for (X = 0x01; X <= 0x80; X <<= 1) { + if (E & X) { // is x an empty square ? O = old_O; - P = old_P | x_to_bit(x); // player plays on it - if (x > 1) { // flip left discs - for (y = x - 1; y > 0 && (O & x_to_bit(y)); --y) ; - if (P & x_to_bit(y)) { - for (y = x - 1; y > 0 && (O & x_to_bit(y)); --y) { - O ^= x_to_bit(y); P ^= x_to_bit(y); - } - } - } - if (x < 6) { // flip right discs - for (y = x + 1; y < 8 && (O & x_to_bit(y)); ++y) ; - if (P & x_to_bit(y)) { - for (y = x + 1; y < 8 && (O & x_to_bit(y)); ++y) { - O ^= x_to_bit(y); P ^= x_to_bit(y); - } - } + P = old_P | X; // player plays on it + if (X > 0x02) { // flip left discs (using parallel prefix) + F = O & (X >> 1); + F |= O & (F >> 1); + O2 = O & (O >> 1); + F |= O2 & (F >> 2); + F |= O2 & (F >> 2); + F &= -(P & (F >> 1)); + O ^= F; + P ^= F; } + // if (X < 0x40) { // flip right discs (using carry propagation) + F = (O + X + X) & P; + F -= (X + X) & -(int)(F != 0); + O ^= F; + P ^= F; + // } stable = find_edge_stable(P, O, stable); // next move if (!stable) return stable; P = old_P; - O = old_O | x_to_bit(x); // opponent plays on it - if (x > 1) { 
- for (y = x - 1; y > 0 && (P & x_to_bit(y)); --y) ; - if (O & x_to_bit(y)) { - for (y = x - 1; y > 0 && (P & x_to_bit(y)); --y) { - O ^= x_to_bit(y); P ^= x_to_bit(y); - } - } - } - if (x < 6) { - for (y = x + 1; y < 8 && (P & x_to_bit(y)); ++y) ; - if (O & x_to_bit(y)) { - for (y = x + 1; y < 8 && (P & x_to_bit(y)); ++y) { - O ^= x_to_bit(y); P ^= x_to_bit(y); - } - } + O = old_O | X; // opponent plays on it + if (X > 0x02) { // flip left discs (using parallel prefix) + F = P & (X >> 1); + F |= P & (F >> 1); + O2 = P & (P >> 1); + F |= O2 & (F >> 2); + F |= O2 & (F >> 2); + F &= -(O & (F >> 1)); + O ^= F; + P ^= F; } + // if (X < 0x40) { // flip right discs (using carry propagation) + F = (P + X + X) & O; + F -= (X + X) & -(int)(F != 0); + O ^= F; + P ^= F; + // } stable = find_edge_stable(P, O, stable); // next move if (!stable) return stable; } @@ -923,109 +744,145 @@ static int find_edge_stable(const int old_P, const int old_O, int stable) } /** - * @brief Initialize the edge stability tables. + * @brief Initialize the edge stability table. 
*/ void edge_stability_init(void) { - int P, O; + int P, O, PO, rPO; + // long long t = cpu_clock(); - for (P = 0; P < 256; ++P) - for (O = 0; O < 256; ++O) { + for (PO = 0; PO < 256 * 256; ++PO) { + P = PO >> 8; + O = PO & 0xFF; if (P & O) { // illegal positions - edge_stability[P][O] = 0; + edge_stability[PO] = 0; } else { - edge_stability[P][O] = find_edge_stable(P, O, P); + rPO = horizontal_mirror_32(PO); + if (PO > rPO) + edge_stability[PO] = mirror_byte(edge_stability[rPO]); + else + edge_stability[PO] = find_edge_stable(P, O, P); } } + // printf("edge_stability_init: %d\n", (int)(cpu_clock() - t)); } +#ifdef HAS_CPU_64 +#define packA1A8(X) ((((X) & 0x0101010101010101) * 0x0102040810204080) >> 56) +#define packH1H8(X) ((((X) & 0x8080808080808080) * 0x0002040810204081) >> 56) +#else +#define packA1A8(X) (((((unsigned int)(X) & 0x01010101) + (((unsigned int)((X) >> 32) & 0x01010101) << 4)) * 0x01020408) >> 24) +#define packH1H8(X) (((((unsigned int)((X) >> 32) & 0x80808080) + (((unsigned int)(X) & 0x80808080) >> 4)) * 0x00204081) >> 24) +#endif + +#if !defined(hasSSE2) && !defined(__ARM_NEON) /** - * @brief Get full lines. + * @brief Get stable edge. + * + * Compute the exact stable edges from precomputed tables. + * + * @param P bitboard with player's discs. + * @param O bitboard with opponent's discs. + * @return a bitboard with (some of) player's stable discs. * - * @param line all discs on a line. - * @param dir tested direction - * @return a bitboard with full lines along the tested direction. 
*/ -static inline unsigned long long get_full_lines(const unsigned long long line, const int dir) -{ -#if KOGGE_STONE & 2 - - // kogge-stone algorithm - // 5 << + 5 >> + 7 & + 10 | - // + better instruction independency - register unsigned long long full_l, full_r, edge_l, edge_r; - const unsigned long long edge = 0xff818181818181ffULL; - const int dir2 = dir << 1; - const int dir4 = dir << 2; +unsigned long long get_stable_edge(const unsigned long long P, const unsigned long long O) +{ // compute the exact stable edges (from precomputed tables) + return edge_stability[((unsigned int) P & 0xff) * 256 + ((unsigned int) O & 0xff)] + | (unsigned long long) edge_stability[(unsigned int) (P >> 56) * 256 + (unsigned int) (O >> 56)] << 56 + | unpackA2A7(edge_stability[packA1A8(P) * 256 + packA1A8(O)]) + | unpackH2H7(edge_stability[packH1H8(P) * 256 + packH1H8(O)]); +} - full_l = line & (edge | (line >> dir)); full_r = line & (edge | (line << dir)); - edge_l = edge | (edge >> dir); edge_r = edge | (edge << dir); - full_l &= edge_l | (full_l >> dir2); full_r &= edge_r | (full_r << dir2); - edge_l |= edge_l >> dir2; edge_r |= edge_r << dir2; - full_l &= edge_l | (full_l >> dir4); full_r &= edge_r | (full_r << dir4); +/** + * @brief Estimate the stability of edges. + * + * Count the number (in fact a lower estimate) of stable discs on the edges. + * + * @param P bitboard with player's discs. + * @param O bitboard with opponent's discs. + * @return the number of stable discs on the edges. 
+ */ +int get_edge_stability(const unsigned long long P, const unsigned long long O) +{ + unsigned int packedstable = edge_stability[((unsigned int) P & 0xff) * 256 + ((unsigned int) O & 0xff)] + | edge_stability[(unsigned int) (P >> 56) * 256 + (unsigned int) (O >> 56)] << 8 + | edge_stability[packA1A8(P) * 256 + packA1A8(O)] << 16 + | edge_stability[packH1H8(P) * 256 + packH1H8(O)] << 24; + return bit_count_32(packedstable & 0xffff7e7e); +} +#endif - return full_r & full_l; +/** + * @brief Get full lines. + * + * @param disc all discs on the board. + * @param full all 1 if full line, otherwise all 0. + */ -#elif PARALLEL_PREFIX & 2 +#if !defined(__ARM_NEON) && !defined(hasSSE2) && !defined(hasMMX) + #ifdef HAS_CPU_64 - // 1-stage Parallel Prefix (intermediate between kogge stone & sequential) - // 5 << + 5 >> + 7 & + 10 | - register unsigned long long full_l, full_r; - register unsigned long long edge_l, edge_r; - const unsigned long long edge = 0xff818181818181ffULL; - const int dir2 = dir + dir; +static unsigned long long get_full_lines_h(unsigned long long full) +{ + full &= full >> 1; + full &= full >> 2; + full &= full >> 4; + return (full & 0x0101010101010101) * 0xff; +} - full_l = edge | (line << dir); full_r = edge | (line >> dir); - full_l &= edge | (full_l << dir); full_r &= edge | (full_r >> dir); - edge_l = edge | (edge << dir); edge_r = edge | (edge >> dir); - full_l &= edge_l | (full_l << dir2); full_r &= edge_r | (full_r >> dir2); - full_l &= edge_l | (full_l << dir2); full_r &= edge_r | (full_r >> dir2); +static unsigned long long get_full_lines_v(unsigned long long full) +{ + full &= (full >> 8) | (full << 56); // ror 8 + full &= (full >> 16) | (full << 48); // ror 16 + full &= (full >> 32) | (full << 32); // ror 32 + return full; +} - return full_l & full_r; + #else -#else +static unsigned int get_full_lines_h_32(unsigned int full) +{ + full &= full >> 1; + full &= full >> 2; + full &= full >> 4; + return (full & 0x01010101) * 0xff; +} - // 
sequential algorithm - // 6 << + 6 >> + 12 & + 5 | - register unsigned long long full; - const unsigned long long edge = line & 0xff818181818181ffULL; +static unsigned long long get_full_lines_h(unsigned long long full) +{ + return ((unsigned long long) get_full_lines_h_32(full >> 32) << 32) | get_full_lines_h_32(full); +} - full = (line & (((line >> dir) & (line << dir)) | edge)); - full &= (((full >> dir) & (full << dir)) | edge); - full &= (((full >> dir) & (full << dir)) | edge); - full &= (((full >> dir) & (full << dir)) | edge); - full &= (((full >> dir) & (full << dir)) | edge); +static unsigned long long get_full_lines_v(unsigned long long full) +{ + unsigned int t = (unsigned int) full & (unsigned int)(full >> 32); + t &= (t >> 16) | (t << 16); // ror 16 + t &= (t >> 8) | (t << 24); // ror 8 + return t | ((unsigned long long) t << 32); +} - return ((full >> dir) & (full << dir)); + #endif -#endif -} +void get_full_lines(const unsigned long long disc, unsigned long long full[4]) +{ + unsigned long long l7, l9, r7, r9; // full lines + full[0] = get_full_lines_h(disc); + full[1] = get_full_lines_v(disc); -#ifdef __X86_64__ -#define packA1A8(X) ((((X) & 0x0101010101010101ULL) * 0x0102040810204080ULL) >> 56) -#define packH1H8(X) ((((X) & 0x8080808080808080ULL) * 0x0002040810204081ULL) >> 56) -#else -#define packA1A8(X) (((((unsigned int)(X) & 0x01010101u) + (((unsigned int)((X) >> 32) & 0x01010101u) << 4)) * 0x01020408u) >> 24) -#define packH1H8(X) (((((unsigned int)((X) >> 32) & 0x80808080u) + (((unsigned int)(X) & 0x80808080u) >> 4)) * 0x00204081u) >> 24) -#endif + l7 = r7 = disc; + l7 &= 0xff01010101010101 | (l7 >> 7); r7 &= 0x80808080808080ff | (r7 << 7); + l7 &= 0xffff030303030303 | (l7 >> 14); r7 &= 0xc0c0c0c0c0c0ffff | (r7 << 14); + l7 &= 0xffffffff0f0f0f0f | (l7 >> 28); r7 &= 0xf0f0f0f0ffffffff | (r7 << 28); + full[3] = l7 & r7; -/** - * @brief Get stable edge. - * - * @param P bitboard with player's discs. - * @param O bitboard with opponent's discs. 
- * @return a bitboard with (some of) player's stable discs. - * - */ -static inline unsigned long long get_stable_edge(const unsigned long long P, const unsigned long long O) -{ - // compute the exact stable edges (from precomputed tables) - return edge_stability[P & 0xff][O & 0xff] - | ((unsigned long long)edge_stability[P >> 56][O >> 56]) << 56 - | A1_A8[edge_stability[packA1A8(P)][packA1A8(O)]] - | H1_H8[edge_stability[packH1H8(P)][packH1H8(O)]]; + l9 = r9 = disc; + l9 &= 0xff80808080808080 | (l9 >> 9); r9 &= 0x01010101010101ff | (r9 << 9); + l9 &= 0xffffc0c0c0c0c0c0 | (l9 >> 18); r9 &= 0x030303030303ffff | (r9 << 18); + full[2] = l9 & r9 & (0x0f0f0f0ff0f0f0f0 | (l9 >> 36) | (r9 << 36)); } +#endif // __ARM_NEON/hasSSE2/hasMMX /** * @brief Estimate the stability. @@ -1036,49 +893,72 @@ static inline unsigned long long get_stable_edge(const unsigned long long P, con * @param O bitboard with opponent's discs. * @return the number of stable discs. */ -int get_stability(const unsigned long long P, const unsigned long long O) +#ifndef __AVX2__ // AVX2 version in board_sse.c + #if !(defined(hasMMX) && !defined(hasSSE2)) // MMX version of get_stability in board_mmx.c + #if !(defined(hasSSE2) && !defined(HAS_CPU_64)) // 32bit SSE version in board_sse.c +// compute the other stable discs (ie discs touching another stable disc in each flipping direction). 
+int get_spreaded_stability(unsigned long long stable, unsigned long long P_central, unsigned long long full[4]) { - const unsigned long long disc = (P | O); - const unsigned long long central_mask = (P & 0x007e7e7e7e7e7e00ULL); - const unsigned long long full_h = get_full_lines(disc, 1); - const unsigned long long full_v = get_full_lines(disc, 8); - const unsigned long long full_d7 = get_full_lines(disc, 7); - const unsigned long long full_d9 = get_full_lines(disc, 9); - register unsigned long long stable_h, stable_v, stable_d7, stable_d9, stable, new_stable; - - // compute the exact stable edges (from precomputed tables) - new_stable = get_stable_edge(P, O); - - // add full lines - new_stable |= (full_h & full_v & full_d7 & full_d9 & central_mask); - - // now compute the other stable discs (ie discs touching another stable disc in each flipping direction). - stable = 0; - while (new_stable & ~stable) { - stable |= new_stable; - stable_h = ((stable >> 1) | (stable << 1) | full_h); - stable_v = ((stable >> 8) | (stable << 8) | full_v); - stable_d7 = ((stable >> 7) | (stable << 7) | full_d7); - stable_d9 = ((stable >> 9) | (stable << 9) | full_d9); - new_stable = (stable_h & stable_v & stable_d7 & stable_d9 & central_mask); - } + unsigned long long stable_h, stable_v, stable_d7, stable_d9, old_stable; + + if (stable == 0) // (2%) + return 0; + + do { + old_stable = stable; + stable_h = ((stable >> 1) | (stable << 1) | full[0]); + stable_v = ((stable >> 8) | (stable << 8) | full[1]); + stable_d9 = ((stable >> 9) | (stable << 9) | full[2]); + stable_d7 = ((stable >> 7) | (stable << 7) | full[3]); + stable |= (stable_h & stable_v & stable_d9 & stable_d7 & P_central); + } while (stable != old_stable); // (44%) return bit_count(stable); } + #endif + +// returns stability count only +int get_stability(const unsigned long long P, const unsigned long long O) +{ + unsigned long long stable = get_stable_edge(P, O); // compute the exact stable edges + unsigned long long 
P_central = P & 0x007e7e7e7e7e7e00; + unsigned long long full[4]; + + get_full_lines(P | O, full); // add full lines + stable |= (P_central & full[0] & full[1] & full[2] & full[3]); + + return get_spreaded_stability(stable, P_central, full); // compute the other stable discs +} + +// returns all full in full[4] in addition to stability count +int get_stability_fulls(const unsigned long long P, const unsigned long long O, unsigned long long full[5]) +{ + unsigned long long stable = get_stable_edge(P, O); // compute the exact stable edges + unsigned long long P_central = P & 0x007e7e7e7e7e7e00; + + get_full_lines(P | O, full); // add full lines + full[4] = full[0] & full[1] & full[2] & full[3]; + stable |= (P_central & full[4]); + + return get_spreaded_stability(stable, P_central, full); // compute the other stable discs +} + #endif /** - * @brief Estimate the stability of edges. + * @brief Get intersection of full lines. * - * Count the number (in fact a lower estimate) of stable discs on the edges. + * Get intersection of full lines. * - * @param P bitboard with player's discs. - * @param O bitboard with opponent's discs. - * @return the number of stable discs on the edges. + * @param disc bitboard with occupied discs. + * @return the intersection of full lines. */ -int get_edge_stability(const unsigned long long P, const unsigned long long O) +unsigned long long get_all_full_lines(const unsigned long long disc) { - return bit_count(get_stable_edge(P, O)); + unsigned long long full[4]; + get_full_lines(disc, full); + return full[0] & full[1] & full[2] & full[3]; } +#endif // __AVX2__ /** * @brief Estimate corner stability. 
@@ -1093,8 +973,40 @@ int get_edge_stability(const unsigned long long P, const unsigned long long O) */ int get_corner_stability(const unsigned long long P) { - const unsigned long long stable = ((((0x0100000000000001ULL & P) << 1) | ((0x8000000000000080ULL & P) >> 1) | ((0x0000000000000081ULL & P) << 8) | ((0x8100000000000000ULL & P) >> 8) | 0x8100000000000081ULL) & P); - return bit_count(stable); +#ifdef POPCOUNT + // stable = (((0x0100000000000001 & P) << 1) | ((0x8000000000000080 & P) >> 1) | ((0x0000000000000081 & P) << 8) | ((0x8100000000000000 & P) >> 8) | 0x8100000000000081) & P; + unsigned int P2187 = (P >> 48) | (P << 16); // ror 48 + unsigned int stable = 0x00818100 & P2187; + stable |= ((((stable * 5) >> 1) & 0x00424200) | (stable << 8) | (stable >> 8)) & P2187; // 1-8 alias does not matter since corner is stable anyway + return bit_count_32(stable); + +#else // kindergarten + static const char n_stable_h2a2h1g1b1a1[64] = { + 0, 1, 0, 2, 0, 1, 0, 2, 1, 2, 1, 3, 2, 3, 2, 4, + 0, 2, 0, 3, 0, 2, 0, 3, 1, 3, 1, 4, 2, 4, 2, 5, + 0, 1, 0, 2, 0, 1, 0, 2, 2, 3, 2, 4, 3, 4, 3, 5, + 0, 2, 0, 3, 0, 2, 0, 3, 2, 4, 2, 5, 3, 5, 3, 6 + }; + + #if 0 // defined(__BMI2__) && !defined(__bdver4__) && !defined(__znver1__) && !defined(__znver2__) // BMI2 CPU has POPCOUNT + int cnt = n_stable_h2a2h1g1b1a1[_pext_u32((unsigned int) vertical_mirror(P), 0x000081c3)] + + n_stable_h2a2h1g1b1a1[_pext_u32((unsigned int) P, 0x000081c3)]; + + #else + static const char n_stable_h8g8b8a8h7a7[64] = { + 0, 0, 0, 0, 1, 2, 1, 2, 0, 0, 0, 0, 2, 3, 2, 3, + 0, 0, 0, 0, 1, 2, 1, 2, 0, 0, 0, 0, 2, 3, 2, 3, + 1, 1, 2, 2, 2, 3, 3, 4, 1, 1, 2, 2, 3, 4, 4, 5, + 2, 2, 3, 3, 3, 4, 4, 5, 2, 2, 3, 3, 4, 5, 5, 6 + }; + + int cnt = n_stable_h8g8b8a8h7a7[(((unsigned int) (P >> 32) & 0xc3810000) * 0x00000411) >> 26] + + n_stable_h2a2h1g1b1a1[(((unsigned int) P & 0x000081c3) * 0x04410000) >> 26]; + #endif + // assert(cnt == bit_count((((0x0100000000000001 & P) << 1) | ((0x8000000000000080 & P) >> 1) | 
((0x0000000000000081 & P) << 8) | ((0x8100000000000000 & P) >> 8) | 0x8100000000000081) & P)); + return cnt; + +#endif } /** @@ -1105,27 +1017,8 @@ int get_corner_stability(const unsigned long long P) */ unsigned long long board_get_hash_code(const Board *board) { - unsigned long long h1, h2; - const unsigned char *p = (const unsigned char*)board; - - h1 = hash_rank[0][p[0]]; - h2 = hash_rank[1][p[1]]; - h1 ^= hash_rank[2][p[2]]; - h2 ^= hash_rank[3][p[3]]; - h1 ^= hash_rank[4][p[4]]; - h2 ^= hash_rank[5][p[5]]; - h1 ^= hash_rank[6][p[6]]; - h2 ^= hash_rank[7][p[7]]; - h1 ^= hash_rank[8][p[8]]; - h2 ^= hash_rank[9][p[9]]; - h1 ^= hash_rank[10][p[10]]; - h2 ^= hash_rank[11][p[11]]; - h1 ^= hash_rank[12][p[12]]; - h2 ^= hash_rank[13][p[13]]; - h1 ^= hash_rank[14][p[14]]; - h2 ^= hash_rank[15][p[15]]; - - return h1 ^ h2; + unsigned long long crc = crc32c_u64(0, board->player); + return (crc << 32) | crc32c_u64(crc, board->opponent); } /** @@ -1139,7 +1032,8 @@ unsigned long long board_get_hash_code(const Board *board) */ int board_get_square_color(const Board *board, const int x) { - return 2 - 2 * ((board->player >> x) & 1) - ((board->opponent >> x) & 1); + unsigned long long b = x_to_bit(x); + return (int) ((board->player & b) == 0) * 2 - (int) ((board->opponent & b) != 0); } /** @@ -1151,7 +1045,7 @@ int board_get_square_color(const Board *board, const int x) */ bool board_is_occupied(const Board *board, const int x) { - return (board->player | board->opponent) & x_to_bit(x); + return ((board->player | board->opponent) & x_to_bit(x)) != 0; // omitting != 0 causes bogus code on MSVC19 /GL } /** @@ -1162,8 +1056,8 @@ bool board_is_occupied(const Board *board, const int x) */ bool board_is_pass(const Board *board) { - return can_move(board->player, board->opponent) == false && - can_move(board->opponent, board->player) == true; + return !can_move(board->player, board->opponent) && + can_move(board->opponent, board->player); } /** @@ -1174,8 +1068,8 @@ bool 
board_is_pass(const Board *board) */ bool board_is_game_over(const Board *board) { - return can_move(board->player, board->opponent) == false && - can_move(board->opponent, board->player) == false; + return !can_move(board->player, board->opponent) && + !can_move(board->opponent, board->player); } @@ -1201,31 +1095,42 @@ int board_count_empties(const Board *board) */ void board_print(const Board *board, const int player, FILE *f) { - int i, j, square, x; - const char *color = "?*O-." + 1; - unsigned long long moves = get_moves(board->player, board->opponent); + int i, j, square; + unsigned long long bk, wh; + const char color[5] = "?*O-."; + unsigned long long moves = board_get_moves(board); + + if (player == BLACK) { + bk = board->player; + wh = board->opponent; + } else { + bk = board->opponent; + wh = board->player; + } fputs(" A B C D E F G H\n", f); for (i = 0; i < 8; ++i) { fputc(i + '1', f); fputc(' ', f); for (j = 0; j < 8; ++j) { - x = i * 8 + j; - if (player == BLACK) square = 2 - ((board->opponent >> x) & 1) - 2 * ((board->player >> x) & 1); - else square = 2 - ((board->player >> x) & 1) - 2 * ((board->opponent >> x) & 1); - if (square == EMPTY && (moves & x_to_bit(x))) ++square; - fputc(color[square], f); + square = 2 - (wh & 1) - 2 * (bk & 1); + if ((square == EMPTY) && (moves & 1)) + square = EMPTY + 1; + fputc(color[square + 1], f); fputc(' ', f); + bk >>= 1; + wh >>= 1; + moves >>= 1; } fputc(i + '1', f); if (i == 1) - fprintf(f, " %c to move", color[player]); + fprintf(f, " %c to move", color[player + 1]); else if (i == 3) fprintf(f, " %c: discs = %2d moves = %2d", - color[player], bit_count(board->player), get_mobility(board->player, board->opponent)); + color[player + 1], bit_count(board->player), get_mobility(board->player, board->opponent)); else if (i == 4) fprintf(f, " %c: discs = %2d moves = %2d", - color[!player], bit_count(board->opponent), get_mobility(board->opponent, board->player)); + color[2 - player], bit_count(board->opponent), 
get_mobility(board->opponent, board->player)); else if (i == 5) fprintf(f, " empties = %2d ply = %2d", 64 - bit_count(board->opponent|board->player), bit_count(board->opponent|board->player) - 3); @@ -1244,12 +1149,22 @@ void board_print(const Board *board, const int player, FILE *f) char* board_to_string(const Board *board, const int player, char *s) { int square, x; - const char *color = "XO-?"; + unsigned long long bk, wh; + static const char color[4] = "XO-?"; + + if (player == BLACK) { + bk = board->player; + wh = board->opponent; + } else { + bk = board->opponent; + wh = board->player; + } for (x = 0; x < 64; ++x) { - if (player == BLACK) square = 2 - ((board->opponent >> x) & 1) - 2 * ((board->player >> x) & 1); - else square = 2 - ((board->player >> x) & 1) - 2 * ((board->opponent >> x) & 1); + square = 2 - (wh & 1) - 2 * (bk & 1); s[x] = color[square]; + bk >>= 1; + wh >>= 1; } s[64] = ' '; s[65] = color[player]; @@ -1284,45 +1199,48 @@ void board_print_FEN(const Board *board, const int player, FILE *f) char* board_to_FEN(const Board *board, const int player, char *string) { int square, x, r, c; - const char *piece = "pP-?"; - const char *color = "bw"; + unsigned long long bk, wh; + static const char piece[4] = "pP-?"; + static const char color[2] = "bw"; int n_empties = 0; char *s = string; static char local_string[128]; if (s == NULL) s = string = local_string; - for (r = 7; r >= 0; --r) - for (c = 0; c < 8; ++c) { - if (c == 0 && r < 7) { - if (n_empties) { - *s++ = n_empties + '0'; - n_empties = 0; - } - *s++ = '/'; - } - x = 8 * r + c; - if (player == BLACK) square = 2 - ((board->opponent >> x) & 1) - 2 * ((board->player >> x) & 1); - else square = 2 - ((board->player >> x) & 1) - 2 * ((board->opponent >> x) & 1); + if (player == BLACK) { + bk = board->player; + wh = board->opponent; + } else { + bk = board->opponent; + wh = board->player; + } - if (square == EMPTY) { - ++n_empties; - } else { - if (n_empties) { - *s++ = n_empties + '0'; - n_empties = 
0; + for (r = 7; r >= 0; --r) { + for (c = 0; c < 8; ++c) { + x = 8 * r + c; + square = 2 - ((wh >> x) & 1) - 2 * ((bk >> x) & 1); + + if (square == EMPTY) { + ++n_empties; + } else { + if (n_empties) { + *s++ = n_empties + '0'; + n_empties = 0; + } + *s++ = piece[square]; } - *s++ = piece[square]; } - } - if (n_empties) { - *s++ = n_empties + '0'; - n_empties = 0; + if (n_empties) { + *s++ = n_empties + '0'; + n_empties = 0; + } + if (r > 0) + *s++ = '/'; } *s++ = ' '; *s++ = color[player]; - *s++ = ' '; *s++ = '-'; *s++ = ' '; *s++ = '-'; - *s++ = ' '; *s++ = '0'; *s++ = ' '; *s++ = '1'; *s = '\0'; + strcpy(s, " - - 0 1"); return string; } diff --git a/src/board.h b/src/board.h index cf55e6a3..2da07dbb 100644 --- a/src/board.h +++ b/src/board.h @@ -3,23 +3,22 @@ * * Board management header file. * - * @date 1998 - 2017 + * @date 1998 - 2024 * @author Richard Delorme - * @version 4.4 + * @version 4.5 */ #ifndef EDAX_BOARD_H #define EDAX_BOARD_H #include "const.h" +#include "settings.h" +#include "bit.h" #include #include -/** Board : board representation */ -typedef struct Board { - unsigned long long player, opponent; /**< bitboard representation */ -} Board; +// struct Board: moved to bit.h struct Move; struct Random; @@ -28,22 +27,49 @@ struct Random; void board_init(Board*); int board_set(Board*, const char*); int board_from_FEN(Board*, const char*); -int board_compare(const Board*, const Board*); -bool board_equal(const Board*, const Board*); +bool board_lesser(const Board*, const Board*); +void board_horizontal_mirror(const Board *, Board *); +void board_vertical_mirror(const Board *, Board *); +void board_transpose(const Board *, Board *); void board_symetry(const Board*, const int, Board*); int board_unique(const Board*, Board*); void board_check(const Board*); void board_rand(Board*, int, struct Random*); +// Compare two board for equality +#define board_equal(b1,b2) ((b1)->player == (b2)->player && (b1)->opponent == (b2)->opponent) + int 
board_count_last_flips(const Board*, const int); -unsigned long long board_get_move(const Board*, const int, struct Move*); +unsigned long long board_get_move_flip(const Board*, const int, struct Move*); bool board_check_move(const Board*, struct Move*); void board_swap_players(Board*); void board_update(Board*, const struct Move*); void board_restore(Board*, const struct Move*); void board_pass(Board*); -unsigned long long board_next(const Board*, const int, Board*); -unsigned long long board_pass_next(const Board*, const int, Board*); + +bool can_move(const unsigned long long, const unsigned long long); +unsigned long long get_moves_6x6(const unsigned long long, const unsigned long long); +bool can_move_6x6(const unsigned long long, const unsigned long long); +int get_mobility(const unsigned long long, const unsigned long long); +#ifdef __AVX2__ + __m128i vectorcall get_moves_and_potential(__m256i, __m256i); +#else + unsigned long long get_potential_moves(const unsigned long long, const unsigned long long); +#endif + +void edge_stability_init(void); +unsigned long long get_stable_edge(const unsigned long long, const unsigned long long); +#ifndef __AVX2__ // public for android dispatch + void get_full_lines(const unsigned long long, unsigned long long [4]); + #if !(defined(hasMMX) && !defined(hasSSE2)) + int get_spreaded_stability(unsigned long long, unsigned long long, unsigned long long [4]); + #endif +#endif +unsigned long long get_all_full_lines(const unsigned long long); +int get_stability(const unsigned long long, const unsigned long long); +int get_stability_fulls(const unsigned long long, const unsigned long long, unsigned long long [5]); +int get_edge_stability(const unsigned long long, const unsigned long long); +int get_corner_stability(const unsigned long long); unsigned long long board_get_hash_code(const Board*); int board_get_square_color(const Board*, const int); bool board_is_occupied(const Board*, const int); @@ -54,20 +80,119 @@ char* 
board_to_FEN(const Board*, const int, char*); bool board_is_pass(const Board*); bool board_is_game_over(const Board*); int board_count_empties(const Board *board); +#if defined(USE_GAS_MMX) || defined(USE_MSVC_X86) + void init_mmx (void); + unsigned long long get_moves_mmx(const unsigned long long, const unsigned long long); + unsigned long long get_moves_sse(const unsigned long long, const unsigned long long); -int count_last_flip(const int, const unsigned long long); -extern unsigned long long (*flip[BOARD_SIZE + 2])(const unsigned long long, const unsigned long long); -unsigned long long get_moves(const unsigned long long, const unsigned long long); -bool can_move(const unsigned long long, const unsigned long long); -unsigned long long get_moves_6x6(const unsigned long long, const unsigned long long); -bool can_move_6x6(const unsigned long long, const unsigned long long); -int get_mobility(const unsigned long long, const unsigned long long); -int get_weighted_mobility(const unsigned long long, const unsigned long long); -int get_potential_mobility(const unsigned long long, const unsigned long long); -void edge_stability_init(void); -int get_stability(const unsigned long long, const unsigned long long); -int get_edge_stability(const unsigned long long, const unsigned long long); -int get_corner_stability(const unsigned long long); +#elif defined(ANDROID) && !defined(__ARM_NEON) && !defined(hasSSE2) + void init_neon (void); + unsigned long long get_moves_sse(unsigned long long, unsigned long long); +#endif + +extern unsigned char edge_stability[256 * 256]; + +// a1/a8/h1/h8 are already stable in horizontal line, so omit them in vertical line to ease kindergarten for CPU_64 +#if 0 // defined(__BMI2__) && defined(HAS_CPU_64) && !defined(__bdver4__) && !defined(__znver1__) && !defined(__znver2__) // pdep is slow on AMD before Zen3 + #define unpackA2A7(x) _pdep_u64((x), 0x0101010101010101) + #define unpackH2H7(x) _pdep_u64((x), 0x8080808080808080) +#else + #define 
unpackA2A7(x) ((((x) & 0x7e) * 0x0000040810204080) & 0x0001010101010100) + #define unpackH2H7(x) ((((x) & 0x7e) * 0x0002040810204000) & 0x0080808080808000) +#endif +#if (LAST_FLIP_COUNTER == COUNT_LAST_FLIP_CARRY) || (LAST_FLIP_COUNTER == COUNT_LAST_FLIP_KINDERGARTEN) || (LAST_FLIP_COUNTER == COUNT_LAST_FLIP_BITSCAN) || (LAST_FLIP_COUNTER == COUNT_LAST_FLIP_32) + extern int (*count_last_flip[BOARD_SIZE + 1])(const unsigned long long); + #define last_flip(x,P) count_last_flip[x](P) +#else + extern int last_flip(int pos, unsigned long long P); #endif +#if (MOVE_GENERATOR == MOVE_GENERATOR_AVX) || (MOVE_GENERATOR == MOVE_GENERATOR_AVX512) + extern __m128i vectorcall mm_Flip(const __m128i OP, int pos); + inline __m128i vectorcall reduce_vflip(__m128i flip) { return _mm_or_si128(flip, _mm_shuffle_epi32(flip, 0x4e)); } + #define Flip(x,P,O) ((unsigned long long) _mm_cvtsi128_si64(reduce_vflip(mm_Flip(_mm_set_epi64x((O), (P)), (x))))) + #define board_flip(board,x) ((unsigned long long) _mm_cvtsi128_si64(reduce_vflip(mm_Flip(_mm_loadu_si128((__m128i *) (board)), (x))))) + #define vboard_flip(board,x) ((unsigned long long) _mm_cvtsi128_si64(reduce_vflip(mm_Flip((board).v2, (x))))) + +#elif MOVE_GENERATOR == MOVE_GENERATOR_SSE + extern __m128i (vectorcall *mm_flip[BOARD_SIZE + 2])(const __m128i); + #define Flip(x,P,O) ((unsigned long long) _mm_cvtsi128_si64(mm_flip[x](_mm_set_epi64x((O), (P))))) + #define mm_Flip(OP,x) mm_flip[x](OP) + #define reduce_vflip(x) (x) + #define board_flip(board,x) ((unsigned long long) _mm_cvtsi128_si64(mm_flip[x](_mm_loadu_si128((__m128i *) (board))))) + #define vboard_flip(board,x) ((unsigned long long) _mm_cvtsi128_si64(mm_flip[x]((board).v2))) + +#elif MOVE_GENERATOR == MOVE_GENERATOR_NEON + extern uint64x2_t mm_Flip(uint64x2_t OP, int pos); + #define Flip(x,P,O) vgetq_lane_u64(mm_Flip(vcombine_u64(vcreate_u64(P), vcreate_u64(O)), (x)), 0) + #define board_flip(board,x) vgetq_lane_u64(mm_Flip(vld1q_u64((uint64_t *) (board)), (x)), 0) + #define 
vboard_flip(board,x) vgetq_lane_u64(mm_Flip((board).v2, (x)), 0) + +#elif MOVE_GENERATOR == MOVE_GENERATOR_SVE + extern uint64_t Flip(int pos, uint64_t P, uint64_t O); + #define mm_Flip(OP,x) vdupq_n_u64(Flip((x), vgetq_lane_u64((OP), 0), vgetq_lane_u64((OP), 1))) + #define board_flip(board,x) Flip((x), (board)->player, (board)->opponent) + #define vboard_flip(board,x) Flip((x), vgetq_lane_u64((board).v2, 0), vgetq_lane_u64((board).v2, 1)) + +#elif MOVE_GENERATOR == MOVE_GENERATOR_32 + extern unsigned long long (*flip[BOARD_SIZE + 2])(unsigned int, unsigned int, unsigned int, unsigned int); + #define Flip(x,P,O) flip[x]((unsigned int)(P), (unsigned int)((P) >> 32), (unsigned int)(O), (unsigned int)((O) >> 32)) + #ifdef __BIG_ENDIAN__ + #define board_flip(board,x) flip[x]((unsigned int)((board)->player), ((unsigned int *) &(board)->player)[0], (unsigned int)((board)->opponent), ((unsigned int *) &(board)->opponent)[0]) + #else + #define board_flip(board,x) flip[x]((unsigned int)((board)->player), ((unsigned int *) &(board)->player)[1], (unsigned int)((board)->opponent), ((unsigned int *) &(board)->opponent)[1]) + #endif + #if defined(USE_GAS_MMX) && !defined(hasSSE2) + extern void init_flip_sse(void); + #endif + +#else + #if MOVE_GENERATOR == MOVE_GENERATOR_SSE_BSWAP + extern unsigned long long Flip(int, unsigned long long, unsigned long long); + #else + extern unsigned long long (*flip[BOARD_SIZE + 2])(const unsigned long long, const unsigned long long); + #define Flip(x,P,O) flip[x]((P), (O)) + #endif + + #define board_flip(board,x) Flip((x), (board)->player, (board)->opponent) +#endif + +#ifndef vboard_flip + #define vboard_flip(vboard,x) board_flip(&(vboard).board, (x)) +#endif + +// Use backup copy of search->board in a vector register if available (assume *pboard == vboard on entry) +#ifdef hasSSE2 + #define vboard_update(pboard,vboard,move) _mm_storeu_si128((__m128i *) (pboard), _mm_shuffle_epi32(_mm_xor_si128((vboard).v2, 
_mm_or_si128(_mm_set1_epi64x((move)->flipped), _mm_loadl_epi64((__m128i *) &X_TO_BIT[move->x]))), 0x4e)) +#else + #define vboard_update(pboard,vboard,move) board_update((pboard), (move)) +#endif + +// Pass Board in a vector register to Flip +#if (MOVE_GENERATOR == MOVE_GENERATOR_AVX) || (MOVE_GENERATOR == MOVE_GENERATOR_AVX512) || (MOVE_GENERATOR == MOVE_GENERATOR_SSE) + unsigned long long vectorcall board_next_sse(__m128i OP, const int x, Board *next); + #define board_next(board,x,next) board_next_sse(_mm_loadu_si128((__m128i *) (board)), (x), (next)) + #define vboard_next(vboard,x,next) board_next_sse((vboard).v2, (x), (next)) +#elif MOVE_GENERATOR == MOVE_GENERATOR_NEON + unsigned long long board_next_neon(uint64x2_t OP, const int x, Board *next); + #define board_next(board,x,next) board_next_neon(vld1q_u64((uint64_t *) (board)), (x), (next)) + #define vboard_next(vboard,x,next) board_next_neon((vboard).v2, (x), (next)) +#else + unsigned long long board_next(const Board *board, const int x, Board *next); + #define vboard_next(vboard,x,next) board_next(&(vboard).board, (x), (next)) +#endif + +// Pass vboard to get_moves if vectorcall available, otherwise board +#if defined(__AVX2__) && (defined(_MSC_VER) || defined(__linux__)) + unsigned long long vectorcall get_moves_avx(__m256i PP, __m256i OO); + #define get_moves(P,O) get_moves_avx(_mm256_set1_epi64x(P), _mm256_set1_epi64x(O)) + #define board_get_moves(board) get_moves_avx(_mm256_set1_epi64x((board)->player), _mm256_set1_epi64x((board)->opponent)) + #define vboard_get_moves(vboard) get_moves_avx(_mm256_broadcastq_epi64((vboard).v2), _mm256_broadcastq_epi64(_mm_unpackhi_epi64((vboard).v2, (vboard).v2))) +#else + unsigned long long get_moves(const unsigned long long, const unsigned long long); + #define board_get_moves(board) get_moves((board)->player, (board)->opponent) + #define vboard_get_moves(vboard) get_moves((vboard).board.player, (vboard).board.opponent) +#endif + +#endif diff --git a/src/board_mmx.c 
b/src/board_mmx.c new file mode 100644 index 00000000..a9dff19b --- /dev/null +++ b/src/board_mmx.c @@ -0,0 +1,414 @@ +/** + * @file board_mmx.c + * + * MMX translation of some board.c functions for X86-32 + * + * If both hasMMX and hasSSE2 are undefined, dynamic dispatching code + * will be generated. (This setting requires VC or GCC 4.4+) + * + * @date 2014 - 2023 + * @author Toshihiko Okuhara + * @version 4.5 + */ + +#include "bit.h" +#include "hash.h" +#include "board.h" +#include "move.h" + +#ifdef USE_GAS_MMX + #ifndef hasMMX + #pragma GCC push_options + #pragma GCC target ("mmx") + #endif + #include +#endif + +static const unsigned long long mask_7e = 0x7e7e7e7e7e7e7e7eULL; +#ifndef POPCOUNT +static const unsigned long long mask_55 = 0x5555555555555555ULL; +static const unsigned long long mask_33 = 0x3333333333333333ULL; +static const unsigned long long mask_0F = 0x0f0f0f0f0f0f0f0fULL; +#endif + +#ifndef hasMMX +bool hasMMX = false; +#endif +bool hasSSE2 = false; + +void init_mmx (void) +{ + int flg1, flg2, cpuid_edx, cpuid_ecx; +#ifdef USE_MSVC_X86 + int cpuinfo[4]; + + __asm { + pushfd + pop eax + mov flg2, eax + btc eax, 21 + push eax + popfd + pushfd + pop flg1 + } + + if (flg1 == flg2) /* CPUID not supported */ + return; + + __cpuid(cpuinfo, 1); + cpuid_edx = cpuinfo[3]; + cpuid_ecx = cpuinfo[2]; + +#else + __asm__ ( + "pushfl\n\t" + "popl %0\n\t" + "movl %0, %1\n\t" + "btc $21, %0\n\t" /* flip ID bit in EFLAGS */ + "pushl %0\n\t" + "popfl\n\t" + "pushfl\n\t" + "popl %0" + : "=r" (flg1), "=r" (flg2) ); + + if (flg1 == flg2) /* CPUID not supported */ + return; + + __asm__ ( + "movl $1, %%eax\n\t" + "cpuid" + : "=d" (cpuid_edx), "=c" (cpuid_ecx) :: "%eax", "%ebx" ); + +#endif + +#ifndef hasMMX + hasMMX = ((cpuid_edx & 0x00800000u) != 0); +#endif + hasSSE2 = ((cpuid_edx & 0x04000000u) != 0); + // hasPOPCNT = ((cpuid_ecx & 0x00800000u) != 0); + +#if (MOVE_GENERATOR == MOVE_GENERATOR_32) + if (hasSSE2) + init_flip_sse(); +#endif +} + +/** + * @brief MMX 
translation of get_moves + * + * x 2 faster bench mobility on 32-bit x86. + * + */ +#ifdef USE_MSVC_X86 + +unsigned long long get_moves_mmx(const unsigned long long P_, const unsigned long long O_) +{ + unsigned int movesL, movesH, mO1, flip1, pre1; + __m64 P, O, M, mO, flip, pre; + + P = _m_punpckldq(_m_from_int(P_), _m_from_int(P_ >> 32)); + O = _m_punpckldq(_m_from_int(O_), _m_from_int(O_ >> 32)); mO1 = (unsigned int) O_ & 0x7e7e7e7e; + /* shift = +8 */ /* shift = +1 */ + flip = _m_pand(O, _m_psllqi(P, 8)); flip1 = mO1 & ((unsigned int) P_ << 1); + flip = _m_por(flip, _m_pand(O, _m_psllqi(flip, 8))); flip1 |= mO1 & (flip1 << 1); + pre = _m_pand(O, _m_psllqi(O, 8)); pre1 = mO1 & (mO1 << 1); + flip = _m_por(flip, _m_pand(pre, _m_psllqi(flip, 16))); flip1 |= pre1 & (flip1 << 2); + flip = _m_por(flip, _m_pand(pre, _m_psllqi(flip, 16))); flip1 |= pre1 & (flip1 << 2); + M = _m_psllqi(flip, 8); movesL = flip1 << 1; + /* shift = -8 */ /* shift = -1 */ + flip = _m_pand(O, _m_psrlqi(P, 8)); flip1 = mO1 & ((unsigned int) P_ >> 1); + flip = _m_por(flip, _m_pand(O, _m_psrlqi(flip, 8))); flip1 |= mO1 & (flip1 >> 1); + pre = _m_psrlqi(pre, 8); pre1 >>= 1; + flip = _m_por(flip, _m_pand(pre, _m_psrlqi(flip, 16))); flip1 |= pre1 & (flip1 >> 2); + flip = _m_por(flip, _m_pand(pre, _m_psrlqi(flip, 16))); flip1 |= pre1 & (flip1 >> 2); + M = _m_por(M, _m_psrlqi(flip, 8)); movesL |= flip1 >> 1; + /* shift = +7 */ + mO = _m_pand(O, *(__m64 *) &mask_7e); mO1 = (unsigned int)(O_ >> 32) & 0x7e7e7e7e; + flip = _m_pand(mO, _m_psllqi(P, 7)); + flip = _m_por(flip, _m_pand(mO, _m_psllqi(flip, 7))); + pre = _m_pand(mO, _m_psllqi(mO, 7)); + flip = _m_por(flip, _m_pand(pre, _m_psllqi(flip, 14))); + flip = _m_por(flip, _m_pand(pre, _m_psllqi(flip, 14))); + M = _m_por(M, _m_psllqi(flip, 7)); + /* shift = -7 */ /* shift = +1 */ + flip = _m_pand(mO, _m_psrlqi(P, 7)); flip1 = mO1 & ((unsigned int)(P_ >> 32) << 1); + flip = _m_por(flip, _m_pand(mO, _m_psrlqi(flip, 7))); flip1 |= mO1 & (flip1 << 1); + 
pre = _m_psrlqi(pre, 7); pre1 = mO1 & (mO1 << 1); + flip = _m_por(flip, _m_pand(pre, _m_psrlqi(flip, 14))); flip1 |= pre1 & (flip1 << 2); + flip = _m_por(flip, _m_pand(pre, _m_psrlqi(flip, 14))); flip1 |= pre1 & (flip1 << 2); + M = _m_por(M, _m_psrlqi(flip, 7)); movesH = flip1 << 1; + /* shift = +9 */ /* shift = -1 */ + flip = _m_pand(mO, _m_psllqi(P, 9)); flip1 = mO1 & ((unsigned int)(P_ >> 32) >> 1); + flip = _m_por(flip, _m_pand(mO, _m_psllqi(flip, 9))); flip1 |= mO1 & (flip1 >> 1); + pre = _m_pand(mO, _m_psllqi(mO, 9)); pre1 >>= 1; + flip = _m_por(flip, _m_pand(pre, _m_psllqi(flip, 18))); flip1 |= pre1 & (flip1 >> 2); + flip = _m_por(flip, _m_pand(pre, _m_psllqi(flip, 18))); flip1 |= pre1 & (flip1 >> 2); + M = _m_por(M, _m_psllqi(flip, 9)); movesH |= flip1 >> 1; + /* shift = -9 */ + flip = _m_pand(mO, _m_psrlqi(P, 9)); + flip = _m_por(flip, _m_pand(mO, _m_psrlqi(flip, 9))); + pre = _m_psrlqi(pre, 9); + flip = _m_por(flip, _m_pand(pre, _m_psrlqi(flip, 18))); + flip = _m_por(flip, _m_pand(pre, _m_psrlqi(flip, 18))); + M = _m_por(M, _m_psrlqi(flip, 9)); + + movesL |= _m_to_int(M); + movesH |= _m_to_int(_m_punpckhdq(M, M)); + _mm_empty(); + return (((unsigned long long) movesH << 32) | movesL) & ~(P_|O_); // mask with empties +} + +#else + +unsigned long long get_moves_mmx(const unsigned long long P, const unsigned long long O) +{ + unsigned long long moves; + __asm__ ( + "movl %1, %%ebx\n\t" "movd %1, %%mm4\n\t" // (movd for store-forwarding) + "movl %3, %%edi\n\t" "movd %3, %%mm5\n\t" + "andl $0x7e7e7e7e, %%edi\n\t" "punpckldq %2, %%mm4\n\t" + "punpckldq %4, %%mm5\n\t" + /* shift=-1 */ /* shift=-8 */ + "movl %%ebx, %%eax\n\t" "movq %%mm4, %%mm0\n\t" + "shrl $1, %%eax\n\t" "psrlq $8, %%mm0\n\t" + "andl %%edi, %%eax\n\t" "pand %%mm5, %%mm0\n\t" // 0 m7&o6 m6&o5 .. 
m1&o0 + "movl %%eax, %%edx\n\t" "movq %%mm0, %%mm1\n\t" + "shrl $1, %%eax\n\t" "psrlq $8, %%mm0\n\t" + "movl %%edi, %%ecx\n\t" "movq %%mm5, %%mm3\n\t" + "andl %%edi, %%eax\n\t" "pand %%mm5, %%mm0\n\t" // 0 0 m7&o6&o5 .. m2&o1&o0 + "shrl $1, %%ecx\n\t" "psrlq $8, %%mm3\n\t" + "orl %%edx, %%eax\n\t" "por %%mm1, %%mm0\n\t" // 0 m7&o6 (m6&o5)|(m7&o6&o5) .. (m1&o0) + "andl %%edi, %%ecx\n\t" "pand %%mm5, %%mm3\n\t" // 0 o7&o6 o6&o5 o5&o4 o4&o3 .. + "movl %%eax, %%edx\n\t" "movq %%mm0, %%mm2\n\t" + "shrl $2, %%eax\n\t" "psrlq $16, %%mm0\n\t" + "andl %%ecx, %%eax\n\t" "pand %%mm3, %%mm0\n\t" // 0 0 0 m7&o6&o5&o4 (m6&o5&o4&o3)|(m7&o6&o5&o4&o3) .. + "orl %%eax, %%edx\n\t" "por %%mm0, %%mm2\n\t" + "shrl $2, %%eax\n\t" "psrlq $16, %%mm0\n\t" + "andl %%ecx, %%eax\n\t" "pand %%mm3, %%mm0\n\t" // 0 0 0 0 0 m7&o6&..&o2 (m6&o5&..&o1)|(m7&o6&..&o1) .. + "orl %%edx, %%eax\n\t" "por %%mm0, %%mm2\n\t" + "shrl $1, %%eax\n\t" "psrlq $8, %%mm2\n\t" + /* shift=+1 */ /* shift=+8 */ + "movq %%mm4, %%mm0\n\t" + "addl %%ebx, %%ebx\n\t" "psllq $8, %%mm0\n\t" + "andl %%edi, %%ebx\n\t" "pand %%mm5, %%mm0\n\t" + "movl %%ebx, %%edx\n\t" "movq %%mm0, %%mm1\n\t" + "addl %%ebx, %%ebx\n\t" "psllq $8, %%mm0\n\t" + "andl %%edi, %%ebx\n\t" "pand %%mm5, %%mm0\n\t" + "orl %%ebx, %%edx\n\t" "por %%mm1, %%mm0\n\t" + "addl %%ecx, %%ecx\n\t" "psllq $8, %%mm3\n\t" + "movq %%mm0, %%mm1\n\t" + "leal (,%%edx,4), %%ebx\n\t" "psllq $16, %%mm0\n\t" + "andl %%ecx, %%ebx\n\t" "pand %%mm3, %%mm0\n\t" + "orl %%ebx, %%edx\n\t" "por %%mm0, %%mm1\n\t" + "shll $2, %%ebx\n\t" "psllq $16, %%mm0\n\t" + "andl %%ecx, %%ebx\n\t" "pand %%mm3, %%mm0\n\t" + "orl %%edx, %%ebx\n\t" "por %%mm1, %%mm0\n\t" + "addl %%ebx, %%ebx\n\t" "psllq $8, %%mm0\n\t" + "orl %%eax, %%ebx\n\t" "por %%mm0, %%mm2\n\t" + /* shift=-7 */ + "pand %5, %%mm5\n\t" + "movq %%mm4, %%mm0\n\t" + "psrlq $7, %%mm0\n\t" + "pand %%mm5, %%mm0\n\t" + "movq %%mm0, %%mm1\n\t" + "psrlq $7, %%mm0\n\t" + "pand %%mm5, %%mm0\n\t" + "movq %%mm5, %%mm3\n\t" + "por %%mm1, %%mm0\n\t" 
+ "psrlq $7, %%mm3\n\t" + "movq %%mm0, %%mm1\n\t" + "pand %%mm5, %%mm3\n\t" + "psrlq $14, %%mm0\n\t" + "pand %%mm3, %%mm0\n\t" + "movl %2, %%esi\n\t" "por %%mm0, %%mm1\n\t" + "movl %4, %%edi\n\t" "psrlq $14, %%mm0\n\t" + "andl $0x7e7e7e7e,%%edi\n\t" "pand %%mm3, %%mm0\n\t" + "movl %%edi, %%ecx\n\t" "por %%mm1, %%mm0\n\t" + "shrl $1, %%ecx\n\t" "psrlq $7, %%mm0\n\t" + "andl %%edi, %%ecx\n\t" "por %%mm0, %%mm2\n\t" + /* shift=-1 */ /* shift=+7 */ + "movl %%esi, %%eax\n\t" "movq %%mm4, %%mm0\n\t" + "shrl $1, %%eax\n\t" "psllq $7, %%mm0\n\t" + "andl %%edi, %%eax\n\t" "pand %%mm5, %%mm0\n\t" + "movl %%eax, %%edx\n\t" "movq %%mm0, %%mm1\n\t" + "shrl $1, %%eax\n\t" "psllq $7, %%mm0\n\t" + "andl %%edi, %%eax\n\t" "pand %%mm5, %%mm0\n\t" + "orl %%edx, %%eax\n\t" "por %%mm1, %%mm0\n\t" + "psllq $7, %%mm3\n\t" + "movl %%eax, %%edx\n\t" "movq %%mm0, %%mm1\n\t" + "shrl $2, %%eax\n\t" "psllq $14, %%mm0\n\t" + "andl %%ecx, %%eax\n\t" "pand %%mm3, %%mm0\n\t" + "orl %%eax, %%edx\n\t" "por %%mm0, %%mm1\n\t" + "shrl $2, %%eax\n\t" "psllq $14, %%mm0\n\t" + "andl %%ecx, %%eax\n\t" "pand %%mm3, %%mm0\n\t" + "orl %%edx, %%eax\n\t" "por %%mm1, %%mm0\n\t" + "shrl $1, %%eax\n\t" "psllq $7, %%mm0\n\t" + "por %%mm0, %%mm2\n\t" + /* shift=+1 */ /* shift=-9 */ + "movq %%mm4, %%mm0\n\t" + "addl %%esi, %%esi\n\t" "psrlq $9, %%mm0\n\t" + "andl %%edi, %%esi\n\t" "pand %%mm5, %%mm0\n\t" + "movl %%esi, %%edx\n\t" "movq %%mm0, %%mm1\n\t" + "addl %%esi, %%esi\n\t" "psrlq $9, %%mm0\n\t" + "andl %%edi, %%esi\n\t" "pand %%mm5, %%mm0\n\t" + "movq %%mm5, %%mm3\n\t" + "orl %%esi, %%edx\n\t" "por %%mm1, %%mm0\n\t" + "psrlq $9, %%mm3\n\t" + "movq %%mm0, %%mm1\n\t" + "addl %%ecx, %%ecx\n\t" "pand %%mm5, %%mm3\n\t" + "leal (,%%edx,4), %%esi\n\t" "psrlq $18, %%mm0\n\t" + "andl %%ecx, %%esi\n\t" "pand %%mm3, %%mm0\n\t" + "orl %%esi, %%edx\n\t" "por %%mm0, %%mm1\n\t" + "shll $2, %%esi\n\t" "psrlq $18, %%mm0\n\t" + "andl %%ecx, %%esi\n\t" "pand %%mm3, %%mm0\n\t" + "orl %%edx, %%esi\n\t" "por %%mm1, %%mm0\n\t" + 
"addl %%esi, %%esi\n\t" "psrlq $9, %%mm0\n\t" + "orl %%eax, %%esi\n\t" "por %%mm0, %%mm2\n\t" + /* shift=+9 */ + "movq %%mm4, %%mm0\n\t" + "psllq $9, %%mm0\n\t" + "pand %%mm5, %%mm0\n\t" + "movq %%mm0, %%mm1\n\t" + "psllq $9, %%mm0\n\t" + "pand %%mm5, %%mm0\n\t" + "por %%mm1, %%mm0\n\t" + "psllq $9, %%mm3\n\t" + "movq %%mm0, %%mm1\n\t" + "psllq $18, %%mm0\n\t" + "pand %%mm3, %%mm0\n\t" + "movl %1, %%eax\n\t" "por %%mm0, %%mm1\n\t" + "movl %2, %%edx\n\t" "psllq $18, %%mm0\n\t" + "orl %3, %%eax\n\t" "pand %%mm3, %%mm0\n\t" + "orl %4, %%edx\n\t" "por %%mm1, %%mm0\n\t" + "notl %%eax\n\t" "psllq $9, %%mm0\n\t" + "notl %%edx\n\t" "por %%mm0, %%mm2\n\t" + /* mm2|(esi:ebx) is the pseudo-feasible moves at this point. */ + /* Let edx:eax be the feasible moves, i.e., mm2 restricted to empty squares. */ + "movd %%mm2, %%ecx\n\t" "punpckhdq %%mm2, %%mm2\n\t" + "orl %%ecx, %%ebx\n\t" + "movd %%mm2, %%ecx\n\t" + "orl %%ecx, %%esi\n\t" + "andl %%ebx, %%eax\n\t" + "andl %%esi, %%edx\n\t" + "emms" /* Reset the FP/MMX unit. */ + : "=&A" (moves) + : "m" (P), "m" (((unsigned int *)&P)[1]), "m" (O), "m" (((unsigned int *)&O)[1]), "m" (mask_7e) + : "ebx", "ecx", "esi", "edi", "mm0", "mm1", "mm2", "mm3", "mm4", "mm5" ); + + return moves; +} +#endif + +/** + * @brief MMX translation of get_stability() + * + * x 1.5 faster bench stability on 32-bit x86. 
+ *
+ */
+#ifdef hasMMX
+/*
+ * Compute, for the occupied squares disc_, the squares lying on completely
+ * filled lines, one bitboard per direction:
+ *   full[0] horizontal, full[1] vertical, full[2] diagonal (shift 9),
+ *   full[3] diagonal (shift 7).
+ * e7[] / e9[] are constant masks OR'ed into the shifted copies
+ * (presumably the squares whose diagonal neighbour falls off the board - TODO confirm).
+ */
+static void get_full_lines(const unsigned long long disc_, unsigned long long full[4])
+{
+	__m64	disc = *(__m64 *) &disc_;
+	__m64	full_l, full_r;
+	unsigned int	full_v;
+	const __m64	kFF = _m_pcmpeqb(disc, disc);	// all bits set
+	static const unsigned long long e7[] = { 0xff01010101010101, 0x80808080808080ff, 0xffff030303030303, 0xc0c0c0c0c0c0ffff, 0xffffffff0f0f0f0f, 0xf0f0f0f0ffffffff };
+	static const unsigned long long e9[] = { 0xff80808080808080, 0x01010101010101ff, 0xffffc0c0c0c0c0c0, 0x030303030303ffff, 0x0f0f0f0ff0f0f0f0 };
+
+	// get_full_lines_mmx(full_d7, disc, 7, e7);
+	// doubling shifts (7, 14, 28) in both directions, then AND the two halves
+	full_l = _m_pand(disc, _m_por(((__m64 *) e7)[0], _m_psrlqi(disc, 7)));
+	full_r = _m_pand(disc, _m_por(((__m64 *) e7)[1], _m_psllqi(disc, 7)));
+	full_l = _m_pand(full_l, _m_por(((__m64 *) e7)[2], _m_psrlqi(full_l, 14)));
+	full_r = _m_pand(full_r, _m_por(((__m64 *) e7)[3], _m_psllqi(full_r, 14)));
+	full_l = _m_pand(full_l, _m_por(((__m64 *) e7)[4], _m_psrlqi(full_l, 28)));
+	full_r = _m_pand(full_r, _m_por(((__m64 *) e7)[5], _m_psllqi(full_r, 28)));
+	((__m64 *) full)[3] = _m_pand(full_l, full_r);
+
+	// get_full_lines_mmx(full_d9, disc, 9, e9);
+	full_l = _m_pand(disc, _m_por(((__m64 *) e9)[0], _m_psrlqi(disc, 9)));
+	full_r = _m_pand(disc, _m_por(((__m64 *) e9)[1], _m_psllqi(disc, 9)));
+	full_l = _m_pand(full_l, _m_por(((__m64 *) e9)[2], _m_psrlqi(full_l, 18)));
+	full_r = _m_pand(full_r, _m_por(((__m64 *) e9)[3], _m_psllqi(full_r, 18)));
+	((__m64 *) full)[2] = _m_pand(_m_pand(full_l, full_r), _m_por(((__m64 *) e9)[4], _m_por(_m_psrlqi(full_l, 36), _m_psllqi(full_r, 36))));
+
+	// get_full_lines_mmx(full_h, disc, 1, e1);
+	// a byte equal to 0xff becomes 0xff, any other byte becomes 0
+	((__m64 *) full)[0] = _m_pcmpeqb(kFF, disc);
+	_mm_empty();	// last MMX instruction of this function; the rest is scalar
+
+	// get_full_lines_mmx(full_v, disc, 8, e8);
+	// AND the two 32-bit halves, then AND with 16- and 8-bit rotations of the result
+	full_v = (unsigned int) disc_ & (unsigned int)(disc_ >> 32);
+	full_v &= (full_v >> 16) | (full_v << 16);	// ror 16
+	full_v &= (full_v >> 8) | (full_v << 24);	// ror 8
+	full[1] = full_v | ((unsigned long long) full_v << 32);
+}
+
+// returns
all full in full[4] in addition to stability count
+// P: player's discs, O: opponent's discs.
+// full[0..3] are filled by get_full_lines(P | O), full[4] receives their AND.
+int get_stability_fulls(unsigned long long P, unsigned long long O, unsigned long long full[5])
+{
+	__m64	P_central, stable, stable_h, stable_v, stable_d7, stable_d9, old_stable, m;
+	unsigned int	OL, OH, PL, PH, t, a1a8, h1h8, SL, SH;
+
+	get_full_lines(P | O, full);
+
+	OL = (unsigned int) O;	OH = (unsigned int)(O >> 32);
+	PL = (unsigned int) P;	PH = (unsigned int)(P >> 32);
+	// NOTE(review): the combined mask is 0x007f7f7f7f7f7f00, which keeps bit 0 of every byte - confirm this is intended
+	SL = PL & 0x7f7f7f00;	SH = PH & 0x007f7f7f;
+	P_central = _m_punpckldq(_m_from_int(SL), _m_from_int(SH));
+
+	// P_central & allfull
+	full[4] = full[0] & full[1] & full[2] & full[3];
+	SL &= (unsigned int) full[4];
+	SH &= (unsigned int)(full[4] >> 32);
+
+	// compute the exact stable edges (from precomputed tables)
+	// the multiplications gather the 8 bits of a column into one byte used as table index
+	a1a8 = edge_stability[((((PL & 0x01010101) + ((PH & 0x01010101) << 4)) * 0x01020408) >> 24) * 256
+		+ ((((OL & 0x01010101) + ((OH & 0x01010101) << 4)) * 0x01020408) >> 24)];
+	h1h8 = edge_stability[((((PH & 0x80808080) + ((PL & 0x80808080) >> 4)) * 0x00204081) >> 24) * 256
+		+ ((((OH & 0x80808080) + ((OL & 0x80808080) >> 4)) * 0x00204081) >> 24)];
+	SL |= edge_stability[(PL & 0xff) * 256 + (OL & 0xff)]
+		| (((a1a8 & 0x0f) * 0x00204081) & 0x01010101)
+		| (((h1h8 & 0x0f) * 0x10204080) & 0x80808080);
+	SH |= (edge_stability[((PH >> 16) & 0xff00) + (OH >> 24)] << 24)
+		| (((a1a8 >> 4) * 0x00204081) & 0x01010101)
+		| (((h1h8 >> 4) * 0x10204080) & 0x80808080);
+	stable = _m_punpckldq(_m_from_int(SL), _m_from_int(SH));
+
+	// now compute the other stable discs (ie discs touching another stable disc in each flipping direction).
+	t = SL | SH;
+	if (t) {
+		// iterate until the stable set stops growing
+		do {
+			old_stable = stable;
+			stable_h = _m_por(_m_por(_m_psrlqi(stable, 1), _m_psllqi(stable, 1)), ((__m64 *) full)[0]);
+			stable_v = _m_por(_m_por(_m_psrlqi(stable, 8), _m_psllqi(stable, 8)), ((__m64 *) full)[1]);
+			stable_d7 = _m_por(_m_por(_m_psrlqi(stable, 7), _m_psllqi(stable, 7)), ((__m64 *) full)[3]);
+			stable_d9 = _m_por(_m_por(_m_psrlqi(stable, 9), _m_psllqi(stable, 9)), ((__m64 *) full)[2]);
+			stable = _m_por(stable, _m_pand(_m_pand(_m_pand(_m_pand(stable_h, stable_v), stable_d7), stable_d9), P_central));
+			m = _m_pxor(stable, old_stable);
+		} while (_m_to_int(_m_packsswb(m, m)) != 0);	// packsswb folds the 64-bit difference into 32 bits for the zero test
+
+	#ifdef POPCOUNT
+		t = bit_count_32(_m_to_int(stable)) + bit_count_32(_m_to_int(_m_psrlqi(stable, 32)));
+	#else
+		// bit count of stable using the 0x55 / 0x33 / 0x0f masks
+		m = _m_psubd(stable, _m_pand(_m_psrlqi(stable, 1), *(__m64 *) &mask_55));
+		m = _m_paddd(_m_pand(m, *(__m64 *) &mask_33), _m_pand(_m_psrlqi(m, 2), *(__m64 *) &mask_33));
+		m = _m_pand(_m_paddd(m, _m_psrlqi(m, 4)), *(__m64 *) &mask_0F);
+		t = ((unsigned int) _m_to_int(_m_paddb(m, _m_psrlqi(m, 32))) * 0x01010101u) >> 24;
+	#endif
+	}
+	_mm_empty();
+	return t;
+}
+
+// returns stability count only
+int get_stability(const unsigned long long P, const unsigned long long O)
+{
+	unsigned long long full[5];	// scratch; the full-line bitboards are discarded
+
+	return get_stability_fulls(P, O, full);
+}
+#endif // hasMMX
+
+#if !defined(hasMMX) && defined(USE_GAS_MMX)
+	#pragma GCC pop_options
+#endif
diff --git a/src/board_sse.c b/src/board_sse.c new file mode 100644 index 00000000..9a5ef40a --- /dev/null +++ b/src/board_sse.c @@ -0,0 +1,954 @@
+/**
+ * @file board_sse.c
+ *
+ * SSE/AVX translation of some board.c functions
+ *
+ * @date 2014 - 2024
+ * @author Toshihiko Okuhara
+ * @version 4.5
+ */
+
+#include "bit.h"
+#include "hash.h"
+#include "board.h"
+
+#if defined(ANDROID) && !defined(HAS_CPU_64) && !defined(hasSSE2)
+#include "android/cpu-features.h"
+
+bool	hasSSE2 = false;
+
+// Run-time SIMD detection for 32-bit Android builds; sets hasSSE2 when
+// NEON (arm) or SSE2 (i386) is reported by the CPU.
+void init_neon (void)
+{
+  #ifdef __arm__
+	if (android_getCpuFeatures() & ANDROID_CPU_ARM_FEATURE_NEON) {
+    #if (MOVE_GENERATOR == MOVE_GENERATOR_BITSCAN)
+		extern unsigned long long (*flip_neon[66])(const unsigned long long, const unsigned long long);
+		memcpy(flip, flip_neon, sizeof(flip_neon));	// overwrite the flip[] table with the NEON entries
+    #endif
+		hasSSE2 = true;	// for eval_update_sse
+	}
+  #elif defined(__i386__)	// android x86 w/o SSE2 - uncommon and not tested
+	int	cpuid_edx, cpuid_ecx;
+	__asm__ (
+		"movl $1, %%eax\n\t"
+		"cpuid"
+	: "=d" (cpuid_edx), "=c" (cpuid_ecx) :: "%eax", "%ebx" );
+	if ((cpuid_edx & 0x04000000u) != 0)
+		hasSSE2 = true;
+  #endif
+}
+#endif
+
+/**
+ * @brief SSE2 translation of board_symetry
+ *
+ * @param board input board
+ * @param sym symmetric output board
+ */
+#ifdef hasSSE2
+
+// Reverse the bit order inside every byte of both 64-bit halves.
+static __m128i vectorcall board_horizontal_mirror_sse(__m128i bb)
+{
+	const __m128i mask0F0F = _mm_set1_epi16(0x0F0F);
+  #if defined(__SSSE3__) || defined(__AVX__)	// pshufb (cf. http://wm.ite.pl/articles/sse-popcount.html)
+	// mbitrev maps a nibble to its bit-reversed value; the two nibbles of each byte are then swapped
+	const __m128i mbitrev = _mm_set_epi8(15, 7, 11, 3, 13, 5, 9, 1, 14, 6, 10, 2, 12, 4, 8, 0);
+	bb = _mm_or_si128(_mm_shuffle_epi8(mbitrev, _mm_and_si128(_mm_srli_epi64(bb, 4), mask0F0F)),
+		_mm_slli_epi64(_mm_shuffle_epi8(mbitrev, _mm_and_si128(bb, mask0F0F)), 4));
+  #else
+	// swap adjacent bits, then bit pairs, then nibbles
+	const __m128i mask5555 = _mm_set1_epi16(0x5555);
+	const __m128i mask3333 = _mm_set1_epi16(0x3333);
+	bb = _mm_or_si128(_mm_and_si128(_mm_srli_epi64(bb, 1), mask5555), _mm_slli_epi64(_mm_and_si128(bb, mask5555), 1));
+	bb = _mm_or_si128(_mm_and_si128(_mm_srli_epi64(bb, 2), mask3333), _mm_slli_epi64(_mm_and_si128(bb, mask3333), 2));
+	bb = _mm_or_si128(_mm_and_si128(_mm_srli_epi64(bb, 4), mask0F0F), _mm_slli_epi64(_mm_and_si128(bb, mask0F0F), 4));
+  #endif
+	return bb;
+}
+
+void board_horizontal_mirror(const Board *board, Board *sym)
+{
+	_mm_storeu_si128((__m128i *) sym, board_horizontal_mirror_sse(_mm_loadu_si128((__m128i *) board)));
+}
+
+// Reverse the byte order inside each 64-bit half.
+static __m128i vectorcall board_vertical_mirror_sse(__m128i bb)
+{
+  #if defined(__SSSE3__) || defined(__AVX__)	// pshufb
+	return _mm_shuffle_epi8(bb, _mm_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7));
+  #else
+	// swap the bytes of each 16-bit word, then reverse the four words of each half
+	bb = _mm_or_si128(_mm_srli_epi16(bb, 8), _mm_slli_epi16(bb, 8));
+	return _mm_shufflehi_epi16(_mm_shufflelo_epi16(bb, 0x1b), 0x1b);
+  #endif
+}
+
+void board_vertical_mirror(const Board *board, Board *sym)
+{
+  #if defined(__SSSE3__) || defined(__AVX__) || !defined(HAS_CPU_64)
+	_mm_storeu_si128((__m128i *) sym, board_vertical_mirror_sse(_mm_loadu_si128((__m128i *) board)));
+  #else	// use BSWAP64
+	sym->player = vertical_mirror(board->player);
+	sym->opponent = vertical_mirror(board->opponent);
+  #endif
+}
+
+// Three masked swap steps with shifts 7, 14 and 28 on each 64-bit half.
+static __m128i vectorcall board_transpose_sse(__m128i bb)
+{
+	const __m128i mask00AA = _mm_set1_epi16(0x00AA);
+	const __m128i maskCCCC = _mm_set1_epi32(0x0000CCCC);
+	const __m128i mask00F0 = _mm_set1_epi64x(0x00000000F0F0F0F0);
+	__m128i tt = _mm_and_si128(_mm_xor_si128(bb, _mm_srli_epi64(bb, 7)), mask00AA);
+	bb = _mm_xor_si128(_mm_xor_si128(bb, tt), _mm_slli_epi64(tt, 7));
+	tt = _mm_and_si128(_mm_xor_si128(bb, _mm_srli_epi64(bb, 14)), maskCCCC);
+	bb = _mm_xor_si128(_mm_xor_si128(bb, tt), _mm_slli_epi64(tt, 14));
+	tt = _mm_and_si128(_mm_xor_si128(bb, _mm_srli_epi64(bb, 28)), mask00F0);
+	bb = _mm_xor_si128(_mm_xor_si128(bb, tt), _mm_slli_epi64(tt, 28));
+	return bb;
+}
+
+void board_transpose(const Board *board, Board *sym)
+{
+	_mm_storeu_si128((__m128i *) sym, board_transpose_sse(_mm_loadu_si128((__m128i *) board)));
+}
+
+// s is a bit set: 1 = horizontal mirror, 2 = vertical mirror, 4 = transpose, applied in that order.
+void board_symetry(const Board *board, const int s, Board *sym)
+{
+	__m128i	bb = _mm_loadu_si128((__m128i *) board);
+	if (s & 1)
+		bb = board_horizontal_mirror_sse(bb);
+	if (s & 2)
+		bb = board_vertical_mirror_sse(bb);
+	if (s & 4)
+		bb = board_transpose_sse(bb);
+
+	_mm_storeu_si128((__m128i *) sym, bb);
+	board_check(sym);
+}
+
+#elif defined(__ARM_NEON) && !defined(DISPATCH_NEON)
+
+// NEON counterparts of the SSE helpers above.
+static uint64x2_t board_horizontal_mirror_neon(uint64x2_t bb)
+{
+  #ifdef HAS_CPU_64
+	bb = vreinterpretq_u64_u8(vrbitq_u8(vreinterpretq_u8_u64(bb)));
+  #else
+	bb = vbslq_u64(vdupq_n_u64(0x5555555555555555), vshrq_n_u64(bb, 1), vshlq_n_u64(bb, 1));
+	bb = vbslq_u64(vdupq_n_u64(0x3333333333333333), vshrq_n_u64(bb, 2), vshlq_n_u64(bb, 2));
+	bb = vreinterpretq_u64_u8(vsliq_n_u8(vshrq_n_u8(vreinterpretq_u8_u64(bb), 4), vreinterpretq_u8_u64(bb), 4));
+  #endif
+	return bb;
+}
+
+void board_horizontal_mirror(const Board *board, Board *sym)
+{
+	vst1q_u64((uint64_t *) sym, board_horizontal_mirror_neon(vld1q_u64((uint64_t *) board)));
+}
+
+static uint64x2_t board_vertical_mirror_neon(uint64x2_t bb)
+{
+	return vreinterpretq_u64_u8(vrev64q_u8(vreinterpretq_u8_u64(bb)));
+}
+
+void board_vertical_mirror(const Board *board, Board *sym)
+{
+	vst1q_u64((uint64_t *) sym, board_vertical_mirror_neon(vld1q_u64((uint64_t *) board)));
+}
+
+static uint64x2_t board_transpose_neon(uint64x2_t bb)
+{
+	uint64x2_t tt = vandq_u64(veorq_u64(bb, vshrq_n_u64(bb, 7)), vdupq_n_u64(0x00AA00AA00AA00AA));
+	bb = veorq_u64(veorq_u64(bb, tt), vshlq_n_u64(tt, 7));
+	tt = vandq_u64(veorq_u64(bb, vshrq_n_u64(bb, 14)), vdupq_n_u64(0x0000CCCC0000CCCC));
+	bb = veorq_u64(veorq_u64(bb, tt), vshlq_n_u64(tt, 14));
+	tt = vandq_u64(veorq_u64(bb, vshrq_n_u64(bb, 28)), vdupq_n_u64(0x00000000F0F0F0F0));
+	bb = veorq_u64(veorq_u64(bb, tt), vshlq_n_u64(tt, 28));
+	return bb;
+}
+
+void board_transpose(const Board *board, Board *sym)
+{
+	vst1q_u64((uint64_t *) sym, board_transpose_neon(vld1q_u64((uint64_t *) board)));
+}
+
+// s is a bit set: 1 = horizontal mirror, 2 = vertical mirror, 4 = transpose, applied in that order.
+void board_symetry(const Board *board, const int s, Board *sym)
+{
+	uint64x2_t bb = vld1q_u64((uint64_t *) board);
+	if (s & 1)
+		bb = board_horizontal_mirror_neon(bb);
+	if (s & 2)
+		bb = board_vertical_mirror_neon(bb);
+	if (s & 4)
+		bb = board_transpose_neon(bb);
+
+	vst1q_u64((uint64_t *) sym, bb);
+	board_check(sym);
+}
+
+#endif // hasSSE2/Neon
+
+#ifdef __AVX2__
+/**
+ * @brief unique board
+ *
+ * Compute a board unique from all its possible symmetries.
+ * + * @param board input board + * @param unique output board + */ +static void board_horizontal_mirror_avx(const __m256i *bb, __m256i *sym) +{ + const __m256i mask0F0F = _mm256_set1_epi16(0x0F0F); + const __m256i mbitrev = _mm256_set_epi8( //cf. http://wm.ite.pl/articles/sse-popcount.html + 15, 7, 11, 3, 13, 5, 9, 1, 14, 6, 10, 2, 12, 4, 8, 0, + 15, 7, 11, 3, 13, 5, 9, 1, 14, 6, 10, 2, 12, 4, 8, 0); + *sym = _mm256_or_si256(_mm256_shuffle_epi8(mbitrev, _mm256_and_si256(_mm256_srli_epi64(*bb, 4), mask0F0F)), + _mm256_slli_epi64(_mm256_shuffle_epi8(mbitrev, _mm256_and_si256(*bb, mask0F0F)), 4)); +} + +static void board_vertical_mirror_avx(const __m256i *bb, __m256i *sym) +{ + *sym = _mm256_shuffle_epi8(*bb, _mm256_set_epi8( + 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7)); +} + +int board_unique(const Board *board, Board *unique) +{ + Board sym[8]; + int i, j, s = 0; + static const char reorder[8] = { 0, 2, 4, 6, 1, 5, 3, 7 }; + + sym[0] = *board; + board_transpose(board, &sym[1]); // was sym[4] + board_horizontal_mirror_avx((__m256i *) &sym[0], (__m256i *) &sym[2]); // were sym[1] & sym[6] + board_vertical_mirror_avx((__m256i *) &sym[0], (__m256i *) &sym[4]); // were sym[2] & sym[5] + board_vertical_mirror_avx((__m256i *) &sym[2], (__m256i *) &sym[6]); // were sym[3] & sym[7] + + *unique = *board; + for (i = 1; i < 8; ++i) { + j = reorder[i]; + if (board_lesser(&sym[j], unique)) { + *unique = sym[j]; + s = i; + } + } + + board_check(unique); + return s; +} +#endif + +/** + * @brief Compute a board resulting of a move played on a previous board. + * + * @param OP board to play the move on. + * @param x move to play. + * @param next resulting board. + * @return flipped discs. 
+ */ +#if (MOVE_GENERATOR == MOVE_GENERATOR_AVX) || (MOVE_GENERATOR == MOVE_GENERATOR_AVX512) || (MOVE_GENERATOR == MOVE_GENERATOR_SSE) + +unsigned long long vectorcall board_next_sse(__m128i OP, const int x, Board *next) +{ + __m128i flipped = reduce_vflip(mm_Flip(OP, x)); + + OP = _mm_xor_si128(OP, _mm_or_si128(flipped, _mm_loadl_epi64((__m128i *) &X_TO_BIT[x]))); + _mm_storeu_si128((__m128i *) next, _mm_shuffle_epi32(OP, 0x4e)); + + return _mm_cvtsi128_si64(flipped); +} + +#elif MOVE_GENERATOR == MOVE_GENERATOR_NEON + +unsigned long long board_next_neon(uint64x2_t OP, const int x, Board *next) +{ + uint64x2_t flipped = mm_Flip(OP, x); + #if !defined(_MSC_VER) && !defined(__clang__) // MSVC-arm32 does not have vld1q_lane_u64 + // arm64-gcc-13: 21, armv8a-clang-16: 23, msvc-arm64-19: 22, gcc-arm-13: 18, clang-armv7-11: 29 // https://godbolt.org/z/cvhns39rK + OP = veorq_u64(OP, vorrq_u64(flipped, vld1q_lane_u64((uint64_t *) &X_TO_BIT[x], flipped, 0))); + vst1q_u64((uint64_t *) next, vextq_u64(OP, OP, 1)); + #else // arm64-gcc-13: 21, armv8a-clang-16: 22, msvc-arm64-19: 21, gcc-arm-13: 23, clang-armv7-11: 27 + OP = veorq_u64(OP, flipped); + vst1q_u64((uint64_t *) next, vcombine_u64(vget_high_u64(OP), vorr_u64(vget_low_u64(OP), vld1_u64((uint64_t *) &X_TO_BIT[x])))); + #endif + return vgetq_lane_u64(flipped, 0); +} +#endif + +/** + * @brief X64 optimized get_moves + * + * Diag-7 is converted to diag-9 (v.v.) using vertical mirroring + * in SSE versions. + * + * @param P bitboard with player's discs. + * @param O bitboard with opponent's discs. + * @return all legal moves in a 64-bit unsigned integer. 
+ */
+#ifdef __AVX2__	// 4 AVX
+
+  #if defined(_MSC_VER) || defined(__linux__)	// vectorcall and SYSV-ABI passes __m256i in registers
+// PP / OO: player / opponent bitboards; callers in board.h broadcast them to all four 64-bit lanes
+unsigned long long vectorcall get_moves_avx(__m256i PP, __m256i OO)
+{
+  #else
+unsigned long long get_moves(unsigned long long P, unsigned long long O)	// minGW
+{
+	__m256i	PP = _mm256_broadcastq_epi64(_mm_cvtsi64_si128(P));
+	__m256i	OO = _mm256_broadcastq_epi64(_mm_cvtsi64_si128(O));
+  #endif
+	__m256i	MM, flip_l, flip_r, pre_l, pre_r, shift2;
+	__m128i	M;
+	// one 64-bit lane per direction pair: shift counts 1, 8, 9, 7 from the lowest lane up
+	const __m256i shift1897 = _mm256_set_epi64x(7, 9, 8, 1);
+	// opponent discs masked per lane, in the same lane order as shift1897
+	__m256i	mOO = _mm256_and_si256(OO, _mm256_set_epi64x(0x007E7E7E7E7E7E00, 0x007E7E7E7E7E7E00, 0x00FFFFFFFFFFFF00, 0x7E7E7E7E7E7E7E7E));
+	__m128i	occupied = _mm_or_si128(_mm256_castsi256_si128(PP), _mm256_castsi256_si128(OO));
+
+	// flip_l works with left shifts, flip_r with right shifts, all four lanes at once
+	flip_l = _mm256_and_si256(mOO, _mm256_sllv_epi64(PP, shift1897));
+	flip_r = _mm256_and_si256(mOO, _mm256_srlv_epi64(PP, shift1897));
+	flip_l = _mm256_or_si256(flip_l, _mm256_and_si256(mOO, _mm256_sllv_epi64(flip_l, shift1897)));
+	flip_r = _mm256_or_si256(flip_r, _mm256_and_si256(mOO, _mm256_srlv_epi64(flip_r, shift1897)));
+	pre_l = _mm256_and_si256(mOO, _mm256_sllv_epi64(mOO, shift1897));
+	pre_r = _mm256_srlv_epi64(pre_l, shift1897);
+	shift2 = _mm256_add_epi64(shift1897, shift1897);
+	flip_l = _mm256_or_si256(flip_l, _mm256_and_si256(pre_l, _mm256_sllv_epi64(flip_l, shift2)));
+	flip_r = _mm256_or_si256(flip_r, _mm256_and_si256(pre_r, _mm256_srlv_epi64(flip_r, shift2)));
+	flip_l = _mm256_or_si256(flip_l, _mm256_and_si256(pre_l, _mm256_sllv_epi64(flip_l, shift2)));
+	flip_r = _mm256_or_si256(flip_r, _mm256_and_si256(pre_r, _mm256_srlv_epi64(flip_r, shift2)));
+	MM = _mm256_or_si256(_mm256_sllv_epi64(flip_l, shift1897), _mm256_srlv_epi64(flip_r, shift1897));
+
+	// OR the four lanes together
+	M = _mm_or_si128(_mm256_castsi256_si128(MM), _mm256_extracti128_si256(MM, 1));
+	return _mm_cvtsi128_si64(_mm_andnot_si128(occupied, _mm_or_si128(M, _mm_unpackhi_epi64(M, M))));	// mask with empties
+}
+
+#elif defined(__x86_64__) || defined(_M_X64)	// 2 SSE, 2 CPU
+
+// The SSE column handles both diagonals (low half: board as is, high half:
+// vertically mirrored board); the two scalar columns handle the
+// horizontal (shift 1) and vertical (shift 8) directions.
+unsigned long long get_moves(const unsigned long long P, const unsigned long long O)
+{
+	unsigned long long moves, mO, flip1, pre1, flip8, pre8;
+	__m128i	PP, mOO, MM, flip, pre;
+
+	mO = O & 0x7e7e7e7e7e7e7e7eULL;
+	PP = _mm_set_epi64x(vertical_mirror(P), P);
+	mOO = _mm_set_epi64x(vertical_mirror(mO), mO);
+		/* shift=-9:+7 */								/* shift=+1 */			/* shift = +8 */
+	flip = _mm_and_si128(mOO, _mm_slli_epi64(PP, 7));				flip1 = mO & (P << 1);		flip8 = O & (P << 8);
+	flip = _mm_or_si128(flip, _mm_and_si128(mOO, _mm_slli_epi64(flip, 7)));		flip1 |= mO & (flip1 << 1);	flip8 |= O & (flip8 << 8);
+	pre = _mm_and_si128(mOO, _mm_slli_epi64(mOO, 7));				pre1 = mO & (mO << 1);		pre8 = O & (O << 8);
+	flip = _mm_or_si128(flip, _mm_and_si128(pre, _mm_slli_epi64(flip, 14)));	flip1 |= pre1 & (flip1 << 2);	flip8 |= pre8 & (flip8 << 16);
+	flip = _mm_or_si128(flip, _mm_and_si128(pre, _mm_slli_epi64(flip, 14)));	flip1 |= pre1 & (flip1 << 2);	flip8 |= pre8 & (flip8 << 16);
+	MM = _mm_slli_epi64(flip, 7);							moves = flip1 << 1;		moves |= flip8 << 8;
+		/* shift=-7:+9 */								/* shift=-1 */			/* shift = -8 */
+	flip = _mm_and_si128(mOO, _mm_slli_epi64(PP, 9));				flip1 = mO & (P >> 1);		flip8 = O & (P >> 8);
+	flip = _mm_or_si128(flip, _mm_and_si128(mOO, _mm_slli_epi64(flip, 9)));		flip1 |= mO & (flip1 >> 1);	flip8 |= O & (flip8 >> 8);
+	pre = _mm_and_si128(mOO, _mm_slli_epi64(mOO, 9));				pre1 >>= 1;			pre8 >>= 8;
+	flip = _mm_or_si128(flip, _mm_and_si128(pre, _mm_slli_epi64(flip, 18)));	flip1 |= pre1 & (flip1 >> 2);	flip8 |= pre8 & (flip8 >> 16);
+	flip = _mm_or_si128(flip, _mm_and_si128(pre, _mm_slli_epi64(flip, 18)));	flip1 |= pre1 & (flip1 >> 2);	flip8 |= pre8 & (flip8 >> 16);
+	MM = _mm_or_si128(MM, _mm_slli_epi64(flip, 9));					moves |= flip1 >> 1;		moves |= flip8 >> 8;
+
+	// the high half was computed on the mirrored board: mirror it back before merging
+	moves |= _mm_cvtsi128_si64(MM) | vertical_mirror(_mm_cvtsi128_si64(_mm_unpackhi_epi64(MM, MM)));
+	return moves & ~(P|O);	// mask with empties
+}
+
+#elif defined(__aarch64__) ||
defined(_M_ARM64) // 4 CPU + +unsigned long long get_moves(const unsigned long long P, const unsigned long long O) +{ + unsigned long long moves, mO; + unsigned long long flip1, flip7, flip9, flip8, pre1, pre7, pre9, pre8; + + mO = O & 0x7e7e7e7e7e7e7e7eULL; + flip1 = mO & (P << 1); flip7 = mO & (P << 7); flip9 = mO & (P << 9); flip8 = O & (P << 8); + flip1 |= mO & (flip1 << 1); flip7 |= mO & (flip7 << 7); flip9 |= mO & (flip9 << 9); flip8 |= O & (flip8 << 8); + pre1 = mO & (mO << 1); pre7 = mO & (mO << 7); pre9 = mO & (mO << 9); pre8 = O & (O << 8); + flip1 |= pre1 & (flip1 << 2); flip7 |= pre7 & (flip7 << 14); flip9 |= pre9 & (flip9 << 18); flip8 |= pre8 & (flip8 << 16); + flip1 |= pre1 & (flip1 << 2); flip7 |= pre7 & (flip7 << 14); flip9 |= pre9 & (flip9 << 18); flip8 |= pre8 & (flip8 << 16); + moves = flip1 << 1; moves |= flip7 << 7; moves |= flip9 << 9; moves |= flip8 << 8; + flip1 = mO & (P >> 1); flip7 = mO & (P >> 7); flip9 = mO & (P >> 9); flip8 = O & (P >> 8); + flip1 |= mO & (flip1 >> 1); flip7 |= mO & (flip7 >> 7); flip9 |= mO & (flip9 >> 9); flip8 |= O & (flip8 >> 8); + pre1 >>= 1; pre7 >>= 7; pre9 >>= 9; pre8 >>= 8; + flip1 |= pre1 & (flip1 >> 2); flip7 |= pre7 & (flip7 >> 14); flip9 |= pre9 & (flip9 >> 18); flip8 |= pre8 & (flip8 >> 16); + flip1 |= pre1 & (flip1 >> 2); flip7 |= pre7 & (flip7 >> 14); flip9 |= pre9 & (flip9 >> 18); flip8 |= pre8 & (flip8 >> 16); + moves |= flip1 >> 1; moves |= flip7 >> 7; moves |= flip9 >> 9; moves |= flip8 >> 8; + + return moves & ~(P|O); // mask with empties +} + +#elif defined(__ARM_NEON) // 3 Neon, 1 CPU(32) + + #ifndef DISPATCH_NEON + #define get_moves_sse get_moves // no dispatch + #endif + +unsigned long long get_moves_sse(unsigned long long P, unsigned long long O) +{ + unsigned int mO, movesL, movesH, flip1, pre1; + uint64x1_t rP, rO; + uint64x2_t PP, OO, MM, flip, pre; + + /* vertical_mirror in PP[1], OO[1] */ mO = (unsigned int) O & 0x7e7e7e7e; + rP = vreinterpret_u64_u8(vrev64_u8(vcreate_u8(P))); flip1 = mO 
& ((unsigned int) P << 1); + PP = vcombine_u64(vcreate_u64(P), rP); flip1 |= mO & (flip1 << 1); + pre1 = mO & (mO << 1); + rO = vreinterpret_u64_u8(vrev64_u8(vcreate_u8(O))); flip1 |= pre1 & (flip1 << 2); + OO = vcombine_u64(vcreate_u64(O), rO); flip1 |= pre1 & (flip1 << 2); + movesL = flip1 << 1; + + flip = vandq_u64(OO, vshlq_n_u64(PP, 8)); flip1 = mO & ((unsigned int) P >> 1); + flip = vorrq_u64(flip, vandq_u64(OO, vshlq_n_u64(flip, 8))); flip1 |= mO & (flip1 >> 1); + pre = vandq_u64(OO, vshlq_n_u64(OO, 8)); pre1 >>= 1; + flip = vorrq_u64(flip, vandq_u64(pre, vshlq_n_u64(flip, 16))); flip1 |= pre1 & (flip1 >> 2); + flip = vorrq_u64(flip, vandq_u64(pre, vshlq_n_u64(flip, 16))); flip1 |= pre1 & (flip1 >> 2); + MM = vshlq_n_u64(flip, 8); movesL |= flip1 >> 1; + + OO = vandq_u64(OO, vdupq_n_u64(0x7e7e7e7e7e7e7e7e)); mO = (unsigned int) (O >> 32) & 0x7e7e7e7e; + flip = vandq_u64(OO, vshlq_n_u64(PP, 7)); flip1 = mO & ((unsigned int) (P >> 32) << 1); + flip = vorrq_u64(flip, vandq_u64(OO, vshlq_n_u64(flip, 7))); flip1 |= mO & (flip1 << 1); + pre = vandq_u64(OO, vshlq_n_u64(OO, 7)); pre1 = mO & (mO << 1); + flip = vorrq_u64(flip, vandq_u64(pre, vshlq_n_u64(flip, 14))); flip1 |= pre1 & (flip1 << 2); + flip = vorrq_u64(flip, vandq_u64(pre, vshlq_n_u64(flip, 14))); flip1 |= pre1 & (flip1 << 2); + MM = vorrq_u64(MM, vshlq_n_u64(flip, 7)); movesH = flip1 << 1; + + flip = vandq_u64(OO, vshlq_n_u64(PP, 9)); flip1 = mO & ((unsigned int) (P >> 32) >> 1); + flip = vorrq_u64(flip, vandq_u64(OO, vshlq_n_u64(flip, 9))); flip1 |= mO & (flip1 >> 1); + pre = vandq_u64(OO, vshlq_n_u64(OO, 9)); pre1 >>= 1; + flip = vorrq_u64(flip, vandq_u64(pre, vshlq_n_u64(flip, 18))); flip1 |= pre1 & (flip1 >> 2); + flip = vorrq_u64(flip, vandq_u64(pre, vshlq_n_u64(flip, 18))); flip1 |= pre1 & (flip1 >> 2); + MM = vorrq_u64(MM, vshlq_n_u64(flip, 9)); movesH |= flip1 >> 1; + + movesL |= vgetq_lane_u32(vreinterpretq_u32_u64(MM), 0) | bswap_int(vgetq_lane_u32(vreinterpretq_u32_u64(MM), 3)); + movesH |= 
vgetq_lane_u32(vreinterpretq_u32_u64(MM), 1) | bswap_int(vgetq_lane_u32(vreinterpretq_u32_u64(MM), 2)); + return (movesL | ((unsigned long long) movesH << 32)) & ~(P|O); // mask with empties +} + +#else // AVX/x86_64/arm +/** + * @brief SSE optimized get_moves for x86 - 3 SSE, 1 CPU(32) + * + */ + #if defined(hasSSE2) || defined(USE_MSVC_X86) || defined(ANDROID) + + #ifdef hasSSE2 + #define get_moves_sse get_moves // no dispatch + #endif + +unsigned long long get_moves_sse(const unsigned long long P, const unsigned long long O) +{ + unsigned int mO, movesL, movesH, flip1, pre1; + __m128i OP, rOP, PP, OO, MM, flip, pre; + + // vertical_mirror in PP[1], OO[1] + OP = _mm_unpacklo_epi64(_mm_cvtsi64_si128(P), _mm_cvtsi64_si128(O)); mO = (unsigned int) O & 0x7e7e7e7eU; + rOP = _mm_shufflelo_epi16(OP, 0x1B); flip1 = mO & ((unsigned int) P << 1); + rOP = _mm_shufflehi_epi16(rOP, 0x1B); flip1 |= mO & (flip1 << 1); + rOP = _mm_or_si128(_mm_srli_epi16(rOP, 8), _mm_slli_epi16(rOP, 8)); pre1 = mO & (mO << 1); + flip1 |= pre1 & (flip1 << 2); + PP = _mm_unpacklo_epi64(OP, rOP); flip1 |= pre1 & (flip1 << 2); + OO = _mm_unpackhi_epi64(OP, rOP); movesL = flip1 << 1; + + flip = _mm_and_si128(OO, _mm_slli_epi64(PP, 8)); flip1 = mO & ((unsigned int) P >> 1); + flip = _mm_or_si128(flip, _mm_and_si128(OO, _mm_slli_epi64(flip, 8))); flip1 |= mO & (flip1 >> 1); + pre = _mm_and_si128(OO, _mm_slli_epi64(OO, 8)); pre1 >>= 1; + flip = _mm_or_si128(flip, _mm_and_si128(pre, _mm_slli_epi64(flip, 16))); flip1 |= pre1 & (flip1 >> 2); + flip = _mm_or_si128(flip, _mm_and_si128(pre, _mm_slli_epi64(flip, 16))); flip1 |= pre1 & (flip1 >> 2); + MM = _mm_slli_epi64(flip, 8); movesL |= flip1 >> 1; + + OO = _mm_and_si128(OO, _mm_set1_epi8(0x7e)); mO = (unsigned int) (O >> 32) & 0x7e7e7e7eU; + flip = _mm_and_si128(OO, _mm_slli_epi64(PP, 7)); flip1 = mO & ((unsigned int) (P >> 32) << 1); + flip = _mm_or_si128(flip, _mm_and_si128(OO, _mm_slli_epi64(flip, 7))); flip1 |= mO & (flip1 << 1); + pre = 
_mm_and_si128(OO, _mm_slli_epi64(OO, 7)); pre1 = mO & (mO << 1); + flip = _mm_or_si128(flip, _mm_and_si128(pre, _mm_slli_epi64(flip, 14))); flip1 |= pre1 & (flip1 << 2); + flip = _mm_or_si128(flip, _mm_and_si128(pre, _mm_slli_epi64(flip, 14))); flip1 |= pre1 & (flip1 << 2); + MM = _mm_or_si128(MM, _mm_slli_epi64(flip, 7)); movesH = flip1 << 1; + + flip = _mm_and_si128(OO, _mm_slli_epi64(PP, 9)); flip1 = mO & ((unsigned int) (P >> 32) >> 1); + flip = _mm_or_si128(flip, _mm_and_si128(OO, _mm_slli_epi64(flip, 9))); flip1 |= mO & (flip1 >> 1); + pre = _mm_and_si128(OO, _mm_slli_epi64(OO, 9)); pre1 >>= 1; + flip = _mm_or_si128(flip, _mm_and_si128(pre, _mm_slli_epi64(flip, 18))); flip1 |= pre1 & (flip1 >> 2); + flip = _mm_or_si128(flip, _mm_and_si128(pre, _mm_slli_epi64(flip, 18))); flip1 |= pre1 & (flip1 >> 2); + MM = _mm_or_si128(MM, _mm_slli_epi64(flip, 9)); movesH |= flip1 >> 1; + + movesL |= _mm_cvtsi128_si32(MM); MM = _mm_srli_si128(MM, 4); + movesH |= _mm_cvtsi128_si32(MM); MM = _mm_srli_si128(MM, 4); + movesH |= bswap_int(_mm_cvtsi128_si32(MM)); + movesL |= bswap_int(_mm_cvtsi128_si32(_mm_srli_si128(MM, 4))); + return (movesL | ((unsigned long long) movesH << 32)) & ~(P|O); // mask with empties +} + + #else // non-VEX asm + +unsigned long long get_moves_sse(const unsigned long long P, const unsigned long long O) +{ + unsigned long long moves; + static const V2DI mask7e = {{ 0x7e7e7e7e7e7e7e7eULL, 0x7e7e7e7e7e7e7e7eULL }}; + + __asm__ ( + "movl %1, %%ebx\n\t" + "movl %3, %%edi\n\t" + "andl $0x7e7e7e7e, %%edi\n\t" + /* shift=-1 */ /* vertical mirror in PP[1], OO[1] */ + "movl %%ebx, %%eax\n\t" "movd %1, %%xmm4\n\t" // (movd for store-forwarding) + "shrl $1, %%eax\n\t" "movd %2, %%xmm0\n\t" + "andl %%edi, %%eax\n\t" "movd %3, %%xmm5\n\t" + "movl %%eax, %%edx\n\t" "movd %4, %%xmm1\n\t" + "shrl $1, %%eax\n\t" "punpckldq %%xmm0, %%xmm4\n\t" // P + "movl %%edi, %%ecx\n\t" "punpckldq %%xmm1, %%xmm5\n\t" // O + "andl %%edi, %%eax\n\t" "punpcklqdq %%xmm5, %%xmm4\n\t" // OP 
+ "shrl $1, %%ecx\n\t" "pshuflw $0x1b, %%xmm4, %%xmm0\n\t" + "orl %%edx, %%eax\n\t" "pshufhw $0x1b, %%xmm0, %%xmm0\n\t" + "andl %%edi, %%ecx\n\t" "movdqa %%xmm0, %%xmm1\n\t" + "movl %%eax, %%edx\n\t" "psllw $8, %%xmm0\n\t" + "shrl $2, %%eax\n\t" "psrlw $8, %%xmm1\n\t" + "andl %%ecx, %%eax\n\t" "por %%xmm1, %%xmm0\n\t" // rOP + "orl %%eax, %%edx\n\t" + "shrl $2, %%eax\n\t" "movdqa %%xmm4, %%xmm5\n\t" + "andl %%ecx, %%eax\n\t" "punpcklqdq %%xmm0, %%xmm4\n\t" // PP + "orl %%edx, %%eax\n\t" "punpckhqdq %%xmm0, %%xmm5\n\t" // OO + "shrl $1, %%eax\n\t" + /* shift=+1 */ /* shift=-8:+8 */ + "movdqa %%xmm4, %%xmm0\n\t" + "addl %%ebx, %%ebx\n\t" "psllq $8, %%xmm0\n\t" + "andl %%edi, %%ebx\n\t" "pand %%xmm5, %%xmm0\n\t" // 0 m7&o6 m6&o5 .. m1&o0 + "movl %%ebx, %%edx\n\t" "movdqa %%xmm0, %%xmm1\n\t" + "addl %%ebx, %%ebx\n\t" "psllq $8, %%xmm0\n\t" + "movdqa %%xmm5, %%xmm3\n\t" + "andl %%edi, %%ebx\n\t" "pand %%xmm5, %%xmm0\n\t" // 0 0 m7&o6&o5 .. m2&o1&o0 + "psllq $8, %%xmm3\n\t" + "orl %%ebx, %%edx\n\t" "por %%xmm1, %%xmm0\n\t" // 0 m7&o6 (m6&o5)|(m7&o6&o5) .. (m1&o0) + "addl %%ecx, %%ecx\n\t" "pand %%xmm5, %%xmm3\n\t" // 0 o7&o6 o6&o5 o5&o4 o4&o3 .. + "movdqa %%xmm0, %%xmm2\n\t" + "leal (,%%edx,4), %%ebx\n\t" "psllq $16, %%xmm0\n\t" + "andl %%ecx, %%ebx\n\t" "pand %%xmm3, %%xmm0\n\t" // 0 0 0 m7&o6&o5&o4 (m6&o5&o4&o3)|(m7&o6&o5&o4&o3) .. + "orl %%ebx, %%edx\n\t" "por %%xmm0, %%xmm2\n\t" + "shll $2, %%ebx\n\t" "psllq $16, %%xmm0\n\t" + "andl %%ecx, %%ebx\n\t" "pand %%xmm3, %%xmm0\n\t" // 0 0 0 0 0 m7&o6&..&o2 (m6&o5&..&o1)|(m7&o6&..&o1) .. 
+ "orl %%edx, %%ebx\n\t" "por %%xmm0, %%xmm2\n\t" + "addl %%ebx, %%ebx\n\t" "psllq $8, %%xmm2\n\t" + "orl %%eax, %%ebx\n\t" + + "movl %2, %%esi\n\t" + "movl %4, %%edi\n\t" + /* shift=-1 */ /* shift=-9:+7 */ + "andl $0x7e7e7e7e,%%edi\n\t" "pand %5, %%xmm5\n\t" + "movl %%esi, %%eax\n\t" "movdqa %%xmm4, %%xmm0\n\t" + "shrl $1, %%eax\n\t" "psllq $7, %%xmm0\n\t" + "andl %%edi, %%eax\n\t" "pand %%xmm5, %%xmm0\n\t" + "movl %%eax, %%edx\n\t" "movdqa %%xmm0, %%xmm1\n\t" + "shrl $1, %%eax\n\t" "psllq $7, %%xmm0\n\t" + "andl %%edi, %%eax\n\t" "pand %%xmm5, %%xmm0\n\t" + "movl %%edi, %%ecx\n\t" "movdqa %%xmm5, %%xmm3\n\t" + "orl %%edx, %%eax\n\t" "por %%xmm1, %%xmm0\n\t" + "shrl $1, %%ecx\n\t" "psllq $7, %%xmm3\n\t" + "movl %%eax, %%edx\n\t" "movdqa %%xmm0, %%xmm1\n\t" + "andl %%edi, %%ecx\n\t" "pand %%xmm5, %%xmm3\n\t" + "shrl $2, %%eax\n\t" "psllq $14, %%xmm0\n\t" + "andl %%ecx, %%eax\n\t" "pand %%xmm3, %%xmm0\n\t" + "orl %%eax, %%edx\n\t" "por %%xmm0, %%xmm1\n\t" + "shrl $2, %%eax\n\t" "psllq $14, %%xmm0\n\t" + "andl %%ecx, %%eax\n\t" "pand %%xmm3, %%xmm0\n\t" + "orl %%edx, %%eax\n\t" "por %%xmm1, %%xmm0\n\t" + "shrl $1, %%eax\n\t" "psllq $7, %%xmm0\n\t" + "por %%xmm0, %%xmm2\n\t" + /* shift=+1 */ /* shift=-7:+9 */ + "movdqa %%xmm4, %%xmm0\n\t" + "addl %%esi, %%esi\n\t" "psllq $9, %%xmm0\n\t" + "andl %%edi, %%esi\n\t" "pand %%xmm5, %%xmm0\n\t" + "movl %%esi, %%edx\n\t" "movdqa %%xmm0, %%xmm1\n\t" + "addl %%esi, %%esi\n\t" "psllq $9, %%xmm0\n\t" + "andl %%edi, %%esi\n\t" "pand %%xmm5, %%xmm0\n\t" + "movdqa %%xmm5, %%xmm3\n\t" + "orl %%esi, %%edx\n\t" "por %%xmm1, %%xmm0\n\t" + "psllq $9, %%xmm3\n\t" + "movdqa %%xmm0, %%xmm1\n\t" + "addl %%ecx, %%ecx\n\t" "pand %%xmm5, %%xmm3\n\t" + "leal (,%%edx,4), %%esi\n\t" "psllq $18, %%xmm0\n\t" + "andl %%ecx, %%esi\n\t" "pand %%xmm3, %%xmm0\n\t" + "orl %%esi, %%edx\n\t" "por %%xmm0, %%xmm1\n\t" + "shll $2, %%esi\n\t" "psllq $18, %%xmm0\n\t" + "andl %%ecx, %%esi\n\t" "pand %%xmm3, %%xmm0\n\t" + "orl %%edx, %%esi\n\t" "por %%xmm1, 
%%xmm0\n\t" + "addl %%esi, %%esi\n\t" "psllq $9, %%xmm0\n\t" + "orl %%eax, %%esi\n\t" "por %%xmm0, %%xmm2\n\t" + + "movl %1, %%eax\n\t" "movhlps %%xmm2, %%xmm3\n\t" + "movl %2, %%edx\n\t" "movd %%xmm3, %%edi\n\t" "movd %%xmm2, %%ecx\n\t" + "psrlq $32, %%xmm3\n\t" "psrlq $32, %%xmm2\n\t" + "bswapl %%edi\n\t" "orl %%ecx, %%ebx\n\t" + "orl %3, %%eax\n\t" "orl %%edi, %%esi\n\t" + "orl %4, %%edx\n\t" "movd %%xmm3, %%edi\n\t" "movd %%xmm2, %%ecx\n\t" + "notl %%eax\n\t" "bswapl %%edi\n\t" + "notl %%edx\n\t" "orl %%edi, %%ebx\n\t" "orl %%ecx, %%esi\n\t" + "andl %%esi, %%edx\n\t" + "andl %%ebx, %%eax" + : "=&A" (moves) + : "m" (P), "m" (((unsigned int *)&P)[1]), "m" (O), "m" (((unsigned int *)&O)[1]), "m" (mask7e) + : "ebx", "ecx", "esi", "edi" ); + + return moves; +} + + #endif // hasSSE2 +#endif // x86 + +#if defined(hasSSE2) || (defined(__ARM_NEON) && !defined(DISPATCH_NEON)) + +/** + * @brief SSE/neon optimized get_stable_edge + * + * @param P bitboard with player's discs. + * @param O bitboard with opponent's discs. + * @return a bitboard with (some of) player's stable discs. 
+ * + */ + #if defined(__aarch64__) || defined(_M_ARM64) // for vaddvq +unsigned long long get_stable_edge(unsigned long long P, unsigned long long O) +{ // compute the exact stable edges (from precomputed tables) + // const int16x8_t shiftv = { 0, 1, 2, 3, 4, 5, 6, 7 }; // error on MSVC + const uint64x2_t shiftv = { 0x0003000200010000, 0x0007000600050004 }; + uint8x16_t PO = vzip1q_u8(vreinterpretq_u8_u64(vdupq_n_u64(O)), vreinterpretq_u8_u64(vdupq_n_u64(P))); + unsigned int a1a8 = edge_stability[vaddvq_u16(vshlq_u16(vreinterpretq_u16_u8(vandq_u8(PO, vdupq_n_u8(1))), vreinterpretq_s16_u64(shiftv)))]; + unsigned int h1h8 = edge_stability[vaddvq_u16(vshlq_u16(vreinterpretq_u16_u8(vshrq_n_u8(PO, 7)), vreinterpretq_s16_u64(shiftv)))]; + return edge_stability[vgetq_lane_u16(vreinterpretq_u16_u8(PO), 0)] + | (unsigned long long) edge_stability[vgetq_lane_u16(vreinterpretq_u16_u8(PO), 7)] << 56 + | unpackA2A7(a1a8) | unpackH2H7(h1h8); +} + + #elif defined(__ARM_NEON) // Neon kindergarten +unsigned long long get_stable_edge(unsigned long long P, unsigned long long O) +{ // compute the exact stable edges (from precomputed tables) + const uint64x2_t kMul = { 0x1020408001020408, 0x1020408001020408 }; + uint64x2_t PP = vcombine_u64(vshl_n_u64(vcreate_u64(P), 7), vcreate_u64(P)); + uint64x2_t OO = vcombine_u64(vshl_n_u64(vcreate_u64(O), 7), vcreate_u64(O)); + uint32x4_t QP = vmulq_u32(vreinterpretq_u32_u64(kMul), vreinterpretq_u32_u8(vshrq_n_u8(vreinterpretq_u8_u64(PP), 7))); + uint32x4_t QO = vmulq_u32(vreinterpretq_u32_u64(kMul), vreinterpretq_u32_u8(vshrq_n_u8(vreinterpretq_u8_u64(OO), 7))); + uint32x2_t DP = vpadd_u32(vget_low_u32(QP), vget_high_u32(QP)); // P_h1h8 * * * P_a1a8 * * * + uint32x2_t DO = vpadd_u32(vget_low_u32(QO), vget_high_u32(QO)); // O_h1h8 * * * O_a1a8 * * * + uint8x8_t DB = vtrn_u8(vreinterpret_u8_u32(DO), vreinterpret_u8_u32(DP)).val[1]; // P_h1h8 O_h1h8 * * P_a1a8 O_a1a8 * * + unsigned int a1a8 = edge_stability[vget_lane_u16(vreinterpret_u16_u8(DB), 
1)]; + unsigned int h1h8 = edge_stability[vget_lane_u16(vreinterpret_u16_u8(DB), 3)]; + uint8x16_t PO = vzipq_u8(vreinterpretq_u8_u64(OO), vreinterpretq_u8_u64(PP)).val[1]; + return edge_stability[vgetq_lane_u16(vreinterpretq_u16_u8(PO), 0)] + | (unsigned long long) edge_stability[vgetq_lane_u16(vreinterpretq_u16_u8(PO), 7)] << 56 + | unpackA2A7(a1a8) | unpackH2H7(h1h8); +} + + #elif defined(hasSSE2) +unsigned long long get_stable_edge(const unsigned long long P, const unsigned long long O) +{ + // compute the exact stable edges (from precomputed tables) + unsigned int a1a8, h1h8; + unsigned long long stable_edge; + + __m128i P0 = _mm_cvtsi64_si128(P); + __m128i O0 = _mm_cvtsi64_si128(O); + __m128i PO = _mm_unpacklo_epi8(O0, P0); + stable_edge = edge_stability[_mm_extract_epi16(PO, 0)] + | ((unsigned long long) edge_stability[_mm_extract_epi16(PO, 7)] << 56); + + PO = _mm_unpacklo_epi64(O0, P0); + a1a8 = edge_stability[_mm_movemask_epi8(_mm_slli_epi64(PO, 7))]; + h1h8 = edge_stability[_mm_movemask_epi8(PO)]; + stable_edge |= unpackA2A7(a1a8) | unpackH2H7(h1h8); + + return stable_edge; +} + #endif + +/** + * @brief SSE/neon optimized get_edge_stability + * + * Compute the exact stable edges from precomputed tables. + * + * @param P bitboard with player's discs. + * @param O bitboard with opponent's discs. + * @return the number of stable discs on the edges. 
+ * + */ + #if defined(__aarch64__) || defined(_M_ARM64) // for vaddvq +int get_edge_stability(const unsigned long long P, const unsigned long long O) +{ + const uint64x2_t shiftv = { 0x0003000200010000, 0x0007000600050004 }; + uint8x16_t PO = vzip1q_u8(vreinterpretq_u8_u64(vdupq_n_u64(O)), vreinterpretq_u8_u64(vdupq_n_u64(P))); + uint8x8_t packedstable = vcreate_u8((edge_stability[vgetq_lane_u16(vreinterpretq_u16_u8(PO), 0)] + | edge_stability[vgetq_lane_u16(vreinterpretq_u16_u8(PO), 7)] << 8) & 0x7e7e); + packedstable = vset_lane_u8(edge_stability[vaddvq_u16(vshlq_u16(vreinterpretq_u16_u8(vandq_u8(PO, vdupq_n_u8(1))), vreinterpretq_s16_u64(shiftv)))], packedstable, 2); + packedstable = vset_lane_u8(edge_stability[vaddvq_u16(vshlq_u16(vreinterpretq_u16_u8(vshrq_n_u8(PO, 7)), vreinterpretq_s16_u64(shiftv)))], packedstable, 3); + return vaddv_u8(vcnt_u8(packedstable)); +} + + #elif defined(__ARM_NEON) // Neon kindergarten +int get_edge_stability(const unsigned long long P, const unsigned long long O) +{ + const uint64x2_t kMul = { 0x1020408001020408, 0x1020408001020408 }; + uint64x2_t PP = vcombine_u64(vshl_n_u64(vcreate_u64(P), 7), vcreate_u64(P)); + uint64x2_t OO = vcombine_u64(vshl_n_u64(vcreate_u64(O), 7), vcreate_u64(O)); + uint32x4_t QP = vmulq_u32(vreinterpretq_u32_u64(kMul), vreinterpretq_u32_u8(vshrq_n_u8(vreinterpretq_u8_u64(PP), 7))); + uint32x4_t QO = vmulq_u32(vreinterpretq_u32_u64(kMul), vreinterpretq_u32_u8(vshrq_n_u8(vreinterpretq_u8_u64(OO), 7))); + uint32x2_t DP = vpadd_u32(vget_low_u32(QP), vget_high_u32(QP)); // P_h1h8 * * * P_a1a8 * * * + uint32x2_t DO = vpadd_u32(vget_low_u32(QO), vget_high_u32(QO)); // O_h1h8 * * * O_a1a8 * * * + uint8x8_t DB = vtrn_u8(vreinterpret_u8_u32(DO), vreinterpret_u8_u32(DP)).val[1]; // P_h1h8 O_h1h8 * * P_a1a8 O_a1a8 * * + uint8x16_t PO = vzipq_u8(vreinterpretq_u8_u64(OO), vreinterpretq_u8_u64(PP)).val[1]; + uint8x8_t packedstable = vcreate_u8((edge_stability[vgetq_lane_u16(vreinterpretq_u16_u8(PO), 0)] + | 
edge_stability[vgetq_lane_u16(vreinterpretq_u16_u8(PO), 7)] << 8) & 0x7e7e); + packedstable = vset_lane_u8(edge_stability[vget_lane_u16(vreinterpret_u16_u8(DB), 1)], packedstable, 2); + packedstable = vset_lane_u8(edge_stability[vget_lane_u16(vreinterpret_u16_u8(DB), 3)], packedstable, 3); + return vget_lane_u32(vpaddl_u16(vpaddl_u8(vcnt_u8(packedstable))), 0); +} + + #elif defined(hasSSE2) +int get_edge_stability(const unsigned long long P, const unsigned long long O) +{ + __m128i P0 = _mm_cvtsi64_si128(P); + __m128i O0 = _mm_cvtsi64_si128(O); + __m128i PO = _mm_unpacklo_epi8(O0, P0); + unsigned int packedstable = edge_stability[_mm_extract_epi16(PO, 0)] | edge_stability[_mm_extract_epi16(PO, 7)] << 8; + PO = _mm_unpacklo_epi64(O0, P0); + packedstable |= edge_stability[_mm_movemask_epi8(_mm_slli_epi64(PO, 7))] << 16 | edge_stability[_mm_movemask_epi8(PO)] << 24; + return bit_count_32(packedstable & 0xffff7e7e); +} + #endif + +/** + * @brief AVX2/SSE/neon optimized get_full_lines. + * + * SSE pcmpeqb for horizontal get_full_lines. + * CPU rotate for vertical get_full_lines. + * Diag-7 is converted to diag-9 using vertical mirroring. + * + * @param disc all discs on the board. + * @param full all 1 if full line, otherwise all 0. 
+ */ + #ifdef __AVX2__ + +static __m256i vectorcall get_full_lines(const unsigned long long disc) +{ + __m128i l81, l79, l8; + __m256i v4_disc, lr79; + const __m128i kff = _mm_set1_epi8(-1); + #if 0 // PCMPEQQ + static const V4DI m791 = {{ 0x0402010000804020, 0x2040800000010204, 0x0804020180402010, 0x1020408001020408 }}; // V8SI + static const V4DI m792 = {{ 0x0000008040201008, 0x0000000102040810, 0x1008040201000000, 0x0810204080000000 }}; + static const V4DI m793 = {{ 0x0000804020100804, 0x0000010204081020, 0x2010080402010000, 0x0408102040800000 }}; + static const V4DI m794 = {{ 0x0080402010080402, 0x0001020408102040, 0x4020100804020100, 0x0204081020408000 }}; + static const V2DI m795 = {{ 0x8040201008040201, 0x0102040810204080 }}; + + l81 = _mm_cvtsi64_si128(disc); v4_disc = _mm256_broadcastq_epi64(l81); + l81 = _mm_cmpeq_epi8(kff, l81); lr79 = _mm256_and_si256(_mm256_cmpeq_epi32(_mm256_and_si256(v4_disc, m791.v4), m791.v4), m791.v4); + lr79 = _mm256_or_si256(lr79, _mm256_and_si256(_mm256_cmpeq_epi64(_mm256_and_si256(v4_disc, m792.v4), m792.v4), m792.v4)); + l8 = _mm256_castsi256_si128(v4_disc); lr79 = _mm256_or_si256(lr79, _mm256_and_si256(_mm256_cmpeq_epi64(_mm256_and_si256(v4_disc, m793.v4), m793.v4), m793.v4)); + l8 = _mm_and_si128(l8, _mm_alignr_epi8(l8, l8, 1)); lr79 = _mm256_or_si256(lr79, _mm256_and_si256(_mm256_cmpeq_epi64(_mm256_and_si256(v4_disc, m794.v4), m794.v4), m794.v4)); + l8 = _mm_and_si128(l8, _mm_alignr_epi8(l8, l8, 2)); l79 = _mm_and_si128(_mm_cmpeq_epi64(_mm_and_si128(_mm256_castsi256_si128(v4_disc), m795.v2), m795.v2), m795.v2); + l8 = _mm_and_si128(l8, _mm_alignr_epi8(l8, l8, 4)); l79 = _mm_or_si128(l79, _mm_or_si128(_mm256_extracti128_si256(lr79, 1), _mm256_castsi256_si128(lr79))); + + #elif 0 // PCMPEQD + __m256i lm79; + static const V4DI m790 = {{ 0x80c0e0f0783c1e0f, 0x0103070f1e3c78f0, 0x70381c0e07030100, 0x0e1c3870e0c08000 }}; + static const V4DI m791 = {{ 0x0402010000804020, 0x2040800000010204, 0x0804020180402010, 0x1020408001020408 
}}; // V8SI + static const V4DI m792 = {{ 0x2010884440201088, 0x0408112202040811, 0x2211080411080402, 0x4488102088102040 }}; // V8SI + static const V4DI m793 = {{ 0x8844221110884422, 0x1122448808112244, 0x0000000044221108, 0x0000000022448810 }}; // V8SI + + l81 = _mm_cvtsi64_si128(disc); v4_disc = _mm256_broadcastq_epi64(l81); + l81 = _mm_cmpeq_epi8(kff, l81); lm79 = _mm256_and_si256(v4_disc, m790.v4); + lm79 = _mm256_or_si256(lm79, _mm256_shuffle_epi32(lm79, 0xb1)); + l8 = _mm256_castsi256_si128(v4_disc); lr79 = _mm256_and_si256(_mm256_cmpeq_epi32(_mm256_and_si256(lm79, m792.v4), m792.v4), m792.v4); + l8 = _mm_and_si128(l8, _mm_alignr_epi8(l8, l8, 1)); lr79 = _mm256_or_si256(lr79, _mm256_and_si256(_mm256_cmpeq_epi32(_mm256_and_si256(lm79, m793.v4), m793.v4), m793.v4)); + l8 = _mm_and_si128(l8, _mm_alignr_epi8(l8, l8, 2)); lr79 = _mm256_and_si256(_mm256_or_si256(lr79, _mm256_shuffle_epi32(lr79, 0xb1)), m790.v4); + l8 = _mm_and_si128(l8, _mm_alignr_epi8(l8, l8, 4)); lr79 = _mm256_or_si256(lr79, _mm256_and_si256(_mm256_cmpeq_epi32(_mm256_and_si256(v4_disc, m791.v4), m791.v4), m791.v4)); + l79 = _mm_or_si128(_mm256_extracti128_si256(lr79, 1), _mm256_castsi256_si128(lr79)); + + #else // Kogge-Stone + const __m128i mcpyswap = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 7, 6, 5, 4, 3, 2, 1, 0); + const __m128i mbswapll = _mm_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7); + static const V4DI shiftlr[] = {{{ 9, 7, 7, 9 }}, {{ 18, 14, 14, 18 }}, {{ 36, 28, 28, 36 }}}; + static const V4DI e790 = {{ 0xff80808080808080, 0xff01010101010101, 0xff01010101010101, 0xff80808080808080 }}; + static const V4DI e791 = {{ 0xffffc0c0c0c0c0c0, 0xffff030303030303, 0xffff030303030303, 0xffffc0c0c0c0c0c0 }}; + static const V4DI e792 = {{ 0xfffffffff0f0f0f0, 0xffffffff0f0f0f0f, 0xffffffff0f0f0f0f, 0xfffffffff0f0f0f0 }}; + + l81 = _mm_cvtsi64_si128(disc); v4_disc = _mm256_castsi128_si256(_mm_shuffle_epi8(l81, mcpyswap)); + l81 = _mm_cmpeq_epi8(kff, l81); v4_disc = 
_mm256_permute4x64_epi64(v4_disc, 0x50); // disc, disc, rdisc, rdisc + lr79 = _mm256_and_si256(v4_disc, _mm256_or_si256(e790.v4, _mm256_srlv_epi64(v4_disc, shiftlr[0].v4))); + l8 = _mm256_castsi256_si128(v4_disc); lr79 = _mm256_and_si256(lr79, _mm256_or_si256(e791.v4, _mm256_srlv_epi64(lr79, shiftlr[1].v4))); + l8 = _mm_and_si128(l8, _mm_alignr_epi8(l8, l8, 1)); lr79 = _mm256_and_si256(lr79, _mm256_or_si256(e792.v4, _mm256_srlv_epi64(lr79, shiftlr[2].v4))); + l8 = _mm_and_si128(l8, _mm_alignr_epi8(l8, l8, 2)); l79 = _mm_shuffle_epi8(_mm256_extracti128_si256(lr79, 1), mbswapll); + l8 = _mm_and_si128(l8, _mm_alignr_epi8(l8, l8, 4)); l79 = _mm_and_si128(l79, _mm256_castsi256_si128(lr79)); + #endif + l81 = _mm_unpacklo_epi64(l81, l8); + return _mm256_insertf128_si256(_mm256_castsi128_si256(l81), l79, 1); +} + + #elif defined(__ARM_NEON) + +void get_full_lines(const unsigned long long disc, unsigned long long full[4]) +{ + unsigned long long l8; + uint8x8_t l01; + uint64x2_t l79, r79; + const uint64x2_t e790 = vdupq_n_u64(0x007f7f7f7f7f7f7f); + const uint64x2_t e791 = vdupq_n_u64(0xfefefefefefefe00); + const uint64x2_t e792 = vdupq_n_u64(0x00003f3f3f3f3f3f); + const uint64x2_t e793 = vdupq_n_u64(0x0f0f0f0ff0f0f0f0); + + l01 = vcreate_u8(disc); l79 = r79 = vreinterpretq_u64_u8(vcombine_u8(l01, vrev64_u8(l01))); + l01 = vceq_u8(l01, vdup_n_u8(0xff)); l79 = vandq_u64(l79, vornq_u64(vshrq_n_u64(l79, 9), e790)); + full[0] = vget_lane_u64(vreinterpret_u64_u8(l01), 0); + r79 = vandq_u64(r79, vornq_u64(vshlq_n_u64(r79, 9), e791)); + l8 = disc; l79 = vbicq_u64(l79, vbicq_u64(e792, vshrq_n_u64(l79, 18))); // De Morgan + l8 &= (l8 >> 8) | (l8 << 56); r79 = vbicq_u64(r79, vshlq_n_u64(vbicq_u64(e792, r79), 18)); + l8 &= (l8 >> 16) | (l8 << 48); l79 = vandq_u64(vandq_u64(l79, r79), vorrq_u64(e793, vsliq_n_u64(vshrq_n_u64(l79, 36), r79, 36))); + l8 &= (l8 >> 32) | (l8 << 32); full[2] = vgetq_lane_u64(l79, 0); + full[1] = l8; full[3] = vertical_mirror(vgetq_lane_u64(l79, 1)); +} + + 
#else // 1 CPU, 3 SSE + +void get_full_lines(const unsigned long long disc, unsigned long long full[4]) +{ + unsigned long long rdisc = vertical_mirror(disc); + unsigned long long l8; + __m128i l01, l79, r79; // full lines + const __m128i kff = _mm_set1_epi8(-1); + const __m128i e790 = _mm_set1_epi64x(0xff80808080808080); + const __m128i e791 = _mm_set1_epi64x(0x01010101010101ff); + const __m128i e792 = _mm_set1_epi64x(0x00003f3f3f3f3f3f); + const __m128i e793 = _mm_set1_epi64x(0x0f0f0f0ff0f0f0f0); + + l01 = l79 = _mm_cvtsi64_si128(disc); l79 = r79 = _mm_unpacklo_epi64(l79, _mm_cvtsi64_si128(rdisc)); + l01 = _mm_cmpeq_epi8(kff, l01); l79 = _mm_and_si128(l79, _mm_or_si128(e790, _mm_srli_epi64(l79, 9))); + _mm_storel_epi64((__m128i*) &full[0], l01); + r79 = _mm_and_si128(r79, _mm_or_si128(e791, _mm_slli_epi64(r79, 9))); + l8 = disc; l79 = _mm_andnot_si128(_mm_andnot_si128(_mm_srli_epi64(l79, 18), e792), l79); // De Morgan + l8 &= (l8 >> 8) | (l8 << 56); r79 = _mm_andnot_si128(_mm_slli_epi64(_mm_andnot_si128(r79, e792), 18), r79); + l8 &= (l8 >> 16) | (l8 << 48); l79 = _mm_and_si128(_mm_and_si128(l79, r79), _mm_or_si128(e793, _mm_or_si128(_mm_srli_epi64(l79, 36), _mm_slli_epi64(r79, 36)))); + l8 &= (l8 >> 32) | (l8 << 32); _mm_storel_epi64((__m128i *) &full[2], l79); + full[1] = l8; full[3] = vertical_mirror(_mm_cvtsi128_si64(_mm_unpackhi_epi64(l79, l79))); +} + + #endif +#endif // hasSSE2/__ARM_NEON + +#ifdef __AVX2__ +/** + * @brief AVX2 optimized get_stability + * + * @param P bitboard with player's discs. + * @param O bitboard with opponent's discs. + * @return the number of stable discs. + */ + +// compute the other stable discs (ie discs touching another stable disc in each flipping direction). 
+static int vectorcall get_spreaded_stability(unsigned long long stable, unsigned long long P_central, __m256i v4_full) +{ + __m128i v2_stable, v2_old_stable, v2_P_central; + __m256i v4_stable; + const __m256i shift1897 = _mm256_set_epi64x(7, 9, 8, 1); + + if (stable == 0) + return 0; + + v2_stable = _mm_cvtsi64_si128(stable); + v2_P_central = _mm_cvtsi64_si128(P_central); + do { + v2_old_stable = v2_stable; + v4_stable = _mm256_broadcastq_epi64(v2_stable); + v4_stable = _mm256_or_si256(_mm256_or_si256(_mm256_srlv_epi64(v4_stable, shift1897), _mm256_sllv_epi64(v4_stable, shift1897)), v4_full); + v2_stable = _mm_and_si128(_mm256_castsi256_si128(v4_stable), _mm256_extracti128_si256(v4_stable, 1)); + v2_stable = _mm_and_si128(v2_stable, _mm_unpackhi_epi64(v2_stable, v2_stable)); + v2_stable = _mm_or_si128(v2_old_stable, _mm_and_si128(v2_stable, v2_P_central)); + } while (!_mm_testc_si128(v2_old_stable, v2_stable)); + + return bit_count(_mm_cvtsi128_si64(v2_stable)); +} +#elif defined(hasSSE2) && !defined(HAS_CPU_64) +// 32bit SSE optimized get_spreaded_stability +int get_spreaded_stability(unsigned long long stable, unsigned long long P_central, unsigned long long full[4]) +{ + __m128i v_stable, stable_vh, stable_d79, old_stable; + + if (stable == 0) // (2%) + return 0; + + v_stable = _mm_cvtsi64_si128(stable); + do { + old_stable = v_stable; + stable_vh = _mm_loadu_si128((__m128i *) &full[0]); + stable_vh = _mm_or_si128(stable_vh, _mm_unpacklo_epi64(_mm_srli_epi64(v_stable, 1), _mm_srli_epi64(v_stable, 8))); + stable_vh = _mm_or_si128(stable_vh, _mm_unpacklo_epi64(_mm_slli_epi64(v_stable, 1), _mm_slli_epi64(v_stable, 8))); + stable_d79 = _mm_loadu_si128((__m128i *) &full[2]); + stable_d79 = _mm_or_si128(stable_d79, _mm_unpacklo_epi64(_mm_srli_epi64(v_stable, 9), _mm_srli_epi64(v_stable, 7))); + stable_d79 = _mm_or_si128(stable_d79, _mm_unpacklo_epi64(_mm_slli_epi64(v_stable, 9), _mm_slli_epi64(v_stable, 7))); + v_stable = _mm_and_si128(stable_vh, stable_d79); + 
v_stable = _mm_and_si128(v_stable, _mm_unpackhi_epi64(v_stable, v_stable)); + v_stable = _mm_or_si128(old_stable, _mm_and_si128(v_stable, _mm_loadl_epi64((__m128i *) &P_central))); + } while (_mm_movemask_epi8(_mm_cmpeq_epi8(v_stable, old_stable)) != 0xffff); // (44%) + + return bit_count_si64(v_stable); +} +#endif + +#ifdef __AVX2__ +// returns stability count only +int get_stability(const unsigned long long P, const unsigned long long O) +{ + unsigned long long stable = get_stable_edge(P, O); // compute the exact stable edges + unsigned long long P_central = P & 0x007e7e7e7e7e7e00; + + __m256i v4_full = get_full_lines(P | O); // add full lines + __m128i v2_full = _mm_and_si128(_mm256_castsi256_si128(v4_full), _mm256_extracti128_si256(v4_full, 1)); + stable |= (P_central & _mm_cvtsi128_si64(_mm_and_si128(v2_full, _mm_unpackhi_epi64(v2_full, v2_full)))); + + return get_spreaded_stability(stable, P_central, v4_full); // compute the other stable discs +} + +// returns all full in full[4] in addition to stability count +int get_stability_fulls(const unsigned long long P, const unsigned long long O, unsigned long long full[5]) +{ + unsigned long long stable = get_stable_edge(P, O); // compute the exact stable edges + unsigned long long P_central = P & 0x007e7e7e7e7e7e00; + + __m256i v4_full = get_full_lines(P | O); // add full lines + __m128i v2_full = _mm_and_si128(_mm256_castsi256_si128(v4_full), _mm256_extracti128_si256(v4_full, 1)); + // _mm256_storeu_si256((__m256i *) full, v4_full); + full[4] = _mm_cvtsi128_si64(_mm_and_si128(v2_full, _mm_unpackhi_epi64(v2_full, v2_full))); + stable |= (P_central & full[4]); + + return get_spreaded_stability(stable, P_central, v4_full); // compute the other stable discs +} + +// returns all full lines only +unsigned long long get_all_full_lines(const unsigned long long disc) +{ + __m256i v4_full = get_full_lines(disc); + __m128i v2_full = _mm_and_si128(_mm256_castsi256_si128(v4_full), _mm256_extracti128_si256(v4_full, 1)); + 
return _mm_cvtsi128_si64(_mm_and_si128(v2_full, _mm_unpackhi_epi64(v2_full, v2_full))); +} + +/** + * @brief AVX2 optimized get_moves + get_potential_moves. + * + * Get the bitboard of empty squares in contact of a player square, as well as real mobility. + * + * @param PP broadcasted bitboard with player's discs. + * @param OO broadcasted bitboard with opponent's discs. + * @return potential moves in a higner 64-bit, real moves in a lower 64-bit. + */ +__m128i vectorcall get_moves_and_potential(__m256i PP, __m256i OO) +{ + __m256i MM, potmob, flip_l, flip_r, pre_l, pre_r, shift2; + const __m256i shift1897 = _mm256_set_epi64x(7, 9, 8, 1); + __m256i mOO = _mm256_and_si256(OO, _mm256_set_epi64x(0x007E7E7E7E7E7E00, 0x007E7E7E7E7E7E00, 0x00FFFFFFFFFFFF00, 0x7E7E7E7E7E7E7E7E)); + __m128i occupied = _mm_or_si128(_mm256_castsi256_si128(PP), _mm256_castsi256_si128(OO)); + + flip_l = _mm256_and_si256(mOO, _mm256_sllv_epi64(PP, shift1897)); + flip_r = _mm256_and_si256(mOO, _mm256_srlv_epi64(PP, shift1897)); + flip_l = _mm256_or_si256(flip_l, _mm256_and_si256(mOO, _mm256_sllv_epi64(flip_l, shift1897))); + flip_r = _mm256_or_si256(flip_r, _mm256_and_si256(mOO, _mm256_srlv_epi64(flip_r, shift1897))); + pre_l = _mm256_sllv_epi64(mOO, shift1897); pre_r = _mm256_srlv_epi64(mOO, shift1897); + potmob = _mm256_or_si256(pre_l, pre_r); + pre_l = _mm256_and_si256(mOO, pre_l); pre_r = _mm256_and_si256(mOO, pre_r); + shift2 = _mm256_add_epi64(shift1897, shift1897); + flip_l = _mm256_or_si256(flip_l, _mm256_and_si256(pre_l, _mm256_sllv_epi64(flip_l, shift2))); + flip_r = _mm256_or_si256(flip_r, _mm256_and_si256(pre_r, _mm256_srlv_epi64(flip_r, shift2))); + flip_l = _mm256_or_si256(flip_l, _mm256_and_si256(pre_l, _mm256_sllv_epi64(flip_l, shift2))); + flip_r = _mm256_or_si256(flip_r, _mm256_and_si256(pre_r, _mm256_srlv_epi64(flip_r, shift2))); + MM = _mm256_or_si256(_mm256_sllv_epi64(flip_l, shift1897), _mm256_srlv_epi64(flip_r, shift1897)); + + MM = 
_mm256_or_si256(_mm256_unpacklo_epi64(MM, potmob), _mm256_unpackhi_epi64(MM, potmob)); + return _mm_andnot_si128(occupied, _mm_or_si128(_mm256_castsi256_si128(MM), _mm256_extracti128_si256(MM, 1))); // mask with empties +} + +#endif diff --git a/src/book.c b/src/book.c index 0e36ee1d..b0784d2b 100644 --- a/src/book.c +++ b/src/book.c @@ -10,9 +10,9 @@ * with two bounds from retropropagated error. * Several algorithms are present to add positions in the book, in the most usefull way. * - * @date 1998 - 2017 + * @date 1998 - 2020 * @author Richard Delorme - * @version 4.4 + * @version 4.5 */ #include "book.h" @@ -49,7 +49,7 @@ static void bprint(const char *format, ...) vprintf(format, args); va_end(args); fflush (stdout); - + } } @@ -107,7 +107,7 @@ static inline bool link_is_bad(const Link *link) * @brief A position stored in the book. */ typedef struct Position { - Board board[1]; /**< (unique) board */ + Board board; /**< (unique) board */ Link leaf; /**< best remaining move */ Link* link; /**< linking moves */ unsigned int n_wins; /**< game win count */ @@ -158,29 +158,29 @@ static int get_book_depth(const int depth) */ static bool position_is_ok(const Position *position) { - Board board[1]; - Move move[1]; + Board board; + Move move; const Link *l; int i, j; char s[4]; // board is legal ? 
- if (position->board->player & position->board->opponent) { + if (position->board.player & position->board.opponent) { warn("Board is illegal: Two discs on the same square?\n"); - board_print(position->board, BLACK, stderr); + board_print(&position->board, BLACK, stderr); return false; } - if (((position->board->player|position->board->opponent) & 0x0000001818000000ULL) != 0x0000001818000000ULL) { + if (((position->board.player | position->board.opponent) & 0x0000001818000000ULL) != 0x0000001818000000ULL) { warn("Board is illegal: Empty center?\n"); - board_print(position->board, BLACK, stderr); + board_print(&position->board, BLACK, stderr); return false; } // is board unique - board_unique(position->board, board); - if (!board_equal(position->board, board)) { + board_unique(&position->board, &board); + if (!board_equal(&position->board, &board)) { warn("board is not unique\n"); - position_print(position, position->board, stdout); + position_print(position, &position->board, stdout); return false; } @@ -188,18 +188,18 @@ static bool position_is_ok(const Position *position) foreach_link(l, position) { if (l->move == PASS) { if (position->n_link > 1 - || can_move(board->player, board->opponent) - || !can_move(board->opponent, board->player)) { + || can_move(board.player, board.opponent) + || !can_move(board.opponent, board.player)) { warn("passing move is wrong\n"); - position_print(position, position->board, stdout); + position_print(position, &position->board, stdout); return false; } } else { if (/*l->move < A1 ||*/ l->move > H8 - || board_is_occupied(board, l->move) - || board_get_move(board, l->move, move) == 0) { + || board_is_occupied(&board, l->move) + || board_get_move_flip(&board, l->move, &move) == 0) { warn("link %s is wrong\n", move_to_string(l->move, WHITE, s)); - position_print(position, position->board, stdout); + position_print(position, &position->board, stdout); return false; } } @@ -208,23 +208,23 @@ static bool position_is_ok(const Position 
*position) l = &position->leaf; if (l->move == PASS) { if (position->n_link > 0 - || can_move(board->player, board->opponent) - || !can_move(board->opponent, board->player)) { + || can_move(board.player, board.opponent) + || !can_move(board.opponent, board.player)) { warn("passing move is wrong\n"); - position_print(position, position->board, stdout); + position_print(position, &position->board, stdout); return false; } } else if (l->move == NOMOVE) { - if (get_mobility(position->board->player, position->board->opponent) != position->n_link && !(position->n_link == 1 && position->link->move == PASS)) { + if (get_mobility(position->board.player, position->board.opponent) != position->n_link && !(position->n_link == 1 && position->link->move == PASS)) { warn("nomove is wrong\n"); - position_print(position, position->board, stdout); + position_print(position, &position->board, stdout); return false; } } else if (/*l->move < A1 ||*/ l->move > H8 - || board_is_occupied(board, l->move) - || board_get_move(board, l->move, move) == 0) { + || board_is_occupied(&board, l->move) + || board_get_move_flip(&board, l->move, &move) == 0) { warn("leaf %s is wrong\n", move_to_string(l->move, WHITE, s)); - position_print(position, position->board, stdout); + position_print(position, &position->board, stdout); return false; } @@ -233,13 +233,13 @@ static bool position_is_ok(const Position *position) for (j = i + 1; j < position->n_link; ++j) { if (position->link[j].move == position->link[i].move) { warn("doublon found in links\n"); - position_print(position, position->board, stdout); + position_print(position, &position->board, stdout); return false; } } if (position->leaf.move == position->link[i].move) { warn("doublon found in links/leaf\n"); - position_print(position, position->board, stdout); + position_print(position, &position->board, stdout); return false; } } @@ -253,7 +253,7 @@ static bool position_is_ok(const Position *position) */ static void position_init(Position 
*position) { - position->board->player = position->board->opponent = 0; + position->board.player = position->board.opponent = 0; position->leaf = BAD_LINK; position->link = NULL; @@ -283,10 +283,10 @@ static void position_merge(Position *dest, const Position *src) { Link *l; - position_init(dest); - *dest->board = *src->board; + position_init(dest); //??? dest->n_link = 0, + dest->board = src->board; if (dest->level == src->level) { - foreach_link(l, dest) { + foreach_link(l, dest) { // so this does nothing if (l->move == src->leaf.move) return; } dest->leaf = src->leaf; @@ -319,8 +319,8 @@ static bool position_read(Position *position, FILE *f) int i; int r; - r = fread(&position->board->player, sizeof (unsigned long long), 1, f); - r += fread(&position->board->opponent, sizeof (unsigned long long), 1, f); + r = fread(&position->board.player, sizeof (unsigned long long), 1, f); + r += fread(&position->board.opponent, sizeof (unsigned long long), 1, f); r += fread(&position->n_wins, sizeof (unsigned int), 1, f); r += fread(&position->n_draws, sizeof (unsigned int), 1, f); @@ -362,12 +362,12 @@ static bool position_import(Position *position, FILE *f) { char *line, *s, *old; int value; - Move move[1]; + Move move; bool ok = false; if ((line = string_read_line(f)) != NULL) { position_init(position); - s = parse_board(line, position->board, &value); + s = parse_board(line, &position->board, &value); if (s != line) { s = parse_find(s, ','); if (*s == ',') { @@ -376,13 +376,13 @@ static bool position_import(Position *position, FILE *f) position->level = value; s = parse_find(s, ','); if (*s == ',') { - s = parse_move(old = s + 1, position->board, move); + s = parse_move(old = s + 1, &position->board, &move); if (s != old) { s = parse_find(s, ','); if (*s == ',') { s = parse_int(old = s + 1, &value); if (s != old) { - position->leaf.move = move->x; + position->leaf.move = move.x; position->leaf.score = value; } } @@ -417,8 +417,8 @@ static bool position_write(const 
Position *position, FILE* f) int i; int r; - r = fwrite(&position->board->player, sizeof (unsigned long long), 1, f); - r += fwrite(&position->board->opponent, sizeof (unsigned long long), 1, f); + r = fwrite(&position->board.player, sizeof (unsigned long long), 1, f); + r += fwrite(&position->board.opponent, sizeof (unsigned long long), 1, f); r += fwrite(&position->n_wins, sizeof (unsigned int), 1, f); r += fwrite(&position->n_draws, sizeof (unsigned int), 1, f); @@ -451,7 +451,7 @@ static bool position_export(const Position *p, FILE* f) { char b[80], m[4]; - board_to_string(p->board, BLACK, b); + board_to_string(&p->board, BLACK, b); move_to_string(p->leaf.move, BLACK, m); return (fprintf(f, "%s,%d,%s,%d\n", b, p->level, m, p->leaf.score) > 0); } @@ -463,11 +463,11 @@ static bool position_export(const Position *p, FILE* f) */ static void position_unique(Position *position) { - Board board[1]; + Board board; int i, s; - *board = *position->board; - if ((s = board_unique(board, position->board)) != 0) { + board = position->board; + if ((s = board_unique(&board, &position->board)) != 0) { for (i = 0; i < position->n_link; ++i) { position->link[i].move = symetry(position->link[i].move, s); } @@ -486,23 +486,23 @@ static int position_get_moves(const Position *position, const Board *board, Move { Move *previous = movelist->move; Move *move = movelist->move + 1; - Board sym[1]; + Board sym; int i, x, s; for (s = 0; s < 8; ++s) { - board_symetry(position->board, s, sym); + board_symetry(&position->board, s, &sym); - if (board_equal(sym, board)) { + if (board_equal(&sym, board)) { for (i = 0; i < position->n_link; ++i) { x = symetry(position->link[i].move, s); - board_get_move(board, x, move); + board_get_move_flip(board, x, move); move->score = position->link[i].score; previous = previous->next = move; ++move; } x = symetry(position->leaf.move, s); if (x != NOMOVE) { - board_get_move(board, x, move); + board_get_move_flip(board, x, move); move->score = 
position->leaf.score; previous = previous->next = move; ++move; @@ -527,7 +527,7 @@ static int position_get_moves(const Position *position, const Board *board, Move */ static void position_show(const Position *position, const Board *board, FILE *f) { - MoveList movelist[1]; + MoveList movelist; Move *move; const int n_empties = board_count_empties(board); const int color = n_empties & 1; @@ -539,7 +539,7 @@ static void position_show(const Position *position, const Board *board, FILE *f) fprintf(f, "\nLevel: %d\n", position->level); fprintf(f, "Best score: %+02d [%+02d, %+02d]\n", position->score.value, position->score.lower, position->score.upper); fprintf(f, "Moves:"); - sym = position_get_moves(position, board, movelist); + sym = position_get_moves(position, board, &movelist); foreach_move(move, movelist) { move_to_string(move->x, color, s); if (symetry(position->leaf.move, sym) == move->x) { @@ -559,7 +559,7 @@ static void position_show(const Position *position, const Board *board, FILE *f) */ static void position_print(const Position *position, const Board *board, FILE *f) { - MoveList movelist[1]; + MoveList movelist; Move *move; int color = board_count_empties(board) & 1, sym; char b[80], m[4]; @@ -569,7 +569,7 @@ static void position_print(const Position *position, const Board *board, FILE *f fprintf(f, "level:%d; ", position->level); fprintf(f, "best: %+02d [%+02d, %+02d];", position->score.value, position->score.lower, position->score.upper); fprintf(f, "moves:"); - sym = position_get_moves(position, board, movelist); + sym = position_get_moves(position, board, &movelist); foreach_move(move, movelist) { move_to_string(move->x, color, m); if (symetry(position->leaf.move, sym) == move->x) { @@ -592,11 +592,11 @@ static void position_print(const Position *position, const Board *board, FILE *f */ static void position_get_random_move(const Position *position, const Board *board, Move *move, Random *r, const int randomness) { - MoveList movelist[1]; + MoveList 
movelist; Move *m; int i, n; - position_get_moves(position, board, movelist); + position_get_moves(position, board, &movelist); n = 0; foreach_best_move(m, movelist) { @@ -691,7 +691,7 @@ static void position_search(Position *position, Book *book) { Search *search = book->search; Link *l; - const int n_moves = get_mobility(position->board->player, position->board->opponent); + const int n_moves = get_mobility(position->board.player, position->board.opponent); long long time; bool time_per_move; @@ -701,15 +701,15 @@ static void position_search(Position *position, Book *book) } if (position->n_link < n_moves || (position->n_link == 0 && n_moves == 0 && position->score.value == -SCORE_INF)) { - search_set_board(search, position->board, BLACK); - search_set_level(search, position->level, search->n_empties); + search_set_board(search, &position->board, BLACK); + search_set_level(search, position->level, search->eval.n_empties); foreach_link (l, position) { - movelist_exclude(search->movelist, l->move); + movelist_exclude(&search->movelist, l->move); } if (search->options.verbosity >= 2) { - board_print(search->board, search->player, stdout); + board_print(&search->board, search->player, stdout); puts(search->options.header); puts(search->options.separator); } @@ -744,29 +744,29 @@ static void position_search(Position *position, Book *book) static void position_link(Position *position, Book *book) { int x; - unsigned long long moves = get_moves(position->board->player, position->board->opponent); - Board next[1]; - Link link[1]; + unsigned long long moves = board_get_moves(&position->board); + Board next; + Link link; Position *child; if (moves) { foreach_bit(x, moves) { - board_next(position->board, x, next); - child = book_probe(book, next); + board_next(&position->board, x, &next); + child = book_probe(book, &next); if (child) { - link->score = -child->score.value; - link->move = x; - book->stats.n_links += position_add_link(position, link); + link.score = 
-child->score.value; + link.move = x; + book->stats.n_links += position_add_link(position, &link); } } - } else if (can_move(position->board->opponent, position->board->player)) {// pass ? - next->player = position->board->opponent; - next->opponent = position->board->player; - child = book_probe(book, next); + } else if (can_move(position->board.opponent, position->board.player)) {// pass ? + next.player = position->board.opponent; + next.opponent = position->board.player; + child = book_probe(book, &next); if (child) { - link->score = -child->score.value; - link->move = PASS; - book->stats.n_links += position_add_link(position, link); + link.score = -child->score.value; + link.move = PASS; + book->stats.n_links += position_add_link(position, &link); } } } @@ -783,21 +783,21 @@ static void position_link(Position *position, Book *book) */ static void position_expand(Position *position, Book *book) { - Position child[1]; + Position child; if (position->leaf.move != NOMOVE) { - position_init(child); + position_init(&child); - board_next(position->board, position->leaf.move, child->board); + board_next(&position->board, position->leaf.move, &child.board); - child->level = position->level; - position_link(child, book); + child.level = position->level; + position_link(&child, book); search_cleanup(book->search); - position_search(child, book); - position->leaf.score = -child->score.value; + position_search(&child, book); + position->leaf.score = -child.score.value; position_search(position, book); - position_unique(child); - book_add(book, child); + position_unique(&child); + book_add(book, &child); } } @@ -812,12 +812,12 @@ static void position_expand(Position *position, Book *book) static int position_negamax(Position *position, Book *book) { Link *l; - Board target[1]; + Board target; Position *child; if (!position->done) { GameStats stat = {0,0,0,0}; - const int n_empties = board_count_empties(position->board); + const int n_empties = 
board_count_empties(&position->board); const int search_depth = LEVEL[position->level][n_empties].depth; const int bias = (search_depth & 1) - (n_empties & 1); @@ -845,8 +845,8 @@ static int position_negamax(Position *position, Book *book) } foreach_link(l, position) { - board_next(position->board, l->move, target); - child = book_probe(book, target); + board_next(&position->board, l->move, &target); + child = book_probe(book, &target); position_negamax(child, book); if (l->score != -child->score.value) { l->score = -child->score.value; @@ -885,18 +885,18 @@ static int position_negamax(Position *position, Book *book) static void position_prune(Position *position, Book *book, const int player_deviation, const int opponent_deviation, const int lower, const int upper) { Link *l; - Board target[1]; + Board target; Position *child; // if position is not done yet & good enough & inside the book height limit - if (lower <= position->score.value && position->score.value <= upper && board_count_empties(position->board) >= book->options.n_empties - 1) { + if (lower <= position->score.value && position->score.value <= upper && board_count_empties(&position->board) >= book->options.n_empties - 1) { position->done = true; book->stats.n_todo++; // prune all children close to the best move foreach_link(l, position) { if (position->score.value - l->score <= player_deviation && lower <= l->score && l->score <= upper) { - board_next(position->board, l->move, target); - child = book_probe(book, target); + board_next(&position->board, l->move, &target); + child = book_probe(book, &target); position_prune(child, book, opponent_deviation, player_deviation, -upper, -lower); } } @@ -917,11 +917,11 @@ static void position_remove_links(Position *position, Book *book) { int i, j; Link *l = position->link; - Board target[1]; + Board target; for (i = 0; i < position->n_link; ++i) { - board_next(position->board, l[i].move, target); - if (!book_probe(book, target)) { + 
board_next(&position->board, l[i].move, &target); + if (!book_probe(book, &target)) { if (l[i].score > position->leaf.score) position->leaf = l[i]; for (j = i + 1; j < position->n_link; ++j) l[j - 1] = l[j]; --position->n_link; @@ -950,18 +950,18 @@ static void position_remove_links(Position *position, Book *book) static void position_deviate(Position *position, Book *book, const int player_deviation, const int opponent_deviation, const int lower, const int upper) { Link *l; - Board target[1]; + Board target; Position *child; // if position is not done yet & good enough & inside the book height limit - if (!position->done && lower <= position->score.value && position->score.value <= upper && board_count_empties(position->board) >= book->options.n_empties && !board_is_game_over(position->board)) { + if (!position->done && lower <= position->score.value && position->score.value <= upper && board_count_empties(&position->board) >= book->options.n_empties && !board_is_game_over(&position->board)) { position->done = true; // deviate all children close to the best move foreach_link(l, position) { if (position->score.value - l->score <= player_deviation && lower <= l->score && l->score <= upper) { - board_next(position->board, l->move, target); - child = book_probe(book, target); + board_next(&position->board, l->move, &target); + child = book_probe(book, &target); position_deviate(child, book, opponent_deviation, player_deviation, -upper, -lower); } } @@ -989,22 +989,22 @@ static void position_deviate(Position *position, Book *book, const int player_de static void position_enhance(Position *position, Book *book) { Link *l; - Board target[1]; + Board target; Position *child; - if (!position->done && board_count_empties(position->board) >= book->options.n_empties && !board_is_game_over(position->board)) { + if (!position->done && board_count_empties(&position->board) >= book->options.n_empties && !board_is_game_over(&position->board)) { position->done = true; 
foreach_link(l, position) { - board_next(position->board, l->move, target); - child = book_probe(book, target); + board_next(&position->board, l->move, &target); + child = book_probe(book, &target); if (-child->score.upper >= position->score.lower || -child->score.lower >= position->score.upper) { position_enhance(child, book); } } if (position->leaf.score > -SCORE_INF) { - const int n_empties = board_count_empties(position->board); + const int n_empties = board_count_empties(&position->board); const int search_depth = LEVEL[position->level][n_empties].depth; const int bias = (search_depth & 1) - (n_empties & 1); int lower, upper; @@ -1041,26 +1041,31 @@ static void board_feed_hash(Board *board, const Book *book, Search *search, cons { Position *position; const unsigned long long hash_code = board_get_hash_code(board); - MoveList movelist[1]; + MoveList movelist; Move *m; + HashStoreData hash_data; position = book_probe(book, board); if (position) { - const int n_empties = board_count_empties(position->board); - const int depth = LEVEL[position->level][n_empties].depth; - const int selectivity = LEVEL[position->level][n_empties].selectivity; + const int n_empties = board_count_empties(&position->board); const int score = position->score.value; int move = NOMOVE; - position_get_moves(position, board, movelist); + hash_data.data.wl.c.depth = LEVEL[position->level][n_empties].depth; + hash_data.data.wl.c.selectivity = LEVEL[position->level][n_empties].selectivity; + + position_get_moves(position, board, &movelist); foreach_move(m, movelist) { if (move == NOMOVE) move = m->x; board_update(board, m); board_feed_hash(board, book, search, is_pv && m->score == score); board_restore(board, m); } - hash_feed(search->hash_table, board, hash_code, depth, selectivity, score, score, move); - if (is_pv) hash_feed(search->pv_table, board, hash_code, depth, selectivity, score, score, move); + + hash_data.data.lower = hash_data.data.upper = score; + hash_data.data.move[0] = move; + 
hash_feed(&search->hash_table, board, hash_code, &hash_data); + if (is_pv) hash_feed(&search->pv_table, board, hash_code, &hash_data); } } @@ -1077,12 +1082,12 @@ static void board_feed_hash(Board *board, const Book *book, Search *search, cons static bool board_fill(Board *board, Book *book, int depth) { if (depth > 0) { - MoveList movelist[1]; + MoveList movelist; Move *m; bool filled = false; - movelist_get_moves(movelist, board); - if (movelist->n_moves == 0 && can_move(board->opponent, board->player)) { + movelist_get_moves(&movelist, board); + if (movelist.n_moves == 0 && can_move(board->opponent, board->player)) { board_pass(board); if (board_fill(board, book, depth - 1)) { book_add_board(book, board); @@ -1114,18 +1119,18 @@ static bool board_fill(Board *board, Book *book, int depth) */ static void position_fix(Position *position, Book *book) { - Board board[1]; + Board board; - if ((position->board->player & position->board->opponent) || - ((position->board->player | position->board->opponent) & 0x0000001818000000ULL) != 0x0000001818000000ULL) { + if ((position->board.player & position->board.opponent) || + ((position->board.player | position->board.opponent) & 0x0000001818000000ULL) != 0x0000001818000000ULL) { position_free(position); position_init(position); return; } - board_unique(position->board, board); + board_unique(&position->board, &board); position_free(position); position_init(position); - *position->board = *board; + position->board = board; position->level = book->options.level; position_link(position, book); position_search(position, book); @@ -1163,10 +1168,10 @@ static bool position_array_add(PositionArray *a, const Position *p) { int i; - board_check(p->board); + board_check(&p->board); assert(position_is_ok(p)); - for (i = 0; i < a->n; ++i) if (board_equal(a->positions[i].board, p->board)) return false; + for (i = 0; i < a->n; ++i) if (board_equal(&a->positions[i].board, &p->board)) return false; if (a->n == a->size) { Position *n; 
a->size += a->size / 2 + 1; @@ -1197,7 +1202,7 @@ static bool position_array_remove(PositionArray *a, const Position *p) int i, j; for (i = 0; i < a->n; ++i) { - if (board_equal(a->positions[i].board, p->board)) { + if (board_equal(&a->positions[i].board, &p->board)) { position_free(a->positions + i); for (j = i + 1; j < a->n; ++j) { a->positions[j - 1] = a->positions[j]; @@ -1231,7 +1236,7 @@ static void position_array_free(PositionArray *a) static Position* position_array_probe(PositionArray *a, const Board *board) { int i; - for (i = 0; i < a->n; ++i) if (board_equal(a->positions[i].board, board)) return a->positions + i; + for (i = 0; i < a->n; ++i) if (board_equal(&a->positions[i].board, board)) return a->positions + i; return NULL; } @@ -1292,9 +1297,9 @@ static double book_get_age(Book *book) */ static Position* book_probe(const Book *book, const Board *board) { - Board unique[1]; - board_unique(board, unique); - return position_array_probe(book->array + (board_get_hash_code(unique) & (book->n - 1)), unique); + Board unique; + board_unique(board, &unique); + return position_array_probe(book->array + (board_get_hash_code(&unique) & (book->n - 1)), &unique); } /** @@ -1305,7 +1310,7 @@ static Position* book_probe(const Book *book, const Board *board) */ static void book_add(Book *book, const Position *p) { - const unsigned long long i = board_get_hash_code(p->board) & (book->n - 1); + const unsigned long long i = board_get_hash_code(&p->board) & (book->n - 1); if (position_array_add(book->array + i, p)) { ++book->n_nodes; @@ -1321,7 +1326,7 @@ static void book_add(Book *book, const Position *p) */ static void book_remove(Book *book, const Position *p) { - const unsigned long long i = board_get_hash_code(p->board) & (book->n - 1); + const unsigned long long i = board_get_hash_code(&p->board) & (book->n - 1); if (position_array_remove(book->array + i, p)) { --book->n_nodes; @@ -1355,10 +1360,10 @@ static void book_clean(Book *book) */ static Position 
*book_root(Book *book) { - Board board[1]; + Board board; - board_init(board); - return book_probe(book, board); + board_init(&board); + return book_probe(book, &board); } /** @@ -1385,7 +1390,7 @@ void book_init(Book *book) for (i = 0; i < book->n; ++i) position_array_init(book->array + i); book->n_nodes = 0; - random_seed(book->random, real_clock()); + random_seed(&book->random, real_clock()); book->need_saving = false; } @@ -1414,15 +1419,15 @@ void book_free(Book *book) */ void book_new(Book *book, int level, int n_empties) { - Board board[1]; + Board board; bprint("New book %d %d...", level, n_empties); book_init(book); book->options.level = level; book->options.n_empties = n_empties; - board_init(board); - book_add_board(book, board); + board_init(&board); + book_add_board(book, &board); bprint("...done>\n"); book->need_saving = true; } @@ -1489,7 +1494,7 @@ void book_load(Book *book, const char *file) error("error while reading %s", file); } - random_seed(book->random, real_clock()); + random_seed(&book->random, real_clock()); book->need_saving = false; info("done\n"); @@ -1527,12 +1532,12 @@ void book_import(Book *book, const char *file) book->options.n_empties = 60; book->options.level = 0; foreach_position(p, a, book) { - n_empties = board_count_empties(p->board); + n_empties = board_count_empties(&p->board); if (p->level > book->options.level) book->options.level = p->level; if (n_empties < book->options.n_empties) book->options.n_empties = n_empties; } - random_seed(book->random, real_clock()); + random_seed(&book->random, real_clock()); book->need_saving = true; bprint("...done\n"); @@ -1631,12 +1636,12 @@ void book_merge(Book *dest, const Book *src) { PositionArray *a; const Position *p_src; - Position p_dest[1]; + Position p_dest; foreach_position(p_src, a, src) { - if (!book_probe(dest, p_src->board)) { - position_merge(p_dest, p_src); - book_add(dest, p_dest); + if (!book_probe(dest, &p_src->board)) { + position_merge(&p_dest, p_src); + 
book_add(dest, &p_dest); } } } @@ -1720,7 +1725,7 @@ void book_deepen(Book *book) bprint("Deepening book...\r"); foreach_position(p, a, book) { - int n_empties = board_count_empties(p->board); + int n_empties = board_count_empties(&p->board); if (LEVEL[p->level][n_empties].depth != LEVEL[book->options.level][n_empties].depth || LEVEL[p->level][n_empties].selectivity != LEVEL[book->options.level][n_empties].selectivity) { // No! compare depth & selectivity; p->leaf = BAD_LINK; @@ -1759,7 +1764,7 @@ void book_correct_solved(Book *book) bprint("Correcting solved positions...\r"); foreach_position(p, a, book) { - int n_empties = board_count_empties(p->board); + int n_empties = board_count_empties(&p->board); if (LEVEL[p->level][n_empties].depth == n_empties && LEVEL[p->level][n_empties].selectivity == NO_SELECTIVITY) { // No! compare depth & selectivity; old_leaf = p->leaf; p->leaf = BAD_LINK; @@ -1767,7 +1772,7 @@ void book_correct_solved(Book *book) if (p->leaf.score != old_leaf.score) { ++n_error; bprint("\nError found:\n"); - position_print(p, p->board, stdout); + position_print(p, &p->board, stdout); move_to_string(old_leaf.move, n_empties & 1, s); bprint("instead of <%s:%d>\n\n", s, old_leaf.score); } @@ -1855,7 +1860,7 @@ void book_play(Book *book) n_diffs = 0; book->stats.n_nodes = book->stats.n_links = book->stats.n_todo = 0; foreach_position(p, a, book) { - if (p->n_link == 0 && board_count_empties(p->board) >= book->options.n_empties && !board_is_game_over(p->board)) { + if (p->n_link == 0 && board_count_empties(&p->board) >= book->options.n_empties && !board_is_game_over(&p->board)) { p->todo = true; ++book->stats.n_todo; } else { p->todo = false; @@ -1896,9 +1901,9 @@ void book_fill(Book *book, const int depth) for (a = book->array; a < book->array + book->n; ++a) for (k = 0; k < a->n; ++k) { // do not use foreach_positions here! a->positions may change! 
p = a->positions + k; - n_empties = board_count_empties(p->board); + n_empties = board_count_empties(&p->board); if (n_empties >= book->options.n_empties) { - board_fill(p->board, book, depth); + board_fill(&p->board, book, depth); if (n_diffs < book->stats.n_nodes + book->stats.n_links) { n_diffs = book->stats.n_nodes + book->stats.n_links; bprint("Book fill...%d %d done\r", book->stats.n_nodes, book->stats.n_links); @@ -1982,7 +1987,7 @@ void book_prune(Book *book) book_clean(book); position_prune(root, book, 2*SCORE_INF, 0, -SCORE_INF, SCORE_INF); - position_print(root, root->board, stdout); + position_print(root, &root->board, stdout); bprint("Book prune %d... done\n", book->stats.n_todo); position_prune(root, book, 0, 2*SCORE_INF, -SCORE_INF, SCORE_INF); @@ -2014,7 +2019,7 @@ void book_subtree(Book *book, const Board *board) book_clean(book); position_prune(root, book, 2*SCORE_INF, 2*SCORE_INF, -SCORE_INF, SCORE_INF); - position_print(root, root->board, stdout); + position_print(root, &root->board, stdout); bprint("Book subtree %d... 
done\n", book->stats.n_todo); for (a = book->array; a < book->array + book->n; ++a) for (i = 0; i < a->n; ++i) if (!a->positions[i].done) {book_remove(book, a->positions + i); --i;} @@ -2083,7 +2088,7 @@ void book_info(Book *book) if (p->leaf.move != NOMOVE) ++n_leaves; ++n_level[p->level]; if (p->level != book->options.level) { - position_print(p, p->board, stdout); + position_print(p, &p->board, stdout); } } @@ -2160,16 +2165,16 @@ bool book_get_moves(Book *book, const Board *board, MoveList *movelist) void book_get_line(Book *book, const Board *board, const Move *move, Line *line) { Position *position; - Board b[1]; - Move m[1]; + Board b; + Move m; line_push(line, move->x); - board_next(board, move->x, b); + board_next(board, move->x, &b); - while ((position = book_probe(book, b)) != NULL && !board_is_game_over(position->board)) { - position_get_random_move(position, b, m, book->random, 0); - line_push(line, m->x); - board_update(b, m); + while ((position = book_probe(book, &b)) != NULL && !board_is_game_over(&position->board)) { + position_get_random_move(position, &b, &m, &book->random, 0); + line_push(line, m.x); + board_update(&b, &m); } } @@ -2182,16 +2187,20 @@ void book_get_line(Book *book, const Board *board, const Move *move, Line *line) * @param move Chosen move. * @param randomness Randomness. */ +#if 0 +#include "srbook.c" +#else bool book_get_random_move(Book *book, const Board *board, Move *move, const int randomness) { Position *position = book_probe(book, board); if (position) { - position_get_random_move(position, board, move, book->random, randomness); + position_get_random_move(position, board, move, &book->random, randomness); return true; } return false; } +#endif /** * @brief Get game statistics from a position. 
@@ -2213,13 +2222,13 @@ void book_get_game_stats(Book *book, const Board *board, GameStats *stat) position = book_probe(book, board); if (position) { if (position->n_wins == UINT_MAX || position->n_losses == UINT_MAX || position->n_draws == UINT_MAX || position->n_lines == UINT_MAX) { - Board target[1]; + Board target; Link *l; GameStats child; foreach_link(l, position) { - board_next(position->board, l->move, target); - book_get_game_stats(book, target, &child); + board_next(&position->board, l->move, &target); + book_get_game_stats(book, &target, &child); stat->n_wins += child.n_losses; stat->n_draws += child.n_draws; stat->n_losses += child.n_wins; @@ -2243,7 +2252,7 @@ void book_get_game_stats(Book *book, const Board *board, GameStats *stat) */ void book_add_board(Book *book, const Board *board) { - Position position[1]; + Position position; Position *probe; if (board_count_empties(board) >= book->options.n_empties - 1) { @@ -2253,14 +2262,14 @@ void book_add_board(Book *book, const Board *board) if (probe->leaf.move == NOMOVE) position_search(probe, book); if (BOOK_DEBUG) {printf("update: "); position_print(probe, board, stdout);} } else { - position_init(position); - *position->board = *board; - position->level = book->options.level; - position_link(position, book); - position_search(position, book); - if (BOOK_DEBUG) {printf("new: "); position_print(position, board, stdout);} - position_unique(position); - book_add(book, position); + position_init(&position); + position.board = *board; + position.level = book->options.level; + position_link(&position, book); + position_search(&position, book); + if (BOOK_DEBUG) {printf("new: "); position_print(&position, board, stdout);} + position_unique(&position); + book_add(book, &position); } } } @@ -2273,7 +2282,7 @@ void book_add_board(Book *book, const Board *board) */ void book_add_game(Book *book, const Game *game) { - Board board[1]; + Board board; Move stack[99]; int i, n_moves; char file[FILENAME_MAX + 1]; @@ 
-2281,15 +2290,15 @@ void book_add_game(Book *book, const Game *game) file_add_ext(options.book_file, ".gam", file); - board_init(board); - if (!board_equal(board, game->initial_board)) return; // skip non standard game + board_init(&board); + if (!board_equal(&board, &game->initial_board)) return; // skip non standard game for (i = n_moves = 0; i < 60 - book->options.n_empties && game->move[i] != NOMOVE; ++i) { - if (!can_move(board->player, board->opponent)) { + if (!can_move(board.player, board.opponent)) { stack[n_moves++] = MOVE_PASS; - board_pass(board); + board_pass(&board); } - if (!board_is_occupied(board, game->move[i]) && board_get_move(board, game->move[i], &stack[n_moves])) { - board_update(board, stack + n_moves); + if (!board_is_occupied(&board, game->move[i]) && board_get_move_flip(&board, game->move[i], &stack[n_moves])) { + board_update(&board, stack + n_moves); ++n_moves; } else { warn("illegal move in game"); @@ -2299,8 +2308,8 @@ void book_add_game(Book *book, const Game *game) search_cleanup(book->search); while (--n_moves >= 0) { - book_add_board(book, board); - board_restore(board, stack + n_moves); + book_add_board(book, &board); + board_restore(&board, stack + n_moves); } if (book->stats.n_nodes + book->stats.n_links > n_stats && book_get_age(book) > 3600) book_save(book, file); @@ -2355,21 +2364,21 @@ typedef struct BookCheckGame { */ void book_check_game(Book *book, MoveHash *hash, const Game *game, BookCheckGame *stat) { - Board board[1]; + Board board; Move stack[99], *iter; - MoveList movelist[1]; + MoveList movelist; int i, n_moves; int bestscore; - board_init(board); - if (!board_equal(board, game->initial_board)) return; // skip non standard game + board_init(&board); + if (!board_equal(&board, &game->initial_board)) return; // skip non standard game for (i = n_moves = 0; i <= 60 - book->options.n_empties && game->move[i] != NOMOVE; ++i) { - if (!can_move(board->player, board->opponent)) { + if (!can_move(board.player, 
board.opponent)) { stack[n_moves++] = MOVE_PASS; - board_pass(board); + board_pass(&board); } - if (!board_is_occupied(board, game->move[i]) && board_get_move(board, game->move[i], &stack[n_moves])) { - board_update(board, stack + n_moves); + if (!board_is_occupied(&board, game->move[i]) && board_get_move_flip(&board, game->move[i], &stack[n_moves])) { + board_update(&board, stack + n_moves); ++n_moves; } else { warn("illegal move in game"); @@ -2378,10 +2387,10 @@ void book_check_game(Book *book, MoveHash *hash, const Game *game, BookCheckGame } while (--n_moves >= 0) { - board_restore(board, stack + n_moves); - if (movehash_append(hash, board, stack[n_moves].x)) { - if (book_get_moves(book, board, movelist)) { - bestscore = movelist_first(movelist)->score; + board_restore(&board, stack + n_moves); + if (movehash_append(hash, &board, stack[n_moves].x)) { + if (book_get_moves(book, &board, &movelist)) { + bestscore = movelist_first(&movelist)->score; foreach_move(iter, movelist) { if (iter->x == stack[n_moves].x) { if (iter->score < bestscore) ++stat->bad; @@ -2389,7 +2398,7 @@ void book_check_game(Book *book, MoveHash *hash, const Game *game, BookCheckGame break; } } - } else ++stat->missing; + } else ++stat->missing; } } } @@ -2429,14 +2438,14 @@ void book_check_base(Book *book, const Base *base) */ static void extract_skeleton(Book *book, Board *board, Line *pv, Base *base) { - MoveList movelist[1]; + MoveList movelist; Move *move; - Board init[1]; - Game game[1]; + Board init; + Game game; int bestscore; - if (book_get_moves(book, board, movelist)) { - bestscore = movelist_best(movelist)->score; + if (book_get_moves(book, board, &movelist)) { + bestscore = movelist_best(&movelist)->score; foreach_move(move, movelist) { if (move->score == bestscore) { @@ -2446,12 +2455,11 @@ static void extract_skeleton(Book *book, Board *board, Line *pv, Base *base) } } } else if (pv->n_moves) { - board_init(init); - line_to_game(init, pv, game); - base_append(base, game); + 
board_init(&init); + line_to_game(&init, pv, &game); + base_append(base, &game); if (base->n_games % 1000 == 0) { bprint("extracting %d games\r", base->n_games); - } } } @@ -2467,21 +2475,21 @@ static void extract_skeleton(Book *book, Board *board, Line *pv, Base *base) */ void book_extract_skeleton(Book *book, Base *base) { - Line pv[1]; - Board board[1]; + Line pv; + Board board; - line_init(pv, BLACK); - line_push(pv, F5); line_push(pv, D6); line_push(pv, C4); - board_init(board); - board_next(board, F5, board); board_next(board, D6, board); board_next(board, C4, board); - extract_skeleton(book, board, pv, base); + line_init(&pv, BLACK); + line_push(&pv, F5); line_push(&pv, D6); line_push(&pv, C4); + board_init(&board); + board_next(&board, F5, &board); board_next(&board, D6, &board); board_next(&board, C4, &board); + extract_skeleton(book, &board, &pv, base); - line_init(pv, BLACK); - line_push(pv, F5); line_push(pv, F6); line_push(pv, E6); line_push(pv, F4); - board_init(board); - board_next(board, F5, board); board_next(board, F6, board); - board_next(board, E6, board); board_next(board, F4, board); - extract_skeleton(book, board, pv, base); + line_init(&pv, BLACK); + line_push(&pv, F5); line_push(&pv, F6); line_push(&pv, E6); line_push(&pv, F4); + board_init(&board); + board_next(&board, F5, &board); board_next(&board, F6, &board); + board_next(&board, E6, &board); board_next(&board, F4, &board); + extract_skeleton(book, &board, &pv, base); bprint("%d games extracted \n", base->n_games); } @@ -2497,7 +2505,7 @@ void book_extract_positions(Book *book, const int n_empties, const int n_positio { PositionArray *a; Position *p; - MoveList movelist[1]; + MoveList movelist; Move *best, *second_best; int i = 0; char s[80]; @@ -2505,14 +2513,14 @@ void book_extract_positions(Book *book, const int n_empties, const int n_positio bprint("Extracting %d positions at %d ...\n", n_positions, n_empties); foreach_position(p, a, book) { if (i == n_positions) break; - if 
(board_count_empties(p->board) == n_empties) { - position_get_moves(p, p->board, movelist); - best = movelist_first(movelist); + if (board_count_empties(&p->board) == n_empties) { + position_get_moves(p, &p->board, &movelist); + best = movelist_first(&movelist); if (best) { second_best = move_next(best); if (second_best && best->score > second_best->score) { ++i; - board_to_string(p->board, n_empties & 1, s); + board_to_string(&p->board, n_empties & 1, s); bprint("%s %% bm ", s); move_print(best->x, n_empties & 1, stdout); bprint(":%+2d; ba ", best->score); @@ -2554,7 +2562,7 @@ void book_stats(Book *book) printf("stage positions links leaves terminal nodes\n"); for (i = 0; i < 61; ++i) n_pos[i] = n_leaf[i] = n_link[i] = n_terminal[i] = 0; foreach_position(p, a, book) { - i = board_count_empties(p->board); + i = board_count_empties(&p->board); ++n_pos[i]; if (p->leaf.move != NOMOVE) ++n_leaf[i]; if (p->n_link == 0) ++n_terminal[i]; diff --git a/src/book.h b/src/book.h index a2018a74..c28d521f 100644 --- a/src/book.h +++ b/src/book.h @@ -3,7 +3,7 @@ * * Header file for opening book management * - * @date 1998 - 2017 + * @date 1998 - 2020 * @author Richard Delorme * @version 4.4 */ @@ -45,7 +45,7 @@ typedef struct Book { int n; int n_nodes; bool need_saving; - Random random[1]; + Random random; Search *search; } Book; diff --git a/src/cassio.c b/src/cassio.c index 0a154a5a..c262b44a 100644 --- a/src/cassio.c +++ b/src/cassio.c @@ -13,9 +13,9 @@ * - With "-follow-cassio" Edax will follow more closely Cassio's search request. By default, it * searches with settings that make it better in tournament mode against Roxane, Cassio, etc. 
* - * @date 1998 - 2017 + * @date 1998 - 2023 * @author Richard Delorme - * @version 4.4 + * @version 4.5 */ #include "cassio.h" @@ -200,9 +200,9 @@ static bool is_position_new(Engine *engine, Board *board) cassio_debug("Position list: adding position %llx\n", board_get_hash_code(board)); engine->last_position.board[0] = *board; engine->last_position.n = MIN(ENGINE_N_POSITION, engine->last_position.n + 1); - hash_clear(engine->search->hash_table); - hash_clear(engine->search->pv_table); - hash_clear(engine->search->shallow_table); + hash_clear(&engine->search->hash_table); + hash_clear(&engine->search->pv_table); + hash_clear(&engine->search->shallow_table); return true; } @@ -220,8 +220,8 @@ static void engine_observer(Result *result) n += sprintf(engine_result + n, ", depth %d, @%d%%, %c%+d.00 <= v <= %c%+d.00, ", result->depth, selectivity_table[result->selectivity].percent, color, result->bound[result->move].lower, color, result->bound[result->move].upper); - line_to_string(result->pv, result->pv->n_moves, NULL, engine_result + n); - n += 2 * result->pv->n_moves; + line_to_string(&result->pv, result->pv.n_moves, NULL, engine_result + n); + n += 2 * result->pv.n_moves; n += sprintf(engine_result + n, ", node %llu, time %.3f", result->n_nodes, 0.001 * result->time); // avoid to send multiple times the same result. 
@@ -238,8 +238,8 @@ static void engine_observer(Result *result) static Search* engine_create_search(void) { Search *search; - - search = (Search*) malloc(sizeof (Search)); + + search = (Search*) mm_malloc(sizeof (Search)); if (search == NULL) { engine_send("ERROR: Cannot allocate a new search engine."); engine_send("bye bye!"); @@ -268,7 +268,7 @@ static Search* engine_create_search(void) static int engine_open(Search *search, const Board *board, const int player, const int alpha, const int beta, const int depth, const int precision) { int k; - HashData hash_data[1]; + HashData hash_data; Move *move; int score = 0; @@ -287,9 +287,9 @@ static int engine_open(Search *search, const Board *board, const int player, con search->child_nodes = 0; search_time_init(search); if (!search->options.keep_date) { - hash_clear(search->hash_table); - hash_clear(search->pv_table); - hash_clear(search->shallow_table); + hash_clear(&search->hash_table); + hash_clear(&search->pv_table); + hash_clear(&search->shallow_table); } search->height = 0; @@ -299,17 +299,17 @@ static int engine_open(Search *search, const Board *board, const int player, con search->stability_bound.lower = 2 * get_stability(board->player, board->opponent) - SCORE_MAX; // set the board - if (player != search->player || !board_equal(search->board, board)) { + if (player != search->player || !board_equal(&search->board, board)) { search_set_board(search, board, player); - if (hash_get(search->pv_table, board, board_get_hash_code(board), hash_data)) { - if (hash_data->lower == -SCORE_INF && hash_data->upper < SCORE_INF) score = hash_data->upper; - else if (hash_data->upper == +SCORE_INF && hash_data->lower > -SCORE_INF) score = hash_data->lower; - else score = (hash_data->upper + hash_data->lower) / 2; + if (hash_get_from_board(&search->pv_table, board, &hash_data)) { + if (hash_data.lower == -SCORE_INF && hash_data.upper < SCORE_INF) score = hash_data.upper; + else if (hash_data.upper == +SCORE_INF && hash_data.lower 
> -SCORE_INF) score = hash_data.lower; + else score = (hash_data.upper + hash_data.lower) / 2; } - if (!movelist_is_empty(search->movelist)) { - movelist_evaluate(search->movelist, search, hash_data, options.alpha, depth); - movelist_sort(search->movelist); + if (!movelist_is_empty(&search->movelist)) { + movelist_evaluate(&search->movelist, search, &hash_data, options.alpha, depth); + movelist_sort(&search->movelist); } } @@ -318,19 +318,19 @@ static int engine_open(Search *search, const Board *board, const int player, con search->result->bound[move->x].upper = SCORE_MAX; } - search->result->n_moves_left = search->result->n_moves = search->movelist->n_moves; + search->result->n_moves_left = search->result->n_moves = search->movelist.n_moves; search->result->book_move = false; // set level search->depth = depth; - if (options.transgress_cassio && (search->n_empties & 1) != (depth & 1)) ++search->depth; - if (options.transgress_cassio && search->depth > search->n_empties - 10) search->depth = search->n_empties; + if (options.transgress_cassio && (search->eval.n_empties & 1) != (depth & 1)) ++search->depth; + if (options.transgress_cassio && search->depth > search->eval.n_empties - 10) search->depth = search->eval.n_empties; search->options.depth = search->depth; - BOUND(search->depth, 0, search->n_empties, "depth"); - search->depth_pv_extension = get_pv_extension(search->depth, search->n_empties); + BOUND(search->depth, 0, search->eval.n_empties, "depth"); + search->depth_pv_extension = get_pv_extension(search->depth, search->eval.n_empties); - if (options.transgress_cassio && depth < search->n_empties) k = 0; + if (options.transgress_cassio && depth < search->eval.n_empties) k = 0; else if (precision <= 73) k = 0; else if (precision <= 87) k = 1; else if (precision <= 95) k = 2; @@ -376,7 +376,7 @@ static void engine_close(Search *search) void* engine_init(void) { Engine *engine; - + log_open(engine_log, options.ui_log_file); engine = (Engine*) malloc(sizeof 
(Engine)); @@ -401,10 +401,11 @@ void* engine_init(void) */ void engine_free(void *v) { - Search *search = (Search*) v; + Search *const search = (Search*) v; + if (search) { search_free(search); - free(search); + mm_free(search); } log_close(engine_log); } @@ -413,10 +414,16 @@ void engine_free(void *v) void feed_all_hash_table(Search *search, Board *board, const int depth, const int selectivity, const int lower, const int upper, const int move) { + HashStoreData hash_data; const unsigned long long hash_code = board_get_hash_code(board); - hash_feed(search->hash_table, board, hash_code, depth, selectivity, lower, upper, move); - hash_feed(search->pv_table, board, hash_code, depth, selectivity, lower, upper, move); + hash_data.data.wl.c.depth = depth; + hash_data.data.wl.c.selectivity = selectivity; + hash_data.data.move[0] = move; + hash_data.data.lower = lower; + hash_data.data.upper = upper; + hash_feed(&search->hash_table, board, hash_code, &hash_data); + hash_feed(&search->pv_table, board, hash_code, &hash_data); } /** @@ -432,12 +439,12 @@ void feed_all_hash_table(Search *search, Board *board, const int depth, const in */ void engine_feed_hash(void *v, Board *board, int lower, int upper, const int depth, const int precision, Line *pv) { - Engine *engine = (Engine*) v; - Search *search = engine->search; + Engine *const engine = (Engine*) v; + Search *const search = engine->search; int i, selectivity, tmp; int current_depth; Move *move, *child_move; - MoveList movelist[1], child_movelist[1]; + MoveList movelist, child_movelist; if (options.transgress_cassio && depth < board_count_empties(board)) selectivity = 0; else if (precision <= 73) selectivity = 0; @@ -453,15 +460,15 @@ void engine_feed_hash(void *v, Board *board, int lower, int upper, const int dep current_depth = depth - i; feed_all_hash_table(search, board, current_depth, selectivity, lower, upper, pv->move[i]); - movelist_get_moves(movelist, board); - movelist_sort_bestmove(movelist, pv->move[i]); + 
movelist_get_moves(&movelist, board); + movelist_sort_bestmove(&movelist, pv->move[i]); foreach_move(move, movelist) { board_update(board, move); if (move->x == pv->move[i]) { feed_all_hash_table(search, board, current_depth - 1, selectivity, -upper, -lower, NOMOVE); if (lower > SCORE_MIN) { - movelist_get_moves(child_movelist, board); + movelist_get_moves(&child_movelist, board); foreach_move(child_move, child_movelist) { board_update(board, child_move); feed_all_hash_table(search, board, current_depth - 2, selectivity, lower, SCORE_MAX, NOMOVE); @@ -474,7 +481,7 @@ void engine_feed_hash(void *v, Board *board, int lower, int upper, const int dep board_restore(board, move); } - move = movelist_first(movelist); + move = movelist_first(&movelist); if (move && move->x == pv->move[i]) { board_update(board, move); @@ -500,14 +507,14 @@ void engine_feed_hash(void *v, Board *board, int lower, int upper, const int dep */ void engine_empty_hash(void *v) { - Engine *engine = (Engine*) v; + Engine *const engine = (Engine*) v; - if (engine && engine->search && engine->search->hash_table && engine->search->pv_table) { + if (engine && engine->search) { cassio_debug("clear the hash-table.\n"); engine->last_position.n = 0; - hash_cleanup(engine->search->hash_table); - hash_cleanup(engine->search->pv_table); - hash_cleanup(engine->search->shallow_table); + hash_cleanup(&engine->search->hash_table); + hash_cleanup(&engine->search->pv_table); + hash_cleanup(&engine->search->shallow_table); } } @@ -519,49 +526,48 @@ void engine_empty_hash(void *v) */ static bool skip_search(Engine *engine, int *old_score) { - Search *search = engine->search; - Board *board = search->board; - MoveList *movelist = search->movelist; - HashData hash_data[1]; + Search *const search = engine->search; + MoveList *const movelist = &search->movelist; + HashData hash_data; Move *bestmove; int alpha = options.alpha; int beta = options.beta; Bound *bound; char s[4], b[80]; - const unsigned long long hash_code = 
board_get_hash_code(board); + const unsigned long long hash_code = board_get_hash_code(&search->board); *old_score = 0; - if (hash_get(search->pv_table, board, hash_code, hash_data) - || hash_get(search->hash_table, board, hash_code, hash_data)) { + if (hash_get(&search->pv_table, &search->board, hash_code, &hash_data) + || hash_get(&search->hash_table, &search->board, hash_code, &hash_data)) { // compute bounds - if (alpha < hash_data->lower) alpha = *old_score = hash_data->lower; - if (beta > hash_data->upper) beta = *old_score = hash_data->upper; + if (alpha < hash_data.lower) alpha = *old_score = hash_data.lower; + if (beta > hash_data.upper) beta = *old_score = hash_data.upper; // skip search ? - if (hash_data->depth >= search->depth && hash_data->selectivity >= search->selectivity && alpha >= beta) { - if (hash_data->move[0] != NOMOVE) movelist_sort_bestmove(movelist, hash_data->move[0]); - else if (hash_data->lower > SCORE_MIN) return false; + if (hash_data.wl.c.depth >= search->depth && hash_data.wl.c.selectivity >= search->selectivity && alpha >= beta) { + if (hash_data.move[0] != NOMOVE) movelist_sort_bestmove(movelist, hash_data.move[0]); + else if (hash_data.lower > SCORE_MIN) return false; bestmove = movelist_first(movelist); bestmove->score = *old_score; - record_best_move(search, board, bestmove, options.alpha, options.beta, search->depth); + record_best_move(search, bestmove, options.alpha, options.beta, search->depth); bound = search->result->bound + bestmove->x; if (bound->lower != bound->upper || is_pv_ok(search, bestmove->x, search->depth)) { - cassio_debug("Edax skips the search. The position is already in the hash table: %s (%d, %d) ?\n", move_to_string(bestmove->x, search->player, s), hash_data->lower, hash_data->upper); + cassio_debug("Edax skips the search. 
The position is already in the hash table: %s (%d, %d) ?\n", move_to_string(bestmove->x, search->player, s), hash_data.lower, hash_data.upper); engine_observer(search->result); return true; } else { cassio_debug("Edax does not skip the search : BAD PV!\n"); } } else { - if (hash_data->depth < search->depth || hash_data->selectivity < search->selectivity) { - cassio_debug("Edax does not skip the search: Level %d@%d < %d@%d\n", hash_data->depth,selectivity_table[hash_data->selectivity].percent, search->depth, selectivity_table[search->selectivity].percent); + if (hash_data.wl.c.depth < search->depth || hash_data.wl.c.selectivity < search->selectivity) { + cassio_debug("Edax does not skip the search: Level %d@%d < %d@%d\n", hash_data.wl.c.depth, selectivity_table[hash_data.wl.c.selectivity].percent, search->depth, selectivity_table[search->selectivity].percent); } else { cassio_debug("Edax does not skip the search: unsolved score alpha %d < beta %d\n", alpha, beta); } } } else { - cassio_debug("Edax does not skip the search: Position %s (hash=%llx) not found\n", board_to_string(board, search->player, b), board_get_hash_code(board)); + cassio_debug("Edax does not skip the search: Position %s (hash=%llx) not found\n", board_to_string(&search->board, search->player, b), hash_code); } return false; @@ -584,9 +590,9 @@ static bool skip_search(Engine *engine, int *old_score) */ double engine_midgame_search(void *v, const char *position, const double alpha, const double beta, const int depth, const int precision) { - Engine *engine = (Engine*) v; - Search *search = engine->search; - Board board[1]; + Engine *const engine = (Engine*) v; + Search *const search = engine->search; + Board board; int player; int old_score; @@ -596,12 +602,12 @@ double engine_midgame_search(void *v, const char *position, const double alpha, } engine->is_searching = true; - player = board_set(board, position); + player = board_set(&board, position); - old_score = engine_open(search, board, player, 
floor(alpha), ceil(beta), depth, precision); + old_score = engine_open(search, &board, player, floor(alpha), ceil(beta), depth, precision); if (skip_search(engine, &old_score)) { - } else if (is_position_new(engine, board)) { + } else if (is_position_new(engine, &board)) { cassio_debug("iterative deepening.\n"); iterative_deepening(search, options.alpha, options.beta); } else { @@ -629,9 +635,9 @@ double engine_midgame_search(void *v, const char *position, const double alpha, */ int engine_endgame_search(void *v, const char *position, const int alpha, const int beta, const int precision) { - Engine *engine = (Engine*) v; - Search *search = engine->search; - Board board[1]; + Engine *const engine = (Engine*) v; + Search *const search = engine->search; + Board board; int player; int old_score; int depth; @@ -642,13 +648,13 @@ int engine_endgame_search(void *v, const char *position, const int alpha, const } engine->is_searching = true; - player = board_set(board, position); - depth = board_count_empties(board); + player = board_set(&board, position); + depth = board_count_empties(&board); - old_score = engine_open(search, board, player, alpha, beta, depth, precision); + old_score = engine_open(search, &board, player, alpha, beta, depth, precision); if (skip_search(engine, &old_score)) { - } else if (is_position_new(engine, board)) { + } else if (is_position_new(engine, &board)) { cassio_debug("iterative deepening.\n"); iterative_deepening(search, options.alpha, options.beta); } else { @@ -669,7 +675,7 @@ int engine_endgame_search(void *v, const char *position, const int alpha, const */ void engine_stop(void *v) { - Search *search = (Search*) v; + Search *const search = (Search*) v; if (search == NULL) { engine_send("ERROR: Engine need to be initialized."); return; @@ -683,7 +689,7 @@ void engine_stop(void *v) void engine_loop(void) { char *cmd = NULL, *param = NULL; - Engine *engine = (Engine*) engine_init(); + Engine *const engine = (Engine*) engine_init(); // loop 
forever for (;;) { @@ -707,12 +713,12 @@ void engine_loop(void) } else if (strcmp(cmd, "feed-hash") == 0) { int depth = 21, precision = 73, player; double lower = -SCORE_INF, upper = SCORE_INF; - Board board[1]; - Line pv[1]; + Board board; + Line pv; char *string; errno = 0; - string = parse_board(param, board, &player); + string = parse_board(param, &board, &player); if (string == param) engine_send("Error: in feed-hash, Edax cannot parse position."); else { string = parse_real(string, &lower); @@ -727,9 +733,9 @@ void engine_loop(void) string = parse_int(string, &precision); if (errno) engine_send("Error: in feed-hash, Edax cannot parse precision."); else { - line_init(pv, player); - parse_game(string, board, pv); - engine_feed_hash(engine, board, floor(lower), ceil(upper), depth, precision, pv); + line_init(&pv, player); + parse_game(string, &board, &pv); + engine_feed_hash(engine, &board, floor(lower), ceil(upper), depth, precision, &pv); } } } @@ -751,12 +757,12 @@ void engine_loop(void) } else if (strcmp(cmd, "midgame-search") == 0) { double alpha = -SCORE_INF, beta = SCORE_INF; int player, depth = 60, precision = 100; - Board board[1]; + Board board; char b[80]; char *s; errno = 0; - s = parse_board(param, board, &player); + s = parse_board(param, &board, &player); if (s == param) engine_send("ERROR: midgame-search cannot parse position."); else { s = parse_real(s, &alpha); @@ -770,7 +776,7 @@ void engine_loop(void) else { s = parse_int(s, &precision); if (errno) engine_send("ERROR: midgame_search cannot parse precision."); - engine_midgame_search(engine, board_to_string(board, player, b), alpha, beta, depth, precision); + engine_midgame_search(engine, board_to_string(&board, player, b), alpha, beta, depth, precision); } } } @@ -780,12 +786,12 @@ void engine_loop(void) } else if (strcmp(cmd, "endgame-search") == 0) { int alpha = -SCORE_INF, beta = SCORE_INF; int player, precision = 100; - Board board[1]; + Board board; char b[80]; char *s; errno = 0; - s = 
parse_board(param, board, &player); + s = parse_board(param, &board, &player); if (s == param) engine_send("ERROR: endgame_search cannot parse position."); else { s = parse_int(s, &alpha); @@ -796,7 +802,7 @@ void engine_loop(void) else { s = parse_int(s, &precision); if (errno) engine_send("ERROR: endgame_search cannot parse precision."); - engine_endgame_search(engine, board_to_string(board, player, b), alpha, beta, precision); + engine_endgame_search(engine, board_to_string(&board, player, b), alpha, beta, precision); } } } diff --git a/src/const.h b/src/const.h index ef84651b..5b025d3d 100644 --- a/src/const.h +++ b/src/const.h @@ -3,9 +3,9 @@ * * Constants as macros, enums, or global consts. * - * @date 1998 - 2017 + * @date 1998 - 2024 * @author Richard Delorme - * @version 4.4 + * @version 4.5 */ #ifndef EDAX_CONST_H @@ -15,7 +15,7 @@ #define MAX_THREADS 64 /** maximal number of moves */ -#define MAX_MOVE 32 +#define MAX_MOVE 33 // https://eukaryote.hateblo.jp/entry/2023/05/23/145945 /** size of the board */ #define BOARD_SIZE 64 @@ -45,9 +45,6 @@ enum { OFF_SIDE }; -extern const unsigned long long X_TO_BIT[]; -extern const unsigned long long NEIGHBOUR[]; - /** infinite score: a huge value unreachable as a score and fitting in a char */ #define SCORE_INF 127 @@ -77,16 +74,17 @@ typedef enum Stop { } Stop; /** node type */ -typedef enum { +enum { PV_NODE, CUT_NODE, ALL_NODE -} NodeType; +}; +typedef unsigned char NodeType; #define VERSION 4 -#define RELEASE 4 -#define VERSION_STRING "4.4" -#define EDAX_NAME "Edax 4.4" +#define RELEASE 5 +#define VERSION_STRING "4.5.3" +#define EDAX_NAME "Edax 4.5.3" #define BOOK 0x424f4f4b #define EDAX 0x45444158 #define EVAL 0x4556414c diff --git a/src/count_last_flip_carry_32.c b/src/count_last_flip_32.c similarity index 96% rename from src/count_last_flip_carry_32.c rename to src/count_last_flip_32.c index 59b0da62..492c2668 100644 --- a/src/count_last_flip_carry_32.c +++ b/src/count_last_flip_32.c @@ -1,5 +1,5 @@ /** - * 
@file count_last_flip_carry_32.c + * @file count_last_flip_32.c * * * A function is provided to count the number of fipped disc of the last move @@ -107,7 +107,7 @@ static int count_last_flip_A1(const unsigned long long P) { int n_flipped; - n_flipped = COUNT_FLIP_R[(((LODWORD(P) & 0x01010100u) + ((HIDWORD(P) & 0x01010101u) << 4)) * 0x01020408u) >> 25]; + n_flipped = COUNT_FLIP_R[(((LODWORD(P) & 0x01010101u) + ((HIDWORD(P) & 0x01010101u) << 4)) * 0x01020408u) >> 25]; n_flipped += COUNT_FLIP_R[(LODWORD(P) >> 1) & 0x7f]; n_flipped += COUNT_FLIP_R[(((LODWORD(P) & 0x08040200u) + (HIDWORD(P) & 0x80402010u)) * 0x01010101u) >> 25]; @@ -124,7 +124,7 @@ static int count_last_flip_B1(const unsigned long long P) { int n_flipped; - n_flipped = COUNT_FLIP_R[(((LODWORD(P) & 0x02020200u) + ((HIDWORD(P) & 0x02020202u) << 4)) * 0x00810204u) >> 25]; + n_flipped = COUNT_FLIP_R[(((LODWORD(P) & 0x02020202u) + ((HIDWORD(P) & 0x02020202u) << 4)) * 0x00810204u) >> 25]; n_flipped += COUNT_FLIP_R[(LODWORD(P) >> 2) & 0x3f]; n_flipped += COUNT_FLIP_R[(((LODWORD(P) & 0x10080400u) + (HIDWORD(P) & 0x00804020u)) * 0x01010101u) >> 26]; @@ -141,9 +141,9 @@ static int count_last_flip_C1(const unsigned long long P) { int n_flipped; - n_flipped = COUNT_FLIP_R[(((LODWORD(P) & 0x04040400u) + ((HIDWORD(P) & 0x04040404u) << 4)) * 0x00408102u) >> 25]; + n_flipped = COUNT_FLIP_R[(((LODWORD(P) & 0x04040404u) + ((HIDWORD(P) & 0x04040404u) << 4)) * 0x00408102u) >> 25]; n_flipped += COUNT_FLIP_2[LODWORD(P) & 0xff]; - n_flipped += COUNT_FLIP_2[(((LODWORD(P) & 0x20110A04u) + (HIDWORD(P) & 0x00008040u)) * 0x01010101u) >> 24]; + n_flipped += COUNT_FLIP_2[(((LODWORD(P) & 0x20110A04u) + (HIDWORD(P) & 0x00008040u)) * 0x01010101u) >> 24]; // A3C1H6 return n_flipped; } @@ -158,9 +158,9 @@ static int count_last_flip_D1(const unsigned long long P) { int n_flipped; - n_flipped = COUNT_FLIP_R[(((LODWORD(P) & 0x08080800u) + ((HIDWORD(P) & 0x08080808u) << 4)) * 0x00204081u) >> 25]; + n_flipped = COUNT_FLIP_R[(((LODWORD(P) & 
0x08080808u) + ((HIDWORD(P) & 0x08080808u) << 4)) * 0x00204081u) >> 25]; n_flipped += COUNT_FLIP_3[LODWORD(P) & 0xff]; - n_flipped += COUNT_FLIP_3[(((LODWORD(P) & 0x41221408u) + (HIDWORD(P) & 0x00000080u)) * 0x01010101u) >> 24]; + n_flipped += COUNT_FLIP_3[(((LODWORD(P) & 0x41221408u) + (HIDWORD(P) & 0x00000080u)) * 0x01010101u) >> 24]; // A4D1H5 return n_flipped; } @@ -175,9 +175,9 @@ static int count_last_flip_E1(const unsigned long long P) { int n_flipped; - n_flipped = COUNT_FLIP_R[((((LODWORD(P) & 0x10101000u) >> 4) + (HIDWORD(P) & 0x10101010u)) * 0x01020408u) >> 25]; + n_flipped = COUNT_FLIP_R[((((LODWORD(P) & 0x10101010u) >> 4) + (HIDWORD(P) & 0x10101010u)) * 0x01020408u) >> 25]; n_flipped += COUNT_FLIP_4[LODWORD(P) & 0xff]; - n_flipped += COUNT_FLIP_4[(((LODWORD(P) & 0x82442810u) + (HIDWORD(P) & 0x00000001u)) * 0x01010101u) >> 24]; + n_flipped += COUNT_FLIP_4[(((LODWORD(P) & 0x82442810u) + (HIDWORD(P) & 0x00000001u)) * 0x01010101u) >> 24]; // A5E1H4 return n_flipped; } @@ -192,9 +192,9 @@ static int count_last_flip_F1(const unsigned long long P) { int n_flipped; - n_flipped = COUNT_FLIP_R[(((HIDWORD(P) & 0x20202020u) + ((LODWORD(P) >> 4) & 0x02020200u)) * 0x00810204u) >> 25]; + n_flipped = COUNT_FLIP_R[((((LODWORD(P) & 0x20202020u) >> 4) + (HIDWORD(P) & 0x20202020u)) * 0x00810204u) >> 25]; n_flipped += COUNT_FLIP_5[LODWORD(P) & 0xff]; - n_flipped += COUNT_FLIP_5[(((LODWORD(P) & 0x04885020u) + (HIDWORD(P) & 0x00000102u)) * 0x01010101u) >> 24]; + n_flipped += COUNT_FLIP_5[(((LODWORD(P) & 0x04885020u) + (HIDWORD(P) & 0x00000102u)) * 0x01010101u) >> 24]; // A6F1H3 return n_flipped; } @@ -209,7 +209,7 @@ static int count_last_flip_G1(const unsigned long long P) { int n_flipped; - n_flipped = COUNT_FLIP_R[((((LODWORD(P) & 0x40404000u) >> 4) + (HIDWORD(P) & 0x40404040u)) * 0x00408102u) >> 25]; + n_flipped = COUNT_FLIP_R[((((LODWORD(P) & 0x40404040u) >> 4) + (HIDWORD(P) & 0x40404040u)) * 0x00408102u) >> 25]; n_flipped += COUNT_FLIP_L[(LODWORD(P) << 1) & 0x7e]; 
n_flipped += COUNT_FLIP_L[(((LODWORD(P) & 0x08102000u) + (HIDWORD(P) & 0x00010204u)) * 0x02020202u) >> 24]; @@ -226,7 +226,7 @@ static int count_last_flip_H1(const unsigned long long P) { int n_flipped; - n_flipped = COUNT_FLIP_R[((((LODWORD(P) & 0x80808000u) >> 4) + (HIDWORD(P) & 0x80808080u)) * 0x00204081u) >> 25]; + n_flipped = COUNT_FLIP_R[((((LODWORD(P) & 0x80808080u) >> 4) + (HIDWORD(P) & 0x80808080u)) * 0x00204081u) >> 25]; n_flipped += COUNT_FLIP_L[LODWORD(P) & 0x7f]; n_flipped += COUNT_FLIP_L[(((LODWORD(P) & 0x10204000u) + (HIDWORD(P) & 0x01020408u)) * 0x01010101u) >> 24]; @@ -279,7 +279,7 @@ static int count_last_flip_C2(const unsigned long long P) n_flipped = COUNT_FLIP_R[(((LODWORD(P) & 0x04040000u) + ((HIDWORD(P) & 0x04040404u) << 4)) * 0x00408102u) >> 26]; n_flipped += COUNT_FLIP_2[(LODWORD(P) >> 8) & 0xff]; - n_flipped += COUNT_FLIP_2[(((LODWORD(P) & 0x110A0400u) + (HIDWORD(P) & 0x00804020u)) * 0x01010101u) >> 24]; + n_flipped += COUNT_FLIP_2[(((LODWORD(P) & 0x110A0400u) + (HIDWORD(P) & 0x00804020u)) * 0x01010101u) >> 24]; // A4C2H7 return n_flipped; } @@ -296,7 +296,7 @@ static int count_last_flip_D2(const unsigned long long P) n_flipped = COUNT_FLIP_R[(((LODWORD(P) & 0x08080000u) + ((HIDWORD(P) & 0x08080808u) << 4)) * 0x00204081u) >> 26]; n_flipped += COUNT_FLIP_3[(LODWORD(P) >> 8) & 0xff]; - n_flipped += COUNT_FLIP_3[(((LODWORD(P) & 0x22140800u) + (HIDWORD(P) & 0x00008041u)) * 0x01010101u) >> 24]; + n_flipped += COUNT_FLIP_3[(((LODWORD(P) & 0x22140800u) + (HIDWORD(P) & 0x00008041u)) * 0x01010101u) >> 24]; // A5D2H6 return n_flipped; } @@ -313,7 +313,7 @@ static int count_last_flip_E2(const unsigned long long P) n_flipped = COUNT_FLIP_R[((((LODWORD(P) & 0x10100000u) >> 4) + (HIDWORD(P) & 0x10101010u)) * 0x01020408u) >> 26]; n_flipped += COUNT_FLIP_4[(LODWORD(P) >> 8) & 0xff]; - n_flipped += COUNT_FLIP_4[(((LODWORD(P) & 0x44281000u) + (HIDWORD(P) & 0x00000182u)) * 0x01010101u) >> 24]; + n_flipped += COUNT_FLIP_4[(((LODWORD(P) & 0x44281000u) + 
(HIDWORD(P) & 0x00000182u)) * 0x01010101u) >> 24]; // A6E2H5 return n_flipped; } @@ -330,7 +330,7 @@ static int count_last_flip_F2(const unsigned long long P) n_flipped = COUNT_FLIP_R[(((HIDWORD(P) & 0x20202020u) + ((LODWORD(P) & 0x20200000u) >> 4)) * 0x00810204u) >> 26]; n_flipped += COUNT_FLIP_5[(LODWORD(P) >> 8) & 0xff]; - n_flipped += COUNT_FLIP_5[(((LODWORD(P) & 0x88502000u) + (HIDWORD(P) & 0x00010204u)) * 0x01010101u) >> 24]; + n_flipped += COUNT_FLIP_5[(((LODWORD(P) & 0x88502000u) + (HIDWORD(P) & 0x00010204u)) * 0x01010101u) >> 24]; // A7F2H4 return n_flipped; } @@ -975,7 +975,7 @@ static int count_last_flip_C7(const unsigned long long P) n_flipped = COUNT_FLIP_L[(((LODWORD(P) & 0x04040404u) + ((HIDWORD(P) & 0x00000404u) << 4)) * 0x00810204u) >> 24]; n_flipped += COUNT_FLIP_2[(HIDWORD(P) >> 16) & 0xff]; - n_flipped += COUNT_FLIP_2[(((HIDWORD(P) & 0x00040A11u) + (LODWORD(P) & 0x20408000u)) * 0x01010101u) >> 24]; + n_flipped += COUNT_FLIP_2[(((HIDWORD(P) & 0x00040A11u) + (LODWORD(P) & 0x20408000u)) * 0x01010101u) >> 24]; // A5C7H2 return n_flipped; } @@ -992,7 +992,7 @@ static int count_last_flip_D7(const unsigned long long P) n_flipped = COUNT_FLIP_L[((((HIDWORD(P) & 0x00000808u) << 4) + (LODWORD(P) & 0x08080808u)) * 0x00408102u) >> 24]; n_flipped += COUNT_FLIP_3[(HIDWORD(P) >> 16) & 0xff]; - n_flipped += COUNT_FLIP_3[(((HIDWORD(P) & 0x00081422u) + (LODWORD(P) & 0x41800000u)) * 0x01010101u) >> 24]; + n_flipped += COUNT_FLIP_3[(((HIDWORD(P) & 0x00081422u) + (LODWORD(P) & 0x41800000u)) * 0x01010101u) >> 24]; // A4D7H3 return n_flipped; } @@ -1009,7 +1009,7 @@ static int count_last_flip_E7(const unsigned long long P) n_flipped = COUNT_FLIP_L[(((HIDWORD(P) & 0x00001010u) + ((LODWORD(P) & 0x10101010u) >> 4)) * 0x02040810u) >> 24]; n_flipped += COUNT_FLIP_4[(HIDWORD(P) >> 16) & 0xff]; - n_flipped += COUNT_FLIP_4[(((HIDWORD(P) & 0x00102844u) + (LODWORD(P) & 0x82010000u)) * 0x01010101u) >> 24]; + n_flipped += COUNT_FLIP_4[(((HIDWORD(P) & 0x00102844u) + (LODWORD(P) & 
0x82010000u)) * 0x01010101u) >> 24]; // A3E7H4 return n_flipped; } @@ -1026,7 +1026,7 @@ static int count_last_flip_F7(const unsigned long long P) n_flipped = COUNT_FLIP_L[(((HIDWORD(P) & 0x00002020u) + ((LODWORD(P) & 0x20202020u) >> 4)) * 0x01020408u) >> 24]; n_flipped += COUNT_FLIP_5[(HIDWORD(P) >> 16) & 0xff]; - n_flipped += COUNT_FLIP_5[(((HIDWORD(P) & 0x00205088u) + (LODWORD(P) & 0x04020100u)) * 0x01010101u) >> 24]; + n_flipped += COUNT_FLIP_5[(((HIDWORD(P) & 0x00205088u) + (LODWORD(P) & 0x04020100u)) * 0x01010101u) >> 24]; // A2F7H5 return n_flipped; } @@ -1111,7 +1111,7 @@ static int count_last_flip_C8(const unsigned long long P) n_flipped = COUNT_FLIP_L[(((LODWORD(P) & 0x04040404u) + ((HIDWORD(P) & 0x00040404u) << 4)) * 0x00408102u) >> 24]; n_flipped += COUNT_FLIP_2[HIDWORD(P) >> 24]; - n_flipped += COUNT_FLIP_2[(((HIDWORD(P) & 0x040A1120u) + (LODWORD(P) & 0x40800000u)) * 0x01010101u) >> 24]; + n_flipped += COUNT_FLIP_2[(((HIDWORD(P) & 0x040A1120u) + (LODWORD(P) & 0x40800000u)) * 0x01010101u) >> 24]; // A6C8H3 return n_flipped; } @@ -1128,7 +1128,7 @@ static int count_last_flip_D8(const unsigned long long P) n_flipped = COUNT_FLIP_L[((((HIDWORD(P) & 0x00080808u) << 4) + (LODWORD(P) & 0x08080808u)) * 0x00204081u) >> 24]; n_flipped += COUNT_FLIP_3[HIDWORD(P) >> 24]; - n_flipped += COUNT_FLIP_3[(((HIDWORD(P) & 0x08142241u) + (LODWORD(P) & 0x80000000u)) * 0x01010101u) >> 24]; + n_flipped += COUNT_FLIP_3[(((HIDWORD(P) & 0x08142241u) + (LODWORD(P) & 0x80000000u)) * 0x01010101u) >> 24]; // A5D8H4 return n_flipped; } @@ -1145,7 +1145,7 @@ static int count_last_flip_E8(const unsigned long long P) n_flipped = COUNT_FLIP_L[(((HIDWORD(P) & 0x00101010u) + ((LODWORD(P) & 0x10101010u) >> 4)) * 0x01020408u) >> 24]; n_flipped += COUNT_FLIP_4[HIDWORD(P) >> 24]; - n_flipped += COUNT_FLIP_4[(((HIDWORD(P) & 0x10284482u) + (LODWORD(P) & 0x01000000u)) * 0x01010101u) >> 24]; + n_flipped += COUNT_FLIP_4[(((HIDWORD(P) & 0x10284482u) + (LODWORD(P) & 0x01000000u)) * 0x01010101u) >> 
24]; // A4E8H5 return n_flipped; } @@ -1162,7 +1162,7 @@ static int count_last_flip_F8(const unsigned long long P) n_flipped = COUNT_FLIP_L[(((HIDWORD(P) & 0x00202020u) + ((LODWORD(P) & 0x20202020u) >> 4)) * 0x00810204u) >> 24]; n_flipped += COUNT_FLIP_5[HIDWORD(P) >> 24]; - n_flipped += COUNT_FLIP_5[(((HIDWORD(P) & 0x00508804u) + (LODWORD(P) & 0x02010000u)) * 0x01010101u) >> 24]; + n_flipped += COUNT_FLIP_5[(((HIDWORD(P) & 0x00508804u) + (LODWORD(P) & 0x02010000u)) * 0x01010101u) >> 24]; // A3F8H6 return n_flipped; } @@ -1214,7 +1214,7 @@ static int count_last_flip_pass(const unsigned long long P) } /** Array of functions to count flipped discs of the last move */ -int (*COUNT_LAST_FLIP[])(const unsigned long long) = { +int (*count_last_flip[])(const unsigned long long) = { count_last_flip_A1, count_last_flip_B1, count_last_flip_C1, count_last_flip_D1, count_last_flip_E1, count_last_flip_F1, count_last_flip_G1, count_last_flip_H1, count_last_flip_A2, count_last_flip_B2, count_last_flip_C2, count_last_flip_D2, @@ -1233,9 +1233,3 @@ int (*COUNT_LAST_FLIP[])(const unsigned long long) = { count_last_flip_E8, count_last_flip_F8, count_last_flip_G8, count_last_flip_H8, count_last_flip_pass, }; - -int count_last_flip(const int x, const unsigned long long P) -{ - return COUNT_LAST_FLIP[x](P); -} - diff --git a/src/count_last_flip_avx512cd.c b/src/count_last_flip_avx512cd.c new file mode 100644 index 00000000..54de932a --- /dev/null +++ b/src/count_last_flip_avx512cd.c @@ -0,0 +1,51 @@ +/** + * @file count_last_flip_avx512cd.c + * + * A function is provided to count the number of fipped disc of the last move. + * + * Count last flip using the flip_avx512cd way. + * For optimization purpose, the value returned is twice the number of flipped + * disc, to facilitate the computation of disc difference. 
+ * + * @date 2023 - 2024 + * @author Toshihiko Okuhara + * @version 4.5 + * + */ + +#include "bit.h" + +extern const V8DI lrmask[66]; + +/** + * Count last flipped discs when playing on the last empty. + * + * @param pos the last empty square. + * @param P player's disc pattern. + * @return flipped disc count. + */ + +int last_flip(int pos, unsigned long long P) +{ + __m256i PP = _mm256_set1_epi64x(P); + __m256i flip, outflank, eraser, rmask, lmask; + __m128i flip2; + + // left: look for player LS1B + lmask = lrmask[pos].v4[0]; + outflank = _mm256_and_si256(PP, lmask); + // set below LS1B if P is in lmask + flip = _mm256_maskz_add_epi64(_mm256_test_epi64_mask(PP, lmask), outflank, _mm256_set1_epi64x(-1)); + // flip = _mm256_and_si256(_mm256_andnot_si256(outflank, flip), lmask); + flip = _mm256_ternarylogic_epi64(outflank, flip, lmask, 0x08); + + // right: look for player bit with lzcnt + rmask = lrmask[pos].v4[1]; + eraser = _mm256_srlv_epi64(_mm256_set1_epi64x(-1), + _mm256_maskz_lzcnt_epi64(_mm256_test_epi64_mask(PP, rmask), _mm256_and_si256(PP, rmask))); + // flip = _mm256_or_si256(flip, _mm256_andnot_si256(eraser, rmask)); + flip = _mm256_ternarylogic_epi64(flip, eraser, rmask, 0xf2); + + flip2 = _mm_or_si128(_mm256_castsi256_si128(flip), _mm256_extracti128_si256(flip, 1)); + return 2 * bit_count(_mm_cvtsi128_si64(_mm_or_si128(flip2, _mm_unpackhi_epi64(flip2, flip2)))); +} diff --git a/src/count_last_flip_avx_ppfill.c b/src/count_last_flip_avx_ppfill.c new file mode 100644 index 00000000..073bc55b --- /dev/null +++ b/src/count_last_flip_avx_ppfill.c @@ -0,0 +1,57 @@ +/** + * @file count_last_flip_avx_ppfill.c + * + * A function is provided to count the number of fipped disc of the last move. + * + * Count last flip using the flip_avx_ppfill way. + * For optimization purpose, the value returned is twice the number of flipped + * disc, to facilitate the computation of disc difference. 
+ * + * @date 2023 + * @author Toshihiko Okuhara + * @version 4.5 + * + */ + +#include "bit.h" + +extern const V4DI lmask_v4[66], rmask_v4[66]; + +/** + * Count last flipped discs when playing on the last empty. + * + * @param pos the last empty square. + * @param P player's disc pattern. + * @return flipped disc count. + */ + +int last_flip(int pos, unsigned long long P) +{ + __m256i PP = _mm256_set1_epi64x(P); + __m256i flip, outflank, eraser, rmask, lmask; + __m128i flip2; + + rmask = rmask_v4[pos].v4; + // isolate player MS1B by clearing lower shadow bits + outflank = _mm256_and_si256(PP, rmask); + eraser = _mm256_srlv_epi64(outflank, _mm256_set_epi64x(7, 9, 8, 1)); + // eraser = player's shadow + eraser = _mm256_or_si256(eraser, outflank); + eraser = _mm256_or_si256(eraser, _mm256_srlv_epi64(eraser, _mm256_set_epi64x(14, 18, 16, 2))); + flip = _mm256_andnot_si256(eraser, rmask); + flip = _mm256_andnot_si256(_mm256_srlv_epi64(eraser, _mm256_set_epi64x(28, 36, 32, 4)), flip); + // clear if no player bit, i.e. 
all opponent + flip = _mm256_andnot_si256(_mm256_cmpeq_epi64(flip, rmask), flip); + + lmask = lmask_v4[pos].v4; + // look for player LS1B + outflank = _mm256_and_si256(PP, lmask); + outflank = _mm256_and_si256(outflank, _mm256_sub_epi64(_mm256_setzero_si256(), outflank)); // LS1B + // set all bits if outflank = 0, otherwise higher bits than outflank + eraser = _mm256_sub_epi64(_mm256_cmpeq_epi64(outflank, _mm256_setzero_si256()), outflank); + flip = _mm256_or_si256(flip, _mm256_andnot_si256(eraser, lmask)); + + flip2 = _mm_or_si128(_mm256_castsi256_si128(flip), _mm256_extracti128_si256(flip, 1)); + flip2 = _mm_or_si128(flip2, _mm_shuffle_epi32(flip2, 0x4e)); + return 2 * bit_count(_mm_cvtsi128_si64(flip2)); +} diff --git a/src/count_last_flip_bitscan.c b/src/count_last_flip_bitscan.c index e7a69127..6cae24ac 100644 --- a/src/count_last_flip_bitscan.c +++ b/src/count_last_flip_bitscan.c @@ -27,27 +27,14 @@ * For top to bottom flip, LS1B isolation (http://chessprogramming.wikispaces.com/ * General+Setwise+Operations) is used to get the outflank bit. 
* - * @date 1998 - 2017 + * @date 1998 - 2018 * @author Richard Delorme * @author Toshihiko Okuhara * @version 4.4 * */ -#include - /** precomputed count flip array */ -static const char COUNT_FLIP_R[128] = { - 0, 0, 2, 0, 4, 0, 2, 0, 6, 0, 2, 0, 4, 0, 2, 0, - 8, 0, 2, 0, 4, 0, 2, 0, 6, 0, 2, 0, 4, 0, 2, 0, - 10, 0, 2, 0, 4, 0, 2, 0, 6, 0, 2, 0, 4, 0, 2, 0, - 8, 0, 2, 0, 4, 0, 2, 0, 6, 0, 2, 0, 4, 0, 2, 0, - 12, 0, 2, 0, 4, 0, 2, 0, 6, 0, 2, 0, 4, 0, 2, 0, - 8, 0, 2, 0, 4, 0, 2, 0, 6, 0, 2, 0, 4, 0, 2, 0, - 10, 0, 2, 0, 4, 0, 2, 0, 6, 0, 2, 0, 4, 0, 2, 0, - 8, 0, 2, 0, 4, 0, 2, 0, 6, 0, 2, 0, 4, 0, 2, 0 -}; - static const char COUNT_FLIP_2[256] = { 0, 2, 0, 0, 0, 2, 0, 0, 0, 2, 0, 0, 0, 2, 0, 0, 2, 4, 2, 2, 2, 4, 2, 2, 0, 2, 0, 0, 0, 2, 0, 0, 4, 6, 4, 4, 4, 6, 4, 4, 0, 2, 0, 0, 0, 2, 0, 0, 2, 4, 2, 2, 2, 4, 2, 2, 0, 2, 0, 0, 0, 2, 0, 0, @@ -92,6 +79,28 @@ static const char COUNT_FLIP_5[256] = { 0, 8, 6, 6, 4, 4, 4, 4, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; +#include "bit_intrinsics.h" + +#ifdef lzcnt_u64 + +static inline int count_V_flip_reverse (unsigned long long P, int ofs) { + return (lzcnt_u64(P << ofs) & 0x38) >> 2; +} + +static inline int count_H_flip_left (unsigned long long P, int pos, int mask) { + if (pos < 8) + return (lzcnt_u32((P << (8 - pos)) & (mask << 1)) & 0x07) * 2; + else + return (lzcnt_u32((P >> (pos - 8)) & (mask << 1)) & 0x07) * 2; +} + +#else + +// with guardian bit to avoid __builtin_clz(0) // Not used +static inline int count_V_flip_reverse (unsigned long long P, int ofs) { + return ((__builtin_clzll((P << ofs) | 1) + 1) & 0x38) >> 2; +} + static const char COUNT_FLIP_L[128] = { 0, 12, 10, 10, 8, 8, 8, 8, 6, 6, 6, 6, 6, 6, 6, 6, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, @@ -99,369 +108,302 @@ static const char COUNT_FLIP_L[128] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0 }; -#ifdef __LZCNT__ -static inline int count_V_flip_reverse (unsigned long long P, int ofs) { - return ((__lzcnt64(P) - ofs) >> 2) & 0x0E; -} -#else -// with guardian bit to avoid __builtin_clz(0) -static inline int count_V_flip_reverse (unsigned long long P, int ofs) { - return ((__builtin_clzll((P << ofs) | 1) + 1) >> 2) & 0x0E; +static inline int count_H_flip_left (unsigned long long P, int pos, int mask) { + if (pos < 8) + return COUNT_FLIP_L[(P << (7 - pos)) & mask]; + else + return COUNT_FLIP_L[(P >> (pos - 7)) & mask]; } -#endif -/** - * Count last flipped discs when playing on square A1. - * - * @param P player's disc pattern. - * @return flipped disc count. - */ -static int count_last_flip_A1(const unsigned long long P) -{ - int n_flipped; - unsigned long long P_v, P_d9; +#endif - P_v = P & 0x0101010101010100ULL; - n_flipped = ((P_v & -P_v) * 0x000020406080a0c0ULL) >> 60; - n_flipped += COUNT_FLIP_R[(P >> 1) & 0x7f]; - P_d9 = (P & 0x8040201008040200ULL) >> 8; - n_flipped += ((P_d9 & -P_d9) * 0x0008080604028180ULL) >> 60; +#ifdef tzcnt_u32 - return n_flipped; +static inline int count_H_flip_right (unsigned long long P, int pos) { + if (pos >= 56) + return (tzcnt_u32(P >> (pos + 1)) & 0x07) * 2; + else if ((pos >= 24) && (pos < 32)) + return (tzcnt_u32((unsigned int) P >> (pos + 1)) & 0x07) * 2; + else + return (tzcnt_u32((P >> (pos + 1)) & (0x7f >> (pos & 0x07))) & 0x07) * 2; } -/** - * Count last flipped discs when playing on square B1. - * - * @param P player's disc pattern. - * @return flipped disc count. 
- */ -static int count_last_flip_B1(const unsigned long long P) -{ - int n_flipped; - unsigned long long P_v, P_d9; +#else - P_v = P & 0x0202020202020200ULL; - n_flipped = ((P_v & -P_v) * 0x0000102030405060ULL) >> 60; - n_flipped += COUNT_FLIP_R[(P >> 2) & 0x3f]; - P_d9 = P & 0x0080402010080400ULL; - n_flipped += ((P_d9 & -P_d9) * 0x0000040403020140ULL) >> 60; +static const char COUNT_FLIP_R[128] = { + 0, 0, 2, 0, 4, 0, 2, 0, 6, 0, 2, 0, 4, 0, 2, 0, 8, 0, 2, 0, 4, 0, 2, 0, 6, 0, 2, 0, 4, 0, 2, 0, + 10, 0, 2, 0, 4, 0, 2, 0, 6, 0, 2, 0, 4, 0, 2, 0, 8, 0, 2, 0, 4, 0, 2, 0, 6, 0, 2, 0, 4, 0, 2, 0, + 12, 0, 2, 0, 4, 0, 2, 0, 6, 0, 2, 0, 4, 0, 2, 0, 8, 0, 2, 0, 4, 0, 2, 0, 6, 0, 2, 0, 4, 0, 2, 0, + 10, 0, 2, 0, 4, 0, 2, 0, 6, 0, 2, 0, 4, 0, 2, 0, 8, 0, 2, 0, 4, 0, 2, 0, 6, 0, 2, 0, 4, 0, 2, 0 +}; - return n_flipped; +static inline int count_H_flip_right (unsigned long long P, int pos) { + if (pos >= 56) + return COUNT_FLIP_R[P >> (pos + 1)]; + else if ((pos >= 24) && (pos < 32)) + return COUNT_FLIP_R[(unsigned int) P >> (pos + 1)]; + else + return COUNT_FLIP_R[(P >> (pos + 1)) & (0x7f >> (pos & 0x07))]; } +#endif + +#ifndef lzcnt_u64 + /** - * Count last flipped discs when playing on square C1. + * Count last flipped discs when playing on square A1/A2. * * @param P player's disc pattern. * @return flipped disc count. 
*/ -static int count_last_flip_C1(const unsigned long long P) +static int count_last_flip_A1(const unsigned long long P) { int n_flipped; - unsigned long long P_v; + unsigned long long P_v, P_d9; - P_v = P & 0x0404040404040400ULL; - n_flipped = ((P_v & -P_v) * 0x0000081018202830ULL) >> 60; - n_flipped += COUNT_FLIP_2[P & 0xff]; - n_flipped += COUNT_FLIP_2[((P & 0x0000804020110A04ULL) * 0x0101010101010101ULL) >> 56]; + P_v = P & 0x0101010101010100; + n_flipped = ((P_v & -P_v) * 0x000020406080a0c0) >> 60; + n_flipped += count_H_flip_right(P, 0); + P_d9 = P & 0x8040201008040200; + n_flipped += (((P_d9 & -P_d9) >> 1) * 0x000010100c080503) >> 60; return n_flipped; } -/** - * Count last flipped discs when playing on square D1. - * - * @param P player's disc pattern. - * @return flipped disc count. - */ -static int count_last_flip_D1(const unsigned long long P) -{ - int n_flipped; - unsigned long long P_v; +static int count_last_flip_A2(const unsigned long long P) { + return count_last_flip_A1(P >> 8); +} - P_v = P & 0x0808080808080800ULL; - n_flipped = ((P_v & -P_v) * 0x000004080c101418ULL) >> 60; - n_flipped += COUNT_FLIP_3[P & 0xff]; - n_flipped += COUNT_FLIP_3[((P & 0x0000008041221408ULL) * 0x0101010101010101ULL) >> 56]; +static int count_last_flip_A8(const unsigned long long P) { + return count_last_flip_A1(vertical_mirror(P)); +} - return n_flipped; +static int count_last_flip_A7(const unsigned long long P) { + return count_last_flip_A1(vertical_mirror(P) >> 8); } /** - * Count last flipped discs when playing on square E1. + * Count last flipped discs when playing on square B1/B2. * * @param P player's disc pattern. * @return flipped disc count. 
*/ -static int count_last_flip_E1(const unsigned long long P) +static int count_last_flip_B1(const unsigned long long P) { int n_flipped; - unsigned long long P_v; + unsigned long long P_v, P_d9; - P_v = P & 0x1010101010101000ULL; - n_flipped = ((P_v & -P_v) * 0x0000020406080a0cULL) >> 60; - n_flipped += COUNT_FLIP_4[P & 0xff]; - n_flipped += COUNT_FLIP_4[((P & 0x0000000182442810ULL) * 0x0101010101010101ULL) >> 56]; + P_v = P & 0x0202020202020200; + n_flipped = ((P_v & -P_v) * 0x0000102030405060) >> 60; + n_flipped += count_H_flip_right(P, 1); + P_d9 = P & 0x0080402010080400; + n_flipped += ((P_d9 & -P_d9) * 0x0000040403020140) >> 60; return n_flipped; } -/** - * Count last flipped discs when playing on square F1. - * - * @param P player's disc pattern. - * @return flipped disc count. - */ -static int count_last_flip_F1(const unsigned long long P) -{ - int n_flipped; - unsigned long long P_v; +static int count_last_flip_B2(const unsigned long long P) { + return count_last_flip_B1(P >> 8); +} - P_v = P & 0x2020202020202000ULL; - n_flipped = ((P_v & -P_v) * 0x0000010203040506ULL) >> 60; - n_flipped += COUNT_FLIP_5[P & 0xff]; - n_flipped += COUNT_FLIP_5[((P & 0x0000010204885020ULL) * 0x0101010101010101ULL) >> 56]; +static int count_last_flip_B8(const unsigned long long P) { + return count_last_flip_B1(vertical_mirror(P)); +} - return n_flipped; +static int count_last_flip_B7(const unsigned long long P) { + return count_last_flip_B1(vertical_mirror(P) >> 8); } /** - * Count last flipped discs when playing on square G1. + * Count last flipped discs when playing on square C1/C2. * * @param P player's disc pattern. * @return flipped disc count. 
*/ -static int count_last_flip_G1(const unsigned long long P) +static int count_last_flip_C1(const unsigned long long P) { int n_flipped; - unsigned long long P_v, P_d7; + unsigned long long P_v; - P_v = P & 0x4040404040404000ULL; - n_flipped = ((P_v & -P_v) * 0x0000008101820283ULL) >> 60; - n_flipped += COUNT_FLIP_L[(P << 1) & 0x7e]; - P_d7 = P & 0x0001020408102000ULL; - n_flipped += ((P_d7 & -P_d7) * 0x000002081840a000ULL) >> 60; + P_v = P & 0x0404040404040400; + n_flipped = ((P_v & -P_v) * 0x0000081018202830) >> 60; + n_flipped += COUNT_FLIP_2[P & 0xff]; + n_flipped += COUNT_FLIP_2[((P & 0x0000804020110A04) * 0x0101010101010101) >> 56]; // A3C1H6 return n_flipped; } -/** - * Count last flipped discs when playing on square H1. - * - * @param P player's disc pattern. - * @return flipped disc count. - */ -static int count_last_flip_H1(const unsigned long long P) -{ - int n_flipped; - unsigned long long P_v, P_d7; +static int count_last_flip_C2(const unsigned long long P) { + return count_last_flip_C1(P >> 8); +} - P_v = (P & 0x8080808080808000ULL) >> 1; - n_flipped = ((P_v & -P_v) * 0x0000008101820283ULL) >> 60; - n_flipped += COUNT_FLIP_L[P & 0x7f]; - P_d7 = P & 0x0102040810204000ULL; - n_flipped += ((P_d7 & -P_d7) * 0x000001040c2050c0ULL) >> 60; +static int count_last_flip_C8(const unsigned long long P) { + return count_last_flip_C1(vertical_mirror(P)); +} - return n_flipped; +static int count_last_flip_C7(const unsigned long long P) { + return count_last_flip_C1(vertical_mirror(P) >> 8); } /** - * Count last flipped discs when playing on square A2. + * Count last flipped discs when playing on square D1/D2. * * @param P player's disc pattern. * @return flipped disc count. 
*/ -static int count_last_flip_A2(const unsigned long long P) +static int count_last_flip_D1(const unsigned long long P) { int n_flipped; - unsigned long long P_v, P_d9; + unsigned long long P_v; - P_v = P & 0x0101010101010000ULL; - n_flipped = ((P_v & -P_v) * 0x00000020406080a0ULL) >> 60; - n_flipped += COUNT_FLIP_R[(P >> 9) & 0x7f]; - P_d9 = (P & 0x4020100804020000ULL) >> 8; - n_flipped += ((P_d9 & -P_d9) * 0x0000080806040280ULL) >> 60; + P_v = P & 0x0808080808080800; + n_flipped = ((P_v & -P_v) * 0x000004080c101418) >> 60; + n_flipped += COUNT_FLIP_3[P & 0xff]; + n_flipped += COUNT_FLIP_3[((P & 0x0000008041221408) * 0x0101010101010101) >> 56]; // A4D1H5 return n_flipped; } -/** - * Count last flipped discs when playing on square B2. - * - * @param P player's disc pattern. - * @return flipped disc count. - */ -static int count_last_flip_B2(const unsigned long long P) -{ - int n_flipped; - unsigned long long P_v, P_d9; +static int count_last_flip_D2(const unsigned long long P) { + return count_last_flip_D1(P >> 8); +} - P_v = P & 0x0202020202020000ULL; - n_flipped = ((P_v & -P_v) * 0x0000001020304050ULL) >> 60; - n_flipped += COUNT_FLIP_R[(P >> 10) & 0x3f]; - P_d9 = (P & 0x8040201008040000ULL) >> 8; - n_flipped += ((P_d9 & -P_d9) * 0x0000040403020140ULL) >> 60; +static int count_last_flip_D8(const unsigned long long P) { + return count_last_flip_D1(vertical_mirror(P)); +} - return n_flipped; +static int count_last_flip_D7(const unsigned long long P) { + return count_last_flip_D1(vertical_mirror(P) >> 8); } /** - * Count last flipped discs when playing on square C2. + * Count last flipped discs when playing on square E1/E2. * * @param P player's disc pattern. * @return flipped disc count. 
*/ -static int count_last_flip_C2(const unsigned long long P) +static int count_last_flip_E1(const unsigned long long P) { int n_flipped; unsigned long long P_v; - P_v = P & 0x0404040404040000ULL; - n_flipped = ((P_v & -P_v) * 0x0000000810182028ULL) >> 60; - n_flipped += COUNT_FLIP_2[(P >> 8) & 0xff]; - n_flipped += COUNT_FLIP_2[((P & 0x00804020110A0400ULL) * 0x0101010101010101ULL) >> 56]; + P_v = P & 0x1010101010101000; + n_flipped = ((P_v & -P_v) * 0x0000020406080a0c) >> 60; + n_flipped += COUNT_FLIP_4[P & 0xff]; + n_flipped += COUNT_FLIP_4[((P & 0x0000000182442810) * 0x0101010101010101) >> 56]; // A5E1H4 return n_flipped; } -/** - * Count last flipped discs when playing on square D2. - * - * @param P player's disc pattern. - * @return flipped disc count. - */ -static int count_last_flip_D2(const unsigned long long P) -{ - int n_flipped; - unsigned long long P_v; +static int count_last_flip_E2(const unsigned long long P) { + return count_last_flip_E1(P >> 8); +} - P_v = P & 0x0808080808080000ULL; - n_flipped = ((P_v & -P_v) * 0x00000004080c1014ULL) >> 60; - n_flipped += COUNT_FLIP_3[(P >> 8) & 0xff]; - n_flipped += COUNT_FLIP_3[((P & 0x0000804122140800ULL) * 0x0101010101010101ULL) >> 56]; +static int count_last_flip_E8(const unsigned long long P) { + return count_last_flip_E1(vertical_mirror(P)); +} - return n_flipped; +static int count_last_flip_E7(const unsigned long long P) { + return count_last_flip_E1(vertical_mirror(P) >> 8); } /** - * Count last flipped discs when playing on square E2. + * Count last flipped discs when playing on square F1/F2. * * @param P player's disc pattern. * @return flipped disc count. 
*/ -static int count_last_flip_E2(const unsigned long long P) +static int count_last_flip_F1(const unsigned long long P) { int n_flipped; unsigned long long P_v; - P_v = P & 0x1010101010100000ULL; - n_flipped = ((P_v & -P_v) * 0x000000020406080aULL) >> 60; - n_flipped += COUNT_FLIP_4[(P >> 8) & 0xff]; - n_flipped += COUNT_FLIP_4[((P & 0x0000018244281000ULL) * 0x0101010101010101ULL) >> 56]; + P_v = P & 0x2020202020202000; + n_flipped = ((P_v & -P_v) * 0x0000010203040506) >> 60; + n_flipped += COUNT_FLIP_5[P & 0xff]; + n_flipped += COUNT_FLIP_5[((P & 0x0000010204885020) * 0x0101010101010101) >> 56]; // A6F1H3 return n_flipped; } -/** - * Count last flipped discs when playing on square F2. - * - * @param P player's disc pattern. - * @return flipped disc count. - */ -static int count_last_flip_F2(const unsigned long long P) -{ - int n_flipped; - unsigned long long P_v; +static int count_last_flip_F2(const unsigned long long P) { + return count_last_flip_F1(P >> 8); +} - P_v = P & 0x2020202020200000ULL; - n_flipped = ((P_v & -P_v) * 0x0000000102030405ULL) >> 60; - n_flipped += COUNT_FLIP_5[(P >> 8) & 0xff]; - n_flipped += COUNT_FLIP_5[((P & 0x0001020488502000ULL) * 0x0101010101010101ULL) >> 56]; +static int count_last_flip_F8(const unsigned long long P) { + return count_last_flip_F1(vertical_mirror(P)); +} - return n_flipped; +static int count_last_flip_F7(const unsigned long long P) { + return count_last_flip_F1(vertical_mirror(P) >> 8); } /** - * Count last flipped discs when playing on square G2. + * Count last flipped discs when playing on square G1/G2. * * @param P player's disc pattern. * @return flipped disc count. 
*/ -static int count_last_flip_G2(const unsigned long long P) +static int count_last_flip_G1(const unsigned long long P) { int n_flipped; unsigned long long P_v, P_d7; - P_v = (P & 0x4040404040400000ULL) >> 1; - n_flipped = ((P_v & -P_v) * 0x0000000102030405ULL) >> 60; - n_flipped += COUNT_FLIP_L[(P >> 7) & 0x7e]; - P_d7 = P & 0x0102040810200000ULL; - n_flipped += ((P_d7 & -P_d7) * 0x00000002081840a0ULL) >> 60; + P_v = P & 0x4040404040404000; + n_flipped = ((P_v & -P_v) * 0x0000008101820283) >> 60; + n_flipped += count_H_flip_left(P, 6, 0x7e); + P_d7 = P & 0x0001020408102000; + n_flipped += ((P_d7 & -P_d7) * 0x000002081840a000) >> 60; return n_flipped; } -/** - * Count last flipped discs when playing on square H2. - * - * @param P player's disc pattern. - * @return flipped disc count. - */ -static int count_last_flip_H2(const unsigned long long P) -{ - int n_flipped; - unsigned long long P_v, P_d7; +static int count_last_flip_G2(const unsigned long long P) { + return count_last_flip_G1(P >> 8); +} - P_v = (P & 0x8080808080800000ULL) >> 2; - n_flipped = ((P_v & -P_v) * 0x0000000102030405ULL) >> 60; - n_flipped += COUNT_FLIP_L[(P >> 8) & 0x7f]; - P_d7 = (P & 0x0204081020400000ULL) >> 2; - n_flipped += ((P_d7 & -P_d7) * 0x0000000410308143ULL) >> 60; +static int count_last_flip_G8(const unsigned long long P) { + return count_last_flip_G1(vertical_mirror(P)); +} - return n_flipped; +static int count_last_flip_G7(const unsigned long long P) { + return count_last_flip_G1(vertical_mirror(P) >> 8); } /** - * Count last flipped discs when playing on square A3. + * Count last flipped discs when playing on square H1/H2. * * @param P player's disc pattern. * @return flipped disc count. 
*/ -static int count_last_flip_A3(const unsigned long long P) +static int count_last_flip_H1(const unsigned long long P) { int n_flipped; - unsigned long long P_d9; + unsigned long long P_v, P_d7; - n_flipped = COUNT_FLIP_2[((P & 0x0101010101010101ULL) * 0x0102040810204080ULL) >> 56]; - n_flipped += COUNT_FLIP_R[(P >> 17) & 0x7f]; - P_d9 = P & 0x2010080402000000ULL; - n_flipped += ((P_d9 & -P_d9) * 0x0000000008080604ULL) >> 60; - n_flipped += (P >> 1) & (~P >> 8) & 2; + P_v = P & 0x8080808080808000; + n_flipped = (((P_v & -P_v) >> 1) * 0x0000008101820283) >> 60; + n_flipped += count_H_flip_left(P, 7, 0x7f); + P_d7 = P & 0x0102040810204000; + n_flipped += ((P_d7 & -P_d7) * 0x000001040c2050c0) >> 60; return n_flipped; } -/** - * Count last flipped discs when playing on square B3. - * - * @param P player's disc pattern. - * @return flipped disc count. - */ -static int count_last_flip_B3(const unsigned long long P) -{ - int n_flipped; - unsigned long long P_d9; +static int count_last_flip_H2(const unsigned long long P) { + return count_last_flip_H1(P >> 8); +} - n_flipped = COUNT_FLIP_2[((P & 0x0202020202020202ULL) * 0x0081020408102040ULL) >> 56]; - n_flipped += COUNT_FLIP_R[(P >> 18) & 0x3f]; - P_d9 = P & 0x4020100804000000ULL; - n_flipped += ((P_d9 & -P_d9) * 0x0000000004040302ULL) >> 60; - n_flipped += (P >> 2) & (~P >> 9) & 2; +static int count_last_flip_H8(const unsigned long long P) { + return count_last_flip_H1(vertical_mirror(P)); +} - return n_flipped; +static int count_last_flip_H7(const unsigned long long P) { + return count_last_flip_H1(vertical_mirror(P) >> 8); } +#endif // no lzcnt_u64 + /** * Count last flipped discs when playing on square C3. 
* @@ -472,10 +414,10 @@ static int count_last_flip_C3(const unsigned long long P) { int n_flipped; - n_flipped = COUNT_FLIP_2[((P & 0x0404040404040404ULL) * 0x0040810204081020ULL) >> 56]; + n_flipped = COUNT_FLIP_2[((P & 0x0404040404040404) * 0x0040810204081020) >> 56]; n_flipped += COUNT_FLIP_2[(P >> 16) & 0xff]; - n_flipped += COUNT_FLIP_2[((P & 0x0000000102040810ULL) * 0x0101010101010101ULL) >> 56]; - n_flipped += COUNT_FLIP_2[((P & 0x8040201008040201ULL) * 0x0101010101010101ULL) >> 56]; + n_flipped += COUNT_FLIP_2[((P & 0x0000000102040810) * 0x0101010101010101) >> 56]; + n_flipped += COUNT_FLIP_2[((P & 0x8040201008040201) * 0x0101010101010101) >> 56]; return n_flipped; } @@ -490,10 +432,10 @@ static int count_last_flip_D3(const unsigned long long P) { int n_flipped; - n_flipped = COUNT_FLIP_2[((P & 0x0808080808080808ULL) * 0x0020408102040810ULL) >> 56]; + n_flipped = COUNT_FLIP_2[((P & 0x0808080808080808) * 0x0020408102040810) >> 56]; n_flipped += COUNT_FLIP_3[(P >> 16) & 0xff]; - n_flipped += COUNT_FLIP_3[((P & 0x0000010204081020ULL) * 0x0101010101010101ULL) >> 56]; - n_flipped += COUNT_FLIP_3[((P & 0x0080402010080402ULL) * 0x0101010101010101ULL) >> 56]; + n_flipped += COUNT_FLIP_3[((P & 0x0000010204081020) * 0x0101010101010101) >> 56]; + n_flipped += COUNT_FLIP_3[((P & 0x0080402010080402) * 0x0101010101010101) >> 56]; return n_flipped; } @@ -508,10 +450,10 @@ static int count_last_flip_E3(const unsigned long long P) { int n_flipped; - n_flipped = COUNT_FLIP_2[((P & 0x1010101010101010ULL) * 0x0010204081020408ULL) >> 56]; + n_flipped = COUNT_FLIP_2[((P & 0x1010101010101010) * 0x0010204081020408) >> 56]; n_flipped += COUNT_FLIP_4[(P >> 16) & 0xff]; - n_flipped += COUNT_FLIP_4[((P & 0x0001020408102040ULL) * 0x0101010101010101ULL) >> 56]; - n_flipped += COUNT_FLIP_4[((P & 0x0000804020100804ULL) * 0x0101010101010101ULL) >> 56]; + n_flipped += COUNT_FLIP_4[((P & 0x0001020408102040) * 0x0101010101010101) >> 56]; + n_flipped += COUNT_FLIP_4[((P & 0x0000804020100804) * 
0x0101010101010101) >> 56]; return n_flipped; } @@ -526,50 +468,10 @@ static int count_last_flip_F3(const unsigned long long P) { int n_flipped; - n_flipped = COUNT_FLIP_2[((P & 0x2020202020202020ULL) * 0x0008102040810204ULL) >> 56]; + n_flipped = COUNT_FLIP_2[((P & 0x2020202020202020) * 0x0008102040810204) >> 56]; n_flipped += COUNT_FLIP_5[(P >> 16) & 0xff]; - n_flipped += COUNT_FLIP_5[((P & 0x0102040810204080ULL) * 0x0101010101010101ULL) >> 56]; - n_flipped += COUNT_FLIP_5[((P & 0x0000008040201008ULL) * 0x0101010101010101ULL) >> 56]; - - return n_flipped; -} - -/** - * Count last flipped discs when playing on square G3. - * - * @param P player's disc pattern. - * @return flipped disc count. - */ -static int count_last_flip_G3(const unsigned long long P) -{ - int n_flipped; - unsigned long long P_d7; - - n_flipped = COUNT_FLIP_2[((P & 0x4040404040404040ULL) * 0x0004081020408102ULL) >> 56]; - n_flipped += COUNT_FLIP_L[(P >> 15) & 0x7e]; - P_d7 = P & 0x0204081020000000ULL; - n_flipped += ((P_d7 & -P_d7) * 0x0000000002081840ULL) >> 60; - n_flipped += (P >> 3) & (~P >> 12) & 2; - - return n_flipped; -} - -/** - * Count last flipped discs when playing on square H3. - * - * @param P player's disc pattern. - * @return flipped disc count. 
- */ -static int count_last_flip_H3(const unsigned long long P) -{ - int n_flipped; - unsigned long long P_d7; - - n_flipped = COUNT_FLIP_2[((P & 0x8080808080808080ULL) * 0x0002040810204081ULL) >> 56]; - n_flipped += COUNT_FLIP_L[(P >> 16) & 0x7f]; - P_d7 = P & 0x0408102040000000ULL; - n_flipped += ((P_d7 & -P_d7) * 0x0000000001040c20ULL) >> 60; - n_flipped += (P >> 4) & (~P >> 13) & 2; + n_flipped += COUNT_FLIP_5[((P & 0x0102040810204080) * 0x0101010101010101) >> 56]; + n_flipped += COUNT_FLIP_5[((P & 0x0000008040201008) * 0x0101010101010101) >> 56]; return n_flipped; } @@ -584,9 +486,9 @@ static int count_last_flip_A4(const unsigned long long P) { int n_flipped; - n_flipped = COUNT_FLIP_3[((P & 0x1008040201010101ULL) * 0x0102040808080808ULL) >> 56]; // A1A4E8 - n_flipped += COUNT_FLIP_R[(P >> 25) & 0x7f]; - n_flipped += COUNT_FLIP_4[((P & 0x0101010101020408ULL) * 0x1010101008040201ULL) >> 56]; // D1A4A8 + n_flipped = COUNT_FLIP_3[((P & 0x1008040201010101) * 0x0102040808080808) >> 56]; // A1A4E8 + n_flipped += count_H_flip_right(P, 24); + n_flipped += COUNT_FLIP_4[((P & 0x0101010101020408) * 0x1010101008040201) >> 56]; // D1A4A8 return n_flipped; } @@ -601,9 +503,9 @@ static int count_last_flip_B4(const unsigned long long P) { int n_flipped; - n_flipped = COUNT_FLIP_3[((P & 0x2010080402020202ULL) * 0x0081020404040404ULL) >> 56]; // B1B4F8 - n_flipped += COUNT_FLIP_R[(P >> 26) & 0x3f]; - n_flipped += COUNT_FLIP_4[(((P & 0x0202020202040810ULL) >> 1) * 0x1010101008040201ULL) >> 56]; // E1B4B8 + n_flipped = COUNT_FLIP_3[((P & 0x2010080402020202) * 0x0081020404040404) >> 56]; // B1B4F8 + n_flipped += count_H_flip_right(P, 25); + n_flipped += COUNT_FLIP_4[(((P & 0x0202020202040810) >> 1) * 0x1010101008040201) >> 56]; // E1B4B8 return n_flipped; } @@ -618,10 +520,10 @@ static int count_last_flip_C4(const unsigned long long P) { int n_flipped; - n_flipped = COUNT_FLIP_3[((P & 0x0404040404040404ULL) * 0x0040810204081020ULL) >> 56]; + n_flipped = COUNT_FLIP_3[((P & 
0x0404040404040404) * 0x0040810204081020) >> 56]; n_flipped += COUNT_FLIP_2[(P >> 24) & 0xff]; - n_flipped += COUNT_FLIP_2[((P & 0x0000010204081020ULL) * 0x0101010101010101ULL) >> 56]; - n_flipped += COUNT_FLIP_2[((P & 0x4020100804020100ULL) * 0x0101010101010101ULL) >> 56]; + n_flipped += COUNT_FLIP_2[((P & 0x0000010204081020) * 0x0101010101010101) >> 56]; + n_flipped += COUNT_FLIP_2[((P & 0x4020100804020100) * 0x0101010101010101) >> 56]; return n_flipped; } @@ -636,10 +538,10 @@ static int count_last_flip_D4(const unsigned long long P) { int n_flipped; - n_flipped = COUNT_FLIP_3[((P & 0x0808080808080808ULL) * 0x0020408102040810ULL) >> 56]; + n_flipped = COUNT_FLIP_3[((P & 0x0808080808080808) * 0x0020408102040810) >> 56]; n_flipped += COUNT_FLIP_3[(P >> 24) & 0xff]; - n_flipped += COUNT_FLIP_3[((P & 0x0001020408102040ULL) * 0x0101010101010101ULL) >> 56]; - n_flipped += COUNT_FLIP_3[((P & 0x8040201008040201ULL) * 0x0101010101010101ULL) >> 56]; + n_flipped += COUNT_FLIP_3[((P & 0x0001020408102040) * 0x0101010101010101) >> 56]; + n_flipped += COUNT_FLIP_3[((P & 0x8040201008040201) * 0x0101010101010101) >> 56]; return n_flipped; } @@ -654,10 +556,10 @@ static int count_last_flip_E4(const unsigned long long P) { int n_flipped; - n_flipped = COUNT_FLIP_3[((P & 0x1010101010101010ULL) * 0x0010204081020408ULL) >> 56]; + n_flipped = COUNT_FLIP_3[((P & 0x1010101010101010) * 0x0010204081020408) >> 56]; n_flipped += COUNT_FLIP_4[(P >> 24) & 0xff]; - n_flipped += COUNT_FLIP_4[((P & 0x0102040810204080ULL) * 0x0101010101010101ULL) >> 56]; - n_flipped += COUNT_FLIP_4[((P & 0x0080402010080402ULL) * 0x0101010101010101ULL) >> 56]; + n_flipped += COUNT_FLIP_4[((P & 0x0102040810204080) * 0x0101010101010101) >> 56]; + n_flipped += COUNT_FLIP_4[((P & 0x0080402010080402) * 0x0101010101010101) >> 56]; return n_flipped; } @@ -672,10 +574,10 @@ static int count_last_flip_F4(const unsigned long long P) { int n_flipped; - n_flipped = COUNT_FLIP_3[((P & 0x2020202020202020ULL) * 
0x0008102040810204ULL) >> 56]; + n_flipped = COUNT_FLIP_3[((P & 0x2020202020202020) * 0x0008102040810204) >> 56]; n_flipped += COUNT_FLIP_5[(P >> 24) & 0xff]; - n_flipped += COUNT_FLIP_5[((P & 0x0204081020408000ULL) * 0x0101010101010101ULL) >> 56]; - n_flipped += COUNT_FLIP_5[((P & 0x0000804020100804ULL) * 0x0101010101010101ULL) >> 56]; + n_flipped += COUNT_FLIP_5[((P & 0x0204081020408000) * 0x0101010101010101) >> 56]; + n_flipped += COUNT_FLIP_5[((P & 0x0000804020100804) * 0x0101010101010101) >> 56]; return n_flipped; } @@ -690,9 +592,9 @@ static int count_last_flip_G4(const unsigned long long P) { int n_flipped; - n_flipped = COUNT_FLIP_3[((P & 0x4040404040201008ULL) * 0x0020202020408102ULL) >> 56]; // D1G4G8 - n_flipped += COUNT_FLIP_L[(P >> 23) & 0x7e]; - n_flipped += COUNT_FLIP_4[(((P & 0x0408102040404040ULL) >> 2) * 0x0804020101010101ULL) >> 56]; // G1G4C8 + n_flipped = COUNT_FLIP_3[((P & 0x4040404040201008) * 0x0020202020408102) >> 56]; // D1G4G8 + n_flipped += count_H_flip_left(P, 30, 0x7e); + n_flipped += COUNT_FLIP_4[(((P & 0x0408102040404040) >> 2) * 0x0804020101010101) >> 56]; // G1G4C8 return n_flipped; } @@ -707,9 +609,9 @@ static int count_last_flip_H4(const unsigned long long P) { int n_flipped; - n_flipped = COUNT_FLIP_3[((P & 0x8080808080402010ULL) * 0x0010101010204081ULL) >> 56]; // E1H4H8 - n_flipped += COUNT_FLIP_L[(P >> 24) & 0x7f]; - n_flipped += COUNT_FLIP_4[(((P & 0x0810204080808080ULL) >> 3) * 0x0804020101010101ULL) >> 56]; // H1H4D8 + n_flipped = COUNT_FLIP_3[((P & 0x8080808080402010) * 0x0010101010204081) >> 56]; // E1H4H8 + n_flipped += count_H_flip_left(P, 31, 0x7f); + n_flipped += COUNT_FLIP_4[(((P & 0x0810204080808080) >> 3) * 0x0804020101010101) >> 56]; // H1H4D8 return n_flipped; } @@ -720,15 +622,8 @@ static int count_last_flip_H4(const unsigned long long P) * @param P player's disc pattern. * @return flipped disc count. 
*/ -static int count_last_flip_A5(const unsigned long long P) -{ - int n_flipped; - - n_flipped = COUNT_FLIP_4[((P & 0x0804020101010101ULL) * 0x0102040810101010ULL) >> 56]; // A1A5D8 - n_flipped += COUNT_FLIP_R[(P >> 33) & 0x7f]; - n_flipped += COUNT_FLIP_3[((P & 0x0101010102040810ULL) * 0x0808080808040201ULL) >> 56]; // E1A5A8 - - return n_flipped; +static int count_last_flip_A5(const unsigned long long P) { + return count_last_flip_A4(vertical_mirror(P)); } /** @@ -737,15 +632,8 @@ static int count_last_flip_A5(const unsigned long long P) * @param P player's disc pattern. * @return flipped disc count. */ -static int count_last_flip_B5(const unsigned long long P) -{ - int n_flipped; - - n_flipped = COUNT_FLIP_4[((P & 0x1008040202020202ULL) * 0x0081020408080808ULL) >> 56]; // B1B5E8 - n_flipped += COUNT_FLIP_R[(P >> 34) & 0x3f]; - n_flipped += COUNT_FLIP_3[(((P & 0x0202020204081020ULL) >> 1) * 0x0808080808040201ULL) >> 56]; // F1B5B8 - - return n_flipped; +static int count_last_flip_B5(const unsigned long long P) { + return count_last_flip_B4(vertical_mirror(P)); } /** @@ -754,16 +642,8 @@ static int count_last_flip_B5(const unsigned long long P) * @param P player's disc pattern. * @return flipped disc count. */ -static int count_last_flip_C5(const unsigned long long P) -{ - int n_flipped; - - n_flipped = COUNT_FLIP_4[((P & 0x0404040404040404ULL) * 0x0040810204081020ULL) >> 56]; - n_flipped += COUNT_FLIP_2[(P >> 32) & 0xff]; - n_flipped += COUNT_FLIP_2[((P & 0x0001020408102040ULL) * 0x0101010101010101ULL) >> 56]; - n_flipped += COUNT_FLIP_2[((P & 0x2010080402010000ULL) * 0x0101010101010101ULL) >> 56]; - - return n_flipped; +static int count_last_flip_C5(const unsigned long long P) { + return count_last_flip_C4(vertical_mirror(P)); } /** @@ -772,16 +652,8 @@ static int count_last_flip_C5(const unsigned long long P) * @param P player's disc pattern. * @return flipped disc count. 
*/ -static int count_last_flip_D5(const unsigned long long P) -{ - int n_flipped; - - n_flipped = COUNT_FLIP_4[((P & 0x0808080808080808ULL) * 0x0020408102040810ULL) >> 56]; - n_flipped += COUNT_FLIP_3[(P >> 32) & 0xff]; - n_flipped += COUNT_FLIP_3[((P & 0x0102040810204080ULL) * 0x0101010101010101ULL) >> 56]; - n_flipped += COUNT_FLIP_3[((P & 0x4020100804020100ULL) * 0x0101010101010101ULL) >> 56]; - - return n_flipped; +static int count_last_flip_D5(const unsigned long long P) { + return count_last_flip_D4(vertical_mirror(P)); } /** @@ -790,16 +662,8 @@ static int count_last_flip_D5(const unsigned long long P) * @param P player's disc pattern. * @return flipped disc count. */ -static int count_last_flip_E5(const unsigned long long P) -{ - int n_flipped; - - n_flipped = COUNT_FLIP_4[((P & 0x1010101010101010ULL) * 0x0010204081020408ULL) >> 56]; - n_flipped += COUNT_FLIP_4[(P >> 32) & 0xff]; - n_flipped += COUNT_FLIP_4[((P & 0x0204081020408000ULL) * 0x0101010101010101ULL) >> 56]; - n_flipped += COUNT_FLIP_4[((P & 0x8040201008040201ULL) * 0x0101010101010101ULL) >> 56]; - - return n_flipped; +static int count_last_flip_E5(const unsigned long long P) { + return count_last_flip_E4(vertical_mirror(P)); } /** @@ -808,16 +672,8 @@ static int count_last_flip_E5(const unsigned long long P) * @param P player's disc pattern. * @return flipped disc count. 
*/ -static int count_last_flip_F5(const unsigned long long P) -{ - int n_flipped; - - n_flipped = COUNT_FLIP_4[((P & 0x2020202020202020ULL) * 0x0008102040810204ULL) >> 56]; - n_flipped += COUNT_FLIP_5[(P >> 32) & 0xff]; - n_flipped += COUNT_FLIP_5[((P & 0x0408102040800000ULL) * 0x0101010101010101ULL) >> 56]; - n_flipped += COUNT_FLIP_5[((P & 0x0080402010080402ULL) * 0x0101010101010101ULL) >> 56]; - - return n_flipped; +static int count_last_flip_F5(const unsigned long long P) { + return count_last_flip_F4(vertical_mirror(P)); } /** @@ -826,15 +682,8 @@ static int count_last_flip_F5(const unsigned long long P) * @param P player's disc pattern. * @return flipped disc count. */ -static int count_last_flip_G5(const unsigned long long P) -{ - int n_flipped; - - n_flipped = COUNT_FLIP_4[((P & 0x4040404020100804ULL) * 0x0040404040408102ULL) >> 56]; // C1G5G8 - n_flipped += COUNT_FLIP_L[(P >> 31) & 0x7e]; - n_flipped += COUNT_FLIP_3[(((P & 0x0810204040404040ULL) >> 3) * 0x1008040201010101ULL) >> 56]; // G1G5D8 - - return n_flipped; +static int count_last_flip_G5(const unsigned long long P) { + return count_last_flip_G4(vertical_mirror(P)); } /** @@ -843,19 +692,12 @@ static int count_last_flip_G5(const unsigned long long P) * @param P player's disc pattern. * @return flipped disc count. */ -static int count_last_flip_H5(const unsigned long long P) -{ - int n_flipped; - - n_flipped = COUNT_FLIP_4[((P & 0x8080808040201008ULL) * 0x0020202020204081ULL) >> 56]; // D1H5H8 - n_flipped += COUNT_FLIP_L[(P >> 32) & 0x7f]; - n_flipped += COUNT_FLIP_3[(((P & 0x1020408080808080ULL) >> 4) * 0x1008040201010101ULL) >> 56]; // H1H5E8 - - return n_flipped; +static int count_last_flip_H5(const unsigned long long P) { + return count_last_flip_H4(vertical_mirror(P)); } /** - * Count last flipped discs when playing on square A6. + * Count last flipped discs when playing on square A3/A6. * * @param P player's disc pattern. * @return flipped disc count. 
@@ -864,15 +706,25 @@ static int count_last_flip_A6(const unsigned long long P) { int n_flipped; - n_flipped = COUNT_FLIP_5[((P & 0x0402010101010101ULL) * 0x0102040810202020ULL) >> 56]; // A1A6C8 - n_flipped += COUNT_FLIP_R[(P >> 41) & 0x7f]; - n_flipped += COUNT_FLIP_2[((P & 0x0101010204081020ULL) * 0x0404040404040201ULL) >> 56]; // F1A6A8 +#ifdef __ARM_FEATURE_CLZ // shorter on arm + n_flipped = count_V_flip_reverse((P & 0x0000000101010101), 31); + n_flipped += count_V_flip_reverse((P & 0x0000000204081020), 24); + n_flipped += (((P >> 56) & ~(P >> 48) & 1) + ((P >> 58) & ~(P >> 49) & 1)) * 2; +#else + n_flipped = COUNT_FLIP_5[((P & 0x0402010101010101) * 0x0102040810202020) >> 56]; // A1A6C8 + n_flipped += COUNT_FLIP_2[((P & 0x0101010204081020) * 0x0404040404040201) >> 56]; // F1A6A8 +#endif + n_flipped += count_H_flip_right(P, 40); return n_flipped; } +static int count_last_flip_A3(const unsigned long long P) { + return count_last_flip_A6(vertical_mirror(P)); +} + /** - * Count last flipped discs when playing on square B6. + * Count last flipped discs when playing on square B3/B6. * * @param P player's disc pattern. * @return flipped disc count. 
@@ -881,29 +733,31 @@ static int count_last_flip_B6(const unsigned long long P) { int n_flipped; - n_flipped = COUNT_FLIP_5[((P & 0x0804020202020202ULL) * 0x0081020408101010ULL) >> 56]; // B1B6D8 - n_flipped += COUNT_FLIP_R[(P >> 42) & 0x3f]; - n_flipped += COUNT_FLIP_2[(((P & 0x0202020408102040ULL) >> 1) * 0x0404040404040201ULL) >> 56]; // G1B6B8 +#ifdef __ARM_FEATURE_CLZ + n_flipped = count_V_flip_reverse((P & 0x0000000202020202), 30); + n_flipped += count_V_flip_reverse((P & 0x0000000408102040), 23); + n_flipped += (((P >> 57) & ~(P >> 49) & 1) + ((P >> 59) & ~(P >> 50) & 1)) * 2; +#else + n_flipped = COUNT_FLIP_5[((P & 0x0804020202020202) * 0x0081020408101010) >> 56]; // B1B6D8 + n_flipped += COUNT_FLIP_2[(((P & 0x0202020408102040) >> 1) * 0x0404040404040201) >> 56]; // G1B6B8 +#endif + n_flipped += count_H_flip_right(P, 41); return n_flipped; } +static int count_last_flip_B3(const unsigned long long P) { + return count_last_flip_B6(vertical_mirror(P)); +} + /** * Count last flipped discs when playing on square C6. * * @param P player's disc pattern. * @return flipped disc count. */ -static int count_last_flip_C6(const unsigned long long P) -{ - int n_flipped; - - n_flipped = COUNT_FLIP_5[((P & 0x0404040404040404ULL) * 0x0040810204081020ULL) >> 56]; - n_flipped += COUNT_FLIP_2[(P >> 40) & 0xff]; - n_flipped += COUNT_FLIP_2[((P & 0x0102040810204080ULL) * 0x0101010101010101ULL) >> 56]; - n_flipped += COUNT_FLIP_2[((P & 0x1008040201000000ULL) * 0x0101010101010101ULL) >> 56]; - - return n_flipped; +static int count_last_flip_C6(const unsigned long long P) { + return count_last_flip_C3(vertical_mirror(P)); } /** @@ -912,16 +766,8 @@ static int count_last_flip_C6(const unsigned long long P) * @param P player's disc pattern. * @return flipped disc count. 
*/ -static int count_last_flip_D6(const unsigned long long P) -{ - int n_flipped; - - n_flipped = COUNT_FLIP_5[((P & 0x0808080808080808ULL) * 0x0020408102040810ULL) >> 56]; - n_flipped += COUNT_FLIP_3[(P >> 40) & 0xff]; - n_flipped += COUNT_FLIP_3[((P & 0x0204081020408000ULL) * 0x0101010101010101ULL) >> 56]; - n_flipped += COUNT_FLIP_3[((P & 0x2010080402010000ULL) * 0x0101010101010101ULL) >> 56]; - - return n_flipped; +static int count_last_flip_D6(const unsigned long long P) { + return count_last_flip_D3(vertical_mirror(P)); } /** @@ -930,16 +776,8 @@ static int count_last_flip_D6(const unsigned long long P) * @param P player's disc pattern. * @return flipped disc count. */ -static int count_last_flip_E6(const unsigned long long P) -{ - int n_flipped; - - n_flipped = COUNT_FLIP_5[((P & 0x1010101010101010ULL) * 0x0010204081020408ULL) >> 56]; - n_flipped += COUNT_FLIP_4[(P >> 40) & 0xff]; - n_flipped += COUNT_FLIP_4[((P & 0x0408102040800000ULL) * 0x0101010101010101ULL) >> 56]; - n_flipped += COUNT_FLIP_4[((P & 0x4020100804020100ULL) * 0x0101010101010101ULL) >> 56]; - - return n_flipped; +static int count_last_flip_E6(const unsigned long long P) { + return count_last_flip_E3(vertical_mirror(P)); } /** @@ -948,20 +786,12 @@ static int count_last_flip_E6(const unsigned long long P) * @param P player's disc pattern. * @return flipped disc count. */ -static int count_last_flip_F6(const unsigned long long P) -{ - int n_flipped; - - n_flipped = COUNT_FLIP_5[((P & 0x2020202020202020ULL) * 0x0008102040810204ULL) >> 56]; - n_flipped += COUNT_FLIP_5[(P >> 40) & 0xff]; - n_flipped += COUNT_FLIP_5[((P & 0x0810204080000000ULL) * 0x0101010101010101ULL) >> 56]; - n_flipped += COUNT_FLIP_5[((P & 0x8040201008040201ULL) * 0x0101010101010101ULL) >> 56]; - - return n_flipped; +static int count_last_flip_F6(const unsigned long long P) { + return count_last_flip_F3(vertical_mirror(P)); } /** - * Count last flipped discs when playing on square G6. 
+ * Count last flipped discs when playing on square G3/G6. * * @param P player's disc pattern. * @return flipped disc count. @@ -970,15 +800,25 @@ static int count_last_flip_G6(const unsigned long long P) { int n_flipped; - n_flipped = COUNT_FLIP_5[((P & 0x4040402010080402ULL) * 0x0080808080808102ULL) >> 56]; // B1G6G8 - n_flipped += COUNT_FLIP_L[(P >> 39) & 0x7e]; - n_flipped += COUNT_FLIP_2[(((P & 0x1020404040404040ULL) >> 4) * 0x2010080402010101ULL) >> 56]; // G1G6E8 +#ifdef __ARM_FEATURE_CLZ + n_flipped = count_V_flip_reverse((P & 0x0000004040404040), 23); + n_flipped += count_V_flip_reverse((P & 0x0000002010080402), 24); + n_flipped += (((P >> 62) & ~(P >> 54) & 1) + ((P >> 60) & ~(P >> 53) & 1)) * 2; +#else + n_flipped = COUNT_FLIP_5[((P & 0x4040402010080402) * 0x0080808080808102) >> 56]; // B1G6G8 + n_flipped += COUNT_FLIP_2[(((P & 0x1020404040404040) >> 4) * 0x2010080402010101) >> 56]; // G1G6E8 +#endif + n_flipped += count_H_flip_left(P, 46, 0x7e); return n_flipped; } +static int count_last_flip_G3(const unsigned long long P) { + return count_last_flip_G6(vertical_mirror(P)); +} + /** - * Count last flipped discs when playing on square H6. + * Count last flipped discs when playing on square H3/H6. * * @param P player's disc pattern. * @return flipped disc count. 
@@ -987,285 +827,259 @@ static int count_last_flip_H6(const unsigned long long P) { int n_flipped; - n_flipped = COUNT_FLIP_5[((P & 0x8080804020100804ULL) * 0x0040404040404081ULL) >> 56]; // C1H6H8 - n_flipped += COUNT_FLIP_L[(P >> 40) & 0x7f]; - n_flipped += COUNT_FLIP_2[(((P & 0x2040808080808080ULL) >> 5) * 0x2010080402010101ULL) >> 56]; // H1H6F8 +#ifdef __ARM_FEATURE_CLZ + n_flipped = count_V_flip_reverse((P & 0x0000008080808080), 24); + n_flipped += count_V_flip_reverse((P & 0x0000004020100804), 25); + n_flipped += (((P >> 63) & ~(P >> 55) & 1) + ((P >> 61) & ~(P >> 54) & 1)) * 2; +#else + n_flipped = COUNT_FLIP_5[((P & 0x8080804020100804) * 0x0040404040404081) >> 56]; // C1H6H8 + n_flipped += COUNT_FLIP_2[(((P & 0x2040808080808080) >> 5) * 0x2010080402010101) >> 56]; // H1H6F8 +#endif + n_flipped += count_H_flip_left(P, 47, 0x7f); return n_flipped; } +static int count_last_flip_H3(const unsigned long long P) { + return count_last_flip_H6(vertical_mirror(P)); +} + +#ifdef lzcnt_u64 + /** - * Count last flipped discs when playing on square A7. + * Count last flipped discs when playing on square A7/A8. * * @param P player's disc pattern. * @return flipped disc count. */ -static int count_last_flip_A7(const unsigned long long P) +static int count_last_flip_A8(const unsigned long long P) { int n_flipped; - n_flipped = count_V_flip_reverse((P & 0x0000010101010101ULL), 23); - n_flipped += COUNT_FLIP_R[(P >> 49) & 0x7f]; - n_flipped += count_V_flip_reverse((P & 0x0000020408102040ULL), 16); + n_flipped = count_V_flip_reverse((P & 0x0101010101010101), 15); + n_flipped += count_H_flip_right(P, 56); + n_flipped += count_V_flip_reverse((P & 0x0002040810204080), 8); return n_flipped; } -/** - * Count last flipped discs when playing on square B7. - * - * @param P player's disc pattern. - * @return flipped disc count. 
- */ -static int count_last_flip_B7(const unsigned long long P) -{ - int n_flipped; +static int count_last_flip_A7(const unsigned long long P) { + return count_last_flip_A8(P << 8); +} - n_flipped = count_V_flip_reverse((P & 0x0000020202020202ULL), 22); - n_flipped += COUNT_FLIP_R[(P >> 50) & 0x3f]; - n_flipped += count_V_flip_reverse((P & 0x0000040810204080ULL), 15); +static int count_last_flip_A1(const unsigned long long P) { + return count_last_flip_A8(vertical_mirror(P)); +} - return n_flipped; +static int count_last_flip_A2(const unsigned long long P) { + return count_last_flip_A8(vertical_mirror(P) << 8); } /** - * Count last flipped discs when playing on square C7. + * Count last flipped discs when playing on square B7/B8. * * @param P player's disc pattern. * @return flipped disc count. */ -static int count_last_flip_C7(const unsigned long long P) +static int count_last_flip_B8(const unsigned long long P) { int n_flipped; - n_flipped = count_V_flip_reverse((P & 0x0000040404040404ULL), 21); - n_flipped += COUNT_FLIP_2[(P >> 48) & 0xff]; - n_flipped += COUNT_FLIP_2[((P & 0x00040A1120408000ULL) * 0x0101010101010101ULL) >> 56]; + n_flipped = count_V_flip_reverse((P & 0x0202020202020202), 14); + n_flipped += count_H_flip_right(P, 57); + n_flipped += count_V_flip_reverse((P & 0x0004081020408000), 7); return n_flipped; } -/** - * Count last flipped discs when playing on square D7. - * - * @param P player's disc pattern. - * @return flipped disc count. 
- */ -static int count_last_flip_D7(const unsigned long long P) -{ - int n_flipped; +static int count_last_flip_B7(const unsigned long long P) { + return count_last_flip_B8(P << 8); +} - n_flipped = count_V_flip_reverse((P & 0x0000080808080808ULL), 20); - n_flipped += COUNT_FLIP_3[(P >> 48) & 0xff]; - n_flipped += COUNT_FLIP_3[((P & 0x0008142241800000ULL) * 0x0101010101010101ULL) >> 56]; +static int count_last_flip_B1(const unsigned long long P) { + return count_last_flip_B8(vertical_mirror(P)); +} - return n_flipped; +static int count_last_flip_B2(const unsigned long long P) { + return count_last_flip_B8(vertical_mirror(P) << 8); } /** - * Count last flipped discs when playing on square E7. + * Count last flipped discs when playing on square C7/C8. * * @param P player's disc pattern. * @return flipped disc count. */ -static int count_last_flip_E7(const unsigned long long P) +static int count_last_flip_C8(const unsigned long long P) { int n_flipped; - n_flipped = count_V_flip_reverse((P & 0x0000101010101010ULL), 19); - n_flipped += COUNT_FLIP_4[(P >> 48) & 0xff]; - n_flipped += COUNT_FLIP_4[((P & 0x0010284482010000ULL) * 0x0101010101010101ULL) >> 56]; + n_flipped = count_V_flip_reverse((P & 0x0404040404040404), 13); + n_flipped += COUNT_FLIP_2[P >> 56]; + n_flipped += COUNT_FLIP_2[((P & 0x040A112040800000) * 0x0101010101010101) >> 56]; // A6C8H3 return n_flipped; } -/** - * Count last flipped discs when playing on square F7. - * - * @param P player's disc pattern. - * @return flipped disc count. 
- */ -static int count_last_flip_F7(const unsigned long long P) -{ - int n_flipped; +static int count_last_flip_C7(const unsigned long long P) { + return count_last_flip_C8(P << 8); +} - n_flipped = count_V_flip_reverse((P & 0x0000202020202020ULL), 18); - n_flipped += COUNT_FLIP_5[(P >> 48) & 0xff]; - n_flipped += COUNT_FLIP_5[((P & 0x0020508804020100ULL) * 0x0101010101010101ULL) >> 56]; +static int count_last_flip_C1(const unsigned long long P) { + return count_last_flip_C8(vertical_mirror(P)); +} - return n_flipped; +static int count_last_flip_C2(const unsigned long long P) { + return count_last_flip_C8(vertical_mirror(P) << 8); } /** - * Count last flipped discs when playing on square G7. + * Count last flipped discs when playing on square D7/D8. * * @param P player's disc pattern. * @return flipped disc count. */ -static int count_last_flip_G7(const unsigned long long P) +static int count_last_flip_D8(const unsigned long long P) { int n_flipped; - n_flipped = count_V_flip_reverse((P & 0x0000404040404040ULL), 17); - n_flipped += COUNT_FLIP_L[(P >> 47) & 0x7e]; - n_flipped += count_V_flip_reverse((P & 0x0000201008040201ULL), 18); + n_flipped = count_V_flip_reverse((P & 0x0808080808080808), 12); + n_flipped += COUNT_FLIP_3[P >> 56]; + n_flipped += COUNT_FLIP_3[((P & 0x0814224180000000) * 0x0101010101010101) >> 56]; // A5D8H4 return n_flipped; } -/** - * Count last flipped discs when playing on square H7. - * - * @param P player's disc pattern. - * @return flipped disc count. 
- */ -static int count_last_flip_H7(const unsigned long long P) -{ - int n_flipped; +static int count_last_flip_D7(const unsigned long long P) { + return count_last_flip_D8(P << 8); +} - n_flipped = count_V_flip_reverse((P & 0x0000808080808080ULL), 16); - n_flipped += COUNT_FLIP_L[(P >> 48) & 0x7f]; - n_flipped += count_V_flip_reverse((P & 0x0000402010080402ULL), 17); +static int count_last_flip_D1(const unsigned long long P) { + return count_last_flip_D8(vertical_mirror(P)); +} - return n_flipped; +static int count_last_flip_D2(const unsigned long long P) { + return count_last_flip_D8(vertical_mirror(P) << 8); } /** - * Count last flipped discs when playing on square A8. + * Count last flipped discs when playing on square E7/E8. * * @param P player's disc pattern. * @return flipped disc count. */ -static int count_last_flip_A8(const unsigned long long P) +static int count_last_flip_E8(const unsigned long long P) { int n_flipped; - n_flipped = count_V_flip_reverse((P & 0x0001010101010101ULL), 15); - n_flipped += COUNT_FLIP_R[P >> 57]; - n_flipped += count_V_flip_reverse((P & 0x0002040810204080ULL), 8); + n_flipped = count_V_flip_reverse((P & 0x1010101010101010), 11); + n_flipped += COUNT_FLIP_4[P >> 56]; + n_flipped += COUNT_FLIP_4[((P & 0x1028448201000000) * 0x0101010101010101) >> 56]; // A4E8H5 return n_flipped; } -/** - * Count last flipped discs when playing on square B8. - * - * @param P player's disc pattern. - * @return flipped disc count. 
- */ -static int count_last_flip_B8(const unsigned long long P) -{ - int n_flipped; +static int count_last_flip_E7(const unsigned long long P) { + return count_last_flip_E8(P << 8); +} - n_flipped = count_V_flip_reverse((P & 0x0002020202020202ULL), 14); - n_flipped += COUNT_FLIP_R[P >> 58]; - n_flipped += count_V_flip_reverse((P & 0x0004081020408000ULL), 7); +static int count_last_flip_E1(const unsigned long long P) { + return count_last_flip_E8(vertical_mirror(P)); +} - return n_flipped; +static int count_last_flip_E2(const unsigned long long P) { + return count_last_flip_E8(vertical_mirror(P) << 8); } /** - * Count last flipped discs when playing on square C8. + * Count last flipped discs when playing on square F7/F8. * * @param P player's disc pattern. * @return flipped disc count. */ -static int count_last_flip_C8(const unsigned long long P) +static int count_last_flip_F8(const unsigned long long P) { int n_flipped; - n_flipped = count_V_flip_reverse((P & 0x0004040404040404ULL), 13); - n_flipped += COUNT_FLIP_2[P >> 56]; - n_flipped += COUNT_FLIP_2[((P & 0x040A112040800000ULL) * 0x0101010101010101ULL) >> 56]; + n_flipped = count_V_flip_reverse((P & 0x2020202020202020), 10); + n_flipped += COUNT_FLIP_5[P >> 56]; + n_flipped += COUNT_FLIP_5[((P & 0x0050880402010000) * 0x0101010101010101) >> 56]; // A3F8H6 return n_flipped; } -/** - * Count last flipped discs when playing on square D8. - * - * @param P player's disc pattern. - * @return flipped disc count. 
- */ -static int count_last_flip_D8(const unsigned long long P) -{ - int n_flipped; +static int count_last_flip_F7(const unsigned long long P) { + return count_last_flip_F8(P << 8); +} - n_flipped = count_V_flip_reverse((P & 0x0008080808080808ULL), 12); - n_flipped += COUNT_FLIP_3[P >> 56]; - n_flipped += COUNT_FLIP_3[((P & 0x0814224180000000ULL) * 0x0101010101010101ULL) >> 56]; +static int count_last_flip_F1(const unsigned long long P) { + return count_last_flip_F8(vertical_mirror(P)); +} - return n_flipped; +static int count_last_flip_F2(const unsigned long long P) { + return count_last_flip_F8(vertical_mirror(P) << 8); } /** - * Count last flipped discs when playing on square E8. + * Count last flipped discs when playing on square G7/G8. * * @param P player's disc pattern. * @return flipped disc count. */ -static int count_last_flip_E8(const unsigned long long P) +static int count_last_flip_G8(const unsigned long long P) { int n_flipped; - n_flipped = count_V_flip_reverse((P & 0x0010101010101010ULL), 11); - n_flipped += COUNT_FLIP_4[P >> 56]; - n_flipped += COUNT_FLIP_4[((P & 0x1028448201000000ULL) * 0x0101010101010101ULL) >> 56]; + n_flipped = count_V_flip_reverse((P & 0x4040404040404040), 9); + n_flipped += count_H_flip_left(P, 62, 0x7e); + n_flipped += count_V_flip_reverse((P & 0x0020100804020100), 10); return n_flipped; } -/** - * Count last flipped discs when playing on square F8. - * - * @param P player's disc pattern. - * @return flipped disc count. 
- */ -static int count_last_flip_F8(const unsigned long long P) -{ - int n_flipped; +static int count_last_flip_G7(const unsigned long long P) { + return count_last_flip_G8(P << 8); +} - n_flipped = count_V_flip_reverse((P & 0x0020202020202020ULL), 10); - n_flipped += COUNT_FLIP_5[P >> 56]; - n_flipped += COUNT_FLIP_5[((P & 0x0050880402010000ULL) * 0x0101010101010101ULL) >> 56]; +static int count_last_flip_G1(const unsigned long long P) { + return count_last_flip_G8(vertical_mirror(P)); +} - return n_flipped; +static int count_last_flip_G2(const unsigned long long P) { + return count_last_flip_G8(vertical_mirror(P) << 8); } /** - * Count last flipped discs when playing on square G8. + * Count last flipped discs when playing on square H7/H8. * * @param P player's disc pattern. * @return flipped disc count. */ -static int count_last_flip_G8(const unsigned long long P) +static int count_last_flip_H8(const unsigned long long P) { int n_flipped; - n_flipped = count_V_flip_reverse((P & 0x0040404040404040ULL), 9); - n_flipped += COUNT_FLIP_L[(P >> 55) & 0x7e]; - n_flipped += count_V_flip_reverse((P & 0x0020100804020100ULL), 10); + n_flipped = count_V_flip_reverse((P & 0x8080808080808080), 8); + n_flipped += count_H_flip_left(P, 63, 0x7f); + n_flipped += count_V_flip_reverse((P & 0x0040201008040201), 9); return n_flipped; } -/** - * Count last flipped discs when playing on square H8. - * - * @param P player's disc pattern. - * @return flipped disc count. 
- */ -static int count_last_flip_H8(const unsigned long long P) -{ - int n_flipped; +static int count_last_flip_H7(const unsigned long long P) { + return count_last_flip_H8(P << 8); +} - n_flipped = count_V_flip_reverse((P & 0x0080808080808080ULL), 8); - n_flipped += COUNT_FLIP_L[(P >> 56) & 0x7f]; - n_flipped += count_V_flip_reverse((P & 0x0040201008040201ULL), 9); +static int count_last_flip_H1(const unsigned long long P) { + return count_last_flip_H8(vertical_mirror(P)); +} - return n_flipped; +static int count_last_flip_H2(const unsigned long long P) { + return count_last_flip_H8(vertical_mirror(P) << 8); } +#endif // lzcnt_u64 + /** * Count last flipped discs when plassing. * @@ -1279,7 +1093,7 @@ static int count_last_flip_pass(const unsigned long long P) } /** Array of functions to count flipped discs of the last move */ -int (*COUNT_LAST_FLIP[])(const unsigned long long) = { +int (*count_last_flip[])(const unsigned long long) = { count_last_flip_A1, count_last_flip_B1, count_last_flip_C1, count_last_flip_D1, count_last_flip_E1, count_last_flip_F1, count_last_flip_G1, count_last_flip_H1, count_last_flip_A2, count_last_flip_B2, count_last_flip_C2, count_last_flip_D2, @@ -1298,9 +1112,3 @@ int (*COUNT_LAST_FLIP[])(const unsigned long long) = { count_last_flip_E8, count_last_flip_F8, count_last_flip_G8, count_last_flip_H8, count_last_flip_pass, }; - -int count_last_flip(const int x, const unsigned long long P) -{ - return COUNT_LAST_FLIP[x](P); -} - diff --git a/src/count_last_flip_bmi.c b/src/count_last_flip_bmi.c new file mode 100644 index 00000000..fc80a5d8 --- /dev/null +++ b/src/count_last_flip_bmi.c @@ -0,0 +1,172 @@ +/** + * @file count_last_flip_bmi.c + * + * + * A function is provided to count the number of flipped discs of the last move. + * + * This implementation uses BMI1 instructions, lzcnt and tzcnt. + * For optimization purpose, the value returned is twice the number of flipped + * disc, to facilitate the computation of disc difference. 
+ * + * @date 1998 - 2018 + * @author Toshihiko Okuhara + * @version 4.4 + * + */ + +#include "bit.h" + +/** precomputed count flip array */ +static const unsigned char COUNT_FLIP[8][256] = { + { + 0, 0, 0, 0, 2, 2, 0, 0, 4, 4, 0, 0, 2, 2, 0, 0, 6, 6, 0, 0, 2, 2, 0, 0, 4, 4, 0, 0, 2, 2, 0, 0, + 8, 8, 0, 0, 2, 2, 0, 0, 4, 4, 0, 0, 2, 2, 0, 0, 6, 6, 0, 0, 2, 2, 0, 0, 4, 4, 0, 0, 2, 2, 0, 0, + 10, 10, 0, 0, 2, 2, 0, 0, 4, 4, 0, 0, 2, 2, 0, 0, 6, 6, 0, 0, 2, 2, 0, 0, 4, 4, 0, 0, 2, 2, 0, 0, + 8, 8, 0, 0, 2, 2, 0, 0, 4, 4, 0, 0, 2, 2, 0, 0, 6, 6, 0, 0, 2, 2, 0, 0, 4, 4, 0, 0, 2, 2, 0, 0, + 12, 12, 0, 0, 2, 2, 0, 0, 4, 4, 0, 0, 2, 2, 0, 0, 6, 6, 0, 0, 2, 2, 0, 0, 4, 4, 0, 0, 2, 2, 0, 0, + 8, 8, 0, 0, 2, 2, 0, 0, 4, 4, 0, 0, 2, 2, 0, 0, 6, 6, 0, 0, 2, 2, 0, 0, 4, 4, 0, 0, 2, 2, 0, 0, + 10, 10, 0, 0, 2, 2, 0, 0, 4, 4, 0, 0, 2, 2, 0, 0, 6, 6, 0, 0, 2, 2, 0, 0, 4, 4, 0, 0, 2, 2, 0, 0, + 8, 8, 0, 0, 2, 2, 0, 0, 4, 4, 0, 0, 2, 2, 0, 0, 6, 6, 0, 0, 2, 2, 0, 0, 4, 4, 0, 0, 2, 2, 0, 0, + }, + { + 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 0, 0, 0, 0, 4, 4, 4, 4, 0, 0, 0, 0, 2, 2, 2, 2, 0, 0, 0, 0, + 6, 6, 6, 6, 0, 0, 0, 0, 2, 2, 2, 2, 0, 0, 0, 0, 4, 4, 4, 4, 0, 0, 0, 0, 2, 2, 2, 2, 0, 0, 0, 0, + 8, 8, 8, 8, 0, 0, 0, 0, 2, 2, 2, 2, 0, 0, 0, 0, 4, 4, 4, 4, 0, 0, 0, 0, 2, 2, 2, 2, 0, 0, 0, 0, + 6, 6, 6, 6, 0, 0, 0, 0, 2, 2, 2, 2, 0, 0, 0, 0, 4, 4, 4, 4, 0, 0, 0, 0, 2, 2, 2, 2, 0, 0, 0, 0, + 10, 10, 10, 10, 0, 0, 0, 0, 2, 2, 2, 2, 0, 0, 0, 0, 4, 4, 4, 4, 0, 0, 0, 0, 2, 2, 2, 2, 0, 0, 0, 0, + 6, 6, 6, 6, 0, 0, 0, 0, 2, 2, 2, 2, 0, 0, 0, 0, 4, 4, 4, 4, 0, 0, 0, 0, 2, 2, 2, 2, 0, 0, 0, 0, + 8, 8, 8, 8, 0, 0, 0, 0, 2, 2, 2, 2, 0, 0, 0, 0, 4, 4, 4, 4, 0, 0, 0, 0, 2, 2, 2, 2, 0, 0, 0, 0, + 6, 6, 6, 6, 0, 0, 0, 0, 2, 2, 2, 2, 0, 0, 0, 0, 4, 4, 4, 4, 0, 0, 0, 0, 2, 2, 2, 2, 0, 0, 0, 0, + }, + { + 0, 2, 0, 0, 0, 2, 0, 0, 0, 2, 0, 0, 0, 2, 0, 0, 2, 4, 2, 2, 2, 4, 2, 2, 0, 2, 0, 0, 0, 2, 0, 0, + 4, 6, 4, 4, 4, 6, 4, 4, 0, 2, 0, 0, 0, 2, 0, 0, 2, 4, 2, 2, 2, 4, 2, 2, 0, 2, 0, 0, 0, 2, 0, 0, + 6, 8, 6, 
6, 6, 8, 6, 6, 0, 2, 0, 0, 0, 2, 0, 0, 2, 4, 2, 2, 2, 4, 2, 2, 0, 2, 0, 0, 0, 2, 0, 0, + 4, 6, 4, 4, 4, 6, 4, 4, 0, 2, 0, 0, 0, 2, 0, 0, 2, 4, 2, 2, 2, 4, 2, 2, 0, 2, 0, 0, 0, 2, 0, 0, + 8, 10, 8, 8, 8, 10, 8, 8, 0, 2, 0, 0, 0, 2, 0, 0, 2, 4, 2, 2, 2, 4, 2, 2, 0, 2, 0, 0, 0, 2, 0, 0, + 4, 6, 4, 4, 4, 6, 4, 4, 0, 2, 0, 0, 0, 2, 0, 0, 2, 4, 2, 2, 2, 4, 2, 2, 0, 2, 0, 0, 0, 2, 0, 0, + 6, 8, 6, 6, 6, 8, 6, 6, 0, 2, 0, 0, 0, 2, 0, 0, 2, 4, 2, 2, 2, 4, 2, 2, 0, 2, 0, 0, 0, 2, 0, 0, + 4, 6, 4, 4, 4, 6, 4, 4, 0, 2, 0, 0, 0, 2, 0, 0, 2, 4, 2, 2, 2, 4, 2, 2, 0, 2, 0, 0, 0, 2, 0, 0, + }, + { + 0, 4, 2, 2, 0, 0, 0, 0, 0, 4, 2, 2, 0, 0, 0, 0, 0, 4, 2, 2, 0, 0, 0, 0, 0, 4, 2, 2, 0, 0, 0, 0, + 2, 6, 4, 4, 2, 2, 2, 2, 2, 6, 4, 4, 2, 2, 2, 2, 0, 4, 2, 2, 0, 0, 0, 0, 0, 4, 2, 2, 0, 0, 0, 0, + 4, 8, 6, 6, 4, 4, 4, 4, 4, 8, 6, 6, 4, 4, 4, 4, 0, 4, 2, 2, 0, 0, 0, 0, 0, 4, 2, 2, 0, 0, 0, 0, + 2, 6, 4, 4, 2, 2, 2, 2, 2, 6, 4, 4, 2, 2, 2, 2, 0, 4, 2, 2, 0, 0, 0, 0, 0, 4, 2, 2, 0, 0, 0, 0, + 6, 10, 8, 8, 6, 6, 6, 6, 6, 10, 8, 8, 6, 6, 6, 6, 0, 4, 2, 2, 0, 0, 0, 0, 0, 4, 2, 2, 0, 0, 0, 0, + 2, 6, 4, 4, 2, 2, 2, 2, 2, 6, 4, 4, 2, 2, 2, 2, 0, 4, 2, 2, 0, 0, 0, 0, 0, 4, 2, 2, 0, 0, 0, 0, + 4, 8, 6, 6, 4, 4, 4, 4, 4, 8, 6, 6, 4, 4, 4, 4, 0, 4, 2, 2, 0, 0, 0, 0, 0, 4, 2, 2, 0, 0, 0, 0, + 2, 6, 4, 4, 2, 2, 2, 2, 2, 6, 4, 4, 2, 2, 2, 2, 0, 4, 2, 2, 0, 0, 0, 0, 0, 4, 2, 2, 0, 0, 0, 0, + }, + { + 0, 6, 4, 4, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 4, 4, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 6, 4, 4, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 4, 4, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, + 2, 8, 6, 6, 4, 4, 4, 4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 8, 6, 6, 4, 4, 4, 4, 2, 2, 2, 2, 2, 2, 2, 2, + 0, 6, 4, 4, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 4, 4, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, + 4, 10, 8, 8, 6, 6, 6, 6, 4, 4, 4, 4, 4, 4, 4, 4, 4, 10, 8, 8, 6, 6, 6, 6, 4, 4, 4, 4, 4, 4, 4, 4, + 0, 6, 4, 4, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 4, 4, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, + 2, 8, 6, 6, 4, 4, 4, 4, 
2, 2, 2, 2, 2, 2, 2, 2, 2, 8, 6, 6, 4, 4, 4, 4, 2, 2, 2, 2, 2, 2, 2, 2, + 0, 6, 4, 4, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 4, 4, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, + }, + { + 0, 8, 6, 6, 4, 4, 4, 4, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 8, 6, 6, 4, 4, 4, 4, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 8, 6, 6, 4, 4, 4, 4, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 8, 6, 6, 4, 4, 4, 4, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 2, 10, 8, 8, 6, 6, 6, 6, 4, 4, 4, 4, 4, 4, 4, 4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 10, 8, 8, 6, 6, 6, 6, 4, 4, 4, 4, 4, 4, 4, 4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 0, 8, 6, 6, 4, 4, 4, 4, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 8, 6, 6, 4, 4, 4, 4, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + }, + { + 0, 10, 8, 8, 6, 6, 6, 6, 4, 4, 4, 4, 4, 4, 4, 4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 10, 8, 8, 6, 6, 6, 6, 4, 4, 4, 4, 4, 4, 4, 4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 10, 8, 8, 6, 6, 6, 6, 4, 4, 4, 4, 4, 4, 4, 4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 10, 8, 8, 6, 6, 6, 6, 4, 4, 4, 4, 4, 4, 4, 4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + }, + { + 0, 12, 10, 10, 8, 8, 8, 8, 6, 6, 6, 6, 6, 6, 6, 6, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 12, 10, 10, 8, 8, 8, 8, 6, 6, 6, 6, 6, 6, 6, 6, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + }, +}; + +/* bit masks for diagonal lines */ +static const unsigned long long mask_d[2][64] = { + { + 0x0000000000000001ULL, 0x0000000000000102ULL, 0x0000000000010204ULL, 0x0000000001020408ULL, + 0x0000000102040810ULL, 0x0000010204081020ULL, 0x0001020408102040ULL, 0x0102040810204080ULL, + 0x0000000000000102ULL, 0x0000000000010204ULL, 0x0000000001020408ULL, 0x0000000102040810ULL, + 0x0000010204081020ULL, 0x0001020408102040ULL, 0x0102040810204080ULL, 0x0204081020408000ULL, + 0x0000000000010204ULL, 0x0000000001020408ULL, 0x0000000102040810ULL, 0x0000010204081020ULL, + 0x0001020408102040ULL, 0x0102040810204080ULL, 0x0204081020408000ULL, 0x0408102040800000ULL, + 0x0000000001020408ULL, 0x0000000102040810ULL, 0x0000010204081020ULL, 0x0001020408102040ULL, + 0x0102040810204080ULL, 0x0204081020408000ULL, 0x0408102040800000ULL, 0x0810204080000000ULL, + 0x0000000102040810ULL, 0x0000010204081020ULL, 0x0001020408102040ULL, 0x0102040810204080ULL, + 0x0204081020408000ULL, 0x0408102040800000ULL, 0x0810204080000000ULL, 0x1020408000000000ULL, + 0x0000010204081020ULL, 0x0001020408102040ULL, 0x0102040810204080ULL, 0x0204081020408000ULL, + 0x0408102040800000ULL, 0x0810204080000000ULL, 0x1020408000000000ULL, 0x2040800000000000ULL, + 0x0001020408102040ULL, 0x0102040810204080ULL, 0x0204081020408000ULL, 0x0408102040800000ULL, + 0x0810204080000000ULL, 0x1020408000000000ULL, 0x2040800000000000ULL, 0x4080000000000000ULL, + 
0x0102040810204080ULL, 0x0204081020408000ULL, 0x0408102040800000ULL, 0x0810204080000000ULL, + 0x1020408000000000ULL, 0x2040800000000000ULL, 0x4080000000000000ULL, 0x8000000000000000ULL + }, + { + 0x8040201008040201ULL, 0x0080402010080402ULL, 0x0000804020100804ULL, 0x0000008040201008ULL, + 0x0000000080402010ULL, 0x0000000000804020ULL, 0x0000000000008040ULL, 0x0000000000000080ULL, + 0x4020100804020100ULL, 0x8040201008040201ULL, 0x0080402010080402ULL, 0x0000804020100804ULL, + 0x0000008040201008ULL, 0x0000000080402010ULL, 0x0000000000804020ULL, 0x0000000000008040ULL, + 0x2010080402010000ULL, 0x4020100804020100ULL, 0x8040201008040201ULL, 0x0080402010080402ULL, + 0x0000804020100804ULL, 0x0000008040201008ULL, 0x0000000080402010ULL, 0x0000000000804020ULL, + 0x1008040201000000ULL, 0x2010080402010000ULL, 0x4020100804020100ULL, 0x8040201008040201ULL, + 0x0080402010080402ULL, 0x0000804020100804ULL, 0x0000008040201008ULL, 0x0000000080402010ULL, + 0x0804020100000000ULL, 0x1008040201000000ULL, 0x2010080402010000ULL, 0x4020100804020100ULL, + 0x8040201008040201ULL, 0x0080402010080402ULL, 0x0000804020100804ULL, 0x0000008040201008ULL, + 0x0402010000000000ULL, 0x0804020100000000ULL, 0x1008040201000000ULL, 0x2010080402010000ULL, + 0x4020100804020100ULL, 0x8040201008040201ULL, 0x0080402010080402ULL, 0x0000804020100804ULL, + 0x0201000000000000ULL, 0x0402010000000000ULL, 0x0804020100000000ULL, 0x1008040201000000ULL, + 0x2010080402010000ULL, 0x4020100804020100ULL, 0x8040201008040201ULL, 0x0080402010080402ULL, + 0x0100000000000000ULL, 0x0201000000000000ULL, 0x0402010000000000ULL, 0x0804020100000000ULL, + 0x1008040201000000ULL, 0x2010080402010000ULL, 0x4020100804020100ULL, 0x8040201008040201ULL + } +}; + +/** + * Count last flipped discs when playing on the last empty. + * + * @param pos the last empty square. + * @param P player's disc pattern. + * @return flipped disc count. 
+ */ + +int last_flip(int pos, unsigned long long P) +{ + unsigned long long P8, P7, P9; + int n_flipped; + int x = pos & 7; + int y = pos & 0x38; + int ry = y ^ 0x38; + + n_flipped = COUNT_FLIP[x][(unsigned char) (P >> y)]; + + P8 = P & (0x0101010101010101ULL << x); + P7 = P & mask_d[0][pos]; + P9 = P & mask_d[1][pos]; + + n_flipped += ((((int) __tzcnt_u64((P8 >> y) >> 8) + (int) __lzcnt64((P8 << ry) << 8)) & 0x38) + + ((int) __tzcnt_u64((P7 >> y) >> 8) & 0x38) + + ((int) __tzcnt_u64((P9 >> y) >> 8) & 0x38) + + ((int) __lzcnt64((P7 << ry) << 8) & 0x38) + + ((int) __lzcnt64((P9 << ry) << 8) & 0x38)) >> 2; + + return n_flipped; +} diff --git a/src/count_last_flip_bmi2.c b/src/count_last_flip_bmi2.c new file mode 100644 index 00000000..f85c0e4f --- /dev/null +++ b/src/count_last_flip_bmi2.c @@ -0,0 +1,204 @@ +/** + * @file count_last_flip_bmi2.c + * + * + * A function is provided to count the number of flipped discs of the last move. + * + * The basic principle is to read into an array a precomputed result. Doing + * this is easy for a single line ; as we can use arrays of the form: + * - COUNT_FLIP[square where we play][8-bits disc pattern]. + * The problem is thus to convert any line of a 64-bits disc pattern into an + * 8-bits disc pattern. A fast way to do this is to select the right line, + * with a bit-mask, to gather the masked-bits into a continuous set by the + * BMI2 PEXT instruction. + * Once we get our 8-bits disc patterns, we directly get the number of + * flipped discs from the precomputed array, and add them from each flipping + * lines. + * For optimization purpose, the value returned is twice the number of flipped + * disc, to facilitate the computation of disc difference. 
+ * + * @date 1998 - 2023 + * @author Richard Delorme + * @author Toshihiko Okuhara + * @version 4.5 + * + */ + +#include "bit.h" +#include + +/** precomputed count flip array */ +const uint8_t COUNT_FLIP[8][256] = { + { + 0, 0, 0, 0, 2, 2, 0, 0, 4, 4, 0, 0, 2, 2, 0, 0, 6, 6, 0, 0, 2, 2, 0, 0, 4, 4, 0, 0, 2, 2, 0, 0, + 8, 8, 0, 0, 2, 2, 0, 0, 4, 4, 0, 0, 2, 2, 0, 0, 6, 6, 0, 0, 2, 2, 0, 0, 4, 4, 0, 0, 2, 2, 0, 0, + 10, 10, 0, 0, 2, 2, 0, 0, 4, 4, 0, 0, 2, 2, 0, 0, 6, 6, 0, 0, 2, 2, 0, 0, 4, 4, 0, 0, 2, 2, 0, 0, + 8, 8, 0, 0, 2, 2, 0, 0, 4, 4, 0, 0, 2, 2, 0, 0, 6, 6, 0, 0, 2, 2, 0, 0, 4, 4, 0, 0, 2, 2, 0, 0, + 12, 12, 0, 0, 2, 2, 0, 0, 4, 4, 0, 0, 2, 2, 0, 0, 6, 6, 0, 0, 2, 2, 0, 0, 4, 4, 0, 0, 2, 2, 0, 0, + 8, 8, 0, 0, 2, 2, 0, 0, 4, 4, 0, 0, 2, 2, 0, 0, 6, 6, 0, 0, 2, 2, 0, 0, 4, 4, 0, 0, 2, 2, 0, 0, + 10, 10, 0, 0, 2, 2, 0, 0, 4, 4, 0, 0, 2, 2, 0, 0, 6, 6, 0, 0, 2, 2, 0, 0, 4, 4, 0, 0, 2, 2, 0, 0, + 8, 8, 0, 0, 2, 2, 0, 0, 4, 4, 0, 0, 2, 2, 0, 0, 6, 6, 0, 0, 2, 2, 0, 0, 4, 4, 0, 0, 2, 2, 0, 0, + }, + { + 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 0, 0, 0, 0, 4, 4, 4, 4, 0, 0, 0, 0, 2, 2, 2, 2, 0, 0, 0, 0, + 6, 6, 6, 6, 0, 0, 0, 0, 2, 2, 2, 2, 0, 0, 0, 0, 4, 4, 4, 4, 0, 0, 0, 0, 2, 2, 2, 2, 0, 0, 0, 0, + 8, 8, 8, 8, 0, 0, 0, 0, 2, 2, 2, 2, 0, 0, 0, 0, 4, 4, 4, 4, 0, 0, 0, 0, 2, 2, 2, 2, 0, 0, 0, 0, + 6, 6, 6, 6, 0, 0, 0, 0, 2, 2, 2, 2, 0, 0, 0, 0, 4, 4, 4, 4, 0, 0, 0, 0, 2, 2, 2, 2, 0, 0, 0, 0, + 10, 10, 10, 10, 0, 0, 0, 0, 2, 2, 2, 2, 0, 0, 0, 0, 4, 4, 4, 4, 0, 0, 0, 0, 2, 2, 2, 2, 0, 0, 0, 0, + 6, 6, 6, 6, 0, 0, 0, 0, 2, 2, 2, 2, 0, 0, 0, 0, 4, 4, 4, 4, 0, 0, 0, 0, 2, 2, 2, 2, 0, 0, 0, 0, + 8, 8, 8, 8, 0, 0, 0, 0, 2, 2, 2, 2, 0, 0, 0, 0, 4, 4, 4, 4, 0, 0, 0, 0, 2, 2, 2, 2, 0, 0, 0, 0, + 6, 6, 6, 6, 0, 0, 0, 0, 2, 2, 2, 2, 0, 0, 0, 0, 4, 4, 4, 4, 0, 0, 0, 0, 2, 2, 2, 2, 0, 0, 0, 0, + }, + { + 0, 2, 0, 0, 0, 2, 0, 0, 0, 2, 0, 0, 0, 2, 0, 0, 2, 4, 2, 2, 2, 4, 2, 2, 0, 2, 0, 0, 0, 2, 0, 0, + 4, 6, 4, 4, 4, 6, 4, 4, 0, 2, 0, 0, 0, 2, 0, 0, 2, 4, 2, 2, 2, 4, 2, 2, 0, 2, 0, 
0, 0, 2, 0, 0, + 6, 8, 6, 6, 6, 8, 6, 6, 0, 2, 0, 0, 0, 2, 0, 0, 2, 4, 2, 2, 2, 4, 2, 2, 0, 2, 0, 0, 0, 2, 0, 0, + 4, 6, 4, 4, 4, 6, 4, 4, 0, 2, 0, 0, 0, 2, 0, 0, 2, 4, 2, 2, 2, 4, 2, 2, 0, 2, 0, 0, 0, 2, 0, 0, + 8, 10, 8, 8, 8, 10, 8, 8, 0, 2, 0, 0, 0, 2, 0, 0, 2, 4, 2, 2, 2, 4, 2, 2, 0, 2, 0, 0, 0, 2, 0, 0, + 4, 6, 4, 4, 4, 6, 4, 4, 0, 2, 0, 0, 0, 2, 0, 0, 2, 4, 2, 2, 2, 4, 2, 2, 0, 2, 0, 0, 0, 2, 0, 0, + 6, 8, 6, 6, 6, 8, 6, 6, 0, 2, 0, 0, 0, 2, 0, 0, 2, 4, 2, 2, 2, 4, 2, 2, 0, 2, 0, 0, 0, 2, 0, 0, + 4, 6, 4, 4, 4, 6, 4, 4, 0, 2, 0, 0, 0, 2, 0, 0, 2, 4, 2, 2, 2, 4, 2, 2, 0, 2, 0, 0, 0, 2, 0, 0, + }, + { + 0, 4, 2, 2, 0, 0, 0, 0, 0, 4, 2, 2, 0, 0, 0, 0, 0, 4, 2, 2, 0, 0, 0, 0, 0, 4, 2, 2, 0, 0, 0, 0, + 2, 6, 4, 4, 2, 2, 2, 2, 2, 6, 4, 4, 2, 2, 2, 2, 0, 4, 2, 2, 0, 0, 0, 0, 0, 4, 2, 2, 0, 0, 0, 0, + 4, 8, 6, 6, 4, 4, 4, 4, 4, 8, 6, 6, 4, 4, 4, 4, 0, 4, 2, 2, 0, 0, 0, 0, 0, 4, 2, 2, 0, 0, 0, 0, + 2, 6, 4, 4, 2, 2, 2, 2, 2, 6, 4, 4, 2, 2, 2, 2, 0, 4, 2, 2, 0, 0, 0, 0, 0, 4, 2, 2, 0, 0, 0, 0, + 6, 10, 8, 8, 6, 6, 6, 6, 6, 10, 8, 8, 6, 6, 6, 6, 0, 4, 2, 2, 0, 0, 0, 0, 0, 4, 2, 2, 0, 0, 0, 0, + 2, 6, 4, 4, 2, 2, 2, 2, 2, 6, 4, 4, 2, 2, 2, 2, 0, 4, 2, 2, 0, 0, 0, 0, 0, 4, 2, 2, 0, 0, 0, 0, + 4, 8, 6, 6, 4, 4, 4, 4, 4, 8, 6, 6, 4, 4, 4, 4, 0, 4, 2, 2, 0, 0, 0, 0, 0, 4, 2, 2, 0, 0, 0, 0, + 2, 6, 4, 4, 2, 2, 2, 2, 2, 6, 4, 4, 2, 2, 2, 2, 0, 4, 2, 2, 0, 0, 0, 0, 0, 4, 2, 2, 0, 0, 0, 0, + }, + { + 0, 6, 4, 4, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 4, 4, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 6, 4, 4, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 4, 4, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, + 2, 8, 6, 6, 4, 4, 4, 4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 8, 6, 6, 4, 4, 4, 4, 2, 2, 2, 2, 2, 2, 2, 2, + 0, 6, 4, 4, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 4, 4, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, + 4, 10, 8, 8, 6, 6, 6, 6, 4, 4, 4, 4, 4, 4, 4, 4, 4, 10, 8, 8, 6, 6, 6, 6, 4, 4, 4, 4, 4, 4, 4, 4, + 0, 6, 4, 4, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 4, 4, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 
+ 2, 8, 6, 6, 4, 4, 4, 4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 8, 6, 6, 4, 4, 4, 4, 2, 2, 2, 2, 2, 2, 2, 2, + 0, 6, 4, 4, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 4, 4, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, + }, + { + 0, 8, 6, 6, 4, 4, 4, 4, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 8, 6, 6, 4, 4, 4, 4, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 8, 6, 6, 4, 4, 4, 4, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 8, 6, 6, 4, 4, 4, 4, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 2, 10, 8, 8, 6, 6, 6, 6, 4, 4, 4, 4, 4, 4, 4, 4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 10, 8, 8, 6, 6, 6, 6, 4, 4, 4, 4, 4, 4, 4, 4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 0, 8, 6, 6, 4, 4, 4, 4, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 8, 6, 6, 4, 4, 4, 4, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + }, + { + 0, 10, 8, 8, 6, 6, 6, 6, 4, 4, 4, 4, 4, 4, 4, 4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 10, 8, 8, 6, 6, 6, 6, 4, 4, 4, 4, 4, 4, 4, 4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 10, 8, 8, 6, 6, 6, 6, 4, 4, 4, 4, 4, 4, 4, 4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 10, 8, 8, 6, 6, 6, 6, 4, 4, 4, 4, 4, 4, 4, 4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + }, + { + 0, 12, 10, 10, 8, 8, 8, 8, 6, 6, 6, 6, 6, 6, 6, 6, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 12, 10, 10, 8, 8, 8, 8, 6, 6, 6, 6, 6, 6, 6, 6, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + }, +}; + +/* bit masks for diagonal/vertical/all lines */ +const unsigned long long mask_x[64][4] = { + { 0x0000000000000001ULL, 0x8040201008040201ULL, 0x0101010101010101ULL, 0x81412111090503ffULL }, + { 0x0000000000000102ULL, 0x0080402010080402ULL, 0x0202020202020202ULL, 0x02824222120a07ffULL }, + { 0x0000000000010204ULL, 0x0000804020100804ULL, 0x0404040404040404ULL, 0x0404844424150effULL }, + { 0x0000000001020408ULL, 0x0000008040201008ULL, 0x0808080808080808ULL, 0x08080888492a1cffULL }, + { 0x0000000102040810ULL, 0x0000000080402010ULL, 0x1010101010101010ULL, 0x10101011925438ffULL }, + { 0x0000010204081020ULL, 0x0000000000804020ULL, 0x2020202020202020ULL, 0x2020212224a870ffULL }, + { 0x0001020408102040ULL, 0x0000000000008040ULL, 0x4040404040404040ULL, 0x404142444850e0ffULL }, + { 0x0102040810204080ULL, 0x0000000000000080ULL, 0x8080808080808080ULL, 0x8182848890a0c0ffULL }, + { 0x0000000000000102ULL, 0x4020100804020104ULL, 0x0101010101010101ULL, 0x412111090503ff03ULL }, + { 0x0000000000010204ULL, 0x8040201008040201ULL, 0x0202020202020202ULL, 0x824222120a07ff07ULL }, + { 0x0000000001020408ULL, 0x0080402010080402ULL, 0x0404040404040404ULL, 0x04844424150eff0eULL }, + { 0x0000000102040810ULL, 0x0000804020100804ULL, 0x0808080808080808ULL, 0x080888492a1cff1cULL }, + { 0x0000010204081020ULL, 0x0000008040201008ULL, 0x1010101010101010ULL, 0x101011925438ff38ULL }, + { 0x0001020408102040ULL, 
0x0000000080402010ULL, 0x2020202020202020ULL, 0x20212224a870ff70ULL }, + { 0x0102040810204080ULL, 0x0000000000804020ULL, 0x4040404040404040ULL, 0x4142444850e0ffe0ULL }, + { 0x0204081020408001ULL, 0x0000000000008040ULL, 0x8080808080808080ULL, 0x82848890a0c0ffc0ULL }, + { 0x0000000000010204ULL, 0x201008040201000aULL, 0x0101010101010101ULL, 0x2111090503ff0305ULL }, + { 0x0000000001020408ULL, 0x4020100804020101ULL, 0x0202020202020202ULL, 0x4222120a07ff070aULL }, + { 0x0000000102040810ULL, 0x8040201008040201ULL, 0x0404040404040404ULL, 0x844424150eff0e15ULL }, + { 0x0000010204081020ULL, 0x0080402010080402ULL, 0x0808080808080808ULL, 0x0888492a1cff1c2aULL }, + { 0x0001020408102040ULL, 0x0000804020100804ULL, 0x1010101010101010ULL, 0x1011925438ff3854ULL }, + { 0x0102040810204080ULL, 0x0000008040201008ULL, 0x2020202020202020ULL, 0x212224a870ff70a8ULL }, + { 0x0204081020408001ULL, 0x0000000080402010ULL, 0x4040404040404040ULL, 0x42444850e0ffe050ULL }, + { 0x0408102040800003ULL, 0x0000000000804020ULL, 0x8080808080808080ULL, 0x848890a0c0ffc0a0ULL }, + { 0x0000000001020408ULL, 0x1008040201000016ULL, 0x0101010101010101ULL, 0x11090503ff030509ULL }, + { 0x0000000102040810ULL, 0x2010080402010005ULL, 0x0202020202020202ULL, 0x22120a07ff070a12ULL }, + { 0x0000010204081020ULL, 0x4020100804020101ULL, 0x0404040404040404ULL, 0x4424150eff0e1524ULL }, + { 0x0001020408102040ULL, 0x8040201008040201ULL, 0x0808080808080808ULL, 0x88492a1cff1c2a49ULL }, + { 0x0102040810204080ULL, 0x0080402010080402ULL, 0x1010101010101010ULL, 0x11925438ff385492ULL }, + { 0x0204081020408001ULL, 0x0000804020100804ULL, 0x2020202020202020ULL, 0x2224a870ff70a824ULL }, + { 0x0408102040800003ULL, 0x0000008040201008ULL, 0x4040404040404040ULL, 0x444850e0ffe05048ULL }, + { 0x0810204080000007ULL, 0x0000000080402010ULL, 0x8080808080808080ULL, 0x8890a0c0ffc0a090ULL }, + { 0x0000000102040810ULL, 0x080402010000002eULL, 0x0101010101010101ULL, 0x090503ff03050911ULL }, + { 0x0000010204081020ULL, 0x100804020100000dULL, 
0x0202020202020202ULL, 0x120a07ff070a1222ULL }, + { 0x0001020408102040ULL, 0x2010080402010003ULL, 0x0404040404040404ULL, 0x24150eff0e152444ULL }, + { 0x0102040810204080ULL, 0x4020100804020101ULL, 0x0808080808080808ULL, 0x492a1cff1c2a4988ULL }, + { 0x0204081020408002ULL, 0x8040201008040201ULL, 0x1010101010101010ULL, 0x925438ff38549211ULL }, + { 0x0408102040800005ULL, 0x0080402010080402ULL, 0x2020202020202020ULL, 0x24a870ff70a82422ULL }, + { 0x081020408000000bULL, 0x0000804020100804ULL, 0x4040404040404040ULL, 0x4850e0ffe0504844ULL }, + { 0x1020408000000017ULL, 0x0000008040201008ULL, 0x8080808080808080ULL, 0x90a0c0ffc0a09088ULL }, + { 0x0000010204081020ULL, 0x040201000000005eULL, 0x0101010101010101ULL, 0x0503ff0305091121ULL }, + { 0x0001020408102040ULL, 0x080402010000001dULL, 0x0202020202020202ULL, 0x0a07ff070a122242ULL }, + { 0x0102040810204080ULL, 0x100804020100000bULL, 0x0404040404040404ULL, 0x150eff0e15244484ULL }, + { 0x0204081020408001ULL, 0x2010080402010003ULL, 0x0808080808080808ULL, 0x2a1cff1c2a498808ULL }, + { 0x0408102040800003ULL, 0x4020100804020101ULL, 0x1010101010101010ULL, 0x5438ff3854921110ULL }, + { 0x081020408000000eULL, 0x8040201008040201ULL, 0x2020202020202020ULL, 0xa870ff70a8242221ULL }, + { 0x102040800000001dULL, 0x0080402010080402ULL, 0x4040404040404040ULL, 0x50e0ffe050484442ULL }, + { 0x204080000000003bULL, 0x0000804020100804ULL, 0x8080808080808080ULL, 0xa0c0ffc0a0908884ULL }, + { 0x0001020408102040ULL, 0x02010000000000beULL, 0x0101010101010101ULL, 0x03ff030509112141ULL }, + { 0x0102040810204080ULL, 0x040201000000003dULL, 0x0202020202020202ULL, 0x07ff070a12224282ULL }, + { 0x0204081020408001ULL, 0x080402010000001bULL, 0x0404040404040404ULL, 0x0eff0e1524448404ULL }, + { 0x0408102040800003ULL, 0x1008040201000007ULL, 0x0808080808080808ULL, 0x1cff1c2a49880808ULL }, + { 0x0810204080000007ULL, 0x2010080402010003ULL, 0x1010101010101010ULL, 0x38ff385492111010ULL }, + { 0x102040800000000fULL, 0x4020100804020101ULL, 0x2020202020202020ULL, 
0x70ff70a824222120ULL }, + { 0x204080000000003eULL, 0x8040201008040201ULL, 0x4040404040404040ULL, 0xe0ffe05048444241ULL }, + { 0x408000000000007dULL, 0x0080402010080402ULL, 0x8080808080808080ULL, 0xc0ffc0a090888482ULL }, + { 0x0102040810204080ULL, 0x010000000000027eULL, 0x0101010101010101ULL, 0xff03050911214181ULL }, + { 0x0204081020408001ULL, 0x020100000000007dULL, 0x0202020202020202ULL, 0xff070a1222428202ULL }, + { 0x0408102040800003ULL, 0x040201000000003bULL, 0x0404040404040404ULL, 0xff0e152444840404ULL }, + { 0x0810204080000007ULL, 0x0804020100000017ULL, 0x0808080808080808ULL, 0xff1c2a4988080808ULL }, + { 0x102040800000000fULL, 0x1008040201000007ULL, 0x1010101010101010ULL, 0xff38549211101010ULL }, + { 0x204080000000001fULL, 0x2010080402010003ULL, 0x2020202020202020ULL, 0xff70a82422212020ULL }, + { 0x408000000000003fULL, 0x4020100804020101ULL, 0x4040404040404040ULL, 0xffe0504844424140ULL }, + { 0x800000000000017eULL, 0x8040201008040201ULL, 0x8080808080808080ULL, 0xffc0a09088848281ULL } +}; + +/** + * Count last flipped discs when playing on the last empty. + * + * @param pos the last empty square. + * @param P player's disc pattern. + * @return flipped disc count. 
+ */ + +inline int last_flip(int pos, unsigned long long P) +{ + uint_fast8_t n_flipped; + int x = pos & 7; + int y = pos >> 3; + + P &= mask_x[pos][3]; // mask out unrelated bits to make dummy 0 bits for outside + // n_flipped = COUNT_FLIP[x][_bextr_u64(P, pos & 0x38, 8)]; + n_flipped = COUNT_FLIP[x][(P >> (pos & 0x38)) & 0xFF]; + n_flipped += COUNT_FLIP[y][_pext_u64(P, mask_x[pos][0])]; + n_flipped += COUNT_FLIP[y][_pext_u64(P, mask_x[pos][1])]; + n_flipped += COUNT_FLIP[y][_pext_u64(P, mask_x[pos][2])]; + + return n_flipped; +} diff --git a/src/count_last_flip_carry_64.c b/src/count_last_flip_carry_64.c index 12bab37a..9d409a28 100644 --- a/src/count_last_flip_carry_64.c +++ b/src/count_last_flip_carry_64.c @@ -27,7 +27,7 @@ * For top to bottom flip, LS1B isolation (http://chessprogramming.wikispaces.com/ * General+Setwise+Operations) is used to get the outflank bit. * - * @date 1998 - 2017 + * @date 1998 - 2018 * @author Richard Delorme * @author Toshihiko Okuhara * @version 4.4 @@ -151,7 +151,7 @@ static int count_last_flip_C1(const unsigned long long P) P_v = P & 0x0404040404040400ULL; n_flipped = ((P_v & -P_v) * 0x0000081018202830ULL) >> 60; n_flipped += COUNT_FLIP_2[P & 0xff]; - n_flipped += COUNT_FLIP_2[((P & 0x0000804020110A04ULL) * 0x0101010101010101ULL) >> 56]; + n_flipped += COUNT_FLIP_2[((P & 0x0000804020110A04ULL) * 0x0101010101010101ULL) >> 56]; // A3C1H6 return n_flipped; } @@ -170,7 +170,7 @@ static int count_last_flip_D1(const unsigned long long P) P_v = P & 0x0808080808080800ULL; n_flipped = ((P_v & -P_v) * 0x000004080c101418ULL) >> 60; n_flipped += COUNT_FLIP_3[P & 0xff]; - n_flipped += COUNT_FLIP_3[((P & 0x0000008041221408ULL) * 0x0101010101010101ULL) >> 56]; + n_flipped += COUNT_FLIP_3[((P & 0x0000008041221408ULL) * 0x0101010101010101ULL) >> 56]; // A4D1H5 return n_flipped; } @@ -189,7 +189,7 @@ static int count_last_flip_E1(const unsigned long long P) P_v = P & 0x1010101010101000ULL; n_flipped = ((P_v & -P_v) * 0x0000020406080a0cULL) >> 60; 
n_flipped += COUNT_FLIP_4[P & 0xff]; - n_flipped += COUNT_FLIP_4[((P & 0x0000000182442810ULL) * 0x0101010101010101ULL) >> 56]; + n_flipped += COUNT_FLIP_4[((P & 0x0000000182442810ULL) * 0x0101010101010101ULL) >> 56]; // A5E1H4 return n_flipped; } @@ -208,7 +208,7 @@ static int count_last_flip_F1(const unsigned long long P) P_v = P & 0x2020202020202000ULL; n_flipped = ((P_v & -P_v) * 0x0000010203040506ULL) >> 60; n_flipped += COUNT_FLIP_5[P & 0xff]; - n_flipped += COUNT_FLIP_5[((P & 0x0000010204885020ULL) * 0x0101010101010101ULL) >> 56]; + n_flipped += COUNT_FLIP_5[((P & 0x0000010204885020ULL) * 0x0101010101010101ULL) >> 56]; // A6F1H3 return n_flipped; } @@ -307,7 +307,7 @@ static int count_last_flip_C2(const unsigned long long P) P_v = P & 0x0404040404040000ULL; n_flipped = ((P_v & -P_v) * 0x0000000810182028ULL) >> 60; n_flipped += COUNT_FLIP_2[(P >> 8) & 0xff]; - n_flipped += COUNT_FLIP_2[((P & 0x00804020110A0400ULL) * 0x0101010101010101ULL) >> 56]; + n_flipped += COUNT_FLIP_2[((P & 0x00804020110A0400ULL) * 0x0101010101010101ULL) >> 56]; // A4C2H7 return n_flipped; } @@ -326,7 +326,7 @@ static int count_last_flip_D2(const unsigned long long P) P_v = P & 0x0808080808080000ULL; n_flipped = ((P_v & -P_v) * 0x00000004080c1014ULL) >> 60; n_flipped += COUNT_FLIP_3[(P >> 8) & 0xff]; - n_flipped += COUNT_FLIP_3[((P & 0x0000804122140800ULL) * 0x0101010101010101ULL) >> 56]; + n_flipped += COUNT_FLIP_3[((P & 0x0000804122140800ULL) * 0x0101010101010101ULL) >> 56]; // A5D2H6 return n_flipped; } @@ -345,7 +345,7 @@ static int count_last_flip_E2(const unsigned long long P) P_v = P & 0x1010101010100000ULL; n_flipped = ((P_v & -P_v) * 0x000000020406080aULL) >> 60; n_flipped += COUNT_FLIP_4[(P >> 8) & 0xff]; - n_flipped += COUNT_FLIP_4[((P & 0x0000018244281000ULL) * 0x0101010101010101ULL) >> 56]; + n_flipped += COUNT_FLIP_4[((P & 0x0000018244281000ULL) * 0x0101010101010101ULL) >> 56]; // A6E2H5 return n_flipped; } @@ -364,7 +364,7 @@ static int count_last_flip_F2(const unsigned 
long long P) P_v = P & 0x2020202020200000ULL; n_flipped = ((P_v & -P_v) * 0x0000000102030405ULL) >> 60; n_flipped += COUNT_FLIP_5[(P >> 8) & 0xff]; - n_flipped += COUNT_FLIP_5[((P & 0x0001020488502000ULL) * 0x0101010101010101ULL) >> 56]; + n_flipped += COUNT_FLIP_5[((P & 0x0001020488502000ULL) * 0x0101010101010101ULL) >> 56]; // A7F2H4 return n_flipped; } @@ -1015,7 +1015,7 @@ static int count_last_flip_C7(const unsigned long long P) n_flipped = COUNT_FLIP_L[((P & 0x0000040404040404ULL) * 0x0040810204081020ULL) >> 55]; n_flipped += COUNT_FLIP_2[(P >> 48) & 0xff]; - n_flipped += COUNT_FLIP_2[((P & 0x00040A1120408000ULL) * 0x0101010101010101ULL) >> 56]; + n_flipped += COUNT_FLIP_2[((P & 0x00040A1120408000ULL) * 0x0101010101010101ULL) >> 56]; // A5C7H2 return n_flipped; } @@ -1032,7 +1032,7 @@ static int count_last_flip_D7(const unsigned long long P) n_flipped = COUNT_FLIP_L[((P & 0x0000080808080808ULL) * 0x0020408102040810ULL) >> 55]; n_flipped += COUNT_FLIP_3[(P >> 48) & 0xff]; - n_flipped += COUNT_FLIP_3[((P & 0x0008142241800000ULL) * 0x0101010101010101ULL) >> 56]; + n_flipped += COUNT_FLIP_3[((P & 0x0008142241800000ULL) * 0x0101010101010101ULL) >> 56]; // A4D7H3 return n_flipped; } @@ -1049,7 +1049,7 @@ static int count_last_flip_E7(const unsigned long long P) n_flipped = COUNT_FLIP_L[((P & 0x0000101010101010ULL) * 0x0010204081020408ULL) >> 55]; n_flipped += COUNT_FLIP_4[(P >> 48) & 0xff]; - n_flipped += COUNT_FLIP_4[((P & 0x0010284482010000ULL) * 0x0101010101010101ULL) >> 56]; + n_flipped += COUNT_FLIP_4[((P & 0x0010284482010000ULL) * 0x0101010101010101ULL) >> 56]; // A3E7H4 return n_flipped; } @@ -1066,7 +1066,7 @@ static int count_last_flip_F7(const unsigned long long P) n_flipped = COUNT_FLIP_L[((P & 0x0000202020202020ULL) * 0x0008102040810204ULL) >> 55]; n_flipped += COUNT_FLIP_5[(P >> 48) & 0xff]; - n_flipped += COUNT_FLIP_5[((P & 0x0020508804020100ULL) * 0x0101010101010101ULL) >> 56]; + n_flipped += COUNT_FLIP_5[((P & 0x0020508804020100ULL) * 
0x0101010101010101ULL) >> 56]; // A2F7H5 return n_flipped; } @@ -1151,7 +1151,7 @@ static int count_last_flip_C8(const unsigned long long P) n_flipped = COUNT_FLIP_L[((P & 0x0004040404040404ULL) * 0x0040810204081020ULL) >> 56]; n_flipped += COUNT_FLIP_2[P >> 56]; - n_flipped += COUNT_FLIP_2[((P & 0x040A112040800000ULL) * 0x0101010101010101ULL) >> 56]; + n_flipped += COUNT_FLIP_2[((P & 0x040A112040800000ULL) * 0x0101010101010101ULL) >> 56]; // A6C8H3 return n_flipped; } @@ -1168,7 +1168,7 @@ static int count_last_flip_D8(const unsigned long long P) n_flipped = COUNT_FLIP_L[((P & 0x0008080808080808ULL) * 0x0020408102040810ULL) >> 56]; n_flipped += COUNT_FLIP_3[P >> 56]; - n_flipped += COUNT_FLIP_3[((P & 0x0814224180000000ULL) * 0x0101010101010101ULL) >> 56]; + n_flipped += COUNT_FLIP_3[((P & 0x0814224180000000ULL) * 0x0101010101010101ULL) >> 56]; // A5D8H4 return n_flipped; } @@ -1185,7 +1185,7 @@ static int count_last_flip_E8(const unsigned long long P) n_flipped = COUNT_FLIP_L[((P & 0x0010101010101010ULL) * 0x0010204081020408ULL) >> 56]; n_flipped += COUNT_FLIP_4[P >> 56]; - n_flipped += COUNT_FLIP_4[((P & 0x1028448201000000ULL) * 0x0101010101010101ULL) >> 56]; + n_flipped += COUNT_FLIP_4[((P & 0x1028448201000000ULL) * 0x0101010101010101ULL) >> 56]; // A4E8H5 return n_flipped; } @@ -1202,7 +1202,7 @@ static int count_last_flip_F8(const unsigned long long P) n_flipped = COUNT_FLIP_L[((P & 0x0020202020202020ULL) * 0x0008102040810204ULL) >> 56]; n_flipped += COUNT_FLIP_5[P >> 56]; - n_flipped += COUNT_FLIP_5[((P & 0x0050880402010000ULL) * 0x0101010101010101ULL) >> 56]; + n_flipped += COUNT_FLIP_5[((P & 0x0050880402010000ULL) * 0x0101010101010101ULL) >> 56]; // A3F8H6 return n_flipped; } @@ -1254,7 +1254,7 @@ static int count_last_flip_pass(const unsigned long long P) } /** Array of functions to count flipped discs of the last move */ -int (*COUNT_LAST_FLIP[])(const unsigned long long) = { +int (*count_last_flip[])(const unsigned long long) = { count_last_flip_A1, 
count_last_flip_B1, count_last_flip_C1, count_last_flip_D1, count_last_flip_E1, count_last_flip_F1, count_last_flip_G1, count_last_flip_H1, count_last_flip_A2, count_last_flip_B2, count_last_flip_C2, count_last_flip_D2, @@ -1274,8 +1274,3 @@ int (*COUNT_LAST_FLIP[])(const unsigned long long) = { count_last_flip_pass, }; -int count_last_flip(const int x, const unsigned long long P) -{ - return COUNT_LAST_FLIP[x](P); -} - diff --git a/src/count_last_flip_kindergarten.c b/src/count_last_flip_kindergarten.c index e70c55b1..26d10b64 100644 --- a/src/count_last_flip_kindergarten.c +++ b/src/count_last_flip_kindergarten.c @@ -22,7 +22,7 @@ * For optimization purpose, the value returned is twice the number of flipped * disc, to facilitate the computation of disc difference. * - * With Modifications by Valéry ClaudePierre (merging diagonals). + * With Modifications by Valery ClaudePierre (merging diagonals). * @todo 135° merge as done by Toshihiko Okuhara * * @date 1998 - 2017 @@ -1232,7 +1232,7 @@ static int count_last_flip_pass(const unsigned long long P) } /** Array of functions to count flipped discs of the last move */ -int (*COUNT_LAST_FLIP[])(const unsigned long long) = { +int (*count_last_flip[])(const unsigned long long) = { count_last_flip_A1, count_last_flip_B1, count_last_flip_C1, count_last_flip_D1, count_last_flip_E1, count_last_flip_F1, count_last_flip_G1, count_last_flip_H1, count_last_flip_A2, count_last_flip_B2, count_last_flip_C2, count_last_flip_D2, @@ -1252,8 +1252,3 @@ int (*COUNT_LAST_FLIP[])(const unsigned long long) = { count_last_flip_pass, }; -int count_last_flip(const int x, const unsigned long long P) -{ - return COUNT_LAST_FLIP[x](P); -} - diff --git a/src/count_last_flip_lzcnt.c b/src/count_last_flip_lzcnt.c new file mode 100644 index 00000000..4094684f --- /dev/null +++ b/src/count_last_flip_lzcnt.c @@ -0,0 +1,236 @@ +/** + * @file count_last_flip_lzcnt.c + * + * + * A function is provided to count the number of fipped disc of the last move. 
+ * + * The basic principle is to read into an array a precomputed result. Doing + * this is easy for a single line ; as we can use arrays of the form: + * - COUNT_FLIP[square where we play][8-bits disc pattern]. + * The problem is thus to convert any line of a 64-bits disc pattern into an + * 8-bits disc pattern. A fast way to do this is to select the right line, + * with a bit-mask, to gather the masked-bits into a continuous set by a simple + * multiplication and to right-shift the result to scale it into a number + * between 0 and 255. + * Once we get our 8-bits disc patterns, we directly get the number of + * flipped discs from the precomputed array, and add them from each flipping + * lines. + * For optimization purpose, the value returned is twice the number of flipped + * disc, to facilitate the computation of disc difference. + * + * @date 1998 - 2014 + * @author Richard Delorme + * @author Toshihiko Okuhara + * @version 4.4 + * + */ + +#include "bit_intrinsics.h" + +/** precomputed count flip array */ +static const unsigned char COUNT_FLIP[8][256] = { + { + 0, 0, 0, 0, 2, 2, 0, 0, 4, 4, 0, 0, 2, 2, 0, 0, 6, 6, 0, 0, 2, 2, 0, 0, 4, 4, 0, 0, 2, 2, 0, 0, + 8, 8, 0, 0, 2, 2, 0, 0, 4, 4, 0, 0, 2, 2, 0, 0, 6, 6, 0, 0, 2, 2, 0, 0, 4, 4, 0, 0, 2, 2, 0, 0, + 10, 10, 0, 0, 2, 2, 0, 0, 4, 4, 0, 0, 2, 2, 0, 0, 6, 6, 0, 0, 2, 2, 0, 0, 4, 4, 0, 0, 2, 2, 0, 0, + 8, 8, 0, 0, 2, 2, 0, 0, 4, 4, 0, 0, 2, 2, 0, 0, 6, 6, 0, 0, 2, 2, 0, 0, 4, 4, 0, 0, 2, 2, 0, 0, + 12, 12, 0, 0, 2, 2, 0, 0, 4, 4, 0, 0, 2, 2, 0, 0, 6, 6, 0, 0, 2, 2, 0, 0, 4, 4, 0, 0, 2, 2, 0, 0, + 8, 8, 0, 0, 2, 2, 0, 0, 4, 4, 0, 0, 2, 2, 0, 0, 6, 6, 0, 0, 2, 2, 0, 0, 4, 4, 0, 0, 2, 2, 0, 0, + 10, 10, 0, 0, 2, 2, 0, 0, 4, 4, 0, 0, 2, 2, 0, 0, 6, 6, 0, 0, 2, 2, 0, 0, 4, 4, 0, 0, 2, 2, 0, 0, + 8, 8, 0, 0, 2, 2, 0, 0, 4, 4, 0, 0, 2, 2, 0, 0, 6, 6, 0, 0, 2, 2, 0, 0, 4, 4, 0, 0, 2, 2, 0, 0, + }, + { + 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 0, 0, 0, 0, 4, 4, 4, 4, 0, 0, 0, 0, 2, 2, 2, 2, 0, 0, 0, 0, + 6, 6, 6, 6, 0, 0, 
0, 0, 2, 2, 2, 2, 0, 0, 0, 0, 4, 4, 4, 4, 0, 0, 0, 0, 2, 2, 2, 2, 0, 0, 0, 0, + 8, 8, 8, 8, 0, 0, 0, 0, 2, 2, 2, 2, 0, 0, 0, 0, 4, 4, 4, 4, 0, 0, 0, 0, 2, 2, 2, 2, 0, 0, 0, 0, + 6, 6, 6, 6, 0, 0, 0, 0, 2, 2, 2, 2, 0, 0, 0, 0, 4, 4, 4, 4, 0, 0, 0, 0, 2, 2, 2, 2, 0, 0, 0, 0, + 10, 10, 10, 10, 0, 0, 0, 0, 2, 2, 2, 2, 0, 0, 0, 0, 4, 4, 4, 4, 0, 0, 0, 0, 2, 2, 2, 2, 0, 0, 0, 0, + 6, 6, 6, 6, 0, 0, 0, 0, 2, 2, 2, 2, 0, 0, 0, 0, 4, 4, 4, 4, 0, 0, 0, 0, 2, 2, 2, 2, 0, 0, 0, 0, + 8, 8, 8, 8, 0, 0, 0, 0, 2, 2, 2, 2, 0, 0, 0, 0, 4, 4, 4, 4, 0, 0, 0, 0, 2, 2, 2, 2, 0, 0, 0, 0, + 6, 6, 6, 6, 0, 0, 0, 0, 2, 2, 2, 2, 0, 0, 0, 0, 4, 4, 4, 4, 0, 0, 0, 0, 2, 2, 2, 2, 0, 0, 0, 0, + }, + { + 0, 2, 0, 0, 0, 2, 0, 0, 0, 2, 0, 0, 0, 2, 0, 0, 2, 4, 2, 2, 2, 4, 2, 2, 0, 2, 0, 0, 0, 2, 0, 0, + 4, 6, 4, 4, 4, 6, 4, 4, 0, 2, 0, 0, 0, 2, 0, 0, 2, 4, 2, 2, 2, 4, 2, 2, 0, 2, 0, 0, 0, 2, 0, 0, + 6, 8, 6, 6, 6, 8, 6, 6, 0, 2, 0, 0, 0, 2, 0, 0, 2, 4, 2, 2, 2, 4, 2, 2, 0, 2, 0, 0, 0, 2, 0, 0, + 4, 6, 4, 4, 4, 6, 4, 4, 0, 2, 0, 0, 0, 2, 0, 0, 2, 4, 2, 2, 2, 4, 2, 2, 0, 2, 0, 0, 0, 2, 0, 0, + 8, 10, 8, 8, 8, 10, 8, 8, 0, 2, 0, 0, 0, 2, 0, 0, 2, 4, 2, 2, 2, 4, 2, 2, 0, 2, 0, 0, 0, 2, 0, 0, + 4, 6, 4, 4, 4, 6, 4, 4, 0, 2, 0, 0, 0, 2, 0, 0, 2, 4, 2, 2, 2, 4, 2, 2, 0, 2, 0, 0, 0, 2, 0, 0, + 6, 8, 6, 6, 6, 8, 6, 6, 0, 2, 0, 0, 0, 2, 0, 0, 2, 4, 2, 2, 2, 4, 2, 2, 0, 2, 0, 0, 0, 2, 0, 0, + 4, 6, 4, 4, 4, 6, 4, 4, 0, 2, 0, 0, 0, 2, 0, 0, 2, 4, 2, 2, 2, 4, 2, 2, 0, 2, 0, 0, 0, 2, 0, 0, + }, + { + 0, 4, 2, 2, 0, 0, 0, 0, 0, 4, 2, 2, 0, 0, 0, 0, 0, 4, 2, 2, 0, 0, 0, 0, 0, 4, 2, 2, 0, 0, 0, 0, + 2, 6, 4, 4, 2, 2, 2, 2, 2, 6, 4, 4, 2, 2, 2, 2, 0, 4, 2, 2, 0, 0, 0, 0, 0, 4, 2, 2, 0, 0, 0, 0, + 4, 8, 6, 6, 4, 4, 4, 4, 4, 8, 6, 6, 4, 4, 4, 4, 0, 4, 2, 2, 0, 0, 0, 0, 0, 4, 2, 2, 0, 0, 0, 0, + 2, 6, 4, 4, 2, 2, 2, 2, 2, 6, 4, 4, 2, 2, 2, 2, 0, 4, 2, 2, 0, 0, 0, 0, 0, 4, 2, 2, 0, 0, 0, 0, + 6, 10, 8, 8, 6, 6, 6, 6, 6, 10, 8, 8, 6, 6, 6, 6, 0, 4, 2, 2, 0, 0, 0, 0, 0, 4, 2, 2, 0, 0, 0, 0, + 2, 6, 4, 4, 2, 2, 2, 2, 2, 6, 
4, 4, 2, 2, 2, 2, 0, 4, 2, 2, 0, 0, 0, 0, 0, 4, 2, 2, 0, 0, 0, 0, + 4, 8, 6, 6, 4, 4, 4, 4, 4, 8, 6, 6, 4, 4, 4, 4, 0, 4, 2, 2, 0, 0, 0, 0, 0, 4, 2, 2, 0, 0, 0, 0, + 2, 6, 4, 4, 2, 2, 2, 2, 2, 6, 4, 4, 2, 2, 2, 2, 0, 4, 2, 2, 0, 0, 0, 0, 0, 4, 2, 2, 0, 0, 0, 0, + }, + { + 0, 6, 4, 4, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 4, 4, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 6, 4, 4, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 4, 4, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, + 2, 8, 6, 6, 4, 4, 4, 4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 8, 6, 6, 4, 4, 4, 4, 2, 2, 2, 2, 2, 2, 2, 2, + 0, 6, 4, 4, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 4, 4, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, + 4, 10, 8, 8, 6, 6, 6, 6, 4, 4, 4, 4, 4, 4, 4, 4, 4, 10, 8, 8, 6, 6, 6, 6, 4, 4, 4, 4, 4, 4, 4, 4, + 0, 6, 4, 4, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 4, 4, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, + 2, 8, 6, 6, 4, 4, 4, 4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 8, 6, 6, 4, 4, 4, 4, 2, 2, 2, 2, 2, 2, 2, 2, + 0, 6, 4, 4, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 4, 4, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, + }, + { + 0, 8, 6, 6, 4, 4, 4, 4, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 8, 6, 6, 4, 4, 4, 4, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 8, 6, 6, 4, 4, 4, 4, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 8, 6, 6, 4, 4, 4, 4, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 2, 10, 8, 8, 6, 6, 6, 6, 4, 4, 4, 4, 4, 4, 4, 4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 10, 8, 8, 6, 6, 6, 6, 4, 4, 4, 4, 4, 4, 4, 4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 0, 8, 6, 6, 4, 4, 4, 4, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 8, 6, 6, 4, 4, 4, 4, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + }, + { + 0, 10, 8, 8, 6, 6, 6, 6, 4, 4, 4, 4, 4, 4, 4, 4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 10, 8, 8, 6, 6, 6, 6, 4, 4, 4, 4, 4, 4, 4, 4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 10, 8, 8, 6, 6, 6, 6, 4, 4, 4, 4, 4, 4, 4, 4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 10, 8, 8, 6, 6, 6, 6, 4, 4, 4, 4, 4, 4, 4, 4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + }, + { + 0, 12, 10, 10, 8, 8, 8, 8, 6, 6, 6, 6, 6, 6, 6, 6, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 12, 10, 10, 8, 8, 8, 8, 6, 6, 6, 6, 6, 6, 6, 6, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + } +}; + +#ifdef lzcnt_u64 + +/* bit masks for vertical and diagonal lines for A8..H8 */ +static const unsigned long long mask_9_7[8][2] = { + { 0x0204081020408000, 0x0000000000000000 }, + { 0x0204081020400000, 0x8000000000000000 }, + { 0x0204081020000000, 0x8040000000000000 }, + { 0x0204081000000000, 0x8040200000000000 }, + { 0x0204080000000000, 0x8040201000000000 }, + { 0x0204000000000000, 0x8040201008000000 }, + { 0x0200000000000000, 0x8040201008040000 }, + { 0x0000000000000000, 0x8040201008040200 } +}; + +#else + +/* bit masks for 
vertical and diagonal lines for A1..H1 */ +static const unsigned long long mask_9_7[8][2] = { + { 0x0000000000000000, 0x4020100804020100 }, + { 0x0000000000000040, 0x0020100804020100 }, + { 0x0000000000002040, 0x0000100804020100 }, + { 0x0000000000102040, 0x0000000804020100 }, + { 0x0000000008102040, 0x0000000004020100 }, + { 0x0000000408102040, 0x0000000000020100 }, + { 0x0000020408102040, 0x0000000000000100 }, + { 0x0001020408102040, 0x0000000000000000 } +}; + +#endif + +/* bit masks for diagonal lines for A5..H6 */ +static const unsigned long long mask_d[16][2] = { + { 0x0000000102040810, 0x0804020100000000 }, + { 0x0000010204081020, 0x1008040201000000 }, + { 0x0001020408102040, 0x2010080402010000 }, + { 0x0102040810204080, 0x4020100804020100 }, + { 0x0204081020408000, 0x8040201008040201 }, + { 0x0408102040800000, 0x0080402010080402 }, + { 0x0810204080000000, 0x0000804020100804 }, + { 0x1020408000000000, 0x0000008040201008 }, + { 0x0000010204081020, 0x0402010000000000 }, + { 0x0001020408102040, 0x0804020100000000 }, + { 0x0102040810204080, 0x1008040201000000 }, + { 0x0204081020408000, 0x2010080402010000 }, + { 0x0408102040800000, 0x4020100804020100 }, + { 0x0810204080000000, 0x8040201008040201 }, + { 0x1020408000000000, 0x0080402010080402 }, + { 0x2040800000000000, 0x0000804020100804 } +}; + +#ifdef HAS_CPU_64 + +#define packV(P, x) (((((P) >> (x)) & 0x0101010101010101) * 0x0102040810204080) >> 56) +#define packD(PM) (((PM) * 0x0101010101010101) >> 56) + +#else + +#define packV(P, x) (((((((unsigned int)(P)) >> (x)) & 0x01010101u) + (((((unsigned int)((P) >> 32)) >> (x)) & 0x01010101u) << 4)) * 0x01020408u) >> 24) +#define packD(PM) (((((unsigned int)(PM)) * 0x01010101u) + (((unsigned int)((PM) >> 32)) * 0x01010101u)) >> 24) + +#endif // HAS_CPU_64 + +/** + * Count last flipped discs when playing on the last empty. + * + * @param pos the last empty square. + * @param P player's disc pattern. + * @return flipped disc count. 
+ */ + +int last_flip(int pos, unsigned long long P) +{ + unsigned long long P8, P7, P9; + int n_flipped; + int x = pos & 7; + + n_flipped = COUNT_FLIP[x][(unsigned char) (P >> (pos & 0x38))]; + +#ifdef lzcnt_u64 + + if (pos < 0x20) { + P = vertical_mirror(P); + pos ^= 0x38; + } + + if (pos >= 0x30) { + P <<= (64 - pos); + P8 = P & 0x0101010101010101; + P7 = P & mask_9_7[x][0]; + P9 = (P << 8) & mask_9_7[x][1]; + n_flipped += ((lzcnt_u64(P8) & 0x38) + (lzcnt_u64(P7) & 0x38) + (lzcnt_u64(P9) & 0x38)) >> 2; + + return n_flipped; + } + +#else // ls1b - slow + + if (pos & 0x10) { // 0 1 2 3 4 5 6 7 -> 0 1 4 5 4 5 0 1 + P = vertical_mirror(P); + pos ^= 0x38; + } + + if (pos < 0x10) { + P >>= (pos + 1); + P8 = P & 0x0080808080808080; + n_flipped += ((P8 & -P8) * 0x00004080c1014180) >> 60; + P7 = P & mask_9_7[x][0]; + n_flipped += ((P7 & -P7) * 0x0001040c2050c000) >> 60; + P9 = P & mask_9_7[x][1]; + n_flipped += ((P9 & -P9) * 0x000010100c080503) >> 60; + + return n_flipped; + } +#endif + + n_flipped += COUNT_FLIP[pos >> 3][packV(P, x)]; + P7 = P & mask_d[pos - 0x20][0]; + n_flipped += COUNT_FLIP[x][packD(P7)]; + P9 = P & mask_d[pos - 0x20][1]; + n_flipped += COUNT_FLIP[x][packD(P9)]; + + return n_flipped; +} diff --git a/src/count_last_flip_neon.c b/src/count_last_flip_neon.c new file mode 100644 index 00000000..45cb89ea --- /dev/null +++ b/src/count_last_flip_neon.c @@ -0,0 +1,293 @@ +/** + * @file count_last_flip_neon.c + * + * + * A function is provided to count the number of fipped disc of the last move. + * + * The basic principle is to read into an array a precomputed result. Doing + * this is easy for a single line ; as we can use arrays of the form: + * - COUNT_FLIP[square where we play][8-bits disc pattern]. + * The problem is thus to convert any line of a 64-bits disc pattern into an + * 8-bits disc pattern. 
A fast way to do this is to select the right line, + * with a bit-mask, to gather the masked-bits into a continuous set by the + * neon vaddvq_u16 instruction. + * Once we get our 8-bits disc patterns, we directly get the number of + * flipped discs from the precomputed array, and add them from each flipping + * lines. + * For optimization purpose, the value returned is twice the number of flipped + * disc, to facilitate the computation of disc difference. + * + * @date 1998 - 2023 + * @author Richard Delorme + * @author Toshihiko Okuhara + * @version 4.5 + * + */ + +#include + +/** precomputed count flip array */ +const unsigned char COUNT_FLIP[8][256] = { + { + 0, 0, 0, 0, 2, 2, 0, 0, 4, 4, 0, 0, 2, 2, 0, 0, 6, 6, 0, 0, 2, 2, 0, 0, 4, 4, 0, 0, 2, 2, 0, 0, + 8, 8, 0, 0, 2, 2, 0, 0, 4, 4, 0, 0, 2, 2, 0, 0, 6, 6, 0, 0, 2, 2, 0, 0, 4, 4, 0, 0, 2, 2, 0, 0, + 10, 10, 0, 0, 2, 2, 0, 0, 4, 4, 0, 0, 2, 2, 0, 0, 6, 6, 0, 0, 2, 2, 0, 0, 4, 4, 0, 0, 2, 2, 0, 0, + 8, 8, 0, 0, 2, 2, 0, 0, 4, 4, 0, 0, 2, 2, 0, 0, 6, 6, 0, 0, 2, 2, 0, 0, 4, 4, 0, 0, 2, 2, 0, 0, + 12, 12, 0, 0, 2, 2, 0, 0, 4, 4, 0, 0, 2, 2, 0, 0, 6, 6, 0, 0, 2, 2, 0, 0, 4, 4, 0, 0, 2, 2, 0, 0, + 8, 8, 0, 0, 2, 2, 0, 0, 4, 4, 0, 0, 2, 2, 0, 0, 6, 6, 0, 0, 2, 2, 0, 0, 4, 4, 0, 0, 2, 2, 0, 0, + 10, 10, 0, 0, 2, 2, 0, 0, 4, 4, 0, 0, 2, 2, 0, 0, 6, 6, 0, 0, 2, 2, 0, 0, 4, 4, 0, 0, 2, 2, 0, 0, + 8, 8, 0, 0, 2, 2, 0, 0, 4, 4, 0, 0, 2, 2, 0, 0, 6, 6, 0, 0, 2, 2, 0, 0, 4, 4, 0, 0, 2, 2, 0, 0, + }, + { + 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 0, 0, 0, 0, 4, 4, 4, 4, 0, 0, 0, 0, 2, 2, 2, 2, 0, 0, 0, 0, + 6, 6, 6, 6, 0, 0, 0, 0, 2, 2, 2, 2, 0, 0, 0, 0, 4, 4, 4, 4, 0, 0, 0, 0, 2, 2, 2, 2, 0, 0, 0, 0, + 8, 8, 8, 8, 0, 0, 0, 0, 2, 2, 2, 2, 0, 0, 0, 0, 4, 4, 4, 4, 0, 0, 0, 0, 2, 2, 2, 2, 0, 0, 0, 0, + 6, 6, 6, 6, 0, 0, 0, 0, 2, 2, 2, 2, 0, 0, 0, 0, 4, 4, 4, 4, 0, 0, 0, 0, 2, 2, 2, 2, 0, 0, 0, 0, + 10, 10, 10, 10, 0, 0, 0, 0, 2, 2, 2, 2, 0, 0, 0, 0, 4, 4, 4, 4, 0, 0, 0, 0, 2, 2, 2, 2, 0, 0, 0, 0, + 6, 6, 6, 6, 0, 0, 0, 0, 2, 2, 2, 
2, 0, 0, 0, 0, 4, 4, 4, 4, 0, 0, 0, 0, 2, 2, 2, 2, 0, 0, 0, 0, + 8, 8, 8, 8, 0, 0, 0, 0, 2, 2, 2, 2, 0, 0, 0, 0, 4, 4, 4, 4, 0, 0, 0, 0, 2, 2, 2, 2, 0, 0, 0, 0, + 6, 6, 6, 6, 0, 0, 0, 0, 2, 2, 2, 2, 0, 0, 0, 0, 4, 4, 4, 4, 0, 0, 0, 0, 2, 2, 2, 2, 0, 0, 0, 0, + }, + { + 0, 2, 0, 0, 0, 2, 0, 0, 0, 2, 0, 0, 0, 2, 0, 0, 2, 4, 2, 2, 2, 4, 2, 2, 0, 2, 0, 0, 0, 2, 0, 0, + 4, 6, 4, 4, 4, 6, 4, 4, 0, 2, 0, 0, 0, 2, 0, 0, 2, 4, 2, 2, 2, 4, 2, 2, 0, 2, 0, 0, 0, 2, 0, 0, + 6, 8, 6, 6, 6, 8, 6, 6, 0, 2, 0, 0, 0, 2, 0, 0, 2, 4, 2, 2, 2, 4, 2, 2, 0, 2, 0, 0, 0, 2, 0, 0, + 4, 6, 4, 4, 4, 6, 4, 4, 0, 2, 0, 0, 0, 2, 0, 0, 2, 4, 2, 2, 2, 4, 2, 2, 0, 2, 0, 0, 0, 2, 0, 0, + 8, 10, 8, 8, 8, 10, 8, 8, 0, 2, 0, 0, 0, 2, 0, 0, 2, 4, 2, 2, 2, 4, 2, 2, 0, 2, 0, 0, 0, 2, 0, 0, + 4, 6, 4, 4, 4, 6, 4, 4, 0, 2, 0, 0, 0, 2, 0, 0, 2, 4, 2, 2, 2, 4, 2, 2, 0, 2, 0, 0, 0, 2, 0, 0, + 6, 8, 6, 6, 6, 8, 6, 6, 0, 2, 0, 0, 0, 2, 0, 0, 2, 4, 2, 2, 2, 4, 2, 2, 0, 2, 0, 0, 0, 2, 0, 0, + 4, 6, 4, 4, 4, 6, 4, 4, 0, 2, 0, 0, 0, 2, 0, 0, 2, 4, 2, 2, 2, 4, 2, 2, 0, 2, 0, 0, 0, 2, 0, 0, + }, + { + 0, 4, 2, 2, 0, 0, 0, 0, 0, 4, 2, 2, 0, 0, 0, 0, 0, 4, 2, 2, 0, 0, 0, 0, 0, 4, 2, 2, 0, 0, 0, 0, + 2, 6, 4, 4, 2, 2, 2, 2, 2, 6, 4, 4, 2, 2, 2, 2, 0, 4, 2, 2, 0, 0, 0, 0, 0, 4, 2, 2, 0, 0, 0, 0, + 4, 8, 6, 6, 4, 4, 4, 4, 4, 8, 6, 6, 4, 4, 4, 4, 0, 4, 2, 2, 0, 0, 0, 0, 0, 4, 2, 2, 0, 0, 0, 0, + 2, 6, 4, 4, 2, 2, 2, 2, 2, 6, 4, 4, 2, 2, 2, 2, 0, 4, 2, 2, 0, 0, 0, 0, 0, 4, 2, 2, 0, 0, 0, 0, + 6, 10, 8, 8, 6, 6, 6, 6, 6, 10, 8, 8, 6, 6, 6, 6, 0, 4, 2, 2, 0, 0, 0, 0, 0, 4, 2, 2, 0, 0, 0, 0, + 2, 6, 4, 4, 2, 2, 2, 2, 2, 6, 4, 4, 2, 2, 2, 2, 0, 4, 2, 2, 0, 0, 0, 0, 0, 4, 2, 2, 0, 0, 0, 0, + 4, 8, 6, 6, 4, 4, 4, 4, 4, 8, 6, 6, 4, 4, 4, 4, 0, 4, 2, 2, 0, 0, 0, 0, 0, 4, 2, 2, 0, 0, 0, 0, + 2, 6, 4, 4, 2, 2, 2, 2, 2, 6, 4, 4, 2, 2, 2, 2, 0, 4, 2, 2, 0, 0, 0, 0, 0, 4, 2, 2, 0, 0, 0, 0, + }, + { + 0, 6, 4, 4, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 4, 4, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 6, 4, 4, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 6, 4, 4, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, + 2, 8, 6, 6, 4, 4, 4, 4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 8, 6, 6, 4, 4, 4, 4, 2, 2, 2, 2, 2, 2, 2, 2, + 0, 6, 4, 4, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 4, 4, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, + 4, 10, 8, 8, 6, 6, 6, 6, 4, 4, 4, 4, 4, 4, 4, 4, 4, 10, 8, 8, 6, 6, 6, 6, 4, 4, 4, 4, 4, 4, 4, 4, + 0, 6, 4, 4, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 4, 4, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, + 2, 8, 6, 6, 4, 4, 4, 4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 8, 6, 6, 4, 4, 4, 4, 2, 2, 2, 2, 2, 2, 2, 2, + 0, 6, 4, 4, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 4, 4, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, + }, + { + 0, 8, 6, 6, 4, 4, 4, 4, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 8, 6, 6, 4, 4, 4, 4, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 8, 6, 6, 4, 4, 4, 4, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 8, 6, 6, 4, 4, 4, 4, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 2, 10, 8, 8, 6, 6, 6, 6, 4, 4, 4, 4, 4, 4, 4, 4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 10, 8, 8, 6, 6, 6, 6, 4, 4, 4, 4, 4, 4, 4, 4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 0, 8, 6, 6, 4, 4, 4, 4, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 8, 6, 6, 4, 4, 4, 4, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + }, + { + 0, 10, 8, 8, 6, 6, 6, 6, 4, 4, 4, 4, 4, 4, 4, 4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 10, 8, 8, 6, 6, 6, 6, 4, 4, 4, 4, 4, 4, 4, 4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 10, 8, 8, 6, 6, 6, 6, 4, 4, 4, 4, 4, 4, 4, 4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 10, 8, 8, 6, 6, 6, 6, 4, 4, 4, 4, 4, 4, 4, 4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + }, + { + 0, 12, 10, 10, 8, 8, 8, 8, 6, 6, 6, 6, 6, 6, 6, 6, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 12, 10, 10, 8, 8, 8, 8, 6, 6, 6, 6, 6, 6, 6, 6, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + }, +}; + +#ifdef HAS_CPU_64 +/* bit masks for diagonal lines (interleaved) */ +const uint64x2_t mask_dvhd[64][2] = { + {{ 0x000000000000ff01, 0x0000000000000000 }, { 0x0801040102010101, 0x8001400120011001 }}, + {{ 0x000000000001ff02, 0x0000000000000000 }, { 0x1002080204020202, 0x0002800240022002 }}, + {{ 0x000000010002ff04, 0x0000000000000000 }, { 0x2004100408040404, 0x0004000480044004 }}, + {{ 0x000100020004ff08, 0x0000000000000000 }, { 0x4008200810080808, 0x0008000800088008 }}, + {{ 0x000200040008ff10, 0x0000000000000001 }, { 0x8010401020101010, 0x0010001000100010 }}, + {{ 0x000400080010ff20, 0x0000000000010002 }, { 0x0020802040202020, 0x0020002000200020 }}, + {{ 0x000800100020ff40, 0x0000000100020004 }, { 0x0040004080404040, 0x0040004000400040 }}, + {{ 0x001000200040ff80, 0x0001000200040008 }, { 0x0080008000808080, 0x0080008000800080 }}, + {{ 0x00000000ff010002, 0x0000000000000000 }, { 0x0401020101010001, 0x4001200110010801 }}, + {{ 
0x00000001ff020004, 0x0000000000000000 }, { 0x0802040202020102, 0x8002400220021002 }}, + {{ 0x00010002ff040008, 0x0000000000000000 }, { 0x1004080404040204, 0x0004800440042004 }}, + {{ 0x00020004ff080010, 0x0000000000000001 }, { 0x2008100808080408, 0x0008000880084008 }}, + {{ 0x00040008ff100020, 0x0000000000010002 }, { 0x4010201010100810, 0x0010001000108010 }}, + {{ 0x00080010ff200040, 0x0000000100020004 }, { 0x8020402020201020, 0x0020002000200020 }}, + {{ 0x00100020ff400080, 0x0001000200040008 }, { 0x0040804040402040, 0x0040004000400040 }}, + {{ 0x00200040ff800000, 0x0002000400080010 }, { 0x0080008080804080, 0x0080008000800080 }}, + {{ 0x0000ff0100020004, 0x0000000000000000 }, { 0x0201010100010001, 0x2001100108010401 }}, + {{ 0x0001ff0200040008, 0x0000000000000000 }, { 0x0402020201020002, 0x4002200210020802 }}, + {{ 0x0002ff0400080010, 0x0000000000000001 }, { 0x0804040402040104, 0x8004400420041004 }}, + {{ 0x0004ff0800100020, 0x0000000000010002 }, { 0x1008080804080208, 0x0008800840082008 }}, + {{ 0x0008ff1000200040, 0x0000000100020004 }, { 0x2010101008100410, 0x0010001080104010 }}, + {{ 0x0010ff2000400080, 0x0001000200040008 }, { 0x4020202010200820, 0x0020002000208020 }}, + {{ 0x0020ff4000800000, 0x0002000400080010 }, { 0x8040404020401040, 0x0040004000400040 }}, + {{ 0x0040ff8000000000, 0x0004000800100020 }, { 0x0080808040802080, 0x0080008000800080 }}, + {{ 0xff01000200040008, 0x0000000000000000 }, { 0x0101000100010001, 0x1001080104010201 }}, + {{ 0xff02000400080010, 0x0000000000000001 }, { 0x0202010200020002, 0x2002100208020402 }}, + {{ 0xff04000800100020, 0x0000000000010002 }, { 0x0404020401040004, 0x4004200410040804 }}, + {{ 0xff08001000200040, 0x0000000100020004 }, { 0x0808040802080108, 0x8008400820081008 }}, + {{ 0xff10002000400080, 0x0001000200040008 }, { 0x1010081004100210, 0x0010801040102010 }}, + {{ 0xff20004000800000, 0x0002000400080010 }, { 0x2020102008200420, 0x0020002080204020 }}, + {{ 0xff40008000000000, 0x0004000800100020 }, { 0x4040204010400840, 
0x0040004000408040 }}, + {{ 0xff80000000000000, 0x0008001000200040 }, { 0x8080408020801080, 0x0080008000800080 }}, + {{ 0x0002000400080010, 0x000000000000ff01 }, { 0x0001000100010001, 0x0801040102010101 }}, + {{ 0x0004000800100020, 0x000000000001ff02 }, { 0x0102000200020002, 0x1002080204020202 }}, + {{ 0x0008001000200040, 0x000000010002ff04 }, { 0x0204010400040004, 0x2004100408040404 }}, + {{ 0x0010002000400080, 0x000100020004ff08 }, { 0x0408020801080008, 0x4008200810080808 }}, + {{ 0x0020004000800000, 0x000200040008ff10 }, { 0x0810041002100110, 0x8010401020101010 }}, + {{ 0x0040008000000000, 0x000400080010ff20 }, { 0x1020082004200220, 0x0020802040202020 }}, + {{ 0x0080000000000000, 0x000800100020ff40 }, { 0x2040104008400440, 0x0040004080404040 }}, + {{ 0x0000000000000000, 0x001000200040ff80 }, { 0x4080208010800880, 0x0080008000808080 }}, + {{ 0x0004000800100020, 0x00000000ff010002 }, { 0x0001000100010001, 0x0401020101010001 }}, + {{ 0x0008001000200040, 0x00000001ff020004 }, { 0x0002000200020002, 0x0802040202020102 }}, + {{ 0x0010002000400080, 0x00010002ff040008 }, { 0x0104000400040004, 0x1004080404040204 }}, + {{ 0x0020004000800000, 0x00020004ff080010 }, { 0x0208010800080008, 0x2008100808080408 }}, + {{ 0x0040008000000000, 0x00040008ff100020 }, { 0x0410021001100010, 0x4010201010100810 }}, + {{ 0x0080000000000000, 0x00080010ff200040 }, { 0x0820042002200120, 0x8020402020201020 }}, + {{ 0x0000000000000000, 0x00100020ff400080 }, { 0x1040084004400240, 0x0040804040402040 }}, + {{ 0x0000000000000000, 0x00200040ff800000 }, { 0x2080108008800480, 0x0080008080804080 }}, + {{ 0x0008001000200040, 0x0000ff0100020004 }, { 0x0001000100010001, 0x0201010100010001 }}, + {{ 0x0010002000400080, 0x0001ff0200040008 }, { 0x0002000200020002, 0x0402020201020002 }}, + {{ 0x0020004000800000, 0x0002ff0400080010 }, { 0x0004000400040004, 0x0804040402040104 }}, + {{ 0x0040008000000000, 0x0004ff0800100020 }, { 0x0108000800080008, 0x1008080804080208 }}, + {{ 0x0080000000000000, 0x0008ff1000200040 
}, { 0x0210011000100010, 0x2010101008100410 }}, + {{ 0x0000000000000000, 0x0010ff2000400080 }, { 0x0420022001200020, 0x4020202010200820 }}, + {{ 0x0000000000000000, 0x0020ff4000800000 }, { 0x0840044002400140, 0x8040404020401040 }}, + {{ 0x0000000000000000, 0x0040ff8000000000 }, { 0x1080088004800280, 0x0080808040802080 }}, + {{ 0x0010002000400080, 0xff01000200040008 }, { 0x0001000100010001, 0x0101000100010001 }}, + {{ 0x0020004000800000, 0xff02000400080010 }, { 0x0002000200020002, 0x0202010200020002 }}, + {{ 0x0040008000000000, 0xff04000800100020 }, { 0x0004000400040004, 0x0404020401040004 }}, + {{ 0x0080000000000000, 0xff08001000200040 }, { 0x0008000800080008, 0x0808040802080108 }}, + {{ 0x0000000000000000, 0xff10002000400080 }, { 0x0110001000100010, 0x1010081004100210 }}, + {{ 0x0000000000000000, 0xff20004000800000 }, { 0x0220012000200020, 0x2020102008200420 }}, + {{ 0x0000000000000000, 0xff40008000000000 }, { 0x0440024001400040, 0x4040204010400840 }}, + {{ 0x0000000000000000, 0xff80000000000000 }, { 0x0880048002800180, 0x8080408020801080 }} +}; +#else +/* bit masks for diagonal lines */ +const uint64x2_t mask_dvhd[64][2] = { + {{ 0x0000000000000001, 0x00000000000000ff }, { 0x0101010101010101, 0x8040201008040201 }}, + {{ 0x0000000000000102, 0x00000000000000ff }, { 0x0202020202020202, 0x0080402010080402 }}, + {{ 0x0000000000010204, 0x00000000000000ff }, { 0x0404040404040404, 0x0000804020100804 }}, + {{ 0x0000000001020408, 0x00000000000000ff }, { 0x0808080808080808, 0x0000008040201008 }}, + {{ 0x0000000102040810, 0x00000000000000ff }, { 0x1010101010101010, 0x0000000080402010 }}, + {{ 0x0000010204081020, 0x00000000000000ff }, { 0x2020202020202020, 0x0000000000804020 }}, + {{ 0x0001020408102040, 0x00000000000000ff }, { 0x4040404040404040, 0x0000000000008040 }}, + {{ 0x0102040810204080, 0x00000000000000ff }, { 0x8080808080808080, 0x0000000000000080 }}, + {{ 0x0000000000000102, 0x000000000000ff00 }, { 0x0101010101010101, 0x4020100804020100 }}, + {{ 0x0000000000010204, 
0x000000000000ff00 }, { 0x0202020202020202, 0x8040201008040201 }}, + {{ 0x0000000001020408, 0x000000000000ff00 }, { 0x0404040404040404, 0x0080402010080402 }}, + {{ 0x0000000102040810, 0x000000000000ff00 }, { 0x0808080808080808, 0x0000804020100804 }}, + {{ 0x0000010204081020, 0x000000000000ff00 }, { 0x1010101010101010, 0x0000008040201008 }}, + {{ 0x0001020408102040, 0x000000000000ff00 }, { 0x2020202020202020, 0x0000000080402010 }}, + {{ 0x0102040810204080, 0x000000000000ff00 }, { 0x4040404040404040, 0x0000000000804020 }}, + {{ 0x0204081020408000, 0x000000000000ff00 }, { 0x8080808080808080, 0x0000000000008040 }}, + {{ 0x0000000000010204, 0x0000000000ff0000 }, { 0x0101010101010101, 0x2010080402010000 }}, + {{ 0x0000000001020408, 0x0000000000ff0000 }, { 0x0202020202020202, 0x4020100804020100 }}, + {{ 0x0000000102040810, 0x0000000000ff0000 }, { 0x0404040404040404, 0x8040201008040201 }}, + {{ 0x0000010204081020, 0x0000000000ff0000 }, { 0x0808080808080808, 0x0080402010080402 }}, + {{ 0x0001020408102040, 0x0000000000ff0000 }, { 0x1010101010101010, 0x0000804020100804 }}, + {{ 0x0102040810204080, 0x0000000000ff0000 }, { 0x2020202020202020, 0x0000008040201008 }}, + {{ 0x0204081020408000, 0x0000000000ff0000 }, { 0x4040404040404040, 0x0000000080402010 }}, + {{ 0x0408102040800000, 0x0000000000ff0000 }, { 0x8080808080808080, 0x0000000000804020 }}, + {{ 0x0000000001020408, 0x00000000ff000000 }, { 0x0101010101010101, 0x1008040201000000 }}, + {{ 0x0000000102040810, 0x00000000ff000000 }, { 0x0202020202020202, 0x2010080402010000 }}, + {{ 0x0000010204081020, 0x00000000ff000000 }, { 0x0404040404040404, 0x4020100804020100 }}, + {{ 0x0001020408102040, 0x00000000ff000000 }, { 0x0808080808080808, 0x8040201008040201 }}, + {{ 0x0102040810204080, 0x00000000ff000000 }, { 0x1010101010101010, 0x0080402010080402 }}, + {{ 0x0204081020408000, 0x00000000ff000000 }, { 0x2020202020202020, 0x0000804020100804 }}, + {{ 0x0408102040800000, 0x00000000ff000000 }, { 0x4040404040404040, 0x0000008040201008 }}, 
+ {{ 0x0810204080000000, 0x00000000ff000000 }, { 0x8080808080808080, 0x0000000080402010 }}, + {{ 0x0000000102040810, 0x000000ff00000000 }, { 0x0101010101010101, 0x0804020100000000 }}, + {{ 0x0000010204081020, 0x000000ff00000000 }, { 0x0202020202020202, 0x1008040201000000 }}, + {{ 0x0001020408102040, 0x000000ff00000000 }, { 0x0404040404040404, 0x2010080402010000 }}, + {{ 0x0102040810204080, 0x000000ff00000000 }, { 0x0808080808080808, 0x4020100804020100 }}, + {{ 0x0204081020408000, 0x000000ff00000000 }, { 0x1010101010101010, 0x8040201008040201 }}, + {{ 0x0408102040800000, 0x000000ff00000000 }, { 0x2020202020202020, 0x0080402010080402 }}, + {{ 0x0810204080000000, 0x000000ff00000000 }, { 0x4040404040404040, 0x0000804020100804 }}, + {{ 0x1020408000000000, 0x000000ff00000000 }, { 0x8080808080808080, 0x0000008040201008 }}, + {{ 0x0000010204081020, 0x0000ff0000000000 }, { 0x0101010101010101, 0x0402010000000000 }}, + {{ 0x0001020408102040, 0x0000ff0000000000 }, { 0x0202020202020202, 0x0804020100000000 }}, + {{ 0x0102040810204080, 0x0000ff0000000000 }, { 0x0404040404040404, 0x1008040201000000 }}, + {{ 0x0204081020408000, 0x0000ff0000000000 }, { 0x0808080808080808, 0x2010080402010000 }}, + {{ 0x0408102040800000, 0x0000ff0000000000 }, { 0x1010101010101010, 0x4020100804020100 }}, + {{ 0x0810204080000000, 0x0000ff0000000000 }, { 0x2020202020202020, 0x8040201008040201 }}, + {{ 0x1020408000000000, 0x0000ff0000000000 }, { 0x4040404040404040, 0x0080402010080402 }}, + {{ 0x2040800000000000, 0x0000ff0000000000 }, { 0x8080808080808080, 0x0000804020100804 }}, + {{ 0x0001020408102040, 0x00ff000000000000 }, { 0x0101010101010101, 0x0201000000000000 }}, + {{ 0x0102040810204080, 0x00ff000000000000 }, { 0x0202020202020202, 0x0402010000000000 }}, + {{ 0x0204081020408000, 0x00ff000000000000 }, { 0x0404040404040404, 0x0804020100000000 }}, + {{ 0x0408102040800000, 0x00ff000000000000 }, { 0x0808080808080808, 0x1008040201000000 }}, + {{ 0x0810204080000000, 0x00ff000000000000 }, { 
0x1010101010101010, 0x2010080402010000 }}, + {{ 0x1020408000000000, 0x00ff000000000000 }, { 0x2020202020202020, 0x4020100804020100 }}, + {{ 0x2040800000000000, 0x00ff000000000000 }, { 0x4040404040404040, 0x8040201008040201 }}, + {{ 0x4080000000000000, 0x00ff000000000000 }, { 0x8080808080808080, 0x0080402010080402 }}, + {{ 0x0102040810204080, 0xff00000000000000 }, { 0x0101010101010101, 0x0100000000000000 }}, + {{ 0x0204081020408000, 0xff00000000000000 }, { 0x0202020202020202, 0x0201000000000000 }}, + {{ 0x0408102040800000, 0xff00000000000000 }, { 0x0404040404040404, 0x0402010000000000 }}, + {{ 0x0810204080000000, 0xff00000000000000 }, { 0x0808080808080808, 0x0804020100000000 }}, + {{ 0x1020408000000000, 0xff00000000000000 }, { 0x1010101010101010, 0x1008040201000000 }}, + {{ 0x2040800000000000, 0xff00000000000000 }, { 0x2020202020202020, 0x2010080402010000 }}, + {{ 0x4080000000000000, 0xff00000000000000 }, { 0x4040404040404040, 0x4020100804020100 }}, + {{ 0x8000000000000000, 0xff00000000000000 }, { 0x8080808080808080, 0x8040201008040201 }} +}; +#endif + +/** + * Count last flipped discs when playing on the last empty. + * + * @param pos the last empty square. + * @param P player's disc pattern. + * @return flipped disc count. 
+ */ + +int last_flip(int pos, unsigned long long P) +{ + unsigned int n_flips; + const unsigned char *COUNT_FLIP_X = COUNT_FLIP[pos & 7]; /* table row chosen by the low 3 bits of pos */ + const unsigned char *COUNT_FLIP_Y = COUNT_FLIP[pos >> 3]; /* table row chosen by pos / 8 */ + uint64x2_t PP = vdupq_n_u64(P); /* P in both 64-bit lanes */ + uint64x2_t II; +#ifdef HAS_CPU_64 // vaddvq + unsigned int t; + const uint64x2_t dmask = { 0x0808040402020101, 0x8080404020201010 }; /* byte pair k carries the single bit 1 << k */ + + PP = vreinterpretq_u64_u8(vzip1q_u8(vreinterpretq_u8_u64(PP), vreinterpretq_u8_u64(PP))); /* zip P with itself: each byte of P now appears twice in a row */ + II = vandq_u64(PP, mask_dvhd[pos][0]); // 2 dirs interleaved + t = vaddvq_u16(vreinterpretq_u16_u64(II)); /* sum of the eight 16-bit lanes */ + n_flips = COUNT_FLIP_X[t >> 8]; /* high byte of the sum */ + n_flips += COUNT_FLIP_X[t & 0xFF]; /* low byte of the sum */ + II = vandq_u64(vreinterpretq_u64_u8(vtstq_u8(vreinterpretq_u8_u64(PP), vreinterpretq_u8_u64(mask_dvhd[pos][1]))), dmask); /* vtstq_u8 gives 0xFF per byte where PP & mask != 0; dmask keeps one bit of it */ + t = vaddvq_u16(vreinterpretq_u16_u64(II)); + n_flips += COUNT_FLIP_Y[t >> 8]; + n_flips += COUNT_FLIP_Y[t & 0xFF]; + +#else // Neon kindergarten + const uint64x2_t dmask = { 0x1020408001020408, 0x1020408001020408 }; /* per-32-bit-lane multipliers 0x01020408 / 0x10204080 */ + + II = vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(vreinterpretq_u8_u64(vandq_u64(PP, mask_dvhd[pos][0]))))); /* three pairwise widening adds: byte sum of each 64-bit lane */ + n_flips = COUNT_FLIP_X[vgetq_lane_u32(vreinterpretq_u32_u64(II), 2)]; + n_flips += COUNT_FLIP_X[vgetq_lane_u32(vreinterpretq_u32_u64(II), 0)]; + II = vreinterpretq_u64_s8(vnegq_s8(vreinterpretq_s8_u8(vtstq_u8(vreinterpretq_u8_u64(PP), vreinterpretq_u8_u64(mask_dvhd[pos][1]))))); /* 0xFF from vtstq_u8 negated to 1: a 0/1 flag per byte */ + II = vpaddlq_u32(vmulq_u32(vreinterpretq_u32_u64(dmask), vreinterpretq_u32_u64(II))); /* multiply gathers the flags into the top byte of each 32-bit lane; the pairwise add merges the two lanes */ + n_flips += COUNT_FLIP_Y[vgetq_lane_u8(vreinterpretq_u8_u64(II), 11)]; /* byte 3 of the upper 64-bit lane */ + n_flips += COUNT_FLIP_Y[vgetq_lane_u8(vreinterpretq_u8_u64(II), 3)]; /* byte 3 of the lower 64-bit lane */ +#endif + return n_flips; +} + diff --git a/src/count_last_flip_plain.c b/src/count_last_flip_plain.c index 24beb6e0..e13e3aad 100644 --- a/src/count_last_flip_plain.c +++ b/src/count_last_flip_plain.c @@ -26,7 +26,7 @@ */ /** precomputed count flip array */ -static const unsigned char COUNT_FLIP[8][256] = { +const unsigned char COUNT_FLIP[8][256] = { { 0, 0, 0, 0, 2, 2, 0, 0, 4, 4, 0, 0, 2, 2, 0, 
0, 6, 6, 0, 0, 2, 2, 0, 0, 4, 4, 0, 0, 2, 2, 0, 0, 8, 8, 0, 0, 2, 2, 0, 0, 4, 4, 0, 0, 2, 2, 0, 0, 6, 6, 0, 0, 2, 2, 0, 0, 4, 4, 0, 0, 2, 2, 0, 0, @@ -110,7 +110,7 @@ static const unsigned char COUNT_FLIP[8][256] = { }; /* bit masks for diagonal lines */ -static const unsigned long long mask_d[2][64] = { +const unsigned long long mask_d[2][64] = { { 0x0000000000000001ULL, 0x0000000000000102ULL, 0x0000000000010204ULL, 0x0000000001020408ULL, 0x0000000102040810ULL, 0x0000010204081020ULL, 0x0001020408102040ULL, 0x0102040810204080ULL, @@ -149,6 +149,18 @@ static const unsigned long long mask_d[2][64] = { } }; +#ifdef HAS_CPU_64 + +#define packV(P, x) (((((P) >> (x)) & 0x0101010101010101ULL) * 0x0102040810204080ULL) >> 56) +#define packD(PM) (((PM) * 0x0101010101010101ULL) >> 56) + +#else + +#define packV(P, x) (((((((unsigned int)(P)) >> (x)) & 0x01010101u) + (((((unsigned int)((P) >> 32)) >> (x)) & 0x01010101u) << 4)) * 0x01020408u) >> 24) +#define packD(PM) (((((unsigned int)(PM)) * 0x01010101u) + (((unsigned int)((PM) >> 32)) * 0x01010101u)) >> 24) + +#endif // HAS_CPU_64 + /** * Count last flipped discs when playing on the last empty. 
* @@ -158,15 +170,17 @@ static const unsigned long long mask_d[2][64] = { */ int last_flip(int pos, unsigned long long P) { - unsigned char n_flipped; + unsigned long long PM; + int n_flipped; int x = pos & 0x07; int y = pos >> 3; - const unsigned char *COUNT_FLIP_X = COUNT_FLIP[x]; - n_flipped = COUNT_FLIP[y][(((P >> x) & 0x0101010101010101ULL) * 0x0102040810204080ULL) >> 56]; - n_flipped += (*COUNT_FLIP_X)[(unsigned char) (P >> (y * 8))]; - n_flipped += (*COUNT_FLIP_X)[((P & mask_d[0][pos]) * 0x0101010101010101ULL) >> 56]; - n_flipped += (*COUNT_FLIP_X)[((P & mask_d[1][pos]) * 0x0101010101010101ULL) >> 56]; + n_flipped = COUNT_FLIP[y][packV(P, x)]; + n_flipped += COUNT_FLIP[x][(unsigned char) (P >> (y * 8))]; + PM = P & mask_d[0][pos]; + n_flipped += COUNT_FLIP[x][packD(PM)]; + PM = P & mask_d[1][pos]; + n_flipped += COUNT_FLIP[x][packD(PM)]; return n_flipped; } diff --git a/src/count_last_flip_sse.c b/src/count_last_flip_sse.c new file mode 100644 index 00000000..45fe75de --- /dev/null +++ b/src/count_last_flip_sse.c @@ -0,0 +1,225 @@ +/** + * @file count_last_flip_sse.c + * + * + * A function is provided to count the number of flipped discs of the last move. + * + * The basic principle is to read into an array a precomputed result. Doing + * this is easy for a single line ; as we can use arrays of the form: + * - COUNT_FLIP[square where we play][8-bits disc pattern]. + * The problem is thus to convert any line of a 64-bits disc pattern into an + * 8-bits disc pattern. A fast way to do this is to select the right line, + * with a bit-mask, to gather the masked-bits into a continuous set by the + * SSE PMOVMSKB or PSADBW instruction. + * Once we get our 8-bits disc patterns, we directly get the number of + * flipped discs from the precomputed array, and add them from each flipping + * lines. + * For optimization purpose, the value returned is twice the number of flipped + * disc, to facilitate the computation of disc difference. 
+ * + * @date 1998 - 2023 + * @author Richard Delorme + * @author Toshihiko Okuhara + * @version 4.5 + * + */ + +#include "bit.h" +#include + +/** precomputed count flip array */ +const uint8_t COUNT_FLIP[8][256] = { + { + 0, 0, 0, 0, 2, 2, 0, 0, 4, 4, 0, 0, 2, 2, 0, 0, 6, 6, 0, 0, 2, 2, 0, 0, 4, 4, 0, 0, 2, 2, 0, 0, + 8, 8, 0, 0, 2, 2, 0, 0, 4, 4, 0, 0, 2, 2, 0, 0, 6, 6, 0, 0, 2, 2, 0, 0, 4, 4, 0, 0, 2, 2, 0, 0, + 10, 10, 0, 0, 2, 2, 0, 0, 4, 4, 0, 0, 2, 2, 0, 0, 6, 6, 0, 0, 2, 2, 0, 0, 4, 4, 0, 0, 2, 2, 0, 0, + 8, 8, 0, 0, 2, 2, 0, 0, 4, 4, 0, 0, 2, 2, 0, 0, 6, 6, 0, 0, 2, 2, 0, 0, 4, 4, 0, 0, 2, 2, 0, 0, + 12, 12, 0, 0, 2, 2, 0, 0, 4, 4, 0, 0, 2, 2, 0, 0, 6, 6, 0, 0, 2, 2, 0, 0, 4, 4, 0, 0, 2, 2, 0, 0, + 8, 8, 0, 0, 2, 2, 0, 0, 4, 4, 0, 0, 2, 2, 0, 0, 6, 6, 0, 0, 2, 2, 0, 0, 4, 4, 0, 0, 2, 2, 0, 0, + 10, 10, 0, 0, 2, 2, 0, 0, 4, 4, 0, 0, 2, 2, 0, 0, 6, 6, 0, 0, 2, 2, 0, 0, 4, 4, 0, 0, 2, 2, 0, 0, + 8, 8, 0, 0, 2, 2, 0, 0, 4, 4, 0, 0, 2, 2, 0, 0, 6, 6, 0, 0, 2, 2, 0, 0, 4, 4, 0, 0, 2, 2, 0, 0, + }, + { + 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 0, 0, 0, 0, 4, 4, 4, 4, 0, 0, 0, 0, 2, 2, 2, 2, 0, 0, 0, 0, + 6, 6, 6, 6, 0, 0, 0, 0, 2, 2, 2, 2, 0, 0, 0, 0, 4, 4, 4, 4, 0, 0, 0, 0, 2, 2, 2, 2, 0, 0, 0, 0, + 8, 8, 8, 8, 0, 0, 0, 0, 2, 2, 2, 2, 0, 0, 0, 0, 4, 4, 4, 4, 0, 0, 0, 0, 2, 2, 2, 2, 0, 0, 0, 0, + 6, 6, 6, 6, 0, 0, 0, 0, 2, 2, 2, 2, 0, 0, 0, 0, 4, 4, 4, 4, 0, 0, 0, 0, 2, 2, 2, 2, 0, 0, 0, 0, + 10, 10, 10, 10, 0, 0, 0, 0, 2, 2, 2, 2, 0, 0, 0, 0, 4, 4, 4, 4, 0, 0, 0, 0, 2, 2, 2, 2, 0, 0, 0, 0, + 6, 6, 6, 6, 0, 0, 0, 0, 2, 2, 2, 2, 0, 0, 0, 0, 4, 4, 4, 4, 0, 0, 0, 0, 2, 2, 2, 2, 0, 0, 0, 0, + 8, 8, 8, 8, 0, 0, 0, 0, 2, 2, 2, 2, 0, 0, 0, 0, 4, 4, 4, 4, 0, 0, 0, 0, 2, 2, 2, 2, 0, 0, 0, 0, + 6, 6, 6, 6, 0, 0, 0, 0, 2, 2, 2, 2, 0, 0, 0, 0, 4, 4, 4, 4, 0, 0, 0, 0, 2, 2, 2, 2, 0, 0, 0, 0, + }, + { + 0, 2, 0, 0, 0, 2, 0, 0, 0, 2, 0, 0, 0, 2, 0, 0, 2, 4, 2, 2, 2, 4, 2, 2, 0, 2, 0, 0, 0, 2, 0, 0, + 4, 6, 4, 4, 4, 6, 4, 4, 0, 2, 0, 0, 0, 2, 0, 0, 2, 4, 2, 2, 2, 4, 2, 2, 0, 2, 0, 
0, 0, 2, 0, 0, + 6, 8, 6, 6, 6, 8, 6, 6, 0, 2, 0, 0, 0, 2, 0, 0, 2, 4, 2, 2, 2, 4, 2, 2, 0, 2, 0, 0, 0, 2, 0, 0, + 4, 6, 4, 4, 4, 6, 4, 4, 0, 2, 0, 0, 0, 2, 0, 0, 2, 4, 2, 2, 2, 4, 2, 2, 0, 2, 0, 0, 0, 2, 0, 0, + 8, 10, 8, 8, 8, 10, 8, 8, 0, 2, 0, 0, 0, 2, 0, 0, 2, 4, 2, 2, 2, 4, 2, 2, 0, 2, 0, 0, 0, 2, 0, 0, + 4, 6, 4, 4, 4, 6, 4, 4, 0, 2, 0, 0, 0, 2, 0, 0, 2, 4, 2, 2, 2, 4, 2, 2, 0, 2, 0, 0, 0, 2, 0, 0, + 6, 8, 6, 6, 6, 8, 6, 6, 0, 2, 0, 0, 0, 2, 0, 0, 2, 4, 2, 2, 2, 4, 2, 2, 0, 2, 0, 0, 0, 2, 0, 0, + 4, 6, 4, 4, 4, 6, 4, 4, 0, 2, 0, 0, 0, 2, 0, 0, 2, 4, 2, 2, 2, 4, 2, 2, 0, 2, 0, 0, 0, 2, 0, 0, + }, + { + 0, 4, 2, 2, 0, 0, 0, 0, 0, 4, 2, 2, 0, 0, 0, 0, 0, 4, 2, 2, 0, 0, 0, 0, 0, 4, 2, 2, 0, 0, 0, 0, + 2, 6, 4, 4, 2, 2, 2, 2, 2, 6, 4, 4, 2, 2, 2, 2, 0, 4, 2, 2, 0, 0, 0, 0, 0, 4, 2, 2, 0, 0, 0, 0, + 4, 8, 6, 6, 4, 4, 4, 4, 4, 8, 6, 6, 4, 4, 4, 4, 0, 4, 2, 2, 0, 0, 0, 0, 0, 4, 2, 2, 0, 0, 0, 0, + 2, 6, 4, 4, 2, 2, 2, 2, 2, 6, 4, 4, 2, 2, 2, 2, 0, 4, 2, 2, 0, 0, 0, 0, 0, 4, 2, 2, 0, 0, 0, 0, + 6, 10, 8, 8, 6, 6, 6, 6, 6, 10, 8, 8, 6, 6, 6, 6, 0, 4, 2, 2, 0, 0, 0, 0, 0, 4, 2, 2, 0, 0, 0, 0, + 2, 6, 4, 4, 2, 2, 2, 2, 2, 6, 4, 4, 2, 2, 2, 2, 0, 4, 2, 2, 0, 0, 0, 0, 0, 4, 2, 2, 0, 0, 0, 0, + 4, 8, 6, 6, 4, 4, 4, 4, 4, 8, 6, 6, 4, 4, 4, 4, 0, 4, 2, 2, 0, 0, 0, 0, 0, 4, 2, 2, 0, 0, 0, 0, + 2, 6, 4, 4, 2, 2, 2, 2, 2, 6, 4, 4, 2, 2, 2, 2, 0, 4, 2, 2, 0, 0, 0, 0, 0, 4, 2, 2, 0, 0, 0, 0, + }, + { + 0, 6, 4, 4, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 4, 4, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 6, 4, 4, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 4, 4, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, + 2, 8, 6, 6, 4, 4, 4, 4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 8, 6, 6, 4, 4, 4, 4, 2, 2, 2, 2, 2, 2, 2, 2, + 0, 6, 4, 4, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 4, 4, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, + 4, 10, 8, 8, 6, 6, 6, 6, 4, 4, 4, 4, 4, 4, 4, 4, 4, 10, 8, 8, 6, 6, 6, 6, 4, 4, 4, 4, 4, 4, 4, 4, + 0, 6, 4, 4, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 4, 4, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 
+ 2, 8, 6, 6, 4, 4, 4, 4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 8, 6, 6, 4, 4, 4, 4, 2, 2, 2, 2, 2, 2, 2, 2, + 0, 6, 4, 4, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 4, 4, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, + }, + { + 0, 8, 6, 6, 4, 4, 4, 4, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 8, 6, 6, 4, 4, 4, 4, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 8, 6, 6, 4, 4, 4, 4, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 8, 6, 6, 4, 4, 4, 4, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 2, 10, 8, 8, 6, 6, 6, 6, 4, 4, 4, 4, 4, 4, 4, 4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 10, 8, 8, 6, 6, 6, 6, 4, 4, 4, 4, 4, 4, 4, 4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 0, 8, 6, 6, 4, 4, 4, 4, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 8, 6, 6, 4, 4, 4, 4, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + }, + { + 0, 10, 8, 8, 6, 6, 6, 6, 4, 4, 4, 4, 4, 4, 4, 4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 10, 8, 8, 6, 6, 6, 6, 4, 4, 4, 4, 4, 4, 4, 4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 10, 8, 8, 6, 6, 6, 6, 4, 4, 4, 4, 4, 4, 4, 4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 10, 8, 8, 6, 6, 6, 6, 4, 4, 4, 4, 4, 4, 4, 4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + }, + { + 0, 12, 10, 10, 8, 8, 8, 8, 6, 6, 6, 6, 6, 6, 6, 6, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 12, 10, 10, 8, 8, 8, 8, 6, 6, 6, 6, 6, 6, 6, 6, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + }, +}; + +/* bit masks for diagonal lines */ +const V4DI mask_dvhd[64] = { + {{ 0x0000000000000001, 0x00000000000000ff, 0x0101010101010101, 0x8040201008040201 }}, + {{ 0x0000000000000102, 0x00000000000000ff, 0x0202020202020202, 0x0080402010080402 }}, + {{ 0x0000000000010204, 0x00000000000000ff, 0x0404040404040404, 0x0000804020100804 }}, + {{ 0x0000000001020408, 0x00000000000000ff, 0x0808080808080808, 0x0000008040201008 }}, + {{ 0x0000000102040810, 0x00000000000000ff, 0x1010101010101010, 0x0000000080402010 }}, + {{ 0x0000010204081020, 0x00000000000000ff, 0x2020202020202020, 0x0000000000804020 }}, + {{ 0x0001020408102040, 0x00000000000000ff, 0x4040404040404040, 0x0000000000008040 }}, + {{ 0x0102040810204080, 0x00000000000000ff, 0x8080808080808080, 0x0000000000000080 }}, + {{ 0x0000000000000102, 0x000000000000ff00, 0x0101010101010101, 0x4020100804020100 }}, + {{ 0x0000000000010204, 0x000000000000ff00, 0x0202020202020202, 0x8040201008040201 }}, + {{ 0x0000000001020408, 0x000000000000ff00, 0x0404040404040404, 0x0080402010080402 }}, + {{ 0x0000000102040810, 0x000000000000ff00, 0x0808080808080808, 0x0000804020100804 }}, + {{ 0x0000010204081020, 0x000000000000ff00, 0x1010101010101010, 0x0000008040201008 }}, + {{ 0x0001020408102040, 0x000000000000ff00, 0x2020202020202020, 0x0000000080402010 }}, + {{ 0x0102040810204080, 0x000000000000ff00, 0x4040404040404040, 0x0000000000804020 }}, + {{ 
0x0204081020408000, 0x000000000000ff00, 0x8080808080808080, 0x0000000000008040 }}, + {{ 0x0000000000010204, 0x0000000000ff0000, 0x0101010101010101, 0x2010080402010000 }}, + {{ 0x0000000001020408, 0x0000000000ff0000, 0x0202020202020202, 0x4020100804020100 }}, + {{ 0x0000000102040810, 0x0000000000ff0000, 0x0404040404040404, 0x8040201008040201 }}, + {{ 0x0000010204081020, 0x0000000000ff0000, 0x0808080808080808, 0x0080402010080402 }}, + {{ 0x0001020408102040, 0x0000000000ff0000, 0x1010101010101010, 0x0000804020100804 }}, + {{ 0x0102040810204080, 0x0000000000ff0000, 0x2020202020202020, 0x0000008040201008 }}, + {{ 0x0204081020408000, 0x0000000000ff0000, 0x4040404040404040, 0x0000000080402010 }}, + {{ 0x0408102040800000, 0x0000000000ff0000, 0x8080808080808080, 0x0000000000804020 }}, + {{ 0x0000000001020408, 0x00000000ff000000, 0x0101010101010101, 0x1008040201000000 }}, + {{ 0x0000000102040810, 0x00000000ff000000, 0x0202020202020202, 0x2010080402010000 }}, + {{ 0x0000010204081020, 0x00000000ff000000, 0x0404040404040404, 0x4020100804020100 }}, + {{ 0x0001020408102040, 0x00000000ff000000, 0x0808080808080808, 0x8040201008040201 }}, + {{ 0x0102040810204080, 0x00000000ff000000, 0x1010101010101010, 0x0080402010080402 }}, + {{ 0x0204081020408000, 0x00000000ff000000, 0x2020202020202020, 0x0000804020100804 }}, + {{ 0x0408102040800000, 0x00000000ff000000, 0x4040404040404040, 0x0000008040201008 }}, + {{ 0x0810204080000000, 0x00000000ff000000, 0x8080808080808080, 0x0000000080402010 }}, + {{ 0x0000000102040810, 0x000000ff00000000, 0x0101010101010101, 0x0804020100000000 }}, + {{ 0x0000010204081020, 0x000000ff00000000, 0x0202020202020202, 0x1008040201000000 }}, + {{ 0x0001020408102040, 0x000000ff00000000, 0x0404040404040404, 0x2010080402010000 }}, + {{ 0x0102040810204080, 0x000000ff00000000, 0x0808080808080808, 0x4020100804020100 }}, + {{ 0x0204081020408000, 0x000000ff00000000, 0x1010101010101010, 0x8040201008040201 }}, + {{ 0x0408102040800000, 0x000000ff00000000, 0x2020202020202020, 
0x0080402010080402 }}, + {{ 0x0810204080000000, 0x000000ff00000000, 0x4040404040404040, 0x0000804020100804 }}, + {{ 0x1020408000000000, 0x000000ff00000000, 0x8080808080808080, 0x0000008040201008 }}, + {{ 0x0000010204081020, 0x0000ff0000000000, 0x0101010101010101, 0x0402010000000000 }}, + {{ 0x0001020408102040, 0x0000ff0000000000, 0x0202020202020202, 0x0804020100000000 }}, + {{ 0x0102040810204080, 0x0000ff0000000000, 0x0404040404040404, 0x1008040201000000 }}, + {{ 0x0204081020408000, 0x0000ff0000000000, 0x0808080808080808, 0x2010080402010000 }}, + {{ 0x0408102040800000, 0x0000ff0000000000, 0x1010101010101010, 0x4020100804020100 }}, + {{ 0x0810204080000000, 0x0000ff0000000000, 0x2020202020202020, 0x8040201008040201 }}, + {{ 0x1020408000000000, 0x0000ff0000000000, 0x4040404040404040, 0x0080402010080402 }}, + {{ 0x2040800000000000, 0x0000ff0000000000, 0x8080808080808080, 0x0000804020100804 }}, + {{ 0x0001020408102040, 0x00ff000000000000, 0x0101010101010101, 0x0201000000000000 }}, + {{ 0x0102040810204080, 0x00ff000000000000, 0x0202020202020202, 0x0402010000000000 }}, + {{ 0x0204081020408000, 0x00ff000000000000, 0x0404040404040404, 0x0804020100000000 }}, + {{ 0x0408102040800000, 0x00ff000000000000, 0x0808080808080808, 0x1008040201000000 }}, + {{ 0x0810204080000000, 0x00ff000000000000, 0x1010101010101010, 0x2010080402010000 }}, + {{ 0x1020408000000000, 0x00ff000000000000, 0x2020202020202020, 0x4020100804020100 }}, + {{ 0x2040800000000000, 0x00ff000000000000, 0x4040404040404040, 0x8040201008040201 }}, + {{ 0x4080000000000000, 0x00ff000000000000, 0x8080808080808080, 0x0080402010080402 }}, + {{ 0x0102040810204080, 0xff00000000000000, 0x0101010101010101, 0x0100000000000000 }}, + {{ 0x0204081020408000, 0xff00000000000000, 0x0202020202020202, 0x0201000000000000 }}, + {{ 0x0408102040800000, 0xff00000000000000, 0x0404040404040404, 0x0402010000000000 }}, + {{ 0x0810204080000000, 0xff00000000000000, 0x0808080808080808, 0x0804020100000000 }}, + {{ 0x1020408000000000, 
0xff00000000000000, 0x1010101010101010, 0x1008040201000000 }}, + {{ 0x2040800000000000, 0xff00000000000000, 0x2020202020202020, 0x2010080402010000 }}, + {{ 0x4080000000000000, 0xff00000000000000, 0x4040404040404040, 0x4020100804020100 }}, + {{ 0x8000000000000000, 0xff00000000000000, 0x8080808080808080, 0x8040201008040201 }} +}; + +/** + * Count last flipped discs when playing on the last empty. + * + * @param pos the last empty square. + * @param P player's disc pattern. + * @return flipped disc count, doubled (COUNT_FLIP stores twice the count). + */ + +int last_flip(int pos, unsigned long long P) +{ + uint_fast8_t n_flips; + unsigned int t; + const uint8_t *COUNT_FLIP_X = COUNT_FLIP[pos & 7]; + const uint8_t *COUNT_FLIP_Y = COUNT_FLIP[pos >> 3]; + #ifdef AVXLASTFLIP // no gain + __m256i PP = _mm256_set1_epi64x(P); + + n_flips = COUNT_FLIP_X[(P >> (pos & 0x38)) & 0xFF]; /* byte of P that starts at bit (pos & 0x38) */ + #ifdef __AVX512VL__ + t = _cvtmask32_u32(_mm256_test_epi8_mask(PP, mask_dvhd[pos].v4)); + #else + t = _mm256_movemask_epi8(_mm256_sub_epi8(_mm256_setzero_si256(), _mm256_and_si256(PP, mask_dvhd[pos].v4))); + #endif + n_flips += COUNT_FLIP_Y[t & 0xFF]; + t >>= 16; /* the two lookups after the outer #endif consume mask bits 16-31 */ + + #else + __m128i PP = _mm_set1_epi64x(P); + __m128i II = _mm_sad_epu8(_mm_and_si128(PP, mask_dvhd[pos].v2[0]), _mm_setzero_si128()); /* SAD against zero: byte sum of each 64-bit half */ + + n_flips = COUNT_FLIP_X[_mm_extract_epi16(II, 4)]; /* sum held in the upper 64-bit half */ + n_flips += COUNT_FLIP_X[_mm_cvtsi128_si32(II)]; /* sum held in the lower 64-bit half */ + #ifdef __AVX512VL__ + t = _cvtmask16_u32(_mm_test_epi8_mask(PP, mask_dvhd[pos].v2[1])); + #else + t = _mm_movemask_epi8(_mm_sub_epi8(_mm_setzero_si128(), _mm_and_si128(PP, mask_dvhd[pos].v2[1]))); /* 0 - x sets the sign bit of each non-zero byte x <= 0x80; movemask collects those sign bits */ + #endif + #endif + n_flips += COUNT_FLIP_Y[t >> 8]; + n_flips += COUNT_FLIP_Y[t & 0xFF]; + + return n_flips; +} + diff --git a/src/count_last_flip_sve_lzcnt.c b/src/count_last_flip_sve_lzcnt.c new file mode 100644 index 00000000..85213e8e --- /dev/null +++ b/src/count_last_flip_sve_lzcnt.c @@ -0,0 +1,72 @@ +/** + * @file count_last_flip_sve_lzcnt.c + * + * A function is provided to count the number of flipped disc of the last move. 
+ * + * Count last flip using the flip_sve_lzcnt way. + * For optimization purpose, the value returned is twice the number of flipped + * disc, to facilitate the computation of disc difference. + * + * @date 2024 + * @author Toshihiko Okuhara + * @version 4.5 + * + */ + +#include + +/** precomputed count flip array */ +extern const unsigned char COUNT_FLIP[8][256]; + +/** + * Count last flipped discs when playing on the last empty. + * + * @param pos the last empty square. + * @param P player's disc pattern. + * @return twice the number of flipped discs. + */ + +#ifndef __ARM_FEATURE_SVE2 + // equivalent only if no intersection between masks +#define svbsl_u64(op1,op2,op3) svorr_u64_m(pg, (op2), svand_u64_x(pg, (op3), (op1))) +#define svbsl1n_u64(op1,op2,op3) svorr_u64_m(pg, (op2), svbic_u64_x(pg, (op3), (op1))) +#endif + +int last_flip(int pos, unsigned long long P) +{ + svuint64_t PP, p_flip, p_oflank, p_eraser, p_cap, mask; + svbool_t pg; + const uint64_t (*pmask)[8]; /* row of 8 uint64_t masks for pos */ + + PP = svdup_u64(P); + pmask = &lrmask[pos]; + pg = svwhilelt_b64(0, 4); /* predicate: 64-bit lanes with index < 4 */ + + mask = svld1_u64(pg, *pmask + 4); // right: clear all bits lower than outflank + p_oflank = svand_x(pg, mask, PP); + p_oflank = svand_x(pg, svclz_z(pg, p_oflank), 63); + p_eraser = svlsr_x(pg, svdup_u64(-1), p_oflank); + p_flip = svbic_x(pg, mask, p_eraser); + + mask = svld1_u64(pg, *pmask + 0); // left: look for player LS1B + p_oflank = svand_x(pg, mask, PP); + // set all bits lower than oflank, using saturation if oflank = 0 + p_cap = svbic_x(pg, svqsub(p_oflank, 1), p_oflank); + p_flip = svbsl_u64(p_cap, p_flip, mask); + + if (svcntd() == 2) { // sve128 only + mask = svld1_u64(pg, *pmask + 6); // right: set all bits higher than outflank + p_oflank = svand_x(pg, mask, PP); + p_oflank = svand_x(pg, svclz_z(pg, p_oflank), 63); + p_eraser = svlsr_x(pg, svdup_u64(-1), p_oflank); + p_flip = svbsl1n_u64(p_eraser, p_flip, mask); + + mask = svld1_u64(pg, *pmask + 2); // left: look for player LS1B + p_oflank = svand_x(pg, mask, PP); + // set 
all bits lower than oflank, using saturation if oflank = 0 + p_cap = svbic_x(pg, svqsub(p_oflank, 1), p_oflank); + p_flip = svbsl_u64(p_cap, p_flip, mask); + } + + return svaddv_u64(pg, svcnt_u64_x(pg, p_flip)) * 2; /* per-lane popcount, summed over the active lanes, doubled */ +} diff --git a/src/edax.c b/src/edax.c index cc380cbf..e25b9754 100644 --- a/src/edax.c +++ b/src/edax.c @@ -3,7 +3,7 @@ * * @brief Edax protocol. * - * This is version 4.4 of Edax User Interface. Several changes + * This is version 4.5 of Edax User Interface. Several changes * occurred between this version and 3.x previous versions because of the * evolution of the search engine. Here is a summary of the commands: * @@ -12,7 +12,7 @@ * -verbose [n] set Edax verbosity (default 1). * -noise [n] start displaying Edax search result from this depth\n (default 5). * -witdh [n] display edax search results using characters\n (default 80). - * -hash-table-size [n] set hashtable size (default 18 bits). + * -hash-table-size [n] set hashtable size (default 22 bits). * -n-tasks [n] control the number of parallel threads used in searching\n (default 1). * -l|level [n] search using limited depth (default 21). * -t|game-time