diff --git a/.clang-format b/.clang-format index 371f35b91..022be940b 100644 --- a/.clang-format +++ b/.clang-format @@ -29,10 +29,21 @@ ConstructorInitializerAllOnOneLineOrOnePerLine : true ConstructorInitializerIndentWidth : 4 ContinuationIndentWidth : 4 Cpp11BracedListStyle : true -DerivePointerAlignment : true +DerivePointerAlignment : false DisableFormat : false # ExperimentalAutoDetectBinPacking : # ForEachMacros : +IncludeCategories : + - Regex: '^ &` shall now accept `Particle &` - instead. -* Member functions of SMP bases classes are renamed to use a more consistent - pattern - - `pre_processor` -> `eval_pre` - - `post_processor` -> `eval_post` - - `initialize_param` -> `eval_param` - - `initialize_state` -> `eval_sp` - - `move_state` -> `eval_sp` - - `monitor_state` -> `eval_sp` - - `path_state` -> `eval_sp` - - `path_grid` -> `eval_grid` -* Member function in base value classes `StateMPI` and `StateCL` are renamed - - `copy_pre_processor` -> `copy_pre` - - `copy_post_processor` -> `copy_pre` -* Weight classes are renamed - - `WeightSet` -> `Weight` - - `WeightSetMPI` -> `WeightMPI` - The header file `weight_set.hpp` is also renamed to `weight` -* `Weight` (formerly `WeightSet`)'s interface has been overhauled - - `set_weight` -> `set` - - `set_log_weight` -> `set_log` - - `mul_weight` -> `mul` - - `add_log_weight` -> `add_log` - - `weight_data` -> `data` -* `Particle::weight_set` renamed to `Particle::weight` -* `weight_set_type` renamed to `weight_type` -* All occurrence of `Rng` are renamed to `RNG` - -## Removed features - -* Modules removed - - Thread - - GCD - - Integrate - - Core/Adapter - - Core/StateTuple - - SMP/Adapter - - RNG/GSL - - SMP/CILK - - SMP/GCD - - SMP/PPL - - SMP/STD - - OpenCL/Adapter - - Utility/Array - - Utility/Counter - - Utility/CString +# Changes in v2.2.0 ## New features -* `Monitor::record_data` gets an overload version that return the row pointer. 
-* `std::unique_ptr` alike wrappers for [Intel MKL][MKL] `VSLStreamStatePtr`, - `VSLSSTaskPtr`, `VSLConvTaskPtr`, `VSLCorrTaskPtr`, `DFTaskPtr`. - -## Bug fixes - -* The default resampling threshold when a user defined resampling algorithm is - provided in `Sampler`'s constructor is fixed to be always resampling, the - same as for the built-in schemes. - -# Changes since v1.1.1 - -## New features - -* `Monitor` gains a new parameter `stage`. A monitor maybe evaluated in - different stages of the initialization and iterations. See the documents of - `Monitor` constructor. -* A new optional argument of `Sampler::monitor` that allows setting the above - `stage` parameter when adding a new `Monitor`. -* `StateMatrix` has new overloaded `data` member function that is equivalent to - either `row_data` (`StateMatrix`) or `col_data` - (`StateMatrix`). -* The library now optionally use `` standard header. The - configuration macro is `VSMC_HAS_CXX11LIB_TYPE_TRAITS`. -* Multinomial and Residual resampling algorithms now use [Intel TBB][TBB]'s - `tbb::parallel_sort` to improve performance. This can be disabled by setting - `VSMC_USE_TBB` to zero even if `VSMC_HAS_TBB` is non-zero. -* New [Intel TBB][TBB] based thread local version of `RngSet` -* `WeightSet` now has a set of static member functions that can be used to - implement weights related operations. -* New classes in `rng/u01.hpp`, `U01SequenceSorted`, `U01SequenceStratified`, - `U01SequenceSystematic`, for generating sorted uniform random variates with - O(N) runtime cost and O(1) memory cost. These are primarily used resampling - algorithms within the library but can find other usages. +* `SingleParticle` now has iterator-like behaviors. +* `Particle` now has a new `sp` member that returns `SingleParticle` objects. +* `NormalMVDistribution` and related functions for generating multivariate + Normal distribution random variates.
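A multivariate Normal variate, as produced by a `NormalMVDistribution`-style generator, is an affine transform of i.i.d. standard Normal draws: x = μ + Lz, with L the lower Cholesky factor of the covariance. The sketch below only illustrates this math; the function name and packed-factor argument are hypothetical, not the library's API.

```cpp
#include <cstddef>
#include <vector>

// Hypothetical illustration of the transform behind a multivariate Normal
// generator: given the mean mu and the lower-triangular Cholesky factor of
// the covariance packed row by row (L(0,0), L(1,0), L(1,1), ...), map
// standard Normal draws z to x = mu + L * z.
std::vector<double> normal_mv_transform(const std::vector<double> &mu,
    const std::vector<double> &chol, const std::vector<double> &z)
{
    std::size_t n = mu.size();
    std::vector<double> x(mu);
    std::size_t idx = 0;
    for (std::size_t i = 0; i != n; ++i)
        for (std::size_t j = 0; j <= i; ++j)
            x[i] += chol[idx++] * z[j]; // accumulate row i of L times z
    return x;
}
```

In practice the `z` vector would come from repeated draws of `std::normal_distribution<double>`; with fixed inputs `mu = {1, 2}`, packed factor `{2, 1, 1}` and `z = {1, 1}` the transform yields `{3, 4}`.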
+* New module "Random Walk" that implements generic random walk MCMC kernels and + Normal distribution based proposals. Both scalar and multivariate cases are + supported. +* `Covariance` is a new class that can compute the covariance matrix from + (weighted) samples, and output the matrix in various formats. +* `cov_chol` transforms a covariance matrix, stored in various formats, into the + lower triangular part of its Cholesky decomposition, stored as a packed row + major matrix. ## Changed behaviors -* `CBlas` and `vMath` functions (vExp, etc.) no longer check threshold - configuration macros. In particular, the macro `VSMC_CLABS_THRESHOLD` and - `VSMC_VMATH_THRESHOLD` are no longer checked. -* `Particle` no longer check `resample_copy_from_replication_type` and - `resample_post_copy_type` for user defined resampling behaviors. It is easier - to write customized resampling algorithms as a `move` instead of messing with - `Particle`'s internal this way. +* `AlignedAllocator` template parameter `Memory` now requires its member + functions to be static and the class stateless. Unless you write your + allocator using this template, there is no need to change client code. +* `MatrixOrder` is now an alias to `MatrixLayout`. +* CBLAS and LAPACK are now required dependencies. -## Bug fixes - -* Fix Residual and related resampling algorithms in situations where the new - system has number of particles unequal to the old system. +## Removed features -# Changes since v1.1.0 +* `U01LRDistribution` is removed, while `U01CCDistribution` etc., remain. +* `UniformRealLRDistribution` and `UniformRealCCDistribution` etc., are + removed. +* Path sampling support is removed. It can be easily done through the more + general `Monitor`. See the GMM example. -## Changed behaviors +## Documentation +http://zhouyan.github.io/vSMCDoc/v2.2.0/ -* `Sampler` summary member functions, such as `summary_data` etc., now only - deal with floating point data, such as importance sampling estimates.
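The packed row-major output described for `cov_chol` can be sketched directly; the routine below is a hypothetical stand-in (not the library's implementation, which also handles other storage formats) that factors a full row-major N×N covariance matrix into its lower Cholesky factor L with cov = L·Lᵀ.

```cpp
#include <cmath>
#include <cstddef>
#include <vector>

// Hypothetical sketch of what a cov_chol-style routine computes: the lower
// triangular Cholesky factor of a full row-major n x n covariance matrix,
// stored packed row by row: L(0,0), L(1,0), L(1,1), L(2,0), ...
std::vector<double> cov_chol_sketch(
    const std::vector<double> &cov, std::size_t n)
{
    std::vector<double> chol(n * (n + 1) / 2);
    // index of L(i, j), j <= i, in the packed row-major layout
    auto packed = [](std::size_t i, std::size_t j) {
        return i * (i + 1) / 2 + j;
    };
    for (std::size_t i = 0; i != n; ++i) {
        for (std::size_t j = 0; j <= i; ++j) {
            double s = cov[i * n + j];
            for (std::size_t k = 0; k != j; ++k)
                s -= chol[packed(i, k)] * chol[packed(j, k)];
            chol[packed(i, j)] =
                (i == j) ? std::sqrt(s) : s / chol[packed(j, j)];
        }
    }
    return chol;
}
```

For the 2×2 covariance {{4, 2}, {2, 3}} the packed factor is {2, 1, √2}. A production version would defer to LAPACK (which the library now requires) rather than hand-roll the loop.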
Integer - data, such as acceptance counts are dealt with new member functions - `summary_data_int` etc. -* `Sampler` summary now output raw results of ESS and acceptance counts. They - are no longer scaled by the number of particles. +# Changes in v2.1.0 -## Bug fixes +## Important changes -* `Sampler` now correctly clear size history during initialization +The mutation (MCMC) steps are now performed after the resampling of the +initialization step. -# Changes since v1.0.0 +## Removed features -## New features +* OpenCL and MPI modules are removed for now. It is planned that they will be + added back in the future with a new interface. -* Support [jemalloc][jemalloc] in `utility/aligned_memory.hpp`. -* Support storing `Particle` object in [HDF5][HDF5] format. -* New function `hdf5store_new` creates a new file for storing data (trunk - any existing files with the supplied file name). +## Documentation +http://zhouyan.github.io/vSMCDoc/v2.1.0/ -## Changed behaviors +# Changes in v2.0.0 -* `Progress` by default shows iteration number. -* `Progress::stop` by default assumes that all work is done. -* `AlignedMemory` and `AlignedAllocator` by default use [jemalloc][jemalloc] if - it is available. -* `hdf5store_list_empty` argument `append` now has a default value, `false`. +Version 2.0.0 is a major restructuring of the library since the initial +release. Its documentation is still being updated. For the time being, one can +keep using the old JSS release or the reference manual below. -## Bug fixes +The most important change is that C++11, both the language and the standard +library, is now required. -* `hdf5size` now correctly return the number of *bytes* of data and it is no - longer a template. -* Fix a memory bug when using "initialization by iteration" - (`Sampler::init_by_iter(true)`).
+## Documentation +http://zhouyan.github.io/vSMCDoc/v2.0.0/ [HDF5]: http://www.hdfgroup.org/HDF5/ [MKL]: https://software.intel.com/en-us/intel-mkl/ [TBB]: https://www.threadingbuildingblocks.org -[jemalloc]: http://www.canonware.com/jemalloc/ diff --git a/README.md b/README.md index 462b04baa..ba195872a 100644 --- a/README.md +++ b/README.md @@ -1,19 +1,19 @@ # Introduction The vSMC library provides a framework for implementing SMC algorithms. It has a -core module which performs resampling, etc., operations common to all SMC -algorithms and applications. In addition, it provides the bases for -implementing parallelized samplers. The SMC algorithms are highly -parallelizable, but there are many frameworks for doing this. This library -tries to hide the different parallelization mechanism behind a unified -interface, and thus increases code reuse. +modules that perform resampling, etc., operations common to all SMC algorithms +and applications. In addition, it provides bases for implementing parallelized +samplers. The SMC algorithms are highly parallelizable, but there are many +frameworks for doing this. This library tries to hide the different +parallelization mechanisms behind a unified interface, and thus increases code +reuse. # Installation This is a header only template C++ library. To install the library just move the contents of the `include` directory into a proper place, e.g., -`/usr/local/include` in Unix-alike systems. Alternatively, one can use -[CMake][CMake] (2.8.3 or later required), +`/usr/local/include` on Unix-alike systems. Alternatively, one can use +[CMake][CMake] (3.0.0 or later required), ~~~sh cd /path_to_vSMC_source mkdir build cd build cmake .. make install ~~~ -One may need `su` or `sudo` permissions to perform the last installation step.
+One may need administrator permissions to perform the last installation step; +alternatively, one can define the [CMake][CMake] variable `CMAKE_INSTALL_PREFIX` +to change the destination of installation. # Documentation -To make the documentations one need [Doxygen][Doxygen] 1.8.3 or later. -~~~sh -make docs -~~~ The documentation of the [master][vSMCDocMaster] and [develop][vSMCDocDevelop] -branches can be found online. +branches, as well as for individual releases, can be found online. A [User +guide][VSMCUserGuide] is also provided for the develop branch. -# Examples +# Third-party dependencies -Examples are in the `example` subdirectory, to build them, -~~~sh -cd /path_to_vSMC_source -mkdir build -cd build -cmake .. -make example -~~~ -Most examples also come with their own `README` files that give relevant -references. +This library requires a working BLAS/LAPACK implementation, with the standard C +interface headers (`cblas.h` and `lapacke.h`). Some of the library's +functionalities can only be used if optional dependencies are present. +Notably, [HDF5][HDF5], [TBB][TBB], [OpenMP][OpenMP] and [MKL][MKL]. One can +tell the library that these optional features are available by defining +configuration macros such as `-DVSMC_HAS_HDF5=1` during compilation. # Parallelization backends -The library support various backends for multi-thread parallelization, unified -under a uniform interface. The primary backends are [OpenMP][OpenMP] and [Intel -TBB][Intel TBB]. Other backends are available in the [vSMC Extra][vSMC Extra] -repository, including MPI and [OpenCL][OpenCL]. - -# Third-party dependencies - -This library has no dependences other than C++ standard libraries (C++11). Any -C++11 language features are optional. - -In particular, the library use the `` and `` headers, which -are parts of the C++11 standard libraries. Equivalences can be found in recent -versions of [Boost][Boost].
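Configuration macros like `-DVSMC_HAS_HDF5=1` usually follow a default-off guard pattern; the snippet below is a hedged sketch of that convention (assumed, not copied from the library's headers), showing how feature-dependent code is compiled only when the user opts in.

```cpp
// Hedged sketch of the configuration-macro convention: the feature macro
// defaults to 0 unless the user passes, e.g., -DVSMC_HAS_HDF5=1 on the
// compiler command line; HDF5-dependent code is then compiled conditionally.
#ifndef VSMC_HAS_HDF5
#define VSMC_HAS_HDF5 0
#endif

// Query the compile-time setting; in the real library the #if would guard
// inclusion of HDF5 headers and the definition of HDF5-backed functions.
inline bool has_hdf5()
{
#if VSMC_HAS_HDF5
    return true; // HDF5-backed functionality is compiled in
#else
    return false; // HDF5-dependent code is skipped entirely
#endif
}
```

Because the check happens at preprocessing time, leaving the macro undefined costs nothing: the optional code never reaches the compiler.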
The library does its best to detect a usable C++11 -solution and falls back to [Boost][Boost] if it fails to do so. This behavior -can be changed explicitly through configuration macros. +This library supports various backends for multi-thread parallelization, unified +under a uniform interface. The primary backends are [OpenMP][OpenMP] and +[TBB][TBB]. # Compiler support @@ -71,29 +54,37 @@ requires a C++11 standard conforming compiler. This library has been regularly tested with recent [Clang][Clang], [GCC][GCC] and [Intel C++ Compiler][icpc] in C++11 mode. -[Microsoft Visual C++][MSVC] 2015 or later are also supported. However, this -compiler is tested less regularly. +Other compilers might work but are not tested. A complete C++11 implementation +is required. + +# Examples -Other compilers might work but are not tested. +Examples are in the `example` subdirectory; to build them, +~~~sh +export CXXFLAGS=-std=c++11 +cd /path_to_vSMC_source +mkdir build +cd build +cmake .. +make example +~~~ +Some examples may only be built if optional dependencies are present. # License The vSMC library is distributed with a 2-clause BSD license which can be found in the `LICENSE` file distributed with the source.
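A quick way to smoke-test the "complete C++11 implementation" requirement stated above is to compile a snippet that exercises several C++11 features at once; this is only an illustrative check, not part of the library.

```cpp
#include <memory>
#include <vector>

// Minimal C++11 conformance probe: brace initialization, range-based for,
// auto, lambda expressions and std::unique_ptr. If this compiles with
// -std=c++11, the compiler is likely usable for building the examples.
inline int cxx11_smoke_test()
{
    std::vector<int> v{1, 2, 3};            // brace initialization
    int sum = 0;
    for (auto i : v)                        // range-based for with auto
        sum += i;
    auto sq = [](int x) { return x * x; };  // lambda expression
    std::unique_ptr<int> p(new int(sq(sum))); // C++11 <memory>
    return *p;
}
```

Compiling this with the same `CXXFLAGS` used for the examples gives a fast pass/fail signal before running the full CMake configuration.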
-[Apple GCD]: http://en.wikipedia.org/wiki/Grand_Central_Dispatch -[Boost]: http://www.boost.org [CMake]: http://www.cmake.org [Clang]: http://clang.llvm.org [Doxygen]: http://www.stack.nl/~dimitri/doxygen/manual.html [GCC]: http://gcc.gnu.org -[Intel Cilk Plus]: https://www.cilkplus.org -[Intel TBB]: http://threadingbuildingblocks.org -[MS PPL]: http://msdn.microsoft.com/en-us/library/dd492418.aspx -[MSVC]: http://msdn.microsoft.com/en-us/vstudio//default.aspx +[HDF5]: http://www.hdfgroup.org +[MKL]: https://software.intel.com/en-us/intel-mkl +[TBB]: http://threadingbuildingblocks.org [OpenCL]: http://www.khronos.org/opencl [OpenMP]: http://www.openmp.org [icpc]: http://software.intel.com/en-us/intel-compilers -[vSMCDocMaster]: http://zhouyan.github.io/vSMCDoc/master [vSMCDocDevelop]: http://zhouyan.github.io/vSMCDoc/develop -[vSMC Extra]: https://github.com/zhouyan/vSMCExtra.git +[vSMCDocMaster]: http://zhouyan.github.io/vSMCDoc/master +[vSMCUserGuide]: http://zhouyan.github.io/vSMCDoc/develop/user_guide.pdf diff --git a/cmake/FindAESNI.cmake b/cmake/FindAESNI.cmake index 4d8406638..ca965bdcd 100644 --- a/cmake/FindAESNI.cmake +++ b/cmake/FindAESNI.cmake @@ -3,7 +3,7 @@ # ---------------------------------------------------------------------------- # vSMC: Scalable Monte Carlo # ---------------------------------------------------------------------------- -# Copyright (c) 2013-2015, Yan Zhou +# Copyright (c) 2013-2016, Yan Zhou # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without @@ -35,16 +35,16 @@ # # AESNI_FOUND - TRUE if AES-NI is found and work correctly -IF (DEFINED AESNI_FOUND) +IF(DEFINED AESNI_FOUND) RETURN() -ENDIF (DEFINED AESNI_FOUND) +ENDIF(DEFINED AESNI_FOUND) FILE(READ ${CMAKE_CURRENT_LIST_DIR}/FindAESNI.cpp AESNI_TEST_SOURCE) INCLUDE(CheckCXXSourceRuns) CHECK_CXX_SOURCE_RUNS("${AESNI_TEST_SOURCE}" AESNI_FOUND) -IF (AESNI_FOUND) +IF(AESNI_FOUND) MESSAGE(STATUS "Found AES-NI support") -ELSE (AESNI_FOUND) +ELSE(AESNI_FOUND) MESSAGE(STATUS "NOT Found AES-NI support") -ENDIF (AESNI_FOUND) +ENDIF(AESNI_FOUND) diff --git a/cmake/FindAESNI.cpp b/cmake/FindAESNI.cpp index 2638cf75a..e06f4485e 100644 --- a/cmake/FindAESNI.cpp +++ b/cmake/FindAESNI.cpp @@ -3,7 +3,7 @@ //---------------------------------------------------------------------------- // vSMC: Scalable Monte Carlo //---------------------------------------------------------------------------- -// Copyright (c) 2013-2015, Yan Zhou +// Copyright (c) 2013-2016, Yan Zhou // All rights reserved. // // Redistribution and use in source and binary forms, with or without diff --git a/cmake/FindAVX2.cmake b/cmake/FindAVX2.cmake index 75c3b471a..083762dda 100644 --- a/cmake/FindAVX2.cmake +++ b/cmake/FindAVX2.cmake @@ -3,7 +3,7 @@ # ---------------------------------------------------------------------------- # vSMC: Scalable Monte Carlo # ---------------------------------------------------------------------------- -# Copyright (c) 2013-2015, Yan Zhou +# Copyright (c) 2013-2016, Yan Zhou # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without @@ -35,16 +35,16 @@ # # AVX2_FOUND - TRUE if AVX2 is found and work correctly -IF (DEFINED AVX2_FOUND) +IF(DEFINED AVX2_FOUND) RETURN() -ENDIF (DEFINED AVX2_FOUND) +ENDIF(DEFINED AVX2_FOUND) FILE(READ ${CMAKE_CURRENT_LIST_DIR}/FindAVX2.cpp AVX2_TEST_SOURCE) INCLUDE(CheckCXXSourceRuns) CHECK_CXX_SOURCE_RUNS("${AVX2_TEST_SOURCE}" AVX2_FOUND) -IF (AVX2_FOUND) +IF(AVX2_FOUND) MESSAGE(STATUS "Found AVX2 support") -ELSE (AVX2_FOUND) +ELSE(AVX2_FOUND) MESSAGE(STATUS "NOT Found AVX2 support") -ENDIF (AVX2_FOUND) +ENDIF(AVX2_FOUND) diff --git a/cmake/FindAVX2.cpp b/cmake/FindAVX2.cpp index 525a7a241..b2ab93087 100644 --- a/cmake/FindAVX2.cpp +++ b/cmake/FindAVX2.cpp @@ -3,7 +3,7 @@ //---------------------------------------------------------------------------- // vSMC: Scalable Monte Carlo //---------------------------------------------------------------------------- -// Copyright (c) 2013-2015, Yan Zhou +// Copyright (c) 2013-2016, Yan Zhou // All rights reserved. // // Redistribution and use in source and binary forms, with or without diff --git a/cmake/FindInt128.cmake b/cmake/FindInt128.cmake index 795dbcfc5..691ad8ed7 100644 --- a/cmake/FindInt128.cmake +++ b/cmake/FindInt128.cmake @@ -3,7 +3,7 @@ # ---------------------------------------------------------------------------- # vSMC: Scalable Monte Carlo # ---------------------------------------------------------------------------- -# Copyright (c) 2013-2015, Yan Zhou +# Copyright (c) 2013-2016, Yan Zhou # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without @@ -36,9 +36,9 @@ # INT128_FOUND - TRUE if 128-bits integer type is found and works correctly # INT128_TYPE - The type of the 128-bits integer -IF (DEFINED INT128_FOUND) +IF(DEFINED INT128_FOUND) RETURN() -ENDIF (DEFINED INT128_FOUND) +ENDIF(DEFINED INT128_FOUND) FILE(READ ${CMAKE_CURRENT_LIST_DIR}/FindInt128.cpp INT128_TEST_SOURCE) INCLUDE(CheckCXXSourceRuns) @@ -49,11 +49,11 @@ SET(CMAKE_REQUIRED_DEFINITIONS ${SAFE_CMAKE_REQUIRED_DEFINITIONS} -DINT128=${INT128_TRY_TYPE}) CHECK_CXX_SOURCE_RUNS("${INT128_TEST_SOURCE}" INT128_FOUND) -IF (INT128_FOUND) +IF(INT128_FOUND) SET(INT128_TYPE ${INT128_TRY_TYPE} CACHE STRING "128-bits type") MESSAGE(STATUS "Found 128-bits integer type: ${INT128_TYPE}") -ELSE (INT128_FOUND) +ELSE(INT128_FOUND) MESSAGE(STATUS "NOT Found 128-bits integer type") -ENDIF (INT128_FOUND) +ENDIF(INT128_FOUND) SET(CMAKE_REQUIRED_DEFINITIONS ${SAFE_CMAKE_REQUIRED_DEFINITIONS}) diff --git a/cmake/FindInt128.cpp b/cmake/FindInt128.cpp index 36dcd7e20..40447d6bd 100644 --- a/cmake/FindInt128.cpp +++ b/cmake/FindInt128.cpp @@ -3,7 +3,7 @@ //---------------------------------------------------------------------------- // vSMC: Scalable Monte Carlo //---------------------------------------------------------------------------- -// Copyright (c) 2013-2015, Yan Zhou +// Copyright (c) 2013-2016, Yan Zhou // All rights reserved. // // Redistribution and use in source and binary forms, with or without diff --git a/cmake/FindMKL.cmake b/cmake/FindMKL.cmake index 2e42c454e..645ceb7fa 100644 --- a/cmake/FindMKL.cmake +++ b/cmake/FindMKL.cmake @@ -3,7 +3,7 @@ # ---------------------------------------------------------------------------- # vSMC: Scalable Monte Carlo # ---------------------------------------------------------------------------- -# Copyright (c) 2013-2015, Yan Zhou +# Copyright (c) 2013-2016, Yan Zhou # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without @@ -45,11 +45,11 @@ # MKL_INC_PATH - The path CMake shall try to find headers first # MKL_LIB_PATH - The path CMake shall try to find libraries first -IF (DEFINED MKL_FOUND) +IF(DEFINED MKL_FOUND) RETURN() -ENDIF (DEFINED MKL_FOUND) +ENDIF(DEFINED MKL_FOUND) -IF (NOT DEFINED MKL_LINK_LIBRARIES) +IF(NOT DEFINED MKL_LINK_LIBRARIES) INCLUDE(FindThreads) FIND_LIBRARY(MKL_LINK_LIBRARIES mkl_rt PATHS ${MKL_LIB_PATH} ENV LIBRARY_PATH NO_DEFAULT_PATH) @@ -71,30 +71,30 @@ IF (NOT DEFINED MKL_LINK_LIBRARIES) PATHS ${MKL_LIB_PATH} ENV LIBRARY_PATH NO_DEFAULT_PATH) FIND_LIBRARY(MKL_BLAS95_ILP64_LINK_LIBRARIES mkl_blas95_ilp64) - IF (MKL_LINK_LIBRARIES) + IF(MKL_LINK_LIBRARIES) MESSAGE(STATUS "Found MKL libraries: ${MKL_LINK_LIBRARIES}") SET(MKL_LINK_LIBRARIES ${MKL_LINK_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT} CACHE STRING "MKL Libraries" ) - ELSE (MKL_LINK_LIBRARIES) + ELSE(MKL_LINK_LIBRARIES) MESSAGE(STATUS "NOT Found MKL libraries") - ENDIF (MKL_LINK_LIBRARIES) -ENDIF (NOT DEFINED MKL_LINK_LIBRARIES) + ENDIF(MKL_LINK_LIBRARIES) +ENDIF(NOT DEFINED MKL_LINK_LIBRARIES) -IF (NOT DEFINED MKL_INCLUDE_DIR) +IF(NOT DEFINED MKL_INCLUDE_DIR) FIND_PATH(MKL_INCLUDE_DIR mkl_vml.h PATHS ${MKL_INC_PATH} ENV CPATH NO_DEFAULT_PATH) FIND_PATH(MKL_INCLUDE_DIR mkl_vml.h) - IF (MKL_INCLUDE_DIR) + IF(MKL_INCLUDE_DIR) MESSAGE(STATUS "Found MKL headers: ${MKL_INCLUDE_DIR}") - ELSE (MKL_INCLUDE_DIR) + ELSE(MKL_INCLUDE_DIR) MESSAGE(STATUS "NOT Found MKL headers") - ENDIF (MKL_INCLUDE_DIR) -ENDIF (NOT DEFINED MKL_INCLUDE_DIR) + ENDIF(MKL_INCLUDE_DIR) +ENDIF(NOT DEFINED MKL_INCLUDE_DIR) -IF (MKL_LINK_LIBRARIES AND MKL_INCLUDE_DIR) +IF(MKL_LINK_LIBRARIES AND MKL_INCLUDE_DIR) MESSAGE(STATUS "Found MKL") SET(MKL_FOUND TRUE CACHE BOOL "Found MKL") -ELSE (MKL_LINK_LIBRARIES AND MKL_INCLUDE_DIR) +ELSE(MKL_LINK_LIBRARIES AND MKL_INCLUDE_DIR) MESSAGE(STATUS "NOT Found MKL") SET(MKL_FOUND FALSE CACHE BOOL "NOT Found MKL") -ENDIF 
(MKL_LINK_LIBRARIES AND MKL_INCLUDE_DIR) +ENDIF(MKL_LINK_LIBRARIES AND MKL_INCLUDE_DIR) diff --git a/cmake/FindRDRAND.cmake b/cmake/FindRDRAND.cmake index fa8eabf11..ff29d011a 100644 --- a/cmake/FindRDRAND.cmake +++ b/cmake/FindRDRAND.cmake @@ -3,7 +3,7 @@ # ---------------------------------------------------------------------------- # vSMC: Scalable Monte Carlo # ---------------------------------------------------------------------------- -# Copyright (c) 2013-2015, Yan Zhou +# Copyright (c) 2013-2016, Yan Zhou # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -35,16 +35,16 @@ # # RDRAND_FOUND - TRUE if RDRAND is found and work correctly -IF (DEFINED RDRAND_FOUND) +IF(DEFINED RDRAND_FOUND) RETURN() -ENDIF (DEFINED RDRAND_FOUND) +ENDIF(DEFINED RDRAND_FOUND) FILE(READ ${CMAKE_CURRENT_LIST_DIR}/FindRDRAND.cpp RDRAND_TEST_SOURCE) INCLUDE(CheckCXXSourceRuns) CHECK_CXX_SOURCE_RUNS("${RDRAND_TEST_SOURCE}" RDRAND_FOUND) -IF (RDRAND_FOUND) +IF(RDRAND_FOUND) MESSAGE(STATUS "Found RDRAND support") -ELSE (RDRAND_FOUND) +ELSE(RDRAND_FOUND) MESSAGE(STATUS "NOT Found RDRAND support") -ENDIF (RDRAND_FOUND) +ENDIF(RDRAND_FOUND) diff --git a/cmake/FindRDRAND.cpp b/cmake/FindRDRAND.cpp index 39f637a75..85eb9401e 100644 --- a/cmake/FindRDRAND.cpp +++ b/cmake/FindRDRAND.cpp @@ -3,7 +3,7 @@ //---------------------------------------------------------------------------- // vSMC: Scalable Monte Carlo //---------------------------------------------------------------------------- -// Copyright (c) 2013-2015, Yan Zhou +// Copyright (c) 2013-2016, Yan Zhou // All rights reserved. 
// // Redistribution and use in source and binary forms, with or without diff --git a/cmake/FindSSE2.cmake b/cmake/FindSSE2.cmake index 4ee20a573..90a97a04e 100644 --- a/cmake/FindSSE2.cmake +++ b/cmake/FindSSE2.cmake @@ -3,7 +3,7 @@ # ---------------------------------------------------------------------------- # vSMC: Scalable Monte Carlo # ---------------------------------------------------------------------------- -# Copyright (c) 2013-2015, Yan Zhou +# Copyright (c) 2013-2016, Yan Zhou # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -35,16 +35,16 @@ # # SSE2_FOUND - TRUE if SSE2 is found and work correctly -IF (DEFINED SSE2_FOUND) +IF(DEFINED SSE2_FOUND) RETURN() -ENDIF (DEFINED SSE2_FOUND) +ENDIF(DEFINED SSE2_FOUND) FILE(READ ${CMAKE_CURRENT_LIST_DIR}/FindSSE2.cpp SSE2_TEST_SOURCE) INCLUDE(CheckCXXSourceRuns) CHECK_CXX_SOURCE_RUNS("${SSE2_TEST_SOURCE}" SSE2_FOUND) -IF (SSE2_FOUND) +IF(SSE2_FOUND) MESSAGE(STATUS "Found SSE2 support") -ELSE (SSE2_FOUND) +ELSE(SSE2_FOUND) MESSAGE(STATUS "NOT Found SSE2 support") -ENDIF (SSE2_FOUND) +ENDIF(SSE2_FOUND) diff --git a/cmake/FindSSE2.cpp b/cmake/FindSSE2.cpp index 00380981d..5dd013e04 100644 --- a/cmake/FindSSE2.cpp +++ b/cmake/FindSSE2.cpp @@ -3,7 +3,7 @@ //---------------------------------------------------------------------------- // vSMC: Scalable Monte Carlo //---------------------------------------------------------------------------- -// Copyright (c) 2013-2015, Yan Zhou +// Copyright (c) 2013-2016, Yan Zhou // All rights reserved. 
// // Redistribution and use in source and binary forms, with or without diff --git a/cmake/FindTBB.cmake b/cmake/FindTBB.cmake index 472c13763..8aea3a376 100644 --- a/cmake/FindTBB.cmake +++ b/cmake/FindTBB.cmake @@ -3,7 +3,7 @@ # ---------------------------------------------------------------------------- # vSMC: Scalable Monte Carlo # ---------------------------------------------------------------------------- -# Copyright (c) 2013-2015, Yan Zhou +# Copyright (c) 2013-2016, Yan Zhou # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -47,22 +47,22 @@ # TBB_INC_PATH - The path CMake shall try to find headers first # TBB_LIB_PATH - The path CMake shall try to find libraries first -IF (DEFINED TBB_FOUND) +IF(DEFINED TBB_FOUND) RETURN() -ENDIF (DEFINED TBB_FOUND) +ENDIF(DEFINED TBB_FOUND) FILE(READ ${CMAKE_CURRENT_LIST_DIR}/FindTBB.cpp TBB_TEST_SOURCE) INCLUDE(FindThreads) -IF (NOT DEFINED TBB_LINK_LIBRARIES) +IF(NOT DEFINED TBB_LINK_LIBRARIES) FIND_LIBRARY(TBB_LINK_LIBRARIES_RELEASE_FOUND tbb PATHS ${TBB_LIB_PATH} ENV LIBRARY_PATH ENV LIB NO_DEFAULT_PATH) FIND_LIBRARY(TBB_LINK_LIBRARIES_RELEASE_FOUND tbb) FIND_LIBRARY(TBB_LINK_LIBRARIES_DEBUG_FOUND tbb_debug PATHS ${TBB_LIB_PATH} ENV LIBRARY_PATH ENV LIB NO_DEFAULT_PATH) FIND_LIBRARY(TBB_LINK_LIBRARIES_DEBUG_FOUND tbb_debug) - IF (TBB_LINK_LIBRARIES_RELEASE_FOUND AND TBB_LINK_LIBRARIES_DEBUG_FOUND) + IF(TBB_LINK_LIBRARIES_RELEASE_FOUND AND TBB_LINK_LIBRARIES_DEBUG_FOUND) SET(TBB_LINK_LIBRARIES optimized ${TBB_LINK_LIBRARIES_RELEASE_FOUND} debug ${TBB_LINK_LIBRARIES_DEBUG_FOUND} @@ -72,25 +72,25 @@ IF (NOT DEFINED TBB_LINK_LIBRARIES) SET(TBB_LINK_LIBRARIES_DEBUG ${TBB_LINK_LIBRARIES_DEBUG_FOUND} ${CMAKE_THREAD_LIBS_INIT} CACHE STRING "Link to TBB Debug") MESSAGE(STATUS "Found TBB libraries: ${TBB_LINK_LIBRARIES}") - ELSEIF (TBB_LINK_LIBRARIES_RELEASE_FOUND) + ELSEIF(TBB_LINK_LIBRARIES_RELEASE_FOUND) SET(TBB_LINK_LIBRARIES ${TBB_LINK_LIBRARIES_RELEASE_FOUND} 
${CMAKE_THREAD_LIBS_INIT} CACHE STRING "Link to TBB") SET(TBB_LINK_LIBRARIES_RELEASE ${TBB_LINK_LIBRARIES_RELEASE_FOUND} ${CMAKE_THREAD_LIBS_INIT} CACHE STRING "Link to TBB Release") MESSAGE(STATUS "Found TBB libraries: ${TBB_LINK_LIBRARIES}") - ELSE (TBB_LINK_LIBRARIES_RELEASE_FOUND AND TBB_LINK_LIBRARIES_DEBUG_FOUND) + ELSE(TBB_LINK_LIBRARIES_RELEASE_FOUND AND TBB_LINK_LIBRARIES_DEBUG_FOUND) MESSAGE(STATUS "NOT Found TBB libraries") - ENDIF (TBB_LINK_LIBRARIES_RELEASE_FOUND AND TBB_LINK_LIBRARIES_DEBUG_FOUND) -ENDIF (NOT DEFINED TBB_LINK_LIBRARIES) + ENDIF(TBB_LINK_LIBRARIES_RELEASE_FOUND AND TBB_LINK_LIBRARIES_DEBUG_FOUND) +ENDIF(NOT DEFINED TBB_LINK_LIBRARIES) -IF (NOT DEFINED TBB_MALLOC_LINK_LIBRARIES) +IF(NOT DEFINED TBB_MALLOC_LINK_LIBRARIES) FIND_LIBRARY(TBB_MALLOC_LINK_LIBRARIES_RELEASE_FOUND tbbmalloc PATHS ${TBB_LIB_PATH} ENV LIBRARY_PATH ENV LIB NO_DEFAULT_PATH) FIND_LIBRARY(TBB_MALLOC_LINK_LIBRARIES_RELEASE_FOUND tbbmalloc) FIND_LIBRARY(TBB_MALLOC_LINK_LIBRARIES_DEBUG_FOUND tbbmalloc_debug PATHS ${TBB_LIB_PATH} ENV LIBRARY_PATH ENV LIB NO_DEFAULT_PATH) FIND_LIBRARY(TBB_MALLOC_LINK_LIBRARIES_DEBUG_FOUND tbbmalloc_debug) - IF (TBB_MALLOC_LINK_LIBRARIES_RELEASE_FOUND AND + IF(TBB_MALLOC_LINK_LIBRARIES_RELEASE_FOUND AND TBB_MALLOC_LINK_LIBRARIES_DEBUG_FOUND) SET(TBB_MALLOC_LINK_LIBRARIES optimized ${TBB_MALLOC_LINK_LIBRARIES_RELEASE_FOUND} @@ -107,7 +107,7 @@ IF (NOT DEFINED TBB_MALLOC_LINK_LIBRARIES) "Link to TBB malloc Debug") MESSAGE(STATUS "Found TBB malloc libraries: ${TBB_MALLOC_LINK_LIBRARIES}") - ELSEIF (TBB_MALLOC_LINK_LIBRARIES_RELEASE_FOUND) + ELSEIF(TBB_MALLOC_LINK_LIBRARIES_RELEASE_FOUND) SET(TBB_MALLOC_LINK_LIBRARIES ${TBB_MALLOC_LINK_LIBRARIES_RELEASE_FOUND} ${CMAKE_THREAD_LIBS_INIT} CACHE STRING @@ -118,14 +118,14 @@ IF (NOT DEFINED TBB_MALLOC_LINK_LIBRARIES) "Link to TBB malloc Release") MESSAGE(STATUS "Found TBB malloc libraries: ${TBB_MALLOC_LINK_LIBRARIES}") - ELSE (TBB_MALLOC_LINK_LIBRARIES_RELEASE_FOUND AND + 
    ELSE(TBB_MALLOC_LINK_LIBRARIES_RELEASE_FOUND AND
        TBB_MALLOC_LINK_LIBRARIES_DEBUG_FOUND)
        MESSAGE(STATUS "NOT Found TBB malloc libraries")
-    ENDIF (TBB_MALLOC_LINK_LIBRARIES_RELEASE_FOUND AND
+    ENDIF(TBB_MALLOC_LINK_LIBRARIES_RELEASE_FOUND AND
        TBB_MALLOC_LINK_LIBRARIES_DEBUG_FOUND)
-ENDIF (NOT DEFINED TBB_MALLOC_LINK_LIBRARIES)
+ENDIF(NOT DEFINED TBB_MALLOC_LINK_LIBRARIES)

-IF (NOT DEFINED TBB_MALLOC_PROXY_LINK_LIBRARIES)
+IF(NOT DEFINED TBB_MALLOC_PROXY_LINK_LIBRARIES)
    FIND_LIBRARY(TBB_MALLOC_PROXY_LINK_LIBRARIES_RELEASE_FOUND tbbmalloc_proxy
        PATHS ${TBB_LIB_PATH} ENV LIBRARY_PATH ENV LIB NO_DEFAULT_PATH)
@@ -136,7 +136,7 @@ IF (NOT DEFINED TBB_MALLOC_PROXY_LINK_LIBRARIES)
        PATHS ${TBB_LIB_PATH} ENV LIBRARY_PATH ENV LIB NO_DEFAULT_PATH)
    FIND_LIBRARY(TBB_MALLOC_PROXY_LINK_LIBRARIES_DEBUG_FOUND
        tbbmalloc_proxy_debug)
-    IF (TBB_MALLOC_PROXY_LINK_LIBRARIES_RELEASE_FOUND AND
+    IF(TBB_MALLOC_PROXY_LINK_LIBRARIES_RELEASE_FOUND AND
        TBB_MALLOC_PROXY_LINK_LIBRARIES_DEBUG_FOUND)
        SET(TBB_MALLOC_PROXY_LINK_LIBRARIES
            optimized ${TBB_MALLOC_PROXY_LINK_LIBRARIES_RELEASE_FOUND}
@@ -153,7 +153,7 @@ IF (NOT DEFINED TBB_MALLOC_PROXY_LINK_LIBRARIES)
            "Link to TBB malloc proxy Debug")
        MESSAGE(STATUS
            "Found TBB malloc proxy libraries: ${TBB_MALLOC_PROXY_LINK_LIBRARIES}")
-    ELSEIF (TBB_MALLOC_PROXY_LINK_LIBRARIES_RELEASE_FOUND)
+    ELSEIF(TBB_MALLOC_PROXY_LINK_LIBRARIES_RELEASE_FOUND)
        SET(TBB_MALLOC_PROXY_LINK_LIBRARIES
            ${TBB_MALLOC_PROXY_LINK_LIBRARIES_RELEASE_FOUND}
            ${CMAKE_THREAD_LIBS_INIT} CACHE STRING
@@ -164,47 +164,47 @@ IF (NOT DEFINED TBB_MALLOC_PROXY_LINK_LIBRARIES)
            "Link to TBB malloc proxy Release")
        MESSAGE(STATUS
            "Found TBB malloc proxy libraries: ${TBB_MALLOC_PROXY_LINK_LIBRARIES}")
-    ELSE (TBB_MALLOC_PROXY_LINK_LIBRARIES_RELEASE_FOUND AND
+    ELSE(TBB_MALLOC_PROXY_LINK_LIBRARIES_RELEASE_FOUND AND
        TBB_MALLOC_PROXY_LINK_LIBRARIES_DEBUG_FOUND)
        MESSAGE(STATUS "NOT Found TBB malloc proxy libraries")
-    ENDIF (TBB_MALLOC_PROXY_LINK_LIBRARIES_RELEASE_FOUND AND
+    ENDIF(TBB_MALLOC_PROXY_LINK_LIBRARIES_RELEASE_FOUND AND
        TBB_MALLOC_PROXY_LINK_LIBRARIES_DEBUG_FOUND)
-ENDIF (NOT DEFINED TBB_MALLOC_PROXY_LINK_LIBRARIES)
+ENDIF(NOT DEFINED TBB_MALLOC_PROXY_LINK_LIBRARIES)

-IF (NOT DEFINED TBB_INCLUDE_DIR)
+IF(NOT DEFINED TBB_INCLUDE_DIR)
    FIND_PATH(TBB_INCLUDE_DIR tbb/tbb.h
        PATHS ${TBB_INC_PATH} ENV CPATH NO_DEFAULT_PATH)
    FIND_PATH(TBB_INCLUDE_DIR tbb/tbb.h)
-    IF (TBB_INCLUDE_DIR)
+    IF(TBB_INCLUDE_DIR)
        MESSAGE(STATUS "Found TBB headers: ${TBB_INCLUDE_DIR}")
-    ELSE (TBB_INCLUDE_DIR)
+    ELSE(TBB_INCLUDE_DIR)
        MESSAGE(STATUS "NOT Found TBB headers")
-    ENDIF (TBB_INCLUDE_DIR)
-ENDIF (NOT DEFINED TBB_INCLUDE_DIR)
+    ENDIF(TBB_INCLUDE_DIR)
+ENDIF(NOT DEFINED TBB_INCLUDE_DIR)

-IF (TBB_LINK_LIBRARIES AND TBB_INCLUDE_DIR)
+IF(TBB_LINK_LIBRARIES AND TBB_INCLUDE_DIR)
    SET(TBB_BASIC_FOUND TRUE)
-ELSE (TBB_LINK_LIBRARIES AND TBB_INCLUDE_DIR)
+ELSE(TBB_LINK_LIBRARIES AND TBB_INCLUDE_DIR)
    SET(TBB_BASIC_FOUND FALSE)
-ENDIF (TBB_LINK_LIBRARIES AND TBB_INCLUDE_DIR)
-IF (TBB_BASIC_FOUND)
+ENDIF(TBB_LINK_LIBRARIES AND TBB_INCLUDE_DIR)
+IF(TBB_BASIC_FOUND)
    INCLUDE(CheckCXXSourceRuns)
    SET(SAFE_CMAKE_REQUIRED_DEFINITIONS ${CMAKE_REQUIRED_DEFINITIONS})
    SET(SAFE_CMAKE_REQUIRED_INCLUDES ${CMAKE_REQUIRED_INCLUDES})
    SET(SAFE_CMAKE_REQUIRED_LIBRARIES ${CMAKE_REQUIRED_LIBRARIES})
    SET(CMAKE_REQUIRED_INCLUDES
        ${SAFE_CMAKE_REQUIRED_INCLUDES} ${TBB_INCLUDE_DIR})
-    IF (TBB_LINK_LIBRARIES_DEBUG)
+    IF(TBB_LINK_LIBRARIES_DEBUG)
        SET(CMAKE_REQUIRED_LIBRARIES
            ${SAFE_CMAKE_REQUIRED_LIBRARIES} ${TBB_LINK_LIBRARIES_DEBUG})
-    ELSE (TBB_LINK_LIBRARIES_DEBUG)
+    ELSE(TBB_LINK_LIBRARIES_DEBUG)
        SET(CMAKE_REQUIRED_LIBRARIES
            ${SAFE_CMAKE_REQUIRED_LIBRARIES} ${TBB_LINK_LIBRARIES})
-    ENDIF (TBB_LINK_LIBRARIES_DEBUG)
+    ENDIF(TBB_LINK_LIBRARIES_DEBUG)
    CHECK_CXX_SOURCE_RUNS("${TBB_TEST_SOURCE}" TBB_TEST_SOURCE_RUNS)
-    FOREACH (EXCEPTION_FLAG 1 0)
-        FOREACH (CPP0X_FLAG 1 0)
-            IF (NOT TBB_TEST_SOURCE_RUNS)
+    FOREACH(EXCEPTION_FLAG 1 0)
+        FOREACH(CPP0X_FLAG 1 0)
+            IF(NOT TBB_TEST_SOURCE_RUNS)
                MESSAGE(STATUS
                    "Try TBB with -DTBB_USE_CAPTURED_EXCEPTION=${EXCEPTION_FLAG} -DTBB_IMPLEMENT_CPP0X=${CPP0X_FLAG}")
                UNSET(TBB_TEST_SOURCE_RUNS CACHE)
@@ -214,26 +214,26 @@ IF (TBB_BASIC_FOUND)
                    -DTBB_IMPLEMENT_CPP0X=${CPP0X_FLAG})
                CHECK_CXX_SOURCE_RUNS("${TBB_TEST_SOURCE}" TBB_TEST_SOURCE_RUNS)
-                IF (TBB_TEST_SOURCE_RUNS)
+                IF(TBB_TEST_SOURCE_RUNS)
                    SET(TBB_DEFINITIONS
                        -DTBB_USE_CAPTURED_EXCEPTION=${EXCEPTION_FLAG}
                        -DTBB_IMPLEMENT_CPP0X=${CPP0X_FLAG}
                        CACHE STRING "TBB compile time definitions")
-                ENDIF (TBB_TEST_SOURCE_RUNS)
-            ENDIF (NOT TBB_TEST_SOURCE_RUNS)
-        ENDFOREACH (CPP0X_FLAG 0 1)
-    ENDFOREACH (EXCEPTION_FLAG 0 1)
-    IF (TBB_TEST_SOURCE_RUNS)
+                ENDIF(TBB_TEST_SOURCE_RUNS)
+            ENDIF(NOT TBB_TEST_SOURCE_RUNS)
+        ENDFOREACH(CPP0X_FLAG 0 1)
+    ENDFOREACH(EXCEPTION_FLAG 0 1)
+    IF(TBB_TEST_SOURCE_RUNS)
        MESSAGE(STATUS "Found TBB")
        SET(TBB_FOUND TRUE CACHE BOOL "Found TBB")
-    ELSE (TBB_TEST_SOURCE_RUNS)
+    ELSE(TBB_TEST_SOURCE_RUNS)
        MESSAGE(STATUS "NOT Found TBB")
        SET(TBB_FOUND FALSE CACHE BOOL "NOT Found TBB")
-    ENDIF (TBB_TEST_SOURCE_RUNS)
+    ENDIF(TBB_TEST_SOURCE_RUNS)
    SET(CMAKE_REQUIRED_DEFINITIONS ${SAFE_CMAKE_REQUIRED_DEFINITIONS})
    SET(CMAKE_REQUIRED_INCLUDES ${SAFE_CMAKE_REQUIRED_INCLUDES})
    SET(CMAKE_REQUIRED_LIBRARIES ${SAFE_CMAKE_REQUIRED_LIBRARIES})
-ELSE (TBB_BASIC_FOUND)
+ELSE(TBB_BASIC_FOUND)
    MESSAGE(STATUS "NOT Found TBB")
    SET(TBB_FOUND FALSE CACHE BOOL "NOT Found TBB")
-ENDIF (TBB_BASIC_FOUND)
+ENDIF(TBB_BASIC_FOUND)
diff --git a/cmake/FindTBB.cpp b/cmake/FindTBB.cpp
index 31075d50c..87e4db66c 100644
--- a/cmake/FindTBB.cpp
+++ b/cmake/FindTBB.cpp
@@ -3,7 +3,7 @@
 //----------------------------------------------------------------------------
 // vSMC: Scalable Monte Carlo
 //----------------------------------------------------------------------------
-// Copyright (c) 2013-2015, Yan Zhou
+// Copyright (c) 2013-2016, Yan Zhou
 // All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without
diff --git a/cmake/FindThread.cmake b/cmake/FindThread.cmake
index 60f7d490b..c10b01477 100644
--- a/cmake/FindThread.cmake
+++ b/cmake/FindThread.cmake
@@ -3,7 +3,7 @@
 # ----------------------------------------------------------------------------
 # vSMC: Scalable Monte Carlo
 # ----------------------------------------------------------------------------
-# Copyright (c) 2013-2015, Yan Zhou
+# Copyright (c) 2013-2016, Yan Zhou
 # All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
@@ -36,15 +36,15 @@
 # THREAD_FOUND - TRUE if threads are found
 # Thread_LINK_LIBRARIES - Set to CMAKE_THREAD_LIBS_INIT and caches

-IF (DEFINED THREAD_FOUND)
+IF(DEFINED THREAD_FOUND)
    RETURN()
-ENDIF (DEFINED THREAD_FOUND)
+ENDIF(DEFINED THREAD_FOUND)

 INCLUDE(FindThreads)

-IF (CMAKE_THREAD_LIBS_INIT)
+IF(CMAKE_THREAD_LIBS_INIT)
    SET(THREAD_FOUND TRUE CACHE BOOL "Threads found")
    SET(Thread_LINK_LIBRARIES ${CMAKE_THREAD_LIBS_INIT}
        CACHE STRING "Thread link libraries")
-ELSE (CMAKE_THREAD_LIBS_INIT)
+ELSE(CMAKE_THREAD_LIBS_INIT)
    SET(THREAD_FOUND FALSE CACHE BOOL "Threads found")
-ENDIF (CMAKE_THREAD_LIBS_INIT)
+ENDIF(CMAKE_THREAD_LIBS_INIT)
diff --git a/cmake/vSMCExampleFunctions.cmake b/cmake/vSMCExampleFunctions.cmake
index 5ec6e571c..1446b91a1 100644
--- a/cmake/vSMCExampleFunctions.cmake
+++ b/cmake/vSMCExampleFunctions.cmake
@@ -3,7 +3,7 @@
 # ----------------------------------------------------------------------------
 # vSMC: Scalable Monte Carlo
 # ----------------------------------------------------------------------------
-# Copyright (c) 2013-2015, Yan Zhou
+# Copyright (c) 2013-2016, Yan Zhou
 # All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
@@ -32,49 +32,49 @@

 FUNCTION(ADD_VSMC_EXECUTABLE exe src)
    ADD_EXECUTABLE(${exe} ${src})
-    IF (DEFINED VSMC_LINK_LIBRARIES)
+    IF(DEFINED VSMC_LINK_LIBRARIES)
        TARGET_LINK_LIBRARIES(${exe} ${VSMC_LINK_LIBRARIES})
-    ENDIF (DEFINED VSMC_LINK_LIBRARIES)
+    ENDIF(DEFINED VSMC_LINK_LIBRARIES)

    GET_TARGET_PROPERTY(compile_flags ${exe} COMPILE_FLAGS)
-    IF (NOT compile_flags)
+    IF(NOT compile_flags)
        UNSET(compile_flags)
-    ENDIF (NOT compile_flags)
+    ENDIF(NOT compile_flags)

    GET_TARGET_PROPERTY(link_flags ${exe} LINK_FLAGS)
-    IF (NOT link_flags)
+    IF(NOT link_flags)
        UNSET(link_flags)
-    ENDIF (NOT link_flags)
+    ENDIF(NOT link_flags)

-    FOREACH (arg ${ARGN})
-        IF (${arg} STREQUAL "OMP" AND OPENMP_FOUND)
+    FOREACH(arg ${ARGN})
+        IF(${arg} STREQUAL "OMP" AND OPENMP_FOUND)
            SET(compile_flags "${compile_flags} ${OpenMP_CXX_FLAGS}")
-            IF (NOT MSVC)
+            IF(NOT MSVC)
                SET(link_flags "${link_flags} ${OpenMP_CXX_FLAGS}")
-            ENDIF (NOT MSVC)
-        ENDIF (${arg} STREQUAL "OMP" AND OPENMP_FOUND)
-    ENDFOREACH (arg ${ARGN})
+            ENDIF(NOT MSVC)
+        ENDIF(${arg} STREQUAL "OMP" AND OPENMP_FOUND)
+    ENDFOREACH(arg ${ARGN})

-    IF (compile_flags)
+    IF(compile_flags)
        SET_TARGET_PROPERTIES(${exe}
            PROPERTIES COMPILE_FLAGS "${compile_flags}")
-    ENDIF (compile_flags)
+    ENDIF(compile_flags)

-    IF (link_flags)
+    IF(link_flags)
        SET_TARGET_PROPERTIES(${exe} PROPERTIES LINK_FLAGS "${link_flags}")
-    ENDIF (link_flags)
+    ENDIF(link_flags)
 ENDFUNCTION(ADD_VSMC_EXECUTABLE)

 FUNCTION(ADD_SMP_EXECUTABLE base header source smp_name)
    STRING(TOUPPER "${smp_name}" SMP)
    STRING(TOLOWER "${smp_name}" smp)
-    IF (EXISTS ${PROJECT_SOURCE_DIR}/include/${header}.hpp)
+    IF(EXISTS ${PROJECT_SOURCE_DIR}/include/${header}.hpp)
        CONFIGURE_FILE(
            ${PROJECT_SOURCE_DIR}/include/${header}.hpp
            ${PROJECT_BINARY_DIR}/include/${header}_${smp}.hpp)
-    ENDIF (EXISTS ${PROJECT_SOURCE_DIR}/include/${header}.hpp)
+    ENDIF(EXISTS ${PROJECT_SOURCE_DIR}/include/${header}.hpp)
    CONFIGURE_FILE(
        ${PROJECT_SOURCE_DIR}/src/${source}.cpp
        ${PROJECT_BINARY_DIR}/src/${source}_${smp}.cpp)
@@ -92,50 +92,50 @@ FUNCTION(ADD_SMP_EXAMPLE base)
    FOREACH(smp ${SMP_EXECUTABLES})
        ADD_SMP_EXECUTABLE(${base} ${base} ${base} ${smp} ${ARGN})
        ADD_DEPENDENCIES(${base} ${base}_${smp})
-    ENDFOREACH (smp)
+    ENDFOREACH(smp)
 ENDFUNCTION(ADD_SMP_EXAMPLE)

 FUNCTION(COPY_FILE basename filename)
-    IF (UNIX)
+    IF(UNIX)
        ADD_CUSTOM_COMMAND(
            OUTPUT ${PROJECT_BINARY_DIR}/$(unknown)
            DEPENDS ${PROJECT_SOURCE_DIR}/$(unknown)
            COMMAND ${CMAKE_COMMAND} ARGS -E create_symlink
            ${PROJECT_SOURCE_DIR}/$(unknown)
            ${PROJECT_BINARY_DIR}/$(unknown))
-    ELSE (UNIX)
+    ELSE(UNIX)
        ADD_CUSTOM_COMMAND(
            OUTPUT ${PROJECT_BINARY_DIR}/$(unknown)
            DEPENDS ${PROJECT_SOURCE_DIR}/$(unknown)
            COMMAND ${CMAKE_COMMAND} ARGS -E copy
            ${PROJECT_SOURCE_DIR}/$(unknown)
            ${PROJECT_BINARY_DIR}/$(unknown))
-    ENDIF (UNIX)
+    ENDIF(UNIX)
    ADD_CUSTOM_TARGET(${basename}-$(unknown)
        DEPENDS ${PROJECT_BINARY_DIR}/$(unknown))
    ADD_DEPENDENCIES(${basename}-files ${basename}-$(unknown))
 ENDFUNCTION(COPY_FILE)

 FUNCTION(COPY_FILE_OPTIONAL basename filename)
-    IF (EXISTS ${PROJECT_SOURCE_DIR}/$(unknown))
+    IF(EXISTS ${PROJECT_SOURCE_DIR}/$(unknown))
        COPY_FILE(${basename} $(unknown))
-    ELSE (EXISTS ${PROJECT_SOURCE_DIR}/$(unknown))
+    ELSE(EXISTS ${PROJECT_SOURCE_DIR}/$(unknown))
        MESSAGE(STATUS "File: ${basename}: $(unknown) not available")
-    ENDIF (EXISTS ${PROJECT_SOURCE_DIR}/$(unknown))
+    ENDIF(EXISTS ${PROJECT_SOURCE_DIR}/$(unknown))
 ENDFUNCTION(COPY_FILE_OPTIONAL)

 FUNCTION(ADD_HEADER_EXECUTABLE basepath cond)
-    IF (${cond})
+    IF(${cond})
        STRING(REPLACE "/" "_" basename "${basepath}")
-        IF (EXISTS ${PROJECT_SOURCE_DIR}/src/${basename}.cpp)
+        IF(EXISTS ${PROJECT_SOURCE_DIR}/src/${basename}.cpp)
            ADD_VSMC_EXECUTABLE(${basename}_hpp
                ${PROJECT_SOURCE_DIR}/src/${basename}.cpp ${ARGN})
-        ELSE (EXISTS ${PROJECT_SOURCE_DIR}/src/${basename}.cpp)
+        ELSE(EXISTS ${PROJECT_SOURCE_DIR}/src/${basename}.cpp)
            CONFIGURE_FILE(${PROJECT_SOURCE_DIR}/src/vsmc.cpp
                ${PROJECT_BINARY_DIR}/src/${basename}.cpp)
            ADD_VSMC_EXECUTABLE(${basename}_hpp
                ${PROJECT_BINARY_DIR}/src/${basename}.cpp ${ARGN})
-        ENDIF (EXISTS ${PROJECT_SOURCE_DIR}/src/${basename}.cpp)
+        ENDIF(EXISTS ${PROJECT_SOURCE_DIR}/src/${basename}.cpp)
        ADD_DEPENDENCIES(vsmc ${basename}_hpp)
-    ENDIF (${cond})
+    ENDIF(${cond})
 ENDFUNCTION(ADD_HEADER_EXECUTABLE)
diff --git a/config/Doxyfile.in b/config/Doxyfile.in
index 4c13d7c1a..da2876e3c 100644
--- a/config/Doxyfile.in
+++ b/config/Doxyfile.in
@@ -794,8 +794,8 @@ INPUT_ENCODING = UTF-8
 # If left blank the following patterns are tested:*.c, *.cc, *.cxx, *.cpp,
 # *.c++, *.java, *.ii, *.ixx, *.ipp, *.i++, *.inl, *.idl, *.ddl, *.odl, *.h,
 # *.hh, *.hxx, *.hpp, *.h++, *.cs, *.d, *.php, *.php4, *.php5, *.phtml, *.inc,
-# *.m, *.markdown, *.md, *.mm, *.dox, *.py, *.f90, *.f, *.for, *.tcl, *.vhd,
-# *.vhdl, *.ucf, *.qsf, *.as and *.js.
+# *.m, *.markdown, *.md, *.mm, *.dox, *.py, *.pyw, *.f90, *.f, *.for, *.tcl,
+# *.vhd, *.vhdl, *.ucf, *.qsf, *.as and *.js.

 FILE_PATTERNS = *.h \
    *.hpp \
@@ -2033,6 +2033,8 @@ PREDEFINED = __TBB_TASK_GROUP_CONTEXT \
    VSMC_HAS_MKL \
    VSMC_HAS_OMP \
    VSMC_HAS_POSIX \
+    VSMC_HAS_SSE2 \
+    VSMC_HAS_AVX2 \
    VSMC_HAS_TBB

 # If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then this
diff --git a/config/news.md.in b/config/news.md.in
new file mode 100644
index 000000000..1581f5414
--- /dev/null
+++ b/config/news.md.in
@@ -0,0 +1,4 @@
+Change log {#newspage}
+================
+
+@NEWS@
diff --git a/config/user_options.cmake b/config/user_options.cmake
index 702e636fd..2ab492694 100644
--- a/config/user_options.cmake
+++ b/config/user_options.cmake
@@ -3,7 +3,7 @@
 # ----------------------------------------------------------------------------
 # vSMC: Scalable Monte Carlo
 # ----------------------------------------------------------------------------
-# Copyright (c) 2013-2015, Yan Zhou
+# Copyright (c) 2013-2016, Yan Zhou
 # All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
@@ -34,8 +34,6 @@ IF (MSVC)
    SET (TBB_ROOT "C:/Program Files/Intel/TBB")
    SET (TBB_INC_PATH "${TBB_ROOT}/include" CACHE PATH "TBB include")
    SET (TBB_LIB_PATH "${TBB_ROOT}/lib/intel64/vc12" CACHE PATH "TBB lib")
-    SET (VSMC_MPI_FOUND FALSE CACHE BOOL "NO MPI")
-    SET (OPENCL_FOUND FALSE CACHE BOOL "NO OpenCL")
 ENDIF (MSVC)

 IF (UNIX AND NOT APPLE AND NOT ${CMAKE_CXX_COMPILER_ID} MATCHES "Intel")
diff --git a/example/CMakeLists.txt b/example/CMakeLists.txt
index 5ac14b00e..fecd8d34b 100644
--- a/example/CMakeLists.txt
+++ b/example/CMakeLists.txt
@@ -3,30 +3,30 @@
 # ----------------------------------------------------------------------------
 # vSMC: Scalable Monte Carlo
 # ----------------------------------------------------------------------------
-# Copyright (c) 2013-2015, Yan Zhou
-# All rights reserved.
+# Copyright (c) 2013-2016, Yan Zhou
+# All rights reserved.
 #
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
 #
-# Redistributions of source code must retain the above copyright notice, this
-# list of conditions and the following disclaimer.
+# Redistributions of source code must retain the above copyright notice,
+# this list of conditions and the following disclaimer.
 #
-# Redistributions in binary form must reproduce the above copyright notice,
-# this list of conditions and the following disclaimer in the documentation
-# and/or other materials provided with the distribution.
+# Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
 #
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
-# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
-# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
-# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-# POSSIBILITY OF SUCH DAMAGE.
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
 # ============================================================================

 PROJECT(vSMCExample C CXX)
@@ -58,8 +58,8 @@ ADD_SUBDIRECTORY(rng)
 ##############################################################################

 MESSAGE(STATUS "=================== Enable examples ===================")
-FOREACH (example ${EXAMPLES})
+FOREACH(example ${EXAMPLES})
    MESSAGE(STATUS ${example})
-ENDFOREACH (example ${EXAMPLES})
+ENDFOREACH(example ${EXAMPLES})
 MESSAGE(STATUS "=======================================================")
diff --git a/example/gmm/CMakeLists.txt b/example/gmm/CMakeLists.txt
index 4bb14e7ff..cec9e54b6 100644
--- a/example/gmm/CMakeLists.txt
+++ b/example/gmm/CMakeLists.txt
@@ -3,7 +3,7 @@
 # ----------------------------------------------------------------------------
 # vSMC: Scalable Monte Carlo
 # ----------------------------------------------------------------------------
-# Copyright (c) 2013-2015, Yan Zhou
+# Copyright (c) 2013-2016, Yan Zhou
 # All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/example/gmm/include/gmm.hpp b/example/gmm/include/gmm.hpp
index 0a849ca7a..364b2c3a9 100644
--- a/example/gmm/include/gmm.hpp
+++ b/example/gmm/include/gmm.hpp
@@ -3,7 +3,7 @@
 //----------------------------------------------------------------------------
 // vSMC: Scalable Monte Carlo
 //----------------------------------------------------------------------------
-// Copyright (c) 2013-2015, Yan Zhou
+// Copyright (c) 2013-2016, Yan Zhou
 // All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without
@@ -51,7 +51,7 @@ template
 using MoveSMP = vsmc::Move@SMP@;

 template
-using PathEvalSMP = vsmc::PathEval@SMP@;
+using MonitorEvalSMP = vsmc::MonitorEval@SMP@;
 // clang-format on

 class gmm_param
@@ -520,17 +520,23 @@ class gmm_move_weight : public MoveSMP
     }
 };

-class gmm_path : public PathEvalSMP
+class gmm_path_integrand : public MonitorEvalSMP
 {
     public:
-    double eval_sp(std::size_t, vsmc::SingleParticle sp)
+    void eval_sp(std::size_t, std::size_t, vsmc::SingleParticle sp,
+        double *res)
     {
-        return sp.state(0).log_likelihood();
+        *res = sp.state(0).log_likelihood();
     }
+};

-    double eval_grid(std::size_t, vsmc::Particle &particle)
+class gmm_path_grid
+{
+    public:
+    void operator()(std::size_t, std::size_t,
+        vsmc::Particle &particle, double *res)
     {
-        return particle.value().alpha();
+        *res = particle.value().alpha();
     }
 };

@@ -602,12 +608,20 @@ inline int gmm_main(int argc, char **argv)
        .mcmc(gmm_move_mu(), false)
        .mcmc(gmm_move_lambda(), true)
        .mcmc(gmm_move_weight(), true)
-        .path_sampling(gmm_path());
+        .monitor("path_integrand", 1, gmm_path_integrand())
+        .monitor("path_grid", 1, gmm_path_grid(), true);

    vsmc::StopWatch watch;
    watch.start();
    sampler.initialize(const_cast(datafile.c_str())).iterate(n);
-    double ps = sampler.path().log_zconst();
+    double ps = 0;
+    auto ps_integrand = sampler.monitor("path_integrand");
+    auto ps_grid = sampler.monitor("path_grid");
+    for (std::size_t iter = 1; iter < sampler.iter_size(); ++iter) {
+        ps += 0.5 *
+            (ps_integrand.record(0, iter) + ps_integrand.record(0, iter - 1)) *
+            (ps_grid.record(0, iter) - ps_grid.record(0, iter - 1));
+    }
    watch.stop();

    std::cout << "Path sampling estimate: " << ps << std::endl;
diff --git a/example/gmm/src/gmm.cpp b/example/gmm/src/gmm.cpp
index 723326075..0528ca0e0 100644
--- a/example/gmm/src/gmm.cpp
+++ b/example/gmm/src/gmm.cpp
@@ -3,7 +3,7 @@
 //----------------------------------------------------------------------------
 // vSMC: Scalable Monte Carlo
 //----------------------------------------------------------------------------
-// Copyright (c) 2013-2015, Yan Zhou
+// Copyright (c) 2013-2016, Yan Zhou
 // All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without
diff --git a/example/pf/CMakeLists.txt b/example/pf/CMakeLists.txt
index 6537ad082..c8efc4312 100644
--- a/example/pf/CMakeLists.txt
+++ b/example/pf/CMakeLists.txt
@@ -3,7 +3,7 @@
 # ----------------------------------------------------------------------------
 # vSMC: Scalable Monte Carlo
 # ----------------------------------------------------------------------------
-# Copyright (c) 2013-2015, Yan Zhou
+# Copyright (c) 2013-2016, Yan Zhou
 # All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
@@ -50,9 +50,9 @@ COPY_FILE(pf pf.R)
 ADD_CUSTOM_TARGET(pf-check)
 ADD_DEPENDENCIES(check pf-check)

-FOREACH (backend ${SMP_EXECUTABLES})
+FOREACH(backend ${SMP_EXECUTABLES})
    ADD_PF_SMP_CHECK(${backend})
-ENDFOREACH (backend ${SMP_EXECUTABLES})
+ENDFOREACH(backend ${SMP_EXECUTABLES})

 ADD_CUSTOM_TARGET(pf-pdf
    DEPENDS pf-files pf-check
diff --git a/example/pf/include/pf.hpp b/example/pf/include/pf.hpp
index 085aaf15e..3c723096f 100644
--- a/example/pf/include/pf.hpp
+++ b/example/pf/include/pf.hpp
@@ -3,7 +3,7 @@
 //----------------------------------------------------------------------------
 // vSMC: Scalable Monte Carlo
 //----------------------------------------------------------------------------
-// Copyright (c) 2013-2015, Yan Zhou
+// Copyright (c) 2013-2016, Yan Zhou
 // All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without
@@ -64,16 +64,16 @@ static const std::size_t VelX = 2;
 static const std::size_t VelY = 3;
 static const std::size_t LogL = 4;

-template
-using StateBase = StateSMP>;
+template
+using StateBase = StateSMP>;

-template
-class pf_state : public StateBase
+template
+class pf_state : public StateBase
 {
     public:
-    using size_type = typename StateBase::size_type;
+    using size_type = typename StateBase::size_type;

-    pf_state(size_type N) : StateBase(N) {}
+    pf_state(size_type N) : StateBase(N) {}

     double &obs_x(std::size_t iter) { return obs_x_[iter]; }
     double &obs_y(std::size_t iter) { return obs_y_[iter]; }
@@ -110,11 +110,11 @@ class pf_state : public StateBase
     vsmc::Vector obs_y_;
 };

-template
-class pf_init : public InitializeSMP, pf_init>
+template
+class pf_init : public InitializeSMP, pf_init>
 {
     public:
-    std::size_t eval_sp(vsmc::SingleParticle> sp) const
+    std::size_t eval_sp(vsmc::SingleParticle> sp) const
     {
         const double sd_pos0 = 2;
         const double sd_vel0 = 1;
@@ -131,12 +131,12 @@ class pf_init : public InitializeSMP, pf_init>
     }

     void eval_param(
-        vsmc::Particle> &particle, void *file) const
+        vsmc::Particle> &particle, void *file) const
     {
         particle.value().read_data(static_cast(file));
     }

-    void eval_post(vsmc::Particle> &particle)
+    void eval_post(vsmc::Particle> &particle)
     {
         w_.resize(particle.size());
         particle.value().read_state(LogL, w_.data());
@@ -147,12 +147,12 @@ class pf_init : public InitializeSMP, pf_init>
     vsmc::Vector w_;
 };

-template
-class pf_move : public MoveSMP, pf_move>
+template
+class pf_move : public MoveSMP, pf_move>
 {
     public:
     std::size_t eval_sp(
-        std::size_t iter, vsmc::SingleParticle> sp) const
+        std::size_t iter, vsmc::SingleParticle> sp) const
     {
         const double sd_pos = std::sqrt(0.02);
         const double sd_vel = std::sqrt(0.001);
@@ -169,7 +169,7 @@ class pf_move : public MoveSMP, pf_move>
         return 1;
     }

-    void eval_post(std::size_t, vsmc::Particle> &particle)
+    void eval_post(std::size_t, vsmc::Particle> &particle)
     {
         w_.resize(particle.size());
         particle.value().read_state(LogL, w_.data());
@@ -180,19 +180,19 @@ class pf_move : public MoveSMP, pf_move>
     vsmc::Vector w_;
 };

-template
-class pf_meval : public MonitorEvalSMP, pf_meval>
+template
+class pf_meval : public MonitorEvalSMP, pf_meval>
 {
     public:
     void eval_sp(std::size_t, std::size_t,
-        vsmc::SingleParticle> sp, double *res)
+        vsmc::SingleParticle> sp, double *res)
     {
         res[0] = sp.state(PosX);
         res[1] = sp.state(PosY);
     }
 };

-template
+template
 inline void pf_run(vsmc::ResampleScheme scheme, const std::string &datafile,
     const std::string &prog, const std::string &name)
 {
@@ -201,10 +201,10 @@ inline void pf_run(vsmc::ResampleScheme scheme, const std::string &datafile,
     std::string pf_h5(prog + name + ".h5");

     vsmc::Seed::instance().set(101);
-    vsmc::Sampler> sampler(N, scheme, 0.5);
-    sampler.init(pf_init());
-    sampler.move(pf_move(), false);
-    sampler.monitor("pos", 2, pf_meval());
+    vsmc::Sampler> sampler(N, scheme, 0.5);
+    sampler.init(pf_init());
+    sampler.move(pf_move(), false);
+    sampler.monitor("pos", 2, pf_meval());
     sampler.monitor("pos").name(0) = "pos.x";
     sampler.monitor("pos").name(1) = "pos.y";
diff --git a/example/pf/pf.R b/example/pf/pf.R
index 313323d78..217c81417 100644
--- a/example/pf/pf.R
+++ b/example/pf/pf.R
@@ -3,7 +3,7 @@
 # ----------------------------------------------------------------------------
 # vSMC: Scalable Monte Carlo
 # ----------------------------------------------------------------------------
-# Copyright (c) 2013-2015, Yan Zhou
+# Copyright (c) 2013-2016, Yan Zhou
 # All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/example/pf/src/pf.cpp b/example/pf/src/pf.cpp
index 23cb8317b..240987e6e 100644
--- a/example/pf/src/pf.cpp
+++ b/example/pf/src/pf.cpp
@@ -3,7 +3,7 @@
 //----------------------------------------------------------------------------
 // vSMC: Scalable Monte Carlo
 //----------------------------------------------------------------------------
-// Copyright (c) 2013-2015, Yan Zhou
+// Copyright (c) 2013-2016, Yan Zhou
 // All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without
diff --git a/example/rng/CMakeLists.txt b/example/rng/CMakeLists.txt
index 3daa6caac..2768ba1a7 100644
--- a/example/rng/CMakeLists.txt
+++ b/example/rng/CMakeLists.txt
@@ -3,7 +3,7 @@
 # ----------------------------------------------------------------------------
 # vSMC: Scalable Monte Carlo
 # ----------------------------------------------------------------------------
-# Copyright (c) 2013-2015, Yan Zhou
+# Copyright (c) 2013-2016, Yan Zhou
 # All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
@@ -38,20 +38,17 @@ ADD_DEPENDENCIES(example rng)
 ADD_CUSTOM_TARGET(rng-files)
 ADD_DEPENDENCIES(example-files rng-files)

-IF (VSMC_ENABLE_LIBRARY)
-    ADD_DEFINITIONS(-DVSMC_RNG_TEST_C_API=1)
-ENDIF (VSMC_ENABLE_LIBRARY)
-
 FUNCTION(ADD_RNG_TEST name)
    ADD_VSMC_EXECUTABLE(rng_${name} ${PROJECT_SOURCE_DIR}/src/rng_${name}.cpp)
    ADD_DEPENDENCIES(rng rng_${name})
 ENDFUNCTION(ADD_RNG_TEST)

+ADD_RNG_TEST(u01)
 ADD_RNG_TEST(std)
 ADD_RNG_TEST(philox)
 ADD_RNG_TEST(threefry)

-IF (Boost_FOUND)
+IF(Boost_FOUND)
    ADD_RNG_TEST(beta)
    ADD_RNG_TEST(cauchy)
    ADD_RNG_TEST(chi_squared)
@@ -69,17 +66,17 @@ IF (Boost_FOUND)
    ADD_RNG_TEST(student_t)
    ADD_RNG_TEST(uniform_real)
    ADD_RNG_TEST(weibull)
-ENDIF (Boost_FOUND)
+ENDIF(Boost_FOUND)

-IF (AESNI_FOUND)
+IF(AESNI_FOUND)
    ADD_RNG_TEST(aes)
    ADD_RNG_TEST(ars)
-ENDIF (AESNI_FOUND)
+ENDIF(AESNI_FOUND)

-IF (RDRAND_FOUND)
+IF(RDRAND_FOUND)
    ADD_RNG_TEST(rdrand)
-ENDIF (RDRAND_FOUND)
+ENDIF(RDRAND_FOUND)

-IF (MKL_FOUND)
+IF(MKL_FOUND)
    ADD_RNG_TEST(mkl)
-ENDIF (MKL_FOUND)
+ENDIF(MKL_FOUND)
diff --git a/example/rng/include/rng_dist.hpp b/example/rng/include/rng_dist.hpp
index 1b0d80b3c..d920d9912 100644
--- a/example/rng/include/rng_dist.hpp
+++ b/example/rng/include/rng_dist.hpp
@@ -3,7 +3,7 @@
 //----------------------------------------------------------------------------
 // vSMC: Scalable Monte Carlo
 //----------------------------------------------------------------------------
-// Copyright (c) 2013-2015, Yan Zhou
+// Copyright (c) 2013-2016, Yan Zhou
 // All rights reserved.
// // Redistribution and use in source and binary forms, with or without @@ -34,44 +34,20 @@ #include #include -#include - -#define VSMC_RNG_DIST_1(Name, STD, p1) \ - param[0] = p1; \ - rng_dist, vsmc::Name##Distribution>( \ - N, M, param, #Name, names, mean, variance, pval1, pval2, sw); - -#define VSMC_RNG_DIST_2(Name, STD, p1, p2) \ - param[0] = p1; \ - param[1] = p2; \ - rng_dist, vsmc::Name##Distribution>( \ - N, M, param, #Name, names, mean, variance, pval1, pval2, sw); - -#define VSMC_RNG_DIST_ALL(TEST) \ - VSMC_RNG_DIST_2(Levy, vsmc::LevyDistribution, 0, 1); \ - VSMC_RNG_DIST_2(Pareto, vsmc::ParetoDistribution, 1, 1); - -#define VSMC_RNG_DIST_PRE(p) \ - std::size_t N = 10000; \ - if (argc > 1) \ - N = static_cast(std::atoi(argv[1])); \ - std::size_t M = 100; \ - if (argc > 2) \ - M = static_cast(std::atoi(argv[2])); \ - std::array param; \ - vsmc::Vector names; \ - vsmc::Vector mean; \ - vsmc::Vector variance; \ - vsmc::Vector pval1; \ - vsmc::Vector pval2; \ - vsmc::Vector sw; - -#define VSMC_RNG_DIST_POST \ - rng_dist_output(names, mean, variance, pval1, pval2, sw); - -template +#include +#include + +#define VSMC_RNG_DIST_TEST(K, Name, STD) \ + rng_dist_test, STD>( \ + argc, argv, #Name, params); \ + rng_dist_test, STD>( \ + argc, argv, #Name, params); \ + rng_dist_test, \ + STD>(argc, argv, #Name, params); + +template inline std::string rng_dist_name( - const std::string &name, const std::array ¶m) + const std::string &name, const std::array ¶m) { std::stringstream ss; ss << name; @@ -85,60 +61,250 @@ inline std::string rng_dist_name( return ss.str(); } -template -inline DistType rng_dist_init(const std::array ¶m) +template +inline DistType rng_dist_init(const std::array &) +{ + return DistType(); +} + +template +inline DistType rng_dist_init(const std::array ¶m) { return DistType(param[0]); } -template -inline DistType rng_dist_init(const std::array ¶m) +template +inline DistType rng_dist_init(const std::array ¶m) { return DistType(param[0], param[1]); } 
-template -inline vsmc::Vector rng_dist_partition_quantile( +template +inline vsmc::Vector rng_dist_partition_quantile( std::size_t n, const QuantileType &quantile) { std::size_t k = n / 100; - double h = 1.0 / k; - vsmc::Vector partition; + RealType h = static_cast(1.0 / k); + vsmc::Vector partition; for (std::size_t i = 0; i != k - 1; ++i) { - double p = h * (i + 1); - p = std::max(p, 0.0); - p = std::min(p, 1.0); + RealType p = h * (i + 1); + p = std::max(p, static_cast(0)); + p = std::min(p, static_cast(1)); partition.push_back(quantile(p)); } return partition; } -template -inline vsmc::Vector rng_dist_partition_boost( +template +inline vsmc::Vector rng_dist_partition_boost( std::size_t n, const BoostDistType &dist) { - auto quantile = [&](double p) { return boost::math::quantile(dist, p); }; + auto quantile = [&](RealType p) { return boost::math::quantile(dist, p); }; + + return rng_dist_partition_quantile(n, quantile); +} + +template +inline vsmc::Vector rng_dist_partition(std::size_t, DistType &); + +template +inline vsmc::Vector rng_dist_partition( + std::size_t n, vsmc::BetaDistribution &dist) +{ + return rng_dist_partition_boost(n, + boost::math::beta_distribution(dist.alpha(), dist.beta())); +} - return rng_dist_partition_quantile(n, quantile); +template +inline vsmc::Vector rng_dist_partition( + std::size_t n, vsmc::CauchyDistribution &dist) +{ + return rng_dist_partition_quantile(n, [&](RealType p) { + return dist.a() + + dist.b() * std::tan(vsmc::const_pi() * + (p - static_cast(0.5))); + }); } -template -inline vsmc::Vector rng_dist_partition(std::size_t, DistType &); +template +inline vsmc::Vector rng_dist_partition( + std::size_t n, vsmc::ChiSquaredDistribution &dist) +{ + return rng_dist_partition_boost( + n, boost::math::chi_squared_distribution(dist.n())); +} -template -inline vsmc::Vector rng_dist_partition( - std::size_t n, vsmc::UniformRealLRDistribution &dist) +template +inline vsmc::Vector rng_dist_partition( + std::size_t n, 
vsmc::ExponentialDistribution &dist) { - return rng_dist_partition_quantile( - n, [&](double p) { return dist.a() + p * (dist.b() - dist.a()); }); + return rng_dist_partition_quantile( + n, [&](RealType p) { return -std::log(1 - p) / dist.lambda(); }); } -inline double rng_dist_chi2( - const vsmc::Vector &r, const vsmc::Vector &partition) +template +inline vsmc::Vector rng_dist_partition( + std::size_t n, vsmc::ExtremeValueDistribution &dist) +{ + return rng_dist_partition_quantile(n, [&](RealType p) { + return dist.a() - dist.b() * std::log(-std::log(p)); + }); +} + +template +inline vsmc::Vector rng_dist_partition( + std::size_t n, vsmc::FisherFDistribution &dist) +{ + return rng_dist_partition_boost( + n, boost::math::fisher_f_distribution(dist.m(), dist.n())); +} + +template +inline vsmc::Vector rng_dist_partition( + std::size_t n, vsmc::GammaDistribution &dist) +{ + return rng_dist_partition_boost(n, + boost::math::gamma_distribution(dist.alpha(), dist.beta())); +} + +template +inline vsmc::Vector rng_dist_partition( + std::size_t n, vsmc::LaplaceDistribution &dist) +{ + return rng_dist_partition_quantile(n, [&](RealType p) { + RealType q = p - static_cast(0.5); + return q > 0 ? 
dist.a() - dist.b() * std::log(1 - 2 * q) : + dist.a() + dist.b() * std::log(1 + 2 * q); + }); +} + +template +inline vsmc::Vector rng_dist_partition( + std::size_t n, vsmc::LevyDistribution &dist) +{ + boost::math::normal_distribution normal(0, 1); + return rng_dist_partition_quantile(n, [&](RealType p) { + RealType q = boost::math::quantile(normal, 1 - p / 2); + return dist.a() + dist.b() / (q * q); + }); +} + +template +inline vsmc::Vector rng_dist_partition( + std::size_t n, vsmc::LogisticDistribution &dist) +{ + return rng_dist_partition_quantile(n, [&](RealType p) { + return dist.a() + dist.b() * std::log(p / (1 - p)); + }); +} + +template +inline vsmc::Vector rng_dist_partition( + std::size_t n, vsmc::LognormalDistribution &dist) +{ + return rng_dist_partition_boost( + n, boost::math::lognormal_distribution(dist.m(), dist.s())); +} + +template +inline vsmc::Vector rng_dist_partition( + std::size_t n, vsmc::NormalDistribution &dist) +{ + return rng_dist_partition_boost( + n, boost::math::normal_distribution( + dist.mean(), dist.stddev())); +} + +template +inline vsmc::Vector rng_dist_partition( + std::size_t n, vsmc::ParetoDistribution &dist) +{ + return rng_dist_partition_quantile(n, [&](RealType p) { + return dist.b() / std::exp(std::log(1 - p) / dist.a()); + }); +} + +template +inline vsmc::Vector rng_dist_partition( + std::size_t n, vsmc::RayleighDistribution &dist) +{ + return rng_dist_partition_quantile(n, [&](RealType p) { + return std::sqrt(-2 * std::log(1 - p) * dist.sigma() * dist.sigma()); + }); +} + +template +inline vsmc::Vector rng_dist_partition( + std::size_t n, vsmc::StudentTDistribution &dist) +{ + return rng_dist_partition_boost( + n, boost::math::students_t_distribution(dist.n())); +} + +template +inline vsmc::Vector rng_dist_partition( + std::size_t n, vsmc::U01Distribution &) +{ + return rng_dist_partition_quantile( + n, [&](RealType p) { return p; }); +} + +template +inline vsmc::Vector rng_dist_partition( + std::size_t n, 
vsmc::U01CCDistribution &) +{ + return rng_dist_partition_quantile( + n, [&](RealType p) { return p; }); +} + +template +inline vsmc::Vector rng_dist_partition( + std::size_t n, vsmc::U01CODistribution &) +{ + return rng_dist_partition_quantile( + n, [&](RealType p) { return p; }); +} + +template +inline vsmc::Vector rng_dist_partition( + std::size_t n, vsmc::U01OCDistribution &) +{ + return rng_dist_partition_quantile( + n, [&](RealType p) { return p; }); +} + +template +inline vsmc::Vector rng_dist_partition( + std::size_t n, vsmc::U01OODistribution &) +{ + return rng_dist_partition_quantile( + n, [&](RealType p) { return p; }); +} + +template +inline vsmc::Vector rng_dist_partition( + std::size_t n, vsmc::UniformRealDistribution &dist) +{ + return rng_dist_partition_quantile( + n, [&](RealType p) { return dist.a() + p * (dist.b() - dist.a()); }); +} + +template +inline vsmc::Vector rng_dist_partition( + std::size_t n, vsmc::WeibullDistribution &dist) +{ + return rng_dist_partition_quantile(n, [&](RealType p) { + return dist.b() * std::pow(-std::log(1 - p), 1 / dist.a()); + }); +} + +template +inline RealType rng_dist_chi2( + const vsmc::Vector &r, const vsmc::Vector &partition) { vsmc::Vector count(partition.size() + 1); - vsmc::Vector rval(r); + vsmc::Vector rval(r); std::sort(rval.begin(), rval.end()); std::size_t j = 0; for (std::size_t i = 0; i != partition.size(); ++i) { @@ -150,53 +316,61 @@ inline double rng_dist_chi2( count[i] = n; } count.back() = rval.size() - j; - double e = 1.0 / partition.size() * rval.size(); - double p = 0; + RealType e = static_cast(1.0 / partition.size() * rval.size()); + RealType p = 0; for (std::size_t i = 0; i != partition.size(); ++i) p += (count[i] - e) * (count[i] - e) / e; - boost::math::chi_squared_distribution chi2( - static_cast(partition.size() - 1)); + boost::math::chi_squared_distribution chi2( + static_cast(partition.size() - 1)); return boost::math::cdf(chi2, p); } -inline double rng_dist_ksad( - const 
vsmc::Vector &r, const vsmc::Vector &partition) +template +inline RealType rng_dist_ksad( + const vsmc::Vector &r, const vsmc::Vector &partition) { const std::size_t n = 100; const std::size_t m = r.size() / n; - vsmc::Vector rval(m); - vsmc::Vector pval(n); - vsmc::Vector head(n); - vsmc::Vector tail(n); + vsmc::Vector rval(m); + vsmc::Vector pval(n); + vsmc::Vector head(n); + vsmc::Vector tail(n); for (std::size_t i = 0; i != n; ++i) { std::copy(r.data() + i * m, r.data() + i * m + m, rval.data()); - pval[i] = rng_dist_chi2(rval, partition); + pval[i] = rng_dist_chi2(rval, partition); } std::sort(pval.begin(), pval.end()); vsmc::log(n, pval.data(), head.data()); std::reverse(pval.begin(), pval.end()); - vsmc::sub(n, 1.0, pval.data(), pval.data()); + vsmc::sub(n, static_cast(1), pval.data(), pval.data()); vsmc::log(n, pval.data(), tail.data()); vsmc::add(n, head.data(), tail.data(), pval.data()); for (std::size_t i = 0; i != n; ++i) - pval[i] *= 2 * (i + 1) - 1.0; + pval[i] *= 2 * (i + 1) - static_cast(1); - return -(n + std::accumulate(pval.begin(), pval.end(), 0.0) / n); + return -(n + + std::accumulate(pval.begin(), pval.end(), static_cast(0)) / + n); } -inline void rng_dist_moments(const vsmc::Vector &r, - vsmc::Vector &mean, vsmc::Vector &variance) +template +inline void rng_dist_moments(const vsmc::Vector &r, + vsmc::Vector &mean, vsmc::Vector &variance) { - mean.push_back(std::accumulate(r.begin(), r.end(), 0.0) / r.size()); - variance.push_back( - vsmc::dot(r.size(), r.data(), 1, r.data(), 1) / r.size() - - mean.back() * mean.back()); + mean.push_back( + std::accumulate(r.begin(), r.end(), static_cast(0)) / + r.size()); + RealType var = 0; + for (auto v : r) + var += v * v; + variance.push_back(var / r.size() - mean.back() * mean.back()); } -inline void rng_dist_pval(const vsmc::Vector &chi2, - const vsmc::Vector &ksad, vsmc::Vector &pval1, - vsmc::Vector &pval2) +template +inline void rng_dist_pval(const vsmc::Vector &chi2, + const vsmc::Vector &ksad, 
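The `rng_dist_chi2` routine above bins sorted variates into an equal-probability partition and feeds the Pearson statistic to a chi-squared CDF. A dependency-free sketch of the same statistic for U(0,1) samples (standard library only; the function name is illustrative, and the Boost step that maps the statistic to a p-value is left out):

```cpp
#include <algorithm>
#include <cstddef>
#include <vector>

// Pearson chi-squared statistic over k equal-probability cells of (0, 1):
// X^2 = sum_i (O_i - E)^2 / E with E = n / k; under H0 it is approximately
// chi^2(k - 1), which is what boost::math::cdf converts to a p-value above.
inline double chi2_statistic(std::vector<double> r, std::size_t k)
{
    std::sort(r.begin(), r.end());
    const double e = static_cast<double>(r.size()) / k; // expected per cell
    double s = 0;
    std::size_t j = 0;
    for (std::size_t i = 1; i <= k; ++i) {
        const double upper = static_cast<double>(i) / k; // cell boundary
        std::size_t count = 0;
        while (j < r.size() && r[j] <= upper) {
            ++count;
            ++j;
        }
        s += (count - e) * (count - e) / e;
    }
    return s;
}
```

With a degenerate sample all mass lands in one cell and the statistic blows up, which is exactly the failure mode the test is meant to catch.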
vsmc::Vector &pval1, + vsmc::Vector &pval2) { std::size_t alpha1; std::size_t alpha5; @@ -204,49 +378,55 @@ inline void rng_dist_pval(const vsmc::Vector &chi2, alpha1 = alpha5 = alpha10 = 0; for (std::size_t i = 0; i != chi2.size(); ++i) { - if (chi2[i] > 0.005 && chi2[i] < 1 - 0.005) + if (chi2[i] > static_cast(0.005) && + chi2[i] < static_cast(1 - 0.005)) ++alpha1; - if (chi2[i] > 0.025 && chi2[i] < 1 - 0.025) + if (chi2[i] > static_cast(0.025) && + chi2[i] < static_cast(1 - 0.025)) ++alpha5; - if (chi2[i] > 0.05 && chi2[i] < 1 - 0.05) + if (chi2[i] > static_cast(0.05) && + chi2[i] < static_cast(1 - 0.05)) ++alpha10; } - double nchi2 = static_cast(chi2.size()); - pval1.push_back(100.0 * alpha1 / nchi2); - pval1.push_back(100.0 * alpha5 / nchi2); - pval1.push_back(100.0 * alpha10 / nchi2); + pval1.push_back(static_cast(100.0 * alpha1 / chi2.size())); + pval1.push_back(static_cast(100.0 * alpha5 / chi2.size())); + pval1.push_back(static_cast(100.0 * alpha10 / chi2.size())); alpha1 = alpha5 = alpha10 = 0; for (std::size_t i = 0; i != ksad.size(); ++i) { - if (ksad[i] < 3.857) + if (ksad[i] < static_cast(3.857)) ++alpha1; - if (ksad[i] < 2.492) + if (ksad[i] < static_cast(2.492)) ++alpha5; - if (ksad[i] < 1.933) + if (ksad[i] < static_cast(1.933)) ++alpha10; } - double nksad = static_cast(ksad.size()); - pval2.push_back(100.0 * alpha1 / nksad); - pval2.push_back(100.0 * alpha5 / nksad); - pval2.push_back(100.0 * alpha10 / nksad); + pval2.push_back(static_cast(100.0 * alpha1 / ksad.size())); + pval2.push_back(static_cast(100.0 * alpha5 / ksad.size())); + pval2.push_back(static_cast(100.0 * alpha10 / ksad.size())); } -template +template inline void rng_dist(std::size_t n, std::size_t m, - const std::array ¶m, const std::string &name, - vsmc::Vector &names, vsmc::Vector &mean, - vsmc::Vector &variance, vsmc::Vector &pval1, - vsmc::Vector &pval2, vsmc::Vector &sw) + const std::array ¶m, const std::string &name, + vsmc::Vector &names, vsmc::Vector &mean, + vsmc::Vector 
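The single-level counts in `rng_dist_pval` above reduce to one operation: report the percentage of p-values that land inside the two-sided acceptance region (alpha/2, 1 − alpha/2). A minimal sketch under that reading; `pct_in_region` is a made-up name:

```cpp
#include <cstddef>
#include <vector>

// Percentage of p-values inside the two-sided acceptance region
// (alpha/2, 1 - alpha/2); rng_dist_pval reports this for alpha = 1%, 5%
// and 10%, hence the 0.005 / 0.025 / 0.05 thresholds above.
inline double pct_in_region(const std::vector<double> &pval, double alpha)
{
    std::size_t c = 0;
    for (double p : pval)
        if (p > alpha / 2 && p < 1 - alpha / 2)
            ++c;
    return 100.0 * c / pval.size();
}
```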
&variance, vsmc::Vector &pval1, + vsmc::Vector &pval2, vsmc::Vector &sw) { names.push_back(rng_dist_name(name, param)); - vsmc::RNG rng; - STDDistType dist_std(rng_dist_init(param)); - vSMCDistType dist_vsmc(rng_dist_init(param)); - vsmc::Vector r(n); - vsmc::Vector chi2; - vsmc::Vector ksad; - vsmc::Vector partition(rng_dist_partition(n, dist_vsmc)); + vsmc::RNG rng_vsmc; + vSMCDistType dist_vsmc(rng_dist_init(param)); + + std::mt19937 rng_std; + STDDistType dist_std(rng_dist_init(param)); + + vsmc::Vector r(n); + vsmc::Vector chi2; + vsmc::Vector ksad; + vsmc::Vector partition( + rng_dist_partition(n, dist_vsmc)); vsmc::StopWatch watch; watch.reset(); @@ -255,14 +435,14 @@ inline void rng_dist(std::size_t n, std::size_t m, for (std::size_t i = 0; i != m; ++i) { watch.start(); for (std::size_t j = 0; j != n; ++j) - r[j] = dist_std(rng); + r[j] = dist_std(rng_std); watch.stop(); - chi2.push_back(rng_dist_chi2(r, partition)); - ksad.push_back(rng_dist_ksad(r, partition)); + chi2.push_back(rng_dist_chi2(r, partition)); + ksad.push_back(rng_dist_ksad(r, partition)); } sw.push_back(watch); - rng_dist_moments(r, mean, variance); - rng_dist_pval(chi2, ksad, pval1, pval2); + rng_dist_moments(r, mean, variance); + rng_dist_pval(chi2, ksad, pval1, pval2); watch.reset(); chi2.clear(); @@ -270,28 +450,28 @@ inline void rng_dist(std::size_t n, std::size_t m, for (std::size_t i = 0; i != m; ++i) { watch.start(); for (std::size_t j = 0; j != n; ++j) - r[j] = dist_vsmc(rng); + r[j] = dist_vsmc(rng_vsmc); watch.stop(); - chi2.push_back(rng_dist_chi2(r, partition)); - ksad.push_back(rng_dist_ksad(r, partition)); + chi2.push_back(rng_dist_chi2(r, partition)); + ksad.push_back(rng_dist_ksad(r, partition)); } sw.push_back(watch); - rng_dist_moments(r, mean, variance); - rng_dist_pval(chi2, ksad, pval1, pval2); + rng_dist_moments(r, mean, variance); + rng_dist_pval(chi2, ksad, pval1, pval2); watch.reset(); chi2.clear(); ksad.clear(); for (std::size_t i = 0; i != m; ++i) { 
watch.start(); - vsmc::rng_rand(rng, dist_vsmc, n, r.data()); + vsmc::rng_rand(rng_vsmc, dist_vsmc, n, r.data()); watch.stop(); - chi2.push_back(rng_dist_chi2(r, partition)); - ksad.push_back(rng_dist_ksad(r, partition)); + chi2.push_back(rng_dist_chi2(r, partition)); + ksad.push_back(rng_dist_ksad(r, partition)); } sw.push_back(watch); - rng_dist_moments(r, mean, variance); - rng_dist_pval(chi2, ksad, pval1, pval2); + rng_dist_moments(r, mean, variance); + rng_dist_pval(chi2, ksad, pval1, pval2); #if VSMC_HAS_MKL vsmc::MKL_SFMT19937 rng_mkl; @@ -302,18 +482,19 @@ inline void rng_dist(std::size_t n, std::size_t m, watch.start(); vsmc::rng_rand(rng_mkl, dist_vsmc, n, r.data()); watch.stop(); - chi2.push_back(rng_dist_chi2(r, partition)); - ksad.push_back(rng_dist_ksad(r, partition)); + chi2.push_back(rng_dist_chi2(r, partition)); + ksad.push_back(rng_dist_ksad(r, partition)); } sw.push_back(watch); - rng_dist_moments(r, mean, variance); - rng_dist_pval(chi2, ksad, pval1, pval2); + rng_dist_moments(r, mean, variance); + rng_dist_pval(chi2, ksad, pval1, pval2); #endif } +template inline void rng_dist_output(const vsmc::Vector &names, - const vsmc::Vector &mean, const vsmc::Vector &variance, - const vsmc::Vector &pval1, const vsmc::Vector &pval2, + const vsmc::Vector &mean, const vsmc::Vector &variance, + const vsmc::Vector &pval1, const vsmc::Vector &pval2, const vsmc::Vector &sw) { std::size_t N = names.size(); @@ -322,6 +503,21 @@ inline void rng_dist_output(const vsmc::Vector &names, int twid = 15; int Twid = twid * static_cast(R); int nwid = static_cast(lwid) - Twid; + if (sizeof(RealType) == sizeof(float)) { + std::cout << std::string(lwid, '=') << std::endl; + std::cout << "Precision: float" << std::endl; + std::cout << std::string(lwid, '=') << std::endl; + } + if (sizeof(RealType) == sizeof(double)) { + std::cout << std::string(lwid, '=') << std::endl; + std::cout << "Precision: double" << std::endl; + std::cout << std::string(lwid, '=') << std::endl; + } + if 
(sizeof(RealType) == sizeof(long double)) { + std::cout << std::string(lwid, '=') << std::endl; + std::cout << "Precision: long double" << std::endl; + std::cout << std::string(lwid, '=') << std::endl; + } for (std::size_t i = 0; i != N; ++i) { std::cout << std::string(lwid, '=') << std::endl; @@ -342,19 +538,19 @@ inline void rng_dist_output(const vsmc::Vector &names, std::cout << std::endl; std::cout << std::left << std::setw(nwid) << "Mean"; for (std::size_t r = 0; r != R; ++r) { - double m = mean[i * R + r]; + RealType m = mean[i * R + r]; std::cout << std::right << std::setw(twid) << m; } std::cout << std::endl; std::cout << std::left << std::setw(nwid) << "Variance"; for (std::size_t r = 0; r != R; ++r) { - double v = variance[i * R + r]; + RealType v = variance[i * R + r]; std::cout << std::right << std::setw(twid) << v; } std::cout << std::endl; std::cout << std::left << std::setw(nwid) << "Single level test"; for (std::size_t r = 0; r != R; ++r) { - double p = pval1[i * R + r]; + RealType p = pval1[i * R + r]; std::stringstream ss; if (p < 50) ss << '*'; @@ -364,7 +560,7 @@ inline void rng_dist_output(const vsmc::Vector &names, std::cout << std::endl; std::cout << std::left << std::setw(nwid) << "Two level Test"; for (std::size_t r = 0; r != R; ++r) { - double p = pval2[i * R + r]; + RealType p = pval2[i * R + r]; std::stringstream ss; if (p < 50) ss << '*'; @@ -376,4 +572,32 @@ inline void rng_dist_output(const vsmc::Vector &names, std::cout << std::string(lwid, '=') << std::endl; } +template +inline void rng_dist_test( + int argc, char **argv, const std::string &name, const ParamType ¶ms) +{ + std::size_t N = 10000; + if (argc > 1) + N = static_cast(std::atoi(argv[1])); + std::size_t M = 100; + if (argc > 2) + M = static_cast(std::atoi(argv[2])); + + vsmc::Vector names; + vsmc::Vector mean; + vsmc::Vector variance; + vsmc::Vector pval1; + vsmc::Vector pval2; + vsmc::Vector sw; + for (const auto &p : params) { + std::array param; + for (std::size_t i = 0; 
i != p.size(); ++i) + param[i] = static_cast(p[i]); + rng_dist( + N, M, param, name, names, mean, variance, pval1, pval2, sw); + } + rng_dist_output(names, mean, variance, pval1, pval2, sw); +} + #endif // VSMC_EXAMPLE_RNG_DIST_HPP diff --git a/example/rng/include/rng_test.hpp b/example/rng/include/rng_test.hpp index 4e1b06d53..1e5c651b2 100644 --- a/example/rng/include/rng_test.hpp +++ b/example/rng/include/rng_test.hpp @@ -3,7 +3,7 @@ //---------------------------------------------------------------------------- // vSMC: Scalable Monte Carlo //---------------------------------------------------------------------------- -// Copyright (c) 2013-2015, Yan Zhou +// Copyright (c) 2013-2016, Yan Zhou // All rights reserved. // // Redistribution and use in source and binary forms, with or without @@ -35,93 +35,108 @@ #include #include -#define VSMC_RNG_TEST(RNGType) rng_test(N, #RNGType, names, size, sw); - #define VSMC_RNG_TEST_PRE(prog) \ std::size_t N = 1000000; \ if (argc > 1) \ N = static_cast(std::atoi(argv[1])); \ - std::string prog_name(#prog); \ - vsmc::Vector names; \ - vsmc::Vector size; \ - vsmc::Vector sw; + std::string name(#prog); \ + vsmc::Vector rng_name; \ + vsmc::Vector rng_size; \ + vsmc::Vector result_size; \ + vsmc::Vector num; \ + vsmc::Vector test; \ + vsmc::Vector sw1; \ + vsmc::Vector sw2; + +#define VSMC_RNG_TEST(RNGType) \ + rng_test( \ + N, #RNGType, rng_name, rng_size, result_size, num, test, sw1, sw2); -#define VSMC_RNG_TEST_POST rng_test_output(prog_name, names, size, sw); +#define VSMC_RNG_TEST_POST \ + rng_test_output( \ + name, rng_name, rng_size, result_size, num, test, sw1, sw2); template inline void rng_test(std::size_t n, const std::string &name, - vsmc::Vector &names, vsmc::Vector &size, - vsmc::Vector &sw) + vsmc::Vector &rng_name, vsmc::Vector &rng_size, + vsmc::Vector &result_size, vsmc::Vector &num, + vsmc::Vector &test, vsmc::Vector &sw1, + vsmc::Vector &sw2) { - names.push_back(name); - size.push_back(sizeof(RNGType)); + 
rng_name.push_back(name); + rng_size.push_back(sizeof(RNGType)); + result_size.push_back(sizeof(typename RNGType::result_type)); RNGType rng; - vsmc::Vector r(n); - vsmc::Vector u(n); - double result = 0; - vsmc::StopWatch watch; + RNGType rng1; + RNGType rng2; + vsmc::Vector r1(n * 2); + vsmc::Vector r2(n * 2); + vsmc::StopWatch watch1; + vsmc::StopWatch watch2; + bool passed = true; + std::size_t number = 0; + std::uniform_int_distribution runif(n, n * 2 - 1); + for (std::size_t i = 0; i != 10; ++i) { + std::size_t m = runif(rng); - std::uniform_real_distribution runif_std(0, 1); - watch.reset(); - watch.start(); - for (std::size_t i = 0; i != n; ++i) - r[i] = runif_std(rng); - watch.stop(); - result += std::accumulate(r.begin(), r.end(), 0.0); - sw.push_back(watch); + watch1.start(); + for (std::size_t j = 0; j != m; ++j) + r1[j] = rng1(); + watch1.stop(); - vsmc::UniformRealDistribution runif_vsmc(0, 1); - watch.reset(); - watch.start(); - for (std::size_t i = 0; i != n; ++i) - r[i] = runif_vsmc(rng); - watch.stop(); - result += std::accumulate(r.begin(), r.end(), 0.0); - sw.push_back(watch); + watch2.start(); + vsmc::rng_rand(rng2, m, r2.data()); + watch2.stop(); - rng_rand(rng, runif_vsmc, 1000, r.data()); - watch.reset(); - watch.start(); - rng_rand(rng, runif_vsmc, n, r.data()); - watch.stop(); - result += std::accumulate(r.begin(), r.end(), 0.0); - sw.push_back(watch); - - std::ofstream rnd("rnd"); - rnd << result << std::endl; - rnd.close(); + number += m; + for (std::size_t j = 0; j != m; ++j) + if (r1[j] != r2[j]) + passed = false; + } + sw1.push_back(watch1); + sw2.push_back(watch2); + num.push_back(number); + test.push_back(passed); } -inline void rng_test_output(const std::string &prog_name, - const vsmc::Vector &names, - const vsmc::Vector &size, - const vsmc::Vector &sw) +inline void rng_test_output(const std::string &name, + const vsmc::Vector &rng_name, + const vsmc::Vector &rng_size, + const vsmc::Vector &result_size, + const vsmc::Vector &num, 
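The rewritten `rng_test` above times a hand-written loop against the batch `vsmc::rng_rand` path and also verifies that the two produce bit-identical streams. The equivalence check can be sketched with a standard engine standing in for a vSMC RNG; `fill_batch` here is a hypothetical stand-in for the batch API:

```cpp
#include <cstddef>
#include <random>
#include <vector>

// Hypothetical stand-in for a batch generation API: fill r[0..n) by
// invoking the engine once per element.
template <typename RNGType>
inline void fill_batch(
    RNGType &rng, std::size_t n, typename RNGType::result_type *r)
{
    for (std::size_t i = 0; i != n; ++i)
        r[i] = rng();
}

// Two engines with the same default seed must yield identical streams
// whether drawn one at a time or in a batch; rng_test applies the same
// check to randomly sized chunks.
template <typename RNGType>
inline bool loop_batch_equal(std::size_t n)
{
    RNGType rng1;
    RNGType rng2;
    std::vector<typename RNGType::result_type> r1(n);
    std::vector<typename RNGType::result_type> r2(n);
    for (std::size_t i = 0; i != n; ++i)
        r1[i] = rng1();
    fill_batch(rng2, n, r2.data());
    return r1 == r2;
}
```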
const vsmc::Vector &test, + const vsmc::Vector &sw1, + const vsmc::Vector &sw2) { - std::size_t N = names.size(); - std::size_t R = sw.size() / N; - std::size_t lwid = 80; - int twid = 15; - int swid = 5; - int Twid = twid * static_cast(R); - int nwid = static_cast(lwid) - swid - Twid; + const int nwid = 30; + const int swid = 5; + const int twid = 15; + const std::size_t lwid = nwid + swid + twid * 5; std::cout << std::string(lwid, '=') << std::endl; - std::cout << std::left << std::setw(nwid) << prog_name; + std::cout << std::left << std::setw(nwid) << name; std::cout << std::right << std::setw(swid) << "Size"; - std::cout << std::right << std::setw(twid) << "Time (STD)"; - std::cout << std::right << std::setw(twid) << "Time (vSMC)"; - std::cout << std::right << std::setw(twid) << "Time (Batch)"; + std::cout << std::right << std::setw(twid) << "N/ns (Loop)"; + std::cout << std::right << std::setw(twid) << "N/ns (Batch)"; + std::cout << std::right << std::setw(twid) << "GB/s (Loop)"; + std::cout << std::right << std::setw(twid) << "GB/s (Batch)"; + std::cout << std::right << std::setw(twid) << "Test"; std::cout << std::endl; std::cout << std::string(lwid, '-') << std::endl; - for (std::size_t i = 0; i != N; ++i) { - std::cout << std::left << std::setw(nwid) << names[i]; - std::cout << std::right << std::setw(swid) << size[i]; - for (std::size_t r = 0; r != R; ++r) { - double time = sw[i * R + r].milliseconds(); - std::cout << std::right << std::setw(twid) << std::fixed << time; - } + for (std::size_t i = 0; i != rng_name.size(); ++i) { + double n1 = num[i] / sw1[i].nanoseconds(); + double n2 = num[i] / sw2[i].nanoseconds(); + double g1 = result_size[i] * num[i] / sw1[i].nanoseconds(); + double g2 = result_size[i] * num[i] / sw2[i].nanoseconds(); + std::string ts = test[i] ? 
"Passed" : "Failed"; + std::cout << std::left << std::setw(nwid) << rng_name[i]; + std::cout << std::right << std::setw(swid) << rng_size[i]; + std::cout << std::right << std::setw(twid) << std::fixed << n1; + std::cout << std::right << std::setw(twid) << std::fixed << n2; + std::cout << std::right << std::setw(twid) << std::fixed << g1; + std::cout << std::right << std::setw(twid) << std::fixed << g2; + std::cout << std::right << std::setw(twid) << ts; std::cout << std::endl; } std::cout << std::string(lwid, '=') << std::endl; diff --git a/example/rng/include/rng_u01.hpp b/example/rng/include/rng_u01.hpp new file mode 100644 index 000000000..319eea5b2 --- /dev/null +++ b/example/rng/include/rng_u01.hpp @@ -0,0 +1,221 @@ +//============================================================================ +// vSMC/example/rng/include/rng_u01.hpp +//---------------------------------------------------------------------------- +// vSMC: Scalable Monte Carlo +//---------------------------------------------------------------------------- +// Copyright (c) 2013-2016, Yan Zhou +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS AS IS +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
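The new columns in `rng_test_output` above derive throughput from the stopwatch totals: variates per nanosecond, and bytes per nanosecond (numerically equal to decimal GB/s, since 1 byte/ns = 10^9 bytes/s). A small sketch of that arithmetic, with illustrative names:

```cpp
#include <cstddef>

// Throughput summary as printed by rng_test_output: "N/ns" is variates per
// nanosecond; multiplying by the size of one result_type gives bytes per
// nanosecond, which is numerically decimal GB/s.
struct Throughput {
    double per_ns; // variates generated per nanosecond
    double gb_s;   // bytes per nanosecond == GB/s
};

inline Throughput throughput(
    std::size_t num, std::size_t result_size, double nanoseconds)
{
    Throughput t;
    t.per_ns = num / nanoseconds;
    t.gb_s = result_size * t.per_ns;
    return t;
}
```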
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. +//============================================================================ + +#ifndef VSMC_EXAMPLE_RNG_U01_HPP +#define VSMC_EXAMPLE_RNG_U01_HPP + +#include +#include +#include + +#define VSMC_DEFINE_RNGC_U01_LR_TEST( \ + ubits, fsuffix, lr, Left, Right, RealType) \ + template <> \ + inline RealType rng_u01_lr_c(std::uint##ubits##_t u) \ + { \ + return vsmc_u01_##lr##_u##ubits##fsuffix(u); \ + } + +#define VSMC_RNG_U01_TEST(Left, Right) \ + rng_u01_lr(argc, argv); \ + rng_u01_lr(argc, argv); \ + rng_u01_lr(argc, argv); \ + rng_u01_lr(argc, argv); \ + rng_u01_lr( \ + argc, argv); \ + rng_u01_lr( \ + argc, argv); + +template +inline std::string rng_u01_type_name(); + +template <> +inline std::string rng_u01_type_name() +{ + return "float"; +} + +template <> +inline std::string rng_u01_type_name() +{ + return "double"; +} + +template <> +inline std::string rng_u01_type_name() +{ + return "long double"; +} + +template <> +inline std::string rng_u01_type_name() +{ + return "Closed"; +} + +template <> +inline std::string rng_u01_type_name() +{ + return "Open"; +} + +template +inline RealType rng_u01_lr_c(UIntType u); + +VSMC_DEFINE_RNGC_U01_LR_TEST(32, f, cc, Closed, Closed, float) +VSMC_DEFINE_RNGC_U01_LR_TEST(64, f, cc, Closed, Closed, float) +VSMC_DEFINE_RNGC_U01_LR_TEST(32, d, cc, Closed, Closed, double) +VSMC_DEFINE_RNGC_U01_LR_TEST(64, d, cc, Closed, Closed, double) +VSMC_DEFINE_RNGC_U01_LR_TEST(32, l, cc, Closed, Closed, long 
double) +VSMC_DEFINE_RNGC_U01_LR_TEST(64, l, cc, Closed, Closed, long double) + +VSMC_DEFINE_RNGC_U01_LR_TEST(32, f, co, Closed, Open, float) +VSMC_DEFINE_RNGC_U01_LR_TEST(64, f, co, Closed, Open, float) +VSMC_DEFINE_RNGC_U01_LR_TEST(32, d, co, Closed, Open, double) +VSMC_DEFINE_RNGC_U01_LR_TEST(64, d, co, Closed, Open, double) +VSMC_DEFINE_RNGC_U01_LR_TEST(32, l, co, Closed, Open, long double) +VSMC_DEFINE_RNGC_U01_LR_TEST(64, l, co, Closed, Open, long double) + +VSMC_DEFINE_RNGC_U01_LR_TEST(32, f, oc, Open, Closed, float) +VSMC_DEFINE_RNGC_U01_LR_TEST(64, f, oc, Open, Closed, float) +VSMC_DEFINE_RNGC_U01_LR_TEST(32, d, oc, Open, Closed, double) +VSMC_DEFINE_RNGC_U01_LR_TEST(64, d, oc, Open, Closed, double) +VSMC_DEFINE_RNGC_U01_LR_TEST(32, l, oc, Open, Closed, long double) +VSMC_DEFINE_RNGC_U01_LR_TEST(64, l, oc, Open, Closed, long double) + +VSMC_DEFINE_RNGC_U01_LR_TEST(32, f, oo, Open, Open, float) +VSMC_DEFINE_RNGC_U01_LR_TEST(64, f, oo, Open, Open, float) +VSMC_DEFINE_RNGC_U01_LR_TEST(32, d, oo, Open, Open, double) +VSMC_DEFINE_RNGC_U01_LR_TEST(64, d, oo, Open, Open, double) +VSMC_DEFINE_RNGC_U01_LR_TEST(32, l, oo, Open, Open, long double) +VSMC_DEFINE_RNGC_U01_LR_TEST(64, l, oo, Open, Open, long double) + +template +inline void rng_u01_lb(RealType x) +{ + if (vsmc::internal::is_equal(x, static_cast(0))) + std::cout << "Left: " << std::setw(37) << std::left << 0; + else + std::cout << "Left: 2^" << std::setw(35) << std::left << std::log2(x); + + if (x < static_cast(0)) + std::cout << "< 0"; + else if (x > static_cast(0)) + std::cout << "Open"; + else + std::cout << "Closed"; + std::cout << std::endl; +} + +template +inline void rng_u01_rb(RealType x) +{ + if (vsmc::internal::is_equal(x, static_cast(1))) { + std::cout << "Right: " << std::setw(37) << std::left << 1; + } else { + std::cout << "Right: 1 - 2^" << std::setw(31) << std::left + << std::log2(static_cast(1) - x); + } + + if (x > static_cast(1)) + std::cout << "> 1"; + else if (x < static_cast(1)) +
std::cout << "Open"; + else + std::cout << "Closed"; + std::cout << std::endl; +} + +template +inline void rng_u01_lr(int argc, char **argv) +{ + std::size_t n = 1000000; + if (argc > 1) + n = static_cast(std::atoi(argv[1])); + + std::cout << std::string(50, '=') << std::endl; + std::cout << "u01::digits << "_t, " + << rng_u01_type_name() << ", " + << rng_u01_type_name() << ", " + << rng_u01_type_name() << ">" << std::endl; + std::cout << std::string(50, '-') << std::endl; + rng_u01_lb(vsmc::u01_lr( + std::numeric_limits::min())); + rng_u01_rb(vsmc::u01_lr( + std::numeric_limits::max())); + + vsmc::ThreefryEngine rng; + vsmc::Vector u(n * 2); + vsmc::Vector r(n * 2); + vsmc::Vector r1(n * 2); + vsmc::Vector r2(n * 2); + bool passed1 = true; + bool passed2 = true; + std::uniform_int_distribution runif(n, n * 2 - 1); + for (std::size_t i = 0; i != 10; ++i) { + std::size_t m = runif(rng); + + if (passed1 || passed2) { + vsmc::rng_rand(rng, m, u.data()); + for (std::size_t j = 0; j != m; ++j) + r[j] = vsmc::u01_lr(u[j]); + } + + if (passed1) { + for (std::size_t j = 0; j != m; ++j) + r1[j] = rng_u01_lr_c(u[j]); + for (std::size_t j = 0; j != m; ++j) { + if (!vsmc::internal::is_equal(r[j], r1[j])) { + passed1 = false; + break; + } + } + } + + if (passed2) { + vsmc::u01_lr( + m, u.data(), r2.data()); + for (std::size_t j = 0; j != m; ++j) { + if (!vsmc::internal::is_equal(r[j], r2[j])) { + passed2 = false; + break; + } + } + } + } + std::cout << std::setw(44) << std::left + << "C API:" << (passed1 ? "Passed" : "Failed") << std::endl; + std::cout << std::setw(44) << std::left + << "Batch:" << (passed2 ?
"Passed" : "Failed") << std::endl; +} + +#endif // VSMC_EXAMPLE_RNG_U01_HPP diff --git a/example/rng/src/rng_aes.cpp b/example/rng/src/rng_aes.cpp index d3f71b327..35c32e90a 100644 --- a/example/rng/src/rng_aes.cpp +++ b/example/rng/src/rng_aes.cpp @@ -3,7 +3,7 @@ //---------------------------------------------------------------------------- // vSMC: Scalable Monte Carlo //---------------------------------------------------------------------------- -// Copyright (c) 2013-2015, Yan Zhou +// Copyright (c) 2013-2016, Yan Zhou // All rights reserved. // // Redistribution and use in source and binary forms, with or without @@ -29,8 +29,8 @@ // POSSIBILITY OF SUCH DAMAGE. //============================================================================ -#include "rng_test.hpp" #include +#include "rng_test.hpp" int main(int argc, char **argv) { diff --git a/example/rng/src/rng_ars.cpp b/example/rng/src/rng_ars.cpp index 9885d4fd8..3663141d9 100644 --- a/example/rng/src/rng_ars.cpp +++ b/example/rng/src/rng_ars.cpp @@ -3,7 +3,7 @@ //---------------------------------------------------------------------------- // vSMC: Scalable Monte Carlo //---------------------------------------------------------------------------- -// Copyright (c) 2013-2015, Yan Zhou +// Copyright (c) 2013-2016, Yan Zhou // All rights reserved. // // Redistribution and use in source and binary forms, with or without @@ -29,8 +29,8 @@ // POSSIBILITY OF SUCH DAMAGE. 
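The `rng_u01_lb`/`rng_u01_rb` checks earlier probe whether an integer-to-[0,1] conversion attains its interval endpoints. One simple closed-closed convention, shown for 32-bit inputs (an illustration of the idea, not vSMC's exact implementation):

```cpp
#include <cstdint>

// A closed/closed mapping: u / (2^32 - 1) attains both endpoints exactly,
// since 0 / x == 0 and x / x rounds to exactly 1. The open variants tested
// above instead shift or scale the input so that an endpoint becomes
// unattainable.
inline double u01_cc(std::uint32_t u)
{
    return static_cast<double>(u) / 4294967295.0;
}
```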
//============================================================================ -#include "rng_test.hpp" #include +#include "rng_test.hpp" int main(int argc, char **argv) { diff --git a/example/rng/src/rng_beta.cpp b/example/rng/src/rng_beta.cpp index d026e8b04..bb17e4b94 100644 --- a/example/rng/src/rng_beta.cpp +++ b/example/rng/src/rng_beta.cpp @@ -3,7 +3,7 @@ //---------------------------------------------------------------------------- // vSMC: Scalable Monte Carlo //---------------------------------------------------------------------------- -// Copyright (c) 2013-2015, Yan Zhou +// Copyright (c) 2013-2016, Yan Zhou // All rights reserved. // // Redistribution and use in source and binary forms, with or without @@ -29,34 +29,24 @@ // POSSIBILITY OF SUCH DAMAGE. //============================================================================ -#include "rng_dist.hpp" #include -#include -#include - -template <> -inline vsmc::Vector rng_dist_partition>( - std::size_t n, vsmc::BetaDistribution &dist) -{ - return rng_dist_partition_boost( - n, boost::math::beta_distribution(dist.alpha(), dist.beta())); -} +#include "rng_dist.hpp" int main(int argc, char **argv) { - VSMC_RNG_DIST_PRE(2); - VSMC_RNG_DIST_2(Beta, boost::random::beta_distribution, 0.5, 0.5); - VSMC_RNG_DIST_2(Beta, boost::random::beta_distribution, 1, 1); - VSMC_RNG_DIST_2(Beta, boost::random::beta_distribution, 1, 0.5); - VSMC_RNG_DIST_2(Beta, boost::random::beta_distribution, 1, 1.5); - VSMC_RNG_DIST_2(Beta, boost::random::beta_distribution, 0.5, 1); - VSMC_RNG_DIST_2(Beta, boost::random::beta_distribution, 1.5, 1); - VSMC_RNG_DIST_2(Beta, boost::random::beta_distribution, 1.5, 1.5); - VSMC_RNG_DIST_2(Beta, boost::random::beta_distribution, 0.3, 0.3); - VSMC_RNG_DIST_2(Beta, boost::random::beta_distribution, 0.9, 0.9); - VSMC_RNG_DIST_2(Beta, boost::random::beta_distribution, 1.5, 0.5); - VSMC_RNG_DIST_2(Beta, boost::random::beta_distribution, 0.5, 1.5); - VSMC_RNG_DIST_POST; + vsmc::Vector> params; + 
params.push_back({{0.5, 0.5}}); + params.push_back({{1.0, 1.0}}); + params.push_back({{1.0, 0.5}}); + params.push_back({{1.0, 1.5}}); + params.push_back({{0.5, 1.0}}); + params.push_back({{1.5, 1.0}}); + params.push_back({{1.5, 1.5}}); + params.push_back({{0.3, 0.3}}); + params.push_back({{0.9, 0.9}}); + params.push_back({{1.5, 0.5}}); + params.push_back({{0.5, 1.5}}); + VSMC_RNG_DIST_TEST(2, Beta, boost::random::beta_distribution); return 0; } diff --git a/example/rng/src/rng_cauchy.cpp b/example/rng/src/rng_cauchy.cpp index 6316e82bd..eb53c69ef 100644 --- a/example/rng/src/rng_cauchy.cpp +++ b/example/rng/src/rng_cauchy.cpp @@ -3,7 +3,7 @@ //---------------------------------------------------------------------------- // vSMC: Scalable Monte Carlo //---------------------------------------------------------------------------- -// Copyright (c) 2013-2015, Yan Zhou +// Copyright (c) 2013-2016, Yan Zhou // All rights reserved. // // Redistribution and use in source and binary forms, with or without @@ -29,25 +29,14 @@ // POSSIBILITY OF SUCH DAMAGE. 
//============================================================================ -#include "rng_dist.hpp" #include - -template <> -inline vsmc::Vector - rng_dist_partition>( - std::size_t n, vsmc::CauchyDistribution &dist) -{ - return rng_dist_partition_quantile(n, [&](double p) { - return dist.a() + - dist.b() * std::tan(vsmc::const_pi() * (p - 0.5)); - }); -} +#include "rng_dist.hpp" int main(int argc, char **argv) { - VSMC_RNG_DIST_PRE(2); - VSMC_RNG_DIST_2(Cauchy, std::cauchy_distribution, 0, 1); - VSMC_RNG_DIST_POST; + vsmc::Vector> params; + params.push_back({{0.0, 1.0}}); + VSMC_RNG_DIST_TEST(2, Cauchy, std::cauchy_distribution); return 0; } diff --git a/example/rng/src/rng_chi_squared.cpp b/example/rng/src/rng_chi_squared.cpp index ec432bf5f..1f2dc11fe 100644 --- a/example/rng/src/rng_chi_squared.cpp +++ b/example/rng/src/rng_chi_squared.cpp @@ -3,7 +3,7 @@ //---------------------------------------------------------------------------- // vSMC: Scalable Monte Carlo //---------------------------------------------------------------------------- -// Copyright (c) 2013-2015, Yan Zhou +// Copyright (c) 2013-2016, Yan Zhou // All rights reserved. // // Redistribution and use in source and binary forms, with or without @@ -29,29 +29,19 @@ // POSSIBILITY OF SUCH DAMAGE. 
//============================================================================ -#include "rng_dist.hpp" #include -#include - -template <> -inline vsmc::Vector - rng_dist_partition>( - std::size_t n, vsmc::ChiSquaredDistribution &dist) -{ - return rng_dist_partition_boost( - n, boost::math::chi_squared_distribution(dist.n())); -} +#include "rng_dist.hpp" int main(int argc, char **argv) { - VSMC_RNG_DIST_PRE(1); - VSMC_RNG_DIST_1(ChiSquared, std::chi_squared_distribution, 0.2); - VSMC_RNG_DIST_1(ChiSquared, std::chi_squared_distribution, 1); - VSMC_RNG_DIST_1(ChiSquared, std::chi_squared_distribution, 1.5); - VSMC_RNG_DIST_1(ChiSquared, std::chi_squared_distribution, 2); - VSMC_RNG_DIST_1(ChiSquared, std::chi_squared_distribution, 3); - VSMC_RNG_DIST_1(ChiSquared, std::chi_squared_distribution, 30); - VSMC_RNG_DIST_POST; + vsmc::Vector> params; + params.push_back({{0.2}}); + params.push_back({{1.0}}); + params.push_back({{1.5}}); + params.push_back({{2.0}}); + params.push_back({{3.0}}); + params.push_back({{30.0}}); + VSMC_RNG_DIST_TEST(1, ChiSquared, std::chi_squared_distribution); return 0; } diff --git a/example/rng/src/rng_exponential.cpp b/example/rng/src/rng_exponential.cpp index dd7f9d5d0..b9c09c865 100644 --- a/example/rng/src/rng_exponential.cpp +++ b/example/rng/src/rng_exponential.cpp @@ -3,7 +3,7 @@ //---------------------------------------------------------------------------- // vSMC: Scalable Monte Carlo //---------------------------------------------------------------------------- -// Copyright (c) 2013-2015, Yan Zhou +// Copyright (c) 2013-2016, Yan Zhou // All rights reserved. // // Redistribution and use in source and binary forms, with or without @@ -29,23 +29,14 @@ // POSSIBILITY OF SUCH DAMAGE. 
//============================================================================ -#include "rng_dist.hpp" #include - -template <> -inline vsmc::Vector - rng_dist_partition>( - std::size_t n, vsmc::ExponentialDistribution &dist) -{ - return rng_dist_partition_quantile( - n, [&](double p) { return -std::log(1 - p) / dist.lambda(); }); -} +#include "rng_dist.hpp" int main(int argc, char **argv) { - VSMC_RNG_DIST_PRE(1); - VSMC_RNG_DIST_1(Exponential, std::exponential_distribution, 1); - VSMC_RNG_DIST_POST; + vsmc::Vector> params; + params.push_back({{1.0}}); + VSMC_RNG_DIST_TEST(1, Exponential, std::exponential_distribution); return 0; } diff --git a/example/rng/src/rng_extreme_value.cpp b/example/rng/src/rng_extreme_value.cpp index 0b766a0bd..26b29da70 100644 --- a/example/rng/src/rng_extreme_value.cpp +++ b/example/rng/src/rng_extreme_value.cpp @@ -3,7 +3,7 @@ //---------------------------------------------------------------------------- // vSMC: Scalable Monte Carlo //---------------------------------------------------------------------------- -// Copyright (c) 2013-2015, Yan Zhou +// Copyright (c) 2013-2016, Yan Zhou // All rights reserved. // // Redistribution and use in source and binary forms, with or without @@ -29,24 +29,14 @@ // POSSIBILITY OF SUCH DAMAGE. 
//============================================================================ -#include "rng_dist.hpp" #include - -template <> -inline vsmc::Vector - rng_dist_partition>( - std::size_t n, vsmc::ExtremeValueDistribution &dist) -{ - return rng_dist_partition_quantile(n, [&](double p) { - return dist.a() - dist.b() * std::log(-std::log(p)); - }); -} +#include "rng_dist.hpp" int main(int argc, char **argv) { - VSMC_RNG_DIST_PRE(2); - VSMC_RNG_DIST_2(ExtremeValue, std::extreme_value_distribution, 0, 1); - VSMC_RNG_DIST_POST; + vsmc::Vector> params; + params.push_back({{0.0, 1.0}}); + VSMC_RNG_DIST_TEST(2, ExtremeValue, std::extreme_value_distribution); return 0; } diff --git a/example/rng/src/rng_fisher_f.cpp b/example/rng/src/rng_fisher_f.cpp index 23a944288..88d746ac6 100644 --- a/example/rng/src/rng_fisher_f.cpp +++ b/example/rng/src/rng_fisher_f.cpp @@ -3,7 +3,7 @@ //---------------------------------------------------------------------------- // vSMC: Scalable Monte Carlo //---------------------------------------------------------------------------- -// Copyright (c) 2013-2015, Yan Zhou +// Copyright (c) 2013-2016, Yan Zhou // All rights reserved. // // Redistribution and use in source and binary forms, with or without @@ -29,43 +29,33 @@ // POSSIBILITY OF SUCH DAMAGE. 
//============================================================================ -#include "rng_dist.hpp" #include -#include - -template <> -inline vsmc::Vector - rng_dist_partition>( - std::size_t n, vsmc::FisherFDistribution &dist) -{ - return rng_dist_partition_boost( - n, boost::math::fisher_f_distribution(dist.m(), dist.n())); -} +#include "rng_dist.hpp" int main(int argc, char **argv) { - VSMC_RNG_DIST_PRE(2); - VSMC_RNG_DIST_2(FisherF, std::fisher_f_distribution, 0.2, 0.2); - VSMC_RNG_DIST_2(FisherF, std::fisher_f_distribution, 0.2, 1); - VSMC_RNG_DIST_2(FisherF, std::fisher_f_distribution, 0.2, 1.5); - VSMC_RNG_DIST_2(FisherF, std::fisher_f_distribution, 0.2, 2); - VSMC_RNG_DIST_2(FisherF, std::fisher_f_distribution, 0.2, 3); - VSMC_RNG_DIST_2(FisherF, std::fisher_f_distribution, 1, 0.2); - VSMC_RNG_DIST_2(FisherF, std::fisher_f_distribution, 1, 1); - VSMC_RNG_DIST_2(FisherF, std::fisher_f_distribution, 1, 1.5); - VSMC_RNG_DIST_2(FisherF, std::fisher_f_distribution, 1, 2); - VSMC_RNG_DIST_2(FisherF, std::fisher_f_distribution, 1, 3); - VSMC_RNG_DIST_2(FisherF, std::fisher_f_distribution, 2, 0.2); - VSMC_RNG_DIST_2(FisherF, std::fisher_f_distribution, 2, 1); - VSMC_RNG_DIST_2(FisherF, std::fisher_f_distribution, 2, 1.5); - VSMC_RNG_DIST_2(FisherF, std::fisher_f_distribution, 2, 2); - VSMC_RNG_DIST_2(FisherF, std::fisher_f_distribution, 2, 3); - VSMC_RNG_DIST_2(FisherF, std::fisher_f_distribution, 3, 0.2); - VSMC_RNG_DIST_2(FisherF, std::fisher_f_distribution, 3, 1); - VSMC_RNG_DIST_2(FisherF, std::fisher_f_distribution, 3, 1.5); - VSMC_RNG_DIST_2(FisherF, std::fisher_f_distribution, 3, 2); - VSMC_RNG_DIST_2(FisherF, std::fisher_f_distribution, 3, 3); - VSMC_RNG_DIST_POST; + vsmc::Vector> params; + params.push_back({{0.2, 0.2}}); + params.push_back({{0.2, 1.0}}); + params.push_back({{0.2, 1.5}}); + params.push_back({{0.2, 2.0}}); + params.push_back({{0.2, 3.0}}); + params.push_back({{1.0, 0.2}}); + params.push_back({{1.0, 1.0}}); + params.push_back({{1.0, 
1.5}}); + params.push_back({{1.0, 2.0}}); + params.push_back({{1.0, 3.0}}); + params.push_back({{2.0, 0.2}}); + params.push_back({{2.0, 1.0}}); + params.push_back({{2.0, 1.5}}); + params.push_back({{2.0, 2.0}}); + params.push_back({{2.0, 3.0}}); + params.push_back({{3.0, 0.2}}); + params.push_back({{3.0, 1.0}}); + params.push_back({{3.0, 1.5}}); + params.push_back({{3.0, 2.0}}); + params.push_back({{3.0, 3.0}}); + VSMC_RNG_DIST_TEST(2, FisherF, std::fisher_f_distribution); return 0; } diff --git a/example/rng/src/rng_gamma.cpp b/example/rng/src/rng_gamma.cpp index 481d34435..bae1670dd 100644 --- a/example/rng/src/rng_gamma.cpp +++ b/example/rng/src/rng_gamma.cpp @@ -3,7 +3,7 @@ //---------------------------------------------------------------------------- // vSMC: Scalable Monte Carlo //---------------------------------------------------------------------------- -// Copyright (c) 2013-2015, Yan Zhou +// Copyright (c) 2013-2016, Yan Zhou // All rights reserved. // // Redistribution and use in source and binary forms, with or without @@ -29,29 +29,19 @@ // POSSIBILITY OF SUCH DAMAGE. 
//============================================================================ -#include "rng_dist.hpp" #include -#include - -template <> -inline vsmc::Vector - rng_dist_partition>( - std::size_t n, vsmc::GammaDistribution &dist) -{ - return rng_dist_partition_boost( - n, boost::math::gamma_distribution(dist.alpha(), dist.beta())); -} +#include "rng_dist.hpp" int main(int argc, char **argv) { - VSMC_RNG_DIST_PRE(2); - VSMC_RNG_DIST_2(Gamma, std::gamma_distribution, 1, 1); - VSMC_RNG_DIST_2(Gamma, std::gamma_distribution, 0.1, 1); - VSMC_RNG_DIST_2(Gamma, std::gamma_distribution, 0.5, 1); - VSMC_RNG_DIST_2(Gamma, std::gamma_distribution, 0.7, 1); - VSMC_RNG_DIST_2(Gamma, std::gamma_distribution, 0.9, 1); - VSMC_RNG_DIST_2(Gamma, std::gamma_distribution, 1.5, 1); - VSMC_RNG_DIST_POST; + vsmc::Vector> params; + params.push_back({{1.0, 1.0}}); + params.push_back({{0.1, 1.0}}); + params.push_back({{0.5, 1.0}}); + params.push_back({{0.7, 1.0}}); + params.push_back({{0.9, 1.0}}); + params.push_back({{1.5, 1.0}}); + VSMC_RNG_DIST_TEST(2, Gamma, std::gamma_distribution); return 0; } diff --git a/example/rng/src/rng_laplace.cpp b/example/rng/src/rng_laplace.cpp index 738247a54..4b0ffad71 100644 --- a/example/rng/src/rng_laplace.cpp +++ b/example/rng/src/rng_laplace.cpp @@ -3,7 +3,7 @@ //---------------------------------------------------------------------------- // vSMC: Scalable Monte Carlo //---------------------------------------------------------------------------- -// Copyright (c) 2013-2015, Yan Zhou +// Copyright (c) 2013-2016, Yan Zhou // All rights reserved. // // Redistribution and use in source and binary forms, with or without @@ -29,27 +29,14 @@ // POSSIBILITY OF SUCH DAMAGE. 
//============================================================================ -#include "rng_dist.hpp" #include -#include - -template <> -inline vsmc::Vector - rng_dist_partition>( - std::size_t n, vsmc::LaplaceDistribution &dist) -{ - return rng_dist_partition_quantile(n, [&](double p) { - double q = p - 0.5; - return q > 0 ? dist.a() - dist.b() * std::log(1 - 2 * q) : - dist.a() + dist.b() * std::log(1 + 2 * q); - }); -} +#include "rng_dist.hpp" int main(int argc, char **argv) { - VSMC_RNG_DIST_PRE(2); - VSMC_RNG_DIST_2(Laplace, boost::random::laplace_distribution, 0, 1); - VSMC_RNG_DIST_POST; + vsmc::Vector> params; + params.push_back({{0.0, 1.0}}); + VSMC_RNG_DIST_TEST(2, Laplace, boost::random::laplace_distribution); return 0; } diff --git a/example/rng/src/rng_levy.cpp b/example/rng/src/rng_levy.cpp index 5820caeef..4c4c2728e 100644 --- a/example/rng/src/rng_levy.cpp +++ b/example/rng/src/rng_levy.cpp @@ -3,7 +3,7 @@ //---------------------------------------------------------------------------- // vSMC: Scalable Monte Carlo //---------------------------------------------------------------------------- -// Copyright (c) 2013-2015, Yan Zhou +// Copyright (c) 2013-2016, Yan Zhou // All rights reserved. // // Redistribution and use in source and binary forms, with or without @@ -29,26 +29,14 @@ // POSSIBILITY OF SUCH DAMAGE. 
//============================================================================ -#include "rng_dist.hpp" #include -#include - -template <> -inline vsmc::Vector rng_dist_partition>( - std::size_t n, vsmc::LevyDistribution &dist) -{ - boost::math::normal_distribution normal(0, 1); - return rng_dist_partition_quantile(n, [&](double p) { - double q = boost::math::quantile(normal, 1 - 0.5 * p); - return dist.a() + dist.b() / (q * q); - }); -} +#include "rng_dist.hpp" int main(int argc, char **argv) { - VSMC_RNG_DIST_PRE(2); - VSMC_RNG_DIST_2(Levy, vsmc::LevyDistribution, 0, 1); - VSMC_RNG_DIST_POST; + vsmc::Vector> params; + params.push_back({{0.0, 1.0}}); + VSMC_RNG_DIST_TEST(2, Levy, vsmc::LevyDistribution); return 0; } diff --git a/example/rng/src/rng_logistic.cpp b/example/rng/src/rng_logistic.cpp index 2d292ec9a..791842b62 100644 --- a/example/rng/src/rng_logistic.cpp +++ b/example/rng/src/rng_logistic.cpp @@ -3,7 +3,7 @@ //---------------------------------------------------------------------------- // vSMC: Scalable Monte Carlo //---------------------------------------------------------------------------- -// Copyright (c) 2013-2015, Yan Zhou +// Copyright (c) 2013-2016, Yan Zhou // All rights reserved. // // Redistribution and use in source and binary forms, with or without @@ -29,23 +29,14 @@ // POSSIBILITY OF SUCH DAMAGE. 
//============================================================================ -#include "rng_dist.hpp" #include - -template <> -inline vsmc::Vector - rng_dist_partition>( - std::size_t n, vsmc::LogisticDistribution &dist) -{ - return rng_dist_partition_quantile(n, - [&](double p) { return dist.a() + dist.b() * std::log(p / (1 - p)); }); -} +#include "rng_dist.hpp" int main(int argc, char **argv) { - VSMC_RNG_DIST_PRE(2); - VSMC_RNG_DIST_2(Logistic, vsmc::LogisticDistribution, 0, 1); - VSMC_RNG_DIST_POST; + vsmc::Vector> params; + params.push_back({{0.0, 1.0}}); + VSMC_RNG_DIST_TEST(2, Logistic, vsmc::LogisticDistribution); return 0; } diff --git a/example/rng/src/rng_lognormal.cpp b/example/rng/src/rng_lognormal.cpp index fcfac593c..2e0a71b57 100644 --- a/example/rng/src/rng_lognormal.cpp +++ b/example/rng/src/rng_lognormal.cpp @@ -3,7 +3,7 @@ //---------------------------------------------------------------------------- // vSMC: Scalable Monte Carlo //---------------------------------------------------------------------------- -// Copyright (c) 2013-2015, Yan Zhou +// Copyright (c) 2013-2016, Yan Zhou // All rights reserved. // // Redistribution and use in source and binary forms, with or without @@ -29,24 +29,14 @@ // POSSIBILITY OF SUCH DAMAGE. 
//============================================================================ -#include "rng_dist.hpp" #include -#include - -template <> -inline vsmc::Vector - rng_dist_partition>( - std::size_t n, vsmc::LognormalDistribution &dist) -{ - return rng_dist_partition_boost( - n, boost::math::lognormal_distribution(dist.m(), dist.s())); -} +#include "rng_dist.hpp" int main(int argc, char **argv) { - VSMC_RNG_DIST_PRE(2); - VSMC_RNG_DIST_2(Lognormal, std::lognormal_distribution, 0, 1); - VSMC_RNG_DIST_POST; + vsmc::Vector> params; + params.push_back({{0.0, 1.0}}); + VSMC_RNG_DIST_TEST(2, Lognormal, std::lognormal_distribution); return 0; } diff --git a/example/rng/src/rng_mkl.cpp b/example/rng/src/rng_mkl.cpp index 94172b5ce..501f17683 100644 --- a/example/rng/src/rng_mkl.cpp +++ b/example/rng/src/rng_mkl.cpp @@ -3,7 +3,7 @@ //---------------------------------------------------------------------------- // vSMC: Scalable Monte Carlo //---------------------------------------------------------------------------- -// Copyright (c) 2013-2015, Yan Zhou +// Copyright (c) 2013-2016, Yan Zhou // All rights reserved. // // Redistribution and use in source and binary forms, with or without @@ -29,8 +29,8 @@ // POSSIBILITY OF SUCH DAMAGE. //============================================================================ -#include "rng_test.hpp" #include +#include "rng_test.hpp" int main(int argc, char **argv) { diff --git a/example/rng/src/rng_normal.cpp b/example/rng/src/rng_normal.cpp index 308fd8306..9885c7487 100644 --- a/example/rng/src/rng_normal.cpp +++ b/example/rng/src/rng_normal.cpp @@ -3,7 +3,7 @@ //---------------------------------------------------------------------------- // vSMC: Scalable Monte Carlo //---------------------------------------------------------------------------- -// Copyright (c) 2013-2015, Yan Zhou +// Copyright (c) 2013-2016, Yan Zhou // All rights reserved. 
// // Redistribution and use in source and binary forms, with or without @@ -29,24 +29,14 @@ // POSSIBILITY OF SUCH DAMAGE. //============================================================================ -#include "rng_dist.hpp" #include -#include - -template <> -inline vsmc::Vector - rng_dist_partition>( - std::size_t n, vsmc::NormalDistribution &dist) -{ - return rng_dist_partition_boost(n, - boost::math::normal_distribution(dist.mean(), dist.stddev())); -} +#include "rng_dist.hpp" int main(int argc, char **argv) { - VSMC_RNG_DIST_PRE(2); - VSMC_RNG_DIST_2(Normal, std::normal_distribution, 0, 1); - VSMC_RNG_DIST_POST; + vsmc::Vector> params; + params.push_back({{0.0, 1.0}}); + VSMC_RNG_DIST_TEST(2, Normal, std::normal_distribution); return 0; } diff --git a/example/rng/src/rng_pareto.cpp b/example/rng/src/rng_pareto.cpp index 402425933..d4c64d874 100644 --- a/example/rng/src/rng_pareto.cpp +++ b/example/rng/src/rng_pareto.cpp @@ -3,7 +3,7 @@ //---------------------------------------------------------------------------- // vSMC: Scalable Monte Carlo //---------------------------------------------------------------------------- -// Copyright (c) 2013-2015, Yan Zhou +// Copyright (c) 2013-2016, Yan Zhou // All rights reserved. // // Redistribution and use in source and binary forms, with or without @@ -29,24 +29,14 @@ // POSSIBILITY OF SUCH DAMAGE. 
//============================================================================ -#include "rng_dist.hpp" #include - -template <> -inline vsmc::Vector - rng_dist_partition>( - std::size_t n, vsmc::ParetoDistribution &dist) -{ - return rng_dist_partition_quantile(n, [&](double p) { - return dist.b() / std::exp(std::log(1 - p) / dist.a()); - }); -} +#include "rng_dist.hpp" int main(int argc, char **argv) { - VSMC_RNG_DIST_PRE(2); - VSMC_RNG_DIST_2(Pareto, vsmc::ParetoDistribution, 1, 1); - VSMC_RNG_DIST_POST; + vsmc::Vector> params; + params.push_back({{1.0, 1.0}}); + VSMC_RNG_DIST_TEST(2, Pareto, vsmc::ParetoDistribution); return 0; } diff --git a/example/rng/src/rng_philox.cpp b/example/rng/src/rng_philox.cpp index 2eaa296b1..2ee4cc1d5 100644 --- a/example/rng/src/rng_philox.cpp +++ b/example/rng/src/rng_philox.cpp @@ -3,7 +3,7 @@ //---------------------------------------------------------------------------- // vSMC: Scalable Monte Carlo //---------------------------------------------------------------------------- -// Copyright (c) 2013-2015, Yan Zhou +// Copyright (c) 2013-2016, Yan Zhou // All rights reserved. // // Redistribution and use in source and binary forms, with or without @@ -29,8 +29,8 @@ // POSSIBILITY OF SUCH DAMAGE. //============================================================================ -#include "rng_test.hpp" #include +#include "rng_test.hpp" int main(int argc, char **argv) { diff --git a/example/rng/src/rng_rayleigh.cpp b/example/rng/src/rng_rayleigh.cpp index 343f9c885..3b1d77cc9 100644 --- a/example/rng/src/rng_rayleigh.cpp +++ b/example/rng/src/rng_rayleigh.cpp @@ -3,7 +3,7 @@ //---------------------------------------------------------------------------- // vSMC: Scalable Monte Carlo //---------------------------------------------------------------------------- -// Copyright (c) 2013-2015, Yan Zhou +// Copyright (c) 2013-2016, Yan Zhou // All rights reserved. 
// // Redistribution and use in source and binary forms, with or without @@ -29,24 +29,14 @@ // POSSIBILITY OF SUCH DAMAGE. //============================================================================ -#include "rng_dist.hpp" #include - -template <> -inline vsmc::Vector - rng_dist_partition>( - std::size_t n, vsmc::RayleighDistribution &dist) -{ - return rng_dist_partition_quantile(n, [&](double p) { - return std::sqrt(-2 * std::log(1 - p) * dist.sigma() * dist.sigma()); - }); -} +#include "rng_dist.hpp" int main(int argc, char **argv) { - VSMC_RNG_DIST_PRE(1); - VSMC_RNG_DIST_1(Rayleigh, vsmc::RayleighDistribution, 1); - VSMC_RNG_DIST_POST; + vsmc::Vector> params; + params.push_back({{1.0}}); + VSMC_RNG_DIST_TEST(1, Rayleigh, vsmc::RayleighDistribution); return 0; } diff --git a/example/rng/src/rng_rdrand.cpp b/example/rng/src/rng_rdrand.cpp index ce068fd81..1cda72213 100644 --- a/example/rng/src/rng_rdrand.cpp +++ b/example/rng/src/rng_rdrand.cpp @@ -3,7 +3,7 @@ //---------------------------------------------------------------------------- // vSMC: Scalable Monte Carlo //---------------------------------------------------------------------------- -// Copyright (c) 2013-2015, Yan Zhou +// Copyright (c) 2013-2016, Yan Zhou // All rights reserved. // // Redistribution and use in source and binary forms, with or without @@ -29,8 +29,8 @@ // POSSIBILITY OF SUCH DAMAGE. 
//============================================================================ -#include "rng_test.hpp" #include +#include "rng_test.hpp" int main(int argc, char **argv) { diff --git a/example/rng/src/rng_std.cpp b/example/rng/src/rng_std.cpp index 9f442219e..f5778ead5 100644 --- a/example/rng/src/rng_std.cpp +++ b/example/rng/src/rng_std.cpp @@ -3,7 +3,7 @@ //---------------------------------------------------------------------------- // vSMC: Scalable Monte Carlo //---------------------------------------------------------------------------- -// Copyright (c) 2013-2015, Yan Zhou +// Copyright (c) 2013-2016, Yan Zhou // All rights reserved. // // Redistribution and use in source and binary forms, with or without diff --git a/example/rng/src/rng_student_t.cpp b/example/rng/src/rng_student_t.cpp index dc5ab189e..0211519cf 100644 --- a/example/rng/src/rng_student_t.cpp +++ b/example/rng/src/rng_student_t.cpp @@ -3,7 +3,7 @@ //---------------------------------------------------------------------------- // vSMC: Scalable Monte Carlo //---------------------------------------------------------------------------- -// Copyright (c) 2013-2015, Yan Zhou +// Copyright (c) 2013-2016, Yan Zhou // All rights reserved. // // Redistribution and use in source and binary forms, with or without @@ -29,29 +29,19 @@ // POSSIBILITY OF SUCH DAMAGE. 
//============================================================================ -#include "rng_dist.hpp" #include -#include - -template <> -inline vsmc::Vector - rng_dist_partition>( - std::size_t n, vsmc::StudentTDistribution &dist) -{ - return rng_dist_partition_boost( - n, boost::math::students_t_distribution(dist.n())); -} +#include "rng_dist.hpp" int main(int argc, char **argv) { - VSMC_RNG_DIST_PRE(1); - VSMC_RNG_DIST_1(StudentT, std::student_t_distribution, 0.2); - VSMC_RNG_DIST_1(StudentT, std::student_t_distribution, 1); - VSMC_RNG_DIST_1(StudentT, std::student_t_distribution, 1.5); - VSMC_RNG_DIST_1(StudentT, std::student_t_distribution, 2); - VSMC_RNG_DIST_1(StudentT, std::student_t_distribution, 3); - VSMC_RNG_DIST_1(StudentT, std::student_t_distribution, 30); - VSMC_RNG_DIST_POST; + vsmc::Vector> params; + params.push_back({{0.2}}); + params.push_back({{1.0}}); + params.push_back({{1.5}}); + params.push_back({{2.0}}); + params.push_back({{3.0}}); + params.push_back({{30.0}}); + VSMC_RNG_DIST_TEST(1, StudentT, std::student_t_distribution); return 0; } diff --git a/example/rng/src/rng_threefry.cpp b/example/rng/src/rng_threefry.cpp index 2d1217d32..a9ae5c5d3 100644 --- a/example/rng/src/rng_threefry.cpp +++ b/example/rng/src/rng_threefry.cpp @@ -3,7 +3,7 @@ //---------------------------------------------------------------------------- // vSMC: Scalable Monte Carlo //---------------------------------------------------------------------------- -// Copyright (c) 2013-2015, Yan Zhou +// Copyright (c) 2013-2016, Yan Zhou // All rights reserved. // // Redistribution and use in source and binary forms, with or without @@ -29,8 +29,8 @@ // POSSIBILITY OF SUCH DAMAGE. 
//============================================================================ -#include "rng_test.hpp" #include +#include "rng_test.hpp" int main(int argc, char **argv) { diff --git a/example/rng/src/rng_u01.cpp b/example/rng/src/rng_u01.cpp new file mode 100644 index 000000000..ce1d23def --- /dev/null +++ b/example/rng/src/rng_u01.cpp @@ -0,0 +1,52 @@ +//============================================================================ +// vSMC/example/rng/include/rng_u01.cpp +//---------------------------------------------------------------------------- +// vSMC: Scalable Monte Carlo +//---------------------------------------------------------------------------- +// Copyright (c) 2013-2016, Yan Zhou +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS AS IS +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. +//============================================================================ + +#include "rng_u01.hpp" +#include +#include "rng_dist.hpp" + +int main(int argc, char **argv) +{ + VSMC_RNG_U01_TEST(Closed, Closed); + VSMC_RNG_U01_TEST(Closed, Open); + VSMC_RNG_U01_TEST(Open, Closed); + VSMC_RNG_U01_TEST(Open, Open); + std::cout << std::string(50, '=') << std::endl; + + vsmc::Vector> params(1); + VSMC_RNG_DIST_TEST(0, U01, std::uniform_real_distribution); + VSMC_RNG_DIST_TEST(0, U01CC, std::uniform_real_distribution); + VSMC_RNG_DIST_TEST(0, U01CO, std::uniform_real_distribution); + VSMC_RNG_DIST_TEST(0, U01OC, std::uniform_real_distribution); + VSMC_RNG_DIST_TEST(0, U01OO, std::uniform_real_distribution); + + return 0; +} diff --git a/example/rng/src/rng_uniform_real.cpp b/example/rng/src/rng_uniform_real.cpp index c82b0e81c..10e93207f 100644 --- a/example/rng/src/rng_uniform_real.cpp +++ b/example/rng/src/rng_uniform_real.cpp @@ -3,7 +3,7 @@ //---------------------------------------------------------------------------- // vSMC: Scalable Monte Carlo //---------------------------------------------------------------------------- -// Copyright (c) 2013-2015, Yan Zhou +// Copyright (c) 2013-2016, Yan Zhou // All rights reserved. // // Redistribution and use in source and binary forms, with or without @@ -29,18 +29,14 @@ // POSSIBILITY OF SUCH DAMAGE.
//============================================================================ -#include "rng_dist.hpp" #include +#include "rng_dist.hpp" int main(int argc, char **argv) { - VSMC_RNG_DIST_PRE(2); - VSMC_RNG_DIST_2(UniformReal, std::uniform_real_distribution, 0, 1); - VSMC_RNG_DIST_2(UniformRealCC, vsmc::UniformRealCCDistribution, 0, 1); - VSMC_RNG_DIST_2(UniformRealCO, vsmc::UniformRealCODistribution, 0, 1); - VSMC_RNG_DIST_2(UniformRealOC, vsmc::UniformRealOCDistribution, 0, 1); - VSMC_RNG_DIST_2(UniformRealOO, vsmc::UniformRealOODistribution, 0, 1); - VSMC_RNG_DIST_POST; + vsmc::Vector> params; + params.push_back({{0.0, 1.0}}); + VSMC_RNG_DIST_TEST(2, UniformReal, std::uniform_real_distribution); return 0; } diff --git a/example/rng/src/rng_weibull.cpp b/example/rng/src/rng_weibull.cpp index 82e54fb7d..f7de5cb0f 100644 --- a/example/rng/src/rng_weibull.cpp +++ b/example/rng/src/rng_weibull.cpp @@ -3,7 +3,7 @@ //---------------------------------------------------------------------------- // vSMC: Scalable Monte Carlo //---------------------------------------------------------------------------- -// Copyright (c) 2013-2015, Yan Zhou +// Copyright (c) 2013-2016, Yan Zhou // All rights reserved. // // Redistribution and use in source and binary forms, with or without @@ -29,24 +29,14 @@ // POSSIBILITY OF SUCH DAMAGE. 
//============================================================================ -#include "rng_dist.hpp" #include - -template <> -inline vsmc::Vector - rng_dist_partition>( - std::size_t n, vsmc::WeibullDistribution &dist) -{ - return rng_dist_partition_quantile(n, [&](double p) { - return dist.b() * std::pow(-std::log(1 - p), 1 / dist.a()); - }); -} +#include "rng_dist.hpp" int main(int argc, char **argv) { - VSMC_RNG_DIST_PRE(2); - VSMC_RNG_DIST_2(Weibull, std::weibull_distribution, 1, 1); - VSMC_RNG_DIST_POST; + vsmc::Vector> params; + params.push_back({{1.0, 1.0}}); + VSMC_RNG_DIST_TEST(2, Weibull, std::weibull_distribution); return 0; } diff --git a/example/vsmc/CMakeLists.txt b/example/vsmc/CMakeLists.txt index ab6f48c7b..f70944bfd 100644 --- a/example/vsmc/CMakeLists.txt +++ b/example/vsmc/CMakeLists.txt @@ -3,7 +3,7 @@ # ---------------------------------------------------------------------------- # vSMC: Scalable Monte Carlo # ---------------------------------------------------------------------------- -# Copyright (c) 2013-2015, Yan Zhou +# Copyright (c) 2013-2016, Yan Zhou # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without @@ -37,7 +37,6 @@ ADD_DEPENDENCIES(example vsmc) ADD_HEADER_EXECUTABLE(vsmc/core/core TRUE) ADD_HEADER_EXECUTABLE(vsmc/core/monitor TRUE) ADD_HEADER_EXECUTABLE(vsmc/core/particle TRUE) -ADD_HEADER_EXECUTABLE(vsmc/core/path TRUE) ADD_HEADER_EXECUTABLE(vsmc/core/sampler TRUE) ADD_HEADER_EXECUTABLE(vsmc/core/single_particle TRUE) ADD_HEADER_EXECUTABLE(vsmc/core/state_matrix TRUE) @@ -51,7 +50,6 @@ ADD_HEADER_EXECUTABLE(vsmc/internal/traits TRUE) ADD_HEADER_EXECUTABLE(vsmc/math/math TRUE) ADD_HEADER_EXECUTABLE(vsmc/math/constants TRUE) -ADD_HEADER_EXECUTABLE(vsmc/math/cblas TRUE) ADD_HEADER_EXECUTABLE(vsmc/math/vmath TRUE) ADD_HEADER_EXECUTABLE(vsmc/resample/resample TRUE) @@ -66,6 +64,7 @@ ADD_HEADER_EXECUTABLE(vsmc/resample/transform TRUE) ADD_HEADER_EXECUTABLE(vsmc/resample/internal/common TRUE) ADD_HEADER_EXECUTABLE(vsmc/rng/rng TRUE "MKL") +ADD_HEADER_EXECUTABLE(vsmc/rng/random_walk TRUE) ADD_HEADER_EXECUTABLE(vsmc/rng/rng_set TRUE) ADD_HEADER_EXECUTABLE(vsmc/rng/seed TRUE) ADD_HEADER_EXECUTABLE(vsmc/rng/u01 TRUE) @@ -73,7 +72,6 @@ ADD_HEADER_EXECUTABLE(vsmc/rng/u01_sequence TRUE) ADD_HEADER_EXECUTABLE(vsmc/rng/internal/common TRUE) ADD_HEADER_EXECUTABLE(vsmc/rng/distribution TRUE) -ADD_HEADER_EXECUTABLE(vsmc/rng/bernoulli_distribution TRUE) ADD_HEADER_EXECUTABLE(vsmc/rng/uniform_bits_distribution TRUE) ADD_HEADER_EXECUTABLE(vsmc/rng/beta_distribution TRUE) ADD_HEADER_EXECUTABLE(vsmc/rng/cauchy_distribution TRUE) @@ -87,6 +85,7 @@ ADD_HEADER_EXECUTABLE(vsmc/rng/levy_distribution TRUE) ADD_HEADER_EXECUTABLE(vsmc/rng/logistic_distribution TRUE) ADD_HEADER_EXECUTABLE(vsmc/rng/lognormal_distribution TRUE) ADD_HEADER_EXECUTABLE(vsmc/rng/normal_distribution TRUE) +ADD_HEADER_EXECUTABLE(vsmc/rng/normal_mv_distribution TRUE) ADD_HEADER_EXECUTABLE(vsmc/rng/pareto_distribution TRUE) ADD_HEADER_EXECUTABLE(vsmc/rng/rayleigh_distribution TRUE) ADD_HEADER_EXECUTABLE(vsmc/rng/student_t_distribution TRUE) @@ 
-115,6 +114,7 @@ ADD_HEADER_EXECUTABLE(vsmc/smp/backend_tbb ${TBB_FOUND} "TBB") ADD_HEADER_EXECUTABLE(vsmc/utility/utility TRUE "HDF5") ADD_HEADER_EXECUTABLE(vsmc/utility/aligned_memory TRUE) +ADD_HEADER_EXECUTABLE(vsmc/utility/covariance TRUE) ADD_HEADER_EXECUTABLE(vsmc/utility/hdf5io ${HDF5_FOUND} "HDF5") ADD_HEADER_EXECUTABLE(vsmc/utility/mkl ${MKL_FOUND} "MKL") ADD_HEADER_EXECUTABLE(vsmc/utility/program_option TRUE) diff --git a/example/vsmc/src/vsmc.cpp b/example/vsmc/src/vsmc.cpp index 15f61f450..69e2fad62 100644 --- a/example/vsmc/src/vsmc.cpp +++ b/example/vsmc/src/vsmc.cpp @@ -3,7 +3,7 @@ //---------------------------------------------------------------------------- // vSMC: Scalable Monte Carlo //---------------------------------------------------------------------------- -// Copyright (c) 2013-2015, Yan Zhou +// Copyright (c) 2013-2016, Yan Zhou // All rights reserved. // // Redistribution and use in source and binary forms, with or without diff --git a/include/vsmc/core/core.hpp b/include/vsmc/core/core.hpp index f6ef6cd0f..061762d0c 100644 --- a/include/vsmc/core/core.hpp +++ b/include/vsmc/core/core.hpp @@ -3,7 +3,7 @@ //---------------------------------------------------------------------------- // vSMC: Scalable Monte Carlo //---------------------------------------------------------------------------- -// Copyright (c) 2013-2015, Yan Zhou +// Copyright (c) 2013-2016, Yan Zhou // All rights reserved. 
// // Redistribution and use in source and binary forms, with or without @@ -35,7 +35,6 @@ #include #include #include -#include #include #include #include diff --git a/include/vsmc/core/monitor.hpp b/include/vsmc/core/monitor.hpp index 39021c8e1..fcab5fe01 100644 --- a/include/vsmc/core/monitor.hpp +++ b/include/vsmc/core/monitor.hpp @@ -3,7 +3,7 @@ //---------------------------------------------------------------------------- // vSMC: Scalable Monte Carlo //---------------------------------------------------------------------------- -// Copyright (c) 2013-2015, Yan Zhou +// Copyright (c) 2013-2016, Yan Zhou // All rights reserved. // // Redistribution and use in source and binary forms, with or without @@ -67,8 +67,8 @@ class Monitor using eval_type = std::function &, double *)>; - explicit Monitor(std::size_t dim, const eval_type &eval, - bool record_only = false, MonitorStage stage = MonitorMCMC) + Monitor(std::size_t dim, const eval_type &eval, bool record_only = false, + MonitorStage stage = MonitorMCMC) : dim_(dim) , eval_(eval) , recording_(true) @@ -213,16 +213,16 @@ class Monitor /// /// \param first The output iterator /// - /// For example, say `first` is of type `double *`, then if `order == + /// For example, say `first` is of type `double *`, then if `layout == /// ColMajor`, then, `first[j * iter_size() + i] == record(i, j)`. - /// Otherwise, if `order == RowMajor`, then `first[i * dim() + j] == + /// Otherwise, if `layout == RowMajor`, then `first[i * dim() + j] == /// record(i, j)`. That is, the output is an `iter_size()` by `dim()` - /// matrix, with the usual meaning of column or row major order. - template + /// matrix, with the usual meaning of column or row major layout. 
+ template void read_record_matrix(OutputIter first) const { const std::size_t N = iter_size(); - if (Order == ColMajor) { + if (Layout == ColMajor) { for (std::size_t d = 0; d != dim_; ++d) { const double *riter = record_.data() + d; for (std::size_t i = 0; i != N; ++i, ++first, riter += dim_) @@ -230,7 +230,7 @@ class Monitor } } - if (Order == RowMajor) + if (Layout == RowMajor) std::copy(record_.begin(), record_.end(), first); } @@ -260,7 +260,9 @@ class Monitor const std::size_t N = static_cast(particle.size()); buffer_.resize(N * dim_); eval_(iter, dim_, particle, buffer_.data()); - gemv(ColMajor, NoTrans, dim_, N, 1.0, buffer_.data(), dim_, + ::cblas_dgemv(::CblasColMajor, ::CblasNoTrans, + static_cast(dim_), static_cast(N), + 1.0, buffer_.data(), static_cast(dim_), particle.weight().data(), 1, 0.0, result_.data(), 1); push_back(iter); } diff --git a/include/vsmc/core/particle.hpp b/include/vsmc/core/particle.hpp index 72b14940c..6a7d7c383 100644 --- a/include/vsmc/core/particle.hpp +++ b/include/vsmc/core/particle.hpp @@ -3,7 +3,7 @@ //---------------------------------------------------------------------------- // vSMC: Scalable Monte Carlo //---------------------------------------------------------------------------- -// Copyright (c) 2013-2015, Yan Zhou +// Copyright (c) 2013-2016, Yan Zhou // All rights reserved. 
// // Redistribution and use in source and binary forms, with or without @@ -52,10 +52,10 @@ class Particle using value_type = T; using weight_type = WeightType; using rng_set_type = RNGSetType; - using resample_rng_type = ResampleRNGType; using rng_type = typename rng_set_type::rng_type; - using resample_type = std::function<void(std::size_t, std::size_t, resample_rng_type &, const double *, size_type *)>; + using resample_type = std::function<void(std::size_t, std::size_t, rng_type &, const double *, size_type *)>; + using sp_type = SingleParticle<T>; explicit Particle(size_type N) : size_(N) , weight_(static_cast>(N)) , rng_set_(static_cast>(N)) { - Seed::instance().seed_rng(resample_rng_); + Seed::instance().seed_rng(rng_); } /// \brief Clone the particle system except the RNG engines @@ -75,7 +75,7 @@ Particle particle(*this); if (new_rng) { particle.rng_set().seed(); - Seed::instance().seed_rng(particle.resample_rng()); + Seed::instance().seed_rng(particle.rng()); } return particle; @@ -95,7 +95,7 @@ class Particle if (!retain_rng) { rng_set_ = other.rng_set_; - resample_rng_ = other.resample_rng_; + rng_ = other.rng_; } } @@ -111,7 +111,7 @@ class Particle if (!retain_rng) { rng_set_ = other.rng_set_; - resample_rng_ = other.resample_rng_; + rng_ = other.rng_; } } @@ -142,8 +142,23 @@ class Particle /// \brief Get a (parallel) RNG stream for a given particle rng_type &rng(size_type id) { return rng_set_[id]; } + /// \brief Get a (parallel) RNG stream for a given particle + const rng_type &rng(size_type id) const { return rng_set_[id]; } + + /// \brief Get the (sequential) RNG stream used for resampling + rng_type &rng() { return rng_; } + /// \brief Get the (sequential) RNG stream used for resampling - resample_rng_type &resample_rng() { return resample_rng_; } + const rng_type &rng() const { return rng_; } + + /// \brief Get a SingleParticle object + sp_type sp(size_type id) { return SingleParticle<T>(id, this); } + + /// \brief Get a SingleParticle object for the first particle + sp_type begin() { return sp(0); } + + /// \brief Get a SingleParticle object one past the last particle + sp_type end() { return sp(size_); } /// \brief Perform resampling if ESS/N < threshold /// @@ -169,7 +184,7 @@ class Particle Vector idx(N); #endif // VSMC_USE_TBB - op(N, N, resample_rng_, rwptr, rep.data()); + op(N, N, rng_, rwptr, rep.data()); resample_trans_rep_index(N, N, rep.data(), idx.data()); value_.copy(N, idx.data()); } else { @@ -186,7 +201,7 @@ class Particle value_type value_; weight_type weight_; rng_set_type rng_set_; - resample_rng_type resample_rng_; + rng_type rng_; #if VSMC_USE_TBB ::tbb::combinable> rep_; ::tbb::combinable> idx_; diff --git a/include/vsmc/core/path.hpp b/include/vsmc/core/path.hpp deleted file mode 100644 index 37baf4733..000000000 --- a/include/vsmc/core/path.hpp +++ /dev/null @@ -1,260 +0,0 @@ -//============================================================================ -// vSMC/include/vsmc/core/path.hpp -//---------------------------------------------------------------------------- -// vSMC: Scalable Monte Carlo -//---------------------------------------------------------------------------- -// Copyright (c) 2013-2015, Yan Zhou -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are met: -// -// Redistributions of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. -// -// Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS AS IS -// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -// ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE -// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -// POSSIBILITY OF SUCH DAMAGE. -//============================================================================ - -#ifndef VSMC_CORE_PATH_HPP -#define VSMC_CORE_PATH_HPP - -#include - -#define VSMC_RUNTIME_ASSERT_CORE_PATH_ITER(func) \ - VSMC_RUNTIME_ASSERT((iter < iter_size()), \ - "**Path::" #func "** INVALID ITERATION NUMBER ARGUMENT") - -#define VSMC_RUNTIME_ASSERT_CORE_PATH_EVAL \ - VSMC_RUNTIME_ASSERT( \ - static_cast(eval_), "**Path::eval** INVALID EVALUAITON OBJECT") - -namespace vsmc -{ - -/// \brief Monitor for Path sampling -/// \ingroup Core -template -class Path -{ - public: - using value_type = T; - using eval_type = - std::function &, double *)>; - - /// \brief Construct a Path with an evaluation object - /// - /// \param eval The evaluation object of type Path::eval_type - /// \param record_only The Path monitor only records the integrands - /// instead - /// of calculating them itself - /// - /// A Path object is very similar to a Monitor object. It is a special - /// case - /// for Path sampling Monitor. The dimension of the Monitor is always one. - /// In addition, the evaluation object returns the integration grid of the - /// Path sampling. - /// - /// The evaluation object has the signature - /// ~~~{.cpp} - /// double eval (std::size_t iter, Particle &particle, double - /// *integrand) - /// ~~~ - /// where the first two arguments are passed in by the Sampler at the end - /// of each iteration. 
The evaluation occurs after the possible MCMC - /// moves. - /// The output parameter `integrand` shall contains the results of the - /// Path sampling integrands. The return value shall be the Path sampling - /// integration grid. - /// - /// If `record_only` is true, then the Path monitor only records the - /// integrand estimate stored in `integrand`. Otherwise the behavior is - /// explained below, - /// - /// For example, say the Path sampling is computed through integration of - /// \f$\lambda = \int_0^1 E[g_\alpha(X)]\,\mathrm{d}\alpha\f$. The - /// integral - /// is approximated with numerical integration at point - /// \f$\alpha_0 = 0, \alpha_1, \dots, \alpha_T = 1\f$, then at iteration - /// \f$t\f$, the output parameter `integrand` contains - /// \f$(g_{\alpha_t}(X_0),\dots)\f$ and the return value is - /// \f$\alpha_t\f$. - explicit Path(const eval_type &eval, bool record_only = false) - : eval_(eval) - , recording_(true) - , record_only_(record_only) - , log_zconst_(0) - { - } - - /// \brief The number of iterations has been recorded - /// - /// \sa Monitor::iter_size() - std::size_t iter_size() const { return index_.size(); } - - /// \brief Reserve space for a specified number of iterations - void reserve(std::size_t num) - { - index_.reserve(num); - integrand_.reserve(num); - grid_.reserve(num); - } - - /// \brief Whether the evaluation object is valid - bool empty() const { return !static_cast(eval_); } - - /// \brief Get the iteration index of the sampler of a given monitor - /// iteration - /// - /// \sa Monitor::index() - std::size_t index(std::size_t iter) const - { - VSMC_RUNTIME_ASSERT_CORE_PATH_ITER(index); - - return index_[iter]; - } - - /// \brief Get the Path sampling integrand of a given Path iteration - double integrand(std::size_t iter) const - { - VSMC_RUNTIME_ASSERT_CORE_PATH_ITER(integrand); - - return integrand_[iter]; - } - - /// \brief Get the Path sampling grid value of a given Path iteration - double grid(std::size_t iter) 
const - { - VSMC_RUNTIME_ASSERT_CORE_PATH_ITER(grid); - - return grid_[iter]; - } - - /// \brief Read only access to the raw data of the index vector - const std::size_t *index_data() const { return index_.data(); } - - /// \brief Read only access to the raw data of the integrand vector - const std::size_t *integrand_data() const { return integrand_.data(); } - - /// \brief Read only access to the raw data of the grid vector - const std::size_t *grid_data() const { return grid_.data(); } - - /// \brief Read the index history through an output iterator - /// - /// \sa Monitor::read_index() - template - void read_index(OutputIter first) const - { - std::copy(index_.begin(), index_.end(), first); - } - - /// \brief Read the integrand history through an output iterator - template - void read_integrand(OutputIter first) const - { - std::copy(integrand_.begin(), integrand_.end(), first); - } - - /// \brief Read the grid history through an output iterator - template - void read_grid(OutputIter first) const - { - std::copy(grid_.begin(), grid_.end(), first); - } - - /// \brief Set a new evaluation object of type eval_type - void set_eval(const eval_type &new_eval, bool record_only = false) - { - eval_ = new_eval; - record_only_ = record_only; - } - - /// Perform the evaluation for a given iteration and a Particle object - /// - /// \sa Monitor::eval() - void eval(std::size_t iter, Particle &particle) - { - if (!recording_) - return; - - VSMC_RUNTIME_ASSERT_CORE_PATH_EVAL; - - if (record_only_) { - double integrand = 0; - double grid = eval_(iter, particle, &integrand); - push_back(iter, grid, integrand); - - return; - } - - const std::size_t N = static_cast(particle.size()); - buffer_.resize(N); - double grid = eval_(iter, particle, buffer_.data()); - double integrand = - dot(N, particle.weight().data(), 1, buffer_.data(), 1); - push_back(iter, grid, integrand); - } - - /// \brief Get the nomralizing constants ratio estimates - double zconst() const { return 
std::exp(log_zconst_); } - - /// \brief Get the logarithm nomralizing constants ratio estimates - double log_zconst() const { return log_zconst_; } - - /// \brief Clear all records of the index and integrations - void clear() - { - log_zconst_ = 0; - index_.clear(); - integrand_.clear(); - grid_.clear(); - } - - /// \brief Whether the Path is actively recording restuls - bool recording() const { return recording_; } - - /// \brief Turn on the recording - void turn_on() { recording_ = true; } - - /// \brief Turn off the recording - void turn_off() { recording_ = false; } - - private: - eval_type eval_; - bool recording_; - bool record_only_; - double log_zconst_; - Vector index_; - Vector integrand_; - Vector grid_; - Vector buffer_; - - void push_back(std::size_t iter, double grid, double integrand) - { - index_.push_back(iter); - grid_.push_back(grid); - integrand_.push_back(integrand); - if (iter_size() > 1) { - std::size_t i = iter_size() - 1; - log_zconst_ += 0.5 * (grid_[i] - grid_[i - 1]) * - (integrand_[i] + integrand_[i - 1]); - } - } -}; // class PathSampling - -} // namespace vsmc - -#endif // VSMC_CORE_PATH_HPP diff --git a/include/vsmc/core/sampler.hpp b/include/vsmc/core/sampler.hpp index 4818aaeae..d1956dcce 100644 --- a/include/vsmc/core/sampler.hpp +++ b/include/vsmc/core/sampler.hpp @@ -3,7 +3,7 @@ //---------------------------------------------------------------------------- // vSMC: Scalable Monte Carlo //---------------------------------------------------------------------------- -// Copyright (c) 2013-2015, Yan Zhou +// Copyright (c) 2013-2016, Yan Zhou // All rights reserved. 
// // Redistribution and use in source and binary forms, with or without @@ -35,7 +35,6 @@ #include #include #include -#include #define VSMC_RUNTIME_ASSERT_CORE_SAMPLER_MONITOR_NAME(iter, map, func) \ VSMC_RUNTIME_ASSERT( \ @@ -73,7 +72,6 @@ class Sampler , init_by_iter_(false) , resample_threshold_(resample_threshold_never()) , iter_num_(0) - , path_(typename Path<T>::eval_type()) { resample_scheme(Multinomial); } @@ -87,7 +85,6 @@ class Sampler , init_by_iter_(false) , resample_threshold_(resample_threshold_always()) , iter_num_(0) - , path_(typename Path<T>::eval_type()) { resample_scheme(scheme); } @@ -101,7 +98,6 @@ class Sampler , init_by_iter_(false) , resample_threshold_(resample_threshold_always()) , iter_num_(0) - , path_(typename Path<T>::eval_type()) { resample_scheme(res_op); } @@ -113,7 +109,6 @@ class Sampler , init_by_iter_(false) , resample_threshold_(resample_threshold) , iter_num_(0) - , path_(typename Path<T>::eval_type()) { resample_scheme(scheme); } @@ -126,7 +121,6 @@ class Sampler , init_by_iter_(false) , resample_threshold_(resample_threshold) , iter_num_(0) - , path_(typename Path<T>::eval_type()) { resample_scheme(res_op); } @@ -202,8 +196,6 @@ class Sampler resampled_history_.reserve(num); for (auto &a : accept_history_) a.reserve(num); - if (!path_.empty()) - path_.reserve(num); for (auto &m : monitor_) if (!m.second.empty()) m.second.reserve(num); @@ -445,9 +437,9 @@ class Sampler /// \param param Additional parameters passed to the initialization object /// of type init_type /// - /// All histories (ESS, resampled, accept, Monitor and Path) are clared - /// before callling the initialization object. Monitors and Path's - /// evaluation objects are untouched. + /// All histories (ESS, resampled, acceptance count, Monitor) are cleared + /// before calling the initialization object. Monitors' evaluation objects + /// are untouched.
Sampler &initialize(void *param = nullptr) { do_reset(); @@ -468,7 +460,7 @@ class Sampler /// \details /// Moves are performed first. Then ESS/N is compared to the threshold and /// possible resampling is performed. Then mcmcs are performed. Then - monitors and Path are computed + monitors are computed Sampler &iterate(std::size_t num = 1) { do_acch(); @@ -483,26 +475,6 @@ class Sampler return *this; } - /// \brief Read and write access to the Path sampling monitor - Path<T> &path() { return path_; } - - /// \brief Read only access to the Path sampling monitor - const Path<T> &path() const { return path_; } - - /// \brief Set the Path sampling evaluation object - Sampler &path_sampling( - const typename Path<T>::eval_type &eval, bool record_only = false) - { - path_.set_eval(eval, record_only); - - return *this; - } - - /// \brief Path sampling estimate of the logarithm of normalizing - /// constants - /// ratio - double path_sampling() const { return path_.log_zconst(); } - /// \brief Add a monitor /// /// \param name The name of the monitor @@ -593,8 +565,6 @@ class Sampler return 0; std::size_t header_size = 1; - if (path_.iter_size() > 0) - header_size += 2; for (const auto &m : monitor_) if (m.second.iter_size() > 0) header_size += m.second.dim(); @@ -623,10 +593,6 @@ class Sampler return; *first++ = std::string("ESS"); - if (path_.iter_size() > 0) { - *first++ = std::string("Path.Integrand"); - *first++ = std::string("Path.Grid"); - } for (const auto &m : monitor_) { if (m.second.iter_size() > 0) { unsigned md = static_cast<unsigned>(m.second.dim()); @@ -653,28 +619,28 @@ } /// \brief Sampler summary data (integer data) - template <MatrixOrder Order, typename OutputIter> + template <MatrixLayout Layout, typename OutputIter> void summary_data_int(OutputIter first) const { if (summary_data_size_int() == 0) return; - if (Order == RowMajor) + if (Layout == RowMajor) summary_data_row_int(first); - if (Order == ColMajor) + if (Layout == ColMajor) summary_data_col_int(first); } /// \brief Sampler summary data (floating point data) - template <MatrixOrder Order, typename OutputIter> +
template void summary_data(OutputIter first) const { if (summary_data_size() == 0) return; - if (Order == RowMajor) + if (Layout == RowMajor) summary_data_row(first); - if (Order == ColMajor) + if (Layout == ColMajor) summary_data_col(first); } @@ -738,8 +704,6 @@ class Sampler Vector ess_history_; Vector resampled_history_; Vector> accept_history_; - - Path path_; monitor_map_type monitor_; void do_acch() @@ -762,7 +726,6 @@ class Sampler ess_history_.clear(); resampled_history_.clear(); accept_history_.clear(); - path_.clear(); for (auto &m : monitor_) m.second.clear(); iter_num_ = 0; @@ -816,9 +779,6 @@ class Sampler void do_monitor(MonitorStage stage) { - if (!path_.empty() && stage == MonitorMCMC) - path_.eval(iter_num_, particle_); - for (auto &m : monitor_) if (!m.second.empty()) m.second.eval(iter_num_, particle_, stage); @@ -853,8 +813,6 @@ class Sampler { double missing_data = std::numeric_limits::quiet_NaN(); - std::size_t piter = 0; - Vector *>> miter; for (const auto &m : monitor_) if (m.second.iter_size() > 0) @@ -862,16 +820,6 @@ class Sampler for (std::size_t iter = 0; iter != iter_size(); ++iter) { *first++ = ess_history_[iter]; - if (path_.iter_size() > 0) { - if (piter != path_.iter_size() && iter == path_.index(piter)) { - *first++ = path_.integrand(piter); - *first++ = path_.grid(piter); - ++piter; - } else { - *first++ = missing_data; - *first++ = missing_data; - } - } for (auto &m : miter) { std::size_t md = m.second->dim(); if (m.first != m.second->iter_size() && @@ -891,22 +839,6 @@ class Sampler double missing_data = std::numeric_limits::quiet_NaN(); first = std::copy(ess_history_.begin(), ess_history_.end(), first); - if (path_.iter_size() > 0) { - std::size_t piter = 0; - for (std::size_t iter = 0; iter != iter_size(); ++iter) { - if (piter != path_.iter_size() || iter == path_.index(piter)) - *first++ = path_.integrand(piter++); - else - *first = missing_data; - } - piter = 0; - for (std::size_t iter = 0; iter != iter_size(); ++iter) { 
- if (piter != path_.iter_size() || iter == path_.index(piter)) - *first++ = path_.grid(piter++); - else - *first = missing_data; - } - } for (const auto &m : monitor_) { if (m.second.iter_size() > 0) { for (std::size_t d = 0; d != m.second.dim(); ++d) { diff --git a/include/vsmc/core/single_particle.hpp b/include/vsmc/core/single_particle.hpp index b4c144dbe..7b9407bc9 100644 --- a/include/vsmc/core/single_particle.hpp +++ b/include/vsmc/core/single_particle.hpp @@ -3,7 +3,7 @@ //---------------------------------------------------------------------------- // vSMC: Scalable Monte Carlo //---------------------------------------------------------------------------- -// Copyright (c) 2013-2015, Yan Zhou +// Copyright (c) 2013-2016, Yan Zhou // All rights reserved. // // Redistribution and use in source and binary forms, with or without @@ -34,6 +34,16 @@ #include +#define VSMC_RUNTIME_ASSERT_SINGLE_PARTICLE_COMPARE(sp1, sp2) \ + VSMC_RUNTIME_ASSERT((sp1.particle_ptr() == sp2.particle_ptr()), \ + "COMPARE TWO SingleParticle OBJECTS THAT BELONG TO TWO PARTICLE " \ + "SYSTEMS"); + +#define VSMC_RUNTIME_ASSERT_SINGLE_PARTICLE_DIFFERENCE(sp1, sp2) \ + VSMC_RUNTIME_ASSERT((sp1.particle_ptr() == sp2.particle_ptr()), \ + "SUBTRACT TWO SingleParticle OBJECTS THAT BELONG TO TWO PARTICLE " \ + "SYSTEMS"); + namespace vsmc { @@ -52,7 +62,9 @@ class SingleParticleBase Particle<T> &particle() const { return *pptr_; } - typename Particle<T>::rng_type &rng() const { return pptr_->rng(id_); } + Particle<T> *particle_ptr() const { return pptr_; } + + typename Particle<T>::rng_type &rng() { return pptr_->rng(id_); } private: typename Particle<T>::size_type id_; @@ -94,8 +106,156 @@ class SingleParticle : public SingleParticleBaseType : SingleParticleBaseType(id, pptr) { } + + template <typename IntType> + SingleParticle<T> operator[](IntType n) + { + return SingleParticle<T>(static_cast<typename Particle<T>::size_type>( + static_cast<std::ptrdiff_t>(this->id()) + + static_cast<std::ptrdiff_t>(n)), + this->particle_ptr()); + } + + SingleParticle<T> &operator*() { return *this; } +
+ const SingleParticle &operator*() const { return *this; } }; // class SingleParticle +template +bool operator==(const SingleParticle &sp1, const SingleParticle &sp2) +{ + VSMC_RUNTIME_ASSERT_SINGLE_PARTICLE_COMPARE(sp1, sp2); + + return sp1.id() == sp2.id(); +} + +template +bool operator!=(const SingleParticle &sp1, const SingleParticle &sp2) +{ + VSMC_RUNTIME_ASSERT_SINGLE_PARTICLE_COMPARE(sp1, sp2); + + return sp1.id() != sp2.id(); +} + +template +bool operator<(const SingleParticle &sp1, const SingleParticle &sp2) +{ + VSMC_RUNTIME_ASSERT_SINGLE_PARTICLE_COMPARE(sp1, sp2); + + return sp1.id() < sp2.id(); +} + +template +bool operator>(const SingleParticle &sp1, const SingleParticle &sp2) +{ + VSMC_RUNTIME_ASSERT_SINGLE_PARTICLE_COMPARE(sp1, sp2); + + return sp1.id() > sp2.id(); +} + +template +bool operator<=(const SingleParticle &sp1, const SingleParticle &sp2) +{ + VSMC_RUNTIME_ASSERT_SINGLE_PARTICLE_COMPARE(sp1, sp2); + + return sp1.id() <= sp2.id(); +} + +template +bool operator>=(const SingleParticle &sp1, const SingleParticle &sp2) +{ + VSMC_RUNTIME_ASSERT_SINGLE_PARTICLE_COMPARE(sp1, sp2); + + return sp1.id() >= sp2.id(); +} + +template +SingleParticle &operator++(SingleParticle &sp) +{ + sp = SingleParticle(sp.id() + 1, sp.particle_ptr()); + + return sp; +} + +template +SingleParticle operator++(SingleParticle &sp, int) +{ + auto sp_tmp = sp; + sp = SingleParticle(sp.id() + 1, sp.particle_ptr()); + + return sp_tmp; +} + +template +SingleParticle &operator--(SingleParticle &sp) +{ + sp = SingleParticle(sp.id() - 1, sp.particle_ptr()); + + return sp; +} + +template +SingleParticle operator--(SingleParticle &sp, int) +{ + auto sp_tmp = sp; + sp = SingleParticle(sp.id() - 1, sp.particle_ptr()); + + return sp_tmp; +} + +template +SingleParticle operator+(const SingleParticle &sp, IntType n) +{ + return SingleParticle(static_cast::size_type>( + static_cast(sp.id()) + + static_cast(n)), + sp.particle_ptr()); +} + +template +SingleParticle operator+(IntType n, 
const SingleParticle &sp) +{ + return SingleParticle(static_cast::size_type>( + static_cast(sp.id()) + + static_cast(n)), + sp.particle_ptr()); +} + +template +SingleParticle operator-(const SingleParticle &sp, IntType n) +{ + return SingleParticle(static_cast::size_type>( + static_cast(sp.id()) - + static_cast(n)), + sp.particle_ptr()); +} + +template +SingleParticle &operator+=(SingleParticle &sp, IntType n) +{ + sp = sp + n; + + return sp; +} + +template +SingleParticle &operator-=(SingleParticle &sp, IntType n) +{ + sp = sp - n; + + return sp; +} + +template +std::ptrdiff_t operator-( + const SingleParticle &sp1, const SingleParticle &sp2) +{ + VSMC_RUNTIME_ASSERT_SINGLE_PARTICLE_DIFFERENCE(sp1, sp2); + + return static_cast(sp1.id()) - + static_cast(sp2.id()); +} + } // namespace vsmc #endif // VSMC_CORE_SINGLE_PARTICLE_HPP diff --git a/include/vsmc/core/state_matrix.hpp b/include/vsmc/core/state_matrix.hpp index 815a0c07d..7a61cd1fa 100644 --- a/include/vsmc/core/state_matrix.hpp +++ b/include/vsmc/core/state_matrix.hpp @@ -3,7 +3,7 @@ //---------------------------------------------------------------------------- // vSMC: Scalable Monte Carlo //---------------------------------------------------------------------------- -// Copyright (c) 2013-2015, Yan Zhou +// Copyright (c) 2013-2016, Yan Zhou // All rights reserved. 
// // Redistribution and use in source and binary forms, with or without @@ -35,10 +35,6 @@ #include #include -#define VSMC_STATIC_ASSERT_CORE_STATE_MATRIX_DYNAMIC_DIM_RESIZE(Dim) \ - VSMC_STATIC_ASSERT((Dim == Dynamic), \ - "**StateMatrix::resize_dim** USED WITH A FIXED DIMENSION OBJECT") - #define VSMC_RUNTIME_ASSERT_CORE_STATE_MATRIX_COPY_SIZE_MISMATCH \ VSMC_RUNTIME_ASSERT((N == static_cast(this->size())), \ "**StateMatrix::copy** SIZE MISMATCH") @@ -88,7 +84,7 @@ class StateMatrixDim /// \brief Base type of StateMatrix /// \ingroup Core -template +template class StateMatrixBase : public internal::StateMatrixDim { public: @@ -116,7 +112,8 @@ class StateMatrixBase : public internal::StateMatrixDim void resize_dim(std::size_t dim) { - VSMC_STATIC_ASSERT_CORE_STATE_MATRIX_DYNAMIC_DIM_RESIZE(Dim); + static_assert(Dim == Dynamic, + "**StateMatrix** OBJECT DECLARED WITH A FIXED DIMENSION"); VSMC_RUNTIME_ASSERT_CORE_STATE_MATRIX_DIM_SIZE(dim); internal::StateMatrixDim::resize_dim(dim); @@ -129,7 +126,7 @@ class StateMatrixBase : public internal::StateMatrixDim const state_type *data() const { return data_.data(); } - void swap(StateMatrixBase &other) + void swap(StateMatrixBase &other) { internal::StateMatrixDim::swap(other); std::swap(size_, other.size_); @@ -139,8 +136,8 @@ class StateMatrixBase : public internal::StateMatrixDim template void read_state(std::size_t pos, OutputIter first) const { - const StateMatrix *sptr = - static_cast *>(this); + const StateMatrix *sptr = + static_cast *>(this); for (size_type i = 0; i != size_; ++i, ++first) *first = sptr->state(i, pos); } @@ -152,19 +149,19 @@ class StateMatrixBase : public internal::StateMatrixDim read_state(d, *first); } - template + template void read_state_matrix(OutputIter first) const { - if (ROrder == Order) { + if (RLayout == Layout) { std::copy(data_.begin(), data_.end(), first); } else { - const StateMatrix *sptr = - static_cast *>(this); - if (ROrder == RowMajor) { + const StateMatrix *sptr = + 
static_cast *>(this); + if (RLayout == RowMajor) { for (size_type i = 0; i != size_; ++i) for (std::size_t d = 0; d != this->dim(); ++d) *first++ = sptr->state(i, d); - } else if (ROrder == ColMajor) { + } else if (RLayout == ColMajor) { for (std::size_t d = 0; d != this->dim(); ++d) for (size_type i = 0; i != size_; ++i) *first++ = sptr->state(i, d); @@ -179,8 +176,8 @@ class StateMatrixBase : public internal::StateMatrixDim if (this->dim() == 0 || size_ == 0 || !os.good()) return os; - const StateMatrix *sptr = - static_cast *>(this); + const StateMatrix *sptr = + static_cast *>(this); for (size_type i = 0; i != size_; ++i) { for (std::size_t d = 0; d != this->dim() - 1; ++d) os << sptr->state(i, d) << sepchar; @@ -198,11 +195,11 @@ class StateMatrixBase : public internal::StateMatrixDim Vector data_; }; // class StateMatrixBase -template +template inline std::basic_ostream &operator<<( std::basic_ostream &os, - const StateMatrixBase &smatrix) + const StateMatrixBase &smatrix) { return smatrix.print(os); } @@ -247,12 +244,12 @@ class StateMatrix : public StateMatrixBase } template - void copy(size_type N, const IntType *src_idx) + void copy(size_type N, const IntType *index) { VSMC_RUNTIME_ASSERT_CORE_STATE_MATRIX_COPY_SIZE_MISMATCH; for (size_type dst = 0; dst != N; ++dst) - copy_particle(src_idx[dst], dst); + copy_particle(index[dst], dst); } void copy_particle(size_type src, size_type dst) @@ -357,13 +354,13 @@ class StateMatrix : public StateMatrixBase } template - void copy(size_type N, const IntType *src_idx) + void copy(size_type N, const IntType *index) { VSMC_RUNTIME_ASSERT_CORE_STATE_MATRIX_COPY_SIZE_MISMATCH; for (std::size_t d = 0; d != this->dim(); ++d) for (size_type dst = 0; dst != N; ++dst) - state(dst, d) = state(static_cast(src_idx[dst]), d); + state(dst, d) = state(static_cast(index[dst]), d); } void copy_particle(size_type src, size_type dst) diff --git a/include/vsmc/core/weight.hpp b/include/vsmc/core/weight.hpp index a5c3a1ead..e29f4f257 
100644 --- a/include/vsmc/core/weight.hpp +++ b/include/vsmc/core/weight.hpp @@ -3,7 +3,7 @@ //---------------------------------------------------------------------------- // vSMC: Scalable Monte Carlo //---------------------------------------------------------------------------- -// Copyright (c) 2013-2015, Yan Zhou +// Copyright (c) 2013-2016, Yan Zhou // All rights reserved. // // Redistribution and use in source and binary forms, with or without @@ -42,7 +42,8 @@ namespace vsmc /// \ingroup Core inline double weight_ess(std::size_t N, const double *first) { - return 1 / dot(N, first, 1, first, 1); + return 1 / + ::cblas_ddot(static_cast(N), first, 1, first, 1); } /// \brief Normalize weights such that the summation is one @@ -208,7 +209,7 @@ class Weight { double *w = data_.data(); double accw = 0; - const std::size_t k = 1000; + const std::size_t k = 1024; const std::size_t m = size() / k; const std::size_t l = size() % k; for (std::size_t i = 0; i != m; ++i, w += k) @@ -216,7 +217,8 @@ class Weight normalize_eval(l, w, accw, use_log); ::vsmc::mul(size(), 1 / accw, data_.data(), data_.data()); - return 1 / dot(size(), data_.data(), 1, data_.data(), 1); + return 1 / cblas_ddot(static_cast(size()), + data_.data(), 1, data_.data(), 1); } void normalize_eval(std::size_t n, double *w, double &accw, bool use_log) diff --git a/include/vsmc/internal/assert.hpp b/include/vsmc/internal/assert.hpp index ae938cd85..62ee3c19e 100644 --- a/include/vsmc/internal/assert.hpp +++ b/include/vsmc/internal/assert.hpp @@ -3,7 +3,7 @@ //---------------------------------------------------------------------------- // vSMC: Scalable Monte Carlo //---------------------------------------------------------------------------- -// Copyright (c) 2013-2015, Yan Zhou +// Copyright (c) 2013-2016, Yan Zhou // All rights reserved. 
// // Redistribution and use in source and binary forms, with or without @@ -39,8 +39,6 @@ #include #include -#define VSMC_STATIC_ASSERT(cond, msg) static_assert(cond, msg) - #if VSMC_NO_RUNTIME_ASSERT #define VSMC_RUNTIME_ASSERT(cond, msg) #else // VSMC_NO_RUNTIME_ASSERT @@ -89,25 +87,6 @@ namespace vsmc { -namespace internal -{ - -template -class StaticAssert -{ - public: - static void test(int *) {} -}; // class StaticAssert - -template <> -class StaticAssert -{ - public: - static void test(...) {} -}; // class StaticAssert - -} // namespace vsmc::internal - class RuntimeAssert : public std::runtime_error { public: diff --git a/include/vsmc/internal/common.hpp b/include/vsmc/internal/common.hpp index 1c5b89671..aa474da94 100644 --- a/include/vsmc/internal/common.hpp +++ b/include/vsmc/internal/common.hpp @@ -3,7 +3,7 @@ //---------------------------------------------------------------------------- // vSMC: Scalable Monte Carlo //---------------------------------------------------------------------------- -// Copyright (c) 2013-2015, Yan Zhou +// Copyright (c) 2013-2016, Yan Zhou // All rights reserved. 
// // Redistribution and use in source and binary forms, with or without @@ -32,16 +32,12 @@ #ifndef VSMC_INTERNAL_COMMON_HPP #define VSMC_INTERNAL_COMMON_HPP +#include #include #include -#include #include #include - -#include -#include -#include - +#include #include #include @@ -85,6 +81,21 @@ namespace vsmc namespace internal { +#ifdef VSMC_CLANG +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wfloat-equal" +#endif + +template <typename T> +inline bool is_equal(const T &a, const T &b) +{ + return a == b; +} + +#ifdef VSMC_CLANG +#pragma clang diagnostic pop +#endif + template <typename UIntType> inline std::string itos(UIntType i, std::true_type) { @@ -119,13 +130,28 @@ inline std::string itos(IntType i) return itos(i, std::is_unsigned<IntType>()); } +template <typename T, std::size_t Dim> +using Array = typename std::conditional<Dim == Dynamic, Vector<T>, + std::array<T, Dim>>::type; + +template <typename T, std::size_t N> +inline void resize(std::array<T, N> &, std::size_t) +{ +} + +template <typename T> +inline void resize(Vector<T> &vec, std::size_t n) +{ + vec.resize(n); +} + } // namespace vsmc::internal template <typename CharT, typename Traits, typename T, std::size_t N> inline std::basic_ostream<CharT, Traits> &operator<<( std::basic_ostream<CharT, Traits> &os, const std::array<T, N> &ary) { - if (!os.good()) + if (!os.good() || N == 0) return os; for (std::size_t i = 0; i < N - 1; ++i) @@ -152,6 +178,37 @@ inline std::basic_istream<CharT, Traits> &operator>>( return is; } +template <typename CharT, typename Traits, typename T> +inline std::basic_ostream<CharT, Traits> &operator<<( + std::basic_ostream<CharT, Traits> &os, const Vector<T> &vec) +{ + if (!os.good() || vec.size() == 0) + return os; + + for (std::size_t i = 0; i < vec.size() - 1; ++i) + os << vec[i] << ' '; + os << vec.back(); + + return os; +} + +template <typename CharT, typename Traits, typename T> +inline std::basic_istream<CharT, Traits> &operator>>( + std::basic_istream<CharT, Traits> &is, Vector<T> &vec) +{ + if (!is.good()) + return is; + + Vector<T> vec_tmp(vec.size()); + for (std::size_t i = 0; i != vec_tmp.size(); ++i) + is >> std::ws >> vec_tmp[i]; + + if (is.good()) + vec = std::move(vec_tmp); + + return is; +} + } // namespace vsmc #endif // VSMC_INTERNAL_COMMON_HPP diff --git a/include/vsmc/internal/compiler.h b/include/vsmc/internal/compiler.h index cfe335840..fd661d5ef 100644 ---
a/include/vsmc/internal/compiler.h +++ b/include/vsmc/internal/compiler.h @@ -3,7 +3,7 @@ //---------------------------------------------------------------------------- // vSMC: Scalable Monte Carlo //---------------------------------------------------------------------------- -// Copyright (c) 2013-2015, Yan Zhou +// Copyright (c) 2013-2016, Yan Zhou // All rights reserved. // // Redistribution and use in source and binary forms, with or without @@ -75,15 +75,15 @@ #ifndef VSMC_OPENCL #ifdef __cplusplus -#include +#include #include #include -#include +#include #else -#include +#include #include #include -#include +#include #endif #endif @@ -91,34 +91,30 @@ #error __STDC_CONSTANT_MACROS not defined before #include #endif -#ifndef VSMC_HAS_RNGC_DOUBLE -#define VSMC_HAS_RNGC_DOUBLE 1 -#endif - -#ifndef VSMC_STATIC_INLINE -#ifdef __cplusplus -#define VSMC_STATIC_INLINE inline -#else -#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L -#define VSMC_STATIC_INLINE static inline -#else -#define VSMC_STATIC_INLINE static -#endif -#endif -#endif - #ifndef VSMC_INT64 #define VSMC_INT64 long long #endif #ifndef VSMC_HAS_X86 -#if defined(__x86__) || defined(__x86_64__) +#if defined(i386) || defined(__i386) || defined(__i386__) || \ + defined(_M_IX86) || defined(_X86_) || defined(__x86_64) || \ + defined(__x86_64__) || defined(__amd64) || defined(__amd64__) || \ + defined(_M_AMD64) || defined(_M_X64) #define VSMC_HAS_X86 1 #else #define VSMC_HAS_X86 0 #endif #endif +#ifndef VSMC_HAS_X86_64 +#if defined(__x86_64) || defined(__x86_64__) || defined(__amd64) || \ + defined(__amd64__) || defined(_M_AMD64) || defined(_M_X64) +#define VSMC_HAS_X86_64 1 +#else +#define VSMC_HAS_X86_64 0 +#endif +#endif + #ifndef VSMC_HAS_INT128 #define VSMC_HAS_INT128 0 #endif diff --git a/include/vsmc/internal/compiler/clang.h b/include/vsmc/internal/compiler/clang.h index 046af7c95..e23e635ee 100644 --- a/include/vsmc/internal/compiler/clang.h +++ b/include/vsmc/internal/compiler/clang.h @@ 
-3,7 +3,7 @@ //---------------------------------------------------------------------------- // vSMC: Scalable Monte Carlo //---------------------------------------------------------------------------- -// Copyright (c) 2013-2015, Yan Zhou +// Copyright (c) 2013-2016, Yan Zhou // All rights reserved. // // Redistribution and use in source and binary forms, with or without diff --git a/include/vsmc/internal/compiler/gcc.h b/include/vsmc/internal/compiler/gcc.h index 5f895b433..d9e7660fe 100644 --- a/include/vsmc/internal/compiler/gcc.h +++ b/include/vsmc/internal/compiler/gcc.h @@ -3,7 +3,7 @@ //---------------------------------------------------------------------------- // vSMC: Scalable Monte Carlo //---------------------------------------------------------------------------- -// Copyright (c) 2013-2015, Yan Zhou +// Copyright (c) 2013-2016, Yan Zhou // All rights reserved. // // Redistribution and use in source and binary forms, with or without diff --git a/include/vsmc/internal/compiler/intel.h b/include/vsmc/internal/compiler/intel.h index 82d9baad5..00ec30b2e 100644 --- a/include/vsmc/internal/compiler/intel.h +++ b/include/vsmc/internal/compiler/intel.h @@ -3,7 +3,7 @@ //---------------------------------------------------------------------------- // vSMC: Scalable Monte Carlo //---------------------------------------------------------------------------- -// Copyright (c) 2013-2015, Yan Zhou +// Copyright (c) 2013-2016, Yan Zhou // All rights reserved. 
// // Redistribution and use in source and binary forms, with or without diff --git a/include/vsmc/internal/compiler/msvc.h b/include/vsmc/internal/compiler/msvc.h index 53ebf63de..4ddd7fe38 100644 --- a/include/vsmc/internal/compiler/msvc.h +++ b/include/vsmc/internal/compiler/msvc.h @@ -3,7 +3,7 @@ //---------------------------------------------------------------------------- // vSMC: Scalable Monte Carlo //---------------------------------------------------------------------------- -// Copyright (c) 2013-2015, Yan Zhou +// Copyright (c) 2013-2016, Yan Zhou // All rights reserved. // // Redistribution and use in source and binary forms, with or without diff --git a/include/vsmc/internal/compiler/opencl.h b/include/vsmc/internal/compiler/opencl.h index cf2dd76be..8cb4219ff 100644 --- a/include/vsmc/internal/compiler/opencl.h +++ b/include/vsmc/internal/compiler/opencl.h @@ -3,7 +3,7 @@ //---------------------------------------------------------------------------- // vSMC: Scalable Monte Carlo //---------------------------------------------------------------------------- -// Copyright (c) 2013-2015, Yan Zhou +// Copyright (c) 2013-2016, Yan Zhou // All rights reserved. 
// // Redistribution and use in source and binary forms, with or without @@ -32,8 +32,8 @@ #ifndef VSMC_INTERNAL_COMPILER_OPENCL_H #define VSMC_INTERNAL_COMPILER_OPENCL_H -#ifndef VSMC_HAS_RNGC_DOUBLE -#define VSMC_HAS_RNGC_DOUBLE 0 +#ifndef VSMC_HAS_OPENCL_DOUBLE +#define VSMC_HAS_OPENCL_DOUBLE 0 #endif typedef uint uint32_t; @@ -41,12 +41,4 @@ typedef ulong uint64_t; #define UINT32_C(x) ((uint32_t)(x##U)) #define UINT64_C(x) ((uint64_t)(x##UL)) -#ifndef VSMC_STATIC_INLINE -#if defined(__OPENCL_C_VERSION__) && __OPENCL_C_VERSION__ >= 120 -#define VSMC_STATIC_INLINE static inline -#else -#define VSMC_STATIC_INLINE inline -#endif -#endif - #endif // VSMC_INTERNAL_COMPILER_OPENCL_H diff --git a/include/vsmc/internal/config.h b/include/vsmc/internal/config.h index dce199e23..1f13f59f2 100644 --- a/include/vsmc/internal/config.h +++ b/include/vsmc/internal/config.h @@ -3,7 +3,7 @@ //---------------------------------------------------------------------------- // vSMC: Scalable Monte Carlo //---------------------------------------------------------------------------- -// Copyright (c) 2013-2015, Yan Zhou +// Copyright (c) 2013-2016, Yan Zhou // All rights reserved. 
// // Redistribution and use in source and binary forms, with or without @@ -110,10 +110,6 @@ // Optional libraries -#ifndef VSMC_HAS_CBLAS -#define VSMC_HAS_CBLAS 0 -#endif - #ifndef VSMC_HAS_HDF5 #define VSMC_HAS_HDF5 0 #endif @@ -126,6 +122,10 @@ #define VSMC_USE_MKL_CBLAS VSMC_HAS_MKL #endif +#ifndef VSMC_USE_MKL_LAPACKE +#define VSMC_USE_MKL_LAPACKE VSMC_HAS_MKL +#endif + #ifndef VSMC_USE_MKL_VML #define VSMC_USE_MKL_VML VSMC_HAS_MKL #endif diff --git a/include/vsmc/internal/defines.hpp b/include/vsmc/internal/defines.hpp index 74c14b8c3..ed290ca50 100644 --- a/include/vsmc/internal/defines.hpp +++ b/include/vsmc/internal/defines.hpp @@ -3,7 +3,7 @@ //---------------------------------------------------------------------------- // vSMC: Scalable Monte Carlo //---------------------------------------------------------------------------- -// Copyright (c) 2013-2015, Yan Zhou +// Copyright (c) 2013-2016, Yan Zhou // All rights reserved. // // Redistribution and use in source and binary forms, with or without @@ -35,8 +35,6 @@ #include #include -#define VSMC_MNE - namespace vsmc { @@ -44,23 +42,15 @@ class NullType; /// \brief Dynamic dimension /// \ingroup Definitions -enum { - Dynamic = 0 ///< Used to specify a dimension template parameter is dynamic -}; // enum +enum { Dynamic = 0 }; -/// \brief Matrix order +/// \brief Matrix layout /// \ingroup Definitions -enum MatrixOrder { - RowMajor = 101, ///< Data are stored row by row in memory - ColMajor = 102 ///< Data are stored column by column in memory -}; // enum MatrixOrder +enum MatrixLayout { RowMajor = 101, ColMajor = 102 }; -/// \brief Matrix Transpose +/// \brief Alias to MatrixOrder /// \ingroup Definitions -enum MatrixTrans { - NoTrans = 111, ///< The matrix shall not be transposed - Trans = 112 ///< The matrix shall be transposed -}; // enum MatrixTrans +using MatrixOrder = MatrixLayout; /// \brief Resampling schemes /// \ingroup Definitions diff --git a/include/vsmc/internal/forward.hpp 
b/include/vsmc/internal/forward.hpp index 1369f459c..7f52afe6e 100644 --- a/include/vsmc/internal/forward.hpp +++ b/include/vsmc/internal/forward.hpp @@ -3,7 +3,7 @@ //---------------------------------------------------------------------------- // vSMC: Scalable Monte Carlo //---------------------------------------------------------------------------- -// Copyright (c) 2013-2015, Yan Zhou +// Copyright (c) 2013-2016, Yan Zhou // All rights reserved. // // Redistribution and use in source and binary forms, with or without @@ -47,9 +47,6 @@ class Particle; template class Monitor; -template -class Path; - template class SingleParticle; @@ -58,7 +55,7 @@ class SingleParticleBase; class Weight; -template +template class StateMatrix; template diff --git a/include/vsmc/internal/traits.hpp b/include/vsmc/internal/traits.hpp index 73c47f57b..1e31c0ed7 100644 --- a/include/vsmc/internal/traits.hpp +++ b/include/vsmc/internal/traits.hpp @@ -3,7 +3,7 @@ //---------------------------------------------------------------------------- // vSMC: Scalable Monte Carlo //---------------------------------------------------------------------------- -// Copyright (c) 2013-2015, Yan Zhou +// Copyright (c) 2013-2016, Yan Zhou // All rights reserved. 
// // Redistribution and use in source and binary forms, with or without @@ -211,6 +211,37 @@ namespace vsmc /// \ingroup Traits VSMC_DEFINE_TYPE_DISPATCH_TRAIT(SizeType, size_type, std::size_t) +namespace internal +{ + +template +class is_one_of : public std::integral_constant::value || is_one_of::value> +{ +}; // class is_one_of + +template +class is_one_of + : public std::integral_constant::value> +{ +}; // class is_one_of + +template +class is_seed_seq + : public std::integral_constant::value && is_seed_seq::value> +{ +}; + +template +class is_seed_seq + : public std::integral_constant::value && + !std::is_same::type, T1>::value> +{ +}; // class is_seed_seq + +} // namespace vsmc::internal + } // namespace vsmc #endif // VSMC_INTERNAL_TRAITS_HPP diff --git a/include/vsmc/math/cblas.h b/include/vsmc/math/cblas.h new file mode 100644 index 000000000..20fd60551 --- /dev/null +++ b/include/vsmc/math/cblas.h @@ -0,0 +1,58 @@ +//============================================================================ +// vSMC/include/vsmc/math/cblas.h +//---------------------------------------------------------------------------- +// vSMC: Scalable Monte Carlo +//---------------------------------------------------------------------------- +// Copyright (c) 2013-2016, Yan Zhou +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. 
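The `is_one_of` / `is_seed_seq` hunk in `traits.hpp` above lost its template parameter lists during extraction. As a rough reconstruction of the recursive variadic pattern the hunk adds (the parameter lists here are assumptions filled in from how the names are used, not the verbatim source):

```cpp
#include <type_traits>

namespace vsmc
{
namespace internal
{

// Recursive case: T is "one of" the pack if it matches the first type T1
// or any of the remaining Types. The template headers are reconstructed by
// assumption; the diff only shows the std::integral_constant base classes.
template <typename T, typename T1, typename... Types>
class is_one_of
    : public std::integral_constant<bool,
          std::is_same<T, T1>::value || is_one_of<T, Types...>::value>
{
}; // class is_one_of

// Base case: a pack of one type reduces to a plain is_same test.
template <typename T, typename T1>
class is_one_of<T, T1>
    : public std::integral_constant<bool, std::is_same<T, T1>::value>
{
}; // class is_one_of

} // namespace vsmc::internal
} // namespace vsmc
```

The same head-plus-tail recursion drives the `is_seed_seq` trait in the hunk, which additionally excludes types convertible to the result type so that seed-sequence constructors do not hijack ordinary seeding.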
+// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS AS IS +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. +//============================================================================ + +#ifndef VSMC_MATH_CBLAS_H +#define VSMC_MATH_CBLAS_H + +#include + +/// \brief Integer type of CBLAS routines +/// \ingroup Config +/// +/// \details +/// Define this macro if the CBLAS interface has unusual integer types. For +/// example, the CBLAS library uses ILP64 while the rest of the program uses +/// LP64.
+#ifndef VSMC_CBLAS_INT_TYPE +#define VSMC_CBLAS_INT_TYPE int +#endif + +#if VSMC_USE_MKL_CBLAS +#include +#define VSMC_CBLAS_INT MKL_INT +#else +#include +#ifndef VSMC_CBLAS_INT +#define VSMC_CBLAS_INT VSMC_CBLAS_INT_TYPE +#endif +#endif + +#endif // VSMC_MATH_CBLAS_H diff --git a/include/vsmc/math/cblas.hpp b/include/vsmc/math/cblas.hpp deleted file mode 100644 index d428e7f89..000000000 --- a/include/vsmc/math/cblas.hpp +++ /dev/null @@ -1,277 +0,0 @@ -//============================================================================ -// vSMC/include/vsmc/math/cblas.hpp -//---------------------------------------------------------------------------- -// vSMC: Scalable Monte Carlo -//---------------------------------------------------------------------------- -// Copyright (c) 2013-2015, Yan Zhou -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are met: -// -// Redistributions of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. -// -// Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS AS IS -// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -// ARE DISCLAIMED. 
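The deleted `cblas.hpp` whose removal begins here carried generic fallback implementations of the BLAS level 1 routines, but the `template <...>` headers were stripped during extraction. A self-contained sketch of the strided `asum` and `dot` routines as they appear in the hunks below (the template parameter lists are restored by assumption):

```cpp
#include <cmath>
#include <cstddef>

// Generic strided BLAS level-1 fallbacks, as in the deleted cblas.hpp.
// The `template <typename T>` headers are reconstructed; the extraction
// dropped everything between angle brackets.

// Sum of magnitudes of the vector elements, with stride incx.
template <typename T>
inline T asum(std::size_t n, const T *x, std::size_t incx)
{
    T sum = 0;
    std::size_t j = 0;
    for (std::size_t i = 0; i != n; ++i, j += incx)
        sum += std::fabs(x[j]);

    return sum;
}

// Vector-vector dot product with independent strides for x and y.
template <typename T>
inline T dot(
    std::size_t n, const T *x, std::size_t incx, const T *y, std::size_t incy)
{
    T sum = 0;
    std::size_t j = 0;
    std::size_t k = 0;
    for (std::size_t i = 0; i != n; ++i, j += incx, k += incy)
        sum += x[j] * y[k];

    return sum;
}
```

The `float` and `double` overloads further down dispatch to `cblas_sasum`/`cblas_ddot` and friends when a CBLAS implementation is available, casting sizes and strides to `VSMC_CBLAS_INT`.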
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE -// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -// POSSIBILITY OF SUCH DAMAGE. -//============================================================================ - -#ifndef VSMC_MATH_CBLAS_HPP -#define VSMC_MATH_CBLAS_HPP - -#include -#include -#include -#include - -#if VSMC_USE_MKL_CBLAS -#include -#define VSMC_CBLAS_INT MKL_INT -#elif VSMC_HAS_CBLAS -#include -#ifndef VSMC_CBLAS_INT -#define VSMC_CBLAS_INT int -#endif -#endif - -namespace vsmc -{ - -/// \defgroup CBLAS1 BLAS level 1 routines and functions -/// \ingroup CBLAS -/// @{ - -/// \brief Computes the sum of magnitudes of the vector elements -template -inline T asum(std::size_t n, const T *x, std::size_t incx) -{ - T sum = 0; - std::size_t j = 0; - for (std::size_t i = 0; i != n; ++i, j += incx) - sum += std::fabs(x[j]); - - return sum; -} - -/// \brief Computes a vector-scalar product and adds the result to a vector -template -inline void axpy( - std::size_t n, T a, const T *x, std::size_t incx, T *y, std::size_t incy) -{ - std::size_t j = 0; - std::size_t k = 0; - for (std::size_t i = 0; i != n; ++i, j += incx, k += incy) - y[k] += a * x[j]; -} - -/// \brief Copies vector to another vector -template -inline void copy( - std::size_t n, const T *x, std::size_t incx, T *y, std::size_t incy) -{ - std::size_t j = 0; - std::size_t k = 0; - for (std::size_t i = 0; i != n; ++i, j += incx, k += incy) - y[k] = x[j]; -} - -/// \brief Computes a vector-vector dot product -template -inline T dot( - std::size_t n, const T *x, std::size_t incx, const T *y, 
std::size_t incy) -{ - T sum = 0; - std::size_t j = 0; - std::size_t k = 0; - for (std::size_t i = 0; i != n; ++i, j += incx, k += incy) - sum += x[j] * y[k]; - - return sum; -} - -/// \brief Computes the Euclidean norm of a vector -template -inline T nrm2(std::size_t n, const T *x, std::size_t incx) -{ - return std::sqrt(dot(n, x, incx, x, incx)); -} - -/// \brief Computes the product of a vector by a scalar -template -inline void scal(std::size_t n, T a, T *x, std::size_t incx) -{ - std::size_t j = 0; - for (std::size_t i = 0; i != n; ++i, j += incx) - x[j] *= a; -} - -/// @} - -/// \defgroup CBLAS2 BLAS level 2 routines -/// \ingroup CBLAS -/// @{ - -/// \brief Computes a matrix-vector product using a general matrix -template -inline void gemv(MatrixOrder order, MatrixTrans trans, std::size_t m, - std::size_t n, T alpha, const T *A, std::size_t lda, const T *x, - std::size_t incx, T beta, T *y, std::size_t incy) -{ - std::size_t nrow = trans == NoTrans ? m : n; - std::size_t ncol = trans == NoTrans ? 
n : m; - - scal(nrow, beta, y, incy); - - if ((order == RowMajor && trans == NoTrans) || - (order == ColMajor && trans == Trans)) { - std::size_t k = 0; - for (std::size_t r = 0; r != nrow; ++r, k += incy) - y[k] += alpha * dot(ncol, x, incx, A + r * lda, 1); - } else { - std::size_t j = 0; - for (std::size_t c = 0; c != ncol; ++c, j += incx) { - std::size_t k = 0; - std::size_t l = c * lda; - const double ax = alpha * x[j]; - for (std::size_t r = 0; r != nrow; ++r, ++l, k += incy) - y[k] += ax * A[l]; - } - } -} - -/// @} - -} // namespace vsmc - -#ifdef VSMC_CBLAS_INT - -namespace vsmc -{ - -inline float asum(std::size_t n, const float *x, std::size_t incx) -{ - return ::cblas_sasum( - static_cast(n), x, static_cast(incx)); -} - -inline double asum(std::size_t n, const double *x, std::size_t incx) -{ - return ::cblas_dasum( - static_cast(n), x, static_cast(incx)); -} - -inline void axpy(std::size_t n, float a, const float *x, std::size_t incx, - float *y, std::size_t incy) -{ - ::cblas_saxpy(static_cast(n), a, x, - static_cast(incx), y, - static_cast(incy)); -} - -inline void axpy(std::size_t n, double a, const double *x, std::size_t incx, - double *y, std::size_t incy) -{ - ::cblas_daxpy(static_cast(n), a, x, - static_cast(incx), y, - static_cast(incy)); -} - -inline void copy(std::size_t n, const float *x, std::size_t incx, float *y, - std::size_t incy) -{ - ::cblas_scopy(static_cast(n), x, - static_cast(incx), y, - static_cast(incy)); -} - -inline void copy(std::size_t n, const double *x, std::size_t incx, double *y, - std::size_t incy) -{ - ::cblas_dcopy(static_cast(n), x, - static_cast(incx), y, - static_cast(incy)); -} - -inline float dot(std::size_t n, const float *x, std::size_t incx, - const float *y, std::size_t incy) -{ - return ::cblas_sdot(static_cast(n), x, - static_cast(incx), y, - static_cast(incy)); -} - -inline double dot(std::size_t n, const double *x, std::size_t incx, - const double *y, std::size_t incy) -{ - return 
::cblas_ddot(static_cast(n), x, - static_cast(incx), y, - static_cast(incy)); -} - -inline float nrm2(std::size_t n, const float *x, std::size_t incx) -{ - return ::cblas_snrm2( - static_cast(n), x, static_cast(incx)); -} - -inline double nrm2(std::size_t n, const double *x, std::size_t incx) -{ - return ::cblas_dnrm2( - static_cast(n), x, static_cast(incx)); -} - -inline void scal(std::size_t n, float a, float *x, std::size_t incx) -{ - ::cblas_sscal(static_cast(n), a, x, - static_cast(incx)); -} - -inline void scal(std::size_t n, double a, double *x, std::size_t incx) -{ - ::cblas_dscal(static_cast(n), a, x, - static_cast(incx)); -} - -inline void gemv(MatrixOrder order, MatrixTrans trans, std::size_t m, - std::size_t n, float alpha, const float *A, std::size_t lda, - const float *x, std::size_t incx, float beta, float *y, std::size_t incy) -{ - ::cblas_sgemv((order == RowMajor ? ::CblasRowMajor : ::CblasColMajor), - (trans == NoTrans ? ::CblasNoTrans : ::CblasTrans), - static_cast(m), static_cast(n), alpha, - A, static_cast(lda), x, - static_cast(incx), beta, y, - static_cast(incy)); -} - -inline void gemv(MatrixOrder order, MatrixTrans trans, std::size_t m, - std::size_t n, double alpha, const double *A, std::size_t lda, - const double *x, std::size_t incx, double beta, double *y, - std::size_t incy) -{ - ::cblas_dgemv((order == RowMajor ? ::CblasRowMajor : ::CblasColMajor), - (trans == NoTrans ? 
::CblasNoTrans : ::CblasTrans), - static_cast(m), static_cast(n), alpha, - A, static_cast(lda), x, - static_cast(incx), beta, y, - static_cast(incy)); -} - -} // namespace vsmc - -#endif // VSMC_CBLAS_INT - -#endif // VSMC_MATH_CBLAS_HPP diff --git a/include/vsmc/math/constants.hpp b/include/vsmc/math/constants.hpp index b0c2a8d2e..6080b3448 100644 --- a/include/vsmc/math/constants.hpp +++ b/include/vsmc/math/constants.hpp @@ -3,7 +3,7 @@ //---------------------------------------------------------------------------- // vSMC: Scalable Monte Carlo //---------------------------------------------------------------------------- -// Copyright (c) 2013-2015, Yan Zhou +// Copyright (c) 2013-2016, Yan Zhou // All rights reserved. // // Redistribution and use in source and binary forms, with or without @@ -34,22 +34,22 @@ #define VSMC_DEFINE_MATH_CONSTANTS(name, val) \ template \ - inline T const_##name() noexcept \ + inline constexpr T const_##name() noexcept \ { \ return static_cast(val##l); \ } \ template <> \ - inline float const_##name() noexcept \ + inline constexpr float const_##name() noexcept \ { \ return val##f; \ } \ template <> \ - inline double const_##name() noexcept \ + inline constexpr double const_##name() noexcept \ { \ return val; \ } \ template <> \ - inline long double const_##name() noexcept \ + inline constexpr long double const_##name() noexcept \ { \ return val##l; \ } diff --git a/include/vsmc/math/lapacke.h b/include/vsmc/math/lapacke.h new file mode 100644 index 000000000..18cd70fc5 --- /dev/null +++ b/include/vsmc/math/lapacke.h @@ -0,0 +1,43 @@ +//============================================================================ +// vSMC/include/vsmc/math/lapacke.h +//---------------------------------------------------------------------------- +// vSMC: Scalable Monte Carlo +//---------------------------------------------------------------------------- +// Copyright (c) 2013-2016, Yan Zhou +// All rights reserved. 
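The `constants.hpp` hunk above upgrades every generated constant function from `inline` to `inline constexpr`. Expanded for a single constant, the `VSMC_DEFINE_MATH_CONSTANTS` macro produces roughly the following (using pi as an illustrative value; the `f`/no-suffix/`l` literal suffixes come from the `val##f`, `val`, `val##l` tokens in the macro):

```cpp
// Generic cast from the long double literal, plus exact-type
// specializations so each floating type gets its own literal.
template <typename T>
inline constexpr T const_pi() noexcept
{
    return static_cast<T>(3.141592653589793238462643383279502884197l);
}

template <>
inline constexpr float const_pi<float>() noexcept
{
    return 3.141592653589793238462643383279502884197f;
}

template <>
inline constexpr double const_pi<double>() noexcept
{
    return 3.141592653589793238462643383279502884197;
}

template <>
inline constexpr long double const_pi<long double>() noexcept
{
    return 3.141592653589793238462643383279502884197l;
}
```

With `constexpr`, the constants can now appear in constant expressions such as array bounds or `static_assert` conditions, not just at run time.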
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS AS IS +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. 
+//============================================================================ + +#ifndef VSMC_MATH_LAPACKE_H +#define VSMC_MATH_LAPACKE_H + +#include + +#if VSMC_USE_MKL_LAPACKE +#include +#else +#include +#endif + +#endif // VSMC_MATH_LAPACKE_H diff --git a/include/vsmc/math/math.hpp b/include/vsmc/math/math.hpp index 710845132..85b802e98 100644 --- a/include/vsmc/math/math.hpp +++ b/include/vsmc/math/math.hpp @@ -3,7 +3,7 @@ //---------------------------------------------------------------------------- // vSMC: Scalable Monte Carlo //---------------------------------------------------------------------------- -// Copyright (c) 2013-2015, Yan Zhou +// Copyright (c) 2013-2016, Yan Zhou // All rights reserved. // // Redistribution and use in source and binary forms, with or without @@ -33,8 +33,9 @@ #define VSMC_MATH_MATH_HPP #include -#include +#include #include +#include #include #endif // VSMC_MATH_MATH_HPP diff --git a/include/vsmc/math/vmath.hpp b/include/vsmc/math/vmath.hpp index 0e08d4df4..f4cea2a1f 100644 --- a/include/vsmc/math/vmath.hpp +++ b/include/vsmc/math/vmath.hpp @@ -3,7 +3,7 @@ //---------------------------------------------------------------------------- // vSMC: Scalable Monte Carlo //---------------------------------------------------------------------------- -// Copyright (c) 2013-2015, Yan Zhou +// Copyright (c) 2013-2016, Yan Zhou // All rights reserved. 
// // Redistribution and use in source and binary forms, with or without @@ -236,7 +236,7 @@ template inline void linear_frac(std::size_t n, const T *a, const T *b, T beta_a, T beta_b, T mu_a, T mu_b, T *y) { - const std::size_t k = 1000; + const std::size_t k = 1024; const std::size_t m = n / k; const std::size_t l = n % k; for (std::size_t i = 0; i != m; ++i, a += k, y += k) { @@ -344,7 +344,7 @@ VSMC_DEFINE_MATH_VMATH_1(std::sqrt, sqrt) template inline void invsqrt(std::size_t n, const T *a, T *y) { - const std::size_t k = 1000; + const std::size_t k = 1024; const std::size_t m = n / k; const std::size_t l = n % k; for (std::size_t i = 0; i != m; ++i, a += k, y += k) { @@ -362,7 +362,7 @@ VSMC_DEFINE_MATH_VMATH_1(std::cbrt, cbrt) template inline void invcbrt(std::size_t n, const T *a, T *y) { - const std::size_t k = 1000; + const std::size_t k = 1024; const std::size_t m = n / k; const std::size_t l = n % k; for (std::size_t i = 0; i != m; ++i, a += k, y += k) { @@ -377,7 +377,7 @@ inline void invcbrt(std::size_t n, const T *a, T *y) template inline void pow2o3(std::size_t n, const T *a, T *y) { - const std::size_t k = 1000; + const std::size_t k = 1024; const std::size_t m = n / k; const std::size_t l = n % k; for (std::size_t i = 0; i != m; ++i, a += k, y += k) { @@ -392,7 +392,7 @@ inline void pow2o3(std::size_t n, const T *a, T *y) template inline void pow3o2(std::size_t n, const T *a, T *y) { - const std::size_t k = 1000; + const std::size_t k = 1024; const std::size_t m = n / k; const std::size_t l = n % k; for (std::size_t i = 0; i != m; ++i, a += k, y += k) { @@ -435,7 +435,7 @@ VSMC_DEFINE_MATH_VMATH_1(std::exp2, exp2) template inline void exp10(std::size_t n, const T *a, T *y) { - const std::size_t k = 1000; + const std::size_t k = 1024; const std::size_t m = n / k; const std::size_t l = n % k; for (std::size_t i = 0; i != m; ++i, a += k, y += k) { @@ -477,10 +477,10 @@ VSMC_DEFINE_MATH_VMATH_1(std::sin, sin) template inline void sincos(std::size_t n, 
const T *a, T *y, T *z) { - const std::size_t k = 1000; + const std::size_t k = 1024; const std::size_t m = n / k; const std::size_t l = n % k; - for (std::size_t i = 0; i != m; ++i, a += k, y += k) { + for (std::size_t i = 0; i != m; ++i, a += k, y += k, z += k) { sin(k, a, y); cos(k, a, z); } @@ -546,7 +546,7 @@ VSMC_DEFINE_MATH_VMATH_1(std::erfc, erfc) template inline void cdfnorm(std::size_t n, const T *a, T *y) { - const std::size_t k = 1000; + const std::size_t k = 1024; const std::size_t m = n / k; const std::size_t l = n % k; for (std::size_t i = 0; i != m; ++i, a += k, y += k) { diff --git a/include/vsmc/resample/index.hpp b/include/vsmc/resample/index.hpp index 2a423e94b..2cbea0a4b 100644 --- a/include/vsmc/resample/index.hpp +++ b/include/vsmc/resample/index.hpp @@ -3,7 +3,7 @@ //---------------------------------------------------------------------------- // vSMC: Scalable Monte Carlo //---------------------------------------------------------------------------- -// Copyright (c) 2013-2015, Yan Zhou +// Copyright (c) 2013-2016, Yan Zhou // All rights reserved. 
// // Redistribution and use in source and binary forms, with or without @@ -133,17 +133,17 @@ class ResampleIndex return idx; } - template + template Vector index_matrix() const { return index_matrix_dispatch( - std::integral_constant()); + std::integral_constant()); } - template + template void read_index_matrix(OutputIter first) const { - Vector idxmat(index_matrix()); + Vector idxmat(index_matrix()); std::copy(idxmat.begin(), idxmat.end(), first); } @@ -154,7 +154,7 @@ class ResampleIndex Vector> index_; Vector index_matrix_dispatch( - std::integral_constant) const + std::integral_constant) const { Vector idxmat(size_ * iter_size_); if (size_ * iter_size_ == 0) @@ -181,7 +181,7 @@ class ResampleIndex } Vector index_matrix_dispatch( - std::integral_constant) const + std::integral_constant) const { Vector idxmat(size_ * iter_size_); if (size_ * iter_size_ == 0) diff --git a/include/vsmc/resample/internal/common.hpp b/include/vsmc/resample/internal/common.hpp index cc59519b7..732bfb91b 100644 --- a/include/vsmc/resample/internal/common.hpp +++ b/include/vsmc/resample/internal/common.hpp @@ -3,7 +3,7 @@ //---------------------------------------------------------------------------- // vSMC: Scalable Monte Carlo //---------------------------------------------------------------------------- -// Copyright (c) 2013-2015, Yan Zhou +// Copyright (c) 2013-2016, Yan Zhou // All rights reserved. 
// // Redistribution and use in source and binary forms, with or without @@ -36,20 +36,9 @@ #include #include -/// \brief Default RNG type for resampling -/// \ingroup Config -#ifndef VSMC_RESAMPLE_RNG_TYPE -#define VSMC_RESAMPLE_RNG_TYPE VSMC_RNG_TYPE -#endif - namespace vsmc { -/// \brief Particle::resample_rng_type trait -/// \ingroup Traits -VSMC_DEFINE_TYPE_DISPATCH_TRAIT( - ResampleRNGType, resample_rng_type, VSMC_RESAMPLE_RNG_TYPE) - /// \brief Type trait of ResampleScheme parameter /// \ingroup Resample template diff --git a/include/vsmc/resample/multinomial.hpp b/include/vsmc/resample/multinomial.hpp index 7d3f9e506..b41c509ee 100644 --- a/include/vsmc/resample/multinomial.hpp +++ b/include/vsmc/resample/multinomial.hpp @@ -3,7 +3,7 @@ //---------------------------------------------------------------------------- // vSMC: Scalable Monte Carlo //---------------------------------------------------------------------------- -// Copyright (c) 2013-2015, Yan Zhou +// Copyright (c) 2013-2016, Yan Zhou // All rights reserved. // // Redistribution and use in source and binary forms, with or without diff --git a/include/vsmc/resample/resample.hpp b/include/vsmc/resample/resample.hpp index 13f5fd97d..3cf18796c 100644 --- a/include/vsmc/resample/resample.hpp +++ b/include/vsmc/resample/resample.hpp @@ -3,7 +3,7 @@ //---------------------------------------------------------------------------- // vSMC: Scalable Monte Carlo //---------------------------------------------------------------------------- -// Copyright (c) 2013-2015, Yan Zhou +// Copyright (c) 2013-2016, Yan Zhou // All rights reserved. 
// // Redistribution and use in source and binary forms, with or without diff --git a/include/vsmc/resample/residual.hpp b/include/vsmc/resample/residual.hpp index 0f192e056..f37644d12 100644 --- a/include/vsmc/resample/residual.hpp +++ b/include/vsmc/resample/residual.hpp @@ -3,7 +3,7 @@ //---------------------------------------------------------------------------- // vSMC: Scalable Monte Carlo //---------------------------------------------------------------------------- -// Copyright (c) 2013-2015, Yan Zhou +// Copyright (c) 2013-2016, Yan Zhou // All rights reserved. // // Redistribution and use in source and binary forms, with or without diff --git a/include/vsmc/resample/residual_stratified.hpp b/include/vsmc/resample/residual_stratified.hpp index 0f19ae4a3..ad748b757 100644 --- a/include/vsmc/resample/residual_stratified.hpp +++ b/include/vsmc/resample/residual_stratified.hpp @@ -3,7 +3,7 @@ //---------------------------------------------------------------------------- // vSMC: Scalable Monte Carlo //---------------------------------------------------------------------------- -// Copyright (c) 2013-2015, Yan Zhou +// Copyright (c) 2013-2016, Yan Zhou // All rights reserved. // // Redistribution and use in source and binary forms, with or without diff --git a/include/vsmc/resample/residual_systematic.hpp b/include/vsmc/resample/residual_systematic.hpp index 063b61bd0..f6e43ad58 100644 --- a/include/vsmc/resample/residual_systematic.hpp +++ b/include/vsmc/resample/residual_systematic.hpp @@ -3,7 +3,7 @@ //---------------------------------------------------------------------------- // vSMC: Scalable Monte Carlo //---------------------------------------------------------------------------- -// Copyright (c) 2013-2015, Yan Zhou +// Copyright (c) 2013-2016, Yan Zhou // All rights reserved. 
// // Redistribution and use in source and binary forms, with or without diff --git a/include/vsmc/resample/stratified.hpp b/include/vsmc/resample/stratified.hpp index 155f7104f..6a95674fe 100644 --- a/include/vsmc/resample/stratified.hpp +++ b/include/vsmc/resample/stratified.hpp @@ -3,7 +3,7 @@ //---------------------------------------------------------------------------- // vSMC: Scalable Monte Carlo //---------------------------------------------------------------------------- -// Copyright (c) 2013-2015, Yan Zhou +// Copyright (c) 2013-2016, Yan Zhou // All rights reserved. // // Redistribution and use in source and binary forms, with or without diff --git a/include/vsmc/resample/systematic.hpp b/include/vsmc/resample/systematic.hpp index 5f0bcd233..b83a335d8 100644 --- a/include/vsmc/resample/systematic.hpp +++ b/include/vsmc/resample/systematic.hpp @@ -3,7 +3,7 @@ //---------------------------------------------------------------------------- // vSMC: Scalable Monte Carlo //---------------------------------------------------------------------------- -// Copyright (c) 2013-2015, Yan Zhou +// Copyright (c) 2013-2016, Yan Zhou // All rights reserved. // // Redistribution and use in source and binary forms, with or without diff --git a/include/vsmc/resample/transform.hpp b/include/vsmc/resample/transform.hpp index 7723ca71b..c1279f4da 100644 --- a/include/vsmc/resample/transform.hpp +++ b/include/vsmc/resample/transform.hpp @@ -3,7 +3,7 @@ //---------------------------------------------------------------------------- // vSMC: Scalable Monte Carlo //---------------------------------------------------------------------------- -// Copyright (c) 2013-2015, Yan Zhou +// Copyright (c) 2013-2016, Yan Zhou // All rights reserved. 
// // Redistribution and use in source and binary forms, with or without @@ -74,12 +74,12 @@ inline void resample_trans_u01_rep(std::size_t M, std::size_t N, /// \ingroup Resample template <typename IntType, typename U01SeqType> inline void resample_trans_u01_index(std::size_t M, std::size_t N, - const double *weight, U01SeqType &&u01seq, IntType *src_idx) + const double *weight, U01SeqType &&u01seq, IntType *index) { if (M == 0 || N == 0) return; - std::memset(src_idx, 0, sizeof(IntType) * N); + std::memset(index, 0, sizeof(IntType) * N); if (M == 1) return; @@ -88,17 +88,17 @@ inline void resample_trans_u01_index(std::size_t M, std::size_t N, for (std::size_t i = 0; i != M - 1; ++i) { accw += weight[i]; while (j != N && u01seq[j] <= accw) - src_idx[j++] = static_cast<IntType>(i); + index[j++] = static_cast<IntType>(i); } while (j != N) - src_idx[j++] = static_cast<IntType>(M - 1); + index[j++] = static_cast<IntType>(M - 1); } /// \brief Transform replication numbers into parent indices /// \ingroup Resample template <typename IntType1, typename IntType2> -inline void resample_trans_rep_index(std::size_t M, std::size_t N, - const IntType1 *replication, IntType2 *src_idx) +inline void resample_trans_rep_index( + std::size_t M, std::size_t N, const IntType1 *replication, IntType2 *index) { if (M == 0 || N == 0) return; @@ -108,7 +108,7 @@ inline void resample_trans_rep_index(std::size_t M, std::size_t N, for (std::size_t src = 0; src != M; ++src) { const IntType1 rep = replication[src]; for (IntType1 r = 0; r != rep; ++r) - src_idx[dst++] = static_cast<IntType2>(src); + index[dst++] = static_cast<IntType2>(src); } return; } @@ -117,7 +117,7 @@ inline void resample_trans_rep_index(std::size_t M, std::size_t N, std::size_t src = 0; for (std::size_t dst = 0; dst != N; ++dst) { if (replication[dst] != 0) { - src_idx[dst] = static_cast<IntType2>(dst); + index[dst] = static_cast<IntType2>(dst); } else { // replication[dst] has zero child, copy from elsewhere if (replication[src] < time + 2) { @@ -127,7 +127,7 @@ ++src; while (replication[src] < 2); } - 
src_idx[dst] = static_cast<IntType2>(src); + index[dst] = static_cast<IntType2>(src); ++time; } } @@ -136,8 +136,8 @@ /// \brief Transform parent indices into replication numbers /// \ingroup Resample template <typename IntType1, typename IntType2> -inline void resample_trans_index_rep(std::size_t M, std::size_t N, - const IntType1 *src_idx, IntType2 *replication) +inline void resample_trans_index_rep( + std::size_t M, std::size_t N, const IntType1 *index, IntType2 *replication) { if (M == 0 || N == 0) return; @@ -145,7 +145,7 @@ inline void resample_trans_index_rep(std::size_t M, std::size_t N, std::memset(replication, 0, sizeof(IntType2) * M); for (std::size_t i = 0; i != N; ++i) - ++replication[src_idx[i]]; + ++replication[index[i]]; } /// \brief Transform normalized weights to normalized residual and integrals, diff --git a/include/vsmc/rng/aes.hpp b/include/vsmc/rng/aes.hpp index ea71ddd5a..5cd90c51c 100644 --- a/include/vsmc/rng/aes.hpp +++ b/include/vsmc/rng/aes.hpp @@ -3,7 +3,7 @@ //---------------------------------------------------------------------------- // vSMC: Scalable Monte Carlo //---------------------------------------------------------------------------- -// Copyright (c) 2013-2015, Yan Zhou +// Copyright (c) 2013-2016, Yan Zhou // All rights reserved.
// // Redistribution and use in source and binary forms, with or without @@ -35,11 +35,12 @@ #include #include -#define VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(N, val) \ +#define VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(N, val) \ template <> \ - class AESRoundConstant<N> : public std::integral_constant<int, val> \ + inline __m128i AESKeyGenAssist<N>(__m128i xmm) \ { \ - }; // class AESRoundConstant + return _mm_aeskeygenassist_si128(xmm, val); \ + } /// \brief AESEngine default blocks /// \ingroup Config @@ -53,265 +54,265 @@ namespace vsmc { namespace internal { -template <int N> -class AESRoundConstant; - -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0x00, 0x8D) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0x01, 0x01) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0x02, 0x02) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0x03, 0x04) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0x04, 0x08) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0x05, 0x10) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0x06, 0x20) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0x07, 0x40) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0x08, 0x80) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0x09, 0x1B) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0x0A, 0x36) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0x0B, 0x6C) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0x0C, 0xD8) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0x0D, 0xAB) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0x0E, 0x4D) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0x0F, 0x9A) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0x10, 0x2F) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0x11, 0x5E) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0x12, 0xBC) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0x13, 0x63) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0x14, 0xC6) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0x15, 0x97) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0x16, 0x35) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0x17, 0x6A) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0x18, 0xD4) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0x19, 0xB3) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0x1A, 0x7D) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0x1B, 0xFA) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0x1C, 0xEF)
-VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0x1D, 0xC5) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0x1E, 0x91) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0x1F, 0x39) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0x20, 0x72) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0x21, 0xE4) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0x22, 0xD3) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0x23, 0xBD) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0x24, 0x61) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0x25, 0xC2) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0x26, 0x9F) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0x27, 0x25) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0x28, 0x4A) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0x29, 0x94) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0x2A, 0x33) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0x2B, 0x66) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0x2C, 0xCC) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0x2D, 0x83) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0x2E, 0x1D) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0x2F, 0x3A) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0x30, 0x74) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0x31, 0xE8) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0x32, 0xCB) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0x33, 0x8D) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0x34, 0x01) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0x35, 0x02) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0x36, 0x04) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0x37, 0x08) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0x38, 0x10) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0x39, 0x20) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0x3A, 0x40) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0x3B, 0x80) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0x3C, 0x1B) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0x3D, 0x36) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0x3E, 0x6C) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0x3F, 0xD8) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0x40, 0xAB) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0x41, 0x4D) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0x42, 0x9A) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0x43, 0x2F) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0x44, 0x5E) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0x45, 0xBC) 
-VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0x46, 0x63) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0x47, 0xC6) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0x48, 0x97) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0x49, 0x35) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0x4A, 0x6A) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0x4B, 0xD4) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0x4C, 0xB3) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0x4D, 0x7D) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0x4E, 0xFA) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0x4F, 0xEF) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0x50, 0xC5) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0x51, 0x91) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0x52, 0x39) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0x53, 0x72) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0x54, 0xE4) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0x55, 0xD3) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0x56, 0xBD) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0x57, 0x61) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0x58, 0xC2) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0x59, 0x9F) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0x5A, 0x25) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0x5B, 0x4A) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0x5C, 0x94) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0x5D, 0x33) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0x5E, 0x66) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0x5F, 0xCC) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0x60, 0x83) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0x61, 0x1D) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0x62, 0x3A) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0x63, 0x74) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0x64, 0xE8) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0x65, 0xCB) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0x66, 0x8D) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0x67, 0x01) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0x68, 0x02) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0x69, 0x04) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0x6A, 0x08) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0x6B, 0x10) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0x6C, 0x20) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0x6D, 0x40) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0x6E, 0x80) 
-VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0x6F, 0x1B) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0x70, 0x36) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0x71, 0x6C) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0x72, 0xD8) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0x73, 0xAB) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0x74, 0x4D) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0x75, 0x9A) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0x76, 0x2F) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0x77, 0x5E) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0x78, 0xBC) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0x79, 0x63) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0x7A, 0xC6) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0x7B, 0x97) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0x7C, 0x35) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0x7D, 0x6A) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0x7E, 0xD4) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0x7F, 0xB3) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0x80, 0x7D) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0x81, 0xFA) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0x82, 0xEF) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0x83, 0xC5) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0x84, 0x91) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0x85, 0x39) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0x86, 0x72) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0x87, 0xE4) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0x88, 0xD3) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0x89, 0xBD) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0x8A, 0x61) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0x8B, 0xC2) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0x8C, 0x9F) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0x8D, 0x25) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0x8E, 0x4A) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0x8F, 0x94) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0x90, 0x33) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0x91, 0x66) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0x92, 0xCC) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0x93, 0x83) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0x94, 0x1D) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0x95, 0x3A) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0x96, 0x74) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0x97, 0xE8) 
-VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0x98, 0xCB) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0x99, 0x8D) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0x9A, 0x01) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0x9B, 0x02) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0x9C, 0x04) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0x9D, 0x08) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0x9E, 0x10) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0x9F, 0x20) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0xA0, 0x40) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0xA1, 0x80) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0xA2, 0x1B) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0xA3, 0x36) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0xA4, 0x6C) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0xA5, 0xD8) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0xA6, 0xAB) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0xA7, 0x4D) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0xA8, 0x9A) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0xA9, 0x2F) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0xAA, 0x5E) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0xAB, 0xBC) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0xAC, 0x63) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0xAD, 0xC6) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0xAE, 0x97) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0xAF, 0x35) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0xB0, 0x6A) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0xB1, 0xD4) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0xB2, 0xB3) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0xB3, 0x7D) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0xB4, 0xFA) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0xB5, 0xEF) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0xB6, 0xC5) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0xB7, 0x91) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0xB8, 0x39) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0xB9, 0x72) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0xBA, 0xE4) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0xBB, 0xD3) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0xBC, 0xBD) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0xBD, 0x61) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0xBE, 0xC2) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0xBF, 0x9F) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0xC0, 0x25) 
-VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0xC1, 0x4A) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0xC2, 0x94) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0xC3, 0x33) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0xC4, 0x66) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0xC5, 0xCC) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0xC6, 0x83) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0xC7, 0x1D) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0xC8, 0x3A) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0xC9, 0x74) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0xCA, 0xE8) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0xCB, 0xCB) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0xCC, 0x8D) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0xCD, 0x01) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0xCE, 0x02) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0xCF, 0x04) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0xD0, 0x08) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0xD1, 0x10) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0xD2, 0x20) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0xD3, 0x40) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0xD4, 0x80) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0xD5, 0x1B) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0xD6, 0x36) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0xD7, 0x6C) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0xD8, 0xD8) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0xD9, 0xAB) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0xDA, 0x4D) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0xDB, 0x9A) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0xDC, 0x2F) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0xDD, 0x5E) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0xDE, 0xBC) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0xDF, 0x63) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0xE0, 0xC6) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0xE1, 0x97) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0xE2, 0x35) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0xE3, 0x6A) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0xE4, 0xD4) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0xE5, 0xB3) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0xE6, 0x7D) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0xE7, 0xFA) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0xE8, 0xEF) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0xE9, 0xC5) 
-VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0xEA, 0x91) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0xEB, 0x39) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0xEC, 0x72) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0xED, 0xE4) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0xEE, 0xD3) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0xEF, 0xBD) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0xF0, 0x61) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0xF1, 0xC2) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0xF2, 0x9F) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0xF3, 0x25) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0xF4, 0x4A) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0xF5, 0x94) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0xF6, 0x33) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0xF7, 0x66) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0xF8, 0xCC) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0xF9, 0x83) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0xFA, 0x1D) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0xFB, 0x3A) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0xFC, 0x74) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0xFD, 0xE8) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0xFE, 0xCB) -VSMC_DEFINE_RNG_AES_ROUND_CONSTANT(0xFF, 0x8D) +template <int> +inline __m128i AESKeyGenAssist(__m128i); + +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0x00, 0x8D) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0x01, 0x01) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0x02, 0x02) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0x03, 0x04) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0x04, 0x08) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0x05, 0x10) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0x06, 0x20) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0x07, 0x40) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0x08, 0x80) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0x09, 0x1B) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0x0A, 0x36) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0x0B, 0x6C) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0x0C, 0xD8) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0x0D, 0xAB) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0x0E, 0x4D) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0x0F, 0x9A) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0x10, 0x2F) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0x11, 0x5E) 
+VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0x12, 0xBC) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0x13, 0x63) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0x14, 0xC6) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0x15, 0x97) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0x16, 0x35) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0x17, 0x6A) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0x18, 0xD4) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0x19, 0xB3) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0x1A, 0x7D) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0x1B, 0xFA) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0x1C, 0xEF) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0x1D, 0xC5) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0x1E, 0x91) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0x1F, 0x39) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0x20, 0x72) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0x21, 0xE4) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0x22, 0xD3) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0x23, 0xBD) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0x24, 0x61) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0x25, 0xC2) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0x26, 0x9F) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0x27, 0x25) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0x28, 0x4A) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0x29, 0x94) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0x2A, 0x33) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0x2B, 0x66) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0x2C, 0xCC) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0x2D, 0x83) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0x2E, 0x1D) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0x2F, 0x3A) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0x30, 0x74) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0x31, 0xE8) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0x32, 0xCB) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0x33, 0x8D) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0x34, 0x01) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0x35, 0x02) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0x36, 0x04) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0x37, 0x08) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0x38, 0x10) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0x39, 0x20) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0x3A, 0x40) 
+VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0x3B, 0x80) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0x3C, 0x1B) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0x3D, 0x36) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0x3E, 0x6C) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0x3F, 0xD8) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0x40, 0xAB) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0x41, 0x4D) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0x42, 0x9A) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0x43, 0x2F) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0x44, 0x5E) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0x45, 0xBC) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0x46, 0x63) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0x47, 0xC6) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0x48, 0x97) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0x49, 0x35) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0x4A, 0x6A) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0x4B, 0xD4) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0x4C, 0xB3) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0x4D, 0x7D) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0x4E, 0xFA) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0x4F, 0xEF) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0x50, 0xC5) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0x51, 0x91) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0x52, 0x39) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0x53, 0x72) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0x54, 0xE4) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0x55, 0xD3) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0x56, 0xBD) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0x57, 0x61) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0x58, 0xC2) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0x59, 0x9F) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0x5A, 0x25) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0x5B, 0x4A) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0x5C, 0x94) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0x5D, 0x33) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0x5E, 0x66) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0x5F, 0xCC) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0x60, 0x83) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0x61, 0x1D) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0x62, 0x3A) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0x63, 0x74) 
+VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0x64, 0xE8) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0x65, 0xCB) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0x66, 0x8D) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0x67, 0x01) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0x68, 0x02) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0x69, 0x04) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0x6A, 0x08) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0x6B, 0x10) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0x6C, 0x20) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0x6D, 0x40) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0x6E, 0x80) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0x6F, 0x1B) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0x70, 0x36) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0x71, 0x6C) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0x72, 0xD8) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0x73, 0xAB) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0x74, 0x4D) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0x75, 0x9A) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0x76, 0x2F) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0x77, 0x5E) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0x78, 0xBC) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0x79, 0x63) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0x7A, 0xC6) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0x7B, 0x97) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0x7C, 0x35) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0x7D, 0x6A) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0x7E, 0xD4) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0x7F, 0xB3) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0x80, 0x7D) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0x81, 0xFA) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0x82, 0xEF) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0x83, 0xC5) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0x84, 0x91) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0x85, 0x39) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0x86, 0x72) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0x87, 0xE4) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0x88, 0xD3) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0x89, 0xBD) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0x8A, 0x61) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0x8B, 0xC2) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0x8C, 0x9F) 
+VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0x8D, 0x25) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0x8E, 0x4A) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0x8F, 0x94) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0x90, 0x33) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0x91, 0x66) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0x92, 0xCC) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0x93, 0x83) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0x94, 0x1D) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0x95, 0x3A) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0x96, 0x74) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0x97, 0xE8) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0x98, 0xCB) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0x99, 0x8D) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0x9A, 0x01) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0x9B, 0x02) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0x9C, 0x04) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0x9D, 0x08) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0x9E, 0x10) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0x9F, 0x20) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0xA0, 0x40) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0xA1, 0x80) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0xA2, 0x1B) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0xA3, 0x36) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0xA4, 0x6C) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0xA5, 0xD8) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0xA6, 0xAB) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0xA7, 0x4D) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0xA8, 0x9A) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0xA9, 0x2F) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0xAA, 0x5E) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0xAB, 0xBC) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0xAC, 0x63) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0xAD, 0xC6) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0xAE, 0x97) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0xAF, 0x35) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0xB0, 0x6A) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0xB1, 0xD4) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0xB2, 0xB3) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0xB3, 0x7D) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0xB4, 0xFA) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0xB5, 0xEF) 
+VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0xB6, 0xC5) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0xB7, 0x91) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0xB8, 0x39) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0xB9, 0x72) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0xBA, 0xE4) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0xBB, 0xD3) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0xBC, 0xBD) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0xBD, 0x61) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0xBE, 0xC2) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0xBF, 0x9F) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0xC0, 0x25) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0xC1, 0x4A) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0xC2, 0x94) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0xC3, 0x33) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0xC4, 0x66) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0xC5, 0xCC) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0xC6, 0x83) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0xC7, 0x1D) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0xC8, 0x3A) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0xC9, 0x74) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0xCA, 0xE8) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0xCB, 0xCB) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0xCC, 0x8D) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0xCD, 0x01) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0xCE, 0x02) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0xCF, 0x04) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0xD0, 0x08) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0xD1, 0x10) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0xD2, 0x20) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0xD3, 0x40) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0xD4, 0x80) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0xD5, 0x1B) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0xD6, 0x36) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0xD7, 0x6C) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0xD8, 0xD8) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0xD9, 0xAB) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0xDA, 0x4D) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0xDB, 0x9A) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0xDC, 0x2F) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0xDD, 0x5E) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0xDE, 0xBC) 
+VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0xDF, 0x63) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0xE0, 0xC6) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0xE1, 0x97) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0xE2, 0x35) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0xE3, 0x6A) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0xE4, 0xD4) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0xE5, 0xB3) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0xE6, 0x7D) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0xE7, 0xFA) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0xE8, 0xEF) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0xE9, 0xC5) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0xEA, 0x91) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0xEB, 0x39) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0xEC, 0x72) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0xED, 0xE4) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0xEE, 0xD3) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0xEF, 0xBD) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0xF0, 0x61) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0xF1, 0xC2) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0xF2, 0x9F) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0xF3, 0x25) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0xF4, 0x4A) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0xF5, 0x94) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0xF6, 0x33) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0xF7, 0x66) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0xF8, 0xCC) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0xF9, 0x83) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0xFA, 0x1D) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0xFB, 0x3A) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0xFC, 0x74) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0xFD, 0xE8) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0xFE, 0xCB) +VSMC_DEFINE_RNG_AES_KEY_GEN_ASSIST(0xFF, 0x8D) class AESKeyInit { @@ -319,7 +320,7 @@ class AESKeyInit template static void eval(const std::array &key, - std::array<__m128i, Rp1> &ks, __m128i &xmm) + std::array<__m128i, Rp1> &ks, __m128i &xmm) { init(key, ks, xmm, std::integral_constant < bool, N()); } @@ -328,14 +329,14 @@ class AESKeyInit template static void init(const std::array &, - std::array<__m128i, Rp1> &, __m128i &, std::false_type) + 
std::array<__m128i, Rp1> &, __m128i &, std::false_type) { } template static void init(const std::array &key, - std::array<__m128i, Rp1> &ks, __m128i &xmm, std::true_type) + std::array<__m128i, Rp1> &ks, __m128i &xmm, std::true_type) { M128I<> tmp; tmp.load(key.data()); @@ -349,7 +350,7 @@ class AESKeySeq public: using key_type = typename KeySeqGenerator::key_type; - void reset(const key_type &key) + void reset(const key_type &key) { KeySeqGenerator generator; generator(key, key_seq_); @@ -357,7 +358,7 @@ class AESKeySeq template void operator()( - const key_type &, std::array, Rounds + 1> &rk) const + const key_type &, std::array, Rounds + 1> &rk) const { rk = key_seq_; } @@ -373,7 +374,8 @@ class AES128KeySeqGenerator using key_type = std::array; template - void operator()(const key_type &key, std::array, Rp1> &key_seq) + void operator()( + const key_type &key, std::array, Rp1> &key_seq) { std::array<__m128i, Rp1> ks; AESKeyInit::eval<0, 0>(key, ks, xmm1_); @@ -387,20 +389,20 @@ class AES128KeySeqGenerator __m128i xmm3_; template - void generate_seq(std::array<__m128i, Rp1> &, std::false_type) + void generate_seq(std::array<__m128i, Rp1> &, std::false_type) { } template - void generate_seq(std::array<__m128i, Rp1> &ks, std::true_type) + void generate_seq(std::array<__m128i, Rp1> &ks, std::true_type) { - xmm2_ = _mm_aeskeygenassist_si128(xmm1_, AESRoundConstant::value); + xmm2_ = AESKeyGenAssist(xmm1_); expand_key(); std::get(ks) = xmm1_; generate_seq(ks, std::integral_constant()); } - void expand_key() + void expand_key() { xmm2_ = _mm_shuffle_epi32(xmm2_, 0xFF); // pshufd xmm2, xmm2, 0xFF xmm3_ = _mm_slli_si128(xmm1_, 0x04); // pshufb xmm3, xmm5 @@ -420,7 +422,8 @@ class AES192KeySeqGenerator using key_type = std::array; template - void operator()(const key_type &key, std::array, Rp1> &key_seq) + void operator()( + const key_type &key, std::array, Rp1> &key_seq) { std::array<__m128i, Rp1> ks; @@ -454,12 +457,12 @@ class AES192KeySeqGenerator __m128i xmm7_; 
template - void generate_seq(unsigned char *, std::false_type) + void generate_seq(unsigned char *, std::false_type) { } template - void generate_seq(unsigned char *ks_ptr, std::true_type) + void generate_seq(unsigned char *ks_ptr, std::true_type) { generate_key(ks_ptr); complete_key( @@ -469,23 +472,23 @@ class AES192KeySeqGenerator } template - void generate_key(unsigned char *ks_ptr) + void generate_key(unsigned char *ks_ptr) { // In entry, N * 24 < Rp1 * 16 // Required Storage: N * 24 + 16; - xmm2_ = _mm_aeskeygenassist_si128(xmm4_, AESRoundConstant::value); + xmm2_ = AESKeyGenAssist(xmm4_); generate_key_expansion(); _mm_storeu_si128(reinterpret_cast<__m128i *>(ks_ptr + N * 24), xmm1_); } template - void complete_key(unsigned char *, std::false_type) + void complete_key(unsigned char *, std::false_type) { } template - void complete_key(unsigned char *ks_ptr, std::true_type) + void complete_key(unsigned char *ks_ptr, std::true_type) { // In entry, N * 24 + 16 < Rp1 * 16 // Required storage: N * 24 + 32 @@ -495,7 +498,7 @@ class AES192KeySeqGenerator reinterpret_cast<__m128i *>(ks_ptr + N * 24 + 16), xmm7_); } - void generate_key_expansion() + void generate_key_expansion() { xmm2_ = _mm_shuffle_epi32(xmm2_, 0xFF); // pshufd xmm2, xmm2, 0xFF xmm3_ = _mm_castps_si128(_mm_shuffle_ps( // shufps xmm3, xmm1, 0x10 @@ -507,7 +510,7 @@ class AES192KeySeqGenerator xmm1_ = _mm_xor_si128(xmm1_, xmm2_); // pxor xmm1, xmm2 } - void complete_key_expansion() + void complete_key_expansion() { xmm5_ = _mm_load_si128(&xmm4_); // movdqa xmm5, xmm4 xmm5_ = _mm_slli_si128(xmm5_, 0x04); // pslldq xmm5, 0x04 @@ -519,14 +522,14 @@ class AES192KeySeqGenerator } template - void copy_key( - std::array<__m128i, Rp1> &, const unsigned char *, std::false_type) + void copy_key(std::array<__m128i, Rp1> &, const unsigned char *, + std::false_type) { } template void copy_key(std::array<__m128i, Rp1> &ks, const unsigned char *ks_ptr, - std::true_type) + std::true_type) { unsigned char *dst = 
reinterpret_cast(ks.data()); std::memcpy(dst + 24, ks_ptr + 24, Rp1 * 16 - 24); @@ -540,7 +543,8 @@ class AES256KeySeqGenerator using key_type = std::array; template - void operator()(const key_type &key, std::array, Rp1> &key_seq) + void operator()( + const key_type &key, std::array, Rp1> &key_seq) { std::array<__m128i, Rp1> ks; AESKeyInit::eval<0, 0>(key, ks, xmm1_); @@ -556,35 +560,34 @@ class AES256KeySeqGenerator __m128i xmm4_; template - void generate_seq(std::array<__m128i, Rp1> &, std::false_type) + void generate_seq(std::array<__m128i, Rp1> &, std::false_type) { } template - void generate_seq(std::array<__m128i, Rp1> &ks, std::true_type) + void generate_seq(std::array<__m128i, Rp1> &ks, std::true_type) { generate_key(ks, std::integral_constant()); generate_seq(ks, std::integral_constant()); } template - void generate_key(std::array<__m128i, Rp1> &ks, std::true_type) + void generate_key(std::array<__m128i, Rp1> &ks, std::true_type) { - xmm2_ = - _mm_aeskeygenassist_si128(xmm3_, AESRoundConstant::value); + xmm2_ = AESKeyGenAssist(xmm3_); expand_key(std::true_type()); std::get(ks) = xmm1_; } template - void generate_key(std::array<__m128i, Rp1> &ks, std::false_type) + void generate_key(std::array<__m128i, Rp1> &ks, std::false_type) { xmm4_ = _mm_aeskeygenassist_si128(xmm1_, 0); expand_key(std::false_type()); std::get(ks) = xmm3_; } - void expand_key(std::true_type) + void expand_key(std::true_type) { xmm2_ = _mm_shuffle_epi32(xmm2_, 0xFF); // pshufd xmm2, xmm2, 0xFF xmm4_ = _mm_slli_si128(xmm1_, 0x04); // pshufb xmm4, xmm5 @@ -596,7 +599,7 @@ class AES256KeySeqGenerator xmm1_ = _mm_xor_si128(xmm1_, xmm2_); // pxor xmm1, xmm2 } - void expand_key(std::false_type) + void expand_key(std::false_type) { xmm2_ = _mm_shuffle_epi32(xmm4_, 0xAA); // pshufd xmm2, xmm4, 0xAA xmm4_ = _mm_slli_si128(xmm3_, 0x04); // pshufb xmm4, xmm5 diff --git a/include/vsmc/rng/aes_ni.hpp b/include/vsmc/rng/aes_ni.hpp index 2bdd13e9d..09e5af4e2 100644 --- a/include/vsmc/rng/aes_ni.hpp 
+++ b/include/vsmc/rng/aes_ni.hpp @@ -3,7 +3,7 @@ //---------------------------------------------------------------------------- // vSMC: Scalable Monte Carlo //---------------------------------------------------------------------------- -// Copyright (c) 2013-2015, Yan Zhou +// Copyright (c) 2013-2016, Yan Zhou // All rights reserved. // // Redistribution and use in source and binary forms, with or without @@ -36,18 +36,6 @@ #include #include -#define VSMC_STATIC_ASSERT_RNG_AES_NI_BLOCKS(Blocks) \ - VSMC_STATIC_ASSERT( \ - (Blocks > 0), "**AESNIGenerator** USED WITH ZERO BLOCKS") - -#define VSMC_STATIC_ASSERT_RNG_AES_NI_RESULT_TYPE(ResultType) \ - VSMC_STATIC_ASSERT((std::is_unsigned::value), \ - "**AESNIGenerator USED WITH ResultType NOT AN UNSIGNED INTEGER") - -#define VSMC_STATIC_ASSERT_RNG_AES_NI \ - VSMC_STATIC_ASSERT_RNG_AES_NI_BLOCKS(Blocks); \ - VSMC_STATIC_ASSERT_RNG_AES_NI_RESULT_TYPE(ResultType); - namespace vsmc { @@ -57,13 +45,18 @@ template class AESNIGenerator { + static_assert(std::is_unsigned::value, + "**AESNIGenerator** USED WITH ResultType OTHER THAN UNSIGNED INTEGER " + "TYPES"); + + static_assert( + Blocks != 0, "**AESNIGenerator** USED WITH Blocks EQUAL TO ZERO"); + public: using result_type = ResultType; using ctr_type = std::array::size()>; using key_type = typename KeySeqType::key_type; - AESNIGenerator() { VSMC_STATIC_ASSERT_RNG_AES_NI; } - static constexpr std::size_t size() { return Blocks * M128I::size(); @@ -72,12 +65,12 @@ class AESNIGenerator void reset(const key_type &key) { key_seq_.reset(key); } void operator()(ctr_type &ctr, const key_type &key, - std::array::size()> &buffer) + std::array &buffer) const { union { std::array, Blocks> state; std::array ctr_block; - std::array result; + std::array result; } buf; std::array, Rounds + 1> rk; @@ -90,26 +83,28 @@ class AESNIGenerator buffer = buf.result; } - std::size_t operator()(ctr_type &ctr, const key_type &key, std::size_t n, - result_type *r) const + void operator()(ctr_type 
&ctr, const key_type &key, std::size_t n, + std::array *buffer) const { - const std::size_t K = 8; - const std::size_t M = K * M128I::size(); - const std::size_t m = n / M; + if (n == 0) + return; + + union { + std::array, Blocks> state; + std::array ctr_block; + std::array result; + } buf; + std::array, Rounds + 1> rk; key_seq_(key, rk); - increment(ctr, m * K, reinterpret_cast(r)); - std::array, K> *s = - reinterpret_cast, K> *>(r); - for (std::size_t i = 0; i != m; ++i) { - enc_first(s[i], rk); - enc_round<1>(s[i], rk, std::integral_constant()); - enc_last(s[i], rk); + for (std::size_t i = 0; i != n; ++i) { + increment(ctr, buf.ctr_block); + enc_first(buf.state, rk); + enc_round<1>( + buf.state, rk, std::integral_constant()); + enc_last(buf.state, rk); + buffer[i] = buf.result; } - n -= m * M; - r += m * M; - - return m * M; } private: @@ -160,6 +155,7 @@ class AESNIGenerator template void enc_round_block(std::array, K> &state, const std::array, Rounds + 1> &rk, std::true_type) const + { std::get(state) = _mm_aesenc_si128( std::get(state).value(), std::get(rk).value()); diff --git a/include/vsmc/rng/ars.hpp b/include/vsmc/rng/ars.hpp index f3dc44b2b..98764e82f 100644 --- a/include/vsmc/rng/ars.hpp +++ b/include/vsmc/rng/ars.hpp @@ -3,7 +3,7 @@ //---------------------------------------------------------------------------- // vSMC: Scalable Monte Carlo //---------------------------------------------------------------------------- -// Copyright (c) 2013-2015, Yan Zhou +// Copyright (c) 2013-2016, Yan Zhou // All rights reserved. 
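The rewritten bulk `AESNIGenerator::operator()` above advances the counter once per output block and encrypts each counter value with the expanded key. A minimal sketch of that counter-mode pattern, with a toy mixing step (SplitMix64's finalizer) standing in for the AES rounds — `toy_encrypt` and `generate` are illustrative names, not vSMC API:

```cpp
#include <cassert>
#include <cstddef>
#include <cstdint>

// Toy stand-in for one block encryption of (counter, key). It is a
// bijection of the counter for a fixed key, which is all the sketch
// needs; vSMC uses real AES-NI rounds here.
inline std::uint64_t toy_encrypt(std::uint64_t ctr, std::uint64_t key)
{
    std::uint64_t z = ctr + key + 0x9E3779B97F4A7C15ULL;
    z = (z ^ (z >> 30)) * 0xBF58476D1CE4E5B9ULL;
    z = (z ^ (z >> 27)) * 0x94D049BB133111EBULL;
    return z ^ (z >> 31);
}

// Counter-mode bulk generation: increment the counter once per output
// block, encrypt, store -- mirroring the loop in the diff above.
inline void generate(
    std::uint64_t &ctr, std::uint64_t key, std::size_t n, std::uint64_t *r)
{
    for (std::size_t i = 0; i != n; ++i)
        r[i] = toy_encrypt(++ctr, key);
}
```

The key property the loop preserves: generating `n` blocks in one call or in several smaller calls yields the same stream, because all state lives in the counter.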
// // Redistribution and use in source and binary forms, with or without diff --git a/include/vsmc/rng/bernoulli_distribution.hpp b/include/vsmc/rng/bernoulli_distribution.hpp deleted file mode 100644 index 01de2ee1e..000000000 --- a/include/vsmc/rng/bernoulli_distribution.hpp +++ /dev/null @@ -1,124 +0,0 @@ -//============================================================================ -// vSMC/include/vsmc/rng/bernoulli_distribution.hpp -//---------------------------------------------------------------------------- -// vSMC: Scalable Monte Carlo -//---------------------------------------------------------------------------- -// Copyright (c) 2013-2015, Yan Zhou -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are met: -// -// Redistributions of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. -// -// Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS AS IS -// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE -// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -// POSSIBILITY OF SUCH DAMAGE. -//============================================================================ - -#ifndef VSMC_RNG_BERNOULLI_DISTRIBUTION_HPP -#define VSMC_RNG_BERNOULLI_DISTRIBUTION_HPP - -#include -#include - -namespace vsmc -{ - -namespace internal -{ - -inline bool bernoulli_distribution_check_param(double p) -{ - return p >= 0 && p <= 1; -} - -} // namespace vsmc::internal - -/// \brief Bernoulli distribution -/// \ingroup Distribution -template -class BernoulliDistribution -{ - VSMC_DEFINE_RNG_DISTRIBUTION_1( - Bernoulli, bernoulli, IntType, double, p, 0.5) - VSMC_DEFINE_RNG_DISTRIBUTION_OPERATORS - - public: - result_type min VSMC_MNE() const { return static_cast(0); } - result_type max VSMC_MNE() const { return static_cast(1); } - void reset() {} - - private: - template - result_type generate(RNGType &rng, const param_type &param) - { - U01CODistribution runif; - double u = runif(rng); - - return generate(u, param.p(), static_cast(nullptr)); - } - - static bool generate(double u, double p, bool *) { return u < p; } - - template - static U generate(double u, double p, U *) - { - return u < p ?
1 : 0; - } -}; // class BernoulliDistribution - -namespace internal -{ - -template -inline void bernoulli_distribution_impl( - RNGType &rng, std::size_t n, IntType *r, double p) -{ - double u[K]; - u01_co_distribution(rng, n, u); - std::memset(r, 0, sizeof(IntType) * n); - for (std::size_t i = 0; i != n; ++i) - if (u[i] < p) - r[i] = 1; -} - -} // namespace vsmc::internal - -/// \brief Generating bernoulli random variates -/// \ingroup Distribution -template -inline void bernoulli_distribution( - RNGType &rng, std::size_t n, IntType *r, IntType p) -{ - const std::size_t k = 1000; - const std::size_t m = n / k; - const std::size_t l = n % k; - for (std::size_t i = 0; i != m; ++i) - internal::bernoulli_distribution_impl(rng, k, r + i * k, p); - internal::bernoulli_distribution_impl(rng, l, r + m * k, p); -} - -template -inline void rng_rand(RNGType &rng, BernoulliDistribution &dist, - std::size_t n, IntType *r) -{ - dist(rng, n, r); -} - -} // namespace vsmc - -#endif // VSMC_RNG_BERNOULLI_DISTRIBUTION_HPP diff --git a/include/vsmc/rng/beta_distribution.hpp b/include/vsmc/rng/beta_distribution.hpp index 9659daa79..85d0e5076 100644 --- a/include/vsmc/rng/beta_distribution.hpp +++ b/include/vsmc/rng/beta_distribution.hpp @@ -3,7 +3,7 @@ //---------------------------------------------------------------------------- // vSMC: Scalable Monte Carlo //---------------------------------------------------------------------------- -// Copyright (c) 2013-2015, Yan Zhou +// Copyright (c) 2013-2016, Yan Zhou // All rights reserved. 
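The deleted `bernoulli_distribution_impl` above vectorizes Bernoulli generation by filling a buffer of uniforms and thresholding: `r[i] = 1` where `u[i] < p`. The same idea in a standalone sketch using only the standard library (`bernoulli_fill` is an illustrative name, not vSMC API):

```cpp
#include <cassert>
#include <cstddef>
#include <random>
#include <vector>

// Bernoulli(p) variates by thresholding uniforms on [0, 1):
// r[i] = 1 exactly when u[i] < p, so P(r[i] = 1) = p.
template <typename IntType, typename RNGType>
void bernoulli_fill(RNGType &rng, std::size_t n, IntType *r, double p)
{
    std::uniform_real_distribution<double> u01(0, 1);
    std::vector<double> u(n);
    for (std::size_t i = 0; i != n; ++i)
        u[i] = u01(rng); // fill uniforms first (vectorizable in bulk)
    for (std::size_t i = 0; i != n; ++i)
        r[i] = u[i] < p ? 1 : 0; // then threshold
}
```

Separating the uniform fill from the threshold pass is what lets the library batch the expensive RNG work, the same structure the deleted code had.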
// // Redistribution and use in source and binary forms, with or without @@ -33,8 +33,8 @@ #define VSMC_RNG_BETA_DISTRIBUTION_HPP #include -#include #include +#include namespace vsmc { @@ -74,7 +74,8 @@ class BetaDistributionConstant const RealType K = static_cast(0.852); const RealType C = static_cast(-0.956); const RealType D = beta + K * alpha * alpha + C; - if (is_equal(alpha, 0.5) && is_equal(beta, 0.5)) + if (is_equal(alpha, static_cast(0.5)) && + is_equal(beta, static_cast(0.5))) algorithm = BetaDistributionAlgorithmAS; else if (is_equal(alpha, 1) && is_equal(beta, 1)) algorithm = BetaDistributionAlgorithm11; @@ -155,13 +156,13 @@ class BetaDistributionConstant template class BetaDistribution { - VSMC_DEFINE_RNG_DISTRIBUTION_2( - Beta, beta, RealType, result_type, alpha, 1, result_type, beta, 1) - VSMC_DEFINE_RNG_DISTRIBUTION_OPERATORS + VSMC_DEFINE_RNG_DISTRIBUTION_2(Beta, beta, alpha, 1, beta, 1) public: - result_type min VSMC_MNE() const { return 0; } - result_type max VSMC_MNE() const { return 1; } + result_type min() const { return 0; } + + result_type max() const { return 1; } + void reset() { constant_.reset(alpha(), beta()); } private: @@ -221,8 +222,8 @@ class BetaDistribution result_type generate_as(RNGType &rng, const param_type &, const internal::BetaDistributionConstant &) { - U01CODistribution runif; - result_type u = runif(rng); + U01Distribution u01; + result_type u = u01(rng); u = std::sin( -const_pi_by2() + const_pi() * u); @@ -234,42 +235,42 @@ class BetaDistribution result_type generate_11(RNGType &rng, const param_type &, const internal::BetaDistributionConstant &) { - U01OODistribution runif; + U01Distribution u01; - return runif(rng); + return u01(rng); } template result_type generate_1x(RNGType &rng, const param_type &, const internal::BetaDistributionConstant &constant) { - U01OODistribution runif; + U01Distribution u01; - return 1 - std::exp(constant.b * std::log(runif(rng))); + return 1 - std::exp(constant.b * std::log(u01(rng))); 
} template result_type generate_x1(RNGType &rng, const param_type &, const internal::BetaDistributionConstant &constant) { - U01OODistribution runif; + U01Distribution u01; - return std::exp(constant.a * std::log(runif(rng))); + return std::exp(constant.a * std::log(u01(rng))); } template result_type generate_c(RNGType &rng, const param_type &param, const internal::BetaDistributionConstant &constant) { - U01CODistribution runif; + U01Distribution u01; const result_type ln_4 = 2 * const_ln_2(); result_type x = 0; result_type y = 0; result_type left = 0; result_type right = 0; do { - result_type u1 = runif(rng); - result_type u2 = runif(rng); + result_type u1 = u01(rng); + result_type u2 = u01(rng); result_type v = constant.b * std::log(u1 / (1 - u1)); x = param.alpha() * std::exp(v); y = param.beta() + x; @@ -285,12 +286,12 @@ class BetaDistribution result_type generate_j(RNGType &rng, const param_type &, const internal::BetaDistributionConstant &constant) { - U01CODistribution runif; + U01Distribution u01; result_type x = 0; result_type y = 0; do { - x = std::pow(runif(rng), constant.a); - y = std::pow(runif(rng), constant.b); + x = std::pow(u01(rng), constant.a); + y = std::pow(u01(rng), constant.b); } while (x + y > 1); return x / (x + y); @@ -300,10 +301,10 @@ class BetaDistribution result_type generate_a1(RNGType &rng, const param_type &param, const internal::BetaDistributionConstant &constant) { - U01CODistribution runif; + U01Distribution u01; while (true) { - result_type u = runif(rng); - result_type e = -std::log(runif(rng)); + result_type u = u01(rng); + result_type e = -std::log(u01(rng)); result_type x = 0; result_type v = 0; if (u < constant.p) { @@ -324,10 +325,10 @@ class BetaDistribution result_type generate_a2(RNGType &rng, const param_type &param, const internal::BetaDistributionConstant &constant) { - U01CODistribution runif; + U01Distribution u01; while (true) { - result_type u = runif(rng); - result_type e = -std::log(runif(rng)); + result_type u =
u01(rng); + result_type e = -std::log(u01(rng)); result_type x = 0; result_type v = 0; if (u < constant.p) { @@ -348,10 +349,10 @@ class BetaDistribution result_type generate_a3(RNGType &rng, const param_type &param, const internal::BetaDistributionConstant &constant) { - U01CODistribution runif; + U01Distribution u01; while (true) { - result_type u = runif(rng); - result_type e = -std::log(runif(rng)); + result_type u = u01(rng); + result_type e = -std::log(u01(rng)); result_type x = 0; result_type v = 0; if (u < constant.p) { @@ -377,7 +378,7 @@ inline std::size_t beta_distribution_impl_as(RNGType &rng, std::size_t n, RealType *r, RealType, RealType, const BetaDistributionConstant &) { - u01_oo_distribution(rng, n, r); + u01_distribution(rng, n, r); fma(n, const_pi(), r, -const_pi_by2(), r); sin(n, r, r); fma(n, static_cast(0.5), r, static_cast(0.5), r); @@ -390,7 +391,7 @@ inline std::size_t beta_distribution_impl_11(RNGType &rng, std::size_t n, RealType *r, RealType, RealType, const BetaDistributionConstant &) { - u01_oo_distribution(rng, n, r); + u01_distribution(rng, n, r); return n; } @@ -400,7 +401,7 @@ inline std::size_t beta_distribution_impl_1x(RNGType &rng, std::size_t n, RealType *r, RealType, RealType, const BetaDistributionConstant &constant) { - u01_oo_distribution(rng, n, r); + u01_distribution(rng, n, r); log(n, r, r); mul(n, constant.b, r, r); exp(n, r, r); @@ -414,7 +415,7 @@ inline std::size_t beta_distribution_impl_x1(RNGType &rng, std::size_t n, RealType *r, RealType, RealType, const BetaDistributionConstant &constant) { - u01_oo_distribution(rng, n, r); + u01_distribution(rng, n, r); log(n, r, r); mul(n, constant.a, r, r); exp(n, r, r); @@ -438,7 +439,7 @@ inline std::size_t beta_distribution_impl_c(RNGType &rng, std::size_t n, RealType *const v = s + n * 2; RealType *const x = s + n * 3; RealType *const y = s + n * 4; - u01_co_distribution(rng, n * 2, s); + u01_distribution(rng, n * 2, s); sub(n, static_cast(1), u1, v); div(n, u1, v, v); log(n,
v, v); @@ -474,7 +475,7 @@ inline std::size_t beta_distribution_impl_j(RNGType &rng, std::size_t n, RealType *const x = s; RealType *const y = s + n; RealType *const u = s + n * 2; - u01_co_distribution(rng, n * 2, s); + u01_distribution(rng, n * 2, s); pow(n, x, a, x); pow(n, y, b, y); add(n, x, y, u); @@ -554,7 +555,11 @@ template inline void beta_distribution( RNGType &rng, std::size_t n, RealType *r, RealType alpha, RealType beta) { - const std::size_t k = 1000; + static_assert(std::is_floating_point::value, + "**beta_distribution** USED WITH RealType OTHER THAN FLOATING POINT " + "TYPES"); + + const std::size_t k = 1024; const internal::BetaDistributionConstant constant(alpha, beta); while (n > k) { std::size_t m = internal::beta_distribution_impl( @@ -575,12 +580,7 @@ inline void beta_distribution( } } -template -inline void rng_rand( - RNGType &rng, BetaDistribution &dist, std::size_t n, RealType *r) -{ - dist(rng, n, r); -} +VSMC_DEFINE_RNG_DISTRIBUTION_RAND_2(Beta, beta, alpha, beta) } // namespace vsmc diff --git a/include/vsmc/rng/cauchy_distribution.hpp b/include/vsmc/rng/cauchy_distribution.hpp index 0d43e4be3..6177e1d7f 100644 --- a/include/vsmc/rng/cauchy_distribution.hpp +++ b/include/vsmc/rng/cauchy_distribution.hpp @@ -3,7 +3,7 @@ //---------------------------------------------------------------------------- // vSMC: Scalable Monte Carlo //---------------------------------------------------------------------------- -// Copyright (c) 2013-2015, Yan Zhou +// Copyright (c) 2013-2016, Yan Zhou // All rights reserved. 
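The `CauchyDistribution` changes that follow draw by inverting the Cauchy CDF: for U uniform on (0, 1), `a + b * tan(pi * (U - 1/2))` is Cauchy(a, b), and the diffed code uses the equivalent `a + b * tan(pi * (1 - u))`. A standalone sketch (`cauchy_icdf` and `cauchy_draw` are illustrative names, not vSMC API):

```cpp
#include <cassert>
#include <cmath>
#include <random>

// Inverse CDF of Cauchy(a, b): F^{-1}(u) = a + b * tan(pi * (u - 1/2)).
template <typename RealType>
RealType cauchy_icdf(RealType u, RealType a, RealType b)
{
    const RealType pi = static_cast<RealType>(3.14159265358979323846L);
    return a + b * std::tan(pi * (u - static_cast<RealType>(0.5)));
}

// Inversion sampling: push a standard uniform through the inverse CDF.
template <typename RealType, typename RNGType>
RealType cauchy_draw(RNGType &rng, RealType a, RealType b)
{
    std::uniform_real_distribution<RealType> u01(0, 1);
    return cauchy_icdf(u01(rng), a, b);
}
```

Sanity checks: the median maps back to the location parameter (`F^{-1}(1/2) = a`) and the upper quartile of the standard Cauchy is 1 (`tan(pi/4) = 1`).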
// // Redistribution and use in source and binary forms, with or without @@ -54,20 +54,15 @@ inline bool cauchy_distribution_check_param(RealType, RealType b) template class CauchyDistribution { - VSMC_DEFINE_RNG_DISTRIBUTION_2( - Cauchy, cauchy, RealType, result_type, a, 0, result_type, b, 1) - VSMC_DEFINE_RNG_DISTRIBUTION_OPERATORS + VSMC_DEFINE_RNG_DISTRIBUTION_2(Cauchy, cauchy, a, 0, b, 1) public: - result_type min VSMC_MNE() const + result_type min() const { - return -std::numeric_limits::max VSMC_MNE(); + return std::numeric_limits::lowest(); } - result_type max VSMC_MNE() const - { - return std::numeric_limits::max VSMC_MNE(); - } + result_type max() const { return std::numeric_limits::max(); } void reset() {} @@ -75,10 +70,10 @@ class CauchyDistribution template result_type generate(RNGType &rng, const param_type &param) { - U01CODistribution runif; + U01Distribution u01; return param.a() + - param.b() * std::tan(const_pi() * runif(rng)); + param.b() * std::tan(const_pi() * (1 - u01(rng))); } }; // class CauchyDistribution @@ -89,7 +84,8 @@ template inline void cauchy_distribution_impl( RNGType &rng, std::size_t n, RealType *r, RealType a, RealType b) { - u01_co_distribution(rng, n, r); + u01_distribution(rng, n, r); + sub(n, static_cast(1), r, r); mul(n, const_pi(), r, r); tan(n, r, r); for (std::size_t i = 0; i != n; ++i) @@ -104,20 +100,19 @@ template inline void cauchy_distribution( RNGType &rng, std::size_t n, RealType *r, RealType a, RealType b) { - const std::size_t k = 1000; + static_assert(std::is_floating_point::value, + "**cauchy_distribution** USED WITH RealType OTHER THAN FLOATING POINT " + "TYPES"); + + const std::size_t k = 1024; const std::size_t m = n / k; const std::size_t l = n % k; - for (std::size_t i = 0; i != m; ++i) - internal::cauchy_distribution_impl(rng, k, r + i * k, a, b); - internal::cauchy_distribution_impl(rng, l, r + m * k, a, b); + for (std::size_t i = 0; i != m; ++i, r += k) + internal::cauchy_distribution_impl(rng, k, r, a,
b); + internal::cauchy_distribution_impl(rng, l, r, a, b); } -template -inline void rng_rand(RNGType &rng, CauchyDistribution &dist, - std::size_t n, RealType *r) -{ - dist(rng, n, r); -} +VSMC_DEFINE_RNG_DISTRIBUTION_RAND_2(Cauchy, cauchy, a, b) } // namespace vsmc diff --git a/include/vsmc/rng/chi_squared_distribution.hpp b/include/vsmc/rng/chi_squared_distribution.hpp index 85bf1fe6e..96b214aa1 100644 --- a/include/vsmc/rng/chi_squared_distribution.hpp +++ b/include/vsmc/rng/chi_squared_distribution.hpp @@ -3,7 +3,7 @@ //---------------------------------------------------------------------------- // vSMC: Scalable Monte Carlo //---------------------------------------------------------------------------- -// Copyright (c) 2013-2015, Yan Zhou +// Copyright (c) 2013-2016, Yan Zhou // All rights reserved. // // Redistribution and use in source and binary forms, with or without @@ -54,17 +54,12 @@ inline bool chi_squared_distribution_check_param(RealType n) template class ChiSquaredDistribution { - VSMC_DEFINE_RNG_DISTRIBUTION_1( - ChiSquared, chi_squared, RealType, result_type, n, 1) - VSMC_DEFINE_RNG_DISTRIBUTION_OPERATORS + VSMC_DEFINE_RNG_DISTRIBUTION_1(ChiSquared, chi_squared, n, 1) public: - result_type min VSMC_MNE() const { return 0; } + result_type min() const { return 0; } - result_type max VSMC_MNE() const - { - return std::numeric_limits::max VSMC_MNE(); - } + result_type max() const { return std::numeric_limits::max(); } void reset() { gamma_ = GammaDistribution(n() / 2, 2); } @@ -89,15 +84,14 @@ template inline void chi_squared_distribution( RNGType &rng, std::size_t n, RealType *r, RealType df) { + static_assert(std::is_floating_point::value, + "**chi_squared_distribution** USED WITH RealType OTHER THAN FLOATING " + "POINT TYPES"); + gamma_distribution(rng, n, r, df / 2, static_cast(2)); } -template -inline void rng_rand(RNGType &rng, ChiSquaredDistribution &dist, - std::size_t n, RealType *r) -{ - dist(rng, n, r); -} 
+VSMC_DEFINE_RNG_DISTRIBUTION_RAND_1(ChiSquared, chi_squared, n) } // namespace vsmc diff --git a/include/vsmc/rng/counter.hpp b/include/vsmc/rng/counter.hpp index 2af0ab4c8..55ef9c34e 100644 --- a/include/vsmc/rng/counter.hpp +++ b/include/vsmc/rng/counter.hpp @@ -3,7 +3,7 @@ //---------------------------------------------------------------------------- // vSMC: Scalable Monte Carlo //---------------------------------------------------------------------------- -// Copyright (c) 2013-2015, Yan Zhou +// Copyright (c) 2013-2016, Yan Zhou // All rights reserved. // // Redistribution and use in source and binary forms, with or without @@ -69,7 +69,7 @@ inline void increment(std::array &ctr) template inline void increment(std::array &ctr, std::integral_constant) { - if (ctr.front() < std::numeric_limits::max VSMC_MNE() - NSkip) { + if (ctr.front() < std::numeric_limits::max() - NSkip) { ctr.front() += NSkip; } else { ctr.front() += NSkip; @@ -83,7 +83,7 @@ inline void increment(std::array &ctr, std::integral_constant) template inline void increment(std::array &ctr, T nskip) { - if (ctr.front() < std::numeric_limits::max VSMC_MNE() - nskip) { + if (ctr.front() < std::numeric_limits::max() - nskip) { ctr.front() += nskip; } else { ctr.front() += nskip; @@ -151,8 +151,7 @@ inline void increment( { internal::increment_block_set<0>( ctr, ctr_block, std::integral_constant()); - if (ctr.front() < - std::numeric_limits::max VSMC_MNE() - static_cast(Blocks)) { + if (ctr.front() < std::numeric_limits::max() - static_cast(Blocks)) { internal::increment_block_safe<0>( ctr, ctr_block, std::integral_constant()); } else { @@ -262,7 +261,7 @@ inline void increment( increment(ctr); const std::uint64_t m = - static_cast(std::numeric_limits::max VSMC_MNE()); + static_cast(std::numeric_limits::max()); const std::uint64_t l = static_cast(ctr.front()); const std::uint64_t k = static_cast(n); if (k < m && l < m - k) { @@ -357,34 +356,35 @@ class CounterEngine void operator()(std::size_t n, 
result_type *r) { - if (n * sizeof(result_type) <= 32) { - for (std::size_t i = 0; i != n; ++i) - r[i] = operator()(); - return; - } + const std::size_t remain = M_ - index_; - std::size_t p = 32 - - static_cast(reinterpret_cast(r) % 32); - if (p % sizeof(result_type) == 0) { - p /= sizeof(result_type); - for (std::size_t i = 0; i != p; ++i) - r[i] = operator()(); - n -= p; - r += p; + if (n < remain) { + std::memcpy(r, buffer_.data() + index_, sizeof(result_type) * n); + index_ += n; + return; } - const std::size_t q = generator_(ctr_, key_, n, r); - n -= q; - r += q; - - const std::size_t m = n / M_; - std::array *s = - reinterpret_cast *>(r); - for (std::size_t i = 0; i != m; ++i) - generator_(ctr_, key_, s[i]); - n -= m * M_; - r += m * M_; + std::memcpy(r, buffer_.data() + index_, sizeof(result_type) * remain); + r += remain; + n -= remain; + index_ = M_; + const std::size_t k = 1024 / M_; + if (k != 0) { + const std::size_t m = (n / M_) / k; + const std::size_t l = (n / M_) % k; + alignas(32) std::array buffer[k]; + for (std::size_t i = 0; i != m; ++i) { + generator_(ctr_, key_, k, buffer); + std::memcpy(r, buffer, sizeof(result_type) * M_ * k); + r += k * M_; + n -= k * M_; + } + generator_(ctr_, key_, l, buffer); + std::memcpy(r, buffer, sizeof(result_type) * M_ * l); + r += l * M_; + n -= l * M_; + } for (std::size_t i = 0; i != n; ++i) r[i] = operator()(); } @@ -411,14 +411,14 @@ class CounterEngine index_ = n % M_; } - static constexpr result_type min VSMC_MNE() + static constexpr result_type min() { - return std::numeric_limits::min VSMC_MNE(); + return std::numeric_limits::min(); } - static constexpr result_type max VSMC_MNE() + static constexpr result_type max() { - return std::numeric_limits::max VSMC_MNE(); + return std::numeric_limits::max(); } friend bool operator==(const CounterEngine &eng1, @@ -481,11 +481,11 @@ class CounterEngine private: static constexpr std::size_t M_ = Generator::size(); - std::array buffer_; + alignas(32) std::array 
buffer_; + std::size_t index_; ctr_type ctr_; key_type key_; Generator generator_; - std::size_t index_; void reset() { diff --git a/include/vsmc/rng/discrete_distribution.hpp b/include/vsmc/rng/discrete_distribution.hpp index cc0affe7c..da58597ea 100644 --- a/include/vsmc/rng/discrete_distribution.hpp +++ b/include/vsmc/rng/discrete_distribution.hpp @@ -3,7 +3,7 @@ //---------------------------------------------------------------------------- // vSMC: Scalable Monte Carlo //---------------------------------------------------------------------------- -// Copyright (c) 2013-2015, Yan Zhou +// Copyright (c) 2013-2016, Yan Zhou // All rights reserved. // // Redistribution and use in source and binary forms, with or without @@ -34,7 +34,6 @@ #include #include -#include #define VSMC_RUNTIME_ASSERT_RNG_DISCRETE_DISTRIBUTION_POSITIVE(flag) \ VSMC_RUNTIME_ASSERT( \ @@ -61,8 +60,7 @@ class DiscreteDistribution param_type() {} template - param_type(InputIter first, InputIter last) - : probability_(first, last) + param_type(InputIter first, InputIter last) : probability_(first, last) { invariant(); } @@ -192,8 +190,7 @@ class DiscreteDistribution DiscreteDistribution() {} template - DiscreteDistribution(InputIter first, InputIter last) - : param_(first, last) + DiscreteDistribution(InputIter first, InputIter last) : param_(first, last) { } @@ -216,9 +213,9 @@ class DiscreteDistribution { } - result_type min VSMC_MNE() const { return 0; } + result_type min() const { return 0; } - result_type max VSMC_MNE() const + result_type max() const { return param_.size() == 0 ? 
0 : param_.size() - 1; } @@ -257,8 +254,8 @@ class DiscreteDistribution using value_type = typename std::iterator_traits::value_type; - U01CODistribution runif; - value_type u = runif(rng); + U01Distribution u01; + value_type u = u01(rng); if (!normalized) { value_type mulw = @@ -289,7 +286,37 @@ class DiscreteDistribution return index - 1; } - VSMC_DEFINE_RNG_DISTRIBUTION_OPERATORS + friend bool operator==( + const distribution_type &dist1, const distribution_type &dist2) + { + return dist1.param_ == dist2.param_; + } + + friend bool operator!=( + const distribution_type &dist1, const distribution_type &dist2) + { + return !(dist1 == dist2); + } + + template + friend std::basic_ostream &operator<<( + std::basic_ostream &os, const distribution_type &dist) + { + os << dist.param_; + + return os; + } + + template + friend std::basic_istream &operator>>( + std::basic_istream &is, distribution_type &dist) + { + is >> std::ws >> dist.param_; + if (is.good()) + dist.reset(); + + return is; + } private: param_type param_; diff --git a/include/vsmc/rng/distribution.hpp b/include/vsmc/rng/distribution.hpp index c4c6f628f..cf3a4dbff 100644 --- a/include/vsmc/rng/distribution.hpp +++ b/include/vsmc/rng/distribution.hpp @@ -3,7 +3,7 @@ //---------------------------------------------------------------------------- // vSMC: Scalable Monte Carlo //---------------------------------------------------------------------------- -// Copyright (c) 2013-2015, Yan Zhou +// Copyright (c) 2013-2016, Yan Zhou // All rights reserved. 
// // Redistribution and use in source and binary forms, with or without @@ -33,10 +33,10 @@ #define VSMC_RNG_DISTRIBUTION_HPP #include -#include #include #include #include +#include #include #include #include @@ -46,12 +46,13 @@ #include #include #include +#include #include #include #include #include +#include #include #include -#include #endif // VSMC_RNG_DISTRIBUTION_HPP diff --git a/include/vsmc/rng/engine.hpp b/include/vsmc/rng/engine.hpp index c6c0aaa2a..3490701a2 100644 --- a/include/vsmc/rng/engine.hpp +++ b/include/vsmc/rng/engine.hpp @@ -3,7 +3,7 @@ //---------------------------------------------------------------------------- // vSMC: Scalable Monte Carlo //---------------------------------------------------------------------------- -// Copyright (c) 2013-2015, Yan Zhou +// Copyright (c) 2013-2016, Yan Zhou // All rights reserved. // // Redistribution and use in source and binary forms, with or without diff --git a/include/vsmc/rng/exponential_distribution.hpp b/include/vsmc/rng/exponential_distribution.hpp index 60175c075..d45093467 100644 --- a/include/vsmc/rng/exponential_distribution.hpp +++ b/include/vsmc/rng/exponential_distribution.hpp @@ -3,7 +3,7 @@ //---------------------------------------------------------------------------- // vSMC: Scalable Monte Carlo //---------------------------------------------------------------------------- -// Copyright (c) 2013-2015, Yan Zhou +// Copyright (c) 2013-2016, Yan Zhou // All rights reserved. 
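The `ExponentialDistribution` rewrite below replaces the `U01OCDistribution`-based draw `-log(u) / lambda` with `-log(1 - u) / lambda`, so a single half-open U01 source suffices (`1 - u > 0` when `u` is in [0, 1)). The inversion itself, as a standalone sketch (`exponential_icdf` is an illustrative name, not vSMC API):

```cpp
#include <cassert>
#include <cmath>

// Inverse CDF of Exponential(lambda): F(x) = 1 - exp(-lambda * x),
// so F^{-1}(u) = -log(1 - u) / lambda for u in [0, 1).
template <typename RealType>
RealType exponential_icdf(RealType u, RealType lambda)
{
    return -std::log(1 - u) / lambda;
}
```

The vectorized `exponential_distribution_impl` in the diff computes the same transform in bulk: fill uniforms, `sub` to form `1 - u`, `log`, then scale by `-1 / lambda`.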
// // Redistribution and use in source and binary forms, with or without @@ -54,17 +54,12 @@ inline bool exponential_distribution_check_param(RealType lambda) template class ExponentialDistribution { - VSMC_DEFINE_RNG_DISTRIBUTION_1( - Exponential, exponential, RealType, result_type, lambda, 1) - VSMC_DEFINE_RNG_DISTRIBUTION_OPERATORS + VSMC_DEFINE_RNG_DISTRIBUTION_1(Exponential, exponential, lambda, 1) public: - result_type min VSMC_MNE() const { return 0; } + result_type min() const { return 0; } - result_type max VSMC_MNE() const - { - return std::numeric_limits::max VSMC_MNE(); - } + result_type max() const { return std::numeric_limits::max(); } void reset() {} @@ -72,9 +67,9 @@ class ExponentialDistribution template result_type generate(RNGType &rng, const param_type &param) { - U01OCDistribution runif; + U01Distribution u01; - return -std::log(runif(rng)) / param.lambda(); + return -std::log(1 - u01(rng)) / param.lambda(); } }; // class ExponentialDistribution @@ -85,7 +80,8 @@ template inline void exponential_distribution_impl( RNGType &rng, std::size_t n, RealType *r, RealType lambda) { - u01_oc_distribution(rng, n, r); + u01_distribution(rng, n, r); + sub(n, static_cast(1), r, r); log(n, r, r); mul(n, -1 / lambda, r, r); } @@ -98,20 +94,19 @@ template inline void exponential_distribution( RNGType &rng, std::size_t n, RealType *r, RealType lambda) { - const std::size_t k = 1000; + static_assert(std::is_floating_point::value, + "**exponential_distribution** USED WITH RealType OTHER THAN FLOATING " + "POINT TYPES"); + + const std::size_t k = 1024; const std::size_t m = n / k; const std::size_t l = n % k; - for (std::size_t i = 0; i != m; ++i) - internal::exponential_distribution_impl(rng, k, r + i * k, lambda); - internal::exponential_distribution_impl(rng, l, r + m * k, lambda); + for (std::size_t i = 0; i != m; ++i, r += k) + internal::exponential_distribution_impl(rng, k, r, lambda); + internal::exponential_distribution_impl(rng, l, r, lambda); } -template
-inline void rng_rand(RNGType &rng, ExponentialDistribution &dist, - std::size_t n, RealType *r) -{ - dist(rng, n, r); -} +VSMC_DEFINE_RNG_DISTRIBUTION_RAND_1(Exponential, exponential, lambda) } // namespace vsmc diff --git a/include/vsmc/rng/extreme_value_distribution.hpp b/include/vsmc/rng/extreme_value_distribution.hpp index 5b3cadd03..2fec9e914 100644 --- a/include/vsmc/rng/extreme_value_distribution.hpp +++ b/include/vsmc/rng/extreme_value_distribution.hpp @@ -3,7 +3,7 @@ //---------------------------------------------------------------------------- // vSMC: Scalable Monte Carlo //---------------------------------------------------------------------------- -// Copyright (c) 2013-2015, Yan Zhou +// Copyright (c) 2013-2016, Yan Zhou // All rights reserved. // // Redistribution and use in source and binary forms, with or without @@ -54,20 +54,15 @@ inline bool extreme_value_distribution_check_param(RealType, RealType b) template class ExtremeValueDistribution { - VSMC_DEFINE_RNG_DISTRIBUTION_2(ExtremeValue, extreme_value, RealType, - result_type, a, 0, result_type, b, 1) - VSMC_DEFINE_RNG_DISTRIBUTION_OPERATORS + VSMC_DEFINE_RNG_DISTRIBUTION_2(ExtremeValue, extreme_value, a, 0, b, 1) public: - result_type min VSMC_MNE() const + result_type min() const { - return -std::numeric_limits::max VSMC_MNE(); + return std::numeric_limits::lowest(); } - result_type max VSMC_MNE() const - { - return std::numeric_limits::max VSMC_MNE(); - } + result_type max() const { return std::numeric_limits::max(); } void reset() {} @@ -75,9 +70,9 @@ class ExtremeValueDistribution template result_type generate(RNGType &rng, const param_type &param) { - U01OODistribution runif; + U01Distribution u01; - return param.a() - param.b() * std::log(-std::log(runif(rng))); + return param.a() - param.b() * std::log(-std::log(u01(rng))); } }; // class ExtremeValueDistribution @@ -88,7 +83,7 @@ template inline void extreme_value_distribution_impl( RNGType &rng, std::size_t n, RealType *r, RealType a,
RealType b) { - u01_oo_distribution(rng, n, r); + u01_distribution(rng, n, r); log(n, r, r); mul(n, static_cast(-1), r, r); log(n, r, r); @@ -103,20 +98,19 @@ template inline void extreme_value_distribution( RNGType &rng, std::size_t n, RealType *r, RealType a, RealType b) { - const std::size_t k = 1000; + static_assert(std::is_floating_point::value, + "**extreme_value_distribution** USED WITH RealType OTHER THAN " + "FLOATING POINT TYPES"); + + const std::size_t k = 1024; const std::size_t m = n / k; const std::size_t l = n % k; - for (std::size_t i = 0; i != m; ++i) - internal::extreme_value_distribution_impl(rng, k, r + i * k, a, b); - internal::extreme_value_distribution_impl(rng, l, r + m * k, a, b); + for (std::size_t i = 0; i != m; ++i, r += k) + internal::extreme_value_distribution_impl(rng, k, r, a, b); + internal::extreme_value_distribution_impl(rng, l, r, a, b); } -template -inline void rng_rand(RNGType &rng, ExtremeValueDistribution &dist, - std::size_t n, RealType *r) -{ - dist(rng, n, r); -} +VSMC_DEFINE_RNG_DISTRIBUTION_RAND_2(ExtremeValue, extreme_value, a, b) } // namespace vsmc diff --git a/include/vsmc/rng/fisher_f_distribution.hpp b/include/vsmc/rng/fisher_f_distribution.hpp index 144397003..45165a793 100644 --- a/include/vsmc/rng/fisher_f_distribution.hpp +++ b/include/vsmc/rng/fisher_f_distribution.hpp @@ -3,7 +3,7 @@ //---------------------------------------------------------------------------- // vSMC: Scalable Monte Carlo //---------------------------------------------------------------------------- -// Copyright (c) 2013-2015, Yan Zhou +// Copyright (c) 2013-2016, Yan Zhou // All rights reserved. 
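The exponential and extreme value generators above are textbook inverse-CDF transforms: draw `u` from U[0,1) and map it through the quantile function (for the exponential case, `-log(1 - u) / lambda`). A self-contained sketch of that transform using only the standard library — the function names here are illustrative, not the vSMC API:

```cpp
#include <cassert>
#include <cmath>
#include <cstddef>
#include <random>

// Inverse-CDF sketch of exponential sampling: if U ~ U[0,1), then
// -log(1 - U) / lambda is Exponential(lambda) distributed.
template <typename RealType, typename RNGType>
RealType exponential_sample(RNGType &rng, RealType lambda)
{
    std::uniform_real_distribution<RealType> u01(0, 1);
    return -std::log(1 - u01(rng)) / lambda;
}

// Bulk form mirroring the vectorized path in the patch, which applies the
// same transform via u01_distribution, sub, log and mul.
template <typename RealType, typename RNGType>
void exponential_fill(RNGType &rng, std::size_t n, RealType *r, RealType lambda)
{
    std::uniform_real_distribution<RealType> u01(0, 1);
    for (std::size_t i = 0; i != n; ++i)
        r[i] = -std::log(1 - u01(rng)) / lambda;
}
```

Using `1 - u` keeps the logarithm's argument in (0, 1], so the result is always finite and non-negative.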
// // Redistribution and use in source and binary forms, with or without @@ -54,17 +54,12 @@ inline bool fisher_f_distribution_check_param(RealType m, RealType n) template class FisherFDistribution { - VSMC_DEFINE_RNG_DISTRIBUTION_2( - FisherF, fisher_f, RealType, result_type, m, 1, result_type, n, 1) - VSMC_DEFINE_RNG_DISTRIBUTION_OPERATORS + VSMC_DEFINE_RNG_DISTRIBUTION_2(FisherF, fisher_f, m, 1, n, 1) public: - result_type min VSMC_MNE() const { return 0; } + result_type min() const { return 0; } - result_type max VSMC_MNE() const - { - return std::numeric_limits::max VSMC_MNE(); - } + result_type max() const { return std::numeric_limits::max(); } void reset() { @@ -113,20 +108,19 @@ template inline void fisher_f_distribution( RNGType &rng, std::size_t n, RealType *r, RealType df1, RealType df2) { - const std::size_t k = 1000; + static_assert(std::is_floating_point::value, + "**fisher_f_distribution** USED WITH RealType OTHER THAN FLOATING " + "POINT TYPES"); + + const std::size_t k = 1024; const std::size_t m = n / k; const std::size_t l = n % k; - for (std::size_t i = 0; i != m; ++i) - internal::fisher_f_distribution_impl(rng, k, r + i * k, df1, df2); - internal::fisher_f_distribution_impl(rng, l, r + m * k, df1, df2); + for (std::size_t i = 0; i != m; ++i, r += k) + internal::fisher_f_distribution_impl(rng, k, r, df1, df2); + internal::fisher_f_distribution_impl(rng, l, r, df1, df2); } -template -inline void rng_rand(RNGType &rng, FisherFDistribution &dist, - std::size_t n, RealType *r) -{ - dist(rng, n, r); -} +VSMC_DEFINE_RNG_DISTRIBUTION_RAND_2(FisherF, fisher_f, m, n) } // namespace vsmc diff --git a/include/vsmc/rng/gamma_distribution.hpp b/include/vsmc/rng/gamma_distribution.hpp index a57075b53..45e1cf9b0 100644 --- a/include/vsmc/rng/gamma_distribution.hpp +++ b/include/vsmc/rng/gamma_distribution.hpp @@ -3,7 +3,7 @@ //---------------------------------------------------------------------------- // vSMC: Scalable Monte Carlo 
//---------------------------------------------------------------------------- -// Copyright (c) 2013-2015, Yan Zhou +// Copyright (c) 2013-2016, Yan Zhou // All rights reserved. // // Redistribution and use in source and binary forms, with or without @@ -33,8 +33,8 @@ #define VSMC_RNG_GAMMA_DISTRIBUTION_HPP #include -#include #include +#include namespace vsmc { @@ -66,7 +66,7 @@ class GammaDistributionConstant void reset(RealType alpha, RealType) { - if (alpha < 0.6) + if (alpha < static_cast(0.6L)) algorithm = GammaDistributionAlgorithmT; else if (alpha < 1) algorithm = GammaDistributionAlgorithmW; @@ -105,17 +105,12 @@ class GammaDistributionConstant template class GammaDistribution { - VSMC_DEFINE_RNG_DISTRIBUTION_2( - Gamma, gamma, RealType, result_type, alpha, 1, result_type, beta, 1) - VSMC_DEFINE_RNG_DISTRIBUTION_OPERATORS + VSMC_DEFINE_RNG_DISTRIBUTION_2(Gamma, gamma, alpha, 1, beta, 1) public: - result_type min VSMC_MNE() const { return 0; } + result_type min() const { return 0; } - result_type max VSMC_MNE() const - { - return std::numeric_limits::max VSMC_MNE(); - } + result_type max() const { return std::numeric_limits::max(); } void reset() { constant_.reset(alpha(), beta()); } @@ -161,10 +156,10 @@ class GammaDistribution result_type generate_t(RNGType &rng, const param_type &param, const internal::GammaDistributionConstant &constant) { - U01CODistribution runif; + U01Distribution u01; while (true) { - result_type u = runif(rng); - result_type e = -std::log(runif(rng)); + result_type u = u01(rng); + result_type e = -std::log(u01(rng)); if (u > constant.d) { u = -std::log(constant.c * (1 - u)); e += u; @@ -180,13 +175,13 @@ class GammaDistribution result_type generate_w(RNGType &rng, const param_type &, const internal::GammaDistributionConstant &constant) { - U01CODistribution runif; + U01Distribution u01; result_type u = 0; result_type e = 0; result_type r = 0; do { - u = -std::log(runif(rng)); - e = -std::log(runif(rng)); + u = -std::log(u01(rng)); + e
= -std::log(u01(rng)); r = std::exp(constant.c * std::log(u)); } while (u + e < constant.d + r); @@ -197,10 +192,10 @@ class GammaDistribution result_type generate_n(RNGType &rng, const param_type &, const internal::GammaDistributionConstant &constant) { - U01CODistribution runif; + U01Distribution u01; NormalDistribution rnorm(0, 1); while (true) { - result_type u = runif(rng); + result_type u = u01(rng); result_type e = 0; result_type v = 0; result_type w = 0; @@ -224,9 +219,9 @@ class GammaDistribution result_type generate_e(RNGType &rng, const param_type &, const internal::GammaDistributionConstant &) { - U01CODistribution runif; + U01Distribution u01; - return -std::log(runif(rng)); + return -std::log(u01(rng)); } }; // class GammaDistribution @@ -245,7 +240,7 @@ inline std::size_t gamma_distribution_impl_t(RNGType &rng, std::size_t n, RealType *const e = s + n; RealType *const x = s + n * 2; - u01_co_distribution(rng, n * 2, s); + u01_distribution(rng, n * 2, s); log(n, e, e); mul(n, static_cast(-1), e, e); for (std::size_t i = 0; i != n; ++i) { @@ -281,7 +276,7 @@ inline std::size_t gamma_distribution_impl_w(RNGType &rng, std::size_t n, RealType *const e = s + n; RealType *const x = s + n * 2; - u01_co_distribution(rng, n * 2, s); + u01_distribution(rng, n * 2, s); log(n * 2, s, s); mul(n * 2, static_cast(-1), s, s); log(n, s, x); @@ -313,7 +308,7 @@ inline std::size_t gamma_distribution_impl_n(RNGType &rng, std::size_t n, RealType *const w = s + n * 3; RealType *const x = s + n * 4; - u01_co_distribution(rng, n, u); + u01_distribution(rng, n, u); normal_distribution( rng, n, w, static_cast(0), static_cast(1)); fma(n, c, w, static_cast(1), v); @@ -352,7 +347,7 @@ inline std::size_t gamma_distribution_impl_e(RNGType &rng, std::size_t n, RealType *r, RealType, RealType beta, const GammaDistributionConstant &) { - u01_co_distribution(rng, n, r); + u01_distribution(rng, n, r); log(n, r, r); mul(n, -beta, r, r); @@ -389,7 +384,11 @@ template inline void 
gamma_distribution( RNGType &rng, std::size_t n, RealType *r, RealType alpha, RealType beta) { - const std::size_t k = 1000; + static_assert(std::is_floating_point::value, + "**gamma_distribution** USED WITH RealType OTHER THAN FLOATING POINT " + "TYPES"); + + const std::size_t k = 1024; const internal::GammaDistributionConstant constant(alpha); while (n > k) { std::size_t m = internal::gamma_distribution_impl( @@ -410,12 +409,7 @@ inline void gamma_distribution( } } -template -inline void rng_rand(RNGType &rng, GammaDistribution &dist, - std::size_t n, RealType *r) -{ - dist(rng, n, r); -} +VSMC_DEFINE_RNG_DISTRIBUTION_RAND_2(Gamma, gamma, alpha, beta) } // namespace vsmc diff --git a/include/vsmc/rng/internal/common.hpp b/include/vsmc/rng/internal/common.hpp index 4c8633d92..be2a0a703 100644 --- a/include/vsmc/rng/internal/common.hpp +++ b/include/vsmc/rng/internal/common.hpp @@ -3,7 +3,7 @@ //---------------------------------------------------------------------------- // vSMC: Scalable Monte Carlo //---------------------------------------------------------------------------- -// Copyright (c) 2013-2015, Yan Zhou +// Copyright (c) 2013-2016, Yan Zhou // All rights reserved. 
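Throughout this patch the vectorized `*_distribution` functions switch from `k = 1000` to `k = 1024` and now advance the output pointer by `k` per block instead of indexing with `r + i * k`: `m = n / k` full blocks are processed, then a remainder of `l = n % k` elements. A minimal standalone sketch of that loop structure, with a hypothetical `process` kernel standing in for the `internal::*_distribution_impl` functions:

```cpp
#include <cassert>
#include <cstddef>

// Blocked processing pattern used by the vectorized distribution
// functions: m full blocks of k elements, then a remainder of l = n % k.
// The output pointer is advanced by k after each full block.
template <typename RealType, typename Kernel>
void generate_blocked(std::size_t n, RealType *r, Kernel process)
{
    const std::size_t k = 1024;
    const std::size_t m = n / k;
    const std::size_t l = n % k;
    for (std::size_t i = 0; i != m; ++i, r += k)
        process(k, r); // full block of k variates
    process(l, r);     // remainder (may be zero elements)
}
```

Fixed-size blocks let the per-block kernels use stack workspace of a known size while keeping memory cost O(1) in `n`.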
// // Redistribution and use in source and binary forms, with or without @@ -42,24 +42,65 @@ VSMC_RUNTIME_ASSERT((flag), \ "**" #Name "Distribution** CONSTRUCTED WITH INVALID PARAMETERS") -#define VSMC_DEFINE_RNG_DISTRIBUTION_1(Name, name, T, T1, p1, v1) \ +#define VSMC_DEFINE_RNG_DISTRIBUTION_PARAM_TYPE_0(Name, T, t, Type) \ public: \ - using result_type = T; \ - using distribution_type = Name##Distribution; \ - \ class param_type \ { \ + static_assert(std::is_##t::value, \ + "**" #Name "Distribution::param_type** USED WITH " #T \ + " OTHER THAN " #Type " INTEGER TYPES"); \ + \ public: \ using result_type = T; \ using distribution_type = Name##Distribution; \ \ - explicit param_type(T1 p1 = v1) : p1##_(p1) \ + friend bool operator==(const param_type &, const param_type &) \ + { \ + return true; \ + } \ + \ + friend bool operator!=(const param_type &, const param_type &) \ + { \ + return false; \ + } \ + \ + template \ + friend std::basic_ostream &operator<<( \ + std::basic_ostream &os, const param_type &) \ + { \ + return os; \ + } \ + \ + template \ + friend std::basic_istream &operator>>( \ + std::basic_istream &is, param_type &) \ + { \ + return is; \ + } \ + \ + private: \ + friend distribution_type; \ + }; // class param_type + +#define VSMC_DEFINE_RNG_DISTRIBUTION_PARAM_TYPE_1(Name, name, p1, v1) \ + public: \ + class param_type \ { \ + static_assert(std::is_floating_point::value, \ + "**" #Name "Distribution::param_type** USED WITH RealType OTHER " \ + "THAN FLOATING POINT TYPES"); \ + \ + public: \ + using result_type = RealType; \ + using distribution_type = Name##Distribution; \ + \ + explicit param_type(result_type p1 = v1) : p1##_(p1) \ { \ VSMC_RUNTIME_ASSERT_RNG_DISTRIBUTION_PARAM( \ internal::name##_distribution_check_param(p1), Name); \ } \ \ - T1 p1() const { return p1##_; } \ + result_type p1() const { return p1##_; } \ \ friend bool operator==( \ const param_type &param1, const param_type &param2) \ @@ -94,7 +135,7 @@ if (!is.good()) \ return is; \ \
- T1 p1 = 0; \ + result_type p1 = 0; \ is >> std::ws >> p1; \ \ if (is.good()) { \ @@ -108,73 +149,32 @@ } \ \ private: \ - T1 p1##_; \ - }; \ - \ - explicit Name##Distribution(T1 p1 = v1) : param_(p1) { reset(); } \ - \ - explicit Name##Distribution(const param_type &param) : param_(param) \ - { \ - reset(); \ - } \ - \ - T1 p1() const { return param_.p1(); } \ - \ - param_type param() const { return param_; } \ - \ - void param(const param_type &parm) \ - { \ - param_ = parm; \ - reset(); \ - } \ - \ - template \ - result_type operator()(RNGType &rng) \ - { \ - return operator()(rng, param_); \ - } \ - \ - template \ - result_type operator()(RNGType &rng, const param_type &param) \ - { \ - return generate(rng, param); \ - } \ - \ - template \ - void operator()(RNGType &rng, std::size_t n, result_type *r) \ - { \ - operator()(rng, n, r, param_); \ - } \ - \ - template \ - void operator()( \ - RNGType &rng, std::size_t n, result_type *r, const param_type &param) \ - { \ - name##_distribution(rng, n, r, param.p1()); \ - } \ + result_type p1##_; \ \ - private: \ - param_type param_; + friend distribution_type; \ + }; // class param_type -#define VSMC_DEFINE_RNG_DISTRIBUTION_2(Name, name, T, T1, p1, v1, T2, p2, v2) \ +#define VSMC_DEFINE_RNG_DISTRIBUTION_PARAM_TYPE_2(Name, name, p1, v1, p2, v2) \ public: \ - using result_type = T; \ - using distribution_type = Name##Distribution; \ - \ class param_type \ { \ + static_assert(std::is_floating_point::value, \ + "**" #Name "Distribution::param_type** USED WITH RealType OTHER " \ + "THAN FLOATING POINT TYPES"); \ + \ public: \ - using result_type = T; \ - using distribution_type = Name##Distribution; \ + using result_type = RealType; \ + using distribution_type = Name##Distribution; \ \ - explicit param_type(T1 p1 = v1, T2 p2 = v2) : p1##_(p1), p2##_(p2) \ + explicit param_type(result_type p1 = v1, result_type p2 = v2) \ + : p1##_(p1), p2##_(p2) \ { \ VSMC_RUNTIME_ASSERT_RNG_DISTRIBUTION_PARAM( \
internal::name##_distribution_check_param(p1, p2), Name); \ } \ \ - T1 p1() const { return p1##_; } \ - T2 p2() const { return p2##_; } \ + result_type p1() const { return p1##_; } \ + result_type p2() const { return p2##_; } \ \ friend bool operator==( \ const param_type &param1, const param_type &param2) \ @@ -212,8 +212,8 @@ if (!is.good()) \ return is; \ \ - T1 p1 = 0; \ - T2 p2 = 0; \ + result_type p1 = 0; \ + result_type p2 = 0; \ is >> std::ws >> p1; \ is >> std::ws >> p2; \ \ @@ -230,11 +230,44 @@ } \ \ private: \ - T1 p1##_; \ - T2 p2##_; \ - }; \ + result_type p1##_; \ + result_type p2##_; \ + \ + friend distribution_type; \ + }; // class param_type + +#define VSMC_DEFINE_RNG_DISTRIBUTION_CONSTRUCTOR_0(Name, T) \ + public: \ + using result_type = T; \ + using distribution_type = Name##Distribution; \ + \ + Name##Distribution() = default; \ + explicit Name##Distribution(const param_type &) {} + +#define VSMC_DEFINE_RNG_DISTRIBUTION_CONSTRUCTOR_1(Name, p1, v1) \ + public: \ + using result_type = RealType; \ + using distribution_type = Name##Distribution; \ + \ + explicit Name##Distribution(result_type p1 = v1) : param_(p1) \ + { \ + reset(); \ + } \ + \ + explicit Name##Distribution(const param_type &param) : param_(param) \ + { \ + reset(); \ + } \ \ - explicit Name##Distribution(T1 p1 = v1, T2 p2 = v2) : param_(p1, p2) \ + result_type p1() const { return param_.p1(); } + +#define VSMC_DEFINE_RNG_DISTRIBUTION_CONSTRUCTOR_2(Name, p1, v1, p2, v2) \ + public: \ + using result_type = RealType; \ + using distribution_type = Name##Distribution; \ + \ + explicit Name##Distribution(result_type p1 = v1, result_type p2 = v2) \ + : param_(p1, p2) \ { \ reset(); \ } \ @@ -244,14 +277,22 @@ reset(); \ } \ \ - T1 p1() const { return param_.p1(); } \ - T2 p2() const { return param_.p2(); } \ + result_type p1() const { return param_.p1(); } \ + result_type p2() const { return param_.p2(); } + +#define VSMC_DEFINE_RNG_DISTRIBUTION_OPERATOR(Name, name) \ + public: \ + const param_type
&param() const { return param_; } \ \ - param_type param() const { return param_; } \ + void param(const param_type &param) \ + { \ + param_ = param; \ + reset(); \ + } \ \ - void param(const param_type &parm) \ + void param(param_type &&param) \ { \ - param_ = parm; \ + param_ = std::move(param); \ reset(); \ } \ \ @@ -277,14 +318,14 @@ void operator()( \ RNGType &rng, std::size_t n, result_type *r, const param_type &param) \ { \ - name##_distribution(rng, n, r, param.p1(), param.p2()); \ + if (n < 100) { \ + for (std::size_t i = 0; i != n; ++i) \ + r[i] = operator()(rng, param); \ + } else { \ + name##_distribution(rng, n, r, param); \ + } \ } \ \ - private: \ - param_type param_; - -#define VSMC_DEFINE_RNG_DISTRIBUTION_OPERATORS \ - public: \ friend bool operator==( \ const distribution_type &dist1, const distribution_type &dist2) \ { \ @@ -294,7 +335,7 @@ friend bool operator!=( \ const distribution_type &dist1, const distribution_type &dist2) \ { \ - return dist1.param_ != dist2.param_; \ + return !(dist1 == dist2); \ } \ \ template \ @@ -311,9 +352,73 @@ std::basic_istream &is, distribution_type &dist) \ { \ is >> std::ws >> dist.param_; \ - dist.reset(); \ + if (is.good()) \ + dist.reset(); \ \ return is; \ + } \ + \ + private: \ + param_type param_; + +#define VSMC_DEFINE_RNG_DISTRIBUTION_0(Name, name, T, t, Type) \ + VSMC_DEFINE_RNG_DISTRIBUTION_PARAM_TYPE_0(Name, T, t, Type) \ + VSMC_DEFINE_RNG_DISTRIBUTION_CONSTRUCTOR_0(Name, T) \ + VSMC_DEFINE_RNG_DISTRIBUTION_OPERATOR(Name, name) + +#define VSMC_DEFINE_RNG_DISTRIBUTION_1(Name, name, p1, v1) \ + VSMC_DEFINE_RNG_DISTRIBUTION_PARAM_TYPE_1(Name, name, p1, v1) \ + VSMC_DEFINE_RNG_DISTRIBUTION_CONSTRUCTOR_1(Name, p1, v1) \ + VSMC_DEFINE_RNG_DISTRIBUTION_OPERATOR(Name, name) + +#define VSMC_DEFINE_RNG_DISTRIBUTION_2(Name, name, p1, v1, p2, v2) \ + VSMC_DEFINE_RNG_DISTRIBUTION_PARAM_TYPE_2(Name, name, p1, v1, p2, v2) \ + VSMC_DEFINE_RNG_DISTRIBUTION_CONSTRUCTOR_2(Name, p1, v1, p2, v2) \
VSMC_DEFINE_RNG_DISTRIBUTION_OPERATOR(Name, name) + +#define VSMC_DEFINE_RNG_DISTRIBUTION_RAND_0(Name, name, T) \ + template \ + inline void name##_distribution(RNGType &rng, std::size_t n, T *r, \ + const typename Name##Distribution::param_type &) \ + { \ + name##_distribution(rng, n, r); \ + } \ + \ + template \ + inline void rng_rand( \ + RNGType &rng, Name##Distribution &dist, std::size_t n, T *r) \ + { \ + dist(rng, n, r); \ + } + +#define VSMC_DEFINE_RNG_DISTRIBUTION_RAND_1(Name, name, p1) \ + template \ + inline void name##_distribution(RNGType &rng, std::size_t n, RealType *r, \ + const typename Name##Distribution::param_type &param) \ + { \ + name##_distribution(rng, n, r, param.p1()); \ + } \ + \ + template \ + inline void rng_rand(RNGType &rng, Name##Distribution &dist, \ + std::size_t n, RealType *r) \ + { \ + dist(rng, n, r); \ + } + +#define VSMC_DEFINE_RNG_DISTRIBUTION_RAND_2(Name, name, p1, p2) \ + template \ + inline void name##_distribution(RNGType &rng, std::size_t n, RealType *r, \ + const typename Name##Distribution::param_type &param) \ + { \ + name##_distribution(rng, n, r, param.p1(), param.p2()); \ + } \ + \ + template \ + inline void rng_rand(RNGType &rng, Name##Distribution &dist, \ + std::size_t n, RealType *r) \ + { \ + dist(rng, n, r); \ } namespace vsmc @@ -324,112 +429,87 @@ namespace internal VSMC_DEFINE_TYPE_DISPATCH_TRAIT(KeyType, key_type, NullType) -#ifdef VSMC_CLANG -#pragma clang diagnostic push -#pragma clang diagnostic ignored "-Wfloat-equal" -#endif - -template -inline bool is_equal(const T &a, const T &b) +template ::digits> +class RNGMinBitsImpl { - return a == b; -} + static constexpr int M = std::numeric_limits::digits - N; -#ifdef VSMC_CLANG -#pragma clang diagnostic pop -#endif + public: + static constexpr int value = + (U >> M) == 0 ?
M : RNGMinBitsImpl::value; +}; // class RNGMinBitsImpl -template -class RNGBitsNMax +template +class RNGMinBitsImpl { public: - static constexpr std::uint64_t - value = std::numeric_limits::max VSMC_MNE() >> (64 - N); -}; // class RNGBitsNMax + static constexpr int value = std::numeric_limits::digits; +}; // class RNGMinBitsImpl + +} // namespace vsmc::internal + +/// \brief Find the smallest N such that `(RNGType::min() >> N) == 0` +/// \ingroup RNG +template +class RNGMinBits : public std::integral_constant::value> +{ +}; // class RNGMinBits + +namespace internal +{ -template -class RNGBitsN +template ::digits> +class RNGMaxBitsImpl { - static constexpr std::uint64_t bmax = RNGBitsNMax::value; + static constexpr UIntType bmax = std::numeric_limits::max() >> + (std::numeric_limits::digits - N); public: static constexpr int value = - UMax < bmax ? RNGBitsN::value : N; -}; // class RNGMaxBitsN + U < bmax ? RNGMaxBitsImpl::value : N; +}; // class RNGMaxBitsImpl -template -class RNGBitsN +template +class RNGMaxBitsImpl { public: static constexpr int value = 0; -}; // class RNGMaxBitsN +}; // class RNGMaxBitsImpl -template -class RNGMinBits - : public std::integral_constant(RNGType::min VSMC_MNE()), - 64>::value> -{ -}; // class RNGMinBits +} // namespace vsmc::internal +/// \brief Find the largest N such that +/// RNGType::max() >= (M >> (W - N)) where +/// M = std::numeric_limits::max() +/// W = std::numeric_limits::digits +/// \ingroup RNG template -class RNGMaxBits - : public std::integral_constant(RNGType::max VSMC_MNE()), - 64>::value> +class RNGMaxBits : public std::integral_constant::value> { }; // class RNGMaxBits +/// \brief The value of +/// RNGMaxBits::value - RNGMinBits::value +/// \ingroup RNG +/// +/// \details +/// Let R = RNGMinBits::value. 
+/// Let P = RNGMaxBits::value - RNGMinBits::value +/// Then, if `u` is an unsigned random integer generated by `RNGType`, +/// `(u >> R)` is an unsigned random integer covering at least +/// \f$\{0,\dots,2^P - 1\}\f$. template class RNGBits : public std::integral_constant::value - RNGMinBits::value> { }; // class RNGBits -template -class IntBitsN; - -template <> -class IntBitsN : public std::integral_constant -{ -}; // class IntBitsN - -template <> -class IntBitsN : public std::integral_constant -{ -}; // class IntBitsN - -template <> -class IntBitsN : public std::integral_constant -{ -}; // class IntBitsN - -template <> -class IntBitsN : public std::integral_constant -{ -}; // class IntBitsN - -template -class IntBits : public IntBitsN -{ -}; // class IntBits - -template -class is_seed_seq - : public std::integral_constant::value && - !std::is_convertible::value && - !std::is_convertible::value && - !std::is_same::type, - U>::value && - !std::is_same::type, - V>::value && - !std::is_same::type, W>::value> -{ -}; // class is_seed_seq - -} // namespace vsmc::internal - /// \brief Parameter type for open interval /// \ingroup RNG class Open; @@ -450,18 +530,17 @@ void rng_rand(RNGType &rng, std::size_t n, typename RNGType::result_type *r) template class CounterEngine; -template -inline void rng_rand(CounterEngine &, std::size_t, - typename CounterEngine::result_type *); +template +class RandomWalk; -template -class BernoulliIntDistribution; +template +class RandomWalkG; -template -class UniformBitsDistribution; +template +class NormalProposal; -template -class DiscreteDistribution; +template +class NormalMVProposal; template class BetaDistribution; @@ -472,6 +551,9 @@ class CauchyDistribution; template class ChiSquaredDistribution; +template +class DiscreteDistribution; + template class ExponentialDistribution; @@ -499,6 +581,9 @@ class LognormalDistribution; template class NormalDistribution; +template +class NormalMVDistribution; + template class
ParetoDistribution; @@ -508,22 +593,24 @@ class RayleighDistribution; template class StudentTDistribution; -template +template +class U01Distribution; + +template class U01LRDistribution; -template -class UniformRealLRDistribution; +template +class UniformBitsDistribution; template -class WeibullDistribution; +class UniformRealDistribution; -template -inline void rng_rand( - RNGType &, BernoulliIntDistribution &, std::size_t, IntType *); +template +class WeibullDistribution; -template -inline void rng_rand( - RNGType &, UniformBitsDistribution &, std::size_t, UIntType *); +template +inline void rng_rand(CounterEngine &, std::size_t, + typename CounterEngine::result_type *); template inline void rng_rand( @@ -577,6 +664,10 @@ template inline void rng_rand( RNGType &, NormalDistribution &, std::size_t, RealType *); +template +inline void rng_rand( + RNGType &, NormalMVDistribution &, std::size_t, RealType *); + template inline void rng_rand( RNGType &, ParetoDistribution &, std::size_t, RealType *); @@ -589,24 +680,25 @@ template inline void rng_rand( RNGType &, StudentTDistribution &, std::size_t, RealType *); +template +inline void rng_rand( + RNGType &, U01Distribution &, std::size_t, RealType *); + template inline void rng_rand(RNGType &, U01LRDistribution &, std::size_t, RealType *); -template -inline void rng_rand(RNGType &, - UniformRealLRDistribution &, std::size_t, - RealType *); +template +inline void rng_rand( + RNGType &, UniformBitsDistribution &, std::size_t, UIntType *); template inline void rng_rand( - RNGType &, WeibullDistribution &, std::size_t, RealType *); + RNGType &, UniformRealDistribution &, std::size_t, RealType *); -template -inline void bernoulli_distribution(RNGType &, std::size_t, IntType *, double); - -template -inline void uniform_bits_distribution(RNGType &, std::size_t, UIntType *); +template +inline void rng_rand( + RNGType &, WeibullDistribution &, std::size_t, RealType *); template inline void beta_distribution( @@ -656,6 +748,10 
@@ template inline void normal_distribution( RNGType &, std::size_t, RealType *, RealType, RealType); +template +inline void normal_mv_distribution(RNGType &, std::size_t, RealType *, + std::size_t, const RealType *, const RealType *); + template inline void pareto_distribution( RNGType &, std::size_t, RealType *, RealType, RealType); @@ -668,20 +764,19 @@ template inline void student_t_distribution( RNGType &, std::size_t, RealType *, RealType); -template +template inline void u01_distribution(RNGType &, std::size_t, RealType *); -template +template inline void u01_lr_distribution(RNGType &, std::size_t, RealType *); +template +inline void uniform_bits_distribution(RNGType &, std::size_t, UIntType *); + template inline void uniform_real_distribution( RNGType &, std::size_t, RealType *, RealType, RealType); -template -inline void uniform_real_lr_distribution( - RNGType &, std::size_t, RealType *, RealType, RealType); - template inline void weibull_distribution( RNGType &, std::size_t, RealType *, RealType, RealType); @@ -696,38 +791,44 @@ inline void rng_rand(MKLEngine &, std::size_t, typename MKLEngine::result_type *); template -inline void bernoulli_distribution( - MKLEngine &, std::size_t, MKL_INT *, double); +inline void beta_distribution( + MKLEngine &, std::size_t, float *, float, float); template -inline void uniform_real_distribution( +inline void beta_distribution( + MKLEngine &, std::size_t, double *, double, double); + +template +inline void cauchy_distribution( MKLEngine &, std::size_t, float *, float, float); template -inline void uniform_real_distribution( +inline void cauchy_distribution( MKLEngine &, std::size_t, double *, double, double); template -inline void u01_distribution(MKLEngine &, std::size_t, float *); +inline void exponential_distribution( + MKLEngine &, std::size_t, float *, float); template -inline void u01_distribution(MKLEngine &, std::size_t, double *); +inline void exponential_distribution( + MKLEngine &, std::size_t, double *, 
double); template -inline void normal_distribution( - MKLEngine &, std::size_t, float *r, float, float); +inline void extreme_value_distribution( + MKLEngine &, std::size_t, float *, float, float); template -inline void normal_distribution( - MKLEngine &, std::size_t, double *r, double, double); +inline void extreme_value_distribution( + MKLEngine &, std::size_t, double *, double, double); template -inline void exponential_distribution( - MKLEngine &, std::size_t, float *, float); +inline void gamma_distribution( + MKLEngine &, std::size_t, float *, float, float); template -inline void exponential_distribution( - MKLEngine &, std::size_t, double *, double); +inline void gamma_distribution( + MKLEngine &, std::size_t, double *, double, double); template inline void laplace_distribution( @@ -738,59 +839,57 @@ inline void laplace_distribution( MKLEngine &, std::size_t, double *, double, double); template -inline void weibull_distribution( +inline void lognormal_distribution( MKLEngine &, std::size_t, float *, float, float); template -inline void weibull_distribution( +inline void lognormal_distribution( MKLEngine &, std::size_t, double *, double, double); template -inline void cauchy_distribution( - MKLEngine &, std::size_t, float *, float, float); +inline void normal_distribution( + MKLEngine &, std::size_t, float *r, float, float); template -inline void cauchy_distribution( - MKLEngine &, std::size_t, double *, double, double); +inline void normal_distribution( + MKLEngine &, std::size_t, double *r, double, double); template -inline void rayleigh_distribution( - MKLEngine &, std::size_t, float *, float); +inline void normal_mv_distribution(MKLEngine &, std::size_t, + float *, std::size_t, const float *, const float *); template -inline void rayleigh_distribution( - MKLEngine &, std::size_t, double *, double); +inline void normal_mv_distribution(MKLEngine &, std::size_t, + double *, std::size_t, const double *, const double *); template -inline void 
lognormal_distribution( - MKLEngine &, std::size_t, float *, float, float); +inline void rayleigh_distribution( + MKLEngine &, std::size_t, float *, float); template -inline void lognormal_distribution( - MKLEngine &, std::size_t, double *, double, double); +inline void rayleigh_distribution( + MKLEngine &, std::size_t, double *, double); template -inline void extreme_value_distribution( - MKLEngine &, std::size_t, float *, float, float); +inline void u01_distribution(MKLEngine &, std::size_t, float *); template -inline void extreme_value_distribution( - MKLEngine &, std::size_t, double *, double, double); +inline void u01_distribution(MKLEngine &, std::size_t, double *); template -inline void gamma_distribution( +inline void uniform_real_distribution( MKLEngine &, std::size_t, float *, float, float); template -inline void gamma_distribution( +inline void uniform_real_distribution( MKLEngine &, std::size_t, double *, double, double); template -inline void beta_distribution( +inline void weibull_distribution( MKLEngine &, std::size_t, float *, float, float); template -inline void beta_distribution( +inline void weibull_distribution( MKLEngine &, std::size_t, double *, double, double); #endif // VSMC_HAS_MKL diff --git a/include/vsmc/rng/internal/mkl_brng_defines.hpp b/include/vsmc/rng/internal/mkl_brng_defines.hpp index 267c4cbb2..5bbc10d47 100644 --- a/include/vsmc/rng/internal/mkl_brng_defines.hpp +++ b/include/vsmc/rng/internal/mkl_brng_defines.hpp @@ -3,7 +3,7 @@ //---------------------------------------------------------------------------- // vSMC: Scalable Monte Carlo //---------------------------------------------------------------------------- -// Copyright (c) 2013-2015, Yan Zhou +// Copyright (c) 2013-2016, Yan Zhou // All rights reserved. 
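Per the doc comments added in internal/common.hpp above, `RNGMinBits` is the smallest N with `(RNGType::min() >> N) == 0`, `RNGMaxBits` is the largest N whose N-bit all-ones value is still covered by `RNGType::max()`, and `RNGBits` is their difference — the usable bits of a draw after shifting right by `RNGMinBits`. A hypothetical constexpr re-implementation of the same bit counts (for illustration only; not the library's trait classes):

```cpp
#include <cassert>
#include <cstdint>

// Smallest n such that (u >> n) == 0, i.e. the bit width of u.
constexpr int min_bits(std::uint64_t u, int n = 0)
{
    return u == 0 ? n : min_bits(u >> 1, n + 1);
}

// Largest n such that u >= 2^n - 1 (the n-bit all-ones value), capped at 64.
constexpr int max_bits(std::uint64_t u, int n = 0)
{
    return n == 64 ?
        64 :
        (u >= (~std::uint64_t(0) >> (64 - (n + 1))) ? max_bits(u, n + 1) : n);
}
```

For an engine with `min() == 0` and `max() == 2^32 - 1` (e.g. `std::mt19937`), these give 0 and 32 respectively, so all 32 bits of a draw are usable.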
// // Redistribution and use in source and binary forms, with or without diff --git a/include/vsmc/rng/laplace_distribution.hpp b/include/vsmc/rng/laplace_distribution.hpp index 02f83570c..e45c681c6 100644 --- a/include/vsmc/rng/laplace_distribution.hpp +++ b/include/vsmc/rng/laplace_distribution.hpp @@ -3,7 +3,7 @@ //---------------------------------------------------------------------------- // vSMC: Scalable Monte Carlo //---------------------------------------------------------------------------- -// Copyright (c) 2013-2015, Yan Zhou +// Copyright (c) 2013-2016, Yan Zhou // All rights reserved. // // Redistribution and use in source and binary forms, with or without @@ -58,20 +58,15 @@ inline bool laplace_distribution_check_param(RealType, RealType b) template class LaplaceDistribution { - VSMC_DEFINE_RNG_DISTRIBUTION_2( - Laplace, laplace, RealType, result_type, a, 0, result_type, b, 1) - VSMC_DEFINE_RNG_DISTRIBUTION_OPERATORS + VSMC_DEFINE_RNG_DISTRIBUTION_2(Laplace, laplace, a, 0, b, 1) public: - result_type min VSMC_MNE() const + result_type min() const { - return -std::numeric_limits::max VSMC_MNE(); + return std::numeric_limits::lowest(); } - result_type max VSMC_MNE() const - { - return std::numeric_limits::max VSMC_MNE(); - } + result_type max() const { return std::numeric_limits::max(); } void reset() {} @@ -79,8 +74,8 @@ class LaplaceDistribution template result_type generate(RNGType &rng, const param_type ¶m) { - U01OCDistribution runif; - result_type u = runif(rng) - static_cast(0.5); + U01Distribution u01; + result_type u = u01(rng) - static_cast(0.5); return u > 0 ? 
param.a() - param.b() * std::log(1 - 2 * u) : param.a() + param.b() * std::log(1 + 2 * u); @@ -95,7 +90,7 @@ inline void laplace_distribution_impl( RNGType &rng, std::size_t n, RealType *r, RealType a, RealType b) { RealType s[K]; - u01_oc_distribution(rng, n, r); + u01_distribution(rng, n, r); sub(n, r, static_cast(0.5), r); for (std::size_t i = 0; i != n; ++i) { if (r[i] > 0) { @@ -118,20 +113,19 @@ template inline void laplace_distribution( RNGType &rng, std::size_t n, RealType *r, RealType a, RealType b) { - const std::size_t k = 1000; + static_assert(std::is_floating_point::value, + "**laplace_distribution** USED WITH RealType OTHER THAN FLOATING " + "POINT TYPES"); + + const std::size_t k = 1024; const std::size_t m = n / k; const std::size_t l = n % k; - for (std::size_t i = 0; i != m; ++i) - internal::laplace_distribution_impl(rng, k, r + i * k, a, b); - internal::laplace_distribution_impl(rng, l, r + m * k, a, b); + for (std::size_t i = 0; i != m; ++i, r += k) + internal::laplace_distribution_impl(rng, k, r, a, b); + internal::laplace_distribution_impl(rng, l, r, a, b); } -template -inline void rng_rand(RNGType &rng, LaplaceDistribution &dist, - std::size_t n, RealType *r) -{ - dist(rng, n, r); -} +VSMC_DEFINE_RNG_DISTRIBUTION_RAND_2(Laplace, laplace, a, b) } // namespace vsmc diff --git a/include/vsmc/rng/levy_distribution.hpp b/include/vsmc/rng/levy_distribution.hpp index 86904406e..dd9cdddc6 100644 --- a/include/vsmc/rng/levy_distribution.hpp +++ b/include/vsmc/rng/levy_distribution.hpp @@ -3,7 +3,7 @@ //---------------------------------------------------------------------------- // vSMC: Scalable Monte Carlo //---------------------------------------------------------------------------- -// Copyright (c) 2013-2015, Yan Zhou +// Copyright (c) 2013-2016, Yan Zhou // All rights reserved. 
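The rewritten `LaplaceDistribution::generate` above inverts the Laplace CDF: a U(0,1) draw is shifted to (-1/2, 1/2) and mapped through one of two log branches around the location parameter. A minimal standalone sketch of the same scheme, using `std::mt19937_64` rather than the vSMC engines (`laplace_draw` is a hypothetical name, not part of the library):

```cpp
#include <cmath>
#include <random>

// Inverse-CDF Laplace(a, b) sampler: u is uniform on [-0.5, 0.5);
// each log branch inverts the CDF on one side of the location a.
inline double laplace_draw(std::mt19937_64 &rng, double a, double b)
{
    std::uniform_real_distribution<double> u01(0, 1);
    double u = u01(rng) - 0.5;
    return u > 0 ? a - b * std::log(1 - 2 * u)   // right tail
                 : a + b * std::log(1 + 2 * u);  // left tail
}
```

The branchless vectorized path in the diff does the same thing over a block at a time, with the sign handled per element.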
// // Redistribution and use in source and binary forms, with or without @@ -54,17 +54,12 @@ inline bool levy_distribution_check_param(RealType, RealType b) template class LevyDistribution { - VSMC_DEFINE_RNG_DISTRIBUTION_2( - Levy, levy, RealType, result_type, a, 0, result_type, b, 1) - VSMC_DEFINE_RNG_DISTRIBUTION_OPERATORS + VSMC_DEFINE_RNG_DISTRIBUTION_2(Levy, levy, a, 0, b, 1) public: - result_type min VSMC_MNE() const { return a(); } + result_type min() const { return a(); } - result_type max VSMC_MNE() const - { - return std::numeric_limits::max VSMC_MNE(); - } + result_type max() const { return std::numeric_limits::max(); } void reset() { normal_ = NormalDistribution(0, 1); } @@ -102,20 +97,19 @@ template inline void levy_distribution( RNGType &rng, std::size_t n, RealType *r, RealType a, RealType b) { - const std::size_t k = 1000; + static_assert(std::is_floating_point::value, + "**levy_distribution** USED WITH RealType OTHER THAN FLOATING POINT " + "TYPES"); + + const std::size_t k = 1024; const std::size_t m = n / k; const std::size_t l = n % k; - for (std::size_t i = 0; i != m; ++i) - internal::levy_distribution_impl(rng, k, r + i * k, a, b); - internal::levy_distribution_impl(rng, l, r + m * k, a, b); + for (std::size_t i = 0; i != m; ++i, r += k) + internal::levy_distribution_impl(rng, k, r, a, b); + internal::levy_distribution_impl(rng, l, r, a, b); } -template -inline void rng_rand( - RNGType &rng, LevyDistribution &dist, std::size_t n, RealType *r) -{ - dist(rng, n, r); -} +VSMC_DEFINE_RNG_DISTRIBUTION_RAND_2(Levy, levy, a, b) } // namespace vsmc diff --git a/include/vsmc/rng/logistic_distribution.hpp b/include/vsmc/rng/logistic_distribution.hpp index c1901e4b6..d221c76a4 100644 --- a/include/vsmc/rng/logistic_distribution.hpp +++ b/include/vsmc/rng/logistic_distribution.hpp @@ -3,7 +3,7 @@ //---------------------------------------------------------------------------- // vSMC: Scalable Monte Carlo 
//---------------------------------------------------------------------------- -// Copyright (c) 2013-2015, Yan Zhou +// Copyright (c) 2013-2016, Yan Zhou // All rights reserved. // // Redistribution and use in source and binary forms, with or without @@ -54,20 +54,15 @@ inline bool logistic_distribution_check_param(RealType, RealType b) template class LogisticDistribution { - VSMC_DEFINE_RNG_DISTRIBUTION_2( - Logistic, logistic, RealType, result_type, a, 0, result_type, b, 1) - VSMC_DEFINE_RNG_DISTRIBUTION_OPERATORS + VSMC_DEFINE_RNG_DISTRIBUTION_2(Logistic, logistic, a, 0, b, 1) public: - result_type min VSMC_MNE() const + result_type min() const { - return -std::numeric_limits::max VSMC_MNE(); + return std::numeric_limits::lowest(); } - result_type max VSMC_MNE() const - { - return std::numeric_limits::max VSMC_MNE(); - } + result_type max() const { return std::numeric_limits::max(); } void reset() {} @@ -75,8 +70,8 @@ class LogisticDistribution template result_type generate(RNGType &rng, const param_type ¶m) { - U01OODistribution runif; - result_type u = runif(rng); + U01Distribution u01; + result_type u = u01(rng); return param.a() + param.b() * std::log(u / (1 - u)); } @@ -90,7 +85,7 @@ inline void logistic_distribution_impl( RNGType &rng, std::size_t n, RealType *r, RealType a, RealType b) { RealType s[K]; - u01_oo_distribution(rng, n, r); + u01_distribution(rng, n, r); sub(n, static_cast(1), r, s); div(n, r, s, r); log(n, r, r); @@ -105,20 +100,19 @@ template inline void logistic_distribution( RNGType &rng, std::size_t n, RealType *r, RealType a, RealType b) { - const std::size_t k = 1000; + static_assert(std::is_floating_point::value, + "**logistic_distribution** USED WITH RealType OTHER THAN FLOATING " + "POINT TYPES"); + + const std::size_t k = 1024; const std::size_t m = n / k; const std::size_t l = n % k; - for (std::size_t i = 0; i != m; ++i) - internal::logistic_distribution_impl(rng, k, r + i * k, a, b); - internal::logistic_distribution_impl(rng, 
l, r + m * k, a, b); + for (std::size_t i = 0; i != m; ++i, r += k) + internal::logistic_distribution_impl(rng, k, r, a, b); + internal::logistic_distribution_impl(rng, l, r, a, b); } -template -inline void rng_rand(RNGType &rng, LogisticDistribution &dist, - std::size_t n, RealType *r) -{ - dist(rng, n, r); -} +VSMC_DEFINE_RNG_DISTRIBUTION_RAND_2(Logistic, logistic, a, b) } // namespace vsmc diff --git a/include/vsmc/rng/lognormal_distribution.hpp b/include/vsmc/rng/lognormal_distribution.hpp index 38e4febd7..1cf8694b1 100644 --- a/include/vsmc/rng/lognormal_distribution.hpp +++ b/include/vsmc/rng/lognormal_distribution.hpp @@ -3,7 +3,7 @@ //---------------------------------------------------------------------------- // vSMC: Scalable Monte Carlo //---------------------------------------------------------------------------- -// Copyright (c) 2013-2015, Yan Zhou +// Copyright (c) 2013-2016, Yan Zhou // All rights reserved. // // Redistribution and use in source and binary forms, with or without @@ -54,17 +54,12 @@ inline bool lognormal_distribution_check_param(RealType, RealType s) template class LognormalDistribution { - VSMC_DEFINE_RNG_DISTRIBUTION_2( - Lognormal, lognormal, RealType, result_type, m, 0, result_type, s, 1) - VSMC_DEFINE_RNG_DISTRIBUTION_OPERATORS + VSMC_DEFINE_RNG_DISTRIBUTION_2(Lognormal, lognormal, m, 0, s, 1) public: - result_type min VSMC_MNE() const { return 0; } + result_type min() const { return 0; } - result_type max VSMC_MNE() const - { - return std::numeric_limits::max VSMC_MNE(); - } + result_type max() const { return std::numeric_limits::max(); } void reset() { normal_ = NormalDistribution(0, 1); } @@ -97,7 +92,11 @@ template inline void lognormal_distribution(RNGType &rng, std::size_t n, RealType *r, RealType logmean, RealType logstddev) { - const std::size_t k = 1000; + static_assert(std::is_floating_point::value, + "**lognormal_distribution** USED WITH RealType OTHER THAN FLOATING " + "POINT TYPES"); + + const std::size_t k = 1024; 
const std::size_t m = n / k; const std::size_t l = n % k; for (std::size_t i = 0; i != m; ++i, r += k) @@ -105,12 +104,7 @@ inline void lognormal_distribution(RNGType &rng, std::size_t n, RealType *r, internal::lognormal_distribution_impl(rng, l, r, logmean, logstddev); } -template -inline void rng_rand(RNGType &rng, LognormalDistribution &dist, - std::size_t n, RealType *r) -{ - dist(rng, n, r); -} +VSMC_DEFINE_RNG_DISTRIBUTION_RAND_2(Lognormal, lognormal, m, s) } // namespace vsmc diff --git a/include/vsmc/rng/mkl.hpp b/include/vsmc/rng/mkl.hpp index 24280897d..419371f79 100644 --- a/include/vsmc/rng/mkl.hpp +++ b/include/vsmc/rng/mkl.hpp @@ -3,7 +3,7 @@ //---------------------------------------------------------------------------- // vSMC: Scalable Monte Carlo //---------------------------------------------------------------------------- -// Copyright (c) 2013-2015, Yan Zhou +// Copyright (c) 2013-2016, Yan Zhou // All rights reserved. // // Redistribution and use in source and binary forms, with or without @@ -34,6 +34,11 @@ #include +#define VSMC_RUNTIME_ASSERT_RNG_MKL_OFFSET(offset) \ + VSMC_RUNTIME_ASSERT((offset < max()), \ + "**MKLOffsetDynamic** " \ + "EXCESS MAXIMUM NUMBER OF INDEPDENT RNG STREAMS") + namespace vsmc { @@ -43,8 +48,8 @@ namespace internal class MKLOffsetZero { public: - static constexpr MKL_INT min VSMC_MNE() { return 0; } - static constexpr MKL_INT max VSMC_MNE() { return 0; } + static constexpr MKL_INT min() { return 0; } + static constexpr MKL_INT max() { return 0; } static void set(MKL_INT) {} static constexpr MKL_INT get() { return 0; } }; // class OffsetZero @@ -55,12 +60,12 @@ class MKLOffsetDynamic public: MKLOffsetDynamic() : offset_(0) {} - static constexpr MKL_INT min VSMC_MNE() { return 0; } - static constexpr MKL_INT max VSMC_MNE() { return MaxOffset; } + static constexpr MKL_INT min() { return 0; } + static constexpr MKL_INT max() { return MaxOffset; } void set(MKL_INT n) { - VSMC_RUNTIME_ASSERT_UTILITY_MKL_VSL_OFFSET(n); + 
VSMC_RUNTIME_ASSERT_RNG_MKL_OFFSET(n); offset_ = n % MaxOffset; } @@ -81,8 +86,7 @@ template <> class MKLOffset { public: - using type = - MKLOffsetDynamic::max VSMC_MNE()>; + using type = MKLOffsetDynamic::max()>; }; // class MKLOffset template <> @@ -160,7 +164,7 @@ class MKLDiscardGeneral if (nskip == 0) return; - std::array, 1000> buffer; + std::array, 1024> buffer; const MKL_INT k = static_cast(buffer.size()); while (nskip > k) { MKLUniformBits::eval(stream, k, buffer.data()); @@ -209,18 +213,24 @@ class MKLEngine public: using result_type = internal::MKLResultType; - explicit MKLEngine(MKL_UINT s = 1) : index_(M_) { seed(s); } + explicit MKLEngine(MKL_UINT s = 1) : index_(M_), stream_(BRNG, 0) + { + seed(s); + } template explicit MKLEngine(SeedSeq &seq, typename std::enable_if>::value>::type * = nullptr) - : index_(M_) + : index_(M_), stream_(BRNG, 0) { seed(seq); } - MKLEngine(MKL_UINT s, MKL_INT offset) { seed(s, offset); } + MKLEngine(MKL_UINT s, MKL_INT offset) : index_(M_), stream_(BRNG, 0) + { + seed(s, offset); + } void seed(MKL_UINT s) { seed(s, 0); } @@ -255,8 +265,31 @@ class MKLEngine void operator()(std::size_t n, result_type *r) { + std::size_t remain = M_ - index_; + + if (n < remain) { + std::memcpy(r, buffer_.data() + index_, sizeof(result_type) * n); + index_ += n; + return; + } + + std::memcpy(r, buffer_.data() + index_, sizeof(result_type) * remain); + r += remain; + n -= remain; + index_ = M_; + + const std::size_t m = n / M_; + const std::size_t l = n % M_; + for (std::size_t i = 0; i != m; ++i) { + internal::MKLUniformBits::eval( + stream_, static_cast(M_), r); + r += M_; + n -= M_; + } internal::MKLUniformBits::eval( - stream_, static_cast(n), r); + stream_, static_cast(M_), buffer_.data()); + std::memcpy(r, buffer_.data(), sizeof(result_type) * l); + index_ = l; } void discard(long long nskip) @@ -265,14 +298,14 @@ class MKLEngine index_ = M_; } - static constexpr result_type min VSMC_MNE() + static constexpr result_type min() { - 
return std::numeric_limits::min VSMC_MNE(); + return std::numeric_limits::min(); } - static constexpr result_type max VSMC_MNE() + static constexpr result_type max() { - return std::numeric_limits::max VSMC_MNE(); + return std::numeric_limits::max(); } MKLStream &stream() { return stream_; } @@ -369,11 +402,11 @@ class MKLEngine } private: - static constexpr std::size_t M_ = 1000; + static constexpr std::size_t M_ = 1024; - MKLStream stream_; - std::array buffer_; + alignas(32) std::array buffer_; std::size_t index_; + MKLStream stream_; }; // class MKLEngine /// \brief A 59-bits multiplicative congruential generator @@ -448,66 +481,75 @@ inline void rng_rand(MKLEngine &rng, std::size_t n, } template -inline void bernoulli_distribution( - MKLEngine &rng, std::size_t n, int *r, double p) +inline void beta_distribution(MKLEngine &rng, std::size_t n, + float *r, float alpha, float beta) { - rng.stream().bernoulli(static_cast(n), r, p); + rng.stream().beta(static_cast(n), r, alpha, beta, 0, 1); } template -inline void uniform_real_distribution( +inline void beta_distribution(MKLEngine &rng, std::size_t n, + double *r, double alpha, double beta) +{ + rng.stream().beta(static_cast(n), r, alpha, beta, 0, 1); +} + +template +inline void cauchy_distribution( MKLEngine &rng, std::size_t n, float *r, float a, float b) { - rng.stream().uniform(static_cast(n), r, a, b); + rng.stream().cauchy(static_cast(n), r, a, b); } template -inline void uniform_real_distribution( +inline void cauchy_distribution( MKLEngine &rng, std::size_t n, double *r, double a, double b) { - rng.stream().uniform(static_cast(n), r, a, b); + rng.stream().cauchy(static_cast(n), r, a, b); } template -inline void u01_distribution( - MKLEngine &rng, std::size_t n, float *r) +inline void exponential_distribution( + MKLEngine &rng, std::size_t n, float *r, float lambda) { - rng.stream().uniform(static_cast(n), r, 0, 1); + rng.stream().exponential(static_cast(n), r, 0, 1 / lambda); } template -inline void 
u01_distribution( - MKLEngine &rng, std::size_t n, double *r) +inline void exponential_distribution( + MKLEngine &rng, std::size_t n, double *r, double lambda) { - rng.stream().uniform(static_cast(n), r, 0, 1); + rng.stream().exponential(static_cast(n), r, 0, 1 / lambda); } template -inline void normal_distribution(MKLEngine &rng, std::size_t n, - float *r, float mean, float stddev) +inline void extreme_value_distribution( + MKLEngine &rng, std::size_t n, float *r, float a, float b) { - rng.stream().gaussian(static_cast(n), r, mean, stddev); + rng.stream().gumbel(static_cast(n), r, a, b); + sub(n, 2 * a, r, r); } template -inline void normal_distribution(MKLEngine &rng, std::size_t n, - double *r, double mean, double stddev) +inline void extreme_value_distribution( + MKLEngine &rng, std::size_t n, double *r, double a, double b) { - rng.stream().gaussian(static_cast(n), r, mean, stddev); + rng.stream().gumbel(static_cast(n), r, a, b); + sub(n, 2 * a, r, r); } template -inline void exponential_distribution( - MKLEngine &rng, std::size_t n, float *r, float lambda) +inline void gamma_distribution(MKLEngine &rng, std::size_t n, + float *r, float alpha, float beta) { - rng.stream().exponential(static_cast(n), r, 0, 1 / lambda); + rng.stream().gamma(static_cast(n), r, alpha, 0, beta); } template -inline void exponential_distribution( - MKLEngine &rng, std::size_t n, double *r, double lambda) +inline void gamma_distribution(MKLEngine &rng, std::size_t n, + double *r, double alpha, double beta) { - rng.stream().exponential(static_cast(n), r, 0, 1 / lambda); + rng.stream().gamma(static_cast(n), r, alpha, 0, beta); } template @@ -525,31 +567,47 @@ inline void laplace_distribution(MKLEngine &rng, std::size_t n, } template -inline void weibull_distribution( - MKLEngine &rng, std::size_t n, float *r, float a, float b) +inline void lognormal_distribution( + MKLEngine &rng, std::size_t n, float *r, float m, float s) { - rng.stream().weibull(static_cast(n), r, a, 0, b); + 
rng.stream().lognormal(static_cast(n), r, m, s, 0, 1); } template -inline void weibull_distribution( - MKLEngine &rng, std::size_t n, double *r, double a, double b) +inline void lognormal_distribution( + MKLEngine &rng, std::size_t n, double *r, double m, double s) { - rng.stream().weibull(static_cast(n), r, a, 0, b); + rng.stream().lognormal(static_cast(n), r, m, s, 0, 1); } template -inline void cauchy_distribution( - MKLEngine &rng, std::size_t n, float *r, float a, float b) +inline void normal_distribution(MKLEngine &rng, std::size_t n, + float *r, float mean, float stddev) { - rng.stream().cauchy(static_cast(n), r, a, b); + rng.stream().gaussian(static_cast(n), r, mean, stddev); } template -inline void cauchy_distribution( - MKLEngine &rng, std::size_t n, double *r, double a, double b) +inline void normal_distribution(MKLEngine &rng, std::size_t n, + double *r, double mean, double stddev) { - rng.stream().cauchy(static_cast(n), r, a, b); + rng.stream().gaussian(static_cast(n), r, mean, stddev); +} + +template +inline void normal_mv_distribution(MKLEngine &rng, std::size_t n, + float *r, std::size_t m, const float *mean, const float *chol) +{ + rng.stream().gaussian_mv(static_cast(n), r, + static_cast(m), VSL_MATRIX_STORAGE_PACKED, mean, chol); +} + +template +inline void normal_mv_distribution(MKLEngine &rng, std::size_t n, + double *r, std::size_t m, const double *mean, const double *chol) +{ + rng.stream().gaussian_mv(static_cast(n), r, + static_cast(m), VSL_MATRIX_STORAGE_PACKED, mean, chol); } template @@ -569,61 +627,45 @@ inline void rayleigh_distribution( } template -inline void lognormal_distribution( - MKLEngine &rng, std::size_t n, float *r, float m, float s) +inline void u01_distribution( + MKLEngine &rng, std::size_t n, float *r) { - rng.stream().lognormal(static_cast(n), r, m, s, 0, 1); + rng.stream().uniform(static_cast(n), r, 0, 1); } template -inline void lognormal_distribution( - MKLEngine &rng, std::size_t n, double *r, double m, double s) 
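The `extreme_value_distribution` wrappers above call `rng.stream().gumbel` and then `sub(n, 2 * a, r, r)`. That reflection step suggests MKL's Gumbel generator follows the minimum-type convention while C++'s `extreme_value_distribution` is maximum-type, so each draw is reflected about the location `a` (x becomes 2a - x). A sketch of that reflection under those assumptions, with an inverse-CDF Gumbel standing in for MKL (`min_gumbel` and `extreme_value` are hypothetical names):

```cpp
#include <cmath>
#include <random>

// Minimum-type Gumbel via inverse CDF: F(x) = 1 - exp(-exp((x - a) / b)),
// so x = a + b * log(-log(1 - u)) for u uniform on (0, 1).
inline double min_gumbel(std::mt19937_64 &rng, double a, double b)
{
    std::uniform_real_distribution<double> u01(0, 1);
    return a + b * std::log(-std::log(1 - u01(rng)));
}

// Reflect about the location a, mirroring the sub(n, 2 * a, r, r) step:
// a min-type draw becomes a max-type (std-style) extreme value draw.
inline double extreme_value(std::mt19937_64 &rng, double a, double b)
{
    return 2 * a - min_gumbel(rng, a, b);
}
```

For a = 0, b = 1 the resulting maximum-type distribution has mean equal to the Euler-Mascheroni constant, roughly 0.577.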
+inline void u01_distribution( + MKLEngine &rng, std::size_t n, double *r) { - rng.stream().lognormal(static_cast(n), r, m, s, 0, 1); + rng.stream().uniform(static_cast(n), r, 0, 1); } template -inline void extreme_value_distribution( +inline void uniform_real_distribution( MKLEngine &rng, std::size_t n, float *r, float a, float b) { - rng.stream().gumbel(static_cast(n), r, a, b); - sub(n, 2 * a, r, r); + rng.stream().uniform(static_cast(n), r, a, b); } template -inline void extreme_value_distribution( +inline void uniform_real_distribution( MKLEngine &rng, std::size_t n, double *r, double a, double b) { - rng.stream().gumbel(static_cast(n), r, a, b); - sub(n, 2 * a, r, r); -} - -template -inline void gamma_distribution(MKLEngine &rng, std::size_t n, - float *r, float alpha, float beta) -{ - rng.stream().gamma(static_cast(n), r, alpha, 0, beta); -} - -template -inline void gamma_distribution(MKLEngine &rng, std::size_t n, - double *r, double alpha, double beta) -{ - rng.stream().gamma(static_cast(n), r, alpha, 0, beta); + rng.stream().uniform(static_cast(n), r, a, b); } template -inline void beta_distribution(MKLEngine &rng, std::size_t n, - float *r, float alpha, float beta) +inline void weibull_distribution( + MKLEngine &rng, std::size_t n, float *r, float a, float b) { - rng.stream().beta(static_cast(n), r, alpha, beta, 0, 1); + rng.stream().weibull(static_cast(n), r, a, 0, b); } template -inline void beta_distribution(MKLEngine &rng, std::size_t n, - double *r, double alpha, double beta) +inline void weibull_distribution( + MKLEngine &rng, std::size_t n, double *r, double a, double b) { - rng.stream().beta(static_cast(n), r, alpha, beta, 0, 1); + rng.stream().weibull(static_cast(n), r, a, 0, b); } } // namespace vsmc diff --git a/include/vsmc/rng/mkl_brng.hpp b/include/vsmc/rng/mkl_brng.hpp index 52a8eb14f..efc40f8c4 100644 --- a/include/vsmc/rng/mkl_brng.hpp +++ b/include/vsmc/rng/mkl_brng.hpp @@ -3,7 +3,7 @@ 
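The new buffered `MKLEngine::operator()(std::size_t, result_type *)` earlier in this patch serves a bulk request in three phases: drain whatever is cached in the `M_ = 1024` element buffer, generate full blocks directly into the caller's array, then refill the buffer once for the remainder. A self-contained sketch of that buffering scheme, with a toy LCG standing in for the MKL stream (all names here are hypothetical, not the vSMC API):

```cpp
#include <array>
#include <cstddef>
#include <cstdint>
#include <cstring>

struct BufferedEngine {
    static constexpr std::size_t M = 1024;  // buffer capacity, as in the diff
    std::array<std::uint64_t, M> buffer;
    std::size_t index = M;                  // index == M means "buffer empty"
    std::uint64_t state = 1;

    // Stand-in for MKLUniformBits::eval: fill r with k raw draws.
    void fill(std::uint64_t *r, std::size_t k)
    {
        for (std::size_t i = 0; i != k; ++i)
            r[i] = state = state * 6364136223846793005ULL + 1442695040888963407ULL;
    }

    void operator()(std::size_t n, std::uint64_t *r)
    {
        std::size_t remain = M - index;
        if (n < remain) {  // request served entirely from the cache
            std::memcpy(r, buffer.data() + index, sizeof(std::uint64_t) * n);
            index += n;
            return;
        }
        std::memcpy(r, buffer.data() + index, sizeof(std::uint64_t) * remain);
        r += remain;
        n -= remain;
        const std::size_t m = n / M, l = n % M;
        for (std::size_t i = 0; i != m; ++i, r += M)
            fill(r, M);                 // full blocks: no intermediate copy
        fill(buffer.data(), M);         // refill the cache for the tail
        std::memcpy(r, buffer.data(), sizeof(std::uint64_t) * l);
        index = l;
    }
};
```

A useful property of this layout is that chunking does not perturb the stream: drawing 10 values and then 2000 yields the same sequence as drawing 2010 at once.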
//---------------------------------------------------------------------------- // vSMC: Scalable Monte Carlo //---------------------------------------------------------------------------- -// Copyright (c) 2013-2015, Yan Zhou +// Copyright (c) 2013-2016, Yan Zhou // All rights reserved. // // Redistribution and use in source and binary forms, with or without @@ -32,9 +32,9 @@ #ifndef VSMC_RNG_MKL_BRNG_HPP #define VSMC_RNG_MKL_BRNG_HPP -#include #include #include +#include #define VSMC_DEFINE_RNG_MKL_BRNG(RNGType, name) \ template <> \ diff --git a/include/vsmc/rng/normal_distribution.hpp b/include/vsmc/rng/normal_distribution.hpp index af3c8d112..67aace9a0 100644 --- a/include/vsmc/rng/normal_distribution.hpp +++ b/include/vsmc/rng/normal_distribution.hpp @@ -3,7 +3,7 @@ //---------------------------------------------------------------------------- // vSMC: Scalable Monte Carlo //---------------------------------------------------------------------------- -// Copyright (c) 2013-2015, Yan Zhou +// Copyright (c) 2013-2016, Yan Zhou // All rights reserved. 
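The `NormalDistribution` changes in this file keep the Box-Muller generator: each pass consumes two uniforms, produces a `(cos, sin)` pair, and caches the second variate (the `v_`/`saved_` members) for the next call. A standalone sketch of that pair-caching scheme (`CachedNormal` is a hypothetical name, with `std::mt19937_64` in place of a vSMC engine):

```cpp
#include <cmath>
#include <random>

// Box-Muller N(0, 1) generator that caches the second variate of each
// generated pair, mirroring the v_/saved_ members in the diff.
class CachedNormal {
  public:
    double operator()(std::mt19937_64 &rng)
    {
        if (saved_) {           // second of the pair is already available
            saved_ = false;
            return v_;
        }
        std::uniform_real_distribution<double> u01(0, 1);
        // log(1 - u) keeps the argument in (0, 1] for u in [0, 1)
        double u1 = std::sqrt(-2 * std::log(1 - u01(rng)));  // radius
        double u2 = 2 * 3.14159265358979323846 * u01(rng);   // angle
        v_ = u1 * std::sin(u2);  // cache one variate of the pair
        saved_ = true;
        return u1 * std::cos(u2);
    }

  private:
    double v_ = 0;
    bool saved_ = false;
};
```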
// // Redistribution and use in source and binary forms, with or without @@ -55,26 +55,86 @@ inline bool normal_distribution_check_param(RealType, RealType stddev) template <typename RealType> class NormalDistribution { - VSMC_DEFINE_RNG_DISTRIBUTION_2( - Normal, normal, RealType, result_type, mean, 0, result_type, stddev, 1) + VSMC_DEFINE_RNG_DISTRIBUTION_PARAM_TYPE_2( + Normal, normal, mean, 0, stddev, 1) public: - result_type min VSMC_MNE() const + using result_type = RealType; + using distribution_type = NormalDistribution<RealType>; + + explicit NormalDistribution(result_type mean = 0, result_type stddev = 1) + : param_(mean, stddev), v_(0), saved_(false) { - return -std::numeric_limits<result_type>::max VSMC_MNE(); + reset(); } - result_type max VSMC_MNE() const + explicit NormalDistribution(const param_type &param) + : param_(param), v_(0), saved_(false) + { + reset(); + } + + result_type mean() const { return param_.mean(); } + + result_type stddev() const { return param_.stddev(); } + + result_type min() const { - return std::numeric_limits<result_type>::max VSMC_MNE(); + return std::numeric_limits<result_type>::lowest(); } + result_type max() const { return std::numeric_limits<result_type>::max(); } + void reset() { v_ = 0; saved_ = false; } + const param_type &param() const { return param_; } + + void param(const param_type &param) + { + param_ = param; + reset(); + } + + void param(param_type &&param) + { + param_ = std::move(param); + reset(); + } + + template <typename RNGType> + result_type operator()(RNGType &rng) + { + return operator()(rng, param_); + } + + template <typename RNGType> + result_type operator()(RNGType &rng, const param_type &param) + { + return generate(rng, param); + } + + template <typename RNGType> + void operator()(RNGType &rng, std::size_t n, result_type *r) + { + operator()(rng, n, r, param_); + } + + template <typename RNGType> + void operator()( + RNGType &rng, std::size_t n, result_type *r, const param_type &param) + { + if (n < 100) { + for (std::size_t i = 0; i != n; ++i) + r[i] = operator()(rng, param); + } else { + normal_distribution(rng, n, r, param); + } + } + friend bool operator==( const
distribution_type &dist1, const distribution_type &dist2) { @@ -92,13 +152,16 @@ class NormalDistribution friend bool operator!=( const distribution_type &dist1, const distribution_type &dist2) { - return dist1.param_ != dist2.param_; + return !(dist1 == dist2); } template friend std::basic_ostream &operator<<( std::basic_ostream &os, const distribution_type &dist) { + if (!os.good()) + return os; + os << dist.param_ << ' '; os << dist.v_ << ' '; os << dist.saved_; @@ -110,6 +173,9 @@ class NormalDistribution friend std::basic_istream &operator>>( std::basic_istream &is, distribution_type &dist) { + if (!is.good()) + return is; + param_type param; result_type v; bool saved; @@ -126,6 +192,7 @@ class NormalDistribution } private: + param_type param_; result_type v_; bool saved_; @@ -137,9 +204,9 @@ class NormalDistribution z = v_; saved_ = false; } else { - U01OCDistribution runif; - result_type u1 = std::sqrt(-2 * std::log(runif(rng))); - result_type u2 = const_pi_2() * runif(rng); + U01Distribution u01; + result_type u1 = std::sqrt(-2 * std::log(u01(rng))); + result_type u2 = const_pi_2() * u01(rng); z = u1 * std::cos(u2); v_ = u1 * std::sin(u2); saved_ = true; @@ -160,7 +227,7 @@ inline void normal_distribution_impl( const std::size_t nu = n / 2; RealType *const u1 = r; RealType *const u2 = r + nu; - u01_oc_distribution(rng, n, r); + u01_distribution(rng, n, r); log(nu, u1, s); mul(nu, static_cast(-2), s, s); sqrt(nu, s, s); @@ -173,34 +240,33 @@ inline void normal_distribution_impl( } // namespace vsmc::internal -/// \brief Generating normal random variates +/// \brief Generating Normal random variates /// \ingroup Distribution template inline void normal_distribution( RNGType &rng, std::size_t n, RealType *r, RealType mean, RealType stddev) { - const std::size_t k = 1000; + static_assert(std::is_floating_point::value, + "**normal_distribution** USED WITH RealType OTHER THAN FLOATING POINT " + "TYPES"); + + const std::size_t k = 1024; const std::size_t m = n / 
k; const std::size_t l = n % k; for (std::size_t i = 0; i != m; ++i, r += k) internal::normal_distribution_impl(rng, k, r, mean, stddev); internal::normal_distribution_impl(rng, l, r, mean, stddev); if (n % 2 != 0) { - U01OCDistribution runif; - RealType u = runif(rng); - RealType v = runif(rng); + U01Distribution u01; + RealType u = u01(rng); + RealType v = u01(rng); r[l - 1] = mean + stddev * std::sqrt(-2 * std::log(u)) * std::cos(const_pi_2() * v); } } -template -inline void rng_rand(RNGType &rng, NormalDistribution &dist, - std::size_t n, RealType *r) -{ - dist(rng, n, r); -} +VSMC_DEFINE_RNG_DISTRIBUTION_RAND_2(Normal, normal, mean, stddev) } // namespace vsmc diff --git a/include/vsmc/rng/normal_mv_distribution.hpp b/include/vsmc/rng/normal_mv_distribution.hpp new file mode 100644 index 000000000..65fee4212 --- /dev/null +++ b/include/vsmc/rng/normal_mv_distribution.hpp @@ -0,0 +1,423 @@ +//============================================================================ +// vSMC/include/vsmc/rng/normal_mv_distribution.hpp +//---------------------------------------------------------------------------- +// vSMC: Scalable Monte Carlo +//---------------------------------------------------------------------------- +// Copyright (c) 2013-2016, Yan Zhou +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. 
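The vectorized free functions in this changeset all share one blocking pattern, now with `k = 1024` (a power of two) instead of `1000`, and with the output pointer advanced by `r += k` rather than recomputed as `r + i * k`: process `n / k` full blocks, then the `n % k` remainder. The fixed block size is what lets the internal `*_distribution_impl` functions keep a stack workspace `RealType s[K]`. A generic sketch of the pattern (`block_impl` and `blocked_fill` are hypothetical stand-ins):

```cpp
#include <cstddef>
#include <numeric>
#include <vector>

// Stand-in for the internal *_distribution_impl functions: fill k values.
static void block_impl(std::size_t k, double *r, double value)
{
    for (std::size_t i = 0; i != k; ++i)
        r[i] = value;
}

// The blocking wrapper used throughout the diff: full blocks of k = 1024,
// then the l = n % k remainder, advancing r as it goes.
static void blocked_fill(std::size_t n, double *r, double value)
{
    const std::size_t k = 1024;  // power of two: n / k and n % k are cheap
    const std::size_t m = n / k;
    const std::size_t l = n % k;
    for (std::size_t i = 0; i != m; ++i, r += k)
        block_impl(k, r, value);
    block_impl(l, r, value);  // tail block, possibly empty
}
```

With n = 2500 the loop runs two full blocks and then a 452-element tail, and every output element is written exactly once.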
+// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS AS IS +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. +//============================================================================ + +#ifndef VSMC_RNG_NORMAL_MV_DISTRIBUTION_HPP +#define VSMC_RNG_NORMAL_MV_DISTRIBUTION_HPP + +#include <vsmc/rng/internal/common.hpp> +#include <vsmc/rng/normal_distribution.hpp> + +namespace vsmc +{ + +/// \brief Multivariate Normal distribution +/// \ingroup Distribution +/// +/// \details +/// The distribution is parameterized by its mean vector and the lower +/// triangular elements of the Cholesky decomposition of the covariance matrix, +/// packed row by row. +/// +/// \tparam RealType Only `float` and `double` are supported. +/// \tparam Dim If `Dim > 0`, then the distribution has a static size and does +/// not use dynamic memory. If `Dim == Dynamic`, then the dimension of the +/// distribution is specified at runtime.
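Given the parameterization described above, a draw is `x = mu + L z` with `z ~ N(0, I)`, where `L` is the row-packed lower-triangular Cholesky factor: `L(i, j) = chol[i * (i + 1) / 2 + j]` for `j <= i`. This is the same computation the class later delegates to `cblas_?tpmv`/`cblas_?trmm`; a BLAS-free sketch of one draw (`normal_mv_draw` is a hypothetical name):

```cpp
#include <cstddef>
#include <random>
#include <vector>

// One multivariate Normal draw: z ~ N(0, I), then x = mean + L * z,
// where chol holds the lower triangle of L packed row by row.
std::vector<double> normal_mv_draw(std::mt19937_64 &rng, std::size_t dim,
    const std::vector<double> &mean, const std::vector<double> &chol)
{
    std::normal_distribution<double> rnorm(0, 1);
    std::vector<double> z(dim), x(dim);
    for (std::size_t i = 0; i != dim; ++i)
        z[i] = rnorm(rng);
    for (std::size_t i = 0; i != dim; ++i) {
        double s = 0;
        for (std::size_t j = 0; j <= i; ++j)  // lower triangle only
            s += chol[i * (i + 1) / 2 + j] * z[j];
        x[i] = mean[i] + s;
    }
    return x;
}
```

For example, `chol = {1, 0.5, sqrt(0.75)}` in two dimensions gives unit variances and covariance 0.5, since the covariance matrix is L times its transpose.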
+template <typename RealType, std::size_t Dim> +class NormalMVDistribution +{ + + public: + using result_type = RealType; + using distribution_type = NormalMVDistribution<RealType, Dim>; + + class param_type + { + static_assert(internal::is_one_of<RealType, float, double>::value, + "**NormalMVDistribution::param_type** USED WITH RealType OTHER " + "THAN float OR double"); + + public: + using result_type = RealType; + using distribution_type = NormalMVDistribution<RealType, Dim>; + + explicit param_type(const result_type *mean = nullptr, + const result_type *chol = nullptr) + : rnorm_(0, 1) + , null_mean_(mean == nullptr) + , null_chol_(chol == nullptr) + { + static_assert(Dim != Dynamic, "**NormalMVDistribution::param_type*" "* OBJECT DECLARED WITH DYNAMIC " "DIMENSION"); + init(mean, chol); + } + + explicit param_type(std::size_t dim, const result_type *mean = nullptr, + const result_type *chol = nullptr) + : rnorm_(0, 1) + , mean_(dim) + , chol_(dim * (dim + 1) / 2) + , null_mean_(mean == nullptr) + , null_chol_(chol == nullptr) + { + static_assert(Dim == Dynamic, "**NormalMVDistribution::param_type*" "* OBJECT DECLARED WITH FIXED " "DIMENSION"); + init(mean, chol); + } + + std::size_t dim() const { return mean_.size(); } + + const result_type *mean() const { return mean_.data(); } + + const result_type *chol() const { return chol_.data(); } + + friend bool operator==( + const param_type &param1, const param_type &param2) + { + if (param1.rnorm_ != param2.rnorm_) + return false; + if (param1.mean_ != param2.mean_) + return false; + if (param1.chol_ != param2.chol_) + return false; + if (param1.null_mean_ != param2.null_mean_) + return false; + if (param1.null_chol_ != param2.null_chol_) + return false; + return true; + } + + friend bool operator!=( + const param_type &param1, const param_type &param2) + { + return !(param1 == param2); + } + + template <typename CharT, typename Traits> + friend std::basic_ostream<CharT, Traits> &operator<<( + std::basic_ostream<CharT, Traits> &os, const param_type &param) + { + if (!os.good()) + return os; + + os << param.rnorm_ << ' '; + os << param.dim() << ' '; + os << param.mean_ << ' '; + os << param.chol_ << ' '; + os << param.null_mean_ << ' '; + os << param.null_chol_; + + return os; + } + + template <typename CharT, typename Traits> + friend std::basic_istream<CharT, Traits> &operator>>( + std::basic_istream<CharT, Traits> &is, param_type &param) + { + if (!is.good()) + return is; + + NormalDistribution<RealType> rnorm; + internal::Array<RealType, Dim> mean; + internal::Array<RealType, Dim * (Dim + 1) / 2> chol; + bool null_mean; + bool null_chol; + + is >> std::ws >> rnorm; + if (!is.good()) + return is; + + std::size_t dim = 0; + is >> std::ws >> dim; + if (!is.good()) + return is; + + internal::resize(mean, dim); + internal::resize(chol, dim * (dim + 1) / 2); + is >> std::ws >> mean; + is >> std::ws >> chol; + is >> std::ws >> null_mean; + is >> std::ws >> null_chol; + + if (is.good()) { + param.rnorm_ = std::move(rnorm); + param.mean_ = std::move(mean); + param.chol_ = std::move(chol); + param.null_mean_ = null_mean; + param.null_chol_ = null_chol; + } else { + is.setstate(std::ios_base::failbit); + } + + return is; + } + + private: + NormalDistribution<RealType> rnorm_; + internal::Array<RealType, Dim> mean_; + internal::Array<RealType, Dim * (Dim + 1) / 2> chol_; + bool null_mean_; + bool null_chol_; + + friend distribution_type; + + void init(const result_type *mean, const result_type *chol) + { + if (mean == nullptr) + std::fill(mean_.begin(), mean_.end(), 0); + else + std::copy_n(mean, mean_.size(), mean_.begin()); + + if (chol == nullptr) + std::fill(chol_.begin(), chol_.end(), 0); + else + std::copy_n(chol, chol_.size(), chol_.begin()); + + if (chol == nullptr) + for (std::size_t i = 0; i != mean_.size(); ++i) + chol_[i * (i + 1) / 2 + i] = 1; + } + }; // class param_type + + /// \brief Only usable when `Dim > 0` + /// + /// \param mean Mean vector; if it is a null pointer, then the mean is a + /// zero vector + /// \param chol The lower triangular elements of the Cholesky decomposition + /// of the covariance matrix, packed row by row.
If it is a null pointer, + /// then the covariance is the identity matrix \f$I\f$ + explicit NormalMVDistribution( + const result_type *mean = nullptr, const result_type *chol = nullptr) + : param_(mean, chol) + { + reset(); + } + + /// \brief Only usable when `Dim == Dynamic` + explicit NormalMVDistribution(std::size_t dim, + const result_type *mean = nullptr, const result_type *chol = nullptr) + : param_(dim, mean, chol) + { + reset(); + } + + void min(result_type *x) const + { + std::fill_n(x, dim(), std::numeric_limits::lowest()); + } + + void max(result_type *x) const + { + std::fill_n(x, dim(), std::numeric_limits::max()); + } + + void reset() { param_.rnorm_.reset(); } + + std::size_t dim() const { return param_.dim(); } + + const result_type *mean() const { return param_.mean(); } + + const result_type *chol() const { return param_.chol(); } + + param_type param() const { return param_; } + + void param(const param_type &param) + { + param_ = param; + reset(); + } + + void param(param_type &&param) + { + param_ = std::move(param); + reset(); + } + + template + void operator()(RNGType &rng, result_type *r) + { + operator()(rng, r, param_); + } + + template + void operator()(RNGType &rng, result_type *r, const param_type &param) + { + generate(rng, r, param); + } + + template + void operator()(RNGType &rng, std::size_t n, result_type *r) + { + operator()(rng, n, r, param_); + } + + template + void operator()( + RNGType &rng, std::size_t n, result_type *r, const param_type &param) + { + normal_mv_distribution(rng, n, r, param.dim(), + (param.null_mean_ ? nullptr : param.mean()), + (param.null_chol_ ? nullptr : param.chol())); + } + + friend bool operator==( + const distribution_type &dist1, const distribution_type &dist2) + { + if (dist1.param_ != dist2.param_) + return false; + return true; + } + + friend bool operator!=( + const distribution_type &dist1, const distribution_type &dist2) + { + return !(dist1 == dist2); + } + + template + friend std::basic_ostream &operator<<( + std::basic_ostream &os, const distribution_type &dist) + { + if (!os.good()) + return os; + + os << dist.param_ << ' '; + + return os; + } + + template + friend std::basic_istream &operator>>( + std::basic_istream &is, distribution_type &dist) + { + if (!is.good()) + return is; + + param_type param; + is >> std::ws >> param; + if (is.good()) + dist.param_ = std::move(param); + + return is; + } + + private: + param_type param_; + + template + void generate(RNGType &rng, result_type *r, const param_type &param) + { + param_.rnorm_(rng, param.dim(), r); + if (!param.null_chol_) + mulchol(r, param); + if (!param.null_mean_) + add(param.dim(), param.mean(), r, r); + } + + void mulchol(float *r, const param_type &param) + { + ::cblas_stpmv(::CblasRowMajor, ::CblasLower, ::CblasNoTrans, + ::CblasNonUnit, static_cast(dim()), param.chol(), + r, 1); + } + + void mulchol(double *r, const param_type &param) + { + ::cblas_dtpmv(::CblasRowMajor, ::CblasLower, ::CblasNoTrans, + ::CblasNonUnit, static_cast(dim()), param.chol(), + r, 1); + } +}; // class NormalMVDistribution + +namespace internal +{ + +inline void normal_mv_distribution_mulchol( + std::size_t n, float *r, std::size_t m, const float *chol) +{ + ::cblas_strmm(::CblasRowMajor, ::CblasRight, ::CblasLower, ::CblasTrans, + ::CblasNonUnit, static_cast(n), + static_cast(m), 1, chol, + static_cast(m), r, static_cast(m)); +} + +inline void normal_mv_distribution_mulchol( + std::size_t n, double *r, std::size_t m, const double *chol) +{ + ::cblas_dtrmm(::CblasRowMajor, ::CblasRight, ::CblasLower, ::CblasTrans, + ::CblasNonUnit, static_cast(n), + 
static_cast(m), 1, chol, + static_cast(m), r, static_cast(m)); +} + +} // namespace vsmc::internal + +/// \brief Generating multivariate Normal random variates +/// \ingroup Distribution +template +inline void normal_mv_distribution(RNGType &rng, std::size_t n, RealType *r, + std::size_t dim, const RealType *mean, const RealType *chol) +{ + static_assert(internal::is_one_of::value, + "**normal_mv_distribution** USED WITH RealType OTHER THAN float OR " + "double"); + + normal_distribution(rng, n * dim, r, 0.0, 1.0); + if (chol != nullptr) { + Vector cholf(dim * dim); + for (std::size_t i = 0; i != dim; ++i) + for (std::size_t j = 0; j <= i; ++j) + cholf[i * dim + j] = *chol++; + internal::normal_mv_distribution_mulchol(n, r, dim, cholf.data()); + } + if (mean != nullptr) + for (std::size_t i = 0; i != n; ++i, r += dim) + add(dim, mean, r, r); +} + +template +inline void normal_mv_distribution(RNGType &rng, std::size_t n, RealType *r, + const typename NormalMVDistribution::param_type &param) +{ + normal_mv_distribution(rng, n, r, param.dim(), param.mean(), param.chol()); +} + +template +inline void rng_rand(RNGType &rng, NormalMVDistribution &dist, + std::size_t n, RealType *r) +{ + dist(rng, n, r); +} + +} // namespace vsmc + +#endif // VSMC_RNG_NORMAL_DISTRIBUTION_HPP diff --git a/include/vsmc/rng/pareto_distribution.hpp b/include/vsmc/rng/pareto_distribution.hpp index cf80bc710..d2989b816 100644 --- a/include/vsmc/rng/pareto_distribution.hpp +++ b/include/vsmc/rng/pareto_distribution.hpp @@ -3,7 +3,7 @@ //---------------------------------------------------------------------------- // vSMC: Scalable Monte Carlo //---------------------------------------------------------------------------- -// Copyright (c) 2013-2015, Yan Zhou +// Copyright (c) 2013-2016, Yan Zhou // All rights reserved.
// // Redistribution and use in source and binary forms, with or without @@ -54,17 +54,12 @@ inline bool pareto_distribution_check_param(RealType a, RealType b) template class ParetoDistribution { - VSMC_DEFINE_RNG_DISTRIBUTION_2( - Pareto, pareto, RealType, result_type, a, 1, result_type, b, 1) - VSMC_DEFINE_RNG_DISTRIBUTION_OPERATORS + VSMC_DEFINE_RNG_DISTRIBUTION_2(Pareto, pareto, a, 1, b, 1) public: - result_type min VSMC_MNE() const { return a(); } + result_type min() const { return a(); } - result_type max VSMC_MNE() const - { - return std::numeric_limits::max VSMC_MNE(); - } + result_type max() const { return std::numeric_limits::max(); } void reset() {} @@ -72,9 +67,9 @@ class ParetoDistribution template result_type generate(RNGType &rng, const param_type &param) { - U01OCDistribution runif; + U01Distribution u01; - return param.b() * std::exp(-std::log(runif(rng)) / param.a()); + return param.b() * std::exp(-std::log(u01(rng)) / param.a()); } }; // class ParetoDistribution @@ -98,20 +93,19 @@ template inline void pareto_distribution( RNGType &rng, std::size_t n, RealType *r, RealType a, RealType b) { - const std::size_t k = 1000; + static_assert(std::is_floating_point::value, + "**pareto_distribution** USED WITH RealType OTHER THAN FLOATING POINT " + "TYPES"); + + const std::size_t k = 1024; const std::size_t m = n / k; const std::size_t l = n % k; - for (std::size_t i = 0; i != m; ++i) - internal::pareto_distribution_impl(rng, k, r + i * k, a, b); - internal::pareto_distribution_impl(rng, l, r + m * k, a, b); + for (std::size_t i = 0; i != m; ++i, r += k) + internal::pareto_distribution_impl(rng, k, r, a, b); + internal::pareto_distribution_impl(rng, l, r, a, b); } -template -inline void rng_rand(RNGType &rng, ParetoDistribution &dist, - std::size_t n, RealType *r) -{ - dist(rng, n, r); -} +VSMC_DEFINE_RNG_DISTRIBUTION_RAND_2(Pareto, pareto, a, b) } // namespace vsmc diff --git a/include/vsmc/rng/philox.hpp b/include/vsmc/rng/philox.hpp index 
4a3c21b5b..e0f03bd83 100644 --- a/include/vsmc/rng/philox.hpp +++ b/include/vsmc/rng/philox.hpp @@ -3,7 +3,7 @@ //---------------------------------------------------------------------------- // vSMC: Scalable Monte Carlo //---------------------------------------------------------------------------- -// Copyright (c) 2013-2015, Yan Zhou +// Copyright (c) 2013-2016, Yan Zhou // All rights reserved. // // Redistribution and use in source and binary forms, with or without @@ -39,22 +39,6 @@ #include #endif -#define VSMC_STATIC_ASSERT_RNG_PHILOX_RESULT_TYPE(ResultType) \ - VSMC_STATIC_ASSERT(((sizeof(ResultType) == sizeof(std::uint32_t) && \ - std::is_unsigned::value) || \ - (sizeof(ResultType) == sizeof(std::uint64_t) && \ - std::is_unsigned::value)), \ - "**PhiloxGenerator** USED WITH ResultType OTHER THAN UNSIGNED 32/64 " \ - "BITS INTEGER") - -#define VSMC_STATIC_ASSERT_RNG_PHILOX_SIZE(K) \ - VSMC_STATIC_ASSERT((K == 2 || K == 4), \ - "**PhiloxGenerator** USED WITH SIZE OTHER THAN 2 OR 4") - -#define VSMC_STATIC_ASSERT_RNG_PHILOX \ - VSMC_STATIC_ASSERT_RNG_PHILOX_RESULT_TYPE(ResultType); \ - VSMC_STATIC_ASSERT_RNG_PHILOX_SIZE(K); - #define VSMC_DEFINE_RNG_PHILOX_WELY_CONSTANT(T, I, val) \ template <> \ class PhiloxWeylConstant : public std::integral_constant \ @@ -255,79 +239,58 @@ template class PhiloxGenerator { + static_assert(std::is_unsigned::value, + "**PhiloxGenerator** USED WITH ResultType OTHER THAN UNSIGNED INTEGER " + "TYPES"); + + static_assert(sizeof(ResultType) == sizeof(std::uint32_t) || + sizeof(ResultType) == sizeof(std::uint64_t), + "**PhiloxGenerator** USED WITH ResultType OF SIZE OTHER THAN 32 OR 64 " + "BITS"); + + static_assert( + K == 2 || K == 4, "**PhiloxGenerator** USED WITH K OTHER THAN 2 OR 4"); + public: using result_type = ResultType; using ctr_type = std::array; using key_type = std::array; - PhiloxGenerator() { VSMC_STATIC_ASSERT_RNG_PHILOX; } - static constexpr std::size_t size() { return K; } void reset(const key_type &) {} - void 
operator()(ctr_type &ctr, const key_type &key, - std::array &buffer) const + void operator()(ctr_type &ctr, const key_type &key, ctr_type &buffer) const { - union { - std::array state; - std::array result; - } buf; - increment(ctr); - buf.state.front() = ctr; + buffer = ctr; key_type par = key; - generate<0>(buf.state, par, std::true_type()); - buffer = buf.result; + generate<0>(buffer, par, std::true_type()); } - std::size_t operator()(ctr_type &ctr, const key_type &key, std::size_t n, - result_type *r) const + void operator()(ctr_type &ctr, const key_type &key, std::size_t n, + ctr_type *buffer) const { - const std::size_t Blocks = 8; - const std::size_t M = size() * Blocks; - const std::size_t m = n / M; - increment(ctr, m, reinterpret_cast(r)); - std::array *s = - reinterpret_cast *>(r); - for (std::size_t i = 0; i != m; ++i) { + increment(ctr, n, buffer); + for (std::size_t i = 0; i != n; ++i) { key_type par = key; - generate<0>(s[i], par, std::true_type()); + generate<0>(buffer[i], par, std::true_type()); } - - return m * M; } private: - template - void generate( - std::array &, key_type &, std::false_type) const + template + void generate(ctr_type &, key_type &, std::false_type) const { } - template - void generate(std::array &state, key_type &par, - std::true_type) const + template + void generate(ctr_type &state, key_type &par, std::true_type) const { internal::PhiloxBumpKey::eval(par); - round(state, par, std::true_type()); + internal::PhiloxRound::eval(state, par); generate( - state, par, std::integral_constant < bool, N()); - } - - template - void round( - std::array &, key_type &, std::false_type) const - { - } - - template - void round(std::array &state, key_type &par, - std::true_type) const - { - internal::PhiloxRound::eval(std::get(state), par); - round( - state, par, std::integral_constant()); + state, par, std::integral_constant()); } }; // class PhiloxGenerator diff --git a/include/vsmc/rng/random_walk.hpp b/include/vsmc/rng/random_walk.hpp new 
file mode 100644 index 000000000..0086e4ece --- /dev/null +++ b/include/vsmc/rng/random_walk.hpp @@ -0,0 +1,577 @@ +//============================================================================ +// vSMC/include/vsmc/rng/random_walk.hpp +//---------------------------------------------------------------------------- +// vSMC: Scalable Monte Carlo +//---------------------------------------------------------------------------- +// Copyright (c) 2013-2016, Yan Zhou +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS AS IS +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. 
+//============================================================================ + +#ifndef VSMC_RNG_RANDOM_WALK_HPP +#define VSMC_RNG_RANDOM_WALK_HPP + +#include +#include +#include +#include + +#define VSMC_RUNTIME_ASSERT_RNG_RANDOM_WALK_PROPOSAL_PARAM(flag, Name) \ + VSMC_RUNTIME_ASSERT( \ + (flag), "**" #Name "Proposal** CONSTRUCTED WITH INVALID PARAMETERS") + +namespace vsmc +{ + +/// \brief Random walk MCMC update +/// \ingroup RandomWalk +template +class RandomWalk +{ + public: + using result_type = RealType; + + /// \brief Only usable when `Dim != Dynamic` + RandomWalk() + { + static_assert(Dim != Dynamic, + "**RandomWalk** OBJECT DECLARED WITH DYNAMIC DIMENSION"); + } + + /// \brief Only usable when `Dim == Dynamic` + RandomWalk(std::size_t dim) : x_(dim), y_(dim) + { + static_assert(Dim == Dynamic, + "**RandomWalk** OBJECT DECLARED WITH FIXED DIMENSION"); + } + + std::size_t dim() const { return x_.size(); } + + /// \brief One-step random walk update + /// + /// \param rng RNG engine + /// \param x The current state value. It will be updated to the new value + /// after the MCMC move. + /// \param ltx If it is a non-null pointer, then it points to the value of + /// the \f$\log\gamma(x)\f$. It will be updated to the new value if the + /// MCMC move is accepted and left unchanged otherwise. If it is a null + /// pointer, then it is ignored. Use this pointer to save + /// \f$\log\gamma(x)\f$ between updates if it is expensive to calculate. + /// \param log_target The log-target function + /// ~~~{.cpp} + /// result_type log_target(std::size_t dim_x, const result_type *x); + /// ~~~ + /// It accepts the length of the state vector and a pointer to the storage + /// of the state value. It returns the value of the log-target function + /// \f$\log\gamma(x)\f$. + /// \param proposal The proposal function.
It takes the form, + /// ~~~{.cpp} + /// result_type proposal(RNGType &rng, std::size_t dim, + /// const result_type *x, result_type *y); + /// ~~~ + /// After the call, the function places the proposed value in `y` and + /// returns the value \f$\log(q(y, x) / q(x, y))\f$. + /// + /// \return Acceptance count + template + std::size_t operator()(RNGType &rng, result_type *x, result_type *ltx, + LogTargetType &&log_target, ProposalType &&proposal) + { + U01Distribution u01; + result_type q = proposal(rng, dim(), x, y_.data()); + result_type s = ltx == nullptr ? log_target(dim(), x) : *ltx; + result_type t = log_target(dim(), y_.data()); + result_type p = t - s + q; + result_type u = std::log(u01(rng)); + + if (u < p) { + std::copy(y_.begin(), y_.end(), x); + if (ltx != nullptr) + *ltx = t; + return 1; + } + return 0; + } + + /// \brief One-step random walk update of a block of elements within a + /// vector state + /// + /// \details + /// With this operator, it is assumed that the input/output state + /// vector `x` is of length `m` instead of `dim()`. A sub-vector of + /// length `dim()`, starting at index `idx` will be updated. The + /// log-target function will be called with `m` as its first argument and a + /// length `m` vector will be passed as its second argument as well. + template + std::size_t operator()(RNGType &rng, std::size_t m, std::size_t idx, + result_type *x, result_type *ltx, LogTargetType &&log_target, + ProposalType &&proposal) + { + U01Distribution u01; + std::copy_n(x + idx, dim(), x_.begin()); + result_type q = proposal(rng, dim(), x_.data(), y_.data()); + result_type s = ltx == nullptr ? 
log_target(m, x) : *ltx; + std::copy(y_.begin(), y_.end(), x + idx); + result_type t = log_target(m, x); + result_type p = t - s + q; + result_type u = std::log(u01(rng)); + + if (u < p) { + if (ltx != nullptr) + *ltx = t; + return 1; + } + std::copy(x_.begin(), x_.end(), x + idx); + + return 0; + } + + /// \brief Multi-step random walk update + template + std::size_t operator()(std::size_t n, RNGType &rng, result_type *x, + result_type *ltx, LogTargetType &&log_target, ProposalType &&proposal) + { + std::size_t acc = 0; + result_type s = ltx == nullptr ? log_target(dim(), x) : *ltx; + for (std::size_t i = 0; i != n; ++i) { + acc += operator()(rng, x, &s, + std::forward(log_target), + std::forward(proposal)); + } + if (ltx != nullptr) + *ltx = s; + + return acc; + } + + /// \brief Multi-step random walk update of a block of element within a + /// vector state + template + std::size_t operator()(std::size_t n, RNGType &rng, std::size_t m, + std::size_t idx, result_type *x, result_type *ltx, + LogTargetType &&log_target, ProposalType &&proposal) + { + std::size_t acc = 0; + result_type s = ltx == nullptr ? 
log_target(m, x) : *ltx; + for (std::size_t i = 0; i != n; ++i) { + acc += operator()(rng, m, idx, x, &s, + std::forward(log_target), + std::forward(proposal)); + } + if (ltx != nullptr) + *ltx = s; + + return acc; + } + + private: + internal::Array x_; + internal::Array y_; +}; // class RandomWalk + +/// \brief Random walk MCMC update with test function +/// \ingroup RandomWalk +template +class RandomWalkG +{ + public: + using result_type = RealType; + + /// \brief Only usable when `DimX != Dynamic` and `DimG != Dynamic` + RandomWalkG() + { + static_assert(DimX != Dynamic && DimG != Dynamic, + "**RandomWalkG** OBJECT DECLARED WITH DYNAMIC DIMENSION"); + } + + /// \brief Only usable when `DimX == Dynamic` and `DimG == Dynamic` + RandomWalkG(std::size_t dim_x, std::size_t dim_g) + : x_(dim_x), y_(dim_x), g_(dim_g) + { + static_assert(DimX == Dynamic && DimG == Dynamic, + "**RandomWalkG** OBJECT DECLARED WITH FIXED DIMENSION"); + } + + std::size_t dim_x() const { return x_.size(); } + std::size_t dim_g() const { return g_.size(); } + + /// \brief One-step random walk update + /// + /// \param rng RNG engine + /// \param x The current state value. It will be updated to the new value + /// after the MCMC move. + /// \param ltx If it is a non-null pointer, then it points to the value of + /// the \f$\log\gamma(x)\f$. It will be updated to the new value if the + /// MCMC move is accepted and left unchanged otherwise. If it is a null + /// pointer, then it is ignored. Use this pointer to save + /// \f$\log\gamma(x)\f$ between updates if it is expensive to calculate. + /// \param g If it is a non-null pointer, then it is used to save the value + /// of the test function \f$g(x)\f$ if the MCMC move is accepted and left + /// unchanged otherwise. If it is a null pointer, then it is ignored.
+ /// \param log_target The log-target function + /// ~~~{.cpp} + /// result_type log_target(std::size_t dim_x, std::size_t dim_g, + /// const result_type *x, result_type *g); + /// ~~~ + /// It accepts the lengths of the state vector and test function value + /// vector, and pointers to the storage of the state value and test + /// function value. It returns the value of the log-target function + /// \f$\log\gamma(x)\f$. Note that this function shall be able to handle + /// its argument `g` as a null pointer. + /// \param proposal The proposal function. It takes the form, + /// ~~~{.cpp} + /// result_type proposal(RNGType &rng, std::size_t dim, + /// const result_type *x, result_type *y); + /// ~~~ + /// After the call, the function places the proposed value in `y` and + /// returns the value \f$\log(q(y, x) / q(x, y))\f$. + /// + /// \return Acceptance count + template + std::size_t operator()(RNGType &rng, result_type *x, result_type *ltx, + result_type *g, LogTargetType &&log_target, ProposalType &&proposal) + { + U01Distribution u01; + result_type q = proposal(rng, dim_x(), x, y_.data()); + result_type s = + ltx == nullptr ? log_target(dim_x(), dim_g(), x, nullptr) : *ltx; + result_type t = log_target( + dim_x(), dim_g(), y_.data(), (g == nullptr ? nullptr : g_.data())); + result_type p = t - s + q; + result_type u = std::log(u01(rng)); + + if (u < p) { + std::copy(y_.begin(), y_.end(), x); + if (ltx != nullptr) + *ltx = t; + if (g != nullptr) + std::copy(g_.begin(), g_.end(), g); + return 1; + } + return 0; + } + + /// \brief One-step random walk update of a block of elements within a + /// vector state + /// + /// \details + /// With this operator, it is assumed that the input/output state + /// vector `x` is of length `m` instead of `dim_x()`. A sub-vector of + /// length `dim_x()`, starting at index `idx` will be updated.
The + /// log-target function will be called with `m` as its first argument and a + /// length `m` vector will be passed as its third argument as well. + template + std::size_t operator()(RNGType &rng, std::size_t m, std::size_t idx, + result_type *x, result_type *ltx, result_type *g, + LogTargetType &&log_target, ProposalType &&proposal) + { + U01Distribution u01; + std::copy_n(x + idx, dim_x(), x_.begin()); + result_type q = proposal(rng, dim_x(), x_.data(), y_.data()); + result_type s = + ltx == nullptr ? log_target(m, dim_g(), x, nullptr) : *ltx; + std::copy(y_.begin(), y_.end(), x + idx); + result_type t = + log_target(m, dim_g(), x, (g == nullptr ? nullptr : g_.data())); + result_type p = t - s + q; + result_type u = std::log(u01(rng)); + + if (u < p) { + if (ltx != nullptr) + *ltx = t; + if (g != nullptr) + std::copy(g_.begin(), g_.end(), g); + return 1; + } + std::copy(x_.begin(), x_.end(), x + idx); + + return 0; + } + + /// \brief Multi-step random walk update + template + std::size_t operator()(std::size_t n, RNGType &rng, result_type *x, + result_type *ltx, result_type *g, LogTargetType &&log_target, + ProposalType &&proposal) + { + std::size_t acc = 0; + result_type s = + ltx == nullptr ? log_target(dim_x(), dim_g(), x, nullptr) : *ltx; + for (std::size_t i = 0; i != n; ++i) { + acc += operator()(rng, x, &s, g, + std::forward(log_target), + std::forward(proposal)); + } + if (ltx != nullptr) + *ltx = s; + + return acc; + } + + /// \brief Multi-step random walk update of a block of element within a + /// vector state + template + std::size_t operator()(std::size_t n, RNGType &rng, std::size_t m, + std::size_t idx, result_type *x, result_type *ltx, result_type *g, + LogTargetType &&log_target, ProposalType &&proposal) + { + std::size_t acc = 0; + result_type s = + ltx == nullptr ? 
log_target(m, dim_g(), x, nullptr) : *ltx; + for (std::size_t i = 0; i != n; ++i) { + acc += operator()(rng, m, idx, x, &s, g, + std::forward(log_target), + std::forward(proposal)); + } + if (ltx != nullptr) + *ltx = s; + + return acc; + } + + private: + internal::Array x_; + internal::Array y_; + internal::Array g_; +}; // class RandomWalkG + +namespace internal +{ + +template +inline bool normal_proposal_check_param(RealType a, RealType b) +{ + return a < b; +} + +template +inline bool normal_mv_proposal_check_param( + std::size_t dim, const RealType *a, const RealType *b) +{ + for (std::size_t i = 0; i != dim; ++i) + if (a[i] >= b[i]) + return false; + return true; +} + +template +RealType normal_proposal_q(RealType x, RealType &y, RealType z) +{ + y = x + z; + + return 0; +} + +template +RealType normal_proposal_qa(RealType x, RealType &y, RealType z, RealType a) +{ + y = a + (x - a) * std::exp(z); + + return z; +} + +template +RealType normal_proposal_qb(RealType x, RealType &y, RealType z, RealType b) +{ + y = b - (b - x) * std::exp(z); + + return z; +} + +template +RealType normal_proposal_qab( + RealType x, RealType &y, RealType z, RealType a, RealType b) +{ + RealType r = std::exp(z) * (x - a) / (b - x); + y = (a + b * r) / (1 + r); + + return std::log((y - a) / (x - a) * (b - y) / (b - x)); +} + +} // namespace vsmc::internal + +/// \brief Normal random walk proposal +/// \ingroup RandomWalk +template +class NormalProposal +{ + public: + using result_type = RealType; + + /// \brief Construct a Normal random walk proposal + /// + /// \param stddev The standard deviation (scale) of the proposal + /// \param a The lower bound of the support of the target distribution + /// \param b The upper bound of the support of the target distribution + explicit NormalProposal(result_type stddev = 1, + result_type a = -std::numeric_limits::infinity(), + result_type b = std::numeric_limits::infinity()) + : rnorm_(0, stddev), a_(a), b_(b), flag_(0) + { + unsigned lower = 
std::isfinite(a) ? 1 : 0; + unsigned upper = std::isfinite(b) ? 1 : 0; + flag_ = (lower << 1) + upper; + VSMC_RUNTIME_ASSERT_RNG_RANDOM_WALK_PROPOSAL_PARAM( + internal::normal_proposal_check_param(a, b), Normal); + } + + result_type a() const { return a_; } + result_type b() const { return b_; } + + /// \brief Propose new value `y` and return \f$\log(q(y, x) / q(x, y))\f$. + template + result_type operator()( + RNGType &rng, std::size_t, const result_type *x, result_type *y) + { + result_type z = rnorm_(rng); + switch (flag_) { + case 0: return internal::normal_proposal_q(*x, *y, z); + case 1: return internal::normal_proposal_qb(*x, *y, z, b_); + case 2: return internal::normal_proposal_qa(*x, *y, z, a_); + case 3: return internal::normal_proposal_qab(*x, *y, z, a_, b_); + default: return 0; + } + } + + private: + NormalDistribution rnorm_; + result_type a_; + result_type b_; + unsigned flag_; +}; // class NormalProposal + +/// \brief Multivariate Normal random walk proposal +/// \ingroup RandomWalk +template +class NormalMVProposal +{ + public: + using result_type = RealType; + + /// \brief Only usable when `Dim != Dynamic` + /// + /// \param chol The lower triangular elements of the Cholesky decomposition + /// of the covariance matrix, packed row by row. If it is a null pointer, + /// then the covariance is the identity matrix \f$I\f$ + /// \param a The lower bound of the support of the target distribution. It + /// is assumed that the support is \f$\prod_{p=1}^d E_p \subset + /// \mathbb{R}^d\f$ where \f$E_p \subset \mathbb{R}\f$. + /// \param b The upper bound of the support of the target distribution. + /// + /// \details + /// If the geometry of the support is more complex than above, then one may + /// find a superset of the support that takes the required form, and reject + /// proposals that lie outside the support manually.
+ explicit NormalMVProposal(const result_type *chol = nullptr, + const result_type *a = nullptr, const result_type *b = nullptr) + : rnorm_(nullptr, chol) + { + static_assert(Dim != Dynamic, + "**NormalMVProposal** OBJECT DECLARED WITH DYNAMIC DIMENSION"); + init(Dim, a, b); + } + + explicit NormalMVProposal(std::size_t dim, + const result_type *chol = nullptr, const result_type *a = nullptr, + const result_type *b = nullptr) + : rnorm_(dim, nullptr, chol), a_(dim), b_(dim), z_(dim), flag_(dim) + { + static_assert(Dim == Dynamic, + "**NormalMVProposal** OBJECT DECLARED WITH FIXED DIMENSION"); + init(dim, a, b); + } + + std::size_t dim() const { return rnorm_.dim(); } + const result_type *a() const { return a_.data(); } + const result_type *b() const { return b_.data(); } + + template + result_type operator()( + RNGType &rng, std::size_t, const result_type *x, result_type *y) + { + rnorm_(rng, z_.data()); + result_type q = 0; + for (std::size_t i = 0; i != dim(); ++i) { + switch (flag_[i]) { + case 0: + q += internal::normal_proposal_q(x[i], y[i], z_[i]); + break; + case 1: + q += + internal::normal_proposal_qb(x[i], y[i], z_[i], b_[i]); + break; + case 2: + q += + internal::normal_proposal_qa(x[i], y[i], z_[i], a_[i]); + break; + case 3: + q += internal::normal_proposal_qab( + x[i], y[i], z_[i], a_[i], b_[i]); + break; + default: break; + } + } + + return q; + } + + private: + NormalMVDistribution rnorm_; + internal::Array a_; + internal::Array b_; + internal::Array z_; + internal::Array flag_; + + void init(std::size_t dim, const result_type *a, const result_type *b) + { + if (a == nullptr) { + std::fill(a_.begin(), a_.end(), + -std::numeric_limits::infinity()); + } else { + std::copy_n(a, dim, a_.begin()); + } + + if (b == nullptr) { + std::fill(b_.begin(), b_.end(), + std::numeric_limits::infinity()); + } else { + std::copy_n(b, dim, b_.begin()); + } + + for (std::size_t i = 0; i != dim; ++i) { + unsigned lower = std::isfinite(a_[i]) ? 
1 : 0; + unsigned upper = std::isfinite(b_[i]) ? 1 : 0; + flag_[i] = (lower << 1) + upper; + } + + VSMC_RUNTIME_ASSERT_RNG_RANDOM_WALK_PROPOSAL_PARAM( + internal::normal_mv_proposal_check_param( + dim, a_.data(), b_.data()), + NormalMV); + } +}; // class NormalMVProposal + +} // namespace vsmc + +#endif // VSMC_RNG_RANDOM_WALK_HPP diff --git a/include/vsmc/rng/rayleigh_distribution.hpp b/include/vsmc/rng/rayleigh_distribution.hpp index fb883d735..762bfd348 100644 --- a/include/vsmc/rng/rayleigh_distribution.hpp +++ b/include/vsmc/rng/rayleigh_distribution.hpp @@ -3,7 +3,7 @@ //---------------------------------------------------------------------------- // vSMC: Scalable Monte Carlo //---------------------------------------------------------------------------- -// Copyright (c) 2013-2015, Yan Zhou +// Copyright (c) 2013-2016, Yan Zhou // All rights reserved. // // Redistribution and use in source and binary forms, with or without @@ -54,17 +54,12 @@ inline bool rayleigh_distribution_check_param(RealType sigma) template class RayleighDistribution { - VSMC_DEFINE_RNG_DISTRIBUTION_1( - Rayleigh, rayleigh, RealType, result_type, sigma, 1) - VSMC_DEFINE_RNG_DISTRIBUTION_OPERATORS + VSMC_DEFINE_RNG_DISTRIBUTION_1(Rayleigh, rayleigh, sigma, 1) public: - result_type min VSMC_MNE() const { return 0; } + result_type min() const { return 0; } - result_type max VSMC_MNE() const - { - return std::numeric_limits::max VSMC_MNE(); - } + result_type max() const { return std::numeric_limits::max(); } void reset() {} @@ -72,9 +67,9 @@ class RayleighDistribution template result_type generate(RNGType &rng, const param_type &param) { - U01OCDistribution runif; + U01Distribution u01; - return param.sigma() * std::sqrt(-2 * std::log(runif(rng))); + return param.sigma() * std::sqrt(-2 * std::log(u01(rng))); } }; // class RayleighDistribution @@ -85,7 +80,7 @@ template inline void rayleigh_distribution_impl( RNGType &rng, std::size_t n, RealType *r, RealType sigma) { - u01_oc_distribution(rng, 
n, r); + u01_distribution(rng, n, r); log(n, r, r); mul(n, -2 * sigma * sigma, r, r); sqrt(n, r, r); @@ -99,20 +94,19 @@ template inline void rayleigh_distribution( RNGType &rng, std::size_t n, RealType *r, RealType sigma) { - const std::size_t k = 1000; + static_assert(std::is_floating_point::value, + "**rayleigh_distribution** USED WITH RealType OTHER THAN FLOATING " + "POINT TYPES"); + + const std::size_t k = 1024; const std::size_t m = n / k; const std::size_t l = n % k; - for (std::size_t i = 0; i != m; ++i) - internal::rayleigh_distribution_impl(rng, k, r + i * k, sigma); - internal::rayleigh_distribution_impl(rng, l, r + m * k, sigma); + for (std::size_t i = 0; i != m; ++i, r += k) + internal::rayleigh_distribution_impl(rng, k, r, sigma); + internal::rayleigh_distribution_impl(rng, l, r, sigma); } -template -inline void rng_rand(RNGType &rng, RayleighDistribution &dist, - std::size_t n, RealType *r) -{ - dist(rng, n, r); -} +VSMC_DEFINE_RNG_DISTRIBUTION_RAND_1(Rayleigh, rayleigh, sigma) } // namespace vsmc diff --git a/include/vsmc/rng/rdrand.hpp b/include/vsmc/rng/rdrand.hpp index b2aaa6c26..931f0b31b 100644 --- a/include/vsmc/rng/rdrand.hpp +++ b/include/vsmc/rng/rdrand.hpp @@ -3,7 +3,7 @@ //---------------------------------------------------------------------------- // vSMC: Scalable Monte Carlo //---------------------------------------------------------------------------- -// Copyright (c) 2013-2015, Yan Zhou +// Copyright (c) 2013-2016, Yan Zhou // All rights reserved. 
// // Redistribution and use in source and binary forms, with or without @@ -39,19 +39,6 @@ #define VSMC_RDRAND_NTRIAL_MAX 0 #endif -#define VSMC_STATIC_ASSERT_RNG_RDRAND_ENGINE_RESULT_TYPE(ResultType) \ - VSMC_STATIC_ASSERT(((sizeof(ResultType) == sizeof(std::uint16_t) && \ - std::is_unsigned::value) || \ - (sizeof(ResultType) == sizeof(std::uint32_t) && \ - std::is_unsigned::value) || \ - (sizeof(ResultType) == sizeof(std::uint64_t) && \ - std::is_unsigned::value)), \ - "**RDRANDEngine** USED WITH ResultType OTHER THAN UNSIGNED 16/32/64 " \ - "BITS INTEGER") - -#define VSMC_STATIC_ASSERT_RNG_RDRAND_ENGINE \ - VSMC_STATIC_ASSERT_RNG_RDRAND_ENGINE_RESULT_TYPE(ResultType); - #define VSMC_RUNTIME_WARNING_RNG_RDRAND_ENGINE_NTRIAL(ntrial, NTrialMax) \ VSMC_RUNTIME_WARNING((ntrial < NTrialMax), \ "**RDRAND::generate** MAXIMUM NUMBER OF TRIALS EXCEEDED") @@ -112,21 +99,26 @@ inline bool rdrand( template class RDRANDEngine { + static_assert(std::is_unsigned::value, + "**RDRANDEngine** USED WITH ResultType OTHER THAN UNSIGNED INTEGER " + "TYPES"); + + static_assert(sizeof(ResultType) == sizeof(std::uint16_t) || + sizeof(ResultType) == sizeof(std::uint32_t) || + sizeof(ResultType) == sizeof(std::uint64_t), + "**RDRANDEngine** USED WITH ResultType OF SIZE OTHER THAN 16, 32 OR " + "64 BITS"); public: using result_type = ResultType; - explicit RDRANDEngine(result_type = 0) - { - VSMC_STATIC_ASSERT_RNG_RDRAND_ENGINE; - } + explicit RDRANDEngine(result_type = 0) {} template explicit RDRANDEngine(SeedSeq &, typename std::enable_if>::value>::type * = nullptr) { - VSMC_STATIC_ASSERT_RNG_RDRAND_ENGINE; } void seed(result_type) {} @@ -144,14 +136,14 @@ class RDRANDEngine void discard(std::size_t) {} - static constexpr result_type min VSMC_MNE() + static constexpr result_type min() { - return std::numeric_limits::min VSMC_MNE(); + return std::numeric_limits::min(); } - static constexpr result_type max VSMC_MNE() + static constexpr result_type max() { - return std::numeric_limits::max 
VSMC_MNE(); + return std::numeric_limits::max(); } friend bool operator==(const RDRANDEngine &, diff --git a/include/vsmc/rng/rng.hpp b/include/vsmc/rng/rng.hpp index 1150112d9..81c0bfc77 100644 --- a/include/vsmc/rng/rng.hpp +++ b/include/vsmc/rng/rng.hpp @@ -3,7 +3,7 @@ //---------------------------------------------------------------------------- // vSMC: Scalable Monte Carlo //---------------------------------------------------------------------------- -// Copyright (c) 2013-2015, Yan Zhou +// Copyright (c) 2013-2016, Yan Zhou // All rights reserved. // // Redistribution and use in source and binary forms, with or without @@ -33,11 +33,12 @@ #define VSMC_RNG_RNG_HPP #include +#include +#include +#include #include #include #include #include -#include -#include #endif // VSMC_RNG_RNG_HPP diff --git a/include/vsmc/rng/rng_set.hpp b/include/vsmc/rng/rng_set.hpp index c517e9cdd..520278325 100644 --- a/include/vsmc/rng/rng_set.hpp +++ b/include/vsmc/rng/rng_set.hpp @@ -3,7 +3,7 @@ //---------------------------------------------------------------------------- // vSMC: Scalable Monte Carlo //---------------------------------------------------------------------------- -// Copyright (c) 2013-2015, Yan Zhou +// Copyright (c) 2013-2016, Yan Zhou // All rights reserved. 
// // Redistribution and use in source and binary forms, with or without @@ -33,17 +33,21 @@ #define VSMC_RNG_RNG_SET_HPP #include -#include #include +#include #if VSMC_HAS_TBB -#include +#include #endif /// \brief Default RNG set type /// \ingroup Config #ifndef VSMC_RNG_SET_TYPE +#if VSMC_HAS_TBB +#define VSMC_RNG_SET_TYPE ::vsmc::RNGSetTBB<::vsmc::RNG> +#else #define VSMC_RNG_SET_TYPE ::vsmc::RNGSetVector<::vsmc::RNG> #endif +#endif namespace vsmc { @@ -117,6 +121,42 @@ class RNGSetVector AlignedVector rng_; }; // class RNGSetVector +#if VSMC_HAS_TBB + +/// \brief Thread-local storage RNG set using tbb::combinable +/// \ingroup RNG +template +class RNGSetTBB +{ + public: + using rng_type = RNGType; + using size_type = std::size_t; + + explicit RNGSetTBB(size_type N = 0) + : size_(N), rng_([]() { + rng_type rng; + Seed::instance().seed_rng(rng); + return rng; + }) + { + seed(); + } + + size_type size() const { return size_; } + + void resize(std::size_t) {} + + void seed() { rng_.clear(); } + + rng_type &operator[](size_type) { return rng_.local(); } + + private: + std::size_t size_; + ::tbb::combinable rng_; +}; // class RNGSetTBB + +#endif // VSMC_HAS_TBB + using RNGSet = VSMC_RNG_SET_TYPE; /// \brief Particle::rng_set_type trait diff --git a/include/vsmc/rng/seed.hpp b/include/vsmc/rng/seed.hpp index 610cf813f..4a2248d3a 100644 --- a/include/vsmc/rng/seed.hpp +++ b/include/vsmc/rng/seed.hpp @@ -3,7 +3,7 @@ //---------------------------------------------------------------------------- // vSMC: Scalable Monte Carlo //---------------------------------------------------------------------------- -// Copyright (c) 2013-2015, Yan Zhou +// Copyright (c) 2013-2016, Yan Zhou // All rights reserved. 
// // Redistribution and use in source and binary forms, with or without @@ -35,10 +35,6 @@ #include #include -#define VSMC_STATIC_ASSERT_RNG_SEED_GENERATOR_RESULT_TYPE(T) \ - VSMC_STATIC_ASSERT((std::is_unsigned::value), \ - "**SeedGenerator** USED WITH ResultType NOT AN UNSIGNED INTEGER") - #define VSMC_RUNTIME_ASSERT_RNG_SEED_GENERATOR_MODULO(div, rem) \ VSMC_RUNTIME_ASSERT((div > rem), \ "**SeedGenerator::modulo** " \ @@ -88,6 +84,10 @@ namespace vsmc template class SeedGenerator { + static_assert(std::is_unsigned::value, + "**SeedGenerator** USED WITH ResultType OTHER THAN UNSIGEND INTEGER " + "TYPES"); + public: using result_type = ResultType; using skip_type = ResultType; @@ -151,7 +151,7 @@ class SeedGenerator divisor_ = div; remainder_ = rem; - seed_max_ = std::numeric_limits::max VSMC_MNE(); + seed_max_ = std::numeric_limits::max(); seed_max_ -= seed_max_ % divisor_; seed_max_ /= divisor_; @@ -223,8 +223,6 @@ class SeedGenerator SeedGenerator() : seed_(0), seed_max_(0), divisor_(1), remainder_(0) { - VSMC_STATIC_ASSERT_RNG_SEED_GENERATOR_RESULT_TYPE(ResultType); - modulo(divisor_, remainder_); } }; // class SeedGenerator @@ -249,21 +247,26 @@ class SeedGenerator /// s.back() = world.rank(); /// seed.set(s); /// ~~~ -template -class SeedGenerator> +template +class SeedGenerator> { + static_assert(std::is_unsigned::value, + "**SeedGenerator** USED WITH ResultType OTHER THAN UNSIGEND INTEGER " + "TYPES"); + public: - using result_type = std::array; - using skip_type = T; + using result_type = std::array; + using skip_type = ResultType; - SeedGenerator(const SeedGenerator> &) = delete; + SeedGenerator( + const SeedGenerator> &) = delete; - SeedGenerator> &operator=( - const SeedGenerator> &) = delete; + SeedGenerator> &operator=( + const SeedGenerator> &) = delete; - static SeedGenerator> &instance() + static SeedGenerator> &instance() { - static SeedGenerator> seed; + static SeedGenerator> seed; return seed; } @@ -291,7 +294,7 @@ class SeedGenerator> return 
seed_; } - T get_scalar() + ResultType get_scalar() { result_type s1(get()); result_type s2(get()); @@ -302,7 +305,7 @@ class SeedGenerator> return std::get<0>(s2); } - void set(T s) + void set(ResultType s) { seed_.fill(0); seed_.front() = s; @@ -324,7 +327,7 @@ class SeedGenerator> divisor_ = div; remainder_ = rem; - seed_max_.fill(std::numeric_limits::max VSMC_MNE()); + seed_max_.fill(std::numeric_limits::max()); set(seed_); } @@ -336,7 +339,7 @@ class SeedGenerator> template friend std::basic_ostream &operator<<( std::basic_ostream &os, - const SeedGenerator> &sg) + const SeedGenerator> &sg) { if (!os.good()) return os; @@ -351,7 +354,7 @@ class SeedGenerator> template friend std::basic_istream &operator>>( std::basic_istream &is, - SeedGenerator> &sg) + SeedGenerator> &sg) { if (!is.good()) return is; @@ -380,8 +383,6 @@ class SeedGenerator> SeedGenerator() : divisor_(1), remainder_(0) { - VSMC_STATIC_ASSERT_RNG_SEED_GENERATOR_RESULT_TYPE(T); - seed_.fill(0); seed_max_.fill(0); modulo(divisor_, remainder_); diff --git a/include/vsmc/rng/student_t_distribution.hpp b/include/vsmc/rng/student_t_distribution.hpp index bc9a8a504..0a503ab1f 100644 --- a/include/vsmc/rng/student_t_distribution.hpp +++ b/include/vsmc/rng/student_t_distribution.hpp @@ -3,7 +3,7 @@ //---------------------------------------------------------------------------- // vSMC: Scalable Monte Carlo //---------------------------------------------------------------------------- -// Copyright (c) 2013-2015, Yan Zhou +// Copyright (c) 2013-2016, Yan Zhou // All rights reserved. 
// // Redistribution and use in source and binary forms, with or without @@ -55,20 +55,15 @@ inline bool student_t_distribution_check_param(RealType n) template class StudentTDistribution { - VSMC_DEFINE_RNG_DISTRIBUTION_1( - StudentT, student_t, RealType, result_type, n, 1) - VSMC_DEFINE_RNG_DISTRIBUTION_OPERATORS + VSMC_DEFINE_RNG_DISTRIBUTION_1(StudentT, student_t, n, 1) public: - result_type min VSMC_MNE() const + result_type min() const { - return -std::numeric_limits::max VSMC_MNE(); + return std::numeric_limits::lowest(); } - result_type max VSMC_MNE() const - { - return std::numeric_limits::max VSMC_MNE(); - } + result_type max() const { return std::numeric_limits::max(); } void reset() { @@ -120,20 +115,19 @@ template inline void student_t_distribution( RNGType &rng, std::size_t n, RealType *r, RealType df) { - const std::size_t k = 1000; + static_assert(std::is_floating_point::value, + "**student_t_distribution** USED WITH RealType OTHER THAN FLOATING " + "POINT TYPES"); + + const std::size_t k = 1024; const std::size_t m = n / k; const std::size_t l = n % k; - for (std::size_t i = 0; i != m; ++i) - internal::student_t_distribution_impl(rng, k, r + i * k, df); - internal::student_t_distribution_impl(rng, l, r + m * k, df); + for (std::size_t i = 0; i != m; ++i, r += k) + internal::student_t_distribution_impl(rng, k, r, df); + internal::student_t_distribution_impl(rng, l, r, df); } -template -inline void rng_rand(RNGType &rng, StudentTDistribution &dist, - std::size_t n, RealType *r) -{ - dist(rng, n, r); -} +VSMC_DEFINE_RNG_DISTRIBUTION_RAND_1(StudentT, student_t, n) } // namespace vsmc diff --git a/include/vsmc/rng/threefry.hpp b/include/vsmc/rng/threefry.hpp index f2994c71b..5b09aa6f9 100644 --- a/include/vsmc/rng/threefry.hpp +++ b/include/vsmc/rng/threefry.hpp @@ -3,7 +3,7 @@ //---------------------------------------------------------------------------- // vSMC: Scalable Monte Carlo 
//---------------------------------------------------------------------------- -// Copyright (c) 2013-2015, Yan Zhou +// Copyright (c) 2013-2016, Yan Zhou // All rights reserved. // // Redistribution and use in source and binary forms, with or without @@ -35,22 +35,6 @@ #include #include -#define VSMC_STATIC_ASSERT_RNG_THREEFRY_RESULT_TYPE(ResultType, SIMD) \ - VSMC_STATIC_ASSERT(((sizeof(ResultType) == sizeof(std::uint32_t) && \ - std::is_unsigned::value) || \ - (sizeof(ResultType) == sizeof(std::uint64_t) && \ - std::is_unsigned::value)), \ - "**ThreefryGenerator" #SIMD \ - "** USED WITH ResultType OTHER THAN UNSIGNED 32/64 BITS INTEGER") - -#define VSMC_STATIC_ASSERT_RNG_THREEFRY_SIZE(K, SIMD) \ - VSMC_STATIC_ASSERT((K == 2 || K == 4), \ - "**Threefry" #SIMD "** USED WITH SIZE OTHER THAN 2 OR 4") - -#define VSMC_STATIC_ASSERT_RNG_THREEFRY(SIMD) \ - VSMC_STATIC_ASSERT_RNG_THREEFRY_RESULT_TYPE(ResultType, SIMD); \ - VSMC_STATIC_ASSERT_RNG_THREEFRY_SIZE(K, SIMD); - #define VSMC_DEFINE_RNG_THREEFRY_ROTATE_CONSTANT(T, K, N, I, val) \ template <> \ class ThreefryRotateConstant \ @@ -58,18 +42,18 @@ { \ }; // class ThreefryRotateConstant -/// \brief ThreefryGenerator default vector length -/// \ingroup Config -#ifndef VSMC_RNG_THREEFRY_VECTOR_LENGTH -#define VSMC_RNG_THREEFRY_VECTOR_LENGTH 4 -#endif - /// \brief ThreefryGenerator default rounds /// \ingroup Config #ifndef VSMC_RNG_THREEFRY_ROUNDS #define VSMC_RNG_THREEFRY_ROUNDS 20 #endif +/// \brief ThreefryGenerator default vector length +/// \ingroup Config +#ifndef VSMC_RNG_THREEFRY_VECTOR_LENGTH +#define VSMC_RNG_THREEFRY_VECTOR_LENGTH 4 +#endif + namespace vsmc { @@ -319,67 +303,74 @@ class ThreefryInsertKey /// \ingroup Threefry template -class ThreefryGeneratorGeneric +class ThreefryGenerator { + static_assert(std::is_unsigned::value, + "**ThreefryGenerator** USED WITH ResultType OTHER THAN UNSIGNED " + "INTEGER TYPES"); + + static_assert(sizeof(ResultType) == sizeof(std::uint32_t) || + sizeof(ResultType) == 
sizeof(std::uint64_t), + "**ThreefryGenerator** USED WITH ResultType OF SIZE OTHER THAN 32 OR " + "64 BITS"); + + static_assert(K == 2 || K == 4, + "**ThreefryGenerator** USED WITH K OTHER THAN 2 OR 4"); + public: using result_type = ResultType; using ctr_type = std::array; using key_type = std::array; - ThreefryGeneratorGeneric() { VSMC_STATIC_ASSERT_RNG_THREEFRY(Generic); } - static constexpr std::size_t size() { return K; } void reset(const key_type &) {} - void operator()(ctr_type &ctr, const key_type &key, - std::array &buffer) const + void operator()(ctr_type &ctr, const key_type &key, ctr_type &buffer) const { - std::array par; + std::array par; internal::ThreefryInitPar::eval(key, par); increment(ctr); buffer = ctr; generate<0>(buffer, par, std::true_type()); } - std::size_t operator()(ctr_type &ctr, const key_type &key, std::size_t n, - result_type *r) const + void operator()(ctr_type &ctr, const key_type &key, std::size_t n, + ctr_type *buffer) const { - const std::size_t m = n / size(); - std::array par; - internal::ThreefryInitPar::eval(key, par); - ctr_type *s = reinterpret_cast(r); - increment(ctr, m, s); - for (std::size_t i = 0; i != m; ++i) - generate<0>(s[i], par, std::true_type()); + if (n == 0) + return; - return m * size(); + std::array par; + internal::ThreefryInitPar::eval(key, par); + increment(ctr, n, buffer); + for (std::size_t i = 0; i != n; ++i) + generate<0>(buffer[i], par, std::true_type()); } private: template - void generate(std::array &, - const std::array &, std::false_type) const + void generate(std::array &, + const std::array &, std::false_type) const { } template - void generate(std::array &state, - const std::array &par, std::true_type) const + void generate(std::array &state, + const std::array &par, std::true_type) const { internal::ThreefryRotate::eval(state); internal::ThreefryInsertKey::eval(state, par); generate( - state, par, std::integral_constant < bool, N()); + state, par, std::integral_constant()); } -}; // class 
ThreefryGeneratorGeneric +}; // class ThreefryGenerator /// \brief Threefry RNG engine /// \ingroup Threefry template -using ThreefryEngine = - CounterEngine>; +using ThreefryEngine = CounterEngine>; /// \brief Threefry2x32 RNG engine /// \ingroup Threefry @@ -499,13 +490,23 @@ template class ThreefryGeneratorSSE2 { + static_assert(std::is_unsigned::value, + "**ThreefryGeneratorSSE2** USED WITH ResultType OTHER THAN UNSIGNED " + "INTEGER TYPES"); + + static_assert(sizeof(ResultType) == sizeof(std::uint32_t) || + sizeof(ResultType) == sizeof(std::uint64_t), + "**ThreefryGeneratorSSE2** USED WITH ResultType OF SIZE OTHER THAN " + "32 OR 64 BITS"); + + static_assert(K == 2 || K == 4, + "**ThreefryGeneratorSSE2** USED WITH K OTHER THAN 2 OR 4"); + public: using result_type = ResultType; using ctr_type = std::array; using key_type = std::array; - ThreefryGeneratorSSE2() { VSMC_STATIC_ASSERT_RNG_THREEFRY(SSE2); } - static constexpr std::size_t size() { return K * M128I::size(); @@ -514,14 +515,14 @@ class ThreefryGeneratorSSE2 void reset(const key_type &) {} void operator()(ctr_type &ctr, const key_type &key, - std::array::size()> &buffer) + std::array &buffer) const { union { std::array, K> state; std::array result; } buf; - std::array p; + std::array p; std::array, K + 1> par; internal::ThreefryInitPar::eval(key, p); internal::ThreefryParPackSSE2::eval(p, par); @@ -530,27 +531,43 @@ class ThreefryGeneratorSSE2 buffer = buf.result; } - std::size_t operator()( - ctr_type &, const key_type &, std::size_t, result_type *) const + void operator()(ctr_type &ctr, const key_type &key, std::size_t n, + std::array *buffer) const { - return 0; + if (n == 0) + return; + + union { + std::array, K> state; + std::array result; + } buf; + + std::array p; + std::array, K + 1> par; + internal::ThreefryInitPar::eval(key, p); + internal::ThreefryParPackSSE2::eval(p, par); + for (std::size_t i = 0; i != n; ++i) { + internal::ThreefryCtrPackSSE2::eval(ctr, buf.state); + 
generate<0>(buf.state, par, std::true_type()); + buffer[i] = buf.result; + } } private: template void generate(std::array, K> &, - const std::array, K + 1> &, std::false_type) + const std::array, K + 1> &, std::false_type) const { } template void generate(std::array, K> &state, - const std::array, K + 1> &par, std::true_type) + const std::array, K + 1> &par, std::true_type) const { internal::ThreefryRotate, K, N>::eval(state); internal::ThreefryInsertKey, K, N>::eval(state, par); generate( - state, par, std::integral_constant < bool, N()); + state, par, std::integral_constant()); } }; // class ThreefryGeneratorSSE2 @@ -687,13 +704,23 @@ template class ThreefryGeneratorAVX2 { + static_assert(std::is_unsigned::value, + "**ThreefryGeneratorAVX2** USED WITH ResultType OTHER THAN UNSIGNED " + "INTEGER TYPES"); + + static_assert(sizeof(ResultType) == sizeof(std::uint32_t) || + sizeof(ResultType) == sizeof(std::uint64_t), + "**ThreefryGeneratorAVX2** USED WITH ResultType OF SIZE OTHER THAN " + "32 OR 64 BITS"); + + static_assert(K == 2 || K == 4, + "**ThreefryGeneratorAVX2** USED WITH K OTHER THAN 2 OR 4"); + public: using result_type = ResultType; using ctr_type = std::array; using key_type = std::array; - ThreefryGeneratorAVX2() { VSMC_STATIC_ASSERT_RNG_THREEFRY(AVX2); } - static constexpr std::size_t size() { return K * M256I::size(); @@ -702,14 +729,14 @@ class ThreefryGeneratorAVX2 void reset(const key_type &) {} void operator()(ctr_type &ctr, const key_type &key, - std::array::size()> &buffer) + std::array &buffer) const { union { std::array, K> state; std::array result; } buf; - std::array p; + std::array p; std::array, K + 1> par; internal::ThreefryInitPar::eval(key, p); internal::ThreefryParPackAVX2::eval(p, par); @@ -718,27 +745,43 @@ class ThreefryGeneratorAVX2 buffer = buf.result; } - std::size_t operator()( - ctr_type &, const key_type &, std::size_t, result_type *) const + void operator()(ctr_type &ctr, const key_type &key, std::size_t n, + std::array 
*buffer) const { - return 0; + if (n == 0) + return; + + union { + std::array, K> state; + std::array result; + } buf; + + std::array p; + std::array, K + 1> par; + internal::ThreefryInitPar::eval(key, p); + internal::ThreefryParPackAVX2::eval(p, par); + for (std::size_t i = 0; i != n; ++i) { + internal::ThreefryCtrPackAVX2::eval(ctr, buf.state); + generate<0>(buf.state, par, std::true_type()); + buffer[i] = buf.result; + } } private: template void generate(std::array, K> &, - const std::array, K + 1> &, std::false_type) + const std::array, K + 1> &, std::false_type) const { } template void generate(std::array, K> &state, - const std::array, K + 1> &par, std::true_type) + const std::array, K + 1> &par, std::true_type) const { internal::ThreefryRotate, K, N>::eval(state); internal::ThreefryInsertKey, K, N>::eval(state, par); generate( - state, par, std::integral_constant < bool, N()); + state, par, std::integral_constant()); } }; // class ThreefryGeneratorAVX2 diff --git a/include/vsmc/rng/u01.hpp b/include/vsmc/rng/u01.hpp index e4c8d9bbf..985136ea1 100644 --- a/include/vsmc/rng/u01.hpp +++ b/include/vsmc/rng/u01.hpp @@ -3,7 +3,7 @@ //---------------------------------------------------------------------------- // vSMC: Scalable Monte Carlo //---------------------------------------------------------------------------- -// Copyright (c) 2013-2015, Yan Zhou +// Copyright (c) 2013-2016, Yan Zhou // All rights reserved. 
// // Redistribution and use in source and binary forms, with or without @@ -33,31 +33,6 @@ #define VSMC_RNG_U01_HPP #include -#include - -#define VSMC_DEFINE_RNG_U01_IMPL( \ - UBits, FBits, RealType, Left, Right, left, right) \ - template <> \ - class U01Impl \ - { \ - public: \ - template \ - static RealType eval(UIntType u) \ - { \ - return ::vsmc_u01_##left##_##right##_u##UBits##_f##FBits( \ - static_cast(u)); \ - } \ - }; // class U01Impl - -#define VSMC_DEFINE_RNG_U01(UBits, FBits, RealType) \ - VSMC_DEFINE_RNG_U01_IMPL( \ - UBits, FBits, RealType, Closed, Closed, closed, closed) \ - VSMC_DEFINE_RNG_U01_IMPL( \ - UBits, FBits, RealType, Closed, Open, closed, open) \ - VSMC_DEFINE_RNG_U01_IMPL( \ - UBits, FBits, RealType, Open, Closed, open, closed) \ - VSMC_DEFINE_RNG_U01_IMPL(UBits, FBits, RealType, Open, Open, open, open) namespace vsmc { @@ -65,21 +40,265 @@ namespace vsmc namespace internal { -template -class U01Impl; +template ::digits < + std::numeric_limits::digits ? + std::numeric_limits::digits : + std::numeric_limits::digits) - + 1, + bool = (Q < P)> +class U01ImplPow2L +{ + public: + static constexpr long double value = + static_cast(1ULL << Q) * U01ImplPow2L

::value; +}; // class U01ImplPow2L + +template +class U01ImplPow2L +{ + public: + static constexpr long double value = static_cast(1ULL << P); +}; // class U01ImplPow2L -VSMC_DEFINE_RNG_U01(32, 32, float) -VSMC_DEFINE_RNG_U01(32, 64, double) -VSMC_DEFINE_RNG_U01(64, 32, float) -VSMC_DEFINE_RNG_U01(64, 64, double) +template +class U01ImplPow2InvL +{ + public: + static constexpr long double value = 1.0L / U01ImplPow2L

::value; +}; // class U01ImplPow2InvL + +template +class U01ImplPow2Inv +{ + public: + static constexpr RealType value = + static_cast(U01ImplPow2InvL

::value); +}; // class U01ImplPow2Inv } // namespace vsmc::internal +namespace internal +{ + +template +class U01LRImpl; + +template +class U01LRImpl +{ + static constexpr int W = std::numeric_limits::digits; + static constexpr int M = std::numeric_limits::digits; + static constexpr int P = W - 1 < M ? W - 1 : M; + static constexpr int V = P + 1; + static constexpr int L = V < W ? 1 : 0; + static constexpr int R = V < W ? W - 1 - V : 0; + + public: + static RealType eval(UIntType u) noexcept + { + return trans((u << L) >> (R + L), + std::integral_constant()) * + U01ImplPow2Inv::value; + } + + static void eval(std::size_t n, const UIntType *u, RealType *r) noexcept + { + for (std::size_t i = 0; i != n; ++i) { + r[i] = trans((u[i] << L) >> (R + L), + std::integral_constant()); + } + mul(n, U01ImplPow2Inv::value, r, r); + } + + private: + static RealType trans(UIntType u, std::true_type) noexcept + { + return static_cast((u & 1) + u); + } + + static RealType trans(UIntType u, std::false_type) noexcept + { + return static_cast(u & 1) + static_cast(u); + } +}; // class U01LRImpl + +template +class U01LRImpl +{ + static constexpr int W = std::numeric_limits::digits; + static constexpr int M = std::numeric_limits::digits; + static constexpr int P = W < M ? W : M; + static constexpr int R = W - P; + + public: + static RealType eval(UIntType u) noexcept + { + return static_cast(u >> R) * + U01ImplPow2Inv::value; + } + + static void eval(std::size_t n, const UIntType *u, RealType *r) noexcept + { + for (std::size_t i = 0; i != n; ++i) + r[i] = u[i] >> R; + mul(n, U01ImplPow2Inv::value, r, r); + } +}; // class U01LRImpl + +template +class U01LRImpl +{ + static constexpr int W = std::numeric_limits::digits; + static constexpr int M = std::numeric_limits::digits; + static constexpr int P = W < M ? 
W : M; + static constexpr int R = W - P; + + public: + static RealType eval(UIntType u) noexcept + { + return static_cast(u >> R) * + U01ImplPow2Inv::value + + U01ImplPow2Inv::value; + } + + static void eval(std::size_t n, const UIntType *u, RealType *r) noexcept + { + for (std::size_t i = 0; i != n; ++i) + r[i] = u[i] >> R; + fma(n, U01ImplPow2Inv::value, r, + U01ImplPow2Inv::value, r); + } +}; // class U01LRImpl + +template +class U01LRImpl +{ + static constexpr int W = std::numeric_limits::digits; + static constexpr int M = std::numeric_limits::digits; + static constexpr int P = W + 1 < M ? W + 1 : M; + static constexpr int R = W + 1 - P; + + public: + static RealType eval(UIntType u) noexcept + { + return static_cast(u >> R) * + U01ImplPow2Inv::value + + U01ImplPow2Inv::value; + } + + static void eval(std::size_t n, const UIntType *u, RealType *r) noexcept + { + for (std::size_t i = 0; i != n; ++i) + r[i] = u[i] >> R; + fma(n, U01ImplPow2Inv::value, r, + U01ImplPow2Inv::value, r); + } +}; // class U01LRImpl + +} // namespace vsmc::internal + +/// \brief Convert uniform unsigned integers to floating points within [0, 1] +/// \ingroup RNG +/// +/// \details +/// Let \f$W\f$ be the number of digits of unsigned integer type `UIntType`. +/// Let \f$M\f$ be the number of significant digits of floating point type +/// `RealType`. Assuming the input is a uniform random number on the set +/// \f$\{0,1,\dots,2^W - 1\}\f$, the output is uniform over the interval +/// \f$[0,1]\f$ or one of its (half-)open interval variants. The exact output +/// depends on the template parameters `Left` and `Right`. 
template -class U01 - : public internal::U01Impl +RealType u01_lr(UIntType u) noexcept +{ + static_assert(std::is_unsigned::value, + "**u01_lr** USED WITH UIntType OTHER THAN UNSIGNED INTEGER " + "TYPES"); + static_assert(std::is_floating_point::value, + "**u01_lr** USED WITH RealType OTHER THAN FLOATING POINT " + "TYPES"); + + return internal::U01LRImpl::eval(u); +} + +/// \brief Convert uniform unsigned integers to floating points within [0, 1] +/// \ingroup RNG +template +void u01_lr(std::size_t n, const UIntType *u, RealType *r) noexcept +{ + static_assert(std::is_unsigned::value, + "**u01_lr** USED WITH UIntType OTHER THAN UNSIGNED INTEGER " + "TYPES"); + static_assert(std::is_floating_point::value, + "**u01_lr** USED WITH RealType OTHER THAN FLOATING POINT " + "TYPES"); + + internal::U01LRImpl::eval(n, u, r); +} + +/// \brief Convert uniform unsigned integers to floating points on [0, 1] +/// \ingroup RNG +template +RealType u01_cc(UIntType u) noexcept +{ + return u01_lr(u); +} + +/// \brief Convert uniform unsigned integers to floating points on [0, 1) +/// \ingroup RNG +template +RealType u01_co(UIntType u) noexcept +{ + return u01_lr(u); +} + +/// \brief Convert uniform unsigned integers to floating points on (0, 1] +/// \ingroup RNG +template +RealType u01_oc(UIntType u) noexcept +{ + return u01_lr(u); +} + +/// \brief Convert uniform unsigned integers to floating points on (0, 1) +/// \ingroup RNG +template +RealType u01_oo(UIntType u) noexcept +{ + return u01_lr(u); +} + +/// \brief Convert uniform unsigned integers to floating points on [0, 1] +/// \ingroup RNG +template +void u01_cc(std::size_t n, const UIntType *u, RealType *r) noexcept +{ + u01_lr(n, u, r); +} + +/// \brief Convert uniform unsigned integers to floating points on [0, 1) +/// \ingroup RNG +template +void u01_co(std::size_t n, const UIntType *u, RealType *r) noexcept +{ + u01_lr(n, u, r); +} + +/// \brief Convert uniform unsigned integers to floating points on (0, 1] +/// \ingroup RNG 
+template +void u01_oc(std::size_t n, const UIntType *u, RealType *r) noexcept +{ + u01_lr(n, u, r); +} + +/// \brief Convert uniform unsigned integers to floating points on (0, 1) +/// \ingroup RNG +template +void u01_oo(std::size_t n, const UIntType *u, RealType *r) noexcept { -}; // class U01 + u01_lr(n, u, r); +} } // namespace vsmc diff --git a/include/vsmc/rng/u01_distribution.hpp b/include/vsmc/rng/u01_distribution.hpp index 6a8e5cae6..b646f1bcc 100644 --- a/include/vsmc/rng/u01_distribution.hpp +++ b/include/vsmc/rng/u01_distribution.hpp @@ -3,7 +3,7 @@ //---------------------------------------------------------------------------- // vSMC: Scalable Monte Carlo //---------------------------------------------------------------------------- -// Copyright (c) 2013-2015, Yan Zhou +// Copyright (c) 2013-2016, Yan Zhou // All rights reserved. // // Redistribution and use in source and binary forms, with or without @@ -39,223 +39,272 @@ namespace vsmc { -/// \brief Standard uniform distribution with open/closed variants +namespace internal +{ + +template +using U01UIntType = typename std::conditional<(RNGBits::value > 32), + std::uint64_t, std::uint32_t>::type; + +} // namespace vsmc::internal + +/// \brief Standard uniform distribution /// \ingroup Distribution -/// -/// \tparam RealType The floating points type of results -/// \tparam Left Shall the left side of the interval be Open or Closed -/// \tparam Right Shall the right side of the interval be Open or Closed -template -class U01LRDistribution +template +class U01Distribution { + VSMC_DEFINE_RNG_DISTRIBUTION_0( + U01, u01, RealType, floating_point, FLOATING_POINT) + public: - using result_type = RealType; - using distribution_type = U01LRDistribution; + result_type min() const { return 0; } - class param_type - { - public: - using result_type = RealType; - using distribution_type = U01LRDistribution; - - friend bool operator==(const param_type &, const param_type &) - { - return true; - } - - friend bool 
operator!=(const param_type &, const param_type &) - { - return false; - } - - template - friend std::basic_ostream &operator<<( - std::basic_ostream &os, const param_type &) - { - return os; - } - - template - friend std::basic_istream &operator>>( - std::basic_istream &is, param_type &) - { - return is; - } - }; // class param_type - - U01LRDistribution() {} - explicit U01LRDistribution(const param_type &) {} - - result_type min VSMC_MNE() const { return 0; } - result_type max VSMC_MNE() const { return 1; } + result_type max() const { return 1; } void reset() {} + private: template - result_type operator()(RNGType &rng) + result_type generate(RNGType &rng, const param_type &) { - using uint_type = - typename std::conditional::value >= 64, - std::uint64_t, std::uint32_t>::type; - using flt_type = typename std::conditional< - std::is_same::value || - std::is_same::value, - result_type, double>::type; + return u01_co, result_type>( + UniformBits>::eval(rng)); + } +}; // class U01Distribution - UniformBitsDistribution rbits; +/// \brief Standard uniform distribution on [0, 1] +/// \ingroup Distribution +template +class U01CCDistribution +{ + VSMC_DEFINE_RNG_DISTRIBUTION_0( + U01CC, u01_cc, RealType, floating_point, FLOATING_POINT) - return U01::eval(rbits(rng)); - } + public: + result_type min() const { return 0; } - template - result_type operator()(RNGType &rng, const param_type &) - { - return operator()(rng); - } + result_type max() const { return 1; } + void reset() {} + + private: template - void operator()(RNGType &rng, std::size_t n, result_type *r) + result_type generate(RNGType &rng, const param_type &) { - u01_lr_distribution(rng, n, r); + return u01_cc, result_type>( + UniformBits>::eval(rng)); } +}; // class U01CODistribution +/// \brief Standard uniform distribution on [0, 1) +/// \ingroup Distribution +template +class U01CODistribution +{ + VSMC_DEFINE_RNG_DISTRIBUTION_0( + U01CO, u01_co, RealType, floating_point, FLOATING_POINT) + + public: + 
result_type min() const { return 0; } + + result_type max() const { return 1; } + + void reset() {} + + private: template - void operator()( - RNGType &rng, std::size_t n, result_type *r, const param_type &) + result_type generate(RNGType &rng, const param_type &) { - u01_lr_distribution(rng, n, r); + return u01_co, result_type>( + UniformBits>::eval(rng)); } +}; // class U01CODistribution - friend bool operator==(const U01LRDistribution &, - const U01LRDistribution &) - { - return true; - } +/// \brief Standard uniform distribution on (0, 1] +/// \ingroup Distribution +template +class U01OCDistribution +{ + VSMC_DEFINE_RNG_DISTRIBUTION_0( + U01OC, u01_oc, RealType, floating_point, FLOATING_POINT) - friend bool operator!=(const U01LRDistribution &, - const U01LRDistribution &) - { - return false; - } + public: + result_type min() const { return 0; } - template - friend std::basic_ostream &operator<<( - std::basic_ostream &os, - const U01LRDistribution &) - { - return os; - } + result_type max() const { return 1; } + + void reset() {} - template - friend std::basic_istream &operator>>( - std::basic_istream &is, - U01LRDistribution &) + private: + template + result_type generate(RNGType &rng, const param_type &) { - return is; + return u01_oc, result_type>( + UniformBits>::eval(rng)); } -}; // class U01LRDistribution +}; // class U01CODistribution -/// \brief Standard uniform distribution on cloed-closed interval +/// \brief Standard uniform distribution on (0, 1) /// \ingroup Distribution -template -using U01CCDistribution = U01LRDistribution; +template +class U01OODistribution +{ + VSMC_DEFINE_RNG_DISTRIBUTION_0( + U01OO, u01_oo, RealType, floating_point, FLOATING_POINT) -/// \brief Standard uniform distribution on cloed-open interval -/// \ingroup Distribution -template -using U01OODistribution = U01LRDistribution; + public: + result_type min() const { return 0; } -/// \brief Standard uniform distribution on open-closed interval -/// \ingroup Distribution 
-template <typename RealType>
-using U01CODistribution = U01LRDistribution<Closed, Open, RealType>;
+    result_type max() const { return 1; }

-/// \brief Standard uniform distribution on open-open interval
-/// \ingroup Distribution
-template <typename RealType>
-using U01OCDistribution = U01LRDistribution<Open, Closed, RealType>;
+    void reset() {}

-/// \brief Standard uniform distribution
-/// \ingroup Distribution
-template <typename RealType>
-using U01Distribution = U01CODistribution<RealType>;
+    private:
+    template <typename RNGType>
+    result_type generate(RNGType &rng, const param_type &)
+    {
+        return u01_oo<U01UIntType<RNGType>, result_type>(
+            UniformBits<U01UIntType<RNGType>>::eval(rng));
+    }
+}; // class U01OODistribution

 namespace internal
 {

-template <std::size_t K, typename Left, typename Right, typename RealType,
-    typename RNGType>
-inline void u01_lr_distribution_impl(RNGType &rng, std::size_t n, RealType *r)
-{
-    uint32_t s[K];
-    uniform_bits_distribution(rng, n, s);
-    for (std::size_t i = 0; i != n; ++i)
-        r[i] = U01<Left, Right, uint32_t, RealType>::eval(s[i]);
-}
+template <std::size_t K, typename RealType, typename RNGType>
+inline void u01_distribution_impl(RNGType &rng, std::size_t n, RealType *r)
+{
+    U01UIntType<RNGType> s[K];
+    uniform_bits_distribution(rng, n, s);
+    u01_co<U01UIntType<RNGType>, RealType>(n, s, r);
+}
+
+template <std::size_t K, typename RealType, typename RNGType>
+inline void u01_cc_distribution_impl(RNGType &rng, std::size_t n, RealType *r)
+{
+    U01UIntType<RNGType> s[K];
+    uniform_bits_distribution(rng, n, s);
+    u01_cc<U01UIntType<RNGType>, RealType>(n, s, r);
+}
+
+template <std::size_t K, typename RealType, typename RNGType>
+inline void u01_co_distribution_impl(RNGType &rng, std::size_t n, RealType *r)
+{
+    U01UIntType<RNGType> s[K];
+    uniform_bits_distribution(rng, n, s);
+    u01_co<U01UIntType<RNGType>, RealType>(n, s, r);
+}
+
+template <std::size_t K, typename RealType, typename RNGType>
+inline void u01_oc_distribution_impl(RNGType &rng, std::size_t n, RealType *r)
+{
+    U01UIntType<RNGType> s[K];
+    uniform_bits_distribution(rng, n, s);
+    u01_oc<U01UIntType<RNGType>, RealType>(n, s, r);
+}
+
+template <std::size_t K, typename RealType, typename RNGType>
+inline void u01_oo_distribution_impl(RNGType &rng, std::size_t n, RealType *r)
+{
+    U01UIntType<RNGType> s[K];
+    uniform_bits_distribution(rng, n, s);
+    u01_oo<U01UIntType<RNGType>, RealType>(n, s, r);
+}

 } // namespace vsmc::internal

-/// \brief Generate standard uniform random variates with open/closed variants
+/// \brief Generate standard uniform random variates
 /// \ingroup Distribution
-template <typename Left, typename Right, typename RealType, typename RNGType>
-inline void u01_lr_distribution(RNGType &rng, std::size_t n, RealType *r)
+template <typename RealType, typename RNGType>
+inline void u01_distribution(RNGType &rng, std::size_t n, RealType *r)
 {
-    const std::size_t k = 1000;
+    static_assert(std::is_floating_point<RealType>::value,
+        "**u01_distribution** USED WITH RealType OTHER THAN FLOATING POINT "
+        "TYPES");
+
+    const std::size_t k = 1024;
     const std::size_t m = n / k;
     const std::size_t l = n % k;
-    for (std::size_t i = 0; i != m; ++i, r += k) {
-        internal::u01_lr_distribution_impl<k, Left, Right>(
-            rng, k, r);
-    }
-    internal::u01_lr_distribution_impl<k, Left, Right>(rng, l, r);
+    for (std::size_t i = 0; i != m; ++i, r += k)
+        internal::u01_distribution_impl<k>(rng, k, r);
+    internal::u01_distribution_impl<k>(rng, l, r);
 }

-/// \brief Generate standard uniform random variates on closed-closed interval
+/// \brief Generate standard uniform random variates on [0, 1]
 /// \ingroup Distribution
 template <typename RealType, typename RNGType>
 inline void u01_cc_distribution(RNGType &rng, std::size_t n, RealType *r)
 {
-    u01_lr_distribution<Closed, Closed>(rng, n, r);
+    static_assert(std::is_floating_point<RealType>::value,
+        "**u01_cc_distribution** USED WITH RealType OTHER THAN FLOATING POINT "
+        "TYPES");
+
+    const std::size_t k = 1024;
+    const std::size_t m = n / k;
+    const std::size_t l = n % k;
+    for (std::size_t i = 0; i != m; ++i, r += k)
+        internal::u01_cc_distribution_impl<k>(rng, k, r);
+    internal::u01_cc_distribution_impl<k>(rng, l, r);
 }

-/// \brief Generate standard uniform random variates on closed-open interval
+/// \brief Generate standard uniform random variates on [0, 1)
 /// \ingroup Distribution
 template <typename RealType, typename RNGType>
 inline void u01_co_distribution(RNGType &rng, std::size_t n, RealType *r)
 {
-    u01_lr_distribution<Closed, Open>(rng, n, r);
+    static_assert(std::is_floating_point<RealType>::value,
+        "**u01_co_distribution** USED WITH RealType OTHER THAN FLOATING POINT "
+        "TYPES");
+
+    const std::size_t k = 1024;
+    const std::size_t m = n / k;
+    const std::size_t l = n % k;
+    for (std::size_t i = 0; i != m; ++i, r += k)
+        internal::u01_co_distribution_impl<k>(rng, k, r);
+    internal::u01_co_distribution_impl<k>(rng, l, r);
 }

-/// \brief Generate standard uniform random variates on open-closed interval
+/// \brief Generate standard uniform random variates on (0, 1]
 /// \ingroup Distribution
 template <typename RealType, typename RNGType>
 inline void u01_oc_distribution(RNGType &rng, std::size_t n, RealType *r)
 {
-    u01_lr_distribution<Open, Closed>(rng, n, r);
+    static_assert(std::is_floating_point<RealType>::value,
+        "**u01_oc_distribution** USED WITH RealType OTHER THAN FLOATING POINT "
+        "TYPES");
+
+    const std::size_t k = 1024;
+    const std::size_t m = n / k;
+    const std::size_t l = n % k;
+    for (std::size_t i = 0; i != m; ++i, r += k)
+        internal::u01_oc_distribution_impl<k>(rng, k, r);
+    internal::u01_oc_distribution_impl<k>(rng, l, r);
 }

-/// \brief Generate standard uniform random variates on open-open interval
+/// \brief Generate standard uniform random variates on (0, 1)
 /// \ingroup Distribution
 template <typename RealType, typename RNGType>
 inline void u01_oo_distribution(RNGType &rng, std::size_t n, RealType *r)
 {
-    u01_lr_distribution<Open, Open>(rng, n, r);
-}
+    static_assert(std::is_floating_point<RealType>::value,
+        "**u01_oo_distribution** USED WITH RealType OTHER THAN FLOATING POINT "
+        "TYPES");

-/// \brief Generate standard uniform random variates
-/// \ingroup Distribution
-template <typename RealType, typename RNGType>
-inline void u01_distribution(RNGType &rng, std::size_t n, RealType *r)
-{
-    u01_co_distribution(rng, n, r);
+    const std::size_t k = 1024;
+    const std::size_t m = n / k;
+    const std::size_t l = n % k;
+    for (std::size_t i = 0; i != m; ++i, r += k)
+        internal::u01_oo_distribution_impl<k>(rng, k, r);
+    internal::u01_oo_distribution_impl<k>(rng, l, r);
 }

-template <typename Left, typename Right, typename RealType, typename RNGType>
-inline void rng_rand(RNGType &rng,
-    U01LRDistribution<Left, Right, RealType> &dist, std::size_t n, RealType *r)
-{
-    dist(rng, n, r);
-}
+VSMC_DEFINE_RNG_DISTRIBUTION_RAND_0(U01, u01, RealType)
+VSMC_DEFINE_RNG_DISTRIBUTION_RAND_0(U01CC, u01_cc, RealType)
+VSMC_DEFINE_RNG_DISTRIBUTION_RAND_0(U01CO, u01_co, RealType)
+VSMC_DEFINE_RNG_DISTRIBUTION_RAND_0(U01OC, u01_oc, RealType)
+VSMC_DEFINE_RNG_DISTRIBUTION_RAND_0(U01OO, u01_oo, RealType)

 } // namespace vsmc

diff --git a/include/vsmc/rng/u01_sequence.hpp
b/include/vsmc/rng/u01_sequence.hpp
index 98532e141..ba69945ad 100644
--- a/include/vsmc/rng/u01_sequence.hpp
+++ b/include/vsmc/rng/u01_sequence.hpp
@@ -3,7 +3,7 @@
 //----------------------------------------------------------------------------
 // vSMC: Scalable Monte Carlo
 //----------------------------------------------------------------------------
-// Copyright (c) 2013-2015, Yan Zhou
+// Copyright (c) 2013-2016, Yan Zhou
 // All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without
@@ -99,10 +99,10 @@ inline void u01_systematic(std::size_t N, RealType u01, RealType *u01seq)
 /// ~~~{.cpp}
 /// const std::size_t N = 1000;
 /// std::mt19937 rng;
-/// std::uniform_real_distribution<double> runif;
+/// std::uniform_real_distribution<double> ru01;
 /// std::vector<double> u01(N);
 /// for (std::size_t i = 0; i != N; ++i)
-///     u01[i] = runif(rng);
+///     u01[i] = ru01(rng);
 /// std::sort(u01.begin(), u01.end());
 /// for (std::size_t i = 0; i != N; ++i)
 ///     do_something_with_u01(u01[i]);
@@ -145,7 +145,8 @@ class U01SequenceSorted
         if (n == n_)
             return u_;

-        lmax_ += std::log(runif_(rng_)) / (N_ - n);
+        U01Distribution<RealType> u01;
+        lmax_ += std::log(u01(rng_)) / (N_ - n);
         n_ = n;
         u_ = 1 - std::exp(lmax_);
@@ -160,7 +161,6 @@ class U01SequenceSorted
     result_type u_;
     result_type lmax_;
     RNGType &rng_;
-    U01OODistribution<RealType> runif_;
 }; // U01SequenceSorted

 /// \brief Generate a fixed length sequence of uniform \f$[0,1)\f$ random
@@ -195,8 +195,9 @@ class U01SequenceStratified
         if (n == n_)
             return u_;

+        U01Distribution<RealType> u01;
         n_ = n;
-        u_ = runif_(rng_) * delta_ + n * delta_;
+        u_ = u01(rng_) * delta_ + n * delta_;
         return u_;
     }
@@ -209,7 +210,6 @@ class U01SequenceStratified
     result_type u_;
     result_type delta_;
     RNGType &rng_;
-    U01CODistribution<RealType> runif_;
 }; // class U01SequenceStratified

 /// \brief Generate a fixed length sequence of uniform \f$[0,1)\f$ random
@@ -231,8 +231,8 @@ class U01SequenceSystematic
     U01SequenceSystematic(std::size_t N, RNGType &rng) : N_(N), n_(N), u_(0),
u0_(0), delta_(1 / static_cast(N)) { - U01CODistribution runif; - u0_ = runif(rng) * delta_; + U01Distribution u01; + u0_ = u01(rng) * delta_; } result_type operator[](std::size_t n) diff --git a/include/vsmc/rng/uniform_bits_distribution.hpp b/include/vsmc/rng/uniform_bits_distribution.hpp index f0272aa60..1ceba4e4a 100644 --- a/include/vsmc/rng/uniform_bits_distribution.hpp +++ b/include/vsmc/rng/uniform_bits_distribution.hpp @@ -3,7 +3,7 @@ //---------------------------------------------------------------------------- // vSMC: Scalable Monte Carlo //---------------------------------------------------------------------------- -// Copyright (c) 2013-2015, Yan Zhou +// Copyright (c) 2013-2016, Yan Zhou // All rights reserved. // // Redistribution and use in source and binary forms, with or without @@ -37,249 +37,138 @@ namespace vsmc { -namespace internal -{ - -template +/// \brief Generate uniform bits of given type +/// \ingroup RNG +/// +/// \details +/// For a given unsigned integer type `UIntType` with \f$W\f$ bits, the output +/// will be unsigned integers on the set \f$\{0,\dots,2^W - 1\}\f$ regardless +/// of the range of the input `RNGType` +template class UniformBits { + static_assert(std::is_unsigned::value, + "**UniformBits** USED WITH UIntType OTHER THAN UNSIGNED INTEGER " + "TYPES"); + public: template static UIntType eval(RNGType &rng) { - return eval(rng, - std::integral_constant::value == 0>(), - std::integral_constant::value >= Bits>()); + static constexpr int w = std::numeric_limits::digits; + static constexpr int p = RNGBits::value; + + return eval(rng, std::integral_constant()); } private: template - static UIntType eval(RNGType &rng, std::true_type, std::true_type) + static UIntType eval(RNGType &rng, std::true_type) { - return static_cast(rng()); - } + static constexpr int r = RNGMinBits::value; - template - static UIntType eval(RNGType &rng, std::false_type, std::true_type) - { - return static_cast(rng() >> RNGMinBits::value); + return 
static_cast(rng() >> r); } template - static UIntType eval(RNGType &rng, std::true_type, std::false_type) + static UIntType eval(RNGType &rng, std::false_type) { - return static_cast( - patch<0, RNGBits::value, RNGMinBits::value>( - rng, std::true_type())); + return patch<0>(rng, std::true_type()); } - template - static UIntType eval(RNGType &rng, std::false_type, std::false_type) - { - return eval(rng, std::true_type(), std::false_type()); - } - - template + template static UIntType patch(RNGType &, std::false_type) { return 0; } - template + template static UIntType patch(RNGType &rng, std::true_type) { - return static_cast((rng() >> R) << (B * N)) + - patch( - rng, std::integral_constant()); + static constexpr int w = std::numeric_limits::digits; + static constexpr int v = + std::numeric_limits::digits; + static constexpr int l = v - RNGMaxBits::value; + static constexpr int r = l + RNGMinBits::value; + static constexpr int p = N * RNGBits::value; + static constexpr int q = p + RNGBits::value; + + UIntType u = static_cast((rng() << l) >> r); + + return (u << p) + + patch(rng, std::integral_constant()); } }; // class UniformBits -} // namespace vsmc::internal - /// \brief Uniform bits distribution /// \ingroup Distribution +/// +/// \details +/// For a given unsigned integer type `UIntType` with \f$W\f$ bits, the output +/// will be unsigned integers on the set \f$\{0,\dots,2^W - 1\}\f$ regardless +/// of the range of the input `RNGType` template class UniformBitsDistribution { - public: - using result_type = UIntType; - using distribution_type = UniformBitsDistribution; + VSMC_DEFINE_RNG_DISTRIBUTION_0( + UniformBits, uniform_bits, UIntType, unsigned, UNSIGNED) - class param_type - { - public: - using result_type = UIntType; - using distribution_type = UniformBitsDistribution; - - friend bool operator==(const param_type &, const param_type &) - { - return true; - } - - friend bool operator!=(const param_type &, const param_type &) - { - return false; - } - - 
template - friend std::basic_ostream &operator<<( - std::basic_ostream &os, const param_type &) - { - return os; - } - - template - friend std::basic_istream &operator>>( - std::basic_istream &is, param_type &) - { - return is; - } - }; // class param_type - - UniformBitsDistribution() {} - explicit UniformBitsDistribution(const param_type &) {} - - result_type min VSMC_MNE() const - { - return std::numeric_limits::min VSMC_MNE(); - } + public: + result_type min() const { return std::numeric_limits::min(); } - result_type max VSMC_MNE() const - { - return std::numeric_limits::max VSMC_MNE(); - } + result_type max() const { return std::numeric_limits::max(); } void reset() {} + private: template - result_type operator()(RNGType &rng) - { - return internal::UniformBits::value>::eval(rng); - } - - template - result_type operator()(RNGType &rng, const param_type &) - { - return operator()(rng); - } - - template - void operator()(RNGType &rng, std::size_t n, result_type *r) - { - uniform_bits_distribution(rng, n, r); - } - - template - void operator()( - RNGType &rng, std::size_t n, result_type *r, const param_type &) - { - uniform_bits_distribution(rng, n, r); - } - - friend bool operator==(const UniformBitsDistribution &, - const UniformBitsDistribution &) - { - return true; - } - - friend bool operator!=(const UniformBitsDistribution &, - const UniformBitsDistribution &) - { - return false; - } - - template - friend std::basic_ostream &operator<<( - std::basic_ostream &os, - const UniformBitsDistribution &) - { - return os; - } - - template - friend std::basic_istream &operator>>( - std::basic_istream &is, - UniformBitsDistribution &) + result_type generate(RNGType &rng, const param_type &) { - return is; + return UniformBits::eval(rng); } }; // class UniformBitsDistribution namespace internal { -template -inline void uniform_bits_distribution_impl(RNGType &rng, std::size_t n, - UIntType *r, std::false_type, std::integral_constant, - std::integral_constant) +template 
+inline void uniform_bits_distribution_impl( + RNGType &rng, std::size_t n, UIntType *r, std::false_type) { for (std::size_t i = 0; i != n; ++i) - r[i] = UniformBits::value>::eval(rng); + r[i] = UniformBits::eval(rng); } template -inline void uniform_bits_distribution_impl(RNGType &rng, std::size_t n, - UIntType *r, std::true_type, std::true_type, std::true_type) +inline void uniform_bits_distribution_impl( + RNGType &rng, std::size_t n, UIntType *r, std::true_type) { rng_rand(rng, n, reinterpret_cast(r)); } -template -inline void uniform_bits_distribution_impl(RNGType &rng, std::size_t n, - UIntType *r, std::true_type, std::true_type, std::false_type) -{ - const std::size_t k = - sizeof(typename RNGType::result_type) / sizeof(UIntType); - const std::size_t m = n / k; - const std::size_t l = n % k; - rng_rand(rng, m, reinterpret_cast(r)); - n -= m * k; - r += m * k; - for (std::size_t i = 0; i != l; ++i) - r[i] = UniformBits::value>::eval(rng); -} - -template -inline void uniform_bits_distribution_impl(RNGType &rng, std::size_t n, - UIntType *r, std::true_type, std::false_type, std::true_type) -{ - const std::size_t k = - sizeof(UIntType) / sizeof(typename RNGType::result_type); - const std::size_t m = n * k; - rng_rand(rng, m, reinterpret_cast(r)); -} - -template -inline void uniform_bits_distribution_impl(RNGType &rng, std::size_t n, - UIntType *r, std::true_type, std::false_type, std::false_type) -{ - for (std::size_t i = 0; i != n; ++i) - r[i] = UniformBits::value>::eval(rng); -} - -} // namespace vsmc::rng_type +} // namespace vsmc::internal template inline void uniform_bits_distribution(RNGType &rng, std::size_t n, UIntType *r) { - const int mbits = internal::RNGMinBits::value; - const int rbits = internal::RNGBits::value; - const int ubits = internal::IntBits::value; - internal::uniform_bits_distribution_impl(rng, n, r, - std::integral_constant(), - std::integral_constant < bool, - rbits >= ubits && rbits % ubits == 0 > (), - std::integral_constant < bool, - 
ubits >= rbits && ubits % rbits == 0 > ()); + static_assert(std::is_unsigned::value, + "**uniform_bits_distribution** USED WITH UIntType OTHER THAN UNSIGNED " + "TYPES"); + + static constexpr bool zero_min = RNGMinBits::value == 0; + static constexpr bool eq_bits = + RNGBits::value == std::numeric_limits::digits; + static constexpr bool eq_size = + sizeof(typename RNGType::result_type) == sizeof(UIntType); + static constexpr bool eq_align = + alignof(typename RNGType::result_type) == alignof(UIntType); + + internal::uniform_bits_distribution_impl( + rng, n, r, std::integral_constant()); } -template -inline void rng_rand(RNGType &rng, UniformBitsDistribution &dist, - std::size_t n, UIntType *r) -{ - dist(rng, n, r); -} +VSMC_DEFINE_RNG_DISTRIBUTION_RAND_0(UniformBits, uniform_bits, UIntType) } // namespace vsmc diff --git a/include/vsmc/rng/uniform_real_distribution.hpp b/include/vsmc/rng/uniform_real_distribution.hpp index 105d6109e..5e306cd69 100644 --- a/include/vsmc/rng/uniform_real_distribution.hpp +++ b/include/vsmc/rng/uniform_real_distribution.hpp @@ -3,7 +3,7 @@ //---------------------------------------------------------------------------- // vSMC: Scalable Monte Carlo //---------------------------------------------------------------------------- -// Copyright (c) 2013-2015, Yan Zhou +// Copyright (c) 2013-2016, Yan Zhou // All rights reserved. 
// // Redistribution and use in source and binary forms, with or without @@ -35,195 +35,52 @@ #include #include -#define VSMC_RUNTIME_ASSERT_RNG_UNIFORM_REAL_DISTRIBUTION_PARAM_CHECK(a, b) \ - VSMC_RUNTIME_ASSERT((a <= b), "**UniformRealDistribution** CONSTRUCTED " \ - "WITH INVALID MINIMUM AND MAXIMUM " \ - "PARAMTER VALUES") - namespace vsmc { -/// \brief Uniform real distribution with open/closed variants -/// \ingroup Distribution -/// -/// \tparam RealType The floating points type of results -/// \tparam Left Shall the left side of the interval be Open or Closed -/// \tparam Right Shall the right side of the interval be Open or Closed -template -class UniformRealLRDistribution +namespace internal { - public: - using result_type = RealType; - using distribution_type = UniformRealLRDistribution; - - class param_type - { - public: - using result_type = RealType; - using distribution_type = - UniformRealLRDistribution; - - param_type(result_type a, result_type b) : a_(a), b_(b) - { - invariant(); - } - - result_type a() const { return a_; } - result_type b() const { return b_; } - - friend bool operator==( - const param_type ¶m1, const param_type ¶m2) - { - if (!is_equal(param1.a_, param2.a_)) - return false; - if (!is_equal(param1.b_, param2.b_)) - return false; - return true; - } - - friend bool operator!=( - const param_type ¶m1, const param_type ¶m2) - { - return !(param1 == param2); - } - - template - friend std::basic_ostream &operator<<( - std::basic_ostream &os, const param_type ¶m) - { - if (!os.good()) - return os; - - os << param.a_ << ' '; - os << param.b_; - return os; - } - - template - friend std::basic_istream &operator>>( - std::basic_istream &is, param_type ¶m) - { - if (!is.good()) - return is; - - result_type a = 1; - result_type b = 0; - is >> std::ws >> a; - is >> std::ws >> b; - - if (is.good()) { - if (a <= b) - param = param_type(a, b); - else - is.setstate(std::ios_base::failbit); - } - - return is; - } - - private: - result_type a_; - 
result_type b_; - - friend distribution_type; - - void invariant() - { - VSMC_RUNTIME_ASSERT_RNG_UNIFORM_REAL_DISTRIBUTION_PARAM_CHECK( - a_, b_); - } - - void reset() {} - }; // class param_type +template +bool uniform_real_distribution_check_param(RealType a, RealType b) +{ + return a <= b; +} - explicit UniformRealLRDistribution(result_type a = 0, result_type b = 1) - : param_(a, b) - { - } +} // namespace vsmc::internal - explicit UniformRealLRDistribution(const param_type ¶m) : param_(param) - { - } +/// \brief Uniform real distribution +/// \ingroup Distribution +template +class UniformRealDistribution +{ + VSMC_DEFINE_RNG_DISTRIBUTION_2(UniformReal, uniform_real, a, 0, b, 1) - result_type a() const { return param_.a(); } - result_type b() const { return param_.b(); } + public: + result_type min() const { return a(); } - result_type min VSMC_MNE() const { return a(); } - result_type max VSMC_MNE() const { return b(); } + result_type max() const { return b(); } void reset() {} + private: template - result_type operator()(RNGType &rng) - { - return operator()(rng, param_); - } - - template - result_type operator()(RNGType &rng, const param_type ¶m) + result_type generate(RNGType &rng, const param_type ¶m) { - U01LRDistribution u01; + U01Distribution u01; return param.a() + (param.b() - param.a()) * u01(rng); } - - template - void operator()(RNGType &rng, std::size_t n, result_type *r) - { - return operator()(rng, n, r, param_); - } - - template - void operator()( - RNGType &rng, std::size_t n, result_type *r, const param_type ¶m) - { - uniform_real_lr_distribution( - rng, n, r, param.a(), param.b()); - } - - VSMC_DEFINE_RNG_DISTRIBUTION_OPERATORS - - private: - param_type param_; -}; // class UniformRealLRDistribution - -/// \brief Uniform real distribution on cloed-closed interval -/// \ingroup Distribution -template -using UniformRealCCDistribution = - UniformRealLRDistribution; - -/// \brief Uniform real distribution on cloed-open interval -/// \ingroup 
Distribution -template -using UniformRealOODistribution = - UniformRealLRDistribution; - -/// \brief Uniform real distribution on open-closed interval -/// \ingroup Distribution -template -using UniformRealCODistribution = - UniformRealLRDistribution; - -/// \brief Uniform real distribution on open-open interval -/// \ingroup Distribution -template -using UniformRealOCDistribution = - UniformRealLRDistribution; - -/// \brief Uniform real distribution -template -using UniformRealDistribution = UniformRealCODistribution; +}; // class UniformRealDistribution namespace internal { -template -inline void uniform_real_lr_distribution_impl( +template +inline void uniform_real_distribution_impl( RNGType &rng, std::size_t n, RealType *r, RealType a, RealType b) { - u01_lr_distribution(rng, n, r); + u01_distribution(rng, n, r); fma(n, (b - a), r, a, r); } @@ -231,74 +88,24 @@ inline void uniform_real_lr_distribution_impl( /// \brief Generate uniform real random variates with open/closed variants /// \ingroup Distribution -template -inline void uniform_real_lr_distribution( - RNGType &rng, std::size_t n, RealType *r, RealType a, RealType b) -{ - const std::size_t k = 1000; - const std::size_t m = n / k; - const std::size_t l = n % k; - for (std::size_t i = 0; i != m; ++i) { - internal::uniform_real_lr_distribution_impl( - rng, k, r + i * k, a, b); - } - internal::uniform_real_lr_distribution_impl( - rng, l, r + m * k, a, b); -} - -/// \brief Generate uniform real random variates on closed-closed interval -/// \ingroup Distribution -template -inline void uniform_real_cc_distribution( - RNGType &rng, std::size_t n, RealType *r, RealType a, RealType b) -{ - uniform_real_lr_distribution(rng, n, r, a, b); -} - -/// \brief Generate uniform real random variates on closed-open interval -/// \ingroup Distribution -template -inline void uniform_real_co_distribution( - RNGType &rng, std::size_t n, RealType *r, RealType a, RealType b) -{ - uniform_real_lr_distribution(rng, n, r, a, b); -} 
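The replacement `uniform_real_distribution` in this hunk reduces every interval variant to the same two-pass scheme: fill the buffer with standard uniforms, then map them onto `[a, b)` with one fused multiply-add, `r[i] = (b - a) * r[i] + a` (the `fma(n, (b - a), r, a, r)` call in the new code). A minimal sketch of that scheme, using `std::mt19937` and `std::uniform_real_distribution` as stand-ins for the library's RNG and `u01_distribution` layers; `uniform_real_fill` is a hypothetical helper, not part of vSMC:

```cpp
#include <cstddef>
#include <random>
#include <vector>

// Sketch of the two-pass scheme used by the new uniform_real_distribution:
// first draw standard uniforms on [0, 1), then apply one affine pass
// r[i] = (b - a) * r[i] + a, which maps the buffer onto [a, b).
std::vector<double> uniform_real_fill(
    std::size_t n, double a, double b, std::mt19937 &rng)
{
    std::uniform_real_distribution<double> u01(0.0, 1.0);
    std::vector<double> r(n);
    for (double &x : r)
        x = u01(rng);        // the u01_distribution step
    for (double &x : r)
        x = (b - a) * x + a; // the fma(n, b - a, r, a, r) step
    return r;
}
```

Splitting generation from the affine transform is what lets the library hand the second pass to a vectorized `fma` over a whole block at once.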
- -/// \brief Generate uniform real random variates on open-closed interval -/// \ingroup Distribution -template -inline void uniform_real_oc_distribution( - RNGType &rng, std::size_t n, RealType *r, RealType a, RealType b) -{ - uniform_real_lr_distribution(rng, n, r, a, b); -} - -/// \brief Generate uniform real random variates on open-open interval -/// \ingroup Distribution -template -inline void uniform_real_oo_distribution( - RNGType &rng, std::size_t n, RealType *r, RealType a, RealType b) -{ - uniform_real_lr_distribution(rng, n, r, a, b); -} - -/// \brief Generate uniform real random variates -/// \ingroup Distribution template inline void uniform_real_distribution( RNGType &rng, std::size_t n, RealType *r, RealType a, RealType b) { - uniform_real_co_distribution(rng, n, r, a, b); -} + static_assert(std::is_floating_point::value, + "**uniform_real_distribution** USED WITH RealType OTHER THAN FLOATING " + "POINT TYPES"); -template -inline void rng_rand(RNGType &rng, - UniformRealLRDistribution &dist, std::size_t n, - RealType *r) -{ - dist(rng, n, r); + const std::size_t k = 1024; + const std::size_t m = n / k; + const std::size_t l = n % k; + for (std::size_t i = 0; i != m; ++i, r += k) + internal::uniform_real_distribution_impl(rng, k, r, a, b); + internal::uniform_real_distribution_impl(rng, l, r, a, b); } +VSMC_DEFINE_RNG_DISTRIBUTION_RAND_2(UniformReal, uniform_real, a, b) + } // namespace vsmc #endif // VSMC_RNG_UNIFORM_REAL_DISTRIBUTION_HPP diff --git a/include/vsmc/rng/weibull_distribution.hpp b/include/vsmc/rng/weibull_distribution.hpp index e1cd3118c..903746488 100644 --- a/include/vsmc/rng/weibull_distribution.hpp +++ b/include/vsmc/rng/weibull_distribution.hpp @@ -3,7 +3,7 @@ //---------------------------------------------------------------------------- // vSMC: Scalable Monte Carlo //---------------------------------------------------------------------------- -// Copyright (c) 2013-2015, Yan Zhou +// Copyright (c) 2013-2016, Yan Zhou // All 
rights reserved. // // Redistribution and use in source and binary forms, with or without @@ -33,8 +33,8 @@ #define VSMC_RNG_WEIBULL_DISTRIBUTION_HPP #include -#include #include +#include namespace vsmc { @@ -55,17 +55,12 @@ inline bool weibull_distribution_check_param(RealType a, RealType b) template class WeibullDistribution { - VSMC_DEFINE_RNG_DISTRIBUTION_2( - Weibull, weibull, RealType, result_type, a, 1, result_type, b, 1) - VSMC_DEFINE_RNG_DISTRIBUTION_OPERATORS + VSMC_DEFINE_RNG_DISTRIBUTION_2(Weibull, weibull, a, 1, b, 1) public: - result_type min VSMC_MNE() const { return 0; } + result_type min() const { return 0; } - result_type max VSMC_MNE() const - { - return std::numeric_limits::max VSMC_MNE(); - } + result_type max() const { return std::numeric_limits::max(); } void reset() {} @@ -73,10 +68,12 @@ class WeibullDistribution template result_type generate(RNGType &rng, const param_type ¶m) { - U01OODistribution runif; + U01Distribution u01; - return param.b() * - std::exp((1 / param.a()) * std::log(-std::log(runif(rng)))); + return internal::is_equal( + param.a(), static_cast(1)) ? 
+ -param.b() * std::log(1 - u01(rng)) : + param.b() * std::pow(-std::log(1 - u01(rng)), 1 / param.a()); } }; // class WeibullDistribution @@ -87,15 +84,14 @@ template inline void weibull_distribution_impl( RNGType &rng, std::size_t n, RealType *r, RealType a, RealType b) { - u01_oo_distribution(rng, n, r); + u01_distribution(rng, n, r); + sub(n, static_cast(1), r, r); log(n, r, r); - if (is_equal(a, 1)) { + if (is_equal(a, static_cast(1))) { mul(n, -b, r, r); } else { mul(n, static_cast(-1), r, r); - log(n, r, r); - mul(n, 1 / a, r, r); - exp(n, r, r); + pow(n, r, 1 / a, r); mul(n, b, r, r); } } @@ -108,20 +104,19 @@ template inline void weibull_distribution( RNGType &rng, std::size_t n, RealType *r, RealType a, RealType b) { - const std::size_t k = 1000; + static_assert(std::is_floating_point::value, + "**weibull_distribution** USED WITH RealType OTHER THAN FLOATING " + "POINT TYPES"); + + const std::size_t k = 1024; const std::size_t m = n / k; const std::size_t l = n % k; - for (std::size_t i = 0; i != m; ++i) - internal::weibull_distribution_impl(rng, k, r + i * k, a, b); - internal::weibull_distribution_impl(rng, l, r + m * k, a, b); + for (std::size_t i = 0; i != m; ++i, r += k) + internal::weibull_distribution_impl(rng, k, r, a, b); + internal::weibull_distribution_impl(rng, l, r, a, b); } -template -inline void rng_rand(RNGType &rng, WeibullDistribution &dist, - std::size_t n, RealType *r) -{ - dist(rng, n, r); -} +VSMC_DEFINE_RNG_DISTRIBUTION_RAND_2(Weibull, weibull, a, b) } // namespace vsmc diff --git a/include/vsmc/rngc/philox.h b/include/vsmc/rngc/philox.h index f2cb317ba..339244e1f 100644 --- a/include/vsmc/rngc/philox.h +++ b/include/vsmc/rngc/philox.h @@ -3,7 +3,7 @@ //---------------------------------------------------------------------------- // vSMC: Scalable Monte Carlo //---------------------------------------------------------------------------- -// Copyright (c) 2013-2015, Yan Zhou +// Copyright (c) 2013-2016, Yan Zhou // All rights reserved. 
// // Redistribution and use in source and binary forms, with or without @@ -84,7 +84,7 @@ typedef struct { uint32_t index; } vsmc_philox4x32; -VSMC_STATIC_INLINE void vsmc_philox2x32_inc(vsmc_philox2x32_ctr_t *ctr) +static inline void vsmc_philox2x32_inc(vsmc_philox2x32_ctr_t *ctr) { if (++ctr->v[0] != 0) return; @@ -92,7 +92,7 @@ VSMC_STATIC_INLINE void vsmc_philox2x32_inc(vsmc_philox2x32_ctr_t *ctr) return; } -VSMC_STATIC_INLINE void vsmc_philox4x32_inc(vsmc_philox4x32_ctr_t *ctr) +static inline void vsmc_philox4x32_inc(vsmc_philox4x32_ctr_t *ctr) { if (++ctr->v[0] != 0) return; @@ -104,31 +104,31 @@ VSMC_STATIC_INLINE void vsmc_philox4x32_inc(vsmc_philox4x32_ctr_t *ctr) return; } -VSMC_STATIC_INLINE void vsmc_philox2x32_initpar( +static inline void vsmc_philox2x32_initpar( const vsmc_philox2x32_key_t *key, vsmc_philox2x32_par_t *par) { par->v[0] = key->v[0]; } -VSMC_STATIC_INLINE void vsmc_philox4x32_initpar( +static inline void vsmc_philox4x32_initpar( const vsmc_philox4x32_key_t *key, vsmc_philox4x32_par_t *par) { par->v[0] = key->v[0]; par->v[1] = key->v[1]; } -VSMC_STATIC_INLINE void vsmc_philox2x32_bumpkey(vsmc_philox2x32_par_t *par) +static inline void vsmc_philox2x32_bumpkey(vsmc_philox2x32_par_t *par) { par->v[0] += UINT32_C(0x9E3779B9); } -VSMC_STATIC_INLINE void vsmc_philox4x32_bumpkey(vsmc_philox4x32_par_t *par) +static inline void vsmc_philox4x32_bumpkey(vsmc_philox4x32_par_t *par) { par->v[0] += UINT32_C(0x9E3779B9); par->v[1] += UINT32_C(0xBB67AE85); } -VSMC_STATIC_INLINE void vsmc_philox2x32_round( +static inline void vsmc_philox2x32_round( vsmc_philox2x32_ctr_t *state, const vsmc_philox2x32_par_t *par) { #ifdef __cplusplus @@ -145,7 +145,7 @@ VSMC_STATIC_INLINE void vsmc_philox2x32_round( state->v[1] = lo; } -VSMC_STATIC_INLINE void vsmc_philox4x32_round( +static inline void vsmc_philox4x32_round( vsmc_philox4x32_ctr_t *state, const vsmc_philox4x32_par_t *par) { #ifdef __cplusplus @@ -173,7 +173,7 @@ VSMC_STATIC_INLINE void 
vsmc_philox4x32_round( /// \brief Generate Philox2x32 RNG state /// \ingroup PhiloxC -VSMC_STATIC_INLINE void vsmc_philox2x32_gen(const vsmc_philox2x32_ctr_t *ctr, +static inline void vsmc_philox2x32_gen(const vsmc_philox2x32_ctr_t *ctr, const vsmc_philox2x32_key_t *key, vsmc_philox2x32_ctr_t *state) { *state = *ctr; @@ -203,7 +203,7 @@ VSMC_STATIC_INLINE void vsmc_philox2x32_gen(const vsmc_philox2x32_ctr_t *ctr, /// \brief Generate Philox4x32 RNG state /// \ingroup PhiloxC -VSMC_STATIC_INLINE void vsmc_philox4x32_gen(const vsmc_philox4x32_ctr_t *ctr, +static inline void vsmc_philox4x32_gen(const vsmc_philox4x32_ctr_t *ctr, const vsmc_philox4x32_key_t *key, vsmc_philox4x32_ctr_t *state) { *state = *ctr; @@ -233,8 +233,7 @@ VSMC_STATIC_INLINE void vsmc_philox4x32_gen(const vsmc_philox4x32_ctr_t *ctr, /// \brief Initialize Philox2x32 RNG state /// \ingroup PhiloxC -VSMC_STATIC_INLINE void vsmc_philox2x32_init( - vsmc_philox2x32 *rng, uint32_t seed) +static inline void vsmc_philox2x32_init(vsmc_philox2x32 *rng, uint32_t seed) { rng->ctr.v[0] = 0; rng->ctr.v[1] = 0; @@ -244,8 +243,7 @@ VSMC_STATIC_INLINE void vsmc_philox2x32_init( /// \brief Initialize Philox4x32 RNG state /// \ingroup PhiloxC -VSMC_STATIC_INLINE void vsmc_philox4x32_init( - vsmc_philox4x32 *rng, uint32_t seed) +static inline void vsmc_philox4x32_init(vsmc_philox4x32 *rng, uint32_t seed) { rng->ctr.v[0] = 0; rng->ctr.v[1] = 0; @@ -258,7 +256,7 @@ VSMC_STATIC_INLINE void vsmc_philox4x32_init( /// \brief Generate random 32-bits integers from Philox2x32 RNG /// \ingroup PhiloxC -VSMC_STATIC_INLINE uint32_t vsmc_philox2x32_rand(vsmc_philox2x32 *rng) +static inline uint32_t vsmc_philox2x32_rand(vsmc_philox2x32 *rng) { if (rng->index == 2) { vsmc_philox2x32_inc(&rng->ctr); @@ -271,7 +269,7 @@ VSMC_STATIC_INLINE uint32_t vsmc_philox2x32_rand(vsmc_philox2x32 *rng) /// \brief Generate random 32-bits integers from Philox4x32 RNG /// \ingroup PhiloxC -VSMC_STATIC_INLINE uint32_t 
vsmc_philox4x32_rand(vsmc_philox4x32 *rng) +static inline uint32_t vsmc_philox4x32_rand(vsmc_philox4x32 *rng) { if (rng->index == 4) { vsmc_philox4x32_inc(&rng->ctr); diff --git a/include/vsmc/rngc/rngc.h b/include/vsmc/rngc/rngc.h index 466b4db24..0b7e9f847 100644 --- a/include/vsmc/rngc/rngc.h +++ b/include/vsmc/rngc/rngc.h @@ -3,7 +3,7 @@ //---------------------------------------------------------------------------- // vSMC: Scalable Monte Carlo //---------------------------------------------------------------------------- -// Copyright (c) 2013-2015, Yan Zhou +// Copyright (c) 2013-2016, Yan Zhou // All rights reserved. // // Redistribution and use in source and binary forms, with or without diff --git a/include/vsmc/rngc/threefry.h b/include/vsmc/rngc/threefry.h index f7154638f..edc777a06 100644 --- a/include/vsmc/rngc/threefry.h +++ b/include/vsmc/rngc/threefry.h @@ -3,7 +3,7 @@ //---------------------------------------------------------------------------- // vSMC: Scalable Monte Carlo //---------------------------------------------------------------------------- -// Copyright (c) 2013-2015, Yan Zhou +// Copyright (c) 2013-2016, Yan Zhou // All rights reserved. 
// // Redistribution and use in source and binary forms, with or without @@ -134,7 +134,7 @@ typedef struct { uint64_t index; } vsmc_threefry4x64; -VSMC_STATIC_INLINE void vsmc_threefry2x32_inc(vsmc_threefry2x32_ctr_t *ctr) +static inline void vsmc_threefry2x32_inc(vsmc_threefry2x32_ctr_t *ctr) { if (++ctr->v[0] != 0) return; @@ -142,7 +142,7 @@ VSMC_STATIC_INLINE void vsmc_threefry2x32_inc(vsmc_threefry2x32_ctr_t *ctr) return; } -VSMC_STATIC_INLINE void vsmc_threefry4x32_inc(vsmc_threefry4x32_ctr_t *ctr) +static inline void vsmc_threefry4x32_inc(vsmc_threefry4x32_ctr_t *ctr) { if (++ctr->v[0] != 0) return; @@ -154,7 +154,7 @@ VSMC_STATIC_INLINE void vsmc_threefry4x32_inc(vsmc_threefry4x32_ctr_t *ctr) return; } -VSMC_STATIC_INLINE void vsmc_threefry2x64_inc(vsmc_threefry2x64_ctr_t *ctr) +static inline void vsmc_threefry2x64_inc(vsmc_threefry2x64_ctr_t *ctr) { if (++ctr->v[0] != 0) return; @@ -162,7 +162,7 @@ VSMC_STATIC_INLINE void vsmc_threefry2x64_inc(vsmc_threefry2x64_ctr_t *ctr) return; } -VSMC_STATIC_INLINE void vsmc_threefry4x64_inc(vsmc_threefry4x64_ctr_t *ctr) +static inline void vsmc_threefry4x64_inc(vsmc_threefry4x64_ctr_t *ctr) { if (++ctr->v[0] != 0) return; @@ -174,7 +174,7 @@ VSMC_STATIC_INLINE void vsmc_threefry4x64_inc(vsmc_threefry4x64_ctr_t *ctr) return; } -VSMC_STATIC_INLINE void vsmc_threefry2x32_initpar( +static inline void vsmc_threefry2x32_initpar( const vsmc_threefry2x32_key_t *key, vsmc_threefry2x32_par_t *par) { par->v[0] = key->v[0]; @@ -185,7 +185,7 @@ VSMC_STATIC_INLINE void vsmc_threefry2x32_initpar( par->v[2] ^= par->v[1]; } -VSMC_STATIC_INLINE void vsmc_threefry4x32_initpar( +static inline void vsmc_threefry4x32_initpar( const vsmc_threefry4x32_key_t *key, vsmc_threefry4x32_par_t *par) { par->v[0] = key->v[0]; @@ -200,7 +200,7 @@ VSMC_STATIC_INLINE void vsmc_threefry4x32_initpar( par->v[4] ^= par->v[3]; } -VSMC_STATIC_INLINE void vsmc_threefry2x64_initpar( +static inline void vsmc_threefry2x64_initpar( const vsmc_threefry2x64_key_t 
*key, vsmc_threefry2x64_par_t *par) { par->v[0] = key->v[0]; @@ -211,7 +211,7 @@ VSMC_STATIC_INLINE void vsmc_threefry2x64_initpar( par->v[2] ^= par->v[1]; } -VSMC_STATIC_INLINE void vsmc_threefry4x64_initpar( +static inline void vsmc_threefry4x64_initpar( const vsmc_threefry4x64_key_t *key, vsmc_threefry4x64_par_t *par) { par->v[0] = key->v[0]; @@ -226,7 +226,7 @@ VSMC_STATIC_INLINE void vsmc_threefry4x64_initpar( par->v[4] ^= par->v[3]; } -VSMC_STATIC_INLINE void vsmc_threefry2x32_rotate( +static inline void vsmc_threefry2x32_rotate( vsmc_threefry2x32_ctr_t *state, uint32_t r) { state->v[0] += state->v[1]; @@ -234,7 +234,7 @@ VSMC_STATIC_INLINE void vsmc_threefry2x32_rotate( state->v[1] ^= state->v[0]; } -VSMC_STATIC_INLINE void vsmc_threefry4x32_rotate( +static inline void vsmc_threefry4x32_rotate( vsmc_threefry4x32_ctr_t *state, uint32_t r0, uint32_t r2, int i0, int i2) { state->v[0] += state->v[i0]; @@ -246,7 +246,7 @@ VSMC_STATIC_INLINE void vsmc_threefry4x32_rotate( state->v[i2] ^= state->v[2]; } -VSMC_STATIC_INLINE void vsmc_threefry2x64_rotate( +static inline void vsmc_threefry2x64_rotate( vsmc_threefry2x64_ctr_t *state, uint64_t r) { state->v[0] += state->v[1]; @@ -254,7 +254,7 @@ VSMC_STATIC_INLINE void vsmc_threefry2x64_rotate( state->v[1] ^= state->v[0]; } -VSMC_STATIC_INLINE void vsmc_threefry4x64_rotate( +static inline void vsmc_threefry4x64_rotate( vsmc_threefry4x64_ctr_t *state, uint64_t r0, uint64_t r2, int i0, int i2) { state->v[0] += state->v[i0]; @@ -266,18 +266,17 @@ VSMC_STATIC_INLINE void vsmc_threefry4x64_rotate( state->v[i2] ^= state->v[2]; } -VSMC_STATIC_INLINE void vsmc_threefry2x32_insertkey( - vsmc_threefry2x32_ctr_t *state, const vsmc_threefry2x32_par_t *par, - uint32_t inc, int i0, int i1) +static inline void vsmc_threefry2x32_insertkey(vsmc_threefry2x32_ctr_t *state, + const vsmc_threefry2x32_par_t *par, uint32_t inc, int i0, int i1) { state->v[0] += par->v[i0]; state->v[1] += par->v[i1]; state->v[1] += inc; } -VSMC_STATIC_INLINE 
void vsmc_threefry4x32_insertkey( - vsmc_threefry4x32_ctr_t *state, const vsmc_threefry4x32_par_t *par, - uint32_t inc, int i0, int i1, int i2, int i3) +static inline void vsmc_threefry4x32_insertkey(vsmc_threefry4x32_ctr_t *state, + const vsmc_threefry4x32_par_t *par, uint32_t inc, int i0, int i1, int i2, + int i3) { state->v[0] += par->v[i0]; state->v[1] += par->v[i1]; @@ -286,18 +285,17 @@ VSMC_STATIC_INLINE void vsmc_threefry4x32_insertkey( state->v[3] += inc; } -VSMC_STATIC_INLINE void vsmc_threefry2x64_insertkey( - vsmc_threefry2x64_ctr_t *state, const vsmc_threefry2x64_par_t *par, - uint64_t inc, int i0, int i1) +static inline void vsmc_threefry2x64_insertkey(vsmc_threefry2x64_ctr_t *state, + const vsmc_threefry2x64_par_t *par, uint64_t inc, int i0, int i1) { state->v[0] += par->v[i0]; state->v[1] += par->v[i1]; state->v[1] += inc; } -VSMC_STATIC_INLINE void vsmc_threefry4x64_insertkey( - vsmc_threefry4x64_ctr_t *state, const vsmc_threefry4x64_par_t *par, - uint64_t inc, int i0, int i1, int i2, int i3) +static inline void vsmc_threefry4x64_insertkey(vsmc_threefry4x64_ctr_t *state, + const vsmc_threefry4x64_par_t *par, uint64_t inc, int i0, int i1, int i2, + int i3) { state->v[0] += par->v[i0]; state->v[1] += par->v[i1]; @@ -308,9 +306,8 @@ VSMC_STATIC_INLINE void vsmc_threefry4x64_insertkey( /// \brief Generate Threefry2x32 RNG state /// \ingroup ThreefryC -VSMC_STATIC_INLINE void vsmc_threefry2x32_gen( - const vsmc_threefry2x32_ctr_t *ctr, const vsmc_threefry2x32_key_t *key, - vsmc_threefry2x32_ctr_t *state) +static inline void vsmc_threefry2x32_gen(const vsmc_threefry2x32_ctr_t *ctr, + const vsmc_threefry2x32_key_t *key, vsmc_threefry2x32_ctr_t *state) { *state = *ctr; vsmc_threefry2x32_par_t par; @@ -346,9 +343,8 @@ VSMC_STATIC_INLINE void vsmc_threefry2x32_gen( /// \brief Generate Threefry4x32 RNG state /// \ingroup ThreefryC -VSMC_STATIC_INLINE void vsmc_threefry4x32_gen( - const vsmc_threefry4x32_ctr_t *ctr, const vsmc_threefry4x32_key_t *key, - 
vsmc_threefry4x32_ctr_t *state) +static inline void vsmc_threefry4x32_gen(const vsmc_threefry4x32_ctr_t *ctr, + const vsmc_threefry4x32_key_t *key, vsmc_threefry4x32_ctr_t *state) { *state = *ctr; vsmc_threefry4x32_par_t par; @@ -384,9 +380,8 @@ VSMC_STATIC_INLINE void vsmc_threefry4x32_gen( /// \brief Generate Threefry2x64 RNG state /// \ingroup ThreefryC -VSMC_STATIC_INLINE void vsmc_threefry2x64_gen( - const vsmc_threefry2x64_ctr_t *ctr, const vsmc_threefry2x64_key_t *key, - vsmc_threefry2x64_ctr_t *state) +static inline void vsmc_threefry2x64_gen(const vsmc_threefry2x64_ctr_t *ctr, + const vsmc_threefry2x64_key_t *key, vsmc_threefry2x64_ctr_t *state) { *state = *ctr; vsmc_threefry2x64_par_t par; @@ -422,9 +417,8 @@ VSMC_STATIC_INLINE void vsmc_threefry2x64_gen( /// \brief Generate Threefry4x64 RNG state /// \ingroup ThreefryC -VSMC_STATIC_INLINE void vsmc_threefry4x64_gen( - const vsmc_threefry4x64_ctr_t *ctr, const vsmc_threefry4x64_key_t *key, - vsmc_threefry4x64_ctr_t *state) +static inline void vsmc_threefry4x64_gen(const vsmc_threefry4x64_ctr_t *ctr, + const vsmc_threefry4x64_key_t *key, vsmc_threefry4x64_ctr_t *state) { *state = *ctr; vsmc_threefry4x64_par_t par; @@ -460,7 +454,7 @@ VSMC_STATIC_INLINE void vsmc_threefry4x64_gen( /// \brief Initialize Threefry2x32 RNG state /// \ingroup ThreefryC -VSMC_STATIC_INLINE void vsmc_threefry2x32_init( +static inline void vsmc_threefry2x32_init( vsmc_threefry2x32 *rng, uint32_t seed) { rng->ctr.v[0] = 0; @@ -472,7 +466,7 @@ VSMC_STATIC_INLINE void vsmc_threefry2x32_init( /// \brief Initialize Threefry4x32 RNG state /// \ingroup ThreefryC -VSMC_STATIC_INLINE void vsmc_threefry4x32_init( +static inline void vsmc_threefry4x32_init( vsmc_threefry4x32 *rng, uint32_t seed) { rng->ctr.v[0] = 0; @@ -488,7 +482,7 @@ VSMC_STATIC_INLINE void vsmc_threefry4x32_init( /// \brief Initialize Threefry2x64 RNG state /// \ingroup ThreefryC -VSMC_STATIC_INLINE void vsmc_threefry2x64_init( +static inline void vsmc_threefry2x64_init( 
vsmc_threefry2x64 *rng, uint64_t seed) { rng->ctr.v[0] = 0; @@ -500,7 +494,7 @@ VSMC_STATIC_INLINE void vsmc_threefry2x64_init( /// \brief Initialize Threefry4x64 RNG state /// \ingroup ThreefryC -VSMC_STATIC_INLINE void vsmc_threefry4x64_init( +static inline void vsmc_threefry4x64_init( vsmc_threefry4x64 *rng, uint64_t seed) { rng->ctr.v[0] = 0; @@ -516,7 +510,7 @@ VSMC_STATIC_INLINE void vsmc_threefry4x64_init( /// \brief Generate random 32-bits integers from Threefry2x32 RNG /// \ingroup ThreefryC -VSMC_STATIC_INLINE uint32_t vsmc_threefry2x32_rand(vsmc_threefry2x32 *rng) +static inline uint32_t vsmc_threefry2x32_rand(vsmc_threefry2x32 *rng) { if (rng->index == 2) { vsmc_threefry2x32_inc(&rng->ctr); @@ -529,7 +523,7 @@ VSMC_STATIC_INLINE uint32_t vsmc_threefry2x32_rand(vsmc_threefry2x32 *rng) /// \brief Generate random 32-bits integers from Threefry4x32 RNG /// \ingroup ThreefryC -VSMC_STATIC_INLINE uint32_t vsmc_threefry4x32_rand(vsmc_threefry4x32 *rng) +static inline uint32_t vsmc_threefry4x32_rand(vsmc_threefry4x32 *rng) { if (rng->index == 4) { vsmc_threefry4x32_inc(&rng->ctr); @@ -542,7 +536,7 @@ VSMC_STATIC_INLINE uint32_t vsmc_threefry4x32_rand(vsmc_threefry4x32 *rng) /// \brief Generate random 64-bits integers from Threefry2x64 RNG /// \ingroup ThreefryC -VSMC_STATIC_INLINE uint64_t vsmc_threefry2x64_rand(vsmc_threefry2x64 *rng) +static inline uint64_t vsmc_threefry2x64_rand(vsmc_threefry2x64 *rng) { if (rng->index == 2) { vsmc_threefry2x64_inc(&rng->ctr); @@ -555,7 +549,7 @@ VSMC_STATIC_INLINE uint64_t vsmc_threefry2x64_rand(vsmc_threefry2x64 *rng) /// \brief Generate random 64-bits integers from Threefry4x64 RNG /// \ingroup ThreefryC -VSMC_STATIC_INLINE uint64_t vsmc_threefry4x64_rand(vsmc_threefry4x64 *rng) +static inline uint64_t vsmc_threefry4x64_rand(vsmc_threefry4x64 *rng) { if (rng->index == 4) { vsmc_threefry4x64_inc(&rng->ctr); diff --git a/include/vsmc/rngc/u01.h b/include/vsmc/rngc/u01.h index fc3163c11..8cffebf70 100644 --- 
a/include/vsmc/rngc/u01.h +++ b/include/vsmc/rngc/u01.h @@ -3,7 +3,7 @@ //---------------------------------------------------------------------------- // vSMC: Scalable Monte Carlo //---------------------------------------------------------------------------- -// Copyright (c) 2013-2015, Yan Zhou +// Copyright (c) 2013-2016, Yan Zhou // All rights reserved. // // Redistribution and use in source and binary forms, with or without @@ -35,135 +35,204 @@ #include #ifdef VSMC_OPENCL -#define VSMC_RNGC_U01_31F (1.0f / 2147483648.0f) -#define VSMC_RNGC_U01_24F (1.0f / 16777216.0f) #define VSMC_RNGC_U01_23F (1.0f / 8388608.0f) -#define VSMC_RNGC_U01_33D (1.0 / 8589934592.0) +#define VSMC_RNGC_U01_24F (1.0f / 16777216.0f) +#define VSMC_RNGC_U01_31F (1.0f / 2147483648.0f) +#define VSMC_RNGC_U01_32F (1.0f / 4294967296.0f) +#define VSMC_RNGC_U01_33F (1.0f / 8589934592.0f) +#define VSMC_RNGC_U01_64F (1.0f / 18446744073709551616.0f) +#define VSMC_RNGC_U01_65F (1.0f / 36893488147419103232.0f) +#if VSMC_HAS_OPENCL_DOUBLE #define VSMC_RNGC_U01_32D (1.0 / 4294967296.0) -#define VSMC_RNGC_U01_63D (1.0 / 9223372036854775808.0) -#define VSMC_RNGC_U01_53D (1.0 / 9007199254740992.0) +#define VSMC_RNGC_U01_33D (1.0 / 8589934592.0) #define VSMC_RNGC_U01_52D (1.0 / 4503599627370496.0) +#define VSMC_RNGC_U01_53D (1.0 / 9007199254740992.0) +#define VSMC_RNGC_U01_63D (1.0 / 9223372036854775808.0) +#define VSMC_RNGC_U01_64D (1.0 / 18446744073709551616.0) +#define VSMC_RNGC_U01_65D (1.0 / 36893488147419103232.0) +#endif // VSMC_HAS_OPENCL_DOUBLE #else // VSMC_OPENCL -static const float VSMC_RNGC_U01_31F = 1.0f / 2147483648.0f; -static const float VSMC_RNGC_U01_24F = 1.0f / 16777216.0f; static const float VSMC_RNGC_U01_23F = 1.0f / 8388608.0f; -static const double VSMC_RNGC_U01_33D = 1.0 / 8589934592.0; +static const float VSMC_RNGC_U01_24F = 1.0f / 16777216.0f; +static const float VSMC_RNGC_U01_31F = 1.0f / 2147483648.0f; +static const float VSMC_RNGC_U01_32F = 1.0f / 4294967296.0f; +static 
const float VSMC_RNGC_U01_33F = 1.0f / 8589934592.0f; +static const float VSMC_RNGC_U01_64F = 1.0f / 18446744073709551616.0f; +static const float VSMC_RNGC_U01_65F = 1.0f / 36893488147419103232.0f; static const double VSMC_RNGC_U01_32D = 1.0 / 4294967296.0; -static const double VSMC_RNGC_U01_63D = 1.0 / 9223372036854775808.0; -static const double VSMC_RNGC_U01_53D = 1.0 / 9007199254740992.0; +static const double VSMC_RNGC_U01_33D = 1.0 / 8589934592.0; static const double VSMC_RNGC_U01_52D = 1.0 / 4503599627370496.0; +static const double VSMC_RNGC_U01_53D = 1.0 / 9007199254740992.0; +static const double VSMC_RNGC_U01_63D = 1.0 / 9223372036854775808.0; +static const double VSMC_RNGC_U01_64D = 1.0 / 18446744073709551616.0; +static const double VSMC_RNGC_U01_65D = 1.0 / 36893488147419103232.0; +static const long double VSMC_RNGC_U01_32L = 1.0l / 4294967296.0l; +static const long double VSMC_RNGC_U01_33L = 1.0l / 8589934592.0l; +static const long double VSMC_RNGC_U01_63L = 1.0l / 9223372036854775808.0l; +static const long double VSMC_RNGC_U01_64L = 1.0l / 18446744073709551616.0l; +static const long double VSMC_RNGC_U01_65L = 1.0l / 36893488147419103232.0l; #endif // VSMC_OPENCL -/// \brief Converting 32-bits unsigned to single precision uniform \f$[0,1]\f$ +/// \brief Converting 32-bits unsigned to single precision uniform \f$(0, 1)\f$ /// \ingroup U01C -VSMC_STATIC_INLINE float vsmc_u01_closed_closed_u32_f32(uint32_t u) +static inline float vsmc_u01_u32f(uint32_t u) { - return ((u & UINT32_C(0x7FFFFFC0)) + (u & UINT32_C(0x40))) * - VSMC_RNGC_U01_31F; + return u * VSMC_RNGC_U01_32F + VSMC_RNGC_U01_33F; } -/// \brief Converting 32-bits unsigned to single precision uniform \f$[0,1)\f$ +/// \brief Converting 64-bits unsigned to single precision uniform \f$(0, 1)\f$ /// \ingroup U01C -VSMC_STATIC_INLINE float vsmc_u01_closed_open_u32_f32(uint32_t u) +static inline float vsmc_u01_u64f(uint64_t u) { - return (u >> 8) * VSMC_RNGC_U01_24F; + return u * VSMC_RNGC_U01_64F + 
VSMC_RNGC_U01_65F; } -/// \brief Converting 32-bits unsigned to single precision uniform \f$(0,1]\f$ +#if !defined(VSMC_OPENCL) || VSMC_HAS_OPENCL_DOUBLE + +/// \brief Converting 32-bits unsigned to double precision uniform \f$(0, 1)\f$ /// \ingroup U01C -VSMC_STATIC_INLINE float vsmc_u01_open_closed_u32_f32(uint32_t u) +static inline double vsmc_u01_u32d(uint32_t u) { - return VSMC_RNGC_U01_24F + (u >> 8) * VSMC_RNGC_U01_24F; + return u * VSMC_RNGC_U01_32D + VSMC_RNGC_U01_33D; } -/// \brief Converting 32-bits unsigned to single precision uniform \f$(0,1)\f$ +/// \brief Converting 64-bits unsigned to double precision uniform \f$(0, 1)\f$ /// \ingroup U01C -VSMC_STATIC_INLINE float vsmc_u01_open_open_u32_f32(uint32_t u) +static inline double vsmc_u01_u64d(uint64_t u) { - return VSMC_RNGC_U01_24F + (u >> 9) * VSMC_RNGC_U01_23F; + return u * VSMC_RNGC_U01_64D + VSMC_RNGC_U01_65D; } -#if VSMC_HAS_RNGC_DOUBLE +#endif // !defined(VSMC_OPENCL) || VSMC_HAS_OPENCL_DOUBLE -/// \brief Converting 32-bits unsigned to double precision uniform \f$[0,1]\f$ +#ifndef VSMC_OPENCL + +/// \brief Converting 32-bits unsigned to long double precision uniform +/// \f$(0, 1)\f$ +static inline long double vsmc_u01_u32l(uint32_t u) +{ + return u * VSMC_RNGC_U01_32L + VSMC_RNGC_U01_33L; +} + +/// \brief Converting 64-bits unsigned to long double precision uniform +/// \f$(0, 1)\f$ +static inline long double vsmc_u01_u64l(uint64_t u) +{ + return u * VSMC_RNGC_U01_64L + VSMC_RNGC_U01_65L; +} + +#endif // VSMC_OPENCL + +/// \brief Converting 32-bits unsigned to single precision uniform \f$[0,1]\f$ /// \ingroup U01C -VSMC_STATIC_INLINE double vsmc_u01_closed_closed_u32_f64(uint32_t u) +static inline float vsmc_u01_cc_u32f(uint32_t u) { -#ifdef __cplusplus - return (static_cast(u & 1) + u) * VSMC_RNGC_U01_32D; -#else - return (((double) (u & 1)) + u) * VSMC_RNGC_U01_32D; -#endif + return ((u & UINT32_C(0x40)) + (u & UINT32_C(0x7FFFFFC0))) * + VSMC_RNGC_U01_31F; } -/// \brief Converting 32-bits 
unsigned to double precision uniform \f$[0,1)\f$ +/// \brief Converting 32-bits unsigned to single precision uniform \f$[0,1)\f$ /// \ingroup U01C -VSMC_STATIC_INLINE double vsmc_u01_closed_open_u32_f64(uint32_t u) +static inline float vsmc_u01_co_u32f(uint32_t u) { - return u * VSMC_RNGC_U01_32D; + return (u >> 8) * VSMC_RNGC_U01_24F; } -/// \brief Converting 32-bits unsigned to double precision uniform \f$(0,1]\f$ +/// \brief Converting 32-bits unsigned to single precision uniform \f$(0,1]\f$ /// \ingroup U01C -VSMC_STATIC_INLINE double vsmc_u01_open_closed_u32_f64(uint32_t u) +static inline float vsmc_u01_oc_u32f(uint32_t u) { - return VSMC_RNGC_U01_32D + u * VSMC_RNGC_U01_32D; + return (u >> 8) * VSMC_RNGC_U01_24F + VSMC_RNGC_U01_24F; } -/// \brief Converting 32-bits unsigned to double precision uniform \f$(0,1)\f$ +/// \brief Converting 32-bits unsigned to single precision uniform \f$(0,1)\f$ /// \ingroup U01C -VSMC_STATIC_INLINE double vsmc_u01_open_open_u32_f64(uint32_t u) +static inline float vsmc_u01_oo_u32f(uint32_t u) { - return VSMC_RNGC_U01_33D + u * VSMC_RNGC_U01_32D; + return (u >> 9) * VSMC_RNGC_U01_23F + VSMC_RNGC_U01_24F; } /// \brief Converting 64-bits unsigned to single precision uniform \f$[0,1]\f$ /// \ingroup U01C -VSMC_STATIC_INLINE float vsmc_u01_closed_closed_u64_f32(uint64_t u) +static inline float vsmc_u01_cc_u64f(uint64_t u) { #ifdef __cplusplus - return vsmc_u01_closed_closed_u32_f32(static_cast(u)); + return vsmc_u01_cc_u32f(static_cast(u >> 32)); #else - return vsmc_u01_closed_closed_u32_f32(((uint32_t) u)); + return vsmc_u01_cc_u32f(((uint32_t)(u >> 32))); #endif } /// \brief Converting 64-bits unsigned to single precision uniform \f$[0,1)\f$ /// \ingroup U01C -VSMC_STATIC_INLINE float vsmc_u01_closed_open_u64_f32(uint64_t u) +static inline float vsmc_u01_co_u64f(uint64_t u) { #ifdef __cplusplus - return vsmc_u01_closed_open_u32_f32(static_cast(u)); + return vsmc_u01_co_u32f(static_cast(u >> 32)); #else - return 
vsmc_u01_closed_open_u32_f32(((uint32_t) u)); + return vsmc_u01_co_u32f(((uint32_t)(u >> 32))); #endif } /// \brief Converting 64-bits unsigned to single precision uniform \f$(0,1]\f$ /// \ingroup U01C -VSMC_STATIC_INLINE float vsmc_u01_open_closed_u64_f32(uint64_t u) +static inline float vsmc_u01_oc_u64f(uint64_t u) { #ifdef __cplusplus - return vsmc_u01_open_closed_u32_f32(static_cast(u)); + return vsmc_u01_oc_u32f(static_cast(u >> 32)); #else - return vsmc_u01_open_closed_u32_f32(((uint32_t) u)); + return vsmc_u01_oc_u32f(((uint32_t)(u >> 32))); #endif } /// \brief Converting 64-bits unsigned to single precision uniform \f$(0,1)\f$ /// \ingroup U01C -VSMC_STATIC_INLINE float vsmc_u01_open_open_u64_f32(uint64_t u) +static inline float vsmc_u01_oo_u64f(uint64_t u) +{ +#ifdef __cplusplus + return vsmc_u01_oo_u32f(static_cast(u >> 32)); +#else + return vsmc_u01_oo_u32f(((uint32_t)(u >> 32))); +#endif +} + +#if !defined(VSMC_OPENCL) || VSMC_HAS_OPENCL_DOUBLE + +/// \brief Converting 32-bits unsigned to double precision uniform \f$[0,1]\f$ +/// \ingroup U01C +static inline double vsmc_u01_cc_u32d(uint32_t u) { #ifdef __cplusplus - return vsmc_u01_open_open_u32_f32(static_cast(u)); + return (static_cast(u & 1) + u) * VSMC_RNGC_U01_32D; #else - return vsmc_u01_open_open_u32_f32(((uint32_t) u)); + return (((double) (u & 1)) + u) * VSMC_RNGC_U01_32D; #endif } +/// \brief Converting 32-bits unsigned to double precision uniform \f$[0,1)\f$ +/// \ingroup U01C +static inline double vsmc_u01_co_u32d(uint32_t u) +{ + return u * VSMC_RNGC_U01_32D; +} + +/// \brief Converting 32-bits unsigned to double precision uniform \f$(0,1]\f$ +/// \ingroup U01C +static inline double vsmc_u01_oc_u32d(uint32_t u) +{ + return VSMC_RNGC_U01_32D + u * VSMC_RNGC_U01_32D; +} + +/// \brief Converting 32-bits unsigned to double precision uniform \f$(0,1)\f$ +/// \ingroup U01C +static inline double vsmc_u01_oo_u32d(uint32_t u) +{ + return VSMC_RNGC_U01_33D + u * VSMC_RNGC_U01_32D; +} + /// \brief 
Converting 64-bits unsigned to double precision uniform \f$[0,1]\f$ /// \ingroup U01C -VSMC_STATIC_INLINE double vsmc_u01_closed_closed_u64_f64(uint64_t u) +static inline double vsmc_u01_cc_u64d(uint64_t u) { return ((u & UINT64_C(0x7FFFFFFFFFFFFE00)) + (u & UINT64_C(0x200))) * VSMC_RNGC_U01_63D; @@ -171,25 +240,101 @@ VSMC_STATIC_INLINE double vsmc_u01_closed_closed_u64_f64(uint64_t u) /// \brief Converting 64-bits unsigned to double precision uniform \f$[0,1)\f$ /// \ingroup U01C -VSMC_STATIC_INLINE double vsmc_u01_closed_open_u64_f64(uint64_t u) +static inline double vsmc_u01_co_u64d(uint64_t u) { return (u >> 11) * VSMC_RNGC_U01_53D; } /// \brief Converting 64-bits unsigned to double precision uniform \f$(0,1]\f$ /// \ingroup U01C -VSMC_STATIC_INLINE double vsmc_u01_open_closed_u64_f64(uint64_t u) +static inline double vsmc_u01_oc_u64d(uint64_t u) { return VSMC_RNGC_U01_53D + (u >> 11) * VSMC_RNGC_U01_53D; } /// \brief Converting 64-bits unsigned to double precision uniform \f$(0,1)\f$ /// \ingroup U01C -VSMC_STATIC_INLINE double vsmc_u01_open_open_u64_f64(uint64_t u) +static inline double vsmc_u01_oo_u64d(uint64_t u) { return VSMC_RNGC_U01_53D + (u >> 12) * VSMC_RNGC_U01_52D; } -#endif // VSMC_HAS_RNGC_DOUBLE +#endif // !defined(VSMC_OPENCL) || VSMC_HAS_OPENCL_DOUBLE + +#ifndef VSMC_OPENCL + +/// \brief Converting 32-bits unsigned to long double precision uniform +/// \f$[0,1]\f$ +/// \ingroup U01C +static inline long double vsmc_u01_cc_u32l(uint32_t u) +{ +#ifdef __cplusplus + return (static_cast(u & 1) + u) * VSMC_RNGC_U01_32L; +#else + return (((long double) (u & 1)) + u) * VSMC_RNGC_U01_32L; +#endif +} + +/// \brief Converting 32-bits unsigned to long double precision uniform +/// \f$[0,1)\f$ +/// \ingroup U01C +static inline long double vsmc_u01_co_u32l(uint32_t u) +{ + return u * VSMC_RNGC_U01_32L; +} + +/// \brief Converting 32-bits unsigned to long double precision uniform +/// \f$(0,1]\f$ +/// \ingroup U01C +static inline long double 
vsmc_u01_oc_u32l(uint32_t u) +{ + return u * VSMC_RNGC_U01_32L + VSMC_RNGC_U01_32L; +} + +/// \brief Converting 32-bits unsigned to long double precision uniform +/// \f$(0,1)\f$ +/// \ingroup U01C +static inline long double vsmc_u01_oo_u32l(uint32_t u) +{ + return u * VSMC_RNGC_U01_32L + VSMC_RNGC_U01_33L; +} + +/// \brief Converting 64-bits unsigned to long double precision uniform +/// \f$[0,1]\f$ +/// \ingroup U01C +static inline long double vsmc_u01_cc_u64l(uint64_t u) +{ +#ifdef __cplusplus + return (static_cast(u & 1) + u) * VSMC_RNGC_U01_64L; +#else + return (((long double) (u & 1)) + u) * VSMC_RNGC_U01_64L; +#endif +} + +/// \brief Converting 64-bits unsigned to long double precision uniform +/// \f$[0,1)\f$ +/// \ingroup U01C +static inline long double vsmc_u01_co_u64l(uint64_t u) +{ + return u * VSMC_RNGC_U01_64L; +} + +/// \brief Converting 64-bits unsigned to long double precision uniform +/// \f$(0,1]\f$ +/// \ingroup U01C +static inline long double vsmc_u01_oc_u64l(uint64_t u) +{ + return u * VSMC_RNGC_U01_64L + VSMC_RNGC_U01_64L; +} + +/// \brief Converting 64-bits unsigned to long double precision uniform +/// \f$(0,1)\f$ +/// \ingroup U01C +static inline long double vsmc_u01_oo_u64l(uint64_t u) +{ + return (u >> 1) * VSMC_RNGC_U01_63L + VSMC_RNGC_U01_64L; +} + +#endif // VSMC_OPENCL #endif // VSMC_RNGC_U01_H diff --git a/include/vsmc/smp/backend_base.hpp b/include/vsmc/smp/backend_base.hpp index 712ff0b91..8765f9134 100644 --- a/include/vsmc/smp/backend_base.hpp +++ b/include/vsmc/smp/backend_base.hpp @@ -3,7 +3,7 @@ //---------------------------------------------------------------------------- // vSMC: Scalable Monte Carlo //---------------------------------------------------------------------------- -// Copyright (c) 2013-2015, Yan Zhou +// Copyright (c) 2013-2016, Yan Zhou // All rights reserved. 
// // Redistribution and use in source and binary forms, with or without @@ -72,9 +72,7 @@ template \ class Move##Name; \ template \ - class MonitorEval##Name; \ - template \ - class PathEval##Name; + class MonitorEval##Name; #define VSMC_RUNTIME_ASSERT_SMP_BACKEND_BASE_DERIVED(basename) \ VSMC_RUNTIME_ASSERT((dynamic_cast(this) != nullptr), \ @@ -495,162 +493,6 @@ class MonitorEvalBase VSMC_DEFINE_SMP_BACKEND_BASE_SPECIAL_VIRTUAL(MonitorEval) }; // class MonitorEvalBase -/// \brief Path evalution base dispatch class -/// \ingroup SMP -template -class PathEvalBase -{ - public: - double eval_sp(std::size_t iter, SingleParticle sp) - { - return eval_sp_dispatch(iter, sp, &Derived::eval_sp); - } - - double eval_grid(std::size_t iter, Particle &particle) - { - return eval_grid_dispatch(iter, particle, &Derived::eval_grid); - } - - void eval_pre(std::size_t iter, Particle &particle) - { - eval_pre_dispatch(iter, particle, &Derived::eval_pre); - } - - void eval_post(std::size_t iter, Particle &particle) - { - eval_post_dispatch(iter, particle, &Derived::eval_post); - } - - protected: - VSMC_DEFINE_SMP_BACKEND_BASE_SPECIAL(PathEval) - - private: - // non-static non-const - - template - double eval_sp_dispatch(std::size_t iter, SingleParticle sp, - double (D::*)(std::size_t, SingleParticle)) - { - return static_cast(this)->eval_sp(iter, sp); - } - - template - double eval_grid_dispatch(std::size_t iter, Particle &particle, - double (D::*)(std::size_t, Particle &)) - { - return static_cast(this)->eval_grid(iter, particle); - } - - template - void eval_pre_dispatch(std::size_t iter, Particle &particle, - void (D::*)(std::size_t, Particle &)) - { - static_cast(this)->eval_pre(iter, particle); - } - - template - void eval_post_dispatch(std::size_t iter, Particle &particle, - void (D::*)(std::size_t, Particle &)) - { - static_cast(this)->eval_post(iter, particle); - } - - // non-static const - - template - double eval_sp_dispatch(std::size_t iter, SingleParticle sp, - double 
(D::*)(std::size_t, SingleParticle) const) - { - return static_cast(this)->eval_sp(iter, sp); - } - - template - double eval_grid_dispatch(std::size_t iter, Particle &particle, - double (D::*)(std::size_t, Particle &) const) - { - return static_cast(this)->eval_grid(iter, particle); - } - - template - void eval_pre_dispatch(std::size_t iter, Particle &particle, - void (D::*)(std::size_t, Particle &) const) - { - static_cast(this)->eval_pre(iter, particle); - } - - template - void eval_post_dispatch(std::size_t iter, Particle &particle, - void (D::*)(std::size_t, Particle &) const) - { - static_cast(this)->eval_post(iter, particle); - } - - // static - - double eval_sp_dispatch(std::size_t iter, SingleParticle sp, - double (*)(std::size_t, SingleParticle)) - { - return Derived::eval_sp(iter, sp); - } - - double eval_grid_dispatch(std::size_t iter, Particle &particle, - double (*)(std::size_t, Particle &)) - { - return Derived::eval_grid(iter, particle); - } - - void eval_pre_dispatch(std::size_t iter, Particle &particle, - void (*)(std::size_t, Particle &)) - { - Derived::eval_pre(iter, particle); - } - - void eval_post_dispatch(std::size_t iter, Particle &particle, - void (*)(std::size_t, Particle &)) - { - Derived::eval_post(iter, particle); - } - - // base - - double eval_sp_dispatch(std::size_t, SingleParticle, - double (PathEvalBase::*)(std::size_t, SingleParticle)) - { - return 0; - } - - double eval_grid_dispatch(std::size_t, Particle &, - double (PathEvalBase::*)(std::size_t, Particle &)) - { - return 0; - } - - void eval_pre_dispatch(std::size_t, Particle &, - void (PathEvalBase::*)(std::size_t, Particle &)) - { - } - - void eval_post_dispatch(std::size_t, Particle &, - void (PathEvalBase::*)(std::size_t, Particle &)) - { - } -}; // class PathEvalBase - -/// \brief Path evalution base dispatch class -/// \ingroup SMP -template -class PathEvalBase -{ - public: - virtual double eval_sp(std::size_t, SingleParticle) { return 0; } - virtual double 
eval_grid(std::size_t, Particle &) { return 0; } - virtual void eval_pre(std::size_t, Particle &) {} - virtual void eval_post(std::size_t, Particle &) {} - - protected: - VSMC_DEFINE_SMP_BACKEND_BASE_SPECIAL_VIRTUAL(PathEval) -}; // class PathEval - } // namespace vsmc #endif // VSMC_SMP_BACKEND_BASE_HPP diff --git a/include/vsmc/smp/backend_omp.hpp b/include/vsmc/smp/backend_omp.hpp index 9e9001b9d..d94a2761b 100644 --- a/include/vsmc/smp/backend_omp.hpp +++ b/include/vsmc/smp/backend_omp.hpp @@ -3,7 +3,7 @@ //---------------------------------------------------------------------------- // vSMC: Scalable Monte Carlo //---------------------------------------------------------------------------- -// Copyright (c) 2013-2015, Yan Zhou +// Copyright (c) 2013-2016, Yan Zhou // All rights reserved. // // Redistribution and use in source and binary forms, with or without @@ -53,19 +53,9 @@ class StateOMP : public StateBase template void copy(size_type N, const IntType *src_idx) { -#if _OPENMP < 200805 - using stype = typename std::make_signed::type; - stype n = static_cast(N); -#pragma omp parallel for default(shared) - for (stype i = 0; i < n; ++i) { - this->copy_particle( - static_cast(src_idx[i]), static_cast(i)); - } -#else // _OPENMP < 200805 #pragma omp parallel for default(shared) for (size_type i = 0; i < N; ++i) this->copy_particle(static_cast(src_idx[i]), i); -#endif // _OPENMP < 200805 } }; // class StateOMP @@ -82,19 +72,9 @@ class InitializeOMP : public InitializeBase this->eval_param(particle, param); this->eval_pre(particle); std::size_t accept = 0; -#if _OPENMP < 200805 - using stype = typename std::make_signed::type; - stype n = static_cast(N); -#pragma omp parallel for reduction(+ : accept) default(shared) - for (stype i = 0; i < n; ++i) { - accept += this->eval_sp( - SingleParticle(static_cast(i), &particle)); - } -#else // _OPENMP < 200805 #pragma omp parallel for reduction(+ : accept) default(shared) for (size_type i = 0; i < N; ++i) - accept += 
this->eval_sp(SingleParticle<T>(i, &particle));
-#endif // _OPENMP < 200805
+            accept += this->eval_sp(particle.sp(i));
         this->eval_post(particle);
 
         return accept;
@@ -116,19 +96,9 @@ class MoveOMP : public MoveBase<T, Derived>
         const size_type N = particle.size();
         this->eval_pre(iter, particle);
         std::size_t accept = 0;
-#if _OPENMP < 200805
-        using stype = typename std::make_signed<size_type>::type;
-        stype n = static_cast<stype>(N);
-#pragma omp parallel for reduction(+ : accept) default(shared)
-        for (stype i = 0; i < n; ++i) {
-            accept += this->eval_sp(
-                iter, SingleParticle<T>(static_cast<size_type>(i), &particle));
-        }
-#else // _OPENMP < 200805
 #pragma omp parallel for reduction(+ : accept) default(shared)
         for (size_type i = 0; i < N; ++i)
-            accept += this->eval_sp(iter, SingleParticle<T>(i, &particle));
-#endif // _OPENMP < 200805
+            accept += this->eval_sp(iter, particle.sp(i));
         this->eval_post(iter, particle);
 
         return accept;
@@ -150,22 +120,11 @@ class MonitorEvalOMP : public MonitorEvalBase<T, Derived>
         using size_type = typename Particle<T>::size_type;
         const size_type N = particle.size();
         this->eval_pre(iter, particle);
-#if _OPENMP < 200805
-        using stype = typename std::make_signed<size_type>::type;
-        stype n = static_cast<stype>(N);
-#pragma omp parallel for default(shared)
-        for (stype i = 0; i < n; ++i) {
-            this->eval_sp(iter, dim,
-                SingleParticle<T>(static_cast<size_type>(i), &particle),
-                r + static_cast<std::size_t>(i) * dim);
-        }
-#else // _OPENMP < 200805
 #pragma omp parallel for default(shared)
         for (size_type i = 0; i < N; ++i) {
-            this->eval_sp(iter, dim, SingleParticle<T>(i, &particle),
+            this->eval_sp(iter, dim, particle.sp(i),
                 r + static_cast<std::size_t>(i) * dim);
         }
-#endif // _OPENMP < 200805
         this->eval_post(iter, particle);
     }
 
@@ -173,39 +132,6 @@ class MonitorEvalOMP : public MonitorEvalBase<T, Derived>
     VSMC_DEFINE_SMP_BACKEND_SPECIAL(OMP, MonitorEval)
 }; // class MonitorEvalOMP
 
-/// \brief Path::eval_type subtype using OpenMP
-/// \ingroup OMP
-template <typename T, typename Derived>
-class PathEvalOMP : public PathEvalBase<T, Derived>
-{
-    public:
-    double operator()(std::size_t iter, Particle<T> &particle, double *r)
-    {
-        using size_type = typename Particle<T>::size_type;
-        const size_type N = particle.size();
-        this->eval_pre(iter, particle);
-#if _OPENMP < 200805
-        using stype = typename std::make_signed<size_type>::type;
-        stype n = static_cast<stype>(N);
-#pragma omp parallel for default(shared)
-        for (stype i = 0; i < n; ++i) {
-            r[i] = this->eval_sp(
-                iter, SingleParticle<T>(static_cast<size_type>(i), &particle));
-        }
-#else // _OPENMP < 200805
-#pragma omp parallel for default(shared)
-        for (size_type i = 0; i < N; ++i)
-            r[i] = this->eval_sp(iter, SingleParticle<T>(i, &particle));
-#endif // _OPENMP < 200805
-        this->eval_post(iter, particle);
-
-        return this->eval_grid(iter, particle);
-    }
-
-    protected:
-    VSMC_DEFINE_SMP_BACKEND_SPECIAL(OMP, PathEval)
-}; // class PathEvalOMP
-
 } // namespace vsmc
 
 #endif // VSMC_SMP_BACKEND_OMP_HPP
diff --git a/include/vsmc/smp/backend_seq.hpp b/include/vsmc/smp/backend_seq.hpp
index 5e5c01d43..caee2411d 100644
--- a/include/vsmc/smp/backend_seq.hpp
+++ b/include/vsmc/smp/backend_seq.hpp
@@ -3,7 +3,7 @@
 //----------------------------------------------------------------------------
 // vSMC: Scalable Monte Carlo
 //----------------------------------------------------------------------------
-// Copyright (c) 2013-2015, Yan Zhou
+// Copyright (c) 2013-2016, Yan Zhou
 // All rights reserved.
// // Redistribution and use in source and binary forms, with or without @@ -114,28 +114,6 @@ class MonitorEvalSEQ : public MonitorEvalBase VSMC_DEFINE_SMP_BACKEND_SPECIAL(SEQ, MonitorEval) }; // class MonitorEvalSEQ -/// \brief Path::eval_type subtype -/// \ingroup SEQ -template -class PathEvalSEQ : public PathEvalBase -{ - public: - double operator()(std::size_t iter, Particle &particle, double *r) - { - using size_type = typename Particle::size_type; - const size_type N = particle.size(); - this->eval_pre(iter, particle); - for (size_type i = 0; i != N; ++i) - r[i] = this->eval_sp(iter, SingleParticle(i, &particle)); - this->eval_post(iter, particle); - - return this->eval_grid(iter, particle); - } - - protected: - VSMC_DEFINE_SMP_BACKEND_SPECIAL(SEQ, PathEval) -}; // class PathEvalSEQ - } // namespace vsmc #endif // VSMC_SMP_BACKEND_SEQ_HPP diff --git a/include/vsmc/smp/backend_tbb.hpp b/include/vsmc/smp/backend_tbb.hpp index 3876ac5a4..aaa96c5d1 100644 --- a/include/vsmc/smp/backend_tbb.hpp +++ b/include/vsmc/smp/backend_tbb.hpp @@ -3,7 +3,7 @@ //---------------------------------------------------------------------------- // vSMC: Scalable Monte Carlo //---------------------------------------------------------------------------- -// Copyright (c) 2013-2015, Yan Zhou +// Copyright (c) 2013-2016, Yan Zhou // All rights reserved. 
// // Redistribution and use in source and binary forms, with or without @@ -56,13 +56,6 @@ ::tbb::parallel_for args; \ this->eval_post(iter, particle); -#define VSMC_DEFINE_SMP_BACKEND_TBB_PARALLEL_RUN_PATH_EVAL(args) \ - this->eval_pre(iter, particle); \ - work_type work(this, iter, &particle, r); \ - ::tbb::parallel_for args; \ - this->eval_post(iter, particle); \ - return this->eval_grid(iter, particle); - namespace vsmc { @@ -79,9 +72,9 @@ class StateTBB : public StateBase explicit StateTBB(size_type N) : StateBase(N) {} template - void copy(size_type N, const IntType *src_idx) + void copy(size_type N, const IntType *index) { - parallel_copy_run(src_idx, ::tbb::blocked_range(0, N)); + parallel_copy_run(index, ::tbb::blocked_range(0, N)); } protected: @@ -89,86 +82,86 @@ class StateTBB : public StateBase class work_type { public: - work_type(StateTBB *state, const IntType *src_idx) - : state_(state), src_idx_(src_idx) + work_type(StateTBB *state, const IntType *index) + : state_(state), index_(index) { } void operator()(const ::tbb::blocked_range &range) const { for (size_type i = range.begin(); i != range.end(); ++i) { - state_->copy_particle(static_cast(src_idx_[i]), i); + state_->copy_particle(static_cast(index_[i]), i); } } private: StateTBB *const state_; - const IntType *const src_idx_; + const IntType *const index_; }; // class work_type template void parallel_copy_run( - const IntType *src_idx, const ::tbb::blocked_range &range) + const IntType *index, const ::tbb::blocked_range &range) { - ::tbb::parallel_for(range, work_type(this, src_idx)); + ::tbb::parallel_for(range, work_type(this, index)); } template - void parallel_copy_run(const IntType *src_idx, + void parallel_copy_run(const IntType *index, const ::tbb::blocked_range &range, const ::tbb::auto_partitioner &partitioner) { ::tbb::parallel_for( - range, work_type(this, src_idx), partitioner); + range, work_type(this, index), partitioner); } template - void parallel_copy_run(const IntType 
*src_idx, + void parallel_copy_run(const IntType *index, const ::tbb::blocked_range &range, const ::tbb::simple_partitioner &partitioner) { ::tbb::parallel_for( - range, work_type(this, src_idx), partitioner); + range, work_type(this, index), partitioner); } template - void parallel_copy_run(const IntType *src_idx, + void parallel_copy_run(const IntType *index, const ::tbb::blocked_range &range, ::tbb::affinity_partitioner &partitioner) { ::tbb::parallel_for( - range, work_type(this, src_idx), partitioner); + range, work_type(this, index), partitioner); } #if __TBB_TASK_GROUP_CONTEXT template - void parallel_copy_run(const IntType *src_idx, + void parallel_copy_run(const IntType *index, const ::tbb::blocked_range &range, const ::tbb::auto_partitioner &partitioner, ::tbb::task_group_context &context) { ::tbb::parallel_for( - range, work_type(this, src_idx), partitioner, context); + range, work_type(this, index), partitioner, context); } template - void parallel_copy_run(const IntType *src_idx, + void parallel_copy_run(const IntType *index, const ::tbb::blocked_range &range, const ::tbb::simple_partitioner &partitioner, ::tbb::task_group_context &context) { ::tbb::parallel_for( - range, work_type(this, src_idx), partitioner, context); + range, work_type(this, index), partitioner, context); } template - void parallel_copy_run(const IntType *src_idx, + void parallel_copy_run(const IntType *index, const ::tbb::blocked_range &range, ::tbb::affinity_partitioner &partitioner, ::tbb::task_group_context &context) { ::tbb::parallel_for( - range, work_type(this, src_idx), partitioner, context); + range, work_type(this, index), partitioner, context); } #endif // __TBB_TASK_GROUP_CONTEXT }; // class StateTBB @@ -207,7 +200,7 @@ class InitializeTBB : public InitializeBase void operator()(const ::tbb::blocked_range &range) { for (size_type i = range.begin(); i != range.end(); ++i) - accept_ += wptr_->eval_sp(SingleParticle(i, pptr_)); + accept_ += wptr_->eval_sp(pptr_->sp(i)); } 
void join(const work_type &other) { accept_ += other.accept_; } @@ -318,7 +311,7 @@ class MoveTBB : public MoveBase void operator()(const ::tbb::blocked_range &range) { for (size_type i = range.begin(); i != range.end(); ++i) - accept_ += wptr_->eval_sp(iter_, SingleParticle(i, pptr_)); + accept_ += wptr_->eval_sp(iter_, pptr_->sp(i)); } void join(const work_type &other) { accept_ += other.accept_; } @@ -423,7 +416,7 @@ class MonitorEvalTBB : public MonitorEvalBase void operator()(const ::tbb::blocked_range &range) const { for (size_type i = range.begin(); i != range.end(); ++i) { - wptr_->eval_sp(iter_, dim_, SingleParticle(i, pptr_), + wptr_->eval_sp(iter_, dim_, pptr_->sp(i), r_ + static_cast(i) * dim_); } } @@ -503,106 +496,6 @@ class MonitorEvalTBB : public MonitorEvalBase #endif // __TBB_TASK_GROUP_CONTEXT }; // class MonitorEvalTBB -/// \brief Path::eval_type subtype using Intel Threading Building Blocks -/// \ingroup TBB -template -class PathEvalTBB : public PathEvalBase -{ - public: - double operator()(std::size_t iter, Particle &particle, double *r) - { - return parallel_run(iter, particle, r, - ::tbb::blocked_range::size_type>( - 0, particle.size())); - } - - protected: - VSMC_DEFINE_SMP_BACKEND_SPECIAL(TBB, PathEval) - - class work_type - { - public: - using size_type = typename Particle::size_type; - - work_type(PathEvalTBB *wptr, std::size_t iter, - Particle *pptr, double *r) - : wptr_(wptr), iter_(iter), pptr_(pptr), r_(r) - { - } - - void operator()(const ::tbb::blocked_range &range) const - { - for (size_type i = range.begin(); i != range.end(); ++i) - r_[i] = wptr_->eval_sp(iter_, SingleParticle(i, pptr_)); - } - - private: - PathEvalTBB *const wptr_; - const std::size_t iter_; - Particle *const pptr_; - double *const r_; - }; // class ParallelPathState - - double parallel_run(std::size_t iter, Particle &particle, double *r, - const ::tbb::blocked_range::size_type> &range) - { - VSMC_DEFINE_SMP_BACKEND_TBB_PARALLEL_RUN_PATH_EVAL((range, work)); - 
} - - double parallel_run(std::size_t iter, Particle &particle, double *r, - const ::tbb::blocked_range::size_type> &range, - const ::tbb::auto_partitioner &partitioner) - { - VSMC_DEFINE_SMP_BACKEND_TBB_PARALLEL_RUN_PATH_EVAL( - (range, work, partitioner)); - } - - double parallel_run(std::size_t iter, Particle &particle, double *r, - const ::tbb::blocked_range::size_type> &range, - const ::tbb::simple_partitioner &partitioner) - { - VSMC_DEFINE_SMP_BACKEND_TBB_PARALLEL_RUN_PATH_EVAL( - (range, work, partitioner)); - } - - double parallel_run(std::size_t iter, Particle &particle, double *r, - const ::tbb::blocked_range::size_type> &range, - ::tbb::affinity_partitioner &partitioner) - { - VSMC_DEFINE_SMP_BACKEND_TBB_PARALLEL_RUN_PATH_EVAL( - (range, work, partitioner)); - } - -#if __TBB_TASK_GROUP_CONTEXT - double parallel_run(std::size_t iter, Particle &particle, double *r, - const ::tbb::blocked_range::size_type> &range, - const ::tbb::auto_partitioner &partitioner, - ::tbb::task_group_context &context) - { - VSMC_DEFINE_SMP_BACKEND_TBB_PARALLEL_RUN_PATH_EVAL( - (range, work, partitioner, context)); - } - - double parallel_run(std::size_t iter, Particle &particle, double *r, - const ::tbb::blocked_range::size_type> &range, - const ::tbb::simple_partitioner &partitioner, - ::tbb::task_group_context &context) - { - VSMC_DEFINE_SMP_BACKEND_TBB_PARALLEL_RUN_PATH_EVAL( - (range, work, partitioner, context)); - } - - double parallel_run(std::size_t iter, Particle &particle, double *r, - const ::tbb::blocked_range::size_type> &range, - ::tbb::affinity_partitioner &partitioner, - ::tbb::task_group_context &context) - { - VSMC_DEFINE_SMP_BACKEND_TBB_PARALLEL_RUN_PATH_EVAL( - (range, work, partitioner, context)); - } -#endif // __TBB_TASK_GROUP_CONTEXT -}; // PathEvalTBB - } // namespace vsmc #endif // VSMC_SMP_BACKEND_TBB_HPP diff --git a/include/vsmc/smp/smp.hpp b/include/vsmc/smp/smp.hpp index 3b607b628..98a1c9ccf 100644 --- a/include/vsmc/smp/smp.hpp +++ 
b/include/vsmc/smp/smp.hpp @@ -3,7 +3,7 @@ //---------------------------------------------------------------------------- // vSMC: Scalable Monte Carlo //---------------------------------------------------------------------------- -// Copyright (c) 2013-2015, Yan Zhou +// Copyright (c) 2013-2016, Yan Zhou // All rights reserved. // // Redistribution and use in source and binary forms, with or without diff --git a/include/vsmc/utility/aligned_memory.hpp b/include/vsmc/utility/aligned_memory.hpp index 8031ac79d..3c67f02d0 100644 --- a/include/vsmc/utility/aligned_memory.hpp +++ b/include/vsmc/utility/aligned_memory.hpp @@ -3,7 +3,7 @@ //---------------------------------------------------------------------------- // vSMC: Scalable Monte Carlo //---------------------------------------------------------------------------- -// Copyright (c) 2013-2015, Yan Zhou +// Copyright (c) 2013-2016, Yan Zhou // All rights reserved. // // Redistribution and use in source and binary forms, with or without @@ -32,14 +32,13 @@ #ifndef VSMC_UTILITY_ALIGNED_MEMORY #define VSMC_UTILITY_ALIGNED_MEMORY -#include #include +#include #include #include +#include #include -#include -#include #include #if VSMC_HAS_POSIX @@ -76,19 +75,6 @@ #define VSMC_ALIGNMENT 32 #endif -#define VSMC_STATIC_ASSERT_UTILITY_ALIGNED_MEMORY_POWER_OF_TWO(Alignment) \ - VSMC_STATIC_ASSERT( \ - (Alignment != 0 && (Alignment & (Alignment - 1)) == 0), \ - "**AlignedAllocator** USED WITH Alignment NOT A POWER OF TWO") - -#define VSMC_STATIC_ASSERT_UTILITY_ALIGNED_MEMORY_SIZEOF_VOID(Alignemnt) \ - VSMC_STATIC_ASSERT((Alignment >= sizeof(void *)), \ - "**AlginedAllocator** USED WITH Alignment LESS THAN sizeof(void *)") - -#define VSMC_STATIC_ASSERT_UTILITY_ALIGNED_MEMORY \ - VSMC_STATIC_ASSERT_UTILITY_ALIGNED_MEMORY_POWER_OF_TWO(Alignment); \ - VSMC_STATIC_ASSERT_UTILITY_ALIGNED_MEMORY_SIZEOF_VOID(Alignment); - #define VSMC_RUNTIME_ASSERT_UTILITY_ALIGNED_MEMORY_POWER_OF_TWO(alignment) \ VSMC_RUNTIME_ASSERT( \ (alignment != 
0 && (alignment & (alignment - 1)) == 0), \ @@ -257,7 +243,7 @@ using AlignedMemory = VSMC_ALIGNED_MEMORY_TYPE; /// \tparam T The value type /// \tparam Alignment The alignment requirement of memory, must be a power of /// two and no less than `sizeof(void *)`. -/// \tparam Memory The memory management class. Must provides two member +/// \tparam Memory The memory management class. Must provides two static member /// functions, `aligned_malloc` and `aligned_free`. The member function /// `aligned_malloc` shall behave similar to `std::malloc` but take an /// additional arguments for alignment. The member function `aligned_free` @@ -266,72 +252,96 @@ template class AlignedAllocator : public std::allocator { + static_assert(Alignment != 0 && (Alignment & (Alignment - 1)) == 0, + "**AlignedAllocator** USED WITH Alignment OTHER THAN A POWER OF TWO " + "POSITIVE INTEGER"); + + static_assert(Alignment >= sizeof(void *), + "**AlignedAllocator** USED WITH Alignment LESS THAN sizeof(void *)"); + public: - using size_type = typename std::allocator::size_type; - using pointer = typename std::allocator::pointer; + using value_type = T; + using size_type = std::size_t; + using difference_type = std::ptrdiff_t; + using pointer = T *; + using const_pointer = const T *; + using reference = typename std::add_lvalue_reference::type; + using const_reference = typename std::add_lvalue_reference::type; + using is_always_equal = std::true_type; template class rebind { public: - using other = AlignedAllocator; + using other = AlignedAllocator; }; // class rebind - AlignedAllocator() { VSMC_STATIC_ASSERT_UTILITY_ALIGNED_MEMORY; } + AlignedAllocator() = default; - AlignedAllocator(const AlignedAllocator &other) - : std::allocator(other) - { - VSMC_STATIC_ASSERT_UTILITY_ALIGNED_MEMORY; - } + AlignedAllocator(const AlignedAllocator &) = default; template AlignedAllocator(const AlignedAllocator &other) : std::allocator(static_cast>(other)) { - VSMC_STATIC_ASSERT_UTILITY_ALIGNED_MEMORY; - } - - 
AlignedAllocator &operator=( - const AlignedAllocator &) = default; - - AlignedAllocator(AlignedAllocator &&other) - : std::allocator(std::move(other)) - { - VSMC_STATIC_ASSERT_UTILITY_ALIGNED_MEMORY; - } - - template - AlignedAllocator(AlignedAllocator &&other) - : std::allocator(static_cast>(std::move(other))) - { - VSMC_STATIC_ASSERT_UTILITY_ALIGNED_MEMORY; } - AlignedAllocator &operator=( - AlignedAllocator &&) = default; - - ~AlignedAllocator() {} - - pointer allocate(size_type n, const void * = nullptr) + static pointer allocate(size_type n, const void * = nullptr) { if (n == 0) return nullptr; return static_cast( - memory_.aligned_malloc(sizeof(T) * n, Alignment)); + Memory::aligned_malloc(sizeof(T) * n, Alignment)); } - void deallocate(pointer ptr, size_type) + static void deallocate(pointer ptr, size_type) { if (ptr != nullptr) - memory_.aligned_free(ptr); + Memory::aligned_free(ptr); } +}; // class AlignedAllocator + +template +class AlignedAllocator +{ + using value_type = void; + using pointer = void *; + using const_pointer = const void *; + + template + struct rebind { + using other = AlignedAllocator; + }; +}; // class AlignedAllocator - private: - Memory memory_; +template +class AlignedAllocator +{ + using value_type = const void; + using pointer = const void *; + using const_pointer = const void *; + + template + struct rebind { + using other = AlignedAllocator; + }; }; // class AlignedAllocator +template +inline bool operator==(const AlignedAllocator &, + const AlignedAllocator &) +{ + return true; +} + +template +inline bool operator!=(const AlignedAllocator &, + const AlignedAllocator &) +{ + return false; +} + /// \brief AlignedAllocator for scalar type and `std::allocator` for others /// \ingroup AlignedMemory template @@ -347,7 +357,7 @@ using AlignedVector = std::vector>; /// \ingroup AlignedMemory template using Vector = typename std::conditional::value, - std::vector>, std::vector>::type; + AlignedVector, std::vector>::type; } // 
namespace vsmc diff --git a/include/vsmc/utility/covariance.hpp b/include/vsmc/utility/covariance.hpp new file mode 100644 index 000000000..3d71efd47 --- /dev/null +++ b/include/vsmc/utility/covariance.hpp @@ -0,0 +1,384 @@ +//============================================================================ +// vSMC/include/vsmc/utility/covariance.hpp +//---------------------------------------------------------------------------- +// vSMC: Scalable Monte Carlo +//---------------------------------------------------------------------------- +// Copyright (c) 2013-2016, Yan Zhou +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS AS IS +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. 
+//============================================================================ + +#ifndef VSMC_UTILITY_COVARIANCE_HPP +#define VSMC_UTILITY_COVARIANCE_HPP + +#include +#if VSMC_USE_MKL_VSL +#include +#endif + +namespace vsmc +{ + +namespace internal +{ + +template +inline void cov_pack(std::size_t dim, const RealType *cov, RealType *chol, + MatrixLayout layout, bool upper, bool packed) +{ + unsigned l = layout == RowMajor ? 0 : 1; + unsigned u = upper ? 1 : 0; + unsigned p = packed ? 1 : 0; + unsigned c = (l << 2) + (u << 1) + p; + switch (c) { + case 0: // Row, Lower, Full + for (std::size_t i = 0; i != dim; ++i) + for (std::size_t j = 0; j <= i; ++j) + *chol++ = cov[i * dim + j]; + break; + case 1: // Row, Lower, Pack + std::copy_n(cov, dim * (dim + 1) / 2, chol); + break; + case 2: // Row, Upper, Full + for (std::size_t i = 0; i != dim; ++i) + for (std::size_t j = 0; j <= i; ++j) + *chol++ = cov[j * dim + i]; + break; + case 3: // Row, Upper, Pack + for (std::size_t i = 0; i != dim; ++i) + for (std::size_t j = 0; j <= i; ++j) + *chol++ = cov[dim * j - j * (j + 1) / 2 + i]; + break; + case 4: // Col, Lower, Full + for (std::size_t i = 0; i != dim; ++i) + for (std::size_t j = 0; j <= i; ++j) + *chol++ = cov[j * dim + i]; + break; + case 5: // Col, Lower, Pack + for (std::size_t i = 0; i != dim; ++i) + for (std::size_t j = 0; j <= i; ++j) + *chol++ = cov[dim * j - j * (j + 1) / 2 + i]; + break; + case 6: // Col, Upper, Full + for (std::size_t j = 0; j != dim; ++j) + for (std::size_t i = 0; i <= j; ++i) + *chol++ = cov[j * dim + i]; + break; + case 7: // Col, Upper, Pack + std::copy_n(cov, dim * (dim + 1) / 2, chol); + break; + default: break; + } +} + +inline int cov_chol(std::size_t dim, float *chol) +{ + return static_cast(::LAPACKE_spptrf( + LAPACK_ROW_MAJOR, 'L', static_cast(dim), chol)); +} + +inline int cov_chol(std::size_t dim, double *chol) +{ + return static_cast(::LAPACKE_dpptrf( + LAPACK_ROW_MAJOR, 'L', static_cast(dim), chol)); +} + +} // namespace 
vsmc::internal
+
+/// \brief Compute the Cholesky decomposition of a covariance matrix
+/// \ingroup Covariance
+///
+/// \param dim The number of rows of the covariance matrix
+/// \param cov The covariance matrix
+/// \param chol The output lower triangular elements of the Cholesky
+/// decomposition, packed row by row. This can be directly used as the input
+/// parameter of the NormalMVDistribution constructors.
+/// \param layout The storage layout of the covariance matrix
+/// \param upper If `true`, the upper triangular of the covariance matrix
+/// shall be used. Otherwise the lower triangular shall be used.
+/// \param packed If `true`, the upper or lower triangular of the covariance
+/// matrix is packed, row by row if `layout == RowMajor`, or column by column
+/// if `layout == ColMajor`.
+///
+/// \return
+/// - `0` if successful
+/// - A positive value `i` if the `i`th leading minor of the covariance matrix
+/// is not positive-definite.
+template <typename RealType>
+inline int cov_chol(std::size_t dim, const RealType *cov, RealType *chol,
+    MatrixLayout layout = RowMajor, bool upper = false, bool packed = false)
+{
+    static_assert(internal::is_one_of<RealType, float, double>::value,
+        "**cov_chol** USED WITH RealType OTHER THAN float OR double");
+
+    internal::cov_pack(dim, cov, chol, layout, upper, packed);
+    return internal::cov_chol(dim, chol);
+}
+
+/// \brief Covariance
+/// \ingroup Covariance
+template <typename RealType>
+class Covariance
+{
+    static_assert(internal::is_one_of<RealType, float, double>::value,
+        "**Covariance** USED WITH RealType OTHER THAN float OR double");
+
+    public:
+    using result_type = RealType;
+
+    /// \brief Compute the sample covariance matrix
+    ///
+    /// \param layout The storage layout of the data `x`. The data is
+    /// considered to be an `n` by `dim` matrix in `RowMajor` storage.
+    /// \param n Sample size. If `n == 0` then no computation is carried out.
+    /// \param dim Dimension of the random variable. If `dim == 0` then no
+    /// computation is carried out.
+    /// \param x The samples.
If it is a null pointer, then no computation is
+    /// carried out.
+    /// \param w The weights. If it is a null pointer, then all samples are
+    /// assigned weight 1.
+    /// \param mean Output storage of the mean. If it is a null pointer, then
+    /// it is ignored.
+    /// \param cov Output storage of the covariance matrix. If it is a null
+    /// pointer, then it is ignored.
+    /// \param cov_layout The storage layout of the covariance matrix.
+    /// \param cov_upper If `true`, the upper triangular of the covariance
+    /// matrix is packed, otherwise the lower triangular is packed. Ignored
+    /// if `cov_packed` is `false`.
+    /// \param cov_packed If `true`, the covariance matrix is packed.
+    void operator()(MatrixLayout layout, std::size_t n, std::size_t dim,
+        const result_type *x, const result_type *w, result_type *mean,
+        result_type *cov, MatrixLayout cov_layout = RowMajor,
+        bool cov_upper = false, bool cov_packed = false)
+    {
+        if (n * dim == 0)
+            return;
+
+        if (x == nullptr)
+            return;
+
+        if (mean == nullptr && cov == nullptr)
+            return;
+
+#if VSMC_USE_MKL_VSL
+        MKL_INT px = static_cast<MKL_INT>(dim);
+        MKL_INT nx = static_cast<MKL_INT>(n);
+        MKL_INT xstorage = layout == RowMajor ? VSL_SS_MATRIX_STORAGE_COLS :
+                                                VSL_SS_MATRIX_STORAGE_ROWS;
+        MKL_INT cov_storage = storage(cov_layout, cov_upper, cov_packed);
+        unsigned MKL_INT64 estimates = 0;
+        if (mean != nullptr)
+            estimates |= VSL_SS_MEAN;
+        if (cov != nullptr)
+            estimates |= VSL_SS_COV;
+
+        MKLSSTask<result_type> task(&px, &nx, &xstorage, x, w, nullptr);
+        task.edit_cov_cor(mean, cov, &cov_storage, nullptr, nullptr);
+        task.compute(estimates, VSL_SS_METHOD_FAST);
+#else // VSMC_USE_MKL_VSL
+        result_type sw = w == nullptr ?
+ static_cast(n) : + std::accumulate(w, w + n, static_cast(0)); + mean_.resize(dim); + if (w == nullptr) { + if (layout == RowMajor) { + std::fill(mean_.begin(), mean_.end(), 0); + for (std::size_t i = 0; i != n; ++i) + add(dim, x + i * dim, mean_.data(), mean_.data()); + } else { + for (std::size_t i = 0; i != dim; ++i) { + mean_[i] = std::accumulate(x + i * n, x + (i + 1) * n, + static_cast(0)); + } + } + } else { + mean_init(layout, n, dim, x, w); + } + div(dim, mean_.data(), sw, mean_.data()); + if (mean != nullptr) + std::copy(mean_.begin(), mean_.end(), mean); + if (cov == nullptr) + return; + + result_type sw2 = + w == nullptr ? static_cast(n) : swsqr(n, w); + result_type B = sw / (sw * sw - sw2); + result_type BW = B * sw; + cov_.resize(dim * dim); + std::fill(cov_.begin(), cov_.end(), 0); + cov_init(layout, dim, static_cast(nullptr)); + if (w == nullptr) { + cov_update(layout, n, dim, x, B, BW); + } else { + wsqrt_.resize(n); + buffer_.resize(n * dim); + sqrt(n, w, wsqrt_.data()); + if (layout == RowMajor) { + for (std::size_t i = 0; i != n; ++i) + mul(dim, x + i * dim, wsqrt_[i], buffer_.data() + i * dim); + } else { + for (std::size_t i = 0; i != dim; ++i) + mul(n, x + i * n, wsqrt_.data(), buffer_.data() + i * n); + } + cov_update(layout, n, dim, buffer_.data(), B, BW); + } + cov_pack(dim, cov, layout, cov_layout, cov_upper, cov_packed); +#endif // VSMC_USE_MKL_VSL + } + + private: +#if VSMC_USE_MKL_VSL + MKL_INT storage(MatrixLayout layout, bool upper, bool packed) + { + if (!packed) + return VSL_SS_MATRIX_STORAGE_FULL; + + if (layout == RowMajor) + return upper ? VSL_SS_MATRIX_STORAGE_U_PACKED : + VSL_SS_MATRIX_STORAGE_L_PACKED; + + return upper ? 
VSL_SS_MATRIX_STORAGE_L_PACKED : + VSL_SS_MATRIX_STORAGE_U_PACKED; + } +#else // VSMC_USE_MKL_VSL + Vector mean_; + Vector cov_; + Vector wsqrt_; + Vector buffer_; + + void mean_init(MatrixLayout layout, std::size_t n, std::size_t dim, + const float *x, const float *w) + { + ::cblas_sgemv(layout == RowMajor ? ::CblasRowMajor : ::CblasColMajor, + ::CblasTrans, static_cast(n), + static_cast(dim), 1.0, x, + static_cast(layout == RowMajor ? dim : n), w, 1, + 0.0, mean_.data(), 1); + } + + void mean_init(MatrixLayout layout, std::size_t n, std::size_t dim, + const double *x, const double *w) + { + ::cblas_dgemv(layout == RowMajor ? ::CblasRowMajor : ::CblasColMajor, + ::CblasTrans, static_cast(n), + static_cast(dim), 1.0, x, + static_cast(layout == RowMajor ? dim : n), w, 1, + 0.0, mean_.data(), 1); + } + + static float swsqr(std::size_t n, const float *w) + { + return ::cblas_sdot(static_cast(n), w, 1, w, 1); + } + + static double swsqr(std::size_t n, const double *w) + { + return ::cblas_ddot(static_cast(n), w, 1, w, 1); + } + + void cov_init(MatrixLayout layout, std::size_t dim, float *) + { + ::cblas_ssyr(layout == RowMajor ? ::CblasRowMajor : ::CblasColMajor, + ::CblasLower, static_cast(dim), 1, mean_.data(), 1, + cov_.data(), static_cast(dim)); + } + + void cov_init(MatrixLayout layout, std::size_t dim, double *) + { + ::cblas_dsyr(layout == RowMajor ? ::CblasRowMajor : ::CblasColMajor, + ::CblasLower, static_cast(dim), 1, mean_.data(), 1, + cov_.data(), static_cast(dim)); + } + + void cov_update(MatrixLayout layout, std::size_t n, std::size_t dim, + const float *x, float B, float BW) + { + ::cblas_ssyrk(layout == RowMajor ? ::CblasRowMajor : ::CblasColMajor, + ::CblasLower, ::CblasTrans, static_cast(dim), + static_cast(n), B, x, + static_cast(layout == RowMajor ? 
dim : n), -BW, + cov_.data(), static_cast(dim)); + } + + void cov_update(MatrixLayout layout, std::size_t n, std::size_t dim, + const double *x, double B, double BW) + { + ::cblas_dsyrk(layout == RowMajor ? ::CblasRowMajor : ::CblasColMajor, + ::CblasLower, ::CblasTrans, static_cast(dim), + static_cast(n), B, x, + static_cast(layout == RowMajor ? dim : n), -BW, + cov_.data(), static_cast(dim)); + } + + void cov_pack(std::size_t dim, result_type *cov, MatrixLayout layout, + MatrixLayout cov_layout, bool cov_upper, bool cov_packed) + { + if (layout == RowMajor) + for (std::size_t i = 0; i != dim; ++i) + for (std::size_t j = 0; j != i; ++j) + cov_[j * dim + i] = cov_[i * dim + j]; + + if (layout == ColMajor) + for (std::size_t i = 0; i != dim; ++i) + for (std::size_t j = 0; j != i; ++j) + cov_[i * dim + j] = cov_[j * dim + i]; + + if (!cov_packed) { + std::copy(cov_.begin(), cov_.end(), cov); + return; + } + + unsigned l = cov_layout == RowMajor ? 0 : 1; + unsigned u = cov_upper ? 1 : 0; + unsigned c = (l << 1) + u; + switch (c) { + case 0: // Row, Lower, Pack + for (size_t i = 0; i != dim; ++i) + for (std::size_t j = 0; j <= i; ++j) + *cov++ = cov_[i * dim + j]; + break; + case 1: // Row, Upper, Pack + for (std::size_t i = 0; i != dim; ++i) + for (std::size_t j = i; j != dim; ++j) + *cov++ = cov_[i * dim + j]; + break; + case 2: // Col, Lower, Pack + for (std::size_t j = 0; j != dim; ++j) + for (std::size_t i = j; i != dim; ++i) + *cov++ = cov_[j * dim + i]; + break; + case 3: // Col, Upper, Pack + for (std::size_t j = 0; j != dim; ++j) + for (std::size_t i = 0; i <= j; ++i) + *cov++ = cov_[j * dim + i]; + break; + default: break; + } + } +#endif // VSMC_USE_MKL_VSL +}; // class Covariance + +} // namespace vsmc + +#endif // VSMC_UTILITY_COVARIANCE_HPP diff --git a/include/vsmc/utility/hdf5io.hpp b/include/vsmc/utility/hdf5io.hpp index a31dae1ad..391e7548c 100644 --- a/include/vsmc/utility/hdf5io.hpp +++ b/include/vsmc/utility/hdf5io.hpp @@ -3,7 +3,7 @@ 
//----------------------------------------------------------------------------
 // vSMC: Scalable Monte Carlo
 //----------------------------------------------------------------------------
-// Copyright (c) 2013-2015, Yan Zhou
+// Copyright (c) 2013-2016, Yan Zhou
 // All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without
@@ -41,7 +41,7 @@ namespace vsmc
 namespace internal
 {
 
-template <MatrixOrder Order>
+template <MatrixLayout Layout>
 inline void hdf5io_matrix_dim(std::size_t, std::size_t, ::hsize_t *);
 
 template <>
@@ -317,7 +317,7 @@ inline void hdf5store_new(const std::string &file_name)
 /// \brief Store a matrix in the HDF5 format from an input iterator
 /// \ingroup HDF5IO
 ///
-/// \tparam Order Storage order (RowMajor or ColMajor)
+/// \tparam Layout Storage layout (RowMajor or ColMajor)
 /// \tparam T Type of the data
 /// \param nrow Number of rows
 /// \param ncol Number of columns
@@ -329,7 +329,7 @@ inline void hdf5store_new(const std::string &file_name)
 /// save in a new file
 ///
 /// \note
-/// HDF5 stores data in row major order. For example,
+/// HDF5 stores data in row major layout. For example,
 /// ~~~{.cpp}
 /// double data[6] = {1, 2, 3, 4, 5, 6};
 /// // Store matrix
@@ -360,12 +360,11 @@ inline void hdf5store_new(const std::string &file_name)
 /// # [3,]    3    6
 /// #
 /// ~~~
-/// That is, when the data is stored in column major order in C++ memory, then
+/// That is, when the data is stored in column major layout in C++ memory, then
 /// the read in R produces exactly the same output. If the data is stored as a
-/// row
-/// major matrix in C++ memory, the read in R produces the transpose the
+/// row major matrix in C++ memory, the read in R produces the transpose of the
 /// original matrix though they are identical in memory.
-template +template inline InputIter hdf5store_matrix(std::size_t nrow, std::size_t ncol, const std::string &file_name, const std::string &data_name, InputIter first, bool append = false) @@ -375,7 +374,7 @@ inline InputIter hdf5store_matrix(std::size_t nrow, std::size_t ncol, std::string dataset_name("/" + data_name); ::hsize_t dim[2]; - internal::hdf5io_matrix_dim(nrow, ncol, dim); + internal::hdf5io_matrix_dim(nrow, ncol, dim); internal::HDF5StoreDataPtr data_ptr; InputIter last = data_ptr.set(nrow * ncol, first); const T *data = data_ptr.get(); @@ -610,7 +609,7 @@ inline bool hdf5store_int(std::size_t n, IntType *r, std::false_type) bool flag = true; for (std::size_t i = 0; i != n; ++i) { - if (r[i] > std::numeric_limits::max VSMC_MNE()) { + if (r[i] > std::numeric_limits::max()) { flag = false; break; } @@ -627,11 +626,11 @@ inline bool hdf5store_int(std::size_t n, IntType *r, std::true_type) bool flag = true; for (std::size_t i = 0; i != n; ++i) { - if (r[i] < std::numeric_limits::min VSMC_MNE()) { + if (r[i] < std::numeric_limits::min()) { flag = false; break; } - if (r[i] > std::numeric_limits::max VSMC_MNE()) { + if (r[i] > std::numeric_limits::max()) { flag = false; break; } @@ -697,18 +696,18 @@ inline void hdf5store(const Sampler &sampler, const std::string &file_name, /// \brief Store a StateMatrix in the HDF5 format /// \ingroup HDF5IO -template -inline void hdf5store(const StateMatrix &state, +template +inline void hdf5store(const StateMatrix &state, const std::string &file_name, const std::string &data_name, bool append = false) { - hdf5store_matrix( + hdf5store_matrix( state.size(), state.dim(), file_name, data_name, state.data(), append); } /// \brief Store a StateCL in the HDF5 format /// \ingroup HDF5IO -template inline void hdf5store(const StateCL &state, const std::string &file_name, const std::string &data_name, @@ -720,7 +719,7 @@ inline void hdf5store(const StateCL &state, Vector data(N); state.manager().template read_buffer( 
state.state_buffer().data(), N, data.data()); - hdf5store_matrix( + hdf5store_matrix( nrow, ncol, file_name, data_name, data.data(), append); } @@ -739,13 +738,13 @@ inline void hdf5store(const Particle &particle, /// \brief Store a Particle with StateCL value type in the HDF5 format /// \ingroup HDF5IO -template +template inline void hdf5store(const Particle &particle, const std::string &file_name, const std::string &data_name, bool append = false) { hdf5store_list_empty(file_name, data_name, append); - hdf5store( + hdf5store( particle.value(), file_name, data_name + "/value", true); hdf5store_matrix(particle.size(), 1, file_name, data_name + "/weight", particle.weight().data(), true); diff --git a/include/vsmc/utility/mkl.hpp b/include/vsmc/utility/mkl.hpp index 131192c5f..a2fd5634b 100644 --- a/include/vsmc/utility/mkl.hpp +++ b/include/vsmc/utility/mkl.hpp @@ -3,7 +3,7 @@ //---------------------------------------------------------------------------- // vSMC: Scalable Monte Carlo //---------------------------------------------------------------------------- -// Copyright (c) 2013-2015, Yan Zhou +// Copyright (c) 2013-2016, Yan Zhou // All rights reserved. 
// // Redistribution and use in source and binary forms, with or without @@ -32,41 +32,10 @@ #ifndef VSMC_UTILITY_MKL_HPP #define VSMC_UTILITY_MKL_HPP -#include #include +#include #include -#define VSMC_STATIC_ASSERT_UTILITY_MKL_SS_TASK_RESULT_TYPE(ResultType) \ - VSMC_STATIC_ASSERT((std::is_same::value || \ - std::is_same::value), \ - "**MKLSSTask** USED WITH A ResultType OTHER THAN float OR double") - -#define VSMC_STATIC_ASSERT_UTILITY_MKL_CONV_TASK_RESULT_TYPE(ResultType) \ - VSMC_STATIC_ASSERT((std::is_same::value || \ - std::is_same::value || \ - std::is_same::value || \ - std::is_same::value), \ - "**MKLConvTask** USED WITH A ResultType OTHER THAN float, double, " \ - "MKL_Complex8, OR MKL_Complex16") - -#define VSMC_STATIC_ASSERT_UTILITY_MKL_CORR_TASK_RESULT_TYPE(ResultType) \ - VSMC_STATIC_ASSERT((std::is_same::value || \ - std::is_same::value || \ - std::is_same::value || \ - std::is_same::value), \ - "**MKLCorrTask** USED WITH A ResultType OTHER THAN float, double, " \ - "MKL_Complex8, OR MKL_Complex16") - -#define VSMC_STATIC_ASSERT_UTILITY_MKL_DF_TASK_RESULT_TYPE(ResultType) \ - VSMC_STATIC_ASSERT((std::is_same::value || \ - std::is_same::value), \ - "**MKLDFTask** USED WITH A ResultType OTHER THAN float OR double") - -#define VSMC_RUNTIME_ASSERT_UTILITY_MKL_VSL_OFFSET(offset) \ - VSMC_RUNTIME_ASSERT((offset < max VSMC_MNE()), \ - "**MKLOffsetDynamic** " \ - "EXCESS MAXIMUM NUMBER OF INDEPDENT RNG STREAMS") - namespace vsmc { @@ -178,8 +147,6 @@ inline void swap( class MKLStream : public MKLBase<::VSLStreamStatePtr, MKLStream> { public: - MKLStream() = default; - /// \brief `vslNewStream` MKLStream(MKL_INT brng, MKL_UINT seed) { reset(brng, seed); } @@ -732,22 +699,19 @@ class MKLStream : public MKLBase<::VSLStreamStatePtr, MKLStream> /// \brief MKL `VSLSSTaskPtr` /// \ingroup MKL -template -class MKLSSTask : public MKLBase<::VSLSSTaskPtr, MKLSSTask> +template +class MKLSSTask : public MKLBase<::VSLSSTaskPtr, MKLSSTask> { - public: - using result_type 
= ResultType; + static_assert(internal::is_one_of::value, + "**MKLSSTask** USED WITH RealType OTHER THAN float or double"); - MKLSSTask() - { - VSMC_STATIC_ASSERT_UTILITY_MKL_SS_TASK_RESULT_TYPE(ResultType); - } + public: + using result_type = RealType; /// \brief `vslSSNewTask` MKLSSTask(const MKL_INT *p, const MKL_INT *n, const MKL_INT *xstorage, const result_type *x, const result_type *w, const MKL_INT *indices) { - VSMC_STATIC_ASSERT_UTILITY_MKL_SS_TASK_RESULT_TYPE(ResultType); reset(p, n, xstorage, x, w, indices); } @@ -895,7 +859,7 @@ class MKLSSTask : public MKLBase<::VSLSSTaskPtr, MKLSSTask> int compute(unsigned MKL_INT64 estimates, MKL_INT method) { return compute_dispatch( - estimates, method, static_cast(nullptr)); + estimates, method, static_cast(nullptr)); } private: @@ -1278,19 +1242,18 @@ class MKLSSTask : public MKLBase<::VSLSSTaskPtr, MKLSSTask> template class MKLConvTask : public MKLBase<::VSLConvTaskPtr, MKLConvTask> { + static_assert(internal::is_one_of::value, + "**MKLConvTask** USED WITH ResultType OTHER THAN float, double, " + "MKL_Complex8 OR MKL_Complex16"); + public: using result_type = ResultType; - MKLConvTask() - { - VSMC_STATIC_ASSERT_UTILITY_MKL_CONV_TASK_RESULT_TYPE(ResultType); - } - /// \brief `vslConvNewTask` MKLConvTask(MKL_INT mode, MKL_INT dims, const MKL_INT *xshape, const MKL_INT *yshape, const MKL_INT *zshape) { - VSMC_STATIC_ASSERT_UTILITY_MKL_CONV_TASK_RESULT_TYPE(ResultType); reset(mode, dims, xshape, yshape, zshape); } @@ -1298,7 +1261,6 @@ class MKLConvTask : public MKLBase<::VSLConvTaskPtr, MKLConvTask> MKLConvTask( MKL_INT mode, const MKL_INT xshape, MKL_INT yshape, MKL_INT zshape) { - VSMC_STATIC_ASSERT_UTILITY_MKL_CONV_TASK_RESULT_TYPE(ResultType); reset(mode, xshape, yshape, zshape); } @@ -1307,7 +1269,6 @@ class MKLConvTask : public MKLBase<::VSLConvTaskPtr, MKLConvTask> const MKL_INT *yshape, const MKL_INT *zshape, const result_type *x, const MKL_INT *xstride) { - 
VSMC_STATIC_ASSERT_UTILITY_MKL_CONV_TASK_RESULT_TYPE(ResultType); reset(mode, dims, xshape, yshape, zshape, x, xstride); } @@ -1315,15 +1276,12 @@ class MKLConvTask : public MKLBase<::VSLConvTaskPtr, MKLConvTask> MKLConvTask(MKL_INT mode, MKL_INT xshape, MKL_INT yshape, MKL_INT zshape, const result_type *x, const MKL_INT xstride) { - VSMC_STATIC_ASSERT_UTILITY_MKL_CONV_TASK_RESULT_TYPE(ResultType); reset(mode, xshape, yshape, zshape, x, xstride); } /// \brief `vslConvCopyTask` MKLConvTask(const MKLConvTask &other) { - VSMC_STATIC_ASSERT_UTILITY_MKL_CONV_TASK_RESULT_TYPE(ResultType); - ::VSLConvTaskPtr ptr = nullptr; internal::mkl_error_check(::vslConvCopyTask(&ptr, other.get()), "MKLConvTask::MKLConvTask", "::vslConvCopyTask"); @@ -1607,19 +1565,18 @@ class MKLConvTask : public MKLBase<::VSLConvTaskPtr, MKLConvTask> template class MKLCorrTask : public MKLBase<::VSLCorrTaskPtr, MKLCorrTask> { + static_assert(internal::is_one_of::value, + "**MKLCorrTask** USED WITH ResultType OTHER THAN float, double, " + "MKL_Complex8 OR MKL_Complex16"); + public: using result_type = ResultType; - MKLCorrTask() - { - VSMC_STATIC_ASSERT_UTILITY_MKL_CORR_TASK_RESULT_TYPE(ResultType); - } - /// \brief `vslCorrNewTask` MKLCorrTask(MKL_INT mode, MKL_INT dims, const MKL_INT *xshape, const MKL_INT *yshape, const MKL_INT *zshape) { - VSMC_STATIC_ASSERT_UTILITY_MKL_CORR_TASK_RESULT_TYPE(ResultType); reset(mode, dims, xshape, yshape, zshape); } @@ -1627,7 +1584,6 @@ class MKLCorrTask : public MKLBase<::VSLCorrTaskPtr, MKLCorrTask> MKLCorrTask( MKL_INT mode, const MKL_INT xshape, MKL_INT yshape, MKL_INT zshape) { - VSMC_STATIC_ASSERT_UTILITY_MKL_CORR_TASK_RESULT_TYPE(ResultType); reset(mode, xshape, yshape, zshape); } @@ -1636,7 +1592,6 @@ class MKLCorrTask : public MKLBase<::VSLCorrTaskPtr, MKLCorrTask> const MKL_INT *yshape, const MKL_INT *zshape, const result_type *x, const MKL_INT *xstride) { - VSMC_STATIC_ASSERT_UTILITY_MKL_CORR_TASK_RESULT_TYPE(ResultType); reset(mode, dims, xshape, 
yshape, zshape, x, xstride); } @@ -1644,15 +1599,12 @@ class MKLCorrTask : public MKLBase<::VSLCorrTaskPtr, MKLCorrTask> MKLCorrTask(MKL_INT mode, MKL_INT xshape, MKL_INT yshape, MKL_INT zshape, const result_type *x, const MKL_INT xstride) { - VSMC_STATIC_ASSERT_UTILITY_MKL_CORR_TASK_RESULT_TYPE(ResultType); reset(mode, xshape, yshape, zshape, x, xstride); } /// \brief `vslCorrCopyTask` MKLCorrTask(const MKLCorrTask &other) { - VSMC_STATIC_ASSERT_UTILITY_MKL_CORR_TASK_RESULT_TYPE(ResultType); - ::VSLCorrTaskPtr ptr = nullptr; internal::mkl_error_check(::vslCorrCopyTask(&ptr, other.get()), "MKLCorrTask::MKLCorrTask", "::vslCorrCopyTask"); @@ -1933,21 +1885,18 @@ class MKLCorrTask : public MKLBase<::VSLCorrTaskPtr, MKLCorrTask> /// \brief MKL `DFTaskPtr` /// \ingroup MKL -template -class MKLDFTask +template +class MKLDFTask : public MKLBase<::DFTaskPtr, MKLDFTask> { - public: - using result_type = ResultType; + static_assert(internal::is_one_of::value, + "**MKLDFTask** USED WITH RealType OTHER THAN float OR double"); - MKLDFTask() - { - VSMC_STATIC_ASSERT_UTILITY_MKL_DF_TASK_RESULT_TYPE(ResultType); - } + public: + using result_type = RealType; MKLDFTask(MKL_INT nx, const result_type *x, MKL_INT xhint, MKL_INT ny, const result_type *y, MKL_INT yhint) { - VSMC_STATIC_ASSERT_UTILITY_MKL_DF_TASK_RESULT_TYPE(ResultType); reset(nx, x, xhint, ny, y, yhint); } diff --git a/include/vsmc/utility/program_option.hpp b/include/vsmc/utility/program_option.hpp index 717b8c1ec..cc5b6bb7a 100644 --- a/include/vsmc/utility/program_option.hpp +++ b/include/vsmc/utility/program_option.hpp @@ -3,7 +3,7 @@ //---------------------------------------------------------------------------- // vSMC: Scalable Monte Carlo //---------------------------------------------------------------------------- -// Copyright (c) 2013-2015, Yan Zhou +// Copyright (c) 2013-2016, Yan Zhou // All rights reserved. 
// // Redistribution and use in source and binary forms, with or without @@ -36,33 +36,36 @@ #define VSMC_RUNTIME_ASSERT_UTILITY_PROGRAM_OPTION_NULLPTR(ptr, func) \ VSMC_RUNTIME_ASSERT((ptr != nullptr), \ - "**ProgramOptionMap::" #func \ + "**ProgramOption::" #func \ "** ATTEMPT TO SET OPTION WITH A NULL POINTER") namespace vsmc { -/// \brief Program option warning messages -/// \ingroup Option -inline void program_option_warning(const std::string &oname, +namespace internal +{ + +inline void program_option_warning(const std::string &name, const std::string &msg, bool silent, std::ostream &os) { if (silent) return; os << "vSMC Program Option Warning\n"; - os << "Option: " << oname << '\n'; + os << "Option: --" << name << '\n'; os << "Message : " << msg << std::endl; } -/// \brief Program option base class +} // namespace vsmc::internal + +/// \brief Option base class /// \ingroup Option class ProgramOption { public: - ProgramOption() {} - ProgramOption(const ProgramOption &) {} - ProgramOption &operator=(const ProgramOption &) { return *this; } + ProgramOption() = default; + ProgramOption(const ProgramOption &) = default; + ProgramOption &operator=(const ProgramOption &) = default; virtual ~ProgramOption() {} virtual bool is_bool() const = 0; @@ -72,10 +75,9 @@ class ProgramOption virtual bool set_default() = 0; virtual std::string description() const = 0; virtual std::string default_str() const = 0; - virtual ProgramOption *clone() const = 0; protected: - bool set_value(const std::string &oname, const std::string &sval, + bool set_value(const std::string &name, const std::string &sval, bool *dest, bool silent, std::ostream &os) { const char *const sptr = sval.c_str(); @@ -128,13 +130,21 @@ class ProgramOption return true; } - program_option_warning( - oname, "Failed to set value: " + sval, silent, os); + internal::program_option_warning( + name, "Failed to set value: " + sval, silent, os); return false; } + bool set_value(const std::string &, const std::string 
&sval, + std::string *dest, bool, std::ostream &) + { + *dest = sval; + + return true; + } + template - bool set_value(const std::string &oname, const std::string &sval, T *dest, + bool set_value(const std::string &name, const std::string &sval, T *dest, bool silent, std::ostream &os) { std::stringstream ss; @@ -142,8 +152,8 @@ class ProgramOption T tval; ss >> tval; if (ss.fail()) { - program_option_warning( - oname, "Failed to set value: " + sval, silent, os); + internal::program_option_warning( + name, "Failed to set value: " + sval, silent, os); ss.clear(); return false; } @@ -164,10 +174,10 @@ class ProgramOptionHelp : public ProgramOption bool is_vector() const { return false; } - bool set(const std::string &oname, const std::string &sval, bool silent, + bool set(const std::string &name, const std::string &sval, bool silent, std::ostream &os) { - return set_value(oname, sval, &help_, silent, os); + return set_value(name, sval, &help_, silent, os); } bool set_default() { return false; } @@ -179,8 +189,6 @@ class ProgramOptionHelp : public ProgramOption std::string default_str() const { return std::string("(false)"); } - ProgramOption *clone() const { return new ProgramOptionHelp; } - bool help() const { return help_; } private: @@ -261,16 +269,14 @@ class ProgramOptionScalar : public ProgramOptionDefault bool is_vector() const { return false; } - bool set(const std::string &oname, const std::string &sval, bool silent, + bool set(const std::string &name, const std::string &sval, bool silent, std::ostream &os) { - return this->set_value(oname, sval, ptr_, silent, os); + return this->set_value(name, sval, ptr_, silent, os); } bool set_default() { return this->set_value_default(ptr_); } - ProgramOption *clone() const { return new ProgramOptionScalar(*this); } - private: T *const ptr_; }; // class ProgramOptionScalar @@ -282,136 +288,52 @@ class ProgramOptionVector : public ProgramOptionDefault { public: ProgramOptionVector(const std::string &desc, std::vector 
*ptr) - : ProgramOptionDefault(desc), val_(T()), ptr_(ptr) + : ProgramOptionDefault(desc), ptr_(ptr) { } template ProgramOptionVector(const std::string &desc, std::vector *ptr, V val) - : ProgramOptionDefault(desc, val), val_(T()), ptr_(ptr) + : ProgramOptionDefault(desc, val), ptr_(ptr) { } bool is_vector() const { return true; } - bool set(const std::string &oname, const std::string &sval, bool silent, + bool set(const std::string &name, const std::string &sval, bool silent, std::ostream &os) { - bool success = this->set_value(oname, sval, &val_, silent, os); - + T val; + bool success = this->set_value(name, sval, &val, silent, os); if (success) - ptr_->push_back(val_); + ptr_->push_back(val); return success; } bool set_default() { - bool success = this->set_value_default(&val_); - + T val; + bool success = this->set_value_default(&val); if (success) - ptr_->push_back(val_); + ptr_->push_back(val); return success; } - ProgramOption *clone() const { return new ProgramOptionVector(*this); } - private: - T val_; std::vector *const ptr_; }; // class ProgramOptionVector -/// \brief A map of ProgramOption +/// \brief Program options /// \ingroup Option class ProgramOptionMap { - using option_map_type = - std::map>; - using option_list_type = - std::list>; - public: - explicit ProgramOptionMap(bool silent = false, bool auto_help = true) - : silent_(silent) - , auto_help_(auto_help) - , help_ptr_(new ProgramOptionHelp) - { - option_map_["--help"] = std::make_pair(help_ptr_, 0); - option_list_.push_back(std::make_pair("--help", help_ptr_)); - } - - ProgramOptionMap(const ProgramOptionMap &other) - : silent_(other.silent_) - , auto_help_(other.auto_help_) - , option_map_(other.option_map_) - , option_list_(other.option_list_) - { - for (option_map_type::iterator iter = option_map_.begin(); - iter != option_map_.end(); ++iter) { - if (iter->second.first) - iter->second.first = iter->second.first->clone(); - } - } - - ProgramOptionMap &operator=(const ProgramOptionMap 
&other) - { - if (this != &other) { - silent_ = other.silent_; - auto_help_ = other.auto_help_; - for (option_map_type::iterator iter = option_map_.begin(); - iter != option_map_.end(); ++iter) { - if (iter->second.first) - delete iter->second.first; - } - - option_map_ = other.option_map_; - option_list_ = other.option_list_; - - for (option_map_type::iterator iter = option_map_.begin(); - iter != option_map_.end(); ++iter) { - if (iter->second.first) - iter->second.first = iter->second.first->clone(); - } - } - - return *this; - } - - ProgramOptionMap(ProgramOptionMap &&other) - : silent_(other.silent_) - , auto_help_(other.auto_help_) - , help_ptr_(other.help_ptr_) - , option_map_(std::move(other.option_map_)) - , option_list_(std::move(other.option_list_)) + explicit ProgramOptionMap(bool silent = false) + : silent_(silent), help_ptr_(std::make_shared()) { - other.help_ptr_ = nullptr; - other.option_map_.clear(); - other.option_list_.clear(); - } - - ProgramOptionMap &operator=(ProgramOptionMap &&other) - { - if (this != &other) { - silent_ = other.silent_; - help_ptr_ = other.help_ptr_; - option_map_ = std::move(other.option_map_); - option_list_ = std::move(other.option_list_); - other.help_ptr_ = nullptr; - other.option_map_.clear(); - other.option_list_.clear(); - } - - return *this; - } - - ~ProgramOptionMap() - { - for (option_map_type::iterator iter = option_map_.begin(); - iter != option_map_.end(); ++iter) { - if (iter->second.first != nullptr) - delete iter->second.first; - } + add_option("help", help_ptr_); } /// \brief Add an option with a single value @@ -425,11 +347,9 @@ class ProgramOptionMap const std::string &name, const std::string &desc, T *ptr) { VSMC_RUNTIME_ASSERT_UTILITY_PROGRAM_OPTION_NULLPTR(ptr, add); - const std::string oname("--" + name); - ProgramOption *optr = new ProgramOptionScalar(desc, ptr); - add_option(oname, optr); - return *this; + return add_option( + name, std::make_shared>(desc, ptr)); } /// \brief Add an option with a 
single value, with a default value @@ -438,11 +358,9 @@ class ProgramOptionMap const std::string &name, const std::string &desc, T *ptr, V val) { VSMC_RUNTIME_ASSERT_UTILITY_PROGRAM_OPTION_NULLPTR(ptr, add); - const std::string oname("--" + name); - ProgramOption *optr = new ProgramOptionScalar(desc, ptr, val); - add_option(oname, optr); - return *this; + return add_option( + name, std::make_shared>(desc, ptr, val)); } /// \brief Add an option with multiple value @@ -451,11 +369,9 @@ class ProgramOptionMap const std::string &name, const std::string &desc, std::vector *ptr) { VSMC_RUNTIME_ASSERT_UTILITY_PROGRAM_OPTION_NULLPTR(ptr, add); - const std::string oname("--" + name); - ProgramOption *optr = new ProgramOptionVector(desc, ptr); - add_option(oname, optr); - return *this; + return add_option( + name, std::make_shared>(desc, ptr)); } /// \brief Add an option with multiple value, with a default value @@ -464,24 +380,16 @@ class ProgramOptionMap std::vector *ptr, V val) { VSMC_RUNTIME_ASSERT_UTILITY_PROGRAM_OPTION_NULLPTR(ptr, add); - const std::string oname("--" + name); - ProgramOption *optr = new ProgramOptionVector(desc, ptr, val); - add_option(oname, optr); - return *this; + return add_option( + name, std::make_shared>(desc, ptr, val)); } ProgramOptionMap &remove(const std::string &name) { - const std::string oname("--" + name); - option_map_type::iterator iter = option_map_.find(oname); - if (iter != option_map_.end()) { - if (iter->second.first != nullptr) - delete iter->second.first; - option_map_.erase(iter); - option_list_type::iterator liter = option_list_find(oname); - option_list_.erase(liter); - } + auto iter = option_find(name); + if (iter != option_vec_.end()) + option_vec_.erase(iter); return *this; } @@ -494,13 +402,12 @@ class ProgramOptionMap /// /// \param argc The first argument of the `main` function /// \param argv The second argument of the `main` function - /// \param os The output stream used to print help information if - /// 
`auto_help` is set to true, and the warning messages if any error - /// occurs when processing the options. + /// \param os The output stream used to print help information and the + /// warning messages if any error occurs when processing the options. void process(int argc, const char **argv, std::ostream &os = std::cout) { std::string arg; - std::vector arg_vector; + Vector arg_vector; arg_vector.reserve(static_cast(argc)); for (int i = 0; i != argc; ++i) { arg = process_arg(argv[i]); @@ -514,7 +421,7 @@ class ProgramOptionMap void process(int argc, char **argv, std::ostream &os = std::cout) { std::string arg; - std::vector arg_vector; + Vector arg_vector; arg_vector.reserve(static_cast(argc)); for (int i = 0; i != argc; ++i) { arg = process_arg(argv[i]); @@ -528,158 +435,150 @@ class ProgramOptionMap void print_help(std::ostream &os = std::cout) const { std::size_t len[2] = {0, 0}; - std::vector vec[3]; - for (option_list_type::const_iterator liter = option_list_.begin(); - liter != option_list_.end(); ++liter) { - vec[0].push_back(liter->first); - vec[1].push_back(liter->second->description()); - vec[2].push_back(liter->second->default_str()); - if (len[0] < vec[0].back().size()) - len[0] = vec[0].back().size(); - if (len[1] < vec[1].back().size()) - len[1] = vec[1].back().size(); + Vector str[3]; + for (const auto &option : option_vec_) { + str[0].push_back("--" + std::get<0>(option)); + str[1].push_back(std::get<1>(option)->description()); + str[2].push_back(std::get<1>(option)->default_str()); + len[0] = std::max(len[0], str[0].back().size()); + len[1] = std::max(len[1], str[1].back().size()); } len[0] += 4; len[1] += 4; - for (std::size_t i = 0; i != vec[0].size(); ++i) { - os << vec[0][i] << std::string(len[0] - vec[0][i].size(), ' '); - os << vec[1][i] << std::string(len[1] - vec[1][i].size(), ' '); - os << vec[2][i] << std::endl; + for (std::size_t i = 0; i != str[0].size(); ++i) { + os << str[0][i] << std::string(len[0] - str[0][i].size(), ' '); + os << 
str[1][i] << std::string(len[1] - str[1][i].size(), ' '); + os << str[2][i] << std::endl; } } /// \brief Count the number of successful processing of an option std::size_t count(const std::string &name) const { - option_map_type::const_iterator iter = option_map_.find("--" + name); - if (iter != option_map_.end()) - return iter->second.second; - else - return 0; + auto iter = option_find(name); + if (iter != option_vec_.end()) + return std::get<2>(*iter); + return 0; } + /// \brief If the "help" option is processed and set to true + bool help() { return help_ptr_->help(); } + /// \brief Get the underlying option object - const ProgramOption *option(const std::string &name) const + std::shared_ptr option(const std::string &name) { - option_map_type::const_iterator iter = option_map_.find("--" + name); - if (iter != option_map_.end()) - return iter->second.first; - else - return nullptr; + auto iter = option_find(name); + if (iter != option_vec_.end()) + return std::get<1>(*iter); + return std::shared_ptr( + static_cast(nullptr)); } /// \brief Get the underlying option object - ProgramOption *option(const std::string &name) + std::shared_ptr option(const std::string &name) const { - option_map_type::const_iterator iter = option_map_.find("--" + name); - if (iter != option_map_.end()) - return iter->second.first; - else - return nullptr; + auto iter = option_find(name); + if (iter != option_vec_.end()) + return std::get<1>(*iter); + return std::shared_ptr( + static_cast(nullptr)); } /// \brief Set the silent flag, if true, no warning messages will be /// printed for unknown options etc., void silent(bool flag) { silent_ = flag; } - /// \brief Set the auto_help flag, if true, help information is printed - /// automatically when the `--help` option is processed - void auto_help(bool flag) { auto_help_ = flag; } - private: + using option_vec_type = Vector< + std::tuple, std::size_t>>; + bool silent_; - bool auto_help_; - ProgramOptionHelp *help_ptr_; - option_map_type 
option_map_; - option_list_type option_list_; + std::shared_ptr help_ptr_; + option_vec_type option_vec_; - option_list_type::iterator option_list_find(const std::string &oname) + option_vec_type::iterator option_find(const std::string &name) { - option_list_type::iterator liter = option_list_.begin(); - for (; liter != option_list_.end(); ++liter) { - if (liter->first == oname) + auto iter = option_vec_.begin(); + for (; iter != option_vec_.end(); ++iter) + if (std::get<0>(*iter) == name) break; - } - return liter; + return iter; } - void add_option(const std::string &oname, ProgramOption *optr) + option_vec_type::const_iterator option_find(const std::string &name) const { - std::pair insert = - option_map_.insert(std::make_pair(oname, std::make_pair(optr, 0))); - if (insert.second) { - option_list_.push_back(std::make_pair(oname, optr)); - } else { - if (insert.first->second.first != nullptr) - delete insert.first->second.first; - insert.first->second.first = optr; - option_list_type::iterator liter = option_list_find(oname); - liter->second = optr; - } + auto iter = option_vec_.begin(); + for (; iter != option_vec_.end(); ++iter) + if (std::get<0>(*iter) == name) + break; + + return iter; + } + + ProgramOptionMap &add_option( + const std::string &name, std::shared_ptr optr) + { + auto option = std::make_tuple(name, optr, 0); + auto iter = option_find(name); + if (iter != option_vec_.end()) + *iter = option; + else + option_vec_.push_back(option); + + return *this; } void process_arg_vector( std::vector &arg_vector, std::ostream &os) { - std::string option_value; - const std::vector option_value_vec; - std::vector>> - option_vector; - std::vector::iterator aiter = arg_vector.begin(); + Vector>> name_svals; + Vector svals; + auto aiter = arg_vector.begin(); while (aiter != arg_vector.end() && !is_option(*aiter)) ++aiter; while (aiter != arg_vector.end()) { - option_vector.push_back(std::make_pair(*aiter, option_value_vec)); - std::vector &value = 
option_vector.back().second; + std::string name(aiter->begin() + 2, aiter->end()); ++aiter; + svals.clear(); while (aiter != arg_vector.end() && !is_option(*aiter)) { - value.push_back(*aiter); + svals.push_back(*aiter); ++aiter; } + name_svals.push_back(std::make_pair(name, svals)); } const std::string sval_true("1"); - for (std::vector>>::iterator iter = - option_vector.begin(); - iter != option_vector.end(); ++iter) { - option_map_type::iterator miter = option_map_.find(iter->first); - if (miter == option_map_.end()) { - program_option_warning( - iter->first, "Unknown option", silent_, os); + for (auto &nsv : name_svals) { + auto iter = option_find(nsv.first); + if (iter == option_vec_.end()) { + internal::program_option_warning( + nsv.first, "Unknown option", silent_, os); continue; } bool proc = false; - const std::size_t vsize = iter->second.size(); - if (vsize == 0 && miter->second.first->is_bool()) { - proc = process_option(miter, sval_true, os); - } else if (vsize == 0) { - program_option_warning( - miter->first, "Value not found", silent_, os); - } else if (!miter->second.first->is_vector()) { - option_value.clear(); - for (std::size_t i = 0; i != vsize - 1; ++i) - option_value += iter->second[i] + ' '; - option_value += iter->second[vsize - 1]; - proc = process_option(miter, option_value, os); + if (nsv.second.size() == 0 && std::get<1>(*iter)->is_bool()) { + proc = process_option(iter, sval_true, os); + } else if (nsv.second.size() == 0) { + internal::program_option_warning( + nsv.first, "No value found", silent_, os); + } else if (std::get<1>(*iter)->is_vector()) { + for (const auto &sval : nsv.second) + proc = process_option(iter, sval, os); } else { - for (std::size_t i = 0; i != vsize; ++i) - proc = process_option(miter, iter->second[i], os) || proc; + proc = process_option(iter, nsv.second.back(), os) || proc; } if (proc) - ++miter->second.second; + ++std::get<2>(*iter); } - for (option_map_type::iterator iter = option_map_.begin(); - iter != 
option_map_.end(); ++iter) { - if (iter->second.second == 0) - if (iter->second.first->set_default()) - iter->second.second = 1; - } + for (auto &option : option_vec_) + if (std::get<2>(option) == 0) + if (std::get<1>(option)->set_default()) + std::get<2>(option) = 1; - if (auto_help_ && help_ptr_->help()) + if (help()) print_help(os); } @@ -693,15 +592,15 @@ class ProgramOptionMap return std::string(arg, arg + e); } - bool process_option(option_map_type::iterator iter, + bool process_option(option_vec_type::iterator iter, const std::string &sval, std::ostream &os) { if (sval.empty()) { - program_option_warning(iter->first, "No value found", silent_, os); + internal::program_option_warning( + std::get<0>(*iter), "No value found", silent_, os); return false; } - - return iter->second.first->set(iter->first, sval, silent_, os); + return std::get<1>(*iter)->set(std::get<0>(*iter), sval, silent_, os); } bool is_option(const std::string &str) const diff --git a/include/vsmc/utility/progress.hpp b/include/vsmc/utility/progress.hpp index 601b23b89..2af14a11f 100644 --- a/include/vsmc/utility/progress.hpp +++ b/include/vsmc/utility/progress.hpp @@ -3,7 +3,7 @@ //---------------------------------------------------------------------------- // vSMC: Scalable Monte Carlo //---------------------------------------------------------------------------- -// Copyright (c) 2013-2015, Yan Zhou +// Copyright (c) 2013-2016, Yan Zhou // All rights reserved. 
// // Redistribution and use in source and binary forms, with or without @@ -117,7 +117,7 @@ class Progress private: static constexpr std::size_t max_val_ = - std::numeric_limits::max VSMC_MNE(); + std::numeric_limits::max(); StopWatch watch_; std::thread *thread_ptr_; diff --git a/include/vsmc/utility/simd.hpp b/include/vsmc/utility/simd.hpp index bb2ccd0e2..2be502c24 100644 --- a/include/vsmc/utility/simd.hpp +++ b/include/vsmc/utility/simd.hpp @@ -3,7 +3,7 @@ //---------------------------------------------------------------------------- // vSMC: Scalable Monte Carlo //---------------------------------------------------------------------------- -// Copyright (c) 2013-2015, Yan Zhou +// Copyright (c) 2013-2016, Yan Zhou // All rights reserved. // // Redistribution and use in source and binary forms, with or without @@ -120,8 +120,7 @@ class M128I M128I(const __m128i &value) : value_(value) {} template - M128I(const M128I &other) - : value_(other.value()) + M128I(const M128I &other) : value_(other.value()) { } @@ -449,7 +448,8 @@ inline M128I operator|(const M128I &a, const M128I &b) } template -inline M128I operator^(const M128I &a, const M128I &b) { +inline M128I operator^(const M128I &a, const M128I &b) +{ return M128I(_mm_xor_si128(a.value(), b.value())); } @@ -788,6 +788,7 @@ class M128TypeTrait } // namespace vsmc::internal /// \brief floating point SSE2 type +/// \ingroup SIMD template using M128Type = typename std::conditional::value, M128I, typename internal::M128TypeTrait::type>::type; @@ -810,8 +811,7 @@ class M256I M256I(const __m256i &value) : value_(value) {} template - M256I(const M256I &other) - : value_(other.value()) + M256I(const M256I &other) : value_(other.value()) { } @@ -1158,7 +1158,8 @@ inline M256I operator|(const M256I &a, const M256I &b) } template -inline M256I operator^(const M256I &a, const M256I &b) { +inline M256I operator^(const M256I &a, const M256I &b) +{ return M256I(_mm256_xor_si256(a.value(), b.value())); } @@ -1501,6 +1502,7 
@@ class M256TypeTrait } // namespace vsmc::internal /// \brief floating point SSE2 type +/// \ingroup SIMD template using M256Type = typename std::conditional::value, M256I, typename internal::M256TypeTrait::type>::type; diff --git a/include/vsmc/utility/stop_watch.hpp b/include/vsmc/utility/stop_watch.hpp index cc72a9eaf..d01e11087 100644 --- a/include/vsmc/utility/stop_watch.hpp +++ b/include/vsmc/utility/stop_watch.hpp @@ -3,7 +3,7 @@ //---------------------------------------------------------------------------- // vSMC: Scalable Monte Carlo //---------------------------------------------------------------------------- -// Copyright (c) 2013-2015, Yan Zhou +// Copyright (c) 2013-2016, Yan Zhou // All rights reserved. // // Redistribution and use in source and binary forms, with or without diff --git a/include/vsmc/utility/utility.hpp b/include/vsmc/utility/utility.hpp index 9f0bb32fd..9287a1bcb 100644 --- a/include/vsmc/utility/utility.hpp +++ b/include/vsmc/utility/utility.hpp @@ -3,7 +3,7 @@ //---------------------------------------------------------------------------- // vSMC: Scalable Monte Carlo //---------------------------------------------------------------------------- -// Copyright (c) 2013-2015, Yan Zhou +// Copyright (c) 2013-2016, Yan Zhou // All rights reserved. // // Redistribution and use in source and binary forms, with or without @@ -34,6 +34,7 @@ #include #include +#include #include #include #include diff --git a/include/vsmc/vsmc.h b/include/vsmc/vsmc.h index 0db02af3c..87a3fae27 100644 --- a/include/vsmc/vsmc.h +++ b/include/vsmc/vsmc.h @@ -3,7 +3,7 @@ *---------------------------------------------------------------------------- * vSMC: Scalable Monte Carlo *---------------------------------------------------------------------------- - * Copyright (c) 2013-2015, Yan Zhou + * Copyright (c) 2013-2016, Yan Zhou * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -148,6 +148,10 @@ void vsmc_rng_lognormal( void vsmc_rng_normal( vsmc_rng *rng_ptr, int n, double *r, double mean, double stddev); +/// \brief `vsmc::NormalMVDistribution` +void vsmc_rng_normal_mv(vsmc_rng *rng_ptr, int n, double *r, int dim, + const double *mean, const double *chol); + /// \brief `vsmc::ParetoDistribution` void vsmc_rng_pareto(vsmc_rng *rng_ptr, int n, double *r, double a, double b); @@ -160,38 +164,10 @@ void vsmc_rng_student_t(vsmc_rng *rng_ptr, int n, double *r, double df); /// \brief `vsmc::U01Distribution` void vsmc_rng_u01(vsmc_rng *rng_ptr, int n, double *r); -/// \brief `vsmc::U01CCDistribution` -void vsmc_rng_u01_cc(vsmc_rng *rng_ptr, int n, double *r); - -/// \brief `vsmc::U01CODistribution` -void vsmc_rng_u01_co(vsmc_rng *rng_ptr, int n, double *r); - -/// \brief `vsmc::U01OCDistribution` -void vsmc_rng_u01_oc(vsmc_rng *rng_ptr, int n, double *r); - -/// \brief `vsmc::U01OODistribution` -void vsmc_rng_u01_oo(vsmc_rng *rng_ptr, int n, double *r); - /// \brief `vsmc::UniformRealDistribution` void vsmc_rng_uniform_real( vsmc_rng *rng_ptr, int n, double *r, double a, double b); -/// \brief `vsmc::UniformRealCCDistribution` -void vsmc_rng_uniform_real_cc( - vsmc_rng *rng_ptr, int n, double *r, double a, double b); - -/// \brief `vsmc::UniformRealCODistribution` -void vsmc_rng_uniform_real_co( - vsmc_rng *rng_ptr, int n, double *r, double a, double b); - -/// \brief `vsmc::UniformRealOCDistribution` -void vsmc_rng_uniform_real_oc( - vsmc_rng *rng_ptr, int n, double *r, double a, double b); - -/// \brief `vsmc::UniformRealOODistribution` -void vsmc_rng_uniform_real_oo( - vsmc_rng *rng_ptr, int n, double *r, double a, double b); - /// \brief `vsmc::WeibullDistribution` void vsmc_rng_weibull(vsmc_rng *rng_ptr, int n, double *r, double a, double b); @@ -263,6 +239,30 @@ int vsmc_mkl_brng_rdrand64(void); /// @} +/// \defgroup C_API_RandomWalk Random walk +/// @{ + +/// 
\brief `vsmc::RandomWalk` +int vsmc_random_walk(vsmc_rng *rng_ptr, int dim, double *x, double *ltx, + double (*log_target)(int, const double *), + double (*proposal)(vsmc_rng *, int, const double *, double *)); + +/// \brief `vsmc::RandomWalkG` +int vsmc_random_walk_g(vsmc_rng *rng_ptr, int dim_x, int dim_g, double *x, + double *ltx, double *g, + double (*log_target)(int, int, const double *, double *), + double (*proposal)(vsmc_rng *, int, const double *, double *)); + +/// \brief `vsmc::NormalProposal` +double vsmc_normal_proposal(vsmc_rng *rng_ptr, int, const double *x, double *y, + double stddev, double a, double b); + +/// \brief `vsmc::NormalMVProposal` +double vsmc_normal_mv_proposal(vsmc_rng *rng_ptr, int dim, const double *x, + double *y, const double *chol, const double *a, const double *b); + +/// @} + /// \defgroup C_API_Memory Memory allocation /// @{ @@ -286,11 +286,11 @@ typedef enum { void vsmc_resample_trans_u01_rep( int m, int n, const double *weight, const double *u01, int *replication); void vsmc_resample_trans_u01_index( - int m, int n, const double *weight, const double *u01, int *src_idx); + int m, int n, const double *weight, const double *u01, int *index); void vsmc_resample_trans_rep_index( - int m, int n, const int *replication, int *src_idx); + int m, int n, const int *replication, int *index); void vsmc_resample_trans_index_rep( - int m, int n, const int *src_idx, int *replication); + int m, int n, const int *index, int *replication); int vsmc_resample_trans_residual( int m, int n, const double *weight, double *resid, int *integ); diff --git a/include/vsmc/vsmc.hpp b/include/vsmc/vsmc.hpp index 5934a415d..65e8df97e 100644 --- a/include/vsmc/vsmc.hpp +++ b/include/vsmc/vsmc.hpp @@ -3,7 +3,7 @@ //---------------------------------------------------------------------------- // vSMC: Scalable Monte Carlo //---------------------------------------------------------------------------- -// Copyright (c) 2013-2015, Yan Zhou +// Copyright (c) 
2013-2016, Yan Zhou // All rights reserved. // // Redistribution and use in source and binary forms, with or without @@ -65,10 +65,6 @@ /// \defgroup Math Mathematics /// \brief Mathematical utilities -/// \defgroup CBLAS C BLAS -/// \ingroup Math -/// \brief Selected C BLAS like routines - /// \defgroup Constants Constants /// \ingroup Math /// \brief Mathematical constants @@ -100,6 +96,10 @@ /// \ingroup RNG /// \brief Random number generating using Random123 Threefry RNG +/// \defgroup RandomWalk Random walk +/// \ingroup RNG +/// \brief Random walk MCMC kernels + /// \defgroup RDRAND Intel DRNG /// \ingroup RNG /// \brief Random number generating using Intel RDRAND instructions @@ -119,14 +119,6 @@ /// \defgroup RNGC Random number generating in C and OpenCL /// \brief Random number generating in C and OpenCL -/// \defgroup GammaK1C GammaK1 -/// \ingroup RNGC -/// \brief Generating Gamma(k, 1) random numbers - -/// \defgroup Normal01C Normal01 -/// \ingroup RNGC -/// \brief Generating Normal(0, 1) random numbers - /// \defgroup PhiloxC Philox /// \ingroup RNGC /// \brief Random number generating using Random123 Philox RNG @@ -146,6 +138,10 @@ /// \ingroup Utility /// \brief Memory allocation with alignment requirement +/// \defgroup Covariance Covariance +/// \ingroup Utility +/// \brief Covariance matrix estimation and manipulation + /// \defgroup HDF5IO HDF5 objects IO /// \ingroup Utility /// \brief Load and store objects in the HDF5 format diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt index 27edca13b..eec8343ef 100644 --- a/lib/CMakeLists.txt +++ b/lib/CMakeLists.txt @@ -3,30 +3,30 @@ # ---------------------------------------------------------------------------- # vSMC: Scalable Monte Carlo # ---------------------------------------------------------------------------- -# Copyright (c) 2013-2015, Yan Zhou -# All rights reserved. +# Copyright (c) 2013-2016, Yan Zhou +# All rights reserved. 
# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: # -# Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. +# Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. # -# Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. +# Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. # -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE -# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -# POSSIBILITY OF SUCH DAMAGE. 
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. # ============================================================================ PROJECT(vSMCLib C CXX) @@ -34,13 +34,12 @@ PROJECT(vSMCLib C CXX) SET(VSMC_LIB_SOURCE ${VSMC_LIB_SOURCE} ${PROJECT_SOURCE_DIR}/src/vsmc_memory.cpp ${PROJECT_SOURCE_DIR}/src/vsmc_resample.cpp - ${PROJECT_SOURCE_DIR}/src/vsmc_rng.cpp - ) + ${PROJECT_SOURCE_DIR}/src/vsmc_rng.cpp) -IF (MKL_FOUND) +IF(MKL_FOUND) SET(VSMC_LIB_SOURCE ${VSMC_LIB_SOURCE} ${PROJECT_SOURCE_DIR}/src/vsmc_mkl_brng.cpp) -ENDIF (MKL_FOUND) +ENDIF(MKL_FOUND) ADD_LIBRARY(libvsmc ${VSMC_LIB_SOURCE}) SET_TARGET_PROPERTIES(libvsmc PROPERTIES OUTPUT_NAME vsmc) diff --git a/lib/src/vsmc_memory.cpp b/lib/src/vsmc_memory.cpp index b18f4c5b2..707de5f9d 100644 --- a/lib/src/vsmc_memory.cpp +++ b/lib/src/vsmc_memory.cpp @@ -3,7 +3,7 @@ //---------------------------------------------------------------------------- // vSMC: Scalable Monte Carlo //---------------------------------------------------------------------------- -// Copyright (c) 2013-2015, Yan Zhou +// Copyright (c) 2013-2016, Yan Zhou // All rights reserved. 
// // Redistribution and use in source and binary forms, with or without diff --git a/lib/src/vsmc_mkl_brng.cpp b/lib/src/vsmc_mkl_brng.cpp index d24ce68bd..4d56254b3 100644 --- a/lib/src/vsmc_mkl_brng.cpp +++ b/lib/src/vsmc_mkl_brng.cpp @@ -3,7 +3,7 @@ //---------------------------------------------------------------------------- // vSMC: Scalable Monte Carlo //---------------------------------------------------------------------------- -// Copyright (c) 2013-2015, Yan Zhou +// Copyright (c) 2013-2016, Yan Zhou // All rights reserved. // // Redistribution and use in source and binary forms, with or without diff --git a/lib/src/vsmc_resample.cpp b/lib/src/vsmc_resample.cpp index 2fcd7d11b..dac294d8d 100644 --- a/lib/src/vsmc_resample.cpp +++ b/lib/src/vsmc_resample.cpp @@ -3,7 +3,7 @@ //---------------------------------------------------------------------------- // vSMC: Scalable Monte Carlo //---------------------------------------------------------------------------- -// Copyright (c) 2013-2015, Yan Zhou +// Copyright (c) 2013-2016, Yan Zhou // All rights reserved. // // Redistribution and use in source and binary forms, with or without @@ -29,8 +29,8 @@ // POSSIBILITY OF SUCH DAMAGE. 
//============================================================================ -#include #include +#include #include "vsmc_rng_cast.hpp" #define VSMC_DEFINE_C_API_RESAMPLE(Name, name) \ @@ -53,24 +53,24 @@ void vsmc_resample_trans_u01_rep( } void vsmc_resample_trans_u01_index( - int m, int n, const double *weight, const double *u01, int *src_idx) + int m, int n, const double *weight, const double *u01, int *index) { ::vsmc::resample_trans_u01_index(static_cast(m), - static_cast(n), weight, u01, src_idx); + static_cast(n), weight, u01, index); } void vsmc_resample_trans_rep_index( - int m, int n, const int *replication, int *src_idx) + int m, int n, const int *replication, int *index) { ::vsmc::resample_trans_rep_index(static_cast(m), - static_cast(n), replication, src_idx); + static_cast(n), replication, index); } void vsmc_resample_trans_index_rep( - int m, int n, const int *src_idx, int *replication) + int m, int n, const int *index, int *replication) { ::vsmc::resample_trans_index_rep( - static_cast(m), std::size_t(n), src_idx, replication); + static_cast(m), std::size_t(n), index, replication); } int vsmc_resample_trans_residual( diff --git a/lib/src/vsmc_rng.cpp b/lib/src/vsmc_rng.cpp index 9f311542a..b22cb0c87 100644 --- a/lib/src/vsmc_rng.cpp +++ b/lib/src/vsmc_rng.cpp @@ -3,7 +3,7 @@ //---------------------------------------------------------------------------- // vSMC: Scalable Monte Carlo //---------------------------------------------------------------------------- -// Copyright (c) 2013-2015, Yan Zhou +// Copyright (c) 2013-2016, Yan Zhou // All rights reserved. // // Redistribution and use in source and binary forms, with or without @@ -29,8 +29,8 @@ // POSSIBILITY OF SUCH DAMAGE. 
//============================================================================ -#include #include +#include #include "vsmc_rng_cast.hpp" #define VSMC_DEFINE_RNG_DIST \ @@ -192,11 +192,15 @@ void vsmc_rng_poisson(vsmc_rng *rng_ptr, int n, int *r, double mean) VSMC_DEFINE_RNG_DIST; } +void vsmc_rng_normal_mv(vsmc_rng *rng_ptr, int n, double *r, int dim, + const double *mean, const double *chol) +{ + ::vsmc::RNG &rng = ::vsmc::internal::rng_cast(rng_ptr); + ::vsmc::normal_mv_distribution(rng, static_cast(n), r, + static_cast(dim), mean, chol); +} + VSMC_DEFINE_RNG_DIST_0(u01) -VSMC_DEFINE_RNG_DIST_0(u01_cc) -VSMC_DEFINE_RNG_DIST_0(u01_co) -VSMC_DEFINE_RNG_DIST_0(u01_oc) -VSMC_DEFINE_RNG_DIST_0(u01_oo) VSMC_DEFINE_RNG_DIST_1(chi_squared) VSMC_DEFINE_RNG_DIST_1(exponential) VSMC_DEFINE_RNG_DIST_1(rayleigh) @@ -213,10 +217,63 @@ VSMC_DEFINE_RNG_DIST_2(lognormal) VSMC_DEFINE_RNG_DIST_2(normal) VSMC_DEFINE_RNG_DIST_2(pareto) VSMC_DEFINE_RNG_DIST_2(uniform_real) -VSMC_DEFINE_RNG_DIST_2(uniform_real_cc) -VSMC_DEFINE_RNG_DIST_2(uniform_real_co) -VSMC_DEFINE_RNG_DIST_2(uniform_real_oc) -VSMC_DEFINE_RNG_DIST_2(uniform_real_oo) VSMC_DEFINE_RNG_DIST_2(weibull) +int vsmc_random_walk(vsmc_rng *rng_ptr, int dim, double *x, double *ltx, + double (*log_target)(int, const double *), + double (*proposal)(vsmc_rng *, int, const double *, double *)) +{ + ::vsmc::RNG &rng = ::vsmc::internal::rng_cast(rng_ptr); + ::vsmc::RandomWalk rw( + static_cast(dim)); + + auto lt = [log_target, dim]( + std::size_t, const double *lx) { return log_target(dim, lx); }; + + auto prop = [proposal, rng_ptr, dim](::vsmc::RNG &, std::size_t, + const double *px, + double *py) { return proposal(rng_ptr, dim, px, py); }; + + return static_cast(rw(rng, x, ltx, lt, prop)); +} + +int vsmc_random_walk_g(vsmc_rng *rng_ptr, int dim_x, int dim_g, double *x, + double *ltx, double *g, + double (*log_target)(int, int, const double *, double *), + double (*proposal)(vsmc_rng *, int, const double *, double *)) +{ + 
::vsmc::RNG &rng = ::vsmc::internal::rng_cast(rng_ptr); + ::vsmc::RandomWalkG rw( + static_cast(dim_x), static_cast(dim_g)); + + auto lt = [log_target, dim_x, dim_g](std::size_t, std::size_t, + const double *lx, + double *lg) { return log_target(dim_x, dim_g, lx, lg); }; + + auto prop = [proposal, rng_ptr, dim_x](::vsmc::RNG &, std::size_t, + const double *px, + double *py) { return proposal(rng_ptr, dim_x, px, py); }; + + return static_cast(rw(rng, x, ltx, g, lt, prop)); +} + +double vsmc_normal_proposal(vsmc_rng *rng_ptr, int, const double *x, + double *y, double stddev, double a, double b) +{ + ::vsmc::RNG &rng = ::vsmc::internal::rng_cast(rng_ptr); + ::vsmc::NormalProposal prop(stddev, a, b); + + return prop(rng, 1, x, y); +} + +double vsmc_normal_mv_proposal(vsmc_rng *rng_ptr, int dim, const double *x, + double *y, const double *chol, const double *a, const double *b) +{ + ::vsmc::RNG &rng = ::vsmc::internal::rng_cast(rng_ptr); + ::vsmc::NormalMVProposal prop( + static_cast(dim), chol, a, b); + + return prop(rng, static_cast(dim), x, y); +} + } // extern "C" diff --git a/lib/src/vsmc_rng_cast.hpp b/lib/src/vsmc_rng_cast.hpp index a13be08cb..ffd0f0e4a 100644 --- a/lib/src/vsmc_rng_cast.hpp +++ b/lib/src/vsmc_rng_cast.hpp @@ -3,7 +3,7 @@ //---------------------------------------------------------------------------- // vSMC: Scalable Monte Carlo //---------------------------------------------------------------------------- -// Copyright (c) 2013-2015, Yan Zhou +// Copyright (c) 2013-2016, Yan Zhou // All rights reserved. 
// // Redistribution and use in source and binary forms, with or without diff --git a/user_guide/cpp/Makefile new file mode 100644 index 000000000..a046a2586 --- /dev/null +++ b/user_guide/cpp/Makefile @@ -0,0 +1,25 @@ +CXX := clang++ +CXXFLAGS := $(CXXFLAGS) -std=c++11 -O3 +CXXFLAGS := $(CXXFLAGS) -DVSMC_HAS_MKL +CXXFLAGS := $(CXXFLAGS) -DVSMC_HAS_TBB +CXXFLAGS := $(CXXFLAGS) -I ~/Documents/GitHub/vSMC/include +LDFLAGS := -lmkl_rt -ltbb -ltbbmalloc + +all : pf_seq pf_tbb progress program_option + +pf_seq : pf_seq.cpp + $(CXX) $(CXXFLAGS) -o pf_seq pf_seq.cpp $(LDFLAGS) + +pf_tbb : pf_tbb.cpp + $(CXX) $(CXXFLAGS) -o pf_tbb pf_tbb.cpp $(LDFLAGS) + +program_option : program_option.cpp + $(CXX) $(CXXFLAGS) -o program_option program_option.cpp $(LDFLAGS) + +progress : progress.cpp + $(CXX) $(CXXFLAGS) -o progress progress.cpp $(LDFLAGS) + +clean: + rm -f pf_seq pf_tbb pf.out pf.rout pf.pdf + rm -f program_option + rm -f progress diff --git a/user_guide/cpp/pf.R new file mode 100644 index 000000000..45f224673 --- /dev/null +++ b/user_guide/cpp/pf.R @@ -0,0 +1,18 @@ +library(ggplot2) + +pf <- read.table("pf.out", header = TRUE) +sink("pf.rout") +print(pf[1:5,]) +sink() + +obs <- read.table("pf.data", header = FALSE) +dat <- data.frame( + X = c(pf[["pos.0"]], obs[,1]), + Y = c(pf[["pos.1"]], obs[,2])) +dat[["Source"]] <- rep(c("Estimate", "Observation"), each = dim(obs)[1]) +plt <- qplot(x = X, y = Y, data = dat, geom = "path") +plt <- plt + aes(group = Source, color = Source, linetype = Source) +plt <- plt + theme_bw() + theme(legend.position = "top") +pdf("pf.pdf") +print(plt) +dev.off() diff --git a/user_guide/cpp/pf.data new file mode 100644 index 000000000..df69373cc --- /dev/null +++ b/user_guide/cpp/pf.data @@ -0,0 +1,100 @@ +-1.3112368222317 3.05445446146941 +-1.18084192156169 3.2026952109489 +-1.31786851718686 2.88913768742942 +-1.50357802721903 3.46463318923276 +-1.46226396819012
3.49006369310796 +-1.77395566111683 3.53281069036811 +-1.58329636598704 3.45802947672192 +-1.4146213179481 3.85036842928615 +-1.17347691181189 3.81191420563327 +-1.08267505748716 3.92139843981037 +-1.41722298460544 3.50086173794325 +-1.66180722650006 3.93522677670438 +-1.36838808985725 4.2243437123992 +-1.45689906906278 3.7661800548733 +-1.62117111483901 3.77176805335284 +-2.05667361970354 4.13014864754588 +-1.84307687567164 3.99681489590283 +-1.92830122010255 3.93465858140606 +-1.76181569319081 4.01237249226117 +-2.02529650940206 4.52977318841181 +-1.87568750745383 4.299671960717 +-1.82992860052213 4.63331719382436 +-1.79903630802634 4.68648978693436 +-2.23556932017191 4.57005876109171 +-2.13713691324512 4.63016410212737 +-1.96619524510094 4.58321212409546 +-2.43123169274206 4.77850978611916 +-2.51643163674528 4.77792919938755 +-2.31003174748722 5.00379558200132 +-2.35797680508247 5.34785489907257 +-2.32462119928004 5.51990404993908 +-2.3349085071662 5.52469354183444 +-2.60343946213774 5.79362985065052 +-2.55018471403192 5.87863721004318 +-2.55696551469937 6.12792624607303 +-2.53846749609916 5.87502765334891 +-2.57204648440671 6.17039025731492 +-2.90648727919603 6.44736846673525 +-3.022485019546 5.92064570108665 +-2.9813547166056 6.27516190223598 +-3.03380507188488 6.11877220099743 +-2.70406187998343 6.14575746349561 +-3.04971400398642 6.4201702677493 +-3.26134069169318 6.54250153420819 +-3.53289991721096 6.98680509090092 +-3.30662650671125 7.24805594408631 +-3.38978183894266 7.37846457594642 +-3.51812762277457 7.3696206333205 +-3.29403338586032 7.50878097362767 +-3.5185838082989 7.47381143469807 +-3.4840502392735 7.73599690745035 +-3.64739563333776 7.84664116813733 +-3.6435996445389 8.1846210209492 +-3.37571118810378 8.15012182818292 +-3.24647125688327 8.43890143795509 +-3.60812471950665 8.44661408222665 +-3.40115814959644 8.5052955914581 +-3.65316069021424 8.75834905591329 +-3.75665451208443 8.64136558847173 +-3.68847007183854 8.90752775482836 +-3.98837564571867 
9.23112070904798 +-4.10126681252717 9.3650499841899 +-4.17913322655295 9.44170746398772 +-3.76790962900344 9.56215713316184 +-3.91972534140568 9.24019430868945 +-4.04104585790236 9.09843531599986 +-3.72112255981371 9.50683000262241 +-3.90470320549296 9.89346728030876 +-4.5668738568331 9.87248720069089 +-4.07106917364539 10.2148743105618 +-4.11494386469553 10.1834193038788 +-4.12252384878619 10.3319125875737 +-4.35281607763816 10.5148549796559 +-4.31862922599998 10.8015124545271 +-4.7433116147536 11.2010253021165 +-4.93531767342049 11.3341115377069 +-5.17459468423179 11.3460629450974 +-5.21310280977189 11.3492898633319 +-5.43063839492691 11.6323905116851 +-5.16452610629386 11.4155238150159 +-5.32337678808473 11.4295103541245 +-5.40731537219592 11.716576123091 +-5.26096785525304 11.6977526683485 +-5.14062953068388 12.1676097562834 +-5.34778083933244 11.6795576734845 +-5.28673905730339 11.9946422147231 +-5.29813104472041 11.9202142873615 +-5.52583985863003 12.1198024872054 +-5.34731702593164 12.6396076782783 +-5.4308495174137 12.7045461177656 +-5.74374015270827 12.8847311081838 +-5.47139763530352 12.7807494978094 +-5.70252008086239 13.1944817465382 +-5.78573701298091 13.2160604563087 +-5.81577139983498 13.0020585013642 +-5.75998162986483 12.8745793253432 +-6.09326684844694 13.037394682276 +-6.05343255791713 13.1331797169505 +-6.25115479362752 12.9688423961153 +-6.52454592352636 13.3655802454826 diff --git a/user_guide/cpp/pf_seq.cpp b/user_guide/cpp/pf_seq.cpp new file mode 100644 index 000000000..2a3aec18e --- /dev/null +++ b/user_guide/cpp/pf_seq.cpp @@ -0,0 +1,168 @@ +#include + +static constexpr std::size_t N = 1000; // Number of particles +static constexpr std::size_t n = 100; // Number of data points +static constexpr std::size_t PosX = 0; +static constexpr std::size_t PosY = 1; +static constexpr std::size_t VelX = 2; +static constexpr std::size_t VelY = 3; + +using PFStateBase = vsmc::StateMatrix; + +class PFState : public PFStateBase +{ + public: + using 
PFStateBase::PFStateBase; + + double log_likelihood(std::size_t t, size_type i) const + { + double llh_x = 10 * (this->state(i, PosX) - obs_x_[t]); + double llh_y = 10 * (this->state(i, PosY) - obs_y_[t]); + llh_x = std::log(1 + llh_x * llh_x / 10); + llh_y = std::log(1 + llh_y * llh_y / 10); + + return -0.5 * (10 + 1) * (llh_x + llh_y); + } + + void read_data(const char *param) + { + if (param == nullptr) + return; + + obs_x_.resize(n); + obs_y_.resize(n); + std::ifstream data(param); + for (std::size_t i = 0; i != n; ++i) + data >> obs_x_[i] >> obs_y_[i]; + data.close(); + } + + private: + vsmc::Vector obs_x_; + vsmc::Vector obs_y_; +}; + +class PFInit +{ + public: + std::size_t operator()(vsmc::Particle &particle, void *param) + { + eval_param(particle, param); + eval_pre(particle); + std::size_t acc = 0; + for (auto sp : particle) + acc += eval_sp(sp); + eval_post(particle); + + return acc; + } + + void eval_param(vsmc::Particle &particle, void *param) + { + particle.value().read_data(static_cast(param)); + } + + void eval_pre(vsmc::Particle &particle) + { + w_.resize(particle.size()); + } + + std::size_t eval_sp(vsmc::SingleParticle sp) + { + vsmc::NormalDistribution norm_pos(0, 2); + vsmc::NormalDistribution norm_vel(0, 1); + sp.state(PosX) = norm_pos(sp.rng()); + sp.state(PosY) = norm_pos(sp.rng()); + sp.state(VelX) = norm_vel(sp.rng()); + sp.state(VelY) = norm_vel(sp.rng()); + w_[sp.id()] = sp.particle().value().log_likelihood(0, sp.id()); + + return 0; + } + + void eval_post(vsmc::Particle &particle) + { + particle.weight().set_log(w_.data()); + } + + private: + vsmc::Vector w_; +}; + +class PFMove +{ + public: + std::size_t operator()(std::size_t t, vsmc::Particle &particle) + { + eval_pre(t, particle); + std::size_t acc = 0; + for (auto sp : particle) + acc += eval_sp(t, sp); + eval_post(t, particle); + + return acc; + } + + void eval_pre(std::size_t t, vsmc::Particle &particle) + { + w_.resize(particle.size()); + } + + std::size_t eval_sp(std::size_t t, 
vsmc::SingleParticle sp) + { + vsmc::NormalDistribution norm_pos(0, std::sqrt(0.02)); + vsmc::NormalDistribution norm_vel(0, std::sqrt(0.001)); + sp.state(PosX) += norm_pos(sp.rng()) + 0.1 * sp.state(VelX); + sp.state(PosY) += norm_pos(sp.rng()) + 0.1 * sp.state(VelY); + sp.state(VelX) += norm_vel(sp.rng()); + sp.state(VelY) += norm_vel(sp.rng()); + w_[sp.id()] = sp.particle().value().log_likelihood(t, sp.id()); + + return 0; + } + + void eval_post(std::size_t t, vsmc::Particle &particle) + { + particle.weight().add_log(w_.data()); + } + + private: + vsmc::Vector w_; +}; + +class PFMEval +{ + public: + void operator()(std::size_t t, std::size_t dim, + vsmc::Particle &particle, double *r) + { + eval_pre(t, particle); + for (std::size_t i = 0; i != particle.size(); ++i, r += dim) + eval_sp(t, dim, particle.sp(i), r); + eval_post(t, particle); + } + + void eval_pre(std::size_t t, vsmc::Particle &particle) {} + + void eval_sp(std::size_t t, std::size_t dim, + vsmc::SingleParticle sp, double *r) + { + r[0] = sp.state(PosX); + r[1] = sp.state(PosY); + } + + void eval_post(std::size_t t, vsmc::Particle &particle) {} +}; + +int main() +{ + vsmc::Sampler sampler(N, vsmc::Multinomial, 0.5); + sampler.init(PFInit()).move(PFMove(), false).monitor("pos", 2, PFMEval()); + sampler.initialize(const_cast("pf.data")).iterate(n - 1); + + std::ofstream output("pf.out"); + output << sampler; + output.close(); + + return 0; +} diff --git a/user_guide/cpp/pf_tbb.cpp b/user_guide/cpp/pf_tbb.cpp new file mode 100644 index 000000000..8f6b6dc1d --- /dev/null +++ b/user_guide/cpp/pf_tbb.cpp @@ -0,0 +1,154 @@ +#include + +static constexpr std::size_t N = 1000; // Number of particles +static constexpr std::size_t n = 100; // Number of data points + +using PFStateBase = vsmc::StateMatrix; + +template +using PFStateSPBase = PFStateBase::single_particle_type; + +class PFState : public PFStateBase +{ + public: + using PFStateBase::StateMatrix; + + template + class single_particle_type : public 
PFStateSPBase + { + public: + using PFStateSPBase::single_particle_type; + + double &pos_x() { return this->state(0); } + double &pos_y() { return this->state(1); } + double &vel_x() { return this->state(2); } + double &vel_y() { return this->state(3); } + + double log_likelihood(std::size_t t) + { + double llh_x = 10 * (pos_x() - obs_x(t)); + double llh_y = 10 * (pos_y() - obs_y(t)); + llh_x = std::log(1 + llh_x * llh_x / 10); + llh_y = std::log(1 + llh_y * llh_y / 10); + + return -0.5 * (10 + 1) * (llh_x + llh_y); + } + + private: + double obs_x(std::size_t t) + { + return this->particle().value().obs_x_[t]; + } + + double obs_y(std::size_t t) + { + return this->particle().value().obs_y_[t]; + } + }; + + void read_data(const char *param) + { + if (param == nullptr) + return; + + obs_x_.resize(n); + obs_y_.resize(n); + std::ifstream data(param); + for (std::size_t i = 0; i != n; ++i) + data >> obs_x_[i] >> obs_y_[i]; + data.close(); + } + + private: + vsmc::Vector obs_x_; + vsmc::Vector obs_y_; +}; + +class PFInit : public vsmc::InitializeTBB +{ + public: + void eval_param(vsmc::Particle &particle, void *param) + { + particle.value().read_data(static_cast(param)); + } + + void eval_pre(vsmc::Particle &particle) + { + w_.resize(particle.size()); + } + + std::size_t eval_sp(vsmc::SingleParticle sp) + { + vsmc::NormalDistribution norm_pos(0, 2); + vsmc::NormalDistribution norm_vel(0, 1); + sp.pos_x() = norm_pos(sp.rng()); + sp.pos_y() = norm_pos(sp.rng()); + sp.vel_x() = norm_vel(sp.rng()); + sp.vel_y() = norm_vel(sp.rng()); + w_[sp.id()] = sp.log_likelihood(0); + + return 0; + } + + void eval_post(vsmc::Particle &particle) + { + particle.weight().set_log(w_.data()); + } + + private: + vsmc::Vector w_; +}; + +class PFMove : public vsmc::MoveTBB +{ + public: + void eval_pre(std::size_t t, vsmc::Particle &particle) + { + w_.resize(particle.size()); + } + + std::size_t eval_sp(std::size_t t, vsmc::SingleParticle sp) + { + vsmc::NormalDistribution norm_pos(0, 
std::sqrt(0.02)); + vsmc::NormalDistribution norm_vel(0, std::sqrt(0.001)); + sp.pos_x() += norm_pos(sp.rng()) + 0.1 * sp.vel_x(); + sp.pos_y() += norm_pos(sp.rng()) + 0.1 * sp.vel_y(); + sp.vel_x() += norm_vel(sp.rng()); + sp.vel_y() += norm_vel(sp.rng()); + w_[sp.id()] = sp.log_likelihood(t); + + return 0; + } + + void eval_post(std::size_t t, vsmc::Particle &particle) + { + particle.weight().add_log(w_.data()); + } + + private: + vsmc::Vector w_; +}; + +class PFMEval : public vsmc::MonitorEvalTBB +{ + public: + void eval_sp(std::size_t t, std::size_t dim, + vsmc::SingleParticle sp, double *r) + { + r[0] = sp.pos_x(); + r[1] = sp.pos_y(); + } +}; + +int main() +{ + vsmc::Sampler sampler(N, vsmc::Multinomial, 0.5); + sampler.init(PFInit()).move(PFMove(), false).monitor("pos", 2, PFMEval()); + sampler.initialize(const_cast("pf.data")).iterate(n - 1); + + std::ofstream output("pf.out"); + output << sampler; + output.close(); + + return 0; +} diff --git a/user_guide/cpp/program_option.cpp b/user_guide/cpp/program_option.cpp new file mode 100644 index 000000000..3778daf36 --- /dev/null +++ b/user_guide/cpp/program_option.cpp @@ -0,0 +1,24 @@ +#include + +int main(int argc, char **argv) +{ + int n; + std::string str; + std::vector vec; + + vsmc::ProgramOptionMap option_map; + option_map + .add("str", "A string option with a default value", &str, "default") + .add("n", "An integer option", &n) + .add("vec", "A vector option", &vec); + option_map.process(argc, argv); + + std::cout << "n: " << n << std::endl; + std::cout << "str: " << str << std::endl; + std::cout << "vec: "; + for (auto v : vec) + std::cout << v << ' '; + std::cout << std::endl; + + return 0; +} diff --git a/user_guide/cpp/program_option.sh b/user_guide/cpp/program_option.sh new file mode 100755 index 000000000..6c0f1e2a7 --- /dev/null +++ b/user_guide/cpp/program_option.sh @@ -0,0 +1,3 @@ +#!/bin/bash + +./program_option --vec 1 2 1e-1 --str "abc" --vec 8 9 --str "def hij" --n 2 4 diff --git 
a/user_guide/cpp/progress.cpp b/user_guide/cpp/progress.cpp new file mode 100644 index 000000000..0c89f7a00 --- /dev/null +++ b/user_guide/cpp/progress.cpp @@ -0,0 +1,24 @@ +#include + +int main() +{ + vsmc::RNG rng; + vsmc::FisherFDistribution dist(10, 20); + std::size_t n = 1000; + double r = 0; + vsmc::Progress progress; + progress.start(n * n); + for (std::size_t i = 0; i != n; ++i) { + std::stringstream ss; + ss << "i = " << i; + progress.message(ss.str()); + for (std::size_t j = 0; j != n; ++j) { + for (std::size_t k = 0; k != n; ++k) + r += dist(rng); + progress.increment(); + } + } + progress.stop(); + + return 0; +} diff --git a/user_guide/tex/advanced.tex new file mode 100644 index 000000000..420e1f9e8 --- /dev/null +++ b/user_guide/tex/advanced.tex @@ -0,0 +1,326 @@ +\chapter{Advanced usage} +\label{chap:Advanced usage} + +\section{Cloning objects} +\label{sec:Cloning objects} + +The \cppinline{Sampler} and \cppinline{Particle} objects have copy +constructors, assignment operators, move constructors, and move assignment +operators that behave exactly the way \cpp programmers would expect. +However, these behaviors are not always desired. For example, in +\textcite{stpf} a stable particle filter in high dimensions was developed. +Without going into the details, the algorithm consists of a particle system +where each particle is itself a particle filter. Thus, when resampling the +global system, the \cppinline{Sampler} object will be copied, together with +all of its sub-objects. This includes the \rng system within the +\cppinline{Particle} object. Even if the user does not use this \rng system +for random number generation within user-defined operations, one of these \rng +engines will be used for resampling by the \cppinline{Particle} object. Directly +copying the \cppinline{Sampler} object will lead to multiple local filters +generating exactly the same random numbers in the next iteration.
This +is an undesired side effect. In this situation, one can clone the sampler with +the following method, +\begin{cppcode} + auto new_sampler = sampler.clone(new_rng); +\end{cppcode} +where \cppinline{new_rng} is a boolean value. If it is \cppinline{true}, then +an exact copy of \cppinline{sampler} will be returned, except it will have the +\rng system re-seeded. If it is \cppinline{false}, then the above assignment +behaves exactly the same as +\begin{cppcode} + auto new_sampler = sampler; +\end{cppcode} +Alternatively, the contents of an existing \cppinline{Sampler} object can be +replaced with those of another one by the following method, +\begin{cppcode} + sampler.clone(other_sampler, retain_rng); +\end{cppcode} +where \cppinline{retain_rng} is a boolean value. If it is \cppinline{true}, +then the \rng system of \cppinline{other_sampler} is not copied and its own +\rng system is retained. If it is \cppinline{false}, then the above call +behaves exactly the same as +\begin{cppcode} + sampler = other_sampler; +\end{cppcode} +The above method also supports move semantics. Similar \cppinline{clone} +methods exist for the \cppinline{Particle} class. + +\section{Customizing member types} +\label{sec:Customizing member types} + +The \cppinline{Particle} class has a few member types that can be replaced +by the user. If the class \cppinline{T} has the corresponding types, then the +member types of \cppinline{Particle} will be replaced. For example, given the +following declarations inside class \cppinline{T}, +\begin{cppcode} + class T + { + public: + using size_type = int; + using weight_type = /* User defined type */; + using rng_set_type = RNGSetTBB; + }; +\end{cppcode} +The corresponding \cppinline{Particle::size_type}, etc., will have their +defaults replaced with the above types.
+
+A note on \cppinline{weight_type}: it needs to provide the following methods,
+\begin{cppcode*}{texcomments}
+    w.ess();           // Get $\text{\normalfont\textsc{ess}}$
+    w.set_equal();     // Set $W^{(i)} = 1/N$
+    w.resample_size(); // Get the size $N$
+    w.resample_data(); // Get a pointer to normalized weights
+\end{cppcode*}
+For the library's default class \cppinline{Weight}, the last two calls are the
+same as \cppinline{w.size()} and \cppinline{w.data()}. However, this does not
+need to be so. For example, below is the outline of an implementation of
+\cppinline{weight_type} for distributed systems, assuming each computing node
+has been allocated $N_r$ particles.
+\begin{cppcode*}{texcomments}
+    class WeightMPI
+    {
+        public:
+        double ess()
+        {
+            double local = /* $\sum_{i=1}^{N_r}(W^{(i)})^2$ */;
+            double global = /* Gather local from all nodes */;
+            // Broadcast the value of global
+
+            return 1 / global;
+        }
+
+        std::size_t resample_size() { return /* $\sum N_r$ */; }
+
+        double *resample_data()
+        {
+            if (rank == 0) {
+                // Gather all normalized weights into a member data on this node
+                // Say resample\_weight\_
+                return resample_weight_.data();
+            } else {
+                return nullptr;
+            }
+        }
+
+        void set_equal()
+        {
+            // Set all weights to $1 / \sum N_r$
+            // Synchronization
+        }
+
+        void set(const double *v)
+        {
+            // Set $W^{(i)} = v_i$ for $i = 1,\dots,N_r$
+            // Compute $S_r = \sum_{i=1}^{N_r} W^{(i)}$
+            // Gather $S_r$ and compute $S = \sum S_r$
+            // Broadcast $S$
+            // Set $W^{(i)} = W^{(i)} / S$ for $i = 1,\dots,N_r$
+        }
+    };
+\end{cppcode*}
+When \cppinline{Particle<T>} performs resampling, it checks whether the
+pointer returned by \cppinline{w.resample_data()} is a null pointer. It will
+only generate the vector $\{a_i\}_{i=1}^N$ (see section~\ref{sub:State}) when
+it is not a null pointer. And then a pointer to this vector is passed to
+\cppinline{T::copy}.
+Of course, the class \cppinline{T} also needs to provide a suitable method
+\cppinline{copy} that can handle the distributed system. By defining suitable
+\cppinline{WeightMPI} and \cppinline{T::copy}, the library can be extended to
+handle distributed systems.
+
+\section{Extending \protect\spt}
+\label{sec:Extending SP}
+
+The \cppinline{SingleParticle<T>} class can also be extended by the user. We
+have already seen in section~\ref{sub:State} that if class \cppinline{T} is a
+subclass of \cppinline{StateMatrix}, \cppinline{SingleParticle<T>} can have
+additional methods to access the state. This class can be extended by defining
+a member class template inside class \cppinline{T}. For example, for the
+simple particle filter in section~\ref{sec:A simple particle filter}, we can
+redefine \cppinline{PFState} as the following,
+\begin{cppcode}
+    using PFStateBase = StateMatrix<RowMajor, 4, double>;
+
+    template <typename S>
+    using PFStateSPBase = PFStateBase::single_particle_type<S>;
+
+    class PFState : public PFStateBase
+    {
+        public:
+        using PFStateBase::StateMatrix;
+
+        template <typename S>
+        class single_particle_type : public PFStateSPBase<S>
+        {
+            public:
+            using PFStateSPBase<S>::single_particle_type;
+
+            double &pos_x() { return this->state(0); }
+            double &pos_y() { return this->state(1); }
+            double &vel_x() { return this->state(2); }
+            double &vel_y() { return this->state(3); }
+
+            // Return $\ell(X_t^{(i)}|Y_t)$
+            double log_likelihood(std::size_t t);
+        };
+
+        void read_data(const char *param);
+
+        private:
+        Vector<double> obs_x_;
+        Vector<double> obs_y_;
+    };
+\end{cppcode}
+Later, we can use these methods when implementing \cppinline{PFInit}, etc.,
+\begin{cppcode}
+    class PFInit : public InitializeTBB<PFState, PFInit>
+    {
+        public:
+        void eval_param(Particle<PFState> &particle, void *param);
+
+        void eval_pre(Particle<PFState> &particle);
+
+        std::size_t eval_sp(SingleParticle<PFState> sp)
+        {
+            NormalDistribution<double> norm_pos(0, 2);
+            NormalDistribution<double> norm_vel(0, 1);
+            sp.pos_x() = norm_pos(sp.rng());
+            sp.pos_y() = norm_pos(sp.rng());
+            sp.vel_x() = norm_vel(sp.rng());
+            sp.vel_y() = norm_vel(sp.rng());
+            w_[sp.id()] = sp.log_likelihood(0);
+
+            return 0;
+        }
+
+        void eval_post(Particle<PFState> &particle);
+
+        private:
+        Vector<double> w_;
+    };
+\end{cppcode}
+It shall be noted that it is important to keep
+\cppinline{single_particle_type} small and copying the object efficient. The
+library will frequently pass arguments of \cppinline{SingleParticle<T>} type
+by value.
+
+\subsection{Compared to custom state type}
+\label{sub:Compared to custom state type}
+
+One can also write a custom state type. For example,
+\begin{cppcode}
+    class PFStateSP
+    {
+        public:
+        double &pos_x() { return pos_x_; }
+        double &pos_y() { return pos_y_; }
+        double &vel_x() { return vel_x_; }
+        double &vel_y() { return vel_y_; }
+
+        double log_likelihood(double obs_x, double obs_y) const;
+
+        private:
+        double pos_x_;
+        double pos_y_;
+        double vel_x_;
+        double vel_y_;
+    };
+\end{cppcode}
+The \cppinline{PFState} class will then be defined as,
+\begin{cppcode}
+    using PFStateBase = StateMatrix<RowMajor, 1, PFStateSP>;
+
+    class PFState : public PFStateBase
+    {
+        public:
+        using PFStateBase::StateMatrix;
+
+        double log_likelihood(std::size_t t, std::size_t i) const
+        {
+            return this->state(i, 0).log_likelihood(obs_x_[t], obs_y_[t]);
+        }
+
+        void read_data(const char *param);
+
+        private:
+        Vector<double> obs_x_;
+        Vector<double> obs_y_;
+    };
+\end{cppcode}
+The implementation of \cppinline{PFInit}, etc., will be similar. Compared to
+extending the \cppinline{SingleParticle<T>} type, this method is perhaps more
+intuitive. Functionality-wise, they are almost identical. However, there are
+a few advantages to extending \cppinline{SingleParticle<T>}. First, it allows
+more compact data storage. Consider a situation where the state space is best
+represented by a real and an integer.
+The most intuitive way might be the following,
+\begin{cppcode}
+    class S
+    {
+        public:
+        double &x() { return x_; }
+        int &u() { return u_; }
+
+        private:
+        double x_;
+        int u_;
+    };
+
+    class T : StateMatrix<RowMajor, 1, S>;
+\end{cppcode}
+However, the type \cppinline{S} will need to satisfy the alignment
+requirement of \cppinline{double}, which is 8 bytes on most platforms, while
+its size might not be a multiple of 8 bytes. Therefore the type will be
+padded and the storage of a vector of such a type will not be as compact as
+possible. This can affect performance in some situations. An alternative
+approach would be the following,
+\begin{cppcode}
+    class T
+    {
+        public:
+        template <typename S>
+        class single_particle_type : public SingleParticleBase<S>
+        {
+            public:
+            using SingleParticleBase<S>::SingleParticleBase;
+
+            double &x() { return this->particle().x_[this->id()]; }
+            int &u() { return this->particle().u_[this->id()]; }
+        };
+
+        private:
+        Vector<double> x_;
+        Vector<int> u_;
+    };
+\end{cppcode}
+By extending \cppinline{SingleParticle<T>}, it provides the same easy access
+to each particle, while the state values are now stored in two compact
+vectors.
+
+A second advantage is that it allows easier access to the raw data. Consider
+the implementation of \cppinline{PFMEval} in
+section~\ref{sub:Implementations}. It is rather redundant to copy each value
+of the two positions, just so that later we can compute weighted sums from
+them. Recall that in section~\ref{sub:Monitor} we showed that a monitor that
+computes the final results directly can also be added to a sampler.
+Therefore, we might implement \cppinline{PFMEval} as the following,
+\begin{cppcode*}{texcomments}
+    class PFMEval
+    {
+        public:
+        void operator()(std::size_t t, std::size_t dim,
+            Particle<PFState> &particle, double *r)
+        {
+            cblas_dgemv(CblasRowMajor, CblasTrans, particle.size(), dim, 1,
+                particle.value().data(), particle.value().dim(),
+                particle.weight().data(), 1, 0, r, 1);
+        }
+    };
+\end{cppcode*}
+It can be added to a sampler as,
+\begin{cppcode}
+    sampler.monitor("pos", 2, PFMEval(), true);
+\end{cppcode}
+For this particular case, the performance benefit is small. But the
+possibility of accessing a compact vector as raw data allows easier
+interfacing with external numerical libraries. If we implemented
+\cppinline{PFState} with the alternative approach shown earlier, the above
+direct invocation of \cppinline{cblas_dgemv} would not be possible.
diff --git a/user_guide/tex/app.tex b/user_guide/tex/app.tex new file mode 100644 index 000000000..9c6ce9480 --- /dev/null +++ b/user_guide/tex/app.tex @@ -0,0 +1,20 @@
+\appendix
+
+\chapter{Source code of complete programs}
+\label{chap:Source code of complete programs}
+
+\section{Sequential implementation of a simple particle filter}
+\label{sec:Sequential implementation of a simple particle filter}
+\cppfile{cpp/pf_seq.cpp}
+
+\section{Parallelized implementation of a simple particle filter}
+\label{sec:Parallelized implementation of a simple particle filter}
+\cppfile{cpp/pf_tbb.cpp}
+
+\section{Processing command line program options}
+\label{sec:Processing command line program options}
+\cppfile{cpp/program_option.cpp}
+
+\section{Display program progress}
+\label{sec:Display program progress}
+\cppfile{cpp/progress.cpp}
diff --git a/user_guide/tex/basic.tex b/user_guide/tex/basic.tex new file mode 100644 index 000000000..5d2421150 --- /dev/null +++ b/user_guide/tex/basic.tex @@ -0,0 +1,813 @@
+\chapter{Basic usage}
+\label{chap:Basic usage}
+
+\section{Conventions}
+\label{sec:Conventions}
+
+All classes that are
+accessible to users are within the name space \cppinline{vsmc}. Class names
+are in \cppinline{CamelCase} and function names and class members are in
+\cppinline{small_cases}. In the remainder of this guide, we will omit the
+\cppinline{vsmc::} name space qualifiers. We will use ``function'' for
+referring to name space scope functions and ``method'' for class member
+functions.
+
+\section{Getting and installing the library}
+\label{sec:Getting and installing the library}
+
+The library is hosted at
+GitHub\footnote{\url{https://github.com/zhouyan/vSMC}}. This is a header-only
+\cpp template library. To install the library, just move the contents of the
+\texttt{include} directory into a proper place, e.g.,
+\texttt{/usr/local/include} on Unix-alike systems. The library requires
+working \cppoo, \blas and \lapack implementations. Standard C interface
+headers for the latter two (\cppinline{cblas.h} and \cppinline{lapacke.h})
+are required. Intel Threading Building
+Blocks\footnote{\url{https://www.threadingbuildingblocks.org}} (\tbb), Intel
+Math Kernel Library\footnote{\url{https://software.intel.com/en-us/intel-mkl}}
+(\mkl) and \hdf\footnote{\url{http://www.hdfgroup.org}} are optional
+third-party libraries. One needs to define the configuration macros
+\cppinline{VSMC_HAS_TBB}, \cppinline{VSMC_HAS_MKL} and
+\cppinline{VSMC_HAS_HDF5} to nonzero values before including any \vsmc
+headers to make their existence known to the library.
+
+\section{Concepts}
+\label{sec:Concepts}
+
+The library is structured around a few core concepts. A sampler is
+responsible for running an algorithm. It contains a particle system and
+operations on it. A particle system is formed by the states
+$\{X^{(i)}\}_{i=1}^N$ and weights $\{W^{(i)}\}_{i=1}^N$. This system will
+also be responsible for resampling. All user defined operations are to be
+applied to the whole system.
+These are ``initialization'' and ``moves'', which are applied before
+resampling, and ``\mcmc'' moves, which are applied after resampling. These
+operations do not have to be \mcmc kernels. They can be used for any purpose
+that suits the particular algorithm. Most statistical inferences require the
+calculation of $\sum_{i=1}^NW^{(i)}\varphi(X^{(i)})$ for some function
+$\varphi$. This can be carried out along each sampler iteration by a monitor.
+Table~\ref{tab:concepts} lists these concepts and the corresponding types in
+the library. Each of them is introduced in detail in the following sections.
+
+\begin{table}[t]
+  \begin{tabu}{X[l]X[2l]}
+    \toprule
+    Concept & Type \\
+    \midrule
+    State, $\{X^{(i)}\}_{i=1}^N$ & \texttt{T}, user defined \\
+    Weight, $\{W^{(i)}\}_{i=1}^N$ & \texttt{Weight} \\
+    Particle, $\{W^{(i)},X^{(i)}\}_{i=1}^N$ & \texttt{Particle<T>} \\
+    Single particle, $\{W^{(i)},X^{(i)}\}$ & \texttt{SingleParticle<T>} \\
+    Sampler & \texttt{Sampler<T>} \\
+    Initialization & \texttt{Sampler<T>::init\_type}, user defined \\
+    Move & \texttt{Sampler<T>::move\_type}, user defined \\
+    \mcmc & \texttt{Sampler<T>::mcmc\_type}, user defined \\
+    Monitor & \texttt{Monitor<T>} \\
+    \bottomrule
+  \end{tabu}
+  \caption{Core concepts of the library}
+  \label{tab:concepts}
+\end{table}
+
+\subsection{State}
+\label{sub:State}
+
+The library gives users maximum flexibility in how the states
+$\{X^{(i)}\}_{i=1}^N$ are stored and structured. Any class type with a
+constructor that takes a single integer value, the number of particles, as
+its argument, and a method named \cppinline{copy}, is acceptable. For
+example,
+\begin{cppcode*}{texcomments}
+    class T
+    {
+        public:
+        T(std::size_t N);
+
+        template <typename IntType>
+        void copy(std::size_t N, IntType *index)
+        {
+            for (std::size_t i = 0; i != N; ++i) {
+                // Let $a_i =$ index[i], set $X^{(i)} = X^{(a_i)}$
+            }
+        }
+    };
+\end{cppcode*}
+How the state values are actually stored and accessed is entirely up to the
+user.
+The method \cppinline{copy} is necessary since the library assumes no
+knowledge of the internal structure of the state. Thus it cannot perform the
+last step of a resampling algorithm, which makes copies of particles with
+larger weights and eliminates those with smaller weights.
+
+For most applications, the values can be stored within an $N$ by $d$ matrix,
+where $d$ is the dimension of the state. The library provides a convenient
+class template for this situation,
+\begin{cppcode}
+    template <MatrixLayout Layout, std::size_t Dim, typename T>
+    class StateMatrix;
+\end{cppcode}
+where \cppinline{Layout} is either \cppinline{RowMajor} or
+\cppinline{ColMajor}, which specifies the matrix storage layout;
+\cppinline{Dim} is a non-negative integer value. If \cppinline{Dim} is zero,
+then the dimension may be changed at runtime. If it is positive, then the
+dimension is fixed and cannot be changed at runtime. The last template
+parameter \cppinline{T} is the \cpp type of the state space. The following
+constructs an object of this class,
+\begin{cppcode}
+    StateMatrix<ColMajor, Dynamic, double> s(N);
+\end{cppcode}
+where \cppinline{Dynamic} is just an enumerator with value zero. We can
+specify the dimension at runtime through the method
+\cppinline{s.resize_dim(d)}. Note that, if the template parameter
+\cppinline{Dim} is positive, then this call results in a compile-time error.
+
+To access $X_{ij}$, the value of the state of the $i$\ith particle at the
+$j$\ith coordinate, one can use the method \cppinline{s.state(i,j)}. The
+method \cppinline{s.data()} returns a pointer to the beginning of the matrix.
+If \cppinline{Layout} is \cppinline{RowMajor}, then the method
+\cppinline{s.row_data(i)} returns a pointer to the beginning of the $i$\ith
+row. If \cppinline{Layout} is \cppinline{ColMajor}, then the method
+\cppinline{s.col_data(j)} returns a pointer to the beginning of the $j$\ith
+column. These methods help interfacing with numerical libraries, such as
+\blas.
+
+The \cppinline{StateMatrix} class deliberately does not provide a
+\cppinline{resize} method. There are algorithms that change the sample size
+between iterations. However, such algorithms often change the size through
+resampling or other methods, either deterministically or stochastically. An
+example of changing the size of a sampler is provided in
+section~\ref{sec:Resizing a sampler}.
+
+\subsection{Weight}
+\label{sub:Weight}
+
+The vector of weights $\{W^{(i)}\}_{i=1}^N$ is abstracted in the library by
+the \cppinline{Weight} class. The following constructs an object of this
+class,
+\begin{cppcode}
+    Weight w(N);
+\end{cppcode}
+There are a few methods for accessing the weights,
+\begin{cppcode*}{texcomments}
+    w.ess();       // Get {\normalfont\textsc{ess}}
+    w.set_equal(); // Set $W^{(i)} = 1/N$
+\end{cppcode*}
+The weights can be manipulated, given a vector of length $N$, say $v$,
+\begin{cppcode*}{texcomments}
+    w.set(v);     // Set $W^{(i)} \propto v^{(i)}$
+    w.mul(v);     // Set $W^{(i)} \propto W^{(i)} v^{(i)}$
+    w.set_log(v); // Set $\log W^{(i)} = v^{(i)} + \text{const.}$
+    w.add_log(v); // Set $\log W^{(i)} = \log W^{(i)} + v^{(i)} + \text{const.}$
+\end{cppcode*}
+The method \cppinline{w.data()} returns a pointer to the normalized weights.
+It is important to note that the weights are always normalized and all
+mutable methods only allow access to $\{W^{(i)}\}_{i=1}^N$ as a whole.
+
+\subsection{Particle}
+\label{sub:Particle}
+
+A particle system is composed of both the state values, which are of a user
+defined type, say \cppinline{T}, and the weights. The following constructs an
+object of class \cppinline{Particle<T>},
+\begin{cppcode}
+    Particle<T> particle(N);
+\end{cppcode}
+The method \cppinline{particle.value()} returns the type \cppinline{T}
+object, and \cppinline{particle.weight()} returns the type \cppinline{Weight}
+object\footnote{More precisely, it is a \cppinline{Particle<T>::weight_type}
+  object, whose exact type depends on the type \cppinline{T}.
+  See section~\ref{sec:Customizing member types} for more details. If the
+  user does not do something special as shown in that section, then the
+  default type is the class \cppinline{Weight}.}. They are constructed with
+the same integer value $N$ when the above constructor is invoked.
+
+As in any Monte Carlo algorithm, random number generators (\rng{}s) will be
+used frequently. The user is free to use whatever \rng mechanism they see
+fit. However, one common issue encountered in practice is how to maintain
+independence of the \rng streams between function calls. For example,
+consider below a function that manipulates some state values,
+\begin{cppcode}
+    void function(double &x)
+    {
+        std::mt19937 rng;
+        std::normal_distribution<double> rnorm(0, 1);
+        x = rnorm(rng);
+    }
+\end{cppcode}
+Every call of this function will give \cppinline{x} exactly the same value.
+This is hardly what the user intended. One might consider a global \rng or
+one as class member data. For example,
+\begin{cppcode}
+    std::mt19937 rng;
+    void function(double &x)
+    {
+        std::normal_distribution<double> rnorm(0, 1);
+        x = rnorm(rng);
+    }
+\end{cppcode}
+This will work fine as long as the function is never called by two threads at
+the same time. However, \smc algorithms are natural candidates for
+parallelization. Therefore, the user will need to either lock the \rng, which
+degrades the performance, or construct different \rng{}s for different
+threads. The latter, though it ensures thread-safety, has other issues. For
+example, consider
+\begin{cppcode*}{texcomments}
+    std::mt19937 rng1(s1); // For thread $i_1$ with seed $s_1$
+    std::mt19937 rng2(s2); // For thread $i_2$ with seed $s_2$
+\end{cppcode*}
+where the seeds $s_1 \ne s_2$. It is difficult to ensure that the two streams
+generated by the two \rng{}s are independent. Common practice for parallel
+\rng is to use sub-streams or leap-frog algorithms.
+Without going into any further details, it is sufficient to say that this is
+perhaps not a problem that most users bother to solve.
+
+The library provides a simple solution to this issue. The method
+\cppinline{particle.rng(i)} returns a reference to an \rng that conforms to
+the \cppoo uniform \rng concept. It can be called from different threads at
+the same time, for example,
+\begin{cppcode*}{texcomments}
+    auto &rng1 = particle.rng(i1); // Called from thread $i_1$
+    auto &rng2 = particle.rng(i2); // Called from thread $i_2$
+\end{cppcode*}
+If $i_1 \ne i_2$, then the subsequent use of the two \rng{}s is guaranteed to
+be thread-safe. In addition, they will produce independent streams. If \tbb
+is available to the library, then it is also thread-safe even if $i_1 = i_2$.
+One can write functions that process each particle, for example,
+\begin{cppcode*}{texcomments}
+    void function(std::size_t i)
+    {
+        auto &rng = particle.rng(i);
+        // Process the particle i using rng
+    }
+\end{cppcode*}
+If later this function is called from a parallelized environment, it is still
+thread-safe and produces the desired statistical results. The details of the
+\rng system are documented later in chapter~\ref{chap:Random number
+  generating}.
+
+\subsection{Single particle}
+\label{sub:Single particle}
+
+It is often easier to define a function $f(X^{(i)})$ than
+$f(X^{(1)},\dots,X^{(N)})$. However, \cppinline{Particle<T>} only provides
+access to $\{X^{(i)}\}_{i=1}^N$ as a whole through
+\cppinline{particle.value()}. To allow direct access to $X^{(i)}$, the
+library uses a class template \cppinline{SingleParticle<T>}.
+An object of this class is constructed from the index $i$ of the particle,
+and a pointer to the particle system it belongs to,
+\begin{cppcode}
+    SingleParticle<T> sp(i, &particle);
+\end{cppcode}
+or more conveniently,
+\begin{cppcode}
+    auto sp = particle.sp(i);
+\end{cppcode}
+In its most basic form, it has the following methods,
+\begin{cppcode}
+    sp.id();       // Get the value i that sp was constructed with
+    sp.particle(); // Get a reference to the Particle<T> object sp belongs to
+    sp.rng();      // => sp.particle().rng(sp.id());
+\end{cppcode}
+If \cppinline{T} is a subclass of \cppinline{StateMatrix}, then it has two
+additional methods,
+\begin{cppcode}
+    sp.dim();      // => sp.particle().value().dim();
+    sp.state(j);   // => sp.particle().value().state(sp.id(), j);
+\end{cppcode}
+It is clear now that the interface of \cppinline{SingleParticle<T>} depends
+on the type \cppinline{T}. Later in section~\ref{sec:Extending SP} we will
+show how to insert additional methods into this class.
+
+A \cppinline{SingleParticle<T>} object is similar to an iterator. In fact, it
+supports almost all of the operations of a random access iterator, with two
+exceptions. First, dereferencing a \cppinline{SingleParticle<T>} object
+returns itself. The support of \cppinline{operator*} allows the range-based
+for loop to be applied to a \cppinline{Particle<T>} object, for example,
+\begin{cppcode}
+    for (auto sp : particle) {
+        // sp is of type SingleParticle<T>
+    }
+\end{cppcode}
+The above loop does make some sense. However, trying to dereference a
+\cppinline{SingleParticle<T>} object in other contexts does not make much
+sense. Recall that it is an \emph{index}, not a \emph{pointer}. The library
+does not require the user defined type \cppinline{T} to provide access to
+individual values, and thus it cannot dereference a
+\cppinline{SingleParticle<T>} object to obtain such a value. Similarly, the
+expression \cppinline{sp[n]} returns \cppinline{sp + n}, another
+\cppinline{SingleParticle<T>} object.
+For the same reason, \cppinline{operator->} is not supported at all.
+
+\subsection{Sampler}
+\label{sub:Sampler}
+
+A sampler can be constructed in a few ways,
+\begin{cppcode}
+    Sampler<T> sampler(N);
+\end{cppcode}
+constructs a sampler that is never resampled, while
+\begin{cppcode}
+    Sampler<T> sampler(N, Multinomial);
+\end{cppcode}
+constructs a sampler that is resampled at every iteration, using the
+multinomial algorithm. Other resampling schemes are also implemented, see
+chapter~\ref{chap:Resampling}. Last, one can also construct a sampler that is
+only resampled when $\ess < \alpha N$, where $\alpha\in[0, 1]$, by the
+following,
+\begin{cppcode}
+    Sampler<T> sampler(N, Multinomial, alpha);
+\end{cppcode}
+If $\alpha > 1$, then it has the same effect as the first constructor, since
+$\ess \le N$. If $\alpha < 0$, then it has the same effect as the second
+constructor, since $\ess > 0$.
+
+In summary, if one does not tell the constructor which resampling scheme to
+use, then it is assumed one does not want to do resampling. If one specifies
+the resampling scheme without a threshold for \ess, then it is assumed that
+resampling needs to be done at every step.
+
+The method \cppinline{sampler.particle()} returns a reference to the particle
+system. A sampler can be initialized by a user defined object that is
+convertible to the following type,
+\begin{cppcode}
+    using init_type = std::function<std::size_t(Particle<T> &, void *)>;
+\end{cppcode}
+For example,
+\begin{cppcode}
+    auto init = [](Particle<T> &particle, void *param) {
+        // Process initialization parameter
+        // Initialize the particle system
+    };
+\end{cppcode}
+is a \cppoo lambda expression that can be used for this purpose. One can add
+it to a sampler by calling \cppinline{sampler.init(init)}. Upon calling
+\cppinline{sampler.initialize(param)}, the user defined function
+\cppinline{init} will be called and the argument \cppinline{param} will be
+passed to it.
+
+Similarly, after initialization, at each iteration the particle system can be
+manipulated by user defined callable objects that are convertible to the
+following types,
+\begin{cppcode}
+    using move_type = std::function<std::size_t(std::size_t, Particle<T> &)>;
+    using mcmc_type = std::function<std::size_t(std::size_t, Particle<T> &)>;
+\end{cppcode}
+Multiple moves can be added to a sampler. The call
+\cppinline{sampler.move(move, append)} adds a \cppinline{move_type} object to
+the sampler, where \cppinline{append} is a boolean value. If it is
+\cppinline{false}, it will clear any moves that were added before. If it is
+\cppinline{true}, then \cppinline{move} is appended to the end of an existing
+sequence of moves. The moves will be called one by one upon calling
+\cppinline{sampler.iterate()}. A similar sequence of \mcmc moves can also be
+added to a sampler. The call \cppinline{sampler.iterate()} will call the user
+defined moves first, then perform the possible resampling, and then the
+sequence of \mcmc moves.
+
+Note that the possible resampling will also be performed after the user
+defined initialization function is called by
+\cppinline{sampler.initialize(param)}. After that, the sequence of \mcmc
+moves will be called. If it is desired not to perform mutations during
+initialization, then the following can be used,
+\begin{cppcode}
+    sampler.init(init).initialize(param);
+    sampler.move(move, false).mcmc(mcmc, false).iterate(n);
+\end{cppcode}
+The above code also demonstrates that most methods of \cppinline{Sampler<T>}
+return a reference to the sampler itself and thus method calls can be
+chained. In addition, the method \cppinline{sampler.iterate(n)} accepts an
+optional argument that specifies the number of iterations.
+It is a shortcut for
+\begin{cppcode}
+    for (std::size_t i = 0; i != n; ++i)
+        sampler.iterate();
+\end{cppcode}
+
+\subsection{Monitor}
+\label{sub:Monitor}
+
+Inferences using a \smc algorithm usually require the calculation of the
+quantity $\sum_{i=1}^NW^{(i)}\varphi(X^{(i)})$ at each iteration for some
+function $\varphi$. One can define a callable object that is convertible to
+the following type,
+\begin{cppcode}
+    using eval_type =
+        std::function<void(std::size_t, std::size_t, Particle<T> &, double *)>;
+\end{cppcode}
+For example,
+\begin{cppcode*}{texcomments}
+    void eval(std::size_t iter, std::size_t d, Particle<T> &particle,
+        double *r)
+    {
+        for (std::size_t i = 0; i != particle.size(); ++i, r += d) {
+            auto sp = particle.sp(i);
+            r[0] = /* $\varphi_1(X^{(i)})$ */;
+            // ...
+            r[d - 1] = /* $\varphi_d(X^{(i)})$ */;
+        }
+    }
+\end{cppcode*}
+The argument \cppinline{d} is the dimension of the vector function $\varphi$.
+The output is an $N$ by $d$ matrix in row major layout, with each row
+corresponding to the value of $\varphi(X^{(i)})$. One can add this function
+to a sampler by calling,
+\begin{cppcode}
+    sampler.monitor("name", d, eval);
+\end{cppcode}
+where the first argument is the name for the monitor, the second its
+dimension, and the third the evaluation function. At each iteration, after
+all the initialization, possible resampling, moves and \mcmc moves are done,
+the sampler will calculate $\sum_{i=1}^NW^{(i)}\varphi(X^{(i)})$. This method
+has two optional arguments. The first is a boolean value
+\cppinline{record_only}. If it is \cppinline{true}, it is assumed that no
+summation is needed. For example,
+\begin{cppcode*}{texcomments}
+    void eval(std::size_t iter, std::size_t d, Particle<T> &particle,
+        double *r)
+    {
+        r[0] = /* $\varphi_1(\{X^{(i)}\}_{i=1}^N)$ */;
+        // ...
+        r[d - 1] = /* $\varphi_d(\{X^{(i)}\}_{i=1}^N)$ */;
+    }
+\end{cppcode*}
+In this case, the monitor acts merely as a storage facility.
+The second optional argument is \cppinline{stage}, which specifies at which
+point the monitoring shall happen. It can be \cppinline{MonitorMove}, which
+specifies that the monitoring happens right after the moves and before
+resampling. It can also be \cppinline{MonitorResample}, which specifies that
+the monitoring happens right after the resampling and before the \mcmc
+moves. Last, the default is \cppinline{MonitorMCMC}, which specifies that the
+monitoring happens after everything.
+
+The output of a sampler, together with the records of any monitors it has,
+can be output in plain text form through a \cpp output stream. For example,
+\begin{cppcode}
+    std::cout << sampler;
+\end{cppcode}
+We will see how this works later with a concrete particle filter example. If
+the \hdf library is available, it is also possible to write such output in
+\hdf format, for example,
+\begin{cppcode}
+    hdf5store(sampler, file_name, data_name);
+\end{cppcode}
+Details can be found in section~\ref{sec:Storing objects in HDF5}.
+
+\section{A simple particle filter}
+\label{sec:A simple particle filter}
+
+\subsection{Model and algorithm}
+\label{sub:Model and algorithm}
+
+This is an example used in \textcite{Johansen:2009wd}. Through this example,
+we will show how to re-implement a simple particle filter in \vsmc. It shall
+walk one through the basic features of the library introduced above.
+
+The state space model, known as the almost constant velocity model in the
+tracking literature, provides a simple scenario. The state vector $X_t$
+contains the position and velocity of an object moving in a plane. That is,
+$X_t = (\xpos^t, \ypos^t, \xvel^t, \yvel^t)^T$. Imperfect observations $Y_t =
+(\xobs^t, \yobs^t)^T$ of the positions are possible at each time instant.
+The state and observation equations are linear with additive noises,
+\begin{align*}
+  X_t &= AX_{t-1} + V_t \\
+  Y_t &= BX_t + \alpha W_t
+\end{align*}
+where
+\begin{equation*}
+  A = \begin{pmatrix}
+    1 & \Delta & 0 & 0 \\
+    0 & 1 & 0 & 0 \\
+    0 & 0 & 1 & 0 \\
+    0 & 0 & 0 & 1
+  \end{pmatrix} \qquad
+  B = \begin{pmatrix}
+    1 & 0 & 0 & 0 \\
+    0 & 1 & 0 & 0
+  \end{pmatrix} \qquad
+  \alpha = 0.1
+\end{equation*}
+and we assume that the elements of the noise vector $V_t$ are independent
+Gaussian with variances $0.02$ and $0.001$ for position and velocity,
+respectively. The observation noise, $W_t$, comprises independent,
+identically distributed $t$-distributed random variables with $\nu = 10$
+degrees of freedom. The prior at time $0$ corresponds to an axis-aligned
+Gaussian with variance $4$ for the position coordinates and $1$ for the
+velocity coordinates. The particle filter algorithm is shown in
+algorithm~\ref{alg:pf}.
+
+\begin{algorithm}[t]
+  \begin{algorithmic}
+    \hrule\vskip1ex
+    \STATE \emph{Initialization}
+    \STATE\STATESKIP Set $t\leftarrow0$.
+    \STATE\STATESKIP Sample
+    $\xpos^{(0,i)},\ypos^{(0,i)}\sim\calN(0,4)$ and
+    $\xvel^{(0,i)},\yvel^{(0,i)}\sim\calN(0,1)$.
+    \STATE\STATESKIP Weight $W_0^{(i)} \propto \exp{\ell(X_0^{(i)}|Y_0)}$
+    where $\ell$ is the log-likelihood function.
+
+    \STATE \emph{Iteration}
+    \STATE\STATESKIP Set $t\leftarrow t + 1$.
+    \STATE\STATESKIP Sample
+    \begin{align*}
+      \xpos^{(t,i)}&\sim\calN(\xpos^{(t-1,i)} + \Delta\xvel^{(t-1,i)}, 0.02) &
+      \xvel^{(t,i)}&\sim\calN(\xvel^{(t-1,i)}, 0.001) \\
+      \ypos^{(t,i)}&\sim\calN(\ypos^{(t-1,i)} + \Delta\yvel^{(t-1,i)}, 0.02) &
+      \yvel^{(t,i)}&\sim\calN(\yvel^{(t-1,i)}, 0.001)
+    \end{align*}
+    \STATE\STATESKIP Weight $W_t^{(i)} \propto
+    W_{t-1}^{(i)}\exp{\ell(X_t^{(i)}|Y_t)}$.
+
+    \STATE \emph{Repeat the \emph{Iteration} step until all data are
+      processed}.
+    \vskip1ex\hrule
+  \end{algorithmic}
+  \caption{Particle filter algorithm for the almost constant velocity model.}
+  \label{alg:pf}
+\end{algorithm}
+
+\subsection{Implementations}
+\label{sub:Implementations}
+
+The complete program is shown in appendix~\appref{sec:Sequential
+  implementation of a simple particle filter}. In this section we show the
+outline of the implementation.
+
+\subsubsection{The main program}
+
+\begin{cppcode}
+    Sampler<PFState> sampler(N, Multinomial, 0.5);
+    sampler.init(PFInit()).move(PFMove(), false).monitor("pos", 2, PFMEval());
+    sampler.initialize(const_cast<char *>("pf.data")).iterate(n - 1);
+
+    std::ofstream output("pf.out");
+    output << sampler;
+    output.close();
+\end{cppcode}
+A \cppinline{Sampler<PFState>} object is constructed first. Then the
+initialization \cppinline{PFInit}, move \cppinline{PFMove} and a monitor
+\cppinline{PFMEval} that records $\xpos^t$ and $\ypos^t$ are added to the
+sampler. The monitor is named \cppinline{"pos"}. Then the sampler is
+initialized with the name of the data file \cppinline{"pf.data"}, and
+iterated $n - 1$ times, where $n$ is the number of data points. Finally, the
+output is written into a text file \cppinline{"pf.out"}.
Below is a short
+R\footnote{\url{http://r-project.org}} script that can be used to process the
+output
+\begin{rcode}
+    library(ggplot2)
+
+    pf <- read.table("pf.out", header = TRUE)
+    sink("pf.rout")
+    print(pf[1:5,])
+    sink()
+
+    obs <- read.table("pf.data", header = FALSE)
+    dat <- data.frame(
+        X = c(pf[["pos.0"]], obs[,1]),
+        Y = c(pf[["pos.1"]], obs[,2]))
+    dat[["Source"]] <- rep(c("Estimate", "Observation"), each = dim(obs)[1])
+    plt <- qplot(x = X, y = Y, data = dat, geom = "path")
+    plt <- plt + aes(group = Source, color = Source, linetype = Source)
+    plt <- plt + theme_bw() + theme(legend.position = "top")
+    pdf("pf.pdf")
+    print(plt)
+    dev.off()
+\end{rcode}
+
+The \rinline{print} statement shows the first five lines of the output,
+\begin{textcode}
+      Size Resampled Accept.0      ESS    pos.0   pos.1
+    1 1000         1        0   2.9204 -1.21951 3.16397
+    2 1000         1        0 313.6830 -1.15602 3.22770
+    3 1000         1        0  33.0421 -1.26451 3.04031
+    4 1000         1        0  80.1088 -1.45922 3.37625
+    5 1000         1        0 382.8820 -1.47299 3.49230
+\end{textcode}
+The column \textinline{Size} shows the sample size at each iteration. The
+library does not provide direct support for changing the sample size. However,
+it is possible, and an example is shown in section~\ref{sec:Resizing a
+  sampler}. The column \textinline{Resampled} shows nonzero values if
+resampling was performed and zero otherwise. For each move and \mcmc step, an
+acceptance count is recorded; in this particular example it is irrelevant.
+Next, the column \textinline{ESS} shows the value of \ess. The last two
+columns show the importance sampling estimates of the positions recorded by
+the monitor named \cppinline{"pos"}. The graphical representation of the
+output is shown in figure~\ref{fig:pf}.
+
+\begin{figure}
+    \includegraphics[width=\linewidth]{cpp/pf}
+    \caption{A simple particle filter}
+    \label{fig:pf}
+\end{figure}
+
+Before diving into the details of the implementation of \cppinline{PFState},
+etc., we will first define a few constants and types. The state space is of
+dimension $4$, and it is natural to use a \cppinline{StateMatrix} as the base
+class of \cppinline{PFState},
+\begin{cppcode}
+    using PFStateBase = StateMatrix<RowMajor, 4, double>;
+\end{cppcode}
+The numbers of particles and data points are also defined as constants in this
+simple example,
+\begin{cppcode}
+    static constexpr std::size_t N = 1000; // Number of particles
+    static constexpr std::size_t n = 100;  // Number of data points
+\end{cppcode}
+Last, we define the following constants as the indices of each state component.
+\begin{cppcode}
+    static constexpr std::size_t PosX = 0;
+    static constexpr std::size_t PosY = 1;
+    static constexpr std::size_t VelX = 2;
+    static constexpr std::size_t VelY = 3;
+\end{cppcode}
+
+\subsubsection{State: \texttt{PFState}}
+
+As noted earlier, \cppinline{StateMatrix} will be used as the base class of
+\cppinline{PFState}. Since the data will be shared by all particles, we also
+store the data within this class, and methods will be provided to read the
+data from an external file and to compute the log-likelihood $\ell(X^{(i)})$,
+which accesses the data.
Below the declaration of the class \cppinline{PFState} is
+shown,
+\begin{cppcode*}{texcomments}
+    class PFState : public PFStateBase
+    {
+        public:
+        using PFStateBase::PFStateBase;
+
+        // Return $\ell(X_t^{(i)}|Y_t)$
+        double log_likelihood(std::size_t t, size_type i) const;
+
+        // Read data from an external file
+        void read_data(const char *param);
+
+        private:
+        Vector<double> obs_x_;
+        Vector<double> obs_y_;
+    };
+\end{cppcode*}
+
+\subsubsection{Initialization: \texttt{PFInit}}
+
+The initialization step is implemented as below,
+\begin{cppcode}
+    class PFInit
+    {
+        public:
+        std::size_t operator()(Particle<PFState> &particle, void *param)
+        {
+            eval_param(particle, param);
+            eval_pre(particle);
+            std::size_t acc = 0;
+            for (auto sp : particle)
+                acc += eval_sp(sp);
+            eval_post(particle);
+
+            return acc;
+        }
+
+        void eval_param(Particle<PFState> &particle, void *param)
+        {
+            particle.value().read_data(static_cast<const char *>(param));
+        }
+
+        void eval_pre(Particle<PFState> &particle)
+        {
+            w_.resize(particle.size());
+        }
+
+        std::size_t eval_sp(SingleParticle<PFState> sp)
+        {
+            NormalDistribution<double> norm_pos(0, 2);
+            NormalDistribution<double> norm_vel(0, 1);
+            sp.state(PosX) = norm_pos(sp.rng());
+            sp.state(PosY) = norm_pos(sp.rng());
+            sp.state(VelX) = norm_vel(sp.rng());
+            sp.state(VelY) = norm_vel(sp.rng());
+            w_[sp.id()] = sp.particle().value().log_likelihood(0, sp.id());
+
+            return 0;
+        }
+
+        void eval_post(Particle<PFState> &particle)
+        {
+            particle.weight().set_log(w_.data());
+        }
+
+        private:
+        Vector<double> w_;
+    };
+\end{cppcode}
+An object of this class is convertible to
+\cppinline{Sampler<PFState>::init_type}. In the main method,
+\cppinline{operator()}, \cppinline{eval_param} is called first to initialize
+the data. Then \cppinline{eval_pre} is called to allocate any resources this
+class needs before any call to \cppinline{eval_sp}. In this case, it allocates
+the vector \cppinline{w_} for storing weights computed later.
Next, the main
+loop initializes each state component with the respective Gaussian
+distribution, computes the log-likelihoods and stores them in the vector
+allocated in the last step. This is done by calling the \cppinline{eval_sp}
+method. After all particles have been initialized, we set the weights of the
+system in \cppinline{eval_post}. Later in section~\ref{sec:Symmetric
+  Multiprocessing} it will become clear why we structured the implementation
+this way.
+
+\subsubsection{Move: \texttt{PFMove}}
+
+The move step is similar to the initialization. We show the declaration here,
+\begin{cppcode}
+    class PFMove
+    {
+        public:
+        std::size_t operator()(std::size_t t, Particle<PFState> &particle);
+        void eval_pre(std::size_t t, Particle<PFState> &particle);
+        std::size_t eval_sp(std::size_t t, SingleParticle<PFState> sp);
+        void eval_post(std::size_t t, Particle<PFState> &particle);
+
+        private:
+        Vector<double> w_;
+    };
+\end{cppcode}
+
+\subsubsection{Monitor: \texttt{PFMEval}}
+
+Last we define \cppinline{PFMEval}, which simply copies the values of the
+positions.
+\begin{cppcode}
+    class PFMEval
+    {
+        public:
+        void operator()(std::size_t t, std::size_t dim,
+            Particle<PFState> &particle, double *r)
+        {
+            eval_pre(t, particle);
+            for (std::size_t i = 0; i != particle.size(); ++i, r += dim)
+                eval_sp(t, dim, particle.sp(i), r);
+            eval_post(t, particle);
+        }
+
+        void eval_pre(std::size_t t, Particle<PFState> &particle) {}
+
+        void eval_sp(std::size_t t, std::size_t dim,
+            SingleParticle<PFState> sp, double *r)
+        {
+            r[0] = sp.state(PosX);
+            r[1] = sp.state(PosY);
+        }
+
+        void eval_post(std::size_t t, Particle<PFState> &particle) {}
+    };
+\end{cppcode}
+
+\section{Symmetric Multiprocessing}
+\label{sec:Symmetric Multiprocessing}
+
+The above example is implemented in a sequential fashion. However, the loops
+inside \cppinline{PFInit}, \cppinline{PFMove} and \cppinline{PFMEval} clearly
+can be parallelized. The library provides basic support for multicore
+parallelization through its \smp module.
Two widely used backends, OpenMP and
+\tbb, are available. Here we demonstrate how to use the \tbb backend. First we
+declare the implementation classes as subclasses of the corresponding base
+classes,
+\begin{cppcode}
+    class PFInit : public InitializationTBB<PFState>;
+    class PFMove : public MoveTBB<PFState>;
+    class PFMEval : public MonitorEvalTBB<PFState>;
+\end{cppcode}
+and remove \cppinline{operator()} from their implementations. After these
+changes, the implementation will be parallelized using \tbb. The complete
+program is shown in appendix~\appref{sec:Parallelized implementation of a
+  simple particle filter}.
+
+It works as if \cppinline{InitializationTBB} had an implementation of
+\cppinline{operator()} as we wrote before, except it is parallelized. Now it
+is clear that methods such as \cppinline{eval_pre} and \cppinline{eval_post}
+are called before and after the main loop, while \cppinline{eval_sp} is called
+within the loop and needs to be thread-safe when called with different
+arguments. This is the main reason we constructed the
+\cppinline{NormalDistribution} objects within \cppinline{eval_sp} instead of
+as member data, even though they are constructed in exactly the same way for
+each particle: \cppinline{NormalDistribution::operator()} is a mutable method
+and thus not thread-safe. If any of these member functions does not do
+anything, then it does not have to be defined in the derived class.
+
+Apart from the three base classes shown here, there are also
+\cppinline{InitializationOMP}, etc., for using the OpenMP backend, and
+\cppinline{InitializationSEQ}, etc., for implementations without
+parallelization. The latter work in exactly the same way as our implementation
+in the last section. It is often easier to debug a single-threaded program
+than a parallelized one.
And thus one may develop the algorithm with the sequential
+backend and obtain optimal performance later by changing only a few base
+class names. This can usually be done automatically through a build system.
+
+\subsection{Performance consideration}
+\label{sec:Performance consideration}
+
+The base classes dispatch calls to \cppinline{eval_pre}, \cppinline{eval_sp},
+etc., through the virtual function mechanism. The performance impact is minimal
+for \cppinline{eval_pre} and \cppinline{eval_post}, since they are called only
+once in each iteration and we expect the computational cost to be dominated
+by \cppinline{eval_sp} in most cases. However, the dynamic dispatch can cause
+considerable performance degradation if the cost of a single call to
+\cppinline{eval_sp} is small while the number of particles is large. Modern
+optimizing compilers can usually devirtualize the method calls in trivial
+situations, but it is not always possible. In this situation, the library
+needs a little help from the user to enable compile-time dispatch. For each
+implementation class, we declare it in the following way,
+\begin{cppcode}
+    class PFInit : public InitializationTBB<PFState, PFInit>;
+    class PFMove : public MoveTBB<PFState, PFMove>;
+    class PFMEval : public MonitorEvalTBB<PFState, PFMEval>;
+\end{cppcode}
+The second template argument of the base class needs to be exactly the same as
+the derived class. For interested users, this is called the Curiously
+Recurring Template
+Pattern\footnote{\url{https://en.wikipedia.org/wiki/Curiously_recurring_template_pattern}}
+(\crtp). This usage of the library's base classes also provides other
+flexibility. The methods \cppinline{eval_pre}, etc., can be either
+\cppinline{const} or mutable. They can also be \cppinline{static}.
diff --git a/user_guide/tex/config.tex b/user_guide/tex/config.tex new file mode 100644 index 000000000..023d9037f --- /dev/null +++ b/user_guide/tex/config.tex @@ -0,0 +1,63 @@ +\chapter{Configuration macros} +\label{chap:Configuration macros} + +The library has a few configuration macros. All these macros can be overwritten +by the user by defining them with proper values before including any of the +library's headers. Most of the macros have the prefix \cppinline{VSMC_HAS_}, +which specify if a feature is available to the library. A few have the prefix +\cppinline{VSMC_USE_}, which specify if a feature shall be used in the case +that it is available. A handful remaining ones define some constants or types +used by the library. These macros are listed in table~\ref{tab:Configuration + macros}. + +\begin{table}[ht] + \begin{tabu}{X[l]X[l]X[2l]} + \toprule + Macro & Default & Description \\ + \midrule + \texttt{VSMC\_INT64} & Platform dependent & + The 64-bits integer type used by x86 intrinsics functions \\ + \texttt{VSMC\_HAS\_INT128} & Platform dependent & + Support for 128-bits integers \\ + \texttt{VSMC\_INT128} & Platform dependent & + The 128-bits integer type \\ + \texttt{VSMC\_HAS\_SSE2} & Platform dependent & + Support for \sse{}2 intrinsic functions \\ + \texttt{VSMC\_HAS\_AVX2} & Platform dependent & + Support for \avx{}2 intrinsic functions \\ + \texttt{VSMC\_HAS\_AES\_NI} & Platform dependent & + Support for \aesni intrinsic functions \\ + \texttt{VSMC\_HAS\_RDRAND} & Platform dependent & + Support for \rdrand intrinsic functions \\ + \texttt{VSMC\_HAS\_X86} & Platform dependent & + If we are using x86 \\ + \texttt{VSMC\_HAS\_X86\_64} & Platform dependent & + If we are using x86-64 \\ + \texttt{VSMC\_HAS\_POSIX} & Platform dependent & + If we are on a \posix platform \\ + \texttt{VSMC\_HAS\_OMP} & Platform dependent & + Support for OpenMP~3.0 or higher \\ + \texttt{VSMC\_HAS\_TBB} & \texttt{0} & Support for \tbb 4.0 or higher \\ + 
\texttt{VSMC\_HAS\_TBB\_MALLOC} & \texttt{VSMC\_HAS\_TBB} & + Support for \tbb scalable memory allocation \\ + \texttt{VSMC\_HAS\_HDF5} & \texttt{0} & Support for \hdf 1.8.6 or higher \\ + \texttt{VSMC\_HAS\_MKL} & \texttt{0} & Support for \mkl 11 or higher \\ + \texttt{VSMC\_USE\_MKL\_CBLAS} & \texttt{VSMC\_HAS\_MKL} & + Use \mkl header \texttt{mkl\_cblas.h} instead of the standard + \texttt{cblas.h} \\ + \texttt{VSMC\_USE\_MKL\_LAPACKE} & \texttt{VSMC\_HAS\_MKL} & + Use \mkl header \texttt{mkl\_lapacke.h} instead of the standard + \texttt{lapacke.h} \\ + \texttt{VSMC\_USE\_MKL\_VML} & \texttt{VSMC\_HAS\_MKL} & + Use \mkl vector mathematical functions (\vml) \\ + \texttt{VSMC\_USE\_MKL\_VSL} & \texttt{VSMC\_HAS\_MKL} & + Use \mkl statistical functions (\vsl) \\ + \texttt{VSMC\_CBLAS\_INT\_TYPE} & \texttt{int} & + The default integer type of \blas routines \\ + \texttt{VSMC\_ALIGNMENT} & \texttt{32} & + Default alignment of \texttt{AlignedAllocator} \\ + \bottomrule + \end{tabu} + \caption{Configuration macros} + \label{tab:Configuration macros} +\end{table} diff --git a/user_guide/tex/math.tex b/user_guide/tex/math.tex new file mode 100644 index 000000000..4aeb66985 --- /dev/null +++ b/user_guide/tex/math.tex @@ -0,0 +1,179 @@ +\chapter{Mathematical operations} +\label{chap:Mathemtical operations} + +\section{Constants} +\label{sec:Constants} + +The library defines some mathematical constants in the form of +\cppinline{constexpr} functions. For example, to get the value of $\pi$ with a +desired precision, one can call the following. +\begin{cppcode} + auto pi_f = const_pi(); + auto pi_d = const_pi(); + auto pi_l = const_pi(); +\end{cppcode} +The compiler will evaluate these values at compile-time and thus there is no +performance difference from hard-coding the constants in the program, while the +readability is improved. All defined constants are listed in +table~\ref{tab:Mathematical constants}. 
Note that all functions have a prefix
+\cppinline{const_}, which is omitted in the table.
+
+\begin{table}[ht]
+    \begin{tabu}{X[2l]X[l]X[2l]X[l]X[2l]X[l]}
+        \toprule
+        Function & Value &
+        Function & Value &
+        Function & Value \\
+        \midrule
+        \texttt{pi}             & $\pi$           &
+        \texttt{pi\_2}          & $2\pi$          &
+        \texttt{pi\_inv}        & $1/\pi$         \\
+        \texttt{pi\_sqr}        & $\pi^2$         &
+        \texttt{pi\_by2}        & $\pi/2$         &
+        \texttt{pi\_by3}        & $\pi/3$         \\
+        \texttt{pi\_by4}        & $\pi/4$         &
+        \texttt{pi\_by6}        & $\pi/6$         &
+        \texttt{pi\_2by3}       & $2\pi/3$        \\
+        \texttt{pi\_3by4}       & $3\pi/4$        &
+        \texttt{pi\_4by3}       & $4\pi/3$        &
+        \texttt{sqrt\_pi}       & $\sqrt{\pi}$    \\
+        \texttt{sqrt\_pi\_2}    & $\sqrt{2\pi}$   &
+        \texttt{sqrt\_pi\_inv}  & $\sqrt{1/\pi}$  &
+        \texttt{sqrt\_pi\_by2}  & $\sqrt{\pi/2}$  \\
+        \texttt{sqrt\_pi\_by3}  & $\sqrt{\pi/3}$  &
+        \texttt{sqrt\_pi\_by4}  & $\sqrt{\pi/4}$  &
+        \texttt{sqrt\_pi\_by6}  & $\sqrt{\pi/6}$  \\
+        \texttt{sqrt\_pi\_2by3} & $\sqrt{2\pi/3}$ &
+        \texttt{sqrt\_pi\_3by4} & $\sqrt{3\pi/4}$ &
+        \texttt{sqrt\_pi\_4by3} & $\sqrt{4\pi/3}$ \\
+        \texttt{ln\_pi}         & $\ln{\pi}$      &
+        \texttt{ln\_pi\_2}      & $\ln{2\pi}$     &
+        \texttt{ln\_pi\_inv}    & $\ln{1/\pi}$    \\
+        \texttt{ln\_pi\_by2}    & $\ln{\pi/2}$    &
+        \texttt{ln\_pi\_by3}    & $\ln{\pi/3}$    &
+        \texttt{ln\_pi\_by4}    & $\ln{\pi/4}$    \\
+        \texttt{ln\_pi\_by6}    & $\ln{\pi/6}$    &
+        \texttt{ln\_pi\_2by3}   & $\ln{2\pi/3}$   &
+        \texttt{ln\_pi\_3by4}   & $\ln{3\pi/4}$   \\
+        \texttt{ln\_pi\_4by3}   & $\ln{4\pi/3}$   &
+        \texttt{e}              & $\EE$           &
+        \texttt{e\_inv}         & $1/\EE$         \\
+        \texttt{sqrt\_e}        & $\sqrt{\EE}$    &
+        \texttt{sqrt\_e\_inv}   & $\sqrt{1/\EE}$  &
+        \texttt{sqrt\_2}        & $\sqrt{2}$      \\
+        \texttt{sqrt\_3}        & $\sqrt{3}$      &
+        \texttt{sqrt\_5}        & $\sqrt{5}$      &
+        \texttt{sqrt\_10}       & $\sqrt{10}$     \\
+        \texttt{sqrt\_1by2}     & $\sqrt{1/2}$    &
+        \texttt{sqrt\_1by3}     & $\sqrt{1/3}$    &
+        \texttt{sqrt\_1by5}     & $\sqrt{1/5}$    \\
+        \texttt{sqrt\_1by10}    & $\sqrt{1/10}$   &
+        \texttt{ln\_2}          & $\ln{2}$        &
+        \texttt{ln\_3}          & $\ln{3}$        \\
+        \texttt{ln\_5}          & $\ln{5}$        &
+        \texttt{ln\_10}         & $\ln{10}$       &
+        \texttt{ln\_inv\_2}     &
$1/\ln{2}$ \\ + \texttt{ln\_inv\_3} & $1/\ln{3}$ & + \texttt{ln\_inv\_5} & $1/\ln{5}$ & + \texttt{ln\_inv\_10} & $1/\ln{10}$ \\ + \texttt{ln\_ln\_2} & $\ln\ln{2}$ & + & & + & \\ + \bottomrule + \end{tabu} + \caption{Mathematical constants. Note: All functions are prefixed by + \cppinline{const_}.} + \label{tab:Mathematical constants} +\end{table} + +\section{Vectorized operations} +\label{sec:Vectorized operations} + +The library provides a set of functions for vectorized mathematical operations. +For example, +\begin{cppcode} + std::size_t n = 1000; + vsmc::Vector a(n), b(n), y(n); + // Fill vectors a and b + add(n, a.data(), b.data(), y.data()); +\end{cppcode} +performs addition for vectors. It is equivalent to +\begin{cppcode} + for (std::size_t i = 0; i != n; ++i) + y[i] = a[i] + b[i]; +\end{cppcode} +The functions defined are listed in table~\ref{tab:Vectorized mathematical + operations}. + +\begin{table}[ht] + \begin{tabu}{X[l]X[2l]X[l]X[2l]X[l]X[2l]} + \toprule + Function & Operation & + Function & Operation & + Function & Operation \\ + \midrule + \texttt{add} & $a + b$ & + \texttt{sub} & $a - b$ & + \texttt{sqr} & $a^2$ \\ + \texttt{mul} & $ab$ & + \texttt{abs} & $|a|$ & + \texttt{fma} & $ab + c$ \\ + \texttt{inv} & $1 / a$ & + \texttt{div} & $a / b$ & + \texttt{sqrt} & $\sqrt{a}$ \\ + \texttt{invsqrt} & $1 / \sqrt{a}$ & + \texttt{cbrt} & $\sqrt[3]{a}$ & + \texttt{invcbrt} & $1 / \sqrt[3]{a}$ \\ + \texttt{pow2o3} & $a^{2/3}$ & + \texttt{pow3o2} & $a^{3/2}$ & + \texttt{pow} & $a^b$ \\ + \texttt{hypot} & $\sqrt{a^2 + b^2}$ & + \texttt{exp} & $\EE^a$ & + \texttt{exp2} & $2^a$ \\ + \texttt{exp10} & $10^a$ & + \texttt{expm1} & $\EE^a - 1$ & + \texttt{log} & $\ln(a)$ \\ + \texttt{log2} & $\log_2(a)$ & + \texttt{log10} & $\log_{10}(a)$ & + \texttt{log1p} & $\ln(a + 1)$ \\ + \texttt{cos} & $\cos(a)$ & + \texttt{sin} & $\sin(a)$ & + \texttt{sincos} & $\sin(a)$ and $\cos(a)$ \\ + \texttt{tan} & $\tan(a)$ & + \texttt{acos} & $\arccos(a)$ & + \texttt{asin} & 
$\arcsin(a)$ \\
+        \texttt{atan}    & $\arctan(a)$              &
+        \texttt{atan2}   & $\arctan(a / b)$          &
+                         &                           \\
+        \texttt{cosh}    & $\cosh(a)$                &
+        \texttt{sinh}    & $\sinh(a)$                &
+        \texttt{tanh}    & $\tanh(a)$                \\
+        \texttt{acosh}   & $\mathrm{arc}\cosh(a)$    &
+        \texttt{asinh}   & $\mathrm{arc}\sinh(a)$    &
+        \texttt{atanh}   & $\mathrm{arc}\tanh(a)$    \\
+        \texttt{erf}     & $\mathrm{erf}(a)$         &
+        \texttt{erfc}    & $\mathrm{erfc}(a)$        &
+        \texttt{cdfnorm} & $1 - \mathrm{erfc}(a / \sqrt{2}) / 2$ \\
+        \texttt{lgamma}  & $\ln\Gamma(a)$            &
+        \texttt{tgamma}  & $\Gamma(a)$               &
+                         &                           \\
+        \bottomrule
+    \end{tabu}
+    \caption{Vectorized mathematical operations}
+    \label{tab:Vectorized mathematical operations}
+\end{table}
+
+For each function, the first parameter is always the length of
+the vector, and the last is a pointer to the output vector (except
+\cppinline{sincos}, which has two output parameters). For all functions, the
+output is always a vector. If there is more than one input parameter, then
+some of them, but not all, can be scalars. The order of the input parameters
+is as they appear in the mathematical expressions. For example, since
+\cppinline{fma} performs the operation $ab + c$, the function is called as,
+\begin{cppcode}
+    fma(n, a, b, c, y);
+\end{cppcode}
+where \cppinline{a}, \cppinline{b}, \cppinline{c} are input parameters, and
+some of them, but not all, can be scalars instead of pointers. And
+\cppinline{y} is the output parameter, which has to be a pointer to a
+length-$n$ vector. diff --git a/user_guide/tex/resample.tex b/user_guide/tex/resample.tex new file mode 100644 index 000000000..a943bf963 --- /dev/null +++ b/user_guide/tex/resample.tex @@ -0,0 +1,68 @@ +\chapter{Resampling}
+\label{chap:Resampling}
+
+The library supports resampling in a more general way than the algorithm
+described in chapter~\ref{chap:Sequential Monte Carlo}.
Recall that, given a
+particle system $\{W^{(i)},X^{(i)}\}_{i=1}^N$, a new system $\{\bar{W}^{(i)},
+\bar{X}^{(i)}\}_{i=1}^M$ is generated. Regardless of other statistical
+properties, in practice such an algorithm can be decomposed into three steps.
+First, a vector $\{r_i\}_{i=1}^N$ is generated such that $\sum_{i=1}^N r_i =
+M$. Then a vector $\{a_i\}_{i=1}^M$ is generated such that $\sum_{i=1}^M
+\mathbb{I}_{\{j\}}(a_i) = r_j$. Last, set $\bar{X}^{(i)} = X^{(a_i)}$.
+
+The first step determines the statistical properties of the resampling
+algorithm. The library defines all algorithms discussed in
+\textcite{Douc:2005wa}. Samplers can be constructed with builtin schemes as
+seen in section~\ref{sub:Implementations}. In addition, samplers can also be
+constructed with user-defined resampling operations. Below is the signature,
+\begin{cppcode}
+    template <typename RNGType, typename IntType>
+    void resample(std::size_t M, std::size_t N, RNGType &rng,
+        const double *weight, IntType *replication);
+\end{cppcode}
+The last parameter is the output vector $\{r_i\}_{i=1}^N$. The builtin schemes
+are implemented as classes whose \cppinline{operator()} conforms to the above
+signature. For example, \cppinline{ResampleMultinomial} implements the
+multinomial resampling algorithm.
+
+To transform $\{r_i\}_{i=1}^N$ into $\{a_i\}_{i=1}^M$, one can call the
+following function,
+\begin{cppcode}
+    template <typename IntType1, typename IntType2>
+    void resample_trans_rep_index(std::size_t M, std::size_t N,
+        const IntType1 *replication, IntType2 *index);
+\end{cppcode}
+where the last parameter is the output vector $\{a_i\}_{i=1}^M$. This function
+guarantees that $a_i = i$ if $r_i > 0$. However, its output may not be optimal
+for all applications. The last step of a resampling operation, the copying of
+particles, can be the most time-consuming one, especially on distributed
+systems. The topology of the system will need to be taken into consideration
+to achieve optimal performance.
In those situations, it is best to use
+\cppinline{ResampleMultinomial} etc., to generate the replication numbers, and
+manually perform the rest of the resampling algorithm.
+
+\section{Resizing a sampler}
+\label{sec:Resizing a sampler}
+
+Now, we provide an example of changing the sampler size,
+\begin{cppcode}
+    // sampler is an existing Sampler<T> object; M is the new sample size
+    auto N = sampler.size();
+    auto &rng = sampler.particle().rng();
+    auto weight = sampler.particle().weight().data();
+    Vector<std::size_t> rep(N);
+    Vector<std::size_t> idx(M);
+    ResampleMultinomial resample;
+    resample(M, N, rng, weight, rep.data());
+    resample_trans_rep_index(M, N, rep.data(), idx.data());
+    Particle<T> particle(M);
+    for (std::size_t i = 0; i != M; ++i) {
+        auto sp_dst = particle.sp(i);
+        auto sp_src = sampler.particle().sp(idx[i]);
+        // Assuming T is a subclass of StateMatrix
+        for (std::size_t d = 0; d != sp_dst.dim(); ++d)
+            sp_dst.state(d) = sp_src.state(d);
+    }
+    // Copy other data of class T if any
+    sampler.particle() = std::move(particle);
+\end{cppcode} diff --git a/user_guide/tex/rng.tex b/user_guide/tex/rng.tex new file mode 100644 index 000000000..0deb16bce --- /dev/null +++ b/user_guide/tex/rng.tex @@ -0,0 +1,425 @@ +\chapter{Random number generating}
+\label{chap:Random number generating}
+
+The library has a comprehensive \rng system to facilitate implementation of
+Monte Carlo algorithms.
+
+\section{Seeding}
+\label{sec:Seeding}
+
+The singleton class template \cppinline{SeedGenerator} can be used to generate
+distinct seeds sequentially. For example,
+\begin{cppcode}
+    auto &seed = SeedGenerator::instance();
+    RNG rng1(seed.get()); // Construct rng1
+    RNG rng2(seed.get()); // Construct rng2 with another seed
+\end{cppcode}
+The first argument to the template can be any type. For different types,
+different instances of \cppinline{SeedGenerator} will be created, and thus the
+seeds generated by two instantiations with different first template arguments
+will be independent.
The second parameter is the
+type of the seed values. It can be an unsigned integer type. Classes such as
+\cppinline{Particle} will use the generator of the following type,
+\begin{cppcode}
+    using Seed = SeedGenerator;
+\end{cppcode}
+where \cppinline{VSMC_SEED_RESULT_TYPE} is a configuration macro which is
+defined to \cppinline{unsigned} by default.
+
+One can save and set the seed generator using standard \cpp streams. For
+example,
+\begin{cppcode}
+    std::ifstream seed_txt("seed.txt");
+    if (seed_txt.good())
+        seed_txt >> Seed::instance(); // Read the seed from a file
+    else
+        Seed::instance().set(101);    // The default seed
+    seed_txt.close();
+    // The program
+    std::ofstream seed_out("seed.txt");
+    seed_out << Seed::instance();     // Write the seed to a file
+    seed_out.close();
+\end{cppcode}
+This way, if the simulation program needs to be repeated multiple times, each
+time it will use a different set of seeds.
+
+A single seed generator is enough for a single computer program. However, it is
+more difficult to ensure that each computing node has a distinct set of
+seeds in a distributed system. A simple solution is to use the
+\cppinline{modulo} method of \cppinline{SeedGenerator}. For example,
+\begin{cppcode}
+    Seed::instance().modulo(n, r);
+\end{cppcode}
+where $n$ is the number of processes and $r$ is the rank of the current node.
+After this call, all seeds generated will belong to the equivalence class $s
+\equiv r \pmod{n}$. Therefore, no two nodes will ever generate the same seed.
+
+\section{Counter-based \protect\rng}
+\label{sec:Counter-based RNG}
+
+The standard library provides a set of \rng classes. Unfortunately, none of
+them are suitable for parallel computing without considerable effort.
See the paper for more details. Here, it is sufficient to
+mention that the \rng{}s introduced in the paper use a deterministic function
+$f_k$ such that, for any sequence $\{c_i\}_{i>0}$, the sequence
+$\{y_i\}_{i>0}$, $y_i = f_k(c_i)$, appears random. In addition, for $k_1 \ne
+k_2$, $f_{k_1}$ and $f_{k_2}$ will generate two sequences that appear
+statistically independent. Compared to more conventional \rng{}s, which use
+recursions $y_i = f(y_{i - 1})$, these counter-based \rng{}s are much easier
+to set up in a parallelized environment.
+
+If $c$, the counter, is an unsigned integer with $b$ bits, and $k$, the key,
+is an unsigned integer with $d$ bits, then for each $k$ the \rng has a period
+$2^b$, and there can be at most $2^d$ independent streams.
+Table~\ref{tab:Counter-based RNG} lists all counter-based \rng{}s implemented
+in this library, along with the bits of the counter and the key. They all
+conform to the \cppoo uniform \rng concept. All \rng{}s in
+\textcite{Salmon:2011um} are implemented, along with a few additions. Note
+that the actual period of an \rng can be longer. For example,
+\cppinline{Philox4x64} has a 256-bits counter but outputs 64-bits integers,
+and thus it has a $2^{1024}$ period. Such a period may seem very small
+compared to many well known \rng{}s. For example, the famous Mersenne-Twister
+generator (\cppinline{std::mt19937}) has a period $2^{19937} - 1$. However,
+combined with $2^{256}$ independent streams, only the most demanding programs
+will find these counter-based \rng{}s insufficient.
+ +\begin{table}[t] + \def\B{\textcolor{MRed}{\textit{B}}} + \def\V{\textcolor{MRed}{\textit{V}}} + \begin{tabu}{X[2l]X[2l]X[l]X[l]} + \toprule + & & \multicolumn{2}{c}{Bits} \\ + \cmidrule{3-4} + Class & Result type & Counter & Key \\ + \midrule + \texttt{AES128\_\B x32} & \texttt{std::uint32\_t} & $128$ & $128$ \\ + \texttt{AES128\_\B x64} & \texttt{std::uint64\_t} & $128$ & $128$ \\ + \texttt{AES192\_\B x32} & \texttt{std::uint32\_t} & $128$ & $192$ \\ + \texttt{AES192\_\B x64} & \texttt{std::uint64\_t} & $128$ & $192$ \\ + \texttt{AES256\_\B x32} & \texttt{std::uint32\_t} & $128$ & $256$ \\ + \texttt{AES256\_\B x64} & \texttt{std::uint64\_t} & $128$ & $256$ \\ + \texttt{ARS\_\B x32} & \texttt{std::uint32\_t} & $128$ & $128$ \\ + \texttt{ARS\_\B x64} & \texttt{std::uint64\_t} & $128$ & $128$ \\ + \texttt{Philox2x32\V} & \texttt{std::uint32\_t} & $64$ & $64$ \\ + \texttt{Philox2x64\V} & \texttt{std::uint64\_t} & $128$ & $128$ \\ + \texttt{Philox4x32\V} & \texttt{std::uint32\_t} & $128$ & $128$ \\ + \texttt{Philox4x64\V} & \texttt{std::uint64\_t} & $256$ & $256$ \\ + \texttt{Threefry2x32\V} & \texttt{std::uint32\_t} & $64$ & $64$ \\ + \texttt{Threefry2x64\V} & \texttt{std::uint64\_t} & $128$ & $128$ \\ + \texttt{Threefry4x32\V} & \texttt{std::uint32\_t} & $128$ & $128$ \\ + \texttt{Threefry4x64\V} & \texttt{std::uint64\_t} & $256$ & $256$ \\ + \bottomrule + \end{tabu} + \caption{Counter-based \rng; \B: either \cppinline{1}, \cppinline{2}, + \cppinline{4}, or \cppinline{8}; \V: either empty, \cppinline{SSE2}, or + \cppinline{AVX2}.} + \label{tab:Counter-based RNG} +\end{table} + +\begin{table}[t] + \begin{tabu}{X[l]X[l]} + \toprule + Macro & Default \\ + \midrule + \texttt{VSMC\_RNG\_AES\_BLOCKS} & \texttt{4} \\ + \texttt{VSMC\_RNG\_ARS\_ROUNDS} & \texttt{5} \\ + \texttt{VSMC\_RNG\_ARS\_BLOCKS} & \texttt{4} \\ + \texttt{VSMC\_RNG\_PHILOX\_ROUNDS} & \texttt{10} \\ + \texttt{VSMC\_RNG\_PHILOX\_VECTOR\_LENGTH} & \texttt{4} \\ + \texttt{VSMC\_RNG\_THREEFRY\_ROUNDS} & 
\texttt{20} \\ + \texttt{VSMC\_RNG\_THREEFRY\_VECTOR\_LENGTH} & \texttt{4} \\ + \bottomrule + \end{tabu} + \caption{Configuration macros for the counter-based \rng} + \label{tab:Configuration macros for the counter-based RNG} +\end{table} + +\subsection{\protect\aesni intrinsics based \protect\rng} +\label{sub:AES-NI intrinsics based RNG} + +The \aesni intrinsics based \rng{}s in \textcite{Salmon:2011um} are implemented +in a more general form, +\begin{cppcode} + template + using AESNIEngine = + CounterEngine>; +\end{cppcode} +where \cppinline{KeySeqType} is the class used to generate the sequence of +round keys; \cppinline{Rounds} is the number of rounds of \aes encryption to be +performed. See the reference manual for details of how to define the key +sequence class. The \aesni intrinsics have a latency of seven or eight cycles, +while they can be issued at every cycle. Therefore better performance can be +achieved if multiple 128-bits random integers are generated at the same time. +This is specified by the template parameter \cppinline{Blocks}. Larger blocks, +up to eight, can improve runtime performance but this is at the cost of larger +state size. + +Four types of key sequences are implemented by the library, corresponding to +the \ars algorithm in \textcite{Salmon:2011um} and the \aes-128, \aes-192, and +\aes-256 algorithms. The following \rng engines are defined. +\begin{cppcode} + template + using ARSEngine = + AESNIEngine, Rounds, Blocks>; + + template + using AES128Engine = + AESNIEngine, 10, Blocks>; + + template + using AES192Engine = + AESNIEngine, 12, Blocks>; + + template + using AES256Engine = + AESNIEngine, 14, Blocks>; +\end{cppcode} +The default template arguments can be changed by configuration macros listed in +table~\ref{tab:Configuration macros for the counter-based RNG}. Shortcuts are +also defined, as listed in table~\ref{tab:Counter-based RNG}. 
For example,
+\cppinline{ARS_4x32} is \cppinline{ARSEngine} with result type
+\cppinline{std::uint32_t}, four blocks, and the default number of rounds.
+
+\subsection{Philox}
+\label{sub:Philox}
+
+The Philox algorithm in \textcite{Salmon:2011um} is implemented in a more
+general form,
+\begin{cppcode}
+    template
+    using PhiloxEngine = CounterEngine>;
+\end{cppcode}
+The default vector length and the number of rounds can be changed by
+configuration macros listed in table~\ref{tab:Configuration macros for the
+  counter-based RNG}. Shortcuts are also defined, as listed in
+table~\ref{tab:Counter-based RNG}. For example, \cppinline{Philox4x32} is
+\cppinline{PhiloxEngine} with result type \cppinline{std::uint32_t}, vector
+length four, and the default number of rounds.
+
+\subsection{Threefry}
+\label{sub:Threefry}
+
+The Threefry algorithm in \textcite{Salmon:2011um} is implemented in a more
+general form,
+\begin{cppcode}
+    template
+    using ThreefryEngine = CounterEngine>;
+\end{cppcode}
+The default vector length and the number of rounds can be changed by
+configuration macros listed in table~\ref{tab:Configuration macros for the
+  counter-based RNG}. Shortcuts are also defined, as listed in
+table~\ref{tab:Counter-based RNG}. For example, \cppinline{Threefry4x32} is
+\cppinline{ThreefryEngine} with result type \cppinline{std::uint32_t}, vector
+length four, and the default number of rounds.
+
+If \sse{}2 intrinsics are supported, then a more optimized version is also
+implemented. This implementation can have higher performance at the cost of a
+larger state size. If \avx{}2 intrinsics are supported, an even more optimized
+version, with an even larger state size, is also implemented.
+\begin{cppcode}
+    template
+    using ThreefryEngineSSE2 =
+        CounterEngine>;
+
+    template
+    using ThreefryEngineAVX2 =
+        CounterEngine>;
+\end{cppcode}
+
+\subsection{Default \protect\rng}
+\label{sub:Default RNG}
+
+Note that not all \rng{}s defined by the library are available on all
+platforms. The library also defines a type alias \cppinline{RNG} which is one
The library also defines a type alias \cppinline{RNG} which is one
+of the \rng{}s listed in table~\ref{tab:Counter-based RNG}. More specifically,
+if the \aesni intrinsics are supported,
+\begin{cppcode}
+  using RNG = ARS_4x32;
+\end{cppcode}
+otherwise, if \avx{}2 intrinsics are supported,
+\begin{cppcode}
+  using RNG = Threefry4x32AVX2;
+\end{cppcode}
+otherwise, if \sse{}2 intrinsics are supported,
+\begin{cppcode}
+  using RNG = Threefry4x32SSE2;
+\end{cppcode}
+and last, on all other platforms,
+\begin{cppcode}
+  using RNG = Threefry4x32;
+\end{cppcode}
+This can be changed by the configuration macro \cppinline{VSMC_RNG_TYPE}.
+
+\section{Non-deterministic \protect\rng}
+\label{sec:Non-deterministic RNG}
+
+If the \rdrand intrinsics are supported, the library also implements three
+\rng{}s, \cppinline{RDRAND16}, \cppinline{RDRAND32} and \cppinline{RDRAND64}.
+They output 16-, 32-, and 64-bit random unsigned integers, respectively.
+
+\section{\protect\mkl{} \protect\rng}
+\label{sec:MKL RNG}
+
+The \mkl library provides some high performance \rng{}s. This library
+implements a wrapper class \cppinline{MKLEngine} that makes them accessible as
+\cppoo{} generators. They are listed in table~\ref{tab:MKL RNG}. Note that
+\mkl{} \rng{}s perform best when they are used to generate vectors of random
+numbers. These wrappers use a buffer to store such vectors, and thus they have
+a much larger state space than usual \rng{}s.
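The buffering strategy behind such wrappers can be illustrated with a small, self-contained sketch in standard C++. This is an illustrative analogue only, not the library's \cppinline{MKLEngine}; the class name \cppinline{BufferedEngine} and its details are invented here. Random numbers are generated a whole vector at a time and then served one by one from the buffer.

```cpp
#include <cstddef>
#include <random>
#include <vector>

// Illustrative analogue of a buffered RNG wrapper (names invented here; this
// is not the library's MKLEngine): random numbers are generated a whole
// vector at a time and served from the buffer, which trades a larger state
// for better throughput when vectorized generation is fast.
class BufferedEngine
{
    public:
    using result_type = std::mt19937_64::result_type;

    explicit BufferedEngine(std::size_t buffer_size = 1024)
        : buffer_(buffer_size), index_(buffer_size)
    {
    }

    static constexpr result_type min() { return std::mt19937_64::min(); }
    static constexpr result_type max() { return std::mt19937_64::max(); }

    result_type operator()()
    {
        if (index_ == buffer_.size()) {
            for (auto &v : buffer_) // refill the whole buffer at once
                v = rng_();
            index_ = 0;
        }
        return buffer_[index_++];
    }

    private:
    std::mt19937_64 rng_;
    std::vector<result_type> buffer_;
    std::size_t index_;
};
```

Because the buffer is refilled in generation order, the wrapper produces the same stream as the underlying engine while amortizing per-call overhead, at the cost of the buffer's extra state.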
+
+\begin{table}[t]
+    \begin{tabu}{X[l]X[l]}
+        \toprule
+        Class & \mkl \brng \\
+        \midrule
+        \texttt{MKL\_MCG59} & \texttt{VSL\_BRNG\_MCG59} \\
+        \texttt{MKL\_MT19937} & \texttt{VSL\_BRNG\_MT19937} \\
+        \texttt{MKL\_MT2203} & \texttt{VSL\_BRNG\_MT2203} \\
+        \texttt{MKL\_SFMT19937} & \texttt{VSL\_BRNG\_SFMT19937} \\
+        \texttt{MKL\_NONDETERM} & \texttt{VSL\_BRNG\_NONDETERM} \\
+        \texttt{MKL\_ARS5} & \texttt{VSL\_BRNG\_ARS5} \\
+        \texttt{MKL\_PHILOX4X32X10} & \texttt{VSL\_BRNG\_PHILOX4X32X10} \\
+        \bottomrule
+    \end{tabu}
+    \caption[Intel \protect\mkl{} \protect\rng]{\mkl{} \rng. Note: all
+        classes can have a suffix \cppinline{_64}.}
+    \label{tab:MKL RNG}
+\end{table}
+
+\section{Multiple \protect\rng streams}
+\label{sec:Multiple RNG streams}
+
+Earlier, in section~\ref{sub:Particle}, we saw that
+\cppinline{particle.rng(i)} returns an independent \rng instance. This is done
+through a class template called \cppinline{RNGSet}. Three implementations are
+provided by the library. They all have the same interface,
+\begin{cppcode}
+  RNGSet rng_set(N); // A set of N RNGs
+  rng_set.resize(n); // Change the size of the set
+  rng_set.seed();    // Seed each RNG in the set with Seed::instance()
+  rng_set[i];        // Get a reference to the i-th RNG
+\end{cppcode}
+The first implementation is \cppinline{RNGSetScalar}. As its name suggests, it
+is only a wrapper of a single \rng. All calls to \cppinline{rng_set[i]} return
+a reference to the same \rng. It is only useful when an \cppinline{RNGSet}
+interface is required while thread-safety and related issues are not
+important.
+
+The second implementation is \cppinline{RNGSetVector}. It is an array of
+\rng{}s with length $N$, and thus has memory cost $O(N)$. Many of the
+counter-based \rng{}s have small state sizes, so for moderate $N$ this cost is
+not an issue. The calls \cppinline{rng_set[i]} and \cppinline{rng_set[j]}
+return independent \rng{}s if $i \ne j$.
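The behavior of \cppinline{RNGSetVector} can be sketched in a few lines of standard C++. This is an illustrative analogue with invented names, not the library's implementation; in particular, the simplistic seeding scheme stands in for the library's \cppinline{Seed::instance()} mechanism.

```cpp
#include <cstddef>
#include <random>
#include <vector>

// Illustrative analogue of RNGSetVector (names invented here; not the
// library's implementation): an array of engines, each seeded distinctly,
// so that rng_set[i] and rng_set[j] yield independent streams for i != j.
template <typename RNGType>
class VectorRNGSet
{
    public:
    explicit VectorRNGSet(std::size_t n) { resize(n); }

    // Change the size of the set; newly added engines get distinct seeds
    void resize(std::size_t n)
    {
        while (set_.size() < n)
            set_.push_back(RNGType(static_cast<unsigned>(set_.size() + 1)));
        while (set_.size() > n)
            set_.pop_back();
    }

    // Re-seed every engine in the set
    void seed()
    {
        for (std::size_t i = 0; i != set_.size(); ++i)
            set_[i].seed(static_cast<unsigned>(i + 1));
    }

    RNGType &operator[](std::size_t i) { return set_[i]; }

    std::size_t size() const { return set_.size(); }

    private:
    std::vector<RNGType> set_;
};
```

The memory cost is visibly $O(N)$: one full engine state per index, which is why small-state counter-based engines suit this layout well.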
+
+Last, if \tbb is available, there is a third implementation,
+\cppinline{RNGSetTBB}, which uses thread-local storage (\tls). It has a much
+smaller memory footprint than \cppinline{RNGSetVector} while maintaining
+thread-safety. The performance impact of using \tls is minimal unless the
+computation at the calling site is trivial. For example,
+\begin{cppcode}
+  std::size_t eval_pre(SingleParticle<T> sp)
+  {
+      auto &rng = sp.rng();
+      // use rng to initialize the state
+      // do some computation, likely far more costly than TLS
+  }
+\end{cppcode}
+The type alias \cppinline{RNGSet} is defined to be \cppinline{RNGSetTBB} if
+\tbb is available, and \cppinline{RNGSetVector} otherwise. It is used by the
+\cppinline{Particle} class template. One can replace the type of the \rng set
+used by \cppinline{Particle} with a member type of \cppinline{T}. For
+example,
+\begin{cppcode}
+  class T
+  {
+      public:
+      using rng_set_type = /* User defined type */;
+  };
+\end{cppcode}
+will replace the \rng set used by \cppinline{Particle} with the user-defined
+type.
+
+\section{Distributions}
+\label{sec:Distributions}
+
+The library also provides implementations of some common distributions. They
+all conform to the \cppoo random number distribution concept. Some of them
+are the same as those in the \cppoo standard library, with
+\cppinline{CamelCase} names. For example, \cppinline{NormalDistribution} can
+be used as a drop-in replacement for \cppinline{std::normal_distribution}.
+This includes all of the continuous distributions defined in the standard
+library. Their benefits compared to the standard library will be discussed
+later. Table~\ref{tab:Random number distributions} lists all the additional
+distributions implemented.
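The closed/open interval variants listed in table~\ref{tab:Random number distributions} (\cppinline{U01CC}, \cppinline{U01CO}, \cppinline{U01OC}, \cppinline{U01OO}) differ only in whether the endpoints 0 and 1 are attainable. The following standalone helpers sketch one way such mappings from 32-bit integers can be defined; the function names are invented here and this is not the library's implementation.

```cpp
#include <cstdint>

// Illustrative mappings from a 32-bit random integer to the four interval
// variants (function names invented here; not the library's implementation).
inline double u01_cc(std::uint32_t u)
{
    return u * (1.0 / 4294967295.0); // [0, 1]: both endpoints attainable
}

inline double u01_co(std::uint32_t u)
{
    return u * (1.0 / 4294967296.0); // [0, 1): zero attainable, one is not
}

inline double u01_oc(std::uint32_t u)
{
    return (u + 1.0) * (1.0 / 4294967296.0); // (0, 1]: one attainable
}

inline double u01_oo(std::uint32_t u)
{
    return (u + 0.5) * (1.0 / 4294967296.0); // (0, 1): neither endpoint
}
```

The open variants matter in practice: a guaranteed nonzero (or non-one) variate avoids singularities when the result is fed into functions such as the logarithm or an inverse CDF.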
+
+\begin{table}[t]
+    \begin{tabu}{X[l]X[4l]}
+        \toprule
+        Class & Notes \\
+        \midrule
+        \texttt{UniformBits} & No parameters;
+        uniform on the set $\{0,\dots,2^b - 1\}$, where $b$ is the number of
+        bits of the result type, which has to be an unsigned integer type. \\
+        \texttt{U01} & No parameters; uniform on $[0, 1)$ \\
+        \texttt{U01CC} & No parameters; uniform on $[0, 1]$ \\
+        \texttt{U01CO} & No parameters; uniform on $[0, 1)$ \\
+        \texttt{U01OC} & No parameters; uniform on $(0, 1]$ \\
+        \texttt{U01OO} & No parameters; uniform on $(0, 1)$ \\
+        \texttt{Laplace} & Parameters: location \texttt{a}; scale \texttt{b} \\
+        \texttt{Levy} & Parameters: location \texttt{a}; scale \texttt{b} \\
+        \texttt{Pareto} & Parameters: shape \texttt{a}; scale \texttt{b} \\
+        \texttt{Rayleigh} & Parameters: scale \texttt{sigma} \\
+        \bottomrule
+    \end{tabu}
+    \caption{Random number distributions. Note: all class names have a suffix
+        \cppinline{Distribution}, which is omitted in the table.}
+    \label{tab:Random number distributions}
+\end{table}
+
+Lastly, the library also implements the multivariate Normal distribution. Its
+usage is summarized by the following.
+\begin{cppcode*}{texcomments}
+  double mean[2] = { /* the mean vector */ };
+  double cov[4] = { /* the covariance matrix */ };
+  double chol[3];
+  double r[2];
+  // Compute the lower triangular of the Cholesky decomposition
+  cov_chol(2, cov, chol);
+  RNG rng;
+  NormalMVDistribution<double, 2> norm2(mean, chol); // Bivariate Normal
+  NormalMVDistribution<double> normd(2, mean, chol); // Same as above
+  norm2(rng, r); // Generate a bivariate Normal variate
+  normd(rng, r); // Same as above
+\end{cppcode*}
+We shall mention here that the static form, where the dimension is specified
+as a template parameter, is more efficient.
+
+\section{Vectorized random number generating}
+\label{sec:Vectorized random number generating}
+
+The \rng{}s and distributions implemented by this library provide vectorized
+operations.
For example,
+\begin{cppcode}
+  std::size_t n = 1000;
+  RNG rng;
+  NormalDistribution<double> norm(0, 1);
+  Vector<RNG::result_type> u(n);
+  Vector<double> r(n);
+  rng(n, u.data());           // Generate n random unsigned integers
+  rng_rand(rng, n, u.data()); // Same as above
+  norm(rng, n, r.data());     // Generate n Normal random numbers
+  normal_distribution(rng, n, r.data(), 0.0, 1.0);     // Same as above
+  normal_distribution(rng, n, r.data(), norm.param()); // Same as above
+  rng_rand(rng, norm, n, r.data());                    // Same as above
+\end{cppcode}
+Note that these functions will be specialized to use \mkl routines if
+\cppinline{rng} is one of the engines listed in table~\ref{tab:MKL RNG}.
diff --git a/user_guide/tex/smc.tex b/user_guide/tex/smc.tex
new file mode 100644
index 000000000..e660723cb
--- /dev/null
+++ b/user_guide/tex/smc.tex
@@ -0,0 +1,205 @@
+\chapter{Sequential Monte Carlo}
+\label{chap:Sequential Monte Carlo}
+
+\section{Introduction}
+\label{sec:Introduction}
+
+Sequential Monte Carlo (\smc) methods are a class of sampling algorithms that
+combine importance sampling and resampling. They have been primarily used as
+``particle filters'' to solve optimal filtering problems; see, for example,
+\textcite{Cappe:2007hz} and \textcite{Doucet:2011us} for recent reviews. They
+are also used in a static setting where a target distribution is of interest,
+for example, for the purpose of Bayesian modeling. This was proposed by
+\textcite{DelMoral:2006hc} and developed by \textcite{Peters:2005wh} and
+\textcite{DelMoral:2006wv}. This framework involves the construction of a
+sequence of artificial distributions on spaces of increasing dimensions which
+admit the distributions of interest as particular marginals.
+
+\smc algorithms were long perceived as being difficult to implement, and
+general tools were not available until the development of
+\textcite{Johansen:2009wd}, which provided a general framework for
+implementing \smc algorithms. \smc algorithms admit natural and scalable
+parallelization.
However, parallel
+implementations of \smc algorithms have mostly been limited to specific
+applications, usually associated with particular \smc related research.
+\textcite{Lee:2010fm} studied the parallelization of \smc algorithms on
+\gpu{}s with some generality. There are few general tools for implementing
+\smc algorithms on parallel hardware, even though multicore \cpu{}s are very
+common today and computing on specialized hardware such as \gpu{}s is more and
+more popular.
+
+The purpose of the current work is to provide a general framework for
+implementing \smc algorithms on both sequential and parallel hardware. There
+are two main goals of the presented framework. The first is reusability. It
+will be demonstrated that the same implementation source code can be used to
+build a sequential sampler, or, using different programming models (for
+example, OpenMP and Intel Threading Building Blocks), to build parallelized
+samplers for multicore \cpu{}s. The second is extensibility. It is possible to
+write a backend for \vsmc to use new parallel programming models while reusing
+existing implementations. It is also possible to enhance the library to
+improve performance for specific applications. Almost all components of the
+library can be reimplemented by users; thus, if the default implementation is
+not suitable for a specific application, it can be replaced while still
+integrating seamlessly with the other components.
+ +\section{Sequential importance sampling and resampling} +\label{sec:Sequential importance sampling and resampling} + +Importance sampling is a technique which allows the calculation of the +expectation of a function $\varphi$ with respect to a distribution $\pi$ using +samples from some other distribution $\eta$ with respect to which $\pi$ is +absolutely continuous, based on the identity, +\begin{equation} + \Exp_{\pi}[\varphi(X)] + = \int\varphi(x)\pi(x)\intd x + = \int\frac{\varphi(x)\pi(x)}{\eta(x)}\eta(x)\intd x + = \Exp_{\eta}\Square[Big]{\frac{\varphi(X)\pi(X)}{\eta(X)}} +\end{equation} +And thus, let $\{X^{(i)}\}_{i=1}^N$ be samples from $\eta$, then +$\Exp_{\pi}[\varphi(X)]$ can be approximated by +\begin{equation} + \hat\varphi_1 = + \frac{1}{N}\sum_{i=1}^N\frac{\varphi(X^{(i)})\pi(X^{(i)})}{\eta(X^{(i)})} +\end{equation} +In practice $\pi$ and $\eta$ are often only known up to some normalizing +constants, which can be estimated using the same samples. Let $w^{(i)} = +\pi(X^{(i)})/\eta(X^{(i)})$, then we have +\begin{equation} + \hat\varphi_2 = + \frac{\sum_{i=1}^Nw^{(i)}\varphi(X^{(i)})}{\sum_{i=1}^Nw^{(i)}} +\end{equation} +or +\begin{equation} + \hat\varphi_3 = \sum_{i=1}^NW^{(i)}\varphi(X^{(i)}) +\end{equation} +where $W^{(i)}\propto w^{(i)}$ and are normalized such that +$\sum_{i=1}^NW^{(i)} = 1$. + +Sequential importance sampling (\sis) generalizes the importance sampling +technique for a sequence of distributions $\{\pi_t\}_{t\ge0}$ defined on spaces +$\{\prod_{k=0}^tE_k\}_{t\ge0}$. At time $t = 0$, sample $\{X_0^{(i)}\}_{i=1}^N$ +from $\eta_0$ and compute the weights $W_0^{(i)} \propto +\pi_0(X_0^{(i)})/\eta_0(X_0^{(i)})$. At time $t\ge1$, each sample +$X_{0:t-1}^{(i)}$, usually termed \emph{particles} in the literature, is +extended to $X_{0:t}^{(i)}$ by a proposal distribution +$q_t(\cdot|X_{0:t-1}^{(i)})$. 
The weights are then recalculated by $W_t^{(i)}
+\propto \pi_t(X_{0:t}^{(i)})/\eta_t(X_{0:t}^{(i)})$ where
+\begin{equation}
+  \eta_t(X_{0:t}^{(i)}) =
+  \eta_{t-1}(X_{0:t-1}^{(i)})q_t(X_{0:t}^{(i)}|X_{0:t-1}^{(i)})
+\end{equation}
+and thus
+\begin{align}
+  W_t^{(i)} \propto \frac{\pi_t(X_{0:t}^{(i)})}{\eta_t(X_{0:t}^{(i)})}
+  &= \frac{\pi_t(X_{0:t}^{(i)})\pi_{t-1}(X_{0:t-1}^{(i)})}
+  {\eta_{t-1}(X_{0:t-1}^{(i)})q_t(X_{0:t}^{(i)}|X_{0:t-1}^{(i)})
+  \pi_{t-1}(X_{0:t-1}^{(i)})} \notag\\
+  &= \frac{\pi_t(X_{0:t}^{(i)})}
+  {q_t(X_{0:t}^{(i)}|X_{0:t-1}^{(i)})\pi_{t-1}(X_{0:t-1}^{(i)})}W_{t-1}^{(i)}
+  \label{eq:si}
+\end{align}
+and an importance sampling estimate of $\Exp_{\pi_t}[\varphi_t(X_{0:t})]$ can
+be obtained using $\{W_t^{(i)},X_{0:t}^{(i)}\}_{i=1}^N$.
+
+However, this approach fails as $t$ becomes large. The weights tend to become
+concentrated on a few particles as the discrepancy between $\eta_t$ and
+$\pi_t$ becomes larger. Resampling techniques are applied such that a new
+particle system $\{\bar{W}_t^{(i)},\bar{X}_{0:t}^{(i)}\}_{i=1}^M$ is obtained
+with the property,
+\begin{equation}
+  \Exp\Square[Big]{\sum_{i=1}^M\bar{W}_t^{(i)}\varphi_t(\bar{X}_{0:t}^{(i)})} =
+  \Exp\Square[Big]{\sum_{i=1}^NW_t^{(i)}\varphi_t(X_{0:t}^{(i)})}
+  \label{eq:resample}
+\end{equation}
+In practice, the resampling algorithm is usually chosen such that $M = N$ and
+$\bar{W}^{(i)} = 1/N$ for $i=1,\dots,N$. Resampling can be performed at each
+time $t$ or adaptively, based on some criterion of the discrepancy. One
+popular quantity used to monitor the discrepancy is the \emph{effective
+sample size} (\ess), introduced by \textcite{Liu:1998iu}, defined as
+\begin{equation}
+  \ess_t = \frac{1}{\sum_{i=1}^N (W_t^{(i)})^2}
+\end{equation}
+where $\{W_t^{(i)}\}_{i=1}^N$ are the normalized weights. Resampling can then
+be performed when $\ess_t \le \alpha N$ for some $\alpha\in[0,1]$.
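The \ess defined above is straightforward to compute from unnormalized weights; the following is a minimal standalone C++ sketch (not the library's implementation).

```cpp
#include <vector>

// Effective sample size from unnormalized weights:
// normalize the weights, then compute ESS = 1 / sum_i W_i^2.
// A standalone sketch; not the library's implementation.
inline double ess(const std::vector<double> &w)
{
    double sum = 0;
    for (double v : w)
        sum += v;
    double sum_sq = 0;
    for (double v : w) {
        double W = v / sum; // normalized weight
        sum_sq += W * W;
    }
    return 1 / sum_sq;
}
```

With equal weights the \ess equals $N$, and it approaches 1 as the weight mass concentrates on a single particle, which is exactly the degeneracy that triggers resampling.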
+
+The common practice of resampling is to replicate particles with large weights
+and discard those with small weights. In other words, instead of generating a
+random sample $\{\bar{X}_{0:t}^{(i)}\}_{i=1}^N$ directly, a random sample of
+integers $\{R^{(i)}\}_{i=1}^N$ is generated, such that $R^{(i)} \ge 0$ for $i =
+1,\dots,N$ and $\sum_{i=1}^N R^{(i)} = N$. Each particle value
+$X_{0:t}^{(i)}$ is then replicated $R^{(i)}$ times in the new particle system.
+The distribution of $\{R^{(i)}\}_{i=1}^N$ must fulfill the requirement of
+Equation~\eqref{eq:resample}. One such distribution is the multinomial
+distribution of size $N$ with weights $(W_t^{(1)},\dots,W_t^{(N)})$. See
+\textcite{Douc:2005wa} for some commonly used resampling algorithms.
+
+\section{\protect\smc samplers}
+\label{sec:SMC Samplers}
+
+\smc samplers allow us to obtain, iteratively, collections of weighted samples
+from a sequence of distributions $\{\pi_t\}_{t\ge0}$ over essentially any
+random variables on some spaces $\{E_t\}_{t\ge0}$, by constructing a sequence
+of auxiliary distributions $\{\tilde\pi_t\}_{t\ge0}$ on spaces of increasing
+dimensions, $\tilde\pi_t(x_{0:t})=\pi_t (x_t) \prod_{s=0}^{t-1}
+L_s(x_{s+1},x_s)$, where the sequence of Markov kernels $\{L_s\}_{s=0}^{t-1}$,
+termed backward kernels, is formally arbitrary but critically influences the
+estimator variance. See \textcite{DelMoral:2006hc} for further details and
+guidance on the selection of these kernels.
+
+Standard sequential importance sampling and resampling algorithms can then be
+applied to the sequence of synthetic distributions, $\{\tilde\pi_t\}_{t\ge0}$.
+At time $t - 1$, assume that a set of weighted particles
+$\{W_{t-1}^{(i)},X_{0:t-1}^{(i)}\}_{i=1}^N$ approximating $\tilde\pi_{t-1}$ is
+available. At time $t$, the path of each particle is extended with a Markov
+kernel, say $K_t(x_{t-1}, x_t)$, and the set of particles
+$\{X_{0:t}^{(i)}\}_{i=1}^N$ is distributed according to
+$\eta_t(X_{0:t}^{(i)}) =
+\eta_0(X_0^{(i)})\prod_{k=1}^tK_k(X_{k-1}^{(i)}, X_k^{(i)})$, where $\eta_0$
+is the initial distribution of the particles. To correct the discrepancy
+between $\eta_t$ and $\tilde\pi_t$, Equation~\eqref{eq:si} is applied and in
+this case,
+\begin{equation}
+  W_t^{(i)} \propto \frac{\tilde\pi_t(X_{0:t}^{(i)})}{\eta_t(X_{0:t}^{(i)})}
+  = \frac{\pi_t(X_t^{(i)})\prod_{s=0}^{t-1}L_s(X_{s+1}^{(i)}, X_s^{(i)})}
+  {\eta_0(X_0^{(i)})\prod_{k=1}^tK_k(X_{k-1}^{(i)},X_k^{(i)})}
+  \propto \tilde{w}_t(X_{t-1}^{(i)}, X_t^{(i)})W_{t-1}^{(i)}
+\end{equation}
+where $\tilde{w}_t$, termed the \emph{incremental weights}, are calculated
+as,
+\begin{equation}
+  \tilde{w}_t(X_{t-1}^{(i)},X_t^{(i)}) =
+  \frac{\pi_t(X_t^{(i)})L_{t-1}(X_t^{(i)}, X_{t-1}^{(i)})}
+  {\pi_{t-1}(X_{t-1}^{(i)})K_t(X_{t-1}^{(i)}, X_t^{(i)})}
+\end{equation}
+If $\pi_t$ is only known up to a normalizing constant, say $\pi_t(x_t) =
+\gamma_t(x_t)/Z_t$, then we can use the \emph{unnormalized} incremental
+weights
+\begin{equation}
+  w_t(X_{t-1}^{(i)},X_t^{(i)}) =
+  \frac{\gamma_t(X_t^{(i)})L_{t-1}(X_t^{(i)}, X_{t-1}^{(i)})}
+  {\gamma_{t-1}(X_{t-1}^{(i)})K_t(X_{t-1}^{(i)}, X_t^{(i)})}
+\end{equation}
+for importance sampling. Further, with the previously \emph{normalized}
+weights $\{W_{t-1}^{(i)}\}_{i=1}^N$, we can estimate the ratio of normalizing
+constants $Z_t/Z_{t-1}$ by
+\begin{equation}
+  \frac{\hat{Z}_t}{Z_{t-1}} =
+  \sum_{i=1}^N W_{t-1}^{(i)}w_t(X_{t-1}^{(i)},X_t^{(i)})
+\end{equation}
+Sequentially, the ratio of normalizing constants between the initial
+distribution $\pi_0$ and some target $\pi_T$, $T\ge1$, can thus be estimated.
See \textcite{DelMoral:2006hc}
+for details on calculating the incremental weights. In practice, when $K_t$
+is $\pi_t$-invariant and the approximate suboptimal backward kernel
+\begin{equation}
+  L_{t-1}(x_t, x_{t-1}) = \frac{\pi_t(x_{t-1})K_t(x_{t-1}, x_t)}{\pi_t(x_t)}
+\end{equation}
+is used, the unnormalized incremental weights become
+\begin{equation}
+  w_t(X_{t-1}^{(i)},X_t^{(i)}) =
+  \frac{\gamma_t(X_{t-1}^{(i)})}{\gamma_{t-1}(X_{t-1}^{(i)})}.
+  \label{eq:inc_weight_mcmc}
+\end{equation}
+
+\section{Other sequential Monte Carlo algorithms}
+\label{sec:Other sequential Monte Carlo algorithms}
+
+Some other commonly used sequential Monte Carlo algorithms can be viewed as
+special cases of the algorithms introduced above. Annealed importance
+sampling (\ais; \textcite{Neal:2001we}) can be viewed as \smc samplers
+without resampling. Particle filters, as seen in the physics and signal
+processing literature, can also be interpreted as sequential importance
+sampling and resampling algorithms. See \textcite{Doucet:2011us} for a review
+of this topic.
diff --git a/user_guide/tex/util.tex b/user_guide/tex/util.tex
new file mode 100644
index 000000000..f55a10d3e
--- /dev/null
+++ b/user_guide/tex/util.tex
@@ -0,0 +1,237 @@
+\chapter{Utilities}
+\label{chap:Utilities}
+
+The library provides some utilities for writing Monte Carlo simulation
+programs. For some of them, such as command line option processing, there are
+more advanced, dedicated libraries available. This library only provides some
+basic functionality that is sufficient for most simple cases.
+
+\section{Aligned memory allocation}
+\label{sec:Aligned memory allocation}
+
+The standard library class \cppinline{std::allocator} is used by containers
+to allocate memory. It works fine in most cases. However, sometimes it is
+desirable to allocate memory aligned to a certain boundary.
The library provides the
+class template,
+\begin{cppcode}
+  template <typename T, std::size_t Alignment = VSMC_ALIGNMENT,
+      typename Memory = AlignedMemory>
+  class AlignedAllocator;
+\end{cppcode}
+where the configuration macro \cppinline{VSMC_ALIGNMENT} is defined to be
+\cppinline{32} by default. For the requirements of the parameter type
+\cppinline{Memory}, see the reference manual. It is sufficient to mention here
+that the default implementation works best if \tbb is available. This class
+can be used as a drop-in replacement of \cppinline{std::allocator}. In fact,
+this library defines a type alias \cppinline{Vector<T>} which is
+\cppinline{std::vector<T, AlignedAllocator<T>>} if \cppinline{T} is a scalar
+type, and \cppinline{std::vector<T>} otherwise.
+
+\section{Sample covariance estimating}
+\label{sec:Sample covariance estimating}
+
+The library provides some basic functionality to estimate sample covariance
+matrices. For example,
+\begin{cppcode}
+  constexpr std::size_t d = /* Dimension */;
+  using T = StateMatrix<RowMajor, d, double>;
+  Sampler<T> sampler(N);
+  // operations on the sampler
+  double mean[d];
+  double cov[d * d];
+  Covariance<double> eval;
+  auto x = sampler.particle().value().data();
+  auto w = sampler.particle().weight().data();
+  eval(RowMajor, N, d, x, w, mean, cov);
+\end{cppcode}
+The sample covariance matrix will be computed and stored in \cppinline{cov},
+and the mean vector in \cppinline{mean}. Note that if either of them is a
+null pointer, then the corresponding output is not computed. The sample
+\cppinline{x} is assumed to be stored in an $N$ by $d$ matrix. The first
+argument passed to \cppinline{eval} is the storage layout of this matrix. If
+\cppinline{x} is a null pointer, then no computation will be done. If
+\cppinline{w} is a null pointer, then the weights are assumed to be equal for
+all samples. The method has three additional optional parameters. The first
+is \cppinline{cov_layout}, which specifies the covariance matrix storage
+layout. The second is \cppinline{cov_upper} and the third is
+\cppinline{cov_packed}; both are \cppinline{false} by default.
If \cppinline{cov_packed} is \cppinline{true}, a
+packed vector of length $d(d+1)/2$ is written into \cppinline{cov}. If
+\cppinline{cov_upper} is \cppinline{false}, then the lower triangular is
+packed, otherwise the upper triangular is packed.
+
+The estimated covariance matrix is often used to construct a multivariate
+Normal distribution for the purpose of generating random walk scales. The
+\cppinline{NormalMVDistribution} in section~\ref{sec:Distributions} accepts
+the lower triangular of the Cholesky decomposition of the covariance instead
+of the covariance matrix itself. The following function computes this
+decomposition,
+\begin{cppcode}
+  double chol[d * (d + 1) / 2];
+  cov_chol(d, cov, chol);
+\end{cppcode}
+The output \cppinline{chol} is a packed vector in row major storage. This
+function also has three optional parameters, which are the same as those of
+\cppinline{Covariance::operator()}, except that they are now used to specify
+the storage scheme of the input parameter \cppinline{cov}.
+
+\section{Storing objects in \protect\hdf}
+\label{sec:Storing objects in HDF5}
+
+If the \hdf library is available, it is possible to store
+\cppinline{Sampler<T>} objects, etc., in the \hdf format. For example,
+\begin{cppcode}
+  hdf5store(sampler, "pf.h5", "sampler");
+\end{cppcode}
+creates a \hdf file with the sampler stored as a list. In R, it can be
+processed as follows,
+\begin{rcode}
+  library(rhdf5)
+  pf <- as.data.frame(h5read("pf.h5", "sampler"))
+\end{rcode}
+This creates a \rinline{data.frame} similar to that shown in
+section~\ref{sub:Implementations}. Other types of objects can also be stored;
+see the reference manual for details.
+
+\section{\protect\raii classes for \protect\mkl pointers}
+\label{sec:RAII classes for MKL pointers}
+
+The library provides a few classes to manage \mkl pointers. They implement
+the Resource Acquisition Is Initialization (\raii) idiom on top of the \mkl C
+interface.
For example,
+\begin{cppcode}
+  // VSLSSTaskPtr ptr;
+  // vsldSSNewTask(&ptr, &p, &n, &xstorage, x, w, indices);
+  MKLSSTask task(&p, &n, &xstorage, x, w, indices);
+  // vsldSSEditMoments(ptr, mean, r2m, r3m, r4m, c2m, c3m, c4m);
+  task.edit_moments(mean, r2m, r3m, r4m, c2m, c3m, c4m);
+  // vsldSSCompute(ptr, estimates, method);
+  task.compute(estimates, method);
+  // vslSSDeleteTask(&ptr);
+\end{cppcode}
+In the above snippet, \cppinline{MKLSSTask} manages a \cppinline{VSLSSTaskPtr}
+task pointer. All C functions that operate on the pointer are also defined as
+methods of the class. Table~\ref{tab:RAII classes for MKL pointers} lists the
+classes defined by the library and their corresponding \mkl pointer types.
+
+\begin{table}[t]
+    \begin{tabu}{X[l]X[l]}
+        \toprule
+        Class & \mkl pointer type \\
+        \midrule
+        \texttt{MKLStream} & \texttt{VSLStreamStatePtr} \\
+        \texttt{MKLSSTask} & \texttt{VSLSSTaskPtr} \\
+        \texttt{MKLConvTask} & \texttt{VSLConvTaskPtr} \\
+        \texttt{MKLCorrTask} & \texttt{VSLCorrTaskPtr} \\
+        \texttt{MKLDFTask} & \texttt{DFTaskPtr} \\
+        \bottomrule
+    \end{tabu}
+    \caption{\protect\raii classes for \protect\mkl pointers}
+    \label{tab:RAII classes for MKL pointers}
+\end{table}
+
+\section{Program options}
+\label{sec:Program options}
+
+The library provides some basic support for processing command line program
+options. Here we show a minimal example. The complete program is shown in
+appendix~\appref{sec:Processing command line program options}. First, one
+needs to allocate variables to store the values of the options to be
+processed.
+\begin{cppcode}
+  int n;
+  std::string str;
+  std::vector<double> vec;
+\end{cppcode}
+All types that support standard library \io stream operations are supported.
+In addition, \cppinline{std::vector<T>}, where \cppinline{T} is a type that
+supports standard library \io stream operations, is also supported. Then,
+\begin{cppcode}
+  ProgramOptionMap option_map;
+\end{cppcode}
+constructs the container of options.
Options can be added to the map,
+\begin{cppcode}
+  option_map
+      .add("str", "A string option with a default value", &str, "default")
+      .add("n", "An integer option", &n)
+      .add("vec", "A vector option", &vec);
+\end{cppcode}
+The first argument is the name of the option, the second is a description,
+and the third is a pointer to where the option's value shall be stored. The
+last, optional argument is a default value. The options on the command line
+can be processed as follows,
+\begin{cppcode}
+  option_map.process(argc, argv);
+\end{cppcode}
+where \cppinline{argc} and \cppinline{argv} are the arguments of the
+\cppinline{main} function. When the program is invoked, options can be passed
+as below,
+\begin{textcode}
+  ./program_option --vec 1 2 1e-1 --str "abc" --vec 8 9 --str "def hij" --n 2 4
+\end{textcode}
+The results of the option processing are displayed below,
+\begin{textcode}
+  n: 4
+  str: def hij
+  vec: 1 2 0.1 8 9
+\end{textcode}
+To summarize this output: the same option can be specified multiple times.
+If it is a scalar option, the last value is used (\textinline{--str},
+\textinline{--n}). A string option's value can be grouped by quotes. For a
+vector option (\textinline{--vec}), all values are gathered together and
+inserted into the vector.
+
+\section{Program progress}
+\label{sec:Program progress}
+
+Sometimes it is desirable to see how much progress a program has made. The
+library provides a \cppinline{Progress} class for this purpose. Here we show
+a minimal example. The complete program is shown in
+appendix~\appref{sec:Display program progress}.
+
+\begin{cppcode}
+  vsmc::Progress progress;
+  progress.start(n * n);
+  for (std::size_t i = 0; i != n; ++i) {
+      std::stringstream ss;
+      ss << "i = " << i;
+      progress.message(ss.str());
+      for (std::size_t j = 0; j != n; ++j) {
+          // Do some computation
+          progress.increment();
+      }
+  }
+  progress.stop();
+\end{cppcode}
+When invoked, the program outputs something similar to the following,
+\begin{textcode}
+  [  4%][00:07][  49019/1000000][i = 49]
+\end{textcode}
+The method \cppinline{progress.start(n * n)} starts the printing of the
+progress. The argument specifies how many iterations there will be before it
+is stopped. The method \cppinline{progress.message(ss.str())} directs the
+program to print a message; this is optional. Each inner iteration increments
+the progress count by calling \cppinline{progress.increment()}. After
+everything is finished, the method \cppinline{progress.stop()} is called.
+
+\section{Timing}
+\label{sec:Timing}
+
+Performance can only be improved after it is properly benchmarked. There are
+advanced profiling programs for this purpose. However, sometimes simple
+timing facilities are enough. The library provides a simple class
+\cppinline{StopWatch} for this purpose. As its name suggests, it works much
+like a physical stop watch. Here is a simple example,
+\begin{cppcode}
+  StopWatch watch;
+  for (std::size_t i = 0; i != n; ++i) {
+      // Some computation
+      watch.start();
+      // Computation to be benchmarked
+      watch.stop();
+      // Some other computation
+  }
+  double t = watch.seconds(); // The time in seconds
+\end{cppcode}
+The above example demonstrates that timing can be accumulated between loop
+iterations, function calls, etc. Note that the result is only accurate if the
+computation between \cppinline{watch.start()} and \cppinline{watch.stop()} is
+non-trivial.
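An accumulating stop watch of this kind can be sketched with standard \cppinline{std::chrono} facilities. This is an illustrative analogue, not the library's \cppinline{StopWatch}; the class name \cppinline{StopWatchSketch} is invented here.

```cpp
#include <chrono>

// Illustrative analogue of StopWatch using std::chrono (class name invented
// here; not the library's implementation). Elapsed time is accumulated
// across multiple start()/stop() pairs.
class StopWatchSketch
{
    public:
    void start() { start_ = std::chrono::steady_clock::now(); }

    void stop() { elapsed_ += std::chrono::steady_clock::now() - start_; }

    void reset() { elapsed_ = std::chrono::steady_clock::duration::zero(); }

    double seconds() const
    {
        return std::chrono::duration<double>(elapsed_).count();
    }

    private:
    std::chrono::steady_clock::time_point start_;
    std::chrono::steady_clock::duration elapsed_ =
        std::chrono::steady_clock::duration::zero();
};
```

Using \cppinline{steady_clock} rather than \cppinline{system_clock} keeps the measurement monotonic and immune to wall-clock adjustments, which is the usual choice for benchmarking.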
diff --git a/user_guide/user_guide.bib b/user_guide/user_guide.bib new file mode 100644 index 000000000..5d41b460e --- /dev/null +++ b/user_guide/user_guide.bib @@ -0,0 +1,2923 @@ +%% Created using Papers on Tue, 15 Mar 2016. +%% http://papersapp.com/papers/ + +@article{stpf, + author = {{Beskos}, A. and {Crisan}, D. and {Jasra}, A. and {Kamatani}, K. + and {Zhou}, Y.}, + title = "{A Stable Particle Filter in High-Dimensions}", + journal = {ArXiv e-prints}, + archivePrefix = "arXiv", + eprint = {1412.3501}, + year = 2014, + month = dec, +} + +@article{Jasra:2012hg, +author = {Jasra, Ajay and Singh, S S and Martin, J S and McCoy, E}, +title = {{Filtering via approximate Bayesian computation}}, +journal = {Statistics and Computing}, +year = {2012} +} + +@book{Robert:2004tn, +author = {Robert, Christian P. and Casella, George}, +title = {{\emph{Monte Carlo Statistical Methods}}}, +publisher = {Springer}, +year = {2004} +} + +@article{BarndorffNielsen:2001gi, +author = {Barndorff Nielsen, Ole E and Shephard, Neil}, +title = {{Non-Gaussian Ornstein{\textendash}Uhlenbeck-based models and some of their uses in financial economics}}, +journal = {Journal of the Royal Statistical Society: Series B (Statistical Methodology)}, +year = {2001}, +volume = {63}, +number = {2}, +pages = {167--241}, +month = jan +} + +@book{Lehmann:2005vy, +author = {Lehmann, E. L. 
and Romano, Joseph P.}, +title = {{Testing Statistical Hypotheses}}, +publisher = {Springer-Verlag}, +year = {2005}, +address = {New York} +} + +@article{Chib:1995em, +author = {Chib, Siddhartha}, +title = {{Marginal likelihood from the Gibbs output}}, +journal = {Journal of the American Statistical Association}, +year = {1995}, +volume = {90}, +number = {432}, +pages = {1313--1321} +} + +@book{Claeskens:2008tq, +author = {Claeskens, Gerda and Hjort, Nils Lid}, +title = {{Model Selection and Model Averaging}}, +publisher = {Cambridge University Press}, +year = {2008}, +series = {Cambridge Series in Statistical and Probablistic Mathematics}, +address = {Cambridge, UK} +} + +@article{Richard:2007gz, +author = {Richard, Jean-Francois and Zhang, Wei}, +title = {{Efficient high-dimensional importance sampling}}, +journal = {Journal of Econometrics}, +year = {2007}, +volume = {141}, +number = {2}, +pages = {1385--1411} +} + +@inproceedings{Hol:2006ff, +author = {Hol, Jeroen D. and Schon, Thomas B. and Gustafsson, Fredrik}, +title = {{On resampling algorithms for particle filters}}, +booktitle = {Proceedings of IEEE Nonlinear Statistical Signal Processing Workshop}, +year = {2006}, +pages = {79--82}, +publisher = {IEEE} +} + +@article{Jones:1993cn, +author = {Jones, D. R. and Perttunen, C. D. and Stuckman, B. 
E.}, +title = {{Lipschitzian optimization without the Lipschitz constant}}, +journal = {Journal of Optimization Theory and Application}, +year = {1993}, +volume = {79}, +number = {1}, +pages = {157--181} +} + +@article{Douc:2008ux, +author = {Douc, Randal and Moulines, Eric}, +title = {{Limit theorems for weighted samples with applications to sequential Monte Carlo methods}}, +journal = {The Annals of Statistics}, +year = {2008}, +volume = {36}, +number = {5}, +pages = {2344--2376}, +month = oct +} + +@article{Snyder:2008by, +author = {Snyder, C and Bengtsson, T and Bickel, P}, +title = {{Obstacles to high-dimensional particle filtering}}, +journal = {Monthly Weather {\ldots}}, +year = {2008} +} + +@article{Gunzburger:2010ff, +title = {{A nonlocal vector calculus with application to nonlocal boundary value problems}}, +year = {2010}, +volume = {8}, +number = {5}, +pages = {1581--1598} +} + +@article{Shevchenko:2014uw, +author = {Shevchenko, Pavel V and Del Moral, Pierre}, +title = {{Valuation of barrier options using sequential Monte Carlo}}, +year = {2014}, +eprint = {1405.5294}, +eprinttype = {arxiv}, +month = may +} + +@article{Banterle:2014wk, +author = {Banterle, M and Grazian, C and Robert, Christian P.}, +title = {{Accelerating Metropolis-Hastings algorithms: Delayed acceptance with prefetching}}, +journal = {ArXiv}, +year = {2014}, +eprint = {4540E792-A05E-4F96-9AFF-9672771F7934}, +eprinttype = {scholar} +} + +@article{BarndorffNielsen:1983ia, +author = {Barndorff-Nielsen, O.}, +title = {{On a formula for the distribution of the maximum likelihood estimator}}, +journal = {Biometrika}, +year = {1983}, +volume = {70}, +number = {2}, +pages = {343--365} +} + +@article{Goldberg:1991wo, +author = {Goldberg, David}, +title = {{What every computer scientist should know about floating-point arithmetic}}, +journal = {ACM Computing Surveys}, +year = {1991}, +volume = {23}, +number = {1}, +pages = {5--48} +} + +@article{Lecuyer:2002uy, +author = {L'ecuyer, Pierre and 
Simard, Richard and Chen, E. Jack and Kelton, W. David}, +title = {{An object-oriented random-number package with many long streams and substreams}}, +journal = {Operations Research}, +year = {2002}, +volume = {50}, +number = {6}, +pages = {1073--1075} +} + +@article{Craiu:2014vk, +author = {Craiu, Radu V and Gray, Lawrence and Latuszynski, Krzysztof and Madras, Neal and Roberts, Gareth O and Rosenthal, Jeffrey S}, +title = {{Stability of Adversarial Markov Chains, with an Application to Adaptive MCMC Algorithms}}, +journal = {ArXiv}, +year = {2014}, +eprint = {1403.3950v3}, +eprinttype = {arxiv}, +eprintclass = {math.PR}, +month = mar +} + +@techreport{Zhou:2011uo, +author = {Zhou, Yan and Aston, John A. D. and Johansen, Adam M.}, +title = {{Bayesian model comparison for compartmental models with applications in positron emission tomography}}, +institution = {CRiSM, University of Warwick}, +year = {2011} +} + +@article{Parzen:1962tf, +author = {Parzen, Emanuel}, +title = {{On estimation of a probability density function and mode}}, +journal = {The Annals of Mathematical Statistics}, +year = {1962}, +volume = {33}, +number = {3}, +pages = {1065--1076} +} + +@article{Wood:1993ku, +author = {Wood, Andrew T. A. and Booth, James G. and Butler, Ronald W.}, +title = {{Saddlepoint approximations to the CDF of some statistics with nonnormal limit distributions}}, +journal = {Journal of the American Statistical Association}, +year = {1993}, +volume = {88}, +number = {422}, +pages = {680} +} + +@article{Papaspiliopoulos:2008dk, +author = {Papaspiliopoulos, O and Roberts, G. O.}, +title = {{Retrospective Markov chain Monte Carlo methods for Dirichlet process hierarchical models}}, +journal = {Biometrika}, +year = {2008}, +volume = {95}, +number = {1}, +pages = {169--186}, +month = jan +} + +@article{Roberts:2002vb, +author = {Roberts, Gareth O. 
and Rosenthal, Jeffrey S.}, +title = {{The polar slice sampler}}, +journal = {Stochastic Models}, +year = {2002}, +volume = {18}, +number = {2}, +pages = {257--280} +} + +@article{Eddy:1980wm, +author = {Eddy, William F.}, +title = {{Optimum kernel estimators of the mode}}, +journal = {The Annals of Statistics}, +year = {1980}, +volume = {8}, +number = {4}, +pages = {870--882} +} + +@article{DelMoral:2005hw, +author = {Del Moral, Pierre and Garnier, Josselin}, +title = {{Genealogical particle analysis of rare events}}, +journal = {The Annals of Applied Probability}, +year = {2005}, +volume = {15}, +number = {4}, +pages = {2496--2534} +} + +@inproceedings{Bolic:2003hl, +author = {Boli{\'c}, Miodrag and Djuri{\'c}, Petar M. and Hong, Sangjin}, +title = {{New resampling algorithms for particle filters}}, +booktitle = {Proceedings of the IEEE International Conference on Acoustics, Speech, and Signal Processing}, +year = {2003}, +pages = {589--592} +} + +@article{Jasra:2010eh, +author = {Jasra, Ajay and Stephens, David A. and Doucet, Arnaud and Tsagaris, Theodoros}, +title = {{Inference for L{\'e}vy-Driven Stochastic Volatility Models via Adaptive Sequential Monte Carlo}}, +journal = {Scandinavian Journal of Statistics}, +year = {2010}, +volume = {38}, +number = {1}, +pages = {1--22}, +month = dec +} + +@article{Fan:2008tf, +author = {Fan, Y. and Leslie, D. S. and Wand, M. 
P.}, +title = {{Generalised linear mixed model analysis via sequential Monte Carlo sampling}}, +journal = {Electronic Journal of Statistics}, +year = {2008}, +volume = {2}, +pages = {916--938} +} + +@article{Muller:1984vr, +author = {Muller, Hans-Georg}, +title = {{Smooth optimum kernel estimators of densities, regression curves and modes}}, +journal = {The Annals of Statistics}, +year = {1984}, +volume = {12}, +number = {2}, +pages = {766--774} +} + +@book{Larson:2013ds, +author = {Larson, Mats G and Bengzon, Fredrik}, +title = {{The Finite Element Method: Theory, Implementation, and Applications}}, +publisher = {Springer Science {\&} Business Media}, +year = {2013}, +volume = {10}, +series = {Texts in Computational Science and Engineering}, +address = {Berlin, Heidelberg}, +month = jan +} + +@article{2013InvPr..29h5010H, +author = {Hoang, Viet Ha and Schwab, Christoph and Stuart, Andrew M}, +title = {{Complexity analysis of accelerated MCMC methods for Bayesian inversion}}, +journal = {Inverse Problems}, +year = {2013}, +volume = {29}, +number = {8}, +pages = {085010}, +month = aug +} + +@incollection{DelMoral:2006wv, +author = {Del Moral, Pierre and Doucet, Arnaud and Jasra, Ajay}, +title = {{Sequential Monte Carlo methods for Bayesian computation}}, +booktitle = {Bayesian Statistics 8}, +year = {2006}, +pages = {--}, +publisher = {Oxford University Press} +} + +@article{Folks:1978tb, +author = {Folks, J. L. and Chhikara, R. 
S.}, +title = {{The inverse Gaussian distribution and its statistical application{\textendash}a review}}, +journal = {Journal of Royal Statistical Society B}, +year = {1978}, +volume = {40}, +number = {3}, +pages = {263--289} +} + +@article{Anonymous:N3t5mJ33, +author = {Marjoram, Paul and Molitor, John and Plagnol, Vincent and Tavar{\'e}, Simon}, +title = {{Markov chain Monte Carlo without likelihoods}}, +journal = {Proceedings of the National Academy of Sciences of the United States of America}, +year = {2003}, +volume = {100}, +number = {26}, +pages = {15324--15328} +} + +@article{Bartlett:1963ty, +author = {Bartlett, M. S.}, +title = {{Statistical estimation of density functions}}, +journal = {Sankhy{\=a}: The Indian Journal of Statistics, Series A}, +year = {1963}, +volume = {25}, +number = {3}, +pages = {245--254} +} + +@article{Lammertsma:1996ik, +author = {Lammertsma, Adriaan A. and Hume, Susan P.}, +title = {{Simplified reference tissue model for PET receptor studies.}}, +journal = {NeuroImage}, +year = {1996}, +volume = {4}, +number = {3 Pt 1}, +pages = {153--158} +} + +@article{Vyshemirsky:2008ch, +author = {Vyshemirsky, Vladislav and Girolami, Mark A.}, +title = {{Bayesian ranking of biochemical system models}}, +journal = {Bioinformatics}, +year = {2008}, +volume = {24}, +number = {6}, +pages = {833--839} +} + +@article{Sisson:2007il, +author = {Sisson, S. A. and Fan, Y. 
and Tanaka, Mark M.}, +title = {{Sequential Monte Carlo without likelihoods}}, +journal = {Proceedings of the National Academy of Sciences of the United States of America}, +year = {2007}, +volume = {104}, +number = {6}, +pages = {1760--1765} +} + +@article{Chopin:2007fn, +author = {Chopin, Nicolas}, +title = {{Inference and model choice for sequentially ordered hidden Markov models}}, +journal = {Journal of the Royal Statistical Society: Series B (Statistical Methodology)}, +year = {2007}, +volume = {69}, +number = {2}, +pages = {269--284}, +month = apr +} + +@article{Chen:2005dg, +author = {Chen, Yuguo and Diaconis, Persi and Holmes, Susan P. and Liu, Jun S.}, +title = {{Sequential Monte Carlo methods for statistical analysis of tables}}, +journal = {Journal of the American Statistical Association}, +year = {2005}, +volume = {100}, +number = {469}, +pages = {109--120} +} + +@article{Kass:1993vy, +author = {Kass, Robert E.}, +title = {{Bayes factors in practice}}, +journal = {The Statistician}, +year = {1993}, +volume = {42}, +number = {5}, +pages = {551--560} +} + +@article{Gustafsson:2010ky, +author = {Gustafsson, Fredrik}, +title = {{Particle filter theory and practice with positioning applications}}, +journal = {IEEE Aerospace and Electronic Systems Magazine}, +year = {2010}, +volume = {25}, +number = {7}, +pages = {53--82} +} + +@article{Cappe:2003ek, +author = {Capp{\'e}, Olivier and Robert, Christian P. and Ryden, Tobias}, +title = {{Reversible jump, birth-and-death and more general continuous time Markov chain Monte Carlo samplers}}, +journal = {Journal of Royal Statistical Society B}, +year = {2003}, +volume = {65}, +number = {3}, +pages = {679--700} +} + +@article{Kinderman:1977tn, +author = {Kinderman, A. J. and Monahan, J. 
F.}, +title = {{Computer generation of random variables using the ratio of uniform deviates}}, +journal = {ACM Transactions on Mathematical Software}, +year = {1977}, +volume = {3}, +number = {3}, +pages = {257--260} +} + +@article{Cox:2004te, +author = {Cox, D. R. and Reid, N.}, +title = {{A note on pseudolikelihood constructed from marginal densities}}, +journal = {Biometrika}, +year = {2004}, +volume = {91}, +number = {3}, +pages = {729--737} +} + +@article{Giles:2008gc, +author = {Giles, M B}, +title = {{Multilevel Monte Carlo path simulation}}, +journal = {Operations Research}, +year = {2008}, +volume = {56}, +number = {3}, +pages = {607--617} +} + +@mastersthesis{Peters:2005wh, +author = {Peters, Gareth W}, +title = {{Topics in sequential Monte Carlo samplers}}, +year = {2005} +} + +@article{Chan:2013gv, +author = {Chan, Hock Peng and Lai, Tze Leung}, +title = {{A general theory of particle filters in hidden Markov models and some applications}}, +journal = {The Annals of Statistics}, +year = {2013}, +volume = {41}, +number = {6}, +pages = {2877--2904}, +month = dec +} + +@article{Fearnhead:2010ua, +author = {Fearnhead, Paul and Taylor, Benjamin M.}, +title = {{An adaptive sequential Monte Carlo sampler}}, +journal = {ArXiv}, +year = {2010}, +pages = {--} +} + +@article{DelMoral:2012jq, +author = {Del Moral, Pierre and Doucet, Arnaud and Jasra, Ajay}, +title = {{On adaptive resampling strategies for sequential Monte Carlo methods}}, +journal = {Bernoulli}, +year = {2012}, +volume = {18}, +number = {1}, +pages = {252--278}, +month = feb +} + +@article{Hong:2004ja, +author = {Hong, Sangjin and Boli{\'c}, Miodrag and Djuri{\'c}, Petar M.}, +title = {{An efficient fixed-point implementation of residual resampling scheme for high-speed particle filters}}, +journal = {IEEE Signal Processing Letters}, +year = {2004}, +volume = {11}, +number = {5}, +pages = {482--485} +} + +@article{Hills:1993vb, +author = {Hills, Susan E and Smith, Adrian FM}, +title = {{Diagnostic 
plots for improved parameterization in Bayesian inference}}, +journal = {Biometrika}, +year = {1993}, +volume = {80}, +number = {1}, +pages = {61--74} +} + +@article{Jeffreys:1946jf, +author = {Jeffreys, Harold}, +title = {{An Invariant Form for the Prior Probability in Estimation Problems}}, +journal = {Proceedings of the Royal Society A: Mathematical, Physical and Engineering Sciences}, +year = {1946}, +volume = {186}, +number = {1007}, +pages = {453--461} +} + +@techreport{Spiegelhalter:1998uc, +author = {Spiegelhalter, David J. and Best, Nicola G. and Carlin, Bradley P.}, +title = {{Bayesian deviance, the effective number of parameters, and the comparison of arbitrarily complex models}}, +year = {1998} +} + +@article{Bernardo:1979uq, +author = {Bernardo, Jos{\'e} M.}, +title = {{Reference posterior distributions for Bayesian inference}}, +journal = {Journal of Royal Statistical Society B}, +year = {1979}, +volume = {41}, +number = {2}, +pages = {113--147} +} + +@article{Alpert:2009by, +author = {Alpert, Nathaniel M. and Yuan, Fang}, +title = {{A general method of Bayesian estimation for parametric imaging of the brain.}}, +journal = {NeuroImage}, +year = {2009}, +volume = {45}, +number = {4}, +pages = {1183--1189} +} + +@article{Epanechnikov:1969vm, +author = {Epanechnikov, V. A.}, +title = {{Non-parametric estimation of a multivariate probability density}}, +journal = {Theory of Probability and its Applications}, +year = {1969}, +volume = {14}, +pages = {153} +} + +@article{Gustavson:1996fx, +author = {Gustavson, Fred G and Gupta, Anshul}, +title = {{A new parallel algorithm for tridiagonal symmetric positive systems of equations}}, +journal = {PARA}, +year = {1996}, +volume = {1184}, +number = {Chapter 36}, +pages = {341--349} +} + +@book{Courant:2012vy, +author = {Courant, R and John, F}, +title = {{Introduction to Calculus and Analysis I}}, +year = {2012} +} + +@article{Olkin:1981vi, +author = {Olkin, Ingram and Petkau, A. 
John and Zidek, James V.}, +title = {{A comparison of n estimators for the Binomial distribution}}, +journal = {Journal of American Statistical Association}, +year = {1981}, +volume = {76}, +number = {375}, +pages = {637--642} +} + +@article{Jasra:2005kl, +author = {Jasra, A. and Holmes, C. C. and Stephens, D. A.}, +title = {{Markov Chain Monte Carlo Methods and the Label Switching Problem in Bayesian Mixture Modeling}}, +journal = {Statistical Science}, +year = {2005}, +volume = {20}, +number = {1}, +pages = {50--67} +} + +@book{Bernardo:1994vd, +author = {Bernardo, Jos{\'e} M. and Smith, Adrian F. M.}, +title = {{Bayesian Theory}}, +publisher = {John Wiley {\&} Sons, Inc.}, +year = {1994}, +series = {Wiley Series in Probability and Statistics} +} + +@article{Fearnhead:2010ut, +author = {Fearnhead, Paul and Papaspiliopoulos, Omiros and Roberts, Gareth O. and Stuart, Andrew}, +title = {{Random-weight particle filtering of continuous time processes}}, +journal = {Journal of the Royal Statistical Society: Series B (Statistical Methodology)}, +year = {2010}, +volume = {72}, +number = {4}, +pages = {497--512} +} + +@incollection{Shibata:1989tm, +author = {Shibata, R.}, +title = {{Statistical aspects of model selection}}, +booktitle = {From data to model}, +year = {1989}, +editor = {Willems, Jan Camiel}, +pages = {215--240}, +publisher = {Springer-Verlag} +} + +@article{Mengersen:1996th, +author = {Mengersen, K. L. and Tweedie, R. 
L.}, +title = {{Rates of convergence of the Hastings and Metropolis algorithms}}, +journal = {The Annals of Statistics}, +year = {1996}, +volume = {24}, +number = {1}, +pages = {101--121} +} + +@article{Chopin:2002hg, +author = {Chopin, Nicolas}, +title = {{A sequential particle filter method for static models}}, +journal = {Biometrika}, +year = {2002}, +volume = {89}, +number = {3}, +pages = {539--552} +} + +@phdthesis{Fearnhead:1998wj, +author = {Fearnhead, Paul}, +title = {{Sequential Monte Carlo methods in filter theory}}, +year = {1998} +} + +@book{DeFinetti:1975ua, +author = {De Finetti, Bruno}, +title = {{Theory of probability: A critical introductory treatment}}, +publisher = {John Wiley {\&} Sons}, +year = {1975}, +volume = {2}, +month = jul +} + +@article{Johansen:2008kp, +author = {Johansen, Adam M. and Doucet, Arnaud and Davy, Manuel}, +title = {{Particle methods for maximum likelihood estimation in latent variable models}}, +journal = {Statistics and Computing}, +year = {2008}, +volume = {18}, +number = {1}, +pages = {47--57} +} + +@article{Gelfand:1994ux, +author = {Gelfand, Alan E. and Dey, Dipak K.}, +title = {{Bayesian model choice: Asymptotics and exact calculations}}, +journal = {Journal of Royal Statistical Society B}, +year = {1994}, +volume = {56}, +number = {3}, +pages = {501--514} +} + +@article{Nadaraya:1964uu, +author = {Nadaraya, E. 
A.}, +title = {{On estimating regression}}, +journal = {Theory of Probability and its Applications}, +year = {1964}, +volume = {9}, +pages = {141--142} +} + +@article{Kac:1949dq, +author = {Kac, M.}, +title = {{On distributions of certain Wiener functionals}}, +journal = {Transactions of the American Mathematical Society}, +year = {1949}, +volume = {65}, +number = {1}, +pages = {1--13} +} + +@article{Poyiadjis:2011dc, +author = {Poyiadjis, G and Doucet, Arnaud and Singh, S S}, +title = {{Particle approximations of the score and observed information matrix in state space models with application to parameter estimation}}, +journal = {Biometrika}, +year = {2011} +} + +@article{Chen:1995jn, +author = {Chen, Kewei and Reiman, Eric and Lawson, Michael and Feng, Dagan and Huang, Sung-Cheng}, +title = {{Decay correction methods in dynamic PET studies}}, +journal = {IEEE Transactions on Nuclear Science}, +year = {1995}, +volume = {42}, +number = {6}, +pages = {2173--2179} +} + +@article{Akaike:1980gh, +author = {Akaike, Hirotugu}, +title = {{Likelihood and the Bayes procedure}}, +journal = {Bayesian Statistics}, +year = {1980}, +volume = {31}, +number = {1}, +pages = {143--166} +} + +@article{Givens:1996ul, +author = {Givens, Geof H. and Raftery, Adrian E.}, +title = {{Local adaptive importance sampling for multivariate densities with strong nonlinear relationships}}, +journal = {Journal of American Statistical Association}, +year = {1996}, +volume = {91}, +number = {433}, +pages = {132--141} +} + +@article{Gerber:2014vu, +author = {Gerber, M and Chopin, N}, +title = {{Sequential Quasi-Monte Carlo}}, +journal = {ArXiv}, +year = {2014} +} + +@article{Hamilton:1995ty, +author = {Hamilton, David C. 
and Lesperance, Mary L.}, +title = {{A comparison of methods for univariate and multivariate acceptance sampling by variables}}, +journal = {Technometrics}, +year = {1995}, +volume = {37}, +number = {3}, +pages = {329--339} +} + +@article{Murray:2012wo, +author = {Murray, Iain and Ghahramani, Zoubin and MacKay, David}, +title = {{MCMC for doubly-intractable distributions}}, +journal = {ArXiv}, +year = {2012}, +pages = {--} +} + +@book{DelMoral:2004ux, +author = {Del Moral, Pierre}, +title = {{Feynman-Kac Formulae: Genealogical and Interacting Particle Systems with Applications}}, +publisher = {Springer-Verlag}, +year = {2004}, +address = {New York} +} + +@article{Doucet:2006ji, +author = {Doucet, Arnaud and Briers, Mark and S{\'e}n{\'e}cal, St{\'e}phane}, +title = {{Efficient block sampling strategies for sequential Monte Carlo methods}}, +journal = {Journal of Computational and Graphical Statistics}, +year = {2006}, +volume = {15}, +number = {3}, +pages = {693--711} +} + +@article{Golightly:2008cx, +author = {Golightly, A. and Wilkinson, D. 
J.}, +title = {{Bayesian inference for nonlinear multivariate diffusion models observed with error}}, +journal = {Computational Statistics {\&} Data Analysis}, +year = {2008}, +volume = {52}, +number = {3}, +pages = {1674--1693} +} + +@article{Meng:1996wj, +author = {Meng, Xiao-Li and Wong, Wing Hung}, +title = {{Simulating ratios of normalizing constants via a simple identity: A theoretical exploration}}, +journal = {Statistica Sinica}, +year = {1996}, +volume = {6}, +pages = {831--860} +} + +@article{Besag:1974vs, +author = {Besag, Julian}, +title = {{Spatial Interaction and the Statistical Analysis of Lattice Systems}}, +journal = {Journal of Royal Statistical Society B}, +year = {1974}, +volume = {36}, +number = {2}, +pages = {192--236} +} + +@article{Geweke:1989tm, +author = {Geweke, John}, +title = {{Bayesian inference in econometric models using Monte Carlo integration}}, +journal = {Econometrica: Journal of the Econometric Society}, +year = {1989}, +volume = {57}, +number = {6}, +pages = {1317--1339} +} + +@article{Berger:1992kf, +author = {Berger, James O. and Bernardo, Jos{\'e} M.}, +title = {{Ordered group reference priors with application to the multinomial problem}}, +journal = {Biometrika}, +year = {1992}, +volume = {79}, +number = {1}, +pages = {25--37} +} + +@article{Marron:1989uz, +author = {Marron, J. S. and Nolan, D.}, +title = {{Canonical kernels for density estimation}}, +journal = {Statistics {\&} Probability Letters}, +year = {1989}, +volume = {7}, +number = {3}, +pages = {195--199} +} + +@article{Metropolis:1953ex, +author = {Metropolis, Nicholas and Rosenbluth, Arianna W. and Rosenbluth, Marshall N. and Teller, Augusta H. 
and Teller, Edward}, +title = {{Equation of state calculations by fast computing machines}}, +journal = {The Journal of Chemical Physics}, +year = {1953}, +volume = {21}, +number = {6}, +pages = {1087--1092} +} + +@article{Fearnhead:2012vc, +author = {Fearnhead, Paul and Prangle, Dennis}, +title = {{Constructing summary statistics for approximate Bayesian computation: semi-automatic approximate Bayesian computation}}, +journal = {Journal of Royal Statistical Society B}, +year = {2012}, +volume = {74}, +number = {3}, +pages = {1--56}, +month = apr +} + +@article{Chen:2000ge, +author = {Chen, Song Xi}, +title = {{Probability density function estimation using Gamma kernels}}, +journal = {Annals of the Institute of Statistical Mathematics}, +year = {2000}, +volume = {52}, +number = {3}, +pages = {471--480}, +month = sep +} + +@article{Haario:1999dh, +author = {Haario, Heikki and Saksman, Eero and Tamminen, Johanna}, +title = {{Adaptive proposal distribution for random walk Metropolis algorithm}}, +journal = {Computational Statistics}, +year = {1999}, +volume = {14}, +number = {3}, +pages = {375} +} + +@article{Creal:2008ji, +author = {Creal, D D}, +title = {{Analysis of filtering and smoothing algorithms for L{\'e}vy-driven stochastic volatility models}}, +journal = {Computational Statistics {\&} Data Analysis}, +year = {2008} +} + +@article{Chan:2014wb, +author = {Chan, Hock Peng and Heng, Chiang Wee and Jasra, Ajay}, +title = {{Theory of Parallel Particle Filters for Hidden Markov Models}}, +journal = {ArXiv}, +year = {2014}, +eprint = {1409.4160v1}, +eprinttype = {arxiv}, +eprintclass = {math.ST}, +month = sep +} + +@inproceedings{Yu:2010km, +author = {Yu, Jinxia and Liu, Wenjing and Tang, Yongli}, +title = {{Improved particle filter algorithms based on partial systematic resampling}}, +booktitle = {Proceedings of IEEE International Conference on Intelligent Computing and Intelligent Systems}, +year = {2010}, +pages = {483--487} +} + +@article{Cheng:1978jl, +author 
= {Cheng, R. C. H.}, +title = {{Generating Beta variates with nonintegral shape parameters}}, +journal = {Communications of the ACM}, +year = {1978}, +volume = {21}, +number = {4}, +pages = {317--322} +} + +@article{Robert:1994vh, +author = {Robert, Christian P.}, +title = {{Discussion: Markov chains for exploring posterior distributions}}, +journal = {The Annals of Statistics}, +year = {1994}, +volume = {22}, +number = {4}, +pages = {1742--1747} +} + +@article{Pitt:2010th, +author = {Pitt, Michael and Silva, Ralph and Giordani, Paolo and Kohn, Robert}, +title = {{Auxiliary Particle filtering within adaptive Metropolis-Hastings Sampling}}, +journal = {ArXiv}, +year = {2010}, +eprint = {1006.1914v1}, +eprinttype = {arxiv}, +eprintclass = {stat.ME}, +pages = {--}, +month = jun +} + +@article{Chopin:2013gn, +author = {Chopin, N and Jacob, P E and Papaspiliopoulos, O}, +title = {{SMC2: an efficient algorithm for sequential analysis of state space models}}, +journal = {Journal of the Royal Statistical Society: Series B (Statistical Methodology)}, +year = {2013}, +volume = {75}, +number = {3}, +pages = {397--426}, +month = jun +} + +@article{Salmon:2011um, +author = {Salmon, John K. and Moraes, Mark A. and Dror, Ron O. and Shaw, David E.}, +title = {{Parallel random numbers: As easy as 1, 2, 3}}, +journal = {Proceedings of 2011 International Conference for High Performance Computing, Networking, Storage and Analysis}, +year = {2011}, +pages = {1--12} +} + +@article{Roberts:1997dg, +author = {Roberts, G. O. and Gelman, A. and Gilks, W. 
R.}, +title = {{Weak convergence and optimal scaling of random walk Metropolis algorithms}}, +journal = {The Annals of Applied Probability}, +year = {1997}, +volume = {7}, +number = {1}, +pages = {110--120} +} + +@article{Cappe:2008ht, +author = {Capp{\'e}, Olivier and Douc, Randal and Guillin, Arnaud and Marin, Jean-Michel and Robert, Christian P.}, +title = {{Adaptive importance sampling in general mixture classes}}, +journal = {Statistics and Computing}, +year = {2008}, +volume = {18}, +number = {4}, +pages = {447--459} +} + +@article{Cunningham:1991fs, +author = {Cunningham, Vincent J. and Hume, Susan P. and Price, Gary R. and Ahier, Randall G. and Cremer, Jill E. and Jones, Anthony K. P.}, +title = {{Compartmental analysis of diprenorphine binding to opiate receptors in the rat in vivo and its comparison with equilibrium data in vitro}}, +journal = {Journal of Cerebral Blood Flow {\&} Metabolism}, +year = {1991}, +volume = {11}, +number = {1}, +pages = {1--9} +} + +@incollection{Doucet:2011us, +author = {Doucet, Arnaud and Johansen, Adam M.}, +title = {{A tutorial on particle filtering and smoothing: Fifteen years later}}, +booktitle = {The Oxford Handbook of Non-linear Filtering}, +year = {2011}, +pages = {--}, +publisher = {Oxford University Press} +} + +@article{Raftery:1996ct, +author = {Raftery, Adrian E.}, +title = {{Approximate Bayes factors and accounting for model uncertainty in generalised linear models}}, +journal = {Biometrika}, +year = {1996}, +volume = {83}, +number = {2}, +pages = {251--266} +} + +@article{Sugiura:1978be, +author = {Sugiura, Nariaki}, +title = {{Further analysts of the data by Akaike's information criterion and the finite corrections}}, +journal = {Communications in Statistics - Theory and Methods}, +year = {1978}, +volume = {7}, +number = {1}, +pages = {13--26}, +month = jan +} + +@article{Grelaud:2009gc, +author = {Grelaud, Aude and Robert, Christian P. 
and Marin, Jean-Michel and Rodolphe, Fran{\c c}ois and Taly, Jean-Fran{\c c}ois}, +title = {{ABC likelihood-free methods for model choice in Gibbs random fields}}, +journal = {Bayesian Analysis}, +year = {2009}, +volume = {4}, +number = {2}, +pages = {317--336} +} + +@article{Robert:1995ge, +author = {Robert, Christian P.}, +title = {{Convergence control methods for Markov chain Monte Carlo algorithms}}, +journal = {Statistical Science}, +year = {1995}, +volume = {10}, +number = {3}, +pages = {231--253} +} + +@article{Blumenthal:1981th, +author = {Blumenthal, Saul and Dahiya, Ram C.}, +title = {{Estimating the Binomial parameter n}}, +journal = {Journal of American Statistical Association}, +year = {1981}, +volume = {76}, +number = {376}, +pages = {903--909} +} + +@article{Hume:1992fy, +author = {Hume, Susan P. and Myers, Ralph and Bloomfield, Peter M. and Opacka-Juffry, Jolanta and Cremer, Jill E. and Ahier, Randall G. and Luthra, Sajinda K. and Brooks, David J. and Lammertsma, Adriaan A.}, +title = {{Quantitation of Carbon-11-labeled raclopride in rat striatum using positron emission tomography.}}, +journal = {Synapse}, +year = {1992}, +volume = {12}, +number = {1}, +pages = {47--54} +} + +@article{Andrieu:2008kh, +author = {Andrieu, Christophe and Thoms, Johannes}, +title = {{A tutorial on adaptive MCMC}}, +journal = {Statistics and Computing}, +year = {2008}, +volume = {18}, +number = {4}, +pages = {343--373} +} + +@article{HajiAli:2014wa, +author = {Haji-Ali, Abdul-Lateef and Nobile, Fabio and Tempone, Raul}, +title = {{Multi-Index Monte Carlo: When Sparsity Meets Sampling}}, +journal = {ArXiv}, +year = {2014}, +eprint = {1405.3757v4}, +eprinttype = {arxiv}, +eprintclass = {math.NA}, +month = may +} + +@article{Atkinson:1979es, +author = {Atkinson, A C}, +title = {{A family of switching algorithms for the computer generation of beta random variables}}, +journal = {Biometrika}, +year = {1979}, +volume = {66}, +number = {1}, +pages = {141--145}, +month = apr +} 
+ +@techreport{Besag:2001um, +author = {Besag, Julian}, +title = {{Markov chain Monte Carlo for statistical inference}}, +year = {2001} +} + +@article{Lee:2014wv, +author = {Lee, Anthony and Whiteley, Nick}, +title = {{Forest resampling for distributed sequential Monte Carlo}}, +journal = {ArXiv}, +year = {2014}, +eprint = {1406.6010v1}, +eprinttype = {arxiv}, +eprintclass = {stat.CO}, +month = jun +} + +@article{Berger:2001uy, +author = {Berger, James O. and Pericchi, Luis R.}, +title = {{Objective Bayesian methods for model selection: introduction and comparison}}, +journal = {Model Selection}, +year = {2001}, +volume = {38}, +number = {2001}, +pages = {135--207} +} + +@article{Akaike:1978ti, +author = {Akaike, Hirotugu}, +title = {{A Bayesian analysis of the minimum AIC procedure}}, +journal = {Annals of the Institute of Statistical Mathematics}, +year = {1978}, +volume = {30}, +number = {1}, +pages = {9--14} +} + +@article{Jasra:2008bb, +author = {Jasra, Ajay and Doucet, Arnaud and Stephens, David A. and Holmes, Christopher C.}, +title = {{Interacting sequential Monte Carlo samplers for trans-dimensional simulation}}, +journal = {Computational Statistics {\&} Data Analysis}, +year = {2008}, +volume = {52}, +number = {4}, +pages = {1765--1791} +} + +@article{Martin:2014tw, +author = {Martin, Gael M and McCabe, Brendan P M and Maneesoonthorn, Worapree and Robert, Christian P}, +title = {{Approximate Bayesian Computation in State Space Models}}, +journal = {ArXiv}, +year = {2014}, +eprint = {1409.8363v1}, +eprinttype = {arxiv}, +eprintclass = {math.ST}, +month = sep +} + +@book{Kloeden:1618302, +author = {Kloeden, Peter E and Platen, Eckhard}, +title = {{Numerical Solution of Stochastic Differential Equations}}, +publisher = {Springer Science {\&} Business Media}, +year = {1992}, +month = jan +} + +@article{Jasra:2007id, +author = {Jasra, Ajay and Stephens, David A. 
and Holmes, Christopher C.}, +title = {{Population-based reversible jump Markov chain Monte Carlo}}, +journal = {Biometrika}, +year = {2007}, +volume = {94}, +number = {4}, +pages = {787--807} +} + +@article{DeanFang:2012gj, +author = {Dean Fang, Yu-Hua and El Fakhri, Georges and Becker, John A. and Alpert, Nathaniel M.}, +title = {{Parametric imaging with Bayesian priors: A validation study with 11C-Altropane PET}}, +journal = {NeuroImage}, +year = {2012}, +volume = {61}, +number = {1}, +pages = {131--138} +} + +@article{Liang:2001dc, +author = {Liang, Faming and Wong, Wing Hung}, +title = {{Real-parameter evolutionary Monte Carlo with applications to Bayesian mixture models}}, +journal = {Journal of the American Statistical Association}, +year = {2001}, +volume = {96}, +number = {454}, +pages = {653--666}, +month = jun +} + +@article{Geman:1993bp, +author = {Geman, Stuart and Geman, Donald}, +title = {{Stochastic relaxation, Gibbs distributions and the Bayesian restoration of images}}, +journal = {Journal of Applied Statistics}, +year = {1993}, +volume = {20}, +number = {5}, +pages = {25--62} +} + +@article{ChristopheAndrieu:2009ci, +author = {Andrieu, Christophe and Roberts, Gareth O.}, +title = {{The pseudo-marginal approach for efficient Monte Carlo computations}}, +journal = {The Annals of Statistics}, +year = {2009}, +volume = {37}, +number = {2}, +pages = {697--725}, +month = apr +} + +@article{2011CMAME.200.1237C, +author = {Chen, X and Gunzburger, Max}, +title = {{Continuous and discontinuous finite element methods for a peridynamics model of mechanics}}, +journal = {Computer Methods in Applied Mechanics and Engineering}, +year = {2011}, +volume = {200}, +number = {9-12}, +pages = {1237--1250}, +month = feb +} + +@article{Kitagawa:1996vj, +author = {Kitagawa, Genshiro}, +title = {{Monte Carlo filter and smoother for non-Gaussian nonlinear state space models}}, +journal = {Journal of Computational and Graphical Statistics}, +year = {1996}, +volume = {5}, 
+number = {1}, +pages = {1--25} +} + +@article{Evans:1995wz, +author = {Evans, Michael and Swartz, Tim}, +title = {{Methods for approximating integrals in statistics with special emphasis on Bayesian integration problems}}, +journal = {Statistical Science}, +year = {1995}, +volume = {10}, +number = {3}, +pages = {254--272} +} + +@article{Cowles:1996bv, +author = {Cowles, Mary Kathryn and Carlin, Bradley P.}, +title = {{Markov chain Monte Carlo convergence diagnostics: a comparative review}}, +journal = {Journal of the American Statistical Association}, +year = {1996}, +volume = {91}, +number = {434}, +pages = {883--904} +} + +@article{Roberts:1996vd, +author = {Roberts, Gareth O. and Tweedie, Richard L.}, +title = {{Exponential convergence of Langevin distributions and their discrete approximations}}, +journal = {Bernoulli}, +year = {1996}, +volume = {2}, +number = {4}, +pages = {341--363} +} + +@article{Jasra:2014ti, +author = {Jasra, Ajay}, +title = {{Approximate Bayesian Computation for a Class of Time Series Models}}, +journal = {ArXiv}, +year = {2014}, +eprint = {1401.0265v1}, +eprinttype = {arxiv}, +eprintclass = {stat.CO}, +pages = {--}, +month = jan +} + +@article{Birnbaum:1962tj, +author = {Birnbaum, Allan}, +title = {{On the foundations of statistical inference}}, +journal = {Journal of American Statistical Association}, +year = {1962}, +volume = {57}, +number = {298}, +pages = {269--306} +} + +@article{Heidelberger:1981ih, +author = {Heidelberger, Philip and Welch, Peter D.}, +title = {{A spectral method for confidence interval generation and run length control in simulations}}, +journal = {Communications of the ACM}, +year = {1981}, +volume = {24}, +number = {4}, +pages = {233--245} +} + +@techreport{Everitt:2011hb, +author = {Everitt, Richard G. 
and Briers, Mark and Copsey, K.}, +title = {{Likelihood-free reversible jump sequential Monte Carlo samplers}}, +year = {2011} +} + +@article{Carpenter:1999jb, +author = {Carpenter, James and Clifford, Peter and Fearnhead, Paul}, +title = {{Improved particle filter for nonlinear problems}}, +journal = {IEE Proceedings - Radar, Sonar and Navigation}, +year = {1999}, +volume = {146}, +number = {1}, +pages = {2--7} +} + +@article{Sawa:1978tn, +author = {Sawa, Takamitsu}, +title = {{Information criteria for discriminating among alternative regression models}}, +journal = {Econometrica}, +year = {1978}, +volume = {46}, +number = {6}, +pages = {1273--1291} +} + +@article{Cappe:2004fe, +author = {Capp{\'e}, Olivier and Guillin, A. and Marin, J. M. and Robert, C. P.}, +title = {{Population Monte Carlo}}, +journal = {Journal of Computational and Graphical Statistics}, +year = {2004}, +volume = {13}, +number = {4}, +pages = {907--929} +} + +@article{Spiegelhalter:2002wt, +author = {Spiegelhalter, David J. and Best, Nicola G. and Carlin, Bradley P. and Van Der Linde, Angelika}, +title = {{Bayesian measures of model complexity and fit}}, +journal = {Journal of the Royal Statistical Society: Series B (Statistical Methodology)}, +year = {2002}, +volume = {64}, +number = {4}, +pages = {583--639} +} + +@article{Yang:2005vj, +author = {Yang, Yuhong}, +title = {{Can the strengths of AIC and BIC be shared? A conflict between model identification and regression estimation}}, +journal = {Biometrika}, +year = {2005}, +volume = {92}, +number = {4}, +pages = {937--950} +} + +@article{Kass:1992tz, +author = {Kass, Robert E. and Vaidyanathan, Suresh K.}, +title = {{Approximate Bayes factors and orthogonal parameters, with application to testing equality of two binomial proportions}}, +journal = {Journal of Royal Statistical Society B}, +year = {1992}, +volume = {54}, +number = {1}, +pages = {129--144} +} + +@article{Kass:1995vb, +author = {Kass, Robert E.
and Raftery, Adrian E.}, +title = {{Bayes factors}}, +journal = {Journal of the American Statistical Association}, +year = {1995}, +volume = {90}, +number = {430}, +pages = {773--795} +} + +@article{Chib:2001gq, +author = {Chib, Siddhartha and Jeliazkov, Ivan}, +title = {{Marginal likelihood from the Metropolis-Hastings output}}, +journal = {Journal of the American Statistical Association}, +year = {2001}, +volume = {96}, +number = {453}, +pages = {270--281} +} + +@article{Whitley:1994kp, +author = {Whitley, Darrell}, +title = {{A genetic algorithm tutorial}}, +journal = {Statistics and Computing}, +year = {1994}, +volume = {4}, +number = {2}, +pages = {--} +} + +@article{Rebeschini:2013tq, +author = {Rebeschini, Patrick and van Handel, Ramon}, +title = {{Can local particle filters beat the curse of dimensionality?}}, +journal = {ArXiv}, +year = {2013}, +eprint = {1301.6585v1}, +eprinttype = {arxiv}, +eprintclass = {math.ST}, +month = jan +} + +@inproceedings{Zhou:2012uz, +author = {Zhou, Yan and Johansen, Adam M. and Aston, John A. D.}, +title = {{Bayesian model selection via path-sampling sequential Monte Carlo}}, +booktitle = {Proceedings of IEEE Statistical Signal Processing Workshop}, +year = {2012}, +pages = {--} +} + +@article{Bozdogan:1987wy, +author = {Bozdogan, Hamparsum}, +title = {{Model selection and Akaike's information criterion (AIC): The general theory and its analytical extensions}}, +journal = {Psychometrika}, +year = {1987}, +volume = {52}, +number = {3}, +pages = {345--370} +} + +@article{Newton:1994wm, +author = {Newton, Michael A. and Raftery, Adrian E.}, +title = {{Approximate Bayesian inference with the weighted likelihood bootstrap}}, +journal = {Journal of Royal Statistical Society B}, +year = {1994}, +volume = {56}, +number = {1}, +pages = {3--48} +} + +@article{Brooks:2003ko, +author = {Brooks, S. P. and Giudici, P. 
and Roberts, Gareth O.}, +title = {{Efficient construction of reversible jump Markov chain Monte Carlo proposal distributions}}, +journal = {Journal of the Royal Statistical Society: Series B (Statistical Methodology)}, +year = {2003}, +volume = {65}, +number = {1}, +pages = {3--39} +} + +@article{West:1993us, +author = {West, Mike}, +title = {{Approximating posterior distributions by mixtures}}, +journal = {Journal of Royal Statistical Society B}, +year = {1993}, +volume = {55}, +number = {2}, +pages = {409--422} +} + +@article{Bickel:2008bx, +author = {Bickel, Peter and Li, Bo and Bengtsson, Thomas}, +title = {{Sharp failure rates for the bootstrap particle filter in high dimensions}}, +journal = {ArXiv}, +year = {2008}, +eprint = {0805.3287v1}, +eprinttype = {arxiv}, +eprintclass = {math.ST}, +month = may +} + +@article{Marsaglia:2000vq, +author = {Marsaglia, George and Tsang, Wai Wan}, +title = {{A simple method for generating gamma variables}}, +journal = {ACM Transactions on Mathematical Software}, +year = {2000}, +volume = {26}, +number = {3}, +pages = {363--372} +} + +@article{Buck:1996td, +author = {Buck, Alfred and Westera, Gerrit and VonSchulthess, Gustav K. and Burger, Cyrill}, +title = {{Modeling alternatives for cerebral Carbon-11-Iomazenil kinetics}}, +journal = {Journal of Nuclear Medicine}, +year = {1996}, +volume = {37}, +number = {4}, +pages = {699--705} +} + +@article{Arulampalam:2002hg, +author = {Arulampalam, M. Sanjeev and Maskell, Simon and Gordon, Neil and Clapp, Tim}, +title = {{A tutorial on particle filters for online nonlinear/non-Gaussian Bayesian tracking}}, +journal = {IEEE Transactions on Signal Processing}, +year = {2002}, +volume = {50}, +number = {2}, +pages = {174--188} +} + +@article{Hurvich:1989ev, +author = {Hurvich, Clifford M.
and Tsai, Chih-Ling}, +title = {{Regression and time series model selection in small samples}}, +journal = {Biometrika}, +year = {1989}, +volume = {76}, +number = {2}, +pages = {297--307} +} + +@article{Schwarz:1978uv, +author = {Schwarz, Gideon}, +title = {{Estimating the dimension of a model}}, +journal = {The Annals of Statistics}, +year = {1978}, +volume = {6}, +number = {2}, +pages = {461--464} +} + +@article{Bolic:2004eu, +author = {Boli{\'c}, Miodrag and Djuri{\'c}, Petar M. and Hong, Sangjin}, +title = {{Resampling algorithms for particle filters: A computational complexity perspective}}, +journal = {EURASIP Journal on Advances in Signal Processing}, +year = {2004}, +volume = {2004}, +number = {15}, +pages = {2267--2277} +} + +@article{Hastie:1987dj, +author = {Hastie, Trevor}, +title = {{A closer look at the deviance}}, +journal = {The American Statistician}, +year = {1987}, +volume = {41}, +number = {1}, +pages = {16--20} +} + +@article{Prangle:2014uz, +author = {Prangle, Dennis}, +title = {{Lazy ABC}}, +journal = {ArXiv}, +year = {2014}, +eprint = {1405.7867v1}, +eprinttype = {arxiv}, +eprintclass = {stat.CO}, +month = may +} + +@article{Martin:2012kg, +author = {Martin, James S and Jasra, Ajay and McCoy, Emma}, +title = {{Inference for a class of partially observed point process models}}, +journal = {Annals of the Institute of Statistical Mathematics}, +year = {2012}, +volume = {65}, +number = {3}, +pages = {413--437}, +month = aug +} + +@book{Wasserman:2006uf, +author = {Wasserman, Larry}, +title = {{All of nonparametric statistics}}, +publisher = {Springer}, +year = {2006}, +address = {New York} +} + +@article{Stone:1982ck, +author = {Stone, Charles J.}, +title = {{Local asymptotic admissibility of a generalization of Akaike's model selection rule}}, +journal = {Annals of the Institute of Statistical Mathematics}, +year = {1982}, +volume = {34}, +number = {1}, +pages = {123--133} +} + +@book{Ern:2004wx, +author = {Ern, Alexandre and Guermond,
Jean-Luc}, +title = {{Theory and Practice of Finite Elements}}, +publisher = {Springer Science {\&} Business Media}, +year = {2004}, +month = apr +} + +@article{Mintun:1984cu, +author = {Mintun, Mark A. and Raichle, Marcus E. and Kilbourn, Michael R. and Wooten, G. Frederick and Welch, Michael J.}, +title = {{A quantitative model for the in vivo assessment of drug binding sites with positron emission tomography}}, +journal = {Annals of Neurology}, +year = {1984}, +volume = {15}, +number = {3}, +pages = {217--227} +} + +@phdthesis{Wickham:G4KrC0Vz, +author = {Wickham, Hadley Alexander}, +title = {{Practical tools for exploring data and models}} +} + +@article{Kalogeropoulos:2010cs, +author = {Kalogeropoulos, Konstantinos and Roberts, Gareth O. and Dellaportas, Petros}, +title = {{Inference for stochastic volatility models using time change transformations}}, +journal = {The Annals of Statistics}, +year = {2010}, +volume = {38}, +number = {2}, +pages = {784--807} +} + +@article{Godsill:2001cv, +author = {Godsill, Simon J.}, +title = {{On the relationship between Markov chain Monte Carlo methods for model uncertainty}}, +journal = {Journal of Computational and Graphical Statistics}, +year = {2001}, +volume = {10}, +number = {2}, +pages = {230--248} +} + +@article{Kass:1996jj, +author = {Kass, Robert E. and Wasserman, Larry}, +title = {{The selection of prior distributions by formal rules}}, +journal = {Journal of American Statistical Association}, +year = {1996}, +volume = {91}, +number = {435}, +pages = {1343--1370} +} + +@article{Logan:2000fw, +author = {Logan, Jean}, +title = {{Graphical analysis of PET data applied to reversible and irreversible tracers}}, +journal = {Nuclear Medicine and Biology}, +year = {2000}, +volume = {27}, +number = {7}, +pages = {661--670} +} + +@article{Hastings:1970gd, +author = {Hastings, W.
Keith}, +title = {{Monte Carlo sampling methods using Markov chains and their applications}}, +journal = {Biometrika}, +year = {1970}, +volume = {57}, +number = {1}, +pages = {97--109} +} + +@article{McCulloch:1991hj, +author = {McCulloch, Robert and Rossi, Peter E.}, +title = {{A Bayesian approach to testing the arbitrage pricing theory}}, +journal = {Journal of Econometrics}, +year = {1991}, +volume = {49}, +number = {1-2}, +pages = {141--168} +} + +@article{Gelman:1998ei, +author = {Gelman, Andrew and Meng, Xiao-Li}, +title = {{Simulating normalizing constants: From importance sampling to bridge sampling to path sampling}}, +journal = {Statistical Science}, +year = {1998}, +volume = {13}, +number = {2}, +pages = {163--185} +} + +@article{Owen:2000kb, +author = {Owen, Art and Zhou, Yi}, +title = {{Safe and effective importance sampling}}, +journal = {Journal of American Statistical Association}, +year = {2000}, +volume = {95}, +number = {449}, +pages = {135--143} +} + +@book{Myers:1990wt, +author = {Myers, Raymond H}, +title = {{Classical and Modern Regression with Applications}}, +publisher = {Duxbury}, +year = {1990} +} + +@article{Hawkins:1986ha, +author = {Hawkins, Randall A. and Phelps, Michael E. and Huang, Sung-Cheng}, +title = {{Effects of temporal sampling, glucose metabolic rates, and disruptions of the blood-brain barrier on the FDG model with and without a vascular compartment: studies in human brain tumors with PET}}, +journal = {Journal of Cerebral Blood Flow {\&} Metabolism}, +year = {1986}, +volume = {6}, +number = {2}, +pages = {170--183} +} + +@article{Samiuddin:1990vy, +author = {Samiuddin, M. and El-Sayyad, G. M.}, +title = {{On nonparametric kernel density estimates}}, +journal = {Biometrika}, +year = {1990}, +volume = {77}, +number = {4}, +pages = {865} +} + +@article{Carlin:1995uy, +author = {Carlin, Bradley P. 
and Chib, Siddhartha}, +title = {{Bayesian model choice via Markov chain Monte Carlo methods}}, +journal = {Journal of Royal Statistical Society B}, +year = {1995}, +volume = {57}, +number = {3}, +pages = {473--484} +} + +@article{Blackwell:1953vt, +author = {Blackwell, David}, +title = {{Equivalent comparisons of experiments}}, +journal = {The Annals of Mathematical Statistics}, +year = {1953}, +volume = {24}, +number = {2}, +pages = {265--272} +} + +@article{Didelot:2011wo, +author = {Didelot, Xavier and Everitt, Richard G. and Johansen, Adam M. and Lawson, Daniel J.}, +title = {{Likelihood-free estimation of model evidence}}, +journal = {Bayesian Analysis}, +year = {2011}, +volume = {6}, +number = {1}, +pages = {49--76} +} + +@article{Cliffe:2011fr, +author = {Cliffe, K A and Giles, M B and Scheichl, R and Teckentrup, A L}, +title = {{Multilevel Monte Carlo methods and applications to elliptic PDEs with random coefficients}}, +journal = {Computing and Visualization in Science}, +year = {2011}, +volume = {14}, +number = {1}, +pages = {3--15}, +month = aug +} + +@article{Johansen:2009wd, +author = {Johansen, Adam M.}, +title = {{SMCTC: sequential Monte Carlo in C++}}, +journal = {Journal of Statistical Software}, +year = {2009}, +volume = {30}, +number = {6}, +pages = {1--41} +} + +@article{Kullback:1951va, +author = {Kullback, S. and Leibler, R. A.}, +title = {{On information and sufficiency}}, +journal = {The Annals of Mathematical Statistics}, +year = {1951}, +volume = {22}, +number = {1}, +pages = {79--86} +} + +@article{Kang:2011gg, +author = {Kang, Jian and Johnson, Timothy D. and Nichols, Thomas E. 
and Wager, Tor D.}, +title = {{Meta Analysis of Functional Neuroimaging Data via Bayesian Spatial Point Processes}}, +journal = {Journal of the American Statistical Association}, +year = {2011}, +volume = {106}, +number = {493}, +pages = {124--134} +} + +@article{Laskey:2003et, +author = {Laskey, Kathryn Blackmond and Myers, James W}, +title = {{Population Markov chain Monte Carlo}}, +journal = {Machine Learning}, +year = {2003}, +volume = {50}, +number = {1/2}, +pages = {175--196} +} + +@article{Daniels:1987uk, +author = {Daniels, H. E.}, +title = {{Tail probability approximations}}, +journal = {International Statistical Review}, +year = {1987}, +volume = {55}, +number = {1}, +pages = {37--48} +} + +@article{Roberts:2007wa, +author = {Roberts, Gareth O. and Rosenthal, Jeffrey S}, +title = {{Coupling and ergodicity of adaptive Markov chain Monte Carlo algorithms}}, +journal = {Journal of Applied Probability}, +year = {2007}, +volume = {44}, +number = {2}, +pages = {458--475} +} + +@article{Jasra:2013tu, +author = {Jasra, Ajay and Lee, Anthony and Yau, Christopher and Zhang, Xiaole}, +title = {{The alive particle filter}}, +journal = {ArXiv}, +year = {2013}, +eprint = {1304.0151v1}, +eprinttype = {arxiv}, +eprintclass = {stat.CO}, +month = mar +} + +@proceedings{Matsumoto:2001vu, +title = {{Dynamic creation of pseudorandom number generators}}, +year = {2001}, +month = dec +} + +@article{Kachitvichyanukul:1988db, +author = {Kachitvichyanukul, Voratas and Schmeiser, Bruce W}, +title = {{Binomial random variate generation}}, +journal = {Communications of the ACM}, +year = {1988}, +volume = {31}, +number = {2}, +pages = {216--222}, +month = feb +} + +@incollection{Raftery:2007ud, +author = {Raftery, Adrian E. and Newton, Michael A. and Satagopan, Jaya M.
and Krivitsky, Pavel N.}, +title = {{Estimating the integrated likelihood via posterior simulation using the harmonic mean identity}}, +booktitle = {Bayesian Statistics 8}, +year = {2006}, +pages = {1--45}, +publisher = {Oxford University Press}, +month = nov +} + +@article{Yu:1998fn, +author = {Yu, Bin and Mykland, Per}, +title = {{Looking at Markov samplers through CUSUM path plots: a simple diagnostic idea}}, +journal = {Statistics and Computing}, +year = {1998}, +volume = {8}, +number = {3}, +pages = {275--286} +} + +@article{Sin:1996vs, +author = {Sin, Chor-Yiu and White, Halbert}, +title = {{Information criteria for selecting possibly misspecified parametric models}}, +journal = {Journal of Econometrics}, +year = {1996}, +volume = {71}, +number = {1}, +pages = {207--225} +} + +@article{Doucet:2000ui, +author = {Doucet, Arnaud and Godsill, Simon and Andrieu, Christophe}, +title = {{On sequential Monte Carlo sampling methods for Bayesian filtering}}, +journal = {Statistics and Computing}, +year = {2000}, +volume = {10}, +number = {3}, +pages = {197--208} +} + +@article{Green:2001tk, +author = {Green, Peter J. and Mira, Antonietta}, +title = {{Delayed rejection in reversible jump Metropolis-Hastings}}, +journal = {Biometrika}, +year = {2001}, +volume = {88}, +number = {4}, +pages = {1035--1053} +} + +@article{Marsaglia:2003ug, +author = {Marsaglia, G}, +title = {{Xorshift RNGs}}, +journal = {Journal of Statistical Software}, +year = {2003}, +pages = {--} +} + +@article{Berger:1987iq, +author = {Berger, James O. 
and Delampady, Mohan}, +title = {{Testing precise hypotheses}}, +journal = {Statistical Science}, +year = {1987}, +volume = {2}, +number = {3}, +pages = {317--335} +} + +@article{DelMoral:2012em, +author = {Del Moral, Pierre and Doucet, Arnaud and Jasra, Ajay}, +title = {{An adaptive sequential Monte Carlo method for approximate Bayesian computation}}, +journal = {Statistics and Computing}, +year = {2012}, +pages = {--} +} + +@article{Ridgeway:2003gq, +author = {Ridgeway, Greg and Madigan, David}, +title = {{A sequential Monte Carlo method for Bayesian analysis of massive datasets}}, +journal = {Data Mining and Knowledge Discovery}, +year = {2003}, +volume = {7}, +number = {3}, +pages = {301--319} +} + +@article{Evensen:2003io, +author = {Evensen, G}, +title = {{The ensemble Kalman filter: Theoretical formulation and practical implementation}}, +journal = {Ocean dynamics}, +year = {2003} +} + +@techreport{DelMoral:2009th, +author = {Del Moral, Pierre and Doucet, Arnaud}, +title = {{Particle methods: An introduction with applications}}, +year = {2009} +} + +@article{Ruppert:1994wk, +author = {Ruppert, D. and Wand, M. 
P.}, +title = {{Multivariate locally weighted least squares regression}}, +journal = {The Annals of Statistics}, +year = {1994}, +volume = {22}, +number = {3}, +pages = {1346--1370} +} + +@article{Klaas:2012wz, +author = {Klaas, Mike and de Freitas, Nando and Doucet, Arnaud}, +title = {{Toward Practical {$N^2$} Monte Carlo: the Marginal Particle Filter}}, +journal = {ArXiv}, +year = {2012}, +eprint = {1207.1396v1}, +eprinttype = {arxiv}, +eprintclass = {stat.CO}, +month = jul +} + +@article{Neuts:1967bc, +author = {Neuts, M F and Zacks, S}, +title = {{On mixtures of $\chi^2$- and $F$-distributions which yield distributions of the same family}}, +journal = {Annals of the Institute of Statistical Mathematics}, +year = {1967}, +volume = {19}, +number = {1}, +pages = {527--536}, +month = dec +} + +@article{Neal:2001we, +author = {Neal, Radford M.}, +title = {{Annealed importance sampling}}, +journal = {Statistics and Computing}, +year = {2001}, +volume = {11}, +number = {2}, +pages = {125--139} +} + +@article{AlAwadhi:2004dy, +author = {Al-Awadhi, Fahimah and Hurn, Merrilee and Jennison, Christopher}, +title = {{Improving the acceptance rate of reversible jump MCMC proposals}}, +journal = {Statistics {\&} Probability Letters}, +year = {2004}, +volume = {69}, +number = {2}, +pages = {189--198} +} + +@article{Kinahan:1989ih, +author = {Kinahan, P. E. and Rogers, J.
G.}, +title = {{Analytic 3D image reconstruction using all detected events}}, +journal = {IEEE Transactions on Nuclear Science}, +year = {1989}, +volume = {36}, +number = {1}, +pages = {964--968} +} + +@article{Jasra:2013wm, +author = {Jasra, Ajay and Kantas, Nikolas and Ehrlich, Elena}, +title = {{Approximate Inference for Observation Driven Time Series Models with Intractable Likelihoods}}, +journal = {ArXiv}, +year = {2013}, +eprint = {1303.7318v1}, +eprinttype = {arxiv}, +eprintclass = {stat.CO}, +pages = {--}, +month = mar +} + +@article{Haario:2001gu, +author = {Haario, Heikki and Saksman, Eero and Tamminen, Johanna}, +title = {{An adaptive Metropolis algorithm}}, +journal = {Bernoulli}, +year = {2001}, +volume = {7}, +number = {2}, +pages = {223} +} + +@article{Roberts:1996wb, +author = {Roberts, Gareth O. and Tweedie, R. L.}, +title = {{Geometric convergence and central limit theorems for multidimensional Hastings and Metropolis algorithms}}, +journal = {Biometrika}, +year = {1996}, +volume = {83}, +number = {1}, +pages = {95} +} + +@article{Leva:1992vm, +author = {Leva, Joseph L.}, +title = {{A fast normal random number generator}}, +journal = {ACM Transactions on Mathematical Software}, +year = {1992}, +volume = {18}, +number = {4}, +pages = {449--453} +} + +@article{Chow:1981te, +author = {Chow, Gregory C.}, +title = {{A comparison of the information and posterior probability criteria for model selection}}, +journal = {Journal of Econometrics}, +year = {1981}, +volume = {16}, +number = {1}, +pages = {21--33} +} + +@article{Beskos:2005el, +author = {Beskos, Alexandros and Roberts, Gareth O.}, +title = {{Exact simulation of diffusions}}, +journal = {The Annals of Applied Probability}, +year = {2005}, +volume = {15}, +number = {4}, +pages = {2422--2444}, +month = nov +} + +@article{Shephard:2001tr, +author = {Shephard, N and Barndorff-Nielsen, O E}, +title = {{Normal modified stable processes}}, +journal = {Theory of Probability and Mathematical Statistics}, +year = {2001} +} + +@article{Halmos:1949ti, +author
= {Halmos, Paul R. and Savage, L. J.}, +title = {{Application of the Radon-Nikodym theorem to the theory of sufficient statistics}}, +journal = {The Annals of Mathematical Statistics}, +year = {1949}, +volume = {20}, +number = {2}, +pages = {225--241} +} + +@article{Berger:1989vj, +author = {Berger, James O. and Bernardo, Jos{\'e} M.}, +title = {{Estimating a product of means: Bayesian analysis with reference priors}}, +journal = {Journal of American Statistical Association}, +year = {1989}, +volume = {84}, +number = {405}, +pages = {200--207} +} + +@article{Beskos:2014bv, +author = {Beskos, Alexandros and Crisan, Dan and Jasra, Ajay}, +title = {{On the stability of sequential Monte Carlo methods in high dimensions}}, +journal = {The Annals of Applied Probability}, +year = {2014}, +volume = {24}, +number = {4}, +pages = {1396--1445}, +month = aug +} + +@article{Grenander:1994vy, +author = {Grenander, Ulf and Miller, Michael I.}, +title = {{Representations of knowledge in complex systems}}, +journal = {Journal of Royal Statistical Society B}, +year = {1994}, +volume = {56}, +number = {4}, +pages = {549--603} +} + +@article{Han:2001wa, +author = {Han, Cong and Carlin, Bradley P.}, +title = {{Markov chain Monte Carlo methods for computing Bayes factors: A comparative review}}, +journal = {Journal of the American Statistical Association}, +year = {2001}, +volume = {96}, +number = {455}, +pages = {1122--1132} +} + +@article{Feng:2012ik, +author = {Feng, Dai and Tierney, Luke and Magnotta, Vincent}, +title = {{MRI tissue classification using high-resolution Bayesian hidden markov normal mixture models}}, +journal = {Journal of the American Statistical Association}, +year = {2012}, +volume = {107}, +number = {497}, +pages = {102--119}, +month = mar +} + +@article{Albert:1993kc, +author = {Albert, James H. 
and Chib, Siddhartha}, +title = {{Bayesian analysis of binary and polychotomous response data}}, +journal = {Journal of the American Statistical Association}, +year = {1993}, +volume = {88}, +number = {422}, +pages = {669--679} +} + +@article{Atchade:2010ha, +author = {Atchad{\'e}, Yves F. and Roberts, Gareth O. and Rosenthal, Jeffrey S}, +title = {{Towards optimal scaling of Metropolis-coupled Markov chain Monte Carlo}}, +journal = {Statistics and Computing}, +year = {2010}, +volume = {21}, +number = {4}, +pages = {555--568} +} + +@article{Saito:2013ht, +author = {Saito, M and Matsumoto, M}, +title = {{Variants of Mersenne Twister suitable for graphic processors}}, +journal = {ACM Transactions on Mathematical Software}, +year = {2013} +} + +@article{Green:1995dg, +author = {Green, Peter J.}, +title = {{Reversible jump Markov chain Monte Carlo computation and Bayesian model determination}}, +journal = {Biometrika}, +year = {1995}, +volume = {82}, +number = {4}, +pages = {711--732} +} + +@article{Anonymous:YjBmKgMg, +author = {Beskos, Alexandros and Papaspiliopoulos, Omiros and Roberts, Gareth O. and Fearnhead, Paul}, +title = {{Exact and computationally efficient likelihood-based estimation for discretely observed diffusion processes (with discussion)}}, +journal = {Journal of the Royal Statistical Society: Series B (Statistical Methodology)}, +year = {2006}, +volume = {68}, +number = {3}, +pages = {333--382} +} + +@inproceedings{Andrieu:1999cc, +author = {Andrieu, Christophe and De Freitas, N. and Doucet, Arnaud}, +title = {{Sequential MCMC for Bayesian model selection}}, +booktitle = {Proceedings of IEEE Higher Order Statistics Workshop}, +year = {1999}, +pages = {130--134} +} + +@article{BarndorffNielsen:1979ug, +author = {Barndorff-Nielsen, O. and Cox, D.
R.}, +title = {{Edgeworth and saddle-point approximations with statistical applications}}, +journal = {Journal of Royal Statistical Society B}, +year = {1979}, +volume = {41}, +number = {3}, +pages = {279--312} +} + +@article{Cappe:2007hz, +author = {Capp{\'e}, Olivier and Godsill, Simon J. and Moulines, Eric}, +title = {{An overview of existing methods and recent advances in sequential Monte Carlo}}, +journal = {Proceedings of the IEEE}, +year = {2007}, +volume = {95}, +number = {5}, +pages = {899--924} +} + +@article{Gilks:1998dl, +author = {Gilks, Walter R. and Roberts, Gareth O. and Sahu, Sujit K.}, +title = {{Adaptive Markov chain Monte Carlo through regeneration}}, +journal = {Journal of the American Statistical Association}, +year = {1998}, +volume = {93}, +number = {443}, +pages = {1045} +} + +@article{Stephens:2000wq, +author = {Stephens, Matthew}, +title = {{Bayesian analysis of mixture models with an unknown number of components -- An alternative to reversible jump methods}}, +journal = {The Annals of Statistics}, +year = {2000}, +volume = {28}, +number = {1}, +pages = {40--74} +} + +@article{Kong:1994ul, +author = {Kong, Augustine and Liu, Jun S. and Wong, Wing Hung}, +title = {{Sequential imputations and Bayesian missing data problems}}, +journal = {Journal of the American Statistical Association}, +year = {1994}, +volume = {89}, +number = {425}, +pages = {278--288} +} + +@article{DElia:2015ht, +author = {D{\textquoteright}Elia, M and Gunzburger, M}, +title = {{Identification of the diffusion parameter in nonlocal steady diffusion problems}}, +journal = {Applied Mathematics {\&} Optimization}, +year = {2015}, +pages = {1--23} +} + +@incollection{Akaike:1977ul, +author = {Akaike, Hirotugu}, +title = {{On entropy maximization principle}}, +booktitle = {Application of Statistics}, +year = {1977}, +pages = {--}, +publisher = {North-Holland} +} + +@article{Nelder:1965in, +author = {Nelder, J. A.
and Mead, R.}, +title = {{A simplex method for function minimization}}, +journal = {The Computer Journal}, +year = {1965}, +volume = {7}, +number = {4}, +pages = {308--313} +} + +@article{Gelfand:1990it, +author = {Gelfand, Alan E. and Smith, Adrian F. M.}, +title = {{Sampling-based approaches to calculating marginal densities}}, +journal = {Journal of the American Statistical Association}, +year = {1990}, +volume = {85}, +number = {410}, +pages = {398--409} +} + +@article{Peng:2008fx, +author = {Peng, Jyh-Ying and Aston, John A. D. and Gunn, Roger N. and Liou, Cheng-Yuan and Ashburner, John}, +title = {{Dynamic positron emission tomography data-driven analysis using sparse Bayesian learning}}, +journal = {IEEE Transactions on Medical Imaging}, +year = {2008}, +volume = {27}, +number = {9}, +pages = {1356--1369} +} + +@article{Fearnhead:2004gi, +author = {Fearnhead, Paul}, +title = {{Particle filters for mixture models with an unknown number of components}}, +journal = {Statistics and Computing}, +year = {2004}, +volume = {14}, +number = {1}, +pages = {11--21} +} + +@article{2000JMPSo..48..175S, +author = {Silling, S A}, +title = {{Reformulation of elasticity theory for discontinuities and long-range forces}}, +journal = {Journal of the Mechanics and Physics of Solids}, +year = {2000}, +volume = {48}, +number = {1}, +pages = {175--209}, +month = jan +} + +@book{Shiryaev:1995vp, +author = {Shiryaev, A N}, +title = {{Probability}}, +publisher = {Springer-Verlag}, +year = {1995}, +series = {Graduate Texts in Mathematics}, +address = {New York} +} + +@book{Burnham:2002wc, +author = {Burnham, Kenneth P. 
and Anderson, David R.}, +title = {{Model Selection and Multimodel Inference: A Practical Information-theoretic Approach}}, +publisher = {Springer-Verlag}, +year = {2002}, +address = {New York}, +edition = {2} +} + +@article{Picard:1984vu, +author = {Picard, Richard R and Cook, R Dennis}, +title = {{Cross-validation of regression models}}, +journal = {Journal of the American Statistical Association}, +year = {1984}, +volume = {79}, +number = {387}, +pages = {575--583} +} + +@incollection{Berger:1992wo, +author = {Bernardo, Jos{\'e} M. and Berger, James O.}, +title = {{On the development of reference priors}}, +booktitle = {Bayesian Statistics 4}, +year = {1992}, +editor = {Berger, James O. and Bernardo, Jos{\'e} M. and Dawid, A. P. and Smith, Adrian F. M.}, +pages = {35--60}, +publisher = {Oxford University Press} +} + +@article{Cox:1987wo, +author = {Cox, D. R. and Reid, N.}, +title = {{Parameter orthogonality and approximate conditional inference}}, +journal = {Journal of Royal Statistical Society B}, +year = {1987}, +volume = {49}, +number = {1}, +pages = {1--39} +} + +@article{Narisetty:2014hp, +author = {Narisetty, N N and He, X}, +title = {{Bayesian variable selection with shrinking and diffusing priors}}, +journal = {The Annals of Statistics}, +year = {2014} +} + +@techreport{Green:2009tr, +author = {Green, Peter J. and Hastie, David I.}, +title = {{Reversible jump MCMC}}, +year = {2009} +} + +@article{Peters:2010vk, +author = {Peters, Gareth W and Hosack, Geoffrey R. and Hayes, Keith R.}, +title = {{Ecological non-linear state space model selection via adaptive particle Markov chain Monte Carlo (AdPMCMC)}}, +journal = {ArXiv}, +year = {2010}, +pages = {--} +} + +@article{Gunn:2002tf, +author = {Gunn, Roger N. and Gunn, Steve R. and Turkheimer, Federico E. and Aston, John A. D. 
and Cunningham, Vincent J.}, +title = {{Positron emission tomography compartmental models: A basis pursuit strategy for kinetic modeling}}, +journal = {Journal of Cerebral Blood Flow {\&} Metabolism}, +year = {2002}, +volume = {22}, +number = {12}, +pages = {1425--1439} +} + +@article{Casella:1998tj, +author = {Casella, George and Robert, Christian P.}, +title = {{Post-processing accept-reject samples: recycling and rescaling}}, +journal = {Journal of Computational and Graphical Statistics}, +year = {1998}, +volume = {7}, +number = {2}, +pages = {139--157} +} + +@article{Calderhead:2009bd, +author = {Calderhead, Ben and Girolami, Mark}, +title = {{Estimating Bayes factors via thermodynamic integration and population MCMC}}, +journal = {Computational Statistics {\&} Data Analysis}, +year = {2009}, +volume = {53}, +number = {12}, +pages = {4028--4045} +} + +@incollection{Akaike:1973uc, +author = {Akaike, Hirotugu}, +title = {{Information theory and an extension of the maximum likelihood principle}}, +booktitle = {Second international symposium on information theory}, +year = {1973}, +pages = {267--281}, +publisher = {Akademiai Kiado} +} + +@booklet{Jeffreys:1961ua, +title = {{Theory of Probability}}, +author = {Jeffreys, Harold}, +howpublished = {Clarendon Press}, +year = {1961} +} + +@article{Chambers:1976dv, +author = {Chambers, J M and Mallows, C. L. and Stuck, B W}, +title = {{A method for simulating stable random variables}}, +journal = {Journal of the American Statistical Association}, +year = {1976}, +volume = {71}, +number = {354}, +pages = {340--344} +} + +@inproceedings{Johansen:2006wm, +author = {Johansen, Adam M. 
and Del Moral, Pierre and Doucet, Arnaud}, +title = {{Sequential Monte Carlo samplers for rare events}}, +booktitle = {Proceedings of the 6th International Workshop on Rare Event Simulation}, +year = {2006}, +pages = {256--267} +} + +@article{Everitt:2012tr, +author = {Everitt, Richard G.}, +title = {{Bayesian Parameter Estimation for Latent Markov Random Fields and Social Networks}}, +journal = {ArXiv}, +year = {2012}, +eprint = {1203.3725v1}, +eprinttype = {arxiv}, +eprintclass = {stat.CO}, +pages = {--}, +month = mar +} + +@article{Hannan:1979us, +author = {Hannan, Edward J and Quinn, Barry G}, +title = {{The determination of the order of an autoregression}}, +journal = {Journal of the Royal Statistical Society. Series B (Methodological)}, +year = {1979}, +pages = {190--195} +} + +@article{Andrieu:2010gc, +author = {Andrieu, Christophe and Doucet, Arnaud and Holenstein, Roman}, +title = {{Particle Markov chain Monte Carlo methods}}, +journal = {Journal of the Royal Statistical Society: Series B (Statistical Methodology)}, +year = {2010}, +volume = {72}, +number = {3}, +pages = {269--342} +} + +@article{Robert:2011vx, +author = {Robert, Christian P. 
and Marin, Jean-Michel and Pillai, Natesh S.},
+title = {{Why approximate Bayesian computational (ABC) methods cannot handle model choice problems}},
+journal = {ArXiv},
+year = {2011},
+pages = {--}
+}
+
+@article{DelMoral:2006hc,
+author = {Del Moral, Pierre and Doucet, Arnaud and Jasra, Ajay},
+title = {{Sequential Monte Carlo samplers}},
+journal = {Journal of the Royal Statistical Society: Series B (Statistical Methodology)},
+year = {2006},
+volume = {68},
+number = {3},
+pages = {411--436}
+}
+
+@article{Altekar:2004jz,
+author = {Altekar, G and Dwarkadas, S and Huelsenbeck, J P and Ronquist, F},
+title = {{Parallel Metropolis coupled Markov chain Monte Carlo for Bayesian phylogenetic inference}},
+journal = {Bioinformatics},
+year = {2004},
+volume = {20},
+number = {3},
+pages = {407--415},
+month = feb
+}
+
+@book{Tsybakov:2009wp,
+author = {Tsybakov, Alexandre B.},
+title = {{Introduction to nonparametric estimation}},
+publisher = {Springer},
+year = {2009},
+address = {New York}
+}
+
+@article{Jiang:2009kf,
+author = {Jiang, Ci-Ren and Aston, John A. D.
and Wang, Jane-Ling},
+title = {{Smoothing dynamic positron emission tomography time courses using functional principal components}},
+journal = {NeuroImage},
+year = {2009},
+volume = {47},
+number = {1},
+pages = {184--193}
+}
+
+@article{Oh:1993tg,
+author = {Oh, Man-Suk and Berger, James O.},
+title = {{Integration of multimodal functions by Monte Carlo importance sampling}},
+journal = {Journal of the American Statistical Association},
+year = {1993},
+volume = {88},
+number = {422},
+pages = {450--456}
+}
+
+@article{DeRiggi:1983wr,
+author = {DeRiggi, Dennis F.},
+title = {{Unimodality of likelihood functions for the Binomial distribution}},
+journal = {Journal of the American Statistical Association},
+year = {1983},
+volume = {78},
+number = {381},
+pages = {181--183}
+}
+
+@article{Schafer:2011bx,
+author = {Sch{\"a}fer, Christian and Chopin, Nicolas},
+title = {{Sequential Monte Carlo on large binary sampling spaces}},
+journal = {Statistics and Computing},
+year = {2011},
+pages = {1--22}
+}
+
+@book{Courant:2012uz,
+author = {Courant, R and John, F},
+title = {{Introduction to Calculus and Analysis II}},
+year = {2012}
+}
+
+@article{Whiteley:2010ug,
+author = {Whiteley, Nick and Andrieu, Christophe and Doucet, Arnaud},
+title = {{Efficient Bayesian inference for switching state-space models using discrete particle Markov chain Monte Carlo methods}},
+journal = {ArXiv},
+year = {2010},
+eprint = {1011.2437v1},
+eprinttype = {arxiv},
+eprintclass = {stat.CO},
+month = nov
+}
+
+@article{Kantas:2014ty,
+author = {Kantas, Nikolas and Doucet, Arnaud and Singh, Sumeetpal S and Maciejowski, Jan M and Chopin, Nicolas},
+title = {{On Particle Methods for Parameter Estimation in State-Space Models}},
+journal = {ArXiv},
+year = {2014},
+eprint = {1412.8695v1},
+eprinttype = {arxiv},
+eprintclass = {stat.CO},
+month = dec
+}
+
+@article{Brent:2010vj,
+author = {Brent, Richard P},
+title = {{Some long-period random number generators using shifts and xors}},
+journal =
{ArXiv},
+year = {2010},
+eprint = {1004.3115v1},
+eprinttype = {arxiv},
+eprintclass = {cs.DS},
+pages = {--},
+month = apr,
+annote = {11 pages}
+}
+
+@article{Hesterberg:1995wg,
+author = {Hesterberg, Tim},
+title = {{Weighted average importance sampling and defensive mixture distributions}},
+journal = {Technometrics},
+year = {1995},
+volume = {37},
+number = {2},
+pages = {185--194}
+}
+
+@article{Davis:1993vl,
+author = {Davis, Charles S.},
+title = {{The computer generation of multinomial random variates}},
+journal = {Computational Statistics {\&} Data Analysis},
+year = {1993},
+volume = {16},
+number = {2},
+pages = {205--217}
+}
+
+@book{Robert:2007tc,
+author = {Robert, Christian P.},
+title = {{The Bayesian Choice: From Decision-theoretic Foundations to Computational Implementation}},
+publisher = {Springer},
+year = {2007},
+address = {New York},
+edition = {2}
+}
+
+@article{Du:2013jn,
+author = {Du, Qiang and Gunzburger, Max and Lehoucq, R B and Zhou, Kun},
+title = {{A nonlocal vector calculus, nonlocal volume-constrained problems, and nonlocal balance laws}},
+journal = {Mathematical Models and Methods in Applied Sciences},
+year = {2013},
+volume = {23},
+number = {3},
+pages = {493--540},
+month = mar
+}
+
+@article{Andrieu:2006tw,
+author = {Andrieu, Christophe and Moulines, Eric},
+title = {{On the ergodicity properties of some adaptive MCMC algorithms}},
+journal = {The Annals of Applied Probability},
+year = {2006},
+volume = {16},
+number = {3},
+pages = {1462--1505},
+month = aug
+}
+
+@article{Robert:2011uv,
+author = {Robert, Christian P.
and Cornuet, Jean-Marie and Marin, Jean-Michel and Pillai, Natesh S.},
+title = {{Lack of confidence in approximate Bayesian computation model choice}},
+journal = {Proceedings of the {\ldots}},
+year = {2011},
+pages = {--}
+}
+
+@book{DeFinetti:1974tg,
+author = {De Finetti, Bruno},
+title = {{Theory of probability: A critical introductory treatment}},
+publisher = {John Wiley {\&} Sons},
+year = {1974},
+volume = {2}
+}
+
+@article{Akaike:1974ih,
+author = {Akaike, Hirotugu},
+title = {{A new look at the statistical model identification}},
+journal = {IEEE Transactions on Automatic Control},
+year = {1974},
+volume = {19},
+number = {6},
+pages = {716--723}
+}
+
+@article{Rousset:2006kq,
+author = {Rousset, Mathias and Stoltz, Gabriel},
+title = {{Equilibrium sampling from nonequilibrium dynamics}},
+journal = {Journal of Statistical Physics},
+year = {2006},
+volume = {123},
+number = {6},
+pages = {1251--1272}
+}
+
+@article{Marshall:2011gi,
+author = {Marshall, Tristan and Roberts, Gareth},
+title = {{An adaptive approach to Langevin MCMC}},
+journal = {Statistics and Computing},
+year = {2011},
+volume = {22},
+number = {5},
+pages = {1041--1057},
+month = sep
+}
+
+@article{Booth:1995he,
+author = {Booth, James G. and Wood, Andrew T.
A.},
+title = {{An example in which the Lugannani-Rice saddlepoint formula fails}},
+journal = {Statistics {\&} Probability Letters},
+year = {1995},
+volume = {23},
+number = {1},
+pages = {53--61}
+}
+
+@article{Ronchetti:1997uc,
+author = {Ronchetti, Elvezio},
+title = {{Robustness aspects of model choice}},
+journal = {Statistica Sinica},
+year = {1997},
+volume = {7},
+pages = {327--338}
+}
+
+@article{Sacks:1981vv,
+author = {Sacks, Jerome and Ylvisaker, Donald},
+title = {{Asymptotically optimum kernels for density estimation at a point}},
+journal = {The Annals of Statistics},
+year = {1981},
+volume = {9},
+number = {2},
+pages = {334--346}
+}
+
+@article{Chopin:2004cn,
+author = {Chopin, Nicolas},
+title = {{Central limit theorem for sequential Monte Carlo methods and its application to Bayesian inference}},
+journal = {The Annals of Statistics},
+year = {2004},
+volume = {32},
+number = {6},
+pages = {2385--2411}
+}
+
+@article{Lugannani:1980tc,
+author = {Lugannani, Robert and Rice, Stephen},
+title = {{Saddle point approximation for the distribution of the sum of independent random variables}},
+journal = {Advances in Applied Probability},
+year = {1980},
+volume = {12},
+number = {2},
+pages = {475--490}
+}
+
+@article{Liu:1995vl,
+author = {Liu, Jun S. and Chen, Rong},
+title = {{Blind deconvolution via sequential imputations}},
+journal = {Journal of the American Statistical Association},
+year = {1995},
+volume = {90},
+number = {430},
+pages = {567--576}
+}
+
+@article{Roberts:2001ta,
+author = {Roberts, Gareth O. and Rosenthal, Jeffrey S},
+title = {{Optimal scaling for various Metropolis-Hastings algorithms}},
+journal = {Statistical Science},
+year = {2001},
+volume = {16},
+number = {4},
+pages = {351--367}
+}
+
+@article{Lee:2010fm,
+author = {Lee, Anthony and Yau, Christopher and Giles, Michael B.
and Doucet, Arnaud and Holmes, Christopher C.},
+title = {{On the utility of graphics cards to perform massively parallel simulation of advanced Monte Carlo methods}},
+journal = {Journal of Computational and Graphical Statistics},
+year = {2010},
+volume = {19},
+number = {4},
+pages = {769--789}
+}
+
+@article{Brown:1950wy,
+author = {Brown, George W.},
+title = {{Basic principles for construction and application of discriminators}},
+journal = {Journal of Clinical Psychology},
+year = {1950},
+volume = {6},
+number = {1},
+pages = {58--61}
+}
+
+@article{Robert:2000hd,
+author = {Robert, Christian P. and Ryd{\'e}n, Tobias and Titterington, D. M.},
+title = {{Bayesian inference in hidden Markov models through the reversible jump Markov chain Monte Carlo method}},
+journal = {Journal of the Royal Statistical Society: Series B (Statistical Methodology)},
+year = {2000}
+}
+
+@inproceedings{Douc:2005wa,
+author = {Douc, Randal and Capp{\'e}, Olivier and Moulines, Eric},
+title = {{Comparison of resampling schemes for particle filtering}},
+booktitle = {Proceedings of the 4th International Symposium on Image and Signal Processing and Analysis},
+year = {2005},
+pages = {1--6}
+}
+
+@phdthesis{Hastie:2005vi,
+author = {Hastie, David},
+title = {{Towards automatic reversible jump Markov chain Monte Carlo}},
+year = {2005}
+}
+
+@article{Jasra:2007in,
+author = {Jasra, Ajay and Stephens, David A. and Holmes, Christopher C.},
+title = {{On population-based simulation for static inference}},
+journal = {Statistics and Computing},
+year = {2007},
+volume = {17},
+number = {3},
+pages = {263--279}
+}
+
+@article{Liu:1998iu,
+author = {Liu, Jun S.
and Chen, Rong},
+title = {{Sequential Monte Carlo methods for dynamic systems}},
+journal = {Journal of the American Statistical Association},
+year = {1998},
+volume = {93},
+number = {443},
+pages = {1032--1044}
+}
+
+@article{Anonymous:Kx2bj97U,
+author = {Varin, Cristiano and Reid, Nancy and Firth, David},
+title = {{An overview of composite likelihood methods}},
+journal = {Statistica Sinica},
+year = {2011},
+volume = {21},
+number = {1},
+pages = {5--42}
+}
+
+@article{Lee:2001tm,
+author = {Lee, Sangyeol and Karagrigoriou, Alex},
+title = {{An asymptotically optimal selection of the order of a linear process}},
+journal = {Sankhy{\=a}: The Indian Journal of Statistics, Series A},
+year = {2001},
+pages = {93--106}
+}
+
+@article{Johansen:2006iv,
+author = {Johansen, Adam M. and Singh, Sumeetpal S. and Doucet, Arnaud and Vo, Ba-Ngu},
+title = {{Convergence of the SMC implementation of the PHD filter}},
+journal = {Methodology and Computing in Applied Probability},
+year = {2006},
+volume = {8},
+number = {2},
+pages = {265--291}
+}
+
+@article{Dodwell:2015ka,
+author = {Dodwell, T J and Ketelsen, C and Scheichl, R and Teckentrup, A L},
+title = {{A Hierarchical Multilevel Markov Chain Monte Carlo Algorithm with Applications to Uncertainty Quantification in Subsurface Flow}},
+journal = {SIAM/ASA Journal on Uncertainty Quantification},
+year = {2015},
+volume = {3},
+number = {1},
+pages = {1075--1108}
+}
+
+@article{Gunn:2001cx,
+author = {Gunn, Roger N. and Gunn, Steve R.
and Cunningham, Vincent J.},
+title = {{Positron emission tomography compartmental models}},
+journal = {Journal of Cerebral Blood Flow {\&} Metabolism},
+year = {2001},
+volume = {21},
+number = {6},
+pages = {635--652}
+}
+
+@article{Huzurbazar:1999dl,
+author = {Huzurbazar, S.},
+title = {{Practical saddlepoint approximations}},
+journal = {The American Statistician},
+year = {1999},
+volume = {53},
+number = {3},
+pages = {225--232}
+}
+
+@article{Bartolucci:2006cb,
+author = {Bartolucci, Francesco and Scaccia, Luisa and Mira, Antonietta},
+title = {{Efficient Bayes factor estimation from the reversible jump output}},
+journal = {Biometrika},
+year = {2006},
+volume = {93},
+number = {1},
+pages = {41--52}
+}
+
+@article{Turkheimer:2003iy,
+author = {Turkheimer, Federico E. and Hinz, Rainer and Cunningham, Vincent J.},
+title = {{On the undecidability among kinetic models: From model selection to model averaging}},
+journal = {Journal of Cerebral Blood Flow {\&} Metabolism},
+year = {2003},
+volume = {23},
+number = {4},
+pages = {490--498}
+}
+
+@article{Mankoff:1998ui,
+author = {Mankoff, David A. and Shields, Anthony F. and Graham, Michael M. and Link, Jeanne M. and Eary, Janet F.
and Krohn, Kenneth A.},
+title = {{Kinetic analysis of 2-[Carbon-11]Thymidine PET imaging studies: Compartmental model and mathematical analysis}},
+journal = {The Journal of Nuclear Medicine},
+year = {1998},
+volume = {39},
+number = {6},
+pages = {1043--1055}
+}
+
+@article{Richardson:1997ea,
+author = {Richardson, Sylvia and Green, Peter J.},
+title = {{On Bayesian analysis of mixtures with an unknown number of components}},
+journal = {Journal of the Royal Statistical Society. Series B (Methodological)},
+year = {1997},
+volume = {59},
+number = {4},
+pages = {731--792}
+}
+
+@phdthesis{Johansen:2006vu,
+author = {Johansen, Adam M.},
+title = {{Some non-standard sequential Monte Carlo methods and their applications}},
+school = {University of Cambridge},
+year = {2006}
+}
+
+@article{BarndorffNielsen:1980je,
+author = {Barndorff-Nielsen, O.},
+title = {{Conditionality resolutions}},
+journal = {Biometrika},
+year = {1980},
+volume = {67},
+number = {2},
+pages = {293--310}
+}
+
+@article{Tierney:1994uk,
+author = {Tierney, Luke},
+title = {{Markov chains for exploring posterior distributions}},
+journal = {The Annals of Statistics},
+year = {1994},
+volume = {22},
+number = {4},
+pages = {1701--1728}
+}
+
diff --git a/user_guide/user_guide.tex b/user_guide/user_guide.tex
new file mode 100644
index 000000000..33f72e7c4
--- /dev/null
+++ b/user_guide/user_guide.tex
@@ -0,0 +1,81 @@
+\documentclass[11pt,bib,mint,hyper,altcolor,showoverfull,toclevel=2]{mbook}
+
+\newlength\fixedchar
+\settowidth{\fixedchar}{\texttt{\normalsize 0}}
+\geometry{textwidth=80\fixedchar}
+
+\NewMinted{cpp}
+\NewMinted{r}
+\NewMinted{text}
+
+\UseAbbr{aes}
+\UseAbbr{ais}
+\UseAbbr{ars}
+\UseAbbr{avx}
+\UseAbbr{blas}
+\UseAbbr{brng}
+\UseAbbr{cpu}
+\UseAbbr{crtp}
+\UseAbbr{ess}
+\UseAbbr{gpu}
+\UseAbbr{lapack}
+\UseAbbr{mcmc}
+\UseAbbr{mkl}
+\UseAbbr{posix}
+\UseAbbr{raii}
+\UseAbbr{rdrand}
+\UseAbbr{rng}
+\UseAbbr{sis}
+\UseAbbr{smc}
+\UseAbbr{smp}
+\UseAbbr{sse}
+\UseAbbr{tbb}
+\UseAbbr{tls}
+\UseAbbr{vml}
+\UseAbbr{vsl}
+
+\UseAbbr[\aesni][\textsc]{aes-ni}
+\UseAbbr[\cpp][\textcase]{C++}
+\UseAbbr[\cppoo][\lnfigures\textcase]{C++11}
+\UseAbbr[\hdf]{hdf5}
+\UseAbbr[\ith][\textsups]{t{}h}
+\UseAbbr[\vsmc][]{vSMC}
+\UseAbbr[\io][\textsc]{i/o}
+
+\UseMathCal{N}
+
+\def\xobs{X_{\mathrm{obs}}}
+\def\xpos{X_{\mathrm{pos}}}
+\def\xvel{X_{\mathrm{vel}}}
+\def\yobs{Y_{\mathrm{obs}}}
+\def\ypos{Y_{\mathrm{pos}}}
+\def\yvel{Y_{\mathrm{vel}}}
+\def\STATESKIP{\hskip.68cm}
+\def\spt{\texttt{SingleParticle}}
+\def\version{develop}
+
+\title{vSMC -- Parallel SMC in C++}
+\author{Yan Zhou}
+\date{Second edition (version \version)}
+\addbibresource{user_guide.bib}
+
+\begin{document}
+
+\maketitle
+
+\tableofcontents
+
+\input{tex/smc}
+\input{tex/basic}
+\input{tex/advanced}
+\input{tex/config}
+\input{tex/math}
+\input{tex/resample}
+\input{tex/rng}
+\input{tex/util}
+
+\printbibliography
+
+\input{tex/app}
+
+\end{document}
diff --git a/user_guide/user_guide.tex.latexmain b/user_guide/user_guide.tex.latexmain
new file mode 100644
index 000000000..e69de29bb