diff --git a/.github/FUNDING.yml b/.github/FUNDING.yml
new file mode 100644
index 00000000..f4801977
--- /dev/null
+++ b/.github/FUNDING.yml
@@ -0,0 +1,3 @@
+# These are supported funding model platforms
+
+github: [mjansson]
diff --git a/.gitignore b/.gitignore
index c291de3d..fd145ba5 100644
--- a/.gitignore
+++ b/.gitignore
@@ -13,6 +13,8 @@ local.properties
 .loadpath
 .ninja*
 build.ninja
+.vs
+.vscode
 
 # Generated version
 version.c
diff --git a/CHANGELOG b/CHANGELOG
index 94c74f21..2820daff 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,3 +1,38 @@
+1.4.1
+
+Dual license as both released to public domain or under MIT license
+
+Allow up to 4GiB memory page sizes
+
+Fix an issue where large page sizes in conjunction with many threads waste a lot of memory (previously
+each heap occupied an entire memory page, now heaps can now share a memory page)
+
+Fixed compilation issue on macOS when ENABLE_PRELOAD is set but not ENABLE_OVERRIDE
+
+New first class heap API allowing explicit heap control and release of entire heap in a single call
+
+Added rpaligned_calloc function for aligned and zero intialized allocations
+
+Fixed natural alignment check in rpaligned_realloc to 16 bytes (check was 32, which is wrong)
+
+Minor performance improvements for all code paths by simplified span handling
+
+Minor performance improvements and for aligned allocations with alignment less or equal to 128 bytes
+by utilizing natural block alignments
+
+Refactor finalization to be compatible with global scope data causing dynamic allocations and frees, like
+C++ objects with custom ctors/dtors
+
+Refactor thread and global cache to be array based instead of list based for improved performance
+and cache size control
+
+Added missing C++ operator overloads with ENABLE_OVERRIDE when using Microsoft C++ runtimes
+
+Fixed issue in pvalloc override that could return less than a memory page in usable size
+
+Added a missing null check in the non-hot allocation code paths
+
+
 1.4.0
 
 Improved cross thread deallocations by using per-span atomic free list to minimize thread
diff --git a/LICENSE b/LICENSE
index cf1ab25d..be01deb9 100644
--- a/LICENSE
+++ b/LICENSE
@@ -22,3 +22,31 @@ ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 OTHER DEALINGS IN THE SOFTWARE.
 
 For more information, please refer to <http://unlicense.org>
+
+
+You can also use this software under the MIT license if public domain
+is not recognized in your country
+
+
+The MIT License (MIT)
+
+Copyright (c) 2017 Mattias Jansson
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
diff --git a/README.md b/README.md
index c8149b8d..626f63ac 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,8 @@
-# rpmalloc - Rampant Pixels Memory Allocator
+# rpmalloc - General Purpose Memory Allocator
 This library provides a public domain cross platform lock free thread caching 16-byte aligned memory allocator implemented in C. The latest source code is always available at https://github.com/mjansson/rpmalloc
 
+Created by Mattias Jansson ([@maniccoder](https://twitter.com/maniccoder))  -  Support development through my [GitHub Sponsors page](https://github.com/sponsors/mjansson)
+
 Platforms currently supported:
 
 - Windows
@@ -14,10 +16,8 @@ The code should be easily portable to any platform with atomic operations and an
 
 This library is put in the public domain; you can redistribute it and/or modify it without any restrictions. Or, if you choose, you can use it under the MIT license.
 
-Created by Mattias Jansson ([@maniccoder](https://twitter.com/maniccoder))
-
 # Performance
-We believe rpmalloc is faster than most popular memory allocators like tcmalloc, hoard, ptmalloc3 and others without causing extra allocated memory overhead in the thread caches compared to these allocators. We also believe the implementation to be easier to read and modify compared to these allocators, as it is a single source file of ~2500 lines of C code. All allocations have a natural 16-byte alignment.
+We believe rpmalloc is faster than most popular memory allocators like tcmalloc, hoard, ptmalloc3 and others without causing extra allocated memory overhead in the thread caches compared to these allocators. We also believe the implementation to be easier to read and modify compared to these allocators, as it is a single source file of ~3000 lines of C code. All allocations have a natural 16-byte alignment.
 
 Contained in a parallel repository is a benchmark utility that performs interleaved unaligned allocations and deallocations (both in-thread and cross-thread) in multiple threads. It measures number of memory operations performed per CPU second, as well as memory overhead by comparing the virtual memory mapped with the number of bytes requested in allocation calls. The setup of number of thread, cross-thread deallocation rate and allocation size limits is configured by command line arguments.
 
@@ -31,6 +31,12 @@ The benchmark producing these numbers were run on an Ubuntu 16.10 machine with 8
 
 Configuration of the thread and global caches can be important depending on your use pattern. See [CACHE](CACHE.md) for a case study and some comments/guidelines.
 
+# Required functions
+
+Before calling any other function in the API, you __MUST__ call the initization function, either __rpmalloc_initialize__ or __pmalloc_initialize_config__, or you will get undefined behaviour when calling other rpmalloc entry point.
+
+Before terminating your use of the allocator, you __SHOULD__ call __rpmalloc_finalize__ in order to release caches and unmap virtual memory, as well as prepare the allocator for global scope cleanup at process exit or dynamic library unload depending on your use case.
+
 # Using
 The easiest way to use the library is simply adding __rpmalloc.[h|c]__ to your project and compile them along with your sources. This contains only the rpmalloc specific entry points and does not provide internal hooks to process and/or thread creation at the moment. You are required to call these functions from your own code in order to initialize and finalize the allocator in your process and threads:
 
@@ -50,6 +56,8 @@ Then simply use the __rpmalloc__/__rpfree__ and the other malloc style replaceme
 
 If you wish to override the standard library malloc family of functions and have automatic initialization/finalization of process and threads, define __ENABLE_OVERRIDE__ to non-zero which will include the `malloc.c` file in compilation of __rpmalloc.c__. The list of libc entry points replaced may not be complete, use libc replacement only as a convenience for testing the library on an existing code base, not a final solution.
 
+For explicit first class heaps, see the __rpmalloc_heap_*__ API under [first class heaps](#first-class-heaps) section, requiring __RPMALLOC_FIRST_CLASS_HEAPS__ tp be defined to 1.
+
 # Building
 To compile as a static library run the configure python script which generates a Ninja build script, then build using ninja. The ninja build produces two static libraries, one named `rpmalloc` and one named `rpmallocwrap`, where the latter includes the libc entry point overrides.
 
@@ -79,11 +87,11 @@ Integer safety checks on all calls are enabled if __ENABLE_VALIDATE_ARGS__ is de
 
 Asserts are enabled if __ENABLE_ASSERTS__ is defined to 1 (default is 0, or disabled), either on compile command line or by setting the value in `rpmalloc.c`.
 
-Overwrite and underwrite guards are enabled if __ENABLE_GUARDS__ is defined to 1 (default is 0, or disabled), either on compile command line or by settings the value in `rpmalloc.c`. This will introduce up to 64 byte overhead on each allocation to store magic numbers, which will be verified when freeing the memory block. The actual overhead is dependent on the requested size compared to size class limits.
-
 To include __malloc.c__ in compilation and provide overrides of standard library malloc entry points define __ENABLE_OVERRIDE__ to 1. To enable automatic initialization of finalization of process and threads in order to preload the library into executables using standard library malloc, define __ENABLE_PRELOAD__ to 1.
 
-To enable the runtime configurable memory page and span sizes, define __ENABLE_CONFIGURABLE__ to 1. By default, memory page size is determined by system APIs and memory span size is set to 64KiB.
+To enable the runtime configurable memory page and span sizes, define __RPMALLOC_CONFIGURABLE__ to 1. By default, memory page size is determined by system APIs and memory span size is set to 64KiB.
+
+To enable support for first class heaps, define __RPMALLOC_FIRST_CLASS_HEAPS__ to 1. By default, the first class heap API is disabled.
 
 # Huge pages
 The allocator has support for huge/large pages on Windows, Linux and MacOS. To enable it, pass a non-zero value in the config value `enable_huge_pages` when initializing the allocator with `rpmalloc_initialize_config`. If the system does not support huge pages it will be automatically disabled. You can query the status by looking at `enable_huge_pages` in the config returned from a call to `rpmalloc_config` after initialization is done.
@@ -122,11 +130,6 @@ A span that is a subspan of a larger super span can be individually decommitted
 
 If you use a custom memory map/unmap function you need to take this into account by looking at the `release` parameter given to the `memory_unmap` function. It is set to 0 for decommitting invididual pages and the total super span byte size for finally releasing the entire super span memory range.
 
-# Memory guards
-If you define the __ENABLE_GUARDS__ to 1, all memory allocations will be padded with extra guard areas before and after the memory block (while still honoring the requested alignment). These dead zones will be filled with a pattern and checked when the block is freed. If the patterns are not intact the callback set in initialization config is called, or if not set an assert is fired.
-
-Note that the end of the memory block in this case is defined by the total usable size of the block as returned by `rpmalloc_usable_size`, which can be larger than the size passed to allocation request due to size class buckets.
-
 # Memory fragmentation
 There is no memory fragmentation by the allocator in the sense that it will not leave unallocated and unusable "holes" in the memory pages by calls to allocate and free blocks of different sizes. This is due to the fact that the memory pages allocated for each size class is split up in perfectly aligned blocks which are not reused for a request of a different size. The block freed by a call to `rpfree` will always be immediately available for an allocation request within the same size class.
 
@@ -134,6 +137,9 @@ However, there is memory fragmentation in the meaning that a request for x bytes
 
 rpmalloc keeps an "active span" and free list for each size class. This leads to back-to-back allocations will most likely be served from within the same span of memory pages (unless the span runs out of free blocks). The rpmalloc implementation will also use any "holes" in memory pages in semi-filled spans before using a completely free span.
 
+# First class heaps
+rpmalloc provides a first class heap type with explicit heap control API. Heaps are maintained with calls to __rpmalloc_heap_acquire__ and __rpmalloc_heap_release__ and allocations/frees are done with __rpmalloc_heap_alloc__ and __rpmalloc_heap_free__. See the `rpmalloc.h` documentation for the full list of functions in the heap API. The main use case of explicit heap control is to scope allocations in a heap and release everything with a single call to __rpmalloc_heap_free_all__ without having to maintain ownership of memory blocks. Note that the heap API is not thread-safe, the caller must make sure that each heap is only used in a single thread at any given time.
+
 # Producer-consumer scenario
 Compared to the some other allocators, rpmalloc does not suffer as much from a producer-consumer thread scenario where one thread allocates memory blocks and another thread frees the blocks. In some allocators the free blocks need to traverse both the thread cache of the thread doing the free operations as well as the global cache before being reused in the allocating thread. In rpmalloc the freed blocks will be reused as soon as the allocating thread needs to get new spans from the thread cache. This enables faster release of completely freed memory pages as blocks in a memory page will not be aliased between different owning threads.
 
@@ -148,12 +154,18 @@ Since each thread cache maps spans of memory pages per size class, a thread that
 Threads that perform a lot of allocations and deallocations in a pattern that have a large difference in high and low water marks, and that difference is larger than the thread cache size, will put a lot of contention on the global cache. What will happen is the thread cache will overflow on each low water mark causing pages to be released to the global cache, then underflow on high water mark causing pages to be re-acquired from the global cache. This can be mitigated by changing the __MAX_SPAN_CACHE_DIVISOR__ define in the source code (at the cost of higher average memory overhead).
 
 # Caveats
-Cross-thread deallocations could leave dangling spans in the owning thread heap partially used list if the deallocation is the last used block in the span and the span is previously marked as partial (at least one block deallocated by the owning thread). However, an optimization for GC like use cases is that if all the blocks in the span are freed by other threads, the span can immediately be inserted in the owning thread span cache.
-
 VirtualAlloc has an internal granularity of 64KiB. However, mmap lacks this granularity control, and the implementation instead oversizes the memory mapping with configured span size to be able to always return a memory area with the required alignment. Since the extra memory pages are never touched this will not result in extra committed physical memory pages, but rather only increase virtual memory address space.
 
 All entry points assume the passed values are valid, for example passing an invalid pointer to free would most likely result in a segmentation fault. __The library does not try to guard against errors!__.
 
+To support global scope data doing dynamic allocation/deallocation such as C++ objects with custom constructors and destructors, the call to __rpmalloc_finalize__ will not completely terminate the allocator but rather empty all caches and put the allocator in finalization mode. Once this call has been made, the allocator is no longer thread safe and expects all remaining calls to originate from global data destruction on main thread. Any spans or heaps becoming free during this phase will be immediately unmapped to allow correct teardown of the process or dynamic library without any leaks.
+
+# Other languages
+
+[Johan Andersson](https://github.com/repi) at Embark has created a Rust wrapper available at [rpmalloc-rs](https://github.com/EmbarkStudios/rpmalloc-rs)
+
+[Stas Denisov](https://github.com/nxrighthere) has created a C# wrapper available at [Rpmalloc-CSharp](https://github.com/nxrighthere/Rpmalloc-CSharp)
+
 # License
 
 This is free and unencumbered software released into the public domain.
@@ -188,7 +200,7 @@ not recognized in your country
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Rampant Pixels AB
+Copyright (c) 2017 Mattias Jansson
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/build/msvs/rpmalloc-test.vcxproj b/build/msvs/rpmalloc-test.vcxproj
new file mode 100644
index 00000000..52078956
--- /dev/null
+++ b/build/msvs/rpmalloc-test.vcxproj
@@ -0,0 +1,213 @@
+﻿<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="15.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <ItemGroup Label="ProjectConfigurations">
+    <ProjectConfiguration Include="Debug|Win32">
+      <Configuration>Debug</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|Win32">
+      <Configuration>Release</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Debug|x64">
+      <Configuration>Debug</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|x64">
+      <Configuration>Release</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+  </ItemGroup>
+  <ItemGroup>
+    <ClCompile Include="..\..\rpmalloc\rpmalloc.c" />
+  </ItemGroup>
+  <ItemGroup>
+    <ClInclude Include="..\..\rpmalloc\rpmalloc.h" />
+  </ItemGroup>
+  <PropertyGroup Label="Globals">
+    <ProjectGuid>{01b8c8be-038d-482f-b016-3a9496ac41b0}</ProjectGuid>
+    <Keyword>Win32Proj</Keyword>
+    <RootNamespace>rpmalloc-test</RootNamespace>
+    <WindowsTargetPlatformVersion>10.0</WindowsTargetPlatformVersion>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
+    <ConfigurationType>StaticLibrary</ConfigurationType>
+    <UseDebugLibraries>true</UseDebugLibraries>
+    <PlatformToolset>v142</PlatformToolset>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
+    <ConfigurationType>StaticLibrary</ConfigurationType>
+    <UseDebugLibraries>false</UseDebugLibraries>
+    <PlatformToolset>v142</PlatformToolset>
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
+    <ConfigurationType>StaticLibrary</ConfigurationType>
+    <UseDebugLibraries>true</UseDebugLibraries>
+    <PlatformToolset>v142</PlatformToolset>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
+    <ConfigurationType>StaticLibrary</ConfigurationType>
+    <UseDebugLibraries>false</UseDebugLibraries>
+    <PlatformToolset>v142</PlatformToolset>
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+  <ImportGroup Label="ExtensionSettings">
+  </ImportGroup>
+  <ImportGroup Label="Shared">
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <PropertyGroup Label="UserMacros" />
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <OutDir>..\..\lib\windows\debug\x86\</OutDir>
+    <IntDir>$(SolutionDir)$(Platform)\$(Configuration)\$(ProjectName)\</IntDir>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+    <OutDir>..\..\lib\windows\release\x86\</OutDir>
+    <IntDir>$(SolutionDir)$(Platform)\$(Configuration)\$(ProjectName)\</IntDir>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+    <IntDir>$(SolutionDir)$(Platform)\$(Configuration)\$(ProjectName)\</IntDir>
+    <OutDir>..\..\lib\windows\debug\x86-64\</OutDir>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+    <IntDir>$(SolutionDir)$(Platform)\$(Configuration)\$(ProjectName)\</IntDir>
+    <OutDir>..\..\lib\windows\release\x86-64\</OutDir>
+  </PropertyGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <ClCompile>
+      <PrecompiledHeader>
+      </PrecompiledHeader>
+      <WarningLevel>Level3</WarningLevel>
+      <Optimization>Disabled</Optimization>
+      <PreprocessorDefinitions>ENABLE_ASSERTS=1;ENABLE_STATISTICS=1;WIN32;_DEBUG;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <MinimalRebuild>false</MinimalRebuild>
+      <ExceptionHandling>false</ExceptionHandling>
+      <BasicRuntimeChecks>Default</BasicRuntimeChecks>
+      <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
+      <BufferSecurityCheck>false</BufferSecurityCheck>
+      <FunctionLevelLinking>false</FunctionLevelLinking>
+      <EnableParallelCodeGeneration>false</EnableParallelCodeGeneration>
+      <FloatingPointModel>Fast</FloatingPointModel>
+      <FloatingPointExceptions>false</FloatingPointExceptions>
+      <CreateHotpatchableImage>false</CreateHotpatchableImage>
+      <EnableEnhancedInstructionSet>StreamingSIMDExtensions2</EnableEnhancedInstructionSet>
+      <DebugInformationFormat>ProgramDatabase</DebugInformationFormat>
+      <CompileAsManaged>false</CompileAsManaged>
+      <CompileAsWinRT>false</CompileAsWinRT>
+      <MultiProcessorCompilation>true</MultiProcessorCompilation>
+    </ClCompile>
+    <Link>
+      <SubSystem>Windows</SubSystem>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+    <ClCompile>
+      <PrecompiledHeader>
+      </PrecompiledHeader>
+      <WarningLevel>Level3</WarningLevel>
+      <Optimization>Disabled</Optimization>
+      <PreprocessorDefinitions>ENABLE_ASSERTS=1;ENABLE_STATISTICS=1;_DEBUG;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <MinimalRebuild>false</MinimalRebuild>
+      <ExceptionHandling>false</ExceptionHandling>
+      <BasicRuntimeChecks>Default</BasicRuntimeChecks>
+      <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
+      <BufferSecurityCheck>false</BufferSecurityCheck>
+      <FunctionLevelLinking>false</FunctionLevelLinking>
+      <EnableParallelCodeGeneration>false</EnableParallelCodeGeneration>
+      <FloatingPointModel>Fast</FloatingPointModel>
+      <FloatingPointExceptions>false</FloatingPointExceptions>
+      <CreateHotpatchableImage>false</CreateHotpatchableImage>
+      <DebugInformationFormat>ProgramDatabase</DebugInformationFormat>
+      <CompileAsManaged>false</CompileAsManaged>
+      <CompileAsWinRT>false</CompileAsWinRT>
+      <MultiProcessorCompilation>true</MultiProcessorCompilation>
+    </ClCompile>
+    <Link>
+      <SubSystem>Windows</SubSystem>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <PrecompiledHeader>
+      </PrecompiledHeader>
+      <Optimization>Full</Optimization>
+      <FunctionLevelLinking>false</FunctionLevelLinking>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <PreprocessorDefinitions>ENABLE_ASSERTS=1;ENABLE_STATISTICS=1;WIN32;NDEBUG;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <ExceptionHandling>false</ExceptionHandling>
+      <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
+      <BufferSecurityCheck>false</BufferSecurityCheck>
+      <EnableParallelCodeGeneration>false</EnableParallelCodeGeneration>
+      <FloatingPointModel>Fast</FloatingPointModel>
+      <FloatingPointExceptions>false</FloatingPointExceptions>
+      <CreateHotpatchableImage>false</CreateHotpatchableImage>
+      <EnableEnhancedInstructionSet>StreamingSIMDExtensions2</EnableEnhancedInstructionSet>
+      <CompileAsManaged>false</CompileAsManaged>
+      <CompileAsWinRT>false</CompileAsWinRT>
+      <InlineFunctionExpansion>AnySuitable</InlineFunctionExpansion>
+      <FavorSizeOrSpeed>Speed</FavorSizeOrSpeed>
+      <OmitFramePointers>true</OmitFramePointers>
+      <EnableFiberSafeOptimizations>true</EnableFiberSafeOptimizations>
+      <MultiProcessorCompilation>true</MultiProcessorCompilation>
+      <StringPooling>true</StringPooling>
+    </ClCompile>
+    <Link>
+      <SubSystem>Windows</SubSystem>
+      <EnableCOMDATFolding>true</EnableCOMDATFolding>
+      <OptimizeReferences>true</OptimizeReferences>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <PrecompiledHeader>
+      </PrecompiledHeader>
+      <Optimization>Full</Optimization>
+      <FunctionLevelLinking>false</FunctionLevelLinking>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <PreprocessorDefinitions>ENABLE_ASSERTS=1;ENABLE_STATISTICS=1;NDEBUG;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <ExceptionHandling>false</ExceptionHandling>
+      <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
+      <BufferSecurityCheck>false</BufferSecurityCheck>
+      <EnableParallelCodeGeneration>false</EnableParallelCodeGeneration>
+      <FloatingPointModel>Fast</FloatingPointModel>
+      <FloatingPointExceptions>false</FloatingPointExceptions>
+      <CreateHotpatchableImage>false</CreateHotpatchableImage>
+      <CompileAsManaged>false</CompileAsManaged>
+      <CompileAsWinRT>false</CompileAsWinRT>
+      <InlineFunctionExpansion>AnySuitable</InlineFunctionExpansion>
+      <FavorSizeOrSpeed>Speed</FavorSizeOrSpeed>
+      <OmitFramePointers>true</OmitFramePointers>
+      <EnableFiberSafeOptimizations>true</EnableFiberSafeOptimizations>
+      <MultiProcessorCompilation>true</MultiProcessorCompilation>
+      <StringPooling>true</StringPooling>
+    </ClCompile>
+    <Link>
+      <SubSystem>Windows</SubSystem>
+      <EnableCOMDATFolding>true</EnableCOMDATFolding>
+      <OptimizeReferences>true</OptimizeReferences>
+    </Link>
+  </ItemDefinitionGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+  <ImportGroup Label="ExtensionTargets">
+  </ImportGroup>
+</Project>
\ No newline at end of file
diff --git a/build/msvs/rpmalloc.sln b/build/msvs/rpmalloc.sln
index 03e6ccf2..fa2456a8 100644
--- a/build/msvs/rpmalloc.sln
+++ b/build/msvs/rpmalloc.sln
@@ -1,12 +1,14 @@
 ﻿
 Microsoft Visual Studio Solution File, Format Version 12.00
-# Visual Studio 16
+# Visual Studio Version 16
 VisualStudioVersion = 16.0.28803.202
 MinimumVisualStudioVersion = 10.0.40219.1
 Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "rpmalloc", "rpmalloc.vcxproj", "{65DC4291-954E-4B91-8889-4F3ADCC9D2D5}"
 EndProject
 Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "test", "test.vcxproj", "{C31980DD-1241-4EF8-A351-69DAF982A7B9}"
 EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "rpmalloc-test", "rpmalloc-test.vcxproj", "{01B8C8BE-038D-482F-B016-3A9496AC41B0}"
+EndProject
 Global
 	GlobalSection(SolutionConfigurationPlatforms) = preSolution
 		Debug|x64 = Debug|x64
@@ -31,6 +33,14 @@ Global
 		{C31980DD-1241-4EF8-A351-69DAF982A7B9}.Release|x64.Build.0 = Release|x64
 		{C31980DD-1241-4EF8-A351-69DAF982A7B9}.Release|x86.ActiveCfg = Release|Win32
 		{C31980DD-1241-4EF8-A351-69DAF982A7B9}.Release|x86.Build.0 = Release|Win32
+		{01B8C8BE-038D-482F-B016-3A9496AC41B0}.Debug|x64.ActiveCfg = Debug|x64
+		{01B8C8BE-038D-482F-B016-3A9496AC41B0}.Debug|x64.Build.0 = Debug|x64
+		{01B8C8BE-038D-482F-B016-3A9496AC41B0}.Debug|x86.ActiveCfg = Debug|Win32
+		{01B8C8BE-038D-482F-B016-3A9496AC41B0}.Debug|x86.Build.0 = Debug|Win32
+		{01B8C8BE-038D-482F-B016-3A9496AC41B0}.Release|x64.ActiveCfg = Release|x64
+		{01B8C8BE-038D-482F-B016-3A9496AC41B0}.Release|x64.Build.0 = Release|x64
+		{01B8C8BE-038D-482F-B016-3A9496AC41B0}.Release|x86.ActiveCfg = Release|Win32
+		{01B8C8BE-038D-482F-B016-3A9496AC41B0}.Release|x86.Build.0 = Release|Win32
 	EndGlobalSection
 	GlobalSection(SolutionProperties) = preSolution
 		HideSolutionNode = FALSE
diff --git a/build/msvs/test.vcxproj b/build/msvs/test.vcxproj
index 2ba1fec2..ca051ef4 100644
--- a/build/msvs/test.vcxproj
+++ b/build/msvs/test.vcxproj
@@ -27,8 +27,8 @@
     <ClInclude Include="..\..\test\thread.h" />
   </ItemGroup>
   <ItemGroup>
-    <ProjectReference Include="rpmalloc.vcxproj">
-      <Project>{65dc4291-954e-4b91-8889-4f3adcc9d2d5}</Project>
+    <ProjectReference Include="rpmalloc-test.vcxproj">
+      <Project>{01b8c8be-038d-482f-b016-3a9496ac41b0}</Project>
     </ProjectReference>
   </ItemGroup>
   <PropertyGroup Label="Globals">
diff --git a/build/ninja/android.py b/build/ninja/android.py
index 82fba807..7d0a1fa4 100644
--- a/build/ninja/android.py
+++ b/build/ninja/android.py
@@ -81,7 +81,7 @@ def initialize_toolchain(self):
       else:
         self.hostarchname = 'windows-x86'
     elif self.host.is_linux():
-        localarch = subprocess.check_output(['uname', '-m']).strip()
+        localarch = toolchain.check_output(['uname', '-m'])
         if localarch == 'x86_64':
           self.hostarchname = 'linux-x86_64'
         else:
diff --git a/build/ninja/clang.py b/build/ninja/clang.py
index e9ddc508..bd4f8217 100644
--- a/build/ninja/clang.py
+++ b/build/ninja/clang.py
@@ -15,13 +15,16 @@ def initialize(self, project, archs, configs, includepaths, dependlibs, libpaths
     self.sdkpath = ''
     self.includepaths = []
     self.libpaths = libpaths
-    self.ccompiler = 'clang'
-    self.cxxcompiler = 'clang++'
-    self.archiver = 'ar'
-    self.linker = 'clang'
-    self.cxxlinker = 'clang++'
+    self.ccompiler = os.environ.get('CC') or 'clang'
+    self.cxxcompiler = os.environ.get('CXX') or 'clang++'
     if self.target.is_windows():
-      self.archiver = 'llvm-ar'
+      self.archiver = os.environ.get('AR') or 'llvm-ar'
+      self.linker = os.environ.get('CC') or 'lld-link'
+      self.cxxlinker = os.environ.get('CXX') or 'lld-link'
+    else:
+      self.archiver = os.environ.get('AR') or 'ar'
+      self.linker = os.environ.get('CC') or 'clang'
+      self.cxxlinker = os.environ.get('CXX') or 'clang++'
 
     #Default variables
     self.sysroot = ''
@@ -31,12 +34,16 @@ def initialize(self, project, archs, configs, includepaths, dependlibs, libpaths
       self.deploymenttarget = '10.7'
 
     #Command definitions
-    self.cccmd = '$toolchain$cc -MMD -MT $out -MF $out.d $includepaths $moreincludepaths $cflags $carchflags $cconfigflags $cmoreflags -c $in -o $out'
-    self.cxxcmd = '$toolchain$cxx -MMD -MT $out -MF $out.d $includepaths $moreincludepaths $cxxflags $carchflags $cconfigflags $cmoreflags -c $in -o $out'
+    self.cccmd = '$toolchain$cc -MMD -MT $out -MF $out.d $includepaths $moreincludepaths $cflags $carchflags $cconfigflags $cmoreflags $cenvflags -c $in -o $out'
+    self.cxxcmd = '$toolchain$cxx -MMD -MT $out -MF $out.d $includepaths $moreincludepaths $cxxflags $carchflags $cconfigflags $cmoreflags $cxxenvflags -c $in -o $out'
     self.ccdeps = 'gcc'
     self.ccdepfile = '$out.d'
-    self.arcmd = self.rmcmd('$out') + ' && $toolchain$ar crsD $ararchflags $arflags $out $in'
-    self.linkcmd = '$toolchain$link $libpaths $configlibpaths $linkflags $linkarchflags $linkconfigflags -o $out $in $libs $archlibs $oslibs $frameworks'
+    self.arcmd = self.rmcmd('$out') + ' && $toolchain$ar crsD $ararchflags $arflags $arenvflags $out $in'
+    if self.target.is_windows():
+      self.linkcmd = '$toolchain$link $libpaths $configlibpaths $linkflags $linkarchflags $linkconfigflags $linkenvflags /debug /nologo /subsystem:console /dynamicbase /nxcompat /manifest /manifestuac:\"level=\'asInvoker\' uiAccess=\'false\'\" /tlbid:1 /pdb:$pdbpath /out:$out $in $libs $archlibs $oslibs $frameworks'
+      self.dllcmd = self.linkcmd + ' /dll'
+    else:
+      self.linkcmd = '$toolchain$link $libpaths $configlibpaths $linkflags $linkarchflags $linkconfigflags $linkenvflags -o $out $in $libs $archlibs $oslibs $frameworks'
 
     #Base flags
     self.cflags = ['-D' + project.upper() + '_COMPILE=1',
@@ -44,11 +51,12 @@ def initialize(self, project, archs, configs, includepaths, dependlibs, libpaths
                    '-fomit-frame-pointer', '-fno-math-errno','-ffinite-math-only', '-funsafe-math-optimizations',
                    '-fno-trapping-math', '-ffast-math']
     self.cwarnflags = ['-W', '-Werror', '-pedantic', '-Wall', '-Weverything',
-                       '-Wno-padded', '-Wno-documentation-unknown-command', '-Wno-static-in-inline']
+                       '-Wno-c++98-compat', '-Wno-padded', '-Wno-documentation-unknown-command',
+                       '-Wno-implicit-fallthrough', '-Wno-static-in-inline', '-Wno-reserved-id-macro']
     self.cmoreflags = []
     self.mflags = []
     self.arflags = []
-    self.linkflags = ['-fomit-frame-pointer']
+    self.linkflags = []
     self.oslibs = []
     self.frameworks = []
 
@@ -62,13 +70,16 @@ def initialize(self, project, archs, configs, includepaths, dependlibs, libpaths
     self.parse_default_variables(variables)
     self.read_build_prefs()
 
-    if self.target.is_linux() or self.target.is_bsd() or self.target.is_raspberrypi():
+    if self.target.is_linux() or self.target.is_bsd() or self.target.is_raspberrypi() or self.target.is_sunos():
       self.cflags += ['-D_GNU_SOURCE=1']
       self.linkflags += ['-pthread']
+      self.oslibs += ['m']
     if self.target.is_linux() or self.target.is_raspberrypi():
       self.oslibs += ['dl']
     if self.target.is_bsd():
       self.oslibs += ['execinfo']
+    if not self.target.is_windows():
+      self.linkflags += ['-fomit-frame-pointer']
 
     self.includepaths = self.prefix_includepaths((includepaths or []) + ['.'])
 
@@ -84,11 +95,11 @@ def initialize(self, project, archs, configs, includepaths, dependlibs, libpaths
       self.cflags += ['-w']
     self.cxxflags = list(self.cflags)
 
-    self.cflags += ['-std=gnu11']
+    self.cflags += ['-std=c11']
     if self.target.is_macos() or self.target.is_ios():
       self.cxxflags += ['-std=c++14', '-stdlib=libc++']
     else:
-      self.cxxflags += ['-std=gnu++14']
+      self.cxxflags += ['-std=c++14']
 
     #Overrides
     self.objext = '.o'
@@ -155,18 +166,24 @@ def write_variables(self, writer):
     writer.variable('carchflags', '')
     writer.variable('cconfigflags', '')
     writer.variable('cmoreflags', self.cmoreflags)
+    writer.variable('cenvflags', (os.environ.get('CFLAGS') or '').split())
+    writer.variable('cxxenvflags', (os.environ.get('CXXFLAGS') or '').split())
     writer.variable('arflags', self.arflags)
     writer.variable('ararchflags', '')
     writer.variable('arconfigflags', '')
+    writer.variable('arenvflags', (os.environ.get('ARFLAGS') or '').split())
     writer.variable('linkflags', self.linkflags)
     writer.variable('linkarchflags', '')
     writer.variable('linkconfigflags', '')
+    writer.variable('linkenvflags', (os.environ.get('LDFLAGS') or '').split())
     writer.variable('libs', '')
     writer.variable('libpaths', self.make_libpaths(self.libpaths))
     writer.variable('configlibpaths', '')
     writer.variable('archlibs', '')
     writer.variable('oslibs', self.make_libs(self.oslibs))
     writer.variable('frameworks', '')
+    if self.target.is_windows():
+      writer.variable('pdbpath', 'ninja.pdb')
     writer.newline()
 
   def write_rules(self, writer):
@@ -178,7 +195,10 @@ def write_rules(self, writer):
       writer.rule( 'lipo', command = self.lipocmd, description = 'LIPO $out' )
     writer.rule('ar', command = self.arcmd, description = 'LIB $out')
     writer.rule('link', command = self.linkcmd, description = 'LINK $out')
-    writer.rule('so', command = self.linkcmd, description = 'SO $out')
+    if self.target.is_windows():
+      writer.rule('dll', command = self.dllcmd, description = 'DLL $out')
+    else:
+      writer.rule('so', command = self.linkcmd, description = 'SO $out')
     writer.newline()
 
   def build_toolchain(self):
@@ -230,15 +250,15 @@ def build_xcode_toolchain(self):
       self.linkflags += ['-isysroot', '$sysroot']
     self.cflags += ['-fembed-bitcode-marker']
 
-    platformpath = subprocess.check_output(['xcrun', '--sdk', sdk, '--show-sdk-platform-path']).strip()
+    platformpath = toolchain.check_output(['xcrun', '--sdk', sdk, '--show-sdk-platform-path'])
     localpath = platformpath + "/Developer/usr/bin:/Applications/Xcode.app/Contents/Developer/usr/bin:/usr/bin:/bin:/usr/sbin:/sbin"
 
-    self.sysroot = subprocess.check_output(['xcrun', '--sdk', sdk, '--show-sdk-path']).strip()
+    self.sysroot = toolchain.check_output(['xcrun', '--sdk', sdk, '--show-sdk-path'])
 
-    self.ccompiler = "PATH=" + localpath + " " + subprocess.check_output(['xcrun', '--sdk', sdk, '-f', 'clang']).strip()
-    self.archiver = "PATH=" + localpath + " " + subprocess.check_output(['xcrun', '--sdk', sdk, '-f', 'libtool']).strip()
+    self.ccompiler = "PATH=" + localpath + " " + toolchain.check_output(['xcrun', '--sdk', sdk, '-f', 'clang'])
+    self.archiver = "PATH=" + localpath + " " + toolchain.check_output(['xcrun', '--sdk', sdk, '-f', 'libtool'])
     self.linker = deploytarget + " " + self.ccompiler
-    self.lipo = "PATH=" + localpath + " " + subprocess.check_output(['xcrun', '--sdk', sdk, '-f', 'lipo']).strip()
+    self.lipo = "PATH=" + localpath + " " + toolchain.check_output(['xcrun', '--sdk', sdk, '-f', 'lipo'])
 
     self.mflags += list(self.cflags) + ['-fobjc-arc', '-fno-objc-exceptions', '-x', 'objective-c']
     self.cflags += ['-x', 'c']
@@ -264,7 +284,7 @@ def make_libpath(self, path):
   def make_libpaths(self, libpaths):
     if not libpaths is None:
       if self.target.is_windows():
-        return ['-Xlinker /LIBPATH:' + self.path_escape(path) for path in libpaths]
+        return ['/libpath:' + self.path_escape(path) for path in libpaths]
       return ['-L' + self.make_libpath(path) for path in libpaths]
     return []
 
@@ -292,13 +312,18 @@ def make_targetarchflags(self, arch, targettype):
       flags += ['-gcc-toolchain', self.android.make_gcc_toolchain_path(arch)]
     elif self.target.is_macos() or self.target.is_ios():
       if arch == 'x86':
-        flags += [' -arch x86']
+        flags += ['-arch', 'x86']
       elif arch == 'x86-64':
-        flags += [' -arch x86_64']
+        flags += ['-arch', 'x86_64']
       elif arch == 'arm7':
-        flags += [' -arch armv7']
+        flags += ['-arch', 'armv7']
       elif arch == 'arm64':
-        flags += [' -arch arm64']
+        flags += ['-arch', 'arm64']
+    elif self.target.is_windows():
+      if arch == 'x86':
+        flags += ['-target', 'x86-pc-windows-msvc']
+      elif arch == 'x64':
+        flags += ['-target', 'x86_64-pc-windows-msvc']
     else:
       if arch == 'x86':
         flags += ['-m32']
@@ -310,21 +335,21 @@ def make_carchflags(self, arch, targettype):
     flags = []
     if targettype == 'sharedlib':
       flags += ['-DBUILD_DYNAMIC_LINK=1']
-      if self.target.is_linux() or self.target.is_bsd():
+      if self.target.is_linux() or self.target.is_bsd() or self.target.is_sunos():
        flags += ['-fPIC']
     flags += self.make_targetarchflags(arch, targettype)
     return flags
 
   def make_cconfigflags(self, config, targettype):
-    flags = []
+    flags = ['-g']
     if config == 'debug':
-      flags += ['-DBUILD_DEBUG=1', '-g']
+      flags += ['-DBUILD_DEBUG=1']
     elif config == 'release':
-      flags += ['-DBUILD_RELEASE=1', '-DNDEBUG', '-O3', '-g', '-funroll-loops']
+      flags += ['-DBUILD_RELEASE=1', '-O3', '-funroll-loops']
     elif config == 'profile':
-      flags += ['-DBUILD_PROFILE=1', '-DNDEBUG', '-O3', '-g', '-funroll-loops']
+      flags += ['-DBUILD_PROFILE=1', '-O3', '-funroll-loops']
     elif config == 'deploy':
-      flags += ['-DBUILD_DEPLOY=1', '-DNDEBUG', '-O3', '-g', '-funroll-loops']
+      flags += ['-DBUILD_DEPLOY=1', '-O3', '-funroll-loops']
     return flags
 
   def make_ararchflags(self, arch, targettype):
@@ -342,10 +367,12 @@ def make_linkarchflags(self, arch, targettype, variables):
       if arch == 'arm7':
         flags += ['-Wl,--no-warn-mismatch', '-Wl,--fix-cortex-a8']
     if self.target.is_windows():
+      # Ignore target arch flags from above, add link style arch instead
+      flags = []
       if arch == 'x86':
-        flags += ['-Xlinker', '/MACHINE:X86']
+        flags += ['/machine:x86']
       elif arch == 'x86-64':
-        flags += ['-Xlinker', '/MACHINE:X64']
+        flags += ['/machine:x64']
     if self.target.is_macos() and variables != None and 'support_lua' in variables and variables['support_lua']:
       flags += ['-pagezero_size', '10000', '-image_base', '100000000']
     return flags
@@ -353,18 +380,19 @@ def make_linkarchflags(self, arch, targettype, variables):
   def make_linkconfigflags(self, config, targettype, variables):
     flags = []
     if self.target.is_windows():
-      if targettype == 'sharedlib':
-        flags += ['-Xlinker', '/DLL']
-      elif targettype == 'bin':
-        flags += ['-Xlinker', '/SUBSYSTEM:CONSOLE']
+      if config == 'debug':
+        flags += ['/incremental', '/defaultlib:libcmtd']
+      else:
+        flags += ['/incremental:no', '/opt:ref', '/opt:icf', '/defaultlib:libcmt']
     elif self.target.is_macos() or self.target.is_ios():
       if targettype == 'sharedlib' or targettype == 'multisharedlib':
         flags += ['-dynamiclib']
     else:
       if targettype == 'sharedlib':
         flags += ['-shared', '-fPIC']
-    if config == 'release':
-      flags += ['-DNDEBUG', '-O3']
+    if config != 'debug':
+      if targettype == 'bin' or targettype == 'sharedlib':
+        flags += ['-flto']
     return flags
 
   def make_linkarchlibs(self, arch, targettype):
@@ -379,6 +407,8 @@ def make_linkarchlibs(self, arch, targettype):
 
   def make_libs(self, libs):
     if libs != None:
+      if self.target.is_windows():
+        return [lib + ".lib" for lib in libs]
       return ['-l' + lib for lib in libs]
     return []
 
@@ -450,7 +480,7 @@ def link_variables(self, config, arch, targettype, variables):
       localframeworks += list(variables['frameworks'])
     if len(localframeworks) > 0:
       localvariables += [('frameworks', self.make_frameworks(list(localframeworks)))]
-      
+
     libpaths = []
     if 'libpaths' in variables:
       libpaths = variables['libpaths']
@@ -479,6 +509,8 @@ def builder_lib(self, writer, config, arch, targettype, infiles, outfile, variab
     return writer.build(outfile, 'ar', infiles, implicit = self.implicit_deps(config, variables), variables = self.ar_variables(config, arch, targettype, variables))
 
   def builder_sharedlib(self, writer, config, arch, targettype, infiles, outfile, variables):
+    if self.target.is_windows():
+      return writer.build(outfile, 'dll', infiles, implicit = self.implicit_deps(config, variables), variables = self.link_variables(config, arch, targettype, variables))
     return writer.build(outfile, 'so', infiles, implicit = self.implicit_deps(config, variables), variables = self.link_variables(config, arch, targettype, variables))
 
   def builder_bin(self, writer, config, arch, targettype, infiles, outfile, variables):
diff --git a/build/ninja/gcc.py b/build/ninja/gcc.py
index 21fc2e49..299be53f 100644
--- a/build/ninja/gcc.py
+++ b/build/ninja/gcc.py
@@ -13,19 +13,19 @@ def initialize(self, project, archs, configs, includepaths, dependlibs, libpaths
     self.toolchain = ''
     self.includepaths = []
     self.libpaths = libpaths
-    self.ccompiler = 'gcc'
-    self.cxxcompiler = 'g++'
-    self.archiver = 'ar'
-    self.linker = 'gcc'
-    self.cxxlinker = 'g++'
+    self.ccompiler = os.environ.get('CC') or 'gcc'
+    self.cxxcompiler = os.environ.get('CXX') or 'g++'
+    self.archiver = os.environ.get('AR') or 'ar'
+    self.linker = os.environ.get('CC') or 'gcc'
+    self.cxxlinker = os.environ.get('CXX') or 'g++'
 
     #Command definitions
-    self.cccmd = '$toolchain$cc -MMD -MT $out -MF $out.d $includepaths $moreincludepaths $cflags $carchflags $cconfigflags $cmoreflags -c $in -o $out'
-    self.cxxcmd = '$toolchain$cxx -MMD -MT $out -MF $out.d $includepaths $moreincludepaths $cxxflags $carchflags $cconfigflags $cmoreflags -c $in -o $out'
+    self.cccmd = '$toolchain$cc -MMD -MT $out -MF $out.d $includepaths $moreincludepaths $cflags $carchflags $cconfigflags $cmoreflags $cenvflags -c $in -o $out'
+    self.cxxcmd = '$toolchain$cxx -MMD -MT $out -MF $out.d $includepaths $moreincludepaths $cxxflags $carchflags $cconfigflags $cmoreflags $cxxenvflags -c $in -o $out'
     self.ccdeps = 'gcc'
     self.ccdepfile = '$out.d'
-    self.arcmd = self.rmcmd('$out') + ' && $toolchain$ar crsD $ararchflags $arflags $out $in'
-    self.linkcmd = '$toolchain$link $libpaths $configlibpaths $linkflags $linkarchflags $linkconfigflags -o $out $in $libs $archlibs $oslibs'
+    self.arcmd = self.rmcmd('$out') + ' && $toolchain$ar crsD $ararchflags $arflags $arenvflags $out $in'
+    self.linkcmd = '$toolchain$link $libpaths $configlibpaths $linkflags $linkarchflags $linkconfigflags $linkenvflags -o $out $in $libs $archlibs $oslibs'
 
     #Base flags
     self.cflags = ['-D' + project.upper() + '_COMPILE=1',
@@ -49,7 +49,7 @@ def initialize(self, project, archs, configs, includepaths, dependlibs, libpaths
     self.parse_default_variables(variables)
     self.read_build_prefs()
 
-    if self.target.is_linux() or self.target.is_bsd() or self.target.is_raspberrypi():
+    if self.target.is_linux() or self.target.is_bsd() or self.target.is_raspberrypi() or self.target.is_sunos():
       self.cflags += ['-D_GNU_SOURCE=1']
       self.linkflags += ['-pthread']
     if self.target.is_linux() or self.target.is_raspberrypi():
@@ -118,12 +118,16 @@ def write_variables(self, writer):
     writer.variable('carchflags', '')
     writer.variable('cconfigflags', '')
     writer.variable('cmoreflags', self.cmoreflags)
+    writer.variable('cenvflags', (os.environ.get('CFLAGS') or '').split())
+    writer.variable('cxxenvflags', (os.environ.get('CXXFLAGS') or '').split())
     writer.variable('arflags', self.arflags)
     writer.variable('ararchflags', '')
     writer.variable('arconfigflags', '')
+    writer.variable('arenvflags', (os.environ.get('ARFLAGS') or '').split())
     writer.variable('linkflags', self.linkflags)
     writer.variable('linkarchflags', '')
     writer.variable('linkconfigflags', '')
+    writer.variable('linkenvflags', (os.environ.get('LDFLAGS') or '').split())
     writer.variable('libs', '')
     writer.variable('libpaths', self.make_libpaths(self.libpaths))
     writer.variable('configlibpaths', '')
@@ -182,7 +186,7 @@ def make_carchflags(self, arch, targettype):
     flags = []
     if targettype == 'sharedlib':
       flags += ['-DBUILD_DYNAMIC_LINK=1']
-      if self.target.is_linux() or self.target.is_bsd():
+      if self.target.is_linux() or self.target.is_bsd() or self.target.is_sunos():
         flags += ['-fPIC']
     flags += self.make_targetarchflags(arch, targettype)
     return flags
diff --git a/build/ninja/platform.py b/build/ninja/platform.py
index 68cf0ab1..cf91c14b 100644
--- a/build/ninja/platform.py
+++ b/build/ninja/platform.py
@@ -5,7 +5,7 @@
 import sys
 
 def supported_platforms():
-  return [ 'windows', 'linux', 'macos', 'bsd', 'ios', 'android', 'raspberrypi', 'tizen' ]
+  return [ 'windows', 'linux', 'macos', 'bsd', 'ios', 'android', 'raspberrypi', 'tizen', 'sunos' ]
 
 class Platform(object):
   def __init__(self, platform):
@@ -30,6 +30,8 @@ def __init__(self, platform):
       self.platform = 'raspberrypi'
     elif self.platform.startswith('tizen'):
       self.platform = 'tizen'
+    elif self.platform.startswith('sunos'):
+      self.platform = 'sunos'
 
   def platform(self):
     return self.platform
@@ -58,5 +60,8 @@ def is_raspberrypi(self):
   def is_tizen(self):
     return self.platform == 'tizen'
 
+  def is_sunos(self):
+    return self.platform == 'sunos'
+
   def get(self):
     return self.platform
diff --git a/build/ninja/toolchain.py b/build/ninja/toolchain.py
index 727c8586..d10d8407 100644
--- a/build/ninja/toolchain.py
+++ b/build/ninja/toolchain.py
@@ -14,6 +14,11 @@
 import android
 import xcode
 
+
+def check_output(args):
+  import subprocess
+  return subprocess.check_output(args).decode().strip()
+
 def supported_toolchains():
   return ['msvc', 'gcc', 'clang', 'intel']
 
@@ -127,7 +132,7 @@ def initialize_archs(self, archs):
   def initialize_default_archs(self):
     if self.target.is_windows():
       self.archs = ['x86-64']
-    elif self.target.is_linux() or self.target.is_bsd():
+    elif self.target.is_linux() or self.target.is_bsd() or self.target.is_sunos():
       localarch = subprocess.check_output(['uname', '-m']).decode().strip()
       if localarch == 'x86_64' or localarch == 'amd64':
         self.archs = ['x86-64']
diff --git a/build/ninja/xcode.py b/build/ninja/xcode.py
index 8e158d9c..3af3761e 100644
--- a/build/ninja/xcode.py
+++ b/build/ninja/xcode.py
@@ -34,13 +34,13 @@ def build_toolchain(self):
       sdk = 'iphoneos'
       deploytarget = 'IPHONEOS_DEPLOYMENT_TARGET=' + self.deploymenttarget
 
-    platformpath = subprocess.check_output(['xcrun', '--sdk', sdk, '--show-sdk-platform-path']).strip()
+    platformpath = toolchain.check_output(['xcrun', '--sdk', sdk, '--show-sdk-platform-path'])
     localpath = platformpath + "/Developer/usr/bin:/Applications/Xcode.app/Contents/Developer/usr/bin:/usr/bin:/bin:/usr/sbin:/sbin"
 
-    self.plist = "PATH=" + localpath + " " + subprocess.check_output(['xcrun', '--sdk', sdk, '-f', 'plutil']).strip()
-    self.xcassets = "PATH=" + localpath + " " + subprocess.check_output(['xcrun', '--sdk', sdk, '-f', 'actool']).strip()
-    self.xib = "PATH=" + localpath + " " + subprocess.check_output(['xcrun', '--sdk', sdk, '-f', 'ibtool']).strip()
-    self.dsymutil = "PATH=" + localpath + " " + subprocess.check_output(['xcrun', '--sdk', sdk, '-f', 'dsymutil']).strip()
+    self.plist = "PATH=" + localpath + " " + toolchain.check_output(['xcrun', '--sdk', sdk, '-f', 'plutil'])
+    self.xcassets = "PATH=" + localpath + " " + toolchain.check_output(['xcrun', '--sdk', sdk, '-f', 'actool'])
+    self.xib = "PATH=" + localpath + " " + toolchain.check_output(['xcrun', '--sdk', sdk, '-f', 'ibtool'])
+    self.dsymutil = "PATH=" + localpath + " " + toolchain.check_output(['xcrun', '--sdk', sdk, '-f', 'dsymutil'])
 
     self.plistcmd = 'build/ninja/plist.py --exename $exename --prodname $prodname --bundle $bundleidentifier --target $target --deploymenttarget $deploymenttarget --output $outpath $in'
     if self.target.is_macos():
diff --git a/configure.py b/configure.py
index b3e77dbc..514189ed 100755
--- a/configure.py
+++ b/configure.py
@@ -12,6 +12,7 @@
 generator = generator.Generator(project = 'rpmalloc', variables = [('bundleidentifier', 'com.rampantpixels.rpmalloc.$(binname)')])
 
 rpmalloc_lib = generator.lib(module = 'rpmalloc', libname = 'rpmalloc', sources = ['rpmalloc.c'])
+rpmalloc_test_lib = generator.lib(module = 'rpmalloc', libname = 'rpmalloc-test', sources = ['rpmalloc.c'], variables = {'defines': ['ENABLE_ASSERTS=1', 'ENABLE_STATISTICS=1', 'RPMALLOC_FIRST_CLASS_HEAPS=1', 'RPMALLOC_CONFIGURABLE=1']})
 
 if not generator.target.is_android() and not generator.target.is_ios():
 	rpmalloc_so = generator.sharedlib(module = 'rpmalloc', libname = 'rpmalloc', sources = ['rpmalloc.c'])
@@ -19,5 +20,5 @@
 	rpmallocwrap_so = generator.sharedlib(module = 'rpmalloc', libname = 'rpmallocwrap', sources = ['rpmalloc.c'], variables = {'defines': ['ENABLE_PRELOAD=1', 'ENABLE_OVERRIDE=1']})
 	rpmallocwrap_lib = generator.lib(module = 'rpmalloc', libname = 'rpmallocwrap', sources = ['rpmalloc.c'], variables = {'defines': ['ENABLE_PRELOAD=1', 'ENABLE_OVERRIDE=1']})
 
-	generator.bin(module = 'test', sources = ['thread.c', 'main.c'], binname = 'rpmalloc-test', implicit_deps = [rpmalloc_lib], libs = ['rpmalloc'], includepaths = ['rpmalloc', 'test'], variables = {'defines': ['ENABLE_ASSERTS=1', 'ENABLE_STATISTICS=1']})
+	generator.bin(module = 'test', sources = ['thread.c', 'main.c'], binname = 'rpmalloc-test', implicit_deps = [rpmalloc_test_lib], libs = ['rpmalloc-test'], includepaths = ['rpmalloc', 'test'], variables = {'defines': ['ENABLE_ASSERTS=1', 'ENABLE_STATISTICS=1', 'RPMALLOC_FIRST_CLASS_HEAPS=1', 'RPMALLOC_CONFIGURABLE=1']})
 	generator.bin(module = 'test', sources = ['thread.c', 'main-override.cc'], binname = 'rpmallocwrap-test', implicit_deps = [rpmallocwrap_lib], libs = ['rpmallocwrap'], includepaths = ['rpmalloc', 'test'], variables = {'runtime': 'c++', 'defines': ['ENABLE_ASSERTS=1', 'ENABLE_STATISTICS=1']})
diff --git a/rpmalloc/malloc.c b/rpmalloc/malloc.c
index 426a14ae..56becb8d 100644
--- a/rpmalloc/malloc.c
+++ b/rpmalloc/malloc.c
@@ -30,8 +30,6 @@ _Static_assert(sizeof(void*) == 4, "Data type size mismatch");
 #pragma GCC visibility push(default)
 #endif
 
-#if ENABLE_OVERRIDE
-
 #define USE_IMPLEMENT 1
 #define USE_INTERPOSE 0
 #define USE_ALIAS 0
@@ -39,6 +37,17 @@ _Static_assert(sizeof(void*) == 4, "Data type size mismatch");
 #if defined(__APPLE__) && ENABLE_PRELOAD
 #undef USE_INTERPOSE
 #define USE_INTERPOSE 1
+
+typedef struct interpose_t {
+	void* new_func;
+	void* orig_func;
+} interpose_t;
+
+#define MAC_INTERPOSE_PAIR(newf, oldf) 	{ (void*)newf, (void*)oldf }
+#define MAC_INTERPOSE_SINGLE(newf, oldf) \
+__attribute__((used)) static const interpose_t macinterpose##newf##oldf \
+__attribute__ ((section("__DATA, __interpose"))) = MAC_INTERPOSE_PAIR(newf, oldf)
+
 #endif
 
 #if !defined(_WIN32) && !USE_INTERPOSE
@@ -53,13 +62,20 @@ _Static_assert(sizeof(void*) == 4, "Data type size mismatch");
 #undef malloc
 #undef free
 #undef calloc
+#define RPMALLOC_RESTRICT __declspec(restrict)
+#else
+#define RPMALLOC_RESTRICT
 #endif
 
+#if ENABLE_OVERRIDE
+
+typedef struct rp_nothrow_t { int __dummy; } rp_nothrow_t;
+
 #if USE_IMPLEMENT
 
-extern inline void* RPMALLOC_CDECL malloc(size_t size) { return rpmalloc(size); }
-extern inline void* RPMALLOC_CDECL calloc(size_t count, size_t size) { return rpcalloc(count, size); }
-extern inline void* RPMALLOC_CDECL realloc(void* ptr, size_t size) { return rprealloc(ptr, size); }
+extern inline RPMALLOC_RESTRICT void* RPMALLOC_CDECL malloc(size_t size) { return rpmalloc(size); }
+extern inline RPMALLOC_RESTRICT void* RPMALLOC_CDECL calloc(size_t count, size_t size) { return rpcalloc(count, size); }
+extern inline RPMALLOC_RESTRICT void* RPMALLOC_CDECL realloc(void* ptr, size_t size) { return rprealloc(ptr, size); }
 extern inline void* RPMALLOC_CDECL reallocf(void* ptr, size_t size) { return rprealloc(ptr, size); }
 extern inline void* RPMALLOC_CDECL aligned_alloc(size_t alignment, size_t size) { return rpaligned_alloc(alignment, size); }
 extern inline void* RPMALLOC_CDECL memalign(size_t alignment, size_t size) { return rpmemalign(alignment, size); }
@@ -69,6 +85,9 @@ extern inline void RPMALLOC_CDECL cfree(void* ptr) { rpfree(ptr); }
 extern inline size_t RPMALLOC_CDECL malloc_usable_size(void* ptr) { return rpmalloc_usable_size(ptr); }
 extern inline size_t RPMALLOC_CDECL malloc_size(void* ptr) { return rpmalloc_usable_size(ptr); }
 
+#ifdef _WIN32
+// For Windows, #include <rpnew.h> in one source file to get the C++ operator overrides implemented in your module
+#else
 // Overload the C++ operators using the mangled names (https://itanium-cxx-abi.github.io/cxx-abi/abi.html#mangling)
 // operators delete and delete[]
 extern void _ZdlPv(void* p); void _ZdlPv(void* p) { rpfree(p); }
@@ -79,28 +98,45 @@ extern void* _Znwm(uint64_t size); void* _Znwm(uint64_t size) { return rpmalloc(
 extern void* _Znam(uint64_t size); void* _Znam(uint64_t size) { return rpmalloc(size); }
 extern void* _Znwmm(uint64_t size, uint64_t align); void* _Znwmm(uint64_t size, uint64_t align) { return rpaligned_alloc(align, size); }
 extern void* _Znamm(uint64_t size, uint64_t align); void* _Znamm(uint64_t size, uint64_t align) { return rpaligned_alloc(align, size); }
+extern void* _ZnwmSt11align_val_t(uint64_t size, uint64_t align); void* _ZnwmSt11align_val_t(uint64_t size, uint64_t align) { return rpaligned_alloc(align, size); }
+extern void* _ZnamSt11align_val_t(uint64_t size, uint64_t align); void* _ZnamSt11align_val_t(uint64_t size, uint64_t align) { return rpaligned_alloc(align, size); }
+extern void* _ZnwmRKSt9nothrow_t(uint64_t size, rp_nothrow_t t); void* _ZnwmRKSt9nothrow_t(uint64_t size, rp_nothrow_t t) { (void)sizeof(t); return rpmalloc(size); }
+extern void* _ZnamRKSt9nothrow_t(uint64_t size, rp_nothrow_t t); void* _ZnamRKSt9nothrow_t(uint64_t size, rp_nothrow_t t) { (void)sizeof(t); return rpmalloc(size); }
+extern void* _ZnwmSt11align_val_tRKSt9nothrow_t(uint64_t size, uint64_t align, rp_nothrow_t t); void* _ZnwmSt11align_val_tRKSt9nothrow_t(uint64_t size, uint64_t align, rp_nothrow_t t) { (void)sizeof(t); return rpaligned_alloc(align, size); }
+extern void* _ZnamSt11align_val_tRKSt9nothrow_t(uint64_t size, uint64_t align, rp_nothrow_t t); void* _ZnamSt11align_val_tRKSt9nothrow_t(uint64_t size, uint64_t align, rp_nothrow_t t) { (void)sizeof(t); return rpaligned_alloc(align, size); }
+// 64-bit operators sized delete and delete[], normal and aligned
+extern void _ZdlPvm(void* p, uint64_t size); void _ZdlPvm(void* p, uint64_t size) { rpfree(p); (void)sizeof(size); }
+extern void _ZdaPvm(void* p, uint64_t size); void _ZdaPvm(void* p, uint64_t size) { rpfree(p); (void)sizeof(size); }
+extern void _ZdlPvSt11align_val_t(void* p, uint64_t align); void _ZdlPvSt11align_val_t(void* p, uint64_t align) { rpfree(p); (void)sizeof(align); }
+extern void _ZdaPvSt11align_val_t(void* p, uint64_t align); void _ZdaPvSt11align_val_t(void* p, uint64_t align) { rpfree(p); (void)sizeof(align); }
+extern void _ZdlPvmSt11align_val_t(void* p, uint64_t size, uint64_t align); void _ZdlPvmSt11align_val_t(void* p, uint64_t size, uint64_t align) { rpfree(p); (void)sizeof(size); (void)sizeof(align); }
+extern void _ZdaPvmSt11align_val_t(void* p, uint64_t size, uint64_t align); void _ZdaPvmSt11align_val_t(void* p, uint64_t size, uint64_t align) { rpfree(p); (void)sizeof(size); (void)sizeof(align); }
 #else
 // 32-bit operators new and new[], normal and aligned
 extern void* _Znwj(uint32_t size); void* _Znwj(uint32_t size) { return rpmalloc(size); }
 extern void* _Znaj(uint32_t size); void* _Znaj(uint32_t size) { return rpmalloc(size); }
-extern void* _Znwjj(uint64_t size, uint64_t align); void* _Znwjj(uint64_t size, uint64_t align) { return rpaligned_alloc(align, size); }
-extern void* _Znajj(uint64_t size, uint64_t align); void* _Znajj(uint64_t size, uint64_t align) { return rpaligned_alloc(align, size); }
+extern void* _Znwjj(uint32_t size, uint32_t align); void* _Znwjj(uint32_t size, uint32_t align) { return rpaligned_alloc(align, size); }
+extern void* _Znajj(uint32_t size, uint32_t align); void* _Znajj(uint32_t size, uint32_t align) { return rpaligned_alloc(align, size); }
+extern void* _ZnwjSt11align_val_t(size_t size, size_t align); void* _ZnwjSt11align_val_t(size_t size, size_t align) { return rpaligned_alloc(align, size); }
+extern void* _ZnajSt11align_val_t(size_t size, size_t align); void* _ZnajSt11align_val_t(size_t size, size_t align) { return rpaligned_alloc(align, size); }
+extern void* _ZnwjRKSt9nothrow_t(size_t size, rp_nothrow_t t); void* _ZnwjRKSt9nothrow_t(size_t size, rp_nothrow_t t) { (void)sizeof(t); return rpmalloc(size); }
+extern void* _ZnajRKSt9nothrow_t(size_t size, rp_nothrow_t t); void* _ZnajRKSt9nothrow_t(size_t size, rp_nothrow_t t) { (void)sizeof(t); return rpmalloc(size); }
+extern void* _ZnwjSt11align_val_tRKSt9nothrow_t(size_t size, size_t align, rp_nothrow_t t); void* _ZnwjSt11align_val_tRKSt9nothrow_t(size_t size, size_t align, rp_nothrow_t t) { (void)sizeof(t); return rpaligned_alloc(align, size); }
+extern void* _ZnajSt11align_val_tRKSt9nothrow_t(size_t size, size_t align, rp_nothrow_t t); void* _ZnajSt11align_val_tRKSt9nothrow_t(size_t size, size_t align, rp_nothrow_t t) { (void)sizeof(t); return rpaligned_alloc(align, size); }
+// 32-bit operators sized delete and delete[], normal and aligned
+extern void _ZdlPvj(void* p, uint64_t size); void _ZdlPvj(void* p, uint64_t size) { rpfree(p); (void)sizeof(size); }
+extern void _ZdaPvj(void* p, uint64_t size); void _ZdaPvj(void* p, uint64_t size) { rpfree(p); (void)sizeof(size); }
+extern void _ZdlPvSt11align_val_t(void* p, uint32_t align); void _ZdlPvSt11align_val_t(void* p, uint64_t a) { rpfree(p); (void)sizeof(align); }
+extern void _ZdaPvSt11align_val_t(void* p, uint32_t align); void _ZdaPvSt11align_val_t(void* p, uint64_t a) { rpfree(p); (void)sizeof(align); }
+extern void _ZdlPvjSt11align_val_t(void* p, uint32_t size, uint32_t align); void _ZdlPvjSt11align_val_t(void* p, uint64_t size, uint64_t align) { rpfree(p); (void)sizeof(size); (void)sizeof(a); }
+extern void _ZdaPvjSt11align_val_t(void* p, uint32_t size, uint32_t align); void _ZdaPvjSt11align_val_t(void* p, uint64_t size, uint64_t align) { rpfree(p); (void)sizeof(size); (void)sizeof(a); }
+#endif
 #endif
 
 #endif
 
 #if USE_INTERPOSE
 
-typedef struct interpose_t {
-	void* new_func;
-	void* orig_func;
-} interpose_t;
-
-#define MAC_INTERPOSE_PAIR(newf, oldf) 	{ (void*)newf, (void*)oldf }
-#define MAC_INTERPOSE_SINGLE(newf, oldf) \
-__attribute__((used)) static const interpose_t macinterpose##newf##oldf \
-__attribute__ ((section("__DATA, __interpose"))) = MAC_INTERPOSE_PAIR(newf, oldf)
-
 __attribute__((used)) static const interpose_t macinterpose_malloc[]
 __attribute__ ((section("__DATA, __interpose"))) = {
 	//new and new[]
@@ -113,7 +149,9 @@ __attribute__ ((section("__DATA, __interpose"))) = {
 	MAC_INTERPOSE_PAIR(rpmalloc, calloc),
 	MAC_INTERPOSE_PAIR(rprealloc, realloc),
 	MAC_INTERPOSE_PAIR(rprealloc, reallocf),
+#if defined(__MAC_10_15) && __MAC_OS_X_VERSION_MIN_REQUIRED >= __MAC_10_15
 	MAC_INTERPOSE_PAIR(rpaligned_alloc, aligned_alloc),
+#endif
 	MAC_INTERPOSE_PAIR(rpmemalign, memalign),
 	MAC_INTERPOSE_PAIR(rpposix_memalign, posix_memalign),
 	MAC_INTERPOSE_PAIR(rpfree, free),
@@ -133,6 +171,12 @@ __attribute__ ((section("__DATA, __interpose"))) = {
 // operators delete and delete[]
 void _ZdlPv(void* p) RPALIAS(rpfree)
 void _ZdaPv(void* p) RPALIAS(rpfree)
+extern inline void _ZdlPvm(void* p, size_t n) { rpfree(p); (void)sizeof(n); }
+extern inline void _ZdaPvm(void* p, size_t n) { rpfree(p); (void)sizeof(n); }
+extern inline void _ZdlPvSt11align_val_t(void* p, size_t a) { rpfree(p); (void)sizeof(a); }
+extern inline void _ZdaPvSt11align_val_t(void* p, size_t a) { rpfree(p); (void)sizeof(a); }
+extern inline void _ZdlPvmSt11align_val_t(void* p, size_t n, size_t a) { rpfree(p); (void)sizeof(n); (void)sizeof(a); }
+extern inline void _ZdaPvmSt11align_val_t(void* p, size_t n, size_t a) { rpfree(p); (void)sizeof(n); (void)sizeof(a); }
 
 #if ARCH_64BIT
 // 64-bit operators new and new[], normal and aligned
@@ -140,12 +184,24 @@ void* _Znwm(uint64_t size) RPALIAS(rpmalloc)
 void* _Znam(uint64_t size) RPALIAS(rpmalloc)
 extern inline void* _Znwmm(uint64_t size, uint64_t align) { return rpaligned_alloc(align, size); }
 extern inline void* _Znamm(uint64_t size, uint64_t align) { return rpaligned_alloc(align, size); }
+extern inline void* _ZnwmSt11align_val_t(size_t size, size_t align) { return rpaligned_alloc(align, size); }
+extern inline void* _ZnamSt11align_val_t(size_t size, size_t align) { return rpaligned_alloc(align, size); }
+extern inline void* _ZnwmRKSt9nothrow_t(size_t size, rp_nothrow_t t) { (void)sizeof(t); return rpmalloc(size); }
+extern inline void* _ZnamRKSt9nothrow_t(size_t size, rp_nothrow_t t) { (void)sizeof(t); return rpmalloc(size); }
+extern inline void* _ZnwmSt11align_val_tRKSt9nothrow_t(size_t size, size_t align, rp_nothrow_t t) { (void)sizeof(t); return rpaligned_alloc(align, size); }
+extern inline void* _ZnamSt11align_val_tRKSt9nothrow_t(size_t size, size_t align, rp_nothrow_t t) { (void)sizeof(t); return rpaligned_alloc(align, size); }
 #else
 // 32-bit operators new and new[], normal and aligned
 void* _Znwj(uint32_t size) RPALIAS(rpmalloc)
 void* _Znaj(uint32_t size) RPALIAS(rpmalloc)
 extern inline void* _Znwjj(uint32_t size, uint32_t align) { return rpaligned_alloc(align, size); }
 extern inline void* _Znajj(uint32_t size, uint32_t align) { return rpaligned_alloc(align, size); }
+extern inline void* _ZnwjSt11align_val_t(size_t size, size_t align) { return rpaligned_alloc(align, size); }
+extern inline void* _ZnajSt11align_val_t(size_t size, size_t align) { return rpaligned_alloc(align, size); }
+extern inline void* _ZnwjRKSt9nothrow_t(size_t size, rp_nothrow_t t) { (void)sizeof(t); return rpmalloc(size); }
+extern inline void* _ZnajRKSt9nothrow_t(size_t size, rp_nothrow_t t) { (void)sizeof(t); return rpmalloc(size); }
+extern inline void* _ZnwjSt11align_val_tRKSt9nothrow_t(size_t size, size_t align, rp_nothrow_t t) { (void)sizeof(t); return rpaligned_alloc(align, size); }
+extern inline void* _ZnajSt11align_val_tRKSt9nothrow_t(size_t size, size_t align, rp_nothrow_t t) { (void)sizeof(t); return rpaligned_alloc(align, size); }
 #endif
 
 void* malloc(size_t size) RPALIAS(rpmalloc)
@@ -157,11 +213,20 @@ void* memalign(size_t alignment, size_t size) RPALIAS(rpmemalign)
 int posix_memalign(void** memptr, size_t alignment, size_t size) RPALIAS(rpposix_memalign)
 void free(void* ptr) RPALIAS(rpfree)
 void cfree(void* ptr) RPALIAS(rpfree)
+#if defined(__ANDROID__)
+size_t malloc_usable_size(const void* ptr) RPALIAS(rpmalloc_usable_size)
+#else
 size_t malloc_usable_size(void* ptr) RPALIAS(rpmalloc_usable_size)
+#endif
 size_t malloc_size(void* ptr) RPALIAS(rpmalloc_usable_size)
 
 #endif
 
+static inline size_t
+_rpmalloc_page_size(void) {
+	return _memory_page_size;
+}
+
 extern inline void* RPMALLOC_CDECL
 reallocarray(void* ptr, size_t count, size_t size) {
 	size_t total;
@@ -188,34 +253,21 @@ reallocarray(void* ptr, size_t count, size_t size) {
 extern inline void* RPMALLOC_CDECL
 valloc(size_t size) {
 	get_thread_heap();
-	if (!size)
-		size = _memory_page_size;
-	size_t total_size = size + _memory_page_size;
-#if ENABLE_VALIDATE_ARGS
-	if (total_size < size) {
-		errno = EINVAL;
-		return 0;
-	}
-#endif
-	void* buffer = rpmalloc(total_size);
-	if ((uintptr_t)buffer & (_memory_page_size - 1))
-		return (void*)(((uintptr_t)buffer & ~(_memory_page_size - 1)) + _memory_page_size);
-	return buffer;
+	return rpaligned_alloc(_rpmalloc_page_size(), size);
 }
 
 extern inline void* RPMALLOC_CDECL
 pvalloc(size_t size) {
 	get_thread_heap();
-	size_t aligned_size = size;
-	if (aligned_size % _memory_page_size)
-		aligned_size = (1 + (aligned_size / _memory_page_size)) * _memory_page_size;
+	const size_t page_size = _rpmalloc_page_size();
+	const size_t aligned_size = ((size + page_size - 1) / page_size) * page_size;
 #if ENABLE_VALIDATE_ARGS
 	if (aligned_size < size) {
 		errno = EINVAL;
 		return 0;
 	}
 #endif
-	return valloc(size);
+	return rpaligned_alloc(_rpmalloc_page_size(), aligned_size);
 }
 
 #endif // ENABLE_OVERRIDE
@@ -226,7 +278,10 @@ pvalloc(size_t size) {
 
 #if defined(BUILD_DYNAMIC_LINK) && BUILD_DYNAMIC_LINK
 
-__declspec(dllexport) BOOL WINAPI
+extern __declspec(dllexport) BOOL WINAPI
+DllMain(HINSTANCE instance, DWORD reason, LPVOID reserved);
+
+extern __declspec(dllexport) BOOL WINAPI
 DllMain(HINSTANCE instance, DWORD reason, LPVOID reserved) {
 	(void)sizeof(reserved);
 	(void)sizeof(instance);
diff --git a/rpmalloc/rpmalloc.c b/rpmalloc/rpmalloc.c
index 451d03de..a23d62af 100644
--- a/rpmalloc/rpmalloc.c
+++ b/rpmalloc/rpmalloc.c
@@ -1,4 +1,4 @@
-/* rpmalloc.c  -  Memory allocator  -  Public Domain  -  2016 Mattias Jansson
+/* rpmalloc.c  -  Memory allocator  -  Public Domain  -  2016-2020 Mattias Jansson
  *
  * This library provides a cross-platform lock free thread caching malloc implementation in C11.
  * The latest source code is always available at
@@ -11,7 +11,20 @@
 
 #include "rpmalloc.h"
 
+////////////
+///
 /// Build time configurable limits
+///
+//////
+
+#if defined(__clang__)
+#pragma clang diagnostic ignored "-Wunused-macros"
+#pragma clang diagnostic ignored "-Wunused-function"
+#elif defined(__GCC__)
+#pragma GCC diagnostic ignored "-Wunused-macros"
+#pragma GCC diagnostic ignored "-Wunused-function"
+#endif
+
 #ifndef HEAP_ARRAY_SIZE
 //! Size of heap hashmap
 #define HEAP_ARRAY_SIZE           47
@@ -45,59 +58,46 @@
 #define ENABLE_PRELOAD            0
 #endif
 #ifndef DISABLE_UNMAP
-//! Disable unmapping memory pages
+//! Disable unmapping memory pages (also enables unlimited cache)
 #define DISABLE_UNMAP             0
 #endif
-#ifndef DEFAULT_SPAN_MAP_COUNT
-//! Default number of spans to map in call to map more virtual memory (default values yield 4MiB here)
-#define DEFAULT_SPAN_MAP_COUNT    64
-#endif
-
-#if ENABLE_THREAD_CACHE
 #ifndef ENABLE_UNLIMITED_CACHE
-//! Unlimited thread and global cache
+//! Enable unlimited global cache (no unmapping until finalization)
 #define ENABLE_UNLIMITED_CACHE    0
 #endif
-#ifndef ENABLE_UNLIMITED_THREAD_CACHE
-//! Unlimited cache disables any thread cache limitations
-#define ENABLE_UNLIMITED_THREAD_CACHE ENABLE_UNLIMITED_CACHE
-#endif
-#if !ENABLE_UNLIMITED_THREAD_CACHE
-#ifndef THREAD_CACHE_MULTIPLIER
-//! Multiplier for thread cache (cache limit will be span release count multiplied by this value)
-#define THREAD_CACHE_MULTIPLIER 16
-#endif
 #ifndef ENABLE_ADAPTIVE_THREAD_CACHE
-//! Enable adaptive size of per-thread cache (still bounded by THREAD_CACHE_MULTIPLIER hard limit)
-#define ENABLE_ADAPTIVE_THREAD_CACHE  0
+//! Enable adaptive thread cache size based on use heuristics
+#define ENABLE_ADAPTIVE_THREAD_CACHE 0
 #endif
+#ifndef DEFAULT_SPAN_MAP_COUNT
+//! Default number of spans to map in call to map more virtual memory (default values yield 4MiB here)
+#define DEFAULT_SPAN_MAP_COUNT    64
 #endif
+#ifndef GLOBAL_CACHE_MULTIPLIER
+//! Multiplier for global cache
+#define GLOBAL_CACHE_MULTIPLIER   8
 #endif
 
-#if ENABLE_GLOBAL_CACHE && ENABLE_THREAD_CACHE
-#ifndef ENABLE_UNLIMITED_GLOBAL_CACHE
-//! Unlimited cache disables any global cache limitations
-#define ENABLE_UNLIMITED_GLOBAL_CACHE ENABLE_UNLIMITED_CACHE
-#endif
-#if !ENABLE_UNLIMITED_GLOBAL_CACHE
-//! Multiplier for global cache (cache limit will be span release count multiplied by this value)
-#define GLOBAL_CACHE_MULTIPLIER (THREAD_CACHE_MULTIPLIER * 6)
+#if DISABLE_UNMAP && !ENABLE_GLOBAL_CACHE
+#error Must use global cache if unmap is disabled
 #endif
-#else
-#  undef ENABLE_GLOBAL_CACHE
-#  define ENABLE_GLOBAL_CACHE 0
+
+#if DISABLE_UNMAP
+#undef ENABLE_UNLIMITED_CACHE
+#define ENABLE_UNLIMITED_CACHE 1
 #endif
 
-#if !ENABLE_THREAD_CACHE || ENABLE_UNLIMITED_THREAD_CACHE
-#  undef ENABLE_ADAPTIVE_THREAD_CACHE
-#  define ENABLE_ADAPTIVE_THREAD_CACHE 0
+#if !ENABLE_GLOBAL_CACHE
+#undef ENABLE_UNLIMITED_CACHE
+#define ENABLE_UNLIMITED_CACHE 0
 #endif
 
-#if DISABLE_UNMAP && !ENABLE_GLOBAL_CACHE
-#  error Must use global cache if unmap is disabled
+#if !ENABLE_THREAD_CACHE
+#undef ENABLE_ADAPTIVE_THREAD_CACHE
+#define ENABLE_ADAPTIVE_THREAD_CACHE 0
 #endif
 
-#if defined( _WIN32 ) || defined( __WIN32__ ) || defined( _WIN64 )
+#if defined(_WIN32) || defined(__WIN32__) || defined(_WIN64)
 #  define PLATFORM_WINDOWS 1
 #  define PLATFORM_POSIX 0
 #else
@@ -107,14 +107,20 @@
 
 /// Platform and arch specifics
 #if defined(_MSC_VER) && !defined(__clang__)
-#  define FORCEINLINE inline __forceinline
+#  ifndef FORCEINLINE
+#    define FORCEINLINE inline __forceinline
+#  endif
 #  define _Static_assert static_assert
 #else
-#  define FORCEINLINE inline __attribute__((__always_inline__))
+#  ifndef FORCEINLINE
+#    define FORCEINLINE inline __attribute__((__always_inline__))
+#  endif
 #endif
 #if PLATFORM_WINDOWS
-#  define WIN32_LEAN_AND_MEAN
-#  include <windows.h>
+#  ifndef WIN32_LEAN_AND_MEAN
+#    define WIN32_LEAN_AND_MEAN
+#  endif
+#  include <Windows.h>
 #  if ENABLE_VALIDATE_ARGS
 #    include <Intsafe.h>
 #  endif
@@ -123,8 +129,11 @@
 #  include <stdio.h>
 #  include <stdlib.h>
 #  if defined(__APPLE__)
+#    include <TargetConditionals.h>
+#    if !TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR
 #    include <mach/mach_vm.h>
 #    include <mach/vm_statistics.h>
+#    endif
 #    include <pthread.h>
 #  endif
 #  if defined(__HAIKU__)
@@ -135,6 +144,30 @@
 
 #include <stdint.h>
 #include <string.h>
+#include <errno.h>
+
+#if defined(_WIN32) && (!defined(BUILD_DYNAMIC_LINK) || !BUILD_DYNAMIC_LINK)
+#include <fibersapi.h>
+static DWORD fls_key;
+static void NTAPI
+_rpmalloc_thread_destructor(void* value) {
+	if (value)
+		rpmalloc_thread_finalize();
+}
+#endif
+
+#if PLATFORM_POSIX
+#  include <sys/mman.h>
+#  include <sched.h>
+#  ifdef __FreeBSD__
+#    include <sys/sysctl.h>
+#    define MAP_HUGETLB MAP_ALIGNED_SUPER
+#  endif
+#  ifndef MAP_UNINITIALIZED
+#    define MAP_UNINITIALIZED 0
+#  endif
+#endif
+#include <errno.h>
 
 #if ENABLE_ASSERTS
 #  undef NDEBUG
@@ -150,30 +183,32 @@
 #  include <stdio.h>
 #endif
 
-/// Atomic access abstraction
+//////
+///
+/// Atomic access abstraction (since MSVC does not do C11 yet)
+///
+//////
+
 #if defined(_MSC_VER) && !defined(__clang__)
 
 typedef volatile long      atomic32_t;
 typedef volatile long long atomic64_t;
 typedef volatile void*     atomicptr_t;
 
-#define atomic_thread_fence_acquire()
-#define atomic_thread_fence_release()
-
 static FORCEINLINE int32_t atomic_load32(atomic32_t* src) { return *src; }
 static FORCEINLINE void    atomic_store32(atomic32_t* dst, int32_t val) { *dst = val; }
-static FORCEINLINE int32_t atomic_incr32(atomic32_t* val) { return (int32_t)_InterlockedExchangeAdd(val, 1) + 1; }
-#if ENABLE_STATISTICS || ENABLE_ADAPTIVE_THREAD_CACHE
-static FORCEINLINE int32_t atomic_decr32(atomic32_t* val) { return (int32_t)_InterlockedExchangeAdd(val, -1) - 1; }
-#endif
-static FORCEINLINE int32_t atomic_add32(atomic32_t* val, int32_t add) { return (int32_t)_InterlockedExchangeAdd(val, add) + add; }
+static FORCEINLINE int32_t atomic_incr32(atomic32_t* val) { return (int32_t)InterlockedIncrement(val); }
+static FORCEINLINE int32_t atomic_decr32(atomic32_t* val) { return (int32_t)InterlockedDecrement(val); }
+static FORCEINLINE int32_t atomic_add32(atomic32_t* val, int32_t add) { return (int32_t)InterlockedExchangeAdd(val, add) + add; }
+static FORCEINLINE int     atomic_cas32_acquire(atomic32_t* dst, int32_t val, int32_t ref) { return (InterlockedCompareExchange(dst, val, ref) == ref) ? 1 : 0; }
+static FORCEINLINE void    atomic_store32_release(atomic32_t* dst, int32_t val) { *dst = val; }
+static FORCEINLINE int64_t atomic_load64(atomic64_t* src) { return *src; }
+static FORCEINLINE int64_t atomic_add64(atomic64_t* val, int64_t add) { return (int64_t)InterlockedExchangeAdd64(val, add) + add; }
 static FORCEINLINE void*   atomic_load_ptr(atomicptr_t* src) { return (void*)*src; }
 static FORCEINLINE void    atomic_store_ptr(atomicptr_t* dst, void* val) { *dst = val; }
-#  if defined(__LLP64__) || defined(__LP64__) || defined(_WIN64)
-static FORCEINLINE int     atomic_cas_ptr(atomicptr_t* dst, void* val, void* ref) { return (_InterlockedCompareExchange64((volatile long long*)dst, (long long)val, (long long)ref) == (long long)ref) ? 1 : 0; }
-#else
-static FORCEINLINE int     atomic_cas_ptr(atomicptr_t* dst, void* val, void* ref) { return (_InterlockedCompareExchange((volatile long*)dst, (long)val, (long)ref) == (long)ref) ? 1 : 0; }
-#endif
+static FORCEINLINE void    atomic_store_ptr_release(atomicptr_t* dst, void* val) { *dst = val; }
+static FORCEINLINE void*   atomic_exchange_ptr_acquire(atomicptr_t* dst, void* val) { return (void*)InterlockedExchangePointer((void* volatile*)dst, val); }
+static FORCEINLINE int     atomic_cas_ptr(atomicptr_t* dst, void* val, void* ref) { return (InterlockedCompareExchangePointer((void* volatile*)dst, val, ref) == ref) ? 1 : 0; }
 
 #define EXPECTED(x) (x)
 #define UNEXPECTED(x) (x)
@@ -186,27 +221,66 @@ typedef volatile _Atomic(int32_t) atomic32_t;
 typedef volatile _Atomic(int64_t) atomic64_t;
 typedef volatile _Atomic(void*) atomicptr_t;
 
-#define atomic_thread_fence_acquire() atomic_thread_fence(memory_order_acquire)
-#define atomic_thread_fence_release() atomic_thread_fence(memory_order_release)
-
 static FORCEINLINE int32_t atomic_load32(atomic32_t* src) { return atomic_load_explicit(src, memory_order_relaxed); }
 static FORCEINLINE void    atomic_store32(atomic32_t* dst, int32_t val) { atomic_store_explicit(dst, val, memory_order_relaxed); }
 static FORCEINLINE int32_t atomic_incr32(atomic32_t* val) { return atomic_fetch_add_explicit(val, 1, memory_order_relaxed) + 1; }
-#if ENABLE_STATISTICS || ENABLE_ADAPTIVE_THREAD_CACHE
 static FORCEINLINE int32_t atomic_decr32(atomic32_t* val) { return atomic_fetch_add_explicit(val, -1, memory_order_relaxed) - 1; }
-#endif
 static FORCEINLINE int32_t atomic_add32(atomic32_t* val, int32_t add) { return atomic_fetch_add_explicit(val, add, memory_order_relaxed) + add; }
+static FORCEINLINE int     atomic_cas32_acquire(atomic32_t* dst, int32_t val, int32_t ref) { return atomic_compare_exchange_weak_explicit(dst, &ref, val, memory_order_acquire, memory_order_relaxed); }
+static FORCEINLINE void    atomic_store32_release(atomic32_t* dst, int32_t val) { atomic_store_explicit(dst, val, memory_order_release); }
+static FORCEINLINE int64_t atomic_load64(atomic64_t* val) { return atomic_load_explicit(val, memory_order_relaxed); }
+static FORCEINLINE int64_t atomic_add64(atomic64_t* val, int64_t add) { return atomic_fetch_add_explicit(val, add, memory_order_relaxed) + add; }
 static FORCEINLINE void*   atomic_load_ptr(atomicptr_t* src) { return atomic_load_explicit(src, memory_order_relaxed); }
 static FORCEINLINE void    atomic_store_ptr(atomicptr_t* dst, void* val) { atomic_store_explicit(dst, val, memory_order_relaxed); }
-static FORCEINLINE int     atomic_cas_ptr(atomicptr_t* dst, void* val, void* ref) { return atomic_compare_exchange_weak_explicit(dst, &ref, val, memory_order_release, memory_order_acquire); }
+static FORCEINLINE void    atomic_store_ptr_release(atomicptr_t* dst, void* val) { atomic_store_explicit(dst, val, memory_order_release); }
+static FORCEINLINE void*   atomic_exchange_ptr_acquire(atomicptr_t* dst, void* val) { return atomic_exchange_explicit(dst, val, memory_order_acquire); }
+static FORCEINLINE int     atomic_cas_ptr(atomicptr_t* dst, void* val, void* ref) { return atomic_compare_exchange_weak_explicit(dst, &ref, val, memory_order_relaxed, memory_order_relaxed); }
 
 #define EXPECTED(x) __builtin_expect((x), 1)
 #define UNEXPECTED(x) __builtin_expect((x), 0)
     
 #endif
 
+////////////
+///
+/// Statistics related functions (evaluate to nothing when statistics not enabled)
+///
+//////
+
+#if ENABLE_STATISTICS
+#  define _rpmalloc_stat_inc(counter) atomic_incr32(counter)
+#  define _rpmalloc_stat_dec(counter) atomic_decr32(counter)
+#  define _rpmalloc_stat_add(counter, value) atomic_add32(counter, (int32_t)(value))
+#  define _rpmalloc_stat_add64(counter, value) atomic_add64(counter, (int64_t)(value))
+#  define _rpmalloc_stat_add_peak(counter, value, peak) do { int32_t _cur_count = atomic_add32(counter, (int32_t)(value)); if (_cur_count > (peak)) peak = _cur_count; } while (0)
+#  define _rpmalloc_stat_sub(counter, value) atomic_add32(counter, -(int32_t)(value))
+#  define _rpmalloc_stat_inc_alloc(heap, class_idx) do { \
+	int32_t alloc_current = atomic_incr32(&heap->size_class_use[class_idx].alloc_current); \
+	if (alloc_current > heap->size_class_use[class_idx].alloc_peak) \
+		heap->size_class_use[class_idx].alloc_peak = alloc_current; \
+	atomic_incr32(&heap->size_class_use[class_idx].alloc_total); \
+} while(0)
+#  define _rpmalloc_stat_inc_free(heap, class_idx) do { \
+	atomic_decr32(&heap->size_class_use[class_idx].alloc_current); \
+	atomic_incr32(&heap->size_class_use[class_idx].free_total); \
+} while(0)
+#else
+#  define _rpmalloc_stat_inc(counter) do {} while(0)
+#  define _rpmalloc_stat_dec(counter) do {} while(0)
+#  define _rpmalloc_stat_add(counter, value) do {} while(0)
+#  define _rpmalloc_stat_add64(counter, value) do {} while(0)
+#  define _rpmalloc_stat_add_peak(counter, value, peak) do {} while (0)
+#  define _rpmalloc_stat_sub(counter, value) do {} while(0)
+#  define _rpmalloc_stat_inc_alloc(heap, class_idx) do {} while(0)
+#  define _rpmalloc_stat_inc_free(heap, class_idx) do {} while(0)
+#endif
+
+
+///
 /// Preconfigured limits and sizes
-//! Granularity of a small allocation block
+///
+
+//! Granularity of a small allocation block (must be power of two)
 #define SMALL_GRANULARITY         16
 //! Small granularity shift count
 #define SMALL_GRANULARITY_SHIFT   4
@@ -223,13 +297,24 @@ static FORCEINLINE int     atomic_cas_ptr(atomicptr_t* dst, void* val, void* ref
 //! Total number of small + medium size classes
 #define SIZE_CLASS_COUNT          (SMALL_CLASS_COUNT + MEDIUM_CLASS_COUNT)
 //! Number of large block size classes
-#define LARGE_CLASS_COUNT         32
+#define LARGE_CLASS_COUNT         63
 //! Maximum size of a medium block
 #define MEDIUM_SIZE_LIMIT         (SMALL_SIZE_LIMIT + (MEDIUM_GRANULARITY * MEDIUM_CLASS_COUNT))
 //! Maximum size of a large block
 #define LARGE_SIZE_LIMIT          ((LARGE_CLASS_COUNT * _memory_span_size) - SPAN_HEADER_SIZE)
-//! Size of a span header (must be a multiple of SMALL_GRANULARITY)
-#define SPAN_HEADER_SIZE          96
+//! Size of a span header (must be a multiple of SMALL_GRANULARITY and a power of two)
+#define SPAN_HEADER_SIZE          128
+//! Number of spans in thread cache
+#define MAX_THREAD_SPAN_CACHE     256
+//! Number of spans to transfer between thread and global cache
+#define THREAD_SPAN_CACHE_TRANSFER 64
+//! Number of spans in thread cache for large spans (must be greater than LARGE_CLASS_COUNT / 2)
+#define MAX_THREAD_SPAN_LARGE_CACHE 64
+//! Number of spans to transfer between thread and global cache for large spans
+#define THREAD_SPAN_LARGE_CACHE_TRANSFER 6
+
+_Static_assert((SMALL_GRANULARITY & (SMALL_GRANULARITY - 1)) == 0, "Small granularity must be power of two");
+_Static_assert((SPAN_HEADER_SIZE & (SPAN_HEADER_SIZE - 1)) == 0, "Span header size must be power of two");
 
 #if ENABLE_VALIDATE_ARGS
 //! Maximum allocation size to avoid integer overflow
@@ -242,11 +327,17 @@ static FORCEINLINE int     atomic_cas_ptr(atomicptr_t* dst, void* val, void* ref
 
 #define INVALID_POINTER ((void*)((uintptr_t)-1))
 
+#define SIZE_CLASS_LARGE SIZE_CLASS_COUNT
+#define SIZE_CLASS_HUGE ((uint32_t)-1)
+
+////////////
+///
 /// Data types
+///
+//////
+
 //! A memory heap, per thread
 typedef struct heap_t heap_t;
-//! Heap spans per size class
-typedef struct heap_class_t heap_class_t;
 //! Span of memory pages
 typedef struct span_t span_t;
 //! Span list
@@ -270,22 +361,22 @@ struct span_use_t {
 	//! Current number of spans used (actually used, not in cache)
 	atomic32_t current;
 	//! High water mark of spans used
-	uint32_t high;
+	atomic32_t high;
 #if ENABLE_STATISTICS
 	//! Number of spans transitioned to global cache
-	uint32_t spans_to_global;
+	atomic32_t spans_to_global;
 	//! Number of spans transitioned from global cache
-	uint32_t spans_from_global;
+	atomic32_t spans_from_global;
 	//! Number of spans transitioned to thread cache
-	uint32_t spans_to_cache;
+	atomic32_t spans_to_cache;
 	//! Number of spans transitioned from thread cache
-	uint32_t spans_from_cache;
+	atomic32_t spans_from_cache;
 	//! Number of spans transitioned to reserved state
-	uint32_t spans_to_reserved;
+	atomic32_t spans_to_reserved;
 	//! Number of spans transitioned from reserved state
-	uint32_t spans_from_reserved;
+	atomic32_t spans_from_reserved;
 	//! Number of raw memory map calls
-	uint32_t spans_map_calls;
+	atomic32_t spans_map_calls;
 #endif
 };
 typedef struct span_use_t span_use_t;
@@ -298,64 +389,59 @@ struct size_class_use_t {
 	//! Peak number of allocations
 	int32_t alloc_peak;
 	//! Total number of allocations
-	int32_t alloc_total;
+	atomic32_t alloc_total;
 	//! Total number of frees
 	atomic32_t free_total;
 	//! Number of spans in use
-	uint32_t spans_current;
+	atomic32_t spans_current;
 	//! Number of spans transitioned to cache
-	uint32_t spans_peak;
+	int32_t spans_peak;
 	//! Number of spans transitioned to cache
-	uint32_t spans_to_cache;
+	atomic32_t spans_to_cache;
 	//! Number of spans transitioned from cache
-	uint32_t spans_from_cache;
+	atomic32_t spans_from_cache;
 	//! Number of spans transitioned from reserved state
-	uint32_t spans_from_reserved;
+	atomic32_t spans_from_reserved;
 	//! Number of spans mapped
-	uint32_t spans_map_calls;
+	atomic32_t spans_map_calls;
+	int32_t unused;
 };
 typedef struct size_class_use_t size_class_use_t;
 #endif
 
-typedef enum span_state_t {
-	SPAN_STATE_ACTIVE = 0,
-	SPAN_STATE_PARTIAL,
-	SPAN_STATE_FULL
-} span_state_t;
-
-//A span can either represent a single span of memory pages with size declared by span_map_count configuration variable,
-//or a set of spans in a continuous region, a super span. Any reference to the term "span" usually refers to both a single
-//span or a super span. A super span can further be divided into multiple spans (or this, super spans), where the first
-//(super)span is the master and subsequent (super)spans are subspans. The master span keeps track of how many subspans
-//that are still alive and mapped in virtual memory, and once all subspans and master have been unmapped the entire
-//superspan region is released and unmapped (on Windows for example, the entire superspan range has to be released
-//in the same call to release the virtual memory range, but individual subranges can be decommitted individually
-//to reduce physical memory use).
+// A span can either represent a single span of memory pages with size declared by span_map_count configuration variable,
+// or a set of spans in a continuous region, a super span. Any reference to the term "span" usually refers to both a single
+// span or a super span. A super span can further be divided into multiple spans (or this, super spans), where the first
+// (super)span is the master and subsequent (super)spans are subspans. The master span keeps track of how many subspans
+// that are still alive and mapped in virtual memory, and once all subspans and master have been unmapped the entire
+// superspan region is released and unmapped (on Windows for example, the entire superspan range has to be released
+// in the same call to release the virtual memory range, but individual subranges can be decommitted individually
+// to reduce physical memory use).
 struct span_t {
 	//! Free list
 	void*       free_list;
-	//! State
-	uint32_t    state;
-	//! Used count when not active (not including deferred free list)
-	uint32_t    used_count;
-	//! Block count
+	//! Total block count of size class
 	uint32_t    block_count;
 	//! Size class
 	uint32_t    size_class;
 	//! Index of last block initialized in free list
 	uint32_t    free_list_limit;
-	//! Span list size when part of a cache list, or size of deferred free list when partial/full
-	uint32_t    list_size;
+	//! Number of used blocks remaining when in partial state
+	uint32_t    used_count;
 	//! Deferred free list
 	atomicptr_t free_list_deferred;
+	//! Size of deferred free list, or list of spans when part of a cache list
+	uint32_t    list_size;
 	//! Size of a block
 	uint32_t    block_size;
 	//! Flags and counters
 	uint32_t    flags;
 	//! Number of spans
 	uint32_t    span_count;
-	//! Total span counter for master spans, distance for subspans
-	uint32_t    total_spans_or_distance;
+	//! Total span counter for master spans
+	uint32_t    total_spans;
+	//! Offset from master span for subspans
+	uint32_t    offset_from_master;
 	//! Remaining span counter, for master spans
 	atomic32_t  remaining_spans;
 	//! Alignment offset
@@ -369,51 +455,87 @@ struct span_t {
 };
 _Static_assert(sizeof(span_t) <= SPAN_HEADER_SIZE, "span size mismatch");
 
-struct heap_class_t {
+struct span_cache_t {
+	size_t       count;
+	span_t*      span[MAX_THREAD_SPAN_CACHE];
+};
+typedef struct span_cache_t span_cache_t;
+
+struct span_large_cache_t {
+	size_t       count;
+	span_t*      span[MAX_THREAD_SPAN_LARGE_CACHE];
+};
+typedef struct span_large_cache_t span_large_cache_t;
+
+struct heap_size_class_t {
 	//! Free list of active span
 	void*        free_list;
-	//! Double linked list of partially used spans with free blocks for each size class.
-	//  Current active span is at head of list. Previous span pointer in head points to tail span of list.
+	//! Double linked list of partially used spans with free blocks.
+	//  Previous span pointer in head points to tail span of list.
 	span_t*      partial_span;
+	//! Early level cache of fully free spans
+	span_t*      cache;
 };
+typedef struct heap_size_class_t heap_size_class_t;
 
+// Control structure for a heap, either a thread heap or a first class heap if enabled
 struct heap_t {
-	//! Active and semi-used span data per size class
-	heap_class_t span_class[SIZE_CLASS_COUNT];
+	//! Owning thread ID
+	uintptr_t    owner_thread;
+	//! Free lists for each size class
+	heap_size_class_t size_class[SIZE_CLASS_COUNT];
 #if ENABLE_THREAD_CACHE
-	//! List of free spans (single linked list)
-	span_t*      span_cache[LARGE_CLASS_COUNT];
-	//! List of deferred free spans of class 0 (single linked list)
-	atomicptr_t  span_cache_deferred;
-#endif
-#if ENABLE_ADAPTIVE_THREAD_CACHE || ENABLE_STATISTICS
-	//! Current and high water mark of spans used per span count
-	span_use_t   span_use[LARGE_CLASS_COUNT];
+	//! Arrays of fully freed spans, single span
+	span_cache_t span_cache;
 #endif
+	//! List of deferred free spans (single linked list)
+	atomicptr_t  span_free_deferred;
+	//! Number of full spans
+	size_t       full_span_count;
 	//! Mapped but unused spans
 	span_t*      span_reserve;
 	//! Master span for mapped but unused spans
 	span_t*      span_reserve_master;
 	//! Number of mapped but unused spans
-	size_t       spans_reserved;
+	uint32_t     spans_reserved;
+	//! Child count
+	atomic32_t   child_count;
 	//! Next heap in id list
 	heap_t*      next_heap;
 	//! Next heap in orphan list
 	heap_t*      next_orphan;
-	//! Memory pages alignment offset
-	size_t       align_offset;
 	//! Heap ID
 	int32_t      id;
+	//! Finalization state flag
+	int          finalize;
+	//! Master heap owning the memory pages
+	heap_t*      master_heap;
+#if ENABLE_THREAD_CACHE
+	//! Arrays of fully freed spans, large spans with > 1 span count
+	span_large_cache_t span_large_cache[LARGE_CLASS_COUNT - 1];
+#endif
+#if RPMALLOC_FIRST_CLASS_HEAPS
+	//! Double linked list of fully utilized spans with free blocks for each size class.
+	//  Previous span pointer in head points to tail span of list.
+	span_t*      full_span[SIZE_CLASS_COUNT];
+	//! Double linked list of large and huge spans allocated by this heap
+	span_t*      large_huge_span;
+#endif
+#if ENABLE_ADAPTIVE_THREAD_CACHE || ENABLE_STATISTICS
+	//! Current and high water mark of spans used per span count
+	span_use_t   span_use[LARGE_CLASS_COUNT];
+#endif
 #if ENABLE_STATISTICS
-	//! Number of bytes transitioned thread -> global
-	size_t       thread_to_global;
-	//! Number of bytes transitioned global -> thread
-	size_t       global_to_thread;
 	//! Allocation stats per size class
 	size_class_use_t size_class_use[SIZE_CLASS_COUNT + 1];
+	//! Number of bytes transitioned thread -> global
+	atomic64_t   thread_to_global;
+	//! Number of bytes transitioned global -> thread
+	atomic64_t   global_to_thread;
 #endif
 };
 
+// Size class for defining a block size bucket
 struct size_class_t {
 	//! Size of blocks in this class
 	uint32_t block_size;
@@ -425,15 +547,27 @@ struct size_class_t {
 _Static_assert(sizeof(size_class_t) == 8, "Size class size mismatch");
 
 struct global_cache_t {
-	//! Cache list pointer
-	atomicptr_t cache;
-	//! Cache size
-	atomic32_t size;
-	//! ABA counter
-	atomic32_t counter;
+	//! Cache lock
+	atomic32_t lock;
+	//! Cache count
+	uint32_t count;
+	//! Cached spans
+	span_t* span[GLOBAL_CACHE_MULTIPLIER * MAX_THREAD_SPAN_CACHE];
+	//! Unlimited cache overflow
+	span_t* overflow;
 };
 
+////////////
+///
 /// Global data
+///
+//////
+
+//! Default span size (64KiB)
+#define _memory_default_span_size (64 * 1024)
+#define _memory_default_span_size_shift 16
+#define _memory_default_span_mask (~((uintptr_t)(_memory_span_size - 1)))
+
 //! Initialized flag
 static int _rpmalloc_initialized;
 //! Configuration
@@ -452,10 +586,10 @@ static size_t _memory_span_size_shift;
 //! Mask to get to start of a memory span
 static uintptr_t _memory_span_mask;
 #else
-//! Hardwired span size (64KiB)
-#define _memory_span_size (64 * 1024)
-#define _memory_span_size_shift 16
-#define _memory_span_mask (~((uintptr_t)(_memory_span_size - 1)))
+//! Hardwired span size
+#define _memory_span_size _memory_default_span_size
+#define _memory_span_size_shift _memory_default_span_size_shift
+#define _memory_span_mask _memory_default_span_mask
 #endif
 //! Number of spans to map in each map call
 static size_t _memory_span_map_count;
@@ -475,12 +609,22 @@ static int _memory_huge_pages;
 //! Global span cache
 static global_cache_t _memory_span_cache[LARGE_CLASS_COUNT];
 #endif
+//! Global reserved spans
+static span_t* _memory_global_reserve;
+//! Global reserved count
+static size_t _memory_global_reserve_count;
+//! Global reserved master
+static span_t* _memory_global_reserve_master;
 //! All heaps
-static atomicptr_t _memory_heaps[HEAP_ARRAY_SIZE];
+static heap_t* _memory_heaps[HEAP_ARRAY_SIZE];
+//! Used to restrict access to mapping memory for huge pages
+static atomic32_t _memory_global_lock;
 //! Orphaned heaps
-static atomicptr_t _memory_orphan_heaps;
-//! Running orphan counter to avoid ABA issues in linked list
-static atomic32_t _memory_orphan_counter;
+static heap_t* _memory_orphan_heaps;
+#if RPMALLOC_FIRST_CLASS_HEAPS
+//! Orphaned heaps (first class heaps)
+static heap_t* _memory_first_class_orphan_heaps;
+#endif
 #if ENABLE_STATISTICS
 //! Active heap count
 static atomic32_t _memory_active_heaps;
@@ -488,6 +632,8 @@ static atomic32_t _memory_active_heaps;
 static atomic32_t _mapped_pages;
 //! Peak number of concurrently mapped memory pages
 static int32_t _mapped_pages_peak;
+//! Number of mapped master spans
+static atomic32_t _master_spans;
 //! Number of currently unused spans
 static atomic32_t _reserved_spans;
 //! Running counter of total number of mapped memory pages since start
@@ -502,6 +648,12 @@ static atomic32_t _huge_pages_current;
 static int32_t _huge_pages_peak;
 #endif
 
+////////////
+///
+/// Thread local heap and ID
+///
+//////
+
 //! Current thread heap
 #if (defined(__APPLE__) || defined(__HAIKU__)) && ENABLE_PRELOAD
 static pthread_key_t _memory_thread_heap;
@@ -541,6 +693,32 @@ get_thread_heap(void) {
 #endif
 }
 
+//! Fast thread ID
+static inline uintptr_t
+get_thread_id(void) {
+#if defined(_WIN32)
+	return (uintptr_t)((void*)NtCurrentTeb());
+#elif defined(__GNUC__) || defined(__clang__)
+	uintptr_t tid;
+#  if defined(__i386__)
+	__asm__("movl %%gs:0, %0" : "=r" (tid) : : );
+#  elif defined(__MACH__) && !TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR
+	__asm__("movq %%gs:0, %0" : "=r" (tid) : : );
+#  elif defined(__x86_64__)
+	__asm__("movq %%fs:0, %0" : "=r" (tid) : : );
+#  elif defined(__arm__)
+	__asm__ volatile ("mrc p15, 0, %0, c13, c0, 3" : "=r" (tid));
+#  elif defined(__aarch64__)
+	__asm__ volatile ("mrs %0, tpidr_el0" : "=r" (tid));
+#  else
+	tid = (uintptr_t)((void*)get_thread_heap_raw());
+#  endif
+	return tid;
+#else
+	return (uintptr_t)((void*)get_thread_heap_raw());
+#endif
+}
+
 //! Set the current thread heap
 static void
 set_thread_heap(heap_t* heap) {
@@ -549,85 +727,220 @@ set_thread_heap(heap_t* heap) {
 #else
 	_memory_thread_heap = heap;
 #endif
+	if (heap)
+		heap->owner_thread = get_thread_id();
 }
 
-//! Default implementation to map more virtual memory
-static void*
-_memory_map_os(size_t size, size_t* offset);
-
-//! Default implementation to unmap virtual memory
-static void
-_memory_unmap_os(void* address, size_t size, size_t offset, size_t release);
-
-//! Lookup a memory heap from heap ID
-static heap_t*
-_memory_heap_lookup(int32_t id) {
-	uint32_t list_idx = id % HEAP_ARRAY_SIZE;
-	heap_t* heap = atomic_load_ptr(&_memory_heaps[list_idx]);
-	while (heap && (heap->id != id))
-		heap = heap->next_heap;
-	return heap;
-}
-
-#if ENABLE_STATISTICS
-#  define _memory_statistics_inc(counter, value) counter += value
-#  define _memory_statistics_dec(counter, value) counter -= value
-#  define _memory_statistics_add(atomic_counter, value) atomic_add32(atomic_counter, (int32_t)(value))
-#  define _memory_statistics_add_peak(atomic_counter, value, peak) do { int32_t _cur_count = atomic_add32(atomic_counter, (int32_t)(value)); if (_cur_count > (peak)) peak = _cur_count; } while (0)
-#  define _memory_statistics_sub(atomic_counter, value) atomic_add32(atomic_counter, -(int32_t)(value))
-#  define _memory_statistics_inc_alloc(heap, class_idx) do { \
-	int32_t alloc_current = atomic_incr32(&heap->size_class_use[class_idx].alloc_current); \
-	if (alloc_current > heap->size_class_use[class_idx].alloc_peak) \
-		heap->size_class_use[class_idx].alloc_peak = alloc_current; \
-	heap->size_class_use[class_idx].alloc_total++; \
-} while(0)
-#  define _memory_statistics_inc_free(heap, class_idx) do { \
-	atomic_decr32(&heap->size_class_use[class_idx].alloc_current); \
-	atomic_incr32(&heap->size_class_use[class_idx].free_total); \
-} while(0)
-#else
-#  define _memory_statistics_inc(counter, value) do {} while(0)
-#  define _memory_statistics_dec(counter, value) do {} while(0)
-#  define _memory_statistics_add(atomic_counter, value) do {} while(0)
-#  define _memory_statistics_add_peak(atomic_counter, value, peak) do {} while (0)
-#  define _memory_statistics_sub(atomic_counter, value) do {} while(0)
-#  define _memory_statistics_inc_alloc(heap, class_idx) do {} while(0)
-#  define _memory_statistics_inc_free(heap, class_idx) do {} while(0)
-#endif
-
-static void
-_memory_heap_cache_insert(heap_t* heap, span_t* span);
+////////////
+///
+/// Low level memory map/unmap
+///
+//////
 
 //! Map more virtual memory
+//  size is number of bytes to map
+//  offset receives the offset in bytes from start of mapped region
+//  returns address to start of mapped region to use
 static void*
-_memory_map(size_t size, size_t* offset) {
+_rpmalloc_mmap(size_t size, size_t* offset) {
 	assert(!(size % _memory_page_size));
 	assert(size >= _memory_page_size);
-	_memory_statistics_add_peak(&_mapped_pages, (size >> _memory_page_size_shift), _mapped_pages_peak);
-	_memory_statistics_add(&_mapped_total, (size >> _memory_page_size_shift));
+	_rpmalloc_stat_add_peak(&_mapped_pages, (size >> _memory_page_size_shift), _mapped_pages_peak);
+	_rpmalloc_stat_add(&_mapped_total, (size >> _memory_page_size_shift));
 	return _memory_config.memory_map(size, offset);
 }
 
 //! Unmap virtual memory
+//  address is the memory address to unmap, as returned from _memory_map
+//  size is the number of bytes to unmap, which might be less than full region for a partial unmap
+//  offset is the offset in bytes to the actual mapped region, as set by _memory_map
+//  release is set to 0 for partial unmap, or size of entire range for a full unmap
 static void
-_memory_unmap(void* address, size_t size, size_t offset, size_t release) {
+_rpmalloc_unmap(void* address, size_t size, size_t offset, size_t release) {
 	assert(!release || (release >= size));
 	assert(!release || (release >= _memory_page_size));
 	if (release) {
 		assert(!(release % _memory_page_size));
-		_memory_statistics_sub(&_mapped_pages, (release >> _memory_page_size_shift));
-		_memory_statistics_add(&_unmapped_total, (release >> _memory_page_size_shift));
+		_rpmalloc_stat_sub(&_mapped_pages, (release >> _memory_page_size_shift));
+		_rpmalloc_stat_add(&_unmapped_total, (release >> _memory_page_size_shift));
 	}
 	_memory_config.memory_unmap(address, size, offset, release);
 }
 
+//! Default implementation to map new pages to virtual memory
+static void*
+_rpmalloc_mmap_os(size_t size, size_t* offset) {
+	//Either size is a heap (a single page) or a (multiple) span - we only need to align spans, and only if larger than map granularity
+	size_t padding = ((size >= _memory_span_size) && (_memory_span_size > _memory_map_granularity)) ? _memory_span_size : 0;
+	assert(size >= _memory_page_size);
+#if PLATFORM_WINDOWS
+	//Ok to MEM_COMMIT - according to MSDN, "actual physical pages are not allocated unless/until the virtual addresses are actually accessed"
+	void* ptr = VirtualAlloc(0, size + padding, (_memory_huge_pages ? MEM_LARGE_PAGES : 0) | MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE);
+	if (!ptr) {
+		assert(ptr && "Failed to map virtual memory block");
+		return 0;
+	}
+#else
+	int flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_UNINITIALIZED;
+#  if defined(__APPLE__) && !TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR
+	int fd = (int)VM_MAKE_TAG(240U);
+	if (_memory_huge_pages)
+		fd |= VM_FLAGS_SUPERPAGE_SIZE_2MB;
+	void* ptr = mmap(0, size + padding, PROT_READ | PROT_WRITE, flags, fd, 0);
+#  elif defined(MAP_HUGETLB)
+	void* ptr = mmap(0, size + padding, PROT_READ | PROT_WRITE, (_memory_huge_pages ? MAP_HUGETLB : 0) | flags, -1, 0);
+#  elif defined(MAP_ALIGN)
+	caddr_t base = (_memory_huge_pages ? (caddr_t)(4 << 20) : 0);
+	void* ptr = mmap(base, size + padding, PROT_READ | PROT_WRITE, (_memory_huge_pages ? MAP_ALIGN : 0) | flags, -1, 0);
+#  else
+	void* ptr = mmap(0, size + padding, PROT_READ | PROT_WRITE, flags, -1, 0);
+#  endif
+	if ((ptr == MAP_FAILED) || !ptr) {
+		assert("Failed to map virtual memory block" == 0);
+		return 0;
+	}
+#endif
+	_rpmalloc_stat_add(&_mapped_pages_os, (int32_t)((size + padding) >> _memory_page_size_shift));
+	if (padding) {
+		size_t final_padding = padding - ((uintptr_t)ptr & ~_memory_span_mask);
+		assert(final_padding <= _memory_span_size);
+		assert(final_padding <= padding);
+		assert(!(final_padding % 8));
+		ptr = pointer_offset(ptr, final_padding);
+		*offset = final_padding >> 3;
+	}
+	assert((size < _memory_span_size) || !((uintptr_t)ptr & ~_memory_span_mask));
+	return ptr;
+}
+
+//! Default implementation to unmap pages from virtual memory
+static void
+_rpmalloc_unmap_os(void* address, size_t size, size_t offset, size_t release) {
+	assert(release || (offset == 0));
+	assert(!release || (release >= _memory_page_size));
+	assert(size >= _memory_page_size);
+	if (release && offset) {
+		offset <<= 3;
+		address = pointer_offset(address, -(int32_t)offset);
+		if ((release >= _memory_span_size) && (_memory_span_size > _memory_map_granularity)) {
+			//Padding is always one span size
+			release += _memory_span_size;
+		}
+	}
+#if !DISABLE_UNMAP
+#if PLATFORM_WINDOWS
+	if (!VirtualFree(address, release ? 0 : size, release ? MEM_RELEASE : MEM_DECOMMIT)) {
+		assert(address && "Failed to unmap virtual memory block");
+	}
+#else
+	if (release) {
+		if (munmap(address, release)) {
+			assert("Failed to unmap virtual memory block" == 0);
+		}
+	} else {
+#if defined(POSIX_MADV_FREE)
+		if (posix_madvise(address, size, POSIX_MADV_FREE))
+#endif
+		if (posix_madvise(address, size, POSIX_MADV_DONTNEED)) {
+			assert("Failed to madvise virtual memory block as free" == 0);
+		}
+	}
+#endif
+#endif
+	if (release)
+		_rpmalloc_stat_sub(&_mapped_pages_os, release >> _memory_page_size_shift);
+}
+
+static void
+_rpmalloc_span_mark_as_subspan_unless_master(span_t* master, span_t* subspan, size_t span_count);
+
+//! Use global reserved spans to fulfill a memory map request (reserve size must be checked by caller)
+static span_t*
+_rpmalloc_global_get_reserved_spans(size_t span_count) {
+	span_t* span = _memory_global_reserve;
+	_rpmalloc_span_mark_as_subspan_unless_master(_memory_global_reserve_master, span, span_count);
+	_memory_global_reserve_count -= span_count;
+	if (_memory_global_reserve_count)
+		_memory_global_reserve = (span_t*)pointer_offset(span, span_count << _memory_span_size_shift);
+	else
+		_memory_global_reserve = 0;
+	return span;
+}
+
+//! Store the given spans as global reserve (must only be called from within new heap allocation, not thread safe)
+static void
+_rpmalloc_global_set_reserved_spans(span_t* master, span_t* reserve, size_t reserve_span_count) {
+	_memory_global_reserve_master = master;
+	_memory_global_reserve_count = reserve_span_count;
+	_memory_global_reserve = reserve;
+}
+
+
+////////////
+///
+/// Span linked list management
+///
+//////
+
+//! Add a span to double linked list at the head
+static void
+_rpmalloc_span_double_link_list_add(span_t** head, span_t* span) {
+	if (*head) {
+		span->next = *head;
+		(*head)->prev = span;
+	} else {
+		span->next = 0;
+	}
+	*head = span;
+}
+
+//! Pop head span from double linked list
+static void
+_rpmalloc_span_double_link_list_pop_head(span_t** head, span_t* span) {
+	assert(*head == span);
+	span = *head;
+	*head = span->next;
+}
+
+//! Remove a span from double linked list
+static void
+_rpmalloc_span_double_link_list_remove(span_t** head, span_t* span) {
+	assert(*head);
+	if (*head == span) {
+		*head = span->next;
+	} else {
+		span_t* next_span = span->next;
+		span_t* prev_span = span->prev;
+		prev_span->next = next_span;
+		if (EXPECTED(next_span != 0)) {
+			next_span->prev = prev_span;
+		}
+	}
+}
+
+
+////////////
+///
+/// Span control
+///
+//////
+
+static void
+_rpmalloc_heap_cache_insert(heap_t* heap, span_t* span);
+
+static void
+_rpmalloc_heap_finalize(heap_t* heap);
+
+static void
+_rpmalloc_heap_set_reserved_spans(heap_t* heap, span_t* master, span_t* reserve, size_t reserve_span_count);
+
 //! Declare the span to be a subspan and store distance from master span and span count
 static void
-_memory_span_mark_as_subspan_unless_master(span_t* master, span_t* subspan, size_t span_count) {
+_rpmalloc_span_mark_as_subspan_unless_master(span_t* master, span_t* subspan, size_t span_count) {
 	assert((subspan != master) || (subspan->flags & SPAN_FLAG_MASTER));
 	if (subspan != master) {
 		subspan->flags = SPAN_FLAG_SUBSPAN;
-		subspan->total_spans_or_distance = (uint32_t)((uintptr_t)pointer_diff(subspan, master) >> _memory_span_size_shift);
+		subspan->offset_from_master = (uint32_t)((uintptr_t)pointer_diff(subspan, master) >> _memory_span_size_shift);
 		subspan->align_offset = 0;
 	}
 	subspan->span_count = (uint32_t)span_count;
@@ -635,86 +948,120 @@ _memory_span_mark_as_subspan_unless_master(span_t* master, span_t* subspan, size
 
 //! Use reserved spans to fulfill a memory map request (reserve size must be checked by caller)
 static span_t*
-_memory_map_from_reserve(heap_t* heap, size_t span_count) {
+_rpmalloc_span_map_from_reserve(heap_t* heap, size_t span_count) {
 	//Update the heap span reserve
 	span_t* span = heap->span_reserve;
-	heap->span_reserve = pointer_offset(span, span_count * _memory_span_size);
-	heap->spans_reserved -= span_count;
+	heap->span_reserve = (span_t*)pointer_offset(span, span_count * _memory_span_size);
+	heap->spans_reserved -= (uint32_t)span_count;
 
-	_memory_span_mark_as_subspan_unless_master(heap->span_reserve_master, span, span_count);
+	_rpmalloc_span_mark_as_subspan_unless_master(heap->span_reserve_master, span, span_count);
 	if (span_count <= LARGE_CLASS_COUNT)
-		_memory_statistics_inc(heap->span_use[span_count - 1].spans_from_reserved, 1);
+		_rpmalloc_stat_inc(&heap->span_use[span_count - 1].spans_from_reserved);
 
 	return span;
 }
 
 //! Get the aligned number of spans to map in based on wanted count, configured mapping granularity and the page size
 static size_t
-_memory_map_align_span_count(size_t span_count) {
+_rpmalloc_span_align_count(size_t span_count) {
 	size_t request_count = (span_count > _memory_span_map_count) ? span_count : _memory_span_map_count;
 	if ((_memory_page_size > _memory_span_size) && ((request_count * _memory_span_size) % _memory_page_size))
-		request_count += _memory_span_map_count - (request_count % _memory_span_map_count);	
+		request_count += _memory_span_map_count - (request_count % _memory_span_map_count);
 	return request_count;
 }
 
-//! Store the given spans as reserve in the given heap
-static void
-_memory_heap_set_reserved_spans(heap_t* heap, span_t* master, span_t* reserve, size_t reserve_span_count) {
-	heap->span_reserve_master = master;
-	heap->span_reserve = reserve;
-	heap->spans_reserved = reserve_span_count;
-}
-
 //! Setup a newly mapped span
 static void
-_memory_span_initialize(span_t* span, size_t total_span_count, size_t span_count, size_t align_offset) {
-	span->total_spans_or_distance = (uint32_t)total_span_count;
+_rpmalloc_span_initialize(span_t* span, size_t total_span_count, size_t span_count, size_t align_offset) {
+	span->total_spans = (uint32_t)total_span_count;
 	span->span_count = (uint32_t)span_count;
 	span->align_offset = (uint32_t)align_offset;
 	span->flags = SPAN_FLAG_MASTER;
-	atomic_store32(&span->remaining_spans, (int32_t)total_span_count);	
+	atomic_store32(&span->remaining_spans, (int32_t)total_span_count);
 }
 
-//! Map a akigned set of spans, taking configured mapping granularity and the page size into account
+static void
+_rpmalloc_span_unmap(span_t* span);
+
+//! Map an aligned set of spans, taking configured mapping granularity and the page size into account
 static span_t*
-_memory_map_aligned_span_count(heap_t* heap, size_t span_count) {
+_rpmalloc_span_map_aligned_count(heap_t* heap, size_t span_count) {
 	//If we already have some, but not enough, reserved spans, release those to heap cache and map a new
 	//full set of spans. Otherwise we would waste memory if page size > span size (huge pages)
-	size_t aligned_span_count = _memory_map_align_span_count(span_count);
+	size_t aligned_span_count = _rpmalloc_span_align_count(span_count);
 	size_t align_offset = 0;
-	span_t* span = _memory_map(aligned_span_count * _memory_span_size, &align_offset);
+	span_t* span = (span_t*)_rpmalloc_mmap(aligned_span_count * _memory_span_size, &align_offset);
 	if (!span)
 		return 0;
-	_memory_span_initialize(span, aligned_span_count, span_count, align_offset);
-	_memory_statistics_add(&_reserved_spans, aligned_span_count);
+	_rpmalloc_span_initialize(span, aligned_span_count, span_count, align_offset);
+	_rpmalloc_stat_add(&_reserved_spans, aligned_span_count);
+	_rpmalloc_stat_inc(&_master_spans);
 	if (span_count <= LARGE_CLASS_COUNT)
-		_memory_statistics_inc(heap->span_use[span_count - 1].spans_map_calls, 1);
+		_rpmalloc_stat_inc(&heap->span_use[span_count - 1].spans_map_calls);
 	if (aligned_span_count > span_count) {
+		span_t* reserved_spans = (span_t*)pointer_offset(span, span_count * _memory_span_size);
+		size_t reserved_count = aligned_span_count - span_count;
 		if (heap->spans_reserved) {
-			_memory_span_mark_as_subspan_unless_master(heap->span_reserve_master, heap->span_reserve, heap->spans_reserved);
-			_memory_heap_cache_insert(heap, heap->span_reserve);
+			_rpmalloc_span_mark_as_subspan_unless_master(heap->span_reserve_master, heap->span_reserve, heap->spans_reserved);
+			_rpmalloc_heap_cache_insert(heap, heap->span_reserve);
 		}
-		_memory_heap_set_reserved_spans(heap, span, pointer_offset(span, span_count * _memory_span_size), aligned_span_count - span_count);
+		if (reserved_count > DEFAULT_SPAN_MAP_COUNT) {
+			size_t remain_count = reserved_count - DEFAULT_SPAN_MAP_COUNT;
+			reserved_count = DEFAULT_SPAN_MAP_COUNT;
+			span_t* remain_span = (span_t*)pointer_offset(reserved_spans, reserved_count * _memory_span_size);
+			if (_memory_global_reserve)
+				_rpmalloc_span_unmap(_memory_global_reserve);
+			_rpmalloc_global_set_reserved_spans(span, remain_span, remain_count);
+		}
+		_rpmalloc_heap_set_reserved_spans(heap, span, reserved_spans, reserved_count);
 	}
 	return span;
 }
 
+static span_t*
+_rpmalloc_global_get_reserved_spans(size_t span_count);
+
 //! Map in memory pages for the given number of spans (or use previously reserved pages)
 static span_t*
-_memory_map_spans(heap_t* heap, size_t span_count) {
+_rpmalloc_span_map(heap_t* heap, size_t span_count) {
 	if (span_count <= heap->spans_reserved)
-		return _memory_map_from_reserve(heap, span_count);
-	return _memory_map_aligned_span_count(heap, span_count);
+		return _rpmalloc_span_map_from_reserve(heap, span_count);
+	span_t* span = 0;
+	if (_memory_page_size > _memory_span_size) {
+		// If huge pages, make sure only one thread maps more memory to avoid bloat
+		while (!atomic_cas32_acquire(&_memory_global_lock, 1, 0)) {
+			/* Spin */
+		}
+		if (_memory_global_reserve_count >= span_count) {
+			size_t reserve_count = (!heap->spans_reserved ? DEFAULT_SPAN_MAP_COUNT : span_count);
+			if (_memory_global_reserve_count < reserve_count)
+				reserve_count = _memory_global_reserve_count;
+			span = _rpmalloc_global_get_reserved_spans(reserve_count);
+			if (span) {
+				if (reserve_count > span_count) {
+					span_t* reserved_span = (span_t*)pointer_offset(span, span_count << _memory_span_size_shift);
+					_rpmalloc_heap_set_reserved_spans(heap, _memory_global_reserve_master, reserved_span, reserve_count - span_count);
+				}
+				// Already marked as subspan in _rpmalloc_global_get_reserved_spans
+				span->span_count = (uint32_t)span_count;
+			}
+		}
+	}
+	if (!span)
+		span = _rpmalloc_span_map_aligned_count(heap, span_count);
+	if (_memory_page_size > _memory_span_size)
+		atomic_store32_release(&_memory_global_lock, 0);
+	return span;
 }
 
 //! Unmap memory pages for the given number of spans (or mark as unused if no partial unmappings)
 static void
-_memory_unmap_span(span_t* span) {
+_rpmalloc_span_unmap(span_t* span) {
 	assert((span->flags & SPAN_FLAG_MASTER) || (span->flags & SPAN_FLAG_SUBSPAN));
 	assert(!(span->flags & SPAN_FLAG_MASTER) || !(span->flags & SPAN_FLAG_SUBSPAN));
 
 	int is_master = !!(span->flags & SPAN_FLAG_MASTER);
-	span_t* master = is_master ? span : (pointer_offset(span, -(int32_t)(span->total_spans_or_distance * _memory_span_size)));
+	span_t* master = is_master ? span : ((span_t*)pointer_offset(span, -(intptr_t)((uintptr_t)span->offset_from_master * _memory_span_size)));
 	assert(is_master || (span->flags & SPAN_FLAG_SUBSPAN));
 	assert(master->flags & SPAN_FLAG_MASTER);
 
@@ -723,8 +1070,8 @@ _memory_unmap_span(span_t* span) {
 		//Directly unmap subspans (unless huge pages, in which case we defer and unmap entire page range with master)
 		assert(span->align_offset == 0);
 		if (_memory_span_size >= _memory_page_size) {
-			_memory_unmap(span, span_count * _memory_span_size, 0, 0);
-			_memory_statistics_sub(&_reserved_spans, span_count);
+			_rpmalloc_unmap(span, span_count * _memory_span_size, 0, 0);
+			_rpmalloc_stat_sub(&_reserved_spans, span_count);
 		}
 	} else {
 		//Special double flag to denote an unmapped master
@@ -737,485 +1084,762 @@ _memory_unmap_span(span_t* span) {
 		assert(!!(master->flags & SPAN_FLAG_MASTER) && !!(master->flags & SPAN_FLAG_SUBSPAN));
 		size_t unmap_count = master->span_count;
 		if (_memory_span_size < _memory_page_size)
-			unmap_count = master->total_spans_or_distance;
-		_memory_statistics_sub(&_reserved_spans, unmap_count);
-		_memory_unmap(master, unmap_count * _memory_span_size, master->align_offset, master->total_spans_or_distance * _memory_span_size);
+			unmap_count = master->total_spans;
+		_rpmalloc_stat_sub(&_reserved_spans, unmap_count);
+		_rpmalloc_stat_sub(&_master_spans, 1);
+		_rpmalloc_unmap(master, unmap_count * _memory_span_size, master->align_offset, (size_t)master->total_spans * _memory_span_size);
 	}
 }
 
-#if ENABLE_THREAD_CACHE
-
-//! Unmap a single linked list of spans
+//! Move the span (used for small or medium allocations) to the heap thread cache
 static void
-_memory_unmap_span_list(span_t* span) {
-	size_t list_size = span->list_size;
-	for (size_t ispan = 0; ispan < list_size; ++ispan) {
-		span_t* next_span = span->next;
-		_memory_unmap_span(span);
-		span = next_span;
+_rpmalloc_span_release_to_cache(heap_t* heap, span_t* span) {
+	assert(heap == span->heap);
+	assert(span->size_class < SIZE_CLASS_COUNT);
+#if ENABLE_ADAPTIVE_THREAD_CACHE || ENABLE_STATISTICS
+	atomic_decr32(&heap->span_use[0].current);
+#endif
+	_rpmalloc_stat_dec(&heap->size_class_use[span->size_class].spans_current);
+	if (!heap->finalize) {
+		_rpmalloc_stat_inc(&heap->span_use[0].spans_to_cache);
+		_rpmalloc_stat_inc(&heap->size_class_use[span->size_class].spans_to_cache);
+		if (heap->size_class[span->size_class].cache)
+			_rpmalloc_heap_cache_insert(heap, heap->size_class[span->size_class].cache);
+		heap->size_class[span->size_class].cache = span;
+	} else {
+		_rpmalloc_span_unmap(span);
 	}
-	assert(!span);
 }
 
-//! Add span to head of single linked span list
-static size_t
-_memory_span_list_push(span_t** head, span_t* span) {
-	span->next = *head;
-	if (*head)
-		span->list_size = (*head)->list_size + 1;
-	else
-		span->list_size = 1;
-	*head = span;
-	return span->list_size;
+//! Initialize a (partial) free list up to next system memory page, while reserving the first block
+//! as allocated, returning number of blocks in list
+static uint32_t
+free_list_partial_init(void** list, void** first_block, void* page_start, void* block_start, uint32_t block_count, uint32_t block_size) {
+	assert(block_count);
+	*first_block = block_start;
+	if (block_count > 1) {
+		void* free_block = pointer_offset(block_start, block_size);
+		void* block_end = pointer_offset(block_start, (size_t)block_size * block_count);
+		//If block size is less than half a memory page, bound init to next memory page boundary
+		if (block_size < (_memory_page_size >> 1)) {
+			void* page_end = pointer_offset(page_start, _memory_page_size);
+			if (page_end < block_end)
+				block_end = page_end;
+		}
+		*list = free_block;
+		block_count = 2;
+		void* next_block = pointer_offset(free_block, block_size);
+		while (next_block < block_end) {
+			*((void**)free_block) = next_block;
+			free_block = next_block;
+			++block_count;
+			next_block = pointer_offset(next_block, block_size);
+		}
+		*((void**)free_block) = 0;
+	} else {
+		*list = 0;
+	}
+	return block_count;
 }
 
-//! Remove span from head of single linked span list, returns the new list head
-static span_t*
-_memory_span_list_pop(span_t** head) {
-	span_t* span = *head;
-	span_t* next_span = 0;
-	if (span->list_size > 1) {
-		assert(span->next);
-		next_span = span->next;
-		assert(next_span);
-		next_span->list_size = span->list_size - 1;
+//! Initialize an unused span (from cache or mapped) to be new active span, putting the initial free list in heap class free list
+static void*
+_rpmalloc_span_initialize_new(heap_t* heap, span_t* span, uint32_t class_idx) {
+	assert(span->span_count == 1);
+	size_class_t* size_class = _memory_size_class + class_idx;
+	span->size_class = class_idx;
+	span->heap = heap;
+	span->flags &= ~SPAN_FLAG_ALIGNED_BLOCKS;
+	span->block_size = size_class->block_size;
+	span->block_count = size_class->block_count;
+	span->free_list = 0;
+	span->list_size = 0;
+	atomic_store_ptr_release(&span->free_list_deferred, 0);
+
+	//Setup free list. Only initialize one system page worth of free blocks in list
+	void* block;
+	span->free_list_limit = free_list_partial_init(&heap->size_class[class_idx].free_list, &block, 
+		span, pointer_offset(span, SPAN_HEADER_SIZE), size_class->block_count, size_class->block_size);
+	//Link span as partial if there remains blocks to be initialized as free list, or full if fully initialized
+	if (span->free_list_limit < span->block_count) {
+		_rpmalloc_span_double_link_list_add(&heap->size_class[class_idx].partial_span, span);
+		span->used_count = span->free_list_limit;
+	} else {
+#if RPMALLOC_FIRST_CLASS_HEAPS
+		_rpmalloc_span_double_link_list_add(&heap->full_span[class_idx], span);
+#endif
+		++heap->full_span_count;
+		span->used_count = span->block_count;
 	}
-	*head = next_span;
-	return span;
+	return block;
 }
 
-//! Split a single linked span list
-static span_t*
-_memory_span_list_split(span_t* span, size_t limit) {
-	span_t* next = 0;
-	if (limit < 2)
-		limit = 2;
-	if (span->list_size > limit) {
-		uint32_t list_size = 1;
-		span_t* last = span;
-		next = span->next;
-		while (list_size < limit) {
-			last = next;
-			next = next->next;
-			++list_size;
+static void
+_rpmalloc_span_extract_free_list_deferred(span_t* span) {
+	// We need acquire semantics on the CAS operation since we are interested in the list size
+	// Refer to _rpmalloc_deallocate_defer_small_or_medium for further comments on this dependency
+	do {
+		span->free_list = atomic_exchange_ptr_acquire(&span->free_list_deferred, INVALID_POINTER);
+	} while (span->free_list == INVALID_POINTER);
+	span->used_count -= span->list_size;
+	span->list_size = 0;
+	atomic_store_ptr_release(&span->free_list_deferred, 0);
+}
+
+static int
+_rpmalloc_span_is_fully_utilized(span_t* span) {
+	assert(span->free_list_limit <= span->block_count);
+	return !span->free_list && (span->free_list_limit >= span->block_count);
+}
+
+static int
+_rpmalloc_span_finalize(heap_t* heap, size_t iclass, span_t* span, span_t** list_head) {
+	void* free_list = heap->size_class[iclass].free_list;
+	span_t* class_span = (span_t*)((uintptr_t)free_list & _memory_span_mask);
+	if (span == class_span) {
+		// Adopt the heap class free list back into the span free list
+		void* block = span->free_list;
+		void* last_block = 0;
+		while (block) {
+			last_block = block;
+			block = *((void**)block);
 		}
-		last->next = 0;
-		assert(next);
-		next->list_size = span->list_size - list_size;
-		span->list_size = list_size;
-		span->prev = 0;
+		uint32_t free_count = 0;
+		block = free_list;
+		while (block) {
+			++free_count;
+			block = *((void**)block);
+		}
+		if (last_block) {
+			*((void**)last_block) = free_list;
+		} else {
+			span->free_list = free_list;
+		}
+		heap->size_class[iclass].free_list = 0;
+		span->used_count -= free_count;
+	}
+	//If this assert triggers you have memory leaks
+	assert(span->list_size == span->used_count);
+	if (span->list_size == span->used_count) {
+		_rpmalloc_stat_dec(&heap->span_use[0].current);
+		_rpmalloc_stat_dec(&heap->size_class_use[iclass].spans_current);
+		// This function only used for spans in double linked lists
+		if (list_head)
+			_rpmalloc_span_double_link_list_remove(list_head, span);
+		_rpmalloc_span_unmap(span);
+		return 1;
 	}
-	return next;
+	return 0;
 }
 
-#endif
 
-//! Add a span to partial span double linked list at the head
-static void
-_memory_span_partial_list_add(span_t** head, span_t* span) {
-	if (*head) {
-		span->next = *head;
-		//Maintain pointer to tail span
-		span->prev = (*head)->prev;
-		(*head)->prev = span;
-	} else {
-		span->next = 0;
-		span->prev = span;
-	}
-	*head = span;
-}
+////////////
+///
+/// Global cache
+///
+//////
 
-//! Add a span to partial span double linked list at the tail
+#if ENABLE_GLOBAL_CACHE
+
+//! Finalize a global cache
 static void
-_memory_span_partial_list_add_tail(span_t** head, span_t* span) {
-	span->next = 0;
-	if (*head) {
-		span_t* tail = (*head)->prev;
-		tail->next = span;
-		span->prev = tail;
-		//Maintain pointer to tail span
-		(*head)->prev = span;
-	} else {
-		span->prev = span;
-		*head = span;
+_rpmalloc_global_cache_finalize(global_cache_t* cache) {
+	while (!atomic_cas32_acquire(&cache->lock, 1, 0))
+		/* Spin */;
+
+	for (size_t ispan = 0; ispan < cache->count; ++ispan)
+		_rpmalloc_span_unmap(cache->span[ispan]);
+	cache->count = 0;
+
+	while (cache->overflow) {
+		span_t* span = cache->overflow;
+		cache->overflow = span->next;
+		_rpmalloc_span_unmap(span);
 	}
+
+	atomic_store32_release(&cache->lock, 0);
 }
 
-//! Pop head span from partial span double linked list
 static void
-_memory_span_partial_list_pop_head(span_t** head) {
-	span_t* span = *head;
-	*head = span->next;
-	if (*head) {
-		//Maintain pointer to tail span
-		(*head)->prev = span->prev;
+_rpmalloc_global_cache_insert_spans(span_t** span, size_t span_count, size_t count) {
+	const size_t cache_limit = (span_count == 1) ? 
+		GLOBAL_CACHE_MULTIPLIER * MAX_THREAD_SPAN_CACHE :
+		GLOBAL_CACHE_MULTIPLIER * (MAX_THREAD_SPAN_LARGE_CACHE - (span_count >> 1));
+
+	global_cache_t* cache = &_memory_span_cache[span_count - 1];
+
+	size_t insert_count = count;
+	while (!atomic_cas32_acquire(&cache->lock, 1, 0))
+		/* Spin */;
+
+	if ((cache->count + insert_count) > cache_limit)
+		insert_count = cache_limit - cache->count;
+
+	memcpy(cache->span + cache->count, span, sizeof(span_t*) * insert_count);
+	cache->count += (uint32_t)insert_count;
+
+#if ENABLE_UNLIMITED_CACHE
+	while (insert_count < count) {
+#else
+	// Enable unlimited cache if huge pages, or we will leak since it is unlikely that an entire huge page
+	// will be unmapped, and we're unable to partially decommit a huge page
+	while ((_memory_page_size > _memory_span_size) && (insert_count < count)) {
+#endif		
+		span_t* current_span = span[insert_count++];
+		current_span->next = cache->overflow;
+		cache->overflow = current_span;
 	}
+	atomic_store32_release(&cache->lock, 0);
+
+	for (size_t ispan = insert_count; ispan < count; ++ispan)
+		_rpmalloc_span_unmap(span[ispan]);
 }
 
-//! Remove a span from partial span double linked list
-static void
-_memory_span_partial_list_remove(span_t** head, span_t* span) {
-	if (UNEXPECTED(*head == span)) {
-		_memory_span_partial_list_pop_head(head);
-	} else {
-		span_t* next_span = span->next;
-		span_t* prev_span = span->prev;
-		prev_span->next = next_span;
-		if (EXPECTED(next_span != 0)) {
-			next_span->prev = prev_span;
-		} else {
-			//Update pointer to tail span
-			(*head)->prev = prev_span;
-		}
+static size_t
+_rpmalloc_global_cache_extract_spans(span_t** span, size_t span_count, size_t count) {
+	global_cache_t* cache = &_memory_span_cache[span_count - 1];
+
+	size_t extract_count = count;
+	while (!atomic_cas32_acquire(&cache->lock, 1, 0))
+		/* Spin */;
+
+	if (extract_count > cache->count)
+		extract_count = cache->count;
+
+	memcpy(span, cache->span + (cache->count - extract_count), sizeof(span_t*) * extract_count);
+	cache->count -= (uint32_t)extract_count;
+
+	while ((extract_count < count) && cache->overflow) {
+		span_t* current_span = cache->overflow;
+		span[extract_count++] = current_span;
+		cache->overflow = current_span->next;
 	}
+	atomic_store32_release(&cache->lock, 0);
+
+	return extract_count;
 }
 
-#if ENABLE_GLOBAL_CACHE
+#endif
+
+////////////
+///
+/// Heap control
+///
+//////
+
+static void _rpmalloc_deallocate_huge(span_t*);
 
-//! Insert the given list of memory page spans in the global cache
+//! Store the given spans as reserve in the given heap
 static void
-_memory_cache_insert(global_cache_t* cache, span_t* span, size_t cache_limit) {
-	assert((span->list_size == 1) || (span->next != 0));
-	int32_t list_size = (int32_t)span->list_size;
-	//Unmap if cache has reached the limit
-	if (atomic_add32(&cache->size, list_size) > (int32_t)cache_limit) {
-#if !ENABLE_UNLIMITED_GLOBAL_CACHE
-		_memory_unmap_span_list(span);
-		atomic_add32(&cache->size, -list_size);
-		return;
-#endif
-	}
-	void* current_cache, *new_cache;
-	do {
-		current_cache = atomic_load_ptr(&cache->cache);
-		span->prev = (void*)((uintptr_t)current_cache & _memory_span_mask);
-		new_cache = (void*)((uintptr_t)span | ((uintptr_t)atomic_incr32(&cache->counter) & ~_memory_span_mask));
-	} while (!atomic_cas_ptr(&cache->cache, new_cache, current_cache));
+_rpmalloc_heap_set_reserved_spans(heap_t* heap, span_t* master, span_t* reserve, size_t reserve_span_count) {
+	heap->span_reserve_master = master;
+	heap->span_reserve = reserve;
+	heap->spans_reserved = (uint32_t)reserve_span_count;
 }
 
-//! Extract a number of memory page spans from the global cache
-static span_t*
-_memory_cache_extract(global_cache_t* cache) {
-	uintptr_t span_ptr;
-	do {
-		void* global_span = atomic_load_ptr(&cache->cache);
-		span_ptr = (uintptr_t)global_span & _memory_span_mask;
-		if (span_ptr) {
-			span_t* span = (void*)span_ptr;
-			//By accessing the span ptr before it is swapped out of list we assume that a contending thread
-			//does not manage to traverse the span to being unmapped before we access it
-			void* new_cache = (void*)((uintptr_t)span->prev | ((uintptr_t)atomic_incr32(&cache->counter) & ~_memory_span_mask));
-			if (atomic_cas_ptr(&cache->cache, new_cache, global_span)) {
-				atomic_add32(&cache->size, -(int32_t)span->list_size);
-				return span;
+//! Adopt the deferred span cache list, optionally extracting the first single span for immediate re-use
+static void
+_rpmalloc_heap_cache_adopt_deferred(heap_t* heap, span_t** single_span) {
+	span_t* span = (span_t*)((void*)atomic_exchange_ptr_acquire(&heap->span_free_deferred, 0));
+	while (span) {
+		span_t* next_span = (span_t*)span->free_list;
+		assert(span->heap == heap);
+		if (EXPECTED(span->size_class < SIZE_CLASS_COUNT)) {
+			assert(heap->full_span_count);
+			--heap->full_span_count;
+#if RPMALLOC_FIRST_CLASS_HEAPS
+			_rpmalloc_span_double_link_list_remove(&heap->full_span[span->size_class], span);
+#endif
+			if (single_span && !*single_span) {
+				*single_span = span;
+			} else {
+				_rpmalloc_stat_dec(&heap->span_use[0].current);
+				_rpmalloc_stat_dec(&heap->size_class_use[span->size_class].spans_current);
+				_rpmalloc_heap_cache_insert(heap, span);
+			}
+		} else {
+			if (span->size_class == SIZE_CLASS_HUGE) {
+				_rpmalloc_deallocate_huge(span);
+			} else {
+				assert(span->size_class == SIZE_CLASS_LARGE);
+				assert(heap->full_span_count);
+				--heap->full_span_count;
+#if RPMALLOC_FIRST_CLASS_HEAPS
+				_rpmalloc_span_double_link_list_remove(&heap->large_huge_span, span);
+#endif
+				uint32_t idx = span->span_count - 1;
+				if (!idx && single_span && !*single_span) {
+					*single_span = span;
+				} else {
+					_rpmalloc_stat_dec(&heap->span_use[idx].current);
+					_rpmalloc_heap_cache_insert(heap, span);
+				}
 			}
 		}
-	} while (span_ptr);
-	return 0;
+		span = next_span;
+	}
 }
 
-//! Finalize a global cache, only valid from allocator finalization (not thread safe)
 static void
-_memory_cache_finalize(global_cache_t* cache) {
-	void* current_cache = atomic_load_ptr(&cache->cache);
-	span_t* span = (void*)((uintptr_t)current_cache & _memory_span_mask);
-	while (span) {
-		span_t* skip_span = (void*)((uintptr_t)span->prev & _memory_span_mask);
-		atomic_add32(&cache->size, -(int32_t)span->list_size);
-		_memory_unmap_span_list(span);
-		span = skip_span;
+_rpmalloc_heap_unmap(heap_t* heap) {
+	if (!heap->master_heap) {
+		if ((heap->finalize > 1) && !atomic_load32(&heap->child_count)) {
+			span_t* span = (span_t*)((uintptr_t)heap & _memory_span_mask);
+			_rpmalloc_span_unmap(span);
+		}
+	} else {
+		if (atomic_decr32(&heap->master_heap->child_count) == 0) {
+			_rpmalloc_heap_unmap(heap->master_heap);
+		}
 	}
-	assert(!atomic_load32(&cache->size));
-	atomic_store_ptr(&cache->cache, 0);
-	atomic_store32(&cache->size, 0);
 }
 
-//! Insert the given list of memory page spans in the global cache
 static void
-_memory_global_cache_insert(span_t* span) {
-	size_t span_count = span->span_count;
-#if ENABLE_UNLIMITED_GLOBAL_CACHE
-	_memory_cache_insert(&_memory_span_cache[span_count - 1], span, 0);
-#else
-	const size_t cache_limit = (GLOBAL_CACHE_MULTIPLIER * ((span_count == 1) ? _memory_span_release_count : _memory_span_release_count_large));
-	_memory_cache_insert(&_memory_span_cache[span_count - 1], span, cache_limit);
-#endif
-}
+_rpmalloc_heap_global_finalize(heap_t* heap) {
+	if (heap->finalize++ > 1) {
+		--heap->finalize;
+		return;
+	}
 
-//! Extract a number of memory page spans from the global cache for large blocks
-static span_t*
-_memory_global_cache_extract(size_t span_count) {
-	span_t* span = _memory_cache_extract(&_memory_span_cache[span_count - 1]);
-	assert(!span || (span->span_count == span_count));
-	return span;
-}
+	_rpmalloc_heap_finalize(heap);
 
+#if ENABLE_THREAD_CACHE
+	for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) {
+		span_cache_t* span_cache;
+		if (!iclass)
+			span_cache = &heap->span_cache;
+		else
+			span_cache = (span_cache_t*)(heap->span_large_cache + (iclass - 1));
+		for (size_t ispan = 0; ispan < span_cache->count; ++ispan)
+			_rpmalloc_span_unmap(span_cache->span[ispan]);
+		span_cache->count = 0;
+	}
 #endif
 
-#if ENABLE_THREAD_CACHE
-//! Adopt the deferred span cache list
-static void
-_memory_heap_cache_adopt_deferred(heap_t* heap) {
-	atomic_thread_fence_acquire();
-	span_t* span = atomic_load_ptr(&heap->span_cache_deferred);
-	if (!span)
+	if (heap->full_span_count) {
+		--heap->finalize;
 		return;
-	do {
-		span = atomic_load_ptr(&heap->span_cache_deferred);
-	} while (!atomic_cas_ptr(&heap->span_cache_deferred, 0, span));
-	while (span) {
-		span_t* next_span = span->next;
-		_memory_span_list_push(&heap->span_cache[0], span);
-#if ENABLE_STATISTICS
-		atomic_decr32(&heap->span_use[span->span_count - 1].current);
-		++heap->size_class_use[span->size_class].spans_to_cache;
-		--heap->size_class_use[span->size_class].spans_current;
-#endif
-		span = next_span;
 	}
+
+	for (size_t iclass = 0; iclass < SIZE_CLASS_COUNT; ++iclass) {
+		if (heap->size_class[iclass].free_list || heap->size_class[iclass].partial_span) {
+			--heap->finalize;
+			return;
+		}
+	}
+	//Heap is now completely free, unmap and remove from heap list
+	size_t list_idx = heap->id % HEAP_ARRAY_SIZE;
+	heap_t* list_heap = _memory_heaps[list_idx];
+	if (list_heap == heap) {
+		_memory_heaps[list_idx] = heap->next_heap;
+	} else {
+		while (list_heap->next_heap != heap)
+			list_heap = list_heap->next_heap;
+		list_heap->next_heap = heap->next_heap;
+	}
+
+	_rpmalloc_heap_unmap(heap);
 }
-#endif
 
 //! Insert a single span into thread heap cache, releasing to global cache if overflow
 static void
-_memory_heap_cache_insert(heap_t* heap, span_t* span) {
+_rpmalloc_heap_cache_insert(heap_t* heap, span_t* span) {
+	if (UNEXPECTED(heap->finalize != 0)) {
+		_rpmalloc_span_unmap(span);
+		_rpmalloc_heap_global_finalize(heap);
+		return;
+	}
 #if ENABLE_THREAD_CACHE
 	size_t span_count = span->span_count;
-	size_t idx = span_count - 1;
-	_memory_statistics_inc(heap->span_use[idx].spans_to_cache, 1);
-	if (!idx)
-		_memory_heap_cache_adopt_deferred(heap);
-#if ENABLE_UNLIMITED_THREAD_CACHE
-	_memory_span_list_push(&heap->span_cache[idx], span);
-#else
-	const size_t release_count = (!idx ? _memory_span_release_count : _memory_span_release_count_large);
-	size_t current_cache_size = _memory_span_list_push(&heap->span_cache[idx], span);
-	if (current_cache_size <= release_count)
-		return;
-	const size_t hard_limit = release_count * THREAD_CACHE_MULTIPLIER;
-	if (current_cache_size <= hard_limit) {
-#if ENABLE_ADAPTIVE_THREAD_CACHE
-		//Require 25% of high water mark to remain in cache (and at least 1, if use is 0)
-		const size_t high_mark = heap->span_use[idx].high;
-		const size_t min_limit = (high_mark >> 2) + release_count + 1;
-		if (current_cache_size < min_limit)
-			return;
+	_rpmalloc_stat_inc(&heap->span_use[span_count - 1].spans_to_cache);
+	if (span_count == 1) {
+		span_cache_t* span_cache = &heap->span_cache;
+		span_cache->span[span_cache->count++] = span;
+		if (span_cache->count == MAX_THREAD_SPAN_CACHE) {
+			const size_t remain_count = MAX_THREAD_SPAN_CACHE - THREAD_SPAN_CACHE_TRANSFER;
+#if ENABLE_GLOBAL_CACHE
+			_rpmalloc_stat_add64(&heap->thread_to_global, THREAD_SPAN_CACHE_TRANSFER * _memory_span_size);
+			_rpmalloc_stat_add(&heap->span_use[span_count - 1].spans_to_global, THREAD_SPAN_CACHE_TRANSFER);
+			_rpmalloc_global_cache_insert_spans(span_cache->span + remain_count, span_count, THREAD_SPAN_CACHE_TRANSFER);
 #else
-		return;
-#endif
-	}
-	heap->span_cache[idx] = _memory_span_list_split(span, release_count);
-	assert(span->list_size == release_count);
-#if ENABLE_STATISTICS
-	heap->thread_to_global += (size_t)span->list_size * span_count * _memory_span_size;
-	heap->span_use[idx].spans_to_global += span->list_size;
+			for (size_t ispan = 0; ispan < THREAD_SPAN_CACHE_TRANSFER; ++ispan)
+				_rpmalloc_span_unmap(span_cache->span[remain_count + ispan]);
 #endif
+			span_cache->count = remain_count;
+		}
+	} else {
+		size_t cache_idx = span_count - 2;
+		span_large_cache_t* span_cache = heap->span_large_cache + cache_idx;
+		span_cache->span[span_cache->count++] = span;
+		const size_t cache_limit = (MAX_THREAD_SPAN_LARGE_CACHE - (span_count >> 1));
+		if (span_cache->count == cache_limit) {
+			const size_t transfer_limit = 2 + (cache_limit >> 2);
+			const size_t transfer_count = (THREAD_SPAN_LARGE_CACHE_TRANSFER <= transfer_limit ? THREAD_SPAN_LARGE_CACHE_TRANSFER : transfer_limit);
+			const size_t remain_count = cache_limit - transfer_count;
 #if ENABLE_GLOBAL_CACHE
-	_memory_global_cache_insert(span);
+			_rpmalloc_stat_add64(&heap->thread_to_global, transfer_count * span_count * _memory_span_size);
+			_rpmalloc_stat_add(&heap->span_use[span_count - 1].spans_to_global, transfer_count);
+			_rpmalloc_global_cache_insert_spans(span_cache->span + remain_count, span_count, transfer_count);
 #else
-	_memory_unmap_span_list(span);
-#endif
+			for (size_t ispan = 0; ispan < transfer_count; ++ispan)
+				_rpmalloc_span_unmap(span_cache->span[remain_count + ispan]);
 #endif
+			span_cache->count = remain_count;
+		}
+	}
 #else
 	(void)sizeof(heap);
-	_memory_unmap_span(span);
+	_rpmalloc_span_unmap(span);
 #endif
 }
 
 //! Extract the given number of spans from the different cache levels
 static span_t*
-_memory_heap_thread_cache_extract(heap_t* heap, size_t span_count) {
+_rpmalloc_heap_thread_cache_extract(heap_t* heap, size_t span_count) {
+	span_t* span = 0;
+	if (span_count == 1) {
+		_rpmalloc_heap_cache_adopt_deferred(heap, &span);
+		if (span)
+			return span;
+	}
 #if ENABLE_THREAD_CACHE
-	size_t idx = span_count - 1;
-	if (!idx)
-		_memory_heap_cache_adopt_deferred(heap);
-	if (heap->span_cache[idx]) {
-#if ENABLE_STATISTICS
-		heap->span_use[idx].spans_from_cache++;
-#endif
-		return _memory_span_list_pop(&heap->span_cache[idx]);
+	span_cache_t* span_cache;
+	if (span_count == 1)
+		span_cache = &heap->span_cache;
+	else
+		span_cache = (span_cache_t*)(heap->span_large_cache + (span_count - 2));
+	if (span_cache->count) {
+		_rpmalloc_stat_inc(&heap->span_use[span_count - 1].spans_from_cache);
+		return span_cache->span[--span_cache->count];
 	}
 #endif
-	return 0;
+	return span;
 }
 
 static span_t*
-_memory_heap_reserved_extract(heap_t* heap, size_t span_count) {
+_rpmalloc_heap_reserved_extract(heap_t* heap, size_t span_count) {
 	if (heap->spans_reserved >= span_count)
-		return _memory_map_spans(heap, span_count);
+		return _rpmalloc_span_map(heap, span_count);
 	return 0;
 }
 
 //! Extract a span from the global cache
 static span_t*
-_memory_heap_global_cache_extract(heap_t* heap, size_t span_count) {
+_rpmalloc_heap_global_cache_extract(heap_t* heap, size_t span_count) {
 #if ENABLE_GLOBAL_CACHE
-	size_t idx = span_count - 1;
-	heap->span_cache[idx] = _memory_global_cache_extract(span_count);
-	if (heap->span_cache[idx]) {
-#if ENABLE_STATISTICS
-		heap->global_to_thread += (size_t)heap->span_cache[idx]->list_size * span_count * _memory_span_size;
-		heap->span_use[idx].spans_from_global += heap->span_cache[idx]->list_size;
-#endif
-		return _memory_span_list_pop(&heap->span_cache[idx]);
+#if ENABLE_THREAD_CACHE
+	span_cache_t* span_cache;
+	size_t wanted_count;
+	if (span_count == 1) {
+		span_cache = &heap->span_cache;
+		wanted_count = THREAD_SPAN_CACHE_TRANSFER;
+	} else {
+		span_cache = (span_cache_t*)(heap->span_large_cache + (span_count - 2));
+		wanted_count = THREAD_SPAN_LARGE_CACHE_TRANSFER;
 	}
+	span_cache->count = _rpmalloc_global_cache_extract_spans(span_cache->span, span_count, wanted_count);
+	if (span_cache->count) {
+		_rpmalloc_stat_add64(&heap->global_to_thread, span_count * span_cache->count * _memory_span_size);
+		_rpmalloc_stat_add(&heap->span_use[span_count - 1].spans_from_global, span_cache->count);
+		return span_cache->span[--span_cache->count];
+	}
+#else
+	span_t* span = 0;
+	size_t count = _rpmalloc_global_cache_extract_spans(&span, span_count, 1);
+	if (count) {
+		_rpmalloc_stat_add64(&heap->global_to_thread, span_count * count * _memory_span_size);
+		_rpmalloc_stat_add(&heap->span_use[span_count - 1].spans_from_global, count);
+		return span;
+	}
+#endif
 #endif
+	(void)sizeof(heap);
+	(void)sizeof(span_count);
 	return 0;
 }
 
 //! Get a span from one of the cache levels (thread cache, reserved, global cache) or fallback to mapping more memory
 static span_t*
-_memory_heap_extract_new_span(heap_t* heap, size_t span_count, uint32_t class_idx) {
-	(void)sizeof(class_idx);
+_rpmalloc_heap_extract_new_span(heap_t* heap, size_t span_count, uint32_t class_idx) {
+	span_t* span;
 #if ENABLE_ADAPTIVE_THREAD_CACHE || ENABLE_STATISTICS
 	uint32_t idx = (uint32_t)span_count - 1;
 	uint32_t current_count = (uint32_t)atomic_incr32(&heap->span_use[idx].current);
-	if (current_count > heap->span_use[idx].high)
-		heap->span_use[idx].high = current_count;
-#if ENABLE_STATISTICS
-	uint32_t spans_current = ++heap->size_class_use[class_idx].spans_current;
-	if (spans_current > heap->size_class_use[class_idx].spans_peak)
-		heap->size_class_use[class_idx].spans_peak = spans_current;
+	if (current_count > (uint32_t)atomic_load32(&heap->span_use[idx].high))
+		atomic_store32(&heap->span_use[idx].high, (int32_t)current_count);
+	_rpmalloc_stat_add_peak(&heap->size_class_use[class_idx].spans_current, 1, heap->size_class_use[class_idx].spans_peak);
 #endif
-#endif	
-	span_t* span = _memory_heap_thread_cache_extract(heap, span_count);
+#if ENABLE_THREAD_CACHE
+	if (class_idx < SIZE_CLASS_COUNT) {
+		if (heap->size_class[class_idx].cache) {
+			span = heap->size_class[class_idx].cache;
+			span_t* new_cache = 0;
+			if (heap->span_cache.count)
+				new_cache = heap->span_cache.span[--heap->span_cache.count];
+			heap->size_class[class_idx].cache = new_cache;
+			return span;
+		}
+	}
+#else
+	(void)sizeof(class_idx);
+#endif
+	span = _rpmalloc_heap_thread_cache_extract(heap, span_count);
 	if (EXPECTED(span != 0)) {
-		_memory_statistics_inc(heap->size_class_use[class_idx].spans_from_cache, 1);
+		_rpmalloc_stat_inc(&heap->size_class_use[class_idx].spans_from_cache);
 		return span;
 	}
-	span = _memory_heap_reserved_extract(heap, span_count);
+	span = _rpmalloc_heap_reserved_extract(heap, span_count);
 	if (EXPECTED(span != 0)) {
-		_memory_statistics_inc(heap->size_class_use[class_idx].spans_from_reserved, 1);
+		_rpmalloc_stat_inc(&heap->size_class_use[class_idx].spans_from_reserved);
 		return span;
 	}
-	span = _memory_heap_global_cache_extract(heap, span_count);
+	span = _rpmalloc_heap_global_cache_extract(heap, span_count);
 	if (EXPECTED(span != 0)) {
-		_memory_statistics_inc(heap->size_class_use[class_idx].spans_from_cache, 1);
+		_rpmalloc_stat_inc(&heap->size_class_use[class_idx].spans_from_cache);
 		return span;
 	}
 	//Final fallback, map in more virtual memory
-	span = _memory_map_spans(heap, span_count);
-	_memory_statistics_inc(heap->size_class_use[class_idx].spans_map_calls, 1);
+	span = _rpmalloc_span_map(heap, span_count);
+	_rpmalloc_stat_inc(&heap->size_class_use[class_idx].spans_map_calls);
 	return span;
 }
 
-//! Move the span (used for small or medium allocations) to the heap thread cache
 static void
-_memory_span_release_to_cache(heap_t* heap, span_t* span) {
-	heap_class_t* heap_class = heap->span_class + span->size_class;
-	assert(heap_class->partial_span != span);
-	if (span->state == SPAN_STATE_PARTIAL)
-		_memory_span_partial_list_remove(&heap_class->partial_span, span);
-#if ENABLE_ADAPTIVE_THREAD_CACHE || ENABLE_STATISTICS
-	atomic_decr32(&heap->span_use[0].current);
+_rpmalloc_heap_initialize(heap_t* heap) {
+	//Get a new heap ID
+	heap->id = 1 + atomic_incr32(&_memory_heap_id);
+
+	//Link in heap in heap ID map
+	size_t list_idx = heap->id % HEAP_ARRAY_SIZE;
+	heap->next_heap = _memory_heaps[list_idx];
+	_memory_heaps[list_idx] = heap;
+}
+
+static void
+_rpmalloc_heap_orphan(heap_t* heap, int first_class) {
+	heap->owner_thread = (uintptr_t)-1;
+#if RPMALLOC_FIRST_CLASS_HEAPS
+	heap_t** heap_list = (first_class ? &_memory_first_class_orphan_heaps : &_memory_orphan_heaps);
+#else
+	(void)sizeof(first_class);
+	heap_t** heap_list = &_memory_orphan_heaps;
 #endif
-	_memory_statistics_inc(heap->span_use[0].spans_to_cache, 1);
-	_memory_statistics_inc(heap->size_class_use[span->size_class].spans_to_cache, 1);
-	_memory_statistics_dec(heap->size_class_use[span->size_class].spans_current, 1);
-	_memory_heap_cache_insert(heap, span);
+	heap->next_orphan = *heap_list;
+	*heap_list = heap;
 }
 
-//! Initialize a (partial) free list up to next system memory page, while reserving the first block
-//! as allocated, returning number of blocks in list
-static uint32_t
-free_list_partial_init(void** list, void** first_block, void* page_start, void* block_start,
-                       uint32_t block_count, uint32_t block_size) {
-	assert(block_count);
-	*first_block = block_start;
-	if (block_count > 1) {
-		void* free_block = pointer_offset(block_start, block_size);
-		void* block_end = pointer_offset(block_start, block_size * block_count);
-		//If block size is less than half a memory page, bound init to next memory page boundary
-		if (block_size < (_memory_page_size >> 1)) {
-			void* page_end = pointer_offset(page_start, _memory_page_size);
-			if (page_end < block_end)
-				block_end = page_end;
+//! Allocate a new heap from newly mapped memory pages
+static heap_t*
+_rpmalloc_heap_allocate_new(void) {
+	// Map in pages for a 16 heaps. If page size is greater than required size for this, map a page and
+	// use first part for heaps and remaining part for spans for allocations. Adds a lot of complexity,
+	// but saves a lot of memory on systems where page size > 64 spans (4MiB)
+	size_t heap_size = sizeof(heap_t);
+	size_t aligned_heap_size = 16 * ((heap_size + 15) / 16);
+	size_t request_heap_count = 16;
+	size_t heap_span_count = ((aligned_heap_size * request_heap_count) + sizeof(span_t) + _memory_span_size - 1) / _memory_span_size;
+	size_t block_size = _memory_span_size * heap_span_count;
+	size_t span_count = heap_span_count;
+	span_t* span = 0;
+	// If there are global reserved spans, use these first
+	if (_memory_global_reserve_count >= heap_span_count) {
+		span = _rpmalloc_global_get_reserved_spans(heap_span_count);
+	}
+	if (!span) {
+		if (_memory_page_size > block_size) {
+			span_count = _memory_page_size / _memory_span_size;
+			block_size = _memory_page_size;
+			// If using huge pages, make sure to grab enough heaps to avoid reallocating a huge page just to serve new heaps
+			size_t possible_heap_count = (block_size - sizeof(span_t)) / aligned_heap_size;
+			if (possible_heap_count >= (request_heap_count * 16))
+				request_heap_count *= 16;
+			else if (possible_heap_count < request_heap_count)
+				request_heap_count = possible_heap_count;
+			heap_span_count = ((aligned_heap_size * request_heap_count) + sizeof(span_t) + _memory_span_size - 1) / _memory_span_size;
 		}
-		*list = free_block;
-		block_count = 2;
-		void* next_block = pointer_offset(free_block, block_size);
-		while (next_block < block_end) {
-			*((void**)free_block) = next_block;
-			free_block = next_block;
-			++block_count;
-			next_block = pointer_offset(next_block, block_size);
+
+		size_t align_offset = 0;
+		span = (span_t*)_rpmalloc_mmap(block_size, &align_offset);
+		if (!span)
+			return 0;
+
+		// Master span will contain the heaps
+		_rpmalloc_stat_add(&_reserved_spans, span_count);
+		_rpmalloc_stat_inc(&_master_spans);
+		_rpmalloc_span_initialize(span, span_count, heap_span_count, align_offset);
+	}
+
+	size_t remain_size = _memory_span_size - sizeof(span_t);
+	heap_t* heap = (heap_t*)pointer_offset(span, sizeof(span_t));
+	_rpmalloc_heap_initialize(heap);
+
+	// Put extra heaps as orphans
+	size_t num_heaps = remain_size / aligned_heap_size;
+	if (num_heaps < request_heap_count)
+		num_heaps = request_heap_count;
+	atomic_store32(&heap->child_count, (int32_t)num_heaps - 1);
+	heap_t* extra_heap = (heap_t*)pointer_offset(heap, aligned_heap_size);
+	while (num_heaps > 1) {
+		_rpmalloc_heap_initialize(extra_heap);
+		extra_heap->master_heap = heap;
+		_rpmalloc_heap_orphan(extra_heap, 1);
+		extra_heap = (heap_t*)pointer_offset(extra_heap, aligned_heap_size);
+		--num_heaps;
+	}
+
+	if (span_count > heap_span_count) {
+		// Cap reserved spans
+		size_t remain_count = span_count - heap_span_count;
+		size_t reserve_count = (remain_count > DEFAULT_SPAN_MAP_COUNT ? DEFAULT_SPAN_MAP_COUNT : remain_count);
+		span_t* remain_span = (span_t*)pointer_offset(span, heap_span_count * _memory_span_size);
+		_rpmalloc_heap_set_reserved_spans(heap, span, remain_span, reserve_count);
+
+		if (remain_count > reserve_count) {
+			// Set to global reserved spans
+			remain_span = (span_t*)pointer_offset(remain_span, reserve_count * _memory_span_size);
+			reserve_count = remain_count - reserve_count;
+			_rpmalloc_global_set_reserved_spans(span, remain_span, reserve_count);
 		}
-		*((void**)free_block) = 0;
-	} else {
-		*list = 0;
 	}
-	return block_count;
-}
 
-//! Initialize an unused span (from cache or mapped) to be new active span
-static void*
-_memory_span_set_new_active(heap_t* heap, heap_class_t* heap_class, span_t* span, uint32_t class_idx) {
-	assert(span->span_count == 1);
-	size_class_t* size_class = _memory_size_class + class_idx;
-	span->size_class = class_idx;
-	span->heap = heap;
-	span->flags &= ~SPAN_FLAG_ALIGNED_BLOCKS;
-	span->block_count = size_class->block_count;
-	span->block_size = size_class->block_size;
-	span->state = SPAN_STATE_ACTIVE;
-	span->free_list = 0;
+	return heap;
+}
 
-	//Setup free list. Only initialize one system page worth of free blocks in list
-	void* block;
-	span->free_list_limit = free_list_partial_init(&heap_class->free_list, &block, 
-		span, pointer_offset(span, SPAN_HEADER_SIZE), size_class->block_count, size_class->block_size);
-	atomic_store_ptr(&span->free_list_deferred, 0);
-	span->list_size = 0;
-	atomic_thread_fence_release();
+static heap_t*
+_rpmalloc_heap_extract_orphan(heap_t** heap_list) {
+	heap_t* heap = *heap_list;
+	*heap_list = (heap ? heap->next_orphan : 0);
+	return heap;
+}
 
-	_memory_span_partial_list_add(&heap_class->partial_span, span);
-	return block;
+//! Allocate a new heap, potentially reusing a previously orphaned heap
+static heap_t*
+_rpmalloc_heap_allocate(int first_class) {
+	heap_t* heap = 0;
+	while (!atomic_cas32_acquire(&_memory_global_lock, 1, 0))
+		/* Spin */;
+	if (first_class == 0)
+		heap = _rpmalloc_heap_extract_orphan(&_memory_orphan_heaps);
+#if RPMALLOC_FIRST_CLASS_HEAPS
+	if (!heap)
+		heap = _rpmalloc_heap_extract_orphan(&_memory_first_class_orphan_heaps);
+#endif
+	if (!heap)
+		heap = _rpmalloc_heap_allocate_new();
+	atomic_store32_release(&_memory_global_lock, 0);
+	return heap;
 }
 
-//! Promote a partially used span (from heap used list) to be new active span
 static void
-_memory_span_set_partial_active(heap_class_t* heap_class, span_t* span) {
-	assert(span->state == SPAN_STATE_PARTIAL);
-	assert(span->block_count == _memory_size_class[span->size_class].block_count);
-	//Move data to heap size class and set span as active
-	heap_class->free_list = span->free_list;
-	span->state = SPAN_STATE_ACTIVE;
-	span->free_list = 0;
-	assert(heap_class->free_list);
+_rpmalloc_heap_release(void* heapptr, int first_class) {
+	heap_t* heap = (heap_t*)heapptr;
+	if (!heap)
+		return;
+	//Release thread cache spans back to global cache
+	_rpmalloc_heap_cache_adopt_deferred(heap, 0);
+#if ENABLE_THREAD_CACHE
+	for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) {
+		span_cache_t* span_cache;
+		if (!iclass)
+			span_cache = &heap->span_cache;
+		else
+			span_cache = (span_cache_t*)(heap->span_large_cache + (iclass - 1));
+		if (!span_cache->count)
+			continue;
+#if ENABLE_GLOBAL_CACHE
+		if (heap->finalize) {
+			for (size_t ispan = 0; ispan < span_cache->count; ++ispan)
+				_rpmalloc_span_unmap(span_cache->span[ispan]);
+		} else {
+			_rpmalloc_stat_add64(&heap->thread_to_global, span_cache->count * (iclass + 1) * _memory_span_size);
+			_rpmalloc_stat_add(&heap->span_use[iclass].spans_to_global, span_cache->count);
+			_rpmalloc_global_cache_insert_spans(span_cache->span, iclass + 1, span_cache->count);
+		}
+#else
+		for (size_t ispan = 0; ispan < span_cache->count; ++ispan)
+			_rpmalloc_span_unmap(span_cache->span[ispan]);
+#endif
+		span_cache->count = 0;
+	}
+#endif
+
+	if (get_thread_heap_raw() == heap)
+		set_thread_heap(0);
+
+#if ENABLE_STATISTICS
+	atomic_decr32(&_memory_active_heaps);
+	assert(atomic_load32(&_memory_active_heaps) >= 0);
+#endif
+
+	while (!atomic_cas32_acquire(&_memory_global_lock, 1, 0))
+		/* Spin */;
+	_rpmalloc_heap_orphan(heap, first_class);
+	atomic_store32_release(&_memory_global_lock, 0);
 }
 
-//! Mark span as full (from active)
 static void
-_memory_span_set_active_full(heap_class_t* heap_class, span_t* span) {
-	assert(span->state == SPAN_STATE_ACTIVE);
-	assert(span == heap_class->partial_span);
-	_memory_span_partial_list_pop_head(&heap_class->partial_span);
-	span->used_count = span->block_count;
-	span->state = SPAN_STATE_FULL;
-	span->free_list = 0;
+_rpmalloc_heap_release_raw(void* heapptr) {
+	_rpmalloc_heap_release(heapptr, 0);
 }
 
-//! Move span from full to partial state
 static void
-_memory_span_set_full_partial(heap_t* heap, span_t* span) {
-	assert(span->state == SPAN_STATE_FULL);
-	heap_class_t* heap_class = &heap->span_class[span->size_class];
-	span->state = SPAN_STATE_PARTIAL;
-	_memory_span_partial_list_add_tail(&heap_class->partial_span, span);
-}
+_rpmalloc_heap_finalize(heap_t* heap) {
+	if (heap->spans_reserved) {
+		span_t* span = _rpmalloc_span_map(heap, heap->spans_reserved);
+		_rpmalloc_span_unmap(span);
+		heap->spans_reserved = 0;
+	}
 
-static void*
-_memory_span_extract_deferred(span_t* span) {
-	void* free_list;
-	do {
-		free_list = atomic_load_ptr(&span->free_list_deferred);
-	} while ((free_list == INVALID_POINTER) || !atomic_cas_ptr(&span->free_list_deferred, INVALID_POINTER, free_list));
-	span->list_size = 0;
-	atomic_store_ptr(&span->free_list_deferred, 0);
-	atomic_thread_fence_release();
-	return free_list;
+	_rpmalloc_heap_cache_adopt_deferred(heap, 0);
+
+	for (size_t iclass = 0; iclass < SIZE_CLASS_COUNT; ++iclass) {
+		if (heap->size_class[iclass].cache)
+			_rpmalloc_span_unmap(heap->size_class[iclass].cache);
+		heap->size_class[iclass].cache = 0;
+		span_t* span = heap->size_class[iclass].partial_span;
+		while (span) {
+			span_t* next = span->next;
+			_rpmalloc_span_finalize(heap, iclass, span, &heap->size_class[iclass].partial_span);
+			span = next;
+		}
+		// If class still has a free list it must be a full span
+		if (heap->size_class[iclass].free_list) {
+			span_t* class_span = (span_t*)((uintptr_t)heap->size_class[iclass].free_list & _memory_span_mask);
+			span_t** list = 0;
+#if RPMALLOC_FIRST_CLASS_HEAPS
+			list = &heap->full_span[iclass];
+#endif
+			--heap->full_span_count;
+			if (!_rpmalloc_span_finalize(heap, iclass, class_span, list)) {
+				if (list)
+					_rpmalloc_span_double_link_list_remove(list, class_span);
+				_rpmalloc_span_double_link_list_add(&heap->size_class[iclass].partial_span, class_span);
+			}
+		}
+	}
+
+#if ENABLE_THREAD_CACHE
+	for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) {
+		span_cache_t* span_cache;
+		if (!iclass)
+			span_cache = &heap->span_cache;
+		else
+			span_cache = (span_cache_t*)(heap->span_large_cache + (iclass - 1));
+		for (size_t ispan = 0; ispan < span_cache->count; ++ispan)
+			_rpmalloc_span_unmap(span_cache->span[ispan]);
+		span_cache->count = 0;
+	}
+#endif
+	assert(!atomic_load_ptr(&heap->span_free_deferred));
 }
 
+
+////////////
+///
+/// Allocation entry points
+///
+//////
+
 //! Pop first block from a free list
 static void*
 free_list_pop(void** list) {
@@ -1226,84 +1850,83 @@ free_list_pop(void** list) {
 
 //! Allocate a small/medium sized memory block from the given heap
 static void*
-_memory_allocate_from_heap_fallback(heap_t* heap, uint32_t class_idx) {
-	heap_class_t* heap_class = &heap->span_class[class_idx];
-	void* block;
-
-	span_t* active_span = heap_class->partial_span;
-	if (EXPECTED(active_span != 0)) {
-		assert(active_span->state == SPAN_STATE_ACTIVE);
-		assert(active_span->block_count == _memory_size_class[active_span->size_class].block_count);
-		//Swap in free list if not empty
-		if (active_span->free_list) {
-			heap_class->free_list = active_span->free_list;
-			active_span->free_list = 0;
-			return free_list_pop(&heap_class->free_list);
-		}
-		//If the span did not fully initialize free list, link up another page worth of blocks
-		if (active_span->free_list_limit < active_span->block_count) {
-			void* block_start = pointer_offset(active_span, SPAN_HEADER_SIZE + (active_span->free_list_limit * active_span->block_size));
-			active_span->free_list_limit += free_list_partial_init(&heap_class->free_list, &block,
+_rpmalloc_allocate_from_heap_fallback(heap_t* heap, uint32_t class_idx) {
+	span_t* span = heap->size_class[class_idx].partial_span;
+	if (EXPECTED(span != 0)) {
+		assert(span->block_count == _memory_size_class[span->size_class].block_count);
+		assert(!_rpmalloc_span_is_fully_utilized(span));
+		void* block;
+		if (span->free_list) {
+			//Swap in free list if not empty
+			heap->size_class[class_idx].free_list = span->free_list;
+			span->free_list = 0;
+			block = free_list_pop(&heap->size_class[class_idx].free_list);
+		} else {
+			//If the span did not fully initialize free list, link up another page worth of blocks			
+			void* block_start = pointer_offset(span, SPAN_HEADER_SIZE + ((size_t)span->free_list_limit * span->block_size));
+			span->free_list_limit += free_list_partial_init(&heap->size_class[class_idx].free_list, &block,
 				(void*)((uintptr_t)block_start & ~(_memory_page_size - 1)), block_start,
-				active_span->block_count - active_span->free_list_limit, active_span->block_size);
-			return block;
-		}
-		//Swap in deferred free list
-		atomic_thread_fence_acquire();
-		if (atomic_load_ptr(&active_span->free_list_deferred)) {
-			heap_class->free_list = _memory_span_extract_deferred(active_span);
-			return free_list_pop(&heap_class->free_list);
+				span->block_count - span->free_list_limit, span->block_size);
 		}
+		assert(span->free_list_limit <= span->block_count);
+		span->used_count = span->free_list_limit;
 
-		//If the active span is fully allocated, mark span as free floating (fully allocated and not part of any list)
-		assert(!heap_class->free_list);
-		assert(active_span->free_list_limit >= active_span->block_count);
-		_memory_span_set_active_full(heap_class, active_span);
-	}
-	assert(!heap_class->free_list);
+		//Swap in deferred free list if present
+		if (atomic_load_ptr(&span->free_list_deferred))
+			_rpmalloc_span_extract_free_list_deferred(span);
+
+		//If span is still not fully utilized keep it in partial list and early return block
+		if (!_rpmalloc_span_is_fully_utilized(span))
+			return block;
 
-	//Try promoting a semi-used span to active
-	active_span = heap_class->partial_span;
-	if (EXPECTED(active_span != 0)) {
-		_memory_span_set_partial_active(heap_class, active_span);
-		return free_list_pop(&heap_class->free_list);
+		//The span is fully utilized, unlink from partial list and add to fully utilized list
+		_rpmalloc_span_double_link_list_pop_head(&heap->size_class[class_idx].partial_span, span);
+#if RPMALLOC_FIRST_CLASS_HEAPS
+		_rpmalloc_span_double_link_list_add(&heap->full_span[class_idx], span);
+#endif
+		++heap->full_span_count;
+		return block;
 	}
-	assert(!heap_class->free_list);
-	assert(!heap_class->partial_span);
 
 	//Find a span in one of the cache levels
-	active_span = _memory_heap_extract_new_span(heap, 1, class_idx);
+	span = _rpmalloc_heap_extract_new_span(heap, 1, class_idx);
+	if (EXPECTED(span != 0)) {
+		//Mark span as owned by this heap and set base data, return first block
+		return _rpmalloc_span_initialize_new(heap, span, class_idx);
+	}
 
-	//Mark span as owned by this heap and set base data, return first block
-	return _memory_span_set_new_active(heap, heap_class, active_span, class_idx);
+	return 0;
 }
 
 //! Allocate a small sized memory block from the given heap
 static void*
-_memory_allocate_small(heap_t* heap, size_t size) {
+_rpmalloc_allocate_small(heap_t* heap, size_t size) {
+	assert(heap);
 	//Small sizes have unique size classes
 	const uint32_t class_idx = (uint32_t)((size + (SMALL_GRANULARITY - 1)) >> SMALL_GRANULARITY_SHIFT);
-	_memory_statistics_inc_alloc(heap, class_idx);
-	if (EXPECTED(heap->span_class[class_idx].free_list != 0))
-		return free_list_pop(&heap->span_class[class_idx].free_list);
-	return _memory_allocate_from_heap_fallback(heap, class_idx);
+	_rpmalloc_stat_inc_alloc(heap, class_idx);
+	if (EXPECTED(heap->size_class[class_idx].free_list != 0))
+		return free_list_pop(&heap->size_class[class_idx].free_list);
+	return _rpmalloc_allocate_from_heap_fallback(heap, class_idx);
 }
 
 //! Allocate a medium sized memory block from the given heap
 static void*
-_memory_allocate_medium(heap_t* heap, size_t size) {
+_rpmalloc_allocate_medium(heap_t* heap, size_t size) {
+	assert(heap);
 	//Calculate the size class index and do a dependent lookup of the final class index (in case of merged classes)
 	const uint32_t base_idx = (uint32_t)(SMALL_CLASS_COUNT + ((size - (SMALL_SIZE_LIMIT + 1)) >> MEDIUM_GRANULARITY_SHIFT));
 	const uint32_t class_idx = _memory_size_class[base_idx].class_idx;
-	_memory_statistics_inc_alloc(heap, class_idx);
-	if (EXPECTED(heap->span_class[class_idx].free_list != 0))
-		return free_list_pop(&heap->span_class[class_idx].free_list);
-	return _memory_allocate_from_heap_fallback(heap, class_idx);
+	_rpmalloc_stat_inc_alloc(heap, class_idx);
+	if (EXPECTED(heap->size_class[class_idx].free_list != 0))
+		return free_list_pop(&heap->size_class[class_idx].free_list);
+	return _rpmalloc_allocate_from_heap_fallback(heap, class_idx);
 }
 
 //! Allocate a large sized memory block from the given heap
 static void*
-_memory_allocate_large(heap_t* heap, size_t size) {
+_rpmalloc_allocate_large(heap_t* heap, size_t size) {
+	assert(heap);
 	//Calculate number of needed max sized spans (including header)
 	//Since this function is never called if size > LARGE_SIZE_LIMIT
 	//the span_count is guaranteed to be <= LARGE_CLASS_COUNT
@@ -1311,154 +1934,244 @@ _memory_allocate_large(heap_t* heap, size_t size) {
 	size_t span_count = size >> _memory_span_size_shift;
 	if (size & (_memory_span_size - 1))
 		++span_count;
-	size_t idx = span_count - 1;
 
 	//Find a span in one of the cache levels
-	span_t* span = _memory_heap_extract_new_span(heap, span_count, SIZE_CLASS_COUNT);
+	span_t* span = _rpmalloc_heap_extract_new_span(heap, span_count, SIZE_CLASS_LARGE);
+	if (!span)
+		return span;
 
 	//Mark span as owned by this heap and set base data
 	assert(span->span_count == span_count);
-	span->size_class = (uint32_t)(SIZE_CLASS_COUNT + idx);
+	span->size_class = SIZE_CLASS_LARGE;
 	span->heap = heap;
-	atomic_thread_fence_release();
+
+#if RPMALLOC_FIRST_CLASS_HEAPS
+	_rpmalloc_span_double_link_list_add(&heap->large_huge_span, span);
+#endif
+	++heap->full_span_count;
 
 	return pointer_offset(span, SPAN_HEADER_SIZE);
 }
 
 //! Allocate a huge block by mapping memory pages directly
 static void*
-_memory_allocate_huge(size_t size) {
+_rpmalloc_allocate_huge(heap_t* heap, size_t size) {
+	assert(heap);
 	size += SPAN_HEADER_SIZE;
 	size_t num_pages = size >> _memory_page_size_shift;
 	if (size & (_memory_page_size - 1))
 		++num_pages;
 	size_t align_offset = 0;
-	span_t* span = _memory_map(num_pages * _memory_page_size, &align_offset);
+	span_t* span = (span_t*)_rpmalloc_mmap(num_pages * _memory_page_size, &align_offset);
 	if (!span)
 		return span;
+
 	//Store page count in span_count
-	span->size_class = (uint32_t)-1;
+	span->size_class = SIZE_CLASS_HUGE;
 	span->span_count = (uint32_t)num_pages;
 	span->align_offset = (uint32_t)align_offset;
-	_memory_statistics_add_peak(&_huge_pages_current, num_pages, _huge_pages_peak);
+	span->heap = heap;
+	_rpmalloc_stat_add_peak(&_huge_pages_current, num_pages, _huge_pages_peak);
 
-	return pointer_offset(span, SPAN_HEADER_SIZE);
-}
+#if RPMALLOC_FIRST_CLASS_HEAPS
+	_rpmalloc_span_double_link_list_add(&heap->large_huge_span, span);
+#endif
+	++heap->full_span_count;
 
-//! Allocate a block larger than medium size
-static void*
-_memory_allocate_oversized(heap_t* heap, size_t size) {
-	if (size <= LARGE_SIZE_LIMIT)
-		return _memory_allocate_large(heap, size);
-	return _memory_allocate_huge(size);
+	return pointer_offset(span, SPAN_HEADER_SIZE);
 }
 
 //! Allocate a block of the given size
 static void*
-_memory_allocate(heap_t* heap, size_t size) {
+_rpmalloc_allocate(heap_t* heap, size_t size) {
 	if (EXPECTED(size <= SMALL_SIZE_LIMIT))
-		return _memory_allocate_small(heap, size);
+		return _rpmalloc_allocate_small(heap, size);
 	else if (size <= _memory_medium_size_limit)
-		return _memory_allocate_medium(heap, size);
-	return _memory_allocate_oversized(heap, size);
+		return _rpmalloc_allocate_medium(heap, size);
+	else if (size <= LARGE_SIZE_LIMIT)
+		return _rpmalloc_allocate_large(heap, size);
+	return _rpmalloc_allocate_huge(heap, size);
 }
 
-//! Allocate a new heap
-static heap_t*
-_memory_allocate_heap(void) {
-	void* raw_heap;
-	void* next_raw_heap;
-	uintptr_t orphan_counter;
-	heap_t* heap;
-	heap_t* next_heap;
-	//Try getting an orphaned heap
-	atomic_thread_fence_acquire();
-	do {
-		raw_heap = atomic_load_ptr(&_memory_orphan_heaps);
-		heap = (void*)((uintptr_t)raw_heap & ~(uintptr_t)0x1FF);
-		if (!heap)
-			break;
-		next_heap = heap->next_orphan;
-		orphan_counter = (uintptr_t)atomic_incr32(&_memory_orphan_counter);
-		next_raw_heap = (void*)((uintptr_t)next_heap | (orphan_counter & (uintptr_t)0x1FF));
-	} while (!atomic_cas_ptr(&_memory_orphan_heaps, next_raw_heap, raw_heap));
+static void*
+_rpmalloc_aligned_allocate(heap_t* heap, size_t alignment, size_t size) {
+	if (alignment <= SMALL_GRANULARITY)
+		return _rpmalloc_allocate(heap, size);
+
+#if ENABLE_VALIDATE_ARGS
+	if ((size + alignment) < size) {
+		errno = EINVAL;
+		return 0;
+	}
+	if (alignment & (alignment - 1)) {
+		errno = EINVAL;
+		return 0;
+	}
+#endif
+
+	if ((alignment <= SPAN_HEADER_SIZE) && (size < _memory_medium_size_limit)) {
+		// If alignment is less or equal to span header size (which is power of two),
+		// and size aligned to span header size multiples is less than size + alignment,
+		// then use natural alignment of blocks to provide alignment
+		size_t multiple_size = size ? (size + (SPAN_HEADER_SIZE - 1)) & ~(uintptr_t)(SPAN_HEADER_SIZE - 1) : SPAN_HEADER_SIZE;
+		assert(!(multiple_size % SPAN_HEADER_SIZE));
+		if (multiple_size <= (size + alignment))
+			return _rpmalloc_allocate(heap, multiple_size);
+	}
+
+	void* ptr = 0;
+	size_t align_mask = alignment - 1;
+	if (alignment <= _memory_page_size) {
+		ptr = _rpmalloc_allocate(heap, size + alignment);
+		if ((uintptr_t)ptr & align_mask) {
+			ptr = (void*)(((uintptr_t)ptr & ~(uintptr_t)align_mask) + alignment);
+			//Mark as having aligned blocks
+			span_t* span = (span_t*)((uintptr_t)ptr & _memory_span_mask);
+			span->flags |= SPAN_FLAG_ALIGNED_BLOCKS;
+		}
+		return ptr;
+	}
+
+	// Fallback to mapping new pages for this request. Since pointers passed
+	// to rpfree must be able to reach the start of the span by bitmasking of
+	// the address with the span size, the returned aligned pointer from this
+	// function must be with a span size of the start of the mapped area.
+	// In worst case this requires us to loop and map pages until we get a
+	// suitable memory address. It also means we can never align to span size
+	// or greater, since the span header will push alignment more than one
+	// span size away from span start (thus causing pointer mask to give us
+	// an invalid span start on free)
+	if (alignment & align_mask) {
+		errno = EINVAL;
+		return 0;
+	}
+	if (alignment >= _memory_span_size) {
+		errno = EINVAL;
+		return 0;
+	}
+
+	size_t extra_pages = alignment / _memory_page_size;
+
+	// Since each span has a header, we will at least need one extra memory page
+	size_t num_pages = 1 + (size / _memory_page_size);
+	if (size & (_memory_page_size - 1))
+		++num_pages;
+
+	if (extra_pages > num_pages)
+		num_pages = 1 + extra_pages;
+
+	size_t original_pages = num_pages;
+	size_t limit_pages = (_memory_span_size / _memory_page_size) * 2;
+	if (limit_pages < (original_pages * 2))
+		limit_pages = original_pages * 2;
+
+	size_t mapped_size, align_offset;
+	span_t* span;
+
+retry:
+	align_offset = 0;
+	mapped_size = num_pages * _memory_page_size;
+
+	span = (span_t*)_rpmalloc_mmap(mapped_size, &align_offset);
+	if (!span) {
+		errno = ENOMEM;
+		return 0;
+	}
+	ptr = pointer_offset(span, SPAN_HEADER_SIZE);
+
+	if ((uintptr_t)ptr & align_mask)
+		ptr = (void*)(((uintptr_t)ptr & ~(uintptr_t)align_mask) + alignment);
+
+	if (((size_t)pointer_diff(ptr, span) >= _memory_span_size) ||
+	    (pointer_offset(ptr, size) > pointer_offset(span, mapped_size)) ||
+	    (((uintptr_t)ptr & _memory_span_mask) != (uintptr_t)span)) {
+		_rpmalloc_unmap(span, mapped_size, align_offset, mapped_size);
+		++num_pages;
+		if (num_pages > limit_pages) {
+			errno = EINVAL;
+			return 0;
+		}
+		goto retry;
+	}
+
+	//Store page count in span_count
+	span->size_class = SIZE_CLASS_HUGE;
+	span->span_count = (uint32_t)num_pages;
+	span->align_offset = (uint32_t)align_offset;
+	span->heap = heap;
+	_rpmalloc_stat_add_peak(&_huge_pages_current, num_pages, _huge_pages_peak);
 
-	if (!heap) {
-		//Map in pages for a new heap
-		size_t align_offset = 0;
-		heap = _memory_map((1 + (sizeof(heap_t) >> _memory_page_size_shift)) * _memory_page_size, &align_offset);
-		if (!heap)
-			return heap;
-		memset(heap, 0, sizeof(heap_t));
-		heap->align_offset = align_offset;
-
-		//Get a new heap ID
-		do {
-			heap->id = atomic_incr32(&_memory_heap_id);
-			if (_memory_heap_lookup(heap->id))
-				heap->id = 0;
-		} while (!heap->id);
-
-		//Link in heap in heap ID map
-		size_t list_idx = heap->id % HEAP_ARRAY_SIZE;
-		do {
-			next_heap = atomic_load_ptr(&_memory_heaps[list_idx]);
-			heap->next_heap = next_heap;
-		} while (!atomic_cas_ptr(&_memory_heaps[list_idx], heap, next_heap));
-	}
+#if RPMALLOC_FIRST_CLASS_HEAPS
+	_rpmalloc_span_double_link_list_add(&heap->large_huge_span, span);
+#endif
+	++heap->full_span_count;
 
-	return heap;
+	return ptr;
 }
 
+
+////////////
+///
+/// Deallocation entry points
+///
+//////
+
 //! Deallocate the given small/medium memory block in the current thread local heap
 static void
-_memory_deallocate_direct(span_t* span, void* block) {
-	assert(span->heap == get_thread_heap_raw());
-	uint32_t state = span->state;
+_rpmalloc_deallocate_direct_small_or_medium(span_t* span, void* block) {
+	heap_t* heap = span->heap;
+	assert(heap->owner_thread == get_thread_id() || !heap->owner_thread || heap->finalize);
 	//Add block to free list
+	if (UNEXPECTED(_rpmalloc_span_is_fully_utilized(span))) {
+		span->used_count = span->block_count;
+#if RPMALLOC_FIRST_CLASS_HEAPS
+		_rpmalloc_span_double_link_list_remove(&heap->full_span[span->size_class], span);
+#endif
+		_rpmalloc_span_double_link_list_add(&heap->size_class[span->size_class].partial_span, span);
+		--heap->full_span_count;
+	}
+	--span->used_count;
 	*((void**)block) = span->free_list;
 	span->free_list = block;
-	if (UNEXPECTED(state == SPAN_STATE_ACTIVE))
-		return;
-	uint32_t used = --span->used_count;
-	uint32_t free = span->list_size;
-	if (UNEXPECTED(used == free))
-		_memory_span_release_to_cache(span->heap, span);
-	else if (UNEXPECTED(state == SPAN_STATE_FULL))
-		_memory_span_set_full_partial(span->heap, span);
+	if (UNEXPECTED(span->used_count == span->list_size)) {
+		_rpmalloc_span_double_link_list_remove(&heap->size_class[span->size_class].partial_span, span);
+		_rpmalloc_span_release_to_cache(heap, span);
+	}
 }
 
-//! Put the block in the deferred free list of the owning span
 static void
-_memory_deallocate_defer(span_t* span, void* block) {
-	atomic_thread_fence_acquire();
-	if (span->state == SPAN_STATE_FULL) {
-		if ((span->list_size + 1) == span->block_count) {
-			//Span will be completely freed by deferred deallocations, no other thread can
-			//currently touch it. Safe to move to owner heap deferred cache
-			span_t* last_head;
-			heap_t* heap = span->heap;
-			do {
-				last_head = atomic_load_ptr(&heap->span_cache_deferred);
-				span->next = last_head;
-			} while (!atomic_cas_ptr(&heap->span_cache_deferred, span, last_head));
-			return;
-		}
-	}
+_rpmalloc_deallocate_defer_free_span(heap_t* heap, span_t* span) {
+	//This list does not need ABA protection, no mutable side state
+	do {
+		span->free_list = (void*)atomic_load_ptr(&heap->span_free_deferred);
+	} while (!atomic_cas_ptr(&heap->span_free_deferred, span, span->free_list));
+}
 
+//! Put the block in the deferred free list of the owning span
+static void
+_rpmalloc_deallocate_defer_small_or_medium(span_t* span, void* block) {
+	// The memory ordering here is a bit tricky, to avoid having to ABA protect
+	// the deferred free list to avoid desynchronization of list and list size
+	// we need to have acquire semantics on successful CAS of the pointer to
+	// guarantee the list_size variable validity + release semantics on pointer store
 	void* free_list;
 	do {
-		atomic_thread_fence_acquire();
-		free_list = atomic_load_ptr(&span->free_list_deferred);
-		*((void**)block) = free_list;
-	} while ((free_list == INVALID_POINTER) || !atomic_cas_ptr(&span->free_list_deferred, INVALID_POINTER, free_list));
-	++span->list_size;
-	atomic_store_ptr(&span->free_list_deferred, block);
+		free_list = atomic_exchange_ptr_acquire(&span->free_list_deferred, INVALID_POINTER);
+	} while (free_list == INVALID_POINTER);
+	*((void**)block) = free_list;
+	uint32_t free_count = ++span->list_size;
+	atomic_store_ptr_release(&span->free_list_deferred, block);
+	if (free_count == span->block_count) {
+		// Span was completely freed by this block. Due to the INVALID_POINTER spin lock
+		// no other thread can reach this state simultaneously on this span.
+		// Safe to move to owner heap deferred cache
+		_rpmalloc_deallocate_defer_free_span(span->heap, span);
+	}
 }
 
 static void
-_memory_deallocate_small_or_medium(span_t* span, void* p) {
-	_memory_statistics_inc_free(span->heap, span->size_class);
+_rpmalloc_deallocate_small_or_medium(span_t* span, void* p) {
+	_rpmalloc_stat_inc_free(span->heap, span->size_class);
 	if (span->flags & SPAN_FLAG_ALIGNED_BLOCKS) {
 		//Realign pointer to block start
 		void* blocks_start = pointer_offset(span, SPAN_HEADER_SIZE);
@@ -1466,111 +2179,150 @@ _memory_deallocate_small_or_medium(span_t* span, void* p) {
 		p = pointer_offset(p, -(int32_t)(block_offset % span->block_size));
 	}
 	//Check if block belongs to this heap or if deallocation should be deferred
-	if (span->heap == get_thread_heap_raw())
-		_memory_deallocate_direct(span, p);
+#if RPMALLOC_FIRST_CLASS_HEAPS
+	int defer = (span->heap->owner_thread && (span->heap->owner_thread != get_thread_id()) && !span->heap->finalize);
+#else
+	int defer = ((span->heap->owner_thread != get_thread_id()) && !span->heap->finalize);
+#endif
+	if (!defer)
+		_rpmalloc_deallocate_direct_small_or_medium(span, p);
 	else
-		_memory_deallocate_defer(span, p);
+		_rpmalloc_deallocate_defer_small_or_medium(span, p);
 }
 
 //! Deallocate the given large memory block to the current heap
 static void
-_memory_deallocate_large(span_t* span) {
-	//Decrease counter
-	assert(span->span_count == ((size_t)span->size_class - SIZE_CLASS_COUNT + 1));
-	assert(span->size_class >= SIZE_CLASS_COUNT);
-	assert(span->size_class - SIZE_CLASS_COUNT < LARGE_CLASS_COUNT);
+_rpmalloc_deallocate_large(span_t* span) {
+	assert(span->size_class == SIZE_CLASS_LARGE);
 	assert(!(span->flags & SPAN_FLAG_MASTER) || !(span->flags & SPAN_FLAG_SUBSPAN));
 	assert((span->flags & SPAN_FLAG_MASTER) || (span->flags & SPAN_FLAG_SUBSPAN));
-	//Large blocks can always be deallocated and transferred between heaps
-	//Investigate if it is better to defer large spans as well through span_cache_deferred,
-	//possibly with some heuristics to pick either scheme at runtime per deallocation
-	heap_t* heap = get_thread_heap();
+	//We must always defer (unless finalizing) if from another heap since we cannot touch the list or counters of another heap
+#if RPMALLOC_FIRST_CLASS_HEAPS
+	int defer = (span->heap->owner_thread && (span->heap->owner_thread != get_thread_id()) && !span->heap->finalize);
+#else
+	int defer = ((span->heap->owner_thread != get_thread_id()) && !span->heap->finalize);
+#endif
+	if (defer) {
+		_rpmalloc_deallocate_defer_free_span(span->heap, span);
+		return;
+	}
+	assert(span->heap->full_span_count);
+	--span->heap->full_span_count;
+#if RPMALLOC_FIRST_CLASS_HEAPS
+	_rpmalloc_span_double_link_list_remove(&span->heap->large_huge_span, span);
+#endif
 #if ENABLE_ADAPTIVE_THREAD_CACHE || ENABLE_STATISTICS
+	//Decrease counter
 	size_t idx = span->span_count - 1;
 	atomic_decr32(&span->heap->span_use[idx].current);
 #endif
-	if ((span->span_count > 1) && !heap->spans_reserved) {
+	heap_t* heap = get_thread_heap();
+	assert(heap);
+	span->heap = heap;
+	if ((span->span_count > 1) && !heap->finalize && !heap->spans_reserved) {
 		heap->span_reserve = span;
 		heap->spans_reserved = span->span_count;
 		if (span->flags & SPAN_FLAG_MASTER) {
 			heap->span_reserve_master = span;
 		} else { //SPAN_FLAG_SUBSPAN
-			uint32_t distance = span->total_spans_or_distance;
-			span_t* master = pointer_offset(span, -(int32_t)(distance * _memory_span_size));
+			span_t* master = (span_t*)pointer_offset(span, -(intptr_t)((size_t)span->offset_from_master * _memory_span_size));
 			heap->span_reserve_master = master;
 			assert(master->flags & SPAN_FLAG_MASTER);
 			assert(atomic_load32(&master->remaining_spans) >= (int32_t)span->span_count);
 		}
-		_memory_statistics_inc(heap->span_use[idx].spans_to_reserved, 1);
+		_rpmalloc_stat_inc(&heap->span_use[idx].spans_to_reserved);
 	} else {
 		//Insert into cache list
-		_memory_heap_cache_insert(heap, span);
+		_rpmalloc_heap_cache_insert(heap, span);
 	}
 }
 
 //! Deallocate the given huge span
 static void
-_memory_deallocate_huge(span_t* span) {
+_rpmalloc_deallocate_huge(span_t* span) {
+	assert(span->heap);
+#if RPMALLOC_FIRST_CLASS_HEAPS
+	int defer = (span->heap->owner_thread && (span->heap->owner_thread != get_thread_id()) && !span->heap->finalize);
+#else
+	int defer = ((span->heap->owner_thread != get_thread_id()) && !span->heap->finalize);
+#endif
+	if (defer) {
+		_rpmalloc_deallocate_defer_free_span(span->heap, span);
+		return;
+	}
+	assert(span->heap->full_span_count);
+	--span->heap->full_span_count;
+#if RPMALLOC_FIRST_CLASS_HEAPS
+	_rpmalloc_span_double_link_list_remove(&span->heap->large_huge_span, span);
+#endif
+
 	//Oversized allocation, page count is stored in span_count
 	size_t num_pages = span->span_count;
-	_memory_unmap(span, num_pages * _memory_page_size, span->align_offset, num_pages * _memory_page_size);
-	_memory_statistics_sub(&_huge_pages_current, num_pages);
+	_rpmalloc_unmap(span, num_pages * _memory_page_size, span->align_offset, num_pages * _memory_page_size);
+	_rpmalloc_stat_sub(&_huge_pages_current, num_pages);
 }
 
 //! Deallocate the given block
 static void
-_memory_deallocate(void* p) {
+_rpmalloc_deallocate(void* p) {
 	//Grab the span (always at start of span, using span alignment)
-	span_t* span = (void*)((uintptr_t)p & _memory_span_mask);
+	span_t* span = (span_t*)((uintptr_t)p & _memory_span_mask);
 	if (UNEXPECTED(!span))
 		return;
 	if (EXPECTED(span->size_class < SIZE_CLASS_COUNT))
-		_memory_deallocate_small_or_medium(span, p);
-	else if (span->size_class != (uint32_t)-1)
-		_memory_deallocate_large(span);
+		_rpmalloc_deallocate_small_or_medium(span, p);
+	else if (span->size_class == SIZE_CLASS_LARGE)
+		_rpmalloc_deallocate_large(span);
 	else
-		_memory_deallocate_huge(span);
+		_rpmalloc_deallocate_huge(span);
 }
 
+
+////////////
+///
+/// Reallocation entry points
+///
+//////
+
+static size_t
+_rpmalloc_usable_size(void* p);
+
 //! Reallocate the given block to the given size
 static void*
-_memory_reallocate(void* p, size_t size, size_t oldsize, unsigned int flags) {
+_rpmalloc_reallocate(heap_t* heap, void* p, size_t size, size_t oldsize, unsigned int flags) {
 	if (p) {
 		//Grab the span using guaranteed span alignment
-		span_t* span = (void*)((uintptr_t)p & _memory_span_mask);
-		if (span->heap) {
-			if (span->size_class < SIZE_CLASS_COUNT) {
-				//Small/medium sized block
-				assert(span->span_count == 1);
-				void* blocks_start = pointer_offset(span, SPAN_HEADER_SIZE);
-				uint32_t block_offset = (uint32_t)pointer_diff(p, blocks_start);
-				uint32_t block_idx = block_offset / span->block_size;
-				void* block = pointer_offset(blocks_start, block_idx * span->block_size);
-				if (!oldsize)
-					oldsize = span->block_size - (uint32_t)pointer_diff(p, block);
-				if ((size_t)span->block_size >= size) {
-					//Still fits in block, never mind trying to save memory, but preserve data if alignment changed
-					if ((p != block) && !(flags & RPMALLOC_NO_PRESERVE))
-						memmove(block, p, oldsize);
-					return block;
-				}
-			} else {
-				//Large block
-				size_t total_size = size + SPAN_HEADER_SIZE;
-				size_t num_spans = total_size >> _memory_span_size_shift;
-				if (total_size & (_memory_span_mask - 1))
-					++num_spans;
-				size_t current_spans = span->span_count;
-				assert(current_spans == ((span->size_class - SIZE_CLASS_COUNT) + 1));
-				void* block = pointer_offset(span, SPAN_HEADER_SIZE);
-				if (!oldsize)
-					oldsize = (current_spans * _memory_span_size) - (size_t)pointer_diff(p, block) - SPAN_HEADER_SIZE;
-				if ((current_spans >= num_spans) && (num_spans >= (current_spans / 2))) {
-					//Still fits in block, never mind trying to save memory, but preserve data if alignment changed
-					if ((p != block) && !(flags & RPMALLOC_NO_PRESERVE))
-						memmove(block, p, oldsize);
-					return block;
-				}
+		span_t* span = (span_t*)((uintptr_t)p & _memory_span_mask);
+		if (EXPECTED(span->size_class < SIZE_CLASS_COUNT)) {
+			//Small/medium sized block
+			assert(span->span_count == 1);
+			void* blocks_start = pointer_offset(span, SPAN_HEADER_SIZE);
+			uint32_t block_offset = (uint32_t)pointer_diff(p, blocks_start);
+			uint32_t block_idx = block_offset / span->block_size;
+			void* block = pointer_offset(blocks_start, (size_t)block_idx * span->block_size);
+			if (!oldsize)
+				oldsize = (size_t)((ptrdiff_t)span->block_size - pointer_diff(p, block));
+			if ((size_t)span->block_size >= size) {
+				//Still fits in block, never mind trying to save memory, but preserve data if alignment changed
+				if ((p != block) && !(flags & RPMALLOC_NO_PRESERVE))
+					memmove(block, p, oldsize);
+				return block;
+			}
+		} else if (span->size_class == SIZE_CLASS_LARGE) {
+			//Large block
+			size_t total_size = size + SPAN_HEADER_SIZE;
+			size_t num_spans = total_size >> _memory_span_size_shift;
+			if (total_size & (_memory_span_mask - 1))
+				++num_spans;
+			size_t current_spans = span->span_count;
+			void* block = pointer_offset(span, SPAN_HEADER_SIZE);
+			if (!oldsize)
+				oldsize = (current_spans * _memory_span_size) - (size_t)pointer_diff(p, block) - SPAN_HEADER_SIZE;
+			if ((current_spans >= num_spans) && (total_size >= (oldsize / 2))) {
+				//Still fits in block, never mind trying to save memory, but preserve data if alignment changed
+				if ((p != block) && !(flags & RPMALLOC_NO_PRESERVE))
+					memmove(block, p, oldsize);
+				return block;
 			}
 		} else {
 			//Oversized block
@@ -1594,38 +2346,70 @@ _memory_reallocate(void* p, size_t size, size_t oldsize, unsigned int flags) {
 		oldsize = 0;
 	}
 
+	if (!!(flags & RPMALLOC_GROW_OR_FAIL))
+		return 0;
+
 	//Size is greater than block size, need to allocate a new block and deallocate the old
-	heap_t* heap = get_thread_heap();
 	//Avoid hysteresis by overallocating if increase is small (below 37%)
 	size_t lower_bound = oldsize + (oldsize >> 2) + (oldsize >> 3);
 	size_t new_size = (size > lower_bound) ? size : ((size > oldsize) ? lower_bound : size);
-	void* block = _memory_allocate(heap, new_size);
+	void* block = _rpmalloc_allocate(heap, new_size);
 	if (p && block) {
 		if (!(flags & RPMALLOC_NO_PRESERVE))
 			memcpy(block, p, oldsize < new_size ? oldsize : new_size);
-		_memory_deallocate(p);
+		_rpmalloc_deallocate(p);
 	}
 
 	return block;
 }
 
+static void*
+_rpmalloc_aligned_reallocate(heap_t* heap, void* ptr, size_t alignment, size_t size, size_t oldsize,
+                           unsigned int flags) {
+	if (alignment <= SMALL_GRANULARITY)
+		return _rpmalloc_reallocate(heap, ptr, size, oldsize, flags);
+
+	int no_alloc = !!(flags & RPMALLOC_GROW_OR_FAIL);
+	size_t usablesize = (ptr ? _rpmalloc_usable_size(ptr) : 0);
+	if ((usablesize >= size) && !((uintptr_t)ptr & (alignment - 1))) {
+		if (no_alloc || (size >= (usablesize / 2)))
+			return ptr;
+	}
+	// Aligned alloc marks span as having aligned blocks
+	void* block = (!no_alloc ? _rpmalloc_aligned_allocate(heap, alignment, size) : 0);
+	if (EXPECTED(block != 0)) {
+		if (!(flags & RPMALLOC_NO_PRESERVE) && ptr) {
+			if (!oldsize)
+				oldsize = usablesize;
+			memcpy(block, ptr, oldsize < size ? oldsize : size);
+		}
+		_rpmalloc_deallocate(ptr);
+	}
+	return block;
+}
+
+
+////////////
+///
+/// Initialization, finalization and utility
+///
+//////
+
 //! Get the usable size of the given block
 static size_t
-_memory_usable_size(void* p) {
+_rpmalloc_usable_size(void* p) {
 	//Grab the span using guaranteed span alignment
-	span_t* span = (void*)((uintptr_t)p & _memory_span_mask);
-	if (span->heap) {
+	span_t* span = (span_t*)((uintptr_t)p & _memory_span_mask);
+	if (span->size_class < SIZE_CLASS_COUNT) {
 		//Small/medium block
-		if (span->size_class < SIZE_CLASS_COUNT) {
-			void* blocks_start = pointer_offset(span, SPAN_HEADER_SIZE);
-			return span->block_size - ((size_t)pointer_diff(p, blocks_start) % span->block_size);
-		}
-
+		void* blocks_start = pointer_offset(span, SPAN_HEADER_SIZE);
+		return span->block_size - ((size_t)pointer_diff(p, blocks_start) % span->block_size);
+	}
+	if (span->size_class == SIZE_CLASS_LARGE) {
 		//Large block
-		size_t current_spans = (span->size_class - SIZE_CLASS_COUNT) + 1;
+		size_t current_spans = span->span_count;
 		return (current_spans * _memory_span_size) - (size_t)pointer_diff(p, span);
 	}
-
 	//Oversized block, page count is stored in span_count
 	size_t current_pages = span->span_count;
 	return (current_pages * _memory_page_size) - (size_t)pointer_diff(p, span);
@@ -1633,7 +2417,7 @@ _memory_usable_size(void* p) {
 
 //! Adjust and optimize the size class properties for the given class
 static void
-_memory_adjust_size_class(size_t iclass) {
+_rpmalloc_adjust_size_class(size_t iclass) {
 	size_t block_size = _memory_size_class[iclass].block_size;
 	size_t block_count = (_memory_span_size - SPAN_HEADER_SIZE) / block_size;
 
@@ -1641,88 +2425,18 @@ _memory_adjust_size_class(size_t iclass) {
 	_memory_size_class[iclass].class_idx = (uint16_t)iclass;
 
 	//Check if previous size classes can be merged
-	size_t prevclass = iclass;
-	while (prevclass > 0) {
-		--prevclass;
-		//A class can be merged if number of pages and number of blocks are equal
-		if (_memory_size_class[prevclass].block_count == _memory_size_class[iclass].block_count)
-			memcpy(_memory_size_class + prevclass, _memory_size_class + iclass, sizeof(_memory_size_class[iclass]));
-		else
-			break;
-	}
-}
-
-static void
-_memory_heap_finalize(void* heapptr) {
-	heap_t* heap = heapptr;
-	if (!heap)
-		return;
-	//Release thread cache spans back to global cache
-#if ENABLE_THREAD_CACHE
-	_memory_heap_cache_adopt_deferred(heap);
-	for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) {
-		span_t* span = heap->span_cache[iclass];
-#if ENABLE_GLOBAL_CACHE
-		while (span) {
-			assert(span->span_count == (iclass + 1));
-			size_t release_count = (!iclass ? _memory_span_release_count : _memory_span_release_count_large);
-			span_t* next = _memory_span_list_split(span, (uint32_t)release_count);
-#if ENABLE_STATISTICS
-			heap->thread_to_global += (size_t)span->list_size * span->span_count * _memory_span_size;
-			heap->span_use[iclass].spans_to_global += span->list_size;
-#endif
-			_memory_global_cache_insert(span);
-			span = next;
+	if (iclass >= SMALL_CLASS_COUNT) {
+		size_t prevclass = iclass;
+		while (prevclass > 0) {
+			--prevclass;
+			//A class can be merged if number of pages and number of blocks are equal
+			if (_memory_size_class[prevclass].block_count == _memory_size_class[iclass].block_count)
+				memcpy(_memory_size_class + prevclass, _memory_size_class + iclass, sizeof(_memory_size_class[iclass]));
+			else
+				break;
 		}
-#else
-		if (span)
-			_memory_unmap_span_list(span);
-#endif
-		heap->span_cache[iclass] = 0;
 	}
-#endif
-
-	//Orphan the heap
-	void* raw_heap;
-	uintptr_t orphan_counter;
-	heap_t* last_heap;
-	do {
-		last_heap = atomic_load_ptr(&_memory_orphan_heaps);
-		heap->next_orphan = (void*)((uintptr_t)last_heap & ~(uintptr_t)0x1FF);
-		orphan_counter = (uintptr_t)atomic_incr32(&_memory_orphan_counter);
-		raw_heap = (void*)((uintptr_t)heap | (orphan_counter & (uintptr_t)0x1FF));
-	} while (!atomic_cas_ptr(&_memory_orphan_heaps, raw_heap, last_heap));
-
-	set_thread_heap(0);
-
-#if ENABLE_STATISTICS
-	atomic_decr32(&_memory_active_heaps);
-	assert(atomic_load32(&_memory_active_heaps) >= 0);
-#endif
-}
-
-#if defined(_MSC_VER) && !defined(__clang__) && (!defined(BUILD_DYNAMIC_LINK) || !BUILD_DYNAMIC_LINK)
-#include <fibersapi.h>
-static DWORD fls_key;
-static void NTAPI
-rp_thread_destructor(void* value) {
-	if (value)
-		rpmalloc_thread_finalize();
 }
-#endif
-
-#if PLATFORM_POSIX
-#  include <sys/mman.h>
-#  include <sched.h>
-#  ifdef __FreeBSD__
-#    include <sys/sysctl.h>
-#    define MAP_HUGETLB MAP_ALIGNED_SUPER
-#  endif
-#  ifndef MAP_UNINITIALIZED
-#    define MAP_UNINITIALIZED 0
-#  endif
-#endif
-#include <errno.h>
 
 //! Initialize the allocator and setup global data
 extern inline int
@@ -1731,7 +2445,6 @@ rpmalloc_initialize(void) {
 		rpmalloc_thread_initialize();
 		return 0;
 	}
-	memset(&_memory_config, 0, sizeof(rpmalloc_config_t));
 	return rpmalloc_initialize_config(0);
 }
 
@@ -1745,10 +2458,12 @@ rpmalloc_initialize_config(const rpmalloc_config_t* config) {
 
 	if (config)
 		memcpy(&_memory_config, config, sizeof(rpmalloc_config_t));
+	else
+		memset(&_memory_config, 0, sizeof(rpmalloc_config_t));
 
 	if (!_memory_config.memory_map || !_memory_config.memory_unmap) {
-		_memory_config.memory_map = _memory_map_os;
-		_memory_config.memory_unmap = _memory_unmap_os;
+		_memory_config.memory_map = _rpmalloc_mmap_os;
+		_memory_config.memory_unmap = _rpmalloc_unmap_os;
 	}
 
 #if RPMALLOC_CONFIGURABLE
@@ -1765,35 +2480,10 @@ rpmalloc_initialize_config(const rpmalloc_config_t* config) {
 		GetSystemInfo(&system_info);
 		_memory_page_size = system_info.dwPageSize;
 		_memory_map_granularity = system_info.dwAllocationGranularity;
-		if (config && config->enable_huge_pages) {
-			HANDLE token = 0;
-			size_t large_page_minimum = GetLargePageMinimum();
-			if (large_page_minimum)
-				OpenProcessToken(GetCurrentProcess(), TOKEN_ADJUST_PRIVILEGES | TOKEN_QUERY, &token);
-			if (token) {
-				LUID luid;
-				if (LookupPrivilegeValue(0, SE_LOCK_MEMORY_NAME, &luid)) {
-					TOKEN_PRIVILEGES token_privileges;
-					memset(&token_privileges, 0, sizeof(token_privileges));
-					token_privileges.PrivilegeCount = 1;
-					token_privileges.Privileges[0].Luid = luid;
-					token_privileges.Privileges[0].Attributes = SE_PRIVILEGE_ENABLED;
-					if (AdjustTokenPrivileges(token, FALSE, &token_privileges, 0, 0, 0)) {
-						DWORD err = GetLastError();
-						if (err == ERROR_SUCCESS) {
-							_memory_huge_pages = 1;
-							_memory_page_size = large_page_minimum;
-							_memory_map_granularity = large_page_minimum;
-						}
-					}
-				}
-				CloseHandle(token);
-			}
-		}
 #else
 		_memory_page_size = (size_t)sysconf(_SC_PAGESIZE);
 		_memory_map_granularity = _memory_page_size;
-		if (config && config->enable_huge_pages) {
+		if (_memory_config.enable_huge_pages) {
 #if defined(__linux__)
 			size_t huge_page_size = 0;
 			FILE* meminfo = fopen("/proc/meminfo", "r");
@@ -1828,15 +2518,51 @@ rpmalloc_initialize_config(const rpmalloc_config_t* config) {
 		}
 #endif
 	} else {
-		if (config && config->enable_huge_pages)
+		if (_memory_config.enable_huge_pages)
 			_memory_huge_pages = 1;
 	}
 
-	//The ABA counter in heap orphan list is tied to using 512 (bitmask 0x1FF)
-	if (_memory_page_size < 512)
-		_memory_page_size = 512;
-	if (_memory_page_size > (64 * 1024 * 1024))
-		_memory_page_size = (64 * 1024 * 1024);
+#if PLATFORM_WINDOWS
+	if (_memory_config.enable_huge_pages) {
+		HANDLE token = 0;
+		size_t large_page_minimum = GetLargePageMinimum();
+		if (large_page_minimum)
+			OpenProcessToken(GetCurrentProcess(), TOKEN_ADJUST_PRIVILEGES | TOKEN_QUERY, &token);
+		if (token) {
+			LUID luid;
+			if (LookupPrivilegeValue(0, SE_LOCK_MEMORY_NAME, &luid)) {
+				TOKEN_PRIVILEGES token_privileges;
+				memset(&token_privileges, 0, sizeof(token_privileges));
+				token_privileges.PrivilegeCount = 1;
+				token_privileges.Privileges[0].Luid = luid;
+				token_privileges.Privileges[0].Attributes = SE_PRIVILEGE_ENABLED;
+				if (AdjustTokenPrivileges(token, FALSE, &token_privileges, 0, 0, 0)) {
+					DWORD err = GetLastError();
+					if (err == ERROR_SUCCESS) {
+						_memory_huge_pages = 1;
+						if (large_page_minimum > _memory_page_size)
+						 	_memory_page_size = large_page_minimum;
+						if (large_page_minimum > _memory_map_granularity)
+							_memory_map_granularity = large_page_minimum;
+					}
+				}
+			}
+			CloseHandle(token);
+		}
+	}
+#endif
+
+	size_t min_span_size = 256;
+	size_t max_page_size;
+#if UINTPTR_MAX > 0xFFFFFFFF
+	max_page_size = 4096ULL * 1024ULL * 1024ULL;
+#else
+	max_page_size = 4 * 1024 * 1024;
+#endif
+	if (_memory_page_size < min_span_size)
+		_memory_page_size = min_span_size;
+	if (_memory_page_size > max_page_size)
+		_memory_page_size = max_page_size;
 	_memory_page_size_shift = 0;
 	size_t page_size_bit = _memory_page_size;
 	while (page_size_bit != 1) {
@@ -1846,18 +2572,22 @@ rpmalloc_initialize_config(const rpmalloc_config_t* config) {
 	_memory_page_size = ((size_t)1 << _memory_page_size_shift);
 
 #if RPMALLOC_CONFIGURABLE
-	size_t span_size = _memory_config.span_size;
-	if (!span_size)
-		span_size = (64 * 1024);
-	if (span_size > (256 * 1024))
-		span_size = (256 * 1024);
-	_memory_span_size = 4096;
-	_memory_span_size_shift = 12;
-	while (_memory_span_size < span_size) {
-		_memory_span_size <<= 1;
-		++_memory_span_size_shift;
+	if (!_memory_config.span_size) {
+		_memory_span_size = _memory_default_span_size;
+		_memory_span_size_shift = _memory_default_span_size_shift;
+		_memory_span_mask = _memory_default_span_mask;
+	} else {
+		size_t span_size = _memory_config.span_size;
+		if (span_size > (256 * 1024))
+			span_size = (256 * 1024);
+		_memory_span_size = 4096;
+		_memory_span_size_shift = 12;
+		while (_memory_span_size < span_size) {
+			_memory_span_size <<= 1;
+			++_memory_span_size_shift;
+		}
+		_memory_span_mask = ~(uintptr_t)(_memory_span_size - 1);
 	}
-	_memory_span_mask = ~(uintptr_t)(_memory_span_size - 1);
 #endif
 
 	_memory_span_map_count = ( _memory_config.span_map_count ? _memory_config.span_map_count : DEFAULT_SPAN_MAP_COUNT);
@@ -1875,35 +2605,21 @@ rpmalloc_initialize_config(const rpmalloc_config_t* config) {
 	_memory_span_release_count_large = (_memory_span_release_count > 8 ? (_memory_span_release_count / 4) : 2);
 
 #if (defined(__APPLE__) || defined(__HAIKU__)) && ENABLE_PRELOAD
-	if (pthread_key_create(&_memory_thread_heap, _memory_heap_finalize))
+	if (pthread_key_create(&_memory_thread_heap, _rpmalloc_heap_release_raw))
 		return -1;
 #endif
-#if defined(_MSC_VER) && !defined(__clang__) && (!defined(BUILD_DYNAMIC_LINK) || !BUILD_DYNAMIC_LINK)
-    fls_key = FlsAlloc(&rp_thread_destructor);
-#endif
-
-	atomic_store32(&_memory_heap_id, 0);
-	atomic_store32(&_memory_orphan_counter, 0);
-#if ENABLE_STATISTICS
-	atomic_store32(&_memory_active_heaps, 0);
-	atomic_store32(&_reserved_spans, 0);
-	atomic_store32(&_mapped_pages, 0);
-	_mapped_pages_peak = 0;
-	atomic_store32(&_mapped_total, 0);
-	atomic_store32(&_unmapped_total, 0);
-	atomic_store32(&_mapped_pages_os, 0);
-	atomic_store32(&_huge_pages_current, 0);
-	_huge_pages_peak = 0;
+#if defined(_WIN32) && (!defined(BUILD_DYNAMIC_LINK) || !BUILD_DYNAMIC_LINK)
+	fls_key = FlsAlloc(&_rpmalloc_thread_destructor);
 #endif
 
 	//Setup all small and medium size classes
 	size_t iclass = 0;
 	_memory_size_class[iclass].block_size = SMALL_GRANULARITY;
-	_memory_adjust_size_class(iclass);
+	_rpmalloc_adjust_size_class(iclass);
 	for (iclass = 1; iclass < SMALL_CLASS_COUNT; ++iclass) {
 		size_t size = iclass * SMALL_GRANULARITY;
 		_memory_size_class[iclass].block_size = (uint32_t)size;
-		_memory_adjust_size_class(iclass);
+		_rpmalloc_adjust_size_class(iclass);
 	}
 	//At least two blocks per span, then fall back to large allocations
 	_memory_medium_size_limit = (_memory_span_size - SPAN_HEADER_SIZE) >> 1;
@@ -1914,11 +2630,15 @@ rpmalloc_initialize_config(const rpmalloc_config_t* config) {
 		if (size > _memory_medium_size_limit)
 			break;
 		_memory_size_class[SMALL_CLASS_COUNT + iclass].block_size = (uint32_t)size;
-		_memory_adjust_size_class(SMALL_CLASS_COUNT + iclass);
+		_rpmalloc_adjust_size_class(SMALL_CLASS_COUNT + iclass);
 	}
 
-	for (size_t list_idx = 0; list_idx < HEAP_ARRAY_SIZE; ++list_idx)
-		atomic_store_ptr(&_memory_heaps[list_idx], 0);
+	_memory_orphan_heaps = 0;
+#if RPMALLOC_FIRST_CLASS_HEAPS
+	_memory_first_class_orphan_heaps = 0;
+#endif
+	memset(_memory_heaps, 0, sizeof(_memory_heaps));
+	atomic_store32_release(&_memory_global_lock, 0);
 
 	//Initialize this thread
 	rpmalloc_thread_initialize();
@@ -1928,61 +2648,24 @@ rpmalloc_initialize_config(const rpmalloc_config_t* config) {
 //! Finalize the allocator
 void
 rpmalloc_finalize(void) {
-	atomic_thread_fence_acquire();
-
 	rpmalloc_thread_finalize();
-	//rpmalloc_dump_statistics(stderr);
+	//rpmalloc_dump_statistics(stdout);
 
-	//Free all thread caches
+	if (_memory_global_reserve) {
+		atomic_add32(&_memory_global_reserve_master->remaining_spans, -(int32_t)_memory_global_reserve_count);
+		_memory_global_reserve_master = 0;
+		_memory_global_reserve_count = 0;
+		_memory_global_reserve = 0;
+	}
+	atomic_store32_release(&_memory_global_lock, 0);	
+
+	//Free all thread caches and fully free spans
 	for (size_t list_idx = 0; list_idx < HEAP_ARRAY_SIZE; ++list_idx) {
-		heap_t* heap = atomic_load_ptr(&_memory_heaps[list_idx]);
+		heap_t* heap = _memory_heaps[list_idx];
 		while (heap) {
-			if (heap->spans_reserved) {
-				span_t* span = _memory_map_spans(heap, heap->spans_reserved);
-				_memory_unmap_span(span);
-			}
-
-			for (size_t iclass = 0; iclass < SIZE_CLASS_COUNT; ++iclass) {
-				heap_class_t* heap_class = heap->span_class + iclass;
-				span_t* span = heap_class->partial_span;
-				while (span) {
-					span_t* next = span->next;
-					if (span->state == SPAN_STATE_ACTIVE) {
-						uint32_t used_blocks = span->block_count;
-						if (span->free_list_limit < span->block_count)
-							used_blocks = span->free_list_limit;
-						uint32_t free_blocks = 0;
-						void* block = heap_class->free_list;
-						while (block) {
-							++free_blocks;
-							block = *((void**)block);
-						}
-						block = span->free_list;
-						while (block) {
-							++free_blocks;
-							block = *((void**)block);
-						}
-						if (used_blocks == (free_blocks + span->list_size))
-							_memory_heap_cache_insert(heap, span);
-					} else {
-						if (span->used_count == span->list_size)
-							_memory_heap_cache_insert(heap, span);
-					}
-					span = next;
-				}
-			}
-
-#if ENABLE_THREAD_CACHE
-			//Free span caches (other thread might have deferred after the thread using this heap finalized)
-			_memory_heap_cache_adopt_deferred(heap);
-			for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) {
-				if (heap->span_cache[iclass])
-					_memory_unmap_span_list(heap->span_cache[iclass]);
-			}
-#endif
 			heap_t* next_heap = heap->next_heap;
-			size_t heap_size = (1 + (sizeof(heap_t) >> _memory_page_size_shift)) * _memory_page_size;
-			_memory_unmap(heap, heap_size, heap->align_offset, heap_size);
+			heap->finalize = 1;
+			_rpmalloc_heap_global_finalize(heap);
 			heap = next_heap;
 		}
 	}
@@ -1990,24 +2673,21 @@ rpmalloc_finalize(void) {
 #if ENABLE_GLOBAL_CACHE
 	//Free global caches
 	for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass)
-		_memory_cache_finalize(&_memory_span_cache[iclass]);
+		_rpmalloc_global_cache_finalize(&_memory_span_cache[iclass]);
 #endif
 
-	atomic_store_ptr(&_memory_orphan_heaps, 0);
-	atomic_thread_fence_release();
-
 #if (defined(__APPLE__) || defined(__HAIKU__)) && ENABLE_PRELOAD
 	pthread_key_delete(_memory_thread_heap);
 #endif
-#if defined(_MSC_VER) && !defined(__clang__) && (!defined(BUILD_DYNAMIC_LINK) || !BUILD_DYNAMIC_LINK)
-    FlsFree(fls_key);
+#if defined(_WIN32) && (!defined(BUILD_DYNAMIC_LINK) || !BUILD_DYNAMIC_LINK)
+	FlsFree(fls_key);
+	fls_key = 0;
 #endif
-
 #if ENABLE_STATISTICS
-	//If you hit these asserts you probably have memory leaks or double frees in your code
-	assert(!atomic_load32(&_mapped_pages));
-	assert(!atomic_load32(&_reserved_spans));
-	assert(!atomic_load32(&_mapped_pages_os));
+	//If you hit these asserts you probably have memory leaks (perhaps global scope data doing dynamic allocations) or double frees in your code
+	assert(atomic_load32(&_mapped_pages) == 0);
+	assert(atomic_load32(&_reserved_spans) == 0);
+	assert(atomic_load32(&_mapped_pages_os) == 0);
 #endif
 
 	_rpmalloc_initialized = 0;
@@ -2017,124 +2697,39 @@ rpmalloc_finalize(void) {
 extern inline void
 rpmalloc_thread_initialize(void) {
 	if (!get_thread_heap_raw()) {
-		heap_t* heap = _memory_allocate_heap();
+		heap_t* heap = _rpmalloc_heap_allocate(0);
 		if (heap) {
-			atomic_thread_fence_acquire();
-#if ENABLE_STATISTICS
-			atomic_incr32(&_memory_active_heaps);
-#endif
+			_rpmalloc_stat_inc(&_memory_active_heaps);
 			set_thread_heap(heap);
-#if defined(_MSC_VER) && !defined(__clang__) && (!defined(BUILD_DYNAMIC_LINK) || !BUILD_DYNAMIC_LINK)
+#if defined(_WIN32) && (!defined(BUILD_DYNAMIC_LINK) || !BUILD_DYNAMIC_LINK)
 			FlsSetValue(fls_key, heap);
 #endif
 		}
 	}
-}
-
-//! Finalize thread, orphan heap
-void
-rpmalloc_thread_finalize(void) {
-	heap_t* heap = get_thread_heap_raw();
-	if (heap)
-		_memory_heap_finalize(heap);
-}
-
-int
-rpmalloc_is_thread_initialized(void) {
-	return (get_thread_heap_raw() != 0) ? 1 : 0;
-}
-
-const rpmalloc_config_t*
-rpmalloc_config(void) {
-	return &_memory_config;
-}
-
-//! Map new pages to virtual memory
-static void*
-_memory_map_os(size_t size, size_t* offset) {
-	//Either size is a heap (a single page) or a (multiple) span - we only need to align spans, and only if larger than map granularity
-	size_t padding = ((size >= _memory_span_size) && (_memory_span_size > _memory_map_granularity)) ? _memory_span_size : 0;
-	assert(size >= _memory_page_size);
-#if PLATFORM_WINDOWS
-	//Ok to MEM_COMMIT - according to MSDN, "actual physical pages are not allocated unless/until the virtual addresses are actually accessed"
-	void* ptr = VirtualAlloc(0, size + padding, (_memory_huge_pages ? MEM_LARGE_PAGES : 0) | MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE);
-	if (!ptr) {
-		assert(!"Failed to map virtual memory block");
-		return 0;
-	}
-#else
-	int flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_UNINITIALIZED;
-#  if defined(__APPLE__)
-	int fd = (int)VM_MAKE_TAG(240U);
-	if (_memory_huge_pages)
-		fd |= VM_FLAGS_SUPERPAGE_SIZE_2MB;
-	void* ptr = mmap(0, size + padding, PROT_READ | PROT_WRITE, flags, fd, 0);
-#  elif defined(MAP_HUGETLB)
-	void* ptr = mmap(0, size + padding, PROT_READ | PROT_WRITE, (_memory_huge_pages ? MAP_HUGETLB : 0) | flags, -1, 0);
-#  else
-	void* ptr = mmap(0, size + padding, PROT_READ | PROT_WRITE, flags, -1, 0);
-#  endif
-	if ((ptr == MAP_FAILED) || !ptr) {
-		assert("Failed to map virtual memory block" == 0);
-		return 0;
-	}
-#endif
-#if ENABLE_STATISTICS
-	atomic_add32(&_mapped_pages_os, (int32_t)((size + padding) >> _memory_page_size_shift));
-#endif
-	if (padding) {
-		size_t final_padding = padding - ((uintptr_t)ptr & ~_memory_span_mask);
-		assert(final_padding <= _memory_span_size);
-		assert(final_padding <= padding);
-		assert(!(final_padding % 8));
-		ptr = pointer_offset(ptr, final_padding);
-		*offset = final_padding >> 3;
-	}
-	assert((size < _memory_span_size) || !((uintptr_t)ptr & ~_memory_span_mask));
-	return ptr;
-}
-
-//! Unmap pages from virtual memory
-static void
-_memory_unmap_os(void* address, size_t size, size_t offset, size_t release) {
-	assert(release || (offset == 0));
-	assert(!release || (release >= _memory_page_size));
-	assert(size >= _memory_page_size);
-	if (release && offset) {
-		offset <<= 3;
-		address = pointer_offset(address, -(int32_t)offset);
-#if PLATFORM_POSIX
-		//Padding is always one span size
-		release += _memory_span_size;
-#endif
-	}
-#if !DISABLE_UNMAP
-#if PLATFORM_WINDOWS
-	if (!VirtualFree(address, release ? 0 : size, release ? MEM_RELEASE : MEM_DECOMMIT)) {
-		assert(!"Failed to unmap virtual memory block");
-	}
-#else
-	if (release) {
-		if (munmap(address, release)) {
-			assert("Failed to unmap virtual memory block" == 0);
-		}
-	}
-	else {
-#if defined(POSIX_MADV_FREE)
-		if (posix_madvise(address, size, POSIX_MADV_FREE))
-#endif
-		if (posix_madvise(address, size, POSIX_MADV_DONTNEED)) {
-			assert("Failed to madvise virtual memory block as free" == 0);
-		}
-	}
-#endif
-#endif
-#if ENABLE_STATISTICS
-	if (release)
-		atomic_add32(&_mapped_pages_os, -(int32_t)(release >> _memory_page_size_shift));
+}
+
+//! Finalize thread, orphan heap
+void
+rpmalloc_thread_finalize(void) {
+	heap_t* heap = get_thread_heap_raw();
+	if (heap)
+		_rpmalloc_heap_release_raw(heap);
+	set_thread_heap(0);
+#if defined(_WIN32) && (!defined(BUILD_DYNAMIC_LINK) || !BUILD_DYNAMIC_LINK)
+	FlsSetValue(fls_key, 0);
 #endif
 }
 
+int
+rpmalloc_is_thread_initialized(void) {
+	return (get_thread_heap_raw() != 0) ? 1 : 0;
+}
+
+const rpmalloc_config_t*
+rpmalloc_config(void) {
+	return &_memory_config;
+}
+
 // Extern interface
 
 extern inline RPMALLOC_ALLOCATOR void*
@@ -2146,12 +2741,12 @@ rpmalloc(size_t size) {
 	}
 #endif
 	heap_t* heap = get_thread_heap();
-	return _memory_allocate(heap, size);
+	return _rpmalloc_allocate(heap, size);
 }
 
 extern inline void
 rpfree(void* ptr) {
-	_memory_deallocate(ptr);
+	_rpmalloc_deallocate(ptr);
 }
 
 extern inline RPMALLOC_ALLOCATOR void*
@@ -2175,8 +2770,9 @@ rpcalloc(size_t num, size_t size) {
 	total = num * size;
 #endif
 	heap_t* heap = get_thread_heap();
-	void* block = _memory_allocate(heap, total);
-	memset(block, 0, total);
+	void* block = _rpmalloc_allocate(heap, total);
+	if (block)
+		memset(block, 0, total);
 	return block;
 }
 
@@ -2188,7 +2784,8 @@ rprealloc(void* ptr, size_t size) {
 		return ptr;
 	}
 #endif
-	return _memory_reallocate(ptr, size, 0, 0);
+	heap_t* heap = get_thread_heap();
+	return _rpmalloc_reallocate(heap, ptr, size, 0, 0);
 }
 
 extern RPMALLOC_ALLOCATOR void*
@@ -2200,126 +2797,40 @@ rpaligned_realloc(void* ptr, size_t alignment, size_t size, size_t oldsize,
 		return 0;
 	}
 #endif
-	void* block;
-	if (alignment > 32) {
-		size_t usablesize = _memory_usable_size(ptr);
-		if ((usablesize >= size) && (size >= (usablesize / 2)) && !((uintptr_t)ptr & (alignment - 1)))
-			return ptr;
-
-		block = rpaligned_alloc(alignment, size);
-		if (ptr) {
-			if (!oldsize)
-				oldsize = usablesize;
-			if (!(flags & RPMALLOC_NO_PRESERVE))
-				memcpy(block, ptr, oldsize < size ? oldsize : size);
-			rpfree(ptr);
-		}
-		//Mark as having aligned blocks
-		span_t* span = (span_t*)((uintptr_t)block & _memory_span_mask);
-		span->flags |= SPAN_FLAG_ALIGNED_BLOCKS;
-	} else {
-		block = _memory_reallocate(ptr, size, oldsize, flags);
-	}
-	return block;
+	heap_t* heap = get_thread_heap();
+	return _rpmalloc_aligned_reallocate(heap, ptr, alignment, size, oldsize, flags);
 }
 
 extern RPMALLOC_ALLOCATOR void*
 rpaligned_alloc(size_t alignment, size_t size) {
-	if (alignment <= 16)
-		return rpmalloc(size);
+	heap_t* heap = get_thread_heap();
+	return _rpmalloc_aligned_allocate(heap, alignment, size);
+}
 
+extern inline RPMALLOC_ALLOCATOR void*
+rpaligned_calloc(size_t alignment, size_t num, size_t size) {
+	size_t total;
 #if ENABLE_VALIDATE_ARGS
-	if ((size + alignment) < size) {
+#if PLATFORM_WINDOWS
+	int err = SizeTMult(num, size, &total);
+	if ((err != S_OK) || (total >= MAX_ALLOC_SIZE)) {
 		errno = EINVAL;
 		return 0;
 	}
-	if (alignment & (alignment - 1)) {
+#else
+	int err = __builtin_umull_overflow(num, size, &total);
+	if (err || (total >= MAX_ALLOC_SIZE)) {
 		errno = EINVAL;
 		return 0;
 	}
 #endif
-
-	void* ptr = 0;
-	size_t align_mask = alignment - 1;
-	if (alignment < _memory_page_size) {
-		ptr = rpmalloc(size + alignment);
-		if ((uintptr_t)ptr & align_mask)
-			ptr = (void*)(((uintptr_t)ptr & ~(uintptr_t)align_mask) + alignment);
-		//Mark as having aligned blocks
-		span_t* span = (span_t*)((uintptr_t)ptr & _memory_span_mask);
-		span->flags |= SPAN_FLAG_ALIGNED_BLOCKS;
-		return ptr;
-	}
-
-	// Fallback to mapping new pages for this request. Since pointers passed
-	// to rpfree must be able to reach the start of the span by bitmasking of
-	// the address with the span size, the returned aligned pointer from this
-	// function must be with a span size of the start of the mapped area.
-	// In worst case this requires us to loop and map pages until we get a
-	// suitable memory address. It also means we can never align to span size
-	// or greater, since the span header will push alignment more than one
-	// span size away from span start (thus causing pointer mask to give us
-	// an invalid span start on free)
-	if (alignment & align_mask) {
-		errno = EINVAL;
-		return 0;
-	}
-	if (alignment >= _memory_span_size) {
-		errno = EINVAL;
-		return 0;
-	}
-
-	size_t extra_pages = alignment / _memory_page_size;
-
-	// Since each span has a header, we will at least need one extra memory page
-	size_t num_pages = 1 + (size / _memory_page_size);
-	if (size & (_memory_page_size - 1))
-		++num_pages;
-
-	if (extra_pages > num_pages)
-		num_pages = 1 + extra_pages;
-
-	size_t original_pages = num_pages;
-	size_t limit_pages = (_memory_span_size / _memory_page_size) * 2;
-	if (limit_pages < (original_pages * 2))
-		limit_pages = original_pages * 2;
-
-	size_t mapped_size, align_offset;
-	span_t* span;
-
-retry:
-	align_offset = 0;
-	mapped_size = num_pages * _memory_page_size;
-
-	span = _memory_map(mapped_size, &align_offset);
-	if (!span) {
-		errno = ENOMEM;
-		return 0;
-	}
-	ptr = pointer_offset(span, SPAN_HEADER_SIZE);
-
-	if ((uintptr_t)ptr & align_mask)
-		ptr = (void*)(((uintptr_t)ptr & ~(uintptr_t)align_mask) + alignment);
-
-	if (((size_t)pointer_diff(ptr, span) >= _memory_span_size) ||
-	    (pointer_offset(ptr, size) > pointer_offset(span, mapped_size)) ||
-	    (((uintptr_t)ptr & _memory_span_mask) != (uintptr_t)span)) {
-		_memory_unmap(span, mapped_size, align_offset, mapped_size);
-		++num_pages;
-		if (num_pages > limit_pages) {
-			errno = EINVAL;
-			return 0;
-		}
-		goto retry;
-	}
-
-	//Store page count in span_count
-	span->size_class = (uint32_t)-1;
-	span->span_count = (uint32_t)num_pages;
-	span->align_offset = (uint32_t)align_offset;
-	_memory_statistics_add_peak(&_huge_pages_current, num_pages, _huge_pages_peak);
-
-	return ptr;
+#else
+	total = num * size;
+#endif
+	void* block = rpaligned_alloc(alignment, total);
+	if (block)
+		memset(block, 0, total);
+	return block;
 }
 
 extern inline RPMALLOC_ALLOCATOR void*
@@ -2338,7 +2849,7 @@ rpposix_memalign(void **memptr, size_t alignment, size_t size) {
 
 extern inline size_t
 rpmalloc_usable_size(void* ptr) {
-	return (ptr ? _memory_usable_size(ptr) : 0);
+	return (ptr ? _rpmalloc_usable_size(ptr) : 0);
 }
 
 extern inline void
@@ -2354,13 +2865,13 @@ rpmalloc_thread_statistics(rpmalloc_thread_statistics_t* stats) {
 
 	for (size_t iclass = 0; iclass < SIZE_CLASS_COUNT; ++iclass) {
 		size_class_t* size_class = _memory_size_class + iclass;
-		heap_class_t* heap_class = heap->span_class + iclass;
-		span_t* span = heap_class->partial_span;
+		span_t* span = heap->size_class[iclass].partial_span;
 		while (span) {
-			atomic_thread_fence_acquire();
 			size_t free_count = span->list_size;
-			if (span->state == SPAN_STATE_PARTIAL)
-				free_count += (size_class->block_count - span->used_count);
+			size_t block_count = size_class->block_count;
+			if (span->free_list_limit < block_count)
+				block_count = span->free_list_limit;
+			free_count += (block_count - span->used_count);
 			stats->sizecache = free_count * size_class->block_size;
 			span = span->next;
 		}
@@ -2368,38 +2879,46 @@ rpmalloc_thread_statistics(rpmalloc_thread_statistics_t* stats) {
 
 #if ENABLE_THREAD_CACHE
 	for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) {
-		if (heap->span_cache[iclass])
-			stats->spancache = (size_t)heap->span_cache[iclass]->list_size * (iclass + 1) * _memory_span_size;
-		span_t* deferred_list = !iclass ? atomic_load_ptr(&heap->span_cache_deferred) : 0;
-		//TODO: Incorrect, for deferred lists the size is NOT stored in list_size
-		if (deferred_list)
-			stats->spancache = (size_t)deferred_list->list_size * (iclass + 1) * _memory_span_size;
+		span_cache_t* span_cache;
+		if (!iclass)
+			span_cache = &heap->span_cache;
+		else
+			span_cache = (span_cache_t*)(heap->span_large_cache + (iclass - 1));
+		stats->spancache = span_cache->count * (iclass + 1) * _memory_span_size;
 	}
 #endif
+
+	span_t* deferred = (span_t*)atomic_load_ptr(&heap->span_free_deferred);
+	while (deferred) {
+		if (deferred->size_class != SIZE_CLASS_HUGE)
+			stats->spancache = (size_t)deferred->span_count * _memory_span_size;
+		deferred = (span_t*)deferred->free_list;
+	}
+
 #if ENABLE_STATISTICS
-	stats->thread_to_global = heap->thread_to_global;
-	stats->global_to_thread = heap->global_to_thread;
+	stats->thread_to_global = (size_t)atomic_load64(&heap->thread_to_global);
+	stats->global_to_thread = (size_t)atomic_load64(&heap->global_to_thread);
 
 	for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) {
 		stats->span_use[iclass].current = (size_t)atomic_load32(&heap->span_use[iclass].current);
-		stats->span_use[iclass].peak = (size_t)heap->span_use[iclass].high;
-		stats->span_use[iclass].to_global = (size_t)heap->span_use[iclass].spans_to_global;
-		stats->span_use[iclass].from_global = (size_t)heap->span_use[iclass].spans_from_global;
-		stats->span_use[iclass].to_cache = (size_t)heap->span_use[iclass].spans_to_cache;
-		stats->span_use[iclass].from_cache = (size_t)heap->span_use[iclass].spans_from_cache;
-		stats->span_use[iclass].to_reserved = (size_t)heap->span_use[iclass].spans_to_reserved;
-		stats->span_use[iclass].from_reserved = (size_t)heap->span_use[iclass].spans_from_reserved;
-		stats->span_use[iclass].map_calls = (size_t)heap->span_use[iclass].spans_map_calls;
+		stats->span_use[iclass].peak = (size_t)atomic_load32(&heap->span_use[iclass].high);
+		stats->span_use[iclass].to_global = (size_t)atomic_load32(&heap->span_use[iclass].spans_to_global);
+		stats->span_use[iclass].from_global = (size_t)atomic_load32(&heap->span_use[iclass].spans_from_global);
+		stats->span_use[iclass].to_cache = (size_t)atomic_load32(&heap->span_use[iclass].spans_to_cache);
+		stats->span_use[iclass].from_cache = (size_t)atomic_load32(&heap->span_use[iclass].spans_from_cache);
+		stats->span_use[iclass].to_reserved = (size_t)atomic_load32(&heap->span_use[iclass].spans_to_reserved);
+		stats->span_use[iclass].from_reserved = (size_t)atomic_load32(&heap->span_use[iclass].spans_from_reserved);
+		stats->span_use[iclass].map_calls = (size_t)atomic_load32(&heap->span_use[iclass].spans_map_calls);
 	}
 	for (size_t iclass = 0; iclass < SIZE_CLASS_COUNT; ++iclass) {
 		stats->size_use[iclass].alloc_current = (size_t)atomic_load32(&heap->size_class_use[iclass].alloc_current);
 		stats->size_use[iclass].alloc_peak = (size_t)heap->size_class_use[iclass].alloc_peak;
-		stats->size_use[iclass].alloc_total = (size_t)heap->size_class_use[iclass].alloc_total;
+		stats->size_use[iclass].alloc_total = (size_t)atomic_load32(&heap->size_class_use[iclass].alloc_total);
 		stats->size_use[iclass].free_total = (size_t)atomic_load32(&heap->size_class_use[iclass].free_total);
-		stats->size_use[iclass].spans_to_cache = (size_t)heap->size_class_use[iclass].spans_to_cache;
-		stats->size_use[iclass].spans_from_cache = (size_t)heap->size_class_use[iclass].spans_from_cache;
-		stats->size_use[iclass].spans_from_reserved = (size_t)heap->size_class_use[iclass].spans_from_reserved;
-		stats->size_use[iclass].map_calls = (size_t)heap->size_class_use[iclass].spans_map_calls;
+		stats->size_use[iclass].spans_to_cache = (size_t)atomic_load32(&heap->size_class_use[iclass].spans_to_cache);
+		stats->size_use[iclass].spans_from_cache = (size_t)atomic_load32(&heap->size_class_use[iclass].spans_from_cache);
+		stats->size_use[iclass].spans_from_reserved = (size_t)atomic_load32(&heap->size_class_use[iclass].spans_from_reserved);
+		stats->size_use[iclass].map_calls = (size_t)atomic_load32(&heap->size_class_use[iclass].spans_map_calls);
 	}
 #endif
 }
@@ -2416,67 +2935,89 @@ rpmalloc_global_statistics(rpmalloc_global_statistics_t* stats) {
 	stats->huge_alloc_peak = (size_t)_huge_pages_peak * _memory_page_size;
 #endif
 #if ENABLE_GLOBAL_CACHE
+	for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass)
+		stats->cached += _memory_span_cache[iclass].count * (iclass + 1) * _memory_span_size;
+#endif
+}
+
+#if ENABLE_STATISTICS
+
+static void
+_memory_heap_dump_statistics(heap_t* heap, void* file) {
+	fprintf(file, "Heap %d stats:\n", heap->id);
+	fprintf(file, "Class   CurAlloc  PeakAlloc   TotAlloc    TotFree  BlkSize BlkCount SpansCur SpansPeak  PeakAllocMiB  ToCacheMiB FromCacheMiB FromReserveMiB MmapCalls\n");
+	for (size_t iclass = 0; iclass < SIZE_CLASS_COUNT; ++iclass) {
+		if (!atomic_load32(&heap->size_class_use[iclass].alloc_total))
+			continue;
+		fprintf(file, "%3u:  %10u %10u %10u %10u %8u %8u %8d %9d %13zu %11zu %12zu %14zu %9u\n", (uint32_t)iclass,
+			atomic_load32(&heap->size_class_use[iclass].alloc_current),
+			heap->size_class_use[iclass].alloc_peak,
+			atomic_load32(&heap->size_class_use[iclass].alloc_total),
+			atomic_load32(&heap->size_class_use[iclass].free_total),
+			_memory_size_class[iclass].block_size,
+			_memory_size_class[iclass].block_count,
+			atomic_load32(&heap->size_class_use[iclass].spans_current),
+			heap->size_class_use[iclass].spans_peak,
+			((size_t)heap->size_class_use[iclass].alloc_peak * (size_t)_memory_size_class[iclass].block_size) / (size_t)(1024 * 1024),
+			((size_t)atomic_load32(&heap->size_class_use[iclass].spans_to_cache) * _memory_span_size) / (size_t)(1024 * 1024),
+			((size_t)atomic_load32(&heap->size_class_use[iclass].spans_from_cache) * _memory_span_size) / (size_t)(1024 * 1024),
+			((size_t)atomic_load32(&heap->size_class_use[iclass].spans_from_reserved) * _memory_span_size) / (size_t)(1024 * 1024),
+			atomic_load32(&heap->size_class_use[iclass].spans_map_calls));
+	}
+	fprintf(file, "Spans  Current     Peak  PeakMiB  Cached  ToCacheMiB FromCacheMiB ToReserveMiB FromReserveMiB ToGlobalMiB FromGlobalMiB  MmapCalls\n");
 	for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) {
-		stats->cached += (size_t)atomic_load32(&_memory_span_cache[iclass].size) * (iclass + 1) * _memory_span_size;
-	}
+		if (!atomic_load32(&heap->span_use[iclass].high) && !atomic_load32(&heap->span_use[iclass].spans_map_calls))
+			continue;
+		fprintf(file, "%4u: %8d %8u %8zu %7u %11zu %12zu %12zu %14zu %11zu %13zu %10u\n", (uint32_t)(iclass + 1),
+			atomic_load32(&heap->span_use[iclass].current),
+			atomic_load32(&heap->span_use[iclass].high),
+			((size_t)atomic_load32(&heap->span_use[iclass].high) * (size_t)_memory_span_size * (iclass + 1)) / (size_t)(1024 * 1024),
+#if ENABLE_THREAD_CACHE
+			(unsigned int)(!iclass ? heap->span_cache.count : heap->span_large_cache[iclass - 1].count),
+			((size_t)atomic_load32(&heap->span_use[iclass].spans_to_cache) * (iclass + 1) * _memory_span_size) / (size_t)(1024 * 1024),
+			((size_t)atomic_load32(&heap->span_use[iclass].spans_from_cache) * (iclass + 1) * _memory_span_size) / (size_t)(1024 * 1024),
+#else
+			0, (size_t)0, (size_t)0,
 #endif
+			((size_t)atomic_load32(&heap->span_use[iclass].spans_to_reserved) * (iclass + 1) * _memory_span_size) / (size_t)(1024 * 1024),
+			((size_t)atomic_load32(&heap->span_use[iclass].spans_from_reserved) * (iclass + 1) * _memory_span_size) / (size_t)(1024 * 1024),
+			((size_t)atomic_load32(&heap->span_use[iclass].spans_to_global) * (size_t)_memory_span_size * (iclass + 1)) / (size_t)(1024 * 1024),
+			((size_t)atomic_load32(&heap->span_use[iclass].spans_from_global) * (size_t)_memory_span_size * (iclass + 1)) / (size_t)(1024 * 1024),
+			atomic_load32(&heap->span_use[iclass].spans_map_calls));
+	}
+	fprintf(file, "ThreadToGlobalMiB GlobalToThreadMiB\n");
+	fprintf(file, "%17zu %17zu\n", (size_t)atomic_load64(&heap->thread_to_global) / (size_t)(1024 * 1024), (size_t)atomic_load64(&heap->global_to_thread) / (size_t)(1024 * 1024));
 }
 
+#endif
+
 void
 rpmalloc_dump_statistics(void* file) {
 #if ENABLE_STATISTICS
 	//If you hit this assert, you still have active threads or forgot to finalize some thread(s)
 	assert(atomic_load32(&_memory_active_heaps) == 0);
-
 	for (size_t list_idx = 0; list_idx < HEAP_ARRAY_SIZE; ++list_idx) {
-		heap_t* heap = atomic_load_ptr(&_memory_heaps[list_idx]);
+		heap_t* heap = _memory_heaps[list_idx];
 		while (heap) {
-			fprintf(file, "Heap %d stats:\n", heap->id);
-			fprintf(file, "Class   CurAlloc  PeakAlloc   TotAlloc    TotFree  BlkSize BlkCount SpansCur SpansPeak  PeakAllocMiB  ToCacheMiB FromCacheMiB FromReserveMiB MmapCalls\n");
-			for (size_t iclass = 0; iclass < SIZE_CLASS_COUNT; ++iclass) {
-				if (!heap->size_class_use[iclass].alloc_total) {
+			int need_dump = 0;
+			for (size_t iclass = 0; !need_dump && (iclass < SIZE_CLASS_COUNT); ++iclass) {
+				if (!atomic_load32(&heap->size_class_use[iclass].alloc_total)) {
 					assert(!atomic_load32(&heap->size_class_use[iclass].free_total));
-					assert(!heap->size_class_use[iclass].spans_map_calls);
+					assert(!atomic_load32(&heap->size_class_use[iclass].spans_map_calls));
 					continue;
 				}
-				fprintf(file, "%3u:  %10u %10u %10u %10u %8u %8u %8d %9d %13zu %11zu %12zu %14zu %9u\n", (uint32_t)iclass,
-					atomic_load32(&heap->size_class_use[iclass].alloc_current),
-					heap->size_class_use[iclass].alloc_peak,
-					heap->size_class_use[iclass].alloc_total,
-					atomic_load32(&heap->size_class_use[iclass].free_total),
-					_memory_size_class[iclass].block_size,
-					_memory_size_class[iclass].block_count,
-					heap->size_class_use[iclass].spans_current,
-					heap->size_class_use[iclass].spans_peak,
-					((size_t)heap->size_class_use[iclass].alloc_peak * (size_t)_memory_size_class[iclass].block_size) / (size_t)(1024 * 1024),
-					((size_t)heap->size_class_use[iclass].spans_to_cache * _memory_span_size) / (size_t)(1024 * 1024),
-					((size_t)heap->size_class_use[iclass].spans_from_cache * _memory_span_size) / (size_t)(1024 * 1024),
-					((size_t)heap->size_class_use[iclass].spans_from_reserved * _memory_span_size) / (size_t)(1024 * 1024),
-					heap->size_class_use[iclass].spans_map_calls);
+				need_dump = 1;
 			}
-			fprintf(file, "Spans  Current     Peak  PeakMiB  Cached  ToCacheMiB FromCacheMiB ToReserveMiB FromReserveMiB ToGlobalMiB FromGlobalMiB  MmapCalls\n");
-			for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) {
-				if (!heap->span_use[iclass].high && !heap->span_use[iclass].spans_map_calls)
+			for (size_t iclass = 0; !need_dump && (iclass < LARGE_CLASS_COUNT); ++iclass) {
+				if (!atomic_load32(&heap->span_use[iclass].high) && !atomic_load32(&heap->span_use[iclass].spans_map_calls))
 					continue;
-				fprintf(file, "%4u: %8d %8u %8zu %7u %11zu %12zu %12zu %14zu %11zu %13zu %10u\n", (uint32_t)(iclass + 1),
-					atomic_load32(&heap->span_use[iclass].current),
-					heap->span_use[iclass].high,
-					((size_t)heap->span_use[iclass].high * (size_t)_memory_span_size * (iclass + 1)) / (size_t)(1024 * 1024),
-					heap->span_cache[iclass] ? heap->span_cache[iclass]->list_size : 0,
-					((size_t)heap->span_use[iclass].spans_to_cache * (iclass + 1) * _memory_span_size) / (size_t)(1024 * 1024),
-					((size_t)heap->span_use[iclass].spans_from_cache * (iclass + 1) * _memory_span_size) / (size_t)(1024 * 1024),
-					((size_t)heap->span_use[iclass].spans_to_reserved * (iclass + 1) * _memory_span_size) / (size_t)(1024 * 1024),
-					((size_t)heap->span_use[iclass].spans_from_reserved * (iclass + 1) * _memory_span_size) / (size_t)(1024 * 1024),
-					((size_t)heap->span_use[iclass].spans_to_global * (size_t)_memory_span_size * (iclass + 1)) / (size_t)(1024 * 1024),
-					((size_t)heap->span_use[iclass].spans_from_global * (size_t)_memory_span_size * (iclass + 1)) / (size_t)(1024 * 1024),
-					heap->span_use[iclass].spans_map_calls);
+				need_dump = 1;
 			}
-			fprintf(file, "ThreadToGlobalMiB GlobalToThreadMiB\n");
-			fprintf(file, "%17zu %17zu\n", (size_t)heap->thread_to_global / (size_t)(1024 * 1024), (size_t)heap->global_to_thread / (size_t)(1024 * 1024));
+			if (need_dump)
+				_memory_heap_dump_statistics(heap, file);
 			heap = heap->next_heap;
 		}
 	}
-
 	fprintf(file, "Global stats:\n");
 	size_t huge_current = (size_t)atomic_load32(&_huge_pages_current) * _memory_page_size;
 	size_t huge_peak = (size_t)_huge_pages_peak * _memory_page_size;
@@ -2504,6 +3045,188 @@ rpmalloc_dump_statistics(void* file) {
 #endif
 }
 
+#if RPMALLOC_FIRST_CLASS_HEAPS
+
+extern inline rpmalloc_heap_t*
+rpmalloc_heap_acquire(void) {
+	// Must be a pristine heap from newly mapped memory pages, or else memory blocks
+	// could already be allocated from the heap which would (wrongly) be released when
+	// heap is cleared with rpmalloc_heap_free_all(). Also heaps guaranteed to be
+	// pristine from the dedicated orphan list can be used.
+	heap_t* heap = _rpmalloc_heap_allocate(1);
+	heap->owner_thread = 0;
+	_rpmalloc_stat_inc(&_memory_active_heaps);
+	return heap;
+}
+
+extern inline void
+rpmalloc_heap_release(rpmalloc_heap_t* heap) {
+	if (heap)
+		_rpmalloc_heap_release(heap, 1);
+}
+
+extern inline RPMALLOC_ALLOCATOR void*
+rpmalloc_heap_alloc(rpmalloc_heap_t* heap, size_t size) {
+#if ENABLE_VALIDATE_ARGS
+	if (size >= MAX_ALLOC_SIZE) {
+		errno = EINVAL;
+		return ptr;
+	}
+#endif
+	return _rpmalloc_allocate(heap, size);
+}
+
+extern inline RPMALLOC_ALLOCATOR void*
+rpmalloc_heap_aligned_alloc(rpmalloc_heap_t* heap, size_t alignment, size_t size) {
+#if ENABLE_VALIDATE_ARGS
+	if (size >= MAX_ALLOC_SIZE) {
+		errno = EINVAL;
+		return ptr;
+	}
+#endif
+	return _rpmalloc_aligned_allocate(heap, alignment, size);
+}
+
+extern inline RPMALLOC_ALLOCATOR void*
+rpmalloc_heap_calloc(rpmalloc_heap_t* heap, size_t num, size_t size) {
+	return rpmalloc_heap_aligned_calloc(heap, 0, num, size);
+}
+
+extern inline RPMALLOC_ALLOCATOR void*
+rpmalloc_heap_aligned_calloc(rpmalloc_heap_t* heap, size_t alignment, size_t num, size_t size) {
+	size_t total;
+#if ENABLE_VALIDATE_ARGS
+#if PLATFORM_WINDOWS
+	int err = SizeTMult(num, size, &total);
+	if ((err != S_OK) || (total >= MAX_ALLOC_SIZE)) {
+		errno = EINVAL;
+		return 0;
+	}
+#else
+	int err = __builtin_umull_overflow(num, size, &total);
+	if (err || (total >= MAX_ALLOC_SIZE)) {
+		errno = EINVAL;
+		return 0;
+	}
+#endif
+#else
+	total = num * size;
+#endif
+	void* block = _rpmalloc_aligned_allocate(heap, alignment, total);
+	if (block)
+		memset(block, 0, total);
+	return block;
+}
+
+extern inline RPMALLOC_ALLOCATOR void*
+rpmalloc_heap_realloc(rpmalloc_heap_t* heap, void* ptr, size_t size, unsigned int flags) {
+#if ENABLE_VALIDATE_ARGS
+	if (size >= MAX_ALLOC_SIZE) {
+		errno = EINVAL;
+		return ptr;
+	}
+#endif
+	return _rpmalloc_reallocate(heap, ptr, size, 0, flags);
+}
+
+extern inline RPMALLOC_ALLOCATOR void*
+rpmalloc_heap_aligned_realloc(rpmalloc_heap_t* heap, void* ptr, size_t alignment, size_t size, unsigned int flags) {
+#if ENABLE_VALIDATE_ARGS
+	if ((size + alignment < size) || (alignment > _memory_page_size)) {
+		errno = EINVAL;
+		return 0;
+	}
+#endif
+	return _rpmalloc_aligned_reallocate(heap, ptr, alignment, size, 0, flags);	
+}
+
+extern inline void
+rpmalloc_heap_free(rpmalloc_heap_t* heap, void* ptr) {
+	(void)sizeof(heap);
+	_rpmalloc_deallocate(ptr);
+}
+
+extern inline void
+rpmalloc_heap_free_all(rpmalloc_heap_t* heap) {
+	span_t* span;
+	span_t* next_span;
+
+	_rpmalloc_heap_cache_adopt_deferred(heap, 0);
+
+	for (size_t iclass = 0; iclass < SIZE_CLASS_COUNT; ++iclass) {
+		span = heap->size_class[iclass].partial_span;
+		while (span) {
+			next_span = span->next;
+			_rpmalloc_heap_cache_insert(heap, span);
+			span = next_span;
+		}
+		heap->size_class[iclass].partial_span = 0;
+		span = heap->full_span[iclass];
+		while (span) {
+			next_span = span->next;
+			_rpmalloc_heap_cache_insert(heap, span);
+			span = next_span;
+		}
+	}
+	memset(heap->size_class, 0, sizeof(heap->size_class));
+	memset(heap->full_span, 0, sizeof(heap->full_span));
+
+	span = heap->large_huge_span;
+	while (span) {
+		next_span = span->next;
+		if (UNEXPECTED(span->size_class == SIZE_CLASS_HUGE))
+			_rpmalloc_deallocate_huge(span);
+		else
+			_rpmalloc_heap_cache_insert(heap, span);
+		span = next_span;
+	}
+	heap->large_huge_span = 0;
+	heap->full_span_count = 0;
+
+#if ENABLE_THREAD_CACHE
+	for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) {
+		span_cache_t* span_cache;
+		if (!iclass)
+			span_cache = &heap->span_cache;
+		else
+			span_cache = (span_cache_t*)(heap->span_large_cache + (iclass - 1));
+		if (!span_cache->count)
+			continue;
+#if ENABLE_GLOBAL_CACHE
+		_rpmalloc_stat_add64(&heap->thread_to_global, span_cache->count * (iclass + 1) * _memory_span_size);
+		_rpmalloc_stat_add(&heap->span_use[iclass].spans_to_global, span_cache->count);
+		_rpmalloc_global_cache_insert_spans(span_cache->span, iclass + 1, span_cache->count);
+#else
+		for (size_t ispan = 0; ispan < span_cache->count; ++ispan)
+			_rpmalloc_span_unmap(span_cache->span[ispan]);
+#endif
+		span_cache->count = 0;
+	}
+#endif
+
+#if ENABLE_STATISTICS
+	for (size_t iclass = 0; iclass < SIZE_CLASS_COUNT; ++iclass) {
+		atomic_store32(&heap->size_class_use[iclass].alloc_current, 0);
+		atomic_store32(&heap->size_class_use[iclass].spans_current, 0);
+	}
+	for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) {
+		atomic_store32(&heap->span_use[iclass].current, 0);
+	}
+#endif
+}
+
+extern inline void
+rpmalloc_heap_thread_set_current(rpmalloc_heap_t* heap) {
+	heap_t* prev_heap = get_thread_heap_raw();
+	if (prev_heap != heap) {
+		set_thread_heap(heap);
+		if (prev_heap)
+			rpmalloc_heap_release(prev_heap);
+	}
+}
+
+#endif
+
 #if ENABLE_PRELOAD || ENABLE_OVERRIDE
 
 #include "malloc.c"
diff --git a/rpmalloc/rpmalloc.h b/rpmalloc/rpmalloc.h
index 2f48bc97..6b85c0af 100644
--- a/rpmalloc/rpmalloc.h
+++ b/rpmalloc/rpmalloc.h
@@ -20,11 +20,12 @@ extern "C" {
 #if defined(__clang__) || defined(__GNUC__)
 # define RPMALLOC_EXPORT __attribute__((visibility("default")))
 # define RPMALLOC_ALLOCATOR 
-# define RPMALLOC_ATTRIB_MALLOC __attribute__((__malloc__))
-# if defined(__clang_major__) && (__clang_major__ < 4)
+# if (defined(__clang_major__) && (__clang_major__ < 4)) || (defined(__GNUC__) && defined(ENABLE_PRELOAD) && ENABLE_PRELOAD)
+# define RPMALLOC_ATTRIB_MALLOC
 # define RPMALLOC_ATTRIB_ALLOC_SIZE(size)
 # define RPMALLOC_ATTRIB_ALLOC_SIZE2(count, size)
 # else
+# define RPMALLOC_ATTRIB_MALLOC __attribute__((__malloc__))
 # define RPMALLOC_ATTRIB_ALLOC_SIZE(size) __attribute__((alloc_size(size)))
 # define RPMALLOC_ATTRIB_ALLOC_SIZE2(count, size)  __attribute__((alloc_size(count, size)))
 # endif
@@ -45,13 +46,24 @@ extern "C" {
 # define RPMALLOC_CDECL
 #endif
 
-//! Define RPMALLOC_CONFIGURABLE to enable configuring sizes
+//! Define RPMALLOC_CONFIGURABLE to enable configuring sizes. Will introduce
+//  a very small overhead due to some size calculations not being compile time constants
 #ifndef RPMALLOC_CONFIGURABLE
 #define RPMALLOC_CONFIGURABLE 0
 #endif
 
+//! Define RPMALLOC_FIRST_CLASS_HEAPS to enable heap based API (rpmalloc_heap_* functions).
+//  Will introduce a very small overhead to track fully allocated spans in heaps
+#ifndef RPMALLOC_FIRST_CLASS_HEAPS
+#define RPMALLOC_FIRST_CLASS_HEAPS 0
+#endif
+
 //! Flag to rpaligned_realloc to not preserve content in reallocation
 #define RPMALLOC_NO_PRESERVE    1
+//! Flag to rpaligned_realloc to fail and return null pointer if grow cannot be done in-place,
+//  in which case the original pointer is still valid (just like a call to realloc which failes to allocate
+//  a new block).
+#define RPMALLOC_GROW_OR_FAIL   2
 
 typedef struct rpmalloc_global_statistics_t {
 	//! Current amount of virtual memory mapped, all of which might not have been committed (only if ENABLE_STATISTICS=1)
@@ -99,7 +111,7 @@ typedef struct rpmalloc_thread_statistics_t {
 		size_t from_reserved;
 		//! Number of raw memory map calls (not hitting the reserve spans but resulting in actual OS mmap calls)
 		size_t map_calls;
-	} span_use[32];
+	} span_use[64];
 	//! Per size class statistics (only if ENABLE_STATISTICS=1)
 	struct {
 		//! Current number of allocations
@@ -163,6 +175,7 @@ typedef struct rpmalloc_config_t {
 	//  For Windows, see https://docs.microsoft.com/en-us/windows/desktop/memory/large-page-support
 	//  For Linux, see https://www.kernel.org/doc/Documentation/vm/hugetlbpage.txt
 	int enable_huge_pages;
+	int unused;
 } rpmalloc_config_t;
 
 //! Initialize allocator with default configuration
@@ -240,6 +253,13 @@ rpaligned_realloc(void* ptr, size_t alignment, size_t size, size_t oldsize, unsi
 RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void*
 rpaligned_alloc(size_t alignment, size_t size) RPMALLOC_ATTRIB_MALLOC RPMALLOC_ATTRIB_ALLOC_SIZE(2);
 
+//! Allocate a memory block of at least the given size and alignment, and zero initialize it.
+//  Alignment must be a power of two and a multiple of sizeof(void*),
+//  and should ideally be less than memory page size. A caveat of rpmalloc
+//  internals is that this must also be strictly less than the span size (default 64KiB)
+RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void*
+rpaligned_calloc(size_t alignment, size_t num, size_t size) RPMALLOC_ATTRIB_MALLOC RPMALLOC_ATTRIB_ALLOC_SIZE2(2, 3);
+
 //! Allocate a memory block of at least the given size and alignment.
 //  Alignment must be a power of two and a multiple of sizeof(void*),
 //  and should ideally be less than memory page size. A caveat of rpmalloc
@@ -252,12 +272,80 @@ rpmemalign(size_t alignment, size_t size) RPMALLOC_ATTRIB_MALLOC RPMALLOC_ATTRIB
 //  and should ideally be less than memory page size. A caveat of rpmalloc
 //  internals is that this must also be strictly less than the span size (default 64KiB)
 RPMALLOC_EXPORT int
-rpposix_memalign(void **memptr, size_t alignment, size_t size);
+rpposix_memalign(void** memptr, size_t alignment, size_t size);
 
 //! Query the usable size of the given memory block (from given pointer to the end of block)
 RPMALLOC_EXPORT size_t
 rpmalloc_usable_size(void* ptr);
 
+#if RPMALLOC_FIRST_CLASS_HEAPS
+
+//! Heap type
+typedef struct heap_t rpmalloc_heap_t;
+
+//! Acquire a new heap. Will reuse existing released heaps or allocate memory for a new heap
+//  if none available. Heap API is imlemented with the strict assumption that only one single
+//  thread will call heap functions for a given heap at any given time, no functions are thread safe.
+RPMALLOC_EXPORT rpmalloc_heap_t*
+rpmalloc_heap_acquire(void);
+
+//! Release a heap (does NOT free the memory allocated by the heap, use rpmalloc_heap_free_all before destroying the heap).
+//  Releasing a heap will enable it to be reused by other threads. Safe to pass a null pointer.
+RPMALLOC_EXPORT void
+rpmalloc_heap_release(rpmalloc_heap_t* heap);
+
+//! Allocate a memory block of at least the given size using the given heap.
+RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void*
+rpmalloc_heap_alloc(rpmalloc_heap_t* heap, size_t size) RPMALLOC_ATTRIB_MALLOC RPMALLOC_ATTRIB_ALLOC_SIZE(2);
+
+//! Allocate a memory block of at least the given size using the given heap. The returned
+//  block will have the requested alignment. Alignment must be a power of two and a multiple of sizeof(void*),
+//  and should ideally be less than memory page size. A caveat of rpmalloc
+//  internals is that this must also be strictly less than the span size (default 64KiB).
+RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void*
+rpmalloc_heap_aligned_alloc(rpmalloc_heap_t* heap, size_t alignment, size_t size) RPMALLOC_ATTRIB_MALLOC RPMALLOC_ATTRIB_ALLOC_SIZE(3);
+
+//! Allocate a memory block of at least the given size using the given heap and zero initialize it.
+RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void*
+rpmalloc_heap_calloc(rpmalloc_heap_t* heap, size_t num, size_t size) RPMALLOC_ATTRIB_MALLOC RPMALLOC_ATTRIB_ALLOC_SIZE2(2, 3);
+
+//! Allocate a memory block of at least the given size using the given heap and zero initialize it. The returned
+//  block will have the requested alignment. Alignment must either be zero, or a power of two and a multiple of sizeof(void*),
+//  and should ideally be less than memory page size. A caveat of rpmalloc
+//  internals is that this must also be strictly less than the span size (default 64KiB).
+RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void*
+rpmalloc_heap_aligned_calloc(rpmalloc_heap_t* heap, size_t alignment, size_t num, size_t size) RPMALLOC_ATTRIB_MALLOC RPMALLOC_ATTRIB_ALLOC_SIZE2(2, 3);
+
+//! Reallocate the given block to at least the given size. The memory block MUST be allocated
+//  by the same heap given to this function.
+RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void*
+rpmalloc_heap_realloc(rpmalloc_heap_t* heap, void* ptr, size_t size, unsigned int flags) RPMALLOC_ATTRIB_MALLOC RPMALLOC_ATTRIB_ALLOC_SIZE(3);
+
+//! Reallocate the given block to at least the given size. The memory block MUST be allocated
+//  by the same heap given to this function. The returned block will have the requested alignment.
+//  Alignment must be either zero, or a power of two and a multiple of sizeof(void*), and should ideally be
+//  less than memory page size. A caveat of rpmalloc internals is that this must also be strictly less than
+//  the span size (default 64KiB).
+RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void*
+rpmalloc_heap_aligned_realloc(rpmalloc_heap_t* heap, void* ptr, size_t alignment, size_t size, unsigned int flags) RPMALLOC_ATTRIB_MALLOC RPMALLOC_ATTRIB_ALLOC_SIZE(3);
+
+//! Free the given memory block from the given heap. The memory block MUST be allocated
+//  by the same heap given to this function.
+RPMALLOC_EXPORT void
+rpmalloc_heap_free(rpmalloc_heap_t* heap, void* ptr);
+
+//! Free all memory allocated by the heap
+RPMALLOC_EXPORT void
+rpmalloc_heap_free_all(rpmalloc_heap_t* heap);
+
+//! Set the given heap as the current heap for the calling thread. A heap MUST only be current heap
+//  for a single thread, a heap can never be shared between multiple threads. The previous
+//  current heap for the calling thread is released to be reused by other threads.
+RPMALLOC_EXPORT void
+rpmalloc_heap_thread_set_current(rpmalloc_heap_t* heap);
+
+#endif
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/rpmalloc/rpnew.h b/rpmalloc/rpnew.h
new file mode 100644
index 00000000..cdb3d24d
--- /dev/null
+++ b/rpmalloc/rpnew.h
@@ -0,0 +1,111 @@
+
+#ifdef __cplusplus
+
+#include <new>
+#include <rpmalloc.h>
+
+#ifdef _WIN32
+
+extern void __CRTDECL
+operator delete(void* p) noexcept {
+	rpfree(p);
+}
+
+extern void __CRTDECL
+operator delete[](void* p) noexcept {
+	rpfree(p);
+}
+
+extern void* __CRTDECL
+operator new(std::size_t size) noexcept(false) {
+	return rpmalloc(size);
+}
+
+extern void* __CRTDECL
+operator new[](std::size_t size) noexcept(false) {
+	return rpmalloc(size);
+}
+
+extern void* __CRTDECL
+operator new(std::size_t size, const std::nothrow_t& tag) noexcept {
+	(void)sizeof(tag);
+	return rpmalloc(size);
+}
+
+extern void* __CRTDECL
+operator new[](std::size_t size, const std::nothrow_t& tag) noexcept {
+	(void)sizeof(tag);
+	return rpmalloc(size);
+}
+
+#if (__cplusplus >= 201402L || _MSC_VER >= 1916)
+
+extern void __CRTDECL
+operator delete(void* p, std::size_t size) noexcept {
+	(void)sizeof(size);
+	rpfree(p);
+}
+
+extern void __CRTDECL
+operator delete[](void* p, std::size_t size) noexcept {
+	(void)sizeof(size);
+	rpfree(p);
+}
+
+#endif
+
+#if (__cplusplus > 201402L || defined(__cpp_aligned_new))
+
+extern void __CRTDECL
+operator delete(void* p, std::align_val_t align) noexcept {
+	(void)sizeof(align);
+	rpfree(p);
+}
+
+extern void __CRTDECL
+operator delete[](void* p, std::align_val_t align) noexcept {
+	(void)sizeof(align);
+	rpfree(p);
+}
+
+extern void __CRTDECL
+operator delete(void* p, std::size_t size, std::align_val_t align) noexcept {
+	(void)sizeof(size);
+	(void)sizeof(align);
+	rpfree(p);
+}
+
+extern void __CRTDECL
+operator delete[](void* p, std::size_t size, std::align_val_t align) noexcept {
+	(void)sizeof(size);
+	(void)sizeof(align);
+	rpfree(p);
+}
+
+extern void* __CRTDECL
+operator new(std::size_t size, std::align_val_t align) noexcept(false) {
+	return rpaligned_alloc(align, size);
+}
+
+extern void* __CRTDECL
+operator new[](std::size_t size, std::align_val_t align) noexcept(false) {
+	return rpaligned_alloc(align, size);
+}
+
+extern void* __CRTDECL
+operator new(std::size_t size, std::align_val_t align, const std::nothrow_t& tag) noexcept {
+	(void)sizeof(tag);
+	return rpaligned_alloc(align, size);
+}
+
+extern void* __CRTDECL
+operator new[](std::size_t size, std::align_val_t align, const std::nothrow_t& tag) noexcept {
+	(void)sizeof(tag);
+	return rpaligned_alloc(align, size);
+}
+
+#endif
+
+#endif
+
+#endif
diff --git a/test/main-override.cc b/test/main-override.cc
index 1134d37f..ece254c5 100644
--- a/test/main-override.cc
+++ b/test/main-override.cc
@@ -4,6 +4,7 @@
 #endif
 
 #include <rpmalloc.h>
+#include <rpnew.h>
 #include <thread.h>
 #include <test.h>
 
@@ -12,6 +13,10 @@
 #include <stdio.h>
 #include <string.h>
 #include <math.h>
+#include <inttypes.h>
+
+extern "C" void* RPMALLOC_CDECL pvalloc(size_t size);
+extern "C" void* RPMALLOC_CDECL valloc(size_t size);
 
 static size_t _hardware_threads;
 
@@ -26,6 +31,8 @@ test_fail(const char* reason) {
 
 static int
 test_alloc(void) {
+	const rpmalloc_config_t* config = rpmalloc_config();
+
 	void* p = malloc(371);
 	if (!p)
 		return test_fail("malloc failed");
@@ -47,6 +54,31 @@ test_alloc(void) {
 		return test_fail("usable size invalid (3)");
 	delete[] static_cast<int*>(p);
 
+	p = new int[32];
+	if (!p)
+		return test_fail("new[] failed");
+	if (rpmalloc_usable_size(p) != 32*sizeof(int))
+		return test_fail("usable size invalid (4)");
+	delete[] static_cast<int*>(p);
+
+	p = valloc(873);
+	if (reinterpret_cast<uintptr_t>(p) & (config->page_size - 1)) {
+		fprintf(stderr, "FAIL: pvalloc did not align address to page size (%p)\n", p);
+		return -1;
+	}
+	free(p);
+
+	p = pvalloc(275);
+	if (reinterpret_cast<uintptr_t>(p) & (config->page_size - 1)) {
+		fprintf(stderr, "FAIL: pvalloc did not align address to page size (%p)\n", p);
+		return -1;
+	}
+	if (reinterpret_cast<uintptr_t>(p) < config->page_size) {
+		fprintf(stderr, "FAIL: pvalloc did not align size to page size (%" PRIu64 ")\n", static_cast<uint64_t>(rpmalloc_usable_size(p)));
+		return -1;
+	}
+	rpfree(p);
+
 	printf("Allocation tests passed\n");
 	return 0;
 }
@@ -56,6 +88,7 @@ test_free(void) {
 	free(rpmalloc(371));
 	free(new int);
 	free(new int[16]);
+	free(pvalloc(1275));
 	printf("Free tests passed\n");
 	return 0;	
 }
@@ -131,7 +164,10 @@ main(int argc, char** argv) {
 #endif
 
 #ifdef _WIN32
-#include <Windows.h>
+#if defined(__clang__)
+#pragma clang diagnostic ignored "-Wnonportable-system-include-path"
+#endif
+#include <windows.h>
 
 static void
 test_initialize(void) {
diff --git a/test/main.c b/test/main.c
index 679287f5..f8db4c7e 100644
--- a/test/main.c
+++ b/test/main.c
@@ -2,6 +2,9 @@
 #if defined(_WIN32) && !defined(_CRT_SECURE_NO_WARNINGS)
 #  define _CRT_SECURE_NO_WARNINGS
 #endif
+#if defined(__clang__)
+#pragma clang diagnostic ignored "-Wnonportable-system-include-path"
+#endif
 
 #include <rpmalloc.h>
 #include <thread.h>
@@ -18,16 +21,26 @@
 #define pointer_diff(first, second) (ptrdiff_t)((const char*)(first) - (const char*)(second))
 
 static size_t _hardware_threads;
+static int _test_failed;
 
 static void
 test_initialize(void);
 
 static int
-test_fail(const char* reason) {
-	fprintf(stderr, "FAIL: %s\n", reason);
+test_fail_cb(const char* reason, const char* file, int line) {
+	fprintf(stderr, "FAIL: %s @ %s:%d\n", reason, file, line);
+	fflush(stderr);
+	_test_failed = 1;
 	return -1;
 }
 
+#define test_fail(msg) test_fail_cb(msg, __FILE__, __LINE__)
+
+static void
+defer_free_thread(void *arg) {
+	rpfree(arg);
+}
+
 static int
 test_alloc(void) {
 	unsigned int iloop = 0;
@@ -37,9 +50,15 @@ test_alloc(void) {
 	void* addr[8142];
 	char data[20000];
 	unsigned int datasize[7] = { 473, 39, 195, 24, 73, 376, 245 };
+	size_t wanted_usable_size;
 
 	rpmalloc_initialize();
 
+	//Query the small granularity
+	void* zero_alloc = rpmalloc(0);
+	size_t small_granularity = rpmalloc_usable_size(zero_alloc);
+	rpfree(zero_alloc);
+
 	for (id = 0; id < 20000; ++id)
 		data[id] = (char)(id % 139 + id % 17);
 
@@ -58,16 +77,18 @@ test_alloc(void) {
 	rpfree(testptr);
 	for (iloop = 0; iloop <= 1024; ++iloop) {
 		testptr = rpmalloc(iloop);
-		size_t wanted_usable_size = 16 * ((iloop / 16) + ((!iloop || (iloop % 16)) ? 1 : 0));
-		if (rpmalloc_usable_size(testptr) != wanted_usable_size)
+		wanted_usable_size = iloop ? small_granularity * ((iloop + (small_granularity - 1)) / small_granularity) : small_granularity;
+		if (rpmalloc_usable_size(testptr) != wanted_usable_size) {
+			printf("For %u wanted %zu got %zu\n", iloop, wanted_usable_size, rpmalloc_usable_size(testptr));
 			return test_fail("Bad base alloc usable size");
+		}
 		rpfree(testptr);
 	}
 
 	//Verify medium block sizes (until class merging kicks in)
 	for (iloop = 1025; iloop <= 6000; ++iloop) {
 		testptr = rpmalloc(iloop);
-		size_t wanted_usable_size = 512 * ((iloop / 512) + ((iloop % 512) ? 1 : 0));
+		wanted_usable_size = 512 * ((iloop / 512) + ((iloop % 512) ? 1 : 0));
 		if (rpmalloc_usable_size(testptr) != wanted_usable_size)
 			return test_fail("Bad medium alloc usable size");
 		rpfree(testptr);
@@ -76,9 +97,10 @@ test_alloc(void) {
 	//Large reallocation test
 	testptr = rpmalloc(253000);
 	testptr = rprealloc(testptr, 151);
-	if (rpmalloc_usable_size(testptr) != 160)
+	wanted_usable_size = (small_granularity * ((151 + (small_granularity - 1)) / small_granularity));
+	if (rpmalloc_usable_size(testptr) != wanted_usable_size)
 		return test_fail("Bad usable size");
-	if (rpmalloc_usable_size(pointer_offset(testptr, 16)) != 144)
+	if (rpmalloc_usable_size(pointer_offset(testptr, 16)) != (wanted_usable_size - 16))
 		return test_fail("Bad offset usable size");
 	rpfree(testptr);
 
@@ -87,7 +109,7 @@ test_alloc(void) {
 		size_t size = 37 * iloop;
 		testptr = rpmalloc(size);
 		*((uintptr_t*)testptr) = 0x12345678;
-		size_t wanted_usable_size = 16 * ((size / 16) + ((size % 16) ? 1 : 0));
+		wanted_usable_size = small_granularity * ((size / small_granularity) + ((size % small_granularity) ? 1 : 0));
 		if (rpmalloc_usable_size(testptr) != wanted_usable_size)
 			return test_fail("Bad usable size (alloc)");
 		testptr = rprealloc(testptr, size + 16);
@@ -99,7 +121,7 @@ test_alloc(void) {
 
 		testptr = rpaligned_alloc(128, size);
 		*((uintptr_t*)testptr) = 0x12345678;
-		wanted_usable_size = 16 * ((size / 16) + ((size % 16) ? 1 : 0));
+		wanted_usable_size = small_granularity * ((size / small_granularity) + ((size % small_granularity) ? 1 : 0));
 		if (rpmalloc_usable_size(testptr) < wanted_usable_size)
 			return test_fail("Bad usable size (aligned alloc)");
 		if (rpmalloc_usable_size(testptr) > (wanted_usable_size + 128))
@@ -109,6 +131,8 @@ test_alloc(void) {
 			return test_fail("Bad usable size (aligned realloc)");
 		if (*((uintptr_t*)testptr) != 0x12345678)
 			return test_fail("Data not preserved on realloc");
+		if (rpaligned_realloc(testptr, 128, size * 1024 * 4, 0, RPMALLOC_GROW_OR_FAIL))
+			return test_fail("Realloc with grow-or-fail did not fail as expected");
 		void* unaligned = rprealloc(testptr, size);
 		if (unaligned != testptr) {
 			ptrdiff_t diff = pointer_diff(testptr, unaligned);
@@ -120,11 +144,23 @@ test_alloc(void) {
 		rpfree(testptr);
 	}
 
-	static size_t alignment[3] = { 0, 64, 256 };
+	static size_t alignment[5] = { 0, 32, 64, 128, 256 };
+	for (iloop = 0; iloop < 5; ++iloop) {
+		for (ipass = 0; ipass < 128 * 1024; ++ipass) {
+			size_t this_alignment = alignment[iloop];
+			char* baseptr = rpaligned_alloc(this_alignment, ipass);
+			if (this_alignment && ((uintptr_t)baseptr & (this_alignment - 1)))
+				return test_fail("Alignment failed");
+			rpfree(baseptr);
+		}
+	}
 	for (iloop = 0; iloop < 64; ++iloop) {
 		for (ipass = 0; ipass < 8142; ++ipass) {
+			size_t this_alignment = alignment[ipass % 5];
 			size_t size = iloop + ipass + datasize[(iloop + ipass) % 7];
-			char* baseptr = rpaligned_alloc(alignment[ipass % 3], size);
+			char* baseptr = rpaligned_alloc(this_alignment, size);
+			if (this_alignment && ((uintptr_t)baseptr & (this_alignment - 1)))
+				return test_fail("Alignment failed");
 			for (size_t ibyte = 0; ibyte < size; ++ibyte)
 				baseptr[ibyte] = (char)(ibyte & 0xFF);
 
@@ -137,8 +173,9 @@ test_alloc(void) {
 			}
 
 			size_t alignsize = (iloop * ipass + datasize[(iloop + ipass * 3) % 7]) & 0x2FF;
+			this_alignment = alignment[(ipass + 1) % 5];
 			capsize = (capsize > alignsize ? alignsize : capsize);
-			baseptr = rpaligned_realloc(baseptr, 128, alignsize, resize, 0);
+			baseptr = rpaligned_realloc(baseptr, this_alignment, alignsize, resize, 0);
 			for (size_t ibyte = 0; ibyte < capsize; ++ibyte) {
 				if (baseptr[ibyte] != (char)(ibyte & 0xFF))
 					return test_fail("Data not preserved on realloc");
@@ -282,6 +319,29 @@ test_alloc(void) {
 	}
 	rpmalloc_finalize();
 
+	// Test that a full span with deferred block is finalized properly
+	// Also test that a deferred huge span is finalized properly
+	rpmalloc_initialize();
+	{
+		addr[0] = rpmalloc(23457);
+	
+		thread_arg targ;
+		targ.fn = defer_free_thread;
+		targ.arg = addr[0];
+		uintptr_t thread = thread_run(&targ);
+		thread_sleep(100);
+		thread_join(thread);
+
+		addr[0] = rpmalloc(12345678);
+
+		targ.fn = defer_free_thread;
+		targ.arg = addr[0];
+		thread = thread_run(&targ);
+		thread_sleep(100);
+		thread_join(thread);
+	}
+	rpmalloc_finalize();
+
 	printf("Memory allocation tests passed\n");
 
 	return 0;
@@ -313,9 +373,13 @@ test_realloc(void) {
 
 	size_t bigsize = 1024 * 1024;
 	void* bigptr = rpmalloc(bigsize);
-	while (bigsize < 3 * 1024 * 1024) {
+	while (bigsize < 3000000) {
 		++bigsize;
 		bigptr = rprealloc(bigptr, bigsize);
+		if (rpaligned_realloc(bigptr, 0, bigsize * 32, 0, RPMALLOC_GROW_OR_FAIL))
+			return test_fail("Reallocation with grow-or-fail did not fail as expected");
+		if (rpaligned_realloc(bigptr, 128, bigsize * 32, 0, RPMALLOC_GROW_OR_FAIL))
+			return test_fail("Reallocation with aligned grow-or-fail did not fail as expected");
 	}
 	rpfree(bigptr);
 
@@ -362,6 +426,7 @@ typedef struct _allocator_thread_arg {
 	unsigned int        passes; //max 4096
 	unsigned int        datasize[32];
 	unsigned int        num_datasize; //max 32
+	int                 init_fini_each_loop;
 	void**              pointers;
 	void**              crossthread_pointers;
 } allocator_thread_arg_t;
@@ -388,7 +453,13 @@ allocator_thread(void* argp) {
 
 	thread_sleep(1);
 
+	if (arg.init_fini_each_loop)
+		rpmalloc_thread_finalize();
+
 	for (iloop = 0; iloop < arg.loops; ++iloop) {
+		if (arg.init_fini_each_loop)
+			rpmalloc_thread_initialize();
+
 		for (ipass = 0; ipass < arg.passes; ++ipass) {
 			cursize = 4 + arg.datasize[(iloop + ipass + iwait) % arg.num_datasize] + ((iloop + ipass) % 1024);
 
@@ -431,8 +502,14 @@ allocator_thread(void* argp) {
 
 			rpfree(addr[ipass]);
 		}
+
+		if (arg.init_fini_each_loop)
+			rpmalloc_thread_finalize();
 	}
 
+	if (arg.init_fini_each_loop)
+		rpmalloc_thread_initialize();
+
 	rpfree(data);
 	rpfree(addr);
 
@@ -442,6 +519,87 @@ allocator_thread(void* argp) {
 	thread_exit((uintptr_t)ret);
 }
 
+#if RPMALLOC_FIRST_CLASS_HEAPS
+
+static void
+heap_allocator_thread(void* argp) {
+	allocator_thread_arg_t arg = *(allocator_thread_arg_t*)argp;
+	unsigned int iloop = 0;
+	unsigned int ipass = 0;
+	unsigned int icheck = 0;
+	unsigned int id = 0;
+	void** addr;
+	uint32_t* data;
+	unsigned int cursize;
+	unsigned int iwait = 0;
+	int ret = 0;
+
+	rpmalloc_heap_t* outer_heap = rpmalloc_heap_acquire();
+
+	addr = rpmalloc_heap_alloc(outer_heap, sizeof(void*) * arg.passes);
+	data = rpmalloc_heap_alloc(outer_heap, 512 * 1024);
+	for (id = 0; id < 512 * 1024 / 4; ++id)
+		data[id] = id;
+
+	thread_sleep(1);
+
+	for (iloop = 0; iloop < arg.loops; ++iloop) {
+		rpmalloc_heap_t* heap = rpmalloc_heap_acquire();
+
+		for (ipass = 0; ipass < arg.passes; ++ipass) {
+			cursize = 4 + arg.datasize[(iloop + ipass + iwait) % arg.num_datasize] + ((iloop + ipass) % 1024);
+
+			addr[ipass] = rpmalloc_heap_alloc(heap, 4 + cursize);
+			if (addr[ipass] == 0) {
+				ret = test_fail("Allocation failed");
+				goto end;
+			}
+
+			*(uint32_t*)addr[ipass] = (uint32_t)cursize;
+			memcpy(pointer_offset(addr[ipass], 4), data, cursize);
+
+			for (icheck = 0; icheck < ipass; ++icheck) {
+				if (addr[icheck] == addr[ipass]) {
+					ret = test_fail("Identical pointer returned from allocation");
+					goto end;
+				}
+				if (addr[icheck] < addr[ipass]) {
+					if (pointer_offset(addr[icheck], *(uint32_t*)addr[icheck] + 4) > addr[ipass]) {
+						ret = test_fail("Invalid pointer inside another block returned from allocation");
+						goto end;
+					}
+				}
+				else if (addr[icheck] > addr[ipass]) {
+					if (pointer_offset(addr[ipass], *(uint32_t*)addr[ipass] + 4) > addr[icheck]) {
+						ret = test_fail("Invalid pointer inside another block returned from allocation");
+						goto end;
+					}
+				}
+			}
+		}
+
+		for (ipass = 0; ipass < arg.passes; ++ipass) {
+			cursize = *(uint32_t*)addr[ipass];
+
+			if (memcmp(pointer_offset(addr[ipass], 4), data, cursize)) {
+				ret = test_fail("Data corrupted");
+				goto end;
+			}
+		}
+
+		rpmalloc_heap_free_all(heap);
+		rpmalloc_heap_release(heap);
+	}
+
+	rpmalloc_heap_free_all(outer_heap);
+	rpmalloc_heap_release(outer_heap);
+
+end:
+	thread_exit((uintptr_t)ret);
+}
+
+#endif
+
 static void
 crossallocator_thread(void* argp) {
 	allocator_thread_arg_t arg = *(allocator_thread_arg_t*)argp;
@@ -463,7 +621,7 @@ crossallocator_thread(void* argp) {
 	for (iloop = 0; iloop < arg.loops; ++iloop) {
 		for (ipass = 0; ipass < arg.passes; ++ipass) {
 			size_t iarg = (iloop + ipass + iextra++) % arg.num_datasize;
-			cursize = arg.datasize[iarg] + ((iloop + ipass) % 21);
+			cursize = arg.datasize[iarg] + ((iloop + ipass) % 439);
 			void* first_addr = rpmalloc(cursize);
 			if (first_addr == 0) {
 				ret = test_fail("Allocation failed");
@@ -479,7 +637,7 @@ crossallocator_thread(void* argp) {
 			}
 
 			iarg = (iloop + ipass + iextra++) % arg.num_datasize;
-			cursize = arg.datasize[iarg] + ((iloop + ipass) % 17);
+			cursize = arg.datasize[iarg] + ((iloop + ipass) % 751);
 			void* third_addr = rpmalloc(cursize);
 			if (third_addr == 0) {
 				ret = test_fail("Allocation failed");
@@ -507,7 +665,7 @@ crossallocator_thread(void* argp) {
 
 	rpfree(extra_pointers);
 
-	while (next_crossthread < end_crossthread) {
+	while ((next_crossthread < end_crossthread) && !_test_failed) {
 		if (arg.crossthread_pointers[next_crossthread]) {
 			rpfree(arg.crossthread_pointers[next_crossthread]);
 			arg.crossthread_pointers[next_crossthread] = 0;
@@ -526,47 +684,57 @@ crossallocator_thread(void* argp) {
 static void
 initfini_thread(void* argp) {
 	allocator_thread_arg_t arg = *(allocator_thread_arg_t*)argp;
-	unsigned int iloop = 0;
-	unsigned int ipass = 0;
-	unsigned int icheck = 0;
+	unsigned int iloop;
+	unsigned int ipass;
+	unsigned int icheck;
 	unsigned int id = 0;
-	void* addr[4096];
+	uint32_t* addr[4096];
+	uint32_t blocksize[4096];
 	char data[8192];
 	unsigned int cursize;
-	unsigned int iwait = 0;
+	unsigned int max_datasize = 0;
+	uint32_t this_size;
+	uint32_t check_size;
 	int ret = 0;
 
-	for (id = 0; id < 8192; ++id)
+	for (id = 0; id < sizeof(data); ++id)
 		data[id] = (char)id;
 
 	thread_yield();
 
+	if (arg.passes > (sizeof(addr) / sizeof(addr[0])))
+		arg.passes = sizeof(addr) / sizeof(addr[0]);
+
 	for (iloop = 0; iloop < arg.loops; ++iloop) {
 		rpmalloc_thread_initialize();
 
-		unsigned int max_datasize = 0;
+		max_datasize = 0;
 		for (ipass = 0; ipass < arg.passes; ++ipass) {
-			cursize = arg.datasize[(iloop + ipass + iwait) % arg.num_datasize] + ((iloop + ipass) % 1024);
+			cursize = arg.datasize[(iloop + ipass) % arg.num_datasize] + ((iloop + ipass) % 1024);
+			if (cursize > sizeof(data))
+				cursize = sizeof(data);
 			if (cursize > max_datasize)
 				max_datasize = cursize;
 
-			addr[ipass] = rpmalloc(4 + cursize);
+			addr[ipass] = rpmalloc(sizeof(uint32_t) + cursize);
 			if (addr[ipass] == 0) {
 				ret = test_fail("Allocation failed");
 				goto end;
 			}
 
-			*(uint32_t*)addr[ipass] = (uint32_t)cursize;
-			memcpy(pointer_offset(addr[ipass], 4), data, cursize);
+			blocksize[ipass] = (uint32_t)cursize;
+			addr[ipass][0] = (uint32_t)cursize;
+			memcpy(addr[ipass] + 1, data, cursize);
 
 			for (icheck = 0; icheck < ipass; ++icheck) {
-				size_t this_size = *(uint32_t*)addr[ipass];
-				size_t check_size = *(uint32_t*)addr[icheck];
+				this_size = addr[ipass][0];
+				check_size = addr[icheck][0];
 				if (this_size != cursize) {
 					ret = test_fail("Data corrupted in this block (size)");
 					goto end;
 				}
-				if (check_size > max_datasize) {
+				if (check_size != blocksize[icheck]) {
+					printf("For %u:%u got previous block size %u (%x) wanted %u (%x)\n", iloop, ipass, check_size, check_size, blocksize[icheck], blocksize[icheck]);
 					ret = test_fail("Data corrupted in previous block (size)");
 					goto end;
 				}
@@ -575,13 +743,12 @@ initfini_thread(void* argp) {
 					goto end;
 				}
 				if (addr[icheck] < addr[ipass]) {
-					if (pointer_offset(addr[icheck], check_size + 4) > addr[ipass]) {
+					if (pointer_offset(addr[icheck], check_size + sizeof(uint32_t)) > (void*)addr[ipass]) {
 						ret = test_fail("Invalid pointer inside another block returned from allocation");
 						goto end;
 					}
-				}
-				else if (addr[icheck] > addr[ipass]) {
-					if (pointer_offset(addr[ipass], cursize + 4) > addr[icheck]) {
+				} else {
+					if (pointer_offset(addr[ipass], this_size + sizeof(uint32_t)) > (void*)addr[icheck]) {
 						ret = test_fail("Invalid pointer inside another block returned from allocation");
 						goto end;
 					}
@@ -590,13 +757,19 @@ initfini_thread(void* argp) {
 		}
 
 		for (ipass = 0; ipass < arg.passes; ++ipass) {
-			cursize = *(uint32_t*)addr[ipass];
+			cursize = addr[ipass][0];
+
+			if (cursize != blocksize[ipass]) {
+				printf("For %u:%u got size %u (%x) wanted %u (%x)\n", iloop, ipass, cursize, cursize, blocksize[ipass], blocksize[ipass]);
+				ret = test_fail("Data corrupted (size)");
+				goto end;
+			}
 			if (cursize > max_datasize) {
+				printf("For %u:%u got size %u (%x) >= %u\n", iloop, ipass, cursize, cursize, max_datasize);
 				ret = test_fail("Data corrupted (size)");
 				goto end;
 			}
-
-			if (memcmp(pointer_offset(addr[ipass], 4), data, cursize)) {
+			if (memcmp(addr[ipass] + 1, data, cursize)) {
 				ret = test_fail("Data corrupted");
 				goto end;
 			}
@@ -646,8 +819,14 @@ test_threaded(void) {
 	arg.datasize[14] = 38934;
 	arg.datasize[15] = 234;
 	arg.num_datasize = 16;
+#if defined(__LLP64__) || defined(__LP64__) || defined(_WIN64)
 	arg.loops = 100;
 	arg.passes = 4000;
+#else
+	arg.loops = 30;
+	arg.passes = 1000;
+#endif
+	arg.init_fini_each_loop = 0;
 
 	thread_arg targ;
 	targ.fn = allocator_thread;
@@ -674,22 +853,27 @@ test_threaded(void) {
 
 static int 
 test_crossthread(void) {
-	uintptr_t thread[8];
-	allocator_thread_arg_t arg[8];
-	thread_arg targ[8];
+	uintptr_t thread[32];
+	allocator_thread_arg_t arg[32];
+	thread_arg targ[32];
 
 	rpmalloc_initialize();
 
 	size_t num_alloc_threads = _hardware_threads;
 	if (num_alloc_threads < 2)
 		num_alloc_threads = 2;
-	if (num_alloc_threads > 4)
-		num_alloc_threads = 4;
+	if (num_alloc_threads > 16)
+		num_alloc_threads = 16;
 
 	for (unsigned int ithread = 0; ithread < num_alloc_threads; ++ithread) {
 		unsigned int iadd = (ithread * (16 + ithread) + ithread) % 128;
+#if defined(__LLP64__) || defined(__LP64__) || defined(_WIN64)
 		arg[ithread].loops = 50;
 		arg[ithread].passes = 1024;
+#else
+		arg[ithread].loops = 20;
+		arg[ithread].passes = 200;
+#endif
 		arg[ithread].pointers = rpmalloc(sizeof(void*) * arg[ithread].loops * arg[ithread].passes);
 		memset(arg[ithread].pointers, 0, sizeof(void*) * arg[ithread].loops * arg[ithread].passes);
 		arg[ithread].datasize[0] = 19 + iadd;
@@ -701,11 +885,11 @@ test_crossthread(void) {
 		arg[ithread].datasize[6] = 3892 + iadd;
 		arg[ithread].datasize[7] = 19 + iadd;
 		arg[ithread].datasize[8] = 154 + iadd;
-		arg[ithread].datasize[9] = 39723 + iadd;
-		arg[ithread].datasize[10] = 15 + iadd;
-		arg[ithread].datasize[11] = 493 + iadd;
+		arg[ithread].datasize[9] = 9723 + iadd;
+		arg[ithread].datasize[10] = 15543 + iadd;
+		arg[ithread].datasize[11] = 32493 + iadd;
 		arg[ithread].datasize[12] = 34 + iadd;
-		arg[ithread].datasize[13] = 894 + iadd;
+		arg[ithread].datasize[13] = 1894 + iadd;
 		arg[ithread].datasize[14] = 193 + iadd;
 		arg[ithread].datasize[15] = 2893 + iadd;
 		arg[ithread].num_datasize = 16;
@@ -754,8 +938,13 @@ test_threadspam(void) {
 	num_alloc_threads = _hardware_threads;
 	if (num_alloc_threads < 2)
 		num_alloc_threads = 2;
-	if (num_alloc_threads > 64)
-		num_alloc_threads = 64;
+#if defined(__LLP64__) || defined(__LP64__) || defined(_WIN64)
+	if (num_alloc_threads > 32)
+		num_alloc_threads = 32;
+#else
+	if (num_alloc_threads > 16)
+		num_alloc_threads = 16;
+#endif
 
 	arg.loops = 500;
 	arg.passes = 10;
@@ -775,8 +964,7 @@ test_threadspam(void) {
 		thread[i] = thread_run(&targ);
 
 	for (j = 0; j < num_passes; ++j) {
-		thread_sleep(10);
-		thread_fence();
+		thread_sleep(100);
 
 		for (i = 0; i < num_alloc_threads; ++i) {
 			threadres[i] = thread_join(thread[i]);
@@ -803,6 +991,76 @@ test_threadspam(void) {
 	return 0;
 }
 
+static int
+test_first_class_heaps(void) {
+#if RPMALLOC_FIRST_CLASS_HEAPS
+	uintptr_t thread[32];
+	uintptr_t threadres[32];
+	unsigned int i;
+	size_t num_alloc_threads;
+	allocator_thread_arg_t arg[32];
+
+	rpmalloc_initialize();
+
+	num_alloc_threads = _hardware_threads * 2;
+	if (num_alloc_threads < 2)
+		num_alloc_threads = 2;
+	if (num_alloc_threads > 16)
+		num_alloc_threads = 16;
+
+	for (i = 0; i < num_alloc_threads; ++i) {
+		arg[i].datasize[0] = 19;
+		arg[i].datasize[1] = 249;
+		arg[i].datasize[2] = 797;
+		arg[i].datasize[3] = 3058;
+		arg[i].datasize[4] = 47892;
+		arg[i].datasize[5] = 173902;
+		arg[i].datasize[6] = 389;
+		arg[i].datasize[7] = 19;
+		arg[i].datasize[8] = 2493;
+		arg[i].datasize[9] = 7979;
+		arg[i].datasize[10] = 3;
+		arg[i].datasize[11] = 79374;
+		arg[i].datasize[12] = 3432;
+		arg[i].datasize[13] = 548;
+		arg[i].datasize[14] = 38934;
+		arg[i].datasize[15] = 234;
+		arg[i].num_datasize = 16;
+#if defined(__LLP64__) || defined(__LP64__) || defined(_WIN64)
+		arg[i].loops = 100;
+		arg[i].passes = 4000;
+#else
+		arg[i].loops = 50;
+		arg[i].passes = 1000;
+#endif
+		arg[i].init_fini_each_loop = 1;
+
+		thread_arg targ;
+		targ.fn = heap_allocator_thread;
+		if ((i % 2) != 0)
+			targ.fn = allocator_thread;
+		targ.arg = &arg[i];
+
+		thread[i] = thread_run(&targ);
+	}
+
+	thread_sleep(1000);
+
+	for (i = 0; i < num_alloc_threads; ++i)
+		threadres[i] = thread_join(thread[i]);
+
+	rpmalloc_finalize();
+
+	for (i = 0; i < num_alloc_threads; ++i) {
+		if (threadres[i])
+			return -1;
+	}
+
+	printf("First class heap tests passed\n");
+#endif
+	return 0;
+}
+
 int
 test_run(int argc, char** argv) {
 	(void)sizeof(argc);
@@ -816,9 +1074,11 @@ test_run(int argc, char** argv) {
 		return -1;
 	if (test_crossthread())
 		return -1;
+	if (test_threaded())
+		return -1;
 	if (test_threadspam())
 		return -1;
-	if (test_threaded())
+	if (test_first_class_heaps())
 		return -1;
 	printf("All tests passed\n");
 	return 0;
@@ -843,7 +1103,7 @@ main(int argc, char** argv) {
 #endif
 
 #ifdef _WIN32
-#include <Windows.h>
+#include <windows.h>
 
 static void
 test_initialize(void) {
diff --git a/test/thread.c b/test/thread.c
index 15b8d1d2..152b2682 100644
--- a/test/thread.c
+++ b/test/thread.c
@@ -6,9 +6,12 @@
 #else
 #  define ATTRIBUTE_NORETURN __attribute__((noreturn))
 #endif
+#if defined(__clang__)
+#pragma clang diagnostic ignored "-Wnonportable-system-include-path"
+#endif
 
 #ifdef _WIN32
-#  include <Windows.h>
+#  include <windows.h>
 #  include <process.h>
 
 static unsigned __stdcall
@@ -23,12 +26,6 @@ thread_entry(void* argptr) {
 #  include <pthread.h>
 #  include <sched.h>
 
-#if !defined(__x86_64__) && !defined(_AMD64_) && !defined(_M_AMD64) && !defined(__i386__)
-#  define MEMORY_BARRIER __sync_synchronize()
-#else
-#  define MEMORY_BARRIER __asm__ __volatile__("":::"memory")
-#endif
-
 static void*
 thread_entry(void* argptr) {
 	thread_arg* arg = argptr;
@@ -94,19 +91,8 @@ thread_sleep(int milliseconds) {
 void
 thread_yield(void) {
 #ifdef _WIN32
-	Sleep(0);
-	_ReadWriteBarrier();
+	SleepEx(0, 1);
 #else
 	sched_yield();
-	MEMORY_BARRIER;
-#endif
-}
-
-void
-thread_fence(void) {
-#ifdef _WIN32
-	_ReadWriteBarrier();
-#else
-	MEMORY_BARRIER;
 #endif
 }
diff --git a/test/thread.h b/test/thread.h
index 5c0d873c..062970eb 100644
--- a/test/thread.h
+++ b/test/thread.h
@@ -26,9 +26,6 @@ thread_sleep(int milliseconds);
 extern void
 thread_yield(void);
 
-extern void
-thread_fence(void);
-
 #ifdef __cplusplus
 }
 #endif