diff --git a/autogtp/Management.cpp b/autogtp/Management.cpp
index 069b0bfce..d6f55ec3b 100644
--- a/autogtp/Management.cpp
+++ b/autogtp/Management.cpp
@@ -71,8 +71,16 @@ void Management::runTuningProcess(const QString &tuneCmdLine) {
     tuneProcess.waitForStarted(-1);
     while (tuneProcess.state() == QProcess::Running) {
         tuneProcess.waitForReadyRead(1000);
+        QByteArray text = tuneProcess.readAllStandardOutput();
+        int version_start = text.indexOf("Leela Zero ") + 11;
+        if (version_start > 10) {
+            int version_end = text.indexOf(" ", version_start);
+            m_leelaversion = QString(text.mid(version_start, version_end - version_start));
+        }
+        QTextStream(stdout) << text;
         QTextStream(stdout) << tuneProcess.readAllStandardError();
     }
+    QTextStream(stdout) << "Found Leela Version : " << m_leelaversion << endl;
     tuneProcess.waitForFinished(-1);
 }
 
@@ -316,6 +324,8 @@ Order Management::getWorkInternal(bool tuning) {
         prog_cmdline.append("0");
     } else {
         prog_cmdline.append(QString::number(AUTOGTP_VERSION));
+        if (!m_leelaversion.isEmpty())
+            prog_cmdline.append("/"+m_leelaversion);
     }
     QProcess curl;
     curl.start(prog_cmdline);
diff --git a/autogtp/Management.h b/autogtp/Management.h
index cb6715c13..4cdba6a6b 100644
--- a/autogtp/Management.h
+++ b/autogtp/Management.h
@@ -82,6 +82,7 @@ public slots:
     int m_threadsLeft;
     bool m_delNetworks;
     QLockFile *m_lockFile;
+    QString m_leelaversion;
 
     Order getWorkInternal(bool tuning);
     Order getWork(bool tuning = false);
diff --git a/msvc/VS2015/leela-zero.vcxproj b/msvc/VS2015/leela-zero.vcxproj
index dc31e6f6e..885d1efab 100644
--- a/msvc/VS2015/leela-zero.vcxproj
+++ b/msvc/VS2015/leela-zero.vcxproj
@@ -1,4 +1,4 @@
-﻿<?xml version="1.0" encoding="utf-8"?>
+<?xml version="1.0" encoding="utf-8"?>
 <Project DefaultTargets="Build" ToolsVersion="14.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
   <ItemGroup Label="ProjectConfigurations">
     <ProjectConfiguration Include="Debug|x64">
@@ -93,6 +93,7 @@
     <ClCompile Include="..\..\src\Training.cpp" />
     <ClCompile Include="..\..\src\Tuner.cpp" />
     <ClCompile Include="..\..\src\UCTNode.cpp" />
+    <ClCompile Include="..\..\src\UCTNodePointer.cpp" />
     <ClCompile Include="..\..\src\UCTNodeRoot.cpp" />
     <ClCompile Include="..\..\src\UCTSearch.cpp" />
     <ClCompile Include="..\..\src\Utils.cpp" />
@@ -121,6 +122,7 @@
     <ClInclude Include="..\..\src\Training.h" />
     <ClInclude Include="..\..\src\Tuner.h" />
     <ClInclude Include="..\..\src\UCTNode.h" />
+    <ClInclude Include="..\..\src\UCTNodePointer.h" />
     <ClInclude Include="..\..\src\UCTSearch.h" />
     <ClInclude Include="..\..\src\Utils.h" />
     <ClInclude Include="..\..\src\Zobrist.h" />
diff --git a/src/GTP.cpp b/src/GTP.cpp
index efb2a402f..e2f6630b4 100644
--- a/src/GTP.cpp
+++ b/src/GTP.cpp
@@ -65,6 +65,10 @@ std::vector<int> cfg_gpus;
 bool cfg_sgemm_exhaustive;
 bool cfg_tune_only;
 #endif
+int cfg_puct_factor;
+float cfg_backup_pct;
+int cfg_backup_type;
+bool cfg_pseudo_backup;
 float cfg_puct;
 float cfg_softmax_temp;
 float cfg_fpu_reduction;
@@ -95,6 +99,10 @@ void GTP::setup_default_parameters() {
     cfg_sgemm_exhaustive = false;
     cfg_tune_only = false;
 #endif
+    cfg_puct_factor = 1;
+    cfg_backup_pct = 50.0;
+    cfg_backup_type = 3;
+    cfg_pseudo_backup = true;
     cfg_puct = 0.8f;
     cfg_softmax_temp = 1.0f;
     cfg_fpu_reduction = 0.25f;
diff --git a/src/GTP.h b/src/GTP.h
index 2e5c00d1e..7684da094 100644
--- a/src/GTP.h
+++ b/src/GTP.h
@@ -48,6 +48,8 @@ extern std::vector<int> cfg_gpus;
 extern bool cfg_sgemm_exhaustive;
 extern bool cfg_tune_only;
 #endif
+extern int cfg_puct_factor;
+extern float cfg_backup_pct;
 extern float cfg_puct;
 extern float cfg_softmax_temp;
 extern float cfg_fpu_reduction;
diff --git a/src/Leela.cpp b/src/Leela.cpp
index 9a17b160e..38d3850a6 100644
--- a/src/Leela.cpp
+++ b/src/Leela.cpp
@@ -77,6 +77,22 @@ static void parse_commandline(int argc, char *argv[]) {
         ("noponder", "Disable thinking on opponent's time.")
         ("benchmark", "Test network and exit. Default args:\n-v3200 --noponder "
                       "-m0 -t1 -s1.")
+        
+        ("puct-factor", po::value<int>()->default_value(cfg_puct_factor),
+                      "0: original (=1), 1: linear (=winrate*2, default), 2: quadratic (=winrate(1-winrate)*4).")
+        ("backup-pct", po::value<float>()->default_value(cfg_backup_pct),
+                      "Update (backup) Q-values (winrates) of white's moves that are ancestors of the new leaf node "
+                      "with a probability determined by winrate at root node and this parameter.\n"
+                      "At most 100, defaulted to 50.\n"
+                      "The lower the value, the weaker you assume white to be.")
+        ("backup-type", po::value<int>()->default_value(cfg_backup_type),
+                      "0: throw a dice to go up a generation,\n"
+                      "1: always update, 2: never update,\n"
+                      "3: throw dice once for each simulation (default),\n"
+                      "4: throw dice once for each ancestor,\n"
+                      "5: update the foremost ancestors only.")
+        ("pseudo-backup", po::value<std::string>()->default_value("on"),
+                      "[on|off] Whether to increment visit count when value is not actually updated.")
         ;
 #ifdef USE_OPENCL
     po::options_description gpu_desc("GPU options");
@@ -102,13 +118,13 @@ static void parse_commandline(int argc, char *argv[]) {
             po::value<float>()->default_value(cfg_random_temp),
             "Temperature to use for random move selection.")
         ;
-#ifdef USE_TUNER
     po::options_description tuner_desc("Tuning options");
     tuner_desc.add_options()
         ("puct", po::value<float>())
         ("softmax_temp", po::value<float>())
         ("fpu_reduction", po::value<float>())
         ;
+#ifdef USE_TUNER
 #endif
     // These won't be shown, we use them to catch incorrect usage of the
     // command line.
@@ -167,7 +183,26 @@ static void parse_commandline(int argc, char *argv[]) {
         cfg_quiet = true;  // Set this early to avoid unnecessary output.
     }
 
-#ifdef USE_TUNER
+    if (vm.count("puct-factor")) {
+	    cfg_puct_factor = vm["puct-factor"].as<int>();
+    }
+    if (vm.count("backup-pct")) {
+	    cfg_backup_pct = vm["backup-pct"].as<float>();
+	    if (cfg_backup_pct > 100.0) {
+		    cfg_backup_pct = 50.0;
+		    myprintf("Invalid backup percentage. Falling back to 50.0.\n");
+	    }
+    }
+    if (vm.count("backup-type")) {
+	    cfg_backup_type = vm["backup-type"].as<int>();
+    }
+    if (vm.count("pseudo-backup")) {
+        auto pb = vm["pseudo-backup"].as<std::string>();
+        if (pb == "off") {
+            cfg_pseudo_backup = false;
+        }
+    }
+	
     if (vm.count("puct")) {
         cfg_puct = vm["puct"].as<float>();
     }
@@ -177,6 +212,7 @@ static void parse_commandline(int argc, char *argv[]) {
     if (vm.count("fpu_reduction")) {
         cfg_fpu_reduction = vm["fpu_reduction"].as<float>();
     }
+#ifdef USE_TUNER
 #endif
 
     if (vm.count("logfile")) {
diff --git a/src/Network.cpp b/src/Network.cpp
index d629afe06..6d4ef455e 100644
--- a/src/Network.cpp
+++ b/src/Network.cpp
@@ -970,8 +970,8 @@ Network::Netresult Network::get_scored_moves_internal(
     const auto winrate_out =
         innerproduct<256, 1, false>(winrate_data, ip2_val_w, ip2_val_b);
 
-    // Sigmoid
-    const auto winrate_sig = (1.0f + std::tanh(winrate_out[0])) / 2.0f;
+    // Sigmoid: equals (1 + tanh(x)) / 2, i.e. tanh rescaled into (0, 1).
+    const auto winrate_sig = 1.0f / (1.0f + std::exp(-2.0f * winrate_out[0]));
 
     Netresult result;
 
diff --git a/src/SMP.cpp b/src/SMP.cpp
index d46756c18..7e387c0ad 100644
--- a/src/SMP.cpp
+++ b/src/SMP.cpp
@@ -32,7 +32,12 @@ SMP::Lock::Lock(Mutex & m) {
 
 void SMP::Lock::lock() {
     assert(!m_owns_lock);
-    while (m_mutex->m_lock.exchange(true, std::memory_order_acquire) == true);
+    // Test-and-test-and-set spin: try the atomic exchange first (fastest in
+    // almost all cases); on failure, wait on a relaxed load until the flag
+    // reads false, which reduces memory contention, then retry the exchange.
+    while (m_mutex->m_lock.exchange(true, std::memory_order_acquire)) {
+      while (m_mutex->m_lock.load(std::memory_order_relaxed));
+    }
     m_owns_lock = true;
 }
 
diff --git a/src/UCTNode.cpp b/src/UCTNode.cpp
index 6d35b95f6..80023cab5 100644
--- a/src/UCTNode.cpp
+++ b/src/UCTNode.cpp
@@ -87,6 +87,7 @@ bool UCTNode::create_children(std::atomic<int>& nodecount,
     if (state.board.white_to_move()) {
         m_net_eval = 1.0f - m_net_eval;
     }
+    update(m_net_eval);
     eval = m_net_eval;
 
     std::vector<Network::ScoreVertexPair> nodelist;
@@ -255,11 +256,13 @@ UCTNode* UCTNode::uct_select_child(int color, bool is_root) {
     // Count parentvisits manually to avoid issues with transpositions.
     auto total_visited_policy = 0.0f;
     auto parentvisits = size_t{0};
+    auto parent_total_blackeval = get_pure_eval(FastBoard::BLACK);
     for (const auto& child : m_children) {
         if (child.valid()) {
             parentvisits += child.get_visits();
             if (child.get_visits() > 0) {
                 total_visited_policy += child.get_score();
+                parent_total_blackeval += child.get_blackevals();
             }
         }
     }
@@ -270,9 +273,18 @@ UCTNode* UCTNode::uct_select_child(int color, bool is_root) {
     // Do not do this if we have introduced noise at this node exactly
     // to explore more.
     
-    auto pure_eval = get_pure_eval(color); 
+    parentvisits++;
+    auto pure_eval = parent_total_blackeval / float(parentvisits);
+    if (color == FastBoard::WHITE) {
+        pure_eval = 1.0 - pure_eval;
+    }
     if (!is_root || !cfg_noise) {
-        fpu_reduction = cfg_fpu_reduction * std::sqrt(total_visited_policy) * pure_eval / 0.5;
+        fpu_reduction = cfg_fpu_reduction * std::sqrt(total_visited_policy);
+        if (cfg_puct_factor == 2) {
+            fpu_reduction *= pure_eval * (1 - pure_eval) / 0.25;
+        } else if (cfg_puct_factor == 1) {
+            fpu_reduction *= pure_eval / 0.5;
+        }
     }
     // Estimated eval for unknown nodes = current parent winrate - reduction
     auto fpu_eval = pure_eval - fpu_reduction;
@@ -291,7 +303,12 @@ UCTNode* UCTNode::uct_select_child(int color, bool is_root) {
         }
         auto psa = child.get_score();
         auto denom = 1.0 + child.get_visits();
-        auto puct = cfg_puct * psa * (numerator / denom) * pure_eval / 0.5;
+        auto puct = cfg_puct * psa * (numerator / denom);
+        if (cfg_puct_factor == 2) {
+            puct *= pure_eval * (1 - pure_eval) / 0.25;
+        } else if (cfg_puct_factor == 1) {
+            puct *= pure_eval / 0.5;
+        }
         auto value = winrate + puct;
         assert(value > std::numeric_limits<double>::lowest());
 
diff --git a/src/UCTNodePointer.cpp b/src/UCTNodePointer.cpp
index bb51f83b0..b026d95fa 100644
--- a/src/UCTNodePointer.cpp
+++ b/src/UCTNodePointer.cpp
@@ -90,6 +90,11 @@ float UCTNodePointer::get_eval(int tomove) const {
     return read_ptr()->get_eval(tomove);
 }
 
+double UCTNodePointer::get_blackevals() const {  // forwards to the node's get_blackevals()
+    assert(is_inflated());  // caller must hold an inflated pointer: read_ptr() is dereferenced
+    return read_ptr()->get_blackevals();
+}
+
 int UCTNodePointer::get_move() const {
     if (is_inflated()) return read_ptr()->get_move();
     return read_vertex();
diff --git a/src/UCTNodePointer.h b/src/UCTNodePointer.h
index d40bbc2ff..19d7dcfca 100644
--- a/src/UCTNodePointer.h
+++ b/src/UCTNodePointer.h
@@ -110,6 +110,7 @@ class UCTNodePointer {
     int get_move() const;
     // this can only be called if it is an inflated pointer
     float get_eval(int tomove) const;
+    double get_blackevals() const;
 };
 
 #endif
diff --git a/src/UCTSearch.cpp b/src/UCTSearch.cpp
index 63dfe254b..a23e77c12 100644
--- a/src/UCTSearch.cpp
+++ b/src/UCTSearch.cpp
@@ -24,6 +24,7 @@
 #include <cstddef>
 #include <limits>
 #include <memory>
+#include <random>
 #include <type_traits>
 
 #include "FastBoard.h"
@@ -31,6 +32,7 @@
 #include "FullBoard.h"
 #include "GTP.h"
 #include "GameState.h"
+#include "Random.h"
 #include "TimeControl.h"
 #include "Timing.h"
 #include "Training.h"
@@ -157,8 +159,19 @@ float UCTSearch::get_min_psa_ratio() const {
     return 0.0f;
 }
 
+float calc_backup_pct(float blackeval) {
+    // Black's eval not above 0.5: use the full 100% backup percentage.
+    if (!(blackeval > 0.5)) {
+        return 100.0;
+    }
+    // Otherwise blend: 100 at eval 0.5, falling to cfg_backup_pct as eval -> 1.
+    return (100.0 - cfg_backup_pct) * 4.0 * blackeval * (1 - blackeval) + cfg_backup_pct;
+}
+
 SearchResult UCTSearch::play_simulation(GameState & currstate,
-                                        UCTNode* const node) {
+                                        UCTNode* const node,
+                                        float backup_pct,
+				        int depth) {
     const auto color = currstate.get_to_move();
     auto result = SearchResult{};
 
@@ -168,6 +181,7 @@ SearchResult UCTSearch::play_simulation(GameState & currstate,
         if (currstate.get_passes() >= 2) {
             auto score = currstate.final_score();
             result = SearchResult::from_score(score);
+            node->update(result.eval());
         } else if (m_nodes < MAX_TREE_SIZE) {
             float eval;
             const auto had_children = node->has_children();
@@ -179,6 +193,9 @@ SearchResult UCTSearch::play_simulation(GameState & currstate,
             }
         }
     }
+	if (result.valid()) {
+		result.remaining_backups = - depth * (1.0 - backup_pct / 100.0);
+	}
 
     if (node->has_children() && !result.valid()) {
         auto next = node->uct_select_child(color, node == m_root.get());
@@ -188,13 +205,48 @@ SearchResult UCTSearch::play_simulation(GameState & currstate,
         if (move != FastBoard::PASS && currstate.superko()) {
             next->invalidate();
         } else {
-            result = play_simulation(currstate, next);
+            if (backup_pct > 100.0) {
+                backup_pct = calc_backup_pct(node->get_pure_eval(FastBoard::BLACK));
+            }
+            result = play_simulation(currstate, next, backup_pct, depth + 1);
+            result.remaining_backups++;
+            if (result.valid()) {
+                if (color == FastBoard::BLACK || result.backup_type == 1 || node->get_visits() == 0) {
+                    node->update(result.eval());
+                }
+                else if (result.backup_type == 2) {
+                    if (cfg_pseudo_backup) {
+                        node->update(node->get_pure_eval(FastBoard::BLACK));
+                    }
+                }
+                else if (result.backup_type == 5) {
+                    if (std::uniform_real_distribution<double>{ 0.0, 1.0 }(Random::get_Rng()) <= result.remaining_backups) {
+                        node->update(result.eval());
+                    }
+                    else {
+                        if (cfg_pseudo_backup) {
+                            node->update(node->get_pure_eval(FastBoard::BLACK));
+                        }
+                    }
+                }
+                else if (std::uniform_real_distribution<double>{ 0.0, 100.0 }(Random::get_Rng()) <= backup_pct) {
+                    node->update(result.eval());
+                    if (result.backup_type == 3) {
+                        result.backup_type = 1;
+                    }
+                }
+                else {
+                    if (cfg_pseudo_backup) {
+                        node->update(node->get_pure_eval(FastBoard::BLACK));
+                    }
+                    if (result.backup_type == 0 || result.backup_type == 3) {
+                        result.backup_type = 2;
+                    }
+                }
+            }
         }
     }
 
-    if (result.valid()) {
-        node->update(result.eval());
-    }
     node->virtual_loss_undo();
 
     return result;
@@ -586,7 +638,7 @@ bool UCTSearch::stop_thinking(int elapsed_centis, int time_for_move) const {
 void UCTWorker::operator()() {
     do {
         auto currstate = std::make_unique<GameState>(m_rootstate);
-        auto result = m_search->play_simulation(*currstate, m_root);
+        auto result = m_search->play_simulation(*currstate, m_root, 200.0, 0);
         if (result.valid()) {
             m_search->increment_playouts();
         }
@@ -631,7 +683,7 @@ int UCTSearch::think(int color, passflag_t passflag) {
     do {
         auto currstate = std::make_unique<GameState>(m_rootstate);
 
-        auto result = play_simulation(*currstate, m_root.get());
+        auto result = play_simulation(*currstate, m_root.get(), 200.0, 0);
         if (result.valid()) {
             increment_playouts();
         }
@@ -707,7 +759,7 @@ void UCTSearch::ponder() {
     auto last_output = 0;
     do {
         auto currstate = std::make_unique<GameState>(m_rootstate);
-        auto result = play_simulation(*currstate, m_root.get());
+        auto result = play_simulation(*currstate, m_root.get(), 200.0, 0);
         if (result.valid()) {
             increment_playouts();
         }
diff --git a/src/UCTSearch.h b/src/UCTSearch.h
index f2425039b..84d0721db 100644
--- a/src/UCTSearch.h
+++ b/src/UCTSearch.h
@@ -32,9 +32,13 @@
 #include "GameState.h"
 #include "UCTNode.h"
 
+extern int cfg_backup_type;
+extern bool cfg_pseudo_backup;
 
 class SearchResult {
 public:
+    int backup_type;
+    float remaining_backups{0.0};
     SearchResult() = default;
     bool valid() const { return m_valid;  }
     float eval() const { return m_eval;  }
@@ -52,7 +56,7 @@ class SearchResult {
     }
 private:
     explicit SearchResult(float eval)
-        : m_valid(true), m_eval(eval) {}
+        : backup_type(cfg_backup_type), m_valid(true), m_eval(eval) {} // in declaration order
     bool m_valid{false};
     float m_eval{0.0f};
 };
@@ -98,7 +102,7 @@ class UCTSearch {
     void ponder();
     bool is_running() const;
     void increment_playouts();
-    SearchResult play_simulation(GameState& currstate, UCTNode* const node);
+    SearchResult play_simulation(GameState& currstate, UCTNode* const node, float backup_pct, int depth);
 
 private:
     float get_min_psa_ratio() const;
diff --git a/training/tf/net_to_model.py b/training/tf/net_to_model.py
index ebefd4303..80bd08a27 100755
--- a/training/tf/net_to_model.py
+++ b/training/tf/net_to_model.py
@@ -24,7 +24,7 @@
     print("Blocks", blocks)
 
 tfprocess = TFProcess()
-tfprocess.init(batch_size=1)
+tfprocess.init(batch_size=1, gpus_num=1)
 if tfprocess.RESIDUAL_BLOCKS != blocks:
     raise ValueError("Number of blocks in tensorflow model doesn't match "\
             "number of blocks in input network")
diff --git a/training/tf/tfprocess.py b/training/tf/tfprocess.py
index 0dba54c7a..7c3a1f47f 100644
--- a/training/tf/tfprocess.py
+++ b/training/tf/tfprocess.py
@@ -132,13 +132,13 @@ def __init__(self):
         self.swa_recalc_bn = True
 
         gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.8)
-        config = tf.ConfigProto(gpu_options=gpu_options)
+        config = tf.ConfigProto(gpu_options=gpu_options, allow_soft_placement=True)
         self.session = tf.Session(config=config)
 
         self.training = tf.placeholder(tf.bool)
         self.global_step = tf.Variable(0, name='global_step', trainable=False)
 
-    def init(self, batch_size, macrobatch=1, logbase='leelalogs'):
+    def init(self, batch_size, macrobatch=1, gpus_num=None, logbase='leelalogs'):
         self.batch_size = batch_size
         self.macrobatch = macrobatch
         self.logbase = logbase
@@ -159,13 +159,15 @@ def init(self, batch_size, macrobatch=1, logbase='leelalogs'):
         probs = tf.reshape(probs, (batch_size, 19*19 + 1))
         winner = tf.reshape(winner, (batch_size, 1))
 
-        self.init_net(planes, probs, winner)
+        if gpus_num is None:
+            gpus_num = self.gpus_num
+        self.init_net(planes, probs, winner, gpus_num)
 
-    def init_net(self, planes, probs, winner):
+    def init_net(self, planes, probs, winner, gpus_num):
         self.y_ = probs   # (tf.float32, [None, 362])
-        self.sx = tf.split(planes, self.gpus_num)
-        self.sy_ = tf.split(probs, self.gpus_num)
-        self.sz_ = tf.split(winner, self.gpus_num)
+        self.sx = tf.split(planes, gpus_num)
+        self.sy_ = tf.split(probs, gpus_num)
+        self.sz_ = tf.split(winner, gpus_num)
         self.batch_norm_count = 0
         self.reuse_var = None
 
@@ -182,7 +184,7 @@ def init_net(self, planes, probs, winner):
         tower_reg_term = []
         tower_y_conv = []
         with tf.variable_scope(tf.get_variable_scope()):
-            for i in range(self.gpus_num):
+            for i in range(gpus_num):
                 with tf.device("/gpu:%d" % i):
                     with tf.name_scope("tower_%d" % i):
                         loss, policy_loss, mse_loss, reg_term, y_conv = self.tower_loss(