microsoft · jameslamb · Oct 13, 2024 · Jul 24, 2024 · Jul 26, 2024 · Jul 26, 2024
@@ -77,7 +77,7 @@ class CUDATree : public Tree {
                             const data_size_t* used_data_indices,
                             data_size_t num_data, double* score) const override;
 
-  inline void AsConstantTree(double val) override;
+  inline void AsConstantTree(double val, int count = 0) override;  // same default as Tree::AsConstantTree
 
   const int* cuda_leaf_parent() const { return cuda_leaf_parent_; }
 

@@ -228,13 +228,14 @@ class Tree {
     shrinkage_ = 1.0f;
   }
 
-  virtual inline void AsConstantTree(double val) {
+  virtual inline void AsConstantTree(double val, int count = 0) {  // count is stored as leaf_count_[0]
     num_leaves_ = 1;
     shrinkage_ = 1.0f;
     leaf_value_[0] = val;
     if (is_linear_) {
       leaf_const_[0] = val;
     }
+    leaf_count_[0] = count;  // the single remaining leaf gets the caller-supplied count
   }
 
   /*! \brief Serialize this object to string*/
@@ -563,7 +564,7 @@ inline void Tree::Split(int leaf, int feature, int real_feature,
   leaf_parent_[leaf] = new_node_idx;
   leaf_parent_[num_leaves_] = new_node_idx;
   // save current leaf value to internal node before change
-  internal_weight_[new_node_idx] = leaf_weight_[leaf];
+  internal_weight_[new_node_idx] = left_weight + right_weight;
   internal_value_[new_node_idx] = leaf_value_[leaf];
   internal_count_[new_node_idx] = left_cnt + right_cnt;
   leaf_value_[leaf] = std::isnan(left_value) ? 0.0f : left_value;

@@ -3906,7 +3906,7 @@ def _get_split_feature(
                 return feature_name
 
             def _is_single_node_tree(tree: Dict[str, Any]) -> bool:
-                return set(tree.keys()) == {"leaf_value"}
+                return set(tree.keys()) == {"leaf_value", "leaf_count"}  # one-leaf trees now carry leaf_count too
 
             # Create the node record, and populate universal data members
             node: Dict[str, Union[int, str, None]] = OrderedDict()

@@ -427,7 +427,10 @@ bool GBDT::TrainOneIter(const score_t* gradients, const score_t* hessians) {
             score_updater->AddScore(init_scores[cur_tree_id], cur_tree_id);
           }
         }
-        new_tree->AsConstantTree(init_scores[cur_tree_id]);
+        new_tree->AsConstantTree(init_scores[cur_tree_id], num_data_);
+      } else {
+        // extend init_scores with zeros
+        new_tree->AsConstantTree(0, num_data_);
       }
     }
     // add model

@@ -168,7 +168,7 @@ class RF : public GBDT {
               output = init_scores_[cur_tree_id];
             }
           }
-          new_tree->AsConstantTree(output);
+          new_tree->AsConstantTree(output, num_data_);
           MultiplyScore(cur_tree_id, (iter_ + num_init_iteration_));
           UpdateScore(new_tree.get(), cur_tree_id);
           MultiplyScore(cur_tree_id, 1.0 / (iter_ + num_init_iteration_ + 1));

@@ -330,9 +330,10 @@ void CUDATree::SyncLeafOutputFromCUDAToHost() {
   CopyFromCUDADeviceToHost<double>(leaf_value_.data(), cuda_leaf_value_, leaf_value_.size(), __FILE__, __LINE__);
 }
 
-void CUDATree::AsConstantTree(double val) {
-  Tree::AsConstantTree(val);
+void CUDATree::AsConstantTree(double val, int count) {
+  Tree::AsConstantTree(val, count);  // base class updates leaf_value_[0] / leaf_count_[0]
   CopyFromHostToCUDADevice<double>(cuda_leaf_value_, &val, 1, __FILE__, __LINE__);
+  CopyFromHostToCUDADevice<int>(cuda_leaf_count_, &count, 1, __FILE__, __LINE__);  // same count to device
 }
 
 }  // namespace LightGBM

@@ -94,7 +94,7 @@ __global__ void SplitKernel(  // split information
     split_gain[new_node_index] = static_cast<float>(cuda_split_info->gain);
   } else if (thread_index == 4) {
     // save current leaf value to internal node before change
-    internal_weight[new_node_index] = leaf_weight[leaf_index];
+    internal_weight[new_node_index] = cuda_split_info->left_sum_hessians + cuda_split_info->right_sum_hessians;
     leaf_weight[leaf_index] = cuda_split_info->left_sum_hessians;
   } else if (thread_index == 5) {
     internal_value[new_node_index] = leaf_value[leaf_index];
@@ -210,7 +210,7 @@ __global__ void SplitCategoricalKernel(  // split information
     split_gain[new_node_index] = static_cast<float>(cuda_split_info->gain);
   } else if (thread_index == 4) {
     // save current leaf value to internal node before change
-    internal_weight[new_node_index] = leaf_weight[leaf_index];
+    internal_weight[new_node_index] = cuda_split_info->left_sum_hessians + cuda_split_info->right_sum_hessians;
     leaf_weight[leaf_index] = cuda_split_info->left_sum_hessians;
   } else if (thread_index == 5) {
     internal_value[new_node_index] = leaf_value[leaf_index];

@@ -416,12 +416,15 @@ std::string Tree::ToJSON() const {
   str_buf << "\"num_cat\":" << num_cat_ << "," << '\n';
   str_buf << "\"shrinkage\":" << shrinkage_ << "," << '\n';
   if (num_leaves_ == 1) {
+    str_buf << "\"tree_structure\":{";
+    str_buf << "\"leaf_value\":" << leaf_value_[0] << ", " << '\n';
     if (is_linear_) {
-      str_buf << "\"tree_structure\":{" << "\"leaf_value\":" << leaf_value_[0] << ", " << "\n";
-      str_buf << LinearModelToJSON(0) << "}" << "\n";
+      str_buf << "\"leaf_count\":" << leaf_count_[0] << ", " << '\n';
+      str_buf << LinearModelToJSON(0);
     } else {
-      str_buf << "\"tree_structure\":{" << "\"leaf_value\":" << leaf_value_[0] << "}" << '\n';
+      str_buf << "\"leaf_count\":" << leaf_count_[0];
     }
+    str_buf << "}" << '\n';
   } else {
     str_buf << "\"tree_structure\":" << NodeToJSON(0) << '\n';
   }
@@ -731,6 +734,12 @@ Tree::Tree(const char* str, size_t* used_len) {
     is_linear_ = false;
   }
 
+  if (key_vals.count("leaf_count")) {
+    leaf_count_ = CommonC::StringToArrayFast<int>(key_vals["leaf_count"], num_leaves_);
+  } else {
+    leaf_count_.resize(num_leaves_);
+  }
+
   #ifdef USE_CUDA
   is_cuda_tree_ = false;
   #endif  // USE_CUDA
@@ -793,12 +802,6 @@ Tree::Tree(const char* str, size_t* used_len) {
     leaf_weight_.resize(num_leaves_);
   }
 
-  if (key_vals.count("leaf_count")) {
-    leaf_count_ = CommonC::StringToArrayFast<int>(key_vals["leaf_count"], num_leaves_);
-  } else {
-    leaf_count_.resize(num_leaves_);
-  }
-
   if (key_vals.count("decision_type")) {
     decision_type_ = CommonC::StringToArrayFast<int8_t>(key_vals["decision_type"], num_leaves_ - 1);
   } else {

@@ -38,12 +38,14 @@ void CUDALeafSplits::InitValues(
   const double lambda_l1, const double lambda_l2,
   const score_t* cuda_gradients, const score_t* cuda_hessians,
   const data_size_t* cuda_bagging_data_indices, const data_size_t* cuda_data_indices_in_leaf,
-  const data_size_t num_used_indices, hist_t* cuda_hist_in_leaf, double* root_sum_hessians) {
+  const data_size_t num_used_indices, hist_t* cuda_hist_in_leaf,
+  double* root_sum_gradients, double* root_sum_hessians) {
   cuda_gradients_ = cuda_gradients;
   cuda_hessians_ = cuda_hessians;
   cuda_sum_of_gradients_buffer_.SetValue(0);
   cuda_sum_of_hessians_buffer_.SetValue(0);
   LaunchInitValuesKernal(lambda_l1, lambda_l2, cuda_bagging_data_indices, cuda_data_indices_in_leaf, num_used_indices, cuda_hist_in_leaf);
+  CopyFromCUDADeviceToHost<double>(root_sum_gradients, cuda_sum_of_gradients_buffer_.RawData(), 1, __FILE__, __LINE__);
   CopyFromCUDADeviceToHost<double>(root_sum_hessians, cuda_sum_of_hessians_buffer_.RawData(), 1, __FILE__, __LINE__);
   SynchronizeCUDADevice(__FILE__, __LINE__);
 }
@@ -53,11 +55,12 @@ void CUDALeafSplits::InitValues(
   const int16_t* cuda_gradients_and_hessians,
   const data_size_t* cuda_bagging_data_indices,
   const data_size_t* cuda_data_indices_in_leaf, const data_size_t num_used_indices,
-  hist_t* cuda_hist_in_leaf, double* root_sum_hessians,
+  hist_t* cuda_hist_in_leaf, double* root_sum_gradients, double* root_sum_hessians,
   const score_t* grad_scale, const score_t* hess_scale) {
   cuda_gradients_ = reinterpret_cast<const score_t*>(cuda_gradients_and_hessians);
   cuda_hessians_ = nullptr;
   LaunchInitValuesKernal(lambda_l1, lambda_l2, cuda_bagging_data_indices, cuda_data_indices_in_leaf, num_used_indices, cuda_hist_in_leaf, grad_scale, hess_scale);
+  CopyFromCUDADeviceToHost<double>(root_sum_gradients, cuda_sum_of_gradients_buffer_.RawData(), 1, __FILE__, __LINE__);
   CopyFromCUDADeviceToHost<double>(root_sum_hessians, cuda_sum_of_hessians_buffer_.RawData(), 1, __FILE__, __LINE__);
   SynchronizeCUDADevice(__FILE__, __LINE__);
 }

@@ -44,14 +44,14 @@ class CUDALeafSplits {
     const score_t* cuda_gradients, const score_t* cuda_hessians,
     const data_size_t* cuda_bagging_data_indices,
     const data_size_t* cuda_data_indices_in_leaf, const data_size_t num_used_indices,
-    hist_t* cuda_hist_in_leaf, double* root_sum_hessians);
+    hist_t* cuda_hist_in_leaf, double* root_sum_gradients, double* root_sum_hessians);
 
   void InitValues(
     const double lambda_l1, const double lambda_l2,
     const int16_t* cuda_gradients_and_hessians,
     const data_size_t* cuda_bagging_data_indices,
     const data_size_t* cuda_data_indices_in_leaf, const data_size_t num_used_indices,
-    hist_t* cuda_hist_in_leaf, double* root_sum_hessians,
+    hist_t* cuda_hist_in_leaf, double* root_sum_gradients, double* root_sum_hessians,
     const score_t* grad_scale, const score_t* hess_scale);
 
   void InitValues();

@@ -66,6 +66,7 @@ void CUDASingleGPUTreeLearner::Init(const Dataset* train_data, bool is_constant_
   leaf_best_split_default_left_.resize(config_->num_leaves, 0);
   leaf_num_data_.resize(config_->num_leaves, 0);
   leaf_data_start_.resize(config_->num_leaves, 0);
+  leaf_sum_gradients_.resize(config_->num_leaves, 0.0f);
   leaf_sum_hessians_.resize(config_->num_leaves, 0.0f);
 
   if (!boosting_on_cuda_) {
@@ -122,6 +123,7 @@ void CUDASingleGPUTreeLearner::BeforeTrain() {
       cuda_data_partition_->cuda_data_indices(),
       root_num_data,
       cuda_histogram_constructor_->cuda_hist_pointer(),
+      &leaf_sum_gradients_[0],
       &leaf_sum_hessians_[0],
       cuda_gradient_discretizer_->grad_scale_ptr(),
       cuda_gradient_discretizer_->hess_scale_ptr());
@@ -137,6 +139,7 @@ void CUDASingleGPUTreeLearner::BeforeTrain() {
       cuda_data_partition_->cuda_data_indices(),
       root_num_data,
       cuda_histogram_constructor_->cuda_hist_pointer(),
+      &leaf_sum_gradients_[0],
       &leaf_sum_hessians_[0]);
   }
   leaf_num_data_[0] = root_num_data;
@@ -162,6 +165,12 @@ Tree* CUDASingleGPUTreeLearner::Train(const score_t* gradients,
   const bool track_branch_features = !(config_->interaction_constraints_vector.empty());
   std::unique_ptr<CUDATree> tree(new CUDATree(config_->num_leaves, track_branch_features,
     config_->linear_tree, config_->gpu_device_id, has_categorical_feature_));
+  // set the root value by hand, as it is not handled by splits
+  tree->SetLeafOutput(0, CUDALeafSplits::CalculateSplittedLeafOutput<true, false>(
+    leaf_sum_gradients_[smaller_leaf_index_], leaf_sum_hessians_[smaller_leaf_index_],
+    config_->lambda_l1, config_->lambda_l2,  config_->path_smooth,
+    static_cast<data_size_t>(num_data_), 0));
+  tree->SyncLeafOutputFromHostToCUDA();
   for (int i = 0; i < config_->num_leaves - 1; ++i) {
     global_timer.Start("CUDASingleGPUTreeLearner::ConstructHistogramForLeaf");
     const data_size_t num_data_in_smaller_leaf = leaf_num_data_[smaller_leaf_index_];
@@ -293,8 +302,6 @@ Tree* CUDASingleGPUTreeLearner::Train(const score_t* gradients,
                                        best_split_info);
     }
 
-    double sum_left_gradients = 0.0f;
-    double sum_right_gradients = 0.0f;
     cuda_data_partition_->Split(best_split_info,
                                 best_leaf_index_,
                                 right_leaf_index,
@@ -313,10 +320,10 @@ Tree* CUDASingleGPUTreeLearner::Train(const score_t* gradients,
                                 &leaf_data_start_[right_leaf_index],
                                 &leaf_sum_hessians_[best_leaf_index_],
                                 &leaf_sum_hessians_[right_leaf_index],
-                                &sum_left_gradients,
-                                &sum_right_gradients);
+                                &leaf_sum_gradients_[best_leaf_index_],
+                                &leaf_sum_gradients_[right_leaf_index]);
     #ifdef DEBUG
-    CheckSplitValid(best_leaf_index_, right_leaf_index, sum_left_gradients, sum_right_gradients);
+    CheckSplitValid(best_leaf_index_, right_leaf_index);
     #endif  // DEBUG
     smaller_leaf_index_ = (leaf_num_data_[best_leaf_index_] < leaf_num_data_[right_leaf_index] ? best_leaf_index_ : right_leaf_index);
     larger_leaf_index_ = (smaller_leaf_index_ == best_leaf_index_ ? right_leaf_index : best_leaf_index_);
@@ -374,6 +381,7 @@ void CUDASingleGPUTreeLearner::ResetConfig(const Config* config) {
     leaf_best_split_default_left_.resize(config_->num_leaves, 0);
     leaf_num_data_.resize(config_->num_leaves, 0);
     leaf_data_start_.resize(config_->num_leaves, 0);
+    leaf_sum_gradients_.resize(config_->num_leaves, 0.0f);
     leaf_sum_hessians_.resize(config_->num_leaves, 0.0f);
   }
   cuda_histogram_constructor_->ResetConfig(config);
@@ -562,9 +570,7 @@ void CUDASingleGPUTreeLearner::SelectFeatureByNode(const Tree* tree) {
 #ifdef DEBUG
 void CUDASingleGPUTreeLearner::CheckSplitValid(
   const int left_leaf,
-  const int right_leaf,
-  const double split_sum_left_gradients,
-  const double split_sum_right_gradients) {
+  const int right_leaf) {
   std::vector<data_size_t> left_data_indices(leaf_num_data_[left_leaf]);
   std::vector<data_size_t> right_data_indices(leaf_num_data_[right_leaf]);
   CopyFromCUDADeviceToHost<data_size_t>(left_data_indices.data(),
@@ -585,9 +591,9 @@ void CUDASingleGPUTreeLearner::CheckSplitValid(
     sum_right_gradients += host_gradients_[index];
     sum_right_hessians += host_hessians_[index];
   }
-  CHECK_LE(std::fabs(sum_left_gradients - split_sum_left_gradients), 1e-6f);
+  CHECK_LE(std::fabs(sum_left_gradients - leaf_sum_gradients_[left_leaf]), 1e-6f);
   CHECK_LE(std::fabs(sum_left_hessians - leaf_sum_hessians_[left_leaf]), 1e-6f);
-  CHECK_LE(std::fabs(sum_right_gradients - split_sum_right_gradients), 1e-6f);
+  CHECK_LE(std::fabs(sum_right_gradients - leaf_sum_gradients_[right_leaf]), 1e-6f);
   CHECK_LE(std::fabs(sum_right_hessians - leaf_sum_hessians_[right_leaf]), 1e-6f);
 }
 #endif  // DEBUG

@@ -71,8 +71,7 @@ class CUDASingleGPUTreeLearner: public SerialTreeLearner {
 
   #ifdef DEBUG
   void CheckSplitValid(
-    const int left_leaf, const int right_leaf,
-    const double sum_left_gradients, const double sum_right_gradients);
+    const int left_leaf, const int right_leaf);
   #endif  // DEBUG
 
   void RenewDiscretizedTreeLeaves(CUDATree* cuda_tree);
@@ -103,6 +102,7 @@ class CUDASingleGPUTreeLearner: public SerialTreeLearner {
   std::vector<uint8_t> leaf_best_split_default_left_;
   std::vector<data_size_t> leaf_num_data_;
   std::vector<data_size_t> leaf_data_start_;
+  std::vector<double> leaf_sum_gradients_;
   std::vector<double> leaf_sum_hessians_;
   int smaller_leaf_index_;
   int larger_leaf_index_;

@@ -201,6 +201,12 @@ Tree* SerialTreeLearner::Train(const score_t* gradients, const score_t *hessians
   auto tree_ptr = tree.get();
   constraints_->ShareTreePointer(tree_ptr);
 
+  // set the root value by hand, as it is not handled by splits
+  tree->SetLeafOutput(0, FeatureHistogram::CalculateSplittedLeafOutput<true, true, true, false>(
+    smaller_leaf_splits_->sum_gradients(), smaller_leaf_splits_->sum_hessians(),
+    config_->lambda_l1, config_->lambda_l2, config_->max_delta_step,
+    BasicConstraint(), config_->path_smooth, static_cast<data_size_t>(num_data_), 0));
+
   // root leaf
   int left_leaf = 0;
   int cur_depth = 1;