glotzerlab · npkamath · Jan 2, 2024 · Jan 2, 2024 · Jan 2, 2024 · Jan 2, 2024
diff --git a/.github/actions/install-nopython-dependencies/action.yml b/.github/actions/install-nopython-dependencies/action.yml
@@ -0,0 +1,21 @@
+name: Install Non-Python Dependencies
+description: Installs system packages needed to build dupin.
+# Composite action: each step is gated on runner.os, so only the step that
+# matches the current runner's operating system executes.
+runs:
+  using: "composite"
+  steps:
+    # -y keeps the install non-interactive even on runners whose apt is not
+    # preconfigured to assume "yes".
+    - name: install-dependencies-linux
+      if: runner.os == 'Linux'
+      shell: bash
+      run: |
+        sudo apt-get update
+        sudo apt-get install -y libtbb12 libtbb-dev libeigen3-dev ninja-build
+    - name: install-dependencies-macos
+      if: runner.os == 'macOS'
+      shell: bash
+      run: |
+        brew update
+        brew install tbb eigen ninja
+    # -y answers Chocolatey's confirmation prompt so the step cannot stall.
+    - name: install-dependencies-windows
+      if: runner.os == 'Windows'
+      shell: bash
+      run: choco install -y tbb eigen3 ninja
diff --git a/.github/workflows/publish-packages.yml b/.github/workflows/publish-packages.yml
@@ -34,10 +34,11 @@ jobs:
         -r requirements/requirements-test.txt \
         -r requirements/requirements-jit.txt \
         -r requirements/requirements-data.txt
-
     - name: Install pypa/build
       run:
         python -m pip install build
+    - name: Install System Packages
+      uses: ./.github/actions/install-nopython-dependencies
     - name: Build a binary wheel and a source tarball
       run:
         python -m build --outdir dist/ .

diff --git a/.github/workflows/run-pytest.yml b/.github/workflows/run-pytest.yml
@@ -45,7 +45,7 @@ jobs:
         python-version: ${{ matrix.python }}
     - name: Update pip/build packages
       run: |
-        pip install setuptools --upgrade
+        pip install pip --upgrade
     - name: Install newest dependencies
       run: |
         pip install -r requirements/requirements-test.txt
@@ -60,6 +60,8 @@ jobs:
       run: |
         pip install -r requirements/requirements-jit.txt
       if: ${{ matrix.python != '3.12' }}
+    - name: Install system packages
+      uses: ./.github/actions/install-nopython-dependencies
     - name: Install the package
       run: |
         pip install -e .

diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -0,0 +1,16 @@
+# Top-level build script for the dupin C++ extension and Python package.
+# 3.15 is the floor because the FindPython-based pybind11 mode enabled below
+# relies on CMake's FindPython module.
+cmake_minimum_required(VERSION 3.15)
+project(dupin VERSION 0.0.1 LANGUAGES CXX)
+
+# Fall back to an optimized build when the caller did not pick a build type.
+set(DEFAULT_BUILD_TYPE "Release")
+if(NOT CMAKE_BUILD_TYPE)
+  set(CMAKE_BUILD_TYPE ${DEFAULT_BUILD_TYPE})
+endif()
+
+find_package(Eigen3 REQUIRED)
+find_package(TBB REQUIRED)
+# Use modern method for Python binding: PYBIND11_FINDPYTHON is the switch
+# pybind11 documents for selecting its FindPython-based tooling.
+set(PYBIND11_FINDPYTHON ON)
+find_package(pybind11 CONFIG REQUIRED)
+
+# src builds the _dupin extension; dupin installs the pure-Python sources.
+add_subdirectory(src)
+add_subdirectory(dupin)
diff --git a/dupin/CMakeLists.txt b/dupin/CMakeLists.txt
@@ -0,0 +1,2 @@
+ # Copy every *.py file below this directory into the "dupin" install
+ # destination; the relative destination is resolved against the install
+ # prefix (site-packages when driven by the Python build backend).
+ install(
+   DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/"
+   DESTINATION dupin
+   FILES_MATCHING
+   PATTERN "*.py"
+ )
diff --git a/dupin/detect/dynp.py b/dupin/detect/dynp.py
@@ -0,0 +1,76 @@
+"""Implements dynamic programming class for optimal segementation algorithm."""
+import _dupin
+import numpy as np
+
+
+class DynP:
+    """Detects the change points in a time series.
+
+    Attributes
+    ----------
+    data: np.ndarray
+        Matrix storing the time series data.
+    num_bkps: int
+        Number of change points to detect.
+    jump: int
+        Interval for checking potential change points. Changing will
+        not provide optimal detection, but will reduce runtime.
+    min_size: int
+        Minimum size of a segment. Changing will not provide optimal
+        detection, but will reduce runtime.
+
+
+    Methods
+    -------
+    __init__(self, data: np.ndarray, num_bkps: int, jump: int, min_size: int)
+        Initializes the DynamicProgramming instance with the time series data
+        and parameters.
+    set_num_threads(self, num_threads: int)
+        Sets the number of threads to be used for parallel computation.
+    fit(self, num_bkps: int) -> list
+        Calculates the cost matrix and identifies the optimal breakpoints in
+        the time series data.
+
+    Example Usage
+    -------------
+    >>> import numpy as np
+    >>> from dynp import DynP
+    >>> data = np.random.rand(100, 1)  # Simulated time series data
+    >>> num_bkps = 3  # Number of breakpoints to detect
+    >>> jump = 1  # Interval for checking potential breakpoints
+    >>> min_size = 3  # Minimum size of a segment
+    >>> model = Dynp(data, num_bkps, jump, min_size)
+    >>> breakpoints = model.fit(num_bkps)
+    >>> print(breakpoints)
+    """
+
+    def __init__(
+        self, data: np.ndarray, num_bkps: int, jump: int, min_size: int
+    ):
+        """Initialize the DynamicProgramming instance with given parameters."""
+        self._dupin = _dupin.DynamicProgramming(data, num_bkps, jump, min_size)
+
+    def set_num_threads(self, num_threads: int):
+        """Set the number of threads for parallelization.
+
+        Parameters
+        ----------
+        num_threads: int
+            The number of threads to use during computation. Default
+            is determined automatically.
+        """
+        self._dupin.set_threads(num_threads)
+
+    def fit(self, num_breakpoints: int) -> list[int]:
+        """Calculate the cost matrix and return the breakpoints.
+
+        Parameters
+        ----------
+        num_bkps: int
+            number of change points to detect.
+
+        Returns
+        -------
+            list: A list of integers representing the breakpoints.
+        """
+        return self._dupin.fit(num_breakpoints)
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [build-system]
-build-backend = "setuptools.build_meta"
-requires = ["setuptools >= 64.0.0"]
+requires = ["scikit-build-core>=0.7.0", "pybind11"]
+build-backend = "scikit_build_core.build"
 
 [project]
 name = "dupin"

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
@@ -0,0 +1,19 @@
+# Files that make up the _dupin pybind11 extension module.
+list(APPEND dupin_cxx "module.cpp" "dupin.h" "dupin.cpp")
+
+pybind11_add_module(_dupin ${dupin_cxx})
+
+# CXX_STANDARD_REQUIRED is the target property that makes C++17 mandatory;
+# the CMAKE_-prefixed name is a variable, not a property, and is ignored here.
+set_target_properties(_dupin PROPERTIES
+    CXX_STANDARD 17
+    CXX_STANDARD_REQUIRED ON
+)
+
+target_include_directories(_dupin PRIVATE
+    ${EIGEN3_INCLUDE_DIR}
+    ${TBB_INCLUDE_DIRS}
+)
+
+target_link_libraries(_dupin PRIVATE TBB::tbb)
+target_compile_definitions(_dupin PRIVATE VERSION_INFO=${PROJECT_VERSION})
+# These flags use GCC/Clang spelling, so they are skipped for MSVC.
+# NOTE(review): -march=native ties the binary to the build machine's CPU;
+# confirm that is acceptable for any wheels that get redistributed.
+if(NOT MSVC)
+  target_compile_options(_dupin PRIVATE -O2 -march=native)
+endif()
+# Installs C++ extension into the root of the Python package
+install(TARGETS _dupin LIBRARY DESTINATION dupin)
diff --git a/src/dupin.cpp b/src/dupin.cpp
@@ -0,0 +1,170 @@
+#include <cmath>
+#include <iomanip>
+#include <iostream>
+#include <limits>
+#include <memory>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+#include <Eigen/Dense>
+#include <tbb/blocked_range2d.h>
+#include <tbb/global_control.h>
+#include <tbb/parallel_for.h>
+#include "dupin.h"
+
+using namespace std;
+using namespace Eigen;
+
+// Default constructor: no features, no timesteps, jump = 1, min_size = 3,
+// and a cost matrix constructed with argument 0.
+DynamicProgramming::DynamicProgramming()
+    : num_features(0),
+      num_timesteps(0),
+      jump(1),
+      min_size(3),
+      cost_matrix(0) {}
+
+
+// Construct from a data matrix. The row count becomes num_timesteps, the
+// column count becomes num_features, and the cost matrix is constructed from
+// the row count.
+DynamicProgramming::DynamicProgramming(const Eigen::MatrixXd &data,
+                                       int jump_, int min_size_)
+    : data(data),
+      jump(jump_),
+      min_size(min_size_),
+      cost_matrix(data.rows()) {
+  // Inside the body `data` names the constructor argument (it shadows the
+  // member); both hold the same matrix at this point.
+  num_features = data.cols();
+  num_timesteps = data.rows();
+}
+
+// Rescale every column of `data` in place to (value - min) / (max - min).
+// A column whose max equals its min is set to all zeros instead, which
+// avoids dividing by zero.
+void DynamicProgramming::scale_data() {
+  const Eigen::VectorXd col_min = data.colwise().minCoeff();
+  const Eigen::VectorXd col_max = data.colwise().maxCoeff();
+  const Eigen::VectorXd col_span = col_max - col_min;
+
+  for (int col = 0; col < num_features; ++col) {
+    const double span = col_span(col);
+    if (span != 0.0) {
+      data.col(col) = (data.col(col).array() - col_min(col)) / span;
+      continue;
+    }
+    // Constant column: nothing to scale.
+    data.col(col).setZero();
+  }
+}
+// Fill `lfit` with the regression inputs: x is num_timesteps evenly spaced
+// values from 0 to num_timesteps - 1 divided by num_timesteps - 1, and y is
+// a copy of `data`.
+void DynamicProgramming::regression_setup(linear_fit_struct &lfit) {
+  const auto last_index = num_timesteps - 1;
+  lfit.x =
+      Eigen::VectorXd::LinSpaced(num_timesteps, 0, last_index) / last_index;
+  lfit.y = data;
+}
+
+// Fit an independent least-squares line to every feature over the rows
+// [start, end) and return the fitted values as an (end - start) x num_features
+// matrix.
+//
+// For feature j the fit is
+//   slope_j     = sum_i (x_i - x_mean) * (y_ij - y_mean_j) / sum_i (x_i - x_mean)^2
+//   intercept_j = y_mean_j - slope_j * x_mean
+// so the fitted value slope_j * x_i + intercept_j equals
+// slope_j * (x_i - x_mean) + y_mean_j, which is the form evaluated below.
+//
+// lfit.x is read with segment() and lfit.y with block(), i.e. x is used as a
+// vector and y as a matrix with num_features columns.
+Eigen::MatrixXd DynamicProgramming::regression_lines(int start, int end, linear_fit_struct &lfit) {
+    const int n = end - start;
+    const Eigen::VectorXd x = lfit.x.segment(start, n);
+    const Eigen::MatrixXd y = lfit.y.block(start, 0, n, num_features);
+
+    // Means: one scalar for x, one value per feature for y.
+    const double x_mean = x.mean();
+    const Eigen::RowVectorXd y_mean = y.colwise().mean();
+
+    // Center the data around 0.
+    const Eigen::VectorXd x_centered = (x.array() - x_mean).matrix();
+    const Eigen::MatrixXd y_centered = y.rowwise() - y_mean;
+
+    // The denominator is the squared norm of the single centered x vector; it
+    // must not be accumulated once per feature.
+    const double x_sq_sum = x_centered.squaredNorm();
+
+    // One slope per feature. When x has no spread (e.g. a single sample) the
+    // slope is left at zero, so the fit degenerates to the per-feature mean.
+    Eigen::RowVectorXd slope = Eigen::RowVectorXd::Zero(num_features);
+    if (x_sq_sum > 0.0) {
+        slope = (x_centered.transpose() * y_centered) / x_sq_sum;
+    }
+
+    // Outer product gives slope_j * (x_i - x_mean); adding y_mean_j to every
+    // row completes the fitted line.
+    Eigen::MatrixXd fitted = x_centered * slope;
+    fitted.rowwise() += y_mean;
+
+    return fitted;
+}
+
+// Return the square root of the summed squared differences between
+// `predicted_y` and `data` over rows [start, end) and all feature columns.
+double DynamicProgramming::l2_cost(const Eigen::MatrixXd &predicted_y, int start, int end) {
+    const int rows = end - start;
+    const auto fitted = predicted_y.block(start, 0, rows, num_features);
+    const auto observed = data.block(start, 0, rows, num_features);
+    const Eigen::MatrixXd residual = fitted - observed;
+    return std::sqrt(residual.array().square().sum());
+}
+
+// Write the regression lines for rows [start, end) into the matching rows of
+// `predicted_y`; rows outside that range are not touched.
+void DynamicProgramming::predicted(int start, int end, linear_fit_struct &lfit,
+                                    Eigen::MatrixXd &predicted_y) {
+    const int rows = end - start;
+    const Eigen::MatrixXd fitted = regression_lines(start, end, lfit);
+    predicted_y.block(start, 0, rows, num_features) = fitted;
+}
+
+// Cost of treating rows [start, end) as one segment: fit regression lines to
+// that range and return the l2_cost between the fit and the data.
+double DynamicProgramming::cost_function(int start, int end) {
+  // Regression inputs are rebuilt on every call.
+  linear_fit_struct fit_inputs;
+  regression_setup(fit_inputs);
+
+  // Only rows [start, end) of this buffer are written and later read.
+  Eigen::MatrixXd fitted_values(num_timesteps, num_features);
+  fitted_values.block(start, 0, end - start, num_features) =
+      regression_lines(start, end, fit_inputs);
+
+  return l2_cost(fitted_values, start, end);
+}
+
+// Scale the data, then fill cost_matrix(i, j) with cost_function(i, j) for
+// every i and every j in [i + min_size, num_timesteps). Rows are distributed
+// over TBB worker threads. Marks the cost matrix as computed when done.
+void DynamicProgramming::initialize_cost_matrix() {
+  scale_data();
+
+  // Each invocation handles a contiguous range of start indices.
+  const auto fill_rows = [&](const tbb::blocked_range<int> &rows) {
+    for (int row = rows.begin(); row < rows.end(); ++row) {
+      for (int col = row + min_size; col < num_timesteps; ++col) {
+        cost_matrix(row, col) = cost_function(row, col);
+      }
+    }
+  };
+
+  tbb::parallel_for(tbb::blocked_range<int>(0, num_timesteps), fill_rows);
+  cost_computed = true;
+}
+
+// Find the cheapest way to split [start, end] with `num_bkps` interior
+// breakpoints, using the precomputed cost_matrix and the `memo` cache.
+//
+// Returns {total cost, breakpoints}. The breakpoint list is in increasing
+// order and always ends with `end`; with num_bkps == 0 it is just {end}.
+// If no split satisfies min_size the cost is +infinity and the list is empty.
+std::pair<double, std::vector<int>> DynamicProgramming::seg(int start, int end,
+                                                  int num_bkps) {
+  const MemoKey key{start, end, num_bkps};
+  auto it = memo.find(key);
+  if (it != memo.end()) {
+    return it->second;
+  }
+  if (num_bkps == 0) {
+    return {cost_matrix(start, end), {end}};
+  }
+
+  std::pair<double, std::vector<int>> best = {std::numeric_limits<double>::infinity(), {}};
+
+  // bkp starts at start + min_size, so the left part is always long enough;
+  // only the right part needs checking.
+  for (int bkp = start + min_size; bkp < end; bkp++) {
+    if ((end - bkp) < min_size) {
+      // The right part only gets shorter from here on.
+      break;
+    }
+    auto left = seg(start, bkp, num_bkps - 1);
+    auto right = seg(bkp, end, 0);
+    const double cost = left.first + right.first;
+    if (cost < best.first) {
+      best.first = cost;
+      // left.second already ends with bkp (every feasible result ends with
+      // its own `end`), so bkp is not appended a second time.
+      best.second = std::move(left.second);
+      best.second.insert(best.second.end(), right.second.begin(),
+                         right.second.end());
+    }
+  }
+
+  memo[key] = best;
+  return best;
+}
+
+// Run the segmentation over [0, num_timesteps - 1] with `num_bkps`
+// breakpoints and return only the breakpoint list (the cost is discarded).
+std::vector<int> DynamicProgramming::compute_breakpoints(int num_bkps) {
+  const int last_index = num_timesteps - 1;
+  return seg(0, last_index, num_bkps).second;
+}
+
+// Return the breakpoints for `num_bkps` change points. The cost matrix is
+// built on the first call only; later calls reuse it.
+std::vector<int> DynamicProgramming::fit(int num_bkps){
+  const bool needs_cost_matrix = !cost_computed;
+  if (needs_cost_matrix) {
+    initialize_cost_matrix();
+  }
+  return compute_breakpoints(num_bkps);
+}
+
+// Limit TBB to at most `num_threads` worker threads.
+//
+// The limit is held in a function-local static so it outlives the call. It
+// is replaced on every call; a plain `static tbb::global_control` would be
+// constructed only once and silently ignore every later request.
+void set_parallelization(int num_threads) {
+  static std::unique_ptr<tbb::global_control> limit;
+  // Drop the previous limit first: while several global_control objects for
+  // max_allowed_parallelism are alive, TBB applies the smallest of them.
+  limit.reset();
+  limit = std::make_unique<tbb::global_control>(
+      tbb::global_control::max_allowed_parallelism, num_threads);
+}
+
+// Accessor returning a mutable reference to the stored cost matrix.
+DynamicProgramming::UpperTriangularMatrix &
+DynamicProgramming::getCostMatrix() {
+  return this->cost_matrix;
+}
+
+// Replace the stored cost matrix with a copy of `value`.
+void DynamicProgramming::setCostMatrix(
+    const DynamicProgramming::UpperTriangularMatrix &value) {
+  this->cost_matrix = value;
+}
+
+// Empty entry point; it does nothing and reports success.
+// NOTE(review): this file is built into the _dupin extension module, so this
+// looks unnecessary - confirm nothing builds it as a standalone executable.
+int main() {
+  return 0;
+}