From 9493a8364014d5ae7a3984a7afa2d8f70d91e15d Mon Sep 17 00:00:00 2001 From: Dennis Rohde Date: Thu, 2 Sep 2021 10:39:41 +0200 Subject: [PATCH] improve clustering framework --- CMakeLists.txt | 4 +- Fred/__init__.py | 7 +- README.md | 73 +++--- include/clustering.hpp | 438 ++--------------------------------- include/config.hpp | 21 ++ include/coreset.hpp | 3 +- include/frechet.hpp | 2 +- include/simplification.hpp | 10 +- setup.py | 2 +- src/clustering.cpp | 187 +++++++++++++++ src/config.cpp | 17 ++ src/dynamic_time_warping.cpp | 20 +- src/frechet.cpp | 86 +++---- src/fred_python_wrapper.cpp | 76 ++---- src/simplification.cpp | 16 +- test/test.py | 34 ++- 16 files changed, 423 insertions(+), 573 deletions(-) create mode 100644 include/config.hpp create mode 100644 src/clustering.cpp create mode 100644 src/config.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 40cb406..b0304d6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -37,7 +37,6 @@ link_libraries(OpenMP::OpenMP_CXX) add_subdirectory(pybind11) pybind11_add_module(backend - src/fred_python_wrapper.cpp src/curve.cpp src/point.cpp src/interval.cpp @@ -45,4 +44,7 @@ pybind11_add_module(backend src/jl_transform.cpp src/simplification.cpp src/dynamic_time_warping.cpp + src/clustering.cpp + src/config.cpp + src/fred_python_wrapper.cpp ) diff --git a/Fred/__init__.py b/Fred/__init__.py index 84240a1..517d82e 100644 --- a/Fred/__init__.py +++ b/Fred/__init__.py @@ -1,10 +1,9 @@ -from . import backend - -import numpy as np -import matplotlib.pyplot as plt +from .backend import * +config = Config() def plot_curve(*curves, savename=None, saveextension=None): + import matplotlib.pyplot as plt for curve in curves: if isinstance(curve, backend.Curve): if curve.dimensions >= 2: diff --git a/README.md b/README.md index 9ef059d..2b4c3da 100644 --- a/README.md +++ b/README.md @@ -4,12 +4,14 @@ A fast, scalable and light-weight C++ Fréchet distance library, exposed to pyth ### NOW USING PYBIND11 INSTEAD OF BOOST! ### NOW AVAILABLE VIA PIP -## Ingredients C++ Backend -`import Fred.backend as fred` +## Ingredients +`import Fred as fred` + +- for verbosity, set `fred.config.verbose`, default is false ### Number of Threads -By default, Fred will automatically determine the number of threads to use. If you want to set an upper limit, call `fred.set_maximum_number_threads(number)`. +By default, Fred will automatically determine the number of threads to use. If you want to set an upper limit, set `fred.config.number_threads`. Set to `-1` to enable dynamic mode again. ### Curve - signature: `fred.Curve(np.ndarray)`, `fred.Curve(np.ndarray, str name)` @@ -24,9 +26,8 @@ By default, Fred will automatically determine the number of threads to use. If y - signature: `fred.continuous_frechet(curve1, curve2)` - returns: `fred.Continuous_Frechet_Result` with members `value`, `time_bounds`: running-time for upper and lower bound, `number_searches`: number of free space diagrams built, `time_searches`: running-time for free spaces -###### continuous Frechet distance config -- approximation error in percent of distance: `fred.set_continuous_frechet_error(double percent)` with parameter `percent`, which defaults to 1 -- rounding: `fred.set_continuous_frechet_rounding(round)` with parameter `round`, which defaults to true +###### continuous Fréchet distance config +- approximation error in percent of distance: `fred.config.continuous_frechet_error`, which defaults to 1 #### discrete Fréchet distance - signature: `fred.discrete_frechet(curve1, curve2)` @@ -57,18 +58,14 @@ All simplifications are vertex-restricted! ### Clustering -##### Distance_Matrix - -A `fred.Distance_Matrix()` can be used to speed up consecutive calls of `fred.discrete_klcenter` and `fred.discrete_klmedian`. As the name suggests, it stores the distances already computed. - #### discrete (k,l)-center clustering (continuous Fréchet) - from [**Approximating (k,l)-center clustering for curves**](https://dl.acm.org/doi/10.5555/3310435.3310616) - signature: `fred.discrete_klcenter(k, l, curves, distances, random_first_center, fast_simplification)` with parameters - `k`: number of centers - `l`: maximum complexity of the centers - - `distances`: `fred.Distance_Matrix`, defaults to empty `fred.Distance_Matrix` + - `consecutive_call`: reuses distances and simplifications already computed in a previous call if `true`, defaults to `false` - `random_first_center`: determines if first center is chosen uniformly at random or first curve is used as first center, optional, defaults to true - - `fast_simplification`: determines whether to use the minimum error simplification or the faster approximate minimum error simplification, defaults to false + - `fast_simplification`: determines whether to use the minimum error simplification or the faster approximate minimum error simplification, defaults to `false` - returns: `fred.Clustering_Result` with mebers - `value`: objective value - `time`: running-time @@ -79,8 +76,8 @@ A `fred.Distance_Matrix()` can be used to speed up consecutive calls of `fred.di - signature: `fred.discrete_klmedian(k, l, curves, distances, fast_simplification)` with parameters - `k`: number of centers - `l`: maximum complexity of the centers - - `distances`: `fred.Distance_Matrix`, defaults to empty `fred.Distance_Matrix` - - `fast_simplification`: determines whether to use the minimum error simplification or the faster approximate minimum error simplification, defaults to false + - `consecutive_call`: reuses distances and simplifications already computed in a previous call if `true`, defaults to `false` + - `fast_simplification`: determines whether to use the minimum error simplification or the faster approximate minimum error simplification, defaults to `false` - returns: `fred.Clustering_Result` with mebers - `value`: objective value - `time`: running-time @@ -88,12 +85,21 @@ A `fred.Distance_Matrix()` can be used to speed up consecutive calls of `fred.di #### Clustering Result - signature: `fred.Clustering_Result` -- methods: `len(fred.Clustering_Result)`: number of centers, `fred.Clustering_Result[i]`: get ith center, `fred.Clustering_Result.compute_assignment(fred.Curves)`: assigns every curve to its nearest center -- members: `value`: objective value, `time`: running-time, `assignment`: empty if compute_assignment was not called +- methods: + -`len(fred.Clustering_Result)`: number of centers + - `fred.Clustering_Result[i]`: get ith center + - `fred.Clustering_Result.compute_assignment(fred.Curves, bool consecutive_call)`: assigns every curve to its nearest center with parameter `consecutive_call`, which defaults to `false`; set to true, if you want to assign the curves used for clustering +- members: + - `value`: objective value + - `time`: running-time + - `assignment`: empty if compute_assignment was not called #### Cluster Assignment - signature: `fred.Cluster_Assignment` -- methods: `len(fred.Cluster_Assignment)`: number of centers, `fred.Cluster_Assignment.count(i)`: number of curves assigned to center i, `fred.Cluster_Assignment.get(i,j)`: get index of jth curve assigned to center i +- methods: + - `len(fred.Cluster_Assignment)`: number of centers + -`fred.Cluster_Assignment.count(i)`: number of curves assigned to center `i` + - `fred.Cluster_Assignment.get(i,j)`: get index of `j`th curve assigned to center `i` ### Dimension Reduction via Gaussian Random Projection - [Section 2 in **Random Projections and Sampling Algorithms for Clustering of High Dimensional Polygonal Curves**](https://papers.nips.cc/paper/9443-random-projections-and-sampling-algorithms-for-clustering-of-high-dimensional-polygonal-curves) @@ -121,22 +127,21 @@ Just run `python py/test.py`. ## Mini Example ```python -import Fred.backend as fred -import Fred +import Fred as fred import numpy as np import pandas as pd -curve1d = fred.Curve(np.array([1., 2.])) # Curve stores a polygonal curve with +curve1d = fred.Curve([1., 2.]) # Curve stores a polygonal curve with # at least two points of at least one # and equal number of dimensions -curve2d1 = fred.Curve(np.array([[1., 0.], [2., 1.], [3., 0.]])) # any number of dimensions and points works -curve2d2 = fred.Curve(np.array([[1., -1.], [2., -2.], [3., -1.]]), "optional name, e.g. displayed in plot") +curve2d1 = fred.Curve([[1., 0.], [2., 1.], [3., 0.]]) # any number of dimensions and points works +curve2d2 = fred.Curve([[1., -1.], [2., -2.], [3., -1.]], "optional name, e.g. displayed in plot") print(curve2d1) -Fred.plot_curve(curve2d1, curve2d2) -Fred.plot_curve(curve2d2, fred.minimum_error_simplification(curve2d2, 2)) +fred.plot_curve(curve2d1, curve2d2) +fred.plot_curve(curve2d2, fred.minimum_error_simplification(curve2d2, 2)) print("distance is {}".format(fred.continuous_frechet(curve2d1, curve2d2).value)) @@ -165,13 +170,13 @@ curves.add(ps4) curves.add(ps5) curves.add(ps6) -Fred.plot_curve(curves) +fred.plot_curve(curves) curves = fred.dimension_reduction(curves, 0.95) # fred is pretty fast but with high dimensional data # a dimension reduction massively improves running-time # even for smaller values of epsilon -Fred.plot_curve(curves) +fred.plot_curve(curves) # Oneshot clustering - if you already know the value of k @@ -183,27 +188,23 @@ print("clustering cost is {}".format(clustering.value)) for i, center in enumerate(clustering): print("center {} is {}".format(i, center)) - - -Fred.plot_curve(clustering) -# Multiple clustering calls - if you need to find a suitable value for k +fred.plot_curve(clustering) -dm = fred.Distance_Matrix() # computing the Fréchet distance is costly, - # therefore we buffer each distance already computed to - # speed up consecutive clustering calls +# Multiple clustering calls - if you need to find a suitable value for k for k in range(2, 6): - clustering = fred.discrete_klcenter(k, 10, curves, dm) + clustering = fred.discrete_klcenter(k, 10, curves, consecutive_call=True) print("clustering cost is {}".format(clustering.value)) - clustering = fred.discrete_klmedian(k, 10, curves, dm) + clustering = fred.discrete_klmedian(k, 10, curves, consecutive_call=True) print("clustering cost is {}".format(clustering.value)) - + clustering.compute_assignment(curves) for i in range(0, len(clustering)): for j in range(0, clustering.assignment.count(i)): print("{} was assigned to center {}".format(curves[clustering.assignment.get(i,j)].name, clustering[i].name)) + ``` diff --git a/include/clustering.hpp b/include/clustering.hpp index 0736ef6..04d5c44 100644 --- a/include/clustering.hpp +++ b/include/clustering.hpp @@ -13,6 +13,7 @@ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLI #include #include +#include "config.hpp" #include "types.hpp" #include "random.hpp" #include "curve.hpp" @@ -26,16 +27,12 @@ struct Distance_Matrix : public std::vector> { Distance_Matrix() {} Distance_Matrix(const curve_number_t n, const curve_number_t m) : std::vector>(n, std::vector(m, -1.0)) {} - void print() const { - for (const auto &row : *this) { - for (const auto elem : row) { - std::cout << elem << " "; - } - std::cout << std::endl; - } - } + void print() const; }; +extern Distance_Matrix distances; +extern Curves simplifications; + class Cluster_Assignment : public std::vector> { public: inline Cluster_Assignment(const curve_number_t k = 0) : std::vector>(k, std::vector()) {} @@ -89,17 +86,6 @@ inline distance_t _center_cost_sum(const Curves &in, const Curves &simplified_in return cost; } -inline Cluster_Assignment _cluster_assignment(const Curves &in, const Curves &simplified_in, const std::vector ¢ers, Distance_Matrix &distances) { - const auto k = centers.size(); - Cluster_Assignment result(centers.size()); - - if (k == 0) return result; - - for (curve_number_t i = 0; i < in.size(); ++i) result[_nearest_center(i, in, simplified_in, centers, distances)].push_back(i); - - return result; -} - struct Clustering_Result { Curves centers; distance_t value; @@ -121,412 +107,30 @@ struct Clustering_Result { return centers.cend(); } - inline void compute_assignment(const Curves &in) { - Distance_Matrix distances(in.size(), centers.size()); - std::vector center_indices = std::vector(centers.size(), 0); - for (curve_size_t i = 1; i < centers.size(); ++i) center_indices[i] = i; - assignment = _cluster_assignment(in, centers, center_indices, distances); - } -}; - -Clustering_Result kl_cluster(const curve_number_t num_centers, const curve_size_t ell, const Curves &in, Distance_Matrix &distances, const bool local_search = false, const bool random_start_center = true, const bool fast_simplification = false) { - const auto start = std::chrono::high_resolution_clock::now(); - Clustering_Result result; - - if (in.empty()) return result; - - std::vector centers; - Curves simplified_in(in.number(), ell, in.dimensions()); - - auto simplify = [&](const curve_number_t i) { - if (fast_simplification) { - auto simplified_curve = Simplification::approximate_minimum_error_simplification(const_cast(in[i]), ell); - simplified_curve.set_name("Simplification of " + in[i].get_name()); - return simplified_curve; + inline void compute_assignment(const Curves &in, const bool consecutive_call = false) { + assignment = Cluster_Assignment(centers.size()); + if (consecutive_call and in.size() == distances.size()) { + for (curve_number_t i = 0; i < in.size(); ++i) assignment[_nearest_center(i, in, simplifications, center_indices, distances)].push_back(i); } else { - Simplification::Subcurve_Shortcut_Graph graph(const_cast(in[i])); - auto simplified_curve = graph.minimum_error_simplification(ell); - simplified_curve.set_name("Simplification of " + in[i].get_name()); - return simplified_curve; - } - }; - - if (random_start_center) { - Random::Uniform_Random_Generator ugen; - const curve_number_t r = std::floor(simplified_in.size() * ugen.get()); - simplified_in[r] = simplify(r); - centers.push_back(r); - } else { - simplified_in[0] = simplify(0); - centers.push_back(0); - } - - distance_t curr_maxdist = 0; - curve_number_t curr_maxcurve = 0; - distance_t curr_curve_cost; - - if (distances.empty()) distances = Distance_Matrix(in.size(), simplified_in.size()); - - { - // remaining centers - for (curve_number_t i = 1; i < num_centers; ++i) { - - curr_maxdist = 0; - curr_maxcurve = 0; - { - - // all curves - for (curve_number_t j = 0; j < in.size(); ++j) { - - curr_curve_cost = _curve_cost(j, in, simplified_in, centers, distances); - - if (curr_curve_cost > curr_maxdist) { - curr_maxdist = curr_curve_cost; - curr_maxcurve = j; - } - - } - #if DEBUG - std::cout << "found center no. " << i+1 << std::endl; - #endif - - if (simplified_in[curr_maxcurve].empty()) { - simplified_in[curr_maxcurve] = simplify(curr_maxcurve); - } - centers.push_back(curr_maxcurve); - } - } - } - - if (local_search) { - - distance_t cost = _center_cost_sum(in, simplified_in, centers, distances); - distance_t approxcost = cost; - distance_t curr_cost = cost; - distance_t gamma = 1/(10 * num_centers); - bool found = true; - auto curr_centers = centers; - - // try to improve current solution - while (found) { - found = false; - - // go through all centers - for (curve_number_t i = 0; i < num_centers; ++i) { - curr_centers = centers; - - // check if there is a better center among all other curves - for (curve_number_t j = 0; j < simplified_in.size(); ++j) { - // continue if curve is already part of center set - if (std::find(curr_centers.begin(), curr_centers.end(), j) != curr_centers.end()) continue; - - // swap - if (simplified_in[j].empty()) { - simplified_in[j] = simplify(j); - } - curr_centers[i] = j; - // new cost - curr_cost = _center_cost_sum(in, simplified_in, curr_centers, distances); - // check if improvement is done - if (curr_cost < cost - gamma * approxcost) { - cost = curr_cost; - centers = curr_centers; - found = true; - } - } - } - } - curr_maxdist = cost; - } - - Curves simpl_centers; - for (const auto center: centers) simpl_centers.push_back(simplified_in[center]); - - auto end = std::chrono::high_resolution_clock::now(); - result.centers = simpl_centers; - result.value = curr_maxdist; - result.running_time = std::chrono::duration_cast(end - start).count(); - return result; -} - -Clustering_Result kl_center(const curve_number_t num_centers, const curve_size_t ell, const Curves &in, Distance_Matrix &distances, const bool random_start_center = true, const bool fast_simplification = false) { - return kl_cluster(num_centers, ell, in, distances, false, random_start_center, fast_simplification); -} - -Clustering_Result kl_median(const curve_number_t num_centers, const curve_size_t ell, const Curves &in, Distance_Matrix &distances, const bool fast_simplification = false) { - return kl_cluster(num_centers, ell, in, distances, true, fast_simplification); -} - -Clustering_Result one_median_sampling(const curve_size_t ell, const Curves &in, const double epsilon, const Curves ¢er_domain = Curves()) { - const auto start = std::chrono::high_resolution_clock::now(); - Clustering_Result result; - std::vector centers; - Curves &simplified_in = const_cast(center_domain); - - if (center_domain.empty()) { - Curves simplified_in_self(in.number(), ell, in.dimensions()); - - for (curve_number_t i = 0; i < in.size(); ++i) { - Simplification::Subcurve_Shortcut_Graph graph(const_cast(in[i])); - auto simplified_curve = graph.minimum_error_simplification(ell); - simplified_curve.set_name("Simplification of " + in[i].get_name()); - simplified_in_self[i] = simplified_curve; + auto ndistances = Distance_Matrix(in.size(), centers.size()); + auto ncenter_indices = std::vector(centers.size()); + std::iota(ncenter_indices.begin(), ncenter_indices.end(), 0); + for (curve_number_t i = 0; i < in.size(); ++i) assignment[_nearest_center(i, in, centers, ncenter_indices, ndistances)].push_back(i); } - simplified_in = simplified_in_self; } - const auto n = in.size(); - - const auto s = std::ceil(60); - const auto t = std::ceil(std::log(60)/(epsilon*epsilon)); - - Random::Uniform_Random_Generator ugen; - - const auto candidates = ugen.get(s); - const auto witnesses = ugen.get(t); - - Distance_Matrix distances = Distance_Matrix(in.size(), in.size()); - - curve_number_t best_candidate = 0; - distance_t best_objective_value = std::numeric_limits::infinity(); - - for (curve_number_t i = 0; i < candidates.size(); ++i) { - - const curve_number_t candidate = std::floor(candidates[i] * n); - distance_t objective = 0; - - for (curve_number_t j = 0; j < witnesses.size(); ++j) { - const curve_number_t witness = std::floor(witnesses[j] * n); - - _cheap_dist(witness, candidate, in, simplified_in, distances); - objective += distances[witness][candidate]; - } - - if (objective < best_objective_value) { - best_candidate = candidate; - best_objective_value = objective; - } + inline void set_center_indices(const std::vector &pcenter_indices) { + center_indices = pcenter_indices; } - centers.push_back(best_candidate); - - auto end = std::chrono::high_resolution_clock::now(); - result.centers.push_back(simplified_in[centers[0]]); - result.value = _center_cost_sum(in, simplified_in, centers, distances); - result.running_time = std::chrono::duration_cast(end - start).count(); - return result; -} -Clustering_Result one_median_exhaustive(const curve_size_t ell, const Curves &in, const Curves ¢er_domain = Curves()) { - const auto start = std::chrono::high_resolution_clock::now(); - Clustering_Result result; - std::vector centers; - const Curves &simplified_in = center_domain; - - if (center_domain.empty()) { - Curves simplified_in_self(in.number(), ell, in.dimensions()); - - for (curve_number_t i = 0; i < in.size(); ++i) { - Simplification::Subcurve_Shortcut_Graph graph(const_cast(in[i])); - auto simplified_curve = graph.minimum_error_simplification(ell); - simplified_curve.set_name("Simplification of " + in[i].get_name()); - simplified_in_self[i] = simplified_curve; - } - const_cast(simplified_in) = simplified_in_self; - } - - const auto n = in.size(); - - Distance_Matrix distances = Distance_Matrix(in.size(), in.size()); - - curve_number_t best_candidate = 0; - distance_t best_objective_value = std::numeric_limits::infinity(); - - for (curve_number_t j = 0; j < in.size(); ++j) { - - distance_t objective = 0; - - for (curve_number_t i = 0; i < in.size(); ++i) { - _cheap_dist(i, j, in, simplified_in, distances); - objective += distances[i][j]; - } - - if (objective < best_objective_value) { - best_candidate = j; - best_objective_value = objective; - } - } - centers.push_back(best_candidate); - - auto end = std::chrono::high_resolution_clock::now(); - result.centers.push_back(simplified_in[centers[0]]); - result.value = best_objective_value; - result.running_time = std::chrono::duration_cast(end - start).count(); - return result; -} +private: + std::vector center_indices; +}; -Clustering_Result two_two_dtw_one_two_median(const Curves &in, const bool with_assignment = false) { - const auto start = std::chrono::high_resolution_clock::now(); - Clustering_Result result; - - const auto n = in.size(); - - std::vector> markings = std::vector>(n, std::vector(in.get_m(), false)); - std::vector svert = std::vector(n, 0), evert = std::vector(n, 0); - - Points S1(in.dimensions()), S2(in.dimensions()); - Point mu1(in.dimensions()), mu2(in.dimensions()); - - for (curve_number_t i = 0; i < n; ++i) { - S1.push_back(in[i][svert[i]]); - markings[i][svert[i]] = true; - ++svert[i]; - evert[i] = in[i].size() - 1; - S2.push_back(in[i][evert[i]]); - markings[i][evert[i]] = true; - --evert[i]; - } - - mu1 = S1.centroid(); - mu2 = S2.centroid(); - - const auto infty = std::numeric_limits::infinity(); - - bool done = false; - distance_t d1 = infty, d2 = infty, dist = infty; - curve_number_t c1 = 0, c2 = 0; - - while (not done) { - d1 = infty; - d2 = infty; - - for (curve_size_t i = 0; i < in.size(); ++i) { - if (not markings[i][svert[i]]) { - dist = in[i][svert[i]].dist_sqr(mu1); - if (dist < d1) { - d1 = dist; - c1 = i; - } - } - if (not markings[i][evert[i]]) { - dist = in[i][evert[i]].dist_sqr(mu2); - if (dist < d2) { - d2 = dist; - c2 = i; - } - } - } - - if (d1 < d2) { - //std::cout << "S1 add " << c1 << "." << svert[c1] << std::endl; - S1.push_back(in[c1][svert[c1]]); - markings[c1][svert[c1]] = true; - ++svert[c1]; - mu1 = S1.centroid(); - done = false; - } - else if (d2 < infty) { - //std::cout << "S2 add " << c2 << "." << evert[c2] << std::endl; - S2.push_back(in[c2][evert[c2]]); - markings[c2][evert[c2]] = true; - --evert[c2]; - mu2 = S2.centroid(); - done = false; - } else done = true; - - } - - Curve center_curve(mu1.dimensions(), "center curve"); - center_curve.push_back(mu1); - center_curve.push_back(mu2); - - distance_t cost = 0; - - for (const auto &p : S1) cost += p.dist(mu1); - for (const auto &p : S2) cost += p.dist(mu2); - - auto end = std::chrono::high_resolution_clock::now(); - result.centers.push_back(center_curve); - result.value = cost; - result.running_time = std::chrono::duration_cast(end - start).count(); - return result; -} +Clustering_Result kl_cluster(const curve_number_t, const curve_size_t, const Curves &, const bool, const bool, const bool, const bool); -Clustering_Result two_two_dtw_one_two_median_exact(const Curves &in, const bool with_assignment = false) { - const auto start = std::chrono::high_resolution_clock::now(); - Clustering_Result result; - Curve best_center(in.dimensions()); - const auto infty = std::numeric_limits::infinity(); - - curve_size_t n = 1; - std::vector pointers = std::vector(in.size(), 0), - divisors = std::vector(in.size(), 0); - distance_t best = infty, cost = 0; - Points S1(in.dimensions()), S2(in.dimensions()); - - for (curve_number_t i = 0; i < in.size(); ++i) { - n *= in[i].complexity() - 1; - if (i == 0) divisors[i] = in[0].complexity() - 1; - else if (i == 1) divisors[i] = in[0].complexity() - 1; - else divisors[i] = divisors[i-1] * (in[i].complexity() - 1); - } - - const auto onepercent = n / 100; - - int currperc = 0; - - for (curve_size_t i = 0; i < n; ++i) { - - if (onepercent > 0) { - if (i / onepercent > currperc) { - currperc = i / onepercent; - std::cout << currperc << "% done" << std::endl; - } - } - - pointers[0] = i % divisors[0]; - for (curve_number_t j = 1; j < in.size(); ++j) { - pointers[j] = (i / divisors[j]) % (in[j].complexity() - 1); - } - - S1.clear(); - S2.clear(); - - for (curve_number_t j = 0; j < in.size(); ++j) { - for (curve_size_t k = 0; k < in[j].complexity(); ++ k) { - if (k <= pointers[j]) { - S1.push_back(in[j][k]); - if (k == in[j].complexity() - 1) std::cerr << "error!!" << std::endl; - } - else S2.push_back(in[j][k]); - } - } - - auto mu1 = S1.centroid(); - auto mu2 = S2.centroid(); - - Curve center_curve(mu1.dimensions(), "optimal center curve"); - center_curve.push_back(mu1); - center_curve.push_back(mu2); - - cost = 0; - - for (curve_number_t j = 0; j < in.size(); ++j) { - const auto dist = Dynamic_Time_Warping::Discrete::distance(center_curve, in[j]); - cost += dist.value; - } - - if (cost < best) { - best = cost; - best_center = center_curve; - } - } - - auto end = std::chrono::high_resolution_clock::now(); - result.centers.push_back(best_center); - result.value = best; - result.running_time = std::chrono::duration_cast(end - start).count(); - return result; -} +Clustering_Result kl_center(const curve_number_t, const curve_size_t, const Curves &, const bool = false, const bool = true, const bool = false); +Clustering_Result kl_median(const curve_number_t, const curve_size_t, const Curves &, const bool = false, const bool = false); } diff --git a/include/config.hpp b/include/config.hpp new file mode 100644 index 0000000..8aa3c08 --- /dev/null +++ b/include/config.hpp @@ -0,0 +1,21 @@ +/* +Copyright 2021 Dennis Rohde + +Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +#pragma once + +namespace Config { + + class Config{}; + + extern bool verbose; + extern bool mp_dynamic; + extern int number_threads; + +} diff --git a/include/coreset.hpp b/include/coreset.hpp index 4e4715d..2fd7ec6 100644 --- a/include/coreset.hpp +++ b/include/coreset.hpp @@ -31,8 +31,7 @@ class Median_Coreset { public: inline Median_Coreset(const curve_number_t k, curve_size_t ell, const Curves &in, const parameter_t epsilon, const distance_t constant = 1) : in{in}, k{k}, ell{ell}, epsilon{epsilon}, constant{constant}, cluster_costs(k, 0), cluster_sizes(k, 0), lambda(in.size()), Lambda{2*k + 12*std::sqrt(k) + 18}, probabilities(in.size()) { - Clustering::Distance_Matrix distances; - c_approx = Clustering::kl_median(k, ell, in, distances); + c_approx = Clustering::kl_median(k, ell, in); c_approx.compute_assignment(in); for (curve_number_t i = 0; i < k; ++i) { for (curve_number_t j = 0; j < c_approx.assignment.count(i); ++j) { diff --git a/include/frechet.hpp b/include/frechet.hpp index 23f2054..348c48c 100644 --- a/include/frechet.hpp +++ b/include/frechet.hpp @@ -10,6 +10,7 @@ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLI #pragma once +#include "config.hpp" #include "types.hpp" #include "point.hpp" #include "interval.hpp" @@ -19,7 +20,6 @@ namespace Frechet { namespace Continuous { extern distance_t error; - extern bool round; struct Distance { distance_t value; diff --git a/include/simplification.hpp b/include/simplification.hpp index 22ee9a7..22a9533 100644 --- a/include/simplification.hpp +++ b/include/simplification.hpp @@ -17,6 +17,7 @@ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLI #include #include +#include "config.hpp" #include "types.hpp" #include "curve.hpp" #include "frechet.hpp" @@ -31,6 +32,7 @@ class Subcurve_Shortcut_Graph { public: Subcurve_Shortcut_Graph(Curve &curve) : curve{curve}, edges{std::vector>(curve.complexity(), std::vector(curve.complexity(), std::numeric_limits::infinity()))} { + if (Config::verbose) std::cout << "SIMPL: computing shortcut graph" << std::endl; const curve_size_t complexity = curve.complexity(); Curve segment(2, curve.front().dimensions()); auto distance = Frechet::Continuous::Distance(); @@ -38,6 +40,8 @@ class Subcurve_Shortcut_Graph { for (curve_size_t i = 0; i < complexity - 1; ++i) { for (curve_size_t j = i + 1; j < complexity; ++j) { + if (Config::verbose) std::cout << "SIMPL: computing shortcut distance from vertex " << i << " to vertex " << j << std::endl; + curve.set_subcurve(i, j); segment[0] = curve.front(); @@ -53,6 +57,7 @@ class Subcurve_Shortcut_Graph { } Curve minimum_error_simplification(const curve_size_t ll) const { + if (Config::verbose) std::cout << "SIMPL: computing exact minimum error simplification using shortcut graph" << std::endl; if (ll >= curve.complexity()) return curve; curve_size_t l = ll - 1; @@ -70,10 +75,10 @@ class Subcurve_Shortcut_Graph { std::vector others; curve_size_t best = 0; - for (curve_size_t i = 0; i < l; ++i) { if (i == 0) { + if (Config::verbose) std::cout << "SIMPL: initializing arrays" << std::endl; #pragma omp parallel for for (curve_size_t j = 1; j < curve.complexity(); ++j) { distances[j][0] = edges[0][j]; @@ -81,6 +86,7 @@ class Subcurve_Shortcut_Graph { } } else { for (curve_size_t j = 1; j < curve.complexity(); ++j) { + if (Config::verbose) std::cout << "SIMPL: computing shortcut using " << i << " jumps" << std::endl; others.resize(j); #pragma omp parallel for for (curve_size_t k = 0; k < j; ++k) { @@ -95,6 +101,8 @@ class Subcurve_Shortcut_Graph { } } + if (Config::verbose) std::cout << "SIMPL: backwards constructing simplification" << std::endl; + curve_size_t ell = l; result.push_back(curve.back()); diff --git a/setup.py b/setup.py index 63fa3a5..b18e106 100644 --- a/setup.py +++ b/setup.py @@ -74,7 +74,7 @@ def build_extension(self, ext): setup( name='Fred-Frechet', - version='1.8.4', + version='1.9.3', author='Dennis Rohde', author_email='dennis.rohde@tu-dortmund.de', description='A fast, scalable and light-weight C++ Fréchet distance library, exposed to python and focused on (k,l)-clustering of polygonal curves.', diff --git a/src/clustering.cpp b/src/clustering.cpp new file mode 100644 index 0000000..d859b3f --- /dev/null +++ b/src/clustering.cpp @@ -0,0 +1,187 @@ +/* +Copyright 2021 Dennis Rohde + +Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +#include "clustering.hpp" + +namespace Clustering { + +Distance_Matrix distances; +Curves simplifications; + +void Distance_Matrix::print() const { + for (const auto &row : *this) { + for (const auto elem : row) { + std::cout << elem << " "; + } + std::cout << std::endl; + } +} + +Clustering_Result kl_cluster(const curve_number_t num_centers, const curve_size_t ell, const Curves &in, + const bool local_search = false, const bool consecutive_call = false, const bool random_start_center = true, const bool fast_simplification = false) { + + const auto start = std::chrono::high_resolution_clock::now(); + Clustering_Result result; + + if (in.empty()) return result; + + if (not consecutive_call) { + distances = Distance_Matrix(in.size(), in.size()); + simplifications = Curves(in.number(), ell, in.dimensions()); + } else { + if (distances.empty()) { + distances = Distance_Matrix(in.size(), in.size()); + simplifications = Curves(in.number(), ell, in.dimensions()); + } + if (distances.size() != in.size()) { + std::cerr << "WARNING: you have tried to use 'consecutive_call = true' with different input; ignoring!" << std::endl; + distances = Distance_Matrix(in.size(), in.size()); + simplifications = Curves(in.number(), ell, in.dimensions()); + } + } + + std::vector centers; + + auto simplify = [&](const curve_number_t i) { + if (fast_simplification) { + if (Config::verbose) std::cout << "KL_CLUST: computing approximate vertex restricted minimum error simplification" << std::endl; + auto simplified_curve = Simplification::approximate_minimum_error_simplification(const_cast(in[i]), ell); + simplified_curve.set_name("Simplification of " + in[i].get_name()); + return simplified_curve; + } else { + if (Config::verbose) std::cout << "KL_CLUST: computing exact vertex restricted minimum error simplification" << std::endl; + Simplification::Subcurve_Shortcut_Graph graph(const_cast(in[i])); + auto simplified_curve = graph.minimum_error_simplification(ell); + simplified_curve.set_name("Simplification of " + in[i].get_name()); + return simplified_curve; + } + }; + + if (Config::verbose) std::cout << "KL_CLUST: computing first center" << std::endl; + if (random_start_center) { + Random::Uniform_Random_Generator ugen; + const curve_number_t r = std::floor(simplifications.size() * ugen.get()); + if (simplifications[r].empty()) { + if (Config::verbose) std::cout << "KL_CLUST: computing simplification of curve " << r << std::endl; + simplifications[r] = simplify(r); + } + centers.push_back(r); + } else { + if (simplifications[0].empty()) { + if (Config::verbose) std::cout << "KL_CLUST: computing simplification of curve 0" << std::endl; + simplifications[0] = simplify(0); + } + centers.push_back(0); + } + if (Config::verbose) std::cout << "KL_CLUST: first center is " << centers[0] << std::endl; + + distance_t curr_maxdist = 0; + curve_number_t curr_maxcurve = 0; + distance_t curr_curve_cost; + + if (Config::verbose) std::cout << "KL_CLUST: computing remaining centers" << std::endl; + { + // remaining centers + for (curve_number_t i = 2; i <= num_centers; ++i) { + + curr_maxdist = 0; + curr_maxcurve = 0; + { + + // all curves + for (curve_number_t j = 0; j < in.size(); ++j) { + + if (Config::verbose) std::cout << "KL_CLUST: computing cost of curve " << j << std::endl; + curr_curve_cost = _curve_cost(j, in, simplifications, centers, distances); + + if (curr_curve_cost > curr_maxdist) { + curr_maxdist = curr_curve_cost; + curr_maxcurve = j; + } + + } + if (Config::verbose) std::cout << "KL_CLUST: center " << i << " is curve " << curr_maxcurve << std::endl; + + if (simplifications[curr_maxcurve].empty()) { + if (Config::verbose) std::cout << "KL_CLUST: computing simplification of " << curr_maxcurve << std::endl; + simplifications[curr_maxcurve] = simplify(curr_maxcurve); + } + centers.push_back(curr_maxcurve); + } + } + } + + if (local_search) { + + if (Config::verbose) std::cout << "KL_CLUST: computing k-median cost" << std::endl; + distance_t cost = _center_cost_sum(in, simplifications, centers, distances); + distance_t approxcost = cost; + distance_t curr_cost = cost; + distance_t gamma = 1/(10 * num_centers); + bool found = true; + auto curr_centers = centers; + + if (Config::verbose) std::cout << "KL_CLUST: starting local search" << std::endl; + // try to improve current solution + while (found) { + found = false; + + // go through all centers + for (curve_number_t i = 0; i < num_centers; ++i) { + curr_centers = centers; + + // check if there is a better center among all other curves + for (curve_number_t j = 0; j < simplifications.size(); ++j) { + // continue if curve is already part of center set + if (std::find(curr_centers.begin(), curr_centers.end(), j) != curr_centers.end()) continue; + + if (Config::verbose) std::cout << "KL_CLUST: substituting curve " << curr_centers[i] << " for curve " << j << " as center" << std::endl; + // swap + if (simplifications[j].empty()) { + if (Config::verbose) std::cout << "KL_CLUST: computing simplification of curve " << j << std::endl; + simplifications[j] = simplify(j); + } + curr_centers[i] = j; + // new cost + if (Config::verbose) std::cout << "KL_CLUST: updating k-median cost" << std::endl; + curr_cost = _center_cost_sum(in, simplifications, curr_centers, distances); + // check if improvement is done + if (curr_cost < cost - gamma * approxcost) { + if (Config::verbose) std::cout << "KL_CLUST: cost did improve" << std::endl; + cost = curr_cost; + centers = curr_centers; + found = true; + } else if (Config::verbose) std::cout << "KL_CLUST: cost did not improve" << std::endl; + } + } + } + curr_maxdist = cost; + } + + Curves simpl_centers; + for (const auto center: centers) simpl_centers.push_back(simplifications[center]); + + auto end = std::chrono::high_resolution_clock::now(); + result.centers = simpl_centers; + result.set_center_indices(centers); + result.value = curr_maxdist; + result.running_time = std::chrono::duration_cast(end - start).count(); + return result; +} + +Clustering_Result kl_center(const curve_number_t num_centers, const curve_size_t ell, const Curves &in, const bool consecutive_call, const bool random_start_center, const bool fast_simplification) { + return kl_cluster(num_centers, ell, in, false, consecutive_call, random_start_center, fast_simplification); +} + +Clustering_Result kl_median(const curve_number_t num_centers, const curve_size_t ell, const Curves &in, const bool consecutive_call, const bool fast_simplification) { + return kl_cluster(num_centers, ell, in, true, consecutive_call, true, fast_simplification); +} + +} diff --git a/src/config.cpp b/src/config.cpp new file mode 100644 index 0000000..ccb2344 --- /dev/null +++ b/src/config.cpp @@ -0,0 +1,17 @@ +/* +Copyright 2021 Dennis Rohde + +Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +namespace Config { + + bool verbose = false; + bool mp_dynamic = true; + int number_threads = -1; + +} diff --git a/src/dynamic_time_warping.cpp b/src/dynamic_time_warping.cpp index 7a784ca..cc094f1 100644 --- a/src/dynamic_time_warping.cpp +++ b/src/dynamic_time_warping.cpp @@ -26,23 +26,31 @@ std::string Distance::repr() const { Distance distance(const Curve &curve1, const Curve &curve2) { Distance result; - auto start = std::chrono::high_resolution_clock::now(); + const auto start = std::chrono::high_resolution_clock::now(); + std::vector> a(curve1.complexity() + 1, std::vector(curve2.complexity() + 1, std::numeric_limits::infinity())); + std::vector> dists(curve1.complexity(), std::vector(curve2.complexity())); + + #pragma omp parallel for collapse(2) + for (curve_size_t i = 0; i < curve1.complexity(); ++i) { + for (curve_size_t j = 0; j < curve2.complexity(); ++j) { + dists[i][j] = curve1[i].dist(curve2[j]); + } + } + a[0][0] = 0; - #pragma omp parallel for ordered collapse(2) for (curve_size_t i = 1; i <= curve1.complexity(); ++i) { for (curve_size_t j = 1; j <= curve2.complexity(); ++j) { - a[i][j] = std::sqrt(curve1[i-1].dist_sqr(curve2[j-1])); - #pragma omp ordered + a[i][j] = dists[i-1][j-1]; a[i][j] += std::min(std::min(a[i-1][j], a[i][j-1]), a[i-1][j-1]); } } auto value = a[curve1.complexity()][curve2.complexity()]; - auto end = std::chrono::high_resolution_clock::now(); + + const auto end = std::chrono::high_resolution_clock::now(); result.time = std::chrono::duration_cast(end - start).count(); result.value = value; return result; - } } // end namespace Discrete diff --git a/src/frechet.cpp b/src/frechet.cpp index 65cd6ba..558db80 100644 --- a/src/frechet.cpp +++ b/src/frechet.cpp @@ -41,15 +41,13 @@ Distance distance(const Curve &curve1, const Curve &curve2) { return result; } - auto start = std::chrono::high_resolution_clock::now(); + const auto start = std::chrono::high_resolution_clock::now(); + if (Config::verbose) std::cout << "CFD: computing lower bound" << std::endl; const distance_t lb = _projective_lower_bound(curve1, curve2); + if (Config::verbose) std::cout << "CFD: computing upper bound" << std::endl; const distance_t ub = _greedy_upper_bound(curve1, curve2); - auto end = std::chrono::high_resolution_clock::now(); - - #if DEBUG - std::cout << "narrowed to [" << lb << ", " << ub.value << "]" << std::endl; - #endif - + const auto end = std::chrono::high_resolution_clock::now(); + auto dist = _distance(curve1, curve2, ub, lb); dist.time_bounds = std::chrono::duration_cast(end - start).count(); @@ -58,14 +56,16 @@ Distance distance(const Curve &curve1, const Curve &curve2) { Distance _distance(const Curve &curve1, const Curve &curve2, distance_t ub, distance_t lb) { Distance result; - auto start = std::chrono::high_resolution_clock::now(); + const auto start = std::chrono::high_resolution_clock::now(); distance_t split = (ub + lb)/2; - const distance_t p_error = lb > 0 ? lb * error / 100 : std::numeric_limits::epsilon(); + const distance_t p_error = lb * error / 100 > std::numeric_limits::epsilon() ? lb * error / 100 : std::numeric_limits::epsilon(); std::size_t number_searches = 0; if (ub - lb > p_error) { - auto infty = std::numeric_limits::infinity(); + if (Config::verbose) std::cout << "CFD: binary search using FSD" << std::endl; + + const auto infty = std::numeric_limits::infinity(); std::vector> reachable1(curve1.complexity() - 1, std::vector(curve2.complexity(), infty)); std::vector> reachable2(curve1.complexity(), std::vector(curve2.complexity() - 1, infty)); @@ -88,19 +88,12 @@ Distance _distance(const Curve &curve1, const Curve &curve2, distance_t ub, dist else { lb = split; } - #if DEBUG - std::cout << "narrowed to [" << lb << ", " << ub << "]" << std::endl; - #endif + if (Config::verbose) std::cout << "CFD: narrowed distance to to [" << lb << ", " << ub << "]" << std::endl; } } - distance_t value = (ub + lb)/2.; - const distance_t p_error_exp = std::log10(p_error); - const distance_t round_exp = p_error_exp >= 0 ? 0 : static_cast(-p_error_exp); - const distance_t round_fact = std::pow(10, round_exp); - if (round) value = std::round(value * round_fact) / round_fact; - auto end = std::chrono::high_resolution_clock::now(); - result.value = value; + const auto end = std::chrono::high_resolution_clock::now(); + result.value = (ub + lb)/2.; result.time_searches = std::chrono::duration_cast(end - start).count(); result.number_searches = number_searches; return result; @@ -110,11 +103,13 @@ bool _less_than_or_equal(const distance_t distance, Curve const& curve1, Curve c std::vector> &reachable1, std::vector> &reachable2, std::vector> &free_intervals1, std::vector> &free_intervals2) { + if (Config::verbose) std::cout << "CFD: constructing FSD" << std::endl; const distance_t dist_sqr = distance * distance; const auto infty = std::numeric_limits::infinity(); const curve_size_t n1 = curve1.complexity(); const curve_size_t n2 = curve2.complexity(); + if (Config::verbose) std::cout << "CFD: resetting old FSD" << std::endl; #pragma omp parallel for collapse(2) if (n1 * n2 > 1000) for (curve_size_t i = 0; i < n1; ++i) { @@ -126,6 +121,8 @@ bool _less_than_or_equal(const distance_t distance, Curve const& curve1, Curve c } } + if (Config::verbose) std::cout << "CFD: FSD borders" << std::endl; + for (curve_size_t i = 0; i < n1 - 1; ++i) { reachable1[i][0] = 0; if (curve2[0].dist_sqr(curve1[i+1]) > dist_sqr) break; @@ -136,6 +133,8 @@ bool _less_than_or_equal(const distance_t distance, Curve const& curve1, Curve c if (curve1[0].dist_sqr(curve2[j+1]) > dist_sqr) break; } + if (Config::verbose) std::cout << "CFD: computing free space" << std::endl; + #pragma omp target teams distribute parallel for collapse(2) if (n1 * n2 > 1000) for (curve_size_t i = 0; i < n1; ++i) { for (curve_size_t j = 0; j < n2; ++j) { @@ -148,6 +147,8 @@ bool _less_than_or_equal(const distance_t distance, Curve const& curve1, Curve c } } + if (Config::verbose) std::cout << "CFD: computing reachable space" << std::endl; + for (curve_size_t i = 0; i < n1; ++i) { for (curve_size_t j = 0; j < n2; ++j) { if ((i < n1 - 1) and (j > 0)) { @@ -251,32 +252,39 @@ std::string Distance::repr() const { Distance distance(const Curve &curve1, const Curve &curve2) { Distance result; - auto start = std::chrono::high_resolution_clock::now(); - std::vector> a(curve1.complexity(), std::vector(curve2.complexity(), -1)); - auto value = std::sqrt(_dp(a, curve1.complexity() - 1, curve2.complexity() - 1, curve1, curve2)); + const auto start = std::chrono::high_resolution_clock::now(); + + std::vector> a(curve1.complexity(), std::vector(curve2.complexity())); + std::vector> dists(curve1.complexity(), std::vector(curve2.complexity())); + + #pragma omp parallel for collapse(2) + for (curve_size_t i = 0; i < curve1.complexity(); ++i) { + for (curve_size_t j = 0; j < curve2.complexity(); ++j) { + dists[i][j] = curve1[i].dist_sqr(curve2[j]); + } + } + + for (curve_size_t i = 0; i < curve1.complexity(); ++i) { + for (curve_size_t j = 0; j < curve2.complexity(); ++j) { + if (i == 0 and j == 0) a[i][j] = dists[i][j]; + else if (i == 0 and j > 0) a[i][j] = std::max(a[i][j-1], dists[i][j]); + else if (i > 0 and j == 0) a[i][j] = std::max(a[i-1][j], dists[i][j]); + else { + a[i][j] = std::max(std::min(std::min(a[i-1][j], a[i-1][j-1]), a[i][j-1]), dists[i][j]); + } + } + } + + const auto value = std::sqrt(a[curve1.complexity() - 1][curve2.complexity() - 1]); + auto end = std::chrono::high_resolution_clock::now(); + result.time = std::chrono::duration_cast(end - start).count(); result.value = value; return result; } -distance_t _dp(std::vector> &a, const curve_size_t i, const curve_size_t j, const Curve &curve1, const Curve &curve2) { - if (a[i][j] > -1) return a[i][j]; - else if (i == 0 and j == 0) return curve1[i].dist_sqr(curve2[j]); - else if (i > 0 and j == 0) return std::max(_dp(a, i-1, 0, curve1, curve2), curve1[i].dist_sqr(curve2[j])); - else if (i == 0 and j > 0) return std::max(_dp(a, 0, j-1, curve1, curve2), curve1[i].dist_sqr(curve2[j])); - else { - a[i][j] = std::max( - std::min( - std::min(_dp(a, i-1, j, curve1, curve2), - _dp(a, i-1, j-1, curve1, curve2)), - _dp(a, i, j-1, curve1, curve2)), - curve1[i].dist_sqr(curve2[j])); - } - return a[i][j]; -} - } // end namespace Discrete } // end namespace Frechet diff --git a/src/fred_python_wrapper.cpp b/src/fred_python_wrapper.cpp index 6756378..fda78c4 100644 --- a/src/fred_python_wrapper.cpp +++ b/src/fred_python_wrapper.cpp @@ -11,6 +11,7 @@ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLI #include #include +#include "config.hpp" #include "curve.hpp" #include "point.hpp" #include "frechet.hpp" @@ -27,24 +28,6 @@ namespace fc = Frechet::Continuous; namespace fd = Frechet::Discrete; namespace ddtw = Dynamic_Time_Warping::Discrete; -const distance_t default_error = 1; - -void set_frechet_error(const distance_t error) { - fc::error = error; -} - -void set_frechet_rounding(const bool round) { - fc::round = round; -} - -distance_t get_frechet_error() { - return fc::error; -} - -bool get_frechet_rounding() { - return fc::round; -} - Curve minimum_error_simplification(const Curve &curve, const curve_size_t l) { Simplification::Subcurve_Shortcut_Graph graph(const_cast(curve)); auto scurve = graph.minimum_error_simplification(l); @@ -59,19 +42,32 @@ Curve approximate_minimum_link_simplification(const Curve &curve, const distance } Curve approximate_minimum_error_simplification(const Curve &curve, const curve_size_t ell) { + if (Config::verbose) std::cout << "ASIMPL: computing simplification" << std::endl; auto scurve = Simplification::approximate_minimum_error_simplification(curve, ell); scurve.set_name("Simplification of " + curve.get_name()); return scurve; } -void set_number_threads(std::uint64_t number) { - omp_set_dynamic(0); - omp_set_num_threads(number); -} - PYBIND11_MODULE(backend, m) { - m.attr("default_error_continuous_frechet") = default_error; + m.def("set_maximum_number_threads", set_number_threads); + + py::class_(m, "Config") + .def(py::init<>()) + .def_property("continuous_frechet_error", [&](Config::Config&) { return fc::error; }, [&](Config::Config&, const bool error) { fc::error = error; }) + .def_property("verbose", [&](Config::Config&) { return &Config::verbose; }, [&](Config::Config&, const bool verbose) { Config::verbose = verbose; }) + .def_property("number_threads", [&](Config::Config&){ return &Config::number_threads; }, [&](Config::Config&, const int number_threads) { + if (number_threads <= 0) { + Config::number_threads = -1; + Config::mp_dynamic = true; + } else { + Config::number_threads = number_threads; + Config::mp_dynamic = false; + } + omp_set_num_threads(Config::number_threads); + omp_set_dynamic(Config::mp_dynamic); + }) + ; py::class_(m, "Point") .def(py::init()) @@ -126,7 +122,6 @@ PYBIND11_MODULE(backend, m) { ; py::class_(m, "Continuous_Frechet_Distance") - .def(py::init<>()) .def_readwrite("time_searches", &fc::Distance::time_searches) .def_readwrite("time_bounds", &fc::Distance::time_bounds) .def_readwrite("number_searches", &fc::Distance::number_searches) @@ -135,36 +130,28 @@ PYBIND11_MODULE(backend, m) { ; py::class_(m, "Discrete_Frechet_Distance") - .def(py::init<>()) .def_readwrite("time", &fd::Distance::time) .def_readwrite("value", &fd::Distance::value) .def("__repr__", &fd::Distance::repr) ; py::class_(m, "Discrete_Dynamic_Time_Warping_Distance") - .def(py::init<>()) .def_readwrite("time", &ddtw::Distance::time) .def_readwrite("value", &ddtw::Distance::value) .def("__repr__", &ddtw::Distance::repr) ; - py::class_(m, "Distance_Matrix") - .def(py::init<>()) - ; - py::class_(m, "Clustering_Result") - .def(py::init<>()) .def_readwrite("value", &Clustering::Clustering_Result::value) .def_readwrite("time", &Clustering::Clustering_Result::running_time) .def_readwrite("assignment", &Clustering::Clustering_Result::assignment) .def("__getitem__", &Clustering::Clustering_Result::get, py::return_value_policy::reference) .def("__len__", &Clustering::Clustering_Result::size) .def("__iter__", [](Clustering::Clustering_Result &v) { return py::make_iterator(v.cbegin(), v.cend()); }, py::keep_alive<0, 1>()) - .def("compute_assignment", &Clustering::Clustering_Result::compute_assignment) + .def("compute_assignment", &Clustering::Clustering_Result::compute_assignment, py::arg("curves"), py::arg("consecutive_call") = false) ; py::class_(m, "Cluster_Assignment") - .def(py::init<>()) .def("__len__", &Clustering::Cluster_Assignment::size) .def("count", &Clustering::Cluster_Assignment::count) .def("get", &Clustering::Cluster_Assignment::get) @@ -174,11 +161,6 @@ PYBIND11_MODULE(backend, m) { .def(py::init()) ; - m.def("set_continuous_frechet_error", &set_frechet_error); - m.def("set_continuous_frechet_rounding", &set_frechet_rounding); - m.def("get_continuous_frechet_error", &get_frechet_error); - m.def("get_continuous_frechet_rounding", &get_frechet_rounding); - m.def("continuous_frechet", &fc::distance); m.def("discrete_frechet", &fd::distance); m.def("discrete_dynamic_time_warping", &ddtw::distance); @@ -187,17 +169,9 @@ PYBIND11_MODULE(backend, m) { m.def("approximate_minimum_link_simplification", &approximate_minimum_link_simplification); m.def("approximate_minimum_error_simplification", &approximate_minimum_error_simplification); - m.def("dimension_reduction", &JLTransform::transform_naive, py::arg("in") = Curves(), py::arg("epsilon") = 0.5, py::arg("empirical_constant") = true); + m.def("dimension_reduction", &JLTransform::transform_naive, py::arg("curves"), py::arg("epsilon") = 0.5, py::arg("empirical_constant") = true); + + m.def("discrete_klcenter", &Clustering::kl_center, py::arg("k") = 2, py::arg("l") = 2, py::arg("curves"), py::arg("consecutive_call") = false, py::arg("random_start_center") = true, py::arg("fast_simplification") = false); + m.def("discrete_klmedian", &Clustering::kl_median, py::arg("k") = 2, py::arg("l") = 2, py::arg("curves"), py::arg("consecutive_call") = false, py::arg("fast_simplification") = false); - m.def("discrete_klcenter", &Clustering::kl_center, py::arg("num_centers") = 1, py::arg("ell") = 2, py::arg("in") = Curves(), py::arg("distances") = Clustering::Distance_Matrix(), py::arg("random_start_center") = true, py::arg("fast_simplification") = false); - m.def("discrete_klmedian", &Clustering::kl_median, py::arg("num_centers") = 1, py::arg("ell") = 2, py::arg("in") = Curves(), py::arg("distances") = Clustering::Distance_Matrix(), py::arg("fast_simplification") = false); - - // these are experimental - //m.def("two_two_dtw_one_two_median", &Clustering::two_two_dtw_one_two_median); - //m.def("two_two_dtw_one_two_median_exact", &Clustering::two_two_dtw_one_two_median_exact); - //def("discrete_onemedian_sampling", onemedian_sampling, onemedian_sampling_overloads()); - //def("discrete_onemedian_exhaustive", onemedian_exhaustive, onemedian_exhaustive_overloads()); - //def("onemedian_coreset", onemedian_coreset, onemedian_coreset_overloads()); - - m.def("set_maximum_number_threads", set_number_threads); } diff --git a/src/simplification.cpp b/src/simplification.cpp index 356f35e..1a075c4 100644 --- a/src/simplification.cpp +++ b/src/simplification.cpp @@ -11,6 +11,7 @@ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLI #include "simplification.hpp" Curve Simplification::approximate_minimum_link_simplification(const Curve &pcurve, const distance_t epsilon) { + if (Config::verbose) std::cout << "ASIMPL: computing approximate minimum link simplification" << std::endl; Curve &curve = const_cast(pcurve); const curve_size_t complexity = curve.complexity(); @@ -25,7 +26,10 @@ Curve Simplification::approximate_minimum_link_simplification(const Curve &pcurv segment[0] = curve[i]; j = 0; - + + if (Config::verbose) std::cout << "ASIMPL: computing maximum shortcut starting at " << i << std::endl; + + if (Config::verbose) std::cout << "ASIMPL: exponential error search" << std::endl; while (distance <= epsilon) { ++j; @@ -42,7 +46,8 @@ Curve Simplification::approximate_minimum_link_simplification(const Curve &pcurv low = j == 1 ? 0 : std::pow(2, j - 1); high = std::min(static_cast(std::pow(2, j)), complexity - i - 1); - + + if (Config::verbose) std::cout << "ASIMPL: binary error search" << std::endl; while (low < high) { mid = std::ceil((low + high) / 2.); @@ -55,7 +60,7 @@ Curve Simplification::approximate_minimum_link_simplification(const Curve &pcurv if (distance <= epsilon) low = mid; else high = mid - 1; } - + if (Config::verbose) std::cout << "ASIMPL: shortcutting from " << i << " to " << i+low << std::endl; i += low; curve.reset_subcurve(); simplification.push_back(curve[i]); @@ -64,6 +69,7 @@ Curve Simplification::approximate_minimum_link_simplification(const Curve &pcurv } Curve Simplification::approximate_minimum_error_simplification(const Curve &curve, const curve_size_t ell) { + if (Config::verbose) std::cout << "ASIMPL: computing approximate minimum error simplification" << std::endl; Curve simplification(curve.dimensions()), segment(2, curve.dimensions()); segment[0] = curve.front(); @@ -75,11 +81,13 @@ Curve Simplification::approximate_minimum_error_simplification(const Curve &curv Curve new_simplification = Simplification::approximate_minimum_link_simplification(curve, max_distance); + if (Config::verbose) std::cout << "ASIMPL: computing upper bound for error by exponential search" << std::endl; while (new_simplification.complexity() > ell) { max_distance *= 2.; new_simplification = Simplification::approximate_minimum_link_simplification(curve, max_distance); } + if (Config::verbose) std::cout << "ASIMPL: binary search using upper bound" << std::endl; while (max_distance - min_distance > min_distance * Frechet::Continuous::error / 100) { mid_distance = (min_distance + max_distance) / 2.; @@ -91,7 +99,7 @@ Curve Simplification::approximate_minimum_error_simplification(const Curve &curv max_distance = mid_distance; } } - + if (Config::verbose) std::cout << "ASIMPL: backwards construction of simplification" << std::endl; curve_size_t diff = ell - simplification.complexity(); while (diff > 0) { simplification.push_back(simplification.back()); diff --git a/test/test.py b/test/test.py index 6cbb0eb..178e613 100644 --- a/test/test.py +++ b/test/test.py @@ -5,30 +5,44 @@ class TestContinuousFrechet(unittest.TestCase): def test_zigzag1d(self): - a = fred.Curve(np.array([0.0, 1.0, 0.0, 1.0])) - b = fred.Curve(np.array([0.0, 0.75, 0.25, 1.0])) - c = fred.Curve(np.array([0.0, 1.0])) + a = fred.Curve([0.0, 1.0, 0.0, 1.0]) + b = fred.Curve([0.0, 0.75, 0.25, 1.0]) + c = fred.Curve([0.0, 1.0]) self.assertEqual(fred.continuous_frechet(a, b).value, 0.25) self.assertEqual(fred.continuous_frechet(a, c).value, 0.5) def test_longsegment(self): - a = fred.Curve(np.array([0.0,500.0e3, 1.0e6])) - b = fred.Curve(np.array([0.0, 1.0e6])) + a = fred.Curve([0.0,500.0e3, 1.0e6]) + b = fred.Curve([0.0, 1.0e6]) self.assertEqual(fred.continuous_frechet(a, b).value, 0.0) class TestDiscreteFrechet(unittest.TestCase): def test_zigzag1d(self): - a = fred.Curve(np.array([0.0, 1.0, 0.0, 1.0])) - b = fred.Curve(np.array([0.0, 0.75, 0.25, 1.0])) - c = fred.Curve(np.array([0.0, 1.0])) + a = fred.Curve([0.0, 1.0, 0.0, 1.0]) + b = fred.Curve([0.0, 0.75, 0.25, 1.0]) + c = fred.Curve([0.0, 1.0]) self.assertEqual(fred.discrete_frechet(a, b).value, 0.25) self.assertEqual(fred.discrete_frechet(a, c).value, 1.0) def test_longsegment(self): - a = fred.Curve(np.array([0.0,500.0e3, 1.0e6])) - b = fred.Curve(np.array([0.0, 1.0e6])) + a = fred.Curve([0.0,500.0e3, 1.0e6]) + b = fred.Curve([0.0, 1.0e6]) self.assertEqual(fred.discrete_frechet(a, b).value, 500000.0) + +class TestDiscreteDynamicTimeWarping(unittest.TestCase): + + def test_zigzag1d(self): + a = fred.Curve([0.0, 1.0, 0.0, 1.0]) + b = fred.Curve([0.0, 0.75, 0.25, 1.0]) + c = fred.Curve([0.0, 1.0]) + self.assertEqual(fred.discrete_dynamic_time_warping(a, b).value, 0.5) + self.assertEqual(fred.discrete_dynamic_time_warping(a, c).value, 1.0) + + def test_longsegment(self): + a = fred.Curve([0.0,500.0e3, 1.0e6]) + b = fred.Curve([0.0, 1.0e6]) + self.assertEqual(fred.discrete_dynamic_time_warping(a, b).value, 500000.0) if __name__ == '__main__': unittest.main()