Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Upgrade datasketches lib from 4.1.0 to 5.0.2 #713

Open
wants to merge 3 commits into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions 3rd/datasketches/common/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -37,9 +37,10 @@ target_sources(common
${CMAKE_CURRENT_SOURCE_DIR}/include/conditional_back_inserter.hpp
${CMAKE_CURRENT_SOURCE_DIR}/include/conditional_forward.hpp
${CMAKE_CURRENT_SOURCE_DIR}/include/ceiling_power_of_2.hpp
${CMAKE_CURRENT_SOURCE_DIR}/include/kolmogorov_smirnov.hpp
${CMAKE_CURRENT_SOURCE_DIR}/include/kolmogorov_smirnov.hpp
${CMAKE_CURRENT_SOURCE_DIR}/include/kolmogorov_smirnov_impl.hpp
${CMAKE_CURRENT_SOURCE_DIR}/include/quantiles_sorted_view.hpp
${CMAKE_CURRENT_SOURCE_DIR}/include/quantiles_sorted_view_impl.hpp
${CMAKE_CURRENT_SOURCE_DIR}/include/version.hpp.in
${CMAKE_CURRENT_SOURCE_DIR}/include/optional.hpp
${CMAKE_CURRENT_SOURCE_DIR}/include/version.hpp.in
)
19 changes: 11 additions & 8 deletions 3rd/datasketches/common/include/common_defs.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,27 +28,30 @@
#include <chrono>
#include <thread>

/// DataSketches namespace
namespace datasketches {

static const uint64_t DEFAULT_SEED = 9001;

enum resize_factor { X1 = 0, X2, X4, X8 };

template<typename A> using AllocChar = typename std::allocator_traits<A>::template rebind_alloc<char>;
template<typename A> using string = std::basic_string<char, std::char_traits<char>, AllocChar<A>>;

// thread-safe random bit
static thread_local std::independent_bits_engine<std::mt19937, 1, uint32_t>
random_bit(static_cast<uint32_t>(std::chrono::system_clock::now().time_since_epoch().count()
+ std::hash<std::thread::id>{}(std::this_thread::get_id())));
template<typename A> using string = std::basic_string<char, std::char_traits<char>, typename std::allocator_traits<A>::template rebind_alloc<char>>;

// common random declarations
namespace random_utils {
static std::random_device rd; // possibly unsafe in MinGW with GCC < 9.2
static thread_local std::mt19937_64 rand(rd());
static thread_local std::uniform_real_distribution<> next_double(0.0, 1.0);
}

// thread-safe random bit
static thread_local std::independent_bits_engine<std::mt19937, 1, uint32_t>
random_bit(static_cast<uint32_t>(std::chrono::system_clock::now().time_since_epoch().count()
+ std::hash<std::thread::id>{}(std::this_thread::get_id())));

inline void override_seed(uint64_t s) {
rand.seed(s);
}
}

// utility function to hide unused compiler warning
// usually has no additional cost
Expand Down
2 changes: 0 additions & 2 deletions 3rd/datasketches/common/include/count_zeros.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,6 @@

#include <cstdint>

#include <stdio.h>

namespace datasketches {

static const uint8_t byte_leading_zeros_table[256] = {
Expand Down
15 changes: 9 additions & 6 deletions 3rd/datasketches/common/include/kolmogorov_smirnov.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,13 +22,16 @@

namespace datasketches {

/**
* Kolmogorov-Smirnov test for KLL or Quantiles sketches
*/
class kolmogorov_smirnov {
public:
/**
* Computes the raw delta area between two quantile sketches for the Kolmogorov-Smirnov Test.
* Will work for a type-matched pair of KLL or Quantiles sketches of the same parameterized type T.
* @param sketch1 KLL sketch 1
* @param sketch2 KLL sketch 2
* @param sketch1 sketch 1
* @param sketch2 sketch 2
* @return the raw delta between the two quantile sketches
*/
template<typename Sketch>
Expand All @@ -39,8 +42,8 @@ class kolmogorov_smirnov {
* Adjusts the computed threshold by the error epsilons of the two given sketches.
* See <a href="https://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test">Kolmogorov–Smirnov Test</a>
* Will work for a type-matched pair of KLL or Quantiles sketches of the same parameterized type T.
* @param sketch1 KLL sketch 1
* @param sketch2 KLL sketch 2
* @param sketch1 sketch 1
* @param sketch2 sketch 2
* @param p Target p-value. Typically .001 to .1, e.g., .05.
* @return the adjusted threshold to be compared with the raw delta
*/
Expand All @@ -52,8 +55,8 @@ class kolmogorov_smirnov {
* Will work for a type-matched pair of KLL or Quantiles sketches of the same parameterized type T.
* Note: if the given sketches have insufficient data or if the sketch sizes are too small,
* this will return false.
* @param sketch1 KLL sketch 1
* @param sketch2 KLL sketch 2
* @param sketch1 sketch 1
* @param sketch2 sketch 2
* @param p Target p-value. Typically .001 to .1, e.g., .05.
* @return Boolean indicating whether we can reject the null hypothesis (that the sketches
* reflect the same underlying distribution) using the provided p-value.
Expand Down
148 changes: 148 additions & 0 deletions 3rd/datasketches/common/include/optional.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,148 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

#ifndef _OPTIONAL_HPP_
#define _OPTIONAL_HPP_

// This is a simplistic substitute for std::optional until we require C++17

#if (__cplusplus >= 201703L || (defined(_MSVC_LANG) && _MSVC_LANG >= 201703L))
#include <optional>
using std::optional;
#else

#include <new>
#include <type_traits>
#include <utility>

namespace datasketches {

/**
 * Minimal stand-in for std::optional for pre-C++17 builds.
 *
 * Holds either nothing or exactly one value of type T, stored in-place inside
 * an anonymous union so that T is only constructed when a value is present.
 * Accessors (operator*, operator->) perform no check: the caller must make
 * sure a value is present (see operator bool) before dereferencing.
 */
template<typename T>
class optional {
public:

  /// Creates an empty optional (no contained value).
  optional() noexcept: initialized_(false) {}

  /// Creates an optional holding a copy of the given value.
  optional(const T& value) noexcept(std::is_nothrow_copy_constructible<T>::value): initialized_(false) {
    new (&value_) T(value);
    initialized_ = true;
  }

  /// Creates an optional holding a value move-constructed from the given one.
  optional(T&& value) noexcept(std::is_nothrow_move_constructible<T>::value): initialized_(false) {
    new (&value_) T(std::move(value));
    initialized_ = true;
  }

  /// Conversion from an optional of a compatible type TT (T must be constructible from TT).
  template<typename TT>
  optional(const optional<TT>& other) noexcept(std::is_nothrow_constructible<T, TT>::value): initialized_(false) {
    if (other.initialized_) {
      new (&value_) T(other.value_);
      initialized_ = true;
    }
  }

  /// Copy constructor: copies the contained value if the other optional has one.
  optional(const optional& other) noexcept(std::is_nothrow_copy_constructible<T>::value): initialized_(false) {
    if (other.initialized_) {
      new (&value_) T(other.value_);
      initialized_ = true;
    }
  }

  /// Move constructor: moves the contained value out of the other optional if it has one.
  /// The other optional keeps its (moved-from) value; it is not reset.
  optional(optional&& other) noexcept(std::is_nothrow_move_constructible<T>::value): initialized_(false) {
    if (other.initialized_) {
      new (&value_) T(std::move(other.value_));
      initialized_ = true;
    }
  }

  /// Destroys the contained value, if any.
  ~optional() noexcept(std::is_nothrow_destructible<T>::value) {
    if (initialized_) value_.~T();
  }

  /// @return true if a value is present
  explicit operator bool() const noexcept {
    return initialized_;
  }

  /// Copy assignment: assigns, constructs or destroys the contained value
  /// depending on which of the two optionals currently hold one.
  optional& operator=(const optional& other)
      noexcept(std::is_nothrow_copy_constructible<T>::value && std::is_nothrow_copy_assignable<T>::value) {
    if (initialized_) {
      if (other.initialized_) {
        value_ = other.value_;
      } else {
        reset();
      }
    } else {
      if (other.initialized_) {
        new (&value_) T(other.value_);
        initialized_ = true;
      }
    }
    return *this;
  }

  /// Move assignment: same case analysis as copy assignment, but the value is moved.
  /// The other optional keeps its (moved-from) value; it is not reset.
  optional& operator=(optional&& other)
      noexcept(std::is_nothrow_move_constructible<T>::value && std::is_nothrow_move_assignable<T>::value) {
    if (initialized_) {
      if (other.initialized_) {
        value_ = std::move(other.value_);
      } else {
        reset();
      }
    } else {
      if (other.initialized_) {
        new (&value_) T(std::move(other.value_));
        initialized_ = true;
      }
    }
    return *this;
  }

  /**
   * Constructs the contained value in-place from the given arguments.
   * Any value already held is destroyed first, so its resources are released
   * instead of being overwritten. Arguments are perfectly forwarded, so rvalues
   * are moved and move-only arguments are accepted.
   * If the constructor of T throws, the optional is left empty.
   * @param args arguments forwarded to the constructor of T
   */
  template<typename... Args>
  void emplace(Args&&... args)
      noexcept(std::is_nothrow_constructible<T, Args...>::value && std::is_nothrow_destructible<T>::value) {
    reset(); // also clears initialized_, which keeps the state consistent if construction throws
    new (&value_) T(std::forward<Args>(args)...);
    initialized_ = true;
  }

  // Unchecked access to the contained value; the caller must ensure a value is present.
  T& operator*() & noexcept { return value_; }
  const T& operator*() const & noexcept { return value_; }
  T&& operator*() && noexcept { return std::move(value_); }
  const T&& operator*() const && noexcept { return std::move(value_); }

  // Unchecked member access to the contained value.
  T* operator->() noexcept { return &value_; }
  const T* operator->() const noexcept { return &value_; }

  /// Destroys the contained value, if any, leaving the optional empty.
  void reset() noexcept(std::is_nothrow_destructible<T>::value) {
    if (initialized_) value_.~T();
    initialized_ = false;
  }

private:
  // anonymous union: storage for T whose lifetime is managed manually
  union {
    T value_;
  };
  bool initialized_;

  // for converting constructor
  template<typename TT> friend class optional;
};

} // namespace

#endif // C++17

#endif // _OPTIONAL_HPP_
97 changes: 95 additions & 2 deletions 3rd/datasketches/common/include/quantiles_sorted_view.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,37 +27,129 @@

namespace datasketches {

/**
* Sorted view for quantiles sketches (REQ, KLL and Quantiles)
*/
template<
typename T,
typename Comparator, // strict weak ordering function (see C++ named requirements: Compare)
typename Allocator
>
class quantiles_sorted_view {
public:
/// Entry type
using Entry = typename std::conditional<std::is_arithmetic<T>::value, std::pair<T, uint64_t>, std::pair<const T*, uint64_t>>::type;
using AllocEntry = typename std::allocator_traits<Allocator>::template rebind_alloc<Entry>;
using Container = std::vector<Entry, AllocEntry>;

/// @private
quantiles_sorted_view(uint32_t num, const Comparator& comparator, const Allocator& allocator);

/// @private
template<typename Iterator>
void add(Iterator begin, Iterator end, uint64_t weight);

/// @private
void convert_to_cummulative();

class const_iterator;

/**
* Iterator pointing to the first entry in the view.
* If the view is empty, the returned iterator must not be dereferenced or incremented.
* @return iterator pointing to the first entry
*/
const_iterator begin() const;

/**
* Iterator pointing to the past-the-end entry in the view.
* The past-the-end entry is the hypothetical entry that would follow the last entry.
* It does not point to any entry, and must not be dereferenced or incremented.
* @return iterator pointing to the past-the-end entry
*/
const_iterator end() const;

/// @return size of the view
size_t size() const;

/**
* Returns an approximation to the normalized rank of the given item.
*
* <p>If the view is empty this throws std::runtime_error.
*
* @param item to be ranked
* @param inclusive if true the weight of the given item is included into the rank.
* Otherwise the rank equals the sum of the weights of all items that are less than the given item
* according to the Comparator.
*
* @return an approximate normalized rank of the given item (0 to 1 inclusive)
*/
double get_rank(const T& item, bool inclusive = true) const;

/**
* Quantile return type.
* This is to return quantiles either by value (for arithmetic types) or by const reference (for all other types)
*/
using quantile_return_type = typename std::conditional<std::is_arithmetic<T>::value, T, const T&>::type;

/**
* Returns an item from the sketch that is the best approximation to an item
* from the original stream with the given normalized rank.
*
* <p>If the view is empty this throws std::runtime_error.
*
* @param rank of an item in the hypothetical sorted stream.
* @param inclusive if true, the given rank is considered inclusive (includes weight of an item)
*
* @return approximate quantile associated with the given normalized rank
*/
quantile_return_type get_quantile(double rank, bool inclusive = true) const;

using vector_double = std::vector<double, typename std::allocator_traits<Allocator>::template rebind_alloc<double>>;

/**
* Returns an approximation to the Cumulative Distribution Function (CDF), which is the
* cumulative analog of the PMF, of the input stream given a set of split points (items).
*
* <p>If the view is empty this throws std::runtime_error.
*
* @param split_points an array of <i>m</i> unique, monotonically increasing items
* that divide the input domain into <i>m+1</i> consecutive disjoint intervals.
*
* @param size the number of split points in the array
*
* @param inclusive if true the rank of an item includes its own weight, and therefore
* if the sketch contains items equal to a split point, then in CDF such items are
* included into the interval to the left of split point. Otherwise they are included into
* the interval to the right of split point.
*
* @return an array of m+1 doubles, which are a consecutive approximation to the CDF
* of the input stream given the split_points. The value at array position j of the returned
* CDF array is the sum of the returned values in positions 0 through j of the returned PMF
* array. This can be viewed as array of ranks of the given split points plus one more value
* that is always 1.
*/
vector_double get_CDF(const T* split_points, uint32_t size, bool inclusive = true) const;

/**
* Returns an approximation to the Probability Mass Function (PMF) of the input stream
* given a set of split points (items).
*
* <p>If the view is empty this throws std::runtime_error.
*
* @param split_points an array of <i>m</i> unique, monotonically increasing items
* that divide the input domain into <i>m+1</i> consecutive disjoint intervals (bins).
*
* @param size the number of split points in the array
*
* @param inclusive if true the rank of an item includes its own weight, and therefore
* if the sketch contains items equal to a split point, then in PMF such items are
* included into the interval to the left of split point. Otherwise they are included into the interval
* to the right of split point.
*
* @return an array of m+1 doubles each of which is an approximation
* to the fraction of the input stream items (the mass) that fall into one of those intervals.
*/
vector_double get_PMF(const T* split_points, uint32_t size, bool inclusive = true) const;

private:
Expand Down Expand Up @@ -122,8 +214,6 @@ class quantiles_sorted_view<T, C, A>::const_iterator: public quantiles_sorted_vi
using Base = typename quantiles_sorted_view<T, C, A>::Container::const_iterator;
using value_type = typename std::conditional<std::is_arithmetic<T>::value, typename Base::value_type, std::pair<const T&, const uint64_t>>::type;

const_iterator(const Base& it, const Base& begin): Base(it), begin(begin) {}

template<typename TT = T, typename std::enable_if<std::is_arithmetic<TT>::value, int>::type = 0>
const value_type operator*() const { return Base::operator*(); }

Expand All @@ -147,6 +237,9 @@ class quantiles_sorted_view<T, C, A>::const_iterator: public quantiles_sorted_vi

private:
Base begin;

friend class quantiles_sorted_view<T, C, A>;
const_iterator(const Base& it, const Base& begin): Base(it), begin(begin) {}
};

} /* namespace datasketches */
Expand Down
Loading
Loading