Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Test correction #71

Merged
merged 6 commits into from
Aug 3, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,9 @@ WARNINGS=-Wall -Wextra -Wno-char-subscripts \
-Wpointer-arith -Wwrite-strings -Wdisabled-optimization \
-Wformat -Wcast-align -Wno-unused-function -Wno-unused-parameter \
-pedantic -Wunused-variable\
-Wno-cast-align
-Wno-cast-align -Wno-sign-compare

FLAGS=-O3 -funroll-loops -pipe $(TARGET_FLAG) -Iinclude/sketch -I. -Iinclude/blaze -Ivec -Ipybind11/include -Iinclude -fpic -Wall $(WARNINGS) \
FLAGS=-O2 -funroll-loops -pipe $(TARGET_FLAG) -Iinclude/sketch -I. -Iinclude/blaze -Ivec -Ipybind11/include -Iinclude -fpic -Wall $(WARNINGS) \
-fno-strict-aliasing

CXXFLAGS=$(FLAGS) -Wreorder \
Expand Down Expand Up @@ -49,7 +49,7 @@ STD?= -std=c++17

#CCBIN?=-ccbin=clang++

GPUFLAGS= $(CCBIN) -O3 -std=c++14 -Iinclude -I. -Xcompiler $(TARGET_FLAG) -Xcompiler -fopenmp -Iinclude/sketch \
GPUFLAGS= $(CCBIN) -O2 -std=c++14 -Iinclude -I. -Xcompiler $(TARGET_FLAG) -Xcompiler -fopenmp -Iinclude/sketch \
-lz

INCLUDES=-I`$(PYCONF) --includes` -Ipybind11/include
Expand All @@ -70,7 +70,7 @@ hpython: pybbmh.cpython.so
$(PYTHON) -c "import subprocess;import site; subprocess.check_call('cp pybbmh.py "*`$(PYCONF) --extension-suffix`" %s' % site.getsitepackages()[0], shell=True)"

%.cpython.so: %.cpp
$(CXX) $(UNDEFSTR) $(INCLUDES) -fopenmp -O3 -Wall $(CXXFLAGS) -shared $(STD) -fPIC `python3 -m pybind11 --includes` $< -o $*$(SUF) -lz && \
$(CXX) $(UNDEFSTR) $(INCLUDES) -fopenmp -Wall $(CXXFLAGS) -O2 -shared $(STD) -fPIC `python3 -m pybind11 --includes` $< -o $*$(SUF) -lz && \
ln -fs $*$(SUF) $@

%.o: %.cpp
Expand Down
7 changes: 2 additions & 5 deletions include/sketch/bbmh.h
Original file line number Diff line number Diff line change
Expand Up @@ -361,8 +361,8 @@ struct FinalDivBBitMinHash {
return std::max(0., frac / (1. - b2pow));
}
// Returns the estimated intersection size with `o` divided by this sketch's
// stored cardinality estimate.
//
// With J = jaccard_index(o), the intersection size follows from
// J = I / (|A| + |B| - I), i.e. I = (|A| + |B|) * J / (1 + J), where |A| and |B|
// are the two sketches' est_cardinality_ values.
// NOTE(review): no guard for est_cardinality_ == 0; the division then yields inf/NaN.
double containment_index(const FinalDivBBitMinHash &o) const {
    // Each intermediate is declared exactly once in this scope.
    const double ji = jaccard_index(o);
    const double is = (est_cardinality_ + o.est_cardinality_) * ji / (1. + ji);
    return is / est_cardinality_;
}
double intersection_size(const FinalDivBBitMinHash &o) const {
Expand Down Expand Up @@ -1623,9 +1623,6 @@ FinalBBitMinHash BBitMinHasher<T, Hasher>::finalize(uint32_t b) const {
size_t ndef;
double cest = -1.;
if((ndef = std::count_if(core_.begin(), core_.end(), [](auto x) {return x == detail::default_val<T>();}))) {
#ifndef NDEBUG
std::fprintf(stderr, "requires densification: %zu/%zu need to be densified\n", ndef, core_.size());
#endif
tmp = core_;
cest = detail::harmonic_cardinality_estimate_impl(tmp);
std::replace(tmp.begin(), tmp.end(), std::numeric_limits<T>::max() >> p_, detail::default_val<T>());
Expand Down
4 changes: 2 additions & 2 deletions include/sketch/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -48,8 +48,8 @@
#define sk__xstr__(x) sk__str__(x)
#define SKETCH_SHIFT 8
#define SKETCH_MAJOR 0
#define SKETCH_MINOR 18
#define SKETCH_REVISION 1
#define SKETCH_MINOR 19
#define SKETCH_REVISION 0
#define SKETCH_VERSION_INTEGER ((((SKETCH_MAJOR << SKETCH_SHIFT) | SKETCH_MINOR) << SKETCH_SHIFT) | SKETCH_REVISION)
#define SKETCH_VERSION SKETCH_MAJOR.SKETCH_MINOR##SKETCH_REVISION
#define SKETCH_VERSION_STR sk__xstr__(SKETCH_MAJOR.SKETCH_MINOR.SKETCH_REVISION)
Expand Down
7 changes: 4 additions & 3 deletions include/sketch/hash.h
Original file line number Diff line number Diff line change
Expand Up @@ -737,7 +737,7 @@ struct InvH {
const InverseOperation iop;

InvH(uint64_t seed):
seed_(seed | std::is_same<Operation, op::multiplies<uint64_t>>::value),
seed_(seed | std::is_same<Operation, op::multiplies<uint64_t>>::value), // Ensures that the seed is odd if multiplies, as an odd number is needed for reversibility.
inverse_(multinv::Inverse64<Operation>()(seed_)), op(), iop() {}
// To ensure that it is actually reversible.
INLINE uint64_t inverse(uint64_t hv) const {
Expand Down Expand Up @@ -901,8 +901,9 @@ struct XorMultiply: public FusedReversible<InvXor, InvMul > {
// Fused hash composed of the InvMul and InvAdd stages via FusedReversible.
struct MultiplyAdd: public FusedReversible<InvMul, InvAdd> {
// Both seeds are OR-ed with 1 before being forwarded, so the base always receives odd values.
MultiplyAdd(uint64_t seed1=0x9a98567ed20c127d, uint64_t seed2=0xe37e28c4271b5a1duLL): FusedReversible<InvMul, InvAdd>(seed1 | 1, seed2 | 1) {}
};
// Fused hash built on FusedReversible3 with the stages InvAdd, InvMul, InvXor (in that template-argument order).
struct MultiplyAddXor: public FusedReversible3<InvAdd,InvMul,InvXor> {
    // Short alias so the constructor does not have to repeat the full base type.
    using base = FusedReversible3<InvAdd,InvMul,InvXor>;
    // Both seeds are OR-ed with 1 before being forwarded, so the base always receives odd values.
    MultiplyAddXor(uint64_t seed1=0x9a98567ed20c127d, uint64_t seed2=0xe37e28c4271b5a1duLL): base(seed1 | 1, seed2 | 1) {}
};
template<size_t shift>
struct MultiplyAddXoRot: public FusedReversible3<InvMul,InvXor,RotN<shift>> {
Expand Down
51 changes: 36 additions & 15 deletions include/sketch/hll.h
Original file line number Diff line number Diff line change
Expand Up @@ -76,12 +76,21 @@ static constexpr const char *JESTIM_STRINGS []
"ORIGINAL", "ERTL_IMPROVED", "ERTL_MLE", "ERTL_JOINT_MLE"
};
enum JointEstimationMethod: uint8_t {
//ORIGINAL = 0,
//ERTL_IMPROVED = 1, // Improved but biased method
//ERTL_MLE = 2, // element-wise max, followed by MLE
ERTL_JOINT_MLE = 3 // Ertl special version
J_ORIGINAL = ORIGINAL,
J_ERTL_IMPROVED = ERTL_IMPROVED, // Improved but biased method
J_ERTL_MLE = ERTL_MLE, // element-wise max, followed by MLE
ERTL_JOINT_MLE = 3, // Ertl special version
J_ERTL_JOINT_MLE = ERTL_JOINT_MLE,
};

// Maps a joint estimation method to a short display name.
// Any value without its own case yields "UNKNOWN".
static inline std::string to_string(JointEstimationMethod est) {
    const char *label = "UNKNOWN";
    switch(est) {
        case J_ORIGINAL:       label = "Original"; break;
        case J_ERTL_IMPROVED:  label = "Improved"; break;
        case J_ERTL_MLE:       label = "MLE";      break;
        case J_ERTL_JOINT_MLE: label = "JMLE";     break;
        default: break;
    }
    return label;
}
// Overload for EstimationMethod: converts the value to JointEstimationMethod
// and reuses that overload's name table.
static inline std::string to_string(EstimationMethod est) {
    const JointEstimationMethod joint = static_cast<JointEstimationMethod>(est);
    return to_string(joint);
}


static const char *EST_STRS [] {
"original",
"ertl_improved",
Expand Down Expand Up @@ -200,17 +209,17 @@ static constexpr double TWO_POW_32 = 1ull << 32;

template<typename CountArrType>
static double calculate_estimate(const CountArrType &counts,
EstimationMethod estim, uint64_t m, uint32_t p, double alpha, double relerr=1e-2) noexcept {
JointEstimationMethod estim, uint64_t m, uint32_t p, double alpha, double relerr=1e-2) noexcept {
assert(estim <= 3);
#if ENABLE_COMPUTED_GOTO
static constexpr void *arr [] {&&ORREST, &&ERTL_IMPROVED_EST, &&ERTL_MLE_EST};
static constexpr void *arr [] {&&ORREST, &&ERTL_IMPROVED_EST, &&ERTL_MLE_EST, &&ERTL_JOINT_MLE_EST};
goto *arr[estim];
ORREST: {
#else
switch(estim) {
case ORIGINAL: {
case J_ORIGINAL: {
#endif
assert(estim != ERTL_MLE);
assert(estim != static_cast<JointEstimationMethod>(ERTL_MLE));
double sum = counts[0];
for(unsigned i = 1; i < 64 - p + 1; ++i) if(counts[i]) sum += std::ldexp(counts[i], -i); // 64 - p because we can't have more than that many leading 0s. This is just a speed thing.
//for(unsigned i = 1; i < 64 - p + 1; ++i) sum += std::ldexp(counts[i], -i); // 64 - p because we can't have more than that many leading 0s. This is just a speed thing.
Expand All @@ -228,23 +237,35 @@ static double calculate_estimate(const CountArrType &counts,
return value;
}
#if ENABLE_COMPUTED_GOTO
ERTL_IMPROVED_EST: {
ERTL_IMPROVED_EST:
#else
case ERTL_IMPROVED: {
case J_ERTL_IMPROVED:
#endif
{
static const double divinv = 1. / (2.L*std::log(2.L));
double z = m * detail::gen_tau(static_cast<double>((m-counts[64 - p + 1]))/static_cast<double>(m));
for(unsigned i = 64-p; i; z += counts[i--], z *= 0.5); // Reuse value variable to avoid an additional allocation.
z += m * detail::gen_sigma(static_cast<double>(counts[0])/static_cast<double>(m));
return m * divinv * m / z;
}
#if ENABLE_COMPUTED_GOTO
ERTL_MLE_EST: return ertl_ml_estimate(counts, p, 64 - p, relerr);
ERTL_MLE_EST:
ERTL_JOINT_MLE_EST:
#else
case ERTL_MLE: return ertl_ml_estimate(counts, p, 64 - p, relerr);
default: HEDLEY_UNREACHABLE();
}
case J_ERTL_MLE:
case J_ERTL_JOINT_MLE:
#endif
return ertl_ml_estimate(counts, p, 64 - p, relerr);
default:
std::fprintf(stderr, "Unknown estimation method.\n");
HEDLEY_UNREACHABLE();
}
}

template<typename CountArrType>
static double calculate_estimate(const CountArrType &counts,
EstimationMethod estim, uint64_t m, uint32_t p, double alpha, double relerr=1e-2) noexcept {
return calculate_estimate(counts, static_cast<JointEstimationMethod>(estim), m, p, alpha, relerr);
}

template<typename CoreType>
Expand Down Expand Up @@ -696,7 +717,7 @@ std::array<double, 3> ertl_joint(const HllType &h1, const HllType &h2) {
const double cAX = h1.get_is_ready() ? h1.creport() : ertl_ml_estimate(c1, h1.p(), h1.q());
const double cBX = h2.get_is_ready() ? h2.creport() : ertl_ml_estimate(c2, h2.p(), h2.q());
const double cABX = ertl_ml_estimate(cu, h1.p(), h1.q());
// std::fprintf(stderr, "Made initials: %lf, %lf, %lf\n", cAX, cBX, cABX);
std::fprintf(stderr, "Made initials: %lf, %lf, %lf\n", cAX, cBX, cABX);
std::array<uint32_t, 64> countsAXBhalf;
std::array<uint32_t, 64> countsBXAhalf;
countsAXBhalf[q] = h1.m();
Expand Down
4 changes: 4 additions & 0 deletions include/sketch/hmh.h
Original file line number Diff line number Diff line change
Expand Up @@ -289,8 +289,12 @@ struct hmh_t {
// Views the backing storage of data_ as an array of IT and returns element
// `index` by value. No bounds check is performed here.
template<typename IT> IT access(size_t index) const {
    const IT *typed = reinterpret_cast<const IT *>(data_.data());
    return typed[index];
}
// Fallback logical right shifts for 16-bit and 8-bit lanes, built from the
// 32-bit shift: shift each 32-bit lane, then AND with a per-lane mask
// (0xFFFF >> Imm or 0xFF >> Imm) that clears the bits shifted in from the
// neighbouring lane.
// Macro parameters are parenthesized so that arguments containing
// low-precedence operators (e.g. `c ? 1 : 2`) expand as a single operand.
// NOTE(review): #ifndef only detects a macro of the same name; an intrinsic
// provided as a function is not detected and would be shadowed by these macros.
#ifndef _mm512_srli_epi16
#define _mm512_srli_epi16(mm, Imm) _mm512_and_si512(_mm512_set1_epi16(0xFFFFu >> (Imm)), _mm512_srli_epi32((mm), (Imm)))
#endif
#ifndef _mm512_srli_epi8
#define _mm512_srli_epi8(mm, Imm) _mm512_and_si512(_mm512_set1_epi8(0xFFu >> (Imm)), _mm512_srli_epi32((mm), (Imm)))
#endif
template<typename IT>
std::array<uint32_t, 64> sum_counts() const {
using hll::detail::SIMDHolder;
Expand Down
55 changes: 34 additions & 21 deletions include/sketch/mh.h
Original file line number Diff line number Diff line change
Expand Up @@ -109,9 +109,16 @@ struct RangeMinHash: public AbstractMinHash<T, Cmp> {
AbstractMinHash<T, Cmp>(sketch_size), hf_(std::move(hf)), cmp_(std::move(cmp))
{
}
void show() {
std::fprintf(stderr, "%zu mins\n", minimizers_.size());
for(const auto x: minimizers_) {
std::fprintf(stderr, "%zu\n", size_t(x));
}
}
// Construction from a string is not supported: this overload always throws NotImplementedError.
RangeMinHash(std::string) {throw NotImplementedError("");}
// Estimates the number of distinct items as
//   numeric_limits<T>::max() / max_element() * (number of stored minimizers).
double cardinality_estimate() const {
    // Convert to double *before* dividing. With an integral T the quotient
    // max() / max_element() would otherwise be computed in integer arithmetic
    // and truncated before the multiplication.
    const double result = static_cast<double>(std::numeric_limits<T>::max()) / this->max_element() * minimizers_.size();
    return result;
}
RangeMinHash(gzFile fp) {
if(!fp) throw std::runtime_error("Null file handle!");
Expand All @@ -127,8 +134,9 @@ struct RangeMinHash: public AbstractMinHash<T, Cmp> {
}
// In-place merge: inserts every value of `o` into minimizers_, then removes
// elements from the front of the container until at most ss_ remain.
RangeMinHash &operator+=(const RangeMinHash &o) {
    minimizers_.insert(o.begin(), o.end());
    while(this->ss_ < minimizers_.size()) {
        minimizers_.erase(minimizers_.begin());
    }
    return *this;
}
RangeMinHash operator+(const RangeMinHash &o) const {
Expand Down Expand Up @@ -222,8 +230,6 @@ struct RangeMinHash: public AbstractMinHash<T, Cmp> {
// Alias for clear(); performs no additional work of its own.
void free() {clear();}
// Builds a final_type from a copy of the current minimizers; this sketch is left unmodified (const).
final_type cfinalize() const {
// Copy the minimizers, in container iteration order, into a vector.
std::vector<T> reta(minimizers_.begin(), minimizers_.end());
// If fewer than ss_ values are held, pad the tail so the vector has exactly ss_ entries.
// NOTE(review): the fill value is numeric_limits<uint64_t>::max(), not numeric_limits<T>::max();
// it is converted to T on insertion — confirm this is intended for T other than uint64_t,
// and that downstream estimators are meant to see these sentinel entries.
if(reta.size() < this->ss_)
reta.insert(reta.end(), this->ss_ - reta.size(), std::numeric_limits<uint64_t>::max());
// Hand the vector over without another copy.
return final_type(std::move(reta));
}
final_type finalize() & {
Expand Down Expand Up @@ -320,25 +326,36 @@ struct FinalRMinHash {
tmp += o;
return tmp;
}
/*
double union_size(const FinalRMinHash &o) const {
std::vector<T> total(o.begin(), o.end());
total.insert(total.end(), begin(), end());
std::sort(total.begin(), total.end());
total.resize(std::min(o.size(), size()));
const size_t maxv = total.back();
return (double(std::numeric_limits<T>::max()) / maxv) * this->size();
PREC_REQ(this->size() == o.size(), "Non-matching parameters for FinalRMinHash comparison");
size_t n_in_sketch = 0;
auto i1 = this->rbegin(), i2 = o.rbegin();
T mv;
while(n_in_sketch < first.size() - 1) {
// Easier to branch-predict: http://www.vldb.org/pvldb/vol8/p293-inoue.pdf
if(*i1 != *i2) ++i1, ++i2;
else {
const int c = *i1 < *i2;
i2 += !c; i1 += c;
if(*i1 == *i2) {
++i1, ++i2;
} else if(*i1 < *i2) {
++i1;
} else {
++i2;
}
++n_in_sketch;
}
mv = *i1 < *i2 ? *i1: *i2;
// TODO: test after refactoring
assert(i1 < this->rend());
return double(std::numeric_limits<T>::max()) / (mv) * this->size();
const size_t mv = std::min(*i1, *i2);
const double est = double(std::numeric_limits<T>::max()) / mv * this->size();
std::fprintf(stderr, "mv: %zu. est: %g. Expected maxv %zu\n", size_t(mv), est, maxv);
return est;
}
*/
double cardinality_estimate(MHCardinalityMode mode=ARITHMETIC_MEAN) const {
// KMV (kth-minimum value) estimate
return (static_cast<double>(std::numeric_limits<T>::max()) / double(this->max_element()) * first.size());
Expand Down Expand Up @@ -399,20 +416,16 @@ struct FinalRMinHash {
// Empties `first` by swapping it with a default-constructed object of the same
// type; the previous contents are destroyed together with that local on return.
void free() {
    decltype(first) emptied;
    std::swap(emptied, first);
}
template<typename Alloc>
FinalRMinHash(const std::vector<T, Alloc> &ofirst): first(ofirst.size()) {std::copy(ofirst.begin(), ofirst.end(), first.begin()); sort();}
template<typename It>
FinalRMinHash(It start, It end): first(std::distance(start, end)) {
std::copy(start, end, first.begin());
sort();
}
template<typename Alloc, typename=std::enable_if_t<std::is_same<Alloc, allocator>::value>>
FinalRMinHash(std::vector<T, Alloc> &&ofirst): first(std::move(ofirst)) {
FinalRMinHash(It start, It end) {
std::copy(start, end, std::back_inserter(first));
sort();
}
template<typename Alloc>
FinalRMinHash(const std::vector<T, Alloc> &ofirst): FinalRMinHash(ofirst.begin(), ofirst.end()) {}
template<typename Hasher, bool is_bottom>
FinalRMinHash(const BottomKHasher<Hasher, T, is_bottom> &bk): FinalRMinHash(bk.mpq_.getq().begin(), bk.mpq_.getq().end()) {}
FinalRMinHash(FinalRMinHash &&o): first(std::move(o.first)) {sort();}
FinalRMinHash(FinalRMinHash &&o) = default;
ssize_t read(gzFile fp) {
uint64_t sz;
if(gzread(fp, &sz, sizeof(sz)) != sizeof(sz)) throw ZlibError("Failed to read");
Expand Down Expand Up @@ -453,7 +466,7 @@ struct FinalRMinHash {
FinalRMinHash() {}
FinalRMinHash &operator=(const FinalRMinHash &o) = default;
// Sorts `first` in place into ascending order (operator<).
void sort() {
    // A single pass is enough; sorting the same range twice in a row only repeats work.
    std::sort(this->first.begin(), this->first.end());
}
};

Expand Down
4 changes: 1 addition & 3 deletions include/sketch/rnla.h
Original file line number Diff line number Diff line change
Expand Up @@ -372,9 +372,7 @@ struct PStableSketcher: public RNLASketcher<FloatType> {
tx(st * this->destdim() + ind, sind) = dist_(gen);
}
}
#if !NDEBUG
std::fprintf(stderr, "nonzeros: %zu. total: %zu\n", tx.nonZeros(), tx.rows() * tx.columns());
#endif
VERBOSE_ONLY(std::fprintf(stderr, "nonzeros: %zu. total: %zu\n", tx.nonZeros(), tx.rows() * tx.columns());)
}
}
PStableSketcher(const PStableSketcher &o): super(o.ntables(), o.destdim()), seed_(o.seed_), dist_(o.dist_), dense_(o.dense_) {
Expand Down
3 changes: 1 addition & 2 deletions python/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,8 +51,7 @@ def __str__(self):
"../pybind11/include"
]

__version__ = subprocess.check_output(
["git", "describe", "--abbrev=4"]).decode().strip().split('-')[0]
__version__ = "0.19.0"


def make_namepair(name):
Expand Down
14 changes: 8 additions & 6 deletions testsrc/bbmhtest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -73,18 +73,19 @@ void verify_popcount() {
b2.addh(4);
b2.addh(17);
auto f1 = b1.cfinalize(), f2 = b2.cfinalize();
VERBOSE_ONLY(
std::fprintf(stderr, "f1 popcount: %" PRIu64 "\n", f1.popcnt());
std::fprintf(stderr, "f2 popcount: %" PRIu64 "\n", f2.popcnt());
)
#if 0
b1.show();
b2.show();
#endif
auto b3 = b1 + b2;
//b3.show();
auto f3 = b3.finalize();
std::fprintf(stderr, "f3 popcount: %" PRIu64 "\n", f3.popcnt());
auto neqb12 = f1.equal_bblocks(f2);
std::fprintf(stderr, "eqb: %zu. With itself: %zu\n", size_t(neqb12), size_t(f1.equal_bblocks(f1)));
VERBOSE_ONLY(std::fprintf(stderr, "f3 popcount: %" PRIu64 "\n", f3.popcnt());)
VERBOSE_ONLY(std::fprintf(stderr, "eqb: %zu. With itself: %zu\n", size_t(f1.equal_bblocks(f2)), size_t(f1.equal_bblocks(f1)));)
}

int main(int argc, char *argv[]) {
Expand Down Expand Up @@ -154,7 +155,8 @@ int main(int argc, char *argv[]) {
b2.densify();
auto est = (b1 + b2).cardinality_estimate();
auto usest = b1.union_size(b2);
std::fprintf(stderr, "union est by union: %f. by union_size: %f. difference: %12e\n", est, usest, (est - usest));
VERBOSE_ONLY(std::fprintf(stderr, "union est by union: %f. by union_size: %f. difference: %12e\n", est, usest, (est - usest));)
assert(std::abs(est - usest) < 1e-6);
assert(est == usest);
auto f1 = b1.finalize(), f2 = b2.finalize(), f3 = b3.finalize();
assert(i <= 9 || std::abs(est - niter) < niter * .1 || !std::fprintf(stderr, "est: %lf. niter: %zu\n", est, size_t(niter)));
Expand All @@ -165,10 +167,10 @@ int main(int argc, char *argv[]) {
auto smh1 = smhp2.finalize(16), smh2 = smhp21.finalize(16);
auto smhd1 = smhdp.finalize(16), smhd2 = smhdp1.finalize(16);
auto smh1ji = smh1.jaccard_index(smh1);
std::fprintf(stderr, "smh1ji: %g\n", smh1ji);
VERBOSE_ONLY(std::fprintf(stderr, "smh1ji: %g\n", smh1ji);)
assert(smh1ji == 1.);
auto pji = smh1.jaccard_index(smh2);
std::fprintf(stderr, "estimate: %f. nmin: %u. b: %u\n", pji, 1u << i, b);
VERBOSE_ONLY(std::fprintf(stderr, "estimate: %f. nmin: %u. b: %u\n", pji, 1u << i, b);)
if(std::abs(pji - .5) > 0.05) {
std::fprintf(stderr, "original (no b-bit): %f\n", b1.jaccard_index(b2));
std::fprintf(stderr, ">.05 error: estimate: %f. nmin: %u. b: %u. %f%% error\n", pji, 1u << i, b, std::abs(pji - .5) / .5 * 100);
Expand Down
2 changes: 1 addition & 1 deletion testsrc/cmtest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,5 +26,5 @@ int main() {
for(auto &c: cmf.second)
c = 2;
assert(c.count == cmf.intersection_size(cmf2)); // Make sure intersection size is still the same
std::fprintf(stderr, "%llu, %llu\n", c.count, cm.union_size(cm2));
std::fprintf(stderr, "%llu, %llu\n", (unsigned long long)c.count, (unsigned long long)cm.union_size(cm2));
}
Loading