Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

NOT FOR MERGING -- Testing CQF versus bloom filter and count-min sketch #1836

Open
wants to merge 7 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
53 changes: 53 additions & 0 deletions testsCQF/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
# Builds the CQF test programs named in TARGETS. Each program is linked from
# its own object file plus the prebuilt ../third-party/cqf/gqf.o.
TARGETS= mergeTest_SameSize mergeTest_DifferentSize mergeTest_Resize mergeTest_Qbits cqfConstruction


# Optional flag groups. Only DEBUG is set by default; the others are empty
# here and can be supplied on the make command line.
DEBUG=-g
OPT=
ARCH=


PROFILE=

# All three tools are g++ in C++11 mode, so .c sources are compiled as C++ too.
CXX = g++ -std=c++11
CC = g++ -std=c++11
LD= g++ -std=c++11

# -I points at the CQF sources so that "gqf.h" resolves.
CXXFLAGS = -Wall $(DEBUG) $(PROFILE) $(OPT) $(ARCH) -m64 -I../third-party/cqf/ -Wno-unused-result -Wno-strict-aliasing -Wno-unused-function

# Libraries are passed through LDFLAGS and appear after the objects on the link line.
LDFLAGS = $(DEBUG) $(PROFILE) $(OPT) -lpthread -lssl -lcrypto -lm

#
# declaration of dependencies
#

# Default goal: build every test program.
all: $(TARGETS)

# dependencies between programs and .o files

mergeTest_SameSize: mergeTest_SameSize.o
mergeTest_DifferentSize: mergeTest_DifferentSize.o
mergeTest_Resize: mergeTest_Resize.o
mergeTest_Qbits: mergeTest_Qbits.o
cqfConstruction: cqfConstruction.o
# dependencies between .o files and .h files


# dependencies between .o files and .cc (or .c) files



#
# generic build rules
#

# Link rule shared by all programs: $^ expands to the prerequisites declared
# above. gqf.o is named only in the recipe, not as a prerequisite, so this
# Makefile never builds it; it has to exist already.
$(TARGETS):
$(LD) $^ ../third-party/cqf/gqf.o $(LDFLAGS) -o $@

# INCLUDE is not defined in this file; it expands to nothing unless provided
# from the command line or the environment.
%.o: %.cc
$(CXX) $(CXXFLAGS) $(INCLUDE) $< -c -o $@

%.o: %.c
$(CC) $(CXXFLAGS) $(INCLUDE) $< -c -o $@

# Remove object files and the built test programs.
clean:
rm -f *.o $(TARGETS)
3 changes: 3 additions & 0 deletions testsCQF/Requirements
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
To run the scripts you need:
1) GNU Parallel: sudo apt-get install parallel
2) numpy and matplotlib: pip install numpy matplotlib
52 changes: 52 additions & 0 deletions testsCQF/cqfConstruction.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
/*
* =====================================================================================
*
* Filename: cqfConstruction.c
*
* Description: Smoke test that constructs a CQF with q=15, first with r=8 and then with r=3.
*
* Version: 1.0
* Created: 2017-02-04 03:40:58 PM
* Revision: none
* Compiler: gcc
*
* Author: Prashant Pandey ([email protected])
* Rob Johnson ([email protected])
* Organization: Stony Brook University
*
* =====================================================================================
*/

#include <stdio.h>
#include <math.h>
#include <stdlib.h>
#include <string.h>
#include <inttypes.h>
#include <time.h>
#include <sys/time.h>
#include <sys/types.h>
#include <unistd.h>
#include <openssl/rand.h>

#include "gqf.h"

/*
 * Construction smoke test: calls qf_init twice on the same QF with q=15,
 * first with r=8 and then with r=3, printing a message before and after
 * each call.
 *
 * NOTE(review): the second qf_init reuses `cf` without releasing whatever
 * the first call set up -- confirm against gqf.h whether that matters.
 */
int main(int argc, char **argv)
{
    QF cf;
    QFi cfi;
    const uint64_t qbits = 15;
    const uint64_t nslots = 1ULL << qbits;
    uint64_t nhashbits;

    /* First construction: q=15, r=8. */
    nhashbits = qbits + 8;
    printf("Constructing CQF q=15 and r=8\n");
    qf_init(&cf, nslots, nhashbits, 0);
    printf("Constructing CQF with q=15 and r=8 was done successfully\n");

    /* Second construction: same slot count, r=3. */
    nhashbits = qbits + 3;
    printf("Constructing CQF q=15 and r=3\n");
    qf_init(&cf, nslots, nhashbits, 0);
    printf("Constructing CQF with q=15 and r=3 was done successfully\n");

    return 0;
}
66 changes: 66 additions & 0 deletions testsCQF/generateSeq.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
import sys
from itertools import product
import random
import numpy as np


def help():
    """Print the command-line usage and the list of files the script writes."""
    usage = "python3 generateSeq.py <NumberOfUniqeKmers> <K> <NumberOfUnseenKmers> <outputPrefix> "
    outputs = (
        "Script generate 3 files:",
        "1) Gold: Kmers with the true count",
        "2) Dataset: Kmers to be counted",
        "3) Unseen: kmers that are not in the dataset",
    )
    print(usage)
    # Joined with newlines so the description is emitted by a single print call.
    print("\n".join(outputs))


# N: number of unique k-mers
# k: k-mer length
# M: number of k-mers in the non-existent (unseen) dataset
# s: Zipfian distribution coefficient for the dataset
def GenerateKmers(N,k,outPrefix,M,s=1.5):
    """Write a synthetic k-mer counting dataset to three files.

    Candidates are taken from the lexicographic enumeration of all length-k
    strings over 'ACGT', skipping a random number of candidates between picks.
    The first N picked k-mers become the "seen" set, the next M become the
    "unseen" set.

    Parameters:
        N: number of unique k-mers to place in the dataset.
        k: k-mer length.
        outPrefix: path prefix of the output files.
        M: number of k-mers to write that do not occur in the dataset.
        s: parameter passed to np.random.zipf when sampling the counts.

    Files written:
        outPrefix + ".gold":     "<kmer>\\t<count>" for every seen k-mer.
        outPrefix + ".dat":      every seen k-mer repeated <count> times.
        outPrefix + ".none.dat": the unseen k-mers, one per line.

    If the enumeration runs out before N + M k-mers were picked, the files
    simply contain fewer entries.
    """
    skipMax = random.randint(1, 10000)
    skip = 0
    seen = 0
    unseen = 0
    # Kept under its own name so the parameter `s` is not rebound.
    counts = np.random.zipf(s, N)
    # `with` closes all three files even if a write raises.
    with open(outPrefix + ".gold", 'w') as Gold, \
            open(outPrefix + ".dat", 'w') as dataset, \
            open(outPrefix + ".none.dat", 'w') as NonExisted:
        for candidate in product('ACGT', repeat=k):
            skip += 1
            if skip < skipMax:
                continue
            skipMax = random.randint(1, 1000)
            skip = 0
            candidate = "".join(candidate)
            if seen < N:
                # Counts are capped at 1000 occurrences per k-mer.
                numberOfOccurences = min(int(counts[seen]), 1000)
                Gold.write("%s\t%d\n" % (candidate, numberOfOccurences))
                # A throwaway loop variable: the original reused the unseen
                # counter here, which made the unseen file come up short.
                for _ in range(numberOfOccurences):
                    dataset.write("%s\n" % candidate)
                seen += 1
            elif unseen < M:
                NonExisted.write("%s\n" % candidate)
                unseen += 1
            else:
                break






if __name__ =='__main__':
    # Command-line entry point:
    #   generateSeq.py <NumberOfUniqueKmers> <K> <NumberOfUnseenKmers> <outputPrefix>
    args = sys.argv[1:]
    # Explicit help request.
    if args and args[0] in ['-h', '--h', '--help']:
        help()
        sys.exit(0)
    # Too few arguments: show the usage and fail instead of raising IndexError.
    if len(args) < 4:
        help()
        sys.exit(1)
    NoUniqueKmers = int(args[0])
    K = int(args[1])
    NumberOfUnseenKmers = int(args[2])
    outPrefix = args[3]
    GenerateKmers(NoUniqueKmers, K, outPrefix, NumberOfUnseenKmers)

103 changes: 103 additions & 0 deletions testsCQF/mergeTest.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
/*
* =====================================================================================
*
* Filename: mergeTest.c
*
* Description: Merges two equally sized CQFs into a larger one and validates the stored counts.
*
* Version: 1.0
* Created: 2017-02-04 03:40:58 PM
* Revision: none
* Compiler: gcc
*
* Author: Prashant Pandey ([email protected])
* Rob Johnson ([email protected])
* Organization: Stony Brook University
*
* =====================================================================================
*/

#include <stdio.h>
#include <math.h>
#include <stdlib.h>
#include <string.h>
#include <inttypes.h>
#include <time.h>
#include <sys/time.h>
#include <sys/types.h>
#include <unistd.h>
#include <openssl/rand.h>

#include "gqf.h"

/*
 * Merge test with two equally sized inputs.
 *
 * Fills two CQFs of 2^18 slots with random keys (50 occurrences each),
 * merges them into a CQF of 2^20 slots, inserts further keys directly into
 * the merged filter, and then checks that every key is reported with a
 * count of at least 50, both by direct lookup and by iterating the filter.
 *
 * Calls abort() on the first failed lookup. Returns EXIT_FAILURE if the key
 * buffer cannot be allocated and 0 once validation has succeeded.
 */
int main(int argc, char **argv)
{
    QF cf, cf1, cf2;
    QFi cfi;
    uint64_t small_qbits = 18;
    uint64_t qbits = small_qbits + 2;
    uint64_t nhashbits = qbits + 8;
    uint64_t small_nhashbits = small_qbits + 8;
    uint64_t nslots = (1ULL << qbits);
    uint64_t small_nslots = (1ULL << small_qbits);
    uint64_t nvals = 250*nslots/1000;
    uint64_t *vals;

    /* Initialise the CQFs. The sizes are uint64_t, so they are printed
     * with the <inttypes.h> macros rather than %d. */
    printf("Initialize first cqf size =%" PRIu64 ", hashbits=%" PRIu64 "\n",
           nslots, nhashbits);
    qf_init(&cf, nslots, nhashbits, 0);
    printf("Initialize Second cqf size =%" PRIu64 ", hashbits=%" PRIu64 "\n",
           small_nslots, small_nhashbits);
    qf_init(&cf1, small_nslots, small_nhashbits, 0);
    printf("Initialize Third cqf size =%" PRIu64 ", hashbits=%" PRIu64 "\n",
           small_nslots, small_nhashbits);
    qf_init(&cf2, small_nslots, small_nhashbits, 0);

    /* Generate random values. The cast on malloc stays because the Makefile
     * builds this file with g++. */
    vals = (uint64_t*)malloc(nvals*sizeof(vals[0]));
    if (vals == NULL) {
        perror("malloc");
        return EXIT_FAILURE;
    }
    RAND_pseudo_bytes((unsigned char *)vals, sizeof(*vals) * nvals);

    printf("Inserting\n");
    /* The first two thirds of the keys are reduced modulo cf1.range and
     * split between the small filters: even indices go to cf1, odd to cf2.
     * The reduced key is stored back so the later lookups use it. */
    for (uint64_t i = 0; i < (nvals*2)/3; i++) {
        vals[i] = vals[i] % cf1.range;
        if (i % 2 == 1) {
            qf_insert(&cf2, vals[i], 0, 50);
        } else {
            qf_insert(&cf1, vals[i], 0, 50);
        }
    }

    printf("Merging\n");
    qf_merge(&cf1, &cf2, &cf);

    printf("Inserting again\n");
    /* The remaining third goes straight into the merged filter. */
    for (uint64_t i = (nvals*2)/3; i < nvals; i++) {
        vals[i] = vals[i] % cf.range;
        qf_insert(&cf, vals[i], 0, 50);
    }

    /* Every key that was inserted must be present in the merged filter. */
    for (uint64_t i = 0; i < nvals; i++) {
        uint64_t count = qf_count_key_value(&cf, vals[i], 0);
        if (count < 50) {
            fprintf(stderr,
                    "failed lookup after insertion for %" PRIx64 " %" PRIu64 ".\n",
                    vals[i], count);
            abort();
        }
    }

    /* Walk the merged filter and re-check every key it reports. */
    qf_iterator(&cf, &cfi, 0);
    do {
        uint64_t key, value, count;
        qfi_get(&cfi, &key, &value, &count);
        uint64_t found = qf_count_key_value(&cf, key, 0);
        if (found < 50) {
            fprintf(stderr,
                    "Failed lookup from A for: %" PRIu64 ". Returned count: %" PRIu64 "\n",
                    key, found);
            abort();
        }
    } while(!qfi_next(&cfi));

    fprintf(stdout, "Validated the CQF.\n");

    free(vals);
    return 0;
}
109 changes: 109 additions & 0 deletions testsCQF/mergeTest_DifferentSize.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
/*
* =====================================================================================
*
* Filename: mergeTest_DifferentSize.c
*
* Description: Merges two CQFs of different sizes into a larger one and validates the stored counts.
*
* Version: 1.0
* Created: 2017-02-04 03:40:58 PM
* Revision: none
* Compiler: gcc
*
* Author: Prashant Pandey ([email protected])
* Rob Johnson ([email protected])
* Organization: Stony Brook University
*
* =====================================================================================
*/

#include <stdio.h>
#include <math.h>
#include <stdlib.h>
#include <string.h>
#include <inttypes.h>
#include <time.h>
#include <sys/time.h>
#include <sys/types.h>
#include <unistd.h>
#include <openssl/rand.h>

#include "gqf.h"

/*
 * Merge test with two inputs of different sizes.
 *
 * Fills a CQF of 2^17 slots and a CQF of 2^18 slots with random keys
 * (50 occurrences each), merges them into a CQF of 2^19 slots, inserts
 * further keys directly into the merged filter, and then checks that every
 * key is reported with a count of at least 50, both by direct lookup and by
 * iterating the filter.
 *
 * Calls abort() on the first failed lookup. Returns EXIT_FAILURE if the key
 * buffer cannot be allocated and 0 once validation has succeeded.
 */
int main(int argc, char **argv)
{
    QF cf, cf1, cf2;
    QFi cfi;
    uint64_t qbits = 17;
    uint64_t qbits1 = qbits;
    uint64_t qbits2 = qbits + 1;
    uint64_t large_qbits = qbits + 2;
    uint64_t nhashbits = large_qbits + 8;
    uint64_t nhashbits1 = qbits1 + 8;
    uint64_t nhashbits2 = qbits2 + 8;

    uint64_t nslots = (1ULL << large_qbits);
    uint64_t nslots1 = (1ULL << qbits1);
    uint64_t nslots2 = (1ULL << qbits2);
    uint64_t nvals = 250*nslots1/1000;
    uint64_t *vals;

    /* Initialise the CQFs. The sizes are uint64_t, so they are printed
     * with the <inttypes.h> macros rather than %d. */
    printf("Initialize first cqf size =%" PRIu64 ", hashbits=%" PRIu64 "\n",
           nslots, nhashbits);
    qf_init(&cf, nslots, nhashbits, 0);
    printf("Initialize Second cqf size =%" PRIu64 ", hashbits=%" PRIu64 "\n",
           nslots1, nhashbits1);
    qf_init(&cf1, nslots1, nhashbits1, 0);
    printf("Initialize Third cqf size =%" PRIu64 ", hashbits=%" PRIu64 "\n",
           nslots2, nhashbits2);
    qf_init(&cf2, nslots2, nhashbits2, 0);

    /* Generate random values. The cast on malloc stays because the Makefile
     * builds this file with g++. */
    vals = (uint64_t*)malloc(nvals*sizeof(vals[0]));
    if (vals == NULL) {
        perror("malloc");
        return EXIT_FAILURE;
    }
    RAND_pseudo_bytes((unsigned char *)vals, sizeof(*vals) * nvals);
    /* Fixed first key (2^17 + 2); presumably a known problem case -- confirm. */
    vals[0] = 131074;

    printf("Inserting\n");
    /* The first two thirds of the keys are split between the small filters:
     * even indices go to cf1, odd to cf2, each reduced modulo the range of
     * the filter it is inserted into. vals itself is left unmodified. */
    for (uint64_t i = 0; i < (nvals*2)/3; i++) {
        if (i % 2 == 1) {
            qf_insert(&cf2, vals[i] % cf2.range, 0, 50);
        } else {
            qf_insert(&cf1, vals[i] % cf1.range, 0, 50);
        }
    }

    printf("Merging\n");
    qf_merge(&cf1, &cf2, &cf);

    printf("Inserting again into Big one\n");
    /* The remaining third goes straight into the merged filter. */
    for (uint64_t i = (nvals*2)/3; i < nvals; i++) {
        qf_insert(&cf, vals[i] % cf.range, 0, 50);
    }

    /* Every key is looked up modulo the merged filter's range. */
    for (uint64_t i = 0; i < nvals; i++) {
        uint64_t count = qf_count_key_value(&cf, vals[i] % cf.range, 0);
        if (count < 50) {
            fprintf(stderr,
                    "failed lookup after insertion for %" PRIx64 " %" PRIu64 ".\n",
                    vals[i], count);
            abort();
        }
    }

    /* Walk the merged filter and re-check every key it reports. */
    qf_iterator(&cf, &cfi, 0);
    do {
        uint64_t key, value, count;
        qfi_get(&cfi, &key, &value, &count);
        uint64_t found = qf_count_key_value(&cf, key, 0);
        if (found < 50) {
            fprintf(stderr,
                    "Failed lookup from A for: %" PRIu64 ". Returned count: %" PRIu64 "\n",
                    key, found);
            abort();
        }
    } while(!qfi_next(&cfi));

    fprintf(stdout, "Validated the CQF.\n");

    free(vals);
    return 0;
}
Loading