diff --git a/AdaptiveSampler.h b/AdaptiveSampler.h index e6efbc5..e2f08a6 100644 --- a/AdaptiveSampler.h +++ b/AdaptiveSampler.h @@ -73,8 +73,8 @@ template class AdaptiveSampler { } void print_header() { - printf("#%-6s %6s %8s %8s %8s %8s %8s %8s\n", "type", "size", - "min", "max", "avg", "90th", "95th", "99th"); + printf("#%-6s %6s %8s %8s %8s %8s %8s %8s %8s %8s\n", "type", "size", + "min", "max", "avg", "50th", "90th", "95th", "99th", "99.9th"); } void print_stats(const char *type, const char *size) { @@ -82,17 +82,18 @@ template class AdaptiveSampler { size_t l = samples_copy.size(); if (l == 0) { - printf("%-7s %6s %8.1f %8.1f %8.1f %8.1f %8.1f %8.1f\n", type, size, + printf("%-7s %6s %8.1f %8.1f %8.1f %8.1f %8.1f %8.1f %8.1f %8.1f\n", type, size, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0); return; } sort(samples_copy.begin(), samples_copy.end()); - printf("%-7s %6s %8.1f %8.1f %8.1f %8.1f %8.1f %8.1f\n", type, size, + printf("%-7s %6s %8.1f %8.1f% 8.1f %8.1f %8.1f %8.1f %8.1f %8.1f\n", type, size, samples_copy[0], samples_copy[l-1], average(), + samples_copy[(l*50)/100], samples_copy[(l*90)/100], samples_copy[(l*95)/100], - samples_copy[(l*99)/100]); + samples_copy[(l*99)/100], samples_copy[(l*99.9)/100]); } }; diff --git a/AgentStats.h b/AgentStats.h index 50e016b..e73bb8c 100644 --- a/AgentStats.h +++ b/AgentStats.h @@ -5,7 +5,15 @@ class AgentStats { public: uint64_t rx_bytes, tx_bytes; - uint64_t gets, sets, get_misses; + uint64_t gets, sets, accesses, get_misses; + uint64_t gets_l1, gets_l2, sets_l1, sets_l2; + uint64_t get_misses_l1, get_misses_l2; + uint64_t set_misses_l1, set_misses_l2; + uint64_t excl_wbs, incl_wbs; + uint64_t copies_to_l1; + uint64_t delete_misses_l2; + uint64_t delete_hits_l2; + uint64_t set_incl_hits_l1, set_excl_hits_l1; uint64_t skips; double start, stop; diff --git a/Connection.cc b/Connection.cc index ea02899..9232d91 100644 --- a/Connection.cc +++ b/Connection.cc @@ -1,4 +1,9 @@ #include +#include +#include +#include + +#include #include 
#include @@ -15,18 +20,135 @@ #include "mutilate.h" #include "binary_protocol.h" #include "util.h" +#include +#include +#include +#include +#include +#include "blockingconcurrentqueue.h" + +//#define DEBUGC + +using namespace moodycamel; +std::hash hashstr; + +extern ifstream kvfile; +extern pthread_mutex_t flock; +extern pthread_mutex_t *item_locks; +extern int item_lock_hashpower; + + +pthread_mutex_t cid_lock = PTHREAD_MUTEX_INITIALIZER; +uint32_t connids = 0; + +//pthread_mutex_t opaque_lock = PTHREAD_MUTEX_INITIALIZER; +//uint32_t g_opaque = 0; + +void item_lock(size_t hv, uint32_t cid) { + //char out[128]; + //sprintf(out,"conn: %u, locking %lu\n",cid,hv); + //write(2,out,strlen(out)); + pthread_mutex_lock(&item_locks[hv & hashmask(item_lock_hashpower)]); +} + +void item_unlock(size_t hv, uint32_t cid) { + //char out[128]; + //sprintf(out,"conn: %u, unlocking %lu\n",cid,hv); + //write(2,out,strlen(out)); + pthread_mutex_unlock(&item_locks[hv & hashmask(item_lock_hashpower)]); +} + +void *item_trylock(uint32_t hv, uint32_t cid) { + pthread_mutex_t *lock = &item_locks[hv & hashmask(item_lock_hashpower)]; + if (pthread_mutex_trylock(lock) == 0) { + //char out[128]; + //sprintf(out,"conn: %u, locking %u\n",cid,hv); + //write(2,out,strlen(out)); + return lock; + } + return NULL; +} + +void item_trylock_unlock(void *lock, uint32_t cid) { + //char out[128]; + //sprintf(out,"conn: %u, unlocking\n",cid); + //write(2,out,strlen(out)); + pthread_mutex_unlock((pthread_mutex_t *) lock); +} + +void Connection::output_op(Operation *op, int type, bool found) { + char output[1024]; + char a[256]; + char s[256]; + memset(a,0,256); + memset(s,0,256); + switch (type) { + case 0: //get + sprintf(a,"issue_get"); + break; + case 1: //set + sprintf(a,"issue_set"); + break; + case 2: //resp + sprintf(a,"resp"); + break; + } + switch(read_state) { + case INIT_READ: + sprintf(s,"init"); + break; + case CONN_SETUP: + sprintf(s,"setup"); + break; + case LOADING: + sprintf(s,"load"); + 
break; + case IDLE: + sprintf(s,"idle"); + break; + case WAITING_FOR_GET: + sprintf(s,"waiting for get"); + break; + case WAITING_FOR_SET: + sprintf(s,"waiting for set"); + break; + case WAITING_FOR_DELETE: + sprintf(s,"waiting for del"); + break; + case MAX_READ_STATE: + sprintf(s,"max"); + break; + } + if (type == 2) { + sprintf(output,"conn: %u, action: %s op: %s, opaque: %u, found: %d, type: %d\n",cid,a,op->key,op->opaque,found,op->type); + } else { + sprintf(output,"conn: %u, action: %s op: %s, opaque: %u, type: %d\n",cid,a,op->key,op->opaque,op->type); + } + write(2,output,strlen(output)); +} /** * Create a new connection to a server endpoint. */ Connection::Connection(struct event_base* _base, struct evdns_base* _evdns, string _hostname, string _port, options_t _options, - bool sampling) : + //ConcurrentQueue* a_trace_queue, + bool sampling ) : start_time(0), stats(sampling), options(_options), hostname(_hostname), port(_port), base(_base), evdns(_evdns) { valuesize = createGenerator(options.valuesize); keysize = createGenerator(options.keysize); + + //trace_queue = a_trace_queue; + opaque = 0; + total = 0; + op_queue_size = 0; + issue_buf_n = 0; + //; + //op_queue = (Operation**)malloc(sizeof(Operation*)*OPAQUE_MAX); + eof = 0; + keygen = new KeyGenerator(keysize, options.records); if (options.lambda <= 0) { @@ -39,32 +161,103 @@ Connection::Connection(struct event_base* _base, struct evdns_base* _evdns, read_state = INIT_READ; write_state = INIT_WRITE; - + last_quiet = false; + //op_queue.reserve(OPAQUE_MAX); //new std::vector(OPAQUE_MAX); + last_tx = last_rx = 0.0; - bev = bufferevent_socket_new(base, -1, BEV_OPT_CLOSE_ON_FREE); - bufferevent_setcb(bev, bev_read_cb, bev_write_cb, bev_event_cb, this); - bufferevent_enable(bev, EV_READ | EV_WRITE); + pthread_mutex_lock(&cid_lock); + cid = connids++; + pthread_mutex_unlock(&cid_lock); + + issue_buf_size = 0; + issue_buf = (unsigned char*)malloc(sizeof(unsigned char)*MAX_BUFFER_SIZE); + 
memset(issue_buf,0,MAX_BUFFER_SIZE); + issue_buf_pos = issue_buf; + timer = evtimer_new(base, timer_cb, this); - if (options.binary) { - prot = new ProtocolBinary(options, this, bev); - } else { - prot = new ProtocolAscii(options, this, bev); - } +} + +//void Connection::set_queue(ConcurrentQueue* a_trace_queue) { +// trace_queue = a_trace_queue; +//} - if (bufferevent_socket_connect_hostname(bev, evdns, AF_UNSPEC, +void Connection::set_queue(queue* a_trace_queue) { + trace_queue = a_trace_queue; + //while (trace_queue->size() < 1); + //usleep(1000); +} + +void Connection::set_lock(pthread_mutex_t* a_lock) { + lock = a_lock; +} + +uint32_t Connection::get_cid() { + return cid; +} + +int Connection::do_connect() { + + int connected = 0; + if (options.unix_socket) { + + bev = bufferevent_socket_new(base, -1, BEV_OPT_CLOSE_ON_FREE); + bufferevent_setcb(bev, bev_read_cb, bev_write_cb, bev_event_cb, this); + bufferevent_enable(bev, EV_READ | EV_WRITE); + + struct sockaddr_un sin; + memset(&sin, 0, sizeof(sin)); + sin.sun_family = AF_LOCAL; + strcpy(sin.sun_path, hostname.c_str()); + + int addrlen; + addrlen = sizeof(sin); + int err = bufferevent_socket_connect(bev, (struct sockaddr*)&sin, addrlen); + if (err == 0) { + connected = 1; + if (options.binary) { + prot = new ProtocolBinary(options, this, bev); + } else if (options.redis) { + prot = new ProtocolRESP(options, this, bev); + } else { + prot = new ProtocolAscii(options, this, bev); + } + } else { + connected = 0; + err = errno; + fprintf(stderr,"error %s\n",strerror(err)); + bufferevent_free(bev); + //event_base_free(_evbase_ptr); + } + } else { + bev = bufferevent_socket_new(base, -1, BEV_OPT_CLOSE_ON_FREE); + bufferevent_setcb(bev, bev_read_cb, bev_write_cb, bev_event_cb, this); + bufferevent_enable(bev, EV_READ | EV_WRITE); + + if (options.binary) { + prot = new ProtocolBinary(options, this, bev); + } else if (options.redis) { + prot = new ProtocolRESP(options, this, bev); + } else { + prot = new 
ProtocolAscii(options, this, bev); + } + if (bufferevent_socket_connect_hostname(bev, evdns, AF_UNSPEC, hostname.c_str(), - atoi(port.c_str()))) { - DIE("bufferevent_socket_connect_hostname()"); + atoi(port.c_str())) == 0) { + connected = 1; + } else { + bufferevent_free(bev); + connected = 0; + } } - - timer = evtimer_new(base, timer_cb, this); + return connected; } /** * Destroy a connection, performing cleanup. */ Connection::~Connection() { + event_free(timer); timer = NULL; // FIXME: W("Drain op_q?"); @@ -81,8 +274,8 @@ Connection::~Connection() { */ void Connection::reset() { // FIXME: Actually check the connection, drain all bufferevents, drain op_q. - assert(op_queue.size() == 0); - evtimer_del(timer); + //assert(op_queue.size() == 0); + //evtimer_del(timer); read_state = IDLE; write_state = INIT_WRITE; stats = ConnectionStats(stats.sampling); @@ -120,9 +313,10 @@ void Connection::start_loading() { */ void Connection::issue_something(double now) { char key[256]; + memset(key,0,256); // FIXME: generate key distribution here! 
string keystr = keygen->generate(lrand48() % options.records); - strcpy(key, keystr.c_str()); + strncpy(key, keystr.c_str(),255); if (drand48() < options.update) { int index = lrand48() % (1024 * 1024); @@ -132,12 +326,405 @@ void Connection::issue_something(double now) { } } + +/** + * Get/Set Style + * Issue a get first, if not found then set + */ +void Connection::issue_getset(double now) { + + if (!options.read_file && !kvfile.is_open()) + { + string keystr; + char key[256]; + memset(key,0,256); + keystr = keygen->generate(lrand48() % options.records); + strncpy(key, keystr.c_str(),255); + + char log[1024]; + int length = valuesize->generate(); + sprintf(log,"%s,%d\n",key,length); + write(2,log,strlen(log)); + + issue_get_with_len(key, length, now); + } + else + { + string line; + string rT; + string rApp; + string rReq; + string rKey; + string rvaluelen; + + pthread_mutex_lock(&flock); + getline(kvfile,line); + pthread_mutex_unlock(&flock); + stringstream ss(line); + getline( ss, rT, ','); + getline( ss, rApp, ','); + getline( ss, rReq, ','); + getline( ss, rKey, ',' ); + getline( ss, rvaluelen, ',' ); + + int vl = atoi(rvaluelen.c_str()); + + char key[256]; + memset(key,0,256); + strncpy(key, rKey.c_str(),255); + issue_get_with_len(key, vl, now); + } + +} + +int Connection::issue_something_trace(double now) { + int ret = 0; + + string line; + string rT; + string rApp; + string rOp; + string rKey; + string rKeySize; + string rvaluelen; + + pthread_mutex_lock(&flock); + if (kvfile.good()) { + getline(kvfile,line); + pthread_mutex_unlock(&flock); + } + else { + pthread_mutex_unlock(&flock); + return 1; + } + stringstream ss(line); + int Op = 0; + int vl = 0; + + if (options.twitter_trace == 1) { + getline( ss, rT, ',' ); + getline( ss, rKey, ',' ); + getline( ss, rKeySize, ',' ); + getline( ss, rvaluelen, ',' ); + getline( ss, rApp, ',' ); + getline( ss, rOp, ',' ); + vl = atoi(rvaluelen.c_str()); + if (vl < 1) vl = 1; + if (rOp.compare("get") == 0) { + Op = 1; 
+ } else if (rOp.compare("set") == 0) { + Op = 2; + } else { + Op = 0; + } + + while (Op == 0) { + string line1; + pthread_mutex_lock(&flock); + if (kvfile.good()) { + getline(kvfile,line1); + pthread_mutex_unlock(&flock); + } + stringstream ss1(line1); + getline( ss1, rT, ',' ); + getline( ss1, rKey, ',' ); + getline( ss1, rKeySize, ',' ); + getline( ss1, rvaluelen, ',' ); + getline( ss1, rApp, ',' ); + getline( ss1, rOp, ',' ); + vl = atoi(rvaluelen.c_str()); + if (vl < 1) vl = 1; + + if (rOp.compare("get") == 0) { + Op = 1; + } else if (rOp.compare("set") == 0) { + Op = 2; + } else { + Op = 0; + } + } + + } else { + getline( ss, rT, ',' ); + getline( ss, rApp, ',' ); + getline( ss, rOp, ',' ); + getline( ss, rKey, ',' ); + getline( ss, rvaluelen, ',' ); + if (rOp.compare("read") == 0) + Op = 1; + if (rOp.compare("write") == 0) + Op = 2; + vl = atoi(rvaluelen.c_str()); + } + + + if (vl > 524000) vl = 524000; + //if (strcmp(key,"100004781") == 0) { + // fprintf(stderr,"ready!\n"); + //} + switch(Op) + { + case 1: + issue_get_with_len(rKey.c_str(), vl, now); + break; + case 2: + int index = lrand48() % (1024 * 1024); + issue_set(rKey.c_str(), &random_char[index], vl, now,true); + break; + } + return ret; +} + + +/** + * Get/Set or Set Style + * If a GET command: Issue a get first, if not found then set + * If trace file (or prob. 
write) says to set, then set it + */ +int Connection::issue_getsetorset(double now) { + + int ret = 0; + + if (!options.read_file) { + string keystr; + char key[256]; + memset(key,0,256); + keystr = keygen->generate(lrand48() % options.records); + strncpy(key, keystr.c_str(),255); + + char log[1024]; + int length = valuesize->generate(); + sprintf(log,"%s,%d\n",key,length); + write(2,log,strlen(log)); + + issue_get_with_len(key, length, now); + + } else { + + string line; + string rT; + string rApp; + string rOp; + string rKey; + string rKeySize; + string rvaluelen; + + + int nissued = 0; + //fprintf(stderr,"starting to issue, current %d\n",issue_buf_n); + while (nissued < options.depth) { + //bool res = trace_queue->try_dequeue(line); + + if (trace_queue->size() > 0) { + pthread_mutex_lock(lock); + line = trace_queue->front(); + trace_queue->pop(); + pthread_mutex_unlock(lock); + if (line.compare("EOF") == 0) { + eof = 1; + return 1; + } + + stringstream ss(line); + int Op = 0; + int vl = 0; + + if (options.twitter_trace == 1) { + getline( ss, rT, ',' ); + getline( ss, rKey, ',' ); + getline( ss, rKeySize, ',' ); + getline( ss, rvaluelen, ',' ); + getline( ss, rApp, ',' ); + getline( ss, rOp, ',' ); + //vl = atoi(rvaluelen.c_str()); + vl = stoi(rvaluelen); + //vl = 100000; + if (vl < 1) continue; + if (vl > 524000) vl = 524000; + if (rOp.compare("get") == 0) { + Op = 1; + } else if (rOp.compare("set") == 0) { + Op = 2; + } else { + Op = 0; + } + + + } else if (options.twitter_trace == 2) { + getline( ss, rT, ',' ); + getline( ss, rApp, ',' ); + getline( ss, rOp, ',' ); + getline( ss, rKey, ',' ); + getline( ss, rvaluelen, ',' ); + Op = stoi(rOp); + vl = stoi(rvaluelen); + } else { + getline( ss, rT, ',' ); + getline( ss, rApp, ',' ); + getline( ss, rOp, ',' ); + getline( ss, rKey, ',' ); + getline( ss, rvaluelen, ',' ); + vl = stoi(rvaluelen); + if (rOp.compare("read") == 0) + Op = 1; + if (rOp.compare("write") == 0) + Op = 2; + } + + + char key[256]; + 
memset(key,0,256); + strncpy(key, rKey.c_str(),255); + int issued = 0; + switch(Op) + { + case 0: + //fprintf(stderr,"invalid line: %s, vl: %d @T: %d\n", + // key,vl,stoi(rT)); + break; + case 1: + if (nissued < options.depth-1) { + issued = issue_get_with_len(key, vl, now, true); + last_quiet = true; + } else { + issued = issue_get_with_len(key, vl, now, false); + last_quiet = false; + } + break; + case 2: + if (last_quiet) { + issue_noop(now); + } + int index = lrand48() % (1024 * 1024); + //issued = issue_get_with_len(key, vl, now, false); + issued = issue_set(key, &random_char[index], vl, now, true); + last_quiet = false; + break; + + } + if (issued) { + nissued++; + total++; + } else { + if (Op != 0) { + fprintf(stderr,"failed to issue line: %s, vl: %d @T: %d\n", + key,vl,stoi(rT)); + } + break; + } + } else { +//#ifdef DEBUGC + return 0; + //fprintf(stderr,"trace_queue size: %d\n",trace_queue->size()); + //if (stats.accesses > 10) { + // eof = 1; + // return 1; + //} + } + } + //fprintf(stderr,"done issue, current %d\n",issue_buf_n); + if (last_quiet) { + issue_noop(); + last_quiet = false; + } +#ifdef DEBUGC + fprintf(stderr,"getsetorset issuing %d reqs last quiet %d\n",issue_buf_n,last_quiet); + char *output = (char*)malloc(sizeof(char)*(issue_buf_size+512)); + fprintf(stderr,"-------------------------------------\n"); + memcpy(output,issue_buf,issue_buf_size); + write(2,output,issue_buf_size); + fprintf(stderr,"\n-------------------------------------\n"); + free(output); +#endif + //buffer is ready to go! + bufferevent_write(bev, issue_buf, issue_buf_size); + + memset(issue_buf,0,issue_buf_size); + issue_buf_pos = issue_buf; + issue_buf_size = 0; + issue_buf_n = 0; + } + + return ret; + +} + +/** + * Issue a get request to the server. 
+ */ +int Connection::issue_get_with_len(const char* key, int valuelen, double now, bool quiet) { + //Operation *op = new Operation; + Operation op; // = new Operation; + +#if HAVE_CLOCK_GETTIME + op.start_time = get_time_accurate(); +#else + if (now == 0.0) { +#if USE_CACHED_TIME + struct timeval now_tv; + event_base_gettimeofday_cached(base, &now_tv); + op.start_time = tv_to_double(&now_tv); +#else + op.start_time = get_time(); +#endif + } else { + op.start_time = now; + } +#endif + + //record before rx + //r_vsize = stats.rx_bytes % 100000; + //pthread_mutex_lock(&opaque_lock); + op.opaque = opaque++; + if (opaque > OPAQUE_MAX) { + opaque = 0; + } + //pthread_mutex_unlock(&opaque_lock); + + strncpy(op.key,key,255); + op.valuelen = valuelen; + op.type = Operation::GET; + //op.hv = hashstr(op.key); + //item_lock(op.hv,cid); + //pthread_mutex_t *lock = (pthread_mutex_t*)item_trylock(op.hv,cid); + //if (lock != NULL) { + op_queue[op.opaque] = op; + op_queue_size++; + //output_op(&op,0,0); + + //if (read_state == IDLE) read_state = WAITING_FOR_GET; + uint16_t keylen = strlen(key); + + // each line is 4-bytes + binary_header_t h = { 0x80, CMD_GET, htons(keylen), + 0x00, 0x00, htons(0), + htonl(keylen) }; + + if (quiet) { + //h.opcode = CMD_GETQ; + h.opcode = CMD_GET; + } + h.opaque = htonl(op.opaque); + + memcpy(issue_buf_pos,&h,24); + issue_buf_pos += 24; + issue_buf_size += 24; + memcpy(issue_buf_pos,key,keylen); + issue_buf_pos += keylen; + issue_buf_size += keylen; + issue_buf_n++; + + if (read_state != LOADING) stats.tx_bytes += 24 + keylen; + + stats.log_access(op); + return 1; +} + /** * Issue a get request to the server. 
*/ void Connection::issue_get(const char* key, double now) { Operation op; - int l; #if HAVE_CLOCK_GETTIME op.start_time = get_time_accurate(); @@ -155,23 +742,78 @@ void Connection::issue_get(const char* key, double now) { } #endif - op.key = string(key); + //record before rx + //r_vsize = stats.rx_bytes % 100000; + + op.opaque = opaque++; + if (opaque > OPAQUE_MAX) { + opaque = 0; + } + + strncpy(op.key,key,255); op.type = Operation::GET; - op_queue.push(op); + //op.hv = hashstr(op.key); + //item_lock(op.hv,cid); + op_queue[op.opaque] = op; + op_queue_size++; if (read_state == IDLE) read_state = WAITING_FOR_GET; - l = prot->get_request(key); + int l = prot->get_request(key,op.opaque); if (read_state != LOADING) stats.tx_bytes += l; + + stats.log_access(op); } /** - * Issue a set request to the server. + * Issue a delete90 request to the server. */ -void Connection::issue_set(const char* key, const char* value, int length, - double now) { +void Connection::issue_delete90(double now) { Operation op; int l; +#if HAVE_CLOCK_GETTIME + op.start_time = get_time_accurate(); +#else + if (now == 0.0) { +#if USE_CACHED_TIME + struct timeval now_tv; + event_base_gettimeofday_cached(base, &now_tv); + op.start_time = tv_to_double(&now_tv); +#else + op.start_time = get_time(); +#endif + } else { + op.start_time = now; + } +#endif + + op.type = Operation::DELETE; + op.opaque = 0; + op_queue[op.opaque] = op; + op_queue_size++; + + if (read_state == IDLE) read_state = WAITING_FOR_DELETE; + l = prot->delete90_request(); + if (read_state != LOADING) stats.tx_bytes += l; +} + +/** + * Issue a set request as a result of a miss to the server. + * The difference here is that we will yield to any outstanding SETs to this + * key, i.e. while waiting for GET response a SET to the key was issued. + * + * + * or v2? 
+ * - works with the lock held, since we want to beat any incoming writes + * - maintains program order, total set ordering + * - currenlty using this design + */ +void Connection::issue_set_miss(const char* key, const char* value, int length) { + //Operation *op = new Operation; + Operation op; // = new Operation; + int l; + double now = 0; + #if HAVE_CLOCK_GETTIME op.start_time = get_time_accurate(); #else @@ -179,41 +821,192 @@ void Connection::issue_set(const char* key, const char* value, int length, else op.start_time = now; #endif + //record value size + //r_vsize = length; + //r_appid = key[0] - '0'; + //const char* kptr = key; + //kptr += 2; + //r_key = atoi(kptr); + //r_ksize = strlen(kptr); + op.opaque = opaque++; + if (opaque > OPAQUE_MAX) { + opaque = 0; + } + + strncpy(op.key,key,255); + op.valuelen = length; op.type = Operation::SET; - op_queue.push(op); + //op.hv = hashstr(op.key); + op_queue[op.opaque] = op; + op_queue_size++; - if (read_state == IDLE) read_state = WAITING_FOR_SET; - l = prot->set_request(key, value, length); + //output_op(&op,1,0); + + //if (read_state == IDLE) read_state = WAITING_FOR_SET; + l = prot->set_request(key, value, length, op.opaque); if (read_state != LOADING) stats.tx_bytes += l; + + //if (is_access) + stats.log_access(op); +} + + +void Connection::issue_noop(double now) { + Operation op; + + if (now == 0.0) op.start_time = get_time(); + else op.start_time = now; + + //op.opaque = opaque++; + //if (opaque > OPAQUE_MAX) { + // opaque = 0; + //} + + //op.valuelen = 0; + //op.type = Operation::NOOP; + //op.hv = hashstr(op.key); + //pthread_mutex_t *lock = (pthread_mutex_t*)item_trylock(op.hv,cid); + //if (lock != NULL) { + //item_lock(op.hv,cid); + //op_queue[op.opaque] = op; + //op_queue_size++; + binary_header_t h = { 0x80, CMD_NOOP, 0x0000, + 0x00, 0x00, htons(0), + 0x00 }; + //h.opaque = htonl(op.opaque); + + memcpy(issue_buf_pos,&h,24); + issue_buf_pos += 24; + issue_buf_size += 24; + issue_buf_n++; } /** - * Return 
the oldest live operation in progress. + * Issue a set request to the server. */ -void Connection::pop_op() { - assert(op_queue.size() > 0); +int Connection::issue_set(const char* key, const char* value, int length, + double now, bool is_access) { + //Operation *op = new Operation; + Operation op; // = new Operation; - op_queue.pop(); +#if HAVE_CLOCK_GETTIME + op.start_time = get_time_accurate(); +#else + if (now == 0.0) op.start_time = get_time(); + else op.start_time = now; +#endif + + //record value size + //r_vsize = length; + //r_appid = key[0] - '0'; + //const char* kptr = key; + //kptr += 2; + //r_key = atoi(kptr); + //r_ksize = strlen(kptr); + op.opaque = opaque++; + if (opaque > OPAQUE_MAX) { + opaque = 0; + } + + op.valuelen = length; + op.type = Operation::SET; + strncpy(op.key,key,255); + //op.hv = hashstr(op.key); + //pthread_mutex_t *lock = (pthread_mutex_t*)item_trylock(op.hv,cid); + //if (lock != NULL) { + //item_lock(op.hv,cid); + op_queue[op.opaque] = op; + op_queue_size++; + + //output_op(&op,1,0); + uint16_t keylen = strlen(key); + + // each line is 4-bytes + binary_header_t h = { 0x80, CMD_SET, htons(keylen), + 0x08, 0x00, htons(0), + htonl(keylen + 8 + length) }; + h.opaque = htonl(op.opaque); + + memcpy(issue_buf_pos,&h,24); + issue_buf_pos += 24; + issue_buf_size += 24; + if (options.miss_through && is_access) { + uint32_t flags = htonl(16384); + memcpy(issue_buf_pos,&flags,4); + issue_buf_pos += 4; + issue_buf_size += 4; + uint32_t exp = 0; + memcpy(issue_buf_pos,&exp,4); + issue_buf_pos += 4; + issue_buf_size += 4; + + } else { + uint32_t flags = 0; + memcpy(issue_buf_pos,&flags,4); + issue_buf_pos += 4; + issue_buf_size += 4; + uint32_t exp = 0; + memcpy(issue_buf_pos,&exp,4); + issue_buf_pos += 4; + issue_buf_size += 4; + } + memcpy(issue_buf_pos,key,keylen); + issue_buf_pos += keylen; + issue_buf_size += keylen; + memcpy(issue_buf_pos,value,length); + issue_buf_pos += length; + issue_buf_size += length; + issue_buf_n++; + + + //if 
(read_state == IDLE) read_state = WAITING_FOR_SET; + //l = prot->set_request(key, value, length, op->opaque); + + //if (is_access) { + if (read_state != LOADING) stats.tx_bytes += length + 32 + keylen; + stats.log_access(op); + //} + return 1; + //} else { + // return 0; + //} +} + +/** + * Return the oldest live operation in progress. + */ +void Connection::pop_op(Operation *op) { + + //assert(op_queue.size() > 0); + uint32_t opopq = op->opaque; + //pthread_mutex_t *l = op->lock; + //delete op_queue[opopq]; + op_queue.erase(opopq); + op_queue_size--; + + //item_trylock_unlock(l,cid); + //item_unlock(hv,cid); if (read_state == LOADING) return; read_state = IDLE; // Advance the read state machine. - if (op_queue.size() > 0) { - Operation& op = op_queue.front(); - switch (op.type) { - case Operation::GET: read_state = WAITING_FOR_GET; break; - case Operation::SET: read_state = WAITING_FOR_SET; break; - default: DIE("Not implemented."); - } - } + //if (op_queue.size() > 0) { + // Operation& op = op_queue.front(); + // switch (op.type) { + // case Operation::GET: read_state = WAITING_FOR_GET; break; + // case Operation::SET: read_state = WAITING_FOR_SET; break; + // case Operation::DELETE: read_state = WAITING_FOR_DELETE; break; + // default: DIE("Not implemented."); + // } + //} } /** * Finish up (record stats) an operation that just returned from the * server. 
*/ -void Connection::finish_op(Operation *op) { +void Connection::finish_op(Operation *op, int was_hit) { double now; #if USE_CACHED_TIME struct timeval now_tv; @@ -228,25 +1021,84 @@ void Connection::finish_op(Operation *op) { op->end_time = now; #endif - switch (op->type) { - case Operation::GET: stats.log_get(*op); break; - case Operation::SET: stats.log_set(*op); break; - default: DIE("Not implemented."); + if (options.successful_queries && was_hit) { + switch (op->type) { + case Operation::GET: stats.log_get(*op); break; + case Operation::SET: stats.log_set(*op); break; + case Operation::DELETE: break; + default: DIE("Not implemented."); + } + } else { + switch (op->type) { + case Operation::GET: stats.log_get(*op); break; + case Operation::SET: stats.log_set(*op); break; + case Operation::DELETE: break; + default: DIE("Not implemented."); + } } last_rx = now; - pop_op(); - drive_write_machine(); + uint32_t opopq = op->opaque; + op_queue.erase(opopq); + //op_queue.erase(op_queue.begin()+opopq); + //delete op_queue[opopq]; + op_queue_size--; + read_state = IDLE; + + //lets check if we should output stats for the window + //Do the binning for percentile outputs + //crude at start + if ((options.misswindow != 0) && ( ((stats.window_accesses) % options.misswindow) == 0)) + { + if (stats.window_gets != 0) + { + //printf("%lu,%.4f\n",(stats.accesses), + // ((double)stats.window_get_misses/(double)stats.window_accesses)); + stats.window_gets = 0; + stats.window_get_misses = 0; + stats.window_sets = 0; + stats.window_accesses = 0; + } + } + } + + /** * Check if our testing is done and we should exit. 
*/ bool Connection::check_exit_condition(double now) { if (read_state == INIT_READ) return false; if (now == 0.0) now = get_time(); - if (now > start_time + options.time) return true; - if (options.loadonly && read_state == IDLE) return true; + + if (options.read_file) { + if (eof) { + return true; + } + else if ((options.queries == 1) && + (now > start_time + options.time)) + { + return true; + } + else { + return false; + } + + } else { + if (options.queries != 0 && + (((long unsigned)options.queries) == (stats.accesses))) + { + return true; + } + if ((options.queries == 0) && + (now > start_time + options.time)) + { + return true; + } + if (options.loadonly && read_state == IDLE) return true; + } + return false; } @@ -259,7 +1111,7 @@ void Connection::event_callback(short events) { int fd = bufferevent_getfd(bev); if (fd < 0) DIE("bufferevent_getfd"); - if (!options.no_nodelay) { + if (!options.no_nodelay && !options.unix_socket) { int one = 1; if (setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, (void *) &one, sizeof(one)) < 0) @@ -270,14 +1122,19 @@ void Connection::event_callback(short events) { if (prot->setup_connection_w()) { read_state = IDLE; } + drive_write_machine(); } else if (events & BEV_EVENT_ERROR) { int err = bufferevent_socket_get_dns_error(bev); - if (err) DIE("DNS error: %s", evutil_gai_strerror(err)); + if (err) fprintf(stderr,"DNS error: %s", evutil_gai_strerror(err)); + fprintf(stderr,"Got an error: %s\n", + evutil_socket_error_to_string(EVUTIL_SOCKET_ERROR())); DIE("BEV_EVENT_ERROR: %s", strerror(errno)); } else if (events & BEV_EVENT_EOF) { - DIE("Unexpected EOF from server."); + //DIE("Unexpected EOF from server."); + fprintf(stderr,"Unexpected EOF from server."); + return; } } @@ -293,7 +1150,9 @@ void Connection::drive_write_machine(double now) { double delay; struct timeval tv; - if (check_exit_condition(now)) return; + if (check_exit_condition(now)) { + return; + } while (1) { switch (write_state) { @@ -303,9 +1162,14 @@ void 
Connection::drive_write_machine(double now) { double_to_tv(delay, &tv); evtimer_add(timer, &tv); write_state = WAITING_FOR_TIME; + write_state = ISSUING; break; case ISSUING: + if (op_queue_size >= (size_t) options.depth) { + write_state = WAITING_FOR_OPQ; + return; + } if (op_queue.size() >= (size_t) options.depth) { write_state = WAITING_FOR_OPQ; return; @@ -323,9 +1187,15 @@ void Connection::drive_write_machine(double now) { return; } - issue_something(now); + if (options.getsetorset) { + int ret = issue_getsetorset(now); + if (ret) return; //if at EOF + } else { + issue_something(now); + } + last_tx = now; - stats.log_op(op_queue.size()); + stats.log_op(op_queue_size); next_time += iagen->generate(); if (options.skip && options.lambda > 0.0 && @@ -352,7 +1222,7 @@ void Connection::drive_write_machine(double now) { break; case WAITING_FOR_OPQ: - if (op_queue.size() >= (size_t) options.depth) return; + if (op_queue_size >= (size_t) options.depth) return; write_state = ISSUING; break; @@ -368,82 +1238,179 @@ void Connection::read_callback() { struct evbuffer *input = bufferevent_get_input(bev); Operation *op = NULL; - bool done, full_read; - - if (op_queue.size() == 0) V("Spurious read callback."); - - while (1) { - if (op_queue.size() > 0) op = &op_queue.front(); - - switch (read_state) { - case INIT_READ: DIE("event from uninitialized connection"); - case IDLE: return; // We munched all the data we expected? 
- - case WAITING_FOR_GET: - assert(op_queue.size() > 0); - full_read = prot->handle_response(input, done); - if (!full_read) { - return; - } else if (done) { - finish_op(op); // sets read_state = IDLE - } - break; - - case WAITING_FOR_SET: - assert(op_queue.size() > 0); - if (!prot->handle_response(input, done)) return; - finish_op(op); - break; - - case LOADING: - assert(op_queue.size() > 0); - if (!prot->handle_response(input, done)) return; - loader_completed++; - pop_op(); - - if (loader_completed == options.records) { - D("Finished loading."); - read_state = IDLE; - } else { - while (loader_issued < loader_completed + LOADER_CHUNK) { - if (loader_issued >= options.records) break; - - char key[256]; - string keystr = keygen->generate(loader_issued); - strcpy(key, keystr.c_str()); - int index = lrand48() % (1024 * 1024); - issue_set(key, &random_char[index], valuesize->generate()); - - loader_issued++; - } - } - - break; - - case CONN_SETUP: + bool done, found; + + //initially assume found (for sets that may come through here) + //is this correct? do we want to assume true in case that + //GET was found, but wrong value size (i.e. 
update value) + // + found = true; + //bool full_read = true; + //fprintf(stderr,"read_cb start with current queue of ops: %lu and issue_buf_n: %d\n",op_queue.size(),issue_buf_n); + + //if (op_queue.size() == 0) V("Spurious read callback."); + bool full_read = true; + while (full_read) { + + if (read_state == CONN_SETUP) { assert(options.binary); if (!prot->setup_connection_r(input)) return; read_state = IDLE; break; + } + + int opcode; + uint32_t opaque; + full_read = prot->handle_response(input, done, found, opcode, opaque); + if (full_read) { + if (opcode == CMD_NOOP) { +#ifdef DEBUGC + char out[128]; + sprintf(out,"conn: %u, reading noop\n",cid); + write(2,out,strlen(out)); +#endif + continue; + } + op = &op_queue[opaque]; +#ifdef DEBUGC + char out[128]; + sprintf(out,"conn: %u, reading opaque: %u\n",cid,opaque); + write(2,out,strlen(out)); + output_op(op,2,found); +#endif + if (strlen(op->key) < 1) { + //char out2[128]; + //sprintf(out2,"conn: %u, bad op: %s\n",cid,op->key.c_str()); + //write(2,out2,strlen(out2)); + continue; + } + } else { + break; + } + + + switch (op->type) { + case Operation::GET: + if (done) { + if ( !found && (options.getset || options.getsetorset) ) {// && + //(options.twitter_trace != 1)) { + char key[256]; + string keystr = op->key; + strcpy(key, keystr.c_str()); + int valuelen = op->valuelen; + int index = lrand48() % (1024 * 1024); + finish_op(op,0); // sets read_state = IDLE + if (last_quiet) { + issue_noop(); + } + //issue_set_miss(key, &random_char[index], valuelen); + issue_set(key, &random_char[index], valuelen, false); + last_quiet = false; + + } else { + if (found) { + finish_op(op,1); + } else { + finish_op(op,0); + } + } + } else { + char out[128]; + sprintf(out,"conn: %u, not done reading, should do something",cid); + write(2,out,strlen(out)); + } + break; + case Operation::SET: + finish_op(op,1); + break; + default: + fprintf(stderr,"op: %p, key: %s opaque: %u\n",(void*)op,op->key,op->opaque); + DIE("not implemented"); + 
} - default: DIE("not implemented"); + } + + double now = get_time(); + if (check_exit_condition(now)) { + return; + } +#ifdef DEBUGC + fprintf(stderr,"read_cb done with current queue of ops: %d and issue_buf_n: %d\n",op_queue_size,issue_buf_n); + for (auto x : op_queue) { + cerr << x.first << ": " << x.second.key << endl; + } +#endif + //buffer is ready to go! + //if (issue_buf_n >= options.depth) { + if (issue_buf_n > 0) { + if (last_quiet) { + issue_noop(); + last_quiet = false; } +#ifdef DEBUGC + fprintf(stderr,"read_cb writing %d reqs, last quiet %d\n",issue_buf_n,last_quiet); + char *output = (char*)malloc(sizeof(char)*(issue_buf_size+512)); + fprintf(stderr,"-------------------------------------\n"); + memcpy(output,issue_buf,issue_buf_size); + write(2,output,issue_buf_size); + fprintf(stderr,"\n-------------------------------------\n"); + free(output); +#endif + + bufferevent_write(bev, issue_buf, issue_buf_size); + memset(issue_buf,0,issue_buf_size); + issue_buf_pos = issue_buf; + issue_buf_size = 0; + issue_buf_n = 0; } + + //if (op_queue_size > (uint32_t) options.depth) { + // fprintf(stderr,"read_cb opqueue too big %d\n",op_queue_size); + // return; + //} else { + // fprintf(stderr,"read_cb issing %d\n",op_queue_size); + // issue_getsetorset(now); + //} + last_tx = now; + stats.log_op(op_queue_size); + drive_write_machine(); + + // update events + //if (bev != NULL) { + // // no pending response (nothing to read) and output buffer empty (nothing to write) + // if ((op_queue.size() == 0) && (evbuffer_get_length(bufferevent_get_output(bev)) == 0)) { + // bufferevent_disable(bev, EV_WRITE|EV_READ); + // } + //} } /** * Callback called when write requests finish. */ -void Connection::write_callback() {} +void Connection::write_callback() { + + //fprintf(stderr,"loaded evbuffer with ops: %u\n",op_queue.size()); +} /** * Callback for timer timeouts. 
*/ -void Connection::timer_callback() { drive_write_machine(); } +void Connection::timer_callback() { + drive_write_machine(); +} +// //fprintf(stderr,"timer callback issuing requests!\n"); +// if (op_queue_size >= (size_t) options.depth) { +// return; +// } else { +// double now = get_time(); +// issue_getsetorset(now); +// } +//} /* The follow are C trampolines for libevent callbacks. */ void bev_event_cb(struct bufferevent *bev, short events, void *ptr) { + Connection* conn = (Connection*) ptr; conn->event_callback(events); } diff --git a/Connection.h b/Connection.h index fea451e..b617b9e 100644 --- a/Connection.h +++ b/Connection.h @@ -4,12 +4,16 @@ #include #include +#include +#include +#include #include #include #include #include +#include "bipbuffer.h" #include "AdaptiveSampler.h" #include "cmdline.h" #include "ConnectionOptions.h" @@ -17,15 +21,61 @@ #include "Generator.h" #include "Operation.h" #include "util.h" - +#include "blockingconcurrentqueue.h" #include "Protocol.h" +#define OPAQUE_MAX 64000 +#define hashsize(n) ((unsigned long int)1<<(n)) +#define hashmask(n) (hashsize(n)-1) + +#define MAX_BUFFER_SIZE 10*1024*1024 +#define MAX_LEVELS 2+1 + using namespace std; +using namespace moodycamel; + + +typedef struct _evicted_type { + bool evicted; + uint32_t evictedFlags; + uint32_t serverFlags; + uint32_t clsid; + uint32_t evictedKeyLen; + uint32_t evictedLen; + char *evictedKey; + char *evictedData; +} evicted_t; + +typedef struct resp { + uint32_t opaque; + int opcode; + bool found; + evicted_t* evict; +} resp_t; + void bev_event_cb(struct bufferevent *bev, short events, void *ptr); void bev_read_cb(struct bufferevent *bev, void *ptr); +void bev_event_cb1(struct bufferevent *bev, short events, void *ptr); +void bev_event_cb1_approx(struct bufferevent *bev, short events, void *ptr); +void bev_event_cb1_approx_batch(struct bufferevent *bev, short events, void *ptr); +void bev_read_cb1(struct bufferevent *bev, void *ptr); +void bev_read_cb1_approx(struct 
bufferevent *bev, void *ptr); +void bev_read_cb1_approx_batch(struct bufferevent *bev, void *ptr); +void bev_event_cb2(struct bufferevent *bev, short events, void *ptr); +void bev_event_cb2_approx(struct bufferevent *bev, short events, void *ptr); +void bev_event_cb2_approx_batch(struct bufferevent *bev, short events, void *ptr); +void bev_read_cb2(struct bufferevent *bev, void *ptr); +void bev_read_cb2_approx(struct bufferevent *bev, void *ptr); +void bev_read_cb2_approx_batch(struct bufferevent *bev, void *ptr); void bev_write_cb(struct bufferevent *bev, void *ptr); +void bev_write_cb_m(struct bufferevent *bev, void *ptr); +void bev_write_cb_m_approx(struct bufferevent *bev, void *ptr); +void bev_write_cb_m_approx_batch(struct bufferevent *bev, void *ptr); void timer_cb(evutil_socket_t fd, short what, void *ptr); +void timer_cb_m(evutil_socket_t fd, short what, void *ptr); +void timer_cb_m_approx(evutil_socket_t fd, short what, void *ptr); +void timer_cb_m_approx_batch(evutil_socket_t fd, short what, void *ptr); class Protocol; @@ -33,9 +83,13 @@ class Connection { public: Connection(struct event_base* _base, struct evdns_base* _evdns, string _hostname, string _port, options_t options, + //ConcurrentQueue *a_trace_queue, bool sampling = true); + ~Connection(); + int do_connect(); + double start_time; // Time when this connection began operations. ConnectionStats stats; options_t options; @@ -54,6 +108,11 @@ class Connection { void read_callback(); void write_callback(); void timer_callback(); + + uint32_t get_cid(); + //void set_queue(ConcurrentQueue *a_trace_queue); + void set_queue(queue *a_trace_queue); + void set_lock(pthread_mutex_t* a_lock); private: string hostname; @@ -75,6 +134,7 @@ class Connection { IDLE, WAITING_FOR_GET, WAITING_FOR_SET, + WAITING_FOR_DELETE, MAX_READ_STATE, }; @@ -92,33 +152,870 @@ class Connection { // Parameters to track progress of the data loader. 
int loader_issued, loader_completed; + uint32_t opaque; + int issue_buf_size; + int issue_buf_n; + unsigned char *issue_buf_pos; + unsigned char *issue_buf; + bool last_quiet; + uint32_t total; + uint32_t cid; + int eof; + + Protocol *prot; Generator *valuesize; Generator *keysize; KeyGenerator *keygen; Generator *iagen; - std::queue op_queue; + //std::vector> op_queue; + std::unordered_map op_queue; + + uint32_t op_queue_size; + pthread_mutex_t* lock; + //ConcurrentQueue *trace_queue; + queue *trace_queue; // state machine functions / event processing - void pop_op(); - void finish_op(Operation *op); + void pop_op(Operation *op); + void output_op(Operation *op, int type, bool was_found); + //void finish_op(Operation *op); + void finish_op(Operation *op,int was_hit); void issue_something(double now = 0.0); + int issue_something_trace(double now = 0.0); + void issue_getset(double now = 0.0); + int issue_getsetorset(double now = 0.0); void drive_write_machine(double now = 0.0); // request functions void issue_sasl(); + void issue_noop(double now = 0.0); void issue_get(const char* key, double now = 0.0); - void issue_set(const char* key, const char* value, int length, - double now = 0.0); + int issue_get_with_len(const char* key, int valuelen, double now = 0.0, bool quiet = false); + int issue_set(const char* key, const char* value, int length, + double now = 0.0, bool is_access = false); + void issue_set_miss(const char* key, const char* value, int length); + void issue_delete90(double now = 0.0); + + // protocol fucntions + int set_request_ascii(const char* key, const char* value, int length); + int set_request_binary(const char* key, const char* value, int length); + int set_request_resp(const char* key, const char* value, int length); + + int get_request_ascii(const char* key); + int get_request_binary(const char* key); + int get_request_resp(const char* key); + + bool consume_binary_response(evbuffer *input); + bool consume_ascii_line(evbuffer *input, bool 
&done); + bool consume_resp_line(evbuffer *input, bool &done); +}; + +class ConnectionMulti { +public: + ConnectionMulti(struct event_base* _base, struct evdns_base* _evdns, + string _hostname1, string _hostname2, string _port, options_t options, + bool sampling = true, int fd1 = -1, int fd2 = -1); + + ~ConnectionMulti(); + + int do_connect(); + + double start_time; // Time when this connection began operations. + ConnectionStats stats; + options_t options; + + bool is_ready() { return read_state == IDLE; } + void set_priority(int pri); + + // state commands + void start() { + //fprintf(stderr,"connid: %d starting...\n",cid); + drive_write_machine(); + } + void start_loading(); + void reset(); + bool check_exit_condition(double now = 0.0); + + void event_callback1(short events); + void event_callback2(short events); + void read_callback1(); + void read_callback2(); + // event callbacks + void write_callback(); + void timer_callback(); + + int eof; + uint32_t get_cid(); + //void set_queue(ConcurrentQueue *a_trace_queue); + int add_to_wb_keys(string wb_key); + int add_to_copy_keys(string key); + void del_wb_keys(string wb_key); + void del_copy_keys(string key); + void set_g_wbkeys(unordered_map> *a_wb_keys); + void set_queue(queue *a_trace_queue); + void set_lock(pthread_mutex_t* a_lock); + +private: + string hostname1; + string hostname2; + string port; + + double o_percent; + int trace_queue_n; + struct event_base *base; + struct evdns_base *evdns; + struct bufferevent *bev1; + struct bufferevent *bev2; + + struct event *timer; // Used to control inter-transmission time. + double next_time; // Inter-transmission time parameters. + double last_rx; // Used to moderate transmission rate. 
+ double last_tx; + + vector wb_keys; + enum read_state_enum { + INIT_READ, + CONN_SETUP, + LOADING, + IDLE, + WAITING_FOR_GET, + WAITING_FOR_SET, + WAITING_FOR_DELETE, + MAX_READ_STATE, + }; + + enum write_state_enum { + INIT_WRITE, + ISSUING, + WAITING_FOR_TIME, + WAITING_FOR_OPQ, + MAX_WRITE_STATE, + }; + + read_state_enum read_state; + write_state_enum write_state; + + // Parameters to track progress of the data loader. + int loader_issued, loader_completed; + + uint32_t *opaque; + int *issue_buf_size; + int *issue_buf_n; + unsigned char **issue_buf_pos; + unsigned char **issue_buf; + bool last_quiet1; + bool last_quiet2; + uint32_t total; + uint32_t cid; + + //std::vector> op_queue; + Operation ***op_queue; + uint32_t *op_queue_size; + + map key_hist; + + Generator *valuesize; + Generator *keysize; + KeyGenerator *keygen; + Generator *iagen; + pthread_mutex_t* lock; + unordered_map> *g_wb_keys; + queue *trace_queue; + + // state machine functions / event processing + void pop_op(Operation *op); + void output_op(Operation *op, int type, bool was_found); + //void finish_op(Operation *op); + void finish_op(Operation *op,int was_hit); + int issue_getsetorset(double now = 0.0); + void drive_write_machine(double now = 0.0); + + // request functions + void issue_sasl(); + void issue_noop(double now = 0.0, int level = 1); + int issue_touch(const char* key, int valuelen, double now, int level); + int issue_delete(const char* key, double now, uint32_t flags); + int issue_get_with_len(const char* key, int valuelen, double now, bool quiet, uint32_t flags, Operation *l1 = NULL); + int issue_set(const char* key, const char* value, int length, double now, uint32_t flags); + + // protocol fucntions + int set_request_ascii(const char* key, const char* value, int length); + int set_request_binary(const char* key, const char* value, int length); + int set_request_resp(const char* key, const char* value, int length); + + int get_request_ascii(const char* key); + int 
get_request_binary(const char* key); + int get_request_resp(const char* key); + + bool consume_binary_response(evbuffer *input); + bool consume_ascii_line(evbuffer *input, bool &done); + bool consume_resp_line(evbuffer *input, bool &done); +}; + +class ConnectionMultiApprox { +public: + ConnectionMultiApprox(struct event_base* _base, struct evdns_base* _evdns, + string _hostname1, string _hostname2, string _port, options_t options, + bool sampling = true, int fd1 = -1, int fd2 = -1); + + ~ConnectionMultiApprox(); + + int do_connect(); + + double start_time; // Time when this connection began operations. + ConnectionStats stats; + options_t options; + + bool is_ready() { return read_state == IDLE; } + void set_priority(int pri); + + // state commands + void start() { + //fprintf(stderr,"connid: %d starting...\n",cid); + drive_write_machine(); + } + void start_loading(); + void reset(); + bool check_exit_condition(double now = 0.0); + + void event_callback1(short events); + void event_callback2(short events); + void read_callback1(); + void read_callback2(); + // event callbacks + void write_callback(); + void timer_callback(); + + int eof; + uint32_t get_cid(); + //void set_queue(ConcurrentQueue *a_trace_queue); + int add_to_wb_keys(string wb_key); + int add_to_copy_keys(string key); + int add_to_touch_keys(string key); + void del_wb_keys(string wb_key); + void del_copy_keys(string key); + void del_touch_keys(string key); + void set_g_wbkeys(unordered_map> *a_wb_keys); + void set_queue(queue *a_trace_queue); + void set_lock(pthread_mutex_t* a_lock); + +private: + string hostname1; + string hostname2; + string port; + + double o_percent; + int trace_queue_n; + struct event_base *base; + struct evdns_base *evdns; + struct bufferevent *bev1; + struct bufferevent *bev2; + + struct event *timer; // Used to control inter-transmission time. + double next_time; // Inter-transmission time parameters. + double last_rx; // Used to moderate transmission rate. 
+ double last_tx; + + enum read_state_enum { + INIT_READ, + CONN_SETUP, + LOADING, + IDLE, + WAITING_FOR_GET, + WAITING_FOR_SET, + WAITING_FOR_DELETE, + MAX_READ_STATE, + }; + + enum write_state_enum { + INIT_WRITE, + ISSUING, + WAITING_FOR_TIME, + WAITING_FOR_OPQ, + MAX_WRITE_STATE, + }; + + read_state_enum read_state; + write_state_enum write_state; + + // Parameters to track progress of the data loader. + int loader_issued, loader_completed; + + uint32_t *opaque; + int *issue_buf_size; + int *issue_buf_n; + unsigned char **issue_buf_pos; + unsigned char **issue_buf; + bool last_quiet1; + bool last_quiet2; + uint32_t total; + uint32_t cid; + uint32_t gets; + uint32_t gloc; + uint32_t ghits; + uint32_t sloc; + uint32_t esets; + uint32_t isets; + uint32_t iloc; + + //std::vector> op_queue; + Operation ***op_queue; + uint32_t *op_queue_size; + + + Generator *valuesize; + Generator *keysize; + KeyGenerator *keygen; + Generator *iagen; + pthread_mutex_t* lock; + unordered_map> *g_wb_keys; + queue *trace_queue; + + // state machine functions / event processing + void pop_op(Operation *op); + void output_op(Operation *op, int type, bool was_found); + //void finish_op(Operation *op); + void finish_op(Operation *op,int was_hit); + int issue_getsetorset(double now = 0.0); + void drive_write_machine(double now = 0.0); + + // request functions + void issue_sasl(); + int issue_op(Operation* op); + void issue_noop(double now = 0.0, int level = 1); + int issue_touch(const char* key, int valuelen, double now, int level); + int issue_delete(const char* key, double now, uint32_t flags); + int issue_get_with_len(const char* key, int valuelen, double now, bool quiet, uint32_t flags, Operation *l1 = NULL); + int issue_get_with_len(Operation *pop, double now, bool quiet, uint32_t flags, Operation *l1 = NULL); + int issue_set(const char* key, const char* value, int length, double now, uint32_t flags); + int issue_set(Operation *pop, const char* value, double now, uint32_t flags); + + 
// protocol fucntions + int set_request_ascii(const char* key, const char* value, int length); + int set_request_binary(const char* key, const char* value, int length); + int set_request_resp(const char* key, const char* value, int length); + + int get_request_ascii(const char* key); + int get_request_binary(const char* key); + int get_request_resp(const char* key); + + bool consume_binary_response(evbuffer *input); + bool consume_ascii_line(evbuffer *input, bool &done); + bool consume_resp_line(evbuffer *input, bool &done); +}; + +class ConnectionMultiApproxBatch { +public: + ConnectionMultiApproxBatch(struct event_base* _base, struct evdns_base* _evdns, + string _hostname1, string _hostname2, string _port, options_t options, + bool sampling = true, int fd1 = -1, int fd2 = -1); + + ~ConnectionMultiApproxBatch(); + + int do_connect(); + + double start_time; // Time when this connection began operations. + ConnectionStats stats; + options_t options; + + bool is_ready() { return read_state == IDLE; } + void set_priority(int pri); + + // state commands + void start() { + //fprintf(stderr,"connid: %d starting...\n",cid); + drive_write_machine(); + } + void start_loading(); + void reset(); + bool check_exit_condition(double now = 0.0); + + void event_callback1(short events); + void event_callback2(short events); + void read_callback1(); + void read_callback2(); + void read_callback1_v1(); + void read_callback2_v1(); + // event callbacks + void write_callback(); + void timer_callback(); + + int eof; + uint32_t get_cid(); + //void set_queue(ConcurrentQueue *a_trace_queue); + int add_to_wb_keys(string wb_key); + int add_to_copy_keys(string key); + int add_to_touch_keys(string key); + void del_wb_keys(string wb_key); + void del_copy_keys(string key); + void del_touch_keys(string key); + void set_g_wbkeys(unordered_map> *a_wb_keys); + void set_queue(queue *a_trace_queue); + void set_lock(pthread_mutex_t* a_lock); + int send_write_buffer(int level); + int 
add_get_op_to_queue(Operation *pop, int level); + int add_set_to_queue(Operation *pop, int level, const char *value); + size_t handle_response_batch(unsigned char *rbuf_pos, resp_t *resp, + size_t read_bytes, size_t consumed_bytes, + int level, int extra); + +private: + string hostname1; + string hostname2; + string port; + + double o_percent; + int trace_queue_n; + struct event_base *base; + struct evdns_base *evdns; + struct bufferevent *bev1; + struct bufferevent *bev2; + + struct event *timer; // Used to control inter-transmission time. + double next_time; // Inter-transmission time parameters. + double last_rx; // Used to moderate transmission rate. + double last_tx; + + enum read_state_enum { + INIT_READ, + CONN_SETUP, + LOADING, + IDLE, + WAITING_FOR_GET, + WAITING_FOR_SET, + WAITING_FOR_DELETE, + MAX_READ_STATE, + }; + + enum write_state_enum { + INIT_WRITE, + ISSUING, + WAITING_FOR_TIME, + WAITING_FOR_OPQ, + MAX_WRITE_STATE, + }; + + read_state_enum read_state; + write_state_enum write_state; + + // Parameters to track progress of the data loader. 
+ int loader_issued, loader_completed; + + uint32_t *opaque; + int *issue_buf_size; + int *issue_buf_n; + unsigned char **issue_buf_pos; + unsigned char **issue_buf; + bool last_quiet1; + bool last_quiet2; + uint32_t total; + uint32_t cid; + uint32_t gets; + uint32_t gloc; + uint32_t ghits; + uint32_t sloc; + uint32_t esets; + uint32_t isets; + uint32_t iloc; + + uint32_t clsid_; + uint32_t incl_; + uint32_t buffer_size_; + unsigned char* buffer_write[MAX_LEVELS]; + unsigned char* buffer_read[MAX_LEVELS]; + unsigned char* buffer_write_pos[MAX_LEVELS]; + unsigned char* buffer_read_pos[MAX_LEVELS]; + unsigned char* buffer_lasthdr[MAX_LEVELS]; + unsigned char* buffer_leftover[MAX_LEVELS]; + uint32_t buffer_read_n[MAX_LEVELS]; + uint32_t buffer_write_n[MAX_LEVELS]; + uint32_t buffer_read_nbytes[MAX_LEVELS]; + uint32_t buffer_write_nbytes[MAX_LEVELS]; + + + //std::vector> op_queue; + Operation ***op_queue; + uint32_t *op_queue_size; + + + Generator *valuesize; + Generator *keysize; + KeyGenerator *keygen; + Generator *iagen; + pthread_mutex_t* lock; + unordered_map> *g_wb_keys; + queue *trace_queue; + + // state machine functions / event processing + void pop_op(Operation *op); + void output_op(Operation *op, int type, bool was_found); + //void finish_op(Operation *op); + void finish_op(Operation *op,int was_hit); + int issue_getsetorset(double now = 0.0); + void drive_write_machine(double now = 0.0); + + // request functions + void issue_sasl(); + int issue_op(Operation* op); + int issue_noop(int level = 1); + size_t fill_read_buffer(int level, int *extra); + int issue_touch(const char* key, int valuelen, double now, int level); + int issue_delete(const char* key, double now, uint32_t flags); + int issue_get_with_len(const char* key, int valuelen, double now, bool quiet, uint32_t flags, Operation *l1 = NULL); + int issue_get_with_len(Operation *pop, double now, bool quiet, uint32_t flags, Operation *l1 = NULL); + int issue_set(const char* key, const char* value, int 
length, double now, uint32_t flags); + int issue_set(Operation *pop, const char* value, double now, uint32_t flags); + + // protocol fucntions + int set_request_ascii(const char* key, const char* value, int length); + int set_request_binary(const char* key, const char* value, int length); + int set_request_resp(const char* key, const char* value, int length); + + int get_request_ascii(const char* key); + int get_request_binary(const char* key); + int get_request_resp(const char* key); + + bool consume_binary_response(evbuffer *input); + bool consume_ascii_line(evbuffer *input, bool &done); + bool consume_resp_line(evbuffer *input, bool &done); +}; + +class ConnectionMultiApproxShm { +public: + ConnectionMultiApproxShm(options_t options, bool sampling = true); + + ~ConnectionMultiApproxShm(); + + int do_connect(); + + double start_time; // Time when this connection began operations. + ConnectionStats stats; + options_t options; + + bool is_ready() { return read_state == IDLE; } + void set_priority(int pri); + + void start_loading(); + void reset(); + bool check_exit_condition(double now = 0.0); + + void event_callback1(short events); + void event_callback2(short events); + void read_callback1(); + void read_callback2(); + void read_callback1_v1(); + void read_callback2_v1(); + // event callbacks + void write_callback(); + void timer_callback(); + + int eof; + uint32_t get_cid(); + //void set_queue(ConcurrentQueue *a_trace_queue); + int add_to_wb_keys(string wb_key); + int add_to_copy_keys(string key); + int add_to_touch_keys(string key); + void del_wb_keys(string wb_key); + void del_copy_keys(string key); + void del_touch_keys(string key); + void set_g_wbkeys(unordered_map> *a_wb_keys); + void set_queue(queue *a_trace_queue); + void set_lock(pthread_mutex_t* a_lock); + int send_write_buffer(int level); + int add_get_op_to_queue(Operation *pop, int level); + int add_set_to_queue(Operation *pop, int level, const char *value); + size_t handle_response_batch(unsigned 
char *rbuf_pos, resp_t *resp, + size_t read_bytes, size_t consumed_bytes, + int level, int extra); + void drive_write_machine_shm(double now = 0.0); + bipbuf_t* bipbuf_in[3]; + bipbuf_t* bipbuf_out[3]; + pthread_mutex_t* lock_in[3]; + pthread_mutex_t* lock_out[3]; + pthread_cond_t* cond_in_not_empty[3]; + pthread_cond_t* cond_in_not_full[3]; + pthread_cond_t* cond_out_not_empty[3]; + pthread_cond_t* cond_out_not_full[3]; + +private: + string hostname1; + string hostname2; + string port; + + double o_percent; + int trace_queue_n; + + struct event *timer; // Used to control inter-transmission time. + double next_time; // Inter-transmission time parameters. + double last_rx; // Used to moderate transmission rate. + double last_tx; + + enum read_state_enum { + INIT_READ, + CONN_SETUP, + LOADING, + IDLE, + WAITING_FOR_GET, + WAITING_FOR_SET, + WAITING_FOR_DELETE, + MAX_READ_STATE, + }; + + enum write_state_enum { + INIT_WRITE, + ISSUING, + WAITING_FOR_TIME, + WAITING_FOR_OPQ, + MAX_WRITE_STATE, + }; + + read_state_enum read_state; + write_state_enum write_state; + + // Parameters to track progress of the data loader. 
+ int loader_issued, loader_completed; + + uint32_t *opaque; + int *issue_buf_size; + int *issue_buf_n; + unsigned char **issue_buf_pos; + unsigned char **issue_buf; + bool last_quiet1; + bool last_quiet2; + uint32_t total; + uint32_t cid; + uint32_t gets; + uint32_t gloc; + uint32_t ghits; + uint32_t sloc; + uint32_t esets; + uint32_t isets; + uint32_t iloc; + + + //std::vector> op_queue; + Operation ***op_queue; + uint32_t *op_queue_size; + + + Generator *valuesize; + Generator *keysize; + KeyGenerator *keygen; + Generator *iagen; + pthread_mutex_t* lock; + unordered_map> *g_wb_keys; + queue *trace_queue; + queue extra_queue; + + // state machine functions / event processing + void pop_op(Operation *op); + void output_op(Operation *op, int type, bool was_found); + //void finish_op(Operation *op); + void finish_op(Operation *op,int was_hit); + int issue_getsetorset(double now = 0.0); + + // request functions + void issue_sasl(); + int issue_op(Operation* op); + void issue_noop(int level = 1); + size_t fill_read_buffer(int level, int *extra); + int issue_touch(const char* key, int valuelen, double now, int level); + int issue_delete(const char* key, double now, uint32_t flags); + int issue_get_with_len(const char* key, int valuelen, double now, bool quiet, uint32_t flags, Operation *l1 = NULL); + int issue_get_with_len(Operation *pop, double now, bool quiet, uint32_t flags, Operation *l1 = NULL); + int issue_set(const char* key, const char* value, int length, double now, uint32_t flags); + int issue_set(Operation *pop, const char* value, double now, uint32_t flags); + int offer_set(Operation *pop, int extra = 0); + int offer_get(Operation *pop, int extra = 0); + + int read_response_l1(); + void read_response_l2(); + // protocol fucntions + int set_request_ascii(const char* key, const char* value, int length); + int set_request_binary(const char* key, const char* value, int length); + int set_request_resp(const char* key, const char* value, int length); + + int 
get_request_ascii(const char* key); + int get_request_binary(const char* key); + int get_request_resp(const char* key); + + bool consume_binary_response(evbuffer *input); + bool consume_ascii_line(evbuffer *input, bool &done); + bool consume_resp_line(evbuffer *input, bool &done); +}; + +class ConnectionMultiApproxBatchShm { +public: + ConnectionMultiApproxBatchShm(options_t options, bool sampling = true); + + ~ConnectionMultiApproxBatchShm(); + + int do_connect(); + + double start_time; // Time when this connection began operations. + ConnectionStats stats; + options_t options; + + bool is_ready() { return read_state == IDLE; } + void set_priority(int pri); + + void start_loading(); + void reset(); + bool check_exit_condition(double now = 0.0); + + void read_callback1(); + void read_callback2(); + + int eof; + uint32_t get_cid(); + //void set_queue(ConcurrentQueue *a_trace_queue); + int add_to_wb_keys(string wb_key); + int add_to_copy_keys(string key); + int add_to_touch_keys(string key); + void del_wb_keys(string wb_key); + void del_copy_keys(string key); + void del_touch_keys(string key); + void set_g_wbkeys(unordered_map> *a_wb_keys); + void set_queue(queue *a_trace_queue); + void set_lock(pthread_mutex_t* a_lock); + size_t handle_response_batch(unsigned char *rbuf_pos, resp_t *resp, + size_t read_bytes, size_t consumed_bytes, + int level, int extra); + void drive_write_machine_shm(double now = 0.0); + bipbuf_t* bipbuf_in[3]; + bipbuf_t* bipbuf_out[3]; + pthread_mutex_t* lock_in[3]; + pthread_mutex_t* lock_out[3]; + + int *bipbuf_out_bytes[3]; + int *bipbuf_in_bytes[3]; + pthread_cond_t* cond_in_not_empty[3]; + pthread_cond_t* cond_in_not_full[3]; + pthread_cond_t* cond_out_not_empty[3]; + pthread_cond_t* cond_out_not_full[3]; + +private: + string hostname1; + string hostname2; + string port; + + double o_percent; + int trace_queue_n; + + struct event *timer; // Used to control inter-transmission time. + double next_time; // Inter-transmission time parameters. 
+ double last_rx; // Used to moderate transmission rate. + double last_tx; + + enum read_state_enum { + INIT_READ, + CONN_SETUP, + LOADING, + IDLE, + WAITING_FOR_GET, + WAITING_FOR_SET, + WAITING_FOR_DELETE, + MAX_READ_STATE, + }; + + enum write_state_enum { + INIT_WRITE, + ISSUING, + WAITING_FOR_TIME, + WAITING_FOR_OPQ, + MAX_WRITE_STATE, + }; + + read_state_enum read_state; + write_state_enum write_state; + + // Parameters to track progress of the data loader. + int loader_issued, loader_completed; + + uint32_t *opaque; + int *issue_buf_size; + int *issue_buf_n; + unsigned char **issue_buf_pos; + unsigned char **issue_buf; + bool last_quiet1; + bool last_quiet2; + uint32_t total; + uint32_t cid; + uint32_t gets; + uint32_t gloc; + uint32_t ghits; + uint32_t sloc; + uint32_t esets; + uint32_t isets; + uint32_t iloc; + + uint32_t buffer_size_; + unsigned char* buffer_write[MAX_LEVELS]; + unsigned char* buffer_read[MAX_LEVELS]; + unsigned char* buffer_write_pos[MAX_LEVELS]; + unsigned char* buffer_read_pos[MAX_LEVELS]; + unsigned char* buffer_lasthdr[MAX_LEVELS]; + unsigned char* buffer_leftover[MAX_LEVELS]; + uint32_t buffer_read_n[MAX_LEVELS]; + uint32_t buffer_write_n[MAX_LEVELS]; + uint32_t buffer_read_nbytes[MAX_LEVELS]; + uint32_t buffer_write_nbytes[MAX_LEVELS]; + + + //std::vector> op_queue; + Operation ***op_queue; + uint32_t *op_queue_size; + uint32_t *issued_queue; + + + Generator *valuesize; + Generator *keysize; + KeyGenerator *keygen; + Generator *iagen; + pthread_mutex_t* lock; + unordered_map> *g_wb_keys; + queue *trace_queue; + queue extra_queue; + + // state machine functions / event processing + void pop_op(Operation *op); + void output_op(Operation *op, int type, bool was_found); + //void finish_op(Operation *op); + void finish_op(Operation *op,int was_hit); + int issue_getsetorset(double now = 0.0); + + // request functions + void issue_sasl(); + int issue_op(Operation* op); + int issue_noop(int level = 1); + int issue_touch(const char* key, int 
valuelen, double now, int level); + int issue_delete(const char* key, double now, uint32_t flags); + int issue_get_with_len(const char* key, int valuelen, double now, bool quiet, uint32_t flags, Operation *l1 = NULL); + int issue_get_with_len(Operation *pop, double now, bool quiet, uint32_t flags, Operation *l1 = NULL); + int issue_set(const char* key, const char* value, int length, double now, uint32_t flags); + int issue_set(Operation *pop, const char* value, double now, uint32_t flags); + int offer_set(Operation *pop, int extra = 0); + int offer_get(Operation *pop, int extra = 0); + int send_write_buffer(int level); + size_t fill_read_buffer(int level, int *extra); + int add_get_op_to_queue(Operation *pop, int level, int cb = 0); + int add_set_to_queue(Operation *pop, int level, const char *value, int cb = 0); + int read_response_l1(); + void read_response_l2(); // protocol fucntions int set_request_ascii(const char* key, const char* value, int length); int set_request_binary(const char* key, const char* value, int length); + int set_request_resp(const char* key, const char* value, int length); + int get_request_ascii(const char* key); int get_request_binary(const char* key); + int get_request_resp(const char* key); bool consume_binary_response(evbuffer *input); bool consume_ascii_line(evbuffer *input, bool &done); + bool consume_resp_line(evbuffer *input, bool &done); }; #endif diff --git a/ConnectionMulti.backup b/ConnectionMulti.backup new file mode 100644 index 0000000..688ad3c --- /dev/null +++ b/ConnectionMulti.backup @@ -0,0 +1,1723 @@ +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "config.h" + +#include "Connection.h" +#include "distributions.h" +#include "Generator.h" +#include "mutilate.h" +#include "binary_protocol.h" +#include "util.h" +#include +#include +#include +#include +#include +#include "blockingconcurrentqueue.h" + +#define ITEM_L1 1 +#define ITEM_L2 2 
+#define LOG_OP 4 +#define SRC_L1_M 8 +#define SRC_L1_H 16 +#define SRC_L2_M 32 +#define SRC_L2_H 64 +#define SRC_DIRECT_SET 128 +#define SRC_L1_COPY 256 +#define SRC_WB 512 + +#define ITEM_INCL 4096 +#define ITEM_EXCL 8192 +#define ITEM_DIRTY 16384 +#define ITEM_SIZE_CHANGE 131072 +#define ITEM_WAS_HIT 262144 + +#define LEVELS 2 +#define SET_INCL(incl,flags) \ + switch (incl) { \ + case 1: \ + flags |= ITEM_INCL; \ + break; \ + case 2: \ + flags |= ITEM_EXCL; \ + break; \ + \ + } \ + +#define GET_INCL(incl,flags) \ + if (flags & ITEM_INCL) incl = 1; \ + else if (flags & ITEM_EXCL) incl = 2; \ + +//#define OP_level(op) ( ((op)->flags & ITEM_L1) ? ITEM_L1 : ITEM_L2 ) +#define OP_level(op) ( (op)->flags & ~(LOG_OP | \ + ITEM_INCL | ITEM_EXCL | ITEM_DIRTY | \ + SRC_L1_M | SRC_L1_H | SRC_L2_M | SRC_L2_H | \ + SRC_DIRECT_SET | SRC_L1_COPY | SRC_WB ) ) + +#define FLAGS_level(flags) ( flags & ~(LOG_OP | \ + ITEM_INCL | ITEM_EXCL | ITEM_DIRTY | \ + SRC_L1_M | SRC_L1_H | SRC_L2_M | SRC_L2_H | \ + SRC_DIRECT_SET | SRC_L1_COPY | SRC_WB ) ) + +#define OP_clu(op) ( (op)->flags & ~(LOG_OP | \ + ITEM_L1 | ITEM_L2 | ITEM_DIRTY | \ + SRC_L1_M | SRC_L1_H | SRC_L2_M | SRC_L2_H | \ + SRC_DIRECT_SET | SRC_L1_COPY | SRC_WB ) ) + +#define OP_src(op) ( (op)->flags & ~(ITEM_L1 | ITEM_L2 | LOG_OP | \ + ITEM_INCL | ITEM_EXCL | ITEM_DIRTY ) ) + +#define OP_log(op) ((op)->flags & LOG_OP) +#define OP_incl(op) ((op)->flags & ITEM_INCL) +#define OP_excl(op) ((op)->flags & ITEM_EXCL) +#define OP_set_flag(op,flag) ((op))->flags |= flag; + +//#define DEBUGMC +//#define DEBUGS + +using namespace moodycamel; + +pthread_mutex_t cid_lock_m = PTHREAD_MUTEX_INITIALIZER; +static uint32_t connids_m = 1; + +#define NCLASSES 40 +#define CHUNK_ALIGN_BYTES 8 +static int classes = 0; +static int sizes[NCLASSES+1]; +static int inclusives[NCLASSES+1]; + +typedef struct _evicted_type { + bool evicted; + uint32_t evictedFlags; + uint32_t serverFlags; + uint32_t clsid; + uint32_t evictedKeyLen; + uint32_t evictedLen; 
+ char *evictedKey; + char *evictedData; +} evicted_t; + +static vector cid_rate; + +extern int max_n[3]; + +static void init_inclusives(char *inclusive_str) { + int j = 1; + for (int i = 0; i < (int)strlen(inclusive_str); i++) { + if (inclusive_str[i] == '-') { + continue; + } else { + inclusives[j] = inclusive_str[i] - '0'; + j++; + } + } +} + +static void init_classes() { + + double factor = 1.25; + unsigned int chunk_size = 48; + unsigned int item_size = 24; + unsigned int size = 96; //warning if you change this you die + unsigned int i = 0; + unsigned int chunk_size_max = 1048576/2; + while (++i < NCLASSES-1) { + if (size >= chunk_size_max / factor) { + break; + } + if (size % CHUNK_ALIGN_BYTES) + size += CHUNK_ALIGN_BYTES - (size % CHUNK_ALIGN_BYTES); + sizes[i] = size; + size *= factor; + } + sizes[i] = chunk_size_max; + classes = i; + +} + +static int get_class(int vl, uint32_t kl) { + //warning if you change this you die + int vsize = vl+kl+48+1+2; + int res = 1; + while (vsize > sizes[res]) + if (res++ == classes) { + //fprintf(stderr,"item larger than max class size. 
vsize: %d, class size: %d\n",vsize,sizes[res]);
+            return -1;
+        }
+    return res;
+}
+
+static int get_incl(int vl, int kl) {
+    int clsid = get_class(vl,kl);
+    if (clsid != -1) { // BUGFIX: get_class() returns -1 for oversize items; -1 is truthy, so 'if (clsid)' indexed inclusives[-1] (UB)
+        return inclusives[clsid];
+    } else {
+        return -1;
+    }
+}
+
+void ConnectionMulti::output_op(Operation *op, int type, bool found) {
+    char output[1024];
+    char k[256];
+    char a[256];
+    char s[256];
+    memset(k,0,256);
+    memset(a,0,256);
+    memset(s,0,256);
+    strcpy(k,op->key.c_str());
+    switch (type) {
+        case 0: //get
+            sprintf(a,"issue_get");
+            break;
+        case 1: //set
+            sprintf(a,"issue_set");
+            break;
+        case 2: //resp
+            sprintf(a,"resp");
+            break;
+    }
+    switch(read_state) {
+        case INIT_READ:
+            sprintf(s,"init");
+            break;
+        case CONN_SETUP:
+            sprintf(s,"setup");
+            break;
+        case LOADING:
+            sprintf(s,"load");
+            break;
+        case IDLE:
+            sprintf(s,"idle");
+            break;
+        case WAITING_FOR_GET:
+            sprintf(s,"waiting for get");
+            break;
+        case WAITING_FOR_SET:
+            sprintf(s,"waiting for set");
+            break;
+        case WAITING_FOR_DELETE:
+            sprintf(s,"waiting for del");
+            break;
+        case MAX_READ_STATE:
+            sprintf(s,"max");
+            break;
+    }
+    if (type == 2) {
+        sprintf(output,"conn: %u, action: %s op: %s, opaque: %u, found: %d, type: %d\n",cid,a,k,op->opaque,found,op->type);
+    } else {
+        sprintf(output,"conn: %u, action: %s op: %s, opaque: %u, type: %d\n",cid,a,k,op->opaque,op->type);
+    }
+    write(2,output,strlen(output));
+}
+
+/**
+ * Create a new connection to a server endpoint.
+ */ +ConnectionMulti::ConnectionMulti(struct event_base* _base, struct evdns_base* _evdns, + string _hostname1, string _hostname2, string _port, options_t _options, + bool sampling, int fd1, int fd2 ) : + start_time(0), stats(sampling), options(_options), + hostname1(_hostname1), hostname2(_hostname2), port(_port), base(_base), evdns(_evdns) +{ + pthread_mutex_lock(&cid_lock_m); + cid = connids_m++; + if (cid == 1) { + cid_rate.push_back(100); + cid_rate.push_back(0); + init_classes(); + init_inclusives(options.inclusives); + } else { + cid_rate.push_back(0); + } + + pthread_mutex_unlock(&cid_lock_m); + + valuesize = createGenerator(options.valuesize); + keysize = createGenerator(options.keysize); + srand(time(NULL)); + keygen = new KeyGenerator(keysize, options.records); + + total = 0; + eof = 0; + o_percent = 0; + + if (options.lambda <= 0) { + iagen = createGenerator("0"); + } else { + D("iagen = createGenerator(%s)", options.ia); + iagen = createGenerator(options.ia); + iagen->set_lambda(options.lambda); + } + + read_state = IDLE; + write_state = INIT_WRITE; + last_quiet1 = false; + last_quiet2 = false; + + last_tx = last_rx = 0.0; + + + op_queue_size = (uint32_t*)malloc(sizeof(uint32_t)*(LEVELS+1)); + opaque = (uint32_t*)malloc(sizeof(uint32_t)*(LEVELS+1)); + op_queue = (Operation***)malloc(sizeof(Operation**)*(LEVELS+1)); + + for (int i = 0; i <= LEVELS; i++) { + op_queue_size[i] = 0; + opaque[i] = 1; + //op_queue[i] = (Operation*)malloc(sizeof(int)*OPAQUE_MAX); + op_queue[i] = (Operation**)malloc(sizeof(Operation*)*(OPAQUE_MAX*2)); + + } + + bev1 = bufferevent_socket_new(base, fd1, BEV_OPT_CLOSE_ON_FREE); + bufferevent_setcb(bev1, bev_read_cb1, bev_write_cb_m, bev_event_cb1, this); + bufferevent_enable(bev1, EV_READ | EV_WRITE); + + bev2 = bufferevent_socket_new(base, fd2, BEV_OPT_CLOSE_ON_FREE); + bufferevent_setcb(bev2, bev_read_cb2, bev_write_cb_m, bev_event_cb2, this); + bufferevent_enable(bev2, EV_READ | EV_WRITE); + + timer = evtimer_new(base, 
timer_cb_m, this); + + read_state = IDLE; +} + + +void ConnectionMulti::set_queue(queue* a_trace_queue) { + trace_queue = a_trace_queue; + trace_queue_n = a_trace_queue->size(); +} + +void ConnectionMulti::set_lock(pthread_mutex_t* a_lock) { + lock = a_lock; +} + +void ConnectionMulti::set_g_wbkeys(unordered_map *a_wb_keys) { + g_wb_keys = a_wb_keys; +} + +uint32_t ConnectionMulti::get_cid() { + return cid; +} + +int ConnectionMulti::add_to_wb_keys(string key) { + int ret = -1; + pthread_mutex_lock(lock); + auto pos = g_wb_keys->find(key); + if (pos == g_wb_keys->end()) { + g_wb_keys->insert( {key,cid }); + ret = 1; + //fprintf(stderr,"----set: %s----\n",Op.key.c_str()); + //for (auto iter = g_wb_keys->begin(); iter != g_wb_keys->end(); ++iter){ + // fprintf(stderr,"%s,%d\n",iter->first.c_str(),iter->second); + //} + //fprintf(stderr,"----%d----\n",cid); + } else { + ret = 2; + } + + pthread_mutex_unlock(lock); + return ret; +} + +void ConnectionMulti::del_wb_keys(string key) { + + pthread_mutex_lock(lock); + auto position = g_wb_keys->find(key); + if (position != g_wb_keys->end()) { + g_wb_keys->erase(position); + } else { + fprintf(stderr,"expected %s, got nuthin\n",key.c_str()); + } + pthread_mutex_unlock(lock); +} + + +int ConnectionMulti::do_connect() { + + int connected = 0; + if (options.unix_socket) { + + + struct sockaddr_un sin1; + memset(&sin1, 0, sizeof(sin1)); + sin1.sun_family = AF_LOCAL; + strcpy(sin1.sun_path, hostname1.c_str()); + + int addrlen; + addrlen = sizeof(sin1); + + int err = bufferevent_socket_connect(bev1, (struct sockaddr*)&sin1, addrlen); + if (err == 0) { + connected = 1; + } else { + connected = 0; + err = errno; + fprintf(stderr,"l1 error %s\n",strerror(err)); + } + + struct sockaddr_un sin2; + memset(&sin2, 0, sizeof(sin2)); + sin2.sun_family = AF_LOCAL; + strcpy(sin2.sun_path, hostname2.c_str()); + + addrlen = sizeof(sin2); + err = bufferevent_socket_connect(bev2, (struct sockaddr*)&sin2, addrlen); + if (err == 0) { + connected = 
1; + } else { + connected = 0; + err = errno; + fprintf(stderr,"l2 error %s\n",strerror(err)); + } + } + read_state = IDLE; + return connected; +} + +/** + * Destroy a connection, performing cleanup. + */ +ConnectionMulti::~ConnectionMulti() { + + + for (int i = 0; i <= LEVELS; i++) { + free(op_queue[i]); + + } + + free(op_queue_size); + free(opaque); + free(op_queue); + //event_free(timer); + //timer = NULL; + // FIXME: W("Drain op_q?"); + //bufferevent_free(bev1); + //bufferevent_free(bev2); + + delete iagen; + delete keygen; + delete keysize; + delete valuesize; +} + +/** + * Reset the connection back to an initial, fresh state. + */ +void ConnectionMulti::reset() { + // FIXME: Actually check the connection, drain all bufferevents, drain op_q. + //assert(op_queue.size() == 0); + //evtimer_del(timer); + read_state = IDLE; + write_state = INIT_WRITE; + stats = ConnectionStats(stats.sampling); +} + +/** + * Set our event processing priority. + */ +void ConnectionMulti::set_priority(int pri) { + if (bufferevent_priority_set(bev1, pri)) { + DIE("bufferevent_set_priority(bev, %d) failed", pri); + } +} + + + +/** + * Get/Set or Set Style + * If a GET command: Issue a get first, if not found then set + * If trace file (or prob. 
write) says to set, then set it + */ +int ConnectionMulti::issue_getsetorset(double now) { + + + + int ret = 0; + int nissued = 0; + //while (nissued < options.depth) { + + //pthread_mutex_lock(lock); + if (!trace_queue->empty()) { + Operation Op = trace_queue->front(); + if (Op.type == Operation::SASL) { + eof = 1; + cid_rate[cid] = 100; + fprintf(stderr,"cid %d done\n",cid); + string op_queue1; + string op_queue2; + for (int j = 0; j < 2; j++) { + for (int i = 0; i < OPAQUE_MAX; i++) { + if (op_queue[j+1][i] != NULL) { + if (j == 0) { + op_queue1 = op_queue1 + "," + op_queue[j+1][i]->key; + } else { + op_queue2 = op_queue2 + "," + op_queue[j+1][i]->key; + } + } + } + } + fprintf(stderr,"cid %d op_queue1: %s op_queue2: %s, op_queue_size1: %d, op_queue_size2: %d\n",cid,op_queue1.c_str(),op_queue2.c_str(),op_queue_size[1],op_queue_size[2]); + return 1; + } + + + /* check if in global wb queue */ + pthread_mutex_lock(lock); + double percent = (double)total/((double)trace_queue_n) * 100; + if (percent > o_percent+1) { + //update the percentage table and see if we should execute + std::vector::iterator mp = std::min_element(cid_rate.begin(), cid_rate.end()); + double min_percent = *mp; + + if (percent > min_percent+2) { + pthread_mutex_unlock(lock); + struct timeval tv; + tv.tv_sec = 1; + tv.tv_usec = 0; + int good = 0; + if (!event_pending(timer, EV_TIMEOUT, NULL)) { + good = evtimer_add(timer, &tv); + } + if (good != 0) { + fprintf(stderr,"eventimer is messed up!\n"); + return 2; + } + return 1; + } + cid_rate[cid] = percent; + fprintf(stderr,"%f,%d,%.4f\n",now,cid,percent); + o_percent = percent; + } + auto check = g_wb_keys->find(Op.key); + if (check != g_wb_keys->end()) { + pthread_mutex_unlock(lock); + struct timeval tv; + tv.tv_sec = 1; + tv.tv_usec = 0; + int good = 0; + if (!event_pending(timer, EV_TIMEOUT, NULL)) { + good = evtimer_add(timer, &tv); + } + if (good != 0) { + fprintf(stderr,"eventimer is messed up in checking for key: %s\n",Op.key.c_str()); + 
return 2;
+                }
+                return 1;
+            } else {
+                g_wb_keys->insert( {Op.key, cid} );
+                //g_wb_keys->insert( {Op.key+"l2", cid} );
+            }
+            pthread_mutex_unlock(lock);
+
+
+
+            char key[256];
+            memset(key,0,256);
+            strncpy(key, Op.key.c_str(),255);
+            int vl = Op.valuelen;
+
+            trace_queue->pop();
+
+            int issued = 0;
+            int incl = get_incl(vl,strlen(key));
+            int clsid = get_class(vl,strlen(key)); // renamed from 'cid': this is a size-class id and was shadowing the connection-id member
+            int flags = 0;
+            int touch = (rand() % 100);
+            int index = lrand48() % (1024 * 1024);
+            //int touch = 1;
+            SET_INCL(incl,flags);
+
+            switch(Op.type)
+            {
+            case Operation::GET:
+                //if (nissued < options.depth-1) {
+                //    issued = issue_get_with_len(key, vl, now, false, 1, flags, 0, 1);
+                //    last_quiet1 = false;
+                //} else {
+                //}
+                if (options.threshold > 0) {
+                    if (Op.future) {
+                        key_hist[key] = 1;
+                    }
+                }
+                issued = issue_get_with_len(key, vl, now, false, flags | LOG_OP | ITEM_L1);
+                if (touch == 1 && incl == 1) {
+                    issue_touch(key,vl,now, ITEM_L2 | SRC_L1_H);
+                }
+                last_quiet1 = false;
+                this->stats.gets++;
+                this->stats.gets_cid[clsid]++;
+
+                break;
+            case Operation::SET:
+                if (last_quiet1) {
+                    issue_noop(now,1);
+                }
+                if (incl == 1) {
+                    issue_touch(key,vl,now, ITEM_L2 | SRC_DIRECT_SET);
+                } else if (incl == 2) {
+                    issue_delete(key,now, ITEM_L2 | SRC_DIRECT_SET );
+                }
+                issued = issue_set(key, &random_char[index], vl, now, flags | LOG_OP | ITEM_L1 | SRC_DIRECT_SET);
+                last_quiet1 = false;
+                this->stats.sets++;
+                this->stats.sets_cid[clsid]++;
+                break;
+            case Operation::DELETE:
+            case Operation::TOUCH:
+            case Operation::NOOP:
+            case Operation::SASL:
+                fprintf(stderr,"invalid line: %s, vl: %d\n",key,vl);
+                break;
+
+            }
+            if (issued) {
+                nissued++;
+                total++;
+            } else {
+                fprintf(stderr,"failed to issue line: %s, vl: %d @T: XX \n",key,vl);
+            }
+        } else {
+            return 1;
+        }
+    //}
+    if (last_quiet1) {
+        issue_noop(now,1);
+        last_quiet1 = false;
+    }
+
+    return ret;
+
+}
+
+/**
+ * Issue a get request to the server.
+ */ +int ConnectionMulti::issue_get_with_len(const char* key, int valuelen, double now, bool quiet, uint32_t flags, Operation *l1) { + + struct evbuffer *output = NULL; + int level = 0; + switch (FLAGS_level(flags)) { + case 1: + level = 1; + output = bufferevent_get_output(bev1); + break; + case 2: + level = 2; + output = bufferevent_get_output(bev2); + break; + } + //Operation op; + Operation *pop = new Operation(); + +#if HAVE_CLOCK_GETTIME + pop->start_time = get_time_accurate(); +#else + if (now == 0.0) { +#if USE_CACHED_TIME + struct timeval now_tv; + event_base_gettimeofday_cached(base, &now_tv); + pop->start_time = tv_to_double(&now_tv); +#else + pop->start_time = get_time(); +#endif + } else { + pop->start_time = now; + } +#endif + + pop->key = string(key); + pop->valuelen = valuelen; + pop->type = Operation::GET; + pop->opaque = opaque[level]++; + pop->flags = flags; + pop->clsid = get_class(valuelen,strlen(key)); + if (l1 != NULL) { + pop->l1 = l1; + } + op_queue[level][pop->opaque] = pop; + //op_queue[level].push(op); + op_queue_size[level]++; + +#ifdef DEBUGS + fprintf(stderr,"cid: %d issing get: %s, size: %u, level %d, flags: %d, opaque: %d\n",cid,key,valuelen,level,flags,pop->opaque); +#endif + + if (opaque[level] > OPAQUE_MAX) { + opaque[level] = 1; + } + + //if (read_state == IDLE) read_state = WAITING_FOR_GET; + uint16_t keylen = strlen(key); + + // each line is 4-bytes + binary_header_t h = { 0x80, CMD_GET, htons(keylen), + 0x00, 0x00, htons(0), + htonl(keylen) }; + if (quiet) { + h.opcode = CMD_GETQ; + } + h.opaque = htonl(pop->opaque); + + evbuffer_add(output, &h, 24); + evbuffer_add(output, key, keylen); + + stats.tx_bytes += 24 + keylen; + return 1; +} + +/** + * Issue a get request to the server. 
+ */ +int ConnectionMulti::issue_touch(const char* key, int valuelen, double now, int flags) { + struct evbuffer *output = NULL; + int level = 0; + switch (FLAGS_level(flags)) { + case 1: + level = 1; + output = bufferevent_get_output(bev1); + break; + case 2: + level = 2; + output = bufferevent_get_output(bev2); + break; + } + Operation *pop = new Operation(); + +#if HAVE_CLOCK_GETTIME + pop->start_time = get_time_accurate(); +#else + if (now == 0.0) { +#if USE_CACHED_TIME + struct timeval now_tv; + event_base_gettimeofday_cached(base, &now_tv); + pop->start_time = tv_to_double(&now_tv); +#else + pop->start_time = get_time(); +#endif + } else { + pop->start_time = now; + } +#endif + + pop->key = string(key); + pop->valuelen = valuelen; + pop->type = Operation::TOUCH; + pop->opaque = opaque[level]++; + pop->flags = flags; + op_queue[level][pop->opaque] = pop; + //op_queue[level].push(op); + op_queue_size[level]++; + + if (opaque[level] > OPAQUE_MAX) { + opaque[level] = 1; + } + +#ifdef DEBUGS + fprintf(stderr,"issing touch: %s, size: %u, level %d, flags: %d, opaque: %d\n",key,valuelen,level,flags,pop->opaque); +#endif + //if (read_state == IDLE) read_state = WAITING_FOR_GET; + uint16_t keylen = strlen(key); + + // each line is 4-bytes + binary_header_t h = { 0x80, CMD_TOUCH, htons(keylen), + 0x04, 0x00, htons(0), + htonl(keylen + 4) }; + h.opaque = htonl(pop->opaque); + + uint32_t exp = 0; + if (flags & ITEM_DIRTY) { + exp = htonl(flags); + } + evbuffer_add(output, &h, 24); + evbuffer_add(output, &exp, 4); + evbuffer_add(output, key, keylen); + + + stats.tx_bytes += 24 + keylen; + + //stats.log_access(op); + return 1; +} + +/** + * Issue a delete request to the server. 
+ */ +int ConnectionMulti::issue_delete(const char* key, double now, uint32_t flags) { + struct evbuffer *output = NULL; + int level = 0; + switch (FLAGS_level(flags)) { + case 1: + level = 1; + output = bufferevent_get_output(bev1); + break; + case 2: + level = 2; + output = bufferevent_get_output(bev2); + break; + } + //Operation op; + Operation *pop = new Operation(); + +#if HAVE_CLOCK_GETTIME + pop->start_time = get_time_accurate(); +#else + if (now == 0.0) { +#if USE_CACHED_TIME + struct timeval now_tv; + event_base_gettimeofday_cached(base, &now_tv); + pop->start_time = tv_to_double(&now_tv); +#else + pop->start_time = get_time(); +#endif + } else { + pop->start_time = now; + } +#endif + + pop->key = string(key); + pop->type = Operation::DELETE; + pop->opaque = opaque[level]++; + pop->flags = flags; + op_queue[level][pop->opaque] = pop; + //op_queue[level].push(op); + op_queue_size[level]++; + + if (opaque[level] > OPAQUE_MAX) { + opaque[level] = 1; + } +#ifdef DEBUGS + fprintf(stderr,"cid: %d issing delete: %s, level %d, flags: %d, opaque: %d\n",cid,key,level,flags,pop->opaque); +#endif + + //if (read_state == IDLE) read_state = WAITING_FOR_GET; + uint16_t keylen = strlen(key); + + // each line is 4-bytes + binary_header_t h = { 0x80, CMD_DELETE, htons(keylen), + 0x00, 0x00, htons(0), + htonl(keylen) }; + h.opaque = htonl(pop->opaque); + + evbuffer_add(output, &h, 24); + evbuffer_add(output, key, keylen); + + stats.tx_bytes += 24 + keylen; + + //stats.log_access(op); + return 1; +} + +void ConnectionMulti::issue_noop(double now, int level) { + struct evbuffer *output = NULL; + switch (level) { + case 1: + output = bufferevent_get_output(bev1); + break; + case 2: + output = bufferevent_get_output(bev2); + break; + } + Operation op; + + if (now == 0.0) op.start_time = get_time(); + else op.start_time = now; + + binary_header_t h = { 0x80, CMD_NOOP, 0x0000, + 0x00, 0x00, htons(0), + 0x00 }; + + evbuffer_add(output, &h, 24); + +} + +/** + * Issue a set request 
to the server. + */ +int ConnectionMulti::issue_set(const char* key, const char* value, int length, double now, uint32_t flags) { + + struct evbuffer *output = NULL; + int level = 0; + switch (FLAGS_level(flags)) { + case 1: + level = 1; + output = bufferevent_get_output(bev1); + break; + case 2: + level = 2; + output = bufferevent_get_output(bev2); + break; + } + //Operation op; + Operation *pop = new Operation(); + +#if HAVE_CLOCK_GETTIME + pop->start_time = get_time_accurate(); +#else + if (now == 0.0) pop->start_time = get_time(); + else pop->start_time = now; +#endif + + + pop->key = string(key); + pop->valuelen = length; + pop->type = Operation::SET; + pop->opaque = opaque[level]++; + pop->flags = flags; + pop->clsid = get_class(length,strlen(key)); + op_queue[level][pop->opaque] = pop; + //op_queue[level].push(op); + op_queue_size[level]++; +#ifdef DEBUGS + fprintf(stderr,"cid: %d issing set: %s, size: %u, level %d, flags: %d, opaque: %d\n",cid,key,length,level,flags,pop->opaque); +#endif + + if (opaque[level] > OPAQUE_MAX) { + opaque[level] = 1; + } + + uint16_t keylen = strlen(key); + + // each line is 4-bytes + binary_header_t h = { 0x80, CMD_SET, htons(keylen), + 0x08, 0x00, htons(0), + htonl(keylen + 8 + length) }; + h.opaque = htonl(pop->opaque); + + uint32_t f = htonl(flags); + uint32_t exp = 0; + + evbuffer_add(output, &h, 24); + evbuffer_add(output, &f, 4); + evbuffer_add(output, &exp, 4); + evbuffer_add(output, key, keylen); + evbuffer_add(output, value, length); + + stats.tx_bytes += length + 32 + keylen; + return 1; +} + +/** + * Return the oldest live operation in progress. + */ +void ConnectionMulti::pop_op(Operation *op) { + + uint8_t level = OP_level(op); + //op_queue[level].erase(op); + op_queue_size[level]--; + + + if (read_state == LOADING) return; + read_state = IDLE; + + // Advance the read state machine. 
+ //if (op_queue.size() > 0) { + // Operation& op = op_queue.front(); + // switch (op.type) { + // case Operation::GET: read_state = WAITING_FOR_GET; break; + // case Operation::SET: read_state = WAITING_FOR_SET; break; + // case Operation::DELETE: read_state = WAITING_FOR_DELETE; break; + // default: DIE("Not implemented."); + // } + //} +} + +/** + * Finish up (record stats) an operation that just returned from the + * server. + */ +void ConnectionMulti::finish_op(Operation *op, int was_hit) { + double now; +#if USE_CACHED_TIME + struct timeval now_tv; + event_base_gettimeofday_cached(base, &now_tv); + now = tv_to_double(&now_tv); +#else + now = get_time(); +#endif +#if HAVE_CLOCK_GETTIME + op->end_time = get_time_accurate(); +#else + op->end_time = now; +#endif + + if (options.successful_queries && was_hit) { + switch (op->type) { + case Operation::GET: + switch (OP_level(op)) { + case 1: + stats.log_get_l1(*op); + break; + case 2: + stats.log_get_l2(*op); + break; + } + break; + case Operation::SET: + switch (OP_level(op)) { + case 1: + stats.log_set_l1(*op); + break; + case 2: + stats.log_set_l2(*op); + break; + } + break; + case Operation::DELETE: break; + case Operation::TOUCH: break; + default: DIE("Not implemented."); + } + } else { + switch (op->type) { + case Operation::GET: + if (OP_log(op)) { + switch (OP_level(op)) { + case 1: + stats.log_get_l1(*op); + break; + case 2: + stats.log_get_l2(*op); + if (op->l1 != NULL) { + op->l1->end_time = now; + stats.log_get(*(op->l1)); + } + break; + } + } + break; + case Operation::SET: + if (OP_log(op)) { + switch (OP_level(op)) { + case 1: + stats.log_set_l1(*op); + break; + case 2: + stats.log_set_l2(*op); + break; + } + } + break; + case Operation::DELETE: break; + case Operation::TOUCH: break; + default: DIE("Not implemented."); + } + } + + last_rx = now; + uint8_t level = OP_level(op); + if (op->l1 != NULL) { + delete op_queue[1][op->l1->opaque]; + op_queue[1][op->l1->opaque] = 0; + op_queue_size[1]--; + } + 
//op_queue[level].erase(op_queue[level].begin()+opopq);
+    if (op == op_queue[level][op->opaque] &&
+            op->opaque == op_queue[level][op->opaque]->opaque) {
+        delete op_queue[level][op->opaque];
+        op_queue[level][op->opaque] = 0;
+    } else {
+        fprintf(stderr,"op_queue out of sync! Expected %p, got %p, opa1: %d opaq2: %d\n",
+                op,op_queue[level][op->opaque],op->opaque,op_queue[level][op->opaque] ? op_queue[level][op->opaque]->opaque : 0); // BUGFIX: slot may be NULL on this path; the unconditional deref crashed while reporting
+    }
+    op_queue_size[level]--;
+    read_state = IDLE;
+
+
+}
+
+
+
+/**
+ * Check if our testing is done and we should exit.
+ */
+bool ConnectionMulti::check_exit_condition(double now) {
+    if (eof && op_queue_size[1] == 0 && op_queue_size[2] == 0) {
+        return true;
+    }
+    if (read_state == INIT_READ) return false;
+
+    return false;
+}
+
+/**
+ * Handle new connection and error events.
+ */
+void ConnectionMulti::event_callback1(short events) {
+    if (events & BEV_EVENT_CONNECTED) {
+        D("Connected to %s:%s.", hostname1.c_str(), port.c_str());
+        int fd = bufferevent_getfd(bev1);
+        if (fd < 0) DIE("bufferevent_getfd");
+
+        if (!options.no_nodelay && !options.unix_socket) {
+            int one = 1;
+            if (setsockopt(fd, IPPROTO_TCP, TCP_NODELAY,
+                           (void *) &one, sizeof(one)) < 0)
+                DIE("setsockopt()");
+        }
+#ifdef DEBUGMC
+        fprintf(stderr,"libevent connected %s, fd: %u\n",hostname1.c_str(),bufferevent_getfd(bev1));
+#endif
+
+
+    } else if (events & BEV_EVENT_ERROR) {
+        int err = bufferevent_socket_get_dns_error(bev1);
+        //if (err) DIE("DNS error: %s", evutil_gai_strerror(err));
+        if (err) fprintf(stderr,"DNS error: %s", evutil_gai_strerror(err));
+        fprintf(stderr,"CID: %d - Got an error: %s\n",this->cid,
+                evutil_socket_error_to_string(EVUTIL_SOCKET_ERROR()));
+
+        //DIE("BEV_EVENT_ERROR: %s", strerror(errno));
+
+    } else if (events & BEV_EVENT_EOF) {
+        fprintf(stderr,"Unexpected EOF from server.");
+        return;
+    }
+}
+
+/**
+ * Handle new connection and error events.
+ */ +void ConnectionMulti::event_callback2(short events) { + if (events & BEV_EVENT_CONNECTED) { + D("Connected to %s:%s.", hostname2.c_str(), port.c_str()); + int fd = bufferevent_getfd(bev2); + if (fd < 0) DIE("bufferevent_getfd"); + + if (!options.no_nodelay && !options.unix_socket) { + int one = 1; + if (setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, + (void *) &one, sizeof(one)) < 0) + DIE("setsockopt()"); + } +#ifdef DEBUGMC + fprintf(stderr,"libevent connected %s, fd: %u\n",hostname2.c_str(),bufferevent_getfd(bev2)); +#endif + + + } else if (events & BEV_EVENT_ERROR) { + int err = bufferevent_socket_get_dns_error(bev2); + //if (err) DIE("DNS error: %s", evutil_gai_strerror(err)); + if (err) fprintf(stderr,"DNS error: %s", evutil_gai_strerror(err)); + fprintf(stderr,"CID: %d - Got an error: %s\n",this->cid, + evutil_socket_error_to_string(EVUTIL_SOCKET_ERROR())); + + //DIE("BEV_EVENT_ERROR: %s", strerror(errno)); + + + } else if (events & BEV_EVENT_EOF) { + fprintf(stderr,"Unexpected EOF from server."); + return; + } +} + +/** + * Request generation loop. Determines whether or not to issue a new command, + * based on timer events. + * + * Note that this function loops. Be wary of break vs. return. 
+ */ +void ConnectionMulti::drive_write_machine(double now) { + if (now == 0.0) now = get_time(); + + double delay; + struct timeval tv; + + if (check_exit_condition(now)) { + return; + } + + while (1) { + switch (write_state) { + case INIT_WRITE: + delay = iagen->generate(); + next_time = now + delay; + double_to_tv(delay, &tv); + evtimer_add(timer, &tv); + write_state = ISSUING; + break; + + case ISSUING: + if ( (op_queue_size[1] >= (size_t) options.depth) || + (op_queue_size[2] >= (size_t) options.depth) ) { + write_state = WAITING_FOR_OPQ; + break; + } + + if (options.getsetorset) { + int ret = issue_getsetorset(now); + if (ret == 1) return; //if at EOF + } + + last_tx = now; + for (int i = 1; i <= 2; i++) { + stats.log_op(op_queue_size[i]); + } + break; + + case WAITING_FOR_TIME: + write_state = ISSUING; + break; + + case WAITING_FOR_OPQ: + if ( (op_queue_size[1] >= (size_t) options.depth) || + (op_queue_size[2] >= (size_t) options.depth) ) { + //double delay = 0.01; + //struct timeval tv; + //double_to_tv(delay, &tv); + //evtimer_add(timer, &tv); + return; + } else { + write_state = ISSUING; + break; + } + + default: DIE("Not implemented"); + } + } +} + + + +/** + * Tries to consume a binary response (in its entirety) from an evbuffer. + * + * @param input evBuffer to read response from + * @return true if consumed, false if not enough data in buffer. 
+ */ +static bool handle_response(ConnectionMulti *conn, evbuffer *input, bool &done, bool &found, int &opcode, uint32_t &opaque, evicted_t *evict, int level) { + // Read the first 24 bytes as a header + int length = evbuffer_get_length(input); + if (length < 24) return false; + binary_header_t* h = + reinterpret_cast(evbuffer_pullup(input, 24)); + //assert(h); + + uint32_t bl = ntohl(h->body_len); + uint16_t kl = ntohs(h->key_len); + uint8_t el = h->extra_len; + // Not whole response + int targetLen = 24 + bl; + if (length < targetLen) { + return false; + } + + opcode = h->opcode; + opaque = ntohl(h->opaque); + uint16_t status = ntohs(h->status); +#ifdef DEBUGMC + fprintf(stderr,"cid: %d handle resp from l%d - opcode: %u opaque: %u keylen: %u extralen: %u datalen: %u status: %u\n",conn->get_cid(),level, + h->opcode,ntohl(h->opaque),ntohs(h->key_len),h->extra_len, + ntohl(h->body_len),ntohs(h->status)); +#endif + + + // If something other than success, count it as a miss + if (opcode == CMD_GET && status == RESP_NOT_FOUND) { + switch(level) { + case 1: + conn->stats.get_misses_l1++; + break; + case 2: + conn->stats.get_misses_l2++; + conn->stats.get_misses++; + conn->stats.window_get_misses++; + break; + + } + found = false; + evbuffer_drain(input, targetLen); + + } else if (opcode == CMD_SET && kl > 0) { + //first data is extras: clsid, flags, eflags + if (evict) { + evbuffer_drain(input,24); + unsigned char *buf = evbuffer_pullup(input,bl); + + + evict->clsid = *((uint32_t*)buf); + evict->clsid = ntohl(evict->clsid); + buf += 4; + + evict->serverFlags = *((uint32_t*)buf); + evict->serverFlags = ntohl(evict->serverFlags); + buf += 4; + + evict->evictedFlags = *((uint32_t*)buf); + evict->evictedFlags = ntohl(evict->evictedFlags); + buf += 4; + + + evict->evictedKeyLen = kl; + evict->evictedKey = (char*)malloc(kl+1); + memset(evict->evictedKey,0,kl+1); + memcpy(evict->evictedKey,buf,kl); + buf += kl; + + + evict->evictedLen = bl - kl - el; + evict->evictedData = 
(char*)malloc(evict->evictedLen); + memcpy(evict->evictedData,buf,evict->evictedLen); + evict->evicted = true; + //fprintf(stderr,"class: %u, serverFlags: %u, evictedFlags: %u\n",evict->clsid,evict->serverFlags,evict->evictedFlags); + evbuffer_drain(input,bl); + } else { + evbuffer_drain(input, targetLen); + } + } else if (opcode == CMD_TOUCH && status == RESP_NOT_FOUND) { + found = false; + evbuffer_drain(input, targetLen); + } else if (opcode == CMD_DELETE && status == RESP_NOT_FOUND) { + found = false; + evbuffer_drain(input, targetLen); + } else { + evbuffer_drain(input, targetLen); + } + + conn->stats.rx_bytes += targetLen; + done = true; + return true; +} + +/** + * Handle incoming data (responses). + */ +void ConnectionMulti::read_callback1() { + struct evbuffer *input = bufferevent_get_input(bev1); + + Operation *op = NULL; + bool done, found; + + //initially assume found (for sets that may come through here) + //is this correct? do we want to assume true in case that + //GET was found, but wrong value size (i.e. 
update value) + found = true; + + //if (op_queue.size() == 0) V("Spurious read callback."); + bool full_read = true; + while (full_read) { + + + int opcode; + uint32_t opaque; + evicted_t *evict = (evicted_t*)malloc(sizeof(evicted_t)); + memset(evict,0,sizeof(evicted_t)); + + full_read = handle_response(this,input, done, found, opcode, opaque, evict,1); + if (full_read) { + if (opcode == CMD_NOOP) { +#ifdef DEBUGMC + char out[128]; + sprintf(out,"conn l1: %u, reading noop\n",cid); + write(2,out,strlen(out)); +#endif + if (evict->evictedKey) free(evict->evictedKey); + if (evict->evictedData) free(evict->evictedData); + free(evict); + continue; + } + op = op_queue[1][opaque]; +#ifdef DEBUGMC + char out[128]; + sprintf(out,"conn l1: %u, reading opaque: %u\n",cid,opaque); + write(2,out,strlen(out)); + output_op(op,2,found); +#endif + if (op->key.length() < 1) { +#ifdef DEBUGMC + char out2[128]; + sprintf(out2,"conn l1: %u, bad op: %s\n",cid,op->key.c_str()); + write(2,out2,strlen(out2)); +#endif + if (evict->evictedKey) free(evict->evictedKey); + if (evict->evictedData) free(evict->evictedData); + free(evict); + continue; + } + } else { + if (evict) { + if (evict->evictedKey) free(evict->evictedKey); + if (evict->evictedData) free(evict->evictedData); + free(evict); + } + break; + } + + + double now = get_time(); + int wb = 0; + if (options.rand_admit) { + wb = (rand() % options.rand_admit); + } + switch (op->type) { + case Operation::GET: + if (done) { + if ( !found && (options.getset || options.getsetorset) ) { + /* issue a get a l2 */ + char key[256]; + memset(key,0,256); + strncpy(key, op->key.c_str(),255); + int vl = op->valuelen; + int flags = OP_clu(op); + issue_get_with_len(key,vl,now,false, flags | SRC_L1_M | ITEM_L2 | LOG_OP, op); + op->end_time = now; + this->stats.log_get_l1(*op); + //finish_op(op,0); + + } else { + del_wb_keys(op->key); + finish_op(op,found); + } + } else { + char out[128]; + sprintf(out,"conn l1: %u, not done reading, should do 
something",cid); + write(2,out,strlen(out)); + } + break; + case Operation::SET: + //if (OP_src(op) == SRC_L1_COPY || + // OP_src(op) == SRC_DIRECT_SET || + // OP_src(op) == SRC_L2_M ) { + //} + if (evict->evicted) { + string wb_key(evict->evictedKey); + if ((evict->evictedFlags & ITEM_INCL) && (evict->evictedFlags & ITEM_DIRTY)) { + //wb_keys.push_back(wb_key); + int ret = add_to_wb_keys(wb_key); + if (ret == 1) { + issue_set(evict->evictedKey, evict->evictedData, evict->evictedLen, now, ITEM_L2 | ITEM_INCL | LOG_OP | SRC_WB); + } + //fprintf(stderr,"incl writeback %s\n",evict->evictedKey); + this->stats.incl_wbs++; + } else if (evict->evictedFlags & ITEM_EXCL) { + //fprintf(stderr,"excl writeback %s\n",evict->evictedKey); + //strncpy(wb_key,evict->evictedKey,255); + if ( (options.rand_admit && wb == 0) || + (options.threshold && (key_hist[wb_key] == 1)) || + (options.wb_all) ) { + int ret = add_to_wb_keys(wb_key); + if (ret == 1) { + issue_set(evict->evictedKey, evict->evictedData, evict->evictedLen, now, ITEM_L2 | ITEM_EXCL | LOG_OP | SRC_WB); + } + this->stats.excl_wbs++; + } + } + /* + if (evict->serverFlags & ITEM_SIZE_CHANGE && OP_src(op) == SRC_DIRECT_SET) { + char key[256]; + memset(key,0,256); + strncpy(key, op->key.c_str(),255); + if (evict->serverFlags & ITEM_INCL) { + int index = lrand48() % (1024 * 1024); + int valuelen = op->valuelen; + //the item's size was changed, issue a SET to L2 as a new command + issue_set(key, &random_char[index], valuelen, now, ITEM_L2 | ITEM_INCL | LOG_OP | SRC_L2_M); + } + } + */ + if (OP_src(op) == SRC_DIRECT_SET) { + if ( (evict->serverFlags & ITEM_SIZE_CHANGE) || ((evict->serverFlags & ITEM_WAS_HIT) == 0)) { + this->stats.set_misses_l1++; + } else if (OP_excl(op) && evict->serverFlags & ITEM_WAS_HIT) { + this->stats.set_excl_hits_l1++; + } else if (OP_incl(op) && evict->serverFlags & ITEM_WAS_HIT) { + this->stats.set_incl_hits_l1++; + } + } + } + del_wb_keys(op->key); + finish_op(op,1); + break; + case Operation::TOUCH: 
+ finish_op(op,1); + break; + default: + fprintf(stderr,"op: %p, key: %s opaque: %u\n",(void*)op,op->key.c_str(),op->opaque); + DIE("not implemented"); + } + + if (evict) { + if (evict->evictedKey) free(evict->evictedKey); + if (evict->evictedData) free(evict->evictedData); + free(evict); + } + + } + + + double now = get_time(); + if (check_exit_condition(now)) { + return; + } + + last_tx = now; + stats.log_op(op_queue_size[1]); + stats.log_op(op_queue_size[2]); + //for (int i = 1; i <= 2; i++) { + // fprintf(stderr,"max issue buf n[%d]: %u\n",i,max_n[i]); + //} + drive_write_machine(); + + // update events + //if (bev != NULL) { + // // no pending response (nothing to read) and output buffer empty (nothing to write) + // if ((op_queue.size() == 0) && (evbuffer_get_length(bufferevent_get_output(bev)) == 0)) { + // bufferevent_disable(bev, EV_WRITE|EV_READ); + // } + //} +} + +/** + * Handle incoming data (responses). + */ +void ConnectionMulti::read_callback2() { + struct evbuffer *input = bufferevent_get_input(bev2); + + Operation *op = NULL; + bool done, found; + + //initially assume found (for sets that may come through here) + //is this correct? do we want to assume true in case that + //GET was found, but wrong value size (i.e. 
update value) + found = true; + + + //if (op_queue.size() == 0) V("Spurious read callback."); + bool full_read = true; + while (full_read) { + + + int opcode; + uint32_t opaque; + full_read = handle_response(this,input, done, found, opcode, opaque, NULL,2); + if (full_read) { + if (opcode == CMD_NOOP) { +#ifdef DEBUGMC + char out[128]; + sprintf(out,"conn l2: %u, reading noop\n",cid); + write(2,out,strlen(out)); +#endif + continue; + } + op = op_queue[2][opaque]; +#ifdef DEBUGMC + char out[128]; + sprintf(out,"conn l2: %u, reading opaque: %u\n",cid,opaque); + write(2,out,strlen(out)); + output_op(op,2,found); +#endif + if (op->key.length() < 1) { +#ifdef DEBUGMC + char out2[128]; + sprintf(out2,"conn l2: %u, bad op: %s\n",cid,op->key.c_str()); + write(2,out2,strlen(out2)); +#endif + continue; + } + } else { + break; + } + + + double now = get_time(); + switch (op->type) { + case Operation::GET: + if (done) { + if ( !found && (options.getset || options.getsetorset) ) {// && + //(options.twitter_trace != 1)) { + char key[256]; + memset(key,0,256); + strncpy(key, op->key.c_str(),255); + int valuelen = op->valuelen; + int index = lrand48() % (1024 * 1024); + int flags = OP_clu(op) | SRC_L2_M | LOG_OP; + issue_set(key, &random_char[index], valuelen, now, flags | ITEM_L1); + //wb_keys.push_back(op->key); + last_quiet1 = false; + if (OP_incl(op)) { + //wb_keys.push_back(op->key); + issue_set(key, &random_char[index], valuelen, now, flags | ITEM_L2); + last_quiet2 = false; + } + //pthread_mutex_lock(lock); + //fprintf(stderr,"----miss: %s----\n",key); + //for (auto iter = g_wb_keys->begin(); iter != g_wb_keys->end(); ++iter){ + // fprintf(stderr,"%s,%d\n",iter->first.c_str(),iter->second); + //} + //fprintf(stderr,"----%d----\n",cid); + //pthread_mutex_unlock(lock); + finish_op(op,0); // sets read_state = IDLE + + } else { + if (found) { + char key[256]; + memset(key,0,256); + strncpy(key, op->key.c_str(),255); + int valuelen = op->valuelen; + int index = lrand48() % (1024 
* 1024); + int flags = OP_clu(op) | ITEM_L1 | SRC_L1_COPY; + //found in l2, set in l1 + //wb_keys.push_back(op->key); + issue_set(key, &random_char[index],valuelen, now, flags); + this->stats.copies_to_l1++; + //if (OP_excl(op)) { + // issue_delete(key,now, ITEM_L2 | SRC_L1_COPY ); + //} + finish_op(op,1); + + } else { + finish_op(op,0); + } + } + } else { + char out[128]; + sprintf(out,"conn l2: %u, not done reading, should do something",cid); + write(2,out,strlen(out)); + } + break; + case Operation::SET: + if (OP_src(op) == SRC_WB) { + del_wb_keys(op->key); + } + finish_op(op,1); + break; + case Operation::TOUCH: + if (OP_src(op) == SRC_DIRECT_SET) { + char key[256]; + memset(key,0,256); + strncpy(key, op->key.c_str(),255); + int valuelen = op->valuelen; + if (!found) { + int index = lrand48() % (1024 * 1024); + //int ret = add_to_wb_keys(op->key+"l2"); + //if (ret == 1) { + issue_set(key, &random_char[index],valuelen,now, ITEM_INCL | ITEM_L2 | LOG_OP | SRC_L2_M); + //} + this->stats.set_misses_l2++; + } else { + issue_touch(key,valuelen,now, ITEM_L1 | SRC_L2_H | ITEM_DIRTY); + } + } + //if (!found) { + // //int incl = op->incl; + // //int flags = 0; + // //SET_INCL(incl,flags); + // //// not found in l2, set in l2 + // char key[256]; + // memset(key,0,256); + // strncpy(key, op->key.c_str(),255); + // int valuelen = op->valuelen; + // int index = lrand48() % (1024 * 1024); + // if (OP_src(op) == SRC_DIRECT_SET) { + // issue_set(key, &random_char[index],valuelen,now, ITEM_INCL | ITEM_L2 | LOG_OP); + // this->stats.set_misses_l2++; + // } + // //if (OP_src(op) == SRC_L1_H) { + // // fprintf(stderr,"expected op in l2: %s\n",key); + // //} + // finish_op(op,0); + //} else { + // finish_op(op,1); + //} + finish_op(op,0); + break; + case Operation::DELETE: + //check to see if it was a hit + //fprintf(stderr," del %s -- %d from %d\n",op->key.c_str(),found,OP_src(op)); + if (OP_src(op) == SRC_DIRECT_SET) { + if (found) { + this->stats.delete_hits_l2++; + } else { + 
this->stats.delete_misses_l2++; + } + } + finish_op(op,1); + break; + default: + fprintf(stderr,"op: %p, key: %s opaque: %u\n",(void*)op,op->key.c_str(),op->opaque); + DIE("not implemented"); + } + + } + + double now = get_time(); + if (check_exit_condition(now)) { + return; + } + + last_tx = now; + stats.log_op(op_queue_size[2]); + stats.log_op(op_queue_size[1]); + drive_write_machine(); + + // update events + //if (bev != NULL) { + // // no pending response (nothing to read) and output buffer empty (nothing to write) + // if ((op_queue.size() == 0) && (evbuffer_get_length(bufferevent_get_output(bev)) == 0)) { + // bufferevent_disable(bev, EV_WRITE|EV_READ); + // } + //} +} + +/** + * Callback called when write requests finish. + */ +void ConnectionMulti::write_callback() { + + //fprintf(stderr,"loaded evbuffer with ops: %u\n",op_queue.size()); +} + +/** + * Callback for timer timeouts. + */ +void ConnectionMulti::timer_callback() { + //fprintf(stderr,"timer up: %d\n",cid); + drive_write_machine(); +} + + +/* The follow are C trampolines for libevent callbacks. */ +void bev_event_cb1(struct bufferevent *bev, short events, void *ptr) { + + ConnectionMulti* conn = (ConnectionMulti*) ptr; + conn->event_callback1(events); +} + +/* The follow are C trampolines for libevent callbacks. 
*/ +void bev_event_cb2(struct bufferevent *bev, short events, void *ptr) { + + ConnectionMulti* conn = (ConnectionMulti*) ptr; + conn->event_callback2(events); +} + +void bev_read_cb1(struct bufferevent *bev, void *ptr) { + ConnectionMulti* conn = (ConnectionMulti*) ptr; + conn->read_callback1(); +} + + +void bev_read_cb2(struct bufferevent *bev, void *ptr) { + ConnectionMulti* conn = (ConnectionMulti*) ptr; + conn->read_callback2(); +} + +void bev_write_cb_m(struct bufferevent *bev, void *ptr) { +} + +void timer_cb_m(evutil_socket_t fd, short what, void *ptr) { + ConnectionMulti* conn = (ConnectionMulti*) ptr; + conn->timer_callback(); +} + diff --git a/ConnectionMulti.cc b/ConnectionMulti.cc new file mode 100644 index 0000000..81a6cda --- /dev/null +++ b/ConnectionMulti.cc @@ -0,0 +1,1713 @@ +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "config.h" + +#include "Connection.h" +#include "distributions.h" +#include "Generator.h" +#include "mutilate.h" +#include "binary_protocol.h" +#include "util.h" +#include +#include +#include +#include +#include +#include "blockingconcurrentqueue.h" + +//#include + +#define ITEM_L1 1 +#define ITEM_L2 2 +#define LOG_OP 4 +#define SRC_L1_M 8 +#define SRC_L1_H 16 +#define SRC_L2_M 32 +#define SRC_L2_H 64 +#define SRC_DIRECT_SET 128 +#define SRC_L1_COPY 256 +#define SRC_WB 512 + +#define ITEM_INCL 4096 +#define ITEM_EXCL 8192 +#define ITEM_DIRTY 16384 +#define ITEM_SIZE_CHANGE 131072 +#define ITEM_WAS_HIT 262144 + +#define LEVELS 2 +#define SET_INCL(incl,flags) \ + switch (incl) { \ + case 1: \ + flags |= ITEM_INCL; \ + break; \ + case 2: \ + flags |= ITEM_EXCL; \ + break; \ + \ + } \ + +#define GET_INCL(incl,flags) \ + if (flags & ITEM_INCL) incl = 1; \ + else if (flags & ITEM_EXCL) incl = 2; \ + +//#define OP_level(op) ( ((op)->flags & ITEM_L1) ? 
ITEM_L1 : ITEM_L2 ) +#define OP_level(op) ( (op)->flags & ~(LOG_OP | \ + ITEM_INCL | ITEM_EXCL | ITEM_DIRTY | \ + SRC_L1_M | SRC_L1_H | SRC_L2_M | SRC_L2_H | \ + SRC_DIRECT_SET | SRC_L1_COPY | SRC_WB ) ) + +#define FLAGS_level(flags) ( flags & ~(LOG_OP | \ + ITEM_INCL | ITEM_EXCL | ITEM_DIRTY | \ + SRC_L1_M | SRC_L1_H | SRC_L2_M | SRC_L2_H | \ + SRC_DIRECT_SET | SRC_L1_COPY | SRC_WB ) ) + +#define OP_clu(op) ( (op)->flags & ~(LOG_OP | \ + ITEM_L1 | ITEM_L2 | ITEM_DIRTY | \ + SRC_L1_M | SRC_L1_H | SRC_L2_M | SRC_L2_H | \ + SRC_DIRECT_SET | SRC_L1_COPY | SRC_WB ) ) + +#define OP_src(op) ( (op)->flags & ~(ITEM_L1 | ITEM_L2 | LOG_OP | \ + ITEM_INCL | ITEM_EXCL | ITEM_DIRTY ) ) + +#define OP_log(op) ((op)->flags & LOG_OP) +#define OP_incl(op) ((op)->flags & ITEM_INCL) +#define OP_excl(op) ((op)->flags & ITEM_EXCL) +#define OP_set_flag(op,flag) ((op))->flags |= flag; + +//#define DEBUGMC +//#define DEBUGS +//using namespace folly; +using namespace moodycamel; + +pthread_mutex_t cid_lock_m = PTHREAD_MUTEX_INITIALIZER; +static uint32_t connids_m = 1; + +#define NCLASSES 40 +#define CHUNK_ALIGN_BYTES 8 +static int classes = 0; +static int sizes[NCLASSES+1]; +static int inclusives[NCLASSES+1]; + + +static vector cid_rate; + +extern int max_n[3]; + +static void init_inclusives(char *inclusive_str) { + int j = 1; + for (int i = 0; i < (int)strlen(inclusive_str); i++) { + if (inclusive_str[i] == '-') { + continue; + } else { + inclusives[j] = inclusive_str[i] - '0'; + j++; + } + } +} + +static void init_classes() { + + double factor = 1.25; + unsigned int size = 96; //warning if you change this you die + unsigned int i = 0; + unsigned int chunk_size_max = 1048576/2; + while (++i < NCLASSES-1) { + if (size >= chunk_size_max / factor) { + break; + } + if (size % CHUNK_ALIGN_BYTES) + size += CHUNK_ALIGN_BYTES - (size % CHUNK_ALIGN_BYTES); + sizes[i] = size; + size *= factor; + } + sizes[i] = chunk_size_max; + classes = i; + +} + +static int get_class(int vl, uint32_t kl) { + 
//warning if you change this you die + int vsize = vl+kl+48+1+2; + int res = 1; + while (vsize > sizes[res]) + if (res++ == classes) { + //fprintf(stderr,"item larger than max class size. vsize: %d, class size: %d\n",vsize,sizes[res]); + return -1; + } + return res; +} + +static int get_incl(int vl, int kl) { + int clsid = get_class(vl,kl); + if (clsid) { + return inclusives[clsid]; + } else { + return -1; + } +} + +void ConnectionMulti::output_op(Operation *op, int type, bool found) { + char output[1024]; + char k[256]; + char a[256]; + char s[256]; + memset(k,0,256); + memset(a,0,256); + memset(s,0,256); + strncpy(k,op->key,255); + switch (type) { + case 0: //get + sprintf(a,"issue_get"); + break; + case 1: //set + sprintf(a,"issue_set"); + break; + case 2: //resp + sprintf(a,"resp"); + break; + } + switch(read_state) { + case INIT_READ: + sprintf(s,"init"); + break; + case CONN_SETUP: + sprintf(s,"setup"); + break; + case LOADING: + sprintf(s,"load"); + break; + case IDLE: + sprintf(s,"idle"); + break; + case WAITING_FOR_GET: + sprintf(s,"waiting for get"); + break; + case WAITING_FOR_SET: + sprintf(s,"waiting for set"); + break; + case WAITING_FOR_DELETE: + sprintf(s,"waiting for del"); + break; + case MAX_READ_STATE: + sprintf(s,"max"); + break; + } + if (type == 2) { + sprintf(output,"conn: %u, action: %s op: %s, opaque: %u, found: %d, type: %d\n",cid,a,k,op->opaque,found,op->type); + } else { + sprintf(output,"conn: %u, action: %s op: %s, opaque: %u, type: %d\n",cid,a,k,op->opaque,op->type); + } + write(2,output,strlen(output)); +} + +/** + * Create a new connection to a server endpoint. 
+ */ +ConnectionMulti::ConnectionMulti(struct event_base* _base, struct evdns_base* _evdns, + string _hostname1, string _hostname2, string _port, options_t _options, + bool sampling, int fd1, int fd2 ) : + start_time(0), stats(sampling), options(_options), + hostname1(_hostname1), hostname2(_hostname2), port(_port), base(_base), evdns(_evdns) +{ + pthread_mutex_lock(&cid_lock_m); + cid = connids_m++; + if (cid == 1) { + cid_rate.push_back(100); + cid_rate.push_back(0); + init_classes(); + init_inclusives(options.inclusives); + } else { + cid_rate.push_back(0); + } + + pthread_mutex_unlock(&cid_lock_m); + + valuesize = createGenerator(options.valuesize); + keysize = createGenerator(options.keysize); + srand(time(NULL)); + keygen = new KeyGenerator(keysize, options.records); + + total = 0; + eof = 0; + o_percent = 0; + + if (options.lambda <= 0) { + iagen = createGenerator("0"); + } else { + D("iagen = createGenerator(%s)", options.ia); + iagen = createGenerator(options.ia); + iagen->set_lambda(options.lambda); + } + + read_state = IDLE; + write_state = INIT_WRITE; + last_quiet1 = false; + last_quiet2 = false; + + last_tx = last_rx = 0.0; + + + op_queue_size = (uint32_t*)malloc(sizeof(uint32_t)*(LEVELS+1)); + opaque = (uint32_t*)malloc(sizeof(uint32_t)*(LEVELS+1)); + op_queue = (Operation***)malloc(sizeof(Operation**)*(LEVELS+1)); + + for (int i = 0; i <= LEVELS; i++) { + op_queue_size[i] = 0; + opaque[i] = 1; + //op_queue[i] = (Operation*)malloc(sizeof(int)*OPAQUE_MAX); + op_queue[i] = (Operation**)malloc(sizeof(Operation*)*(OPAQUE_MAX*2)); + + } + + bev1 = bufferevent_socket_new(base, fd1, BEV_OPT_CLOSE_ON_FREE); + bufferevent_setcb(bev1, bev_read_cb1, bev_write_cb_m, bev_event_cb1, this); + bufferevent_enable(bev1, EV_READ | EV_WRITE); + + bev2 = bufferevent_socket_new(base, fd2, BEV_OPT_CLOSE_ON_FREE); + bufferevent_setcb(bev2, bev_read_cb2, bev_write_cb_m, bev_event_cb2, this); + bufferevent_enable(bev2, EV_READ | EV_WRITE); + + timer = evtimer_new(base, 
timer_cb_m, this); + + read_state = IDLE; +} + + +void ConnectionMulti::set_queue(queue* a_trace_queue) { + trace_queue = a_trace_queue; + trace_queue_n = a_trace_queue->size(); +} + +void ConnectionMulti::set_lock(pthread_mutex_t* a_lock) { + lock = a_lock; +} + +void ConnectionMulti::set_g_wbkeys(unordered_map> *a_wb_keys) { + g_wb_keys = a_wb_keys; +} + +uint32_t ConnectionMulti::get_cid() { + return cid; +} + +int ConnectionMulti::add_to_wb_keys(string key) { + int ret = -1; + pthread_mutex_lock(lock); + auto pos = g_wb_keys->find(key); + if (pos == g_wb_keys->end()) { + g_wb_keys->insert( {key, vector() }); + ret = 1; + //fprintf(stderr,"----set: %s----\n",Op.key.c_str()); + //for (auto iter = g_wb_keys->begin(); iter != g_wb_keys->end(); ++iter){ + // fprintf(stderr,"%s,%d\n",iter->first.c_str(),iter->second); + //} + //fprintf(stderr,"----%d----\n",cid); + } else { + ret = 2; + } + + pthread_mutex_unlock(lock); + return ret; +} + +void ConnectionMulti::del_wb_keys(string key) { + + pthread_mutex_lock(lock); + auto position = g_wb_keys->find(key); + if (position != g_wb_keys->end()) { + g_wb_keys->erase(position); + } else { + fprintf(stderr,"expected %s, got nuthin\n",key.c_str()); + } + pthread_mutex_unlock(lock); +} + + +int ConnectionMulti::do_connect() { + + int connected = 0; + if (options.unix_socket) { + + + struct sockaddr_un sin1; + memset(&sin1, 0, sizeof(sin1)); + sin1.sun_family = AF_LOCAL; + strcpy(sin1.sun_path, hostname1.c_str()); + + int addrlen; + addrlen = sizeof(sin1); + + int err = bufferevent_socket_connect(bev1, (struct sockaddr*)&sin1, addrlen); + if (err == 0) { + connected = 1; + } else { + connected = 0; + err = errno; + fprintf(stderr,"l1 error %s\n",strerror(err)); + } + + struct sockaddr_un sin2; + memset(&sin2, 0, sizeof(sin2)); + sin2.sun_family = AF_LOCAL; + strcpy(sin2.sun_path, hostname2.c_str()); + + addrlen = sizeof(sin2); + err = bufferevent_socket_connect(bev2, (struct sockaddr*)&sin2, addrlen); + if (err == 0) { + 
connected = 1; + } else { + connected = 0; + err = errno; + fprintf(stderr,"l2 error %s\n",strerror(err)); + } + } + read_state = IDLE; + return connected; +} + +/** + * Destroy a connection, performing cleanup. + */ +ConnectionMulti::~ConnectionMulti() { + + + for (int i = 0; i <= LEVELS; i++) { + free(op_queue[i]); + + } + + free(op_queue_size); + free(opaque); + free(op_queue); + //event_free(timer); + //timer = NULL; + // FIXME: W("Drain op_q?"); + //bufferevent_free(bev1); + //bufferevent_free(bev2); + + delete iagen; + delete keygen; + delete keysize; + delete valuesize; +} + +/** + * Reset the connection back to an initial, fresh state. + */ +void ConnectionMulti::reset() { + // FIXME: Actually check the connection, drain all bufferevents, drain op_q. + //assert(op_queue.size() == 0); + //evtimer_del(timer); + read_state = IDLE; + write_state = INIT_WRITE; + stats = ConnectionStats(stats.sampling); +} + +/** + * Set our event processing priority. + */ +void ConnectionMulti::set_priority(int pri) { + if (bufferevent_priority_set(bev1, pri)) { + DIE("bufferevent_set_priority(bev, %d) failed", pri); + } +} + + + +/** + * Get/Set or Set Style + * If a GET command: Issue a get first, if not found then set + * If trace file (or prob. 
write) says to set, then set it + */ +int ConnectionMulti::issue_getsetorset(double now) { + + + + int ret = 0; + int nissued = 0; + //while (nissued < options.depth) { + + //pthread_mutex_lock(lock); + if (!trace_queue->empty()) { + Operation Op = *(trace_queue->front()); + if (Op.type == Operation::SASL) { + eof = 1; + cid_rate[cid] = 100; + fprintf(stderr,"cid %d done\n",cid); + string op_queue1; + string op_queue2; + for (int j = 0; j < 2; j++) { + for (int i = 0; i < OPAQUE_MAX; i++) { + if (op_queue[j+1][i] != NULL) { + if (j == 0) { + op_queue1 = op_queue1 + "," + op_queue[j+1][i]->key; + } else { + op_queue2 = op_queue2 + "," + op_queue[j+1][i]->key; + } + } + } + } + fprintf(stderr,"cid %d op_queue1: %s op_queue2: %s, op_queue_size1: %d, op_queue_size2: %d\n",cid,op_queue1.c_str(),op_queue2.c_str(),op_queue_size[1],op_queue_size[2]); + return 1; + } + + + /* check if in global wb queue */ + pthread_mutex_lock(lock); + double percent = (double)total/((double)trace_queue_n) * 100; + if (percent > o_percent+1) { + //update the percentage table and see if we should execute + std::vector::iterator mp = std::min_element(cid_rate.begin(), cid_rate.end()); + double min_percent = *mp; + + if (percent > min_percent+2) { + pthread_mutex_unlock(lock); + struct timeval tv; + tv.tv_sec = 1; + tv.tv_usec = 0; + int good = 0; + if (!event_pending(timer, EV_TIMEOUT, NULL)) { + good = evtimer_add(timer, &tv); + } + if (good != 0) { + fprintf(stderr,"eventimer is messed up!\n"); + return 2; + } + return 1; + } + cid_rate[cid] = percent; + fprintf(stderr,"%f,%d,%.4f\n",now,cid,percent); + o_percent = percent; + } + auto check = g_wb_keys->find(Op.key); + if (check != g_wb_keys->end()) { + pthread_mutex_unlock(lock); + struct timeval tv; + tv.tv_sec = 1; + tv.tv_usec = 0; + int good = 0; + if (!event_pending(timer, EV_TIMEOUT, NULL)) { + good = evtimer_add(timer, &tv); + } + if (good != 0) { + fprintf(stderr,"eventimer is messed up in checking for key: %s\n",Op.key); + return 
2; + } + return 1; + } else { + //g_wb_keys->insert( {Op.key, cid} ); + //g_wb_keys->insert( {Op.key+"l2", cid} ); + } + pthread_mutex_unlock(lock); + + + + char key[256]; + memset(key,0,256); + strncpy(key, Op.key,255); + int vl = Op.valuelen; + + trace_queue->pop(); + + int issued = 0; + int incl = get_incl(vl,strlen(key)); + int cid = get_class(vl,strlen(key)); + int flags = 0; + int touch = (rand() % 100); + int index = lrand48() % (1024 * 1024); + //int touch = 1; + SET_INCL(incl,flags); + + switch(Op.type) + { + case Operation::GET: + //if (nissued < options.depth-1) { + // issued = issue_get_with_len(key, vl, now, false, 1, flags, 0, 1); + // last_quiet1 = false; + //} else { + //} + if (options.threshold > 0) { + if (Op.future) { + key_hist[key] = 1; + } + } + issued = issue_get_with_len(key, vl, now, false, flags | LOG_OP | ITEM_L1); + if (touch == 1 && incl == 1) { + issue_touch(key,vl,now, ITEM_L2 | SRC_L1_H); + } + last_quiet1 = false; + this->stats.gets++; + this->stats.gets_cid[cid]++; + + break; + case Operation::SET: + if (last_quiet1) { + issue_noop(now,1); + } + if (incl == 1) { + issue_touch(key,vl,now, ITEM_L2 | SRC_DIRECT_SET); + } else if (incl == 2) { + issue_delete(key,now, ITEM_L2 | SRC_DIRECT_SET ); + } + issued = issue_set(key, &random_char[index], vl, now, flags | LOG_OP | ITEM_L1 | SRC_DIRECT_SET); + last_quiet1 = false; + this->stats.sets++; + this->stats.sets_cid[cid]++; + break; + case Operation::DELETE: + case Operation::TOUCH: + case Operation::NOOP: + case Operation::SASL: + fprintf(stderr,"invalid line: %s, vl: %d\n",key,vl); + break; + + } + if (issued) { + nissued++; + total++; + } else { + fprintf(stderr,"failed to issue line: %s, vl: %d @T: XX \n",key,vl); + } + } else { + return 1; + } + //} + if (last_quiet1) { + issue_noop(now,1); + last_quiet1 = false; + } + + return ret; + +} + +/** + * Issue a get request to the server. 
+ */ +int ConnectionMulti::issue_get_with_len(const char* key, int valuelen, double now, bool quiet, uint32_t flags, Operation *l1) { + + struct evbuffer *output = NULL; + int level = 0; + switch (FLAGS_level(flags)) { + case 1: + level = 1; + output = bufferevent_get_output(bev1); + break; + case 2: + level = 2; + output = bufferevent_get_output(bev2); + break; + } + //Operation op; + Operation *pop = new Operation(); + +#if HAVE_CLOCK_GETTIME + pop->start_time = get_time_accurate(); +#else + if (now == 0.0) { +#if USE_CACHED_TIME + struct timeval now_tv; + event_base_gettimeofday_cached(base, &now_tv); + pop->start_time = tv_to_double(&now_tv); +#else + pop->start_time = get_time(); +#endif + } else { + pop->start_time = now; + } +#endif + + strncpy(pop->key,key,255); + pop->valuelen = valuelen; + pop->type = Operation::GET; + pop->opaque = opaque[level]++; + pop->flags = flags; + pop->clsid = get_class(valuelen,strlen(key)); + if (l1 != NULL) { + pop->l1 = l1; + } + op_queue[level][pop->opaque] = pop; + //op_queue[level].push(op); + op_queue_size[level]++; + +#ifdef DEBUGS + fprintf(stderr,"cid: %d issing get: %s, size: %u, level %d, flags: %d, opaque: %d\n",cid,key,valuelen,level,flags,pop->opaque); +#endif + + if (opaque[level] > OPAQUE_MAX) { + opaque[level] = 1; + } + + //if (read_state == IDLE) read_state = WAITING_FOR_GET; + uint16_t keylen = strlen(key); + + // each line is 4-bytes + binary_header_t h = { 0x80, CMD_GET, htons(keylen), + 0x00, 0x00, htons(0), + htonl(keylen) }; + if (quiet) { + h.opcode = CMD_GETQ; + } + h.opaque = htonl(pop->opaque); + + evbuffer_add(output, &h, 24); + evbuffer_add(output, key, keylen); + + stats.tx_bytes += 24 + keylen; + return 1; +} + +/** + * Issue a get request to the server. 
+ */ +int ConnectionMulti::issue_touch(const char* key, int valuelen, double now, int flags) { + struct evbuffer *output = NULL; + int level = 0; + switch (FLAGS_level(flags)) { + case 1: + level = 1; + output = bufferevent_get_output(bev1); + break; + case 2: + level = 2; + output = bufferevent_get_output(bev2); + break; + } + Operation *pop = new Operation(); + +#if HAVE_CLOCK_GETTIME + pop->start_time = get_time_accurate(); +#else + if (now == 0.0) { +#if USE_CACHED_TIME + struct timeval now_tv; + event_base_gettimeofday_cached(base, &now_tv); + pop->start_time = tv_to_double(&now_tv); +#else + pop->start_time = get_time(); +#endif + } else { + pop->start_time = now; + } +#endif + + strncpy(pop->key,key,255); + pop->valuelen = valuelen; + pop->type = Operation::TOUCH; + pop->opaque = opaque[level]++; + pop->flags = flags; + op_queue[level][pop->opaque] = pop; + //op_queue[level].push(op); + op_queue_size[level]++; + + if (opaque[level] > OPAQUE_MAX) { + opaque[level] = 1; + } + +#ifdef DEBUGS + fprintf(stderr,"issing touch: %s, size: %u, level %d, flags: %d, opaque: %d\n",key,valuelen,level,flags,pop->opaque); +#endif + //if (read_state == IDLE) read_state = WAITING_FOR_GET; + uint16_t keylen = strlen(key); + + // each line is 4-bytes + binary_header_t h = { 0x80, CMD_TOUCH, htons(keylen), + 0x04, 0x00, htons(0), + htonl(keylen + 4) }; + h.opaque = htonl(pop->opaque); + + uint32_t exp = 0; + if (flags & ITEM_DIRTY) { + exp = htonl(flags); + } + evbuffer_add(output, &h, 24); + evbuffer_add(output, &exp, 4); + evbuffer_add(output, key, keylen); + + + stats.tx_bytes += 24 + keylen; + + //stats.log_access(op); + return 1; +} + +/** + * Issue a delete request to the server. 
+ */ +int ConnectionMulti::issue_delete(const char* key, double now, uint32_t flags) { + struct evbuffer *output = NULL; + int level = 0; + switch (FLAGS_level(flags)) { + case 1: + level = 1; + output = bufferevent_get_output(bev1); + break; + case 2: + level = 2; + output = bufferevent_get_output(bev2); + break; + } + //Operation op; + Operation *pop = new Operation(); + +#if HAVE_CLOCK_GETTIME + pop->start_time = get_time_accurate(); +#else + if (now == 0.0) { +#if USE_CACHED_TIME + struct timeval now_tv; + event_base_gettimeofday_cached(base, &now_tv); + pop->start_time = tv_to_double(&now_tv); +#else + pop->start_time = get_time(); +#endif + } else { + pop->start_time = now; + } +#endif + + strncpy(pop->key,key,255); + pop->type = Operation::DELETE; + pop->opaque = opaque[level]++; + pop->flags = flags; + op_queue[level][pop->opaque] = pop; + //op_queue[level].push(op); + op_queue_size[level]++; + + if (opaque[level] > OPAQUE_MAX) { + opaque[level] = 1; + } +#ifdef DEBUGS + fprintf(stderr,"cid: %d issing delete: %s, level %d, flags: %d, opaque: %d\n",cid,key,level,flags,pop->opaque); +#endif + + //if (read_state == IDLE) read_state = WAITING_FOR_GET; + uint16_t keylen = strlen(key); + + // each line is 4-bytes + binary_header_t h = { 0x80, CMD_DELETE, htons(keylen), + 0x00, 0x00, htons(0), + htonl(keylen) }; + h.opaque = htonl(pop->opaque); + + evbuffer_add(output, &h, 24); + evbuffer_add(output, key, keylen); + + stats.tx_bytes += 24 + keylen; + + //stats.log_access(op); + return 1; +} + +void ConnectionMulti::issue_noop(double now, int level) { + struct evbuffer *output = NULL; + switch (level) { + case 1: + output = bufferevent_get_output(bev1); + break; + case 2: + output = bufferevent_get_output(bev2); + break; + } + Operation op; + + if (now == 0.0) op.start_time = get_time(); + else op.start_time = now; + + binary_header_t h = { 0x80, CMD_NOOP, 0x0000, + 0x00, 0x00, htons(0), + 0x00 }; + + evbuffer_add(output, &h, 24); + +} + +/** + * Issue a set 
request to the server. + */ +int ConnectionMulti::issue_set(const char* key, const char* value, int length, double now, uint32_t flags) { + + struct evbuffer *output = NULL; + int level = 0; + switch (FLAGS_level(flags)) { + case 1: + level = 1; + output = bufferevent_get_output(bev1); + break; + case 2: + level = 2; + output = bufferevent_get_output(bev2); + break; + } + //Operation op; + Operation *pop = new Operation(); + +#if HAVE_CLOCK_GETTIME + pop->start_time = get_time_accurate(); +#else + if (now == 0.0) pop->start_time = get_time(); + else pop->start_time = now; +#endif + + + strncpy(pop->key,key,255); + pop->valuelen = length; + pop->type = Operation::SET; + pop->opaque = opaque[level]++; + pop->flags = flags; + pop->clsid = get_class(length,strlen(key)); + op_queue[level][pop->opaque] = pop; + //op_queue[level].push(op); + op_queue_size[level]++; +#ifdef DEBUGS + fprintf(stderr,"cid: %d issing set: %s, size: %u, level %d, flags: %d, opaque: %d\n",cid,key,length,level,flags,pop->opaque); +#endif + + if (opaque[level] > OPAQUE_MAX) { + opaque[level] = 1; + } + + uint16_t keylen = strlen(key); + + // each line is 4-bytes + binary_header_t h = { 0x80, CMD_SET, htons(keylen), + 0x08, 0x00, htons(0), + htonl(keylen + 8 + length) }; + h.opaque = htonl(pop->opaque); + + uint32_t f = htonl(flags); + uint32_t exp = 0; + + evbuffer_add(output, &h, 24); + evbuffer_add(output, &f, 4); + evbuffer_add(output, &exp, 4); + evbuffer_add(output, key, keylen); + evbuffer_add(output, value, length); + + stats.tx_bytes += length + 32 + keylen; + return 1; +} + +/** + * Return the oldest live operation in progress. + */ +void ConnectionMulti::pop_op(Operation *op) { + + uint8_t level = OP_level(op); + //op_queue[level].erase(op); + op_queue_size[level]--; + + + if (read_state == LOADING) return; + read_state = IDLE; + + // Advance the read state machine. 
+ //if (op_queue.size() > 0) { + // Operation& op = op_queue.front(); + // switch (op.type) { + // case Operation::GET: read_state = WAITING_FOR_GET; break; + // case Operation::SET: read_state = WAITING_FOR_SET; break; + // case Operation::DELETE: read_state = WAITING_FOR_DELETE; break; + // default: DIE("Not implemented."); + // } + //} +} + +/** + * Finish up (record stats) an operation that just returned from the + * server. + */ +void ConnectionMulti::finish_op(Operation *op, int was_hit) { + double now; +#if USE_CACHED_TIME + struct timeval now_tv; + event_base_gettimeofday_cached(base, &now_tv); + now = tv_to_double(&now_tv); +#else + now = get_time(); +#endif +#if HAVE_CLOCK_GETTIME + op->end_time = get_time_accurate(); +#else + op->end_time = now; +#endif + + if (options.successful_queries && was_hit) { + switch (op->type) { + case Operation::GET: + switch (OP_level(op)) { + case 1: + stats.log_get_l1(*op); + break; + case 2: + stats.log_get_l2(*op); + break; + } + break; + case Operation::SET: + switch (OP_level(op)) { + case 1: + stats.log_set_l1(*op); + break; + case 2: + stats.log_set_l2(*op); + break; + } + break; + case Operation::DELETE: break; + case Operation::TOUCH: break; + default: DIE("Not implemented."); + } + } else { + switch (op->type) { + case Operation::GET: + if (OP_log(op)) { + switch (OP_level(op)) { + case 1: + stats.log_get_l1(*op); + break; + case 2: + stats.log_get_l2(*op); + if (op->l1 != NULL) { + op->l1->end_time = now; + stats.log_get(*(op->l1)); + } + break; + } + } + break; + case Operation::SET: + if (OP_log(op)) { + switch (OP_level(op)) { + case 1: + stats.log_set_l1(*op); + break; + case 2: + stats.log_set_l2(*op); + break; + } + } + break; + case Operation::DELETE: break; + case Operation::TOUCH: break; + default: DIE("Not implemented."); + } + } + + last_rx = now; + uint8_t level = OP_level(op); + if (op->l1 != NULL) { + delete op_queue[1][op->l1->opaque]; + op_queue[1][op->l1->opaque] = 0; + op_queue_size[1]--; + } + 
//op_queue[level].erase(op_queue[level].begin()+opopq); + if (op == op_queue[level][op->opaque] && + op->opaque == op_queue[level][op->opaque]->opaque) { + delete op_queue[level][op->opaque]; + op_queue[level][op->opaque] = 0; + } else { + fprintf(stderr,"op_queue out of sync! Expected %p, got %p, opa1: %d opaq2: %d\n", + op,op_queue[level][op->opaque],op->opaque,op_queue[level][op->opaque]->opaque); + } + op_queue_size[level]--; + read_state = IDLE; + + +} + + + +/** + * Check if our testing is done and we should exit. + */ +bool ConnectionMulti::check_exit_condition(double now) { + if (eof && op_queue_size[1] == 0 && op_queue_size[2] == 0) { + return true; + } + if (read_state == INIT_READ) return false; + + return false; +} + +/** + * Handle new connection and error events. + */ +void ConnectionMulti::event_callback1(short events) { + if (events & BEV_EVENT_CONNECTED) { + D("Connected to %s:%s.", hostname1.c_str(), port.c_str()); + int fd = bufferevent_getfd(bev1); + if (fd < 0) DIE("bufferevent_getfd"); + + if (!options.no_nodelay && !options.unix_socket) { + int one = 1; + if (setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, + (void *) &one, sizeof(one)) < 0) + DIE("setsockopt()"); + } +#ifdef DEBUGMC + fprintf(stderr,"libevent connected %s, fd: %u\n",hostname1.c_str(),bufferevent_getfd(bev1)); +#endif + + + } else if (events & BEV_EVENT_ERROR) { + int err = bufferevent_socket_get_dns_error(bev1); + //if (err) DIE("DNS error: %s", evutil_gai_strerror(err)); + if (err) fprintf(stderr,"DNS error: %s", evutil_gai_strerror(err)); + fprintf(stderr,"CID: %d - Got an error: %s\n",this->cid, + evutil_socket_error_to_string(EVUTIL_SOCKET_ERROR())); + + //DIE("BEV_EVENT_ERROR: %s", strerror(errno)); + + } else if (events & BEV_EVENT_EOF) { + fprintf(stderr,"Unexpected EOF from server."); + return; + } +} + +/** + * Handle new connection and error events. 
+ */ +void ConnectionMulti::event_callback2(short events) { + if (events & BEV_EVENT_CONNECTED) { + D("Connected to %s:%s.", hostname2.c_str(), port.c_str()); + int fd = bufferevent_getfd(bev2); + if (fd < 0) DIE("bufferevent_getfd"); + + if (!options.no_nodelay && !options.unix_socket) { + int one = 1; + if (setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, + (void *) &one, sizeof(one)) < 0) + DIE("setsockopt()"); + } +#ifdef DEBUGMC + fprintf(stderr,"libevent connected %s, fd: %u\n",hostname2.c_str(),bufferevent_getfd(bev2)); +#endif + + + } else if (events & BEV_EVENT_ERROR) { + int err = bufferevent_socket_get_dns_error(bev2); + //if (err) DIE("DNS error: %s", evutil_gai_strerror(err)); + if (err) fprintf(stderr,"DNS error: %s", evutil_gai_strerror(err)); + fprintf(stderr,"CID: %d - Got an error: %s\n",this->cid, + evutil_socket_error_to_string(EVUTIL_SOCKET_ERROR())); + + //DIE("BEV_EVENT_ERROR: %s", strerror(errno)); + + + } else if (events & BEV_EVENT_EOF) { + fprintf(stderr,"Unexpected EOF from server."); + return; + } +} + +/** + * Request generation loop. Determines whether or not to issue a new command, + * based on timer events. + * + * Note that this function loops. Be wary of break vs. return. 
+ */ +void ConnectionMulti::drive_write_machine(double now) { + if (now == 0.0) now = get_time(); + + double delay; + struct timeval tv; + + if (check_exit_condition(now)) { + return; + } + + while (1) { + switch (write_state) { + case INIT_WRITE: + delay = iagen->generate(); + next_time = now + delay; + double_to_tv(delay, &tv); + evtimer_add(timer, &tv); + write_state = ISSUING; + break; + + case ISSUING: + if ( (op_queue_size[1] >= (size_t) options.depth) || + (op_queue_size[2] >= (size_t) options.depth) ) { + write_state = WAITING_FOR_OPQ; + break; + } + + if (options.getsetorset) { + int ret = issue_getsetorset(now); + if (ret == 1) return; //if at EOF + } + + last_tx = now; + for (int i = 1; i <= 2; i++) { + stats.log_op(op_queue_size[i]); + } + break; + + case WAITING_FOR_TIME: + write_state = ISSUING; + break; + + case WAITING_FOR_OPQ: + if ( (op_queue_size[1] >= (size_t) options.depth) || + (op_queue_size[2] >= (size_t) options.depth) ) { + //double delay = 0.01; + //struct timeval tv; + //double_to_tv(delay, &tv); + //evtimer_add(timer, &tv); + return; + } else { + write_state = ISSUING; + break; + } + + default: DIE("Not implemented"); + } + } +} + + + +/** + * Tries to consume a binary response (in its entirety) from an evbuffer. + * + * @param input evBuffer to read response from + * @return true if consumed, false if not enough data in buffer. 
+ */ +static bool handle_response(ConnectionMulti *conn, evbuffer *input, bool &done, bool &found, int &opcode, uint32_t &opaque, evicted_t *evict, int level) { + // Read the first 24 bytes as a header + int length = evbuffer_get_length(input); + if (length < 24) return false; + binary_header_t* h = + reinterpret_cast(evbuffer_pullup(input, 24)); + //assert(h); + + uint32_t bl = ntohl(h->body_len); + uint16_t kl = ntohs(h->key_len); + uint8_t el = h->extra_len; + // Not whole response + int targetLen = 24 + bl; + if (length < targetLen) { + return false; + } + + opcode = h->opcode; + opaque = ntohl(h->opaque); + uint16_t status = ntohs(h->status); +#ifdef DEBUGMC + fprintf(stderr,"cid: %d handle resp from l%d - opcode: %u opaque: %u keylen: %u extralen: %u datalen: %u status: %u\n",conn->get_cid(),level, + h->opcode,ntohl(h->opaque),ntohs(h->key_len),h->extra_len, + ntohl(h->body_len),ntohs(h->status)); +#endif + + + // If something other than success, count it as a miss + if (opcode == CMD_GET && status == RESP_NOT_FOUND) { + switch(level) { + case 1: + conn->stats.get_misses_l1++; + break; + case 2: + conn->stats.get_misses_l2++; + conn->stats.get_misses++; + conn->stats.window_get_misses++; + break; + + } + found = false; + evbuffer_drain(input, targetLen); + + } else if (opcode == CMD_SET && kl > 0) { + //first data is extras: clsid, flags, eflags + if (evict) { + evbuffer_drain(input,24); + unsigned char *buf = evbuffer_pullup(input,bl); + + + evict->clsid = *((uint32_t*)buf); + evict->clsid = ntohl(evict->clsid); + buf += 4; + + evict->serverFlags = *((uint32_t*)buf); + evict->serverFlags = ntohl(evict->serverFlags); + buf += 4; + + evict->evictedFlags = *((uint32_t*)buf); + evict->evictedFlags = ntohl(evict->evictedFlags); + buf += 4; + + + evict->evictedKeyLen = kl; + evict->evictedKey = (char*)malloc(kl+1); + memset(evict->evictedKey,0,kl+1); + memcpy(evict->evictedKey,buf,kl); + buf += kl; + + + evict->evictedLen = bl - kl - el; + evict->evictedData = 
(char*)malloc(evict->evictedLen); + memcpy(evict->evictedData,buf,evict->evictedLen); + evict->evicted = true; + //fprintf(stderr,"class: %u, serverFlags: %u, evictedFlags: %u\n",evict->clsid,evict->serverFlags,evict->evictedFlags); + evbuffer_drain(input,bl); + } else { + evbuffer_drain(input, targetLen); + } + } else if (opcode == CMD_TOUCH && status == RESP_NOT_FOUND) { + found = false; + evbuffer_drain(input, targetLen); + } else if (opcode == CMD_DELETE && status == RESP_NOT_FOUND) { + found = false; + evbuffer_drain(input, targetLen); + } else { + evbuffer_drain(input, targetLen); + } + + conn->stats.rx_bytes += targetLen; + done = true; + return true; +} + +/** + * Handle incoming data (responses). + */ +void ConnectionMulti::read_callback1() { + struct evbuffer *input = bufferevent_get_input(bev1); + + Operation *op = NULL; + bool done, found; + + //initially assume found (for sets that may come through here) + //is this correct? do we want to assume true in case that + //GET was found, but wrong value size (i.e. 
update value) + found = true; + + //if (op_queue.size() == 0) V("Spurious read callback."); + bool full_read = true; + while (full_read) { + + + int opcode; + uint32_t opaque; + evicted_t *evict = (evicted_t*)malloc(sizeof(evicted_t)); + memset(evict,0,sizeof(evicted_t)); + + full_read = handle_response(this,input, done, found, opcode, opaque, evict,1); + if (full_read) { + if (opcode == CMD_NOOP) { +#ifdef DEBUGMC + char out[128]; + sprintf(out,"conn l1: %u, reading noop\n",cid); + write(2,out,strlen(out)); +#endif + if (evict->evictedKey) free(evict->evictedKey); + if (evict->evictedData) free(evict->evictedData); + free(evict); + continue; + } + op = op_queue[1][opaque]; +#ifdef DEBUGMC + char out[128]; + sprintf(out,"conn l1: %u, reading opaque: %u\n",cid,opaque); + write(2,out,strlen(out)); + output_op(op,2,found); +#endif + if (strlen(op->key) < 1) { +#ifdef DEBUGMC + char out2[128]; + sprintf(out2,"conn l1: %u, bad op: %s\n",cid,op->key); + write(2,out2,strlen(out2)); +#endif + if (evict->evictedKey) free(evict->evictedKey); + if (evict->evictedData) free(evict->evictedData); + free(evict); + continue; + } + } else { + if (evict) { + if (evict->evictedKey) free(evict->evictedKey); + if (evict->evictedData) free(evict->evictedData); + free(evict); + } + break; + } + + + double now = get_time(); + int wb = 0; + if (options.rand_admit) { + wb = (rand() % options.rand_admit); + } + switch (op->type) { + case Operation::GET: + if (done) { + if ( !found && (options.getset || options.getsetorset) ) { + /* issue a get a l2 */ + char key[256]; + memset(key,0,256); + strncpy(key, op->key,255); + int vl = op->valuelen; + int flags = OP_clu(op); + issue_get_with_len(key,vl,now,false, flags | SRC_L1_M | ITEM_L2 | LOG_OP, op); + op->end_time = now; + this->stats.log_get_l1(*op); + //finish_op(op,0); + + } else { + del_wb_keys(op->key); + finish_op(op,found); + } + } else { + char out[128]; + sprintf(out,"conn l1: %u, not done reading, should do something",cid); + 
write(2,out,strlen(out)); + } + break; + case Operation::SET: + //if (OP_src(op) == SRC_L1_COPY || + // OP_src(op) == SRC_DIRECT_SET || + // OP_src(op) == SRC_L2_M ) { + //} + if (evict->evicted) { + string wb_key(evict->evictedKey); + if ((evict->evictedFlags & ITEM_INCL) && (evict->evictedFlags & ITEM_DIRTY)) { + //wb_keys.push_back(wb_key); + int ret = add_to_wb_keys(wb_key); + if (ret == 1) { + issue_set(evict->evictedKey, evict->evictedData, evict->evictedLen, now, ITEM_L2 | ITEM_INCL | LOG_OP | SRC_WB); + } + //fprintf(stderr,"incl writeback %s\n",evict->evictedKey); + this->stats.incl_wbs++; + } else if (evict->evictedFlags & ITEM_EXCL) { + //fprintf(stderr,"excl writeback %s\n",evict->evictedKey); + //strncpy(wb_key,evict->evictedKey,255); + if ( (options.rand_admit && wb == 0) || + (options.threshold && (key_hist[wb_key] == 1)) || + (options.wb_all) ) { + int ret = add_to_wb_keys(wb_key); + if (ret == 1) { + issue_set(evict->evictedKey, evict->evictedData, evict->evictedLen, now, ITEM_L2 | ITEM_EXCL | LOG_OP | SRC_WB); + } + this->stats.excl_wbs++; + } + } + /* + if (evict->serverFlags & ITEM_SIZE_CHANGE && OP_src(op) == SRC_DIRECT_SET) { + char key[256]; + memset(key,0,256); + strncpy(key, op->key.c_str(),255); + if (evict->serverFlags & ITEM_INCL) { + int index = lrand48() % (1024 * 1024); + int valuelen = op->valuelen; + //the item's size was changed, issue a SET to L2 as a new command + issue_set(key, &random_char[index], valuelen, now, ITEM_L2 | ITEM_INCL | LOG_OP | SRC_L2_M); + } + } + */ + if (OP_src(op) == SRC_DIRECT_SET) { + if ( (evict->serverFlags & ITEM_SIZE_CHANGE) || ((evict->serverFlags & ITEM_WAS_HIT) == 0)) { + this->stats.set_misses_l1++; + } else if (OP_excl(op) && evict->serverFlags & ITEM_WAS_HIT) { + this->stats.set_excl_hits_l1++; + } else if (OP_incl(op) && evict->serverFlags & ITEM_WAS_HIT) { + this->stats.set_incl_hits_l1++; + } + } + } + del_wb_keys(op->key); + finish_op(op,1); + break; + case Operation::TOUCH: + finish_op(op,1); 
+ break; + default: + fprintf(stderr,"op: %p, key: %s opaque: %u\n",(void*)op,op->key,op->opaque); + DIE("not implemented"); + } + + if (evict) { + if (evict->evictedKey) free(evict->evictedKey); + if (evict->evictedData) free(evict->evictedData); + free(evict); + } + + } + + + double now = get_time(); + if (check_exit_condition(now)) { + return; + } + + last_tx = now; + stats.log_op(op_queue_size[1]); + stats.log_op(op_queue_size[2]); + //for (int i = 1; i <= 2; i++) { + // fprintf(stderr,"max issue buf n[%d]: %u\n",i,max_n[i]); + //} + drive_write_machine(); + + // update events + //if (bev != NULL) { + // // no pending response (nothing to read) and output buffer empty (nothing to write) + // if ((op_queue.size() == 0) && (evbuffer_get_length(bufferevent_get_output(bev)) == 0)) { + // bufferevent_disable(bev, EV_WRITE|EV_READ); + // } + //} +} + +/** + * Handle incoming data (responses). + */ +void ConnectionMulti::read_callback2() { + struct evbuffer *input = bufferevent_get_input(bev2); + + Operation *op = NULL; + bool done, found; + + //initially assume found (for sets that may come through here) + //is this correct? do we want to assume true in case that + //GET was found, but wrong value size (i.e. 
update value) + found = true; + + + //if (op_queue.size() == 0) V("Spurious read callback."); + bool full_read = true; + while (full_read) { + + + int opcode; + uint32_t opaque; + full_read = handle_response(this,input, done, found, opcode, opaque, NULL,2); + if (full_read) { + if (opcode == CMD_NOOP) { +#ifdef DEBUGMC + char out[128]; + sprintf(out,"conn l2: %u, reading noop\n",cid); + write(2,out,strlen(out)); +#endif + continue; + } + op = op_queue[2][opaque]; +#ifdef DEBUGMC + char out[128]; + sprintf(out,"conn l2: %u, reading opaque: %u\n",cid,opaque); + write(2,out,strlen(out)); + output_op(op,2,found); +#endif + if (strlen(op->key) < 1) { +#ifdef DEBUGMC + char out2[128]; + sprintf(out2,"conn l2: %u, bad op: %s\n",cid,op->key); + write(2,out2,strlen(out2)); +#endif + continue; + } + } else { + break; + } + + + double now = get_time(); + switch (op->type) { + case Operation::GET: + if (done) { + if ( !found && (options.getset || options.getsetorset) ) {// && + //(options.twitter_trace != 1)) { + char key[256]; + memset(key,0,256); + strncpy(key, op->key,255); + int valuelen = op->valuelen; + int index = lrand48() % (1024 * 1024); + int flags = OP_clu(op) | SRC_L2_M | LOG_OP; + issue_set(key, &random_char[index], valuelen, now, flags | ITEM_L1); + //wb_keys.push_back(op->key); + last_quiet1 = false; + if (OP_incl(op)) { + //wb_keys.push_back(op->key); + issue_set(key, &random_char[index], valuelen, now, flags | ITEM_L2); + last_quiet2 = false; + } + //pthread_mutex_lock(lock); + //fprintf(stderr,"----miss: %s----\n",key); + //for (auto iter = g_wb_keys->begin(); iter != g_wb_keys->end(); ++iter){ + // fprintf(stderr,"%s,%d\n",iter->first.c_str(),iter->second); + //} + //fprintf(stderr,"----%d----\n",cid); + //pthread_mutex_unlock(lock); + finish_op(op,0); // sets read_state = IDLE + + } else { + if (found) { + char key[256]; + memset(key,0,256); + strncpy(key, op->key,255); + int valuelen = op->valuelen; + int index = lrand48() % (1024 * 1024); + int flags = 
OP_clu(op) | ITEM_L1 | SRC_L1_COPY; + //found in l2, set in l1 + //wb_keys.push_back(op->key); + issue_set(key, &random_char[index],valuelen, now, flags); + this->stats.copies_to_l1++; + //if (OP_excl(op)) { + // issue_delete(key,now, ITEM_L2 | SRC_L1_COPY ); + //} + finish_op(op,1); + + } else { + finish_op(op,0); + } + } + } else { + char out[128]; + sprintf(out,"conn l2: %u, not done reading, should do something",cid); + write(2,out,strlen(out)); + } + break; + case Operation::SET: + if (OP_src(op) == SRC_WB) { + del_wb_keys(op->key); + } + finish_op(op,1); + break; + case Operation::TOUCH: + if (OP_src(op) == SRC_DIRECT_SET) { + char key[256]; + memset(key,0,256); + strncpy(key, op->key,255); + int valuelen = op->valuelen; + if (!found) { + int index = lrand48() % (1024 * 1024); + //int ret = add_to_wb_keys(op->key+"l2"); + //if (ret == 1) { + issue_set(key, &random_char[index],valuelen,now, ITEM_INCL | ITEM_L2 | LOG_OP | SRC_L2_M); + //} + this->stats.set_misses_l2++; + } else { + issue_touch(key,valuelen,now, ITEM_L1 | SRC_L2_H | ITEM_DIRTY); + } + } + //if (!found) { + // //int incl = op->incl; + // //int flags = 0; + // //SET_INCL(incl,flags); + // //// not found in l2, set in l2 + // char key[256]; + // memset(key,0,256); + // strncpy(key, op->key.c_str(),255); + // int valuelen = op->valuelen; + // int index = lrand48() % (1024 * 1024); + // if (OP_src(op) == SRC_DIRECT_SET) { + // issue_set(key, &random_char[index],valuelen,now, ITEM_INCL | ITEM_L2 | LOG_OP); + // this->stats.set_misses_l2++; + // } + // //if (OP_src(op) == SRC_L1_H) { + // // fprintf(stderr,"expected op in l2: %s\n",key); + // //} + // finish_op(op,0); + //} else { + // finish_op(op,1); + //} + finish_op(op,0); + break; + case Operation::DELETE: + //check to see if it was a hit + //fprintf(stderr," del %s -- %d from %d\n",op->key.c_str(),found,OP_src(op)); + if (OP_src(op) == SRC_DIRECT_SET) { + if (found) { + this->stats.delete_hits_l2++; + } else { + this->stats.delete_misses_l2++; + 
} + } + finish_op(op,1); + break; + default: + fprintf(stderr,"op: %p, key: %s opaque: %u\n",(void*)op,op->key,op->opaque); + DIE("not implemented"); + } + + } + + double now = get_time(); + if (check_exit_condition(now)) { + return; + } + + last_tx = now; + stats.log_op(op_queue_size[2]); + stats.log_op(op_queue_size[1]); + drive_write_machine(); + + // update events + //if (bev != NULL) { + // // no pending response (nothing to read) and output buffer empty (nothing to write) + // if ((op_queue.size() == 0) && (evbuffer_get_length(bufferevent_get_output(bev)) == 0)) { + // bufferevent_disable(bev, EV_WRITE|EV_READ); + // } + //} +} + +/** + * Callback called when write requests finish. + */ +void ConnectionMulti::write_callback() { + + //fprintf(stderr,"loaded evbuffer with ops: %u\n",op_queue.size()); +} + +/** + * Callback for timer timeouts. + */ +void ConnectionMulti::timer_callback() { + //fprintf(stderr,"timer up: %d\n",cid); + drive_write_machine(); +} + + +/* The follow are C trampolines for libevent callbacks. */ +void bev_event_cb1(struct bufferevent *bev, short events, void *ptr) { + + ConnectionMulti* conn = (ConnectionMulti*) ptr; + conn->event_callback1(events); +} + +/* The follow are C trampolines for libevent callbacks. 
*/ +void bev_event_cb2(struct bufferevent *bev, short events, void *ptr) { + + ConnectionMulti* conn = (ConnectionMulti*) ptr; + conn->event_callback2(events); +} + +void bev_read_cb1(struct bufferevent *bev, void *ptr) { + ConnectionMulti* conn = (ConnectionMulti*) ptr; + conn->read_callback1(); +} + + +void bev_read_cb2(struct bufferevent *bev, void *ptr) { + ConnectionMulti* conn = (ConnectionMulti*) ptr; + conn->read_callback2(); +} + +void bev_write_cb_m(struct bufferevent *bev, void *ptr) { +} + +void timer_cb_m(evutil_socket_t fd, short what, void *ptr) { + ConnectionMulti* conn = (ConnectionMulti*) ptr; + conn->timer_callback(); +} + diff --git a/ConnectionMultiApprox.cc b/ConnectionMultiApprox.cc new file mode 100644 index 0000000..7ee052a --- /dev/null +++ b/ConnectionMultiApprox.cc @@ -0,0 +1,1943 @@ +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + + +#include "config.h" + +#include "Connection.h" +#include "distributions.h" +#include "Generator.h" +#include "mutilate.h" +#include "binary_protocol.h" +#include "util.h" +#include +#include +#include +#include +#include +#include "blockingconcurrentqueue.h" + +//#include +//#include + +#define ITEM_L1 1 +#define ITEM_L2 2 +#define LOG_OP 4 +#define SRC_L1_M 8 +#define SRC_L1_H 16 +#define SRC_L2_M 32 +#define SRC_L2_H 64 +#define SRC_DIRECT_SET 128 +#define SRC_L1_COPY 256 +#define SRC_WB 512 + +#define ITEM_INCL 4096 +#define ITEM_EXCL 8192 +#define ITEM_DIRTY 16384 +#define ITEM_SIZE_CHANGE 131072 +#define ITEM_WAS_HIT 262144 + +#define LEVELS 2 +#define SET_INCL(incl,flags) \ + switch (incl) { \ + case 1: \ + flags |= ITEM_INCL; \ + break; \ + case 2: \ + flags |= ITEM_EXCL; \ + break; \ + \ + } \ + +#define GET_INCL(incl,flags) \ + if (flags & ITEM_INCL) incl = 1; \ + else if (flags & ITEM_EXCL) incl = 2; \ + +//#define OP_level(op) ( ((op)->flags & ITEM_L1) ? 
ITEM_L1 : ITEM_L2 ) +#define OP_level(op) ( (op)->flags & ~(LOG_OP | \ + ITEM_INCL | ITEM_EXCL | ITEM_DIRTY | \ + SRC_L1_M | SRC_L1_H | SRC_L2_M | SRC_L2_H | \ + SRC_DIRECT_SET | SRC_L1_COPY | SRC_WB ) ) + +#define FLAGS_level(flags) ( flags & ~(LOG_OP | \ + ITEM_INCL | ITEM_EXCL | ITEM_DIRTY | \ + SRC_L1_M | SRC_L1_H | SRC_L2_M | SRC_L2_H | \ + SRC_DIRECT_SET | SRC_L1_COPY | SRC_WB ) ) + +#define OP_clu(op) ( (op)->flags & ~(LOG_OP | \ + ITEM_L1 | ITEM_L2 | ITEM_DIRTY | \ + SRC_L1_M | SRC_L1_H | SRC_L2_M | SRC_L2_H | \ + SRC_DIRECT_SET | SRC_L1_COPY | SRC_WB ) ) + +#define OP_src(op) ( (op)->flags & ~(ITEM_L1 | ITEM_L2 | LOG_OP | \ + ITEM_INCL | ITEM_EXCL | ITEM_DIRTY ) ) + +#define OP_log(op) ((op)->flags & LOG_OP) +#define OP_incl(op) ((op)->flags & ITEM_INCL) +#define OP_excl(op) ((op)->flags & ITEM_EXCL) +#define OP_set_flag(op,flag) ((op))->flags |= flag; + +//#define DEBUGMC +//#define DEBUGS +//using namespace folly; +using namespace moodycamel; +//using namespace fmt; + +//struct node { +// long long addr,label; +// node *nxt; +// node(long long _addr = 0, long long _label = 0, node *_nxt = NULL) +// : addr(_addr),label(_label),nxt(_nxt) {} +//}; +// +//struct tnode { +// long long tm,offset; int size; +//};//trace file data structure +// +//long long find(long long addr) { +// int t = addr%MAXH; +// node *tmp = hash[t],*pre = NULL; +// while (tmp) { +// if (tmp->addr == addr) { +// long long tlabel = tmp->label; +// if (pre == NULL) hash[t] = tmp->nxt; +// else pre->nxt = tmp->nxt; +// delete tmp; +// return tlabel; +// } +// pre = tmp; +// tmp = tmp->nxt; +// } +// return 0; +//} +// +//void insert(long long addr ) { +// int t = addr%MAXH; +// node *tmp = new node(addr,n,hash[t]); +// hash[t] = tmp; +//} + + + +pthread_mutex_t cid_lock_m_approx = PTHREAD_MUTEX_INITIALIZER; +static uint32_t connids_m = 1; + +#define NCLASSES 40 +#define CHUNK_ALIGN_BYTES 8 +static int classes = 0; +static int sizes[NCLASSES+1]; +static int inclusives[NCLASSES+1]; + + + 
+static void init_inclusives(char *inclusive_str) { + int j = 1; + for (int i = 0; i < (int)strlen(inclusive_str); i++) { + if (inclusive_str[i] == '-') { + continue; + } else { + inclusives[j] = inclusive_str[i] - '0'; + j++; + } + } +} + +static void init_classes() { + + double factor = 1.25; + //unsigned int chunk_size = 48; + //unsigned int item_size = 24; + unsigned int size = 96; //warning if you change this you die + unsigned int i = 0; + unsigned int chunk_size_max = 1048576/2; + while (++i < NCLASSES-1) { + if (size >= chunk_size_max / factor) { + break; + } + if (size % CHUNK_ALIGN_BYTES) + size += CHUNK_ALIGN_BYTES - (size % CHUNK_ALIGN_BYTES); + sizes[i] = size; + size *= factor; + } + sizes[i] = chunk_size_max; + classes = i; + +} + +static int get_class(int vl, uint32_t kl) { + //warning if you change this you die + int vsize = vl+kl+48+1+2; + int res = 1; + while (vsize > sizes[res]) + if (res++ == classes) { + //fprintf(stderr,"item larger than max class size. vsize: %d, class size: %d\n",vsize,sizes[res]); + return -1; + } + return res; +} + +static int get_incl(int vl, int kl) { + int clsid = get_class(vl,kl); + if (clsid) { + return inclusives[clsid]; + } else { + return -1; + } +} + +void ConnectionMultiApprox::output_op(Operation *op, int type, bool found) { + char output[1024]; + char k[256]; + char a[256]; + char s[256]; + memset(k,0,256); + memset(a,0,256); + memset(s,0,256); + strncpy(k,op->key,255); + switch (type) { + case 0: //get + sprintf(a,"issue_get"); + break; + case 1: //set + sprintf(a,"issue_set"); + break; + case 2: //resp + sprintf(a,"resp"); + break; + } + switch(read_state) { + case INIT_READ: + sprintf(s,"init"); + break; + case CONN_SETUP: + sprintf(s,"setup"); + break; + case LOADING: + sprintf(s,"load"); + break; + case IDLE: + sprintf(s,"idle"); + break; + case WAITING_FOR_GET: + sprintf(s,"waiting for get"); + break; + case WAITING_FOR_SET: + sprintf(s,"waiting for set"); + break; + case WAITING_FOR_DELETE: + 
sprintf(s,"waiting for del"); + break; + case MAX_READ_STATE: + sprintf(s,"max"); + break; + } + if (type == 2) { + sprintf(output,"conn: %u, action: %s op: %s, opaque: %u, found: %d, type: %d\n",cid,a,k,op->opaque,found,op->type); + } else { + sprintf(output,"conn: %u, action: %s op: %s, opaque: %u, type: %d\n",cid,a,k,op->opaque,op->type); + } + write(2,output,strlen(output)); +} + +//extern USPMCQueue g_trace_queue; +//static vector cid_rate; +//extern ConcurrentHashMap cid_rate; +extern unordered_map cid_rate; +//extern ConcurrentHashMap> copy_keys; +extern unordered_map> copy_keys; +extern unordered_map touch_keys; +extern unordered_map> wb_keys; +//extern ConcurrentHashMap> wb_keys; + +extern map g_key_hist; +extern int max_n[3]; + +/** + * Create a new connection to a server endpoint. + */ +ConnectionMultiApprox::ConnectionMultiApprox(struct event_base* _base, struct evdns_base* _evdns, + string _hostname1, string _hostname2, string _port, options_t _options, + bool sampling, int fd1, int fd2 ) : + start_time(0), stats(sampling), options(_options), + hostname1(_hostname1), hostname2(_hostname2), port(_port), base(_base), evdns(_evdns) +{ + pthread_mutex_lock(&cid_lock_m_approx); + cid = connids_m++; + if (cid == 1) { + init_classes(); + init_inclusives(options.inclusives); + } + cid_rate.insert( { cid, 0 } ); + + pthread_mutex_unlock(&cid_lock_m_approx); + + valuesize = createGenerator(options.valuesize); + keysize = createGenerator(options.keysize); + srand(time(NULL)); + keygen = new KeyGenerator(keysize, options.records); + + total = 0; + eof = 0; + o_percent = 0; + + if (options.lambda <= 0) { + iagen = createGenerator("0"); + } else { + D("iagen = createGenerator(%s)", options.ia); + iagen = createGenerator(options.ia); + iagen->set_lambda(options.lambda); + } + + read_state = IDLE; + write_state = INIT_WRITE; + last_quiet1 = false; + last_quiet2 = false; + + last_tx = last_rx = 0.0; + gets = 0; + ghits = 0; + esets = 0; + isets = 0; + gloc = rand() % 
(10*2-1)+1; + sloc = rand() % (10*2-1)+1; + iloc = rand() % (10*2-1)+1; + + op_queue_size = (uint32_t*)malloc(sizeof(uint32_t)*(LEVELS+1)); + opaque = (uint32_t*)malloc(sizeof(uint32_t)*(LEVELS+1)); + op_queue = (Operation***)malloc(sizeof(Operation**)*(LEVELS+1)); + + for (int i = 0; i <= LEVELS; i++) { + op_queue_size[i] = 0; + opaque[i] = 1; + //op_queue[i] = (Operation*)malloc(sizeof(int)*OPAQUE_MAX); + op_queue[i] = (Operation**)malloc(sizeof(Operation*)*(OPAQUE_MAX+1)); + for (int j = 0; j <= OPAQUE_MAX; j++) { + op_queue[i][j] = NULL; + } + + } + + + bev1 = bufferevent_socket_new(base, fd1, BEV_OPT_CLOSE_ON_FREE); + bufferevent_setcb(bev1, bev_read_cb1_approx, bev_write_cb_m_approx, bev_event_cb1_approx, this); + bufferevent_enable(bev1, EV_READ | EV_WRITE); + + bev2 = bufferevent_socket_new(base, fd2, BEV_OPT_CLOSE_ON_FREE); + bufferevent_setcb(bev2, bev_read_cb2_approx, bev_write_cb_m_approx, bev_event_cb2_approx, this); + bufferevent_enable(bev2, EV_READ | EV_WRITE); + + timer = evtimer_new(base, timer_cb_m_approx, this); + + read_state = IDLE; +} + + +void ConnectionMultiApprox::set_queue(queue* a_trace_queue) { + trace_queue = a_trace_queue; + trace_queue_n = a_trace_queue->size(); +} + +void ConnectionMultiApprox::set_lock(pthread_mutex_t* a_lock) { + lock = a_lock; +} + +void ConnectionMultiApprox::set_g_wbkeys(unordered_map> *a_wb_keys) { + g_wb_keys = a_wb_keys; +} + +uint32_t ConnectionMultiApprox::get_cid() { + return cid; +} + +int ConnectionMultiApprox::add_to_wb_keys(string key) { + auto pos = wb_keys.find(key); + if (pos == wb_keys.end()) { + wb_keys.insert( {key, vector() }); + return 1; + } + return 2; +} + +int ConnectionMultiApprox::add_to_copy_keys(string key) { + auto pos = copy_keys.find(key); + if (pos == copy_keys.end()) { + copy_keys.insert( {key, vector() }); + return 1; + } + return 2; +} + + +void ConnectionMultiApprox::del_copy_keys(string key) { + + auto position = copy_keys.find(key); + if (position != copy_keys.end()) { + 
vector op_list = vector(position->second); + copy_keys.erase(position); + for (auto it = op_list.begin(); it != op_list.end(); ++it) { + issue_op(*it); + } + } else { + fprintf(stderr,"expected %s, got nuthin\n",key.c_str()); + } +} + +int ConnectionMultiApprox::add_to_touch_keys(string key) { + //return touch_keys.assign_if_equal( key, NULL, cid ) != NULL ? 1 : 2; + auto pos = touch_keys.find(key); + if (pos == touch_keys.end()) { + touch_keys.insert( {key, cid }); + return 1; + } + return 2; +} + + +void ConnectionMultiApprox::del_touch_keys(string key) { + //touch_keys.erase(key); + auto position = touch_keys.find(key); + if (position != touch_keys.end()) { + touch_keys.erase(position); + } else { + fprintf(stderr,"expected %s, got nuthin\n",key.c_str()); + } +} + +int ConnectionMultiApprox::issue_op(Operation *Op) { + double now = get_time(); + int issued = 0; + int incl = get_incl(Op->valuelen,strlen(Op->key)); + int cid = get_class(Op->valuelen,strlen(Op->key)); + Op->clsid = cid; + int flags = 0; + int index = lrand48() % (1024 * 1024); + //int touch = 1; + SET_INCL(incl,flags); + + switch(Op->type) + { + case Operation::GET: + //if (nissued < options.depth-1) { + // issued = issue_get_with_len(key, vl, now, false, 1, flags, 0, 1); + // last_quiet1 = false; + //} else { + //} + issued = issue_get_with_len(Op, now, false, flags | LOG_OP | ITEM_L1); + last_quiet1 = false; + this->stats.gets++; + gets++; + this->stats.gets_cid[cid]++; + + break; + case Operation::SET: + if (last_quiet1) { + issue_noop(now,1); + } + if (incl == 1) { + if (isets >= iloc) { + //if (1) { + const char *data = &random_char[index]; + issued = issue_set(Op, data, now, flags | LOG_OP | ITEM_L1 | SRC_DIRECT_SET); + //int ret = add_to_touch_keys(string(Op->key)); + //if (ret == 1) { + issue_touch(Op->key,Op->valuelen,now, ITEM_L2 | SRC_DIRECT_SET); + //} + iloc += rand()%(10*2-1)+1; + } else { + issued = issue_set(Op, &random_char[index], now, flags | LOG_OP | ITEM_L1 | SRC_DIRECT_SET | 
ITEM_DIRTY); + } + isets++; + } else if (incl == 2) { + issued = issue_set(Op, &random_char[index], now, flags | LOG_OP | ITEM_L1 | SRC_DIRECT_SET); + if (esets >= sloc) { + issue_delete(Op->key,now,ITEM_L2 | SRC_DIRECT_SET); + sloc += rand()%(10*2-1)+1; + } + esets++; + } + last_quiet1 = false; + this->stats.sets++; + this->stats.sets_cid[cid]++; + break; + case Operation::DELETE: + case Operation::TOUCH: + case Operation::NOOP: + case Operation::SASL: + fprintf(stderr,"invalid line: %s, vl: %d\n",Op->key,Op->valuelen); + break; + + } + return issued; +} + +void ConnectionMultiApprox::del_wb_keys(string key) { + + auto position = wb_keys.find(key); + if (position != wb_keys.end()) { + vector op_list = vector(position->second); + wb_keys.erase(position); + for (auto it = op_list.begin(); it != op_list.end(); ++it) { + issue_op(*it); + } + } else { + fprintf(stderr,"expected %s, got nuthin\n",key.c_str()); + } +} + + +int ConnectionMultiApprox::do_connect() { + + int connected = 0; + if (options.unix_socket) { + + + struct sockaddr_un sin1; + memset(&sin1, 0, sizeof(sin1)); + sin1.sun_family = AF_LOCAL; + strcpy(sin1.sun_path, hostname1.c_str()); + + int addrlen; + addrlen = sizeof(sin1); + + int err = bufferevent_socket_connect(bev1, (struct sockaddr*)&sin1, addrlen); + if (err == 0) { + connected = 1; + } else { + connected = 0; + err = errno; + fprintf(stderr,"l1 error %s\n",strerror(err)); + } + + struct sockaddr_un sin2; + memset(&sin2, 0, sizeof(sin2)); + sin2.sun_family = AF_LOCAL; + strcpy(sin2.sun_path, hostname2.c_str()); + + addrlen = sizeof(sin2); + err = bufferevent_socket_connect(bev2, (struct sockaddr*)&sin2, addrlen); + if (err == 0) { + connected = 1; + } else { + connected = 0; + err = errno; + fprintf(stderr,"l2 error %s\n",strerror(err)); + } + } + read_state = IDLE; + return connected; +} + +/** + * Destroy a connection, performing cleanup. 
+ */ +ConnectionMultiApprox::~ConnectionMultiApprox() { + + + for (int i = 0; i <= LEVELS; i++) { + free(op_queue[i]); + + } + + free(op_queue_size); + free(opaque); + free(op_queue); + //event_free(timer); + //timer = NULL; + // FIXME: W("Drain op_q?"); + //bufferevent_free(bev1); + //bufferevent_free(bev2); + + delete iagen; + delete keygen; + delete keysize; + delete valuesize; +} + +/** + * Reset the connection back to an initial, fresh state. + */ +void ConnectionMultiApprox::reset() { + // FIXME: Actually check the connection, drain all bufferevents, drain op_q. + //assert(op_queue.size() == 0); + //evtimer_del(timer); + read_state = IDLE; + write_state = INIT_WRITE; + stats = ConnectionStats(stats.sampling); +} + +/** + * Set our event processing priority. + */ +void ConnectionMultiApprox::set_priority(int pri) { + if (bufferevent_priority_set(bev1, pri)) { + DIE("bufferevent_set_priority(bev, %d) failed", pri); + } +} + + + +/** + * Get/Set or Set Style + * If a GET command: Issue a get first, if not found then set + * If trace file (or prob. 
write) says to set, then set it + */ +int ConnectionMultiApprox::issue_getsetorset(double now) { + + + + int ret = 0; + int nissued = 0; + + //while (nissued < 1) { + + //pthread_mutex_lock(lock); + //if (!trace_queue->empty()) { + + /* check if in global wb queue */ + //double percent = (double)total/((double)trace_queue_n) * 100; + //if (percent > o_percent+2) { + // //update the percentage table and see if we should execute + // if (options.ratelimit) { + // double min_percent = 1000; + // auto it = cid_rate.begin(); + // while (it != cid_rate.end()) { + // if (it->second < min_percent) { + // min_percent = it->second; + // } + // ++it; + // } + + // if (percent > min_percent+2) { + // struct timeval tv; + // tv.tv_sec = 0; + // tv.tv_usec = 100; + // int good = 0; + // if (!event_pending(timer, EV_TIMEOUT, NULL)) { + // good = evtimer_add(timer, &tv); + // } + // if (good != 0) { + // fprintf(stderr,"eventimer is messed up!\n"); + // return 2; + // } + // return 1; + // } + // } + // cid_rate.insert( {cid, percent}); + // fprintf(stderr,"%f,%d,%.4f\n",now,cid,percent); + // o_percent = percent; + //} + // + + Operation *Op = trace_queue->front(); + //Operation *Op = g_trace_queue.dequeue(); + + if (Op == NULL || trace_queue->size() <= 0 || Op->type == Operation::SASL) { + eof = 1; + cid_rate.insert( {cid, 100 } ); + fprintf(stderr,"cid %d done\n",cid); + string op_queue1; + string op_queue2; + for (int j = 0; j < 2; j++) { + for (int i = 0; i < OPAQUE_MAX; i++) { + if (op_queue[j+1][i] != NULL) { + if (j == 0) { + op_queue1 = op_queue1 + "," + op_queue[j+1][i]->key; + } else { + op_queue2 = op_queue2 + "," + op_queue[j+1][i]->key; + } + } + } + } + fprintf(stderr,"cid %d op_queue1: %s op_queue2: %s, op_queue_size1: %d, op_queue_size2: %d\n",cid,op_queue1.c_str(),op_queue2.c_str(),op_queue_size[1],op_queue_size[2]); + return 1; + } + + trace_queue->pop(); + + + //trace_queue->pop(); + + //pthread_mutex_lock(lock); + //auto check = wb_keys.find(string(Op->key)); 
+ //if (check != wb_keys.end()) { + // check->second.push_back(Op); + // return 0; + //} + //pthread_mutex_unlock(lock); + //pthread_mutex_unlock(lock); + //struct timeval tv; + //double delay; + //delay = last_rx + 0.00025 - now; + //double_to_tv(delay,&tv); + //int good = 0; + ////if (!event_pending(timer, EV_TIMEOUT, NULL)) { + //good = evtimer_add(timer, &tv); + ////} + //if (good != 0) { + // fprintf(stderr,"eventimer is messed up in checking for key: %s\n",Op->key); + // return 2; + //} + //return 1; + //} else { + //pthread_mutex_unlock(lock); + int issued = issue_op(Op); + if (issued) { + nissued++; + total++; + } else { + fprintf(stderr,"failed to issue line: %s, vl: %d\n",Op->key,Op->valuelen); + } + //} + + //} else { + // return 1; + //} + //} + //if (last_quiet1) { + // issue_noop(now,1); + // last_quiet1 = false; + //} + + return ret; + +} + +/** + * Issue a get request to the server. + */ +int ConnectionMultiApprox::issue_get_with_len(Operation *pop, double now, bool quiet, uint32_t flags, Operation *l1) { + + //check if op is in copy_keys (currently going to L1) + //auto check = copy_keys.find(string(pop->key)); + //if (check != copy_keys.end()) { + // check->second.push_back(pop); + // return 1; + //} + + struct evbuffer *output = NULL; + int level = 0; + switch (FLAGS_level(flags)) { + case 1: + level = 1; + output = bufferevent_get_output(bev1); + break; + case 2: + level = 2; + output = bufferevent_get_output(bev2); + break; + } + +#if HAVE_CLOCK_GETTIME + pop->start_time = get_time_accurate(); +#else + if (now == 0.0) { +#if USE_CACHED_TIME + struct timeval now_tv; + event_base_gettimeofday_cached(base, &now_tv); + pop->start_time = tv_to_double(&now_tv); +#else + pop->start_time = get_time(); +#endif + } else { + pop->start_time = now; + } +#endif + + pop->opaque = opaque[level]++; + pop->flags = flags; + if (l1 != NULL) { + pop->l1 = l1; + } + + op_queue[level][pop->opaque] = pop; + //op_queue[level].push(op); + op_queue_size[level]++; 
+#ifdef DEBUGS + fprintf(stderr,"cid: %d issing get: %s, size: %u, level %d, flags: %d, opaque: %d\n",cid,pop->key,pop->valuelen,level,flags,pop->opaque); +#endif + + if (opaque[level] > OPAQUE_MAX) { + opaque[level] = 1; + } + + //if (read_state == IDLE) read_state = WAITING_FOR_GET; + uint16_t keylen = strlen(pop->key); + + + // each line is 4-bytes + binary_header_t h = { 0x80, CMD_GET, htons(keylen), + 0x00, 0x00, htons(0), + htonl(keylen) }; + if (quiet) { + h.opcode = CMD_GETQ; + } + h.opaque = htonl(pop->opaque); + + + evbuffer_add(output, &h, 24); + evbuffer_add(output, pop->key, keylen); + + stats.tx_bytes += 24 + keylen; + return 1; +} + +/** + * Issue a get request to the server. + */ +int ConnectionMultiApprox::issue_get_with_len(const char* key, int valuelen, double now, bool quiet, uint32_t flags, Operation *l1) { + + struct evbuffer *output = NULL; + int level = 0; + switch (FLAGS_level(flags)) { + case 1: + level = 1; + output = bufferevent_get_output(bev1); + break; + case 2: + level = 2; + output = bufferevent_get_output(bev2); + break; + } + //Operation op; + Operation *pop = new Operation(); + +#if HAVE_CLOCK_GETTIME + pop->start_time = get_time_accurate(); +#else + if (now == 0.0) { +#if USE_CACHED_TIME + struct timeval now_tv; + event_base_gettimeofday_cached(base, &now_tv); + pop->start_time = tv_to_double(&now_tv); +#else + pop->start_time = get_time(); +#endif + } else { + pop->start_time = now; + } +#endif + + strncpy(pop->key,key,255); + pop->valuelen = valuelen; + pop->type = Operation::GET; + pop->opaque = opaque[level]++; + pop->flags = flags; + pop->clsid = get_class(valuelen,strlen(key)); + if (l1 != NULL) { + pop->l1 = l1; + } + op_queue[level][pop->opaque] = pop; + //op_queue[level].push(op); + op_queue_size[level]++; + +#ifdef DEBUGS + fprintf(stderr,"cid: %d issing get: %s, size: %u, level %d, flags: %d, opaque: %d\n",cid,key,valuelen,level,flags,pop->opaque); +#endif + + if (opaque[level] > OPAQUE_MAX) { + opaque[level] = 1; + } 
+ + //if (read_state == IDLE) read_state = WAITING_FOR_GET; + uint16_t keylen = strlen(key); + + // each line is 4-bytes + binary_header_t h = { 0x80, CMD_GET, htons(keylen), + 0x00, 0x00, htons(0), + htonl(keylen) }; + if (quiet) { + h.opcode = CMD_GETQ; + } + h.opaque = htonl(pop->opaque); + + evbuffer_add(output, &h, 24); + evbuffer_add(output, key, keylen); + + stats.tx_bytes += 24 + keylen; + return 1; +} + +/** + * Issue a get request to the server. + */ +int ConnectionMultiApprox::issue_touch(const char* key, int valuelen, double now, int flags) { + struct evbuffer *output = NULL; + int level = 0; + switch (FLAGS_level(flags)) { + case 1: + level = 1; + output = bufferevent_get_output(bev1); + break; + case 2: + level = 2; + output = bufferevent_get_output(bev2); + break; + } + Operation *pop = new Operation(); + +#if HAVE_CLOCK_GETTIME + pop->start_time = get_time_accurate(); +#else + if (now == 0.0) { +#if USE_CACHED_TIME + struct timeval now_tv; + event_base_gettimeofday_cached(base, &now_tv); + pop->start_time = tv_to_double(&now_tv); +#else + pop->start_time = get_time(); +#endif + } else { + pop->start_time = now; + } +#endif + + strncpy(pop->key,key,255); + pop->valuelen = valuelen; + pop->type = Operation::TOUCH; + pop->opaque = opaque[level]++; + op_queue[level][pop->opaque] = pop; + op_queue_size[level]++; + + pop->flags = flags; + + if (opaque[level] > OPAQUE_MAX) { + opaque[level] = 1; + } + +#ifdef DEBUGS + fprintf(stderr,"issing touch: %s, size: %u, level %d, flags: %d, opaque: %d\n",key,valuelen,level,flags,pop->opaque); +#endif + //if (read_state == IDLE) read_state = WAITING_FOR_GET; + uint16_t keylen = strlen(key); + + // each line is 4-bytes + binary_header_t h = { 0x80, CMD_TOUCH, htons(keylen), + 0x04, 0x00, htons(0), + htonl(keylen + 4) }; + h.opaque = htonl(pop->opaque); + + uint32_t exp = 0; + if (flags & ITEM_DIRTY) { + exp = htonl(flags); + } + evbuffer_add(output, &h, 24); + evbuffer_add(output, &exp, 4); + evbuffer_add(output, 
key, keylen); + + + stats.tx_bytes += 24 + keylen; + + //stats.log_access(op); + return 1; +} + +/** + * Issue a delete request to the server. + */ +int ConnectionMultiApprox::issue_delete(const char* key, double now, uint32_t flags) { + struct evbuffer *output = NULL; + int level = 0; + switch (FLAGS_level(flags)) { + case 1: + level = 1; + output = bufferevent_get_output(bev1); + break; + case 2: + level = 2; + output = bufferevent_get_output(bev2); + break; + } + //Operation op; + Operation *pop = new Operation(); + +#if HAVE_CLOCK_GETTIME + pop->start_time = get_time_accurate(); +#else + if (now == 0.0) { +#if USE_CACHED_TIME + struct timeval now_tv; + event_base_gettimeofday_cached(base, &now_tv); + pop->start_time = tv_to_double(&now_tv); +#else + pop->start_time = get_time(); +#endif + } else { + pop->start_time = now; + } +#endif + + strncpy(pop->key,key,255); + pop->type = Operation::DELETE; + pop->opaque = opaque[level]++; + pop->flags = flags; + op_queue[level][pop->opaque] = pop; + //op_queue[level].push(op); + op_queue_size[level]++; + + if (opaque[level] > OPAQUE_MAX) { + opaque[level] = 1; + } +#ifdef DEBUGS + fprintf(stderr,"cid: %d issing delete: %s, level %d, flags: %d, opaque: %d\n",cid,key,level,flags,pop->opaque); +#endif + + //if (read_state == IDLE) read_state = WAITING_FOR_GET; + uint16_t keylen = strlen(key); + + // each line is 4-bytes + binary_header_t h = { 0x80, CMD_DELETE, htons(keylen), + 0x00, 0x00, htons(0), + htonl(keylen) }; + h.opaque = htonl(pop->opaque); + + evbuffer_add(output, &h, 24); + evbuffer_add(output, key, keylen); + + stats.tx_bytes += 24 + keylen; + + //stats.log_access(op); + return 1; +} + +void ConnectionMultiApprox::issue_noop(double now, int level) { + struct evbuffer *output = NULL; + switch (level) { + case 1: + output = bufferevent_get_output(bev1); + break; + case 2: + output = bufferevent_get_output(bev2); + break; + } + Operation op; + + if (now == 0.0) op.start_time = get_time(); + else op.start_time = 
now; + + binary_header_t h = { 0x80, CMD_NOOP, 0x0000, + 0x00, 0x00, htons(0), + 0x00 }; + + evbuffer_add(output, &h, 24); + +} + +/** + * Issue a set request to the server. + */ +int ConnectionMultiApprox::issue_set(Operation *pop, const char* value, double now, uint32_t flags) { + + //check if op is in copy_keys (currently going to L1) + //auto check = copy_keys.find(string(pop->key)); + //if (check != copy_keys.end()) { + // check->second.push_back(pop); + // return 1; + //} + + struct evbuffer *output = NULL; + int level = 0; + int length = pop->valuelen; + switch (FLAGS_level(flags)) { + case 1: + level = 1; + output = bufferevent_get_output(bev1); + break; + case 2: + level = 2; + output = bufferevent_get_output(bev2); + break; + } + +#if HAVE_CLOCK_GETTIME + pop->start_time = get_time_accurate(); +#else + if (now == 0.0) pop->start_time = get_time(); + else pop->start_time = now; +#endif + + pop->opaque = opaque[level]++; + pop->flags = flags; + op_queue[level][pop->opaque] = pop; + //op_queue[level].push(op); + op_queue_size[level]++; +#ifdef DEBUGS + fprintf(stderr,"cid: %d issing set: %s, size: %u, level %d, flags: %d, opaque: %d\n",cid,pop->key,length,level,flags,pop->opaque); +#endif + + if (opaque[level] > OPAQUE_MAX) { + opaque[level] = 1; + } + + uint16_t keylen = strlen(pop->key); + + + // each line is 4-bytes + binary_header_t h = { 0x80, CMD_SET, htons(keylen), + 0x08, 0x00, htons(0), + htonl(keylen + 8 + length) }; + h.opaque = htonl(pop->opaque); + + uint32_t f = htonl(flags); + uint32_t exp = 0; + + evbuffer_add(output, &h, 24); + evbuffer_add(output, &f, 4); + evbuffer_add(output, &exp, 4); + evbuffer_add(output, pop->key, keylen); + evbuffer_add(output, value, length); + + stats.tx_bytes += length + 32 + keylen; + return 1; +} + +/** + * Issue a set request to the server. 
+ */ +int ConnectionMultiApprox::issue_set(const char* key, const char* value, int length, double now, uint32_t flags) { + + struct evbuffer *output = NULL; + int level = 0; + switch (FLAGS_level(flags)) { + case 1: + level = 1; + output = bufferevent_get_output(bev1); + break; + case 2: + level = 2; + output = bufferevent_get_output(bev2); + break; + } + //Operation op; + Operation *pop = new Operation(); + +#if HAVE_CLOCK_GETTIME + pop->start_time = get_time_accurate(); +#else + if (now == 0.0) pop->start_time = get_time(); + else pop->start_time = now; +#endif + + strncpy(pop->key,key,255); + pop->valuelen = length; + pop->type = Operation::SET; + pop->opaque = opaque[level]++; + pop->flags = flags; + pop->clsid = get_class(length,strlen(key)); + op_queue[level][pop->opaque] = pop; + //op_queue[level].push(op); + op_queue_size[level]++; +#ifdef DEBUGS + fprintf(stderr,"cid: %d issing set: %s, size: %u, level %d, flags: %d, opaque: %d\n",cid,key,length,level,flags,pop->opaque); +#endif + + if (opaque[level] > OPAQUE_MAX) { + opaque[level] = 1; + } + + uint16_t keylen = strlen(key); + + // each line is 4-bytes + binary_header_t h = { 0x80, CMD_SET, htons(keylen), + 0x08, 0x00, htons(0), + htonl(keylen + 8 + length) }; + h.opaque = htonl(pop->opaque); + + uint32_t f = htonl(flags); + uint32_t exp = 0; + + evbuffer_add(output, &h, 24); + evbuffer_add(output, &f, 4); + evbuffer_add(output, &exp, 4); + evbuffer_add(output, key, keylen); + evbuffer_add(output, value, length); + + stats.tx_bytes += length + 32 + keylen; + return 1; +} + +/** + * Return the oldest live operation in progress. + */ +void ConnectionMultiApprox::pop_op(Operation *op) { + + uint8_t level = OP_level(op); + //op_queue[level].erase(op); + op_queue_size[level]--; + + + if (read_state == LOADING) return; + read_state = IDLE; + + // Advance the read state machine. 
+ //if (op_queue.size() > 0) { + // Operation& op = op_queue.front(); + // switch (op.type) { + // case Operation::GET: read_state = WAITING_FOR_GET; break; + // case Operation::SET: read_state = WAITING_FOR_SET; break; + // case Operation::DELETE: read_state = WAITING_FOR_DELETE; break; + // default: DIE("Not implemented."); + // } + //} +} + +/** + * Finish up (record stats) an operation that just returned from the + * server. + */ +void ConnectionMultiApprox::finish_op(Operation *op, int was_hit) { + double now; +#if USE_CACHED_TIME + struct timeval now_tv; + event_base_gettimeofday_cached(base, &now_tv); + now = tv_to_double(&now_tv); +#else + now = get_time(); +#endif +#if HAVE_CLOCK_GETTIME + op->end_time = get_time_accurate(); +#else + op->end_time = now; +#endif + + if (was_hit) { + switch (op->type) { + case Operation::GET: + switch (OP_level(op)) { + case 1: + stats.log_get_l1(*op); + break; + case 2: + stats.log_get_l2(*op); + if (op->l1 != NULL) { + op->l1->end_time = now; + stats.log_get(*(op->l1)); + } + break; + } + break; + case Operation::SET: + switch (OP_level(op)) { + case 1: + stats.log_set_l1(*op); + break; + case 2: + stats.log_set_l2(*op); + break; + } + break; + case Operation::DELETE: break; + case Operation::TOUCH: break; + default: DIE("Not implemented."); + } + } + //} else { + // switch (op->type) { + // case Operation::GET: + // if (OP_log(op)) { + // switch (OP_level(op)) { + // case 1: + // stats.log_get_l1(*op); + // break; + // case 2: + // stats.log_get_l2(*op); + // if (op->l1 != NULL) { + // op->l1->end_time = now; + // stats.log_get(*(op->l1)); + // } + // break; + // } + // } + // break; + // case Operation::SET: + // if (OP_log(op)) { + // switch (OP_level(op)) { + // case 1: + // stats.log_set_l1(*op); + // break; + // case 2: + // stats.log_set_l2(*op); + // break; + // } + // } + // break; + // case Operation::DELETE: break; + // case Operation::TOUCH: break; + // default: DIE("Not implemented."); + // } + //} + + last_rx 
= now; + uint8_t level = OP_level(op); + if (op->l1 != NULL) { + //delete op_queue[1][op->l1->opaque]; + op_queue[1][op->l1->opaque] = 0; + op_queue_size[1]--; + delete op->l1; + } + //op_queue[level].erase(op_queue[level].begin()+opopq); + if (op == op_queue[level][op->opaque] && + op->opaque == op_queue[level][op->opaque]->opaque) { + //delete op_queue[level][op->opaque]; + op_queue[level][op->opaque] = 0; + delete op; + } else { + fprintf(stderr,"op_queue out of sync! Expected %p, got %p, opa1: %d opaq2: %d\n", + op,op_queue[level][op->opaque],op->opaque,op_queue[level][op->opaque]->opaque); + } + op_queue_size[level]--; + read_state = IDLE; + + +} + + + +/** + * Check if our testing is done and we should exit. + */ +bool ConnectionMultiApprox::check_exit_condition(double now) { + if (eof && op_queue_size[1] == 0 && op_queue_size[2] == 0) { + return true; + } + if (read_state == INIT_READ) return false; + + return false; +} + +/** + * Handle new connection and error events. + */ +void ConnectionMultiApprox::event_callback1(short events) { + if (events & BEV_EVENT_CONNECTED) { + D("Connected to %s:%s.", hostname1.c_str(), port.c_str()); + int fd = bufferevent_getfd(bev1); + if (fd < 0) DIE("bufferevent_getfd"); + + if (!options.no_nodelay && !options.unix_socket) { + int one = 1; + if (setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, + (void *) &one, sizeof(one)) < 0) + DIE("setsockopt()"); + } +#ifdef DEBUGMC + fprintf(stderr,"libevent connected %s, fd: %u\n",hostname1.c_str(),bufferevent_getfd(bev1)); +#endif + + + } else if (events & BEV_EVENT_ERROR) { + int err = bufferevent_socket_get_dns_error(bev1); + //if (err) DIE("DNS error: %s", evutil_gai_strerror(err)); + if (err) fprintf(stderr,"DNS error: %s", evutil_gai_strerror(err)); + fprintf(stderr,"CID: %d - Got an error: %s\n",this->cid, + evutil_socket_error_to_string(EVUTIL_SOCKET_ERROR())); + + //DIE("BEV_EVENT_ERROR: %s", strerror(errno)); + + } else if (events & BEV_EVENT_EOF) { + fprintf(stderr,"Unexpected EOF 
from server."); + return; + } +} + +/** + * Handle new connection and error events. + */ +void ConnectionMultiApprox::event_callback2(short events) { + if (events & BEV_EVENT_CONNECTED) { + D("Connected to %s:%s.", hostname2.c_str(), port.c_str()); + int fd = bufferevent_getfd(bev2); + if (fd < 0) DIE("bufferevent_getfd"); + + if (!options.no_nodelay && !options.unix_socket) { + int one = 1; + if (setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, + (void *) &one, sizeof(one)) < 0) + DIE("setsockopt()"); + } +#ifdef DEBUGMC + fprintf(stderr,"libevent connected %s, fd: %u\n",hostname2.c_str(),bufferevent_getfd(bev2)); +#endif + + + } else if (events & BEV_EVENT_ERROR) { + int err = bufferevent_socket_get_dns_error(bev2); + //if (err) DIE("DNS error: %s", evutil_gai_strerror(err)); + if (err) fprintf(stderr,"DNS error: %s", evutil_gai_strerror(err)); + fprintf(stderr,"CID: %d - Got an error: %s\n",this->cid, + evutil_socket_error_to_string(EVUTIL_SOCKET_ERROR())); + + //DIE("BEV_EVENT_ERROR: %s", strerror(errno)); + + + } else if (events & BEV_EVENT_EOF) { + fprintf(stderr,"Unexpected EOF from server."); + return; + } +} + +/** + * Request generation loop. Determines whether or not to issue a new command, + * based on timer events. + * + * Note that this function loops. Be wary of break vs. return. 
+ */ +void ConnectionMultiApprox::drive_write_machine(double now) { + + if (now == 0.0) now = get_time(); + double delay; + struct timeval tv; + + if (check_exit_condition(now)) { + return; + } + + while (1) { + switch (write_state) { + case INIT_WRITE: + delay = iagen->generate(); + next_time = now + delay; + double_to_tv(delay, &tv); + evtimer_add(timer, &tv); + write_state = ISSUING; + break; + + case ISSUING: + if ( (op_queue_size[1] >= (size_t) options.depth) || + (op_queue_size[2] >= (size_t) options.depth) ) { + write_state = WAITING_FOR_OPQ; + break; + } + + if (options.getsetorset) { + int ret = issue_getsetorset(now); + if (ret == 1) return; //if at EOF + } + + last_tx = now; + for (int i = 1; i <= 2; i++) { + stats.log_op(op_queue_size[i]); + } + break; + + case WAITING_FOR_TIME: + write_state = ISSUING; + break; + + case WAITING_FOR_OPQ: + if ( (op_queue_size[1] >= (size_t) options.depth) || + (op_queue_size[2] >= (size_t) options.depth) ) { + //double delay = 0.01; + //struct timeval tv; + //double_to_tv(delay, &tv); + //evtimer_add(timer, &tv); + return; + } else { + write_state = ISSUING; + break; + } + + default: DIE("Not implemented"); + } + } +} + + + +/** + * Tries to consume a binary response (in its entirety) from an evbuffer. + * + * @param input evBuffer to read response from + * @return true if consumed, false if not enough data in buffer. 
+ */ +static bool handle_response(ConnectionMultiApprox *conn, evbuffer *input, bool &done, bool &found, int &opcode, uint32_t &opaque, evicted_t *evict, int level) { + // Read the first 24 bytes as a header + int length = evbuffer_get_length(input); + if (length < 24) return false; + binary_header_t* h = + reinterpret_cast(evbuffer_pullup(input, 24)); + //assert(h); + + uint32_t bl = ntohl(h->body_len); + uint16_t kl = ntohs(h->key_len); + uint8_t el = h->extra_len; + // Not whole response + int targetLen = 24 + bl; + if (length < targetLen) { + return false; + } + + opcode = h->opcode; + opaque = ntohl(h->opaque); + uint16_t status = ntohs(h->status); +#ifdef DEBUGMC + fprintf(stderr,"cid: %d handle resp from l%d - opcode: %u opaque: %u keylen: %u extralen: %u datalen: %u status: %u\n",conn->get_cid(),level, + h->opcode,ntohl(h->opaque),ntohs(h->key_len),h->extra_len, + ntohl(h->body_len),ntohs(h->status)); +#endif + + + // If something other than success, count it as a miss + if (opcode == CMD_GET && status == RESP_NOT_FOUND) { + switch(level) { + case 1: + conn->stats.get_misses_l1++; + break; + case 2: + conn->stats.get_misses_l2++; + conn->stats.get_misses++; + conn->stats.window_get_misses++; + break; + + } + found = false; + evbuffer_drain(input, targetLen); + + } else if (opcode == CMD_SET && kl > 0) { + //first data is extras: clsid, flags, eflags + if (evict) { + evbuffer_drain(input,24); + unsigned char *buf = evbuffer_pullup(input,bl); + + + evict->clsid = *((uint32_t*)buf); + evict->clsid = ntohl(evict->clsid); + buf += 4; + + evict->serverFlags = *((uint32_t*)buf); + evict->serverFlags = ntohl(evict->serverFlags); + buf += 4; + + evict->evictedFlags = *((uint32_t*)buf); + evict->evictedFlags = ntohl(evict->evictedFlags); + buf += 4; + + + evict->evictedKeyLen = kl; + evict->evictedKey = (char*)malloc(kl+1); + memset(evict->evictedKey,0,kl+1); + memcpy(evict->evictedKey,buf,kl); + buf += kl; + + + evict->evictedLen = bl - kl - el; + evict->evictedData 
= (char*)malloc(evict->evictedLen); + memcpy(evict->evictedData,buf,evict->evictedLen); + evict->evicted = true; + //fprintf(stderr,"class: %u, serverFlags: %u, evictedFlags: %u\n",evict->clsid,evict->serverFlags,evict->evictedFlags); + evbuffer_drain(input,bl); + } else { + evbuffer_drain(input, targetLen); + } + } else if (opcode == CMD_TOUCH && status == RESP_NOT_FOUND) { + found = false; + evbuffer_drain(input, targetLen); + } else if (opcode == CMD_DELETE && status == RESP_NOT_FOUND) { + found = false; + evbuffer_drain(input, targetLen); + } else { + evbuffer_drain(input, targetLen); + } + + conn->stats.rx_bytes += targetLen; + done = true; + return true; +} + +/** + * Handle incoming data (responses). + */ +void ConnectionMultiApprox::read_callback1() { + struct evbuffer *input = bufferevent_get_input(bev1); + + Operation *op = NULL; + bool done, found; + + //initially assume found (for sets that may come through here) + //is this correct? do we want to assume true in case that + //GET was found, but wrong value size (i.e. 
update value) + found = true; + + //if (op_queue.size() == 0) V("Spurious read callback."); + bool full_read = true; + while (full_read) { + + + int opcode; + uint32_t opaque; + evicted_t *evict = (evicted_t*)malloc(sizeof(evicted_t)); + memset(evict,0,sizeof(evicted_t)); + + full_read = handle_response(this,input, done, found, opcode, opaque, evict,1); + if (full_read) { + if (opcode == CMD_NOOP) { +#ifdef DEBUGMC + char out[128]; + sprintf(out,"conn l1: %u, reading noop\n",cid); + write(2,out,strlen(out)); +#endif + if (evict->evictedKey) free(evict->evictedKey); + if (evict->evictedData) free(evict->evictedData); + free(evict); + continue; + } + op = op_queue[1][opaque]; +#ifdef DEBUGMC + char out[128]; + sprintf(out,"conn l1: %u, reading opaque: %u\n",cid,opaque); + write(2,out,strlen(out)); + output_op(op,2,found); +#endif + if (strlen(op->key) < 1) { +#ifdef DEBUGMC + char out2[128]; + sprintf(out2,"conn l1: %u, bad op: %s\n",cid,op->key); + write(2,out2,strlen(out2)); +#endif + if (evict->evictedKey) free(evict->evictedKey); + if (evict->evictedData) free(evict->evictedData); + free(evict); + continue; + } + } else { + if (evict) { + if (evict->evictedKey) free(evict->evictedKey); + if (evict->evictedData) free(evict->evictedData); + free(evict); + } + break; + } + + + double now = get_time(); + int wb = 0; + if (options.rand_admit) { + wb = (rand() % options.rand_admit); + } + switch (op->type) { + case Operation::GET: + if (done) { + + int vl = op->valuelen; + if ( !found && (options.getset || options.getsetorset) ) { + /* issue a get a l2 */ + int flags = OP_clu(op); + issue_get_with_len(op->key,vl,now,false, flags | SRC_L1_M | ITEM_L2 | LOG_OP, op); + op->end_time = now; + this->stats.log_get_l1(*op); + //finish_op(op,0); + + } else { + if (OP_incl(op) && ghits >= gloc) { + //int ret = add_to_touch_keys(string(op->key)); + //if (ret == 1) { + issue_touch(op->key,vl,now, ITEM_L2 | SRC_L1_H); + //} + gloc += rand()%(10*2-1)+1; + } + ghits++; + 
finish_op(op,1); + } + } else { + char out[128]; + sprintf(out,"conn l1: %u, not done reading, should do something",cid); + write(2,out,strlen(out)); + } + break; + case Operation::SET: + //if (OP_src(op) == SRC_L1_COPY || + // OP_src(op) == SRC_L2_M) { + // del_copy_keys(string(op->key)); + //} + if (evict->evicted) { + string wb_key(evict->evictedKey); + if ((evict->evictedFlags & ITEM_INCL) && (evict->evictedFlags & ITEM_DIRTY)) { + //int ret = add_to_wb_keys(wb_key); + //if (ret == 1) { + issue_set(evict->evictedKey, evict->evictedData, evict->evictedLen, now, ITEM_L2 | ITEM_INCL | LOG_OP | SRC_WB | ITEM_DIRTY); + //} + this->stats.incl_wbs++; + } else if (evict->evictedFlags & ITEM_EXCL) { + //fprintf(stderr,"excl writeback %s\n",evict->evictedKey); + //strncpy(wb_key,evict->evictedKey,255); + if ( (options.rand_admit && wb == 0) || + (options.threshold && (g_key_hist[wb_key] == 1)) || + (options.wb_all) ) { + //int ret = add_to_wb_keys(wb_key); + //if (ret == 1) { + issue_set(evict->evictedKey, evict->evictedData, evict->evictedLen, now, ITEM_L2 | ITEM_EXCL | LOG_OP | SRC_WB); + //} + this->stats.excl_wbs++; + } + } + if (OP_src(op) == SRC_DIRECT_SET) { + if ( (evict->serverFlags & ITEM_SIZE_CHANGE) || ((evict->serverFlags & ITEM_WAS_HIT) == 0)) { + this->stats.set_misses_l1++; + } else if (OP_excl(op) && evict->serverFlags & ITEM_WAS_HIT) { + this->stats.set_excl_hits_l1++; + } else if (OP_incl(op) && evict->serverFlags & ITEM_WAS_HIT) { + this->stats.set_incl_hits_l1++; + } + } + } + finish_op(op,1); + break; + case Operation::TOUCH: + finish_op(op,1); + break; + default: + fprintf(stderr,"op: %p, key: %s opaque: %u\n",(void*)op,op->key,op->opaque); + DIE("not implemented"); + } + + if (evict) { + if (evict->evictedKey) free(evict->evictedKey); + if (evict->evictedData) free(evict->evictedData); + free(evict); + } + + } + + + double now = get_time(); + if (check_exit_condition(now)) { + return; + } + + last_tx = now; + stats.log_op(op_queue_size[1]); + 
stats.log_op(op_queue_size[2]); + //for (int i = 1; i <= 2; i++) { + // fprintf(stderr,"max issue buf n[%d]: %u\n",i,max_n[i]); + //} + drive_write_machine(); + + // update events + //if (bev != NULL) { + // // no pending response (nothing to read) and output buffer empty (nothing to write) + // if ((op_queue.size() == 0) && (evbuffer_get_length(bufferevent_get_output(bev)) == 0)) { + // bufferevent_disable(bev, EV_WRITE|EV_READ); + // } + //} +} + +/** + * Handle incoming data (responses). + */ +void ConnectionMultiApprox::read_callback2() { + struct evbuffer *input = bufferevent_get_input(bev2); + + Operation *op = NULL; + bool done, found; + + //initially assume found (for sets that may come through here) + //is this correct? do we want to assume true in case that + //GET was found, but wrong value size (i.e. update value) + found = true; + + + //if (op_queue.size() == 0) V("Spurious read callback."); + bool full_read = true; + while (full_read) { + + + int opcode; + uint32_t opaque; + full_read = handle_response(this,input, done, found, opcode, opaque, NULL,2); + if (full_read) { + if (opcode == CMD_NOOP) { +#ifdef DEBUGMC + char out[128]; + sprintf(out,"conn l2: %u, reading noop\n",cid); + write(2,out,strlen(out)); +#endif + continue; + } + op = op_queue[2][opaque]; +#ifdef DEBUGMC + char out[128]; + sprintf(out,"conn l2: %u, reading opaque: %u\n",cid,opaque); + write(2,out,strlen(out)); + output_op(op,2,found); +#endif + if (strlen(op->key) < 1) { +#ifdef DEBUGMC + char out2[128]; + sprintf(out2,"conn l2: %u, bad op: %s\n",cid,op->key); + write(2,out2,strlen(out2)); +#endif + continue; + } + } else { + break; + } + + + double now = get_time(); + switch (op->type) { + case Operation::GET: + if (done) { + if ( !found && (options.getset || options.getsetorset) ) {// && + //(options.twitter_trace != 1)) { + int valuelen = op->valuelen; + int index = lrand48() % (1024 * 1024); + int flags = OP_clu(op) | SRC_L2_M | LOG_OP; + //int ret = 
add_to_copy_keys(string(op->key)); + //if (ret == 1) { + issue_set(op->key, &random_char[index], valuelen, now, flags | ITEM_L1); + if (OP_incl(op)) { + issue_set(op->key, &random_char[index], valuelen, now, flags | ITEM_L2); + last_quiet2 = false; + } + //} + last_quiet1 = false; + finish_op(op,0); // sets read_state = IDLE + + } else { + if (found) { + int valuelen = op->valuelen; + int index = lrand48() % (1024 * 1024); + int flags = OP_clu(op) | ITEM_L1 | SRC_L1_COPY; + string key = string(op->key); + const char *data = &random_char[index]; + //int ret = add_to_copy_keys(string(op->key)); + //if (ret == 1) { + issue_set(op->key,data,valuelen, now, flags); + //} + this->stats.copies_to_l1++; + //djb: this is automatically done in the L2 server + //if (OP_excl(op)) { //djb: todo should we delete here for approx or just let it die a slow death? + // issue_delete(op->key,now, ITEM_L2 | SRC_L1_COPY ); + //} + finish_op(op,1); + + } else { + finish_op(op,0); + } + } + } else { + char out[128]; + sprintf(out,"conn l2: %u, not done reading, should do something",cid); + write(2,out,strlen(out)); + } + break; + case Operation::SET: + //if (OP_src(op) == SRC_WB) { + // del_wb_keys(string(op->key)); + //} + finish_op(op,1); + break; + case Operation::TOUCH: + if (OP_src(op) == SRC_DIRECT_SET || SRC_L1_H) { + int valuelen = op->valuelen; + if (!found) { + int index = lrand48() % (1024 * 1024); + issue_set(op->key, &random_char[index],valuelen,now, ITEM_INCL | ITEM_L2 | LOG_OP | SRC_L2_M); + this->stats.set_misses_l2++; + } else { + if (OP_src(op) == SRC_DIRECT_SET) { + issue_touch(op->key,valuelen,now, ITEM_L1 | SRC_L2_H | ITEM_DIRTY); + } + } + //del_touch_keys(string(op->key)); + } + finish_op(op,0); + break; + case Operation::DELETE: + //check to see if it was a hit + //fprintf(stderr," del %s -- %d from %d\n",op->key.c_str(),found,OP_src(op)); + if (OP_src(op) == SRC_DIRECT_SET) { + if (found) { + this->stats.delete_hits_l2++; + } else { + 
this->stats.delete_misses_l2++; + } + } + finish_op(op,1); + break; + default: + fprintf(stderr,"op: %p, key: %s opaque: %u\n",(void*)op,op->key,op->opaque); + DIE("not implemented"); + } + + } + + double now = get_time(); + if (check_exit_condition(now)) { + return; + } + + last_tx = now; + stats.log_op(op_queue_size[2]); + stats.log_op(op_queue_size[1]); + drive_write_machine(); + + // update events + //if (bev != NULL) { + // // no pending response (nothing to read) and output buffer empty (nothing to write) + // if ((op_queue.size() == 0) && (evbuffer_get_length(bufferevent_get_output(bev)) == 0)) { + // bufferevent_disable(bev, EV_WRITE|EV_READ); + // } + //} +} + +/** + * Callback called when write requests finish. + */ +void ConnectionMultiApprox::write_callback() { + + //fprintf(stderr,"loaded evbuffer with ops: %u\n",op_queue.size()); +} + +/** + * Callback for timer timeouts. + */ +void ConnectionMultiApprox::timer_callback() { + //fprintf(stderr,"timer up: %d\n",cid); + drive_write_machine(); +} + + +/* The follow are C trampolines for libevent callbacks. */ +void bev_event_cb1_approx(struct bufferevent *bev, short events, void *ptr) { + + ConnectionMultiApprox* conn = (ConnectionMultiApprox*) ptr; + conn->event_callback1(events); +} + +/* The follow are C trampolines for libevent callbacks. 
*/ +void bev_event_cb2_approx(struct bufferevent *bev, short events, void *ptr) { + + ConnectionMultiApprox* conn = (ConnectionMultiApprox*) ptr; + conn->event_callback2(events); +} + +void bev_read_cb1_approx(struct bufferevent *bev, void *ptr) { + ConnectionMultiApprox* conn = (ConnectionMultiApprox*) ptr; + conn->read_callback1(); +} + + +void bev_read_cb2_approx(struct bufferevent *bev, void *ptr) { + ConnectionMultiApprox* conn = (ConnectionMultiApprox*) ptr; + conn->read_callback2(); +} + +void bev_write_cb_m_approx(struct bufferevent *bev, void *ptr) { +} + +void timer_cb_m_approx(evutil_socket_t fd, short what, void *ptr) { + ConnectionMultiApprox* conn = (ConnectionMultiApprox*) ptr; + conn->timer_callback(); +} + diff --git a/ConnectionMultiApproxBatch.cc b/ConnectionMultiApproxBatch.cc new file mode 100644 index 0000000..16de236 --- /dev/null +++ b/ConnectionMultiApproxBatch.cc @@ -0,0 +1,2187 @@ +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + + +#include "config.h" + +#include "Connection.h" +#include "distributions.h" +#include "Generator.h" +#include "mutilate.h" +#include "binary_protocol.h" +#include "util.h" +#include +#include +#include +#include +#include +#include "blockingconcurrentqueue.h" + +//#include +//#include + +#define ITEM_L1 1 +#define ITEM_L2 2 +#define LOG_OP 4 +#define SRC_L1_M 8 +#define SRC_L1_H 16 +#define SRC_L2_M 32 +#define SRC_L2_H 64 +#define SRC_DIRECT_SET 128 +#define SRC_L1_COPY 256 +#define SRC_WB 512 + +#define ITEM_INCL 4096 +#define ITEM_EXCL 8192 +#define ITEM_DIRTY 16384 +#define ITEM_SIZE_CHANGE 131072 +#define ITEM_WAS_HIT 262144 + +#define LEVELS 2 +#define SET_INCL(incl,flags) \ + switch (incl) { \ + case 1: \ + flags |= ITEM_INCL; \ + break; \ + case 2: \ + flags |= ITEM_EXCL; \ + break; \ + \ + } \ + +#define GET_INCL(incl,flags) \ + if (flags & ITEM_INCL) incl = 1; \ + else if (flags & ITEM_EXCL) incl = 2; \ + +//#define OP_level(op) 
( ((op)->flags & ITEM_L1) ? ITEM_L1 : ITEM_L2 ) +#define OP_level(op) ( (op)->flags & ~(LOG_OP | \ + ITEM_INCL | ITEM_EXCL | ITEM_DIRTY | \ + SRC_L1_M | SRC_L1_H | SRC_L2_M | SRC_L2_H | \ + SRC_DIRECT_SET | SRC_L1_COPY | SRC_WB ) ) + +#define FLAGS_level(flags) ( flags & ~(LOG_OP | \ + ITEM_INCL | ITEM_EXCL | ITEM_DIRTY | \ + SRC_L1_M | SRC_L1_H | SRC_L2_M | SRC_L2_H | \ + SRC_DIRECT_SET | SRC_L1_COPY | SRC_WB ) ) + +#define OP_clu(op) ( (op)->flags & ~(LOG_OP | \ + ITEM_L1 | ITEM_L2 | ITEM_DIRTY | \ + SRC_L1_M | SRC_L1_H | SRC_L2_M | SRC_L2_H | \ + SRC_DIRECT_SET | SRC_L1_COPY | SRC_WB ) ) + +#define OP_src(op) ( (op)->flags & ~(ITEM_L1 | ITEM_L2 | LOG_OP | \ + ITEM_INCL | ITEM_EXCL | ITEM_DIRTY ) ) + +#define OP_log(op) ((op)->flags & LOG_OP) +#define OP_incl(op) ((op)->flags & ITEM_INCL) +#define OP_excl(op) ((op)->flags & ITEM_EXCL) +#define OP_set_flag(op,flag) ((op))->flags |= flag; + +//#define DEBUGMC +//#define DEBUGS + + + +pthread_mutex_t cid_lock_m_approx_batch = PTHREAD_MUTEX_INITIALIZER; +static uint32_t connids_m = 1; + +#define NCLASSES 40 +#define CHUNK_ALIGN_BYTES 8 +static int classes = 0; +static int sizes[NCLASSES+1]; +static int inclusives[NCLASSES+1]; + + + +static void init_inclusives(char *inclusive_str) { + int j = 1; + for (int i = 0; i < (int)strlen(inclusive_str); i++) { + if (inclusive_str[i] == '-') { + continue; + } else { + inclusives[j] = inclusive_str[i] - '0'; + j++; + } + } +} + +static void init_classes() { + + double factor = 1.25; + //unsigned int chunk_size = 48; + //unsigned int item_size = 24; + unsigned int size = 96; //warning if you change this you die + unsigned int i = 0; + unsigned int chunk_size_max = 1048576/2; + while (++i < NCLASSES-1) { + if (size >= chunk_size_max / factor) { + break; + } + if (size % CHUNK_ALIGN_BYTES) + size += CHUNK_ALIGN_BYTES - (size % CHUNK_ALIGN_BYTES); + sizes[i] = size; + size *= factor; + } + sizes[i] = chunk_size_max; + classes = i; + +} + +static int get_class(int vl, uint32_t kl) { 
+ //warning if you change this you die + int vsize = vl+kl+48+1+2; + int res = 1; + while (vsize > sizes[res]) + if (res++ == classes) { + //fprintf(stderr,"item larger than max class size. vsize: %d, class size: %d\n",vsize,sizes[res]); + return -1; + } + return res; +} + +static int get_incl(int vl, int kl) { + int clsid = get_class(vl,kl); + if (clsid) { + return inclusives[clsid]; + } else { + return -1; + } +} + + +void ConnectionMultiApproxBatch::output_op(Operation *op, int type, bool found) { + char output[1024]; + char k[256]; + char a[256]; + char s[256]; + memset(k,0,256); + memset(a,0,256); + memset(s,0,256); + strncpy(k,op->key,255); + switch (type) { + case 0: //get + sprintf(a,"issue_get"); + break; + case 1: //set + sprintf(a,"issue_set"); + break; + case 2: //resp + sprintf(a,"resp"); + break; + } + switch(read_state) { + case INIT_READ: + sprintf(s,"init"); + break; + case CONN_SETUP: + sprintf(s,"setup"); + break; + case LOADING: + sprintf(s,"load"); + break; + case IDLE: + sprintf(s,"idle"); + break; + case WAITING_FOR_GET: + sprintf(s,"waiting for get"); + break; + case WAITING_FOR_SET: + sprintf(s,"waiting for set"); + break; + case WAITING_FOR_DELETE: + sprintf(s,"waiting for del"); + break; + case MAX_READ_STATE: + sprintf(s,"max"); + break; + } + if (type == 2) { + sprintf(output,"conn: %u, action: %s op: %s, opaque: %u, found: %d, type: %d\n",cid,a,k,op->opaque,found,op->type); + } else { + sprintf(output,"conn: %u, action: %s op: %s, opaque: %u, type: %d\n",cid,a,k,op->opaque,op->type); + } + size_t res = write(2,output,strlen(output)); + if (res != strlen(output)) { + fprintf(stderr,"error outputingiii\n"); + } +} + +extern unordered_map cid_rate; +extern unordered_map> copy_keys; +extern unordered_map touch_keys; +extern unordered_map> wb_keys; + +extern map g_key_hist; +extern int max_n[3]; + +/** + * Create a new connection to a server endpoint. 
+ */ +ConnectionMultiApproxBatch::ConnectionMultiApproxBatch(struct event_base* _base, struct evdns_base* _evdns, + string _hostname1, string _hostname2, string _port, options_t _options, + bool sampling, int fd1, int fd2 ) : + start_time(0), stats(sampling), options(_options), + hostname1(_hostname1), hostname2(_hostname2), port(_port), base(_base), evdns(_evdns) +{ + pthread_mutex_lock(&cid_lock_m_approx_batch); + cid = connids_m++; + if (cid == 1) { + init_classes(); + init_inclusives(options.inclusives); + } + //cid_rate.insert( { cid, 0 } ); + + pthread_mutex_unlock(&cid_lock_m_approx_batch); + + valuesize = createGenerator(options.valuesize); + keysize = createGenerator(options.keysize); + srand(time(NULL)); + keygen = new KeyGenerator(keysize, options.records); + + total = 0; + eof = 0; + o_percent = 0; + + if (options.lambda <= 0) { + iagen = createGenerator("0"); + } else { + D("iagen = createGenerator(%s)", options.ia); + iagen = createGenerator(options.ia); + iagen->set_lambda(options.lambda); + } + + read_state = IDLE; + write_state = INIT_WRITE; + last_quiet1 = false; + last_quiet2 = false; + + last_tx = last_rx = 0.0; + gets = 0; + ghits = 0; + esets = 0; + isets = 0; + gloc = rand() % (10*2-1)+1; + sloc = rand() % (10*2-1)+1; + iloc = rand() % (10*2-1)+1; + + op_queue_size = (uint32_t*)malloc(sizeof(uint32_t)*(LEVELS+1)); + opaque = (uint32_t*)malloc(sizeof(uint32_t)*(LEVELS+1)); + op_queue = (Operation***)malloc(sizeof(Operation**)*(LEVELS+1)); + + for (int i = 0; i <= LEVELS; i++) { + op_queue_size[i] = 0; + opaque[i] = 1; + //op_queue[i] = (Operation*)malloc(sizeof(int)*OPAQUE_MAX); + op_queue[i] = (Operation**)malloc(sizeof(Operation*)*(OPAQUE_MAX+1)); + for (int j = 0; j <= OPAQUE_MAX; j++) { + op_queue[i][j] = NULL; + } + + } + + + bev1 = bufferevent_socket_new(base, fd1, BEV_OPT_CLOSE_ON_FREE); + bufferevent_setcb(bev1, bev_read_cb1_approx_batch, bev_write_cb_m_approx_batch, bev_event_cb1_approx_batch, this); + bufferevent_enable(bev1, EV_READ 
| EV_WRITE); + //bufferevent_setwatermark(bev1, EV_READ, 512*1024, 0); + + bev2 = bufferevent_socket_new(base, fd2, BEV_OPT_CLOSE_ON_FREE); + bufferevent_setcb(bev2, bev_read_cb2_approx_batch, bev_write_cb_m_approx_batch, bev_event_cb2_approx_batch, this); + bufferevent_enable(bev2, EV_READ | EV_WRITE); + //bufferevent_setwatermark(bev2, EV_READ, 512*1024, 0); + + timer = evtimer_new(base, timer_cb_m_approx_batch, this); + + read_state = IDLE; +} + + +void ConnectionMultiApproxBatch::set_queue(queue* a_trace_queue) { + trace_queue = a_trace_queue; + trace_queue_n = a_trace_queue->size(); + Operation *Op = trace_queue->front(); + incl_ = get_incl(Op->valuelen,strlen(Op->key)); + clsid_ = get_class(Op->valuelen,strlen(Op->key)); + + buffer_size_ = 1024*1024*10; + //setup the buffers + //max is (valuelen + 256 + 24 + 4 + 4 ) * depth + for (int i = 1; i <= LEVELS; i++) { + buffer_write[i] = (unsigned char*)malloc(options.depth*512*1024); + buffer_read[i] = (unsigned char*)malloc(buffer_size_); + buffer_leftover[i] = (unsigned char*)malloc(buffer_size_); + memset(buffer_read[i],0,buffer_size_); + memset(buffer_leftover[i],0,buffer_size_); + buffer_write_n[i] = 0; + buffer_read_n[i] = 0; + buffer_write_nbytes[i] = 0; + buffer_read_nbytes[i] = 0; + buffer_write_pos[i] = buffer_write[i]; + buffer_read_pos[i] = buffer_read[i]; + buffer_lasthdr[i] = 0; // buffer_read[i]; + } + +} + +void ConnectionMultiApproxBatch::set_lock(pthread_mutex_t* a_lock) { + lock = a_lock; +} + +void ConnectionMultiApproxBatch::set_g_wbkeys(unordered_map> *a_wb_keys) { + g_wb_keys = a_wb_keys; +} + +uint32_t ConnectionMultiApproxBatch::get_cid() { + return cid; +} + +int ConnectionMultiApproxBatch::add_to_wb_keys(string key) { + auto pos = wb_keys.find(key); + if (pos == wb_keys.end()) { + wb_keys.insert( {key, vector() }); + return 1; + } + return 2; +} + +void ConnectionMultiApproxBatch::del_wb_keys(string key) { + + auto position = wb_keys.find(key); + if (position != wb_keys.end()) { + vector 
op_list = vector(position->second); + wb_keys.erase(position); + for (auto it = op_list.begin(); it != op_list.end(); ++it) { + issue_op(*it); + } + } else { + fprintf(stderr,"expected wb %s, got nuthin\n",key.c_str()); + } +} + +int ConnectionMultiApproxBatch::add_to_copy_keys(string key) { + auto pos = copy_keys.find(key); + if (pos == copy_keys.end()) { + copy_keys.insert( {key, vector() }); + return 1; + } + return 2; +} + + +void ConnectionMultiApproxBatch::del_copy_keys(string key) { + + auto position = copy_keys.find(key); + if (position != copy_keys.end()) { + vector op_list = vector(position->second); + copy_keys.erase(position); + for (auto it = op_list.begin(); it != op_list.end(); ++it) { + issue_op(*it); + } + } else { + fprintf(stderr,"expected copy %s, got nuthin\n",key.c_str()); + } +} + +int ConnectionMultiApproxBatch::add_to_touch_keys(string key) { + //return touch_keys.assign_if_equal( key, NULL, cid ) != NULL ? 1 : 2; + auto pos = touch_keys.find(key); + if (pos == touch_keys.end()) { + touch_keys.insert( {key, cid }); + return 1; + } + return 2; +} + + +void ConnectionMultiApproxBatch::del_touch_keys(string key) { + //touch_keys.erase(key); + auto position = touch_keys.find(key); + if (position != touch_keys.end()) { + touch_keys.erase(position); + } else { + fprintf(stderr,"expected touch %s, got nuthin\n",key.c_str()); + } +} + +int ConnectionMultiApproxBatch::issue_op(Operation *Op) { + double now = get_time(); + int issued = 0; + Op->clsid = get_class(Op->valuelen,strlen(Op->key)); + int flags = 0; + int index = lrand48() % (1024 * 1024); + int incl = inclusives[Op->clsid]; + SET_INCL(incl,flags); + + switch(Op->type) { + + case Operation::GET: + issued = issue_get_with_len(Op, now, false, flags | LOG_OP | ITEM_L1); + this->stats.gets++; + gets++; + //this->stats.gets_cid[cid]++; + break; + case Operation::SET: + if (incl == 1) { + if (isets >= iloc) { + const char *data = &random_char[index]; + issued = issue_set(Op, data, now, flags | 
LOG_OP | ITEM_L1 | SRC_DIRECT_SET); + issued = issue_touch(Op->key,Op->valuelen,now, ITEM_L2 | SRC_DIRECT_SET); + iloc += rand()%(10*2-1)+1; + } else { + issued = issue_set(Op, &random_char[index], now, flags | LOG_OP | ITEM_L1 | SRC_DIRECT_SET | ITEM_DIRTY); + } + isets++; + } else if (incl == 2) { + issued = issue_set(Op, &random_char[index], now, flags | LOG_OP | ITEM_L1 | SRC_DIRECT_SET); + if (esets >= sloc) { + issued = issue_delete(Op->key,now,ITEM_L2 | SRC_DIRECT_SET); + sloc += rand()%(10*2-1)+1; + } + esets++; + } + this->stats.sets++; + //this->stats.sets_cid[cid]++; + break; + case Operation::DELETE: + case Operation::TOUCH: + case Operation::NOOP: + case Operation::SASL: + fprintf(stderr,"invalid line: %s, vl: %d\n",Op->key,Op->valuelen); + break; + + } + return issued; +} + + +int ConnectionMultiApproxBatch::do_connect() { + + int connected = 0; + if (options.unix_socket) { + + + struct sockaddr_un sin1; + memset(&sin1, 0, sizeof(sin1)); + sin1.sun_family = AF_LOCAL; + strcpy(sin1.sun_path, hostname1.c_str()); + + int addrlen; + addrlen = sizeof(sin1); + + int err = bufferevent_socket_connect(bev1, (struct sockaddr*)&sin1, addrlen); + if (err == 0) { + connected = 1; + } else { + connected = 0; + err = errno; + fprintf(stderr,"l1 error %s\n",strerror(err)); + } + + struct sockaddr_un sin2; + memset(&sin2, 0, sizeof(sin2)); + sin2.sun_family = AF_LOCAL; + strcpy(sin2.sun_path, hostname2.c_str()); + + addrlen = sizeof(sin2); + err = bufferevent_socket_connect(bev2, (struct sockaddr*)&sin2, addrlen); + if (err == 0) { + connected = 1; + } else { + connected = 0; + err = errno; + fprintf(stderr,"l2 error %s\n",strerror(err)); + } + } + read_state = IDLE; + return connected; +} + +/** + * Destroy a connection, performing cleanup. 
+ */ +ConnectionMultiApproxBatch::~ConnectionMultiApproxBatch() { + + + for (int i = 0; i <= LEVELS; i++) { + free(op_queue[i]); + if (i > 0) { + free(buffer_write[i]); + free(buffer_read[i]); + } + + } + + free(op_queue_size); + free(opaque); + free(op_queue); + event_free(timer); + //timer = NULL; + // FIXME: W("Drain op_q?"); + + bufferevent_free(bev1); + bufferevent_free(bev2); + + delete iagen; + delete keygen; + delete keysize; + delete valuesize; +} + +/** + * Reset the connection back to an initial, fresh state. + */ +void ConnectionMultiApproxBatch::reset() { + // FIXME: Actually check the connection, drain all bufferevents, drain op_q. + //assert(op_queue.size() == 0); + //evtimer_del(timer); + read_state = IDLE; + write_state = INIT_WRITE; + stats = ConnectionStats(stats.sampling); +} + +/** + * Set our event processing priority. + */ +void ConnectionMultiApproxBatch::set_priority(int pri) { + if (bufferevent_priority_set(bev1, pri)) { + DIE("bufferevent_set_priority(bev, %d) failed", pri); + } +} + + + +/** + * Get/Set or Set Style + * If a GET command: Issue a get first, if not found then set + * If trace file (or prob. 
write) says to set, then set it + */ +int ConnectionMultiApproxBatch::issue_getsetorset(double now) { + + Operation *Op = trace_queue->front(); + if (Op->type == Operation::SASL) { + //cid_rate.insert( {cid, 100 } ); + //fprintf(stderr,"cid %d done before loop\n",cid); + //string op_queue1; + //string op_queue2; + //for (int j = 0; j < 2; j++) { + // for (int i = 0; i < OPAQUE_MAX; i++) { + // if (op_queue[j+1][i] != NULL) { + // if (j == 0) { + // op_queue1 = op_queue1 + "," + op_queue[j+1][i]->key; + // } else { + // op_queue2 = op_queue2 + "," + op_queue[j+1][i]->key; + // } + // } + // } + //} + for (int i = 1; i <= LEVELS; i++) { + if (buffer_write_n[i] > 0) { + send_write_buffer(i); + } + } + eof = 1; + //fprintf(stderr,"cid %d op_queue1: %s op_queue2: %s, op_queue_size1: %d, op_queue_size2: %d\n",cid,op_queue1.c_str(),op_queue2.c_str(),op_queue_size[1],op_queue_size[2]); + return 1; + } + + int issued = issue_op(Op); + trace_queue->pop(); + while (issued != 2) { + Op = trace_queue->front(); + + if (Op->type == Operation::SASL) { + for (int i = 1; i <= LEVELS; i++) { + if (buffer_write_n[i] > 0) { + send_write_buffer(i); + } + } + //string op_queue1; + //string op_queue2; + //for (int j = 0; j < 2; j++) { + // for (int i = 0; i < OPAQUE_MAX; i++) { + // if (op_queue[j+1][i] != NULL) { + // if (j == 0) { + // op_queue1 = op_queue1 + "," + op_queue[j+1][i]->key; + // } else { + // op_queue2 = op_queue2 + "," + op_queue[j+1][i]->key; + // } + // } + // } + //} + //fprintf(stderr,"done in loop cid %d op_queue1: %s op_queue2: %s, op_queue_size1: %d, op_queue_size2: %d\n",cid,op_queue1.c_str(),op_queue2.c_str(),op_queue_size[1],op_queue_size[2]); + eof = 1; + return 1; + } + issued = issue_op(Op); + trace_queue->pop(); + } + + return 0; +} + +int ConnectionMultiApproxBatch::send_write_buffer(int level) { + struct bufferevent *bev = NULL; + switch (level) { + case 1: + bev = bev1; + break; + case 2: + bev = bev2; + break; + default: + bev = bev1; + break; + } + int 
ret = bufferevent_write(bev,buffer_write[level],buffer_write_nbytes[level]); + if (ret != 0) { + fprintf(stderr,"error writing buffer! level %d, size %d\n",level,buffer_write_nbytes[level]); + } + //fprintf(stderr,"l%d write: %u\n",level,buffer_write_nbytes[level]); + buffer_write_n[level] = 0; + buffer_write_pos[level] = buffer_write[level]; + memset(buffer_write_pos[level],0,buffer_write_nbytes[level]); + stats.tx_bytes += buffer_write_nbytes[level]; + buffer_write_nbytes[level] = 0; + return 2; +} + +int ConnectionMultiApproxBatch::add_get_op_to_queue(Operation *pop, int level) { + + op_queue[level][pop->opaque] = pop; + op_queue_size[level]++; +#ifdef DEBUGS + fprintf(stderr,"cid: %d issing get: %s, size: %u, level %d, flags: %d, opaque: %d\n",cid,pop->key,pop->valuelen,level,pop->flags,pop->opaque); +#endif + + if (opaque[level] > OPAQUE_MAX) { + opaque[level] = 1; + } + + uint16_t keylen = strlen(pop->key); + + // each line is 4-bytes + binary_header_t h = { 0x80, CMD_GET, htons(keylen), + 0x00, 0x00, htons(0), + htonl(keylen) }; + //if (quiet) { + // h.opcode = CMD_GETQ; + //} + h.opaque = htonl(pop->opaque); + + memcpy(buffer_write_pos[level], &h, 24); + buffer_write_pos[level] += 24; + memcpy(buffer_write_pos[level], pop->key, keylen); + buffer_write_pos[level] += keylen; + buffer_write_n[level]++; + buffer_write_nbytes[level] += 24 + keylen; + + int res = 1; + if (buffer_write_n[level] == (uint32_t)options.depth) { + res = send_write_buffer(level); + } + return res; +} + +/** + * Issue a get request to the server. 
 */
int ConnectionMultiApproxBatch::issue_get_with_len(Operation *pop, double now, bool quiet, uint32_t flags, Operation *l1) {

  // Level (L1 or L2) is encoded in the flags word.
  int level = FLAGS_level(flags);

  //initialize op for sending
#if HAVE_CLOCK_GETTIME
  pop->start_time = get_time_accurate();
#else
  if (now == 0.0) {
#if USE_CACHED_TIME
    struct timeval now_tv;
    event_base_gettimeofday_cached(base, &now_tv);
    pop->start_time = tv_to_double(&now_tv);
#else
    pop->start_time = get_time();
#endif
  } else {
    pop->start_time = now;
  }
#endif

  // Claim the next opaque for this level; add_get_op_to_queue() handles the
  // wraparound back to 1 after OPAQUE_MAX.
  pop->opaque = opaque[level]++;
  pop->flags = flags;
  // Optional back-pointer to the L1 op this L2 get is chained behind.
  if (l1 != NULL) {
    pop->l1 = l1;
  }

  //put op into queue
  return add_get_op_to_queue(pop,level);
}

/**
 * Issue a get request to the server.
 * Overload that allocates a fresh Operation for the given key/valuelen
 * (used when there is no trace Operation to reuse, e.g. internal copies).
 */
int ConnectionMultiApproxBatch::issue_get_with_len(const char* key, int valuelen, double now, bool quiet, uint32_t flags, Operation *l1) {

  int level = FLAGS_level(flags);
  Operation *pop = new Operation();

#if HAVE_CLOCK_GETTIME
  pop->start_time = get_time_accurate();
#else
  if (now == 0.0) {
#if USE_CACHED_TIME
    struct timeval now_tv;
    event_base_gettimeofday_cached(base, &now_tv);
    pop->start_time = tv_to_double(&now_tv);
#else
    pop->start_time = get_time();
#endif
  } else {
    pop->start_time = now;
  }
#endif

  strncpy(pop->key,key,255);
  pop->valuelen = valuelen;
  pop->type = Operation::GET;
  pop->opaque = opaque[level]++;
  pop->flags = flags;
  pop->clsid = get_class(valuelen,strlen(key));

  if (l1 != NULL) {
    pop->l1 = l1;
  }

  return add_get_op_to_queue(pop,level);

}

/**
 * Issue a touch request to the server.
 */
int ConnectionMultiApproxBatch::issue_touch(const char* key, int valuelen, double now, int flags) {

  int level = FLAGS_level(flags);
  Operation *pop = new Operation();

#if HAVE_CLOCK_GETTIME
  pop->start_time = get_time_accurate();
#else
  if (now == 0.0) {
#if USE_CACHED_TIME
    struct timeval now_tv;
    event_base_gettimeofday_cached(base, &now_tv);
    pop->start_time = tv_to_double(&now_tv);
#else
    pop->start_time = get_time();
#endif
  } else {
    pop->start_time = now;
  }
#endif

  strncpy(pop->key,key,255);
  pop->valuelen = valuelen;
  pop->type = Operation::TOUCH;
  pop->opaque = opaque[level]++;
  pop->flags = flags;
  op_queue[level][pop->opaque] = pop;
  //op_queue[level].push(op);
  op_queue_size[level]++;

  // Wrap the opaque counter for the next op.
  if (opaque[level] > OPAQUE_MAX) {
    opaque[level] = 1;
  }

#ifdef DEBUGS
  fprintf(stderr,"issing touch: %s, size: %u, level %d, flags: %d, opaque: %d\n",key,valuelen,level,flags,pop->opaque);
#endif
  //if (read_state == IDLE) read_state = WAITING_FOR_GET;
  uint16_t keylen = strlen(key);

  // each line is 4-bytes; TOUCH carries a 4-byte expiration extra.
  binary_header_t h = { 0x80, CMD_TOUCH, htons(keylen),
                        0x04, 0x00, htons(0),
                        htonl(keylen + 4) };
  h.opaque = htonl(pop->opaque);

  // NOTE(review): for dirty items the op flags are smuggled through the
  // expiration extra field — presumably decoded by a modified server; confirm.
  uint32_t exp = 0;
  if (flags & ITEM_DIRTY) {
    exp = htonl(flags);
  }

  memcpy(buffer_write_pos[level], &h, 24);
  buffer_write_pos[level] += 24;
  memcpy(buffer_write_pos[level], &exp, 4);
  buffer_write_pos[level] += 4;
  memcpy(buffer_write_pos[level], pop->key, keylen);
  buffer_write_pos[level] += keylen;
  buffer_write_nbytes[level] += 24 + keylen + 4;
  buffer_write_n[level]++;

  // Flush once the batch reaches the configured depth.
  int ret = 1;
  if (buffer_write_n[level] == (uint32_t)options.depth) {
    ret = send_write_buffer(level);
  }

  return ret;
}

/**
 * Issue a delete request to the server.
 */
int ConnectionMultiApproxBatch::issue_delete(const char* key, double now, uint32_t flags) {

  int level = FLAGS_level(flags);
  Operation *pop = new Operation();

#if HAVE_CLOCK_GETTIME
  pop->start_time = get_time_accurate();
#else
  if (now == 0.0) {
#if USE_CACHED_TIME
    struct timeval now_tv;
    event_base_gettimeofday_cached(base, &now_tv);
    pop->start_time = tv_to_double(&now_tv);
#else
    pop->start_time = get_time();
#endif
  } else {
    pop->start_time = now;
  }
#endif

  strncpy(pop->key,key,255);
  pop->type = Operation::DELETE;
  pop->opaque = opaque[level]++;
  pop->flags = flags;
  op_queue[level][pop->opaque] = pop;
  op_queue_size[level]++;

  // Wrap the opaque counter for the next op.
  if (opaque[level] > OPAQUE_MAX) {
    opaque[level] = 1;
  }
#ifdef DEBUGS
  fprintf(stderr,"cid: %d issing delete: %s, level %d, flags: %d, opaque: %d\n",cid,key,level,flags,pop->opaque);
#endif

  //if (read_state == IDLE) read_state = WAITING_FOR_GET;
  uint16_t keylen = strlen(key);

  // each line is 4-bytes
  binary_header_t h = { 0x80, CMD_DELETE, htons(keylen),
                        0x00, 0x00, htons(0),
                        htonl(keylen) };
  h.opaque = htonl(pop->opaque);

  memcpy(buffer_write_pos[level], &h, 24);
  buffer_write_pos[level] += 24;
  memcpy(buffer_write_pos[level], pop->key, keylen);
  buffer_write_pos[level] += keylen;
  buffer_write_n[level]++;
  buffer_write_nbytes[level] += 24 + keylen;

  // Flush once the batch reaches the configured depth.
  int ret = 1;
  if (buffer_write_n[level] == (uint32_t)options.depth) {
    ret = send_write_buffer(level);
  }

  return ret;
}

// Append a NOOP request to the batch for `level`.
// NOTE(review): no Operation is registered in op_queue for the noop, so its
// response is presumably matched by opaque 0 / ignored — confirm in the
// response path.
int ConnectionMultiApproxBatch::issue_noop(int level) {

  binary_header_t h = { 0x80, CMD_NOOP, 0x0000,
                        0x00, 0x00, htons(0),
                        0x00 };
  memcpy(buffer_write_pos[level], &h, 24);
  buffer_write_pos[level] += 24;

  buffer_write_n[level]++;
  buffer_write_nbytes[level] += 24;

  int ret = 1;
  if (buffer_write_n[level] == (uint32_t)options.depth) {
    ret = send_write_buffer(level);
  }

  return ret;
}

// Register a SET in the op queue and append header + flags/exp extras +
// key + value to the batch buffer; flushes at options.depth.
// Returns 1 (queued) or 2 (queued and flushed).
int ConnectionMultiApproxBatch::add_set_to_queue(Operation *pop, int level, const char* value) {
  int length = pop->valuelen;

  op_queue[level][pop->opaque] = pop;
  //op_queue[level].push(op);
  op_queue_size[level]++;
#ifdef DEBUGS
  fprintf(stderr,"cid: %d issing set: %s, size: %u, level %d, flags: %d, opaque: %d\n",cid,pop->key,length,level,pop->flags,pop->opaque);
#endif

  // Wrap the opaque counter for the next op.
  if (opaque[level] > OPAQUE_MAX) {
    opaque[level] = 1;
  }

  uint16_t keylen = strlen(pop->key);

  // each line is 4-bytes; SET carries 8 bytes of extras (flags + exptime).
  binary_header_t h = { 0x80, CMD_SET, htons(keylen),
                        0x08, 0x00, htons(0),
                        htonl(keylen + 8 + length) };
  h.opaque = htonl(pop->opaque);

  // The op's level/policy flags ride in the memcached "flags" extra.
  uint32_t f = htonl(pop->flags);
  uint32_t exp = 0;

  memcpy(buffer_write_pos[level], &h, 24);
  buffer_write_pos[level] += 24;
  memcpy(buffer_write_pos[level], &f, 4);
  buffer_write_pos[level] += 4;
  memcpy(buffer_write_pos[level], &exp, 4);
  buffer_write_pos[level] += 4;
  memcpy(buffer_write_pos[level], pop->key, keylen);
  buffer_write_pos[level] += keylen;
  memcpy(buffer_write_pos[level], value, length);
  buffer_write_pos[level] += length;
  buffer_write_n[level]++;
  buffer_write_nbytes[level] += length + 32 + keylen;

  int ret = 1;
  if (buffer_write_n[level] == (uint32_t)options.depth) {
    ret = send_write_buffer(level);
  }
  return ret;

}

/**
 * Issue a set request to the server, reusing an existing Operation.
 */
int ConnectionMultiApproxBatch::issue_set(Operation *pop, const char* value, double now, uint32_t flags) {

  int level = FLAGS_level(flags);

#if HAVE_CLOCK_GETTIME
  pop->start_time = get_time_accurate();
#else
  if (now == 0.0) pop->start_time = get_time();
  else pop->start_time = now;
#endif

  pop->opaque = opaque[level]++;
  pop->flags = flags;
  return add_set_to_queue(pop,level,value);

}

/**
 * Issue a set request to the server.
 */
int ConnectionMultiApproxBatch::issue_set(const char* key, const char* value, int length, double now, uint32_t flags) {

  int level = FLAGS_level(flags);
  Operation *pop = new Operation();

#if HAVE_CLOCK_GETTIME
  pop->start_time = get_time_accurate();
#else
  if (now == 0.0) pop->start_time = get_time();
  else pop->start_time = now;
#endif

  strncpy(pop->key,key,255);
  pop->valuelen = length;
  pop->type = Operation::SET;
  pop->opaque = opaque[level]++;
  pop->flags = flags;
  pop->clsid = get_class(length,strlen(key));

  return add_set_to_queue(pop,level,value);

}


/**
 * Finish up (record stats) an operation that just returned from the
 * server. Logs latency per type/level, then unlinks (and frees) the op —
 * and, for a chained L2 get, its parent L1 op — from the opaque-indexed
 * op_queue tables.
 */
void ConnectionMultiApproxBatch::finish_op(Operation *op, int was_hit) {
  double now;
#if USE_CACHED_TIME
  struct timeval now_tv;
  event_base_gettimeofday_cached(base, &now_tv);
  now = tv_to_double(&now_tv);
#else
  now = get_time();
#endif
#if HAVE_CLOCK_GETTIME
  op->end_time = get_time_accurate();
#else
  op->end_time = now;
#endif

  if (was_hit) {
    switch (op->type) {
    case Operation::GET:
      switch (OP_level(op)) {
      case 1:
        stats.log_get_l1(*op);
        break;
      case 2:
        stats.log_get_l2(*op);
        // An L2 hit also completes the chained L1 miss: log the end-to-end get.
        if (op->l1 != NULL) {
          op->l1->end_time = now;
          stats.log_get(*(op->l1));
        }
        break;
      }
      break;
    case Operation::SET:
      switch (OP_level(op)) {
      case 1:
        stats.log_set_l1(*op);
        break;
      case 2:
        stats.log_set_l2(*op);
        break;
      }
      break;
    case Operation::DELETE: break;
    case Operation::TOUCH: break;
    default: DIE("Not implemented.");
    }
  }

  last_rx = now;
  uint8_t level = OP_level(op);
  // Unlink the chained L1 parent first (only an L2 op can carry one).
  if (op->l1 != NULL) {
    //delete op_queue[1][op->l1->opaque]
    if (op->l1 == op_queue[1][op->l1->opaque]) {
      op_queue[1][op->l1->opaque] = 0;
      if (op_queue_size[1] > 0) {
        op_queue_size[1]--;
      } else {
        fprintf(stderr,"chained op_Queue_size[%d] out of sync!!\n",1);
      }
      delete op->l1;
    } else {
      // NOTE(review): this error path prints op_queue[1][op->opaque] (the
      // child's opaque, not op->l1->opaque) and would deref NULL if that
      // slot is empty — diagnostic only, but worth confirming/fixing.
      fprintf(stderr,"op_queue out of sync! Expected %p, got %p, opa1: %d opaq2: %d\n",
              op,op_queue[1][op->opaque],op->opaque,op_queue[1][op->opaque]->opaque);
    }
  }
  //op_queue[level].erase(op_queue[level].begin()+opopq);
  if (op == op_queue[level][op->opaque] &&
      op->opaque == op_queue[level][op->opaque]->opaque) {
    //delete op_queue[level][op->opaque];
    op_queue[level][op->opaque] = 0;
    delete op;
    if (op_queue_size[level] > 0) {
      op_queue_size[level]--;
    } else {
      fprintf(stderr,"op_Queue_size[%d] out of sync!!\n",level);
    }
  } else {
    fprintf(stderr,"op_queue out of sync! Expected %p, got %p, opa1: %d opaq2: %d\n",
            op,op_queue[level][op->opaque],op->opaque,op_queue[level][op->opaque]->opaque);
  }
  read_state = IDLE;

}



/**
 * Check if our testing is done and we should exit.
 * At EOF, flushes any partially-filled batch buffers; exits only once both
 * batches are empty.
 */
bool ConnectionMultiApproxBatch::check_exit_condition(double now) {
  if (eof == 1) {
    int done = 1;
    for (int i = 1; i <= LEVELS; i++) {
      if (buffer_write_n[i] != 0) {
        //fprintf(stderr,"%d sending %d\n",i,buffer_write_n[i]);
        send_write_buffer(i);
        done = 0;
      }
    }
    if (done) {
      //fprintf(stderr,"%d done - check exit\n",cid);
      return true;
    }
  }
  return false;
}

/**
 * Handle new connection and error events.
+ */ +void ConnectionMultiApproxBatch::event_callback1(short events) { + if (events & BEV_EVENT_CONNECTED) { + D("Connected to %s:%s.", hostname1.c_str(), port.c_str()); + int fd = bufferevent_getfd(bev1); + if (fd < 0) DIE("bufferevent_getfd"); + + if (!options.no_nodelay && !options.unix_socket) { + int one = 1; + if (setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, + (void *) &one, sizeof(one)) < 0) + DIE("setsockopt()"); + } +#ifdef DEBUGMC + fprintf(stderr,"libevent connected %s, fd: %u\n",hostname1.c_str(),bufferevent_getfd(bev1)); +#endif + + + } else if (events & BEV_EVENT_ERROR) { + int err = bufferevent_socket_get_dns_error(bev1); + //if (err) DIE("DNS error: %s", evutil_gai_strerror(err)); + if (err) fprintf(stderr,"DNS error: %s", evutil_gai_strerror(err)); + fprintf(stderr,"CID: %d - Got an error: %s\n",this->cid, + evutil_socket_error_to_string(EVUTIL_SOCKET_ERROR())); + + //DIE("BEV_EVENT_ERROR: %s", strerror(errno)); + + } else if (events & BEV_EVENT_EOF) { + fprintf(stderr,"Unexpected EOF from server."); + return; + } +} + +/** + * Handle new connection and error events. 
 */
void ConnectionMultiApproxBatch::event_callback2(short events) {
  // L2 socket connected: optionally disable Nagle, then we're ready.
  if (events & BEV_EVENT_CONNECTED) {
    D("Connected to %s:%s.", hostname2.c_str(), port.c_str());
    int fd = bufferevent_getfd(bev2);
    if (fd < 0) DIE("bufferevent_getfd");

    if (!options.no_nodelay && !options.unix_socket) {
      int one = 1;
      if (setsockopt(fd, IPPROTO_TCP, TCP_NODELAY,
                     (void *) &one, sizeof(one)) < 0)
        DIE("setsockopt()");
    }
#ifdef DEBUGMC
    fprintf(stderr,"libevent connected %s, fd: %u\n",hostname2.c_str(),bufferevent_getfd(bev2));
#endif


  } else if (events & BEV_EVENT_ERROR) {
    // Socket-level error: report (including any DNS failure) but keep running.
    int err = bufferevent_socket_get_dns_error(bev2);
    //if (err) DIE("DNS error: %s", evutil_gai_strerror(err));
    if (err) fprintf(stderr,"DNS error: %s", evutil_gai_strerror(err));
    fprintf(stderr,"CID: %d - Got an error: %s\n",this->cid,
            evutil_socket_error_to_string(EVUTIL_SOCKET_ERROR()));

    //DIE("BEV_EVENT_ERROR: %s", strerror(errno));


  } else if (events & BEV_EVENT_EOF) {
    fprintf(stderr,"Unexpected EOF from server.");
    return;
  }
}

/**
 * Request generation loop. Determines whether or not to issue a new command,
 * based on timer events.
 *
 * Note that this function loops. Be wary of break vs. return.
+ */ +void ConnectionMultiApproxBatch::drive_write_machine(double now) { + + if (now == 0.0) now = get_time(); + double delay; + struct timeval tv; + + int max_depth = (int)options.depth*2; + + while (1) { + switch (write_state) { + case INIT_WRITE: + delay = iagen->generate(); + next_time = now + delay; + double_to_tv(delay, &tv); + evtimer_add(timer, &tv); + write_state = ISSUING; + break; + + case ISSUING: + if ( (op_queue_size[1] >= (size_t) max_depth) || + (op_queue_size[2] >= (size_t) max_depth) ) { + write_state = WAITING_FOR_OPQ; + break; + } + + if (options.getsetorset) { + int ret = issue_getsetorset(now); + //if (ret) return; //if at EOF + return; + } + + last_tx = now; + for (int i = 1; i <= 2; i++) { + stats.log_op(op_queue_size[i]); + } + break; + + case WAITING_FOR_TIME: + write_state = ISSUING; + break; + + case WAITING_FOR_OPQ: + if ( (op_queue_size[1] >= (size_t) max_depth) || + (op_queue_size[2] >= (size_t) max_depth) ) { + for (int i = 1; i <= LEVELS; i++) { + if (max_depth > 16) { + if (buffer_write_n[i] > max_depth*0.8) { + send_write_buffer(i); + } + } + } + next_time = now + 0.01; + double_to_tv(delay, &tv); + evtimer_add(timer, &tv); + + return; + } else { + write_state = ISSUING; + break; + } + + default: DIE("Not implemented"); + } + } +} + +size_t ConnectionMultiApproxBatch::handle_response_batch(unsigned char *rbuf_pos, resp_t *resp, + size_t read_bytes, size_t consumed_bytes, + int level, int extra) { + if (rbuf_pos[0] != 129) { + //fprintf(stderr,"cid %d we don't have a valid header %u\n",cid,rbuf_pos[0]); + //buffer_read_pos[level] = rbuf_pos; + //buffer_read_n[level] = 1; + return 0; + } + if ((read_bytes+extra - consumed_bytes) < 24) { + size_t have = (read_bytes+extra) - (consumed_bytes); + size_t need = 24 - (have); + buffer_read_n[level] = need; + buffer_read_nbytes[level] = have; + memcpy(buffer_leftover[level],rbuf_pos,have); + //buffer_lasthdr[level] = rbuf_pos; + //buffer_read_n[level] = need; + //buffer_read_nbytes[level] = 
have; + //fprintf(stderr,"cid %d - we don't have enough header data, need %lu more bytes, have %lu (targetLen: %d) (read_bytes %ld) (extra %d) %d)\n",cid,need,have,24,read_bytes,extra,level); + return 0; + + } + + binary_header_t* h = reinterpret_cast(rbuf_pos); + uint32_t bl = ntohl(h->body_len); + uint16_t kl = ntohs(h->key_len); + uint8_t el = h->extra_len; + int targetLen = 24 + bl; + if (consumed_bytes + targetLen > (read_bytes+extra)) { + size_t have = (read_bytes+extra) - (consumed_bytes); + size_t need = targetLen - (have); + buffer_read_n[level] = need; + buffer_read_nbytes[level] = have; + memcpy(buffer_leftover[level],rbuf_pos,have); + //fprintf(stderr,"cid %d - we don't have enough data, need %lu more bytes, have %lu (targetLen: %d) (read_bytes %ld) (extra %d) %d)\n",cid,need,have,targetLen,read_bytes,extra,level); + return 0; + } + + resp->opcode = h->opcode; + resp->opaque = ntohl(h->opaque); + uint16_t status = ntohs(h->status); +#ifdef DEBUGMC + fprintf(stderr,"cid: %d handle resp from l%d - opcode: %u opaque: %u keylen: %u extralen: %u datalen: %u status: %u\n",cid,level, + h->opcode,ntohl(h->opaque),ntohs(h->key_len),h->extra_len, + ntohl(h->body_len),ntohs(h->status)); +#endif + // If something other than success, count it as a miss + if (resp->opcode == CMD_GET && status == RESP_NOT_FOUND) { + switch(level) { + case 1: + stats.get_misses_l1++; + break; + case 2: + stats.get_misses_l2++; + stats.get_misses++; + stats.window_get_misses++; + break; + } + resp->found = false; + } else if (resp->opcode == CMD_SET && kl > 0) { + //first data is extras: clsid, flags, eflags + if (resp->evict) { + unsigned char *buf = rbuf_pos + 24; + resp->evict->clsid = *((uint32_t*)buf); + resp->evict->clsid = ntohl(resp->evict->clsid); + buf += 4; + + resp->evict->serverFlags = *((uint32_t*)buf); + resp->evict->serverFlags = ntohl(resp->evict->serverFlags); + buf += 4; + + resp->evict->evictedFlags = *((uint32_t*)buf); + resp->evict->evictedFlags = 
ntohl(resp->evict->evictedFlags); + buf += 4; + + resp->evict->evictedKeyLen = kl; + resp->evict->evictedKey = (char*)malloc(kl+1); + memset(resp->evict->evictedKey,0,kl+1); + memcpy(resp->evict->evictedKey,buf,kl); + buf += kl; + + resp->evict->evictedLen = bl - kl - el; + resp->evict->evictedData = (char*)malloc(resp->evict->evictedLen); + memcpy(resp->evict->evictedData,buf,resp->evict->evictedLen); + resp->evict->evicted = true; + } + } else if ( (resp->opcode == CMD_DELETE || resp->opcode == CMD_TOUCH) && + status == RESP_NOT_FOUND) { + resp->found = false; + } + this->stats.rx_bytes += targetLen; + return targetLen; +} + + +size_t ConnectionMultiApproxBatch::fill_read_buffer(int level, int *extra) { + + size_t read_bytes = 0; + struct bufferevent *bev = NULL; + switch (level) { + case 1: + bev = bev1; + break; + case 2: + bev = bev2; + break; + default: + bev = bev1; + break; + } + if (buffer_read_n[level] != 0) { + uint32_t have = buffer_read_nbytes[level]; + struct evbuffer *input = bufferevent_get_input(bev); + size_t len = evbuffer_get_length(input); + if (len < buffer_read_n[level]) { + return 0; + } + memset(buffer_read[level],0,512*1024); + memcpy(buffer_read[level],buffer_leftover[level],have); + buffer_read_pos[level] = buffer_read[level]; + read_bytes = bufferevent_read(bev,buffer_read_pos[level]+have,len); + if (read_bytes != len) { + fprintf(stderr,"cid %d expected %lu got %lu\n",cid,len,read_bytes); + } + *extra = have; + buffer_read_n[level] = 0; + buffer_read_nbytes[level] = 0; + + } else { + memset(buffer_read[level],0,512*1024); + buffer_read_pos[level] = buffer_read[level]; + read_bytes = bufferevent_read(bev, buffer_read_pos[level], buffer_size_ / 4); + *extra = 0; + } + if (read_bytes == 0) { + fprintf(stderr,"cid %d read 0 bytes\n",cid); + } + return read_bytes; +} +/** + * Handle incoming data (responses). 
+ */ +void ConnectionMultiApproxBatch::read_callback1() { + + int level = 1; + int extra = 0; + size_t read_bytes = 0; + + read_bytes = fill_read_buffer(level,&extra); + if (read_bytes == 0) { + return; + } + + //fprintf(stderr,"cid %d l1 read: %lu\n",cid,read_bytes); + size_t consumed_bytes = 0; + size_t batch = options.depth; + //we have at least some data to read + size_t nread_ops = 0; + while (1) { + evicted_t *evict = (evicted_t*)malloc(sizeof(evicted_t)); + memset(evict,0,sizeof(evicted_t)); + resp_t mc_resp; + mc_resp.found = true; + mc_resp.evict = evict; + size_t cbytes = handle_response_batch(buffer_read_pos[level],&mc_resp,read_bytes,consumed_bytes,level,extra); + if (cbytes == 0) { + if (evict) { + if (evict->evictedKey) free(evict->evictedKey); + if (evict->evictedData) free(evict->evictedData); + free(evict); + } + break; + } + buffer_read_pos[level] = buffer_read_pos[level] + cbytes; + consumed_bytes += cbytes; + uint32_t opaque = mc_resp.opaque; + bool found = mc_resp.found; + + Operation *op = op_queue[level][opaque]; +#ifdef DEBUGMC + char out[128]; + sprintf(out,"l1 cid %u, reading opaque: %u\n",cid,opaque); + write(2,out,strlen(out)); + output_op(op,2,found); +#endif + + double now = get_time(); + int wb = 0; + if (options.rand_admit) { + wb = (rand() % options.rand_admit); + } + int vl = op->valuelen; + int flags = OP_clu(op); + switch (op->type) { + case Operation::GET: + if ( !found && (options.getset || options.getsetorset) ) { + /* issue a get a l2 */ + issue_get_with_len(op->key,vl,now,false, flags | SRC_L1_M | ITEM_L2 | LOG_OP, op); + op->end_time = now; + this->stats.log_get_l1(*op); + + } else { + if (OP_incl(op) && ghits >= gloc) { + issue_touch(op->key,vl,now, ITEM_L2 | SRC_L1_H); + gloc += rand()%(10*2-1)+1; + } + ghits++; + finish_op(op,1); + } + break; + case Operation::SET: + //if (OP_src(op) == SRC_L1_COPY || + // OP_src(op) == SRC_L2_M) { + // del_copy_keys(string(op->key)); + //} + if (evict->evicted) { + string 
wb_key(evict->evictedKey); + if ((evict->evictedFlags & ITEM_INCL) && (evict->evictedFlags & ITEM_DIRTY)) { + //int ret = add_to_wb_keys(wb_key); + //if (ret == 1) { + issue_set(evict->evictedKey, evict->evictedData, evict->evictedLen, now, ITEM_L2 | ITEM_INCL | LOG_OP | SRC_WB | ITEM_DIRTY); + //} + this->stats.incl_wbs++; + } else if (evict->evictedFlags & ITEM_EXCL) { + //fprintf(stderr,"excl writeback %s\n",evict->evictedKey); + //strncpy(wb_key,evict->evictedKey,255); + if ( (options.rand_admit && wb == 0) || + (options.threshold && (g_key_hist[wb_key] == 1)) || + (options.wb_all) ) { + //int ret = add_to_wb_keys(wb_key); + //if (ret == 1) { + issue_set(evict->evictedKey, evict->evictedData, evict->evictedLen, now, ITEM_L2 | ITEM_EXCL | LOG_OP | SRC_WB); + //} + this->stats.excl_wbs++; + } + } + if (OP_src(op) == SRC_DIRECT_SET) { + if ( (evict->serverFlags & ITEM_SIZE_CHANGE) || ((evict->serverFlags & ITEM_WAS_HIT) == 0)) { + this->stats.set_misses_l1++; + } else if (OP_excl(op) && evict->serverFlags & ITEM_WAS_HIT) { + this->stats.set_excl_hits_l1++; + } else if (OP_incl(op) && evict->serverFlags & ITEM_WAS_HIT) { + this->stats.set_incl_hits_l1++; + } + } + } + finish_op(op,1); + break; + case Operation::TOUCH: + finish_op(op,1); + break; + default: + fprintf(stderr,"op: %p, key: %s opaque: %u\n",(void*)op,op->key,op->opaque); + DIE("not implemented"); + } + + if (evict) { + if (evict->evictedKey) free(evict->evictedKey); + if (evict->evictedData) free(evict->evictedData); + free(evict); + } + nread_ops++; + if (buffer_read_pos[level][0] == 0) { + break; + } + if (buffer_read_pos[level][0] != 129) { + fprintf(stderr,"cid %d we don't have a valid header post %u\n",cid,buffer_read_pos[level][0]); + break; + } + } + //if (buffer_read_n[level] == 0) { + // memset(buffer_read[level],0,read_bytes); + //} + //if (nread_ops == 0) { + // fprintf(stderr,"ugh only got: %lu ops expected %lu, read %lu, cid %u\n",nread_ops,batch,read_bytes,cid); + // int *a = 0; + // *a = 
0; + //} + + + double now = get_time(); + last_tx = now; + stats.log_op(op_queue_size[1]); + stats.log_op(op_queue_size[2]); + + drive_write_machine(); + + +} + +/** + * Handle incoming data (responses). + */ +void ConnectionMultiApproxBatch::read_callback2() { + + int level = 2; + int extra = 0; + + size_t read_bytes = 0; + + read_bytes = fill_read_buffer(level,&extra); + if (read_bytes == 0) { + return; + } + + //fprintf(stderr,"l2 read: %lu\n",read_bytes); + size_t consumed_bytes = 0; + size_t batch = options.depth; + size_t nread_ops = 0; + while (1) { + evicted_t *evict = NULL; + resp_t mc_resp; + mc_resp.found = true; + mc_resp.evict = evict; + size_t cbytes = handle_response_batch(buffer_read_pos[level],&mc_resp,read_bytes,consumed_bytes,level,extra); + if (cbytes == 0) { + break; + } + buffer_read_pos[level] = buffer_read_pos[level] + cbytes; + consumed_bytes += cbytes; + uint32_t opaque = mc_resp.opaque; + bool found = mc_resp.found; + + Operation *op = op_queue[level][opaque]; +#ifdef DEBUGMC + char out[128]; + sprintf(out,"l2 cid %u, reading opaque: %u\n",cid,opaque); + write(2,out,strlen(out)); + output_op(op,2,found); +#endif + double now = get_time(); + switch (op->type) { + case Operation::GET: + if ( !found && (options.getset || options.getsetorset) ) { + int valuelen = op->valuelen; + int index = lrand48() % (1024 * 1024); + int flags = OP_clu(op) | SRC_L2_M | LOG_OP; + //int ret = add_to_copy_keys(string(op->key)); + //if (ret == 1) { + issue_set(op->key, &random_char[index], valuelen, now, flags | ITEM_L1); + if (OP_incl(op)) { + issue_set(op->key, &random_char[index], valuelen, now, flags | ITEM_L2); + } + //} + finish_op(op,0); // sets read_state = IDLE + } else { + if (found) { + int valuelen = op->valuelen; + int index = lrand48() % (1024 * 1024); + int flags = OP_clu(op) | ITEM_L1 | SRC_L1_COPY; + string key = string(op->key); + const char *data = &random_char[index]; + //int ret = add_to_copy_keys(string(op->key)); + //if (ret == 1) { + 
issue_set(op->key,data,valuelen, now, flags); + //} + this->stats.copies_to_l1++; + //djb: this is automatically done in the L2 server + //if (OP_excl(op)) { //djb: todo should we delete here for approx or just let it die a slow death? + // issue_delete(op->key,now, ITEM_L2 | SRC_L1_COPY ); + //} + finish_op(op,1); + + } else { + finish_op(op,0); + } + } + break; + case Operation::SET: + //if (OP_src(op) == SRC_WB) { + // del_wb_keys(string(op->key)); + //} + finish_op(op,1); + break; + case Operation::TOUCH: + if (OP_src(op) == SRC_DIRECT_SET || SRC_L1_H) { + int valuelen = op->valuelen; + if (!found) { + int index = lrand48() % (1024 * 1024); + issue_set(op->key, &random_char[index],valuelen,now, ITEM_INCL | ITEM_L2 | LOG_OP | SRC_L2_M); + this->stats.set_misses_l2++; + } else { + if (OP_src(op) == SRC_DIRECT_SET) { + issue_touch(op->key,valuelen,now, ITEM_L1 | SRC_L2_H | ITEM_DIRTY); + } + } + //del_touch_keys(string(op->key)); + } + finish_op(op,0); + break; + case Operation::DELETE: + //check to see if it was a hit + //fprintf(stderr," del %s -- %d from %d\n",op->key.c_str(),found,OP_src(op)); + if (OP_src(op) == SRC_DIRECT_SET) { + if (found) { + this->stats.delete_hits_l2++; + } else { + this->stats.delete_misses_l2++; + } + } + finish_op(op,1); + break; + default: + fprintf(stderr,"op: %p, key: %s opaque: %u\n",(void*)op,op->key,op->opaque); + DIE("not implemented"); + } + nread_ops++; + if (buffer_read_pos[level][0] == 0) { + break; + } + if (buffer_read_pos[level][0] != 129) { + fprintf(stderr,"l2 cid %d we don't have a valid header post %u\n",cid,buffer_read_pos[level][0]); + break; + } + } + //if (buffer_read_n[level] == 0) { + // memset(buffer_read[level],0,read_bytes); + //} + //if (nread_ops == 0) { + // fprintf(stderr,"ugh l2 only got: %lu ops expected %lu\n",nread_ops,batch); + //} + + + double now = get_time(); + last_tx = now; + stats.log_op(op_queue_size[2]); + stats.log_op(op_queue_size[1]); + + drive_write_machine(); +} + +/** + * Callback 
called when write requests finish. + */ +void ConnectionMultiApproxBatch::write_callback() { + + //fprintf(stderr,"loaded evbuffer with ops: %u\n",op_queue.size()); +} + +/** + * Callback for timer timeouts. + */ +void ConnectionMultiApproxBatch::timer_callback() { + //fprintf(stderr,"timer up: %d\n",cid); + drive_write_machine(); +} + + +/* The follow are C trampolines for libevent callbacks. */ +void bev_event_cb1_approx_batch(struct bufferevent *bev, short events, void *ptr) { + + ConnectionMultiApproxBatch* conn = (ConnectionMultiApproxBatch*) ptr; + conn->event_callback1(events); +} + +/* The follow are C trampolines for libevent callbacks. */ +void bev_event_cb2_approx_batch(struct bufferevent *bev, short events, void *ptr) { + + ConnectionMultiApproxBatch* conn = (ConnectionMultiApproxBatch*) ptr; + conn->event_callback2(events); +} + +void bev_read_cb1_approx_batch(struct bufferevent *bev, void *ptr) { + ConnectionMultiApproxBatch* conn = (ConnectionMultiApproxBatch*) ptr; + if (conn->options.v1callback) { + conn->read_callback1_v1(); + } else { + conn->read_callback1(); + } +} + + +void bev_read_cb2_approx_batch(struct bufferevent *bev, void *ptr) { + ConnectionMultiApproxBatch* conn = (ConnectionMultiApproxBatch*) ptr; + if (conn->options.v1callback) { + conn->read_callback2_v1(); + } else { + conn->read_callback2(); + } +} + +void bev_write_cb_m_approx_batch(struct bufferevent *bev, void *ptr) { +} + +void timer_cb_m_approx_batch(evutil_socket_t fd, short what, void *ptr) { + ConnectionMultiApproxBatch* conn = (ConnectionMultiApproxBatch*) ptr; + conn->timer_callback(); +} +//previous implmentation of read +// + +/** + * Tries to consume a binary response (in its entirety) from an evbuffer. + * + * @param input evBuffer to read response from + * @return true if consumed, false if not enough data in buffer. 
+ */ +static bool handle_response(ConnectionMultiApproxBatch *conn, evbuffer *input, bool &done, bool &found, int &opcode, uint32_t &opaque, evicted_t *evict, int level) { + // Read the first 24 bytes as a header + int length = evbuffer_get_length(input); + if (length < 24) return false; + binary_header_t* h = + reinterpret_cast(evbuffer_pullup(input, 24)); + //assert(h); + + uint32_t bl = ntohl(h->body_len); + uint16_t kl = ntohs(h->key_len); + uint8_t el = h->extra_len; + // Not whole response + int targetLen = 24 + bl; + if (length < targetLen) { + return false; + } + + opcode = h->opcode; + opaque = ntohl(h->opaque); + uint16_t status = ntohs(h->status); +#ifdef DEBUGMC + fprintf(stderr,"cid: %d handle resp from l%d - opcode: %u opaque: %u keylen: %u extralen: %u datalen: %u status: %u\n",conn->get_cid(),level, + h->opcode,ntohl(h->opaque),ntohs(h->key_len),h->extra_len, + ntohl(h->body_len),ntohs(h->status)); +#endif + + + // If something other than success, count it as a miss + if (opcode == CMD_GET && status == RESP_NOT_FOUND) { + switch(level) { + case 1: + conn->stats.get_misses_l1++; + break; + case 2: + conn->stats.get_misses_l2++; + conn->stats.get_misses++; + conn->stats.window_get_misses++; + break; + + } + found = false; + evbuffer_drain(input, targetLen); + + } else if (opcode == CMD_SET && kl > 0) { + //first data is extras: clsid, flags, eflags + if (evict) { + evbuffer_drain(input,24); + unsigned char *buf = evbuffer_pullup(input,bl); + + + evict->clsid = *((uint32_t*)buf); + evict->clsid = ntohl(evict->clsid); + buf += 4; + + evict->serverFlags = *((uint32_t*)buf); + evict->serverFlags = ntohl(evict->serverFlags); + buf += 4; + + evict->evictedFlags = *((uint32_t*)buf); + evict->evictedFlags = ntohl(evict->evictedFlags); + buf += 4; + + + evict->evictedKeyLen = kl; + evict->evictedKey = (char*)malloc(kl+1); + memset(evict->evictedKey,0,kl+1); + memcpy(evict->evictedKey,buf,kl); + buf += kl; + + + evict->evictedLen = bl - kl - el; + 
evict->evictedData = (char*)malloc(evict->evictedLen); + memcpy(evict->evictedData,buf,evict->evictedLen); + evict->evicted = true; + //fprintf(stderr,"class: %u, serverFlags: %u, evictedFlags: %u\n",evict->clsid,evict->serverFlags,evict->evictedFlags); + evbuffer_drain(input,bl); + } else { + evbuffer_drain(input, targetLen); + } + } else if (opcode == CMD_TOUCH && status == RESP_NOT_FOUND) { + found = false; + evbuffer_drain(input, targetLen); + } else if (opcode == CMD_DELETE && status == RESP_NOT_FOUND) { + found = false; + evbuffer_drain(input, targetLen); + } else { + evbuffer_drain(input, targetLen); + } + + conn->stats.rx_bytes += targetLen; + done = true; + return true; +} + +/** + * Handle incoming data (responses). + */ +void ConnectionMultiApproxBatch::read_callback1_v1() { + struct evbuffer *input = bufferevent_get_input(bev1); + + Operation *op = NULL; + bool done, found; + + //initially assume found (for sets that may come through here) + //is this correct? do we want to assume true in case that + //GET was found, but wrong value size (i.e. 
update value) + found = true; + + //if (op_queue.size() == 0) V("Spurious read callback."); + bool full_read = true; + while (full_read) { + + + int opcode; + uint32_t opaque; + evicted_t *evict = (evicted_t*)malloc(sizeof(evicted_t)); + memset(evict,0,sizeof(evicted_t)); + + full_read = handle_response(this,input, done, found, opcode, opaque, evict,1); + if (full_read) { + if (opcode == CMD_NOOP) { +#ifdef DEBUGMC + char out[128]; + sprintf(out,"conn l1: %u, reading noop\n",cid); + write(2,out,strlen(out)); +#endif + if (evict->evictedKey) free(evict->evictedKey); + if (evict->evictedData) free(evict->evictedData); + free(evict); + continue; + } + op = op_queue[1][opaque]; +#ifdef DEBUGMC + char out[128]; + sprintf(out,"conn l1: %u, reading opaque: %u\n",cid,opaque); + write(2,out,strlen(out)); + output_op(op,2,found); +#endif + if (strlen(op->key) < 1) { +#ifdef DEBUGMC + char out2[128]; + sprintf(out2,"conn l1: %u, bad op: %s\n",cid,op->key); + write(2,out2,strlen(out2)); +#endif + if (evict->evictedKey) free(evict->evictedKey); + if (evict->evictedData) free(evict->evictedData); + free(evict); + continue; + } + } else { + if (evict) { + if (evict->evictedKey) free(evict->evictedKey); + if (evict->evictedData) free(evict->evictedData); + free(evict); + } + break; + } + + + double now = get_time(); + int wb = 0; + if (options.rand_admit) { + wb = (rand() % options.rand_admit); + } + switch (op->type) { + case Operation::GET: + if (done) { + + int vl = op->valuelen; + if ( !found && (options.getset || options.getsetorset) ) { + /* issue a get a l2 */ + int flags = OP_clu(op); + issue_get_with_len(op->key,vl,now,false, flags | SRC_L1_M | ITEM_L2 | LOG_OP, op); + op->end_time = now; + this->stats.log_get_l1(*op); + //finish_op(op,0); + + } else { + if (OP_incl(op) && ghits >= gloc) { + //int ret = add_to_touch_keys(string(op->key)); + //if (ret == 1) { + issue_touch(op->key,vl,now, ITEM_L2 | SRC_L1_H); + //} + gloc += rand()%(10*2-1)+1; + } + ghits++; + 
finish_op(op,1); + } + } else { + char out[128]; + sprintf(out,"conn l1: %u, not done reading, should do something",cid); + write(2,out,strlen(out)); + } + break; + case Operation::SET: + //if (OP_src(op) == SRC_L1_COPY || + // OP_src(op) == SRC_L2_M) { + // del_copy_keys(string(op->key)); + //} + if (evict->evicted) { + string wb_key(evict->evictedKey); + if ((evict->evictedFlags & ITEM_INCL) && (evict->evictedFlags & ITEM_DIRTY)) { + //int ret = add_to_wb_keys(wb_key); + //if (ret == 1) { + issue_set(evict->evictedKey, evict->evictedData, evict->evictedLen, now, ITEM_L2 | ITEM_INCL | LOG_OP | SRC_WB | ITEM_DIRTY); + //} + this->stats.incl_wbs++; + } else if (evict->evictedFlags & ITEM_EXCL) { + //fprintf(stderr,"excl writeback %s\n",evict->evictedKey); + //strncpy(wb_key,evict->evictedKey,255); + if ( (options.rand_admit && wb == 0) || + (options.threshold && (g_key_hist[wb_key] == 1)) || + (options.wb_all) ) { + //int ret = add_to_wb_keys(wb_key); + //if (ret == 1) { + issue_set(evict->evictedKey, evict->evictedData, evict->evictedLen, now, ITEM_L2 | ITEM_EXCL | LOG_OP | SRC_WB); + //} + this->stats.excl_wbs++; + } + } + if (OP_src(op) == SRC_DIRECT_SET) { + if ( (evict->serverFlags & ITEM_SIZE_CHANGE) || ((evict->serverFlags & ITEM_WAS_HIT) == 0)) { + this->stats.set_misses_l1++; + } else if (OP_excl(op) && evict->serverFlags & ITEM_WAS_HIT) { + this->stats.set_excl_hits_l1++; + } else if (OP_incl(op) && evict->serverFlags & ITEM_WAS_HIT) { + this->stats.set_incl_hits_l1++; + } + } + } + finish_op(op,1); + break; + case Operation::TOUCH: + finish_op(op,1); + break; + default: + fprintf(stderr,"op: %p, key: %s opaque: %u\n",(void*)op,op->key,op->opaque); + DIE("not implemented"); + } + + if (evict) { + if (evict->evictedKey) free(evict->evictedKey); + if (evict->evictedData) free(evict->evictedData); + free(evict); + } + + } + + + double now = get_time(); + + last_tx = now; + stats.log_op(op_queue_size[1]); + stats.log_op(op_queue_size[2]); + //for (int i = 1; i 
<= 2; i++) { + // fprintf(stderr,"max issue buf n[%d]: %u\n",i,max_n[i]); + //} + drive_write_machine(); + + // update events + //if (bev != NULL) { + // // no pending response (nothing to read) and output buffer empty (nothing to write) + // if ((op_queue.size() == 0) && (evbuffer_get_length(bufferevent_get_output(bev)) == 0)) { + // bufferevent_disable(bev, EV_WRITE|EV_READ); + // } + //} +} + +/** + * Handle incoming data (responses). + */ +void ConnectionMultiApproxBatch::read_callback2_v1() { + struct evbuffer *input = bufferevent_get_input(bev2); + + Operation *op = NULL; + bool done, found; + + //initially assume found (for sets that may come through here) + //is this correct? do we want to assume true in case that + //GET was found, but wrong value size (i.e. update value) + found = true; + + + //if (op_queue.size() == 0) V("Spurious read callback."); + bool full_read = true; + while (full_read) { + + + int opcode; + uint32_t opaque; + full_read = handle_response(this,input, done, found, opcode, opaque, NULL,2); + if (full_read) { + if (opcode == CMD_NOOP) { +#ifdef DEBUGMC + char out[128]; + sprintf(out,"conn l2: %u, reading noop\n",cid); + write(2,out,strlen(out)); +#endif + continue; + } + op = op_queue[2][opaque]; +#ifdef DEBUGMC + char out[128]; + sprintf(out,"conn l2: %u, reading opaque: %u\n",cid,opaque); + write(2,out,strlen(out)); + output_op(op,2,found); +#endif + if (strlen(op->key) < 1) { +#ifdef DEBUGMC + char out2[128]; + sprintf(out2,"conn l2: %u, bad op: %s\n",cid,op->key); + write(2,out2,strlen(out2)); +#endif + continue; + } + } else { + break; + } + + + double now = get_time(); + switch (op->type) { + case Operation::GET: + if (done) { + if ( !found && (options.getset || options.getsetorset) ) {// && + //(options.twitter_trace != 1)) { + int valuelen = op->valuelen; + int index = lrand48() % (1024 * 1024); + int flags = OP_clu(op) | SRC_L2_M | LOG_OP; + //int ret = add_to_copy_keys(string(op->key)); + //if (ret == 1) { + 
issue_set(op->key, &random_char[index], valuelen, now, flags | ITEM_L1); + if (OP_incl(op)) { + issue_set(op->key, &random_char[index], valuelen, now, flags | ITEM_L2); + last_quiet2 = false; + } + //} + last_quiet1 = false; + finish_op(op,0); // sets read_state = IDLE + + } else { + if (found) { + int valuelen = op->valuelen; + int index = lrand48() % (1024 * 1024); + int flags = OP_clu(op) | ITEM_L1 | SRC_L1_COPY; + string key = string(op->key); + const char *data = &random_char[index]; + //int ret = add_to_copy_keys(string(op->key)); + //if (ret == 1) { + issue_set(op->key,data,valuelen, now, flags); + //} + this->stats.copies_to_l1++; + //djb: this is automatically done in the L2 server + //if (OP_excl(op)) { //djb: todo should we delete here for approx or just let it die a slow death? + // issue_delete(op->key,now, ITEM_L2 | SRC_L1_COPY ); + //} + finish_op(op,1); + + } else { + finish_op(op,0); + } + } + } else { + char out[128]; + sprintf(out,"conn l2: %u, not done reading, should do something",cid); + write(2,out,strlen(out)); + } + break; + case Operation::SET: + //if (OP_src(op) == SRC_WB) { + // del_wb_keys(string(op->key)); + //} + finish_op(op,1); + break; + case Operation::TOUCH: + if (OP_src(op) == SRC_DIRECT_SET || SRC_L1_H) { + int valuelen = op->valuelen; + if (!found) { + int index = lrand48() % (1024 * 1024); + issue_set(op->key, &random_char[index],valuelen,now, ITEM_INCL | ITEM_L2 | LOG_OP | SRC_L2_M); + this->stats.set_misses_l2++; + } else { + if (OP_src(op) == SRC_DIRECT_SET) { + issue_touch(op->key,valuelen,now, ITEM_L1 | SRC_L2_H | ITEM_DIRTY); + } + } + //del_touch_keys(string(op->key)); + } + finish_op(op,0); + break; + case Operation::DELETE: + //check to see if it was a hit + //fprintf(stderr," del %s -- %d from %d\n",op->key.c_str(),found,OP_src(op)); + if (OP_src(op) == SRC_DIRECT_SET) { + if (found) { + this->stats.delete_hits_l2++; + } else { + this->stats.delete_misses_l2++; + } + } + finish_op(op,1); + break; + default: + 
fprintf(stderr,"op: %p, key: %s opaque: %u\n",(void*)op,op->key,op->opaque); + DIE("not implemented"); + } + + } + + double now = get_time(); + + last_tx = now; + stats.log_op(op_queue_size[2]); + stats.log_op(op_queue_size[1]); + drive_write_machine(); + + // update events + //if (bev != NULL) { + // // no pending response (nothing to read) and output buffer empty (nothing to write) + // if ((op_queue.size() == 0) && (evbuffer_get_length(bufferevent_get_output(bev)) == 0)) { + // bufferevent_disable(bev, EV_WRITE|EV_READ); + // } + //} +} diff --git a/ConnectionMultiApproxBatchShm.cc b/ConnectionMultiApproxBatchShm.cc new file mode 100644 index 0000000..21e7593 --- /dev/null +++ b/ConnectionMultiApproxBatchShm.cc @@ -0,0 +1,1645 @@ +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + + +#include "config.h" + +#include "Connection.h" +#include "distributions.h" +#include "Generator.h" +#include "mutilate.h" +#include "binary_protocol.h" +#include "util.h" +#include +#include +#include +#include +#include +#include "blockingconcurrentqueue.h" + +//#include +//#include + +#define ITEM_L1 1 +#define ITEM_L2 2 +#define LOG_OP 4 +#define SRC_L1_M 8 +#define SRC_L1_H 16 +#define SRC_L2_M 32 +#define SRC_L2_H 64 +#define SRC_DIRECT_SET 128 +#define SRC_L1_COPY 256 +#define SRC_WB 512 + +#define ITEM_INCL 4096 +#define ITEM_EXCL 8192 +#define ITEM_DIRTY 16384 +#define ITEM_SIZE_CHANGE 131072 +#define ITEM_WAS_HIT 262144 + +#define LEVELS 2 +#define SET_INCL(incl,flags) \ + switch (incl) { \ + case 1: \ + flags |= ITEM_INCL; \ + break; \ + case 2: \ + flags |= ITEM_EXCL; \ + break; \ + \ + } \ + +#define GET_INCL(incl,flags) \ + if (flags & ITEM_INCL) incl = 1; \ + else if (flags & ITEM_EXCL) incl = 2; \ + +//#define OP_level(op) ( ((op)->flags & ITEM_L1) ? 
ITEM_L1 : ITEM_L2 )
// Extract the level bits (ITEM_L1/ITEM_L2) from an op's flags by masking
// out every non-level bit.
#define OP_level(op) ( (op)->flags & ~(LOG_OP | \
    ITEM_INCL | ITEM_EXCL | ITEM_DIRTY | \
    SRC_L1_M | SRC_L1_H | SRC_L2_M | SRC_L2_H | \
    SRC_DIRECT_SET | SRC_L1_COPY | SRC_WB ) )

// Same mask applied to a raw flags value instead of an Operation*.
#define FLAGS_level(flags) ( flags & ~(LOG_OP | \
    ITEM_INCL | ITEM_EXCL | ITEM_DIRTY | \
    SRC_L1_M | SRC_L1_H | SRC_L2_M | SRC_L2_H | \
    SRC_DIRECT_SET | SRC_L1_COPY | SRC_WB ) )

// Extract the clusivity bits (ITEM_INCL/ITEM_EXCL) from an op's flags.
#define OP_clu(op) ( (op)->flags & ~(LOG_OP | \
    ITEM_L1 | ITEM_L2 | ITEM_DIRTY | \
    SRC_L1_M | SRC_L1_H | SRC_L2_M | SRC_L2_H | \
    SRC_DIRECT_SET | SRC_L1_COPY | SRC_WB ) )

// Extract the source bits (SRC_*) from an op's flags.
#define OP_src(op) ( (op)->flags & ~(ITEM_L1 | ITEM_L2 | LOG_OP | \
    ITEM_INCL | ITEM_EXCL | ITEM_DIRTY ) )

#define OP_log(op) ((op)->flags & LOG_OP)
#define OP_incl(op) ((op)->flags & ITEM_INCL)
#define OP_excl(op) ((op)->flags & ITEM_EXCL)
#define OP_set_flag(op,flag) ((op))->flags |= flag;

//#define DEBUGMC
//#define DEBUGS



// Guards the shared connection-id counter below.
pthread_mutex_t cid_lock_m_approx_batch_shm = PTHREAD_MUTEX_INITIALIZER;
static uint32_t connids_m = 1;

#define NCLASSES 40
#define CHUNK_ALIGN_BYTES 8
// Slab-class table mirroring the server's size classes; filled once by
// init_classes() from the first connection.
static int classes = 0;
static int sizes[NCLASSES+1];
static int inclusives[NCLASSES+1];



// Parse a dash-separated inclusivity string (e.g. "1-2-1-...") into the
// per-class inclusives[] table; classes are 1-indexed.
static void init_inclusives(char *inclusive_str) {
  int j = 1;
  for (int i = 0; i < (int)strlen(inclusive_str); i++) {
    if (inclusive_str[i] == '-') {
      continue;
    } else {
      inclusives[j] = inclusive_str[i] - '0';
      j++;
    }
  }
}

// Rebuild the memcached slab-class size table (growth factor 1.25, base 96,
// 8-byte alignment, max chunk 512KB). Must match the server's configuration.
static void init_classes() {

  double factor = 1.25;
  //unsigned int chunk_size = 48;
  //unsigned int item_size = 24;
  unsigned int size = 96; //warning if you change this you die
  unsigned int i = 0;
  unsigned int chunk_size_max = 1048576/2;
  while (++i < NCLASSES-1) {
    if (size >= chunk_size_max / factor) {
      break;
    }
    if (size % CHUNK_ALIGN_BYTES)
      size += CHUNK_ALIGN_BYTES - (size % CHUNK_ALIGN_BYTES);
    sizes[i] = size;
    size *= factor;
  }
  sizes[i] = chunk_size_max;
  classes = i;

}

// Map a value length + key length to its slab class index, or -1 if the
// item exceeds the largest class. The +48+1+2 overhead must match the
// server's per-item header size.
static int get_class(int vl, uint32_t kl) {
  //warning if you change this you die
  int vsize = vl+kl+48+1+2;
  int res = 1;
  while (vsize > sizes[res])
    if (res++ == classes) {
      //fprintf(stderr,"item larger than max class size. vsize: %d, class size: %d\n",vsize,sizes[res]);
      return -1;
    }
  return res;
}

//static int get_incl(int vl, int kl) {
//  int clsid = get_class(vl,kl);
//  if (clsid) {
//    return inclusives[clsid];
//  } else {
//    return -1;
//  }
//}


// Debug helper: write a one-line description of an op (issue/response,
// key, opaque, connection read_state) to stderr.
void ConnectionMultiApproxBatchShm::output_op(Operation *op, int type, bool found) {
  char output[1024];
  char k[256];
  char a[256];
  char s[256];
  memset(k,0,256);
  memset(a,0,256);
  memset(s,0,256);
  strncpy(k,op->key,255);
  switch (type) {
    case 0: //get
      sprintf(a,"issue_get");
      break;
    case 1: //set
      sprintf(a,"issue_set");
      break;
    case 2: //resp
      sprintf(a,"resp");
      break;
  }
  switch(read_state) {
    case INIT_READ:
      sprintf(s,"init");
      break;
    case CONN_SETUP:
      sprintf(s,"setup");
      break;
    case LOADING:
      sprintf(s,"load");
      break;
    case IDLE:
      sprintf(s,"idle");
      break;
    case WAITING_FOR_GET:
      sprintf(s,"waiting for get");
      break;
    case WAITING_FOR_SET:
      sprintf(s,"waiting for set");
      break;
    case WAITING_FOR_DELETE:
      sprintf(s,"waiting for del");
      break;
    case MAX_READ_STATE:
      sprintf(s,"max");
      break;
  }
  if (type == 2) {
    sprintf(output,"conn: %u, action: %s op: %s, opaque: %u, found: %d, type: %d\n",cid,a,k,op->opaque,found,op->type);
  } else {
    sprintf(output,"conn: %u, action: %s op: %s, opaque: %u, type: %d\n",cid,a,k,op->opaque,op->type);
  }
  size_t res = write(2,output,strlen(output));
  if (res != strlen(output)) {
    fprintf(stderr,"error outputingiii\n");
  }
}

// NOTE(review): the template arguments of these externs appear stripped in
// this excerpt (e.g. "unordered_map>") — restore from the original source.
extern unordered_map cid_rate;
extern unordered_map> copy_keys;
extern unordered_map touch_keys;
extern unordered_map> wb_keys;

extern map g_key_hist;
extern int max_n[3];

/**
 * Create a new connection to a server endpoint.
 */
ConnectionMultiApproxBatchShm::ConnectionMultiApproxBatchShm(options_t _options, bool sampling) :
  start_time(0), stats(sampling), options(_options)
{
  // connection ids are global across threads; the first connection also
  // initializes the shared slab-class / inclusivity tables
  pthread_mutex_lock(&cid_lock_m_approx_batch_shm);
  cid = connids_m++;
  if (cid == 1) {
    init_classes();
    init_inclusives(options.inclusives);
  }
  //cid_rate.insert( { cid, 0 } );

  pthread_mutex_unlock(&cid_lock_m_approx_batch_shm);

  valuesize = createGenerator(options.valuesize);
  keysize = createGenerator(options.keysize);
  srand(time(NULL));
  keygen = new KeyGenerator(keysize, options.records);

  total = 0;
  eof = 0;
  o_percent = 0;

  // inter-arrival generator: lambda <= 0 means issue as fast as possible
  if (options.lambda <= 0) {
    iagen = createGenerator("0");
  } else {
    D("iagen = createGenerator(%s)", options.ia);
    iagen = createGenerator(options.ia);
    iagen->set_lambda(options.lambda);
  }

  read_state = IDLE;
  write_state = INIT_WRITE;
  last_quiet1 = false;
  last_quiet2 = false;

  last_tx = last_rx = 0.0;
  gets = 0;
  ghits = 0;
  esets = 0;
  isets = 0;
  // randomized intervals for periodic touch/set sampling
  gloc = rand() % (10*2-1)+1;
  sloc = rand() % (10*2-1)+1;
  iloc = rand() % (10*2-1)+1;

  // per-level bookkeeping, indexed 1..LEVELS (slot 0 unused)
  op_queue_size = (uint32_t*)malloc(sizeof(uint32_t)*(LEVELS+1));
  issued_queue = (uint32_t*)malloc(sizeof(uint32_t)*(LEVELS+1));
  opaque = (uint32_t*)malloc(sizeof(uint32_t)*(LEVELS+1));
  op_queue = (Operation***)malloc(sizeof(Operation**)*(LEVELS+1));

  for (int i = 0; i <= LEVELS; i++) {
    op_queue_size[i] = 0;
    issued_queue[i] = 0;
    opaque[i] = 1;
    //op_queue[i] = (Operation*)malloc(sizeof(int)*OPAQUE_MAX);
    // outstanding ops indexed by protocol opaque value
    op_queue[i] = (Operation**)malloc(sizeof(Operation*)*(OPAQUE_MAX+1));
    for (int j = 0; j <= OPAQUE_MAX; j++) {
      op_queue[i][j] = NULL;
    }

  }


  read_state = IDLE;
}


// Attach the trace to replay and allocate the per-level read/write buffers.
// NOTE(review): the parameter's template arguments appear stripped in this
// excerpt (likely queue<Operation*>*) — restore from the original source.
void ConnectionMultiApproxBatchShm::set_queue(queue* a_trace_queue) {
  trace_queue = a_trace_queue;
  trace_queue_n = a_trace_queue->size();
  //Operation *Op = trace_queue->front();
  //incl_ = get_incl(Op->valuelen,strlen(Op->key));
  //clsid_ = get_class(Op->valuelen,strlen(Op->key));

  buffer_size_ = 1024*1024*4;
  //setup the buffers
  //max is (valuelen + 256 + 24 + 4 + 4 ) * depth
  for (int i = 1; i <= LEVELS; i++) {
    buffer_write[i] = (unsigned char*)malloc(buffer_size_);
    buffer_read[i] = (unsigned char*)malloc(buffer_size_);
    buffer_leftover[i] = (unsigned char*)malloc(buffer_size_);
    memset(buffer_read[i],0,buffer_size_);
    memset(buffer_leftover[i],0,buffer_size_);
    buffer_write_n[i] = 0;
    buffer_read_n[i] = 0;
    buffer_write_nbytes[i] = 0;
    buffer_read_nbytes[i] = 0;
    buffer_write_pos[i] = buffer_write[i];
    buffer_read_pos[i] = buffer_read[i];
    buffer_lasthdr[i] = 0; // buffer_read[i];
  }

}

void ConnectionMultiApproxBatchShm::set_lock(pthread_mutex_t* a_lock) {
  lock = a_lock;
}

// NOTE(review): template arguments stripped in this excerpt.
void ConnectionMultiApproxBatchShm::set_g_wbkeys(unordered_map> *a_wb_keys) {
  g_wb_keys = a_wb_keys;
}

uint32_t ConnectionMultiApproxBatchShm::get_cid() {
  return cid;
}

// Register a pending writeback key; returns 1 if newly inserted, 2 if the
// key already had a pending writeback.
int ConnectionMultiApproxBatchShm::add_to_wb_keys(string key) {
  auto pos = wb_keys.find(key);
  if (pos == wb_keys.end()) {
    wb_keys.insert( {key, vector() });
    return 1;
  }
  return 2;
}

// Complete a writeback: remove the key and issue any ops queued behind it.
void ConnectionMultiApproxBatchShm::del_wb_keys(string key) {

  auto position = wb_keys.find(key);
  if (position != wb_keys.end()) {
    vector op_list = vector(position->second);
    wb_keys.erase(position);
    for (auto it = op_list.begin(); it != op_list.end(); ++it) {
      issue_op(*it);
    }
  } else {
    fprintf(stderr,"expected wb %s, got nuthin\n",key.c_str());
  }
}

// Register a pending L2->L1 copy key; returns 1 if newly inserted, 2 if
// a copy was already pending for this key.
int ConnectionMultiApproxBatchShm::add_to_copy_keys(string key) {
  auto pos = copy_keys.find(key);
  if (pos == copy_keys.end()) {
    copy_keys.insert( {key, vector() });
    return 1;
  }
  return 2;
}


// Complete a copy: remove the key and issue any ops queued behind it.
void ConnectionMultiApproxBatchShm::del_copy_keys(string key) {

  auto position = copy_keys.find(key);
  if (position != copy_keys.end()) {
    vector op_list = vector(position->second);
    copy_keys.erase(position);
    for (auto it = op_list.begin(); it != op_list.end(); ++it) {
      issue_op(*it);
    }
+ } else { + fprintf(stderr,"expected copy %s, got nuthin\n",key.c_str()); + } +} + +int ConnectionMultiApproxBatchShm::add_to_touch_keys(string key) { + //return touch_keys.assign_if_equal( key, NULL, cid ) != NULL ? 1 : 2; + auto pos = touch_keys.find(key); + if (pos == touch_keys.end()) { + touch_keys.insert( {key, cid }); + return 1; + } + return 2; +} + + +void ConnectionMultiApproxBatchShm::del_touch_keys(string key) { + //touch_keys.erase(key); + auto position = touch_keys.find(key); + if (position != touch_keys.end()) { + touch_keys.erase(position); + } else { + fprintf(stderr,"expected touch %s, got nuthin\n",key.c_str()); + } +} + +int ConnectionMultiApproxBatchShm::issue_op(Operation *Op) { + double now = get_time(); + int issued = 0; + Op->clsid = get_class(Op->valuelen,strlen(Op->key)); + int flags = 0; + int index = lrand48() % (1024 * 1024); + int incl = inclusives[Op->clsid]; + SET_INCL(incl,flags); + + switch(Op->type) { + + case Operation::GET: + issued = issue_get_with_len(Op, now, false, flags | LOG_OP | ITEM_L1); + this->stats.gets++; + gets++; + //this->stats.gets_cid[cid]++; + break; + case Operation::SET: + if (incl == 1) { + if (isets >= iloc) { + const char *data = &random_char[index]; + issued = issue_set(Op, data, now, flags | LOG_OP | ITEM_L1 | SRC_DIRECT_SET); + issued = issue_touch(Op->key,Op->valuelen,now, ITEM_L2 | SRC_DIRECT_SET); + iloc += rand()%(10*2-1)+1; + } else { + issued = issue_set(Op, &random_char[index], now, flags | LOG_OP | ITEM_L1 | SRC_DIRECT_SET | ITEM_DIRTY); + } + isets++; + } else if (incl == 2) { + issued = issue_set(Op, &random_char[index], now, flags | LOG_OP | ITEM_L1 | SRC_DIRECT_SET); + if (esets >= sloc) { + issued = issue_delete(Op->key,now,ITEM_L2 | SRC_DIRECT_SET); + sloc += rand()%(10*2-1)+1; + } + esets++; + } + this->stats.sets++; + //this->stats.sets_cid[cid]++; + break; + case Operation::DELETE: + case Operation::TOUCH: + case Operation::NOOP: + case Operation::SASL: + fprintf(stderr,"invalid line: 
%s, vl: %d\n",Op->key,Op->valuelen); + break; + + } + return issued; +} + + +int ConnectionMultiApproxBatchShm::do_connect() { + + int connected = 0; + //the client should see for this cid, where the shared memory is + typedef struct shared_ { + bipbuf_t bipbuf_in; + bipbuf_t bipbuf_out; + pthread_mutex_t lock_in; + pthread_mutex_t lock_out; + pthread_cond_t cond_in_not_empty; + pthread_cond_t cond_in_not_full; + pthread_cond_t cond_out_not_empty; + pthread_cond_t cond_out_not_full; + int bipbuf_in_bytes; + int bipbuf_out_bytes; + int shared_id; + } shared_t; + + //this cid gets shared memory + // ftok to generate unique key + //char shmkey[64]; + //sprintf(shmkey,"shmfilel1%d",cid); + int id = cid+100; + //key_t key = ftok(shmkey,id); + + // shmget returns an identifier in shmid + int shmid = shmget(id,sizeof(shared_t),0666); + + // shmat to attach to shared memory + shared_t* share_l1 = (shared_t*) shmat(shmid,(void*)0,0); + + fprintf(stderr,"cid %d gets shared memory buf l1 %d\n",cid,share_l1->shared_id); + + // ftok to generate unique key + //char shmkey2[64]; + //sprintf(shmkey2,"shmfilel2%d",cid); + int id2 = cid+200; + //key_t key2 = ftok(shmkey2,id2); + + // shmget returns an identifier in shmid + int shmid2 = shmget(id2,sizeof(shared_t),0666); + + // shmat to attach to shared memory + shared_t* share_l2 = (shared_t*) shmat(shmid2,(void*)0,0); + + fprintf(stderr,"cid %d gets shared memory buf l2 %d\n",cid,share_l2->shared_id); + connected = 1; + + //the leads are reveresed (from perspective of server) + bipbuf_in[1] = &share_l1->bipbuf_out; + bipbuf_in[2] = &share_l2->bipbuf_out; + bipbuf_out[1] = &share_l1->bipbuf_in; + bipbuf_out[2] = &share_l2->bipbuf_in; + + bipbuf_in_bytes[1] = &share_l1->bipbuf_out_bytes; + bipbuf_in_bytes[2] = &share_l2->bipbuf_out_bytes; + bipbuf_out_bytes[1] = &share_l1->bipbuf_in_bytes; + bipbuf_out_bytes[2] = &share_l2->bipbuf_in_bytes; + + lock_in[1] = &share_l1->lock_out; + lock_in[2] = &share_l2->lock_out; + lock_out[1] = 
&share_l1->lock_in; + lock_out[2] = &share_l2->lock_in; + + cond_in_not_empty[1] = &share_l1->cond_out_not_empty; + cond_in_not_empty[2] = &share_l2->cond_out_not_empty; + cond_in_not_full[1] = &share_l1->cond_out_not_full; + cond_in_not_full[2] = &share_l2->cond_out_not_full; + cond_out_not_empty[1] = &share_l1->cond_in_not_empty; + cond_out_not_empty[2] = &share_l2->cond_in_not_empty; + cond_out_not_full[1] = &share_l1->cond_in_not_full; + cond_out_not_full[2] = &share_l2->cond_in_not_full; + read_state = IDLE; + return connected; +} + +/** + * Destroy a connection, performing cleanup. + */ +ConnectionMultiApproxBatchShm::~ConnectionMultiApproxBatchShm() { + + + for (int i = 0; i <= LEVELS; i++) { + free(op_queue[i]); + if (i > 0) { + free(buffer_write[i]); + free(buffer_read[i]); + } + + } + + free(op_queue_size); + free(opaque); + free(op_queue); + + delete iagen; + delete keygen; + delete keysize; + delete valuesize; +} + +/** + * Reset the connection back to an initial, fresh state. + */ +void ConnectionMultiApproxBatchShm::reset() { + // FIXME: Actually check the connection, drain all bufferevents, drain op_q. + //assert(op_queue.size() == 0); + //evtimer_del(timer); + read_state = IDLE; + write_state = INIT_WRITE; + stats = ConnectionStats(stats.sampling); +} + + + + +/** + * Get/Set or Set Style + * If a GET command: Issue a get first, if not found then set + * If trace file (or prob. 
write) says to set, then set it + */ +int ConnectionMultiApproxBatchShm::issue_getsetorset(double now) { + + Operation *Op = trace_queue->front(); + if (Op->type == Operation::SASL) { + //cid_rate.insert( {cid, 100 } ); + //fprintf(stderr,"cid %d done before loop\n",cid); + //string op_queue1; + //string op_queue2; + //for (int j = 0; j < 2; j++) { + // for (int i = 0; i < OPAQUE_MAX; i++) { + // if (op_queue[j+1][i] != NULL) { + // if (j == 0) { + // op_queue1 = op_queue1 + "," + op_queue[j+1][i]->key; + // } else { + // op_queue2 = op_queue2 + "," + op_queue[j+1][i]->key; + // } + // } + // } + //} + for (int i = 1; i <= LEVELS; i++) { + if (buffer_write_n[i] > 0) { + send_write_buffer(i); + } + } + eof = 1; + //fprintf(stderr,"cid %d op_queue1: %s op_queue2: %s, op_queue_size1: %d, op_queue_size2: %d\n",cid,op_queue1.c_str(),op_queue2.c_str(),op_queue_size[1],op_queue_size[2]); + return 1; + } + + int issued = issue_op(Op); + trace_queue->pop(); + while (issued != 2) { + Op = trace_queue->front(); + + if (Op->type == Operation::SASL) { + for (int i = 1; i <= LEVELS; i++) { + if (buffer_write_n[i] > 0) { + send_write_buffer(i); + } + } + //string op_queue1; + //string op_queue2; + //for (int j = 0; j < 2; j++) { + // for (int i = 0; i < OPAQUE_MAX; i++) { + // if (op_queue[j+1][i] != NULL) { + // if (j == 0) { + // op_queue1 = op_queue1 + "," + op_queue[j+1][i]->key; + // } else { + // op_queue2 = op_queue2 + "," + op_queue[j+1][i]->key; + // } + // } + // } + //} + //fprintf(stderr,"done in loop cid %d op_queue1: %s op_queue2: %s, op_queue_size1: %d, op_queue_size2: %d\n",cid,op_queue1.c_str(),op_queue2.c_str(),op_queue_size[1],op_queue_size[2]); + eof = 1; + return 1; + } + issued = issue_op(Op); + trace_queue->pop(); + } + + return 0; +} + +int ConnectionMultiApproxBatchShm::send_write_buffer(int level) { + int rc = 1; + pthread_mutex_lock(lock_out[level]); + int to_write = buffer_write_nbytes[level]; + int gtg = bipbuf_unused(bipbuf_out[level]) >= to_write ? 
1 : 0; + while (gtg == 0) { + pthread_cond_wait(cond_out_not_full[level],lock_out[level]); + gtg = bipbuf_unused(bipbuf_out[level]) >= to_write ? 1 : 0; + } + int ret = bipbuf_offer(bipbuf_out[level],buffer_write[level],to_write); + if (ret != to_write) { + fprintf(stderr,"error writing buffer! level %d, size %d\n",level,to_write); + } + *bipbuf_out_bytes[level] += to_write; + //fprintf(stderr,"writing %d to %d, total %d\n",to_write,level,*bipbuf_out_bytes[level]); + issued_queue[level] = buffer_write_n[level]; + buffer_write_n[level] = 0; + buffer_write_pos[level] = buffer_write[level]; + memset(buffer_write_pos[level],0,buffer_write_nbytes[level]); + stats.tx_bytes += buffer_write_nbytes[level]; + buffer_write_nbytes[level] = 0; + rc = 2; + pthread_cond_signal(cond_out_not_empty[level]); + pthread_mutex_unlock(lock_out[level]); + return rc; +} + +int ConnectionMultiApproxBatchShm::add_get_op_to_queue(Operation *pop, int level, int cb) { + + op_queue[level][pop->opaque] = pop; + op_queue_size[level]++; +#ifdef DEBUGS + fprintf(stderr,"cid: %d issing get: %s, size: %u, level %d, flags: %d, opaque: %d\n",cid,pop->key,pop->valuelen,level,pop->flags,pop->opaque); +#endif + + if (opaque[level] > OPAQUE_MAX) { + opaque[level] = 1; + } + + uint16_t keylen = strlen(pop->key); + + // each line is 4-bytes + binary_header_t h = { 0x80, CMD_GET, htons(keylen), + 0x00, 0x00, htons(0), + htonl(keylen) }; + //if (quiet) { + // h.opcode = CMD_GETQ; + //} + h.opaque = htonl(pop->opaque); + + + memcpy(buffer_write_pos[level], &h, 24); + buffer_write_pos[level] += 24; + memcpy(buffer_write_pos[level], pop->key, keylen); + buffer_write_pos[level] += keylen; + buffer_write_n[level]++; + buffer_write_nbytes[level] += 24 + keylen; + + int res = 1; + if (buffer_write_n[level] >= (uint32_t)options.depth) { // && cb == 0) { + res = send_write_buffer(level); + } + return res; +} + +/** + * Issue a get request to the server. 
+ */ +int ConnectionMultiApproxBatchShm::issue_get_with_len(Operation *pop, double now, bool quiet, uint32_t flags, Operation *l1) { + + int level = FLAGS_level(flags); + + //initialize op for sending +#if HAVE_CLOCK_GETTIME + pop->start_time = get_time_accurate(); +#else + if (now == 0.0) { +#if USE_CACHED_TIME + struct timeval now_tv; + event_base_gettimeofday_cached(base, &now_tv); + pop->start_time = tv_to_double(&now_tv); +#else + pop->start_time = get_time(); +#endif + } else { + pop->start_time = now; + } +#endif + + pop->opaque = opaque[level]++; + pop->flags = flags; + if (l1 != NULL) { + pop->l1 = l1; + } + + //put op into queue + return add_get_op_to_queue(pop,level,0); +} + +/** + * Issue a get request to the server. + */ +int ConnectionMultiApproxBatchShm::issue_get_with_len(const char* key, int valuelen, double now, bool quiet, uint32_t flags, Operation *l1) { + + int level = FLAGS_level(flags); + Operation *pop = new Operation(); + +#if HAVE_CLOCK_GETTIME + pop->start_time = get_time_accurate(); +#else + if (now == 0.0) { +#if USE_CACHED_TIME + struct timeval now_tv; + event_base_gettimeofday_cached(base, &now_tv); + pop->start_time = tv_to_double(&now_tv); +#else + pop->start_time = get_time(); +#endif + } else { + pop->start_time = now; + } +#endif + + strncpy(pop->key,key,255); + pop->valuelen = valuelen; + pop->type = Operation::GET; + pop->opaque = opaque[level]++; + pop->flags = flags; + pop->clsid = get_class(valuelen,strlen(key)); + + if (l1 != NULL) { + pop->l1 = l1; + } + + return add_get_op_to_queue(pop,level,1); + +} + +/** + * Issue a get request to the server. 
+ */ +int ConnectionMultiApproxBatchShm::issue_touch(const char* key, int valuelen, double now, int flags) { + + int level = FLAGS_level(flags); + Operation *pop = new Operation(); + +#if HAVE_CLOCK_GETTIME + pop->start_time = get_time_accurate(); +#else + if (now == 0.0) { +#if USE_CACHED_TIME + struct timeval now_tv; + event_base_gettimeofday_cached(base, &now_tv); + pop->start_time = tv_to_double(&now_tv); +#else + pop->start_time = get_time(); +#endif + } else { + pop->start_time = now; + } +#endif + + strncpy(pop->key,key,255); + pop->valuelen = valuelen; + pop->type = Operation::TOUCH; + pop->opaque = opaque[level]++; + pop->flags = flags; + op_queue[level][pop->opaque] = pop; + //op_queue[level].push(op); + op_queue_size[level]++; + + if (opaque[level] > OPAQUE_MAX) { + opaque[level] = 1; + } + +#ifdef DEBUGS + fprintf(stderr,"issing touch: %s, size: %u, level %d, flags: %d, opaque: %d\n",key,valuelen,level,flags,pop->opaque); +#endif + //if (read_state == IDLE) read_state = WAITING_FOR_GET; + uint16_t keylen = strlen(key); + + // each line is 4-bytes + binary_header_t h = { 0x80, CMD_TOUCH, htons(keylen), + 0x04, 0x00, htons(0), + htonl(keylen + 4) }; + h.opaque = htonl(pop->opaque); + + uint32_t exp = 0; + if (flags & ITEM_DIRTY) { + exp = htonl(flags); + } + + memcpy(buffer_write_pos[level], &h, 24); + buffer_write_pos[level] += 24; + memcpy(buffer_write_pos[level], &exp, 4); + buffer_write_pos[level] += 4; + memcpy(buffer_write_pos[level], pop->key, keylen); + buffer_write_pos[level] += keylen; + buffer_write_nbytes[level] += 24 + keylen + 4; + buffer_write_n[level]++; + + int ret = 1; + //if (buffer_write_n[level] == (uint32_t)options.depth) { + // ret = send_write_buffer(level); + //} + + return ret; +} + +/** + * Issue a delete request to the server. 
+ */ +int ConnectionMultiApproxBatchShm::issue_delete(const char* key, double now, uint32_t flags) { + + int level = FLAGS_level(flags); + Operation *pop = new Operation(); + +#if HAVE_CLOCK_GETTIME + pop->start_time = get_time_accurate(); +#else + if (now == 0.0) { +#if USE_CACHED_TIME + struct timeval now_tv; + event_base_gettimeofday_cached(base, &now_tv); + pop->start_time = tv_to_double(&now_tv); +#else + pop->start_time = get_time(); +#endif + } else { + pop->start_time = now; + } +#endif + + strncpy(pop->key,key,255); + pop->type = Operation::DELETE; + pop->opaque = opaque[level]++; + pop->flags = flags; + op_queue[level][pop->opaque] = pop; + op_queue_size[level]++; + + if (opaque[level] > OPAQUE_MAX) { + opaque[level] = 1; + } +#ifdef DEBUGS + fprintf(stderr,"cid: %d issing delete: %s, level %d, flags: %d, opaque: %d\n",cid,key,level,flags,pop->opaque); +#endif + + //if (read_state == IDLE) read_state = WAITING_FOR_GET; + uint16_t keylen = strlen(key); + + // each line is 4-bytes + binary_header_t h = { 0x80, CMD_DELETE, htons(keylen), + 0x00, 0x00, htons(0), + htonl(keylen) }; + h.opaque = htonl(pop->opaque); + + memcpy(buffer_write_pos[level], &h, 24); + buffer_write_pos[level] += 24; + memcpy(buffer_write_pos[level], pop->key, keylen); + buffer_write_pos[level] += keylen; + buffer_write_n[level]++; + buffer_write_nbytes[level] += 24 + keylen; + + int ret = 1; + //if (buffer_write_n[level] >= (uint32_t)options.depth) { + // ret = send_write_buffer(level); + //} + + return ret; +} + +int ConnectionMultiApproxBatchShm::issue_noop(int level) { + + binary_header_t h = { 0x80, CMD_NOOP, 0x0000, + 0x00, 0x00, htons(0), + 0x00 }; + memcpy(buffer_write_pos[level], &h, 24); + buffer_write_pos[level] += 24; + + buffer_write_n[level]++; + buffer_write_nbytes[level] += 24; + + int ret = 1; + //if (buffer_write_n[level] >= (uint32_t)options.depth) { + // ret = send_write_buffer(level); + //} + + return ret; +} + +int 
ConnectionMultiApproxBatchShm::add_set_to_queue(Operation *pop, int level, const char* value, int cb) { + int length = pop->valuelen; + + op_queue[level][pop->opaque] = pop; + //op_queue[level].push(op); + op_queue_size[level]++; +#ifdef DEBUGS + fprintf(stderr,"cid: %d issing set: %s, size: %u, level %d, flags: %d, opaque: %d\n",cid,pop->key,length,level,pop->flags,pop->opaque); +#endif + + if (opaque[level] > OPAQUE_MAX) { + opaque[level] = 1; + } + + uint16_t keylen = strlen(pop->key); + + // each line is 4-bytes + binary_header_t h = { 0x80, CMD_SET, htons(keylen), + 0x08, 0x00, htons(0), + htonl(keylen + 8 + length) }; + h.opaque = htonl(pop->opaque); + + uint32_t f = htonl(pop->flags); + uint32_t exp = 0; + //int to_write = buffer_write_nbytes[level] + 32 + keylen + length; + //int gtg = bipbuf_unused(bipbuf_out[level]) >= to_write ? 1 : 0; + //if (gtg == 0) { + // switch (level) { + // case 1: + // read_callback1(); + // break; + // case 2: + // read_callback2(); + // break; + // } + //} + //fprintf(stderr,"write_n[%d] %d bytes: %d\n",level,buffer_write_n[level],buffer_write_nbytes[level]); + memcpy(buffer_write_pos[level], &h, 24); + buffer_write_pos[level] += 24; + memcpy(buffer_write_pos[level], &f, 4); + buffer_write_pos[level] += 4; + memcpy(buffer_write_pos[level], &exp, 4); + buffer_write_pos[level] += 4; + memcpy(buffer_write_pos[level], pop->key, keylen); + buffer_write_pos[level] += keylen; + memcpy(buffer_write_pos[level], value, length); + buffer_write_pos[level] += length; + buffer_write_n[level]++; + buffer_write_nbytes[level] += length + 32 + keylen; + + int ret = 1; + if (buffer_write_n[level] >= (uint32_t)options.depth) { // && cb == 0) { + ret = send_write_buffer(level); + } + return ret; + +} + +/** + * Issue a set request to the server. 
+ */ +int ConnectionMultiApproxBatchShm::issue_set(Operation *pop, const char* value, double now, uint32_t flags) { + + int level = FLAGS_level(flags); + +#if HAVE_CLOCK_GETTIME + pop->start_time = get_time_accurate(); +#else + if (now == 0.0) pop->start_time = get_time(); + else pop->start_time = now; +#endif + + pop->opaque = opaque[level]++; + pop->flags = flags; + return add_set_to_queue(pop,level,value,0); + +} + +/** + * Issue a set request to the server. + */ +int ConnectionMultiApproxBatchShm::issue_set(const char* key, const char* value, int length, double now, uint32_t flags) { + + int level = FLAGS_level(flags); + Operation *pop = new Operation(); + +#if HAVE_CLOCK_GETTIME + pop->start_time = get_time_accurate(); +#else + if (now == 0.0) pop->start_time = get_time(); + else pop->start_time = now; +#endif + + strncpy(pop->key,key,255); + pop->valuelen = length; + pop->type = Operation::SET; + pop->opaque = opaque[level]++; + pop->flags = flags; + pop->clsid = get_class(length,strlen(key)); + + return add_set_to_queue(pop,level,value,1); + +} + + +/** + * Finish up (record stats) an operation that just returned from the + * server. 
+ */ +void ConnectionMultiApproxBatchShm::finish_op(Operation *op, int was_hit) { + double now; +#if USE_CACHED_TIME + struct timeval now_tv; + event_base_gettimeofday_cached(base, &now_tv); + now = tv_to_double(&now_tv); +#else + now = get_time(); +#endif +#if HAVE_CLOCK_GETTIME + op->end_time = get_time_accurate(); +#else + op->end_time = now; +#endif + + if (was_hit) { + switch (op->type) { + case Operation::GET: + switch (OP_level(op)) { + case 1: + stats.log_get_l1(*op); + break; + case 2: + stats.log_get_l2(*op); + if (op->l1 != NULL) { + op->l1->end_time = now; + stats.log_get(*(op->l1)); + } + break; + } + break; + case Operation::SET: + switch (OP_level(op)) { + case 1: + stats.log_set_l1(*op); + break; + case 2: + stats.log_set_l2(*op); + break; + } + break; + case Operation::DELETE: break; + case Operation::TOUCH: break; + default: DIE("Not implemented."); + } + } + + last_rx = now; + uint8_t level = OP_level(op); + if (op->l1 != NULL) { + //delete op_queue[1][op->l1->opaque]; + if (op->l1 == op_queue[1][op->l1->opaque]) { + op_queue[1][op->l1->opaque] = 0; + if (op_queue_size[1] > 0) { + op_queue_size[1]--; + } else { + fprintf(stderr,"chained op_Queue_size[%d] out of sync!!\n",1); + } + delete op->l1; + } else { + fprintf(stderr,"op_queue out of sync! Expected %p, got %p, opa1: %d opaq2: %d\n", + op,op_queue[1][op->opaque],op->opaque,op_queue[1][op->opaque]->opaque); + } + } + //op_queue[level].erase(op_queue[level].begin()+opopq); + if (op == op_queue[level][op->opaque] && + op->opaque == op_queue[level][op->opaque]->opaque) { + //delete op_queue[level][op->opaque]; + op_queue[level][op->opaque] = 0; + delete op; + if (op_queue_size[level] > 0) { + op_queue_size[level]--; + } else { + fprintf(stderr,"op_Queue_size[%d] out of sync!!\n",level); + } + } else { + fprintf(stderr,"op_queue out of sync! 
Expected %p, got %p, opa1: %d opaq2: %d\n", + op,op_queue[level][op->opaque],op->opaque,op_queue[level][op->opaque]->opaque); + } + read_state = IDLE; + +} + + + +/** + * Request generation loop. Determines whether or not to issue a new command, + * based on timer events. + * + * Note that this function loops. Be wary of break vs. return. + */ +void ConnectionMultiApproxBatchShm::drive_write_machine_shm(double now) { + + while (trace_queue->size() > 0) { + Operation *Op = trace_queue->front(); + if (Op == NULL || trace_queue->size() <= 0 || Op->type == Operation::SASL) { + eof = 1; + for (int i = 1; i <= LEVELS; i++) { + if (buffer_write_n[i] > 0) { + send_write_buffer(i); + } + } + + cid_rate.insert( {cid, 100 } ); + fprintf(stderr,"cid %d done\n",cid); + string op_queue1; + string op_queue2; + for (int j = 0; j < 2; j++) { + for (int i = 0; i < OPAQUE_MAX; i++) { + if (op_queue[j+1][i] != NULL) { + if (j == 0) { + op_queue1 = op_queue1 + "," + op_queue[j+1][i]->key; + } else { + op_queue2 = op_queue2 + "," + op_queue[j+1][i]->key; + } + } + } + } + fprintf(stderr,"cid %d op_queue1: %s op_queue2: %s, op_queue_size1: %d, op_queue_size2: %d\n",cid,op_queue1.c_str(),op_queue2.c_str(),op_queue_size[1],op_queue_size[2]); + return; + } + int issued = 0; + while (issued != 2) { + Op = trace_queue->front(); + + if (Op->type == Operation::SASL) { + for (int i = 1; i <= LEVELS; i++) { + if (buffer_write_n[i] > 0) { + send_write_buffer(i); + } + } + //string op_queue1; + //string op_queue2; + //for (int j = 0; j < 2; j++) { + // for (int i = 0; i < OPAQUE_MAX; i++) { + // if (op_queue[j+1][i] != NULL) { + // if (j == 0) { + // op_queue1 = op_queue1 + "," + op_queue[j+1][i]->key; + // } else { + // op_queue2 = op_queue2 + "," + op_queue[j+1][i]->key; + // } + // } + // } + //} + //fprintf(stderr,"done in loop cid %d op_queue1: %s op_queue2: %s, op_queue_size1: %d, op_queue_size2: %d\n",cid,op_queue1.c_str(),op_queue2.c_str(),op_queue_size[1],op_queue_size[2]); + eof = 1; + 
return; + } + issued = issue_op(Op); //this will return 2 if the write buffer was sent (i.e. buffer has depth commands) + trace_queue->pop(); + } + if ( (int)(issued_queue[1]) > 0) { + read_callback1(); + issued_queue[1] = 0; + } + if ( (int)(issued_queue[2]) > 0) { + read_callback2(); + issued_queue[2] = 0; + } + } +} + +size_t ConnectionMultiApproxBatchShm::handle_response_batch(unsigned char *rbuf_pos, resp_t *resp, + size_t read_bytes, size_t consumed_bytes, + int level, int extra) { + if (rbuf_pos[0] != 129) { + fprintf(stderr,"cid %d we don't have a valid header %u\n",cid,rbuf_pos[0]); + //buffer_read_pos[level] = rbuf_pos; + //buffer_read_n[level] = 1; + return 0; + } + if ((read_bytes+extra - consumed_bytes) < 24) { + size_t have = (read_bytes+extra) - (consumed_bytes); + size_t need = 24 - (have); + buffer_read_n[level] = need; + buffer_read_nbytes[level] = have; + memcpy(buffer_leftover[level],rbuf_pos,have); + //buffer_lasthdr[level] = rbuf_pos; + //buffer_read_n[level] = need; + //buffer_read_nbytes[level] = have; + fprintf(stderr,"cid %d - we don't have enough header data, need %lu more bytes, have %lu (targetLen: %d) (read_bytes %ld) (extra %d) %d)\n",cid,need,have,24,read_bytes,extra,level); + return 0; + + } + + binary_header_t* h = reinterpret_cast(rbuf_pos); + uint32_t bl = ntohl(h->body_len); + uint16_t kl = ntohs(h->key_len); + uint8_t el = h->extra_len; + int targetLen = 24 + bl; + if (consumed_bytes + targetLen > (read_bytes+extra)) { + size_t have = (read_bytes+extra) - (consumed_bytes); + size_t need = targetLen - (have); + buffer_read_n[level] = need; + buffer_read_nbytes[level] = have; + memcpy(buffer_leftover[level],rbuf_pos,have); + fprintf(stderr,"cid %d - we don't have enough data, need %lu more bytes, have %lu (targetLen: %d) (read_bytes %ld) (extra %d) %d)\n",cid,need,have,targetLen,read_bytes,extra,level); + return 0; + } + + resp->opcode = h->opcode; + resp->opaque = ntohl(h->opaque); + uint16_t status = ntohs(h->status); +#ifdef 
DEBUGMC + fprintf(stderr,"cid: %d handle resp from l%d - opcode: %u opaque: %u keylen: %u extralen: %u datalen: %u status: %u\n",cid,level, + h->opcode,ntohl(h->opaque),ntohs(h->key_len),h->extra_len, + ntohl(h->body_len),ntohs(h->status)); +#endif + // If something other than success, count it as a miss + if (resp->opcode == CMD_GET && status == RESP_NOT_FOUND) { + switch(level) { + case 1: + stats.get_misses_l1++; + break; + case 2: + stats.get_misses_l2++; + stats.get_misses++; + stats.window_get_misses++; + break; + } + resp->found = false; + } else if (resp->opcode == CMD_SET && kl > 0) { + //first data is extras: clsid, flags, eflags + if (resp->evict) { + unsigned char *buf = rbuf_pos + 24; + resp->evict->clsid = *((uint32_t*)buf); + resp->evict->clsid = ntohl(resp->evict->clsid); + buf += 4; + + resp->evict->serverFlags = *((uint32_t*)buf); + resp->evict->serverFlags = ntohl(resp->evict->serverFlags); + buf += 4; + + resp->evict->evictedFlags = *((uint32_t*)buf); + resp->evict->evictedFlags = ntohl(resp->evict->evictedFlags); + buf += 4; + + resp->evict->evictedKeyLen = kl; + resp->evict->evictedKey = (char*)malloc(kl+1); + memset(resp->evict->evictedKey,0,kl+1); + memcpy(resp->evict->evictedKey,buf,kl); + buf += kl; + + resp->evict->evictedLen = bl - kl - el; + resp->evict->evictedData = (char*)malloc(resp->evict->evictedLen); + memcpy(resp->evict->evictedData,buf,resp->evict->evictedLen); + resp->evict->evicted = true; + } + } else if ( (resp->opcode == CMD_DELETE || resp->opcode == CMD_TOUCH) && + status == RESP_NOT_FOUND) { + resp->found = false; + } + this->stats.rx_bytes += targetLen; + return targetLen; +} + + +size_t ConnectionMultiApproxBatchShm::fill_read_buffer(int level, int *extra) { + + size_t read_bytes = 0; + + pthread_mutex_lock(lock_in[level]); + //int len = *bipbuf_in_bytes[level]; + int len = bipbuf_used(bipbuf_in[level]); + while (len == 0) { + pthread_cond_wait(cond_in_not_empty[level],lock_in[level]); + //len = 
*bipbuf_in_bytes[level]; + len = bipbuf_used(bipbuf_in[level]); + } + unsigned int all = 0; + + + if (buffer_read_n[level] != 0) { + uint32_t have = buffer_read_nbytes[level]; + fprintf(stderr,"already have %u\n",have); + //if ((size_t)len < buffer_read_n[level]) { + // pthread_mutex_unlock(lock_in[level]); + // return 0; + //} + unsigned char* input = bipbuf_peek_all(bipbuf_in[level],&all); + if (!input || all == 0) { + if (!input && all > 0) + fprintf(stderr,"cid %d expected %d on level %d (already have %u)\n",cid,all,level,have); + pthread_mutex_unlock(lock_in[level]); + return 0; + } + memcpy(buffer_read[level],buffer_leftover[level],have); + buffer_read_pos[level] = buffer_read[level]; + memcpy(buffer_read_pos[level]+have,input,all); + read_bytes = all; + *extra = have; + buffer_read_n[level] = 0; + buffer_read_nbytes[level] = 0; + + } else { + unsigned char *input = bipbuf_peek_all(bipbuf_in[level],&all); + if (!input || all == 0) { + if (!input && all > 0) + fprintf(stderr,"cid %d expected %d on level %d\n",cid,all,level); + pthread_mutex_unlock(lock_in[level]); + return 0; + } + read_bytes = all; + buffer_read_pos[level] = input; +#ifdef DEBUGMC + fprintf(stderr,"read %d of %d (avail: %d) on l%d\n",all,*bipbuf_in_bytes[level],len,level); +#endif + //memcpy(buffer_read_pos[level],input,len); + + *extra = 0; + } + if (read_bytes == 0) { + fprintf(stderr,"cid %d read 0 bytes\n",cid); + } + pthread_mutex_unlock(lock_in[level]); + return read_bytes; +} +/** + * Handle incoming data (responses). 
+ */ +void ConnectionMultiApproxBatchShm::read_callback1() { + + int level = 1; + int extra = 0; + size_t read_bytes = 0; + + read_bytes = fill_read_buffer(level,&extra); + if (read_bytes == 0) { + pthread_mutex_lock(lock_in[level]); + pthread_cond_signal(cond_in_not_full[level]); + pthread_mutex_unlock(lock_in[level]); + return; + } + + //fprintf(stderr,"cid %d l1 read: %lu\n",cid,read_bytes); + size_t consumed_bytes = 0; + size_t batch = options.depth; + //we have at least some data to read + size_t nread_ops = 0; + while (1) { + evicted_t *evict = (evicted_t*)malloc(sizeof(evicted_t)); + memset(evict,0,sizeof(evicted_t)); + resp_t mc_resp; + mc_resp.found = true; + mc_resp.evict = evict; + size_t cbytes = handle_response_batch(buffer_read_pos[level],&mc_resp,read_bytes,consumed_bytes,level,extra); + if (cbytes == 0) { + if (evict) { + if (evict->evictedKey) free(evict->evictedKey); + if (evict->evictedData) free(evict->evictedData); + free(evict); + } + break; + } + buffer_read_pos[level] = buffer_read_pos[level] + cbytes; + consumed_bytes += cbytes; + uint32_t opaque = mc_resp.opaque; + bool found = mc_resp.found; + + Operation *op = op_queue[level][opaque]; +#ifdef DEBUGMC + char out[128]; + sprintf(out,"l1 cid %u, reading opaque: %u\n",cid,opaque); + write(2,out,strlen(out)); + output_op(op,2,found); +#endif + + double now = get_time(); + int wb = 0; + if (options.rand_admit) { + wb = (rand() % options.rand_admit); + } + int vl = op->valuelen; + int flags = OP_clu(op); + switch (op->type) { + case Operation::GET: + if ( !found && (options.getset || options.getsetorset) ) { + /* issue a get a l2 */ + issue_get_with_len(op->key,vl,now,false, flags | SRC_L1_M | ITEM_L2 | LOG_OP, op); + op->end_time = now; + this->stats.log_get_l1(*op); + + } else { + if (OP_incl(op) && ghits >= gloc) { + issue_touch(op->key,vl,now, ITEM_L2 | SRC_L1_H); + gloc += rand()%(10*2-1)+1; + } + ghits++; + finish_op(op,1); + } + break; + case Operation::SET: + //if (OP_src(op) == 
SRC_L1_COPY || + // OP_src(op) == SRC_L2_M) { + // del_copy_keys(string(op->key)); + //} + if (evict->evicted) { + string wb_key(evict->evictedKey); + if ((evict->evictedFlags & ITEM_INCL) && (evict->evictedFlags & ITEM_DIRTY)) { + //int ret = add_to_wb_keys(wb_key); + //if (ret == 1) { + issue_set(evict->evictedKey, evict->evictedData, evict->evictedLen, now, ITEM_L2 | ITEM_INCL | LOG_OP | SRC_WB | ITEM_DIRTY); + //} + this->stats.incl_wbs++; + } else if (evict->evictedFlags & ITEM_EXCL) { + //fprintf(stderr,"excl writeback %s\n",evict->evictedKey); + //strncpy(wb_key,evict->evictedKey,255); + if ( (options.rand_admit && wb == 0) || + (options.threshold && (g_key_hist[wb_key] == 1)) || + (options.wb_all) ) { + //int ret = add_to_wb_keys(wb_key); + //if (ret == 1) { + issue_set(evict->evictedKey, evict->evictedData, evict->evictedLen, now, ITEM_L2 | ITEM_EXCL | LOG_OP | SRC_WB); + //} + this->stats.excl_wbs++; + } + } + if (OP_src(op) == SRC_DIRECT_SET) { + if ( (evict->serverFlags & ITEM_SIZE_CHANGE) || ((evict->serverFlags & ITEM_WAS_HIT) == 0)) { + this->stats.set_misses_l1++; + } else if (OP_excl(op) && evict->serverFlags & ITEM_WAS_HIT) { + this->stats.set_excl_hits_l1++; + } else if (OP_incl(op) && evict->serverFlags & ITEM_WAS_HIT) { + this->stats.set_incl_hits_l1++; + } + } + } + finish_op(op,1); + break; + case Operation::TOUCH: + finish_op(op,1); + break; + default: + fprintf(stderr,"op: %p, key: %s opaque: %u\n",(void*)op,op->key,op->opaque); + DIE("not implemented"); + } + + if (evict) { + if (evict->evictedKey) free(evict->evictedKey); + if (evict->evictedData) free(evict->evictedData); + free(evict); + } + nread_ops++; + if (buffer_read_pos[level][0] != 129 || (read_bytes - consumed_bytes == 0)) { + break; + } + //if (buffer_read_pos[level][0] != 129) { + // //fprintf(stderr,"cid %d we don't have a valid header post %u\n",cid,buffer_read_pos[level][0]); + // break; + //} + } + pthread_mutex_lock(lock_in[level]); + 
bipbuf_poll(bipbuf_in[level],read_bytes); + *bipbuf_in_bytes[level] = *bipbuf_in_bytes[level] - read_bytes; + pthread_cond_signal(cond_in_not_full[level]); + pthread_mutex_unlock(lock_in[level]); + double now = get_time(); + last_tx = now; + stats.log_op(op_queue_size[1]); + stats.log_op(op_queue_size[2]); + + +} + +/** + * Handle incoming data (responses). + */ +void ConnectionMultiApproxBatchShm::read_callback2() { + + int level = 2; + int extra = 0; + + size_t read_bytes = 0; + + read_bytes = fill_read_buffer(level,&extra); + if (read_bytes == 0) { + pthread_mutex_lock(lock_in[level]); + pthread_cond_signal(cond_in_not_full[level]); + pthread_mutex_unlock(lock_in[level]); + return; + } + + //fprintf(stderr,"l2 read: %lu\n",read_bytes); + size_t consumed_bytes = 0; + size_t batch = options.depth; + size_t nread_ops = 0; + while (1) { + evicted_t *evict = NULL; + resp_t mc_resp; + mc_resp.found = true; + mc_resp.evict = evict; + size_t cbytes = handle_response_batch(buffer_read_pos[level],&mc_resp,read_bytes,consumed_bytes,level,extra); + if (cbytes == 0) { + break; + } + buffer_read_pos[level] = buffer_read_pos[level] + cbytes; + consumed_bytes += cbytes; + uint32_t opaque = mc_resp.opaque; + bool found = mc_resp.found; + + Operation *op = op_queue[level][opaque]; +#ifdef DEBUGMC + char out[128]; + sprintf(out,"l2 cid %u, reading opaque: %u\n",cid,opaque); + write(2,out,strlen(out)); + output_op(op,2,found); +#endif + double now = get_time(); + switch (op->type) { + case Operation::GET: + if ( !found && (options.getset || options.getsetorset) ) { + int valuelen = op->valuelen; + int index = lrand48() % (1024 * 1024); + int flags = OP_clu(op) | SRC_L2_M | LOG_OP; + //int ret = add_to_copy_keys(string(op->key)); + //if (ret == 1) { + issue_set(op->key, &random_char[index], valuelen, now, flags | ITEM_L1); + if (OP_incl(op)) { + issue_set(op->key, &random_char[index], valuelen, now, flags | ITEM_L2); + } + //} + finish_op(op,0); // sets read_state = IDLE + } else { 
+          if (found) {
+            int valuelen = op->valuelen;
+            int index = lrand48() % (1024 * 1024);
+            int flags = OP_clu(op) | ITEM_L1 | SRC_L1_COPY;
+            string key = string(op->key);
+            const char *data = &random_char[index];
+            //int ret = add_to_copy_keys(string(op->key));
+            //if (ret == 1) {
+            issue_set(op->key,data,valuelen, now, flags);
+            //}
+            this->stats.copies_to_l1++;
+            finish_op(op,1);
+
+          } else {
+            finish_op(op,0);
+          }
+        }
+        break;
+      case Operation::SET:
+        //if (OP_src(op) == SRC_WB) {
+        //  del_wb_keys(string(op->key));
+        //}
+        finish_op(op,1);
+        break;
+      case Operation::TOUCH:
+        if (OP_src(op) == SRC_DIRECT_SET || OP_src(op) == SRC_L1_H) {
+          int valuelen = op->valuelen;
+          if (!found) {
+            int index = lrand48() % (1024 * 1024);
+            issue_set(op->key, &random_char[index],valuelen,now, ITEM_INCL | ITEM_L2 | LOG_OP | SRC_L2_M);
+            this->stats.set_misses_l2++;
+          } else {
+            if (OP_src(op) == SRC_DIRECT_SET) {
+              issue_touch(op->key,valuelen,now, ITEM_L1 | SRC_L2_H | ITEM_DIRTY);
+            }
+          }
+          //del_touch_keys(string(op->key));
+        }
+        finish_op(op,0);
+        break;
+      case Operation::DELETE:
+        //check to see if it was a hit
+        //fprintf(stderr," del %s -- %d from %d\n",op->key.c_str(),found,OP_src(op));
+        if (OP_src(op) == SRC_DIRECT_SET) {
+          if (found) {
+            this->stats.delete_hits_l2++;
+          } else {
+            this->stats.delete_misses_l2++;
+          }
+        }
+        finish_op(op,1);
+        break;
+      default:
+        fprintf(stderr,"op: %p, key: %s opaque: %u\n",(void*)op,op->key,op->opaque);
+        DIE("not implemented");
+    }
+    nread_ops++;
+    if (buffer_read_pos[level][0] != 129 || (read_bytes - consumed_bytes == 0)) {
+      break;
+    }
+  }
+  //if (buffer_read_n[level] == 0) {
+  //  memset(buffer_read[level],0,read_bytes);
+  //}
+  //if (nread_ops == 0) {
+  //  fprintf(stderr,"ugh l2 only got: %lu ops expected %lu\n",nread_ops,batch);
+  //}
+
+  pthread_mutex_lock(lock_in[level]);
+  bipbuf_poll(bipbuf_in[level],read_bytes);
+  *bipbuf_in_bytes[level] = *bipbuf_in_bytes[level] - read_bytes;
+  pthread_cond_signal(cond_in_not_full[level]);
+  
pthread_mutex_unlock(lock_in[level]); + + + double now = get_time(); + last_tx = now; + stats.log_op(op_queue_size[2]); + stats.log_op(op_queue_size[1]); + +} + diff --git a/ConnectionMultiApproxShm.cc b/ConnectionMultiApproxShm.cc new file mode 100644 index 0000000..e3c006d --- /dev/null +++ b/ConnectionMultiApproxShm.cc @@ -0,0 +1,1772 @@ +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "config.h" + +#include "Connection.h" +#include "distributions.h" +#include "Generator.h" +#include "mutilate.h" +#include "binary_protocol.h" +#include "util.h" +#include +#include +#include +#include +#include +#include "blockingconcurrentqueue.h" +#include "bipbuffer.h" +//#include +//#include + + + +#define ITEM_L1 1 +#define ITEM_L2 2 +#define LOG_OP 4 +#define SRC_L1_M 8 +#define SRC_L1_H 16 +#define SRC_L2_M 32 +#define SRC_L2_H 64 +#define SRC_DIRECT_SET 128 +#define SRC_L1_COPY 256 +#define SRC_WB 512 + +#define ITEM_INCL 4096 +#define ITEM_EXCL 8192 +#define ITEM_DIRTY 16384 +#define ITEM_SIZE_CHANGE 131072 +#define ITEM_WAS_HIT 262144 + +#define LEVELS 2 +#define SET_INCL(incl,flags) \ + switch (incl) { \ + case 1: \ + flags |= ITEM_INCL; \ + break; \ + case 2: \ + flags |= ITEM_EXCL; \ + break; \ + \ + } \ + +#define GET_INCL(incl,flags) \ + if (flags & ITEM_INCL) incl = 1; \ + else if (flags & ITEM_EXCL) incl = 2; \ + +//#define OP_level(op) ( ((op)->flags & ITEM_L1) ? 
ITEM_L1 : ITEM_L2 ) +#define OP_level(op) ( (op)->flags & ~(LOG_OP | \ + ITEM_INCL | ITEM_EXCL | ITEM_DIRTY | \ + SRC_L1_M | SRC_L1_H | SRC_L2_M | SRC_L2_H | \ + SRC_DIRECT_SET | SRC_L1_COPY | SRC_WB ) ) + +#define FLAGS_level(flags) ( flags & ~(LOG_OP | \ + ITEM_INCL | ITEM_EXCL | ITEM_DIRTY | \ + SRC_L1_M | SRC_L1_H | SRC_L2_M | SRC_L2_H | \ + SRC_DIRECT_SET | SRC_L1_COPY | SRC_WB ) ) + +#define OP_clu(op) ( (op)->flags & ~(LOG_OP | \ + ITEM_L1 | ITEM_L2 | ITEM_DIRTY | \ + SRC_L1_M | SRC_L1_H | SRC_L2_M | SRC_L2_H | \ + SRC_DIRECT_SET | SRC_L1_COPY | SRC_WB ) ) + +#define OP_src(op) ( (op)->flags & ~(ITEM_L1 | ITEM_L2 | LOG_OP | \ + ITEM_INCL | ITEM_EXCL | ITEM_DIRTY ) ) + +#define OP_log(op) ((op)->flags & LOG_OP) +#define OP_incl(op) ((op)->flags & ITEM_INCL) +#define OP_excl(op) ((op)->flags & ITEM_EXCL) +#define OP_set_flag(op,flag) ((op))->flags |= flag; + +//#define DEBUGMC +//#define DEBUGS +//using namespace folly; +using namespace moodycamel; +//using namespace fmt; + +//struct node { +// long long addr,label; +// node *nxt; +// node(long long _addr = 0, long long _label = 0, node *_nxt = NULL) +// : addr(_addr),label(_label),nxt(_nxt) {} +//}; +// +//struct tnode { +// long long tm,offset; int size; +//};//trace file data structure +// +//long long find(long long addr) { +// int t = addr%MAXH; +// node *tmp = hash[t],*pre = NULL; +// while (tmp) { +// if (tmp->addr == addr) { +// long long tlabel = tmp->label; +// if (pre == NULL) hash[t] = tmp->nxt; +// else pre->nxt = tmp->nxt; +// delete tmp; +// return tlabel; +// } +// pre = tmp; +// tmp = tmp->nxt; +// } +// return 0; +//} +// +//void insert(long long addr ) { +// int t = addr%MAXH; +// node *tmp = new node(addr,n,hash[t]); +// hash[t] = tmp; +//} + + + +pthread_mutex_t cid_lock_m_approx_shm = PTHREAD_MUTEX_INITIALIZER; +static uint32_t connids_m = 1; + +#define NCLASSES 40 +#define CHUNK_ALIGN_BYTES 8 +static int classes = 0; +static int sizes[NCLASSES+1]; +static int inclusives[NCLASSES+1]; + + 
+
+static void init_inclusives(char *inclusive_str) {
+  int j = 1;
+  for (int i = 0; i < (int)strlen(inclusive_str); i++) {
+    if (inclusive_str[i] == '-') {
+      continue;
+    } else {
+      inclusives[j] = inclusive_str[i] - '0';
+      j++;
+    }
+  }
+}
+
+static void init_classes() {
+
+  double factor = 1.25;
+  //unsigned int chunk_size = 48;
+  //unsigned int item_size = 24;
+  unsigned int size = 96; //warning if you change this you die
+  unsigned int i = 0;
+  unsigned int chunk_size_max = 1048576/2;
+  while (++i < NCLASSES-1) {
+    if (size >= chunk_size_max / factor) {
+      break;
+    }
+    if (size % CHUNK_ALIGN_BYTES)
+      size += CHUNK_ALIGN_BYTES - (size % CHUNK_ALIGN_BYTES);
+    sizes[i] = size;
+    size *= factor;
+  }
+  sizes[i] = chunk_size_max;
+  classes = i;
+
+}
+
+static int get_class(int vl, uint32_t kl) {
+  //warning if you change this you die
+  int vsize = vl+kl+48+1+2;
+  int res = 1;
+  while (vsize > sizes[res])
+    if (res++ == classes) {
+      //fprintf(stderr,"item larger than max class size. vsize: %d, class size: %d\n",vsize,sizes[res]);
+      return -1;
+    }
+  return res;
+}
+
+static int get_incl(int vl, int kl) {
+  int clsid = get_class(vl,kl);
+  if (clsid > 0) {
+    return inclusives[clsid];
+  } else {
+    return -1;
+  }
+}
+
+void ConnectionMultiApproxShm::output_op(Operation *op, int type, bool found) {
+  char output[1024];
+  char k[256];
+  char a[256];
+  char s[256];
+  memset(k,0,256);
+  memset(a,0,256);
+  memset(s,0,256);
+  strncpy(k,op->key,255);
+  switch (type) {
+    case 0: //get
+      sprintf(a,"issue_get");
+      break;
+    case 1: //set
+      sprintf(a,"issue_set");
+      break;
+    case 2: //resp
+      sprintf(a,"resp");
+      break;
+  }
+  switch(read_state) {
+    case INIT_READ:
+      sprintf(s,"init");
+      break;
+    case CONN_SETUP:
+      sprintf(s,"setup");
+      break;
+    case LOADING:
+      sprintf(s,"load");
+      break;
+    case IDLE:
+      sprintf(s,"idle");
+      break;
+    case WAITING_FOR_GET:
+      sprintf(s,"waiting for get");
+      break;
+    case WAITING_FOR_SET:
+      sprintf(s,"waiting for set");
+      break;
+    case WAITING_FOR_DELETE:
+      
sprintf(s,"waiting for del"); + break; + case MAX_READ_STATE: + sprintf(s,"max"); + break; + } + if (type == 2) { + sprintf(output,"conn: %u, action: %s op: %s, opaque: %u, found: %d, type: %d\n",cid,a,k,op->opaque,found,op->type); + } else { + sprintf(output,"conn: %u, action: %s op: %s, opaque: %u, type: %d\n",cid,a,k,op->opaque,op->type); + } + write(2,output,strlen(output)); +} + +//extern USPMCQueue g_trace_queue; +//static vector cid_rate; +//extern ConcurrentHashMap cid_rate; +extern unordered_map cid_rate; +//extern ConcurrentHashMap> copy_keys; +extern unordered_map> copy_keys; +extern unordered_map touch_keys; +extern unordered_map> wb_keys; +//extern ConcurrentHashMap> wb_keys; + +extern map g_key_hist; +extern int max_n[3]; + +/** + * Create a new connection to a server endpoint. + */ +ConnectionMultiApproxShm::ConnectionMultiApproxShm(options_t _options, + bool sampling) : + start_time(0), stats(sampling), options(_options) +{ + pthread_mutex_lock(&cid_lock_m_approx_shm); + cid = connids_m++; + if (cid == 1) { + init_classes(); + init_inclusives(options.inclusives); + } + cid_rate.insert( { cid, 0 } ); + + pthread_mutex_unlock(&cid_lock_m_approx_shm); + + valuesize = createGenerator(options.valuesize); + keysize = createGenerator(options.keysize); + srand(time(NULL)); + keygen = new KeyGenerator(keysize, options.records); + + total = 0; + eof = 0; + o_percent = 0; + + if (options.lambda <= 0) { + iagen = createGenerator("0"); + } else { + D("iagen = createGenerator(%s)", options.ia); + iagen = createGenerator(options.ia); + iagen->set_lambda(options.lambda); + } + + read_state = IDLE; + write_state = INIT_WRITE; + last_quiet1 = false; + last_quiet2 = false; + + last_tx = last_rx = 0.0; + gets = 0; + ghits = 0; + esets = 0; + isets = 0; + gloc = rand() % (10*2-1)+1; + sloc = rand() % (10*2-1)+1; + iloc = rand() % (10*2-1)+1; + + op_queue_size = (uint32_t*)malloc(sizeof(uint32_t)*(LEVELS+1)); + opaque = (uint32_t*)malloc(sizeof(uint32_t)*(LEVELS+1)); + 
op_queue = (Operation***)malloc(sizeof(Operation**)*(LEVELS+1)); + + for (int i = 0; i <= LEVELS; i++) { + op_queue_size[i] = 0; + opaque[i] = 1; + //op_queue[i] = (Operation*)malloc(sizeof(int)*OPAQUE_MAX); + op_queue[i] = (Operation**)malloc(sizeof(Operation*)*(OPAQUE_MAX+1)); + for (int j = 0; j <= OPAQUE_MAX; j++) { + op_queue[i][j] = NULL; + } + + } + + read_state = IDLE; +} + + +void ConnectionMultiApproxShm::set_queue(queue* a_trace_queue) { + trace_queue = a_trace_queue; + trace_queue_n = a_trace_queue->size(); +} + +void ConnectionMultiApproxShm::set_lock(pthread_mutex_t* a_lock) { + lock = a_lock; +} + +void ConnectionMultiApproxShm::set_g_wbkeys(unordered_map> *a_wb_keys) { + g_wb_keys = a_wb_keys; +} + +uint32_t ConnectionMultiApproxShm::get_cid() { + return cid; +} + +int ConnectionMultiApproxShm::add_to_wb_keys(string key) { + auto pos = wb_keys.find(key); + if (pos == wb_keys.end()) { + wb_keys.insert( {key, vector() }); + return 1; + } + return 2; +} + +int ConnectionMultiApproxShm::add_to_copy_keys(string key) { + auto pos = copy_keys.find(key); + if (pos == copy_keys.end()) { + copy_keys.insert( {key, vector() }); + return 1; + } + return 2; +} + + +void ConnectionMultiApproxShm::del_copy_keys(string key) { + + auto position = copy_keys.find(key); + if (position != copy_keys.end()) { + vector op_list = vector(position->second); + copy_keys.erase(position); + for (auto it = op_list.begin(); it != op_list.end(); ++it) { + issue_op(*it); + } + } else { + fprintf(stderr,"expected %s, got nuthin\n",key.c_str()); + } +} + +int ConnectionMultiApproxShm::add_to_touch_keys(string key) { + //return touch_keys.assign_if_equal( key, NULL, cid ) != NULL ? 
1 : 2; + auto pos = touch_keys.find(key); + if (pos == touch_keys.end()) { + touch_keys.insert( {key, cid }); + return 1; + } + return 2; +} + + +void ConnectionMultiApproxShm::del_touch_keys(string key) { + //touch_keys.erase(key); + auto position = touch_keys.find(key); + if (position != touch_keys.end()) { + touch_keys.erase(position); + } else { + fprintf(stderr,"expected %s, got nuthin\n",key.c_str()); + } +} + +int ConnectionMultiApproxShm::issue_op(Operation *Op) { + double now = get_time(); + int issued = 0; + int incl = get_incl(Op->valuelen,strlen(Op->key)); + int cid = get_class(Op->valuelen,strlen(Op->key)); + Op->clsid = cid; + int flags = 0; + int index = lrand48() % (1024 * 1024); + //int touch = 1; + SET_INCL(incl,flags); + + switch(Op->type) + { + case Operation::GET: + //if (nissued < options.depth-1) { + // issued = issue_get_with_len(key, vl, now, false, 1, flags, 0, 1); + // last_quiet1 = false; + //} else { + //} + issued = issue_get_with_len(Op, now, false, flags | LOG_OP | ITEM_L1); + last_quiet1 = false; + this->stats.gets++; + gets++; + this->stats.gets_cid[cid]++; + + break; + case Operation::SET: + if (last_quiet1) { + //issue_noop(1); + } + if (incl == 1) { + if (isets >= iloc) { + //if (1) { + const char *data = &random_char[index]; + issued = issue_set(Op, data, now, flags | LOG_OP | ITEM_L1 | SRC_DIRECT_SET); + //int ret = add_to_touch_keys(string(Op->key)); + //if (ret == 1) { + issue_touch(Op->key,Op->valuelen,now, ITEM_L2 | SRC_DIRECT_SET); + //} + iloc += rand()%(10*2-1)+1; + } else { + issued = issue_set(Op, &random_char[index], now, flags | LOG_OP | ITEM_L1 | SRC_DIRECT_SET | ITEM_DIRTY); + } + isets++; + } else if (incl == 2) { + issued = issue_set(Op, &random_char[index], now, flags | LOG_OP | ITEM_L1 | SRC_DIRECT_SET); + if (esets >= sloc) { + issue_delete(Op->key,now,ITEM_L2 | SRC_DIRECT_SET); + sloc += rand()%(10*2-1)+1; + } + esets++; + } + last_quiet1 = false; + this->stats.sets++; + this->stats.sets_cid[cid]++; + break; 
+ case Operation::DELETE: + case Operation::TOUCH: + case Operation::NOOP: + case Operation::SASL: + fprintf(stderr,"invalid line: %s, vl: %d\n",Op->key,Op->valuelen); + break; + + } + return issued; +} + +void ConnectionMultiApproxShm::del_wb_keys(string key) { + + auto position = wb_keys.find(key); + if (position != wb_keys.end()) { + vector op_list = vector(position->second); + wb_keys.erase(position); + for (auto it = op_list.begin(); it != op_list.end(); ++it) { + issue_op(*it); + } + } else { + fprintf(stderr,"expected %s, got nuthin\n",key.c_str()); + } +} + + +int ConnectionMultiApproxShm::do_connect() { + + + //the client should see for this cid, where the shared memory is + typedef struct shared_ { + bipbuf_t bipbuf_in; + bipbuf_t bipbuf_out; + pthread_mutex_t lock_in; + pthread_mutex_t lock_out; + int shared_id; + } shared_t; + + //this cid gets shared memory + // ftok to generate unique key + //char shmkey[64]; + //sprintf(shmkey,"shmfilel1%d",cid); + int id = cid+100; + //key_t key = ftok(shmkey,id); + + // shmget returns an identifier in shmid + int shmid = shmget(id,sizeof(shared_t),0666); + + // shmat to attach to shared memory + shared_t* share_l1 = (shared_t*) shmat(shmid,(void*)0,0); + + fprintf(stderr,"cid %d gets shared memory buf l1 %d\n",cid,share_l1->shared_id); + + // ftok to generate unique key + //char shmkey2[64]; + //sprintf(shmkey2,"shmfilel2%d",cid); + int id2 = cid+200; + //key_t key2 = ftok(shmkey2,id2); + + // shmget returns an identifier in shmid + int shmid2 = shmget(id2,sizeof(shared_t),0666); + + // shmat to attach to shared memory + shared_t* share_l2 = (shared_t*) shmat(shmid2,(void*)0,0); + + fprintf(stderr,"cid %d gets shared memory buf l2 %d\n",cid,share_l2->shared_id); + + //the leads are reveresed (from perspective of server) + bipbuf_in[1] = &share_l1->bipbuf_out; + bipbuf_in[2] = &share_l2->bipbuf_out; + bipbuf_out[1] = &share_l1->bipbuf_in; + bipbuf_out[2] = &share_l2->bipbuf_in; + + lock_in[1] = &share_l1->lock_out; 
+ lock_in[2] = &share_l2->lock_out; + lock_out[1] = &share_l1->lock_in; + lock_out[2] = &share_l2->lock_in; + read_state = IDLE; + return 1; +} + +/** + * Destroy a connection, performing cleanup. + */ +ConnectionMultiApproxShm::~ConnectionMultiApproxShm() { + + + for (int i = 0; i <= LEVELS; i++) { + free(op_queue[i]); + + } + + free(op_queue_size); + free(opaque); + free(op_queue); + //event_free(timer); + //timer = NULL; + // FIXME: W("Drain op_q?"); + //bufferevent_free(bev1); + //bufferevent_free(bev2); + + delete iagen; + delete keygen; + delete keysize; + delete valuesize; +} + +/** + * Reset the connection back to an initial, fresh state. + */ +void ConnectionMultiApproxShm::reset() { + // FIXME: Actually check the connection, drain all bufferevents, drain op_q. + //assert(op_queue.size() == 0); + //evtimer_del(timer); + read_state = IDLE; + write_state = INIT_WRITE; + stats = ConnectionStats(stats.sampling); +} + + + + +/** + * Issue a get request to the server. + */ +int ConnectionMultiApproxShm::offer_get(Operation *pop, int extra) { + + uint16_t keylen = strlen(pop->key); + int level = FLAGS_level(pop->flags); + + + // each line is 4-bytes + binary_header_t h = { 0x80, CMD_GET, htons(keylen), + 0x00, 0x00, htons(0), + htonl(keylen) }; + //if (quiet) { + // h.opcode = CMD_GETQ; + //} + h.opaque = htonl(pop->opaque); + + int res = 0; + pthread_mutex_lock(lock_out[level]); + int gtg = bipbuf_unused(bipbuf_out[level]) > (int)(24+keylen) ? 
1 : 0; + if (gtg) { + res = bipbuf_offer(bipbuf_out[level],(const unsigned char*)&h,24); + if (res != 24) { + fprintf(stderr,"failed offer 24 get level %d\n",level); + pthread_mutex_unlock(lock_out[level]); + return 0; + } + res = bipbuf_offer(bipbuf_out[level],(const unsigned char*)pop->key,keylen); + if (res != keylen) { + fprintf(stderr,"failed offer %d get level %d\n",keylen,level); + pthread_mutex_unlock(lock_out[level]); + return 0; + } + if (extra == 1) { + extra_queue.pop(); + } + } else { + if (extra == 0) { + extra_queue.push(pop); + } + } + pthread_mutex_unlock(lock_out[level]); + return 1; + +} + +/** + * Issue a get request to the server. + */ +int ConnectionMultiApproxShm::issue_get_with_len(Operation *pop, double now, bool quiet, uint32_t flags, Operation *l1) { + + int level = FLAGS_level(flags); + +#if HAVE_CLOCK_GETTIME + pop->start_time = get_time_accurate(); +#else + if (now == 0.0) { +#if USE_CACHED_TIME + struct timeval now_tv; + event_base_gettimeofday_cached(base, &now_tv); + pop->start_time = tv_to_double(&now_tv); +#else + pop->start_time = get_time(); +#endif + } else { + pop->start_time = now; + } +#endif + + pop->opaque = opaque[level]++; + pop->flags = flags; + if (l1 != NULL) { + pop->l1 = l1; + } + + op_queue[level][pop->opaque] = pop; + op_queue_size[level]++; +#ifdef DEBUGS + fprintf(stderr,"cid: %d issing get: %s, size: %u, level %d, flags: %d, opaque: %d\n",cid,pop->key,pop->valuelen,level,flags,pop->opaque); +#endif + + if (opaque[level] > OPAQUE_MAX) { + opaque[level] = 1; + } + + + offer_get(pop,0); + stats.tx_bytes += 24 + strlen(pop->key); + return 1; +} + +/** + * Issue a get request to the server. 
+ */ +int ConnectionMultiApproxShm::issue_get_with_len(const char* key, int valuelen, double now, bool quiet, uint32_t flags, Operation *l1) { + + int level = FLAGS_level(flags); + Operation *pop = new Operation(); + +#if HAVE_CLOCK_GETTIME + pop->start_time = get_time_accurate(); +#else + if (now == 0.0) { +#if USE_CACHED_TIME + struct timeval now_tv; + event_base_gettimeofday_cached(base, &now_tv); + pop->start_time = tv_to_double(&now_tv); +#else + pop->start_time = get_time(); +#endif + } else { + pop->start_time = now; + } +#endif + + strncpy(pop->key,key,255); + pop->valuelen = valuelen; + pop->type = Operation::GET; + pop->opaque = opaque[level]++; + pop->flags = flags; + pop->clsid = get_class(valuelen,strlen(key)); + if (l1 != NULL) { + pop->l1 = l1; + } + op_queue[level][pop->opaque] = pop; + op_queue_size[level]++; + +#ifdef DEBUGS + fprintf(stderr,"cid: %d issing get: %s, size: %u, level %d, flags: %d, opaque: %d\n",cid,key,valuelen,level,flags,pop->opaque); +#endif + + if (opaque[level] > OPAQUE_MAX) { + opaque[level] = 1; + } + + + offer_get(pop,0); + stats.tx_bytes += 24 + strlen(pop->key);; + return 1; +} + +/** + * Issue a get request to the server. 
+ */
+int ConnectionMultiApproxShm::issue_touch(const char* key, int valuelen, double now, int flags) {
+  int level = FLAGS_level(flags);
+  Operation *pop = new Operation();
+
+#if HAVE_CLOCK_GETTIME
+  pop->start_time = get_time_accurate();
+#else
+  if (now == 0.0) {
+#if USE_CACHED_TIME
+    struct timeval now_tv;
+    event_base_gettimeofday_cached(base, &now_tv);
+    pop->start_time = tv_to_double(&now_tv);
+#else
+    pop->start_time = get_time();
+#endif
+  } else {
+    pop->start_time = now;
+  }
+#endif
+
+  strncpy(pop->key,key,255);
+  pop->valuelen = valuelen;
+  pop->type = Operation::TOUCH;
+  pop->opaque = opaque[level]++;
+  op_queue[level][pop->opaque] = pop;
+  op_queue_size[level]++;
+
+  pop->flags = flags;
+
+  if (opaque[level] > OPAQUE_MAX) {
+    opaque[level] = 1;
+  }
+
+#ifdef DEBUGS
+  fprintf(stderr,"issing touch: %s, size: %u, level %d, flags: %d, opaque: %d\n",key,valuelen,level,flags,pop->opaque);
+#endif
+  //if (read_state == IDLE) read_state = WAITING_FOR_GET;
+  uint16_t keylen = strlen(key);
+
+  // each line is 4-bytes
+  binary_header_t h = { 0x80, CMD_TOUCH, htons(keylen),
+                        0x04, 0x00, htons(0),
+                        htonl(keylen + 4) };
+  h.opaque = htonl(pop->opaque);
+
+  uint32_t exp = 0;
+  if (flags & ITEM_DIRTY) {
+    exp = htonl(flags);
+  }
+
+  int res = 0;
+  pthread_mutex_lock(lock_out[level]);
+  res = bipbuf_offer(bipbuf_out[level],(const unsigned char*)&h,24);
+  if (res != 24) {
+    fprintf(stderr,"failed offer 24 touch level %d\n",level);
+    pthread_mutex_unlock(lock_out[level]);
+    return 0;
+  }
+  res = bipbuf_offer(bipbuf_out[level],(const unsigned char*)&exp,4);
+  if (res != 4) {
+    fprintf(stderr,"failed offer 4 touch level %d\n",level);
+    pthread_mutex_unlock(lock_out[level]);
+    return 0;
+  }
+  res = bipbuf_offer(bipbuf_out[level],(const unsigned char*)pop->key,keylen);
+  if (res != keylen) {
+    fprintf(stderr,"failed offer %d touch level %d\n",keylen,level);
+    pthread_mutex_unlock(lock_out[level]);
+    return 0;
+  }
+  pthread_mutex_unlock(lock_out[level]);
+
+  
stats.tx_bytes += 24 + keylen; + + //stats.log_access(op); + return 1; +} + +/** + * Issue a delete request to the server. + */ +int ConnectionMultiApproxShm::issue_delete(const char* key, double now, uint32_t flags) { + int level = FLAGS_level(flags); + Operation *pop = new Operation(); + +#if HAVE_CLOCK_GETTIME + pop->start_time = get_time_accurate(); +#else + if (now == 0.0) { +#if USE_CACHED_TIME + struct timeval now_tv; + event_base_gettimeofday_cached(base, &now_tv); + pop->start_time = tv_to_double(&now_tv); +#else + pop->start_time = get_time(); +#endif + } else { + pop->start_time = now; + } +#endif + + strncpy(pop->key,key,255); + pop->type = Operation::DELETE; + pop->opaque = opaque[level]++; + pop->flags = flags; + op_queue[level][pop->opaque] = pop; + //op_queue[level].push(op); + op_queue_size[level]++; + + if (opaque[level] > OPAQUE_MAX) { + opaque[level] = 1; + } +#ifdef DEBUGS + fprintf(stderr,"cid: %d issing delete: %s, level %d, flags: %d, opaque: %d\n",cid,key,level,flags,pop->opaque); +#endif + + //if (read_state == IDLE) read_state = WAITING_FOR_GET; + uint16_t keylen = strlen(key); + + // each line is 4-bytes + binary_header_t h = { 0x80, CMD_DELETE, htons(keylen), + 0x00, 0x00, htons(0), + htonl(keylen) }; + h.opaque = htonl(pop->opaque); + + pthread_mutex_lock(lock_out[level]); + bipbuf_offer(bipbuf_out[level],(const unsigned char*)&h,24); + bipbuf_offer(bipbuf_out[level],(const unsigned char*)key,keylen); + pthread_mutex_unlock(lock_out[level]); + + + stats.tx_bytes += 24 + keylen; + + //stats.log_access(op); + return 1; +} + +void ConnectionMultiApproxShm::issue_noop(int level) { + Operation op; + + + binary_header_t h = { 0x80, CMD_NOOP, 0x0000, + 0x00, 0x00, htons(0), + 0x00 }; + + + //bipbuf_offer(bipbuf[level],&h,24); +} + +/** + * Issue a set request to the server. 
+ */ +int ConnectionMultiApproxShm::issue_set(Operation *pop, const char* value, double now, uint32_t flags) { + + int level = FLAGS_level(flags); + +#if HAVE_CLOCK_GETTIME + pop->start_time = get_time_accurate(); +#else + if (now == 0.0) pop->start_time = get_time(); + else pop->start_time = now; +#endif + + pop->opaque = opaque[level]++; + pop->flags = flags; + op_queue[level][pop->opaque] = pop; + //op_queue[level].push(op); + op_queue_size[level]++; +#ifdef DEBUGS + fprintf(stderr,"cid: %d issing set: %s, size: %u, level %d, flags: %d, opaque: %d\n",cid,pop->key,pop->valuelen,level,flags,pop->opaque); +#endif + + if (opaque[level] > OPAQUE_MAX) { + opaque[level] = 1; + } + + offer_set(pop); + + + stats.tx_bytes += pop->valuelen + 32 + strlen(pop->key); + return 1; +} + +/** + * Issue a set request to the server. + */ +int ConnectionMultiApproxShm::offer_set(Operation *pop, int extra) { + + uint16_t keylen = strlen(pop->key); + uint32_t length = pop->valuelen; + int level = FLAGS_level(pop->flags); + + // each line is 4-bytes + binary_header_t h = { 0x80, CMD_SET, htons(keylen), + 0x08, 0x00, htons(0), + htonl(keylen + 8 + length) }; + h.opaque = htonl(pop->opaque); + + uint32_t f = htonl(pop->flags); + uint32_t exp = 0; + int ret = 0; + int res = 0; + pthread_mutex_lock(lock_out[level]); + int gtg = bipbuf_unused(bipbuf_out[level]) > (int)(32+pop->valuelen) ? 
1 : 0; + if (gtg) { + res = bipbuf_offer(bipbuf_out[level],(const unsigned char*)&h,24); + if (res != 24) { + fprintf(stderr,"failed offer 24 set level %d\n",level); + pthread_mutex_unlock(lock_out[level]); + return 0; + } + res = bipbuf_offer(bipbuf_out[level],(const unsigned char*)&f,4); + if (res != 4) { + fprintf(stderr,"failed offer 4 set level %d\n",level); + pthread_mutex_unlock(lock_out[level]); + return 0; + } + res = bipbuf_offer(bipbuf_out[level],(const unsigned char*)&exp,4); + if (res != 4) { + fprintf(stderr,"failed offer 4 set level %d\n",level); + pthread_mutex_unlock(lock_out[level]); + return 0; + } + res = bipbuf_offer(bipbuf_out[level],(const unsigned char*)pop->key,keylen); + if (res != keylen) { + fprintf(stderr,"failed offer %d set level %d\n",keylen,level); + pthread_mutex_unlock(lock_out[level]); + return 0; + } + int i = 0; + int index = lrand48() % (1024 * 1024); + const char *value = &random_char[index]; + while ((res = bipbuf_offer(bipbuf_out[level],(const unsigned char*)value,length)) != (int)length) { + pthread_mutex_unlock(lock_out[level]); + i++; + if (i > 1000) { + fprintf(stderr,"failed offer %d set level %d\n",length,level); + break; + } + pthread_mutex_lock(lock_out[level]); + } + if (extra == 1) { + extra_queue.pop(); + } + ret = 1; + } else { + if (extra == 0) { + extra_queue.push(pop); + } + ret = 0; + } + pthread_mutex_unlock(lock_out[level]); + return ret; +} + +/** + * Issue a set request to the server. 
+ */ +int ConnectionMultiApproxShm::issue_set(const char* key, const char* value, int length, double now, uint32_t flags) { + + int level = FLAGS_level(flags); + //Operation op; + Operation *pop = new Operation(); + +#if HAVE_CLOCK_GETTIME + pop->start_time = get_time_accurate(); +#else + if (now == 0.0) pop->start_time = get_time(); + else pop->start_time = now; +#endif + + strncpy(pop->key,key,255); + pop->valuelen = length; + pop->type = Operation::SET; + pop->opaque = opaque[level]++; + pop->flags = flags; + pop->clsid = get_class(length,strlen(key)); + op_queue[level][pop->opaque] = pop; + op_queue_size[level]++; +#ifdef DEBUGS + fprintf(stderr,"cid: %d issing set: %s, size: %u, level %d, flags: %d, opaque: %d\n",cid,key,length,level,flags,pop->opaque); +#endif + + if (opaque[level] > OPAQUE_MAX) { + opaque[level] = 1; + } + + offer_set(pop); + stats.tx_bytes += length + 32 + strlen(key); + return 1; +} + +/** + * Return the oldest live operation in progress. + */ +void ConnectionMultiApproxShm::pop_op(Operation *op) { + + uint8_t level = OP_level(op); + //op_queue[level].erase(op); + op_queue_size[level]--; + + + if (read_state == LOADING) return; + read_state = IDLE; + + // Advance the read state machine. + //if (op_queue.size() > 0) { + // Operation& op = op_queue.front(); + // switch (op.type) { + // case Operation::GET: read_state = WAITING_FOR_GET; break; + // case Operation::SET: read_state = WAITING_FOR_SET; break; + // case Operation::DELETE: read_state = WAITING_FOR_DELETE; break; + // default: DIE("Not implemented."); + // } + //} +} + +/** + * Finish up (record stats) an operation that just returned from the + * server. 
+ */ +void ConnectionMultiApproxShm::finish_op(Operation *op, int was_hit) { + double now; +#if USE_CACHED_TIME + struct timeval now_tv; + event_base_gettimeofday_cached(base, &now_tv); + now = tv_to_double(&now_tv); +#else + now = get_time(); +#endif +#if HAVE_CLOCK_GETTIME + op->end_time = get_time_accurate(); +#else + op->end_time = now; +#endif + + if (was_hit) { + switch (op->type) { + case Operation::GET: + switch (OP_level(op)) { + case 1: + stats.log_get_l1(*op); + break; + case 2: + stats.log_get_l2(*op); + if (op->l1 != NULL) { + op->l1->end_time = now; + stats.log_get(*(op->l1)); + } + break; + } + break; + case Operation::SET: + switch (OP_level(op)) { + case 1: + stats.log_set_l1(*op); + break; + case 2: + stats.log_set_l2(*op); + break; + } + break; + case Operation::DELETE: break; + case Operation::TOUCH: break; + default: DIE("Not implemented."); + } + } + //} else { + // switch (op->type) { + // case Operation::GET: + // if (OP_log(op)) { + // switch (OP_level(op)) { + // case 1: + // stats.log_get_l1(*op); + // break; + // case 2: + // stats.log_get_l2(*op); + // if (op->l1 != NULL) { + // op->l1->end_time = now; + // stats.log_get(*(op->l1)); + // } + // break; + // } + // } + // break; + // case Operation::SET: + // if (OP_log(op)) { + // switch (OP_level(op)) { + // case 1: + // stats.log_set_l1(*op); + // break; + // case 2: + // stats.log_set_l2(*op); + // break; + // } + // } + // break; + // case Operation::DELETE: break; + // case Operation::TOUCH: break; + // default: DIE("Not implemented."); + // } + //} + + last_rx = now; + uint8_t level = OP_level(op); + if (op->l1 != NULL) { + //delete op_queue[1][op->l1->opaque]; + op_queue[1][op->l1->opaque] = 0; + op_queue_size[1]--; + delete op->l1; + } + //op_queue[level].erase(op_queue[level].begin()+opopq); + if (op == op_queue[level][op->opaque] && + op->opaque == op_queue[level][op->opaque]->opaque) { + //delete op_queue[level][op->opaque]; + op_queue[level][op->opaque] = 0; + delete op; + } 
else { + fprintf(stderr,"op_queue out of sync! Expected %p, got %p, opa1: %d opaq2: %d\n", + op,op_queue[level][op->opaque],op->opaque,op_queue[level][op->opaque]->opaque); + } + op_queue_size[level]--; + read_state = IDLE; + + +} + + + +/** + * Check if our testing is done and we should exit. + */ +bool ConnectionMultiApproxShm::check_exit_condition(double now) { + if (eof && op_queue_size[1] == 0 && op_queue_size[2] == 0) { + return true; + } + if (read_state == INIT_READ) return false; + + return false; +} + + + +/** + * Request generation loop + */ +void ConnectionMultiApproxShm::drive_write_machine_shm(double now) { + + while (trace_queue->size() > 0) { + int extra_tries = extra_queue.size(); + for (int i = 0; i < extra_tries; i++) { + Operation *Op = extra_queue.front(); + switch(Op->type) + { + case Operation::GET: + offer_get(Op,1); + break; + case Operation::SET: + offer_set(Op,1); + break; + } + } + + int nissued = 0; + int nissuedl2 = 0; + while (nissued < options.depth && extra_queue.size() == 0) { + Operation *Op = trace_queue->front(); + + if (Op == NULL || trace_queue->size() <= 0 || Op->type == Operation::SASL) { + eof = 1; + cid_rate.insert( {cid, 100 } ); + fprintf(stderr,"cid %d done\n",cid); + string op_queue1; + string op_queue2; + for (int j = 0; j < 2; j++) { + for (int i = 0; i < OPAQUE_MAX; i++) { + if (op_queue[j+1][i] != NULL) { + if (j == 0) { + op_queue1 = op_queue1 + "," + op_queue[j+1][i]->key; + } else { + op_queue2 = op_queue2 + "," + op_queue[j+1][i]->key; + } + } + } + } + fprintf(stderr,"cid %d op_queue1: %s op_queue2: %s, op_queue_size1: %d, op_queue_size2: %d\n",cid,op_queue1.c_str(),op_queue2.c_str(),op_queue_size[1],op_queue_size[2]); + return; + } + int gtg = 0; + pthread_mutex_lock(lock_out[1]); + switch(Op->type) + { + case Operation::GET: + gtg = bipbuf_unused(bipbuf_out[1]) > (int)(24+strlen(Op->key)) ? 1 : 0; + break; + case Operation::SET: + gtg = bipbuf_unused(bipbuf_out[1]) > (int)(32+Op->valuelen) ? 
1 : 0; + break; + } + pthread_mutex_unlock(lock_out[1]); + + + if (gtg) { + trace_queue->pop(); + int l2issued = issue_op(Op); + nissuedl2 += l2issued; + nissued++; + } else { + break; + } + } + + //wait for response (at least nissued) + int l2issued = read_response_l1(); + nissuedl2 += l2issued; + if (nissuedl2 > 0) { + read_response_l2(); + } + + } + +} + +/** + * Request generation loop + */ +//void ConnectionMultiApproxShm::drive_write_machine_shm_2(double now) { +// +// while (trace_queue->size() > 0) { +// int extra_tries = extra_queue.size(); +// for (int i = 0; i < extra_tries; i++) { +// Operation *Op = extra_queue.front(); +// switch(Op->type) +// { +// case Operation::GET: +// offer_get(Op,1); +// break; +// case Operation::SET: +// offer_set(Op,1); +// break; +// } +// } +// +// int nissued = 0; +// int nissuedl2 = 0; +// while (nissued < options.depth && extra_queue.size() == 0) { +// Operation *Op = trace_queue->front(); +// +// if (Op == NULL || trace_queue->size() <= 0 || Op->type == Operation::SASL) { +// eof = 1; +// cid_rate.insert( {cid, 100 } ); +// fprintf(stderr,"cid %d done\n",cid); +// string op_queue1; +// string op_queue2; +// for (int j = 0; j < 2; j++) { +// for (int i = 0; i < OPAQUE_MAX; i++) { +// if (op_queue[j+1][i] != NULL) { +// if (j == 0) { +// op_queue1 = op_queue1 + "," + op_queue[j+1][i]->key; +// } else { +// op_queue2 = op_queue2 + "," + op_queue[j+1][i]->key; +// } +// } +// } +// } +// fprintf(stderr,"cid %d op_queue1: %s op_queue2: %s, op_queue_size1: %d, op_queue_size2: %d\n",cid,op_queue1.c_str(),op_queue2.c_str(),op_queue_size[1],op_queue_size[2]); +// return; +// } +// int gtg = 0; +// pthread_mutex_lock(lock_out[1]); +// switch(Op->type) +// { +// case Operation::GET: +// gtg = bipbuf_unused(bipbuf_out[1]) > (int)(24+strlen(Op->key)) ? 1 : 0; +// break; +// case Operation::SET: +// gtg = bipbuf_unused(bipbuf_out[1]) > (int)(32+Op->valuelen) ? 
1 : 0; +// break; +// } +// pthread_mutex_unlock(lock_out[1]); +// +// +// if (gtg) { +// trace_queue->pop(); +// int l2issued = issue_op(Op); +// nissuedl2 += l2issued; +// nissued++; +// } else { +// break; +// } +// } +// +// //wait for response (at least nissued) +// int l2issued = read_response_l1(); +// nissuedl2 += l2issued; +// if (nissuedl2 > 0) { +// read_response_l2(); +// } +// +// } +// +//} + +/** + * Tries to consume a binary response (in its entirety) from shared memory. + * + * @param input evBuffer to read response from + * @return true if consumed, false if not enough data in buffer. + */ +static int handle_response(ConnectionMultiApproxShm *conn, unsigned char *input, bool &done, bool &found, int &opcode, uint32_t &opaque, evicted_t *evict, int level) { + // Read the first 24 bytes as a header + //int length = evbuffer_get_length(input); + //if (length < 24) return false; + //binary_header_t* h = + // reinterpret_cast(evbuffer_pullup(input, 24)); + //assert(h); + binary_header_t* h = + reinterpret_cast(input); + + uint32_t bl = ntohl(h->body_len); + uint16_t kl = ntohs(h->key_len); + uint8_t el = h->extra_len; + // Not whole response + int targetLen = 24 + bl; + + opcode = h->opcode; + opaque = ntohl(h->opaque); + uint16_t status = ntohs(h->status); +#ifdef DEBUGMC + fprintf(stderr,"cid: %d handle resp from l%d - opcode: %u opaque: %u keylen: %u extralen: %u datalen: %u status: %u\n",conn->get_cid(),level, + h->opcode,ntohl(h->opaque),ntohs(h->key_len),h->extra_len, + ntohl(h->body_len),ntohs(h->status)); +#endif + + pthread_mutex_lock(conn->lock_in[level]); + unsigned char *abuf; + int tries = 0; + while ((abuf = bipbuf_poll(conn->bipbuf_in[level],targetLen)) == NULL) { + pthread_mutex_unlock(conn->lock_in[level]); + tries++; + if (tries > 10) { + //fprintf(stderr,"more than 10000 tries for cid: %d for length %d\n",conn->get_cid(),targetLen); + return 0; + + } + pthread_mutex_lock(conn->lock_in[level]); + } + unsigned char bbuf[1024*1024]; + 
unsigned char *buf = (unsigned char*) &bbuf; + if (abuf != NULL) { + memcpy(bbuf,abuf,targetLen); + } + buf += 24; + pthread_mutex_unlock(conn->lock_in[level]); + + + // If something other than success, count it as a miss + if (opcode == CMD_GET && status == RESP_NOT_FOUND) { + switch(level) { + case 1: + conn->stats.get_misses_l1++; + break; + case 2: + conn->stats.get_misses_l2++; + conn->stats.get_misses++; + conn->stats.window_get_misses++; + break; + + } + found = false; + //evbuffer_drain(input, targetLen); + + } else if (opcode == CMD_SET && kl > 0 && evict != NULL) { + //evbuffer_drain(input,24); + //unsigned char *buf = evbuffer_pullup(input,bl); + + + evict->clsid = *((uint32_t*)buf); + evict->clsid = ntohl(evict->clsid); + buf += 4; + + evict->serverFlags = *((uint32_t*)buf); + evict->serverFlags = ntohl(evict->serverFlags); + buf += 4; + + evict->evictedFlags = *((uint32_t*)buf); + evict->evictedFlags = ntohl(evict->evictedFlags); + buf += 4; + + + evict->evictedKeyLen = kl; + evict->evictedKey = (char*)malloc(kl+1); + memset(evict->evictedKey,0,kl+1); + memcpy(evict->evictedKey,buf,kl); + buf += kl; + + + evict->evictedLen = bl - kl - el; + evict->evictedData = (char*)malloc(evict->evictedLen); + memcpy(evict->evictedData,buf,evict->evictedLen); + evict->evicted = true; + //fprintf(stderr,"class: %u, serverFlags: %u, evictedFlags: %u\n",evict->clsid,evict->serverFlags,evict->evictedFlags); + } else if ( (opcode == CMD_TOUCH && status == RESP_NOT_FOUND) || + (opcode == CMD_DELETE && status == RESP_NOT_FOUND) ) { + found = false; + } + + conn->stats.rx_bytes += targetLen; + done = true; + return targetLen; +} + +int ConnectionMultiApproxShm::read_response_l1() { + + //maybe need mutex etc. 
+ unsigned char input[64]; + pthread_mutex_lock(lock_in[1]); + unsigned char *in = bipbuf_peek(bipbuf_in[1],24); + if (in) { + memcpy(input,in,24); + } + pthread_mutex_unlock(lock_in[1]); + if (in == NULL) { + return 0; + } + + uint32_t responses_expected = op_queue_size[1]; + Operation *op = NULL; + bool done, found; + found = true; + int bytes_read = 1; + int l2reqs = 0; + uint32_t responses = 0; + while (bytes_read > 0 && responses < responses_expected && input) { + + + int opcode; + uint32_t opaque; + evicted_t *evict = (evicted_t*)malloc(sizeof(evicted_t)); + memset(evict,0,sizeof(evicted_t)); + bytes_read = handle_response(this,input, done, found, opcode, opaque, evict,1); + + if (bytes_read > 0) { + if (opcode == CMD_NOOP) { +#ifdef DEBUGMC + char out[128]; + sprintf(out,"conn l1: %u, reading noop\n",cid); + write(2,out,strlen(out)); +#endif + if (evict->evictedKey) free(evict->evictedKey); + if (evict->evictedData) free(evict->evictedData); + free(evict); + continue; + } + op = op_queue[1][opaque]; +#ifdef DEBUGMC + char out[128]; + sprintf(out,"conn l1: %u, reading opaque: %u\n",cid,opaque); + write(2,out,strlen(out)); + output_op(op,2,found); +#endif + if (strlen(op->key) < 1) { +#ifdef DEBUGMC + char out2[128]; + sprintf(out2,"conn l1: %u, bad op: %s\n",cid,op->key); + write(2,out2,strlen(out2)); +#endif + if (evict->evictedKey) free(evict->evictedKey); + if (evict->evictedData) free(evict->evictedData); + free(evict); + continue; + } + responses++; + } else { + if (evict) { + if (evict->evictedKey) free(evict->evictedKey); + if (evict->evictedData) free(evict->evictedData); + free(evict); + } + return 0; + } + + + double now = get_time(); + int wb = 0; + if (options.rand_admit) { + wb = (rand() % options.rand_admit); + } + switch (op->type) { + case Operation::GET: + if (done) { + + int vl = op->valuelen; + if ( !found && (options.getset || options.getsetorset) ) { + /* issue a get a l2 */ + int flags = OP_clu(op); + 
issue_get_with_len(op->key,vl,now,false, flags | SRC_L1_M | ITEM_L2 | LOG_OP, op); + op->end_time = now; + this->stats.log_get_l1(*op); + //finish_op(op,0); + + } else { + if (OP_incl(op) && ghits >= gloc) { + //int ret = add_to_touch_keys(string(op->key)); + //if (ret == 1) { + issue_touch(op->key,vl,now, ITEM_L2 | SRC_L1_H); + //} + gloc += rand()%(10*2-1)+1; + } + ghits++; + finish_op(op,1); + } + l2reqs++; + } else { + char out[128]; + sprintf(out,"conn l1: %u, not done reading, should do something",cid); + write(2,out,strlen(out)); + } + break; + case Operation::SET: + //if (OP_src(op) == SRC_L1_COPY || + // OP_src(op) == SRC_L2_M) { + // del_copy_keys(string(op->key)); + //} + if (evict->evicted) { + string wb_key(evict->evictedKey); + if ((evict->evictedFlags & ITEM_INCL) && (evict->evictedFlags & ITEM_DIRTY)) { + //int ret = add_to_wb_keys(wb_key); + //if (ret == 1) { + issue_set(evict->evictedKey, evict->evictedData, evict->evictedLen, now, ITEM_L2 | ITEM_INCL | LOG_OP | SRC_WB | ITEM_DIRTY); + //} + this->stats.incl_wbs++; + l2reqs++; + } else if (evict->evictedFlags & ITEM_EXCL) { + //fprintf(stderr,"excl writeback %s\n",evict->evictedKey); + //strncpy(wb_key,evict->evictedKey,255); + if ( (options.rand_admit && wb == 0) || + (options.threshold && (g_key_hist[wb_key] == 1)) || + (options.wb_all) ) { + //int ret = add_to_wb_keys(wb_key); + //if (ret == 1) { + issue_set(evict->evictedKey, evict->evictedData, evict->evictedLen, now, ITEM_L2 | ITEM_EXCL | LOG_OP | SRC_WB); + //} + this->stats.excl_wbs++; + l2reqs++; + } + } + if (OP_src(op) == SRC_DIRECT_SET) { + if ( (evict->serverFlags & ITEM_SIZE_CHANGE) || ((evict->serverFlags & ITEM_WAS_HIT) == 0)) { + this->stats.set_misses_l1++; + } else if (OP_excl(op) && evict->serverFlags & ITEM_WAS_HIT) { + this->stats.set_excl_hits_l1++; + } else if (OP_incl(op) && evict->serverFlags & ITEM_WAS_HIT) { + this->stats.set_incl_hits_l1++; + } + } + } + finish_op(op,1); + break; + case Operation::TOUCH: + 
finish_op(op,1); + break; + default: + fprintf(stderr,"op: %p, key: %s opaque: %u\n",(void*)op,op->key,op->opaque); + DIE("not implemented"); + } + + if (evict) { + if (evict->evictedKey) free(evict->evictedKey); + if (evict->evictedData) free(evict->evictedData); + free(evict); + } + pthread_mutex_lock(lock_in[1]); + unsigned char *in = bipbuf_peek(bipbuf_in[1],24); + if (in) { + memcpy(input,in,24); + pthread_mutex_unlock(lock_in[1]); + } else { + pthread_mutex_unlock(lock_in[1]); + break; + } + + } + return l2reqs; +} + +/** + * Handle incoming data (responses). + */ +void ConnectionMultiApproxShm::read_response_l2() { + + //maybe need mutex etc. + unsigned char input[64]; + pthread_mutex_lock(lock_in[2]); + unsigned char *in = bipbuf_peek(bipbuf_in[2],24); + if (in) { + memcpy(input,in,24); + } + pthread_mutex_unlock(lock_in[2]); + if (in == NULL) { + return; + } + + uint32_t responses_expected = op_queue_size[2]; + Operation *op = NULL; + bool done, found; + found = true; + int bytes_read = 1; + int l2reqs = 0; + uint32_t responses = 0; + + while (bytes_read > 0 && responses < responses_expected && input) { + + int opcode; + uint32_t opaque; + evicted_t *evict = (evicted_t*)malloc(sizeof(evicted_t)); + memset(evict,0,sizeof(evicted_t)); + bytes_read = handle_response(this,input, done, found, opcode, opaque, evict,2); + + if (bytes_read > 0) { + if (opcode == CMD_NOOP) { +#ifdef DEBUGMC + char out[128]; + sprintf(out,"conn l2: %u, reading noop\n",cid); + write(2,out,strlen(out)); +#endif + continue; + } + op = op_queue[2][opaque]; +#ifdef DEBUGMC + char out[128]; + sprintf(out,"conn l2: %u, reading opaque: %u\n",cid,opaque); + write(2,out,strlen(out)); + output_op(op,2,found); +#endif + if (strlen(op->key) < 1) { +#ifdef DEBUGMC + char out2[128]; + sprintf(out2,"conn l2: %u, bad op: %s\n",cid,op->key); + write(2,out2,strlen(out2)); +#endif + continue; + } + responses++; + } else { + return; + } + + + double now = get_time(); + switch (op->type) { + case 
Operation::GET: + if ( !found && (options.getset || options.getsetorset) ) { // && + //(options.twitter_trace != 1)) { + int valuelen = op->valuelen; + int index = lrand48() % (1024 * 1024); + int flags = OP_clu(op) | SRC_L2_M | LOG_OP; + //int ret = add_to_copy_keys(string(op->key)); + //if (ret == 1) { + issue_set(op->key, &random_char[index], valuelen, now, flags | ITEM_L1); + if (OP_incl(op)) { + issue_set(op->key, &random_char[index], valuelen, now, flags | ITEM_L2); + last_quiet2 = false; + } + //} + last_quiet1 = false; + finish_op(op,0); // sets read_state = IDLE + + } else { + if (found) { + int valuelen = op->valuelen; + int index = lrand48() % (1024 * 1024); + int flags = OP_clu(op) | ITEM_L1 | SRC_L1_COPY; + string key = string(op->key); + const char *data = &random_char[index]; + //int ret = add_to_copy_keys(string(op->key)); + //if (ret == 1) { + issue_set(op->key,data,valuelen, now, flags); + //} + this->stats.copies_to_l1++; + //djb: this is automatically done in the L2 server + //if (OP_excl(op)) { //djb: todo should we delete here for approx or just let it die a slow death? 
+ // issue_delete(op->key,now, ITEM_L2 | SRC_L1_COPY ); + //} + finish_op(op,1); + + } else { + finish_op(op,0); + } + } + break; + case Operation::SET: + //if (OP_src(op) == SRC_WB) { + // del_wb_keys(string(op->key)); + //} + finish_op(op,1); + break; + case Operation::TOUCH: + if (OP_src(op) == SRC_DIRECT_SET || SRC_L1_H) { + int valuelen = op->valuelen; + if (!found) { + int index = lrand48() % (1024 * 1024); + issue_set(op->key, &random_char[index],valuelen,now, ITEM_INCL | ITEM_L2 | LOG_OP | SRC_L2_M); + this->stats.set_misses_l2++; + } else { + if (OP_src(op) == SRC_DIRECT_SET) { + issue_touch(op->key,valuelen,now, ITEM_L1 | SRC_L2_H | ITEM_DIRTY); + } + } + //del_touch_keys(string(op->key)); + } + finish_op(op,0); + break; + case Operation::DELETE: + //check to see if it was a hit + //fprintf(stderr," del %s -- %d from %d\n",op->key.c_str(),found,OP_src(op)); + if (OP_src(op) == SRC_DIRECT_SET) { + if (found) { + this->stats.delete_hits_l2++; + } else { + this->stats.delete_misses_l2++; + } + } + finish_op(op,1); + break; + default: + fprintf(stderr,"op: %p, key: %s opaque: %u\n",(void*)op,op->key,op->opaque); + DIE("not implemented"); + } + + pthread_mutex_lock(lock_in[2]); + unsigned char *in = bipbuf_peek(bipbuf_in[2],24); + if (in) { + memcpy(input,in,24); + pthread_mutex_unlock(lock_in[2]); + } else { + pthread_mutex_unlock(lock_in[2]); + break; + } + + } +} + diff --git a/ConnectionOptions.h b/ConnectionOptions.h index ba3d70c..96d70fc 100644 --- a/ConnectionOptions.h +++ b/ConnectionOptions.h @@ -4,17 +4,37 @@ #include "distributions.h" typedef struct { + int apps; + int rand_admit; + bool ratelimit; + bool v1callback; + int threshold; + int wb_all; + bool miss_through; int connections; bool blocking; double lambda; int qps; int records; - + int misswindow; + int queries; + int assoc; + char file_name[256]; + bool read_file; bool binary; + bool unix_socket; + bool successful_queries; + bool use_assoc; + bool redis; + bool getset; + bool getsetorset; 
+ bool delete90; bool sasl; char username[32]; char password[32]; + char prefix[256]; + char hashtype[256]; char keysize[32]; char valuesize[32]; // int keysize; @@ -23,7 +43,7 @@ typedef struct { // qps_per_connection // iadist - + int twitter_trace; double update; int time; bool loadonly; @@ -42,6 +62,8 @@ typedef struct { bool oob_thread; bool moderate; + char inclusives[256]; + } options_t; #endif // CONNECTIONOPTIONS_H diff --git a/ConnectionStats.h b/ConnectionStats.h index e957c19..1c79ea4 100644 --- a/ConnectionStats.h +++ b/ConnectionStats.h @@ -22,43 +22,101 @@ class ConnectionStats { public: ConnectionStats(bool _sampling = true) : #ifdef USE_ADAPTIVE_SAMPLER - get_sampler(100000), set_sampler(100000), op_sampler(100000), + get_sampler(100000), set_sampler(100000), + get_l1_sampler(100000), set_l1_sampler(100000), + get_l2_sampler(100000), set_l2_sampler(100000), + access_sampler(100000), op_sampler(100000), #elif defined(USE_HISTOGRAM_SAMPLER) - get_sampler(10000,1), set_sampler(10000,1), op_sampler(1000,1), + get_sampler(10000,1), set_sampler(10000,1), + get_l1_sampler(10000,1), set_l1_sampler(10000,1), + get_l2_sampler(10000,1), set_l2_sampler(10000,1), + access_sampler(10000,1), op_sampler(1000,1), #else - get_sampler(200), set_sampler(200), op_sampler(100), + get_sampler(200), set_sampler(200), + get_l1_sampler(200), set_l1_sampler(200), + get_l2_sampler(200), set_l2_sampler(200), + access_sampler(200), op_sampler(100), #endif - rx_bytes(0), tx_bytes(0), gets(0), sets(0), - get_misses(0), skips(0), sampling(_sampling) {} + rx_bytes(0), tx_bytes(0), + gets(0), sets(0), + gets_l1(0), sets_l1(0), + gets_l2(0), sets_l2(0), + accesses(0), + get_misses(0), + get_misses_l1(0), get_misses_l2(0), + set_misses_l1(0), set_misses_l2(0), + excl_wbs(0), incl_wbs(0), + copies_to_l1(0), + delete_misses_l2(0), + delete_hits_l2(0), + gets_cid(40), sets_cid(40), + set_incl_hits_l1(0),set_excl_hits_l1(0), + window_gets(0), window_sets(0), window_accesses(0), + 
window_get_misses(0), skips(0), sampling(_sampling) {} #ifdef USE_ADAPTIVE_SAMPLER AdaptiveSampler get_sampler; AdaptiveSampler set_sampler; + AdaptiveSampler get_l1_sampler; + AdaptiveSampler set_l1_sampler; + AdaptiveSampler get_l2_sampler; + AdaptiveSampler set_l2_sampler; + AdaptiveSampler access_sampler; AdaptiveSampler op_sampler; #elif defined(USE_HISTOGRAM_SAMPLER) HistogramSampler get_sampler; HistogramSampler set_sampler; + HistogramSampler get_l1_sampler; + HistogramSampler get_l2_sampler; + HistogramSampler set_l1_sampler; + HistogramSampler set_l2_sampler; + HistogramSampler access_sampler; HistogramSampler op_sampler; #else LogHistogramSampler get_sampler; LogHistogramSampler set_sampler; + LogHistogramSampler get_l1_sampler; + LogHistogramSampler set_l1_sampler; + LogHistogramSampler get_l2_sampler; + LogHistogramSampler set_l2_sampler; + LogHistogramSampler access_sampler; LogHistogramSampler op_sampler; #endif uint64_t rx_bytes, tx_bytes; - uint64_t gets, sets, get_misses; + uint64_t gets, sets; + uint64_t gets_l1, sets_l1, gets_l2, sets_l2; + uint64_t accesses, get_misses; + uint64_t get_misses_l1, get_misses_l2; + uint64_t set_misses_l1, set_misses_l2; + uint64_t excl_wbs, incl_wbs; + uint64_t copies_to_l1; + uint64_t delete_misses_l2; + uint64_t delete_hits_l2; + vector gets_cid; + vector sets_cid; + uint64_t set_incl_hits_l1, set_excl_hits_l1; + uint64_t window_gets, window_sets, window_accesses, window_get_misses; uint64_t skips; double start, stop; bool sampling; - void log_get(Operation& op) { if (sampling) get_sampler.sample(op); gets++; } - void log_set(Operation& op) { if (sampling) set_sampler.sample(op); sets++; } + void log_get(Operation& op) { if (sampling) get_sampler.sample(op); } //window_gets++; gets++; } + void log_set(Operation& op) { if (sampling) set_sampler.sample(op); window_sets++; sets++; } + + void log_get_l1(Operation& op) { if (sampling) get_l1_sampler.sample(op); window_gets++; gets_l1++; } + void log_set_l1(Operation& 
op) { if (sampling) set_l1_sampler.sample(op); window_sets++; sets_l1++; } + + void log_get_l2(Operation& op) { if (sampling) get_l2_sampler.sample(op); window_gets++; gets_l2++; } + void log_set_l2(Operation& op) { if (sampling) set_l2_sampler.sample(op); window_sets++; sets_l2++; } + void log_access(Operation& op) { //if (sampling) access_sampler.sample(op); + window_accesses++; } //accesses++; } void log_op (double op) { if (sampling) op_sampler.sample(op); } double get_qps() { - return (gets + sets) / (stop - start); + return (gets_l1 + gets_l2 + sets_l1 + sets_l2) / (stop - start); } #ifdef USE_ADAPTIVE_SAMPLER @@ -69,8 +127,18 @@ class ConnectionStats { for (auto s: get_sampler.samples) samples.push_back(s.time()); // (s.end_time - s.start_time) * 1000000); + for (auto s: get_l1_sampler.samples) + samples.push_back(s.time()); // (s.end_time - s.start_time) * 1000000); + for (auto s: get_l2_sampler.samples) + samples.push_back(s.time()); // (s.end_time - s.start_time) * 1000000); for (auto s: set_sampler.samples) samples.push_back(s.time()); // (s.end_time - s.start_time) * 1000000); + for (auto s: set_l1_sampler.samples) + samples.push_back(s.time()); // (s.end_time - s.start_time) * 1000000); + for (auto s: set_l2_sampler.samples) + samples.push_back(s.time()); // (s.end_time - s.start_time) * 1000000); + for (auto s: access_sampler.samples) + samples.push_back(s.time()); // (s.end_time - s.start_time) * 1000000); sort(samples.begin(), samples.end()); @@ -91,19 +159,49 @@ class ConnectionStats { void accumulate(const ConnectionStats &cs) { #ifdef USE_ADAPTIVE_SAMPLER for (auto i: cs.get_sampler.samples) get_sampler.sample(i); //log_get(i); + for (auto i: cs.get_l1_sampler.samples) get_l1_sampler.sample(i); //log_get(i); + for (auto i: cs.get_l2_sampler.samples) get_l2_sampler.sample(i); //log_get(i); for (auto i: cs.set_sampler.samples) set_sampler.sample(i); //log_set(i); + for (auto i: cs.set_l1_sampler.samples) set_l1_sampler.sample(i); //log_set(i); + 
for (auto i: cs.set_l2_sampler.samples) set_l2_sampler.sample(i); //log_set(i); + for (auto i: cs.access_sampler.samples) access_sampler.sample(i); //log_access(i); for (auto i: cs.op_sampler.samples) op_sampler.sample(i); //log_op(i); #else get_sampler.accumulate(cs.get_sampler); + get_l1_sampler.accumulate(cs.get_l1_sampler); + get_l2_sampler.accumulate(cs.get_l2_sampler); set_sampler.accumulate(cs.set_sampler); + set_l1_sampler.accumulate(cs.set_l1_sampler); + set_l2_sampler.accumulate(cs.set_l2_sampler); + access_sampler.accumulate(cs.access_sampler); op_sampler.accumulate(cs.op_sampler); #endif + for (int i = 0; i < 40; i++) { + gets_cid[i] += cs.gets_cid[i]; + sets_cid[i] += cs.sets_cid[i]; + } rx_bytes += cs.rx_bytes; tx_bytes += cs.tx_bytes; gets += cs.gets; sets += cs.sets; + gets_l1 += cs.gets_l1; + gets_l2 += cs.gets_l2; + sets_l1 += cs.sets_l1; + sets_l2 += cs.sets_l2; + accesses += cs.accesses; get_misses += cs.get_misses; + get_misses_l1 += cs.get_misses_l1; + get_misses_l2 += cs.get_misses_l2; + set_misses_l1 += cs.set_misses_l1; + set_misses_l2 += cs.set_misses_l2; + excl_wbs += cs.excl_wbs; + incl_wbs += cs.incl_wbs; + copies_to_l1 += cs.copies_to_l1; + delete_misses_l2 += cs.delete_misses_l2; + delete_hits_l2 += cs.delete_hits_l2; + set_excl_hits_l1 += cs.set_excl_hits_l1; + set_incl_hits_l1 += cs.set_incl_hits_l1; skips += cs.skips; start = cs.start; @@ -115,7 +213,23 @@ class ConnectionStats { tx_bytes += as.tx_bytes; gets += as.gets; sets += as.sets; + gets_l1 += as.gets_l1; + gets_l2 += as.gets_l2; + sets_l1 += as.sets_l1; + sets_l2 += as.sets_l2; + accesses += as.accesses; get_misses += as.get_misses; + get_misses_l1 += as.get_misses_l1; + get_misses_l2 += as.get_misses_l2; + set_misses_l1 += as.set_misses_l1; + set_misses_l2 += as.set_misses_l2; + excl_wbs += as.excl_wbs; + incl_wbs += as.incl_wbs; + copies_to_l1 += as.copies_to_l1; + delete_misses_l2 += as.delete_misses_l2; + delete_hits_l2 += as.delete_hits_l2; + set_excl_hits_l1 += 
as.set_excl_hits_l1; + set_incl_hits_l1 += as.set_incl_hits_l1; skips += as.skips; start = as.start; @@ -123,9 +237,9 @@ class ConnectionStats { } static void print_header() { - printf("%-7s %7s %7s %7s %7s %7s %7s %7s %7s\n", + printf("%-7s %7s %7s %7s %7s %7s %7s %7s %7s %7s %7s\n", "#type", "avg", "std", "min", /*"1st",*/ "5th", "10th", - "90th", "95th", "99th"); + "50th", "90th", "95th", "99th", "99.9th"); } #ifdef USE_ADAPTIVE_SAMPLER @@ -137,18 +251,18 @@ class ConnectionStats { size_t l = copy.size(); if (l == 0) { - printf("%-7s %7.1f %7.1f %7.1f %7.1f %7.1f %7.1f %7.1f %7.1f", - tag, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0); + printf("%-7s %7.1f %7.1f %7.1f %7.1f %7.1f %7.1f %7.1f %7.1f %7.1f %7.1f", + tag, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0); if (newline) printf("\n"); return; } sort(copy.begin(), copy.end()); - printf("%-7s %7.1f %7.1f %7.1f %7.1f %7.1f %7.1f %7.1f %7.1f", + printf("%-7s %7.1f %7.1f %7.1f %7.1f %7.1f %7.1f %7.1f %7.1f %7.1f %7.1f", tag, std::accumulate(copy.begin(), copy.end(), 0.0) / l, - copy[0], copy[(l*1) / 100], copy[(l*5) / 100], copy[(l*10) / 100], - copy[(l*90) / 100], copy[(l*95) / 100], copy[(l*99) / 100] + copy[0], copy[(l*1) / 100], copy[(l*5) / 100], copy[(l*10) / 100], copy[(l*50) / 100], + copy[(l*90) / 100], copy[(l*95) / 100], copy[(l*99) / 100], copy[(l*99.9) / 100] ); if (newline) printf("\n"); } @@ -164,10 +278,10 @@ class ConnectionStats { sort(copy.begin(), copy.end()); - printf("%-7s %7.1f %7.1f %7.1f %7.1f %7.1f %7.1f %7.1f %7.1f", + printf("%-7s %7.1f %7.1f %7.1f %7.1f %7.1f %7.1f %7.1f %7.1f %7.1f %7.1f", tag, std::accumulate(copy.begin(), copy.end(), 0.0) / l, - copy[0], copy[(l*1) / 100], copy[(l*5) / 100], copy[(l*10) / 100], - copy[(l*90) / 100], copy[(l*95) / 100], copy[(l*99) / 100] + copy[0], copy[(l*1) / 100], copy[(l*5) / 100], copy[(l*10) / 100], copy[(l*50) / 100], + copy[(l*90) / 100], copy[(l*95) / 100], copy[(l*99) / 100], copy[(l*99.9) / 100] ); if (newline) printf("\n"); } @@ -175,8 
+289,8 @@ class ConnectionStats { void print_stats(const char *tag, HistogramSampler &sampler, bool newline = true) { if (sampler.total() == 0) { - printf("%-7s %7.1f %7.1f %7.1f %7.1f %7.1f %7.1f %7.1f %7.1f", - tag, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0); + printf("%-7s %7.1f %7.1f %7.1f %7.1f %7.1f %7.1f %7.1f %7.1f %7.1f %7.1f", + tag, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0); if (newline) printf("\n"); return; } @@ -184,8 +298,8 @@ class ConnectionStats { printf("%-7s %7.1f %7.1f %7.1f %7.1f %7.1f %7.1f %7.1f %7.1f", tag, sampler.average(), sampler.get_nth(0), sampler.get_nth(1), sampler.get_nth(5), - sampler.get_nth(10), sampler.get_nth(90), - sampler.get_nth(95), sampler.get_nth(99)); + sampler.get_nth(10), sampler.get_nth(50), sampler.get_nth(90), + sampler.get_nth(95), sampler.get_nth(99), sampler.get_nth(99.9)); if (newline) printf("\n"); } @@ -193,17 +307,18 @@ class ConnectionStats { void print_stats(const char *tag, LogHistogramSampler &sampler, bool newline = true) { if (sampler.total() == 0) { - printf("%-7s %7.1f %7.1f %7.1f %7.1f %7.1f %7.1f %7.1f %7.1f", - tag, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0); + printf("%-7s %7.1f %7.1f %7.1f %7.1f %7.1f %7.1f %7.1f %7.1f %7.1f %7.1f", + tag, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0); if (newline) printf("\n"); return; } - printf("%-7s %7.1f %7.1f %7.1f %7.1f %7.1f %7.1f %7.1f %7.1f", + printf("%-7s %7.1f %7.1f %7.1f %7.1f %7.1f %7.1f %7.1f %7.1f %7.1f %7.1f", tag, sampler.average(), sampler.stddev(), - sampler.get_nth(0), /*sampler.get_nth(1),*/ sampler.get_nth(5), - sampler.get_nth(10), sampler.get_nth(90), - sampler.get_nth(95), sampler.get_nth(99)); + sampler.get_nth(0), sampler.get_nth(5), + sampler.get_nth(10), sampler.get_nth(50), + sampler.get_nth(90), sampler.get_nth(95), + sampler.get_nth(99), sampler.get_nth(99.9) ); if (newline) printf("\n"); } diff --git a/Generator.h b/Generator.h index eb598b1..bf57e2a 100644 --- a/Generator.h +++ b/Generator.h @@ -119,6 +119,102 @@ class 
Exponential : public Generator { double lambda; }; +class Zipfian : public Generator { +public: + Zipfian(double _alpha = 1.0, unsigned int _m = 100) : + alpha(_alpha), m(_m) { + int i; + // Compute normalization constant + for (i = 1; i <= m; i++) + c = c + (1.0 / pow((double) i, alpha)); + c = 1.0 / c; + + sum_probs = (double*)malloc((m+1)*sizeof(double)); + sum_probs[0] = 0; + for (i = 1; i <= m; i++) { + sum_probs[i] = sum_probs[i-1] + c / pow((double) i, alpha); + } + + D("Zipfian(alpha=%f, m=%u)", alpha, m); + } + + virtual double generate() { + double z; // Uniform random number (0 < z < 1) + int zipf_value; // Computed exponential value to be returned + int low, high, mid; // Binary-search bounds + + // Pull a uniform random number (0 < z < 1) + do + { + z = rand_val(0); + } + while ((z == 0) || (z == 1)); + + // Map z to the value + low = 1, high = m, mid = (m/2); + do { + mid = floor((low+high)/2); + if (sum_probs[mid] >= z && sum_probs[mid-1] < z) { + zipf_value = mid; + break; + } else if (sum_probs[mid] >= z) { + high = mid-1; + } else { + low = mid+1; + } + } while (low <= high); + + // Assert that zipf_value is between 1 and M + assert((zipf_value >=1) && (zipf_value <= m)); + + return(zipf_value); + } + + //========================================================================= + //= Multiplicative LCG for generating uniform(0.0, 1.0) random numbers = + //= - x_n = 7^5*x_(n-1)mod(2^31 - 1) = + //= - With x seeded to 1 the 10000th x value should be 1043618065 = + //= - From R. Jain, "The Art of Computer Systems Performance Analysis," = + //= John Wiley & Sons, 1991. 
(Page 443, Figure 26.2) = + //========================================================================= + static double rand_val(int seed) { + const long a = 16807; // Multiplier + const long m = 2147483647; // Modulus + const long q = 127773; // m div a + const long r = 2836; // m mod a + static long x; // Random int value + long x_div_q; // x divided by q + long x_mod_q; // x modulo q + long x_new; // New x value + + // Set the seed if argument is non-zero and then return zero + if (seed > 0) + { + x = seed; + return(0.0); + } + + // RNG using integer arithmetic + x_div_q = x / q; + x_mod_q = x % q; + x_new = (a * x_mod_q) - (r * x_div_q); + if (x_new > 0) + x = x_new; + else + x = x_new + m; + + // Return a random value between 0.0 and 1.0 + return((double) x / m); + } + + +private: + double alpha; + double m; + double c; + double *sum_probs; // Pre-calculated sum of probabilities +}; + class GPareto : public Generator { public: GPareto(double _loc = 0.0, double _scale = 1.0, double _shape = 1.0) : @@ -197,8 +293,8 @@ class KeyGenerator { double U = (double) h / ULLONG_MAX; double G = g->generate(U); int keylen = MAX(round(G), floor(log10(max)) + 1); - char key[256]; - snprintf(key, 256, "%0*" PRIu64, keylen, ind); + char key[250]; //memcached limit of 255 chars + snprintf(key, keylen, "%lu" , ind); // D("%d = %s", ind, key); return std::string(key); diff --git a/Operation.h b/Operation.h index b594b17..ceb0531 100644 --- a/Operation.h +++ b/Operation.h @@ -8,16 +8,36 @@ using namespace std; class Operation { public: + Operation() { + valuelen = 0; + opaque = 0; + flags = 0; + clsid = 0; + future = 0; + curr = 0; + l1 = NULL; + type = NOOP; + appid = 0; + start_time = 0; + end_time = 0; + memset(key,0,256); + } double start_time, end_time; enum type_enum { - GET, SET, SASL + GET, SET, DELETE, SASL, NOOP, TOUCH }; type_enum type; - - string key; - // string value; + uint16_t appid; + uint32_t valuelen; + uint32_t opaque; + uint32_t flags; + uint16_t clsid; + 
uint8_t future; + uint8_t curr; + char key[256]; + Operation *l1; double time() const { return (end_time - start_time) * 1000000; } }; diff --git a/Protocol.cc b/Protocol.cc index 6d346b8..2a46f40 100644 --- a/Protocol.cc +++ b/Protocol.cc @@ -19,35 +19,387 @@ #define unlikely(x) __builtin_expect((x),0) +/** + * + * First we build a RESP Array: + * 1. * character as the first byte + * 2. the number of elements in the array as a decimal number + * 3. CRLF + * 4. The actual RESP element we are putting into the array + * + * All Redis commands are sent as arrays of bulk strings. + * For example, the command “SET mykey ‘my value’” would be written and sent as: + * *3\r\n$3\r\nSET\r\n$5\r\nmykey\r\n$8\r\nmy value\r\n + * + * Then package command as a RESP Bulk String to the server + * + * Bulk String is the defined by the following: + * 1."$" byte followed by the number of bytes composing the + * string (a prefixed length), terminated by CRLF. + * 2. The actual string data. + * 3. A final CRLF. + * + * DBG code + * fprintf(stderr,"--\n"); + * fprintf(stderr,"*3\r\n$3\r\nSET\r\n$%lu\r\n%s\r\n$%d\r\n%s\r\n", + * strlen(key),key,len,val); + * fprintf(stderr,"--\n"); + * + */ +int ProtocolRESP::set_request(const char* key, const char* value, int len, uint32_t opaque) { + + //need to make the real value + char *val = (char*)malloc(len*sizeof(char)+1); + memset(val, 'a', len); + val[len] = '\0'; + + //check if we should use assoc + if (opts.use_assoc && strlen(key) > ((unsigned int)(opts.assoc+1)) ) + { + int l = hset_request(key,val,len); + free(val); + return l; + } + + else + { + int l; + l = evbuffer_add_printf(bufferevent_get_output(bev), + "*3\r\n$3\r\nSET\r\n$%lu\r\n%s\r\n$%d\r\n%s\r\n", + strlen(key),key,len,val); + l += len + 2; + if (read_state == IDLE) read_state = WAITING_FOR_GET; + free(val); + return l; + } + +} + +/** + * Send a RESP get request. 
+ */ +int ProtocolRESP::get_request(const char* key, uint32_t opaque) { + + //check if we should use assoc + if (opts.use_assoc && strlen(key) > ((unsigned int)(opts.assoc+1)) ) + return hget_request(key); + else + { + int l; + l = evbuffer_add_printf(bufferevent_get_output(bev), + "*2\r\n$3\r\nGET\r\n$%lu\r\n%s\r\n",strlen(key),key); + + if (read_state == IDLE) read_state = WAITING_FOR_GET; + return l; + } +} + +/** + * RESP HSET + * HSET myhash field1 "Hello" + * We break the key by last assoc bytes for now... + * We are guarenteed a key of at least assoc+1 bytes...but + * the vast vast majority are going to be 20 bytes. + * + * DBG code + * fprintf(stderr,"--\n"); + * fprintf(stderr,"*4\r\n$4\r\nHSET\r\n$%lu\r\n%s\r\n$%lu\r\n%s\r\n$%d\r\n%s\r\n", + * strlen(hash),hash,strlen(field),field,len,value); + * fprintf(stderr,"--\n"); + */ + +int ProtocolRESP::hset_request(const char* key, const char* value, int len) { + + int l = 0; + //hash is first n-assoc bytes + //field is last assoc bytes + //value is value + //int assoc = opts.assoc; + //char* hash = (char*)malloc(sizeof(char)*((strlen(key)-assoc)+1)); + //char* field = (char*)malloc(sizeof(char)*(assoc+1)); + //strncpy(hash, key, strlen(key)-assoc); + //strncpy(field,key+strlen(key)-assoc,assoc); + //hash[strlen(key)-assoc] = '\0'; + //field[assoc] = '\0'; + //l = evbuffer_add_printf(bufferevent_get_output(bev), + // "*4\r\n$4\r\nHSET\r\n$%lu\r\n%s\r\n$%lu\r\n%s\r\n$%d\r\n%s\r\n", + // strlen(hash),hash,strlen(field),field,len,value); + //l += len + 2; + //if (read_state == IDLE) read_state = WAITING_FOR_END; + //free(hash); + //free(field); + return l; + +} + +/** + * RESP HGET + * HGET myhash field1 + * We break the key by last assoc bytes for now... + * We are guarenteed a key of at least assoc+1 bytes...but + * the vast vast majority are going to be 20 bytes. 
+ */ +int ProtocolRESP::hget_request(const char* key) { + int l = 0; + //hash is first n-assoc bytes + //field is last assoc bytes + //int assoc = opts.assoc; + //char* hash = (char*)malloc(sizeof(char)*((strlen(key)-assoc)+1)); + //char* field = (char*)malloc(sizeof(char)*(assoc+1)); + //strncpy(hash, key, strlen(key)-assoc); + //strncpy(field,key+strlen(key)-assoc,assoc); + //hash[strlen(key)-assoc] = '\0'; + //field[assoc] = '\0'; + //l = evbuffer_add_printf(bufferevent_get_output(bev), + // "*3\r\n$4\r\nHGET\r\n$%lu\r\n%s\r\n$%lu\r\n%s\r\n", + // strlen(hash),hash,strlen(field),field); + + //if (read_state == IDLE) read_state = WAITING_FOR_GET; + //free(hash); + //free(field); + return l; +} + +/** + * RESP DELETE 90 - delete 90 percent of keys in DB + */ +int ProtocolRESP::delete90_request() { + int l; + l = evbuffer_add_printf(bufferevent_get_output(bev), + "*1\r\n$8\r\nFLUSHALL\r\n"); + + if (read_state == IDLE) read_state = WAITING_FOR_DELETE; + return l; +} + +/** + * Handle a RESP response. + * + * In RESP, the type of data depends on the first byte: + * + * Simple Strings the first byte of the reply is "+" + * Errors the first byte of the reply is "-" + * Integers the first byte of the reply is ":" + * Bulk Strings the first byte of the reply is "$" + * Arrays the first byte of the reply is "*" + * + * Right now we are only implementing GET response + * so the RESP type will be bulk string. 
+ * + * + */ +bool ProtocolRESP::handle_response(evbuffer *input, bool &done, bool &found, int &obj_size, uint32_t &opaque) { + opaque = 0; + + char *buf = NULL; + char *databuf = NULL; + char *obj_size_str = NULL; + int len; + size_t n_read_out; + + switch (read_state) { + + case WAITING_FOR_GET: + + buf = evbuffer_readln(input, &n_read_out, EVBUFFER_EOL_CRLF); + if (buf == NULL) return false; + + obj_size_str = buf+1; + obj_size = atoi(obj_size_str); + + conn->stats.rx_bytes += n_read_out; + + databuf = evbuffer_readln(input, &n_read_out, EVBUFFER_EOL_CRLF); + //fprintf(stderr,"--------------------\n"); + //fprintf(stderr,"resp size %lu\n",n_read_out); + //fprintf(stderr,"data size %d\n",obj_size); + //fprintf(stderr,"-------header---------\n"); + //fprintf(stderr,"%s\n",buf); + //fprintf(stderr,"-------data-----------\n"); + //fprintf(stderr,"%s\n",databuf); + + conn->stats.rx_bytes += n_read_out; + + if (!strncmp(buf,"$-1",3)) { + conn->stats.get_misses++; + conn->stats.window_get_misses++; + found = false; + done = true; + } else if ((int)n_read_out != obj_size) { + + + // FIXME: check key name to see if it corresponds to the op at + // the head of the op queue? This will be necessary to + // support "gets" where there may be misses. + + data_length = obj_size; + read_state = WAITING_FOR_GET_DATA; + done = false; + } else if (!strncmp(buf,"+OK",3) || !strncmp(buf,":1",2) || !strncmp(buf,":0",2) ) { + found = false; + done = true; + } else { + // got all the data.. + found = true; + done = true; + } + if (databuf) + free(databuf); + free(buf); + return true; + + case WAITING_FOR_GET_DATA: + + len = evbuffer_get_length(input); + + //finally got all data... 
+ if (len >= data_length + 2) { + evbuffer_drain(input, data_length + 2); + conn->stats.rx_bytes += data_length + 2; + read_state = WAITING_FOR_GET; + obj_size = data_length; + found = true; + done = true; + return true; + } + return false; + + default: printf("state: %d\n", read_state); DIE("Unimplemented!"); + } + + DIE("Shouldn't ever reach here..."); +} + + //char *buf = NUL; //for initial readline + //char *dbuf = NULL; //for data readline + //size_t n_read_out; + + //buf = evbuffer_readln(input, &n_read_out, EVBUFFER_EOL_CRLF); + //if (buf == NULL) + //{ + // done = false; + // return false; + //} + //conn->stats.rx_bytes += n_read_out; + // + //size_t len = evbuffer_get_length(input); + + //fprintf(stderr,"--------------------\n"); + //fprintf(stderr,"resp size %lu\n",n_read_out); + //fprintf(stderr,"ev len %lu\n",len); + //fprintf(stderr,"--------------------\n"); + //fprintf(stderr,"%s\n",buf); + ////RESP null response => miss + //if (!strncmp(buf,"$-1",3)) + //{ + // conn->stats.get_misses++; + // conn->stats.window_get_misses++; + // found = false; + // + //} + ////HSET or SET response was good, just consume the input and move on + ////with our lives + //else if (!strncmp(buf,"+OK",3) || !strncmp(buf,":1",2) || !strncmp(buf,":0",2) ) + //{ + // found = false; + // done = true; + //} + ////else we got a hit + //else + //{ + // char* nlen = buf+1; + // //fprintf(stderr,"%s\n",nlen); + // obj_size = atoi(nlen); + // // Consume the next "foobar" + // //size_t len = evbuffer_get_length(input); + // //dbuf = evbuffer_readln(input, &n_read_out, EVBUFFER_EOL_CRLF); + // //if (!dbuf) + // //{ + // // fprintf(stderr,"--------------------\n"); + // // fprintf(stderr,"next foobar (null) %lu\n",n_read_out); + // // fprintf(stderr,"ev len %lu\n",len); + // // fprintf(stderr,"--------------------\n"); + // // fprintf(stderr,"%s\n",dbuf); + + // // //read_state = WAITING_FOR_GET_DATA; + // // //done = false; + // // //return false; + // //} + // //else + // //{ + + // 
// fprintf(stderr,"--------------------\n"); + // // fprintf(stderr,"next foobar (null) %lu\n",n_read_out); + // // fprintf(stderr,"ev len %lu\n",len); + // // fprintf(stderr,"--------------------\n"); + // // fprintf(stderr,"%s\n",dbuf); + // //} + + // //conn->stats.rx_bytes += n_read_out; + // found = true; + //} + ////read_state = WAITING_FOR_GET; + ////fprintf(stderr,"--------------------\n"); + ////fprintf(stderr,"read_state %u\n",read_state); + ////fprintf(stderr,"--------------------\n"); + //done = true; + ////if (dbuf) + //// free(dbuf); + //free(buf); + //return true; + + +//} + /** * Send an ascii get request. */ -int ProtocolAscii::get_request(const char* key) { +int ProtocolAscii::get_request(const char* key, uint32_t opaque) { int l; l = evbuffer_add_printf( bufferevent_get_output(bev), "get %s\r\n", key); - if (read_state == IDLE) read_state = WAITING_FOR_GET; + if (read_state == IDLE) { + read_state = WAITING_FOR_GET; + } return l; } /** * Send an ascii set request. */ -int ProtocolAscii::set_request(const char* key, const char* value, int len) { +int ProtocolAscii::set_request(const char* key, const char* value, int len, uint32_t opaque) { int l; l = evbuffer_add_printf(bufferevent_get_output(bev), "set %s 0 0 %d\r\n", key, len); - bufferevent_write(bev, value, len); + + char *val = (char*)malloc(len*sizeof(char)+1); + memset(val, 'a', len); + val[len] = '\0'; + + bufferevent_write(bev, val, len); bufferevent_write(bev, "\r\n", 2); l += len + 2; - if (read_state == IDLE) read_state = WAITING_FOR_END; + if (read_state == IDLE) { + read_state = WAITING_FOR_END; + } + free(val); + return l; +} + +/** WARNING UNIMPLEMENTED **/ +int ProtocolAscii::delete90_request() { + int l; + l = evbuffer_add_printf(bufferevent_get_output(bev), + "*1\r\n$8\r\nFLUSHALL\r\n"); + return l; } /** * Handle an ascii response. 
*/ -bool ProtocolAscii::handle_response(evbuffer *input, bool &done) { +bool ProtocolAscii::handle_response(evbuffer *input, bool &done, bool &found, int &obj_size, uint32_t &opaque) { + opaque = 0; char *buf = NULL; int len; size_t n_read_out; @@ -62,7 +414,14 @@ bool ProtocolAscii::handle_response(evbuffer *input, bool &done) { conn->stats.rx_bytes += n_read_out; if (!strncmp(buf, "END", 3)) { - if (read_state == WAITING_FOR_GET) conn->stats.get_misses++; + if (read_state == WAITING_FOR_GET) { + conn->stats.get_misses++; + conn->stats.window_get_misses++; + found = false; + } + read_state = WAITING_FOR_GET; + done = true; + } else if (!strncmp(buf, "STORED", 6)) { read_state = WAITING_FOR_GET; done = true; } else if (!strncmp(buf, "VALUE", 5)) { @@ -93,6 +452,42 @@ bool ProtocolAscii::handle_response(evbuffer *input, bool &done) { } return false; + /* + case WAITING_FOR_GETSET: + buf = evbuffer_readln(input, &n_read_out, EVBUFFER_EOL_CRLF); + if (buf == NULL) return false; + + conn->stats.rx_bytes += n_read_out; + if (!strncmp(buf, "END", 3)) { + conn->stats.get_misses++; + conn->stats.window_get_misses++; + found = false; + done = true; + read_state = WAITING_FOR_SET; + return true; + } else if (!strncmp(buf, "STORED", 6)) { + done = true; + read_state = WAITING_FOR_GET; + return true; + } + + + case WAITING_FOR_SET: + buf = evbuffer_readln(input, &n_read_out, EVBUFFER_EOL_CRLF); + if (buf == NULL) return false; + + conn->stats.rx_bytes += n_read_out; + + if (!strncmp(buf, "STORED", 6)) { + done = true; + read_state = IDLE; + return true; + } else { + done = false; + return true; + } + */ + default: printf("state: %d\n", read_state); DIE("Unimplemented!"); } @@ -108,7 +503,7 @@ bool ProtocolBinary::setup_connection_w() { string user = string(opts.username); string pass = string(opts.password); - binary_header_t header = {0x80, CMD_SASL, 0, 0, 0, {0}, 0, 0, 0}; + binary_header_t header = {0x80, CMD_SASL, 0, 0, 0, 0, 0, 0, 0}; header.key_len = htons(5); 
header.body_len = htonl(6 + user.length() + 1 + pass.length()); @@ -126,50 +521,74 @@ bool ProtocolBinary::setup_connection_w() { bool ProtocolBinary::setup_connection_r(evbuffer* input) { if (!opts.sasl) return true; - bool b; - return handle_response(input, b); + bool b,c; + int obj_size; + uint32_t opaque; + return handle_response(input, b, c, obj_size, opaque); } /** * Send a binary get request. */ -int ProtocolBinary::get_request(const char* key) { +int ProtocolBinary::get_request(const char* key, uint32_t opaque) { + + struct evbuffer *output = bufferevent_get_output(bev); + uint16_t keylen = strlen(key); // each line is 4-bytes binary_header_t h = { 0x80, CMD_GET, htons(keylen), - 0x00, 0x00, {htons(0)}, + 0x00, 0x00, htons(0), htonl(keylen) }; + h.opaque = htonl(opaque); - bufferevent_write(bev, &h, 24); // size does not include extras - bufferevent_write(bev, key, keylen); + evbuffer_add(output, &h, 24); + evbuffer_add(output, key, keylen); + //bufferevent_write(bev, &h, 24); // size does not include extras + //bufferevent_write(bev, key, keylen); return 24 + keylen; } + + /** * Send a binary set request. 
*/ -int ProtocolBinary::set_request(const char* key, const char* value, int len) { +int ProtocolBinary::set_request(const char* key, const char* value, int len, uint32_t opaque) { + struct evbuffer *output = bufferevent_get_output(bev); + uint16_t keylen = strlen(key); // each line is 4-bytes binary_header_t h = { 0x80, CMD_SET, htons(keylen), - 0x08, 0x00, {htons(0)}, + 0x08, 0x00, htons(0), htonl(keylen + 8 + len) }; - - bufferevent_write(bev, &h, 32); // With extras - bufferevent_write(bev, key, keylen); - bufferevent_write(bev, value, len); + h.opaque = htonl(opaque); + //bufferevent_write(bev, &h, 32); // With extras + //bufferevent_write(bev, key, keylen); + //bufferevent_write(bev, value, len); + evbuffer_add(output, &h, 32); + evbuffer_add(output, key, keylen); + evbuffer_add(output, value, len); return 24 + ntohl(h.body_len); } +/** WARNING UNIMPLEMENTED **/ +int ProtocolBinary::delete90_request() { + int l; + l = evbuffer_add_printf(bufferevent_get_output(bev), + "*1\r\n$8\r\nFLUSHALL\r\n"); + + return l; +} + /** * Tries to consume a binary response (in its entirety) from an evbuffer. * * @param input evBuffer to read response from * @return true if consumed, false if not enough data in buffer. 
*/ -bool ProtocolBinary::handle_response(evbuffer *input, bool &done) { +bool ProtocolBinary::handle_response(evbuffer *input, bool &done, bool &found, int &opcode, uint32_t &opaque) { // Read the first 24 bytes as a header int length = evbuffer_get_length(input); if (length < 24) return false; @@ -177,24 +596,41 @@ bool ProtocolBinary::handle_response(evbuffer *input, bool &done) { reinterpret_cast(evbuffer_pullup(input, 24)); assert(h); + int bl = ntohl(h->body_len); // Not whole response - int targetLen = 24 + ntohl(h->body_len); + int targetLen = 24 + bl; if (length < targetLen) return false; + //fprintf(stderr,"handle resp - opcode: %u opaque: %u len: %u status: %u\n", + // h->opcode,ntohl(h->opaque), + // ntohl(h->body_len),ntohl(h->status)); + opcode = h->opcode; + opaque = ntohl(h->opaque); // If something other than success, count it as a miss - if (h->opcode == CMD_GET && h->status) { + if (opcode == CMD_GET && h->status) { conn->stats.get_misses++; + conn->stats.window_get_misses++; + found = false; } - if (unlikely(h->opcode == CMD_SASL)) { + if (unlikely(opcode == CMD_SASL)) { if (h->status == RESP_OK) { V("SASL authentication succeeded"); } else { DIE("SASL authentication failed"); } } + + if (bl > 0 && opcode == 1) { + //fprintf(stderr,"set resp len: %u\n",bl); + //void *data = malloc(bl); + //data = evbuffer_pullup(input, bl); + //free(data); + evbuffer_drain(input, targetLen); + } else { + evbuffer_drain(input, targetLen); + } - evbuffer_drain(input, targetLen); conn->stats.rx_bytes += targetLen; done = true; return true; diff --git a/Protocol.h b/Protocol.h index da7b253..ccd2293 100644 --- a/Protocol.h +++ b/Protocol.h @@ -18,9 +18,10 @@ class Protocol { virtual bool setup_connection_w() = 0; virtual bool setup_connection_r(evbuffer* input) = 0; - virtual int get_request(const char* key) = 0; - virtual int set_request(const char* key, const char* value, int len) = 0; - virtual bool handle_response(evbuffer* input, bool &done) = 0; + virtual int 
get_request(const char* key, uint32_t opaque) = 0; + virtual int set_request(const char* key, const char* value, int len, uint32_t opaque) = 0; + virtual int delete90_request() = 0; + virtual bool handle_response(evbuffer* input, bool &done, bool &found, int &obj_size, uint32_t &opaque) = 0; protected: options_t opts; @@ -39,9 +40,10 @@ class ProtocolAscii : public Protocol { virtual bool setup_connection_w() { return true; } virtual bool setup_connection_r(evbuffer* input) { return true; } - virtual int get_request(const char* key); - virtual int set_request(const char* key, const char* value, int len); - virtual bool handle_response(evbuffer* input, bool &done); + virtual int get_request(const char* key, uint32_t opaque); + virtual int set_request(const char* key, const char* value, int len, uint32_t opaque); + virtual int delete90_request(); + virtual bool handle_response(evbuffer* input, bool &done, bool &found, int &obj_size, uint32_t &opaque); private: enum read_fsm { @@ -49,6 +51,8 @@ class ProtocolAscii : public Protocol { WAITING_FOR_GET, WAITING_FOR_GET_DATA, WAITING_FOR_END, + WAITING_FOR_SET, + WAITING_FOR_GETSET }; read_fsm read_state; @@ -58,14 +62,49 @@ class ProtocolAscii : public Protocol { class ProtocolBinary : public Protocol { public: ProtocolBinary(options_t opts, Connection* conn, bufferevent* bev): - Protocol(opts, conn, bev) {}; + Protocol(opts, conn, bev) { + //int wbuf_written; + //int wbuf_towrite; + //unsigned char *wbuf_pos; + //unsigned char wbuf[65536]; + }; ~ProtocolBinary() {}; virtual bool setup_connection_w(); virtual bool setup_connection_r(evbuffer* input); - virtual int get_request(const char* key); - virtual int set_request(const char* key, const char* value, int len); - virtual bool handle_response(evbuffer* input, bool &done); + virtual int get_request(const char* key, uint32_t opaque); + virtual int set_request(const char* key, const char* value, int len, uint32_t opaque); + virtual int delete90_request(); + virtual bool 
handle_response(evbuffer* input, bool &done, bool &found, int &obj_size, uint32_t &opaque); +}; + +class ProtocolRESP : public Protocol { +public: + ProtocolRESP(options_t opts, Connection* conn, bufferevent* bev): + Protocol(opts, conn, bev) { + }; + ~ProtocolRESP() {}; + + virtual bool setup_connection_w() { return true; } + virtual bool setup_connection_r(evbuffer* input) { return true; } + virtual int get_request(const char* key, uint32_t opaque); + virtual int set_request(const char* key, const char* value, int len, uint32_t opaque); + virtual int hget_request(const char* key); + virtual int hset_request(const char* key, const char* value, int len); + virtual int delete90_request(); + virtual bool handle_response(evbuffer* input, bool &done, bool &found, int &obj_size, uint32_t &opaque); + +private: + enum read_fsm { + IDLE, + WAITING_FOR_GET, + WAITING_FOR_GET_DATA, + WAITING_FOR_DELETE, + WAITING_FOR_END + }; + + read_fsm read_state; + int data_length; }; #endif diff --git a/README.md b/README.md index e599886..2b1904f 100644 --- a/README.md +++ b/README.md @@ -117,7 +117,7 @@ client-side queuing delay adulterating the latency measurements. Command-line Options ==================== - mutilate3 0.1 + mutilate 0.1 Usage: mutilate -s server[:port] [options] @@ -129,20 +129,60 @@ Command-line Options --quiet Disable log messages. Basic options: - -s, --server=STRING Memcached server hostname[:port]. Repeat to + -s, --server=STRING Memcached server hostname[:port]. Repeat to specify multiple servers. + --unix_socket Use UNIX socket instead of TCP. --binary Use binary memcached protocol instead of ASCII. - -q, --qps=INT Target aggregate QPS. 0 = peak QPS. + --redis Use Redis RESP protocol instead of memchached. + --getset Use getset mode, in getset mode we first issue + a GET and if the response is MISS, then issue + a SET for on that + key following distribution value. + --getsetorset Use getset mode and allow for direct writes + (with optype == 2). 
+ --successful Only record latency and throughput stats for + successful queries + --prefix=STRING Prefix all keys with a string (helps with + multi-tennant eval) + --delete90 Delete 90 percent of keys after halfway through + the workload, used to model Rumbel et. al. + USENIX FAST '14 + workloads. MUST BE IN GETSET MODE and + have a set number of + queries + --assoc=INT We create hash tables by taking the truncating + the key by b bytes. The + n-b bytes are the key for redis, in the + original (key,value). The + value is a hash table and we acess field + b to get the value. Essentially this makes + redis n-way associative + cache. Only works in redis mode. For small + key sizes we just use + normal method of (key,value) store. No hash + table. (default=`4') + -q, --qps=INT Target aggregate QPS. 0 = peak QPS. (default=`0') -t, --time=INT Maximum time to run (seconds). (default=`5') - -K, --keysize=STRING Length of memcached keys (distribution). + --read_file=STRING Read keys from file. (default=`') + --twitter_trace=INT use twitter memcached trace format from file. + (default=`0') + -K, --keysize=STRING Length of memcached keys (distribution). (default=`30') - -V, --valuesize=STRING Length of memcached values (distribution). + -V, --valuesize=STRING Length of memcached values (distribution). (default=`200') - -r, --records=INT Number of memcached records to use. If - multiple memcached servers are given, this - number is divided by the number of servers. + -r, --records=INT Number of memcached records to use. If + multiple memcached servers are given, this + number is divided by the number of servers. (default=`10000') + -m, --misswindow=INT Window for recording misses, used to find + steady state, no window by default, which + gives us summary stats in total + (default=`0') + -N, --queries=INT Number of queries to make. 0 is unlimited + (default) If multiple memcached servers are + given, this number is divided by the number + of servers. 
(default=`0') -u, --update=FLOAT Ratio of set:get commands. (default=`0.0') Advanced options: @@ -150,34 +190,34 @@ Command-line Options -P, --password=STRING Password to use for SASL authentication. -T, --threads=INT Number of threads to spawn. (default=`1') --affinity Set CPU affinity for threads, round-robin - -c, --connections=INT Connections to establish per server. + -c, --connections=INT Connections to establish per server. (default=`1') - -d, --depth=INT Maximum depth to pipeline requests. + -d, --depth=INT Maximum depth to pipeline requests. (default=`1') - -R, --roundrobin Assign threads to servers in round-robin - fashion. By default, each thread connects to + -R, --roundrobin Assign threads to servers in round-robin + fashion. By default, each thread connects to every server. - -i, --iadist=STRING Inter-arrival distribution (distribution). - Note: The distribution will automatically be - adjusted to match the QPS given by --qps. + -i, --iadist=STRING Inter-arrival distribution (distribution). + Note: The distribution will automatically be + adjusted to match the QPS given by --qps. (default=`exponential') - -S, --skip Skip transmissions if previous requests are - late. This harms the long-term QPS average, - but reduces spikes in QPS after long latency + -S, --skip Skip transmissions if previous requests are + late. This harms the long-term QPS average, + but reduces spikes in QPS after long latency requests. - --moderate Enforce a minimum delay of ~1/lambda between + --moderate Enforce a minimum delay of ~1/lambda between requests. --noload Skip database loading. --loadonly Load database and then exit. -B, --blocking Use blocking epoll(). May increase latency. --no_nodelay Don't use TCP_NODELAY. -w, --warmup=INT Warmup time before starting measurement. - -W, --wait=INT Time to wait after startup to start + -W, --wait=INT Time to wait after startup to start measurement. --save=STRING Record latency samples to given file. 
- --search=N:X Search for the QPS where N-order statistic < - Xus. (i.e. --search 95:1000 means find the - QPS where 95% of requests are faster than + --search=N:X Search for the QPS where N-order statistic < + Xus. (i.e. --search 95:1000 means find the + QPS where 95% of requests are faster than 1000us). --scan=min:max:step Scan latency across QPS rates from min to max. @@ -185,11 +225,11 @@ Command-line Options -A, --agentmode Run client in agent mode. -a, --agent=host Enlist remote agent. -p, --agent_port=STRING Agent port. (default=`5556') - -l, --lambda_mul=INT Lambda multiplier. Increases share of QPS for + -l, --lambda_mul=INT Lambda multiplier. Increases share of QPS for this client. (default=`1') - -C, --measure_connections=INT Master client connections per server, overrides + -C, --measure_connections=INT Master client connections per server, overrides --connections. - -Q, --measure_qps=INT Explicitly set master client QPS, spread across + -Q, --measure_qps=INT Explicitly set master client QPS, spread across threads and connections. -D, --measure_depth=INT Set master client connection depth. 
diff --git a/SConstruct b/SConstruct index 57d0054..f2a4e64 100644 --- a/SConstruct +++ b/SConstruct @@ -6,12 +6,18 @@ env = Environment(ENV = os.environ) env['HAVE_POSIX_BARRIER'] = True -env.Append(CPPPATH = ['/usr/local/include', '/opt/local/include']) -env.Append(LIBPATH = ['/opt/local/lib']) -env.Append(CCFLAGS = '-std=c++11 -D_GNU_SOURCE') -if sys.platform == 'darwin': - env['CC'] = 'clang' - env['CXX'] = 'clang++' +#env['CC'] = 'clang' +#env['CXX'] = 'clang++' + +#env.Append(CPPPATH = ['/u/dbyrne99/local/include', '/usr/include']) +#env.Append(CPATH = ['/u/dbyrne99/local/include', '/usr/include']) +#env.Append(LIBPATH = ['/u/dbyrne99/local/lib', '/lib64/']) + +#env.Append(CFLAGS = '-std=c++11 -D_GNU_SOURCE -static-libsan -fsanitize=address -I/u/dbyrne99/local/include' ) +#env.Append(CCFLAGS = '-std=c++11 -D_GNU_SOURCE -static-libsan -fsanitize=address -I/u/dbyrne99/local/include' ) +#if sys.platform == 'darwin': +#env['CC'] = 'clang' +#env['CXX'] = 'clang++' conf = env.Configure(config_h = "config.h") conf.Define("__STDC_FORMAT_MACROS") @@ -23,13 +29,14 @@ if env.Execute("@which gengetopt &> /dev/null"): print "not found (required)" Exit(1) else: print "found" -if not conf.CheckLibWithHeader("event", "event2/event.h", "C++"): - print "libevent required" - Exit(1) -conf.CheckDeclaration("EVENT_BASE_FLAG_PRECISE_TIMER", '#include ', "C++") +#if not conf.CheckLibWithHeader("event", "event2/event.h", "C++"): +# print "libevent required" +# Exit(1) +#conf.CheckDeclaration("EVENT_BASE_FLAG_PRECISE_TIMER", '#include ', "C++") if not conf.CheckLibWithHeader("pthread", "pthread.h", "C++"): print "pthread required" Exit(1) + conf.CheckLib("rt", "clock_gettime", language="C++") conf.CheckLibWithHeader("zmq", "zmq.hpp", "C++") if not conf.CheckFunc('pthread_barrier_init'): @@ -37,17 +44,33 @@ if not conf.CheckFunc('pthread_barrier_init'): env = conf.Finish() -env.Append(CFLAGS = ' -O3 -Wall -g') -env.Append(CPPFLAGS = ' -O3 -Wall -g') +#env.Append(CFLAGS = '-O0 -Wall -g 
--std=c++17 -lstdc++fs -fsanitize=address') +#env.Append(CPPFLAGS = '-O0 -Wall -g --std=c++17 -lstdc++fs -fsanitize=address') +env.Append(CFLAGS = ' -O2 -Wall -g --std=c++17 -lstdc++fs') +env.Append(CPPFLAGS = ' -O2 -Wall -g --std=c++17 -lstdc++fs') +#env.Append(CFLAGS = ' -O3 -Wall -g') +#env.Append(CPPFLAGS = ' -O3 -Wall -g') +#env.Append(LDFLAGS = '-fsantize=address') +#env.Append(CFLAGS = ' -O3 -Wall -g -fsantize=address') +#env.Append(CPPFLAGS = ' -O3 -Wall -g -fsanitize=address') +#env.Append(CFLAGS = ' -O0 -Wall -g') +#env.Append(CPPFLAGS = ' -O0 -Wall -g') + +#env.Append(CFLAGS = '-g -std=c++11 -D_GNU_SOURCE -static-libsan -fsanitize=address -I/u/dbyrne99/local/include' ) +#env.Append(CCFLAGS = '-g -std=c++11 -D_GNU_SOURCE -static-libsan -fsanitize=address -I/u/dbyrne99/local/include' ) env.Command(['cmdline.cc', 'cmdline.h'], 'cmdline.ggo', 'gengetopt < $SOURCE') src = Split("""mutilate.cc cmdline.cc log.cc distributions.cc util.cc - Connection.cc Protocol.cc Generator.cc""") + Connection.cc ConnectionMulti.cc ConnectionMultiApprox.cc ConnectionMultiApproxBatchShm.cc ConnectionMultiApproxBatch.cc ConnectionMultiApproxShm.cc Protocol.cc Generator.cc bipbuffer.cc""") + +#src = Split("""mutilate.cc cmdline.cc log.cc distributions.cc util.cc +# ConnectionMultiApprox.cc ConnectionMultiApproxBatchShm.cc Generator.cc bipbuffer.cc""") if not env['HAVE_POSIX_BARRIER']: # USE_POSIX_BARRIER: src += ['barrier.cc'] +src += ['libzstd.a', '/u/dbyrne99/local/lib/libevent.a'] env.Program(target='mutilate', source=src) -env.Program(target='gtest', source=['TestGenerator.cc', 'log.cc', 'util.cc', - 'Generator.cc']) +#env.Program(target='gtest', source=['TestGenerator.cc', 'log.cc', 'util.cc', +# 'Generator.cc']) diff --git a/binary_protocol.h b/binary_protocol.h index 2b5ef66..7c59ddf 100644 --- a/binary_protocol.h +++ b/binary_protocol.h @@ -1,31 +1,33 @@ #ifndef BINARY_PROTOCOL_H #define BINARY_PROTOCOL_H +#include + #define CMD_GET 0x00 +#define CMD_GETQ 0x09 +#define 
CMD_TOUCH 0x1c +#define CMD_TOUCH 0x1c +#define CMD_DELETE 0x04 #define CMD_SET 0x01 +#define CMD_NOOP 0x0a +#define CMD_SETQ 0x11 #define CMD_SASL 0x21 #define RESP_OK 0x00 +#define RESP_NOT_FOUND 0x01 #define RESP_SASL_ERR 0x20 -typedef struct __attribute__ ((__packed__)) { +typedef struct { uint8_t magic; uint8_t opcode; uint16_t key_len; - uint8_t extra_len; uint8_t data_type; - union { - uint16_t vbucket; // request use - uint16_t status; // response use - }; - + uint16_t status; // response use uint32_t body_len; uint32_t opaque; - uint64_t version; + uint64_t cas; - // Used for set only. - uint64_t extras; } binary_header_t; #endif /* BINARY_PROTOCOL_H */ diff --git a/bipbuffer.cc b/bipbuffer.cc new file mode 100644 index 0000000..b712617 --- /dev/null +++ b/bipbuffer.cc @@ -0,0 +1,182 @@ +/** + * Copyright (c) 2011, Willem-Hendrik Thiart + * Use of this source code is governed by a BSD-style license that can be + * found in the LICENSE.bipbuffer file. + * + * @file + * @author Willem Thiart himself@willemthiart.com + */ + +//#include "stdio.h" +#include + +/* for memcpy */ +#include + +#include "bipbuffer.h" + +static size_t bipbuf_sizeof(const unsigned int size) +{ + return sizeof(bipbuf_t) + size; +} + +int bipbuf_unused(const bipbuf_t* me) +{ + if (1 == me->b_inuse) + /* distance between region B and region A */ + return me->a_start - me->b_end; + else + return me->size - me->a_end; +} + +int bipbuf_size(const bipbuf_t* me) +{ + return me->size; +} + +int bipbuf_used(const bipbuf_t* me) +{ + return (me->a_end - me->a_start) + me->b_end; +} + +void bipbuf_init(bipbuf_t* me, const unsigned int size) +{ + me->a_start = me->a_end = me->b_end = 0; + me->size = size; + me->b_inuse = 0; +} + +bipbuf_t *bipbuf_new(const unsigned int size) +{ + bipbuf_t *me = (bipbuf_t*)malloc(bipbuf_sizeof(size)); + if (!me) + return NULL; + bipbuf_init(me, size); + return me; +} + +void bipbuf_free(bipbuf_t* me) +{ + free(me); +} + +int bipbuf_is_empty(const bipbuf_t* me) +{ + 
return me->a_start == me->a_end; +} + +/* find out if we should turn on region B + * ie. is the distance from A to buffer's end less than B to A? */ +static void __check_for_switch_to_b(bipbuf_t* me) +{ + if (me->size - me->a_end < me->a_start - me->b_end) { + //fprintf(stderr,"%p switching to b, a_start: %d, a_end: %d, b_end %d\n",me,me->a_start,me->a_end,me->b_end); + me->b_inuse = 1; + } +} + +/* TODO: DOCUMENT THESE TWO FUNCTIONS */ +unsigned char *bipbuf_request(bipbuf_t* me, const int size) +{ + if (bipbuf_unused(me) < size) + return 0; + if (1 == me->b_inuse) + { + return (unsigned char *)me->data + me->b_end; + } + else + { + return (unsigned char *)me->data + me->a_end; + } +} + +int bipbuf_push(bipbuf_t* me, const int size) +{ + if (bipbuf_unused(me) < size) + return 0; + + if (1 == me->b_inuse) + { + me->b_end += size; + } + else + { + me->a_end += size; + } + + __check_for_switch_to_b(me); + return size; +} + +int bipbuf_offer(bipbuf_t* me, const unsigned char *data, const int size) +{ + /* not enough space */ + if (bipbuf_unused(me) < size) + return 0; + + if (1 == me->b_inuse) + { + memcpy(me->data + me->b_end, data, size); + me->b_end += size; + } + else + { + memcpy(me->data + me->a_end, data, size); + me->a_end += size; + } + + __check_for_switch_to_b(me); + return size; +} + +unsigned char *bipbuf_peek(const bipbuf_t* me, const unsigned int size) +{ + /* make sure we can actually peek at this data */ + if (me->size < me->a_start + size) + return NULL; + + if (bipbuf_is_empty(me)) + return NULL; + + return (unsigned char *)me->data + me->a_start; +} + +unsigned char *bipbuf_peek_all(const bipbuf_t* me, unsigned int *size) +{ + if (bipbuf_is_empty(me)) + return NULL; + + *size = me->a_end - me->a_start; + return (unsigned char*)me->data + me->a_start; +} + +unsigned char *bipbuf_poll(bipbuf_t* me, const unsigned int size) +{ + if (bipbuf_is_empty(me)) + return NULL; + + /* make sure we can actually poll this data */ + if (me->size < me->a_start + 
size) + return NULL; + + void *end = me->data + me->a_start; + me->a_start += size; + + /* we seem to be empty.. */ + if (me->a_start == me->a_end) + { + /* replace a with region b */ + if (1 == me->b_inuse) + { + me->a_start = 0; + me->a_end = me->b_end; + me->b_end = me->b_inuse = 0; + } + else + /* safely move cursor back to the start because we are empty */ + me->a_start = me->a_end = 0; + } + + __check_for_switch_to_b(me); + return (unsigned char*) end; +} diff --git a/bipbuffer.h b/bipbuffer.h new file mode 100644 index 0000000..f99f148 --- /dev/null +++ b/bipbuffer.h @@ -0,0 +1,93 @@ +#ifndef BIPBUFFER_H +#define BIPBUFFER_H + +#define BIPBUFSIZE 4*1024*1024 +#include "binary_protocol.h" +#include + +extern "C" { + typedef struct + { + unsigned long int size; + + /* region A */ + unsigned int a_start, a_end; + + /* region B */ + unsigned int b_end; + + /* is B inuse? */ + int b_inuse; + + unsigned char data[BIPBUFSIZE]; + } bipbuf_t; + +/** + * Create a new bip buffer. + * + * malloc()s space + * + * @param[in] size The size of the buffer */ +bipbuf_t *bipbuf_new(const unsigned int size); + +/** + * Initialise a bip buffer. Use memory provided by user. + * + * No malloc()s are performed. + * + * @param[in] size The size of the array */ +void bipbuf_init(bipbuf_t* me, const unsigned int size); + +/** + * Free the bip buffer */ +void bipbuf_free(bipbuf_t *me); + +/* TODO: DOCUMENTATION */ +unsigned char *bipbuf_request(bipbuf_t* me, const int size); +int bipbuf_push(bipbuf_t* me, const int size); + +/** + * @param[in] data The data to be offered to the buffer + * @param[in] size The size of the data to be offered + * @return number of bytes offered */ +int bipbuf_offer(bipbuf_t *me, const unsigned char *data, const int size); + +/** + * Look at data. 
Don't move cursor + * + * @param[in] len The length of the data to be peeked + * @return data on success, NULL if we can't peek at this much data */ +unsigned char *bipbuf_peek(const bipbuf_t* me, const unsigned int len); + +/** + * Look at data. Don't move cursor + * + * @param[in] len The length of the data returned + * @return data on success, NULL if nothing available */ +unsigned char *bipbuf_peek_all(const bipbuf_t* me, unsigned int *len); + +/** + * Get pointer to data to read. Move the cursor on. + * + * @param[in] len The length of the data to be polled + * @return pointer to data, NULL if we can't poll this much data */ +unsigned char *bipbuf_poll(bipbuf_t* me, const unsigned int size); + +/** + * @return the size of the bipbuffer */ +int bipbuf_size(const bipbuf_t* me); + +/** + * @return 1 if buffer is empty; 0 otherwise */ +int bipbuf_is_empty(const bipbuf_t* me); + +/** + * @return how much space we have assigned */ +int bipbuf_used(const bipbuf_t* cb); + +/** + * @return bytes of unused space */ +int bipbuf_unused(const bipbuf_t* me); + +} +#endif /* BIPBUFFER_H */ diff --git a/blockingconcurrentqueue.h b/blockingconcurrentqueue.h new file mode 100644 index 0000000..66579b6 --- /dev/null +++ b/blockingconcurrentqueue.h @@ -0,0 +1,582 @@ +// Provides an efficient blocking version of moodycamel::ConcurrentQueue. +// ©2015-2020 Cameron Desrochers. Distributed under the terms of the simplified +// BSD license, available at the top of concurrentqueue.h. +// Also dual-licensed under the Boost Software License (see LICENSE.md) +// Uses Jeff Preshing's semaphore implementation (under the terms of its +// separate zlib license, see lightweightsemaphore.h). + +#pragma once + +#include "concurrentqueue.h" +#include "lightweightsemaphore.h" + +#include +#include +#include +#include +#include + +namespace moodycamel +{ +// This is a blocking version of the queue. 
It has an almost identical interface to +// the normal non-blocking version, with the addition of various wait_dequeue() methods +// and the removal of producer-specific dequeue methods. +template +class BlockingConcurrentQueue +{ +private: + typedef ::moodycamel::ConcurrentQueue ConcurrentQueue; + typedef ::moodycamel::LightweightSemaphore LightweightSemaphore; + +public: + typedef typename ConcurrentQueue::producer_token_t producer_token_t; + typedef typename ConcurrentQueue::consumer_token_t consumer_token_t; + + typedef typename ConcurrentQueue::index_t index_t; + typedef typename ConcurrentQueue::size_t size_t; + typedef typename std::make_signed::type ssize_t; + + static const size_t BLOCK_SIZE = ConcurrentQueue::BLOCK_SIZE; + static const size_t EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD = ConcurrentQueue::EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD; + static const size_t EXPLICIT_INITIAL_INDEX_SIZE = ConcurrentQueue::EXPLICIT_INITIAL_INDEX_SIZE; + static const size_t IMPLICIT_INITIAL_INDEX_SIZE = ConcurrentQueue::IMPLICIT_INITIAL_INDEX_SIZE; + static const size_t INITIAL_IMPLICIT_PRODUCER_HASH_SIZE = ConcurrentQueue::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE; + static const std::uint32_t EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE = ConcurrentQueue::EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE; + static const size_t MAX_SUBQUEUE_SIZE = ConcurrentQueue::MAX_SUBQUEUE_SIZE; + +public: + // Creates a queue with at least `capacity` element slots; note that the + // actual number of elements that can be inserted without additional memory + // allocation depends on the number of producers and the block size (e.g. if + // the block size is equal to `capacity`, only a single block will be allocated + // up-front, which means only a single producer will be able to enqueue elements + // without an extra allocation -- blocks aren't shared between producers). 
+ // This method is not thread safe -- it is up to the user to ensure that the + // queue is fully constructed before it starts being used by other threads (this + // includes making the memory effects of construction visible, possibly with a + // memory barrier). + explicit BlockingConcurrentQueue(size_t capacity = 6 * BLOCK_SIZE) + : inner(capacity), sema(create(0, (int)Traits::MAX_SEMA_SPINS), &BlockingConcurrentQueue::template destroy) + { + assert(reinterpret_cast((BlockingConcurrentQueue*)1) == &((BlockingConcurrentQueue*)1)->inner && "BlockingConcurrentQueue must have ConcurrentQueue as its first member"); + if (!sema) { + MOODYCAMEL_THROW(std::bad_alloc()); + } + } + + BlockingConcurrentQueue(size_t minCapacity, size_t maxExplicitProducers, size_t maxImplicitProducers) + : inner(minCapacity, maxExplicitProducers, maxImplicitProducers), sema(create(0, (int)Traits::MAX_SEMA_SPINS), &BlockingConcurrentQueue::template destroy) + { + assert(reinterpret_cast((BlockingConcurrentQueue*)1) == &((BlockingConcurrentQueue*)1)->inner && "BlockingConcurrentQueue must have ConcurrentQueue as its first member"); + if (!sema) { + MOODYCAMEL_THROW(std::bad_alloc()); + } + } + + // Disable copying and copy assignment + BlockingConcurrentQueue(BlockingConcurrentQueue const&) MOODYCAMEL_DELETE_FUNCTION; + BlockingConcurrentQueue& operator=(BlockingConcurrentQueue const&) MOODYCAMEL_DELETE_FUNCTION; + + // Moving is supported, but note that it is *not* a thread-safe operation. + // Nobody can use the queue while it's being moved, and the memory effects + // of that move must be propagated to other threads before they can use it. + // Note: When a queue is moved, its tokens are still valid but can only be + // used with the destination queue (i.e. semantically they are moved along + // with the queue itself). 
+ BlockingConcurrentQueue(BlockingConcurrentQueue&& other) MOODYCAMEL_NOEXCEPT + : inner(std::move(other.inner)), sema(std::move(other.sema)) + { } + + inline BlockingConcurrentQueue& operator=(BlockingConcurrentQueue&& other) MOODYCAMEL_NOEXCEPT + { + return swap_internal(other); + } + + // Swaps this queue's state with the other's. Not thread-safe. + // Swapping two queues does not invalidate their tokens, however + // the tokens that were created for one queue must be used with + // only the swapped queue (i.e. the tokens are tied to the + // queue's movable state, not the object itself). + inline void swap(BlockingConcurrentQueue& other) MOODYCAMEL_NOEXCEPT + { + swap_internal(other); + } + +private: + BlockingConcurrentQueue& swap_internal(BlockingConcurrentQueue& other) + { + if (this == &other) { + return *this; + } + + inner.swap(other.inner); + sema.swap(other.sema); + return *this; + } + +public: + // Enqueues a single item (by copying it). + // Allocates memory if required. Only fails if memory allocation fails (or implicit + // production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0, + // or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). + // Thread-safe. + inline bool enqueue(T const& item) + { + if ((details::likely)(inner.enqueue(item))) { + sema->signal(); + return true; + } + return false; + } + + // Enqueues a single item (by moving it, if possible). + // Allocates memory if required. Only fails if memory allocation fails (or implicit + // production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0, + // or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). + // Thread-safe. + inline bool enqueue(T&& item) + { + if ((details::likely)(inner.enqueue(std::move(item)))) { + sema->signal(); + return true; + } + return false; + } + + // Enqueues a single item (by copying it) using an explicit producer token. + // Allocates memory if required. 
Only fails if memory allocation fails (or + // Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). + // Thread-safe. + inline bool enqueue(producer_token_t const& token, T const& item) + { + if ((details::likely)(inner.enqueue(token, item))) { + sema->signal(); + return true; + } + return false; + } + + // Enqueues a single item (by moving it, if possible) using an explicit producer token. + // Allocates memory if required. Only fails if memory allocation fails (or + // Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). + // Thread-safe. + inline bool enqueue(producer_token_t const& token, T&& item) + { + if ((details::likely)(inner.enqueue(token, std::move(item)))) { + sema->signal(); + return true; + } + return false; + } + + // Enqueues several items. + // Allocates memory if required. Only fails if memory allocation fails (or + // implicit production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE + // is 0, or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). + // Note: Use std::make_move_iterator if the elements should be moved instead of copied. + // Thread-safe. + template + inline bool enqueue_bulk(It itemFirst, size_t count) + { + if ((details::likely)(inner.enqueue_bulk(std::forward(itemFirst), count))) { + sema->signal((LightweightSemaphore::ssize_t)(ssize_t)count); + return true; + } + return false; + } + + // Enqueues several items using an explicit producer token. + // Allocates memory if required. Only fails if memory allocation fails + // (or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). + // Note: Use std::make_move_iterator if the elements should be moved + // instead of copied. + // Thread-safe. 
+ template + inline bool enqueue_bulk(producer_token_t const& token, It itemFirst, size_t count) + { + if ((details::likely)(inner.enqueue_bulk(token, std::forward(itemFirst), count))) { + sema->signal((LightweightSemaphore::ssize_t)(ssize_t)count); + return true; + } + return false; + } + + // Enqueues a single item (by copying it). + // Does not allocate memory. Fails if not enough room to enqueue (or implicit + // production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE + // is 0). + // Thread-safe. + inline bool try_enqueue(T const& item) + { + if (inner.try_enqueue(item)) { + sema->signal(); + return true; + } + return false; + } + + // Enqueues a single item (by moving it, if possible). + // Does not allocate memory (except for one-time implicit producer). + // Fails if not enough room to enqueue (or implicit production is + // disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0). + // Thread-safe. + inline bool try_enqueue(T&& item) + { + if (inner.try_enqueue(std::move(item))) { + sema->signal(); + return true; + } + return false; + } + + // Enqueues a single item (by copying it) using an explicit producer token. + // Does not allocate memory. Fails if not enough room to enqueue. + // Thread-safe. + inline bool try_enqueue(producer_token_t const& token, T const& item) + { + if (inner.try_enqueue(token, item)) { + sema->signal(); + return true; + } + return false; + } + + // Enqueues a single item (by moving it, if possible) using an explicit producer token. + // Does not allocate memory. Fails if not enough room to enqueue. + // Thread-safe. + inline bool try_enqueue(producer_token_t const& token, T&& item) + { + if (inner.try_enqueue(token, std::move(item))) { + sema->signal(); + return true; + } + return false; + } + + // Enqueues several items. + // Does not allocate memory (except for one-time implicit producer). 
+ // Fails if not enough room to enqueue (or implicit production is + // disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0). + // Note: Use std::make_move_iterator if the elements should be moved + // instead of copied. + // Thread-safe. + template + inline bool try_enqueue_bulk(It itemFirst, size_t count) + { + if (inner.try_enqueue_bulk(std::forward(itemFirst), count)) { + sema->signal((LightweightSemaphore::ssize_t)(ssize_t)count); + return true; + } + return false; + } + + // Enqueues several items using an explicit producer token. + // Does not allocate memory. Fails if not enough room to enqueue. + // Note: Use std::make_move_iterator if the elements should be moved + // instead of copied. + // Thread-safe. + template + inline bool try_enqueue_bulk(producer_token_t const& token, It itemFirst, size_t count) + { + if (inner.try_enqueue_bulk(token, std::forward(itemFirst), count)) { + sema->signal((LightweightSemaphore::ssize_t)(ssize_t)count); + return true; + } + return false; + } + + + // Attempts to dequeue from the queue. + // Returns false if all producer streams appeared empty at the time they + // were checked (so, the queue is likely but not guaranteed to be empty). + // Never allocates. Thread-safe. + template + inline bool try_dequeue(U& item) + { + if (sema->tryWait()) { + while (!inner.try_dequeue(item)) { + continue; + } + return true; + } + return false; + } + + // Attempts to dequeue from the queue using an explicit consumer token. + // Returns false if all producer streams appeared empty at the time they + // were checked (so, the queue is likely but not guaranteed to be empty). + // Never allocates. Thread-safe. + template + inline bool try_dequeue(consumer_token_t& token, U& item) + { + if (sema->tryWait()) { + while (!inner.try_dequeue(token, item)) { + continue; + } + return true; + } + return false; + } + + // Attempts to dequeue several elements from the queue. + // Returns the number of items actually dequeued. 
+ // Returns 0 if all producer streams appeared empty at the time they + // were checked (so, the queue is likely but not guaranteed to be empty). + // Never allocates. Thread-safe. + template + inline size_t try_dequeue_bulk(It itemFirst, size_t max) + { + size_t count = 0; + max = (size_t)sema->tryWaitMany((LightweightSemaphore::ssize_t)(ssize_t)max); + while (count != max) { + count += inner.template try_dequeue_bulk(itemFirst, max - count); + } + return count; + } + + // Attempts to dequeue several elements from the queue using an explicit consumer token. + // Returns the number of items actually dequeued. + // Returns 0 if all producer streams appeared empty at the time they + // were checked (so, the queue is likely but not guaranteed to be empty). + // Never allocates. Thread-safe. + template + inline size_t try_dequeue_bulk(consumer_token_t& token, It itemFirst, size_t max) + { + size_t count = 0; + max = (size_t)sema->tryWaitMany((LightweightSemaphore::ssize_t)(ssize_t)max); + while (count != max) { + count += inner.template try_dequeue_bulk(token, itemFirst, max - count); + } + return count; + } + + + + // Blocks the current thread until there's something to dequeue, then + // dequeues it. + // Never allocates. Thread-safe. + template + inline void wait_dequeue(U& item) + { + while (!sema->wait()) { + continue; + } + while (!inner.try_dequeue(item)) { + continue; + } + } + + // Blocks the current thread until either there's something to dequeue + // or the timeout (specified in microseconds) expires. Returns false + // without setting `item` if the timeout expires, otherwise assigns + // to `item` and returns true. + // Using a negative timeout indicates an indefinite timeout, + // and is thus functionally equivalent to calling wait_dequeue. + // Never allocates. Thread-safe. 
+ template + inline bool wait_dequeue_timed(U& item, std::int64_t timeout_usecs) + { + if (!sema->wait(timeout_usecs)) { + return false; + } + while (!inner.try_dequeue(item)) { + continue; + } + return true; + } + + // Blocks the current thread until either there's something to dequeue + // or the timeout expires. Returns false without setting `item` if the + // timeout expires, otherwise assigns to `item` and returns true. + // Never allocates. Thread-safe. + template + inline bool wait_dequeue_timed(U& item, std::chrono::duration const& timeout) + { + return wait_dequeue_timed(item, std::chrono::duration_cast(timeout).count()); + } + + // Blocks the current thread until there's something to dequeue, then + // dequeues it using an explicit consumer token. + // Never allocates. Thread-safe. + template + inline void wait_dequeue(consumer_token_t& token, U& item) + { + while (!sema->wait()) { + continue; + } + while (!inner.try_dequeue(token, item)) { + continue; + } + } + + // Blocks the current thread until either there's something to dequeue + // or the timeout (specified in microseconds) expires. Returns false + // without setting `item` if the timeout expires, otherwise assigns + // to `item` and returns true. + // Using a negative timeout indicates an indefinite timeout, + // and is thus functionally equivalent to calling wait_dequeue. + // Never allocates. Thread-safe. + template + inline bool wait_dequeue_timed(consumer_token_t& token, U& item, std::int64_t timeout_usecs) + { + if (!sema->wait(timeout_usecs)) { + return false; + } + while (!inner.try_dequeue(token, item)) { + continue; + } + return true; + } + + // Blocks the current thread until either there's something to dequeue + // or the timeout expires. Returns false without setting `item` if the + // timeout expires, otherwise assigns to `item` and returns true. + // Never allocates. Thread-safe. 
+ template + inline bool wait_dequeue_timed(consumer_token_t& token, U& item, std::chrono::duration const& timeout) + { + return wait_dequeue_timed(token, item, std::chrono::duration_cast(timeout).count()); + } + + // Attempts to dequeue several elements from the queue. + // Returns the number of items actually dequeued, which will + // always be at least one (this method blocks until the queue + // is non-empty) and at most max. + // Never allocates. Thread-safe. + template + inline size_t wait_dequeue_bulk(It itemFirst, size_t max) + { + size_t count = 0; + max = (size_t)sema->waitMany((LightweightSemaphore::ssize_t)(ssize_t)max); + while (count != max) { + count += inner.template try_dequeue_bulk(itemFirst, max - count); + } + return count; + } + + // Attempts to dequeue several elements from the queue. + // Returns the number of items actually dequeued, which can + // be 0 if the timeout expires while waiting for elements, + // and at most max. + // Using a negative timeout indicates an indefinite timeout, + // and is thus functionally equivalent to calling wait_dequeue_bulk. + // Never allocates. Thread-safe. + template + inline size_t wait_dequeue_bulk_timed(It itemFirst, size_t max, std::int64_t timeout_usecs) + { + size_t count = 0; + max = (size_t)sema->waitMany((LightweightSemaphore::ssize_t)(ssize_t)max, timeout_usecs); + while (count != max) { + count += inner.template try_dequeue_bulk(itemFirst, max - count); + } + return count; + } + + // Attempts to dequeue several elements from the queue. + // Returns the number of items actually dequeued, which can + // be 0 if the timeout expires while waiting for elements, + // and at most max. + // Never allocates. Thread-safe. 
+ template + inline size_t wait_dequeue_bulk_timed(It itemFirst, size_t max, std::chrono::duration const& timeout) + { + return wait_dequeue_bulk_timed(itemFirst, max, std::chrono::duration_cast(timeout).count()); + } + + // Attempts to dequeue several elements from the queue using an explicit consumer token. + // Returns the number of items actually dequeued, which will + // always be at least one (this method blocks until the queue + // is non-empty) and at most max. + // Never allocates. Thread-safe. + template + inline size_t wait_dequeue_bulk(consumer_token_t& token, It itemFirst, size_t max) + { + size_t count = 0; + max = (size_t)sema->waitMany((LightweightSemaphore::ssize_t)(ssize_t)max); + while (count != max) { + count += inner.template try_dequeue_bulk(token, itemFirst, max - count); + } + return count; + } + + // Attempts to dequeue several elements from the queue using an explicit consumer token. + // Returns the number of items actually dequeued, which can + // be 0 if the timeout expires while waiting for elements, + // and at most max. + // Using a negative timeout indicates an indefinite timeout, + // and is thus functionally equivalent to calling wait_dequeue_bulk. + // Never allocates. Thread-safe. + template + inline size_t wait_dequeue_bulk_timed(consumer_token_t& token, It itemFirst, size_t max, std::int64_t timeout_usecs) + { + size_t count = 0; + max = (size_t)sema->waitMany((LightweightSemaphore::ssize_t)(ssize_t)max, timeout_usecs); + while (count != max) { + count += inner.template try_dequeue_bulk(token, itemFirst, max - count); + } + return count; + } + + // Attempts to dequeue several elements from the queue using an explicit consumer token. + // Returns the number of items actually dequeued, which can + // be 0 if the timeout expires while waiting for elements, + // and at most max. + // Never allocates. Thread-safe. 
+ template + inline size_t wait_dequeue_bulk_timed(consumer_token_t& token, It itemFirst, size_t max, std::chrono::duration const& timeout) + { + return wait_dequeue_bulk_timed(token, itemFirst, max, std::chrono::duration_cast(timeout).count()); + } + + + // Returns an estimate of the total number of elements currently in the queue. This + // estimate is only accurate if the queue has completely stabilized before it is called + // (i.e. all enqueue and dequeue operations have completed and their memory effects are + // visible on the calling thread, and no further operations start while this method is + // being called). + // Thread-safe. + inline size_t size_approx() const + { + return (size_t)sema->availableApprox(); + } + + + // Returns true if the underlying atomic variables used by + // the queue are lock-free (they should be on most platforms). + // Thread-safe. + static bool is_lock_free() + { + return ConcurrentQueue::is_lock_free(); + } + + +private: + template + static inline U* create(A1&& a1, A2&& a2) + { + void* p = (Traits::malloc)(sizeof(U)); + return p != nullptr ? new (p) U(std::forward(a1), std::forward(a2)) : nullptr; + } + + template + static inline void destroy(U* p) + { + if (p != nullptr) { + p->~U(); + } + (Traits::free)(p); + } + +private: + ConcurrentQueue inner; + std::unique_ptr sema; +}; + + +template +inline void swap(BlockingConcurrentQueue& a, BlockingConcurrentQueue& b) MOODYCAMEL_NOEXCEPT +{ + a.swap(b); +} + +} // end namespace moodycamel diff --git a/cmdline.ggo b/cmdline.ggo index 331dd21..73e4b34 100644 --- a/cmdline.ggo +++ b/cmdline.ggo @@ -10,11 +10,45 @@ option "quiet" - "Disable log messages." text "\nBasic options:" +option "use_shm" - "use shared memory" +option "use_shm_batch" - "use shared memory BATCHED" +option "ratelimit" - "limit conns from exceeding each other in requests" +option "v1callback" - "use v1 readcallbacks" option "server" s "Memcached server hostname[:port]. \ Repeat to specify multiple servers." 
string multiple +option "unix_socket" - "Use UNIX socket instead of TCP." +option "approx" - "approximate two level caching with inclusive/exclusive" +option "approx_batch" - "approximate two level caching with inclusive/exclusive and batching of reqs" +option "inclusives" - "give a list of 1 == inclusive, 2 == exclusives for each class" string default="" option "binary" - "Use binary memcached protocol instead of ASCII." +option "redis" - "Use Redis RESP protocol instead of memchached." +option "getset" - "Use getset mode, in getset mode we first issue \ +a GET and if the response is MISS, then issue a SET for on that +key following distribution value." +option "getsetorset" - "Use getset mode and allow for direct writes (with optype == 2)." +option "successful" - "Only record latency and throughput stats for successful queries" +option "prefix" - "Prefix all keys with a string (helps with multi-tennant eval)" string +option "delete90" - "Delete 90 percent of keys after halfway through \ + the workload, used to model Rumbel et. al. USENIX \ + FAST '14 workloads. MUST BE IN GETSET MODE and + have a set number of queries" + +option "assoc" - "We create hash tables by taking the truncating the \ + key by b bytes. The n-b bytes are the key for redis, in the original \ + (key,value). The value is a hash table and we acess field \ + b to get the value. Essentially this makes redis n-way \ + associative cache. Only works in redis mode. For small key \ + sizes we just use normal method of (key,value) store. No hash table." int default="4" option "qps" q "Target aggregate QPS. 0 = peak QPS." int default="0" option "time" t "Maximum time to run (seconds)." 
int default="5" +option "apps" - "Number of apps, should eqaul total conns" int default="1" +option "rand_admit" - "random admission to nvm" int default="0" +option "wb_all" - "all admission to nvm" int default="1" +option "threshold" - "admission to nvm if in top n" int default="0" +option "miss_through" - "All sets are considered dirty, expect for miss driven sets" + +option "read_file" - "Read keys from file." string default="" +option "twitter_trace" - "use twitter memcached trace format from file." int default="0" option "keysize" K "Length of memcached keys (distribution)." string default="30" @@ -25,6 +59,14 @@ option "records" r "Number of memcached records to use. \ If multiple memcached servers are given, this number is divided \ by the number of servers." int default="10000" +option "misswindow" m "Window for recording misses, used to find \ + steady state, no window by default, which \ + gives us summary stats in total" int default="0" + +option "queries" N "Number of queries to make. 0 is unlimited (default) \ +If multiple memcached servers are given, this number is divided \ +by the number of servers." int default="0" + option "update" u "Ratio of set:get commands." float default="0.0" text "\nAdvanced options:" diff --git a/common.h b/common.h new file mode 100644 index 0000000..8d21e69 --- /dev/null +++ b/common.h @@ -0,0 +1,236 @@ +/* + * Copyright (c) 2016-2021, Yann Collet, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. + */ + +/* + * This header file has common utility functions used in examples. + */ +#ifndef COMMON_H +#define COMMON_H + +#include // malloc, free, exit +#include // fprintf, perror, fopen, etc. 
+#include // strerror +#include // errno +#include // stat +#include "zstd.h" + +/* + * Define the returned error code from utility functions. + */ +typedef enum { + ERROR_fsize = 1, + ERROR_fopen = 2, + ERROR_fclose = 3, + ERROR_fread = 4, + ERROR_fwrite = 5, + ERROR_loadFile = 6, + ERROR_saveFile = 7, + ERROR_malloc = 8, + ERROR_largeFile = 9, +} COMMON_ErrorCode; + +/*! CHECK + * Check that the condition holds. If it doesn't print a message and die. + */ +#define CHECK(cond, ...) \ + do { \ + if (!(cond)) { \ + fprintf(stderr, \ + "%s:%d CHECK(%s) failed: ", \ + __FILE__, \ + __LINE__, \ + #cond); \ + fprintf(stderr, "" __VA_ARGS__); \ + fprintf(stderr, "\n"); \ + exit(1); \ + } \ + } while (0) + +/*! CHECK_ZSTD + * Check the zstd error code and die if an error occurred after printing a + * message. + */ +#define CHECK_ZSTD(fn, ...) \ + do { \ + size_t const err = (fn); \ + CHECK(!ZSTD_isError(err), "%s", ZSTD_getErrorName(err)); \ + } while (0) + +/*! fsize_orDie() : + * Get the size of a given file path. + * + * @return The size of a given file path. + * +static size_t fsize_orDie(const char *filename) +{ + struct stat st; + if (stat(filename, &st) != 0) { + perror(filename); + exit(ERROR_fsize); + } + + off_t const fileSize = st.st_size; + size_t const size = (size_t)fileSize; + * 1. fileSize should be non-negative, + * 2. if off_t -> size_t type conversion results in discrepancy, + * the file size is too large for type size_t. + * + if ((fileSize < 0) || (fileSize != (off_t)size)) { + fprintf(stderr, "%s : filesize too large \n", filename); + exit(ERROR_largeFile); + } + return size; +} +*/ + +/*! fopen_orDie() : + * Open a file using given file path and open option. + * + * @return If successful this function will return a FILE pointer to an + * opened file otherwise it sends an error to stderr and exits. 
+ */ +static FILE* fopen_orDie(const char *filename, const char *instruction) +{ + FILE* const inFile = fopen(filename, instruction); + if (inFile) return inFile; + /* error */ + perror(filename); + exit(ERROR_fopen); +} + +/*! fclose_orDie() : + * Close an opened file using given FILE pointer. + */ +static void fclose_orDie(FILE* file) +{ + if (!fclose(file)) { return; }; + /* error */ + perror("fclose"); + exit(ERROR_fclose); +} + +/*! fread_orDie() : + * + * Read sizeToRead bytes from a given file, storing them at the + * location given by buffer. + * + * @return The number of bytes read. + */ +static size_t fread_orDie(void* buffer, size_t sizeToRead, FILE* file) +{ + size_t const readSize = fread(buffer, 1, sizeToRead, file); + if (readSize == sizeToRead) return readSize; /* good */ + if (feof(file)) return readSize; /* good, reached end of file */ + /* error */ + perror("fread"); + exit(ERROR_fread); +} + +/*! fwrite_orDie() : + * + * Write sizeToWrite bytes to a file pointed to by file, obtaining + * them from a location given by buffer. + * + * Note: This function will send an error to stderr and exit if it + * cannot write data to the given file pointer. + * + * @return The number of bytes written. + */ +//static size_t fwrite_orDie(const void* buffer, size_t sizeToWrite, FILE* file) +//{ +// size_t const writtenSize = fwrite(buffer, 1, sizeToWrite, file); +// if (writtenSize == sizeToWrite) return sizeToWrite; /* good */ +// /* error */ +// perror("fwrite"); +// exit(ERROR_fwrite); +//} + +/*! malloc_orDie() : + * Allocate memory. + * + * @return If successful this function returns a pointer to allo- + * cated memory. If there is an error, this function will send that + * error to stderr and exit. + */ +static void* malloc_orDie(size_t size) +{ + void* const buff = malloc(size); + if (buff) return buff; + /* error */ + perror("malloc"); + exit(ERROR_malloc); +} + +/*! loadFile_orDie() : + * load file into buffer (memory). 
+ * + * Note: This function will send an error to stderr and exit if it + * cannot read data from the given file path. + * + * @return If successful this function will load file into buffer and + * return file size, otherwise it will printout an error to stderr and exit. + * +static size_t loadFile_orDie(const char* fileName, void* buffer, size_t bufferSize) +{ + size_t const fileSize = fsize_orDie(fileName); + CHECK(fileSize <= bufferSize, "File too large!"); + + FILE* const inFile = fopen_orDie(fileName, "rb"); + size_t const readSize = fread(buffer, 1, fileSize, inFile); + if (readSize != (size_t)fileSize) { + fprintf(stderr, "fread: %s : %s \n", fileName, strerror(errno)); + exit(ERROR_fread); + } + fclose(inFile); + return fileSize; +} +*/ + +/*! mallocAndLoadFile_orDie() : + * allocate memory buffer and then load file into it. + * + * Note: This function will send an error to stderr and exit if memory allocation + * fails or it cannot read data from the given file path. + * + * @return If successful this function will return buffer and bufferSize(=fileSize), + * otherwise it will printout an error to stderr and exit. + * +static void* mallocAndLoadFile_orDie(const char* fileName, size_t* bufferSize) { + size_t const fileSize = fsize_orDie(fileName); + *bufferSize = fileSize; + void* const buffer = malloc_orDie(*bufferSize); + loadFile_orDie(fileName, buffer, *bufferSize); + return buffer; +} +*/ + +/*! saveFile_orDie() : + * + * Save buffSize bytes to a given file path, obtaining them from a location pointed + * to by buff. + * + * Note: This function will send an error to stderr and exit if it + * cannot write to a given file. 
+ */ +//static void saveFile_orDie(const char* fileName, const void* buff, size_t buffSize) +//{ +// FILE* const oFile = fopen_orDie(fileName, "wb"); +// size_t const wSize = fwrite(buff, 1, buffSize, oFile); +// if (wSize != (size_t)buffSize) { +// fprintf(stderr, "fwrite: %s : %s \n", fileName, strerror(errno)); +// exit(ERROR_fwrite); +// } +// if (fclose(oFile)) { +// perror(fileName); +// exit(ERROR_fclose); +// } +//} + +#endif diff --git a/concurrentqueue.h b/concurrentqueue.h new file mode 100644 index 0000000..b38d218 --- /dev/null +++ b/concurrentqueue.h @@ -0,0 +1,3742 @@ +// Provides a C++11 implementation of a multi-producer, multi-consumer lock-free queue. +// An overview, including benchmark results, is provided here: +// http://moodycamel.com/blog/2014/a-fast-general-purpose-lock-free-queue-for-c++ +// The full design is also described in excruciating detail at: +// http://moodycamel.com/blog/2014/detailed-design-of-a-lock-free-queue + +// Simplified BSD license: +// Copyright (c) 2013-2020, Cameron Desrochers. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// - Redistributions of source code must retain the above copyright notice, this list of +// conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, this list of +// conditions and the following disclaimer in the documentation and/or other materials +// provided with the distribution. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL +// THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT +// OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR +// TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +// EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// Also dual-licensed under the Boost Software License (see LICENSE.md) + +#pragma once + +#if defined(__GNUC__) && !defined(__INTEL_COMPILER) +// Disable -Wconversion warnings (spuriously triggered when Traits::size_t and +// Traits::index_t are set to < 32 bits, causing integer promotion, causing warnings +// upon assigning any computed values) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wconversion" + +#ifdef MCDBGQ_USE_RELACY +#pragma GCC diagnostic ignored "-Wint-to-pointer-cast" +#endif +#endif + +#if defined(_MSC_VER) && (!defined(_HAS_CXX17) || !_HAS_CXX17) +// VS2019 with /W4 warns about constant conditional expressions but unless /std=c++17 or higher +// does not support `if constexpr`, so we have no choice but to simply disable the warning +#pragma warning(push) +#pragma warning(disable: 4127) // conditional expression is constant +#endif + +#if defined(__APPLE__) +#include "TargetConditionals.h" +#endif + +#ifdef MCDBGQ_USE_RELACY +#include "relacy/relacy_std.hpp" +#include "relacy_shims.h" +// We only use malloc/free anyway, and the delete macro messes up `= delete` method declarations. +// We'll override the default trait malloc ourselves without a macro. +#undef new +#undef delete +#undef malloc +#undef free +#else +#include // Requires C++11. Sorry VS2010. 
+#include +#endif +#include // for max_align_t +#include +#include +#include +#include +#include +#include +#include // for CHAR_BIT +#include +#include // partly for __WINPTHREADS_VERSION if on MinGW-w64 w/ POSIX threading + +// Platform-specific definitions of a numeric thread ID type and an invalid value +namespace moodycamel { namespace details { + template struct thread_id_converter { + typedef thread_id_t thread_id_numeric_size_t; + typedef thread_id_t thread_id_hash_t; + static thread_id_hash_t prehash(thread_id_t const& x) { return x; } + }; +} } +#if defined(MCDBGQ_USE_RELACY) +namespace moodycamel { namespace details { + typedef std::uint32_t thread_id_t; + static const thread_id_t invalid_thread_id = 0xFFFFFFFFU; + static const thread_id_t invalid_thread_id2 = 0xFFFFFFFEU; + static inline thread_id_t thread_id() { return rl::thread_index(); } +} } +#elif defined(_WIN32) || defined(__WINDOWS__) || defined(__WIN32__) +// No sense pulling in windows.h in a header, we'll manually declare the function +// we use and rely on backwards-compatibility for this not to break +extern "C" __declspec(dllimport) unsigned long __stdcall GetCurrentThreadId(void); +namespace moodycamel { namespace details { + static_assert(sizeof(unsigned long) == sizeof(std::uint32_t), "Expected size of unsigned long to be 32 bits on Windows"); + typedef std::uint32_t thread_id_t; + static const thread_id_t invalid_thread_id = 0; // See http://blogs.msdn.com/b/oldnewthing/archive/2004/02/23/78395.aspx + static const thread_id_t invalid_thread_id2 = 0xFFFFFFFFU; // Not technically guaranteed to be invalid, but is never used in practice. Note that all Win32 thread IDs are presently multiples of 4. 
+ static inline thread_id_t thread_id() { return static_cast(::GetCurrentThreadId()); } +} } +#elif defined(__arm__) || defined(_M_ARM) || defined(__aarch64__) || (defined(__APPLE__) && TARGET_OS_IPHONE) +namespace moodycamel { namespace details { + static_assert(sizeof(std::thread::id) == 4 || sizeof(std::thread::id) == 8, "std::thread::id is expected to be either 4 or 8 bytes"); + + typedef std::thread::id thread_id_t; + static const thread_id_t invalid_thread_id; // Default ctor creates invalid ID + + // Note we don't define a invalid_thread_id2 since std::thread::id doesn't have one; it's + // only used if MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED is defined anyway, which it won't + // be. + static inline thread_id_t thread_id() { return std::this_thread::get_id(); } + + template struct thread_id_size { }; + template<> struct thread_id_size<4> { typedef std::uint32_t numeric_t; }; + template<> struct thread_id_size<8> { typedef std::uint64_t numeric_t; }; + + template<> struct thread_id_converter { + typedef thread_id_size::numeric_t thread_id_numeric_size_t; +#ifndef __APPLE__ + typedef std::size_t thread_id_hash_t; +#else + typedef thread_id_numeric_size_t thread_id_hash_t; +#endif + + static thread_id_hash_t prehash(thread_id_t const& x) + { +#ifndef __APPLE__ + return std::hash()(x); +#else + return *reinterpret_cast(&x); +#endif + } + }; +} } +#else +// Use a nice trick from this answer: http://stackoverflow.com/a/8438730/21475 +// In order to get a numeric thread ID in a platform-independent way, we use a thread-local +// static variable's address as a thread identifier :-) +#if defined(__GNUC__) || defined(__INTEL_COMPILER) +#define MOODYCAMEL_THREADLOCAL __thread +#elif defined(_MSC_VER) +#define MOODYCAMEL_THREADLOCAL __declspec(thread) +#else +// Assume C++11 compliant compiler +#define MOODYCAMEL_THREADLOCAL thread_local +#endif +namespace moodycamel { namespace details { + typedef std::uintptr_t thread_id_t; + static const thread_id_t 
invalid_thread_id = 0; // Address can't be nullptr + static const thread_id_t invalid_thread_id2 = 1; // Member accesses off a null pointer are also generally invalid. Plus it's not aligned. + inline thread_id_t thread_id() { static MOODYCAMEL_THREADLOCAL int x; return reinterpret_cast(&x); } +} } +#endif + +// Constexpr if +#ifndef MOODYCAMEL_CONSTEXPR_IF +#if (defined(_MSC_VER) && defined(_HAS_CXX17) && _HAS_CXX17) || __cplusplus > 201402L +#define MOODYCAMEL_CONSTEXPR_IF if constexpr +#define MOODYCAMEL_MAYBE_UNUSED [[maybe_unused]] +#else +#define MOODYCAMEL_CONSTEXPR_IF if +#define MOODYCAMEL_MAYBE_UNUSED +#endif +#endif + +// Exceptions +#ifndef MOODYCAMEL_EXCEPTIONS_ENABLED +#if (defined(_MSC_VER) && defined(_CPPUNWIND)) || (defined(__GNUC__) && defined(__EXCEPTIONS)) || (!defined(_MSC_VER) && !defined(__GNUC__)) +#define MOODYCAMEL_EXCEPTIONS_ENABLED +#endif +#endif +#ifdef MOODYCAMEL_EXCEPTIONS_ENABLED +#define MOODYCAMEL_TRY try +#define MOODYCAMEL_CATCH(...) catch(__VA_ARGS__) +#define MOODYCAMEL_RETHROW throw +#define MOODYCAMEL_THROW(expr) throw (expr) +#else +#define MOODYCAMEL_TRY MOODYCAMEL_CONSTEXPR_IF (true) +#define MOODYCAMEL_CATCH(...) else MOODYCAMEL_CONSTEXPR_IF (false) +#define MOODYCAMEL_RETHROW +#define MOODYCAMEL_THROW(expr) +#endif + +#ifndef MOODYCAMEL_NOEXCEPT +#if !defined(MOODYCAMEL_EXCEPTIONS_ENABLED) +#define MOODYCAMEL_NOEXCEPT +#define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr) true +#define MOODYCAMEL_NOEXCEPT_ASSIGN(type, valueType, expr) true +#elif defined(_MSC_VER) && defined(_NOEXCEPT) && _MSC_VER < 1800 +// VS2012's std::is_nothrow_[move_]constructible is broken and returns true when it shouldn't :-( +// We have to assume *all* non-trivial constructors may throw on VS2012! +#define MOODYCAMEL_NOEXCEPT _NOEXCEPT +#define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr) (std::is_rvalue_reference::value && std::is_move_constructible::value ? 
std::is_trivially_move_constructible::value : std::is_trivially_copy_constructible::value) +#define MOODYCAMEL_NOEXCEPT_ASSIGN(type, valueType, expr) ((std::is_rvalue_reference::value && std::is_move_assignable::value ? std::is_trivially_move_assignable::value || std::is_nothrow_move_assignable::value : std::is_trivially_copy_assignable::value || std::is_nothrow_copy_assignable::value) && MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr)) +#elif defined(_MSC_VER) && defined(_NOEXCEPT) && _MSC_VER < 1900 +#define MOODYCAMEL_NOEXCEPT _NOEXCEPT +#define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr) (std::is_rvalue_reference::value && std::is_move_constructible::value ? std::is_trivially_move_constructible::value || std::is_nothrow_move_constructible::value : std::is_trivially_copy_constructible::value || std::is_nothrow_copy_constructible::value) +#define MOODYCAMEL_NOEXCEPT_ASSIGN(type, valueType, expr) ((std::is_rvalue_reference::value && std::is_move_assignable::value ? std::is_trivially_move_assignable::value || std::is_nothrow_move_assignable::value : std::is_trivially_copy_assignable::value || std::is_nothrow_copy_assignable::value) && MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr)) +#else +#define MOODYCAMEL_NOEXCEPT noexcept +#define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr) noexcept(expr) +#define MOODYCAMEL_NOEXCEPT_ASSIGN(type, valueType, expr) noexcept(expr) +#endif +#endif + +#ifndef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED +#ifdef MCDBGQ_USE_RELACY +#define MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED +#else +// VS2013 doesn't support `thread_local`, and MinGW-w64 w/ POSIX threading has a crippling bug: http://sourceforge.net/p/mingw-w64/bugs/445 +// g++ <=4.7 doesn't support thread_local either. 
+// Finally, iOS/ARM doesn't have support for it either, and g++/ARM allows it to compile but it's unconfirmed to actually work +#if (!defined(_MSC_VER) || _MSC_VER >= 1900) && (!defined(__MINGW32__) && !defined(__MINGW64__) || !defined(__WINPTHREADS_VERSION)) && (!defined(__GNUC__) || __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)) && (!defined(__APPLE__) || !TARGET_OS_IPHONE) && !defined(__arm__) && !defined(_M_ARM) && !defined(__aarch64__) +// Assume `thread_local` is fully supported in all other C++11 compilers/platforms +//#define MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED // always disabled for now since several users report having problems with it on +#endif +#endif +#endif + +// VS2012 doesn't support deleted functions. +// In this case, we declare the function normally but don't define it. A link error will be generated if the function is called. +#ifndef MOODYCAMEL_DELETE_FUNCTION +#if defined(_MSC_VER) && _MSC_VER < 1800 +#define MOODYCAMEL_DELETE_FUNCTION +#else +#define MOODYCAMEL_DELETE_FUNCTION = delete +#endif +#endif + +namespace moodycamel { namespace details { +#ifndef MOODYCAMEL_ALIGNAS +// VS2013 doesn't support alignas or alignof, and align() requires a constant literal +#if defined(_MSC_VER) && _MSC_VER <= 1800 +#define MOODYCAMEL_ALIGNAS(alignment) __declspec(align(alignment)) +#define MOODYCAMEL_ALIGNOF(obj) __alignof(obj) +#define MOODYCAMEL_ALIGNED_TYPE_LIKE(T, obj) typename details::Vs2013Aligned::value, T>::type + template struct Vs2013Aligned { }; // default, unsupported alignment + template struct Vs2013Aligned<1, T> { typedef __declspec(align(1)) T type; }; + template struct Vs2013Aligned<2, T> { typedef __declspec(align(2)) T type; }; + template struct Vs2013Aligned<4, T> { typedef __declspec(align(4)) T type; }; + template struct Vs2013Aligned<8, T> { typedef __declspec(align(8)) T type; }; + template struct Vs2013Aligned<16, T> { typedef __declspec(align(16)) T type; }; + template struct Vs2013Aligned<32, T> { typedef 
__declspec(align(32)) T type; }; + template struct Vs2013Aligned<64, T> { typedef __declspec(align(64)) T type; }; + template struct Vs2013Aligned<128, T> { typedef __declspec(align(128)) T type; }; + template struct Vs2013Aligned<256, T> { typedef __declspec(align(256)) T type; }; +#else + template struct identity { typedef T type; }; +#define MOODYCAMEL_ALIGNAS(alignment) alignas(alignment) +#define MOODYCAMEL_ALIGNOF(obj) alignof(obj) +#define MOODYCAMEL_ALIGNED_TYPE_LIKE(T, obj) alignas(alignof(obj)) typename details::identity::type +#endif +#endif +} } + + +// TSAN can false report races in lock-free code. To enable TSAN to be used from projects that use this one, +// we can apply per-function compile-time suppression. +// See https://clang.llvm.org/docs/ThreadSanitizer.html#has-feature-thread-sanitizer +#define MOODYCAMEL_NO_TSAN +#if defined(__has_feature) + #if __has_feature(thread_sanitizer) + #undef MOODYCAMEL_NO_TSAN + #define MOODYCAMEL_NO_TSAN __attribute__((no_sanitize("thread"))) + #endif // TSAN +#endif // TSAN + +// Compiler-specific likely/unlikely hints +namespace moodycamel { namespace details { +#if defined(__GNUC__) + static inline bool (likely)(bool x) { return __builtin_expect((x), true); } + static inline bool (unlikely)(bool x) { return __builtin_expect((x), false); } +#else + static inline bool (likely)(bool x) { return x; } + static inline bool (unlikely)(bool x) { return x; } +#endif +} } + +#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG +#include "internal/concurrentqueue_internal_debug.h" +#endif + +namespace moodycamel { +namespace details { + template + struct const_numeric_max { + static_assert(std::is_integral::value, "const_numeric_max can only be used with integers"); + static const T value = std::numeric_limits::is_signed + ? 
(static_cast(1) << (sizeof(T) * CHAR_BIT - 1)) - static_cast(1) + : static_cast(-1); + }; + +#if defined(__GLIBCXX__) + typedef ::max_align_t std_max_align_t; // libstdc++ forgot to add it to std:: for a while +#else + typedef std::max_align_t std_max_align_t; // Others (e.g. MSVC) insist it can *only* be accessed via std:: +#endif + + // Some platforms have incorrectly set max_align_t to a type with <8 bytes alignment even while supporting + // 8-byte aligned scalar values (*cough* 32-bit iOS). Work around this with our own union. See issue #64. + typedef union { + std_max_align_t x; + long long y; + void* z; + } max_align_t; +} + +// Default traits for the ConcurrentQueue. To change some of the +// traits without re-implementing all of them, inherit from this +// struct and shadow the declarations you wish to be different; +// since the traits are used as a template type parameter, the +// shadowed declarations will be used where defined, and the defaults +// otherwise. +struct ConcurrentQueueDefaultTraits +{ + // General-purpose size type. std::size_t is strongly recommended. + typedef std::size_t size_t; + + // The type used for the enqueue and dequeue indices. Must be at least as + // large as size_t. Should be significantly larger than the number of elements + // you expect to hold at once, especially if you have a high turnover rate; + // for example, on 32-bit x86, if you expect to have over a hundred million + // elements or pump several million elements through your queue in a very + // short space of time, using a 32-bit type *may* trigger a race condition. + // A 64-bit int type is recommended in that case, and in practice will + // prevent a race condition no matter the usage of the queue. Note that + // whether the queue is lock-free with a 64-int type depends on the whether + // std::atomic is lock-free, which is platform-specific. 
+ typedef std::size_t index_t; + + // Internally, all elements are enqueued and dequeued from multi-element + // blocks; this is the smallest controllable unit. If you expect few elements + // but many producers, a smaller block size should be favoured. For few producers + // and/or many elements, a larger block size is preferred. A sane default + // is provided. Must be a power of 2. + static const size_t BLOCK_SIZE = 32; + + // For explicit producers (i.e. when using a producer token), the block is + // checked for being empty by iterating through a list of flags, one per element. + // For large block sizes, this is too inefficient, and switching to an atomic + // counter-based approach is faster. The switch is made for block sizes strictly + // larger than this threshold. + static const size_t EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD = 32; + + // How many full blocks can be expected for a single explicit producer? This should + // reflect that number's maximum for optimal performance. Must be a power of 2. + static const size_t EXPLICIT_INITIAL_INDEX_SIZE = 32; + + // How many full blocks can be expected for a single implicit producer? This should + // reflect that number's maximum for optimal performance. Must be a power of 2. + static const size_t IMPLICIT_INITIAL_INDEX_SIZE = 32; + + // The initial size of the hash table mapping thread IDs to implicit producers. + // Note that the hash is resized every time it becomes half full. + // Must be a power of two, and either 0 or at least 1. If 0, implicit production + // (using the enqueue methods without an explicit producer token) is disabled. + static const size_t INITIAL_IMPLICIT_PRODUCER_HASH_SIZE = 32; + + // Controls the number of items that an explicit consumer (i.e. one with a token) + // must consume before it causes all consumers to rotate and move on to the next + // internal queue. 
+ static const std::uint32_t EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE = 256; + + // The maximum number of elements (inclusive) that can be enqueued to a sub-queue. + // Enqueue operations that would cause this limit to be surpassed will fail. Note + // that this limit is enforced at the block level (for performance reasons), i.e. + // it's rounded up to the nearest block size. + static const size_t MAX_SUBQUEUE_SIZE = details::const_numeric_max::value; + + // The number of times to spin before sleeping when waiting on a semaphore. + // Recommended values are on the order of 1000-10000 unless the number of + // consumer threads exceeds the number of idle cores (in which case try 0-100). + // Only affects instances of the BlockingConcurrentQueue. + static const int MAX_SEMA_SPINS = 10000; + + +#ifndef MCDBGQ_USE_RELACY + // Memory allocation can be customized if needed. + // malloc should return nullptr on failure, and handle alignment like std::malloc. +#if defined(malloc) || defined(free) + // Gah, this is 2015, stop defining macros that break standard code already! 
+ // Work around malloc/free being special macros: + static inline void* WORKAROUND_malloc(size_t size) { return malloc(size); } + static inline void WORKAROUND_free(void* ptr) { return free(ptr); } + static inline void* (malloc)(size_t size) { return WORKAROUND_malloc(size); } + static inline void (free)(void* ptr) { return WORKAROUND_free(ptr); } +#else + static inline void* malloc(size_t size) { return std::malloc(size); } + static inline void free(void* ptr) { return std::free(ptr); } +#endif +#else + // Debug versions when running under the Relacy race detector (ignore + // these in user code) + static inline void* malloc(size_t size) { return rl::rl_malloc(size, $); } + static inline void free(void* ptr) { return rl::rl_free(ptr, $); } +#endif +}; + + +// When producing or consuming many elements, the most efficient way is to: +// 1) Use one of the bulk-operation methods of the queue with a token +// 2) Failing that, use the bulk-operation methods without a token +// 3) Failing that, create a token and use that with the single-item methods +// 4) Failing that, use the single-parameter methods of the queue +// Having said that, don't create tokens willy-nilly -- ideally there should be +// a maximum of one token per thread (of each kind). 
+struct ProducerToken; +struct ConsumerToken; + +template class ConcurrentQueue; +template class BlockingConcurrentQueue; +class ConcurrentQueueTests; + + +namespace details +{ + struct ConcurrentQueueProducerTypelessBase + { + ConcurrentQueueProducerTypelessBase* next; + std::atomic inactive; + ProducerToken* token; + + ConcurrentQueueProducerTypelessBase() + : next(nullptr), inactive(false), token(nullptr) + { + } + }; + + template struct _hash_32_or_64 { + static inline std::uint32_t hash(std::uint32_t h) + { + // MurmurHash3 finalizer -- see https://code.google.com/p/smhasher/source/browse/trunk/MurmurHash3.cpp + // Since the thread ID is already unique, all we really want to do is propagate that + // uniqueness evenly across all the bits, so that we can use a subset of the bits while + // reducing collisions significantly + h ^= h >> 16; + h *= 0x85ebca6b; + h ^= h >> 13; + h *= 0xc2b2ae35; + return h ^ (h >> 16); + } + }; + template<> struct _hash_32_or_64<1> { + static inline std::uint64_t hash(std::uint64_t h) + { + h ^= h >> 33; + h *= 0xff51afd7ed558ccd; + h ^= h >> 33; + h *= 0xc4ceb9fe1a85ec53; + return h ^ (h >> 33); + } + }; + template struct hash_32_or_64 : public _hash_32_or_64<(size > 4)> { }; + + static inline size_t hash_thread_id(thread_id_t id) + { + static_assert(sizeof(thread_id_t) <= 8, "Expected a platform where thread IDs are at most 64-bit values"); + return static_cast(hash_32_or_64::thread_id_hash_t)>::hash( + thread_id_converter::prehash(id))); + } + + template + static inline bool circular_less_than(T a, T b) + { +#ifdef _MSC_VER +#pragma warning(push) +#pragma warning(disable: 4554) +#endif + static_assert(std::is_integral::value && !std::numeric_limits::is_signed, "circular_less_than is intended to be used only with unsigned integer types"); + return static_cast(a - b) > static_cast(static_cast(1) << static_cast(sizeof(T) * CHAR_BIT - 1)); +#ifdef _MSC_VER +#pragma warning(pop) +#endif + } + + template + static inline char* 
align_for(char* ptr) + { + const std::size_t alignment = std::alignment_of::value; + return ptr + (alignment - (reinterpret_cast(ptr) % alignment)) % alignment; + } + + template + static inline T ceil_to_pow_2(T x) + { + static_assert(std::is_integral::value && !std::numeric_limits::is_signed, "ceil_to_pow_2 is intended to be used only with unsigned integer types"); + + // Adapted from http://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2 + --x; + x |= x >> 1; + x |= x >> 2; + x |= x >> 4; + for (std::size_t i = 1; i < sizeof(T); i <<= 1) { + x |= x >> (i << 3); + } + ++x; + return x; + } + + template + static inline void swap_relaxed(std::atomic& left, std::atomic& right) + { + T temp = std::move(left.load(std::memory_order_relaxed)); + left.store(std::move(right.load(std::memory_order_relaxed)), std::memory_order_relaxed); + right.store(std::move(temp), std::memory_order_relaxed); + } + + template + static inline T const& nomove(T const& x) + { + return x; + } + + template + struct nomove_if + { + template + static inline T const& eval(T const& x) + { + return x; + } + }; + + template<> + struct nomove_if + { + template + static inline auto eval(U&& x) + -> decltype(std::forward(x)) + { + return std::forward(x); + } + }; + + template + static inline auto deref_noexcept(It& it) MOODYCAMEL_NOEXCEPT -> decltype(*it) + { + return *it; + } + +#if defined(__clang__) || !defined(__GNUC__) || __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8) + template struct is_trivially_destructible : std::is_trivially_destructible { }; +#else + template struct is_trivially_destructible : std::has_trivial_destructor { }; +#endif + +#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED +#ifdef MCDBGQ_USE_RELACY + typedef RelacyThreadExitListener ThreadExitListener; + typedef RelacyThreadExitNotifier ThreadExitNotifier; +#else + struct ThreadExitListener + { + typedef void (*callback_t)(void*); + callback_t callback; + void* userData; + + ThreadExitListener* next; // reserved 
for use by the ThreadExitNotifier + }; + + + class ThreadExitNotifier + { + public: + static void subscribe(ThreadExitListener* listener) + { + auto& tlsInst = instance(); + listener->next = tlsInst.tail; + tlsInst.tail = listener; + } + + static void unsubscribe(ThreadExitListener* listener) + { + auto& tlsInst = instance(); + ThreadExitListener** prev = &tlsInst.tail; + for (auto ptr = tlsInst.tail; ptr != nullptr; ptr = ptr->next) { + if (ptr == listener) { + *prev = ptr->next; + break; + } + prev = &ptr->next; + } + } + + private: + ThreadExitNotifier() : tail(nullptr) { } + ThreadExitNotifier(ThreadExitNotifier const&) MOODYCAMEL_DELETE_FUNCTION; + ThreadExitNotifier& operator=(ThreadExitNotifier const&) MOODYCAMEL_DELETE_FUNCTION; + + ~ThreadExitNotifier() + { + // This thread is about to exit, let everyone know! + assert(this == &instance() && "If this assert fails, you likely have a buggy compiler! Change the preprocessor conditions such that MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED is no longer defined."); + for (auto ptr = tail; ptr != nullptr; ptr = ptr->next) { + ptr->callback(ptr->userData); + } + } + + // Thread-local + static inline ThreadExitNotifier& instance() + { + static thread_local ThreadExitNotifier notifier; + return notifier; + } + + private: + ThreadExitListener* tail; + }; +#endif +#endif + + template struct static_is_lock_free_num { enum { value = 0 }; }; + template<> struct static_is_lock_free_num { enum { value = ATOMIC_CHAR_LOCK_FREE }; }; + template<> struct static_is_lock_free_num { enum { value = ATOMIC_SHORT_LOCK_FREE }; }; + template<> struct static_is_lock_free_num { enum { value = ATOMIC_INT_LOCK_FREE }; }; + template<> struct static_is_lock_free_num { enum { value = ATOMIC_LONG_LOCK_FREE }; }; + template<> struct static_is_lock_free_num { enum { value = ATOMIC_LLONG_LOCK_FREE }; }; + template struct static_is_lock_free : static_is_lock_free_num::type> { }; + template<> struct static_is_lock_free { enum { value = 
ATOMIC_BOOL_LOCK_FREE }; }; + template struct static_is_lock_free { enum { value = ATOMIC_POINTER_LOCK_FREE }; }; +} + + +struct ProducerToken +{ + template + explicit ProducerToken(ConcurrentQueue& queue); + + template + explicit ProducerToken(BlockingConcurrentQueue& queue); + + ProducerToken(ProducerToken&& other) MOODYCAMEL_NOEXCEPT + : producer(other.producer) + { + other.producer = nullptr; + if (producer != nullptr) { + producer->token = this; + } + } + + inline ProducerToken& operator=(ProducerToken&& other) MOODYCAMEL_NOEXCEPT + { + swap(other); + return *this; + } + + void swap(ProducerToken& other) MOODYCAMEL_NOEXCEPT + { + std::swap(producer, other.producer); + if (producer != nullptr) { + producer->token = this; + } + if (other.producer != nullptr) { + other.producer->token = &other; + } + } + + // A token is always valid unless: + // 1) Memory allocation failed during construction + // 2) It was moved via the move constructor + // (Note: assignment does a swap, leaving both potentially valid) + // 3) The associated queue was destroyed + // Note that if valid() returns true, that only indicates + // that the token is valid for use with a specific queue, + // but not which one; that's up to the user to track. 
+ inline bool valid() const { return producer != nullptr; } + + ~ProducerToken() + { + if (producer != nullptr) { + producer->token = nullptr; + producer->inactive.store(true, std::memory_order_release); + } + } + + // Disable copying and assignment + ProducerToken(ProducerToken const&) MOODYCAMEL_DELETE_FUNCTION; + ProducerToken& operator=(ProducerToken const&) MOODYCAMEL_DELETE_FUNCTION; + +private: + template friend class ConcurrentQueue; + friend class ConcurrentQueueTests; + +protected: + details::ConcurrentQueueProducerTypelessBase* producer; +}; + + +struct ConsumerToken +{ + template + explicit ConsumerToken(ConcurrentQueue& q); + + template + explicit ConsumerToken(BlockingConcurrentQueue& q); + + ConsumerToken(ConsumerToken&& other) MOODYCAMEL_NOEXCEPT + : initialOffset(other.initialOffset), lastKnownGlobalOffset(other.lastKnownGlobalOffset), itemsConsumedFromCurrent(other.itemsConsumedFromCurrent), currentProducer(other.currentProducer), desiredProducer(other.desiredProducer) + { + } + + inline ConsumerToken& operator=(ConsumerToken&& other) MOODYCAMEL_NOEXCEPT + { + swap(other); + return *this; + } + + void swap(ConsumerToken& other) MOODYCAMEL_NOEXCEPT + { + std::swap(initialOffset, other.initialOffset); + std::swap(lastKnownGlobalOffset, other.lastKnownGlobalOffset); + std::swap(itemsConsumedFromCurrent, other.itemsConsumedFromCurrent); + std::swap(currentProducer, other.currentProducer); + std::swap(desiredProducer, other.desiredProducer); + } + + // Disable copying and assignment + ConsumerToken(ConsumerToken const&) MOODYCAMEL_DELETE_FUNCTION; + ConsumerToken& operator=(ConsumerToken const&) MOODYCAMEL_DELETE_FUNCTION; + +private: + template friend class ConcurrentQueue; + friend class ConcurrentQueueTests; + +private: // but shared with ConcurrentQueue + std::uint32_t initialOffset; + std::uint32_t lastKnownGlobalOffset; + std::uint32_t itemsConsumedFromCurrent; + details::ConcurrentQueueProducerTypelessBase* currentProducer; + 
details::ConcurrentQueueProducerTypelessBase* desiredProducer; +}; + +// Need to forward-declare this swap because it's in a namespace. +// See http://stackoverflow.com/questions/4492062/why-does-a-c-friend-class-need-a-forward-declaration-only-in-other-namespaces +template +inline void swap(typename ConcurrentQueue::ImplicitProducerKVP& a, typename ConcurrentQueue::ImplicitProducerKVP& b) MOODYCAMEL_NOEXCEPT; + + +template +class ConcurrentQueue +{ +public: + typedef ::moodycamel::ProducerToken producer_token_t; + typedef ::moodycamel::ConsumerToken consumer_token_t; + + typedef typename Traits::index_t index_t; + typedef typename Traits::size_t size_t; + + static const size_t BLOCK_SIZE = static_cast(Traits::BLOCK_SIZE); + static const size_t EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD = static_cast(Traits::EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD); + static const size_t EXPLICIT_INITIAL_INDEX_SIZE = static_cast(Traits::EXPLICIT_INITIAL_INDEX_SIZE); + static const size_t IMPLICIT_INITIAL_INDEX_SIZE = static_cast(Traits::IMPLICIT_INITIAL_INDEX_SIZE); + static const size_t INITIAL_IMPLICIT_PRODUCER_HASH_SIZE = static_cast(Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE); + static const std::uint32_t EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE = static_cast(Traits::EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE); +#ifdef _MSC_VER +#pragma warning(push) +#pragma warning(disable: 4307) // + integral constant overflow (that's what the ternary expression is for!) +#pragma warning(disable: 4309) // static_cast: Truncation of constant value +#endif + static const size_t MAX_SUBQUEUE_SIZE = (details::const_numeric_max::value - static_cast(Traits::MAX_SUBQUEUE_SIZE) < BLOCK_SIZE) ? 
details::const_numeric_max::value : ((static_cast(Traits::MAX_SUBQUEUE_SIZE) + (BLOCK_SIZE - 1)) / BLOCK_SIZE * BLOCK_SIZE); +#ifdef _MSC_VER +#pragma warning(pop) +#endif + + static_assert(!std::numeric_limits::is_signed && std::is_integral::value, "Traits::size_t must be an unsigned integral type"); + static_assert(!std::numeric_limits::is_signed && std::is_integral::value, "Traits::index_t must be an unsigned integral type"); + static_assert(sizeof(index_t) >= sizeof(size_t), "Traits::index_t must be at least as wide as Traits::size_t"); + static_assert((BLOCK_SIZE > 1) && !(BLOCK_SIZE & (BLOCK_SIZE - 1)), "Traits::BLOCK_SIZE must be a power of 2 (and at least 2)"); + static_assert((EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD > 1) && !(EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD & (EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD - 1)), "Traits::EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD must be a power of 2 (and greater than 1)"); + static_assert((EXPLICIT_INITIAL_INDEX_SIZE > 1) && !(EXPLICIT_INITIAL_INDEX_SIZE & (EXPLICIT_INITIAL_INDEX_SIZE - 1)), "Traits::EXPLICIT_INITIAL_INDEX_SIZE must be a power of 2 (and greater than 1)"); + static_assert((IMPLICIT_INITIAL_INDEX_SIZE > 1) && !(IMPLICIT_INITIAL_INDEX_SIZE & (IMPLICIT_INITIAL_INDEX_SIZE - 1)), "Traits::IMPLICIT_INITIAL_INDEX_SIZE must be a power of 2 (and greater than 1)"); + static_assert((INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) || !(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE & (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE - 1)), "Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE must be a power of 2"); + static_assert(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0 || INITIAL_IMPLICIT_PRODUCER_HASH_SIZE >= 1, "Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE must be at least 1 (or 0 to disable implicit enqueueing)"); + +public: + // Creates a queue with at least `capacity` element slots; note that the + // actual number of elements that can be inserted without additional memory + // allocation depends on the number of producers and the block size (e.g. 
if + // the block size is equal to `capacity`, only a single block will be allocated + // up-front, which means only a single producer will be able to enqueue elements + // without an extra allocation -- blocks aren't shared between producers). + // This method is not thread safe -- it is up to the user to ensure that the + // queue is fully constructed before it starts being used by other threads (this + // includes making the memory effects of construction visible, possibly with a + // memory barrier). + explicit ConcurrentQueue(size_t capacity = 6 * BLOCK_SIZE) + : producerListTail(nullptr), + producerCount(0), + initialBlockPoolIndex(0), + nextExplicitConsumerId(0), + globalExplicitConsumerOffset(0) + { + implicitProducerHashResizeInProgress.clear(std::memory_order_relaxed); + populate_initial_implicit_producer_hash(); + populate_initial_block_list(capacity / BLOCK_SIZE + ((capacity & (BLOCK_SIZE - 1)) == 0 ? 0 : 1)); + +#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG + // Track all the producers using a fully-resolved typed list for + // each kind; this makes it possible to debug them starting from + // the root queue object (otherwise wacky casts are needed that + // don't compile in the debugger's expression evaluator). + explicitProducers.store(nullptr, std::memory_order_relaxed); + implicitProducers.store(nullptr, std::memory_order_relaxed); +#endif + } + + // Computes the correct amount of pre-allocated blocks for you based + // on the minimum number of elements you want available at any given + // time, and the maximum concurrent number of each type of producer. 
+ ConcurrentQueue(size_t minCapacity, size_t maxExplicitProducers, size_t maxImplicitProducers) + : producerListTail(nullptr), + producerCount(0), + initialBlockPoolIndex(0), + nextExplicitConsumerId(0), + globalExplicitConsumerOffset(0) + { + implicitProducerHashResizeInProgress.clear(std::memory_order_relaxed); + populate_initial_implicit_producer_hash(); + size_t blocks = (((minCapacity + BLOCK_SIZE - 1) / BLOCK_SIZE) - 1) * (maxExplicitProducers + 1) + 2 * (maxExplicitProducers + maxImplicitProducers); + populate_initial_block_list(blocks); + +#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG + explicitProducers.store(nullptr, std::memory_order_relaxed); + implicitProducers.store(nullptr, std::memory_order_relaxed); +#endif + } + + // Note: The queue should not be accessed concurrently while it's + // being deleted. It's up to the user to synchronize this. + // This method is not thread safe. + ~ConcurrentQueue() + { + // Destroy producers + auto ptr = producerListTail.load(std::memory_order_relaxed); + while (ptr != nullptr) { + auto next = ptr->next_prod(); + if (ptr->token != nullptr) { + ptr->token->producer = nullptr; + } + destroy(ptr); + ptr = next; + } + + // Destroy implicit producer hash tables + MOODYCAMEL_CONSTEXPR_IF (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE != 0) { + auto hash = implicitProducerHash.load(std::memory_order_relaxed); + while (hash != nullptr) { + auto prev = hash->prev; + if (prev != nullptr) { // The last hash is part of this object and was not allocated dynamically + for (size_t i = 0; i != hash->capacity; ++i) { + hash->entries[i].~ImplicitProducerKVP(); + } + hash->~ImplicitProducerHash(); + (Traits::free)(hash); + } + hash = prev; + } + } + + // Destroy global free list + auto block = freeList.head_unsafe(); + while (block != nullptr) { + auto next = block->freeListNext.load(std::memory_order_relaxed); + if (block->dynamicallyAllocated) { + destroy(block); + } + block = next; + } + + // Destroy initial free list + 
destroy_array(initialBlockPool, initialBlockPoolSize); + } + + // Disable copying and copy assignment + ConcurrentQueue(ConcurrentQueue const&) MOODYCAMEL_DELETE_FUNCTION; + ConcurrentQueue& operator=(ConcurrentQueue const&) MOODYCAMEL_DELETE_FUNCTION; + + // Moving is supported, but note that it is *not* a thread-safe operation. + // Nobody can use the queue while it's being moved, and the memory effects + // of that move must be propagated to other threads before they can use it. + // Note: When a queue is moved, its tokens are still valid but can only be + // used with the destination queue (i.e. semantically they are moved along + // with the queue itself). + ConcurrentQueue(ConcurrentQueue&& other) MOODYCAMEL_NOEXCEPT + : producerListTail(other.producerListTail.load(std::memory_order_relaxed)), + producerCount(other.producerCount.load(std::memory_order_relaxed)), + initialBlockPoolIndex(other.initialBlockPoolIndex.load(std::memory_order_relaxed)), + initialBlockPool(other.initialBlockPool), + initialBlockPoolSize(other.initialBlockPoolSize), + freeList(std::move(other.freeList)), + nextExplicitConsumerId(other.nextExplicitConsumerId.load(std::memory_order_relaxed)), + globalExplicitConsumerOffset(other.globalExplicitConsumerOffset.load(std::memory_order_relaxed)) + { + // Move the other one into this, and leave the other one as an empty queue + implicitProducerHashResizeInProgress.clear(std::memory_order_relaxed); + populate_initial_implicit_producer_hash(); + swap_implicit_producer_hashes(other); + + other.producerListTail.store(nullptr, std::memory_order_relaxed); + other.producerCount.store(0, std::memory_order_relaxed); + other.nextExplicitConsumerId.store(0, std::memory_order_relaxed); + other.globalExplicitConsumerOffset.store(0, std::memory_order_relaxed); + +#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG + explicitProducers.store(other.explicitProducers.load(std::memory_order_relaxed), std::memory_order_relaxed); + other.explicitProducers.store(nullptr, 
std::memory_order_relaxed); + implicitProducers.store(other.implicitProducers.load(std::memory_order_relaxed), std::memory_order_relaxed); + other.implicitProducers.store(nullptr, std::memory_order_relaxed); +#endif + + other.initialBlockPoolIndex.store(0, std::memory_order_relaxed); + other.initialBlockPoolSize = 0; + other.initialBlockPool = nullptr; + + reown_producers(); + } + + inline ConcurrentQueue& operator=(ConcurrentQueue&& other) MOODYCAMEL_NOEXCEPT + { + return swap_internal(other); + } + + // Swaps this queue's state with the other's. Not thread-safe. + // Swapping two queues does not invalidate their tokens, however + // the tokens that were created for one queue must be used with + // only the swapped queue (i.e. the tokens are tied to the + // queue's movable state, not the object itself). + inline void swap(ConcurrentQueue& other) MOODYCAMEL_NOEXCEPT + { + swap_internal(other); + } + +private: + ConcurrentQueue& swap_internal(ConcurrentQueue& other) + { + if (this == &other) { + return *this; + } + + details::swap_relaxed(producerListTail, other.producerListTail); + details::swap_relaxed(producerCount, other.producerCount); + details::swap_relaxed(initialBlockPoolIndex, other.initialBlockPoolIndex); + std::swap(initialBlockPool, other.initialBlockPool); + std::swap(initialBlockPoolSize, other.initialBlockPoolSize); + freeList.swap(other.freeList); + details::swap_relaxed(nextExplicitConsumerId, other.nextExplicitConsumerId); + details::swap_relaxed(globalExplicitConsumerOffset, other.globalExplicitConsumerOffset); + + swap_implicit_producer_hashes(other); + + reown_producers(); + other.reown_producers(); + +#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG + details::swap_relaxed(explicitProducers, other.explicitProducers); + details::swap_relaxed(implicitProducers, other.implicitProducers); +#endif + + return *this; + } + +public: + // Enqueues a single item (by copying it). + // Allocates memory if required. 
Only fails if memory allocation fails (or implicit + // production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0, + // or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). + // Thread-safe. + inline bool enqueue(T const& item) + { + MOODYCAMEL_CONSTEXPR_IF (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false; + else return inner_enqueue(item); + } + + // Enqueues a single item (by moving it, if possible). + // Allocates memory if required. Only fails if memory allocation fails (or implicit + // production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0, + // or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). + // Thread-safe. + inline bool enqueue(T&& item) + { + MOODYCAMEL_CONSTEXPR_IF (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false; + else return inner_enqueue(std::move(item)); + } + + // Enqueues a single item (by copying it) using an explicit producer token. + // Allocates memory if required. Only fails if memory allocation fails (or + // Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). + // Thread-safe. + inline bool enqueue(producer_token_t const& token, T const& item) + { + return inner_enqueue(token, item); + } + + // Enqueues a single item (by moving it, if possible) using an explicit producer token. + // Allocates memory if required. Only fails if memory allocation fails (or + // Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). + // Thread-safe. + inline bool enqueue(producer_token_t const& token, T&& item) + { + return inner_enqueue(token, std::move(item)); + } + + // Enqueues several items. + // Allocates memory if required. Only fails if memory allocation fails (or + // implicit production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE + // is 0, or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). + // Note: Use std::make_move_iterator if the elements should be moved instead of copied. 
+ // Thread-safe. + template + bool enqueue_bulk(It itemFirst, size_t count) + { + MOODYCAMEL_CONSTEXPR_IF (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false; + else return inner_enqueue_bulk(itemFirst, count); + } + + // Enqueues several items using an explicit producer token. + // Allocates memory if required. Only fails if memory allocation fails + // (or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). + // Note: Use std::make_move_iterator if the elements should be moved + // instead of copied. + // Thread-safe. + template + bool enqueue_bulk(producer_token_t const& token, It itemFirst, size_t count) + { + return inner_enqueue_bulk(token, itemFirst, count); + } + + // Enqueues a single item (by copying it). + // Does not allocate memory. Fails if not enough room to enqueue (or implicit + // production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE + // is 0). + // Thread-safe. + inline bool try_enqueue(T const& item) + { + MOODYCAMEL_CONSTEXPR_IF (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false; + else return inner_enqueue(item); + } + + // Enqueues a single item (by moving it, if possible). + // Does not allocate memory (except for one-time implicit producer). + // Fails if not enough room to enqueue (or implicit production is + // disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0). + // Thread-safe. + inline bool try_enqueue(T&& item) + { + MOODYCAMEL_CONSTEXPR_IF (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false; + else return inner_enqueue(std::move(item)); + } + + // Enqueues a single item (by copying it) using an explicit producer token. + // Does not allocate memory. Fails if not enough room to enqueue. + // Thread-safe. + inline bool try_enqueue(producer_token_t const& token, T const& item) + { + return inner_enqueue(token, item); + } + + // Enqueues a single item (by moving it, if possible) using an explicit producer token. + // Does not allocate memory. 
Fails if not enough room to enqueue. + // Thread-safe. + inline bool try_enqueue(producer_token_t const& token, T&& item) + { + return inner_enqueue(token, std::move(item)); + } + + // Enqueues several items. + // Does not allocate memory (except for one-time implicit producer). + // Fails if not enough room to enqueue (or implicit production is + // disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0). + // Note: Use std::make_move_iterator if the elements should be moved + // instead of copied. + // Thread-safe. + template + bool try_enqueue_bulk(It itemFirst, size_t count) + { + MOODYCAMEL_CONSTEXPR_IF (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false; + else return inner_enqueue_bulk(itemFirst, count); + } + + // Enqueues several items using an explicit producer token. + // Does not allocate memory. Fails if not enough room to enqueue. + // Note: Use std::make_move_iterator if the elements should be moved + // instead of copied. + // Thread-safe. + template + bool try_enqueue_bulk(producer_token_t const& token, It itemFirst, size_t count) + { + return inner_enqueue_bulk(token, itemFirst, count); + } + + + + // Attempts to dequeue from the queue. + // Returns false if all producer streams appeared empty at the time they + // were checked (so, the queue is likely but not guaranteed to be empty). + // Never allocates. Thread-safe. + template + bool try_dequeue(U& item) + { + // Instead of simply trying each producer in turn (which could cause needless contention on the first + // producer), we score them heuristically. 
+ size_t nonEmptyCount = 0; + ProducerBase* best = nullptr; + size_t bestSize = 0; + for (auto ptr = producerListTail.load(std::memory_order_acquire); nonEmptyCount < 3 && ptr != nullptr; ptr = ptr->next_prod()) { + auto size = ptr->size_approx(); + if (size > 0) { + if (size > bestSize) { + bestSize = size; + best = ptr; + } + ++nonEmptyCount; + } + } + + // If there was at least one non-empty queue but it appears empty at the time + // we try to dequeue from it, we need to make sure every queue's been tried + if (nonEmptyCount > 0) { + if ((details::likely)(best->dequeue(item))) { + return true; + } + for (auto ptr = producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) { + if (ptr != best && ptr->dequeue(item)) { + return true; + } + } + } + return false; + } + + // Attempts to dequeue from the queue. + // Returns false if all producer streams appeared empty at the time they + // were checked (so, the queue is likely but not guaranteed to be empty). + // This differs from the try_dequeue(item) method in that this one does + // not attempt to reduce contention by interleaving the order that producer + // streams are dequeued from. So, using this method can reduce overall throughput + // under contention, but will give more predictable results in single-threaded + // consumer scenarios. This is mostly only useful for internal unit tests. + // Never allocates. Thread-safe. + template + bool try_dequeue_non_interleaved(U& item) + { + for (auto ptr = producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) { + if (ptr->dequeue(item)) { + return true; + } + } + return false; + } + + // Attempts to dequeue from the queue using an explicit consumer token. + // Returns false if all producer streams appeared empty at the time they + // were checked (so, the queue is likely but not guaranteed to be empty). + // Never allocates. Thread-safe. 
+ template + bool try_dequeue(consumer_token_t& token, U& item) + { + // The idea is roughly as follows: + // Every 256 items from one producer, make everyone rotate (increase the global offset) -> this means the highest efficiency consumer dictates the rotation speed of everyone else, more or less + // If you see that the global offset has changed, you must reset your consumption counter and move to your designated place + // If there's no items where you're supposed to be, keep moving until you find a producer with some items + // If the global offset has not changed but you've run out of items to consume, move over from your current position until you find an producer with something in it + + if (token.desiredProducer == nullptr || token.lastKnownGlobalOffset != globalExplicitConsumerOffset.load(std::memory_order_relaxed)) { + if (!update_current_producer_after_rotation(token)) { + return false; + } + } + + // If there was at least one non-empty queue but it appears empty at the time + // we try to dequeue from it, we need to make sure every queue's been tried + if (static_cast(token.currentProducer)->dequeue(item)) { + if (++token.itemsConsumedFromCurrent == EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE) { + globalExplicitConsumerOffset.fetch_add(1, std::memory_order_relaxed); + } + return true; + } + + auto tail = producerListTail.load(std::memory_order_acquire); + auto ptr = static_cast(token.currentProducer)->next_prod(); + if (ptr == nullptr) { + ptr = tail; + } + while (ptr != static_cast(token.currentProducer)) { + if (ptr->dequeue(item)) { + token.currentProducer = ptr; + token.itemsConsumedFromCurrent = 1; + return true; + } + ptr = ptr->next_prod(); + if (ptr == nullptr) { + ptr = tail; + } + } + return false; + } + + // Attempts to dequeue several elements from the queue. + // Returns the number of items actually dequeued. 
+ // Returns 0 if all producer streams appeared empty at the time they + // were checked (so, the queue is likely but not guaranteed to be empty). + // Never allocates. Thread-safe. + template + size_t try_dequeue_bulk(It itemFirst, size_t max) + { + size_t count = 0; + for (auto ptr = producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) { + count += ptr->dequeue_bulk(itemFirst, max - count); + if (count == max) { + break; + } + } + return count; + } + + // Attempts to dequeue several elements from the queue using an explicit consumer token. + // Returns the number of items actually dequeued. + // Returns 0 if all producer streams appeared empty at the time they + // were checked (so, the queue is likely but not guaranteed to be empty). + // Never allocates. Thread-safe. + template + size_t try_dequeue_bulk(consumer_token_t& token, It itemFirst, size_t max) + { + if (token.desiredProducer == nullptr || token.lastKnownGlobalOffset != globalExplicitConsumerOffset.load(std::memory_order_relaxed)) { + if (!update_current_producer_after_rotation(token)) { + return 0; + } + } + + size_t count = static_cast(token.currentProducer)->dequeue_bulk(itemFirst, max); + if (count == max) { + if ((token.itemsConsumedFromCurrent += static_cast(max)) >= EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE) { + globalExplicitConsumerOffset.fetch_add(1, std::memory_order_relaxed); + } + return max; + } + token.itemsConsumedFromCurrent += static_cast(count); + max -= count; + + auto tail = producerListTail.load(std::memory_order_acquire); + auto ptr = static_cast(token.currentProducer)->next_prod(); + if (ptr == nullptr) { + ptr = tail; + } + while (ptr != static_cast(token.currentProducer)) { + auto dequeued = ptr->dequeue_bulk(itemFirst, max); + count += dequeued; + if (dequeued != 0) { + token.currentProducer = ptr; + token.itemsConsumedFromCurrent = static_cast(dequeued); + } + if (dequeued == max) { + break; + } + max -= dequeued; + ptr = 
ptr->next_prod(); + if (ptr == nullptr) { + ptr = tail; + } + } + return count; + } + + + + // Attempts to dequeue from a specific producer's inner queue. + // If you happen to know which producer you want to dequeue from, this + // is significantly faster than using the general-case try_dequeue methods. + // Returns false if the producer's queue appeared empty at the time it + // was checked (so, the queue is likely but not guaranteed to be empty). + // Never allocates. Thread-safe. + template + inline bool try_dequeue_from_producer(producer_token_t const& producer, U& item) + { + return static_cast(producer.producer)->dequeue(item); + } + + // Attempts to dequeue several elements from a specific producer's inner queue. + // Returns the number of items actually dequeued. + // If you happen to know which producer you want to dequeue from, this + // is significantly faster than using the general-case try_dequeue methods. + // Returns 0 if the producer's queue appeared empty at the time it + // was checked (so, the queue is likely but not guaranteed to be empty). + // Never allocates. Thread-safe. + template + inline size_t try_dequeue_bulk_from_producer(producer_token_t const& producer, It itemFirst, size_t max) + { + return static_cast(producer.producer)->dequeue_bulk(itemFirst, max); + } + + + // Returns an estimate of the total number of elements currently in the queue. This + // estimate is only accurate if the queue has completely stabilized before it is called + // (i.e. all enqueue and dequeue operations have completed and their memory effects are + // visible on the calling thread, and no further operations start while this method is + // being called). + // Thread-safe. 
+ size_t size_approx() const + { + size_t size = 0; + for (auto ptr = producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) { + size += ptr->size_approx(); + } + return size; + } + + + // Returns true if the underlying atomic variables used by + // the queue are lock-free (they should be on most platforms). + // Thread-safe. + static bool is_lock_free() + { + return + details::static_is_lock_free::value == 2 && + details::static_is_lock_free::value == 2 && + details::static_is_lock_free::value == 2 && + details::static_is_lock_free::value == 2 && + details::static_is_lock_free::value == 2 && + details::static_is_lock_free::thread_id_numeric_size_t>::value == 2; + } + + +private: + friend struct ProducerToken; + friend struct ConsumerToken; + struct ExplicitProducer; + friend struct ExplicitProducer; + struct ImplicitProducer; + friend struct ImplicitProducer; + friend class ConcurrentQueueTests; + + enum AllocationMode { CanAlloc, CannotAlloc }; + + + /////////////////////////////// + // Queue methods + /////////////////////////////// + + template + inline bool inner_enqueue(producer_token_t const& token, U&& element) + { + return static_cast(token.producer)->ConcurrentQueue::ExplicitProducer::template enqueue(std::forward(element)); + } + + template + inline bool inner_enqueue(U&& element) + { + auto producer = get_or_add_implicit_producer(); + return producer == nullptr ? false : producer->ConcurrentQueue::ImplicitProducer::template enqueue(std::forward(element)); + } + + template + inline bool inner_enqueue_bulk(producer_token_t const& token, It itemFirst, size_t count) + { + return static_cast(token.producer)->ConcurrentQueue::ExplicitProducer::template enqueue_bulk(itemFirst, count); + } + + template + inline bool inner_enqueue_bulk(It itemFirst, size_t count) + { + auto producer = get_or_add_implicit_producer(); + return producer == nullptr ? 
false : producer->ConcurrentQueue::ImplicitProducer::template enqueue_bulk(itemFirst, count); + } + + inline bool update_current_producer_after_rotation(consumer_token_t& token) + { + // Ah, there's been a rotation, figure out where we should be! + auto tail = producerListTail.load(std::memory_order_acquire); + if (token.desiredProducer == nullptr && tail == nullptr) { + return false; + } + auto prodCount = producerCount.load(std::memory_order_relaxed); + auto globalOffset = globalExplicitConsumerOffset.load(std::memory_order_relaxed); + if ((details::unlikely)(token.desiredProducer == nullptr)) { + // Aha, first time we're dequeueing anything. + // Figure out our local position + // Note: offset is from start, not end, but we're traversing from end -- subtract from count first + std::uint32_t offset = prodCount - 1 - (token.initialOffset % prodCount); + token.desiredProducer = tail; + for (std::uint32_t i = 0; i != offset; ++i) { + token.desiredProducer = static_cast(token.desiredProducer)->next_prod(); + if (token.desiredProducer == nullptr) { + token.desiredProducer = tail; + } + } + } + + std::uint32_t delta = globalOffset - token.lastKnownGlobalOffset; + if (delta >= prodCount) { + delta = delta % prodCount; + } + for (std::uint32_t i = 0; i != delta; ++i) { + token.desiredProducer = static_cast(token.desiredProducer)->next_prod(); + if (token.desiredProducer == nullptr) { + token.desiredProducer = tail; + } + } + + token.lastKnownGlobalOffset = globalOffset; + token.currentProducer = token.desiredProducer; + token.itemsConsumedFromCurrent = 0; + return true; + } + + + /////////////////////////// + // Free list + /////////////////////////// + + template + struct FreeListNode + { + FreeListNode() : freeListRefs(0), freeListNext(nullptr) { } + + std::atomic freeListRefs; + std::atomic freeListNext; + }; + + // A simple CAS-based lock-free free list. 
Not the fastest thing in the world under heavy contention, but + // simple and correct (assuming nodes are never freed until after the free list is destroyed), and fairly + // speedy under low contention. + template // N must inherit FreeListNode or have the same fields (and initialization of them) + struct FreeList + { + FreeList() : freeListHead(nullptr) { } + FreeList(FreeList&& other) : freeListHead(other.freeListHead.load(std::memory_order_relaxed)) { other.freeListHead.store(nullptr, std::memory_order_relaxed); } + void swap(FreeList& other) { details::swap_relaxed(freeListHead, other.freeListHead); } + + FreeList(FreeList const&) MOODYCAMEL_DELETE_FUNCTION; + FreeList& operator=(FreeList const&) MOODYCAMEL_DELETE_FUNCTION; + + inline void add(N* node) + { +#ifdef MCDBGQ_NOLOCKFREE_FREELIST + debug::DebugLock lock(mutex); +#endif + // We know that the should-be-on-freelist bit is 0 at this point, so it's safe to + // set it using a fetch_add + if (node->freeListRefs.fetch_add(SHOULD_BE_ON_FREELIST, std::memory_order_acq_rel) == 0) { + // Oh look! We were the last ones referencing this node, and we know + // we want to add it to the free list, so let's do it! 
+ add_knowing_refcount_is_zero(node); + } + } + + inline N* try_get() + { +#ifdef MCDBGQ_NOLOCKFREE_FREELIST + debug::DebugLock lock(mutex); +#endif + auto head = freeListHead.load(std::memory_order_acquire); + while (head != nullptr) { + auto prevHead = head; + auto refs = head->freeListRefs.load(std::memory_order_relaxed); + if ((refs & REFS_MASK) == 0 || !head->freeListRefs.compare_exchange_strong(refs, refs + 1, std::memory_order_acquire, std::memory_order_relaxed)) { + head = freeListHead.load(std::memory_order_acquire); + continue; + } + + // Good, reference count has been incremented (it wasn't at zero), which means we can read the + // next and not worry about it changing between now and the time we do the CAS + auto next = head->freeListNext.load(std::memory_order_relaxed); + if (freeListHead.compare_exchange_strong(head, next, std::memory_order_acquire, std::memory_order_relaxed)) { + // Yay, got the node. This means it was on the list, which means shouldBeOnFreeList must be false no + // matter the refcount (because nobody else knows it's been taken off yet, it can't have been put back on). + assert((head->freeListRefs.load(std::memory_order_relaxed) & SHOULD_BE_ON_FREELIST) == 0); + + // Decrease refcount twice, once for our ref, and once for the list's ref + head->freeListRefs.fetch_sub(2, std::memory_order_release); + return head; + } + + // OK, the head must have changed on us, but we still need to decrease the refcount we increased. + // Note that we don't need to release any memory effects, but we do need to ensure that the reference + // count decrement happens-after the CAS on the head. + refs = prevHead->freeListRefs.fetch_sub(1, std::memory_order_acq_rel); + if (refs == SHOULD_BE_ON_FREELIST + 1) { + add_knowing_refcount_is_zero(prevHead); + } + } + + return nullptr; + } + + // Useful for traversing the list when there's no contention (e.g. 
to destroy remaining nodes) + N* head_unsafe() const { return freeListHead.load(std::memory_order_relaxed); } + + private: + inline void add_knowing_refcount_is_zero(N* node) + { + // Since the refcount is zero, and nobody can increase it once it's zero (except us, and we run + // only one copy of this method per node at a time, i.e. the single thread case), then we know + // we can safely change the next pointer of the node; however, once the refcount is back above + // zero, then other threads could increase it (happens under heavy contention, when the refcount + // goes to zero in between a load and a refcount increment of a node in try_get, then back up to + // something non-zero, then the refcount increment is done by the other thread) -- so, if the CAS + // to add the node to the actual list fails, decrease the refcount and leave the add operation to + // the next thread who puts the refcount back at zero (which could be us, hence the loop). + auto head = freeListHead.load(std::memory_order_relaxed); + while (true) { + node->freeListNext.store(head, std::memory_order_relaxed); + node->freeListRefs.store(1, std::memory_order_release); + if (!freeListHead.compare_exchange_strong(head, node, std::memory_order_release, std::memory_order_relaxed)) { + // Hmm, the add failed, but we can only try again when the refcount goes back to zero + if (node->freeListRefs.fetch_add(SHOULD_BE_ON_FREELIST - 1, std::memory_order_release) == 1) { + continue; + } + } + return; + } + } + + private: + // Implemented like a stack, but where node order doesn't matter (nodes are inserted out of order under contention) + std::atomic freeListHead; + + static const std::uint32_t REFS_MASK = 0x7FFFFFFF; + static const std::uint32_t SHOULD_BE_ON_FREELIST = 0x80000000; + +#ifdef MCDBGQ_NOLOCKFREE_FREELIST + debug::DebugMutex mutex; +#endif + }; + + + /////////////////////////// + // Block + /////////////////////////// + + enum InnerQueueContext { implicit_context = 0, explicit_context = 1 }; 
+ + struct Block + { + Block() + : next(nullptr), elementsCompletelyDequeued(0), freeListRefs(0), freeListNext(nullptr), shouldBeOnFreeList(false), dynamicallyAllocated(true) + { +#ifdef MCDBGQ_TRACKMEM + owner = nullptr; +#endif + } + + template + inline bool is_empty() const + { + MOODYCAMEL_CONSTEXPR_IF (context == explicit_context && BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) { + // Check flags + for (size_t i = 0; i < BLOCK_SIZE; ++i) { + if (!emptyFlags[i].load(std::memory_order_relaxed)) { + return false; + } + } + + // Aha, empty; make sure we have all other memory effects that happened before the empty flags were set + std::atomic_thread_fence(std::memory_order_acquire); + return true; + } + else { + // Check counter + if (elementsCompletelyDequeued.load(std::memory_order_relaxed) == BLOCK_SIZE) { + std::atomic_thread_fence(std::memory_order_acquire); + return true; + } + assert(elementsCompletelyDequeued.load(std::memory_order_relaxed) <= BLOCK_SIZE); + return false; + } + } + + // Returns true if the block is now empty (does not apply in explicit context) + template + inline bool set_empty(MOODYCAMEL_MAYBE_UNUSED index_t i) + { + MOODYCAMEL_CONSTEXPR_IF (context == explicit_context && BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) { + // Set flag + assert(!emptyFlags[BLOCK_SIZE - 1 - static_cast(i & static_cast(BLOCK_SIZE - 1))].load(std::memory_order_relaxed)); + emptyFlags[BLOCK_SIZE - 1 - static_cast(i & static_cast(BLOCK_SIZE - 1))].store(true, std::memory_order_release); + return false; + } + else { + // Increment counter + auto prevVal = elementsCompletelyDequeued.fetch_add(1, std::memory_order_release); + assert(prevVal < BLOCK_SIZE); + return prevVal == BLOCK_SIZE - 1; + } + } + + // Sets multiple contiguous item statuses to 'empty' (assumes no wrapping and count > 0). + // Returns true if the block is now empty (does not apply in explicit context). 
+ template + inline bool set_many_empty(MOODYCAMEL_MAYBE_UNUSED index_t i, size_t count) + { + MOODYCAMEL_CONSTEXPR_IF (context == explicit_context && BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) { + // Set flags + std::atomic_thread_fence(std::memory_order_release); + i = BLOCK_SIZE - 1 - static_cast(i & static_cast(BLOCK_SIZE - 1)) - count + 1; + for (size_t j = 0; j != count; ++j) { + assert(!emptyFlags[i + j].load(std::memory_order_relaxed)); + emptyFlags[i + j].store(true, std::memory_order_relaxed); + } + return false; + } + else { + // Increment counter + auto prevVal = elementsCompletelyDequeued.fetch_add(count, std::memory_order_release); + assert(prevVal + count <= BLOCK_SIZE); + return prevVal + count == BLOCK_SIZE; + } + } + + template + inline void set_all_empty() + { + MOODYCAMEL_CONSTEXPR_IF (context == explicit_context && BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) { + // Set all flags + for (size_t i = 0; i != BLOCK_SIZE; ++i) { + emptyFlags[i].store(true, std::memory_order_relaxed); + } + } + else { + // Reset counter + elementsCompletelyDequeued.store(BLOCK_SIZE, std::memory_order_relaxed); + } + } + + template + inline void reset_empty() + { + MOODYCAMEL_CONSTEXPR_IF (context == explicit_context && BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) { + // Reset flags + for (size_t i = 0; i != BLOCK_SIZE; ++i) { + emptyFlags[i].store(false, std::memory_order_relaxed); + } + } + else { + // Reset counter + elementsCompletelyDequeued.store(0, std::memory_order_relaxed); + } + } + + inline T* operator[](index_t idx) MOODYCAMEL_NOEXCEPT { return static_cast(static_cast(elements)) + static_cast(idx & static_cast(BLOCK_SIZE - 1)); } + inline T const* operator[](index_t idx) const MOODYCAMEL_NOEXCEPT { return static_cast(static_cast(elements)) + static_cast(idx & static_cast(BLOCK_SIZE - 1)); } + + private: + static_assert(std::alignment_of::value <= sizeof(T), "The queue does not support types with an alignment greater than their 
size at this time"); + MOODYCAMEL_ALIGNED_TYPE_LIKE(char[sizeof(T) * BLOCK_SIZE], T) elements; + public: + Block* next; + std::atomic elementsCompletelyDequeued; + std::atomic emptyFlags[BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD ? BLOCK_SIZE : 1]; + public: + std::atomic freeListRefs; + std::atomic freeListNext; + std::atomic shouldBeOnFreeList; + bool dynamicallyAllocated; // Perhaps a better name for this would be 'isNotPartOfInitialBlockPool' + +#ifdef MCDBGQ_TRACKMEM + void* owner; +#endif + }; + static_assert(std::alignment_of::value >= std::alignment_of::value, "Internal error: Blocks must be at least as aligned as the type they are wrapping"); + + +#ifdef MCDBGQ_TRACKMEM +public: + struct MemStats; +private: +#endif + + /////////////////////////// + // Producer base + /////////////////////////// + + struct ProducerBase : public details::ConcurrentQueueProducerTypelessBase + { + ProducerBase(ConcurrentQueue* parent_, bool isExplicit_) : + tailIndex(0), + headIndex(0), + dequeueOptimisticCount(0), + dequeueOvercommit(0), + tailBlock(nullptr), + isExplicit(isExplicit_), + parent(parent_) + { + } + + virtual ~ProducerBase() { } + + template + inline bool dequeue(U& element) + { + if (isExplicit) { + return static_cast(this)->dequeue(element); + } + else { + return static_cast(this)->dequeue(element); + } + } + + template + inline size_t dequeue_bulk(It& itemFirst, size_t max) + { + if (isExplicit) { + return static_cast(this)->dequeue_bulk(itemFirst, max); + } + else { + return static_cast(this)->dequeue_bulk(itemFirst, max); + } + } + + inline ProducerBase* next_prod() const { return static_cast(next); } + + inline size_t size_approx() const + { + auto tail = tailIndex.load(std::memory_order_relaxed); + auto head = headIndex.load(std::memory_order_relaxed); + return details::circular_less_than(head, tail) ? 
static_cast(tail - head) : 0; + } + + inline index_t getTail() const { return tailIndex.load(std::memory_order_relaxed); } + protected: + std::atomic tailIndex; // Where to enqueue to next + std::atomic headIndex; // Where to dequeue from next + + std::atomic dequeueOptimisticCount; + std::atomic dequeueOvercommit; + + Block* tailBlock; + + public: + bool isExplicit; + ConcurrentQueue* parent; + + protected: +#ifdef MCDBGQ_TRACKMEM + friend struct MemStats; +#endif + }; + + + /////////////////////////// + // Explicit queue + /////////////////////////// + + struct ExplicitProducer : public ProducerBase + { + explicit ExplicitProducer(ConcurrentQueue* parent_) : + ProducerBase(parent_, true), + blockIndex(nullptr), + pr_blockIndexSlotsUsed(0), + pr_blockIndexSize(EXPLICIT_INITIAL_INDEX_SIZE >> 1), + pr_blockIndexFront(0), + pr_blockIndexEntries(nullptr), + pr_blockIndexRaw(nullptr) + { + size_t poolBasedIndexSize = details::ceil_to_pow_2(parent_->initialBlockPoolSize) >> 1; + if (poolBasedIndexSize > pr_blockIndexSize) { + pr_blockIndexSize = poolBasedIndexSize; + } + + new_block_index(0); // This creates an index with double the number of current entries, i.e. EXPLICIT_INITIAL_INDEX_SIZE + } + + ~ExplicitProducer() + { + // Destruct any elements not yet dequeued. + // Since we're in the destructor, we can assume all elements + // are either completely dequeued or completely not (no halfways). 
+ if (this->tailBlock != nullptr) { // Note this means there must be a block index too + // First find the block that's partially dequeued, if any + Block* halfDequeuedBlock = nullptr; + if ((this->headIndex.load(std::memory_order_relaxed) & static_cast(BLOCK_SIZE - 1)) != 0) { + // The head's not on a block boundary, meaning a block somewhere is partially dequeued + // (or the head block is the tail block and was fully dequeued, but the head/tail are still not on a boundary) + size_t i = (pr_blockIndexFront - pr_blockIndexSlotsUsed) & (pr_blockIndexSize - 1); + while (details::circular_less_than(pr_blockIndexEntries[i].base + BLOCK_SIZE, this->headIndex.load(std::memory_order_relaxed))) { + i = (i + 1) & (pr_blockIndexSize - 1); + } + assert(details::circular_less_than(pr_blockIndexEntries[i].base, this->headIndex.load(std::memory_order_relaxed))); + halfDequeuedBlock = pr_blockIndexEntries[i].block; + } + + // Start at the head block (note the first line in the loop gives us the head from the tail on the first iteration) + auto block = this->tailBlock; + do { + block = block->next; + if (block->ConcurrentQueue::Block::template is_empty()) { + continue; + } + + size_t i = 0; // Offset into block + if (block == halfDequeuedBlock) { + i = static_cast(this->headIndex.load(std::memory_order_relaxed) & static_cast(BLOCK_SIZE - 1)); + } + + // Walk through all the items in the block; if this is the tail block, we need to stop when we reach the tail index + auto lastValidIndex = (this->tailIndex.load(std::memory_order_relaxed) & static_cast(BLOCK_SIZE - 1)) == 0 ? 
BLOCK_SIZE : static_cast(this->tailIndex.load(std::memory_order_relaxed) & static_cast(BLOCK_SIZE - 1)); + while (i != BLOCK_SIZE && (block != this->tailBlock || i != lastValidIndex)) { + (*block)[i++]->~T(); + } + } while (block != this->tailBlock); + } + + // Destroy all blocks that we own + if (this->tailBlock != nullptr) { + auto block = this->tailBlock; + do { + auto nextBlock = block->next; + if (block->dynamicallyAllocated) { + destroy(block); + } + else { + this->parent->add_block_to_free_list(block); + } + block = nextBlock; + } while (block != this->tailBlock); + } + + // Destroy the block indices + auto header = static_cast(pr_blockIndexRaw); + while (header != nullptr) { + auto prev = static_cast(header->prev); + header->~BlockIndexHeader(); + (Traits::free)(header); + header = prev; + } + } + + template + inline bool enqueue(U&& element) + { + index_t currentTailIndex = this->tailIndex.load(std::memory_order_relaxed); + index_t newTailIndex = 1 + currentTailIndex; + if ((currentTailIndex & static_cast(BLOCK_SIZE - 1)) == 0) { + // We reached the end of a block, start a new one + auto startBlock = this->tailBlock; + auto originalBlockIndexSlotsUsed = pr_blockIndexSlotsUsed; + if (this->tailBlock != nullptr && this->tailBlock->next->ConcurrentQueue::Block::template is_empty()) { + // We can re-use the block ahead of us, it's empty! + this->tailBlock = this->tailBlock->next; + this->tailBlock->ConcurrentQueue::Block::template reset_empty(); + + // We'll put the block on the block index (guaranteed to be room since we're conceptually removing the + // last block from it first -- except instead of removing then adding, we can just overwrite). + // Note that there must be a valid block index here, since even if allocation failed in the ctor, + // it would have been re-attempted when adding the first block to the queue; since there is such + // a block, a block index must have been successfully allocated. 
+ } + else { + // Whatever head value we see here is >= the last value we saw here (relatively), + // and <= its current value. Since we have the most recent tail, the head must be + // <= to it. + auto head = this->headIndex.load(std::memory_order_relaxed); + assert(!details::circular_less_than(currentTailIndex, head)); + if (!details::circular_less_than(head, currentTailIndex + BLOCK_SIZE) + || (MAX_SUBQUEUE_SIZE != details::const_numeric_max::value && (MAX_SUBQUEUE_SIZE == 0 || MAX_SUBQUEUE_SIZE - BLOCK_SIZE < currentTailIndex - head))) { + // We can't enqueue in another block because there's not enough leeway -- the + // tail could surpass the head by the time the block fills up! (Or we'll exceed + // the size limit, if the second part of the condition was true.) + return false; + } + // We're going to need a new block; check that the block index has room + if (pr_blockIndexRaw == nullptr || pr_blockIndexSlotsUsed == pr_blockIndexSize) { + // Hmm, the circular block index is already full -- we'll need + // to allocate a new index. Note pr_blockIndexRaw can only be nullptr if + // the initial allocation failed in the constructor. + + MOODYCAMEL_CONSTEXPR_IF (allocMode == CannotAlloc) { + return false; + } + else if (!new_block_index(pr_blockIndexSlotsUsed)) { + return false; + } + } + + // Insert a new block in the circular linked list + auto newBlock = this->parent->ConcurrentQueue::template requisition_block(); + if (newBlock == nullptr) { + return false; + } +#ifdef MCDBGQ_TRACKMEM + newBlock->owner = this; +#endif + newBlock->ConcurrentQueue::Block::template reset_empty(); + if (this->tailBlock == nullptr) { + newBlock->next = newBlock; + } + else { + newBlock->next = this->tailBlock->next; + this->tailBlock->next = newBlock; + } + this->tailBlock = newBlock; + ++pr_blockIndexSlotsUsed; + } + + MOODYCAMEL_CONSTEXPR_IF (!MOODYCAMEL_NOEXCEPT_CTOR(T, U, new (static_cast(nullptr)) T(std::forward(element)))) { + // The constructor may throw. 
We want the element not to appear in the queue in + // that case (without corrupting the queue): + MOODYCAMEL_TRY { + new ((*this->tailBlock)[currentTailIndex]) T(std::forward(element)); + } + MOODYCAMEL_CATCH (...) { + // Revert change to the current block, but leave the new block available + // for next time + pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed; + this->tailBlock = startBlock == nullptr ? this->tailBlock : startBlock; + MOODYCAMEL_RETHROW; + } + } + else { + (void)startBlock; + (void)originalBlockIndexSlotsUsed; + } + + // Add block to block index + auto& entry = blockIndex.load(std::memory_order_relaxed)->entries[pr_blockIndexFront]; + entry.base = currentTailIndex; + entry.block = this->tailBlock; + blockIndex.load(std::memory_order_relaxed)->front.store(pr_blockIndexFront, std::memory_order_release); + pr_blockIndexFront = (pr_blockIndexFront + 1) & (pr_blockIndexSize - 1); + + MOODYCAMEL_CONSTEXPR_IF (!MOODYCAMEL_NOEXCEPT_CTOR(T, U, new (static_cast(nullptr)) T(std::forward(element)))) { + this->tailIndex.store(newTailIndex, std::memory_order_release); + return true; + } + } + + // Enqueue + new ((*this->tailBlock)[currentTailIndex]) T(std::forward(element)); + + this->tailIndex.store(newTailIndex, std::memory_order_release); + return true; + } + + template + bool dequeue(U& element) + { + auto tail = this->tailIndex.load(std::memory_order_relaxed); + auto overcommit = this->dequeueOvercommit.load(std::memory_order_relaxed); + if (details::circular_less_than(this->dequeueOptimisticCount.load(std::memory_order_relaxed) - overcommit, tail)) { + // Might be something to dequeue, let's give it a try + + // Note that this if is purely for performance purposes in the common case when the queue is + // empty and the values are eventually consistent -- we may enter here spuriously. 
+ + // Note that whatever the values of overcommit and tail are, they are not going to change (unless we + // change them) and must be the same value at this point (inside the if) as when the if condition was + // evaluated. + + // We insert an acquire fence here to synchronize-with the release upon incrementing dequeueOvercommit below. + // This ensures that whatever the value we got loaded into overcommit, the load of dequeueOptimisticCount in + // the fetch_add below will result in a value at least as recent as that (and therefore at least as large). + // Note that I believe a compiler (signal) fence here would be sufficient due to the nature of fetch_add (all + // read-modify-write operations are guaranteed to work on the latest value in the modification order), but + // unfortunately that can't be shown to be correct using only the C++11 standard. + // See http://stackoverflow.com/questions/18223161/what-are-the-c11-memory-ordering-guarantees-in-this-corner-case + std::atomic_thread_fence(std::memory_order_acquire); + + // Increment optimistic counter, then check if it went over the boundary + auto myDequeueCount = this->dequeueOptimisticCount.fetch_add(1, std::memory_order_relaxed); + + // Note that since dequeueOvercommit must be <= dequeueOptimisticCount (because dequeueOvercommit is only ever + // incremented after dequeueOptimisticCount -- this is enforced in the `else` block below), and since we now + // have a version of dequeueOptimisticCount that is at least as recent as overcommit (due to the release upon + // incrementing dequeueOvercommit and the acquire above that synchronizes with it), overcommit <= myDequeueCount. + // However, we can't assert this since both dequeueOptimisticCount and dequeueOvercommit may (independently) + // overflow; in such a case, though, the logic still holds since the difference between the two is maintained. 
+ + // Note that we reload tail here in case it changed; it will be the same value as before or greater, since + // this load is sequenced after (happens after) the earlier load above. This is supported by read-read + // coherency (as defined in the standard), explained here: http://en.cppreference.com/w/cpp/atomic/memory_order + tail = this->tailIndex.load(std::memory_order_acquire); + if ((details::likely)(details::circular_less_than(myDequeueCount - overcommit, tail))) { + // Guaranteed to be at least one element to dequeue! + + // Get the index. Note that since there's guaranteed to be at least one element, this + // will never exceed tail. We need to do an acquire-release fence here since it's possible + // that whatever condition got us to this point was for an earlier enqueued element (that + // we already see the memory effects for), but that by the time we increment somebody else + // has incremented it, and we need to see the memory effects for *that* element, which is + // in such a case is necessarily visible on the thread that incremented it in the first + // place with the more current condition (they must have acquired a tail that is at least + // as recent). + auto index = this->headIndex.fetch_add(1, std::memory_order_acq_rel); + + + // Determine which block the element is in + + auto localBlockIndex = blockIndex.load(std::memory_order_acquire); + auto localBlockIndexHead = localBlockIndex->front.load(std::memory_order_acquire); + + // We need to be careful here about subtracting and dividing because of index wrap-around. 
+ // When an index wraps, we need to preserve the sign of the offset when dividing it by the + // block size (in order to get a correct signed block count offset in all cases): + auto headBase = localBlockIndex->entries[localBlockIndexHead].base; + auto blockBaseIndex = index & ~static_cast(BLOCK_SIZE - 1); + auto offset = static_cast(static_cast::type>(blockBaseIndex - headBase) / BLOCK_SIZE); + auto block = localBlockIndex->entries[(localBlockIndexHead + offset) & (localBlockIndex->size - 1)].block; + + // Dequeue + auto& el = *((*block)[index]); + if (!MOODYCAMEL_NOEXCEPT_ASSIGN(T, T&&, element = std::move(el))) { + // Make sure the element is still fully dequeued and destroyed even if the assignment + // throws + struct Guard { + Block* block; + index_t index; + + ~Guard() + { + (*block)[index]->~T(); + block->ConcurrentQueue::Block::template set_empty(index); + } + } guard = { block, index }; + + element = std::move(el); // NOLINT + } + else { + element = std::move(el); // NOLINT + el.~T(); // NOLINT + block->ConcurrentQueue::Block::template set_empty(index); + } + + return true; + } + else { + // Wasn't anything to dequeue after all; make the effective dequeue count eventually consistent + this->dequeueOvercommit.fetch_add(1, std::memory_order_release); // Release so that the fetch_add on dequeueOptimisticCount is guaranteed to happen before this write + } + } + + return false; + } + + template + bool MOODYCAMEL_NO_TSAN enqueue_bulk(It itemFirst, size_t count) + { + // First, we need to make sure we have enough room to enqueue all of the elements; + // this means pre-allocating blocks and putting them in the block index (but only if + // all the allocations succeeded). 
+ index_t startTailIndex = this->tailIndex.load(std::memory_order_relaxed); + auto startBlock = this->tailBlock; + auto originalBlockIndexFront = pr_blockIndexFront; + auto originalBlockIndexSlotsUsed = pr_blockIndexSlotsUsed; + + Block* firstAllocatedBlock = nullptr; + + // Figure out how many blocks we'll need to allocate, and do so + size_t blockBaseDiff = ((startTailIndex + count - 1) & ~static_cast(BLOCK_SIZE - 1)) - ((startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1)); + index_t currentTailIndex = (startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1); + if (blockBaseDiff > 0) { + // Allocate as many blocks as possible from ahead + while (blockBaseDiff > 0 && this->tailBlock != nullptr && this->tailBlock->next != firstAllocatedBlock && this->tailBlock->next->ConcurrentQueue::Block::template is_empty()) { + blockBaseDiff -= static_cast(BLOCK_SIZE); + currentTailIndex += static_cast(BLOCK_SIZE); + + this->tailBlock = this->tailBlock->next; + firstAllocatedBlock = firstAllocatedBlock == nullptr ? 
this->tailBlock : firstAllocatedBlock; + + auto& entry = blockIndex.load(std::memory_order_relaxed)->entries[pr_blockIndexFront]; + entry.base = currentTailIndex; + entry.block = this->tailBlock; + pr_blockIndexFront = (pr_blockIndexFront + 1) & (pr_blockIndexSize - 1); + } + + // Now allocate as many blocks as necessary from the block pool + while (blockBaseDiff > 0) { + blockBaseDiff -= static_cast(BLOCK_SIZE); + currentTailIndex += static_cast(BLOCK_SIZE); + + auto head = this->headIndex.load(std::memory_order_relaxed); + assert(!details::circular_less_than(currentTailIndex, head)); + bool full = !details::circular_less_than(head, currentTailIndex + BLOCK_SIZE) || (MAX_SUBQUEUE_SIZE != details::const_numeric_max::value && (MAX_SUBQUEUE_SIZE == 0 || MAX_SUBQUEUE_SIZE - BLOCK_SIZE < currentTailIndex - head)); + if (pr_blockIndexRaw == nullptr || pr_blockIndexSlotsUsed == pr_blockIndexSize || full) { + MOODYCAMEL_CONSTEXPR_IF (allocMode == CannotAlloc) { + // Failed to allocate, undo changes (but keep injected blocks) + pr_blockIndexFront = originalBlockIndexFront; + pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed; + this->tailBlock = startBlock == nullptr ? firstAllocatedBlock : startBlock; + return false; + } + else if (full || !new_block_index(originalBlockIndexSlotsUsed)) { + // Failed to allocate, undo changes (but keep injected blocks) + pr_blockIndexFront = originalBlockIndexFront; + pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed; + this->tailBlock = startBlock == nullptr ? 
firstAllocatedBlock : startBlock; + return false; + } + + // pr_blockIndexFront is updated inside new_block_index, so we need to + // update our fallback value too (since we keep the new index even if we + // later fail) + originalBlockIndexFront = originalBlockIndexSlotsUsed; + } + + // Insert a new block in the circular linked list + auto newBlock = this->parent->ConcurrentQueue::template requisition_block(); + if (newBlock == nullptr) { + pr_blockIndexFront = originalBlockIndexFront; + pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed; + this->tailBlock = startBlock == nullptr ? firstAllocatedBlock : startBlock; + return false; + } + +#ifdef MCDBGQ_TRACKMEM + newBlock->owner = this; +#endif + newBlock->ConcurrentQueue::Block::template set_all_empty(); + if (this->tailBlock == nullptr) { + newBlock->next = newBlock; + } + else { + newBlock->next = this->tailBlock->next; + this->tailBlock->next = newBlock; + } + this->tailBlock = newBlock; + firstAllocatedBlock = firstAllocatedBlock == nullptr ? this->tailBlock : firstAllocatedBlock; + + ++pr_blockIndexSlotsUsed; + + auto& entry = blockIndex.load(std::memory_order_relaxed)->entries[pr_blockIndexFront]; + entry.base = currentTailIndex; + entry.block = this->tailBlock; + pr_blockIndexFront = (pr_blockIndexFront + 1) & (pr_blockIndexSize - 1); + } + + // Excellent, all allocations succeeded. 
Reset each block's emptiness before we fill them up, and + // publish the new block index front + auto block = firstAllocatedBlock; + while (true) { + block->ConcurrentQueue::Block::template reset_empty(); + if (block == this->tailBlock) { + break; + } + block = block->next; + } + + MOODYCAMEL_CONSTEXPR_IF (MOODYCAMEL_NOEXCEPT_CTOR(T, decltype(*itemFirst), new (static_cast(nullptr)) T(details::deref_noexcept(itemFirst)))) { + blockIndex.load(std::memory_order_relaxed)->front.store((pr_blockIndexFront - 1) & (pr_blockIndexSize - 1), std::memory_order_release); + } + } + + // Enqueue, one block at a time + index_t newTailIndex = startTailIndex + static_cast(count); + currentTailIndex = startTailIndex; + auto endBlock = this->tailBlock; + this->tailBlock = startBlock; + assert((startTailIndex & static_cast(BLOCK_SIZE - 1)) != 0 || firstAllocatedBlock != nullptr || count == 0); + if ((startTailIndex & static_cast(BLOCK_SIZE - 1)) == 0 && firstAllocatedBlock != nullptr) { + this->tailBlock = firstAllocatedBlock; + } + while (true) { + index_t stopIndex = (currentTailIndex & ~static_cast(BLOCK_SIZE - 1)) + static_cast(BLOCK_SIZE); + if (details::circular_less_than(newTailIndex, stopIndex)) { + stopIndex = newTailIndex; + } + MOODYCAMEL_CONSTEXPR_IF (MOODYCAMEL_NOEXCEPT_CTOR(T, decltype(*itemFirst), new (static_cast(nullptr)) T(details::deref_noexcept(itemFirst)))) { + while (currentTailIndex != stopIndex) { + new ((*this->tailBlock)[currentTailIndex++]) T(*itemFirst++); + } + } + else { + MOODYCAMEL_TRY { + while (currentTailIndex != stopIndex) { + // Must use copy constructor even if move constructor is available + // because we may have to revert if there's an exception. 
+ // Sorry about the horrible templated next line, but it was the only way + // to disable moving *at compile time*, which is important because a type + // may only define a (noexcept) move constructor, and so calls to the + // cctor will not compile, even if they are in an if branch that will never + // be executed + new ((*this->tailBlock)[currentTailIndex]) T(details::nomove_if(nullptr)) T(details::deref_noexcept(itemFirst)))>::eval(*itemFirst)); + ++currentTailIndex; + ++itemFirst; + } + } + MOODYCAMEL_CATCH (...) { + // Oh dear, an exception's been thrown -- destroy the elements that + // were enqueued so far and revert the entire bulk operation (we'll keep + // any allocated blocks in our linked list for later, though). + auto constructedStopIndex = currentTailIndex; + auto lastBlockEnqueued = this->tailBlock; + + pr_blockIndexFront = originalBlockIndexFront; + pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed; + this->tailBlock = startBlock == nullptr ? firstAllocatedBlock : startBlock; + + if (!details::is_trivially_destructible::value) { + auto block = startBlock; + if ((startTailIndex & static_cast(BLOCK_SIZE - 1)) == 0) { + block = firstAllocatedBlock; + } + currentTailIndex = startTailIndex; + while (true) { + stopIndex = (currentTailIndex & ~static_cast(BLOCK_SIZE - 1)) + static_cast(BLOCK_SIZE); + if (details::circular_less_than(constructedStopIndex, stopIndex)) { + stopIndex = constructedStopIndex; + } + while (currentTailIndex != stopIndex) { + (*block)[currentTailIndex++]->~T(); + } + if (block == lastBlockEnqueued) { + break; + } + block = block->next; + } + } + MOODYCAMEL_RETHROW; + } + } + + if (this->tailBlock == endBlock) { + assert(currentTailIndex == newTailIndex); + break; + } + this->tailBlock = this->tailBlock->next; + } + + MOODYCAMEL_CONSTEXPR_IF (!MOODYCAMEL_NOEXCEPT_CTOR(T, decltype(*itemFirst), new (static_cast(nullptr)) T(details::deref_noexcept(itemFirst)))) { + if (firstAllocatedBlock != nullptr) + 
blockIndex.load(std::memory_order_relaxed)->front.store((pr_blockIndexFront - 1) & (pr_blockIndexSize - 1), std::memory_order_release); + } + + this->tailIndex.store(newTailIndex, std::memory_order_release); + return true; + } + + template + size_t dequeue_bulk(It& itemFirst, size_t max) + { + auto tail = this->tailIndex.load(std::memory_order_relaxed); + auto overcommit = this->dequeueOvercommit.load(std::memory_order_relaxed); + auto desiredCount = static_cast(tail - (this->dequeueOptimisticCount.load(std::memory_order_relaxed) - overcommit)); + if (details::circular_less_than(0, desiredCount)) { + desiredCount = desiredCount < max ? desiredCount : max; + std::atomic_thread_fence(std::memory_order_acquire); + + auto myDequeueCount = this->dequeueOptimisticCount.fetch_add(desiredCount, std::memory_order_relaxed); + + tail = this->tailIndex.load(std::memory_order_acquire); + auto actualCount = static_cast(tail - (myDequeueCount - overcommit)); + if (details::circular_less_than(0, actualCount)) { + actualCount = desiredCount < actualCount ? desiredCount : actualCount; + if (actualCount < desiredCount) { + this->dequeueOvercommit.fetch_add(desiredCount - actualCount, std::memory_order_release); + } + + // Get the first index. Note that since there's guaranteed to be at least actualCount elements, this + // will never exceed tail. 
+ auto firstIndex = this->headIndex.fetch_add(actualCount, std::memory_order_acq_rel); + + // Determine which block the first element is in + auto localBlockIndex = blockIndex.load(std::memory_order_acquire); + auto localBlockIndexHead = localBlockIndex->front.load(std::memory_order_acquire); + + auto headBase = localBlockIndex->entries[localBlockIndexHead].base; + auto firstBlockBaseIndex = firstIndex & ~static_cast(BLOCK_SIZE - 1); + auto offset = static_cast(static_cast::type>(firstBlockBaseIndex - headBase) / BLOCK_SIZE); + auto indexIndex = (localBlockIndexHead + offset) & (localBlockIndex->size - 1); + + // Iterate the blocks and dequeue + auto index = firstIndex; + do { + auto firstIndexInBlock = index; + index_t endIndex = (index & ~static_cast(BLOCK_SIZE - 1)) + static_cast(BLOCK_SIZE); + endIndex = details::circular_less_than(firstIndex + static_cast(actualCount), endIndex) ? firstIndex + static_cast(actualCount) : endIndex; + auto block = localBlockIndex->entries[indexIndex].block; + if (MOODYCAMEL_NOEXCEPT_ASSIGN(T, T&&, details::deref_noexcept(itemFirst) = std::move((*(*block)[index])))) { + while (index != endIndex) { + auto& el = *((*block)[index]); + *itemFirst++ = std::move(el); + el.~T(); + ++index; + } + } + else { + MOODYCAMEL_TRY { + while (index != endIndex) { + auto& el = *((*block)[index]); + *itemFirst = std::move(el); + ++itemFirst; + el.~T(); + ++index; + } + } + MOODYCAMEL_CATCH (...) 
{ + // It's too late to revert the dequeue, but we can make sure that all + // the dequeued objects are properly destroyed and the block index + // (and empty count) are properly updated before we propagate the exception + do { + block = localBlockIndex->entries[indexIndex].block; + while (index != endIndex) { + (*block)[index++]->~T(); + } + block->ConcurrentQueue::Block::template set_many_empty(firstIndexInBlock, static_cast(endIndex - firstIndexInBlock)); + indexIndex = (indexIndex + 1) & (localBlockIndex->size - 1); + + firstIndexInBlock = index; + endIndex = (index & ~static_cast(BLOCK_SIZE - 1)) + static_cast(BLOCK_SIZE); + endIndex = details::circular_less_than(firstIndex + static_cast(actualCount), endIndex) ? firstIndex + static_cast(actualCount) : endIndex; + } while (index != firstIndex + actualCount); + + MOODYCAMEL_RETHROW; + } + } + block->ConcurrentQueue::Block::template set_many_empty(firstIndexInBlock, static_cast(endIndex - firstIndexInBlock)); + indexIndex = (indexIndex + 1) & (localBlockIndex->size - 1); + } while (index != firstIndex + actualCount); + + return actualCount; + } + else { + // Wasn't anything to dequeue after all; make the effective dequeue count eventually consistent + this->dequeueOvercommit.fetch_add(desiredCount, std::memory_order_release); + } + } + + return 0; + } + + private: + struct BlockIndexEntry + { + index_t base; + Block* block; + }; + + struct BlockIndexHeader + { + size_t size; + std::atomic front; // Current slot (not next, like pr_blockIndexFront) + BlockIndexEntry* entries; + void* prev; + }; + + + bool new_block_index(size_t numberOfFilledSlotsToExpose) + { + auto prevBlockSizeMask = pr_blockIndexSize - 1; + + // Create the new block + pr_blockIndexSize <<= 1; + auto newRawPtr = static_cast((Traits::malloc)(sizeof(BlockIndexHeader) + std::alignment_of::value - 1 + sizeof(BlockIndexEntry) * pr_blockIndexSize)); + if (newRawPtr == nullptr) { + pr_blockIndexSize >>= 1; // Reset to allow graceful retry + return 
false; + } + + auto newBlockIndexEntries = reinterpret_cast(details::align_for(newRawPtr + sizeof(BlockIndexHeader))); + + // Copy in all the old indices, if any + size_t j = 0; + if (pr_blockIndexSlotsUsed != 0) { + auto i = (pr_blockIndexFront - pr_blockIndexSlotsUsed) & prevBlockSizeMask; + do { + newBlockIndexEntries[j++] = pr_blockIndexEntries[i]; + i = (i + 1) & prevBlockSizeMask; + } while (i != pr_blockIndexFront); + } + + // Update everything + auto header = new (newRawPtr) BlockIndexHeader; + header->size = pr_blockIndexSize; + header->front.store(numberOfFilledSlotsToExpose - 1, std::memory_order_relaxed); + header->entries = newBlockIndexEntries; + header->prev = pr_blockIndexRaw; // we link the new block to the old one so we can free it later + + pr_blockIndexFront = j; + pr_blockIndexEntries = newBlockIndexEntries; + pr_blockIndexRaw = newRawPtr; + blockIndex.store(header, std::memory_order_release); + + return true; + } + + private: + std::atomic blockIndex; + + // To be used by producer only -- consumer must use the ones in referenced by blockIndex + size_t pr_blockIndexSlotsUsed; + size_t pr_blockIndexSize; + size_t pr_blockIndexFront; // Next slot (not current) + BlockIndexEntry* pr_blockIndexEntries; + void* pr_blockIndexRaw; + +#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG + public: + ExplicitProducer* nextExplicitProducer; + private: +#endif + +#ifdef MCDBGQ_TRACKMEM + friend struct MemStats; +#endif + }; + + + ////////////////////////////////// + // Implicit queue + ////////////////////////////////// + + struct ImplicitProducer : public ProducerBase + { + ImplicitProducer(ConcurrentQueue* parent_) : + ProducerBase(parent_, false), + nextBlockIndexCapacity(IMPLICIT_INITIAL_INDEX_SIZE), + blockIndex(nullptr) + { + new_block_index(); + } + + ~ImplicitProducer() + { + // Note that since we're in the destructor we can assume that all enqueue/dequeue operations + // completed already; this means that all undequeued elements are placed contiguously across + 
// contiguous blocks, and that only the first and last remaining blocks can be only partially + // empty (all other remaining blocks must be completely full). + +#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED + // Unregister ourselves for thread termination notification + if (!this->inactive.load(std::memory_order_relaxed)) { + details::ThreadExitNotifier::unsubscribe(&threadExitListener); + } +#endif + + // Destroy all remaining elements! + auto tail = this->tailIndex.load(std::memory_order_relaxed); + auto index = this->headIndex.load(std::memory_order_relaxed); + Block* block = nullptr; + assert(index == tail || details::circular_less_than(index, tail)); + bool forceFreeLastBlock = index != tail; // If we enter the loop, then the last (tail) block will not be freed + while (index != tail) { + if ((index & static_cast(BLOCK_SIZE - 1)) == 0 || block == nullptr) { + if (block != nullptr) { + // Free the old block + this->parent->add_block_to_free_list(block); + } + + block = get_block_index_entry_for_index(index)->value.load(std::memory_order_relaxed); + } + + ((*block)[index])->~T(); + ++index; + } + // Even if the queue is empty, there's still one block that's not on the free list + // (unless the head index reached the end of it, in which case the tail will be poised + // to create a new block). 
+ if (this->tailBlock != nullptr && (forceFreeLastBlock || (tail & static_cast(BLOCK_SIZE - 1)) != 0)) { + this->parent->add_block_to_free_list(this->tailBlock); + } + + // Destroy block index + auto localBlockIndex = blockIndex.load(std::memory_order_relaxed); + if (localBlockIndex != nullptr) { + for (size_t i = 0; i != localBlockIndex->capacity; ++i) { + localBlockIndex->index[i]->~BlockIndexEntry(); + } + do { + auto prev = localBlockIndex->prev; + localBlockIndex->~BlockIndexHeader(); + (Traits::free)(localBlockIndex); + localBlockIndex = prev; + } while (localBlockIndex != nullptr); + } + } + + template + inline bool enqueue(U&& element) + { + index_t currentTailIndex = this->tailIndex.load(std::memory_order_relaxed); + index_t newTailIndex = 1 + currentTailIndex; + if ((currentTailIndex & static_cast(BLOCK_SIZE - 1)) == 0) { + // We reached the end of a block, start a new one + auto head = this->headIndex.load(std::memory_order_relaxed); + assert(!details::circular_less_than(currentTailIndex, head)); + if (!details::circular_less_than(head, currentTailIndex + BLOCK_SIZE) || (MAX_SUBQUEUE_SIZE != details::const_numeric_max::value && (MAX_SUBQUEUE_SIZE == 0 || MAX_SUBQUEUE_SIZE - BLOCK_SIZE < currentTailIndex - head))) { + return false; + } +#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX + debug::DebugLock lock(mutex); +#endif + // Find out where we'll be inserting this block in the block index + BlockIndexEntry* idxEntry; + if (!insert_block_index_entry(idxEntry, currentTailIndex)) { + return false; + } + + // Get ahold of a new block + auto newBlock = this->parent->ConcurrentQueue::template requisition_block(); + if (newBlock == nullptr) { + rewind_block_index_tail(); + idxEntry->value.store(nullptr, std::memory_order_relaxed); + return false; + } +#ifdef MCDBGQ_TRACKMEM + newBlock->owner = this; +#endif + newBlock->ConcurrentQueue::Block::template reset_empty(); + + MOODYCAMEL_CONSTEXPR_IF (!MOODYCAMEL_NOEXCEPT_CTOR(T, U, new (static_cast(nullptr)) 
T(std::forward(element)))) { + // May throw, try to insert now before we publish the fact that we have this new block + MOODYCAMEL_TRY { + new ((*newBlock)[currentTailIndex]) T(std::forward(element)); + } + MOODYCAMEL_CATCH (...) { + rewind_block_index_tail(); + idxEntry->value.store(nullptr, std::memory_order_relaxed); + this->parent->add_block_to_free_list(newBlock); + MOODYCAMEL_RETHROW; + } + } + + // Insert the new block into the index + idxEntry->value.store(newBlock, std::memory_order_relaxed); + + this->tailBlock = newBlock; + + MOODYCAMEL_CONSTEXPR_IF (!MOODYCAMEL_NOEXCEPT_CTOR(T, U, new (static_cast(nullptr)) T(std::forward(element)))) { + this->tailIndex.store(newTailIndex, std::memory_order_release); + return true; + } + } + + // Enqueue + new ((*this->tailBlock)[currentTailIndex]) T(std::forward(element)); + + this->tailIndex.store(newTailIndex, std::memory_order_release); + return true; + } + + template + bool dequeue(U& element) + { + // See ExplicitProducer::dequeue for rationale and explanation + index_t tail = this->tailIndex.load(std::memory_order_relaxed); + index_t overcommit = this->dequeueOvercommit.load(std::memory_order_relaxed); + if (details::circular_less_than(this->dequeueOptimisticCount.load(std::memory_order_relaxed) - overcommit, tail)) { + std::atomic_thread_fence(std::memory_order_acquire); + + index_t myDequeueCount = this->dequeueOptimisticCount.fetch_add(1, std::memory_order_relaxed); + tail = this->tailIndex.load(std::memory_order_acquire); + if ((details::likely)(details::circular_less_than(myDequeueCount - overcommit, tail))) { + index_t index = this->headIndex.fetch_add(1, std::memory_order_acq_rel); + + // Determine which block the element is in + auto entry = get_block_index_entry_for_index(index); + + // Dequeue + auto block = entry->value.load(std::memory_order_relaxed); + auto& el = *((*block)[index]); + + if (!MOODYCAMEL_NOEXCEPT_ASSIGN(T, T&&, element = std::move(el))) { +#ifdef 
MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX + // Note: Acquiring the mutex with every dequeue instead of only when a block + // is released is very sub-optimal, but it is, after all, purely debug code. + debug::DebugLock lock(producer->mutex); +#endif + struct Guard { + Block* block; + index_t index; + BlockIndexEntry* entry; + ConcurrentQueue* parent; + + ~Guard() + { + (*block)[index]->~T(); + if (block->ConcurrentQueue::Block::template set_empty(index)) { + entry->value.store(nullptr, std::memory_order_relaxed); + parent->add_block_to_free_list(block); + } + } + } guard = { block, index, entry, this->parent }; + + element = std::move(el); // NOLINT + } + else { + element = std::move(el); // NOLINT + el.~T(); // NOLINT + + if (block->ConcurrentQueue::Block::template set_empty(index)) { + { +#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX + debug::DebugLock lock(mutex); +#endif + // Add the block back into the global free pool (and remove from block index) + entry->value.store(nullptr, std::memory_order_relaxed); + } + this->parent->add_block_to_free_list(block); // releases the above store + } + } + + return true; + } + else { + this->dequeueOvercommit.fetch_add(1, std::memory_order_release); + } + } + + return false; + } + +#ifdef _MSC_VER +#pragma warning(push) +#pragma warning(disable: 4706) // assignment within conditional expression +#endif + template + bool enqueue_bulk(It itemFirst, size_t count) + { + // First, we need to make sure we have enough room to enqueue all of the elements; + // this means pre-allocating blocks and putting them in the block index (but only if + // all the allocations succeeded). + + // Note that the tailBlock we start off with may not be owned by us any more; + // this happens if it was filled up exactly to the top (setting tailIndex to + // the first index of the next block which is not yet allocated), then dequeued + // completely (putting it on the free list) before we enqueue again. 
+ + index_t startTailIndex = this->tailIndex.load(std::memory_order_relaxed); + auto startBlock = this->tailBlock; + Block* firstAllocatedBlock = nullptr; + auto endBlock = this->tailBlock; + + // Figure out how many blocks we'll need to allocate, and do so + size_t blockBaseDiff = ((startTailIndex + count - 1) & ~static_cast(BLOCK_SIZE - 1)) - ((startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1)); + index_t currentTailIndex = (startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1); + if (blockBaseDiff > 0) { +#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX + debug::DebugLock lock(mutex); +#endif + do { + blockBaseDiff -= static_cast(BLOCK_SIZE); + currentTailIndex += static_cast(BLOCK_SIZE); + + // Find out where we'll be inserting this block in the block index + BlockIndexEntry* idxEntry = nullptr; // initialization here unnecessary but compiler can't always tell + Block* newBlock; + bool indexInserted = false; + auto head = this->headIndex.load(std::memory_order_relaxed); + assert(!details::circular_less_than(currentTailIndex, head)); + bool full = !details::circular_less_than(head, currentTailIndex + BLOCK_SIZE) || (MAX_SUBQUEUE_SIZE != details::const_numeric_max::value && (MAX_SUBQUEUE_SIZE == 0 || MAX_SUBQUEUE_SIZE - BLOCK_SIZE < currentTailIndex - head)); + + if (full || !(indexInserted = insert_block_index_entry(idxEntry, currentTailIndex)) || (newBlock = this->parent->ConcurrentQueue::template requisition_block()) == nullptr) { + // Index allocation or block allocation failed; revert any other allocations + // and index insertions done so far for this operation + if (indexInserted) { + rewind_block_index_tail(); + idxEntry->value.store(nullptr, std::memory_order_relaxed); + } + currentTailIndex = (startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1); + for (auto block = firstAllocatedBlock; block != nullptr; block = block->next) { + currentTailIndex += static_cast(BLOCK_SIZE); + idxEntry = get_block_index_entry_for_index(currentTailIndex); + 
idxEntry->value.store(nullptr, std::memory_order_relaxed); + rewind_block_index_tail(); + } + this->parent->add_blocks_to_free_list(firstAllocatedBlock); + this->tailBlock = startBlock; + + return false; + } + +#ifdef MCDBGQ_TRACKMEM + newBlock->owner = this; +#endif + newBlock->ConcurrentQueue::Block::template reset_empty(); + newBlock->next = nullptr; + + // Insert the new block into the index + idxEntry->value.store(newBlock, std::memory_order_relaxed); + + // Store the chain of blocks so that we can undo if later allocations fail, + // and so that we can find the blocks when we do the actual enqueueing + if ((startTailIndex & static_cast(BLOCK_SIZE - 1)) != 0 || firstAllocatedBlock != nullptr) { + assert(this->tailBlock != nullptr); + this->tailBlock->next = newBlock; + } + this->tailBlock = newBlock; + endBlock = newBlock; + firstAllocatedBlock = firstAllocatedBlock == nullptr ? newBlock : firstAllocatedBlock; + } while (blockBaseDiff > 0); + } + + // Enqueue, one block at a time + index_t newTailIndex = startTailIndex + static_cast(count); + currentTailIndex = startTailIndex; + this->tailBlock = startBlock; + assert((startTailIndex & static_cast(BLOCK_SIZE - 1)) != 0 || firstAllocatedBlock != nullptr || count == 0); + if ((startTailIndex & static_cast(BLOCK_SIZE - 1)) == 0 && firstAllocatedBlock != nullptr) { + this->tailBlock = firstAllocatedBlock; + } + while (true) { + index_t stopIndex = (currentTailIndex & ~static_cast(BLOCK_SIZE - 1)) + static_cast(BLOCK_SIZE); + if (details::circular_less_than(newTailIndex, stopIndex)) { + stopIndex = newTailIndex; + } + MOODYCAMEL_CONSTEXPR_IF (MOODYCAMEL_NOEXCEPT_CTOR(T, decltype(*itemFirst), new (static_cast(nullptr)) T(details::deref_noexcept(itemFirst)))) { + while (currentTailIndex != stopIndex) { + new ((*this->tailBlock)[currentTailIndex++]) T(*itemFirst++); + } + } + else { + MOODYCAMEL_TRY { + while (currentTailIndex != stopIndex) { + new ((*this->tailBlock)[currentTailIndex]) T(details::nomove_if(nullptr)) 
T(details::deref_noexcept(itemFirst)))>::eval(*itemFirst)); + ++currentTailIndex; + ++itemFirst; + } + } + MOODYCAMEL_CATCH (...) { + auto constructedStopIndex = currentTailIndex; + auto lastBlockEnqueued = this->tailBlock; + + if (!details::is_trivially_destructible::value) { + auto block = startBlock; + if ((startTailIndex & static_cast(BLOCK_SIZE - 1)) == 0) { + block = firstAllocatedBlock; + } + currentTailIndex = startTailIndex; + while (true) { + stopIndex = (currentTailIndex & ~static_cast(BLOCK_SIZE - 1)) + static_cast(BLOCK_SIZE); + if (details::circular_less_than(constructedStopIndex, stopIndex)) { + stopIndex = constructedStopIndex; + } + while (currentTailIndex != stopIndex) { + (*block)[currentTailIndex++]->~T(); + } + if (block == lastBlockEnqueued) { + break; + } + block = block->next; + } + } + + currentTailIndex = (startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1); + for (auto block = firstAllocatedBlock; block != nullptr; block = block->next) { + currentTailIndex += static_cast(BLOCK_SIZE); + auto idxEntry = get_block_index_entry_for_index(currentTailIndex); + idxEntry->value.store(nullptr, std::memory_order_relaxed); + rewind_block_index_tail(); + } + this->parent->add_blocks_to_free_list(firstAllocatedBlock); + this->tailBlock = startBlock; + MOODYCAMEL_RETHROW; + } + } + + if (this->tailBlock == endBlock) { + assert(currentTailIndex == newTailIndex); + break; + } + this->tailBlock = this->tailBlock->next; + } + this->tailIndex.store(newTailIndex, std::memory_order_release); + return true; + } +#ifdef _MSC_VER +#pragma warning(pop) +#endif + + template + size_t dequeue_bulk(It& itemFirst, size_t max) + { + auto tail = this->tailIndex.load(std::memory_order_relaxed); + auto overcommit = this->dequeueOvercommit.load(std::memory_order_relaxed); + auto desiredCount = static_cast(tail - (this->dequeueOptimisticCount.load(std::memory_order_relaxed) - overcommit)); + if (details::circular_less_than(0, desiredCount)) { + desiredCount = desiredCount < 
max ? desiredCount : max; + std::atomic_thread_fence(std::memory_order_acquire); + + auto myDequeueCount = this->dequeueOptimisticCount.fetch_add(desiredCount, std::memory_order_relaxed); + + tail = this->tailIndex.load(std::memory_order_acquire); + auto actualCount = static_cast(tail - (myDequeueCount - overcommit)); + if (details::circular_less_than(0, actualCount)) { + actualCount = desiredCount < actualCount ? desiredCount : actualCount; + if (actualCount < desiredCount) { + this->dequeueOvercommit.fetch_add(desiredCount - actualCount, std::memory_order_release); + } + + // Get the first index. Note that since there's guaranteed to be at least actualCount elements, this + // will never exceed tail. + auto firstIndex = this->headIndex.fetch_add(actualCount, std::memory_order_acq_rel); + + // Iterate the blocks and dequeue + auto index = firstIndex; + BlockIndexHeader* localBlockIndex; + auto indexIndex = get_block_index_index_for_index(index, localBlockIndex); + do { + auto blockStartIndex = index; + index_t endIndex = (index & ~static_cast(BLOCK_SIZE - 1)) + static_cast(BLOCK_SIZE); + endIndex = details::circular_less_than(firstIndex + static_cast(actualCount), endIndex) ? firstIndex + static_cast(actualCount) : endIndex; + + auto entry = localBlockIndex->index[indexIndex]; + auto block = entry->value.load(std::memory_order_relaxed); + if (MOODYCAMEL_NOEXCEPT_ASSIGN(T, T&&, details::deref_noexcept(itemFirst) = std::move((*(*block)[index])))) { + while (index != endIndex) { + auto& el = *((*block)[index]); + *itemFirst++ = std::move(el); + el.~T(); + ++index; + } + } + else { + MOODYCAMEL_TRY { + while (index != endIndex) { + auto& el = *((*block)[index]); + *itemFirst = std::move(el); + ++itemFirst; + el.~T(); + ++index; + } + } + MOODYCAMEL_CATCH (...) 
{ + do { + entry = localBlockIndex->index[indexIndex]; + block = entry->value.load(std::memory_order_relaxed); + while (index != endIndex) { + (*block)[index++]->~T(); + } + + if (block->ConcurrentQueue::Block::template set_many_empty(blockStartIndex, static_cast(endIndex - blockStartIndex))) { +#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX + debug::DebugLock lock(mutex); +#endif + entry->value.store(nullptr, std::memory_order_relaxed); + this->parent->add_block_to_free_list(block); + } + indexIndex = (indexIndex + 1) & (localBlockIndex->capacity - 1); + + blockStartIndex = index; + endIndex = (index & ~static_cast(BLOCK_SIZE - 1)) + static_cast(BLOCK_SIZE); + endIndex = details::circular_less_than(firstIndex + static_cast(actualCount), endIndex) ? firstIndex + static_cast(actualCount) : endIndex; + } while (index != firstIndex + actualCount); + + MOODYCAMEL_RETHROW; + } + } + if (block->ConcurrentQueue::Block::template set_many_empty(blockStartIndex, static_cast(endIndex - blockStartIndex))) { + { +#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX + debug::DebugLock lock(mutex); +#endif + // Note that the set_many_empty above did a release, meaning that anybody who acquires the block + // we're about to free can use it safely since our writes (and reads!) will have happened-before then. 
+ entry->value.store(nullptr, std::memory_order_relaxed); + } + this->parent->add_block_to_free_list(block); // releases the above store + } + indexIndex = (indexIndex + 1) & (localBlockIndex->capacity - 1); + } while (index != firstIndex + actualCount); + + return actualCount; + } + else { + this->dequeueOvercommit.fetch_add(desiredCount, std::memory_order_release); + } + } + + return 0; + } + + private: + // The block size must be > 1, so any number with the low bit set is an invalid block base index + static const index_t INVALID_BLOCK_BASE = 1; + + struct BlockIndexEntry + { + std::atomic key; + std::atomic value; + }; + + struct BlockIndexHeader + { + size_t capacity; + std::atomic tail; + BlockIndexEntry* entries; + BlockIndexEntry** index; + BlockIndexHeader* prev; + }; + + template + inline bool insert_block_index_entry(BlockIndexEntry*& idxEntry, index_t blockStartIndex) + { + auto localBlockIndex = blockIndex.load(std::memory_order_relaxed); // We're the only writer thread, relaxed is OK + if (localBlockIndex == nullptr) { + return false; // this can happen if new_block_index failed in the constructor + } + size_t newTail = (localBlockIndex->tail.load(std::memory_order_relaxed) + 1) & (localBlockIndex->capacity - 1); + idxEntry = localBlockIndex->index[newTail]; + if (idxEntry->key.load(std::memory_order_relaxed) == INVALID_BLOCK_BASE || + idxEntry->value.load(std::memory_order_relaxed) == nullptr) { + + idxEntry->key.store(blockStartIndex, std::memory_order_relaxed); + localBlockIndex->tail.store(newTail, std::memory_order_release); + return true; + } + + // No room in the old block index, try to allocate another one! 
+ MOODYCAMEL_CONSTEXPR_IF (allocMode == CannotAlloc) { + return false; + } + else if (!new_block_index()) { + return false; + } + localBlockIndex = blockIndex.load(std::memory_order_relaxed); + newTail = (localBlockIndex->tail.load(std::memory_order_relaxed) + 1) & (localBlockIndex->capacity - 1); + idxEntry = localBlockIndex->index[newTail]; + assert(idxEntry->key.load(std::memory_order_relaxed) == INVALID_BLOCK_BASE); + idxEntry->key.store(blockStartIndex, std::memory_order_relaxed); + localBlockIndex->tail.store(newTail, std::memory_order_release); + return true; + } + + inline void rewind_block_index_tail() + { + auto localBlockIndex = blockIndex.load(std::memory_order_relaxed); + localBlockIndex->tail.store((localBlockIndex->tail.load(std::memory_order_relaxed) - 1) & (localBlockIndex->capacity - 1), std::memory_order_relaxed); + } + + inline BlockIndexEntry* get_block_index_entry_for_index(index_t index) const + { + BlockIndexHeader* localBlockIndex; + auto idx = get_block_index_index_for_index(index, localBlockIndex); + return localBlockIndex->index[idx]; + } + + inline size_t get_block_index_index_for_index(index_t index, BlockIndexHeader*& localBlockIndex) const + { +#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX + debug::DebugLock lock(mutex); +#endif + index &= ~static_cast(BLOCK_SIZE - 1); + localBlockIndex = blockIndex.load(std::memory_order_acquire); + auto tail = localBlockIndex->tail.load(std::memory_order_acquire); + auto tailBase = localBlockIndex->index[tail]->key.load(std::memory_order_relaxed); + assert(tailBase != INVALID_BLOCK_BASE); + // Note: Must use division instead of shift because the index may wrap around, causing a negative + // offset, whose negativity we want to preserve + auto offset = static_cast(static_cast::type>(index - tailBase) / BLOCK_SIZE); + size_t idx = (tail + offset) & (localBlockIndex->capacity - 1); + assert(localBlockIndex->index[idx]->key.load(std::memory_order_relaxed) == index && 
localBlockIndex->index[idx]->value.load(std::memory_order_relaxed) != nullptr); + return idx; + } + + bool new_block_index() + { + auto prev = blockIndex.load(std::memory_order_relaxed); + size_t prevCapacity = prev == nullptr ? 0 : prev->capacity; + auto entryCount = prev == nullptr ? nextBlockIndexCapacity : prevCapacity; + auto raw = static_cast((Traits::malloc)( + sizeof(BlockIndexHeader) + + std::alignment_of::value - 1 + sizeof(BlockIndexEntry) * entryCount + + std::alignment_of::value - 1 + sizeof(BlockIndexEntry*) * nextBlockIndexCapacity)); + if (raw == nullptr) { + return false; + } + + auto header = new (raw) BlockIndexHeader; + auto entries = reinterpret_cast(details::align_for(raw + sizeof(BlockIndexHeader))); + auto index = reinterpret_cast(details::align_for(reinterpret_cast(entries) + sizeof(BlockIndexEntry) * entryCount)); + if (prev != nullptr) { + auto prevTail = prev->tail.load(std::memory_order_relaxed); + auto prevPos = prevTail; + size_t i = 0; + do { + prevPos = (prevPos + 1) & (prev->capacity - 1); + index[i++] = prev->index[prevPos]; + } while (prevPos != prevTail); + assert(i == prevCapacity); + } + for (size_t i = 0; i != entryCount; ++i) { + new (entries + i) BlockIndexEntry; + entries[i].key.store(INVALID_BLOCK_BASE, std::memory_order_relaxed); + index[prevCapacity + i] = entries + i; + } + header->prev = prev; + header->entries = entries; + header->index = index; + header->capacity = nextBlockIndexCapacity; + header->tail.store((prevCapacity - 1) & (nextBlockIndexCapacity - 1), std::memory_order_relaxed); + + blockIndex.store(header, std::memory_order_release); + + nextBlockIndexCapacity <<= 1; + + return true; + } + + private: + size_t nextBlockIndexCapacity; + std::atomic blockIndex; + +#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED + public: + details::ThreadExitListener threadExitListener; + private: +#endif + +#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG + public: + ImplicitProducer* nextImplicitProducer; + private: +#endif + +#ifdef 
MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX + mutable debug::DebugMutex mutex; +#endif +#ifdef MCDBGQ_TRACKMEM + friend struct MemStats; +#endif + }; + + + ////////////////////////////////// + // Block pool manipulation + ////////////////////////////////// + + void populate_initial_block_list(size_t blockCount) + { + initialBlockPoolSize = blockCount; + if (initialBlockPoolSize == 0) { + initialBlockPool = nullptr; + return; + } + + initialBlockPool = create_array(blockCount); + if (initialBlockPool == nullptr) { + initialBlockPoolSize = 0; + } + for (size_t i = 0; i < initialBlockPoolSize; ++i) { + initialBlockPool[i].dynamicallyAllocated = false; + } + } + + inline Block* try_get_block_from_initial_pool() + { + if (initialBlockPoolIndex.load(std::memory_order_relaxed) >= initialBlockPoolSize) { + return nullptr; + } + + auto index = initialBlockPoolIndex.fetch_add(1, std::memory_order_relaxed); + + return index < initialBlockPoolSize ? (initialBlockPool + index) : nullptr; + } + + inline void add_block_to_free_list(Block* block) + { +#ifdef MCDBGQ_TRACKMEM + block->owner = nullptr; +#endif + freeList.add(block); + } + + inline void add_blocks_to_free_list(Block* block) + { + while (block != nullptr) { + auto next = block->next; + add_block_to_free_list(block); + block = next; + } + } + + inline Block* try_get_block_from_free_list() + { + return freeList.try_get(); + } + + // Gets a free block from one of the memory pools, or allocates a new one (if applicable) + template + Block* requisition_block() + { + auto block = try_get_block_from_initial_pool(); + if (block != nullptr) { + return block; + } + + block = try_get_block_from_free_list(); + if (block != nullptr) { + return block; + } + + MOODYCAMEL_CONSTEXPR_IF (canAlloc == CanAlloc) { + return create(); + } + else { + return nullptr; + } + } + + +#ifdef MCDBGQ_TRACKMEM + public: + struct MemStats { + size_t allocatedBlocks; + size_t usedBlocks; + size_t freeBlocks; + size_t ownedBlocksExplicit; + size_t 
ownedBlocksImplicit; + size_t implicitProducers; + size_t explicitProducers; + size_t elementsEnqueued; + size_t blockClassBytes; + size_t queueClassBytes; + size_t implicitBlockIndexBytes; + size_t explicitBlockIndexBytes; + + friend class ConcurrentQueue; + + private: + static MemStats getFor(ConcurrentQueue* q) + { + MemStats stats = { 0 }; + + stats.elementsEnqueued = q->size_approx(); + + auto block = q->freeList.head_unsafe(); + while (block != nullptr) { + ++stats.allocatedBlocks; + ++stats.freeBlocks; + block = block->freeListNext.load(std::memory_order_relaxed); + } + + for (auto ptr = q->producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) { + bool implicit = dynamic_cast(ptr) != nullptr; + stats.implicitProducers += implicit ? 1 : 0; + stats.explicitProducers += implicit ? 0 : 1; + + if (implicit) { + auto prod = static_cast(ptr); + stats.queueClassBytes += sizeof(ImplicitProducer); + auto head = prod->headIndex.load(std::memory_order_relaxed); + auto tail = prod->tailIndex.load(std::memory_order_relaxed); + auto hash = prod->blockIndex.load(std::memory_order_relaxed); + if (hash != nullptr) { + for (size_t i = 0; i != hash->capacity; ++i) { + if (hash->index[i]->key.load(std::memory_order_relaxed) != ImplicitProducer::INVALID_BLOCK_BASE && hash->index[i]->value.load(std::memory_order_relaxed) != nullptr) { + ++stats.allocatedBlocks; + ++stats.ownedBlocksImplicit; + } + } + stats.implicitBlockIndexBytes += hash->capacity * sizeof(typename ImplicitProducer::BlockIndexEntry); + for (; hash != nullptr; hash = hash->prev) { + stats.implicitBlockIndexBytes += sizeof(typename ImplicitProducer::BlockIndexHeader) + hash->capacity * sizeof(typename ImplicitProducer::BlockIndexEntry*); + } + } + for (; details::circular_less_than(head, tail); head += BLOCK_SIZE) { + //auto block = prod->get_block_index_entry_for_index(head); + ++stats.usedBlocks; + } + } + else { + auto prod = static_cast(ptr); + stats.queueClassBytes += 
sizeof(ExplicitProducer); + auto tailBlock = prod->tailBlock; + bool wasNonEmpty = false; + if (tailBlock != nullptr) { + auto block = tailBlock; + do { + ++stats.allocatedBlocks; + if (!block->ConcurrentQueue::Block::template is_empty() || wasNonEmpty) { + ++stats.usedBlocks; + wasNonEmpty = wasNonEmpty || block != tailBlock; + } + ++stats.ownedBlocksExplicit; + block = block->next; + } while (block != tailBlock); + } + auto index = prod->blockIndex.load(std::memory_order_relaxed); + while (index != nullptr) { + stats.explicitBlockIndexBytes += sizeof(typename ExplicitProducer::BlockIndexHeader) + index->size * sizeof(typename ExplicitProducer::BlockIndexEntry); + index = static_cast(index->prev); + } + } + } + + auto freeOnInitialPool = q->initialBlockPoolIndex.load(std::memory_order_relaxed) >= q->initialBlockPoolSize ? 0 : q->initialBlockPoolSize - q->initialBlockPoolIndex.load(std::memory_order_relaxed); + stats.allocatedBlocks += freeOnInitialPool; + stats.freeBlocks += freeOnInitialPool; + + stats.blockClassBytes = sizeof(Block) * stats.allocatedBlocks; + stats.queueClassBytes += sizeof(ConcurrentQueue); + + return stats; + } + }; + + // For debugging only. Not thread-safe. 
+ MemStats getMemStats() + { + return MemStats::getFor(this); + } + private: + friend struct MemStats; +#endif + + + ////////////////////////////////// + // Producer list manipulation + ////////////////////////////////// + + ProducerBase* recycle_or_create_producer(bool isExplicit) + { + bool recycled; + return recycle_or_create_producer(isExplicit, recycled); + } + + ProducerBase* recycle_or_create_producer(bool isExplicit, bool& recycled) + { +#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODHASH + debug::DebugLock lock(implicitProdMutex); +#endif + // Try to re-use one first + for (auto ptr = producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) { + if (ptr->inactive.load(std::memory_order_relaxed) && ptr->isExplicit == isExplicit) { + bool expected = true; + if (ptr->inactive.compare_exchange_strong(expected, /* desired */ false, std::memory_order_acquire, std::memory_order_relaxed)) { + // We caught one! It's been marked as activated, the caller can have it + recycled = true; + return ptr; + } + } + } + + recycled = false; + return add_producer(isExplicit ? 
static_cast(create(this)) : create(this)); + } + + ProducerBase* add_producer(ProducerBase* producer) + { + // Handle failed memory allocation + if (producer == nullptr) { + return nullptr; + } + + producerCount.fetch_add(1, std::memory_order_relaxed); + + // Add it to the lock-free list + auto prevTail = producerListTail.load(std::memory_order_relaxed); + do { + producer->next = prevTail; + } while (!producerListTail.compare_exchange_weak(prevTail, producer, std::memory_order_release, std::memory_order_relaxed)); + +#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG + if (producer->isExplicit) { + auto prevTailExplicit = explicitProducers.load(std::memory_order_relaxed); + do { + static_cast(producer)->nextExplicitProducer = prevTailExplicit; + } while (!explicitProducers.compare_exchange_weak(prevTailExplicit, static_cast(producer), std::memory_order_release, std::memory_order_relaxed)); + } + else { + auto prevTailImplicit = implicitProducers.load(std::memory_order_relaxed); + do { + static_cast(producer)->nextImplicitProducer = prevTailImplicit; + } while (!implicitProducers.compare_exchange_weak(prevTailImplicit, static_cast(producer), std::memory_order_release, std::memory_order_relaxed)); + } +#endif + + return producer; + } + + void reown_producers() + { + // After another instance is moved-into/swapped-with this one, all the + // producers we stole still think their parents are the other queue. + // So fix them up! 
+ for (auto ptr = producerListTail.load(std::memory_order_relaxed); ptr != nullptr; ptr = ptr->next_prod()) { + ptr->parent = this; + } + } + + + ////////////////////////////////// + // Implicit producer hash + ////////////////////////////////// + + struct ImplicitProducerKVP + { + std::atomic key; + ImplicitProducer* value; // No need for atomicity since it's only read by the thread that sets it in the first place + + ImplicitProducerKVP() : value(nullptr) { } + + ImplicitProducerKVP(ImplicitProducerKVP&& other) MOODYCAMEL_NOEXCEPT + { + key.store(other.key.load(std::memory_order_relaxed), std::memory_order_relaxed); + value = other.value; + } + + inline ImplicitProducerKVP& operator=(ImplicitProducerKVP&& other) MOODYCAMEL_NOEXCEPT + { + swap(other); + return *this; + } + + inline void swap(ImplicitProducerKVP& other) MOODYCAMEL_NOEXCEPT + { + if (this != &other) { + details::swap_relaxed(key, other.key); + std::swap(value, other.value); + } + } + }; + + template + friend void moodycamel::swap(typename ConcurrentQueue::ImplicitProducerKVP&, typename ConcurrentQueue::ImplicitProducerKVP&) MOODYCAMEL_NOEXCEPT; + + struct ImplicitProducerHash + { + size_t capacity; + ImplicitProducerKVP* entries; + ImplicitProducerHash* prev; + }; + + inline void populate_initial_implicit_producer_hash() + { + MOODYCAMEL_CONSTEXPR_IF (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) { + return; + } + else { + implicitProducerHashCount.store(0, std::memory_order_relaxed); + auto hash = &initialImplicitProducerHash; + hash->capacity = INITIAL_IMPLICIT_PRODUCER_HASH_SIZE; + hash->entries = &initialImplicitProducerHashEntries[0]; + for (size_t i = 0; i != INITIAL_IMPLICIT_PRODUCER_HASH_SIZE; ++i) { + initialImplicitProducerHashEntries[i].key.store(details::invalid_thread_id, std::memory_order_relaxed); + } + hash->prev = nullptr; + implicitProducerHash.store(hash, std::memory_order_relaxed); + } + } + + void swap_implicit_producer_hashes(ConcurrentQueue& other) + { + MOODYCAMEL_CONSTEXPR_IF 
(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) { + return; + } + else { + // Swap (assumes our implicit producer hash is initialized) + initialImplicitProducerHashEntries.swap(other.initialImplicitProducerHashEntries); + initialImplicitProducerHash.entries = &initialImplicitProducerHashEntries[0]; + other.initialImplicitProducerHash.entries = &other.initialImplicitProducerHashEntries[0]; + + details::swap_relaxed(implicitProducerHashCount, other.implicitProducerHashCount); + + details::swap_relaxed(implicitProducerHash, other.implicitProducerHash); + if (implicitProducerHash.load(std::memory_order_relaxed) == &other.initialImplicitProducerHash) { + implicitProducerHash.store(&initialImplicitProducerHash, std::memory_order_relaxed); + } + else { + ImplicitProducerHash* hash; + for (hash = implicitProducerHash.load(std::memory_order_relaxed); hash->prev != &other.initialImplicitProducerHash; hash = hash->prev) { + continue; + } + hash->prev = &initialImplicitProducerHash; + } + if (other.implicitProducerHash.load(std::memory_order_relaxed) == &initialImplicitProducerHash) { + other.implicitProducerHash.store(&other.initialImplicitProducerHash, std::memory_order_relaxed); + } + else { + ImplicitProducerHash* hash; + for (hash = other.implicitProducerHash.load(std::memory_order_relaxed); hash->prev != &initialImplicitProducerHash; hash = hash->prev) { + continue; + } + hash->prev = &other.initialImplicitProducerHash; + } + } + } + + // Only fails (returns nullptr) if memory allocation fails + ImplicitProducer* get_or_add_implicit_producer() + { + // Note that since the data is essentially thread-local (key is thread ID), + // there's a reduced need for fences (memory ordering is already consistent + // for any individual thread), except for the current table itself. + + // Start by looking for the thread ID in the current and all previous hash tables. 
+ // If it's not found, it must not be in there yet, since this same thread would + // have added it previously to one of the tables that we traversed. + + // Code and algorithm adapted from http://preshing.com/20130605/the-worlds-simplest-lock-free-hash-table + +#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODHASH + debug::DebugLock lock(implicitProdMutex); +#endif + + auto id = details::thread_id(); + auto hashedId = details::hash_thread_id(id); + + auto mainHash = implicitProducerHash.load(std::memory_order_acquire); + assert(mainHash != nullptr); // silence clang-tidy and MSVC warnings (hash cannot be null) + for (auto hash = mainHash; hash != nullptr; hash = hash->prev) { + // Look for the id in this hash + auto index = hashedId; + while (true) { // Not an infinite loop because at least one slot is free in the hash table + index &= hash->capacity - 1; + + auto probedKey = hash->entries[index].key.load(std::memory_order_relaxed); + if (probedKey == id) { + // Found it! If we had to search several hashes deep, though, we should lazily add it + // to the current main hash table to avoid the extended search next time. + // Note there's guaranteed to be room in the current hash table since every subsequent + // table implicitly reserves space for all previous tables (there's only one + // implicitProducerHashCount). 
+ auto value = hash->entries[index].value; + if (hash != mainHash) { + index = hashedId; + while (true) { + index &= mainHash->capacity - 1; + probedKey = mainHash->entries[index].key.load(std::memory_order_relaxed); + auto empty = details::invalid_thread_id; +#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED + auto reusable = details::invalid_thread_id2; + if ((probedKey == empty && mainHash->entries[index].key.compare_exchange_strong(empty, id, std::memory_order_relaxed, std::memory_order_relaxed)) || + (probedKey == reusable && mainHash->entries[index].key.compare_exchange_strong(reusable, id, std::memory_order_acquire, std::memory_order_acquire))) { +#else + if ((probedKey == empty && mainHash->entries[index].key.compare_exchange_strong(empty, id, std::memory_order_relaxed, std::memory_order_relaxed))) { +#endif + mainHash->entries[index].value = value; + break; + } + ++index; + } + } + + return value; + } + if (probedKey == details::invalid_thread_id) { + break; // Not in this hash table + } + ++index; + } + } + + // Insert! + auto newCount = 1 + implicitProducerHashCount.fetch_add(1, std::memory_order_relaxed); + while (true) { + // NOLINTNEXTLINE(clang-analyzer-core.NullDereference) + if (newCount >= (mainHash->capacity >> 1) && !implicitProducerHashResizeInProgress.test_and_set(std::memory_order_acquire)) { + // We've acquired the resize lock, try to allocate a bigger hash table. + // Note the acquire fence synchronizes with the release fence at the end of this block, and hence when + // we reload implicitProducerHash it must be the most recent version (it only gets changed within this + // locked block). 
+ mainHash = implicitProducerHash.load(std::memory_order_acquire); + if (newCount >= (mainHash->capacity >> 1)) { + auto newCapacity = mainHash->capacity << 1; + while (newCount >= (newCapacity >> 1)) { + newCapacity <<= 1; + } + auto raw = static_cast((Traits::malloc)(sizeof(ImplicitProducerHash) + std::alignment_of::value - 1 + sizeof(ImplicitProducerKVP) * newCapacity)); + if (raw == nullptr) { + // Allocation failed + implicitProducerHashCount.fetch_sub(1, std::memory_order_relaxed); + implicitProducerHashResizeInProgress.clear(std::memory_order_relaxed); + return nullptr; + } + + auto newHash = new (raw) ImplicitProducerHash; + newHash->capacity = static_cast(newCapacity); + newHash->entries = reinterpret_cast(details::align_for(raw + sizeof(ImplicitProducerHash))); + for (size_t i = 0; i != newCapacity; ++i) { + new (newHash->entries + i) ImplicitProducerKVP; + newHash->entries[i].key.store(details::invalid_thread_id, std::memory_order_relaxed); + } + newHash->prev = mainHash; + implicitProducerHash.store(newHash, std::memory_order_release); + implicitProducerHashResizeInProgress.clear(std::memory_order_release); + mainHash = newHash; + } + else { + implicitProducerHashResizeInProgress.clear(std::memory_order_release); + } + } + + // If it's < three-quarters full, add to the old one anyway so that we don't have to wait for the next table + // to finish being allocated by another thread (and if we just finished allocating above, the condition will + // always be true) + if (newCount < (mainHash->capacity >> 1) + (mainHash->capacity >> 2)) { + bool recycled; + auto producer = static_cast(recycle_or_create_producer(false, recycled)); + if (producer == nullptr) { + implicitProducerHashCount.fetch_sub(1, std::memory_order_relaxed); + return nullptr; + } + if (recycled) { + implicitProducerHashCount.fetch_sub(1, std::memory_order_relaxed); + } + +#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED + producer->threadExitListener.callback = 
&ConcurrentQueue::implicit_producer_thread_exited_callback; + producer->threadExitListener.userData = producer; + details::ThreadExitNotifier::subscribe(&producer->threadExitListener); +#endif + + auto index = hashedId; + while (true) { + index &= mainHash->capacity - 1; + auto probedKey = mainHash->entries[index].key.load(std::memory_order_relaxed); + + auto empty = details::invalid_thread_id; +#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED + auto reusable = details::invalid_thread_id2; + if ((probedKey == empty && mainHash->entries[index].key.compare_exchange_strong(empty, id, std::memory_order_relaxed, std::memory_order_relaxed)) || + (probedKey == reusable && mainHash->entries[index].key.compare_exchange_strong(reusable, id, std::memory_order_acquire, std::memory_order_acquire))) { +#else + if ((probedKey == empty && mainHash->entries[index].key.compare_exchange_strong(empty, id, std::memory_order_relaxed, std::memory_order_relaxed))) { +#endif + mainHash->entries[index].value = producer; + break; + } + ++index; + } + return producer; + } + + // Hmm, the old hash is quite full and somebody else is busy allocating a new one. + // We need to wait for the allocating thread to finish (if it succeeds, we add, if not, + // we try to allocate ourselves). 
+ mainHash = implicitProducerHash.load(std::memory_order_acquire); + } + } + +#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED + void implicit_producer_thread_exited(ImplicitProducer* producer) + { + // Remove from thread exit listeners + details::ThreadExitNotifier::unsubscribe(&producer->threadExitListener); + + // Remove from hash +#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODHASH + debug::DebugLock lock(implicitProdMutex); +#endif + auto hash = implicitProducerHash.load(std::memory_order_acquire); + assert(hash != nullptr); // The thread exit listener is only registered if we were added to a hash in the first place + auto id = details::thread_id(); + auto hashedId = details::hash_thread_id(id); + details::thread_id_t probedKey; + + // We need to traverse all the hashes just in case other threads aren't on the current one yet and are + // trying to add an entry thinking there's a free slot (because they reused a producer) + for (; hash != nullptr; hash = hash->prev) { + auto index = hashedId; + do { + index &= hash->capacity - 1; + probedKey = hash->entries[index].key.load(std::memory_order_relaxed); + if (probedKey == id) { + hash->entries[index].key.store(details::invalid_thread_id2, std::memory_order_release); + break; + } + ++index; + } while (probedKey != details::invalid_thread_id); // Can happen if the hash has changed but we weren't put back in it yet, or if we weren't added to this hash in the first place + } + + // Mark the queue as being recyclable + producer->inactive.store(true, std::memory_order_release); + } + + static void implicit_producer_thread_exited_callback(void* userData) + { + auto producer = static_cast(userData); + auto queue = producer->parent; + queue->implicit_producer_thread_exited(producer); + } +#endif + + ////////////////////////////////// + // Utility functions + ////////////////////////////////// + + template + static inline void* aligned_malloc(size_t size) + { + MOODYCAMEL_CONSTEXPR_IF (std::alignment_of::value <= 
std::alignment_of::value) + return (Traits::malloc)(size); + else { + size_t alignment = std::alignment_of::value; + void* raw = (Traits::malloc)(size + alignment - 1 + sizeof(void*)); + if (!raw) + return nullptr; + char* ptr = details::align_for(reinterpret_cast(raw) + sizeof(void*)); + *(reinterpret_cast(ptr) - 1) = raw; + return ptr; + } + } + + template + static inline void aligned_free(void* ptr) + { + MOODYCAMEL_CONSTEXPR_IF (std::alignment_of::value <= std::alignment_of::value) + return (Traits::free)(ptr); + else + (Traits::free)(ptr ? *(reinterpret_cast(ptr) - 1) : nullptr); + } + + template + static inline U* create_array(size_t count) + { + assert(count > 0); + U* p = static_cast(aligned_malloc(sizeof(U) * count)); + if (p == nullptr) + return nullptr; + + for (size_t i = 0; i != count; ++i) + new (p + i) U(); + return p; + } + + template + static inline void destroy_array(U* p, size_t count) + { + if (p != nullptr) { + assert(count > 0); + for (size_t i = count; i != 0; ) + (p + --i)->~U(); + } + aligned_free(p); + } + + template + static inline U* create() + { + void* p = aligned_malloc(sizeof(U)); + return p != nullptr ? new (p) U : nullptr; + } + + template + static inline U* create(A1&& a1) + { + void* p = aligned_malloc(sizeof(U)); + return p != nullptr ? 
new (p) U(std::forward(a1)) : nullptr; + } + + template + static inline void destroy(U* p) + { + if (p != nullptr) + p->~U(); + aligned_free(p); + } + +private: + std::atomic producerListTail; + std::atomic producerCount; + + std::atomic initialBlockPoolIndex; + Block* initialBlockPool; + size_t initialBlockPoolSize; + +#ifndef MCDBGQ_USEDEBUGFREELIST + FreeList freeList; +#else + debug::DebugFreeList freeList; +#endif + + std::atomic implicitProducerHash; + std::atomic implicitProducerHashCount; // Number of slots logically used + ImplicitProducerHash initialImplicitProducerHash; + std::array initialImplicitProducerHashEntries; + std::atomic_flag implicitProducerHashResizeInProgress; + + std::atomic nextExplicitConsumerId; + std::atomic globalExplicitConsumerOffset; + +#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODHASH + debug::DebugMutex implicitProdMutex; +#endif + +#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG + std::atomic explicitProducers; + std::atomic implicitProducers; +#endif +}; + + +template +ProducerToken::ProducerToken(ConcurrentQueue& queue) + : producer(queue.recycle_or_create_producer(true)) +{ + if (producer != nullptr) { + producer->token = this; + } +} + +template +ProducerToken::ProducerToken(BlockingConcurrentQueue& queue) + : producer(reinterpret_cast*>(&queue)->recycle_or_create_producer(true)) +{ + if (producer != nullptr) { + producer->token = this; + } +} + +template +ConsumerToken::ConsumerToken(ConcurrentQueue& queue) + : itemsConsumedFromCurrent(0), currentProducer(nullptr), desiredProducer(nullptr) +{ + initialOffset = queue.nextExplicitConsumerId.fetch_add(1, std::memory_order_release); + lastKnownGlobalOffset = static_cast(-1); +} + +template +ConsumerToken::ConsumerToken(BlockingConcurrentQueue& queue) + : itemsConsumedFromCurrent(0), currentProducer(nullptr), desiredProducer(nullptr) +{ + initialOffset = reinterpret_cast*>(&queue)->nextExplicitConsumerId.fetch_add(1, std::memory_order_release); + lastKnownGlobalOffset = static_cast(-1); +} + 
+template +inline void swap(ConcurrentQueue& a, ConcurrentQueue& b) MOODYCAMEL_NOEXCEPT +{ + a.swap(b); +} + +inline void swap(ProducerToken& a, ProducerToken& b) MOODYCAMEL_NOEXCEPT +{ + a.swap(b); +} + +inline void swap(ConsumerToken& a, ConsumerToken& b) MOODYCAMEL_NOEXCEPT +{ + a.swap(b); +} + +template +inline void swap(typename ConcurrentQueue::ImplicitProducerKVP& a, typename ConcurrentQueue::ImplicitProducerKVP& b) MOODYCAMEL_NOEXCEPT +{ + a.swap(b); +} + +} + +#if defined(_MSC_VER) && (!defined(_HAS_CXX17) || !_HAS_CXX17) +#pragma warning(pop) +#endif + +#if defined(__GNUC__) && !defined(__INTEL_COMPILER) +#pragma GCC diagnostic pop +#endif diff --git a/distributions.cc b/distributions.cc index ce939e7..6cb9532 100644 --- a/distributions.cc +++ b/distributions.cc @@ -32,3 +32,4 @@ double generate_uniform(double lambda) { if (lambda <= 0.0) return 0; return 1.0 / lambda; } + diff --git a/libzstd.a b/libzstd.a new file mode 100644 index 0000000..3be4d40 Binary files /dev/null and b/libzstd.a differ diff --git a/lightweightsemaphore.h b/lightweightsemaphore.h new file mode 100644 index 0000000..b0f24e1 --- /dev/null +++ b/lightweightsemaphore.h @@ -0,0 +1,411 @@ +// Provides an efficient implementation of a semaphore (LightweightSemaphore). +// This is an extension of Jeff Preshing's sempahore implementation (licensed +// under the terms of its separate zlib license) that has been adapted and +// extended by Cameron Desrochers. + +#pragma once + +#include // For std::size_t +#include +#include // For std::make_signed + +#if defined(_WIN32) +// Avoid including windows.h in a header; we only need a handful of +// items, so we'll redeclare them here (this is relatively safe since +// the API generally has to remain stable between Windows versions). +// I know this is an ugly hack but it still beats polluting the global +// namespace with thousands of generic names or adding a .cpp for nothing. 
+extern "C" { + struct _SECURITY_ATTRIBUTES; + __declspec(dllimport) void* __stdcall CreateSemaphoreW(_SECURITY_ATTRIBUTES* lpSemaphoreAttributes, long lInitialCount, long lMaximumCount, const wchar_t* lpName); + __declspec(dllimport) int __stdcall CloseHandle(void* hObject); + __declspec(dllimport) unsigned long __stdcall WaitForSingleObject(void* hHandle, unsigned long dwMilliseconds); + __declspec(dllimport) int __stdcall ReleaseSemaphore(void* hSemaphore, long lReleaseCount, long* lpPreviousCount); +} +#elif defined(__MACH__) +#include +#elif defined(__unix__) +#include +#endif + +namespace moodycamel +{ +namespace details +{ + +// Code in the mpmc_sema namespace below is an adaptation of Jeff Preshing's +// portable + lightweight semaphore implementations, originally from +// https://github.com/preshing/cpp11-on-multicore/blob/master/common/sema.h +// LICENSE: +// Copyright (c) 2015 Jeff Preshing +// +// This software is provided 'as-is', without any express or implied +// warranty. In no event will the authors be held liable for any damages +// arising from the use of this software. +// +// Permission is granted to anyone to use this software for any purpose, +// including commercial applications, and to alter it and redistribute it +// freely, subject to the following restrictions: +// +// 1. The origin of this software must not be misrepresented; you must not +// claim that you wrote the original software. If you use this software +// in a product, an acknowledgement in the product documentation would be +// appreciated but is not required. +// 2. Altered source versions must be plainly marked as such, and must not be +// misrepresented as being the original software. +// 3. This notice may not be removed or altered from any source distribution. 
+#if defined(_WIN32) +class Semaphore +{ +private: + void* m_hSema; + + Semaphore(const Semaphore& other) MOODYCAMEL_DELETE_FUNCTION; + Semaphore& operator=(const Semaphore& other) MOODYCAMEL_DELETE_FUNCTION; + +public: + Semaphore(int initialCount = 0) + { + assert(initialCount >= 0); + const long maxLong = 0x7fffffff; + m_hSema = CreateSemaphoreW(nullptr, initialCount, maxLong, nullptr); + assert(m_hSema); + } + + ~Semaphore() + { + CloseHandle(m_hSema); + } + + bool wait() + { + const unsigned long infinite = 0xffffffff; + return WaitForSingleObject(m_hSema, infinite) == 0; + } + + bool try_wait() + { + return WaitForSingleObject(m_hSema, 0) == 0; + } + + bool timed_wait(std::uint64_t usecs) + { + return WaitForSingleObject(m_hSema, (unsigned long)(usecs / 1000)) == 0; + } + + void signal(int count = 1) + { + while (!ReleaseSemaphore(m_hSema, count, nullptr)); + } +}; +#elif defined(__MACH__) +//--------------------------------------------------------- +// Semaphore (Apple iOS and OSX) +// Can't use POSIX semaphores due to http://lists.apple.com/archives/darwin-kernel/2009/Apr/msg00010.html +//--------------------------------------------------------- +class Semaphore +{ +private: + semaphore_t m_sema; + + Semaphore(const Semaphore& other) MOODYCAMEL_DELETE_FUNCTION; + Semaphore& operator=(const Semaphore& other) MOODYCAMEL_DELETE_FUNCTION; + +public: + Semaphore(int initialCount = 0) + { + assert(initialCount >= 0); + kern_return_t rc = semaphore_create(mach_task_self(), &m_sema, SYNC_POLICY_FIFO, initialCount); + assert(rc == KERN_SUCCESS); + (void)rc; + } + + ~Semaphore() + { + semaphore_destroy(mach_task_self(), m_sema); + } + + bool wait() + { + return semaphore_wait(m_sema) == KERN_SUCCESS; + } + + bool try_wait() + { + return timed_wait(0); + } + + bool timed_wait(std::uint64_t timeout_usecs) + { + mach_timespec_t ts; + ts.tv_sec = static_cast(timeout_usecs / 1000000); + ts.tv_nsec = static_cast((timeout_usecs % 1000000) * 1000); + + // added in OSX 10.10: 
https://developer.apple.com/library/prerelease/mac/documentation/General/Reference/APIDiffsMacOSX10_10SeedDiff/modules/Darwin.html + kern_return_t rc = semaphore_timedwait(m_sema, ts); + return rc == KERN_SUCCESS; + } + + void signal() + { + while (semaphore_signal(m_sema) != KERN_SUCCESS); + } + + void signal(int count) + { + while (count-- > 0) + { + while (semaphore_signal(m_sema) != KERN_SUCCESS); + } + } +}; +#elif defined(__unix__) +//--------------------------------------------------------- +// Semaphore (POSIX, Linux) +//--------------------------------------------------------- +class Semaphore +{ +private: + sem_t m_sema; + + Semaphore(const Semaphore& other) MOODYCAMEL_DELETE_FUNCTION; + Semaphore& operator=(const Semaphore& other) MOODYCAMEL_DELETE_FUNCTION; + +public: + Semaphore(int initialCount = 0) + { + assert(initialCount >= 0); + int rc = sem_init(&m_sema, 0, static_cast(initialCount)); + assert(rc == 0); + (void)rc; + } + + ~Semaphore() + { + sem_destroy(&m_sema); + } + + bool wait() + { + // http://stackoverflow.com/questions/2013181/gdb-causes-sem-wait-to-fail-with-eintr-error + int rc; + do { + rc = sem_wait(&m_sema); + } while (rc == -1 && errno == EINTR); + return rc == 0; + } + + bool try_wait() + { + int rc; + do { + rc = sem_trywait(&m_sema); + } while (rc == -1 && errno == EINTR); + return rc == 0; + } + + bool timed_wait(std::uint64_t usecs) + { + struct timespec ts; + const int usecs_in_1_sec = 1000000; + const int nsecs_in_1_sec = 1000000000; + clock_gettime(CLOCK_REALTIME, &ts); + ts.tv_sec += (time_t)(usecs / usecs_in_1_sec); + ts.tv_nsec += (long)(usecs % usecs_in_1_sec) * 1000; + // sem_timedwait bombs if you have more than 1e9 in tv_nsec + // so we have to clean things up before passing it in + if (ts.tv_nsec >= nsecs_in_1_sec) { + ts.tv_nsec -= nsecs_in_1_sec; + ++ts.tv_sec; + } + + int rc; + do { + rc = sem_timedwait(&m_sema, &ts); + } while (rc == -1 && errno == EINTR); + return rc == 0; + } + + void signal() + { + while 
(sem_post(&m_sema) == -1); + } + + void signal(int count) + { + while (count-- > 0) + { + while (sem_post(&m_sema) == -1); + } + } +}; +#else +#error Unsupported platform! (No semaphore wrapper available) +#endif + +} // end namespace details + + +//--------------------------------------------------------- +// LightweightSemaphore +//--------------------------------------------------------- +class LightweightSemaphore +{ +public: + typedef std::make_signed::type ssize_t; + +private: + std::atomic m_count; + details::Semaphore m_sema; + int m_maxSpins; + + bool waitWithPartialSpinning(std::int64_t timeout_usecs = -1) + { + ssize_t oldCount; + int spin = m_maxSpins; + while (--spin >= 0) + { + oldCount = m_count.load(std::memory_order_relaxed); + if ((oldCount > 0) && m_count.compare_exchange_strong(oldCount, oldCount - 1, std::memory_order_acquire, std::memory_order_relaxed)) + return true; + std::atomic_signal_fence(std::memory_order_acquire); // Prevent the compiler from collapsing the loop. + } + oldCount = m_count.fetch_sub(1, std::memory_order_acquire); + if (oldCount > 0) + return true; + if (timeout_usecs < 0) + { + if (m_sema.wait()) + return true; + } + if (timeout_usecs > 0 && m_sema.timed_wait((std::uint64_t)timeout_usecs)) + return true; + // At this point, we've timed out waiting for the semaphore, but the + // count is still decremented indicating we may still be waiting on + // it. So we have to re-adjust the count, but only if the semaphore + // wasn't signaled enough times for us too since then. If it was, we + // need to release the semaphore too. 
+ while (true) + { + oldCount = m_count.load(std::memory_order_acquire); + if (oldCount >= 0 && m_sema.try_wait()) + return true; + if (oldCount < 0 && m_count.compare_exchange_strong(oldCount, oldCount + 1, std::memory_order_relaxed, std::memory_order_relaxed)) + return false; + } + } + + ssize_t waitManyWithPartialSpinning(ssize_t max, std::int64_t timeout_usecs = -1) + { + assert(max > 0); + ssize_t oldCount; + int spin = m_maxSpins; + while (--spin >= 0) + { + oldCount = m_count.load(std::memory_order_relaxed); + if (oldCount > 0) + { + ssize_t newCount = oldCount > max ? oldCount - max : 0; + if (m_count.compare_exchange_strong(oldCount, newCount, std::memory_order_acquire, std::memory_order_relaxed)) + return oldCount - newCount; + } + std::atomic_signal_fence(std::memory_order_acquire); + } + oldCount = m_count.fetch_sub(1, std::memory_order_acquire); + if (oldCount <= 0) + { + if ((timeout_usecs == 0) || (timeout_usecs < 0 && !m_sema.wait()) || (timeout_usecs > 0 && !m_sema.timed_wait((std::uint64_t)timeout_usecs))) + { + while (true) + { + oldCount = m_count.load(std::memory_order_acquire); + if (oldCount >= 0 && m_sema.try_wait()) + break; + if (oldCount < 0 && m_count.compare_exchange_strong(oldCount, oldCount + 1, std::memory_order_relaxed, std::memory_order_relaxed)) + return 0; + } + } + } + if (max > 1) + return 1 + tryWaitMany(max - 1); + return 1; + } + +public: + LightweightSemaphore(ssize_t initialCount = 0, int maxSpins = 10000) : m_count(initialCount), m_maxSpins(maxSpins) + { + assert(initialCount >= 0); + assert(maxSpins >= 0); + } + + bool tryWait() + { + ssize_t oldCount = m_count.load(std::memory_order_relaxed); + while (oldCount > 0) + { + if (m_count.compare_exchange_weak(oldCount, oldCount - 1, std::memory_order_acquire, std::memory_order_relaxed)) + return true; + } + return false; + } + + bool wait() + { + return tryWait() || waitWithPartialSpinning(); + } + + bool wait(std::int64_t timeout_usecs) + { + return tryWait() || 
waitWithPartialSpinning(timeout_usecs); + } + + // Acquires between 0 and (greedily) max, inclusive + ssize_t tryWaitMany(ssize_t max) + { + assert(max >= 0); + ssize_t oldCount = m_count.load(std::memory_order_relaxed); + while (oldCount > 0) + { + ssize_t newCount = oldCount > max ? oldCount - max : 0; + if (m_count.compare_exchange_weak(oldCount, newCount, std::memory_order_acquire, std::memory_order_relaxed)) + return oldCount - newCount; + } + return 0; + } + + // Acquires at least one, and (greedily) at most max + ssize_t waitMany(ssize_t max, std::int64_t timeout_usecs) + { + assert(max >= 0); + ssize_t result = tryWaitMany(max); + if (result == 0 && max > 0) + result = waitManyWithPartialSpinning(max, timeout_usecs); + return result; + } + + ssize_t waitMany(ssize_t max) + { + ssize_t result = waitMany(max, -1); + assert(result > 0); + return result; + } + + void signal(ssize_t count = 1) + { + assert(count >= 0); + ssize_t oldCount = m_count.fetch_add(count, std::memory_order_release); + ssize_t toRelease = -oldCount < count ? -oldCount : count; + if (toRelease > 0) + { + m_sema.signal((int)toRelease); + } + } + + std::size_t availableApprox() const + { + ssize_t count = m_count.load(std::memory_order_relaxed); + return count > 0 ? 
static_cast(count) : 0; + } +}; + +} // end namespace moodycamel diff --git a/mutilate.cc b/mutilate.cc index 426fd05..a1f298a 100644 --- a/mutilate.cc +++ b/mutilate.cc @@ -2,16 +2,23 @@ #include #include #include +#include #include #include #include #include +#include +#include /* Added for the nonblocking socket */ +#include #include #include #include #include #include +#include +#include +namespace fs = std::filesystem; #include #include @@ -20,6 +27,10 @@ #include #include + +#include "common.h" //for zstd +#include "zstd.h" //shippped with mutilate + #include "config.h" #ifdef HAVE_LIBZMQ @@ -37,13 +48,43 @@ #include "log.h" #include "mutilate.h" #include "util.h" +#include "blockingconcurrentqueue.h" +//#include +//#include #define MIN(a,b) ((a) < (b) ? (a) : (b)) +#define hashsize(n) ((unsigned long int)1<<(n)) using namespace std; +using namespace moodycamel; +//using namespace folly; + +int max_n[3] = {0,0,0}; +ifstream kvfile; +pthread_mutex_t flock = PTHREAD_MUTEX_INITIALIZER; + +pthread_mutex_t reader_l; +pthread_cond_t reader_ready; +int reader_not_ready = 1; + +pthread_mutex_t *item_locks; +int item_lock_hashpower = 14; + +map g_key_hist; + +//USPMCQueue g_trace_queue; + +//ConcurrentHashMap cid_rate; +unordered_map cid_rate; +//ConcurrentHashMap> copy_keys; +unordered_map> copy_keys; +unordered_map> wb_keys; +//ConcurrentHashMap> touch_keys; +unordered_map touch_keys; +//ConcurrentHashMap> wb_keys; gengetopt_args_info args; -char random_char[2 * 1024 * 1024]; // Buffer used to generate random values. +char random_char[4 * 1024 * 1024]; // Buffer used to generate random values. #ifdef HAVE_LIBZMQ vector agent_sockets; @@ -55,11 +96,27 @@ struct thread_data { options_t *options; bool master; // Thread #0, not to be confused with agent master. 
#ifdef HAVE_LIBZMQ - zmq::socket_t *socket; + zmq::socket_t *socketz; #endif + int id; + //std::vector*> trace_queue; + std::vector*> *trace_queue; + //std::vector *mutexes; + pthread_mutex_t* g_lock; + std::unordered_map> *g_wb_keys; +}; + +struct reader_data { + //std::vector*> trace_queue; + std::vector*> *trace_queue; + std::vector *mutexes; + string *trace_filename; + int twitter_trace; }; // struct evdns_base *evdns; + +pthread_t pt[1024]; pthread_barrier_t barrier; @@ -70,33 +127,36 @@ void init_random_stuff(); void go(const vector &servers, options_t &options, ConnectionStats &stats #ifdef HAVE_LIBZMQ -, zmq::socket_t* socket = NULL +, zmq::socket_t* socketz = NULL #endif ); +//void do_mutilate(const vector &servers, options_t &options, +// ConnectionStats &stats,std::vector*> trace_queue, bool master = true void do_mutilate(const vector &servers, options_t &options, - ConnectionStats &stats, bool master = true + ConnectionStats &stats,std::vector*> *trace_queue, pthread_mutex_t *g_lock, unordered_map> *g_wb_keys, bool master = true #ifdef HAVE_LIBZMQ -, zmq::socket_t* socket = NULL +, zmq::socket_t* socketz = NULL #endif ); void args_to_options(options_t* options); void* thread_main(void *arg); +void* reader_thread(void *arg); #ifdef HAVE_LIBZMQ -static std::string s_recv (zmq::socket_t &socket) { +static std::string s_recv (zmq::socket_t &socketz) { zmq::message_t message; - socket.recv(&message); + socketz.recv(&message); return std::string(static_cast(message.data()), message.size()); } // Convert string to 0MQ string and send to socket -static bool s_send (zmq::socket_t &socket, const std::string &string) { +static bool s_send (zmq::socket_t &socketz, const std::string &string) { zmq::message_t message(string.size()); memcpy(message.data(), string.data(), string.size()); - return socket.send(message); + return socketz.send(message); } /* @@ -156,17 +216,21 @@ static bool s_send (zmq::socket_t &socket, const std::string &string) { void agent() { 
zmq::context_t context(1); - zmq::socket_t socket(context, ZMQ_REP); - socket.bind((string("tcp://*:")+string(args.agent_port_arg)).c_str()); + zmq::socket_t socketz(context, ZMQ_REP); + if (atoi(args.agent_port_arg) == -1) { + socketz.bind(string("ipc:///tmp/memcached.sock").c_str()); + } else { + socketz.bind((string("tcp://*:")+string(args.agent_port_arg)).c_str()); + } while (true) { zmq::message_t request; - socket.recv(&request); + socketz.recv(&request); zmq::message_t num(sizeof(int)); *((int *) num.data()) = args.threads_arg * args.lambda_mul_arg; - socket.send(num); + socketz.send(num); options_t options; memcpy(&options, request.data(), sizeof(options)); @@ -174,8 +238,8 @@ void agent() { vector servers; for (int i = 0; i < options.server_given; i++) { - servers.push_back(s_recv(socket)); - s_send(socket, "ACK"); + servers.push_back(s_recv(socketz)); + s_send(socketz, "ACK"); } for (auto i: servers) { @@ -184,9 +248,9 @@ void agent() { options.threads = args.threads_arg; - socket.recv(&request); + socketz.recv(&request); options.lambda_denom = *((int *) request.data()); - s_send(socket, "THANKS"); + s_send(socketz, "THANKS"); // V("AGENT SLEEPS"); sleep(1); options.lambda = (double) options.qps / options.lambda_denom * args.lambda_mul_arg; @@ -199,7 +263,7 @@ void agent() { ConnectionStats stats; - go(servers, options, stats, &socket); + go(servers, options, stats, &socketz); AgentStats as; @@ -212,11 +276,11 @@ void agent() { as.stop = stats.stop; as.skips = stats.skips; - string req = s_recv(socket); + string req = s_recv(socketz); // V("req = %s", req.c_str()); request.rebuild(sizeof(as)); memcpy(request.data(), &as, sizeof(as)); - socket.send(request); + socketz.send(request); } } @@ -319,7 +383,7 @@ void finish_agent(ConnectionStats &stats) { * skew. 
*/ -void sync_agent(zmq::socket_t* socket) { +void sync_agent(zmq::socket_t* socketz) { // V("agent: synchronizing"); if (args.agent_given) { @@ -338,16 +402,16 @@ void sync_agent(zmq::socket_t* socket) { if (s_recv(*s).compare(string("ack"))) DIE("sync_agent[M]: out of sync [2]"); } else if (args.agentmode_given) { - if (s_recv(*socket).compare(string("sync_req"))) + if (s_recv(*socketz).compare(string("sync_req"))) DIE("sync_agent[A]: out of sync [1]"); /* The real sync */ - s_send(*socket, "sync"); - if (s_recv(*socket).compare(string("proceed"))) + s_send(*socketz, "sync"); + if (s_recv(*socketz).compare(string("proceed"))) DIE("sync_agent[A]: out of sync [2]"); /* End sync */ - s_send(*socket, "ack"); + s_send(*socketz, "ack"); } // V("agent: synchronized"); @@ -413,6 +477,7 @@ string name_to_ipaddr(string host) { } int main(int argc, char **argv) { + //event_enable_debug_mode(); if (cmdline_parser(argc, argv, &args) != 0) exit(-1); for (unsigned int i = 0; i < args.verbose_given; i++) @@ -445,7 +510,7 @@ int main(int argc, char **argv) { // struct event_base *base; // if ((base = event_base_new()) == NULL) DIE("event_base_new() fail"); - // evthread_use_pthreads(); + //evthread_use_pthreads(); // if ((evdns = evdns_base_new(base, 1)) == 0) DIE("evdns"); @@ -470,8 +535,14 @@ int main(int argc, char **argv) { pthread_barrier_init(&barrier, NULL, options.threads); vector servers; - for (unsigned int s = 0; s < args.server_given; s++) - servers.push_back(name_to_ipaddr(string(args.server_arg[s]))); + for (unsigned int s = 0; s < args.server_given; s++) { + if (options.unix_socket || args.use_shm_given) { + servers.push_back(string(args.server_arg[s])); + } else { + servers.push_back(name_to_ipaddr(string(args.server_arg[s]))); + } + } + ConnectionStats stats; @@ -583,23 +654,61 @@ int main(int argc, char **argv) { if (!args.scan_given && !args.loadonly_given) { stats.print_header(); - stats.print_stats("read", stats.get_sampler); - stats.print_stats("update", 
stats.set_sampler); - stats.print_stats("op_q", stats.op_sampler); + stats.print_stats("read ", stats.get_sampler); + stats.print_stats("read_l1 ", stats.get_l1_sampler); + stats.print_stats("read_l2 ", stats.get_l2_sampler); + stats.print_stats("update_l1", stats.set_l1_sampler); + stats.print_stats("update_l2", stats.set_l2_sampler); + stats.print_stats("op_q ", stats.op_sampler); - int total = stats.gets + stats.sets; + int total = stats.gets_l1 + stats.gets_l2 + stats.sets_l1 + stats.sets_l2; printf("\nTotal QPS = %.1f (%d / %.1fs)\n", total / (stats.stop - stats.start), total, stats.stop - stats.start); + + int rtotal = stats.gets + stats.sets; + printf("\nTotal RPS = %.1f (%d / %.1fs)\n", + rtotal / (stats.stop - stats.start), + rtotal, stats.stop - stats.start); if (args.search_given && peak_qps > 0.0) printf("Peak QPS = %.1f\n", peak_qps); printf("\n"); - printf("Misses = %" PRIu64 " (%.1f%%)\n", stats.get_misses, - (double) stats.get_misses/stats.gets*100); + printf("GET Misses = %" PRIu64 " (%.1f%%)\n", stats.get_misses, + (double) stats.get_misses/(stats.gets)*100); + if (servers.size() == 2) { + int64_t additional = 0; + if (stats.delete_misses_l2 > 0) { + additional = stats.delete_misses_l2 - stats.set_excl_hits_l1; + fprintf(stderr,"delete misses_l2 %lu, delete hits_l2 %lu, excl_set_l1_hits: %lu\n",stats.delete_misses_l2,stats.delete_hits_l2,stats.set_excl_hits_l1); + if (additional < 0) { + fprintf(stderr,"additional misses is neg! 
%ld\n",additional); + additional = 0; + } + } + + for (int i = 0; i < 40; i++) { + fprintf(stderr,"class %d, gets: %lu, sets: %lu\n",i,stats.gets_cid[i],stats.sets_cid[i]); + } + //printf("Misses (L1) = %" PRIu64 " (%.1f%%)\n", stats.get_misses_l1 + stats.set_misses_l1, + // (double) (stats.get_misses_l1 + stats.set_misses_l1) /(stats.gets + stats.sets)*100); + printf("Misses (L1) = %" PRIu64 " (%.1f%%)\n", stats.get_misses_l1 , + (double) (stats.get_misses_l1) /(stats.gets)*100); + printf("SET Misses (L1) = %" PRIu64 " (%.1f%%)\n", stats.set_misses_l1 , + (double) (stats.set_misses_l1) /(stats.sets)*100); + //printf("Misses (L2) = %" PRIu64 " (%.1f%%)\n", stats.get_misses_l2, + // (double) (stats.get_misses_l2) /(stats.gets)*100); + printf("L2 Writes = %" PRIu64 " (%.1f%%)\n", stats.sets_l2, + (double) stats.sets_l2/(stats.gets+stats.sets)*100); + + printf("Incl WBs = %" PRIu64 " (%.1f%%)\n", stats.incl_wbs, + (double) stats.incl_wbs/(stats.gets+stats.sets)*100); + printf("Excl WBs = %" PRIu64 " (%.1f%%)\n", stats.excl_wbs, + (double) stats.excl_wbs/(stats.gets+stats.sets)*100); + } printf("Skipped TXs = %" PRIu64 " (%.1f%%)\n\n", stats.skips, (double) stats.skips / total * 100); @@ -642,7 +751,7 @@ int main(int argc, char **argv) { void go(const vector& servers, options_t& options, ConnectionStats &stats #ifdef HAVE_LIBZMQ -, zmq::socket_t* socket +, zmq::socket_t* socketz #endif ) { #ifdef HAVE_LIBZMQ @@ -651,8 +760,53 @@ void go(const vector& servers, options_t& options, } #endif + //std::vector*> trace_queue; // = (ConcurrentQueue**)malloc(sizeof(ConcurrentQueue) + std::vector*> *trace_queue = new std::vector*>(); + // = (ConcurrentQueue**)malloc(sizeof(ConcurrentQueue) + //std::vector *mutexes = new std::vector(); + pthread_mutex_t *g_lock = (pthread_mutex_t*)malloc(sizeof(pthread_mutex_t)); + *g_lock = PTHREAD_MUTEX_INITIALIZER; + + unordered_map> *g_wb_keys = new unordered_map>(); + + for (int i = 0; i <= options.apps; i++) { + // 
//trace_queue.push_back(new ConcurrentQueue(2000000)); + // pthread_mutex_t *lock = (pthread_mutex_t*)malloc(sizeof(pthread_mutex_t)); + // *lock = PTHREAD_MUTEX_INITIALIZER; + // mutexes->push_back(lock); + trace_queue->push_back(new std::queue()); + } + pthread_mutex_init(&reader_l, NULL); + pthread_cond_init(&reader_ready, NULL); + + //ConcurrentQueue *trace_queue = new ConcurrentQueue(20000000); + struct reader_data *rdata = (struct reader_data*)malloc(sizeof(struct reader_data)); + rdata->trace_queue = trace_queue; + //rdata->mutexes = mutexes; + rdata->twitter_trace = options.twitter_trace; + pthread_t rtid; + if (options.read_file) { + rdata->trace_filename = new string(options.file_name); + int error = 0; + if ((error = pthread_create(&rtid, NULL,reader_thread,rdata)) != 0) { + printf("reader thread failed to be created with error code %d\n", error); + } + pthread_mutex_lock(&reader_l); + while (reader_not_ready) + pthread_cond_wait(&reader_ready,&reader_l); + pthread_mutex_unlock(&reader_l); + + } + + /* initialize item locks */ + uint32_t item_lock_count = hashsize(item_lock_hashpower); + item_locks = (pthread_mutex_t*)calloc(item_lock_count, sizeof(pthread_mutex_t)); + for (size_t i = 0; i < item_lock_count; i++) { + pthread_mutex_init(&item_locks[i], NULL); + } + + if (options.threads > 1) { - pthread_t pt[options.threads]; struct thread_data td[options.threads]; #ifdef __clang__ vector* ts = static_cast*>(alloca(sizeof(vector) * options.threads)); @@ -664,10 +818,15 @@ void go(const vector& servers, options_t& options, int current_cpu = -1; #endif + for (int t = 0; t < options.threads; t++) { td[t].options = &options; + td[t].id = t; + td[t].trace_queue = trace_queue; + td[t].g_lock = g_lock; + td[t].g_wb_keys = g_wb_keys; #ifdef HAVE_LIBZMQ - td[t].socket = socket; + td[t].socketz = socketz; #endif if (t == 0) td[t].master = true; else td[t].master = false; @@ -711,24 +870,31 @@ void go(const vector& servers, options_t& options, if 
(pthread_create(&pt[t], &attr, thread_main, &td[t])) DIE("pthread_create() failed"); + usleep(t); } for (int t = 0; t < options.threads; t++) { ConnectionStats *cs; if (pthread_join(pt[t], (void**) &cs)) DIE("pthread_join() failed"); stats.accumulate(*cs); + delete cs; } + for (int i = 1; i <= 2; i++) { + fprintf(stderr,"max issue buf n[%d]: %u\n",i,max_n[i]); + } + //delete trace_queue; + } else if (options.threads == 1) { - do_mutilate(servers, options, stats, true + do_mutilate(servers, options, stats, trace_queue, g_lock, g_wb_keys, true #ifdef HAVE_LIBZMQ -, socket +, socketz #endif ); } else { #ifdef HAVE_LIBZMQ if (args.agent_given) { - sync_agent(socket); + sync_agent(socketz); } #endif } @@ -746,14 +912,427 @@ void go(const vector& servers, options_t& options, #endif } +int stick_this_thread_to_core(int core_id) { + int num_cores = sysconf(_SC_NPROCESSORS_ONLN); + if (core_id < 0 || core_id >= num_cores) + return EINVAL; + + cpu_set_t cpuset; + CPU_ZERO(&cpuset); + CPU_SET(core_id, &cpuset); + + pthread_t current_thread = pthread_self(); + return pthread_setaffinity_np(current_thread, sizeof(cpu_set_t), &cpuset); +} + +bool hasEnding (string const &fullString, string const &ending) { + if (fullString.length() >= ending.length()) { + return (0 == fullString.compare (fullString.length() - ending.length(), ending.length(), ending)); + } else { + return false; + } +} + +static char *get_stream(ZSTD_DCtx* dctx, FILE *fin, size_t const buffInSize, void* const buffIn, size_t const buffOutSize, void* const buffOut) { + /* This loop assumes that the input file is one or more concatenated zstd + * streams. This example won't work if there is trailing non-zstd data at + * the end, but streaming decompression in general handles this case. + * ZSTD_decompressStream() returns 0 exactly when the frame is completed, + * and doesn't consume input after the frame. 
+ */ + size_t const toRead = buffInSize; + size_t read; + size_t lastRet = 0; + int isEmpty = 1; + if ( (read = fread_orDie(buffIn, toRead, fin)) ) { + isEmpty = 0; + ZSTD_inBuffer input = { buffIn, read, 0 }; + /* Given a valid frame, zstd won't consume the last byte of the frame + * until it has flushed all of the decompressed data of the frame. + * Therefore, instead of checking if the return code is 0, we can + * decompress just check if input.pos < input.size. + */ + char *trace = (char*)malloc(buffOutSize*2); + memset(trace,0,buffOutSize+1); + size_t tracelen = buffOutSize+1; + size_t total = 0; + while (input.pos < input.size) { + ZSTD_outBuffer output = { buffOut, buffOutSize, 0 }; + /* The return code is zero if the frame is complete, but there may + * be multiple frames concatenated together. Zstd will automatically + * reset the context when a frame is complete. Still, calling + * ZSTD_DCtx_reset() can be useful to reset the context to a clean + * state, for instance if the last decompression call returned an + * error. + */ + + size_t const ret = ZSTD_decompressStream(dctx, &output , &input); + + if (output.pos + total > tracelen) { + trace = (char*)realloc(trace,(output.pos+total+1)); + tracelen = (output.pos+total+1); + } + strncat(trace,(const char*)buffOut,output.pos); + total += output.pos; + + lastRet = ret; + } + int idx = total; + while (trace[idx] != '\n') { + idx--; + } + trace[idx] = 0; + trace[idx+1] = 0; + return trace; + + } + + if (isEmpty) { + fprintf(stderr, "input is empty\n"); + return NULL; + } + + if (lastRet != 0) { + /* The last return value from ZSTD_decompressStream did not end on a + * frame, but we reached the end of the file! We assume this is an + * error, and the input was truncated. 
+ */ + fprintf(stderr, "EOF before end of stream: %zu\n", lastRet); + exit(1); + } + return NULL; + +} + +void* reader_thread(void *arg) { + struct reader_data *rdata = (struct reader_data *) arg; + //std::vector*> trace_queue = (std::vector*>) rdata->trace_queue; + std::vector*> *trace_queue = (std::vector*>*) rdata->trace_queue; + // std::vector *mutexes = (std::vector*) rdata->mutexes; + int twitter_trace = rdata->twitter_trace; + string fn = *(rdata->trace_filename); + srand(time(NULL)); + if (hasEnding(fn,".zst")) { + string blobfile = fs::path( fn ).filename(); + blobfile.erase(blobfile.length()-4); + blobfile.insert(0,"/dev/shm/"); + blobfile.append(".data"); + int do_blob = 0; + int blob = 0; + if (do_blob) { + blob = open(blobfile.c_str(),O_CREAT | O_APPEND | O_RDWR, S_IRWXU); + } + //init + const char *filename = fn.c_str(); + FILE* const fin = fopen_orDie(filename, "rb"); + size_t const buffInSize = ZSTD_DStreamInSize()*1000; + void* const buffIn = malloc_orDie(buffInSize); + size_t const buffOutSize = ZSTD_DStreamOutSize()*1000; + void* const buffOut = malloc_orDie(buffOutSize); + + map key_hist; + ZSTD_DCtx* const dctx = ZSTD_createDCtx(); + //CHECK(dctx != NULL, "ZSTD_createDCtx() failed!"); + //char *leftover = malloc(buffOutSize); + //memset(leftover,0,buffOutSize); + //char *trace = (char*)decompress(filename); + uint64_t nwrites = 0; + uint64_t nout = 1; + int batch = 0; + int cappid = 1; + fprintf(stderr,"%lu trace queues for connections\n",trace_queue->size()); + char *trace = get_stream(dctx, fin, buffInSize, buffIn, buffOutSize, buffOut); + while (trace != NULL) { + char *ftrace = trace; + char *line = NULL; + char *line_p = (char*)calloc(2048,sizeof(char)); + while ((line = strsep(&trace,"\n"))) { + strncpy(line_p,line,2048); + string full_line(line); + //check the appid + int appid = 0; + int first = 1; + if (full_line.length() > 10) { + + if (trace_queue->size() > 0) { + stringstream ss(full_line); + string rT; + string rApp; + string rKey; 
+ string rOp; + string rvaluelen; + Operation *Op = new Operation; + if (twitter_trace == 1) { + string rKeySize; + size_t n = std::count(full_line.begin(), full_line.end(), ','); + if (n == 6) { + getline( ss, rT, ',' ); + getline( ss, rKey, ',' ); + getline( ss, rKeySize, ',' ); + getline( ss, rvaluelen, ',' ); + getline( ss, rApp, ',' ); + getline( ss, rOp, ',' ); + if (rOp.compare("get") == 0) { + Op->type = Operation::GET; + } else if (rOp.compare("set") == 0) { + Op->type = Operation::SET; + } + if (rvaluelen.compare("") == 0 || rvaluelen.size() < 1 || rvaluelen.empty()) { + continue; + } + appid = cappid; + if (nout % 1000 == 0) { + cappid++; + cappid = cappid % trace_queue->size(); + if (cappid == 0) cappid = 1; + } + //appid = stoi(rApp) % trace_queue->size(); + if (appid == 0) appid = 1; + //appid = (rand() % (trace_queue->size()-1)) + 1; + //if (appid == 0) appid = 1; + + + } else { + continue; + } + + } + else if (twitter_trace == 2) { + size_t n = std::count(full_line.begin(), full_line.end(), ','); + if (n == 4) { + getline( ss, rT, ','); + getline( ss, rApp, ','); + getline( ss, rOp, ',' ); + getline( ss, rKey, ',' ); + getline( ss, rvaluelen, ',' ); + int ot = stoi(rOp); + switch (ot) { + case 1: + Op->type = Operation::GET; + break; + case 2: + Op->type = Operation::SET; + break; + } + appid = (stoi(rApp)) % trace_queue->size(); + if (appid == 0) appid = 1; + //appid = (nout) % trace_queue->size(); + } else { + continue; + } + } + else if (twitter_trace == 3) { + size_t n = std::count(full_line.begin(), full_line.end(), ','); + if (n == 4) { + getline( ss, rT, ','); + getline( ss, rApp, ','); + getline( ss, rOp, ',' ); + getline( ss, rKey, ',' ); + getline( ss, rvaluelen, ',' ); + int ot = stoi(rOp); + switch (ot) { + case 1: + Op->type = Operation::GET; + break; + case 2: + Op->type = Operation::SET; + break; + } + //if (first) { + // appid = (rand() % (trace_queue->size()-1)) + 1; + // if (appid == 0) appid = 1; + // first = 0; + //} + //batch++; 
+ appid = (rand() % (trace_queue->size()-1)) + 1; + if (appid == 0) appid = 1; + } else { + continue; + } + } + else if (twitter_trace == 4) { + size_t n = std::count(full_line.begin(), full_line.end(), ','); + if (n == 4) { + getline( ss, rT, ','); + getline( ss, rKey, ',' ); + getline( ss, rOp, ',' ); + getline( ss, rvaluelen, ',' ); + int ot = stoi(rOp); + switch (ot) { + case 1: + Op->type = Operation::GET; + break; + case 2: + Op->type = Operation::SET; + break; + } + if (rvaluelen == "0") { + rvaluelen = "50000"; + } + + appid = (rand() % (trace_queue->size()-1)) + 1; + if (appid == 0) appid = 1; + } else { + continue; + } + } + int vl = stoi(rvaluelen); + if (appid < (int)trace_queue->size() && vl < 524000 && vl > 1) { + Op->valuelen = vl; + strncpy(Op->key,rKey.c_str(),255);; + if (Op->type == Operation::GET) { + //find when was last read + Operation *last_op = key_hist[rKey]; + if (last_op != NULL) { + last_op->future = 1; //THE FUTURE IS NOW + Op->curr = 1; + Op->future = 0; + key_hist[rKey] = Op; + g_key_hist[rKey] = 1; + } else { + //first ref + Op->curr = 1; + Op->future = 0; + key_hist[rKey] = Op; + g_key_hist[rKey] = 0; + } + } + Op->appid = appid; + trace_queue->at(appid)->push(Op); + //g_trace_queue.enqueue(Op); + //if (twitter_trace == 3) { // && batch == 2) { + // appid = (rand() % (trace_queue->size()-1)) + 1; + // if (appid == 0) appid = 1; + // batch = 0; + //} + } + } else { + fprintf(stderr,"big error!\n"); + } + } + //bool res = trace_queue[appid]->try_enqueue(full_line); + //while (!res) { + // //usleep(10); + // //res = trace_queue[appid]->try_enqueue(full_line); + // nwrites++; + //} + nout++; + if (nout % 1000000 == 0) fprintf(stderr,"decompressed requests: %lu, waits: %lu\n",nout,nwrites); + + } + free(line_p); + free(ftrace); + trace = get_stream(dctx, fin, buffInSize, buffIn, buffOutSize, buffOut); + } + + for (int i = 0; i < 10; i++) { + for (int j = 0; j < (int)trace_queue->size(); j++) { + //trace_queue[j]->enqueue(eof); + 
Operation *eof = new Operation; + eof->type = Operation::SASL; + eof->appid = j; + trace_queue->at(j)->push(eof); + //g_trace_queue.enqueue(eof); + if (i == 0) { + fprintf(stderr,"appid %d, tq size: %ld\n",j,trace_queue->at(j)->size()); + } + } + } + if (do_blob) { + for (int i = 0; i < (int)trace_queue->size(); i++) { + queue tmp = *(trace_queue->at(i)); + while (!tmp.empty()) { + Operation *Op = tmp.front(); + int br = write(blob,(void*)(Op),sizeof(Operation)); + if (br != sizeof(Operation)) { + fprintf(stderr,"error writing op!\n"); + } + tmp.pop(); + } + + } + } + + pthread_mutex_lock(&reader_l); + if (reader_not_ready) { + reader_not_ready = 0; + } + pthread_mutex_unlock(&reader_l); + pthread_cond_signal(&reader_ready); + if (trace) { + free(trace); + } + ZSTD_freeDCtx(dctx); + fclose_orDie(fin); + free(buffIn); + free(buffOut); + + + } else if (hasEnding(fn,".data")) { + ifstream trace_file (fn, ios::in | ios::binary); + uint32_t treqs = 0; + char *ops = (char*)malloc(sizeof(Operation)*1000000); + Operation *optr = (Operation*)(ops); + while (trace_file.good()) { + trace_file.read((char*)ops,sizeof(Operation)*1000000); + int tbytes = trace_file.gcount(); + int tops = tbytes/sizeof(Operation); + for (int i = 0; i < tops; i++) { + Operation *op = (Operation*)optr; + string rKey = string(op->key); + g_key_hist[rKey] = 0; + if (op->future) { + g_key_hist[rKey] = 1; + } + trace_queue->at(op->appid)->push(op); + treqs++; + if (treqs % 1000000 == 0) fprintf(stderr,"loaded requests: %u\n",treqs); + optr++; + + } + optr = (Operation*)ops; + } + trace_file.close(); + + pthread_mutex_lock(&reader_l); + if (reader_not_ready) { + reader_not_ready = 0; + } + pthread_mutex_unlock(&reader_l); + pthread_cond_signal(&reader_ready); + + } + //else { + + //ifstream trace_file; + //trace_file.open(rdata->trace_filename); + //while (trace_file.good()) { + // string line; + // getline(trace_file,line); + // trace_queue->enqueue(line); + //} + //string eof = "EOF"; + //for (int i = 
0; i < 1000; i++) { + // trace_queue->enqueue(eof); + //} + //} + + return NULL; +} + void* thread_main(void *arg) { struct thread_data *td = (struct thread_data *) arg; - + int num_cores = sysconf(_SC_NPROCESSORS_ONLN); + //int res = stick_this_thread_to_core(td->id % num_cores); + //if (res != 0) { + // DIE("pthread_attr_setaffinity_np(%d) failed: %s", + // td->id, strerror(res)); + //} ConnectionStats *cs = new ConnectionStats(); - do_mutilate(*td->servers, *td->options, *cs, td->master + do_mutilate(*td->servers, *td->options, *cs, td->trace_queue, td->g_lock, td->g_wb_keys, td->master #ifdef HAVE_LIBZMQ -, td->socket +, td->socketz #endif ); @@ -761,9 +1340,9 @@ void* thread_main(void *arg) { } void do_mutilate(const vector& servers, options_t& options, - ConnectionStats& stats, bool master + ConnectionStats& stats, vector*> *trace_queue, pthread_mutex_t* g_lock, unordered_map> *g_wb_keys, bool master #ifdef HAVE_LIBZMQ -, zmq::socket_t* socket +, zmq::socket_t* socketz #endif ) { int loop_flag = @@ -775,136 +1354,251 @@ void do_mutilate(const vector& servers, options_t& options, struct evdns_base *evdns; struct event_config *config; + if ((config = event_config_new()) == NULL) DIE("event_config_new() fail"); #ifdef HAVE_DECL_EVENT_BASE_FLAG_PRECISE_TIMER if (event_config_set_flag(config, EVENT_BASE_FLAG_PRECISE_TIMER)) - DIE("event_config_set_flag(EVENT_BASE_FLAG_PRECISE_TIMER) fail"); + DIE("event_config_set_flag(EVENT_BASE_FLAG_PRECISE_TIMER) fail"); #endif if ((base = event_base_new_with_config(config)) == NULL) DIE("event_base_new() fail"); - // evthread_use_pthreads(); + //evthread_use_pthreads(); if ((evdns = evdns_base_new(base, 1)) == 0) DIE("evdns"); // event_base_priority_init(base, 2); // FIXME: May want to move this to after all connections established. - double start = get_time(); - double now = start; - vector connections; - vector server_lead; - for (auto s: servers) { - // Split args.server_arg[s] into host:port using strtok(). 
- char *s_copy = new char[s.length() + 1]; - strcpy(s_copy, s.c_str()); + if (servers.size() == 1) { + vector connections; + vector server_lead; + for (auto s: servers) { + // Split args.server_arg[s] into host:port using strtok(). + char *s_copy = new char[s.length() + 1]; + strcpy(s_copy, s.c_str()); + + char *h_ptr = strtok_r(s_copy, ":", &saveptr); + char *p_ptr = strtok_r(NULL, ":", &saveptr); + + if (h_ptr == NULL) DIE("strtok(.., \":\") failed to parse %s", s.c_str()); + + string hostname = h_ptr; + string port = "11211"; + if (p_ptr) port = p_ptr; + + delete[] s_copy; + + int conns = args.measure_connections_given ? args.measure_connections_arg : + options.connections; + + srand(time(NULL)); + for (int c = 0; c <= conns; c++) { + Connection* conn = new Connection(base, evdns, hostname, port, options, + //NULL,//trace_queue, + args.agentmode_given ? false : + true); + int tries = 120; + int connected = 0; + int s = 2; + for (int i = 0; i < tries; i++) { + int ret = conn->do_connect(); + if (ret) { + connected = 1; + fprintf(stderr,"thread %lu, conn: %d, connected!\n",pthread_self(),c+1); + break; + } + int d = s + rand() % 100; + //s = s + d; + + //fprintf(stderr,"conn: %d, sleeping %d\n",c,d); + sleep(d); + } + if (connected) { + //fprintf(stderr,"cid %d gets trace_queue\nfirst: %s",conn->get_cid(),trace_queue->at(conn->get_cid())->front().c_str()); + //conn->set_queue(trace_queue->at(conn->get_cid())); + //conn->set_lock(mutexes->at(conn->get_cid())); + connections.push_back(conn); + } else { + fprintf(stderr,"conn: %d, not connected!!\n",c); + + } + if (c == 0) server_lead.push_back(conn); + } + } + double start = get_time(); + double now = start; + + // Wait for all Connections to become IDLE. + while (1) { + // FIXME: If all connections become ready before event_base_loop + // is called, this will deadlock. 
+ event_base_loop(base, EVLOOP_ONCE); - char *h_ptr = strtok_r(s_copy, ":", &saveptr); - char *p_ptr = strtok_r(NULL, ":", &saveptr); + bool restart = false; + for (Connection *conn: connections) + if (!conn->is_ready()) restart = true; - if (h_ptr == NULL) DIE("strtok(.., \":\") failed to parse %s", s.c_str()); + if (restart) continue; + else break; + } - string hostname = h_ptr; - string port = "11211"; - if (p_ptr) port = p_ptr; + // Load database on lead connection for each server. + if (!options.noload) { + V("Loading database."); - delete[] s_copy; + for (auto c: server_lead) c->start_loading(); - int conns = args.measure_connections_given ? args.measure_connections_arg : - options.connections; + // Wait for all Connections to become IDLE. + while (1) { + // FIXME: If all connections become ready before event_base_loop + // is called, this will deadlock. + event_base_loop(base, EVLOOP_ONCE); - for (int c = 0; c < conns; c++) { - Connection* conn = new Connection(base, evdns, hostname, port, options, - args.agentmode_given ? false : - true); - connections.push_back(conn); - if (c == 0) server_lead.push_back(conn); + bool restart = false; + for (Connection *conn: connections) + if (!conn->is_ready()) restart = true; + + if (restart) continue; + else break; + } } - } - // Wait for all Connections to become IDLE. - while (1) { - // FIXME: If all connections become ready before event_base_loop - // is called, this will deadlock. - event_base_loop(base, EVLOOP_ONCE); + if (options.loadonly) { + evdns_base_free(evdns, 0); + event_base_free(base); + return; + } - bool restart = false; - for (Connection *conn: connections) - if (!conn->is_ready()) restart = true; + // FIXME: Remove. Not needed, testing only. + // // FIXME: Synchronize start_time here across threads/nodes. + // pthread_barrier_wait(&barrier); - if (restart) continue; - else break; - } + // Warmup connection. 
+ if (options.warmup > 0) { + if (master) V("Warmup start."); - // Load database on lead connection for each server. - if (!options.noload) { - V("Loading database."); +#ifdef HAVE_LIBZMQ + if (args.agent_given || args.agentmode_given) { + if (master) V("Synchronizing."); - for (auto c: server_lead) c->start_loading(); + // 1. thread barrier: make sure our threads ready before syncing agents + // 2. sync agents: all threads across all agents are now ready + // 3. thread barrier: don't release our threads until all agents ready + pthread_barrier_wait(&barrier); + if (master) sync_agent(socketz); + pthread_barrier_wait(&barrier); - // Wait for all Connections to become IDLE. - while (1) { - // FIXME: If all connections become ready before event_base_loop - // is called, this will deadlock. - event_base_loop(base, EVLOOP_ONCE); + if (master) V("Synchronized."); + } +#endif + + int old_time = options.time; + // options.time = 1; + + start = get_time(); + for (Connection *conn: connections) { + conn->start_time = start; + conn->options.time = options.warmup; + conn->start(); // Kick the Connection into motion. + } + + while (1) { + event_base_loop(base, loop_flag); + + //#ifdef USE_CLOCK_GETTIME + // now = get_time(); + //#else + struct timeval now_tv; + event_base_gettimeofday_cached(base, &now_tv); + now = tv_to_double(&now_tv); + //#endif + + bool restart = false; + for (Connection *conn: connections) + if (!conn->check_exit_condition(now)) + restart = true; + + if (restart) continue; + else break; + } bool restart = false; for (Connection *conn: connections) if (!conn->is_ready()) restart = true; - if (restart) continue; - else break; + if (restart) { + + // Wait for all Connections to become IDLE. + while (1) { + // FIXME: If there were to use EVLOOP_ONCE and all connections + // become ready before event_base_loop is called, this will + // deadlock. We should check for IDLE before calling + // event_base_loop. 
+ event_base_loop(base, EVLOOP_ONCE); // EVLOOP_NONBLOCK); + + bool restart = false; + for (Connection *conn: connections) + if (!conn->is_ready()) restart = true; + + if (restart) continue; + else break; + } + } + + for (Connection *conn: connections) { + conn->reset(); + conn->options.time = old_time; + } + + if (master) V("Warmup stop."); } - } - if (options.loadonly) { - evdns_base_free(evdns, 0); - event_base_free(base); - return; - } - // FIXME: Remove. Not needed, testing only. - // // FIXME: Synchronize start_time here across threads/nodes. - // pthread_barrier_wait(&barrier); + // FIXME: Synchronize start_time here across threads/nodes. + pthread_barrier_wait(&barrier); - // Warmup connection. - if (options.warmup > 0) { - if (master) V("Warmup start."); + if (master && args.wait_given) { + if (get_time() < boot_time + args.wait_arg) { + double t = (boot_time + args.wait_arg)-get_time(); + V("Sleeping %.1fs for -W.", t); + sleep_time(t); + } + } #ifdef HAVE_LIBZMQ if (args.agent_given || args.agentmode_given) { if (master) V("Synchronizing."); - // 1. thread barrier: make sure our threads ready before syncing agents - // 2. sync agents: all threads across all agents are now ready - // 3. thread barrier: don't release our threads until all agents ready pthread_barrier_wait(&barrier); - if (master) sync_agent(socket); + if (master) sync_agent(socketz); pthread_barrier_wait(&barrier); if (master) V("Synchronized."); } #endif - int old_time = options.time; - // options.time = 1; + if (master && !args.scan_given && !args.search_given) + V("started at %f", get_time()); start = get_time(); for (Connection *conn: connections) { conn->start_time = start; - conn->options.time = options.warmup; conn->start(); // Kick the Connection into motion. } + // V("Start = %f", start); + + // Main event loop. 
while (1) { event_base_loop(base, loop_flag); - //#ifdef USE_CLOCK_GETTIME - // now = get_time(); + //#if USE_CLOCK_GETTIME + // now = get_time(); //#else struct timeval now_tv; event_base_gettimeofday_cached(base, &now_tv); @@ -920,108 +1614,613 @@ void do_mutilate(const vector& servers, options_t& options, else break; } - bool restart = false; - for (Connection *conn: connections) - if (!conn->is_ready()) restart = true; + if (master && !args.scan_given && !args.search_given) + V("stopped at %f options.time = %d", get_time(), options.time); + + // Tear-down and accumulate stats. + for (Connection *conn: connections) { + stats.accumulate(conn->stats); + delete conn; + } + + stats.start = start; + stats.stop = now; - if (restart) { + event_config_free(config); + evdns_base_free(evdns, 0); + event_base_free(base); + } else if (servers.size() == 2 && !(args.approx_given || args.approx_batch_given || args.use_shm_given || args.use_shm_batch_given)) { + vector connections; + vector server_lead; + string hostname1 = servers[0]; + string hostname2 = servers[1]; + string port = "11211"; + + int conns = args.measure_connections_given ? 
args.measure_connections_arg : + options.connections; + + srand(time(NULL)); + for (int c = 0; c < conns; c++) { + + int fd1 = -1; + + if ( (fd1 = socket(AF_UNIX, SOCK_STREAM, 0)) == -1) { + perror("socket error"); + exit(-1); + } + + struct sockaddr_un sin1; + memset(&sin1, 0, sizeof(sin1)); + sin1.sun_family = AF_LOCAL; + strcpy(sin1.sun_path, hostname1.c_str()); + + fcntl(fd1, F_SETFL, O_NONBLOCK); /* Change the socket into non-blocking state */ + int addrlen; + addrlen = sizeof(sin1); + + int max_tries = 50; + int n_tries = 0; + int s = 10; + while (connect(fd1, (struct sockaddr*)&sin1, addrlen) == -1) { + perror("l1 connect error"); + if (n_tries++ > max_tries) { + fprintf(stderr,"conn l1 %d unable to connect after sleep for %d\n",c+1,s); + exit(-1); + } + int d = s + rand() % 100; + usleep(d); + s = (int)((double)s*1.25); + } + + int fd2 = -1; + if ( (fd2 = socket(AF_UNIX, SOCK_STREAM, 0)) == -1) { + perror("l2 socket error"); + exit(-1); + } + struct sockaddr_un sin2; + memset(&sin2, 0, sizeof(sin2)); + sin2.sun_family = AF_LOCAL; + strcpy(sin2.sun_path, hostname2.c_str()); + fcntl(fd2, F_SETFL, O_NONBLOCK); /* Change the socket into non-blocking state */ + addrlen = sizeof(sin2); + n_tries = 0; + s = 10; + while (connect(fd2, (struct sockaddr*)&sin2, addrlen) == -1) { + perror("l2 connect error"); + if (n_tries++ > max_tries) { + fprintf(stderr,"conn l2 %d unable to connect after sleep for %d\n",c+1,s); + exit(-1); + } + int d = s + rand() % 100; + usleep(d); + s = (int)((double)s*1.25); + } + + + ConnectionMulti* conn = new ConnectionMulti(base, evdns, + hostname1, hostname2, port, options,args.agentmode_given ? 
false : true, fd1, fd2); + + int connected = 0; + if (conn) { + connected = 1; + } + int cid = conn->get_cid(); + + if (connected) { + fprintf(stderr,"cid %d gets l1 fd %d l2 fd %d\n",cid,fd1,fd2); + fprintf(stderr,"cid %d gets trace_queue\nfirst: %s\n",cid,trace_queue->at(cid)->front()->key); + if (g_lock != NULL) { + conn->set_g_wbkeys(g_wb_keys); + conn->set_lock(g_lock); + } + conn->set_queue(trace_queue->at(cid)); + connections.push_back(conn); + } else { + fprintf(stderr,"conn multi: %d, not connected!!\n",c); + + } + } + + // wait for all threads to reach here + pthread_barrier_wait(&barrier); + + fprintf(stderr,"thread %ld gtg\n",pthread_self()); // Wait for all Connections to become IDLE. while (1) { - // FIXME: If there were to use EVLOOP_ONCE and all connections - // become ready before event_base_loop is called, this will - // deadlock. We should check for IDLE before calling - // event_base_loop. - event_base_loop(base, EVLOOP_ONCE); // EVLOOP_NONBLOCK); + // FIXME: If all connections become ready before event_base_loop + // is called, this will deadlock. + event_base_loop(base, EVLOOP_ONCE); bool restart = false; - for (Connection *conn: connections) + for (ConnectionMulti *conn: connections) if (!conn->is_ready()) restart = true; if (restart) continue; else break; } + + + + double start = get_time(); + double now = start; + for (ConnectionMulti *conn: connections) { + conn->start_time = start; + conn->start(); // Kick the Connection into motion. + } + //fprintf(stderr,"Start = %f\n", start); + + // Main event loop. 
+ while (1) { + event_base_loop(base, loop_flag); + struct timeval now_tv; + event_base_gettimeofday_cached(base, &now_tv); + now = tv_to_double(&now_tv); + + bool restart = false; + for (ConnectionMulti *conn: connections) { + if (!conn->check_exit_condition(now)) { + restart = true; + } + } + if (restart) continue; + else break; + } - for (Connection *conn: connections) { - conn->reset(); - conn->options.time = old_time; + + // V("Start = %f", start); + + if (master && !args.scan_given && !args.search_given) + V("stopped at %f options.time = %d", get_time(), options.time); + + // Tear-down and accumulate stats. + for (ConnectionMulti *conn: connections) { + stats.accumulate(conn->stats); + delete conn; } - if (master) V("Warmup stop."); - } + stats.start = start; + stats.stop = now; + + event_config_free(config); + evdns_base_free(evdns, 0); + event_base_free(base); + } else if (servers.size() == 2 && args.approx_given) { + vector connections; + vector server_lead; + + string hostname1 = servers[0]; + string hostname2 = servers[1]; + string port = "11211"; + + int conns = args.measure_connections_given ? args.measure_connections_arg : + options.connections; + + srand(time(NULL)); + for (int c = 0; c < conns; c++) { + + int fd1 = -1; + if ( (fd1 = socket(AF_UNIX, SOCK_STREAM, 0)) == -1) { + perror("socket error"); + exit(-1); + } - // FIXME: Synchronize start_time here across threads/nodes. 
- pthread_barrier_wait(&barrier); + struct sockaddr_un sin1; + memset(&sin1, 0, sizeof(sin1)); + sin1.sun_family = AF_LOCAL; + strcpy(sin1.sun_path, hostname1.c_str()); + + fcntl(fd1, F_SETFL, O_NONBLOCK); /* Change the socket into non-blocking state */ + int addrlen; + addrlen = sizeof(sin1); + + int max_tries = 50; + int n_tries = 0; + int s = 10; + while (connect(fd1, (struct sockaddr*)&sin1, addrlen) == -1) { + perror("l1 connect error"); + if (n_tries++ > max_tries) { + fprintf(stderr,"conn l1 %d unable to connect after sleep for %d\n",c+1,s); + exit(-1); + } + int d = s + rand() % 100; + usleep(d); + s = (int)((double)s*1.25); + } + + int fd2 = -1; + if ( (fd2 = socket(AF_UNIX, SOCK_STREAM, 0)) == -1) { + perror("l2 socket error"); + exit(-1); + } + struct sockaddr_un sin2; + memset(&sin2, 0, sizeof(sin2)); + sin2.sun_family = AF_LOCAL; + strcpy(sin2.sun_path, hostname2.c_str()); + fcntl(fd2, F_SETFL, O_NONBLOCK); /* Change the socket into non-blocking state */ + addrlen = sizeof(sin2); + n_tries = 0; + s = 10; + while (connect(fd2, (struct sockaddr*)&sin2, addrlen) == -1) { + perror("l2 connect error"); + if (n_tries++ > max_tries) { + fprintf(stderr,"conn l2 %d unable to connect after sleep for %d\n",c+1,s); + exit(-1); + } + int d = s + rand() % 100; + usleep(d); + s = (int)((double)s*1.25); + } - if (master && args.wait_given) { - if (get_time() < boot_time + args.wait_arg) { - double t = (boot_time + args.wait_arg)-get_time(); - V("Sleeping %.1fs for -W.", t); - sleep_time(t); + + ConnectionMultiApprox* conn = new ConnectionMultiApprox(base, evdns, + hostname1, hostname2, port, options,args.agentmode_given ? 
false : true, fd1, fd2); + + int connected = 0; + if (conn) { + connected = 1; + } + int cid = conn->get_cid(); + + if (connected) { + fprintf(stderr,"cid %d gets l1 fd %d l2 fd %d\n",cid,fd1,fd2); + fprintf(stderr,"cid %d gets trace_queue\nfirst: %s\n",cid,trace_queue->at(cid)->front()->key); + if (g_lock != NULL) { + conn->set_g_wbkeys(g_wb_keys); + conn->set_lock(g_lock); + } + conn->set_queue(trace_queue->at(cid)); + connections.push_back(conn); + } else { + fprintf(stderr,"conn multi: %d, not connected!!\n",c); + + } } - } + + // wait for all threads to reach here + pthread_barrier_wait(&barrier); -#ifdef HAVE_LIBZMQ - if (args.agent_given || args.agentmode_given) { - if (master) V("Synchronizing."); + fprintf(stderr,"thread %ld gtg\n",pthread_self()); + // Wait for all Connections to become IDLE. + while (1) { + // FIXME: If all connections become ready before event_base_loop + // is called, this will deadlock. + event_base_loop(base, EVLOOP_ONCE); + + bool restart = false; + for (ConnectionMultiApprox *conn: connections) + if (!conn->is_ready()) restart = true; + if (restart) continue; + else break; + } + + + + double start = get_time(); + double now = start; + for (ConnectionMultiApprox *conn: connections) { + conn->start_time = start; + conn->start(); // Kick the Connection into motion. + } + //fprintf(stderr,"Start = %f\n", start); + + // Main event loop. + while (1) { + event_base_loop(base, loop_flag); + struct timeval now_tv; + event_base_gettimeofday_cached(base, &now_tv); + now = tv_to_double(&now_tv); + + bool restart = false; + for (ConnectionMultiApprox *conn: connections) { + if (!conn->check_exit_condition(now)) { + restart = true; + } + } + if (restart) continue; + else break; + + } + + + // V("Start = %f", start); + + if (master && !args.scan_given && !args.search_given) + V("stopped at %f options.time = %d", get_time(), options.time); + + // Tear-down and accumulate stats. 
+ for (ConnectionMultiApprox *conn: connections) { + stats.accumulate(conn->stats); + delete conn; + } + + stats.start = start; + stats.stop = now; + + event_config_free(config); + evdns_base_free(evdns, 0); + event_base_free(base); + + } else if (servers.size() == 2 && args.approx_batch_given) { + vector connections; + vector server_lead; + + string hostname1 = servers[0]; + string hostname2 = servers[1]; + string port = "11211"; + + int conns = args.measure_connections_given ? args.measure_connections_arg : + options.connections; + + srand(time(NULL)); + for (int c = 0; c < conns; c++) { + + int fd1 = -1; + + if ( (fd1 = socket(AF_UNIX, SOCK_STREAM, 0)) == -1) { + perror("socket error"); + exit(-1); + } + + struct sockaddr_un sin1; + memset(&sin1, 0, sizeof(sin1)); + sin1.sun_family = AF_LOCAL; + strcpy(sin1.sun_path, hostname1.c_str()); + + fcntl(fd1, F_SETFL, O_NONBLOCK); /* Change the socket into non-blocking state */ + int addrlen; + addrlen = sizeof(sin1); + + int max_tries = 50; + int n_tries = 0; + int s = 10; + while (connect(fd1, (struct sockaddr*)&sin1, addrlen) == -1) { + perror("l1 connect error"); + if (n_tries++ > max_tries) { + fprintf(stderr,"conn l1 %d unable to connect after sleep for %d\n",c+1,s); + exit(-1); + } + int d = s + rand() % 100; + usleep(d); + s = (int)((double)s*1.25); + } + + int fd2 = -1; + if ( (fd2 = socket(AF_UNIX, SOCK_STREAM, 0)) == -1) { + perror("l2 socket error"); + exit(-1); + } + struct sockaddr_un sin2; + memset(&sin2, 0, sizeof(sin2)); + sin2.sun_family = AF_LOCAL; + strcpy(sin2.sun_path, hostname2.c_str()); + fcntl(fd2, F_SETFL, O_NONBLOCK); /* Change the socket into non-blocking state */ + addrlen = sizeof(sin2); + n_tries = 0; + s = 10; + while (connect(fd2, (struct sockaddr*)&sin2, addrlen) == -1) { + perror("l2 connect error"); + if (n_tries++ > max_tries) { + fprintf(stderr,"conn l2 %d unable to connect after sleep for %d\n",c+1,s); + exit(-1); + } + int d = s + rand() % 100; + usleep(d); + s = 
(int)((double)s*1.25); + } + + + ConnectionMultiApproxBatch* conn = new ConnectionMultiApproxBatch(base, evdns, + hostname1, hostname2, port, options,args.agentmode_given ? false : true, fd1, fd2); + + int connected = 0; + if (conn) { + connected = 1; + } + int cid = conn->get_cid(); + + if (connected) { + fprintf(stderr,"cid %d gets l1 fd %d l2 fd %d\n",cid,fd1,fd2); + fprintf(stderr,"cid %d gets trace_queue\nfirst: %s\n",cid,trace_queue->at(cid)->front()->key); + if (g_lock != NULL) { + conn->set_g_wbkeys(g_wb_keys); + conn->set_lock(g_lock); + } + conn->set_queue(trace_queue->at(cid)); + connections.push_back(conn); + } else { + fprintf(stderr,"conn multi: %d, not connected!!\n",c); + + } + } + + // wait for all threads to reach here pthread_barrier_wait(&barrier); - if (master) sync_agent(socket); + + fprintf(stderr,"thread %ld gtg\n",pthread_self()); + // Wait for all Connections to become IDLE. + while (1) { + // FIXME: If all connections become ready before event_base_loop + // is called, this will deadlock. + event_base_loop(base, EVLOOP_ONCE); + + bool restart = false; + for (ConnectionMultiApproxBatch *conn: connections) + if (!conn->is_ready()) restart = true; + + if (restart) continue; + else break; + } + + + + double start = get_time(); + double now = start; + for (ConnectionMultiApproxBatch *conn: connections) { + conn->start_time = start; + conn->start(); // Kick the Connection into motion. + } + //fprintf(stderr,"Start = %f\n", start); + + // Main event loop. 
+ while (1) { + event_base_loop(base, loop_flag); + struct timeval now_tv; + event_base_gettimeofday_cached(base, &now_tv); + now = tv_to_double(&now_tv); + + bool restart = false; + for (ConnectionMultiApproxBatch *conn: connections) { + if (!conn->check_exit_condition(now)) { + restart = true; + } + } + if (restart) continue; + else { + for (ConnectionMultiApproxBatch *conn: connections) { + fprintf(stderr,"tid %ld, cid: %d\n",pthread_self(),conn->get_cid()); + } + break; + } + + } + + + // V("Start = %f", start); + + if (master && !args.scan_given && !args.search_given) + V("stopped at %f options.time = %d", get_time(), options.time); + + // Tear-down and accumulate stats. + for (ConnectionMultiApproxBatch *conn: connections) { + stats.accumulate(conn->stats); + delete conn; + } + + stats.start = start; + stats.stop = now; + + event_config_free(config); + evdns_base_free(evdns, 0); + event_base_free(base); + } else if (servers.size() == 2 && args.use_shm_given) { + vector connections; + + int conns = args.measure_connections_given ? args.measure_connections_arg : + options.connections; + + srand(time(NULL)); + for (int c = 0; c < conns; c++) { + + + ConnectionMultiApproxShm* conn = new ConnectionMultiApproxShm(options,args.agentmode_given ? 
false : true); + int connected = 0; + if (conn && conn->do_connect()) { + connected = 1; + } + int cid = conn->get_cid(); + + if (connected) { + fprintf(stderr,"cid %d gets trace_queue\nfirst: %s\n",cid,trace_queue->at(cid)->front()->key); + if (g_lock != NULL) { + conn->set_g_wbkeys(g_wb_keys); + conn->set_lock(g_lock); + } + conn->set_queue(trace_queue->at(cid)); + connections.push_back(conn); + } else { + fprintf(stderr,"conn multi: %d, not connected!!\n",c); + + } + } + + // wait for all threads to reach here pthread_barrier_wait(&barrier); + double start = get_time(); + fprintf(stderr,"Start = %f\n", start); + double now = start; + for (ConnectionMultiApproxShm *conn: connections) { + conn->start_time = now; + conn->drive_write_machine_shm(now); + } - if (master) V("Synchronized."); - } -#endif - if (master && !args.scan_given && !args.search_given) - V("started at %f", get_time()); - start = get_time(); - for (Connection *conn: connections) { - conn->start_time = start; - conn->start(); // Kick the Connection into motion. - } + if (master && !args.scan_given && !args.search_given) + V("stopped at %f options.time = %d", get_time(), options.time); - // V("Start = %f", start); + // Tear-down and accumulate stats. + for (ConnectionMultiApproxShm *conn: connections) { + stats.accumulate(conn->stats); + delete conn; + } + double stop = get_time(); + fprintf(stderr,"Stop = %f\n", stop); + stats.start = start; + stats.stop = stop; - // Main event loop. - while (1) { - event_base_loop(base, loop_flag); - //#if USE_CLOCK_GETTIME - // now = get_time(); - //#else - struct timeval now_tv; - event_base_gettimeofday_cached(base, &now_tv); - now = tv_to_double(&now_tv); - //#endif + } else if (servers.size() == 2 && args.use_shm_batch_given) { + vector connections; - bool restart = false; - for (Connection *conn: connections) - if (!conn->check_exit_condition(now)) - restart = true; + int conns = args.measure_connections_given ? 
args.measure_connections_arg : + options.connections; - if (restart) continue; - else break; - } + srand(time(NULL)); + for (int c = 0; c < conns; c++) { - if (master && !args.scan_given && !args.search_given) - V("stopped at %f options.time = %d", get_time(), options.time); - // Tear-down and accumulate stats. - for (Connection *conn: connections) { - stats.accumulate(conn->stats); - delete conn; - } + ConnectionMultiApproxBatchShm* conn = new ConnectionMultiApproxBatchShm(options,args.agentmode_given ? false : true); + int connected = 0; + if (conn && conn->do_connect()) { + connected = 1; + } + int cid = conn->get_cid(); + + if (connected) { + fprintf(stderr,"cid %d gets trace_queue\nfirst: %s\n",cid,trace_queue->at(cid)->front()->key); + if (g_lock != NULL) { + conn->set_g_wbkeys(g_wb_keys); + conn->set_lock(g_lock); + } + conn->set_queue(trace_queue->at(cid)); + connections.push_back(conn); + } else { + fprintf(stderr,"conn multi: %d, not connected!!\n",c); + + } + } + + // wait for all threads to reach here + pthread_barrier_wait(&barrier); + double start = get_time(); + fprintf(stderr,"Start = %f\n", start); + double now = start; + for (ConnectionMultiApproxBatchShm *conn: connections) { + conn->start_time = now; + conn->drive_write_machine_shm(now); + } + - stats.start = start; - stats.stop = now; - event_config_free(config); - evdns_base_free(evdns, 0); - event_base_free(base); + if (master && !args.scan_given && !args.search_given) + V("stopped at %f options.time = %d", get_time(), options.time); + + // Tear-down and accumulate stats. 
+ for (ConnectionMultiApproxBatchShm *conn: connections) { + stats.accumulate(conn->stats); + delete conn; + } + double stop = get_time(); + fprintf(stderr,"Stop = %f\n", stop); + stats.start = start; + stats.stop = stop; + + + } } void args_to_options(options_t* options) { @@ -1032,6 +2231,16 @@ void args_to_options(options_t* options) { options->threads = args.threads_arg; options->server_given = args.server_given; options->roundrobin = args.roundrobin_given; + options->apps = args.apps_arg; + options->rand_admit = args.rand_admit_arg; + options->threshold = args.threshold_arg; + options->wb_all = args.wb_all_arg; + options->ratelimit = args.ratelimit_given; + options->v1callback = args.v1callback_given; + if (args.inclusives_given) { + memset(options->inclusives,0,256); + strncpy(options->inclusives,args.inclusives_arg,256); + } int connections = options->connections; if (options->roundrobin) { @@ -1058,7 +2267,38 @@ void args_to_options(options_t* options) { // else options->records = args.records_arg / options->server_given; + options->queries = args.queries_arg / options->server_given; + + options->misswindow = args.misswindow_arg; + + options->use_assoc = args.assoc_given; + options->assoc = args.assoc_arg; + options->twitter_trace = args.twitter_trace_arg; + + options->unix_socket = args.unix_socket_given; + options->miss_through = args.miss_through_given; + options->successful_queries = args.successful_given; options->binary = args.binary_given; + options->redis = args.redis_given; + + if (options->use_assoc && !options->redis) + DIE("assoc must be used with redis"); + + options->read_file = args.read_file_given; + if (args.read_file_given) + strcpy(options->file_name, args.read_file_arg); + + if (args.prefix_given) + strcpy(options->prefix,args.prefix_arg); + + //getset mode (first issue get, then set same key if miss) + options->getset = args.getset_given; + options->getsetorset = args.getsetorset_given; + //delete 90 percent of keys after halfway + 
//model workload in Rumble and Ousterhout - log structured memory + //for dram based storage + options->delete90 = args.delete90_given; + options->sasl = args.username_given; if (args.password_given) diff --git a/update_readme.sh b/update_readme.sh old mode 100644 new mode 100755 diff --git a/zstd.h b/zstd.h new file mode 100644 index 0000000..222339d --- /dev/null +++ b/zstd.h @@ -0,0 +1,2450 @@ +/* + * Copyright (c) 2016-2021, Yann Collet, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. + */ +#if defined (__cplusplus) +extern "C" { +#endif + +#ifndef ZSTD_H_235446 +#define ZSTD_H_235446 + +/* ====== Dependency ======*/ +#include /* INT_MAX */ +#include /* size_t */ + + +/* ===== ZSTDLIB_API : control library symbols visibility ===== */ +#ifndef ZSTDLIB_VISIBILITY +# if defined(__GNUC__) && (__GNUC__ >= 4) +# define ZSTDLIB_VISIBILITY __attribute__ ((visibility ("default"))) +# else +# define ZSTDLIB_VISIBILITY +# endif +#endif +#if defined(ZSTD_DLL_EXPORT) && (ZSTD_DLL_EXPORT==1) +# define ZSTDLIB_API __declspec(dllexport) ZSTDLIB_VISIBILITY +#elif defined(ZSTD_DLL_IMPORT) && (ZSTD_DLL_IMPORT==1) +# define ZSTDLIB_API __declspec(dllimport) ZSTDLIB_VISIBILITY /* It isn't required but allows to generate better code, saving a function pointer load from the IAT and an indirect jump.*/ +#else +# define ZSTDLIB_API ZSTDLIB_VISIBILITY +#endif + + +/******************************************************************************* + Introduction + + zstd, short for Zstandard, is a fast lossless compression algorithm, targeting + real-time compression scenarios at zlib-level and better compression ratios. 
+ The zstd compression library provides in-memory compression and decompression + functions. + + The library supports regular compression levels from 1 up to ZSTD_maxCLevel(), + which is currently 22. Levels >= 20, labeled `--ultra`, should be used with + caution, as they require more memory. The library also offers negative + compression levels, which extend the range of speed vs. ratio preferences. + The lower the level, the faster the speed (at the cost of compression). + + Compression can be done in: + - a single step (described as Simple API) + - a single step, reusing a context (described as Explicit context) + - unbounded multiple steps (described as Streaming compression) + + The compression ratio achievable on small data can be highly improved using + a dictionary. Dictionary compression can be performed in: + - a single step (described as Simple dictionary API) + - a single step, reusing a dictionary (described as Bulk-processing + dictionary API) + + Advanced experimental functions can be accessed using + `#define ZSTD_STATIC_LINKING_ONLY` before including zstd.h. + + Advanced experimental APIs should never be used with a dynamically-linked + library. They are not "stable"; their definitions or signatures may change in + the future. Only static linking is allowed. +*******************************************************************************/ + +/*------ Version ------*/ +#define ZSTD_VERSION_MAJOR 1 +#define ZSTD_VERSION_MINOR 4 +#define ZSTD_VERSION_RELEASE 9 +#define ZSTD_VERSION_NUMBER (ZSTD_VERSION_MAJOR *100*100 + ZSTD_VERSION_MINOR *100 + ZSTD_VERSION_RELEASE) + +/*! ZSTD_versionNumber() : + * Return runtime library version, the value is (MAJOR*100*100 + MINOR*100 + RELEASE). 
*/ +ZSTDLIB_API unsigned ZSTD_versionNumber(void); + +#define ZSTD_LIB_VERSION ZSTD_VERSION_MAJOR.ZSTD_VERSION_MINOR.ZSTD_VERSION_RELEASE +#define ZSTD_QUOTE(str) #str +#define ZSTD_EXPAND_AND_QUOTE(str) ZSTD_QUOTE(str) +#define ZSTD_VERSION_STRING ZSTD_EXPAND_AND_QUOTE(ZSTD_LIB_VERSION) + +/*! ZSTD_versionString() : + * Return runtime library version, like "1.4.5". Requires v1.3.0+. */ +ZSTDLIB_API const char* ZSTD_versionString(void); + +/* ************************************* + * Default constant + ***************************************/ +#ifndef ZSTD_CLEVEL_DEFAULT +# define ZSTD_CLEVEL_DEFAULT 3 +#endif + +/* ************************************* + * Constants + ***************************************/ + +/* All magic numbers are supposed read/written to/from files/memory using little-endian convention */ +#define ZSTD_MAGICNUMBER 0xFD2FB528 /* valid since v0.8.0 */ +#define ZSTD_MAGIC_DICTIONARY 0xEC30A437 /* valid since v0.7.0 */ +#define ZSTD_MAGIC_SKIPPABLE_START 0x184D2A50 /* all 16 values, from 0x184D2A50 to 0x184D2A5F, signal the beginning of a skippable frame */ +#define ZSTD_MAGIC_SKIPPABLE_MASK 0xFFFFFFF0 + +#define ZSTD_BLOCKSIZELOG_MAX 17 +#define ZSTD_BLOCKSIZE_MAX (1<= `ZSTD_compressBound(srcSize)`. + * @return : compressed size written into `dst` (<= `dstCapacity), + * or an error code if it fails (which can be tested using ZSTD_isError()). */ +ZSTDLIB_API size_t ZSTD_compress( void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + int compressionLevel); + +/*! ZSTD_decompress() : + * `compressedSize` : must be the _exact_ size of some number of compressed and/or skippable frames. + * `dstCapacity` is an upper bound of originalSize to regenerate. + * If user cannot imply a maximum upper bound, it's better to use streaming mode to decompress data. + * @return : the number of bytes decompressed into `dst` (<= `dstCapacity`), + * or an errorCode if it fails (which can be tested using ZSTD_isError()). 
*/ +ZSTDLIB_API size_t ZSTD_decompress( void* dst, size_t dstCapacity, + const void* src, size_t compressedSize); + +/*! ZSTD_getFrameContentSize() : requires v1.3.0+ + * `src` should point to the start of a ZSTD encoded frame. + * `srcSize` must be at least as large as the frame header. + * hint : any size >= `ZSTD_frameHeaderSize_max` is large enough. + * @return : - decompressed size of `src` frame content, if known + * - ZSTD_CONTENTSIZE_UNKNOWN if the size cannot be determined + * - ZSTD_CONTENTSIZE_ERROR if an error occurred (e.g. invalid magic number, srcSize too small) + * note 1 : a 0 return value means the frame is valid but "empty". + * note 2 : decompressed size is an optional field, it may not be present, typically in streaming mode. + * When `return==ZSTD_CONTENTSIZE_UNKNOWN`, data to decompress could be any size. + * In which case, it's necessary to use streaming mode to decompress data. + * Optionally, application can rely on some implicit limit, + * as ZSTD_decompress() only needs an upper bound of decompressed size. + * (For example, data could be necessarily cut into blocks <= 16 KB). + * note 3 : decompressed size is always present when compression is completed using single-pass functions, + * such as ZSTD_compress(), ZSTD_compressCCtx() ZSTD_compress_usingDict() or ZSTD_compress_usingCDict(). + * note 4 : decompressed size can be very large (64-bits value), + * potentially larger than what local system can handle as a single memory segment. + * In which case, it's necessary to use streaming mode to decompress data. + * note 5 : If source is untrusted, decompressed size could be wrong or intentionally modified. + * Always ensure return value fits within application's authorized limits. + * Each application can set its own limits. 
+ * note 6 : This function replaces ZSTD_getDecompressedSize() */ +#define ZSTD_CONTENTSIZE_UNKNOWN (0ULL - 1) +#define ZSTD_CONTENTSIZE_ERROR (0ULL - 2) +ZSTDLIB_API unsigned long long ZSTD_getFrameContentSize(const void *src, size_t srcSize); + +/*! ZSTD_getDecompressedSize() : + * NOTE: This function is now obsolete, in favor of ZSTD_getFrameContentSize(). + * Both functions work the same way, but ZSTD_getDecompressedSize() blends + * "empty", "unknown" and "error" results to the same return value (0), + * while ZSTD_getFrameContentSize() gives them separate return values. + * @return : decompressed size of `src` frame content _if known and not empty_, 0 otherwise. */ +ZSTDLIB_API unsigned long long ZSTD_getDecompressedSize(const void* src, size_t srcSize); + +/*! ZSTD_findFrameCompressedSize() : + * `src` should point to the start of a ZSTD frame or skippable frame. + * `srcSize` must be >= first frame size + * @return : the compressed size of the first frame starting at `src`, + * suitable to pass as `srcSize` to `ZSTD_decompress` or similar, + * or an error code if input is invalid */ +ZSTDLIB_API size_t ZSTD_findFrameCompressedSize(const void* src, size_t srcSize); + + +/*====== Helper functions ======*/ +#define ZSTD_COMPRESSBOUND(srcSize) ((srcSize) + ((srcSize)>>8) + (((srcSize) < (128<<10)) ? 
(((128<<10) - (srcSize)) >> 11) /* margin, from 64 to 0 */ : 0)) /* this formula ensures that bound(A) + bound(B) <= bound(A+B) as long as A and B >= 128 KB */ +ZSTDLIB_API size_t ZSTD_compressBound(size_t srcSize); /*!< maximum compressed size in worst case single-pass scenario */ +ZSTDLIB_API unsigned ZSTD_isError(size_t code); /*!< tells if a `size_t` function result is an error code */ +ZSTDLIB_API const char* ZSTD_getErrorName(size_t code); /*!< provides readable string from an error code */ +ZSTDLIB_API int ZSTD_minCLevel(void); /*!< minimum negative compression level allowed */ +ZSTDLIB_API int ZSTD_maxCLevel(void); /*!< maximum compression level available */ + + +/*************************************** +* Explicit context +***************************************/ +/*= Compression context + * When compressing many times, + * it is recommended to allocate a context just once, + * and re-use it for each successive compression operation. + * This will make workload friendlier for system's memory. + * Note : re-using context is just a speed / resource optimization. + * It doesn't change the compression ratio, which remains identical. + * Note 2 : In multi-threaded environments, + * use one different context per thread for parallel execution. + */ +typedef struct ZSTD_CCtx_s ZSTD_CCtx; +ZSTDLIB_API ZSTD_CCtx* ZSTD_createCCtx(void); +ZSTDLIB_API size_t ZSTD_freeCCtx(ZSTD_CCtx* cctx); + +/*! ZSTD_compressCCtx() : + * Same as ZSTD_compress(), using an explicit ZSTD_CCtx. + * Important : in order to behave similarly to `ZSTD_compress()`, + * this function compresses at requested compression level, + * __ignoring any other parameter__ . + * If any advanced parameter was set using the advanced API, + * they will all be reset. Only `compressionLevel` remains. 
+ */ +ZSTDLIB_API size_t ZSTD_compressCCtx(ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + int compressionLevel); + +/*= Decompression context + * When decompressing many times, + * it is recommended to allocate a context only once, + * and re-use it for each successive compression operation. + * This will make workload friendlier for system's memory. + * Use one context per thread for parallel execution. */ +typedef struct ZSTD_DCtx_s ZSTD_DCtx; +ZSTDLIB_API ZSTD_DCtx* ZSTD_createDCtx(void); +ZSTDLIB_API size_t ZSTD_freeDCtx(ZSTD_DCtx* dctx); + +/*! ZSTD_decompressDCtx() : + * Same as ZSTD_decompress(), + * requires an allocated ZSTD_DCtx. + * Compatible with sticky parameters. + */ +ZSTDLIB_API size_t ZSTD_decompressDCtx(ZSTD_DCtx* dctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize); + + +/*************************************** +* Advanced compression API +***************************************/ + +/* API design : + * Parameters are pushed one by one into an existing context, + * using ZSTD_CCtx_set*() functions. + * Pushed parameters are sticky : they are valid for next compressed frame, and any subsequent frame. + * "sticky" parameters are applicable to `ZSTD_compress2()` and `ZSTD_compressStream*()` ! + * __They do not apply to "simple" one-shot variants such as ZSTD_compressCCtx()__ . + * + * It's possible to reset all parameters to "default" using ZSTD_CCtx_reset(). + * + * This API supercedes all other "advanced" API entry points in the experimental section. + * In the future, we expect to remove from experimental API entry points which are redundant with this API. + */ + + +/* Compression strategies, listed from fastest to strongest */ +typedef enum { ZSTD_fast=1, + ZSTD_dfast=2, + ZSTD_greedy=3, + ZSTD_lazy=4, + ZSTD_lazy2=5, + ZSTD_btlazy2=6, + ZSTD_btopt=7, + ZSTD_btultra=8, + ZSTD_btultra2=9 + /* note : new strategies _might_ be added in the future. 
+ Only the order (from fast to strong) is guaranteed */ +} ZSTD_strategy; + + +typedef enum { + + /* compression parameters + * Note: When compressing with a ZSTD_CDict these parameters are superseded + * by the parameters used to construct the ZSTD_CDict. + * See ZSTD_CCtx_refCDict() for more info (superseded-by-cdict). */ + ZSTD_c_compressionLevel=100, /* Set compression parameters according to pre-defined cLevel table. + * Note that exact compression parameters are dynamically determined, + * depending on both compression level and srcSize (when known). + * Default level is ZSTD_CLEVEL_DEFAULT==3. + * Special: value 0 means default, which is controlled by ZSTD_CLEVEL_DEFAULT. + * Note 1 : it's possible to pass a negative compression level. + * Note 2 : setting a level does not automatically set all other compression parameters + * to default. Setting this will however eventually dynamically impact the compression + * parameters which have not been manually set. The manually set + * ones will 'stick'. */ + /* Advanced compression parameters : + * It's possible to pin down compression parameters to some specific values. + * In which case, these values are no longer dynamically selected by the compressor */ + ZSTD_c_windowLog=101, /* Maximum allowed back-reference distance, expressed as power of 2. + * This will set a memory budget for streaming decompression, + * with larger values requiring more memory + * and typically compressing more. + * Must be clamped between ZSTD_WINDOWLOG_MIN and ZSTD_WINDOWLOG_MAX. + * Special: value 0 means "use default windowLog". + * Note: Using a windowLog greater than ZSTD_WINDOWLOG_LIMIT_DEFAULT + * requires explicitly allowing such size at streaming decompression stage. */ + ZSTD_c_hashLog=102, /* Size of the initial probe table, as a power of 2. + * Resulting memory usage is (1 << (hashLog+2)). + * Must be clamped between ZSTD_HASHLOG_MIN and ZSTD_HASHLOG_MAX. 
+ * Larger tables improve compression ratio of strategies <= dFast, + * and improve speed of strategies > dFast. + * Special: value 0 means "use default hashLog". */ + ZSTD_c_chainLog=103, /* Size of the multi-probe search table, as a power of 2. + * Resulting memory usage is (1 << (chainLog+2)). + * Must be clamped between ZSTD_CHAINLOG_MIN and ZSTD_CHAINLOG_MAX. + * Larger tables result in better and slower compression. + * This parameter is useless for "fast" strategy. + * It's still useful when using "dfast" strategy, + * in which case it defines a secondary probe table. + * Special: value 0 means "use default chainLog". */ + ZSTD_c_searchLog=104, /* Number of search attempts, as a power of 2. + * More attempts result in better and slower compression. + * This parameter is useless for "fast" and "dFast" strategies. + * Special: value 0 means "use default searchLog". */ + ZSTD_c_minMatch=105, /* Minimum size of searched matches. + * Note that Zstandard can still find matches of smaller size, + * it just tweaks its search algorithm to look for this size and larger. + * Larger values increase compression and decompression speed, but decrease ratio. + * Must be clamped between ZSTD_MINMATCH_MIN and ZSTD_MINMATCH_MAX. + * Note that currently, for all strategies < btopt, effective minimum is 4. + * , for all strategies > fast, effective maximum is 6. + * Special: value 0 means "use default minMatchLength". */ + ZSTD_c_targetLength=106, /* Impact of this field depends on strategy. + * For strategies btopt, btultra & btultra2: + * Length of Match considered "good enough" to stop search. + * Larger values make compression stronger, and slower. + * For strategy fast: + * Distance between match sampling. + * Larger values make compression faster, and weaker. + * Special: value 0 means "use default targetLength". */ + ZSTD_c_strategy=107, /* See ZSTD_strategy enum definition. 
+ * The higher the value of selected strategy, the more complex it is, + * resulting in stronger and slower compression. + * Special: value 0 means "use default strategy". */ + + /* LDM mode parameters */ + ZSTD_c_enableLongDistanceMatching=160, /* Enable long distance matching. + * This parameter is designed to improve compression ratio + * for large inputs, by finding large matches at long distance. + * It increases memory usage and window size. + * Note: enabling this parameter increases default ZSTD_c_windowLog to 128 MB + * except when expressly set to a different value. + * Note: will be enabled by default if ZSTD_c_windowLog >= 128 MB and + * compression strategy >= ZSTD_btopt (== compression level 16+) */ + ZSTD_c_ldmHashLog=161, /* Size of the table for long distance matching, as a power of 2. + * Larger values increase memory usage and compression ratio, + * but decrease compression speed. + * Must be clamped between ZSTD_HASHLOG_MIN and ZSTD_HASHLOG_MAX + * default: windowlog - 7. + * Special: value 0 means "automatically determine hashlog". */ + ZSTD_c_ldmMinMatch=162, /* Minimum match size for long distance matcher. + * Larger/too small values usually decrease compression ratio. + * Must be clamped between ZSTD_LDM_MINMATCH_MIN and ZSTD_LDM_MINMATCH_MAX. + * Special: value 0 means "use default value" (default: 64). */ + ZSTD_c_ldmBucketSizeLog=163, /* Log size of each bucket in the LDM hash table for collision resolution. + * Larger values improve collision resolution but decrease compression speed. + * The maximum value is ZSTD_LDM_BUCKETSIZELOG_MAX. + * Special: value 0 means "use default value" (default: 3). */ + ZSTD_c_ldmHashRateLog=164, /* Frequency of inserting/looking up entries into the LDM hash table. + * Must be clamped between 0 and (ZSTD_WINDOWLOG_MAX - ZSTD_HASHLOG_MIN). + * Default is MAX(0, (windowLog - ldmHashLog)), optimizing hash table usage. + * Larger values improve compression speed. 
+ * Deviating far from default value will likely result in a compression ratio decrease. + * Special: value 0 means "automatically determine hashRateLog". */ + + /* frame parameters */ + ZSTD_c_contentSizeFlag=200, /* Content size will be written into frame header _whenever known_ (default:1) + * Content size must be known at the beginning of compression. + * This is automatically the case when using ZSTD_compress2(), + * For streaming scenarios, content size must be provided with ZSTD_CCtx_setPledgedSrcSize() */ + ZSTD_c_checksumFlag=201, /* A 32-bits checksum of content is written at end of frame (default:0) */ + ZSTD_c_dictIDFlag=202, /* When applicable, dictionary's ID is written into frame header (default:1) */ + + /* multi-threading parameters */ + /* These parameters are only active if multi-threading is enabled (compiled with build macro ZSTD_MULTITHREAD). + * Otherwise, trying to set any other value than default (0) will be a no-op and return an error. + * In a situation where it's unknown if the linked library supports multi-threading or not, + * setting ZSTD_c_nbWorkers to any value >= 1 and consulting the return value provides a quick way to check this property. + */ + ZSTD_c_nbWorkers=400, /* Select how many threads will be spawned to compress in parallel. + * When nbWorkers >= 1, triggers asynchronous mode when invoking ZSTD_compressStream*() : + * ZSTD_compressStream*() consumes input and flush output if possible, but immediately gives back control to caller, + * while compression is performed in parallel, within worker thread(s). + * (note : a strong exception to this rule is when first invocation of ZSTD_compressStream2() sets ZSTD_e_end : + * in which case, ZSTD_compressStream2() delegates to ZSTD_compress2(), which is always a blocking call). + * More workers improve speed, but also increase memory usage. 
+ * Default value is `0`, aka "single-threaded mode" : no worker is spawned, + * compression is performed inside Caller's thread, and all invocations are blocking */ + ZSTD_c_jobSize=401, /* Size of a compression job. This value is enforced only when nbWorkers >= 1. + * Each compression job is completed in parallel, so this value can indirectly impact the nb of active threads. + * 0 means default, which is dynamically determined based on compression parameters. + * Job size must be a minimum of overlap size, or 1 MB, whichever is largest. + * The minimum size is automatically and transparently enforced. */ + ZSTD_c_overlapLog=402, /* Control the overlap size, as a fraction of window size. + * The overlap size is an amount of data reloaded from previous job at the beginning of a new job. + * It helps preserve compression ratio, while each job is compressed in parallel. + * This value is enforced only when nbWorkers >= 1. + * Larger values increase compression ratio, but decrease speed. + * Possible values range from 0 to 9 : + * - 0 means "default" : value will be determined by the library, depending on strategy + * - 1 means "no overlap" + * - 9 means "full overlap", using a full window size. + * Each intermediate rank increases/decreases load size by a factor 2 : + * 9: full window; 8: w/2; 7: w/4; 6: w/8; 5:w/16; 4: w/32; 3:w/64; 2:w/128; 1:no overlap; 0:default + * default value varies between 6 and 9, depending on strategy */ + + /* note : additional experimental parameters are also available + * within the experimental section of the API. 
+ * At the time of this writing, they include : + * ZSTD_c_rsyncable + * ZSTD_c_format + * ZSTD_c_forceMaxWindow + * ZSTD_c_forceAttachDict + * ZSTD_c_literalCompressionMode + * ZSTD_c_targetCBlockSize + * ZSTD_c_srcSizeHint + * ZSTD_c_enableDedicatedDictSearch + * ZSTD_c_stableInBuffer + * ZSTD_c_stableOutBuffer + * ZSTD_c_blockDelimiters + * ZSTD_c_validateSequences + * Because they are not stable, it's necessary to define ZSTD_STATIC_LINKING_ONLY to access them. + * note : never ever use experimentalParam? names directly; + * also, the enums values themselves are unstable and can still change. + */ + ZSTD_c_experimentalParam1=500, + ZSTD_c_experimentalParam2=10, + ZSTD_c_experimentalParam3=1000, + ZSTD_c_experimentalParam4=1001, + ZSTD_c_experimentalParam5=1002, + ZSTD_c_experimentalParam6=1003, + ZSTD_c_experimentalParam7=1004, + ZSTD_c_experimentalParam8=1005, + ZSTD_c_experimentalParam9=1006, + ZSTD_c_experimentalParam10=1007, + ZSTD_c_experimentalParam11=1008, + ZSTD_c_experimentalParam12=1009 +} ZSTD_cParameter; + +typedef struct { + size_t error; + int lowerBound; + int upperBound; +} ZSTD_bounds; + +/*! ZSTD_cParam_getBounds() : + * All parameters must belong to an interval with lower and upper bounds, + * otherwise they will either trigger an error or be automatically clamped. + * @return : a structure, ZSTD_bounds, which contains + * - an error status field, which must be tested using ZSTD_isError() + * - lower and upper bounds, both inclusive + */ +ZSTDLIB_API ZSTD_bounds ZSTD_cParam_getBounds(ZSTD_cParameter cParam); + +/*! ZSTD_CCtx_setParameter() : + * Set one compression parameter, selected by enum ZSTD_cParameter. + * All parameters have valid bounds. Bounds can be queried using ZSTD_cParam_getBounds(). + * Providing a value beyond bound will either clamp it, or trigger an error (depending on parameter). + * Setting a parameter is generally only possible during frame initialization (before starting compression). 
+ *  Exception : when using multi-threading mode (nbWorkers >= 1),
+ *              the following parameters can be updated _during_ compression (within same frame):
+ *              => compressionLevel, hashLog, chainLog, searchLog, minMatch, targetLength and strategy.
+ *              new parameters will be active for next job only (after a flush()).
+ * @return : an error code (which can be tested using ZSTD_isError()).
+ */
+ZSTDLIB_API size_t ZSTD_CCtx_setParameter(ZSTD_CCtx* cctx, ZSTD_cParameter param, int value);
+
+/*! ZSTD_CCtx_setPledgedSrcSize() :
+ *  Total input data size to be compressed as a single frame.
+ *  Value will be written in frame header, unless explicitly forbidden using ZSTD_c_contentSizeFlag.
+ *  This value will also be controlled at end of frame, and trigger an error if not respected.
+ * @result : 0, or an error code (which can be tested with ZSTD_isError()).
+ *  Note 1 : pledgedSrcSize==0 actually means zero, aka an empty frame.
+ *           In order to mean "unknown content size", pass constant ZSTD_CONTENTSIZE_UNKNOWN.
+ *           ZSTD_CONTENTSIZE_UNKNOWN is default value for any new frame.
+ *  Note 2 : pledgedSrcSize is only valid once, for the next frame.
+ *           It's discarded at the end of the frame, and replaced by ZSTD_CONTENTSIZE_UNKNOWN.
+ *  Note 3 : Whenever all input data is provided and consumed in a single round,
+ *           for example with ZSTD_compress2(),
+ *           or invoking immediately ZSTD_compressStream2(,,,ZSTD_e_end),
+ *           this value is automatically overridden by srcSize instead.
+ */
+ZSTDLIB_API size_t ZSTD_CCtx_setPledgedSrcSize(ZSTD_CCtx* cctx, unsigned long long pledgedSrcSize);
+
+typedef enum {
+    ZSTD_reset_session_only = 1,
+    ZSTD_reset_parameters = 2,
+    ZSTD_reset_session_and_parameters = 3
+} ZSTD_ResetDirective;
+
+/*! ZSTD_CCtx_reset() :
+ *  There are 2 different things that can be reset, independently or jointly :
+ *  - The session : will stop compressing current frame, and make CCtx ready to start a new one.
+ * Useful after an error, or to interrupt any ongoing compression. + * Any internal data not yet flushed is cancelled. + * Compression parameters and dictionary remain unchanged. + * They will be used to compress next frame. + * Resetting session never fails. + * - The parameters : changes all parameters back to "default". + * This removes any reference to any dictionary too. + * Parameters can only be changed between 2 sessions (i.e. no compression is currently ongoing) + * otherwise the reset fails, and function returns an error value (which can be tested using ZSTD_isError()) + * - Both : similar to resetting the session, followed by resetting parameters. + */ +ZSTDLIB_API size_t ZSTD_CCtx_reset(ZSTD_CCtx* cctx, ZSTD_ResetDirective reset); + +/*! ZSTD_compress2() : + * Behave the same as ZSTD_compressCCtx(), but compression parameters are set using the advanced API. + * ZSTD_compress2() always starts a new frame. + * Should cctx hold data from a previously unfinished frame, everything about it is forgotten. + * - Compression parameters are pushed into CCtx before starting compression, using ZSTD_CCtx_set*() + * - The function is always blocking, returns when compression is completed. + * Hint : compression runs faster if `dstCapacity` >= `ZSTD_compressBound(srcSize)`. + * @return : compressed size written into `dst` (<= `dstCapacity), + * or an error code if it fails (which can be tested using ZSTD_isError()). + */ +ZSTDLIB_API size_t ZSTD_compress2( ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize); + + +/*************************************** +* Advanced decompression API +***************************************/ + +/* The advanced API pushes parameters one by one into an existing DCtx context. + * Parameters are sticky, and remain valid for all following frames + * using the same DCtx context. + * It's possible to reset parameters to default values using ZSTD_DCtx_reset(). 
+ * Note : This API is compatible with existing ZSTD_decompressDCtx() and ZSTD_decompressStream(). + * Therefore, no new decompression function is necessary. + */ + +typedef enum { + + ZSTD_d_windowLogMax=100, /* Select a size limit (in power of 2) beyond which + * the streaming API will refuse to allocate memory buffer + * in order to protect the host from unreasonable memory requirements. + * This parameter is only useful in streaming mode, since no internal buffer is allocated in single-pass mode. + * By default, a decompression context accepts window sizes <= (1 << ZSTD_WINDOWLOG_LIMIT_DEFAULT). + * Special: value 0 means "use default maximum windowLog". */ + + /* note : additional experimental parameters are also available + * within the experimental section of the API. + * At the time of this writing, they include : + * ZSTD_d_format + * ZSTD_d_stableOutBuffer + * ZSTD_d_forceIgnoreChecksum + * ZSTD_d_refMultipleDDicts + * Because they are not stable, it's necessary to define ZSTD_STATIC_LINKING_ONLY to access them. + * note : never ever use experimentalParam? names directly + */ + ZSTD_d_experimentalParam1=1000, + ZSTD_d_experimentalParam2=1001, + ZSTD_d_experimentalParam3=1002, + ZSTD_d_experimentalParam4=1003 + +} ZSTD_dParameter; + +/*! ZSTD_dParam_getBounds() : + * All parameters must belong to an interval with lower and upper bounds, + * otherwise they will either trigger an error or be automatically clamped. + * @return : a structure, ZSTD_bounds, which contains + * - an error status field, which must be tested using ZSTD_isError() + * - both lower and upper bounds, inclusive + */ +ZSTDLIB_API ZSTD_bounds ZSTD_dParam_getBounds(ZSTD_dParameter dParam); + +/*! ZSTD_DCtx_setParameter() : + * Set one compression parameter, selected by enum ZSTD_dParameter. + * All parameters have valid bounds. Bounds can be queried using ZSTD_dParam_getBounds(). + * Providing a value beyond bound will either clamp it, or trigger an error (depending on parameter). 
+ * Setting a parameter is only possible during frame initialization (before starting decompression). + * @return : 0, or an error code (which can be tested using ZSTD_isError()). + */ +ZSTDLIB_API size_t ZSTD_DCtx_setParameter(ZSTD_DCtx* dctx, ZSTD_dParameter param, int value); + +/*! ZSTD_DCtx_reset() : + * Return a DCtx to clean state. + * Session and parameters can be reset jointly or separately. + * Parameters can only be reset when no active frame is being decompressed. + * @return : 0, or an error code, which can be tested with ZSTD_isError() + */ +ZSTDLIB_API size_t ZSTD_DCtx_reset(ZSTD_DCtx* dctx, ZSTD_ResetDirective reset); + + +/**************************** +* Streaming +****************************/ + +typedef struct ZSTD_inBuffer_s { + const void* src; /**< start of input buffer */ + size_t size; /**< size of input buffer */ + size_t pos; /**< position where reading stopped. Will be updated. Necessarily 0 <= pos <= size */ +} ZSTD_inBuffer; + +typedef struct ZSTD_outBuffer_s { + void* dst; /**< start of output buffer */ + size_t size; /**< size of output buffer */ + size_t pos; /**< position where writing stopped. Will be updated. Necessarily 0 <= pos <= size */ +} ZSTD_outBuffer; + + + +/*-*********************************************************************** +* Streaming compression - HowTo +* +* A ZSTD_CStream object is required to track streaming operation. +* Use ZSTD_createCStream() and ZSTD_freeCStream() to create/release resources. +* ZSTD_CStream objects can be reused multiple times on consecutive compression operations. +* It is recommended to re-use ZSTD_CStream since it will play nicer with system's memory, by re-using already allocated memory. +* +* For parallel execution, use one separate ZSTD_CStream per thread. +* +* note : since v1.3.0, ZSTD_CStream and ZSTD_CCtx are the same thing. 
+* +* Parameters are sticky : when starting a new compression on the same context, +* it will re-use the same sticky parameters as previous compression session. +* When in doubt, it's recommended to fully initialize the context before usage. +* Use ZSTD_CCtx_reset() to reset the context and ZSTD_CCtx_setParameter(), +* ZSTD_CCtx_setPledgedSrcSize(), or ZSTD_CCtx_loadDictionary() and friends to +* set more specific parameters, the pledged source size, or load a dictionary. +* +* Use ZSTD_compressStream2() with ZSTD_e_continue as many times as necessary to +* consume input stream. The function will automatically update both `pos` +* fields within `input` and `output`. +* Note that the function may not consume the entire input, for example, because +* the output buffer is already full, in which case `input.pos < input.size`. +* The caller must check if input has been entirely consumed. +* If not, the caller must make some room to receive more compressed data, +* and then present again remaining input data. +* note: ZSTD_e_continue is guaranteed to make some forward progress when called, +* but doesn't guarantee maximal forward progress. This is especially relevant +* when compressing with multiple threads. The call won't block if it can +* consume some input, but if it can't it will wait for some, but not all, +* output to be flushed. +* @return : provides a minimum amount of data remaining to be flushed from internal buffers +* or an error code, which can be tested using ZSTD_isError(). +* +* At any moment, it's possible to flush whatever data might remain stuck within internal buffer, +* using ZSTD_compressStream2() with ZSTD_e_flush. `output->pos` will be updated. +* Note that, if `output->size` is too small, a single invocation with ZSTD_e_flush might not be enough (return code > 0). +* In which case, make some room to receive more compressed data, and call again ZSTD_compressStream2() with ZSTD_e_flush. 
+* You must continue calling ZSTD_compressStream2() with ZSTD_e_flush until it returns 0, at which point you can change the +* operation. +* note: ZSTD_e_flush will flush as much output as possible, meaning when compressing with multiple threads, it will +* block until the flush is complete or the output buffer is full. +* @return : 0 if internal buffers are entirely flushed, +* >0 if some data still present within internal buffer (the value is minimal estimation of remaining size), +* or an error code, which can be tested using ZSTD_isError(). +* +* Calling ZSTD_compressStream2() with ZSTD_e_end instructs to finish a frame. +* It will perform a flush and write frame epilogue. +* The epilogue is required for decoders to consider a frame completed. +* flush operation is the same, and follows same rules as calling ZSTD_compressStream2() with ZSTD_e_flush. +* You must continue calling ZSTD_compressStream2() with ZSTD_e_end until it returns 0, at which point you are free to +* start a new frame. +* note: ZSTD_e_end will flush as much output as possible, meaning when compressing with multiple threads, it will +* block until the flush is complete or the output buffer is full. +* @return : 0 if frame fully completed and fully flushed, +* >0 if some data still present within internal buffer (the value is minimal estimation of remaining size), +* or an error code, which can be tested using ZSTD_isError(). 
+*
+* *******************************************************************/
+
+typedef ZSTD_CCtx ZSTD_CStream;  /**< CCtx and CStream are now effectively same object (>= v1.3.0) */
+                                 /* Continue to distinguish them for compatibility with older versions <= v1.2.0 */
+/*===== ZSTD_CStream management functions =====*/
+ZSTDLIB_API ZSTD_CStream* ZSTD_createCStream(void);
+ZSTDLIB_API size_t ZSTD_freeCStream(ZSTD_CStream* zcs);
+
+/*===== Streaming compression functions =====*/
+typedef enum {
+    ZSTD_e_continue=0, /* collect more data, encoder decides when to output compressed result, for optimal compression ratio */
+    ZSTD_e_flush=1,    /* flush any data provided so far,
+                        * it creates (at least) one new block, that can be decoded immediately on reception;
+                        * frame will continue: any future data can still reference previously compressed data, improving compression.
+                        * note : multithreaded compression will block to flush as much output as possible. */
+    ZSTD_e_end=2       /* flush any remaining data _and_ close current frame.
+                        * note that frame is only closed after compressed data is fully flushed (return value == 0).
+                        * After that point, any additional data starts a new frame.
+                        * note : each frame is independent (does not reference any content from previous frame).
+                        * note : multithreaded compression will block to flush as much output as possible. */
+} ZSTD_EndDirective;
+
+/*! ZSTD_compressStream2() :
+ *  Behaves about the same as ZSTD_compressStream, with additional control on end directive.
+ *  - Compression parameters are pushed into CCtx before starting compression, using ZSTD_CCtx_set*()
+ *  - Compression parameters cannot be changed once compression is started (save a list of exceptions in multi-threading mode)
+ *  - output->pos must be <= dstCapacity, input->pos must be <= srcSize
+ *  - output->pos and input->pos will be updated. They are guaranteed to remain below their respective limit.
+ * - endOp must be a valid directive + * - When nbWorkers==0 (default), function is blocking : it completes its job before returning to caller. + * - When nbWorkers>=1, function is non-blocking : it copies a portion of input, distributes jobs to internal worker threads, flush to output whatever is available, + * and then immediately returns, just indicating that there is some data remaining to be flushed. + * The function nonetheless guarantees forward progress : it will return only after it reads or write at least 1+ byte. + * - Exception : if the first call requests a ZSTD_e_end directive and provides enough dstCapacity, the function delegates to ZSTD_compress2() which is always blocking. + * - @return provides a minimum amount of data remaining to be flushed from internal buffers + * or an error code, which can be tested using ZSTD_isError(). + * if @return != 0, flush is not fully completed, there is still some data left within internal buffers. + * This is useful for ZSTD_e_flush, since in this case more flushes are necessary to empty all buffers. + * For ZSTD_e_end, @return == 0 when internal buffers are fully flushed and frame is completed. + * - after a ZSTD_e_end directive, if internal buffer is not fully flushed (@return != 0), + * only ZSTD_e_end or ZSTD_e_flush operations are allowed. + * Before starting a new compression job, or changing compression parameters, + * it is required to fully flush internal buffers. + */ +ZSTDLIB_API size_t ZSTD_compressStream2( ZSTD_CCtx* cctx, + ZSTD_outBuffer* output, + ZSTD_inBuffer* input, + ZSTD_EndDirective endOp); + + +/* These buffer sizes are softly recommended. + * They are not required : ZSTD_compressStream*() happily accepts any buffer size, for both input and output. + * Respecting the recommended size just makes it a bit easier for ZSTD_compressStream*(), + * reducing the amount of memory shuffling and buffering, resulting in minor performance savings. 
+ * + * However, note that these recommendations are from the perspective of a C caller program. + * If the streaming interface is invoked from some other language, + * especially managed ones such as Java or Go, through a foreign function interface such as jni or cgo, + * a major performance rule is to reduce crossing such interface to an absolute minimum. + * It's not rare that performance ends being spent more into the interface, rather than compression itself. + * In which cases, prefer using large buffers, as large as practical, + * for both input and output, to reduce the nb of roundtrips. + */ +ZSTDLIB_API size_t ZSTD_CStreamInSize(void); /**< recommended size for input buffer */ +ZSTDLIB_API size_t ZSTD_CStreamOutSize(void); /**< recommended size for output buffer. Guarantee to successfully flush at least one complete compressed block. */ + + +/* ***************************************************************************** + * This following is a legacy streaming API. + * It can be replaced by ZSTD_CCtx_reset() and ZSTD_compressStream2(). + * It is redundant, but remains fully supported. + * Advanced parameters and dictionary compression can only be used through the + * new API. + ******************************************************************************/ + +/*! + * Equivalent to: + * + * ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only); + * ZSTD_CCtx_refCDict(zcs, NULL); // clear the dictionary (if any) + * ZSTD_CCtx_setParameter(zcs, ZSTD_c_compressionLevel, compressionLevel); + */ +ZSTDLIB_API size_t ZSTD_initCStream(ZSTD_CStream* zcs, int compressionLevel); +/*! + * Alternative for ZSTD_compressStream2(zcs, output, input, ZSTD_e_continue). + * NOTE: The return value is different. ZSTD_compressStream() returns a hint for + * the next read size (if non-zero and not an error). ZSTD_compressStream2() + * returns the minimum nb of bytes left to flush (if non-zero and not an error). 
+ */
+ZSTDLIB_API size_t ZSTD_compressStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output, ZSTD_inBuffer* input);
+/*! Equivalent to ZSTD_compressStream2(zcs, output, &emptyInput, ZSTD_e_flush). */
+ZSTDLIB_API size_t ZSTD_flushStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output);
+/*! Equivalent to ZSTD_compressStream2(zcs, output, &emptyInput, ZSTD_e_end). */
+ZSTDLIB_API size_t ZSTD_endStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output);
+
+
+/*-***************************************************************************
+* Streaming decompression - HowTo
+*
+* A ZSTD_DStream object is required to track streaming operations.
+* Use ZSTD_createDStream() and ZSTD_freeDStream() to create/release resources.
+* ZSTD_DStream objects can be re-used multiple times.
+*
+* Use ZSTD_initDStream() to start a new decompression operation.
+* @return : recommended first input size
+* Alternatively, use advanced API to set specific properties.
+*
+* Use ZSTD_decompressStream() repetitively to consume your input.
+* The function will update both `pos` fields.
+* If `input.pos < input.size`, some input has not been consumed.
+* It's up to the caller to present again remaining data.
+* The function tries to flush all data decoded immediately, respecting output buffer size.
+* If `output.pos < output.size`, decoder has flushed everything it could.
+* But if `output.pos == output.size`, there might be some data left within internal buffers.
+* In which case, call ZSTD_decompressStream() again to flush whatever remains in the buffer.
+* Note : with no additional input provided, amount of data flushed is necessarily <= ZSTD_BLOCKSIZE_MAX.
+* @return : 0 when a frame is completely decoded and fully flushed, +* or an error code, which can be tested using ZSTD_isError(), +* or any other value > 0, which means there is still some decoding or flushing to do to complete current frame : +* the return value is a suggested next input size (just a hint for better latency) +* that will never request more than the remaining frame size. +* *******************************************************************************/ + +typedef ZSTD_DCtx ZSTD_DStream; /**< DCtx and DStream are now effectively same object (>= v1.3.0) */ + /* For compatibility with versions <= v1.2.0, prefer differentiating them. */ +/*===== ZSTD_DStream management functions =====*/ +ZSTDLIB_API ZSTD_DStream* ZSTD_createDStream(void); +ZSTDLIB_API size_t ZSTD_freeDStream(ZSTD_DStream* zds); + +/*===== Streaming decompression functions =====*/ + +/* This function is redundant with the advanced API and equivalent to: + * + * ZSTD_DCtx_reset(zds, ZSTD_reset_session_only); + * ZSTD_DCtx_refDDict(zds, NULL); + */ +ZSTDLIB_API size_t ZSTD_initDStream(ZSTD_DStream* zds); + +ZSTDLIB_API size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inBuffer* input); + +ZSTDLIB_API size_t ZSTD_DStreamInSize(void); /*!< recommended size for input buffer */ +ZSTDLIB_API size_t ZSTD_DStreamOutSize(void); /*!< recommended size for output buffer. Guarantee to successfully flush at least one complete block in all circumstances. */ + + +/************************** +* Simple dictionary API +***************************/ +/*! ZSTD_compress_usingDict() : + * Compression at an explicit compression level using a Dictionary. + * A dictionary can be any arbitrary data segment (also called a prefix), + * or a buffer with specified information (see dictBuilder/zdict.h). + * Note : This function loads the dictionary, resulting in significant startup delay. + * It's intended for a dictionary used only once. 
+ * Note 2 : When `dict == NULL || dictSize < 8` no dictionary is used. */ +ZSTDLIB_API size_t ZSTD_compress_usingDict(ZSTD_CCtx* ctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + const void* dict,size_t dictSize, + int compressionLevel); + +/*! ZSTD_decompress_usingDict() : + * Decompression using a known Dictionary. + * Dictionary must be identical to the one used during compression. + * Note : This function loads the dictionary, resulting in significant startup delay. + * It's intended for a dictionary used only once. + * Note : When `dict == NULL || dictSize < 8` no dictionary is used. */ +ZSTDLIB_API size_t ZSTD_decompress_usingDict(ZSTD_DCtx* dctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + const void* dict,size_t dictSize); + + +/*********************************** + * Bulk processing dictionary API + **********************************/ +typedef struct ZSTD_CDict_s ZSTD_CDict; + +/*! ZSTD_createCDict() : + * When compressing multiple messages or blocks using the same dictionary, + * it's recommended to digest the dictionary only once, since it's a costly operation. + * ZSTD_createCDict() will create a state from digesting a dictionary. + * The resulting state can be used for future compression operations with very limited startup cost. + * ZSTD_CDict can be created once and shared by multiple threads concurrently, since its usage is read-only. + * @dictBuffer can be released after ZSTD_CDict creation, because its content is copied within CDict. + * Note 1 : Consider experimental function `ZSTD_createCDict_byReference()` if you prefer to not duplicate @dictBuffer content. + * Note 2 : A ZSTD_CDict can be created from an empty @dictBuffer, + * in which case the only thing that it transports is the @compressionLevel. + * This can be useful in a pipeline featuring ZSTD_compress_usingCDict() exclusively, + * expecting a ZSTD_CDict parameter with any data, including those without a known dictionary. 
*/ +ZSTDLIB_API ZSTD_CDict* ZSTD_createCDict(const void* dictBuffer, size_t dictSize, + int compressionLevel); + +/*! ZSTD_freeCDict() : + * Function frees memory allocated by ZSTD_createCDict(). */ +ZSTDLIB_API size_t ZSTD_freeCDict(ZSTD_CDict* CDict); + +/*! ZSTD_compress_usingCDict() : + * Compression using a digested Dictionary. + * Recommended when same dictionary is used multiple times. + * Note : compression level is _decided at dictionary creation time_, + * and frame parameters are hardcoded (dictID=yes, contentSize=yes, checksum=no) */ +ZSTDLIB_API size_t ZSTD_compress_usingCDict(ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + const ZSTD_CDict* cdict); + + +typedef struct ZSTD_DDict_s ZSTD_DDict; + +/*! ZSTD_createDDict() : + * Create a digested dictionary, ready to start decompression operation without startup delay. + * dictBuffer can be released after DDict creation, as its content is copied inside DDict. */ +ZSTDLIB_API ZSTD_DDict* ZSTD_createDDict(const void* dictBuffer, size_t dictSize); + +/*! ZSTD_freeDDict() : + * Function frees memory allocated with ZSTD_createDDict() */ +ZSTDLIB_API size_t ZSTD_freeDDict(ZSTD_DDict* ddict); + +/*! ZSTD_decompress_usingDDict() : + * Decompression using a digested Dictionary. + * Recommended when same dictionary is used multiple times. */ +ZSTDLIB_API size_t ZSTD_decompress_usingDDict(ZSTD_DCtx* dctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + const ZSTD_DDict* ddict); + + +/******************************** + * Dictionary helper functions + *******************************/ + +/*! ZSTD_getDictID_fromDict() : + * Provides the dictID stored within dictionary. + * if @return == 0, the dictionary is not conformant with Zstandard specification. + * It can still be loaded, but as a content-only dictionary. */ +ZSTDLIB_API unsigned ZSTD_getDictID_fromDict(const void* dict, size_t dictSize); + +/*! 
ZSTD_getDictID_fromDDict() :
+ * Provides the dictID of the dictionary loaded into `ddict`.
+ * If @return == 0, the dictionary is not conformant to Zstandard specification, or empty.
+ * Non-conformant dictionaries can still be loaded, but as content-only dictionaries. */
+ZSTDLIB_API unsigned ZSTD_getDictID_fromDDict(const ZSTD_DDict* ddict);
+
+/*! ZSTD_getDictID_fromFrame() :
+ * Provides the dictID required to decompress the frame stored within `src`.
+ * If @return == 0, the dictID could not be decoded.
+ * This could be for one of the following reasons :
+ * - The frame does not require a dictionary to be decoded (most common case).
+ * - The frame was built with dictID intentionally removed. Whatever dictionary is necessary is hidden information.
+ *   Note : this use case also happens when using a non-conformant dictionary.
+ * - `srcSize` is too small, and as a result, the frame header could not be decoded (only possible if `srcSize < ZSTD_FRAMEHEADERSIZE_MAX`).
+ * - This is not a Zstandard frame.
+ * When identifying the exact failure cause, it's possible to use ZSTD_getFrameHeader(), which will provide a more precise error code. */
+ZSTDLIB_API unsigned ZSTD_getDictID_fromFrame(const void* src, size_t srcSize);
+
+
+/*******************************************************************************
+ * Advanced dictionary and prefix API
+ *
+ * This API allows dictionaries to be used with ZSTD_compress2(),
+ * ZSTD_compressStream2(), and ZSTD_decompress(). Dictionaries are sticky, and
+ * only reset when the context is reset with ZSTD_reset_parameters or
+ * ZSTD_reset_session_and_parameters. Prefixes are single-use.
+ ******************************************************************************/
+
+
+/*! ZSTD_CCtx_loadDictionary() :
+ * Create an internal CDict from `dict` buffer.
+ * Decompression will have to use same dictionary.
+ * @result : 0, or an error code (which can be tested with ZSTD_isError()).
+ * Special: Loading a NULL (or 0-size) dictionary invalidates previous dictionary, + * meaning "return to no-dictionary mode". + * Note 1 : Dictionary is sticky, it will be used for all future compressed frames. + * To return to "no-dictionary" situation, load a NULL dictionary (or reset parameters). + * Note 2 : Loading a dictionary involves building tables. + * It's also a CPU consuming operation, with non-negligible impact on latency. + * Tables are dependent on compression parameters, and for this reason, + * compression parameters can no longer be changed after loading a dictionary. + * Note 3 :`dict` content will be copied internally. + * Use experimental ZSTD_CCtx_loadDictionary_byReference() to reference content instead. + * In such a case, dictionary buffer must outlive its users. + * Note 4 : Use ZSTD_CCtx_loadDictionary_advanced() + * to precisely select how dictionary content must be interpreted. */ +ZSTDLIB_API size_t ZSTD_CCtx_loadDictionary(ZSTD_CCtx* cctx, const void* dict, size_t dictSize); + +/*! ZSTD_CCtx_refCDict() : + * Reference a prepared dictionary, to be used for all next compressed frames. + * Note that compression parameters are enforced from within CDict, + * and supersede any compression parameter previously set within CCtx. + * The parameters ignored are labelled as "superseded-by-cdict" in the ZSTD_cParameter enum docs. + * The ignored parameters will be used again if the CCtx is returned to no-dictionary mode. + * The dictionary will remain valid for future compressed frames using same CCtx. + * @result : 0, or an error code (which can be tested with ZSTD_isError()). + * Special : Referencing a NULL CDict means "return to no-dictionary mode". + * Note 1 : Currently, only one dictionary can be managed. + * Referencing a new dictionary effectively "discards" any previous one. + * Note 2 : CDict is just referenced, its lifetime must outlive its usage within CCtx. 
*/ +ZSTDLIB_API size_t ZSTD_CCtx_refCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict); + +/*! ZSTD_CCtx_refPrefix() : + * Reference a prefix (single-usage dictionary) for next compressed frame. + * A prefix is **only used once**. Tables are discarded at end of frame (ZSTD_e_end). + * Decompression will need same prefix to properly regenerate data. + * Compressing with a prefix is similar in outcome as performing a diff and compressing it, + * but performs much faster, especially during decompression (compression speed is tunable with compression level). + * @result : 0, or an error code (which can be tested with ZSTD_isError()). + * Special: Adding any prefix (including NULL) invalidates any previous prefix or dictionary + * Note 1 : Prefix buffer is referenced. It **must** outlive compression. + * Its content must remain unmodified during compression. + * Note 2 : If the intention is to diff some large src data blob with some prior version of itself, + * ensure that the window size is large enough to contain the entire source. + * See ZSTD_c_windowLog. + * Note 3 : Referencing a prefix involves building tables, which are dependent on compression parameters. + * It's a CPU consuming operation, with non-negligible impact on latency. + * If there is a need to use the same prefix multiple times, consider loadDictionary instead. + * Note 4 : By default, the prefix is interpreted as raw content (ZSTD_dct_rawContent). + * Use experimental ZSTD_CCtx_refPrefix_advanced() to alter dictionary interpretation. */ +ZSTDLIB_API size_t ZSTD_CCtx_refPrefix(ZSTD_CCtx* cctx, + const void* prefix, size_t prefixSize); + +/*! ZSTD_DCtx_loadDictionary() : + * Create an internal DDict from dict buffer, + * to be used to decompress next frames. + * The dictionary remains valid for all future frames, until explicitly invalidated. + * @result : 0, or an error code (which can be tested with ZSTD_isError()). 
+ * Special : Adding a NULL (or 0-size) dictionary invalidates any previous dictionary, + * meaning "return to no-dictionary mode". + * Note 1 : Loading a dictionary involves building tables, + * which has a non-negligible impact on CPU usage and latency. + * It's recommended to "load once, use many times", to amortize the cost + * Note 2 :`dict` content will be copied internally, so `dict` can be released after loading. + * Use ZSTD_DCtx_loadDictionary_byReference() to reference dictionary content instead. + * Note 3 : Use ZSTD_DCtx_loadDictionary_advanced() to take control of + * how dictionary content is loaded and interpreted. + */ +ZSTDLIB_API size_t ZSTD_DCtx_loadDictionary(ZSTD_DCtx* dctx, const void* dict, size_t dictSize); + +/*! ZSTD_DCtx_refDDict() : + * Reference a prepared dictionary, to be used to decompress next frames. + * The dictionary remains active for decompression of future frames using same DCtx. + * + * If called with ZSTD_d_refMultipleDDicts enabled, repeated calls of this function + * will store the DDict references in a table, and the DDict used for decompression + * will be determined at decompression time, as per the dict ID in the frame. + * The memory for the table is allocated on the first call to refDDict, and can be + * freed with ZSTD_freeDCtx(). + * + * @result : 0, or an error code (which can be tested with ZSTD_isError()). + * Note 1 : Currently, only one dictionary can be managed. + * Referencing a new dictionary effectively "discards" any previous one. + * Special: referencing a NULL DDict means "return to no-dictionary mode". + * Note 2 : DDict is just referenced, its lifetime must outlive its usage from DCtx. + */ +ZSTDLIB_API size_t ZSTD_DCtx_refDDict(ZSTD_DCtx* dctx, const ZSTD_DDict* ddict); + +/*! ZSTD_DCtx_refPrefix() : + * Reference a prefix (single-usage dictionary) to decompress next frame. 
+ * This is the reverse operation of ZSTD_CCtx_refPrefix(), + * and must use the same prefix as the one used during compression. + * Prefix is **only used once**. Reference is discarded at end of frame. + * End of frame is reached when ZSTD_decompressStream() returns 0. + * @result : 0, or an error code (which can be tested with ZSTD_isError()). + * Note 1 : Adding any prefix (including NULL) invalidates any previously set prefix or dictionary + * Note 2 : Prefix buffer is referenced. It **must** outlive decompression. + * Prefix buffer must remain unmodified up to the end of frame, + * reached when ZSTD_decompressStream() returns 0. + * Note 3 : By default, the prefix is treated as raw content (ZSTD_dct_rawContent). + * Use ZSTD_CCtx_refPrefix_advanced() to alter dictMode (Experimental section) + * Note 4 : Referencing a raw content prefix has almost no cpu nor memory cost. + * A full dictionary is more costly, as it requires building tables. + */ +ZSTDLIB_API size_t ZSTD_DCtx_refPrefix(ZSTD_DCtx* dctx, + const void* prefix, size_t prefixSize); + +/* === Memory management === */ + +/*! ZSTD_sizeof_*() : + * These functions give the _current_ memory usage of selected object. + * Note that object memory usage can evolve (increase or decrease) over time. */ +ZSTDLIB_API size_t ZSTD_sizeof_CCtx(const ZSTD_CCtx* cctx); +ZSTDLIB_API size_t ZSTD_sizeof_DCtx(const ZSTD_DCtx* dctx); +ZSTDLIB_API size_t ZSTD_sizeof_CStream(const ZSTD_CStream* zcs); +ZSTDLIB_API size_t ZSTD_sizeof_DStream(const ZSTD_DStream* zds); +ZSTDLIB_API size_t ZSTD_sizeof_CDict(const ZSTD_CDict* cdict); +ZSTDLIB_API size_t ZSTD_sizeof_DDict(const ZSTD_DDict* ddict); + +#endif /* ZSTD_H_235446 */ + + +/* ************************************************************************************** + * ADVANCED AND EXPERIMENTAL FUNCTIONS + **************************************************************************************** + * The definitions in the following section are considered experimental. 
+ * They are provided for advanced scenarios. + * They should never be used with a dynamic library, as prototypes may change in the future. + * Use them only in association with static linking. + * ***************************************************************************************/ + +#if defined(ZSTD_STATIC_LINKING_ONLY) && !defined(ZSTD_H_ZSTD_STATIC_LINKING_ONLY) +#define ZSTD_H_ZSTD_STATIC_LINKING_ONLY + +/**************************************************************************************** + * experimental API (static linking only) + **************************************************************************************** + * The following symbols and constants + * are not planned to join "stable API" status in the near future. + * They can still change in future versions. + * Some of them are planned to remain in the static_only section indefinitely. + * Some of them might be removed in the future (especially when redundant with existing stable functions) + * ***************************************************************************************/ + +#define ZSTD_FRAMEHEADERSIZE_PREFIX(format) ((format) == ZSTD_f_zstd1 ? 5 : 1) /* minimum input size required to query frame header size */ +#define ZSTD_FRAMEHEADERSIZE_MIN(format) ((format) == ZSTD_f_zstd1 ? 6 : 2) +#define ZSTD_FRAMEHEADERSIZE_MAX 18 /* can be useful for static allocation */ +#define ZSTD_SKIPPABLEHEADERSIZE 8 + +/* compression parameter bounds */ +#define ZSTD_WINDOWLOG_MAX_32 30 +#define ZSTD_WINDOWLOG_MAX_64 31 +#define ZSTD_WINDOWLOG_MAX ((int)(sizeof(size_t) == 4 ? ZSTD_WINDOWLOG_MAX_32 : ZSTD_WINDOWLOG_MAX_64)) +#define ZSTD_WINDOWLOG_MIN 10 +#define ZSTD_HASHLOG_MAX ((ZSTD_WINDOWLOG_MAX < 30) ? ZSTD_WINDOWLOG_MAX : 30) +#define ZSTD_HASHLOG_MIN 6 +#define ZSTD_CHAINLOG_MAX_32 29 +#define ZSTD_CHAINLOG_MAX_64 30 +#define ZSTD_CHAINLOG_MAX ((int)(sizeof(size_t) == 4 ? 
ZSTD_CHAINLOG_MAX_32 : ZSTD_CHAINLOG_MAX_64)) +#define ZSTD_CHAINLOG_MIN ZSTD_HASHLOG_MIN +#define ZSTD_SEARCHLOG_MAX (ZSTD_WINDOWLOG_MAX-1) +#define ZSTD_SEARCHLOG_MIN 1 +#define ZSTD_MINMATCH_MAX 7 /* only for ZSTD_fast, other strategies are limited to 6 */ +#define ZSTD_MINMATCH_MIN 3 /* only for ZSTD_btopt+, faster strategies are limited to 4 */ +#define ZSTD_TARGETLENGTH_MAX ZSTD_BLOCKSIZE_MAX +#define ZSTD_TARGETLENGTH_MIN 0 /* note : comparing this constant to an unsigned results in a tautological test */ +#define ZSTD_STRATEGY_MIN ZSTD_fast +#define ZSTD_STRATEGY_MAX ZSTD_btultra2 + + +#define ZSTD_OVERLAPLOG_MIN 0 +#define ZSTD_OVERLAPLOG_MAX 9 + +#define ZSTD_WINDOWLOG_LIMIT_DEFAULT 27 /* by default, the streaming decoder will refuse any frame + * requiring larger than (1< 0: + * If litLength != 0: + * rep == 1 --> offset == repeat_offset_1 + * rep == 2 --> offset == repeat_offset_2 + * rep == 3 --> offset == repeat_offset_3 + * If litLength == 0: + * rep == 1 --> offset == repeat_offset_2 + * rep == 2 --> offset == repeat_offset_3 + * rep == 3 --> offset == repeat_offset_1 - 1 + * + * Note: This field is optional. ZSTD_generateSequences() will calculate the value of + * 'rep', but repeat offsets do not necessarily need to be calculated from an external + * sequence provider's perspective. For example, ZSTD_compressSequences() does not + * use this 'rep' field at all (as of now). 
+ */ +} ZSTD_Sequence; + +typedef struct { + unsigned windowLog; /**< largest match distance : larger == more compression, more memory needed during decompression */ + unsigned chainLog; /**< fully searched segment : larger == more compression, slower, more memory (useless for fast) */ + unsigned hashLog; /**< dispatch table : larger == faster, more memory */ + unsigned searchLog; /**< nb of searches : larger == more compression, slower */ + unsigned minMatch; /**< match length searched : larger == faster decompression, sometimes less compression */ + unsigned targetLength; /**< acceptable match size for optimal parser (only) : larger == more compression, slower */ + ZSTD_strategy strategy; /**< see ZSTD_strategy definition above */ +} ZSTD_compressionParameters; + +typedef struct { + int contentSizeFlag; /**< 1: content size will be in frame header (when known) */ + int checksumFlag; /**< 1: generate a 32-bits checksum using XXH64 algorithm at end of frame, for error detection */ + int noDictIDFlag; /**< 1: no dictID will be saved into frame header (dictID is only useful for dictionary compression) */ +} ZSTD_frameParameters; + +typedef struct { + ZSTD_compressionParameters cParams; + ZSTD_frameParameters fParams; +} ZSTD_parameters; + +typedef enum { + ZSTD_dct_auto = 0, /* dictionary is "full" when starting with ZSTD_MAGIC_DICTIONARY, otherwise it is "rawContent" */ + ZSTD_dct_rawContent = 1, /* ensures dictionary is always loaded as rawContent, even if it starts with ZSTD_MAGIC_DICTIONARY */ + ZSTD_dct_fullDict = 2 /* refuses to load a dictionary if it does not respect Zstandard's specification, starting with ZSTD_MAGIC_DICTIONARY */ +} ZSTD_dictContentType_e; + +typedef enum { + ZSTD_dlm_byCopy = 0, /**< Copy dictionary content internally */ + ZSTD_dlm_byRef = 1 /**< Reference dictionary content -- the dictionary buffer must outlive its users. 
*/ +} ZSTD_dictLoadMethod_e; + +typedef enum { + ZSTD_f_zstd1 = 0, /* zstd frame format, specified in zstd_compression_format.md (default) */ + ZSTD_f_zstd1_magicless = 1 /* Variant of zstd frame format, without initial 4-bytes magic number. + * Useful to save 4 bytes per generated frame. + * Decoder cannot recognise automatically this format, requiring this instruction. */ +} ZSTD_format_e; + +typedef enum { + /* Note: this enum controls ZSTD_d_forceIgnoreChecksum */ + ZSTD_d_validateChecksum = 0, + ZSTD_d_ignoreChecksum = 1 +} ZSTD_forceIgnoreChecksum_e; + +typedef enum { + /* Note: this enum controls ZSTD_d_refMultipleDDicts */ + ZSTD_rmd_refSingleDDict = 0, + ZSTD_rmd_refMultipleDDicts = 1 +} ZSTD_refMultipleDDicts_e; + +typedef enum { + /* Note: this enum and the behavior it controls are effectively internal + * implementation details of the compressor. They are expected to continue + * to evolve and should be considered only in the context of extremely + * advanced performance tuning. + * + * Zstd currently supports the use of a CDict in three ways: + * + * - The contents of the CDict can be copied into the working context. This + * means that the compression can search both the dictionary and input + * while operating on a single set of internal tables. This makes + * the compression faster per-byte of input. However, the initial copy of + * the CDict's tables incurs a fixed cost at the beginning of the + * compression. For small compressions (< 8 KB), that copy can dominate + * the cost of the compression. + * + * - The CDict's tables can be used in-place. In this model, compression is + * slower per input byte, because the compressor has to search two sets of + * tables. However, this model incurs no start-up cost (as long as the + * working context's tables can be reused). For small inputs, this can be + * faster than copying the CDict's tables. 
+ * + * - The CDict's tables are not used at all, and instead we use the working + * context alone to reload the dictionary and use params based on the source + * size. See ZSTD_compress_insertDictionary() and ZSTD_compress_usingDict(). + * This method is effective when the dictionary sizes are very small relative + * to the input size, and the input size is fairly large to begin with. + * + * Zstd has a simple internal heuristic that selects which strategy to use + * at the beginning of a compression. However, if experimentation shows that + * Zstd is making poor choices, it is possible to override that choice with + * this enum. + */ + ZSTD_dictDefaultAttach = 0, /* Use the default heuristic. */ + ZSTD_dictForceAttach = 1, /* Never copy the dictionary. */ + ZSTD_dictForceCopy = 2, /* Always copy the dictionary. */ + ZSTD_dictForceLoad = 3 /* Always reload the dictionary */ +} ZSTD_dictAttachPref_e; + +typedef enum { + ZSTD_lcm_auto = 0, /**< Automatically determine the compression mode based on the compression level. + * Negative compression levels will be uncompressed, and positive compression + * levels will be compressed. */ + ZSTD_lcm_huffman = 1, /**< Always attempt Huffman compression. Uncompressed literals will still be + * emitted if Huffman compression is not profitable. */ + ZSTD_lcm_uncompressed = 2 /**< Always emit uncompressed literals. */ +} ZSTD_literalCompressionMode_e; + + +/*************************************** +* Frame size functions +***************************************/ + +/*! ZSTD_findDecompressedSize() : + * `src` should point to the start of a series of ZSTD encoded and/or skippable frames + * `srcSize` must be the _exact_ size of this series + * (i.e. 
there should be a frame boundary at `src + srcSize`) + * @return : - decompressed size of all data in all successive frames + * - if the decompressed size cannot be determined: ZSTD_CONTENTSIZE_UNKNOWN + * - if an error occurred: ZSTD_CONTENTSIZE_ERROR + * + * note 1 : decompressed size is an optional field, that may not be present, especially in streaming mode. + * When `return==ZSTD_CONTENTSIZE_UNKNOWN`, data to decompress could be any size. + * In which case, it's necessary to use streaming mode to decompress data. + * note 2 : decompressed size is always present when compression is done with ZSTD_compress() + * note 3 : decompressed size can be very large (64-bits value), + * potentially larger than what local system can handle as a single memory segment. + * In which case, it's necessary to use streaming mode to decompress data. + * note 4 : If source is untrusted, decompressed size could be wrong or intentionally modified. + * Always ensure result fits within application's authorized limits. + * Each application can set its own limits. + * note 5 : ZSTD_findDecompressedSize handles multiple frames, and so it must traverse the input to + * read each contained frame header. This is fast as most of the data is skipped, + * however it does mean that all frame data must be present and valid. */ +ZSTDLIB_API unsigned long long ZSTD_findDecompressedSize(const void* src, size_t srcSize); + +/*! ZSTD_decompressBound() : + * `src` should point to the start of a series of ZSTD encoded and/or skippable frames + * `srcSize` must be the _exact_ size of this series + * (i.e. there should be a frame boundary at `src + srcSize`) + * @return : - upper-bound for the decompressed size of all data in all successive frames + * - if an error occurred: ZSTD_CONTENTSIZE_ERROR + * + * note 1 : an error can occur if `src` contains an invalid or incorrectly formatted frame. 
+ * note 2 : the upper-bound is exact when the decompressed size field is available in every ZSTD encoded frame of `src`.
+ *          in this case, `ZSTD_findDecompressedSize` and `ZSTD_decompressBound` return the same value.
+ * note 3 : when the decompressed size field isn't available, the upper-bound for that frame is calculated by:
+ *            upper-bound = # blocks * min(128 KB, Window_Size)
+ */
+ZSTDLIB_API unsigned long long ZSTD_decompressBound(const void* src, size_t srcSize);
+
+/*! ZSTD_frameHeaderSize() :
+ *  srcSize must be >= ZSTD_FRAMEHEADERSIZE_PREFIX.
+ * @return : size of the Frame Header,
+ *           or an error code (if srcSize is too small) */
+ZSTDLIB_API size_t ZSTD_frameHeaderSize(const void* src, size_t srcSize);
+
+typedef enum {
+  ZSTD_sf_noBlockDelimiters = 0,         /* Representation of ZSTD_Sequence has no block delimiters, sequences only */
+  ZSTD_sf_explicitBlockDelimiters = 1    /* Representation of ZSTD_Sequence contains explicit block delimiters */
+} ZSTD_sequenceFormat_e;
+
+/*! ZSTD_generateSequences() :
+ * Generate sequences using ZSTD_compress2, given a source buffer.
+ *
+ * Each block will end with a dummy sequence
+ * with offset == 0, matchLength == 0, and litLength == length of last literals.
+ * litLength may be == 0, and if so, then the sequence of (of: 0 ml: 0 ll: 0)
+ * simply acts as a block delimiter.
+ *
+ * zc can be used to insert custom compression params.
+ * This function invokes ZSTD_compress2
+ *
+ * The output of this function can be fed into ZSTD_compressSequences() with CCtx
+ * setting of ZSTD_c_blockDelimiters as ZSTD_sf_explicitBlockDelimiters
+ * @return : number of sequences generated
+ */
+
+ZSTDLIB_API size_t ZSTD_generateSequences(ZSTD_CCtx* zc, ZSTD_Sequence* outSeqs,
+                                          size_t outSeqsSize, const void* src, size_t srcSize);
+
+/*! ZSTD_mergeBlockDelimiters() :
+ * Given an array of ZSTD_Sequence, remove all sequences that represent block delimiters/last literals
+ * by merging them into the literals of the next sequence.
+ * + * As such, the final generated result has no explicit representation of block boundaries, + * and the final last literals segment is not represented in the sequences. + * + * The output of this function can be fed into ZSTD_compressSequences() with CCtx + * setting of ZSTD_c_blockDelimiters as ZSTD_sf_noBlockDelimiters + * @return : number of sequences left after merging + */ +ZSTDLIB_API size_t ZSTD_mergeBlockDelimiters(ZSTD_Sequence* sequences, size_t seqsSize); + +/*! ZSTD_compressSequences() : + * Compress an array of ZSTD_Sequence, generated from the original source buffer, into dst. + * If a dictionary is included, then the cctx should reference the dict. (see: ZSTD_CCtx_refCDict(), ZSTD_CCtx_loadDictionary(), etc.) + * The entire source is compressed into a single frame. + * + * The compression behavior changes based on cctx params. In particular: + * If ZSTD_c_blockDelimiters == ZSTD_sf_noBlockDelimiters, the array of ZSTD_Sequence is expected to contain + * no block delimiters (defined in ZSTD_Sequence). Block boundaries are roughly determined based on + * the block size derived from the cctx, and sequences may be split. This is the default setting. + * + * If ZSTD_c_blockDelimiters == ZSTD_sf_explicitBlockDelimiters, the array of ZSTD_Sequence is expected to contain + * block delimiters (defined in ZSTD_Sequence). Behavior is undefined if no block delimiters are provided. + * + * If ZSTD_c_validateSequences == 0, this function will blindly accept the sequences provided. Invalid sequences cause undefined + * behavior. If ZSTD_c_validateSequences == 1, then if sequence is invalid (see doc/zstd_compression_format.md for + * specifics regarding offset/matchlength requirements) then the function will bail out and return an error. + * + * In addition to the two adjustable experimental params, there are other important cctx params. + * - ZSTD_c_minMatch MUST be set as less than or equal to the smallest match generated by the match finder. 
It has a minimum value of ZSTD_MINMATCH_MIN.
+ * - ZSTD_c_compressionLevel accordingly adjusts the strength of the entropy coder, as it would in typical compression.
+ * - ZSTD_c_windowLog affects offset validation: this function will return an error at higher debug levels if a provided offset
+ *   is larger than what the spec allows for a given window log and dictionary (if present). See: doc/zstd_compression_format.md
+ *
+ * Note: Repcodes are, as of now, always re-calculated within this function, so ZSTD_Sequence::rep is unused.
+ * Note 2: Once we integrate ability to ingest repcodes, the explicit block delims mode must respect those repcodes exactly,
+ *         and cannot emit an RLE block that disagrees with the repcode history
+ * @return : final compressed size or a ZSTD error.
+ */
+ZSTDLIB_API size_t ZSTD_compressSequences(ZSTD_CCtx* const cctx, void* dst, size_t dstSize,
+                  const ZSTD_Sequence* inSeqs, size_t inSeqsSize,
+                  const void* src, size_t srcSize);
+
+
+/*! ZSTD_writeSkippableFrame() :
+ * Generates a zstd skippable frame containing data given by src, and writes it to dst buffer.
+ *
+ * Skippable frames begin with a 4-byte magic number. There are 16 possible choices of magic number,
+ * ranging from ZSTD_MAGIC_SKIPPABLE_START to ZSTD_MAGIC_SKIPPABLE_START+15.
+ * As such, the parameter magicVariant controls the exact skippable frame magic number variant used, so
+ * the magic number used will be ZSTD_MAGIC_SKIPPABLE_START + magicVariant.
+ *
+ * Returns an error if destination buffer is not large enough, if the source size is not representable
+ * with a 4-byte unsigned int, or if the parameter magicVariant is greater than 15 (and therefore invalid).
+ *
+ * @return : number of bytes written or a ZSTD error.
+ */ +ZSTDLIB_API size_t ZSTD_writeSkippableFrame(void* dst, size_t dstCapacity, + const void* src, size_t srcSize, unsigned magicVariant); + + +/*************************************** +* Memory management +***************************************/ + +/*! ZSTD_estimate*() : + * These functions make it possible to estimate memory usage + * of a future {D,C}Ctx, before its creation. + * + * ZSTD_estimateCCtxSize() will provide a memory budget large enough + * for any compression level up to selected one. + * Note : Unlike ZSTD_estimateCStreamSize*(), this estimate + * does not include space for a window buffer. + * Therefore, the estimation is only guaranteed for single-shot compressions, not streaming. + * The estimate will assume the input may be arbitrarily large, + * which is the worst case. + * + * When srcSize can be bound by a known and rather "small" value, + * this fact can be used to provide a tighter estimation + * because the CCtx compression context will need less memory. + * This tighter estimation can be provided by more advanced functions + * ZSTD_estimateCCtxSize_usingCParams(), which can be used in tandem with ZSTD_getCParams(), + * and ZSTD_estimateCCtxSize_usingCCtxParams(), which can be used in tandem with ZSTD_CCtxParams_setParameter(). + * Both can be used to estimate memory using custom compression parameters and arbitrary srcSize limits. + * + * Note 2 : only single-threaded compression is supported. + * ZSTD_estimateCCtxSize_usingCCtxParams() will return an error code if ZSTD_c_nbWorkers is >= 1. + */ +ZSTDLIB_API size_t ZSTD_estimateCCtxSize(int compressionLevel); +ZSTDLIB_API size_t ZSTD_estimateCCtxSize_usingCParams(ZSTD_compressionParameters cParams); +ZSTDLIB_API size_t ZSTD_estimateCCtxSize_usingCCtxParams(const ZSTD_CCtx_params* params); +ZSTDLIB_API size_t ZSTD_estimateDCtxSize(void); + +/*! ZSTD_estimateCStreamSize() : + * ZSTD_estimateCStreamSize() will provide a budget large enough for any compression level up to selected one. 
+ * It will also consider src size to be arbitrarily "large", which is worst case. + * If srcSize is known to always be small, ZSTD_estimateCStreamSize_usingCParams() can provide a tighter estimation. + * ZSTD_estimateCStreamSize_usingCParams() can be used in tandem with ZSTD_getCParams() to create cParams from compressionLevel. + * ZSTD_estimateCStreamSize_usingCCtxParams() can be used in tandem with ZSTD_CCtxParams_setParameter(). Only single-threaded compression is supported. This function will return an error code if ZSTD_c_nbWorkers is >= 1. + * Note : CStream size estimation is only correct for single-threaded compression. + * ZSTD_DStream memory budget depends on window Size. + * This information can be passed manually, using ZSTD_estimateDStreamSize, + * or deducted from a valid frame Header, using ZSTD_estimateDStreamSize_fromFrame(); + * Note : if streaming is init with function ZSTD_init?Stream_usingDict(), + * an internal ?Dict will be created, which additional size is not estimated here. + * In this case, get total size by adding ZSTD_estimate?DictSize */ +ZSTDLIB_API size_t ZSTD_estimateCStreamSize(int compressionLevel); +ZSTDLIB_API size_t ZSTD_estimateCStreamSize_usingCParams(ZSTD_compressionParameters cParams); +ZSTDLIB_API size_t ZSTD_estimateCStreamSize_usingCCtxParams(const ZSTD_CCtx_params* params); +ZSTDLIB_API size_t ZSTD_estimateDStreamSize(size_t windowSize); +ZSTDLIB_API size_t ZSTD_estimateDStreamSize_fromFrame(const void* src, size_t srcSize); + +/*! ZSTD_estimate?DictSize() : + * ZSTD_estimateCDictSize() will bet that src size is relatively "small", and content is copied, like ZSTD_createCDict(). + * ZSTD_estimateCDictSize_advanced() makes it possible to control compression parameters precisely, like ZSTD_createCDict_advanced(). + * Note : dictionaries created by reference (`ZSTD_dlm_byRef`) are logically smaller. 
+ */ +ZSTDLIB_API size_t ZSTD_estimateCDictSize(size_t dictSize, int compressionLevel); +ZSTDLIB_API size_t ZSTD_estimateCDictSize_advanced(size_t dictSize, ZSTD_compressionParameters cParams, ZSTD_dictLoadMethod_e dictLoadMethod); +ZSTDLIB_API size_t ZSTD_estimateDDictSize(size_t dictSize, ZSTD_dictLoadMethod_e dictLoadMethod); + +/*! ZSTD_initStatic*() : + * Initialize an object using a pre-allocated fixed-size buffer. + * workspace: The memory area to emplace the object into. + * Provided pointer *must be 8-bytes aligned*. + * Buffer must outlive object. + * workspaceSize: Use ZSTD_estimate*Size() to determine + * how large workspace must be to support target scenario. + * @return : pointer to object (same address as workspace, just different type), + * or NULL if error (size too small, incorrect alignment, etc.) + * Note : zstd will never resize nor malloc() when using a static buffer. + * If the object requires more memory than available, + * zstd will just error out (typically ZSTD_error_memory_allocation). + * Note 2 : there is no corresponding "free" function. + * Since workspace is allocated externally, it must be freed externally too. + * Note 3 : cParams : use ZSTD_getCParams() to convert a compression level + * into its associated cParams. + * Limitation 1 : currently not compatible with internal dictionary creation, triggered by + * ZSTD_CCtx_loadDictionary(), ZSTD_initCStream_usingDict() or ZSTD_initDStream_usingDict(). + * Limitation 2 : static cctx currently not compatible with multi-threading. + * Limitation 3 : static dctx is incompatible with legacy support. 
+ */ +ZSTDLIB_API ZSTD_CCtx* ZSTD_initStaticCCtx(void* workspace, size_t workspaceSize); +ZSTDLIB_API ZSTD_CStream* ZSTD_initStaticCStream(void* workspace, size_t workspaceSize); /**< same as ZSTD_initStaticCCtx() */ + +ZSTDLIB_API ZSTD_DCtx* ZSTD_initStaticDCtx(void* workspace, size_t workspaceSize); +ZSTDLIB_API ZSTD_DStream* ZSTD_initStaticDStream(void* workspace, size_t workspaceSize); /**< same as ZSTD_initStaticDCtx() */ + +ZSTDLIB_API const ZSTD_CDict* ZSTD_initStaticCDict( + void* workspace, size_t workspaceSize, + const void* dict, size_t dictSize, + ZSTD_dictLoadMethod_e dictLoadMethod, + ZSTD_dictContentType_e dictContentType, + ZSTD_compressionParameters cParams); + +ZSTDLIB_API const ZSTD_DDict* ZSTD_initStaticDDict( + void* workspace, size_t workspaceSize, + const void* dict, size_t dictSize, + ZSTD_dictLoadMethod_e dictLoadMethod, + ZSTD_dictContentType_e dictContentType); + + +/*! Custom memory allocation : + * These prototypes make it possible to pass your own allocation/free functions. + * ZSTD_customMem is provided at creation time, using ZSTD_create*_advanced() variants listed below. + * All allocation/free operations will be completed using these custom variants instead of regular ones. 
+ */ +typedef void* (*ZSTD_allocFunction) (void* opaque, size_t size); +typedef void (*ZSTD_freeFunction) (void* opaque, void* address); +typedef struct { ZSTD_allocFunction customAlloc; ZSTD_freeFunction customFree; void* opaque; } ZSTD_customMem; +static +#ifdef __GNUC__ +__attribute__((__unused__)) +#endif +ZSTD_customMem const ZSTD_defaultCMem = { NULL, NULL, NULL }; /**< this constant defers to stdlib's functions */ + +ZSTDLIB_API ZSTD_CCtx* ZSTD_createCCtx_advanced(ZSTD_customMem customMem); +ZSTDLIB_API ZSTD_CStream* ZSTD_createCStream_advanced(ZSTD_customMem customMem); +ZSTDLIB_API ZSTD_DCtx* ZSTD_createDCtx_advanced(ZSTD_customMem customMem); +ZSTDLIB_API ZSTD_DStream* ZSTD_createDStream_advanced(ZSTD_customMem customMem); + +ZSTDLIB_API ZSTD_CDict* ZSTD_createCDict_advanced(const void* dict, size_t dictSize, + ZSTD_dictLoadMethod_e dictLoadMethod, + ZSTD_dictContentType_e dictContentType, + ZSTD_compressionParameters cParams, + ZSTD_customMem customMem); + +/* ! Thread pool : + * These prototypes make it possible to share a thread pool among multiple compression contexts. + * This can limit resources for applications with multiple threads where each one uses + * a threaded compression mode (via ZSTD_c_nbWorkers parameter). + * ZSTD_createThreadPool creates a new thread pool with a given number of threads. + * Note that the lifetime of such pool must exist while being used. + * ZSTD_CCtx_refThreadPool assigns a thread pool to a context (use NULL argument value + * to use an internal thread pool). + * ZSTD_freeThreadPool frees a thread pool. + */ +typedef struct POOL_ctx_s ZSTD_threadPool; +ZSTDLIB_API ZSTD_threadPool* ZSTD_createThreadPool(size_t numThreads); +ZSTDLIB_API void ZSTD_freeThreadPool (ZSTD_threadPool* pool); +ZSTDLIB_API size_t ZSTD_CCtx_refThreadPool(ZSTD_CCtx* cctx, ZSTD_threadPool* pool); + + +/* + * This API is temporary and is expected to change or disappear in the future! 
+ */ +ZSTDLIB_API ZSTD_CDict* ZSTD_createCDict_advanced2( + const void* dict, size_t dictSize, + ZSTD_dictLoadMethod_e dictLoadMethod, + ZSTD_dictContentType_e dictContentType, + const ZSTD_CCtx_params* cctxParams, + ZSTD_customMem customMem); + +ZSTDLIB_API ZSTD_DDict* ZSTD_createDDict_advanced( + const void* dict, size_t dictSize, + ZSTD_dictLoadMethod_e dictLoadMethod, + ZSTD_dictContentType_e dictContentType, + ZSTD_customMem customMem); + + +/*************************************** +* Advanced compression functions +***************************************/ + +/*! ZSTD_createCDict_byReference() : + * Create a digested dictionary for compression + * Dictionary content is just referenced, not duplicated. + * As a consequence, `dictBuffer` **must** outlive CDict, + * and its content must remain unmodified throughout the lifetime of CDict. + * note: equivalent to ZSTD_createCDict_advanced(), with dictLoadMethod==ZSTD_dlm_byRef */ +ZSTDLIB_API ZSTD_CDict* ZSTD_createCDict_byReference(const void* dictBuffer, size_t dictSize, int compressionLevel); + +/*! ZSTD_getDictID_fromCDict() : + * Provides the dictID of the dictionary loaded into `cdict`. + * If @return == 0, the dictionary is not conformant to Zstandard specification, or empty. + * Non-conformant dictionaries can still be loaded, but as content-only dictionaries. */ +ZSTDLIB_API unsigned ZSTD_getDictID_fromCDict(const ZSTD_CDict* cdict); + +/*! ZSTD_getCParams() : + * @return ZSTD_compressionParameters structure for a selected compression level and estimated srcSize. + * `estimatedSrcSize` value is optional, select 0 if not known */ +ZSTDLIB_API ZSTD_compressionParameters ZSTD_getCParams(int compressionLevel, unsigned long long estimatedSrcSize, size_t dictSize); + +/*! ZSTD_getParams() : + * same as ZSTD_getCParams(), but @return a full `ZSTD_parameters` object instead of sub-component `ZSTD_compressionParameters`. 
+ * All fields of `ZSTD_frameParameters` are set to default : contentSize=1, checksum=0, noDictID=0 */ +ZSTDLIB_API ZSTD_parameters ZSTD_getParams(int compressionLevel, unsigned long long estimatedSrcSize, size_t dictSize); + +/*! ZSTD_checkCParams() : + * Ensure param values remain within authorized range. + * @return 0 on success, or an error code (can be checked with ZSTD_isError()) */ +ZSTDLIB_API size_t ZSTD_checkCParams(ZSTD_compressionParameters params); + +/*! ZSTD_adjustCParams() : + * optimize params for a given `srcSize` and `dictSize`. + * `srcSize` can be unknown, in which case use ZSTD_CONTENTSIZE_UNKNOWN. + * `dictSize` must be `0` when there is no dictionary. + * cPar can be invalid : all parameters will be clamped within valid range in the @return struct. + * This function never fails (wide contract) */ +ZSTDLIB_API ZSTD_compressionParameters ZSTD_adjustCParams(ZSTD_compressionParameters cPar, unsigned long long srcSize, size_t dictSize); + +/*! ZSTD_compress_advanced() : + * Note : this function is now DEPRECATED. + * It can be replaced by ZSTD_compress2(), in combination with ZSTD_CCtx_setParameter() and other parameter setters. + * This prototype will be marked as deprecated and generate compilation warning on reaching v1.5.x */ +ZSTDLIB_API size_t ZSTD_compress_advanced(ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + const void* dict,size_t dictSize, + ZSTD_parameters params); + +/*! ZSTD_compress_usingCDict_advanced() : + * Note : this function is now REDUNDANT. + * It can be replaced by ZSTD_compress2(), in combination with ZSTD_CCtx_loadDictionary() and other parameter setters. + * This prototype will be marked as deprecated and generate compilation warning in some future version */ +ZSTDLIB_API size_t ZSTD_compress_usingCDict_advanced(ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + const ZSTD_CDict* cdict, + ZSTD_frameParameters fParams); + + +/*! 
ZSTD_CCtx_loadDictionary_byReference() : + * Same as ZSTD_CCtx_loadDictionary(), but dictionary content is referenced, instead of being copied into CCtx. + * It saves some memory, but also requires that `dict` outlives its usage within `cctx` */ +ZSTDLIB_API size_t ZSTD_CCtx_loadDictionary_byReference(ZSTD_CCtx* cctx, const void* dict, size_t dictSize); + +/*! ZSTD_CCtx_loadDictionary_advanced() : + * Same as ZSTD_CCtx_loadDictionary(), but gives finer control over + * how to load the dictionary (by copy ? by reference ?) + * and how to interpret it (automatic ? force raw mode ? full mode only ?) */ +ZSTDLIB_API size_t ZSTD_CCtx_loadDictionary_advanced(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, ZSTD_dictLoadMethod_e dictLoadMethod, ZSTD_dictContentType_e dictContentType); + +/*! ZSTD_CCtx_refPrefix_advanced() : + * Same as ZSTD_CCtx_refPrefix(), but gives finer control over + * how to interpret prefix content (automatic ? force raw mode (default) ? full mode only ?) */ +ZSTDLIB_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const void* prefix, size_t prefixSize, ZSTD_dictContentType_e dictContentType); + +/* === experimental parameters === */ +/* these parameters can be used with ZSTD_setParameter() + * they are not guaranteed to remain supported in the future */ + + /* Enables rsyncable mode, + * which makes compressed files more rsync friendly + * by adding periodic synchronization points to the compressed data. + * The target average block size is ZSTD_c_jobSize / 2. + * It's possible to modify the job size to increase or decrease + * the granularity of the synchronization point. + * Once the jobSize is smaller than the window size, + * it will result in compression ratio degradation. + * NOTE 1: rsyncable mode only works when multithreading is enabled. + * NOTE 2: rsyncable performs poorly in combination with long range mode, + * since it will decrease the effectiveness of synchronization points, + * though mileage may vary. 
+ * NOTE 3: Rsyncable mode limits maximum compression speed to ~400 MB/s. + * If the selected compression level is already running significantly slower, + * the overall speed won't be significantly impacted. + */ + #define ZSTD_c_rsyncable ZSTD_c_experimentalParam1 + +/* Select a compression format. + * The value must be of type ZSTD_format_e. + * See ZSTD_format_e enum definition for details */ +#define ZSTD_c_format ZSTD_c_experimentalParam2 + +/* Force back-reference distances to remain < windowSize, + * even when referencing into Dictionary content (default:0) */ +#define ZSTD_c_forceMaxWindow ZSTD_c_experimentalParam3 + +/* Controls whether the contents of a CDict + * are used in place, or copied into the working context. + * Accepts values from the ZSTD_dictAttachPref_e enum. + * See the comments on that enum for an explanation of the feature. */ +#define ZSTD_c_forceAttachDict ZSTD_c_experimentalParam4 + +/* Controls how the literals are compressed (default is auto). + * The value must be of type ZSTD_literalCompressionMode_e. + * See ZSTD_literalCompressionMode_t enum definition for details. + */ +#define ZSTD_c_literalCompressionMode ZSTD_c_experimentalParam5 + +/* Tries to fit compressed block size to be around targetCBlockSize. + * No target when targetCBlockSize == 0. + * There is no guarantee on compressed block size (default:0) */ +#define ZSTD_c_targetCBlockSize ZSTD_c_experimentalParam6 + +/* User's best guess of source size. + * Hint is not valid when srcSizeHint == 0. + * There is no guarantee that hint is close to actual source size, + * but compression ratio may regress significantly if guess considerably underestimates */ +#define ZSTD_c_srcSizeHint ZSTD_c_experimentalParam7 + +/* Controls whether the new and experimental "dedicated dictionary search + * structure" can be used. This feature is still rough around the edges, be + * prepared for surprising behavior! 
+ * + * How to use it: + * + * When using a CDict, whether to use this feature or not is controlled at + * CDict creation, and it must be set in a CCtxParams set passed into that + * construction (via ZSTD_createCDict_advanced2()). A compression will then + * use the feature or not based on how the CDict was constructed; the value of + * this param, set in the CCtx, will have no effect. + * + * However, when a dictionary buffer is passed into a CCtx, such as via + * ZSTD_CCtx_loadDictionary(), this param can be set on the CCtx to control + * whether the CDict that is created internally can use the feature or not. + * + * What it does: + * + * Normally, the internal data structures of the CDict are analogous to what + * would be stored in a CCtx after compressing the contents of a dictionary. + * To an approximation, a compression using a dictionary can then use those + * data structures to simply continue what is effectively a streaming + * compression where the simulated compression of the dictionary left off. + * Which is to say, the search structures in the CDict are normally the same + * format as in the CCtx. + * + * It is possible to do better, since the CDict is not like a CCtx: the search + * structures are written once during CDict creation, and then are only read + * after that, while the search structures in the CCtx are both read and + * written as the compression goes along. This means we can choose a search + * structure for the dictionary that is read-optimized. + * + * This feature enables the use of that different structure. + * + * Note that some of the members of the ZSTD_compressionParameters struct have + * different semantics and constraints in the dedicated search structure. It is + * highly recommended that you simply set a compression level in the CCtxParams + * you pass into the CDict creation call, and avoid messing with the cParams + * directly. 
+ * + * Effects: + * + * This will only have any effect when the selected ZSTD_strategy + * implementation supports this feature. Currently, that's limited to + * ZSTD_greedy, ZSTD_lazy, and ZSTD_lazy2. + * + * Note that this means that the CDict tables can no longer be copied into the + * CCtx, so the dict attachment mode ZSTD_dictForceCopy will no longer be + * useable. The dictionary can only be attached or reloaded. + * + * In general, you should expect compression to be faster--sometimes very much + * so--and CDict creation to be slightly slower. Eventually, we will probably + * make this mode the default. + */ +#define ZSTD_c_enableDedicatedDictSearch ZSTD_c_experimentalParam8 + +/* ZSTD_c_stableInBuffer + * Experimental parameter. + * Default is 0 == disabled. Set to 1 to enable. + * + * Tells the compressor that the ZSTD_inBuffer will ALWAYS be the same + * between calls, except for the modifications that zstd makes to pos (the + * caller must not modify pos). This is checked by the compressor, and + * compression will fail if it ever changes. This means the only flush + * mode that makes sense is ZSTD_e_end, so zstd will error if ZSTD_e_end + * is not used. The data in the ZSTD_inBuffer in the range [src, src + pos) + * MUST not be modified during compression or you will get data corruption. + * + * When this flag is enabled zstd won't allocate an input window buffer, + * because the user guarantees it can reference the ZSTD_inBuffer until + * the frame is complete. But, it will still allocate an output buffer + * large enough to fit a block (see ZSTD_c_stableOutBuffer). This will also + * avoid the memcpy() from the input buffer to the input window buffer. + * + * NOTE: ZSTD_compressStream2() will error if ZSTD_e_end is not used. + * That means this flag cannot be used with ZSTD_compressStream(). 
+ *
+ * NOTE: So long as the ZSTD_inBuffer always points to valid memory, using
+ * this flag is ALWAYS memory safe, and will never access out-of-bounds
+ * memory. However, compression WILL fail if you violate the preconditions.
+ *
+ * WARNING: The data in the ZSTD_inBuffer in the range [dst, dst + pos) MUST
+ * not be modified during compression or you will get data corruption. This
+ * is because zstd needs to reference data in the ZSTD_inBuffer to find
+ * matches. Normally zstd maintains its own window buffer for this purpose,
+ * but passing this flag tells zstd to use the user provided buffer.
+ */
+#define ZSTD_c_stableInBuffer ZSTD_c_experimentalParam9
+
+/* ZSTD_c_stableOutBuffer
+ * Experimental parameter.
+ * Default is 0 == disabled. Set to 1 to enable.
+ *
+ * Tells the compressor that the ZSTD_outBuffer will not be resized between
+ * calls. Specifically: (out.size - out.pos) will never grow. This gives the
+ * compressor the freedom to say: If the compressed data doesn't fit in the
+ * output buffer then return ZSTD_error_dstSizeTooSmall. This allows us to
+ * always decompress directly into the output buffer, instead of decompressing
+ * into an internal buffer and copying to the output buffer.
+ *
+ * When this flag is enabled zstd won't allocate an output buffer, because
+ * it can write directly to the ZSTD_outBuffer. It will still allocate the
+ * input window buffer (see ZSTD_c_stableInBuffer). This will also
+ * avoid the memcpy() from the input buffer to the input window buffer.
+ *
+ * Zstd will check that (out.size - out.pos) never grows and return an error
+ * if it does. While not strictly necessary, this should prevent surprises.
+ */
+#define ZSTD_c_stableOutBuffer ZSTD_c_experimentalParam10
+
+/* ZSTD_c_blockDelimiters
+ * Default is 0 == ZSTD_sf_noBlockDelimiters.
+ *
+ * For use with sequence compression API: ZSTD_compressSequences().
+ *
+ * Designates whether or not the given array of ZSTD_Sequence contains block delimiters
+ * and last literals, which are defined as sequences with offset == 0 and matchLength == 0.
+ * See the definition of ZSTD_Sequence for more specifics.
+ */
+#define ZSTD_c_blockDelimiters ZSTD_c_experimentalParam11
+
+/* ZSTD_c_validateSequences
+ * Default is 0 == disabled. Set to 1 to enable sequence validation.
+ *
+ * For use with sequence compression API: ZSTD_compressSequences().
+ * Designates whether or not we validate sequences provided to ZSTD_compressSequences()
+ * during function execution.
+ *
+ * Without validation, providing a sequence that does not conform to the zstd spec will cause
+ * undefined behavior, and may produce a corrupted block.
+ *
+ * With validation enabled, if a sequence is invalid (see doc/zstd_compression_format.md for
+ * specifics regarding offset/matchlength requirements) then the function will bail out and
+ * return an error.
+ *
+ */
+#define ZSTD_c_validateSequences ZSTD_c_experimentalParam12
+
+/*! ZSTD_CCtx_getParameter() :
+ * Get the requested compression parameter value, selected by enum ZSTD_cParameter,
+ * and store it into int* value.
+ * @return : 0, or an error code (which can be tested with ZSTD_isError()).
+ */
+ZSTDLIB_API size_t ZSTD_CCtx_getParameter(const ZSTD_CCtx* cctx, ZSTD_cParameter param, int* value);
+
+
+/*! ZSTD_CCtx_params :
+ * Quick howto :
+ * - ZSTD_createCCtxParams() : Create a ZSTD_CCtx_params structure
+ * - ZSTD_CCtxParams_setParameter() : Push parameters one by one into
+ * an existing ZSTD_CCtx_params structure.
+ * This is similar to
+ * ZSTD_CCtx_setParameter().
+ * - ZSTD_CCtx_setParametersUsingCCtxParams() : Apply parameters to
+ * an existing CCtx.
+ * These parameters will be applied to
+ * all subsequent frames.
+ * - ZSTD_compressStream2() : Do compression using the CCtx.
+ * - ZSTD_freeCCtxParams() : Free the memory.
+ *
+ * This can be used with ZSTD_estimateCCtxSize_advanced_usingCCtxParams()
+ * for static allocation of CCtx for single-threaded compression.
+ */ +ZSTDLIB_API ZSTD_CCtx_params* ZSTD_createCCtxParams(void); +ZSTDLIB_API size_t ZSTD_freeCCtxParams(ZSTD_CCtx_params* params); + +/*! ZSTD_CCtxParams_reset() : + * Reset params to default values. + */ +ZSTDLIB_API size_t ZSTD_CCtxParams_reset(ZSTD_CCtx_params* params); + +/*! ZSTD_CCtxParams_init() : + * Initializes the compression parameters of cctxParams according to + * compression level. All other parameters are reset to their default values. + */ +ZSTDLIB_API size_t ZSTD_CCtxParams_init(ZSTD_CCtx_params* cctxParams, int compressionLevel); + +/*! ZSTD_CCtxParams_init_advanced() : + * Initializes the compression and frame parameters of cctxParams according to + * params. All other parameters are reset to their default values. + */ +ZSTDLIB_API size_t ZSTD_CCtxParams_init_advanced(ZSTD_CCtx_params* cctxParams, ZSTD_parameters params); + +/*! ZSTD_CCtxParams_setParameter() : + * Similar to ZSTD_CCtx_setParameter. + * Set one compression parameter, selected by enum ZSTD_cParameter. + * Parameters must be applied to a ZSTD_CCtx using + * ZSTD_CCtx_setParametersUsingCCtxParams(). + * @result : a code representing success or failure (which can be tested with + * ZSTD_isError()). + */ +ZSTDLIB_API size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* params, ZSTD_cParameter param, int value); + +/*! ZSTD_CCtxParams_getParameter() : + * Similar to ZSTD_CCtx_getParameter. + * Get the requested value of one compression parameter, selected by enum ZSTD_cParameter. + * @result : 0, or an error code (which can be tested with ZSTD_isError()). + */ +ZSTDLIB_API size_t ZSTD_CCtxParams_getParameter(const ZSTD_CCtx_params* params, ZSTD_cParameter param, int* value); + +/*! ZSTD_CCtx_setParametersUsingCCtxParams() : + * Apply a set of ZSTD_CCtx_params to the compression context. + * This can be done even after compression is started, + * if nbWorkers==0, this will have no impact until a new compression is started. 
+ * if nbWorkers>=1, new parameters will be picked up at next job, + * with a few restrictions (windowLog, pledgedSrcSize, nbWorkers, jobSize, and overlapLog are not updated). + */ +ZSTDLIB_API size_t ZSTD_CCtx_setParametersUsingCCtxParams( + ZSTD_CCtx* cctx, const ZSTD_CCtx_params* params); + +/*! ZSTD_compressStream2_simpleArgs() : + * Same as ZSTD_compressStream2(), + * but using only integral types as arguments. + * This variant might be helpful for binders from dynamic languages + * which have troubles handling structures containing memory pointers. + */ +ZSTDLIB_API size_t ZSTD_compressStream2_simpleArgs ( + ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, size_t* dstPos, + const void* src, size_t srcSize, size_t* srcPos, + ZSTD_EndDirective endOp); + + +/*************************************** +* Advanced decompression functions +***************************************/ + +/*! ZSTD_isFrame() : + * Tells if the content of `buffer` starts with a valid Frame Identifier. + * Note : Frame Identifier is 4 bytes. If `size < 4`, @return will always be 0. + * Note 2 : Legacy Frame Identifiers are considered valid only if Legacy Support is enabled. + * Note 3 : Skippable Frame Identifiers are considered valid. */ +ZSTDLIB_API unsigned ZSTD_isFrame(const void* buffer, size_t size); + +/*! ZSTD_createDDict_byReference() : + * Create a digested dictionary, ready to start decompression operation without startup delay. + * Dictionary content is referenced, and therefore stays in dictBuffer. + * It is important that dictBuffer outlives DDict, + * it must remain read accessible throughout the lifetime of DDict */ +ZSTDLIB_API ZSTD_DDict* ZSTD_createDDict_byReference(const void* dictBuffer, size_t dictSize); + +/*! ZSTD_DCtx_loadDictionary_byReference() : + * Same as ZSTD_DCtx_loadDictionary(), + * but references `dict` content instead of copying it into `dctx`. 
+ * This saves memory if `dict` remains around.
+ * However, it's imperative that `dict` remains accessible (and unmodified) while being used, so it must outlive decompression. */
+ZSTDLIB_API size_t ZSTD_DCtx_loadDictionary_byReference(ZSTD_DCtx* dctx, const void* dict, size_t dictSize);
+
+/*! ZSTD_DCtx_loadDictionary_advanced() :
+ * Same as ZSTD_DCtx_loadDictionary(),
+ * but gives direct control over
+ * how to load the dictionary (by copy ? by reference ?)
+ * and how to interpret it (automatic ? force raw mode ? full mode only ?). */
+ZSTDLIB_API size_t ZSTD_DCtx_loadDictionary_advanced(ZSTD_DCtx* dctx, const void* dict, size_t dictSize, ZSTD_dictLoadMethod_e dictLoadMethod, ZSTD_dictContentType_e dictContentType);
+
+/*! ZSTD_DCtx_refPrefix_advanced() :
+ * Same as ZSTD_DCtx_refPrefix(), but gives finer control over
+ * how to interpret prefix content (automatic ? force raw mode (default) ? full mode only ?) */
+ZSTDLIB_API size_t ZSTD_DCtx_refPrefix_advanced(ZSTD_DCtx* dctx, const void* prefix, size_t prefixSize, ZSTD_dictContentType_e dictContentType);
+
+/*! ZSTD_DCtx_setMaxWindowSize() :
+ * Refuses allocating internal buffers for frames requiring a window size larger than provided limit.
+ * This protects a decoder context from reserving too much memory for itself (potential attack scenario).
+ * This parameter is only useful in streaming mode, since no internal buffer is allocated in single-pass mode.
+ * By default, a decompression context accepts all window sizes <= (1 << ZSTD_WINDOWLOG_LIMIT_DEFAULT)
+ * @return : 0, or an error code (which can be tested using ZSTD_isError()).
+ */
+ZSTDLIB_API size_t ZSTD_DCtx_setMaxWindowSize(ZSTD_DCtx* dctx, size_t maxWindowSize);
+
+/*! ZSTD_DCtx_getParameter() :
+ * Get the requested decompression parameter value, selected by enum ZSTD_dParameter,
+ * and store it into int* value.
+ * @return : 0, or an error code (which can be tested with ZSTD_isError()).
+ */
+ZSTDLIB_API size_t ZSTD_DCtx_getParameter(ZSTD_DCtx* dctx, ZSTD_dParameter param, int* value);
+
+/* ZSTD_d_format
+ * experimental parameter,
+ * allowing selection between ZSTD_format_e input compression formats
+ */
+#define ZSTD_d_format ZSTD_d_experimentalParam1
+/* ZSTD_d_stableOutBuffer
+ * Experimental parameter.
+ * Default is 0 == disabled. Set to 1 to enable.
+ *
+ * Tells the decompressor that the ZSTD_outBuffer will ALWAYS be the same
+ * between calls, except for the modifications that zstd makes to pos (the
+ * caller must not modify pos). This is checked by the decompressor, and
+ * decompression will fail if it ever changes. Therefore the ZSTD_outBuffer
+ * MUST be large enough to fit the entire decompressed frame. This will be
+ * checked when the frame content size is known. The data in the ZSTD_outBuffer
+ * in the range [dst, dst + pos) MUST not be modified during decompression
+ * or you will get data corruption.
+ *
+ * When this flag is enabled zstd won't allocate an output buffer, because
+ * it can write directly to the ZSTD_outBuffer, but it will still allocate
+ * an input buffer large enough to fit any compressed block. This will also
+ * avoid the memcpy() from the internal output buffer to the ZSTD_outBuffer.
+ * If you need to avoid the input buffer allocation use the buffer-less
+ * streaming API.
+ *
+ * NOTE: So long as the ZSTD_outBuffer always points to valid memory, using
+ * this flag is ALWAYS memory safe, and will never access out-of-bounds
+ * memory. However, decompression WILL fail if you violate the preconditions.
+ *
+ * WARNING: The data in the ZSTD_outBuffer in the range [dst, dst + pos) MUST
+ * not be modified during decompression or you will get data corruption. This
+ * is because zstd needs to reference data in the ZSTD_outBuffer to regenerate
+ * matches. Normally zstd maintains its own buffer for this purpose, but passing
+ * this flag tells zstd to use the user provided buffer.
+ */
+#define ZSTD_d_stableOutBuffer ZSTD_d_experimentalParam2
+
+/* ZSTD_d_forceIgnoreChecksum
+ * Experimental parameter.
+ * Default is 0 == disabled. Set to 1 to enable
+ *
+ * Tells the decompressor to skip checksum validation during decompression, regardless
+ * of whether checksumming was specified during compression. This offers some
+ * slight performance benefits, and may be useful for debugging.
+ * Param has values of type ZSTD_forceIgnoreChecksum_e
+ */
+#define ZSTD_d_forceIgnoreChecksum ZSTD_d_experimentalParam3
+
+/* ZSTD_d_refMultipleDDicts
+ * Experimental parameter.
+ * Default is 0 == disabled. Set to 1 to enable
+ *
+ * If enabled and dctx is allocated on the heap, then additional memory will be allocated
+ * to store references to multiple ZSTD_DDict. That is, multiple calls of ZSTD_refDDict()
+ * using a given ZSTD_DCtx, rather than overwriting the previous DDict reference, will instead
+ * store all references. At decompression time, the appropriate dictID is selected
+ * from the set of DDicts based on the dictID in the frame.
+ *
+ * Usage is simply calling ZSTD_refDDict() on multiple dict buffers.
+ *
+ * Param has values of byte ZSTD_refMultipleDDicts_e
+ *
+ * WARNING: Enabling this parameter and calling ZSTD_DCtx_refDDict(), will trigger memory
+ * allocation for the hash table. ZSTD_freeDCtx() also frees this memory.
+ * Memory is allocated as per ZSTD_DCtx::customMem.
+ *
+ * Although this function allocates memory for the table, the user is still responsible for
+ * memory management of the underlying ZSTD_DDict* themselves.
+ */
+#define ZSTD_d_refMultipleDDicts ZSTD_d_experimentalParam4
+
+
+/*! ZSTD_DCtx_setFormat() :
+ * Instruct the decoder context about what kind of data to decode next.
+ * This instruction is mandatory to decode data without a fully-formed header,
+ * such as ZSTD_f_zstd1_magicless for example.
+ * @return : 0, or an error code (which can be tested using ZSTD_isError()).
*/ +ZSTDLIB_API size_t ZSTD_DCtx_setFormat(ZSTD_DCtx* dctx, ZSTD_format_e format); + +/*! ZSTD_decompressStream_simpleArgs() : + * Same as ZSTD_decompressStream(), + * but using only integral types as arguments. + * This can be helpful for binders from dynamic languages + * which have troubles handling structures containing memory pointers. + */ +ZSTDLIB_API size_t ZSTD_decompressStream_simpleArgs ( + ZSTD_DCtx* dctx, + void* dst, size_t dstCapacity, size_t* dstPos, + const void* src, size_t srcSize, size_t* srcPos); + + +/******************************************************************** +* Advanced streaming functions +* Warning : most of these functions are now redundant with the Advanced API. +* Once Advanced API reaches "stable" status, +* redundant functions will be deprecated, and then at some point removed. +********************************************************************/ + +/*===== Advanced Streaming compression functions =====*/ + +/*! ZSTD_initCStream_srcSize() : + * This function is deprecated, and equivalent to: + * ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only); + * ZSTD_CCtx_refCDict(zcs, NULL); // clear the dictionary (if any) + * ZSTD_CCtx_setParameter(zcs, ZSTD_c_compressionLevel, compressionLevel); + * ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize); + * + * pledgedSrcSize must be correct. If it is not known at init time, use + * ZSTD_CONTENTSIZE_UNKNOWN. Note that, for compatibility with older programs, + * "0" also disables frame content size field. It may be enabled in the future. + * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x + */ +ZSTDLIB_API size_t +ZSTD_initCStream_srcSize(ZSTD_CStream* zcs, + int compressionLevel, + unsigned long long pledgedSrcSize); + +/*! 
ZSTD_initCStream_usingDict() : + * This function is deprecated, and is equivalent to: + * ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only); + * ZSTD_CCtx_setParameter(zcs, ZSTD_c_compressionLevel, compressionLevel); + * ZSTD_CCtx_loadDictionary(zcs, dict, dictSize); + * + * Creates of an internal CDict (incompatible with static CCtx), except if + * dict == NULL or dictSize < 8, in which case no dict is used. + * Note: dict is loaded with ZSTD_dct_auto (treated as a full zstd dictionary if + * it begins with ZSTD_MAGIC_DICTIONARY, else as raw content) and ZSTD_dlm_byCopy. + * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x + */ +ZSTDLIB_API size_t +ZSTD_initCStream_usingDict(ZSTD_CStream* zcs, + const void* dict, size_t dictSize, + int compressionLevel); + +/*! ZSTD_initCStream_advanced() : + * This function is deprecated, and is approximately equivalent to: + * ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only); + * // Pseudocode: Set each zstd parameter and leave the rest as-is. + * for ((param, value) : params) { + * ZSTD_CCtx_setParameter(zcs, param, value); + * } + * ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize); + * ZSTD_CCtx_loadDictionary(zcs, dict, dictSize); + * + * dict is loaded with ZSTD_dct_auto and ZSTD_dlm_byCopy. + * pledgedSrcSize must be correct. + * If srcSize is not known at init time, use value ZSTD_CONTENTSIZE_UNKNOWN. + * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x + */ +ZSTDLIB_API size_t +ZSTD_initCStream_advanced(ZSTD_CStream* zcs, + const void* dict, size_t dictSize, + ZSTD_parameters params, + unsigned long long pledgedSrcSize); + +/*! 
ZSTD_initCStream_usingCDict() : + * This function is deprecated, and equivalent to: + * ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only); + * ZSTD_CCtx_refCDict(zcs, cdict); + * + * note : cdict will just be referenced, and must outlive compression session + * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x + */ +ZSTDLIB_API size_t ZSTD_initCStream_usingCDict(ZSTD_CStream* zcs, const ZSTD_CDict* cdict); + +/*! ZSTD_initCStream_usingCDict_advanced() : + * This function is DEPRECATED, and is approximately equivalent to: + * ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only); + * // Pseudocode: Set each zstd frame parameter and leave the rest as-is. + * for ((fParam, value) : fParams) { + * ZSTD_CCtx_setParameter(zcs, fParam, value); + * } + * ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize); + * ZSTD_CCtx_refCDict(zcs, cdict); + * + * same as ZSTD_initCStream_usingCDict(), with control over frame parameters. + * pledgedSrcSize must be correct. If srcSize is not known at init time, use + * value ZSTD_CONTENTSIZE_UNKNOWN. + * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x + */ +ZSTDLIB_API size_t +ZSTD_initCStream_usingCDict_advanced(ZSTD_CStream* zcs, + const ZSTD_CDict* cdict, + ZSTD_frameParameters fParams, + unsigned long long pledgedSrcSize); + +/*! ZSTD_resetCStream() : + * This function is deprecated, and is equivalent to: + * ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only); + * ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize); + * + * start a new frame, using same parameters from previous frame. + * This is typically useful to skip dictionary loading stage, since it will re-use it in-place. + * Note that zcs must be init at least once before using ZSTD_resetCStream(). + * If pledgedSrcSize is not known at reset time, use macro ZSTD_CONTENTSIZE_UNKNOWN. 
+ * If pledgedSrcSize > 0, its value must be correct, as it will be written in header, and controlled at the end. + * For the time being, pledgedSrcSize==0 is interpreted as "srcSize unknown" for compatibility with older programs, + * but it will change to mean "empty" in future version, so use macro ZSTD_CONTENTSIZE_UNKNOWN instead. + * @return : 0, or an error code (which can be tested using ZSTD_isError()) + * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x + */ +ZSTDLIB_API size_t ZSTD_resetCStream(ZSTD_CStream* zcs, unsigned long long pledgedSrcSize); + + +typedef struct { + unsigned long long ingested; /* nb input bytes read and buffered */ + unsigned long long consumed; /* nb input bytes actually compressed */ + unsigned long long produced; /* nb of compressed bytes generated and buffered */ + unsigned long long flushed; /* nb of compressed bytes flushed : not provided; can be tracked from caller side */ + unsigned currentJobID; /* MT only : latest started job nb */ + unsigned nbActiveWorkers; /* MT only : nb of workers actively compressing at probe time */ +} ZSTD_frameProgression; + +/* ZSTD_getFrameProgression() : + * tells how much data has been ingested (read from input) + * consumed (input actually compressed) and produced (output) for current frame. + * Note : (ingested - consumed) is amount of input data buffered internally, not yet compressed. + * Aggregates progression inside active worker threads. + */ +ZSTDLIB_API ZSTD_frameProgression ZSTD_getFrameProgression(const ZSTD_CCtx* cctx); + +/*! ZSTD_toFlushNow() : + * Tell how many bytes are ready to be flushed immediately. + * Useful for multithreading scenarios (nbWorkers >= 1). + * Probe the oldest active job, defined as oldest job not yet entirely flushed, + * and check its output buffer. + * @return : amount of data stored in oldest job and ready to be flushed immediately. 
+ * if @return == 0, it means either : + * + there is no active job (could be checked with ZSTD_frameProgression()), or + * + oldest job is still actively compressing data, + * but everything it has produced has also been flushed so far, + * therefore flush speed is limited by production speed of oldest job + * irrespective of the speed of concurrent (and newer) jobs. + */ +ZSTDLIB_API size_t ZSTD_toFlushNow(ZSTD_CCtx* cctx); + + +/*===== Advanced Streaming decompression functions =====*/ + +/*! + * This function is deprecated, and is equivalent to: + * + * ZSTD_DCtx_reset(zds, ZSTD_reset_session_only); + * ZSTD_DCtx_loadDictionary(zds, dict, dictSize); + * + * note: no dictionary will be used if dict == NULL or dictSize < 8 + * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x + */ +ZSTDLIB_API size_t ZSTD_initDStream_usingDict(ZSTD_DStream* zds, const void* dict, size_t dictSize); + +/*! + * This function is deprecated, and is equivalent to: + * + * ZSTD_DCtx_reset(zds, ZSTD_reset_session_only); + * ZSTD_DCtx_refDDict(zds, ddict); + * + * note : ddict is referenced, it must outlive decompression session + * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x + */ +ZSTDLIB_API size_t ZSTD_initDStream_usingDDict(ZSTD_DStream* zds, const ZSTD_DDict* ddict); + +/*! 
+ * This function is deprecated, and is equivalent to: + * + * ZSTD_DCtx_reset(zds, ZSTD_reset_session_only); + * + * re-use decompression parameters from previous init; saves dictionary loading + * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x + */ +ZSTDLIB_API size_t ZSTD_resetDStream(ZSTD_DStream* zds); + + +/********************************************************************* +* Buffer-less and synchronous inner streaming functions +* +* This is an advanced API, giving full control over buffer management, for users which need direct control over memory. +* But it's also a complex one, with several restrictions, documented below. +* Prefer normal streaming API for an easier experience. +********************************************************************* */ + +/** + Buffer-less streaming compression (synchronous mode) + + A ZSTD_CCtx object is required to track streaming operations. + Use ZSTD_createCCtx() / ZSTD_freeCCtx() to manage resource. + ZSTD_CCtx object can be re-used multiple times within successive compression operations. + + Start by initializing a context. + Use ZSTD_compressBegin(), or ZSTD_compressBegin_usingDict() for dictionary compression, + or ZSTD_compressBegin_advanced(), for finer parameter control. + It's also possible to duplicate a reference context which has already been initialized, using ZSTD_copyCCtx() + + Then, consume your input using ZSTD_compressContinue(). + There are some important considerations to keep in mind when using this advanced function : + - ZSTD_compressContinue() has no internal buffer. It uses externally provided buffers only. + - Interface is synchronous : input is consumed entirely and produces 1+ compressed blocks. + - Caller must ensure there is enough space in `dst` to store compressed data under worst case scenario. + Worst case evaluation is provided by ZSTD_compressBound(). 
+ ZSTD_compressContinue() doesn't guarantee recover after a failed compression. + - ZSTD_compressContinue() presumes prior input ***is still accessible and unmodified*** (up to maximum distance size, see WindowLog). + It remembers all previous contiguous blocks, plus one separated memory segment (which can itself consists of multiple contiguous blocks) + - ZSTD_compressContinue() detects that prior input has been overwritten when `src` buffer overlaps. + In which case, it will "discard" the relevant memory section from its history. + + Finish a frame with ZSTD_compressEnd(), which will write the last block(s) and optional checksum. + It's possible to use srcSize==0, in which case, it will write a final empty block to end the frame. + Without last block mark, frames are considered unfinished (hence corrupted) by compliant decoders. + + `ZSTD_CCtx` object can be re-used (ZSTD_compressBegin()) to compress again. +*/ + +/*===== Buffer-less streaming compression functions =====*/ +ZSTDLIB_API size_t ZSTD_compressBegin(ZSTD_CCtx* cctx, int compressionLevel); +ZSTDLIB_API size_t ZSTD_compressBegin_usingDict(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, int compressionLevel); +ZSTDLIB_API size_t ZSTD_compressBegin_advanced(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, ZSTD_parameters params, unsigned long long pledgedSrcSize); /**< pledgedSrcSize : If srcSize is not known at init time, use ZSTD_CONTENTSIZE_UNKNOWN */ +ZSTDLIB_API size_t ZSTD_compressBegin_usingCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict); /**< note: fails if cdict==NULL */ +ZSTDLIB_API size_t ZSTD_compressBegin_usingCDict_advanced(ZSTD_CCtx* const cctx, const ZSTD_CDict* const cdict, ZSTD_frameParameters const fParams, unsigned long long const pledgedSrcSize); /* compression parameters are already set within cdict. pledgedSrcSize must be correct. 
If srcSize is not known, use macro ZSTD_CONTENTSIZE_UNKNOWN */ +ZSTDLIB_API size_t ZSTD_copyCCtx(ZSTD_CCtx* cctx, const ZSTD_CCtx* preparedCCtx, unsigned long long pledgedSrcSize); /**< note: if pledgedSrcSize is not known, use ZSTD_CONTENTSIZE_UNKNOWN */ + +ZSTDLIB_API size_t ZSTD_compressContinue(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); +ZSTDLIB_API size_t ZSTD_compressEnd(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); + + +/** + Buffer-less streaming decompression (synchronous mode) + + A ZSTD_DCtx object is required to track streaming operations. + Use ZSTD_createDCtx() / ZSTD_freeDCtx() to manage it. + A ZSTD_DCtx object can be re-used multiple times. + + First typical operation is to retrieve frame parameters, using ZSTD_getFrameHeader(). + Frame header is extracted from the beginning of compressed frame, so providing only the frame's beginning is enough. + Data fragment must be large enough to ensure successful decoding. + `ZSTD_frameHeaderSize_max` bytes is guaranteed to always be large enough. + @result : 0 : successful decoding, the `ZSTD_frameHeader` structure is correctly filled. + >0 : `srcSize` is too small, please provide at least @result bytes on next attempt. + errorCode, which can be tested using ZSTD_isError(). + + It fills a ZSTD_frameHeader structure with important information to correctly decode the frame, + such as the dictionary ID, content size, or maximum back-reference distance (`windowSize`). + Note that these values could be wrong, either because of data corruption, or because a 3rd party deliberately spoofs false information. + As a consequence, check that values remain within valid application range. + For example, do not allocate memory blindly, check that `windowSize` is within expectation. + Each application can set its own limits, depending on local restrictions. + For extended interoperability, it is recommended to support `windowSize` of at least 8 MB. 
+ + ZSTD_decompressContinue() needs previous data blocks during decompression, up to `windowSize` bytes. + ZSTD_decompressContinue() is very sensitive to contiguity, + if 2 blocks don't follow each other, make sure that either the compressor breaks contiguity at the same place, + or that previous contiguous segment is large enough to properly handle maximum back-reference distance. + There are multiple ways to guarantee this condition. + + The most memory efficient way is to use a round buffer of sufficient size. + Sufficient size is determined by invoking ZSTD_decodingBufferSize_min(), + which can @return an error code if required value is too large for current system (in 32-bits mode). + In a round buffer methodology, ZSTD_decompressContinue() decompresses each block next to previous one, + up to the moment there is not enough room left in the buffer to guarantee decoding another full block, + whose maximum size is provided in `ZSTD_frameHeader` structure, field `blockSizeMax`. + At which point, decoding can resume from the beginning of the buffer. + Note that already decoded data stored in the buffer should be flushed before being overwritten. + + There are alternatives possible, for example using two or more buffers of size `windowSize` each, though they consume more memory. + + Finally, if you control the compression process, you can also ignore all buffer size rules, + as long as the encoder and decoder progress in "lock-step", + aka use exactly the same buffer sizes, break contiguity at the same place, etc. + + Once buffers are set up, start decompression, with ZSTD_decompressBegin(). + If decompression requires a dictionary, use ZSTD_decompressBegin_usingDict() or ZSTD_decompressBegin_usingDDict(). + + Then use ZSTD_nextSrcSizeToDecompress() and ZSTD_decompressContinue() alternately. + ZSTD_nextSrcSizeToDecompress() tells how many bytes to provide as 'srcSize' to ZSTD_decompressContinue(). 
+ ZSTD_decompressContinue() requires this _exact_ amount of bytes, or it will fail. + + @result of ZSTD_decompressContinue() is the number of bytes regenerated within 'dst' (necessarily <= dstCapacity). + It can be zero : it just means ZSTD_decompressContinue() has decoded some metadata item. + It can also be an error code, which can be tested with ZSTD_isError(). + + A frame is fully decoded when ZSTD_nextSrcSizeToDecompress() returns zero. + Context can then be reset to start a new decompression. + + Note : it's possible to know if next input to present is a header or a block, using ZSTD_nextInputType(). + This information is not required to properly decode a frame. + + == Special case : skippable frames == + + Skippable frames allow integration of user-defined data into a flow of concatenated frames. + Skippable frames will be ignored (skipped) by decompressor. + The format of skippable frames is as follows : + a) Skippable frame ID - 4 Bytes, Little endian format, any value from 0x184D2A50 to 0x184D2A5F + b) Frame Size - 4 Bytes, Little endian format, unsigned 32-bits + c) Frame Content - any content (User Data) of length equal to Frame Size + For skippable frames ZSTD_getFrameHeader() returns zfhPtr->frameType==ZSTD_skippableFrame. + For skippable frames ZSTD_decompressContinue() always returns 0 : it only skips the content. +*/ + +/*===== Buffer-less streaming decompression functions =====*/ +typedef enum { ZSTD_frame, ZSTD_skippableFrame } ZSTD_frameType_e; +typedef struct { + unsigned long long frameContentSize; /* if == ZSTD_CONTENTSIZE_UNKNOWN, it means this field is not available. 0 means "empty" */ + unsigned long long windowSize; /* can be very large, up to <= frameContentSize */ + unsigned blockSizeMax; + ZSTD_frameType_e frameType; /* if == ZSTD_skippableFrame, frameContentSize is the size of skippable content */ + unsigned headerSize; + unsigned dictID; + unsigned checksumFlag; +} ZSTD_frameHeader; + +/*! 
ZSTD_getFrameHeader() : + * decode Frame Header, or requires larger `srcSize`. + * @return : 0, `zfhPtr` is correctly filled, + * >0, `srcSize` is too small, value is wanted `srcSize` amount, + * or an error code, which can be tested using ZSTD_isError() */ +ZSTDLIB_API size_t ZSTD_getFrameHeader(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize); /**< doesn't consume input */ +/*! ZSTD_getFrameHeader_advanced() : + * same as ZSTD_getFrameHeader(), + * with added capability to select a format (like ZSTD_f_zstd1_magicless) */ +ZSTDLIB_API size_t ZSTD_getFrameHeader_advanced(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize, ZSTD_format_e format); +ZSTDLIB_API size_t ZSTD_decodingBufferSize_min(unsigned long long windowSize, unsigned long long frameContentSize); /**< when frame content size is not known, pass in frameContentSize == ZSTD_CONTENTSIZE_UNKNOWN */ + +ZSTDLIB_API size_t ZSTD_decompressBegin(ZSTD_DCtx* dctx); +ZSTDLIB_API size_t ZSTD_decompressBegin_usingDict(ZSTD_DCtx* dctx, const void* dict, size_t dictSize); +ZSTDLIB_API size_t ZSTD_decompressBegin_usingDDict(ZSTD_DCtx* dctx, const ZSTD_DDict* ddict); + +ZSTDLIB_API size_t ZSTD_nextSrcSizeToDecompress(ZSTD_DCtx* dctx); +ZSTDLIB_API size_t ZSTD_decompressContinue(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); + +/* misc */ +ZSTDLIB_API void ZSTD_copyDCtx(ZSTD_DCtx* dctx, const ZSTD_DCtx* preparedDCtx); +typedef enum { ZSTDnit_frameHeader, ZSTDnit_blockHeader, ZSTDnit_block, ZSTDnit_lastBlock, ZSTDnit_checksum, ZSTDnit_skippableFrame } ZSTD_nextInputType_e; +ZSTDLIB_API ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx* dctx); + + + + +/* ============================ */ +/** Block level API */ +/* ============================ */ + +/*! + Block functions produce and decode raw zstd blocks, without frame metadata. + Frame metadata cost is typically ~12 bytes, which can be non-negligible for very small blocks (< 100 bytes). 
+ But users will have to take in charge needed metadata to regenerate data, such as compressed and content sizes. + + A few rules to respect : + - Compressing and decompressing require a context structure + + Use ZSTD_createCCtx() and ZSTD_createDCtx() + - It is necessary to init context before starting + + compression : any ZSTD_compressBegin*() variant, including with dictionary + + decompression : any ZSTD_decompressBegin*() variant, including with dictionary + + copyCCtx() and copyDCtx() can be used too + - Block size is limited, it must be <= ZSTD_getBlockSize() <= ZSTD_BLOCKSIZE_MAX == 128 KB + + If input is larger than a block size, it's necessary to split input data into multiple blocks + + For inputs larger than a single block, consider using regular ZSTD_compress() instead. + Frame metadata is not that costly, and quickly becomes negligible as source size grows larger than a block. + - When a block is considered not compressible enough, ZSTD_compressBlock() result will be 0 (zero) ! + ===> In which case, nothing is produced into `dst` ! + + User __must__ test for such outcome and deal directly with uncompressed data + + A block cannot be declared incompressible if ZSTD_compressBlock() return value was != 0. + Doing so would mess up with statistics history, leading to potential data corruption. + + ZSTD_decompressBlock() _doesn't accept uncompressed data as input_ !! + + In case of multiple successive blocks, should some of them be uncompressed, + decoder must be informed of their existence in order to follow proper history. + Use ZSTD_insertBlock() for such a case. 
+*/ + +/*===== Raw zstd block functions =====*/ +ZSTDLIB_API size_t ZSTD_getBlockSize (const ZSTD_CCtx* cctx); +ZSTDLIB_API size_t ZSTD_compressBlock (ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); +ZSTDLIB_API size_t ZSTD_decompressBlock(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); +ZSTDLIB_API size_t ZSTD_insertBlock (ZSTD_DCtx* dctx, const void* blockStart, size_t blockSize); /**< insert uncompressed block into `dctx` history. Useful for multi-blocks decompression. */ + + +#endif /* ZSTD_H_ZSTD_STATIC_LINKING_ONLY */ + +#if defined (__cplusplus) +} +#endif