diff --git a/AdaptiveSampler.h b/AdaptiveSampler.h index e6efbc5..e2f08a6 100644 --- a/AdaptiveSampler.h +++ b/AdaptiveSampler.h @@ -73,8 +73,8 @@ template class AdaptiveSampler { } void print_header() { - printf("#%-6s %6s %8s %8s %8s %8s %8s %8s\n", "type", "size", - "min", "max", "avg", "90th", "95th", "99th"); + printf("#%-6s %6s %8s %8s %8s %8s %8s %8s %8s %8s\n", "type", "size", + "min", "max", "avg", "50th", "90th", "95th", "99th", "99.9th"); } void print_stats(const char *type, const char *size) { @@ -82,17 +82,18 @@ template class AdaptiveSampler { size_t l = samples_copy.size(); if (l == 0) { - printf("%-7s %6s %8.1f %8.1f %8.1f %8.1f %8.1f %8.1f\n", type, size, + printf("%-7s %6s %8.1f %8.1f %8.1f %8.1f %8.1f %8.1f %8.1f %8.1f\n", type, size, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0); return; } sort(samples_copy.begin(), samples_copy.end()); - printf("%-7s %6s %8.1f %8.1f %8.1f %8.1f %8.1f %8.1f\n", type, size, + printf("%-7s %6s %8.1f %8.1f% 8.1f %8.1f %8.1f %8.1f %8.1f %8.1f\n", type, size, samples_copy[0], samples_copy[l-1], average(), + samples_copy[(l*50)/100], samples_copy[(l*90)/100], samples_copy[(l*95)/100], - samples_copy[(l*99)/100]); + samples_copy[(l*99)/100], samples_copy[(l*99.9)/100]); } }; diff --git a/AgentStats.h b/AgentStats.h index 50e016b..e73bb8c 100644 --- a/AgentStats.h +++ b/AgentStats.h @@ -5,7 +5,15 @@ class AgentStats { public: uint64_t rx_bytes, tx_bytes; - uint64_t gets, sets, get_misses; + uint64_t gets, sets, accesses, get_misses; + uint64_t gets_l1, gets_l2, sets_l1, sets_l2; + uint64_t get_misses_l1, get_misses_l2; + uint64_t set_misses_l1, set_misses_l2; + uint64_t excl_wbs, incl_wbs; + uint64_t copies_to_l1; + uint64_t delete_misses_l2; + uint64_t delete_hits_l2; + uint64_t set_incl_hits_l1, set_excl_hits_l1; uint64_t skips; double start, stop; diff --git a/Connection.cc b/Connection.cc index ea02899..9232d91 100644 --- a/Connection.cc +++ b/Connection.cc @@ -1,4 +1,9 @@ #include +#include +#include +#include + +#include #include 
#include @@ -15,18 +20,135 @@ #include "mutilate.h" #include "binary_protocol.h" #include "util.h" +#include +#include +#include +#include +#include +#include "blockingconcurrentqueue.h" + +//#define DEBUGC + +using namespace moodycamel; +std::hash hashstr; + +extern ifstream kvfile; +extern pthread_mutex_t flock; +extern pthread_mutex_t *item_locks; +extern int item_lock_hashpower; + + +pthread_mutex_t cid_lock = PTHREAD_MUTEX_INITIALIZER; +uint32_t connids = 0; + +//pthread_mutex_t opaque_lock = PTHREAD_MUTEX_INITIALIZER; +//uint32_t g_opaque = 0; + +void item_lock(size_t hv, uint32_t cid) { + //char out[128]; + //sprintf(out,"conn: %u, locking %lu\n",cid,hv); + //write(2,out,strlen(out)); + pthread_mutex_lock(&item_locks[hv & hashmask(item_lock_hashpower)]); +} + +void item_unlock(size_t hv, uint32_t cid) { + //char out[128]; + //sprintf(out,"conn: %u, unlocking %lu\n",cid,hv); + //write(2,out,strlen(out)); + pthread_mutex_unlock(&item_locks[hv & hashmask(item_lock_hashpower)]); +} + +void *item_trylock(uint32_t hv, uint32_t cid) { + pthread_mutex_t *lock = &item_locks[hv & hashmask(item_lock_hashpower)]; + if (pthread_mutex_trylock(lock) == 0) { + //char out[128]; + //sprintf(out,"conn: %u, locking %u\n",cid,hv); + //write(2,out,strlen(out)); + return lock; + } + return NULL; +} + +void item_trylock_unlock(void *lock, uint32_t cid) { + //char out[128]; + //sprintf(out,"conn: %u, unlocking\n",cid); + //write(2,out,strlen(out)); + pthread_mutex_unlock((pthread_mutex_t *) lock); +} + +void Connection::output_op(Operation *op, int type, bool found) { + char output[1024]; + char a[256]; + char s[256]; + memset(a,0,256); + memset(s,0,256); + switch (type) { + case 0: //get + sprintf(a,"issue_get"); + break; + case 1: //set + sprintf(a,"issue_set"); + break; + case 2: //resp + sprintf(a,"resp"); + break; + } + switch(read_state) { + case INIT_READ: + sprintf(s,"init"); + break; + case CONN_SETUP: + sprintf(s,"setup"); + break; + case LOADING: + sprintf(s,"load"); + 
break; + case IDLE: + sprintf(s,"idle"); + break; + case WAITING_FOR_GET: + sprintf(s,"waiting for get"); + break; + case WAITING_FOR_SET: + sprintf(s,"waiting for set"); + break; + case WAITING_FOR_DELETE: + sprintf(s,"waiting for del"); + break; + case MAX_READ_STATE: + sprintf(s,"max"); + break; + } + if (type == 2) { + sprintf(output,"conn: %u, action: %s op: %s, opaque: %u, found: %d, type: %d\n",cid,a,op->key,op->opaque,found,op->type); + } else { + sprintf(output,"conn: %u, action: %s op: %s, opaque: %u, type: %d\n",cid,a,op->key,op->opaque,op->type); + } + write(2,output,strlen(output)); +} /** * Create a new connection to a server endpoint. */ Connection::Connection(struct event_base* _base, struct evdns_base* _evdns, string _hostname, string _port, options_t _options, - bool sampling) : + //ConcurrentQueue* a_trace_queue, + bool sampling ) : start_time(0), stats(sampling), options(_options), hostname(_hostname), port(_port), base(_base), evdns(_evdns) { valuesize = createGenerator(options.valuesize); keysize = createGenerator(options.keysize); + + //trace_queue = a_trace_queue; + opaque = 0; + total = 0; + op_queue_size = 0; + issue_buf_n = 0; + //; + //op_queue = (Operation**)malloc(sizeof(Operation*)*OPAQUE_MAX); + eof = 0; + keygen = new KeyGenerator(keysize, options.records); if (options.lambda <= 0) { @@ -39,32 +161,103 @@ Connection::Connection(struct event_base* _base, struct evdns_base* _evdns, read_state = INIT_READ; write_state = INIT_WRITE; - + last_quiet = false; + //op_queue.reserve(OPAQUE_MAX); //new std::vector(OPAQUE_MAX); + last_tx = last_rx = 0.0; - bev = bufferevent_socket_new(base, -1, BEV_OPT_CLOSE_ON_FREE); - bufferevent_setcb(bev, bev_read_cb, bev_write_cb, bev_event_cb, this); - bufferevent_enable(bev, EV_READ | EV_WRITE); + pthread_mutex_lock(&cid_lock); + cid = connids++; + pthread_mutex_unlock(&cid_lock); + + issue_buf_size = 0; + issue_buf = (unsigned char*)malloc(sizeof(unsigned char)*MAX_BUFFER_SIZE); + 
memset(issue_buf,0,MAX_BUFFER_SIZE); + issue_buf_pos = issue_buf; + timer = evtimer_new(base, timer_cb, this); - if (options.binary) { - prot = new ProtocolBinary(options, this, bev); - } else { - prot = new ProtocolAscii(options, this, bev); - } +} + +//void Connection::set_queue(ConcurrentQueue* a_trace_queue) { +// trace_queue = a_trace_queue; +//} - if (bufferevent_socket_connect_hostname(bev, evdns, AF_UNSPEC, +void Connection::set_queue(queue* a_trace_queue) { + trace_queue = a_trace_queue; + //while (trace_queue->size() < 1); + //usleep(1000); +} + +void Connection::set_lock(pthread_mutex_t* a_lock) { + lock = a_lock; +} + +uint32_t Connection::get_cid() { + return cid; +} + +int Connection::do_connect() { + + int connected = 0; + if (options.unix_socket) { + + bev = bufferevent_socket_new(base, -1, BEV_OPT_CLOSE_ON_FREE); + bufferevent_setcb(bev, bev_read_cb, bev_write_cb, bev_event_cb, this); + bufferevent_enable(bev, EV_READ | EV_WRITE); + + struct sockaddr_un sin; + memset(&sin, 0, sizeof(sin)); + sin.sun_family = AF_LOCAL; + strcpy(sin.sun_path, hostname.c_str()); + + int addrlen; + addrlen = sizeof(sin); + int err = bufferevent_socket_connect(bev, (struct sockaddr*)&sin, addrlen); + if (err == 0) { + connected = 1; + if (options.binary) { + prot = new ProtocolBinary(options, this, bev); + } else if (options.redis) { + prot = new ProtocolRESP(options, this, bev); + } else { + prot = new ProtocolAscii(options, this, bev); + } + } else { + connected = 0; + err = errno; + fprintf(stderr,"error %s\n",strerror(err)); + bufferevent_free(bev); + //event_base_free(_evbase_ptr); + } + } else { + bev = bufferevent_socket_new(base, -1, BEV_OPT_CLOSE_ON_FREE); + bufferevent_setcb(bev, bev_read_cb, bev_write_cb, bev_event_cb, this); + bufferevent_enable(bev, EV_READ | EV_WRITE); + + if (options.binary) { + prot = new ProtocolBinary(options, this, bev); + } else if (options.redis) { + prot = new ProtocolRESP(options, this, bev); + } else { + prot = new 
ProtocolAscii(options, this, bev); + } + if (bufferevent_socket_connect_hostname(bev, evdns, AF_UNSPEC, hostname.c_str(), - atoi(port.c_str()))) { - DIE("bufferevent_socket_connect_hostname()"); + atoi(port.c_str())) == 0) { + connected = 1; + } else { + bufferevent_free(bev); + connected = 0; + } } - - timer = evtimer_new(base, timer_cb, this); + return connected; } /** * Destroy a connection, performing cleanup. */ Connection::~Connection() { + event_free(timer); timer = NULL; // FIXME: W("Drain op_q?"); @@ -81,8 +274,8 @@ Connection::~Connection() { */ void Connection::reset() { // FIXME: Actually check the connection, drain all bufferevents, drain op_q. - assert(op_queue.size() == 0); - evtimer_del(timer); + //assert(op_queue.size() == 0); + //evtimer_del(timer); read_state = IDLE; write_state = INIT_WRITE; stats = ConnectionStats(stats.sampling); @@ -120,9 +313,10 @@ void Connection::start_loading() { */ void Connection::issue_something(double now) { char key[256]; + memset(key,0,256); // FIXME: generate key distribution here! 
string keystr = keygen->generate(lrand48() % options.records); - strcpy(key, keystr.c_str()); + strncpy(key, keystr.c_str(),255); if (drand48() < options.update) { int index = lrand48() % (1024 * 1024); @@ -132,12 +326,405 @@ void Connection::issue_something(double now) { } } + +/** + * Get/Set Style + * Issue a get first, if not found then set + */ +void Connection::issue_getset(double now) { + + if (!options.read_file && !kvfile.is_open()) + { + string keystr; + char key[256]; + memset(key,0,256); + keystr = keygen->generate(lrand48() % options.records); + strncpy(key, keystr.c_str(),255); + + char log[1024]; + int length = valuesize->generate(); + sprintf(log,"%s,%d\n",key,length); + write(2,log,strlen(log)); + + issue_get_with_len(key, length, now); + } + else + { + string line; + string rT; + string rApp; + string rReq; + string rKey; + string rvaluelen; + + pthread_mutex_lock(&flock); + getline(kvfile,line); + pthread_mutex_unlock(&flock); + stringstream ss(line); + getline( ss, rT, ','); + getline( ss, rApp, ','); + getline( ss, rReq, ','); + getline( ss, rKey, ',' ); + getline( ss, rvaluelen, ',' ); + + int vl = atoi(rvaluelen.c_str()); + + char key[256]; + memset(key,0,256); + strncpy(key, rKey.c_str(),255); + issue_get_with_len(key, vl, now); + } + +} + +int Connection::issue_something_trace(double now) { + int ret = 0; + + string line; + string rT; + string rApp; + string rOp; + string rKey; + string rKeySize; + string rvaluelen; + + pthread_mutex_lock(&flock); + if (kvfile.good()) { + getline(kvfile,line); + pthread_mutex_unlock(&flock); + } + else { + pthread_mutex_unlock(&flock); + return 1; + } + stringstream ss(line); + int Op = 0; + int vl = 0; + + if (options.twitter_trace == 1) { + getline( ss, rT, ',' ); + getline( ss, rKey, ',' ); + getline( ss, rKeySize, ',' ); + getline( ss, rvaluelen, ',' ); + getline( ss, rApp, ',' ); + getline( ss, rOp, ',' ); + vl = atoi(rvaluelen.c_str()); + if (vl < 1) vl = 1; + if (rOp.compare("get") == 0) { + Op = 1; 
+ } else if (rOp.compare("set") == 0) { + Op = 2; + } else { + Op = 0; + } + + while (Op == 0) { + string line1; + pthread_mutex_lock(&flock); + if (kvfile.good()) { + getline(kvfile,line1); + pthread_mutex_unlock(&flock); + } + stringstream ss1(line1); + getline( ss1, rT, ',' ); + getline( ss1, rKey, ',' ); + getline( ss1, rKeySize, ',' ); + getline( ss1, rvaluelen, ',' ); + getline( ss1, rApp, ',' ); + getline( ss1, rOp, ',' ); + vl = atoi(rvaluelen.c_str()); + if (vl < 1) vl = 1; + + if (rOp.compare("get") == 0) { + Op = 1; + } else if (rOp.compare("set") == 0) { + Op = 2; + } else { + Op = 0; + } + } + + } else { + getline( ss, rT, ',' ); + getline( ss, rApp, ',' ); + getline( ss, rOp, ',' ); + getline( ss, rKey, ',' ); + getline( ss, rvaluelen, ',' ); + if (rOp.compare("read") == 0) + Op = 1; + if (rOp.compare("write") == 0) + Op = 2; + vl = atoi(rvaluelen.c_str()); + } + + + if (vl > 524000) vl = 524000; + //if (strcmp(key,"100004781") == 0) { + // fprintf(stderr,"ready!\n"); + //} + switch(Op) + { + case 1: + issue_get_with_len(rKey.c_str(), vl, now); + break; + case 2: + int index = lrand48() % (1024 * 1024); + issue_set(rKey.c_str(), &random_char[index], vl, now,true); + break; + } + return ret; +} + + +/** + * Get/Set or Set Style + * If a GET command: Issue a get first, if not found then set + * If trace file (or prob. 
write) says to set, then set it + */ +int Connection::issue_getsetorset(double now) { + + int ret = 0; + + if (!options.read_file) { + string keystr; + char key[256]; + memset(key,0,256); + keystr = keygen->generate(lrand48() % options.records); + strncpy(key, keystr.c_str(),255); + + char log[1024]; + int length = valuesize->generate(); + sprintf(log,"%s,%d\n",key,length); + write(2,log,strlen(log)); + + issue_get_with_len(key, length, now); + + } else { + + string line; + string rT; + string rApp; + string rOp; + string rKey; + string rKeySize; + string rvaluelen; + + + int nissued = 0; + //fprintf(stderr,"starting to issue, current %d\n",issue_buf_n); + while (nissued < options.depth) { + //bool res = trace_queue->try_dequeue(line); + + if (trace_queue->size() > 0) { + pthread_mutex_lock(lock); + line = trace_queue->front(); + trace_queue->pop(); + pthread_mutex_unlock(lock); + if (line.compare("EOF") == 0) { + eof = 1; + return 1; + } + + stringstream ss(line); + int Op = 0; + int vl = 0; + + if (options.twitter_trace == 1) { + getline( ss, rT, ',' ); + getline( ss, rKey, ',' ); + getline( ss, rKeySize, ',' ); + getline( ss, rvaluelen, ',' ); + getline( ss, rApp, ',' ); + getline( ss, rOp, ',' ); + //vl = atoi(rvaluelen.c_str()); + vl = stoi(rvaluelen); + //vl = 100000; + if (vl < 1) continue; + if (vl > 524000) vl = 524000; + if (rOp.compare("get") == 0) { + Op = 1; + } else if (rOp.compare("set") == 0) { + Op = 2; + } else { + Op = 0; + } + + + } else if (options.twitter_trace == 2) { + getline( ss, rT, ',' ); + getline( ss, rApp, ',' ); + getline( ss, rOp, ',' ); + getline( ss, rKey, ',' ); + getline( ss, rvaluelen, ',' ); + Op = stoi(rOp); + vl = stoi(rvaluelen); + } else { + getline( ss, rT, ',' ); + getline( ss, rApp, ',' ); + getline( ss, rOp, ',' ); + getline( ss, rKey, ',' ); + getline( ss, rvaluelen, ',' ); + vl = stoi(rvaluelen); + if (rOp.compare("read") == 0) + Op = 1; + if (rOp.compare("write") == 0) + Op = 2; + } + + + char key[256]; + 
memset(key,0,256); + strncpy(key, rKey.c_str(),255); + int issued = 0; + switch(Op) + { + case 0: + //fprintf(stderr,"invalid line: %s, vl: %d @T: %d\n", + // key,vl,stoi(rT)); + break; + case 1: + if (nissued < options.depth-1) { + issued = issue_get_with_len(key, vl, now, true); + last_quiet = true; + } else { + issued = issue_get_with_len(key, vl, now, false); + last_quiet = false; + } + break; + case 2: + if (last_quiet) { + issue_noop(now); + } + int index = lrand48() % (1024 * 1024); + //issued = issue_get_with_len(key, vl, now, false); + issued = issue_set(key, &random_char[index], vl, now, true); + last_quiet = false; + break; + + } + if (issued) { + nissued++; + total++; + } else { + if (Op != 0) { + fprintf(stderr,"failed to issue line: %s, vl: %d @T: %d\n", + key,vl,stoi(rT)); + } + break; + } + } else { +//#ifdef DEBUGC + return 0; + //fprintf(stderr,"trace_queue size: %d\n",trace_queue->size()); + //if (stats.accesses > 10) { + // eof = 1; + // return 1; + //} + } + } + //fprintf(stderr,"done issue, current %d\n",issue_buf_n); + if (last_quiet) { + issue_noop(); + last_quiet = false; + } +#ifdef DEBUGC + fprintf(stderr,"getsetorset issuing %d reqs last quiet %d\n",issue_buf_n,last_quiet); + char *output = (char*)malloc(sizeof(char)*(issue_buf_size+512)); + fprintf(stderr,"-------------------------------------\n"); + memcpy(output,issue_buf,issue_buf_size); + write(2,output,issue_buf_size); + fprintf(stderr,"\n-------------------------------------\n"); + free(output); +#endif + //buffer is ready to go! + bufferevent_write(bev, issue_buf, issue_buf_size); + + memset(issue_buf,0,issue_buf_size); + issue_buf_pos = issue_buf; + issue_buf_size = 0; + issue_buf_n = 0; + } + + return ret; + +} + +/** + * Issue a get request to the server. 
+ */ +int Connection::issue_get_with_len(const char* key, int valuelen, double now, bool quiet) { + //Operation *op = new Operation; + Operation op; // = new Operation; + +#if HAVE_CLOCK_GETTIME + op.start_time = get_time_accurate(); +#else + if (now == 0.0) { +#if USE_CACHED_TIME + struct timeval now_tv; + event_base_gettimeofday_cached(base, &now_tv); + op.start_time = tv_to_double(&now_tv); +#else + op.start_time = get_time(); +#endif + } else { + op.start_time = now; + } +#endif + + //record before rx + //r_vsize = stats.rx_bytes % 100000; + //pthread_mutex_lock(&opaque_lock); + op.opaque = opaque++; + if (opaque > OPAQUE_MAX) { + opaque = 0; + } + //pthread_mutex_unlock(&opaque_lock); + + strncpy(op.key,key,255); + op.valuelen = valuelen; + op.type = Operation::GET; + //op.hv = hashstr(op.key); + //item_lock(op.hv,cid); + //pthread_mutex_t *lock = (pthread_mutex_t*)item_trylock(op.hv,cid); + //if (lock != NULL) { + op_queue[op.opaque] = op; + op_queue_size++; + //output_op(&op,0,0); + + //if (read_state == IDLE) read_state = WAITING_FOR_GET; + uint16_t keylen = strlen(key); + + // each line is 4-bytes + binary_header_t h = { 0x80, CMD_GET, htons(keylen), + 0x00, 0x00, htons(0), + htonl(keylen) }; + + if (quiet) { + //h.opcode = CMD_GETQ; + h.opcode = CMD_GET; + } + h.opaque = htonl(op.opaque); + + memcpy(issue_buf_pos,&h,24); + issue_buf_pos += 24; + issue_buf_size += 24; + memcpy(issue_buf_pos,key,keylen); + issue_buf_pos += keylen; + issue_buf_size += keylen; + issue_buf_n++; + + if (read_state != LOADING) stats.tx_bytes += 24 + keylen; + + stats.log_access(op); + return 1; +} + /** * Issue a get request to the server. 
*/ void Connection::issue_get(const char* key, double now) { Operation op; - int l; #if HAVE_CLOCK_GETTIME op.start_time = get_time_accurate(); @@ -155,23 +742,78 @@ void Connection::issue_get(const char* key, double now) { } #endif - op.key = string(key); + //record before rx + //r_vsize = stats.rx_bytes % 100000; + + op.opaque = opaque++; + if (opaque > OPAQUE_MAX) { + opaque = 0; + } + + strncpy(op.key,key,255); op.type = Operation::GET; - op_queue.push(op); + //op.hv = hashstr(op.key); + //item_lock(op.hv,cid); + op_queue[op.opaque] = op; + op_queue_size++; if (read_state == IDLE) read_state = WAITING_FOR_GET; - l = prot->get_request(key); + int l = prot->get_request(key,op.opaque); if (read_state != LOADING) stats.tx_bytes += l; + + stats.log_access(op); } /** - * Issue a set request to the server. + * Issue a delete90 request to the server. */ -void Connection::issue_set(const char* key, const char* value, int length, - double now) { +void Connection::issue_delete90(double now) { Operation op; int l; +#if HAVE_CLOCK_GETTIME + op.start_time = get_time_accurate(); +#else + if (now == 0.0) { +#if USE_CACHED_TIME + struct timeval now_tv; + event_base_gettimeofday_cached(base, &now_tv); + op.start_time = tv_to_double(&now_tv); +#else + op.start_time = get_time(); +#endif + } else { + op.start_time = now; + } +#endif + + op.type = Operation::DELETE; + op.opaque = 0; + op_queue[op.opaque] = op; + op_queue_size++; + + if (read_state == IDLE) read_state = WAITING_FOR_DELETE; + l = prot->delete90_request(); + if (read_state != LOADING) stats.tx_bytes += l; +} + +/** + * Issue a set request as a result of a miss to the server. + * The difference here is that we will yield to any outstanding SETs to this + * key, i.e. while waiting for GET response a SET to the key was issued. + * + * + * or v2? 
+ * - works with the lock held, since we want to beat any incoming writes + * - maintains program order, total set ordering + * - currenlty using this design + */ +void Connection::issue_set_miss(const char* key, const char* value, int length) { + //Operation *op = new Operation; + Operation op; // = new Operation; + int l; + double now = 0; + #if HAVE_CLOCK_GETTIME op.start_time = get_time_accurate(); #else @@ -179,41 +821,192 @@ void Connection::issue_set(const char* key, const char* value, int length, else op.start_time = now; #endif + //record value size + //r_vsize = length; + //r_appid = key[0] - '0'; + //const char* kptr = key; + //kptr += 2; + //r_key = atoi(kptr); + //r_ksize = strlen(kptr); + op.opaque = opaque++; + if (opaque > OPAQUE_MAX) { + opaque = 0; + } + + strncpy(op.key,key,255); + op.valuelen = length; op.type = Operation::SET; - op_queue.push(op); + //op.hv = hashstr(op.key); + op_queue[op.opaque] = op; + op_queue_size++; - if (read_state == IDLE) read_state = WAITING_FOR_SET; - l = prot->set_request(key, value, length); + //output_op(&op,1,0); + + //if (read_state == IDLE) read_state = WAITING_FOR_SET; + l = prot->set_request(key, value, length, op.opaque); if (read_state != LOADING) stats.tx_bytes += l; + + //if (is_access) + stats.log_access(op); +} + + +void Connection::issue_noop(double now) { + Operation op; + + if (now == 0.0) op.start_time = get_time(); + else op.start_time = now; + + //op.opaque = opaque++; + //if (opaque > OPAQUE_MAX) { + // opaque = 0; + //} + + //op.valuelen = 0; + //op.type = Operation::NOOP; + //op.hv = hashstr(op.key); + //pthread_mutex_t *lock = (pthread_mutex_t*)item_trylock(op.hv,cid); + //if (lock != NULL) { + //item_lock(op.hv,cid); + //op_queue[op.opaque] = op; + //op_queue_size++; + binary_header_t h = { 0x80, CMD_NOOP, 0x0000, + 0x00, 0x00, htons(0), + 0x00 }; + //h.opaque = htonl(op.opaque); + + memcpy(issue_buf_pos,&h,24); + issue_buf_pos += 24; + issue_buf_size += 24; + issue_buf_n++; } /** - * Return 
the oldest live operation in progress. + * Issue a set request to the server. */ -void Connection::pop_op() { - assert(op_queue.size() > 0); +int Connection::issue_set(const char* key, const char* value, int length, + double now, bool is_access) { + //Operation *op = new Operation; + Operation op; // = new Operation; - op_queue.pop(); +#if HAVE_CLOCK_GETTIME + op.start_time = get_time_accurate(); +#else + if (now == 0.0) op.start_time = get_time(); + else op.start_time = now; +#endif + + //record value size + //r_vsize = length; + //r_appid = key[0] - '0'; + //const char* kptr = key; + //kptr += 2; + //r_key = atoi(kptr); + //r_ksize = strlen(kptr); + op.opaque = opaque++; + if (opaque > OPAQUE_MAX) { + opaque = 0; + } + + op.valuelen = length; + op.type = Operation::SET; + strncpy(op.key,key,255); + //op.hv = hashstr(op.key); + //pthread_mutex_t *lock = (pthread_mutex_t*)item_trylock(op.hv,cid); + //if (lock != NULL) { + //item_lock(op.hv,cid); + op_queue[op.opaque] = op; + op_queue_size++; + + //output_op(&op,1,0); + uint16_t keylen = strlen(key); + + // each line is 4-bytes + binary_header_t h = { 0x80, CMD_SET, htons(keylen), + 0x08, 0x00, htons(0), + htonl(keylen + 8 + length) }; + h.opaque = htonl(op.opaque); + + memcpy(issue_buf_pos,&h,24); + issue_buf_pos += 24; + issue_buf_size += 24; + if (options.miss_through && is_access) { + uint32_t flags = htonl(16384); + memcpy(issue_buf_pos,&flags,4); + issue_buf_pos += 4; + issue_buf_size += 4; + uint32_t exp = 0; + memcpy(issue_buf_pos,&exp,4); + issue_buf_pos += 4; + issue_buf_size += 4; + + } else { + uint32_t flags = 0; + memcpy(issue_buf_pos,&flags,4); + issue_buf_pos += 4; + issue_buf_size += 4; + uint32_t exp = 0; + memcpy(issue_buf_pos,&exp,4); + issue_buf_pos += 4; + issue_buf_size += 4; + } + memcpy(issue_buf_pos,key,keylen); + issue_buf_pos += keylen; + issue_buf_size += keylen; + memcpy(issue_buf_pos,value,length); + issue_buf_pos += length; + issue_buf_size += length; + issue_buf_n++; + + + //if 
(read_state == IDLE) read_state = WAITING_FOR_SET; + //l = prot->set_request(key, value, length, op->opaque); + + //if (is_access) { + if (read_state != LOADING) stats.tx_bytes += length + 32 + keylen; + stats.log_access(op); + //} + return 1; + //} else { + // return 0; + //} +} + +/** + * Return the oldest live operation in progress. + */ +void Connection::pop_op(Operation *op) { + + //assert(op_queue.size() > 0); + uint32_t opopq = op->opaque; + //pthread_mutex_t *l = op->lock; + //delete op_queue[opopq]; + op_queue.erase(opopq); + op_queue_size--; + + //item_trylock_unlock(l,cid); + //item_unlock(hv,cid); if (read_state == LOADING) return; read_state = IDLE; // Advance the read state machine. - if (op_queue.size() > 0) { - Operation& op = op_queue.front(); - switch (op.type) { - case Operation::GET: read_state = WAITING_FOR_GET; break; - case Operation::SET: read_state = WAITING_FOR_SET; break; - default: DIE("Not implemented."); - } - } + //if (op_queue.size() > 0) { + // Operation& op = op_queue.front(); + // switch (op.type) { + // case Operation::GET: read_state = WAITING_FOR_GET; break; + // case Operation::SET: read_state = WAITING_FOR_SET; break; + // case Operation::DELETE: read_state = WAITING_FOR_DELETE; break; + // default: DIE("Not implemented."); + // } + //} } /** * Finish up (record stats) an operation that just returned from the * server. 
*/ -void Connection::finish_op(Operation *op) { +void Connection::finish_op(Operation *op, int was_hit) { double now; #if USE_CACHED_TIME struct timeval now_tv; @@ -228,25 +1021,84 @@ void Connection::finish_op(Operation *op) { op->end_time = now; #endif - switch (op->type) { - case Operation::GET: stats.log_get(*op); break; - case Operation::SET: stats.log_set(*op); break; - default: DIE("Not implemented."); + if (options.successful_queries && was_hit) { + switch (op->type) { + case Operation::GET: stats.log_get(*op); break; + case Operation::SET: stats.log_set(*op); break; + case Operation::DELETE: break; + default: DIE("Not implemented."); + } + } else { + switch (op->type) { + case Operation::GET: stats.log_get(*op); break; + case Operation::SET: stats.log_set(*op); break; + case Operation::DELETE: break; + default: DIE("Not implemented."); + } } last_rx = now; - pop_op(); - drive_write_machine(); + uint32_t opopq = op->opaque; + op_queue.erase(opopq); + //op_queue.erase(op_queue.begin()+opopq); + //delete op_queue[opopq]; + op_queue_size--; + read_state = IDLE; + + //lets check if we should output stats for the window + //Do the binning for percentile outputs + //crude at start + if ((options.misswindow != 0) && ( ((stats.window_accesses) % options.misswindow) == 0)) + { + if (stats.window_gets != 0) + { + //printf("%lu,%.4f\n",(stats.accesses), + // ((double)stats.window_get_misses/(double)stats.window_accesses)); + stats.window_gets = 0; + stats.window_get_misses = 0; + stats.window_sets = 0; + stats.window_accesses = 0; + } + } + } + + /** * Check if our testing is done and we should exit. 
*/ bool Connection::check_exit_condition(double now) { if (read_state == INIT_READ) return false; if (now == 0.0) now = get_time(); - if (now > start_time + options.time) return true; - if (options.loadonly && read_state == IDLE) return true; + + if (options.read_file) { + if (eof) { + return true; + } + else if ((options.queries == 1) && + (now > start_time + options.time)) + { + return true; + } + else { + return false; + } + + } else { + if (options.queries != 0 && + (((long unsigned)options.queries) == (stats.accesses))) + { + return true; + } + if ((options.queries == 0) && + (now > start_time + options.time)) + { + return true; + } + if (options.loadonly && read_state == IDLE) return true; + } + return false; } @@ -259,7 +1111,7 @@ void Connection::event_callback(short events) { int fd = bufferevent_getfd(bev); if (fd < 0) DIE("bufferevent_getfd"); - if (!options.no_nodelay) { + if (!options.no_nodelay && !options.unix_socket) { int one = 1; if (setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, (void *) &one, sizeof(one)) < 0) @@ -270,14 +1122,19 @@ void Connection::event_callback(short events) { if (prot->setup_connection_w()) { read_state = IDLE; } + drive_write_machine(); } else if (events & BEV_EVENT_ERROR) { int err = bufferevent_socket_get_dns_error(bev); - if (err) DIE("DNS error: %s", evutil_gai_strerror(err)); + if (err) fprintf(stderr,"DNS error: %s", evutil_gai_strerror(err)); + fprintf(stderr,"Got an error: %s\n", + evutil_socket_error_to_string(EVUTIL_SOCKET_ERROR())); DIE("BEV_EVENT_ERROR: %s", strerror(errno)); } else if (events & BEV_EVENT_EOF) { - DIE("Unexpected EOF from server."); + //DIE("Unexpected EOF from server."); + fprintf(stderr,"Unexpected EOF from server."); + return; } } @@ -293,7 +1150,9 @@ void Connection::drive_write_machine(double now) { double delay; struct timeval tv; - if (check_exit_condition(now)) return; + if (check_exit_condition(now)) { + return; + } while (1) { switch (write_state) { @@ -303,9 +1162,14 @@ void 
Connection::drive_write_machine(double now) { double_to_tv(delay, &tv); evtimer_add(timer, &tv); write_state = WAITING_FOR_TIME; + write_state = ISSUING; break; case ISSUING: + if (op_queue_size >= (size_t) options.depth) { + write_state = WAITING_FOR_OPQ; + return; + } if (op_queue.size() >= (size_t) options.depth) { write_state = WAITING_FOR_OPQ; return; @@ -323,9 +1187,15 @@ void Connection::drive_write_machine(double now) { return; } - issue_something(now); + if (options.getsetorset) { + int ret = issue_getsetorset(now); + if (ret) return; //if at EOF + } else { + issue_something(now); + } + last_tx = now; - stats.log_op(op_queue.size()); + stats.log_op(op_queue_size); next_time += iagen->generate(); if (options.skip && options.lambda > 0.0 && @@ -352,7 +1222,7 @@ void Connection::drive_write_machine(double now) { break; case WAITING_FOR_OPQ: - if (op_queue.size() >= (size_t) options.depth) return; + if (op_queue_size >= (size_t) options.depth) return; write_state = ISSUING; break; @@ -368,82 +1238,179 @@ void Connection::read_callback() { struct evbuffer *input = bufferevent_get_input(bev); Operation *op = NULL; - bool done, full_read; - - if (op_queue.size() == 0) V("Spurious read callback."); - - while (1) { - if (op_queue.size() > 0) op = &op_queue.front(); - - switch (read_state) { - case INIT_READ: DIE("event from uninitialized connection"); - case IDLE: return; // We munched all the data we expected? 
- - case WAITING_FOR_GET: - assert(op_queue.size() > 0); - full_read = prot->handle_response(input, done); - if (!full_read) { - return; - } else if (done) { - finish_op(op); // sets read_state = IDLE - } - break; - - case WAITING_FOR_SET: - assert(op_queue.size() > 0); - if (!prot->handle_response(input, done)) return; - finish_op(op); - break; - - case LOADING: - assert(op_queue.size() > 0); - if (!prot->handle_response(input, done)) return; - loader_completed++; - pop_op(); - - if (loader_completed == options.records) { - D("Finished loading."); - read_state = IDLE; - } else { - while (loader_issued < loader_completed + LOADER_CHUNK) { - if (loader_issued >= options.records) break; - - char key[256]; - string keystr = keygen->generate(loader_issued); - strcpy(key, keystr.c_str()); - int index = lrand48() % (1024 * 1024); - issue_set(key, &random_char[index], valuesize->generate()); - - loader_issued++; - } - } - - break; - - case CONN_SETUP: + bool done, found; + + //initially assume found (for sets that may come through here) + //is this correct? do we want to assume true in case that + //GET was found, but wrong value size (i.e. 
update value) + // + found = true; + //bool full_read = true; + //fprintf(stderr,"read_cb start with current queue of ops: %lu and issue_buf_n: %d\n",op_queue.size(),issue_buf_n); + + //if (op_queue.size() == 0) V("Spurious read callback."); + bool full_read = true; + while (full_read) { + + if (read_state == CONN_SETUP) { assert(options.binary); if (!prot->setup_connection_r(input)) return; read_state = IDLE; break; + } + + int opcode; + uint32_t opaque; + full_read = prot->handle_response(input, done, found, opcode, opaque); + if (full_read) { + if (opcode == CMD_NOOP) { +#ifdef DEBUGC + char out[128]; + sprintf(out,"conn: %u, reading noop\n",cid); + write(2,out,strlen(out)); +#endif + continue; + } + op = &op_queue[opaque]; +#ifdef DEBUGC + char out[128]; + sprintf(out,"conn: %u, reading opaque: %u\n",cid,opaque); + write(2,out,strlen(out)); + output_op(op,2,found); +#endif + if (strlen(op->key) < 1) { + //char out2[128]; + //sprintf(out2,"conn: %u, bad op: %s\n",cid,op->key.c_str()); + //write(2,out2,strlen(out2)); + continue; + } + } else { + break; + } + + + switch (op->type) { + case Operation::GET: + if (done) { + if ( !found && (options.getset || options.getsetorset) ) {// && + //(options.twitter_trace != 1)) { + char key[256]; + string keystr = op->key; + strcpy(key, keystr.c_str()); + int valuelen = op->valuelen; + int index = lrand48() % (1024 * 1024); + finish_op(op,0); // sets read_state = IDLE + if (last_quiet) { + issue_noop(); + } + //issue_set_miss(key, &random_char[index], valuelen); + issue_set(key, &random_char[index], valuelen, false); + last_quiet = false; + + } else { + if (found) { + finish_op(op,1); + } else { + finish_op(op,0); + } + } + } else { + char out[128]; + sprintf(out,"conn: %u, not done reading, should do something",cid); + write(2,out,strlen(out)); + } + break; + case Operation::SET: + finish_op(op,1); + break; + default: + fprintf(stderr,"op: %p, key: %s opaque: %u\n",(void*)op,op->key,op->opaque); + DIE("not implemented"); + 
} - default: DIE("not implemented"); + } + + double now = get_time(); + if (check_exit_condition(now)) { + return; + } +#ifdef DEBUGC + fprintf(stderr,"read_cb done with current queue of ops: %d and issue_buf_n: %d\n",op_queue_size,issue_buf_n); + for (auto x : op_queue) { + cerr << x.first << ": " << x.second.key << endl; + } +#endif + //buffer is ready to go! + //if (issue_buf_n >= options.depth) { + if (issue_buf_n > 0) { + if (last_quiet) { + issue_noop(); + last_quiet = false; } +#ifdef DEBUGC + fprintf(stderr,"read_cb writing %d reqs, last quiet %d\n",issue_buf_n,last_quiet); + char *output = (char*)malloc(sizeof(char)*(issue_buf_size+512)); + fprintf(stderr,"-------------------------------------\n"); + memcpy(output,issue_buf,issue_buf_size); + write(2,output,issue_buf_size); + fprintf(stderr,"\n-------------------------------------\n"); + free(output); +#endif + + bufferevent_write(bev, issue_buf, issue_buf_size); + memset(issue_buf,0,issue_buf_size); + issue_buf_pos = issue_buf; + issue_buf_size = 0; + issue_buf_n = 0; } + + //if (op_queue_size > (uint32_t) options.depth) { + // fprintf(stderr,"read_cb opqueue too big %d\n",op_queue_size); + // return; + //} else { + // fprintf(stderr,"read_cb issing %d\n",op_queue_size); + // issue_getsetorset(now); + //} + last_tx = now; + stats.log_op(op_queue_size); + drive_write_machine(); + + // update events + //if (bev != NULL) { + // // no pending response (nothing to read) and output buffer empty (nothing to write) + // if ((op_queue.size() == 0) && (evbuffer_get_length(bufferevent_get_output(bev)) == 0)) { + // bufferevent_disable(bev, EV_WRITE|EV_READ); + // } + //} } /** * Callback called when write requests finish. */ -void Connection::write_callback() {} +void Connection::write_callback() { + + //fprintf(stderr,"loaded evbuffer with ops: %u\n",op_queue.size()); +} /** * Callback for timer timeouts. 
*/ -void Connection::timer_callback() { drive_write_machine(); } +void Connection::timer_callback() { + drive_write_machine(); +} +// //fprintf(stderr,"timer callback issuing requests!\n"); +// if (op_queue_size >= (size_t) options.depth) { +// return; +// } else { +// double now = get_time(); +// issue_getsetorset(now); +// } +//} /* The follow are C trampolines for libevent callbacks. */ void bev_event_cb(struct bufferevent *bev, short events, void *ptr) { + Connection* conn = (Connection*) ptr; conn->event_callback(events); } diff --git a/Connection.h b/Connection.h index fea451e..b617b9e 100644 --- a/Connection.h +++ b/Connection.h @@ -4,12 +4,16 @@ #include #include +#include +#include +#include #include #include #include #include +#include "bipbuffer.h" #include "AdaptiveSampler.h" #include "cmdline.h" #include "ConnectionOptions.h" @@ -17,15 +21,61 @@ #include "Generator.h" #include "Operation.h" #include "util.h" - +#include "blockingconcurrentqueue.h" #include "Protocol.h" +#define OPAQUE_MAX 64000 +#define hashsize(n) ((unsigned long int)1<<(n)) +#define hashmask(n) (hashsize(n)-1) + +#define MAX_BUFFER_SIZE 10*1024*1024 +#define MAX_LEVELS 2+1 + using namespace std; +using namespace moodycamel; + + +typedef struct _evicted_type { + bool evicted; + uint32_t evictedFlags; + uint32_t serverFlags; + uint32_t clsid; + uint32_t evictedKeyLen; + uint32_t evictedLen; + char *evictedKey; + char *evictedData; +} evicted_t; + +typedef struct resp { + uint32_t opaque; + int opcode; + bool found; + evicted_t* evict; +} resp_t; + void bev_event_cb(struct bufferevent *bev, short events, void *ptr); void bev_read_cb(struct bufferevent *bev, void *ptr); +void bev_event_cb1(struct bufferevent *bev, short events, void *ptr); +void bev_event_cb1_approx(struct bufferevent *bev, short events, void *ptr); +void bev_event_cb1_approx_batch(struct bufferevent *bev, short events, void *ptr); +void bev_read_cb1(struct bufferevent *bev, void *ptr); +void bev_read_cb1_approx(struct 
bufferevent *bev, void *ptr); +void bev_read_cb1_approx_batch(struct bufferevent *bev, void *ptr); +void bev_event_cb2(struct bufferevent *bev, short events, void *ptr); +void bev_event_cb2_approx(struct bufferevent *bev, short events, void *ptr); +void bev_event_cb2_approx_batch(struct bufferevent *bev, short events, void *ptr); +void bev_read_cb2(struct bufferevent *bev, void *ptr); +void bev_read_cb2_approx(struct bufferevent *bev, void *ptr); +void bev_read_cb2_approx_batch(struct bufferevent *bev, void *ptr); void bev_write_cb(struct bufferevent *bev, void *ptr); +void bev_write_cb_m(struct bufferevent *bev, void *ptr); +void bev_write_cb_m_approx(struct bufferevent *bev, void *ptr); +void bev_write_cb_m_approx_batch(struct bufferevent *bev, void *ptr); void timer_cb(evutil_socket_t fd, short what, void *ptr); +void timer_cb_m(evutil_socket_t fd, short what, void *ptr); +void timer_cb_m_approx(evutil_socket_t fd, short what, void *ptr); +void timer_cb_m_approx_batch(evutil_socket_t fd, short what, void *ptr); class Protocol; @@ -33,9 +83,13 @@ class Connection { public: Connection(struct event_base* _base, struct evdns_base* _evdns, string _hostname, string _port, options_t options, + //ConcurrentQueue *a_trace_queue, bool sampling = true); + ~Connection(); + int do_connect(); + double start_time; // Time when this connection began operations. ConnectionStats stats; options_t options; @@ -54,6 +108,11 @@ class Connection { void read_callback(); void write_callback(); void timer_callback(); + + uint32_t get_cid(); + //void set_queue(ConcurrentQueue *a_trace_queue); + void set_queue(queue *a_trace_queue); + void set_lock(pthread_mutex_t* a_lock); private: string hostname; @@ -75,6 +134,7 @@ class Connection { IDLE, WAITING_FOR_GET, WAITING_FOR_SET, + WAITING_FOR_DELETE, MAX_READ_STATE, }; @@ -92,33 +152,870 @@ class Connection { // Parameters to track progress of the data loader. 
int loader_issued, loader_completed; + uint32_t opaque; + int issue_buf_size; + int issue_buf_n; + unsigned char *issue_buf_pos; + unsigned char *issue_buf; + bool last_quiet; + uint32_t total; + uint32_t cid; + int eof; + + Protocol *prot; Generator *valuesize; Generator *keysize; KeyGenerator *keygen; Generator *iagen; - std::queue op_queue; + //std::vector> op_queue; + std::unordered_map op_queue; + + uint32_t op_queue_size; + pthread_mutex_t* lock; + //ConcurrentQueue *trace_queue; + queue *trace_queue; // state machine functions / event processing - void pop_op(); - void finish_op(Operation *op); + void pop_op(Operation *op); + void output_op(Operation *op, int type, bool was_found); + //void finish_op(Operation *op); + void finish_op(Operation *op,int was_hit); void issue_something(double now = 0.0); + int issue_something_trace(double now = 0.0); + void issue_getset(double now = 0.0); + int issue_getsetorset(double now = 0.0); void drive_write_machine(double now = 0.0); // request functions void issue_sasl(); + void issue_noop(double now = 0.0); void issue_get(const char* key, double now = 0.0); - void issue_set(const char* key, const char* value, int length, - double now = 0.0); + int issue_get_with_len(const char* key, int valuelen, double now = 0.0, bool quiet = false); + int issue_set(const char* key, const char* value, int length, + double now = 0.0, bool is_access = false); + void issue_set_miss(const char* key, const char* value, int length); + void issue_delete90(double now = 0.0); + + // protocol fucntions + int set_request_ascii(const char* key, const char* value, int length); + int set_request_binary(const char* key, const char* value, int length); + int set_request_resp(const char* key, const char* value, int length); + + int get_request_ascii(const char* key); + int get_request_binary(const char* key); + int get_request_resp(const char* key); + + bool consume_binary_response(evbuffer *input); + bool consume_ascii_line(evbuffer *input, bool 
&done); + bool consume_resp_line(evbuffer *input, bool &done); +}; + +class ConnectionMulti { +public: + ConnectionMulti(struct event_base* _base, struct evdns_base* _evdns, + string _hostname1, string _hostname2, string _port, options_t options, + bool sampling = true, int fd1 = -1, int fd2 = -1); + + ~ConnectionMulti(); + + int do_connect(); + + double start_time; // Time when this connection began operations. + ConnectionStats stats; + options_t options; + + bool is_ready() { return read_state == IDLE; } + void set_priority(int pri); + + // state commands + void start() { + //fprintf(stderr,"connid: %d starting...\n",cid); + drive_write_machine(); + } + void start_loading(); + void reset(); + bool check_exit_condition(double now = 0.0); + + void event_callback1(short events); + void event_callback2(short events); + void read_callback1(); + void read_callback2(); + // event callbacks + void write_callback(); + void timer_callback(); + + int eof; + uint32_t get_cid(); + //void set_queue(ConcurrentQueue *a_trace_queue); + int add_to_wb_keys(string wb_key); + int add_to_copy_keys(string key); + void del_wb_keys(string wb_key); + void del_copy_keys(string key); + void set_g_wbkeys(unordered_map> *a_wb_keys); + void set_queue(queue *a_trace_queue); + void set_lock(pthread_mutex_t* a_lock); + +private: + string hostname1; + string hostname2; + string port; + + double o_percent; + int trace_queue_n; + struct event_base *base; + struct evdns_base *evdns; + struct bufferevent *bev1; + struct bufferevent *bev2; + + struct event *timer; // Used to control inter-transmission time. + double next_time; // Inter-transmission time parameters. + double last_rx; // Used to moderate transmission rate. 
+ double last_tx; + + vector wb_keys; + enum read_state_enum { + INIT_READ, + CONN_SETUP, + LOADING, + IDLE, + WAITING_FOR_GET, + WAITING_FOR_SET, + WAITING_FOR_DELETE, + MAX_READ_STATE, + }; + + enum write_state_enum { + INIT_WRITE, + ISSUING, + WAITING_FOR_TIME, + WAITING_FOR_OPQ, + MAX_WRITE_STATE, + }; + + read_state_enum read_state; + write_state_enum write_state; + + // Parameters to track progress of the data loader. + int loader_issued, loader_completed; + + uint32_t *opaque; + int *issue_buf_size; + int *issue_buf_n; + unsigned char **issue_buf_pos; + unsigned char **issue_buf; + bool last_quiet1; + bool last_quiet2; + uint32_t total; + uint32_t cid; + + //std::vector> op_queue; + Operation ***op_queue; + uint32_t *op_queue_size; + + map key_hist; + + Generator *valuesize; + Generator *keysize; + KeyGenerator *keygen; + Generator *iagen; + pthread_mutex_t* lock; + unordered_map> *g_wb_keys; + queue *trace_queue; + + // state machine functions / event processing + void pop_op(Operation *op); + void output_op(Operation *op, int type, bool was_found); + //void finish_op(Operation *op); + void finish_op(Operation *op,int was_hit); + int issue_getsetorset(double now = 0.0); + void drive_write_machine(double now = 0.0); + + // request functions + void issue_sasl(); + void issue_noop(double now = 0.0, int level = 1); + int issue_touch(const char* key, int valuelen, double now, int level); + int issue_delete(const char* key, double now, uint32_t flags); + int issue_get_with_len(const char* key, int valuelen, double now, bool quiet, uint32_t flags, Operation *l1 = NULL); + int issue_set(const char* key, const char* value, int length, double now, uint32_t flags); + + // protocol fucntions + int set_request_ascii(const char* key, const char* value, int length); + int set_request_binary(const char* key, const char* value, int length); + int set_request_resp(const char* key, const char* value, int length); + + int get_request_ascii(const char* key); + int 
get_request_binary(const char* key); + int get_request_resp(const char* key); + + bool consume_binary_response(evbuffer *input); + bool consume_ascii_line(evbuffer *input, bool &done); + bool consume_resp_line(evbuffer *input, bool &done); +}; + +class ConnectionMultiApprox { +public: + ConnectionMultiApprox(struct event_base* _base, struct evdns_base* _evdns, + string _hostname1, string _hostname2, string _port, options_t options, + bool sampling = true, int fd1 = -1, int fd2 = -1); + + ~ConnectionMultiApprox(); + + int do_connect(); + + double start_time; // Time when this connection began operations. + ConnectionStats stats; + options_t options; + + bool is_ready() { return read_state == IDLE; } + void set_priority(int pri); + + // state commands + void start() { + //fprintf(stderr,"connid: %d starting...\n",cid); + drive_write_machine(); + } + void start_loading(); + void reset(); + bool check_exit_condition(double now = 0.0); + + void event_callback1(short events); + void event_callback2(short events); + void read_callback1(); + void read_callback2(); + // event callbacks + void write_callback(); + void timer_callback(); + + int eof; + uint32_t get_cid(); + //void set_queue(ConcurrentQueue *a_trace_queue); + int add_to_wb_keys(string wb_key); + int add_to_copy_keys(string key); + int add_to_touch_keys(string key); + void del_wb_keys(string wb_key); + void del_copy_keys(string key); + void del_touch_keys(string key); + void set_g_wbkeys(unordered_map> *a_wb_keys); + void set_queue(queue *a_trace_queue); + void set_lock(pthread_mutex_t* a_lock); + +private: + string hostname1; + string hostname2; + string port; + + double o_percent; + int trace_queue_n; + struct event_base *base; + struct evdns_base *evdns; + struct bufferevent *bev1; + struct bufferevent *bev2; + + struct event *timer; // Used to control inter-transmission time. + double next_time; // Inter-transmission time parameters. + double last_rx; // Used to moderate transmission rate. 
+ double last_tx; + + enum read_state_enum { + INIT_READ, + CONN_SETUP, + LOADING, + IDLE, + WAITING_FOR_GET, + WAITING_FOR_SET, + WAITING_FOR_DELETE, + MAX_READ_STATE, + }; + + enum write_state_enum { + INIT_WRITE, + ISSUING, + WAITING_FOR_TIME, + WAITING_FOR_OPQ, + MAX_WRITE_STATE, + }; + + read_state_enum read_state; + write_state_enum write_state; + + // Parameters to track progress of the data loader. + int loader_issued, loader_completed; + + uint32_t *opaque; + int *issue_buf_size; + int *issue_buf_n; + unsigned char **issue_buf_pos; + unsigned char **issue_buf; + bool last_quiet1; + bool last_quiet2; + uint32_t total; + uint32_t cid; + uint32_t gets; + uint32_t gloc; + uint32_t ghits; + uint32_t sloc; + uint32_t esets; + uint32_t isets; + uint32_t iloc; + + //std::vector> op_queue; + Operation ***op_queue; + uint32_t *op_queue_size; + + + Generator *valuesize; + Generator *keysize; + KeyGenerator *keygen; + Generator *iagen; + pthread_mutex_t* lock; + unordered_map> *g_wb_keys; + queue *trace_queue; + + // state machine functions / event processing + void pop_op(Operation *op); + void output_op(Operation *op, int type, bool was_found); + //void finish_op(Operation *op); + void finish_op(Operation *op,int was_hit); + int issue_getsetorset(double now = 0.0); + void drive_write_machine(double now = 0.0); + + // request functions + void issue_sasl(); + int issue_op(Operation* op); + void issue_noop(double now = 0.0, int level = 1); + int issue_touch(const char* key, int valuelen, double now, int level); + int issue_delete(const char* key, double now, uint32_t flags); + int issue_get_with_len(const char* key, int valuelen, double now, bool quiet, uint32_t flags, Operation *l1 = NULL); + int issue_get_with_len(Operation *pop, double now, bool quiet, uint32_t flags, Operation *l1 = NULL); + int issue_set(const char* key, const char* value, int length, double now, uint32_t flags); + int issue_set(Operation *pop, const char* value, double now, uint32_t flags); + + 
// protocol fucntions + int set_request_ascii(const char* key, const char* value, int length); + int set_request_binary(const char* key, const char* value, int length); + int set_request_resp(const char* key, const char* value, int length); + + int get_request_ascii(const char* key); + int get_request_binary(const char* key); + int get_request_resp(const char* key); + + bool consume_binary_response(evbuffer *input); + bool consume_ascii_line(evbuffer *input, bool &done); + bool consume_resp_line(evbuffer *input, bool &done); +}; + +class ConnectionMultiApproxBatch { +public: + ConnectionMultiApproxBatch(struct event_base* _base, struct evdns_base* _evdns, + string _hostname1, string _hostname2, string _port, options_t options, + bool sampling = true, int fd1 = -1, int fd2 = -1); + + ~ConnectionMultiApproxBatch(); + + int do_connect(); + + double start_time; // Time when this connection began operations. + ConnectionStats stats; + options_t options; + + bool is_ready() { return read_state == IDLE; } + void set_priority(int pri); + + // state commands + void start() { + //fprintf(stderr,"connid: %d starting...\n",cid); + drive_write_machine(); + } + void start_loading(); + void reset(); + bool check_exit_condition(double now = 0.0); + + void event_callback1(short events); + void event_callback2(short events); + void read_callback1(); + void read_callback2(); + void read_callback1_v1(); + void read_callback2_v1(); + // event callbacks + void write_callback(); + void timer_callback(); + + int eof; + uint32_t get_cid(); + //void set_queue(ConcurrentQueue *a_trace_queue); + int add_to_wb_keys(string wb_key); + int add_to_copy_keys(string key); + int add_to_touch_keys(string key); + void del_wb_keys(string wb_key); + void del_copy_keys(string key); + void del_touch_keys(string key); + void set_g_wbkeys(unordered_map> *a_wb_keys); + void set_queue(queue *a_trace_queue); + void set_lock(pthread_mutex_t* a_lock); + int send_write_buffer(int level); + int 
add_get_op_to_queue(Operation *pop, int level); + int add_set_to_queue(Operation *pop, int level, const char *value); + size_t handle_response_batch(unsigned char *rbuf_pos, resp_t *resp, + size_t read_bytes, size_t consumed_bytes, + int level, int extra); + +private: + string hostname1; + string hostname2; + string port; + + double o_percent; + int trace_queue_n; + struct event_base *base; + struct evdns_base *evdns; + struct bufferevent *bev1; + struct bufferevent *bev2; + + struct event *timer; // Used to control inter-transmission time. + double next_time; // Inter-transmission time parameters. + double last_rx; // Used to moderate transmission rate. + double last_tx; + + enum read_state_enum { + INIT_READ, + CONN_SETUP, + LOADING, + IDLE, + WAITING_FOR_GET, + WAITING_FOR_SET, + WAITING_FOR_DELETE, + MAX_READ_STATE, + }; + + enum write_state_enum { + INIT_WRITE, + ISSUING, + WAITING_FOR_TIME, + WAITING_FOR_OPQ, + MAX_WRITE_STATE, + }; + + read_state_enum read_state; + write_state_enum write_state; + + // Parameters to track progress of the data loader. 
+ int loader_issued, loader_completed; + + uint32_t *opaque; + int *issue_buf_size; + int *issue_buf_n; + unsigned char **issue_buf_pos; + unsigned char **issue_buf; + bool last_quiet1; + bool last_quiet2; + uint32_t total; + uint32_t cid; + uint32_t gets; + uint32_t gloc; + uint32_t ghits; + uint32_t sloc; + uint32_t esets; + uint32_t isets; + uint32_t iloc; + + uint32_t clsid_; + uint32_t incl_; + uint32_t buffer_size_; + unsigned char* buffer_write[MAX_LEVELS]; + unsigned char* buffer_read[MAX_LEVELS]; + unsigned char* buffer_write_pos[MAX_LEVELS]; + unsigned char* buffer_read_pos[MAX_LEVELS]; + unsigned char* buffer_lasthdr[MAX_LEVELS]; + unsigned char* buffer_leftover[MAX_LEVELS]; + uint32_t buffer_read_n[MAX_LEVELS]; + uint32_t buffer_write_n[MAX_LEVELS]; + uint32_t buffer_read_nbytes[MAX_LEVELS]; + uint32_t buffer_write_nbytes[MAX_LEVELS]; + + + //std::vector> op_queue; + Operation ***op_queue; + uint32_t *op_queue_size; + + + Generator *valuesize; + Generator *keysize; + KeyGenerator *keygen; + Generator *iagen; + pthread_mutex_t* lock; + unordered_map> *g_wb_keys; + queue *trace_queue; + + // state machine functions / event processing + void pop_op(Operation *op); + void output_op(Operation *op, int type, bool was_found); + //void finish_op(Operation *op); + void finish_op(Operation *op,int was_hit); + int issue_getsetorset(double now = 0.0); + void drive_write_machine(double now = 0.0); + + // request functions + void issue_sasl(); + int issue_op(Operation* op); + int issue_noop(int level = 1); + size_t fill_read_buffer(int level, int *extra); + int issue_touch(const char* key, int valuelen, double now, int level); + int issue_delete(const char* key, double now, uint32_t flags); + int issue_get_with_len(const char* key, int valuelen, double now, bool quiet, uint32_t flags, Operation *l1 = NULL); + int issue_get_with_len(Operation *pop, double now, bool quiet, uint32_t flags, Operation *l1 = NULL); + int issue_set(const char* key, const char* value, int 
length, double now, uint32_t flags); + int issue_set(Operation *pop, const char* value, double now, uint32_t flags); + + // protocol fucntions + int set_request_ascii(const char* key, const char* value, int length); + int set_request_binary(const char* key, const char* value, int length); + int set_request_resp(const char* key, const char* value, int length); + + int get_request_ascii(const char* key); + int get_request_binary(const char* key); + int get_request_resp(const char* key); + + bool consume_binary_response(evbuffer *input); + bool consume_ascii_line(evbuffer *input, bool &done); + bool consume_resp_line(evbuffer *input, bool &done); +}; + +class ConnectionMultiApproxShm { +public: + ConnectionMultiApproxShm(options_t options, bool sampling = true); + + ~ConnectionMultiApproxShm(); + + int do_connect(); + + double start_time; // Time when this connection began operations. + ConnectionStats stats; + options_t options; + + bool is_ready() { return read_state == IDLE; } + void set_priority(int pri); + + void start_loading(); + void reset(); + bool check_exit_condition(double now = 0.0); + + void event_callback1(short events); + void event_callback2(short events); + void read_callback1(); + void read_callback2(); + void read_callback1_v1(); + void read_callback2_v1(); + // event callbacks + void write_callback(); + void timer_callback(); + + int eof; + uint32_t get_cid(); + //void set_queue(ConcurrentQueue *a_trace_queue); + int add_to_wb_keys(string wb_key); + int add_to_copy_keys(string key); + int add_to_touch_keys(string key); + void del_wb_keys(string wb_key); + void del_copy_keys(string key); + void del_touch_keys(string key); + void set_g_wbkeys(unordered_map> *a_wb_keys); + void set_queue(queue *a_trace_queue); + void set_lock(pthread_mutex_t* a_lock); + int send_write_buffer(int level); + int add_get_op_to_queue(Operation *pop, int level); + int add_set_to_queue(Operation *pop, int level, const char *value); + size_t handle_response_batch(unsigned 
char *rbuf_pos, resp_t *resp, + size_t read_bytes, size_t consumed_bytes, + int level, int extra); + void drive_write_machine_shm(double now = 0.0); + bipbuf_t* bipbuf_in[3]; + bipbuf_t* bipbuf_out[3]; + pthread_mutex_t* lock_in[3]; + pthread_mutex_t* lock_out[3]; + pthread_cond_t* cond_in_not_empty[3]; + pthread_cond_t* cond_in_not_full[3]; + pthread_cond_t* cond_out_not_empty[3]; + pthread_cond_t* cond_out_not_full[3]; + +private: + string hostname1; + string hostname2; + string port; + + double o_percent; + int trace_queue_n; + + struct event *timer; // Used to control inter-transmission time. + double next_time; // Inter-transmission time parameters. + double last_rx; // Used to moderate transmission rate. + double last_tx; + + enum read_state_enum { + INIT_READ, + CONN_SETUP, + LOADING, + IDLE, + WAITING_FOR_GET, + WAITING_FOR_SET, + WAITING_FOR_DELETE, + MAX_READ_STATE, + }; + + enum write_state_enum { + INIT_WRITE, + ISSUING, + WAITING_FOR_TIME, + WAITING_FOR_OPQ, + MAX_WRITE_STATE, + }; + + read_state_enum read_state; + write_state_enum write_state; + + // Parameters to track progress of the data loader. 
+ int loader_issued, loader_completed; + + uint32_t *opaque; + int *issue_buf_size; + int *issue_buf_n; + unsigned char **issue_buf_pos; + unsigned char **issue_buf; + bool last_quiet1; + bool last_quiet2; + uint32_t total; + uint32_t cid; + uint32_t gets; + uint32_t gloc; + uint32_t ghits; + uint32_t sloc; + uint32_t esets; + uint32_t isets; + uint32_t iloc; + + + //std::vector> op_queue; + Operation ***op_queue; + uint32_t *op_queue_size; + + + Generator *valuesize; + Generator *keysize; + KeyGenerator *keygen; + Generator *iagen; + pthread_mutex_t* lock; + unordered_map> *g_wb_keys; + queue *trace_queue; + queue extra_queue; + + // state machine functions / event processing + void pop_op(Operation *op); + void output_op(Operation *op, int type, bool was_found); + //void finish_op(Operation *op); + void finish_op(Operation *op,int was_hit); + int issue_getsetorset(double now = 0.0); + + // request functions + void issue_sasl(); + int issue_op(Operation* op); + void issue_noop(int level = 1); + size_t fill_read_buffer(int level, int *extra); + int issue_touch(const char* key, int valuelen, double now, int level); + int issue_delete(const char* key, double now, uint32_t flags); + int issue_get_with_len(const char* key, int valuelen, double now, bool quiet, uint32_t flags, Operation *l1 = NULL); + int issue_get_with_len(Operation *pop, double now, bool quiet, uint32_t flags, Operation *l1 = NULL); + int issue_set(const char* key, const char* value, int length, double now, uint32_t flags); + int issue_set(Operation *pop, const char* value, double now, uint32_t flags); + int offer_set(Operation *pop, int extra = 0); + int offer_get(Operation *pop, int extra = 0); + + int read_response_l1(); + void read_response_l2(); + // protocol fucntions + int set_request_ascii(const char* key, const char* value, int length); + int set_request_binary(const char* key, const char* value, int length); + int set_request_resp(const char* key, const char* value, int length); + + int 
get_request_ascii(const char* key); + int get_request_binary(const char* key); + int get_request_resp(const char* key); + + bool consume_binary_response(evbuffer *input); + bool consume_ascii_line(evbuffer *input, bool &done); + bool consume_resp_line(evbuffer *input, bool &done); +}; + +class ConnectionMultiApproxBatchShm { +public: + ConnectionMultiApproxBatchShm(options_t options, bool sampling = true); + + ~ConnectionMultiApproxBatchShm(); + + int do_connect(); + + double start_time; // Time when this connection began operations. + ConnectionStats stats; + options_t options; + + bool is_ready() { return read_state == IDLE; } + void set_priority(int pri); + + void start_loading(); + void reset(); + bool check_exit_condition(double now = 0.0); + + void read_callback1(); + void read_callback2(); + + int eof; + uint32_t get_cid(); + //void set_queue(ConcurrentQueue *a_trace_queue); + int add_to_wb_keys(string wb_key); + int add_to_copy_keys(string key); + int add_to_touch_keys(string key); + void del_wb_keys(string wb_key); + void del_copy_keys(string key); + void del_touch_keys(string key); + void set_g_wbkeys(unordered_map> *a_wb_keys); + void set_queue(queue *a_trace_queue); + void set_lock(pthread_mutex_t* a_lock); + size_t handle_response_batch(unsigned char *rbuf_pos, resp_t *resp, + size_t read_bytes, size_t consumed_bytes, + int level, int extra); + void drive_write_machine_shm(double now = 0.0); + bipbuf_t* bipbuf_in[3]; + bipbuf_t* bipbuf_out[3]; + pthread_mutex_t* lock_in[3]; + pthread_mutex_t* lock_out[3]; + + int *bipbuf_out_bytes[3]; + int *bipbuf_in_bytes[3]; + pthread_cond_t* cond_in_not_empty[3]; + pthread_cond_t* cond_in_not_full[3]; + pthread_cond_t* cond_out_not_empty[3]; + pthread_cond_t* cond_out_not_full[3]; + +private: + string hostname1; + string hostname2; + string port; + + double o_percent; + int trace_queue_n; + + struct event *timer; // Used to control inter-transmission time. + double next_time; // Inter-transmission time parameters. 
+ double last_rx; // Used to moderate transmission rate. + double last_tx; + + enum read_state_enum { + INIT_READ, + CONN_SETUP, + LOADING, + IDLE, + WAITING_FOR_GET, + WAITING_FOR_SET, + WAITING_FOR_DELETE, + MAX_READ_STATE, + }; + + enum write_state_enum { + INIT_WRITE, + ISSUING, + WAITING_FOR_TIME, + WAITING_FOR_OPQ, + MAX_WRITE_STATE, + }; + + read_state_enum read_state; + write_state_enum write_state; + + // Parameters to track progress of the data loader. + int loader_issued, loader_completed; + + uint32_t *opaque; + int *issue_buf_size; + int *issue_buf_n; + unsigned char **issue_buf_pos; + unsigned char **issue_buf; + bool last_quiet1; + bool last_quiet2; + uint32_t total; + uint32_t cid; + uint32_t gets; + uint32_t gloc; + uint32_t ghits; + uint32_t sloc; + uint32_t esets; + uint32_t isets; + uint32_t iloc; + + uint32_t buffer_size_; + unsigned char* buffer_write[MAX_LEVELS]; + unsigned char* buffer_read[MAX_LEVELS]; + unsigned char* buffer_write_pos[MAX_LEVELS]; + unsigned char* buffer_read_pos[MAX_LEVELS]; + unsigned char* buffer_lasthdr[MAX_LEVELS]; + unsigned char* buffer_leftover[MAX_LEVELS]; + uint32_t buffer_read_n[MAX_LEVELS]; + uint32_t buffer_write_n[MAX_LEVELS]; + uint32_t buffer_read_nbytes[MAX_LEVELS]; + uint32_t buffer_write_nbytes[MAX_LEVELS]; + + + //std::vector> op_queue; + Operation ***op_queue; + uint32_t *op_queue_size; + uint32_t *issued_queue; + + + Generator *valuesize; + Generator *keysize; + KeyGenerator *keygen; + Generator *iagen; + pthread_mutex_t* lock; + unordered_map> *g_wb_keys; + queue *trace_queue; + queue extra_queue; + + // state machine functions / event processing + void pop_op(Operation *op); + void output_op(Operation *op, int type, bool was_found); + //void finish_op(Operation *op); + void finish_op(Operation *op,int was_hit); + int issue_getsetorset(double now = 0.0); + + // request functions + void issue_sasl(); + int issue_op(Operation* op); + int issue_noop(int level = 1); + int issue_touch(const char* key, int 
valuelen, double now, int level); + int issue_delete(const char* key, double now, uint32_t flags); + int issue_get_with_len(const char* key, int valuelen, double now, bool quiet, uint32_t flags, Operation *l1 = NULL); + int issue_get_with_len(Operation *pop, double now, bool quiet, uint32_t flags, Operation *l1 = NULL); + int issue_set(const char* key, const char* value, int length, double now, uint32_t flags); + int issue_set(Operation *pop, const char* value, double now, uint32_t flags); + int offer_set(Operation *pop, int extra = 0); + int offer_get(Operation *pop, int extra = 0); + int send_write_buffer(int level); + size_t fill_read_buffer(int level, int *extra); + int add_get_op_to_queue(Operation *pop, int level, int cb = 0); + int add_set_to_queue(Operation *pop, int level, const char *value, int cb = 0); + int read_response_l1(); + void read_response_l2(); // protocol fucntions int set_request_ascii(const char* key, const char* value, int length); int set_request_binary(const char* key, const char* value, int length); + int set_request_resp(const char* key, const char* value, int length); + int get_request_ascii(const char* key); int get_request_binary(const char* key); + int get_request_resp(const char* key); bool consume_binary_response(evbuffer *input); bool consume_ascii_line(evbuffer *input, bool &done); + bool consume_resp_line(evbuffer *input, bool &done); }; #endif diff --git a/ConnectionMulti.backup b/ConnectionMulti.backup new file mode 100644 index 0000000..688ad3c --- /dev/null +++ b/ConnectionMulti.backup @@ -0,0 +1,1723 @@ +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "config.h" + +#include "Connection.h" +#include "distributions.h" +#include "Generator.h" +#include "mutilate.h" +#include "binary_protocol.h" +#include "util.h" +#include +#include +#include +#include +#include +#include "blockingconcurrentqueue.h" + +#define ITEM_L1 1 +#define ITEM_L2 2 
+#define LOG_OP 4 +#define SRC_L1_M 8 +#define SRC_L1_H 16 +#define SRC_L2_M 32 +#define SRC_L2_H 64 +#define SRC_DIRECT_SET 128 +#define SRC_L1_COPY 256 +#define SRC_WB 512 + +#define ITEM_INCL 4096 +#define ITEM_EXCL 8192 +#define ITEM_DIRTY 16384 +#define ITEM_SIZE_CHANGE 131072 +#define ITEM_WAS_HIT 262144 + +#define LEVELS 2 +#define SET_INCL(incl,flags) \ + switch (incl) { \ + case 1: \ + flags |= ITEM_INCL; \ + break; \ + case 2: \ + flags |= ITEM_EXCL; \ + break; \ + \ + } \ + +#define GET_INCL(incl,flags) \ + if (flags & ITEM_INCL) incl = 1; \ + else if (flags & ITEM_EXCL) incl = 2; \ + +//#define OP_level(op) ( ((op)->flags & ITEM_L1) ? ITEM_L1 : ITEM_L2 ) +#define OP_level(op) ( (op)->flags & ~(LOG_OP | \ + ITEM_INCL | ITEM_EXCL | ITEM_DIRTY | \ + SRC_L1_M | SRC_L1_H | SRC_L2_M | SRC_L2_H | \ + SRC_DIRECT_SET | SRC_L1_COPY | SRC_WB ) ) + +#define FLAGS_level(flags) ( flags & ~(LOG_OP | \ + ITEM_INCL | ITEM_EXCL | ITEM_DIRTY | \ + SRC_L1_M | SRC_L1_H | SRC_L2_M | SRC_L2_H | \ + SRC_DIRECT_SET | SRC_L1_COPY | SRC_WB ) ) + +#define OP_clu(op) ( (op)->flags & ~(LOG_OP | \ + ITEM_L1 | ITEM_L2 | ITEM_DIRTY | \ + SRC_L1_M | SRC_L1_H | SRC_L2_M | SRC_L2_H | \ + SRC_DIRECT_SET | SRC_L1_COPY | SRC_WB ) ) + +#define OP_src(op) ( (op)->flags & ~(ITEM_L1 | ITEM_L2 | LOG_OP | \ + ITEM_INCL | ITEM_EXCL | ITEM_DIRTY ) ) + +#define OP_log(op) ((op)->flags & LOG_OP) +#define OP_incl(op) ((op)->flags & ITEM_INCL) +#define OP_excl(op) ((op)->flags & ITEM_EXCL) +#define OP_set_flag(op,flag) ((op))->flags |= flag; + +//#define DEBUGMC +//#define DEBUGS + +using namespace moodycamel; + +pthread_mutex_t cid_lock_m = PTHREAD_MUTEX_INITIALIZER; +static uint32_t connids_m = 1; + +#define NCLASSES 40 +#define CHUNK_ALIGN_BYTES 8 +static int classes = 0; +static int sizes[NCLASSES+1]; +static int inclusives[NCLASSES+1]; + +typedef struct _evicted_type { + bool evicted; + uint32_t evictedFlags; + uint32_t serverFlags; + uint32_t clsid; + uint32_t evictedKeyLen; + uint32_t evictedLen; 
+ char *evictedKey; + char *evictedData; +} evicted_t; + +static vector cid_rate; + +extern int max_n[3]; + +static void init_inclusives(char *inclusive_str) { + int j = 1; + for (int i = 0; i < (int)strlen(inclusive_str); i++) { + if (inclusive_str[i] == '-') { + continue; + } else { + inclusives[j] = inclusive_str[i] - '0'; + j++; + } + } +} + +static void init_classes() { + + double factor = 1.25; + unsigned int chunk_size = 48; + unsigned int item_size = 24; + unsigned int size = 96; //warning if you change this you die + unsigned int i = 0; + unsigned int chunk_size_max = 1048576/2; + while (++i < NCLASSES-1) { + if (size >= chunk_size_max / factor) { + break; + } + if (size % CHUNK_ALIGN_BYTES) + size += CHUNK_ALIGN_BYTES - (size % CHUNK_ALIGN_BYTES); + sizes[i] = size; + size *= factor; + } + sizes[i] = chunk_size_max; + classes = i; + +} + +static int get_class(int vl, uint32_t kl) { + //warning if you change this you die + int vsize = vl+kl+48+1+2; + int res = 1; + while (vsize > sizes[res]) + if (res++ == classes) { + //fprintf(stderr,"item larger than max class size. 
vsize: %d, class size: %d\n",vsize,sizes[res]);
+            return -1;
+        }
+    return res;
+}
+
+static int get_incl(int vl, int kl) {
+    int clsid = get_class(vl,kl);
+    if (clsid != -1) { // BUGFIX: get_class() returns -1 for oversize items; -1 is truthy, so 'if (clsid)' indexed inclusives[-1] (UB)
+        return inclusives[clsid];
+    } else {
+        return -1;
+    }
+}
+
+void ConnectionMulti::output_op(Operation *op, int type, bool found) {
+    char output[1024];
+    char k[256];
+    char a[256];
+    char s[256];
+    memset(k,0,256);
+    memset(a,0,256);
+    memset(s,0,256);
+    strcpy(k,op->key.c_str());
+    switch (type) {
+        case 0: //get
+            sprintf(a,"issue_get");
+            break;
+        case 1: //set
+            sprintf(a,"issue_set");
+            break;
+        case 2: //resp
+            sprintf(a,"resp");
+            break;
+    }
+    switch(read_state) {
+        case INIT_READ:
+            sprintf(s,"init");
+            break;
+        case CONN_SETUP:
+            sprintf(s,"setup");
+            break;
+        case LOADING:
+            sprintf(s,"load");
+            break;
+        case IDLE:
+            sprintf(s,"idle");
+            break;
+        case WAITING_FOR_GET:
+            sprintf(s,"waiting for get");
+            break;
+        case WAITING_FOR_SET:
+            sprintf(s,"waiting for set");
+            break;
+        case WAITING_FOR_DELETE:
+            sprintf(s,"waiting for del");
+            break;
+        case MAX_READ_STATE:
+            sprintf(s,"max");
+            break;
+    }
+    if (type == 2) {
+        sprintf(output,"conn: %u, action: %s op: %s, opaque: %u, found: %d, type: %d\n",cid,a,k,op->opaque,found,op->type);
+    } else {
+        sprintf(output,"conn: %u, action: %s op: %s, opaque: %u, type: %d\n",cid,a,k,op->opaque,op->type);
+    }
+    write(2,output,strlen(output));
+}
+
+/**
+ * Create a new connection to a server endpoint.
+ */ +ConnectionMulti::ConnectionMulti(struct event_base* _base, struct evdns_base* _evdns, + string _hostname1, string _hostname2, string _port, options_t _options, + bool sampling, int fd1, int fd2 ) : + start_time(0), stats(sampling), options(_options), + hostname1(_hostname1), hostname2(_hostname2), port(_port), base(_base), evdns(_evdns) +{ + pthread_mutex_lock(&cid_lock_m); + cid = connids_m++; + if (cid == 1) { + cid_rate.push_back(100); + cid_rate.push_back(0); + init_classes(); + init_inclusives(options.inclusives); + } else { + cid_rate.push_back(0); + } + + pthread_mutex_unlock(&cid_lock_m); + + valuesize = createGenerator(options.valuesize); + keysize = createGenerator(options.keysize); + srand(time(NULL)); + keygen = new KeyGenerator(keysize, options.records); + + total = 0; + eof = 0; + o_percent = 0; + + if (options.lambda <= 0) { + iagen = createGenerator("0"); + } else { + D("iagen = createGenerator(%s)", options.ia); + iagen = createGenerator(options.ia); + iagen->set_lambda(options.lambda); + } + + read_state = IDLE; + write_state = INIT_WRITE; + last_quiet1 = false; + last_quiet2 = false; + + last_tx = last_rx = 0.0; + + + op_queue_size = (uint32_t*)malloc(sizeof(uint32_t)*(LEVELS+1)); + opaque = (uint32_t*)malloc(sizeof(uint32_t)*(LEVELS+1)); + op_queue = (Operation***)malloc(sizeof(Operation**)*(LEVELS+1)); + + for (int i = 0; i <= LEVELS; i++) { + op_queue_size[i] = 0; + opaque[i] = 1; + //op_queue[i] = (Operation*)malloc(sizeof(int)*OPAQUE_MAX); + op_queue[i] = (Operation**)malloc(sizeof(Operation*)*(OPAQUE_MAX*2)); + + } + + bev1 = bufferevent_socket_new(base, fd1, BEV_OPT_CLOSE_ON_FREE); + bufferevent_setcb(bev1, bev_read_cb1, bev_write_cb_m, bev_event_cb1, this); + bufferevent_enable(bev1, EV_READ | EV_WRITE); + + bev2 = bufferevent_socket_new(base, fd2, BEV_OPT_CLOSE_ON_FREE); + bufferevent_setcb(bev2, bev_read_cb2, bev_write_cb_m, bev_event_cb2, this); + bufferevent_enable(bev2, EV_READ | EV_WRITE); + + timer = evtimer_new(base, 
timer_cb_m, this); + + read_state = IDLE; +} + + +void ConnectionMulti::set_queue(queue* a_trace_queue) { + trace_queue = a_trace_queue; + trace_queue_n = a_trace_queue->size(); +} + +void ConnectionMulti::set_lock(pthread_mutex_t* a_lock) { + lock = a_lock; +} + +void ConnectionMulti::set_g_wbkeys(unordered_map *a_wb_keys) { + g_wb_keys = a_wb_keys; +} + +uint32_t ConnectionMulti::get_cid() { + return cid; +} + +int ConnectionMulti::add_to_wb_keys(string key) { + int ret = -1; + pthread_mutex_lock(lock); + auto pos = g_wb_keys->find(key); + if (pos == g_wb_keys->end()) { + g_wb_keys->insert( {key,cid }); + ret = 1; + //fprintf(stderr,"----set: %s----\n",Op.key.c_str()); + //for (auto iter = g_wb_keys->begin(); iter != g_wb_keys->end(); ++iter){ + // fprintf(stderr,"%s,%d\n",iter->first.c_str(),iter->second); + //} + //fprintf(stderr,"----%d----\n",cid); + } else { + ret = 2; + } + + pthread_mutex_unlock(lock); + return ret; +} + +void ConnectionMulti::del_wb_keys(string key) { + + pthread_mutex_lock(lock); + auto position = g_wb_keys->find(key); + if (position != g_wb_keys->end()) { + g_wb_keys->erase(position); + } else { + fprintf(stderr,"expected %s, got nuthin\n",key.c_str()); + } + pthread_mutex_unlock(lock); +} + + +int ConnectionMulti::do_connect() { + + int connected = 0; + if (options.unix_socket) { + + + struct sockaddr_un sin1; + memset(&sin1, 0, sizeof(sin1)); + sin1.sun_family = AF_LOCAL; + strcpy(sin1.sun_path, hostname1.c_str()); + + int addrlen; + addrlen = sizeof(sin1); + + int err = bufferevent_socket_connect(bev1, (struct sockaddr*)&sin1, addrlen); + if (err == 0) { + connected = 1; + } else { + connected = 0; + err = errno; + fprintf(stderr,"l1 error %s\n",strerror(err)); + } + + struct sockaddr_un sin2; + memset(&sin2, 0, sizeof(sin2)); + sin2.sun_family = AF_LOCAL; + strcpy(sin2.sun_path, hostname2.c_str()); + + addrlen = sizeof(sin2); + err = bufferevent_socket_connect(bev2, (struct sockaddr*)&sin2, addrlen); + if (err == 0) { + connected = 
1; + } else { + connected = 0; + err = errno; + fprintf(stderr,"l2 error %s\n",strerror(err)); + } + } + read_state = IDLE; + return connected; +} + +/** + * Destroy a connection, performing cleanup. + */ +ConnectionMulti::~ConnectionMulti() { + + + for (int i = 0; i <= LEVELS; i++) { + free(op_queue[i]); + + } + + free(op_queue_size); + free(opaque); + free(op_queue); + //event_free(timer); + //timer = NULL; + // FIXME: W("Drain op_q?"); + //bufferevent_free(bev1); + //bufferevent_free(bev2); + + delete iagen; + delete keygen; + delete keysize; + delete valuesize; +} + +/** + * Reset the connection back to an initial, fresh state. + */ +void ConnectionMulti::reset() { + // FIXME: Actually check the connection, drain all bufferevents, drain op_q. + //assert(op_queue.size() == 0); + //evtimer_del(timer); + read_state = IDLE; + write_state = INIT_WRITE; + stats = ConnectionStats(stats.sampling); +} + +/** + * Set our event processing priority. + */ +void ConnectionMulti::set_priority(int pri) { + if (bufferevent_priority_set(bev1, pri)) { + DIE("bufferevent_set_priority(bev, %d) failed", pri); + } +} + + + +/** + * Get/Set or Set Style + * If a GET command: Issue a get first, if not found then set + * If trace file (or prob. 
write) says to set, then set it + */ +int ConnectionMulti::issue_getsetorset(double now) { + + + + int ret = 0; + int nissued = 0; + //while (nissued < options.depth) { + + //pthread_mutex_lock(lock); + if (!trace_queue->empty()) { + Operation Op = trace_queue->front(); + if (Op.type == Operation::SASL) { + eof = 1; + cid_rate[cid] = 100; + fprintf(stderr,"cid %d done\n",cid); + string op_queue1; + string op_queue2; + for (int j = 0; j < 2; j++) { + for (int i = 0; i < OPAQUE_MAX; i++) { + if (op_queue[j+1][i] != NULL) { + if (j == 0) { + op_queue1 = op_queue1 + "," + op_queue[j+1][i]->key; + } else { + op_queue2 = op_queue2 + "," + op_queue[j+1][i]->key; + } + } + } + } + fprintf(stderr,"cid %d op_queue1: %s op_queue2: %s, op_queue_size1: %d, op_queue_size2: %d\n",cid,op_queue1.c_str(),op_queue2.c_str(),op_queue_size[1],op_queue_size[2]); + return 1; + } + + + /* check if in global wb queue */ + pthread_mutex_lock(lock); + double percent = (double)total/((double)trace_queue_n) * 100; + if (percent > o_percent+1) { + //update the percentage table and see if we should execute + std::vector::iterator mp = std::min_element(cid_rate.begin(), cid_rate.end()); + double min_percent = *mp; + + if (percent > min_percent+2) { + pthread_mutex_unlock(lock); + struct timeval tv; + tv.tv_sec = 1; + tv.tv_usec = 0; + int good = 0; + if (!event_pending(timer, EV_TIMEOUT, NULL)) { + good = evtimer_add(timer, &tv); + } + if (good != 0) { + fprintf(stderr,"eventimer is messed up!\n"); + return 2; + } + return 1; + } + cid_rate[cid] = percent; + fprintf(stderr,"%f,%d,%.4f\n",now,cid,percent); + o_percent = percent; + } + auto check = g_wb_keys->find(Op.key); + if (check != g_wb_keys->end()) { + pthread_mutex_unlock(lock); + struct timeval tv; + tv.tv_sec = 1; + tv.tv_usec = 0; + int good = 0; + if (!event_pending(timer, EV_TIMEOUT, NULL)) { + good = evtimer_add(timer, &tv); + } + if (good != 0) { + fprintf(stderr,"eventimer is messed up in checking for key: %s\n",Op.key.c_str()); + 
return 2;
+                }
+                return 1;
+            } else {
+                g_wb_keys->insert( {Op.key, cid} );
+                //g_wb_keys->insert( {Op.key+"l2", cid} );
+            }
+            pthread_mutex_unlock(lock);
+
+
+
+            char key[256];
+            memset(key,0,256);
+            strncpy(key, Op.key.c_str(),255);
+            int vl = Op.valuelen;
+
+            trace_queue->pop();
+
+            int issued = 0;
+            int incl = get_incl(vl,strlen(key));
+            int clsid = get_class(vl,strlen(key)); // renamed from 'cid': this is a size-class id and was shadowing the connection-id member
+            int flags = 0;
+            int touch = (rand() % 100);
+            int index = lrand48() % (1024 * 1024);
+            //int touch = 1;
+            SET_INCL(incl,flags);
+
+            switch(Op.type)
+            {
+            case Operation::GET:
+                //if (nissued < options.depth-1) {
+                //    issued = issue_get_with_len(key, vl, now, false, 1, flags, 0, 1);
+                //    last_quiet1 = false;
+                //} else {
+                //}
+                if (options.threshold > 0) {
+                    if (Op.future) {
+                        key_hist[key] = 1;
+                    }
+                }
+                issued = issue_get_with_len(key, vl, now, false, flags | LOG_OP | ITEM_L1);
+                if (touch == 1 && incl == 1) {
+                    issue_touch(key,vl,now, ITEM_L2 | SRC_L1_H);
+                }
+                last_quiet1 = false;
+                this->stats.gets++;
+                this->stats.gets_cid[clsid]++;
+
+                break;
+            case Operation::SET:
+                if (last_quiet1) {
+                    issue_noop(now,1);
+                }
+                if (incl == 1) {
+                    issue_touch(key,vl,now, ITEM_L2 | SRC_DIRECT_SET);
+                } else if (incl == 2) {
+                    issue_delete(key,now, ITEM_L2 | SRC_DIRECT_SET );
+                }
+                issued = issue_set(key, &random_char[index], vl, now, flags | LOG_OP | ITEM_L1 | SRC_DIRECT_SET);
+                last_quiet1 = false;
+                this->stats.sets++;
+                this->stats.sets_cid[clsid]++;
+                break;
+            case Operation::DELETE:
+            case Operation::TOUCH:
+            case Operation::NOOP:
+            case Operation::SASL:
+                fprintf(stderr,"invalid line: %s, vl: %d\n",key,vl);
+                break;
+
+            }
+            if (issued) {
+                nissued++;
+                total++;
+            } else {
+                fprintf(stderr,"failed to issue line: %s, vl: %d @T: XX \n",key,vl);
+            }
+        } else {
+            return 1;
+        }
+    //}
+    if (last_quiet1) {
+        issue_noop(now,1);
+        last_quiet1 = false;
+    }
+
+    return ret;
+
+}
+
+/**
+ * Issue a get request to the server.
+ */ +int ConnectionMulti::issue_get_with_len(const char* key, int valuelen, double now, bool quiet, uint32_t flags, Operation *l1) { + + struct evbuffer *output = NULL; + int level = 0; + switch (FLAGS_level(flags)) { + case 1: + level = 1; + output = bufferevent_get_output(bev1); + break; + case 2: + level = 2; + output = bufferevent_get_output(bev2); + break; + } + //Operation op; + Operation *pop = new Operation(); + +#if HAVE_CLOCK_GETTIME + pop->start_time = get_time_accurate(); +#else + if (now == 0.0) { +#if USE_CACHED_TIME + struct timeval now_tv; + event_base_gettimeofday_cached(base, &now_tv); + pop->start_time = tv_to_double(&now_tv); +#else + pop->start_time = get_time(); +#endif + } else { + pop->start_time = now; + } +#endif + + pop->key = string(key); + pop->valuelen = valuelen; + pop->type = Operation::GET; + pop->opaque = opaque[level]++; + pop->flags = flags; + pop->clsid = get_class(valuelen,strlen(key)); + if (l1 != NULL) { + pop->l1 = l1; + } + op_queue[level][pop->opaque] = pop; + //op_queue[level].push(op); + op_queue_size[level]++; + +#ifdef DEBUGS + fprintf(stderr,"cid: %d issing get: %s, size: %u, level %d, flags: %d, opaque: %d\n",cid,key,valuelen,level,flags,pop->opaque); +#endif + + if (opaque[level] > OPAQUE_MAX) { + opaque[level] = 1; + } + + //if (read_state == IDLE) read_state = WAITING_FOR_GET; + uint16_t keylen = strlen(key); + + // each line is 4-bytes + binary_header_t h = { 0x80, CMD_GET, htons(keylen), + 0x00, 0x00, htons(0), + htonl(keylen) }; + if (quiet) { + h.opcode = CMD_GETQ; + } + h.opaque = htonl(pop->opaque); + + evbuffer_add(output, &h, 24); + evbuffer_add(output, key, keylen); + + stats.tx_bytes += 24 + keylen; + return 1; +} + +/** + * Issue a get request to the server. 
+ */ +int ConnectionMulti::issue_touch(const char* key, int valuelen, double now, int flags) { + struct evbuffer *output = NULL; + int level = 0; + switch (FLAGS_level(flags)) { + case 1: + level = 1; + output = bufferevent_get_output(bev1); + break; + case 2: + level = 2; + output = bufferevent_get_output(bev2); + break; + } + Operation *pop = new Operation(); + +#if HAVE_CLOCK_GETTIME + pop->start_time = get_time_accurate(); +#else + if (now == 0.0) { +#if USE_CACHED_TIME + struct timeval now_tv; + event_base_gettimeofday_cached(base, &now_tv); + pop->start_time = tv_to_double(&now_tv); +#else + pop->start_time = get_time(); +#endif + } else { + pop->start_time = now; + } +#endif + + pop->key = string(key); + pop->valuelen = valuelen; + pop->type = Operation::TOUCH; + pop->opaque = opaque[level]++; + pop->flags = flags; + op_queue[level][pop->opaque] = pop; + //op_queue[level].push(op); + op_queue_size[level]++; + + if (opaque[level] > OPAQUE_MAX) { + opaque[level] = 1; + } + +#ifdef DEBUGS + fprintf(stderr,"issing touch: %s, size: %u, level %d, flags: %d, opaque: %d\n",key,valuelen,level,flags,pop->opaque); +#endif + //if (read_state == IDLE) read_state = WAITING_FOR_GET; + uint16_t keylen = strlen(key); + + // each line is 4-bytes + binary_header_t h = { 0x80, CMD_TOUCH, htons(keylen), + 0x04, 0x00, htons(0), + htonl(keylen + 4) }; + h.opaque = htonl(pop->opaque); + + uint32_t exp = 0; + if (flags & ITEM_DIRTY) { + exp = htonl(flags); + } + evbuffer_add(output, &h, 24); + evbuffer_add(output, &exp, 4); + evbuffer_add(output, key, keylen); + + + stats.tx_bytes += 24 + keylen; + + //stats.log_access(op); + return 1; +} + +/** + * Issue a delete request to the server. 
+ */ +int ConnectionMulti::issue_delete(const char* key, double now, uint32_t flags) { + struct evbuffer *output = NULL; + int level = 0; + switch (FLAGS_level(flags)) { + case 1: + level = 1; + output = bufferevent_get_output(bev1); + break; + case 2: + level = 2; + output = bufferevent_get_output(bev2); + break; + } + //Operation op; + Operation *pop = new Operation(); + +#if HAVE_CLOCK_GETTIME + pop->start_time = get_time_accurate(); +#else + if (now == 0.0) { +#if USE_CACHED_TIME + struct timeval now_tv; + event_base_gettimeofday_cached(base, &now_tv); + pop->start_time = tv_to_double(&now_tv); +#else + pop->start_time = get_time(); +#endif + } else { + pop->start_time = now; + } +#endif + + pop->key = string(key); + pop->type = Operation::DELETE; + pop->opaque = opaque[level]++; + pop->flags = flags; + op_queue[level][pop->opaque] = pop; + //op_queue[level].push(op); + op_queue_size[level]++; + + if (opaque[level] > OPAQUE_MAX) { + opaque[level] = 1; + } +#ifdef DEBUGS + fprintf(stderr,"cid: %d issing delete: %s, level %d, flags: %d, opaque: %d\n",cid,key,level,flags,pop->opaque); +#endif + + //if (read_state == IDLE) read_state = WAITING_FOR_GET; + uint16_t keylen = strlen(key); + + // each line is 4-bytes + binary_header_t h = { 0x80, CMD_DELETE, htons(keylen), + 0x00, 0x00, htons(0), + htonl(keylen) }; + h.opaque = htonl(pop->opaque); + + evbuffer_add(output, &h, 24); + evbuffer_add(output, key, keylen); + + stats.tx_bytes += 24 + keylen; + + //stats.log_access(op); + return 1; +} + +void ConnectionMulti::issue_noop(double now, int level) { + struct evbuffer *output = NULL; + switch (level) { + case 1: + output = bufferevent_get_output(bev1); + break; + case 2: + output = bufferevent_get_output(bev2); + break; + } + Operation op; + + if (now == 0.0) op.start_time = get_time(); + else op.start_time = now; + + binary_header_t h = { 0x80, CMD_NOOP, 0x0000, + 0x00, 0x00, htons(0), + 0x00 }; + + evbuffer_add(output, &h, 24); + +} + +/** + * Issue a set request 
to the server. + */ +int ConnectionMulti::issue_set(const char* key, const char* value, int length, double now, uint32_t flags) { + + struct evbuffer *output = NULL; + int level = 0; + switch (FLAGS_level(flags)) { + case 1: + level = 1; + output = bufferevent_get_output(bev1); + break; + case 2: + level = 2; + output = bufferevent_get_output(bev2); + break; + } + //Operation op; + Operation *pop = new Operation(); + +#if HAVE_CLOCK_GETTIME + pop->start_time = get_time_accurate(); +#else + if (now == 0.0) pop->start_time = get_time(); + else pop->start_time = now; +#endif + + + pop->key = string(key); + pop->valuelen = length; + pop->type = Operation::SET; + pop->opaque = opaque[level]++; + pop->flags = flags; + pop->clsid = get_class(length,strlen(key)); + op_queue[level][pop->opaque] = pop; + //op_queue[level].push(op); + op_queue_size[level]++; +#ifdef DEBUGS + fprintf(stderr,"cid: %d issing set: %s, size: %u, level %d, flags: %d, opaque: %d\n",cid,key,length,level,flags,pop->opaque); +#endif + + if (opaque[level] > OPAQUE_MAX) { + opaque[level] = 1; + } + + uint16_t keylen = strlen(key); + + // each line is 4-bytes + binary_header_t h = { 0x80, CMD_SET, htons(keylen), + 0x08, 0x00, htons(0), + htonl(keylen + 8 + length) }; + h.opaque = htonl(pop->opaque); + + uint32_t f = htonl(flags); + uint32_t exp = 0; + + evbuffer_add(output, &h, 24); + evbuffer_add(output, &f, 4); + evbuffer_add(output, &exp, 4); + evbuffer_add(output, key, keylen); + evbuffer_add(output, value, length); + + stats.tx_bytes += length + 32 + keylen; + return 1; +} + +/** + * Return the oldest live operation in progress. + */ +void ConnectionMulti::pop_op(Operation *op) { + + uint8_t level = OP_level(op); + //op_queue[level].erase(op); + op_queue_size[level]--; + + + if (read_state == LOADING) return; + read_state = IDLE; + + // Advance the read state machine. 
+ //if (op_queue.size() > 0) { + // Operation& op = op_queue.front(); + // switch (op.type) { + // case Operation::GET: read_state = WAITING_FOR_GET; break; + // case Operation::SET: read_state = WAITING_FOR_SET; break; + // case Operation::DELETE: read_state = WAITING_FOR_DELETE; break; + // default: DIE("Not implemented."); + // } + //} +} + +/** + * Finish up (record stats) an operation that just returned from the + * server. + */ +void ConnectionMulti::finish_op(Operation *op, int was_hit) { + double now; +#if USE_CACHED_TIME + struct timeval now_tv; + event_base_gettimeofday_cached(base, &now_tv); + now = tv_to_double(&now_tv); +#else + now = get_time(); +#endif +#if HAVE_CLOCK_GETTIME + op->end_time = get_time_accurate(); +#else + op->end_time = now; +#endif + + if (options.successful_queries && was_hit) { + switch (op->type) { + case Operation::GET: + switch (OP_level(op)) { + case 1: + stats.log_get_l1(*op); + break; + case 2: + stats.log_get_l2(*op); + break; + } + break; + case Operation::SET: + switch (OP_level(op)) { + case 1: + stats.log_set_l1(*op); + break; + case 2: + stats.log_set_l2(*op); + break; + } + break; + case Operation::DELETE: break; + case Operation::TOUCH: break; + default: DIE("Not implemented."); + } + } else { + switch (op->type) { + case Operation::GET: + if (OP_log(op)) { + switch (OP_level(op)) { + case 1: + stats.log_get_l1(*op); + break; + case 2: + stats.log_get_l2(*op); + if (op->l1 != NULL) { + op->l1->end_time = now; + stats.log_get(*(op->l1)); + } + break; + } + } + break; + case Operation::SET: + if (OP_log(op)) { + switch (OP_level(op)) { + case 1: + stats.log_set_l1(*op); + break; + case 2: + stats.log_set_l2(*op); + break; + } + } + break; + case Operation::DELETE: break; + case Operation::TOUCH: break; + default: DIE("Not implemented."); + } + } + + last_rx = now; + uint8_t level = OP_level(op); + if (op->l1 != NULL) { + delete op_queue[1][op->l1->opaque]; + op_queue[1][op->l1->opaque] = 0; + op_queue_size[1]--; + } + 
//op_queue[level].erase(op_queue[level].begin()+opopq);
+    if (op == op_queue[level][op->opaque] &&
+            op->opaque == op_queue[level][op->opaque]->opaque) {
+        delete op_queue[level][op->opaque];
+        op_queue[level][op->opaque] = 0;
+    } else {
+        fprintf(stderr,"op_queue out of sync! Expected %p, got %p, opa1: %d opaq2: %d\n",
+                op,op_queue[level][op->opaque],op->opaque,op_queue[level][op->opaque] ? op_queue[level][op->opaque]->opaque : 0); // BUGFIX: slot may be NULL on this path; the unconditional deref crashed while reporting
+    }
+    op_queue_size[level]--;
+    read_state = IDLE;
+
+
+}
+
+
+
+/**
+ * Check if our testing is done and we should exit.
+ */
+bool ConnectionMulti::check_exit_condition(double now) {
+    if (eof && op_queue_size[1] == 0 && op_queue_size[2] == 0) {
+        return true;
+    }
+    if (read_state == INIT_READ) return false;
+
+    return false;
+}
+
+/**
+ * Handle new connection and error events.
+ */
+void ConnectionMulti::event_callback1(short events) {
+    if (events & BEV_EVENT_CONNECTED) {
+        D("Connected to %s:%s.", hostname1.c_str(), port.c_str());
+        int fd = bufferevent_getfd(bev1);
+        if (fd < 0) DIE("bufferevent_getfd");
+
+        if (!options.no_nodelay && !options.unix_socket) {
+            int one = 1;
+            if (setsockopt(fd, IPPROTO_TCP, TCP_NODELAY,
+                           (void *) &one, sizeof(one)) < 0)
+                DIE("setsockopt()");
+        }
+#ifdef DEBUGMC
+        fprintf(stderr,"libevent connected %s, fd: %u\n",hostname1.c_str(),bufferevent_getfd(bev1));
+#endif
+
+
+    } else if (events & BEV_EVENT_ERROR) {
+        int err = bufferevent_socket_get_dns_error(bev1);
+        //if (err) DIE("DNS error: %s", evutil_gai_strerror(err));
+        if (err) fprintf(stderr,"DNS error: %s", evutil_gai_strerror(err));
+        fprintf(stderr,"CID: %d - Got an error: %s\n",this->cid,
+                evutil_socket_error_to_string(EVUTIL_SOCKET_ERROR()));
+
+        //DIE("BEV_EVENT_ERROR: %s", strerror(errno));
+
+    } else if (events & BEV_EVENT_EOF) {
+        fprintf(stderr,"Unexpected EOF from server.");
+        return;
+    }
+}
+
+/**
+ * Handle new connection and error events.
+ */ +void ConnectionMulti::event_callback2(short events) { + if (events & BEV_EVENT_CONNECTED) { + D("Connected to %s:%s.", hostname2.c_str(), port.c_str()); + int fd = bufferevent_getfd(bev2); + if (fd < 0) DIE("bufferevent_getfd"); + + if (!options.no_nodelay && !options.unix_socket) { + int one = 1; + if (setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, + (void *) &one, sizeof(one)) < 0) + DIE("setsockopt()"); + } +#ifdef DEBUGMC + fprintf(stderr,"libevent connected %s, fd: %u\n",hostname2.c_str(),bufferevent_getfd(bev2)); +#endif + + + } else if (events & BEV_EVENT_ERROR) { + int err = bufferevent_socket_get_dns_error(bev2); + //if (err) DIE("DNS error: %s", evutil_gai_strerror(err)); + if (err) fprintf(stderr,"DNS error: %s", evutil_gai_strerror(err)); + fprintf(stderr,"CID: %d - Got an error: %s\n",this->cid, + evutil_socket_error_to_string(EVUTIL_SOCKET_ERROR())); + + //DIE("BEV_EVENT_ERROR: %s", strerror(errno)); + + + } else if (events & BEV_EVENT_EOF) { + fprintf(stderr,"Unexpected EOF from server."); + return; + } +} + +/** + * Request generation loop. Determines whether or not to issue a new command, + * based on timer events. + * + * Note that this function loops. Be wary of break vs. return. 
+ */ +void ConnectionMulti::drive_write_machine(double now) { + if (now == 0.0) now = get_time(); + + double delay; + struct timeval tv; + + if (check_exit_condition(now)) { + return; + } + + while (1) { + switch (write_state) { + case INIT_WRITE: + delay = iagen->generate(); + next_time = now + delay; + double_to_tv(delay, &tv); + evtimer_add(timer, &tv); + write_state = ISSUING; + break; + + case ISSUING: + if ( (op_queue_size[1] >= (size_t) options.depth) || + (op_queue_size[2] >= (size_t) options.depth) ) { + write_state = WAITING_FOR_OPQ; + break; + } + + if (options.getsetorset) { + int ret = issue_getsetorset(now); + if (ret == 1) return; //if at EOF + } + + last_tx = now; + for (int i = 1; i <= 2; i++) { + stats.log_op(op_queue_size[i]); + } + break; + + case WAITING_FOR_TIME: + write_state = ISSUING; + break; + + case WAITING_FOR_OPQ: + if ( (op_queue_size[1] >= (size_t) options.depth) || + (op_queue_size[2] >= (size_t) options.depth) ) { + //double delay = 0.01; + //struct timeval tv; + //double_to_tv(delay, &tv); + //evtimer_add(timer, &tv); + return; + } else { + write_state = ISSUING; + break; + } + + default: DIE("Not implemented"); + } + } +} + + + +/** + * Tries to consume a binary response (in its entirety) from an evbuffer. + * + * @param input evBuffer to read response from + * @return true if consumed, false if not enough data in buffer. 
+ */ +static bool handle_response(ConnectionMulti *conn, evbuffer *input, bool &done, bool &found, int &opcode, uint32_t &opaque, evicted_t *evict, int level) { + // Read the first 24 bytes as a header + int length = evbuffer_get_length(input); + if (length < 24) return false; + binary_header_t* h = + reinterpret_cast(evbuffer_pullup(input, 24)); + //assert(h); + + uint32_t bl = ntohl(h->body_len); + uint16_t kl = ntohs(h->key_len); + uint8_t el = h->extra_len; + // Not whole response + int targetLen = 24 + bl; + if (length < targetLen) { + return false; + } + + opcode = h->opcode; + opaque = ntohl(h->opaque); + uint16_t status = ntohs(h->status); +#ifdef DEBUGMC + fprintf(stderr,"cid: %d handle resp from l%d - opcode: %u opaque: %u keylen: %u extralen: %u datalen: %u status: %u\n",conn->get_cid(),level, + h->opcode,ntohl(h->opaque),ntohs(h->key_len),h->extra_len, + ntohl(h->body_len),ntohs(h->status)); +#endif + + + // If something other than success, count it as a miss + if (opcode == CMD_GET && status == RESP_NOT_FOUND) { + switch(level) { + case 1: + conn->stats.get_misses_l1++; + break; + case 2: + conn->stats.get_misses_l2++; + conn->stats.get_misses++; + conn->stats.window_get_misses++; + break; + + } + found = false; + evbuffer_drain(input, targetLen); + + } else if (opcode == CMD_SET && kl > 0) { + //first data is extras: clsid, flags, eflags + if (evict) { + evbuffer_drain(input,24); + unsigned char *buf = evbuffer_pullup(input,bl); + + + evict->clsid = *((uint32_t*)buf); + evict->clsid = ntohl(evict->clsid); + buf += 4; + + evict->serverFlags = *((uint32_t*)buf); + evict->serverFlags = ntohl(evict->serverFlags); + buf += 4; + + evict->evictedFlags = *((uint32_t*)buf); + evict->evictedFlags = ntohl(evict->evictedFlags); + buf += 4; + + + evict->evictedKeyLen = kl; + evict->evictedKey = (char*)malloc(kl+1); + memset(evict->evictedKey,0,kl+1); + memcpy(evict->evictedKey,buf,kl); + buf += kl; + + + evict->evictedLen = bl - kl - el; + evict->evictedData = 
(char*)malloc(evict->evictedLen); + memcpy(evict->evictedData,buf,evict->evictedLen); + evict->evicted = true; + //fprintf(stderr,"class: %u, serverFlags: %u, evictedFlags: %u\n",evict->clsid,evict->serverFlags,evict->evictedFlags); + evbuffer_drain(input,bl); + } else { + evbuffer_drain(input, targetLen); + } + } else if (opcode == CMD_TOUCH && status == RESP_NOT_FOUND) { + found = false; + evbuffer_drain(input, targetLen); + } else if (opcode == CMD_DELETE && status == RESP_NOT_FOUND) { + found = false; + evbuffer_drain(input, targetLen); + } else { + evbuffer_drain(input, targetLen); + } + + conn->stats.rx_bytes += targetLen; + done = true; + return true; +} + +/** + * Handle incoming data (responses). + */ +void ConnectionMulti::read_callback1() { + struct evbuffer *input = bufferevent_get_input(bev1); + + Operation *op = NULL; + bool done, found; + + //initially assume found (for sets that may come through here) + //is this correct? do we want to assume true in case that + //GET was found, but wrong value size (i.e. 
update value) + found = true; + + //if (op_queue.size() == 0) V("Spurious read callback."); + bool full_read = true; + while (full_read) { + + + int opcode; + uint32_t opaque; + evicted_t *evict = (evicted_t*)malloc(sizeof(evicted_t)); + memset(evict,0,sizeof(evicted_t)); + + full_read = handle_response(this,input, done, found, opcode, opaque, evict,1); + if (full_read) { + if (opcode == CMD_NOOP) { +#ifdef DEBUGMC + char out[128]; + sprintf(out,"conn l1: %u, reading noop\n",cid); + write(2,out,strlen(out)); +#endif + if (evict->evictedKey) free(evict->evictedKey); + if (evict->evictedData) free(evict->evictedData); + free(evict); + continue; + } + op = op_queue[1][opaque]; +#ifdef DEBUGMC + char out[128]; + sprintf(out,"conn l1: %u, reading opaque: %u\n",cid,opaque); + write(2,out,strlen(out)); + output_op(op,2,found); +#endif + if (op->key.length() < 1) { +#ifdef DEBUGMC + char out2[128]; + sprintf(out2,"conn l1: %u, bad op: %s\n",cid,op->key.c_str()); + write(2,out2,strlen(out2)); +#endif + if (evict->evictedKey) free(evict->evictedKey); + if (evict->evictedData) free(evict->evictedData); + free(evict); + continue; + } + } else { + if (evict) { + if (evict->evictedKey) free(evict->evictedKey); + if (evict->evictedData) free(evict->evictedData); + free(evict); + } + break; + } + + + double now = get_time(); + int wb = 0; + if (options.rand_admit) { + wb = (rand() % options.rand_admit); + } + switch (op->type) { + case Operation::GET: + if (done) { + if ( !found && (options.getset || options.getsetorset) ) { + /* issue a get a l2 */ + char key[256]; + memset(key,0,256); + strncpy(key, op->key.c_str(),255); + int vl = op->valuelen; + int flags = OP_clu(op); + issue_get_with_len(key,vl,now,false, flags | SRC_L1_M | ITEM_L2 | LOG_OP, op); + op->end_time = now; + this->stats.log_get_l1(*op); + //finish_op(op,0); + + } else { + del_wb_keys(op->key); + finish_op(op,found); + } + } else { + char out[128]; + sprintf(out,"conn l1: %u, not done reading, should do 
something",cid); + write(2,out,strlen(out)); + } + break; + case Operation::SET: + //if (OP_src(op) == SRC_L1_COPY || + // OP_src(op) == SRC_DIRECT_SET || + // OP_src(op) == SRC_L2_M ) { + //} + if (evict->evicted) { + string wb_key(evict->evictedKey); + if ((evict->evictedFlags & ITEM_INCL) && (evict->evictedFlags & ITEM_DIRTY)) { + //wb_keys.push_back(wb_key); + int ret = add_to_wb_keys(wb_key); + if (ret == 1) { + issue_set(evict->evictedKey, evict->evictedData, evict->evictedLen, now, ITEM_L2 | ITEM_INCL | LOG_OP | SRC_WB); + } + //fprintf(stderr,"incl writeback %s\n",evict->evictedKey); + this->stats.incl_wbs++; + } else if (evict->evictedFlags & ITEM_EXCL) { + //fprintf(stderr,"excl writeback %s\n",evict->evictedKey); + //strncpy(wb_key,evict->evictedKey,255); + if ( (options.rand_admit && wb == 0) || + (options.threshold && (key_hist[wb_key] == 1)) || + (options.wb_all) ) { + int ret = add_to_wb_keys(wb_key); + if (ret == 1) { + issue_set(evict->evictedKey, evict->evictedData, evict->evictedLen, now, ITEM_L2 | ITEM_EXCL | LOG_OP | SRC_WB); + } + this->stats.excl_wbs++; + } + } + /* + if (evict->serverFlags & ITEM_SIZE_CHANGE && OP_src(op) == SRC_DIRECT_SET) { + char key[256]; + memset(key,0,256); + strncpy(key, op->key.c_str(),255); + if (evict->serverFlags & ITEM_INCL) { + int index = lrand48() % (1024 * 1024); + int valuelen = op->valuelen; + //the item's size was changed, issue a SET to L2 as a new command + issue_set(key, &random_char[index], valuelen, now, ITEM_L2 | ITEM_INCL | LOG_OP | SRC_L2_M); + } + } + */ + if (OP_src(op) == SRC_DIRECT_SET) { + if ( (evict->serverFlags & ITEM_SIZE_CHANGE) || ((evict->serverFlags & ITEM_WAS_HIT) == 0)) { + this->stats.set_misses_l1++; + } else if (OP_excl(op) && evict->serverFlags & ITEM_WAS_HIT) { + this->stats.set_excl_hits_l1++; + } else if (OP_incl(op) && evict->serverFlags & ITEM_WAS_HIT) { + this->stats.set_incl_hits_l1++; + } + } + } + del_wb_keys(op->key); + finish_op(op,1); + break; + case Operation::TOUCH: 
+ finish_op(op,1); + break; + default: + fprintf(stderr,"op: %p, key: %s opaque: %u\n",(void*)op,op->key.c_str(),op->opaque); + DIE("not implemented"); + } + + if (evict) { + if (evict->evictedKey) free(evict->evictedKey); + if (evict->evictedData) free(evict->evictedData); + free(evict); + } + + } + + + double now = get_time(); + if (check_exit_condition(now)) { + return; + } + + last_tx = now; + stats.log_op(op_queue_size[1]); + stats.log_op(op_queue_size[2]); + //for (int i = 1; i <= 2; i++) { + // fprintf(stderr,"max issue buf n[%d]: %u\n",i,max_n[i]); + //} + drive_write_machine(); + + // update events + //if (bev != NULL) { + // // no pending response (nothing to read) and output buffer empty (nothing to write) + // if ((op_queue.size() == 0) && (evbuffer_get_length(bufferevent_get_output(bev)) == 0)) { + // bufferevent_disable(bev, EV_WRITE|EV_READ); + // } + //} +} + +/** + * Handle incoming data (responses). + */ +void ConnectionMulti::read_callback2() { + struct evbuffer *input = bufferevent_get_input(bev2); + + Operation *op = NULL; + bool done, found; + + //initially assume found (for sets that may come through here) + //is this correct? do we want to assume true in case that + //GET was found, but wrong value size (i.e. 
update value) + found = true; + + + //if (op_queue.size() == 0) V("Spurious read callback."); + bool full_read = true; + while (full_read) { + + + int opcode; + uint32_t opaque; + full_read = handle_response(this,input, done, found, opcode, opaque, NULL,2); + if (full_read) { + if (opcode == CMD_NOOP) { +#ifdef DEBUGMC + char out[128]; + sprintf(out,"conn l2: %u, reading noop\n",cid); + write(2,out,strlen(out)); +#endif + continue; + } + op = op_queue[2][opaque]; +#ifdef DEBUGMC + char out[128]; + sprintf(out,"conn l2: %u, reading opaque: %u\n",cid,opaque); + write(2,out,strlen(out)); + output_op(op,2,found); +#endif + if (op->key.length() < 1) { +#ifdef DEBUGMC + char out2[128]; + sprintf(out2,"conn l2: %u, bad op: %s\n",cid,op->key.c_str()); + write(2,out2,strlen(out2)); +#endif + continue; + } + } else { + break; + } + + + double now = get_time(); + switch (op->type) { + case Operation::GET: + if (done) { + if ( !found && (options.getset || options.getsetorset) ) {// && + //(options.twitter_trace != 1)) { + char key[256]; + memset(key,0,256); + strncpy(key, op->key.c_str(),255); + int valuelen = op->valuelen; + int index = lrand48() % (1024 * 1024); + int flags = OP_clu(op) | SRC_L2_M | LOG_OP; + issue_set(key, &random_char[index], valuelen, now, flags | ITEM_L1); + //wb_keys.push_back(op->key); + last_quiet1 = false; + if (OP_incl(op)) { + //wb_keys.push_back(op->key); + issue_set(key, &random_char[index], valuelen, now, flags | ITEM_L2); + last_quiet2 = false; + } + //pthread_mutex_lock(lock); + //fprintf(stderr,"----miss: %s----\n",key); + //for (auto iter = g_wb_keys->begin(); iter != g_wb_keys->end(); ++iter){ + // fprintf(stderr,"%s,%d\n",iter->first.c_str(),iter->second); + //} + //fprintf(stderr,"----%d----\n",cid); + //pthread_mutex_unlock(lock); + finish_op(op,0); // sets read_state = IDLE + + } else { + if (found) { + char key[256]; + memset(key,0,256); + strncpy(key, op->key.c_str(),255); + int valuelen = op->valuelen; + int index = lrand48() % (1024 
* 1024); + int flags = OP_clu(op) | ITEM_L1 | SRC_L1_COPY; + //found in l2, set in l1 + //wb_keys.push_back(op->key); + issue_set(key, &random_char[index],valuelen, now, flags); + this->stats.copies_to_l1++; + //if (OP_excl(op)) { + // issue_delete(key,now, ITEM_L2 | SRC_L1_COPY ); + //} + finish_op(op,1); + + } else { + finish_op(op,0); + } + } + } else { + char out[128]; + sprintf(out,"conn l2: %u, not done reading, should do something",cid); + write(2,out,strlen(out)); + } + break; + case Operation::SET: + if (OP_src(op) == SRC_WB) { + del_wb_keys(op->key); + } + finish_op(op,1); + break; + case Operation::TOUCH: + if (OP_src(op) == SRC_DIRECT_SET) { + char key[256]; + memset(key,0,256); + strncpy(key, op->key.c_str(),255); + int valuelen = op->valuelen; + if (!found) { + int index = lrand48() % (1024 * 1024); + //int ret = add_to_wb_keys(op->key+"l2"); + //if (ret == 1) { + issue_set(key, &random_char[index],valuelen,now, ITEM_INCL | ITEM_L2 | LOG_OP | SRC_L2_M); + //} + this->stats.set_misses_l2++; + } else { + issue_touch(key,valuelen,now, ITEM_L1 | SRC_L2_H | ITEM_DIRTY); + } + } + //if (!found) { + // //int incl = op->incl; + // //int flags = 0; + // //SET_INCL(incl,flags); + // //// not found in l2, set in l2 + // char key[256]; + // memset(key,0,256); + // strncpy(key, op->key.c_str(),255); + // int valuelen = op->valuelen; + // int index = lrand48() % (1024 * 1024); + // if (OP_src(op) == SRC_DIRECT_SET) { + // issue_set(key, &random_char[index],valuelen,now, ITEM_INCL | ITEM_L2 | LOG_OP); + // this->stats.set_misses_l2++; + // } + // //if (OP_src(op) == SRC_L1_H) { + // // fprintf(stderr,"expected op in l2: %s\n",key); + // //} + // finish_op(op,0); + //} else { + // finish_op(op,1); + //} + finish_op(op,0); + break; + case Operation::DELETE: + //check to see if it was a hit + //fprintf(stderr," del %s -- %d from %d\n",op->key.c_str(),found,OP_src(op)); + if (OP_src(op) == SRC_DIRECT_SET) { + if (found) { + this->stats.delete_hits_l2++; + } else { + 
this->stats.delete_misses_l2++; + } + } + finish_op(op,1); + break; + default: + fprintf(stderr,"op: %p, key: %s opaque: %u\n",(void*)op,op->key.c_str(),op->opaque); + DIE("not implemented"); + } + + } + + double now = get_time(); + if (check_exit_condition(now)) { + return; + } + + last_tx = now; + stats.log_op(op_queue_size[2]); + stats.log_op(op_queue_size[1]); + drive_write_machine(); + + // update events + //if (bev != NULL) { + // // no pending response (nothing to read) and output buffer empty (nothing to write) + // if ((op_queue.size() == 0) && (evbuffer_get_length(bufferevent_get_output(bev)) == 0)) { + // bufferevent_disable(bev, EV_WRITE|EV_READ); + // } + //} +} + +/** + * Callback called when write requests finish. + */ +void ConnectionMulti::write_callback() { + + //fprintf(stderr,"loaded evbuffer with ops: %u\n",op_queue.size()); +} + +/** + * Callback for timer timeouts. + */ +void ConnectionMulti::timer_callback() { + //fprintf(stderr,"timer up: %d\n",cid); + drive_write_machine(); +} + + +/* The follow are C trampolines for libevent callbacks. */ +void bev_event_cb1(struct bufferevent *bev, short events, void *ptr) { + + ConnectionMulti* conn = (ConnectionMulti*) ptr; + conn->event_callback1(events); +} + +/* The follow are C trampolines for libevent callbacks. 
*/ +void bev_event_cb2(struct bufferevent *bev, short events, void *ptr) { + + ConnectionMulti* conn = (ConnectionMulti*) ptr; + conn->event_callback2(events); +} + +void bev_read_cb1(struct bufferevent *bev, void *ptr) { + ConnectionMulti* conn = (ConnectionMulti*) ptr; + conn->read_callback1(); +} + + +void bev_read_cb2(struct bufferevent *bev, void *ptr) { + ConnectionMulti* conn = (ConnectionMulti*) ptr; + conn->read_callback2(); +} + +void bev_write_cb_m(struct bufferevent *bev, void *ptr) { +} + +void timer_cb_m(evutil_socket_t fd, short what, void *ptr) { + ConnectionMulti* conn = (ConnectionMulti*) ptr; + conn->timer_callback(); +} + diff --git a/ConnectionMulti.cc b/ConnectionMulti.cc new file mode 100644 index 0000000..81a6cda --- /dev/null +++ b/ConnectionMulti.cc @@ -0,0 +1,1713 @@ +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "config.h" + +#include "Connection.h" +#include "distributions.h" +#include "Generator.h" +#include "mutilate.h" +#include "binary_protocol.h" +#include "util.h" +#include +#include +#include +#include +#include +#include "blockingconcurrentqueue.h" + +//#include + +#define ITEM_L1 1 +#define ITEM_L2 2 +#define LOG_OP 4 +#define SRC_L1_M 8 +#define SRC_L1_H 16 +#define SRC_L2_M 32 +#define SRC_L2_H 64 +#define SRC_DIRECT_SET 128 +#define SRC_L1_COPY 256 +#define SRC_WB 512 + +#define ITEM_INCL 4096 +#define ITEM_EXCL 8192 +#define ITEM_DIRTY 16384 +#define ITEM_SIZE_CHANGE 131072 +#define ITEM_WAS_HIT 262144 + +#define LEVELS 2 +#define SET_INCL(incl,flags) \ + switch (incl) { \ + case 1: \ + flags |= ITEM_INCL; \ + break; \ + case 2: \ + flags |= ITEM_EXCL; \ + break; \ + \ + } \ + +#define GET_INCL(incl,flags) \ + if (flags & ITEM_INCL) incl = 1; \ + else if (flags & ITEM_EXCL) incl = 2; \ + +//#define OP_level(op) ( ((op)->flags & ITEM_L1) ? 
ITEM_L1 : ITEM_L2 ) +#define OP_level(op) ( (op)->flags & ~(LOG_OP | \ + ITEM_INCL | ITEM_EXCL | ITEM_DIRTY | \ + SRC_L1_M | SRC_L1_H | SRC_L2_M | SRC_L2_H | \ + SRC_DIRECT_SET | SRC_L1_COPY | SRC_WB ) ) + +#define FLAGS_level(flags) ( flags & ~(LOG_OP | \ + ITEM_INCL | ITEM_EXCL | ITEM_DIRTY | \ + SRC_L1_M | SRC_L1_H | SRC_L2_M | SRC_L2_H | \ + SRC_DIRECT_SET | SRC_L1_COPY | SRC_WB ) ) + +#define OP_clu(op) ( (op)->flags & ~(LOG_OP | \ + ITEM_L1 | ITEM_L2 | ITEM_DIRTY | \ + SRC_L1_M | SRC_L1_H | SRC_L2_M | SRC_L2_H | \ + SRC_DIRECT_SET | SRC_L1_COPY | SRC_WB ) ) + +#define OP_src(op) ( (op)->flags & ~(ITEM_L1 | ITEM_L2 | LOG_OP | \ + ITEM_INCL | ITEM_EXCL | ITEM_DIRTY ) ) + +#define OP_log(op) ((op)->flags & LOG_OP) +#define OP_incl(op) ((op)->flags & ITEM_INCL) +#define OP_excl(op) ((op)->flags & ITEM_EXCL) +#define OP_set_flag(op,flag) ((op))->flags |= flag; + +//#define DEBUGMC +//#define DEBUGS +//using namespace folly; +using namespace moodycamel; + +pthread_mutex_t cid_lock_m = PTHREAD_MUTEX_INITIALIZER; +static uint32_t connids_m = 1; + +#define NCLASSES 40 +#define CHUNK_ALIGN_BYTES 8 +static int classes = 0; +static int sizes[NCLASSES+1]; +static int inclusives[NCLASSES+1]; + + +static vector cid_rate; + +extern int max_n[3]; + +static void init_inclusives(char *inclusive_str) { + int j = 1; + for (int i = 0; i < (int)strlen(inclusive_str); i++) { + if (inclusive_str[i] == '-') { + continue; + } else { + inclusives[j] = inclusive_str[i] - '0'; + j++; + } + } +} + +static void init_classes() { + + double factor = 1.25; + unsigned int size = 96; //warning if you change this you die + unsigned int i = 0; + unsigned int chunk_size_max = 1048576/2; + while (++i < NCLASSES-1) { + if (size >= chunk_size_max / factor) { + break; + } + if (size % CHUNK_ALIGN_BYTES) + size += CHUNK_ALIGN_BYTES - (size % CHUNK_ALIGN_BYTES); + sizes[i] = size; + size *= factor; + } + sizes[i] = chunk_size_max; + classes = i; + +} + +static int get_class(int vl, uint32_t kl) { + 
//warning if you change this you die + int vsize = vl+kl+48+1+2; + int res = 1; + while (vsize > sizes[res]) + if (res++ == classes) { + //fprintf(stderr,"item larger than max class size. vsize: %d, class size: %d\n",vsize,sizes[res]); + return -1; + } + return res; +} + +static int get_incl(int vl, int kl) { + int clsid = get_class(vl,kl); + if (clsid) { + return inclusives[clsid]; + } else { + return -1; + } +} + +void ConnectionMulti::output_op(Operation *op, int type, bool found) { + char output[1024]; + char k[256]; + char a[256]; + char s[256]; + memset(k,0,256); + memset(a,0,256); + memset(s,0,256); + strncpy(k,op->key,255); + switch (type) { + case 0: //get + sprintf(a,"issue_get"); + break; + case 1: //set + sprintf(a,"issue_set"); + break; + case 2: //resp + sprintf(a,"resp"); + break; + } + switch(read_state) { + case INIT_READ: + sprintf(s,"init"); + break; + case CONN_SETUP: + sprintf(s,"setup"); + break; + case LOADING: + sprintf(s,"load"); + break; + case IDLE: + sprintf(s,"idle"); + break; + case WAITING_FOR_GET: + sprintf(s,"waiting for get"); + break; + case WAITING_FOR_SET: + sprintf(s,"waiting for set"); + break; + case WAITING_FOR_DELETE: + sprintf(s,"waiting for del"); + break; + case MAX_READ_STATE: + sprintf(s,"max"); + break; + } + if (type == 2) { + sprintf(output,"conn: %u, action: %s op: %s, opaque: %u, found: %d, type: %d\n",cid,a,k,op->opaque,found,op->type); + } else { + sprintf(output,"conn: %u, action: %s op: %s, opaque: %u, type: %d\n",cid,a,k,op->opaque,op->type); + } + write(2,output,strlen(output)); +} + +/** + * Create a new connection to a server endpoint. 
+ */ +ConnectionMulti::ConnectionMulti(struct event_base* _base, struct evdns_base* _evdns, + string _hostname1, string _hostname2, string _port, options_t _options, + bool sampling, int fd1, int fd2 ) : + start_time(0), stats(sampling), options(_options), + hostname1(_hostname1), hostname2(_hostname2), port(_port), base(_base), evdns(_evdns) +{ + pthread_mutex_lock(&cid_lock_m); + cid = connids_m++; + if (cid == 1) { + cid_rate.push_back(100); + cid_rate.push_back(0); + init_classes(); + init_inclusives(options.inclusives); + } else { + cid_rate.push_back(0); + } + + pthread_mutex_unlock(&cid_lock_m); + + valuesize = createGenerator(options.valuesize); + keysize = createGenerator(options.keysize); + srand(time(NULL)); + keygen = new KeyGenerator(keysize, options.records); + + total = 0; + eof = 0; + o_percent = 0; + + if (options.lambda <= 0) { + iagen = createGenerator("0"); + } else { + D("iagen = createGenerator(%s)", options.ia); + iagen = createGenerator(options.ia); + iagen->set_lambda(options.lambda); + } + + read_state = IDLE; + write_state = INIT_WRITE; + last_quiet1 = false; + last_quiet2 = false; + + last_tx = last_rx = 0.0; + + + op_queue_size = (uint32_t*)malloc(sizeof(uint32_t)*(LEVELS+1)); + opaque = (uint32_t*)malloc(sizeof(uint32_t)*(LEVELS+1)); + op_queue = (Operation***)malloc(sizeof(Operation**)*(LEVELS+1)); + + for (int i = 0; i <= LEVELS; i++) { + op_queue_size[i] = 0; + opaque[i] = 1; + //op_queue[i] = (Operation*)malloc(sizeof(int)*OPAQUE_MAX); + op_queue[i] = (Operation**)malloc(sizeof(Operation*)*(OPAQUE_MAX*2)); + + } + + bev1 = bufferevent_socket_new(base, fd1, BEV_OPT_CLOSE_ON_FREE); + bufferevent_setcb(bev1, bev_read_cb1, bev_write_cb_m, bev_event_cb1, this); + bufferevent_enable(bev1, EV_READ | EV_WRITE); + + bev2 = bufferevent_socket_new(base, fd2, BEV_OPT_CLOSE_ON_FREE); + bufferevent_setcb(bev2, bev_read_cb2, bev_write_cb_m, bev_event_cb2, this); + bufferevent_enable(bev2, EV_READ | EV_WRITE); + + timer = evtimer_new(base, 
timer_cb_m, this); + + read_state = IDLE; +} + + +void ConnectionMulti::set_queue(queue* a_trace_queue) { + trace_queue = a_trace_queue; + trace_queue_n = a_trace_queue->size(); +} + +void ConnectionMulti::set_lock(pthread_mutex_t* a_lock) { + lock = a_lock; +} + +void ConnectionMulti::set_g_wbkeys(unordered_map> *a_wb_keys) { + g_wb_keys = a_wb_keys; +} + +uint32_t ConnectionMulti::get_cid() { + return cid; +} + +int ConnectionMulti::add_to_wb_keys(string key) { + int ret = -1; + pthread_mutex_lock(lock); + auto pos = g_wb_keys->find(key); + if (pos == g_wb_keys->end()) { + g_wb_keys->insert( {key, vector() }); + ret = 1; + //fprintf(stderr,"----set: %s----\n",Op.key.c_str()); + //for (auto iter = g_wb_keys->begin(); iter != g_wb_keys->end(); ++iter){ + // fprintf(stderr,"%s,%d\n",iter->first.c_str(),iter->second); + //} + //fprintf(stderr,"----%d----\n",cid); + } else { + ret = 2; + } + + pthread_mutex_unlock(lock); + return ret; +} + +void ConnectionMulti::del_wb_keys(string key) { + + pthread_mutex_lock(lock); + auto position = g_wb_keys->find(key); + if (position != g_wb_keys->end()) { + g_wb_keys->erase(position); + } else { + fprintf(stderr,"expected %s, got nuthin\n",key.c_str()); + } + pthread_mutex_unlock(lock); +} + + +int ConnectionMulti::do_connect() { + + int connected = 0; + if (options.unix_socket) { + + + struct sockaddr_un sin1; + memset(&sin1, 0, sizeof(sin1)); + sin1.sun_family = AF_LOCAL; + strcpy(sin1.sun_path, hostname1.c_str()); + + int addrlen; + addrlen = sizeof(sin1); + + int err = bufferevent_socket_connect(bev1, (struct sockaddr*)&sin1, addrlen); + if (err == 0) { + connected = 1; + } else { + connected = 0; + err = errno; + fprintf(stderr,"l1 error %s\n",strerror(err)); + } + + struct sockaddr_un sin2; + memset(&sin2, 0, sizeof(sin2)); + sin2.sun_family = AF_LOCAL; + strcpy(sin2.sun_path, hostname2.c_str()); + + addrlen = sizeof(sin2); + err = bufferevent_socket_connect(bev2, (struct sockaddr*)&sin2, addrlen); + if (err == 0) { + 
connected = 1; + } else { + connected = 0; + err = errno; + fprintf(stderr,"l2 error %s\n",strerror(err)); + } + } + read_state = IDLE; + return connected; +} + +/** + * Destroy a connection, performing cleanup. + */ +ConnectionMulti::~ConnectionMulti() { + + + for (int i = 0; i <= LEVELS; i++) { + free(op_queue[i]); + + } + + free(op_queue_size); + free(opaque); + free(op_queue); + //event_free(timer); + //timer = NULL; + // FIXME: W("Drain op_q?"); + //bufferevent_free(bev1); + //bufferevent_free(bev2); + + delete iagen; + delete keygen; + delete keysize; + delete valuesize; +} + +/** + * Reset the connection back to an initial, fresh state. + */ +void ConnectionMulti::reset() { + // FIXME: Actually check the connection, drain all bufferevents, drain op_q. + //assert(op_queue.size() == 0); + //evtimer_del(timer); + read_state = IDLE; + write_state = INIT_WRITE; + stats = ConnectionStats(stats.sampling); +} + +/** + * Set our event processing priority. + */ +void ConnectionMulti::set_priority(int pri) { + if (bufferevent_priority_set(bev1, pri)) { + DIE("bufferevent_set_priority(bev, %d) failed", pri); + } +} + + + +/** + * Get/Set or Set Style + * If a GET command: Issue a get first, if not found then set + * If trace file (or prob. 
write) says to set, then set it + */ +int ConnectionMulti::issue_getsetorset(double now) { + + + + int ret = 0; + int nissued = 0; + //while (nissued < options.depth) { + + //pthread_mutex_lock(lock); + if (!trace_queue->empty()) { + Operation Op = *(trace_queue->front()); + if (Op.type == Operation::SASL) { + eof = 1; + cid_rate[cid] = 100; + fprintf(stderr,"cid %d done\n",cid); + string op_queue1; + string op_queue2; + for (int j = 0; j < 2; j++) { + for (int i = 0; i < OPAQUE_MAX; i++) { + if (op_queue[j+1][i] != NULL) { + if (j == 0) { + op_queue1 = op_queue1 + "," + op_queue[j+1][i]->key; + } else { + op_queue2 = op_queue2 + "," + op_queue[j+1][i]->key; + } + } + } + } + fprintf(stderr,"cid %d op_queue1: %s op_queue2: %s, op_queue_size1: %d, op_queue_size2: %d\n",cid,op_queue1.c_str(),op_queue2.c_str(),op_queue_size[1],op_queue_size[2]); + return 1; + } + + + /* check if in global wb queue */ + pthread_mutex_lock(lock); + double percent = (double)total/((double)trace_queue_n) * 100; + if (percent > o_percent+1) { + //update the percentage table and see if we should execute + std::vector::iterator mp = std::min_element(cid_rate.begin(), cid_rate.end()); + double min_percent = *mp; + + if (percent > min_percent+2) { + pthread_mutex_unlock(lock); + struct timeval tv; + tv.tv_sec = 1; + tv.tv_usec = 0; + int good = 0; + if (!event_pending(timer, EV_TIMEOUT, NULL)) { + good = evtimer_add(timer, &tv); + } + if (good != 0) { + fprintf(stderr,"eventimer is messed up!\n"); + return 2; + } + return 1; + } + cid_rate[cid] = percent; + fprintf(stderr,"%f,%d,%.4f\n",now,cid,percent); + o_percent = percent; + } + auto check = g_wb_keys->find(Op.key); + if (check != g_wb_keys->end()) { + pthread_mutex_unlock(lock); + struct timeval tv; + tv.tv_sec = 1; + tv.tv_usec = 0; + int good = 0; + if (!event_pending(timer, EV_TIMEOUT, NULL)) { + good = evtimer_add(timer, &tv); + } + if (good != 0) { + fprintf(stderr,"eventimer is messed up in checking for key: %s\n",Op.key); + return 
2; + } + return 1; + } else { + //g_wb_keys->insert( {Op.key, cid} ); + //g_wb_keys->insert( {Op.key+"l2", cid} ); + } + pthread_mutex_unlock(lock); + + + + char key[256]; + memset(key,0,256); + strncpy(key, Op.key,255); + int vl = Op.valuelen; + + trace_queue->pop(); + + int issued = 0; + int incl = get_incl(vl,strlen(key)); + int cid = get_class(vl,strlen(key)); + int flags = 0; + int touch = (rand() % 100); + int index = lrand48() % (1024 * 1024); + //int touch = 1; + SET_INCL(incl,flags); + + switch(Op.type) + { + case Operation::GET: + //if (nissued < options.depth-1) { + // issued = issue_get_with_len(key, vl, now, false, 1, flags, 0, 1); + // last_quiet1 = false; + //} else { + //} + if (options.threshold > 0) { + if (Op.future) { + key_hist[key] = 1; + } + } + issued = issue_get_with_len(key, vl, now, false, flags | LOG_OP | ITEM_L1); + if (touch == 1 && incl == 1) { + issue_touch(key,vl,now, ITEM_L2 | SRC_L1_H); + } + last_quiet1 = false; + this->stats.gets++; + this->stats.gets_cid[cid]++; + + break; + case Operation::SET: + if (last_quiet1) { + issue_noop(now,1); + } + if (incl == 1) { + issue_touch(key,vl,now, ITEM_L2 | SRC_DIRECT_SET); + } else if (incl == 2) { + issue_delete(key,now, ITEM_L2 | SRC_DIRECT_SET ); + } + issued = issue_set(key, &random_char[index], vl, now, flags | LOG_OP | ITEM_L1 | SRC_DIRECT_SET); + last_quiet1 = false; + this->stats.sets++; + this->stats.sets_cid[cid]++; + break; + case Operation::DELETE: + case Operation::TOUCH: + case Operation::NOOP: + case Operation::SASL: + fprintf(stderr,"invalid line: %s, vl: %d\n",key,vl); + break; + + } + if (issued) { + nissued++; + total++; + } else { + fprintf(stderr,"failed to issue line: %s, vl: %d @T: XX \n",key,vl); + } + } else { + return 1; + } + //} + if (last_quiet1) { + issue_noop(now,1); + last_quiet1 = false; + } + + return ret; + +} + +/** + * Issue a get request to the server. 
+ */ +int ConnectionMulti::issue_get_with_len(const char* key, int valuelen, double now, bool quiet, uint32_t flags, Operation *l1) { + + struct evbuffer *output = NULL; + int level = 0; + switch (FLAGS_level(flags)) { + case 1: + level = 1; + output = bufferevent_get_output(bev1); + break; + case 2: + level = 2; + output = bufferevent_get_output(bev2); + break; + } + //Operation op; + Operation *pop = new Operation(); + +#if HAVE_CLOCK_GETTIME + pop->start_time = get_time_accurate(); +#else + if (now == 0.0) { +#if USE_CACHED_TIME + struct timeval now_tv; + event_base_gettimeofday_cached(base, &now_tv); + pop->start_time = tv_to_double(&now_tv); +#else + pop->start_time = get_time(); +#endif + } else { + pop->start_time = now; + } +#endif + + strncpy(pop->key,key,255); + pop->valuelen = valuelen; + pop->type = Operation::GET; + pop->opaque = opaque[level]++; + pop->flags = flags; + pop->clsid = get_class(valuelen,strlen(key)); + if (l1 != NULL) { + pop->l1 = l1; + } + op_queue[level][pop->opaque] = pop; + //op_queue[level].push(op); + op_queue_size[level]++; + +#ifdef DEBUGS + fprintf(stderr,"cid: %d issing get: %s, size: %u, level %d, flags: %d, opaque: %d\n",cid,key,valuelen,level,flags,pop->opaque); +#endif + + if (opaque[level] > OPAQUE_MAX) { + opaque[level] = 1; + } + + //if (read_state == IDLE) read_state = WAITING_FOR_GET; + uint16_t keylen = strlen(key); + + // each line is 4-bytes + binary_header_t h = { 0x80, CMD_GET, htons(keylen), + 0x00, 0x00, htons(0), + htonl(keylen) }; + if (quiet) { + h.opcode = CMD_GETQ; + } + h.opaque = htonl(pop->opaque); + + evbuffer_add(output, &h, 24); + evbuffer_add(output, key, keylen); + + stats.tx_bytes += 24 + keylen; + return 1; +} + +/** + * Issue a get request to the server. 
+ */ +int ConnectionMulti::issue_touch(const char* key, int valuelen, double now, int flags) { + struct evbuffer *output = NULL; + int level = 0; + switch (FLAGS_level(flags)) { + case 1: + level = 1; + output = bufferevent_get_output(bev1); + break; + case 2: + level = 2; + output = bufferevent_get_output(bev2); + break; + } + Operation *pop = new Operation(); + +#if HAVE_CLOCK_GETTIME + pop->start_time = get_time_accurate(); +#else + if (now == 0.0) { +#if USE_CACHED_TIME + struct timeval now_tv; + event_base_gettimeofday_cached(base, &now_tv); + pop->start_time = tv_to_double(&now_tv); +#else + pop->start_time = get_time(); +#endif + } else { + pop->start_time = now; + } +#endif + + strncpy(pop->key,key,255); + pop->valuelen = valuelen; + pop->type = Operation::TOUCH; + pop->opaque = opaque[level]++; + pop->flags = flags; + op_queue[level][pop->opaque] = pop; + //op_queue[level].push(op); + op_queue_size[level]++; + + if (opaque[level] > OPAQUE_MAX) { + opaque[level] = 1; + } + +#ifdef DEBUGS + fprintf(stderr,"issing touch: %s, size: %u, level %d, flags: %d, opaque: %d\n",key,valuelen,level,flags,pop->opaque); +#endif + //if (read_state == IDLE) read_state = WAITING_FOR_GET; + uint16_t keylen = strlen(key); + + // each line is 4-bytes + binary_header_t h = { 0x80, CMD_TOUCH, htons(keylen), + 0x04, 0x00, htons(0), + htonl(keylen + 4) }; + h.opaque = htonl(pop->opaque); + + uint32_t exp = 0; + if (flags & ITEM_DIRTY) { + exp = htonl(flags); + } + evbuffer_add(output, &h, 24); + evbuffer_add(output, &exp, 4); + evbuffer_add(output, key, keylen); + + + stats.tx_bytes += 24 + keylen; + + //stats.log_access(op); + return 1; +} + +/** + * Issue a delete request to the server. 
+ */ +int ConnectionMulti::issue_delete(const char* key, double now, uint32_t flags) { + struct evbuffer *output = NULL; + int level = 0; + switch (FLAGS_level(flags)) { + case 1: + level = 1; + output = bufferevent_get_output(bev1); + break; + case 2: + level = 2; + output = bufferevent_get_output(bev2); + break; + } + //Operation op; + Operation *pop = new Operation(); + +#if HAVE_CLOCK_GETTIME + pop->start_time = get_time_accurate(); +#else + if (now == 0.0) { +#if USE_CACHED_TIME + struct timeval now_tv; + event_base_gettimeofday_cached(base, &now_tv); + pop->start_time = tv_to_double(&now_tv); +#else + pop->start_time = get_time(); +#endif + } else { + pop->start_time = now; + } +#endif + + strncpy(pop->key,key,255); + pop->type = Operation::DELETE; + pop->opaque = opaque[level]++; + pop->flags = flags; + op_queue[level][pop->opaque] = pop; + //op_queue[level].push(op); + op_queue_size[level]++; + + if (opaque[level] > OPAQUE_MAX) { + opaque[level] = 1; + } +#ifdef DEBUGS + fprintf(stderr,"cid: %d issing delete: %s, level %d, flags: %d, opaque: %d\n",cid,key,level,flags,pop->opaque); +#endif + + //if (read_state == IDLE) read_state = WAITING_FOR_GET; + uint16_t keylen = strlen(key); + + // each line is 4-bytes + binary_header_t h = { 0x80, CMD_DELETE, htons(keylen), + 0x00, 0x00, htons(0), + htonl(keylen) }; + h.opaque = htonl(pop->opaque); + + evbuffer_add(output, &h, 24); + evbuffer_add(output, key, keylen); + + stats.tx_bytes += 24 + keylen; + + //stats.log_access(op); + return 1; +} + +void ConnectionMulti::issue_noop(double now, int level) { + struct evbuffer *output = NULL; + switch (level) { + case 1: + output = bufferevent_get_output(bev1); + break; + case 2: + output = bufferevent_get_output(bev2); + break; + } + Operation op; + + if (now == 0.0) op.start_time = get_time(); + else op.start_time = now; + + binary_header_t h = { 0x80, CMD_NOOP, 0x0000, + 0x00, 0x00, htons(0), + 0x00 }; + + evbuffer_add(output, &h, 24); + +} + +/** + * Issue a set 
request to the server. + */ +int ConnectionMulti::issue_set(const char* key, const char* value, int length, double now, uint32_t flags) { + + struct evbuffer *output = NULL; + int level = 0; + switch (FLAGS_level(flags)) { + case 1: + level = 1; + output = bufferevent_get_output(bev1); + break; + case 2: + level = 2; + output = bufferevent_get_output(bev2); + break; + } + //Operation op; + Operation *pop = new Operation(); + +#if HAVE_CLOCK_GETTIME + pop->start_time = get_time_accurate(); +#else + if (now == 0.0) pop->start_time = get_time(); + else pop->start_time = now; +#endif + + + strncpy(pop->key,key,255); + pop->valuelen = length; + pop->type = Operation::SET; + pop->opaque = opaque[level]++; + pop->flags = flags; + pop->clsid = get_class(length,strlen(key)); + op_queue[level][pop->opaque] = pop; + //op_queue[level].push(op); + op_queue_size[level]++; +#ifdef DEBUGS + fprintf(stderr,"cid: %d issing set: %s, size: %u, level %d, flags: %d, opaque: %d\n",cid,key,length,level,flags,pop->opaque); +#endif + + if (opaque[level] > OPAQUE_MAX) { + opaque[level] = 1; + } + + uint16_t keylen = strlen(key); + + // each line is 4-bytes + binary_header_t h = { 0x80, CMD_SET, htons(keylen), + 0x08, 0x00, htons(0), + htonl(keylen + 8 + length) }; + h.opaque = htonl(pop->opaque); + + uint32_t f = htonl(flags); + uint32_t exp = 0; + + evbuffer_add(output, &h, 24); + evbuffer_add(output, &f, 4); + evbuffer_add(output, &exp, 4); + evbuffer_add(output, key, keylen); + evbuffer_add(output, value, length); + + stats.tx_bytes += length + 32 + keylen; + return 1; +} + +/** + * Return the oldest live operation in progress. + */ +void ConnectionMulti::pop_op(Operation *op) { + + uint8_t level = OP_level(op); + //op_queue[level].erase(op); + op_queue_size[level]--; + + + if (read_state == LOADING) return; + read_state = IDLE; + + // Advance the read state machine. 
+ //if (op_queue.size() > 0) { + // Operation& op = op_queue.front(); + // switch (op.type) { + // case Operation::GET: read_state = WAITING_FOR_GET; break; + // case Operation::SET: read_state = WAITING_FOR_SET; break; + // case Operation::DELETE: read_state = WAITING_FOR_DELETE; break; + // default: DIE("Not implemented."); + // } + //} +} + +/** + * Finish up (record stats) an operation that just returned from the + * server. + */ +void ConnectionMulti::finish_op(Operation *op, int was_hit) { + double now; +#if USE_CACHED_TIME + struct timeval now_tv; + event_base_gettimeofday_cached(base, &now_tv); + now = tv_to_double(&now_tv); +#else + now = get_time(); +#endif +#if HAVE_CLOCK_GETTIME + op->end_time = get_time_accurate(); +#else + op->end_time = now; +#endif + + if (options.successful_queries && was_hit) { + switch (op->type) { + case Operation::GET: + switch (OP_level(op)) { + case 1: + stats.log_get_l1(*op); + break; + case 2: + stats.log_get_l2(*op); + break; + } + break; + case Operation::SET: + switch (OP_level(op)) { + case 1: + stats.log_set_l1(*op); + break; + case 2: + stats.log_set_l2(*op); + break; + } + break; + case Operation::DELETE: break; + case Operation::TOUCH: break; + default: DIE("Not implemented."); + } + } else { + switch (op->type) { + case Operation::GET: + if (OP_log(op)) { + switch (OP_level(op)) { + case 1: + stats.log_get_l1(*op); + break; + case 2: + stats.log_get_l2(*op); + if (op->l1 != NULL) { + op->l1->end_time = now; + stats.log_get(*(op->l1)); + } + break; + } + } + break; + case Operation::SET: + if (OP_log(op)) { + switch (OP_level(op)) { + case 1: + stats.log_set_l1(*op); + break; + case 2: + stats.log_set_l2(*op); + break; + } + } + break; + case Operation::DELETE: break; + case Operation::TOUCH: break; + default: DIE("Not implemented."); + } + } + + last_rx = now; + uint8_t level = OP_level(op); + if (op->l1 != NULL) { + delete op_queue[1][op->l1->opaque]; + op_queue[1][op->l1->opaque] = 0; + op_queue_size[1]--; + } + 
//op_queue[level].erase(op_queue[level].begin()+opopq); + if (op == op_queue[level][op->opaque] && + op->opaque == op_queue[level][op->opaque]->opaque) { + delete op_queue[level][op->opaque]; + op_queue[level][op->opaque] = 0; + } else { + fprintf(stderr,"op_queue out of sync! Expected %p, got %p, opa1: %d opaq2: %d\n", + op,op_queue[level][op->opaque],op->opaque,op_queue[level][op->opaque]->opaque); + } + op_queue_size[level]--; + read_state = IDLE; + + +} + + + +/** + * Check if our testing is done and we should exit. + */ +bool ConnectionMulti::check_exit_condition(double now) { + if (eof && op_queue_size[1] == 0 && op_queue_size[2] == 0) { + return true; + } + if (read_state == INIT_READ) return false; + + return false; +} + +/** + * Handle new connection and error events. + */ +void ConnectionMulti::event_callback1(short events) { + if (events & BEV_EVENT_CONNECTED) { + D("Connected to %s:%s.", hostname1.c_str(), port.c_str()); + int fd = bufferevent_getfd(bev1); + if (fd < 0) DIE("bufferevent_getfd"); + + if (!options.no_nodelay && !options.unix_socket) { + int one = 1; + if (setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, + (void *) &one, sizeof(one)) < 0) + DIE("setsockopt()"); + } +#ifdef DEBUGMC + fprintf(stderr,"libevent connected %s, fd: %u\n",hostname1.c_str(),bufferevent_getfd(bev1)); +#endif + + + } else if (events & BEV_EVENT_ERROR) { + int err = bufferevent_socket_get_dns_error(bev1); + //if (err) DIE("DNS error: %s", evutil_gai_strerror(err)); + if (err) fprintf(stderr,"DNS error: %s", evutil_gai_strerror(err)); + fprintf(stderr,"CID: %d - Got an error: %s\n",this->cid, + evutil_socket_error_to_string(EVUTIL_SOCKET_ERROR())); + + //DIE("BEV_EVENT_ERROR: %s", strerror(errno)); + + } else if (events & BEV_EVENT_EOF) { + fprintf(stderr,"Unexpected EOF from server."); + return; + } +} + +/** + * Handle new connection and error events. 
+ */ +void ConnectionMulti::event_callback2(short events) { + if (events & BEV_EVENT_CONNECTED) { + D("Connected to %s:%s.", hostname2.c_str(), port.c_str()); + int fd = bufferevent_getfd(bev2); + if (fd < 0) DIE("bufferevent_getfd"); + + if (!options.no_nodelay && !options.unix_socket) { + int one = 1; + if (setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, + (void *) &one, sizeof(one)) < 0) + DIE("setsockopt()"); + } +#ifdef DEBUGMC + fprintf(stderr,"libevent connected %s, fd: %u\n",hostname2.c_str(),bufferevent_getfd(bev2)); +#endif + + + } else if (events & BEV_EVENT_ERROR) { + int err = bufferevent_socket_get_dns_error(bev2); + //if (err) DIE("DNS error: %s", evutil_gai_strerror(err)); + if (err) fprintf(stderr,"DNS error: %s", evutil_gai_strerror(err)); + fprintf(stderr,"CID: %d - Got an error: %s\n",this->cid, + evutil_socket_error_to_string(EVUTIL_SOCKET_ERROR())); + + //DIE("BEV_EVENT_ERROR: %s", strerror(errno)); + + + } else if (events & BEV_EVENT_EOF) { + fprintf(stderr,"Unexpected EOF from server."); + return; + } +} + +/** + * Request generation loop. Determines whether or not to issue a new command, + * based on timer events. + * + * Note that this function loops. Be wary of break vs. return. 
+ */ +void ConnectionMulti::drive_write_machine(double now) { + if (now == 0.0) now = get_time(); + + double delay; + struct timeval tv; + + if (check_exit_condition(now)) { + return; + } + + while (1) { + switch (write_state) { + case INIT_WRITE: + delay = iagen->generate(); + next_time = now + delay; + double_to_tv(delay, &tv); + evtimer_add(timer, &tv); + write_state = ISSUING; + break; + + case ISSUING: + if ( (op_queue_size[1] >= (size_t) options.depth) || + (op_queue_size[2] >= (size_t) options.depth) ) { + write_state = WAITING_FOR_OPQ; + break; + } + + if (options.getsetorset) { + int ret = issue_getsetorset(now); + if (ret == 1) return; //if at EOF + } + + last_tx = now; + for (int i = 1; i <= 2; i++) { + stats.log_op(op_queue_size[i]); + } + break; + + case WAITING_FOR_TIME: + write_state = ISSUING; + break; + + case WAITING_FOR_OPQ: + if ( (op_queue_size[1] >= (size_t) options.depth) || + (op_queue_size[2] >= (size_t) options.depth) ) { + //double delay = 0.01; + //struct timeval tv; + //double_to_tv(delay, &tv); + //evtimer_add(timer, &tv); + return; + } else { + write_state = ISSUING; + break; + } + + default: DIE("Not implemented"); + } + } +} + + + +/** + * Tries to consume a binary response (in its entirety) from an evbuffer. + * + * @param input evBuffer to read response from + * @return true if consumed, false if not enough data in buffer. 
+ */ +static bool handle_response(ConnectionMulti *conn, evbuffer *input, bool &done, bool &found, int &opcode, uint32_t &opaque, evicted_t *evict, int level) { + // Read the first 24 bytes as a header + int length = evbuffer_get_length(input); + if (length < 24) return false; + binary_header_t* h = + reinterpret_cast(evbuffer_pullup(input, 24)); + //assert(h); + + uint32_t bl = ntohl(h->body_len); + uint16_t kl = ntohs(h->key_len); + uint8_t el = h->extra_len; + // Not whole response + int targetLen = 24 + bl; + if (length < targetLen) { + return false; + } + + opcode = h->opcode; + opaque = ntohl(h->opaque); + uint16_t status = ntohs(h->status); +#ifdef DEBUGMC + fprintf(stderr,"cid: %d handle resp from l%d - opcode: %u opaque: %u keylen: %u extralen: %u datalen: %u status: %u\n",conn->get_cid(),level, + h->opcode,ntohl(h->opaque),ntohs(h->key_len),h->extra_len, + ntohl(h->body_len),ntohs(h->status)); +#endif + + + // If something other than success, count it as a miss + if (opcode == CMD_GET && status == RESP_NOT_FOUND) { + switch(level) { + case 1: + conn->stats.get_misses_l1++; + break; + case 2: + conn->stats.get_misses_l2++; + conn->stats.get_misses++; + conn->stats.window_get_misses++; + break; + + } + found = false; + evbuffer_drain(input, targetLen); + + } else if (opcode == CMD_SET && kl > 0) { + //first data is extras: clsid, flags, eflags + if (evict) { + evbuffer_drain(input,24); + unsigned char *buf = evbuffer_pullup(input,bl); + + + evict->clsid = *((uint32_t*)buf); + evict->clsid = ntohl(evict->clsid); + buf += 4; + + evict->serverFlags = *((uint32_t*)buf); + evict->serverFlags = ntohl(evict->serverFlags); + buf += 4; + + evict->evictedFlags = *((uint32_t*)buf); + evict->evictedFlags = ntohl(evict->evictedFlags); + buf += 4; + + + evict->evictedKeyLen = kl; + evict->evictedKey = (char*)malloc(kl+1); + memset(evict->evictedKey,0,kl+1); + memcpy(evict->evictedKey,buf,kl); + buf += kl; + + + evict->evictedLen = bl - kl - el; + evict->evictedData = 
(char*)malloc(evict->evictedLen); + memcpy(evict->evictedData,buf,evict->evictedLen); + evict->evicted = true; + //fprintf(stderr,"class: %u, serverFlags: %u, evictedFlags: %u\n",evict->clsid,evict->serverFlags,evict->evictedFlags); + evbuffer_drain(input,bl); + } else { + evbuffer_drain(input, targetLen); + } + } else if (opcode == CMD_TOUCH && status == RESP_NOT_FOUND) { + found = false; + evbuffer_drain(input, targetLen); + } else if (opcode == CMD_DELETE && status == RESP_NOT_FOUND) { + found = false; + evbuffer_drain(input, targetLen); + } else { + evbuffer_drain(input, targetLen); + } + + conn->stats.rx_bytes += targetLen; + done = true; + return true; +} + +/** + * Handle incoming data (responses). + */ +void ConnectionMulti::read_callback1() { + struct evbuffer *input = bufferevent_get_input(bev1); + + Operation *op = NULL; + bool done, found; + + //initially assume found (for sets that may come through here) + //is this correct? do we want to assume true in case that + //GET was found, but wrong value size (i.e. 
update value) + found = true; + + //if (op_queue.size() == 0) V("Spurious read callback."); + bool full_read = true; + while (full_read) { + + + int opcode; + uint32_t opaque; + evicted_t *evict = (evicted_t*)malloc(sizeof(evicted_t)); + memset(evict,0,sizeof(evicted_t)); + + full_read = handle_response(this,input, done, found, opcode, opaque, evict,1); + if (full_read) { + if (opcode == CMD_NOOP) { +#ifdef DEBUGMC + char out[128]; + sprintf(out,"conn l1: %u, reading noop\n",cid); + write(2,out,strlen(out)); +#endif + if (evict->evictedKey) free(evict->evictedKey); + if (evict->evictedData) free(evict->evictedData); + free(evict); + continue; + } + op = op_queue[1][opaque]; +#ifdef DEBUGMC + char out[128]; + sprintf(out,"conn l1: %u, reading opaque: %u\n",cid,opaque); + write(2,out,strlen(out)); + output_op(op,2,found); +#endif + if (strlen(op->key) < 1) { +#ifdef DEBUGMC + char out2[128]; + sprintf(out2,"conn l1: %u, bad op: %s\n",cid,op->key); + write(2,out2,strlen(out2)); +#endif + if (evict->evictedKey) free(evict->evictedKey); + if (evict->evictedData) free(evict->evictedData); + free(evict); + continue; + } + } else { + if (evict) { + if (evict->evictedKey) free(evict->evictedKey); + if (evict->evictedData) free(evict->evictedData); + free(evict); + } + break; + } + + + double now = get_time(); + int wb = 0; + if (options.rand_admit) { + wb = (rand() % options.rand_admit); + } + switch (op->type) { + case Operation::GET: + if (done) { + if ( !found && (options.getset || options.getsetorset) ) { + /* issue a get a l2 */ + char key[256]; + memset(key,0,256); + strncpy(key, op->key,255); + int vl = op->valuelen; + int flags = OP_clu(op); + issue_get_with_len(key,vl,now,false, flags | SRC_L1_M | ITEM_L2 | LOG_OP, op); + op->end_time = now; + this->stats.log_get_l1(*op); + //finish_op(op,0); + + } else { + del_wb_keys(op->key); + finish_op(op,found); + } + } else { + char out[128]; + sprintf(out,"conn l1: %u, not done reading, should do something",cid); + 
write(2,out,strlen(out)); + } + break; + case Operation::SET: + //if (OP_src(op) == SRC_L1_COPY || + // OP_src(op) == SRC_DIRECT_SET || + // OP_src(op) == SRC_L2_M ) { + //} + if (evict->evicted) { + string wb_key(evict->evictedKey); + if ((evict->evictedFlags & ITEM_INCL) && (evict->evictedFlags & ITEM_DIRTY)) { + //wb_keys.push_back(wb_key); + int ret = add_to_wb_keys(wb_key); + if (ret == 1) { + issue_set(evict->evictedKey, evict->evictedData, evict->evictedLen, now, ITEM_L2 | ITEM_INCL | LOG_OP | SRC_WB); + } + //fprintf(stderr,"incl writeback %s\n",evict->evictedKey); + this->stats.incl_wbs++; + } else if (evict->evictedFlags & ITEM_EXCL) { + //fprintf(stderr,"excl writeback %s\n",evict->evictedKey); + //strncpy(wb_key,evict->evictedKey,255); + if ( (options.rand_admit && wb == 0) || + (options.threshold && (key_hist[wb_key] == 1)) || + (options.wb_all) ) { + int ret = add_to_wb_keys(wb_key); + if (ret == 1) { + issue_set(evict->evictedKey, evict->evictedData, evict->evictedLen, now, ITEM_L2 | ITEM_EXCL | LOG_OP | SRC_WB); + } + this->stats.excl_wbs++; + } + } + /* + if (evict->serverFlags & ITEM_SIZE_CHANGE && OP_src(op) == SRC_DIRECT_SET) { + char key[256]; + memset(key,0,256); + strncpy(key, op->key.c_str(),255); + if (evict->serverFlags & ITEM_INCL) { + int index = lrand48() % (1024 * 1024); + int valuelen = op->valuelen; + //the item's size was changed, issue a SET to L2 as a new command + issue_set(key, &random_char[index], valuelen, now, ITEM_L2 | ITEM_INCL | LOG_OP | SRC_L2_M); + } + } + */ + if (OP_src(op) == SRC_DIRECT_SET) { + if ( (evict->serverFlags & ITEM_SIZE_CHANGE) || ((evict->serverFlags & ITEM_WAS_HIT) == 0)) { + this->stats.set_misses_l1++; + } else if (OP_excl(op) && evict->serverFlags & ITEM_WAS_HIT) { + this->stats.set_excl_hits_l1++; + } else if (OP_incl(op) && evict->serverFlags & ITEM_WAS_HIT) { + this->stats.set_incl_hits_l1++; + } + } + } + del_wb_keys(op->key); + finish_op(op,1); + break; + case Operation::TOUCH: + finish_op(op,1); 
+ break; + default: + fprintf(stderr,"op: %p, key: %s opaque: %u\n",(void*)op,op->key,op->opaque); + DIE("not implemented"); + } + + if (evict) { + if (evict->evictedKey) free(evict->evictedKey); + if (evict->evictedData) free(evict->evictedData); + free(evict); + } + + } + + + double now = get_time(); + if (check_exit_condition(now)) { + return; + } + + last_tx = now; + stats.log_op(op_queue_size[1]); + stats.log_op(op_queue_size[2]); + //for (int i = 1; i <= 2; i++) { + // fprintf(stderr,"max issue buf n[%d]: %u\n",i,max_n[i]); + //} + drive_write_machine(); + + // update events + //if (bev != NULL) { + // // no pending response (nothing to read) and output buffer empty (nothing to write) + // if ((op_queue.size() == 0) && (evbuffer_get_length(bufferevent_get_output(bev)) == 0)) { + // bufferevent_disable(bev, EV_WRITE|EV_READ); + // } + //} +} + +/** + * Handle incoming data (responses). + */ +void ConnectionMulti::read_callback2() { + struct evbuffer *input = bufferevent_get_input(bev2); + + Operation *op = NULL; + bool done, found; + + //initially assume found (for sets that may come through here) + //is this correct? do we want to assume true in case that + //GET was found, but wrong value size (i.e. 
update value) + found = true; + + + //if (op_queue.size() == 0) V("Spurious read callback."); + bool full_read = true; + while (full_read) { + + + int opcode; + uint32_t opaque; + full_read = handle_response(this,input, done, found, opcode, opaque, NULL,2); + if (full_read) { + if (opcode == CMD_NOOP) { +#ifdef DEBUGMC + char out[128]; + sprintf(out,"conn l2: %u, reading noop\n",cid); + write(2,out,strlen(out)); +#endif + continue; + } + op = op_queue[2][opaque]; +#ifdef DEBUGMC + char out[128]; + sprintf(out,"conn l2: %u, reading opaque: %u\n",cid,opaque); + write(2,out,strlen(out)); + output_op(op,2,found); +#endif + if (strlen(op->key) < 1) { +#ifdef DEBUGMC + char out2[128]; + sprintf(out2,"conn l2: %u, bad op: %s\n",cid,op->key); + write(2,out2,strlen(out2)); +#endif + continue; + } + } else { + break; + } + + + double now = get_time(); + switch (op->type) { + case Operation::GET: + if (done) { + if ( !found && (options.getset || options.getsetorset) ) {// && + //(options.twitter_trace != 1)) { + char key[256]; + memset(key,0,256); + strncpy(key, op->key,255); + int valuelen = op->valuelen; + int index = lrand48() % (1024 * 1024); + int flags = OP_clu(op) | SRC_L2_M | LOG_OP; + issue_set(key, &random_char[index], valuelen, now, flags | ITEM_L1); + //wb_keys.push_back(op->key); + last_quiet1 = false; + if (OP_incl(op)) { + //wb_keys.push_back(op->key); + issue_set(key, &random_char[index], valuelen, now, flags | ITEM_L2); + last_quiet2 = false; + } + //pthread_mutex_lock(lock); + //fprintf(stderr,"----miss: %s----\n",key); + //for (auto iter = g_wb_keys->begin(); iter != g_wb_keys->end(); ++iter){ + // fprintf(stderr,"%s,%d\n",iter->first.c_str(),iter->second); + //} + //fprintf(stderr,"----%d----\n",cid); + //pthread_mutex_unlock(lock); + finish_op(op,0); // sets read_state = IDLE + + } else { + if (found) { + char key[256]; + memset(key,0,256); + strncpy(key, op->key,255); + int valuelen = op->valuelen; + int index = lrand48() % (1024 * 1024); + int flags = 
OP_clu(op) | ITEM_L1 | SRC_L1_COPY; + //found in l2, set in l1 + //wb_keys.push_back(op->key); + issue_set(key, &random_char[index],valuelen, now, flags); + this->stats.copies_to_l1++; + //if (OP_excl(op)) { + // issue_delete(key,now, ITEM_L2 | SRC_L1_COPY ); + //} + finish_op(op,1); + + } else { + finish_op(op,0); + } + } + } else { + char out[128]; + sprintf(out,"conn l2: %u, not done reading, should do something",cid); + write(2,out,strlen(out)); + } + break; + case Operation::SET: + if (OP_src(op) == SRC_WB) { + del_wb_keys(op->key); + } + finish_op(op,1); + break; + case Operation::TOUCH: + if (OP_src(op) == SRC_DIRECT_SET) { + char key[256]; + memset(key,0,256); + strncpy(key, op->key,255); + int valuelen = op->valuelen; + if (!found) { + int index = lrand48() % (1024 * 1024); + //int ret = add_to_wb_keys(op->key+"l2"); + //if (ret == 1) { + issue_set(key, &random_char[index],valuelen,now, ITEM_INCL | ITEM_L2 | LOG_OP | SRC_L2_M); + //} + this->stats.set_misses_l2++; + } else { + issue_touch(key,valuelen,now, ITEM_L1 | SRC_L2_H | ITEM_DIRTY); + } + } + //if (!found) { + // //int incl = op->incl; + // //int flags = 0; + // //SET_INCL(incl,flags); + // //// not found in l2, set in l2 + // char key[256]; + // memset(key,0,256); + // strncpy(key, op->key.c_str(),255); + // int valuelen = op->valuelen; + // int index = lrand48() % (1024 * 1024); + // if (OP_src(op) == SRC_DIRECT_SET) { + // issue_set(key, &random_char[index],valuelen,now, ITEM_INCL | ITEM_L2 | LOG_OP); + // this->stats.set_misses_l2++; + // } + // //if (OP_src(op) == SRC_L1_H) { + // // fprintf(stderr,"expected op in l2: %s\n",key); + // //} + // finish_op(op,0); + //} else { + // finish_op(op,1); + //} + finish_op(op,0); + break; + case Operation::DELETE: + //check to see if it was a hit + //fprintf(stderr," del %s -- %d from %d\n",op->key.c_str(),found,OP_src(op)); + if (OP_src(op) == SRC_DIRECT_SET) { + if (found) { + this->stats.delete_hits_l2++; + } else { + this->stats.delete_misses_l2++; + 
} + } + finish_op(op,1); + break; + default: + fprintf(stderr,"op: %p, key: %s opaque: %u\n",(void*)op,op->key,op->opaque); + DIE("not implemented"); + } + + } + + double now = get_time(); + if (check_exit_condition(now)) { + return; + } + + last_tx = now; + stats.log_op(op_queue_size[2]); + stats.log_op(op_queue_size[1]); + drive_write_machine(); + + // update events + //if (bev != NULL) { + // // no pending response (nothing to read) and output buffer empty (nothing to write) + // if ((op_queue.size() == 0) && (evbuffer_get_length(bufferevent_get_output(bev)) == 0)) { + // bufferevent_disable(bev, EV_WRITE|EV_READ); + // } + //} +} + +/** + * Callback called when write requests finish. + */ +void ConnectionMulti::write_callback() { + + //fprintf(stderr,"loaded evbuffer with ops: %u\n",op_queue.size()); +} + +/** + * Callback for timer timeouts. + */ +void ConnectionMulti::timer_callback() { + //fprintf(stderr,"timer up: %d\n",cid); + drive_write_machine(); +} + + +/* The follow are C trampolines for libevent callbacks. */ +void bev_event_cb1(struct bufferevent *bev, short events, void *ptr) { + + ConnectionMulti* conn = (ConnectionMulti*) ptr; + conn->event_callback1(events); +} + +/* The follow are C trampolines for libevent callbacks. 
*/ +void bev_event_cb2(struct bufferevent *bev, short events, void *ptr) { + + ConnectionMulti* conn = (ConnectionMulti*) ptr; + conn->event_callback2(events); +} + +void bev_read_cb1(struct bufferevent *bev, void *ptr) { + ConnectionMulti* conn = (ConnectionMulti*) ptr; + conn->read_callback1(); +} + + +void bev_read_cb2(struct bufferevent *bev, void *ptr) { + ConnectionMulti* conn = (ConnectionMulti*) ptr; + conn->read_callback2(); +} + +void bev_write_cb_m(struct bufferevent *bev, void *ptr) { +} + +void timer_cb_m(evutil_socket_t fd, short what, void *ptr) { + ConnectionMulti* conn = (ConnectionMulti*) ptr; + conn->timer_callback(); +} + diff --git a/ConnectionMultiApprox.cc b/ConnectionMultiApprox.cc new file mode 100644 index 0000000..7ee052a --- /dev/null +++ b/ConnectionMultiApprox.cc @@ -0,0 +1,1943 @@ +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + + +#include "config.h" + +#include "Connection.h" +#include "distributions.h" +#include "Generator.h" +#include "mutilate.h" +#include "binary_protocol.h" +#include "util.h" +#include +#include +#include +#include +#include +#include "blockingconcurrentqueue.h" + +//#include +//#include + +#define ITEM_L1 1 +#define ITEM_L2 2 +#define LOG_OP 4 +#define SRC_L1_M 8 +#define SRC_L1_H 16 +#define SRC_L2_M 32 +#define SRC_L2_H 64 +#define SRC_DIRECT_SET 128 +#define SRC_L1_COPY 256 +#define SRC_WB 512 + +#define ITEM_INCL 4096 +#define ITEM_EXCL 8192 +#define ITEM_DIRTY 16384 +#define ITEM_SIZE_CHANGE 131072 +#define ITEM_WAS_HIT 262144 + +#define LEVELS 2 +#define SET_INCL(incl,flags) \ + switch (incl) { \ + case 1: \ + flags |= ITEM_INCL; \ + break; \ + case 2: \ + flags |= ITEM_EXCL; \ + break; \ + \ + } \ + +#define GET_INCL(incl,flags) \ + if (flags & ITEM_INCL) incl = 1; \ + else if (flags & ITEM_EXCL) incl = 2; \ + +//#define OP_level(op) ( ((op)->flags & ITEM_L1) ? 
ITEM_L1 : ITEM_L2 ) +#define OP_level(op) ( (op)->flags & ~(LOG_OP | \ + ITEM_INCL | ITEM_EXCL | ITEM_DIRTY | \ + SRC_L1_M | SRC_L1_H | SRC_L2_M | SRC_L2_H | \ + SRC_DIRECT_SET | SRC_L1_COPY | SRC_WB ) ) + +#define FLAGS_level(flags) ( flags & ~(LOG_OP | \ + ITEM_INCL | ITEM_EXCL | ITEM_DIRTY | \ + SRC_L1_M | SRC_L1_H | SRC_L2_M | SRC_L2_H | \ + SRC_DIRECT_SET | SRC_L1_COPY | SRC_WB ) ) + +#define OP_clu(op) ( (op)->flags & ~(LOG_OP | \ + ITEM_L1 | ITEM_L2 | ITEM_DIRTY | \ + SRC_L1_M | SRC_L1_H | SRC_L2_M | SRC_L2_H | \ + SRC_DIRECT_SET | SRC_L1_COPY | SRC_WB ) ) + +#define OP_src(op) ( (op)->flags & ~(ITEM_L1 | ITEM_L2 | LOG_OP | \ + ITEM_INCL | ITEM_EXCL | ITEM_DIRTY ) ) + +#define OP_log(op) ((op)->flags & LOG_OP) +#define OP_incl(op) ((op)->flags & ITEM_INCL) +#define OP_excl(op) ((op)->flags & ITEM_EXCL) +#define OP_set_flag(op,flag) ((op))->flags |= flag; + +//#define DEBUGMC +//#define DEBUGS +//using namespace folly; +using namespace moodycamel; +//using namespace fmt; + +//struct node { +// long long addr,label; +// node *nxt; +// node(long long _addr = 0, long long _label = 0, node *_nxt = NULL) +// : addr(_addr),label(_label),nxt(_nxt) {} +//}; +// +//struct tnode { +// long long tm,offset; int size; +//};//trace file data structure +// +//long long find(long long addr) { +// int t = addr%MAXH; +// node *tmp = hash[t],*pre = NULL; +// while (tmp) { +// if (tmp->addr == addr) { +// long long tlabel = tmp->label; +// if (pre == NULL) hash[t] = tmp->nxt; +// else pre->nxt = tmp->nxt; +// delete tmp; +// return tlabel; +// } +// pre = tmp; +// tmp = tmp->nxt; +// } +// return 0; +//} +// +//void insert(long long addr ) { +// int t = addr%MAXH; +// node *tmp = new node(addr,n,hash[t]); +// hash[t] = tmp; +//} + + + +pthread_mutex_t cid_lock_m_approx = PTHREAD_MUTEX_INITIALIZER; +static uint32_t connids_m = 1; + +#define NCLASSES 40 +#define CHUNK_ALIGN_BYTES 8 +static int classes = 0; +static int sizes[NCLASSES+1]; +static int inclusives[NCLASSES+1]; + + + 
+static void init_inclusives(char *inclusive_str) { + int j = 1; + for (int i = 0; i < (int)strlen(inclusive_str); i++) { + if (inclusive_str[i] == '-') { + continue; + } else { + inclusives[j] = inclusive_str[i] - '0'; + j++; + } + } +} + +static void init_classes() { + + double factor = 1.25; + //unsigned int chunk_size = 48; + //unsigned int item_size = 24; + unsigned int size = 96; //warning if you change this you die + unsigned int i = 0; + unsigned int chunk_size_max = 1048576/2; + while (++i < NCLASSES-1) { + if (size >= chunk_size_max / factor) { + break; + } + if (size % CHUNK_ALIGN_BYTES) + size += CHUNK_ALIGN_BYTES - (size % CHUNK_ALIGN_BYTES); + sizes[i] = size; + size *= factor; + } + sizes[i] = chunk_size_max; + classes = i; + +} + +static int get_class(int vl, uint32_t kl) { + //warning if you change this you die + int vsize = vl+kl+48+1+2; + int res = 1; + while (vsize > sizes[res]) + if (res++ == classes) { + //fprintf(stderr,"item larger than max class size. vsize: %d, class size: %d\n",vsize,sizes[res]); + return -1; + } + return res; +} + +static int get_incl(int vl, int kl) { + int clsid = get_class(vl,kl); + if (clsid) { + return inclusives[clsid]; + } else { + return -1; + } +} + +void ConnectionMultiApprox::output_op(Operation *op, int type, bool found) { + char output[1024]; + char k[256]; + char a[256]; + char s[256]; + memset(k,0,256); + memset(a,0,256); + memset(s,0,256); + strncpy(k,op->key,255); + switch (type) { + case 0: //get + sprintf(a,"issue_get"); + break; + case 1: //set + sprintf(a,"issue_set"); + break; + case 2: //resp + sprintf(a,"resp"); + break; + } + switch(read_state) { + case INIT_READ: + sprintf(s,"init"); + break; + case CONN_SETUP: + sprintf(s,"setup"); + break; + case LOADING: + sprintf(s,"load"); + break; + case IDLE: + sprintf(s,"idle"); + break; + case WAITING_FOR_GET: + sprintf(s,"waiting for get"); + break; + case WAITING_FOR_SET: + sprintf(s,"waiting for set"); + break; + case WAITING_FOR_DELETE: + 
sprintf(s,"waiting for del"); + break; + case MAX_READ_STATE: + sprintf(s,"max"); + break; + } + if (type == 2) { + sprintf(output,"conn: %u, action: %s op: %s, opaque: %u, found: %d, type: %d\n",cid,a,k,op->opaque,found,op->type); + } else { + sprintf(output,"conn: %u, action: %s op: %s, opaque: %u, type: %d\n",cid,a,k,op->opaque,op->type); + } + write(2,output,strlen(output)); +} + +//extern USPMCQueue g_trace_queue; +//static vector cid_rate; +//extern ConcurrentHashMap cid_rate; +extern unordered_map cid_rate; +//extern ConcurrentHashMap> copy_keys; +extern unordered_map> copy_keys; +extern unordered_map touch_keys; +extern unordered_map> wb_keys; +//extern ConcurrentHashMap> wb_keys; + +extern map g_key_hist; +extern int max_n[3]; + +/** + * Create a new connection to a server endpoint. + */ +ConnectionMultiApprox::ConnectionMultiApprox(struct event_base* _base, struct evdns_base* _evdns, + string _hostname1, string _hostname2, string _port, options_t _options, + bool sampling, int fd1, int fd2 ) : + start_time(0), stats(sampling), options(_options), + hostname1(_hostname1), hostname2(_hostname2), port(_port), base(_base), evdns(_evdns) +{ + pthread_mutex_lock(&cid_lock_m_approx); + cid = connids_m++; + if (cid == 1) { + init_classes(); + init_inclusives(options.inclusives); + } + cid_rate.insert( { cid, 0 } ); + + pthread_mutex_unlock(&cid_lock_m_approx); + + valuesize = createGenerator(options.valuesize); + keysize = createGenerator(options.keysize); + srand(time(NULL)); + keygen = new KeyGenerator(keysize, options.records); + + total = 0; + eof = 0; + o_percent = 0; + + if (options.lambda <= 0) { + iagen = createGenerator("0"); + } else { + D("iagen = createGenerator(%s)", options.ia); + iagen = createGenerator(options.ia); + iagen->set_lambda(options.lambda); + } + + read_state = IDLE; + write_state = INIT_WRITE; + last_quiet1 = false; + last_quiet2 = false; + + last_tx = last_rx = 0.0; + gets = 0; + ghits = 0; + esets = 0; + isets = 0; + gloc = rand() % 
(10*2-1)+1; + sloc = rand() % (10*2-1)+1; + iloc = rand() % (10*2-1)+1; + + op_queue_size = (uint32_t*)malloc(sizeof(uint32_t)*(LEVELS+1)); + opaque = (uint32_t*)malloc(sizeof(uint32_t)*(LEVELS+1)); + op_queue = (Operation***)malloc(sizeof(Operation**)*(LEVELS+1)); + + for (int i = 0; i <= LEVELS; i++) { + op_queue_size[i] = 0; + opaque[i] = 1; + //op_queue[i] = (Operation*)malloc(sizeof(int)*OPAQUE_MAX); + op_queue[i] = (Operation**)malloc(sizeof(Operation*)*(OPAQUE_MAX+1)); + for (int j = 0; j <= OPAQUE_MAX; j++) { + op_queue[i][j] = NULL; + } + + } + + + bev1 = bufferevent_socket_new(base, fd1, BEV_OPT_CLOSE_ON_FREE); + bufferevent_setcb(bev1, bev_read_cb1_approx, bev_write_cb_m_approx, bev_event_cb1_approx, this); + bufferevent_enable(bev1, EV_READ | EV_WRITE); + + bev2 = bufferevent_socket_new(base, fd2, BEV_OPT_CLOSE_ON_FREE); + bufferevent_setcb(bev2, bev_read_cb2_approx, bev_write_cb_m_approx, bev_event_cb2_approx, this); + bufferevent_enable(bev2, EV_READ | EV_WRITE); + + timer = evtimer_new(base, timer_cb_m_approx, this); + + read_state = IDLE; +} + + +void ConnectionMultiApprox::set_queue(queue* a_trace_queue) { + trace_queue = a_trace_queue; + trace_queue_n = a_trace_queue->size(); +} + +void ConnectionMultiApprox::set_lock(pthread_mutex_t* a_lock) { + lock = a_lock; +} + +void ConnectionMultiApprox::set_g_wbkeys(unordered_map> *a_wb_keys) { + g_wb_keys = a_wb_keys; +} + +uint32_t ConnectionMultiApprox::get_cid() { + return cid; +} + +int ConnectionMultiApprox::add_to_wb_keys(string key) { + auto pos = wb_keys.find(key); + if (pos == wb_keys.end()) { + wb_keys.insert( {key, vector() }); + return 1; + } + return 2; +} + +int ConnectionMultiApprox::add_to_copy_keys(string key) { + auto pos = copy_keys.find(key); + if (pos == copy_keys.end()) { + copy_keys.insert( {key, vector() }); + return 1; + } + return 2; +} + + +void ConnectionMultiApprox::del_copy_keys(string key) { + + auto position = copy_keys.find(key); + if (position != copy_keys.end()) { + 
vector op_list = vector(position->second); + copy_keys.erase(position); + for (auto it = op_list.begin(); it != op_list.end(); ++it) { + issue_op(*it); + } + } else { + fprintf(stderr,"expected %s, got nuthin\n",key.c_str()); + } +} + +int ConnectionMultiApprox::add_to_touch_keys(string key) { + //return touch_keys.assign_if_equal( key, NULL, cid ) != NULL ? 1 : 2; + auto pos = touch_keys.find(key); + if (pos == touch_keys.end()) { + touch_keys.insert( {key, cid }); + return 1; + } + return 2; +} + + +void ConnectionMultiApprox::del_touch_keys(string key) { + //touch_keys.erase(key); + auto position = touch_keys.find(key); + if (position != touch_keys.end()) { + touch_keys.erase(position); + } else { + fprintf(stderr,"expected %s, got nuthin\n",key.c_str()); + } +} + +int ConnectionMultiApprox::issue_op(Operation *Op) { + double now = get_time(); + int issued = 0; + int incl = get_incl(Op->valuelen,strlen(Op->key)); + int cid = get_class(Op->valuelen,strlen(Op->key)); + Op->clsid = cid; + int flags = 0; + int index = lrand48() % (1024 * 1024); + //int touch = 1; + SET_INCL(incl,flags); + + switch(Op->type) + { + case Operation::GET: + //if (nissued < options.depth-1) { + // issued = issue_get_with_len(key, vl, now, false, 1, flags, 0, 1); + // last_quiet1 = false; + //} else { + //} + issued = issue_get_with_len(Op, now, false, flags | LOG_OP | ITEM_L1); + last_quiet1 = false; + this->stats.gets++; + gets++; + this->stats.gets_cid[cid]++; + + break; + case Operation::SET: + if (last_quiet1) { + issue_noop(now,1); + } + if (incl == 1) { + if (isets >= iloc) { + //if (1) { + const char *data = &random_char[index]; + issued = issue_set(Op, data, now, flags | LOG_OP | ITEM_L1 | SRC_DIRECT_SET); + //int ret = add_to_touch_keys(string(Op->key)); + //if (ret == 1) { + issue_touch(Op->key,Op->valuelen,now, ITEM_L2 | SRC_DIRECT_SET); + //} + iloc += rand()%(10*2-1)+1; + } else { + issued = issue_set(Op, &random_char[index], now, flags | LOG_OP | ITEM_L1 | SRC_DIRECT_SET | 
ITEM_DIRTY); + } + isets++; + } else if (incl == 2) { + issued = issue_set(Op, &random_char[index], now, flags | LOG_OP | ITEM_L1 | SRC_DIRECT_SET); + if (esets >= sloc) { + issue_delete(Op->key,now,ITEM_L2 | SRC_DIRECT_SET); + sloc += rand()%(10*2-1)+1; + } + esets++; + } + last_quiet1 = false; + this->stats.sets++; + this->stats.sets_cid[cid]++; + break; + case Operation::DELETE: + case Operation::TOUCH: + case Operation::NOOP: + case Operation::SASL: + fprintf(stderr,"invalid line: %s, vl: %d\n",Op->key,Op->valuelen); + break; + + } + return issued; +} + +void ConnectionMultiApprox::del_wb_keys(string key) { + + auto position = wb_keys.find(key); + if (position != wb_keys.end()) { + vector op_list = vector(position->second); + wb_keys.erase(position); + for (auto it = op_list.begin(); it != op_list.end(); ++it) { + issue_op(*it); + } + } else { + fprintf(stderr,"expected %s, got nuthin\n",key.c_str()); + } +} + + +int ConnectionMultiApprox::do_connect() { + + int connected = 0; + if (options.unix_socket) { + + + struct sockaddr_un sin1; + memset(&sin1, 0, sizeof(sin1)); + sin1.sun_family = AF_LOCAL; + strcpy(sin1.sun_path, hostname1.c_str()); + + int addrlen; + addrlen = sizeof(sin1); + + int err = bufferevent_socket_connect(bev1, (struct sockaddr*)&sin1, addrlen); + if (err == 0) { + connected = 1; + } else { + connected = 0; + err = errno; + fprintf(stderr,"l1 error %s\n",strerror(err)); + } + + struct sockaddr_un sin2; + memset(&sin2, 0, sizeof(sin2)); + sin2.sun_family = AF_LOCAL; + strcpy(sin2.sun_path, hostname2.c_str()); + + addrlen = sizeof(sin2); + err = bufferevent_socket_connect(bev2, (struct sockaddr*)&sin2, addrlen); + if (err == 0) { + connected = 1; + } else { + connected = 0; + err = errno; + fprintf(stderr,"l2 error %s\n",strerror(err)); + } + } + read_state = IDLE; + return connected; +} + +/** + * Destroy a connection, performing cleanup. 
+ */ +ConnectionMultiApprox::~ConnectionMultiApprox() { + + + for (int i = 0; i <= LEVELS; i++) { + free(op_queue[i]); + + } + + free(op_queue_size); + free(opaque); + free(op_queue); + //event_free(timer); + //timer = NULL; + // FIXME: W("Drain op_q?"); + //bufferevent_free(bev1); + //bufferevent_free(bev2); + + delete iagen; + delete keygen; + delete keysize; + delete valuesize; +} + +/** + * Reset the connection back to an initial, fresh state. + */ +void ConnectionMultiApprox::reset() { + // FIXME: Actually check the connection, drain all bufferevents, drain op_q. + //assert(op_queue.size() == 0); + //evtimer_del(timer); + read_state = IDLE; + write_state = INIT_WRITE; + stats = ConnectionStats(stats.sampling); +} + +/** + * Set our event processing priority. + */ +void ConnectionMultiApprox::set_priority(int pri) { + if (bufferevent_priority_set(bev1, pri)) { + DIE("bufferevent_set_priority(bev, %d) failed", pri); + } +} + + + +/** + * Get/Set or Set Style + * If a GET command: Issue a get first, if not found then set + * If trace file (or prob. 
write) says to set, then set it + */ +int ConnectionMultiApprox::issue_getsetorset(double now) { + + + + int ret = 0; + int nissued = 0; + + //while (nissued < 1) { + + //pthread_mutex_lock(lock); + //if (!trace_queue->empty()) { + + /* check if in global wb queue */ + //double percent = (double)total/((double)trace_queue_n) * 100; + //if (percent > o_percent+2) { + // //update the percentage table and see if we should execute + // if (options.ratelimit) { + // double min_percent = 1000; + // auto it = cid_rate.begin(); + // while (it != cid_rate.end()) { + // if (it->second < min_percent) { + // min_percent = it->second; + // } + // ++it; + // } + + // if (percent > min_percent+2) { + // struct timeval tv; + // tv.tv_sec = 0; + // tv.tv_usec = 100; + // int good = 0; + // if (!event_pending(timer, EV_TIMEOUT, NULL)) { + // good = evtimer_add(timer, &tv); + // } + // if (good != 0) { + // fprintf(stderr,"eventimer is messed up!\n"); + // return 2; + // } + // return 1; + // } + // } + // cid_rate.insert( {cid, percent}); + // fprintf(stderr,"%f,%d,%.4f\n",now,cid,percent); + // o_percent = percent; + //} + // + + Operation *Op = trace_queue->front(); + //Operation *Op = g_trace_queue.dequeue(); + + if (Op == NULL || trace_queue->size() <= 0 || Op->type == Operation::SASL) { + eof = 1; + cid_rate.insert( {cid, 100 } ); + fprintf(stderr,"cid %d done\n",cid); + string op_queue1; + string op_queue2; + for (int j = 0; j < 2; j++) { + for (int i = 0; i < OPAQUE_MAX; i++) { + if (op_queue[j+1][i] != NULL) { + if (j == 0) { + op_queue1 = op_queue1 + "," + op_queue[j+1][i]->key; + } else { + op_queue2 = op_queue2 + "," + op_queue[j+1][i]->key; + } + } + } + } + fprintf(stderr,"cid %d op_queue1: %s op_queue2: %s, op_queue_size1: %d, op_queue_size2: %d\n",cid,op_queue1.c_str(),op_queue2.c_str(),op_queue_size[1],op_queue_size[2]); + return 1; + } + + trace_queue->pop(); + + + //trace_queue->pop(); + + //pthread_mutex_lock(lock); + //auto check = wb_keys.find(string(Op->key)); 
+ //if (check != wb_keys.end()) { + // check->second.push_back(Op); + // return 0; + //} + //pthread_mutex_unlock(lock); + //pthread_mutex_unlock(lock); + //struct timeval tv; + //double delay; + //delay = last_rx + 0.00025 - now; + //double_to_tv(delay,&tv); + //int good = 0; + ////if (!event_pending(timer, EV_TIMEOUT, NULL)) { + //good = evtimer_add(timer, &tv); + ////} + //if (good != 0) { + // fprintf(stderr,"eventimer is messed up in checking for key: %s\n",Op->key); + // return 2; + //} + //return 1; + //} else { + //pthread_mutex_unlock(lock); + int issued = issue_op(Op); + if (issued) { + nissued++; + total++; + } else { + fprintf(stderr,"failed to issue line: %s, vl: %d\n",Op->key,Op->valuelen); + } + //} + + //} else { + // return 1; + //} + //} + //if (last_quiet1) { + // issue_noop(now,1); + // last_quiet1 = false; + //} + + return ret; + +} + +/** + * Issue a get request to the server. + */ +int ConnectionMultiApprox::issue_get_with_len(Operation *pop, double now, bool quiet, uint32_t flags, Operation *l1) { + + //check if op is in copy_keys (currently going to L1) + //auto check = copy_keys.find(string(pop->key)); + //if (check != copy_keys.end()) { + // check->second.push_back(pop); + // return 1; + //} + + struct evbuffer *output = NULL; + int level = 0; + switch (FLAGS_level(flags)) { + case 1: + level = 1; + output = bufferevent_get_output(bev1); + break; + case 2: + level = 2; + output = bufferevent_get_output(bev2); + break; + } + +#if HAVE_CLOCK_GETTIME + pop->start_time = get_time_accurate(); +#else + if (now == 0.0) { +#if USE_CACHED_TIME + struct timeval now_tv; + event_base_gettimeofday_cached(base, &now_tv); + pop->start_time = tv_to_double(&now_tv); +#else + pop->start_time = get_time(); +#endif + } else { + pop->start_time = now; + } +#endif + + pop->opaque = opaque[level]++; + pop->flags = flags; + if (l1 != NULL) { + pop->l1 = l1; + } + + op_queue[level][pop->opaque] = pop; + //op_queue[level].push(op); + op_queue_size[level]++; 
+#ifdef DEBUGS + fprintf(stderr,"cid: %d issing get: %s, size: %u, level %d, flags: %d, opaque: %d\n",cid,pop->key,pop->valuelen,level,flags,pop->opaque); +#endif + + if (opaque[level] > OPAQUE_MAX) { + opaque[level] = 1; + } + + //if (read_state == IDLE) read_state = WAITING_FOR_GET; + uint16_t keylen = strlen(pop->key); + + + // each line is 4-bytes + binary_header_t h = { 0x80, CMD_GET, htons(keylen), + 0x00, 0x00, htons(0), + htonl(keylen) }; + if (quiet) { + h.opcode = CMD_GETQ; + } + h.opaque = htonl(pop->opaque); + + + evbuffer_add(output, &h, 24); + evbuffer_add(output, pop->key, keylen); + + stats.tx_bytes += 24 + keylen; + return 1; +} + +/** + * Issue a get request to the server. + */ +int ConnectionMultiApprox::issue_get_with_len(const char* key, int valuelen, double now, bool quiet, uint32_t flags, Operation *l1) { + + struct evbuffer *output = NULL; + int level = 0; + switch (FLAGS_level(flags)) { + case 1: + level = 1; + output = bufferevent_get_output(bev1); + break; + case 2: + level = 2; + output = bufferevent_get_output(bev2); + break; + } + //Operation op; + Operation *pop = new Operation(); + +#if HAVE_CLOCK_GETTIME + pop->start_time = get_time_accurate(); +#else + if (now == 0.0) { +#if USE_CACHED_TIME + struct timeval now_tv; + event_base_gettimeofday_cached(base, &now_tv); + pop->start_time = tv_to_double(&now_tv); +#else + pop->start_time = get_time(); +#endif + } else { + pop->start_time = now; + } +#endif + + strncpy(pop->key,key,255); + pop->valuelen = valuelen; + pop->type = Operation::GET; + pop->opaque = opaque[level]++; + pop->flags = flags; + pop->clsid = get_class(valuelen,strlen(key)); + if (l1 != NULL) { + pop->l1 = l1; + } + op_queue[level][pop->opaque] = pop; + //op_queue[level].push(op); + op_queue_size[level]++; + +#ifdef DEBUGS + fprintf(stderr,"cid: %d issing get: %s, size: %u, level %d, flags: %d, opaque: %d\n",cid,key,valuelen,level,flags,pop->opaque); +#endif + + if (opaque[level] > OPAQUE_MAX) { + opaque[level] = 1; + } 
+ + //if (read_state == IDLE) read_state = WAITING_FOR_GET; + uint16_t keylen = strlen(key); + + // each line is 4-bytes + binary_header_t h = { 0x80, CMD_GET, htons(keylen), + 0x00, 0x00, htons(0), + htonl(keylen) }; + if (quiet) { + h.opcode = CMD_GETQ; + } + h.opaque = htonl(pop->opaque); + + evbuffer_add(output, &h, 24); + evbuffer_add(output, key, keylen); + + stats.tx_bytes += 24 + keylen; + return 1; +} + +/** + * Issue a get request to the server. + */ +int ConnectionMultiApprox::issue_touch(const char* key, int valuelen, double now, int flags) { + struct evbuffer *output = NULL; + int level = 0; + switch (FLAGS_level(flags)) { + case 1: + level = 1; + output = bufferevent_get_output(bev1); + break; + case 2: + level = 2; + output = bufferevent_get_output(bev2); + break; + } + Operation *pop = new Operation(); + +#if HAVE_CLOCK_GETTIME + pop->start_time = get_time_accurate(); +#else + if (now == 0.0) { +#if USE_CACHED_TIME + struct timeval now_tv; + event_base_gettimeofday_cached(base, &now_tv); + pop->start_time = tv_to_double(&now_tv); +#else + pop->start_time = get_time(); +#endif + } else { + pop->start_time = now; + } +#endif + + strncpy(pop->key,key,255); + pop->valuelen = valuelen; + pop->type = Operation::TOUCH; + pop->opaque = opaque[level]++; + op_queue[level][pop->opaque] = pop; + op_queue_size[level]++; + + pop->flags = flags; + + if (opaque[level] > OPAQUE_MAX) { + opaque[level] = 1; + } + +#ifdef DEBUGS + fprintf(stderr,"issing touch: %s, size: %u, level %d, flags: %d, opaque: %d\n",key,valuelen,level,flags,pop->opaque); +#endif + //if (read_state == IDLE) read_state = WAITING_FOR_GET; + uint16_t keylen = strlen(key); + + // each line is 4-bytes + binary_header_t h = { 0x80, CMD_TOUCH, htons(keylen), + 0x04, 0x00, htons(0), + htonl(keylen + 4) }; + h.opaque = htonl(pop->opaque); + + uint32_t exp = 0; + if (flags & ITEM_DIRTY) { + exp = htonl(flags); + } + evbuffer_add(output, &h, 24); + evbuffer_add(output, &exp, 4); + evbuffer_add(output, 
key, keylen); + + + stats.tx_bytes += 24 + keylen; + + //stats.log_access(op); + return 1; +} + +/** + * Issue a delete request to the server. + */ +int ConnectionMultiApprox::issue_delete(const char* key, double now, uint32_t flags) { + struct evbuffer *output = NULL; + int level = 0; + switch (FLAGS_level(flags)) { + case 1: + level = 1; + output = bufferevent_get_output(bev1); + break; + case 2: + level = 2; + output = bufferevent_get_output(bev2); + break; + } + //Operation op; + Operation *pop = new Operation(); + +#if HAVE_CLOCK_GETTIME + pop->start_time = get_time_accurate(); +#else + if (now == 0.0) { +#if USE_CACHED_TIME + struct timeval now_tv; + event_base_gettimeofday_cached(base, &now_tv); + pop->start_time = tv_to_double(&now_tv); +#else + pop->start_time = get_time(); +#endif + } else { + pop->start_time = now; + } +#endif + + strncpy(pop->key,key,255); + pop->type = Operation::DELETE; + pop->opaque = opaque[level]++; + pop->flags = flags; + op_queue[level][pop->opaque] = pop; + //op_queue[level].push(op); + op_queue_size[level]++; + + if (opaque[level] > OPAQUE_MAX) { + opaque[level] = 1; + } +#ifdef DEBUGS + fprintf(stderr,"cid: %d issing delete: %s, level %d, flags: %d, opaque: %d\n",cid,key,level,flags,pop->opaque); +#endif + + //if (read_state == IDLE) read_state = WAITING_FOR_GET; + uint16_t keylen = strlen(key); + + // each line is 4-bytes + binary_header_t h = { 0x80, CMD_DELETE, htons(keylen), + 0x00, 0x00, htons(0), + htonl(keylen) }; + h.opaque = htonl(pop->opaque); + + evbuffer_add(output, &h, 24); + evbuffer_add(output, key, keylen); + + stats.tx_bytes += 24 + keylen; + + //stats.log_access(op); + return 1; +} + +void ConnectionMultiApprox::issue_noop(double now, int level) { + struct evbuffer *output = NULL; + switch (level) { + case 1: + output = bufferevent_get_output(bev1); + break; + case 2: + output = bufferevent_get_output(bev2); + break; + } + Operation op; + + if (now == 0.0) op.start_time = get_time(); + else op.start_time = 
now; + + binary_header_t h = { 0x80, CMD_NOOP, 0x0000, + 0x00, 0x00, htons(0), + 0x00 }; + + evbuffer_add(output, &h, 24); + +} + +/** + * Issue a set request to the server. + */ +int ConnectionMultiApprox::issue_set(Operation *pop, const char* value, double now, uint32_t flags) { + + //check if op is in copy_keys (currently going to L1) + //auto check = copy_keys.find(string(pop->key)); + //if (check != copy_keys.end()) { + // check->second.push_back(pop); + // return 1; + //} + + struct evbuffer *output = NULL; + int level = 0; + int length = pop->valuelen; + switch (FLAGS_level(flags)) { + case 1: + level = 1; + output = bufferevent_get_output(bev1); + break; + case 2: + level = 2; + output = bufferevent_get_output(bev2); + break; + } + +#if HAVE_CLOCK_GETTIME + pop->start_time = get_time_accurate(); +#else + if (now == 0.0) pop->start_time = get_time(); + else pop->start_time = now; +#endif + + pop->opaque = opaque[level]++; + pop->flags = flags; + op_queue[level][pop->opaque] = pop; + //op_queue[level].push(op); + op_queue_size[level]++; +#ifdef DEBUGS + fprintf(stderr,"cid: %d issing set: %s, size: %u, level %d, flags: %d, opaque: %d\n",cid,pop->key,length,level,flags,pop->opaque); +#endif + + if (opaque[level] > OPAQUE_MAX) { + opaque[level] = 1; + } + + uint16_t keylen = strlen(pop->key); + + + // each line is 4-bytes + binary_header_t h = { 0x80, CMD_SET, htons(keylen), + 0x08, 0x00, htons(0), + htonl(keylen + 8 + length) }; + h.opaque = htonl(pop->opaque); + + uint32_t f = htonl(flags); + uint32_t exp = 0; + + evbuffer_add(output, &h, 24); + evbuffer_add(output, &f, 4); + evbuffer_add(output, &exp, 4); + evbuffer_add(output, pop->key, keylen); + evbuffer_add(output, value, length); + + stats.tx_bytes += length + 32 + keylen; + return 1; +} + +/** + * Issue a set request to the server. 
+ */ +int ConnectionMultiApprox::issue_set(const char* key, const char* value, int length, double now, uint32_t flags) { + + struct evbuffer *output = NULL; + int level = 0; + switch (FLAGS_level(flags)) { + case 1: + level = 1; + output = bufferevent_get_output(bev1); + break; + case 2: + level = 2; + output = bufferevent_get_output(bev2); + break; + } + //Operation op; + Operation *pop = new Operation(); + +#if HAVE_CLOCK_GETTIME + pop->start_time = get_time_accurate(); +#else + if (now == 0.0) pop->start_time = get_time(); + else pop->start_time = now; +#endif + + strncpy(pop->key,key,255); + pop->valuelen = length; + pop->type = Operation::SET; + pop->opaque = opaque[level]++; + pop->flags = flags; + pop->clsid = get_class(length,strlen(key)); + op_queue[level][pop->opaque] = pop; + //op_queue[level].push(op); + op_queue_size[level]++; +#ifdef DEBUGS + fprintf(stderr,"cid: %d issing set: %s, size: %u, level %d, flags: %d, opaque: %d\n",cid,key,length,level,flags,pop->opaque); +#endif + + if (opaque[level] > OPAQUE_MAX) { + opaque[level] = 1; + } + + uint16_t keylen = strlen(key); + + // each line is 4-bytes + binary_header_t h = { 0x80, CMD_SET, htons(keylen), + 0x08, 0x00, htons(0), + htonl(keylen + 8 + length) }; + h.opaque = htonl(pop->opaque); + + uint32_t f = htonl(flags); + uint32_t exp = 0; + + evbuffer_add(output, &h, 24); + evbuffer_add(output, &f, 4); + evbuffer_add(output, &exp, 4); + evbuffer_add(output, key, keylen); + evbuffer_add(output, value, length); + + stats.tx_bytes += length + 32 + keylen; + return 1; +} + +/** + * Return the oldest live operation in progress. + */ +void ConnectionMultiApprox::pop_op(Operation *op) { + + uint8_t level = OP_level(op); + //op_queue[level].erase(op); + op_queue_size[level]--; + + + if (read_state == LOADING) return; + read_state = IDLE; + + // Advance the read state machine. 
+ //if (op_queue.size() > 0) { + // Operation& op = op_queue.front(); + // switch (op.type) { + // case Operation::GET: read_state = WAITING_FOR_GET; break; + // case Operation::SET: read_state = WAITING_FOR_SET; break; + // case Operation::DELETE: read_state = WAITING_FOR_DELETE; break; + // default: DIE("Not implemented."); + // } + //} +} + +/** + * Finish up (record stats) an operation that just returned from the + * server. + */ +void ConnectionMultiApprox::finish_op(Operation *op, int was_hit) { + double now; +#if USE_CACHED_TIME + struct timeval now_tv; + event_base_gettimeofday_cached(base, &now_tv); + now = tv_to_double(&now_tv); +#else + now = get_time(); +#endif +#if HAVE_CLOCK_GETTIME + op->end_time = get_time_accurate(); +#else + op->end_time = now; +#endif + + if (was_hit) { + switch (op->type) { + case Operation::GET: + switch (OP_level(op)) { + case 1: + stats.log_get_l1(*op); + break; + case 2: + stats.log_get_l2(*op); + if (op->l1 != NULL) { + op->l1->end_time = now; + stats.log_get(*(op->l1)); + } + break; + } + break; + case Operation::SET: + switch (OP_level(op)) { + case 1: + stats.log_set_l1(*op); + break; + case 2: + stats.log_set_l2(*op); + break; + } + break; + case Operation::DELETE: break; + case Operation::TOUCH: break; + default: DIE("Not implemented."); + } + } + //} else { + // switch (op->type) { + // case Operation::GET: + // if (OP_log(op)) { + // switch (OP_level(op)) { + // case 1: + // stats.log_get_l1(*op); + // break; + // case 2: + // stats.log_get_l2(*op); + // if (op->l1 != NULL) { + // op->l1->end_time = now; + // stats.log_get(*(op->l1)); + // } + // break; + // } + // } + // break; + // case Operation::SET: + // if (OP_log(op)) { + // switch (OP_level(op)) { + // case 1: + // stats.log_set_l1(*op); + // break; + // case 2: + // stats.log_set_l2(*op); + // break; + // } + // } + // break; + // case Operation::DELETE: break; + // case Operation::TOUCH: break; + // default: DIE("Not implemented."); + // } + //} + + last_rx 
= now; + uint8_t level = OP_level(op); + if (op->l1 != NULL) { + //delete op_queue[1][op->l1->opaque]; + op_queue[1][op->l1->opaque] = 0; + op_queue_size[1]--; + delete op->l1; + } + //op_queue[level].erase(op_queue[level].begin()+opopq); + if (op == op_queue[level][op->opaque] && + op->opaque == op_queue[level][op->opaque]->opaque) { + //delete op_queue[level][op->opaque]; + op_queue[level][op->opaque] = 0; + delete op; + } else { + fprintf(stderr,"op_queue out of sync! Expected %p, got %p, opa1: %d opaq2: %d\n", + op,op_queue[level][op->opaque],op->opaque,op_queue[level][op->opaque]->opaque); + } + op_queue_size[level]--; + read_state = IDLE; + + +} + + + +/** + * Check if our testing is done and we should exit. + */ +bool ConnectionMultiApprox::check_exit_condition(double now) { + if (eof && op_queue_size[1] == 0 && op_queue_size[2] == 0) { + return true; + } + if (read_state == INIT_READ) return false; + + return false; +} + +/** + * Handle new connection and error events. + */ +void ConnectionMultiApprox::event_callback1(short events) { + if (events & BEV_EVENT_CONNECTED) { + D("Connected to %s:%s.", hostname1.c_str(), port.c_str()); + int fd = bufferevent_getfd(bev1); + if (fd < 0) DIE("bufferevent_getfd"); + + if (!options.no_nodelay && !options.unix_socket) { + int one = 1; + if (setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, + (void *) &one, sizeof(one)) < 0) + DIE("setsockopt()"); + } +#ifdef DEBUGMC + fprintf(stderr,"libevent connected %s, fd: %u\n",hostname1.c_str(),bufferevent_getfd(bev1)); +#endif + + + } else if (events & BEV_EVENT_ERROR) { + int err = bufferevent_socket_get_dns_error(bev1); + //if (err) DIE("DNS error: %s", evutil_gai_strerror(err)); + if (err) fprintf(stderr,"DNS error: %s", evutil_gai_strerror(err)); + fprintf(stderr,"CID: %d - Got an error: %s\n",this->cid, + evutil_socket_error_to_string(EVUTIL_SOCKET_ERROR())); + + //DIE("BEV_EVENT_ERROR: %s", strerror(errno)); + + } else if (events & BEV_EVENT_EOF) { + fprintf(stderr,"Unexpected EOF 
from server."); + return; + } +} + +/** + * Handle new connection and error events. + */ +void ConnectionMultiApprox::event_callback2(short events) { + if (events & BEV_EVENT_CONNECTED) { + D("Connected to %s:%s.", hostname2.c_str(), port.c_str()); + int fd = bufferevent_getfd(bev2); + if (fd < 0) DIE("bufferevent_getfd"); + + if (!options.no_nodelay && !options.unix_socket) { + int one = 1; + if (setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, + (void *) &one, sizeof(one)) < 0) + DIE("setsockopt()"); + } +#ifdef DEBUGMC + fprintf(stderr,"libevent connected %s, fd: %u\n",hostname2.c_str(),bufferevent_getfd(bev2)); +#endif + + + } else if (events & BEV_EVENT_ERROR) { + int err = bufferevent_socket_get_dns_error(bev2); + //if (err) DIE("DNS error: %s", evutil_gai_strerror(err)); + if (err) fprintf(stderr,"DNS error: %s", evutil_gai_strerror(err)); + fprintf(stderr,"CID: %d - Got an error: %s\n",this->cid, + evutil_socket_error_to_string(EVUTIL_SOCKET_ERROR())); + + //DIE("BEV_EVENT_ERROR: %s", strerror(errno)); + + + } else if (events & BEV_EVENT_EOF) { + fprintf(stderr,"Unexpected EOF from server."); + return; + } +} + +/** + * Request generation loop. Determines whether or not to issue a new command, + * based on timer events. + * + * Note that this function loops. Be wary of break vs. return. 
+ */ +void ConnectionMultiApprox::drive_write_machine(double now) { + + if (now == 0.0) now = get_time(); + double delay; + struct timeval tv; + + if (check_exit_condition(now)) { + return; + } + + while (1) { + switch (write_state) { + case INIT_WRITE: + delay = iagen->generate(); + next_time = now + delay; + double_to_tv(delay, &tv); + evtimer_add(timer, &tv); + write_state = ISSUING; + break; + + case ISSUING: + if ( (op_queue_size[1] >= (size_t) options.depth) || + (op_queue_size[2] >= (size_t) options.depth) ) { + write_state = WAITING_FOR_OPQ; + break; + } + + if (options.getsetorset) { + int ret = issue_getsetorset(now); + if (ret == 1) return; //if at EOF + } + + last_tx = now; + for (int i = 1; i <= 2; i++) { + stats.log_op(op_queue_size[i]); + } + break; + + case WAITING_FOR_TIME: + write_state = ISSUING; + break; + + case WAITING_FOR_OPQ: + if ( (op_queue_size[1] >= (size_t) options.depth) || + (op_queue_size[2] >= (size_t) options.depth) ) { + //double delay = 0.01; + //struct timeval tv; + //double_to_tv(delay, &tv); + //evtimer_add(timer, &tv); + return; + } else { + write_state = ISSUING; + break; + } + + default: DIE("Not implemented"); + } + } +} + + + +/** + * Tries to consume a binary response (in its entirety) from an evbuffer. + * + * @param input evBuffer to read response from + * @return true if consumed, false if not enough data in buffer. 
+ */ +static bool handle_response(ConnectionMultiApprox *conn, evbuffer *input, bool &done, bool &found, int &opcode, uint32_t &opaque, evicted_t *evict, int level) { + // Read the first 24 bytes as a header + int length = evbuffer_get_length(input); + if (length < 24) return false; + binary_header_t* h = + reinterpret_cast(evbuffer_pullup(input, 24)); + //assert(h); + + uint32_t bl = ntohl(h->body_len); + uint16_t kl = ntohs(h->key_len); + uint8_t el = h->extra_len; + // Not whole response + int targetLen = 24 + bl; + if (length < targetLen) { + return false; + } + + opcode = h->opcode; + opaque = ntohl(h->opaque); + uint16_t status = ntohs(h->status); +#ifdef DEBUGMC + fprintf(stderr,"cid: %d handle resp from l%d - opcode: %u opaque: %u keylen: %u extralen: %u datalen: %u status: %u\n",conn->get_cid(),level, + h->opcode,ntohl(h->opaque),ntohs(h->key_len),h->extra_len, + ntohl(h->body_len),ntohs(h->status)); +#endif + + + // If something other than success, count it as a miss + if (opcode == CMD_GET && status == RESP_NOT_FOUND) { + switch(level) { + case 1: + conn->stats.get_misses_l1++; + break; + case 2: + conn->stats.get_misses_l2++; + conn->stats.get_misses++; + conn->stats.window_get_misses++; + break; + + } + found = false; + evbuffer_drain(input, targetLen); + + } else if (opcode == CMD_SET && kl > 0) { + //first data is extras: clsid, flags, eflags + if (evict) { + evbuffer_drain(input,24); + unsigned char *buf = evbuffer_pullup(input,bl); + + + evict->clsid = *((uint32_t*)buf); + evict->clsid = ntohl(evict->clsid); + buf += 4; + + evict->serverFlags = *((uint32_t*)buf); + evict->serverFlags = ntohl(evict->serverFlags); + buf += 4; + + evict->evictedFlags = *((uint32_t*)buf); + evict->evictedFlags = ntohl(evict->evictedFlags); + buf += 4; + + + evict->evictedKeyLen = kl; + evict->evictedKey = (char*)malloc(kl+1); + memset(evict->evictedKey,0,kl+1); + memcpy(evict->evictedKey,buf,kl); + buf += kl; + + + evict->evictedLen = bl - kl - el; + evict->evictedData 
= (char*)malloc(evict->evictedLen); + memcpy(evict->evictedData,buf,evict->evictedLen); + evict->evicted = true; + //fprintf(stderr,"class: %u, serverFlags: %u, evictedFlags: %u\n",evict->clsid,evict->serverFlags,evict->evictedFlags); + evbuffer_drain(input,bl); + } else { + evbuffer_drain(input, targetLen); + } + } else if (opcode == CMD_TOUCH && status == RESP_NOT_FOUND) { + found = false; + evbuffer_drain(input, targetLen); + } else if (opcode == CMD_DELETE && status == RESP_NOT_FOUND) { + found = false; + evbuffer_drain(input, targetLen); + } else { + evbuffer_drain(input, targetLen); + } + + conn->stats.rx_bytes += targetLen; + done = true; + return true; +} + +/** + * Handle incoming data (responses). + */ +void ConnectionMultiApprox::read_callback1() { + struct evbuffer *input = bufferevent_get_input(bev1); + + Operation *op = NULL; + bool done, found; + + //initially assume found (for sets that may come through here) + //is this correct? do we want to assume true in case that + //GET was found, but wrong value size (i.e. 
update value) + found = true; + + //if (op_queue.size() == 0) V("Spurious read callback."); + bool full_read = true; + while (full_read) { + + + int opcode; + uint32_t opaque; + evicted_t *evict = (evicted_t*)malloc(sizeof(evicted_t)); + memset(evict,0,sizeof(evicted_t)); + + full_read = handle_response(this,input, done, found, opcode, opaque, evict,1); + if (full_read) { + if (opcode == CMD_NOOP) { +#ifdef DEBUGMC + char out[128]; + sprintf(out,"conn l1: %u, reading noop\n",cid); + write(2,out,strlen(out)); +#endif + if (evict->evictedKey) free(evict->evictedKey); + if (evict->evictedData) free(evict->evictedData); + free(evict); + continue; + } + op = op_queue[1][opaque]; +#ifdef DEBUGMC + char out[128]; + sprintf(out,"conn l1: %u, reading opaque: %u\n",cid,opaque); + write(2,out,strlen(out)); + output_op(op,2,found); +#endif + if (strlen(op->key) < 1) { +#ifdef DEBUGMC + char out2[128]; + sprintf(out2,"conn l1: %u, bad op: %s\n",cid,op->key); + write(2,out2,strlen(out2)); +#endif + if (evict->evictedKey) free(evict->evictedKey); + if (evict->evictedData) free(evict->evictedData); + free(evict); + continue; + } + } else { + if (evict) { + if (evict->evictedKey) free(evict->evictedKey); + if (evict->evictedData) free(evict->evictedData); + free(evict); + } + break; + } + + + double now = get_time(); + int wb = 0; + if (options.rand_admit) { + wb = (rand() % options.rand_admit); + } + switch (op->type) { + case Operation::GET: + if (done) { + + int vl = op->valuelen; + if ( !found && (options.getset || options.getsetorset) ) { + /* issue a get a l2 */ + int flags = OP_clu(op); + issue_get_with_len(op->key,vl,now,false, flags | SRC_L1_M | ITEM_L2 | LOG_OP, op); + op->end_time = now; + this->stats.log_get_l1(*op); + //finish_op(op,0); + + } else { + if (OP_incl(op) && ghits >= gloc) { + //int ret = add_to_touch_keys(string(op->key)); + //if (ret == 1) { + issue_touch(op->key,vl,now, ITEM_L2 | SRC_L1_H); + //} + gloc += rand()%(10*2-1)+1; + } + ghits++; + 
finish_op(op,1); + } + } else { + char out[128]; + sprintf(out,"conn l1: %u, not done reading, should do something",cid); + write(2,out,strlen(out)); + } + break; + case Operation::SET: + //if (OP_src(op) == SRC_L1_COPY || + // OP_src(op) == SRC_L2_M) { + // del_copy_keys(string(op->key)); + //} + if (evict->evicted) { + string wb_key(evict->evictedKey); + if ((evict->evictedFlags & ITEM_INCL) && (evict->evictedFlags & ITEM_DIRTY)) { + //int ret = add_to_wb_keys(wb_key); + //if (ret == 1) { + issue_set(evict->evictedKey, evict->evictedData, evict->evictedLen, now, ITEM_L2 | ITEM_INCL | LOG_OP | SRC_WB | ITEM_DIRTY); + //} + this->stats.incl_wbs++; + } else if (evict->evictedFlags & ITEM_EXCL) { + //fprintf(stderr,"excl writeback %s\n",evict->evictedKey); + //strncpy(wb_key,evict->evictedKey,255); + if ( (options.rand_admit && wb == 0) || + (options.threshold && (g_key_hist[wb_key] == 1)) || + (options.wb_all) ) { + //int ret = add_to_wb_keys(wb_key); + //if (ret == 1) { + issue_set(evict->evictedKey, evict->evictedData, evict->evictedLen, now, ITEM_L2 | ITEM_EXCL | LOG_OP | SRC_WB); + //} + this->stats.excl_wbs++; + } + } + if (OP_src(op) == SRC_DIRECT_SET) { + if ( (evict->serverFlags & ITEM_SIZE_CHANGE) || ((evict->serverFlags & ITEM_WAS_HIT) == 0)) { + this->stats.set_misses_l1++; + } else if (OP_excl(op) && evict->serverFlags & ITEM_WAS_HIT) { + this->stats.set_excl_hits_l1++; + } else if (OP_incl(op) && evict->serverFlags & ITEM_WAS_HIT) { + this->stats.set_incl_hits_l1++; + } + } + } + finish_op(op,1); + break; + case Operation::TOUCH: + finish_op(op,1); + break; + default: + fprintf(stderr,"op: %p, key: %s opaque: %u\n",(void*)op,op->key,op->opaque); + DIE("not implemented"); + } + + if (evict) { + if (evict->evictedKey) free(evict->evictedKey); + if (evict->evictedData) free(evict->evictedData); + free(evict); + } + + } + + + double now = get_time(); + if (check_exit_condition(now)) { + return; + } + + last_tx = now; + stats.log_op(op_queue_size[1]); + 
stats.log_op(op_queue_size[2]); + //for (int i = 1; i <= 2; i++) { + // fprintf(stderr,"max issue buf n[%d]: %u\n",i,max_n[i]); + //} + drive_write_machine(); + + // update events + //if (bev != NULL) { + // // no pending response (nothing to read) and output buffer empty (nothing to write) + // if ((op_queue.size() == 0) && (evbuffer_get_length(bufferevent_get_output(bev)) == 0)) { + // bufferevent_disable(bev, EV_WRITE|EV_READ); + // } + //} +} + +/** + * Handle incoming data (responses). + */ +void ConnectionMultiApprox::read_callback2() { + struct evbuffer *input = bufferevent_get_input(bev2); + + Operation *op = NULL; + bool done, found; + + //initially assume found (for sets that may come through here) + //is this correct? do we want to assume true in case that + //GET was found, but wrong value size (i.e. update value) + found = true; + + + //if (op_queue.size() == 0) V("Spurious read callback."); + bool full_read = true; + while (full_read) { + + + int opcode; + uint32_t opaque; + full_read = handle_response(this,input, done, found, opcode, opaque, NULL,2); + if (full_read) { + if (opcode == CMD_NOOP) { +#ifdef DEBUGMC + char out[128]; + sprintf(out,"conn l2: %u, reading noop\n",cid); + write(2,out,strlen(out)); +#endif + continue; + } + op = op_queue[2][opaque]; +#ifdef DEBUGMC + char out[128]; + sprintf(out,"conn l2: %u, reading opaque: %u\n",cid,opaque); + write(2,out,strlen(out)); + output_op(op,2,found); +#endif + if (strlen(op->key) < 1) { +#ifdef DEBUGMC + char out2[128]; + sprintf(out2,"conn l2: %u, bad op: %s\n",cid,op->key); + write(2,out2,strlen(out2)); +#endif + continue; + } + } else { + break; + } + + + double now = get_time(); + switch (op->type) { + case Operation::GET: + if (done) { + if ( !found && (options.getset || options.getsetorset) ) {// && + //(options.twitter_trace != 1)) { + int valuelen = op->valuelen; + int index = lrand48() % (1024 * 1024); + int flags = OP_clu(op) | SRC_L2_M | LOG_OP; + //int ret = 
add_to_copy_keys(string(op->key)); + //if (ret == 1) { + issue_set(op->key, &random_char[index], valuelen, now, flags | ITEM_L1); + if (OP_incl(op)) { + issue_set(op->key, &random_char[index], valuelen, now, flags | ITEM_L2); + last_quiet2 = false; + } + //} + last_quiet1 = false; + finish_op(op,0); // sets read_state = IDLE + + } else { + if (found) { + int valuelen = op->valuelen; + int index = lrand48() % (1024 * 1024); + int flags = OP_clu(op) | ITEM_L1 | SRC_L1_COPY; + string key = string(op->key); + const char *data = &random_char[index]; + //int ret = add_to_copy_keys(string(op->key)); + //if (ret == 1) { + issue_set(op->key,data,valuelen, now, flags); + //} + this->stats.copies_to_l1++; + //djb: this is automatically done in the L2 server + //if (OP_excl(op)) { //djb: todo should we delete here for approx or just let it die a slow death? + // issue_delete(op->key,now, ITEM_L2 | SRC_L1_COPY ); + //} + finish_op(op,1); + + } else { + finish_op(op,0); + } + } + } else { + char out[128]; + sprintf(out,"conn l2: %u, not done reading, should do something",cid); + write(2,out,strlen(out)); + } + break; + case Operation::SET: + //if (OP_src(op) == SRC_WB) { + // del_wb_keys(string(op->key)); + //} + finish_op(op,1); + break; + case Operation::TOUCH: + if (OP_src(op) == SRC_DIRECT_SET || SRC_L1_H) { + int valuelen = op->valuelen; + if (!found) { + int index = lrand48() % (1024 * 1024); + issue_set(op->key, &random_char[index],valuelen,now, ITEM_INCL | ITEM_L2 | LOG_OP | SRC_L2_M); + this->stats.set_misses_l2++; + } else { + if (OP_src(op) == SRC_DIRECT_SET) { + issue_touch(op->key,valuelen,now, ITEM_L1 | SRC_L2_H | ITEM_DIRTY); + } + } + //del_touch_keys(string(op->key)); + } + finish_op(op,0); + break; + case Operation::DELETE: + //check to see if it was a hit + //fprintf(stderr," del %s -- %d from %d\n",op->key.c_str(),found,OP_src(op)); + if (OP_src(op) == SRC_DIRECT_SET) { + if (found) { + this->stats.delete_hits_l2++; + } else { + 
this->stats.delete_misses_l2++; + } + } + finish_op(op,1); + break; + default: + fprintf(stderr,"op: %p, key: %s opaque: %u\n",(void*)op,op->key,op->opaque); + DIE("not implemented"); + } + + } + + double now = get_time(); + if (check_exit_condition(now)) { + return; + } + + last_tx = now; + stats.log_op(op_queue_size[2]); + stats.log_op(op_queue_size[1]); + drive_write_machine(); + + // update events + //if (bev != NULL) { + // // no pending response (nothing to read) and output buffer empty (nothing to write) + // if ((op_queue.size() == 0) && (evbuffer_get_length(bufferevent_get_output(bev)) == 0)) { + // bufferevent_disable(bev, EV_WRITE|EV_READ); + // } + //} +} + +/** + * Callback called when write requests finish. + */ +void ConnectionMultiApprox::write_callback() { + + //fprintf(stderr,"loaded evbuffer with ops: %u\n",op_queue.size()); +} + +/** + * Callback for timer timeouts. + */ +void ConnectionMultiApprox::timer_callback() { + //fprintf(stderr,"timer up: %d\n",cid); + drive_write_machine(); +} + + +/* The follow are C trampolines for libevent callbacks. */ +void bev_event_cb1_approx(struct bufferevent *bev, short events, void *ptr) { + + ConnectionMultiApprox* conn = (ConnectionMultiApprox*) ptr; + conn->event_callback1(events); +} + +/* The follow are C trampolines for libevent callbacks. 
*/ +void bev_event_cb2_approx(struct bufferevent *bev, short events, void *ptr) { + + ConnectionMultiApprox* conn = (ConnectionMultiApprox*) ptr; + conn->event_callback2(events); +} + +void bev_read_cb1_approx(struct bufferevent *bev, void *ptr) { + ConnectionMultiApprox* conn = (ConnectionMultiApprox*) ptr; + conn->read_callback1(); +} + + +void bev_read_cb2_approx(struct bufferevent *bev, void *ptr) { + ConnectionMultiApprox* conn = (ConnectionMultiApprox*) ptr; + conn->read_callback2(); +} + +void bev_write_cb_m_approx(struct bufferevent *bev, void *ptr) { +} + +void timer_cb_m_approx(evutil_socket_t fd, short what, void *ptr) { + ConnectionMultiApprox* conn = (ConnectionMultiApprox*) ptr; + conn->timer_callback(); +} + diff --git a/ConnectionMultiApproxBatch.cc b/ConnectionMultiApproxBatch.cc new file mode 100644 index 0000000..16de236 --- /dev/null +++ b/ConnectionMultiApproxBatch.cc @@ -0,0 +1,2187 @@ +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + + +#include "config.h" + +#include "Connection.h" +#include "distributions.h" +#include "Generator.h" +#include "mutilate.h" +#include "binary_protocol.h" +#include "util.h" +#include +#include +#include +#include +#include +#include "blockingconcurrentqueue.h" + +//#include +//#include + +#define ITEM_L1 1 +#define ITEM_L2 2 +#define LOG_OP 4 +#define SRC_L1_M 8 +#define SRC_L1_H 16 +#define SRC_L2_M 32 +#define SRC_L2_H 64 +#define SRC_DIRECT_SET 128 +#define SRC_L1_COPY 256 +#define SRC_WB 512 + +#define ITEM_INCL 4096 +#define ITEM_EXCL 8192 +#define ITEM_DIRTY 16384 +#define ITEM_SIZE_CHANGE 131072 +#define ITEM_WAS_HIT 262144 + +#define LEVELS 2 +#define SET_INCL(incl,flags) \ + switch (incl) { \ + case 1: \ + flags |= ITEM_INCL; \ + break; \ + case 2: \ + flags |= ITEM_EXCL; \ + break; \ + \ + } \ + +#define GET_INCL(incl,flags) \ + if (flags & ITEM_INCL) incl = 1; \ + else if (flags & ITEM_EXCL) incl = 2; \ + +//#define OP_level(op) 
( ((op)->flags & ITEM_L1) ? ITEM_L1 : ITEM_L2 ) +#define OP_level(op) ( (op)->flags & ~(LOG_OP | \ + ITEM_INCL | ITEM_EXCL | ITEM_DIRTY | \ + SRC_L1_M | SRC_L1_H | SRC_L2_M | SRC_L2_H | \ + SRC_DIRECT_SET | SRC_L1_COPY | SRC_WB ) ) + +#define FLAGS_level(flags) ( flags & ~(LOG_OP | \ + ITEM_INCL | ITEM_EXCL | ITEM_DIRTY | \ + SRC_L1_M | SRC_L1_H | SRC_L2_M | SRC_L2_H | \ + SRC_DIRECT_SET | SRC_L1_COPY | SRC_WB ) ) + +#define OP_clu(op) ( (op)->flags & ~(LOG_OP | \ + ITEM_L1 | ITEM_L2 | ITEM_DIRTY | \ + SRC_L1_M | SRC_L1_H | SRC_L2_M | SRC_L2_H | \ + SRC_DIRECT_SET | SRC_L1_COPY | SRC_WB ) ) + +#define OP_src(op) ( (op)->flags & ~(ITEM_L1 | ITEM_L2 | LOG_OP | \ + ITEM_INCL | ITEM_EXCL | ITEM_DIRTY ) ) + +#define OP_log(op) ((op)->flags & LOG_OP) +#define OP_incl(op) ((op)->flags & ITEM_INCL) +#define OP_excl(op) ((op)->flags & ITEM_EXCL) +#define OP_set_flag(op,flag) ((op))->flags |= flag; + +//#define DEBUGMC +//#define DEBUGS + + + +pthread_mutex_t cid_lock_m_approx_batch = PTHREAD_MUTEX_INITIALIZER; +static uint32_t connids_m = 1; + +#define NCLASSES 40 +#define CHUNK_ALIGN_BYTES 8 +static int classes = 0; +static int sizes[NCLASSES+1]; +static int inclusives[NCLASSES+1]; + + + +static void init_inclusives(char *inclusive_str) { + int j = 1; + for (int i = 0; i < (int)strlen(inclusive_str); i++) { + if (inclusive_str[i] == '-') { + continue; + } else { + inclusives[j] = inclusive_str[i] - '0'; + j++; + } + } +} + +static void init_classes() { + + double factor = 1.25; + //unsigned int chunk_size = 48; + //unsigned int item_size = 24; + unsigned int size = 96; //warning if you change this you die + unsigned int i = 0; + unsigned int chunk_size_max = 1048576/2; + while (++i < NCLASSES-1) { + if (size >= chunk_size_max / factor) { + break; + } + if (size % CHUNK_ALIGN_BYTES) + size += CHUNK_ALIGN_BYTES - (size % CHUNK_ALIGN_BYTES); + sizes[i] = size; + size *= factor; + } + sizes[i] = chunk_size_max; + classes = i; + +} + +static int get_class(int vl, uint32_t kl) { 
+ //warning if you change this you die + int vsize = vl+kl+48+1+2; + int res = 1; + while (vsize > sizes[res]) + if (res++ == classes) { + //fprintf(stderr,"item larger than max class size. vsize: %d, class size: %d\n",vsize,sizes[res]); + return -1; + } + return res; +} + +static int get_incl(int vl, int kl) { + int clsid = get_class(vl,kl); + if (clsid) { + return inclusives[clsid]; + } else { + return -1; + } +} + + +void ConnectionMultiApproxBatch::output_op(Operation *op, int type, bool found) { + char output[1024]; + char k[256]; + char a[256]; + char s[256]; + memset(k,0,256); + memset(a,0,256); + memset(s,0,256); + strncpy(k,op->key,255); + switch (type) { + case 0: //get + sprintf(a,"issue_get"); + break; + case 1: //set + sprintf(a,"issue_set"); + break; + case 2: //resp + sprintf(a,"resp"); + break; + } + switch(read_state) { + case INIT_READ: + sprintf(s,"init"); + break; + case CONN_SETUP: + sprintf(s,"setup"); + break; + case LOADING: + sprintf(s,"load"); + break; + case IDLE: + sprintf(s,"idle"); + break; + case WAITING_FOR_GET: + sprintf(s,"waiting for get"); + break; + case WAITING_FOR_SET: + sprintf(s,"waiting for set"); + break; + case WAITING_FOR_DELETE: + sprintf(s,"waiting for del"); + break; + case MAX_READ_STATE: + sprintf(s,"max"); + break; + } + if (type == 2) { + sprintf(output,"conn: %u, action: %s op: %s, opaque: %u, found: %d, type: %d\n",cid,a,k,op->opaque,found,op->type); + } else { + sprintf(output,"conn: %u, action: %s op: %s, opaque: %u, type: %d\n",cid,a,k,op->opaque,op->type); + } + size_t res = write(2,output,strlen(output)); + if (res != strlen(output)) { + fprintf(stderr,"error outputingiii\n"); + } +} + +extern unordered_map cid_rate; +extern unordered_map> copy_keys; +extern unordered_map touch_keys; +extern unordered_map> wb_keys; + +extern map g_key_hist; +extern int max_n[3]; + +/** + * Create a new connection to a server endpoint. 
+ */ +ConnectionMultiApproxBatch::ConnectionMultiApproxBatch(struct event_base* _base, struct evdns_base* _evdns, + string _hostname1, string _hostname2, string _port, options_t _options, + bool sampling, int fd1, int fd2 ) : + start_time(0), stats(sampling), options(_options), + hostname1(_hostname1), hostname2(_hostname2), port(_port), base(_base), evdns(_evdns) +{ + pthread_mutex_lock(&cid_lock_m_approx_batch); + cid = connids_m++; + if (cid == 1) { + init_classes(); + init_inclusives(options.inclusives); + } + //cid_rate.insert( { cid, 0 } ); + + pthread_mutex_unlock(&cid_lock_m_approx_batch); + + valuesize = createGenerator(options.valuesize); + keysize = createGenerator(options.keysize); + srand(time(NULL)); + keygen = new KeyGenerator(keysize, options.records); + + total = 0; + eof = 0; + o_percent = 0; + + if (options.lambda <= 0) { + iagen = createGenerator("0"); + } else { + D("iagen = createGenerator(%s)", options.ia); + iagen = createGenerator(options.ia); + iagen->set_lambda(options.lambda); + } + + read_state = IDLE; + write_state = INIT_WRITE; + last_quiet1 = false; + last_quiet2 = false; + + last_tx = last_rx = 0.0; + gets = 0; + ghits = 0; + esets = 0; + isets = 0; + gloc = rand() % (10*2-1)+1; + sloc = rand() % (10*2-1)+1; + iloc = rand() % (10*2-1)+1; + + op_queue_size = (uint32_t*)malloc(sizeof(uint32_t)*(LEVELS+1)); + opaque = (uint32_t*)malloc(sizeof(uint32_t)*(LEVELS+1)); + op_queue = (Operation***)malloc(sizeof(Operation**)*(LEVELS+1)); + + for (int i = 0; i <= LEVELS; i++) { + op_queue_size[i] = 0; + opaque[i] = 1; + //op_queue[i] = (Operation*)malloc(sizeof(int)*OPAQUE_MAX); + op_queue[i] = (Operation**)malloc(sizeof(Operation*)*(OPAQUE_MAX+1)); + for (int j = 0; j <= OPAQUE_MAX; j++) { + op_queue[i][j] = NULL; + } + + } + + + bev1 = bufferevent_socket_new(base, fd1, BEV_OPT_CLOSE_ON_FREE); + bufferevent_setcb(bev1, bev_read_cb1_approx_batch, bev_write_cb_m_approx_batch, bev_event_cb1_approx_batch, this); + bufferevent_enable(bev1, EV_READ 
| EV_WRITE); + //bufferevent_setwatermark(bev1, EV_READ, 512*1024, 0); + + bev2 = bufferevent_socket_new(base, fd2, BEV_OPT_CLOSE_ON_FREE); + bufferevent_setcb(bev2, bev_read_cb2_approx_batch, bev_write_cb_m_approx_batch, bev_event_cb2_approx_batch, this); + bufferevent_enable(bev2, EV_READ | EV_WRITE); + //bufferevent_setwatermark(bev2, EV_READ, 512*1024, 0); + + timer = evtimer_new(base, timer_cb_m_approx_batch, this); + + read_state = IDLE; +} + + +void ConnectionMultiApproxBatch::set_queue(queue* a_trace_queue) { + trace_queue = a_trace_queue; + trace_queue_n = a_trace_queue->size(); + Operation *Op = trace_queue->front(); + incl_ = get_incl(Op->valuelen,strlen(Op->key)); + clsid_ = get_class(Op->valuelen,strlen(Op->key)); + + buffer_size_ = 1024*1024*10; + //setup the buffers + //max is (valuelen + 256 + 24 + 4 + 4 ) * depth + for (int i = 1; i <= LEVELS; i++) { + buffer_write[i] = (unsigned char*)malloc(options.depth*512*1024); + buffer_read[i] = (unsigned char*)malloc(buffer_size_); + buffer_leftover[i] = (unsigned char*)malloc(buffer_size_); + memset(buffer_read[i],0,buffer_size_); + memset(buffer_leftover[i],0,buffer_size_); + buffer_write_n[i] = 0; + buffer_read_n[i] = 0; + buffer_write_nbytes[i] = 0; + buffer_read_nbytes[i] = 0; + buffer_write_pos[i] = buffer_write[i]; + buffer_read_pos[i] = buffer_read[i]; + buffer_lasthdr[i] = 0; // buffer_read[i]; + } + +} + +void ConnectionMultiApproxBatch::set_lock(pthread_mutex_t* a_lock) { + lock = a_lock; +} + +void ConnectionMultiApproxBatch::set_g_wbkeys(unordered_map> *a_wb_keys) { + g_wb_keys = a_wb_keys; +} + +uint32_t ConnectionMultiApproxBatch::get_cid() { + return cid; +} + +int ConnectionMultiApproxBatch::add_to_wb_keys(string key) { + auto pos = wb_keys.find(key); + if (pos == wb_keys.end()) { + wb_keys.insert( {key, vector() }); + return 1; + } + return 2; +} + +void ConnectionMultiApproxBatch::del_wb_keys(string key) { + + auto position = wb_keys.find(key); + if (position != wb_keys.end()) { + vector 
op_list = vector(position->second); + wb_keys.erase(position); + for (auto it = op_list.begin(); it != op_list.end(); ++it) { + issue_op(*it); + } + } else { + fprintf(stderr,"expected wb %s, got nuthin\n",key.c_str()); + } +} + +int ConnectionMultiApproxBatch::add_to_copy_keys(string key) { + auto pos = copy_keys.find(key); + if (pos == copy_keys.end()) { + copy_keys.insert( {key, vector() }); + return 1; + } + return 2; +} + + +void ConnectionMultiApproxBatch::del_copy_keys(string key) { + + auto position = copy_keys.find(key); + if (position != copy_keys.end()) { + vector op_list = vector(position->second); + copy_keys.erase(position); + for (auto it = op_list.begin(); it != op_list.end(); ++it) { + issue_op(*it); + } + } else { + fprintf(stderr,"expected copy %s, got nuthin\n",key.c_str()); + } +} + +int ConnectionMultiApproxBatch::add_to_touch_keys(string key) { + //return touch_keys.assign_if_equal( key, NULL, cid ) != NULL ? 1 : 2; + auto pos = touch_keys.find(key); + if (pos == touch_keys.end()) { + touch_keys.insert( {key, cid }); + return 1; + } + return 2; +} + + +void ConnectionMultiApproxBatch::del_touch_keys(string key) { + //touch_keys.erase(key); + auto position = touch_keys.find(key); + if (position != touch_keys.end()) { + touch_keys.erase(position); + } else { + fprintf(stderr,"expected touch %s, got nuthin\n",key.c_str()); + } +} + +int ConnectionMultiApproxBatch::issue_op(Operation *Op) { + double now = get_time(); + int issued = 0; + Op->clsid = get_class(Op->valuelen,strlen(Op->key)); + int flags = 0; + int index = lrand48() % (1024 * 1024); + int incl = inclusives[Op->clsid]; + SET_INCL(incl,flags); + + switch(Op->type) { + + case Operation::GET: + issued = issue_get_with_len(Op, now, false, flags | LOG_OP | ITEM_L1); + this->stats.gets++; + gets++; + //this->stats.gets_cid[cid]++; + break; + case Operation::SET: + if (incl == 1) { + if (isets >= iloc) { + const char *data = &random_char[index]; + issued = issue_set(Op, data, now, flags | 
LOG_OP | ITEM_L1 | SRC_DIRECT_SET); + issued = issue_touch(Op->key,Op->valuelen,now, ITEM_L2 | SRC_DIRECT_SET); + iloc += rand()%(10*2-1)+1; + } else { + issued = issue_set(Op, &random_char[index], now, flags | LOG_OP | ITEM_L1 | SRC_DIRECT_SET | ITEM_DIRTY); + } + isets++; + } else if (incl == 2) { + issued = issue_set(Op, &random_char[index], now, flags | LOG_OP | ITEM_L1 | SRC_DIRECT_SET); + if (esets >= sloc) { + issued = issue_delete(Op->key,now,ITEM_L2 | SRC_DIRECT_SET); + sloc += rand()%(10*2-1)+1; + } + esets++; + } + this->stats.sets++; + //this->stats.sets_cid[cid]++; + break; + case Operation::DELETE: + case Operation::TOUCH: + case Operation::NOOP: + case Operation::SASL: + fprintf(stderr,"invalid line: %s, vl: %d\n",Op->key,Op->valuelen); + break; + + } + return issued; +} + + +int ConnectionMultiApproxBatch::do_connect() { + + int connected = 0; + if (options.unix_socket) { + + + struct sockaddr_un sin1; + memset(&sin1, 0, sizeof(sin1)); + sin1.sun_family = AF_LOCAL; + strcpy(sin1.sun_path, hostname1.c_str()); + + int addrlen; + addrlen = sizeof(sin1); + + int err = bufferevent_socket_connect(bev1, (struct sockaddr*)&sin1, addrlen); + if (err == 0) { + connected = 1; + } else { + connected = 0; + err = errno; + fprintf(stderr,"l1 error %s\n",strerror(err)); + } + + struct sockaddr_un sin2; + memset(&sin2, 0, sizeof(sin2)); + sin2.sun_family = AF_LOCAL; + strcpy(sin2.sun_path, hostname2.c_str()); + + addrlen = sizeof(sin2); + err = bufferevent_socket_connect(bev2, (struct sockaddr*)&sin2, addrlen); + if (err == 0) { + connected = 1; + } else { + connected = 0; + err = errno; + fprintf(stderr,"l2 error %s\n",strerror(err)); + } + } + read_state = IDLE; + return connected; +} + +/** + * Destroy a connection, performing cleanup. 
+ */ +ConnectionMultiApproxBatch::~ConnectionMultiApproxBatch() { + + + for (int i = 0; i <= LEVELS; i++) { + free(op_queue[i]); + if (i > 0) { + free(buffer_write[i]); + free(buffer_read[i]); + } + + } + + free(op_queue_size); + free(opaque); + free(op_queue); + event_free(timer); + //timer = NULL; + // FIXME: W("Drain op_q?"); + + bufferevent_free(bev1); + bufferevent_free(bev2); + + delete iagen; + delete keygen; + delete keysize; + delete valuesize; +} + +/** + * Reset the connection back to an initial, fresh state. + */ +void ConnectionMultiApproxBatch::reset() { + // FIXME: Actually check the connection, drain all bufferevents, drain op_q. + //assert(op_queue.size() == 0); + //evtimer_del(timer); + read_state = IDLE; + write_state = INIT_WRITE; + stats = ConnectionStats(stats.sampling); +} + +/** + * Set our event processing priority. + */ +void ConnectionMultiApproxBatch::set_priority(int pri) { + if (bufferevent_priority_set(bev1, pri)) { + DIE("bufferevent_set_priority(bev, %d) failed", pri); + } +} + + + +/** + * Get/Set or Set Style + * If a GET command: Issue a get first, if not found then set + * If trace file (or prob. 
write) says to set, then set it + */ +int ConnectionMultiApproxBatch::issue_getsetorset(double now) { + + Operation *Op = trace_queue->front(); + if (Op->type == Operation::SASL) { + //cid_rate.insert( {cid, 100 } ); + //fprintf(stderr,"cid %d done before loop\n",cid); + //string op_queue1; + //string op_queue2; + //for (int j = 0; j < 2; j++) { + // for (int i = 0; i < OPAQUE_MAX; i++) { + // if (op_queue[j+1][i] != NULL) { + // if (j == 0) { + // op_queue1 = op_queue1 + "," + op_queue[j+1][i]->key; + // } else { + // op_queue2 = op_queue2 + "," + op_queue[j+1][i]->key; + // } + // } + // } + //} + for (int i = 1; i <= LEVELS; i++) { + if (buffer_write_n[i] > 0) { + send_write_buffer(i); + } + } + eof = 1; + //fprintf(stderr,"cid %d op_queue1: %s op_queue2: %s, op_queue_size1: %d, op_queue_size2: %d\n",cid,op_queue1.c_str(),op_queue2.c_str(),op_queue_size[1],op_queue_size[2]); + return 1; + } + + int issued = issue_op(Op); + trace_queue->pop(); + while (issued != 2) { + Op = trace_queue->front(); + + if (Op->type == Operation::SASL) { + for (int i = 1; i <= LEVELS; i++) { + if (buffer_write_n[i] > 0) { + send_write_buffer(i); + } + } + //string op_queue1; + //string op_queue2; + //for (int j = 0; j < 2; j++) { + // for (int i = 0; i < OPAQUE_MAX; i++) { + // if (op_queue[j+1][i] != NULL) { + // if (j == 0) { + // op_queue1 = op_queue1 + "," + op_queue[j+1][i]->key; + // } else { + // op_queue2 = op_queue2 + "," + op_queue[j+1][i]->key; + // } + // } + // } + //} + //fprintf(stderr,"done in loop cid %d op_queue1: %s op_queue2: %s, op_queue_size1: %d, op_queue_size2: %d\n",cid,op_queue1.c_str(),op_queue2.c_str(),op_queue_size[1],op_queue_size[2]); + eof = 1; + return 1; + } + issued = issue_op(Op); + trace_queue->pop(); + } + + return 0; +} + +int ConnectionMultiApproxBatch::send_write_buffer(int level) { + struct bufferevent *bev = NULL; + switch (level) { + case 1: + bev = bev1; + break; + case 2: + bev = bev2; + break; + default: + bev = bev1; + break; + } + int 
ret = bufferevent_write(bev,buffer_write[level],buffer_write_nbytes[level]); + if (ret != 0) { + fprintf(stderr,"error writing buffer! level %d, size %d\n",level,buffer_write_nbytes[level]); + } + //fprintf(stderr,"l%d write: %u\n",level,buffer_write_nbytes[level]); + buffer_write_n[level] = 0; + buffer_write_pos[level] = buffer_write[level]; + memset(buffer_write_pos[level],0,buffer_write_nbytes[level]); + stats.tx_bytes += buffer_write_nbytes[level]; + buffer_write_nbytes[level] = 0; + return 2; +} + +int ConnectionMultiApproxBatch::add_get_op_to_queue(Operation *pop, int level) { + + op_queue[level][pop->opaque] = pop; + op_queue_size[level]++; +#ifdef DEBUGS + fprintf(stderr,"cid: %d issing get: %s, size: %u, level %d, flags: %d, opaque: %d\n",cid,pop->key,pop->valuelen,level,pop->flags,pop->opaque); +#endif + + if (opaque[level] > OPAQUE_MAX) { + opaque[level] = 1; + } + + uint16_t keylen = strlen(pop->key); + + // each line is 4-bytes + binary_header_t h = { 0x80, CMD_GET, htons(keylen), + 0x00, 0x00, htons(0), + htonl(keylen) }; + //if (quiet) { + // h.opcode = CMD_GETQ; + //} + h.opaque = htonl(pop->opaque); + + memcpy(buffer_write_pos[level], &h, 24); + buffer_write_pos[level] += 24; + memcpy(buffer_write_pos[level], pop->key, keylen); + buffer_write_pos[level] += keylen; + buffer_write_n[level]++; + buffer_write_nbytes[level] += 24 + keylen; + + int res = 1; + if (buffer_write_n[level] == (uint32_t)options.depth) { + res = send_write_buffer(level); + } + return res; +} + +/** + * Issue a get request to the server. 
 */
int ConnectionMultiApproxBatch::issue_get_with_len(Operation *pop, double now, bool quiet, uint32_t flags, Operation *l1) {

  // Level (L1 or L2) is encoded in the flags word.
  int level = FLAGS_level(flags);

  //initialize op for sending
#if HAVE_CLOCK_GETTIME
  pop->start_time = get_time_accurate();
#else
  if (now == 0.0) {
#if USE_CACHED_TIME
    struct timeval now_tv;
    event_base_gettimeofday_cached(base, &now_tv);
    pop->start_time = tv_to_double(&now_tv);
#else
    pop->start_time = get_time();
#endif
  } else {
    pop->start_time = now;
  }
#endif

  // Claim the next opaque for this level; add_get_op_to_queue() handles the
  // wraparound back to 1 after OPAQUE_MAX.
  pop->opaque = opaque[level]++;
  pop->flags = flags;
  // Optional back-pointer to the L1 op this L2 get is chained behind.
  if (l1 != NULL) {
    pop->l1 = l1;
  }

  //put op into queue
  return add_get_op_to_queue(pop,level);
}

/**
 * Issue a get request to the server.
 * Overload that allocates a fresh Operation for the given key/valuelen
 * (used when there is no trace Operation to reuse, e.g. internal copies).
 */
int ConnectionMultiApproxBatch::issue_get_with_len(const char* key, int valuelen, double now, bool quiet, uint32_t flags, Operation *l1) {

  int level = FLAGS_level(flags);
  Operation *pop = new Operation();

#if HAVE_CLOCK_GETTIME
  pop->start_time = get_time_accurate();
#else
  if (now == 0.0) {
#if USE_CACHED_TIME
    struct timeval now_tv;
    event_base_gettimeofday_cached(base, &now_tv);
    pop->start_time = tv_to_double(&now_tv);
#else
    pop->start_time = get_time();
#endif
  } else {
    pop->start_time = now;
  }
#endif

  strncpy(pop->key,key,255);
  pop->valuelen = valuelen;
  pop->type = Operation::GET;
  pop->opaque = opaque[level]++;
  pop->flags = flags;
  pop->clsid = get_class(valuelen,strlen(key));

  if (l1 != NULL) {
    pop->l1 = l1;
  }

  return add_get_op_to_queue(pop,level);

}

/**
 * Issue a touch request to the server.
 */
int ConnectionMultiApproxBatch::issue_touch(const char* key, int valuelen, double now, int flags) {

  int level = FLAGS_level(flags);
  Operation *pop = new Operation();

#if HAVE_CLOCK_GETTIME
  pop->start_time = get_time_accurate();
#else
  if (now == 0.0) {
#if USE_CACHED_TIME
    struct timeval now_tv;
    event_base_gettimeofday_cached(base, &now_tv);
    pop->start_time = tv_to_double(&now_tv);
#else
    pop->start_time = get_time();
#endif
  } else {
    pop->start_time = now;
  }
#endif

  strncpy(pop->key,key,255);
  pop->valuelen = valuelen;
  pop->type = Operation::TOUCH;
  pop->opaque = opaque[level]++;
  pop->flags = flags;
  op_queue[level][pop->opaque] = pop;
  //op_queue[level].push(op);
  op_queue_size[level]++;

  // Wrap the opaque counter for the next op.
  if (opaque[level] > OPAQUE_MAX) {
    opaque[level] = 1;
  }

#ifdef DEBUGS
  fprintf(stderr,"issing touch: %s, size: %u, level %d, flags: %d, opaque: %d\n",key,valuelen,level,flags,pop->opaque);
#endif
  //if (read_state == IDLE) read_state = WAITING_FOR_GET;
  uint16_t keylen = strlen(key);

  // each line is 4-bytes; TOUCH carries a 4-byte expiration extra.
  binary_header_t h = { 0x80, CMD_TOUCH, htons(keylen),
                        0x04, 0x00, htons(0),
                        htonl(keylen + 4) };
  h.opaque = htonl(pop->opaque);

  // NOTE(review): for dirty items the op flags are smuggled through the
  // expiration extra field — presumably decoded by a modified server; confirm.
  uint32_t exp = 0;
  if (flags & ITEM_DIRTY) {
    exp = htonl(flags);
  }

  memcpy(buffer_write_pos[level], &h, 24);
  buffer_write_pos[level] += 24;
  memcpy(buffer_write_pos[level], &exp, 4);
  buffer_write_pos[level] += 4;
  memcpy(buffer_write_pos[level], pop->key, keylen);
  buffer_write_pos[level] += keylen;
  buffer_write_nbytes[level] += 24 + keylen + 4;
  buffer_write_n[level]++;

  // Flush once the batch reaches the configured depth.
  int ret = 1;
  if (buffer_write_n[level] == (uint32_t)options.depth) {
    ret = send_write_buffer(level);
  }

  return ret;
}

/**
 * Issue a delete request to the server.
 */
int ConnectionMultiApproxBatch::issue_delete(const char* key, double now, uint32_t flags) {

  int level = FLAGS_level(flags);
  Operation *pop = new Operation();

#if HAVE_CLOCK_GETTIME
  pop->start_time = get_time_accurate();
#else
  if (now == 0.0) {
#if USE_CACHED_TIME
    struct timeval now_tv;
    event_base_gettimeofday_cached(base, &now_tv);
    pop->start_time = tv_to_double(&now_tv);
#else
    pop->start_time = get_time();
#endif
  } else {
    pop->start_time = now;
  }
#endif

  strncpy(pop->key,key,255);
  pop->type = Operation::DELETE;
  pop->opaque = opaque[level]++;
  pop->flags = flags;
  op_queue[level][pop->opaque] = pop;
  op_queue_size[level]++;

  // Wrap the opaque counter for the next op.
  if (opaque[level] > OPAQUE_MAX) {
    opaque[level] = 1;
  }
#ifdef DEBUGS
  fprintf(stderr,"cid: %d issing delete: %s, level %d, flags: %d, opaque: %d\n",cid,key,level,flags,pop->opaque);
#endif

  //if (read_state == IDLE) read_state = WAITING_FOR_GET;
  uint16_t keylen = strlen(key);

  // each line is 4-bytes
  binary_header_t h = { 0x80, CMD_DELETE, htons(keylen),
                        0x00, 0x00, htons(0),
                        htonl(keylen) };
  h.opaque = htonl(pop->opaque);

  memcpy(buffer_write_pos[level], &h, 24);
  buffer_write_pos[level] += 24;
  memcpy(buffer_write_pos[level], pop->key, keylen);
  buffer_write_pos[level] += keylen;
  buffer_write_n[level]++;
  buffer_write_nbytes[level] += 24 + keylen;

  // Flush once the batch reaches the configured depth.
  int ret = 1;
  if (buffer_write_n[level] == (uint32_t)options.depth) {
    ret = send_write_buffer(level);
  }

  return ret;
}

// Append a NOOP request to the batch for `level`.
// NOTE(review): no Operation is registered in op_queue for the noop, so its
// response is presumably matched by opaque 0 / ignored — confirm in the
// response path.
int ConnectionMultiApproxBatch::issue_noop(int level) {

  binary_header_t h = { 0x80, CMD_NOOP, 0x0000,
                        0x00, 0x00, htons(0),
                        0x00 };
  memcpy(buffer_write_pos[level], &h, 24);
  buffer_write_pos[level] += 24;

  buffer_write_n[level]++;
  buffer_write_nbytes[level] += 24;

  int ret = 1;
  if (buffer_write_n[level] == (uint32_t)options.depth) {
    ret = send_write_buffer(level);
  }

  return ret;
}

// Register a SET in the op queue and append header + flags/exp extras +
// key + value to the batch buffer; flushes at options.depth.
// Returns 1 (queued) or 2 (queued and flushed).
int ConnectionMultiApproxBatch::add_set_to_queue(Operation *pop, int level, const char* value) {
  int length = pop->valuelen;

  op_queue[level][pop->opaque] = pop;
  //op_queue[level].push(op);
  op_queue_size[level]++;
#ifdef DEBUGS
  fprintf(stderr,"cid: %d issing set: %s, size: %u, level %d, flags: %d, opaque: %d\n",cid,pop->key,length,level,pop->flags,pop->opaque);
#endif

  // Wrap the opaque counter for the next op.
  if (opaque[level] > OPAQUE_MAX) {
    opaque[level] = 1;
  }

  uint16_t keylen = strlen(pop->key);

  // each line is 4-bytes; SET carries 8 bytes of extras (flags + exptime).
  binary_header_t h = { 0x80, CMD_SET, htons(keylen),
                        0x08, 0x00, htons(0),
                        htonl(keylen + 8 + length) };
  h.opaque = htonl(pop->opaque);

  // The op's level/policy flags ride in the memcached "flags" extra.
  uint32_t f = htonl(pop->flags);
  uint32_t exp = 0;

  memcpy(buffer_write_pos[level], &h, 24);
  buffer_write_pos[level] += 24;
  memcpy(buffer_write_pos[level], &f, 4);
  buffer_write_pos[level] += 4;
  memcpy(buffer_write_pos[level], &exp, 4);
  buffer_write_pos[level] += 4;
  memcpy(buffer_write_pos[level], pop->key, keylen);
  buffer_write_pos[level] += keylen;
  memcpy(buffer_write_pos[level], value, length);
  buffer_write_pos[level] += length;
  buffer_write_n[level]++;
  buffer_write_nbytes[level] += length + 32 + keylen;

  int ret = 1;
  if (buffer_write_n[level] == (uint32_t)options.depth) {
    ret = send_write_buffer(level);
  }
  return ret;

}

/**
 * Issue a set request to the server, reusing an existing Operation.
 */
int ConnectionMultiApproxBatch::issue_set(Operation *pop, const char* value, double now, uint32_t flags) {

  int level = FLAGS_level(flags);

#if HAVE_CLOCK_GETTIME
  pop->start_time = get_time_accurate();
#else
  if (now == 0.0) pop->start_time = get_time();
  else pop->start_time = now;
#endif

  pop->opaque = opaque[level]++;
  pop->flags = flags;
  return add_set_to_queue(pop,level,value);

}

/**
 * Issue a set request to the server.
 */
int ConnectionMultiApproxBatch::issue_set(const char* key, const char* value, int length, double now, uint32_t flags) {

  int level = FLAGS_level(flags);
  Operation *pop = new Operation();

#if HAVE_CLOCK_GETTIME
  pop->start_time = get_time_accurate();
#else
  if (now == 0.0) pop->start_time = get_time();
  else pop->start_time = now;
#endif

  strncpy(pop->key,key,255);
  pop->valuelen = length;
  pop->type = Operation::SET;
  pop->opaque = opaque[level]++;
  pop->flags = flags;
  pop->clsid = get_class(length,strlen(key));

  return add_set_to_queue(pop,level,value);

}


/**
 * Finish up (record stats) an operation that just returned from the
 * server. Logs latency per type/level, then unlinks (and frees) the op —
 * and, for a chained L2 get, its parent L1 op — from the opaque-indexed
 * op_queue tables.
 */
void ConnectionMultiApproxBatch::finish_op(Operation *op, int was_hit) {
  double now;
#if USE_CACHED_TIME
  struct timeval now_tv;
  event_base_gettimeofday_cached(base, &now_tv);
  now = tv_to_double(&now_tv);
#else
  now = get_time();
#endif
#if HAVE_CLOCK_GETTIME
  op->end_time = get_time_accurate();
#else
  op->end_time = now;
#endif

  if (was_hit) {
    switch (op->type) {
    case Operation::GET:
      switch (OP_level(op)) {
      case 1:
        stats.log_get_l1(*op);
        break;
      case 2:
        stats.log_get_l2(*op);
        // An L2 hit also completes the chained L1 miss: log the end-to-end get.
        if (op->l1 != NULL) {
          op->l1->end_time = now;
          stats.log_get(*(op->l1));
        }
        break;
      }
      break;
    case Operation::SET:
      switch (OP_level(op)) {
      case 1:
        stats.log_set_l1(*op);
        break;
      case 2:
        stats.log_set_l2(*op);
        break;
      }
      break;
    case Operation::DELETE: break;
    case Operation::TOUCH: break;
    default: DIE("Not implemented.");
    }
  }

  last_rx = now;
  uint8_t level = OP_level(op);
  // Unlink the chained L1 parent first (only an L2 op can carry one).
  if (op->l1 != NULL) {
    //delete op_queue[1][op->l1->opaque]
    if (op->l1 == op_queue[1][op->l1->opaque]) {
      op_queue[1][op->l1->opaque] = 0;
      if (op_queue_size[1] > 0) {
        op_queue_size[1]--;
      } else {
        fprintf(stderr,"chained op_Queue_size[%d] out of sync!!\n",1);
      }
      delete op->l1;
    } else {
      // NOTE(review): this error path prints op_queue[1][op->opaque] (the
      // child's opaque, not op->l1->opaque) and would deref NULL if that
      // slot is empty — diagnostic only, but worth confirming/fixing.
      fprintf(stderr,"op_queue out of sync! Expected %p, got %p, opa1: %d opaq2: %d\n",
              op,op_queue[1][op->opaque],op->opaque,op_queue[1][op->opaque]->opaque);
    }
  }
  //op_queue[level].erase(op_queue[level].begin()+opopq);
  if (op == op_queue[level][op->opaque] &&
      op->opaque == op_queue[level][op->opaque]->opaque) {
    //delete op_queue[level][op->opaque];
    op_queue[level][op->opaque] = 0;
    delete op;
    if (op_queue_size[level] > 0) {
      op_queue_size[level]--;
    } else {
      fprintf(stderr,"op_Queue_size[%d] out of sync!!\n",level);
    }
  } else {
    fprintf(stderr,"op_queue out of sync! Expected %p, got %p, opa1: %d opaq2: %d\n",
            op,op_queue[level][op->opaque],op->opaque,op_queue[level][op->opaque]->opaque);
  }
  read_state = IDLE;

}



/**
 * Check if our testing is done and we should exit.
 * At EOF, flushes any partially-filled batch buffers; exits only once both
 * batches are empty.
 */
bool ConnectionMultiApproxBatch::check_exit_condition(double now) {
  if (eof == 1) {
    int done = 1;
    for (int i = 1; i <= LEVELS; i++) {
      if (buffer_write_n[i] != 0) {
        //fprintf(stderr,"%d sending %d\n",i,buffer_write_n[i]);
        send_write_buffer(i);
        done = 0;
      }
    }
    if (done) {
      //fprintf(stderr,"%d done - check exit\n",cid);
      return true;
    }
  }
  return false;
}

/**
 * Handle new connection and error events.
+ */ +void ConnectionMultiApproxBatch::event_callback1(short events) { + if (events & BEV_EVENT_CONNECTED) { + D("Connected to %s:%s.", hostname1.c_str(), port.c_str()); + int fd = bufferevent_getfd(bev1); + if (fd < 0) DIE("bufferevent_getfd"); + + if (!options.no_nodelay && !options.unix_socket) { + int one = 1; + if (setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, + (void *) &one, sizeof(one)) < 0) + DIE("setsockopt()"); + } +#ifdef DEBUGMC + fprintf(stderr,"libevent connected %s, fd: %u\n",hostname1.c_str(),bufferevent_getfd(bev1)); +#endif + + + } else if (events & BEV_EVENT_ERROR) { + int err = bufferevent_socket_get_dns_error(bev1); + //if (err) DIE("DNS error: %s", evutil_gai_strerror(err)); + if (err) fprintf(stderr,"DNS error: %s", evutil_gai_strerror(err)); + fprintf(stderr,"CID: %d - Got an error: %s\n",this->cid, + evutil_socket_error_to_string(EVUTIL_SOCKET_ERROR())); + + //DIE("BEV_EVENT_ERROR: %s", strerror(errno)); + + } else if (events & BEV_EVENT_EOF) { + fprintf(stderr,"Unexpected EOF from server."); + return; + } +} + +/** + * Handle new connection and error events. 
 */
void ConnectionMultiApproxBatch::event_callback2(short events) {
  // L2 socket connected: optionally disable Nagle, then we're ready.
  if (events & BEV_EVENT_CONNECTED) {
    D("Connected to %s:%s.", hostname2.c_str(), port.c_str());
    int fd = bufferevent_getfd(bev2);
    if (fd < 0) DIE("bufferevent_getfd");

    if (!options.no_nodelay && !options.unix_socket) {
      int one = 1;
      if (setsockopt(fd, IPPROTO_TCP, TCP_NODELAY,
                     (void *) &one, sizeof(one)) < 0)
        DIE("setsockopt()");
    }
#ifdef DEBUGMC
    fprintf(stderr,"libevent connected %s, fd: %u\n",hostname2.c_str(),bufferevent_getfd(bev2));
#endif


  } else if (events & BEV_EVENT_ERROR) {
    // Socket-level error: report (including any DNS failure) but keep running.
    int err = bufferevent_socket_get_dns_error(bev2);
    //if (err) DIE("DNS error: %s", evutil_gai_strerror(err));
    if (err) fprintf(stderr,"DNS error: %s", evutil_gai_strerror(err));
    fprintf(stderr,"CID: %d - Got an error: %s\n",this->cid,
            evutil_socket_error_to_string(EVUTIL_SOCKET_ERROR()));

    //DIE("BEV_EVENT_ERROR: %s", strerror(errno));


  } else if (events & BEV_EVENT_EOF) {
    fprintf(stderr,"Unexpected EOF from server.");
    return;
  }
}

/**
 * Request generation loop. Determines whether or not to issue a new command,
 * based on timer events.
 *
 * Note that this function loops. Be wary of break vs. return.
+ */ +void ConnectionMultiApproxBatch::drive_write_machine(double now) { + + if (now == 0.0) now = get_time(); + double delay; + struct timeval tv; + + int max_depth = (int)options.depth*2; + + while (1) { + switch (write_state) { + case INIT_WRITE: + delay = iagen->generate(); + next_time = now + delay; + double_to_tv(delay, &tv); + evtimer_add(timer, &tv); + write_state = ISSUING; + break; + + case ISSUING: + if ( (op_queue_size[1] >= (size_t) max_depth) || + (op_queue_size[2] >= (size_t) max_depth) ) { + write_state = WAITING_FOR_OPQ; + break; + } + + if (options.getsetorset) { + int ret = issue_getsetorset(now); + //if (ret) return; //if at EOF + return; + } + + last_tx = now; + for (int i = 1; i <= 2; i++) { + stats.log_op(op_queue_size[i]); + } + break; + + case WAITING_FOR_TIME: + write_state = ISSUING; + break; + + case WAITING_FOR_OPQ: + if ( (op_queue_size[1] >= (size_t) max_depth) || + (op_queue_size[2] >= (size_t) max_depth) ) { + for (int i = 1; i <= LEVELS; i++) { + if (max_depth > 16) { + if (buffer_write_n[i] > max_depth*0.8) { + send_write_buffer(i); + } + } + } + next_time = now + 0.01; + double_to_tv(delay, &tv); + evtimer_add(timer, &tv); + + return; + } else { + write_state = ISSUING; + break; + } + + default: DIE("Not implemented"); + } + } +} + +size_t ConnectionMultiApproxBatch::handle_response_batch(unsigned char *rbuf_pos, resp_t *resp, + size_t read_bytes, size_t consumed_bytes, + int level, int extra) { + if (rbuf_pos[0] != 129) { + //fprintf(stderr,"cid %d we don't have a valid header %u\n",cid,rbuf_pos[0]); + //buffer_read_pos[level] = rbuf_pos; + //buffer_read_n[level] = 1; + return 0; + } + if ((read_bytes+extra - consumed_bytes) < 24) { + size_t have = (read_bytes+extra) - (consumed_bytes); + size_t need = 24 - (have); + buffer_read_n[level] = need; + buffer_read_nbytes[level] = have; + memcpy(buffer_leftover[level],rbuf_pos,have); + //buffer_lasthdr[level] = rbuf_pos; + //buffer_read_n[level] = need; + //buffer_read_nbytes[level] = 
have; + //fprintf(stderr,"cid %d - we don't have enough header data, need %lu more bytes, have %lu (targetLen: %d) (read_bytes %ld) (extra %d) %d)\n",cid,need,have,24,read_bytes,extra,level); + return 0; + + } + + binary_header_t* h = reinterpret_cast(rbuf_pos); + uint32_t bl = ntohl(h->body_len); + uint16_t kl = ntohs(h->key_len); + uint8_t el = h->extra_len; + int targetLen = 24 + bl; + if (consumed_bytes + targetLen > (read_bytes+extra)) { + size_t have = (read_bytes+extra) - (consumed_bytes); + size_t need = targetLen - (have); + buffer_read_n[level] = need; + buffer_read_nbytes[level] = have; + memcpy(buffer_leftover[level],rbuf_pos,have); + //fprintf(stderr,"cid %d - we don't have enough data, need %lu more bytes, have %lu (targetLen: %d) (read_bytes %ld) (extra %d) %d)\n",cid,need,have,targetLen,read_bytes,extra,level); + return 0; + } + + resp->opcode = h->opcode; + resp->opaque = ntohl(h->opaque); + uint16_t status = ntohs(h->status); +#ifdef DEBUGMC + fprintf(stderr,"cid: %d handle resp from l%d - opcode: %u opaque: %u keylen: %u extralen: %u datalen: %u status: %u\n",cid,level, + h->opcode,ntohl(h->opaque),ntohs(h->key_len),h->extra_len, + ntohl(h->body_len),ntohs(h->status)); +#endif + // If something other than success, count it as a miss + if (resp->opcode == CMD_GET && status == RESP_NOT_FOUND) { + switch(level) { + case 1: + stats.get_misses_l1++; + break; + case 2: + stats.get_misses_l2++; + stats.get_misses++; + stats.window_get_misses++; + break; + } + resp->found = false; + } else if (resp->opcode == CMD_SET && kl > 0) { + //first data is extras: clsid, flags, eflags + if (resp->evict) { + unsigned char *buf = rbuf_pos + 24; + resp->evict->clsid = *((uint32_t*)buf); + resp->evict->clsid = ntohl(resp->evict->clsid); + buf += 4; + + resp->evict->serverFlags = *((uint32_t*)buf); + resp->evict->serverFlags = ntohl(resp->evict->serverFlags); + buf += 4; + + resp->evict->evictedFlags = *((uint32_t*)buf); + resp->evict->evictedFlags = 
ntohl(resp->evict->evictedFlags); + buf += 4; + + resp->evict->evictedKeyLen = kl; + resp->evict->evictedKey = (char*)malloc(kl+1); + memset(resp->evict->evictedKey,0,kl+1); + memcpy(resp->evict->evictedKey,buf,kl); + buf += kl; + + resp->evict->evictedLen = bl - kl - el; + resp->evict->evictedData = (char*)malloc(resp->evict->evictedLen); + memcpy(resp->evict->evictedData,buf,resp->evict->evictedLen); + resp->evict->evicted = true; + } + } else if ( (resp->opcode == CMD_DELETE || resp->opcode == CMD_TOUCH) && + status == RESP_NOT_FOUND) { + resp->found = false; + } + this->stats.rx_bytes += targetLen; + return targetLen; +} + + +size_t ConnectionMultiApproxBatch::fill_read_buffer(int level, int *extra) { + + size_t read_bytes = 0; + struct bufferevent *bev = NULL; + switch (level) { + case 1: + bev = bev1; + break; + case 2: + bev = bev2; + break; + default: + bev = bev1; + break; + } + if (buffer_read_n[level] != 0) { + uint32_t have = buffer_read_nbytes[level]; + struct evbuffer *input = bufferevent_get_input(bev); + size_t len = evbuffer_get_length(input); + if (len < buffer_read_n[level]) { + return 0; + } + memset(buffer_read[level],0,512*1024); + memcpy(buffer_read[level],buffer_leftover[level],have); + buffer_read_pos[level] = buffer_read[level]; + read_bytes = bufferevent_read(bev,buffer_read_pos[level]+have,len); + if (read_bytes != len) { + fprintf(stderr,"cid %d expected %lu got %lu\n",cid,len,read_bytes); + } + *extra = have; + buffer_read_n[level] = 0; + buffer_read_nbytes[level] = 0; + + } else { + memset(buffer_read[level],0,512*1024); + buffer_read_pos[level] = buffer_read[level]; + read_bytes = bufferevent_read(bev, buffer_read_pos[level], buffer_size_ / 4); + *extra = 0; + } + if (read_bytes == 0) { + fprintf(stderr,"cid %d read 0 bytes\n",cid); + } + return read_bytes; +} +/** + * Handle incoming data (responses). 
+ */ +void ConnectionMultiApproxBatch::read_callback1() { + + int level = 1; + int extra = 0; + size_t read_bytes = 0; + + read_bytes = fill_read_buffer(level,&extra); + if (read_bytes == 0) { + return; + } + + //fprintf(stderr,"cid %d l1 read: %lu\n",cid,read_bytes); + size_t consumed_bytes = 0; + size_t batch = options.depth; + //we have at least some data to read + size_t nread_ops = 0; + while (1) { + evicted_t *evict = (evicted_t*)malloc(sizeof(evicted_t)); + memset(evict,0,sizeof(evicted_t)); + resp_t mc_resp; + mc_resp.found = true; + mc_resp.evict = evict; + size_t cbytes = handle_response_batch(buffer_read_pos[level],&mc_resp,read_bytes,consumed_bytes,level,extra); + if (cbytes == 0) { + if (evict) { + if (evict->evictedKey) free(evict->evictedKey); + if (evict->evictedData) free(evict->evictedData); + free(evict); + } + break; + } + buffer_read_pos[level] = buffer_read_pos[level] + cbytes; + consumed_bytes += cbytes; + uint32_t opaque = mc_resp.opaque; + bool found = mc_resp.found; + + Operation *op = op_queue[level][opaque]; +#ifdef DEBUGMC + char out[128]; + sprintf(out,"l1 cid %u, reading opaque: %u\n",cid,opaque); + write(2,out,strlen(out)); + output_op(op,2,found); +#endif + + double now = get_time(); + int wb = 0; + if (options.rand_admit) { + wb = (rand() % options.rand_admit); + } + int vl = op->valuelen; + int flags = OP_clu(op); + switch (op->type) { + case Operation::GET: + if ( !found && (options.getset || options.getsetorset) ) { + /* issue a get a l2 */ + issue_get_with_len(op->key,vl,now,false, flags | SRC_L1_M | ITEM_L2 | LOG_OP, op); + op->end_time = now; + this->stats.log_get_l1(*op); + + } else { + if (OP_incl(op) && ghits >= gloc) { + issue_touch(op->key,vl,now, ITEM_L2 | SRC_L1_H); + gloc += rand()%(10*2-1)+1; + } + ghits++; + finish_op(op,1); + } + break; + case Operation::SET: + //if (OP_src(op) == SRC_L1_COPY || + // OP_src(op) == SRC_L2_M) { + // del_copy_keys(string(op->key)); + //} + if (evict->evicted) { + string 
wb_key(evict->evictedKey); + if ((evict->evictedFlags & ITEM_INCL) && (evict->evictedFlags & ITEM_DIRTY)) { + //int ret = add_to_wb_keys(wb_key); + //if (ret == 1) { + issue_set(evict->evictedKey, evict->evictedData, evict->evictedLen, now, ITEM_L2 | ITEM_INCL | LOG_OP | SRC_WB | ITEM_DIRTY); + //} + this->stats.incl_wbs++; + } else if (evict->evictedFlags & ITEM_EXCL) { + //fprintf(stderr,"excl writeback %s\n",evict->evictedKey); + //strncpy(wb_key,evict->evictedKey,255); + if ( (options.rand_admit && wb == 0) || + (options.threshold && (g_key_hist[wb_key] == 1)) || + (options.wb_all) ) { + //int ret = add_to_wb_keys(wb_key); + //if (ret == 1) { + issue_set(evict->evictedKey, evict->evictedData, evict->evictedLen, now, ITEM_L2 | ITEM_EXCL | LOG_OP | SRC_WB); + //} + this->stats.excl_wbs++; + } + } + if (OP_src(op) == SRC_DIRECT_SET) { + if ( (evict->serverFlags & ITEM_SIZE_CHANGE) || ((evict->serverFlags & ITEM_WAS_HIT) == 0)) { + this->stats.set_misses_l1++; + } else if (OP_excl(op) && evict->serverFlags & ITEM_WAS_HIT) { + this->stats.set_excl_hits_l1++; + } else if (OP_incl(op) && evict->serverFlags & ITEM_WAS_HIT) { + this->stats.set_incl_hits_l1++; + } + } + } + finish_op(op,1); + break; + case Operation::TOUCH: + finish_op(op,1); + break; + default: + fprintf(stderr,"op: %p, key: %s opaque: %u\n",(void*)op,op->key,op->opaque); + DIE("not implemented"); + } + + if (evict) { + if (evict->evictedKey) free(evict->evictedKey); + if (evict->evictedData) free(evict->evictedData); + free(evict); + } + nread_ops++; + if (buffer_read_pos[level][0] == 0) { + break; + } + if (buffer_read_pos[level][0] != 129) { + fprintf(stderr,"cid %d we don't have a valid header post %u\n",cid,buffer_read_pos[level][0]); + break; + } + } + //if (buffer_read_n[level] == 0) { + // memset(buffer_read[level],0,read_bytes); + //} + //if (nread_ops == 0) { + // fprintf(stderr,"ugh only got: %lu ops expected %lu, read %lu, cid %u\n",nread_ops,batch,read_bytes,cid); + // int *a = 0; + // *a = 
0; + //} + + + double now = get_time(); + last_tx = now; + stats.log_op(op_queue_size[1]); + stats.log_op(op_queue_size[2]); + + drive_write_machine(); + + +} + +/** + * Handle incoming data (responses). + */ +void ConnectionMultiApproxBatch::read_callback2() { + + int level = 2; + int extra = 0; + + size_t read_bytes = 0; + + read_bytes = fill_read_buffer(level,&extra); + if (read_bytes == 0) { + return; + } + + //fprintf(stderr,"l2 read: %lu\n",read_bytes); + size_t consumed_bytes = 0; + size_t batch = options.depth; + size_t nread_ops = 0; + while (1) { + evicted_t *evict = NULL; + resp_t mc_resp; + mc_resp.found = true; + mc_resp.evict = evict; + size_t cbytes = handle_response_batch(buffer_read_pos[level],&mc_resp,read_bytes,consumed_bytes,level,extra); + if (cbytes == 0) { + break; + } + buffer_read_pos[level] = buffer_read_pos[level] + cbytes; + consumed_bytes += cbytes; + uint32_t opaque = mc_resp.opaque; + bool found = mc_resp.found; + + Operation *op = op_queue[level][opaque]; +#ifdef DEBUGMC + char out[128]; + sprintf(out,"l2 cid %u, reading opaque: %u\n",cid,opaque); + write(2,out,strlen(out)); + output_op(op,2,found); +#endif + double now = get_time(); + switch (op->type) { + case Operation::GET: + if ( !found && (options.getset || options.getsetorset) ) { + int valuelen = op->valuelen; + int index = lrand48() % (1024 * 1024); + int flags = OP_clu(op) | SRC_L2_M | LOG_OP; + //int ret = add_to_copy_keys(string(op->key)); + //if (ret == 1) { + issue_set(op->key, &random_char[index], valuelen, now, flags | ITEM_L1); + if (OP_incl(op)) { + issue_set(op->key, &random_char[index], valuelen, now, flags | ITEM_L2); + } + //} + finish_op(op,0); // sets read_state = IDLE + } else { + if (found) { + int valuelen = op->valuelen; + int index = lrand48() % (1024 * 1024); + int flags = OP_clu(op) | ITEM_L1 | SRC_L1_COPY; + string key = string(op->key); + const char *data = &random_char[index]; + //int ret = add_to_copy_keys(string(op->key)); + //if (ret == 1) { + 
issue_set(op->key,data,valuelen, now, flags); + //} + this->stats.copies_to_l1++; + //djb: this is automatically done in the L2 server + //if (OP_excl(op)) { //djb: todo should we delete here for approx or just let it die a slow death? + // issue_delete(op->key,now, ITEM_L2 | SRC_L1_COPY ); + //} + finish_op(op,1); + + } else { + finish_op(op,0); + } + } + break; + case Operation::SET: + //if (OP_src(op) == SRC_WB) { + // del_wb_keys(string(op->key)); + //} + finish_op(op,1); + break; + case Operation::TOUCH: + if (OP_src(op) == SRC_DIRECT_SET || SRC_L1_H) { + int valuelen = op->valuelen; + if (!found) { + int index = lrand48() % (1024 * 1024); + issue_set(op->key, &random_char[index],valuelen,now, ITEM_INCL | ITEM_L2 | LOG_OP | SRC_L2_M); + this->stats.set_misses_l2++; + } else { + if (OP_src(op) == SRC_DIRECT_SET) { + issue_touch(op->key,valuelen,now, ITEM_L1 | SRC_L2_H | ITEM_DIRTY); + } + } + //del_touch_keys(string(op->key)); + } + finish_op(op,0); + break; + case Operation::DELETE: + //check to see if it was a hit + //fprintf(stderr," del %s -- %d from %d\n",op->key.c_str(),found,OP_src(op)); + if (OP_src(op) == SRC_DIRECT_SET) { + if (found) { + this->stats.delete_hits_l2++; + } else { + this->stats.delete_misses_l2++; + } + } + finish_op(op,1); + break; + default: + fprintf(stderr,"op: %p, key: %s opaque: %u\n",(void*)op,op->key,op->opaque); + DIE("not implemented"); + } + nread_ops++; + if (buffer_read_pos[level][0] == 0) { + break; + } + if (buffer_read_pos[level][0] != 129) { + fprintf(stderr,"l2 cid %d we don't have a valid header post %u\n",cid,buffer_read_pos[level][0]); + break; + } + } + //if (buffer_read_n[level] == 0) { + // memset(buffer_read[level],0,read_bytes); + //} + //if (nread_ops == 0) { + // fprintf(stderr,"ugh l2 only got: %lu ops expected %lu\n",nread_ops,batch); + //} + + + double now = get_time(); + last_tx = now; + stats.log_op(op_queue_size[2]); + stats.log_op(op_queue_size[1]); + + drive_write_machine(); +} + +/** + * Callback 
called when write requests finish. + */ +void ConnectionMultiApproxBatch::write_callback() { + + //fprintf(stderr,"loaded evbuffer with ops: %u\n",op_queue.size()); +} + +/** + * Callback for timer timeouts. + */ +void ConnectionMultiApproxBatch::timer_callback() { + //fprintf(stderr,"timer up: %d\n",cid); + drive_write_machine(); +} + + +/* The follow are C trampolines for libevent callbacks. */ +void bev_event_cb1_approx_batch(struct bufferevent *bev, short events, void *ptr) { + + ConnectionMultiApproxBatch* conn = (ConnectionMultiApproxBatch*) ptr; + conn->event_callback1(events); +} + +/* The follow are C trampolines for libevent callbacks. */ +void bev_event_cb2_approx_batch(struct bufferevent *bev, short events, void *ptr) { + + ConnectionMultiApproxBatch* conn = (ConnectionMultiApproxBatch*) ptr; + conn->event_callback2(events); +} + +void bev_read_cb1_approx_batch(struct bufferevent *bev, void *ptr) { + ConnectionMultiApproxBatch* conn = (ConnectionMultiApproxBatch*) ptr; + if (conn->options.v1callback) { + conn->read_callback1_v1(); + } else { + conn->read_callback1(); + } +} + + +void bev_read_cb2_approx_batch(struct bufferevent *bev, void *ptr) { + ConnectionMultiApproxBatch* conn = (ConnectionMultiApproxBatch*) ptr; + if (conn->options.v1callback) { + conn->read_callback2_v1(); + } else { + conn->read_callback2(); + } +} + +void bev_write_cb_m_approx_batch(struct bufferevent *bev, void *ptr) { +} + +void timer_cb_m_approx_batch(evutil_socket_t fd, short what, void *ptr) { + ConnectionMultiApproxBatch* conn = (ConnectionMultiApproxBatch*) ptr; + conn->timer_callback(); +} +//previous implmentation of read +// + +/** + * Tries to consume a binary response (in its entirety) from an evbuffer. + * + * @param input evBuffer to read response from + * @return true if consumed, false if not enough data in buffer. 
+ */ +static bool handle_response(ConnectionMultiApproxBatch *conn, evbuffer *input, bool &done, bool &found, int &opcode, uint32_t &opaque, evicted_t *evict, int level) { + // Read the first 24 bytes as a header + int length = evbuffer_get_length(input); + if (length < 24) return false; + binary_header_t* h = + reinterpret_cast(evbuffer_pullup(input, 24)); + //assert(h); + + uint32_t bl = ntohl(h->body_len); + uint16_t kl = ntohs(h->key_len); + uint8_t el = h->extra_len; + // Not whole response + int targetLen = 24 + bl; + if (length < targetLen) { + return false; + } + + opcode = h->opcode; + opaque = ntohl(h->opaque); + uint16_t status = ntohs(h->status); +#ifdef DEBUGMC + fprintf(stderr,"cid: %d handle resp from l%d - opcode: %u opaque: %u keylen: %u extralen: %u datalen: %u status: %u\n",conn->get_cid(),level, + h->opcode,ntohl(h->opaque),ntohs(h->key_len),h->extra_len, + ntohl(h->body_len),ntohs(h->status)); +#endif + + + // If something other than success, count it as a miss + if (opcode == CMD_GET && status == RESP_NOT_FOUND) { + switch(level) { + case 1: + conn->stats.get_misses_l1++; + break; + case 2: + conn->stats.get_misses_l2++; + conn->stats.get_misses++; + conn->stats.window_get_misses++; + break; + + } + found = false; + evbuffer_drain(input, targetLen); + + } else if (opcode == CMD_SET && kl > 0) { + //first data is extras: clsid, flags, eflags + if (evict) { + evbuffer_drain(input,24); + unsigned char *buf = evbuffer_pullup(input,bl); + + + evict->clsid = *((uint32_t*)buf); + evict->clsid = ntohl(evict->clsid); + buf += 4; + + evict->serverFlags = *((uint32_t*)buf); + evict->serverFlags = ntohl(evict->serverFlags); + buf += 4; + + evict->evictedFlags = *((uint32_t*)buf); + evict->evictedFlags = ntohl(evict->evictedFlags); + buf += 4; + + + evict->evictedKeyLen = kl; + evict->evictedKey = (char*)malloc(kl+1); + memset(evict->evictedKey,0,kl+1); + memcpy(evict->evictedKey,buf,kl); + buf += kl; + + + evict->evictedLen = bl - kl - el; + 
evict->evictedData = (char*)malloc(evict->evictedLen); + memcpy(evict->evictedData,buf,evict->evictedLen); + evict->evicted = true; + //fprintf(stderr,"class: %u, serverFlags: %u, evictedFlags: %u\n",evict->clsid,evict->serverFlags,evict->evictedFlags); + evbuffer_drain(input,bl); + } else { + evbuffer_drain(input, targetLen); + } + } else if (opcode == CMD_TOUCH && status == RESP_NOT_FOUND) { + found = false; + evbuffer_drain(input, targetLen); + } else if (opcode == CMD_DELETE && status == RESP_NOT_FOUND) { + found = false; + evbuffer_drain(input, targetLen); + } else { + evbuffer_drain(input, targetLen); + } + + conn->stats.rx_bytes += targetLen; + done = true; + return true; +} + +/** + * Handle incoming data (responses). + */ +void ConnectionMultiApproxBatch::read_callback1_v1() { + struct evbuffer *input = bufferevent_get_input(bev1); + + Operation *op = NULL; + bool done, found; + + //initially assume found (for sets that may come through here) + //is this correct? do we want to assume true in case that + //GET was found, but wrong value size (i.e. 
update value) + found = true; + + //if (op_queue.size() == 0) V("Spurious read callback."); + bool full_read = true; + while (full_read) { + + + int opcode; + uint32_t opaque; + evicted_t *evict = (evicted_t*)malloc(sizeof(evicted_t)); + memset(evict,0,sizeof(evicted_t)); + + full_read = handle_response(this,input, done, found, opcode, opaque, evict,1); + if (full_read) { + if (opcode == CMD_NOOP) { +#ifdef DEBUGMC + char out[128]; + sprintf(out,"conn l1: %u, reading noop\n",cid); + write(2,out,strlen(out)); +#endif + if (evict->evictedKey) free(evict->evictedKey); + if (evict->evictedData) free(evict->evictedData); + free(evict); + continue; + } + op = op_queue[1][opaque]; +#ifdef DEBUGMC + char out[128]; + sprintf(out,"conn l1: %u, reading opaque: %u\n",cid,opaque); + write(2,out,strlen(out)); + output_op(op,2,found); +#endif + if (strlen(op->key) < 1) { +#ifdef DEBUGMC + char out2[128]; + sprintf(out2,"conn l1: %u, bad op: %s\n",cid,op->key); + write(2,out2,strlen(out2)); +#endif + if (evict->evictedKey) free(evict->evictedKey); + if (evict->evictedData) free(evict->evictedData); + free(evict); + continue; + } + } else { + if (evict) { + if (evict->evictedKey) free(evict->evictedKey); + if (evict->evictedData) free(evict->evictedData); + free(evict); + } + break; + } + + + double now = get_time(); + int wb = 0; + if (options.rand_admit) { + wb = (rand() % options.rand_admit); + } + switch (op->type) { + case Operation::GET: + if (done) { + + int vl = op->valuelen; + if ( !found && (options.getset || options.getsetorset) ) { + /* issue a get a l2 */ + int flags = OP_clu(op); + issue_get_with_len(op->key,vl,now,false, flags | SRC_L1_M | ITEM_L2 | LOG_OP, op); + op->end_time = now; + this->stats.log_get_l1(*op); + //finish_op(op,0); + + } else { + if (OP_incl(op) && ghits >= gloc) { + //int ret = add_to_touch_keys(string(op->key)); + //if (ret == 1) { + issue_touch(op->key,vl,now, ITEM_L2 | SRC_L1_H); + //} + gloc += rand()%(10*2-1)+1; + } + ghits++; + 
finish_op(op,1); + } + } else { + char out[128]; + sprintf(out,"conn l1: %u, not done reading, should do something",cid); + write(2,out,strlen(out)); + } + break; + case Operation::SET: + //if (OP_src(op) == SRC_L1_COPY || + // OP_src(op) == SRC_L2_M) { + // del_copy_keys(string(op->key)); + //} + if (evict->evicted) { + string wb_key(evict->evictedKey); + if ((evict->evictedFlags & ITEM_INCL) && (evict->evictedFlags & ITEM_DIRTY)) { + //int ret = add_to_wb_keys(wb_key); + //if (ret == 1) { + issue_set(evict->evictedKey, evict->evictedData, evict->evictedLen, now, ITEM_L2 | ITEM_INCL | LOG_OP | SRC_WB | ITEM_DIRTY); + //} + this->stats.incl_wbs++; + } else if (evict->evictedFlags & ITEM_EXCL) { + //fprintf(stderr,"excl writeback %s\n",evict->evictedKey); + //strncpy(wb_key,evict->evictedKey,255); + if ( (options.rand_admit && wb == 0) || + (options.threshold && (g_key_hist[wb_key] == 1)) || + (options.wb_all) ) { + //int ret = add_to_wb_keys(wb_key); + //if (ret == 1) { + issue_set(evict->evictedKey, evict->evictedData, evict->evictedLen, now, ITEM_L2 | ITEM_EXCL | LOG_OP | SRC_WB); + //} + this->stats.excl_wbs++; + } + } + if (OP_src(op) == SRC_DIRECT_SET) { + if ( (evict->serverFlags & ITEM_SIZE_CHANGE) || ((evict->serverFlags & ITEM_WAS_HIT) == 0)) { + this->stats.set_misses_l1++; + } else if (OP_excl(op) && evict->serverFlags & ITEM_WAS_HIT) { + this->stats.set_excl_hits_l1++; + } else if (OP_incl(op) && evict->serverFlags & ITEM_WAS_HIT) { + this->stats.set_incl_hits_l1++; + } + } + } + finish_op(op,1); + break; + case Operation::TOUCH: + finish_op(op,1); + break; + default: + fprintf(stderr,"op: %p, key: %s opaque: %u\n",(void*)op,op->key,op->opaque); + DIE("not implemented"); + } + + if (evict) { + if (evict->evictedKey) free(evict->evictedKey); + if (evict->evictedData) free(evict->evictedData); + free(evict); + } + + } + + + double now = get_time(); + + last_tx = now; + stats.log_op(op_queue_size[1]); + stats.log_op(op_queue_size[2]); + //for (int i = 1; i 
<= 2; i++) { + // fprintf(stderr,"max issue buf n[%d]: %u\n",i,max_n[i]); + //} + drive_write_machine(); + + // update events + //if (bev != NULL) { + // // no pending response (nothing to read) and output buffer empty (nothing to write) + // if ((op_queue.size() == 0) && (evbuffer_get_length(bufferevent_get_output(bev)) == 0)) { + // bufferevent_disable(bev, EV_WRITE|EV_READ); + // } + //} +} + +/** + * Handle incoming data (responses). + */ +void ConnectionMultiApproxBatch::read_callback2_v1() { + struct evbuffer *input = bufferevent_get_input(bev2); + + Operation *op = NULL; + bool done, found; + + //initially assume found (for sets that may come through here) + //is this correct? do we want to assume true in case that + //GET was found, but wrong value size (i.e. update value) + found = true; + + + //if (op_queue.size() == 0) V("Spurious read callback."); + bool full_read = true; + while (full_read) { + + + int opcode; + uint32_t opaque; + full_read = handle_response(this,input, done, found, opcode, opaque, NULL,2); + if (full_read) { + if (opcode == CMD_NOOP) { +#ifdef DEBUGMC + char out[128]; + sprintf(out,"conn l2: %u, reading noop\n",cid); + write(2,out,strlen(out)); +#endif + continue; + } + op = op_queue[2][opaque]; +#ifdef DEBUGMC + char out[128]; + sprintf(out,"conn l2: %u, reading opaque: %u\n",cid,opaque); + write(2,out,strlen(out)); + output_op(op,2,found); +#endif + if (strlen(op->key) < 1) { +#ifdef DEBUGMC + char out2[128]; + sprintf(out2,"conn l2: %u, bad op: %s\n",cid,op->key); + write(2,out2,strlen(out2)); +#endif + continue; + } + } else { + break; + } + + + double now = get_time(); + switch (op->type) { + case Operation::GET: + if (done) { + if ( !found && (options.getset || options.getsetorset) ) {// && + //(options.twitter_trace != 1)) { + int valuelen = op->valuelen; + int index = lrand48() % (1024 * 1024); + int flags = OP_clu(op) | SRC_L2_M | LOG_OP; + //int ret = add_to_copy_keys(string(op->key)); + //if (ret == 1) { + 
issue_set(op->key, &random_char[index], valuelen, now, flags | ITEM_L1); + if (OP_incl(op)) { + issue_set(op->key, &random_char[index], valuelen, now, flags | ITEM_L2); + last_quiet2 = false; + } + //} + last_quiet1 = false; + finish_op(op,0); // sets read_state = IDLE + + } else { + if (found) { + int valuelen = op->valuelen; + int index = lrand48() % (1024 * 1024); + int flags = OP_clu(op) | ITEM_L1 | SRC_L1_COPY; + string key = string(op->key); + const char *data = &random_char[index]; + //int ret = add_to_copy_keys(string(op->key)); + //if (ret == 1) { + issue_set(op->key,data,valuelen, now, flags); + //} + this->stats.copies_to_l1++; + //djb: this is automatically done in the L2 server + //if (OP_excl(op)) { //djb: todo should we delete here for approx or just let it die a slow death? + // issue_delete(op->key,now, ITEM_L2 | SRC_L1_COPY ); + //} + finish_op(op,1); + + } else { + finish_op(op,0); + } + } + } else { + char out[128]; + sprintf(out,"conn l2: %u, not done reading, should do something",cid); + write(2,out,strlen(out)); + } + break; + case Operation::SET: + //if (OP_src(op) == SRC_WB) { + // del_wb_keys(string(op->key)); + //} + finish_op(op,1); + break; + case Operation::TOUCH: + if (OP_src(op) == SRC_DIRECT_SET || SRC_L1_H) { + int valuelen = op->valuelen; + if (!found) { + int index = lrand48() % (1024 * 1024); + issue_set(op->key, &random_char[index],valuelen,now, ITEM_INCL | ITEM_L2 | LOG_OP | SRC_L2_M); + this->stats.set_misses_l2++; + } else { + if (OP_src(op) == SRC_DIRECT_SET) { + issue_touch(op->key,valuelen,now, ITEM_L1 | SRC_L2_H | ITEM_DIRTY); + } + } + //del_touch_keys(string(op->key)); + } + finish_op(op,0); + break; + case Operation::DELETE: + //check to see if it was a hit + //fprintf(stderr," del %s -- %d from %d\n",op->key.c_str(),found,OP_src(op)); + if (OP_src(op) == SRC_DIRECT_SET) { + if (found) { + this->stats.delete_hits_l2++; + } else { + this->stats.delete_misses_l2++; + } + } + finish_op(op,1); + break; + default: + 
fprintf(stderr,"op: %p, key: %s opaque: %u\n",(void*)op,op->key,op->opaque); + DIE("not implemented"); + } + + } + + double now = get_time(); + + last_tx = now; + stats.log_op(op_queue_size[2]); + stats.log_op(op_queue_size[1]); + drive_write_machine(); + + // update events + //if (bev != NULL) { + // // no pending response (nothing to read) and output buffer empty (nothing to write) + // if ((op_queue.size() == 0) && (evbuffer_get_length(bufferevent_get_output(bev)) == 0)) { + // bufferevent_disable(bev, EV_WRITE|EV_READ); + // } + //} +} diff --git a/ConnectionMultiApproxBatchShm.cc b/ConnectionMultiApproxBatchShm.cc new file mode 100644 index 0000000..21e7593 --- /dev/null +++ b/ConnectionMultiApproxBatchShm.cc @@ -0,0 +1,1645 @@ +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + + +#include "config.h" + +#include "Connection.h" +#include "distributions.h" +#include "Generator.h" +#include "mutilate.h" +#include "binary_protocol.h" +#include "util.h" +#include +#include +#include +#include +#include +#include "blockingconcurrentqueue.h" + +//#include +//#include + +#define ITEM_L1 1 +#define ITEM_L2 2 +#define LOG_OP 4 +#define SRC_L1_M 8 +#define SRC_L1_H 16 +#define SRC_L2_M 32 +#define SRC_L2_H 64 +#define SRC_DIRECT_SET 128 +#define SRC_L1_COPY 256 +#define SRC_WB 512 + +#define ITEM_INCL 4096 +#define ITEM_EXCL 8192 +#define ITEM_DIRTY 16384 +#define ITEM_SIZE_CHANGE 131072 +#define ITEM_WAS_HIT 262144 + +#define LEVELS 2 +#define SET_INCL(incl,flags) \ + switch (incl) { \ + case 1: \ + flags |= ITEM_INCL; \ + break; \ + case 2: \ + flags |= ITEM_EXCL; \ + break; \ + \ + } \ + +#define GET_INCL(incl,flags) \ + if (flags & ITEM_INCL) incl = 1; \ + else if (flags & ITEM_EXCL) incl = 2; \ + +//#define OP_level(op) ( ((op)->flags & ITEM_L1) ? 
ITEM_L1 : ITEM_L2 )
// Extract the level bits (ITEM_L1/ITEM_L2) from an op's flags by masking
// out every non-level bit.
#define OP_level(op) ( (op)->flags & ~(LOG_OP | \
    ITEM_INCL | ITEM_EXCL | ITEM_DIRTY | \
    SRC_L1_M | SRC_L1_H | SRC_L2_M | SRC_L2_H | \
    SRC_DIRECT_SET | SRC_L1_COPY | SRC_WB ) )

// Same mask applied to a raw flags value instead of an Operation*.
#define FLAGS_level(flags) ( flags & ~(LOG_OP | \
    ITEM_INCL | ITEM_EXCL | ITEM_DIRTY | \
    SRC_L1_M | SRC_L1_H | SRC_L2_M | SRC_L2_H | \
    SRC_DIRECT_SET | SRC_L1_COPY | SRC_WB ) )

// Extract the clusivity bits (ITEM_INCL/ITEM_EXCL) from an op's flags.
#define OP_clu(op) ( (op)->flags & ~(LOG_OP | \
    ITEM_L1 | ITEM_L2 | ITEM_DIRTY | \
    SRC_L1_M | SRC_L1_H | SRC_L2_M | SRC_L2_H | \
    SRC_DIRECT_SET | SRC_L1_COPY | SRC_WB ) )

// Extract the source bits (SRC_*) from an op's flags.
#define OP_src(op) ( (op)->flags & ~(ITEM_L1 | ITEM_L2 | LOG_OP | \
    ITEM_INCL | ITEM_EXCL | ITEM_DIRTY ) )

#define OP_log(op) ((op)->flags & LOG_OP)
#define OP_incl(op) ((op)->flags & ITEM_INCL)
#define OP_excl(op) ((op)->flags & ITEM_EXCL)
#define OP_set_flag(op,flag) ((op))->flags |= flag;

//#define DEBUGMC
//#define DEBUGS



// Guards the shared connection-id counter below.
pthread_mutex_t cid_lock_m_approx_batch_shm = PTHREAD_MUTEX_INITIALIZER;
static uint32_t connids_m = 1;

#define NCLASSES 40
#define CHUNK_ALIGN_BYTES 8
// Slab-class table mirroring the server's size classes; filled once by
// init_classes() from the first connection.
static int classes = 0;
static int sizes[NCLASSES+1];
static int inclusives[NCLASSES+1];



// Parse a dash-separated inclusivity string (e.g. "1-2-1-...") into the
// per-class inclusives[] table; classes are 1-indexed.
static void init_inclusives(char *inclusive_str) {
  int j = 1;
  for (int i = 0; i < (int)strlen(inclusive_str); i++) {
    if (inclusive_str[i] == '-') {
      continue;
    } else {
      inclusives[j] = inclusive_str[i] - '0';
      j++;
    }
  }
}

// Rebuild the memcached slab-class size table (growth factor 1.25, base 96,
// 8-byte alignment, max chunk 512KB). Must match the server's configuration.
static void init_classes() {

  double factor = 1.25;
  //unsigned int chunk_size = 48;
  //unsigned int item_size = 24;
  unsigned int size = 96; //warning if you change this you die
  unsigned int i = 0;
  unsigned int chunk_size_max = 1048576/2;
  while (++i < NCLASSES-1) {
    if (size >= chunk_size_max / factor) {
      break;
    }
    if (size % CHUNK_ALIGN_BYTES)
      size += CHUNK_ALIGN_BYTES - (size % CHUNK_ALIGN_BYTES);
    sizes[i] = size;
    size *= factor;
  }
  sizes[i] = chunk_size_max;
  classes = i;

}

// Map a value length + key length to its slab class index, or -1 if the
// item exceeds the largest class. The +48+1+2 overhead must match the
// server's per-item header size.
static int get_class(int vl, uint32_t kl) {
  //warning if you change this you die
  int vsize = vl+kl+48+1+2;
  int res = 1;
  while (vsize > sizes[res])
    if (res++ == classes) {
      //fprintf(stderr,"item larger than max class size. vsize: %d, class size: %d\n",vsize,sizes[res]);
      return -1;
    }
  return res;
}

//static int get_incl(int vl, int kl) {
//  int clsid = get_class(vl,kl);
//  if (clsid) {
//    return inclusives[clsid];
//  } else {
//    return -1;
//  }
//}


// Debug helper: write a one-line description of an op (issue/response,
// key, opaque, connection read_state) to stderr.
void ConnectionMultiApproxBatchShm::output_op(Operation *op, int type, bool found) {
  char output[1024];
  char k[256];
  char a[256];
  char s[256];
  memset(k,0,256);
  memset(a,0,256);
  memset(s,0,256);
  strncpy(k,op->key,255);
  switch (type) {
    case 0: //get
      sprintf(a,"issue_get");
      break;
    case 1: //set
      sprintf(a,"issue_set");
      break;
    case 2: //resp
      sprintf(a,"resp");
      break;
  }
  switch(read_state) {
    case INIT_READ:
      sprintf(s,"init");
      break;
    case CONN_SETUP:
      sprintf(s,"setup");
      break;
    case LOADING:
      sprintf(s,"load");
      break;
    case IDLE:
      sprintf(s,"idle");
      break;
    case WAITING_FOR_GET:
      sprintf(s,"waiting for get");
      break;
    case WAITING_FOR_SET:
      sprintf(s,"waiting for set");
      break;
    case WAITING_FOR_DELETE:
      sprintf(s,"waiting for del");
      break;
    case MAX_READ_STATE:
      sprintf(s,"max");
      break;
  }
  if (type == 2) {
    sprintf(output,"conn: %u, action: %s op: %s, opaque: %u, found: %d, type: %d\n",cid,a,k,op->opaque,found,op->type);
  } else {
    sprintf(output,"conn: %u, action: %s op: %s, opaque: %u, type: %d\n",cid,a,k,op->opaque,op->type);
  }
  size_t res = write(2,output,strlen(output));
  if (res != strlen(output)) {
    fprintf(stderr,"error outputingiii\n");
  }
}

// NOTE(review): the template arguments of these externs appear stripped in
// this excerpt (e.g. "unordered_map>") — restore from the original source.
extern unordered_map cid_rate;
extern unordered_map> copy_keys;
extern unordered_map touch_keys;
extern unordered_map> wb_keys;

extern map g_key_hist;
extern int max_n[3];

/**
 * Create a new connection to a server endpoint.
 */
ConnectionMultiApproxBatchShm::ConnectionMultiApproxBatchShm(options_t _options, bool sampling) :
  start_time(0), stats(sampling), options(_options)
{
  // connection ids are global across threads; the first connection also
  // initializes the shared slab-class / inclusivity tables
  pthread_mutex_lock(&cid_lock_m_approx_batch_shm);
  cid = connids_m++;
  if (cid == 1) {
    init_classes();
    init_inclusives(options.inclusives);
  }
  //cid_rate.insert( { cid, 0 } );

  pthread_mutex_unlock(&cid_lock_m_approx_batch_shm);

  valuesize = createGenerator(options.valuesize);
  keysize = createGenerator(options.keysize);
  srand(time(NULL));
  keygen = new KeyGenerator(keysize, options.records);

  total = 0;
  eof = 0;
  o_percent = 0;

  // inter-arrival generator: lambda <= 0 means issue as fast as possible
  if (options.lambda <= 0) {
    iagen = createGenerator("0");
  } else {
    D("iagen = createGenerator(%s)", options.ia);
    iagen = createGenerator(options.ia);
    iagen->set_lambda(options.lambda);
  }

  read_state = IDLE;
  write_state = INIT_WRITE;
  last_quiet1 = false;
  last_quiet2 = false;

  last_tx = last_rx = 0.0;
  gets = 0;
  ghits = 0;
  esets = 0;
  isets = 0;
  // randomized intervals for periodic touch/set sampling
  gloc = rand() % (10*2-1)+1;
  sloc = rand() % (10*2-1)+1;
  iloc = rand() % (10*2-1)+1;

  // per-level bookkeeping, indexed 1..LEVELS (slot 0 unused)
  op_queue_size = (uint32_t*)malloc(sizeof(uint32_t)*(LEVELS+1));
  issued_queue = (uint32_t*)malloc(sizeof(uint32_t)*(LEVELS+1));
  opaque = (uint32_t*)malloc(sizeof(uint32_t)*(LEVELS+1));
  op_queue = (Operation***)malloc(sizeof(Operation**)*(LEVELS+1));

  for (int i = 0; i <= LEVELS; i++) {
    op_queue_size[i] = 0;
    issued_queue[i] = 0;
    opaque[i] = 1;
    //op_queue[i] = (Operation*)malloc(sizeof(int)*OPAQUE_MAX);
    // outstanding ops indexed by protocol opaque value
    op_queue[i] = (Operation**)malloc(sizeof(Operation*)*(OPAQUE_MAX+1));
    for (int j = 0; j <= OPAQUE_MAX; j++) {
      op_queue[i][j] = NULL;
    }

  }


  read_state = IDLE;
}


// Attach the trace to replay and allocate the per-level read/write buffers.
// NOTE(review): the parameter's template arguments appear stripped in this
// excerpt (likely queue<Operation*>*) — restore from the original source.
void ConnectionMultiApproxBatchShm::set_queue(queue* a_trace_queue) {
  trace_queue = a_trace_queue;
  trace_queue_n = a_trace_queue->size();
  //Operation *Op = trace_queue->front();
  //incl_ = get_incl(Op->valuelen,strlen(Op->key));
  //clsid_ = get_class(Op->valuelen,strlen(Op->key));

  buffer_size_ = 1024*1024*4;
  //setup the buffers
  //max is (valuelen + 256 + 24 + 4 + 4 ) * depth
  for (int i = 1; i <= LEVELS; i++) {
    buffer_write[i] = (unsigned char*)malloc(buffer_size_);
    buffer_read[i] = (unsigned char*)malloc(buffer_size_);
    buffer_leftover[i] = (unsigned char*)malloc(buffer_size_);
    memset(buffer_read[i],0,buffer_size_);
    memset(buffer_leftover[i],0,buffer_size_);
    buffer_write_n[i] = 0;
    buffer_read_n[i] = 0;
    buffer_write_nbytes[i] = 0;
    buffer_read_nbytes[i] = 0;
    buffer_write_pos[i] = buffer_write[i];
    buffer_read_pos[i] = buffer_read[i];
    buffer_lasthdr[i] = 0; // buffer_read[i];
  }

}

void ConnectionMultiApproxBatchShm::set_lock(pthread_mutex_t* a_lock) {
  lock = a_lock;
}

// NOTE(review): template arguments stripped in this excerpt.
void ConnectionMultiApproxBatchShm::set_g_wbkeys(unordered_map> *a_wb_keys) {
  g_wb_keys = a_wb_keys;
}

uint32_t ConnectionMultiApproxBatchShm::get_cid() {
  return cid;
}

// Register a pending writeback key; returns 1 if newly inserted, 2 if the
// key already had a pending writeback.
int ConnectionMultiApproxBatchShm::add_to_wb_keys(string key) {
  auto pos = wb_keys.find(key);
  if (pos == wb_keys.end()) {
    wb_keys.insert( {key, vector() });
    return 1;
  }
  return 2;
}

// Complete a writeback: remove the key and issue any ops queued behind it.
void ConnectionMultiApproxBatchShm::del_wb_keys(string key) {

  auto position = wb_keys.find(key);
  if (position != wb_keys.end()) {
    vector op_list = vector(position->second);
    wb_keys.erase(position);
    for (auto it = op_list.begin(); it != op_list.end(); ++it) {
      issue_op(*it);
    }
  } else {
    fprintf(stderr,"expected wb %s, got nuthin\n",key.c_str());
  }
}

// Register a pending L2->L1 copy key; returns 1 if newly inserted, 2 if
// a copy was already pending for this key.
int ConnectionMultiApproxBatchShm::add_to_copy_keys(string key) {
  auto pos = copy_keys.find(key);
  if (pos == copy_keys.end()) {
    copy_keys.insert( {key, vector() });
    return 1;
  }
  return 2;
}


// Complete a copy: remove the key and issue any ops queued behind it.
void ConnectionMultiApproxBatchShm::del_copy_keys(string key) {

  auto position = copy_keys.find(key);
  if (position != copy_keys.end()) {
    vector op_list = vector(position->second);
    copy_keys.erase(position);
    for (auto it = op_list.begin(); it != op_list.end(); ++it) {
      issue_op(*it);
    }
+ } else { + fprintf(stderr,"expected copy %s, got nuthin\n",key.c_str()); + } +} + +int ConnectionMultiApproxBatchShm::add_to_touch_keys(string key) { + //return touch_keys.assign_if_equal( key, NULL, cid ) != NULL ? 1 : 2; + auto pos = touch_keys.find(key); + if (pos == touch_keys.end()) { + touch_keys.insert( {key, cid }); + return 1; + } + return 2; +} + + +void ConnectionMultiApproxBatchShm::del_touch_keys(string key) { + //touch_keys.erase(key); + auto position = touch_keys.find(key); + if (position != touch_keys.end()) { + touch_keys.erase(position); + } else { + fprintf(stderr,"expected touch %s, got nuthin\n",key.c_str()); + } +} + +int ConnectionMultiApproxBatchShm::issue_op(Operation *Op) { + double now = get_time(); + int issued = 0; + Op->clsid = get_class(Op->valuelen,strlen(Op->key)); + int flags = 0; + int index = lrand48() % (1024 * 1024); + int incl = inclusives[Op->clsid]; + SET_INCL(incl,flags); + + switch(Op->type) { + + case Operation::GET: + issued = issue_get_with_len(Op, now, false, flags | LOG_OP | ITEM_L1); + this->stats.gets++; + gets++; + //this->stats.gets_cid[cid]++; + break; + case Operation::SET: + if (incl == 1) { + if (isets >= iloc) { + const char *data = &random_char[index]; + issued = issue_set(Op, data, now, flags | LOG_OP | ITEM_L1 | SRC_DIRECT_SET); + issued = issue_touch(Op->key,Op->valuelen,now, ITEM_L2 | SRC_DIRECT_SET); + iloc += rand()%(10*2-1)+1; + } else { + issued = issue_set(Op, &random_char[index], now, flags | LOG_OP | ITEM_L1 | SRC_DIRECT_SET | ITEM_DIRTY); + } + isets++; + } else if (incl == 2) { + issued = issue_set(Op, &random_char[index], now, flags | LOG_OP | ITEM_L1 | SRC_DIRECT_SET); + if (esets >= sloc) { + issued = issue_delete(Op->key,now,ITEM_L2 | SRC_DIRECT_SET); + sloc += rand()%(10*2-1)+1; + } + esets++; + } + this->stats.sets++; + //this->stats.sets_cid[cid]++; + break; + case Operation::DELETE: + case Operation::TOUCH: + case Operation::NOOP: + case Operation::SASL: + fprintf(stderr,"invalid line: 
%s, vl: %d\n",Op->key,Op->valuelen); + break; + + } + return issued; +} + + +int ConnectionMultiApproxBatchShm::do_connect() { + + int connected = 0; + //the client should see for this cid, where the shared memory is + typedef struct shared_ { + bipbuf_t bipbuf_in; + bipbuf_t bipbuf_out; + pthread_mutex_t lock_in; + pthread_mutex_t lock_out; + pthread_cond_t cond_in_not_empty; + pthread_cond_t cond_in_not_full; + pthread_cond_t cond_out_not_empty; + pthread_cond_t cond_out_not_full; + int bipbuf_in_bytes; + int bipbuf_out_bytes; + int shared_id; + } shared_t; + + //this cid gets shared memory + // ftok to generate unique key + //char shmkey[64]; + //sprintf(shmkey,"shmfilel1%d",cid); + int id = cid+100; + //key_t key = ftok(shmkey,id); + + // shmget returns an identifier in shmid + int shmid = shmget(id,sizeof(shared_t),0666); + + // shmat to attach to shared memory + shared_t* share_l1 = (shared_t*) shmat(shmid,(void*)0,0); + + fprintf(stderr,"cid %d gets shared memory buf l1 %d\n",cid,share_l1->shared_id); + + // ftok to generate unique key + //char shmkey2[64]; + //sprintf(shmkey2,"shmfilel2%d",cid); + int id2 = cid+200; + //key_t key2 = ftok(shmkey2,id2); + + // shmget returns an identifier in shmid + int shmid2 = shmget(id2,sizeof(shared_t),0666); + + // shmat to attach to shared memory + shared_t* share_l2 = (shared_t*) shmat(shmid2,(void*)0,0); + + fprintf(stderr,"cid %d gets shared memory buf l2 %d\n",cid,share_l2->shared_id); + connected = 1; + + //the leads are reveresed (from perspective of server) + bipbuf_in[1] = &share_l1->bipbuf_out; + bipbuf_in[2] = &share_l2->bipbuf_out; + bipbuf_out[1] = &share_l1->bipbuf_in; + bipbuf_out[2] = &share_l2->bipbuf_in; + + bipbuf_in_bytes[1] = &share_l1->bipbuf_out_bytes; + bipbuf_in_bytes[2] = &share_l2->bipbuf_out_bytes; + bipbuf_out_bytes[1] = &share_l1->bipbuf_in_bytes; + bipbuf_out_bytes[2] = &share_l2->bipbuf_in_bytes; + + lock_in[1] = &share_l1->lock_out; + lock_in[2] = &share_l2->lock_out; + lock_out[1] = 
&share_l1->lock_in; + lock_out[2] = &share_l2->lock_in; + + cond_in_not_empty[1] = &share_l1->cond_out_not_empty; + cond_in_not_empty[2] = &share_l2->cond_out_not_empty; + cond_in_not_full[1] = &share_l1->cond_out_not_full; + cond_in_not_full[2] = &share_l2->cond_out_not_full; + cond_out_not_empty[1] = &share_l1->cond_in_not_empty; + cond_out_not_empty[2] = &share_l2->cond_in_not_empty; + cond_out_not_full[1] = &share_l1->cond_in_not_full; + cond_out_not_full[2] = &share_l2->cond_in_not_full; + read_state = IDLE; + return connected; +} + +/** + * Destroy a connection, performing cleanup. + */ +ConnectionMultiApproxBatchShm::~ConnectionMultiApproxBatchShm() { + + + for (int i = 0; i <= LEVELS; i++) { + free(op_queue[i]); + if (i > 0) { + free(buffer_write[i]); + free(buffer_read[i]); + } + + } + + free(op_queue_size); + free(opaque); + free(op_queue); + + delete iagen; + delete keygen; + delete keysize; + delete valuesize; +} + +/** + * Reset the connection back to an initial, fresh state. + */ +void ConnectionMultiApproxBatchShm::reset() { + // FIXME: Actually check the connection, drain all bufferevents, drain op_q. + //assert(op_queue.size() == 0); + //evtimer_del(timer); + read_state = IDLE; + write_state = INIT_WRITE; + stats = ConnectionStats(stats.sampling); +} + + + + +/** + * Get/Set or Set Style + * If a GET command: Issue a get first, if not found then set + * If trace file (or prob. 
write) says to set, then set it + */ +int ConnectionMultiApproxBatchShm::issue_getsetorset(double now) { + + Operation *Op = trace_queue->front(); + if (Op->type == Operation::SASL) { + //cid_rate.insert( {cid, 100 } ); + //fprintf(stderr,"cid %d done before loop\n",cid); + //string op_queue1; + //string op_queue2; + //for (int j = 0; j < 2; j++) { + // for (int i = 0; i < OPAQUE_MAX; i++) { + // if (op_queue[j+1][i] != NULL) { + // if (j == 0) { + // op_queue1 = op_queue1 + "," + op_queue[j+1][i]->key; + // } else { + // op_queue2 = op_queue2 + "," + op_queue[j+1][i]->key; + // } + // } + // } + //} + for (int i = 1; i <= LEVELS; i++) { + if (buffer_write_n[i] > 0) { + send_write_buffer(i); + } + } + eof = 1; + //fprintf(stderr,"cid %d op_queue1: %s op_queue2: %s, op_queue_size1: %d, op_queue_size2: %d\n",cid,op_queue1.c_str(),op_queue2.c_str(),op_queue_size[1],op_queue_size[2]); + return 1; + } + + int issued = issue_op(Op); + trace_queue->pop(); + while (issued != 2) { + Op = trace_queue->front(); + + if (Op->type == Operation::SASL) { + for (int i = 1; i <= LEVELS; i++) { + if (buffer_write_n[i] > 0) { + send_write_buffer(i); + } + } + //string op_queue1; + //string op_queue2; + //for (int j = 0; j < 2; j++) { + // for (int i = 0; i < OPAQUE_MAX; i++) { + // if (op_queue[j+1][i] != NULL) { + // if (j == 0) { + // op_queue1 = op_queue1 + "," + op_queue[j+1][i]->key; + // } else { + // op_queue2 = op_queue2 + "," + op_queue[j+1][i]->key; + // } + // } + // } + //} + //fprintf(stderr,"done in loop cid %d op_queue1: %s op_queue2: %s, op_queue_size1: %d, op_queue_size2: %d\n",cid,op_queue1.c_str(),op_queue2.c_str(),op_queue_size[1],op_queue_size[2]); + eof = 1; + return 1; + } + issued = issue_op(Op); + trace_queue->pop(); + } + + return 0; +} + +int ConnectionMultiApproxBatchShm::send_write_buffer(int level) { + int rc = 1; + pthread_mutex_lock(lock_out[level]); + int to_write = buffer_write_nbytes[level]; + int gtg = bipbuf_unused(bipbuf_out[level]) >= to_write ? 
1 : 0; + while (gtg == 0) { + pthread_cond_wait(cond_out_not_full[level],lock_out[level]); + gtg = bipbuf_unused(bipbuf_out[level]) >= to_write ? 1 : 0; + } + int ret = bipbuf_offer(bipbuf_out[level],buffer_write[level],to_write); + if (ret != to_write) { + fprintf(stderr,"error writing buffer! level %d, size %d\n",level,to_write); + } + *bipbuf_out_bytes[level] += to_write; + //fprintf(stderr,"writing %d to %d, total %d\n",to_write,level,*bipbuf_out_bytes[level]); + issued_queue[level] = buffer_write_n[level]; + buffer_write_n[level] = 0; + buffer_write_pos[level] = buffer_write[level]; + memset(buffer_write_pos[level],0,buffer_write_nbytes[level]); + stats.tx_bytes += buffer_write_nbytes[level]; + buffer_write_nbytes[level] = 0; + rc = 2; + pthread_cond_signal(cond_out_not_empty[level]); + pthread_mutex_unlock(lock_out[level]); + return rc; +} + +int ConnectionMultiApproxBatchShm::add_get_op_to_queue(Operation *pop, int level, int cb) { + + op_queue[level][pop->opaque] = pop; + op_queue_size[level]++; +#ifdef DEBUGS + fprintf(stderr,"cid: %d issing get: %s, size: %u, level %d, flags: %d, opaque: %d\n",cid,pop->key,pop->valuelen,level,pop->flags,pop->opaque); +#endif + + if (opaque[level] > OPAQUE_MAX) { + opaque[level] = 1; + } + + uint16_t keylen = strlen(pop->key); + + // each line is 4-bytes + binary_header_t h = { 0x80, CMD_GET, htons(keylen), + 0x00, 0x00, htons(0), + htonl(keylen) }; + //if (quiet) { + // h.opcode = CMD_GETQ; + //} + h.opaque = htonl(pop->opaque); + + + memcpy(buffer_write_pos[level], &h, 24); + buffer_write_pos[level] += 24; + memcpy(buffer_write_pos[level], pop->key, keylen); + buffer_write_pos[level] += keylen; + buffer_write_n[level]++; + buffer_write_nbytes[level] += 24 + keylen; + + int res = 1; + if (buffer_write_n[level] >= (uint32_t)options.depth) { // && cb == 0) { + res = send_write_buffer(level); + } + return res; +} + +/** + * Issue a get request to the server. 
+ */ +int ConnectionMultiApproxBatchShm::issue_get_with_len(Operation *pop, double now, bool quiet, uint32_t flags, Operation *l1) { + + int level = FLAGS_level(flags); + + //initialize op for sending +#if HAVE_CLOCK_GETTIME + pop->start_time = get_time_accurate(); +#else + if (now == 0.0) { +#if USE_CACHED_TIME + struct timeval now_tv; + event_base_gettimeofday_cached(base, &now_tv); + pop->start_time = tv_to_double(&now_tv); +#else + pop->start_time = get_time(); +#endif + } else { + pop->start_time = now; + } +#endif + + pop->opaque = opaque[level]++; + pop->flags = flags; + if (l1 != NULL) { + pop->l1 = l1; + } + + //put op into queue + return add_get_op_to_queue(pop,level,0); +} + +/** + * Issue a get request to the server. + */ +int ConnectionMultiApproxBatchShm::issue_get_with_len(const char* key, int valuelen, double now, bool quiet, uint32_t flags, Operation *l1) { + + int level = FLAGS_level(flags); + Operation *pop = new Operation(); + +#if HAVE_CLOCK_GETTIME + pop->start_time = get_time_accurate(); +#else + if (now == 0.0) { +#if USE_CACHED_TIME + struct timeval now_tv; + event_base_gettimeofday_cached(base, &now_tv); + pop->start_time = tv_to_double(&now_tv); +#else + pop->start_time = get_time(); +#endif + } else { + pop->start_time = now; + } +#endif + + strncpy(pop->key,key,255); + pop->valuelen = valuelen; + pop->type = Operation::GET; + pop->opaque = opaque[level]++; + pop->flags = flags; + pop->clsid = get_class(valuelen,strlen(key)); + + if (l1 != NULL) { + pop->l1 = l1; + } + + return add_get_op_to_queue(pop,level,1); + +} + +/** + * Issue a get request to the server. 
+ */ +int ConnectionMultiApproxBatchShm::issue_touch(const char* key, int valuelen, double now, int flags) { + + int level = FLAGS_level(flags); + Operation *pop = new Operation(); + +#if HAVE_CLOCK_GETTIME + pop->start_time = get_time_accurate(); +#else + if (now == 0.0) { +#if USE_CACHED_TIME + struct timeval now_tv; + event_base_gettimeofday_cached(base, &now_tv); + pop->start_time = tv_to_double(&now_tv); +#else + pop->start_time = get_time(); +#endif + } else { + pop->start_time = now; + } +#endif + + strncpy(pop->key,key,255); + pop->valuelen = valuelen; + pop->type = Operation::TOUCH; + pop->opaque = opaque[level]++; + pop->flags = flags; + op_queue[level][pop->opaque] = pop; + //op_queue[level].push(op); + op_queue_size[level]++; + + if (opaque[level] > OPAQUE_MAX) { + opaque[level] = 1; + } + +#ifdef DEBUGS + fprintf(stderr,"issing touch: %s, size: %u, level %d, flags: %d, opaque: %d\n",key,valuelen,level,flags,pop->opaque); +#endif + //if (read_state == IDLE) read_state = WAITING_FOR_GET; + uint16_t keylen = strlen(key); + + // each line is 4-bytes + binary_header_t h = { 0x80, CMD_TOUCH, htons(keylen), + 0x04, 0x00, htons(0), + htonl(keylen + 4) }; + h.opaque = htonl(pop->opaque); + + uint32_t exp = 0; + if (flags & ITEM_DIRTY) { + exp = htonl(flags); + } + + memcpy(buffer_write_pos[level], &h, 24); + buffer_write_pos[level] += 24; + memcpy(buffer_write_pos[level], &exp, 4); + buffer_write_pos[level] += 4; + memcpy(buffer_write_pos[level], pop->key, keylen); + buffer_write_pos[level] += keylen; + buffer_write_nbytes[level] += 24 + keylen + 4; + buffer_write_n[level]++; + + int ret = 1; + //if (buffer_write_n[level] == (uint32_t)options.depth) { + // ret = send_write_buffer(level); + //} + + return ret; +} + +/** + * Issue a delete request to the server. 
+ */ +int ConnectionMultiApproxBatchShm::issue_delete(const char* key, double now, uint32_t flags) { + + int level = FLAGS_level(flags); + Operation *pop = new Operation(); + +#if HAVE_CLOCK_GETTIME + pop->start_time = get_time_accurate(); +#else + if (now == 0.0) { +#if USE_CACHED_TIME + struct timeval now_tv; + event_base_gettimeofday_cached(base, &now_tv); + pop->start_time = tv_to_double(&now_tv); +#else + pop->start_time = get_time(); +#endif + } else { + pop->start_time = now; + } +#endif + + strncpy(pop->key,key,255); + pop->type = Operation::DELETE; + pop->opaque = opaque[level]++; + pop->flags = flags; + op_queue[level][pop->opaque] = pop; + op_queue_size[level]++; + + if (opaque[level] > OPAQUE_MAX) { + opaque[level] = 1; + } +#ifdef DEBUGS + fprintf(stderr,"cid: %d issing delete: %s, level %d, flags: %d, opaque: %d\n",cid,key,level,flags,pop->opaque); +#endif + + //if (read_state == IDLE) read_state = WAITING_FOR_GET; + uint16_t keylen = strlen(key); + + // each line is 4-bytes + binary_header_t h = { 0x80, CMD_DELETE, htons(keylen), + 0x00, 0x00, htons(0), + htonl(keylen) }; + h.opaque = htonl(pop->opaque); + + memcpy(buffer_write_pos[level], &h, 24); + buffer_write_pos[level] += 24; + memcpy(buffer_write_pos[level], pop->key, keylen); + buffer_write_pos[level] += keylen; + buffer_write_n[level]++; + buffer_write_nbytes[level] += 24 + keylen; + + int ret = 1; + //if (buffer_write_n[level] >= (uint32_t)options.depth) { + // ret = send_write_buffer(level); + //} + + return ret; +} + +int ConnectionMultiApproxBatchShm::issue_noop(int level) { + + binary_header_t h = { 0x80, CMD_NOOP, 0x0000, + 0x00, 0x00, htons(0), + 0x00 }; + memcpy(buffer_write_pos[level], &h, 24); + buffer_write_pos[level] += 24; + + buffer_write_n[level]++; + buffer_write_nbytes[level] += 24; + + int ret = 1; + //if (buffer_write_n[level] >= (uint32_t)options.depth) { + // ret = send_write_buffer(level); + //} + + return ret; +} + +int 
ConnectionMultiApproxBatchShm::add_set_to_queue(Operation *pop, int level, const char* value, int cb) { + int length = pop->valuelen; + + op_queue[level][pop->opaque] = pop; + //op_queue[level].push(op); + op_queue_size[level]++; +#ifdef DEBUGS + fprintf(stderr,"cid: %d issing set: %s, size: %u, level %d, flags: %d, opaque: %d\n",cid,pop->key,length,level,pop->flags,pop->opaque); +#endif + + if (opaque[level] > OPAQUE_MAX) { + opaque[level] = 1; + } + + uint16_t keylen = strlen(pop->key); + + // each line is 4-bytes + binary_header_t h = { 0x80, CMD_SET, htons(keylen), + 0x08, 0x00, htons(0), + htonl(keylen + 8 + length) }; + h.opaque = htonl(pop->opaque); + + uint32_t f = htonl(pop->flags); + uint32_t exp = 0; + //int to_write = buffer_write_nbytes[level] + 32 + keylen + length; + //int gtg = bipbuf_unused(bipbuf_out[level]) >= to_write ? 1 : 0; + //if (gtg == 0) { + // switch (level) { + // case 1: + // read_callback1(); + // break; + // case 2: + // read_callback2(); + // break; + // } + //} + //fprintf(stderr,"write_n[%d] %d bytes: %d\n",level,buffer_write_n[level],buffer_write_nbytes[level]); + memcpy(buffer_write_pos[level], &h, 24); + buffer_write_pos[level] += 24; + memcpy(buffer_write_pos[level], &f, 4); + buffer_write_pos[level] += 4; + memcpy(buffer_write_pos[level], &exp, 4); + buffer_write_pos[level] += 4; + memcpy(buffer_write_pos[level], pop->key, keylen); + buffer_write_pos[level] += keylen; + memcpy(buffer_write_pos[level], value, length); + buffer_write_pos[level] += length; + buffer_write_n[level]++; + buffer_write_nbytes[level] += length + 32 + keylen; + + int ret = 1; + if (buffer_write_n[level] >= (uint32_t)options.depth) { // && cb == 0) { + ret = send_write_buffer(level); + } + return ret; + +} + +/** + * Issue a set request to the server. 
+ */ +int ConnectionMultiApproxBatchShm::issue_set(Operation *pop, const char* value, double now, uint32_t flags) { + + int level = FLAGS_level(flags); + +#if HAVE_CLOCK_GETTIME + pop->start_time = get_time_accurate(); +#else + if (now == 0.0) pop->start_time = get_time(); + else pop->start_time = now; +#endif + + pop->opaque = opaque[level]++; + pop->flags = flags; + return add_set_to_queue(pop,level,value,0); + +} + +/** + * Issue a set request to the server. + */ +int ConnectionMultiApproxBatchShm::issue_set(const char* key, const char* value, int length, double now, uint32_t flags) { + + int level = FLAGS_level(flags); + Operation *pop = new Operation(); + +#if HAVE_CLOCK_GETTIME + pop->start_time = get_time_accurate(); +#else + if (now == 0.0) pop->start_time = get_time(); + else pop->start_time = now; +#endif + + strncpy(pop->key,key,255); + pop->valuelen = length; + pop->type = Operation::SET; + pop->opaque = opaque[level]++; + pop->flags = flags; + pop->clsid = get_class(length,strlen(key)); + + return add_set_to_queue(pop,level,value,1); + +} + + +/** + * Finish up (record stats) an operation that just returned from the + * server. 
+ */ +void ConnectionMultiApproxBatchShm::finish_op(Operation *op, int was_hit) { + double now; +#if USE_CACHED_TIME + struct timeval now_tv; + event_base_gettimeofday_cached(base, &now_tv); + now = tv_to_double(&now_tv); +#else + now = get_time(); +#endif +#if HAVE_CLOCK_GETTIME + op->end_time = get_time_accurate(); +#else + op->end_time = now; +#endif + + if (was_hit) { + switch (op->type) { + case Operation::GET: + switch (OP_level(op)) { + case 1: + stats.log_get_l1(*op); + break; + case 2: + stats.log_get_l2(*op); + if (op->l1 != NULL) { + op->l1->end_time = now; + stats.log_get(*(op->l1)); + } + break; + } + break; + case Operation::SET: + switch (OP_level(op)) { + case 1: + stats.log_set_l1(*op); + break; + case 2: + stats.log_set_l2(*op); + break; + } + break; + case Operation::DELETE: break; + case Operation::TOUCH: break; + default: DIE("Not implemented."); + } + } + + last_rx = now; + uint8_t level = OP_level(op); + if (op->l1 != NULL) { + //delete op_queue[1][op->l1->opaque]; + if (op->l1 == op_queue[1][op->l1->opaque]) { + op_queue[1][op->l1->opaque] = 0; + if (op_queue_size[1] > 0) { + op_queue_size[1]--; + } else { + fprintf(stderr,"chained op_Queue_size[%d] out of sync!!\n",1); + } + delete op->l1; + } else { + fprintf(stderr,"op_queue out of sync! Expected %p, got %p, opa1: %d opaq2: %d\n", + op,op_queue[1][op->opaque],op->opaque,op_queue[1][op->opaque]->opaque); + } + } + //op_queue[level].erase(op_queue[level].begin()+opopq); + if (op == op_queue[level][op->opaque] && + op->opaque == op_queue[level][op->opaque]->opaque) { + //delete op_queue[level][op->opaque]; + op_queue[level][op->opaque] = 0; + delete op; + if (op_queue_size[level] > 0) { + op_queue_size[level]--; + } else { + fprintf(stderr,"op_Queue_size[%d] out of sync!!\n",level); + } + } else { + fprintf(stderr,"op_queue out of sync! 
Expected %p, got %p, opa1: %d opaq2: %d\n", + op,op_queue[level][op->opaque],op->opaque,op_queue[level][op->opaque]->opaque); + } + read_state = IDLE; + +} + + + +/** + * Request generation loop. Determines whether or not to issue a new command, + * based on timer events. + * + * Note that this function loops. Be wary of break vs. return. + */ +void ConnectionMultiApproxBatchShm::drive_write_machine_shm(double now) { + + while (trace_queue->size() > 0) { + Operation *Op = trace_queue->front(); + if (Op == NULL || trace_queue->size() <= 0 || Op->type == Operation::SASL) { + eof = 1; + for (int i = 1; i <= LEVELS; i++) { + if (buffer_write_n[i] > 0) { + send_write_buffer(i); + } + } + + cid_rate.insert( {cid, 100 } ); + fprintf(stderr,"cid %d done\n",cid); + string op_queue1; + string op_queue2; + for (int j = 0; j < 2; j++) { + for (int i = 0; i < OPAQUE_MAX; i++) { + if (op_queue[j+1][i] != NULL) { + if (j == 0) { + op_queue1 = op_queue1 + "," + op_queue[j+1][i]->key; + } else { + op_queue2 = op_queue2 + "," + op_queue[j+1][i]->key; + } + } + } + } + fprintf(stderr,"cid %d op_queue1: %s op_queue2: %s, op_queue_size1: %d, op_queue_size2: %d\n",cid,op_queue1.c_str(),op_queue2.c_str(),op_queue_size[1],op_queue_size[2]); + return; + } + int issued = 0; + while (issued != 2) { + Op = trace_queue->front(); + + if (Op->type == Operation::SASL) { + for (int i = 1; i <= LEVELS; i++) { + if (buffer_write_n[i] > 0) { + send_write_buffer(i); + } + } + //string op_queue1; + //string op_queue2; + //for (int j = 0; j < 2; j++) { + // for (int i = 0; i < OPAQUE_MAX; i++) { + // if (op_queue[j+1][i] != NULL) { + // if (j == 0) { + // op_queue1 = op_queue1 + "," + op_queue[j+1][i]->key; + // } else { + // op_queue2 = op_queue2 + "," + op_queue[j+1][i]->key; + // } + // } + // } + //} + //fprintf(stderr,"done in loop cid %d op_queue1: %s op_queue2: %s, op_queue_size1: %d, op_queue_size2: %d\n",cid,op_queue1.c_str(),op_queue2.c_str(),op_queue_size[1],op_queue_size[2]); + eof = 1; + 
return; + } + issued = issue_op(Op); //this will return 2 if the write buffer was sent (i.e. buffer has depth commands) + trace_queue->pop(); + } + if ( (int)(issued_queue[1]) > 0) { + read_callback1(); + issued_queue[1] = 0; + } + if ( (int)(issued_queue[2]) > 0) { + read_callback2(); + issued_queue[2] = 0; + } + } +} + +size_t ConnectionMultiApproxBatchShm::handle_response_batch(unsigned char *rbuf_pos, resp_t *resp, + size_t read_bytes, size_t consumed_bytes, + int level, int extra) { + if (rbuf_pos[0] != 129) { + fprintf(stderr,"cid %d we don't have a valid header %u\n",cid,rbuf_pos[0]); + //buffer_read_pos[level] = rbuf_pos; + //buffer_read_n[level] = 1; + return 0; + } + if ((read_bytes+extra - consumed_bytes) < 24) { + size_t have = (read_bytes+extra) - (consumed_bytes); + size_t need = 24 - (have); + buffer_read_n[level] = need; + buffer_read_nbytes[level] = have; + memcpy(buffer_leftover[level],rbuf_pos,have); + //buffer_lasthdr[level] = rbuf_pos; + //buffer_read_n[level] = need; + //buffer_read_nbytes[level] = have; + fprintf(stderr,"cid %d - we don't have enough header data, need %lu more bytes, have %lu (targetLen: %d) (read_bytes %ld) (extra %d) %d)\n",cid,need,have,24,read_bytes,extra,level); + return 0; + + } + + binary_header_t* h = reinterpret_cast(rbuf_pos); + uint32_t bl = ntohl(h->body_len); + uint16_t kl = ntohs(h->key_len); + uint8_t el = h->extra_len; + int targetLen = 24 + bl; + if (consumed_bytes + targetLen > (read_bytes+extra)) { + size_t have = (read_bytes+extra) - (consumed_bytes); + size_t need = targetLen - (have); + buffer_read_n[level] = need; + buffer_read_nbytes[level] = have; + memcpy(buffer_leftover[level],rbuf_pos,have); + fprintf(stderr,"cid %d - we don't have enough data, need %lu more bytes, have %lu (targetLen: %d) (read_bytes %ld) (extra %d) %d)\n",cid,need,have,targetLen,read_bytes,extra,level); + return 0; + } + + resp->opcode = h->opcode; + resp->opaque = ntohl(h->opaque); + uint16_t status = ntohs(h->status); +#ifdef 
DEBUGMC + fprintf(stderr,"cid: %d handle resp from l%d - opcode: %u opaque: %u keylen: %u extralen: %u datalen: %u status: %u\n",cid,level, + h->opcode,ntohl(h->opaque),ntohs(h->key_len),h->extra_len, + ntohl(h->body_len),ntohs(h->status)); +#endif + // If something other than success, count it as a miss + if (resp->opcode == CMD_GET && status == RESP_NOT_FOUND) { + switch(level) { + case 1: + stats.get_misses_l1++; + break; + case 2: + stats.get_misses_l2++; + stats.get_misses++; + stats.window_get_misses++; + break; + } + resp->found = false; + } else if (resp->opcode == CMD_SET && kl > 0) { + //first data is extras: clsid, flags, eflags + if (resp->evict) { + unsigned char *buf = rbuf_pos + 24; + resp->evict->clsid = *((uint32_t*)buf); + resp->evict->clsid = ntohl(resp->evict->clsid); + buf += 4; + + resp->evict->serverFlags = *((uint32_t*)buf); + resp->evict->serverFlags = ntohl(resp->evict->serverFlags); + buf += 4; + + resp->evict->evictedFlags = *((uint32_t*)buf); + resp->evict->evictedFlags = ntohl(resp->evict->evictedFlags); + buf += 4; + + resp->evict->evictedKeyLen = kl; + resp->evict->evictedKey = (char*)malloc(kl+1); + memset(resp->evict->evictedKey,0,kl+1); + memcpy(resp->evict->evictedKey,buf,kl); + buf += kl; + + resp->evict->evictedLen = bl - kl - el; + resp->evict->evictedData = (char*)malloc(resp->evict->evictedLen); + memcpy(resp->evict->evictedData,buf,resp->evict->evictedLen); + resp->evict->evicted = true; + } + } else if ( (resp->opcode == CMD_DELETE || resp->opcode == CMD_TOUCH) && + status == RESP_NOT_FOUND) { + resp->found = false; + } + this->stats.rx_bytes += targetLen; + return targetLen; +} + + +size_t ConnectionMultiApproxBatchShm::fill_read_buffer(int level, int *extra) { + + size_t read_bytes = 0; + + pthread_mutex_lock(lock_in[level]); + //int len = *bipbuf_in_bytes[level]; + int len = bipbuf_used(bipbuf_in[level]); + while (len == 0) { + pthread_cond_wait(cond_in_not_empty[level],lock_in[level]); + //len = 
*bipbuf_in_bytes[level]; + len = bipbuf_used(bipbuf_in[level]); + } + unsigned int all = 0; + + + if (buffer_read_n[level] != 0) { + uint32_t have = buffer_read_nbytes[level]; + fprintf(stderr,"already have %u\n",have); + //if ((size_t)len < buffer_read_n[level]) { + // pthread_mutex_unlock(lock_in[level]); + // return 0; + //} + unsigned char* input = bipbuf_peek_all(bipbuf_in[level],&all); + if (!input || all == 0) { + if (!input && all > 0) + fprintf(stderr,"cid %d expected %d on level %d (already have %u)\n",cid,all,level,have); + pthread_mutex_unlock(lock_in[level]); + return 0; + } + memcpy(buffer_read[level],buffer_leftover[level],have); + buffer_read_pos[level] = buffer_read[level]; + memcpy(buffer_read_pos[level]+have,input,all); + read_bytes = all; + *extra = have; + buffer_read_n[level] = 0; + buffer_read_nbytes[level] = 0; + + } else { + unsigned char *input = bipbuf_peek_all(bipbuf_in[level],&all); + if (!input || all == 0) { + if (!input && all > 0) + fprintf(stderr,"cid %d expected %d on level %d\n",cid,all,level); + pthread_mutex_unlock(lock_in[level]); + return 0; + } + read_bytes = all; + buffer_read_pos[level] = input; +#ifdef DEBUGMC + fprintf(stderr,"read %d of %d (avail: %d) on l%d\n",all,*bipbuf_in_bytes[level],len,level); +#endif + //memcpy(buffer_read_pos[level],input,len); + + *extra = 0; + } + if (read_bytes == 0) { + fprintf(stderr,"cid %d read 0 bytes\n",cid); + } + pthread_mutex_unlock(lock_in[level]); + return read_bytes; +} +/** + * Handle incoming data (responses). 
+ */ +void ConnectionMultiApproxBatchShm::read_callback1() { + + int level = 1; + int extra = 0; + size_t read_bytes = 0; + + read_bytes = fill_read_buffer(level,&extra); + if (read_bytes == 0) { + pthread_mutex_lock(lock_in[level]); + pthread_cond_signal(cond_in_not_full[level]); + pthread_mutex_unlock(lock_in[level]); + return; + } + + //fprintf(stderr,"cid %d l1 read: %lu\n",cid,read_bytes); + size_t consumed_bytes = 0; + size_t batch = options.depth; + //we have at least some data to read + size_t nread_ops = 0; + while (1) { + evicted_t *evict = (evicted_t*)malloc(sizeof(evicted_t)); + memset(evict,0,sizeof(evicted_t)); + resp_t mc_resp; + mc_resp.found = true; + mc_resp.evict = evict; + size_t cbytes = handle_response_batch(buffer_read_pos[level],&mc_resp,read_bytes,consumed_bytes,level,extra); + if (cbytes == 0) { + if (evict) { + if (evict->evictedKey) free(evict->evictedKey); + if (evict->evictedData) free(evict->evictedData); + free(evict); + } + break; + } + buffer_read_pos[level] = buffer_read_pos[level] + cbytes; + consumed_bytes += cbytes; + uint32_t opaque = mc_resp.opaque; + bool found = mc_resp.found; + + Operation *op = op_queue[level][opaque]; +#ifdef DEBUGMC + char out[128]; + sprintf(out,"l1 cid %u, reading opaque: %u\n",cid,opaque); + write(2,out,strlen(out)); + output_op(op,2,found); +#endif + + double now = get_time(); + int wb = 0; + if (options.rand_admit) { + wb = (rand() % options.rand_admit); + } + int vl = op->valuelen; + int flags = OP_clu(op); + switch (op->type) { + case Operation::GET: + if ( !found && (options.getset || options.getsetorset) ) { + /* issue a get a l2 */ + issue_get_with_len(op->key,vl,now,false, flags | SRC_L1_M | ITEM_L2 | LOG_OP, op); + op->end_time = now; + this->stats.log_get_l1(*op); + + } else { + if (OP_incl(op) && ghits >= gloc) { + issue_touch(op->key,vl,now, ITEM_L2 | SRC_L1_H); + gloc += rand()%(10*2-1)+1; + } + ghits++; + finish_op(op,1); + } + break; + case Operation::SET: + //if (OP_src(op) == 
SRC_L1_COPY || + // OP_src(op) == SRC_L2_M) { + // del_copy_keys(string(op->key)); + //} + if (evict->evicted) { + string wb_key(evict->evictedKey); + if ((evict->evictedFlags & ITEM_INCL) && (evict->evictedFlags & ITEM_DIRTY)) { + //int ret = add_to_wb_keys(wb_key); + //if (ret == 1) { + issue_set(evict->evictedKey, evict->evictedData, evict->evictedLen, now, ITEM_L2 | ITEM_INCL | LOG_OP | SRC_WB | ITEM_DIRTY); + //} + this->stats.incl_wbs++; + } else if (evict->evictedFlags & ITEM_EXCL) { + //fprintf(stderr,"excl writeback %s\n",evict->evictedKey); + //strncpy(wb_key,evict->evictedKey,255); + if ( (options.rand_admit && wb == 0) || + (options.threshold && (g_key_hist[wb_key] == 1)) || + (options.wb_all) ) { + //int ret = add_to_wb_keys(wb_key); + //if (ret == 1) { + issue_set(evict->evictedKey, evict->evictedData, evict->evictedLen, now, ITEM_L2 | ITEM_EXCL | LOG_OP | SRC_WB); + //} + this->stats.excl_wbs++; + } + } + if (OP_src(op) == SRC_DIRECT_SET) { + if ( (evict->serverFlags & ITEM_SIZE_CHANGE) || ((evict->serverFlags & ITEM_WAS_HIT) == 0)) { + this->stats.set_misses_l1++; + } else if (OP_excl(op) && evict->serverFlags & ITEM_WAS_HIT) { + this->stats.set_excl_hits_l1++; + } else if (OP_incl(op) && evict->serverFlags & ITEM_WAS_HIT) { + this->stats.set_incl_hits_l1++; + } + } + } + finish_op(op,1); + break; + case Operation::TOUCH: + finish_op(op,1); + break; + default: + fprintf(stderr,"op: %p, key: %s opaque: %u\n",(void*)op,op->key,op->opaque); + DIE("not implemented"); + } + + if (evict) { + if (evict->evictedKey) free(evict->evictedKey); + if (evict->evictedData) free(evict->evictedData); + free(evict); + } + nread_ops++; + if (buffer_read_pos[level][0] != 129 || (read_bytes - consumed_bytes == 0)) { + break; + } + //if (buffer_read_pos[level][0] != 129) { + // //fprintf(stderr,"cid %d we don't have a valid header post %u\n",cid,buffer_read_pos[level][0]); + // break; + //} + } + pthread_mutex_lock(lock_in[level]); + 
bipbuf_poll(bipbuf_in[level],read_bytes); + *bipbuf_in_bytes[level] = *bipbuf_in_bytes[level] - read_bytes; + pthread_cond_signal(cond_in_not_full[level]); + pthread_mutex_unlock(lock_in[level]); + double now = get_time(); + last_tx = now; + stats.log_op(op_queue_size[1]); + stats.log_op(op_queue_size[2]); + + +} + +/** + * Handle incoming data (responses). + */ +void ConnectionMultiApproxBatchShm::read_callback2() { + + int level = 2; + int extra = 0; + + size_t read_bytes = 0; + + read_bytes = fill_read_buffer(level,&extra); + if (read_bytes == 0) { + pthread_mutex_lock(lock_in[level]); + pthread_cond_signal(cond_in_not_full[level]); + pthread_mutex_unlock(lock_in[level]); + return; + } + + //fprintf(stderr,"l2 read: %lu\n",read_bytes); + size_t consumed_bytes = 0; + size_t batch = options.depth; + size_t nread_ops = 0; + while (1) { + evicted_t *evict = NULL; + resp_t mc_resp; + mc_resp.found = true; + mc_resp.evict = evict; + size_t cbytes = handle_response_batch(buffer_read_pos[level],&mc_resp,read_bytes,consumed_bytes,level,extra); + if (cbytes == 0) { + break; + } + buffer_read_pos[level] = buffer_read_pos[level] + cbytes; + consumed_bytes += cbytes; + uint32_t opaque = mc_resp.opaque; + bool found = mc_resp.found; + + Operation *op = op_queue[level][opaque]; +#ifdef DEBUGMC + char out[128]; + sprintf(out,"l2 cid %u, reading opaque: %u\n",cid,opaque); + write(2,out,strlen(out)); + output_op(op,2,found); +#endif + double now = get_time(); + switch (op->type) { + case Operation::GET: + if ( !found && (options.getset || options.getsetorset) ) { + int valuelen = op->valuelen; + int index = lrand48() % (1024 * 1024); + int flags = OP_clu(op) | SRC_L2_M | LOG_OP; + //int ret = add_to_copy_keys(string(op->key)); + //if (ret == 1) { + issue_set(op->key, &random_char[index], valuelen, now, flags | ITEM_L1); + if (OP_incl(op)) { + issue_set(op->key, &random_char[index], valuelen, now, flags | ITEM_L2); + } + //} + finish_op(op,0); // sets read_state = IDLE + } else { 
+          if (found) {
+            int valuelen = op->valuelen;
+            int index = lrand48() % (1024 * 1024);
+            int flags = OP_clu(op) | ITEM_L1 | SRC_L1_COPY;
+            string key = string(op->key);
+            const char *data = &random_char[index];
+            //int ret = add_to_copy_keys(string(op->key));
+            //if (ret == 1) {
+            issue_set(op->key,data,valuelen, now, flags);
+            //}
+            this->stats.copies_to_l1++;
+            finish_op(op,1);
+
+          } else {
+            finish_op(op,0);
+          }
+        }
+        break;
+      case Operation::SET:
+        //if (OP_src(op) == SRC_WB) {
+        //  del_wb_keys(string(op->key));
+        //}
+        finish_op(op,1);
+        break;
+      case Operation::TOUCH:
+        if (OP_src(op) == SRC_DIRECT_SET || OP_src(op) == SRC_L1_H) {
+          int valuelen = op->valuelen;
+          if (!found) {
+            int index = lrand48() % (1024 * 1024);
+            issue_set(op->key, &random_char[index],valuelen,now, ITEM_INCL | ITEM_L2 | LOG_OP | SRC_L2_M);
+            this->stats.set_misses_l2++;
+          } else {
+            if (OP_src(op) == SRC_DIRECT_SET) {
+              issue_touch(op->key,valuelen,now, ITEM_L1 | SRC_L2_H | ITEM_DIRTY);
+            }
+          }
+          //del_touch_keys(string(op->key));
+        }
+        finish_op(op,0);
+        break;
+      case Operation::DELETE:
+        //check to see if it was a hit
+        //fprintf(stderr," del %s -- %d from %d\n",op->key.c_str(),found,OP_src(op));
+        if (OP_src(op) == SRC_DIRECT_SET) {
+          if (found) {
+            this->stats.delete_hits_l2++;
+          } else {
+            this->stats.delete_misses_l2++;
+          }
+        }
+        finish_op(op,1);
+        break;
+      default:
+        fprintf(stderr,"op: %p, key: %s opaque: %u\n",(void*)op,op->key,op->opaque);
+        DIE("not implemented");
+    }
+    nread_ops++;
+    if (buffer_read_pos[level][0] != 129 || (read_bytes - consumed_bytes == 0)) {
+      break;
+    }
+  }
+  //if (buffer_read_n[level] == 0) {
+  //  memset(buffer_read[level],0,read_bytes);
+  //}
+  //if (nread_ops == 0) {
+  //  fprintf(stderr,"ugh l2 only got: %lu ops expected %lu\n",nread_ops,batch);
+  //}
+
+  pthread_mutex_lock(lock_in[level]);
+  bipbuf_poll(bipbuf_in[level],read_bytes);
+  *bipbuf_in_bytes[level] = *bipbuf_in_bytes[level] - read_bytes;
+  pthread_cond_signal(cond_in_not_full[level]);
+  
pthread_mutex_unlock(lock_in[level]); + + + double now = get_time(); + last_tx = now; + stats.log_op(op_queue_size[2]); + stats.log_op(op_queue_size[1]); + +} + diff --git a/ConnectionMultiApproxShm.cc b/ConnectionMultiApproxShm.cc new file mode 100644 index 0000000..e3c006d --- /dev/null +++ b/ConnectionMultiApproxShm.cc @@ -0,0 +1,1772 @@ +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "config.h" + +#include "Connection.h" +#include "distributions.h" +#include "Generator.h" +#include "mutilate.h" +#include "binary_protocol.h" +#include "util.h" +#include +#include +#include +#include +#include +#include "blockingconcurrentqueue.h" +#include "bipbuffer.h" +//#include +//#include + + + +#define ITEM_L1 1 +#define ITEM_L2 2 +#define LOG_OP 4 +#define SRC_L1_M 8 +#define SRC_L1_H 16 +#define SRC_L2_M 32 +#define SRC_L2_H 64 +#define SRC_DIRECT_SET 128 +#define SRC_L1_COPY 256 +#define SRC_WB 512 + +#define ITEM_INCL 4096 +#define ITEM_EXCL 8192 +#define ITEM_DIRTY 16384 +#define ITEM_SIZE_CHANGE 131072 +#define ITEM_WAS_HIT 262144 + +#define LEVELS 2 +#define SET_INCL(incl,flags) \ + switch (incl) { \ + case 1: \ + flags |= ITEM_INCL; \ + break; \ + case 2: \ + flags |= ITEM_EXCL; \ + break; \ + \ + } \ + +#define GET_INCL(incl,flags) \ + if (flags & ITEM_INCL) incl = 1; \ + else if (flags & ITEM_EXCL) incl = 2; \ + +//#define OP_level(op) ( ((op)->flags & ITEM_L1) ? 
ITEM_L1 : ITEM_L2 ) +#define OP_level(op) ( (op)->flags & ~(LOG_OP | \ + ITEM_INCL | ITEM_EXCL | ITEM_DIRTY | \ + SRC_L1_M | SRC_L1_H | SRC_L2_M | SRC_L2_H | \ + SRC_DIRECT_SET | SRC_L1_COPY | SRC_WB ) ) + +#define FLAGS_level(flags) ( flags & ~(LOG_OP | \ + ITEM_INCL | ITEM_EXCL | ITEM_DIRTY | \ + SRC_L1_M | SRC_L1_H | SRC_L2_M | SRC_L2_H | \ + SRC_DIRECT_SET | SRC_L1_COPY | SRC_WB ) ) + +#define OP_clu(op) ( (op)->flags & ~(LOG_OP | \ + ITEM_L1 | ITEM_L2 | ITEM_DIRTY | \ + SRC_L1_M | SRC_L1_H | SRC_L2_M | SRC_L2_H | \ + SRC_DIRECT_SET | SRC_L1_COPY | SRC_WB ) ) + +#define OP_src(op) ( (op)->flags & ~(ITEM_L1 | ITEM_L2 | LOG_OP | \ + ITEM_INCL | ITEM_EXCL | ITEM_DIRTY ) ) + +#define OP_log(op) ((op)->flags & LOG_OP) +#define OP_incl(op) ((op)->flags & ITEM_INCL) +#define OP_excl(op) ((op)->flags & ITEM_EXCL) +#define OP_set_flag(op,flag) ((op))->flags |= flag; + +//#define DEBUGMC +//#define DEBUGS +//using namespace folly; +using namespace moodycamel; +//using namespace fmt; + +//struct node { +// long long addr,label; +// node *nxt; +// node(long long _addr = 0, long long _label = 0, node *_nxt = NULL) +// : addr(_addr),label(_label),nxt(_nxt) {} +//}; +// +//struct tnode { +// long long tm,offset; int size; +//};//trace file data structure +// +//long long find(long long addr) { +// int t = addr%MAXH; +// node *tmp = hash[t],*pre = NULL; +// while (tmp) { +// if (tmp->addr == addr) { +// long long tlabel = tmp->label; +// if (pre == NULL) hash[t] = tmp->nxt; +// else pre->nxt = tmp->nxt; +// delete tmp; +// return tlabel; +// } +// pre = tmp; +// tmp = tmp->nxt; +// } +// return 0; +//} +// +//void insert(long long addr ) { +// int t = addr%MAXH; +// node *tmp = new node(addr,n,hash[t]); +// hash[t] = tmp; +//} + + + +pthread_mutex_t cid_lock_m_approx_shm = PTHREAD_MUTEX_INITIALIZER; +static uint32_t connids_m = 1; + +#define NCLASSES 40 +#define CHUNK_ALIGN_BYTES 8 +static int classes = 0; +static int sizes[NCLASSES+1]; +static int inclusives[NCLASSES+1]; + + 
+
+static void init_inclusives(char *inclusive_str) {
+  int j = 1;
+  for (int i = 0; i < (int)strlen(inclusive_str); i++) {
+    if (inclusive_str[i] == '-') {
+      continue;
+    } else {
+      inclusives[j] = inclusive_str[i] - '0';
+      j++;
+    }
+  }
+}
+
+static void init_classes() {
+
+  double factor = 1.25;
+  //unsigned int chunk_size = 48;
+  //unsigned int item_size = 24;
+  unsigned int size = 96; //warning if you change this you die
+  unsigned int i = 0;
+  unsigned int chunk_size_max = 1048576/2;
+  while (++i < NCLASSES-1) {
+    if (size >= chunk_size_max / factor) {
+      break;
+    }
+    if (size % CHUNK_ALIGN_BYTES)
+      size += CHUNK_ALIGN_BYTES - (size % CHUNK_ALIGN_BYTES);
+    sizes[i] = size;
+    size *= factor;
+  }
+  sizes[i] = chunk_size_max;
+  classes = i;
+
+}
+
+static int get_class(int vl, uint32_t kl) {
+  //warning if you change this you die
+  int vsize = vl+kl+48+1+2;
+  int res = 1;
+  while (vsize > sizes[res])
+    if (res++ == classes) {
+      //fprintf(stderr,"item larger than max class size. vsize: %d, class size: %d\n",vsize,sizes[res]);
+      return -1;
+    }
+  return res;
+}
+
+static int get_incl(int vl, int kl) {
+  int clsid = get_class(vl,kl);
+  if (clsid > 0) {
+    return inclusives[clsid];
+  } else {
+    return -1;
+  }
+}
+
+void ConnectionMultiApproxShm::output_op(Operation *op, int type, bool found) {
+  char output[1024];
+  char k[256];
+  char a[256];
+  char s[256];
+  memset(k,0,256);
+  memset(a,0,256);
+  memset(s,0,256);
+  strncpy(k,op->key,255);
+  switch (type) {
+    case 0: //get
+      sprintf(a,"issue_get");
+      break;
+    case 1: //set
+      sprintf(a,"issue_set");
+      break;
+    case 2: //resp
+      sprintf(a,"resp");
+      break;
+  }
+  switch(read_state) {
+    case INIT_READ:
+      sprintf(s,"init");
+      break;
+    case CONN_SETUP:
+      sprintf(s,"setup");
+      break;
+    case LOADING:
+      sprintf(s,"load");
+      break;
+    case IDLE:
+      sprintf(s,"idle");
+      break;
+    case WAITING_FOR_GET:
+      sprintf(s,"waiting for get");
+      break;
+    case WAITING_FOR_SET:
+      sprintf(s,"waiting for set");
+      break;
+    case WAITING_FOR_DELETE:
+      
sprintf(s,"waiting for del"); + break; + case MAX_READ_STATE: + sprintf(s,"max"); + break; + } + if (type == 2) { + sprintf(output,"conn: %u, action: %s op: %s, opaque: %u, found: %d, type: %d\n",cid,a,k,op->opaque,found,op->type); + } else { + sprintf(output,"conn: %u, action: %s op: %s, opaque: %u, type: %d\n",cid,a,k,op->opaque,op->type); + } + write(2,output,strlen(output)); +} + +//extern USPMCQueue g_trace_queue; +//static vector cid_rate; +//extern ConcurrentHashMap cid_rate; +extern unordered_map cid_rate; +//extern ConcurrentHashMap> copy_keys; +extern unordered_map> copy_keys; +extern unordered_map touch_keys; +extern unordered_map> wb_keys; +//extern ConcurrentHashMap> wb_keys; + +extern map g_key_hist; +extern int max_n[3]; + +/** + * Create a new connection to a server endpoint. + */ +ConnectionMultiApproxShm::ConnectionMultiApproxShm(options_t _options, + bool sampling) : + start_time(0), stats(sampling), options(_options) +{ + pthread_mutex_lock(&cid_lock_m_approx_shm); + cid = connids_m++; + if (cid == 1) { + init_classes(); + init_inclusives(options.inclusives); + } + cid_rate.insert( { cid, 0 } ); + + pthread_mutex_unlock(&cid_lock_m_approx_shm); + + valuesize = createGenerator(options.valuesize); + keysize = createGenerator(options.keysize); + srand(time(NULL)); + keygen = new KeyGenerator(keysize, options.records); + + total = 0; + eof = 0; + o_percent = 0; + + if (options.lambda <= 0) { + iagen = createGenerator("0"); + } else { + D("iagen = createGenerator(%s)", options.ia); + iagen = createGenerator(options.ia); + iagen->set_lambda(options.lambda); + } + + read_state = IDLE; + write_state = INIT_WRITE; + last_quiet1 = false; + last_quiet2 = false; + + last_tx = last_rx = 0.0; + gets = 0; + ghits = 0; + esets = 0; + isets = 0; + gloc = rand() % (10*2-1)+1; + sloc = rand() % (10*2-1)+1; + iloc = rand() % (10*2-1)+1; + + op_queue_size = (uint32_t*)malloc(sizeof(uint32_t)*(LEVELS+1)); + opaque = (uint32_t*)malloc(sizeof(uint32_t)*(LEVELS+1)); + 
op_queue = (Operation***)malloc(sizeof(Operation**)*(LEVELS+1)); + + for (int i = 0; i <= LEVELS; i++) { + op_queue_size[i] = 0; + opaque[i] = 1; + //op_queue[i] = (Operation*)malloc(sizeof(int)*OPAQUE_MAX); + op_queue[i] = (Operation**)malloc(sizeof(Operation*)*(OPAQUE_MAX+1)); + for (int j = 0; j <= OPAQUE_MAX; j++) { + op_queue[i][j] = NULL; + } + + } + + read_state = IDLE; +} + + +void ConnectionMultiApproxShm::set_queue(queue* a_trace_queue) { + trace_queue = a_trace_queue; + trace_queue_n = a_trace_queue->size(); +} + +void ConnectionMultiApproxShm::set_lock(pthread_mutex_t* a_lock) { + lock = a_lock; +} + +void ConnectionMultiApproxShm::set_g_wbkeys(unordered_map> *a_wb_keys) { + g_wb_keys = a_wb_keys; +} + +uint32_t ConnectionMultiApproxShm::get_cid() { + return cid; +} + +int ConnectionMultiApproxShm::add_to_wb_keys(string key) { + auto pos = wb_keys.find(key); + if (pos == wb_keys.end()) { + wb_keys.insert( {key, vector() }); + return 1; + } + return 2; +} + +int ConnectionMultiApproxShm::add_to_copy_keys(string key) { + auto pos = copy_keys.find(key); + if (pos == copy_keys.end()) { + copy_keys.insert( {key, vector() }); + return 1; + } + return 2; +} + + +void ConnectionMultiApproxShm::del_copy_keys(string key) { + + auto position = copy_keys.find(key); + if (position != copy_keys.end()) { + vector op_list = vector(position->second); + copy_keys.erase(position); + for (auto it = op_list.begin(); it != op_list.end(); ++it) { + issue_op(*it); + } + } else { + fprintf(stderr,"expected %s, got nuthin\n",key.c_str()); + } +} + +int ConnectionMultiApproxShm::add_to_touch_keys(string key) { + //return touch_keys.assign_if_equal( key, NULL, cid ) != NULL ? 
1 : 2; + auto pos = touch_keys.find(key); + if (pos == touch_keys.end()) { + touch_keys.insert( {key, cid }); + return 1; + } + return 2; +} + + +void ConnectionMultiApproxShm::del_touch_keys(string key) { + //touch_keys.erase(key); + auto position = touch_keys.find(key); + if (position != touch_keys.end()) { + touch_keys.erase(position); + } else { + fprintf(stderr,"expected %s, got nuthin\n",key.c_str()); + } +} + +int ConnectionMultiApproxShm::issue_op(Operation *Op) { + double now = get_time(); + int issued = 0; + int incl = get_incl(Op->valuelen,strlen(Op->key)); + int cid = get_class(Op->valuelen,strlen(Op->key)); + Op->clsid = cid; + int flags = 0; + int index = lrand48() % (1024 * 1024); + //int touch = 1; + SET_INCL(incl,flags); + + switch(Op->type) + { + case Operation::GET: + //if (nissued < options.depth-1) { + // issued = issue_get_with_len(key, vl, now, false, 1, flags, 0, 1); + // last_quiet1 = false; + //} else { + //} + issued = issue_get_with_len(Op, now, false, flags | LOG_OP | ITEM_L1); + last_quiet1 = false; + this->stats.gets++; + gets++; + this->stats.gets_cid[cid]++; + + break; + case Operation::SET: + if (last_quiet1) { + //issue_noop(1); + } + if (incl == 1) { + if (isets >= iloc) { + //if (1) { + const char *data = &random_char[index]; + issued = issue_set(Op, data, now, flags | LOG_OP | ITEM_L1 | SRC_DIRECT_SET); + //int ret = add_to_touch_keys(string(Op->key)); + //if (ret == 1) { + issue_touch(Op->key,Op->valuelen,now, ITEM_L2 | SRC_DIRECT_SET); + //} + iloc += rand()%(10*2-1)+1; + } else { + issued = issue_set(Op, &random_char[index], now, flags | LOG_OP | ITEM_L1 | SRC_DIRECT_SET | ITEM_DIRTY); + } + isets++; + } else if (incl == 2) { + issued = issue_set(Op, &random_char[index], now, flags | LOG_OP | ITEM_L1 | SRC_DIRECT_SET); + if (esets >= sloc) { + issue_delete(Op->key,now,ITEM_L2 | SRC_DIRECT_SET); + sloc += rand()%(10*2-1)+1; + } + esets++; + } + last_quiet1 = false; + this->stats.sets++; + this->stats.sets_cid[cid]++; + break; 
+ case Operation::DELETE: + case Operation::TOUCH: + case Operation::NOOP: + case Operation::SASL: + fprintf(stderr,"invalid line: %s, vl: %d\n",Op->key,Op->valuelen); + break; + + } + return issued; +} + +void ConnectionMultiApproxShm::del_wb_keys(string key) { + + auto position = wb_keys.find(key); + if (position != wb_keys.end()) { + vector op_list = vector(position->second); + wb_keys.erase(position); + for (auto it = op_list.begin(); it != op_list.end(); ++it) { + issue_op(*it); + } + } else { + fprintf(stderr,"expected %s, got nuthin\n",key.c_str()); + } +} + + +int ConnectionMultiApproxShm::do_connect() { + + + //the client should see for this cid, where the shared memory is + typedef struct shared_ { + bipbuf_t bipbuf_in; + bipbuf_t bipbuf_out; + pthread_mutex_t lock_in; + pthread_mutex_t lock_out; + int shared_id; + } shared_t; + + //this cid gets shared memory + // ftok to generate unique key + //char shmkey[64]; + //sprintf(shmkey,"shmfilel1%d",cid); + int id = cid+100; + //key_t key = ftok(shmkey,id); + + // shmget returns an identifier in shmid + int shmid = shmget(id,sizeof(shared_t),0666); + + // shmat to attach to shared memory + shared_t* share_l1 = (shared_t*) shmat(shmid,(void*)0,0); + + fprintf(stderr,"cid %d gets shared memory buf l1 %d\n",cid,share_l1->shared_id); + + // ftok to generate unique key + //char shmkey2[64]; + //sprintf(shmkey2,"shmfilel2%d",cid); + int id2 = cid+200; + //key_t key2 = ftok(shmkey2,id2); + + // shmget returns an identifier in shmid + int shmid2 = shmget(id2,sizeof(shared_t),0666); + + // shmat to attach to shared memory + shared_t* share_l2 = (shared_t*) shmat(shmid2,(void*)0,0); + + fprintf(stderr,"cid %d gets shared memory buf l2 %d\n",cid,share_l2->shared_id); + + //the leads are reveresed (from perspective of server) + bipbuf_in[1] = &share_l1->bipbuf_out; + bipbuf_in[2] = &share_l2->bipbuf_out; + bipbuf_out[1] = &share_l1->bipbuf_in; + bipbuf_out[2] = &share_l2->bipbuf_in; + + lock_in[1] = &share_l1->lock_out; 
+ lock_in[2] = &share_l2->lock_out; + lock_out[1] = &share_l1->lock_in; + lock_out[2] = &share_l2->lock_in; + read_state = IDLE; + return 1; +} + +/** + * Destroy a connection, performing cleanup. + */ +ConnectionMultiApproxShm::~ConnectionMultiApproxShm() { + + + for (int i = 0; i <= LEVELS; i++) { + free(op_queue[i]); + + } + + free(op_queue_size); + free(opaque); + free(op_queue); + //event_free(timer); + //timer = NULL; + // FIXME: W("Drain op_q?"); + //bufferevent_free(bev1); + //bufferevent_free(bev2); + + delete iagen; + delete keygen; + delete keysize; + delete valuesize; +} + +/** + * Reset the connection back to an initial, fresh state. + */ +void ConnectionMultiApproxShm::reset() { + // FIXME: Actually check the connection, drain all bufferevents, drain op_q. + //assert(op_queue.size() == 0); + //evtimer_del(timer); + read_state = IDLE; + write_state = INIT_WRITE; + stats = ConnectionStats(stats.sampling); +} + + + + +/** + * Issue a get request to the server. + */ +int ConnectionMultiApproxShm::offer_get(Operation *pop, int extra) { + + uint16_t keylen = strlen(pop->key); + int level = FLAGS_level(pop->flags); + + + // each line is 4-bytes + binary_header_t h = { 0x80, CMD_GET, htons(keylen), + 0x00, 0x00, htons(0), + htonl(keylen) }; + //if (quiet) { + // h.opcode = CMD_GETQ; + //} + h.opaque = htonl(pop->opaque); + + int res = 0; + pthread_mutex_lock(lock_out[level]); + int gtg = bipbuf_unused(bipbuf_out[level]) > (int)(24+keylen) ? 
1 : 0; + if (gtg) { + res = bipbuf_offer(bipbuf_out[level],(const unsigned char*)&h,24); + if (res != 24) { + fprintf(stderr,"failed offer 24 get level %d\n",level); + pthread_mutex_unlock(lock_out[level]); + return 0; + } + res = bipbuf_offer(bipbuf_out[level],(const unsigned char*)pop->key,keylen); + if (res != keylen) { + fprintf(stderr,"failed offer %d get level %d\n",keylen,level); + pthread_mutex_unlock(lock_out[level]); + return 0; + } + if (extra == 1) { + extra_queue.pop(); + } + } else { + if (extra == 0) { + extra_queue.push(pop); + } + } + pthread_mutex_unlock(lock_out[level]); + return 1; + +} + +/** + * Issue a get request to the server. + */ +int ConnectionMultiApproxShm::issue_get_with_len(Operation *pop, double now, bool quiet, uint32_t flags, Operation *l1) { + + int level = FLAGS_level(flags); + +#if HAVE_CLOCK_GETTIME + pop->start_time = get_time_accurate(); +#else + if (now == 0.0) { +#if USE_CACHED_TIME + struct timeval now_tv; + event_base_gettimeofday_cached(base, &now_tv); + pop->start_time = tv_to_double(&now_tv); +#else + pop->start_time = get_time(); +#endif + } else { + pop->start_time = now; + } +#endif + + pop->opaque = opaque[level]++; + pop->flags = flags; + if (l1 != NULL) { + pop->l1 = l1; + } + + op_queue[level][pop->opaque] = pop; + op_queue_size[level]++; +#ifdef DEBUGS + fprintf(stderr,"cid: %d issing get: %s, size: %u, level %d, flags: %d, opaque: %d\n",cid,pop->key,pop->valuelen,level,flags,pop->opaque); +#endif + + if (opaque[level] > OPAQUE_MAX) { + opaque[level] = 1; + } + + + offer_get(pop,0); + stats.tx_bytes += 24 + strlen(pop->key); + return 1; +} + +/** + * Issue a get request to the server. 
+ */ +int ConnectionMultiApproxShm::issue_get_with_len(const char* key, int valuelen, double now, bool quiet, uint32_t flags, Operation *l1) { + + int level = FLAGS_level(flags); + Operation *pop = new Operation(); + +#if HAVE_CLOCK_GETTIME + pop->start_time = get_time_accurate(); +#else + if (now == 0.0) { +#if USE_CACHED_TIME + struct timeval now_tv; + event_base_gettimeofday_cached(base, &now_tv); + pop->start_time = tv_to_double(&now_tv); +#else + pop->start_time = get_time(); +#endif + } else { + pop->start_time = now; + } +#endif + + strncpy(pop->key,key,255); + pop->valuelen = valuelen; + pop->type = Operation::GET; + pop->opaque = opaque[level]++; + pop->flags = flags; + pop->clsid = get_class(valuelen,strlen(key)); + if (l1 != NULL) { + pop->l1 = l1; + } + op_queue[level][pop->opaque] = pop; + op_queue_size[level]++; + +#ifdef DEBUGS + fprintf(stderr,"cid: %d issing get: %s, size: %u, level %d, flags: %d, opaque: %d\n",cid,key,valuelen,level,flags,pop->opaque); +#endif + + if (opaque[level] > OPAQUE_MAX) { + opaque[level] = 1; + } + + + offer_get(pop,0); + stats.tx_bytes += 24 + strlen(pop->key);; + return 1; +} + +/** + * Issue a get request to the server. 
+ */
+int ConnectionMultiApproxShm::issue_touch(const char* key, int valuelen, double now, int flags) {
+  int level = FLAGS_level(flags);
+  Operation *pop = new Operation();
+
+#if HAVE_CLOCK_GETTIME
+  pop->start_time = get_time_accurate();
+#else
+  if (now == 0.0) {
+#if USE_CACHED_TIME
+    struct timeval now_tv;
+    event_base_gettimeofday_cached(base, &now_tv);
+    pop->start_time = tv_to_double(&now_tv);
+#else
+    pop->start_time = get_time();
+#endif
+  } else {
+    pop->start_time = now;
+  }
+#endif
+
+  strncpy(pop->key,key,255);
+  pop->valuelen = valuelen;
+  pop->type = Operation::TOUCH;
+  pop->opaque = opaque[level]++;
+  op_queue[level][pop->opaque] = pop;
+  op_queue_size[level]++;
+
+  pop->flags = flags;
+
+  if (opaque[level] > OPAQUE_MAX) {
+    opaque[level] = 1;
+  }
+
+#ifdef DEBUGS
+  fprintf(stderr,"issing touch: %s, size: %u, level %d, flags: %d, opaque: %d\n",key,valuelen,level,flags,pop->opaque);
+#endif
+  //if (read_state == IDLE) read_state = WAITING_FOR_GET;
+  uint16_t keylen = strlen(key);
+
+  // each line is 4-bytes
+  binary_header_t h = { 0x80, CMD_TOUCH, htons(keylen),
+                        0x04, 0x00, htons(0),
+                        htonl(keylen + 4) };
+  h.opaque = htonl(pop->opaque);
+
+  uint32_t exp = 0;
+  if (flags & ITEM_DIRTY) {
+    exp = htonl(flags);
+  }
+
+  int res = 0;
+  pthread_mutex_lock(lock_out[level]);
+  res = bipbuf_offer(bipbuf_out[level],(const unsigned char*)&h,24);
+  if (res != 24) {
+    fprintf(stderr,"failed offer 24 touch level %d\n",level);
+    pthread_mutex_unlock(lock_out[level]);
+    return 0;
+  }
+  res = bipbuf_offer(bipbuf_out[level],(const unsigned char*)&exp,4);
+  if (res != 4) {
+    fprintf(stderr,"failed offer 4 touch level %d\n",level);
+    pthread_mutex_unlock(lock_out[level]);
+    return 0;
+  }
+  res = bipbuf_offer(bipbuf_out[level],(const unsigned char*)pop->key,keylen);
+  if (res != keylen) {
+    fprintf(stderr,"failed offer %d touch level %d\n",keylen,level);
+    pthread_mutex_unlock(lock_out[level]);
+    return 0;
+  }
+  pthread_mutex_unlock(lock_out[level]);
+
+  
stats.tx_bytes += 24 + keylen; + + //stats.log_access(op); + return 1; +} + +/** + * Issue a delete request to the server. + */ +int ConnectionMultiApproxShm::issue_delete(const char* key, double now, uint32_t flags) { + int level = FLAGS_level(flags); + Operation *pop = new Operation(); + +#if HAVE_CLOCK_GETTIME + pop->start_time = get_time_accurate(); +#else + if (now == 0.0) { +#if USE_CACHED_TIME + struct timeval now_tv; + event_base_gettimeofday_cached(base, &now_tv); + pop->start_time = tv_to_double(&now_tv); +#else + pop->start_time = get_time(); +#endif + } else { + pop->start_time = now; + } +#endif + + strncpy(pop->key,key,255); + pop->type = Operation::DELETE; + pop->opaque = opaque[level]++; + pop->flags = flags; + op_queue[level][pop->opaque] = pop; + //op_queue[level].push(op); + op_queue_size[level]++; + + if (opaque[level] > OPAQUE_MAX) { + opaque[level] = 1; + } +#ifdef DEBUGS + fprintf(stderr,"cid: %d issing delete: %s, level %d, flags: %d, opaque: %d\n",cid,key,level,flags,pop->opaque); +#endif + + //if (read_state == IDLE) read_state = WAITING_FOR_GET; + uint16_t keylen = strlen(key); + + // each line is 4-bytes + binary_header_t h = { 0x80, CMD_DELETE, htons(keylen), + 0x00, 0x00, htons(0), + htonl(keylen) }; + h.opaque = htonl(pop->opaque); + + pthread_mutex_lock(lock_out[level]); + bipbuf_offer(bipbuf_out[level],(const unsigned char*)&h,24); + bipbuf_offer(bipbuf_out[level],(const unsigned char*)key,keylen); + pthread_mutex_unlock(lock_out[level]); + + + stats.tx_bytes += 24 + keylen; + + //stats.log_access(op); + return 1; +} + +void ConnectionMultiApproxShm::issue_noop(int level) { + Operation op; + + + binary_header_t h = { 0x80, CMD_NOOP, 0x0000, + 0x00, 0x00, htons(0), + 0x00 }; + + + //bipbuf_offer(bipbuf[level],&h,24); +} + +/** + * Issue a set request to the server. 
+ */ +int ConnectionMultiApproxShm::issue_set(Operation *pop, const char* value, double now, uint32_t flags) { + + int level = FLAGS_level(flags); + +#if HAVE_CLOCK_GETTIME + pop->start_time = get_time_accurate(); +#else + if (now == 0.0) pop->start_time = get_time(); + else pop->start_time = now; +#endif + + pop->opaque = opaque[level]++; + pop->flags = flags; + op_queue[level][pop->opaque] = pop; + //op_queue[level].push(op); + op_queue_size[level]++; +#ifdef DEBUGS + fprintf(stderr,"cid: %d issing set: %s, size: %u, level %d, flags: %d, opaque: %d\n",cid,pop->key,pop->valuelen,level,flags,pop->opaque); +#endif + + if (opaque[level] > OPAQUE_MAX) { + opaque[level] = 1; + } + + offer_set(pop); + + + stats.tx_bytes += pop->valuelen + 32 + strlen(pop->key); + return 1; +} + +/** + * Issue a set request to the server. + */ +int ConnectionMultiApproxShm::offer_set(Operation *pop, int extra) { + + uint16_t keylen = strlen(pop->key); + uint32_t length = pop->valuelen; + int level = FLAGS_level(pop->flags); + + // each line is 4-bytes + binary_header_t h = { 0x80, CMD_SET, htons(keylen), + 0x08, 0x00, htons(0), + htonl(keylen + 8 + length) }; + h.opaque = htonl(pop->opaque); + + uint32_t f = htonl(pop->flags); + uint32_t exp = 0; + int ret = 0; + int res = 0; + pthread_mutex_lock(lock_out[level]); + int gtg = bipbuf_unused(bipbuf_out[level]) > (int)(32+pop->valuelen) ? 
1 : 0; + if (gtg) { + res = bipbuf_offer(bipbuf_out[level],(const unsigned char*)&h,24); + if (res != 24) { + fprintf(stderr,"failed offer 24 set level %d\n",level); + pthread_mutex_unlock(lock_out[level]); + return 0; + } + res = bipbuf_offer(bipbuf_out[level],(const unsigned char*)&f,4); + if (res != 4) { + fprintf(stderr,"failed offer 4 set level %d\n",level); + pthread_mutex_unlock(lock_out[level]); + return 0; + } + res = bipbuf_offer(bipbuf_out[level],(const unsigned char*)&exp,4); + if (res != 4) { + fprintf(stderr,"failed offer 4 set level %d\n",level); + pthread_mutex_unlock(lock_out[level]); + return 0; + } + res = bipbuf_offer(bipbuf_out[level],(const unsigned char*)pop->key,keylen); + if (res != keylen) { + fprintf(stderr,"failed offer %d set level %d\n",keylen,level); + pthread_mutex_unlock(lock_out[level]); + return 0; + } + int i = 0; + int index = lrand48() % (1024 * 1024); + const char *value = &random_char[index]; + while ((res = bipbuf_offer(bipbuf_out[level],(const unsigned char*)value,length)) != (int)length) { + pthread_mutex_unlock(lock_out[level]); + i++; + if (i > 1000) { + fprintf(stderr,"failed offer %d set level %d\n",length,level); + break; + } + pthread_mutex_lock(lock_out[level]); + } + if (extra == 1) { + extra_queue.pop(); + } + ret = 1; + } else { + if (extra == 0) { + extra_queue.push(pop); + } + ret = 0; + } + pthread_mutex_unlock(lock_out[level]); + return ret; +} + +/** + * Issue a set request to the server. 
+ */ +int ConnectionMultiApproxShm::issue_set(const char* key, const char* value, int length, double now, uint32_t flags) { + + int level = FLAGS_level(flags); + //Operation op; + Operation *pop = new Operation(); + +#if HAVE_CLOCK_GETTIME + pop->start_time = get_time_accurate(); +#else + if (now == 0.0) pop->start_time = get_time(); + else pop->start_time = now; +#endif + + strncpy(pop->key,key,255); + pop->valuelen = length; + pop->type = Operation::SET; + pop->opaque = opaque[level]++; + pop->flags = flags; + pop->clsid = get_class(length,strlen(key)); + op_queue[level][pop->opaque] = pop; + op_queue_size[level]++; +#ifdef DEBUGS + fprintf(stderr,"cid: %d issing set: %s, size: %u, level %d, flags: %d, opaque: %d\n",cid,key,length,level,flags,pop->opaque); +#endif + + if (opaque[level] > OPAQUE_MAX) { + opaque[level] = 1; + } + + offer_set(pop); + stats.tx_bytes += length + 32 + strlen(key); + return 1; +} + +/** + * Return the oldest live operation in progress. + */ +void ConnectionMultiApproxShm::pop_op(Operation *op) { + + uint8_t level = OP_level(op); + //op_queue[level].erase(op); + op_queue_size[level]--; + + + if (read_state == LOADING) return; + read_state = IDLE; + + // Advance the read state machine. + //if (op_queue.size() > 0) { + // Operation& op = op_queue.front(); + // switch (op.type) { + // case Operation::GET: read_state = WAITING_FOR_GET; break; + // case Operation::SET: read_state = WAITING_FOR_SET; break; + // case Operation::DELETE: read_state = WAITING_FOR_DELETE; break; + // default: DIE("Not implemented."); + // } + //} +} + +/** + * Finish up (record stats) an operation that just returned from the + * server. 
+ */ +void ConnectionMultiApproxShm::finish_op(Operation *op, int was_hit) { + double now; +#if USE_CACHED_TIME + struct timeval now_tv; + event_base_gettimeofday_cached(base, &now_tv); + now = tv_to_double(&now_tv); +#else + now = get_time(); +#endif +#if HAVE_CLOCK_GETTIME + op->end_time = get_time_accurate(); +#else + op->end_time = now; +#endif + + if (was_hit) { + switch (op->type) { + case Operation::GET: + switch (OP_level(op)) { + case 1: + stats.log_get_l1(*op); + break; + case 2: + stats.log_get_l2(*op); + if (op->l1 != NULL) { + op->l1->end_time = now; + stats.log_get(*(op->l1)); + } + break; + } + break; + case Operation::SET: + switch (OP_level(op)) { + case 1: + stats.log_set_l1(*op); + break; + case 2: + stats.log_set_l2(*op); + break; + } + break; + case Operation::DELETE: break; + case Operation::TOUCH: break; + default: DIE("Not implemented."); + } + } + //} else { + // switch (op->type) { + // case Operation::GET: + // if (OP_log(op)) { + // switch (OP_level(op)) { + // case 1: + // stats.log_get_l1(*op); + // break; + // case 2: + // stats.log_get_l2(*op); + // if (op->l1 != NULL) { + // op->l1->end_time = now; + // stats.log_get(*(op->l1)); + // } + // break; + // } + // } + // break; + // case Operation::SET: + // if (OP_log(op)) { + // switch (OP_level(op)) { + // case 1: + // stats.log_set_l1(*op); + // break; + // case 2: + // stats.log_set_l2(*op); + // break; + // } + // } + // break; + // case Operation::DELETE: break; + // case Operation::TOUCH: break; + // default: DIE("Not implemented."); + // } + //} + + last_rx = now; + uint8_t level = OP_level(op); + if (op->l1 != NULL) { + //delete op_queue[1][op->l1->opaque]; + op_queue[1][op->l1->opaque] = 0; + op_queue_size[1]--; + delete op->l1; + } + //op_queue[level].erase(op_queue[level].begin()+opopq); + if (op == op_queue[level][op->opaque] && + op->opaque == op_queue[level][op->opaque]->opaque) { + //delete op_queue[level][op->opaque]; + op_queue[level][op->opaque] = 0; + delete op; + } 
else { + fprintf(stderr,"op_queue out of sync! Expected %p, got %p, opa1: %d opaq2: %d\n", + op,op_queue[level][op->opaque],op->opaque,op_queue[level][op->opaque]->opaque); + } + op_queue_size[level]--; + read_state = IDLE; + + +} + + + +/** + * Check if our testing is done and we should exit. + */ +bool ConnectionMultiApproxShm::check_exit_condition(double now) { + if (eof && op_queue_size[1] == 0 && op_queue_size[2] == 0) { + return true; + } + if (read_state == INIT_READ) return false; + + return false; +} + + + +/** + * Request generation loop + */ +void ConnectionMultiApproxShm::drive_write_machine_shm(double now) { + + while (trace_queue->size() > 0) { + int extra_tries = extra_queue.size(); + for (int i = 0; i < extra_tries; i++) { + Operation *Op = extra_queue.front(); + switch(Op->type) + { + case Operation::GET: + offer_get(Op,1); + break; + case Operation::SET: + offer_set(Op,1); + break; + } + } + + int nissued = 0; + int nissuedl2 = 0; + while (nissued < options.depth && extra_queue.size() == 0) { + Operation *Op = trace_queue->front(); + + if (Op == NULL || trace_queue->size() <= 0 || Op->type == Operation::SASL) { + eof = 1; + cid_rate.insert( {cid, 100 } ); + fprintf(stderr,"cid %d done\n",cid); + string op_queue1; + string op_queue2; + for (int j = 0; j < 2; j++) { + for (int i = 0; i < OPAQUE_MAX; i++) { + if (op_queue[j+1][i] != NULL) { + if (j == 0) { + op_queue1 = op_queue1 + "," + op_queue[j+1][i]->key; + } else { + op_queue2 = op_queue2 + "," + op_queue[j+1][i]->key; + } + } + } + } + fprintf(stderr,"cid %d op_queue1: %s op_queue2: %s, op_queue_size1: %d, op_queue_size2: %d\n",cid,op_queue1.c_str(),op_queue2.c_str(),op_queue_size[1],op_queue_size[2]); + return; + } + int gtg = 0; + pthread_mutex_lock(lock_out[1]); + switch(Op->type) + { + case Operation::GET: + gtg = bipbuf_unused(bipbuf_out[1]) > (int)(24+strlen(Op->key)) ? 1 : 0; + break; + case Operation::SET: + gtg = bipbuf_unused(bipbuf_out[1]) > (int)(32+Op->valuelen) ? 
1 : 0; + break; + } + pthread_mutex_unlock(lock_out[1]); + + + if (gtg) { + trace_queue->pop(); + int l2issued = issue_op(Op); + nissuedl2 += l2issued; + nissued++; + } else { + break; + } + } + + //wait for response (at least nissued) + int l2issued = read_response_l1(); + nissuedl2 += l2issued; + if (nissuedl2 > 0) { + read_response_l2(); + } + + } + +} + +/** + * Request generation loop + */ +//void ConnectionMultiApproxShm::drive_write_machine_shm_2(double now) { +// +// while (trace_queue->size() > 0) { +// int extra_tries = extra_queue.size(); +// for (int i = 0; i < extra_tries; i++) { +// Operation *Op = extra_queue.front(); +// switch(Op->type) +// { +// case Operation::GET: +// offer_get(Op,1); +// break; +// case Operation::SET: +// offer_set(Op,1); +// break; +// } +// } +// +// int nissued = 0; +// int nissuedl2 = 0; +// while (nissued < options.depth && extra_queue.size() == 0) { +// Operation *Op = trace_queue->front(); +// +// if (Op == NULL || trace_queue->size() <= 0 || Op->type == Operation::SASL) { +// eof = 1; +// cid_rate.insert( {cid, 100 } ); +// fprintf(stderr,"cid %d done\n",cid); +// string op_queue1; +// string op_queue2; +// for (int j = 0; j < 2; j++) { +// for (int i = 0; i < OPAQUE_MAX; i++) { +// if (op_queue[j+1][i] != NULL) { +// if (j == 0) { +// op_queue1 = op_queue1 + "," + op_queue[j+1][i]->key; +// } else { +// op_queue2 = op_queue2 + "," + op_queue[j+1][i]->key; +// } +// } +// } +// } +// fprintf(stderr,"cid %d op_queue1: %s op_queue2: %s, op_queue_size1: %d, op_queue_size2: %d\n",cid,op_queue1.c_str(),op_queue2.c_str(),op_queue_size[1],op_queue_size[2]); +// return; +// } +// int gtg = 0; +// pthread_mutex_lock(lock_out[1]); +// switch(Op->type) +// { +// case Operation::GET: +// gtg = bipbuf_unused(bipbuf_out[1]) > (int)(24+strlen(Op->key)) ? 1 : 0; +// break; +// case Operation::SET: +// gtg = bipbuf_unused(bipbuf_out[1]) > (int)(32+Op->valuelen) ? 
1 : 0; +// break; +// } +// pthread_mutex_unlock(lock_out[1]); +// +// +// if (gtg) { +// trace_queue->pop(); +// int l2issued = issue_op(Op); +// nissuedl2 += l2issued; +// nissued++; +// } else { +// break; +// } +// } +// +// //wait for response (at least nissued) +// int l2issued = read_response_l1(); +// nissuedl2 += l2issued; +// if (nissuedl2 > 0) { +// read_response_l2(); +// } +// +// } +// +//} + +/** + * Tries to consume a binary response (in its entirety) from shared memory. + * + * @param input evBuffer to read response from + * @return true if consumed, false if not enough data in buffer. + */ +static int handle_response(ConnectionMultiApproxShm *conn, unsigned char *input, bool &done, bool &found, int &opcode, uint32_t &opaque, evicted_t *evict, int level) { + // Read the first 24 bytes as a header + //int length = evbuffer_get_length(input); + //if (length < 24) return false; + //binary_header_t* h = + // reinterpret_cast(evbuffer_pullup(input, 24)); + //assert(h); + binary_header_t* h = + reinterpret_cast(input); + + uint32_t bl = ntohl(h->body_len); + uint16_t kl = ntohs(h->key_len); + uint8_t el = h->extra_len; + // Not whole response + int targetLen = 24 + bl; + + opcode = h->opcode; + opaque = ntohl(h->opaque); + uint16_t status = ntohs(h->status); +#ifdef DEBUGMC + fprintf(stderr,"cid: %d handle resp from l%d - opcode: %u opaque: %u keylen: %u extralen: %u datalen: %u status: %u\n",conn->get_cid(),level, + h->opcode,ntohl(h->opaque),ntohs(h->key_len),h->extra_len, + ntohl(h->body_len),ntohs(h->status)); +#endif + + pthread_mutex_lock(conn->lock_in[level]); + unsigned char *abuf; + int tries = 0; + while ((abuf = bipbuf_poll(conn->bipbuf_in[level],targetLen)) == NULL) { + pthread_mutex_unlock(conn->lock_in[level]); + tries++; + if (tries > 10) { + //fprintf(stderr,"more than 10000 tries for cid: %d for length %d\n",conn->get_cid(),targetLen); + return 0; + + } + pthread_mutex_lock(conn->lock_in[level]); + } + unsigned char bbuf[1024*1024]; + 
unsigned char *buf = (unsigned char*) &bbuf; + if (abuf != NULL) { + memcpy(bbuf,abuf,targetLen); + } + buf += 24; + pthread_mutex_unlock(conn->lock_in[level]); + + + // If something other than success, count it as a miss + if (opcode == CMD_GET && status == RESP_NOT_FOUND) { + switch(level) { + case 1: + conn->stats.get_misses_l1++; + break; + case 2: + conn->stats.get_misses_l2++; + conn->stats.get_misses++; + conn->stats.window_get_misses++; + break; + + } + found = false; + //evbuffer_drain(input, targetLen); + + } else if (opcode == CMD_SET && kl > 0 && evict != NULL) { + //evbuffer_drain(input,24); + //unsigned char *buf = evbuffer_pullup(input,bl); + + + evict->clsid = *((uint32_t*)buf); + evict->clsid = ntohl(evict->clsid); + buf += 4; + + evict->serverFlags = *((uint32_t*)buf); + evict->serverFlags = ntohl(evict->serverFlags); + buf += 4; + + evict->evictedFlags = *((uint32_t*)buf); + evict->evictedFlags = ntohl(evict->evictedFlags); + buf += 4; + + + evict->evictedKeyLen = kl; + evict->evictedKey = (char*)malloc(kl+1); + memset(evict->evictedKey,0,kl+1); + memcpy(evict->evictedKey,buf,kl); + buf += kl; + + + evict->evictedLen = bl - kl - el; + evict->evictedData = (char*)malloc(evict->evictedLen); + memcpy(evict->evictedData,buf,evict->evictedLen); + evict->evicted = true; + //fprintf(stderr,"class: %u, serverFlags: %u, evictedFlags: %u\n",evict->clsid,evict->serverFlags,evict->evictedFlags); + } else if ( (opcode == CMD_TOUCH && status == RESP_NOT_FOUND) || + (opcode == CMD_DELETE && status == RESP_NOT_FOUND) ) { + found = false; + } + + conn->stats.rx_bytes += targetLen; + done = true; + return targetLen; +} + +int ConnectionMultiApproxShm::read_response_l1() { + + //maybe need mutex etc. 
+ unsigned char input[64]; + pthread_mutex_lock(lock_in[1]); + unsigned char *in = bipbuf_peek(bipbuf_in[1],24); + if (in) { + memcpy(input,in,24); + } + pthread_mutex_unlock(lock_in[1]); + if (in == NULL) { + return 0; + } + + uint32_t responses_expected = op_queue_size[1]; + Operation *op = NULL; + bool done, found; + found = true; + int bytes_read = 1; + int l2reqs = 0; + uint32_t responses = 0; + while (bytes_read > 0 && responses < responses_expected && input) { + + + int opcode; + uint32_t opaque; + evicted_t *evict = (evicted_t*)malloc(sizeof(evicted_t)); + memset(evict,0,sizeof(evicted_t)); + bytes_read = handle_response(this,input, done, found, opcode, opaque, evict,1); + + if (bytes_read > 0) { + if (opcode == CMD_NOOP) { +#ifdef DEBUGMC + char out[128]; + sprintf(out,"conn l1: %u, reading noop\n",cid); + write(2,out,strlen(out)); +#endif + if (evict->evictedKey) free(evict->evictedKey); + if (evict->evictedData) free(evict->evictedData); + free(evict); + continue; + } + op = op_queue[1][opaque]; +#ifdef DEBUGMC + char out[128]; + sprintf(out,"conn l1: %u, reading opaque: %u\n",cid,opaque); + write(2,out,strlen(out)); + output_op(op,2,found); +#endif + if (strlen(op->key) < 1) { +#ifdef DEBUGMC + char out2[128]; + sprintf(out2,"conn l1: %u, bad op: %s\n",cid,op->key); + write(2,out2,strlen(out2)); +#endif + if (evict->evictedKey) free(evict->evictedKey); + if (evict->evictedData) free(evict->evictedData); + free(evict); + continue; + } + responses++; + } else { + if (evict) { + if (evict->evictedKey) free(evict->evictedKey); + if (evict->evictedData) free(evict->evictedData); + free(evict); + } + return 0; + } + + + double now = get_time(); + int wb = 0; + if (options.rand_admit) { + wb = (rand() % options.rand_admit); + } + switch (op->type) { + case Operation::GET: + if (done) { + + int vl = op->valuelen; + if ( !found && (options.getset || options.getsetorset) ) { + /* issue a get a l2 */ + int flags = OP_clu(op); + 
issue_get_with_len(op->key,vl,now,false, flags | SRC_L1_M | ITEM_L2 | LOG_OP, op); + op->end_time = now; + this->stats.log_get_l1(*op); + //finish_op(op,0); + + } else { + if (OP_incl(op) && ghits >= gloc) { + //int ret = add_to_touch_keys(string(op->key)); + //if (ret == 1) { + issue_touch(op->key,vl,now, ITEM_L2 | SRC_L1_H); + //} + gloc += rand()%(10*2-1)+1; + } + ghits++; + finish_op(op,1); + } + l2reqs++; + } else { + char out[128]; + sprintf(out,"conn l1: %u, not done reading, should do something",cid); + write(2,out,strlen(out)); + } + break; + case Operation::SET: + //if (OP_src(op) == SRC_L1_COPY || + // OP_src(op) == SRC_L2_M) { + // del_copy_keys(string(op->key)); + //} + if (evict->evicted) { + string wb_key(evict->evictedKey); + if ((evict->evictedFlags & ITEM_INCL) && (evict->evictedFlags & ITEM_DIRTY)) { + //int ret = add_to_wb_keys(wb_key); + //if (ret == 1) { + issue_set(evict->evictedKey, evict->evictedData, evict->evictedLen, now, ITEM_L2 | ITEM_INCL | LOG_OP | SRC_WB | ITEM_DIRTY); + //} + this->stats.incl_wbs++; + l2reqs++; + } else if (evict->evictedFlags & ITEM_EXCL) { + //fprintf(stderr,"excl writeback %s\n",evict->evictedKey); + //strncpy(wb_key,evict->evictedKey,255); + if ( (options.rand_admit && wb == 0) || + (options.threshold && (g_key_hist[wb_key] == 1)) || + (options.wb_all) ) { + //int ret = add_to_wb_keys(wb_key); + //if (ret == 1) { + issue_set(evict->evictedKey, evict->evictedData, evict->evictedLen, now, ITEM_L2 | ITEM_EXCL | LOG_OP | SRC_WB); + //} + this->stats.excl_wbs++; + l2reqs++; + } + } + if (OP_src(op) == SRC_DIRECT_SET) { + if ( (evict->serverFlags & ITEM_SIZE_CHANGE) || ((evict->serverFlags & ITEM_WAS_HIT) == 0)) { + this->stats.set_misses_l1++; + } else if (OP_excl(op) && evict->serverFlags & ITEM_WAS_HIT) { + this->stats.set_excl_hits_l1++; + } else if (OP_incl(op) && evict->serverFlags & ITEM_WAS_HIT) { + this->stats.set_incl_hits_l1++; + } + } + } + finish_op(op,1); + break; + case Operation::TOUCH: + 
finish_op(op,1); + break; + default: + fprintf(stderr,"op: %p, key: %s opaque: %u\n",(void*)op,op->key,op->opaque); + DIE("not implemented"); + } + + if (evict) { + if (evict->evictedKey) free(evict->evictedKey); + if (evict->evictedData) free(evict->evictedData); + free(evict); + } + pthread_mutex_lock(lock_in[1]); + unsigned char *in = bipbuf_peek(bipbuf_in[1],24); + if (in) { + memcpy(input,in,24); + pthread_mutex_unlock(lock_in[1]); + } else { + pthread_mutex_unlock(lock_in[1]); + break; + } + + } + return l2reqs; +} + +/** + * Handle incoming data (responses). + */ +void ConnectionMultiApproxShm::read_response_l2() { + + //maybe need mutex etc. + unsigned char input[64]; + pthread_mutex_lock(lock_in[2]); + unsigned char *in = bipbuf_peek(bipbuf_in[2],24); + if (in) { + memcpy(input,in,24); + } + pthread_mutex_unlock(lock_in[2]); + if (in == NULL) { + return; + } + + uint32_t responses_expected = op_queue_size[2]; + Operation *op = NULL; + bool done, found; + found = true; + int bytes_read = 1; + int l2reqs = 0; + uint32_t responses = 0; + + while (bytes_read > 0 && responses < responses_expected && input) { + + int opcode; + uint32_t opaque; + evicted_t *evict = (evicted_t*)malloc(sizeof(evicted_t)); + memset(evict,0,sizeof(evicted_t)); + bytes_read = handle_response(this,input, done, found, opcode, opaque, evict,2); + + if (bytes_read > 0) { + if (opcode == CMD_NOOP) { +#ifdef DEBUGMC + char out[128]; + sprintf(out,"conn l2: %u, reading noop\n",cid); + write(2,out,strlen(out)); +#endif + continue; + } + op = op_queue[2][opaque]; +#ifdef DEBUGMC + char out[128]; + sprintf(out,"conn l2: %u, reading opaque: %u\n",cid,opaque); + write(2,out,strlen(out)); + output_op(op,2,found); +#endif + if (strlen(op->key) < 1) { +#ifdef DEBUGMC + char out2[128]; + sprintf(out2,"conn l2: %u, bad op: %s\n",cid,op->key); + write(2,out2,strlen(out2)); +#endif + continue; + } + responses++; + } else { + return; + } + + + double now = get_time(); + switch (op->type) { + case 
Operation::GET: + if ( !found && (options.getset || options.getsetorset) ) { // && + //(options.twitter_trace != 1)) { + int valuelen = op->valuelen; + int index = lrand48() % (1024 * 1024); + int flags = OP_clu(op) | SRC_L2_M | LOG_OP; + //int ret = add_to_copy_keys(string(op->key)); + //if (ret == 1) { + issue_set(op->key, &random_char[index], valuelen, now, flags | ITEM_L1); + if (OP_incl(op)) { + issue_set(op->key, &random_char[index], valuelen, now, flags | ITEM_L2); + last_quiet2 = false; + } + //} + last_quiet1 = false; + finish_op(op,0); // sets read_state = IDLE + + } else { + if (found) { + int valuelen = op->valuelen; + int index = lrand48() % (1024 * 1024); + int flags = OP_clu(op) | ITEM_L1 | SRC_L1_COPY; + string key = string(op->key); + const char *data = &random_char[index]; + //int ret = add_to_copy_keys(string(op->key)); + //if (ret == 1) { + issue_set(op->key,data,valuelen, now, flags); + //} + this->stats.copies_to_l1++; + //djb: this is automatically done in the L2 server + //if (OP_excl(op)) { //djb: todo should we delete here for approx or just let it die a slow death? 
+ // issue_delete(op->key,now, ITEM_L2 | SRC_L1_COPY ); + //} + finish_op(op,1); + + } else { + finish_op(op,0); + } + } + break; + case Operation::SET: + //if (OP_src(op) == SRC_WB) { + // del_wb_keys(string(op->key)); + //} + finish_op(op,1); + break; + case Operation::TOUCH: + if (OP_src(op) == SRC_DIRECT_SET || SRC_L1_H) { + int valuelen = op->valuelen; + if (!found) { + int index = lrand48() % (1024 * 1024); + issue_set(op->key, &random_char[index],valuelen,now, ITEM_INCL | ITEM_L2 | LOG_OP | SRC_L2_M); + this->stats.set_misses_l2++; + } else { + if (OP_src(op) == SRC_DIRECT_SET) { + issue_touch(op->key,valuelen,now, ITEM_L1 | SRC_L2_H | ITEM_DIRTY); + } + } + //del_touch_keys(string(op->key)); + } + finish_op(op,0); + break; + case Operation::DELETE: + //check to see if it was a hit + //fprintf(stderr," del %s -- %d from %d\n",op->key.c_str(),found,OP_src(op)); + if (OP_src(op) == SRC_DIRECT_SET) { + if (found) { + this->stats.delete_hits_l2++; + } else { + this->stats.delete_misses_l2++; + } + } + finish_op(op,1); + break; + default: + fprintf(stderr,"op: %p, key: %s opaque: %u\n",(void*)op,op->key,op->opaque); + DIE("not implemented"); + } + + pthread_mutex_lock(lock_in[2]); + unsigned char *in = bipbuf_peek(bipbuf_in[2],24); + if (in) { + memcpy(input,in,24); + pthread_mutex_unlock(lock_in[2]); + } else { + pthread_mutex_unlock(lock_in[2]); + break; + } + + } +} + diff --git a/ConnectionOptions.h b/ConnectionOptions.h index ba3d70c..96d70fc 100644 --- a/ConnectionOptions.h +++ b/ConnectionOptions.h @@ -4,17 +4,37 @@ #include "distributions.h" typedef struct { + int apps; + int rand_admit; + bool ratelimit; + bool v1callback; + int threshold; + int wb_all; + bool miss_through; int connections; bool blocking; double lambda; int qps; int records; - + int misswindow; + int queries; + int assoc; + char file_name[256]; + bool read_file; bool binary; + bool unix_socket; + bool successful_queries; + bool use_assoc; + bool redis; + bool getset; + bool getsetorset; 
+ bool delete90; bool sasl; char username[32]; char password[32]; + char prefix[256]; + char hashtype[256]; char keysize[32]; char valuesize[32]; // int keysize; @@ -23,7 +43,7 @@ typedef struct { // qps_per_connection // iadist - + int twitter_trace; double update; int time; bool loadonly; @@ -42,6 +62,8 @@ typedef struct { bool oob_thread; bool moderate; + char inclusives[256]; + } options_t; #endif // CONNECTIONOPTIONS_H diff --git a/ConnectionStats.h b/ConnectionStats.h index e957c19..1c79ea4 100644 --- a/ConnectionStats.h +++ b/ConnectionStats.h @@ -22,43 +22,101 @@ class ConnectionStats { public: ConnectionStats(bool _sampling = true) : #ifdef USE_ADAPTIVE_SAMPLER - get_sampler(100000), set_sampler(100000), op_sampler(100000), + get_sampler(100000), set_sampler(100000), + get_l1_sampler(100000), set_l1_sampler(100000), + get_l2_sampler(100000), set_l2_sampler(100000), + access_sampler(100000), op_sampler(100000), #elif defined(USE_HISTOGRAM_SAMPLER) - get_sampler(10000,1), set_sampler(10000,1), op_sampler(1000,1), + get_sampler(10000,1), set_sampler(10000,1), + get_l1_sampler(10000,1), set_l1_sampler(10000,1), + get_l2_sampler(10000,1), set_l2_sampler(10000,1), + access_sampler(10000,1), op_sampler(1000,1), #else - get_sampler(200), set_sampler(200), op_sampler(100), + get_sampler(200), set_sampler(200), + get_l1_sampler(200), set_l1_sampler(200), + get_l2_sampler(200), set_l2_sampler(200), + access_sampler(200), op_sampler(100), #endif - rx_bytes(0), tx_bytes(0), gets(0), sets(0), - get_misses(0), skips(0), sampling(_sampling) {} + rx_bytes(0), tx_bytes(0), + gets(0), sets(0), + gets_l1(0), sets_l1(0), + gets_l2(0), sets_l2(0), + accesses(0), + get_misses(0), + get_misses_l1(0), get_misses_l2(0), + set_misses_l1(0), set_misses_l2(0), + excl_wbs(0), incl_wbs(0), + copies_to_l1(0), + delete_misses_l2(0), + delete_hits_l2(0), + gets_cid(40), sets_cid(40), + set_incl_hits_l1(0),set_excl_hits_l1(0), + window_gets(0), window_sets(0), window_accesses(0), + 
window_get_misses(0), skips(0), sampling(_sampling) {} #ifdef USE_ADAPTIVE_SAMPLER AdaptiveSampler get_sampler; AdaptiveSampler set_sampler; + AdaptiveSampler get_l1_sampler; + AdaptiveSampler set_l1_sampler; + AdaptiveSampler get_l2_sampler; + AdaptiveSampler set_l2_sampler; + AdaptiveSampler access_sampler; AdaptiveSampler op_sampler; #elif defined(USE_HISTOGRAM_SAMPLER) HistogramSampler get_sampler; HistogramSampler set_sampler; + HistogramSampler get_l1_sampler; + HistogramSampler get_l2_sampler; + HistogramSampler set_l1_sampler; + HistogramSampler set_l2_sampler; + HistogramSampler access_sampler; HistogramSampler op_sampler; #else LogHistogramSampler get_sampler; LogHistogramSampler set_sampler; + LogHistogramSampler get_l1_sampler; + LogHistogramSampler set_l1_sampler; + LogHistogramSampler get_l2_sampler; + LogHistogramSampler set_l2_sampler; + LogHistogramSampler access_sampler; LogHistogramSampler op_sampler; #endif uint64_t rx_bytes, tx_bytes; - uint64_t gets, sets, get_misses; + uint64_t gets, sets; + uint64_t gets_l1, sets_l1, gets_l2, sets_l2; + uint64_t accesses, get_misses; + uint64_t get_misses_l1, get_misses_l2; + uint64_t set_misses_l1, set_misses_l2; + uint64_t excl_wbs, incl_wbs; + uint64_t copies_to_l1; + uint64_t delete_misses_l2; + uint64_t delete_hits_l2; + vector gets_cid; + vector sets_cid; + uint64_t set_incl_hits_l1, set_excl_hits_l1; + uint64_t window_gets, window_sets, window_accesses, window_get_misses; uint64_t skips; double start, stop; bool sampling; - void log_get(Operation& op) { if (sampling) get_sampler.sample(op); gets++; } - void log_set(Operation& op) { if (sampling) set_sampler.sample(op); sets++; } + void log_get(Operation& op) { if (sampling) get_sampler.sample(op); } //window_gets++; gets++; } + void log_set(Operation& op) { if (sampling) set_sampler.sample(op); window_sets++; sets++; } + + void log_get_l1(Operation& op) { if (sampling) get_l1_sampler.sample(op); window_gets++; gets_l1++; } + void log_set_l1(Operation& 
op) { if (sampling) set_l1_sampler.sample(op); window_sets++; sets_l1++; } + + void log_get_l2(Operation& op) { if (sampling) get_l2_sampler.sample(op); window_gets++; gets_l2++; } + void log_set_l2(Operation& op) { if (sampling) set_l2_sampler.sample(op); window_sets++; sets_l2++; } + void log_access(Operation& op) { //if (sampling) access_sampler.sample(op); + window_accesses++; } //accesses++; } void log_op (double op) { if (sampling) op_sampler.sample(op); } double get_qps() { - return (gets + sets) / (stop - start); + return (gets_l1 + gets_l2 + sets_l1 + sets_l2) / (stop - start); } #ifdef USE_ADAPTIVE_SAMPLER @@ -69,8 +127,18 @@ class ConnectionStats { for (auto s: get_sampler.samples) samples.push_back(s.time()); // (s.end_time - s.start_time) * 1000000); + for (auto s: get_l1_sampler.samples) + samples.push_back(s.time()); // (s.end_time - s.start_time) * 1000000); + for (auto s: get_l2_sampler.samples) + samples.push_back(s.time()); // (s.end_time - s.start_time) * 1000000); for (auto s: set_sampler.samples) samples.push_back(s.time()); // (s.end_time - s.start_time) * 1000000); + for (auto s: set_l1_sampler.samples) + samples.push_back(s.time()); // (s.end_time - s.start_time) * 1000000); + for (auto s: set_l2_sampler.samples) + samples.push_back(s.time()); // (s.end_time - s.start_time) * 1000000); + for (auto s: access_sampler.samples) + samples.push_back(s.time()); // (s.end_time - s.start_time) * 1000000); sort(samples.begin(), samples.end()); @@ -91,19 +159,49 @@ class ConnectionStats { void accumulate(const ConnectionStats &cs) { #ifdef USE_ADAPTIVE_SAMPLER for (auto i: cs.get_sampler.samples) get_sampler.sample(i); //log_get(i); + for (auto i: cs.get_l1_sampler.samples) get_l1_sampler.sample(i); //log_get(i); + for (auto i: cs.get_l2_sampler.samples) get_l2_sampler.sample(i); //log_get(i); for (auto i: cs.set_sampler.samples) set_sampler.sample(i); //log_set(i); + for (auto i: cs.set_l1_sampler.samples) set_l1_sampler.sample(i); //log_set(i); + 
for (auto i: cs.set_l2_sampler.samples) set_l2_sampler.sample(i); //log_set(i); + for (auto i: cs.access_sampler.samples) access_sampler.sample(i); //log_access(i); for (auto i: cs.op_sampler.samples) op_sampler.sample(i); //log_op(i); #else get_sampler.accumulate(cs.get_sampler); + get_l1_sampler.accumulate(cs.get_l1_sampler); + get_l2_sampler.accumulate(cs.get_l2_sampler); set_sampler.accumulate(cs.set_sampler); + set_l1_sampler.accumulate(cs.set_l1_sampler); + set_l2_sampler.accumulate(cs.set_l2_sampler); + access_sampler.accumulate(cs.access_sampler); op_sampler.accumulate(cs.op_sampler); #endif + for (int i = 0; i < 40; i++) { + gets_cid[i] += cs.gets_cid[i]; + sets_cid[i] += cs.sets_cid[i]; + } rx_bytes += cs.rx_bytes; tx_bytes += cs.tx_bytes; gets += cs.gets; sets += cs.sets; + gets_l1 += cs.gets_l1; + gets_l2 += cs.gets_l2; + sets_l1 += cs.sets_l1; + sets_l2 += cs.sets_l2; + accesses += cs.accesses; get_misses += cs.get_misses; + get_misses_l1 += cs.get_misses_l1; + get_misses_l2 += cs.get_misses_l2; + set_misses_l1 += cs.set_misses_l1; + set_misses_l2 += cs.set_misses_l2; + excl_wbs += cs.excl_wbs; + incl_wbs += cs.incl_wbs; + copies_to_l1 += cs.copies_to_l1; + delete_misses_l2 += cs.delete_misses_l2; + delete_hits_l2 += cs.delete_hits_l2; + set_excl_hits_l1 += cs.set_excl_hits_l1; + set_incl_hits_l1 += cs.set_incl_hits_l1; skips += cs.skips; start = cs.start; @@ -115,7 +213,23 @@ class ConnectionStats { tx_bytes += as.tx_bytes; gets += as.gets; sets += as.sets; + gets_l1 += as.gets_l1; + gets_l2 += as.gets_l2; + sets_l1 += as.sets_l1; + sets_l2 += as.sets_l2; + accesses += as.accesses; get_misses += as.get_misses; + get_misses_l1 += as.get_misses_l1; + get_misses_l2 += as.get_misses_l2; + set_misses_l1 += as.set_misses_l1; + set_misses_l2 += as.set_misses_l2; + excl_wbs += as.excl_wbs; + incl_wbs += as.incl_wbs; + copies_to_l1 += as.copies_to_l1; + delete_misses_l2 += as.delete_misses_l2; + delete_hits_l2 += as.delete_hits_l2; + set_excl_hits_l1 += 
as.set_excl_hits_l1; + set_incl_hits_l1 += as.set_incl_hits_l1; skips += as.skips; start = as.start; @@ -123,9 +237,9 @@ class ConnectionStats { } static void print_header() { - printf("%-7s %7s %7s %7s %7s %7s %7s %7s %7s\n", + printf("%-7s %7s %7s %7s %7s %7s %7s %7s %7s %7s %7s\n", "#type", "avg", "std", "min", /*"1st",*/ "5th", "10th", - "90th", "95th", "99th"); + "50th", "90th", "95th", "99th", "99.9th"); } #ifdef USE_ADAPTIVE_SAMPLER @@ -137,18 +251,18 @@ class ConnectionStats { size_t l = copy.size(); if (l == 0) { - printf("%-7s %7.1f %7.1f %7.1f %7.1f %7.1f %7.1f %7.1f %7.1f", - tag, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0); + printf("%-7s %7.1f %7.1f %7.1f %7.1f %7.1f %7.1f %7.1f %7.1f %7.1f %7.1f", + tag, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0); if (newline) printf("\n"); return; } sort(copy.begin(), copy.end()); - printf("%-7s %7.1f %7.1f %7.1f %7.1f %7.1f %7.1f %7.1f %7.1f", + printf("%-7s %7.1f %7.1f %7.1f %7.1f %7.1f %7.1f %7.1f %7.1f %7.1f %7.1f", tag, std::accumulate(copy.begin(), copy.end(), 0.0) / l, - copy[0], copy[(l*1) / 100], copy[(l*5) / 100], copy[(l*10) / 100], - copy[(l*90) / 100], copy[(l*95) / 100], copy[(l*99) / 100] + copy[0], copy[(l*1) / 100], copy[(l*5) / 100], copy[(l*10) / 100], copy[(l*50) / 100], + copy[(l*90) / 100], copy[(l*95) / 100], copy[(l*99) / 100], copy[(l*99.9) / 100] ); if (newline) printf("\n"); } @@ -164,10 +278,10 @@ class ConnectionStats { sort(copy.begin(), copy.end()); - printf("%-7s %7.1f %7.1f %7.1f %7.1f %7.1f %7.1f %7.1f %7.1f", + printf("%-7s %7.1f %7.1f %7.1f %7.1f %7.1f %7.1f %7.1f %7.1f %7.1f %7.1f", tag, std::accumulate(copy.begin(), copy.end(), 0.0) / l, - copy[0], copy[(l*1) / 100], copy[(l*5) / 100], copy[(l*10) / 100], - copy[(l*90) / 100], copy[(l*95) / 100], copy[(l*99) / 100] + copy[0], copy[(l*1) / 100], copy[(l*5) / 100], copy[(l*10) / 100], copy[(l*50) / 100], + copy[(l*90) / 100], copy[(l*95) / 100], copy[(l*99) / 100], copy[(l*99.9) / 100] ); if (newline) printf("\n"); } @@ -175,8 
+289,8 @@ class ConnectionStats { void print_stats(const char *tag, HistogramSampler &sampler, bool newline = true) { if (sampler.total() == 0) { - printf("%-7s %7.1f %7.1f %7.1f %7.1f %7.1f %7.1f %7.1f %7.1f", - tag, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0); + printf("%-7s %7.1f %7.1f %7.1f %7.1f %7.1f %7.1f %7.1f %7.1f %7.1f %7.1f", + tag, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0); if (newline) printf("\n"); return; } @@ -184,8 +298,8 @@ class ConnectionStats { printf("%-7s %7.1f %7.1f %7.1f %7.1f %7.1f %7.1f %7.1f %7.1f", tag, sampler.average(), sampler.get_nth(0), sampler.get_nth(1), sampler.get_nth(5), - sampler.get_nth(10), sampler.get_nth(90), - sampler.get_nth(95), sampler.get_nth(99)); + sampler.get_nth(10), sampler.get_nth(50), sampler.get_nth(90), + sampler.get_nth(95), sampler.get_nth(99), sampler.get_nth(99.9)); if (newline) printf("\n"); } @@ -193,17 +307,18 @@ class ConnectionStats { void print_stats(const char *tag, LogHistogramSampler &sampler, bool newline = true) { if (sampler.total() == 0) { - printf("%-7s %7.1f %7.1f %7.1f %7.1f %7.1f %7.1f %7.1f %7.1f", - tag, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0); + printf("%-7s %7.1f %7.1f %7.1f %7.1f %7.1f %7.1f %7.1f %7.1f %7.1f %7.1f", + tag, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0); if (newline) printf("\n"); return; } - printf("%-7s %7.1f %7.1f %7.1f %7.1f %7.1f %7.1f %7.1f %7.1f", + printf("%-7s %7.1f %7.1f %7.1f %7.1f %7.1f %7.1f %7.1f %7.1f %7.1f %7.1f", tag, sampler.average(), sampler.stddev(), - sampler.get_nth(0), /*sampler.get_nth(1),*/ sampler.get_nth(5), - sampler.get_nth(10), sampler.get_nth(90), - sampler.get_nth(95), sampler.get_nth(99)); + sampler.get_nth(0), sampler.get_nth(5), + sampler.get_nth(10), sampler.get_nth(50), + sampler.get_nth(90), sampler.get_nth(95), + sampler.get_nth(99), sampler.get_nth(99.9) ); if (newline) printf("\n"); } diff --git a/Generator.h b/Generator.h index eb598b1..bf57e2a 100644 --- a/Generator.h +++ b/Generator.h @@ -119,6 +119,102 @@ class 
Exponential : public Generator { double lambda; }; +class Zipfian : public Generator { +public: + Zipfian(double _alpha = 1.0, unsigned int _m = 100) : + alpha(_alpha), m(_m) { + int i; + // Compute normalization constant + for (i = 1; i <= m; i++) + c = c + (1.0 / pow((double) i, alpha)); + c = 1.0 / c; + + sum_probs = (double*)malloc((m+1)*sizeof(double)); + sum_probs[0] = 0; + for (i = 1; i <= m; i++) { + sum_probs[i] = sum_probs[i-1] + c / pow((double) i, alpha); + } + + D("Zipfian(alpha=%f, m=%u)", alpha, m); + } + + virtual double generate() { + double z; // Uniform random number (0 < z < 1) + int zipf_value; // Computed exponential value to be returned + int low, high, mid; // Binary-search bounds + + // Pull a uniform random number (0 < z < 1) + do + { + z = rand_val(0); + } + while ((z == 0) || (z == 1)); + + // Map z to the value + low = 1, high = m, mid = (m/2); + do { + mid = floor((low+high)/2); + if (sum_probs[mid] >= z && sum_probs[mid-1] < z) { + zipf_value = mid; + break; + } else if (sum_probs[mid] >= z) { + high = mid-1; + } else { + low = mid+1; + } + } while (low <= high); + + // Assert that zipf_value is between 1 and M + assert((zipf_value >=1) && (zipf_value <= m)); + + return(zipf_value); + } + + //========================================================================= + //= Multiplicative LCG for generating uniform(0.0, 1.0) random numbers = + //= - x_n = 7^5*x_(n-1)mod(2^31 - 1) = + //= - With x seeded to 1 the 10000th x value should be 1043618065 = + //= - From R. Jain, "The Art of Computer Systems Performance Analysis," = + //= John Wiley & Sons, 1991. 
(Page 443, Figure 26.2) = + //========================================================================= + static double rand_val(int seed) { + const long a = 16807; // Multiplier + const long m = 2147483647; // Modulus + const long q = 127773; // m div a + const long r = 2836; // m mod a + static long x; // Random int value + long x_div_q; // x divided by q + long x_mod_q; // x modulo q + long x_new; // New x value + + // Set the seed if argument is non-zero and then return zero + if (seed > 0) + { + x = seed; + return(0.0); + } + + // RNG using integer arithmetic + x_div_q = x / q; + x_mod_q = x % q; + x_new = (a * x_mod_q) - (r * x_div_q); + if (x_new > 0) + x = x_new; + else + x = x_new + m; + + // Return a random value between 0.0 and 1.0 + return((double) x / m); + } + + +private: + double alpha; + double m; + double c; + double *sum_probs; // Pre-calculated sum of probabilities +}; + class GPareto : public Generator { public: GPareto(double _loc = 0.0, double _scale = 1.0, double _shape = 1.0) : @@ -197,8 +293,8 @@ class KeyGenerator { double U = (double) h / ULLONG_MAX; double G = g->generate(U); int keylen = MAX(round(G), floor(log10(max)) + 1); - char key[256]; - snprintf(key, 256, "%0*" PRIu64, keylen, ind); + char key[250]; //memcached limit of 255 chars + snprintf(key, keylen, "%lu" , ind); // D("%d = %s", ind, key); return std::string(key); diff --git a/Operation.h b/Operation.h index b594b17..ceb0531 100644 --- a/Operation.h +++ b/Operation.h @@ -8,16 +8,36 @@ using namespace std; class Operation { public: + Operation() { + valuelen = 0; + opaque = 0; + flags = 0; + clsid = 0; + future = 0; + curr = 0; + l1 = NULL; + type = NOOP; + appid = 0; + start_time = 0; + end_time = 0; + memset(key,0,256); + } double start_time, end_time; enum type_enum { - GET, SET, SASL + GET, SET, DELETE, SASL, NOOP, TOUCH }; type_enum type; - - string key; - // string value; + uint16_t appid; + uint32_t valuelen; + uint32_t opaque; + uint32_t flags; + uint16_t clsid; + 
uint8_t future; + uint8_t curr; + char key[256]; + Operation *l1; double time() const { return (end_time - start_time) * 1000000; } }; diff --git a/Protocol.cc b/Protocol.cc index 6d346b8..2a46f40 100644 --- a/Protocol.cc +++ b/Protocol.cc @@ -19,35 +19,387 @@ #define unlikely(x) __builtin_expect((x),0) +/** + * + * First we build a RESP Array: + * 1. * character as the first byte + * 2. the number of elements in the array as a decimal number + * 3. CRLF + * 4. The actual RESP element we are putting into the array + * + * All Redis commands are sent as arrays of bulk strings. + * For example, the command “SET mykey ‘my value’” would be written and sent as: + * *3\r\n$3\r\nSET\r\n$5\r\nmykey\r\n$8\r\nmy value\r\n + * + * Then package command as a RESP Bulk String to the server + * + * Bulk String is the defined by the following: + * 1."$" byte followed by the number of bytes composing the + * string (a prefixed length), terminated by CRLF. + * 2. The actual string data. + * 3. A final CRLF. + * + * DBG code + * fprintf(stderr,"--\n"); + * fprintf(stderr,"*3\r\n$3\r\nSET\r\n$%lu\r\n%s\r\n$%d\r\n%s\r\n", + * strlen(key),key,len,val); + * fprintf(stderr,"--\n"); + * + */ +int ProtocolRESP::set_request(const char* key, const char* value, int len, uint32_t opaque) { + + //need to make the real value + char *val = (char*)malloc(len*sizeof(char)+1); + memset(val, 'a', len); + val[len] = '\0'; + + //check if we should use assoc + if (opts.use_assoc && strlen(key) > ((unsigned int)(opts.assoc+1)) ) + { + int l = hset_request(key,val,len); + free(val); + return l; + } + + else + { + int l; + l = evbuffer_add_printf(bufferevent_get_output(bev), + "*3\r\n$3\r\nSET\r\n$%lu\r\n%s\r\n$%d\r\n%s\r\n", + strlen(key),key,len,val); + l += len + 2; + if (read_state == IDLE) read_state = WAITING_FOR_GET; + free(val); + return l; + } + +} + +/** + * Send a RESP get request. 
+ */ +int ProtocolRESP::get_request(const char* key, uint32_t opaque) { + + //check if we should use assoc + if (opts.use_assoc && strlen(key) > ((unsigned int)(opts.assoc+1)) ) + return hget_request(key); + else + { + int l; + l = evbuffer_add_printf(bufferevent_get_output(bev), + "*2\r\n$3\r\nGET\r\n$%lu\r\n%s\r\n",strlen(key),key); + + if (read_state == IDLE) read_state = WAITING_FOR_GET; + return l; + } +} + +/** + * RESP HSET + * HSET myhash field1 "Hello" + * We break the key by last assoc bytes for now... + * We are guarenteed a key of at least assoc+1 bytes...but + * the vast vast majority are going to be 20 bytes. + * + * DBG code + * fprintf(stderr,"--\n"); + * fprintf(stderr,"*4\r\n$4\r\nHSET\r\n$%lu\r\n%s\r\n$%lu\r\n%s\r\n$%d\r\n%s\r\n", + * strlen(hash),hash,strlen(field),field,len,value); + * fprintf(stderr,"--\n"); + */ + +int ProtocolRESP::hset_request(const char* key, const char* value, int len) { + + int l = 0; + //hash is first n-assoc bytes + //field is last assoc bytes + //value is value + //int assoc = opts.assoc; + //char* hash = (char*)malloc(sizeof(char)*((strlen(key)-assoc)+1)); + //char* field = (char*)malloc(sizeof(char)*(assoc+1)); + //strncpy(hash, key, strlen(key)-assoc); + //strncpy(field,key+strlen(key)-assoc,assoc); + //hash[strlen(key)-assoc] = '\0'; + //field[assoc] = '\0'; + //l = evbuffer_add_printf(bufferevent_get_output(bev), + // "*4\r\n$4\r\nHSET\r\n$%lu\r\n%s\r\n$%lu\r\n%s\r\n$%d\r\n%s\r\n", + // strlen(hash),hash,strlen(field),field,len,value); + //l += len + 2; + //if (read_state == IDLE) read_state = WAITING_FOR_END; + //free(hash); + //free(field); + return l; + +} + +/** + * RESP HGET + * HGET myhash field1 + * We break the key by last assoc bytes for now... + * We are guarenteed a key of at least assoc+1 bytes...but + * the vast vast majority are going to be 20 bytes. 
+ */ +int ProtocolRESP::hget_request(const char* key) { + int l = 0; + //hash is first n-assoc bytes + //field is last assoc bytes + //int assoc = opts.assoc; + //char* hash = (char*)malloc(sizeof(char)*((strlen(key)-assoc)+1)); + //char* field = (char*)malloc(sizeof(char)*(assoc+1)); + //strncpy(hash, key, strlen(key)-assoc); + //strncpy(field,key+strlen(key)-assoc,assoc); + //hash[strlen(key)-assoc] = '\0'; + //field[assoc] = '\0'; + //l = evbuffer_add_printf(bufferevent_get_output(bev), + // "*3\r\n$4\r\nHGET\r\n$%lu\r\n%s\r\n$%lu\r\n%s\r\n", + // strlen(hash),hash,strlen(field),field); + + //if (read_state == IDLE) read_state = WAITING_FOR_GET; + //free(hash); + //free(field); + return l; +} + +/** + * RESP DELETE 90 - delete 90 percent of keys in DB + */ +int ProtocolRESP::delete90_request() { + int l; + l = evbuffer_add_printf(bufferevent_get_output(bev), + "*1\r\n$8\r\nFLUSHALL\r\n"); + + if (read_state == IDLE) read_state = WAITING_FOR_DELETE; + return l; +} + +/** + * Handle a RESP response. + * + * In RESP, the type of data depends on the first byte: + * + * Simple Strings the first byte of the reply is "+" + * Errors the first byte of the reply is "-" + * Integers the first byte of the reply is ":" + * Bulk Strings the first byte of the reply is "$" + * Arrays the first byte of the reply is "*" + * + * Right now we are only implementing GET response + * so the RESP type will be bulk string. 
+ * + * + */ +bool ProtocolRESP::handle_response(evbuffer *input, bool &done, bool &found, int &obj_size, uint32_t &opaque) { + opaque = 0; + + char *buf = NULL; + char *databuf = NULL; + char *obj_size_str = NULL; + int len; + size_t n_read_out; + + switch (read_state) { + + case WAITING_FOR_GET: + + buf = evbuffer_readln(input, &n_read_out, EVBUFFER_EOL_CRLF); + if (buf == NULL) return false; + + obj_size_str = buf+1; + obj_size = atoi(obj_size_str); + + conn->stats.rx_bytes += n_read_out; + + databuf = evbuffer_readln(input, &n_read_out, EVBUFFER_EOL_CRLF); + //fprintf(stderr,"--------------------\n"); + //fprintf(stderr,"resp size %lu\n",n_read_out); + //fprintf(stderr,"data size %d\n",obj_size); + //fprintf(stderr,"-------header---------\n"); + //fprintf(stderr,"%s\n",buf); + //fprintf(stderr,"-------data-----------\n"); + //fprintf(stderr,"%s\n",databuf); + + conn->stats.rx_bytes += n_read_out; + + if (!strncmp(buf,"$-1",3)) { + conn->stats.get_misses++; + conn->stats.window_get_misses++; + found = false; + done = true; + } else if ((int)n_read_out != obj_size) { + + + // FIXME: check key name to see if it corresponds to the op at + // the head of the op queue? This will be necessary to + // support "gets" where there may be misses. + + data_length = obj_size; + read_state = WAITING_FOR_GET_DATA; + done = false; + } else if (!strncmp(buf,"+OK",3) || !strncmp(buf,":1",2) || !strncmp(buf,":0",2) ) { + found = false; + done = true; + } else { + // got all the data.. + found = true; + done = true; + } + if (databuf) + free(databuf); + free(buf); + return true; + + case WAITING_FOR_GET_DATA: + + len = evbuffer_get_length(input); + + //finally got all data... 
+ if (len >= data_length + 2) { + evbuffer_drain(input, data_length + 2); + conn->stats.rx_bytes += data_length + 2; + read_state = WAITING_FOR_GET; + obj_size = data_length; + found = true; + done = true; + return true; + } + return false; + + default: printf("state: %d\n", read_state); DIE("Unimplemented!"); + } + + DIE("Shouldn't ever reach here..."); +} + + //char *buf = NUL; //for initial readline + //char *dbuf = NULL; //for data readline + //size_t n_read_out; + + //buf = evbuffer_readln(input, &n_read_out, EVBUFFER_EOL_CRLF); + //if (buf == NULL) + //{ + // done = false; + // return false; + //} + //conn->stats.rx_bytes += n_read_out; + // + //size_t len = evbuffer_get_length(input); + + //fprintf(stderr,"--------------------\n"); + //fprintf(stderr,"resp size %lu\n",n_read_out); + //fprintf(stderr,"ev len %lu\n",len); + //fprintf(stderr,"--------------------\n"); + //fprintf(stderr,"%s\n",buf); + ////RESP null response => miss + //if (!strncmp(buf,"$-1",3)) + //{ + // conn->stats.get_misses++; + // conn->stats.window_get_misses++; + // found = false; + // + //} + ////HSET or SET response was good, just consume the input and move on + ////with our lives + //else if (!strncmp(buf,"+OK",3) || !strncmp(buf,":1",2) || !strncmp(buf,":0",2) ) + //{ + // found = false; + // done = true; + //} + ////else we got a hit + //else + //{ + // char* nlen = buf+1; + // //fprintf(stderr,"%s\n",nlen); + // obj_size = atoi(nlen); + // // Consume the next "foobar" + // //size_t len = evbuffer_get_length(input); + // //dbuf = evbuffer_readln(input, &n_read_out, EVBUFFER_EOL_CRLF); + // //if (!dbuf) + // //{ + // // fprintf(stderr,"--------------------\n"); + // // fprintf(stderr,"next foobar (null) %lu\n",n_read_out); + // // fprintf(stderr,"ev len %lu\n",len); + // // fprintf(stderr,"--------------------\n"); + // // fprintf(stderr,"%s\n",dbuf); + + // // //read_state = WAITING_FOR_GET_DATA; + // // //done = false; + // // //return false; + // //} + // //else + // //{ + + // 
// fprintf(stderr,"--------------------\n"); + // // fprintf(stderr,"next foobar (null) %lu\n",n_read_out); + // // fprintf(stderr,"ev len %lu\n",len); + // // fprintf(stderr,"--------------------\n"); + // // fprintf(stderr,"%s\n",dbuf); + // //} + + // //conn->stats.rx_bytes += n_read_out; + // found = true; + //} + ////read_state = WAITING_FOR_GET; + ////fprintf(stderr,"--------------------\n"); + ////fprintf(stderr,"read_state %u\n",read_state); + ////fprintf(stderr,"--------------------\n"); + //done = true; + ////if (dbuf) + //// free(dbuf); + //free(buf); + //return true; + + +//} + /** * Send an ascii get request. */ -int ProtocolAscii::get_request(const char* key) { +int ProtocolAscii::get_request(const char* key, uint32_t opaque) { int l; l = evbuffer_add_printf( bufferevent_get_output(bev), "get %s\r\n", key); - if (read_state == IDLE) read_state = WAITING_FOR_GET; + if (read_state == IDLE) { + read_state = WAITING_FOR_GET; + } return l; } /** * Send an ascii set request. */ -int ProtocolAscii::set_request(const char* key, const char* value, int len) { +int ProtocolAscii::set_request(const char* key, const char* value, int len, uint32_t opaque) { int l; l = evbuffer_add_printf(bufferevent_get_output(bev), "set %s 0 0 %d\r\n", key, len); - bufferevent_write(bev, value, len); + + char *val = (char*)malloc(len*sizeof(char)+1); + memset(val, 'a', len); + val[len] = '\0'; + + bufferevent_write(bev, val, len); bufferevent_write(bev, "\r\n", 2); l += len + 2; - if (read_state == IDLE) read_state = WAITING_FOR_END; + if (read_state == IDLE) { + read_state = WAITING_FOR_END; + } + free(val); + return l; +} + +/** WARNING UNIMPLEMENTED **/ +int ProtocolAscii::delete90_request() { + int l; + l = evbuffer_add_printf(bufferevent_get_output(bev), + "*1\r\n$8\r\nFLUSHALL\r\n"); + return l; } /** * Handle an ascii response. 
*/ -bool ProtocolAscii::handle_response(evbuffer *input, bool &done) { +bool ProtocolAscii::handle_response(evbuffer *input, bool &done, bool &found, int &obj_size, uint32_t &opaque) { + opaque = 0; char *buf = NULL; int len; size_t n_read_out; @@ -62,7 +414,14 @@ bool ProtocolAscii::handle_response(evbuffer *input, bool &done) { conn->stats.rx_bytes += n_read_out; if (!strncmp(buf, "END", 3)) { - if (read_state == WAITING_FOR_GET) conn->stats.get_misses++; + if (read_state == WAITING_FOR_GET) { + conn->stats.get_misses++; + conn->stats.window_get_misses++; + found = false; + } + read_state = WAITING_FOR_GET; + done = true; + } else if (!strncmp(buf, "STORED", 6)) { read_state = WAITING_FOR_GET; done = true; } else if (!strncmp(buf, "VALUE", 5)) { @@ -93,6 +452,42 @@ bool ProtocolAscii::handle_response(evbuffer *input, bool &done) { } return false; + /* + case WAITING_FOR_GETSET: + buf = evbuffer_readln(input, &n_read_out, EVBUFFER_EOL_CRLF); + if (buf == NULL) return false; + + conn->stats.rx_bytes += n_read_out; + if (!strncmp(buf, "END", 3)) { + conn->stats.get_misses++; + conn->stats.window_get_misses++; + found = false; + done = true; + read_state = WAITING_FOR_SET; + return true; + } else if (!strncmp(buf, "STORED", 6)) { + done = true; + read_state = WAITING_FOR_GET; + return true; + } + + + case WAITING_FOR_SET: + buf = evbuffer_readln(input, &n_read_out, EVBUFFER_EOL_CRLF); + if (buf == NULL) return false; + + conn->stats.rx_bytes += n_read_out; + + if (!strncmp(buf, "STORED", 6)) { + done = true; + read_state = IDLE; + return true; + } else { + done = false; + return true; + } + */ + default: printf("state: %d\n", read_state); DIE("Unimplemented!"); } @@ -108,7 +503,7 @@ bool ProtocolBinary::setup_connection_w() { string user = string(opts.username); string pass = string(opts.password); - binary_header_t header = {0x80, CMD_SASL, 0, 0, 0, {0}, 0, 0, 0}; + binary_header_t header = {0x80, CMD_SASL, 0, 0, 0, 0, 0, 0, 0}; header.key_len = htons(5); 
header.body_len = htonl(6 + user.length() + 1 + pass.length()); @@ -126,50 +521,74 @@ bool ProtocolBinary::setup_connection_w() { bool ProtocolBinary::setup_connection_r(evbuffer* input) { if (!opts.sasl) return true; - bool b; - return handle_response(input, b); + bool b,c; + int obj_size; + uint32_t opaque; + return handle_response(input, b, c, obj_size, opaque); } /** * Send a binary get request. */ -int ProtocolBinary::get_request(const char* key) { +int ProtocolBinary::get_request(const char* key, uint32_t opaque) { + + struct evbuffer *output = bufferevent_get_output(bev); + uint16_t keylen = strlen(key); // each line is 4-bytes binary_header_t h = { 0x80, CMD_GET, htons(keylen), - 0x00, 0x00, {htons(0)}, + 0x00, 0x00, htons(0), htonl(keylen) }; + h.opaque = htonl(opaque); - bufferevent_write(bev, &h, 24); // size does not include extras - bufferevent_write(bev, key, keylen); + evbuffer_add(output, &h, 24); + evbuffer_add(output, key, keylen); + //bufferevent_write(bev, &h, 24); // size does not include extras + //bufferevent_write(bev, key, keylen); return 24 + keylen; } + + /** * Send a binary set request. 
*/ -int ProtocolBinary::set_request(const char* key, const char* value, int len) { +int ProtocolBinary::set_request(const char* key, const char* value, int len, uint32_t opaque) { + struct evbuffer *output = bufferevent_get_output(bev); + uint16_t keylen = strlen(key); // each line is 4-bytes binary_header_t h = { 0x80, CMD_SET, htons(keylen), - 0x08, 0x00, {htons(0)}, + 0x08, 0x00, htons(0), htonl(keylen + 8 + len) }; - - bufferevent_write(bev, &h, 32); // With extras - bufferevent_write(bev, key, keylen); - bufferevent_write(bev, value, len); + h.opaque = htonl(opaque); + //bufferevent_write(bev, &h, 32); // With extras + //bufferevent_write(bev, key, keylen); + //bufferevent_write(bev, value, len); + evbuffer_add(output, &h, 32); + evbuffer_add(output, key, keylen); + evbuffer_add(output, value, len); return 24 + ntohl(h.body_len); } +/** WARNING UNIMPLEMENTED **/ +int ProtocolBinary::delete90_request() { + int l; + l = evbuffer_add_printf(bufferevent_get_output(bev), + "*1\r\n$8\r\nFLUSHALL\r\n"); + + return l; +} + /** * Tries to consume a binary response (in its entirety) from an evbuffer. * * @param input evBuffer to read response from * @return true if consumed, false if not enough data in buffer. 
*/ -bool ProtocolBinary::handle_response(evbuffer *input, bool &done) { +bool ProtocolBinary::handle_response(evbuffer *input, bool &done, bool &found, int &opcode, uint32_t &opaque) { // Read the first 24 bytes as a header int length = evbuffer_get_length(input); if (length < 24) return false; @@ -177,24 +596,41 @@ bool ProtocolBinary::handle_response(evbuffer *input, bool &done) { reinterpret_cast(evbuffer_pullup(input, 24)); assert(h); + int bl = ntohl(h->body_len); // Not whole response - int targetLen = 24 + ntohl(h->body_len); + int targetLen = 24 + bl; if (length < targetLen) return false; + //fprintf(stderr,"handle resp - opcode: %u opaque: %u len: %u status: %u\n", + // h->opcode,ntohl(h->opaque), + // ntohl(h->body_len),ntohl(h->status)); + opcode = h->opcode; + opaque = ntohl(h->opaque); // If something other than success, count it as a miss - if (h->opcode == CMD_GET && h->status) { + if (opcode == CMD_GET && h->status) { conn->stats.get_misses++; + conn->stats.window_get_misses++; + found = false; } - if (unlikely(h->opcode == CMD_SASL)) { + if (unlikely(opcode == CMD_SASL)) { if (h->status == RESP_OK) { V("SASL authentication succeeded"); } else { DIE("SASL authentication failed"); } } + + if (bl > 0 && opcode == 1) { + //fprintf(stderr,"set resp len: %u\n",bl); + //void *data = malloc(bl); + //data = evbuffer_pullup(input, bl); + //free(data); + evbuffer_drain(input, targetLen); + } else { + evbuffer_drain(input, targetLen); + } - evbuffer_drain(input, targetLen); conn->stats.rx_bytes += targetLen; done = true; return true; diff --git a/Protocol.h b/Protocol.h index da7b253..ccd2293 100644 --- a/Protocol.h +++ b/Protocol.h @@ -18,9 +18,10 @@ class Protocol { virtual bool setup_connection_w() = 0; virtual bool setup_connection_r(evbuffer* input) = 0; - virtual int get_request(const char* key) = 0; - virtual int set_request(const char* key, const char* value, int len) = 0; - virtual bool handle_response(evbuffer* input, bool &done) = 0; + virtual int 
get_request(const char* key, uint32_t opaque) = 0; + virtual int set_request(const char* key, const char* value, int len, uint32_t opaque) = 0; + virtual int delete90_request() = 0; + virtual bool handle_response(evbuffer* input, bool &done, bool &found, int &obj_size, uint32_t &opaque) = 0; protected: options_t opts; @@ -39,9 +40,10 @@ class ProtocolAscii : public Protocol { virtual bool setup_connection_w() { return true; } virtual bool setup_connection_r(evbuffer* input) { return true; } - virtual int get_request(const char* key); - virtual int set_request(const char* key, const char* value, int len); - virtual bool handle_response(evbuffer* input, bool &done); + virtual int get_request(const char* key, uint32_t opaque); + virtual int set_request(const char* key, const char* value, int len, uint32_t opaque); + virtual int delete90_request(); + virtual bool handle_response(evbuffer* input, bool &done, bool &found, int &obj_size, uint32_t &opaque); private: enum read_fsm { @@ -49,6 +51,8 @@ class ProtocolAscii : public Protocol { WAITING_FOR_GET, WAITING_FOR_GET_DATA, WAITING_FOR_END, + WAITING_FOR_SET, + WAITING_FOR_GETSET }; read_fsm read_state; @@ -58,14 +62,49 @@ class ProtocolAscii : public Protocol { class ProtocolBinary : public Protocol { public: ProtocolBinary(options_t opts, Connection* conn, bufferevent* bev): - Protocol(opts, conn, bev) {}; + Protocol(opts, conn, bev) { + //int wbuf_written; + //int wbuf_towrite; + //unsigned char *wbuf_pos; + //unsigned char wbuf[65536]; + }; ~ProtocolBinary() {}; virtual bool setup_connection_w(); virtual bool setup_connection_r(evbuffer* input); - virtual int get_request(const char* key); - virtual int set_request(const char* key, const char* value, int len); - virtual bool handle_response(evbuffer* input, bool &done); + virtual int get_request(const char* key, uint32_t opaque); + virtual int set_request(const char* key, const char* value, int len, uint32_t opaque); + virtual int delete90_request(); + virtual bool 
handle_response(evbuffer* input, bool &done, bool &found, int &obj_size, uint32_t &opaque); +}; + +class ProtocolRESP : public Protocol { +public: + ProtocolRESP(options_t opts, Connection* conn, bufferevent* bev): + Protocol(opts, conn, bev) { + }; + ~ProtocolRESP() {}; + + virtual bool setup_connection_w() { return true; } + virtual bool setup_connection_r(evbuffer* input) { return true; } + virtual int get_request(const char* key, uint32_t opaque); + virtual int set_request(const char* key, const char* value, int len, uint32_t opaque); + virtual int hget_request(const char* key); + virtual int hset_request(const char* key, const char* value, int len); + virtual int delete90_request(); + virtual bool handle_response(evbuffer* input, bool &done, bool &found, int &obj_size, uint32_t &opaque); + +private: + enum read_fsm { + IDLE, + WAITING_FOR_GET, + WAITING_FOR_GET_DATA, + WAITING_FOR_DELETE, + WAITING_FOR_END + }; + + read_fsm read_state; + int data_length; }; #endif diff --git a/README.md b/README.md index e599886..2b1904f 100644 --- a/README.md +++ b/README.md @@ -117,7 +117,7 @@ client-side queuing delay adulterating the latency measurements. Command-line Options ==================== - mutilate3 0.1 + mutilate 0.1 Usage: mutilate -s server[:port] [options] @@ -129,20 +129,60 @@ Command-line Options --quiet Disable log messages. Basic options: - -s, --server=STRING Memcached server hostname[:port]. Repeat to + -s, --server=STRING Memcached server hostname[:port]. Repeat to specify multiple servers. + --unix_socket Use UNIX socket instead of TCP. --binary Use binary memcached protocol instead of ASCII. - -q, --qps=INT Target aggregate QPS. 0 = peak QPS. + --redis Use Redis RESP protocol instead of memchached. + --getset Use getset mode, in getset mode we first issue + a GET and if the response is MISS, then issue + a SET for on that + key following distribution value. + --getsetorset Use getset mode and allow for direct writes + (with optype == 2). 
+ --successful Only record latency and throughput stats for + successful queries + --prefix=STRING Prefix all keys with a string (helps with + multi-tennant eval) + --delete90 Delete 90 percent of keys after halfway through + the workload, used to model Rumbel et. al. + USENIX FAST '14 + workloads. MUST BE IN GETSET MODE and + have a set number of + queries + --assoc=INT We create hash tables by taking the truncating + the key by b bytes. The + n-b bytes are the key for redis, in the + original (key,value). The + value is a hash table and we acess field + b to get the value. Essentially this makes + redis n-way associative + cache. Only works in redis mode. For small + key sizes we just use + normal method of (key,value) store. No hash + table. (default=`4') + -q, --qps=INT Target aggregate QPS. 0 = peak QPS. (default=`0') -t, --time=INT Maximum time to run (seconds). (default=`5') - -K, --keysize=STRING Length of memcached keys (distribution). + --read_file=STRING Read keys from file. (default=`') + --twitter_trace=INT use twitter memcached trace format from file. + (default=`0') + -K, --keysize=STRING Length of memcached keys (distribution). (default=`30') - -V, --valuesize=STRING Length of memcached values (distribution). + -V, --valuesize=STRING Length of memcached values (distribution). (default=`200') - -r, --records=INT Number of memcached records to use. If - multiple memcached servers are given, this - number is divided by the number of servers. + -r, --records=INT Number of memcached records to use. If + multiple memcached servers are given, this + number is divided by the number of servers. (default=`10000') + -m, --misswindow=INT Window for recording misses, used to find + steady state, no window by default, which + gives us summary stats in total + (default=`0') + -N, --queries=INT Number of queries to make. 0 is unlimited + (default) If multiple memcached servers are + given, this number is divided by the number + of servers. 
(default=`0') -u, --update=FLOAT Ratio of set:get commands. (default=`0.0') Advanced options: @@ -150,34 +190,34 @@ Command-line Options -P, --password=STRING Password to use for SASL authentication. -T, --threads=INT Number of threads to spawn. (default=`1') --affinity Set CPU affinity for threads, round-robin - -c, --connections=INT Connections to establish per server. + -c, --connections=INT Connections to establish per server. (default=`1') - -d, --depth=INT Maximum depth to pipeline requests. + -d, --depth=INT Maximum depth to pipeline requests. (default=`1') - -R, --roundrobin Assign threads to servers in round-robin - fashion. By default, each thread connects to + -R, --roundrobin Assign threads to servers in round-robin + fashion. By default, each thread connects to every server. - -i, --iadist=STRING Inter-arrival distribution (distribution). - Note: The distribution will automatically be - adjusted to match the QPS given by --qps. + -i, --iadist=STRING Inter-arrival distribution (distribution). + Note: The distribution will automatically be + adjusted to match the QPS given by --qps. (default=`exponential') - -S, --skip Skip transmissions if previous requests are - late. This harms the long-term QPS average, - but reduces spikes in QPS after long latency + -S, --skip Skip transmissions if previous requests are + late. This harms the long-term QPS average, + but reduces spikes in QPS after long latency requests. - --moderate Enforce a minimum delay of ~1/lambda between + --moderate Enforce a minimum delay of ~1/lambda between requests. --noload Skip database loading. --loadonly Load database and then exit. -B, --blocking Use blocking epoll(). May increase latency. --no_nodelay Don't use TCP_NODELAY. -w, --warmup=INT Warmup time before starting measurement. - -W, --wait=INT Time to wait after startup to start + -W, --wait=INT Time to wait after startup to start measurement. --save=STRING Record latency samples to given file. 
- --search=N:X Search for the QPS where N-order statistic < - Xus. (i.e. --search 95:1000 means find the - QPS where 95% of requests are faster than + --search=N:X Search for the QPS where N-order statistic < + Xus. (i.e. --search 95:1000 means find the + QPS where 95% of requests are faster than 1000us). --scan=min:max:step Scan latency across QPS rates from min to max. @@ -185,11 +225,11 @@ Command-line Options -A, --agentmode Run client in agent mode. -a, --agent=host Enlist remote agent. -p, --agent_port=STRING Agent port. (default=`5556') - -l, --lambda_mul=INT Lambda multiplier. Increases share of QPS for + -l, --lambda_mul=INT Lambda multiplier. Increases share of QPS for this client. (default=`1') - -C, --measure_connections=INT Master client connections per server, overrides + -C, --measure_connections=INT Master client connections per server, overrides --connections. - -Q, --measure_qps=INT Explicitly set master client QPS, spread across + -Q, --measure_qps=INT Explicitly set master client QPS, spread across threads and connections. -D, --measure_depth=INT Set master client connection depth. 
diff --git a/SConstruct b/SConstruct index 57d0054..f2a4e64 100644 --- a/SConstruct +++ b/SConstruct @@ -6,12 +6,18 @@ env = Environment(ENV = os.environ) env['HAVE_POSIX_BARRIER'] = True -env.Append(CPPPATH = ['/usr/local/include', '/opt/local/include']) -env.Append(LIBPATH = ['/opt/local/lib']) -env.Append(CCFLAGS = '-std=c++11 -D_GNU_SOURCE') -if sys.platform == 'darwin': - env['CC'] = 'clang' - env['CXX'] = 'clang++' +#env['CC'] = 'clang' +#env['CXX'] = 'clang++' + +#env.Append(CPPPATH = ['/u/dbyrne99/local/include', '/usr/include']) +#env.Append(CPATH = ['/u/dbyrne99/local/include', '/usr/include']) +#env.Append(LIBPATH = ['/u/dbyrne99/local/lib', '/lib64/']) + +#env.Append(CFLAGS = '-std=c++11 -D_GNU_SOURCE -static-libsan -fsanitize=address -I/u/dbyrne99/local/include' ) +#env.Append(CCFLAGS = '-std=c++11 -D_GNU_SOURCE -static-libsan -fsanitize=address -I/u/dbyrne99/local/include' ) +#if sys.platform == 'darwin': +#env['CC'] = 'clang' +#env['CXX'] = 'clang++' conf = env.Configure(config_h = "config.h") conf.Define("__STDC_FORMAT_MACROS") @@ -23,13 +29,14 @@ if env.Execute("@which gengetopt &> /dev/null"): print "not found (required)" Exit(1) else: print "found" -if not conf.CheckLibWithHeader("event", "event2/event.h", "C++"): - print "libevent required" - Exit(1) -conf.CheckDeclaration("EVENT_BASE_FLAG_PRECISE_TIMER", '#include ', "C++") +#if not conf.CheckLibWithHeader("event", "event2/event.h", "C++"): +# print "libevent required" +# Exit(1) +#conf.CheckDeclaration("EVENT_BASE_FLAG_PRECISE_TIMER", '#include ', "C++") if not conf.CheckLibWithHeader("pthread", "pthread.h", "C++"): print "pthread required" Exit(1) + conf.CheckLib("rt", "clock_gettime", language="C++") conf.CheckLibWithHeader("zmq", "zmq.hpp", "C++") if not conf.CheckFunc('pthread_barrier_init'): @@ -37,17 +44,33 @@ if not conf.CheckFunc('pthread_barrier_init'): env = conf.Finish() -env.Append(CFLAGS = ' -O3 -Wall -g') -env.Append(CPPFLAGS = ' -O3 -Wall -g') +#env.Append(CFLAGS = '-O0 -Wall -g 
--std=c++17 -lstdc++fs -fsanitize=address') +#env.Append(CPPFLAGS = '-O0 -Wall -g --std=c++17 -lstdc++fs -fsanitize=address') +env.Append(CFLAGS = ' -O2 -Wall -g --std=c++17 -lstdc++fs') +env.Append(CPPFLAGS = ' -O2 -Wall -g --std=c++17 -lstdc++fs') +#env.Append(CFLAGS = ' -O3 -Wall -g') +#env.Append(CPPFLAGS = ' -O3 -Wall -g') +#env.Append(LDFLAGS = '-fsantize=address') +#env.Append(CFLAGS = ' -O3 -Wall -g -fsantize=address') +#env.Append(CPPFLAGS = ' -O3 -Wall -g -fsanitize=address') +#env.Append(CFLAGS = ' -O0 -Wall -g') +#env.Append(CPPFLAGS = ' -O0 -Wall -g') + +#env.Append(CFLAGS = '-g -std=c++11 -D_GNU_SOURCE -static-libsan -fsanitize=address -I/u/dbyrne99/local/include' ) +#env.Append(CCFLAGS = '-g -std=c++11 -D_GNU_SOURCE -static-libsan -fsanitize=address -I/u/dbyrne99/local/include' ) env.Command(['cmdline.cc', 'cmdline.h'], 'cmdline.ggo', 'gengetopt < $SOURCE') src = Split("""mutilate.cc cmdline.cc log.cc distributions.cc util.cc - Connection.cc Protocol.cc Generator.cc""") + Connection.cc ConnectionMulti.cc ConnectionMultiApprox.cc ConnectionMultiApproxBatchShm.cc ConnectionMultiApproxBatch.cc ConnectionMultiApproxShm.cc Protocol.cc Generator.cc bipbuffer.cc""") + +#src = Split("""mutilate.cc cmdline.cc log.cc distributions.cc util.cc +# ConnectionMultiApprox.cc ConnectionMultiApproxBatchShm.cc Generator.cc bipbuffer.cc""") if not env['HAVE_POSIX_BARRIER']: # USE_POSIX_BARRIER: src += ['barrier.cc'] +src += ['libzstd.a', '/u/dbyrne99/local/lib/libevent.a'] env.Program(target='mutilate', source=src) -env.Program(target='gtest', source=['TestGenerator.cc', 'log.cc', 'util.cc', - 'Generator.cc']) +#env.Program(target='gtest', source=['TestGenerator.cc', 'log.cc', 'util.cc', +# 'Generator.cc']) diff --git a/binary_protocol.h b/binary_protocol.h index 2b5ef66..7c59ddf 100644 --- a/binary_protocol.h +++ b/binary_protocol.h @@ -1,31 +1,33 @@ #ifndef BINARY_PROTOCOL_H #define BINARY_PROTOCOL_H +#include + #define CMD_GET 0x00 +#define CMD_GETQ 0x09 +#define 
CMD_TOUCH 0x1c +#define CMD_TOUCH 0x1c +#define CMD_DELETE 0x04 #define CMD_SET 0x01 +#define CMD_NOOP 0x0a +#define CMD_SETQ 0x11 #define CMD_SASL 0x21 #define RESP_OK 0x00 +#define RESP_NOT_FOUND 0x01 #define RESP_SASL_ERR 0x20 -typedef struct __attribute__ ((__packed__)) { +typedef struct { uint8_t magic; uint8_t opcode; uint16_t key_len; - uint8_t extra_len; uint8_t data_type; - union { - uint16_t vbucket; // request use - uint16_t status; // response use - }; - + uint16_t status; // response use uint32_t body_len; uint32_t opaque; - uint64_t version; + uint64_t cas; - // Used for set only. - uint64_t extras; } binary_header_t; #endif /* BINARY_PROTOCOL_H */ diff --git a/bipbuffer.cc b/bipbuffer.cc new file mode 100644 index 0000000..b712617 --- /dev/null +++ b/bipbuffer.cc @@ -0,0 +1,182 @@ +/** + * Copyright (c) 2011, Willem-Hendrik Thiart + * Use of this source code is governed by a BSD-style license that can be + * found in the LICENSE.bipbuffer file. + * + * @file + * @author Willem Thiart himself@willemthiart.com + */ + +//#include "stdio.h" +#include + +/* for memcpy */ +#include + +#include "bipbuffer.h" + +static size_t bipbuf_sizeof(const unsigned int size) +{ + return sizeof(bipbuf_t) + size; +} + +int bipbuf_unused(const bipbuf_t* me) +{ + if (1 == me->b_inuse) + /* distance between region B and region A */ + return me->a_start - me->b_end; + else + return me->size - me->a_end; +} + +int bipbuf_size(const bipbuf_t* me) +{ + return me->size; +} + +int bipbuf_used(const bipbuf_t* me) +{ + return (me->a_end - me->a_start) + me->b_end; +} + +void bipbuf_init(bipbuf_t* me, const unsigned int size) +{ + me->a_start = me->a_end = me->b_end = 0; + me->size = size; + me->b_inuse = 0; +} + +bipbuf_t *bipbuf_new(const unsigned int size) +{ + bipbuf_t *me = (bipbuf_t*)malloc(bipbuf_sizeof(size)); + if (!me) + return NULL; + bipbuf_init(me, size); + return me; +} + +void bipbuf_free(bipbuf_t* me) +{ + free(me); +} + +int bipbuf_is_empty(const bipbuf_t* me) +{ + 
return me->a_start == me->a_end; +} + +/* find out if we should turn on region B + * ie. is the distance from A to buffer's end less than B to A? */ +static void __check_for_switch_to_b(bipbuf_t* me) +{ + if (me->size - me->a_end < me->a_start - me->b_end) { + //fprintf(stderr,"%p switching to b, a_start: %d, a_end: %d, b_end %d\n",me,me->a_start,me->a_end,me->b_end); + me->b_inuse = 1; + } +} + +/* TODO: DOCUMENT THESE TWO FUNCTIONS */ +unsigned char *bipbuf_request(bipbuf_t* me, const int size) +{ + if (bipbuf_unused(me) < size) + return 0; + if (1 == me->b_inuse) + { + return (unsigned char *)me->data + me->b_end; + } + else + { + return (unsigned char *)me->data + me->a_end; + } +} + +int bipbuf_push(bipbuf_t* me, const int size) +{ + if (bipbuf_unused(me) < size) + return 0; + + if (1 == me->b_inuse) + { + me->b_end += size; + } + else + { + me->a_end += size; + } + + __check_for_switch_to_b(me); + return size; +} + +int bipbuf_offer(bipbuf_t* me, const unsigned char *data, const int size) +{ + /* not enough space */ + if (bipbuf_unused(me) < size) + return 0; + + if (1 == me->b_inuse) + { + memcpy(me->data + me->b_end, data, size); + me->b_end += size; + } + else + { + memcpy(me->data + me->a_end, data, size); + me->a_end += size; + } + + __check_for_switch_to_b(me); + return size; +} + +unsigned char *bipbuf_peek(const bipbuf_t* me, const unsigned int size) +{ + /* make sure we can actually peek at this data */ + if (me->size < me->a_start + size) + return NULL; + + if (bipbuf_is_empty(me)) + return NULL; + + return (unsigned char *)me->data + me->a_start; +} + +unsigned char *bipbuf_peek_all(const bipbuf_t* me, unsigned int *size) +{ + if (bipbuf_is_empty(me)) + return NULL; + + *size = me->a_end - me->a_start; + return (unsigned char*)me->data + me->a_start; +} + +unsigned char *bipbuf_poll(bipbuf_t* me, const unsigned int size) +{ + if (bipbuf_is_empty(me)) + return NULL; + + /* make sure we can actually poll this data */ + if (me->size < me->a_start + 
size) + return NULL; + + void *end = me->data + me->a_start; + me->a_start += size; + + /* we seem to be empty.. */ + if (me->a_start == me->a_end) + { + /* replace a with region b */ + if (1 == me->b_inuse) + { + me->a_start = 0; + me->a_end = me->b_end; + me->b_end = me->b_inuse = 0; + } + else + /* safely move cursor back to the start because we are empty */ + me->a_start = me->a_end = 0; + } + + __check_for_switch_to_b(me); + return (unsigned char*) end; +} diff --git a/bipbuffer.h b/bipbuffer.h new file mode 100644 index 0000000..f99f148 --- /dev/null +++ b/bipbuffer.h @@ -0,0 +1,93 @@ +#ifndef BIPBUFFER_H +#define BIPBUFFER_H + +#define BIPBUFSIZE 4*1024*1024 +#include "binary_protocol.h" +#include + +extern "C" { + typedef struct + { + unsigned long int size; + + /* region A */ + unsigned int a_start, a_end; + + /* region B */ + unsigned int b_end; + + /* is B inuse? */ + int b_inuse; + + unsigned char data[BIPBUFSIZE]; + } bipbuf_t; + +/** + * Create a new bip buffer. + * + * malloc()s space + * + * @param[in] size The size of the buffer */ +bipbuf_t *bipbuf_new(const unsigned int size); + +/** + * Initialise a bip buffer. Use memory provided by user. + * + * No malloc()s are performed. + * + * @param[in] size The size of the array */ +void bipbuf_init(bipbuf_t* me, const unsigned int size); + +/** + * Free the bip buffer */ +void bipbuf_free(bipbuf_t *me); + +/* TODO: DOCUMENTATION */ +unsigned char *bipbuf_request(bipbuf_t* me, const int size); +int bipbuf_push(bipbuf_t* me, const int size); + +/** + * @param[in] data The data to be offered to the buffer + * @param[in] size The size of the data to be offered + * @return number of bytes offered */ +int bipbuf_offer(bipbuf_t *me, const unsigned char *data, const int size); + +/** + * Look at data. 
Don't move cursor + * + * @param[in] len The length of the data to be peeked + * @return data on success, NULL if we can't peek at this much data */ +unsigned char *bipbuf_peek(const bipbuf_t* me, const unsigned int len); + +/** + * Look at data. Don't move cursor + * + * @param[in] len The length of the data returned + * @return data on success, NULL if nothing available */ +unsigned char *bipbuf_peek_all(const bipbuf_t* me, unsigned int *len); + +/** + * Get pointer to data to read. Move the cursor on. + * + * @param[in] len The length of the data to be polled + * @return pointer to data, NULL if we can't poll this much data */ +unsigned char *bipbuf_poll(bipbuf_t* me, const unsigned int size); + +/** + * @return the size of the bipbuffer */ +int bipbuf_size(const bipbuf_t* me); + +/** + * @return 1 if buffer is empty; 0 otherwise */ +int bipbuf_is_empty(const bipbuf_t* me); + +/** + * @return how much space we have assigned */ +int bipbuf_used(const bipbuf_t* cb); + +/** + * @return bytes of unused space */ +int bipbuf_unused(const bipbuf_t* me); + +} +#endif /* BIPBUFFER_H */ diff --git a/blockingconcurrentqueue.h b/blockingconcurrentqueue.h new file mode 100644 index 0000000..66579b6 --- /dev/null +++ b/blockingconcurrentqueue.h @@ -0,0 +1,582 @@ +// Provides an efficient blocking version of moodycamel::ConcurrentQueue. +// ©2015-2020 Cameron Desrochers. Distributed under the terms of the simplified +// BSD license, available at the top of concurrentqueue.h. +// Also dual-licensed under the Boost Software License (see LICENSE.md) +// Uses Jeff Preshing's semaphore implementation (under the terms of its +// separate zlib license, see lightweightsemaphore.h). + +#pragma once + +#include "concurrentqueue.h" +#include "lightweightsemaphore.h" + +#include +#include +#include +#include +#include + +namespace moodycamel +{ +// This is a blocking version of the queue. 
It has an almost identical interface to +// the normal non-blocking version, with the addition of various wait_dequeue() methods +// and the removal of producer-specific dequeue methods. +template +class BlockingConcurrentQueue +{ +private: + typedef ::moodycamel::ConcurrentQueue ConcurrentQueue; + typedef ::moodycamel::LightweightSemaphore LightweightSemaphore; + +public: + typedef typename ConcurrentQueue::producer_token_t producer_token_t; + typedef typename ConcurrentQueue::consumer_token_t consumer_token_t; + + typedef typename ConcurrentQueue::index_t index_t; + typedef typename ConcurrentQueue::size_t size_t; + typedef typename std::make_signed::type ssize_t; + + static const size_t BLOCK_SIZE = ConcurrentQueue::BLOCK_SIZE; + static const size_t EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD = ConcurrentQueue::EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD; + static const size_t EXPLICIT_INITIAL_INDEX_SIZE = ConcurrentQueue::EXPLICIT_INITIAL_INDEX_SIZE; + static const size_t IMPLICIT_INITIAL_INDEX_SIZE = ConcurrentQueue::IMPLICIT_INITIAL_INDEX_SIZE; + static const size_t INITIAL_IMPLICIT_PRODUCER_HASH_SIZE = ConcurrentQueue::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE; + static const std::uint32_t EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE = ConcurrentQueue::EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE; + static const size_t MAX_SUBQUEUE_SIZE = ConcurrentQueue::MAX_SUBQUEUE_SIZE; + +public: + // Creates a queue with at least `capacity` element slots; note that the + // actual number of elements that can be inserted without additional memory + // allocation depends on the number of producers and the block size (e.g. if + // the block size is equal to `capacity`, only a single block will be allocated + // up-front, which means only a single producer will be able to enqueue elements + // without an extra allocation -- blocks aren't shared between producers). 
+ // This method is not thread safe -- it is up to the user to ensure that the + // queue is fully constructed before it starts being used by other threads (this + // includes making the memory effects of construction visible, possibly with a + // memory barrier). + explicit BlockingConcurrentQueue(size_t capacity = 6 * BLOCK_SIZE) + : inner(capacity), sema(create(0, (int)Traits::MAX_SEMA_SPINS), &BlockingConcurrentQueue::template destroy) + { + assert(reinterpret_cast((BlockingConcurrentQueue*)1) == &((BlockingConcurrentQueue*)1)->inner && "BlockingConcurrentQueue must have ConcurrentQueue as its first member"); + if (!sema) { + MOODYCAMEL_THROW(std::bad_alloc()); + } + } + + BlockingConcurrentQueue(size_t minCapacity, size_t maxExplicitProducers, size_t maxImplicitProducers) + : inner(minCapacity, maxExplicitProducers, maxImplicitProducers), sema(create(0, (int)Traits::MAX_SEMA_SPINS), &BlockingConcurrentQueue::template destroy) + { + assert(reinterpret_cast((BlockingConcurrentQueue*)1) == &((BlockingConcurrentQueue*)1)->inner && "BlockingConcurrentQueue must have ConcurrentQueue as its first member"); + if (!sema) { + MOODYCAMEL_THROW(std::bad_alloc()); + } + } + + // Disable copying and copy assignment + BlockingConcurrentQueue(BlockingConcurrentQueue const&) MOODYCAMEL_DELETE_FUNCTION; + BlockingConcurrentQueue& operator=(BlockingConcurrentQueue const&) MOODYCAMEL_DELETE_FUNCTION; + + // Moving is supported, but note that it is *not* a thread-safe operation. + // Nobody can use the queue while it's being moved, and the memory effects + // of that move must be propagated to other threads before they can use it. + // Note: When a queue is moved, its tokens are still valid but can only be + // used with the destination queue (i.e. semantically they are moved along + // with the queue itself). 
+ BlockingConcurrentQueue(BlockingConcurrentQueue&& other) MOODYCAMEL_NOEXCEPT + : inner(std::move(other.inner)), sema(std::move(other.sema)) + { } + + inline BlockingConcurrentQueue& operator=(BlockingConcurrentQueue&& other) MOODYCAMEL_NOEXCEPT + { + return swap_internal(other); + } + + // Swaps this queue's state with the other's. Not thread-safe. + // Swapping two queues does not invalidate their tokens, however + // the tokens that were created for one queue must be used with + // only the swapped queue (i.e. the tokens are tied to the + // queue's movable state, not the object itself). + inline void swap(BlockingConcurrentQueue& other) MOODYCAMEL_NOEXCEPT + { + swap_internal(other); + } + +private: + BlockingConcurrentQueue& swap_internal(BlockingConcurrentQueue& other) + { + if (this == &other) { + return *this; + } + + inner.swap(other.inner); + sema.swap(other.sema); + return *this; + } + +public: + // Enqueues a single item (by copying it). + // Allocates memory if required. Only fails if memory allocation fails (or implicit + // production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0, + // or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). + // Thread-safe. + inline bool enqueue(T const& item) + { + if ((details::likely)(inner.enqueue(item))) { + sema->signal(); + return true; + } + return false; + } + + // Enqueues a single item (by moving it, if possible). + // Allocates memory if required. Only fails if memory allocation fails (or implicit + // production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0, + // or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). + // Thread-safe. + inline bool enqueue(T&& item) + { + if ((details::likely)(inner.enqueue(std::move(item)))) { + sema->signal(); + return true; + } + return false; + } + + // Enqueues a single item (by copying it) using an explicit producer token. + // Allocates memory if required. 
Only fails if memory allocation fails (or + // Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). + // Thread-safe. + inline bool enqueue(producer_token_t const& token, T const& item) + { + if ((details::likely)(inner.enqueue(token, item))) { + sema->signal(); + return true; + } + return false; + } + + // Enqueues a single item (by moving it, if possible) using an explicit producer token. + // Allocates memory if required. Only fails if memory allocation fails (or + // Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). + // Thread-safe. + inline bool enqueue(producer_token_t const& token, T&& item) + { + if ((details::likely)(inner.enqueue(token, std::move(item)))) { + sema->signal(); + return true; + } + return false; + } + + // Enqueues several items. + // Allocates memory if required. Only fails if memory allocation fails (or + // implicit production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE + // is 0, or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). + // Note: Use std::make_move_iterator if the elements should be moved instead of copied. + // Thread-safe. + template + inline bool enqueue_bulk(It itemFirst, size_t count) + { + if ((details::likely)(inner.enqueue_bulk(std::forward(itemFirst), count))) { + sema->signal((LightweightSemaphore::ssize_t)(ssize_t)count); + return true; + } + return false; + } + + // Enqueues several items using an explicit producer token. + // Allocates memory if required. Only fails if memory allocation fails + // (or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). + // Note: Use std::make_move_iterator if the elements should be moved + // instead of copied. + // Thread-safe. 
+ template + inline bool enqueue_bulk(producer_token_t const& token, It itemFirst, size_t count) + { + if ((details::likely)(inner.enqueue_bulk(token, std::forward(itemFirst), count))) { + sema->signal((LightweightSemaphore::ssize_t)(ssize_t)count); + return true; + } + return false; + } + + // Enqueues a single item (by copying it). + // Does not allocate memory. Fails if not enough room to enqueue (or implicit + // production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE + // is 0). + // Thread-safe. + inline bool try_enqueue(T const& item) + { + if (inner.try_enqueue(item)) { + sema->signal(); + return true; + } + return false; + } + + // Enqueues a single item (by moving it, if possible). + // Does not allocate memory (except for one-time implicit producer). + // Fails if not enough room to enqueue (or implicit production is + // disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0). + // Thread-safe. + inline bool try_enqueue(T&& item) + { + if (inner.try_enqueue(std::move(item))) { + sema->signal(); + return true; + } + return false; + } + + // Enqueues a single item (by copying it) using an explicit producer token. + // Does not allocate memory. Fails if not enough room to enqueue. + // Thread-safe. + inline bool try_enqueue(producer_token_t const& token, T const& item) + { + if (inner.try_enqueue(token, item)) { + sema->signal(); + return true; + } + return false; + } + + // Enqueues a single item (by moving it, if possible) using an explicit producer token. + // Does not allocate memory. Fails if not enough room to enqueue. + // Thread-safe. + inline bool try_enqueue(producer_token_t const& token, T&& item) + { + if (inner.try_enqueue(token, std::move(item))) { + sema->signal(); + return true; + } + return false; + } + + // Enqueues several items. + // Does not allocate memory (except for one-time implicit producer). 
+ // Fails if not enough room to enqueue (or implicit production is + // disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0). + // Note: Use std::make_move_iterator if the elements should be moved + // instead of copied. + // Thread-safe. + template + inline bool try_enqueue_bulk(It itemFirst, size_t count) + { + if (inner.try_enqueue_bulk(std::forward(itemFirst), count)) { + sema->signal((LightweightSemaphore::ssize_t)(ssize_t)count); + return true; + } + return false; + } + + // Enqueues several items using an explicit producer token. + // Does not allocate memory. Fails if not enough room to enqueue. + // Note: Use std::make_move_iterator if the elements should be moved + // instead of copied. + // Thread-safe. + template + inline bool try_enqueue_bulk(producer_token_t const& token, It itemFirst, size_t count) + { + if (inner.try_enqueue_bulk(token, std::forward(itemFirst), count)) { + sema->signal((LightweightSemaphore::ssize_t)(ssize_t)count); + return true; + } + return false; + } + + + // Attempts to dequeue from the queue. + // Returns false if all producer streams appeared empty at the time they + // were checked (so, the queue is likely but not guaranteed to be empty). + // Never allocates. Thread-safe. + template + inline bool try_dequeue(U& item) + { + if (sema->tryWait()) { + while (!inner.try_dequeue(item)) { + continue; + } + return true; + } + return false; + } + + // Attempts to dequeue from the queue using an explicit consumer token. + // Returns false if all producer streams appeared empty at the time they + // were checked (so, the queue is likely but not guaranteed to be empty). + // Never allocates. Thread-safe. + template + inline bool try_dequeue(consumer_token_t& token, U& item) + { + if (sema->tryWait()) { + while (!inner.try_dequeue(token, item)) { + continue; + } + return true; + } + return false; + } + + // Attempts to dequeue several elements from the queue. + // Returns the number of items actually dequeued. 
+ // Returns 0 if all producer streams appeared empty at the time they + // were checked (so, the queue is likely but not guaranteed to be empty). + // Never allocates. Thread-safe. + template + inline size_t try_dequeue_bulk(It itemFirst, size_t max) + { + size_t count = 0; + max = (size_t)sema->tryWaitMany((LightweightSemaphore::ssize_t)(ssize_t)max); + while (count != max) { + count += inner.template try_dequeue_bulk(itemFirst, max - count); + } + return count; + } + + // Attempts to dequeue several elements from the queue using an explicit consumer token. + // Returns the number of items actually dequeued. + // Returns 0 if all producer streams appeared empty at the time they + // were checked (so, the queue is likely but not guaranteed to be empty). + // Never allocates. Thread-safe. + template + inline size_t try_dequeue_bulk(consumer_token_t& token, It itemFirst, size_t max) + { + size_t count = 0; + max = (size_t)sema->tryWaitMany((LightweightSemaphore::ssize_t)(ssize_t)max); + while (count != max) { + count += inner.template try_dequeue_bulk(token, itemFirst, max - count); + } + return count; + } + + + + // Blocks the current thread until there's something to dequeue, then + // dequeues it. + // Never allocates. Thread-safe. + template + inline void wait_dequeue(U& item) + { + while (!sema->wait()) { + continue; + } + while (!inner.try_dequeue(item)) { + continue; + } + } + + // Blocks the current thread until either there's something to dequeue + // or the timeout (specified in microseconds) expires. Returns false + // without setting `item` if the timeout expires, otherwise assigns + // to `item` and returns true. + // Using a negative timeout indicates an indefinite timeout, + // and is thus functionally equivalent to calling wait_dequeue. + // Never allocates. Thread-safe. 
+ template + inline bool wait_dequeue_timed(U& item, std::int64_t timeout_usecs) + { + if (!sema->wait(timeout_usecs)) { + return false; + } + while (!inner.try_dequeue(item)) { + continue; + } + return true; + } + + // Blocks the current thread until either there's something to dequeue + // or the timeout expires. Returns false without setting `item` if the + // timeout expires, otherwise assigns to `item` and returns true. + // Never allocates. Thread-safe. + template + inline bool wait_dequeue_timed(U& item, std::chrono::duration const& timeout) + { + return wait_dequeue_timed(item, std::chrono::duration_cast(timeout).count()); + } + + // Blocks the current thread until there's something to dequeue, then + // dequeues it using an explicit consumer token. + // Never allocates. Thread-safe. + template + inline void wait_dequeue(consumer_token_t& token, U& item) + { + while (!sema->wait()) { + continue; + } + while (!inner.try_dequeue(token, item)) { + continue; + } + } + + // Blocks the current thread until either there's something to dequeue + // or the timeout (specified in microseconds) expires. Returns false + // without setting `item` if the timeout expires, otherwise assigns + // to `item` and returns true. + // Using a negative timeout indicates an indefinite timeout, + // and is thus functionally equivalent to calling wait_dequeue. + // Never allocates. Thread-safe. + template + inline bool wait_dequeue_timed(consumer_token_t& token, U& item, std::int64_t timeout_usecs) + { + if (!sema->wait(timeout_usecs)) { + return false; + } + while (!inner.try_dequeue(token, item)) { + continue; + } + return true; + } + + // Blocks the current thread until either there's something to dequeue + // or the timeout expires. Returns false without setting `item` if the + // timeout expires, otherwise assigns to `item` and returns true. + // Never allocates. Thread-safe. 
+ template + inline bool wait_dequeue_timed(consumer_token_t& token, U& item, std::chrono::duration const& timeout) + { + return wait_dequeue_timed(token, item, std::chrono::duration_cast(timeout).count()); + } + + // Attempts to dequeue several elements from the queue. + // Returns the number of items actually dequeued, which will + // always be at least one (this method blocks until the queue + // is non-empty) and at most max. + // Never allocates. Thread-safe. + template + inline size_t wait_dequeue_bulk(It itemFirst, size_t max) + { + size_t count = 0; + max = (size_t)sema->waitMany((LightweightSemaphore::ssize_t)(ssize_t)max); + while (count != max) { + count += inner.template try_dequeue_bulk(itemFirst, max - count); + } + return count; + } + + // Attempts to dequeue several elements from the queue. + // Returns the number of items actually dequeued, which can + // be 0 if the timeout expires while waiting for elements, + // and at most max. + // Using a negative timeout indicates an indefinite timeout, + // and is thus functionally equivalent to calling wait_dequeue_bulk. + // Never allocates. Thread-safe. + template + inline size_t wait_dequeue_bulk_timed(It itemFirst, size_t max, std::int64_t timeout_usecs) + { + size_t count = 0; + max = (size_t)sema->waitMany((LightweightSemaphore::ssize_t)(ssize_t)max, timeout_usecs); + while (count != max) { + count += inner.template try_dequeue_bulk(itemFirst, max - count); + } + return count; + } + + // Attempts to dequeue several elements from the queue. + // Returns the number of items actually dequeued, which can + // be 0 if the timeout expires while waiting for elements, + // and at most max. + // Never allocates. Thread-safe. 
+ template + inline size_t wait_dequeue_bulk_timed(It itemFirst, size_t max, std::chrono::duration const& timeout) + { + return wait_dequeue_bulk_timed(itemFirst, max, std::chrono::duration_cast(timeout).count()); + } + + // Attempts to dequeue several elements from the queue using an explicit consumer token. + // Returns the number of items actually dequeued, which will + // always be at least one (this method blocks until the queue + // is non-empty) and at most max. + // Never allocates. Thread-safe. + template + inline size_t wait_dequeue_bulk(consumer_token_t& token, It itemFirst, size_t max) + { + size_t count = 0; + max = (size_t)sema->waitMany((LightweightSemaphore::ssize_t)(ssize_t)max); + while (count != max) { + count += inner.template try_dequeue_bulk(token, itemFirst, max - count); + } + return count; + } + + // Attempts to dequeue several elements from the queue using an explicit consumer token. + // Returns the number of items actually dequeued, which can + // be 0 if the timeout expires while waiting for elements, + // and at most max. + // Using a negative timeout indicates an indefinite timeout, + // and is thus functionally equivalent to calling wait_dequeue_bulk. + // Never allocates. Thread-safe. + template + inline size_t wait_dequeue_bulk_timed(consumer_token_t& token, It itemFirst, size_t max, std::int64_t timeout_usecs) + { + size_t count = 0; + max = (size_t)sema->waitMany((LightweightSemaphore::ssize_t)(ssize_t)max, timeout_usecs); + while (count != max) { + count += inner.template try_dequeue_bulk(token, itemFirst, max - count); + } + return count; + } + + // Attempts to dequeue several elements from the queue using an explicit consumer token. + // Returns the number of items actually dequeued, which can + // be 0 if the timeout expires while waiting for elements, + // and at most max. + // Never allocates. Thread-safe. 
+ template + inline size_t wait_dequeue_bulk_timed(consumer_token_t& token, It itemFirst, size_t max, std::chrono::duration const& timeout) + { + return wait_dequeue_bulk_timed(token, itemFirst, max, std::chrono::duration_cast(timeout).count()); + } + + + // Returns an estimate of the total number of elements currently in the queue. This + // estimate is only accurate if the queue has completely stabilized before it is called + // (i.e. all enqueue and dequeue operations have completed and their memory effects are + // visible on the calling thread, and no further operations start while this method is + // being called). + // Thread-safe. + inline size_t size_approx() const + { + return (size_t)sema->availableApprox(); + } + + + // Returns true if the underlying atomic variables used by + // the queue are lock-free (they should be on most platforms). + // Thread-safe. + static bool is_lock_free() + { + return ConcurrentQueue::is_lock_free(); + } + + +private: + template + static inline U* create(A1&& a1, A2&& a2) + { + void* p = (Traits::malloc)(sizeof(U)); + return p != nullptr ? new (p) U(std::forward(a1), std::forward(a2)) : nullptr; + } + + template + static inline void destroy(U* p) + { + if (p != nullptr) { + p->~U(); + } + (Traits::free)(p); + } + +private: + ConcurrentQueue inner; + std::unique_ptr sema; +}; + + +template +inline void swap(BlockingConcurrentQueue& a, BlockingConcurrentQueue& b) MOODYCAMEL_NOEXCEPT +{ + a.swap(b); +} + +} // end namespace moodycamel diff --git a/cmdline.ggo b/cmdline.ggo index 331dd21..73e4b34 100644 --- a/cmdline.ggo +++ b/cmdline.ggo @@ -10,11 +10,45 @@ option "quiet" - "Disable log messages." text "\nBasic options:" +option "use_shm" - "use shared memory" +option "use_shm_batch" - "use shared memory BATCHED" +option "ratelimit" - "limit conns from exceeding each other in requests" +option "v1callback" - "use v1 readcallbacks" option "server" s "Memcached server hostname[:port]. \ Repeat to specify multiple servers." 
string multiple +option "unix_socket" - "Use UNIX socket instead of TCP." +option "approx" - "approximate two level caching with inclusive/exclusive" +option "approx_batch" - "approximate two level caching with inclusive/exclusive and batching of reqs" +option "inclusives" - "give a list of 1 == inclusive, 2 == exclusives for each class" string default="" option "binary" - "Use binary memcached protocol instead of ASCII." +option "redis" - "Use Redis RESP protocol instead of memchached." +option "getset" - "Use getset mode, in getset mode we first issue \ +a GET and if the response is MISS, then issue a SET for on that +key following distribution value." +option "getsetorset" - "Use getset mode and allow for direct writes (with optype == 2)." +option "successful" - "Only record latency and throughput stats for successful queries" +option "prefix" - "Prefix all keys with a string (helps with multi-tennant eval)" string +option "delete90" - "Delete 90 percent of keys after halfway through \ + the workload, used to model Rumbel et. al. USENIX \ + FAST '14 workloads. MUST BE IN GETSET MODE and + have a set number of queries" + +option "assoc" - "We create hash tables by taking the truncating the \ + key by b bytes. The n-b bytes are the key for redis, in the original \ + (key,value). The value is a hash table and we acess field \ + b to get the value. Essentially this makes redis n-way \ + associative cache. Only works in redis mode. For small key \ + sizes we just use normal method of (key,value) store. No hash table." int default="4" option "qps" q "Target aggregate QPS. 0 = peak QPS." int default="0" option "time" t "Maximum time to run (seconds)." 
int default="5" +option "apps" - "Number of apps, should eqaul total conns" int default="1" +option "rand_admit" - "random admission to nvm" int default="0" +option "wb_all" - "all admission to nvm" int default="1" +option "threshold" - "admission to nvm if in top n" int default="0" +option "miss_through" - "All sets are considered dirty, expect for miss driven sets" + +option "read_file" - "Read keys from file." string default="" +option "twitter_trace" - "use twitter memcached trace format from file." int default="0" option "keysize" K "Length of memcached keys (distribution)." string default="30" @@ -25,6 +59,14 @@ option "records" r "Number of memcached records to use. \ If multiple memcached servers are given, this number is divided \ by the number of servers." int default="10000" +option "misswindow" m "Window for recording misses, used to find \ + steady state, no window by default, which \ + gives us summary stats in total" int default="0" + +option "queries" N "Number of queries to make. 0 is unlimited (default) \ +If multiple memcached servers are given, this number is divided \ +by the number of servers." int default="0" + option "update" u "Ratio of set:get commands." float default="0.0" text "\nAdvanced options:" diff --git a/common.h b/common.h new file mode 100644 index 0000000..8d21e69 --- /dev/null +++ b/common.h @@ -0,0 +1,236 @@ +/* + * Copyright (c) 2016-2021, Yann Collet, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. + */ + +/* + * This header file has common utility functions used in examples. + */ +#ifndef COMMON_H +#define COMMON_H + +#include // malloc, free, exit +#include // fprintf, perror, fopen, etc. 
+#include // strerror +#include // errno +#include // stat +#include "zstd.h" + +/* + * Define the returned error code from utility functions. + */ +typedef enum { + ERROR_fsize = 1, + ERROR_fopen = 2, + ERROR_fclose = 3, + ERROR_fread = 4, + ERROR_fwrite = 5, + ERROR_loadFile = 6, + ERROR_saveFile = 7, + ERROR_malloc = 8, + ERROR_largeFile = 9, +} COMMON_ErrorCode; + +/*! CHECK + * Check that the condition holds. If it doesn't print a message and die. + */ +#define CHECK(cond, ...) \ + do { \ + if (!(cond)) { \ + fprintf(stderr, \ + "%s:%d CHECK(%s) failed: ", \ + __FILE__, \ + __LINE__, \ + #cond); \ + fprintf(stderr, "" __VA_ARGS__); \ + fprintf(stderr, "\n"); \ + exit(1); \ + } \ + } while (0) + +/*! CHECK_ZSTD + * Check the zstd error code and die if an error occurred after printing a + * message. + */ +#define CHECK_ZSTD(fn, ...) \ + do { \ + size_t const err = (fn); \ + CHECK(!ZSTD_isError(err), "%s", ZSTD_getErrorName(err)); \ + } while (0) + +/*! fsize_orDie() : + * Get the size of a given file path. + * + * @return The size of a given file path. + * +static size_t fsize_orDie(const char *filename) +{ + struct stat st; + if (stat(filename, &st) != 0) { + perror(filename); + exit(ERROR_fsize); + } + + off_t const fileSize = st.st_size; + size_t const size = (size_t)fileSize; + * 1. fileSize should be non-negative, + * 2. if off_t -> size_t type conversion results in discrepancy, + * the file size is too large for type size_t. + * + if ((fileSize < 0) || (fileSize != (off_t)size)) { + fprintf(stderr, "%s : filesize too large \n", filename); + exit(ERROR_largeFile); + } + return size; +} +*/ + +/*! fopen_orDie() : + * Open a file using given file path and open option. + * + * @return If successful this function will return a FILE pointer to an + * opened file otherwise it sends an error to stderr and exits. 
+ */ +static FILE* fopen_orDie(const char *filename, const char *instruction) +{ + FILE* const inFile = fopen(filename, instruction); + if (inFile) return inFile; + /* error */ + perror(filename); + exit(ERROR_fopen); +} + +/*! fclose_orDie() : + * Close an opened file using given FILE pointer. + */ +static void fclose_orDie(FILE* file) +{ + if (!fclose(file)) { return; }; + /* error */ + perror("fclose"); + exit(ERROR_fclose); +} + +/*! fread_orDie() : + * + * Read sizeToRead bytes from a given file, storing them at the + * location given by buffer. + * + * @return The number of bytes read. + */ +static size_t fread_orDie(void* buffer, size_t sizeToRead, FILE* file) +{ + size_t const readSize = fread(buffer, 1, sizeToRead, file); + if (readSize == sizeToRead) return readSize; /* good */ + if (feof(file)) return readSize; /* good, reached end of file */ + /* error */ + perror("fread"); + exit(ERROR_fread); +} + +/*! fwrite_orDie() : + * + * Write sizeToWrite bytes to a file pointed to by file, obtaining + * them from a location given by buffer. + * + * Note: This function will send an error to stderr and exit if it + * cannot write data to the given file pointer. + * + * @return The number of bytes written. + */ +//static size_t fwrite_orDie(const void* buffer, size_t sizeToWrite, FILE* file) +//{ +// size_t const writtenSize = fwrite(buffer, 1, sizeToWrite, file); +// if (writtenSize == sizeToWrite) return sizeToWrite; /* good */ +// /* error */ +// perror("fwrite"); +// exit(ERROR_fwrite); +//} + +/*! malloc_orDie() : + * Allocate memory. + * + * @return If successful this function returns a pointer to allo- + * cated memory. If there is an error, this function will send that + * error to stderr and exit. + */ +static void* malloc_orDie(size_t size) +{ + void* const buff = malloc(size); + if (buff) return buff; + /* error */ + perror("malloc"); + exit(ERROR_malloc); +} + +/*! loadFile_orDie() : + * load file into buffer (memory). 
+ * + * Note: This function will send an error to stderr and exit if it + * cannot read data from the given file path. + * + * @return If successful this function will load file into buffer and + * return file size, otherwise it will printout an error to stderr and exit. + * +static size_t loadFile_orDie(const char* fileName, void* buffer, size_t bufferSize) +{ + size_t const fileSize = fsize_orDie(fileName); + CHECK(fileSize <= bufferSize, "File too large!"); + + FILE* const inFile = fopen_orDie(fileName, "rb"); + size_t const readSize = fread(buffer, 1, fileSize, inFile); + if (readSize != (size_t)fileSize) { + fprintf(stderr, "fread: %s : %s \n", fileName, strerror(errno)); + exit(ERROR_fread); + } + fclose(inFile); + return fileSize; +} +*/ + +/*! mallocAndLoadFile_orDie() : + * allocate memory buffer and then load file into it. + * + * Note: This function will send an error to stderr and exit if memory allocation + * fails or it cannot read data from the given file path. + * + * @return If successful this function will return buffer and bufferSize(=fileSize), + * otherwise it will printout an error to stderr and exit. + * +static void* mallocAndLoadFile_orDie(const char* fileName, size_t* bufferSize) { + size_t const fileSize = fsize_orDie(fileName); + *bufferSize = fileSize; + void* const buffer = malloc_orDie(*bufferSize); + loadFile_orDie(fileName, buffer, *bufferSize); + return buffer; +} +*/ + +/*! saveFile_orDie() : + * + * Save buffSize bytes to a given file path, obtaining them from a location pointed + * to by buff. + * + * Note: This function will send an error to stderr and exit if it + * cannot write to a given file. 
+ */ +//static void saveFile_orDie(const char* fileName, const void* buff, size_t buffSize) +//{ +// FILE* const oFile = fopen_orDie(fileName, "wb"); +// size_t const wSize = fwrite(buff, 1, buffSize, oFile); +// if (wSize != (size_t)buffSize) { +// fprintf(stderr, "fwrite: %s : %s \n", fileName, strerror(errno)); +// exit(ERROR_fwrite); +// } +// if (fclose(oFile)) { +// perror(fileName); +// exit(ERROR_fclose); +// } +//} + +#endif diff --git a/concurrentqueue.h b/concurrentqueue.h new file mode 100644 index 0000000..b38d218 --- /dev/null +++ b/concurrentqueue.h @@ -0,0 +1,3742 @@ +// Provides a C++11 implementation of a multi-producer, multi-consumer lock-free queue. +// An overview, including benchmark results, is provided here: +// http://moodycamel.com/blog/2014/a-fast-general-purpose-lock-free-queue-for-c++ +// The full design is also described in excruciating detail at: +// http://moodycamel.com/blog/2014/detailed-design-of-a-lock-free-queue + +// Simplified BSD license: +// Copyright (c) 2013-2020, Cameron Desrochers. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// - Redistributions of source code must retain the above copyright notice, this list of +// conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, this list of +// conditions and the following disclaimer in the documentation and/or other materials +// provided with the distribution. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL +// THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT +// OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR +// TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +// EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// Also dual-licensed under the Boost Software License (see LICENSE.md) + +#pragma once + +#if defined(__GNUC__) && !defined(__INTEL_COMPILER) +// Disable -Wconversion warnings (spuriously triggered when Traits::size_t and +// Traits::index_t are set to < 32 bits, causing integer promotion, causing warnings +// upon assigning any computed values) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wconversion" + +#ifdef MCDBGQ_USE_RELACY +#pragma GCC diagnostic ignored "-Wint-to-pointer-cast" +#endif +#endif + +#if defined(_MSC_VER) && (!defined(_HAS_CXX17) || !_HAS_CXX17) +// VS2019 with /W4 warns about constant conditional expressions but unless /std=c++17 or higher +// does not support `if constexpr`, so we have no choice but to simply disable the warning +#pragma warning(push) +#pragma warning(disable: 4127) // conditional expression is constant +#endif + +#if defined(__APPLE__) +#include "TargetConditionals.h" +#endif + +#ifdef MCDBGQ_USE_RELACY +#include "relacy/relacy_std.hpp" +#include "relacy_shims.h" +// We only use malloc/free anyway, and the delete macro messes up `= delete` method declarations. +// We'll override the default trait malloc ourselves without a macro. +#undef new +#undef delete +#undef malloc +#undef free +#else +#include // Requires C++11. Sorry VS2010. 
+#include +#endif +#include // for max_align_t +#include +#include +#include +#include +#include +#include +#include // for CHAR_BIT +#include +#include // partly for __WINPTHREADS_VERSION if on MinGW-w64 w/ POSIX threading + +// Platform-specific definitions of a numeric thread ID type and an invalid value +namespace moodycamel { namespace details { + template struct thread_id_converter { + typedef thread_id_t thread_id_numeric_size_t; + typedef thread_id_t thread_id_hash_t; + static thread_id_hash_t prehash(thread_id_t const& x) { return x; } + }; +} } +#if defined(MCDBGQ_USE_RELACY) +namespace moodycamel { namespace details { + typedef std::uint32_t thread_id_t; + static const thread_id_t invalid_thread_id = 0xFFFFFFFFU; + static const thread_id_t invalid_thread_id2 = 0xFFFFFFFEU; + static inline thread_id_t thread_id() { return rl::thread_index(); } +} } +#elif defined(_WIN32) || defined(__WINDOWS__) || defined(__WIN32__) +// No sense pulling in windows.h in a header, we'll manually declare the function +// we use and rely on backwards-compatibility for this not to break +extern "C" __declspec(dllimport) unsigned long __stdcall GetCurrentThreadId(void); +namespace moodycamel { namespace details { + static_assert(sizeof(unsigned long) == sizeof(std::uint32_t), "Expected size of unsigned long to be 32 bits on Windows"); + typedef std::uint32_t thread_id_t; + static const thread_id_t invalid_thread_id = 0; // See http://blogs.msdn.com/b/oldnewthing/archive/2004/02/23/78395.aspx + static const thread_id_t invalid_thread_id2 = 0xFFFFFFFFU; // Not technically guaranteed to be invalid, but is never used in practice. Note that all Win32 thread IDs are presently multiples of 4. 
+ static inline thread_id_t thread_id() { return static_cast(::GetCurrentThreadId()); } +} } +#elif defined(__arm__) || defined(_M_ARM) || defined(__aarch64__) || (defined(__APPLE__) && TARGET_OS_IPHONE) +namespace moodycamel { namespace details { + static_assert(sizeof(std::thread::id) == 4 || sizeof(std::thread::id) == 8, "std::thread::id is expected to be either 4 or 8 bytes"); + + typedef std::thread::id thread_id_t; + static const thread_id_t invalid_thread_id; // Default ctor creates invalid ID + + // Note we don't define a invalid_thread_id2 since std::thread::id doesn't have one; it's + // only used if MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED is defined anyway, which it won't + // be. + static inline thread_id_t thread_id() { return std::this_thread::get_id(); } + + template struct thread_id_size { }; + template<> struct thread_id_size<4> { typedef std::uint32_t numeric_t; }; + template<> struct thread_id_size<8> { typedef std::uint64_t numeric_t; }; + + template<> struct thread_id_converter { + typedef thread_id_size::numeric_t thread_id_numeric_size_t; +#ifndef __APPLE__ + typedef std::size_t thread_id_hash_t; +#else + typedef thread_id_numeric_size_t thread_id_hash_t; +#endif + + static thread_id_hash_t prehash(thread_id_t const& x) + { +#ifndef __APPLE__ + return std::hash()(x); +#else + return *reinterpret_cast(&x); +#endif + } + }; +} } +#else +// Use a nice trick from this answer: http://stackoverflow.com/a/8438730/21475 +// In order to get a numeric thread ID in a platform-independent way, we use a thread-local +// static variable's address as a thread identifier :-) +#if defined(__GNUC__) || defined(__INTEL_COMPILER) +#define MOODYCAMEL_THREADLOCAL __thread +#elif defined(_MSC_VER) +#define MOODYCAMEL_THREADLOCAL __declspec(thread) +#else +// Assume C++11 compliant compiler +#define MOODYCAMEL_THREADLOCAL thread_local +#endif +namespace moodycamel { namespace details { + typedef std::uintptr_t thread_id_t; + static const thread_id_t 
invalid_thread_id = 0; // Address can't be nullptr + static const thread_id_t invalid_thread_id2 = 1; // Member accesses off a null pointer are also generally invalid. Plus it's not aligned. + inline thread_id_t thread_id() { static MOODYCAMEL_THREADLOCAL int x; return reinterpret_cast(&x); } +} } +#endif + +// Constexpr if +#ifndef MOODYCAMEL_CONSTEXPR_IF +#if (defined(_MSC_VER) && defined(_HAS_CXX17) && _HAS_CXX17) || __cplusplus > 201402L +#define MOODYCAMEL_CONSTEXPR_IF if constexpr +#define MOODYCAMEL_MAYBE_UNUSED [[maybe_unused]] +#else +#define MOODYCAMEL_CONSTEXPR_IF if +#define MOODYCAMEL_MAYBE_UNUSED +#endif +#endif + +// Exceptions +#ifndef MOODYCAMEL_EXCEPTIONS_ENABLED +#if (defined(_MSC_VER) && defined(_CPPUNWIND)) || (defined(__GNUC__) && defined(__EXCEPTIONS)) || (!defined(_MSC_VER) && !defined(__GNUC__)) +#define MOODYCAMEL_EXCEPTIONS_ENABLED +#endif +#endif +#ifdef MOODYCAMEL_EXCEPTIONS_ENABLED +#define MOODYCAMEL_TRY try +#define MOODYCAMEL_CATCH(...) catch(__VA_ARGS__) +#define MOODYCAMEL_RETHROW throw +#define MOODYCAMEL_THROW(expr) throw (expr) +#else +#define MOODYCAMEL_TRY MOODYCAMEL_CONSTEXPR_IF (true) +#define MOODYCAMEL_CATCH(...) else MOODYCAMEL_CONSTEXPR_IF (false) +#define MOODYCAMEL_RETHROW +#define MOODYCAMEL_THROW(expr) +#endif + +#ifndef MOODYCAMEL_NOEXCEPT +#if !defined(MOODYCAMEL_EXCEPTIONS_ENABLED) +#define MOODYCAMEL_NOEXCEPT +#define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr) true +#define MOODYCAMEL_NOEXCEPT_ASSIGN(type, valueType, expr) true +#elif defined(_MSC_VER) && defined(_NOEXCEPT) && _MSC_VER < 1800 +// VS2012's std::is_nothrow_[move_]constructible is broken and returns true when it shouldn't :-( +// We have to assume *all* non-trivial constructors may throw on VS2012! +#define MOODYCAMEL_NOEXCEPT _NOEXCEPT +#define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr) (std::is_rvalue_reference::value && std::is_move_constructible::value ? 
std::is_trivially_move_constructible::value : std::is_trivially_copy_constructible::value) +#define MOODYCAMEL_NOEXCEPT_ASSIGN(type, valueType, expr) ((std::is_rvalue_reference::value && std::is_move_assignable::value ? std::is_trivially_move_assignable::value || std::is_nothrow_move_assignable::value : std::is_trivially_copy_assignable::value || std::is_nothrow_copy_assignable::value) && MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr)) +#elif defined(_MSC_VER) && defined(_NOEXCEPT) && _MSC_VER < 1900 +#define MOODYCAMEL_NOEXCEPT _NOEXCEPT +#define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr) (std::is_rvalue_reference::value && std::is_move_constructible::value ? std::is_trivially_move_constructible::value || std::is_nothrow_move_constructible::value : std::is_trivially_copy_constructible::value || std::is_nothrow_copy_constructible::value) +#define MOODYCAMEL_NOEXCEPT_ASSIGN(type, valueType, expr) ((std::is_rvalue_reference::value && std::is_move_assignable::value ? std::is_trivially_move_assignable::value || std::is_nothrow_move_assignable::value : std::is_trivially_copy_assignable::value || std::is_nothrow_copy_assignable::value) && MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr)) +#else +#define MOODYCAMEL_NOEXCEPT noexcept +#define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr) noexcept(expr) +#define MOODYCAMEL_NOEXCEPT_ASSIGN(type, valueType, expr) noexcept(expr) +#endif +#endif + +#ifndef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED +#ifdef MCDBGQ_USE_RELACY +#define MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED +#else +// VS2013 doesn't support `thread_local`, and MinGW-w64 w/ POSIX threading has a crippling bug: http://sourceforge.net/p/mingw-w64/bugs/445 +// g++ <=4.7 doesn't support thread_local either. 
+// Finally, iOS/ARM doesn't have support for it either, and g++/ARM allows it to compile but it's unconfirmed to actually work +#if (!defined(_MSC_VER) || _MSC_VER >= 1900) && (!defined(__MINGW32__) && !defined(__MINGW64__) || !defined(__WINPTHREADS_VERSION)) && (!defined(__GNUC__) || __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)) && (!defined(__APPLE__) || !TARGET_OS_IPHONE) && !defined(__arm__) && !defined(_M_ARM) && !defined(__aarch64__) +// Assume `thread_local` is fully supported in all other C++11 compilers/platforms +//#define MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED // always disabled for now since several users report having problems with it on +#endif +#endif +#endif + +// VS2012 doesn't support deleted functions. +// In this case, we declare the function normally but don't define it. A link error will be generated if the function is called. +#ifndef MOODYCAMEL_DELETE_FUNCTION +#if defined(_MSC_VER) && _MSC_VER < 1800 +#define MOODYCAMEL_DELETE_FUNCTION +#else +#define MOODYCAMEL_DELETE_FUNCTION = delete +#endif +#endif + +namespace moodycamel { namespace details { +#ifndef MOODYCAMEL_ALIGNAS +// VS2013 doesn't support alignas or alignof, and align() requires a constant literal +#if defined(_MSC_VER) && _MSC_VER <= 1800 +#define MOODYCAMEL_ALIGNAS(alignment) __declspec(align(alignment)) +#define MOODYCAMEL_ALIGNOF(obj) __alignof(obj) +#define MOODYCAMEL_ALIGNED_TYPE_LIKE(T, obj) typename details::Vs2013Aligned::value, T>::type + template struct Vs2013Aligned { }; // default, unsupported alignment + template struct Vs2013Aligned<1, T> { typedef __declspec(align(1)) T type; }; + template struct Vs2013Aligned<2, T> { typedef __declspec(align(2)) T type; }; + template struct Vs2013Aligned<4, T> { typedef __declspec(align(4)) T type; }; + template struct Vs2013Aligned<8, T> { typedef __declspec(align(8)) T type; }; + template struct Vs2013Aligned<16, T> { typedef __declspec(align(16)) T type; }; + template struct Vs2013Aligned<32, T> { typedef 
__declspec(align(32)) T type; }; + template struct Vs2013Aligned<64, T> { typedef __declspec(align(64)) T type; }; + template struct Vs2013Aligned<128, T> { typedef __declspec(align(128)) T type; }; + template struct Vs2013Aligned<256, T> { typedef __declspec(align(256)) T type; }; +#else + template struct identity { typedef T type; }; +#define MOODYCAMEL_ALIGNAS(alignment) alignas(alignment) +#define MOODYCAMEL_ALIGNOF(obj) alignof(obj) +#define MOODYCAMEL_ALIGNED_TYPE_LIKE(T, obj) alignas(alignof(obj)) typename details::identity::type +#endif +#endif +} } + + +// TSAN can false report races in lock-free code. To enable TSAN to be used from projects that use this one, +// we can apply per-function compile-time suppression. +// See https://clang.llvm.org/docs/ThreadSanitizer.html#has-feature-thread-sanitizer +#define MOODYCAMEL_NO_TSAN +#if defined(__has_feature) + #if __has_feature(thread_sanitizer) + #undef MOODYCAMEL_NO_TSAN + #define MOODYCAMEL_NO_TSAN __attribute__((no_sanitize("thread"))) + #endif // TSAN +#endif // TSAN + +// Compiler-specific likely/unlikely hints +namespace moodycamel { namespace details { +#if defined(__GNUC__) + static inline bool (likely)(bool x) { return __builtin_expect((x), true); } + static inline bool (unlikely)(bool x) { return __builtin_expect((x), false); } +#else + static inline bool (likely)(bool x) { return x; } + static inline bool (unlikely)(bool x) { return x; } +#endif +} } + +#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG +#include "internal/concurrentqueue_internal_debug.h" +#endif + +namespace moodycamel { +namespace details { + template + struct const_numeric_max { + static_assert(std::is_integral::value, "const_numeric_max can only be used with integers"); + static const T value = std::numeric_limits::is_signed + ? 
(static_cast(1) << (sizeof(T) * CHAR_BIT - 1)) - static_cast(1) + : static_cast(-1); + }; + +#if defined(__GLIBCXX__) + typedef ::max_align_t std_max_align_t; // libstdc++ forgot to add it to std:: for a while +#else + typedef std::max_align_t std_max_align_t; // Others (e.g. MSVC) insist it can *only* be accessed via std:: +#endif + + // Some platforms have incorrectly set max_align_t to a type with <8 bytes alignment even while supporting + // 8-byte aligned scalar values (*cough* 32-bit iOS). Work around this with our own union. See issue #64. + typedef union { + std_max_align_t x; + long long y; + void* z; + } max_align_t; +} + +// Default traits for the ConcurrentQueue. To change some of the +// traits without re-implementing all of them, inherit from this +// struct and shadow the declarations you wish to be different; +// since the traits are used as a template type parameter, the +// shadowed declarations will be used where defined, and the defaults +// otherwise. +struct ConcurrentQueueDefaultTraits +{ + // General-purpose size type. std::size_t is strongly recommended. + typedef std::size_t size_t; + + // The type used for the enqueue and dequeue indices. Must be at least as + // large as size_t. Should be significantly larger than the number of elements + // you expect to hold at once, especially if you have a high turnover rate; + // for example, on 32-bit x86, if you expect to have over a hundred million + // elements or pump several million elements through your queue in a very + // short space of time, using a 32-bit type *may* trigger a race condition. + // A 64-bit int type is recommended in that case, and in practice will + // prevent a race condition no matter the usage of the queue. Note that + // whether the queue is lock-free with a 64-int type depends on the whether + // std::atomic is lock-free, which is platform-specific. 
+ typedef std::size_t index_t; + + // Internally, all elements are enqueued and dequeued from multi-element + // blocks; this is the smallest controllable unit. If you expect few elements + // but many producers, a smaller block size should be favoured. For few producers + // and/or many elements, a larger block size is preferred. A sane default + // is provided. Must be a power of 2. + static const size_t BLOCK_SIZE = 32; + + // For explicit producers (i.e. when using a producer token), the block is + // checked for being empty by iterating through a list of flags, one per element. + // For large block sizes, this is too inefficient, and switching to an atomic + // counter-based approach is faster. The switch is made for block sizes strictly + // larger than this threshold. + static const size_t EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD = 32; + + // How many full blocks can be expected for a single explicit producer? This should + // reflect that number's maximum for optimal performance. Must be a power of 2. + static const size_t EXPLICIT_INITIAL_INDEX_SIZE = 32; + + // How many full blocks can be expected for a single implicit producer? This should + // reflect that number's maximum for optimal performance. Must be a power of 2. + static const size_t IMPLICIT_INITIAL_INDEX_SIZE = 32; + + // The initial size of the hash table mapping thread IDs to implicit producers. + // Note that the hash is resized every time it becomes half full. + // Must be a power of two, and either 0 or at least 1. If 0, implicit production + // (using the enqueue methods without an explicit producer token) is disabled. + static const size_t INITIAL_IMPLICIT_PRODUCER_HASH_SIZE = 32; + + // Controls the number of items that an explicit consumer (i.e. one with a token) + // must consume before it causes all consumers to rotate and move on to the next + // internal queue. 
+ static const std::uint32_t EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE = 256; + + // The maximum number of elements (inclusive) that can be enqueued to a sub-queue. + // Enqueue operations that would cause this limit to be surpassed will fail. Note + // that this limit is enforced at the block level (for performance reasons), i.e. + // it's rounded up to the nearest block size. + static const size_t MAX_SUBQUEUE_SIZE = details::const_numeric_max::value; + + // The number of times to spin before sleeping when waiting on a semaphore. + // Recommended values are on the order of 1000-10000 unless the number of + // consumer threads exceeds the number of idle cores (in which case try 0-100). + // Only affects instances of the BlockingConcurrentQueue. + static const int MAX_SEMA_SPINS = 10000; + + +#ifndef MCDBGQ_USE_RELACY + // Memory allocation can be customized if needed. + // malloc should return nullptr on failure, and handle alignment like std::malloc. +#if defined(malloc) || defined(free) + // Gah, this is 2015, stop defining macros that break standard code already! 
+ // Work around malloc/free being special macros: + static inline void* WORKAROUND_malloc(size_t size) { return malloc(size); } + static inline void WORKAROUND_free(void* ptr) { return free(ptr); } + static inline void* (malloc)(size_t size) { return WORKAROUND_malloc(size); } + static inline void (free)(void* ptr) { return WORKAROUND_free(ptr); } +#else + static inline void* malloc(size_t size) { return std::malloc(size); } + static inline void free(void* ptr) { return std::free(ptr); } +#endif +#else + // Debug versions when running under the Relacy race detector (ignore + // these in user code) + static inline void* malloc(size_t size) { return rl::rl_malloc(size, $); } + static inline void free(void* ptr) { return rl::rl_free(ptr, $); } +#endif +}; + + +// When producing or consuming many elements, the most efficient way is to: +// 1) Use one of the bulk-operation methods of the queue with a token +// 2) Failing that, use the bulk-operation methods without a token +// 3) Failing that, create a token and use that with the single-item methods +// 4) Failing that, use the single-parameter methods of the queue +// Having said that, don't create tokens willy-nilly -- ideally there should be +// a maximum of one token per thread (of each kind). 
+struct ProducerToken; +struct ConsumerToken; + +template class ConcurrentQueue; +template class BlockingConcurrentQueue; +class ConcurrentQueueTests; + + +namespace details +{ + struct ConcurrentQueueProducerTypelessBase + { + ConcurrentQueueProducerTypelessBase* next; + std::atomic inactive; + ProducerToken* token; + + ConcurrentQueueProducerTypelessBase() + : next(nullptr), inactive(false), token(nullptr) + { + } + }; + + template struct _hash_32_or_64 { + static inline std::uint32_t hash(std::uint32_t h) + { + // MurmurHash3 finalizer -- see https://code.google.com/p/smhasher/source/browse/trunk/MurmurHash3.cpp + // Since the thread ID is already unique, all we really want to do is propagate that + // uniqueness evenly across all the bits, so that we can use a subset of the bits while + // reducing collisions significantly + h ^= h >> 16; + h *= 0x85ebca6b; + h ^= h >> 13; + h *= 0xc2b2ae35; + return h ^ (h >> 16); + } + }; + template<> struct _hash_32_or_64<1> { + static inline std::uint64_t hash(std::uint64_t h) + { + h ^= h >> 33; + h *= 0xff51afd7ed558ccd; + h ^= h >> 33; + h *= 0xc4ceb9fe1a85ec53; + return h ^ (h >> 33); + } + }; + template struct hash_32_or_64 : public _hash_32_or_64<(size > 4)> { }; + + static inline size_t hash_thread_id(thread_id_t id) + { + static_assert(sizeof(thread_id_t) <= 8, "Expected a platform where thread IDs are at most 64-bit values"); + return static_cast(hash_32_or_64::thread_id_hash_t)>::hash( + thread_id_converter::prehash(id))); + } + + template + static inline bool circular_less_than(T a, T b) + { +#ifdef _MSC_VER +#pragma warning(push) +#pragma warning(disable: 4554) +#endif + static_assert(std::is_integral::value && !std::numeric_limits::is_signed, "circular_less_than is intended to be used only with unsigned integer types"); + return static_cast(a - b) > static_cast(static_cast(1) << static_cast(sizeof(T) * CHAR_BIT - 1)); +#ifdef _MSC_VER +#pragma warning(pop) +#endif + } + + template + static inline char* 
align_for(char* ptr) + { + const std::size_t alignment = std::alignment_of::value; + return ptr + (alignment - (reinterpret_cast(ptr) % alignment)) % alignment; + } + + template + static inline T ceil_to_pow_2(T x) + { + static_assert(std::is_integral::value && !std::numeric_limits::is_signed, "ceil_to_pow_2 is intended to be used only with unsigned integer types"); + + // Adapted from http://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2 + --x; + x |= x >> 1; + x |= x >> 2; + x |= x >> 4; + for (std::size_t i = 1; i < sizeof(T); i <<= 1) { + x |= x >> (i << 3); + } + ++x; + return x; + } + + template + static inline void swap_relaxed(std::atomic& left, std::atomic& right) + { + T temp = std::move(left.load(std::memory_order_relaxed)); + left.store(std::move(right.load(std::memory_order_relaxed)), std::memory_order_relaxed); + right.store(std::move(temp), std::memory_order_relaxed); + } + + template + static inline T const& nomove(T const& x) + { + return x; + } + + template + struct nomove_if + { + template + static inline T const& eval(T const& x) + { + return x; + } + }; + + template<> + struct nomove_if + { + template + static inline auto eval(U&& x) + -> decltype(std::forward(x)) + { + return std::forward(x); + } + }; + + template + static inline auto deref_noexcept(It& it) MOODYCAMEL_NOEXCEPT -> decltype(*it) + { + return *it; + } + +#if defined(__clang__) || !defined(__GNUC__) || __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8) + template struct is_trivially_destructible : std::is_trivially_destructible { }; +#else + template struct is_trivially_destructible : std::has_trivial_destructor { }; +#endif + +#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED +#ifdef MCDBGQ_USE_RELACY + typedef RelacyThreadExitListener ThreadExitListener; + typedef RelacyThreadExitNotifier ThreadExitNotifier; +#else + struct ThreadExitListener + { + typedef void (*callback_t)(void*); + callback_t callback; + void* userData; + + ThreadExitListener* next; // reserved 
for use by the ThreadExitNotifier + }; + + + class ThreadExitNotifier + { + public: + static void subscribe(ThreadExitListener* listener) + { + auto& tlsInst = instance(); + listener->next = tlsInst.tail; + tlsInst.tail = listener; + } + + static void unsubscribe(ThreadExitListener* listener) + { + auto& tlsInst = instance(); + ThreadExitListener** prev = &tlsInst.tail; + for (auto ptr = tlsInst.tail; ptr != nullptr; ptr = ptr->next) { + if (ptr == listener) { + *prev = ptr->next; + break; + } + prev = &ptr->next; + } + } + + private: + ThreadExitNotifier() : tail(nullptr) { } + ThreadExitNotifier(ThreadExitNotifier const&) MOODYCAMEL_DELETE_FUNCTION; + ThreadExitNotifier& operator=(ThreadExitNotifier const&) MOODYCAMEL_DELETE_FUNCTION; + + ~ThreadExitNotifier() + { + // This thread is about to exit, let everyone know! + assert(this == &instance() && "If this assert fails, you likely have a buggy compiler! Change the preprocessor conditions such that MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED is no longer defined."); + for (auto ptr = tail; ptr != nullptr; ptr = ptr->next) { + ptr->callback(ptr->userData); + } + } + + // Thread-local + static inline ThreadExitNotifier& instance() + { + static thread_local ThreadExitNotifier notifier; + return notifier; + } + + private: + ThreadExitListener* tail; + }; +#endif +#endif + + template struct static_is_lock_free_num { enum { value = 0 }; }; + template<> struct static_is_lock_free_num { enum { value = ATOMIC_CHAR_LOCK_FREE }; }; + template<> struct static_is_lock_free_num { enum { value = ATOMIC_SHORT_LOCK_FREE }; }; + template<> struct static_is_lock_free_num { enum { value = ATOMIC_INT_LOCK_FREE }; }; + template<> struct static_is_lock_free_num { enum { value = ATOMIC_LONG_LOCK_FREE }; }; + template<> struct static_is_lock_free_num { enum { value = ATOMIC_LLONG_LOCK_FREE }; }; + template struct static_is_lock_free : static_is_lock_free_num::type> { }; + template<> struct static_is_lock_free { enum { value = 
ATOMIC_BOOL_LOCK_FREE }; }; + template struct static_is_lock_free { enum { value = ATOMIC_POINTER_LOCK_FREE }; }; +} + + +struct ProducerToken +{ + template + explicit ProducerToken(ConcurrentQueue& queue); + + template + explicit ProducerToken(BlockingConcurrentQueue& queue); + + ProducerToken(ProducerToken&& other) MOODYCAMEL_NOEXCEPT + : producer(other.producer) + { + other.producer = nullptr; + if (producer != nullptr) { + producer->token = this; + } + } + + inline ProducerToken& operator=(ProducerToken&& other) MOODYCAMEL_NOEXCEPT + { + swap(other); + return *this; + } + + void swap(ProducerToken& other) MOODYCAMEL_NOEXCEPT + { + std::swap(producer, other.producer); + if (producer != nullptr) { + producer->token = this; + } + if (other.producer != nullptr) { + other.producer->token = &other; + } + } + + // A token is always valid unless: + // 1) Memory allocation failed during construction + // 2) It was moved via the move constructor + // (Note: assignment does a swap, leaving both potentially valid) + // 3) The associated queue was destroyed + // Note that if valid() returns true, that only indicates + // that the token is valid for use with a specific queue, + // but not which one; that's up to the user to track. 
+ inline bool valid() const { return producer != nullptr; } + + ~ProducerToken() + { + if (producer != nullptr) { + producer->token = nullptr; + producer->inactive.store(true, std::memory_order_release); + } + } + + // Disable copying and assignment + ProducerToken(ProducerToken const&) MOODYCAMEL_DELETE_FUNCTION; + ProducerToken& operator=(ProducerToken const&) MOODYCAMEL_DELETE_FUNCTION; + +private: + template friend class ConcurrentQueue; + friend class ConcurrentQueueTests; + +protected: + details::ConcurrentQueueProducerTypelessBase* producer; +}; + + +struct ConsumerToken +{ + template + explicit ConsumerToken(ConcurrentQueue& q); + + template + explicit ConsumerToken(BlockingConcurrentQueue& q); + + ConsumerToken(ConsumerToken&& other) MOODYCAMEL_NOEXCEPT + : initialOffset(other.initialOffset), lastKnownGlobalOffset(other.lastKnownGlobalOffset), itemsConsumedFromCurrent(other.itemsConsumedFromCurrent), currentProducer(other.currentProducer), desiredProducer(other.desiredProducer) + { + } + + inline ConsumerToken& operator=(ConsumerToken&& other) MOODYCAMEL_NOEXCEPT + { + swap(other); + return *this; + } + + void swap(ConsumerToken& other) MOODYCAMEL_NOEXCEPT + { + std::swap(initialOffset, other.initialOffset); + std::swap(lastKnownGlobalOffset, other.lastKnownGlobalOffset); + std::swap(itemsConsumedFromCurrent, other.itemsConsumedFromCurrent); + std::swap(currentProducer, other.currentProducer); + std::swap(desiredProducer, other.desiredProducer); + } + + // Disable copying and assignment + ConsumerToken(ConsumerToken const&) MOODYCAMEL_DELETE_FUNCTION; + ConsumerToken& operator=(ConsumerToken const&) MOODYCAMEL_DELETE_FUNCTION; + +private: + template friend class ConcurrentQueue; + friend class ConcurrentQueueTests; + +private: // but shared with ConcurrentQueue + std::uint32_t initialOffset; + std::uint32_t lastKnownGlobalOffset; + std::uint32_t itemsConsumedFromCurrent; + details::ConcurrentQueueProducerTypelessBase* currentProducer; + 
details::ConcurrentQueueProducerTypelessBase* desiredProducer; +}; + +// Need to forward-declare this swap because it's in a namespace. +// See http://stackoverflow.com/questions/4492062/why-does-a-c-friend-class-need-a-forward-declaration-only-in-other-namespaces +template +inline void swap(typename ConcurrentQueue::ImplicitProducerKVP& a, typename ConcurrentQueue::ImplicitProducerKVP& b) MOODYCAMEL_NOEXCEPT; + + +template +class ConcurrentQueue +{ +public: + typedef ::moodycamel::ProducerToken producer_token_t; + typedef ::moodycamel::ConsumerToken consumer_token_t; + + typedef typename Traits::index_t index_t; + typedef typename Traits::size_t size_t; + + static const size_t BLOCK_SIZE = static_cast(Traits::BLOCK_SIZE); + static const size_t EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD = static_cast(Traits::EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD); + static const size_t EXPLICIT_INITIAL_INDEX_SIZE = static_cast(Traits::EXPLICIT_INITIAL_INDEX_SIZE); + static const size_t IMPLICIT_INITIAL_INDEX_SIZE = static_cast(Traits::IMPLICIT_INITIAL_INDEX_SIZE); + static const size_t INITIAL_IMPLICIT_PRODUCER_HASH_SIZE = static_cast(Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE); + static const std::uint32_t EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE = static_cast(Traits::EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE); +#ifdef _MSC_VER +#pragma warning(push) +#pragma warning(disable: 4307) // + integral constant overflow (that's what the ternary expression is for!) +#pragma warning(disable: 4309) // static_cast: Truncation of constant value +#endif + static const size_t MAX_SUBQUEUE_SIZE = (details::const_numeric_max::value - static_cast(Traits::MAX_SUBQUEUE_SIZE) < BLOCK_SIZE) ? 
details::const_numeric_max::value : ((static_cast(Traits::MAX_SUBQUEUE_SIZE) + (BLOCK_SIZE - 1)) / BLOCK_SIZE * BLOCK_SIZE); +#ifdef _MSC_VER +#pragma warning(pop) +#endif + + static_assert(!std::numeric_limits::is_signed && std::is_integral::value, "Traits::size_t must be an unsigned integral type"); + static_assert(!std::numeric_limits::is_signed && std::is_integral::value, "Traits::index_t must be an unsigned integral type"); + static_assert(sizeof(index_t) >= sizeof(size_t), "Traits::index_t must be at least as wide as Traits::size_t"); + static_assert((BLOCK_SIZE > 1) && !(BLOCK_SIZE & (BLOCK_SIZE - 1)), "Traits::BLOCK_SIZE must be a power of 2 (and at least 2)"); + static_assert((EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD > 1) && !(EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD & (EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD - 1)), "Traits::EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD must be a power of 2 (and greater than 1)"); + static_assert((EXPLICIT_INITIAL_INDEX_SIZE > 1) && !(EXPLICIT_INITIAL_INDEX_SIZE & (EXPLICIT_INITIAL_INDEX_SIZE - 1)), "Traits::EXPLICIT_INITIAL_INDEX_SIZE must be a power of 2 (and greater than 1)"); + static_assert((IMPLICIT_INITIAL_INDEX_SIZE > 1) && !(IMPLICIT_INITIAL_INDEX_SIZE & (IMPLICIT_INITIAL_INDEX_SIZE - 1)), "Traits::IMPLICIT_INITIAL_INDEX_SIZE must be a power of 2 (and greater than 1)"); + static_assert((INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) || !(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE & (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE - 1)), "Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE must be a power of 2"); + static_assert(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0 || INITIAL_IMPLICIT_PRODUCER_HASH_SIZE >= 1, "Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE must be at least 1 (or 0 to disable implicit enqueueing)"); + +public: + // Creates a queue with at least `capacity` element slots; note that the + // actual number of elements that can be inserted without additional memory + // allocation depends on the number of producers and the block size (e.g. 
if + // the block size is equal to `capacity`, only a single block will be allocated + // up-front, which means only a single producer will be able to enqueue elements + // without an extra allocation -- blocks aren't shared between producers). + // This method is not thread safe -- it is up to the user to ensure that the + // queue is fully constructed before it starts being used by other threads (this + // includes making the memory effects of construction visible, possibly with a + // memory barrier). + explicit ConcurrentQueue(size_t capacity = 6 * BLOCK_SIZE) + : producerListTail(nullptr), + producerCount(0), + initialBlockPoolIndex(0), + nextExplicitConsumerId(0), + globalExplicitConsumerOffset(0) + { + implicitProducerHashResizeInProgress.clear(std::memory_order_relaxed); + populate_initial_implicit_producer_hash(); + populate_initial_block_list(capacity / BLOCK_SIZE + ((capacity & (BLOCK_SIZE - 1)) == 0 ? 0 : 1)); + +#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG + // Track all the producers using a fully-resolved typed list for + // each kind; this makes it possible to debug them starting from + // the root queue object (otherwise wacky casts are needed that + // don't compile in the debugger's expression evaluator). + explicitProducers.store(nullptr, std::memory_order_relaxed); + implicitProducers.store(nullptr, std::memory_order_relaxed); +#endif + } + + // Computes the correct amount of pre-allocated blocks for you based + // on the minimum number of elements you want available at any given + // time, and the maximum concurrent number of each type of producer. 
+ ConcurrentQueue(size_t minCapacity, size_t maxExplicitProducers, size_t maxImplicitProducers) + : producerListTail(nullptr), + producerCount(0), + initialBlockPoolIndex(0), + nextExplicitConsumerId(0), + globalExplicitConsumerOffset(0) + { + implicitProducerHashResizeInProgress.clear(std::memory_order_relaxed); + populate_initial_implicit_producer_hash(); + size_t blocks = (((minCapacity + BLOCK_SIZE - 1) / BLOCK_SIZE) - 1) * (maxExplicitProducers + 1) + 2 * (maxExplicitProducers + maxImplicitProducers); + populate_initial_block_list(blocks); + +#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG + explicitProducers.store(nullptr, std::memory_order_relaxed); + implicitProducers.store(nullptr, std::memory_order_relaxed); +#endif + } + + // Note: The queue should not be accessed concurrently while it's + // being deleted. It's up to the user to synchronize this. + // This method is not thread safe. + ~ConcurrentQueue() + { + // Destroy producers + auto ptr = producerListTail.load(std::memory_order_relaxed); + while (ptr != nullptr) { + auto next = ptr->next_prod(); + if (ptr->token != nullptr) { + ptr->token->producer = nullptr; + } + destroy(ptr); + ptr = next; + } + + // Destroy implicit producer hash tables + MOODYCAMEL_CONSTEXPR_IF (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE != 0) { + auto hash = implicitProducerHash.load(std::memory_order_relaxed); + while (hash != nullptr) { + auto prev = hash->prev; + if (prev != nullptr) { // The last hash is part of this object and was not allocated dynamically + for (size_t i = 0; i != hash->capacity; ++i) { + hash->entries[i].~ImplicitProducerKVP(); + } + hash->~ImplicitProducerHash(); + (Traits::free)(hash); + } + hash = prev; + } + } + + // Destroy global free list + auto block = freeList.head_unsafe(); + while (block != nullptr) { + auto next = block->freeListNext.load(std::memory_order_relaxed); + if (block->dynamicallyAllocated) { + destroy(block); + } + block = next; + } + + // Destroy initial free list + 
destroy_array(initialBlockPool, initialBlockPoolSize); + } + + // Disable copying and copy assignment + ConcurrentQueue(ConcurrentQueue const&) MOODYCAMEL_DELETE_FUNCTION; + ConcurrentQueue& operator=(ConcurrentQueue const&) MOODYCAMEL_DELETE_FUNCTION; + + // Moving is supported, but note that it is *not* a thread-safe operation. + // Nobody can use the queue while it's being moved, and the memory effects + // of that move must be propagated to other threads before they can use it. + // Note: When a queue is moved, its tokens are still valid but can only be + // used with the destination queue (i.e. semantically they are moved along + // with the queue itself). + ConcurrentQueue(ConcurrentQueue&& other) MOODYCAMEL_NOEXCEPT + : producerListTail(other.producerListTail.load(std::memory_order_relaxed)), + producerCount(other.producerCount.load(std::memory_order_relaxed)), + initialBlockPoolIndex(other.initialBlockPoolIndex.load(std::memory_order_relaxed)), + initialBlockPool(other.initialBlockPool), + initialBlockPoolSize(other.initialBlockPoolSize), + freeList(std::move(other.freeList)), + nextExplicitConsumerId(other.nextExplicitConsumerId.load(std::memory_order_relaxed)), + globalExplicitConsumerOffset(other.globalExplicitConsumerOffset.load(std::memory_order_relaxed)) + { + // Move the other one into this, and leave the other one as an empty queue + implicitProducerHashResizeInProgress.clear(std::memory_order_relaxed); + populate_initial_implicit_producer_hash(); + swap_implicit_producer_hashes(other); + + other.producerListTail.store(nullptr, std::memory_order_relaxed); + other.producerCount.store(0, std::memory_order_relaxed); + other.nextExplicitConsumerId.store(0, std::memory_order_relaxed); + other.globalExplicitConsumerOffset.store(0, std::memory_order_relaxed); + +#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG + explicitProducers.store(other.explicitProducers.load(std::memory_order_relaxed), std::memory_order_relaxed); + other.explicitProducers.store(nullptr, 
std::memory_order_relaxed); + implicitProducers.store(other.implicitProducers.load(std::memory_order_relaxed), std::memory_order_relaxed); + other.implicitProducers.store(nullptr, std::memory_order_relaxed); +#endif + + other.initialBlockPoolIndex.store(0, std::memory_order_relaxed); + other.initialBlockPoolSize = 0; + other.initialBlockPool = nullptr; + + reown_producers(); + } + + inline ConcurrentQueue& operator=(ConcurrentQueue&& other) MOODYCAMEL_NOEXCEPT + { + return swap_internal(other); + } + + // Swaps this queue's state with the other's. Not thread-safe. + // Swapping two queues does not invalidate their tokens, however + // the tokens that were created for one queue must be used with + // only the swapped queue (i.e. the tokens are tied to the + // queue's movable state, not the object itself). + inline void swap(ConcurrentQueue& other) MOODYCAMEL_NOEXCEPT + { + swap_internal(other); + } + +private: + ConcurrentQueue& swap_internal(ConcurrentQueue& other) + { + if (this == &other) { + return *this; + } + + details::swap_relaxed(producerListTail, other.producerListTail); + details::swap_relaxed(producerCount, other.producerCount); + details::swap_relaxed(initialBlockPoolIndex, other.initialBlockPoolIndex); + std::swap(initialBlockPool, other.initialBlockPool); + std::swap(initialBlockPoolSize, other.initialBlockPoolSize); + freeList.swap(other.freeList); + details::swap_relaxed(nextExplicitConsumerId, other.nextExplicitConsumerId); + details::swap_relaxed(globalExplicitConsumerOffset, other.globalExplicitConsumerOffset); + + swap_implicit_producer_hashes(other); + + reown_producers(); + other.reown_producers(); + +#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG + details::swap_relaxed(explicitProducers, other.explicitProducers); + details::swap_relaxed(implicitProducers, other.implicitProducers); +#endif + + return *this; + } + +public: + // Enqueues a single item (by copying it). + // Allocates memory if required. 
Only fails if memory allocation fails (or implicit + // production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0, + // or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). + // Thread-safe. + inline bool enqueue(T const& item) + { + MOODYCAMEL_CONSTEXPR_IF (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false; + else return inner_enqueue(item); + } + + // Enqueues a single item (by moving it, if possible). + // Allocates memory if required. Only fails if memory allocation fails (or implicit + // production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0, + // or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). + // Thread-safe. + inline bool enqueue(T&& item) + { + MOODYCAMEL_CONSTEXPR_IF (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false; + else return inner_enqueue(std::move(item)); + } + + // Enqueues a single item (by copying it) using an explicit producer token. + // Allocates memory if required. Only fails if memory allocation fails (or + // Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). + // Thread-safe. + inline bool enqueue(producer_token_t const& token, T const& item) + { + return inner_enqueue(token, item); + } + + // Enqueues a single item (by moving it, if possible) using an explicit producer token. + // Allocates memory if required. Only fails if memory allocation fails (or + // Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). + // Thread-safe. + inline bool enqueue(producer_token_t const& token, T&& item) + { + return inner_enqueue(token, std::move(item)); + } + + // Enqueues several items. + // Allocates memory if required. Only fails if memory allocation fails (or + // implicit production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE + // is 0, or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). + // Note: Use std::make_move_iterator if the elements should be moved instead of copied. 
+ // Thread-safe. + template + bool enqueue_bulk(It itemFirst, size_t count) + { + MOODYCAMEL_CONSTEXPR_IF (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false; + else return inner_enqueue_bulk(itemFirst, count); + } + + // Enqueues several items using an explicit producer token. + // Allocates memory if required. Only fails if memory allocation fails + // (or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). + // Note: Use std::make_move_iterator if the elements should be moved + // instead of copied. + // Thread-safe. + template + bool enqueue_bulk(producer_token_t const& token, It itemFirst, size_t count) + { + return inner_enqueue_bulk(token, itemFirst, count); + } + + // Enqueues a single item (by copying it). + // Does not allocate memory. Fails if not enough room to enqueue (or implicit + // production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE + // is 0). + // Thread-safe. + inline bool try_enqueue(T const& item) + { + MOODYCAMEL_CONSTEXPR_IF (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false; + else return inner_enqueue(item); + } + + // Enqueues a single item (by moving it, if possible). + // Does not allocate memory (except for one-time implicit producer). + // Fails if not enough room to enqueue (or implicit production is + // disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0). + // Thread-safe. + inline bool try_enqueue(T&& item) + { + MOODYCAMEL_CONSTEXPR_IF (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false; + else return inner_enqueue(std::move(item)); + } + + // Enqueues a single item (by copying it) using an explicit producer token. + // Does not allocate memory. Fails if not enough room to enqueue. + // Thread-safe. + inline bool try_enqueue(producer_token_t const& token, T const& item) + { + return inner_enqueue(token, item); + } + + // Enqueues a single item (by moving it, if possible) using an explicit producer token. + // Does not allocate memory. 
Fails if not enough room to enqueue. + // Thread-safe. + inline bool try_enqueue(producer_token_t const& token, T&& item) + { + return inner_enqueue(token, std::move(item)); + } + + // Enqueues several items. + // Does not allocate memory (except for one-time implicit producer). + // Fails if not enough room to enqueue (or implicit production is + // disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0). + // Note: Use std::make_move_iterator if the elements should be moved + // instead of copied. + // Thread-safe. + template + bool try_enqueue_bulk(It itemFirst, size_t count) + { + MOODYCAMEL_CONSTEXPR_IF (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false; + else return inner_enqueue_bulk(itemFirst, count); + } + + // Enqueues several items using an explicit producer token. + // Does not allocate memory. Fails if not enough room to enqueue. + // Note: Use std::make_move_iterator if the elements should be moved + // instead of copied. + // Thread-safe. + template + bool try_enqueue_bulk(producer_token_t const& token, It itemFirst, size_t count) + { + return inner_enqueue_bulk(token, itemFirst, count); + } + + + + // Attempts to dequeue from the queue. + // Returns false if all producer streams appeared empty at the time they + // were checked (so, the queue is likely but not guaranteed to be empty). + // Never allocates. Thread-safe. + template + bool try_dequeue(U& item) + { + // Instead of simply trying each producer in turn (which could cause needless contention on the first + // producer), we score them heuristically. 
+ size_t nonEmptyCount = 0; + ProducerBase* best = nullptr; + size_t bestSize = 0; + for (auto ptr = producerListTail.load(std::memory_order_acquire); nonEmptyCount < 3 && ptr != nullptr; ptr = ptr->next_prod()) { + auto size = ptr->size_approx(); + if (size > 0) { + if (size > bestSize) { + bestSize = size; + best = ptr; + } + ++nonEmptyCount; + } + } + + // If there was at least one non-empty queue but it appears empty at the time + // we try to dequeue from it, we need to make sure every queue's been tried + if (nonEmptyCount > 0) { + if ((details::likely)(best->dequeue(item))) { + return true; + } + for (auto ptr = producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) { + if (ptr != best && ptr->dequeue(item)) { + return true; + } + } + } + return false; + } + + // Attempts to dequeue from the queue. + // Returns false if all producer streams appeared empty at the time they + // were checked (so, the queue is likely but not guaranteed to be empty). + // This differs from the try_dequeue(item) method in that this one does + // not attempt to reduce contention by interleaving the order that producer + // streams are dequeued from. So, using this method can reduce overall throughput + // under contention, but will give more predictable results in single-threaded + // consumer scenarios. This is mostly only useful for internal unit tests. + // Never allocates. Thread-safe. + template + bool try_dequeue_non_interleaved(U& item) + { + for (auto ptr = producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) { + if (ptr->dequeue(item)) { + return true; + } + } + return false; + } + + // Attempts to dequeue from the queue using an explicit consumer token. + // Returns false if all producer streams appeared empty at the time they + // were checked (so, the queue is likely but not guaranteed to be empty). + // Never allocates. Thread-safe. 
+ template + bool try_dequeue(consumer_token_t& token, U& item) + { + // The idea is roughly as follows: + // Every 256 items from one producer, make everyone rotate (increase the global offset) -> this means the highest efficiency consumer dictates the rotation speed of everyone else, more or less + // If you see that the global offset has changed, you must reset your consumption counter and move to your designated place + // If there's no items where you're supposed to be, keep moving until you find a producer with some items + // If the global offset has not changed but you've run out of items to consume, move over from your current position until you find an producer with something in it + + if (token.desiredProducer == nullptr || token.lastKnownGlobalOffset != globalExplicitConsumerOffset.load(std::memory_order_relaxed)) { + if (!update_current_producer_after_rotation(token)) { + return false; + } + } + + // If there was at least one non-empty queue but it appears empty at the time + // we try to dequeue from it, we need to make sure every queue's been tried + if (static_cast(token.currentProducer)->dequeue(item)) { + if (++token.itemsConsumedFromCurrent == EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE) { + globalExplicitConsumerOffset.fetch_add(1, std::memory_order_relaxed); + } + return true; + } + + auto tail = producerListTail.load(std::memory_order_acquire); + auto ptr = static_cast(token.currentProducer)->next_prod(); + if (ptr == nullptr) { + ptr = tail; + } + while (ptr != static_cast(token.currentProducer)) { + if (ptr->dequeue(item)) { + token.currentProducer = ptr; + token.itemsConsumedFromCurrent = 1; + return true; + } + ptr = ptr->next_prod(); + if (ptr == nullptr) { + ptr = tail; + } + } + return false; + } + + // Attempts to dequeue several elements from the queue. + // Returns the number of items actually dequeued. 
+ // Returns 0 if all producer streams appeared empty at the time they + // were checked (so, the queue is likely but not guaranteed to be empty). + // Never allocates. Thread-safe. + template + size_t try_dequeue_bulk(It itemFirst, size_t max) + { + size_t count = 0; + for (auto ptr = producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) { + count += ptr->dequeue_bulk(itemFirst, max - count); + if (count == max) { + break; + } + } + return count; + } + + // Attempts to dequeue several elements from the queue using an explicit consumer token. + // Returns the number of items actually dequeued. + // Returns 0 if all producer streams appeared empty at the time they + // were checked (so, the queue is likely but not guaranteed to be empty). + // Never allocates. Thread-safe. + template + size_t try_dequeue_bulk(consumer_token_t& token, It itemFirst, size_t max) + { + if (token.desiredProducer == nullptr || token.lastKnownGlobalOffset != globalExplicitConsumerOffset.load(std::memory_order_relaxed)) { + if (!update_current_producer_after_rotation(token)) { + return 0; + } + } + + size_t count = static_cast(token.currentProducer)->dequeue_bulk(itemFirst, max); + if (count == max) { + if ((token.itemsConsumedFromCurrent += static_cast(max)) >= EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE) { + globalExplicitConsumerOffset.fetch_add(1, std::memory_order_relaxed); + } + return max; + } + token.itemsConsumedFromCurrent += static_cast(count); + max -= count; + + auto tail = producerListTail.load(std::memory_order_acquire); + auto ptr = static_cast(token.currentProducer)->next_prod(); + if (ptr == nullptr) { + ptr = tail; + } + while (ptr != static_cast(token.currentProducer)) { + auto dequeued = ptr->dequeue_bulk(itemFirst, max); + count += dequeued; + if (dequeued != 0) { + token.currentProducer = ptr; + token.itemsConsumedFromCurrent = static_cast(dequeued); + } + if (dequeued == max) { + break; + } + max -= dequeued; + ptr = 
ptr->next_prod(); + if (ptr == nullptr) { + ptr = tail; + } + } + return count; + } + + + + // Attempts to dequeue from a specific producer's inner queue. + // If you happen to know which producer you want to dequeue from, this + // is significantly faster than using the general-case try_dequeue methods. + // Returns false if the producer's queue appeared empty at the time it + // was checked (so, the queue is likely but not guaranteed to be empty). + // Never allocates. Thread-safe. + template + inline bool try_dequeue_from_producer(producer_token_t const& producer, U& item) + { + return static_cast(producer.producer)->dequeue(item); + } + + // Attempts to dequeue several elements from a specific producer's inner queue. + // Returns the number of items actually dequeued. + // If you happen to know which producer you want to dequeue from, this + // is significantly faster than using the general-case try_dequeue methods. + // Returns 0 if the producer's queue appeared empty at the time it + // was checked (so, the queue is likely but not guaranteed to be empty). + // Never allocates. Thread-safe. + template + inline size_t try_dequeue_bulk_from_producer(producer_token_t const& producer, It itemFirst, size_t max) + { + return static_cast(producer.producer)->dequeue_bulk(itemFirst, max); + } + + + // Returns an estimate of the total number of elements currently in the queue. This + // estimate is only accurate if the queue has completely stabilized before it is called + // (i.e. all enqueue and dequeue operations have completed and their memory effects are + // visible on the calling thread, and no further operations start while this method is + // being called). + // Thread-safe. 
+ size_t size_approx() const + { + size_t size = 0; + for (auto ptr = producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) { + size += ptr->size_approx(); + } + return size; + } + + + // Returns true if the underlying atomic variables used by + // the queue are lock-free (they should be on most platforms). + // Thread-safe. + static bool is_lock_free() + { + return + details::static_is_lock_free::value == 2 && + details::static_is_lock_free::value == 2 && + details::static_is_lock_free::value == 2 && + details::static_is_lock_free::value == 2 && + details::static_is_lock_free::value == 2 && + details::static_is_lock_free::thread_id_numeric_size_t>::value == 2; + } + + +private: + friend struct ProducerToken; + friend struct ConsumerToken; + struct ExplicitProducer; + friend struct ExplicitProducer; + struct ImplicitProducer; + friend struct ImplicitProducer; + friend class ConcurrentQueueTests; + + enum AllocationMode { CanAlloc, CannotAlloc }; + + + /////////////////////////////// + // Queue methods + /////////////////////////////// + + template + inline bool inner_enqueue(producer_token_t const& token, U&& element) + { + return static_cast(token.producer)->ConcurrentQueue::ExplicitProducer::template enqueue(std::forward(element)); + } + + template + inline bool inner_enqueue(U&& element) + { + auto producer = get_or_add_implicit_producer(); + return producer == nullptr ? false : producer->ConcurrentQueue::ImplicitProducer::template enqueue(std::forward(element)); + } + + template + inline bool inner_enqueue_bulk(producer_token_t const& token, It itemFirst, size_t count) + { + return static_cast(token.producer)->ConcurrentQueue::ExplicitProducer::template enqueue_bulk(itemFirst, count); + } + + template + inline bool inner_enqueue_bulk(It itemFirst, size_t count) + { + auto producer = get_or_add_implicit_producer(); + return producer == nullptr ? 
false : producer->ConcurrentQueue::ImplicitProducer::template enqueue_bulk(itemFirst, count); + } + + inline bool update_current_producer_after_rotation(consumer_token_t& token) + { + // Ah, there's been a rotation, figure out where we should be! + auto tail = producerListTail.load(std::memory_order_acquire); + if (token.desiredProducer == nullptr && tail == nullptr) { + return false; + } + auto prodCount = producerCount.load(std::memory_order_relaxed); + auto globalOffset = globalExplicitConsumerOffset.load(std::memory_order_relaxed); + if ((details::unlikely)(token.desiredProducer == nullptr)) { + // Aha, first time we're dequeueing anything. + // Figure out our local position + // Note: offset is from start, not end, but we're traversing from end -- subtract from count first + std::uint32_t offset = prodCount - 1 - (token.initialOffset % prodCount); + token.desiredProducer = tail; + for (std::uint32_t i = 0; i != offset; ++i) { + token.desiredProducer = static_cast(token.desiredProducer)->next_prod(); + if (token.desiredProducer == nullptr) { + token.desiredProducer = tail; + } + } + } + + std::uint32_t delta = globalOffset - token.lastKnownGlobalOffset; + if (delta >= prodCount) { + delta = delta % prodCount; + } + for (std::uint32_t i = 0; i != delta; ++i) { + token.desiredProducer = static_cast(token.desiredProducer)->next_prod(); + if (token.desiredProducer == nullptr) { + token.desiredProducer = tail; + } + } + + token.lastKnownGlobalOffset = globalOffset; + token.currentProducer = token.desiredProducer; + token.itemsConsumedFromCurrent = 0; + return true; + } + + + /////////////////////////// + // Free list + /////////////////////////// + + template + struct FreeListNode + { + FreeListNode() : freeListRefs(0), freeListNext(nullptr) { } + + std::atomic freeListRefs; + std::atomic freeListNext; + }; + + // A simple CAS-based lock-free free list. 
Not the fastest thing in the world under heavy contention, but + // simple and correct (assuming nodes are never freed until after the free list is destroyed), and fairly + // speedy under low contention. + template // N must inherit FreeListNode or have the same fields (and initialization of them) + struct FreeList + { + FreeList() : freeListHead(nullptr) { } + FreeList(FreeList&& other) : freeListHead(other.freeListHead.load(std::memory_order_relaxed)) { other.freeListHead.store(nullptr, std::memory_order_relaxed); } + void swap(FreeList& other) { details::swap_relaxed(freeListHead, other.freeListHead); } + + FreeList(FreeList const&) MOODYCAMEL_DELETE_FUNCTION; + FreeList& operator=(FreeList const&) MOODYCAMEL_DELETE_FUNCTION; + + inline void add(N* node) + { +#ifdef MCDBGQ_NOLOCKFREE_FREELIST + debug::DebugLock lock(mutex); +#endif + // We know that the should-be-on-freelist bit is 0 at this point, so it's safe to + // set it using a fetch_add + if (node->freeListRefs.fetch_add(SHOULD_BE_ON_FREELIST, std::memory_order_acq_rel) == 0) { + // Oh look! We were the last ones referencing this node, and we know + // we want to add it to the free list, so let's do it! 
+ add_knowing_refcount_is_zero(node); + } + } + + inline N* try_get() + { +#ifdef MCDBGQ_NOLOCKFREE_FREELIST + debug::DebugLock lock(mutex); +#endif + auto head = freeListHead.load(std::memory_order_acquire); + while (head != nullptr) { + auto prevHead = head; + auto refs = head->freeListRefs.load(std::memory_order_relaxed); + if ((refs & REFS_MASK) == 0 || !head->freeListRefs.compare_exchange_strong(refs, refs + 1, std::memory_order_acquire, std::memory_order_relaxed)) { + head = freeListHead.load(std::memory_order_acquire); + continue; + } + + // Good, reference count has been incremented (it wasn't at zero), which means we can read the + // next and not worry about it changing between now and the time we do the CAS + auto next = head->freeListNext.load(std::memory_order_relaxed); + if (freeListHead.compare_exchange_strong(head, next, std::memory_order_acquire, std::memory_order_relaxed)) { + // Yay, got the node. This means it was on the list, which means shouldBeOnFreeList must be false no + // matter the refcount (because nobody else knows it's been taken off yet, it can't have been put back on). + assert((head->freeListRefs.load(std::memory_order_relaxed) & SHOULD_BE_ON_FREELIST) == 0); + + // Decrease refcount twice, once for our ref, and once for the list's ref + head->freeListRefs.fetch_sub(2, std::memory_order_release); + return head; + } + + // OK, the head must have changed on us, but we still need to decrease the refcount we increased. + // Note that we don't need to release any memory effects, but we do need to ensure that the reference + // count decrement happens-after the CAS on the head. + refs = prevHead->freeListRefs.fetch_sub(1, std::memory_order_acq_rel); + if (refs == SHOULD_BE_ON_FREELIST + 1) { + add_knowing_refcount_is_zero(prevHead); + } + } + + return nullptr; + } + + // Useful for traversing the list when there's no contention (e.g. 
to destroy remaining nodes) + N* head_unsafe() const { return freeListHead.load(std::memory_order_relaxed); } + + private: + inline void add_knowing_refcount_is_zero(N* node) + { + // Since the refcount is zero, and nobody can increase it once it's zero (except us, and we run + // only one copy of this method per node at a time, i.e. the single thread case), then we know + // we can safely change the next pointer of the node; however, once the refcount is back above + // zero, then other threads could increase it (happens under heavy contention, when the refcount + // goes to zero in between a load and a refcount increment of a node in try_get, then back up to + // something non-zero, then the refcount increment is done by the other thread) -- so, if the CAS + // to add the node to the actual list fails, decrease the refcount and leave the add operation to + // the next thread who puts the refcount back at zero (which could be us, hence the loop). + auto head = freeListHead.load(std::memory_order_relaxed); + while (true) { + node->freeListNext.store(head, std::memory_order_relaxed); + node->freeListRefs.store(1, std::memory_order_release); + if (!freeListHead.compare_exchange_strong(head, node, std::memory_order_release, std::memory_order_relaxed)) { + // Hmm, the add failed, but we can only try again when the refcount goes back to zero + if (node->freeListRefs.fetch_add(SHOULD_BE_ON_FREELIST - 1, std::memory_order_release) == 1) { + continue; + } + } + return; + } + } + + private: + // Implemented like a stack, but where node order doesn't matter (nodes are inserted out of order under contention) + std::atomic freeListHead; + + static const std::uint32_t REFS_MASK = 0x7FFFFFFF; + static const std::uint32_t SHOULD_BE_ON_FREELIST = 0x80000000; + +#ifdef MCDBGQ_NOLOCKFREE_FREELIST + debug::DebugMutex mutex; +#endif + }; + + + /////////////////////////// + // Block + /////////////////////////// + + enum InnerQueueContext { implicit_context = 0, explicit_context = 1 }; 
+ + struct Block + { + Block() + : next(nullptr), elementsCompletelyDequeued(0), freeListRefs(0), freeListNext(nullptr), shouldBeOnFreeList(false), dynamicallyAllocated(true) + { +#ifdef MCDBGQ_TRACKMEM + owner = nullptr; +#endif + } + + template + inline bool is_empty() const + { + MOODYCAMEL_CONSTEXPR_IF (context == explicit_context && BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) { + // Check flags + for (size_t i = 0; i < BLOCK_SIZE; ++i) { + if (!emptyFlags[i].load(std::memory_order_relaxed)) { + return false; + } + } + + // Aha, empty; make sure we have all other memory effects that happened before the empty flags were set + std::atomic_thread_fence(std::memory_order_acquire); + return true; + } + else { + // Check counter + if (elementsCompletelyDequeued.load(std::memory_order_relaxed) == BLOCK_SIZE) { + std::atomic_thread_fence(std::memory_order_acquire); + return true; + } + assert(elementsCompletelyDequeued.load(std::memory_order_relaxed) <= BLOCK_SIZE); + return false; + } + } + + // Returns true if the block is now empty (does not apply in explicit context) + template + inline bool set_empty(MOODYCAMEL_MAYBE_UNUSED index_t i) + { + MOODYCAMEL_CONSTEXPR_IF (context == explicit_context && BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) { + // Set flag + assert(!emptyFlags[BLOCK_SIZE - 1 - static_cast(i & static_cast(BLOCK_SIZE - 1))].load(std::memory_order_relaxed)); + emptyFlags[BLOCK_SIZE - 1 - static_cast(i & static_cast(BLOCK_SIZE - 1))].store(true, std::memory_order_release); + return false; + } + else { + // Increment counter + auto prevVal = elementsCompletelyDequeued.fetch_add(1, std::memory_order_release); + assert(prevVal < BLOCK_SIZE); + return prevVal == BLOCK_SIZE - 1; + } + } + + // Sets multiple contiguous item statuses to 'empty' (assumes no wrapping and count > 0). + // Returns true if the block is now empty (does not apply in explicit context). 
+ template + inline bool set_many_empty(MOODYCAMEL_MAYBE_UNUSED index_t i, size_t count) + { + MOODYCAMEL_CONSTEXPR_IF (context == explicit_context && BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) { + // Set flags + std::atomic_thread_fence(std::memory_order_release); + i = BLOCK_SIZE - 1 - static_cast(i & static_cast(BLOCK_SIZE - 1)) - count + 1; + for (size_t j = 0; j != count; ++j) { + assert(!emptyFlags[i + j].load(std::memory_order_relaxed)); + emptyFlags[i + j].store(true, std::memory_order_relaxed); + } + return false; + } + else { + // Increment counter + auto prevVal = elementsCompletelyDequeued.fetch_add(count, std::memory_order_release); + assert(prevVal + count <= BLOCK_SIZE); + return prevVal + count == BLOCK_SIZE; + } + } + + template + inline void set_all_empty() + { + MOODYCAMEL_CONSTEXPR_IF (context == explicit_context && BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) { + // Set all flags + for (size_t i = 0; i != BLOCK_SIZE; ++i) { + emptyFlags[i].store(true, std::memory_order_relaxed); + } + } + else { + // Reset counter + elementsCompletelyDequeued.store(BLOCK_SIZE, std::memory_order_relaxed); + } + } + + template + inline void reset_empty() + { + MOODYCAMEL_CONSTEXPR_IF (context == explicit_context && BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) { + // Reset flags + for (size_t i = 0; i != BLOCK_SIZE; ++i) { + emptyFlags[i].store(false, std::memory_order_relaxed); + } + } + else { + // Reset counter + elementsCompletelyDequeued.store(0, std::memory_order_relaxed); + } + } + + inline T* operator[](index_t idx) MOODYCAMEL_NOEXCEPT { return static_cast(static_cast(elements)) + static_cast(idx & static_cast(BLOCK_SIZE - 1)); } + inline T const* operator[](index_t idx) const MOODYCAMEL_NOEXCEPT { return static_cast(static_cast(elements)) + static_cast(idx & static_cast(BLOCK_SIZE - 1)); } + + private: + static_assert(std::alignment_of::value <= sizeof(T), "The queue does not support types with an alignment greater than their 
size at this time"); + MOODYCAMEL_ALIGNED_TYPE_LIKE(char[sizeof(T) * BLOCK_SIZE], T) elements; + public: + Block* next; + std::atomic elementsCompletelyDequeued; + std::atomic emptyFlags[BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD ? BLOCK_SIZE : 1]; + public: + std::atomic freeListRefs; + std::atomic freeListNext; + std::atomic shouldBeOnFreeList; + bool dynamicallyAllocated; // Perhaps a better name for this would be 'isNotPartOfInitialBlockPool' + +#ifdef MCDBGQ_TRACKMEM + void* owner; +#endif + }; + static_assert(std::alignment_of::value >= std::alignment_of::value, "Internal error: Blocks must be at least as aligned as the type they are wrapping"); + + +#ifdef MCDBGQ_TRACKMEM +public: + struct MemStats; +private: +#endif + + /////////////////////////// + // Producer base + /////////////////////////// + + struct ProducerBase : public details::ConcurrentQueueProducerTypelessBase + { + ProducerBase(ConcurrentQueue* parent_, bool isExplicit_) : + tailIndex(0), + headIndex(0), + dequeueOptimisticCount(0), + dequeueOvercommit(0), + tailBlock(nullptr), + isExplicit(isExplicit_), + parent(parent_) + { + } + + virtual ~ProducerBase() { } + + template + inline bool dequeue(U& element) + { + if (isExplicit) { + return static_cast(this)->dequeue(element); + } + else { + return static_cast(this)->dequeue(element); + } + } + + template + inline size_t dequeue_bulk(It& itemFirst, size_t max) + { + if (isExplicit) { + return static_cast(this)->dequeue_bulk(itemFirst, max); + } + else { + return static_cast(this)->dequeue_bulk(itemFirst, max); + } + } + + inline ProducerBase* next_prod() const { return static_cast(next); } + + inline size_t size_approx() const + { + auto tail = tailIndex.load(std::memory_order_relaxed); + auto head = headIndex.load(std::memory_order_relaxed); + return details::circular_less_than(head, tail) ? 
static_cast(tail - head) : 0; + } + + inline index_t getTail() const { return tailIndex.load(std::memory_order_relaxed); } + protected: + std::atomic tailIndex; // Where to enqueue to next + std::atomic headIndex; // Where to dequeue from next + + std::atomic dequeueOptimisticCount; + std::atomic dequeueOvercommit; + + Block* tailBlock; + + public: + bool isExplicit; + ConcurrentQueue* parent; + + protected: +#ifdef MCDBGQ_TRACKMEM + friend struct MemStats; +#endif + }; + + + /////////////////////////// + // Explicit queue + /////////////////////////// + + struct ExplicitProducer : public ProducerBase + { + explicit ExplicitProducer(ConcurrentQueue* parent_) : + ProducerBase(parent_, true), + blockIndex(nullptr), + pr_blockIndexSlotsUsed(0), + pr_blockIndexSize(EXPLICIT_INITIAL_INDEX_SIZE >> 1), + pr_blockIndexFront(0), + pr_blockIndexEntries(nullptr), + pr_blockIndexRaw(nullptr) + { + size_t poolBasedIndexSize = details::ceil_to_pow_2(parent_->initialBlockPoolSize) >> 1; + if (poolBasedIndexSize > pr_blockIndexSize) { + pr_blockIndexSize = poolBasedIndexSize; + } + + new_block_index(0); // This creates an index with double the number of current entries, i.e. EXPLICIT_INITIAL_INDEX_SIZE + } + + ~ExplicitProducer() + { + // Destruct any elements not yet dequeued. + // Since we're in the destructor, we can assume all elements + // are either completely dequeued or completely not (no halfways). 
+ if (this->tailBlock != nullptr) { // Note this means there must be a block index too + // First find the block that's partially dequeued, if any + Block* halfDequeuedBlock = nullptr; + if ((this->headIndex.load(std::memory_order_relaxed) & static_cast(BLOCK_SIZE - 1)) != 0) { + // The head's not on a block boundary, meaning a block somewhere is partially dequeued + // (or the head block is the tail block and was fully dequeued, but the head/tail are still not on a boundary) + size_t i = (pr_blockIndexFront - pr_blockIndexSlotsUsed) & (pr_blockIndexSize - 1); + while (details::circular_less_than(pr_blockIndexEntries[i].base + BLOCK_SIZE, this->headIndex.load(std::memory_order_relaxed))) { + i = (i + 1) & (pr_blockIndexSize - 1); + } + assert(details::circular_less_than(pr_blockIndexEntries[i].base, this->headIndex.load(std::memory_order_relaxed))); + halfDequeuedBlock = pr_blockIndexEntries[i].block; + } + + // Start at the head block (note the first line in the loop gives us the head from the tail on the first iteration) + auto block = this->tailBlock; + do { + block = block->next; + if (block->ConcurrentQueue::Block::template is_empty()) { + continue; + } + + size_t i = 0; // Offset into block + if (block == halfDequeuedBlock) { + i = static_cast(this->headIndex.load(std::memory_order_relaxed) & static_cast(BLOCK_SIZE - 1)); + } + + // Walk through all the items in the block; if this is the tail block, we need to stop when we reach the tail index + auto lastValidIndex = (this->tailIndex.load(std::memory_order_relaxed) & static_cast(BLOCK_SIZE - 1)) == 0 ? 
BLOCK_SIZE : static_cast(this->tailIndex.load(std::memory_order_relaxed) & static_cast(BLOCK_SIZE - 1)); + while (i != BLOCK_SIZE && (block != this->tailBlock || i != lastValidIndex)) { + (*block)[i++]->~T(); + } + } while (block != this->tailBlock); + } + + // Destroy all blocks that we own + if (this->tailBlock != nullptr) { + auto block = this->tailBlock; + do { + auto nextBlock = block->next; + if (block->dynamicallyAllocated) { + destroy(block); + } + else { + this->parent->add_block_to_free_list(block); + } + block = nextBlock; + } while (block != this->tailBlock); + } + + // Destroy the block indices + auto header = static_cast(pr_blockIndexRaw); + while (header != nullptr) { + auto prev = static_cast(header->prev); + header->~BlockIndexHeader(); + (Traits::free)(header); + header = prev; + } + } + + template + inline bool enqueue(U&& element) + { + index_t currentTailIndex = this->tailIndex.load(std::memory_order_relaxed); + index_t newTailIndex = 1 + currentTailIndex; + if ((currentTailIndex & static_cast(BLOCK_SIZE - 1)) == 0) { + // We reached the end of a block, start a new one + auto startBlock = this->tailBlock; + auto originalBlockIndexSlotsUsed = pr_blockIndexSlotsUsed; + if (this->tailBlock != nullptr && this->tailBlock->next->ConcurrentQueue::Block::template is_empty()) { + // We can re-use the block ahead of us, it's empty! + this->tailBlock = this->tailBlock->next; + this->tailBlock->ConcurrentQueue::Block::template reset_empty(); + + // We'll put the block on the block index (guaranteed to be room since we're conceptually removing the + // last block from it first -- except instead of removing then adding, we can just overwrite). + // Note that there must be a valid block index here, since even if allocation failed in the ctor, + // it would have been re-attempted when adding the first block to the queue; since there is such + // a block, a block index must have been successfully allocated. 
+ } + else { + // Whatever head value we see here is >= the last value we saw here (relatively), + // and <= its current value. Since we have the most recent tail, the head must be + // <= to it. + auto head = this->headIndex.load(std::memory_order_relaxed); + assert(!details::circular_less_than(currentTailIndex, head)); + if (!details::circular_less_than(head, currentTailIndex + BLOCK_SIZE) + || (MAX_SUBQUEUE_SIZE != details::const_numeric_max::value && (MAX_SUBQUEUE_SIZE == 0 || MAX_SUBQUEUE_SIZE - BLOCK_SIZE < currentTailIndex - head))) { + // We can't enqueue in another block because there's not enough leeway -- the + // tail could surpass the head by the time the block fills up! (Or we'll exceed + // the size limit, if the second part of the condition was true.) + return false; + } + // We're going to need a new block; check that the block index has room + if (pr_blockIndexRaw == nullptr || pr_blockIndexSlotsUsed == pr_blockIndexSize) { + // Hmm, the circular block index is already full -- we'll need + // to allocate a new index. Note pr_blockIndexRaw can only be nullptr if + // the initial allocation failed in the constructor. + + MOODYCAMEL_CONSTEXPR_IF (allocMode == CannotAlloc) { + return false; + } + else if (!new_block_index(pr_blockIndexSlotsUsed)) { + return false; + } + } + + // Insert a new block in the circular linked list + auto newBlock = this->parent->ConcurrentQueue::template requisition_block(); + if (newBlock == nullptr) { + return false; + } +#ifdef MCDBGQ_TRACKMEM + newBlock->owner = this; +#endif + newBlock->ConcurrentQueue::Block::template reset_empty(); + if (this->tailBlock == nullptr) { + newBlock->next = newBlock; + } + else { + newBlock->next = this->tailBlock->next; + this->tailBlock->next = newBlock; + } + this->tailBlock = newBlock; + ++pr_blockIndexSlotsUsed; + } + + MOODYCAMEL_CONSTEXPR_IF (!MOODYCAMEL_NOEXCEPT_CTOR(T, U, new (static_cast(nullptr)) T(std::forward(element)))) { + // The constructor may throw. 
We want the element not to appear in the queue in + // that case (without corrupting the queue): + MOODYCAMEL_TRY { + new ((*this->tailBlock)[currentTailIndex]) T(std::forward(element)); + } + MOODYCAMEL_CATCH (...) { + // Revert change to the current block, but leave the new block available + // for next time + pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed; + this->tailBlock = startBlock == nullptr ? this->tailBlock : startBlock; + MOODYCAMEL_RETHROW; + } + } + else { + (void)startBlock; + (void)originalBlockIndexSlotsUsed; + } + + // Add block to block index + auto& entry = blockIndex.load(std::memory_order_relaxed)->entries[pr_blockIndexFront]; + entry.base = currentTailIndex; + entry.block = this->tailBlock; + blockIndex.load(std::memory_order_relaxed)->front.store(pr_blockIndexFront, std::memory_order_release); + pr_blockIndexFront = (pr_blockIndexFront + 1) & (pr_blockIndexSize - 1); + + MOODYCAMEL_CONSTEXPR_IF (!MOODYCAMEL_NOEXCEPT_CTOR(T, U, new (static_cast(nullptr)) T(std::forward(element)))) { + this->tailIndex.store(newTailIndex, std::memory_order_release); + return true; + } + } + + // Enqueue + new ((*this->tailBlock)[currentTailIndex]) T(std::forward(element)); + + this->tailIndex.store(newTailIndex, std::memory_order_release); + return true; + } + + template + bool dequeue(U& element) + { + auto tail = this->tailIndex.load(std::memory_order_relaxed); + auto overcommit = this->dequeueOvercommit.load(std::memory_order_relaxed); + if (details::circular_less_than(this->dequeueOptimisticCount.load(std::memory_order_relaxed) - overcommit, tail)) { + // Might be something to dequeue, let's give it a try + + // Note that this if is purely for performance purposes in the common case when the queue is + // empty and the values are eventually consistent -- we may enter here spuriously. 
+ + // Note that whatever the values of overcommit and tail are, they are not going to change (unless we + // change them) and must be the same value at this point (inside the if) as when the if condition was + // evaluated. + + // We insert an acquire fence here to synchronize-with the release upon incrementing dequeueOvercommit below. + // This ensures that whatever the value we got loaded into overcommit, the load of dequeueOptimisticCount in + // the fetch_add below will result in a value at least as recent as that (and therefore at least as large). + // Note that I believe a compiler (signal) fence here would be sufficient due to the nature of fetch_add (all + // read-modify-write operations are guaranteed to work on the latest value in the modification order), but + // unfortunately that can't be shown to be correct using only the C++11 standard. + // See http://stackoverflow.com/questions/18223161/what-are-the-c11-memory-ordering-guarantees-in-this-corner-case + std::atomic_thread_fence(std::memory_order_acquire); + + // Increment optimistic counter, then check if it went over the boundary + auto myDequeueCount = this->dequeueOptimisticCount.fetch_add(1, std::memory_order_relaxed); + + // Note that since dequeueOvercommit must be <= dequeueOptimisticCount (because dequeueOvercommit is only ever + // incremented after dequeueOptimisticCount -- this is enforced in the `else` block below), and since we now + // have a version of dequeueOptimisticCount that is at least as recent as overcommit (due to the release upon + // incrementing dequeueOvercommit and the acquire above that synchronizes with it), overcommit <= myDequeueCount. + // However, we can't assert this since both dequeueOptimisticCount and dequeueOvercommit may (independently) + // overflow; in such a case, though, the logic still holds since the difference between the two is maintained. 
+ + // Note that we reload tail here in case it changed; it will be the same value as before or greater, since + // this load is sequenced after (happens after) the earlier load above. This is supported by read-read + // coherency (as defined in the standard), explained here: http://en.cppreference.com/w/cpp/atomic/memory_order + tail = this->tailIndex.load(std::memory_order_acquire); + if ((details::likely)(details::circular_less_than(myDequeueCount - overcommit, tail))) { + // Guaranteed to be at least one element to dequeue! + + // Get the index. Note that since there's guaranteed to be at least one element, this + // will never exceed tail. We need to do an acquire-release fence here since it's possible + // that whatever condition got us to this point was for an earlier enqueued element (that + // we already see the memory effects for), but that by the time we increment somebody else + // has incremented it, and we need to see the memory effects for *that* element, which is + // in such a case is necessarily visible on the thread that incremented it in the first + // place with the more current condition (they must have acquired a tail that is at least + // as recent). + auto index = this->headIndex.fetch_add(1, std::memory_order_acq_rel); + + + // Determine which block the element is in + + auto localBlockIndex = blockIndex.load(std::memory_order_acquire); + auto localBlockIndexHead = localBlockIndex->front.load(std::memory_order_acquire); + + // We need to be careful here about subtracting and dividing because of index wrap-around. 
+ // When an index wraps, we need to preserve the sign of the offset when dividing it by the + // block size (in order to get a correct signed block count offset in all cases): + auto headBase = localBlockIndex->entries[localBlockIndexHead].base; + auto blockBaseIndex = index & ~static_cast(BLOCK_SIZE - 1); + auto offset = static_cast(static_cast::type>(blockBaseIndex - headBase) / BLOCK_SIZE); + auto block = localBlockIndex->entries[(localBlockIndexHead + offset) & (localBlockIndex->size - 1)].block; + + // Dequeue + auto& el = *((*block)[index]); + if (!MOODYCAMEL_NOEXCEPT_ASSIGN(T, T&&, element = std::move(el))) { + // Make sure the element is still fully dequeued and destroyed even if the assignment + // throws + struct Guard { + Block* block; + index_t index; + + ~Guard() + { + (*block)[index]->~T(); + block->ConcurrentQueue::Block::template set_empty(index); + } + } guard = { block, index }; + + element = std::move(el); // NOLINT + } + else { + element = std::move(el); // NOLINT + el.~T(); // NOLINT + block->ConcurrentQueue::Block::template set_empty(index); + } + + return true; + } + else { + // Wasn't anything to dequeue after all; make the effective dequeue count eventually consistent + this->dequeueOvercommit.fetch_add(1, std::memory_order_release); // Release so that the fetch_add on dequeueOptimisticCount is guaranteed to happen before this write + } + } + + return false; + } + + template + bool MOODYCAMEL_NO_TSAN enqueue_bulk(It itemFirst, size_t count) + { + // First, we need to make sure we have enough room to enqueue all of the elements; + // this means pre-allocating blocks and putting them in the block index (but only if + // all the allocations succeeded). 
+ index_t startTailIndex = this->tailIndex.load(std::memory_order_relaxed); + auto startBlock = this->tailBlock; + auto originalBlockIndexFront = pr_blockIndexFront; + auto originalBlockIndexSlotsUsed = pr_blockIndexSlotsUsed; + + Block* firstAllocatedBlock = nullptr; + + // Figure out how many blocks we'll need to allocate, and do so + size_t blockBaseDiff = ((startTailIndex + count - 1) & ~static_cast(BLOCK_SIZE - 1)) - ((startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1)); + index_t currentTailIndex = (startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1); + if (blockBaseDiff > 0) { + // Allocate as many blocks as possible from ahead + while (blockBaseDiff > 0 && this->tailBlock != nullptr && this->tailBlock->next != firstAllocatedBlock && this->tailBlock->next->ConcurrentQueue::Block::template is_empty()) { + blockBaseDiff -= static_cast(BLOCK_SIZE); + currentTailIndex += static_cast(BLOCK_SIZE); + + this->tailBlock = this->tailBlock->next; + firstAllocatedBlock = firstAllocatedBlock == nullptr ? 
this->tailBlock : firstAllocatedBlock; + + auto& entry = blockIndex.load(std::memory_order_relaxed)->entries[pr_blockIndexFront]; + entry.base = currentTailIndex; + entry.block = this->tailBlock; + pr_blockIndexFront = (pr_blockIndexFront + 1) & (pr_blockIndexSize - 1); + } + + // Now allocate as many blocks as necessary from the block pool + while (blockBaseDiff > 0) { + blockBaseDiff -= static_cast(BLOCK_SIZE); + currentTailIndex += static_cast(BLOCK_SIZE); + + auto head = this->headIndex.load(std::memory_order_relaxed); + assert(!details::circular_less_than(currentTailIndex, head)); + bool full = !details::circular_less_than(head, currentTailIndex + BLOCK_SIZE) || (MAX_SUBQUEUE_SIZE != details::const_numeric_max::value && (MAX_SUBQUEUE_SIZE == 0 || MAX_SUBQUEUE_SIZE - BLOCK_SIZE < currentTailIndex - head)); + if (pr_blockIndexRaw == nullptr || pr_blockIndexSlotsUsed == pr_blockIndexSize || full) { + MOODYCAMEL_CONSTEXPR_IF (allocMode == CannotAlloc) { + // Failed to allocate, undo changes (but keep injected blocks) + pr_blockIndexFront = originalBlockIndexFront; + pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed; + this->tailBlock = startBlock == nullptr ? firstAllocatedBlock : startBlock; + return false; + } + else if (full || !new_block_index(originalBlockIndexSlotsUsed)) { + // Failed to allocate, undo changes (but keep injected blocks) + pr_blockIndexFront = originalBlockIndexFront; + pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed; + this->tailBlock = startBlock == nullptr ? 
firstAllocatedBlock : startBlock; + return false; + } + + // pr_blockIndexFront is updated inside new_block_index, so we need to + // update our fallback value too (since we keep the new index even if we + // later fail) + originalBlockIndexFront = originalBlockIndexSlotsUsed; + } + + // Insert a new block in the circular linked list + auto newBlock = this->parent->ConcurrentQueue::template requisition_block(); + if (newBlock == nullptr) { + pr_blockIndexFront = originalBlockIndexFront; + pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed; + this->tailBlock = startBlock == nullptr ? firstAllocatedBlock : startBlock; + return false; + } + +#ifdef MCDBGQ_TRACKMEM + newBlock->owner = this; +#endif + newBlock->ConcurrentQueue::Block::template set_all_empty(); + if (this->tailBlock == nullptr) { + newBlock->next = newBlock; + } + else { + newBlock->next = this->tailBlock->next; + this->tailBlock->next = newBlock; + } + this->tailBlock = newBlock; + firstAllocatedBlock = firstAllocatedBlock == nullptr ? this->tailBlock : firstAllocatedBlock; + + ++pr_blockIndexSlotsUsed; + + auto& entry = blockIndex.load(std::memory_order_relaxed)->entries[pr_blockIndexFront]; + entry.base = currentTailIndex; + entry.block = this->tailBlock; + pr_blockIndexFront = (pr_blockIndexFront + 1) & (pr_blockIndexSize - 1); + } + + // Excellent, all allocations succeeded. 
Reset each block's emptiness before we fill them up, and + // publish the new block index front + auto block = firstAllocatedBlock; + while (true) { + block->ConcurrentQueue::Block::template reset_empty(); + if (block == this->tailBlock) { + break; + } + block = block->next; + } + + MOODYCAMEL_CONSTEXPR_IF (MOODYCAMEL_NOEXCEPT_CTOR(T, decltype(*itemFirst), new (static_cast(nullptr)) T(details::deref_noexcept(itemFirst)))) { + blockIndex.load(std::memory_order_relaxed)->front.store((pr_blockIndexFront - 1) & (pr_blockIndexSize - 1), std::memory_order_release); + } + } + + // Enqueue, one block at a time + index_t newTailIndex = startTailIndex + static_cast(count); + currentTailIndex = startTailIndex; + auto endBlock = this->tailBlock; + this->tailBlock = startBlock; + assert((startTailIndex & static_cast(BLOCK_SIZE - 1)) != 0 || firstAllocatedBlock != nullptr || count == 0); + if ((startTailIndex & static_cast(BLOCK_SIZE - 1)) == 0 && firstAllocatedBlock != nullptr) { + this->tailBlock = firstAllocatedBlock; + } + while (true) { + index_t stopIndex = (currentTailIndex & ~static_cast(BLOCK_SIZE - 1)) + static_cast(BLOCK_SIZE); + if (details::circular_less_than(newTailIndex, stopIndex)) { + stopIndex = newTailIndex; + } + MOODYCAMEL_CONSTEXPR_IF (MOODYCAMEL_NOEXCEPT_CTOR(T, decltype(*itemFirst), new (static_cast(nullptr)) T(details::deref_noexcept(itemFirst)))) { + while (currentTailIndex != stopIndex) { + new ((*this->tailBlock)[currentTailIndex++]) T(*itemFirst++); + } + } + else { + MOODYCAMEL_TRY { + while (currentTailIndex != stopIndex) { + // Must use copy constructor even if move constructor is available + // because we may have to revert if there's an exception. 
+ // Sorry about the horrible templated next line, but it was the only way + // to disable moving *at compile time*, which is important because a type + // may only define a (noexcept) move constructor, and so calls to the + // cctor will not compile, even if they are in an if branch that will never + // be executed + new ((*this->tailBlock)[currentTailIndex]) T(details::nomove_if(nullptr)) T(details::deref_noexcept(itemFirst)))>::eval(*itemFirst)); + ++currentTailIndex; + ++itemFirst; + } + } + MOODYCAMEL_CATCH (...) { + // Oh dear, an exception's been thrown -- destroy the elements that + // were enqueued so far and revert the entire bulk operation (we'll keep + // any allocated blocks in our linked list for later, though). + auto constructedStopIndex = currentTailIndex; + auto lastBlockEnqueued = this->tailBlock; + + pr_blockIndexFront = originalBlockIndexFront; + pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed; + this->tailBlock = startBlock == nullptr ? firstAllocatedBlock : startBlock; + + if (!details::is_trivially_destructible::value) { + auto block = startBlock; + if ((startTailIndex & static_cast(BLOCK_SIZE - 1)) == 0) { + block = firstAllocatedBlock; + } + currentTailIndex = startTailIndex; + while (true) { + stopIndex = (currentTailIndex & ~static_cast(BLOCK_SIZE - 1)) + static_cast(BLOCK_SIZE); + if (details::circular_less_than(constructedStopIndex, stopIndex)) { + stopIndex = constructedStopIndex; + } + while (currentTailIndex != stopIndex) { + (*block)[currentTailIndex++]->~T(); + } + if (block == lastBlockEnqueued) { + break; + } + block = block->next; + } + } + MOODYCAMEL_RETHROW; + } + } + + if (this->tailBlock == endBlock) { + assert(currentTailIndex == newTailIndex); + break; + } + this->tailBlock = this->tailBlock->next; + } + + MOODYCAMEL_CONSTEXPR_IF (!MOODYCAMEL_NOEXCEPT_CTOR(T, decltype(*itemFirst), new (static_cast(nullptr)) T(details::deref_noexcept(itemFirst)))) { + if (firstAllocatedBlock != nullptr) + 
blockIndex.load(std::memory_order_relaxed)->front.store((pr_blockIndexFront - 1) & (pr_blockIndexSize - 1), std::memory_order_release); + } + + this->tailIndex.store(newTailIndex, std::memory_order_release); + return true; + } + + template + size_t dequeue_bulk(It& itemFirst, size_t max) + { + auto tail = this->tailIndex.load(std::memory_order_relaxed); + auto overcommit = this->dequeueOvercommit.load(std::memory_order_relaxed); + auto desiredCount = static_cast(tail - (this->dequeueOptimisticCount.load(std::memory_order_relaxed) - overcommit)); + if (details::circular_less_than(0, desiredCount)) { + desiredCount = desiredCount < max ? desiredCount : max; + std::atomic_thread_fence(std::memory_order_acquire); + + auto myDequeueCount = this->dequeueOptimisticCount.fetch_add(desiredCount, std::memory_order_relaxed); + + tail = this->tailIndex.load(std::memory_order_acquire); + auto actualCount = static_cast(tail - (myDequeueCount - overcommit)); + if (details::circular_less_than(0, actualCount)) { + actualCount = desiredCount < actualCount ? desiredCount : actualCount; + if (actualCount < desiredCount) { + this->dequeueOvercommit.fetch_add(desiredCount - actualCount, std::memory_order_release); + } + + // Get the first index. Note that since there's guaranteed to be at least actualCount elements, this + // will never exceed tail. 
+ auto firstIndex = this->headIndex.fetch_add(actualCount, std::memory_order_acq_rel); + + // Determine which block the first element is in + auto localBlockIndex = blockIndex.load(std::memory_order_acquire); + auto localBlockIndexHead = localBlockIndex->front.load(std::memory_order_acquire); + + auto headBase = localBlockIndex->entries[localBlockIndexHead].base; + auto firstBlockBaseIndex = firstIndex & ~static_cast(BLOCK_SIZE - 1); + auto offset = static_cast(static_cast::type>(firstBlockBaseIndex - headBase) / BLOCK_SIZE); + auto indexIndex = (localBlockIndexHead + offset) & (localBlockIndex->size - 1); + + // Iterate the blocks and dequeue + auto index = firstIndex; + do { + auto firstIndexInBlock = index; + index_t endIndex = (index & ~static_cast(BLOCK_SIZE - 1)) + static_cast(BLOCK_SIZE); + endIndex = details::circular_less_than(firstIndex + static_cast(actualCount), endIndex) ? firstIndex + static_cast(actualCount) : endIndex; + auto block = localBlockIndex->entries[indexIndex].block; + if (MOODYCAMEL_NOEXCEPT_ASSIGN(T, T&&, details::deref_noexcept(itemFirst) = std::move((*(*block)[index])))) { + while (index != endIndex) { + auto& el = *((*block)[index]); + *itemFirst++ = std::move(el); + el.~T(); + ++index; + } + } + else { + MOODYCAMEL_TRY { + while (index != endIndex) { + auto& el = *((*block)[index]); + *itemFirst = std::move(el); + ++itemFirst; + el.~T(); + ++index; + } + } + MOODYCAMEL_CATCH (...) 
{ + // It's too late to revert the dequeue, but we can make sure that all + // the dequeued objects are properly destroyed and the block index + // (and empty count) are properly updated before we propagate the exception + do { + block = localBlockIndex->entries[indexIndex].block; + while (index != endIndex) { + (*block)[index++]->~T(); + } + block->ConcurrentQueue::Block::template set_many_empty(firstIndexInBlock, static_cast(endIndex - firstIndexInBlock)); + indexIndex = (indexIndex + 1) & (localBlockIndex->size - 1); + + firstIndexInBlock = index; + endIndex = (index & ~static_cast(BLOCK_SIZE - 1)) + static_cast(BLOCK_SIZE); + endIndex = details::circular_less_than(firstIndex + static_cast(actualCount), endIndex) ? firstIndex + static_cast(actualCount) : endIndex; + } while (index != firstIndex + actualCount); + + MOODYCAMEL_RETHROW; + } + } + block->ConcurrentQueue::Block::template set_many_empty(firstIndexInBlock, static_cast(endIndex - firstIndexInBlock)); + indexIndex = (indexIndex + 1) & (localBlockIndex->size - 1); + } while (index != firstIndex + actualCount); + + return actualCount; + } + else { + // Wasn't anything to dequeue after all; make the effective dequeue count eventually consistent + this->dequeueOvercommit.fetch_add(desiredCount, std::memory_order_release); + } + } + + return 0; + } + + private: + struct BlockIndexEntry + { + index_t base; + Block* block; + }; + + struct BlockIndexHeader + { + size_t size; + std::atomic front; // Current slot (not next, like pr_blockIndexFront) + BlockIndexEntry* entries; + void* prev; + }; + + + bool new_block_index(size_t numberOfFilledSlotsToExpose) + { + auto prevBlockSizeMask = pr_blockIndexSize - 1; + + // Create the new block + pr_blockIndexSize <<= 1; + auto newRawPtr = static_cast((Traits::malloc)(sizeof(BlockIndexHeader) + std::alignment_of::value - 1 + sizeof(BlockIndexEntry) * pr_blockIndexSize)); + if (newRawPtr == nullptr) { + pr_blockIndexSize >>= 1; // Reset to allow graceful retry + return 
false; + } + + auto newBlockIndexEntries = reinterpret_cast(details::align_for(newRawPtr + sizeof(BlockIndexHeader))); + + // Copy in all the old indices, if any + size_t j = 0; + if (pr_blockIndexSlotsUsed != 0) { + auto i = (pr_blockIndexFront - pr_blockIndexSlotsUsed) & prevBlockSizeMask; + do { + newBlockIndexEntries[j++] = pr_blockIndexEntries[i]; + i = (i + 1) & prevBlockSizeMask; + } while (i != pr_blockIndexFront); + } + + // Update everything + auto header = new (newRawPtr) BlockIndexHeader; + header->size = pr_blockIndexSize; + header->front.store(numberOfFilledSlotsToExpose - 1, std::memory_order_relaxed); + header->entries = newBlockIndexEntries; + header->prev = pr_blockIndexRaw; // we link the new block to the old one so we can free it later + + pr_blockIndexFront = j; + pr_blockIndexEntries = newBlockIndexEntries; + pr_blockIndexRaw = newRawPtr; + blockIndex.store(header, std::memory_order_release); + + return true; + } + + private: + std::atomic blockIndex; + + // To be used by producer only -- consumer must use the ones in referenced by blockIndex + size_t pr_blockIndexSlotsUsed; + size_t pr_blockIndexSize; + size_t pr_blockIndexFront; // Next slot (not current) + BlockIndexEntry* pr_blockIndexEntries; + void* pr_blockIndexRaw; + +#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG + public: + ExplicitProducer* nextExplicitProducer; + private: +#endif + +#ifdef MCDBGQ_TRACKMEM + friend struct MemStats; +#endif + }; + + + ////////////////////////////////// + // Implicit queue + ////////////////////////////////// + + struct ImplicitProducer : public ProducerBase + { + ImplicitProducer(ConcurrentQueue* parent_) : + ProducerBase(parent_, false), + nextBlockIndexCapacity(IMPLICIT_INITIAL_INDEX_SIZE), + blockIndex(nullptr) + { + new_block_index(); + } + + ~ImplicitProducer() + { + // Note that since we're in the destructor we can assume that all enqueue/dequeue operations + // completed already; this means that all undequeued elements are placed contiguously across + 
// contiguous blocks, and that only the first and last remaining blocks can be only partially + // empty (all other remaining blocks must be completely full). + +#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED + // Unregister ourselves for thread termination notification + if (!this->inactive.load(std::memory_order_relaxed)) { + details::ThreadExitNotifier::unsubscribe(&threadExitListener); + } +#endif + + // Destroy all remaining elements! + auto tail = this->tailIndex.load(std::memory_order_relaxed); + auto index = this->headIndex.load(std::memory_order_relaxed); + Block* block = nullptr; + assert(index == tail || details::circular_less_than(index, tail)); + bool forceFreeLastBlock = index != tail; // If we enter the loop, then the last (tail) block will not be freed + while (index != tail) { + if ((index & static_cast(BLOCK_SIZE - 1)) == 0 || block == nullptr) { + if (block != nullptr) { + // Free the old block + this->parent->add_block_to_free_list(block); + } + + block = get_block_index_entry_for_index(index)->value.load(std::memory_order_relaxed); + } + + ((*block)[index])->~T(); + ++index; + } + // Even if the queue is empty, there's still one block that's not on the free list + // (unless the head index reached the end of it, in which case the tail will be poised + // to create a new block). 
+ if (this->tailBlock != nullptr && (forceFreeLastBlock || (tail & static_cast(BLOCK_SIZE - 1)) != 0)) { + this->parent->add_block_to_free_list(this->tailBlock); + } + + // Destroy block index + auto localBlockIndex = blockIndex.load(std::memory_order_relaxed); + if (localBlockIndex != nullptr) { + for (size_t i = 0; i != localBlockIndex->capacity; ++i) { + localBlockIndex->index[i]->~BlockIndexEntry(); + } + do { + auto prev = localBlockIndex->prev; + localBlockIndex->~BlockIndexHeader(); + (Traits::free)(localBlockIndex); + localBlockIndex = prev; + } while (localBlockIndex != nullptr); + } + } + + template + inline bool enqueue(U&& element) + { + index_t currentTailIndex = this->tailIndex.load(std::memory_order_relaxed); + index_t newTailIndex = 1 + currentTailIndex; + if ((currentTailIndex & static_cast(BLOCK_SIZE - 1)) == 0) { + // We reached the end of a block, start a new one + auto head = this->headIndex.load(std::memory_order_relaxed); + assert(!details::circular_less_than(currentTailIndex, head)); + if (!details::circular_less_than(head, currentTailIndex + BLOCK_SIZE) || (MAX_SUBQUEUE_SIZE != details::const_numeric_max::value && (MAX_SUBQUEUE_SIZE == 0 || MAX_SUBQUEUE_SIZE - BLOCK_SIZE < currentTailIndex - head))) { + return false; + } +#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX + debug::DebugLock lock(mutex); +#endif + // Find out where we'll be inserting this block in the block index + BlockIndexEntry* idxEntry; + if (!insert_block_index_entry(idxEntry, currentTailIndex)) { + return false; + } + + // Get ahold of a new block + auto newBlock = this->parent->ConcurrentQueue::template requisition_block(); + if (newBlock == nullptr) { + rewind_block_index_tail(); + idxEntry->value.store(nullptr, std::memory_order_relaxed); + return false; + } +#ifdef MCDBGQ_TRACKMEM + newBlock->owner = this; +#endif + newBlock->ConcurrentQueue::Block::template reset_empty(); + + MOODYCAMEL_CONSTEXPR_IF (!MOODYCAMEL_NOEXCEPT_CTOR(T, U, new (static_cast(nullptr)) 
T(std::forward(element)))) { + // May throw, try to insert now before we publish the fact that we have this new block + MOODYCAMEL_TRY { + new ((*newBlock)[currentTailIndex]) T(std::forward(element)); + } + MOODYCAMEL_CATCH (...) { + rewind_block_index_tail(); + idxEntry->value.store(nullptr, std::memory_order_relaxed); + this->parent->add_block_to_free_list(newBlock); + MOODYCAMEL_RETHROW; + } + } + + // Insert the new block into the index + idxEntry->value.store(newBlock, std::memory_order_relaxed); + + this->tailBlock = newBlock; + + MOODYCAMEL_CONSTEXPR_IF (!MOODYCAMEL_NOEXCEPT_CTOR(T, U, new (static_cast(nullptr)) T(std::forward(element)))) { + this->tailIndex.store(newTailIndex, std::memory_order_release); + return true; + } + } + + // Enqueue + new ((*this->tailBlock)[currentTailIndex]) T(std::forward(element)); + + this->tailIndex.store(newTailIndex, std::memory_order_release); + return true; + } + + template + bool dequeue(U& element) + { + // See ExplicitProducer::dequeue for rationale and explanation + index_t tail = this->tailIndex.load(std::memory_order_relaxed); + index_t overcommit = this->dequeueOvercommit.load(std::memory_order_relaxed); + if (details::circular_less_than(this->dequeueOptimisticCount.load(std::memory_order_relaxed) - overcommit, tail)) { + std::atomic_thread_fence(std::memory_order_acquire); + + index_t myDequeueCount = this->dequeueOptimisticCount.fetch_add(1, std::memory_order_relaxed); + tail = this->tailIndex.load(std::memory_order_acquire); + if ((details::likely)(details::circular_less_than(myDequeueCount - overcommit, tail))) { + index_t index = this->headIndex.fetch_add(1, std::memory_order_acq_rel); + + // Determine which block the element is in + auto entry = get_block_index_entry_for_index(index); + + // Dequeue + auto block = entry->value.load(std::memory_order_relaxed); + auto& el = *((*block)[index]); + + if (!MOODYCAMEL_NOEXCEPT_ASSIGN(T, T&&, element = std::move(el))) { +#ifdef 
MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX + // Note: Acquiring the mutex with every dequeue instead of only when a block + // is released is very sub-optimal, but it is, after all, purely debug code. + debug::DebugLock lock(producer->mutex); +#endif + struct Guard { + Block* block; + index_t index; + BlockIndexEntry* entry; + ConcurrentQueue* parent; + + ~Guard() + { + (*block)[index]->~T(); + if (block->ConcurrentQueue::Block::template set_empty(index)) { + entry->value.store(nullptr, std::memory_order_relaxed); + parent->add_block_to_free_list(block); + } + } + } guard = { block, index, entry, this->parent }; + + element = std::move(el); // NOLINT + } + else { + element = std::move(el); // NOLINT + el.~T(); // NOLINT + + if (block->ConcurrentQueue::Block::template set_empty(index)) { + { +#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX + debug::DebugLock lock(mutex); +#endif + // Add the block back into the global free pool (and remove from block index) + entry->value.store(nullptr, std::memory_order_relaxed); + } + this->parent->add_block_to_free_list(block); // releases the above store + } + } + + return true; + } + else { + this->dequeueOvercommit.fetch_add(1, std::memory_order_release); + } + } + + return false; + } + +#ifdef _MSC_VER +#pragma warning(push) +#pragma warning(disable: 4706) // assignment within conditional expression +#endif + template + bool enqueue_bulk(It itemFirst, size_t count) + { + // First, we need to make sure we have enough room to enqueue all of the elements; + // this means pre-allocating blocks and putting them in the block index (but only if + // all the allocations succeeded). + + // Note that the tailBlock we start off with may not be owned by us any more; + // this happens if it was filled up exactly to the top (setting tailIndex to + // the first index of the next block which is not yet allocated), then dequeued + // completely (putting it on the free list) before we enqueue again. 
+ + index_t startTailIndex = this->tailIndex.load(std::memory_order_relaxed); + auto startBlock = this->tailBlock; + Block* firstAllocatedBlock = nullptr; + auto endBlock = this->tailBlock; + + // Figure out how many blocks we'll need to allocate, and do so + size_t blockBaseDiff = ((startTailIndex + count - 1) & ~static_cast(BLOCK_SIZE - 1)) - ((startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1)); + index_t currentTailIndex = (startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1); + if (blockBaseDiff > 0) { +#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX + debug::DebugLock lock(mutex); +#endif + do { + blockBaseDiff -= static_cast(BLOCK_SIZE); + currentTailIndex += static_cast(BLOCK_SIZE); + + // Find out where we'll be inserting this block in the block index + BlockIndexEntry* idxEntry = nullptr; // initialization here unnecessary but compiler can't always tell + Block* newBlock; + bool indexInserted = false; + auto head = this->headIndex.load(std::memory_order_relaxed); + assert(!details::circular_less_than(currentTailIndex, head)); + bool full = !details::circular_less_than(head, currentTailIndex + BLOCK_SIZE) || (MAX_SUBQUEUE_SIZE != details::const_numeric_max::value && (MAX_SUBQUEUE_SIZE == 0 || MAX_SUBQUEUE_SIZE - BLOCK_SIZE < currentTailIndex - head)); + + if (full || !(indexInserted = insert_block_index_entry(idxEntry, currentTailIndex)) || (newBlock = this->parent->ConcurrentQueue::template requisition_block()) == nullptr) { + // Index allocation or block allocation failed; revert any other allocations + // and index insertions done so far for this operation + if (indexInserted) { + rewind_block_index_tail(); + idxEntry->value.store(nullptr, std::memory_order_relaxed); + } + currentTailIndex = (startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1); + for (auto block = firstAllocatedBlock; block != nullptr; block = block->next) { + currentTailIndex += static_cast(BLOCK_SIZE); + idxEntry = get_block_index_entry_for_index(currentTailIndex); + 
idxEntry->value.store(nullptr, std::memory_order_relaxed); + rewind_block_index_tail(); + } + this->parent->add_blocks_to_free_list(firstAllocatedBlock); + this->tailBlock = startBlock; + + return false; + } + +#ifdef MCDBGQ_TRACKMEM + newBlock->owner = this; +#endif + newBlock->ConcurrentQueue::Block::template reset_empty(); + newBlock->next = nullptr; + + // Insert the new block into the index + idxEntry->value.store(newBlock, std::memory_order_relaxed); + + // Store the chain of blocks so that we can undo if later allocations fail, + // and so that we can find the blocks when we do the actual enqueueing + if ((startTailIndex & static_cast(BLOCK_SIZE - 1)) != 0 || firstAllocatedBlock != nullptr) { + assert(this->tailBlock != nullptr); + this->tailBlock->next = newBlock; + } + this->tailBlock = newBlock; + endBlock = newBlock; + firstAllocatedBlock = firstAllocatedBlock == nullptr ? newBlock : firstAllocatedBlock; + } while (blockBaseDiff > 0); + } + + // Enqueue, one block at a time + index_t newTailIndex = startTailIndex + static_cast(count); + currentTailIndex = startTailIndex; + this->tailBlock = startBlock; + assert((startTailIndex & static_cast(BLOCK_SIZE - 1)) != 0 || firstAllocatedBlock != nullptr || count == 0); + if ((startTailIndex & static_cast(BLOCK_SIZE - 1)) == 0 && firstAllocatedBlock != nullptr) { + this->tailBlock = firstAllocatedBlock; + } + while (true) { + index_t stopIndex = (currentTailIndex & ~static_cast(BLOCK_SIZE - 1)) + static_cast(BLOCK_SIZE); + if (details::circular_less_than(newTailIndex, stopIndex)) { + stopIndex = newTailIndex; + } + MOODYCAMEL_CONSTEXPR_IF (MOODYCAMEL_NOEXCEPT_CTOR(T, decltype(*itemFirst), new (static_cast(nullptr)) T(details::deref_noexcept(itemFirst)))) { + while (currentTailIndex != stopIndex) { + new ((*this->tailBlock)[currentTailIndex++]) T(*itemFirst++); + } + } + else { + MOODYCAMEL_TRY { + while (currentTailIndex != stopIndex) { + new ((*this->tailBlock)[currentTailIndex]) T(details::nomove_if(nullptr)) 
T(details::deref_noexcept(itemFirst)))>::eval(*itemFirst)); + ++currentTailIndex; + ++itemFirst; + } + } + MOODYCAMEL_CATCH (...) { + auto constructedStopIndex = currentTailIndex; + auto lastBlockEnqueued = this->tailBlock; + + if (!details::is_trivially_destructible::value) { + auto block = startBlock; + if ((startTailIndex & static_cast(BLOCK_SIZE - 1)) == 0) { + block = firstAllocatedBlock; + } + currentTailIndex = startTailIndex; + while (true) { + stopIndex = (currentTailIndex & ~static_cast(BLOCK_SIZE - 1)) + static_cast(BLOCK_SIZE); + if (details::circular_less_than(constructedStopIndex, stopIndex)) { + stopIndex = constructedStopIndex; + } + while (currentTailIndex != stopIndex) { + (*block)[currentTailIndex++]->~T(); + } + if (block == lastBlockEnqueued) { + break; + } + block = block->next; + } + } + + currentTailIndex = (startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1); + for (auto block = firstAllocatedBlock; block != nullptr; block = block->next) { + currentTailIndex += static_cast(BLOCK_SIZE); + auto idxEntry = get_block_index_entry_for_index(currentTailIndex); + idxEntry->value.store(nullptr, std::memory_order_relaxed); + rewind_block_index_tail(); + } + this->parent->add_blocks_to_free_list(firstAllocatedBlock); + this->tailBlock = startBlock; + MOODYCAMEL_RETHROW; + } + } + + if (this->tailBlock == endBlock) { + assert(currentTailIndex == newTailIndex); + break; + } + this->tailBlock = this->tailBlock->next; + } + this->tailIndex.store(newTailIndex, std::memory_order_release); + return true; + } +#ifdef _MSC_VER +#pragma warning(pop) +#endif + + template + size_t dequeue_bulk(It& itemFirst, size_t max) + { + auto tail = this->tailIndex.load(std::memory_order_relaxed); + auto overcommit = this->dequeueOvercommit.load(std::memory_order_relaxed); + auto desiredCount = static_cast(tail - (this->dequeueOptimisticCount.load(std::memory_order_relaxed) - overcommit)); + if (details::circular_less_than(0, desiredCount)) { + desiredCount = desiredCount < 
max ? desiredCount : max; + std::atomic_thread_fence(std::memory_order_acquire); + + auto myDequeueCount = this->dequeueOptimisticCount.fetch_add(desiredCount, std::memory_order_relaxed); + + tail = this->tailIndex.load(std::memory_order_acquire); + auto actualCount = static_cast(tail - (myDequeueCount - overcommit)); + if (details::circular_less_than(0, actualCount)) { + actualCount = desiredCount < actualCount ? desiredCount : actualCount; + if (actualCount < desiredCount) { + this->dequeueOvercommit.fetch_add(desiredCount - actualCount, std::memory_order_release); + } + + // Get the first index. Note that since there's guaranteed to be at least actualCount elements, this + // will never exceed tail. + auto firstIndex = this->headIndex.fetch_add(actualCount, std::memory_order_acq_rel); + + // Iterate the blocks and dequeue + auto index = firstIndex; + BlockIndexHeader* localBlockIndex; + auto indexIndex = get_block_index_index_for_index(index, localBlockIndex); + do { + auto blockStartIndex = index; + index_t endIndex = (index & ~static_cast(BLOCK_SIZE - 1)) + static_cast(BLOCK_SIZE); + endIndex = details::circular_less_than(firstIndex + static_cast(actualCount), endIndex) ? firstIndex + static_cast(actualCount) : endIndex; + + auto entry = localBlockIndex->index[indexIndex]; + auto block = entry->value.load(std::memory_order_relaxed); + if (MOODYCAMEL_NOEXCEPT_ASSIGN(T, T&&, details::deref_noexcept(itemFirst) = std::move((*(*block)[index])))) { + while (index != endIndex) { + auto& el = *((*block)[index]); + *itemFirst++ = std::move(el); + el.~T(); + ++index; + } + } + else { + MOODYCAMEL_TRY { + while (index != endIndex) { + auto& el = *((*block)[index]); + *itemFirst = std::move(el); + ++itemFirst; + el.~T(); + ++index; + } + } + MOODYCAMEL_CATCH (...) 
{ + do { + entry = localBlockIndex->index[indexIndex]; + block = entry->value.load(std::memory_order_relaxed); + while (index != endIndex) { + (*block)[index++]->~T(); + } + + if (block->ConcurrentQueue::Block::template set_many_empty(blockStartIndex, static_cast(endIndex - blockStartIndex))) { +#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX + debug::DebugLock lock(mutex); +#endif + entry->value.store(nullptr, std::memory_order_relaxed); + this->parent->add_block_to_free_list(block); + } + indexIndex = (indexIndex + 1) & (localBlockIndex->capacity - 1); + + blockStartIndex = index; + endIndex = (index & ~static_cast(BLOCK_SIZE - 1)) + static_cast(BLOCK_SIZE); + endIndex = details::circular_less_than(firstIndex + static_cast(actualCount), endIndex) ? firstIndex + static_cast(actualCount) : endIndex; + } while (index != firstIndex + actualCount); + + MOODYCAMEL_RETHROW; + } + } + if (block->ConcurrentQueue::Block::template set_many_empty(blockStartIndex, static_cast(endIndex - blockStartIndex))) { + { +#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX + debug::DebugLock lock(mutex); +#endif + // Note that the set_many_empty above did a release, meaning that anybody who acquires the block + // we're about to free can use it safely since our writes (and reads!) will have happened-before then. 
+ entry->value.store(nullptr, std::memory_order_relaxed); + } + this->parent->add_block_to_free_list(block); // releases the above store + } + indexIndex = (indexIndex + 1) & (localBlockIndex->capacity - 1); + } while (index != firstIndex + actualCount); + + return actualCount; + } + else { + this->dequeueOvercommit.fetch_add(desiredCount, std::memory_order_release); + } + } + + return 0; + } + + private: + // The block size must be > 1, so any number with the low bit set is an invalid block base index + static const index_t INVALID_BLOCK_BASE = 1; + + struct BlockIndexEntry + { + std::atomic key; + std::atomic value; + }; + + struct BlockIndexHeader + { + size_t capacity; + std::atomic tail; + BlockIndexEntry* entries; + BlockIndexEntry** index; + BlockIndexHeader* prev; + }; + + template + inline bool insert_block_index_entry(BlockIndexEntry*& idxEntry, index_t blockStartIndex) + { + auto localBlockIndex = blockIndex.load(std::memory_order_relaxed); // We're the only writer thread, relaxed is OK + if (localBlockIndex == nullptr) { + return false; // this can happen if new_block_index failed in the constructor + } + size_t newTail = (localBlockIndex->tail.load(std::memory_order_relaxed) + 1) & (localBlockIndex->capacity - 1); + idxEntry = localBlockIndex->index[newTail]; + if (idxEntry->key.load(std::memory_order_relaxed) == INVALID_BLOCK_BASE || + idxEntry->value.load(std::memory_order_relaxed) == nullptr) { + + idxEntry->key.store(blockStartIndex, std::memory_order_relaxed); + localBlockIndex->tail.store(newTail, std::memory_order_release); + return true; + } + + // No room in the old block index, try to allocate another one! 
+ MOODYCAMEL_CONSTEXPR_IF (allocMode == CannotAlloc) { + return false; + } + else if (!new_block_index()) { + return false; + } + localBlockIndex = blockIndex.load(std::memory_order_relaxed); + newTail = (localBlockIndex->tail.load(std::memory_order_relaxed) + 1) & (localBlockIndex->capacity - 1); + idxEntry = localBlockIndex->index[newTail]; + assert(idxEntry->key.load(std::memory_order_relaxed) == INVALID_BLOCK_BASE); + idxEntry->key.store(blockStartIndex, std::memory_order_relaxed); + localBlockIndex->tail.store(newTail, std::memory_order_release); + return true; + } + + inline void rewind_block_index_tail() + { + auto localBlockIndex = blockIndex.load(std::memory_order_relaxed); + localBlockIndex->tail.store((localBlockIndex->tail.load(std::memory_order_relaxed) - 1) & (localBlockIndex->capacity - 1), std::memory_order_relaxed); + } + + inline BlockIndexEntry* get_block_index_entry_for_index(index_t index) const + { + BlockIndexHeader* localBlockIndex; + auto idx = get_block_index_index_for_index(index, localBlockIndex); + return localBlockIndex->index[idx]; + } + + inline size_t get_block_index_index_for_index(index_t index, BlockIndexHeader*& localBlockIndex) const + { +#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX + debug::DebugLock lock(mutex); +#endif + index &= ~static_cast(BLOCK_SIZE - 1); + localBlockIndex = blockIndex.load(std::memory_order_acquire); + auto tail = localBlockIndex->tail.load(std::memory_order_acquire); + auto tailBase = localBlockIndex->index[tail]->key.load(std::memory_order_relaxed); + assert(tailBase != INVALID_BLOCK_BASE); + // Note: Must use division instead of shift because the index may wrap around, causing a negative + // offset, whose negativity we want to preserve + auto offset = static_cast(static_cast::type>(index - tailBase) / BLOCK_SIZE); + size_t idx = (tail + offset) & (localBlockIndex->capacity - 1); + assert(localBlockIndex->index[idx]->key.load(std::memory_order_relaxed) == index && 
localBlockIndex->index[idx]->value.load(std::memory_order_relaxed) != nullptr); + return idx; + } + + bool new_block_index() + { + auto prev = blockIndex.load(std::memory_order_relaxed); + size_t prevCapacity = prev == nullptr ? 0 : prev->capacity; + auto entryCount = prev == nullptr ? nextBlockIndexCapacity : prevCapacity; + auto raw = static_cast((Traits::malloc)( + sizeof(BlockIndexHeader) + + std::alignment_of::value - 1 + sizeof(BlockIndexEntry) * entryCount + + std::alignment_of::value - 1 + sizeof(BlockIndexEntry*) * nextBlockIndexCapacity)); + if (raw == nullptr) { + return false; + } + + auto header = new (raw) BlockIndexHeader; + auto entries = reinterpret_cast(details::align_for(raw + sizeof(BlockIndexHeader))); + auto index = reinterpret_cast(details::align_for(reinterpret_cast(entries) + sizeof(BlockIndexEntry) * entryCount)); + if (prev != nullptr) { + auto prevTail = prev->tail.load(std::memory_order_relaxed); + auto prevPos = prevTail; + size_t i = 0; + do { + prevPos = (prevPos + 1) & (prev->capacity - 1); + index[i++] = prev->index[prevPos]; + } while (prevPos != prevTail); + assert(i == prevCapacity); + } + for (size_t i = 0; i != entryCount; ++i) { + new (entries + i) BlockIndexEntry; + entries[i].key.store(INVALID_BLOCK_BASE, std::memory_order_relaxed); + index[prevCapacity + i] = entries + i; + } + header->prev = prev; + header->entries = entries; + header->index = index; + header->capacity = nextBlockIndexCapacity; + header->tail.store((prevCapacity - 1) & (nextBlockIndexCapacity - 1), std::memory_order_relaxed); + + blockIndex.store(header, std::memory_order_release); + + nextBlockIndexCapacity <<= 1; + + return true; + } + + private: + size_t nextBlockIndexCapacity; + std::atomic blockIndex; + +#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED + public: + details::ThreadExitListener threadExitListener; + private: +#endif + +#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG + public: + ImplicitProducer* nextImplicitProducer; + private: +#endif + +#ifdef 
MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX + mutable debug::DebugMutex mutex; +#endif +#ifdef MCDBGQ_TRACKMEM + friend struct MemStats; +#endif + }; + + + ////////////////////////////////// + // Block pool manipulation + ////////////////////////////////// + + void populate_initial_block_list(size_t blockCount) + { + initialBlockPoolSize = blockCount; + if (initialBlockPoolSize == 0) { + initialBlockPool = nullptr; + return; + } + + initialBlockPool = create_array(blockCount); + if (initialBlockPool == nullptr) { + initialBlockPoolSize = 0; + } + for (size_t i = 0; i < initialBlockPoolSize; ++i) { + initialBlockPool[i].dynamicallyAllocated = false; + } + } + + inline Block* try_get_block_from_initial_pool() + { + if (initialBlockPoolIndex.load(std::memory_order_relaxed) >= initialBlockPoolSize) { + return nullptr; + } + + auto index = initialBlockPoolIndex.fetch_add(1, std::memory_order_relaxed); + + return index < initialBlockPoolSize ? (initialBlockPool + index) : nullptr; + } + + inline void add_block_to_free_list(Block* block) + { +#ifdef MCDBGQ_TRACKMEM + block->owner = nullptr; +#endif + freeList.add(block); + } + + inline void add_blocks_to_free_list(Block* block) + { + while (block != nullptr) { + auto next = block->next; + add_block_to_free_list(block); + block = next; + } + } + + inline Block* try_get_block_from_free_list() + { + return freeList.try_get(); + } + + // Gets a free block from one of the memory pools, or allocates a new one (if applicable) + template + Block* requisition_block() + { + auto block = try_get_block_from_initial_pool(); + if (block != nullptr) { + return block; + } + + block = try_get_block_from_free_list(); + if (block != nullptr) { + return block; + } + + MOODYCAMEL_CONSTEXPR_IF (canAlloc == CanAlloc) { + return create(); + } + else { + return nullptr; + } + } + + +#ifdef MCDBGQ_TRACKMEM + public: + struct MemStats { + size_t allocatedBlocks; + size_t usedBlocks; + size_t freeBlocks; + size_t ownedBlocksExplicit; + size_t 
ownedBlocksImplicit; + size_t implicitProducers; + size_t explicitProducers; + size_t elementsEnqueued; + size_t blockClassBytes; + size_t queueClassBytes; + size_t implicitBlockIndexBytes; + size_t explicitBlockIndexBytes; + + friend class ConcurrentQueue; + + private: + static MemStats getFor(ConcurrentQueue* q) + { + MemStats stats = { 0 }; + + stats.elementsEnqueued = q->size_approx(); + + auto block = q->freeList.head_unsafe(); + while (block != nullptr) { + ++stats.allocatedBlocks; + ++stats.freeBlocks; + block = block->freeListNext.load(std::memory_order_relaxed); + } + + for (auto ptr = q->producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) { + bool implicit = dynamic_cast(ptr) != nullptr; + stats.implicitProducers += implicit ? 1 : 0; + stats.explicitProducers += implicit ? 0 : 1; + + if (implicit) { + auto prod = static_cast(ptr); + stats.queueClassBytes += sizeof(ImplicitProducer); + auto head = prod->headIndex.load(std::memory_order_relaxed); + auto tail = prod->tailIndex.load(std::memory_order_relaxed); + auto hash = prod->blockIndex.load(std::memory_order_relaxed); + if (hash != nullptr) { + for (size_t i = 0; i != hash->capacity; ++i) { + if (hash->index[i]->key.load(std::memory_order_relaxed) != ImplicitProducer::INVALID_BLOCK_BASE && hash->index[i]->value.load(std::memory_order_relaxed) != nullptr) { + ++stats.allocatedBlocks; + ++stats.ownedBlocksImplicit; + } + } + stats.implicitBlockIndexBytes += hash->capacity * sizeof(typename ImplicitProducer::BlockIndexEntry); + for (; hash != nullptr; hash = hash->prev) { + stats.implicitBlockIndexBytes += sizeof(typename ImplicitProducer::BlockIndexHeader) + hash->capacity * sizeof(typename ImplicitProducer::BlockIndexEntry*); + } + } + for (; details::circular_less_than(head, tail); head += BLOCK_SIZE) { + //auto block = prod->get_block_index_entry_for_index(head); + ++stats.usedBlocks; + } + } + else { + auto prod = static_cast(ptr); + stats.queueClassBytes += 
sizeof(ExplicitProducer); + auto tailBlock = prod->tailBlock; + bool wasNonEmpty = false; + if (tailBlock != nullptr) { + auto block = tailBlock; + do { + ++stats.allocatedBlocks; + if (!block->ConcurrentQueue::Block::template is_empty() || wasNonEmpty) { + ++stats.usedBlocks; + wasNonEmpty = wasNonEmpty || block != tailBlock; + } + ++stats.ownedBlocksExplicit; + block = block->next; + } while (block != tailBlock); + } + auto index = prod->blockIndex.load(std::memory_order_relaxed); + while (index != nullptr) { + stats.explicitBlockIndexBytes += sizeof(typename ExplicitProducer::BlockIndexHeader) + index->size * sizeof(typename ExplicitProducer::BlockIndexEntry); + index = static_cast(index->prev); + } + } + } + + auto freeOnInitialPool = q->initialBlockPoolIndex.load(std::memory_order_relaxed) >= q->initialBlockPoolSize ? 0 : q->initialBlockPoolSize - q->initialBlockPoolIndex.load(std::memory_order_relaxed); + stats.allocatedBlocks += freeOnInitialPool; + stats.freeBlocks += freeOnInitialPool; + + stats.blockClassBytes = sizeof(Block) * stats.allocatedBlocks; + stats.queueClassBytes += sizeof(ConcurrentQueue); + + return stats; + } + }; + + // For debugging only. Not thread-safe. 
+ MemStats getMemStats() + { + return MemStats::getFor(this); + } + private: + friend struct MemStats; +#endif + + + ////////////////////////////////// + // Producer list manipulation + ////////////////////////////////// + + ProducerBase* recycle_or_create_producer(bool isExplicit) + { + bool recycled; + return recycle_or_create_producer(isExplicit, recycled); + } + + ProducerBase* recycle_or_create_producer(bool isExplicit, bool& recycled) + { +#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODHASH + debug::DebugLock lock(implicitProdMutex); +#endif + // Try to re-use one first + for (auto ptr = producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) { + if (ptr->inactive.load(std::memory_order_relaxed) && ptr->isExplicit == isExplicit) { + bool expected = true; + if (ptr->inactive.compare_exchange_strong(expected, /* desired */ false, std::memory_order_acquire, std::memory_order_relaxed)) { + // We caught one! It's been marked as activated, the caller can have it + recycled = true; + return ptr; + } + } + } + + recycled = false; + return add_producer(isExplicit ? 
static_cast(create(this)) : create(this)); + } + + ProducerBase* add_producer(ProducerBase* producer) + { + // Handle failed memory allocation + if (producer == nullptr) { + return nullptr; + } + + producerCount.fetch_add(1, std::memory_order_relaxed); + + // Add it to the lock-free list + auto prevTail = producerListTail.load(std::memory_order_relaxed); + do { + producer->next = prevTail; + } while (!producerListTail.compare_exchange_weak(prevTail, producer, std::memory_order_release, std::memory_order_relaxed)); + +#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG + if (producer->isExplicit) { + auto prevTailExplicit = explicitProducers.load(std::memory_order_relaxed); + do { + static_cast(producer)->nextExplicitProducer = prevTailExplicit; + } while (!explicitProducers.compare_exchange_weak(prevTailExplicit, static_cast(producer), std::memory_order_release, std::memory_order_relaxed)); + } + else { + auto prevTailImplicit = implicitProducers.load(std::memory_order_relaxed); + do { + static_cast(producer)->nextImplicitProducer = prevTailImplicit; + } while (!implicitProducers.compare_exchange_weak(prevTailImplicit, static_cast(producer), std::memory_order_release, std::memory_order_relaxed)); + } +#endif + + return producer; + } + + void reown_producers() + { + // After another instance is moved-into/swapped-with this one, all the + // producers we stole still think their parents are the other queue. + // So fix them up! 
+ for (auto ptr = producerListTail.load(std::memory_order_relaxed); ptr != nullptr; ptr = ptr->next_prod()) { + ptr->parent = this; + } + } + + + ////////////////////////////////// + // Implicit producer hash + ////////////////////////////////// + + struct ImplicitProducerKVP + { + std::atomic key; + ImplicitProducer* value; // No need for atomicity since it's only read by the thread that sets it in the first place + + ImplicitProducerKVP() : value(nullptr) { } + + ImplicitProducerKVP(ImplicitProducerKVP&& other) MOODYCAMEL_NOEXCEPT + { + key.store(other.key.load(std::memory_order_relaxed), std::memory_order_relaxed); + value = other.value; + } + + inline ImplicitProducerKVP& operator=(ImplicitProducerKVP&& other) MOODYCAMEL_NOEXCEPT + { + swap(other); + return *this; + } + + inline void swap(ImplicitProducerKVP& other) MOODYCAMEL_NOEXCEPT + { + if (this != &other) { + details::swap_relaxed(key, other.key); + std::swap(value, other.value); + } + } + }; + + template + friend void moodycamel::swap(typename ConcurrentQueue::ImplicitProducerKVP&, typename ConcurrentQueue::ImplicitProducerKVP&) MOODYCAMEL_NOEXCEPT; + + struct ImplicitProducerHash + { + size_t capacity; + ImplicitProducerKVP* entries; + ImplicitProducerHash* prev; + }; + + inline void populate_initial_implicit_producer_hash() + { + MOODYCAMEL_CONSTEXPR_IF (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) { + return; + } + else { + implicitProducerHashCount.store(0, std::memory_order_relaxed); + auto hash = &initialImplicitProducerHash; + hash->capacity = INITIAL_IMPLICIT_PRODUCER_HASH_SIZE; + hash->entries = &initialImplicitProducerHashEntries[0]; + for (size_t i = 0; i != INITIAL_IMPLICIT_PRODUCER_HASH_SIZE; ++i) { + initialImplicitProducerHashEntries[i].key.store(details::invalid_thread_id, std::memory_order_relaxed); + } + hash->prev = nullptr; + implicitProducerHash.store(hash, std::memory_order_relaxed); + } + } + + void swap_implicit_producer_hashes(ConcurrentQueue& other) + { + MOODYCAMEL_CONSTEXPR_IF 
(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) { + return; + } + else { + // Swap (assumes our implicit producer hash is initialized) + initialImplicitProducerHashEntries.swap(other.initialImplicitProducerHashEntries); + initialImplicitProducerHash.entries = &initialImplicitProducerHashEntries[0]; + other.initialImplicitProducerHash.entries = &other.initialImplicitProducerHashEntries[0]; + + details::swap_relaxed(implicitProducerHashCount, other.implicitProducerHashCount); + + details::swap_relaxed(implicitProducerHash, other.implicitProducerHash); + if (implicitProducerHash.load(std::memory_order_relaxed) == &other.initialImplicitProducerHash) { + implicitProducerHash.store(&initialImplicitProducerHash, std::memory_order_relaxed); + } + else { + ImplicitProducerHash* hash; + for (hash = implicitProducerHash.load(std::memory_order_relaxed); hash->prev != &other.initialImplicitProducerHash; hash = hash->prev) { + continue; + } + hash->prev = &initialImplicitProducerHash; + } + if (other.implicitProducerHash.load(std::memory_order_relaxed) == &initialImplicitProducerHash) { + other.implicitProducerHash.store(&other.initialImplicitProducerHash, std::memory_order_relaxed); + } + else { + ImplicitProducerHash* hash; + for (hash = other.implicitProducerHash.load(std::memory_order_relaxed); hash->prev != &initialImplicitProducerHash; hash = hash->prev) { + continue; + } + hash->prev = &other.initialImplicitProducerHash; + } + } + } + + // Only fails (returns nullptr) if memory allocation fails + ImplicitProducer* get_or_add_implicit_producer() + { + // Note that since the data is essentially thread-local (key is thread ID), + // there's a reduced need for fences (memory ordering is already consistent + // for any individual thread), except for the current table itself. + + // Start by looking for the thread ID in the current and all previous hash tables. 
+ // If it's not found, it must not be in there yet, since this same thread would + // have added it previously to one of the tables that we traversed. + + // Code and algorithm adapted from http://preshing.com/20130605/the-worlds-simplest-lock-free-hash-table + +#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODHASH + debug::DebugLock lock(implicitProdMutex); +#endif + + auto id = details::thread_id(); + auto hashedId = details::hash_thread_id(id); + + auto mainHash = implicitProducerHash.load(std::memory_order_acquire); + assert(mainHash != nullptr); // silence clang-tidy and MSVC warnings (hash cannot be null) + for (auto hash = mainHash; hash != nullptr; hash = hash->prev) { + // Look for the id in this hash + auto index = hashedId; + while (true) { // Not an infinite loop because at least one slot is free in the hash table + index &= hash->capacity - 1; + + auto probedKey = hash->entries[index].key.load(std::memory_order_relaxed); + if (probedKey == id) { + // Found it! If we had to search several hashes deep, though, we should lazily add it + // to the current main hash table to avoid the extended search next time. + // Note there's guaranteed to be room in the current hash table since every subsequent + // table implicitly reserves space for all previous tables (there's only one + // implicitProducerHashCount). 
+ auto value = hash->entries[index].value; + if (hash != mainHash) { + index = hashedId; + while (true) { + index &= mainHash->capacity - 1; + probedKey = mainHash->entries[index].key.load(std::memory_order_relaxed); + auto empty = details::invalid_thread_id; +#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED + auto reusable = details::invalid_thread_id2; + if ((probedKey == empty && mainHash->entries[index].key.compare_exchange_strong(empty, id, std::memory_order_relaxed, std::memory_order_relaxed)) || + (probedKey == reusable && mainHash->entries[index].key.compare_exchange_strong(reusable, id, std::memory_order_acquire, std::memory_order_acquire))) { +#else + if ((probedKey == empty && mainHash->entries[index].key.compare_exchange_strong(empty, id, std::memory_order_relaxed, std::memory_order_relaxed))) { +#endif + mainHash->entries[index].value = value; + break; + } + ++index; + } + } + + return value; + } + if (probedKey == details::invalid_thread_id) { + break; // Not in this hash table + } + ++index; + } + } + + // Insert! + auto newCount = 1 + implicitProducerHashCount.fetch_add(1, std::memory_order_relaxed); + while (true) { + // NOLINTNEXTLINE(clang-analyzer-core.NullDereference) + if (newCount >= (mainHash->capacity >> 1) && !implicitProducerHashResizeInProgress.test_and_set(std::memory_order_acquire)) { + // We've acquired the resize lock, try to allocate a bigger hash table. + // Note the acquire fence synchronizes with the release fence at the end of this block, and hence when + // we reload implicitProducerHash it must be the most recent version (it only gets changed within this + // locked block). 
+ mainHash = implicitProducerHash.load(std::memory_order_acquire); + if (newCount >= (mainHash->capacity >> 1)) { + auto newCapacity = mainHash->capacity << 1; + while (newCount >= (newCapacity >> 1)) { + newCapacity <<= 1; + } + auto raw = static_cast((Traits::malloc)(sizeof(ImplicitProducerHash) + std::alignment_of::value - 1 + sizeof(ImplicitProducerKVP) * newCapacity)); + if (raw == nullptr) { + // Allocation failed + implicitProducerHashCount.fetch_sub(1, std::memory_order_relaxed); + implicitProducerHashResizeInProgress.clear(std::memory_order_relaxed); + return nullptr; + } + + auto newHash = new (raw) ImplicitProducerHash; + newHash->capacity = static_cast(newCapacity); + newHash->entries = reinterpret_cast(details::align_for(raw + sizeof(ImplicitProducerHash))); + for (size_t i = 0; i != newCapacity; ++i) { + new (newHash->entries + i) ImplicitProducerKVP; + newHash->entries[i].key.store(details::invalid_thread_id, std::memory_order_relaxed); + } + newHash->prev = mainHash; + implicitProducerHash.store(newHash, std::memory_order_release); + implicitProducerHashResizeInProgress.clear(std::memory_order_release); + mainHash = newHash; + } + else { + implicitProducerHashResizeInProgress.clear(std::memory_order_release); + } + } + + // If it's < three-quarters full, add to the old one anyway so that we don't have to wait for the next table + // to finish being allocated by another thread (and if we just finished allocating above, the condition will + // always be true) + if (newCount < (mainHash->capacity >> 1) + (mainHash->capacity >> 2)) { + bool recycled; + auto producer = static_cast(recycle_or_create_producer(false, recycled)); + if (producer == nullptr) { + implicitProducerHashCount.fetch_sub(1, std::memory_order_relaxed); + return nullptr; + } + if (recycled) { + implicitProducerHashCount.fetch_sub(1, std::memory_order_relaxed); + } + +#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED + producer->threadExitListener.callback = 
&ConcurrentQueue::implicit_producer_thread_exited_callback; + producer->threadExitListener.userData = producer; + details::ThreadExitNotifier::subscribe(&producer->threadExitListener); +#endif + + auto index = hashedId; + while (true) { + index &= mainHash->capacity - 1; + auto probedKey = mainHash->entries[index].key.load(std::memory_order_relaxed); + + auto empty = details::invalid_thread_id; +#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED + auto reusable = details::invalid_thread_id2; + if ((probedKey == empty && mainHash->entries[index].key.compare_exchange_strong(empty, id, std::memory_order_relaxed, std::memory_order_relaxed)) || + (probedKey == reusable && mainHash->entries[index].key.compare_exchange_strong(reusable, id, std::memory_order_acquire, std::memory_order_acquire))) { +#else + if ((probedKey == empty && mainHash->entries[index].key.compare_exchange_strong(empty, id, std::memory_order_relaxed, std::memory_order_relaxed))) { +#endif + mainHash->entries[index].value = producer; + break; + } + ++index; + } + return producer; + } + + // Hmm, the old hash is quite full and somebody else is busy allocating a new one. + // We need to wait for the allocating thread to finish (if it succeeds, we add, if not, + // we try to allocate ourselves). 
+ mainHash = implicitProducerHash.load(std::memory_order_acquire); + } + } + +#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED + void implicit_producer_thread_exited(ImplicitProducer* producer) + { + // Remove from thread exit listeners + details::ThreadExitNotifier::unsubscribe(&producer->threadExitListener); + + // Remove from hash +#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODHASH + debug::DebugLock lock(implicitProdMutex); +#endif + auto hash = implicitProducerHash.load(std::memory_order_acquire); + assert(hash != nullptr); // The thread exit listener is only registered if we were added to a hash in the first place + auto id = details::thread_id(); + auto hashedId = details::hash_thread_id(id); + details::thread_id_t probedKey; + + // We need to traverse all the hashes just in case other threads aren't on the current one yet and are + // trying to add an entry thinking there's a free slot (because they reused a producer) + for (; hash != nullptr; hash = hash->prev) { + auto index = hashedId; + do { + index &= hash->capacity - 1; + probedKey = hash->entries[index].key.load(std::memory_order_relaxed); + if (probedKey == id) { + hash->entries[index].key.store(details::invalid_thread_id2, std::memory_order_release); + break; + } + ++index; + } while (probedKey != details::invalid_thread_id); // Can happen if the hash has changed but we weren't put back in it yet, or if we weren't added to this hash in the first place + } + + // Mark the queue as being recyclable + producer->inactive.store(true, std::memory_order_release); + } + + static void implicit_producer_thread_exited_callback(void* userData) + { + auto producer = static_cast(userData); + auto queue = producer->parent; + queue->implicit_producer_thread_exited(producer); + } +#endif + + ////////////////////////////////// + // Utility functions + ////////////////////////////////// + + template + static inline void* aligned_malloc(size_t size) + { + MOODYCAMEL_CONSTEXPR_IF (std::alignment_of::value <= 
std::alignment_of::value) + return (Traits::malloc)(size); + else { + size_t alignment = std::alignment_of::value; + void* raw = (Traits::malloc)(size + alignment - 1 + sizeof(void*)); + if (!raw) + return nullptr; + char* ptr = details::align_for(reinterpret_cast(raw) + sizeof(void*)); + *(reinterpret_cast(ptr) - 1) = raw; + return ptr; + } + } + + template + static inline void aligned_free(void* ptr) + { + MOODYCAMEL_CONSTEXPR_IF (std::alignment_of::value <= std::alignment_of::value) + return (Traits::free)(ptr); + else + (Traits::free)(ptr ? *(reinterpret_cast(ptr) - 1) : nullptr); + } + + template + static inline U* create_array(size_t count) + { + assert(count > 0); + U* p = static_cast(aligned_malloc(sizeof(U) * count)); + if (p == nullptr) + return nullptr; + + for (size_t i = 0; i != count; ++i) + new (p + i) U(); + return p; + } + + template + static inline void destroy_array(U* p, size_t count) + { + if (p != nullptr) { + assert(count > 0); + for (size_t i = count; i != 0; ) + (p + --i)->~U(); + } + aligned_free(p); + } + + template + static inline U* create() + { + void* p = aligned_malloc(sizeof(U)); + return p != nullptr ? new (p) U : nullptr; + } + + template + static inline U* create(A1&& a1) + { + void* p = aligned_malloc(sizeof(U)); + return p != nullptr ? 
new (p) U(std::forward(a1)) : nullptr; + } + + template + static inline void destroy(U* p) + { + if (p != nullptr) + p->~U(); + aligned_free(p); + } + +private: + std::atomic producerListTail; + std::atomic producerCount; + + std::atomic initialBlockPoolIndex; + Block* initialBlockPool; + size_t initialBlockPoolSize; + +#ifndef MCDBGQ_USEDEBUGFREELIST + FreeList freeList; +#else + debug::DebugFreeList freeList; +#endif + + std::atomic implicitProducerHash; + std::atomic implicitProducerHashCount; // Number of slots logically used + ImplicitProducerHash initialImplicitProducerHash; + std::array initialImplicitProducerHashEntries; + std::atomic_flag implicitProducerHashResizeInProgress; + + std::atomic nextExplicitConsumerId; + std::atomic globalExplicitConsumerOffset; + +#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODHASH + debug::DebugMutex implicitProdMutex; +#endif + +#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG + std::atomic explicitProducers; + std::atomic implicitProducers; +#endif +}; + + +template +ProducerToken::ProducerToken(ConcurrentQueue& queue) + : producer(queue.recycle_or_create_producer(true)) +{ + if (producer != nullptr) { + producer->token = this; + } +} + +template +ProducerToken::ProducerToken(BlockingConcurrentQueue& queue) + : producer(reinterpret_cast*>(&queue)->recycle_or_create_producer(true)) +{ + if (producer != nullptr) { + producer->token = this; + } +} + +template +ConsumerToken::ConsumerToken(ConcurrentQueue& queue) + : itemsConsumedFromCurrent(0), currentProducer(nullptr), desiredProducer(nullptr) +{ + initialOffset = queue.nextExplicitConsumerId.fetch_add(1, std::memory_order_release); + lastKnownGlobalOffset = static_cast(-1); +} + +template +ConsumerToken::ConsumerToken(BlockingConcurrentQueue& queue) + : itemsConsumedFromCurrent(0), currentProducer(nullptr), desiredProducer(nullptr) +{ + initialOffset = reinterpret_cast*>(&queue)->nextExplicitConsumerId.fetch_add(1, std::memory_order_release); + lastKnownGlobalOffset = static_cast(-1); +} + 
+template +inline void swap(ConcurrentQueue& a, ConcurrentQueue& b) MOODYCAMEL_NOEXCEPT +{ + a.swap(b); +} + +inline void swap(ProducerToken& a, ProducerToken& b) MOODYCAMEL_NOEXCEPT +{ + a.swap(b); +} + +inline void swap(ConsumerToken& a, ConsumerToken& b) MOODYCAMEL_NOEXCEPT +{ + a.swap(b); +} + +template +inline void swap(typename ConcurrentQueue::ImplicitProducerKVP& a, typename ConcurrentQueue::ImplicitProducerKVP& b) MOODYCAMEL_NOEXCEPT +{ + a.swap(b); +} + +} + +#if defined(_MSC_VER) && (!defined(_HAS_CXX17) || !_HAS_CXX17) +#pragma warning(pop) +#endif + +#if defined(__GNUC__) && !defined(__INTEL_COMPILER) +#pragma GCC diagnostic pop +#endif diff --git a/distributions.cc b/distributions.cc index ce939e7..6cb9532 100644 --- a/distributions.cc +++ b/distributions.cc @@ -32,3 +32,4 @@ double generate_uniform(double lambda) { if (lambda <= 0.0) return 0; return 1.0 / lambda; } + diff --git a/libzstd.a b/libzstd.a new file mode 100644 index 0000000..3be4d40 Binary files /dev/null and b/libzstd.a differ diff --git a/lightweightsemaphore.h b/lightweightsemaphore.h new file mode 100644 index 0000000..b0f24e1 --- /dev/null +++ b/lightweightsemaphore.h @@ -0,0 +1,411 @@ +// Provides an efficient implementation of a semaphore (LightweightSemaphore). +// This is an extension of Jeff Preshing's sempahore implementation (licensed +// under the terms of its separate zlib license) that has been adapted and +// extended by Cameron Desrochers. + +#pragma once + +#include // For std::size_t +#include +#include // For std::make_signed + +#if defined(_WIN32) +// Avoid including windows.h in a header; we only need a handful of +// items, so we'll redeclare them here (this is relatively safe since +// the API generally has to remain stable between Windows versions). +// I know this is an ugly hack but it still beats polluting the global +// namespace with thousands of generic names or adding a .cpp for nothing. 
+extern "C" { + struct _SECURITY_ATTRIBUTES; + __declspec(dllimport) void* __stdcall CreateSemaphoreW(_SECURITY_ATTRIBUTES* lpSemaphoreAttributes, long lInitialCount, long lMaximumCount, const wchar_t* lpName); + __declspec(dllimport) int __stdcall CloseHandle(void* hObject); + __declspec(dllimport) unsigned long __stdcall WaitForSingleObject(void* hHandle, unsigned long dwMilliseconds); + __declspec(dllimport) int __stdcall ReleaseSemaphore(void* hSemaphore, long lReleaseCount, long* lpPreviousCount); +} +#elif defined(__MACH__) +#include +#elif defined(__unix__) +#include +#endif + +namespace moodycamel +{ +namespace details +{ + +// Code in the mpmc_sema namespace below is an adaptation of Jeff Preshing's +// portable + lightweight semaphore implementations, originally from +// https://github.com/preshing/cpp11-on-multicore/blob/master/common/sema.h +// LICENSE: +// Copyright (c) 2015 Jeff Preshing +// +// This software is provided 'as-is', without any express or implied +// warranty. In no event will the authors be held liable for any damages +// arising from the use of this software. +// +// Permission is granted to anyone to use this software for any purpose, +// including commercial applications, and to alter it and redistribute it +// freely, subject to the following restrictions: +// +// 1. The origin of this software must not be misrepresented; you must not +// claim that you wrote the original software. If you use this software +// in a product, an acknowledgement in the product documentation would be +// appreciated but is not required. +// 2. Altered source versions must be plainly marked as such, and must not be +// misrepresented as being the original software. +// 3. This notice may not be removed or altered from any source distribution. 
+#if defined(_WIN32) +class Semaphore +{ +private: + void* m_hSema; + + Semaphore(const Semaphore& other) MOODYCAMEL_DELETE_FUNCTION; + Semaphore& operator=(const Semaphore& other) MOODYCAMEL_DELETE_FUNCTION; + +public: + Semaphore(int initialCount = 0) + { + assert(initialCount >= 0); + const long maxLong = 0x7fffffff; + m_hSema = CreateSemaphoreW(nullptr, initialCount, maxLong, nullptr); + assert(m_hSema); + } + + ~Semaphore() + { + CloseHandle(m_hSema); + } + + bool wait() + { + const unsigned long infinite = 0xffffffff; + return WaitForSingleObject(m_hSema, infinite) == 0; + } + + bool try_wait() + { + return WaitForSingleObject(m_hSema, 0) == 0; + } + + bool timed_wait(std::uint64_t usecs) + { + return WaitForSingleObject(m_hSema, (unsigned long)(usecs / 1000)) == 0; + } + + void signal(int count = 1) + { + while (!ReleaseSemaphore(m_hSema, count, nullptr)); + } +}; +#elif defined(__MACH__) +//--------------------------------------------------------- +// Semaphore (Apple iOS and OSX) +// Can't use POSIX semaphores due to http://lists.apple.com/archives/darwin-kernel/2009/Apr/msg00010.html +//--------------------------------------------------------- +class Semaphore +{ +private: + semaphore_t m_sema; + + Semaphore(const Semaphore& other) MOODYCAMEL_DELETE_FUNCTION; + Semaphore& operator=(const Semaphore& other) MOODYCAMEL_DELETE_FUNCTION; + +public: + Semaphore(int initialCount = 0) + { + assert(initialCount >= 0); + kern_return_t rc = semaphore_create(mach_task_self(), &m_sema, SYNC_POLICY_FIFO, initialCount); + assert(rc == KERN_SUCCESS); + (void)rc; + } + + ~Semaphore() + { + semaphore_destroy(mach_task_self(), m_sema); + } + + bool wait() + { + return semaphore_wait(m_sema) == KERN_SUCCESS; + } + + bool try_wait() + { + return timed_wait(0); + } + + bool timed_wait(std::uint64_t timeout_usecs) + { + mach_timespec_t ts; + ts.tv_sec = static_cast(timeout_usecs / 1000000); + ts.tv_nsec = static_cast((timeout_usecs % 1000000) * 1000); + + // added in OSX 10.10: 
https://developer.apple.com/library/prerelease/mac/documentation/General/Reference/APIDiffsMacOSX10_10SeedDiff/modules/Darwin.html + kern_return_t rc = semaphore_timedwait(m_sema, ts); + return rc == KERN_SUCCESS; + } + + void signal() + { + while (semaphore_signal(m_sema) != KERN_SUCCESS); + } + + void signal(int count) + { + while (count-- > 0) + { + while (semaphore_signal(m_sema) != KERN_SUCCESS); + } + } +}; +#elif defined(__unix__) +//--------------------------------------------------------- +// Semaphore (POSIX, Linux) +//--------------------------------------------------------- +class Semaphore +{ +private: + sem_t m_sema; + + Semaphore(const Semaphore& other) MOODYCAMEL_DELETE_FUNCTION; + Semaphore& operator=(const Semaphore& other) MOODYCAMEL_DELETE_FUNCTION; + +public: + Semaphore(int initialCount = 0) + { + assert(initialCount >= 0); + int rc = sem_init(&m_sema, 0, static_cast(initialCount)); + assert(rc == 0); + (void)rc; + } + + ~Semaphore() + { + sem_destroy(&m_sema); + } + + bool wait() + { + // http://stackoverflow.com/questions/2013181/gdb-causes-sem-wait-to-fail-with-eintr-error + int rc; + do { + rc = sem_wait(&m_sema); + } while (rc == -1 && errno == EINTR); + return rc == 0; + } + + bool try_wait() + { + int rc; + do { + rc = sem_trywait(&m_sema); + } while (rc == -1 && errno == EINTR); + return rc == 0; + } + + bool timed_wait(std::uint64_t usecs) + { + struct timespec ts; + const int usecs_in_1_sec = 1000000; + const int nsecs_in_1_sec = 1000000000; + clock_gettime(CLOCK_REALTIME, &ts); + ts.tv_sec += (time_t)(usecs / usecs_in_1_sec); + ts.tv_nsec += (long)(usecs % usecs_in_1_sec) * 1000; + // sem_timedwait bombs if you have more than 1e9 in tv_nsec + // so we have to clean things up before passing it in + if (ts.tv_nsec >= nsecs_in_1_sec) { + ts.tv_nsec -= nsecs_in_1_sec; + ++ts.tv_sec; + } + + int rc; + do { + rc = sem_timedwait(&m_sema, &ts); + } while (rc == -1 && errno == EINTR); + return rc == 0; + } + + void signal() + { + while 
(sem_post(&m_sema) == -1); + } + + void signal(int count) + { + while (count-- > 0) + { + while (sem_post(&m_sema) == -1); + } + } +}; +#else +#error Unsupported platform! (No semaphore wrapper available) +#endif + +} // end namespace details + + +//--------------------------------------------------------- +// LightweightSemaphore +//--------------------------------------------------------- +class LightweightSemaphore +{ +public: + typedef std::make_signed::type ssize_t; + +private: + std::atomic m_count; + details::Semaphore m_sema; + int m_maxSpins; + + bool waitWithPartialSpinning(std::int64_t timeout_usecs = -1) + { + ssize_t oldCount; + int spin = m_maxSpins; + while (--spin >= 0) + { + oldCount = m_count.load(std::memory_order_relaxed); + if ((oldCount > 0) && m_count.compare_exchange_strong(oldCount, oldCount - 1, std::memory_order_acquire, std::memory_order_relaxed)) + return true; + std::atomic_signal_fence(std::memory_order_acquire); // Prevent the compiler from collapsing the loop. + } + oldCount = m_count.fetch_sub(1, std::memory_order_acquire); + if (oldCount > 0) + return true; + if (timeout_usecs < 0) + { + if (m_sema.wait()) + return true; + } + if (timeout_usecs > 0 && m_sema.timed_wait((std::uint64_t)timeout_usecs)) + return true; + // At this point, we've timed out waiting for the semaphore, but the + // count is still decremented indicating we may still be waiting on + // it. So we have to re-adjust the count, but only if the semaphore + // wasn't signaled enough times for us too since then. If it was, we + // need to release the semaphore too. 
+ while (true) + { + oldCount = m_count.load(std::memory_order_acquire); + if (oldCount >= 0 && m_sema.try_wait()) + return true; + if (oldCount < 0 && m_count.compare_exchange_strong(oldCount, oldCount + 1, std::memory_order_relaxed, std::memory_order_relaxed)) + return false; + } + } + + ssize_t waitManyWithPartialSpinning(ssize_t max, std::int64_t timeout_usecs = -1) + { + assert(max > 0); + ssize_t oldCount; + int spin = m_maxSpins; + while (--spin >= 0) + { + oldCount = m_count.load(std::memory_order_relaxed); + if (oldCount > 0) + { + ssize_t newCount = oldCount > max ? oldCount - max : 0; + if (m_count.compare_exchange_strong(oldCount, newCount, std::memory_order_acquire, std::memory_order_relaxed)) + return oldCount - newCount; + } + std::atomic_signal_fence(std::memory_order_acquire); + } + oldCount = m_count.fetch_sub(1, std::memory_order_acquire); + if (oldCount <= 0) + { + if ((timeout_usecs == 0) || (timeout_usecs < 0 && !m_sema.wait()) || (timeout_usecs > 0 && !m_sema.timed_wait((std::uint64_t)timeout_usecs))) + { + while (true) + { + oldCount = m_count.load(std::memory_order_acquire); + if (oldCount >= 0 && m_sema.try_wait()) + break; + if (oldCount < 0 && m_count.compare_exchange_strong(oldCount, oldCount + 1, std::memory_order_relaxed, std::memory_order_relaxed)) + return 0; + } + } + } + if (max > 1) + return 1 + tryWaitMany(max - 1); + return 1; + } + +public: + LightweightSemaphore(ssize_t initialCount = 0, int maxSpins = 10000) : m_count(initialCount), m_maxSpins(maxSpins) + { + assert(initialCount >= 0); + assert(maxSpins >= 0); + } + + bool tryWait() + { + ssize_t oldCount = m_count.load(std::memory_order_relaxed); + while (oldCount > 0) + { + if (m_count.compare_exchange_weak(oldCount, oldCount - 1, std::memory_order_acquire, std::memory_order_relaxed)) + return true; + } + return false; + } + + bool wait() + { + return tryWait() || waitWithPartialSpinning(); + } + + bool wait(std::int64_t timeout_usecs) + { + return tryWait() || 
waitWithPartialSpinning(timeout_usecs); + } + + // Acquires between 0 and (greedily) max, inclusive + ssize_t tryWaitMany(ssize_t max) + { + assert(max >= 0); + ssize_t oldCount = m_count.load(std::memory_order_relaxed); + while (oldCount > 0) + { + ssize_t newCount = oldCount > max ? oldCount - max : 0; + if (m_count.compare_exchange_weak(oldCount, newCount, std::memory_order_acquire, std::memory_order_relaxed)) + return oldCount - newCount; + } + return 0; + } + + // Acquires at least one, and (greedily) at most max + ssize_t waitMany(ssize_t max, std::int64_t timeout_usecs) + { + assert(max >= 0); + ssize_t result = tryWaitMany(max); + if (result == 0 && max > 0) + result = waitManyWithPartialSpinning(max, timeout_usecs); + return result; + } + + ssize_t waitMany(ssize_t max) + { + ssize_t result = waitMany(max, -1); + assert(result > 0); + return result; + } + + void signal(ssize_t count = 1) + { + assert(count >= 0); + ssize_t oldCount = m_count.fetch_add(count, std::memory_order_release); + ssize_t toRelease = -oldCount < count ? -oldCount : count; + if (toRelease > 0) + { + m_sema.signal((int)toRelease); + } + } + + std::size_t availableApprox() const + { + ssize_t count = m_count.load(std::memory_order_relaxed); + return count > 0 ? 
static_cast(count) : 0; + } +}; + +} // end namespace moodycamel diff --git a/mutilate.cc b/mutilate.cc index 426fd05..a1f298a 100644 --- a/mutilate.cc +++ b/mutilate.cc @@ -2,16 +2,23 @@ #include #include #include +#include #include #include #include #include +#include +#include /* Added for the nonblocking socket */ +#include #include #include #include #include #include +#include +#include +namespace fs = std::filesystem; #include #include @@ -20,6 +27,10 @@ #include #include + +#include "common.h" //for zstd +#include "zstd.h" //shippped with mutilate + #include "config.h" #ifdef HAVE_LIBZMQ @@ -37,13 +48,43 @@ #include "log.h" #include "mutilate.h" #include "util.h" +#include "blockingconcurrentqueue.h" +//#include +//#include #define MIN(a,b) ((a) < (b) ? (a) : (b)) +#define hashsize(n) ((unsigned long int)1<<(n)) using namespace std; +using namespace moodycamel; +//using namespace folly; + +int max_n[3] = {0,0,0}; +ifstream kvfile; +pthread_mutex_t flock = PTHREAD_MUTEX_INITIALIZER; + +pthread_mutex_t reader_l; +pthread_cond_t reader_ready; +int reader_not_ready = 1; + +pthread_mutex_t *item_locks; +int item_lock_hashpower = 14; + +map g_key_hist; + +//USPMCQueue g_trace_queue; + +//ConcurrentHashMap cid_rate; +unordered_map cid_rate; +//ConcurrentHashMap> copy_keys; +unordered_map> copy_keys; +unordered_map> wb_keys; +//ConcurrentHashMap> touch_keys; +unordered_map touch_keys; +//ConcurrentHashMap> wb_keys; gengetopt_args_info args; -char random_char[2 * 1024 * 1024]; // Buffer used to generate random values. +char random_char[4 * 1024 * 1024]; // Buffer used to generate random values. #ifdef HAVE_LIBZMQ vector agent_sockets; @@ -55,11 +96,27 @@ struct thread_data { options_t *options; bool master; // Thread #0, not to be confused with agent master. 
#ifdef HAVE_LIBZMQ - zmq::socket_t *socket; + zmq::socket_t *socketz; #endif + int id; + //std::vector*> trace_queue; + std::vector*> *trace_queue; + //std::vector *mutexes; + pthread_mutex_t* g_lock; + std::unordered_map> *g_wb_keys; +}; + +struct reader_data { + //std::vector*> trace_queue; + std::vector*> *trace_queue; + std::vector *mutexes; + string *trace_filename; + int twitter_trace; }; // struct evdns_base *evdns; + +pthread_t pt[1024]; pthread_barrier_t barrier; @@ -70,33 +127,36 @@ void init_random_stuff(); void go(const vector &servers, options_t &options, ConnectionStats &stats #ifdef HAVE_LIBZMQ -, zmq::socket_t* socket = NULL +, zmq::socket_t* socketz = NULL #endif ); +//void do_mutilate(const vector &servers, options_t &options, +// ConnectionStats &stats,std::vector*> trace_queue, bool master = true void do_mutilate(const vector &servers, options_t &options, - ConnectionStats &stats, bool master = true + ConnectionStats &stats,std::vector*> *trace_queue, pthread_mutex_t *g_lock, unordered_map> *g_wb_keys, bool master = true #ifdef HAVE_LIBZMQ -, zmq::socket_t* socket = NULL +, zmq::socket_t* socketz = NULL #endif ); void args_to_options(options_t* options); void* thread_main(void *arg); +void* reader_thread(void *arg); #ifdef HAVE_LIBZMQ -static std::string s_recv (zmq::socket_t &socket) { +static std::string s_recv (zmq::socket_t &socketz) { zmq::message_t message; - socket.recv(&message); + socketz.recv(&message); return std::string(static_cast(message.data()), message.size()); } // Convert string to 0MQ string and send to socket -static bool s_send (zmq::socket_t &socket, const std::string &string) { +static bool s_send (zmq::socket_t &socketz, const std::string &string) { zmq::message_t message(string.size()); memcpy(message.data(), string.data(), string.size()); - return socket.send(message); + return socketz.send(message); } /* @@ -156,17 +216,21 @@ static bool s_send (zmq::socket_t &socket, const std::string &string) { void agent() { 
zmq::context_t context(1); - zmq::socket_t socket(context, ZMQ_REP); - socket.bind((string("tcp://*:")+string(args.agent_port_arg)).c_str()); + zmq::socket_t socketz(context, ZMQ_REP); + if (atoi(args.agent_port_arg) == -1) { + socketz.bind(string("ipc:///tmp/memcached.sock").c_str()); + } else { + socketz.bind((string("tcp://*:")+string(args.agent_port_arg)).c_str()); + } while (true) { zmq::message_t request; - socket.recv(&request); + socketz.recv(&request); zmq::message_t num(sizeof(int)); *((int *) num.data()) = args.threads_arg * args.lambda_mul_arg; - socket.send(num); + socketz.send(num); options_t options; memcpy(&options, request.data(), sizeof(options)); @@ -174,8 +238,8 @@ void agent() { vector servers; for (int i = 0; i < options.server_given; i++) { - servers.push_back(s_recv(socket)); - s_send(socket, "ACK"); + servers.push_back(s_recv(socketz)); + s_send(socketz, "ACK"); } for (auto i: servers) { @@ -184,9 +248,9 @@ void agent() { options.threads = args.threads_arg; - socket.recv(&request); + socketz.recv(&request); options.lambda_denom = *((int *) request.data()); - s_send(socket, "THANKS"); + s_send(socketz, "THANKS"); // V("AGENT SLEEPS"); sleep(1); options.lambda = (double) options.qps / options.lambda_denom * args.lambda_mul_arg; @@ -199,7 +263,7 @@ void agent() { ConnectionStats stats; - go(servers, options, stats, &socket); + go(servers, options, stats, &socketz); AgentStats as; @@ -212,11 +276,11 @@ void agent() { as.stop = stats.stop; as.skips = stats.skips; - string req = s_recv(socket); + string req = s_recv(socketz); // V("req = %s", req.c_str()); request.rebuild(sizeof(as)); memcpy(request.data(), &as, sizeof(as)); - socket.send(request); + socketz.send(request); } } @@ -319,7 +383,7 @@ void finish_agent(ConnectionStats &stats) { * skew. 
*/ -void sync_agent(zmq::socket_t* socket) { +void sync_agent(zmq::socket_t* socketz) { // V("agent: synchronizing"); if (args.agent_given) { @@ -338,16 +402,16 @@ void sync_agent(zmq::socket_t* socket) { if (s_recv(*s).compare(string("ack"))) DIE("sync_agent[M]: out of sync [2]"); } else if (args.agentmode_given) { - if (s_recv(*socket).compare(string("sync_req"))) + if (s_recv(*socketz).compare(string("sync_req"))) DIE("sync_agent[A]: out of sync [1]"); /* The real sync */ - s_send(*socket, "sync"); - if (s_recv(*socket).compare(string("proceed"))) + s_send(*socketz, "sync"); + if (s_recv(*socketz).compare(string("proceed"))) DIE("sync_agent[A]: out of sync [2]"); /* End sync */ - s_send(*socket, "ack"); + s_send(*socketz, "ack"); } // V("agent: synchronized"); @@ -413,6 +477,7 @@ string name_to_ipaddr(string host) { } int main(int argc, char **argv) { + //event_enable_debug_mode(); if (cmdline_parser(argc, argv, &args) != 0) exit(-1); for (unsigned int i = 0; i < args.verbose_given; i++) @@ -445,7 +510,7 @@ int main(int argc, char **argv) { // struct event_base *base; // if ((base = event_base_new()) == NULL) DIE("event_base_new() fail"); - // evthread_use_pthreads(); + //evthread_use_pthreads(); // if ((evdns = evdns_base_new(base, 1)) == 0) DIE("evdns"); @@ -470,8 +535,14 @@ int main(int argc, char **argv) { pthread_barrier_init(&barrier, NULL, options.threads); vector servers; - for (unsigned int s = 0; s < args.server_given; s++) - servers.push_back(name_to_ipaddr(string(args.server_arg[s]))); + for (unsigned int s = 0; s < args.server_given; s++) { + if (options.unix_socket || args.use_shm_given) { + servers.push_back(string(args.server_arg[s])); + } else { + servers.push_back(name_to_ipaddr(string(args.server_arg[s]))); + } + } + ConnectionStats stats; @@ -583,23 +654,61 @@ int main(int argc, char **argv) { if (!args.scan_given && !args.loadonly_given) { stats.print_header(); - stats.print_stats("read", stats.get_sampler); - stats.print_stats("update", 
stats.set_sampler); - stats.print_stats("op_q", stats.op_sampler); + stats.print_stats("read ", stats.get_sampler); + stats.print_stats("read_l1 ", stats.get_l1_sampler); + stats.print_stats("read_l2 ", stats.get_l2_sampler); + stats.print_stats("update_l1", stats.set_l1_sampler); + stats.print_stats("update_l2", stats.set_l2_sampler); + stats.print_stats("op_q ", stats.op_sampler); - int total = stats.gets + stats.sets; + int total = stats.gets_l1 + stats.gets_l2 + stats.sets_l1 + stats.sets_l2; printf("\nTotal QPS = %.1f (%d / %.1fs)\n", total / (stats.stop - stats.start), total, stats.stop - stats.start); + + int rtotal = stats.gets + stats.sets; + printf("\nTotal RPS = %.1f (%d / %.1fs)\n", + rtotal / (stats.stop - stats.start), + rtotal, stats.stop - stats.start); if (args.search_given && peak_qps > 0.0) printf("Peak QPS = %.1f\n", peak_qps); printf("\n"); - printf("Misses = %" PRIu64 " (%.1f%%)\n", stats.get_misses, - (double) stats.get_misses/stats.gets*100); + printf("GET Misses = %" PRIu64 " (%.1f%%)\n", stats.get_misses, + (double) stats.get_misses/(stats.gets)*100); + if (servers.size() == 2) { + int64_t additional = 0; + if (stats.delete_misses_l2 > 0) { + additional = stats.delete_misses_l2 - stats.set_excl_hits_l1; + fprintf(stderr,"delete misses_l2 %lu, delete hits_l2 %lu, excl_set_l1_hits: %lu\n",stats.delete_misses_l2,stats.delete_hits_l2,stats.set_excl_hits_l1); + if (additional < 0) { + fprintf(stderr,"additional misses is neg! 
%ld\n",additional); + additional = 0; + } + } + + for (int i = 0; i < 40; i++) { + fprintf(stderr,"class %d, gets: %lu, sets: %lu\n",i,stats.gets_cid[i],stats.sets_cid[i]); + } + //printf("Misses (L1) = %" PRIu64 " (%.1f%%)\n", stats.get_misses_l1 + stats.set_misses_l1, + // (double) (stats.get_misses_l1 + stats.set_misses_l1) /(stats.gets + stats.sets)*100); + printf("Misses (L1) = %" PRIu64 " (%.1f%%)\n", stats.get_misses_l1 , + (double) (stats.get_misses_l1) /(stats.gets)*100); + printf("SET Misses (L1) = %" PRIu64 " (%.1f%%)\n", stats.set_misses_l1 , + (double) (stats.set_misses_l1) /(stats.sets)*100); + //printf("Misses (L2) = %" PRIu64 " (%.1f%%)\n", stats.get_misses_l2, + // (double) (stats.get_misses_l2) /(stats.gets)*100); + printf("L2 Writes = %" PRIu64 " (%.1f%%)\n", stats.sets_l2, + (double) stats.sets_l2/(stats.gets+stats.sets)*100); + + printf("Incl WBs = %" PRIu64 " (%.1f%%)\n", stats.incl_wbs, + (double) stats.incl_wbs/(stats.gets+stats.sets)*100); + printf("Excl WBs = %" PRIu64 " (%.1f%%)\n", stats.excl_wbs, + (double) stats.excl_wbs/(stats.gets+stats.sets)*100); + } printf("Skipped TXs = %" PRIu64 " (%.1f%%)\n\n", stats.skips, (double) stats.skips / total * 100); @@ -642,7 +751,7 @@ int main(int argc, char **argv) { void go(const vector& servers, options_t& options, ConnectionStats &stats #ifdef HAVE_LIBZMQ -, zmq::socket_t* socket +, zmq::socket_t* socketz #endif ) { #ifdef HAVE_LIBZMQ @@ -651,8 +760,53 @@ void go(const vector& servers, options_t& options, } #endif + //std::vector*> trace_queue; // = (ConcurrentQueue**)malloc(sizeof(ConcurrentQueue) + std::vector*> *trace_queue = new std::vector*>(); + // = (ConcurrentQueue**)malloc(sizeof(ConcurrentQueue) + //std::vector *mutexes = new std::vector(); + pthread_mutex_t *g_lock = (pthread_mutex_t*)malloc(sizeof(pthread_mutex_t)); + *g_lock = PTHREAD_MUTEX_INITIALIZER; + + unordered_map> *g_wb_keys = new unordered_map>(); + + for (int i = 0; i <= options.apps; i++) { + // 
//trace_queue.push_back(new ConcurrentQueue(2000000)); + // pthread_mutex_t *lock = (pthread_mutex_t*)malloc(sizeof(pthread_mutex_t)); + // *lock = PTHREAD_MUTEX_INITIALIZER; + // mutexes->push_back(lock); + trace_queue->push_back(new std::queue()); + } + pthread_mutex_init(&reader_l, NULL); + pthread_cond_init(&reader_ready, NULL); + + //ConcurrentQueue *trace_queue = new ConcurrentQueue(20000000); + struct reader_data *rdata = (struct reader_data*)malloc(sizeof(struct reader_data)); + rdata->trace_queue = trace_queue; + //rdata->mutexes = mutexes; + rdata->twitter_trace = options.twitter_trace; + pthread_t rtid; + if (options.read_file) { + rdata->trace_filename = new string(options.file_name); + int error = 0; + if ((error = pthread_create(&rtid, NULL,reader_thread,rdata)) != 0) { + printf("reader thread failed to be created with error code %d\n", error); + } + pthread_mutex_lock(&reader_l); + while (reader_not_ready) + pthread_cond_wait(&reader_ready,&reader_l); + pthread_mutex_unlock(&reader_l); + + } + + /* initialize item locks */ + uint32_t item_lock_count = hashsize(item_lock_hashpower); + item_locks = (pthread_mutex_t*)calloc(item_lock_count, sizeof(pthread_mutex_t)); + for (size_t i = 0; i < item_lock_count; i++) { + pthread_mutex_init(&item_locks[i], NULL); + } + + if (options.threads > 1) { - pthread_t pt[options.threads]; struct thread_data td[options.threads]; #ifdef __clang__ vector* ts = static_cast*>(alloca(sizeof(vector) * options.threads)); @@ -664,10 +818,15 @@ void go(const vector& servers, options_t& options, int current_cpu = -1; #endif + for (int t = 0; t < options.threads; t++) { td[t].options = &options; + td[t].id = t; + td[t].trace_queue = trace_queue; + td[t].g_lock = g_lock; + td[t].g_wb_keys = g_wb_keys; #ifdef HAVE_LIBZMQ - td[t].socket = socket; + td[t].socketz = socketz; #endif if (t == 0) td[t].master = true; else td[t].master = false; @@ -711,24 +870,31 @@ void go(const vector& servers, options_t& options, if 
(pthread_create(&pt[t], &attr, thread_main, &td[t])) DIE("pthread_create() failed"); + usleep(t); } for (int t = 0; t < options.threads; t++) { ConnectionStats *cs; if (pthread_join(pt[t], (void**) &cs)) DIE("pthread_join() failed"); stats.accumulate(*cs); + delete cs; } + for (int i = 1; i <= 2; i++) { + fprintf(stderr,"max issue buf n[%d]: %u\n",i,max_n[i]); + } + //delete trace_queue; + } else if (options.threads == 1) { - do_mutilate(servers, options, stats, true + do_mutilate(servers, options, stats, trace_queue, g_lock, g_wb_keys, true #ifdef HAVE_LIBZMQ -, socket +, socketz #endif ); } else { #ifdef HAVE_LIBZMQ if (args.agent_given) { - sync_agent(socket); + sync_agent(socketz); } #endif } @@ -746,14 +912,427 @@ void go(const vector& servers, options_t& options, #endif } +int stick_this_thread_to_core(int core_id) { + int num_cores = sysconf(_SC_NPROCESSORS_ONLN); + if (core_id < 0 || core_id >= num_cores) + return EINVAL; + + cpu_set_t cpuset; + CPU_ZERO(&cpuset); + CPU_SET(core_id, &cpuset); + + pthread_t current_thread = pthread_self(); + return pthread_setaffinity_np(current_thread, sizeof(cpu_set_t), &cpuset); +} + +bool hasEnding (string const &fullString, string const &ending) { + if (fullString.length() >= ending.length()) { + return (0 == fullString.compare (fullString.length() - ending.length(), ending.length(), ending)); + } else { + return false; + } +} + +static char *get_stream(ZSTD_DCtx* dctx, FILE *fin, size_t const buffInSize, void* const buffIn, size_t const buffOutSize, void* const buffOut) { + /* This loop assumes that the input file is one or more concatenated zstd + * streams. This example won't work if there is trailing non-zstd data at + * the end, but streaming decompression in general handles this case. + * ZSTD_decompressStream() returns 0 exactly when the frame is completed, + * and doesn't consume input after the frame. 
+ */ + size_t const toRead = buffInSize; + size_t read; + size_t lastRet = 0; + int isEmpty = 1; + if ( (read = fread_orDie(buffIn, toRead, fin)) ) { + isEmpty = 0; + ZSTD_inBuffer input = { buffIn, read, 0 }; + /* Given a valid frame, zstd won't consume the last byte of the frame + * until it has flushed all of the decompressed data of the frame. + * Therefore, instead of checking if the return code is 0, we can + * decompress just check if input.pos < input.size. + */ + char *trace = (char*)malloc(buffOutSize*2); + memset(trace,0,buffOutSize+1); + size_t tracelen = buffOutSize+1; + size_t total = 0; + while (input.pos < input.size) { + ZSTD_outBuffer output = { buffOut, buffOutSize, 0 }; + /* The return code is zero if the frame is complete, but there may + * be multiple frames concatenated together. Zstd will automatically + * reset the context when a frame is complete. Still, calling + * ZSTD_DCtx_reset() can be useful to reset the context to a clean + * state, for instance if the last decompression call returned an + * error. + */ + + size_t const ret = ZSTD_decompressStream(dctx, &output , &input); + + if (output.pos + total > tracelen) { + trace = (char*)realloc(trace,(output.pos+total+1)); + tracelen = (output.pos+total+1); + } + strncat(trace,(const char*)buffOut,output.pos); + total += output.pos; + + lastRet = ret; + } + int idx = total; + while (trace[idx] != '\n') { + idx--; + } + trace[idx] = 0; + trace[idx+1] = 0; + return trace; + + } + + if (isEmpty) { + fprintf(stderr, "input is empty\n"); + return NULL; + } + + if (lastRet != 0) { + /* The last return value from ZSTD_decompressStream did not end on a + * frame, but we reached the end of the file! We assume this is an + * error, and the input was truncated. 
+ */ + fprintf(stderr, "EOF before end of stream: %zu\n", lastRet); + exit(1); + } + return NULL; + +} + +void* reader_thread(void *arg) { + struct reader_data *rdata = (struct reader_data *) arg; + //std::vector*> trace_queue = (std::vector*>) rdata->trace_queue; + std::vector*> *trace_queue = (std::vector*>*) rdata->trace_queue; + // std::vector *mutexes = (std::vector*) rdata->mutexes; + int twitter_trace = rdata->twitter_trace; + string fn = *(rdata->trace_filename); + srand(time(NULL)); + if (hasEnding(fn,".zst")) { + string blobfile = fs::path( fn ).filename(); + blobfile.erase(blobfile.length()-4); + blobfile.insert(0,"/dev/shm/"); + blobfile.append(".data"); + int do_blob = 0; + int blob = 0; + if (do_blob) { + blob = open(blobfile.c_str(),O_CREAT | O_APPEND | O_RDWR, S_IRWXU); + } + //init + const char *filename = fn.c_str(); + FILE* const fin = fopen_orDie(filename, "rb"); + size_t const buffInSize = ZSTD_DStreamInSize()*1000; + void* const buffIn = malloc_orDie(buffInSize); + size_t const buffOutSize = ZSTD_DStreamOutSize()*1000; + void* const buffOut = malloc_orDie(buffOutSize); + + map key_hist; + ZSTD_DCtx* const dctx = ZSTD_createDCtx(); + //CHECK(dctx != NULL, "ZSTD_createDCtx() failed!"); + //char *leftover = malloc(buffOutSize); + //memset(leftover,0,buffOutSize); + //char *trace = (char*)decompress(filename); + uint64_t nwrites = 0; + uint64_t nout = 1; + int batch = 0; + int cappid = 1; + fprintf(stderr,"%lu trace queues for connections\n",trace_queue->size()); + char *trace = get_stream(dctx, fin, buffInSize, buffIn, buffOutSize, buffOut); + while (trace != NULL) { + char *ftrace = trace; + char *line = NULL; + char *line_p = (char*)calloc(2048,sizeof(char)); + while ((line = strsep(&trace,"\n"))) { + strncpy(line_p,line,2048); + string full_line(line); + //check the appid + int appid = 0; + int first = 1; + if (full_line.length() > 10) { + + if (trace_queue->size() > 0) { + stringstream ss(full_line); + string rT; + string rApp; + string rKey; 
+ string rOp; + string rvaluelen; + Operation *Op = new Operation; + if (twitter_trace == 1) { + string rKeySize; + size_t n = std::count(full_line.begin(), full_line.end(), ','); + if (n == 6) { + getline( ss, rT, ',' ); + getline( ss, rKey, ',' ); + getline( ss, rKeySize, ',' ); + getline( ss, rvaluelen, ',' ); + getline( ss, rApp, ',' ); + getline( ss, rOp, ',' ); + if (rOp.compare("get") == 0) { + Op->type = Operation::GET; + } else if (rOp.compare("set") == 0) { + Op->type = Operation::SET; + } + if (rvaluelen.compare("") == 0 || rvaluelen.size() < 1 || rvaluelen.empty()) { + continue; + } + appid = cappid; + if (nout % 1000 == 0) { + cappid++; + cappid = cappid % trace_queue->size(); + if (cappid == 0) cappid = 1; + } + //appid = stoi(rApp) % trace_queue->size(); + if (appid == 0) appid = 1; + //appid = (rand() % (trace_queue->size()-1)) + 1; + //if (appid == 0) appid = 1; + + + } else { + continue; + } + + } + else if (twitter_trace == 2) { + size_t n = std::count(full_line.begin(), full_line.end(), ','); + if (n == 4) { + getline( ss, rT, ','); + getline( ss, rApp, ','); + getline( ss, rOp, ',' ); + getline( ss, rKey, ',' ); + getline( ss, rvaluelen, ',' ); + int ot = stoi(rOp); + switch (ot) { + case 1: + Op->type = Operation::GET; + break; + case 2: + Op->type = Operation::SET; + break; + } + appid = (stoi(rApp)) % trace_queue->size(); + if (appid == 0) appid = 1; + //appid = (nout) % trace_queue->size(); + } else { + continue; + } + } + else if (twitter_trace == 3) { + size_t n = std::count(full_line.begin(), full_line.end(), ','); + if (n == 4) { + getline( ss, rT, ','); + getline( ss, rApp, ','); + getline( ss, rOp, ',' ); + getline( ss, rKey, ',' ); + getline( ss, rvaluelen, ',' ); + int ot = stoi(rOp); + switch (ot) { + case 1: + Op->type = Operation::GET; + break; + case 2: + Op->type = Operation::SET; + break; + } + //if (first) { + // appid = (rand() % (trace_queue->size()-1)) + 1; + // if (appid == 0) appid = 1; + // first = 0; + //} + //batch++; 
+ appid = (rand() % (trace_queue->size()-1)) + 1; + if (appid == 0) appid = 1; + } else { + continue; + } + } + else if (twitter_trace == 4) { + size_t n = std::count(full_line.begin(), full_line.end(), ','); + if (n == 4) { + getline( ss, rT, ','); + getline( ss, rKey, ',' ); + getline( ss, rOp, ',' ); + getline( ss, rvaluelen, ',' ); + int ot = stoi(rOp); + switch (ot) { + case 1: + Op->type = Operation::GET; + break; + case 2: + Op->type = Operation::SET; + break; + } + if (rvaluelen == "0") { + rvaluelen = "50000"; + } + + appid = (rand() % (trace_queue->size()-1)) + 1; + if (appid == 0) appid = 1; + } else { + continue; + } + } + int vl = stoi(rvaluelen); + if (appid < (int)trace_queue->size() && vl < 524000 && vl > 1) { + Op->valuelen = vl; + strncpy(Op->key,rKey.c_str(),255);; + if (Op->type == Operation::GET) { + //find when was last read + Operation *last_op = key_hist[rKey]; + if (last_op != NULL) { + last_op->future = 1; //THE FUTURE IS NOW + Op->curr = 1; + Op->future = 0; + key_hist[rKey] = Op; + g_key_hist[rKey] = 1; + } else { + //first ref + Op->curr = 1; + Op->future = 0; + key_hist[rKey] = Op; + g_key_hist[rKey] = 0; + } + } + Op->appid = appid; + trace_queue->at(appid)->push(Op); + //g_trace_queue.enqueue(Op); + //if (twitter_trace == 3) { // && batch == 2) { + // appid = (rand() % (trace_queue->size()-1)) + 1; + // if (appid == 0) appid = 1; + // batch = 0; + //} + } + } else { + fprintf(stderr,"big error!\n"); + } + } + //bool res = trace_queue[appid]->try_enqueue(full_line); + //while (!res) { + // //usleep(10); + // //res = trace_queue[appid]->try_enqueue(full_line); + // nwrites++; + //} + nout++; + if (nout % 1000000 == 0) fprintf(stderr,"decompressed requests: %lu, waits: %lu\n",nout,nwrites); + + } + free(line_p); + free(ftrace); + trace = get_stream(dctx, fin, buffInSize, buffIn, buffOutSize, buffOut); + } + + for (int i = 0; i < 10; i++) { + for (int j = 0; j < (int)trace_queue->size(); j++) { + //trace_queue[j]->enqueue(eof); + 
Operation *eof = new Operation; + eof->type = Operation::SASL; + eof->appid = j; + trace_queue->at(j)->push(eof); + //g_trace_queue.enqueue(eof); + if (i == 0) { + fprintf(stderr,"appid %d, tq size: %ld\n",j,trace_queue->at(j)->size()); + } + } + } + if (do_blob) { + for (int i = 0; i < (int)trace_queue->size(); i++) { + queue tmp = *(trace_queue->at(i)); + while (!tmp.empty()) { + Operation *Op = tmp.front(); + int br = write(blob,(void*)(Op),sizeof(Operation)); + if (br != sizeof(Operation)) { + fprintf(stderr,"error writing op!\n"); + } + tmp.pop(); + } + + } + } + + pthread_mutex_lock(&reader_l); + if (reader_not_ready) { + reader_not_ready = 0; + } + pthread_mutex_unlock(&reader_l); + pthread_cond_signal(&reader_ready); + if (trace) { + free(trace); + } + ZSTD_freeDCtx(dctx); + fclose_orDie(fin); + free(buffIn); + free(buffOut); + + + } else if (hasEnding(fn,".data")) { + ifstream trace_file (fn, ios::in | ios::binary); + uint32_t treqs = 0; + char *ops = (char*)malloc(sizeof(Operation)*1000000); + Operation *optr = (Operation*)(ops); + while (trace_file.good()) { + trace_file.read((char*)ops,sizeof(Operation)*1000000); + int tbytes = trace_file.gcount(); + int tops = tbytes/sizeof(Operation); + for (int i = 0; i < tops; i++) { + Operation *op = (Operation*)optr; + string rKey = string(op->key); + g_key_hist[rKey] = 0; + if (op->future) { + g_key_hist[rKey] = 1; + } + trace_queue->at(op->appid)->push(op); + treqs++; + if (treqs % 1000000 == 0) fprintf(stderr,"loaded requests: %u\n",treqs); + optr++; + + } + optr = (Operation*)ops; + } + trace_file.close(); + + pthread_mutex_lock(&reader_l); + if (reader_not_ready) { + reader_not_ready = 0; + } + pthread_mutex_unlock(&reader_l); + pthread_cond_signal(&reader_ready); + + } + //else { + + //ifstream trace_file; + //trace_file.open(rdata->trace_filename); + //while (trace_file.good()) { + // string line; + // getline(trace_file,line); + // trace_queue->enqueue(line); + //} + //string eof = "EOF"; + //for (int i = 
0; i < 1000; i++) { + // trace_queue->enqueue(eof); + //} + //} + + return NULL; +} + void* thread_main(void *arg) { struct thread_data *td = (struct thread_data *) arg; - + int num_cores = sysconf(_SC_NPROCESSORS_ONLN); + //int res = stick_this_thread_to_core(td->id % num_cores); + //if (res != 0) { + // DIE("pthread_attr_setaffinity_np(%d) failed: %s", + // td->id, strerror(res)); + //} ConnectionStats *cs = new ConnectionStats(); - do_mutilate(*td->servers, *td->options, *cs, td->master + do_mutilate(*td->servers, *td->options, *cs, td->trace_queue, td->g_lock, td->g_wb_keys, td->master #ifdef HAVE_LIBZMQ -, td->socket +, td->socketz #endif ); @@ -761,9 +1340,9 @@ void* thread_main(void *arg) { } void do_mutilate(const vector& servers, options_t& options, - ConnectionStats& stats, bool master + ConnectionStats& stats, vector*> *trace_queue, pthread_mutex_t* g_lock, unordered_map> *g_wb_keys, bool master #ifdef HAVE_LIBZMQ -, zmq::socket_t* socket +, zmq::socket_t* socketz #endif ) { int loop_flag = @@ -775,136 +1354,251 @@ void do_mutilate(const vector& servers, options_t& options, struct evdns_base *evdns; struct event_config *config; + if ((config = event_config_new()) == NULL) DIE("event_config_new() fail"); #ifdef HAVE_DECL_EVENT_BASE_FLAG_PRECISE_TIMER if (event_config_set_flag(config, EVENT_BASE_FLAG_PRECISE_TIMER)) - DIE("event_config_set_flag(EVENT_BASE_FLAG_PRECISE_TIMER) fail"); + DIE("event_config_set_flag(EVENT_BASE_FLAG_PRECISE_TIMER) fail"); #endif if ((base = event_base_new_with_config(config)) == NULL) DIE("event_base_new() fail"); - // evthread_use_pthreads(); + //evthread_use_pthreads(); if ((evdns = evdns_base_new(base, 1)) == 0) DIE("evdns"); // event_base_priority_init(base, 2); // FIXME: May want to move this to after all connections established. - double start = get_time(); - double now = start; - vector connections; - vector server_lead; - for (auto s: servers) { - // Split args.server_arg[s] into host:port using strtok(). 
- char *s_copy = new char[s.length() + 1]; - strcpy(s_copy, s.c_str()); + if (servers.size() == 1) { + vector connections; + vector server_lead; + for (auto s: servers) { + // Split args.server_arg[s] into host:port using strtok(). + char *s_copy = new char[s.length() + 1]; + strcpy(s_copy, s.c_str()); + + char *h_ptr = strtok_r(s_copy, ":", &saveptr); + char *p_ptr = strtok_r(NULL, ":", &saveptr); + + if (h_ptr == NULL) DIE("strtok(.., \":\") failed to parse %s", s.c_str()); + + string hostname = h_ptr; + string port = "11211"; + if (p_ptr) port = p_ptr; + + delete[] s_copy; + + int conns = args.measure_connections_given ? args.measure_connections_arg : + options.connections; + + srand(time(NULL)); + for (int c = 0; c <= conns; c++) { + Connection* conn = new Connection(base, evdns, hostname, port, options, + //NULL,//trace_queue, + args.agentmode_given ? false : + true); + int tries = 120; + int connected = 0; + int s = 2; + for (int i = 0; i < tries; i++) { + int ret = conn->do_connect(); + if (ret) { + connected = 1; + fprintf(stderr,"thread %lu, conn: %d, connected!\n",pthread_self(),c+1); + break; + } + int d = s + rand() % 100; + //s = s + d; + + //fprintf(stderr,"conn: %d, sleeping %d\n",c,d); + sleep(d); + } + if (connected) { + //fprintf(stderr,"cid %d gets trace_queue\nfirst: %s",conn->get_cid(),trace_queue->at(conn->get_cid())->front().c_str()); + //conn->set_queue(trace_queue->at(conn->get_cid())); + //conn->set_lock(mutexes->at(conn->get_cid())); + connections.push_back(conn); + } else { + fprintf(stderr,"conn: %d, not connected!!\n",c); + + } + if (c == 0) server_lead.push_back(conn); + } + } + double start = get_time(); + double now = start; + + // Wait for all Connections to become IDLE. + while (1) { + // FIXME: If all connections become ready before event_base_loop + // is called, this will deadlock. 
+ event_base_loop(base, EVLOOP_ONCE); - char *h_ptr = strtok_r(s_copy, ":", &saveptr); - char *p_ptr = strtok_r(NULL, ":", &saveptr); + bool restart = false; + for (Connection *conn: connections) + if (!conn->is_ready()) restart = true; - if (h_ptr == NULL) DIE("strtok(.., \":\") failed to parse %s", s.c_str()); + if (restart) continue; + else break; + } - string hostname = h_ptr; - string port = "11211"; - if (p_ptr) port = p_ptr; + // Load database on lead connection for each server. + if (!options.noload) { + V("Loading database."); - delete[] s_copy; + for (auto c: server_lead) c->start_loading(); - int conns = args.measure_connections_given ? args.measure_connections_arg : - options.connections; + // Wait for all Connections to become IDLE. + while (1) { + // FIXME: If all connections become ready before event_base_loop + // is called, this will deadlock. + event_base_loop(base, EVLOOP_ONCE); - for (int c = 0; c < conns; c++) { - Connection* conn = new Connection(base, evdns, hostname, port, options, - args.agentmode_given ? false : - true); - connections.push_back(conn); - if (c == 0) server_lead.push_back(conn); + bool restart = false; + for (Connection *conn: connections) + if (!conn->is_ready()) restart = true; + + if (restart) continue; + else break; + } } - } - // Wait for all Connections to become IDLE. - while (1) { - // FIXME: If all connections become ready before event_base_loop - // is called, this will deadlock. - event_base_loop(base, EVLOOP_ONCE); + if (options.loadonly) { + evdns_base_free(evdns, 0); + event_base_free(base); + return; + } - bool restart = false; - for (Connection *conn: connections) - if (!conn->is_ready()) restart = true; + // FIXME: Remove. Not needed, testing only. + // // FIXME: Synchronize start_time here across threads/nodes. + // pthread_barrier_wait(&barrier); - if (restart) continue; - else break; - } + // Warmup connection. 
+ if (options.warmup > 0) { + if (master) V("Warmup start."); - // Load database on lead connection for each server. - if (!options.noload) { - V("Loading database."); +#ifdef HAVE_LIBZMQ + if (args.agent_given || args.agentmode_given) { + if (master) V("Synchronizing."); - for (auto c: server_lead) c->start_loading(); + // 1. thread barrier: make sure our threads ready before syncing agents + // 2. sync agents: all threads across all agents are now ready + // 3. thread barrier: don't release our threads until all agents ready + pthread_barrier_wait(&barrier); + if (master) sync_agent(socketz); + pthread_barrier_wait(&barrier); - // Wait for all Connections to become IDLE. - while (1) { - // FIXME: If all connections become ready before event_base_loop - // is called, this will deadlock. - event_base_loop(base, EVLOOP_ONCE); + if (master) V("Synchronized."); + } +#endif + + int old_time = options.time; + // options.time = 1; + + start = get_time(); + for (Connection *conn: connections) { + conn->start_time = start; + conn->options.time = options.warmup; + conn->start(); // Kick the Connection into motion. + } + + while (1) { + event_base_loop(base, loop_flag); + + //#ifdef USE_CLOCK_GETTIME + // now = get_time(); + //#else + struct timeval now_tv; + event_base_gettimeofday_cached(base, &now_tv); + now = tv_to_double(&now_tv); + //#endif + + bool restart = false; + for (Connection *conn: connections) + if (!conn->check_exit_condition(now)) + restart = true; + + if (restart) continue; + else break; + } bool restart = false; for (Connection *conn: connections) if (!conn->is_ready()) restart = true; - if (restart) continue; - else break; + if (restart) { + + // Wait for all Connections to become IDLE. + while (1) { + // FIXME: If there were to use EVLOOP_ONCE and all connections + // become ready before event_base_loop is called, this will + // deadlock. We should check for IDLE before calling + // event_base_loop. 
+ event_base_loop(base, EVLOOP_ONCE); // EVLOOP_NONBLOCK); + + bool restart = false; + for (Connection *conn: connections) + if (!conn->is_ready()) restart = true; + + if (restart) continue; + else break; + } + } + + for (Connection *conn: connections) { + conn->reset(); + conn->options.time = old_time; + } + + if (master) V("Warmup stop."); } - } - if (options.loadonly) { - evdns_base_free(evdns, 0); - event_base_free(base); - return; - } - // FIXME: Remove. Not needed, testing only. - // // FIXME: Synchronize start_time here across threads/nodes. - // pthread_barrier_wait(&barrier); + // FIXME: Synchronize start_time here across threads/nodes. + pthread_barrier_wait(&barrier); - // Warmup connection. - if (options.warmup > 0) { - if (master) V("Warmup start."); + if (master && args.wait_given) { + if (get_time() < boot_time + args.wait_arg) { + double t = (boot_time + args.wait_arg)-get_time(); + V("Sleeping %.1fs for -W.", t); + sleep_time(t); + } + } #ifdef HAVE_LIBZMQ if (args.agent_given || args.agentmode_given) { if (master) V("Synchronizing."); - // 1. thread barrier: make sure our threads ready before syncing agents - // 2. sync agents: all threads across all agents are now ready - // 3. thread barrier: don't release our threads until all agents ready pthread_barrier_wait(&barrier); - if (master) sync_agent(socket); + if (master) sync_agent(socketz); pthread_barrier_wait(&barrier); if (master) V("Synchronized."); } #endif - int old_time = options.time; - // options.time = 1; + if (master && !args.scan_given && !args.search_given) + V("started at %f", get_time()); start = get_time(); for (Connection *conn: connections) { conn->start_time = start; - conn->options.time = options.warmup; conn->start(); // Kick the Connection into motion. } + // V("Start = %f", start); + + // Main event loop. 
while (1) { event_base_loop(base, loop_flag); - //#ifdef USE_CLOCK_GETTIME - // now = get_time(); + //#if USE_CLOCK_GETTIME + // now = get_time(); //#else struct timeval now_tv; event_base_gettimeofday_cached(base, &now_tv); @@ -920,108 +1614,613 @@ void do_mutilate(const vector& servers, options_t& options, else break; } - bool restart = false; - for (Connection *conn: connections) - if (!conn->is_ready()) restart = true; + if (master && !args.scan_given && !args.search_given) + V("stopped at %f options.time = %d", get_time(), options.time); + + // Tear-down and accumulate stats. + for (Connection *conn: connections) { + stats.accumulate(conn->stats); + delete conn; + } + + stats.start = start; + stats.stop = now; - if (restart) { + event_config_free(config); + evdns_base_free(evdns, 0); + event_base_free(base); + } else if (servers.size() == 2 && !(args.approx_given || args.approx_batch_given || args.use_shm_given || args.use_shm_batch_given)) { + vector connections; + vector server_lead; + string hostname1 = servers[0]; + string hostname2 = servers[1]; + string port = "11211"; + + int conns = args.measure_connections_given ? 
args.measure_connections_arg : + options.connections; + + srand(time(NULL)); + for (int c = 0; c < conns; c++) { + + int fd1 = -1; + + if ( (fd1 = socket(AF_UNIX, SOCK_STREAM, 0)) == -1) { + perror("socket error"); + exit(-1); + } + + struct sockaddr_un sin1; + memset(&sin1, 0, sizeof(sin1)); + sin1.sun_family = AF_LOCAL; + strcpy(sin1.sun_path, hostname1.c_str()); + + fcntl(fd1, F_SETFL, O_NONBLOCK); /* Change the socket into non-blocking state */ + int addrlen; + addrlen = sizeof(sin1); + + int max_tries = 50; + int n_tries = 0; + int s = 10; + while (connect(fd1, (struct sockaddr*)&sin1, addrlen) == -1) { + perror("l1 connect error"); + if (n_tries++ > max_tries) { + fprintf(stderr,"conn l1 %d unable to connect after sleep for %d\n",c+1,s); + exit(-1); + } + int d = s + rand() % 100; + usleep(d); + s = (int)((double)s*1.25); + } + + int fd2 = -1; + if ( (fd2 = socket(AF_UNIX, SOCK_STREAM, 0)) == -1) { + perror("l2 socket error"); + exit(-1); + } + struct sockaddr_un sin2; + memset(&sin2, 0, sizeof(sin2)); + sin2.sun_family = AF_LOCAL; + strcpy(sin2.sun_path, hostname2.c_str()); + fcntl(fd2, F_SETFL, O_NONBLOCK); /* Change the socket into non-blocking state */ + addrlen = sizeof(sin2); + n_tries = 0; + s = 10; + while (connect(fd2, (struct sockaddr*)&sin2, addrlen) == -1) { + perror("l2 connect error"); + if (n_tries++ > max_tries) { + fprintf(stderr,"conn l2 %d unable to connect after sleep for %d\n",c+1,s); + exit(-1); + } + int d = s + rand() % 100; + usleep(d); + s = (int)((double)s*1.25); + } + + + ConnectionMulti* conn = new ConnectionMulti(base, evdns, + hostname1, hostname2, port, options,args.agentmode_given ? 
false : true, fd1, fd2); + + int connected = 0; + if (conn) { + connected = 1; + } + int cid = conn->get_cid(); + + if (connected) { + fprintf(stderr,"cid %d gets l1 fd %d l2 fd %d\n",cid,fd1,fd2); + fprintf(stderr,"cid %d gets trace_queue\nfirst: %s\n",cid,trace_queue->at(cid)->front()->key); + if (g_lock != NULL) { + conn->set_g_wbkeys(g_wb_keys); + conn->set_lock(g_lock); + } + conn->set_queue(trace_queue->at(cid)); + connections.push_back(conn); + } else { + fprintf(stderr,"conn multi: %d, not connected!!\n",c); + + } + } + + // wait for all threads to reach here + pthread_barrier_wait(&barrier); + + fprintf(stderr,"thread %ld gtg\n",pthread_self()); // Wait for all Connections to become IDLE. while (1) { - // FIXME: If there were to use EVLOOP_ONCE and all connections - // become ready before event_base_loop is called, this will - // deadlock. We should check for IDLE before calling - // event_base_loop. - event_base_loop(base, EVLOOP_ONCE); // EVLOOP_NONBLOCK); + // FIXME: If all connections become ready before event_base_loop + // is called, this will deadlock. + event_base_loop(base, EVLOOP_ONCE); bool restart = false; - for (Connection *conn: connections) + for (ConnectionMulti *conn: connections) if (!conn->is_ready()) restart = true; if (restart) continue; else break; } + + + + double start = get_time(); + double now = start; + for (ConnectionMulti *conn: connections) { + conn->start_time = start; + conn->start(); // Kick the Connection into motion. + } + //fprintf(stderr,"Start = %f\n", start); + + // Main event loop. 
+ while (1) { + event_base_loop(base, loop_flag); + struct timeval now_tv; + event_base_gettimeofday_cached(base, &now_tv); + now = tv_to_double(&now_tv); + + bool restart = false; + for (ConnectionMulti *conn: connections) { + if (!conn->check_exit_condition(now)) { + restart = true; + } + } + if (restart) continue; + else break; + } - for (Connection *conn: connections) { - conn->reset(); - conn->options.time = old_time; + + // V("Start = %f", start); + + if (master && !args.scan_given && !args.search_given) + V("stopped at %f options.time = %d", get_time(), options.time); + + // Tear-down and accumulate stats. + for (ConnectionMulti *conn: connections) { + stats.accumulate(conn->stats); + delete conn; } - if (master) V("Warmup stop."); - } + stats.start = start; + stats.stop = now; + + event_config_free(config); + evdns_base_free(evdns, 0); + event_base_free(base); + } else if (servers.size() == 2 && args.approx_given) { + vector connections; + vector server_lead; + + string hostname1 = servers[0]; + string hostname2 = servers[1]; + string port = "11211"; + + int conns = args.measure_connections_given ? args.measure_connections_arg : + options.connections; + + srand(time(NULL)); + for (int c = 0; c < conns; c++) { + + int fd1 = -1; + if ( (fd1 = socket(AF_UNIX, SOCK_STREAM, 0)) == -1) { + perror("socket error"); + exit(-1); + } - // FIXME: Synchronize start_time here across threads/nodes. 
- pthread_barrier_wait(&barrier); + struct sockaddr_un sin1; + memset(&sin1, 0, sizeof(sin1)); + sin1.sun_family = AF_LOCAL; + strcpy(sin1.sun_path, hostname1.c_str()); + + fcntl(fd1, F_SETFL, O_NONBLOCK); /* Change the socket into non-blocking state */ + int addrlen; + addrlen = sizeof(sin1); + + int max_tries = 50; + int n_tries = 0; + int s = 10; + while (connect(fd1, (struct sockaddr*)&sin1, addrlen) == -1) { + perror("l1 connect error"); + if (n_tries++ > max_tries) { + fprintf(stderr,"conn l1 %d unable to connect after sleep for %d\n",c+1,s); + exit(-1); + } + int d = s + rand() % 100; + usleep(d); + s = (int)((double)s*1.25); + } + + int fd2 = -1; + if ( (fd2 = socket(AF_UNIX, SOCK_STREAM, 0)) == -1) { + perror("l2 socket error"); + exit(-1); + } + struct sockaddr_un sin2; + memset(&sin2, 0, sizeof(sin2)); + sin2.sun_family = AF_LOCAL; + strcpy(sin2.sun_path, hostname2.c_str()); + fcntl(fd2, F_SETFL, O_NONBLOCK); /* Change the socket into non-blocking state */ + addrlen = sizeof(sin2); + n_tries = 0; + s = 10; + while (connect(fd2, (struct sockaddr*)&sin2, addrlen) == -1) { + perror("l2 connect error"); + if (n_tries++ > max_tries) { + fprintf(stderr,"conn l2 %d unable to connect after sleep for %d\n",c+1,s); + exit(-1); + } + int d = s + rand() % 100; + usleep(d); + s = (int)((double)s*1.25); + } - if (master && args.wait_given) { - if (get_time() < boot_time + args.wait_arg) { - double t = (boot_time + args.wait_arg)-get_time(); - V("Sleeping %.1fs for -W.", t); - sleep_time(t); + + ConnectionMultiApprox* conn = new ConnectionMultiApprox(base, evdns, + hostname1, hostname2, port, options,args.agentmode_given ? 
false : true, fd1, fd2); + + int connected = 0; + if (conn) { + connected = 1; + } + int cid = conn->get_cid(); + + if (connected) { + fprintf(stderr,"cid %d gets l1 fd %d l2 fd %d\n",cid,fd1,fd2); + fprintf(stderr,"cid %d gets trace_queue\nfirst: %s\n",cid,trace_queue->at(cid)->front()->key); + if (g_lock != NULL) { + conn->set_g_wbkeys(g_wb_keys); + conn->set_lock(g_lock); + } + conn->set_queue(trace_queue->at(cid)); + connections.push_back(conn); + } else { + fprintf(stderr,"conn multi: %d, not connected!!\n",c); + + } } - } + + // wait for all threads to reach here + pthread_barrier_wait(&barrier); -#ifdef HAVE_LIBZMQ - if (args.agent_given || args.agentmode_given) { - if (master) V("Synchronizing."); + fprintf(stderr,"thread %ld gtg\n",pthread_self()); + // Wait for all Connections to become IDLE. + while (1) { + // FIXME: If all connections become ready before event_base_loop + // is called, this will deadlock. + event_base_loop(base, EVLOOP_ONCE); + + bool restart = false; + for (ConnectionMultiApprox *conn: connections) + if (!conn->is_ready()) restart = true; + if (restart) continue; + else break; + } + + + + double start = get_time(); + double now = start; + for (ConnectionMultiApprox *conn: connections) { + conn->start_time = start; + conn->start(); // Kick the Connection into motion. + } + //fprintf(stderr,"Start = %f\n", start); + + // Main event loop. + while (1) { + event_base_loop(base, loop_flag); + struct timeval now_tv; + event_base_gettimeofday_cached(base, &now_tv); + now = tv_to_double(&now_tv); + + bool restart = false; + for (ConnectionMultiApprox *conn: connections) { + if (!conn->check_exit_condition(now)) { + restart = true; + } + } + if (restart) continue; + else break; + + } + + + // V("Start = %f", start); + + if (master && !args.scan_given && !args.search_given) + V("stopped at %f options.time = %d", get_time(), options.time); + + // Tear-down and accumulate stats. 
+ for (ConnectionMultiApprox *conn: connections) { + stats.accumulate(conn->stats); + delete conn; + } + + stats.start = start; + stats.stop = now; + + event_config_free(config); + evdns_base_free(evdns, 0); + event_base_free(base); + + } else if (servers.size() == 2 && args.approx_batch_given) { + vector connections; + vector server_lead; + + string hostname1 = servers[0]; + string hostname2 = servers[1]; + string port = "11211"; + + int conns = args.measure_connections_given ? args.measure_connections_arg : + options.connections; + + srand(time(NULL)); + for (int c = 0; c < conns; c++) { + + int fd1 = -1; + + if ( (fd1 = socket(AF_UNIX, SOCK_STREAM, 0)) == -1) { + perror("socket error"); + exit(-1); + } + + struct sockaddr_un sin1; + memset(&sin1, 0, sizeof(sin1)); + sin1.sun_family = AF_LOCAL; + strcpy(sin1.sun_path, hostname1.c_str()); + + fcntl(fd1, F_SETFL, O_NONBLOCK); /* Change the socket into non-blocking state */ + int addrlen; + addrlen = sizeof(sin1); + + int max_tries = 50; + int n_tries = 0; + int s = 10; + while (connect(fd1, (struct sockaddr*)&sin1, addrlen) == -1) { + perror("l1 connect error"); + if (n_tries++ > max_tries) { + fprintf(stderr,"conn l1 %d unable to connect after sleep for %d\n",c+1,s); + exit(-1); + } + int d = s + rand() % 100; + usleep(d); + s = (int)((double)s*1.25); + } + + int fd2 = -1; + if ( (fd2 = socket(AF_UNIX, SOCK_STREAM, 0)) == -1) { + perror("l2 socket error"); + exit(-1); + } + struct sockaddr_un sin2; + memset(&sin2, 0, sizeof(sin2)); + sin2.sun_family = AF_LOCAL; + strcpy(sin2.sun_path, hostname2.c_str()); + fcntl(fd2, F_SETFL, O_NONBLOCK); /* Change the socket into non-blocking state */ + addrlen = sizeof(sin2); + n_tries = 0; + s = 10; + while (connect(fd2, (struct sockaddr*)&sin2, addrlen) == -1) { + perror("l2 connect error"); + if (n_tries++ > max_tries) { + fprintf(stderr,"conn l2 %d unable to connect after sleep for %d\n",c+1,s); + exit(-1); + } + int d = s + rand() % 100; + usleep(d); + s = 
(int)((double)s*1.25); + } + + + ConnectionMultiApproxBatch* conn = new ConnectionMultiApproxBatch(base, evdns, + hostname1, hostname2, port, options,args.agentmode_given ? false : true, fd1, fd2); + + int connected = 0; + if (conn) { + connected = 1; + } + int cid = conn->get_cid(); + + if (connected) { + fprintf(stderr,"cid %d gets l1 fd %d l2 fd %d\n",cid,fd1,fd2); + fprintf(stderr,"cid %d gets trace_queue\nfirst: %s\n",cid,trace_queue->at(cid)->front()->key); + if (g_lock != NULL) { + conn->set_g_wbkeys(g_wb_keys); + conn->set_lock(g_lock); + } + conn->set_queue(trace_queue->at(cid)); + connections.push_back(conn); + } else { + fprintf(stderr,"conn multi: %d, not connected!!\n",c); + + } + } + + // wait for all threads to reach here pthread_barrier_wait(&barrier); - if (master) sync_agent(socket); + + fprintf(stderr,"thread %ld gtg\n",pthread_self()); + // Wait for all Connections to become IDLE. + while (1) { + // FIXME: If all connections become ready before event_base_loop + // is called, this will deadlock. + event_base_loop(base, EVLOOP_ONCE); + + bool restart = false; + for (ConnectionMultiApproxBatch *conn: connections) + if (!conn->is_ready()) restart = true; + + if (restart) continue; + else break; + } + + + + double start = get_time(); + double now = start; + for (ConnectionMultiApproxBatch *conn: connections) { + conn->start_time = start; + conn->start(); // Kick the Connection into motion. + } + //fprintf(stderr,"Start = %f\n", start); + + // Main event loop. 
+ while (1) { + event_base_loop(base, loop_flag); + struct timeval now_tv; + event_base_gettimeofday_cached(base, &now_tv); + now = tv_to_double(&now_tv); + + bool restart = false; + for (ConnectionMultiApproxBatch *conn: connections) { + if (!conn->check_exit_condition(now)) { + restart = true; + } + } + if (restart) continue; + else { + for (ConnectionMultiApproxBatch *conn: connections) { + fprintf(stderr,"tid %ld, cid: %d\n",pthread_self(),conn->get_cid()); + } + break; + } + + } + + + // V("Start = %f", start); + + if (master && !args.scan_given && !args.search_given) + V("stopped at %f options.time = %d", get_time(), options.time); + + // Tear-down and accumulate stats. + for (ConnectionMultiApproxBatch *conn: connections) { + stats.accumulate(conn->stats); + delete conn; + } + + stats.start = start; + stats.stop = now; + + event_config_free(config); + evdns_base_free(evdns, 0); + event_base_free(base); + } else if (servers.size() == 2 && args.use_shm_given) { + vector connections; + + int conns = args.measure_connections_given ? args.measure_connections_arg : + options.connections; + + srand(time(NULL)); + for (int c = 0; c < conns; c++) { + + + ConnectionMultiApproxShm* conn = new ConnectionMultiApproxShm(options,args.agentmode_given ? 
false : true); + int connected = 0; + if (conn && conn->do_connect()) { + connected = 1; + } + int cid = conn->get_cid(); + + if (connected) { + fprintf(stderr,"cid %d gets trace_queue\nfirst: %s\n",cid,trace_queue->at(cid)->front()->key); + if (g_lock != NULL) { + conn->set_g_wbkeys(g_wb_keys); + conn->set_lock(g_lock); + } + conn->set_queue(trace_queue->at(cid)); + connections.push_back(conn); + } else { + fprintf(stderr,"conn multi: %d, not connected!!\n",c); + + } + } + + // wait for all threads to reach here pthread_barrier_wait(&barrier); + double start = get_time(); + fprintf(stderr,"Start = %f\n", start); + double now = start; + for (ConnectionMultiApproxShm *conn: connections) { + conn->start_time = now; + conn->drive_write_machine_shm(now); + } - if (master) V("Synchronized."); - } -#endif - if (master && !args.scan_given && !args.search_given) - V("started at %f", get_time()); - start = get_time(); - for (Connection *conn: connections) { - conn->start_time = start; - conn->start(); // Kick the Connection into motion. - } + if (master && !args.scan_given && !args.search_given) + V("stopped at %f options.time = %d", get_time(), options.time); - // V("Start = %f", start); + // Tear-down and accumulate stats. + for (ConnectionMultiApproxShm *conn: connections) { + stats.accumulate(conn->stats); + delete conn; + } + double stop = get_time(); + fprintf(stderr,"Stop = %f\n", stop); + stats.start = start; + stats.stop = stop; - // Main event loop. - while (1) { - event_base_loop(base, loop_flag); - //#if USE_CLOCK_GETTIME - // now = get_time(); - //#else - struct timeval now_tv; - event_base_gettimeofday_cached(base, &now_tv); - now = tv_to_double(&now_tv); - //#endif + } else if (servers.size() == 2 && args.use_shm_batch_given) { + vector connections; - bool restart = false; - for (Connection *conn: connections) - if (!conn->check_exit_condition(now)) - restart = true; + int conns = args.measure_connections_given ? 
args.measure_connections_arg : + options.connections; - if (restart) continue; - else break; - } + srand(time(NULL)); + for (int c = 0; c < conns; c++) { - if (master && !args.scan_given && !args.search_given) - V("stopped at %f options.time = %d", get_time(), options.time); - // Tear-down and accumulate stats. - for (Connection *conn: connections) { - stats.accumulate(conn->stats); - delete conn; - } + ConnectionMultiApproxBatchShm* conn = new ConnectionMultiApproxBatchShm(options,args.agentmode_given ? false : true); + int connected = 0; + if (conn && conn->do_connect()) { + connected = 1; + } + int cid = conn->get_cid(); + + if (connected) { + fprintf(stderr,"cid %d gets trace_queue\nfirst: %s\n",cid,trace_queue->at(cid)->front()->key); + if (g_lock != NULL) { + conn->set_g_wbkeys(g_wb_keys); + conn->set_lock(g_lock); + } + conn->set_queue(trace_queue->at(cid)); + connections.push_back(conn); + } else { + fprintf(stderr,"conn multi: %d, not connected!!\n",c); + + } + } + + // wait for all threads to reach here + pthread_barrier_wait(&barrier); + double start = get_time(); + fprintf(stderr,"Start = %f\n", start); + double now = start; + for (ConnectionMultiApproxBatchShm *conn: connections) { + conn->start_time = now; + conn->drive_write_machine_shm(now); + } + - stats.start = start; - stats.stop = now; - event_config_free(config); - evdns_base_free(evdns, 0); - event_base_free(base); + if (master && !args.scan_given && !args.search_given) + V("stopped at %f options.time = %d", get_time(), options.time); + + // Tear-down and accumulate stats. 
+ for (ConnectionMultiApproxBatchShm *conn: connections) { + stats.accumulate(conn->stats); + delete conn; + } + double stop = get_time(); + fprintf(stderr,"Stop = %f\n", stop); + stats.start = start; + stats.stop = stop; + + + } } void args_to_options(options_t* options) { @@ -1032,6 +2231,16 @@ void args_to_options(options_t* options) { options->threads = args.threads_arg; options->server_given = args.server_given; options->roundrobin = args.roundrobin_given; + options->apps = args.apps_arg; + options->rand_admit = args.rand_admit_arg; + options->threshold = args.threshold_arg; + options->wb_all = args.wb_all_arg; + options->ratelimit = args.ratelimit_given; + options->v1callback = args.v1callback_given; + if (args.inclusives_given) { + memset(options->inclusives,0,256); + strncpy(options->inclusives,args.inclusives_arg,256); + } int connections = options->connections; if (options->roundrobin) { @@ -1058,7 +2267,38 @@ void args_to_options(options_t* options) { // else options->records = args.records_arg / options->server_given; + options->queries = args.queries_arg / options->server_given; + + options->misswindow = args.misswindow_arg; + + options->use_assoc = args.assoc_given; + options->assoc = args.assoc_arg; + options->twitter_trace = args.twitter_trace_arg; + + options->unix_socket = args.unix_socket_given; + options->miss_through = args.miss_through_given; + options->successful_queries = args.successful_given; options->binary = args.binary_given; + options->redis = args.redis_given; + + if (options->use_assoc && !options->redis) + DIE("assoc must be used with redis"); + + options->read_file = args.read_file_given; + if (args.read_file_given) + strcpy(options->file_name, args.read_file_arg); + + if (args.prefix_given) + strcpy(options->prefix,args.prefix_arg); + + //getset mode (first issue get, then set same key if miss) + options->getset = args.getset_given; + options->getsetorset = args.getsetorset_given; + //delete 90 percent of keys after halfway + 
//model workload in Rumble and Ousterhout - log structured memory + //for dram based storage + options->delete90 = args.delete90_given; + options->sasl = args.username_given; if (args.password_given) diff --git a/update_readme.sh b/update_readme.sh old mode 100644 new mode 100755 diff --git a/zstd.h b/zstd.h new file mode 100644 index 0000000..222339d --- /dev/null +++ b/zstd.h @@ -0,0 +1,2450 @@ +/* + * Copyright (c) 2016-2021, Yann Collet, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. + */ +#if defined (__cplusplus) +extern "C" { +#endif + +#ifndef ZSTD_H_235446 +#define ZSTD_H_235446 + +/* ====== Dependency ======*/ +#include /* INT_MAX */ +#include /* size_t */ + + +/* ===== ZSTDLIB_API : control library symbols visibility ===== */ +#ifndef ZSTDLIB_VISIBILITY +# if defined(__GNUC__) && (__GNUC__ >= 4) +# define ZSTDLIB_VISIBILITY __attribute__ ((visibility ("default"))) +# else +# define ZSTDLIB_VISIBILITY +# endif +#endif +#if defined(ZSTD_DLL_EXPORT) && (ZSTD_DLL_EXPORT==1) +# define ZSTDLIB_API __declspec(dllexport) ZSTDLIB_VISIBILITY +#elif defined(ZSTD_DLL_IMPORT) && (ZSTD_DLL_IMPORT==1) +# define ZSTDLIB_API __declspec(dllimport) ZSTDLIB_VISIBILITY /* It isn't required but allows to generate better code, saving a function pointer load from the IAT and an indirect jump.*/ +#else +# define ZSTDLIB_API ZSTDLIB_VISIBILITY +#endif + + +/******************************************************************************* + Introduction + + zstd, short for Zstandard, is a fast lossless compression algorithm, targeting + real-time compression scenarios at zlib-level and better compression ratios. 
+ The zstd compression library provides in-memory compression and decompression + functions. + + The library supports regular compression levels from 1 up to ZSTD_maxCLevel(), + which is currently 22. Levels >= 20, labeled `--ultra`, should be used with + caution, as they require more memory. The library also offers negative + compression levels, which extend the range of speed vs. ratio preferences. + The lower the level, the faster the speed (at the cost of compression). + + Compression can be done in: + - a single step (described as Simple API) + - a single step, reusing a context (described as Explicit context) + - unbounded multiple steps (described as Streaming compression) + + The compression ratio achievable on small data can be highly improved using + a dictionary. Dictionary compression can be performed in: + - a single step (described as Simple dictionary API) + - a single step, reusing a dictionary (described as Bulk-processing + dictionary API) + + Advanced experimental functions can be accessed using + `#define ZSTD_STATIC_LINKING_ONLY` before including zstd.h. + + Advanced experimental APIs should never be used with a dynamically-linked + library. They are not "stable"; their definitions or signatures may change in + the future. Only static linking is allowed. +*******************************************************************************/ + +/*------ Version ------*/ +#define ZSTD_VERSION_MAJOR 1 +#define ZSTD_VERSION_MINOR 4 +#define ZSTD_VERSION_RELEASE 9 +#define ZSTD_VERSION_NUMBER (ZSTD_VERSION_MAJOR *100*100 + ZSTD_VERSION_MINOR *100 + ZSTD_VERSION_RELEASE) + +/*! ZSTD_versionNumber() : + * Return runtime library version, the value is (MAJOR*100*100 + MINOR*100 + RELEASE). 
*/ +ZSTDLIB_API unsigned ZSTD_versionNumber(void); + +#define ZSTD_LIB_VERSION ZSTD_VERSION_MAJOR.ZSTD_VERSION_MINOR.ZSTD_VERSION_RELEASE +#define ZSTD_QUOTE(str) #str +#define ZSTD_EXPAND_AND_QUOTE(str) ZSTD_QUOTE(str) +#define ZSTD_VERSION_STRING ZSTD_EXPAND_AND_QUOTE(ZSTD_LIB_VERSION) + +/*! ZSTD_versionString() : + * Return runtime library version, like "1.4.5". Requires v1.3.0+. */ +ZSTDLIB_API const char* ZSTD_versionString(void); + +/* ************************************* + * Default constant + ***************************************/ +#ifndef ZSTD_CLEVEL_DEFAULT +# define ZSTD_CLEVEL_DEFAULT 3 +#endif + +/* ************************************* + * Constants + ***************************************/ + +/* All magic numbers are supposed read/written to/from files/memory using little-endian convention */ +#define ZSTD_MAGICNUMBER 0xFD2FB528 /* valid since v0.8.0 */ +#define ZSTD_MAGIC_DICTIONARY 0xEC30A437 /* valid since v0.7.0 */ +#define ZSTD_MAGIC_SKIPPABLE_START 0x184D2A50 /* all 16 values, from 0x184D2A50 to 0x184D2A5F, signal the beginning of a skippable frame */ +#define ZSTD_MAGIC_SKIPPABLE_MASK 0xFFFFFFF0 + +#define ZSTD_BLOCKSIZELOG_MAX 17 +#define ZSTD_BLOCKSIZE_MAX (1<= `ZSTD_compressBound(srcSize)`. + * @return : compressed size written into `dst` (<= `dstCapacity), + * or an error code if it fails (which can be tested using ZSTD_isError()). */ +ZSTDLIB_API size_t ZSTD_compress( void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + int compressionLevel); + +/*! ZSTD_decompress() : + * `compressedSize` : must be the _exact_ size of some number of compressed and/or skippable frames. + * `dstCapacity` is an upper bound of originalSize to regenerate. + * If user cannot imply a maximum upper bound, it's better to use streaming mode to decompress data. + * @return : the number of bytes decompressed into `dst` (<= `dstCapacity`), + * or an errorCode if it fails (which can be tested using ZSTD_isError()). 
*/ +ZSTDLIB_API size_t ZSTD_decompress( void* dst, size_t dstCapacity, + const void* src, size_t compressedSize); + +/*! ZSTD_getFrameContentSize() : requires v1.3.0+ + * `src` should point to the start of a ZSTD encoded frame. + * `srcSize` must be at least as large as the frame header. + * hint : any size >= `ZSTD_frameHeaderSize_max` is large enough. + * @return : - decompressed size of `src` frame content, if known + * - ZSTD_CONTENTSIZE_UNKNOWN if the size cannot be determined + * - ZSTD_CONTENTSIZE_ERROR if an error occurred (e.g. invalid magic number, srcSize too small) + * note 1 : a 0 return value means the frame is valid but "empty". + * note 2 : decompressed size is an optional field, it may not be present, typically in streaming mode. + * When `return==ZSTD_CONTENTSIZE_UNKNOWN`, data to decompress could be any size. + * In which case, it's necessary to use streaming mode to decompress data. + * Optionally, application can rely on some implicit limit, + * as ZSTD_decompress() only needs an upper bound of decompressed size. + * (For example, data could be necessarily cut into blocks <= 16 KB). + * note 3 : decompressed size is always present when compression is completed using single-pass functions, + * such as ZSTD_compress(), ZSTD_compressCCtx() ZSTD_compress_usingDict() or ZSTD_compress_usingCDict(). + * note 4 : decompressed size can be very large (64-bits value), + * potentially larger than what local system can handle as a single memory segment. + * In which case, it's necessary to use streaming mode to decompress data. + * note 5 : If source is untrusted, decompressed size could be wrong or intentionally modified. + * Always ensure return value fits within application's authorized limits. + * Each application can set its own limits. 
+ * note 6 : This function replaces ZSTD_getDecompressedSize() */ +#define ZSTD_CONTENTSIZE_UNKNOWN (0ULL - 1) +#define ZSTD_CONTENTSIZE_ERROR (0ULL - 2) +ZSTDLIB_API unsigned long long ZSTD_getFrameContentSize(const void *src, size_t srcSize); + +/*! ZSTD_getDecompressedSize() : + * NOTE: This function is now obsolete, in favor of ZSTD_getFrameContentSize(). + * Both functions work the same way, but ZSTD_getDecompressedSize() blends + * "empty", "unknown" and "error" results to the same return value (0), + * while ZSTD_getFrameContentSize() gives them separate return values. + * @return : decompressed size of `src` frame content _if known and not empty_, 0 otherwise. */ +ZSTDLIB_API unsigned long long ZSTD_getDecompressedSize(const void* src, size_t srcSize); + +/*! ZSTD_findFrameCompressedSize() : + * `src` should point to the start of a ZSTD frame or skippable frame. + * `srcSize` must be >= first frame size + * @return : the compressed size of the first frame starting at `src`, + * suitable to pass as `srcSize` to `ZSTD_decompress` or similar, + * or an error code if input is invalid */ +ZSTDLIB_API size_t ZSTD_findFrameCompressedSize(const void* src, size_t srcSize); + + +/*====== Helper functions ======*/ +#define ZSTD_COMPRESSBOUND(srcSize) ((srcSize) + ((srcSize)>>8) + (((srcSize) < (128<<10)) ? 
(((128<<10) - (srcSize)) >> 11) /* margin, from 64 to 0 */ : 0)) /* this formula ensures that bound(A) + bound(B) <= bound(A+B) as long as A and B >= 128 KB */ +ZSTDLIB_API size_t ZSTD_compressBound(size_t srcSize); /*!< maximum compressed size in worst case single-pass scenario */ +ZSTDLIB_API unsigned ZSTD_isError(size_t code); /*!< tells if a `size_t` function result is an error code */ +ZSTDLIB_API const char* ZSTD_getErrorName(size_t code); /*!< provides readable string from an error code */ +ZSTDLIB_API int ZSTD_minCLevel(void); /*!< minimum negative compression level allowed */ +ZSTDLIB_API int ZSTD_maxCLevel(void); /*!< maximum compression level available */ + + +/*************************************** +* Explicit context +***************************************/ +/*= Compression context + * When compressing many times, + * it is recommended to allocate a context just once, + * and re-use it for each successive compression operation. + * This will make workload friendlier for system's memory. + * Note : re-using context is just a speed / resource optimization. + * It doesn't change the compression ratio, which remains identical. + * Note 2 : In multi-threaded environments, + * use one different context per thread for parallel execution. + */ +typedef struct ZSTD_CCtx_s ZSTD_CCtx; +ZSTDLIB_API ZSTD_CCtx* ZSTD_createCCtx(void); +ZSTDLIB_API size_t ZSTD_freeCCtx(ZSTD_CCtx* cctx); + +/*! ZSTD_compressCCtx() : + * Same as ZSTD_compress(), using an explicit ZSTD_CCtx. + * Important : in order to behave similarly to `ZSTD_compress()`, + * this function compresses at requested compression level, + * __ignoring any other parameter__ . + * If any advanced parameter was set using the advanced API, + * they will all be reset. Only `compressionLevel` remains. 
+ */ +ZSTDLIB_API size_t ZSTD_compressCCtx(ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + int compressionLevel); + +/*= Decompression context + * When decompressing many times, + * it is recommended to allocate a context only once, + * and re-use it for each successive compression operation. + * This will make workload friendlier for system's memory. + * Use one context per thread for parallel execution. */ +typedef struct ZSTD_DCtx_s ZSTD_DCtx; +ZSTDLIB_API ZSTD_DCtx* ZSTD_createDCtx(void); +ZSTDLIB_API size_t ZSTD_freeDCtx(ZSTD_DCtx* dctx); + +/*! ZSTD_decompressDCtx() : + * Same as ZSTD_decompress(), + * requires an allocated ZSTD_DCtx. + * Compatible with sticky parameters. + */ +ZSTDLIB_API size_t ZSTD_decompressDCtx(ZSTD_DCtx* dctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize); + + +/*************************************** +* Advanced compression API +***************************************/ + +/* API design : + * Parameters are pushed one by one into an existing context, + * using ZSTD_CCtx_set*() functions. + * Pushed parameters are sticky : they are valid for next compressed frame, and any subsequent frame. + * "sticky" parameters are applicable to `ZSTD_compress2()` and `ZSTD_compressStream*()` ! + * __They do not apply to "simple" one-shot variants such as ZSTD_compressCCtx()__ . + * + * It's possible to reset all parameters to "default" using ZSTD_CCtx_reset(). + * + * This API supercedes all other "advanced" API entry points in the experimental section. + * In the future, we expect to remove from experimental API entry points which are redundant with this API. + */ + + +/* Compression strategies, listed from fastest to strongest */ +typedef enum { ZSTD_fast=1, + ZSTD_dfast=2, + ZSTD_greedy=3, + ZSTD_lazy=4, + ZSTD_lazy2=5, + ZSTD_btlazy2=6, + ZSTD_btopt=7, + ZSTD_btultra=8, + ZSTD_btultra2=9 + /* note : new strategies _might_ be added in the future. 
+ Only the order (from fast to strong) is guaranteed */ +} ZSTD_strategy; + + +typedef enum { + + /* compression parameters + * Note: When compressing with a ZSTD_CDict these parameters are superseded + * by the parameters used to construct the ZSTD_CDict. + * See ZSTD_CCtx_refCDict() for more info (superseded-by-cdict). */ + ZSTD_c_compressionLevel=100, /* Set compression parameters according to pre-defined cLevel table. + * Note that exact compression parameters are dynamically determined, + * depending on both compression level and srcSize (when known). + * Default level is ZSTD_CLEVEL_DEFAULT==3. + * Special: value 0 means default, which is controlled by ZSTD_CLEVEL_DEFAULT. + * Note 1 : it's possible to pass a negative compression level. + * Note 2 : setting a level does not automatically set all other compression parameters + * to default. Setting this will however eventually dynamically impact the compression + * parameters which have not been manually set. The manually set + * ones will 'stick'. */ + /* Advanced compression parameters : + * It's possible to pin down compression parameters to some specific values. + * In which case, these values are no longer dynamically selected by the compressor */ + ZSTD_c_windowLog=101, /* Maximum allowed back-reference distance, expressed as power of 2. + * This will set a memory budget for streaming decompression, + * with larger values requiring more memory + * and typically compressing more. + * Must be clamped between ZSTD_WINDOWLOG_MIN and ZSTD_WINDOWLOG_MAX. + * Special: value 0 means "use default windowLog". + * Note: Using a windowLog greater than ZSTD_WINDOWLOG_LIMIT_DEFAULT + * requires explicitly allowing such size at streaming decompression stage. */ + ZSTD_c_hashLog=102, /* Size of the initial probe table, as a power of 2. + * Resulting memory usage is (1 << (hashLog+2)). + * Must be clamped between ZSTD_HASHLOG_MIN and ZSTD_HASHLOG_MAX. 
+ * Larger tables improve compression ratio of strategies <= dFast, + * and improve speed of strategies > dFast. + * Special: value 0 means "use default hashLog". */ + ZSTD_c_chainLog=103, /* Size of the multi-probe search table, as a power of 2. + * Resulting memory usage is (1 << (chainLog+2)). + * Must be clamped between ZSTD_CHAINLOG_MIN and ZSTD_CHAINLOG_MAX. + * Larger tables result in better and slower compression. + * This parameter is useless for "fast" strategy. + * It's still useful when using "dfast" strategy, + * in which case it defines a secondary probe table. + * Special: value 0 means "use default chainLog". */ + ZSTD_c_searchLog=104, /* Number of search attempts, as a power of 2. + * More attempts result in better and slower compression. + * This parameter is useless for "fast" and "dFast" strategies. + * Special: value 0 means "use default searchLog". */ + ZSTD_c_minMatch=105, /* Minimum size of searched matches. + * Note that Zstandard can still find matches of smaller size, + * it just tweaks its search algorithm to look for this size and larger. + * Larger values increase compression and decompression speed, but decrease ratio. + * Must be clamped between ZSTD_MINMATCH_MIN and ZSTD_MINMATCH_MAX. + * Note that currently, for all strategies < btopt, effective minimum is 4. + * , for all strategies > fast, effective maximum is 6. + * Special: value 0 means "use default minMatchLength". */ + ZSTD_c_targetLength=106, /* Impact of this field depends on strategy. + * For strategies btopt, btultra & btultra2: + * Length of Match considered "good enough" to stop search. + * Larger values make compression stronger, and slower. + * For strategy fast: + * Distance between match sampling. + * Larger values make compression faster, and weaker. + * Special: value 0 means "use default targetLength". */ + ZSTD_c_strategy=107, /* See ZSTD_strategy enum definition. 
+ * The higher the value of selected strategy, the more complex it is, + * resulting in stronger and slower compression. + * Special: value 0 means "use default strategy". */ + + /* LDM mode parameters */ + ZSTD_c_enableLongDistanceMatching=160, /* Enable long distance matching. + * This parameter is designed to improve compression ratio + * for large inputs, by finding large matches at long distance. + * It increases memory usage and window size. + * Note: enabling this parameter increases default ZSTD_c_windowLog to 128 MB + * except when expressly set to a different value. + * Note: will be enabled by default if ZSTD_c_windowLog >= 128 MB and + * compression strategy >= ZSTD_btopt (== compression level 16+) */ + ZSTD_c_ldmHashLog=161, /* Size of the table for long distance matching, as a power of 2. + * Larger values increase memory usage and compression ratio, + * but decrease compression speed. + * Must be clamped between ZSTD_HASHLOG_MIN and ZSTD_HASHLOG_MAX + * default: windowlog - 7. + * Special: value 0 means "automatically determine hashlog". */ + ZSTD_c_ldmMinMatch=162, /* Minimum match size for long distance matcher. + * Larger/too small values usually decrease compression ratio. + * Must be clamped between ZSTD_LDM_MINMATCH_MIN and ZSTD_LDM_MINMATCH_MAX. + * Special: value 0 means "use default value" (default: 64). */ + ZSTD_c_ldmBucketSizeLog=163, /* Log size of each bucket in the LDM hash table for collision resolution. + * Larger values improve collision resolution but decrease compression speed. + * The maximum value is ZSTD_LDM_BUCKETSIZELOG_MAX. + * Special: value 0 means "use default value" (default: 3). */ + ZSTD_c_ldmHashRateLog=164, /* Frequency of inserting/looking up entries into the LDM hash table. + * Must be clamped between 0 and (ZSTD_WINDOWLOG_MAX - ZSTD_HASHLOG_MIN). + * Default is MAX(0, (windowLog - ldmHashLog)), optimizing hash table usage. + * Larger values improve compression speed. 
+ * Deviating far from default value will likely result in a compression ratio decrease. + * Special: value 0 means "automatically determine hashRateLog". */ + + /* frame parameters */ + ZSTD_c_contentSizeFlag=200, /* Content size will be written into frame header _whenever known_ (default:1) + * Content size must be known at the beginning of compression. + * This is automatically the case when using ZSTD_compress2(), + * For streaming scenarios, content size must be provided with ZSTD_CCtx_setPledgedSrcSize() */ + ZSTD_c_checksumFlag=201, /* A 32-bits checksum of content is written at end of frame (default:0) */ + ZSTD_c_dictIDFlag=202, /* When applicable, dictionary's ID is written into frame header (default:1) */ + + /* multi-threading parameters */ + /* These parameters are only active if multi-threading is enabled (compiled with build macro ZSTD_MULTITHREAD). + * Otherwise, trying to set any other value than default (0) will be a no-op and return an error. + * In a situation where it's unknown if the linked library supports multi-threading or not, + * setting ZSTD_c_nbWorkers to any value >= 1 and consulting the return value provides a quick way to check this property. + */ + ZSTD_c_nbWorkers=400, /* Select how many threads will be spawned to compress in parallel. + * When nbWorkers >= 1, triggers asynchronous mode when invoking ZSTD_compressStream*() : + * ZSTD_compressStream*() consumes input and flush output if possible, but immediately gives back control to caller, + * while compression is performed in parallel, within worker thread(s). + * (note : a strong exception to this rule is when first invocation of ZSTD_compressStream2() sets ZSTD_e_end : + * in which case, ZSTD_compressStream2() delegates to ZSTD_compress2(), which is always a blocking call). + * More workers improve speed, but also increase memory usage. 
+ * Default value is `0`, aka "single-threaded mode" : no worker is spawned, + * compression is performed inside Caller's thread, and all invocations are blocking */ + ZSTD_c_jobSize=401, /* Size of a compression job. This value is enforced only when nbWorkers >= 1. + * Each compression job is completed in parallel, so this value can indirectly impact the nb of active threads. + * 0 means default, which is dynamically determined based on compression parameters. + * Job size must be a minimum of overlap size, or 1 MB, whichever is largest. + * The minimum size is automatically and transparently enforced. */ + ZSTD_c_overlapLog=402, /* Control the overlap size, as a fraction of window size. + * The overlap size is an amount of data reloaded from previous job at the beginning of a new job. + * It helps preserve compression ratio, while each job is compressed in parallel. + * This value is enforced only when nbWorkers >= 1. + * Larger values increase compression ratio, but decrease speed. + * Possible values range from 0 to 9 : + * - 0 means "default" : value will be determined by the library, depending on strategy + * - 1 means "no overlap" + * - 9 means "full overlap", using a full window size. + * Each intermediate rank increases/decreases load size by a factor 2 : + * 9: full window; 8: w/2; 7: w/4; 6: w/8; 5:w/16; 4: w/32; 3:w/64; 2:w/128; 1:no overlap; 0:default + * default value varies between 6 and 9, depending on strategy */ + + /* note : additional experimental parameters are also available + * within the experimental section of the API. 
+ * At the time of this writing, they include : + * ZSTD_c_rsyncable + * ZSTD_c_format + * ZSTD_c_forceMaxWindow + * ZSTD_c_forceAttachDict + * ZSTD_c_literalCompressionMode + * ZSTD_c_targetCBlockSize + * ZSTD_c_srcSizeHint + * ZSTD_c_enableDedicatedDictSearch + * ZSTD_c_stableInBuffer + * ZSTD_c_stableOutBuffer + * ZSTD_c_blockDelimiters + * ZSTD_c_validateSequences + * Because they are not stable, it's necessary to define ZSTD_STATIC_LINKING_ONLY to access them. + * note : never ever use experimentalParam? names directly; + * also, the enums values themselves are unstable and can still change. + */ + ZSTD_c_experimentalParam1=500, + ZSTD_c_experimentalParam2=10, + ZSTD_c_experimentalParam3=1000, + ZSTD_c_experimentalParam4=1001, + ZSTD_c_experimentalParam5=1002, + ZSTD_c_experimentalParam6=1003, + ZSTD_c_experimentalParam7=1004, + ZSTD_c_experimentalParam8=1005, + ZSTD_c_experimentalParam9=1006, + ZSTD_c_experimentalParam10=1007, + ZSTD_c_experimentalParam11=1008, + ZSTD_c_experimentalParam12=1009 +} ZSTD_cParameter; + +typedef struct { + size_t error; + int lowerBound; + int upperBound; +} ZSTD_bounds; + +/*! ZSTD_cParam_getBounds() : + * All parameters must belong to an interval with lower and upper bounds, + * otherwise they will either trigger an error or be automatically clamped. + * @return : a structure, ZSTD_bounds, which contains + * - an error status field, which must be tested using ZSTD_isError() + * - lower and upper bounds, both inclusive + */ +ZSTDLIB_API ZSTD_bounds ZSTD_cParam_getBounds(ZSTD_cParameter cParam); + +/*! ZSTD_CCtx_setParameter() : + * Set one compression parameter, selected by enum ZSTD_cParameter. + * All parameters have valid bounds. Bounds can be queried using ZSTD_cParam_getBounds(). + * Providing a value beyond bound will either clamp it, or trigger an error (depending on parameter). + * Setting a parameter is generally only possible during frame initialization (before starting compression). 
+ *  Exception : when using multi-threading mode (nbWorkers >= 1),
+ *              the following parameters can be updated _during_ compression (within same frame):
+ *              => compressionLevel, hashLog, chainLog, searchLog, minMatch, targetLength and strategy.
+ *              new parameters will be active for next job only (after a flush()).
+ * @return : an error code (which can be tested using ZSTD_isError()).
+ */
+ZSTDLIB_API size_t ZSTD_CCtx_setParameter(ZSTD_CCtx* cctx, ZSTD_cParameter param, int value);
+
+/*! ZSTD_CCtx_setPledgedSrcSize() :
+ *  Total input data size to be compressed as a single frame.
+ *  Value will be written in frame header, unless explicitly forbidden using ZSTD_c_contentSizeFlag.
+ *  This value will also be controlled at end of frame, and trigger an error if not respected.
+ * @result : 0, or an error code (which can be tested with ZSTD_isError()).
+ *  Note 1 : pledgedSrcSize==0 actually means zero, aka an empty frame.
+ *           In order to mean "unknown content size", pass constant ZSTD_CONTENTSIZE_UNKNOWN.
+ *           ZSTD_CONTENTSIZE_UNKNOWN is default value for any new frame.
+ *  Note 2 : pledgedSrcSize is only valid once, for the next frame.
+ *           It's discarded at the end of the frame, and replaced by ZSTD_CONTENTSIZE_UNKNOWN.
+ *  Note 3 : Whenever all input data is provided and consumed in a single round,
+ *           for example with ZSTD_compress2(),
+ *           or invoking immediately ZSTD_compressStream2(,,,ZSTD_e_end),
+ *           this value is automatically overridden by srcSize instead.
+ */
+ZSTDLIB_API size_t ZSTD_CCtx_setPledgedSrcSize(ZSTD_CCtx* cctx, unsigned long long pledgedSrcSize);
+
+typedef enum {
+    ZSTD_reset_session_only = 1,
+    ZSTD_reset_parameters = 2,
+    ZSTD_reset_session_and_parameters = 3
+} ZSTD_ResetDirective;
+
+/*! ZSTD_CCtx_reset() :
+ *  There are 2 different things that can be reset, independently or jointly :
+ *  - The session : will stop compressing current frame, and make CCtx ready to start a new one.
+ * Useful after an error, or to interrupt any ongoing compression. + * Any internal data not yet flushed is cancelled. + * Compression parameters and dictionary remain unchanged. + * They will be used to compress next frame. + * Resetting session never fails. + * - The parameters : changes all parameters back to "default". + * This removes any reference to any dictionary too. + * Parameters can only be changed between 2 sessions (i.e. no compression is currently ongoing) + * otherwise the reset fails, and function returns an error value (which can be tested using ZSTD_isError()) + * - Both : similar to resetting the session, followed by resetting parameters. + */ +ZSTDLIB_API size_t ZSTD_CCtx_reset(ZSTD_CCtx* cctx, ZSTD_ResetDirective reset); + +/*! ZSTD_compress2() : + * Behave the same as ZSTD_compressCCtx(), but compression parameters are set using the advanced API. + * ZSTD_compress2() always starts a new frame. + * Should cctx hold data from a previously unfinished frame, everything about it is forgotten. + * - Compression parameters are pushed into CCtx before starting compression, using ZSTD_CCtx_set*() + * - The function is always blocking, returns when compression is completed. + * Hint : compression runs faster if `dstCapacity` >= `ZSTD_compressBound(srcSize)`. + * @return : compressed size written into `dst` (<= `dstCapacity), + * or an error code if it fails (which can be tested using ZSTD_isError()). + */ +ZSTDLIB_API size_t ZSTD_compress2( ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize); + + +/*************************************** +* Advanced decompression API +***************************************/ + +/* The advanced API pushes parameters one by one into an existing DCtx context. + * Parameters are sticky, and remain valid for all following frames + * using the same DCtx context. + * It's possible to reset parameters to default values using ZSTD_DCtx_reset(). 
+ * Note : This API is compatible with existing ZSTD_decompressDCtx() and ZSTD_decompressStream(). + * Therefore, no new decompression function is necessary. + */ + +typedef enum { + + ZSTD_d_windowLogMax=100, /* Select a size limit (in power of 2) beyond which + * the streaming API will refuse to allocate memory buffer + * in order to protect the host from unreasonable memory requirements. + * This parameter is only useful in streaming mode, since no internal buffer is allocated in single-pass mode. + * By default, a decompression context accepts window sizes <= (1 << ZSTD_WINDOWLOG_LIMIT_DEFAULT). + * Special: value 0 means "use default maximum windowLog". */ + + /* note : additional experimental parameters are also available + * within the experimental section of the API. + * At the time of this writing, they include : + * ZSTD_d_format + * ZSTD_d_stableOutBuffer + * ZSTD_d_forceIgnoreChecksum + * ZSTD_d_refMultipleDDicts + * Because they are not stable, it's necessary to define ZSTD_STATIC_LINKING_ONLY to access them. + * note : never ever use experimentalParam? names directly + */ + ZSTD_d_experimentalParam1=1000, + ZSTD_d_experimentalParam2=1001, + ZSTD_d_experimentalParam3=1002, + ZSTD_d_experimentalParam4=1003 + +} ZSTD_dParameter; + +/*! ZSTD_dParam_getBounds() : + * All parameters must belong to an interval with lower and upper bounds, + * otherwise they will either trigger an error or be automatically clamped. + * @return : a structure, ZSTD_bounds, which contains + * - an error status field, which must be tested using ZSTD_isError() + * - both lower and upper bounds, inclusive + */ +ZSTDLIB_API ZSTD_bounds ZSTD_dParam_getBounds(ZSTD_dParameter dParam); + +/*! ZSTD_DCtx_setParameter() : + * Set one compression parameter, selected by enum ZSTD_dParameter. + * All parameters have valid bounds. Bounds can be queried using ZSTD_dParam_getBounds(). + * Providing a value beyond bound will either clamp it, or trigger an error (depending on parameter). 
+ * Setting a parameter is only possible during frame initialization (before starting decompression). + * @return : 0, or an error code (which can be tested using ZSTD_isError()). + */ +ZSTDLIB_API size_t ZSTD_DCtx_setParameter(ZSTD_DCtx* dctx, ZSTD_dParameter param, int value); + +/*! ZSTD_DCtx_reset() : + * Return a DCtx to clean state. + * Session and parameters can be reset jointly or separately. + * Parameters can only be reset when no active frame is being decompressed. + * @return : 0, or an error code, which can be tested with ZSTD_isError() + */ +ZSTDLIB_API size_t ZSTD_DCtx_reset(ZSTD_DCtx* dctx, ZSTD_ResetDirective reset); + + +/**************************** +* Streaming +****************************/ + +typedef struct ZSTD_inBuffer_s { + const void* src; /**< start of input buffer */ + size_t size; /**< size of input buffer */ + size_t pos; /**< position where reading stopped. Will be updated. Necessarily 0 <= pos <= size */ +} ZSTD_inBuffer; + +typedef struct ZSTD_outBuffer_s { + void* dst; /**< start of output buffer */ + size_t size; /**< size of output buffer */ + size_t pos; /**< position where writing stopped. Will be updated. Necessarily 0 <= pos <= size */ +} ZSTD_outBuffer; + + + +/*-*********************************************************************** +* Streaming compression - HowTo +* +* A ZSTD_CStream object is required to track streaming operation. +* Use ZSTD_createCStream() and ZSTD_freeCStream() to create/release resources. +* ZSTD_CStream objects can be reused multiple times on consecutive compression operations. +* It is recommended to re-use ZSTD_CStream since it will play nicer with system's memory, by re-using already allocated memory. +* +* For parallel execution, use one separate ZSTD_CStream per thread. +* +* note : since v1.3.0, ZSTD_CStream and ZSTD_CCtx are the same thing. 
+* +* Parameters are sticky : when starting a new compression on the same context, +* it will re-use the same sticky parameters as previous compression session. +* When in doubt, it's recommended to fully initialize the context before usage. +* Use ZSTD_CCtx_reset() to reset the context and ZSTD_CCtx_setParameter(), +* ZSTD_CCtx_setPledgedSrcSize(), or ZSTD_CCtx_loadDictionary() and friends to +* set more specific parameters, the pledged source size, or load a dictionary. +* +* Use ZSTD_compressStream2() with ZSTD_e_continue as many times as necessary to +* consume input stream. The function will automatically update both `pos` +* fields within `input` and `output`. +* Note that the function may not consume the entire input, for example, because +* the output buffer is already full, in which case `input.pos < input.size`. +* The caller must check if input has been entirely consumed. +* If not, the caller must make some room to receive more compressed data, +* and then present again remaining input data. +* note: ZSTD_e_continue is guaranteed to make some forward progress when called, +* but doesn't guarantee maximal forward progress. This is especially relevant +* when compressing with multiple threads. The call won't block if it can +* consume some input, but if it can't it will wait for some, but not all, +* output to be flushed. +* @return : provides a minimum amount of data remaining to be flushed from internal buffers +* or an error code, which can be tested using ZSTD_isError(). +* +* At any moment, it's possible to flush whatever data might remain stuck within internal buffer, +* using ZSTD_compressStream2() with ZSTD_e_flush. `output->pos` will be updated. +* Note that, if `output->size` is too small, a single invocation with ZSTD_e_flush might not be enough (return code > 0). +* In which case, make some room to receive more compressed data, and call again ZSTD_compressStream2() with ZSTD_e_flush. 
+* You must continue calling ZSTD_compressStream2() with ZSTD_e_flush until it returns 0, at which point you can change the +* operation. +* note: ZSTD_e_flush will flush as much output as possible, meaning when compressing with multiple threads, it will +* block until the flush is complete or the output buffer is full. +* @return : 0 if internal buffers are entirely flushed, +* >0 if some data still present within internal buffer (the value is minimal estimation of remaining size), +* or an error code, which can be tested using ZSTD_isError(). +* +* Calling ZSTD_compressStream2() with ZSTD_e_end instructs to finish a frame. +* It will perform a flush and write frame epilogue. +* The epilogue is required for decoders to consider a frame completed. +* flush operation is the same, and follows same rules as calling ZSTD_compressStream2() with ZSTD_e_flush. +* You must continue calling ZSTD_compressStream2() with ZSTD_e_end until it returns 0, at which point you are free to +* start a new frame. +* note: ZSTD_e_end will flush as much output as possible, meaning when compressing with multiple threads, it will +* block until the flush is complete or the output buffer is full. +* @return : 0 if frame fully completed and fully flushed, +* >0 if some data still present within internal buffer (the value is minimal estimation of remaining size), +* or an error code, which can be tested using ZSTD_isError(). 
+*
+* *******************************************************************/
+
+typedef ZSTD_CCtx ZSTD_CStream;  /**< CCtx and CStream are now effectively same object (>= v1.3.0) */
+                                 /* Continue to distinguish them for compatibility with older versions <= v1.2.0 */
+/*===== ZSTD_CStream management functions =====*/
+ZSTDLIB_API ZSTD_CStream* ZSTD_createCStream(void);
+ZSTDLIB_API size_t ZSTD_freeCStream(ZSTD_CStream* zcs);
+
+/*===== Streaming compression functions =====*/
+typedef enum {
+    ZSTD_e_continue=0, /* collect more data, encoder decides when to output compressed result, for optimal compression ratio */
+    ZSTD_e_flush=1,    /* flush any data provided so far,
+                        * it creates (at least) one new block, that can be decoded immediately on reception;
+                        * frame will continue: any future data can still reference previously compressed data, improving compression.
+                        * note : multithreaded compression will block to flush as much output as possible. */
+    ZSTD_e_end=2       /* flush any remaining data _and_ close current frame.
+                        * note that frame is only closed after compressed data is fully flushed (return value == 0).
+                        * After that point, any additional data starts a new frame.
+                        * note : each frame is independent (does not reference any content from previous frame).
+                        * note : multithreaded compression will block to flush as much output as possible. */
+} ZSTD_EndDirective;
+
+/*! ZSTD_compressStream2() :
+ *  Behaves about the same as ZSTD_compressStream, with additional control on end directive.
+ *  - Compression parameters are pushed into CCtx before starting compression, using ZSTD_CCtx_set*()
+ *  - Compression parameters cannot be changed once compression is started (save a list of exceptions in multi-threading mode)
+ *  - output->pos must be <= dstCapacity, input->pos must be <= srcSize
+ *  - output->pos and input->pos will be updated. They are guaranteed to remain below their respective limit.
+ * - endOp must be a valid directive + * - When nbWorkers==0 (default), function is blocking : it completes its job before returning to caller. + * - When nbWorkers>=1, function is non-blocking : it copies a portion of input, distributes jobs to internal worker threads, flush to output whatever is available, + * and then immediately returns, just indicating that there is some data remaining to be flushed. + * The function nonetheless guarantees forward progress : it will return only after it reads or write at least 1+ byte. + * - Exception : if the first call requests a ZSTD_e_end directive and provides enough dstCapacity, the function delegates to ZSTD_compress2() which is always blocking. + * - @return provides a minimum amount of data remaining to be flushed from internal buffers + * or an error code, which can be tested using ZSTD_isError(). + * if @return != 0, flush is not fully completed, there is still some data left within internal buffers. + * This is useful for ZSTD_e_flush, since in this case more flushes are necessary to empty all buffers. + * For ZSTD_e_end, @return == 0 when internal buffers are fully flushed and frame is completed. + * - after a ZSTD_e_end directive, if internal buffer is not fully flushed (@return != 0), + * only ZSTD_e_end or ZSTD_e_flush operations are allowed. + * Before starting a new compression job, or changing compression parameters, + * it is required to fully flush internal buffers. + */ +ZSTDLIB_API size_t ZSTD_compressStream2( ZSTD_CCtx* cctx, + ZSTD_outBuffer* output, + ZSTD_inBuffer* input, + ZSTD_EndDirective endOp); + + +/* These buffer sizes are softly recommended. + * They are not required : ZSTD_compressStream*() happily accepts any buffer size, for both input and output. + * Respecting the recommended size just makes it a bit easier for ZSTD_compressStream*(), + * reducing the amount of memory shuffling and buffering, resulting in minor performance savings. 
+ * + * However, note that these recommendations are from the perspective of a C caller program. + * If the streaming interface is invoked from some other language, + * especially managed ones such as Java or Go, through a foreign function interface such as jni or cgo, + * a major performance rule is to reduce crossing such interface to an absolute minimum. + * It's not rare that performance ends being spent more into the interface, rather than compression itself. + * In which cases, prefer using large buffers, as large as practical, + * for both input and output, to reduce the nb of roundtrips. + */ +ZSTDLIB_API size_t ZSTD_CStreamInSize(void); /**< recommended size for input buffer */ +ZSTDLIB_API size_t ZSTD_CStreamOutSize(void); /**< recommended size for output buffer. Guarantee to successfully flush at least one complete compressed block. */ + + +/* ***************************************************************************** + * This following is a legacy streaming API. + * It can be replaced by ZSTD_CCtx_reset() and ZSTD_compressStream2(). + * It is redundant, but remains fully supported. + * Advanced parameters and dictionary compression can only be used through the + * new API. + ******************************************************************************/ + +/*! + * Equivalent to: + * + * ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only); + * ZSTD_CCtx_refCDict(zcs, NULL); // clear the dictionary (if any) + * ZSTD_CCtx_setParameter(zcs, ZSTD_c_compressionLevel, compressionLevel); + */ +ZSTDLIB_API size_t ZSTD_initCStream(ZSTD_CStream* zcs, int compressionLevel); +/*! + * Alternative for ZSTD_compressStream2(zcs, output, input, ZSTD_e_continue). + * NOTE: The return value is different. ZSTD_compressStream() returns a hint for + * the next read size (if non-zero and not an error). ZSTD_compressStream2() + * returns the minimum nb of bytes left to flush (if non-zero and not an error). 
+ */
+ZSTDLIB_API size_t ZSTD_compressStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output, ZSTD_inBuffer* input);
+/*! Equivalent to ZSTD_compressStream2(zcs, output, &emptyInput, ZSTD_e_flush). */
+ZSTDLIB_API size_t ZSTD_flushStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output);
+/*! Equivalent to ZSTD_compressStream2(zcs, output, &emptyInput, ZSTD_e_end). */
+ZSTDLIB_API size_t ZSTD_endStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output);
+
+
+/*-***************************************************************************
+* Streaming decompression - HowTo
+*
+* A ZSTD_DStream object is required to track streaming operations.
+* Use ZSTD_createDStream() and ZSTD_freeDStream() to create/release resources.
+* ZSTD_DStream objects can be re-used multiple times.
+*
+* Use ZSTD_initDStream() to start a new decompression operation.
+* @return : recommended first input size
+* Alternatively, use advanced API to set specific properties.
+*
+* Use ZSTD_decompressStream() repetitively to consume your input.
+* The function will update both `pos` fields.
+* If `input.pos < input.size`, some input has not been consumed.
+* It's up to the caller to present again remaining data.
+* The function tries to flush all data decoded immediately, respecting output buffer size.
+* If `output.pos < output.size`, decoder has flushed everything it could.
+* But if `output.pos == output.size`, there might be some data left within internal buffers.
+* In which case, call ZSTD_decompressStream() again to flush whatever remains in the buffer.
+* Note : with no additional input provided, amount of data flushed is necessarily <= ZSTD_BLOCKSIZE_MAX.
+* @return : 0 when a frame is completely decoded and fully flushed, +* or an error code, which can be tested using ZSTD_isError(), +* or any other value > 0, which means there is still some decoding or flushing to do to complete current frame : +* the return value is a suggested next input size (just a hint for better latency) +* that will never request more than the remaining frame size. +* *******************************************************************************/ + +typedef ZSTD_DCtx ZSTD_DStream; /**< DCtx and DStream are now effectively same object (>= v1.3.0) */ + /* For compatibility with versions <= v1.2.0, prefer differentiating them. */ +/*===== ZSTD_DStream management functions =====*/ +ZSTDLIB_API ZSTD_DStream* ZSTD_createDStream(void); +ZSTDLIB_API size_t ZSTD_freeDStream(ZSTD_DStream* zds); + +/*===== Streaming decompression functions =====*/ + +/* This function is redundant with the advanced API and equivalent to: + * + * ZSTD_DCtx_reset(zds, ZSTD_reset_session_only); + * ZSTD_DCtx_refDDict(zds, NULL); + */ +ZSTDLIB_API size_t ZSTD_initDStream(ZSTD_DStream* zds); + +ZSTDLIB_API size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inBuffer* input); + +ZSTDLIB_API size_t ZSTD_DStreamInSize(void); /*!< recommended size for input buffer */ +ZSTDLIB_API size_t ZSTD_DStreamOutSize(void); /*!< recommended size for output buffer. Guarantee to successfully flush at least one complete block in all circumstances. */ + + +/************************** +* Simple dictionary API +***************************/ +/*! ZSTD_compress_usingDict() : + * Compression at an explicit compression level using a Dictionary. + * A dictionary can be any arbitrary data segment (also called a prefix), + * or a buffer with specified information (see dictBuilder/zdict.h). + * Note : This function loads the dictionary, resulting in significant startup delay. + * It's intended for a dictionary used only once. 
+ * Note 2 : When `dict == NULL || dictSize < 8` no dictionary is used. */ +ZSTDLIB_API size_t ZSTD_compress_usingDict(ZSTD_CCtx* ctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + const void* dict,size_t dictSize, + int compressionLevel); + +/*! ZSTD_decompress_usingDict() : + * Decompression using a known Dictionary. + * Dictionary must be identical to the one used during compression. + * Note : This function loads the dictionary, resulting in significant startup delay. + * It's intended for a dictionary used only once. + * Note : When `dict == NULL || dictSize < 8` no dictionary is used. */ +ZSTDLIB_API size_t ZSTD_decompress_usingDict(ZSTD_DCtx* dctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + const void* dict,size_t dictSize); + + +/*********************************** + * Bulk processing dictionary API + **********************************/ +typedef struct ZSTD_CDict_s ZSTD_CDict; + +/*! ZSTD_createCDict() : + * When compressing multiple messages or blocks using the same dictionary, + * it's recommended to digest the dictionary only once, since it's a costly operation. + * ZSTD_createCDict() will create a state from digesting a dictionary. + * The resulting state can be used for future compression operations with very limited startup cost. + * ZSTD_CDict can be created once and shared by multiple threads concurrently, since its usage is read-only. + * @dictBuffer can be released after ZSTD_CDict creation, because its content is copied within CDict. + * Note 1 : Consider experimental function `ZSTD_createCDict_byReference()` if you prefer to not duplicate @dictBuffer content. + * Note 2 : A ZSTD_CDict can be created from an empty @dictBuffer, + * in which case the only thing that it transports is the @compressionLevel. + * This can be useful in a pipeline featuring ZSTD_compress_usingCDict() exclusively, + * expecting a ZSTD_CDict parameter with any data, including those without a known dictionary. 
*/ +ZSTDLIB_API ZSTD_CDict* ZSTD_createCDict(const void* dictBuffer, size_t dictSize, + int compressionLevel); + +/*! ZSTD_freeCDict() : + * Function frees memory allocated by ZSTD_createCDict(). */ +ZSTDLIB_API size_t ZSTD_freeCDict(ZSTD_CDict* CDict); + +/*! ZSTD_compress_usingCDict() : + * Compression using a digested Dictionary. + * Recommended when same dictionary is used multiple times. + * Note : compression level is _decided at dictionary creation time_, + * and frame parameters are hardcoded (dictID=yes, contentSize=yes, checksum=no) */ +ZSTDLIB_API size_t ZSTD_compress_usingCDict(ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + const ZSTD_CDict* cdict); + + +typedef struct ZSTD_DDict_s ZSTD_DDict; + +/*! ZSTD_createDDict() : + * Create a digested dictionary, ready to start decompression operation without startup delay. + * dictBuffer can be released after DDict creation, as its content is copied inside DDict. */ +ZSTDLIB_API ZSTD_DDict* ZSTD_createDDict(const void* dictBuffer, size_t dictSize); + +/*! ZSTD_freeDDict() : + * Function frees memory allocated with ZSTD_createDDict() */ +ZSTDLIB_API size_t ZSTD_freeDDict(ZSTD_DDict* ddict); + +/*! ZSTD_decompress_usingDDict() : + * Decompression using a digested Dictionary. + * Recommended when same dictionary is used multiple times. */ +ZSTDLIB_API size_t ZSTD_decompress_usingDDict(ZSTD_DCtx* dctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + const ZSTD_DDict* ddict); + + +/******************************** + * Dictionary helper functions + *******************************/ + +/*! ZSTD_getDictID_fromDict() : + * Provides the dictID stored within dictionary. + * if @return == 0, the dictionary is not conformant with Zstandard specification. + * It can still be loaded, but as a content-only dictionary. */ +ZSTDLIB_API unsigned ZSTD_getDictID_fromDict(const void* dict, size_t dictSize); + +/*! 
ZSTD_getDictID_fromDDict() :
+ * Provides the dictID of the dictionary loaded into `ddict`.
+ * If @return == 0, the dictionary is not conformant to Zstandard specification, or empty.
+ * Non-conformant dictionaries can still be loaded, but as content-only dictionaries. */
+ZSTDLIB_API unsigned ZSTD_getDictID_fromDDict(const ZSTD_DDict* ddict);
+
+/*! ZSTD_getDictID_fromFrame() :
+ * Provides the dictID required to decompress the frame stored within `src`.
+ * If @return == 0, the dictID could not be decoded.
+ * This could be for one of the following reasons :
+ * - The frame does not require a dictionary to be decoded (most common case).
+ * - The frame was built with dictID intentionally removed. Whatever dictionary is necessary is hidden information.
+ *   Note : this use case also happens when using a non-conformant dictionary.
+ * - `srcSize` is too small, and as a result, the frame header could not be decoded (only possible if `srcSize < ZSTD_FRAMEHEADERSIZE_MAX`).
+ * - This is not a Zstandard frame.
+ * When identifying the exact failure cause, it's possible to use ZSTD_getFrameHeader(), which will provide a more precise error code. */
+ZSTDLIB_API unsigned ZSTD_getDictID_fromFrame(const void* src, size_t srcSize);
+
+
+/*******************************************************************************
+ * Advanced dictionary and prefix API
+ *
+ * This API allows dictionaries to be used with ZSTD_compress2(),
+ * ZSTD_compressStream2(), and ZSTD_decompress(). Dictionaries are sticky, and
+ * only reset when the context is reset with ZSTD_reset_parameters or
+ * ZSTD_reset_session_and_parameters. Prefixes are single-use.
+ ******************************************************************************/
+
+
+/*! ZSTD_CCtx_loadDictionary() :
+ * Create an internal CDict from `dict` buffer.
+ * Decompression will have to use same dictionary.
+ * @result : 0, or an error code (which can be tested with ZSTD_isError()).
+ * Special: Loading a NULL (or 0-size) dictionary invalidates previous dictionary, + * meaning "return to no-dictionary mode". + * Note 1 : Dictionary is sticky, it will be used for all future compressed frames. + * To return to "no-dictionary" situation, load a NULL dictionary (or reset parameters). + * Note 2 : Loading a dictionary involves building tables. + * It's also a CPU consuming operation, with non-negligible impact on latency. + * Tables are dependent on compression parameters, and for this reason, + * compression parameters can no longer be changed after loading a dictionary. + * Note 3 :`dict` content will be copied internally. + * Use experimental ZSTD_CCtx_loadDictionary_byReference() to reference content instead. + * In such a case, dictionary buffer must outlive its users. + * Note 4 : Use ZSTD_CCtx_loadDictionary_advanced() + * to precisely select how dictionary content must be interpreted. */ +ZSTDLIB_API size_t ZSTD_CCtx_loadDictionary(ZSTD_CCtx* cctx, const void* dict, size_t dictSize); + +/*! ZSTD_CCtx_refCDict() : + * Reference a prepared dictionary, to be used for all next compressed frames. + * Note that compression parameters are enforced from within CDict, + * and supersede any compression parameter previously set within CCtx. + * The parameters ignored are labelled as "superseded-by-cdict" in the ZSTD_cParameter enum docs. + * The ignored parameters will be used again if the CCtx is returned to no-dictionary mode. + * The dictionary will remain valid for future compressed frames using same CCtx. + * @result : 0, or an error code (which can be tested with ZSTD_isError()). + * Special : Referencing a NULL CDict means "return to no-dictionary mode". + * Note 1 : Currently, only one dictionary can be managed. + * Referencing a new dictionary effectively "discards" any previous one. + * Note 2 : CDict is just referenced, its lifetime must outlive its usage within CCtx. 
*/ +ZSTDLIB_API size_t ZSTD_CCtx_refCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict); + +/*! ZSTD_CCtx_refPrefix() : + * Reference a prefix (single-usage dictionary) for next compressed frame. + * A prefix is **only used once**. Tables are discarded at end of frame (ZSTD_e_end). + * Decompression will need same prefix to properly regenerate data. + * Compressing with a prefix is similar in outcome as performing a diff and compressing it, + * but performs much faster, especially during decompression (compression speed is tunable with compression level). + * @result : 0, or an error code (which can be tested with ZSTD_isError()). + * Special: Adding any prefix (including NULL) invalidates any previous prefix or dictionary + * Note 1 : Prefix buffer is referenced. It **must** outlive compression. + * Its content must remain unmodified during compression. + * Note 2 : If the intention is to diff some large src data blob with some prior version of itself, + * ensure that the window size is large enough to contain the entire source. + * See ZSTD_c_windowLog. + * Note 3 : Referencing a prefix involves building tables, which are dependent on compression parameters. + * It's a CPU consuming operation, with non-negligible impact on latency. + * If there is a need to use the same prefix multiple times, consider loadDictionary instead. + * Note 4 : By default, the prefix is interpreted as raw content (ZSTD_dct_rawContent). + * Use experimental ZSTD_CCtx_refPrefix_advanced() to alter dictionary interpretation. */ +ZSTDLIB_API size_t ZSTD_CCtx_refPrefix(ZSTD_CCtx* cctx, + const void* prefix, size_t prefixSize); + +/*! ZSTD_DCtx_loadDictionary() : + * Create an internal DDict from dict buffer, + * to be used to decompress next frames. + * The dictionary remains valid for all future frames, until explicitly invalidated. + * @result : 0, or an error code (which can be tested with ZSTD_isError()). 
+ * Special : Adding a NULL (or 0-size) dictionary invalidates any previous dictionary, + * meaning "return to no-dictionary mode". + * Note 1 : Loading a dictionary involves building tables, + * which has a non-negligible impact on CPU usage and latency. + * It's recommended to "load once, use many times", to amortize the cost + * Note 2 :`dict` content will be copied internally, so `dict` can be released after loading. + * Use ZSTD_DCtx_loadDictionary_byReference() to reference dictionary content instead. + * Note 3 : Use ZSTD_DCtx_loadDictionary_advanced() to take control of + * how dictionary content is loaded and interpreted. + */ +ZSTDLIB_API size_t ZSTD_DCtx_loadDictionary(ZSTD_DCtx* dctx, const void* dict, size_t dictSize); + +/*! ZSTD_DCtx_refDDict() : + * Reference a prepared dictionary, to be used to decompress next frames. + * The dictionary remains active for decompression of future frames using same DCtx. + * + * If called with ZSTD_d_refMultipleDDicts enabled, repeated calls of this function + * will store the DDict references in a table, and the DDict used for decompression + * will be determined at decompression time, as per the dict ID in the frame. + * The memory for the table is allocated on the first call to refDDict, and can be + * freed with ZSTD_freeDCtx(). + * + * @result : 0, or an error code (which can be tested with ZSTD_isError()). + * Note 1 : Currently, only one dictionary can be managed. + * Referencing a new dictionary effectively "discards" any previous one. + * Special: referencing a NULL DDict means "return to no-dictionary mode". + * Note 2 : DDict is just referenced, its lifetime must outlive its usage from DCtx. + */ +ZSTDLIB_API size_t ZSTD_DCtx_refDDict(ZSTD_DCtx* dctx, const ZSTD_DDict* ddict); + +/*! ZSTD_DCtx_refPrefix() : + * Reference a prefix (single-usage dictionary) to decompress next frame. 
+ * This is the reverse operation of ZSTD_CCtx_refPrefix(), + * and must use the same prefix as the one used during compression. + * Prefix is **only used once**. Reference is discarded at end of frame. + * End of frame is reached when ZSTD_decompressStream() returns 0. + * @result : 0, or an error code (which can be tested with ZSTD_isError()). + * Note 1 : Adding any prefix (including NULL) invalidates any previously set prefix or dictionary + * Note 2 : Prefix buffer is referenced. It **must** outlive decompression. + * Prefix buffer must remain unmodified up to the end of frame, + * reached when ZSTD_decompressStream() returns 0. + * Note 3 : By default, the prefix is treated as raw content (ZSTD_dct_rawContent). + * Use ZSTD_CCtx_refPrefix_advanced() to alter dictMode (Experimental section) + * Note 4 : Referencing a raw content prefix has almost no cpu nor memory cost. + * A full dictionary is more costly, as it requires building tables. + */ +ZSTDLIB_API size_t ZSTD_DCtx_refPrefix(ZSTD_DCtx* dctx, + const void* prefix, size_t prefixSize); + +/* === Memory management === */ + +/*! ZSTD_sizeof_*() : + * These functions give the _current_ memory usage of selected object. + * Note that object memory usage can evolve (increase or decrease) over time. */ +ZSTDLIB_API size_t ZSTD_sizeof_CCtx(const ZSTD_CCtx* cctx); +ZSTDLIB_API size_t ZSTD_sizeof_DCtx(const ZSTD_DCtx* dctx); +ZSTDLIB_API size_t ZSTD_sizeof_CStream(const ZSTD_CStream* zcs); +ZSTDLIB_API size_t ZSTD_sizeof_DStream(const ZSTD_DStream* zds); +ZSTDLIB_API size_t ZSTD_sizeof_CDict(const ZSTD_CDict* cdict); +ZSTDLIB_API size_t ZSTD_sizeof_DDict(const ZSTD_DDict* ddict); + +#endif /* ZSTD_H_235446 */ + + +/* ************************************************************************************** + * ADVANCED AND EXPERIMENTAL FUNCTIONS + **************************************************************************************** + * The definitions in the following section are considered experimental. 
+ * They are provided for advanced scenarios. + * They should never be used with a dynamic library, as prototypes may change in the future. + * Use them only in association with static linking. + * ***************************************************************************************/ + +#if defined(ZSTD_STATIC_LINKING_ONLY) && !defined(ZSTD_H_ZSTD_STATIC_LINKING_ONLY) +#define ZSTD_H_ZSTD_STATIC_LINKING_ONLY + +/**************************************************************************************** + * experimental API (static linking only) + **************************************************************************************** + * The following symbols and constants + * are not planned to join "stable API" status in the near future. + * They can still change in future versions. + * Some of them are planned to remain in the static_only section indefinitely. + * Some of them might be removed in the future (especially when redundant with existing stable functions) + * ***************************************************************************************/ + +#define ZSTD_FRAMEHEADERSIZE_PREFIX(format) ((format) == ZSTD_f_zstd1 ? 5 : 1) /* minimum input size required to query frame header size */ +#define ZSTD_FRAMEHEADERSIZE_MIN(format) ((format) == ZSTD_f_zstd1 ? 6 : 2) +#define ZSTD_FRAMEHEADERSIZE_MAX 18 /* can be useful for static allocation */ +#define ZSTD_SKIPPABLEHEADERSIZE 8 + +/* compression parameter bounds */ +#define ZSTD_WINDOWLOG_MAX_32 30 +#define ZSTD_WINDOWLOG_MAX_64 31 +#define ZSTD_WINDOWLOG_MAX ((int)(sizeof(size_t) == 4 ? ZSTD_WINDOWLOG_MAX_32 : ZSTD_WINDOWLOG_MAX_64)) +#define ZSTD_WINDOWLOG_MIN 10 +#define ZSTD_HASHLOG_MAX ((ZSTD_WINDOWLOG_MAX < 30) ? ZSTD_WINDOWLOG_MAX : 30) +#define ZSTD_HASHLOG_MIN 6 +#define ZSTD_CHAINLOG_MAX_32 29 +#define ZSTD_CHAINLOG_MAX_64 30 +#define ZSTD_CHAINLOG_MAX ((int)(sizeof(size_t) == 4 ? 
ZSTD_CHAINLOG_MAX_32 : ZSTD_CHAINLOG_MAX_64)) +#define ZSTD_CHAINLOG_MIN ZSTD_HASHLOG_MIN +#define ZSTD_SEARCHLOG_MAX (ZSTD_WINDOWLOG_MAX-1) +#define ZSTD_SEARCHLOG_MIN 1 +#define ZSTD_MINMATCH_MAX 7 /* only for ZSTD_fast, other strategies are limited to 6 */ +#define ZSTD_MINMATCH_MIN 3 /* only for ZSTD_btopt+, faster strategies are limited to 4 */ +#define ZSTD_TARGETLENGTH_MAX ZSTD_BLOCKSIZE_MAX +#define ZSTD_TARGETLENGTH_MIN 0 /* note : comparing this constant to an unsigned results in a tautological test */ +#define ZSTD_STRATEGY_MIN ZSTD_fast +#define ZSTD_STRATEGY_MAX ZSTD_btultra2 + + +#define ZSTD_OVERLAPLOG_MIN 0 +#define ZSTD_OVERLAPLOG_MAX 9 + +#define ZSTD_WINDOWLOG_LIMIT_DEFAULT 27 /* by default, the streaming decoder will refuse any frame + * requiring larger than (1< 0: + * If litLength != 0: + * rep == 1 --> offset == repeat_offset_1 + * rep == 2 --> offset == repeat_offset_2 + * rep == 3 --> offset == repeat_offset_3 + * If litLength == 0: + * rep == 1 --> offset == repeat_offset_2 + * rep == 2 --> offset == repeat_offset_3 + * rep == 3 --> offset == repeat_offset_1 - 1 + * + * Note: This field is optional. ZSTD_generateSequences() will calculate the value of + * 'rep', but repeat offsets do not necessarily need to be calculated from an external + * sequence provider's perspective. For example, ZSTD_compressSequences() does not + * use this 'rep' field at all (as of now). 
+ */ +} ZSTD_Sequence; + +typedef struct { + unsigned windowLog; /**< largest match distance : larger == more compression, more memory needed during decompression */ + unsigned chainLog; /**< fully searched segment : larger == more compression, slower, more memory (useless for fast) */ + unsigned hashLog; /**< dispatch table : larger == faster, more memory */ + unsigned searchLog; /**< nb of searches : larger == more compression, slower */ + unsigned minMatch; /**< match length searched : larger == faster decompression, sometimes less compression */ + unsigned targetLength; /**< acceptable match size for optimal parser (only) : larger == more compression, slower */ + ZSTD_strategy strategy; /**< see ZSTD_strategy definition above */ +} ZSTD_compressionParameters; + +typedef struct { + int contentSizeFlag; /**< 1: content size will be in frame header (when known) */ + int checksumFlag; /**< 1: generate a 32-bits checksum using XXH64 algorithm at end of frame, for error detection */ + int noDictIDFlag; /**< 1: no dictID will be saved into frame header (dictID is only useful for dictionary compression) */ +} ZSTD_frameParameters; + +typedef struct { + ZSTD_compressionParameters cParams; + ZSTD_frameParameters fParams; +} ZSTD_parameters; + +typedef enum { + ZSTD_dct_auto = 0, /* dictionary is "full" when starting with ZSTD_MAGIC_DICTIONARY, otherwise it is "rawContent" */ + ZSTD_dct_rawContent = 1, /* ensures dictionary is always loaded as rawContent, even if it starts with ZSTD_MAGIC_DICTIONARY */ + ZSTD_dct_fullDict = 2 /* refuses to load a dictionary if it does not respect Zstandard's specification, starting with ZSTD_MAGIC_DICTIONARY */ +} ZSTD_dictContentType_e; + +typedef enum { + ZSTD_dlm_byCopy = 0, /**< Copy dictionary content internally */ + ZSTD_dlm_byRef = 1 /**< Reference dictionary content -- the dictionary buffer must outlive its users. 
*/ +} ZSTD_dictLoadMethod_e; + +typedef enum { + ZSTD_f_zstd1 = 0, /* zstd frame format, specified in zstd_compression_format.md (default) */ + ZSTD_f_zstd1_magicless = 1 /* Variant of zstd frame format, without initial 4-bytes magic number. + * Useful to save 4 bytes per generated frame. + * Decoder cannot recognise automatically this format, requiring this instruction. */ +} ZSTD_format_e; + +typedef enum { + /* Note: this enum controls ZSTD_d_forceIgnoreChecksum */ + ZSTD_d_validateChecksum = 0, + ZSTD_d_ignoreChecksum = 1 +} ZSTD_forceIgnoreChecksum_e; + +typedef enum { + /* Note: this enum controls ZSTD_d_refMultipleDDicts */ + ZSTD_rmd_refSingleDDict = 0, + ZSTD_rmd_refMultipleDDicts = 1 +} ZSTD_refMultipleDDicts_e; + +typedef enum { + /* Note: this enum and the behavior it controls are effectively internal + * implementation details of the compressor. They are expected to continue + * to evolve and should be considered only in the context of extremely + * advanced performance tuning. + * + * Zstd currently supports the use of a CDict in three ways: + * + * - The contents of the CDict can be copied into the working context. This + * means that the compression can search both the dictionary and input + * while operating on a single set of internal tables. This makes + * the compression faster per-byte of input. However, the initial copy of + * the CDict's tables incurs a fixed cost at the beginning of the + * compression. For small compressions (< 8 KB), that copy can dominate + * the cost of the compression. + * + * - The CDict's tables can be used in-place. In this model, compression is + * slower per input byte, because the compressor has to search two sets of + * tables. However, this model incurs no start-up cost (as long as the + * working context's tables can be reused). For small inputs, this can be + * faster than copying the CDict's tables. 
+ * + * - The CDict's tables are not used at all, and instead we use the working + * context alone to reload the dictionary and use params based on the source + * size. See ZSTD_compress_insertDictionary() and ZSTD_compress_usingDict(). + * This method is effective when the dictionary sizes are very small relative + * to the input size, and the input size is fairly large to begin with. + * + * Zstd has a simple internal heuristic that selects which strategy to use + * at the beginning of a compression. However, if experimentation shows that + * Zstd is making poor choices, it is possible to override that choice with + * this enum. + */ + ZSTD_dictDefaultAttach = 0, /* Use the default heuristic. */ + ZSTD_dictForceAttach = 1, /* Never copy the dictionary. */ + ZSTD_dictForceCopy = 2, /* Always copy the dictionary. */ + ZSTD_dictForceLoad = 3 /* Always reload the dictionary */ +} ZSTD_dictAttachPref_e; + +typedef enum { + ZSTD_lcm_auto = 0, /**< Automatically determine the compression mode based on the compression level. + * Negative compression levels will be uncompressed, and positive compression + * levels will be compressed. */ + ZSTD_lcm_huffman = 1, /**< Always attempt Huffman compression. Uncompressed literals will still be + * emitted if Huffman compression is not profitable. */ + ZSTD_lcm_uncompressed = 2 /**< Always emit uncompressed literals. */ +} ZSTD_literalCompressionMode_e; + + +/*************************************** +* Frame size functions +***************************************/ + +/*! ZSTD_findDecompressedSize() : + * `src` should point to the start of a series of ZSTD encoded and/or skippable frames + * `srcSize` must be the _exact_ size of this series + * (i.e. 
there should be a frame boundary at `src + srcSize`) + * @return : - decompressed size of all data in all successive frames + * - if the decompressed size cannot be determined: ZSTD_CONTENTSIZE_UNKNOWN + * - if an error occurred: ZSTD_CONTENTSIZE_ERROR + * + * note 1 : decompressed size is an optional field, that may not be present, especially in streaming mode. + * When `return==ZSTD_CONTENTSIZE_UNKNOWN`, data to decompress could be any size. + * In which case, it's necessary to use streaming mode to decompress data. + * note 2 : decompressed size is always present when compression is done with ZSTD_compress() + * note 3 : decompressed size can be very large (64-bits value), + * potentially larger than what local system can handle as a single memory segment. + * In which case, it's necessary to use streaming mode to decompress data. + * note 4 : If source is untrusted, decompressed size could be wrong or intentionally modified. + * Always ensure result fits within application's authorized limits. + * Each application can set its own limits. + * note 5 : ZSTD_findDecompressedSize handles multiple frames, and so it must traverse the input to + * read each contained frame header. This is fast as most of the data is skipped, + * however it does mean that all frame data must be present and valid. */ +ZSTDLIB_API unsigned long long ZSTD_findDecompressedSize(const void* src, size_t srcSize); + +/*! ZSTD_decompressBound() : + * `src` should point to the start of a series of ZSTD encoded and/or skippable frames + * `srcSize` must be the _exact_ size of this series + * (i.e. there should be a frame boundary at `src + srcSize`) + * @return : - upper-bound for the decompressed size of all data in all successive frames + * - if an error occurred: ZSTD_CONTENTSIZE_ERROR + * + * note 1 : an error can occur if `src` contains an invalid or incorrectly formatted frame. 
+ * note 2 : the upper-bound is exact when the decompressed size field is available in every ZSTD encoded frame of `src`.
+ *          in this case, `ZSTD_findDecompressedSize` and `ZSTD_decompressBound` return the same value.
+ * note 3 : when the decompressed size field isn't available, the upper-bound for that frame is calculated by:
+ *            upper-bound = # blocks * min(128 KB, Window_Size)
+ */
+ZSTDLIB_API unsigned long long ZSTD_decompressBound(const void* src, size_t srcSize);
+
+/*! ZSTD_frameHeaderSize() :
+ *  srcSize must be >= ZSTD_FRAMEHEADERSIZE_PREFIX.
+ * @return : size of the Frame Header,
+ *           or an error code (if srcSize is too small) */
+ZSTDLIB_API size_t ZSTD_frameHeaderSize(const void* src, size_t srcSize);
+
+typedef enum {
+  ZSTD_sf_noBlockDelimiters = 0,         /* Representation of ZSTD_Sequence has no block delimiters, sequences only */
+  ZSTD_sf_explicitBlockDelimiters = 1    /* Representation of ZSTD_Sequence contains explicit block delimiters */
+} ZSTD_sequenceFormat_e;
+
+/*! ZSTD_generateSequences() :
+ * Generate sequences using ZSTD_compress2, given a source buffer.
+ *
+ * Each block will end with a dummy sequence
+ * with offset == 0, matchLength == 0, and litLength == length of last literals.
+ * litLength may be == 0, and if so, then the sequence of (of: 0 ml: 0 ll: 0)
+ * simply acts as a block delimiter.
+ *
+ * zc can be used to insert custom compression params.
+ * This function invokes ZSTD_compress2
+ *
+ * The output of this function can be fed into ZSTD_compressSequences() with CCtx
+ * setting of ZSTD_c_blockDelimiters as ZSTD_sf_explicitBlockDelimiters
+ * @return : number of sequences generated
+ */
+
+ZSTDLIB_API size_t ZSTD_generateSequences(ZSTD_CCtx* zc, ZSTD_Sequence* outSeqs,
+                                          size_t outSeqsSize, const void* src, size_t srcSize);
+
+/*! ZSTD_mergeBlockDelimiters() :
+ * Given an array of ZSTD_Sequence, remove all sequences that represent block delimiters/last literals
+ * by merging them into the literals of the next sequence.
+ * + * As such, the final generated result has no explicit representation of block boundaries, + * and the final last literals segment is not represented in the sequences. + * + * The output of this function can be fed into ZSTD_compressSequences() with CCtx + * setting of ZSTD_c_blockDelimiters as ZSTD_sf_noBlockDelimiters + * @return : number of sequences left after merging + */ +ZSTDLIB_API size_t ZSTD_mergeBlockDelimiters(ZSTD_Sequence* sequences, size_t seqsSize); + +/*! ZSTD_compressSequences() : + * Compress an array of ZSTD_Sequence, generated from the original source buffer, into dst. + * If a dictionary is included, then the cctx should reference the dict. (see: ZSTD_CCtx_refCDict(), ZSTD_CCtx_loadDictionary(), etc.) + * The entire source is compressed into a single frame. + * + * The compression behavior changes based on cctx params. In particular: + * If ZSTD_c_blockDelimiters == ZSTD_sf_noBlockDelimiters, the array of ZSTD_Sequence is expected to contain + * no block delimiters (defined in ZSTD_Sequence). Block boundaries are roughly determined based on + * the block size derived from the cctx, and sequences may be split. This is the default setting. + * + * If ZSTD_c_blockDelimiters == ZSTD_sf_explicitBlockDelimiters, the array of ZSTD_Sequence is expected to contain + * block delimiters (defined in ZSTD_Sequence). Behavior is undefined if no block delimiters are provided. + * + * If ZSTD_c_validateSequences == 0, this function will blindly accept the sequences provided. Invalid sequences cause undefined + * behavior. If ZSTD_c_validateSequences == 1, then if sequence is invalid (see doc/zstd_compression_format.md for + * specifics regarding offset/matchlength requirements) then the function will bail out and return an error. + * + * In addition to the two adjustable experimental params, there are other important cctx params. + * - ZSTD_c_minMatch MUST be set as less than or equal to the smallest match generated by the match finder. 
It has a minimum value of ZSTD_MINMATCH_MIN.
+ * - ZSTD_c_compressionLevel accordingly adjusts the strength of the entropy coder, as it would in typical compression.
+ * - ZSTD_c_windowLog affects offset validation: this function will return an error at higher debug levels if a provided offset
+ *   is larger than what the spec allows for a given window log and dictionary (if present). See: doc/zstd_compression_format.md
+ *
+ * Note: Repcodes are, as of now, always re-calculated within this function, so ZSTD_Sequence::rep is unused.
+ * Note 2: Once we integrate ability to ingest repcodes, the explicit block delims mode must respect those repcodes exactly,
+ *         and cannot emit an RLE block that disagrees with the repcode history
+ * @return : final compressed size or a ZSTD error.
+ */
+ZSTDLIB_API size_t ZSTD_compressSequences(ZSTD_CCtx* const cctx, void* dst, size_t dstSize,
+                  const ZSTD_Sequence* inSeqs, size_t inSeqsSize,
+                  const void* src, size_t srcSize);
+
+
+/*! ZSTD_writeSkippableFrame() :
+ * Generates a zstd skippable frame containing data given by src, and writes it to dst buffer.
+ *
+ * Skippable frames begin with a 4-byte magic number. There are 16 possible choices of magic number,
+ * ranging from ZSTD_MAGIC_SKIPPABLE_START to ZSTD_MAGIC_SKIPPABLE_START+15.
+ * As such, the parameter magicVariant controls the exact skippable frame magic number variant used, so
+ * the magic number used will be ZSTD_MAGIC_SKIPPABLE_START + magicVariant.
+ *
+ * Returns an error if destination buffer is not large enough, if the source size is not representable
+ * with a 4-byte unsigned int, or if the parameter magicVariant is greater than 15 (and therefore invalid).
+ *
+ * @return : number of bytes written or a ZSTD error.
+ */ +ZSTDLIB_API size_t ZSTD_writeSkippableFrame(void* dst, size_t dstCapacity, + const void* src, size_t srcSize, unsigned magicVariant); + + +/*************************************** +* Memory management +***************************************/ + +/*! ZSTD_estimate*() : + * These functions make it possible to estimate memory usage + * of a future {D,C}Ctx, before its creation. + * + * ZSTD_estimateCCtxSize() will provide a memory budget large enough + * for any compression level up to selected one. + * Note : Unlike ZSTD_estimateCStreamSize*(), this estimate + * does not include space for a window buffer. + * Therefore, the estimation is only guaranteed for single-shot compressions, not streaming. + * The estimate will assume the input may be arbitrarily large, + * which is the worst case. + * + * When srcSize can be bound by a known and rather "small" value, + * this fact can be used to provide a tighter estimation + * because the CCtx compression context will need less memory. + * This tighter estimation can be provided by more advanced functions + * ZSTD_estimateCCtxSize_usingCParams(), which can be used in tandem with ZSTD_getCParams(), + * and ZSTD_estimateCCtxSize_usingCCtxParams(), which can be used in tandem with ZSTD_CCtxParams_setParameter(). + * Both can be used to estimate memory using custom compression parameters and arbitrary srcSize limits. + * + * Note 2 : only single-threaded compression is supported. + * ZSTD_estimateCCtxSize_usingCCtxParams() will return an error code if ZSTD_c_nbWorkers is >= 1. + */ +ZSTDLIB_API size_t ZSTD_estimateCCtxSize(int compressionLevel); +ZSTDLIB_API size_t ZSTD_estimateCCtxSize_usingCParams(ZSTD_compressionParameters cParams); +ZSTDLIB_API size_t ZSTD_estimateCCtxSize_usingCCtxParams(const ZSTD_CCtx_params* params); +ZSTDLIB_API size_t ZSTD_estimateDCtxSize(void); + +/*! ZSTD_estimateCStreamSize() : + * ZSTD_estimateCStreamSize() will provide a budget large enough for any compression level up to selected one. 
+ * It will also consider src size to be arbitrarily "large", which is worst case. + * If srcSize is known to always be small, ZSTD_estimateCStreamSize_usingCParams() can provide a tighter estimation. + * ZSTD_estimateCStreamSize_usingCParams() can be used in tandem with ZSTD_getCParams() to create cParams from compressionLevel. + * ZSTD_estimateCStreamSize_usingCCtxParams() can be used in tandem with ZSTD_CCtxParams_setParameter(). Only single-threaded compression is supported. This function will return an error code if ZSTD_c_nbWorkers is >= 1. + * Note : CStream size estimation is only correct for single-threaded compression. + * ZSTD_DStream memory budget depends on window Size. + * This information can be passed manually, using ZSTD_estimateDStreamSize, + * or deducted from a valid frame Header, using ZSTD_estimateDStreamSize_fromFrame(); + * Note : if streaming is init with function ZSTD_init?Stream_usingDict(), + * an internal ?Dict will be created, which additional size is not estimated here. + * In this case, get total size by adding ZSTD_estimate?DictSize */ +ZSTDLIB_API size_t ZSTD_estimateCStreamSize(int compressionLevel); +ZSTDLIB_API size_t ZSTD_estimateCStreamSize_usingCParams(ZSTD_compressionParameters cParams); +ZSTDLIB_API size_t ZSTD_estimateCStreamSize_usingCCtxParams(const ZSTD_CCtx_params* params); +ZSTDLIB_API size_t ZSTD_estimateDStreamSize(size_t windowSize); +ZSTDLIB_API size_t ZSTD_estimateDStreamSize_fromFrame(const void* src, size_t srcSize); + +/*! ZSTD_estimate?DictSize() : + * ZSTD_estimateCDictSize() will bet that src size is relatively "small", and content is copied, like ZSTD_createCDict(). + * ZSTD_estimateCDictSize_advanced() makes it possible to control compression parameters precisely, like ZSTD_createCDict_advanced(). + * Note : dictionaries created by reference (`ZSTD_dlm_byRef`) are logically smaller. 
+ */ +ZSTDLIB_API size_t ZSTD_estimateCDictSize(size_t dictSize, int compressionLevel); +ZSTDLIB_API size_t ZSTD_estimateCDictSize_advanced(size_t dictSize, ZSTD_compressionParameters cParams, ZSTD_dictLoadMethod_e dictLoadMethod); +ZSTDLIB_API size_t ZSTD_estimateDDictSize(size_t dictSize, ZSTD_dictLoadMethod_e dictLoadMethod); + +/*! ZSTD_initStatic*() : + * Initialize an object using a pre-allocated fixed-size buffer. + * workspace: The memory area to emplace the object into. + * Provided pointer *must be 8-bytes aligned*. + * Buffer must outlive object. + * workspaceSize: Use ZSTD_estimate*Size() to determine + * how large workspace must be to support target scenario. + * @return : pointer to object (same address as workspace, just different type), + * or NULL if error (size too small, incorrect alignment, etc.) + * Note : zstd will never resize nor malloc() when using a static buffer. + * If the object requires more memory than available, + * zstd will just error out (typically ZSTD_error_memory_allocation). + * Note 2 : there is no corresponding "free" function. + * Since workspace is allocated externally, it must be freed externally too. + * Note 3 : cParams : use ZSTD_getCParams() to convert a compression level + * into its associated cParams. + * Limitation 1 : currently not compatible with internal dictionary creation, triggered by + * ZSTD_CCtx_loadDictionary(), ZSTD_initCStream_usingDict() or ZSTD_initDStream_usingDict(). + * Limitation 2 : static cctx currently not compatible with multi-threading. + * Limitation 3 : static dctx is incompatible with legacy support. 
+ */ +ZSTDLIB_API ZSTD_CCtx* ZSTD_initStaticCCtx(void* workspace, size_t workspaceSize); +ZSTDLIB_API ZSTD_CStream* ZSTD_initStaticCStream(void* workspace, size_t workspaceSize); /**< same as ZSTD_initStaticCCtx() */ + +ZSTDLIB_API ZSTD_DCtx* ZSTD_initStaticDCtx(void* workspace, size_t workspaceSize); +ZSTDLIB_API ZSTD_DStream* ZSTD_initStaticDStream(void* workspace, size_t workspaceSize); /**< same as ZSTD_initStaticDCtx() */ + +ZSTDLIB_API const ZSTD_CDict* ZSTD_initStaticCDict( + void* workspace, size_t workspaceSize, + const void* dict, size_t dictSize, + ZSTD_dictLoadMethod_e dictLoadMethod, + ZSTD_dictContentType_e dictContentType, + ZSTD_compressionParameters cParams); + +ZSTDLIB_API const ZSTD_DDict* ZSTD_initStaticDDict( + void* workspace, size_t workspaceSize, + const void* dict, size_t dictSize, + ZSTD_dictLoadMethod_e dictLoadMethod, + ZSTD_dictContentType_e dictContentType); + + +/*! Custom memory allocation : + * These prototypes make it possible to pass your own allocation/free functions. + * ZSTD_customMem is provided at creation time, using ZSTD_create*_advanced() variants listed below. + * All allocation/free operations will be completed using these custom variants instead of regular ones. 
+ */ +typedef void* (*ZSTD_allocFunction) (void* opaque, size_t size); +typedef void (*ZSTD_freeFunction) (void* opaque, void* address); +typedef struct { ZSTD_allocFunction customAlloc; ZSTD_freeFunction customFree; void* opaque; } ZSTD_customMem; +static +#ifdef __GNUC__ +__attribute__((__unused__)) +#endif +ZSTD_customMem const ZSTD_defaultCMem = { NULL, NULL, NULL }; /**< this constant defers to stdlib's functions */ + +ZSTDLIB_API ZSTD_CCtx* ZSTD_createCCtx_advanced(ZSTD_customMem customMem); +ZSTDLIB_API ZSTD_CStream* ZSTD_createCStream_advanced(ZSTD_customMem customMem); +ZSTDLIB_API ZSTD_DCtx* ZSTD_createDCtx_advanced(ZSTD_customMem customMem); +ZSTDLIB_API ZSTD_DStream* ZSTD_createDStream_advanced(ZSTD_customMem customMem); + +ZSTDLIB_API ZSTD_CDict* ZSTD_createCDict_advanced(const void* dict, size_t dictSize, + ZSTD_dictLoadMethod_e dictLoadMethod, + ZSTD_dictContentType_e dictContentType, + ZSTD_compressionParameters cParams, + ZSTD_customMem customMem); + +/* ! Thread pool : + * These prototypes make it possible to share a thread pool among multiple compression contexts. + * This can limit resources for applications with multiple threads where each one uses + * a threaded compression mode (via ZSTD_c_nbWorkers parameter). + * ZSTD_createThreadPool creates a new thread pool with a given number of threads. + * Note that the lifetime of such pool must exist while being used. + * ZSTD_CCtx_refThreadPool assigns a thread pool to a context (use NULL argument value + * to use an internal thread pool). + * ZSTD_freeThreadPool frees a thread pool. + */ +typedef struct POOL_ctx_s ZSTD_threadPool; +ZSTDLIB_API ZSTD_threadPool* ZSTD_createThreadPool(size_t numThreads); +ZSTDLIB_API void ZSTD_freeThreadPool (ZSTD_threadPool* pool); +ZSTDLIB_API size_t ZSTD_CCtx_refThreadPool(ZSTD_CCtx* cctx, ZSTD_threadPool* pool); + + +/* + * This API is temporary and is expected to change or disappear in the future! 
+ */ +ZSTDLIB_API ZSTD_CDict* ZSTD_createCDict_advanced2( + const void* dict, size_t dictSize, + ZSTD_dictLoadMethod_e dictLoadMethod, + ZSTD_dictContentType_e dictContentType, + const ZSTD_CCtx_params* cctxParams, + ZSTD_customMem customMem); + +ZSTDLIB_API ZSTD_DDict* ZSTD_createDDict_advanced( + const void* dict, size_t dictSize, + ZSTD_dictLoadMethod_e dictLoadMethod, + ZSTD_dictContentType_e dictContentType, + ZSTD_customMem customMem); + + +/*************************************** +* Advanced compression functions +***************************************/ + +/*! ZSTD_createCDict_byReference() : + * Create a digested dictionary for compression + * Dictionary content is just referenced, not duplicated. + * As a consequence, `dictBuffer` **must** outlive CDict, + * and its content must remain unmodified throughout the lifetime of CDict. + * note: equivalent to ZSTD_createCDict_advanced(), with dictLoadMethod==ZSTD_dlm_byRef */ +ZSTDLIB_API ZSTD_CDict* ZSTD_createCDict_byReference(const void* dictBuffer, size_t dictSize, int compressionLevel); + +/*! ZSTD_getDictID_fromCDict() : + * Provides the dictID of the dictionary loaded into `cdict`. + * If @return == 0, the dictionary is not conformant to Zstandard specification, or empty. + * Non-conformant dictionaries can still be loaded, but as content-only dictionaries. */ +ZSTDLIB_API unsigned ZSTD_getDictID_fromCDict(const ZSTD_CDict* cdict); + +/*! ZSTD_getCParams() : + * @return ZSTD_compressionParameters structure for a selected compression level and estimated srcSize. + * `estimatedSrcSize` value is optional, select 0 if not known */ +ZSTDLIB_API ZSTD_compressionParameters ZSTD_getCParams(int compressionLevel, unsigned long long estimatedSrcSize, size_t dictSize); + +/*! ZSTD_getParams() : + * same as ZSTD_getCParams(), but @return a full `ZSTD_parameters` object instead of sub-component `ZSTD_compressionParameters`. 
+ * All fields of `ZSTD_frameParameters` are set to default : contentSize=1, checksum=0, noDictID=0 */ +ZSTDLIB_API ZSTD_parameters ZSTD_getParams(int compressionLevel, unsigned long long estimatedSrcSize, size_t dictSize); + +/*! ZSTD_checkCParams() : + * Ensure param values remain within authorized range. + * @return 0 on success, or an error code (can be checked with ZSTD_isError()) */ +ZSTDLIB_API size_t ZSTD_checkCParams(ZSTD_compressionParameters params); + +/*! ZSTD_adjustCParams() : + * optimize params for a given `srcSize` and `dictSize`. + * `srcSize` can be unknown, in which case use ZSTD_CONTENTSIZE_UNKNOWN. + * `dictSize` must be `0` when there is no dictionary. + * cPar can be invalid : all parameters will be clamped within valid range in the @return struct. + * This function never fails (wide contract) */ +ZSTDLIB_API ZSTD_compressionParameters ZSTD_adjustCParams(ZSTD_compressionParameters cPar, unsigned long long srcSize, size_t dictSize); + +/*! ZSTD_compress_advanced() : + * Note : this function is now DEPRECATED. + * It can be replaced by ZSTD_compress2(), in combination with ZSTD_CCtx_setParameter() and other parameter setters. + * This prototype will be marked as deprecated and generate compilation warning on reaching v1.5.x */ +ZSTDLIB_API size_t ZSTD_compress_advanced(ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + const void* dict,size_t dictSize, + ZSTD_parameters params); + +/*! ZSTD_compress_usingCDict_advanced() : + * Note : this function is now REDUNDANT. + * It can be replaced by ZSTD_compress2(), in combination with ZSTD_CCtx_loadDictionary() and other parameter setters. + * This prototype will be marked as deprecated and generate compilation warning in some future version */ +ZSTDLIB_API size_t ZSTD_compress_usingCDict_advanced(ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + const ZSTD_CDict* cdict, + ZSTD_frameParameters fParams); + + +/*! 
ZSTD_CCtx_loadDictionary_byReference() : + * Same as ZSTD_CCtx_loadDictionary(), but dictionary content is referenced, instead of being copied into CCtx. + * It saves some memory, but also requires that `dict` outlives its usage within `cctx` */ +ZSTDLIB_API size_t ZSTD_CCtx_loadDictionary_byReference(ZSTD_CCtx* cctx, const void* dict, size_t dictSize); + +/*! ZSTD_CCtx_loadDictionary_advanced() : + * Same as ZSTD_CCtx_loadDictionary(), but gives finer control over + * how to load the dictionary (by copy ? by reference ?) + * and how to interpret it (automatic ? force raw mode ? full mode only ?) */ +ZSTDLIB_API size_t ZSTD_CCtx_loadDictionary_advanced(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, ZSTD_dictLoadMethod_e dictLoadMethod, ZSTD_dictContentType_e dictContentType); + +/*! ZSTD_CCtx_refPrefix_advanced() : + * Same as ZSTD_CCtx_refPrefix(), but gives finer control over + * how to interpret prefix content (automatic ? force raw mode (default) ? full mode only ?) */ +ZSTDLIB_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const void* prefix, size_t prefixSize, ZSTD_dictContentType_e dictContentType); + +/* === experimental parameters === */ +/* these parameters can be used with ZSTD_setParameter() + * they are not guaranteed to remain supported in the future */ + + /* Enables rsyncable mode, + * which makes compressed files more rsync friendly + * by adding periodic synchronization points to the compressed data. + * The target average block size is ZSTD_c_jobSize / 2. + * It's possible to modify the job size to increase or decrease + * the granularity of the synchronization point. + * Once the jobSize is smaller than the window size, + * it will result in compression ratio degradation. + * NOTE 1: rsyncable mode only works when multithreading is enabled. + * NOTE 2: rsyncable performs poorly in combination with long range mode, + * since it will decrease the effectiveness of synchronization points, + * though mileage may vary. 
+ * NOTE 3: Rsyncable mode limits maximum compression speed to ~400 MB/s. + * If the selected compression level is already running significantly slower, + * the overall speed won't be significantly impacted. + */ + #define ZSTD_c_rsyncable ZSTD_c_experimentalParam1 + +/* Select a compression format. + * The value must be of type ZSTD_format_e. + * See ZSTD_format_e enum definition for details */ +#define ZSTD_c_format ZSTD_c_experimentalParam2 + +/* Force back-reference distances to remain < windowSize, + * even when referencing into Dictionary content (default:0) */ +#define ZSTD_c_forceMaxWindow ZSTD_c_experimentalParam3 + +/* Controls whether the contents of a CDict + * are used in place, or copied into the working context. + * Accepts values from the ZSTD_dictAttachPref_e enum. + * See the comments on that enum for an explanation of the feature. */ +#define ZSTD_c_forceAttachDict ZSTD_c_experimentalParam4 + +/* Controls how the literals are compressed (default is auto). + * The value must be of type ZSTD_literalCompressionMode_e. + * See ZSTD_literalCompressionMode_t enum definition for details. + */ +#define ZSTD_c_literalCompressionMode ZSTD_c_experimentalParam5 + +/* Tries to fit compressed block size to be around targetCBlockSize. + * No target when targetCBlockSize == 0. + * There is no guarantee on compressed block size (default:0) */ +#define ZSTD_c_targetCBlockSize ZSTD_c_experimentalParam6 + +/* User's best guess of source size. + * Hint is not valid when srcSizeHint == 0. + * There is no guarantee that hint is close to actual source size, + * but compression ratio may regress significantly if guess considerably underestimates */ +#define ZSTD_c_srcSizeHint ZSTD_c_experimentalParam7 + +/* Controls whether the new and experimental "dedicated dictionary search + * structure" can be used. This feature is still rough around the edges, be + * prepared for surprising behavior! 
+ * + * How to use it: + * + * When using a CDict, whether to use this feature or not is controlled at + * CDict creation, and it must be set in a CCtxParams set passed into that + * construction (via ZSTD_createCDict_advanced2()). A compression will then + * use the feature or not based on how the CDict was constructed; the value of + * this param, set in the CCtx, will have no effect. + * + * However, when a dictionary buffer is passed into a CCtx, such as via + * ZSTD_CCtx_loadDictionary(), this param can be set on the CCtx to control + * whether the CDict that is created internally can use the feature or not. + * + * What it does: + * + * Normally, the internal data structures of the CDict are analogous to what + * would be stored in a CCtx after compressing the contents of a dictionary. + * To an approximation, a compression using a dictionary can then use those + * data structures to simply continue what is effectively a streaming + * compression where the simulated compression of the dictionary left off. + * Which is to say, the search structures in the CDict are normally the same + * format as in the CCtx. + * + * It is possible to do better, since the CDict is not like a CCtx: the search + * structures are written once during CDict creation, and then are only read + * after that, while the search structures in the CCtx are both read and + * written as the compression goes along. This means we can choose a search + * structure for the dictionary that is read-optimized. + * + * This feature enables the use of that different structure. + * + * Note that some of the members of the ZSTD_compressionParameters struct have + * different semantics and constraints in the dedicated search structure. It is + * highly recommended that you simply set a compression level in the CCtxParams + * you pass into the CDict creation call, and avoid messing with the cParams + * directly. 
+ * + * Effects: + * + * This will only have any effect when the selected ZSTD_strategy + * implementation supports this feature. Currently, that's limited to + * ZSTD_greedy, ZSTD_lazy, and ZSTD_lazy2. + * + * Note that this means that the CDict tables can no longer be copied into the + * CCtx, so the dict attachment mode ZSTD_dictForceCopy will no longer be + * useable. The dictionary can only be attached or reloaded. + * + * In general, you should expect compression to be faster--sometimes very much + * so--and CDict creation to be slightly slower. Eventually, we will probably + * make this mode the default. + */ +#define ZSTD_c_enableDedicatedDictSearch ZSTD_c_experimentalParam8 + +/* ZSTD_c_stableInBuffer + * Experimental parameter. + * Default is 0 == disabled. Set to 1 to enable. + * + * Tells the compressor that the ZSTD_inBuffer will ALWAYS be the same + * between calls, except for the modifications that zstd makes to pos (the + * caller must not modify pos). This is checked by the compressor, and + * compression will fail if it ever changes. This means the only flush + * mode that makes sense is ZSTD_e_end, so zstd will error if ZSTD_e_end + * is not used. The data in the ZSTD_inBuffer in the range [src, src + pos) + * MUST not be modified during compression or you will get data corruption. + * + * When this flag is enabled zstd won't allocate an input window buffer, + * because the user guarantees it can reference the ZSTD_inBuffer until + * the frame is complete. But, it will still allocate an output buffer + * large enough to fit a block (see ZSTD_c_stableOutBuffer). This will also + * avoid the memcpy() from the input buffer to the input window buffer. + * + * NOTE: ZSTD_compressStream2() will error if ZSTD_e_end is not used. + * That means this flag cannot be used with ZSTD_compressStream(). 
+ *
+ * NOTE: So long as the ZSTD_inBuffer always points to valid memory, using
+ * this flag is ALWAYS memory safe, and will never access out-of-bounds
+ * memory. However, compression WILL fail if you violate the preconditions.
+ *
+ * WARNING: The data in the ZSTD_inBuffer in the range [dst, dst + pos) MUST
+ * not be modified during compression or you will get data corruption. This
+ * is because zstd needs to reference data in the ZSTD_inBuffer to find
+ * matches. Normally zstd maintains its own window buffer for this purpose,
+ * but passing this flag tells zstd to use the user provided buffer.
+ */
+#define ZSTD_c_stableInBuffer ZSTD_c_experimentalParam9
+
+/* ZSTD_c_stableOutBuffer
+ * Experimental parameter.
+ * Default is 0 == disabled. Set to 1 to enable.
+ *
+ * Tells the compressor that the ZSTD_outBuffer will not be resized between
+ * calls. Specifically: (out.size - out.pos) will never grow. This gives the
+ * compressor the freedom to say: If the compressed data doesn't fit in the
+ * output buffer then return ZSTD_error_dstSizeTooSmall. This allows us to
+ * always decompress directly into the output buffer, instead of decompressing
+ * into an internal buffer and copying to the output buffer.
+ *
+ * When this flag is enabled zstd won't allocate an output buffer, because
+ * it can write directly to the ZSTD_outBuffer. It will still allocate the
+ * input window buffer (see ZSTD_c_stableInBuffer). This will also
+ * avoid the memcpy() from the input buffer to the input window buffer.
+ *
+ * Zstd will check that (out.size - out.pos) never grows and return an error
+ * if it does. While not strictly necessary, this should prevent surprises.
+ */
+#define ZSTD_c_stableOutBuffer ZSTD_c_experimentalParam10
+
+/* ZSTD_c_blockDelimiters
+ * Default is 0 == ZSTD_sf_noBlockDelimiters.
+ *
+ * For use with sequence compression API: ZSTD_compressSequences().
+ *
+ * Designates whether or not the given array of ZSTD_Sequence contains block delimiters
+ * and last literals, which are defined as sequences with offset == 0 and matchLength == 0.
+ * See the definition of ZSTD_Sequence for more specifics.
+ */
+#define ZSTD_c_blockDelimiters ZSTD_c_experimentalParam11
+
+/* ZSTD_c_validateSequences
+ * Default is 0 == disabled. Set to 1 to enable sequence validation.
+ *
+ * For use with sequence compression API: ZSTD_compressSequences().
+ * Designates whether or not we validate sequences provided to ZSTD_compressSequences()
+ * during function execution.
+ *
+ * Without validation, providing a sequence that does not conform to the zstd spec will cause
+ * undefined behavior, and may produce a corrupted block.
+ *
+ * With validation enabled, if a sequence is invalid (see doc/zstd_compression_format.md for
+ * specifics regarding offset/matchlength requirements) then the function will bail out and
+ * return an error.
+ *
+ */
+#define ZSTD_c_validateSequences ZSTD_c_experimentalParam12
+
+/*! ZSTD_CCtx_getParameter() :
+ * Get the requested compression parameter value, selected by enum ZSTD_cParameter,
+ * and store it into int* value.
+ * @return : 0, or an error code (which can be tested with ZSTD_isError()).
+ */
+ZSTDLIB_API size_t ZSTD_CCtx_getParameter(const ZSTD_CCtx* cctx, ZSTD_cParameter param, int* value);
+
+
+/*! ZSTD_CCtx_params :
+ * Quick howto :
+ * - ZSTD_createCCtxParams() : Create a ZSTD_CCtx_params structure
+ * - ZSTD_CCtxParams_setParameter() : Push parameters one by one into
+ * an existing ZSTD_CCtx_params structure.
+ * This is similar to
+ * ZSTD_CCtx_setParameter().
+ * - ZSTD_CCtx_setParametersUsingCCtxParams() : Apply parameters to
+ * an existing CCtx.
+ * These parameters will be applied to
+ * all subsequent frames.
+ * - ZSTD_compressStream2() : Do compression using the CCtx.
+ * - ZSTD_freeCCtxParams() : Free the memory.
+ *
+ * This can be used with ZSTD_estimateCCtxSize_advanced_usingCCtxParams()
+ * for static allocation of CCtx for single-threaded compression.
+ */ +ZSTDLIB_API ZSTD_CCtx_params* ZSTD_createCCtxParams(void); +ZSTDLIB_API size_t ZSTD_freeCCtxParams(ZSTD_CCtx_params* params); + +/*! ZSTD_CCtxParams_reset() : + * Reset params to default values. + */ +ZSTDLIB_API size_t ZSTD_CCtxParams_reset(ZSTD_CCtx_params* params); + +/*! ZSTD_CCtxParams_init() : + * Initializes the compression parameters of cctxParams according to + * compression level. All other parameters are reset to their default values. + */ +ZSTDLIB_API size_t ZSTD_CCtxParams_init(ZSTD_CCtx_params* cctxParams, int compressionLevel); + +/*! ZSTD_CCtxParams_init_advanced() : + * Initializes the compression and frame parameters of cctxParams according to + * params. All other parameters are reset to their default values. + */ +ZSTDLIB_API size_t ZSTD_CCtxParams_init_advanced(ZSTD_CCtx_params* cctxParams, ZSTD_parameters params); + +/*! ZSTD_CCtxParams_setParameter() : + * Similar to ZSTD_CCtx_setParameter. + * Set one compression parameter, selected by enum ZSTD_cParameter. + * Parameters must be applied to a ZSTD_CCtx using + * ZSTD_CCtx_setParametersUsingCCtxParams(). + * @result : a code representing success or failure (which can be tested with + * ZSTD_isError()). + */ +ZSTDLIB_API size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* params, ZSTD_cParameter param, int value); + +/*! ZSTD_CCtxParams_getParameter() : + * Similar to ZSTD_CCtx_getParameter. + * Get the requested value of one compression parameter, selected by enum ZSTD_cParameter. + * @result : 0, or an error code (which can be tested with ZSTD_isError()). + */ +ZSTDLIB_API size_t ZSTD_CCtxParams_getParameter(const ZSTD_CCtx_params* params, ZSTD_cParameter param, int* value); + +/*! ZSTD_CCtx_setParametersUsingCCtxParams() : + * Apply a set of ZSTD_CCtx_params to the compression context. + * This can be done even after compression is started, + * if nbWorkers==0, this will have no impact until a new compression is started. 
+ * if nbWorkers>=1, new parameters will be picked up at next job, + * with a few restrictions (windowLog, pledgedSrcSize, nbWorkers, jobSize, and overlapLog are not updated). + */ +ZSTDLIB_API size_t ZSTD_CCtx_setParametersUsingCCtxParams( + ZSTD_CCtx* cctx, const ZSTD_CCtx_params* params); + +/*! ZSTD_compressStream2_simpleArgs() : + * Same as ZSTD_compressStream2(), + * but using only integral types as arguments. + * This variant might be helpful for binders from dynamic languages + * which have troubles handling structures containing memory pointers. + */ +ZSTDLIB_API size_t ZSTD_compressStream2_simpleArgs ( + ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, size_t* dstPos, + const void* src, size_t srcSize, size_t* srcPos, + ZSTD_EndDirective endOp); + + +/*************************************** +* Advanced decompression functions +***************************************/ + +/*! ZSTD_isFrame() : + * Tells if the content of `buffer` starts with a valid Frame Identifier. + * Note : Frame Identifier is 4 bytes. If `size < 4`, @return will always be 0. + * Note 2 : Legacy Frame Identifiers are considered valid only if Legacy Support is enabled. + * Note 3 : Skippable Frame Identifiers are considered valid. */ +ZSTDLIB_API unsigned ZSTD_isFrame(const void* buffer, size_t size); + +/*! ZSTD_createDDict_byReference() : + * Create a digested dictionary, ready to start decompression operation without startup delay. + * Dictionary content is referenced, and therefore stays in dictBuffer. + * It is important that dictBuffer outlives DDict, + * it must remain read accessible throughout the lifetime of DDict */ +ZSTDLIB_API ZSTD_DDict* ZSTD_createDDict_byReference(const void* dictBuffer, size_t dictSize); + +/*! ZSTD_DCtx_loadDictionary_byReference() : + * Same as ZSTD_DCtx_loadDictionary(), + * but references `dict` content instead of copying it into `dctx`. 
+ * This saves memory if `dict` remains around.
+ * However, it's imperative that `dict` remains accessible (and unmodified) while being used, so it must outlive decompression. */
+ZSTDLIB_API size_t ZSTD_DCtx_loadDictionary_byReference(ZSTD_DCtx* dctx, const void* dict, size_t dictSize);
+
+/*! ZSTD_DCtx_loadDictionary_advanced() :
+ * Same as ZSTD_DCtx_loadDictionary(),
+ * but gives direct control over
+ * how to load the dictionary (by copy ? by reference ?)
+ * and how to interpret it (automatic ? force raw mode ? full mode only ?). */
+ZSTDLIB_API size_t ZSTD_DCtx_loadDictionary_advanced(ZSTD_DCtx* dctx, const void* dict, size_t dictSize, ZSTD_dictLoadMethod_e dictLoadMethod, ZSTD_dictContentType_e dictContentType);
+
+/*! ZSTD_DCtx_refPrefix_advanced() :
+ * Same as ZSTD_DCtx_refPrefix(), but gives finer control over
+ * how to interpret prefix content (automatic ? force raw mode (default) ? full mode only ?) */
+ZSTDLIB_API size_t ZSTD_DCtx_refPrefix_advanced(ZSTD_DCtx* dctx, const void* prefix, size_t prefixSize, ZSTD_dictContentType_e dictContentType);
+
+/*! ZSTD_DCtx_setMaxWindowSize() :
+ * Refuses allocating internal buffers for frames requiring a window size larger than provided limit.
+ * This protects a decoder context from reserving too much memory for itself (potential attack scenario).
+ * This parameter is only useful in streaming mode, since no internal buffer is allocated in single-pass mode.
+ * By default, a decompression context accepts all window sizes <= (1 << ZSTD_WINDOWLOG_LIMIT_DEFAULT)
+ * @return : 0, or an error code (which can be tested using ZSTD_isError()).
+ */
+ZSTDLIB_API size_t ZSTD_DCtx_setMaxWindowSize(ZSTD_DCtx* dctx, size_t maxWindowSize);
+
+/*! ZSTD_DCtx_getParameter() :
+ * Get the requested decompression parameter value, selected by enum ZSTD_dParameter,
+ * and store it into int* value.
+ * @return : 0, or an error code (which can be tested with ZSTD_isError()).
+ */
+ZSTDLIB_API size_t ZSTD_DCtx_getParameter(ZSTD_DCtx* dctx, ZSTD_dParameter param, int* value);
+
+/* ZSTD_d_format
+ * experimental parameter,
+ * allowing selection between ZSTD_format_e input compression formats
+ */
+#define ZSTD_d_format ZSTD_d_experimentalParam1
+/* ZSTD_d_stableOutBuffer
+ * Experimental parameter.
+ * Default is 0 == disabled. Set to 1 to enable.
+ *
+ * Tells the decompressor that the ZSTD_outBuffer will ALWAYS be the same
+ * between calls, except for the modifications that zstd makes to pos (the
+ * caller must not modify pos). This is checked by the decompressor, and
+ * decompression will fail if it ever changes. Therefore the ZSTD_outBuffer
+ * MUST be large enough to fit the entire decompressed frame. This will be
+ * checked when the frame content size is known. The data in the ZSTD_outBuffer
+ * in the range [dst, dst + pos) MUST not be modified during decompression
+ * or you will get data corruption.
+ *
+ * When this flag is enabled zstd won't allocate an output buffer, because
+ * it can write directly to the ZSTD_outBuffer, but it will still allocate
+ * an input buffer large enough to fit any compressed block. This will also
+ * avoid the memcpy() from the internal output buffer to the ZSTD_outBuffer.
+ * If you need to avoid the input buffer allocation use the buffer-less
+ * streaming API.
+ *
+ * NOTE: So long as the ZSTD_outBuffer always points to valid memory, using
+ * this flag is ALWAYS memory safe, and will never access out-of-bounds
+ * memory. However, decompression WILL fail if you violate the preconditions.
+ *
+ * WARNING: The data in the ZSTD_outBuffer in the range [dst, dst + pos) MUST
+ * not be modified during decompression or you will get data corruption. This
+ * is because zstd needs to reference data in the ZSTD_outBuffer to regenerate
+ * matches. Normally zstd maintains its own buffer for this purpose, but passing
+ * this flag tells zstd to use the user provided buffer.
+ */
+#define ZSTD_d_stableOutBuffer ZSTD_d_experimentalParam2
+
+/* ZSTD_d_forceIgnoreChecksum
+ * Experimental parameter.
+ * Default is 0 == disabled. Set to 1 to enable
+ *
+ * Tells the decompressor to skip checksum validation during decompression, regardless
+ * of whether checksumming was specified during compression. This offers some
+ * slight performance benefits, and may be useful for debugging.
+ * Param has values of type ZSTD_forceIgnoreChecksum_e
+ */
+#define ZSTD_d_forceIgnoreChecksum ZSTD_d_experimentalParam3
+
+/* ZSTD_d_refMultipleDDicts
+ * Experimental parameter.
+ * Default is 0 == disabled. Set to 1 to enable
+ *
+ * If enabled and dctx is allocated on the heap, then additional memory will be allocated
+ * to store references to multiple ZSTD_DDict. That is, multiple calls of ZSTD_refDDict()
+ * using a given ZSTD_DCtx, rather than overwriting the previous DDict reference, will instead
+ * store all references. At decompression time, the appropriate dictID is selected
+ * from the set of DDicts based on the dictID in the frame.
+ *
+ * Usage is simply calling ZSTD_refDDict() on multiple dict buffers.
+ *
+ * Param has values of byte ZSTD_refMultipleDDicts_e
+ *
+ * WARNING: Enabling this parameter and calling ZSTD_DCtx_refDDict(), will trigger memory
+ * allocation for the hash table. ZSTD_freeDCtx() also frees this memory.
+ * Memory is allocated as per ZSTD_DCtx::customMem.
+ *
+ * Although this function allocates memory for the table, the user is still responsible for
+ * memory management of the underlying ZSTD_DDict* themselves.
+ */
+#define ZSTD_d_refMultipleDDicts ZSTD_d_experimentalParam4
+
+
+/*! ZSTD_DCtx_setFormat() :
+ * Instruct the decoder context about what kind of data to decode next.
+ * This instruction is mandatory to decode data without a fully-formed header,
+ * such as ZSTD_f_zstd1_magicless for example.
+ * @return : 0, or an error code (which can be tested using ZSTD_isError()).
*/ +ZSTDLIB_API size_t ZSTD_DCtx_setFormat(ZSTD_DCtx* dctx, ZSTD_format_e format); + +/*! ZSTD_decompressStream_simpleArgs() : + * Same as ZSTD_decompressStream(), + * but using only integral types as arguments. + * This can be helpful for binders from dynamic languages + * which have troubles handling structures containing memory pointers. + */ +ZSTDLIB_API size_t ZSTD_decompressStream_simpleArgs ( + ZSTD_DCtx* dctx, + void* dst, size_t dstCapacity, size_t* dstPos, + const void* src, size_t srcSize, size_t* srcPos); + + +/******************************************************************** +* Advanced streaming functions +* Warning : most of these functions are now redundant with the Advanced API. +* Once Advanced API reaches "stable" status, +* redundant functions will be deprecated, and then at some point removed. +********************************************************************/ + +/*===== Advanced Streaming compression functions =====*/ + +/*! ZSTD_initCStream_srcSize() : + * This function is deprecated, and equivalent to: + * ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only); + * ZSTD_CCtx_refCDict(zcs, NULL); // clear the dictionary (if any) + * ZSTD_CCtx_setParameter(zcs, ZSTD_c_compressionLevel, compressionLevel); + * ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize); + * + * pledgedSrcSize must be correct. If it is not known at init time, use + * ZSTD_CONTENTSIZE_UNKNOWN. Note that, for compatibility with older programs, + * "0" also disables frame content size field. It may be enabled in the future. + * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x + */ +ZSTDLIB_API size_t +ZSTD_initCStream_srcSize(ZSTD_CStream* zcs, + int compressionLevel, + unsigned long long pledgedSrcSize); + +/*! 
ZSTD_initCStream_usingDict() : + * This function is deprecated, and is equivalent to: + * ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only); + * ZSTD_CCtx_setParameter(zcs, ZSTD_c_compressionLevel, compressionLevel); + * ZSTD_CCtx_loadDictionary(zcs, dict, dictSize); + * + * Creates of an internal CDict (incompatible with static CCtx), except if + * dict == NULL or dictSize < 8, in which case no dict is used. + * Note: dict is loaded with ZSTD_dct_auto (treated as a full zstd dictionary if + * it begins with ZSTD_MAGIC_DICTIONARY, else as raw content) and ZSTD_dlm_byCopy. + * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x + */ +ZSTDLIB_API size_t +ZSTD_initCStream_usingDict(ZSTD_CStream* zcs, + const void* dict, size_t dictSize, + int compressionLevel); + +/*! ZSTD_initCStream_advanced() : + * This function is deprecated, and is approximately equivalent to: + * ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only); + * // Pseudocode: Set each zstd parameter and leave the rest as-is. + * for ((param, value) : params) { + * ZSTD_CCtx_setParameter(zcs, param, value); + * } + * ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize); + * ZSTD_CCtx_loadDictionary(zcs, dict, dictSize); + * + * dict is loaded with ZSTD_dct_auto and ZSTD_dlm_byCopy. + * pledgedSrcSize must be correct. + * If srcSize is not known at init time, use value ZSTD_CONTENTSIZE_UNKNOWN. + * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x + */ +ZSTDLIB_API size_t +ZSTD_initCStream_advanced(ZSTD_CStream* zcs, + const void* dict, size_t dictSize, + ZSTD_parameters params, + unsigned long long pledgedSrcSize); + +/*! 
ZSTD_initCStream_usingCDict() : + * This function is deprecated, and equivalent to: + * ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only); + * ZSTD_CCtx_refCDict(zcs, cdict); + * + * note : cdict will just be referenced, and must outlive compression session + * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x + */ +ZSTDLIB_API size_t ZSTD_initCStream_usingCDict(ZSTD_CStream* zcs, const ZSTD_CDict* cdict); + +/*! ZSTD_initCStream_usingCDict_advanced() : + * This function is DEPRECATED, and is approximately equivalent to: + * ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only); + * // Pseudocode: Set each zstd frame parameter and leave the rest as-is. + * for ((fParam, value) : fParams) { + * ZSTD_CCtx_setParameter(zcs, fParam, value); + * } + * ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize); + * ZSTD_CCtx_refCDict(zcs, cdict); + * + * same as ZSTD_initCStream_usingCDict(), with control over frame parameters. + * pledgedSrcSize must be correct. If srcSize is not known at init time, use + * value ZSTD_CONTENTSIZE_UNKNOWN. + * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x + */ +ZSTDLIB_API size_t +ZSTD_initCStream_usingCDict_advanced(ZSTD_CStream* zcs, + const ZSTD_CDict* cdict, + ZSTD_frameParameters fParams, + unsigned long long pledgedSrcSize); + +/*! ZSTD_resetCStream() : + * This function is deprecated, and is equivalent to: + * ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only); + * ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize); + * + * start a new frame, using same parameters from previous frame. + * This is typically useful to skip dictionary loading stage, since it will re-use it in-place. + * Note that zcs must be init at least once before using ZSTD_resetCStream(). + * If pledgedSrcSize is not known at reset time, use macro ZSTD_CONTENTSIZE_UNKNOWN. 
+ * If pledgedSrcSize > 0, its value must be correct, as it will be written in header, and controlled at the end. + * For the time being, pledgedSrcSize==0 is interpreted as "srcSize unknown" for compatibility with older programs, + * but it will change to mean "empty" in future version, so use macro ZSTD_CONTENTSIZE_UNKNOWN instead. + * @return : 0, or an error code (which can be tested using ZSTD_isError()) + * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x + */ +ZSTDLIB_API size_t ZSTD_resetCStream(ZSTD_CStream* zcs, unsigned long long pledgedSrcSize); + + +typedef struct { + unsigned long long ingested; /* nb input bytes read and buffered */ + unsigned long long consumed; /* nb input bytes actually compressed */ + unsigned long long produced; /* nb of compressed bytes generated and buffered */ + unsigned long long flushed; /* nb of compressed bytes flushed : not provided; can be tracked from caller side */ + unsigned currentJobID; /* MT only : latest started job nb */ + unsigned nbActiveWorkers; /* MT only : nb of workers actively compressing at probe time */ +} ZSTD_frameProgression; + +/* ZSTD_getFrameProgression() : + * tells how much data has been ingested (read from input) + * consumed (input actually compressed) and produced (output) for current frame. + * Note : (ingested - consumed) is amount of input data buffered internally, not yet compressed. + * Aggregates progression inside active worker threads. + */ +ZSTDLIB_API ZSTD_frameProgression ZSTD_getFrameProgression(const ZSTD_CCtx* cctx); + +/*! ZSTD_toFlushNow() : + * Tell how many bytes are ready to be flushed immediately. + * Useful for multithreading scenarios (nbWorkers >= 1). + * Probe the oldest active job, defined as oldest job not yet entirely flushed, + * and check its output buffer. + * @return : amount of data stored in oldest job and ready to be flushed immediately. 
+ * if @return == 0, it means either : + * + there is no active job (could be checked with ZSTD_frameProgression()), or + * + oldest job is still actively compressing data, + * but everything it has produced has also been flushed so far, + * therefore flush speed is limited by production speed of oldest job + * irrespective of the speed of concurrent (and newer) jobs. + */ +ZSTDLIB_API size_t ZSTD_toFlushNow(ZSTD_CCtx* cctx); + + +/*===== Advanced Streaming decompression functions =====*/ + +/*! + * This function is deprecated, and is equivalent to: + * + * ZSTD_DCtx_reset(zds, ZSTD_reset_session_only); + * ZSTD_DCtx_loadDictionary(zds, dict, dictSize); + * + * note: no dictionary will be used if dict == NULL or dictSize < 8 + * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x + */ +ZSTDLIB_API size_t ZSTD_initDStream_usingDict(ZSTD_DStream* zds, const void* dict, size_t dictSize); + +/*! + * This function is deprecated, and is equivalent to: + * + * ZSTD_DCtx_reset(zds, ZSTD_reset_session_only); + * ZSTD_DCtx_refDDict(zds, ddict); + * + * note : ddict is referenced, it must outlive decompression session + * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x + */ +ZSTDLIB_API size_t ZSTD_initDStream_usingDDict(ZSTD_DStream* zds, const ZSTD_DDict* ddict); + +/*! 
+ * This function is deprecated, and is equivalent to: + * + * ZSTD_DCtx_reset(zds, ZSTD_reset_session_only); + * + * re-use decompression parameters from previous init; saves dictionary loading + * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x + */ +ZSTDLIB_API size_t ZSTD_resetDStream(ZSTD_DStream* zds); + + +/********************************************************************* +* Buffer-less and synchronous inner streaming functions +* +* This is an advanced API, giving full control over buffer management, for users which need direct control over memory. +* But it's also a complex one, with several restrictions, documented below. +* Prefer normal streaming API for an easier experience. +********************************************************************* */ + +/** + Buffer-less streaming compression (synchronous mode) + + A ZSTD_CCtx object is required to track streaming operations. + Use ZSTD_createCCtx() / ZSTD_freeCCtx() to manage resource. + ZSTD_CCtx object can be re-used multiple times within successive compression operations. + + Start by initializing a context. + Use ZSTD_compressBegin(), or ZSTD_compressBegin_usingDict() for dictionary compression, + or ZSTD_compressBegin_advanced(), for finer parameter control. + It's also possible to duplicate a reference context which has already been initialized, using ZSTD_copyCCtx() + + Then, consume your input using ZSTD_compressContinue(). + There are some important considerations to keep in mind when using this advanced function : + - ZSTD_compressContinue() has no internal buffer. It uses externally provided buffers only. + - Interface is synchronous : input is consumed entirely and produces 1+ compressed blocks. + - Caller must ensure there is enough space in `dst` to store compressed data under worst case scenario. + Worst case evaluation is provided by ZSTD_compressBound(). 
+ ZSTD_compressContinue() doesn't guarantee recover after a failed compression. + - ZSTD_compressContinue() presumes prior input ***is still accessible and unmodified*** (up to maximum distance size, see WindowLog). + It remembers all previous contiguous blocks, plus one separated memory segment (which can itself consists of multiple contiguous blocks) + - ZSTD_compressContinue() detects that prior input has been overwritten when `src` buffer overlaps. + In which case, it will "discard" the relevant memory section from its history. + + Finish a frame with ZSTD_compressEnd(), which will write the last block(s) and optional checksum. + It's possible to use srcSize==0, in which case, it will write a final empty block to end the frame. + Without last block mark, frames are considered unfinished (hence corrupted) by compliant decoders. + + `ZSTD_CCtx` object can be re-used (ZSTD_compressBegin()) to compress again. +*/ + +/*===== Buffer-less streaming compression functions =====*/ +ZSTDLIB_API size_t ZSTD_compressBegin(ZSTD_CCtx* cctx, int compressionLevel); +ZSTDLIB_API size_t ZSTD_compressBegin_usingDict(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, int compressionLevel); +ZSTDLIB_API size_t ZSTD_compressBegin_advanced(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, ZSTD_parameters params, unsigned long long pledgedSrcSize); /**< pledgedSrcSize : If srcSize is not known at init time, use ZSTD_CONTENTSIZE_UNKNOWN */ +ZSTDLIB_API size_t ZSTD_compressBegin_usingCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict); /**< note: fails if cdict==NULL */ +ZSTDLIB_API size_t ZSTD_compressBegin_usingCDict_advanced(ZSTD_CCtx* const cctx, const ZSTD_CDict* const cdict, ZSTD_frameParameters const fParams, unsigned long long const pledgedSrcSize); /* compression parameters are already set within cdict. pledgedSrcSize must be correct. 
If srcSize is not known, use macro ZSTD_CONTENTSIZE_UNKNOWN */ +ZSTDLIB_API size_t ZSTD_copyCCtx(ZSTD_CCtx* cctx, const ZSTD_CCtx* preparedCCtx, unsigned long long pledgedSrcSize); /**< note: if pledgedSrcSize is not known, use ZSTD_CONTENTSIZE_UNKNOWN */ + +ZSTDLIB_API size_t ZSTD_compressContinue(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); +ZSTDLIB_API size_t ZSTD_compressEnd(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); + + +/** + Buffer-less streaming decompression (synchronous mode) + + A ZSTD_DCtx object is required to track streaming operations. + Use ZSTD_createDCtx() / ZSTD_freeDCtx() to manage it. + A ZSTD_DCtx object can be re-used multiple times. + + First typical operation is to retrieve frame parameters, using ZSTD_getFrameHeader(). + Frame header is extracted from the beginning of compressed frame, so providing only the frame's beginning is enough. + Data fragment must be large enough to ensure successful decoding. + `ZSTD_frameHeaderSize_max` bytes is guaranteed to always be large enough. + @result : 0 : successful decoding, the `ZSTD_frameHeader` structure is correctly filled. + >0 : `srcSize` is too small, please provide at least @result bytes on next attempt. + errorCode, which can be tested using ZSTD_isError(). + + It fills a ZSTD_frameHeader structure with important information to correctly decode the frame, + such as the dictionary ID, content size, or maximum back-reference distance (`windowSize`). + Note that these values could be wrong, either because of data corruption, or because a 3rd party deliberately spoofs false information. + As a consequence, check that values remain within valid application range. + For example, do not allocate memory blindly, check that `windowSize` is within expectation. + Each application can set its own limits, depending on local restrictions. + For extended interoperability, it is recommended to support `windowSize` of at least 8 MB. 
+ + ZSTD_decompressContinue() needs previous data blocks during decompression, up to `windowSize` bytes. + ZSTD_decompressContinue() is very sensitive to contiguity, + if 2 blocks don't follow each other, make sure that either the compressor breaks contiguity at the same place, + or that previous contiguous segment is large enough to properly handle maximum back-reference distance. + There are multiple ways to guarantee this condition. + + The most memory efficient way is to use a round buffer of sufficient size. + Sufficient size is determined by invoking ZSTD_decodingBufferSize_min(), + which can @return an error code if required value is too large for current system (in 32-bits mode). + In a round buffer methodology, ZSTD_decompressContinue() decompresses each block next to previous one, + up to the moment there is not enough room left in the buffer to guarantee decoding another full block, + whose maximum size is provided in `ZSTD_frameHeader` structure, field `blockSizeMax`. + At which point, decoding can resume from the beginning of the buffer. + Note that already decoded data stored in the buffer should be flushed before being overwritten. + + There are alternatives possible, for example using two or more buffers of size `windowSize` each, though they consume more memory. + + Finally, if you control the compression process, you can also ignore all buffer size rules, + as long as the encoder and decoder progress in "lock-step", + aka use exactly the same buffer sizes, break contiguity at the same place, etc. + + Once buffers are set up, start decompression, with ZSTD_decompressBegin(). + If decompression requires a dictionary, use ZSTD_decompressBegin_usingDict() or ZSTD_decompressBegin_usingDDict(). + + Then use ZSTD_nextSrcSizeToDecompress() and ZSTD_decompressContinue() alternately. + ZSTD_nextSrcSizeToDecompress() tells how many bytes to provide as 'srcSize' to ZSTD_decompressContinue(). 
+ ZSTD_decompressContinue() requires this _exact_ amount of bytes, or it will fail. + + @result of ZSTD_decompressContinue() is the number of bytes regenerated within 'dst' (necessarily <= dstCapacity). + It can be zero : it just means ZSTD_decompressContinue() has decoded some metadata item. + It can also be an error code, which can be tested with ZSTD_isError(). + + A frame is fully decoded when ZSTD_nextSrcSizeToDecompress() returns zero. + Context can then be reset to start a new decompression. + + Note : it's possible to know if next input to present is a header or a block, using ZSTD_nextInputType(). + This information is not required to properly decode a frame. + + == Special case : skippable frames == + + Skippable frames allow integration of user-defined data into a flow of concatenated frames. + Skippable frames will be ignored (skipped) by decompressor. + The format of skippable frames is as follows : + a) Skippable frame ID - 4 Bytes, Little endian format, any value from 0x184D2A50 to 0x184D2A5F + b) Frame Size - 4 Bytes, Little endian format, unsigned 32-bits + c) Frame Content - any content (User Data) of length equal to Frame Size + For skippable frames ZSTD_getFrameHeader() returns zfhPtr->frameType==ZSTD_skippableFrame. + For skippable frames ZSTD_decompressContinue() always returns 0 : it only skips the content. +*/ + +/*===== Buffer-less streaming decompression functions =====*/ +typedef enum { ZSTD_frame, ZSTD_skippableFrame } ZSTD_frameType_e; +typedef struct { + unsigned long long frameContentSize; /* if == ZSTD_CONTENTSIZE_UNKNOWN, it means this field is not available. 0 means "empty" */ + unsigned long long windowSize; /* can be very large, up to <= frameContentSize */ + unsigned blockSizeMax; + ZSTD_frameType_e frameType; /* if == ZSTD_skippableFrame, frameContentSize is the size of skippable content */ + unsigned headerSize; + unsigned dictID; + unsigned checksumFlag; +} ZSTD_frameHeader; + +/*! 
ZSTD_getFrameHeader() : + * decode Frame Header, or requires larger `srcSize`. + * @return : 0, `zfhPtr` is correctly filled, + * >0, `srcSize` is too small, value is wanted `srcSize` amount, + * or an error code, which can be tested using ZSTD_isError() */ +ZSTDLIB_API size_t ZSTD_getFrameHeader(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize); /**< doesn't consume input */ +/*! ZSTD_getFrameHeader_advanced() : + * same as ZSTD_getFrameHeader(), + * with added capability to select a format (like ZSTD_f_zstd1_magicless) */ +ZSTDLIB_API size_t ZSTD_getFrameHeader_advanced(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize, ZSTD_format_e format); +ZSTDLIB_API size_t ZSTD_decodingBufferSize_min(unsigned long long windowSize, unsigned long long frameContentSize); /**< when frame content size is not known, pass in frameContentSize == ZSTD_CONTENTSIZE_UNKNOWN */ + +ZSTDLIB_API size_t ZSTD_decompressBegin(ZSTD_DCtx* dctx); +ZSTDLIB_API size_t ZSTD_decompressBegin_usingDict(ZSTD_DCtx* dctx, const void* dict, size_t dictSize); +ZSTDLIB_API size_t ZSTD_decompressBegin_usingDDict(ZSTD_DCtx* dctx, const ZSTD_DDict* ddict); + +ZSTDLIB_API size_t ZSTD_nextSrcSizeToDecompress(ZSTD_DCtx* dctx); +ZSTDLIB_API size_t ZSTD_decompressContinue(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); + +/* misc */ +ZSTDLIB_API void ZSTD_copyDCtx(ZSTD_DCtx* dctx, const ZSTD_DCtx* preparedDCtx); +typedef enum { ZSTDnit_frameHeader, ZSTDnit_blockHeader, ZSTDnit_block, ZSTDnit_lastBlock, ZSTDnit_checksum, ZSTDnit_skippableFrame } ZSTD_nextInputType_e; +ZSTDLIB_API ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx* dctx); + + + + +/* ============================ */ +/** Block level API */ +/* ============================ */ + +/*! + Block functions produce and decode raw zstd blocks, without frame metadata. + Frame metadata cost is typically ~12 bytes, which can be non-negligible for very small blocks (< 100 bytes). 
+ But users will have to take in charge needed metadata to regenerate data, such as compressed and content sizes. + + A few rules to respect : + - Compressing and decompressing require a context structure + + Use ZSTD_createCCtx() and ZSTD_createDCtx() + - It is necessary to init context before starting + + compression : any ZSTD_compressBegin*() variant, including with dictionary + + decompression : any ZSTD_decompressBegin*() variant, including with dictionary + + copyCCtx() and copyDCtx() can be used too + - Block size is limited, it must be <= ZSTD_getBlockSize() <= ZSTD_BLOCKSIZE_MAX == 128 KB + + If input is larger than a block size, it's necessary to split input data into multiple blocks + + For inputs larger than a single block, consider using regular ZSTD_compress() instead. + Frame metadata is not that costly, and quickly becomes negligible as source size grows larger than a block. + - When a block is considered not compressible enough, ZSTD_compressBlock() result will be 0 (zero) ! + ===> In which case, nothing is produced into `dst` ! + + User __must__ test for such outcome and deal directly with uncompressed data + + A block cannot be declared incompressible if ZSTD_compressBlock() return value was != 0. + Doing so would mess up with statistics history, leading to potential data corruption. + + ZSTD_decompressBlock() _doesn't accept uncompressed data as input_ !! + + In case of multiple successive blocks, should some of them be uncompressed, + decoder must be informed of their existence in order to follow proper history. + Use ZSTD_insertBlock() for such a case. 
+*/ + +/*===== Raw zstd block functions =====*/ +ZSTDLIB_API size_t ZSTD_getBlockSize (const ZSTD_CCtx* cctx); +ZSTDLIB_API size_t ZSTD_compressBlock (ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); +ZSTDLIB_API size_t ZSTD_decompressBlock(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); +ZSTDLIB_API size_t ZSTD_insertBlock (ZSTD_DCtx* dctx, const void* blockStart, size_t blockSize); /**< insert uncompressed block into `dctx` history. Useful for multi-blocks decompression. */ + + +#endif /* ZSTD_H_ZSTD_STATIC_LINKING_ONLY */ + +#if defined (__cplusplus) +} +#endif