diff --git a/src/PARAMS.med_core b/src/PARAMS.med_core new file mode 100644 index 00000000..9ec860d4 --- /dev/null +++ b/src/PARAMS.med_core @@ -0,0 +1,229 @@ +# File : PARAMS.kaby_lake +# Date : 03/06/19 +# Description : Kaby Lake Configuration +# +# Based on documentation found here: +# https://en.wikichip.org/wiki/intel/microarchitectures/kaby_lake + +## Simulation Parameters +--mode full +--model cmp +--sim_limit none + +## Core Parameters + +# Femptoseconds, 3.2GHz, used for energy estimates. +--chip_cycle_time 312500 + + +### Fetch Stage +--fetch_off_path_ops 1 +--fetch_across_cache_lines 1 + +# Will break the packet upon a taken branch. +--fetch_break_on_taken 1 + +# Number of bubble cycles to wait after taken branch. +--fetch_taken_bubble_cycles 0 + +#### ICache +--icache_size 32768 +--icache_assoc 8 +--icache_line_size 64 + +### Branch Predictor +--extra_recovery_cycles 0 # Number of cycles before the fetching of the first instructions after recovery. +--extra_redirect_cycles 0 # Number of cycles before the fetching of the first instructions after redirect. +--cfs_per_cycle 6 # Number of branches that can be predicted in a single cycle +--bp_update_at_retire 0 # Update the BP at retire. If false, update at the end of exec. +--update_bp_off_path 1 # Allow off path ops to update branch predictor state (e.g., ops when they complete exec stage). +--bp_mech tagescl +--taken_per_cycle 1 +--fetch_queue_size 256 +--fetch_queue_num_taken 16 + + +#### BTB + +# BTB model to use. +--btb_mech generic +--btb_entries 4096 +--btb_assoc 4 + +# Allow the BTB to be updated by off path ops. +--btb_off_path_writes 1 + + +#### CRS + +# Enable return stack +--enable_crs 1 +--crs_entries 32 +--crs_realistic 1 + +### iBP +--enable_ibp 1 # Enable the indirect branch predictor +--ibtb_mech tc_tagged + # iBTB Model. tc_tagless, tc_tagged, tc_hybrid. +--ibtb_off_path_writes 1 # Allow off path ops to update the ibtb. +--tc_entries 4096 +--tc_assoc 4 + +### Decode Stage +--decode_cycles 5 + + +### Map Stage +--map_cycles 8 + +### Issue Stage + +# Max number of instructions to be fetched, decoded, renamed, and issued per cycle. +--issue_width 8 + +--rs_fill_width 0 +--rs_sizes 176 +--rs_connections 0 +--fu_types 0 0 0 0 0 0 0 0 0 0 0 0 + + +### Exec Stage + +### DCache +--dcache_size 49152 +--dcache_read_ports 2 +--dcache_write_ports 1 +--dcache_banks 2 +--dcache_assoc 12 +--dcache_line_size 64 + +### Reorder/Retire Stage + +# Max number of instructions to be retired per cycle. 
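# Quick sanity check on --chip_cycle_time above: at 3.2 GHz the clock period is
#   1 / 3.2e9 Hz = 312.5 ps = 312,500 fs,
# which is the femtosecond value used for the energy estimates.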
+--node_ret_width 16 +--node_table_size 352 +#--load_queue_entries 128 +#--store_queue_entries 72 + +# Do not keep stores in RSVs on cache misses; TODO: confirm what this knob does +--stores_do_not_block_window 1 + +# TODO: confirm what this knob does +--prefs_do_not_block_window 1 + + +## Uncore + +### Mid-level cache + +# Enable use of a midlevel cache between i/d and L1 +--mlc_present 0 + +### LLC +--l1_size 1048576 +--l1_banks 1 +--l1_cycles 18 +--l1_assoc 16 +--l1_line_size 64 +--l1_interleave_factor 64 + + +### Prefetcher +--pref_framework_on 1 +--pref_stream_on 1 +--pref_stream_per_core_enable 1 +--pref_shared_queues 0 +--pref_train_on_pref_misses 0 +--pref_oracle_train 0 +--pref_ul1req_queue_overwrite_on_full 1 + +--stream_length 64 +--stream_prefetch_n 4 +--stream_start_dis 1 +--stream_train_num 4 +--stream_create_on_dc_miss 0 +--stream_create_on_l1_miss 1 + +--pref_throttlefb_on=1 +--pref_acc_thresh_1=0.75 +--pref_acc_thresh_2=0.4 +--pref_acc_thresh_3=0.4 +--pref_timely_thresh=0.01 +--pref_polpf_thresh=0.005 +--pref_update_interval=8192 + +--mem_req_buffer_pref_watermark 4 +--promote_to_higher_priority_mem_req_type 1 + +### Memory +--addr_translation random + +--mem_priority_ifetch 0 +--mem_priority_dfetch 1 +--mem_priority_dstore 2 +--mem_priority_iprf 3 +--mem_priority_dprf 4 +--mem_priority_wb 5 +--mem_priority_wb_nodirty 5 + +--mem_req_buffer_entries 64 +--mem_l1_fill_queue_entries 64 +--va_page_size_bytes 4096 +--bus_width_in_bytes 8 + +--ramulator_standard DDR4 +--ramulator_speed DDR4_2400R +--ramulator_org DDR4_8Gb_x8 +--ramulator_channels 2 +--ramulator_ranks 1 +--ramulator_bankgroups 4 +--ramulator_banks 4 +--ramulator_chip_width 8 +--ramulator_rows 65536 +--ramulator_cols 1024 +--ramulator_scheduling_policy FRFCFS_Cap +--ramulator_readq_entries 64 +--ramulator_writeq_entries 64 +--ramulator_record_cmd_trace FALSE +--ramulator_print_cmd_trace FALSE +--ramulator_tCK 833333 +--ramulator_tCL 16 +--ramulator_tCCD 6 +--ramulator_tCCDS 4 +--ramulator_tCCDL 6 +--ramulator_tCWL 12 +--ramulator_tBL 4 +--ramulator_tWTR 9 +--ramulator_tWTRS 3 +--ramulator_tWTRL 9 +--ramulator_tRP 16 +--ramulator_tRPpb 16 +--ramulator_tRPab 16 +--ramulator_tRCD 16 +--ramulator_tRCDR 16 +--ramulator_tRCDW 16 +--ramulator_tRAS 39 +--dram_tech_in_nm 32 + +## Other + + +### Debug +--debug_inst_start 1 +--debug_inst_stop 100000000 +--debug_cycle_start 1 +--debug_cycle_stop 100000000 + + +## Stats and Params +--dump_params 1 +--dump_stats 1 +--dump_trace 0 + +#################################### +--set_off_path_confirmed 1 + +--order_beyond_bus 1 + +--mem_ooo_stores 1 +--mem_obey_store_dep 1 diff --git a/src/bp/bp.c b/src/bp/bp.c index 9dc828c6..11e4b865 100644 --- a/src/bp/bp.c +++ b/src/bp/bp.c @@ -253,6 +253,7 @@ void init_bp_data(uns8 proc_id, Bp_Data* bp_data) { /* bp_predict_op: predicts the target of a control flow instruction */ Addr bp_predict_op(Bp_Data* bp_data, Op* op, uns br_num, Addr fetch_addr) { + DEBUG(0,"before bp predict op, recovery_sch %d\n", op->oracle_info.recovery_sch); Addr addr = fetch_addr; /*Addr line_addr;*/ Addr* btb_target; @@ -283,7 +284,9 @@ Addr bp_predict_op(Bp_Data* bp_data, Op* op, uns br_num, Addr fetch_addr) { op->recovery_info.branchTarget = op->oracle_info.target; + DEBUG(0,"before timestamp, recovery_sch %d\n", op->oracle_info.recovery_sch); bp_data->bp->timestamp_func(op); + DEBUG(0,"after timestamp, recovery_sch %d\n", op->oracle_info.recovery_sch); if(USE_LATE_BP) { bp_data->late_bp->timestamp_func(op); } @@ -369,15 +372,24 @@ Addr bp_predict_op(Bp_Data* 
bp_data, Op* op, uns br_num, Addr fetch_addr) { op->oracle_info.pred = op->oracle_info.dir; op->oracle_info.no_target = FALSE; } else { + DEBUG(0,"before pred func, recovery_sch %d\n", op->oracle_info.recovery_sch); op->oracle_info.pred = bp_data->bp->pred_func(op); + DEBUG(0,"after pred func, recovery_sch %d\n", op->oracle_info.recovery_sch); if(USE_LATE_BP) { op->oracle_info.late_pred = bp_data->late_bp->pred_func(op); } + DEBUG(0,"after late pred func, recovery_sch %d\n", op->oracle_info.recovery_sch); } // Update history used by the rest of Scarab. - bp_data->global_hist = (bp_data->global_hist >> 1) | - (op->oracle_info.pred << 31); + if(USE_LATE_BP && DECOUPLED_BP){ + bp_data->global_hist = (bp_data->global_hist >> 1) | + (op->oracle_info.late_pred << 31); + } + else{ + bp_data->global_hist = (bp_data->global_hist >> 1) | + (op->oracle_info.pred << 31); + } if(PERFECT_CBR_BTB || (PERFECT_NT_BTB && op->oracle_info.pred == NOT_TAKEN)) { @@ -474,12 +486,7 @@ Addr bp_predict_op(Bp_Data* bp_data, Op* op, uns br_num, Addr fetch_addr) { } // }}} - // pred_target = convert_to_cmp_addr(op->proc_id, pred_target); - - bp_data->bp->spec_update_func(op); - if(USE_LATE_BP) { - bp_data->late_bp->spec_update_func(op); - } + pred_target = convert_to_cmp_addr(op->proc_id, pred_target); const Addr pc_plus_offset = ADDR_PLUS_OFFSET( op->inst_info->addr, op->inst_info->trace_info.inst_size); @@ -494,6 +501,10 @@ Addr bp_predict_op(Bp_Data* bp_data, Op* op, uns br_num, Addr fetch_addr) { op->oracle_info.misfetch = !op->oracle_info.mispred && prediction != op->oracle_info.npc; + STAT_EVENT(op->proc_id, BP_ON_PATH_CORRECT + op->oracle_info.mispred + + 2 * op->oracle_info.misfetch + 3 * op->off_path); + op->oracle_info.early_late_disagree = FALSE; + op->oracle_info.early_pred = op->oracle_info.pred; if(USE_LATE_BP) { const Addr late_prediction = op->oracle_info.late_pred ? 
pred_target : pc_plus_offset; @@ -503,8 +514,25 @@ Addr bp_predict_op(Bp_Data* bp_data, Op* op, uns br_num, Addr fetch_addr) { (late_prediction != op->oracle_info.npc); op->oracle_info.late_misfetch = !op->oracle_info.late_mispred && late_prediction != op->oracle_info.npc; + if(DECOUPLED_BP){ + if(late_prediction != prediction){ + op->oracle_info.early_late_disagree = TRUE; + } + op->oracle_info.pred = op->oracle_info.late_pred; + op->oracle_info.mispred = op->oracle_info.late_mispred; + op->oracle_info.misfetch= op->oracle_info.late_misfetch; + op->oracle_info.pred_npc = op->oracle_info.late_pred_npc; + } } + if(!TAGE_NO_UNCOND_UPDATE || op->table_info->cf_type != CF_BR){ + bp_data->bp->spec_update_func(op); + if(USE_LATE_BP) { + bp_data->late_bp->spec_update_func(op); + } + } + + op->bp_cycle = cycle_count; // {{{ stats and debugging @@ -520,8 +548,6 @@ Addr bp_predict_op(Bp_Data* bp_data, Op* op, uns br_num, Addr fetch_addr) { STAT_EVENT(op->proc_id, BTB_OFF_PATH_MISS); } - STAT_EVENT(op->proc_id, BP_ON_PATH_CORRECT + op->oracle_info.mispred + - 2 * op->oracle_info.misfetch + 3 * op->off_path); STAT_EVENT(op->proc_id, LATE_BP_ON_PATH_CORRECT + op->oracle_info.late_mispred + 2 * op->oracle_info.late_misfetch + 3 * op->off_path); @@ -593,7 +619,8 @@ Addr bp_predict_op(Bp_Data* bp_data, Op* op, uns br_num, Addr fetch_addr) { DEBUG(bp_data->proc_id, "low_conf_count:%d \n", td->td_info.low_conf_count); } - return prediction; + DEBUG(0,"end bp predict op, recovery_sch %d\n", op->oracle_info.recovery_sch); + return op->oracle_info.pred_npc; } diff --git a/src/bp/bp.param.def b/src/bp/bp.param.def index 55775590..a4f20908 100644 --- a/src/bp/bp.param.def +++ b/src/bp/bp.param.def @@ -57,7 +57,13 @@ DEF_PARAM( perfect_crs , PERFECT_CRS , Flag , Fl DEF_PARAM( perfect_cbr_btb , PERFECT_CBR_BTB , Flag , Flag , FALSE , ) DEF_PARAM( perfect_nt_btb , PERFECT_NT_BTB , Flag , Flag , FALSE , ) +DEF_PARAM( decoupled_bp , DECOUPLED_BP , Flag , Flag , FALSE , ) +DEF_PARAM( fetch_queue_size , FETCH_QUEUE_SIZE , uns , uns , 32 , ) +DEF_PARAM( fetch_queue_num_taken , FETCH_QUEUE_NUM_TAKEN , uns , uns , 4 , ) DEF_PARAM( cfs_per_cycle , CFS_PER_CYCLE , uns , uns , 3 , ) +DEF_PARAM( taken_per_cycle , TAKEN_PER_CYCLE , uns , uns , 1 , ) +DEF_PARAM( bp_op_per_cycle , BP_OP_PER_CYCLE , uns , uns , 24 , ) + DEF_PARAM( update_bp_off_path , UPDATE_BP_OFF_PATH , Flag , Flag , FALSE , ) DEF_PARAM( bp_update_at_retire , BP_UPDATE_AT_RETIRE , Flag , Flag , FALSE , ) @@ -66,7 +72,7 @@ DEF_PARAM( bp_mech , BP_MECH , uns , bp DEF_PARAM( late_bp_mech , LATE_BP_MECH , uns , bp_mech , NUM_BP , ) DEF_PARAM( late_bp_latency , LATE_BP_LATENCY , uns , uns , 5 , ) DEF_PARAM( hist_length , HIST_LENGTH , uns , uns , 16 , ) -DEF_PARAM( pht_ctr_bits , PHT_CTR_BITS , uns , uns , 2 , const ) /* const */ +DEF_PARAM( pht_ctr_bits , PHT_CTR_BITS , uns , uns , 2 , ) /* const */ DEF_PARAM( bht_entries , BHT_ENTRIES , uns , uns , (4 * 1024) , ) DEF_PARAM( bht_assoc , BHT_ASSOC , uns , uns , 4 , ) DEF_PARAM( hybrids_index_length , HYBRIDS_INDEX_LENGTH , uns , uns , 16 , ) @@ -151,3 +157,32 @@ DEF_PARAM( perceptron_train_misp_factor , PERCEPTRON_TRAIN_MISP_FACTOR DEF_PARAM( perceptron_train_corr_factor , PERCEPTRON_TRAIN_CORR_FACTOR , uns , uns , 1 , ) DEF_PARAM( perceptron_conf_his_both , PERCEPTRON_CONF_HIS_BOTH , Flag , Flag , FALSE , ) DEF_PARAM( perceptron_conf_his_both_length , PERCEPTRON_CONF_HIS_BOTH_LENGTH , uns , uns , 4 , ) +DEF_PARAM( path_under_cf_hist_length, PATH_UNDER_CF_HIST_LENGTH, uns , uns , 16 , ) + + +DEF_PARAM( 
use_2_bit_counter_in_l0, USE_2_BIT_COUNTER_IN_L0 , Flag, Flag, TRUE , ) +DEF_PARAM( l0_btb_size, L0_BTB_SIZE, uns, uns, 1024, ) +DEF_PARAM( l0_btb_assoc, L0_BTB_ASSOC, uns, uns, 4, ) + +DEF_PARAM( ffp_tage_with_ras, FFP_TAGE_WITH_RAS, Flag, Flag, FALSE , ) +DEF_PARAM( ffp_use_late_pred, FFP_USE_LATE_PRED, Flag, Flag, FALSE , ) +DEF_PARAM( ffp_num_tage, FFP_NUM_TAGE, uns, uns, 4, ) +DEF_PARAM( ffp_use_bm, FFP_USE_BM, Flag, Flag, FALSE, ) +DEF_PARAM( ffp_use_gshare, FFP_USE_GSHARE, Flag, Flag, FALSE, ) +DEF_PARAM( ffp_skip_tage_if_use_bm, FFP_SKIP_TAGE_IF_USE_BM, Flag, Flag, FALSE, ) +DEF_PARAM( ahead_distance, AHEAD_DISTANCE, uns, uns, 5, ) +DEF_PARAM( ffp_hash_pc_bits, FFP_HASH_PC_BITS, uns, uns, 3, ) +DEF_PARAM( future_tage_size_kb, FUTURE_TAGE_SIZE_KB, uns, uns, 64, ) +DEF_PARAM( future_tage_latency, FUTURE_TAGE_LATENCY, uns, uns, 3, ) +DEF_PARAM( ffp_gshare_thresh, FFP_GSHARE_THRESH, int, int, 4, ) +DEF_PARAM( ffp_bm_thresh, FFP_BM_THRESH, int, int, 2, ) +DEF_PARAM( ffp_ffp_thresh, FFP_FFP_THRESH, int, int, 0, ) +DEF_PARAM( ffp_kill_bm_one_wrong, FFP_KILL_BM_ONE_WRONG, Flag, Flag, FALSE, ) +DEF_PARAM( ffp_hash_dir, FFP_HASH_DIR, Flag, Flag, TRUE, ) +DEF_PARAM( ffp_gshare_hist, FFP_GSHARE_HIST, uns, uns, 5, ) +DEF_PARAM( ffp_dyn_ahead, FFP_DYN_AHEAD, Flag, Flag, FALSE, ) +DEF_PARAM( tagescl_size, TAGESCL_SIZE, uns, uns, 64, ) +DEF_PARAM( ffp_use_tage, FFP_USE_TAGE, Flag, Flag, FALSE, ) + +DEF_PARAM( tage_no_uncond_update, TAGE_NO_UNCOND_UPDATE, Flag, Flag, FALSE, ) +DEF_PARAM( tagescl_hist_length,TAGESCL_HIST_LENGTH , int, int, 3000, ) \ No newline at end of file diff --git a/src/bp/bp.stat.def b/src/bp/bp.stat.def index bb7526c7..78c83dd4 100644 --- a/src/bp/bp.stat.def +++ b/src/bp/bp.stat.def @@ -77,6 +77,18 @@ DEF_STAT( LATE_BP_OFF_PATH_CORRECT , DIST , NO_RATIO ) DEF_STAT( LATE_BP_OFF_PATH_MISPREDICT , COUNT , NO_RATIO ) DEF_STAT( LATE_BP_OFF_PATH_MISFETCH , DIST , NO_RATIO ) +DEF_STAT( PATH1_UNDER_CF, COUNT, NO_RATIO ) +DEF_STAT( PATH2_UNDER_CF, COUNT, NO_RATIO ) +DEF_STAT( PATH3_UNDER_CF, COUNT, NO_RATIO ) +DEF_STAT( PATH4_UNDER_CF, COUNT, NO_RATIO ) +DEF_STAT( PATH5_UNDER_CF, COUNT, NO_RATIO ) +DEF_STAT( PATH6_UNDER_CF, COUNT, NO_RATIO ) +DEF_STAT( PATH7_UNDER_CF, COUNT, NO_RATIO ) +DEF_STAT( PATH8_UNDER_CF, COUNT, NO_RATIO ) +DEF_STAT( PATH9_UNDER_CF, COUNT, NO_RATIO ) +DEF_STAT( PATH10_UNDER_CF, COUNT, NO_RATIO ) +DEF_STAT( PATH11_UNDER_CF, COUNT, NO_RATIO ) + DEF_STAT( BP_ON_PATH_CONF_PVN , DIST , NO_RATIO ) DEF_STAT( BP_ON_PATH_CONF_PVN_BOT , DIST , NO_RATIO ) @@ -535,3 +547,95 @@ DEF_STAT( CF_RET_USED_TARGET_INCORRECT, COUNT, NO_RATIO ) DEF_STAT( CF_DEFAULT_USED_TARGET_CORRECT, COUNT, NO_RATIO ) DEF_STAT( CF_DEFAULT_USED_TARGET_INCORRECT, DIST, NO_RATIO ) + +DEF_STAT( BP_BW_FULL_WIDTH, COUNT, NO_RATIO ) +DEF_STAT( BP_BW_FULL_FQ, COUNT, NO_RATIO ) +DEF_STAT( BP_BW_SYS_CALL, COUNT, NO_RATIO ) +DEF_STAT( BP_BW_LATE_BP_REDIRECT, COUNT, NO_RATIO ) +DEF_STAT( BP_BW_REDIRECT, COUNT, NO_RATIO ) +DEF_STAT( BP_BW_1_BREAK_TAKEN, COUNT, NO_RATIO ) +DEF_STAT( BP_BW_2_BREAK_TAKEN, COUNT, NO_RATIO ) +DEF_STAT( BP_BW_3_BREAK_TAKEN, COUNT, NO_RATIO ) +DEF_STAT( BP_BW_4_BREAK_TAKEN, COUNT, NO_RATIO ) +DEF_STAT( BP_BW_5_BREAK_TAKEN, COUNT, NO_RATIO ) +DEF_STAT( BP_BW_6_BREAK_TAKEN, COUNT, NO_RATIO ) +DEF_STAT( BP_BW_7_BREAK_TAKEN, COUNT, NO_RATIO ) +DEF_STAT( BP_BW_8_BREAK_TAKEN, COUNT, NO_RATIO ) +DEF_STAT( BP_BW_9_BREAK_TAKEN, COUNT, NO_RATIO ) +DEF_STAT( BP_BW_10_BREAK_TAKEN, COUNT, NO_RATIO ) +DEF_STAT( BP_BW_11_BREAK_TAKEN, COUNT, NO_RATIO ) +DEF_STAT( BP_BW_12_BREAK_TAKEN, COUNT, 
NO_RATIO ) +DEF_STAT( BP_BW_13_BREAK_TAKEN, COUNT, NO_RATIO ) +DEF_STAT( BP_BW_14_BREAK_TAKEN, COUNT, NO_RATIO ) +DEF_STAT( BP_BW_15_BREAK_TAKEN, COUNT, NO_RATIO ) +DEF_STAT( BP_BW_16_BREAK_TAKEN, COUNT, NO_RATIO ) +DEF_STAT( BP_BW_17_BREAK_TAKEN, COUNT, NO_RATIO ) +DEF_STAT( BP_BW_18_BREAK_TAKEN, COUNT, NO_RATIO ) +DEF_STAT( BP_BW_19_BREAK_TAKEN, COUNT, NO_RATIO ) +DEF_STAT( BP_BW_20_BREAK_TAKEN, COUNT, NO_RATIO ) +DEF_STAT( BP_BW_21_BREAK_TAKEN, COUNT, NO_RATIO ) +DEF_STAT( BP_BW_22_BREAK_TAKEN, COUNT, NO_RATIO ) +DEF_STAT( BP_BW_23_BREAK_TAKEN, COUNT, NO_RATIO ) +DEF_STAT( BP_BW_24_BREAK_TAKEN, COUNT, NO_RATIO ) +DEF_STAT( BP_BW_25_BREAK_TAKEN, COUNT, NO_RATIO ) +DEF_STAT( BP_BW_26_BREAK_TAKEN, COUNT, NO_RATIO ) +DEF_STAT( BP_BW_27_BREAK_TAKEN, COUNT, NO_RATIO ) +DEF_STAT( BP_BW_28_BREAK_TAKEN, COUNT, NO_RATIO ) +DEF_STAT( BP_BW_29_BREAK_TAKEN, COUNT, NO_RATIO ) +DEF_STAT( BP_BW_30_BREAK_TAKEN, COUNT, NO_RATIO ) +DEF_STAT( BP_BW_31_BREAK_TAKEN, COUNT, NO_RATIO ) +DEF_STAT( BP_BW_32_BREAK_TAKEN, COUNT, NO_RATIO ) +DEF_STAT( BP_BW_33_BREAK_TAKEN, COUNT, NO_RATIO ) +DEF_STAT( BP_BW_34_BREAK_TAKEN, COUNT, NO_RATIO ) +DEF_STAT( BP_BW_35_BREAK_TAKEN, COUNT, NO_RATIO ) +DEF_STAT( BP_BW_36_BREAK_TAKEN, COUNT, NO_RATIO ) +DEF_STAT( BP_BW_37_BREAK_TAKEN, COUNT, NO_RATIO ) +DEF_STAT( BP_BW_38_BREAK_TAKEN, COUNT, NO_RATIO ) +DEF_STAT( BP_BW_39_BREAK_TAKEN, COUNT, NO_RATIO ) + +DEF_STAT( TOTAL_ON_PATH_TAKEN, COUNT, NO_RATIO ) +DEF_STAT( TOTAL_ON_PATH_CYCLES, COUNT, NO_RATIO ) +DEF_STAT( TOTAL_EARLY_LATE_DISAGREE, COUNT, NO_RATIO ) + +DEF_STAT( FFP_AND_GSHARE_WRONG, COUNT, NO_RATIO ) +DEF_STAT( FFP_GSHARE_OVERRIDE, COUNT, NO_RATIO ) +DEF_STAT( FFP_BM_OVERRIDE, COUNT, NO_RATIO ) +DEF_STAT( FFP_USEFUL_GSHARE_OVERRIDE, COUNT, NO_RATIO ) +DEF_STAT( FFP_BAD_GSHARE_OVERRIDE, COUNT, NO_RATIO ) +DEF_STAT( FFP_USEFUL_BM_OVERRIDE, COUNT, NO_RATIO ) +DEF_STAT( FFP_BAD_BM_OVERRIDE, COUNT, NO_RATIO ) +DEF_STAT( FFP_LATE, COUNT, NO_RATIO ) +DEF_STAT( FFP_CONFLICT, COUNT, NO_RATIO ) +DEF_STAT( FFP_TAGE_MISPREDICT, COUNT, NO_RATIO ) +DEF_STAT( FFP_BANK0_ALLOC, COUNT, NO_RATIO ) +DEF_STAT( FFP_BANK1_ALLOC, COUNT, NO_RATIO ) +DEF_STAT( FFP_BANK2_ALLOC, COUNT, NO_RATIO ) +DEF_STAT( FFP_BANK3_ALLOC, COUNT, NO_RATIO ) +DEF_STAT( FFP_BANK4_ALLOC, COUNT, NO_RATIO ) +DEF_STAT( FFP_BANK5_ALLOC, COUNT, NO_RATIO ) +DEF_STAT( FFP_BANK6_ALLOC, COUNT, NO_RATIO ) +DEF_STAT( FFP_BANK7_ALLOC, COUNT, NO_RATIO ) +DEF_STAT( FFP_BANK8_ALLOC, COUNT, NO_RATIO ) +DEF_STAT( FFP_BANK9_ALLOC, COUNT, NO_RATIO ) +DEF_STAT( FFP_BANK10_ALLOC, COUNT, NO_RATIO ) +DEF_STAT( FFP_BANK11_ALLOC, COUNT, NO_RATIO ) +DEF_STAT( FFP_BANK12_ALLOC, COUNT, NO_RATIO ) +DEF_STAT( FFP_BANK13_ALLOC, COUNT, NO_RATIO ) +DEF_STAT( FFP_BANK14_ALLOC, COUNT, NO_RATIO ) +DEF_STAT( FFP_BANK15_ALLOC, COUNT, NO_RATIO ) +DEF_STAT( FFP_BANK16_ALLOC, COUNT, NO_RATIO ) +DEF_STAT( FFP_BANK17_ALLOC, COUNT, NO_RATIO ) +DEF_STAT( FFP_BANK18_ALLOC, COUNT, NO_RATIO ) +DEF_STAT( FFP_BANK19_ALLOC, COUNT, NO_RATIO ) +DEF_STAT( FFP_BANK20_ALLOC, COUNT, NO_RATIO ) +DEF_STAT( FFP_BANK21_ALLOC, COUNT, NO_RATIO ) +DEF_STAT( FFP_BANK22_ALLOC, COUNT, NO_RATIO ) +DEF_STAT( FFP_BANK23_ALLOC, COUNT, NO_RATIO ) +DEF_STAT( FFP_BANK24_ALLOC, COUNT, NO_RATIO ) +DEF_STAT( FFP_BANK25_ALLOC, COUNT, NO_RATIO ) +DEF_STAT( FFP_BANK26_ALLOC, COUNT, NO_RATIO ) +DEF_STAT( FFP_BANK27_ALLOC, COUNT, NO_RATIO ) +DEF_STAT( FFP_BANK28_ALLOC, COUNT, NO_RATIO ) +DEF_STAT( FFP_BANK29_ALLOC, COUNT, NO_RATIO ) +DEF_STAT( FFP_BANK30_ALLOC, COUNT, NO_RATIO ) +DEF_STAT( FFP_BANK31_ALLOC, COUNT, NO_RATIO ) \ No newline at 
end of file diff --git a/src/bp/decoupled_bp.cc b/src/bp/decoupled_bp.cc new file mode 100644 index 00000000..627133af --- /dev/null +++ b/src/bp/decoupled_bp.cc @@ -0,0 +1,471 @@ +#include +#include +#include +#include +#include +#include + +extern "C" { +#include "debug/debug_macros.h" +#include "debug/debug_print.h" +#include "globals/assert.h" +#include "globals/global_defs.h" +#include "globals/global_types.h" +#include "globals/global_vars.h" + +#include "bp/bp.h" +#include "map.h" +#include "op_pool.h" +#include "op.h" +#include "packet_build.h" +#include "thread.h" +#include "bp/bp.param.h" +#include "cmp_model.h" +#include "core.param.h" +#include "debug/debug.param.h" +#include "frontend/frontend.h" +#include "frontend/pin_trace_fe.h" +#include "memory/memory.h" +#include "memory/memory.param.h" +#include "prefetcher/l2l1pref.h" +#include "prefetcher/stream_pref.h" +#include "statistics.h" +#include "bp/decoupled_bp.h" +} + +#define DEBUG(proc_id, args...) _DEBUG(proc_id, DEBUG_ICACHE_STAGE, ##args) + +uns last_packet_fetch_time = 0; /* for computing fetch break latency */ + +std::vector> fetch_queue; +Decoupled_BP* dbp = NULL; + +extern Flag USE_LATE_BP; + +void set_dbp_stage(Decoupled_BP* new_dbp) { + dbp = new_dbp; +} + +void reset_dbp_stage() { + ASSERT(0, dbp); + fetch_queue.resize(fetch_queue.size() + 1); + ASSERT(0, fetch_queue.size() - 1 == dbp->proc_id); + fetch_queue[dbp->proc_id].clear(); + dbp->next_addr = td->inst_addr; + op_count[dbp->proc_id] = 0; +} + +void init_dbp_stage(uns8 proc_id){ + ASSERT(0, dbp); + ASSERT(0, DECOUPLED_BP); + memset(dbp, 0, sizeof(Decoupled_BP)); + dbp->proc_id = proc_id; + reset_dbp_stage(); + //dbp->next_addr = td->inst_addr; + ASSERT_PROC_ID_IN_ADDR(dbp->proc_id, dbp->next_addr); + dbp->off_path = FALSE; + dbp->off_path_btb_miss = FALSE; + dbp->back_on_path = FALSE; + dbp->next_addr = frontend_next_fetch_addr(dbp->proc_id); +} + +Op* read_fetch_queue(uns proc_id) +{ + if(fetch_queue[proc_id].empty()) return NULL; + ASSERT(proc_id, fetch_queue[proc_id].front().valid); + return fetch_queue[proc_id].front().op; +} + +Flag pop_fetch_queue(uns proc_id) +{ + ASSERT(proc_id, proc_id == dbp->proc_id); + if(!fetch_queue[proc_id].empty()){ + ASSERT(0, fetch_queue[proc_id].front().valid); + if(fetch_queue[proc_id].front().op->table_info->cf_type){ + dbp->num_branches_in_fetch_queue--; + if(USE_LATE_BP ? 
fetch_queue[proc_id].front().op->oracle_info.late_pred : fetch_queue[proc_id].front().op->oracle_info.pred){ + dbp->num_taken_branches_in_fetch_queue--; + ASSERT(proc_id, dbp->num_taken_branches_in_fetch_queue >=0); + } + ASSERT(proc_id, dbp->num_branches_in_fetch_queue >=0); + } + fetch_queue[proc_id].pop_front(); + return TRUE; + } + return FALSE; +} + +void update_decoupled_bp() +{ + dbp->state = dbp->next_state; + + switch(dbp->state) { + case BP_NORMAL: { + + dbp->off_path &= !dbp->back_on_path; + dbp->back_on_path = FALSE; + + if(!FETCH_OFF_PATH_OPS && dbp->off_path) + return; + + if(fetch_queue[dbp->proc_id].size() == FETCH_QUEUE_SIZE || dbp->num_taken_branches_in_fetch_queue == FETCH_QUEUE_NUM_TAKEN) { + dbp->next_state = BP_NORMAL; + STAT_EVENT(dbp->proc_id, BP_BW_FULL_FQ); + return; + } + + dbp->next_state = cycle_decoupled_bp(dbp->proc_id); + DEBUG(dbp->proc_id, "DBP next state: %d \n", dbp->next_state); + STAT_EVENT(dbp->proc_id, FETCH_ON_PATH + dbp->off_path); + + //STAT_EVENT(dbp->proc_id, FETCH_0_OPS + fetch_packet_op_count); + } break; + + case BP_WAIT_TIMER: { + DEBUG(dbp->proc_id, "Decoupled BP waiting on timer:%llu\n", dbp->timer_cycle); + STAT_EVENT(dbp->proc_id, BP_BW_LATE_BP_REDIRECT); + if(cycle_count >= dbp->timer_cycle){ + dbp->next_state = BP_NORMAL; + } + } break; + + case BP_WAIT_REDIRECT: { + DEBUG(dbp->proc_id, "decoupled bp waiting for redirect\n"); + STAT_EVENT(dbp->proc_id, BP_BW_REDIRECT); + } break; + + case BP_WAIT_EMPTY_ROB: { + STAT_EVENT(dbp->proc_id, BP_BW_SYS_CALL); + if(td->seq_op_list.count == 0){ + DEBUG(dbp->proc_id, "empty pipeline, decoupled bp back to normal\n"); + dbp->next_state = BP_NORMAL; + } + } break; + + default: + FATAL_ERROR(dbp->proc_id, "Invalid dbp state.\n"); + } +} + +Bp_State cycle_decoupled_bp(uns proc_id) { + uns cf_num = 0; + uns taken_count = 0; + uns ops_count = 0; + + ASSERT(dbp->proc_id, dbp->proc_id == td->proc_id); + + Bp_Break break_predict = BP_NO_BREAK; + while(break_predict == BP_NO_BREAK) { + dbp->curr_addr = dbp->next_addr; + Op* op = alloc_op(dbp->proc_id); + Inst_Info* inst = 0; + UNUSED(inst); + + if(frontend_can_fetch_op(dbp->proc_id)) { + frontend_fetch_op(dbp->proc_id, op); + ASSERTM(dbp->proc_id, dbp->curr_addr == op->inst_info->addr, + "Fetch address 0x%llx does not match op address 0x%llx\n", + dbp->curr_addr, op->inst_info->addr); + op->fetch_addr = dbp->curr_addr; + ASSERT_PROC_ID_IN_ADDR(dbp->proc_id, op->fetch_addr); + op->off_path = dbp->off_path; + td->inst_addr = op->inst_info->addr; // FIXME: BUG 54 + ASSERT_PROC_ID_IN_ADDR(dbp->proc_id, td->inst_addr); + //if(!op->off_path) { + // if(op->eom) + // issued_real_inst++; + // issued_uop++; + //} + inst = op->inst_info; + } else { + free_op(op); + return BP_NORMAL; + } + + if(!op->off_path && + (op->table_info->mem_type == MEM_LD || + op->table_info->mem_type == MEM_ST) && + op->oracle_info.va == 0) { + // don't care if the va is 0x0 if mem_type is MEM_PF(SW prefetch), + // MEM_WH(write hint), or MEM_EVICT(cache block eviction hint) + print_func_op(op); + FATAL_ERROR(dbp->proc_id, "Access to 0x0\n"); + } + + if(DUMP_TRACE && DEBUG_RANGE_COND(dbp->proc_id)) + print_func_op(op); + + if(DIE_ON_CALLSYS && !op->off_path) { + ASSERT(dbp->proc_id, op->table_info->cf_type != CF_SYS); + } + + /* add to sequential op list */ + add_to_seq_op_list(td, op); + + ASSERT(dbp->proc_id, (uns) td->seq_op_list.count <= op_pool_active_ops); + + /* map the op based on true dependencies & set information in + * op->oracle_info */ + thread_map_op(op); + + 
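// A sketch of the per-core fetch queue that this loop feeds (the container's
// template arguments are not fully visible in this hunk, so the std::deque
// element type below is an assumption inferred from the front()/pop_front()/
// push_back() usage in this file): one FIFO of fetch_queue_entry per core,
// indexed by proc_id. Names with a _sketch suffix are illustrative only.

#include <deque>
#include <vector>

// mirrors the struct declared in decoupled_bp.h
struct fetch_queue_entry_sketch { Flag valid; Op* op; };

// one queue per core; producer = decoupled BP, consumer = icache stage
std::vector<std::deque<fetch_queue_entry_sketch>> fetch_queue_sketch;

// producer side (this loop):  fetch_queue_sketch[op->proc_id].push_back({TRUE, op});
// consumer side (icache):     peek front().op, then pop_front() once the op is issued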
STAT_EVENT(op->proc_id, FETCH_ALL_INST); + STAT_EVENT(op->proc_id, ORACLE_ON_PATH_INST + op->off_path); + STAT_EVENT(op->proc_id, ORACLE_ON_PATH_INST_MEM + + (op->table_info->mem_type == NOT_MEM) + + 2 * op->off_path); + + thread_map_mem_dep(op); + op->fetch_cycle = cycle_count; + + + op_count[dbp->proc_id]++; /* increment instruction counters */ + unique_count_per_core[dbp->proc_id]++; + unique_count++; + + fetch_queue_entry new_entry; + new_entry.op = op; + new_entry.valid = true; + fetch_queue[op->proc_id].push_back(new_entry); + + /* check trigger */ + if(op->inst_info->trigger_op_fetched_hook) + model->op_fetched_hook(op); + + /* move on to next instruction in the cache line */ + INC_STAT_EVENT(dbp->proc_id, INST_LOST_FETCH + dbp->off_path, 1); + + DEBUG(dbp->proc_id, + "Fetching op from Decoupled BP addr: %s off: %d inst_info: %p ii_addr: %s " + "dis: %s, uid:%llu opnum: (%s:%s)\n", + hexstr64s(op->inst_info->addr), op->off_path, op->inst_info, + hexstr64s(op->inst_info->addr), disasm_op(op, TRUE), + op->inst_uid, + unsstr64(op->op_num), unsstr64(op->unique_num)); + + /* figure out next address after current instruction */ + if(op->table_info->cf_type) { + // For pipeline gating + if(op->table_info->cf_type == CF_CBR){ + td->td_info.fetch_br_count++; + } + + if(IS_CALLSYS(op->table_info) || op->table_info->bar_type & BAR_FETCH) { + // for fetch barriers (including syscalls), we do not want to do + // redirect/recovery, BUT we still want to update the branch predictor. + bp_predict_op(g_bp_data, op, cf_num, dbp->curr_addr); + op->oracle_info.mispred = 0; + op->oracle_info.misfetch = 0; + op->oracle_info.btb_miss = 0; + op->oracle_info.no_target = 0; + dbp->next_addr = ADDR_PLUS_OFFSET(dbp->curr_addr, op->inst_info->trace_info.inst_size); + ASSERT_PROC_ID_IN_ADDR(dbp->proc_id, dbp->next_addr) + break_predict = BP_BREAK_ON_BARRIER; + } else { + dbp->next_addr = bp_predict_op(g_bp_data, op, cf_num, dbp->curr_addr); + // initially bp_predict_op can return a garbage, for multi core run, + // addr must follow cmp addr convention + dbp->next_addr = convert_to_cmp_addr(dbp->proc_id, dbp->next_addr); + DEBUG(0, "dbp next addr after bp_predict op is %llx\n", dbp->next_addr); + ASSERT_PROC_ID_IN_ADDR(dbp->proc_id, dbp->next_addr); + } + + cf_num++; + dbp->num_branches_in_fetch_queue++; + //if(USE_LATE_BP ? 
op->oracle_info.late_pred : op->oracle_info.pred){ + if(op->oracle_info.pred || (USE_LATE_BP && op->oracle_info.late_pred)){ + taken_count++; + dbp->num_taken_branches_in_fetch_queue++; + } + + ASSERT(dbp->proc_id, + (op->oracle_info.mispred << 2 | op->oracle_info.misfetch << 1 | + op->oracle_info.btb_miss) <= 0x7); + + //const uns8 mispred = op->oracle_info.mispred; + //const uns8 late_mispred = op->oracle_info.late_mispred; + //const uns8 misfetch = op->oracle_info.misfetch; + //const uns8 late_misfetch = op->oracle_info.late_misfetch; + + + if(break_predict == BP_NO_BREAK){ + if(op->oracle_info.btb_miss){ + //if(FETCH_NT_AFTER_BTB_MISS){ + // if(op->oracle_info.dir && dbp->oldest_btb_miss_op_num != MAX_CTR) { + // dbp->oldest_btb_miss_op_num = op->op_num; + // } + // final_prediction = false; + // dbp->next_addr = ADDR_PLUS_OFFSET(dbp->curr_addr, op->inst_info->trace_info.inst_size); + // DEBUG(0, "dbp next addr after btb miss is %llx\n", dbp->next_addr); + //} + //else{ + DEBUG(dbp->proc_id, "Change dbp to wait for redirect\n"); + dbp->oldest_btb_miss_op_num = op->op_num; + break_predict = BP_BREAK_ON_BTB_MISS; + //} + } + else if(USE_LATE_BP && !op->oracle_info.btb_miss) { + if(op->oracle_info.early_late_disagree){ + dbp->timer_cycle = cycle_count + LATE_BP_LATENCY - 1; + break_predict = BP_BREAK_ON_EARLY_LATE_DISAGREE; + //ASSERT(dbp->proc_id, dbp->next_addr == op->oracle_info.late_pred_npc); + //if(!op->off_path){ + // if(op->oracle_info.pred == false){ + // STAT_EVENT(dbp->proc_id, TOTAL_EARLY_LATE_DISAGREE); + // } + // STAT_EVENT(dbp->proc_id, TOTAL_EARLY_LATE_DISAGREE); + // STAT_EVENT(dbp->proc_id, TOTAL_EARLY_LATE_DISAGREE); + //} + } + } + } + + //DEBUG(0, "pred=%d, late_pred=%d, final_pred=%d\n", op->oracle_info.pred, op->oracle_info.late_pred, final_prediction); + //bool final_mispred = op->oracle_info.pred != op->oracle_info.dir && (dbp->next_addr != op->oracle_info.npc); + //bool final_misfetch = !final_mispred && (dbp->next_addr != op->oracle_info.npc) && !op->oracle_info.no_target; + if(IS_CALLSYS(op->table_info) || op->table_info->bar_type & BAR_FETCH) { + op->oracle_info.mispred = FALSE; + op->oracle_info.misfetch = FALSE; + } + + //DEBUG(0, "final_mispred=%d, final_misfetch=%d\n", final_mispred, final_misfetch); + + if(op->oracle_info.misfetch || op->oracle_info.mispred) { + dbp->off_path = TRUE; + + if(!op->off_path) + td->td_info.last_bp_miss_op = op; + + if(FETCH_OFF_PATH_OPS) { + DEBUG(dbp->proc_id, "redirected frontend to 0x%s\n", + hexstr64s(dbp->next_addr)); + frontend_redirect(td->proc_id, op->inst_uid, dbp->next_addr); + } + else{ + if(break_predict == BP_NO_BREAK) + break_predict = BP_BREAK_ON_MISPRED; + } + } + if(op->oracle_info.pred && !op->off_path){ + STAT_EVENT(dbp->proc_id, TOTAL_ON_PATH_TAKEN); + } + //TODO: INCREMENT THE TAKEN COUNT BASED ON THE EARLY BP + if(TAKEN_PER_CYCLE && taken_count == TAKEN_PER_CYCLE && break_predict == BP_NO_BREAK) { + break_predict = BP_BREAK_ON_N_TAKEN; + DEBUG(dbp->proc_id, "break_predict on taken\n"); + if(!op->off_path){ + STAT_EVENT(dbp->proc_id, TOTAL_ON_PATH_CYCLES); + } + } + if(CFS_PER_CYCLE && cf_num > CFS_PER_CYCLE && break_predict == BP_NO_BREAK){ + break_predict = BP_BREAK_ON_NUM_OP; + DEBUG(dbp->proc_id, "break_predict on number of cfs\n"); + } + } else { + if(op->eom) { + dbp->next_addr = ADDR_PLUS_OFFSET( + dbp->curr_addr, op->inst_info->trace_info.inst_size); + } + // pass the global branch history to all the instructions + op->oracle_info.pred_global_hist = g_bp_data->global_hist; + 
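// How this prediction loop terminates (see the switch on break_predict just
// below): a BTB miss parks the stage in BP_WAIT_REDIRECT, an early/late
// predictor disagreement waits LATE_BP_LATENCY-1 cycles in BP_WAIT_TIMER, a
// fetch barrier or syscall waits for an empty ROB in BP_WAIT_EMPTY_ROB, and
// hitting TAKEN_PER_CYCLE taken branches, CFS_PER_CYCLE branches,
// BP_OP_PER_CYCLE ops, a full fetch queue (FETCH_QUEUE_SIZE /
// FETCH_QUEUE_NUM_TAKEN), or a mispredict with FETCH_OFF_PATH_OPS disabled
// simply resumes in BP_NORMAL next cycle.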
ASSERT_PROC_ID_IN_ADDR(dbp->proc_id, dbp->next_addr); + } + ops_count++; + if(BP_OP_PER_CYCLE && ops_count >= BP_OP_PER_CYCLE && break_predict == BP_NO_BREAK){ + break_predict = BP_BREAK_ON_NUM_OP; + } + if(fetch_queue[dbp->proc_id].size() >= FETCH_QUEUE_SIZE && break_predict == BP_NO_BREAK){ + break_predict = BP_BREAK_ON_FULL_FETCH_QUEUE; + } + if(dbp->num_taken_branches_in_fetch_queue >= FETCH_QUEUE_NUM_TAKEN && break_predict == BP_NO_BREAK){ + break_predict = BP_BREAK_ON_FULL_FETCH_QUEUE; + } + } // end of while loop + ASSERT(dbp->proc_id, ops_count <= BP_OP_PER_CYCLE); + if(ops_count == BP_OP_PER_CYCLE){ + STAT_EVENT(dbp->proc_id, BP_BW_FULL_WIDTH); + } + else{ + //this is a hack to get the stats correctly without 50 lines of code + STAT_EVENT(dbp->proc_id, BP_BW_REDIRECT + ops_count); + } + + DEBUG(dbp->proc_id, "end of predict packet, %d\n", break_predict); + switch (break_predict) { + case BP_NO_BREAK: + ASSERT(dbp->proc_id, FALSE); + break; + case BP_BREAK_ON_EARLY_LATE_DISAGREE: + return BP_WAIT_TIMER; + break; + case BP_BREAK_ON_BARRIER: + return BP_WAIT_EMPTY_ROB; + break; + case BP_BREAK_ON_N_TAKEN: + case BP_BREAK_ON_NUM_OP: + case BP_BREAK_ON_FULL_FETCH_QUEUE: + case BP_BREAK_ON_MISPRED: + return BP_NORMAL; + break; + case BP_BREAK_ON_BTB_MISS: + return BP_WAIT_REDIRECT; + break; + } + ASSERT(dbp->proc_id, FALSE); + return BP_NORMAL; +} + +void recover_fetch_queue() +{ + //EARLY AND LATE BP DISAGREEMENT ON DONE BY STALLING THE BP FOR N CYCLES + //ALL THE FLUSHES SHOULD CLEAR THE FETCH QUEUE + ASSERT(dbp->proc_id, dbp->proc_id == bp_recovery_info->proc_id); + /* + DEBUG(0, "state of the fetch queue\n"); + for(auto thing : fetch_queue[dbp->proc_id]){ + DEBUG(0, "op num %llu\n", thing.op->op_num); + } + */ + + while(!fetch_queue[dbp->proc_id].empty()) + { + ASSERT(dbp->proc_id, FLUSH_OP(fetch_queue[dbp->proc_id].back().op)); + free_op(fetch_queue[dbp->proc_id].back().op); + fetch_queue[dbp->proc_id].pop_back(); + } + ASSERT(dbp->proc_id, fetch_queue[dbp->proc_id].empty()); + + dbp->num_branches_in_fetch_queue = 0; + dbp->num_taken_branches_in_fetch_queue = 0; + dbp->back_on_path = !bp_recovery_info->recovery_force_offpath; + dbp->next_addr = bp_recovery_info->recovery_fetch_addr; + op_count[dbp->proc_id] = bp_recovery_info->recovery_op_num + 1; + dbp->next_state = BP_NORMAL; +} + +void redirect_decoupled_bp(){ + ASSERT(dbp->proc_id, dbp->proc_id == bp_recovery_info->proc_id); + ASSERT(dbp->proc_id, dbp->state == BP_WAIT_REDIRECT); + + Op* op = bp_recovery_info->redirect_op; + Addr next_fetch_addr = op->oracle_info.pred_npc; + op->redirect_scheduled = FALSE; + + DEBUG(dbp->proc_id, "BP stage redirect signaled. 
next_fetch_addr: 0x%s\n", + hexstr64s(next_fetch_addr)); + //ASSERT(dbp->proc_id, !FETCH_NT_AFTER_BTB_MISS); + + Flag main_predictor_wrong = op->oracle_info.mispred || + op->oracle_info.misfetch; + + if(USE_LATE_BP) { + main_predictor_wrong = FALSE; + } + + Flag late_predictor_wrong = (USE_LATE_BP && (op->oracle_info.late_mispred || + op->oracle_info.late_misfetch)); + dbp->back_on_path = !(op->off_path || main_predictor_wrong || + late_predictor_wrong); + dbp->next_addr = next_fetch_addr; + ASSERT_PROC_ID_IN_ADDR(dbp->proc_id, dbp->next_addr); + dbp->next_state = BP_NORMAL; +} diff --git a/src/bp/decoupled_bp.h b/src/bp/decoupled_bp.h new file mode 100644 index 00000000..68915bee --- /dev/null +++ b/src/bp/decoupled_bp.h @@ -0,0 +1,76 @@ +#ifndef __DECOUPLED_BP_H__ +#define __DECOUPLED_BP_H__ + + +#ifdef __cplusplus +extern "C" { +#endif + + + +typedef enum Bp_State_enum { + BP_NORMAL, + BP_WAIT_TIMER, + BP_WAIT_EMPTY_ROB, + BP_WAIT_REDIRECT +} Bp_State; + +typedef enum Bp_Break_enum { + BP_NO_BREAK, + BP_BREAK_ON_EARLY_LATE_DISAGREE, + BP_BREAK_ON_BTB_MISS, + BP_BREAK_ON_N_TAKEN, + BP_BREAK_ON_BARRIER, + BP_BREAK_ON_NUM_OP, + BP_BREAK_ON_MISPRED, + BP_BREAK_ON_FULL_FETCH_QUEUE +} Bp_Break; + +typedef struct Decoupled_BP_struct { + uns8 proc_id; + + Bp_State state ; /* state that the BP is in */ + Bp_State + next_state; /* state that the BP is going to be in next cycle */ + + Counter inst_count; /* instruction counter used to number ops (global counter + is for retired ops) */ + Addr curr_addr; /* address fetched */ + Addr next_addr; /* address to fetch */ + Flag off_path; /* is the icache fetching on the correct path? */ + Flag off_path_btb_miss; /* is the icache off path from a btb miss */ + Counter oldest_btb_miss_op_num; /* uid of the oldest btb miss*/ + Flag back_on_path; /* did a recovery happen to put the machine back on path? 
+ */ + + Counter timer_cycle; /* cycle that the icache stall timer will have elapsed + and the icache can fetch again */ + + //data needed to maintain the fetch queue + Counter num_branches_in_fetch_queue; + Counter num_taken_branches_in_fetch_queue; +} Decoupled_BP; + +typedef struct fetch_queue_entry { + Flag valid; + Op* op; +} fetch_queue_entry; + +extern Decoupled_BP* dbp; + +void set_dbp_stage(Decoupled_BP* new_dbp); +void reset_dbp_stage(); +void init_dbp_stage(uns8 proc_id); +void update_decoupled_bp(); +void redirect_decoupled_bp(); +Bp_State cycle_decoupled_bp(uns proc_id); +void recover_fetch_queue(); +Op * read_fetch_queue(uns proc_id); +Flag pop_fetch_queue(uns proc_id); + +#ifdef __cplusplus +} +#endif + + +#endif //__DECOUPLED_BP_H__ diff --git a/src/cmp_model.c b/src/cmp_model.c index 7739d723..677b277b 100644 --- a/src/cmp_model.c +++ b/src/cmp_model.c @@ -30,6 +30,7 @@ /* Global variables */ #include "cmp_model.h" #include "bp/bp.param.h" +#include "bp/decoupled_bp.h" #include "core.param.h" #include "debug/debug.param.h" #include "debug/debug_macros.h" @@ -94,6 +95,9 @@ void cmp_init(uns mode) { cmp_init_thread_data(proc_id); + if(DECOUPLED_BP){ + init_dbp_stage(proc_id); + } init_icache_stage(proc_id, "ICACHE"); init_icache_trace(); @@ -140,6 +144,9 @@ void cmp_reset() { for(proc_id = 0; proc_id < NUM_CORES; proc_id++) { cmp_set_all_stages(proc_id); + if(DECOUPLED_BP){ + reset_dbp_stage(); + } reset_icache_stage(); reset_decode_stage(); reset_map_stage(); @@ -211,6 +218,9 @@ void cmp_cores(void) { update_map_stage(dec->last_sd); update_decode_stage(&ic->sd); update_icache_stage(); + if(DECOUPLED_BP){ + update_decoupled_bp(); + } node_sched_ops(); @@ -339,6 +349,9 @@ void cmp_recover() { bp_recovery_info->recovery_inst_uid, bp_recovery_info->late_bp_recovery_wrong); + if(DECOUPLED_BP){ + recover_fetch_queue(); + } recover_icache_stage(); recover_decode_stage(); recover_map_stage(); @@ -360,7 +373,12 @@ void cmp_redirect() { bp_recovery_info->redirect_op->oracle_info.btb_miss_resolved = TRUE; ASSERT_PROC_ID_IN_ADDR(bp_recovery_info->proc_id, bp_recovery_info->redirect_op->oracle_info.pred_npc); - redirect_icache_stage(); + if(DECOUPLED_BP){ + redirect_decoupled_bp(); + } + else{ + redirect_icache_stage(); + } } /**************************************************************************************/ @@ -413,10 +431,18 @@ void cmp_warmup(Op* op) { // Warmup caches for instructions Icache_Stage* ic = &(cmp_model.icache_stage[proc_id]); + Decoupled_BP* dbp = NULL; // keep next_fetch_addr current to avoid confusing simulation mode if(op->eom) { - ic->next_fetch_addr = op->oracle_info.npc; - ASSERT_PROC_ID_IN_ADDR(ic->proc_id, ic->next_fetch_addr) + if(DECOUPLED_BP){ + dbp = &(cmp_model.bp_stage[proc_id]); + dbp->next_addr = op->oracle_info.npc; + ASSERT_PROC_ID_IN_ADDR(dbp->proc_id, dbp->next_addr) + } + else{ + ic->next_fetch_addr = op->oracle_info.npc; + ASSERT_PROC_ID_IN_ADDR(ic->proc_id, ic->next_fetch_addr) + } } Cache* icache = &(ic->icache); Inst_Info** ic_data = (Inst_Info**)cache_access(icache, ia, &dummy_line_addr, diff --git a/src/cmp_model.h b/src/cmp_model.h index 7e40d490..28948c8b 100644 --- a/src/cmp_model.h +++ b/src/cmp_model.h @@ -30,6 +30,7 @@ #define __CMP_MODEL_H__ #include "bp/bp.h" +#include "bp/decoupled_bp.h" #include "cmp_model_support.h" #include "dcache_stage.h" #include "decode_stage.h" @@ -63,6 +64,7 @@ typedef struct Cmp_Model_struct { Node_Stage* node_stage; Exec_Stage* exec_stage; Dcache_Stage* dcache_stage; + Decoupled_BP* bp_stage; uns 
window_size; diff --git a/src/cmp_model_support.c b/src/cmp_model_support.c index a1172e78..fee42dbd 100644 --- a/src/cmp_model_support.c +++ b/src/cmp_model_support.c @@ -27,6 +27,7 @@ ***************************************************************************************/ #include "cmp_model_support.h" +#include "bp/decoupled_bp.h" #include "cmp_model.h" #include "core.param.h" #include "frontend/pin_trace_fe.h" @@ -35,6 +36,7 @@ #include "globals/utils.h" #include "packet_build.h" #include "statistics.h" +#include "bp/bp.param.h" /**************************************************************************************/ /* cmp_init_cmp_model */ @@ -60,6 +62,9 @@ void cmp_init_cmp_model() { cmp_model.exec_stage = (Exec_Stage*)malloc(sizeof(Exec_Stage) * NUM_CORES); cmp_model.dcache_stage = (Dcache_Stage*)malloc(sizeof(Dcache_Stage) * NUM_CORES); + if(DECOUPLED_BP) + cmp_model.bp_stage = (Decoupled_BP*)malloc(sizeof(Decoupled_BP) * + NUM_CORES); } @@ -85,6 +90,8 @@ void cmp_set_all_stages(uns8 proc_id) { set_node_stage(&cmp_model.node_stage[proc_id]); set_exec_stage(&cmp_model.exec_stage[proc_id]); set_dcache_stage(&cmp_model.dcache_stage[proc_id]); + if(DECOUPLED_BP) + set_dbp_stage(&cmp_model.bp_stage[proc_id]); } /**************************************************************************************/ @@ -122,4 +129,7 @@ void cmp_init_bogus_sim(uns8 proc_id) { reset_all_ops_node_stage(); reset_exec_stage(); reset_dcache_stage(); + if(DECOUPLED_BP){ + ASSERT(proc_id, FALSE); + } } diff --git a/src/core.param.def b/src/core.param.def index dc6ac00f..92b87229 100644 --- a/src/core.param.def +++ b/src/core.param.def @@ -124,6 +124,8 @@ DEF_PARAM(rs_fill_width, RS_FILL_WIDTH, uns, uns, 8, ) DEF_PARAM(node_table_size, NODE_TABLE_SIZE, uns, uns, 256, ) DEF_PARAM(node_ret_width, NODE_RET_WIDTH, uns, uns, 4, ) DEF_PARAM(node_retire_rate, NODE_RETIRE_RATE, uns, uns, 10, ) +DEF_PARAM(load_queue_entries, LOAD_QUEUE_SIZE, uns, uns, 64, ) +DEF_PARAM(store_queue_entries, STORE_QUEUE_SIZE, uns, uns, 64, ) /********EXEC PORT * PARAMETERS*********************************************************/ diff --git a/src/core.stat.def b/src/core.stat.def index 4249d295..20e746da 100644 --- a/src/core.stat.def +++ b/src/core.stat.def @@ -68,9 +68,12 @@ DEF_STAT( NODE_UOP_COUNT, COUNT, NO_RATIO ) DEF_STAT( FULL_WINDOW_STALL, PERCENT, NODE_CYCLE ) -DEF_STAT( FULL_WINDOW_MEM_OP , DIST, NO_RATIO ) -DEF_STAT( FULL_WINDOW_FP_OP , COUNT, NO_RATIO ) -DEF_STAT( FULL_WINDOW_OTHER_OP, DIST, NO_RATIO ) +DEF_STAT( FULL_WINDOW_ROB_MEM_OP , DIST, NO_RATIO ) +DEF_STAT( FULL_WINDOW_ROB_FP_OP , COUNT, NO_RATIO ) +DEF_STAT( FULL_WINDOW_ROB_OTHER_OP, COUNT, NO_RATIO ) +DEF_STAT( FULL_WINDOW_LQ_FULL , COUNT, NO_RATIO ) +DEF_STAT( FULL_WINDOW_SQ_FULL , COUNT, NO_RATIO ) +DEF_STAT( FULL_WINDOW_WAITING_ON_RET, DIST, NO_RATIO ) DEF_STAT( RET_BLOCKED_DC_MISS, PERCENT, NODE_CYCLE ) DEF_STAT( RET_BLOCKED_L1_MISS, PERCENT, NODE_CYCLE ) diff --git a/src/debug/memview.c b/src/debug/memview.c index 83e2059d..8e6161a9 100644 --- a/src/debug/memview.c +++ b/src/debug/memview.c @@ -66,11 +66,11 @@ typedef struct Proc_Info_struct { /**************************************************************************************/ /* Global Variables */ -FILE* trace; -Bank_Info* bank_infos; -Proc_Info* proc_infos; -Trigger* start_trigger; -Mem_Req_Type* req_types; +static FILE* trace; +static Bank_Info* bank_infos; +static Proc_Info* proc_infos; +static Trigger* start_trigger; +static Mem_Req_Type* req_types; 
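/* The linkage pattern applied across this patch (memview.c, perf_pred.c,
 * pin_trace_fe.c, dumb_model, icache_stage): file-private globals become
 * static so each translation unit keeps its own internal-linkage copy, while
 * genuinely shared state is declared extern in a header and defined in exactly
 * one .c file. A minimal sketch with hypothetical names:
 */

/* foo.h */
extern int foo_shared_count;   /* declaration only -- no storage allocated */

/* foo.c */
int foo_shared_count = 0;      /* the single definition visible to other files */
static int foo_scratch;        /* internal linkage: invisible outside foo.c */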
/**************************************************************************************/ /* Local Prototypes */ diff --git a/src/dumb_model.c b/src/dumb_model.c index 8080aa6e..45362e98 100644 --- a/src/dumb_model.c +++ b/src/dumb_model.c @@ -70,6 +70,7 @@ static Flag dumb_req_done(Mem_Req* req); static Proc_Info* infos; static Counter req_num; static uns64 page_num_mask; +Dumb_Model dumb_model; /**************************************************************************************/ /* dumb_init */ diff --git a/src/dumb_model.h b/src/dumb_model.h index 0e3f88a0..34b03341 100644 --- a/src/dumb_model.h +++ b/src/dumb_model.h @@ -42,7 +42,6 @@ typedef struct Dumb_Model_struct { /**************************************************************************************/ /* Global vars */ -Dumb_Model dumb_model; extern Dumb_Model dumb_model; /**************************************************************************************/ diff --git a/src/dvfs/perf_pred.c b/src/dvfs/perf_pred.c index 80ff7ae6..4bf11c37 100644 --- a/src/dvfs/perf_pred.c +++ b/src/dvfs/perf_pred.c @@ -112,7 +112,7 @@ enum { static Counter chip_cycle_count; static Stat_Mon* stat_mon; -Proc_Info* proc_infos; +static Proc_Info* proc_infos; /* static function prototypes */ static void critical_access_plot(uns proc_id, Mem_Req_Type type, uns req_ret, diff --git a/src/frontend/pin_trace_fe.c b/src/frontend/pin_trace_fe.c index 8b2a9032..8229a316 100644 --- a/src/frontend/pin_trace_fe.c +++ b/src/frontend/pin_trace_fe.c @@ -52,7 +52,7 @@ /**************************************************************************************/ /* Global Variables */ -char* trace_files[MAX_NUM_PROCS]; +static char* trace_files[MAX_NUM_PROCS]; ctype_pin_inst* next_pi; diff --git a/src/globals/utils.h b/src/globals/utils.h index 7bc76ef5..515e4c03 100644 --- a/src/globals/utils.h +++ b/src/globals/utils.h @@ -282,6 +282,9 @@ #define MIN4(v0, v1, v2, v3) (MIN2(MIN2((v0), (v1)), MIN2((v2), (v3)))) #define MAX4(v0, v1, v2, v3) (MAX2(MAX2((v0), (v1)), MAX2((v2), (v3)))) +// a is the original addr, num is the shift amt before interleaving (usually +// the cacheline), int is the interleave factor. 
The bank idx computed here +// is simply the lower bits #define BANK(a, num, int) ((a) >> LOG2(int) & N_BIT_MASK(LOG2(num))) #define CHANNEL(bank, num) ((bank) >> LOG2(num)) #define BANK_IN_CHANNEL(bank, num) ((bank)&N_BIT_MASK(LOG2(num))) diff --git a/src/icache_stage.c b/src/icache_stage.c index cf65aeb0..4b1e716a 100644 --- a/src/icache_stage.c +++ b/src/icache_stage.c @@ -52,6 +52,7 @@ #include "prefetcher/l2l1pref.h" #include "prefetcher/stream_pref.h" #include "statistics.h" +#include "bp/decoupled_bp.h" /**************************************************************************************/ @@ -66,6 +67,7 @@ /* Global Variables */ Icache_Stage* ic = NULL; +Pb_Data* ic_pb_data; extern Cmp_Model cmp_model; extern Memory* mem; @@ -205,6 +207,16 @@ void recover_icache_stage() { } } + if(DECOUPLED_BP){ + if(ic->next_state != IC_FILL && ic->next_state != IC_WAIT_FOR_MISS) { + ic->next_state = IC_FETCH; + } + if(SWITCH_IC_FETCH_ON_RECOVERY && model->id == CMP_MODEL) { + ic->next_state = IC_FETCH; + } + return; + } + ic->back_on_path = !bp_recovery_info->recovery_force_offpath; Op* op = bp_recovery_info->recovery_op; @@ -306,6 +318,16 @@ void update_icache_stage() { reset_packet_build(ic_pb_data); // reset packet build counters while(!break_fetch) { + if(DECOUPLED_BP){ + + Op* top_of_queue = read_fetch_queue(ic->proc_id); + if(top_of_queue == NULL){ + //DEBUG(ic->proc_id, "Fetch queue empty\n"); + break_fetch = BREAK_EMPTY_FETCH_QUEUE; + return; + } + ic->next_fetch_addr = top_of_queue->inst_info->addr; + } ic->fetch_addr = ic->next_fetch_addr; ASSERT_PROC_ID_IN_ADDR(ic->proc_id, ic->fetch_addr) @@ -468,209 +490,239 @@ static inline Icache_State icache_issue_ops(Break_Reason* break_fetch, last_icache_issue_time = cycle_count; while(1) { - Op* op = alloc_op(ic->proc_id); - Inst_Info* inst = 0; - UNUSED(inst); - - if(frontend_can_fetch_op(ic->proc_id)) { - frontend_fetch_op(ic->proc_id, op); - ASSERTM(ic->proc_id, ic->next_fetch_addr == op->inst_info->addr, - "Fetch address 0x%llx does not match op address 0x%llx\n", - ic->next_fetch_addr, op->inst_info->addr); - op->fetch_addr = ic->next_fetch_addr; - ASSERT_PROC_ID_IN_ADDR(ic->proc_id, op->fetch_addr) - op->off_path = ic->off_path; - td->inst_addr = op->inst_info->addr; // FIXME: BUG 54 - ASSERT_PROC_ID_IN_ADDR(ic->proc_id, td->inst_addr); - if(!op->off_path) { - if(op->eom) - issued_real_inst++; - issued_uop++; + if(DECOUPLED_BP){ + Op* op = read_fetch_queue(ic->proc_id); + if(op == NULL){ + //fetch queue empty, try again next cycle + *break_fetch = BREAK_CF; + return IC_FETCH; } - inst = op->inst_info; - } else { - free_op(op); - *break_fetch = BREAK_BARRIER; - return IC_FETCH; - } + else{ + ic->next_fetch_addr = op->inst_info->addr; + ic->fetch_addr = ic->next_fetch_addr; + //ASSERTM(ic->proc_id, ic->next_fetch_addr == op->inst_info->addr, + // "Fetch address 0x%llx does not match op address 0x%llx\n", + // ic->next_fetch_addr, op->inst_info->addr); + + packet_break = packet_build(ic_pb_data, break_fetch, op, 0); + if(packet_break == PB_BREAK_BEFORE) { + break; + } - if(!op->off_path && - (op->table_info->mem_type == MEM_LD || - op->table_info->mem_type == MEM_ST) && - op->oracle_info.va == 0) { - // don't care if the va is 0x0 if mem_type is MEM_PF(SW prefetch), - // MEM_WH(write hint), or MEM_EVICT(cache block eviction hint) - print_func_op(op); - FATAL_ERROR(ic->proc_id, "Access to 0x0\n"); - } + ic->sd.ops[ic->sd.op_count] = op; + ic->sd.op_count++; - if(DUMP_TRACE && DEBUG_RANGE_COND(ic->proc_id)) - print_func_op(op); + 
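// Consumption protocol for the decoupled path, as implemented above and below:
// peek the head of the per-core fetch queue with read_fetch_queue(); if it is
// empty, give up for this cycle and retry; otherwise let packet_build() decide
// whether the op fits the current fetch packet, append it to ic->sd, and only
// then pop_fetch_queue(), so an op that hits PB_BREAK_BEFORE stays queued for
// the next packet.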
ASSERT(ic->proc_id, pop_fetch_queue(ic->proc_id)); - if(DIE_ON_CALLSYS && !op->off_path) { - ASSERT(ic->proc_id, op->table_info->cf_type != CF_SYS); + if(packet_break == PB_BREAK_AFTER) + break; + } } + else { + Op* op = alloc_op(ic->proc_id); + Inst_Info* inst = 0; + UNUSED(inst); + + if(frontend_can_fetch_op(ic->proc_id)) { + frontend_fetch_op(ic->proc_id, op); + ASSERTM(ic->proc_id, ic->next_fetch_addr == op->inst_info->addr, + "Fetch address 0x%llx does not match op address 0x%llx\n", + ic->next_fetch_addr, op->inst_info->addr); + op->fetch_addr = ic->next_fetch_addr; + ASSERT_PROC_ID_IN_ADDR(ic->proc_id, op->fetch_addr) + op->off_path = ic->off_path; + td->inst_addr = op->inst_info->addr; // FIXME: BUG 54 + ASSERT_PROC_ID_IN_ADDR(ic->proc_id, td->inst_addr); + if(!op->off_path) { + if(op->eom) + issued_real_inst++; + issued_uop++; + } + inst = op->inst_info; + } else { + free_op(op); + *break_fetch = BREAK_BARRIER; + return IC_FETCH; + } - packet_break = packet_build(ic_pb_data, break_fetch, op, 0); - if(packet_break == PB_BREAK_BEFORE) { - free_op(op); - break; - } + if(!op->off_path && + (op->table_info->mem_type == MEM_LD || + op->table_info->mem_type == MEM_ST) && + op->oracle_info.va == 0) { + // don't care if the va is 0x0 if mem_type is MEM_PF(SW prefetch), + // MEM_WH(write hint), or MEM_EVICT(cache block eviction hint) + print_func_op(op); + FATAL_ERROR(ic->proc_id, "Access to 0x0\n"); + } - /* add to sequential op list */ - add_to_seq_op_list(td, op); - - ASSERT(ic->proc_id, td->seq_op_list.count <= op_pool_active_ops); - - /* map the op based on true dependencies & set information in - * op->oracle_info */ - /* num cycles since last group issued */ - op->fetch_lag = fetch_lag; - - thread_map_op(op); - - STAT_EVENT(op->proc_id, FETCH_ALL_INST); - STAT_EVENT(op->proc_id, ORACLE_ON_PATH_INST + op->off_path); - STAT_EVENT(op->proc_id, ORACLE_ON_PATH_INST_MEM + - (op->table_info->mem_type == NOT_MEM) + - 2 * op->off_path); - - thread_map_mem_dep(op); - op->fetch_cycle = cycle_count; - - ic->sd.ops[ic->sd.op_count] = op; /* put op in the exit list */ - op_count[ic->proc_id]++; /* increment instruction counters */ - unique_count_per_core[ic->proc_id]++; - unique_count++; - - /* check trigger */ - if(op->inst_info->trigger_op_fetched_hook) - model->op_fetched_hook(op); - - /* move on to next instruction in the cache line */ - ic->sd.op_count++; - INC_STAT_EVENT(ic->proc_id, INST_LOST_FETCH + ic->off_path, 1); - - DEBUG(ic->proc_id, - "Fetching op from Icache addr: %s off: %d inst_info: %p ii_addr: %s " - "dis: %s opnum: (%s:%s)\n", - hexstr64s(op->inst_info->addr), op->off_path, op->inst_info, - hexstr64s(op->inst_info->addr), disasm_op(op, TRUE), - unsstr64(op->op_num), unsstr64(op->unique_num)); - - /* figure out next address after current instruction */ - if(op->table_info->cf_type) { - // For pipeline gating - if(op->table_info->cf_type == CF_CBR) - td->td_info.fetch_br_count++; - - if(*break_fetch == BREAK_BARRIER) { - // for fetch barriers (including syscalls), we do not want to do - // redirect/recovery, BUT we still want to update the branch predictor. 
- bp_predict_op(g_bp_data, op, (*cf_num)++, ic->fetch_addr); - op->oracle_info.mispred = 0; - op->oracle_info.misfetch = 0; - op->oracle_info.btb_miss = 0; - op->oracle_info.no_target = 0; - ic->next_fetch_addr = ADDR_PLUS_OFFSET( - ic->next_fetch_addr, op->inst_info->trace_info.inst_size); - ASSERT_PROC_ID_IN_ADDR(ic->proc_id, ic->next_fetch_addr) - } else { - // - ic->next_fetch_addr = bp_predict_op(g_bp_data, op, (*cf_num)++, - ic->fetch_addr); - // initially bp_predict_op can return a garbage, for multi core run, - // addr must follow cmp addr convention - ic->next_fetch_addr = convert_to_cmp_addr(ic->proc_id, - ic->next_fetch_addr); - ASSERT_PROC_ID_IN_ADDR(ic->proc_id, ic->next_fetch_addr) + if(DUMP_TRACE && DEBUG_RANGE_COND(ic->proc_id)) + print_func_op(op); + + if(DIE_ON_CALLSYS && !op->off_path) { + ASSERT(ic->proc_id, op->table_info->cf_type != CF_SYS); } - ASSERT(ic->proc_id, - (op->oracle_info.mispred << 2 | op->oracle_info.misfetch << 1 | - op->oracle_info.btb_miss) <= 0x7); - - const uns8 mispred = op->oracle_info.mispred; - const uns8 late_mispred = op->oracle_info.late_mispred; - const uns8 misfetch = op->oracle_info.misfetch; - const uns8 late_misfetch = op->oracle_info.late_misfetch; - - /* if it's a mispredict, kick the oracle off path */ - if(mispred || misfetch || - (USE_LATE_BP && (late_mispred || late_misfetch))) { - ic->off_path = TRUE; - - if(FETCH_OFF_PATH_OPS) { - if(mispred || misfetch) { - DEBUG(ic->proc_id, - "Cycle %llu: redirected frontend because of the " - "early branch predictor to 0x%s\n", - cycle_count, hexstr64s(ic->next_fetch_addr)); - frontend_redirect(td->proc_id, op->inst_uid, ic->next_fetch_addr); - } + packet_break = packet_build(ic_pb_data, break_fetch, op, 0); + if(packet_break == PB_BREAK_BEFORE) { + free_op(op); + break; + } - if(USE_LATE_BP) { - if((mispred || misfetch) && !late_mispred && !late_misfetch) { - bp_sched_recovery(bp_recovery_info, op, cycle_count, - /*late_bp_recovery=*/TRUE, - /*force_offpath=*/FALSE); - DEBUG(ic->proc_id, - "Scheduled a recovery to correct addr for cycle %llu\n", - cycle_count + LATE_BP_LATENCY); - } else if((late_mispred || late_misfetch) && - op->oracle_info.pred_npc != - op->oracle_info.late_pred_npc) { - bp_sched_recovery(bp_recovery_info, op, cycle_count, - /*late_bp_recovery=*/TRUE, - /*force_offpath=*/TRUE); + /* add to sequential op list */ + add_to_seq_op_list(td, op); + + ASSERT(ic->proc_id, td->seq_op_list.count <= op_pool_active_ops); + + /* map the op based on true dependencies & set information in + * op->oracle_info */ + /* num cycles since last group issued */ + op->fetch_lag = fetch_lag; + + thread_map_op(op); + + STAT_EVENT(op->proc_id, FETCH_ALL_INST); + STAT_EVENT(op->proc_id, ORACLE_ON_PATH_INST + op->off_path); + STAT_EVENT(op->proc_id, ORACLE_ON_PATH_INST_MEM + + (op->table_info->mem_type == NOT_MEM) + + 2 * op->off_path); + + thread_map_mem_dep(op); + op->fetch_cycle = cycle_count; + + ic->sd.ops[ic->sd.op_count] = op; /* put op in the exit list */ + op_count[ic->proc_id]++; /* increment instruction counters */ + unique_count_per_core[ic->proc_id]++; + unique_count++; + + /* check trigger */ + if(op->inst_info->trigger_op_fetched_hook) + model->op_fetched_hook(op); + + /* move on to next instruction in the cache line */ + ic->sd.op_count++; + INC_STAT_EVENT(ic->proc_id, INST_LOST_FETCH + ic->off_path, 1); + + DEBUG(ic->proc_id, + "Fetching op from Icache addr: %s off: %d inst_info: %p ii_addr: %s " + "dis: %s opnum: (%s:%s)\n", + hexstr64s(op->inst_info->addr), op->off_path, 
op->inst_info, + hexstr64s(op->inst_info->addr), disasm_op(op, TRUE), + unsstr64(op->op_num), unsstr64(op->unique_num)); + + /* figure out next address after current instruction */ + if(op->table_info->cf_type) { + // For pipeline gating + if(op->table_info->cf_type == CF_CBR) + td->td_info.fetch_br_count++; + + if(*break_fetch == BREAK_BARRIER) { + // for fetch barriers (including syscalls), we do not want to do + // redirect/recovery, BUT we still want to update the branch predictor. + bp_predict_op(g_bp_data, op, (*cf_num)++, ic->fetch_addr); + op->oracle_info.mispred = 0; + op->oracle_info.misfetch = 0; + op->oracle_info.btb_miss = 0; + op->oracle_info.no_target = 0; + ic->next_fetch_addr = ADDR_PLUS_OFFSET( + ic->next_fetch_addr, op->inst_info->trace_info.inst_size); + ASSERT_PROC_ID_IN_ADDR(ic->proc_id, ic->next_fetch_addr) + } else { + // + ic->next_fetch_addr = bp_predict_op(g_bp_data, op, (*cf_num)++, + ic->fetch_addr); + // initially bp_predict_op can return a garbage, for multi core run, + // addr must follow cmp addr convention + ic->next_fetch_addr = convert_to_cmp_addr(ic->proc_id, + ic->next_fetch_addr); + ASSERT_PROC_ID_IN_ADDR(ic->proc_id, ic->next_fetch_addr) + } + + ASSERT(ic->proc_id, + (op->oracle_info.mispred << 2 | op->oracle_info.misfetch << 1 | + op->oracle_info.btb_miss) <= 0x7); + + const uns8 mispred = op->oracle_info.mispred; + const uns8 late_mispred = op->oracle_info.late_mispred; + const uns8 misfetch = op->oracle_info.misfetch; + const uns8 late_misfetch = op->oracle_info.late_misfetch; + + /* if it's a mispredict, kick the oracle off path */ + if(mispred || misfetch || + (USE_LATE_BP && (late_mispred || late_misfetch))) { + ic->off_path = TRUE; + + if(FETCH_OFF_PATH_OPS) { + if(mispred || misfetch) { DEBUG(ic->proc_id, - "Scheduled a recovery to wrong addr for cycle %llu\n", - cycle_count + LATE_BP_LATENCY); + "Cycle %llu: redirected frontend because of the " + "early branch predictor to 0x%s\n", + cycle_count, hexstr64s(ic->next_fetch_addr)); + frontend_redirect(td->proc_id, op->inst_uid, ic->next_fetch_addr); + } + + if(USE_LATE_BP) { + if((mispred || misfetch) && !late_mispred && !late_misfetch) { + bp_sched_recovery(bp_recovery_info, op, cycle_count, + /*late_bp_recovery=*/TRUE, + /*force_offpath=*/FALSE); + DEBUG(ic->proc_id, + "Scheduled a recovery to correct addr for cycle %llu\n", + cycle_count + LATE_BP_LATENCY); + } else if((late_mispred || late_misfetch) && + op->oracle_info.pred_npc != + op->oracle_info.late_pred_npc) { + bp_sched_recovery(bp_recovery_info, op, cycle_count, + /*late_bp_recovery=*/TRUE, + /*force_offpath=*/TRUE); + DEBUG(ic->proc_id, + "Scheduled a recovery to wrong addr for cycle %llu\n", + cycle_count + LATE_BP_LATENCY); + } } + } else { + packet_break = PB_BREAK_AFTER; + *break_fetch = BREAK_OFFPATH; } - } else { - packet_break = PB_BREAK_AFTER; - *break_fetch = BREAK_OFFPATH; - } - // pipeline gating - if(!op->off_path) - td->td_info.last_bp_miss_op = op; - /////////////////////////////////////// - } + // pipeline gating + if(!op->off_path) + td->td_info.last_bp_miss_op = op; + /////////////////////////////////////// + } - /* if it's a btb miss, quit fetching and wait for redirect */ - if(op->oracle_info.btb_miss) { - *break_fetch = BREAK_BTB_MISS; - DEBUG(ic->proc_id, "Changed icache to wait for redirect %llu\n", - cycle_count); - return IC_WAIT_FOR_REDIRECT; - } + /* if it's a btb miss, quit fetching and wait for redirect */ + if(op->oracle_info.btb_miss) { + *break_fetch = BREAK_BTB_MISS; + DEBUG(ic->proc_id, "Changed 
icache to wait for redirect %llu\n", + cycle_count); + return IC_WAIT_FOR_REDIRECT; + } - /* if it's a taken branch, wait for timer */ - if(FETCH_BREAK_ON_TAKEN && op->oracle_info.pred && - *break_fetch != BREAK_BARRIER) { - *break_fetch = BREAK_TAKEN; - if(FETCH_TAKEN_BUBBLE_CYCLES >= 1) { - ic->timer_cycle = cycle_count + FETCH_TAKEN_BUBBLE_CYCLES; - return IC_WAIT_FOR_TIMER; - } else - return IC_FETCH; - } - } else { - if(op->eom) { - ic->next_fetch_addr = ADDR_PLUS_OFFSET( - ic->next_fetch_addr, op->inst_info->trace_info.inst_size); + /* if it's a taken branch, wait for timer */ + if(FETCH_BREAK_ON_TAKEN && op->oracle_info.pred && + *break_fetch != BREAK_BARRIER) { + *break_fetch = BREAK_TAKEN; + if(FETCH_TAKEN_BUBBLE_CYCLES >= 1) { + ic->timer_cycle = cycle_count + FETCH_TAKEN_BUBBLE_CYCLES; + return IC_WAIT_FOR_TIMER; + } else + return IC_FETCH; + } + } else { + if(op->eom) { + ic->next_fetch_addr = ADDR_PLUS_OFFSET( + ic->next_fetch_addr, op->inst_info->trace_info.inst_size); + ASSERT_PROC_ID_IN_ADDR(ic->proc_id, ic->next_fetch_addr) + } + // pass the global branch history to all the instructions + op->oracle_info.pred_global_hist = g_bp_data->global_hist; ASSERT_PROC_ID_IN_ADDR(ic->proc_id, ic->next_fetch_addr) } - // pass the global branch history to all the instructions - op->oracle_info.pred_global_hist = g_bp_data->global_hist; - ASSERT_PROC_ID_IN_ADDR(ic->proc_id, ic->next_fetch_addr) - } - if(packet_break == PB_BREAK_AFTER) - break; + if(packet_break == PB_BREAK_AFTER) + break; + } } if(*break_fetch == BREAK_BARRIER) { diff --git a/src/icache_stage.h b/src/icache_stage.h index 98dda971..09b8d1da 100644 --- a/src/icache_stage.h +++ b/src/icache_stage.h @@ -98,13 +98,6 @@ typedef struct Icache_Data_struct { } Icache_Data; -/**************************************************************************************/ -/* Global Variables */ - -Pb_Data* ic_pb_data; // cmp cne is fine for cmp now assuming homogeneous cmp -// But decided to use array for future use - - /**************************************************************************************/ /* External Variables */ diff --git a/src/libs/cache_lib.c b/src/libs/cache_lib.c index 93476ab7..adbda1ed 100644 --- a/src/libs/cache_lib.c +++ b/src/libs/cache_lib.c @@ -69,7 +69,16 @@ char rand_repl_state[31]; /**************************************************************************************/ - +/** + * @brief Return set index of the addr + * As a side-effect, the tag and line_addr will be populated + * @param cache + * @param addr The access addr (input) + * @param tag The tag of the access (output) + * @param line_addr The base address of the cache blk corresponding to the + * access (output) + * @return uns The set index of the access + */ static inline uns cache_index(Cache* cache, Addr addr, Addr* tag, Addr* line_addr) { *line_addr = addr & ~cache->offset_mask; @@ -197,10 +206,15 @@ void init_cache(Cache* cache, const char* name, uns cache_size, uns assoc, } } -/**************************************************************************************/ -/* cache_access: Does a cache lookup based on the address. Returns a pointer - * to the cache line data if it is found. */ - +/** + * @brief access the address. 
+ * + * @param cache + * @param addr the request addr + * @param line_addr + * @param update_repl + * @return void* data field of the blk or NULL if cache miss + */ void* cache_access(Cache* cache, Addr addr, Addr* line_addr, Flag update_repl) { Addr tag; uns set = cache_index(cache, addr, &tag, line_addr); @@ -210,6 +224,7 @@ void* cache_access(Cache* cache, Addr addr, Addr* line_addr, Flag update_repl) { return access_ideal_storage(cache, set, tag, addr); } + // search the ways for(ii = 0; ii < cache->assoc; ii++) { Cache_Entry* line = &cache->entries[set][ii]; @@ -249,29 +264,51 @@ void* cache_access(Cache* cache, Addr addr, Addr* line_addr, Flag update_repl) { return NULL; } -/**************************************************************************************/ -/* cache_insert: returns a pointer to the data section of the new cache line. - Sets line_addr to the address of the first block of the new line. Sets - repl_line_addr to the address of the first block that was replaced - - DON'T call this unless you are sure that the line is not in the - cache (call after cache_access returned NULL) -*/ +/** + * @brief Insert new addr to the cache + * + * This function is a wrapper of cache_insert_replpos, see below + * + * Note cache_insert is intrusive, for a non-instusive function + * (which only pick out the victim but not doing the insertion), + * see get_next_repl_line, both of these functions calls find_repl_entry + * internally + * + * DON'T call this unless you are sure that the line is not in the + * cache (call after cache_access returned NULL) + * + * @param cache + * @param proc_id + * @param addr + * @param line_addr + * @param repl_line_addr + * @return void* The data field of the inserted blk + */ void* cache_insert(Cache* cache, uns8 proc_id, Addr addr, Addr* line_addr, Addr* repl_line_addr) { return cache_insert_replpos(cache, proc_id, addr, line_addr, repl_line_addr, INSERT_REPL_DEFAULT, FALSE); } -/**************************************************************************************/ -/* cache_insert_replpos: returns a pointer to the data section of the new cache - line. Sets line_addr to the address of the first block of the new line. - Sets repl_line_addr to the address of the first block that was replaced - - DON'T call this unless you are sure that the line is not in the - cache (call after cache_access returned NULL) -*/ +/** + * @brief Insert new blk into cache + * returns a pointer to the data section of the new cache line. + * Sets line_addr to the address of the first block of the new line. 
Sets + * repl_line_addr to the address of the first block that was replaced + * + * Note this func won't do the WB if the victim is dirty, the info of the + * victim blk is returned and WB is handled by the caller of this func + * + * DON'T call this unless you are sure that the line is *not* in the + * cache (call after cache_access returned NULL) + * @param cache + * @param proc_id + * @param addr The addr of the blk to be inserted + * @param line_addr The base addr of the blk to be insert (input) + * @param repl_line_addr The base addr of the blk got evicted (output) + * @return void* The data field of the inserted blk + */ void* cache_insert_replpos(Cache* cache, uns8 proc_id, Addr addr, Addr* line_addr, Addr* repl_line_addr, Cache_Insert_Repl insert_repl_policy, @@ -285,16 +322,19 @@ void* cache_insert_replpos(Cache* cache, uns8 proc_id, Addr addr, new_line = insert_sure_line(cache, set, tag); *repl_line_addr = 0; } else { + // new_line points to the victim, repl_index is the way id for the victim new_line = find_repl_entry(cache, proc_id, set, &repl_index); /* before insert the data into cache, if the cache has shadow entry */ /* insert that entry to the shadow cache */ if((cache->repl_policy == REPL_SHADOW_IDEAL) && new_line->valid) shadow_cache_insert(cache, set, new_line->tag, new_line->base); - if(new_line->valid) // bug fixed. 4/26/04 if the entry is not valid, - // repl_line_addr should be set to 0 + if(new_line->valid) { + // bug fixed. 4/26/04 if the entry is not valid, + // repl_line_addr should be set to 0 *repl_line_addr = new_line->base; - else + } else { *repl_line_addr = 0; + } DEBUG(0, "Replacing 2.2f(set %u, way %u, tag 0x%s, base 0x%s) in cache '%s' " "with base 0x%s\n", @@ -311,6 +351,7 @@ void* cache_insert_replpos(Cache* cache, uns8 proc_id, Addr addr, new_line->pref = isPrefetch; + // determine the insert loc (insertion policy) switch(insert_repl_policy) { case INSERT_REPL_DEFAULT: update_repl_policy(cache, new_line, set, repl_index, TRUE); @@ -402,10 +443,15 @@ void* cache_insert_replpos(Cache* cache, uns8 proc_id, Addr addr, } -/**************************************************************************************/ -/* invalidate_line: Does a cache lookup based on the address. Returns a pointer - to the cache line data if it is found. 
*/ - +/** + * @brief Invalidate the blk by address if presented, no wb even the blk + * is dirty + * + * @param cache + * @param addr + * @param line_addr + * @param True on find in cache, False on no present + */ void cache_invalidate(Cache* cache, Addr addr, Addr* line_addr) { Addr tag; uns set = cache_index(cache, addr, &tag, line_addr); @@ -426,7 +472,11 @@ void cache_invalidate(Cache* cache, Addr addr, Addr* line_addr) { /** - * @brief Return a pointer to the lru item in the cache set + * @brief Return a pointer to the victim to be replaced + * + * The caller of this func is supposed to handle the possible + * writeback correctly, otherwise the correctness of simulation + * is compromised * * @param cache * @param proc_id diff --git a/src/memory/mem_req.h b/src/memory/mem_req.h index 95478a49..e73f1fb4 100644 --- a/src/memory/mem_req.h +++ b/src/memory/mem_req.h @@ -40,6 +40,7 @@ struct Mem_Queue_struct; /**************************************************************************************/ /* Types */ +// M(em)R(eq)S(tate) typedef enum Mem_Req_State_enum { MRS_INV, /* if you change this order or add anything, fix mem_req_state_names [] and is_final_state() in memory.c and */ @@ -63,6 +64,7 @@ typedef enum Mem_Req_State_enum { MRS_FILL_DONE, /* final state */ } Mem_Req_State; +// M(em)R(eq)T(ype) #define MRT_LIST(elem) \ elem(IFETCH) /* instruction fetch */ \ elem(DFETCH) /* data fetch */ \ @@ -129,7 +131,15 @@ struct Mem_Req_struct { uns op_count; /* number of ops that are waiting for the miss */ uns req_count; /* number of requests coalesced into this one */ Flag (*done_func)(struct Mem_Req_struct*); /* pointer to function to call when - the memory request is finished + the memory request is finished, + this is the mechanism scarab + used to implement a "callback". + i.e. when a req is finally + returned from the mem system, + continue with the rest of the + process. 
This is mostly used by + I$ and D$ to fill the line when + req returned from uncore/mem */ Flag mlc_miss; /* did this request miss in MLC */ Flag mlc_miss_satisfied; /* did this request miss in MLC and it is already diff --git a/src/memory/memory.c b/src/memory/memory.c index 0a88207e..b77d2dad 100644 --- a/src/memory/memory.c +++ b/src/memory/memory.c @@ -288,9 +288,11 @@ void init_mem_req_type_priorities() { } } -/**************************************************************************************/ -/* init_memory: */ +/** + * @brief Init memory + * + */ void init_memory() { int ii; char name[20]; @@ -390,25 +392,46 @@ void init_memory() { } /** - * @brief this function should only be called once in warmup mode + * @brief Instantiate all the parts in uncore + * + * Note: this function should only be called once in warmup mode * */ void init_uncores(void) { mem->uncores = (Uncore*)malloc(sizeof(Uncore) * NUM_CORES); - /* Initialize MLC cache (shared only for now) */ - Ported_Cache* mlc = (Ported_Cache*)malloc(sizeof(Ported_Cache)); - init_cache(&mlc->cache, "MLC_CACHE", MLC_SIZE, MLC_ASSOC, MLC_LINE_SIZE, - sizeof(MLC_Data), MLC_CACHE_REPL_POLICY); - mlc->num_banks = MLC_BANKS; - mlc->ports = (Ports*)malloc(sizeof(Ports) * mlc->num_banks); - for(uns ii = 0; ii < mlc->num_banks; ii++) { - char name[MAX_STR_LENGTH + 1]; - snprintf(name, MAX_STR_LENGTH, "MLC BANK %d PORTS", ii); - init_ports(&mlc->ports[ii], name, MLC_READ_PORTS, MLC_WRITE_PORTS, FALSE); - } - for(uns proc_id = 0; proc_id < NUM_CORES; proc_id++) { - MLC(proc_id) = mlc; + /* Initialize MLC cache */ + if(PRIVATE_MLC) { + for(uns proc_id = 0; proc_id < NUM_CORES; proc_id++) { + Ported_Cache* mlc = (Ported_Cache*)malloc(sizeof(Ported_Cache)); + char buf[MAX_STR_LENGTH + 1]; + sprintf(buf, "MLC[%d]", proc_id); + init_cache(&mlc->cache, buf, MLC_SIZE, MLC_ASSOC, MLC_LINE_SIZE, + sizeof(MLC_Data), MLC_CACHE_REPL_POLICY); + mlc->num_banks = MLC_BANKS; + mlc->ports = (Ports*)malloc(sizeof(Ports) * mlc->num_banks); + for(uns ii = 0; ii < mlc->num_banks; ii++) { + char name[MAX_STR_LENGTH + 1]; + snprintf(name, MAX_STR_LENGTH, "MLC[%d] BANK %d PORTS", proc_id, ii); + init_ports(&mlc->ports[ii], name, MLC_READ_PORTS, MLC_WRITE_PORTS, + FALSE); + } + MLC(proc_id) = mlc; + } + } else { + Ported_Cache* mlc = (Ported_Cache*)malloc(sizeof(Ported_Cache)); + init_cache(&mlc->cache, "MLC_CACHE", MLC_SIZE, MLC_ASSOC, MLC_LINE_SIZE, + sizeof(MLC_Data), MLC_CACHE_REPL_POLICY); + mlc->num_banks = MLC_BANKS; + mlc->ports = (Ports*)malloc(sizeof(Ports) * mlc->num_banks); + for(uns ii = 0; ii < mlc->num_banks; ii++) { + char name[MAX_STR_LENGTH + 1]; + snprintf(name, MAX_STR_LENGTH, "MLC BANK %d PORTS", ii); + init_ports(&mlc->ports[ii], name, MLC_READ_PORTS, MLC_WRITE_PORTS, FALSE); + } + for(uns proc_id = 0; proc_id < NUM_CORES; proc_id++) { + MLC(proc_id) = mlc; + } } /* Initialize LLC */ @@ -434,6 +457,7 @@ void init_uncores(void) { snprintf(name, MAX_STR_LENGTH, "L1[%d] BANK %d PORTS", proc_id, ii); init_ports(&l1->ports[ii], name, L1_READ_PORTS, L1_WRITE_PORTS, FALSE); } + // Use this macro to unify the handling of private/share L1 L1(proc_id) = l1; } } else { @@ -813,7 +837,7 @@ int cycle_busoutq_insert_count = 0; int l1_in_buf_count = 0; /** - * @brief Not sure what this func is doing + * @brief Sort all mem queues * */ void update_memory_queues() { @@ -866,8 +890,9 @@ void update_on_chip_memory_stats() { /** * @brief simulate the memory system for one cycle - * functions are called in reverse order, that's fill queues (req going back to - * 
cpu), first, then ramulator (DRAM), then request queues (reg going down to + * + * Note: updates happen in reverse order, that's fill queues (reqs going back to + * cpu), first, then ramulator (DRAM), then request queues (reqs going out to * mem) * */ @@ -895,11 +920,12 @@ void update_memory() { if(freq_is_ready(FREQ_DOMAIN_L1)) { cycle_count = freq_cycle_count(FREQ_DOMAIN_L1); - mem_process_bus_out_reqs(); + mem_process_bus_out_reqs(); // obsolete code, nothing will be executed mem_process_l1_reqs(); mem_process_mlc_reqs(); } + // WQ: why is this not called before mlc_fill??? for(uns proc_id = 0; proc_id < NUM_CORES; proc_id++) { if(freq_is_ready(FREQ_DOMAIN_CORES[proc_id])) { cycle_count = freq_cycle_count(FREQ_DOMAIN_CORES[proc_id]); @@ -925,13 +951,18 @@ int mem_compare_priority(const void* a, const void* b) { return 0; } -/**************************************************************************************/ -/* mem_start_mlc_access: */ +/** + * @brief Try obtain MLC ports, transit state into MLC_WAIT on success + * + * @param req + */ void mem_start_mlc_access(Mem_Req* req) { Flag avail = FALSE; /* FIXME: Only WB reqs try to get a write port? How about stores? */ + // WQ: store definately need read port first, but potentially need to obtain + // write port subsequently, not sure how to model this Flag need_wp = ((req->type == MRT_WB) || (req->type == MRT_WB_NODIRTY)); Flag need_rp = !need_wp; if((need_wp && get_write_port(&MLC(req->proc_id)->ports[req->mlc_bank])) || @@ -954,9 +985,11 @@ void mem_start_mlc_access(Mem_Req* req) { STAT_EVENT(req->proc_id, MLC_LD_BANK_BLOCK + avail); } -/**************************************************************************************/ -/* mem_start_l1_access: */ - +/** + * @brief Change the MRS state into wait, if obtain the required port + * + * @param req + */ void mem_start_l1_access(Mem_Req* req) { Flag avail = FALSE; @@ -982,6 +1015,7 @@ void mem_start_l1_access(Mem_Req* req) { req->rdy_cycle = freq_convert_future_cycle( core_domain, core_cycle_count + L1_CYCLES, FREQ_DOMAIN_L1); } else { + // cycle_count is the current global cycle req->rdy_cycle = cycle_count + L1_CYCLES; } @@ -995,11 +1029,22 @@ void mem_start_l1_access(Mem_Req* req) { STAT_EVENT(req->proc_id, L1_LD_BANK_BLOCK + avail); } -/**************************************************************************************/ -/* mem_process_l1_hit_access: */ -/* Returns TRUE if l1 access is complete and needs to be removed from l1_queue +/** + * @brief post_process after LLC hit. 
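For clarity, a minimal self-contained sketch (not a hunk of this patch) of the port-arbitration step that mem_start_mlc_access / mem_start_l1_access perform: write-backs compete for a write port, everything else for a read port, and a successful grab parks the request until the array latency has elapsed. Types and callbacks below are simplified stand-ins, not scarab's real interfaces.

#include <stdbool.h>

typedef struct {
  unsigned long long rdy_cycle; /* cycle at which the array access completes */
  int                waiting;   /* stands in for the *_WAIT request state    */
} PortReq;

/* Returns false when the bank is blocked; the caller retries next cycle. */
static bool start_cache_access(PortReq *r, bool is_writeback,
                               bool (*get_read_port)(void),
                               bool (*get_write_port)(void),
                               unsigned long long now, unsigned latency) {
  bool got = is_writeback ? get_write_port() : get_read_port();
  if (!got)
    return false;
  r->waiting   = 1;
  r->rdy_cycle = now + latency;
  return true;
}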
+ * + * Resp the req either to core's fill_queue or MLC fill_queue + * If need to WB (write through cache), it will be handled outside this func + * + * Update the state of the data blk (dirty bit) on hit + * + * @param req + * @param l1_queue_entry + * @param line_addr + * @param data + * @param lru_position + * @return Flag Returns TRUE if l1 access is complete and needs to be removed + * from l1_queue */ - Flag mem_process_l1_hit_access(Mem_Req* req, Mem_Queue_Entry* l1_queue_entry, Addr* line_addr, L1_Data* data, int lru_position) { @@ -1057,7 +1102,8 @@ Flag mem_process_l1_hit_access(Mem_Req* req, Mem_Queue_Entry* l1_queue_entry, STAT_EVENT(req->proc_id, L1_WB_HIT); STAT_EVENT(req->proc_id, CORE_L1_WB_HIT); } - data->dirty |= (req->type == MRT_WB); + // mark the blk dirty on WB + data->dirty |= (req->type == MRT_WB || req->type == MRT_DSTORE); } DEBUG(req->proc_id, @@ -1092,15 +1138,26 @@ Flag mem_process_l1_hit_access(Mem_Req* req, Mem_Queue_Entry* l1_queue_entry, DEBUG(req->proc_id, "Req index:%d no longer a chip demand\n", req->id); } - // this is just a stat collection + // collect stat for wrong path accesses wp_process_l1_hit(data, req); + ///////////////////////////////////////////// + // main logic for handling a hit: + // Case 0: req is WB and L1 is write through, propogate downwards + // WQ: case 0 is deprecated, WB will be sent to ramulator from the caller if(L1_WRITE_THROUGH && (req->type == MRT_WB)) { req->state = MRS_BUS_NEW; req->rdy_cycle = cycle_count + L1Q_TO_FSB_TRANSFER_LATENCY; + // Case 1: propagate upwards to MLC } else if(fill_mlc) { req->state = MRS_FILL_MLC; req->rdy_cycle = cycle_count + 1; + // borrow the dirty_l0 field of req to mark the fill contains dirty data + // true for both inclusive & exclusive L1 + if(data->dirty) { + req->dirty_l0 = TRUE; + } + // insert into mlc queue req->queue = &(mem->mlc_fill_queue); if(!ORDER_BEYOND_BUS) @@ -1111,30 +1168,47 @@ Flag mem_process_l1_hit_access(Mem_Req* req, Mem_Queue_Entry* l1_queue_entry, mem_insert_req_into_queue(req, req->queue, ALL_FIFO_QUEUES ? mlc_fill_seq_num : 0); mlc_fill_seq_num++; + // maintain exclusivity, evict from L1 + if(EXCLUSIVE_L1) { + // no need to WB since dirtyness will be propogated upwards too + ASSERT(0, MLC_PRESENT); + Addr dummy; + cache_invalidate(&L1(req->proc_id)->cache, req->addr, &dummy); + } + + // Case 2: if done_func is not bound (usually is a prefetch), terminate the + // req } else if(!req->done_func) { req->state = MRS_L1_HIT_DONE; // Free the request buffer mem_free_reqbuf(req); + // Case 3: propgate upwards, directly to the core } else { + // this case should only be called when no MLC in sys + // WQ: seems the dirtyness is not propogate upward here req->state = MRS_L1_HIT_DONE; req->rdy_cycle = freq_cycle_count( FREQ_DOMAIN_CORES[req->proc_id]); // no +1 to match old performance // insert into core fill queue req->queue = &(mem->core_fill_queues[req->proc_id]); - if(!ORDER_BEYOND_BUS) + + if(data->dirty) + req->dirty_l0 = TRUE; + if(!ORDER_BEYOND_BUS) { mem_insert_req_into_queue(req, req->queue, ALL_FIFO_QUEUES ? core_fill_seq_num[req->proc_id] : l1_queue_entry->priority); - else + } else { mem_insert_req_into_queue( req, req->queue, ALL_FIFO_QUEUES ? core_fill_seq_num[req->proc_id] : 0); - core_fill_seq_num[req->proc_id]++; + core_fill_seq_num[req->proc_id]++; + } } /* Set the priority so that this entry will be removed from the l1_queue */ l1_queue_entry->priority = Mem_Req_Priority_Offset[MRT_MIN_PRIORITY]; - + // wq todo: the count is not set correctly? 
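To make the restructured hit path easier to follow, here is a simplified, self-contained sketch (not part of the patch, and omitting the deprecated write-through case) of the three live outcomes after an LLC hit: fill the MLC (invalidating the LLC copy under an exclusive policy), terminate a callback-less prefetch, or steer the response to the core fill queue, with dirtiness carried upward in a dirty_l0-style flag. All names are stand-ins.

#include <stdbool.h>

typedef enum { DEST_CORE, DEST_MLC } FillDest;

typedef struct {
  bool has_done_func;  /* demand requests carry a completion callback       */
  bool line_dirty;     /* dirty bit of the block that hit in the LLC        */
  bool fill_mlc;       /* install the data in the MLC on the way up?        */
  bool exclusive_l1;   /* EXCLUSIVE_L1-style knob                           */
  bool dirty_l0;       /* tells the upper level that the fill data is dirty */
} HitReq;

static FillDest dispatch_llc_hit(HitReq *r, bool *free_req, bool *invalidate_llc) {
  *free_req       = false;
  *invalidate_llc = false;
  r->dirty_l0     = r->line_dirty;     /* dirtiness travels with the fill   */
  if (r->fill_mlc) {
    *invalidate_llc = r->exclusive_l1; /* keep at most one copy             */
    return DEST_MLC;
  }
  if (!r->has_done_func)               /* typically a prefetch: nothing to deliver */
    *free_req = true;
  return DEST_CORE;                    /* no MLC in the hierarchy           */
}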
if(L2L1PREF_ON) l2l1pref_mem(req); @@ -1142,92 +1216,124 @@ Flag mem_process_l1_hit_access(Mem_Req* req, Mem_Queue_Entry* l1_queue_entry, return TRUE; } -/**************************************************************************************/ -/* mem_process_mlc_hit_access: */ -/* Returns TRUE if mlc access is complete and needs to be removed from mlc_queue +/** + * @brief + * + * @param req + * @param mlc_queue_entry + * @param line_addr + * @param data + * @param lru_position + * @return Flag Returns TRUE if mlc access is complete and needs to be removed + * from mlc_queue */ - Flag mem_process_mlc_hit_access(Mem_Req* req, Mem_Queue_Entry* mlc_queue_entry, Addr* line_addr, MLC_Data* data, int lru_position) { - if(!req->done_func || - req->done_func(req)) { /* If done_func is not complete we will keep - accessing MLC until done_func returns TRUE */ - - if(data) { /* not perfect mlc */ - if((req->type == MRT_DFETCH) || (req->type == MRT_DSTORE) || - (req->type == MRT_IFETCH)) { - if(data->prefetch) { // prefetch hit - DEBUG(req->proc_id, "%7lld mlc prefetch hit %d\n", cycle_count, - (int)(req->addr)); - STAT_EVENT(req->proc_id, MLC_PREF_HIT); - if(!data->seen_prefetch) { - data->seen_prefetch = TRUE; - - STAT_EVENT(req->proc_id, MLC_PREF_UNIQUE_HIT); - STAT_EVENT(req->proc_id, PREF_MLC_TOTAL_USED); - STAT_EVENT(req->proc_id, CORE_PREF_MLC_USED); - STAT_EVENT(req->proc_id, CORE_MLC_PREF_FILL_USED); - } - } - } - - if(req->type == MRT_DPRF || req->type == MRT_IPRF || - req->demand_match_prefetch) { - STAT_EVENT(req->proc_id, MLC_PREF_REQ_HIT); - STAT_EVENT(req->proc_id, CORE_MLC_PREF_REQ_HIT); - } else if((req->type == MRT_DFETCH) || (req->type == MRT_DSTORE) || - (req->type == MRT_IFETCH)) { - STAT_EVENT(req->proc_id, MLC_DEMAND_HIT); - STAT_EVENT(req->proc_id, CORE_MLC_DEMAND_HIT); - } else { // CMP Watch out RA - STAT_EVENT(req->proc_id, MLC_WB_HIT); - STAT_EVENT(req->proc_id, CORE_MLC_WB_HIT); - } - data->dirty |= (req->type == MRT_WB); - } - + if(data) { /* not perfect mlc */ if((req->type == MRT_DFETCH) || (req->type == MRT_DSTORE) || (req->type == MRT_IFETCH)) { - STAT_EVENT(req->proc_id, MLC_HIT); - STAT_EVENT(req->proc_id, CORE_MLC_HIT); - STAT_EVENT(req->proc_id, MLC_HIT_ONPATH + req->off_path); - if(0 && DEBUG_EXC_INSERTS) { - printf("addr:%s hit in MLC type:%s\n", hexstr64s(req->addr), - Mem_Req_Type_str(req->type)); + if(data->prefetch) { // prefetch hit + DEBUG(req->proc_id, "%7lld mlc prefetch hit %d\n", cycle_count, + (int)(req->addr)); + STAT_EVENT(req->proc_id, MLC_PREF_HIT); + if(!data->seen_prefetch) { + data->seen_prefetch = TRUE; + + STAT_EVENT(req->proc_id, MLC_PREF_UNIQUE_HIT); + STAT_EVENT(req->proc_id, PREF_MLC_TOTAL_USED); + STAT_EVENT(req->proc_id, CORE_PREF_MLC_USED); + STAT_EVENT(req->proc_id, CORE_MLC_PREF_FILL_USED); + } } } + data->dirty |= (req->type == MRT_WB || req->type == MRT_DSTORE); + } - STAT_EVENT_ALL(MLC_HIT_ALL); - STAT_EVENT_ALL(MLC_HIT_ALL_ONPATH + req->off_path); + if(req->type == MRT_DPRF || req->type == MRT_IPRF || + req->demand_match_prefetch) { + STAT_EVENT(req->proc_id, MLC_PREF_REQ_HIT); + STAT_EVENT(req->proc_id, CORE_MLC_PREF_REQ_HIT); + } else if((req->type == MRT_DFETCH) || (req->type == MRT_DSTORE) || + (req->type == MRT_IFETCH)) { + STAT_EVENT(req->proc_id, MLC_DEMAND_HIT); + STAT_EVENT(req->proc_id, CORE_MLC_DEMAND_HIT); + } else { // CMP Watch out RA + STAT_EVENT(req->proc_id, MLC_WB_HIT); + STAT_EVENT(req->proc_id, CORE_MLC_WB_HIT); + } - // cmp IGNORE - if(req->off_path) - STAT_EVENT(req->proc_id, MLC_HIT_OFFPATH_IFETCH + 
MIN2(req->type, 6)); - else - STAT_EVENT(req->proc_id, MLC_HIT_ONPATH_IFETCH + MIN2(req->type, 6)); - - if(MLC_WRITE_THROUGH && (req->type == MRT_WB)) { - req->state = MRS_L1_NEW; - req->rdy_cycle = cycle_count + MLCQ_TO_L1Q_TRANSFER_LATENCY; - } else { // writeback done - /* Remove the entry from request buffer */ - req->state = MRS_MLC_HIT_DONE; - mem_free_reqbuf(req); + if((req->type == MRT_DFETCH) || (req->type == MRT_DSTORE) || + (req->type == MRT_IFETCH)) { + STAT_EVENT(req->proc_id, MLC_HIT); + STAT_EVENT(req->proc_id, CORE_MLC_HIT); + STAT_EVENT(req->proc_id, MLC_HIT_ONPATH + req->off_path); + if(0 && DEBUG_EXC_INSERTS) { + printf("addr:%s hit in MLC type:%s\n", hexstr64s(req->addr), + Mem_Req_Type_str(req->type)); } + } - /* Set the priority so that this entry will be removed from the mlc_queue */ - mlc_queue_entry->priority = Mem_Req_Priority_Offset[MRT_MIN_PRIORITY]; + STAT_EVENT_ALL(MLC_HIT_ALL); + STAT_EVENT_ALL(MLC_HIT_ALL_ONPATH + req->off_path); - return TRUE; - } else { - return FALSE; + // cmp IGNORE + if(req->off_path) + STAT_EVENT(req->proc_id, MLC_HIT_OFFPATH_IFETCH + MIN2(req->type, 6)); + else + STAT_EVENT(req->proc_id, MLC_HIT_ONPATH_IFETCH + MIN2(req->type, 6)); + + ///////////////////////////////////////////////////////////////// + // main logic for handling mlc hit + // Case 0, deprecated: wb + if(MLC_WRITE_THROUGH && (req->type == MRT_WB)) { + req->state = MRS_L1_NEW; + req->rdy_cycle = cycle_count + MLCQ_TO_L1Q_TRANSFER_LATENCY; + } + // Case 1: is a prefetch, free the req here + else if(!req->done_func) { + /* Remove the entry from request buffer */ + req->state = MRS_MLC_HIT_DONE; + mem_free_reqbuf(req); + } + // Case 2: steer the req to cores' fill queue + else { + req->state = MRS_MLC_HIT_DONE; + req->rdy_cycle = freq_cycle_count(FREQ_DOMAIN_CORES[req->proc_id]); + req->queue = &(mem->core_fill_queues[req->proc_id]); + if(data->dirty) + req->dirty_l0 = TRUE; + if(!ORDER_BEYOND_BUS) { + mem_insert_req_into_queue(req, req->queue, + ALL_FIFO_QUEUES ? + core_fill_seq_num[req->proc_id] : + mlc_queue_entry->priority); + } else { + mem_insert_req_into_queue( + req, req->queue, ALL_FIFO_QUEUES ? core_fill_seq_num[req->proc_id] : 0); + core_fill_seq_num[req->proc_id]++; + } } -} -/**************************************************************************************/ -/* mem_process_l1_miss_access: */ + /* Set the priority so that this entry will be removed from the mlc_queue */ + mlc_queue_entry->priority = Mem_Req_Priority_Offset[MRT_MIN_PRIORITY]; + + return TRUE; +} +/** + * @brief Miss path for LLC access + * + * if WB, inserted into L1 array. Otherwise, send to ramulator (handled by the + * caller) + * + * + * @param req + * @param l1_queue_entry + * @param line_addr + * @param data + * @return Flag return 1 if miss processing finished correctly + */ static Flag mem_process_l1_miss_access(Mem_Req* req, Mem_Queue_Entry* l1_queue_entry, Addr* line_addr, L1_Data* data) { @@ -1238,6 +1344,7 @@ static Flag mem_process_l1_miss_access(Mem_Req* req, hexstr64s(req->addr), req->l1_bank, req->size, mem_req_state_names[req->state]); + // collect stats if(!req->l1_miss) { // have we collected these statistics already? 
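As a side note on the miss path documented above (write-backs fill the L1 array, everything else is sent on to ramulator by the caller), the following is a simplified sketch, not part of the patch, of that decision; fill_line() is a hypothetical callback that can fail when no victim can be evicted this cycle.

#include <stdbool.h>

typedef enum { REQ_WB, REQ_WB_NODIRTY, REQ_DEMAND, REQ_PREF } ReqKind;

typedef struct {
  ReqKind kind;
  bool    marked_miss;   /* l1_miss-style flag consumed by the caller */
} MissReq;

/* Returns true when the request is finished at this level, false when the
 * caller should keep it in the queue and retry on a later cycle.          */
static bool handle_llc_miss(MissReq *r, bool (*fill_line)(MissReq *)) {
  if (r->kind == REQ_WB || r->kind == REQ_WB_NODIRTY)
    return fill_line(r);   /* write-backs allocate directly into the LLC  */
  r->marked_miss = true;   /* demand/prefetch misses head toward DRAM     */
  return true;             /* the caller forwards the request downstream  */
}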
if(req->type == MRT_DFETCH || req->type == MRT_DSTORE || req->type == MRT_IFETCH) { @@ -1284,13 +1391,17 @@ static Flag mem_process_l1_miss_access(Mem_Req* req, STAT_EVENT(req->proc_id, L1_MISS_ONPATH_IFETCH + MIN2(req->type, 6)); } + /* + * Case 1: if the request is a write back request then the processor just + * insert the request to the L1 cache + */ if((req->type == MRT_WB) || (req->type == MRT_WB_NODIRTY)) { - // if the request is a write back request then the processor just insert the - // request to the L1 cache - if(req->type == MRT_WB_NODIRTY) + if(!EXCLUSIVE_L1 && req->type == MRT_WB_NODIRTY) WARNING(0, "CMP: A WB_NODIRTY request found! Check it out!"); + // install the blk and descruct the req if(req->done_func) { + // this should be rare, wb usually don't bind done_func ASSERT(req->proc_id, ALLOW_TYPE_MATCHES); ASSERT(req->proc_id, req->wb_requested_back); if(req->done_func(req)) { @@ -1310,6 +1421,8 @@ static Flag mem_process_l1_miss_access(Mem_Req* req, } else { STAT_EVENT(req->proc_id, WB_L1_MISS_FILL_L1); // CMP remove this later if(!l1_fill_line(req)) { + // if cannot insert the blk, need to inform the caller not to delete + // the req from the queue (and retry later) req->rdy_cycle = cycle_count + 1; return FALSE; } @@ -1327,6 +1440,9 @@ static Flag mem_process_l1_miss_access(Mem_Req* req, } } + /** + * Case 2: TODO check the situation here + */ if(STALL_MEM_REQS_ONLY && !mem_req_type_is_stalling(req->type)) { // not calling done_func to avoid filling caches req->state = MRS_INV; @@ -1336,42 +1452,31 @@ static Flag mem_process_l1_miss_access(Mem_Req* req, return TRUE; } - /* Mark the request as L1_miss */ + /** + * Case 3: just need to propogate the miss downwards (handled in the caller of + * this func) + */ req->l1_miss = TRUE; req->l1_miss_cycle = cycle_count; if((CONSTANT_MEMORY_LATENCY && !queue_full(&mem->l1fill_queue)) || - //(!CONSTANT_MEMORY_LATENCY && !queue_full(&mem->bus_out_queue))) { (!CONSTANT_MEMORY_LATENCY)) { - // Ramulator: moving the lines below to where ramulator_send() is called - - //// cmp FIXME - // if (TRACK_L1_MISS_DEPS || MARK_L1_MISSES) - // mark_ops_as_l1_miss(req); - - // req->state = MRS_BUS_NEW; // FIXME? - // req->rdy_cycle = cycle_count + L1Q_TO_FSB_TRANSFER_LATENCY; /* this req - // will be ready to be sent to memory in the next cycle */ - - //// cmp FIXME - // if (STREAM_PREFETCH_ON) - // stream_ul1_miss (req); - - ///* Set the priority so that this entry will be removed from the l1_queue - ///*/ - // l1_queue_entry->priority = Mem_Req_Priority_Offset[MRT_MIN_PRIORITY]; - - // STAT_EVENT(req->proc_id, SEND_MISS_REQ_QUEUE); return TRUE; } else { - // STAT_EVENT(req->proc_id, REJECTED_QUEUE_BUS_OUT); return FALSE; } } -/**************************************************************************************/ -/* mem_process_mlc_miss_access: */ +/** + * @brief Miss path for MLC accesses + * + * @param req + * @param mlc_queue_entry + * @param line_addr + * @param data + * @return Flag + */ static Flag mem_process_mlc_miss_access(Mem_Req* req, Mem_Queue_Entry* mlc_queue_entry, Addr* line_addr, MLC_Data* data) { @@ -1416,12 +1521,14 @@ static Flag mem_process_mlc_miss_access(Mem_Req* req, req->mlc_miss = TRUE; req->mlc_miss_cycle = cycle_count; + // Case 0: if WB, directly insert to MLC if((req->type == MRT_WB) || (req->type == MRT_WB_NODIRTY)) { // if the request is a write back request then the processor just insert the // request to the MLC cache if(req->type == MRT_WB_NODIRTY) WARNING(0, "CMP: A WB_NODIRTY request found! 
Check it out!"); + // WQ: WB with a done func should be rare (dc miss won't bind func_done) if(req->done_func) { ASSERT(req->proc_id, ALLOW_TYPE_MATCHES); ASSERT(req->proc_id, req->wb_requested_back); @@ -1438,6 +1545,7 @@ static Flag mem_process_mlc_miss_access(Mem_Req* req, } } else { STAT_EVENT(req->proc_id, WB_MLC_MISS_FILL_MLC); // CMP remove this later + // WQ TODO: check if mlc_fill can potentially fail mlc_fill_line(req); if(MLC_WRITE_THROUGH && req->type == MRT_WB) { req->state = MRS_L1_NEW; @@ -1451,7 +1559,7 @@ static Flag mem_process_mlc_miss_access(Mem_Req* req, return TRUE; } } - + // Case 1: otherwise, send req to downwards (l1) if(!queue_full(&mem->l1_queue)) { req->state = MRS_L1_NEW; req->rdy_cycle = cycle_count + @@ -1467,11 +1575,19 @@ static Flag mem_process_mlc_miss_access(Mem_Req* req, } } -/**************************************************************************************/ -/* mem_complete_l1_access: */ -/* Returns TRUE if l1 access is complete and needs to be removed from l1_queue +/** + * @brief Process the L1 reg already obtain the port + * + * If hit in L1, send req back upwards. Otherwise try sending it out to bus + * (ramulator) + * + * @param req + * @param l1_queue_entry + * @param out_queue_insertion_count + * @param reserved_entry_count + * @return Flag TRUE if l1 access is complete and needs to be removed from + * l1_queue */ - static Flag mem_complete_l1_access(Mem_Req* req, Mem_Queue_Entry* l1_queue_entry, int* out_queue_insertion_count, @@ -1542,8 +1658,11 @@ static Flag mem_complete_l1_access(Mem_Req* req, if(!PREFETCH_UPDATE_LRU_L1 && (req->type == MRT_DPRF || req->type == MRT_IPRF)) update_l1_lru = FALSE; + + // lookup LLC, data set to NULL on miss data = (L1_Data*)cache_access(&L1(req->proc_id)->cache, req->addr, &line_addr, - update_l1_lru); // access L2 + update_l1_lru); + // update the shadow cache cache_part_l1_access(req); if(FORCE_L1_MISS) data = NULL; @@ -1568,13 +1687,13 @@ static Flag mem_complete_l1_access(Mem_Req* req, !data) /* do not put into L2 if this is a prefetch or off-path */ data = l1_pref_cache_access(req); - Flag access_done = TRUE; - if(data || PERFECT_L1) { /* l1 hit */ - // if exclusive cache, invalidate the line in L2 if there is a done function - // to transfer the data to L1 -- also need to propagate the dirty to L1 + Flag access_done = TRUE; // This flag tells whether to remove the req from + // L1_queue + if(data || PERFECT_L1) { /* l1 hit */ Flag l1_hit_access = mem_process_l1_hit_access( req, l1_queue_entry, &line_addr, data, lru_position); if(!l1_hit_access) + // WQ: this should not happen access_done = FALSE; else { if(!PREF_ORACLE_TRAIN_ON && @@ -1588,12 +1707,10 @@ static Flag mem_complete_l1_access(Mem_Req* req, pref_ul1_hit(req->proc_id, req->addr, req->loadPC, req->global_hist); } - if(L1_WRITE_THROUGH && (req->type == MRT_WB) && + // propogate to dram for writethrough cache regardless of hit/miss + if(L1_WRITE_THROUGH && + (req->type == MRT_WB || req->type == MRT_WB_NODIRTY) && !CONSTANT_MEMORY_LATENCY) { - // req->queue = &(mem->bus_out_queue); - - // mem_insert_req_into_queue (req, req->queue, ALL_FIFO_QUEUES ? 
- // bus_out_seq_num : 0); ASSERT(req->proc_id, MRS_L1_WAIT == req->state); req->state = MRS_MEM_NEW; l1_hit_access = ramulator_send(req); @@ -1611,13 +1728,8 @@ static Flag mem_complete_l1_access(Mem_Req* req, // perf_pred_mem_req_start(req); mem_free_reqbuf(req); } - - // bus_out_seq_num++; - //(*out_queue_insertion_count) += 1; - // STAT_EVENT(req->proc_id, BUS_ACCESS); } } - // CMP IGNORE } else { /* l1 miss */ /* if req is wb then either fill l1 or try again */ Flag l1_miss_send_bus = (L1_WRITE_THROUGH && (req->type == MRT_WB)) || @@ -1627,6 +1739,9 @@ static Flag mem_complete_l1_access(Mem_Req* req, l1_miss_send_bus = FALSE; Flag l1_miss_access = mem_process_l1_miss_access(req, l1_queue_entry, &line_addr, data); + // send a miss req downwards + // WQ: to be consistent, this blk need to be moved in + // mem_process_l1_miss_access if(l1_miss_access && l1_miss_send_bus) { if(CONSTANT_MEMORY_LATENCY) { mem->uncores[req->proc_id].num_outstanding_l1_misses++; @@ -1648,20 +1763,14 @@ static Flag mem_complete_l1_access(Mem_Req* req, STAT_EVENT(req->proc_id, POWER_DRAM_ACTIVATE); STAT_EVENT(req->proc_id, POWER_DRAM_READ); } else { - // Ramulator remove - // req->queue = &(mem->bus_out_queue); - // mem_insert_req_into_queue (req, req->queue, ALL_FIFO_QUEUES ? - // bus_out_seq_num : 0); - ASSERT(req->proc_id, MRS_L1_WAIT == req->state); req->state = MRS_MEM_NEW; l1_miss_access = ramulator_send(req); if(!l1_miss_access) { - // STAT_EVENT(req->proc_id, REJECTED_QUEUE_BUS_OUT); - + // Fail to send req to dram req->state = MRS_L1_WAIT; access_done = FALSE; - } else { + } else { // send to dram succeed ASSERT(req->proc_id, req->mem_queue_cycle >= req->rdy_cycle); req->queue = NULL; @@ -1673,10 +1782,6 @@ static Flag mem_complete_l1_access(Mem_Req* req, if(TRACK_L1_MISS_DEPS || MARK_L1_MISSES) mark_ops_as_l1_miss(req); - // req->state = MRS_BUS_NEW; // FIXME? 
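The send-to-DRAM handshake in this hunk follows a simple optimistic pattern: flip the request into the DRAM-bound state, and roll back to the wait state if the controller refuses it. A minimal sketch with a stand-in send callback (not the actual ramulator_send signature):

#include <stdbool.h>

typedef enum { ST_LLC_WAIT, ST_MEM_NEW } MissState;
typedef struct { MissState state; } DramReq;

static bool try_send_to_dram(DramReq *req, bool (*send)(DramReq *)) {
  req->state = ST_MEM_NEW;    /* optimistically move to the DRAM-bound state  */
  if (!send(req)) {           /* controller queue full: roll back and retry   */
    req->state = ST_LLC_WAIT;
    return false;             /* entry stays in the LLC request queue         */
  }
  return true;                /* DRAM owns the request until the fill returns */
}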
- // req->rdy_cycle = cycle_count + L1Q_TO_FSB_TRANSFER_LATENCY; /* this - // req will be ready to be sent to memory in the next cycle */ - // cmp FIXME if(STREAM_PREFETCH_ON) stream_ul1_miss(req); @@ -1749,11 +1854,17 @@ static Flag mem_complete_l1_access(Mem_Req* req, return access_done; } -/**************************************************************************************/ -/* mem_complete_mlc_access: */ -/* Returns TRUE if mlc access is complete and needs to be removed from mlc_queue - */ +/** + * @brief Access MLC array + * + * @param req + * @param mlc_queue_entry + * @param l1_queue_insertion_count + * @param reserved_entry_count + * @return Flag Returns TRUE if mlc access is complete and needs to be removed + * from mlc_queue + */ static Flag mem_complete_mlc_access(Mem_Req* req, Mem_Queue_Entry* mlc_queue_entry, int* l1_queue_insertion_count, @@ -1770,11 +1881,9 @@ static Flag mem_complete_mlc_access(Mem_Req* req, &line_addr, update_mlc_lru); // access MLC if(data || PERFECT_MLC) { /* mlc hit */ - // if exclusive cache, invalidate the line in L2 if there is a done function - // to transfer the data to MLC -- also need to propagate the dirty to MLC Flag mlc_hit_access = mem_process_mlc_hit_access( req, mlc_queue_entry, &line_addr, data, lru_position); - if(!mlc_hit_access) { + if(!mlc_hit_access) { // not gonna happen return FALSE; } else { if(!PREF_ORACLE_TRAIN_ON && @@ -1788,7 +1897,9 @@ static Flag mem_complete_mlc_access(Mem_Req* req, pref_umlc_hit(req->proc_id, req->addr, req->loadPC, req->global_hist); } - if(MLC_WRITE_THROUGH && (req->type == MRT_WB)) { + // wb for write through cache + if(MLC_WRITE_THROUGH && + (req->type == MRT_WB || req->type == MRT_WB_NODIRTY)) { req->queue = &(mem->l1_queue); mem_insert_req_into_queue(req, req->queue, ALL_FIFO_QUEUES ? 
l1_seq_num : 0); @@ -1833,21 +1944,25 @@ static Flag mem_complete_mlc_access(Mem_Req* req, // Train the Data prefetcher pref_umlc_miss(req->proc_id, req->addr, req->loadPC, req->global_hist); } - return TRUE; } else if(!mlc_miss_access) { + // miss process is not ready return FALSE; + } else { + return TRUE; } - return TRUE; } ASSERT(req->proc_id, 0); } -/**************************************************************************************/ -/* mem_process_new_reqs: */ -/* Access L1 if port is ready - If L1 miss, then put the request into miss queue - */ +/** + * @brief Access path for all reqs from upward into L1 + * + * WQ: modeling for DSTORE is largely off now: obtain port & handling of + * write miss + * + */ static void mem_process_l1_reqs() { Mem_Req* req = NULL; int ii; @@ -1857,7 +1972,6 @@ static void mem_process_l1_reqs() { int l1_queue_reserve_entry_count = 0; /* Go thru the l1_queue and try to access L1 for each request */ - for(ii = 0; ii < mem->l1_queue.entry_count; ii++) { reqbuf_id = mem->l1_queue.base[ii].reqbuf; req = &(mem->req_buffer[reqbuf_id]); @@ -1880,9 +1994,8 @@ static void mem_process_l1_reqs() { /* Request is ready: see what state it is in */ - /* If this is a new request, reserve L1 port and transition to wait state */ if(req->state == MRS_L1_NEW) { - mem_start_l1_access(req); + mem_start_l1_access(req); // obtain port for req, change req->state STAT_EVENT(req->proc_id, L1_ACCESS); if(req->type == MRT_DPRF || req->type == MRT_IPRF) STAT_EVENT(req->proc_id, L1_PREF_ACCESS); @@ -1895,6 +2008,7 @@ static void mem_process_l1_reqs() { mem->req_count, mem->l1_queue.entry_count, mem->bus_out_queue.entry_count, mem->l1fill_queue.entry_count); + // actual logic for accessing L1 array if(mem_complete_l1_access(req, &(mem->l1_queue.base[ii]), &out_queue_insertion_count, &l1_queue_reserve_entry_count)) @@ -1934,11 +2048,10 @@ static void mem_process_l1_reqs() { } } -/**************************************************************************************/ -/* mem_process_mlc_reqs: */ -/* Access MLC if port is ready - If MLC miss, then put the request into miss - * queue */ +/** + * @brief Access path for all req coming from core side into MLC + */ static void mem_process_mlc_reqs() { Mem_Req* req = NULL; int ii; @@ -2020,10 +2133,15 @@ static void mem_process_mlc_reqs() { } } -/**************************************************************************************/ -/* mem_process_bus_out_reqs: */ -/* FIXME: need to busy the bus for the time a line is being sent */ - +/** + * @deprecated + * @brief Obsolete, bus_out is repalced by ramulator. The function + * will still be called but since bus_out_queue is supposed to always + * be 0, the first return will take the execution out of the function + * + * To send req to DRAM, use ramulator_send() + * + */ static void mem_process_bus_out_reqs() { Mem_Req* req; int ii; @@ -2043,6 +2161,7 @@ static void mem_process_bus_out_reqs() { // return; // VEYNU: if there is no room in the mem queue do nothing return; // Ramulator: early return if bus_out_queue is empty } + ASSERTM(0, FALSE, "ERROR: bus_out_queue should always be empty\n"); // Ramulator // Ramulator handles off-chip communication latency itself. 
So we @@ -2347,9 +2466,19 @@ static void mem_process_bus_out_reqs() { } } -/**************************************************************************************/ -/* mem_complete_bus_in_access: */ +/** + * @brief Add req into uncore queues + * + * Depends on uncore config, this func will either steer reqs + * into l1fill_queue or to mlc_fill_queue, and change the req->state + * correspondingly. + * + * Ramulator call this func to return serviced reqs + * + * @param req + * @param priority + */ void mem_complete_bus_in_access(Mem_Req* req, Counter priority) { DEBUG(req->proc_id, "Mem request completed bus in access index:%ld type:%s addr:0x%s " @@ -2357,22 +2486,32 @@ void mem_complete_bus_in_access(Mem_Req* req, Counter priority) { (long int)(req - mem->req_buffer), Mem_Req_Type_str(req->type), hexstr64s(req->addr), req->size, mem_req_state_names[req->state]); - req->state = MRS_FILL_L1; - - /* Crossing frequency domain boundary between the chip and memory controller - */ - req->rdy_cycle = freq_cycle_count(FREQ_DOMAIN_L1) + 1; - - req->queue = &(mem->l1fill_queue); + // usually the dest are either L1 (prefetchers) or None (demandings) + // WQ TODO: MLC prefetch + Counter* fill_seq_num; + if(req->destination == DEST_L1 || !MLC_PRESENT || !EXCLUSIVE_L1) { + req->state = MRS_FILL_L1; + // Crossing frequency domain boundary between the chip and memory controller + req->rdy_cycle = freq_cycle_count(FREQ_DOMAIN_L1) + 1; + req->queue = &(mem->l1fill_queue); + fill_seq_num = &l1fill_seq_num; + } else { + req->state = MRS_FILL_MLC; + req->rdy_cycle = freq_cycle_count(FREQ_DOMAIN_L1) + 1; + req->queue = &(mem->mlc_fill_queue); + fill_seq_num = &mlc_fill_seq_num; + } - if(!ORDER_BEYOND_BUS) + if(!ORDER_BEYOND_BUS) { mem_insert_req_into_queue(req, req->queue, - ALL_FIFO_QUEUES ? l1fill_seq_num : priority); - else + ALL_FIFO_QUEUES ? *fill_seq_num : priority); + } else { mem_insert_req_into_queue(req, req->queue, - ALL_FIFO_QUEUES ? l1fill_seq_num : 0); + ALL_FIFO_QUEUES ? *fill_seq_num : 0); + } + (*fill_seq_num)++; - l1fill_seq_num++; + // WQ TODO: currently there is no dedicate counter for mlc, reuse l1 counters ASSERT(req->proc_id, mem->uncores[req->proc_id].num_outstanding_l1_misses > 0); mem->uncores[req->proc_id].num_outstanding_l1_misses--; @@ -2380,6 +2519,7 @@ void mem_complete_bus_in_access(Mem_Req* req, Counter priority) { if(!CONSTANT_MEMORY_LATENCY && !PERF_PRED_REQS_FINISH_AT_FILL) perf_pred_mem_req_done(req); + // collect stats if(req->type != MRT_WB_NODIRTY && req->type != MRT_WB) { INC_STAT_EVENT_ALL(TOTAL_MEM_LATENCY, req->rdy_cycle - req->mem_queue_cycle); @@ -2405,12 +2545,25 @@ void mem_complete_bus_in_access(Mem_Req* req, Counter priority) { } } +/** + * @brief Remove req from l1_fill_queue by change the l1fill_queue.entry_count + * + * the removal_count should be pre-populated before this call. 
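The removal scheme described here (and implemented just below) boils down to a mark/sort/truncate idiom. A self-contained sketch with simplified types; the sentinel value and helper names are assumptions, not scarab's actual definitions:

#include <stdlib.h>

typedef struct { unsigned long long priority; int reqbuf; } QEntry;
typedef struct { QEntry *base; int entry_count; } Queue;

#define PRIO_REMOVE 0xffffffffffffffffULL  /* assumed sentinel: sorts to the tail */

static void mark_for_removal(QEntry *e, int *removal_count) {
  e->priority = PRIO_REMOVE;
  (*removal_count)++;
}

static int cmp_priority(const void *a, const void *b) {
  const QEntry *x = a, *y = b;
  if (x->priority < y->priority) return -1;
  return x->priority > y->priority;
}

static void remove_marked(Queue *q, int *removal_count) {
  if (*removal_count == 0) return;
  qsort(q->base, q->entry_count, sizeof(QEntry), cmp_priority);
  q->entry_count -= *removal_count;  /* marked entries now sit at the tail */
  *removal_count  = 0;
}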
The # of + * removal_count req with lowest priority will be removed + * + * removal_count will be reset after the call + * + * @param proc_id + * @param p_l1fill_queue_removal_count + */ static void remove_from_l1_fill_queue(uns proc_id, int* p_l1fill_queue_removal_count) { /* Remove requests from l1 fill queue */ if(*p_l1fill_queue_removal_count > 0) { /* After this sort requests that should be removed will be at the tail of * the l1_queue */ + // WQ TODO: assert the num of req with MIN_PROORITY is always equal to the + // removal_count DEBUG(0, "l1fill_queue removal\n"); qsort(mem->l1fill_queue.base, mem->l1fill_queue.entry_count, sizeof(Mem_Queue_Entry), mem_compare_priority); @@ -2427,17 +2580,21 @@ static void remove_from_l1_fill_queue(uns proc_id, } /** - * @brief + * @brief Handle reqs in L1 fill queue * */ static void mem_process_l1_fill_reqs() { Mem_Req* req = NULL; int ii; int reqbuf_id; - int l1fill_queue_removal_count = 0; - /* Go thru the l1fill_queue */ + // control the num of req in fill_queue to remove in each call + // main logic will update this var, and the final call of + // remove_from_l1_fill_queue will take this var and remove the exact + // number from the tail of the queue (sorted by priority) + int l1fill_queue_removal_count = 0; + /* Go thru the l1fill_queue */ for(ii = 0; ii < mem->l1fill_queue.entry_count; ii++) { reqbuf_id = mem->l1fill_queue.base[ii].reqbuf; req = &(mem->req_buffer[reqbuf_id]); @@ -2450,17 +2607,27 @@ static void mem_process_l1_fill_reqs() { if(cycle_count < req->rdy_cycle) continue; + // reqs in L1_FILL_QUEUE will be in one of 3 states at any given time: + // 1) Fill_l1 + // 2) Fill_mlc + // 3) Done + // Initially when mem_complete_bus_in_access steer reqs to here, the req + // state should be Fill_l1, after the req got filled into L1, depends on + // whether the L1 is exclusive and req's dest, the state either transit + // into FILL_MLC or FILL_DONE + // req will be removed (controlled by l1fill_queue_removal_count) if(req->state == MRS_FILL_L1) { DEBUG(req->proc_id, "Mem request about to fill L1 index:%ld type:%s addr:0x%s " "size:%d state: %s\n", (long int)(req - mem->req_buffer), Mem_Req_Type_str(req->type), hexstr64s(req->addr), req->size, mem_req_state_names[req->state]); + // install the line into cache if(l1_fill_line(req)) { ASSERT(0, req->type != MRT_WB && req->type != MRT_WB_NODIRTY); if(CONSTANT_MEMORY_LATENCY) perf_pred_mem_req_done(req); - if(MLC_PRESENT && req->destination != DEST_L1) { + if(MLC_PRESENT && !EXCLUSIVE_L1 && req->destination != DEST_L1) { req->state = MRS_FILL_MLC; req->rdy_cycle = cycle_count + 1; } else { @@ -2488,15 +2655,20 @@ static void mem_process_l1_fill_reqs() { mem_insert_req_into_queue(req, req->queue, ALL_FIFO_QUEUES ? mlc_fill_seq_num : 0); mlc_fill_seq_num++; - // remove from l1fill queue - how do we handle this now? - if(HIER_MSHR_ON) + if(HIER_MSHR_ON) { req->reserved_entry_count -= 1; + } + // remove from l1fill queue l1fill_queue_removal_count++; + // MIN_PRIOTITY will guarantte this req be moved, WQ: is this always true? 
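The three-state walk described above (FILL_L1, then either FILL_MLC or FILL_DONE) can be summarized by a small transition function. This is a sketch under simplified assumptions, not part of the patch, and it ignores the retry when the array insert fails:

typedef enum { FILL_LLC, FILL_MLC, FILL_DONE } FillState;

static FillState next_fill_state(FillState s, int mlc_present, int exclusive_l1,
                                 int dest_is_llc_only) {
  switch (s) {
    case FILL_LLC:
      /* exclusive hierarchies and LLC-only prefetches skip the MLC fill */
      if (mlc_present && !exclusive_l1 && !dest_is_llc_only)
        return FILL_MLC;
      return FILL_DONE;
    case FILL_MLC:
    default:
      return FILL_DONE;
  }
}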
+ // feels like a dangerous design mem->l1fill_queue.base[ii].priority = Mem_Req_Priority_Offset[MRT_MIN_PRIORITY]; } else { ASSERT(req->proc_id, req->state == MRS_FILL_DONE); - if(!req->done_func) { + // The existance of done_func indicates whether this req should + // ultimately be sent back to core + if(!req->done_func) { // req should be resolved at here (l1) if(HIER_MSHR_ON) req->reserved_entry_count -= 1; @@ -2508,8 +2680,11 @@ static void mem_process_l1_fill_reqs() { mem->l1fill_queue.base[ii].priority = Mem_Req_Priority_Offset[MRT_MIN_PRIORITY]; - remove_from_l1_fill_queue(req->proc_id, &l1fill_queue_removal_count); - } else { + // WQ: seems uncessary to call here + // remove_from_l1_fill_queue(req->proc_id, &l1fill_queue_removal_count); + } else { // steer req to core's fill queue + // this should not happen when MLC is present (regardless of incl/excl) + // WQ TODO: put an assert here req->rdy_cycle = freq_cycle_count( FREQ_DOMAIN_CORES[req->proc_id]); // no +1 to match old performance // insert into core fill queue @@ -2532,15 +2707,19 @@ static void mem_process_l1_fill_reqs() { } } - if(req) { + if(req) { // predicate is only to make sure req->proc_id is valid remove_from_l1_fill_queue(req->proc_id, &l1fill_queue_removal_count); } } -/**************************************************************************************/ -/* mem_process_mlc_fill_reqs: */ - +/** + * @brief Fill resp into mlc + * + * Pending reqs in the MLC_QUEUE_FILL will be inserted and the + * req state will turn MRS_FILL_DONE. Next time this func got + * called, all reqs of MRS_FILL_DONE will be freed. + */ static void mem_process_mlc_fill_reqs() { Mem_Req* req; int ii; @@ -2562,6 +2741,7 @@ static void mem_process_mlc_fill_reqs() { if(cycle_count < req->rdy_cycle) continue; + // either from DRAM or L1, depends on the cache inclusivitity if(req->state == MRS_FILL_MLC) { DEBUG(req->proc_id, "Mem request about to fill MLC index:%ld type:%s addr:0x%s " @@ -2569,26 +2749,45 @@ static void mem_process_mlc_fill_reqs() { (long int)(req - mem->req_buffer), Mem_Req_Type_str(req->type), hexstr64s(req->addr), req->size, mem_req_state_names[req->state]); if(mlc_fill_line(req)) { + // mark done if req successfully writen into the array req->state = MRS_FILL_DONE; req->rdy_cycle = cycle_count + 1; } } else { ASSERT(req->proc_id, req->state == MRS_FILL_DONE); - if(!req->done_func || req->done_func(req)) { + // WQ: this looks wrong, done_func is not supposed to be called here + // if(!req->done_func || req->done_func(req)) { + if(!req->done_func) { // reqs supposed to be resolved here if(HIER_MSHR_ON) req->reserved_entry_count -= 1; - // Free the request buffer mem_free_reqbuf(req); - - // remove from mlc_fill queue - how do we handle this now? - mlc_fill_queue_removal_count++; - mem->mlc_fill_queue.base[ii].priority = - Mem_Req_Priority_Offset[MRT_MIN_PRIORITY]; + } else { // needs to be delivered to core's fill_queue + req->rdy_cycle = freq_cycle_count( + FREQ_DOMAIN_CORES[req->proc_id]); // no +1 to match old performance + // insert into core fill queue + req->queue = &(mem->core_fill_queues[req->proc_id]); + if(!ORDER_BEYOND_BUS) + mem_insert_req_into_queue(req, req->queue, + ALL_FIFO_QUEUES ? + core_fill_seq_num[req->proc_id] : + mem->l1fill_queue.base[ii].priority); + else + mem_insert_req_into_queue( + req, req->queue, + ALL_FIFO_QUEUES ? core_fill_seq_num[req->proc_id] : 0); + core_fill_seq_num[req->proc_id]++; } + + // remove from mlc_fill queue - how do we handle this now? 
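The done_func test in this hunk is the general completion-callback convention of the request structure: the requester binds a function (the caches typically bind their fill-line routine) when the request is created, and the fill path invokes it once the data arrives; requests without a callback die at the fill level. A small illustrative sketch follows; the names are stand-ins, not the real dcache interface.

#include <stdbool.h>

typedef struct CbReq {
  unsigned long long addr;
  bool (*done_func)(struct CbReq *);   /* NULL for fire-and-forget prefetches */
} CbReq;

static bool stub_dcache_fill_line(CbReq *r) {
  /* install r->addr into the data cache here; return true when finished */
  (void)r;
  return true;
}

static void issue_demand_load(CbReq *r, unsigned long long addr) {
  r->addr      = addr;
  r->done_func = stub_dcache_fill_line;  /* deliver the line back to the core */
}

static void complete_fill(CbReq *r, bool *free_req) {
  /* free the request once the callback (if any) reports completion */
  *free_req = r->done_func ? r->done_func(r) : true;
}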
+ mlc_fill_queue_removal_count++; + mem->mlc_fill_queue.base[ii].priority = + Mem_Req_Priority_Offset[MRT_MIN_PRIORITY]; } } + // WQ: this is really inconsistent, this functionality for L1 is packed into + // a separate func... /* Remove requests from mlc access queue */ if(mlc_fill_queue_removal_count > 0) { /* After this sort requests that should be removed will be at the tail of @@ -2606,9 +2805,15 @@ static void mem_process_mlc_fill_reqs() { } } -/**************************************************************************************/ -/* mem_process_core_fill_reqs: */ - +/** + * @brief Upwards interface between memsys and queue + * + * reqs reach here should have a callback (done_func), req + * is fed back into core by calling this done_func. And then + * req will be marked as freed + * + * @param proc_id + */ static void mem_process_core_fill_reqs(uns proc_id) { Mem_Req* req; int ii; @@ -2627,11 +2832,13 @@ static void mem_process_core_fill_reqs(uns proc_id) { ASSERT(req->proc_id, (req->type != MRT_WB) || req->wb_requested_back); ASSERT(req->proc_id, req->type != MRT_WB_NODIRTY); ASSERT(req->proc_id, cycle_count >= req->rdy_cycle); - ASSERT(proc_id, - req->state == MRS_L1_HIT_DONE || req->state == MRS_FILL_DONE); + ASSERT(proc_id, req->state == MRS_L1_HIT_DONE || + req->state == MRS_MLC_HIT_DONE || + req->state == MRS_FILL_DONE); ASSERT(proc_id, req->done_func); // requests w/o done_func() should be done by now + // common used done func are i/dcache_fill_line if(req->done_func(req)) { // Free the request buffer mem_free_reqbuf(req); @@ -2797,14 +3004,25 @@ static inline Mem_Req* mem_search_queue( return matching_req; } -/**************************************************************************************/ -/* mem_search_reqbuf: */ +/** + * @brief Search given queues to check whether a req of addr exist + * + * @param proc_id + * @param addr + * @param type + * @param size + * @param demand_hit_prefetch set if the matching req is a prefetch and a + demand hits it + * @param demand_hit_writeback + * @param queues_to_search + * @param queue_entry + * @param ramulator_match + * @return Mem_Req* + */ static inline Mem_Req* mem_search_reqbuf( uns8 proc_id, Addr addr, Mem_Req_Type type, uns size, - Flag* demand_hit_prefetch, /* set if the matching req is a prefetch and a - demand hits it */ - Flag* demand_hit_writeback, uns queues_to_search, + Flag* demand_hit_prefetch, Flag* demand_hit_writeback, uns queues_to_search, Mem_Queue_Entry** queue_entry, Flag* ramulator_match) { Mem_Req* req; ASSERTM(proc_id, proc_id == get_proc_id_from_cmp_addr(addr), @@ -3185,10 +3403,15 @@ Flag mem_can_allocate_req_buffer(uns proc_id, Mem_Req_Type type, return TRUE; } -/**************************************************************************************/ -/* mem_allocate_req_buffer: */ -/* If queue is specified, only allocates if its entry_count < size */ - +/** + * @brief alloc new req from the request buffer + * + * If queue is specified, only allocates if its entry_count < size + * @param proc_id + * @param type + * @param for_l1_writeback + * @return Mem_Req* + */ static inline Mem_Req* mem_allocate_req_buffer(uns proc_id, Mem_Req_Type type, Flag for_l1_writeback) { if(!mem_can_allocate_req_buffer(proc_id, type, for_l1_writeback)) @@ -3387,14 +3610,27 @@ static Mem_Req* mem_kick_out_oldest_first_prefetch_from_queues( return NULL; } -/**************************************************************************************/ -/* mem_init_new_req: */ - -static void mem_init_new_req( - Mem_Req* new_req, 
Mem_Req_Type type, Mem_Queue_Type queue_type, uns8 proc_id, - Addr addr, uns size, uns delay, Op* op, Flag done_func(Mem_Req*), - Counter unique_num, /* This counter is used when op is NULL */ - Flag kicked_out_another, Counter new_priority) { +/** + * @brief Populate the newly generated req + * + * @param new_req + * @param type + * @param queue_type + * @param proc_id + * @param addr + * @param size + * @param delay + * @param op + * @param done_func + * @param unique_num This counter is used when op is NULL + * @param kicked_out_another + * @param new_priority + */ +static void mem_init_new_req(Mem_Req* new_req, Mem_Req_Type type, + Mem_Queue_Type queue_type, uns8 proc_id, Addr addr, + uns size, uns delay, Op* op, + Flag done_func(Mem_Req*), Counter unique_num, + Flag kicked_out_another, Counter new_priority) { ASSERT(0, queue_type & (QUEUE_L1 | QUEUE_MLC)); Flag to_mlc = (queue_type == QUEUE_MLC); @@ -3436,9 +3672,12 @@ static void mem_init_new_req( new_req->mem_channel = CHANNEL(new_req->mem_flat_bank, RAMULATOR_BANKS); new_req->mem_bank = BANK_IN_CHANNEL(new_req->mem_flat_bank, RAMULATOR_BANKS); */ + + // WQ TODO: add support for configurable (bank) hashing schemes new_req->mlc_bank = BANK(addr, MLC(proc_id)->num_banks, MLC_INTERLEAVE_FACTOR); new_req->l1_bank = BANK(addr, L1(proc_id)->num_banks, L1_INTERLEAVE_FACTOR); + new_req->start_cycle = freq_cycle_count(FREQ_DOMAIN_L1) + delay; new_req->rdy_cycle = freq_cycle_count(FREQ_DOMAIN_L1) + delay; new_req->first_stalling_cycle = mem_req_type_is_stalling(type) ? @@ -3527,9 +3766,14 @@ static void mem_init_new_req( } -/**************************************************************************************/ -/* mem_insert_req_into_queue: */ - +/** + * @brief Put req into corresponding queues. Queue entry holds idx of req_buf + * + * @param new_req + * @param queue + * @param priority + * @return Mem_Queue_Entry* + */ static inline Mem_Queue_Entry* mem_insert_req_into_queue(Mem_Req* new_req, Mem_Queue* queue, Counter priority) { @@ -3595,10 +3839,18 @@ void mem_insert_req_round_robin() { } -/**************************************************************************************/ -/* new_mem_req: */ -/* Returns TRUE if the request is successfully entered into the memory system */ - +/** + * @brief Create new req and insert into correct queue + * + * Note this is one of the four sources where new_req will be generated + * in scarab simulation, the other three are: new_mem_dc/mlc/l1_wb_req and + * are used specific for write back reqs + * + * This func is used to create req when core has demanding LD/ST or for + * prefetchers (done_func is usually bound to d/icache_fill_line()) + * + * Returns TRUE if the request is successfully entered into the memory system + */ Flag new_mem_req(Mem_Req_Type type, uns8 proc_id, Addr addr, uns size, uns delay, Op* op, Flag done_func(Mem_Req*), Counter unique_num, /* This counter is used when op is NULL */ @@ -3614,6 +3866,10 @@ Flag new_mem_req(Mem_Req_Type type, uns8 proc_id, Addr addr, uns size, Counter priority_offset = freq_cycle_count(FREQ_DOMAIN_L1); Counter new_priority; Flag to_mlc = MLC_PRESENT && (!pref_info || pref_info->dest != DEST_L1); + + // Demand reqs will have dest of DEST_NONE, which means req will return to + // the core ultimately. Prefetch reqs can has different dest depends on + // pref_info Destination destination = (pref_info ? 
pref_info->dest : DEST_NONE); ASSERTM(proc_id, proc_id == get_proc_id_from_cmp_addr(addr), @@ -3668,7 +3924,6 @@ Flag new_mem_req(Mem_Req_Type type, uns8 proc_id, Addr addr, uns size, } /* Step 2: Found matching request. Adjust it based on the current request */ - if(matching_req) { // Simulation inaccuracy: an L2-destined request can match a request in the // MLC queue, not the other way around @@ -3770,6 +4025,7 @@ Flag new_mem_req(Mem_Req_Type type, uns8 proc_id, Addr addr, uns size, } } + // Use oracle info (look into cache hit/miss) to train prefetchers /* we model this more accurately by training the prefetcher when we actually * hit/miss if PREF_ORACLE_TRAIN_ON is off */ // cmp FIXME What can I do for the prefetcher? @@ -3852,9 +4108,13 @@ Flag new_mem_req(Mem_Req_Type type, uns8 proc_id, Addr addr, uns size, return insert_new_req_into_l1_queue(proc_id, new_req); } -/**************************************************************************************/ -/* insert_new_req_into_l1_queue: */ - +/** + * @brief + * + * @param proc_id + * @param new_req + * @return Flag + */ static Flag insert_new_req_into_l1_queue(uns proc_id, Mem_Req* new_req) { if(!ROUND_ROBIN_TO_L1) { if(queue_full(&mem->l1_queue)) { @@ -3873,9 +4133,14 @@ static Flag insert_new_req_into_l1_queue(uns proc_id, Mem_Req* new_req) { return TRUE; } -/**************************************************************************************/ -/* insert_new_req_into_mlc_queue: */ +/** + * @brief + * + * @param proc_id + * @param new_req + * @return Flag + */ static Flag insert_new_req_into_mlc_queue(uns proc_id, Mem_Req* new_req) { if(queue_full(&mem->mlc_queue)) { ASSERT(proc_id, 0); @@ -3887,10 +4152,22 @@ static Flag insert_new_req_into_mlc_queue(uns proc_id, Mem_Req* new_req) { return TRUE; } -/**************************************************************************************/ -/* new_mem_dc_wb_req: */ -/* Returns TRUE if the request is successfully entered into the memory system */ +/** + * @brief New dcache write back req + * + * @param type + * @param proc_id + * @param addr + * @param size + * @param delay + * @param op + * @param done_func + * @param unique_num + * @param used_onpath + * @return Flag TRUE if the request is successfully entered into the memory + * system + */ Flag new_mem_dc_wb_req(Mem_Req_Type type, uns8 proc_id, Addr addr, uns size, uns delay, Op* op, Flag done_func(Mem_Req*), Counter unique_num, Flag used_onpath) { @@ -3979,6 +4256,7 @@ Flag new_mem_dc_wb_req(Mem_Req_Type type, uns8 proc_id, Addr addr, uns size, new_req->wb_used_onpath = used_onpath; // DC WB requests carry this flag /* Step 6: Insert the request into the l1 queue if it is not already there */ + // WQ: note the WB is steered into req queues not fill queues if(MLC_PRESENT) insert_new_req_into_mlc_queue(proc_id, new_req); else @@ -3987,10 +4265,20 @@ Flag new_mem_dc_wb_req(Mem_Req_Type type, uns8 proc_id, Addr addr, uns size, return TRUE; } -/**************************************************************************************/ -/* new_mem_mlc_wb_req: */ -/* Returns TRUE if the request is successfully entered into the memory system */ - +/** + * @brief Create req for wb and insert into L1 req queue + * + * @param type + * @param proc_id + * @param addr + * @param size + * @param delay + * @param op + * @param done_func + * @param unique_num + * @return Flag Returns TRUE if the request is successfully entered into the + * memory system + */ static Flag new_mem_mlc_wb_req(Mem_Req_Type type, uns8 proc_id, Addr addr, uns size, uns 
delay, Op* op, Flag done_func(Mem_Req*), Counter unique_num) { @@ -4080,12 +4368,22 @@ static Flag new_mem_mlc_wb_req(Mem_Req_Type type, uns8 proc_id, Addr addr, } +/** + * @brief + * + * @param type + * @param proc_id + * @param addr + * @param size + * @param delay + * @param op + * @param done_func + * @param unique_num This counter is used when op is NULL + * @return Flag + */ static Flag new_mem_l1_wb_req(Mem_Req_Type type, uns8 proc_id, Addr addr, uns size, uns delay, Op* op, - Flag done_func(Mem_Req*), - Counter unique_num) /* This counter is used when - op is NULL */ -{ + Flag done_func(Mem_Req*), Counter unique_num) { Mem_Req* new_req = NULL; Mem_Req* matching_req = NULL; Mem_Queue_Entry* queue_entry = NULL; @@ -4246,7 +4544,7 @@ void op_nuke_mem_req(Op* op) { /** - * @brief + * @brief Install the line into LLC * * @param req * @return Flag 1 on successful fill @@ -4293,6 +4591,21 @@ Flag l1_fill_line(Mem_Req* req) { return SUCCESS; } + // WQ: seems scarab is not modelling the mem sys correctly, there could be + // cases when two req of the same block is propogating at the same time in + // the system results in when fill the cache line, it was already presented + // there so, + // Temporarily fix starts here: + Addr dummy_line_addr; + if(cache_access(&L1(req->proc_id)->cache, req->addr, &dummy_line_addr, + FALSE)) { + // for some reason the blk is already filled + DEBUG(req->proc_id, "Duplicated L1 Fill, skipping\n"); + return SUCCESS; + } + // Temporarily fix done + + /* Do not insert the line yet, just check which line we need to replace. If that line is dirty, it's possible that we won't be able to insert the writeback into the @@ -4547,6 +4860,7 @@ Flag l1_fill_line(Mem_Req* req) { } } + // update the newly inserted blk /* this will make it bring the line into the l1 and then modify it */ data->proc_id = req->proc_id; data->dirty = ((req->type == MRT_WB) && @@ -4598,9 +4912,12 @@ Flag l1_fill_line(Mem_Req* req) { } -/**************************************************************************************/ -/* mlc_fill_line: */ - +/** + * @brief Fill line into MLC, handle possible WBs + * + * @param req + * @return Flag Return 1 on successfully put req into the cache, 0 otherwise + */ Flag mlc_fill_line(Mem_Req* req) { MLC_Data* data; Addr line_addr, repl_line_addr = 0; @@ -4624,64 +4941,11 @@ Flag mlc_fill_line(Mem_Req* req) { (req->op_count ? &(top->unique_num) : 0x0)); + // WQ: why is this commented out?? 
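The "Do not insert the line yet" comment above describes a two-phase fill: probe for the victim first, make sure any required writeback can be allocated, and only then install the new line; the added cache_access() guard additionally skips fills whose line is already present. A rough, self-contained sketch of that control flow (a toy direct-mapped cache with invented names, not Scarab's cache API):

#include <stdint.h>

#define TOY_SETS 64
typedef struct { int valid, dirty; uint64_t tag; } ToyLine;
static ToyLine toy_cache[TOY_SETS];
static int     toy_wb_slots = 4;   /* pretend writeback buffer capacity */

typedef enum { FILL_OK, FILL_RETRY } FillStatus;

static FillStatus toy_fill_line(uint64_t line_addr) {
  ToyLine* line = &toy_cache[line_addr % TOY_SETS];

  if(line->valid && line->tag == line_addr)
    return FILL_OK;                       /* duplicate fill: already present */

  if(line->valid && line->dirty) {        /* dirty victim needs a WB slot */
    if(toy_wb_slots == 0)
      return FILL_RETRY;                  /* caller retries the fill later */
    toy_wb_slots--;
  }

  line->valid = 1;                        /* only now install the new line */
  line->dirty = 0;
  line->tag   = line_addr;
  return FILL_OK;
}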
/* if it can't get a write port, fail */ /* if (!get_write_port(&MLC(req->proc_id)->ports[req->mlc_bank])) return * FAILURE; */ - // Put prefetches in the right position for replacement - // cmp FIXME prefetchers - if(req->type == MRT_DPRF || req->type == MRT_IPRF) { - mem->pref_replpos = INSERT_REPL_DEFAULT; - if(PREF_INSERT_LRU) { - mem->pref_replpos = INSERT_REPL_LRU; - STAT_EVENT(req->proc_id, PREF_REPL_LRU); - } else if(PREF_INSERT_MIDDLE) { - mem->pref_replpos = INSERT_REPL_MID; - STAT_EVENT(req->proc_id, PREF_REPL_MID); - } else if(PREF_INSERT_LOWQTR) { - mem->pref_replpos = INSERT_REPL_LOWQTR; - STAT_EVENT(req->proc_id, PREF_REPL_LOWQTR); - } - data = (MLC_Data*)cache_insert_replpos( - &MLC(req->proc_id)->cache, req->proc_id, req->addr, &line_addr, - &repl_line_addr, mem->pref_replpos, TRUE); - } else { - data = (MLC_Data*)cache_insert(&MLC(req->proc_id)->cache, req->proc_id, - req->addr, &line_addr, &repl_line_addr); - } - - if(req->type == MRT_WB_NODIRTY || req->type == MRT_WB) { - STAT_EVENT(req->proc_id, MLC_WB_FILL); - STAT_EVENT(req->proc_id, CORE_MLC_WB_FILL); - } else { - STAT_EVENT(req->proc_id, MLC_FILL); - STAT_EVENT(req->proc_id, CORE_MLC_FILL); - INC_STAT_EVENT_ALL(TOTAL_MEM_LATENCY, cycle_count - req->mlc_miss_cycle); - INC_STAT_EVENT(req->proc_id, CORE_MEM_LATENCY, - cycle_count - req->mlc_miss_cycle); - - if(req->type != MRT_DPRF && req->type != MRT_IPRF && - !req->demand_match_prefetch) { - STAT_EVENT(req->proc_id, MLC_DEMAND_FILL); - STAT_EVENT(req->proc_id, CORE_MLC_DEMAND_FILL); - INC_STAT_EVENT_ALL(TOTAL_MEM_LATENCY_DEMAND, - cycle_count - req->mlc_miss_cycle); - INC_STAT_EVENT(req->proc_id, CORE_MEM_LATENCY_DEMAND, - cycle_count - req->mlc_miss_cycle); - } else { - STAT_EVENT(req->proc_id, MLC_PREF_FILL); - STAT_EVENT(req->proc_id, CORE_MLC_PREF_FILL); - INC_STAT_EVENT_ALL(TOTAL_MEM_LATENCY_PREF, - cycle_count - req->mlc_miss_cycle); - INC_STAT_EVENT(req->proc_id, CORE_MEM_LATENCY_PREF, - cycle_count - req->mlc_miss_cycle); - if(req->demand_match_prefetch) { - STAT_EVENT(req->proc_id, CORE_MLC_PREF_FILL_PARTIAL_USED); - STAT_EVENT(req->proc_id, CORE_PREF_MLC_PARTIAL_USED); - STAT_EVENT_ALL(PREF_MLC_TOTAL_PARTIAL_USED); - } - } - } /* Do not insert the line yet, just check which line we need to replace. If that line is dirty, it's possible @@ -4694,17 +4958,21 @@ Flag mlc_fill_line(Mem_Req* req) { /* If we are replacing anything, check if we need to write it back */ if(repl_line_valid) { - if(!MLC_WRITE_THROUGH && data->dirty) { - /* need to do a write-back */ + /* write-back on dirty victim or exclusive hierarchy */ + if((!MLC_WRITE_THROUGH && data->dirty) || (EXCLUSIVE_L1)) { DEBUG(req->proc_id, "Scheduling writeback of addr:0x%s\n", hexstr64s(repl_line_addr)); if(0 && DEBUG_EXC_INSERTS) printf("Scheduling L2 writeback of addr:0x%s ins addr:0x%s\n", hexstr64s(repl_line_addr), hexstr64s(req->addr)); - if(!new_mem_mlc_wb_req(MRT_WB, data->proc_id, repl_line_addr, + + Mem_Req_Type wbtype = data->dirty ? MRT_WB : MRT_WB_NODIRTY; + if(!new_mem_mlc_wb_req(wbtype, data->proc_id, repl_line_addr, MLC_LINE_SIZE, 1, NULL, NULL, unique_count)) return FAILURE; - STAT_EVENT(req->proc_id, MLC_FILL_DIRTY); + + // WQ:this stat looks wrong.. 
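The victim-handling change above writes back dirty MLC victims as before, and with EXCLUSIVE_L1 also sends clean victims down as MRT_WB_NODIRTY, presumably because an exclusive last-level cache holds no other copy of the block. A small sketch that mirrors that decision (enum and parameter names are illustrative, not Scarab's types):

typedef enum { VICTIM_DROP, VICTIM_WB_DIRTY, VICTIM_WB_CLEAN } VictimAction;

static VictimAction victim_action(int victim_dirty, int write_through,
                                  int exclusive_llc) {
  if((!write_through && victim_dirty) || exclusive_llc)
    return victim_dirty ? VICTIM_WB_DIRTY    /* corresponds to MRT_WB */
                        : VICTIM_WB_CLEAN;   /* corresponds to MRT_WB_NODIRTY */
  return VICTIM_DROP;  /* clean victim, inclusive or write-through setup */
}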
+ // STAT_EVENT(req->proc_id, MLC_FILL_DIRTY); } if(data->prefetch) { @@ -4787,12 +5055,68 @@ Flag mlc_fill_line(Mem_Req* req) { } } + if(req->type == MRT_DPRF || req->type == MRT_IPRF) { + mem->pref_replpos = INSERT_REPL_DEFAULT; + if(PREF_INSERT_LRU) { + mem->pref_replpos = INSERT_REPL_LRU; + STAT_EVENT(req->proc_id, PREF_REPL_LRU); + } else if(PREF_INSERT_MIDDLE) { + mem->pref_replpos = INSERT_REPL_MID; + STAT_EVENT(req->proc_id, PREF_REPL_MID); + } else if(PREF_INSERT_LOWQTR) { + mem->pref_replpos = INSERT_REPL_LOWQTR; + STAT_EVENT(req->proc_id, PREF_REPL_LOWQTR); + } + data = (MLC_Data*)cache_insert_replpos( + &MLC(req->proc_id)->cache, req->proc_id, req->addr, &line_addr, + &repl_line_addr, mem->pref_replpos, TRUE); + } else { + data = (MLC_Data*)cache_insert(&MLC(req->proc_id)->cache, req->proc_id, + req->addr, &line_addr, &repl_line_addr); + } + + if(req->type == MRT_WB_NODIRTY || req->type == MRT_WB) { + STAT_EVENT(req->proc_id, MLC_WB_FILL); + STAT_EVENT(req->proc_id, CORE_MLC_WB_FILL); + } else { + STAT_EVENT(req->proc_id, MLC_FILL); + STAT_EVENT(req->proc_id, CORE_MLC_FILL); + INC_STAT_EVENT_ALL(TOTAL_MEM_LATENCY, cycle_count - req->mlc_miss_cycle); + INC_STAT_EVENT(req->proc_id, CORE_MEM_LATENCY, + cycle_count - req->mlc_miss_cycle); + + if(req->type != MRT_DPRF && req->type != MRT_IPRF && + !req->demand_match_prefetch) { + STAT_EVENT(req->proc_id, MLC_DEMAND_FILL); + STAT_EVENT(req->proc_id, CORE_MLC_DEMAND_FILL); + INC_STAT_EVENT_ALL(TOTAL_MEM_LATENCY_DEMAND, + cycle_count - req->mlc_miss_cycle); + INC_STAT_EVENT(req->proc_id, CORE_MEM_LATENCY_DEMAND, + cycle_count - req->mlc_miss_cycle); + } else { + STAT_EVENT(req->proc_id, MLC_PREF_FILL); + STAT_EVENT(req->proc_id, CORE_MLC_PREF_FILL); + INC_STAT_EVENT_ALL(TOTAL_MEM_LATENCY_PREF, + cycle_count - req->mlc_miss_cycle); + INC_STAT_EVENT(req->proc_id, CORE_MEM_LATENCY_PREF, + cycle_count - req->mlc_miss_cycle); + if(req->demand_match_prefetch) { + STAT_EVENT(req->proc_id, CORE_MLC_PREF_FILL_PARTIAL_USED); + STAT_EVENT(req->proc_id, CORE_PREF_MLC_PARTIAL_USED); + STAT_EVENT_ALL(PREF_MLC_TOTAL_PARTIAL_USED); + } + } + } + + /* this will make it bring the line into the mlc and then modify it */ data->proc_id = req->proc_id; data->dirty = ((req->type == MRT_WB) && (req->state != MRS_FILL_MLC)); // write back can fill mlc // directly - reqs filling core // should not dirty the line + data->dirty |= req->dirty_l0; // for exclusive L1, pull dirty blk + // from L1 to MLC data->prefetch = req->type == MRT_DPRF || req->type == MRT_IPRF || req->demand_match_prefetch; data->seen_prefetch = req->demand_match_prefetch; /* If demand matches @@ -5201,9 +5525,13 @@ Flag is_final_state(Mem_Req_State state) { (state == MRS_MEM_DONE) || (state == MRS_FILL_DONE); } -/**************************************************************************************/ -/* wp_process_l1_hit: */ +/** + * @brief Wrong path stat collect for l1 hit + * + * @param line + * @param req + */ void wp_process_l1_hit(L1_Data* line, Mem_Req* req) { if(!line) { ASSERT(req->proc_id, PERFECT_L1); @@ -5281,9 +5609,12 @@ void wp_process_l1_hit(L1_Data* line, Mem_Req* req) { } -/**************************************************************************************/ -/* wp_process_l1_fill: */ - +/** + * @brief wrong path stat collect for l1_fill + * + * @param line + * @param req + */ void wp_process_l1_fill(L1_Data* line, Mem_Req* req) { if(!WP_COLLECT_STATS) return; @@ -5385,7 +5716,7 @@ static void update_mem_req_occupancy_counter(Mem_Req_Type type, int delta) { counter = 
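The PREF_INSERT_LRU / PREF_INSERT_MIDDLE / PREF_INSERT_LOWQTR block moved above chooses where in the replacement order a prefetched line lands, so that useless prefetches age out quickly. A minimal sketch of that policy on an 8-way set kept in MRU-to-LRU order (a toy model of the idea, not Scarab's cache_insert_replpos()):

#include <stddef.h>
#include <stdint.h>
#include <string.h>

#define TOY_WAYS 8
typedef enum { POS_MRU, POS_MID, POS_LOWQTR, POS_LRU } InsertPos;

/* set[0] is MRU, set[TOY_WAYS-1] is LRU. Demand fills use POS_MRU;
 * prefetches may be demoted to the middle, low quarter, or LRU slot. */
static void insert_at(uint64_t set[TOY_WAYS], uint64_t tag, InsertPos pos) {
  size_t idx = 0;                      /* POS_MRU */
  if(pos == POS_MID)
    idx = TOY_WAYS / 2;
  else if(pos == POS_LOWQTR)
    idx = (TOY_WAYS * 3) / 4;
  else if(pos == POS_LRU)
    idx = TOY_WAYS - 1;

  /* Evict the LRU entry, shift entries at or below idx down one slot,
   * then place the new tag at the chosen position. */
  memmove(&set[idx + 1], &set[idx], (TOY_WAYS - 1 - idx) * sizeof(set[0]));
  set[idx] = tag;
}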
&mem_req_wb_entries; break; default: - FATAL_ERROR(0, "Unknown mem req state\n"); + FATAL_ERROR(0, "Unknown mem req type\n"); break; } *counter += delta; diff --git a/src/memory/memory.h b/src/memory/memory.h index f0bcf6cc..9ab58e24 100644 --- a/src/memory/memory.h +++ b/src/memory/memory.h @@ -87,7 +87,7 @@ typedef enum Mem_Queue_Type_enum { } Mem_Queue_Type; typedef struct Mem_Queue_Entry_struct { - int reqbuf; /* request buffer num */ + int reqbuf; /* request buffer num, an index into the global req_buffer */ Counter priority; /* priority of the miss */ Counter rdy_cycle; } Mem_Queue_Entry; @@ -132,11 +132,13 @@ typedef struct Uncore_struct { typedef struct Memory_struct { /* miss buffer */ - Mem_Req* req_buffer; - List req_buffer_free_list; - List* l1_in_buffer_core; - uns total_mem_req_buffers; - uns* num_req_buffers_per_core; + Mem_Req* req_buffer; // global buffer that holds all the real reqs; entries + // in the + // various queues below point to reqs in this buffer (by idx) + List req_buffer_free_list; + List* l1_in_buffer_core; + uns total_mem_req_buffers; + uns* num_req_buffers_per_core; int req_count; @@ -147,6 +149,9 @@ typedef struct Memory_struct { Cache pref_l1_cache; /* various queues (arrays) */ + /* reqs coming from above go into the queue (includes WBs); + reqs coming from below go into the fill_queue + */ Mem_Queue mlc_queue; Mem_Queue mlc_fill_queue; Mem_Queue l1_queue; diff --git a/src/memory/memory.param.def b/src/memory/memory.param.def index b29aafab..6c4d34a7 100644 --- a/src/memory/memory.param.def +++ b/src/memory/memory.param.def @@ -51,13 +51,18 @@ */ DEF_PARAM(enable_swprf, ENABLE_SWPRF, Flag, Flag, FALSE, ) -/* MLC */ +/** + * MLC + * parameters (size, banks, etc.) refer to a single MLC if configured + * as private.
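The memory.h comments above describe an indirection: one global array of request buffers, a free list, and queues whose entries carry an index into that array rather than a pointer or a copy, so the same request can move between queues over its lifetime. A minimal sketch of that layout (illustrative names only, not Scarab's structures):

#include <stdint.h>

#define TOY_REQ_BUFFERS 64

typedef struct { int valid; uint64_t addr; } ToyMemReq;

static ToyMemReq toy_req_buffer[TOY_REQ_BUFFERS];  /* the one real storage */
static int       toy_free_list[TOY_REQ_BUFFERS];
static int       toy_free_count;

/* A queue entry stores only the index of its request plus scheduling data. */
typedef struct { int reqbuf; uint64_t priority; } ToyQueueEntry;

static void toy_init(void) {
  toy_free_count = TOY_REQ_BUFFERS;
  for(int i = 0; i < TOY_REQ_BUFFERS; i++)
    toy_free_list[i] = i;
}

/* Allocate a buffer slot and hand back its index, or -1 if none are free. */
static int toy_alloc_req(uint64_t addr) {
  if(toy_free_count == 0)
    return -1;
  int idx = toy_free_list[--toy_free_count];
  toy_req_buffer[idx].valid = 1;
  toy_req_buffer[idx].addr  = addr;
  return idx;
}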
If shared, parameters refer to the aggregated capacity + */ DEF_PARAM(mlc_present, MLC_PRESENT, Flag, Flag, FALSE, ) DEF_PARAM(mlc_size, MLC_SIZE, uns, uns, (512 * 1024), ) -DEF_PARAM(mlc_assoc, MLC_ASSOC, uns, uns, 4, ) +DEF_PARAM(mlc_assoc, MLC_ASSOC, uns, uns, 8, ) DEF_PARAM(mlc_line_size, MLC_LINE_SIZE, uns, uns, 64, ) DEF_PARAM(mlc_cycles, MLC_CYCLES, uns, uns, 12, ) DEF_PARAM(perfect_mlc, PERFECT_MLC, Flag, Flag, FALSE, ) +DEF_PARAM(private_mlc, PRIVATE_MLC, Flag, Flag, TRUE, ) DEF_PARAM(mlc_read_ports, MLC_READ_PORTS, uns, uns, 1, ) DEF_PARAM(mlc_write_ports, MLC_WRITE_PORTS, uns, uns, 1, ) DEF_PARAM(mlc_banks, MLC_BANKS, uns, uns, 8, ) @@ -75,6 +80,7 @@ DEF_PARAM(l1_line_size, L1_LINE_SIZE, uns, uns, DEF_PARAM(l1_cycles, L1_CYCLES, uns, uns, 24, ) DEF_PARAM(perfect_l1, PERFECT_L1, Flag, Flag, FALSE, ) DEF_PARAM(private_l1, PRIVATE_L1, Flag, Flag, FALSE, ) +DEF_PARAM(exclusive_l1, EXCLUSIVE_L1, Flag, Flag, TRUE, ) DEF_PARAM(l1_read_ports, L1_READ_PORTS, uns, uns, 1, ) DEF_PARAM(l1_write_ports, L1_WRITE_PORTS, uns, uns, 1, ) DEF_PARAM(l1_banks, L1_BANKS, uns, uns, 8, ) diff --git a/src/node_stage.c b/src/node_stage.c index 7ca016f1..8dd47315 100644 --- a/src/node_stage.c +++ b/src/node_stage.c @@ -87,7 +87,12 @@ Flag op_not_ready_for_retire(Op* op); Flag is_node_table_empty(void); void collect_not_ready_to_retire_stats(Op* op); Flag is_node_table_full(void); +Flag get_mem_ld(Op *op); +Flag get_mem_st(Op *op); +Flag is_lq_full(void); +Flag is_sq_full(void); void collect_node_table_full_stats(Op* op); +void collect_lsq_full_stats(Op* op); /**************************************************************************************/ /* set_node_stage:*/ @@ -133,6 +138,12 @@ void reset_node_stage() { node->mem_blocked = FALSE; node->mem_block_length = 0; node->ret_stall_length = 0; + node->num_loads = 0; + node->num_stores = 0; + node->node_lq_head = NULL; + node->node_sq_head = NULL; + node->node_lq_tail = NULL; + node->node_sq_tail = NULL; } /**************************************************************************************/ @@ -153,6 +164,12 @@ void reset_all_ops_node_stage() { node->node_count = 0; node->mem_blocked = FALSE; node->ret_stall_length = 0; + node->num_loads = 0; + node->num_stores = 0; + node->node_lq_head = NULL; + node->node_sq_head = NULL; + node->node_lq_tail = NULL; + node->node_sq_tail = NULL; } /**************************************************************************************/ @@ -228,6 +245,8 @@ void flush_window() { uns keep_ops = 0; node->node_tail = NULL; + node->node_lq_tail = NULL; + node->node_sq_tail = NULL; for(op = node->node_head, last = &node->node_head; op; op = *last) { ASSERT(node->proc_id, node->proc_id == op->proc_id); @@ -242,6 +261,30 @@ void flush_window() { ASSERT(op->proc_id, node->rs[op->rs_id].rs_op_count > 0); node->rs[op->rs_id].rs_op_count--; } + + if(get_mem_ld(op)) { + node->num_loads--; + if(node->node_lq_head==op) { + node->node_lq_head = NULL; + node->node_lq_tail = NULL; + } + else if(node->node_lq_tail!= NULL && node->node_lq_tail->next_lq_node==op) { + node->node_lq_tail->next_lq_node = NULL; + } + } + if(get_mem_st(op)) { + node->num_stores--; + if(node->node_sq_head==op){ + node->node_sq_head = NULL; + node->node_sq_tail = NULL; + } + else if(node->node_sq_tail!= NULL && node->node_sq_tail->next_sq_node==op) { + node->node_sq_tail->next_sq_node = NULL; + } + } + ASSERT(node->proc_id, node->num_loads>=0); + ASSERT(node->proc_id, node->num_stores>=0); + free_op(op); } else { /* Keep op */ @@ -255,9 +298,13 @@ void 
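The flush_window() additions above shrink the load/store counters for every squashed memory op and cut the LQ/SQ lists right after the youngest surviving op. A generalized sketch of that intent (toy types only; the real code walks the node table and handles the head/tail cases shown in the hunk):

typedef struct ToyOp { struct ToyOp* next_lq; } ToyOp;

typedef struct {
  ToyOp* lq_head;
  ToyOp* lq_tail;
  int    num_loads;
} ToyLoadQueue;

/* new_tail is the youngest surviving load, or NULL if none survive. */
static void toy_squash_after(ToyLoadQueue* lq, ToyOp* new_tail,
                             int squashed_loads) {
  lq->num_loads -= squashed_loads;
  if(new_tail == NULL) {
    lq->lq_head = NULL;
    lq->lq_tail = NULL;
  } else {
    new_tail->next_lq = NULL;   /* everything younger has been freed */
    lq->lq_tail       = new_tail;
  }
}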
flush_window() { keep_ops++; last = &op->next_node; node->node_tail = op; + if(get_mem_ld(op)) node->node_lq_tail = op; + if(get_mem_st(op)) node->node_sq_tail = op; } } + ASSERT(node->proc_id, !((node->node_lq_head==NULL) ^ (node->num_loads==0))); + ASSERT(node->proc_id, !((node->node_sq_head==NULL) ^ (node->num_stores==0))); ASSERT(node->proc_id, flush_ops + keep_ops == node->node_count); node->node_count = keep_ops; ASSERT(node->proc_id, node->node_count <= NODE_TABLE_SIZE); @@ -415,19 +462,33 @@ void node_issue(Stage_Data* src_sd) { // Table. // We will stick them into the RS later for(ii = 0; ii < src_sd->max_op_count; ii++) { + + // If it is not full, issue the next op + Op* op = src_sd->ops[ii]; + if(!op) + continue; + /* if node table is full, stall */ if(is_node_table_full()) { collect_node_table_full_stats(node->node_head); rob_block_issue_reason = ROB_BLOCK_ISSUE_FULL; return; } + else if(get_mem_ld(op) && is_lq_full()) + { + collect_lsq_full_stats(node->node_lq_head); + rob_block_issue_reason = ROB_BLOCK_ISSUE_FULL; + return; + } + else if(get_mem_st(op) && is_sq_full()) + { + collect_lsq_full_stats(node->node_sq_head); + rob_block_issue_reason = ROB_BLOCK_ISSUE_FULL; + return; + } + rob_block_issue_reason = ROB_BLOCK_ISSUE_NONE; - // If it is not full, issue the next op - Op* op = src_sd->ops[ii]; - if(!op) - continue; - ASSERT(node->proc_id, node->proc_id == op->proc_id); /* check if it's a synchronizing op that can't issue */ if((op->table_info->bar_type & BAR_ISSUE) && (node->node_count > 0)) @@ -452,6 +513,44 @@ void node_issue(Stage_Data* src_sd) { op->in_node_list = TRUE; node->node_tail = op; + // Add mem ops to load and store queues + if(get_mem_ld(op)) + { + ASSERT(0, !((node->node_lq_head!=NULL) ^ (node->node_lq_tail!=NULL))); + node->num_loads++; + op->next_lq_node = NULL; + if(node->node_lq_head == NULL) + { + ASSERT(0, node->node_lq_tail==NULL); + node->node_lq_head = op; + node->node_lq_tail = op; + } + else + { + node->node_lq_tail->next_lq_node = op; + node->node_lq_tail = op; + } + ASSERT(0, node->node_lq_head->op_num <= node->node_lq_tail->op_num); + } + if(get_mem_st(op)) + { + ASSERT(0, !((node->node_sq_head!=NULL) ^ (node->node_sq_tail!=NULL))); + node->num_stores++; + op->next_sq_node = NULL; + if(node->node_sq_head == NULL) + { + ASSERT(0, node->node_sq_tail==NULL); + node->node_sq_head = op; + node->node_sq_tail = op; + } + else + { + node->node_sq_tail->next_sq_node = op; + node->node_sq_tail = op; + } + ASSERT(0, node->node_sq_head->op_num <= node->node_sq_tail->op_num); + } + STAT_EVENT(node->proc_id, OP_ISSUED); if(!node->next_op_into_rs) /* if there are no ops waiting to enter RS */ @@ -800,6 +899,42 @@ void node_retire() { node->ret_op++; + if(get_mem_ld(op)) + { + node->num_loads--; + ASSERT(0, node->num_loads>=0); + ASSERT(0, node->node_lq_head!=NULL && node->node_lq_tail!=NULL); + ASSERT(0, node->node_lq_head->op_num <= node->node_lq_tail->op_num); + if(op->next_lq_node == NULL) + { + node->node_lq_head = NULL; + node->node_lq_tail = NULL; + } + else + { + node->node_lq_head = op->next_lq_node; + } + } + if(get_mem_st(op)) + { + node->num_stores--; + ASSERT(0, node->num_stores>=0); + ASSERT(0, node->node_sq_head!=NULL && node->node_sq_tail!=NULL); + ASSERT(0, node->node_sq_head->op_num <= node->node_sq_tail->op_num); + if(op->next_sq_node == NULL) + { + node->node_sq_head = NULL; + node->node_sq_tail = NULL; + } + else + { + node->node_sq_head = op->next_sq_node; + } + } + + ASSERT(node->proc_id, node->num_loads >= 0); + ASSERT(node->proc_id, 
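The node_issue() and node_retire() changes above model the load/store queues as intrusive FIFO lists threaded through the ops: a load is appended at the tail when it enters the node table, popped from the head in program order at retire, and issue stalls when the occupancy counter hits the queue size. A compact sketch of those three operations (toy structures; the real code keys off op->table_info->mem_type and LOAD_QUEUE_SIZE / STORE_QUEUE_SIZE):

typedef struct ToyLd { struct ToyLd* next; } ToyLd;
typedef struct { ToyLd* head; ToyLd* tail; int count; } ToyLQ;

static int toy_lq_full(const ToyLQ* q, int size) {  /* gate at issue */
  return q->count == size;
}

static void toy_lq_push(ToyLQ* q, ToyLd* op) {      /* at issue */
  op->next = NULL;
  if(q->head == NULL)
    q->head = op;
  else
    q->tail->next = op;
  q->tail = op;
  q->count++;
}

static void toy_lq_pop(ToyLQ* q) {                  /* at retire */
  ToyLd* op = q->head;
  q->head   = op->next;
  if(q->head == NULL)
    q->tail = NULL;                 /* invariant: head==NULL iff count==0 */
  q->count--;
}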
node->num_stores >= 0); + STAT_EVENT(op->proc_id, RET_ALL_INST); remove_from_seq_op_list(td, op); @@ -1070,18 +1205,57 @@ Flag is_node_table_full() { return (node->node_count == NODE_TABLE_SIZE); } +Flag get_mem_ld(Op * op) { + return op->table_info->mem_type==MEM_LD; +} + +Flag get_mem_st(Op * op) { + return op->table_info->mem_type==MEM_ST || op->table_info->mem_type==MEM_EVICT; +} + +Flag is_lq_full() +{ + ASSERT(node->proc_id, node->num_loads <= LOAD_QUEUE_SIZE); + return (node->num_loads == LOAD_QUEUE_SIZE); +} + +Flag is_sq_full() +{ + ASSERT(node->proc_id, node->num_stores <= STORE_QUEUE_SIZE); + return (node->num_stores == STORE_QUEUE_SIZE); +} + void collect_node_table_full_stats(Op* op) { if(!(op->state == OS_DONE || OP_DONE(op))) { if(op->table_info->op_type == OP_IMEM || op->table_info->op_type == OP_FMEM) { - STAT_EVENT(node->proc_id, FULL_WINDOW_MEM_OP); + STAT_EVENT(node->proc_id, FULL_WINDOW_ROB_MEM_OP); } else if(op->table_info->op_type >= OP_FCVT && op->table_info->op_type <= OP_FCMOV) { - STAT_EVENT(node->proc_id, FULL_WINDOW_FP_OP); + STAT_EVENT(node->proc_id, FULL_WINDOW_ROB_FP_OP); } else { - STAT_EVENT(node->proc_id, FULL_WINDOW_OTHER_OP); + STAT_EVENT(node->proc_id, FULL_WINDOW_ROB_OTHER_OP); } } + else { + STAT_EVENT(node->proc_id, FULL_WINDOW_WAITING_ON_RET); + } + + STAT_EVENT(node->proc_id, FULL_WINDOW_STALL); +} + +void collect_lsq_full_stats(Op* op) { + + if(!(op->state == OS_DONE || OP_DONE(op))) { + if(get_mem_ld(op)) { + STAT_EVENT(node->proc_id, FULL_WINDOW_LQ_FULL); + } else { + STAT_EVENT(node->proc_id, FULL_WINDOW_SQ_FULL); + } + } + else { + STAT_EVENT(node->proc_id, FULL_WINDOW_WAITING_ON_RET); + } STAT_EVENT(node->proc_id, FULL_WINDOW_STALL); } diff --git a/src/node_stage.h b/src/node_stage.h index c3967a1e..5996a0e8 100644 --- a/src/node_stage.h +++ b/src/node_stage.h @@ -70,6 +70,14 @@ typedef struct Node_Stage_struct { Flag mem_blocked; // are we out of mem req buffers for this core uns mem_block_length; // length of the current memory block uns ret_stall_length; // length of the current retirement stall + + uns num_loads; // number of inflight loads + uns num_stores; // and stores allowed + Op * node_lq_head; // pointers to head and tail + Op * node_lq_tail; // ops in the load and + Op * node_sq_head; // store queues + Op * node_sq_tail; + } Node_Stage; diff --git a/src/op.h b/src/op.h index 868491a9..fd478939 100644 --- a/src/op.h +++ b/src/op.h @@ -215,6 +215,8 @@ struct Op_struct { struct Op_struct* next_rdy; // pointer to next ready op (node table) Flag in_rdy_list; // is the op in the node stage's ready list? struct Op_struct* next_node; // pointer to the next op in the node table + struct Op_struct* next_lq_node; // pointer to the next load in the LQ + struct Op_struct* next_sq_node; // pointer to the next store in the SQ Flag in_node_list; // is the op in the node list? Flag replay; // is the op waiting to replay? 
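collect_node_table_full_stats() and the new collect_lsq_full_stats() above attribute each blocked-issue cycle either to the oldest op still executing (ROB, LQ, or SQ pressure) or to retirement backpressure when that op is already done. A small sketch of that classification (illustrative enum; the real code bumps the FULL_WINDOW_* stats named above, and checks the ROB before the LQ/SQ):

typedef enum {
  STALL_ROB_FULL,           /* FULL_WINDOW_ROB_MEM_OP / _FP_OP / _OTHER_OP */
  STALL_LQ_FULL,            /* FULL_WINDOW_LQ_FULL */
  STALL_SQ_FULL,            /* FULL_WINDOW_SQ_FULL */
  STALL_WAITING_ON_RETIRE   /* FULL_WINDOW_WAITING_ON_RET */
} StallReason;

static StallReason classify_stall(int head_op_done, int rob_full,
                                  int lq_full, int sq_full) {
  if(head_op_done)
    return STALL_WAITING_ON_RETIRE;  /* window blocked only by retirement */
  if(rob_full)
    return STALL_ROB_FULL;
  if(lq_full)
    return STALL_LQ_FULL;
  if(sq_full)
    return STALL_SQ_FULL;
  return STALL_ROB_FULL;             /* fallback; every case also counts
                                        toward FULL_WINDOW_STALL */
}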
uns replay_count; // number of times the op has replayed diff --git a/src/op_info.h b/src/op_info.h index 94003652..b039ce85 100644 --- a/src/op_info.h +++ b/src/op_info.h @@ -97,7 +97,10 @@ struct Op_Info_struct { uns8 dir; // true direction of branch, set by oracle Addr pred_npc; // predicted next pc field Addr pred_addr; // address used to predict branch (might be fetch_addr) - uns8 pred; // predicted direction of branch, set by the branch predictor + uns8 pred; // overall predicted direction of branch, set by the last branch predictor + // note that this can change depending on which point of the pipeline it is in + uns8 early_pred; // predicted direction of branch, set by the early branch predictor + uns8 early_late_disagree; // if the late predictor did not agree with the early predictor Flag misfetch; // true if target address is the ONLY thing that was wrong Flag mispred; // true if the direction of the branch was mispredicted and the // branch should cause a recovery, set by the branch predictor diff --git a/src/packet_build.h b/src/packet_build.h index 832c2765..c3c1fa10 100644 --- a/src/packet_build.h +++ b/src/packet_build.h @@ -103,6 +103,7 @@ typedef enum Break_Reason_enum { BREAK_STALL, // break because the pipeline is stalled BREAK_BARRIER, // break because of a system call or a fetch barrier // instruction + BREAK_EMPTY_FETCH_QUEUE,// break because of an empty fetch queue BREAK_OFFPATH, // break because the machine is offpath BREAK_ALIGNMENT, // break because of misaligned fetch (offpath) BREAK_TAKEN, // break because of nonsequential control flow diff --git a/src/pin/pin_lib/decoder.cc b/src/pin/pin_lib/decoder.cc index 6c90de30..5ae509f1 100644 --- a/src/pin/pin_lib/decoder.cc +++ b/src/pin/pin_lib/decoder.cc @@ -137,7 +137,7 @@ ctype_pin_inst* pin_decoder_get_latest_inst() { } void pin_decoder_print_unknown_opcodes() { - for(const auto opcode : unknown_opcodes) { + for(const auto& opcode : unknown_opcodes) { (*glb_err_ostream) << opcode << std::endl; } } diff --git a/src/pin/pin_lib/pin_api_to_xed.h b/src/pin/pin_lib/pin_api_to_xed.h index 4f4039b9..8fb11c0e 100644 --- a/src/pin/pin_lib/pin_api_to_xed.h +++ b/src/pin/pin_lib/pin_api_to_xed.h @@ -133,7 +133,7 @@ struct InstInfo { // TODO: Double check that below works calls and branches #define XED_INS_IsDirectBranchOrCall(ins) XED_INS_IsDirectBranch(ins) #define XED_INS_IsIndirectBranchOrCall(ins) !XED_INS_IsDirectBranchOrCall(ins) -#define XED_INS_IsSyscall(ins) (XED_INS_Category(ins) == XED_CATEGORY_SYSCALL) +#define XED_INS_IsSyscall(ins) (XED_INS_Category(ins) == XED_CATEGORY_SYSCALL || XED_INS_Category(ins) == XED_CATEGORY_SYSTEM) #define XED_INS_IsSysret(ins) (XED_INS_Category(ins) == XED_CATEGORY_SYSRET) #define XED_INS_IsInterrupt(ins) \ (XED_INS_Category(ins) == XED_CATEGORY_INTERRUPT) diff --git a/src/pin/pin_lib/uop_generator.c b/src/pin/pin_lib/uop_generator.c index 65b45304..5fd8c787 100644 --- a/src/pin/pin_lib/uop_generator.c +++ b/src/pin/pin_lib/uop_generator.c @@ -99,8 +99,6 @@ typedef struct Trace_Uop_struct Trace_Uop; extern int op_type_delays[NUM_OP_TYPES]; extern uns NEW_INST_TABLE_SIZE; // TODO: what is this? 
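The new early_pred and early_late_disagree fields in op_info.h record what the early predictor said before a late predictor possibly overrides it, per the updated comments above. A sketch of how such fields could be filled in, under the assumption that the overall pred field always reflects the most recent predictor (illustrative struct and function, not Scarab's BP API):

#include <stdint.h>

typedef struct {
  uint8_t pred;                 /* overall (latest) predicted direction */
  uint8_t early_pred;           /* direction from the early predictor */
  uint8_t early_late_disagree;  /* late predictor overrode the early one */
} ToyBranchInfo;

static void toy_record_predictions(ToyBranchInfo* b, uint8_t early_dir,
                                   uint8_t late_dir) {
  b->early_pred          = early_dir;
  b->pred                = late_dir;              /* last predictor wins */
  b->early_late_disagree = (early_dir != late_dir);
}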
-char* trace_files[MAX_NUM_PROCS]; - char dbg_print_buf[1024]; Trace_Uop*** trace_uop_bulk; diff --git a/src/pin/pin_lib/x86_decoder.cc b/src/pin/pin_lib/x86_decoder.cc index c833ca69..59079a2a 100644 --- a/src/pin/pin_lib/x86_decoder.cc +++ b/src/pin/pin_lib/x86_decoder.cc @@ -876,6 +876,7 @@ void init_pin_opcode_convert(void) { NONE}; // Potential FMOV iclass_to_scarab_map[XED_ICLASS_FLDCW] = {OP_NOTPIPELINED_MEDIUM, -1, 1, NONE}; + iclass_to_scarab_map[XED_ICLASS_FLDENV] = {OP_MOV, 14, 1, NONE}; iclass_to_scarab_map[XED_ICLASS_FLDL2E] = {OP_MOV, -1, 1, NONE}; // Potential FMOV iclass_to_scarab_map[XED_ICLASS_FLDL2T] = {OP_MOV, -1, 1, @@ -892,11 +893,12 @@ void init_pin_opcode_convert(void) { iclass_to_scarab_map[XED_ICLASS_FNCLEX] = {OP_NOTPIPELINED_SLOW, -1, 1, NONE}; iclass_to_scarab_map[XED_ICLASS_FNSTCW] = {OP_NOTPIPELINED_MEDIUM, -1, 1, NONE}; - iclass_to_scarab_map[XED_ICLASS_FNSTSW] = {OP_NOTPIPELINED_MEDIUM, -1, 1, + iclass_to_scarab_map[XED_ICLASS_FNSTENV] = {OP_MOV, 14, 1, NONE}; + iclass_to_scarab_map[XED_ICLASS_FNSTSW] = {OP_NOTPIPELINED_MEDIUM, -1, 1, NONE}; - iclass_to_scarab_map[XED_ICLASS_FNOP] = {OP_NOP, -1, 1, NONE}; - iclass_to_scarab_map[XED_ICLASS_FPREM] = {OP_FMUL, 8, 1, NONE}; - iclass_to_scarab_map[XED_ICLASS_FRNDINT] = {OP_FCVT, 8, 1, NONE}; + iclass_to_scarab_map[XED_ICLASS_FNOP] = {OP_NOP, -1, 1, NONE}; + iclass_to_scarab_map[XED_ICLASS_FPREM] = {OP_FMUL, 8, 1, NONE}; + iclass_to_scarab_map[XED_ICLASS_FRNDINT] = {OP_FCVT, 8, 1, NONE}; iclass_to_scarab_map[XED_ICLASS_FSETPM287_NOP] = {OP_NOP, -1, 1, NONE}; iclass_to_scarab_map[XED_ICLASS_FSIN] = {OP_NOTPIPELINED_VERY_SLOW, 8, 1, NONE}; diff --git a/src/prefetcher/l2l1pref.c b/src/prefetcher/l2l1pref.c index 60d556c6..990d5b86 100644 --- a/src/prefetcher/l2l1pref.c +++ b/src/prefetcher/l2l1pref.c @@ -59,7 +59,7 @@ extern Memory* mem; extern Dcache_Stage* dc; -Cache* l1_cache; +static Cache* l1_cache; /***************************************************************************************/ /* Local Prototypes */ diff --git a/src/prefetcher/l2way_pref.c b/src/prefetcher/l2way_pref.c index 1e8471fb..199d5bea 100644 --- a/src/prefetcher/l2way_pref.c +++ b/src/prefetcher/l2way_pref.c @@ -64,7 +64,7 @@ L2way_Rec** l2way_table; L1pref_Req* l1pref_req_queue; static Counter l1pref_send_no; static Counter l1pref_req_no; -Cache* l1_cache; +static Cache* l1_cache; /**************************************************************************************/ diff --git a/src/ramulator.cc b/src/ramulator.cc index 3b0e2492..ffc0f512 100644 --- a/src/ramulator.cc +++ b/src/ramulator.cc @@ -168,6 +168,14 @@ void init_configs() { } +/** + * @brief Send req to ramulator + * + * This func is the interface between scarab and ramulator + * + * @param scarab_req + * @return int + */ int ramulator_send(Mem_Req* scarab_req) { Request req; diff --git a/src/sim.c b/src/sim.c index d107a732..608e19ad 100644 --- a/src/sim.c +++ b/src/sim.c @@ -266,9 +266,10 @@ static inline Counter check_forward_progress(uns8 proc_id) { (Counter)FORWARD_PROGRESS_LIMIT)) { uns8 proc_id2; for(proc_id2 = 0; proc_id2 < NUM_CORES; proc_id2++) { - if(!sim_done[proc_id2]) + if(!sim_done[proc_id2]){ dump_stats(proc_id2, TRUE, global_stat_array[proc_id2], NUM_GLOBAL_STATS); + } } if(cmp_model.node_stage[proc_id].node_head) { @@ -723,6 +724,7 @@ void full_sim() { } } + if(model->done_func) model->done_func(); if(SIM_MODEL != DUMB_MODEL && DUMB_CORE_ON)
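The l2l1pref.c and l2way_pref.c hunks above each turn a file-scope Cache* l1_cache into a static; without the static, two external definitions of the same name either collide at link time or silently merge into one shared object, so each prefetcher would clobber the other's pointer. A minimal illustration of the pattern (made-up names, one translation unit):

/* file: l2way_like.c (illustrative) */
static int module_state;   /* internal linkage: private to this translation
                              unit, so another .c file may define its own
                              module_state without a link-time clash */

static void bump_module_state(void) {
  module_state++;
}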