From b48b605b51eaddac879d4642021ccbe1de7656a5 Mon Sep 17 00:00:00 2001 From: tinebp Date: Fri, 15 Nov 2024 03:42:06 -0800 Subject: [PATCH 01/36] remove deprecared yosys link --- hw/syn/yosys/synth.sh | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/hw/syn/yosys/synth.sh b/hw/syn/yosys/synth.sh index b44f16e6b..76559b8d3 100755 --- a/hw/syn/yosys/synth.sh +++ b/hw/syn/yosys/synth.sh @@ -28,7 +28,7 @@ dir_list=() inc_args="" macro_args="" no_warnings=1 -process="elaborate,netlist,techmap,verilog,link" +process="elaborate,netlist,techmap,verilog" declare -a excluded_warnings=("Resizing cell port") @@ -135,11 +135,6 @@ done echo "synth -top $top_level" fi - # link design - if echo "$process" | grep -q "link"; then - echo "link_design -top $top_level" - fi - # convert to netlist if echo "$process" | grep -q "netlist"; then echo "proc; opt" From 320c090613ab4a17be410e3c1860cf689c0b3da5 Mon Sep 17 00:00:00 2001 From: tinebp Date: Tue, 19 Nov 2024 01:57:33 -0800 Subject: [PATCH 02/36] xilinx asynchronous bram patch fixes --- hw/rtl/VX_platform.vh | 3 + hw/rtl/libs/VX_async_ram_patch.sv | 236 +++++++++++++------ hw/rtl/libs/VX_dp_ram.sv | 64 +++--- hw/rtl/libs/VX_rr_arbiter.sv | 2 +- hw/rtl/libs/VX_sp_ram.sv | 124 +++++----- hw/scripts/xilinx_async_bram_patch.tcl | 301 +++++++++++++++++-------- hw/scripts/xilinx_export_netlist.tcl | 13 ++ hw/syn/xilinx/README | 3 + hw/syn/xilinx/xrt/Makefile | 1 + 9 files changed, 490 insertions(+), 257 deletions(-) diff --git a/hw/rtl/VX_platform.vh b/hw/rtl/VX_platform.vh index d874b9b2b..08a2f6ca5 100644 --- a/hw/rtl/VX_platform.vh +++ b/hw/rtl/VX_platform.vh @@ -163,6 +163,7 @@ endgenerate `define USE_BLOCK_BRAM (* ramstyle = "block" *) `define USE_FAST_BRAM (* ramstyle = "MLAB, no_rw_check" *) `define NO_RW_RAM_CHECK (* altera_attribute = "-name add_pass_through_logic_to_inferred_rams off" *) +`define RW_RAM_CHECK (* altera_attribute = "-name add_pass_through_logic_to_inferred_rams on" *) `define DISABLE_BRAM (* ramstyle = "logic" *) `define PRESERVE_NET (* preserve *) `define BLACKBOX_CELL (* black_box *) @@ -173,6 +174,7 @@ endgenerate `define USE_BLOCK_BRAM (* ram_style = "block" *) `define USE_FAST_BRAM (* ram_style = "distributed" *) `define NO_RW_RAM_CHECK (* rw_addr_collision = "no" *) +`define RW_RAM_CHECK (* rw_addr_collision = "yes" *) `define DISABLE_BRAM (* ram_style = "registers" *) `define PRESERVE_NET (* keep = "true" *) `define BLACKBOX_CELL (* black_box *) @@ -183,6 +185,7 @@ endgenerate `define USE_BLOCK_BRAM `define USE_FAST_BRAM `define NO_RW_RAM_CHECK +`define RW_RAM_CHECK `define DISABLE_BRAM `define PRESERVE_NET `define BLACKBOX_CELL diff --git a/hw/rtl/libs/VX_async_ram_patch.sv b/hw/rtl/libs/VX_async_ram_patch.sv index fd29e881d..43e8139e6 100644 --- a/hw/rtl/libs/VX_async_ram_patch.sv +++ b/hw/rtl/libs/VX_async_ram_patch.sv @@ -13,12 +13,6 @@ `include "VX_platform.vh" -`define RAM_WRITE_WREN for (integer i = 0; i < WRENW; ++i) begin \ - if (wren[i]) begin \ - ram[waddr][i * WSELW +: WSELW] <= wdata[i * WSELW +: WSELW]; \ - end \ - end - `define RAM_INITIALIZATION \ if (INIT_ENABLE != 0) begin : g_init \ if (INIT_FILE != "") begin : g_file \ @@ -32,14 +26,93 @@ end \ end -`define RAM_BYPASS(__d) \ - reg [DATAW-1:0] bypass_data_r; \ - reg bypass_valid_r; \ +`define SYNC_RAM_WF_BLOCK(__d, __re, __we, __ra, __wa) \ + `RAM_ATTRIBUTES `RW_RAM_CHECK reg [DATAW-1:0] ram [0:SIZE-1]; \ + `RAM_INITIALIZATION \ + reg [ADDRW-1:0] raddr_r; \ + always @(posedge clk) begin \ + if (__re || __we) begin \ + if (__we) begin \ + ram[__wa] <= wdata; \ + end \ + raddr_r <= __ra; \ + end \ + end \ + assign __d = ram[raddr_r] + +`define SYNC_RAM_WF_WREN_BLOCK(__d, __re, __we, __ra, __wa) \ + `RAM_ATTRIBUTES `RW_RAM_CHECK reg [DATAW-1:0] ram [0:SIZE-1]; \ + `RAM_INITIALIZATION \ + reg [ADDRW-1:0] raddr_r; \ always @(posedge clk) begin \ - bypass_valid_r <= read_s && write && (raddr_s == waddr); \ - bypass_data_r <= wdata; \ + if (__re || __we) begin \ + if (__we) begin \ + for (integer i = 0; i < WRENW; ++i) begin \ + if (wren[i]) begin \ + ram[__wa][i * WSELW +: WSELW] <= wdata[i * WSELW +: WSELW]; \ + end \ + end \ + end \ + raddr_r <= __ra; \ + end \ end \ - assign __d = bypass_valid_r ? bypass_data_r : rdata_r + assign __d = ram[raddr_r] + +`define SYNC_RAM_RF_BLOCK(__d, __re, __we, __ra, __wa) \ + `RAM_ATTRIBUTES reg [DATAW-1:0] ram [0:SIZE-1]; \ + `RAM_INITIALIZATION \ + reg [DATAW-1:0] rdata_r; \ + always @(posedge clk) begin \ + if (__re || __we) begin \ + if (__we) begin \ + ram[__wa] <= wdata; \ + end \ + rdata_r <= ram[__ra]; \ + end \ + end \ + assign __d = rdata_r + +`define SYNC_RAM_RF_WREN_BLOCK(__d, __re, __we, __ra, __wa) \ + `RAM_ATTRIBUTES reg [DATAW-1:0] ram [0:SIZE-1]; \ + `RAM_INITIALIZATION \ + reg [DATAW-1:0] rdata_r; \ + always @(posedge clk) begin \ + if (__re || __we) begin \ + if (__we) begin \ + for (integer i = 0; i < WRENW; ++i) begin \ + if (wren[i]) begin \ + ram[__wa][i * WSELW +: WSELW] <= wdata[i * WSELW +: WSELW]; \ + end \ + end \ + end \ + rdata_r <= ram[__ra]; \ + end \ + end \ + assign __d = rdata_r + +`define ASYNC_RAM_BLOCK(__d, __we, __ra, __wa) \ + `RAM_ATTRIBUTES reg [DATAW-1:0] ram [0:SIZE-1]; \ + `RAM_INITIALIZATION \ + always @(posedge clk) begin \ + if (__we) begin \ + ram[__wa] <= wdata; \ + end \ + end \ + assign __d = ram[__ra] + +`define ASYNC_RAM_BLOCK_WREN(__d, __we, __ra, __wa) \ + `RAM_ATTRIBUTES reg [DATAW-1:0] ram [0:SIZE-1]; \ + `RAM_INITIALIZATION \ + always @(posedge clk) begin \ + if (__we) begin \ + for (integer i = 0; i < WRENW; ++i) begin \ + if (wren[i]) begin \ + ram[__wa][i * WSELW +: WSELW] <= wdata[i * WSELW +: WSELW]; \ + end \ + end \ + end \ + end \ + assign __d = ram[__ra] `TRACING_OFF module VX_async_ram_patch #( @@ -47,6 +120,8 @@ module VX_async_ram_patch #( parameter SIZE = 1, parameter WRENW = 1, parameter DUAL_PORT = 0, + parameter FORCE_BRAM = 0, + parameter WRITE_FIRST = 0, parameter INIT_ENABLE = 0, parameter INIT_FILE = "", parameter [DATAW-1:0] INIT_VALUE = 0, @@ -79,77 +154,102 @@ module VX_async_ram_patch #( .out ({raddr_s, read_s, is_raddr_reg}) ); - // synchroneous ram - - wire [DATAW-1:0] rdata_s; + wire [DATAW-1:0] rdata_s, rdata_a; - if (WRENW != 1) begin : g_wren_sync_ram - `USE_BLOCK_BRAM reg [DATAW-1:0] ram [0:SIZE-1]; - reg [DATAW-1:0] rdata_r; - `RAM_INITIALIZATION - always @(posedge clk) begin - if (read_s || write) begin - if (write) begin - `RAM_WRITE_WREN + if (1) begin : g_sync_ram + if (WRENW != 1) begin : g_wren + if (FORCE_BRAM) begin : g_bram + if (WRITE_FIRST) begin : g_write_first + `define RAM_ATTRIBUTES `USE_BLOCK_BRAM + `SYNC_RAM_WF_WREN_BLOCK(rdata_s, read_s, write, raddr_s, waddr); + `undef RAM_ATTRIBUTES + end else begin : g_read_first + `define RAM_ATTRIBUTES `USE_BLOCK_BRAM + `SYNC_RAM_RF_WREN_BLOCK(rdata_s, read_s, write, raddr_s, waddr); + `undef RAM_ATTRIBUTES + end + end else begin : g_lutram + if (WRITE_FIRST) begin : g_write_first + `define RAM_ATTRIBUTES + `SYNC_RAM_WF_WREN_BLOCK(rdata_s, read_s, write, raddr_s, waddr); + `undef RAM_ATTRIBUTES + end else begin : g_read_first + `define RAM_ATTRIBUTES + `SYNC_RAM_RF_WREN_BLOCK(rdata_s, read_s, write, raddr_s, waddr); + `undef RAM_ATTRIBUTES end - rdata_r <= ram[raddr_s]; end - end - `RAM_BYPASS(rdata_s); - end else begin : g_no_wren_sync_ram - `USE_BLOCK_BRAM reg [DATAW-1:0] ram [0:SIZE-1]; - reg [DATAW-1:0] rdata_r; - `RAM_INITIALIZATION - `UNUSED_VAR (wren) - always @(posedge clk) begin - if (read_s || write) begin - if (write) begin - ram[waddr] <= wdata; + end else begin : g_no_wren + if (FORCE_BRAM) begin : g_bram + if (WRITE_FIRST) begin : g_write_first + `define RAM_ATTRIBUTES `USE_BLOCK_BRAM + `SYNC_RAM_WF_BLOCK(rdata_s, read_s, write, raddr_s, waddr); + `undef RAM_ATTRIBUTES + end else begin : g_read_first + `define RAM_ATTRIBUTES `USE_BLOCK_BRAM + `SYNC_RAM_RF_BLOCK(rdata_s, read_s, write, raddr_s, waddr); + `undef RAM_ATTRIBUTES + end + end else begin : g_lutram + if (WRITE_FIRST) begin : g_write_first + `define RAM_ATTRIBUTES + `SYNC_RAM_WF_BLOCK(rdata_s, read_s, write, raddr_s, waddr); + `undef RAM_ATTRIBUTES + end else begin : g_read_first + `define RAM_ATTRIBUTES + `SYNC_RAM_RF_BLOCK(rdata_s, read_s, write, raddr_s, waddr); + `undef RAM_ATTRIBUTES end - rdata_r <= ram[raddr_s]; end end - `RAM_BYPASS(rdata_s); end - // asynchronous ram (fallback) - - wire [DATAW-1:0] rdata_a; - - if (DUAL_PORT != 0) begin : g_dp_async_ram - reg [DATAW-1:0] ram [0:SIZE-1]; - `RAM_INITIALIZATION - if (WRENW != 1) begin : g_wren - always @(posedge clk) begin - if (write) begin - `RAM_WRITE_WREN + if (1) begin : g_async_ram + if (DUAL_PORT != 0) begin : g_dp + if (WRENW != 1) begin : g_wren + if (WRITE_FIRST) begin : g_write_first + `define RAM_ATTRIBUTES `RW_RAM_CHECK + `ASYNC_RAM_BLOCK_WREN(rdata_a, write, raddr, waddr); + `undef RAM_ATTRIBUTES + end else begin : g_read_first + `define RAM_ATTRIBUTES `NO_RW_RAM_CHECK + `ASYNC_RAM_BLOCK_WREN(rdata_a, write, raddr, waddr); + `undef RAM_ATTRIBUTES end - end - end else begin : g_no_wren - always @(posedge clk) begin - if (write) begin - ram[waddr] <= wdata; + end else begin : g_no_wren + if (WRITE_FIRST) begin : g_write_first + `define RAM_ATTRIBUTES `RW_RAM_CHECK + `ASYNC_RAM_BLOCK(rdata_a, write, raddr, waddr); + `undef RAM_ATTRIBUTES + end else begin : g_read_first + `define RAM_ATTRIBUTES `NO_RW_RAM_CHECK + `ASYNC_RAM_BLOCK(rdata_a, write, raddr, waddr); + `undef RAM_ATTRIBUTES end end - end - assign rdata_a = ram[raddr]; - end else begin : g_sp_async_ram - reg [DATAW-1:0] ram [0:SIZE-1]; - `RAM_INITIALIZATION - if (WRENW != 1) begin : g_wren - always @(posedge clk) begin - if (write) begin - `RAM_WRITE_WREN + end else begin : g_sp + if (WRENW != 1) begin : g_wren + if (WRITE_FIRST) begin : g_write_first + `define RAM_ATTRIBUTES `RW_RAM_CHECK + `ASYNC_RAM_BLOCK_WREN(rdata_a, write, waddr, waddr); + `undef RAM_ATTRIBUTES + end else begin : g_read_first + `define RAM_ATTRIBUTES `NO_RW_RAM_CHECK + `ASYNC_RAM_BLOCK_WREN(rdata_a, write, waddr, waddr); + `undef RAM_ATTRIBUTES end - end - end else begin : g_no_wren - always @(posedge clk) begin - if (write) begin - ram[waddr] <= wdata; + end else begin : g_no_wren + if (WRITE_FIRST) begin : g_write_first + `define RAM_ATTRIBUTES `RW_RAM_CHECK + `ASYNC_RAM_BLOCK(rdata_a, write, waddr, waddr); + `undef RAM_ATTRIBUTES + end else begin : g_read_first + `define RAM_ATTRIBUTES `NO_RW_RAM_CHECK + `ASYNC_RAM_BLOCK(rdata_a, write, waddr, waddr); + `undef RAM_ATTRIBUTES end end end - assign rdata_a = ram[waddr]; end assign rdata = is_raddr_reg ? rdata_s : rdata_a; diff --git a/hw/rtl/libs/VX_dp_ram.sv b/hw/rtl/libs/VX_dp_ram.sv index 0cff67882..2cb88efe5 100644 --- a/hw/rtl/libs/VX_dp_ram.sv +++ b/hw/rtl/libs/VX_dp_ram.sv @@ -80,7 +80,7 @@ module VX_dp_ram #( if (FORCE_BRAM) begin : g_bram if (RDW_MODE == "W") begin : g_write_first if (WRENW != 1) begin : g_wren - (* rw_addr_collision = "yes" *) `USE_BLOCK_BRAM `RAM_ARRAY_WREN + `RW_RAM_CHECK `USE_BLOCK_BRAM `RAM_ARRAY_WREN `RAM_INITIALIZATION reg [ADDRW-1:0] raddr_r; always @(posedge clk) begin @@ -93,7 +93,7 @@ module VX_dp_ram #( end assign rdata = ram[raddr_r]; end else begin : g_no_wren - (* rw_addr_collision = "yes" *) `USE_BLOCK_BRAM reg [DATAW-1:0] ram [0:SIZE-1]; + `RW_RAM_CHECK `USE_BLOCK_BRAM reg [DATAW-1:0] ram [0:SIZE-1]; `RAM_INITIALIZATION reg [ADDRW-1:0] raddr_r; always @(posedge clk) begin @@ -166,7 +166,7 @@ module VX_dp_ram #( end else begin : g_auto if (RDW_MODE == "W") begin : g_write_first if (WRENW != 1) begin : g_wren - (* rw_addr_collision = "yes" *) `RAM_ARRAY_WREN + `RW_RAM_CHECK `RAM_ARRAY_WREN `RAM_INITIALIZATION reg [ADDRW-1:0] raddr_r; always @(posedge clk) begin @@ -179,7 +179,7 @@ module VX_dp_ram #( end assign rdata = ram[raddr_r]; end else begin : g_no_wren - (* rw_addr_collision = "yes" *) reg [DATAW-1:0] ram [0:SIZE-1]; + `RW_RAM_CHECK reg [DATAW-1:0] ram [0:SIZE-1]; `RAM_INITIALIZATION reg [ADDRW-1:0] raddr_r; always @(posedge clk) begin @@ -220,7 +220,7 @@ module VX_dp_ram #( end assign rdata = rdata_r; end - end else begin + end else begin : g_undefined if (WRENW != 1) begin : g_wren `RAM_ARRAY_WREN `RAM_INITIALIZATION @@ -253,30 +253,32 @@ module VX_dp_ram #( end else begin : g_async `UNUSED_VAR (read) if (FORCE_BRAM) begin : g_bram + `ifdef VIVADO + VX_async_ram_patch #( + .DATAW (DATAW), + .SIZE (SIZE), + .WRENW (WRENW), + .DUAL_PORT (1), + .FORCE_BRAM (FORCE_BRAM), + .WRITE_FIRST(RDW_MODE == "W"), + .INIT_ENABLE(INIT_ENABLE), + .INIT_FILE (INIT_FILE), + .INIT_VALUE (INIT_VALUE) + ) async_ram_patch ( + .clk (clk), + .reset (reset), + .read (read), + .write (write), + .wren (wren), + .waddr (waddr), + .wdata (wdata), + .raddr (raddr), + .rdata (rdata) + ); + `else if (RDW_MODE == "W") begin : g_write_first - `ifdef VIVADO - VX_async_ram_patch #( - .DATAW (DATAW), - .SIZE (SIZE), - .WRENW (WRENW), - .DUAL_PORT (1), - .INIT_ENABLE(INIT_ENABLE), - .INIT_FILE (INIT_FILE), - .INIT_VALUE (INIT_VALUE) - ) async_ram_patch ( - .clk (clk), - .reset (reset), - .read (read), - .write (write), - .wren (wren), - .waddr (waddr), - .wdata (wdata), - .raddr (raddr), - .rdata (rdata) - ); - `else if (WRENW != 1) begin : g_wren - `USE_BLOCK_BRAM `RAM_ARRAY_WREN + `RW_RAM_CHECK `USE_BLOCK_BRAM `RAM_ARRAY_WREN `RAM_INITIALIZATION always @(posedge clk) begin if (write) begin @@ -285,7 +287,7 @@ module VX_dp_ram #( end assign rdata = ram[raddr]; end else begin : g_no_wren - `USE_BLOCK_BRAM reg [DATAW-1:0] ram [0:SIZE-1]; + `RW_RAM_CHECK `USE_BLOCK_BRAM reg [DATAW-1:0] ram [0:SIZE-1]; `RAM_INITIALIZATION always @(posedge clk) begin if (write) begin @@ -294,7 +296,6 @@ module VX_dp_ram #( end assign rdata = ram[raddr]; end - `endif end else begin : g_read_first if (WRENW != 1) begin : g_wren `NO_RW_RAM_CHECK `USE_BLOCK_BRAM `RAM_ARRAY_WREN @@ -316,10 +317,11 @@ module VX_dp_ram #( assign rdata = ram[raddr]; end end + `endif end else begin : g_auto if (RDW_MODE == "W") begin : g_write_first if (WRENW != 1) begin : g_wren - `RAM_ARRAY_WREN + `RW_RAM_CHECK `RAM_ARRAY_WREN `RAM_INITIALIZATION always @(posedge clk) begin if (write) begin @@ -328,7 +330,7 @@ module VX_dp_ram #( end assign rdata = ram[raddr]; end else begin : g_no_wren - reg [DATAW-1:0] ram [0:SIZE-1]; + `RW_RAM_CHECK reg [DATAW-1:0] ram [0:SIZE-1]; `RAM_INITIALIZATION always @(posedge clk) begin if (write) begin diff --git a/hw/rtl/libs/VX_rr_arbiter.sv b/hw/rtl/libs/VX_rr_arbiter.sv index 1d3b479bf..c86da584a 100644 --- a/hw/rtl/libs/VX_rr_arbiter.sv +++ b/hw/rtl/libs/VX_rr_arbiter.sv @@ -485,7 +485,7 @@ module VX_rr_arbiter #( .D (NUM_REQS) ) grant_decoder ( .sel_in (grant_index), - .data_in (1'b1), + .data_in (grant_valid), .data_out (grant_onehot) ); diff --git a/hw/rtl/libs/VX_sp_ram.sv b/hw/rtl/libs/VX_sp_ram.sv index 88b922384..3c673e462 100644 --- a/hw/rtl/libs/VX_sp_ram.sv +++ b/hw/rtl/libs/VX_sp_ram.sv @@ -77,20 +77,20 @@ module VX_sp_ram #( localparam FORCE_BRAM = !LUTRAM && (SIZE * DATAW >= `MAX_LUTRAM); if (OUT_REG) begin : g_sync if (FORCE_BRAM) begin : g_bram - if (RDW_MODE == "R") begin : g_read_first + if (RDW_MODE == "W") begin : g_write_first if (WRENW != 1) begin : g_wren - `USE_BLOCK_BRAM `RAM_ARRAY_WREN + `RW_RAM_CHECK `USE_BLOCK_BRAM `RAM_ARRAY_WREN `RAM_INITIALIZATION - reg [DATAW-1:0] rdata_r; + reg [ADDRW-1:0] addr_r; always @(posedge clk) begin if (read || write) begin if (write) begin `RAM_WRITE_WREN end - rdata_r <= ram[addr]; + addr_r <= addr; end end - assign rdata = rdata_r; + assign rdata = ram[addr_r]; end else begin : g_no_wren `USE_BLOCK_BRAM reg [DATAW-1:0] ram [0:SIZE-1]; `RAM_INITIALIZATION @@ -99,26 +99,28 @@ module VX_sp_ram #( if (read || write) begin if (write) begin ram[addr] <= wdata; + rdata_r <= wdata; + end else begin + rdata_r <= ram[addr]; end - rdata_r <= ram[addr]; end end assign rdata = rdata_r; end - end else if (RDW_MODE == "W") begin : g_write_first + end else if (RDW_MODE == "R") begin : g_read_first if (WRENW != 1) begin : g_wren `USE_BLOCK_BRAM `RAM_ARRAY_WREN `RAM_INITIALIZATION - reg [ADDRW-1:0] addr_r; + reg [DATAW-1:0] rdata_r; always @(posedge clk) begin if (read || write) begin if (write) begin `RAM_WRITE_WREN end - addr_r <= addr; + rdata_r <= ram[addr]; end end - assign rdata = ram[addr_r]; + assign rdata = rdata_r; end else begin : g_no_wren `USE_BLOCK_BRAM reg [DATAW-1:0] ram [0:SIZE-1]; `RAM_INITIALIZATION @@ -127,10 +129,8 @@ module VX_sp_ram #( if (read || write) begin if (write) begin ram[addr] <= wdata; - rdata_r <= wdata; - end else begin - rdata_r <= ram[addr]; end + rdata_r <= ram[addr]; end end assign rdata = rdata_r; @@ -165,7 +165,7 @@ module VX_sp_ram #( end assign rdata = rdata_r; end - end else if (RDW_MODE == "U") begin : g_unknown + end else if (RDW_MODE == "U") begin : g_undefined if (WRENW != 1) begin : g_wren `USE_BLOCK_BRAM `RAM_ARRAY_WREN `RAM_INITIALIZATION @@ -195,20 +195,20 @@ module VX_sp_ram #( end end end else begin : g_auto - if (RDW_MODE == "R") begin : g_read_first + if (RDW_MODE == "W") begin : g_write_first if (WRENW != 1) begin : g_wren `RAM_ARRAY_WREN `RAM_INITIALIZATION - reg [DATAW-1:0] rdata_r; + reg [ADDRW-1:0] addr_r; always @(posedge clk) begin if (read || write) begin if (write) begin `RAM_WRITE_WREN end - rdata_r <= ram[addr]; + addr_r <= addr; end end - assign rdata = rdata_r; + assign rdata = ram[addr_r]; end else begin : g_no_wren reg [DATAW-1:0] ram [0:SIZE-1]; `RAM_INITIALIZATION @@ -217,26 +217,28 @@ module VX_sp_ram #( if (read || write) begin if (write) begin ram[addr] <= wdata; + rdata_r <= wdata; + end else begin + rdata_r <= ram[addr]; end - rdata_r <= ram[addr]; end end assign rdata = rdata_r; end - end else if (RDW_MODE == "W") begin : g_write_first + end else if (RDW_MODE == "R") begin : g_read_first if (WRENW != 1) begin : g_wren `RAM_ARRAY_WREN `RAM_INITIALIZATION - reg [ADDRW-1:0] addr_r; + reg [DATAW-1:0] rdata_r; always @(posedge clk) begin if (read || write) begin if (write) begin `RAM_WRITE_WREN end - addr_r <= addr; + rdata_r <= ram[addr]; end end - assign rdata = ram[addr_r]; + assign rdata = rdata_r; end else begin : g_no_wren reg [DATAW-1:0] ram [0:SIZE-1]; `RAM_INITIALIZATION @@ -245,10 +247,8 @@ module VX_sp_ram #( if (read || write) begin if (write) begin ram[addr] <= wdata; - rdata_r <= wdata; - end else begin - rdata_r <= ram[addr]; end + rdata_r <= ram[addr]; end end assign rdata = rdata_r; @@ -283,7 +283,7 @@ module VX_sp_ram #( end assign rdata = rdata_r; end - end else if (RDW_MODE == "U") begin : g_unknown + end else if (RDW_MODE == "U") begin : g_undefined if (WRENW != 1) begin : g_wren `RAM_ARRAY_WREN `RAM_INITIALIZATION @@ -316,30 +316,32 @@ module VX_sp_ram #( end else begin : g_async `UNUSED_VAR (read) if (FORCE_BRAM) begin : g_bram + `ifdef VIVADO + VX_async_ram_patch #( + .DATAW (DATAW), + .SIZE (SIZE), + .WRENW (WRENW), + .DUAL_PORT (0), + .FORCE_BRAM (FORCE_BRAM), + .WRITE_FIRST(RDW_MODE == "W"), + .INIT_ENABLE(INIT_ENABLE), + .INIT_FILE (INIT_FILE), + .INIT_VALUE (INIT_VALUE) + ) async_ram_patch ( + .clk (clk), + .reset (reset), + .read (read), + .write (write), + .wren (wren), + .waddr (addr), + .wdata (wdata), + .raddr (addr), + .rdata (rdata) + ); + `else if (RDW_MODE == "W") begin : g_write_first - `ifdef VIVADO - VX_async_ram_patch #( - .DATAW (DATAW), - .SIZE (SIZE), - .WRENW (WRENW), - .DUAL_PORT (0), - .INIT_ENABLE(INIT_ENABLE), - .INIT_FILE (INIT_FILE), - .INIT_VALUE (INIT_VALUE) - ) async_ram_patch ( - .clk (clk), - .reset (reset), - .read (read), - .write (write), - .wren (wren), - .waddr (addr), - .wdata (wdata), - .raddr (addr), - .rdata (rdata) - ); - `else if (WRENW != 1) begin : g_wren - `USE_BLOCK_BRAM `RAM_ARRAY_WREN + `RW_RAM_CHECK `USE_BLOCK_BRAM `RAM_ARRAY_WREN `RAM_INITIALIZATION always @(posedge clk) begin if (write) begin @@ -348,7 +350,7 @@ module VX_sp_ram #( end assign rdata = ram[addr]; end else begin : g_no_wren - `USE_BLOCK_BRAM reg [DATAW-1:0] ram [0:SIZE-1]; + `RW_RAM_CHECK `USE_BLOCK_BRAM reg [DATAW-1:0] ram [0:SIZE-1]; `RAM_INITIALIZATION always @(posedge clk) begin if (write) begin @@ -357,7 +359,6 @@ module VX_sp_ram #( end assign rdata = ram[addr]; end - `endif end else begin : g_read_first if (WRENW != 1) begin : g_wren `NO_RW_RAM_CHECK `USE_BLOCK_BRAM `RAM_ARRAY_WREN @@ -379,10 +380,11 @@ module VX_sp_ram #( assign rdata = ram[addr]; end end + `endif end else begin : g_auto if (RDW_MODE == "W") begin : g_write_first if (WRENW != 1) begin : g_wren - `RAM_ARRAY_WREN + `RW_RAM_CHECK `RAM_ARRAY_WREN `RAM_INITIALIZATION always @(posedge clk) begin if (write) begin @@ -391,7 +393,7 @@ module VX_sp_ram #( end assign rdata = ram[addr]; end else begin : g_no_wren - reg [DATAW-1:0] ram [0:SIZE-1]; + `RW_RAM_CHECK reg [DATAW-1:0] ram [0:SIZE-1]; `RAM_INITIALIZATION always @(posedge clk) begin if (write) begin @@ -443,22 +445,22 @@ module VX_sp_ram #( end if (OUT_REG) begin : g_sync - if (RDW_MODE == "R") begin : g_read_first - reg [DATAW-1:0] rdata_r; + if (RDW_MODE == "W") begin : g_write_first + reg [ADDRW-1:0] addr_r; always @(posedge clk) begin if (read || write) begin - rdata_r <= ram[addr]; + addr_r <= addr; end end - assign rdata = rdata_r; - end else if (RDW_MODE == "W") begin : g_write_first - reg [ADDRW-1:0] addr_r; + assign rdata = ram[addr_r]; + end else if (RDW_MODE == "R") begin : g_read_first + reg [DATAW-1:0] rdata_r; always @(posedge clk) begin if (read || write) begin - addr_r <= addr; + rdata_r <= ram[addr]; end end - assign rdata = ram[addr_r]; + assign rdata = rdata_r; end else if (RDW_MODE == "N") begin : g_no_change reg [DATAW-1:0] rdata_r; always @(posedge clk) begin diff --git a/hw/scripts/xilinx_async_bram_patch.tcl b/hw/scripts/xilinx_async_bram_patch.tcl index 5af7ba953..f0a49ecd6 100644 --- a/hw/scripts/xilinx_async_bram_patch.tcl +++ b/hw/scripts/xilinx_async_bram_patch.tcl @@ -1,3 +1,16 @@ +# Copyright © 2019-2023 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + namespace eval vortex { variable debug 0 @@ -17,6 +30,25 @@ proc str_replace {str match repl} { return $result } +proc regex_escape {str} { + return [string map { + \\ \\\\ + ^ \\^ + . \\. + \[ \\\[ + \] \\\] + \$ \\\$ + \( \\\( + \) \\\) + | \\| + * \\* + + \\+ + ? \\? + \{ \\\{ + \} \\\} + } $str] +} + proc unique_cell_name {name} { if {[get_cells -quiet $name] == {}} { return $name } set index 0 @@ -31,29 +63,58 @@ proc unique_net_name {name} { return ${name}_${index} } -proc find_nested_cells {parent name_match {should_exist 1}} { - set matching_cells {} - foreach cell [get_cells -hierarchical -include_replicated_objects -filter "PARENT == $parent"] { - set name [get_property NAME $cell] - if {[regexp $name_match $name]} { - lappend matching_cells $cell +proc build_parent_child_map {all_cells} { + set parent_child_map {} + foreach cell $all_cells { + set parent [get_property PARENT $cell] + if {$parent ne ""} { + if {[dict exists $parent_child_map $parent]} { + dict lappend parent_child_map $parent $cell + } else { + dict set parent_child_map $parent [list $cell] + } } } - if {[llength $matching_cells] == 0} { - print_error "No matching cell found for '$parent' matching '$name_match'." $should_exist + return $parent_child_map +} + +proc find_cell_descendants_recursive {parent_cell parent_child_map} { + set descendants {} + if {[dict exists $parent_child_map $parent_cell]} { + set children [dict get $parent_child_map $parent_cell] + foreach child $children { + # Add the child to the list + lappend descendants $child + # Recursively add its descendants + set sub_descendants [find_cell_descendants_recursive $child $parent_child_map] + lappend descendants {*}$sub_descendants + } } - return $matching_cells + return $descendants } -proc find_nested_cell {parent name_match} { - foreach cell [get_cells -hierarchical -filter "PARENT == $parent"] { - set name [get_property NAME $cell] - if {$name == $name_match} { - return $cell +proc find_cell_descendants {parent_cell} { + set all_cells [get_cells -hierarchical] + set parent_child_map [build_parent_child_map $all_cells] + return [find_cell_descendants_recursive $parent_cell $parent_child_map] +} + +proc find_nested_cells {parent_cell name_match {should_exist 1}} { + set hier_sep [get_hierarchy_separator] + set matching_cells {} + foreach cell [find_cell_descendants $parent_cell] { + set parent_name [get_property PARENT $cell] + set cell_name [get_property NAME $cell] + set name_prefix [regex_escape "${parent_name}${hier_sep}"] + set pattern "${name_prefix}${name_match}" + if {[regexp $pattern $cell_name]} { + lappend matching_cells $cell } } - puts "ERROR: No matching cell found for '$parent' matching '$name_match'." - exit -1 + if {[llength $matching_cells] == 0} { + print_error "No matching cell found for '$parent_cell' matching '$name_match'." $should_exist + } + return $matching_cells } proc find_cell_nets {cell name_match {should_exist 1}} { @@ -70,22 +131,23 @@ proc find_cell_nets {cell name_match {should_exist 1}} { return $matching_nets } -proc get_cell_net {cell name_match} { - foreach net [get_nets -hierarchical -filter "PARENT_CELL == $cell"] { - set name [get_property NAME $net] - if {$name == $name_match} { - return $net - } +proc get_cell_net {cell name} { + set net [get_nets -hierarchical -filter "PARENT_CELL == $cell && NAME == $name"] + if {[llength $net] == 0} { + puts "ERROR: No matching net found for '$cell' matching '$name'." + exit -1 } - puts "ERROR: No matching net found for '$cell' matching '$name_match'." - exit -1 + return $net; } proc find_cell_pins {cell name_match {should_exist 1}} { + set hier_sep [get_hierarchy_separator] set matching_pins {} foreach pin [get_pins -of_objects $cell] { set name [get_property NAME $pin] - if {[regexp $name_match $name]} { + set name_prefix [regex_escape "${cell}${hier_sep}"] + set pattern "${name_prefix}${name_match}" + if {[regexp $pattern $name]} { lappend matching_pins $pin } } @@ -95,15 +157,31 @@ proc find_cell_pins {cell name_match {should_exist 1}} { return $matching_pins } -proc get_cell_pin {cell name_match} { - foreach pin [get_pins -of_objects $cell] { - set name [get_property NAME $pin] - if {$name == $name_match} { - return $pin - } +proc get_cell_pin {cell name} { + set pin [get_pins -of_objects $cell -filter "NAME == $name"] + if {[llength $pin] == 0} { + puts "ERROR: No matching pin found for '$cell' matching '$name'." + exit -1 } - puts "ERROR: No matching pin found for '$cell' matching '$name_match'." - exit -1 + return $pin +} + +proc remove_cell_from_netlist {cell} { + variable debug + + puts "INFO: Removing cell '$cell' from the netlist." + + # Disconnect all pins of the cell + #foreach pin [get_pins -quiet -of_objects $cell] { + # foreach net [get_nets -quiet -of_objects $pin] { + # disconnect_net -net $net -objects $pin + # if {$debug} {puts "DEBUG: Disconnected net '$net' from pin '$pin'."} + # } + #} + + # Remove the cell + remove_cell $cell + if {$debug} {puts "DEBUG: Cell '$cell' was removed successfully."} } proc replace_pin_source {pin source_pin} { @@ -141,10 +219,42 @@ proc replace_pin_source {pin source_pin} { if {$debug} {puts "DEBUG: Connected net '$source_net' to pin '$pin'."} } -proc create_register_next {reg_cell prefix_name} { +proc find_net_driver {input_net {should_exist 1}} { + set driverPins [get_pins -quiet -leaf -of_objects $input_net -filter {DIRECTION == "OUT"}] + if {[llength $driverPins] == 0} { + set driverPorts [get_ports -quiet -of_objects $input_net -filter {DIRECTION == "IN"}] + if {[llength $driverPorts] == 0} { + print_error "No driver found for '$input_net'." $should_exist + } elseif {[llength $driverPorts] > 1} { + puts "WARNING: Multiple driver ports found for '$input_net'." + return [lindex $driverPorts 0] + } + return $driverPorts + } elseif {[llength $driverPins] > 1} { + puts "WARNING: Multiple driver pins found for '$input_net'." + return [lindex $driverPins 0] + } + return $driverPins +} + +proc find_pin_driver {input_pin {should_exist 1}} { + set net [get_nets -quiet -of_objects $input_pin] + if {[llength $net] == 0} { + print_error "No net connected to pin '$input_pin'." $should_exist + return "" + } elseif {[llength $net] > 1} { + puts "ERROR: Multiple nets connected to pin '$input_pin'." + exit -1 + } + return [find_net_driver $net] +} + +proc create_register_next {parent reg_cell} { variable debug - set reg_d_pin [get_pins -of_objects $reg_cell -filter {NAME =~ "*/D"}] + set hier_sep [get_hierarchy_separator] + + set reg_d_pin [get_pins "${reg_cell}${hier_sep}D"] if {[llength $reg_d_pin] == 0} { puts "ERROR: No D pin found on register cell '$reg_cell'." exit -1 @@ -167,7 +277,7 @@ proc create_register_next {reg_cell prefix_name} { set register_type [get_property REF_NAME $reg_cell] if {$register_type == "FDRE"} { - set reg_r_pin [get_pins -of_objects $reg_cell -filter {NAME =~ "*/R"}] + set reg_r_pin [get_pins "${reg_cell}${hier_sep}R"] if {[llength $reg_r_pin] == 0} { puts "ERROR: No R pin found on FDRE cell '$reg_cell'." exit -1 @@ -184,7 +294,7 @@ proc create_register_next {reg_cell prefix_name} { exit -1 } } elseif {$register_type == "FDSE"} { - set reg_s_pin [get_pins -of_objects $reg_cell -filter {NAME =~ "*/S"}] + set reg_s_pin [get_pins "${reg_cell}${hier_sep}S"] if {[llength $reg_s_pin] == 0} { puts "ERROR: No S pin found on FDSE cell '$reg_cell'." exit -1 @@ -229,7 +339,7 @@ proc create_register_next {reg_cell prefix_name} { # Use a 2x1 LUT to describe the logic: # FDRE: O = I1 ? 0 : I0; where I0=D, I1=R # FDSE: O = I1 ? 1 : I0; where I0=D, I1=S - set lut_name [unique_cell_name $prefix_name] + set lut_name [unique_cell_name "${parent}${hier_sep}raddr_next"] set lut_cell [create_cell -reference LUT2 $lut_name] puts "INFO: Created lut cell: '$lut_cell'" @@ -242,7 +352,7 @@ proc create_register_next {reg_cell prefix_name} { exit 1 } - set lut_i0_pin [get_pins -of_objects $lut_cell -filter {NAME =~ "*/I0"}] + set lut_i0_pin [get_pins "${lut_cell}${hier_sep}I0"] if {[llength $lut_i0_pin] == 0} { puts "ERROR: No I0 pin found on FDSE cell '$lut_cell'." exit -1 @@ -251,7 +361,7 @@ proc create_register_next {reg_cell prefix_name} { exit -1 } - set lut_i1_pin [get_pins -of_objects $lut_cell -filter {NAME =~ "*/I1"}] + set lut_i1_pin [get_pins "${lut_cell}${hier_sep}I1"] if {[llength $lut_i1_pin] == 0} { puts "ERROR: No I1 pin found on FDSE cell '$lut_cell'." exit -1 @@ -260,7 +370,7 @@ proc create_register_next {reg_cell prefix_name} { exit -1 } - set lut_o_pin [get_pins -of_objects $lut_cell -filter {NAME =~ "*/O"}] + set lut_o_pin [get_pins "${lut_cell}${hier_sep}O"] if {[llength $lut_o_pin] == 0} { puts "ERROR: No O pin found on FDSE cell '$lut_cell'." exit -1 @@ -278,19 +388,22 @@ proc create_register_next {reg_cell prefix_name} { return $lut_o_pin } -proc getOrCreateVCCPin {prefix_name} { +proc getOrCreateVCCPin {parent} { variable debug - set vcc_cell "" - set vcc_cells [get_cells -quiet -filter {REF_NAME == VCC}] - if {[llength $vcc_cells] == 0} { - set cell_name [unique_cell_name $prefix_name] + set hier_sep [get_hierarchy_separator] + set cell_name "${parent}${hier_sep}VCC" + + set vcc_cell [get_cells -quiet $cell_name] + if {[llength $vcc_cell] == 0} { set vcc_cell [create_cell -reference VCC $cell_name] puts "INFO: Created VCC cell: '$vcc_cell'" - } else { - set vcc_cell [lindex $vcc_cells 0] + } elseif {[llength $vcc_cell] > 1} { + puts "ERROR: Multiple VCC cells found with name '$cell_name'." + exit -1 } - set vcc_pin [get_pins -of_objects $vcc_cell -filter {NAME =~ "*/P"}] + + set vcc_pin [get_pins "${vcc_cell}${hier_sep}P"] if {[llength $vcc_pin] == 0} { puts "ERROR: No VCC pin found on VCC cell '$vcc_cell'." exit -1 @@ -298,22 +411,26 @@ proc getOrCreateVCCPin {prefix_name} { puts "ERROR: Multiple VCC pins found on VCC cell '$vcc_cell'." exit -1 } + return $vcc_pin } -proc getOrCreateGNDPin {prefix_name} { +proc getOrCreateGNDPin {parent} { variable debug - set gnd_cell "" - set gnd_cells [get_cells -quiet -filter {REF_NAME == GND}] - if {[llength $gnd_cells] == 0} { - set cell_name [unique_cell_name $prefix_name] + set hier_sep [get_hierarchy_separator] + set cell_name "${parent}${hier_sep}GND" + + set gnd_cell [get_cells -quiet $cell_name] + if {[llength $gnd_cell] == 0} { set gnd_cell [create_cell -reference GND $cell_name] puts "INFO: Created GND cell: '$gnd_cell'" - } else { - set gnd_cell [lindex $gnd_cells 0] + } elseif {[llength $gnd_cell] > 1} { + puts "ERROR: Multiple GND cells found with name '$cell_name'." + exit -1 } - set gnd_pin [get_pins -of_objects $gnd_cell -filter {NAME =~ "*/G"}] + + set gnd_pin [get_pins "${gnd_cell}${hier_sep}G"] if {[llength $gnd_pin] == 0} { puts "ERROR: No GND pin found on GND cell '$gnd_cell'." exit -1 @@ -321,6 +438,7 @@ proc getOrCreateGNDPin {prefix_name} { puts "ERROR: Multiple GND pins found on GND cell '$gnd_cell'." exit -1 } + return $gnd_pin } @@ -338,35 +456,6 @@ proc find_net_sinks {input_net {should_exist 1}} { return $sink_pins } -proc find_net_driver {input_net {should_exist 1}} { - set driverPins [get_pins -quiet -leaf -of_objects $input_net -filter {DIRECTION == "OUT"}] - if {[llength $driverPins] == 0} { - set driverPorts [get_ports -quiet -of_objects $input_net -filter {DIRECTION == "IN"}] - if {[llength $driverPorts] == 0} { - print_error "No driver found for '$input_net'." $should_exist - } elseif {[llength $driverPorts] > 1} { - puts "WARNING: Multiple driver ports found for '$input_net'." - return [lindex $driverPorts 0] - } - return $driverPorts - } elseif {[llength $driverPins] > 1} { - puts "WARNING: Multiple driver pins found for '$input_net'." - return [lindex $driverPins 0] - } - return $driverPins -} - -proc find_pin_driver {input_pin {should_exist 1}} { - set net [get_nets -quiet -of_objects $input_pin] - if {[llength $net] == 0} { - print_error "No net connected to pin '$input_pin'." $should_exist - } elseif {[llength $net] > 1} { - puts "ERROR: Multiple nets connected to pin '$input_pin'." - exit -1 - } - return [find_net_driver $net] -} - proc find_matching_nets {cell nets match repl} { set matching_nets {} foreach net $nets { @@ -386,6 +475,25 @@ proc find_matching_nets {cell nets match repl} { return $matching_nets } +proc find_matching_pins {cell pins match repl} { + set matching_pins {} + foreach pin $pins { + set pin_name [str_replace $pin $match $repl] + set matching_pin [get_cell_pin $cell $pin_name] + if {$matching_pin != ""} { + lappend matching_pins $matching_pin + } + } + if {[llength $matching_pins] == 0} { + puts "ERROR: No matching pins found for '$pins'." + exit -1 + } elseif {[llength $matching_pins] != [llength $pins]} { + puts "ERROR: Mismatch in number of matching pins." + exit -1 + } + return $matching_pins +} + proc replace_net_source {net source_pin} { foreach pin [find_net_sinks $net 0] { replace_pin_source $pin $source_pin @@ -397,6 +505,8 @@ proc resolve_async_bram {inst} { puts "INFO: Resolving asynchronous BRAM patch: '$inst'." + set hier_sep [get_hierarchy_separator] + set raddr_w_nets [find_cell_nets $inst "raddr_w(\\\[\\d+\\\])?$"] set read_s_net [find_cell_nets $inst "read_s$"] set is_raddr_reg_net [find_cell_nets $inst "is_raddr_reg$"] @@ -433,7 +543,7 @@ proc resolve_async_bram {inst} { } # Create register next cell and return output pin - set reg_next_pin [create_register_next $raddr_src_cell "$inst/raddr_next"] + set reg_next_pin [create_register_next $inst $raddr_src_cell] if {$reg_next_pin == ""} { puts "ERROR: failed to create register next value for '$raddr_src_cell'." exit -1 @@ -444,7 +554,7 @@ proc resolve_async_bram {inst} { # Find the CE pin on raddr_src_cell if {$reg_ce_src_pin == ""} { - set reg_ce_pin [get_pins -of_objects $raddr_src_cell -filter {NAME =~ "*/CE"}] + set reg_ce_pin [get_pins "${raddr_src_cell}${hier_sep}CE"] if {[llength $reg_ce_pin] == 0} { puts "ERROR: No CE pin found on register cell '$raddr_src_cell'." exit -1 @@ -466,9 +576,10 @@ proc resolve_async_bram {inst} { # do we have a fully registered read address? if {[llength $reg_next_pins] == [llength $raddr_w_nets]} { puts "INFO: Fully registered read address detected." + + # Connect all reg_next_pins to all input pins attached to raddr_s_nets set addr_width [llength $raddr_w_nets] for {set addr_idx 0} {$addr_idx < $addr_width} {incr addr_idx} { - set raddr_w_net [lindex $raddr_w_nets $addr_idx] set raddr_s_net [lindex $raddr_s_nets $addr_idx] set reg_next_pin [lindex $reg_next_pins $addr_idx] puts "INFO: Connecting pin '$reg_next_pin' to '$raddr_s_net's pins." @@ -481,7 +592,7 @@ proc resolve_async_bram {inst} { replace_net_source $read_s_net $reg_ce_src_pin # Create Const<1>'s pin - set vcc_pin [getOrCreateVCCPin "$inst/VCC"] + set vcc_pin [getOrCreateVCCPin $inst] # Connect vcc_pin to all input pins attached to is_raddr_reg_net puts "INFO: Connecting pin '$vcc_pin' to '$is_raddr_reg_net's pins." @@ -490,18 +601,16 @@ proc resolve_async_bram {inst} { puts "WARNING: Not all read addresses are registered!" # Create Const<0>'s pin - set gnd_pin [getOrCreateGNDPin "$inst/GND"] + set gnd_pin [getOrCreateGNDPin $inst] # Connect gnd_pin to all input pins attached to is_raddr_reg_net puts "INFO: Connecting pin '$gnd_pin' to '$is_raddr_reg_net's pins." replace_net_source $is_raddr_reg_net $gnd_pin } - # Remove all placeholder cells - foreach cell [find_nested_cells $inst "placeholder$"] { - remove_cell $cell - if {$debug} {puts "DEBUG: Cell '$cell' was removed successfully."} - } + # Remove placeholder cell + set placeholder [get_cells "${inst}${hier_sep}placeholder"] + remove_cell_from_netlist $placeholder } proc resolve_async_brams {} { diff --git a/hw/scripts/xilinx_export_netlist.tcl b/hw/scripts/xilinx_export_netlist.tcl index 25a0d17e8..a6ff22ff5 100644 --- a/hw/scripts/xilinx_export_netlist.tcl +++ b/hw/scripts/xilinx_export_netlist.tcl @@ -1,3 +1,16 @@ +# Copyright © 2019-2023 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + # Function to export netlist to a Graphviz DOT file proc export_netlist {dot_file_name} { # Open the DOT file for writing diff --git a/hw/syn/xilinx/README b/hw/syn/xilinx/README index 0fb83e71b..a1ca231fe 100644 --- a/hw/syn/xilinx/README +++ b/hw/syn/xilinx/README @@ -47,6 +47,9 @@ TARGET=hw PLATFORM=xilinx_u50_gen3x16_xdma_5_202210_1 make chipscope # analyze build report vitis_analyzer build_xilinx_u50_gen3x16_xdma_5_202210_1_hw_4c/bin/vortex_afu.xclbin.link_summary +# resuming build for routing +TARGET=hw PLATFORM=xilinx_u55c_gen3x16_xdma_3_202210_1 VPP_FLAGS="--from_step vpl.impl.route_design" make > build.log 2>&1 & + # running test FPGA_BIN_DIR= TARGET=hw_emu ./ci/blackbox.sh --driver=xrt --app=demo FPGA_BIN_DIR= TARGET=hw ./ci/blackbox.sh --driver=xrt --app=demo diff --git a/hw/syn/xilinx/xrt/Makefile b/hw/syn/xilinx/xrt/Makefile index 643724069..288031e2e 100644 --- a/hw/syn/xilinx/xrt/Makefile +++ b/hw/syn/xilinx/xrt/Makefile @@ -180,6 +180,7 @@ ifeq ($(TARGET), hw) cp $(BUILD_DIR)/_x/logs/link/vivado.log $(BUILD_DIR)/bin cp $(BUILD_DIR)/_x/logs/link/syn/ulp_vortex_afu_1_0_synth_1_runme.log $(BUILD_DIR)/bin cp $(BUILD_DIR)/_x/reports/link/syn/ulp_vortex_afu_1_0_synth_1_ulp_vortex_afu_1_0_utilization_synth.rpt $(BUILD_DIR)/bin + cp $(BUILD_DIR)/_x/reports/link/imp/impl_1_hw_bb_locked_utilization_placed.rpt $(BUILD_DIR)/bin cp $(BUILD_DIR)/_x/reports/link/imp/impl_1_hw_bb_locked_timing_summary_routed.rpt $(BUILD_DIR)/bin endif From b0c48e7a46dbd5169c500c4e51f6949587184c67 Mon Sep 17 00:00:00 2001 From: tinebp Date: Wed, 20 Nov 2024 18:27:52 -0800 Subject: [PATCH 03/36] stream buffer area optimization --- hw/rtl/libs/VX_stream_buffer.sv | 39 ++++++++++++++++++++------------- 1 file changed, 24 insertions(+), 15 deletions(-) diff --git a/hw/rtl/libs/VX_stream_buffer.sv b/hw/rtl/libs/VX_stream_buffer.sv index 4b77df83d..2cf08c0f4 100644 --- a/hw/rtl/libs/VX_stream_buffer.sv +++ b/hw/rtl/libs/VX_stream_buffer.sv @@ -86,38 +86,47 @@ module VX_stream_buffer #( end else begin : g_no_out_reg - reg [1:0][DATAW-1:0] shift_reg; - reg [1:0] fifo_state, fifo_state_n; + reg [DATAW-1:0] data_out_r, buffer; + reg valid_in_r, valid_out_r; wire fire_in = valid_in && ready_in; wire fire_out = valid_out && ready_out; - always @(*) begin - case ({fire_in, fire_out}) - 2'b10: fifo_state_n = {fifo_state[0], 1'b1}; // 00 -> 01, 01 -> 10 - 2'b01: fifo_state_n = {1'b0, fifo_state[1]}; // 10 -> 01, 01 -> 00 - default: fifo_state_n = fifo_state; - endcase + always @(posedge clk) begin + if (reset) begin + valid_in_r <= 1'b1; + end else begin + if (fire_in ^ fire_out) begin + valid_in_r <= valid_out_r ^ fire_in; + end + end end always @(posedge clk) begin if (reset) begin - fifo_state <= 2'b00; + valid_out_r <= 1'b0; end else begin - fifo_state <= fifo_state_n; + if (fire_in ^ fire_out) begin + valid_out_r <= valid_in_r ^ fire_out; + end end end always @(posedge clk) begin if (fire_in) begin - shift_reg[1] <= shift_reg[0]; - shift_reg[0] <= data_in; + data_out_r <= data_in; end end - assign ready_in = ~fifo_state[1]; - assign valid_out = fifo_state[0]; - assign data_out = shift_reg[fifo_state[1]]; + always @(posedge clk) begin + if (fire_in) begin + buffer <= data_out_r; + end + end + + assign ready_in = valid_in_r; + assign valid_out = valid_out_r; + assign data_out = valid_in_r ? data_out_r : buffer; end From 8d8769c7100b9abcad3d1c1ff0eb011d2cfbb5dc Mon Sep 17 00:00:00 2001 From: tinebp Date: Wed, 20 Nov 2024 19:15:51 -0800 Subject: [PATCH 04/36] stream_buffer area optimization --- hw/rtl/libs/VX_stream_buffer.sv | 88 +++++++++++++-------------------- 1 file changed, 33 insertions(+), 55 deletions(-) diff --git a/hw/rtl/libs/VX_stream_buffer.sv b/hw/rtl/libs/VX_stream_buffer.sv index 2cf08c0f4..ea4467cb3 100644 --- a/hw/rtl/libs/VX_stream_buffer.sv +++ b/hw/rtl/libs/VX_stream_buffer.sv @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -// A stream elastic buffer operates at full-bandwidth where fire_in and fire_out can happen simultaneously +// A stream elastic buffer_r operates at full-bandwidth where fire_in and fire_out can happen simultaneously // It has the following benefits: // + full-bandwidth throughput // + ready_in and ready_out are decoupled @@ -45,88 +45,66 @@ module VX_stream_buffer #( assign valid_out = valid_in; assign data_out = data_in; - end else if (OUT_REG != 0) begin : g_out_reg + end else begin : g_buffer - reg [DATAW-1:0] data_out_r; - reg [DATAW-1:0] buffer; - reg valid_out_r; - reg no_buffer; + reg [DATAW-1:0] data_out_r, buffer_r; + reg valid_out_r, valid_in_r; wire fire_in = valid_in && ready_in; wire flow_out = ready_out || ~valid_out; always @(posedge clk) begin if (reset) begin - valid_out_r <= 0; - no_buffer <= 1; - end else begin - if (flow_out) begin - no_buffer <= 1; - end else if (valid_in) begin - no_buffer <= 0; - end - if (flow_out) begin - valid_out_r <= valid_in || ~no_buffer; - end + valid_in_r <= 1'b1; + end else if (valid_in || flow_out) begin + valid_in_r <= flow_out; end end always @(posedge clk) begin - if (fire_in) begin - buffer <= data_in; - end - if (flow_out) begin - data_out_r <= no_buffer ? data_in : buffer; + if (reset) begin + valid_out_r <= 1'b0; + end else if (flow_out) begin + valid_out_r <= valid_in || ~valid_in_r; end end - assign ready_in = no_buffer; - assign valid_out = valid_out_r; - assign data_out = data_out_r; + if (OUT_REG != 0) begin : g_out_reg - end else begin : g_no_out_reg + always @(posedge clk) begin + if (fire_in) begin + buffer_r <= data_in; + end + end - reg [DATAW-1:0] data_out_r, buffer; - reg valid_in_r, valid_out_r; + always @(posedge clk) begin + if (flow_out) begin + data_out_r <= valid_in_r ? data_in : buffer_r; + end + end - wire fire_in = valid_in && ready_in; - wire fire_out = valid_out && ready_out; + assign data_out = data_out_r; - always @(posedge clk) begin - if (reset) begin - valid_in_r <= 1'b1; - end else begin - if (fire_in ^ fire_out) begin - valid_in_r <= valid_out_r ^ fire_in; + end else begin : g_no_out_reg + + always @(posedge clk) begin + if (fire_in) begin + data_out_r <= data_in; end end - end - always @(posedge clk) begin - if (reset) begin - valid_out_r <= 1'b0; - end else begin - if (fire_in ^ fire_out) begin - valid_out_r <= valid_in_r ^ fire_out; + always @(posedge clk) begin + if (fire_in) begin + buffer_r <= data_out_r; end end - end - always @(posedge clk) begin - if (fire_in) begin - data_out_r <= data_in; - end - end + assign data_out = valid_in_r ? data_out_r : buffer_r; - always @(posedge clk) begin - if (fire_in) begin - buffer <= data_out_r; - end end - assign ready_in = valid_in_r; assign valid_out = valid_out_r; - assign data_out = valid_in_r ? data_out_r : buffer; + assign ready_in = valid_in_r; end From 180735c531df8f4dafcc484814ea2600ce9cb711 Mon Sep 17 00:00:00 2001 From: tinebp Date: Thu, 21 Nov 2024 16:47:00 -0800 Subject: [PATCH 05/36] fifoqueue area optimization --- hw/rtl/libs/VX_fifo_queue.sv | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/hw/rtl/libs/VX_fifo_queue.sv b/hw/rtl/libs/VX_fifo_queue.sv index 720a1a2c6..f3cc65b7b 100644 --- a/hw/rtl/libs/VX_fifo_queue.sv +++ b/hw/rtl/libs/VX_fifo_queue.sv @@ -90,9 +90,6 @@ module VX_fifo_queue #( end end - wire going_empty = (ALM_EMPTY == 1) ? alm_empty : (size[ADDRW-1:0] == ADDRW'(1)); - wire bypass = push && (empty || (going_empty && pop)); - VX_dp_ram #( .DATAW (DATAW), .SIZE (DEPTH), @@ -101,7 +98,7 @@ module VX_fifo_queue #( ) dp_ram ( .clk (clk), .reset (reset), - .read (~bypass), + .read (1'b1), .write (push), .wren (1'b1), .raddr (rd_ptr_r), @@ -112,11 +109,10 @@ module VX_fifo_queue #( if (OUT_REG != 0) begin : g_out_reg reg [DATAW-1:0] data_out_r; + wire going_empty = (ALM_EMPTY == 1) ? alm_empty : (size[ADDRW-1:0] == ADDRW'(1)); always @(posedge clk) begin - if (bypass) begin - data_out_r <= data_in; - end else if (pop) begin - data_out_r <= data_out_w; + if (pop || (push && empty)) begin + data_out_r <= (empty || going_empty) ? data_in : data_out_w; end end assign data_out = data_out_r; From 18bf49d1e0254e4236a51355edc5c11e1116d624 Mon Sep 17 00:00:00 2001 From: tinebp Date: Thu, 21 Nov 2024 16:48:18 -0800 Subject: [PATCH 06/36] minor update --- hw/scripts/xilinx_async_bram_patch.tcl | 34 ++++++++++++++++++++++++-- 1 file changed, 32 insertions(+), 2 deletions(-) diff --git a/hw/scripts/xilinx_async_bram_patch.tcl b/hw/scripts/xilinx_async_bram_patch.tcl index f0a49ecd6..e4a684e3b 100644 --- a/hw/scripts/xilinx_async_bram_patch.tcl +++ b/hw/scripts/xilinx_async_bram_patch.tcl @@ -597,6 +597,11 @@ proc resolve_async_bram {inst} { # Connect vcc_pin to all input pins attached to is_raddr_reg_net puts "INFO: Connecting pin '$vcc_pin' to '$is_raddr_reg_net's pins." replace_net_source $is_raddr_reg_net $vcc_pin + + # Remove all async_ram cells + foreach cell [find_nested_cells $inst "g_async_ram.*" 0] { + remove_cell_from_netlist $cell + } } else { puts "WARNING: Not all read addresses are registered!" @@ -606,11 +611,17 @@ proc resolve_async_bram {inst} { # Connect gnd_pin to all input pins attached to is_raddr_reg_net puts "INFO: Connecting pin '$gnd_pin' to '$is_raddr_reg_net's pins." replace_net_source $is_raddr_reg_net $gnd_pin + + # Remove all sync_ram cells + foreach cell [find_nested_cells $inst "g_sync_ram.*" 0] { + remove_cell_from_netlist $cell + } } # Remove placeholder cell - set placeholder [get_cells "${inst}${hier_sep}placeholder"] - remove_cell_from_netlist $placeholder + foreach cell [find_nested_cells $inst "placeholder$"] { + remove_cell_from_netlist $cell + } } proc resolve_async_brams {} { @@ -628,7 +639,26 @@ proc resolve_async_brams {} { } } +proc dump_async_bram_cells {} { + set bram_patch_cells [get_cells -hierarchical -filter {REF_NAME =~ "*VX_async_ram_patch*"}] + if {[llength $bram_patch_cells] != 0} { + foreach cell $bram_patch_cells { + puts "INFO: Found async BRAM patch cell: '$cell'." + set child_cells [find_cell_descendants $cell] + foreach child $child_cells { + set type [get_property REF_NAME $child] + puts "INFO: child cell: '$child', type: '$type'" + } + } + } else { + puts "INFO: No async BRAM patch cells found in the design." + } +} + } # Invoke the procedure to resolve async BRAM vortex::resolve_async_brams + +# dump async bram cells +#vortex::dump_async_bram_cells From 7c4ce748011e33f8f9e1ce0e2c65744d3f5dd187 Mon Sep 17 00:00:00 2001 From: tinebp Date: Thu, 21 Nov 2024 16:48:41 -0800 Subject: [PATCH 07/36] memory unit timing optimization --- hw/rtl/core/VX_mem_unit.sv | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hw/rtl/core/VX_mem_unit.sv b/hw/rtl/core/VX_mem_unit.sv index 931ad65cd..98491e73d 100644 --- a/hw/rtl/core/VX_mem_unit.sv +++ b/hw/rtl/core/VX_mem_unit.sv @@ -47,7 +47,7 @@ module VX_mem_unit import VX_gpu_pkg::*; #( for (genvar i = 0; i < `NUM_LSU_BLOCKS; ++i) begin : g_lmem_switches VX_lmem_switch #( - .REQ0_OUT_BUF (3), + .REQ0_OUT_BUF (1), .REQ1_OUT_BUF (0), .RSP_OUT_BUF (1), .ARBITER ("P") @@ -78,7 +78,7 @@ module VX_mem_unit import VX_gpu_pkg::*; #( .TAG_SEL_BITS (LSU_TAG_WIDTH - `UUID_WIDTH), .ARBITER ("P"), .REQ_OUT_BUF (3), - .RSP_OUT_BUF (0) + .RSP_OUT_BUF (2) ) lmem_adapter ( .clk (clk), .reset (reset), From 3e4bbfc9f04d29e67bb23b4d25497744ebf85aaa Mon Sep 17 00:00:00 2001 From: tinebp Date: Fri, 22 Nov 2024 11:12:17 -0800 Subject: [PATCH 08/36] minor update --- hw/rtl/libs/VX_fifo_queue.sv | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/hw/rtl/libs/VX_fifo_queue.sv b/hw/rtl/libs/VX_fifo_queue.sv index f3cc65b7b..c7a4aab6d 100644 --- a/hw/rtl/libs/VX_fifo_queue.sv +++ b/hw/rtl/libs/VX_fifo_queue.sv @@ -110,9 +110,12 @@ module VX_fifo_queue #( if (OUT_REG != 0) begin : g_out_reg reg [DATAW-1:0] data_out_r; wire going_empty = (ALM_EMPTY == 1) ? alm_empty : (size[ADDRW-1:0] == ADDRW'(1)); + wire bypass = push && (empty || (going_empty && pop)); always @(posedge clk) begin - if (pop || (push && empty)) begin - data_out_r <= (empty || going_empty) ? data_in : data_out_w; + if (bypass) begin + data_out_r <= data_in; + end else if (pop) begin + data_out_r <= data_out_w; end end assign data_out = data_out_r; From 1e4583ac17cb600b74a6d104395759eed1dbb601 Mon Sep 17 00:00:00 2001 From: MichaelJSr Date: Tue, 26 Nov 2024 18:41:01 -0800 Subject: [PATCH 09/36] Adds the riscv vector extension into simx --- ci/regression.sh.in | 16 +- hw/rtl/VX_config.vh | 4 + hw/rtl/VX_types.vh | 13 + perf/cache/cache_perf.log | 2 +- sim/common/rvfloats.cpp | 34 + sim/common/rvfloats.h | 5 + sim/common/softfloat_ext.cpp | 486 ++ sim/common/softfloat_ext.h | 14 + sim/opaesim/Makefile | 2 +- sim/rtlsim/Makefile | 2 +- sim/simx/Makefile | 4 +- sim/simx/arch.h | 6 + sim/simx/decode.cpp | 184 +- sim/simx/emulator.cpp | 75 + sim/simx/emulator.h | 88 +- sim/simx/execute.cpp | 141 +- sim/simx/execute_vector.cpp | 4493 +++++++++++++++++ sim/simx/instr.h | 89 +- sim/simx/types.h | 4 +- sim/xrtsim/Makefile | 2 +- tests/riscv/riscv-vector-tests/README | 39 + tests/riscv/riscv-vector-tests/run-test.sh.in | 117 + 22 files changed, 5716 insertions(+), 104 deletions(-) create mode 100644 sim/common/softfloat_ext.cpp create mode 100644 sim/common/softfloat_ext.h create mode 100644 sim/simx/execute_vector.cpp create mode 100644 tests/riscv/riscv-vector-tests/README create mode 100755 tests/riscv/riscv-vector-tests/run-test.sh.in diff --git a/ci/regression.sh.in b/ci/regression.sh.in index 849a8769f..53819490f 100755 --- a/ci/regression.sh.in +++ b/ci/regression.sh.in @@ -386,10 +386,20 @@ synthesis() echo "synthesis tests done!" } +vector() +{ + echo "begin vector tests..." + + make -C sim/simx + TOOLDIR=@TOOLDIR@ XLEN=@XLEN@ VLEN=256 REG_TESTS=1 ./tests/riscv/riscv-vector-tests/run-test.sh + + echo "vector tests done!" +} + show_usage() { echo "Vortex Regression Test" - echo "Usage: $0 [--clean] [--unittest] [--isa] [--kernel] [--regression] [--opencl] [--cache] [--config1] [--config2] [--debug] [--scope] [--stress] [--synthesis] [--all] [--h|--help]" + echo "Usage: $0 [--clean] [--unittest] [--isa] [--kernel] [--regression] [--opencl] [--cache] [--config1] [--config2] [--debug] [--scope] [--stress] [--synthesis] [--vector] [--all] [--h|--help]" } declare -a tests=() @@ -439,6 +449,9 @@ while [ "$1" != "" ]; do --synthesis ) tests+=("synthesis") ;; + --vector ) + tests+=("vector") + ;; --all ) tests=() tests+=("unittest") @@ -454,6 +467,7 @@ while [ "$1" != "" ]; do tests+=("scope") tests+=("stress") tests+=("synthesis") + tests+=("vector") ;; -h | --help ) show_usage diff --git a/hw/rtl/VX_config.vh b/hw/rtl/VX_config.vh index 29eb5c9d8..3badaa3d3 100644 --- a/hw/rtl/VX_config.vh +++ b/hw/rtl/VX_config.vh @@ -87,6 +87,10 @@ `endif `endif +`ifndef VLEN +`define VLEN 256 +`endif + `ifndef NUM_CLUSTERS `define NUM_CLUSTERS 1 `endif diff --git a/hw/rtl/VX_types.vh b/hw/rtl/VX_types.vh index 048ba0a5c..4c8505e5e 100644 --- a/hw/rtl/VX_types.vh +++ b/hw/rtl/VX_types.vh @@ -188,6 +188,19 @@ `define VX_CSR_MIMPID 12'hF13 `define VX_CSR_MHARTID 12'hF14 +// Vector CSRs + +`define VX_CSR_VSTART 12'h008 +`define VX_CSR_VXSAT 12'h009 +`define VX_CSR_VXRM 12'h00A +`define VX_CSR_VCSR 12'h00F +`define VX_CSR_VL 12'hC20 +`define VX_CSR_VTYPE 12'hC21 +`define VX_CSR_VLENB 12'hC22 +`define VX_CSR_VCYCLE 12'hC00 +`define VX_CSR_VTIME 12'hC01 +`define VX_CSR_VINSTRET 12'hC02 + // GPGU CSRs `define VX_CSR_THREAD_ID 12'hCC0 diff --git a/perf/cache/cache_perf.log b/perf/cache/cache_perf.log index 21a446d25..0a4a55cc8 100644 --- a/perf/cache/cache_perf.log +++ b/perf/cache/cache_perf.log @@ -1,3 +1,3 @@ CONFIGS=-DNUM_CLUSTERS=1 -DNUM_CORES=1 -DNUM_WARPS=2 -DNUM_THREADS=2 -DPERF_ENABLE -DICACHE_NUM_WAYS=1 running: CONFIGS=-DNUM_CLUSTERS=1 -DNUM_CORES=1 -DNUM_WARPS=2 -DNUM_THREADS=2 -DPERF_ENABLE -DICACHE_NUM_WAYS=1 make -C ./ci/../driver/rtlsim -verilator --build --exe --cc Vortex --top-module Vortex --language 1800-2009 --assert -Wall -Wpedantic -Wno-DECLFILENAME -Wno-REDEFMACRO --x-initial unique --x-assign unique verilator.vlt -I../../hw/rtl -I../../hw/dpi -I../../hw/rtl/libs -I../../hw/rtl/interfaces -I../../hw/rtl/cache -I../../hw/rtl/simulate -I../../hw/rtl/fp_cores -I../../third_party/fpnew/src/common_cells/include -I../../third_party/fpnew/src/common_cells/src -I../../third_party/fpnew/src/fpu_div_sqrt_mvp/hdl -I../../third_party/fpnew/src -I../../hw/rtl/tex_unit -I../../hw/rtl/raster_unit -I../../hw/rtl/rop_unit -DNUM_CLUSTERS=1 -DNUM_CORES=1 -DNUM_WARPS=2 -DNUM_THREADS=2 -DPERF_ENABLE -DICACHE_NUM_WAYS=1 -j 64 -DNDEBUG -DIMUL_DPI -DIDIV_DPI -DFPU_DPI ../common/util.cpp ../common/mem.cpp ../common/rvfloats.cpp ../../hw/dpi/util_dpi.cpp ../../hw/dpi/float_dpi.cpp processor.cpp -CFLAGS '-std=c++11 -Wall -Wextra -Wfatal-errors -Wno-array-bounds -fPIC -Wno-maybe-uninitialized -I../../../hw -I../../common -I../../../third_party/softfloat/source/include -I../../../third_party -DNUM_CLUSTERS=1 -DNUM_CORES=1 -DNUM_WARPS=2 -DNUM_THREADS=2 -DPERF_ENABLE -DICACHE_NUM_WAYS=1 -O2 -DNDEBUG' -LDFLAGS '-shared ../../../third_party/softfloat/build/Linux-x86_64-GCC/softfloat.a -L../../../third_party/ramulator -lramulator' -o ../../../driver/rtlsim/librtlsim.so +verilator --build --exe --cc Vortex --top-module Vortex --language 1800-2009 --assert -Wall -Wpedantic -Wno-DECLFILENAME -Wno-REDEFMACRO --x-initial unique --x-assign unique verilator.vlt -I../../hw/rtl -I../../hw/dpi -I../../hw/rtl/libs -I../../hw/rtl/interfaces -I../../hw/rtl/cache -I../../hw/rtl/simulate -I../../hw/rtl/fp_cores -I../../third_party/fpnew/src/common_cells/include -I../../third_party/fpnew/src/common_cells/src -I../../third_party/fpnew/src/fpu_div_sqrt_mvp/hdl -I../../third_party/fpnew/src -I../../hw/rtl/tex_unit -I../../hw/rtl/raster_unit -I../../hw/rtl/rop_unit -DNUM_CLUSTERS=1 -DNUM_CORES=1 -DNUM_WARPS=2 -DNUM_THREADS=2 -DPERF_ENABLE -DICACHE_NUM_WAYS=1 -j 64 -DNDEBUG -DIMUL_DPI -DIDIV_DPI -DFPU_DPI ../common/util.cpp ../common/mem.cpp ../common/softfloat_ext.cpp ../common/rvfloats.cpp ../../hw/dpi/util_dpi.cpp ../../hw/dpi/float_dpi.cpp processor.cpp -CFLAGS '-std=c++11 -Wall -Wextra -Wfatal-errors -Wno-array-bounds -fPIC -Wno-maybe-uninitialized -I../../../hw -I../../common -I../../../third_party/softfloat/source/include -I../../../third_party -DNUM_CLUSTERS=1 -DNUM_CORES=1 -DNUM_WARPS=2 -DNUM_THREADS=2 -DPERF_ENABLE -DICACHE_NUM_WAYS=1 -O2 -DNDEBUG' -LDFLAGS '-shared ../../../third_party/softfloat/build/Linux-x86_64-GCC/softfloat.a -L../../../third_party/ramulator -lramulator' -o ../../../driver/rtlsim/librtlsim.so diff --git a/sim/common/rvfloats.cpp b/sim/common/rvfloats.cpp index 3e577f7f9..2b252010c 100644 --- a/sim/common/rvfloats.cpp +++ b/sim/common/rvfloats.cpp @@ -12,6 +12,7 @@ // limitations under the License. #include "rvfloats.h" +#include "softfloat_ext.h" #include extern "C" { @@ -158,6 +159,34 @@ uint64_t rv_fdiv_d(uint64_t a, uint64_t b, uint32_t frm, uint32_t* fflags) { return from_float64_t(r); } +uint32_t rv_frecip7_s(uint32_t a, uint32_t frm, uint32_t* fflags) { + softfloat_roundingMode = frm; + auto r = f32_recip7(to_float32_t(a)); + if (fflags) { *fflags = softfloat_exceptionFlags; } + return from_float32_t(r); +} + +uint64_t rv_frecip7_d(uint64_t a, uint32_t frm, uint32_t* fflags) { + softfloat_roundingMode = frm; + auto r = f64_recip7(to_float64_t(a)); + if (fflags) { *fflags = softfloat_exceptionFlags; } + return from_float64_t(r); +} + +uint32_t rv_frsqrt7_s(uint32_t a, uint32_t frm, uint32_t* fflags) { + softfloat_roundingMode = frm; + auto r = f32_rsqrte7(to_float32_t(a)); + if (fflags) { *fflags =softfloat_exceptionFlags; } + return from_float32_t(r); +} + +uint64_t rv_frsqrt7_d(uint64_t a, uint32_t frm, uint32_t* fflags) { + softfloat_roundingMode = frm; + auto r = f64_rsqrte7(to_float64_t(a)); + if (fflags) { *fflags = softfloat_exceptionFlags; } + return from_float64_t(r); +} + uint32_t rv_fsqrt_s(uint32_t a, uint32_t frm, uint32_t* fflags) { rv_init(frm); auto r = f32_sqrt(to_float32_t(a)); @@ -486,6 +515,11 @@ uint64_t rv_fsgnjx_d(uint64_t a, uint64_t b) { return r; } +uint32_t rv_dtof_r(uint64_t a, uint32_t frm) { + rv_init(frm); + return rv_dtof(a); +} + uint32_t rv_dtof(uint64_t a) { auto r = f64_to_f32(to_float64_t(a)); return from_float32_t(r); diff --git a/sim/common/rvfloats.h b/sim/common/rvfloats.h index d921846dd..86b60e8ee 100644 --- a/sim/common/rvfloats.h +++ b/sim/common/rvfloats.h @@ -28,6 +28,8 @@ uint32_t rv_fnmadd_s(uint32_t a, uint32_t b, uint32_t c, uint32_t frm, uint32_t* uint32_t rv_fnmsub_s(uint32_t a, uint32_t b, uint32_t c, uint32_t frm, uint32_t* fflags); uint32_t rv_fdiv_s(uint32_t a, uint32_t b, uint32_t frm, uint32_t* fflags); uint32_t rv_fsqrt_s(uint32_t a, uint32_t frm, uint32_t* fflags); +uint32_t rv_frecip7_s(uint32_t a, uint32_t frm, uint32_t* fflags); +uint32_t rv_frsqrt7_s(uint32_t a, uint32_t frm, uint32_t* fflags); uint32_t rv_ftoi_s(uint32_t a, uint32_t frm, uint32_t* fflags); uint32_t rv_ftou_s(uint32_t a, uint32_t frm, uint32_t* fflags); @@ -58,6 +60,8 @@ uint64_t rv_fsub_d(uint64_t a, uint64_t b, uint32_t frm, uint32_t* fflags); uint64_t rv_fmul_d(uint64_t a, uint64_t b, uint32_t frm, uint32_t* fflags); uint64_t rv_fdiv_d(uint64_t a, uint64_t b, uint32_t frm, uint32_t* fflags); uint64_t rv_fsqrt_d(uint64_t a, uint32_t frm, uint32_t* fflags); +uint64_t rv_frecip7_d(uint64_t a, uint32_t frm, uint32_t* fflags); +uint64_t rv_frsqrt7_d(uint64_t a, uint32_t frm, uint32_t* fflags); uint64_t rv_fmadd_d(uint64_t a, uint64_t b, uint64_t c, uint32_t frm, uint32_t* fflags); uint64_t rv_fmsub_d(uint64_t a, uint64_t b, uint64_t c, uint32_t frm, uint32_t* fflags); @@ -85,6 +89,7 @@ uint64_t rv_fmin_d(uint64_t a, uint64_t b, uint32_t* fflags); uint64_t rv_fmax_d(uint64_t a, uint64_t b, uint32_t* fflags); uint32_t rv_dtof(uint64_t a); +uint32_t rv_dtof_r(uint64_t a, uint32_t frm); uint64_t rv_ftod(uint32_t a); #ifdef __cplusplus diff --git a/sim/common/softfloat_ext.cpp b/sim/common/softfloat_ext.cpp new file mode 100644 index 000000000..877bdc8ac --- /dev/null +++ b/sim/common/softfloat_ext.cpp @@ -0,0 +1,486 @@ +/*============================================================================ + +This C source file is part of the SoftFloat IEEE Floating-Point Arithmetic +Package, Release 3e, by John R. Hauser. + +Copyright 2011, 2012, 2013, 2014, 2015, 2016 The Regents of the University of +California. All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, + this list of conditions, and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright notice, + this list of conditions, and the following disclaimer in the documentation + and/or other materials provided with the distribution. + + 3. Neither the name of the University nor the names of its contributors may + be used to endorse or promote products derived from this software without + specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS "AS IS", AND ANY +EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ARE +DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +=============================================================================*/ + +#include +#include +#include +#include <../RISCV/specialize.h> +#include +#include "softfloat_ext.h" + +uint_fast16_t f16_classify( float16_t a ) +{ + union ui16_f16 uA; + uint_fast16_t uiA; + + uA.f = a; + uiA = uA.ui; + + uint_fast16_t infOrNaN = expF16UI( uiA ) == 0x1F; + uint_fast16_t subnormalOrZero = expF16UI( uiA ) == 0; + bool sign = signF16UI( uiA ); + bool fracZero = fracF16UI( uiA ) == 0; + bool isNaN = isNaNF16UI( uiA ); + bool isSNaN = softfloat_isSigNaNF16UI( uiA ); + + return + ( sign && infOrNaN && fracZero ) << 0 | + ( sign && !infOrNaN && !subnormalOrZero ) << 1 | + ( sign && subnormalOrZero && !fracZero ) << 2 | + ( sign && subnormalOrZero && fracZero ) << 3 | + ( !sign && infOrNaN && fracZero ) << 7 | + ( !sign && !infOrNaN && !subnormalOrZero ) << 6 | + ( !sign && subnormalOrZero && !fracZero ) << 5 | + ( !sign && subnormalOrZero && fracZero ) << 4 | + ( isNaN && isSNaN ) << 8 | + ( isNaN && !isSNaN ) << 9; +} + +uint_fast16_t f32_classify( float32_t a ) +{ + union ui32_f32 uA; + uint_fast32_t uiA; + + uA.f = a; + uiA = uA.ui; + + uint_fast16_t infOrNaN = expF32UI( uiA ) == 0xFF; + uint_fast16_t subnormalOrZero = expF32UI( uiA ) == 0; + bool sign = signF32UI( uiA ); + bool fracZero = fracF32UI( uiA ) == 0; + bool isNaN = isNaNF32UI( uiA ); + bool isSNaN = softfloat_isSigNaNF32UI( uiA ); + + return + ( sign && infOrNaN && fracZero ) << 0 | + ( sign && !infOrNaN && !subnormalOrZero ) << 1 | + ( sign && subnormalOrZero && !fracZero ) << 2 | + ( sign && subnormalOrZero && fracZero ) << 3 | + ( !sign && infOrNaN && fracZero ) << 7 | + ( !sign && !infOrNaN && !subnormalOrZero ) << 6 | + ( !sign && subnormalOrZero && !fracZero ) << 5 | + ( !sign && subnormalOrZero && fracZero ) << 4 | + ( isNaN && isSNaN ) << 8 | + ( isNaN && !isSNaN ) << 9; +} + +uint_fast16_t f64_classify( float64_t a ) +{ + union ui64_f64 uA; + uint_fast64_t uiA; + + uA.f = a; + uiA = uA.ui; + + uint_fast16_t infOrNaN = expF64UI( uiA ) == 0x7FF; + uint_fast16_t subnormalOrZero = expF64UI( uiA ) == 0; + bool sign = signF64UI( uiA ); + bool fracZero = fracF64UI( uiA ) == 0; + bool isNaN = isNaNF64UI( uiA ); + bool isSNaN = softfloat_isSigNaNF64UI( uiA ); + + return + ( sign && infOrNaN && fracZero ) << 0 | + ( sign && !infOrNaN && !subnormalOrZero ) << 1 | + ( sign && subnormalOrZero && !fracZero ) << 2 | + ( sign && subnormalOrZero && fracZero ) << 3 | + ( !sign && infOrNaN && fracZero ) << 7 | + ( !sign && !infOrNaN && !subnormalOrZero ) << 6 | + ( !sign && subnormalOrZero && !fracZero ) << 5 | + ( !sign && subnormalOrZero && fracZero ) << 4 | + ( isNaN && isSNaN ) << 8 | + ( isNaN && !isSNaN ) << 9; +} + +static inline uint64_t extract64(uint64_t val, int pos, int len) +{ + assert(pos >= 0 && len > 0 && len <= 64 - pos); + return (val >> pos) & (~UINT64_C(0) >> (64 - len)); +} + +static inline uint64_t make_mask64(int pos, int len) +{ + assert(pos >= 0 && len > 0 && pos < 64 && len <= 64); + return (UINT64_MAX >> (64 - len)) << pos; +} + +//user needs to truncate output to required length +static inline uint64_t rsqrte7(uint64_t val, int e, int s, bool sub) { + uint64_t exp = extract64(val, s, e); + uint64_t sig = extract64(val, 0, s); + uint64_t sign = extract64(val, s + e, 1); + const int p = 7; + + static const uint8_t table[] = { + 52, 51, 50, 48, 47, 46, 44, 43, + 42, 41, 40, 39, 38, 36, 35, 34, + 33, 32, 31, 30, 30, 29, 28, 27, + 26, 25, 24, 23, 23, 22, 21, 20, + 19, 19, 18, 17, 16, 16, 15, 14, + 14, 13, 12, 12, 11, 10, 10, 9, + 9, 8, 7, 7, 6, 6, 5, 4, + 4, 3, 3, 2, 2, 1, 1, 0, + 127, 125, 123, 121, 119, 118, 116, 114, + 113, 111, 109, 108, 106, 105, 103, 102, + 100, 99, 97, 96, 95, 93, 92, 91, + 90, 88, 87, 86, 85, 84, 83, 82, + 80, 79, 78, 77, 76, 75, 74, 73, + 72, 71, 70, 70, 69, 68, 67, 66, + 65, 64, 63, 63, 62, 61, 60, 59, + 59, 58, 57, 56, 56, 55, 54, 53}; + + if (sub) { + while (extract64(sig, s - 1, 1) == 0) + exp--, sig <<= 1; + + sig = (sig << 1) & make_mask64(0 ,s); + } + + int idx = ((exp & 1) << (p-1)) | (sig >> (s-p+1)); + uint64_t out_sig = (uint64_t)(table[idx]) << (s-p); + uint64_t out_exp = (3 * make_mask64(0, e - 1) + ~exp) / 2; + + return (sign << (s+e)) | (out_exp << s) | out_sig; +} + +float16_t f16_rsqrte7(float16_t in) +{ + union ui16_f16 uA; + + uA.f = in; + unsigned int ret = f16_classify(in); + bool sub = false; + switch(ret) { + case 0x001: // -inf + case 0x002: // -normal + case 0x004: // -subnormal + case 0x100: // sNaN + softfloat_exceptionFlags |= softfloat_flag_invalid; + [[fallthrough]]; + case 0x200: //qNaN + uA.ui = defaultNaNF16UI; + break; + case 0x008: // -0 + uA.ui = 0xfc00; + softfloat_exceptionFlags |= softfloat_flag_infinite; + break; + case 0x010: // +0 + uA.ui = 0x7c00; + softfloat_exceptionFlags |= softfloat_flag_infinite; + break; + case 0x080: //+inf + uA.ui = 0x0; + break; + case 0x020: //+ sub + sub = true; + [[fallthrough]]; + default: // +num + uA.ui = rsqrte7(uA.ui, 5, 10, sub); + break; + } + + return uA.f; +} + +float32_t f32_rsqrte7(float32_t in) +{ + union ui32_f32 uA; + + uA.f = in; + unsigned int ret = f32_classify(in); + bool sub = false; + switch(ret) { + case 0x001: // -inf + case 0x002: // -normal + case 0x004: // -subnormal + case 0x100: // sNaN + softfloat_exceptionFlags |= softfloat_flag_invalid; + [[fallthrough]]; + case 0x200: //qNaN + uA.ui = defaultNaNF32UI; + break; + case 0x008: // -0 + uA.ui = 0xff800000; + softfloat_exceptionFlags |= softfloat_flag_infinite; + break; + case 0x010: // +0 + uA.ui = 0x7f800000; + softfloat_exceptionFlags |= softfloat_flag_infinite; + break; + case 0x080: //+inf + uA.ui = 0x0; + break; + case 0x020: //+ sub + sub = true; + [[fallthrough]]; + default: // +num + uA.ui = rsqrte7(uA.ui, 8, 23, sub); + break; + } + + return uA.f; +} + +float64_t f64_rsqrte7(float64_t in) +{ + union ui64_f64 uA; + + uA.f = in; + unsigned int ret = f64_classify(in); + bool sub = false; + switch(ret) { + case 0x001: // -inf + case 0x002: // -normal + case 0x004: // -subnormal + case 0x100: // sNaN + softfloat_exceptionFlags |= softfloat_flag_invalid; + [[fallthrough]]; + case 0x200: //qNaN + uA.ui = defaultNaNF64UI; + break; + case 0x008: // -0 + uA.ui = 0xfff0000000000000ul; + softfloat_exceptionFlags |= softfloat_flag_infinite; + break; + case 0x010: // +0 + uA.ui = 0x7ff0000000000000ul; + softfloat_exceptionFlags |= softfloat_flag_infinite; + break; + case 0x080: //+inf + uA.ui = 0x0; + break; + case 0x020: //+ sub + sub = true; + [[fallthrough]]; + default: // +num + uA.ui = rsqrte7(uA.ui, 11, 52, sub); + break; + } + + return uA.f; +} + +//user needs to truncate output to required length +static inline uint64_t recip7(uint64_t val, int e, int s, int rm, bool sub, + bool *round_abnormal) +{ + uint64_t exp = extract64(val, s, e); + uint64_t sig = extract64(val, 0, s); + uint64_t sign = extract64(val, s + e, 1); + const int p = 7; + + static const uint8_t table[] = { + 127, 125, 123, 121, 119, 117, 116, 114, + 112, 110, 109, 107, 105, 104, 102, 100, + 99, 97, 96, 94, 93, 91, 90, 88, + 87, 85, 84, 83, 81, 80, 79, 77, + 76, 75, 74, 72, 71, 70, 69, 68, + 66, 65, 64, 63, 62, 61, 60, 59, + 58, 57, 56, 55, 54, 53, 52, 51, + 50, 49, 48, 47, 46, 45, 44, 43, + 42, 41, 40, 40, 39, 38, 37, 36, + 35, 35, 34, 33, 32, 31, 31, 30, + 29, 28, 28, 27, 26, 25, 25, 24, + 23, 23, 22, 21, 21, 20, 19, 19, + 18, 17, 17, 16, 15, 15, 14, 14, + 13, 12, 12, 11, 11, 10, 9, 9, + 8, 8, 7, 7, 6, 5, 5, 4, + 4, 3, 3, 2, 2, 1, 1, 0}; + + if (sub) { + while (extract64(sig, s - 1, 1) == 0) + exp--, sig <<= 1; + + sig = (sig << 1) & make_mask64(0 ,s); + + if (exp != 0 && exp != UINT64_MAX) { + *round_abnormal = true; + if (rm == 1 || + (rm == 2 && !sign) || + (rm == 3 && sign)) + return ((sign << (s+e)) | make_mask64(s, e)) - 1; + else + return (sign << (s+e)) | make_mask64(s, e); + } + } + + int idx = sig >> (s-p); + uint64_t out_sig = (uint64_t)(table[idx]) << (s-p); + uint64_t out_exp = 2 * make_mask64(0, e - 1) + ~exp; + if (out_exp == 0 || out_exp == UINT64_MAX) { + out_sig = (out_sig >> 1) | make_mask64(s - 1, 1); + if (out_exp == UINT64_MAX) { + out_sig >>= 1; + out_exp = 0; + } + } + + return (sign << (s+e)) | (out_exp << s) | out_sig; +} + +float16_t f16_recip7(float16_t in) +{ + union ui16_f16 uA; + + uA.f = in; + unsigned int ret = f16_classify(in); + bool sub = false; + bool round_abnormal = false; + switch(ret) { + case 0x001: // -inf + uA.ui = 0x8000; + break; + case 0x080: //+inf + uA.ui = 0x0; + break; + case 0x008: // -0 + uA.ui = 0xfc00; + softfloat_exceptionFlags |= softfloat_flag_infinite; + break; + case 0x010: // +0 + uA.ui = 0x7c00; + softfloat_exceptionFlags |= softfloat_flag_infinite; + break; + case 0x100: // sNaN + softfloat_exceptionFlags |= softfloat_flag_invalid; + [[fallthrough]]; + case 0x200: //qNaN + uA.ui = defaultNaNF16UI; + break; + case 0x004: // -subnormal + case 0x020: //+ sub + sub = true; + [[fallthrough]]; + default: // +- normal + uA.ui = recip7(uA.ui, 5, 10, + softfloat_roundingMode, sub, &round_abnormal); + if (round_abnormal) + softfloat_exceptionFlags |= softfloat_flag_inexact | + softfloat_flag_overflow; + break; + } + + return uA.f; +} + +float32_t f32_recip7(float32_t in) +{ + union ui32_f32 uA; + + uA.f = in; + unsigned int ret = f32_classify(in); + bool sub = false; + bool round_abnormal = false; + switch(ret) { + case 0x001: // -inf + uA.ui = 0x80000000; + break; + case 0x080: //+inf + uA.ui = 0x0; + break; + case 0x008: // -0 + uA.ui = 0xff800000; + softfloat_exceptionFlags |= softfloat_flag_infinite; + break; + case 0x010: // +0 + uA.ui = 0x7f800000; + softfloat_exceptionFlags |= softfloat_flag_infinite; + break; + case 0x100: // sNaN + softfloat_exceptionFlags |= softfloat_flag_invalid; + [[fallthrough]]; + case 0x200: //qNaN + uA.ui = defaultNaNF32UI; + break; + case 0x004: // -subnormal + case 0x020: //+ sub + sub = true; + [[fallthrough]]; + default: // +- normal + uA.ui = recip7(uA.ui, 8, 23, + softfloat_roundingMode, sub, &round_abnormal); + if (round_abnormal) + softfloat_exceptionFlags |= softfloat_flag_inexact | + softfloat_flag_overflow; + break; + } + + return uA.f; +} + +float64_t f64_recip7(float64_t in) +{ + union ui64_f64 uA; + + uA.f = in; + unsigned int ret = f64_classify(in); + bool sub = false; + bool round_abnormal = false; + switch(ret) { + case 0x001: // -inf + uA.ui = 0x8000000000000000; + break; + case 0x080: //+inf + uA.ui = 0x0; + break; + case 0x008: // -0 + uA.ui = 0xfff0000000000000; + softfloat_exceptionFlags |= softfloat_flag_infinite; + break; + case 0x010: // +0 + uA.ui = 0x7ff0000000000000; + softfloat_exceptionFlags |= softfloat_flag_infinite; + break; + case 0x100: // sNaN + softfloat_exceptionFlags |= softfloat_flag_invalid; + [[fallthrough]]; + case 0x200: //qNaN + uA.ui = defaultNaNF64UI; + break; + case 0x004: // -subnormal + case 0x020: //+ sub + sub = true; + [[fallthrough]]; + default: // +- normal + uA.ui = recip7(uA.ui, 11, 52, + softfloat_roundingMode, sub, &round_abnormal); + if (round_abnormal) + softfloat_exceptionFlags |= softfloat_flag_inexact | + softfloat_flag_overflow; + break; + } + + return uA.f; +} \ No newline at end of file diff --git a/sim/common/softfloat_ext.h b/sim/common/softfloat_ext.h new file mode 100644 index 000000000..7a18af9f7 --- /dev/null +++ b/sim/common/softfloat_ext.h @@ -0,0 +1,14 @@ +#include +#include + +uint_fast16_t f16_classify( float16_t ); +float16_t f16_rsqrte7( float16_t ); +float16_t f16_recip7( float16_t ); + +uint_fast16_t f32_classify( float32_t ); +float32_t f32_rsqrte7( float32_t ); +float32_t f32_recip7( float32_t ); + +uint_fast16_t f64_classify( float64_t ); +float64_t f64_rsqrte7( float64_t ); +float64_t f64_recip7( float64_t ); \ No newline at end of file diff --git a/sim/opaesim/Makefile b/sim/opaesim/Makefile index b04f8ddb4..49b0f4ab8 100644 --- a/sim/opaesim/Makefile +++ b/sim/opaesim/Makefile @@ -51,7 +51,7 @@ endif DBG_FLAGS += -DDEBUG_LEVEL=$(DEBUG) -DVCD_OUTPUT $(DBG_TRACE_FLAGS) -SRCS = $(COMMON_DIR)/util.cpp $(COMMON_DIR)/mem.cpp $(COMMON_DIR)/rvfloats.cpp $(COMMON_DIR)/dram_sim.cpp +SRCS = $(COMMON_DIR)/util.cpp $(COMMON_DIR)/mem.cpp $(COMMON_DIR)/softfloat_ext.cpp $(COMMON_DIR)/rvfloats.cpp $(COMMON_DIR)/dram_sim.cpp SRCS += $(DPI_DIR)/util_dpi.cpp $(DPI_DIR)/float_dpi.cpp SRCS += $(SRC_DIR)/fpga.cpp $(SRC_DIR)/opae_sim.cpp diff --git a/sim/rtlsim/Makefile b/sim/rtlsim/Makefile index ecaee717b..3903bbd85 100644 --- a/sim/rtlsim/Makefile +++ b/sim/rtlsim/Makefile @@ -35,7 +35,7 @@ ifneq (,$(findstring FPU_FPNEW,$(CONFIGS))) endif RTL_INCLUDE = -I$(RTL_DIR) -I$(DPI_DIR) -I$(RTL_DIR)/libs -I$(RTL_DIR)/interfaces -I$(RTL_DIR)/core -I$(RTL_DIR)/mem -I$(RTL_DIR)/cache $(FPU_INCLUDE) -SRCS = $(COMMON_DIR)/util.cpp $(COMMON_DIR)/mem.cpp $(COMMON_DIR)/rvfloats.cpp $(COMMON_DIR)/dram_sim.cpp +SRCS = $(COMMON_DIR)/util.cpp $(COMMON_DIR)/mem.cpp $(COMMON_DIR)/softfloat_ext.cpp $(COMMON_DIR)/rvfloats.cpp $(COMMON_DIR)/dram_sim.cpp SRCS += $(DPI_DIR)/util_dpi.cpp $(DPI_DIR)/float_dpi.cpp SRCS += $(SRC_DIR)/processor.cpp diff --git a/sim/simx/Makefile b/sim/simx/Makefile index 31fde7023..b97e9c00f 100644 --- a/sim/simx/Makefile +++ b/sim/simx/Makefile @@ -17,8 +17,8 @@ CXXFLAGS += $(CONFIGS) LDFLAGS += $(THIRD_PARTY_DIR)/softfloat/build/Linux-x86_64-GCC/softfloat.a LDFLAGS += -Wl,-rpath,$(THIRD_PARTY_DIR)/ramulator -L$(THIRD_PARTY_DIR)/ramulator -lramulator -SRCS = $(COMMON_DIR)/util.cpp $(COMMON_DIR)/mem.cpp $(COMMON_DIR)/rvfloats.cpp $(COMMON_DIR)/dram_sim.cpp -SRCS += $(SRC_DIR)/processor.cpp $(SRC_DIR)/cluster.cpp $(SRC_DIR)/socket.cpp $(SRC_DIR)/core.cpp $(SRC_DIR)/emulator.cpp $(SRC_DIR)/decode.cpp $(SRC_DIR)/execute.cpp $(SRC_DIR)/func_unit.cpp $(SRC_DIR)/cache_sim.cpp $(SRC_DIR)/mem_sim.cpp $(SRC_DIR)/local_mem.cpp $(SRC_DIR)/mem_coalescer.cpp $(SRC_DIR)/dcrs.cpp $(SRC_DIR)/types.cpp +SRCS = $(COMMON_DIR)/util.cpp $(COMMON_DIR)/mem.cpp $(COMMON_DIR)/softfloat_ext.cpp $(COMMON_DIR)/softfloat_ext.cpp $(COMMON_DIR)/rvfloats.cpp $(COMMON_DIR)/dram_sim.cpp +SRCS += $(SRC_DIR)/processor.cpp $(SRC_DIR)/cluster.cpp $(SRC_DIR)/socket.cpp $(SRC_DIR)/core.cpp $(SRC_DIR)/emulator.cpp $(SRC_DIR)/decode.cpp $(SRC_DIR)/execute.cpp $(SRC_DIR)/execute_vector.cpp $(SRC_DIR)/func_unit.cpp $(SRC_DIR)/cache_sim.cpp $(SRC_DIR)/mem_sim.cpp $(SRC_DIR)/local_mem.cpp $(SRC_DIR)/mem_coalescer.cpp $(SRC_DIR)/dcrs.cpp $(SRC_DIR)/types.cpp # Debugging ifdef DEBUG diff --git a/sim/simx/arch.h b/sim/simx/arch.h index 6becf5c91..d68345db6 100644 --- a/sim/simx/arch.h +++ b/sim/simx/arch.h @@ -29,6 +29,7 @@ class Arch { uint16_t num_cores_; uint16_t num_clusters_; uint16_t socket_size_; + uint16_t vsize_; uint16_t num_barriers_; uint64_t local_mem_base_; @@ -39,6 +40,7 @@ class Arch { , num_cores_(num_cores) , num_clusters_(NUM_CLUSTERS) , socket_size_(SOCKET_SIZE) + , vsize_(VLEN / 8) , num_barriers_(NUM_BARRIERS) , local_mem_base_(LMEM_BASE_ADDR) {} @@ -71,6 +73,10 @@ class Arch { return socket_size_; } + uint16_t vsize() const { + return vsize_; + } + }; } \ No newline at end of file diff --git a/sim/simx/decode.cpp b/sim/simx/decode.cpp index 7a37e79e2..3c184879d 100644 --- a/sim/simx/decode.cpp +++ b/sim/simx/decode.cpp @@ -47,6 +47,7 @@ static const std::unordered_map sc_instTable = { {Opcode::FMSUB, InstType::R4}, {Opcode::FMNMADD, InstType::R4}, {Opcode::FMNMSUB, InstType::R4}, + {Opcode::VSET, InstType::V}, {Opcode::EXT1, InstType::R}, {Opcode::EXT2, InstType::R4}, {Opcode::R_W, InstType::R}, @@ -54,33 +55,6 @@ static const std::unordered_map sc_instTable = { {Opcode::TCU, InstType::I}, }; -enum Constants { - width_opcode= 7, - width_reg = 5, - width_func2 = 2, - width_func3 = 3, - width_func7 = 7, - width_i_imm = 12, - width_j_imm = 20, - - shift_opcode= 0, - shift_rd = width_opcode, - shift_func3 = shift_rd + width_reg, - shift_rs1 = shift_func3 + width_func3, - shift_rs2 = shift_rs1 + width_reg, - shift_func2 = shift_rs2 + width_reg, - shift_func7 = shift_rs2 + width_reg, - shift_rs3 = shift_func7 + width_func2, - - mask_opcode = (1 << width_opcode) - 1, - mask_reg = (1 << width_reg) - 1, - mask_func2 = (1 << width_func2) - 1, - mask_func3 = (1 << width_func3) - 1, - mask_func7 = (1 << width_func7) - 1, - mask_i_imm = (1 << width_i_imm) - 1, - mask_j_imm = (1 << width_j_imm) - 1, -}; - static const char* op_string(const Instr &instr) { auto opcode = instr.getOpcode(); auto func2 = instr.getFunc2(); @@ -230,10 +204,14 @@ static const char* op_string(const Instr &instr) { case Opcode::FENCE: return "FENCE"; case Opcode::FL: switch (func3) { - case 0x1: return "VL"; case 0x2: return "FLW"; case 0x3: return "FLD"; + case 0x0: return "VL8"; + case 0x5: return "VL16"; + case 0x6: return "VL32"; + case 0x7: return "VL64"; default: + std::cout << "Could not decode float/vector load with func3: " << func3 << std::endl; std::abort(); } case Opcode::FS: @@ -241,7 +219,12 @@ static const char* op_string(const Instr &instr) { case 0x1: return "VS"; case 0x2: return "FSW"; case 0x3: return "FSD"; + case 0x0: return "VS8"; + case 0x5: return "VS16"; + case 0x6: return "VS32"; + case 0x7: return "VS64"; default: + std::cout << "Could not decode float/vector store with func3: " << func3 << std::endl; std::abort(); } case Opcode::AMO: { @@ -390,6 +373,7 @@ static const char* op_string(const Instr &instr) { case Opcode::FMSUB: return func2 ? "FMSUB.D" : "FMSUB.S"; case Opcode::FMNMADD: return func2 ? "FNMADD.D" : "FNMADD.S"; case Opcode::FMNMSUB: return func2 ? "FNMSUB.D" : "FNMSUB.S"; + case Opcode::VSET: return "VSET"; case Opcode::EXT1: switch (func7) { case 0: @@ -421,6 +405,39 @@ static const char* op_string(const Instr &instr) { } } +inline void vec_log(std::ostream &os, const Instr &instr) { + if (instr.getVUseMask() & set_func3) + os << ", func3:" << instr.getFunc3(); + if (instr.getVUseMask() & set_func6) + os << ", func6:" << instr.getFunc6(); + if (instr.getVUseMask() & set_imm) + os << ", imm:" << instr.getImm(); + if (instr.getVUseMask() & set_vlswidth) + os << ", width:" << instr.getVlsWidth(); + if (instr.getVUseMask() & set_vmop) + os << ", mop:" << instr.getVmop(); + if (instr.getVUseMask() & set_vumop) + os << ", umop:" << instr.getVumop(); + if (instr.getVUseMask() & set_vnf) + os << ", nf:" << instr.getVnf(); + if (instr.getVUseMask() & set_vmask) + os << ", vmask:" << instr.getVmask(); + if (instr.getVUseMask() & set_vs3) + os << ", vs3:" << instr.getVs3(); + if (instr.getVUseMask() & set_zimm) + os << ", zimm:" << ((instr.hasZimm()) ? "true" : "false"); + if (instr.getVUseMask() & set_vlmul) + os << ", lmul:" << instr.getVlmul(); + if (instr.getVUseMask() & set_vsew) + os << ", sew:" << instr.getVsew(); + if (instr.getVUseMask() & set_vta) + os << ", ta:" << instr.getVta(); + if (instr.getVUseMask() & set_vma) + os << ", ma:" << instr.getVma(); + if (instr.getVUseMask() & set_vediv) + os << ", ediv:" << instr.getVediv(); +} + namespace vortex { std::ostream &operator<<(std::ostream &os, const Instr &instr) { os << op_string(instr); @@ -441,6 +458,13 @@ std::ostream &operator<<(std::ostream &os, const Instr &instr) { if (sep++ != 0) { os << ", "; } else { os << " "; } os << "0x" << std::hex << instr.getImm() << std::dec; } + if (instr.getOpcode() == Opcode::SYS && instr.getFunc3() >= 5) { + // CSRs with immediate values + if (sep++ != 0) { os << ", "; } else { os << " "; } + os << "0x" << std::hex << instr.getRSrc(0); + } + // Log vector-specific vtype and vreg info + if (instr.isVec()) vec_log(os, instr); return os; } } @@ -452,6 +476,7 @@ std::shared_ptr Emulator::decode(uint32_t code) const { auto func2 = (code >> shift_func2) & mask_func2; auto func3 = (code >> shift_func3) & mask_func3; + auto func6 = (code >> shift_func6) & mask_func6; auto func7 = (code >> shift_func7) & mask_func7; auto rd = (code >> shift_rd) & mask_reg; @@ -466,6 +491,12 @@ std::shared_ptr Emulator::decode(uint32_t code) const { } auto iType = op_it->second; + if (op == Opcode::FL || op == Opcode::FS) { + if (func3 != 0x2 && func3 != 0x3) { + iType = InstType::V; + } + } + switch (iType) { case InstType::R: switch (op) { @@ -659,7 +690,104 @@ std::shared_ptr Emulator::decode(uint32_t code) const { auto imm = (bits_10_1 << 1) | (bit_11 << 11) | (bits_19_12 << 12) | (bit_20 << 20); instr->setImm(sext(imm, width_j_imm+1)); } break; + + case InstType::V: + instr->setVec(true); + switch (op) { + case Opcode::VSET: { + instr->setDestReg(rd, RegType::Integer); + instr->setFunc3(func3); + switch (func3) { + case 7: { + if (code >> (shift_vset - 1) == 0b10) { // vsetvl + instr->addSrcReg(rs1, RegType::Integer); + instr->addSrcReg(rs2, RegType::Integer); + } else { + auto zimm = (code >> shift_rs2) & mask_v_zimm; + instr->setZimm(true); + instr->setVlmul(zimm & mask_v_lmul); + instr->setVsew((zimm >> shift_v_sew) & mask_v_sew); + instr->setVta((zimm >> shift_v_ta) & mask_v_ta); + instr->setVma((zimm >> shift_v_ma) & mask_v_ma); + if ((code >> shift_vset)) { // vsetivli + instr->setImm(rs1); + } else { // vsetvli + instr->addSrcReg(rs1, RegType::Integer); + } + } + } break; + case 3: { // Vector - immediate arithmetic instructions + instr->setDestReg(rd, RegType::Vector); + instr->addSrcReg(rs2, RegType::Vector); + instr->setImm(rs1); + instr->setVmask((code >> shift_func7) & 0x1); + instr->setFunc6(func6); + } break; + default: { // Vector - vector/scalar arithmetic instructions + if (func3 == 1 && func6 == 16) { + instr->setDestReg(rd, RegType::Float); + } else if (func3 == 2 && func6 == 16) { + instr->setDestReg(rd, RegType::Integer); + } else { + instr->setDestReg(rd, RegType::Vector); + } + instr->addSrcReg(rs1, RegType::Vector); + instr->addSrcReg(rs2, RegType::Vector); + instr->setVmask((code >> shift_func7) & 0x1); + instr->setFunc6(func6); + } + } + } break; + + case Opcode::FL: + instr->addSrcReg(rs1, RegType::Integer); + instr->setVmop((code >> shift_vmop) & 0b11); + switch (instr->getVmop()) { + case 0b00: + instr->setVumop(rs2); + break; + case 0b10: + instr->addSrcReg(rs2, RegType::Integer); + break; + case 0b01: + case 0b11: + instr->addSrcReg(rs2, RegType::Vector); + break; + } + instr->setVsew(func3 & 0x3); + instr->setDestReg(rd, RegType::Vector); + instr->setVlsWidth(func3); + instr->setVmask((code >> shift_func7) & 0x1); + instr->setVnf((code >> shift_vnf) & mask_func3); + break; + case Opcode::FS: + instr->addSrcReg(rs1, RegType::Integer); + instr->setVmop((code >> shift_vmop) & 0b11); + switch (instr->getVmop()) { + case 0b00: + instr->setVumop(rs2); + break; + case 0b10: + instr->addSrcReg(rs2, RegType::Integer); + break; + case 0b01: + case 0b11: + instr->addSrcReg(rs2, RegType::Vector); + break; + } + instr->setVsew(func3 & 0x3); + instr->addSrcReg(rd, RegType::Vector); + instr->setVlsWidth(func3); + instr->setVmask((code >> shift_func7) & 0x1); + instr->setVmop((code >> shift_vmop) & 0b11); + instr->setVnf((code >> shift_vnf) & mask_func3); + break; + + default: + std::abort(); + } + break; case InstType::R4: instr->setDestReg(rd, RegType::Float); instr->addSrcReg(rs1, RegType::Float); diff --git a/sim/simx/emulator.cpp b/sim/simx/emulator.cpp index 05b3497c4..14cb979d4 100644 --- a/sim/simx/emulator.cpp +++ b/sim/simx/emulator.cpp @@ -33,6 +33,7 @@ using namespace vortex; Emulator::warp_t::warp_t(const Arch& arch) : ireg_file(arch.num_threads(), std::vector(MAX_NUM_REGS)) , freg_file(arch.num_threads(), std::vector(MAX_NUM_REGS)) + , vreg_file(MAX_NUM_REGS, std::vector(arch.vsize())) , uuid(0) {} @@ -64,6 +65,26 @@ void Emulator::warp_t::clear(uint64_t startup_addr) { #endif } } + + for (auto& reg_file : this->vreg_file) { + for (auto& reg : reg_file) { + #ifndef NDEBUG + reg = 0; + #else + reg = std::rand(); + #endif + } + } + + for (auto& reg_file : this->vreg_file) { + for (auto& reg : reg_file) { + #ifndef NDEBUG + reg = 0; + #else + reg = std::rand(); + #endif + } + } } /////////////////////////////////////////////////////////////////////////////// @@ -79,7 +100,12 @@ Emulator::Emulator(const Arch &arch, const DCRS &dcrs, Core* core) // considered to be big enough to hold input tiles for one output tile. // In future versions, scratchpad size should be fixed to an appropriate value. , scratchpad(std::vector(32 * 32 * 32768)) + , csrs_(arch.num_warps()) { + for (uint32_t i = 0; i < arch_.num_warps(); ++i) { + csrs_.at(i).resize(arch.num_threads()); + } + this->clear(); } @@ -463,6 +489,32 @@ Word Emulator::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) { case VX_CSR_FFLAGS: return warps_.at(wid).fcsr & 0x1F; case VX_CSR_FRM: return (warps_.at(wid).fcsr >> 5); case VX_CSR_FCSR: return warps_.at(wid).fcsr; + + // Vector CRSs + case VX_CSR_VSTART: + return csrs_.at(wid).at(tid)[VX_CSR_VSTART]; + case VX_CSR_VXSAT: + return csrs_.at(wid).at(tid)[VX_CSR_VXSAT]; + case VX_CSR_VXRM: + return csrs_.at(wid).at(tid)[VX_CSR_VXRM]; + case VX_CSR_VCSR: { + Word vxsat = csrs_.at(wid).at(tid)[VX_CSR_VXSAT]; + Word vxrm = csrs_.at(wid).at(tid)[VX_CSR_VXRM]; + return (vxrm << 1) | vxsat; + } + case VX_CSR_VL: + return csrs_.at(wid).at(tid)[VX_CSR_VL]; + case VX_CSR_VTYPE: + return csrs_.at(wid).at(tid)[VX_CSR_VTYPE]; + case VX_CSR_VLENB: + return VLEN / 8; + case VX_CSR_VCYCLE: + return csrs_.at(wid).at(tid)[VX_CSR_VCYCLE]; + case VX_CSR_VTIME: + return csrs_.at(wid).at(tid)[VX_CSR_VTIME]; + case VX_CSR_VINSTRET: + return csrs_.at(wid).at(tid)[VX_CSR_VINSTRET]; + case VX_CSR_MHARTID: return (core_->id() * arch_.num_warps() + wid) * arch_.num_threads() + tid; case VX_CSR_THREAD_ID: return tid; case VX_CSR_WARP_ID: return wid; @@ -578,6 +630,29 @@ void Emulator::set_csr(uint32_t addr, Word value, uint32_t tid, uint32_t wid) { case VX_CSR_MSCRATCH: csr_mscratch_ = value; break; + + // Vector CRSs + case VX_CSR_VSTART: + csrs_.at(wid).at(tid)[VX_CSR_VSTART] = value; + break; + case VX_CSR_VXSAT: + csrs_.at(wid).at(tid)[VX_CSR_VXSAT] = value & 0b1; + break; + case VX_CSR_VXRM: + csrs_.at(wid).at(tid)[VX_CSR_VXRM] = value & 0b11; + break; + case VX_CSR_VCSR: + csrs_.at(wid).at(tid)[VX_CSR_VXSAT] = value & 0b1; + csrs_.at(wid).at(tid)[VX_CSR_VXRM] = (value >> 1) & 0b11; + break; + case VX_CSR_VL: // read only, written by vset(i)vl(i) + csrs_.at(wid).at(tid)[VX_CSR_VL] = value; + break; + case VX_CSR_VTYPE: // read only, written by vset(i)vl(i) + csrs_.at(wid).at(tid)[VX_CSR_VTYPE] = value; + break; + case VX_CSR_VLENB: // read only, set to VLEN / 8 + case VX_CSR_SATP: #ifdef VM_ENABLE // warps_.at(wid).fcsr = (warps_.at(wid).fcsr & ~0x1F) | (value & 0x1F); diff --git a/sim/simx/emulator.h b/sim/simx/emulator.h index 5f1b91d5d..ffe630c3d 100644 --- a/sim/simx/emulator.h +++ b/sim/simx/emulator.h @@ -28,6 +28,76 @@ class Core; class Instr; class instr_trace_t; +enum Constants { + width_opcode= 7, + width_reg = 5, + width_func2 = 2, + width_func3 = 3, + width_func6 = 6, + width_func7 = 7, + width_mop = 3, + width_vmask = 1, + width_i_imm = 12, + width_j_imm = 20, + width_v_zimm = 11, + width_v_ma = 1, + width_v_ta = 1, + width_v_sew = 3, + width_v_lmul = 3, + width_aq = 1, + width_rl = 1, + + shift_opcode= 0, + shift_rd = width_opcode, + shift_func3 = shift_rd + width_reg, + shift_rs1 = shift_func3 + width_func3, + shift_rs2 = shift_rs1 + width_reg, + shift_func2 = shift_rs2 + width_reg, + shift_func7 = shift_rs2 + width_reg, + shift_rs3 = shift_func7 + width_func2, + shift_vmop = shift_func7 + width_vmask, + shift_vnf = shift_vmop + width_mop, + shift_func6 = shift_func7 + width_vmask, + shift_vset = shift_func7 + width_func6, + shift_v_sew = width_v_lmul, + shift_v_ta = shift_v_sew + width_v_sew, + shift_v_ma = shift_v_ta + width_v_ta, + + mask_opcode = (1 << width_opcode) - 1, + mask_reg = (1 << width_reg) - 1, + mask_func2 = (1 << width_func2) - 1, + mask_func3 = (1 << width_func3) - 1, + mask_func6 = (1 << width_func6) - 1, + mask_func7 = (1 << width_func7) - 1, + mask_i_imm = (1 << width_i_imm) - 1, + mask_j_imm = (1 << width_j_imm) - 1, + mask_v_zimm = (1 << width_v_zimm) - 1, + mask_v_ma = (1 << width_v_ma) - 1, + mask_v_ta = (1 << width_v_ta) - 1, + mask_v_sew = (1 << width_v_sew) - 1, + mask_v_lmul = (1 << width_v_lmul) - 1, +}; + +struct vtype { + uint32_t vill; + uint32_t vma; + uint32_t vta; + uint32_t vsew; + uint32_t vlmul; +}; + +union reg_data_t { + Word u; + WordI i; + WordF f; + float f32; + double f64; + uint32_t u32; + uint64_t u64; + int32_t i32; + int64_t i64; +}; + class Emulator { public: Emulator(const Arch &arch, @@ -61,6 +131,10 @@ class Emulator { Word get_tc_size(); Word get_tc_num(); + void dcache_read(void* data, uint64_t addr, uint32_t size); + + void dcache_write(const void* data, uint64_t addr, uint32_t size); + private: struct ipdom_entry_t { @@ -85,9 +159,14 @@ class Emulator { ThreadMask tmask; std::vector> ireg_file; std::vector>freg_file; + std::vector> vreg_file; std::stack ipdom_stack; Byte fcsr; uint32_t uuid; + + struct vtype vtype; + uint32_t vl; + Word VLMAX; }; struct wspawn_t { @@ -100,11 +179,13 @@ class Emulator { void execute(const Instr &instr, uint32_t wid, instr_trace_t *trace); - void icache_read(void* data, uint64_t addr, uint32_t size); + void executeVector(const Instr &instr, uint32_t wid, std::vector &rsdata, std::vector &rddata); - void dcache_read(void* data, uint64_t addr, uint32_t size); + void loadVector(const Instr &instr, uint32_t wid, std::vector &rsdata); - void dcache_write(const void* data, uint64_t addr, uint32_t size); + void storeVector(const Instr &instr, uint32_t wid, std::vector &rsdata); + + void icache_read(void* data, uint64_t addr, uint32_t size); void dcache_amo_reserve(uint64_t addr); @@ -142,6 +223,7 @@ class Emulator { uint32_t mat_size; uint32_t tc_size; uint32_t tc_num; + std::vector>> csrs_; }; } diff --git a/sim/simx/execute.cpp b/sim/simx/execute.cpp index dd8253571..d477a1d45 100644 --- a/sim/simx/execute.cpp +++ b/sim/simx/execute.cpp @@ -25,22 +25,11 @@ #include "emulator.h" #include "instr.h" #include "core.h" +#include "processor_impl.h" #include "VX_types.h" using namespace vortex; -union reg_data_t { - Word u; - WordI i; - WordF f; - float f32; - double f64; - uint32_t u32; - uint64_t u64; - int32_t i32; - int64_t i64; -}; - inline uint64_t nan_box(uint32_t value) { return value | 0xffffffff00000000; } @@ -128,6 +117,8 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) { } DPN(2, "}" << std::endl); break; + case RegType::Vector: + break; default: break; } @@ -678,41 +669,47 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) { trace->src_regs[0] = {RegType::Integer, rsrc0}; auto trace_data = std::make_shared(num_threads); trace->data = trace_data; - uint32_t data_bytes = 1 << (func3 & 0x3); - uint32_t data_width = 8 * data_bytes; - for (uint32_t t = thread_start; t < num_threads; ++t) { - if (!warp.tmask.test(t)) - continue; - uint64_t mem_addr = rsdata[t][0].i + immsrc; - uint64_t read_data = 0; - this->dcache_read(&read_data, mem_addr, data_bytes); - trace_data->mem_addrs.at(t) = {mem_addr, data_bytes}; - switch (func3) { - case 0: // RV32I: LB - case 1: // RV32I: LH - rddata[t].i = sext((Word)read_data, data_width); - break; - case 2: - if (opcode == Opcode::L) { - // RV32I: LW + if ((opcode == Opcode::L ) + || (opcode == Opcode::FL && func3 == 2) + || (opcode == Opcode::FL && func3 == 3)) { + uint32_t data_bytes = 1 << (func3 & 0x3); + uint32_t data_width = 8 * data_bytes; + for (uint32_t t = thread_start; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + uint64_t mem_addr = rsdata[t][0].i + immsrc; + uint64_t read_data = 0; + this->dcache_read(&read_data, mem_addr, data_bytes); + trace_data->mem_addrs.at(t) = {mem_addr, data_bytes}; + switch (func3) { + case 0: // RV32I: LB + case 1: // RV32I: LH rddata[t].i = sext((Word)read_data, data_width); - } else { - // RV32F: FLW - rddata[t].u64 = nan_box((uint32_t)read_data); + break; + case 2: + if (opcode == Opcode::L) { + // RV32I: LW + rddata[t].i = sext((Word)read_data, data_width); + } else { + // RV32F: FLW + rddata[t].u64 = nan_box((uint32_t)read_data); + } + break; + case 3: // RV64I: LD + // RV32D: FLD + case 4: // RV32I: LBU + case 5: // RV32I: LHU + case 6: // RV64I: LWU + rddata[t].u64 = read_data; + break; + default: + std::abort(); } - break; - case 3: // RV64I: LD - // RV32D: FLD - case 4: // RV32I: LBU - case 5: // RV32I: LHU - case 6: // RV64I: LWU - rddata[t].u64 = read_data; - break; - default: - std::abort(); } + rd_write = true; + } else { + loadVector(instr, wid, rsdata); } - rd_write = true; break; } case Opcode::S: @@ -724,23 +721,29 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) { trace->src_regs[1] = {data_type, rsrc1}; auto trace_data = std::make_shared(num_threads); trace->data = trace_data; - uint32_t data_bytes = 1 << (func3 & 0x3); - for (uint32_t t = thread_start; t < num_threads; ++t) { - if (!warp.tmask.test(t)) - continue; - uint64_t mem_addr = rsdata[t][0].i + immsrc; - uint64_t write_data = rsdata[t][1].u64; - trace_data->mem_addrs.at(t) = {mem_addr, data_bytes}; - switch (func3) { - case 0: - case 1: - case 2: - case 3: - this->dcache_write(&write_data, mem_addr, data_bytes); - break; - default: - std::abort(); + if ((opcode == Opcode::S) + || (opcode == Opcode::FS && func3 == 2) + || (opcode == Opcode::FS && func3 == 3)) { + uint32_t data_bytes = 1 << (func3 & 0x3); + for (uint32_t t = thread_start; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + uint64_t mem_addr = rsdata[t][0].i + immsrc; + uint64_t write_data = rsdata[t][1].u64; + trace_data->mem_addrs.at(t) = {mem_addr, data_bytes}; + switch (func3) { + case 0: + case 1: + case 2: + case 3: + this->dcache_write(&write_data, mem_addr, data_bytes); + break; + default: + std::abort(); + } } + } else { + storeVector(instr, wid, rsdata); } break; } @@ -925,7 +928,7 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) { for (uint32_t t = thread_start; t < num_threads; ++t) { if (!warp.tmask.test(t)) continue; - uint32_t frm = this->get_fpu_rm(func3, t, wid); + uint32_t frm = (func3 == 0x7) ? this->get_csr(VX_CSR_FRM, t, wid) : func3; uint32_t fflags = 0; switch (func7) { case 0x00: { // RV32F: FADD.S @@ -1240,7 +1243,10 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) { break; } } - this->update_fcrs(fflags, t, wid); + if (fflags) { + this->set_csr(VX_CSR_FCSR, this->get_csr(VX_CSR_FCSR, t, wid) | fflags, t, wid); + this->set_csr(VX_CSR_FFLAGS, this->get_csr(VX_CSR_FFLAGS, t, wid) | fflags, t, wid); + } } rd_write = true; break; @@ -1294,7 +1300,10 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) { default: break; } - this->update_fcrs(fflags, t, wid); + if (fflags) { + this->set_csr(VX_CSR_FCSR, this->get_csr(VX_CSR_FCSR, t, wid) | fflags, t, wid); + this->set_csr(VX_CSR_FFLAGS, this->get_csr(VX_CSR_FFLAGS, t, wid) | fflags, t, wid); + } } rd_write = true; break; @@ -1586,6 +1595,13 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) { std::abort(); } } break; + case Opcode::VSET: { + auto func6 = instr.getFunc6(); + if ((func3 == 0x7) || (func3 == 0x2 && func6 == 16) || (func3 == 0x1 && func6 == 16)) { + rd_write = true; + } + executeVector(instr, wid, rsdata, rddata); + } break; default: std::abort(); } @@ -1629,6 +1645,7 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) { trace->dst_reg = {type, rdest}; break; default: + std::cout << "Unrecognized register write back type: " << type << std::endl; std::abort(); break; } diff --git a/sim/simx/execute_vector.cpp b/sim/simx/execute_vector.cpp new file mode 100644 index 000000000..3b2d585db --- /dev/null +++ b/sim/simx/execute_vector.cpp @@ -0,0 +1,4493 @@ +// This is a fork of https://github.com/troibe/vortex/tree/simx-v2-vector +// The purpose of this fork is to make the simx-v2-vector up to date with master +// Thanks to Troibe for his amazing work + +#include +#include +#include +#include +#include +#include "emulator.h" +#include "instr.h" +#include "processor_impl.h" + +using namespace vortex; + +template +class Add { + public: + static R apply(T first, T second, R) { + return (R)first + (R)second; + } + static std::string name() {return "Add";} +}; + +template +class Sub { + public: + static R apply(T first, T second, R) { + return (R)second - (R)first; + } + static std::string name() {return "Sub";} +}; + +template +class Adc { + public: + static R apply(T first, T second, R third) { + return (R)first + (R)second + third; + } + static std::string name() {return "Adc";} +}; + +template +class Madc { + public: + static R apply(T first, T second, R third) { + return (R)first + (R)second + third > (R)std::numeric_limits::max(); + } + static std::string name() {return "Madc";} +}; + +template +class Sbc { + public: + static R apply(T first, T second, R third) { + return (R)second - (R)first - third; + } + static std::string name() {return "Sbc";} +}; + +template +class Msbc { + public: + static R apply(T first, T second, R third) { + return (R)second < (R)first + third; + } + static std::string name() {return "Msbc";} +}; + +template +class Ssub { + public: + static R apply(T first, T second, uint32_t, uint32_t &vxsat_) { + // rounding mode is not relevant for this operation + T unclippedResult = second - first; + R clippedResult = std::clamp(unclippedResult, (T)std::numeric_limits::min(), (T)std::numeric_limits::max()); + vxsat_ |= clippedResult != unclippedResult; + return clippedResult; + } + static std::string name() {return "Ssub";} +}; + +template +class Ssubu { + public: + static R apply(T first, T second, uint32_t, uint32_t &vxsat_) { + // rounding mode is not relevant for this operation + if (first > second) { + vxsat_ = true; + return 0; + } else { + vxsat_ = false; + return second - first; + } + } + static std::string name() {return "Ssubu";} +}; + +template +class Sadd { + public: + static R apply(T first, T second, uint32_t, uint32_t &vxsat_) { + // rounding mode is not relevant for this operation + T unclippedResult = second + first; + R clippedResult = std::clamp(unclippedResult, (T)std::numeric_limits::min(), (T)std::numeric_limits::max()); + vxsat_ |= clippedResult != unclippedResult; + return clippedResult; + } + static std::string name() {return "Sadd";} +}; + +template +class Rsub { + public: + static R apply(T first, T second, R) { + return first - second; + } + static std::string name() {return "Rsub";} +}; + +template +class Div { + public: + static R apply(T first, T second, R) { + // logic taken from scalar div + if (first == 0) { + return -1; + } else if (second == std::numeric_limits::min() && first == T(-1)) { + return second; + } else { + return (R)second / (R)first; + } + } + static std::string name() {return "Div";} +}; + +template +class Rem { + public: + static R apply(T first, T second, R) { + // logic taken from scalar rem + if (first == 0) { + return second; + } else if (second == std::numeric_limits::min() && first == T(-1)) { + return 0; + } else { + return (R)second % (R)first; + } + } + static std::string name() {return "Rem";} +}; + +template +class Mul { + public: + static R apply(T first, T second, R) { + return (R)first * (R)second; + } + static std::string name() {return "Mul";} +}; + +template +class Mulsu { + public: + static R apply(T first, T second, R) { + R first_ext = zext((R)first, (sizeof(T) * 8)); + return first_ext * (R)second; + } + static std::string name() {return "Mulsu";} +}; + +template +class Mulh { + public: + static R apply(T first, T second, R) { + __int128_t first_ext = sext((__int128_t)first, (sizeof(T) * 8)); + __int128_t second_ext = sext((__int128_t)second, (sizeof(T) * 8)); + return (first_ext * second_ext) >> (sizeof(T) * 8); + } + static std::string name() {return "Mulh";} +}; + +template +class Mulhsu { + public: + static R apply(T first, T second, R) { + __int128_t first_ext = zext((__int128_t)first, (sizeof(T) * 8)); + __int128_t second_ext = sext((__int128_t)second, (sizeof(T) * 8)); + return (first_ext * second_ext) >> (sizeof(T) * 8); + } + static std::string name() {return "Mulhsu";} +}; + +template +class Mulhu { + public: + static R apply(T first, T second, R) { + return ((__uint128_t)first * (__uint128_t)second) >> (sizeof(T) * 8); + } + static std::string name() {return "Mulhu";} +}; + +template +class Madd { + public: + static R apply(T first, T second, R third) { + return ((R)first * third) + (R)second; + } + static std::string name() {return "Madd";} +}; + +template +class Nmsac { + public: + static R apply(T first, T second, R third) { + return -((R)first * (R)second) + third; + } + static std::string name() {return "Nmsac";} +}; + +template +class Macc { + public: + static R apply(T first, T second, R third) { + return ((R)first * (R)second) + third; + } + static std::string name() {return "Macc";} +}; + +template +class Maccsu { + public: + static R apply(T first, T second, R third) { + R first_ext = sext((R)first, (sizeof(T) * 8)); + R second_ext = zext((R)second, (sizeof(T) * 8)); + return (first_ext * second_ext) + third; + } + static std::string name() {return "Maccsu";} +}; + +template +class Maccus { + public: + static R apply(T first, T second, R third) { + R first_ext = zext((R)first, (sizeof(T) * 8)); + R second_ext = sext((R)second, (sizeof(T) * 8)); + return (first_ext * second_ext) + third; + } + static std::string name() {return "Maccus";} +}; + +template +class Nmsub { + public: + static R apply(T first, T second, R third) { + return -((R)first * third) + (R)second; + } + static std::string name() {return "Nmsub";} +}; + +template +class Min { + public: + static R apply(T first, T second, R) { + return std::min(first, second); + } + static std::string name() {return "Min";} +}; + +template +class Max { + public: + static R apply(T first, T second, R) { + return std::max(first, second); + } + static std::string name() {return "Max";} +}; + +template +class And { + public: + static R apply(T first, T second, R) { + return first & second; + } + static std::string name() {return "And";} +}; + +template +class Or { + public: + static R apply(T first, T second, R) { + return first | second; + } + static std::string name() {return "Or";} +}; + +template +class Xor { + public: + static R apply(T first, T second, R) { + return first ^ second; + } + static std::string name() {return "Xor";} +}; + +template +class Sll { + public: + static R apply(T first, T second, R) { + // Only the low lg2(SEW) bits of the shift-amount value are used to control the shift amount. + return second << (first & (sizeof(T) * 8 - 1)); + } + static std::string name() {return "Sll";} +}; + +template +bool bitAt(T value, R pos, R negOffset) { + R offsetPos = pos - negOffset; + return pos >= negOffset && ((value >> offsetPos) & 0x1); +} + +template +bool anyBitUpTo(T value, R to, R negOffset) { + R offsetTo = to - negOffset; + return to >= negOffset && (value & (((R)1 << (offsetTo + 1)) - 1)); +} + +template +bool roundBit(T value, R shiftDown, uint32_t vxrm) { + switch (vxrm){ + case 0: // round-to-nearest-up + return bitAt(value, shiftDown, (R)1); + case 1: // round-to-nearest-even + return bitAt(value, shiftDown, (R)1) && (anyBitUpTo(value, shiftDown, (R)2) || bitAt(value, shiftDown, (R)0)); + case 2: // round-down (truncate) + return 0; + case 3: // round-to-odd + return !bitAt(value, shiftDown, (R)0) && anyBitUpTo(value, shiftDown, (R)1); + default: + std::cout << "Roundoff - invalid value for vxrm: " << vxrm << std::endl; + std::abort(); + } +} + +template +class SrlSra { + public: + static R apply(T first, T second, R) { + // Only the low lg2(SEW) bits of the shift-amount value are used to control the shift amount. + return second >> (first & (sizeof(T) * 8 - 1)); + } + static R apply(T first, T second, uint32_t vxrm, uint32_t) { + // Saturation is not relevant for this operation + // Only the low lg2(SEW) bits of the shift-amount value are used to control the shift amount. + T firstValid = first & (sizeof(T) * 8 - 1); + return apply(firstValid, second, 0) + roundBit(second, firstValid, vxrm); + } + static std::string name() {return "SrlSra";} +}; + +template +class Aadd { + public: + static R apply(T first, T second, uint32_t vxrm, uint32_t) { + // Saturation is not relevant for this operation + T sum = second + first; + return (sum >> 1) + roundBit(sum, 1, vxrm); + } + static std::string name() {return "Aadd";} +}; + +template +class Asub { + public: + static R apply(T first, T second, uint32_t vxrm, uint32_t) { + // Saturation is not relevant for this operation + T difference = second - first; + return (difference >> 1) + roundBit(difference, 1, vxrm); + } + static std::string name() {return "Asub";} +}; + +template +class Eq { + public: + static R apply(T first, T second, R) { + return first == second; + } + static std::string name() {return "Eq";} +}; + +template +class Ne { + public: + static R apply(T first, T second, R) { + return first != second; + } + static std::string name() {return "Ne";} +}; + +template +class Lt { + public: + static R apply(T first, T second, R) { + return first > second; + } + static std::string name() {return "Lt";} +}; + +template +class Le { + public: + static R apply(T first, T second, R) { + return first >= second; + } + static std::string name() {return "Le";} +}; + +template +class Gt { + public: + static R apply(T first, T second, R) { + return first < second; + } + static std::string name() {return "Gt";} +}; + +template +class AndNot { + public: + static R apply(T first, T second, R) { + return second & ~first; + } + static std::string name() {return "AndNot";} +}; + +template +class OrNot { + public: + static R apply(T first, T second, R) { + return second | ~first; + } + static std::string name() {return "OrNot";} +}; + +template +class Nand { + public: + static R apply(T first, T second, R) { + return ~(second & first); + } + static std::string name() {return "Nand";} +}; + +template +class Mv { + public: + static R apply(T first, T, R) { + return first; + } + static std::string name() {return "Mv";} +}; + +template +class Nor { + public: + static R apply(T first, T second, R) { + return ~(second | first); + } + static std::string name() {return "Nor";} +}; + +template +class Xnor { + public: + static R apply(T first, T second, R) { + return ~(second ^ first); + } + static std::string name() {return "Xnor";} +}; + +template +class Fadd { + public: + static R apply(T first, T second, R) { + // ignoring flags for now + uint32_t fflags = 0; + // ignoring rounding mode for now + uint32_t frm = 0; + if (sizeof(R) == 4) { + return rv_fadd_s(first, second, frm, &fflags); + } else if (sizeof(R) == 8) { + uint64_t first_d = sizeof(T) == 8 ? first : rv_ftod(first); + uint64_t second_d = sizeof(T) == 8 ? second : rv_ftod(second); + return rv_fadd_d(first_d, second_d, frm, &fflags); + } else { + std::cout << "Fadd only supports f32 and f64" << std::endl; + std::abort(); + } + } + static std::string name() {return "Fadd";} +}; + +template +class Fsub { + public: + static R apply(T first, T second, R) { + // ignoring flags for now + uint32_t fflags = 0; + // ignoring rounding mode for now + uint32_t frm = 0; + if (sizeof(R) == 4) { + return rv_fsub_s(second, first, frm, &fflags); + } else if (sizeof(R) == 8) { + uint64_t first_d = sizeof(T) == 8 ? first : rv_ftod(first); + uint64_t second_d = sizeof(T) == 8 ? second : rv_ftod(second); + return rv_fsub_d(second_d, first_d, frm, &fflags); + } else { + std::cout << "Fsub only supports f32 and f64" << std::endl; + std::abort(); + } + } + static std::string name() {return "Fsub";} +}; + +template +class Fmacc { + public: + static R apply(T first, T second, R third) { + // ignoring flags for now + uint32_t fflags = 0; + // ignoring rounding mode for now + uint32_t frm = 0; + if (sizeof(R) == 4) { + return rv_fmadd_s(first, second, third, frm, &fflags); + } else if (sizeof(R) == 8) { + uint64_t first_d = sizeof(T) == 8 ? first : rv_ftod(first); + uint64_t second_d = sizeof(T) == 8 ? second : rv_ftod(second); + return rv_fmadd_d(first_d, second_d, third, frm, &fflags); + } else { + std::cout << "Fmacc only supports f32 and f64" << std::endl; + std::abort(); + } + } + static std::string name() {return "Fmacc";} +}; + +template +class Fnmacc { + public: + static R apply(T first, T second, R third) { + // ignoring flags for now + uint32_t fflags = 0; + // ignoring rounding mode for now + uint32_t frm = 0; + if (sizeof(R) == 4) { + return rv_fnmadd_s(first, second, third, frm, &fflags); + } else if (sizeof(R) == 8) { + uint64_t first_d = sizeof(T) == 8 ? first : rv_ftod(first); + uint64_t second_d = sizeof(T) == 8 ? second : rv_ftod(second); + return rv_fnmadd_d(first_d, second_d, third, frm, &fflags); + } else { + std::cout << "Fnmacc only supports f32 and f64" << std::endl; + std::abort(); + } + } + static std::string name() {return "Fnmacc";} +}; + +template +class Fmsac { + public: + static R apply(T first, T second, R third) { + // ignoring flags for now + uint32_t fflags = 0; + // ignoring rounding mode for now + uint32_t frm = 0; + if (sizeof(R) == 4) { + return rv_fmadd_s(first, second, rv_fsgnjn_s(third, third), frm, &fflags); + } else if (sizeof(R) == 8) { + uint64_t first_d = sizeof(T) == 8 ? first : rv_ftod(first); + uint64_t second_d = sizeof(T) == 8 ? second : rv_ftod(second); + return rv_fmadd_d(first_d, second_d, rv_fsgnjn_d(third, third), frm, &fflags); + } else { + std::cout << "Fmsac only supports f32 and f64" << std::endl; + std::abort(); + } + } + static std::string name() {return "Fmsac";} +}; + +template +class Fnmsac { + public: + static R apply(T first, T second, R third) { + // ignoring flags for now + uint32_t fflags = 0; + // ignoring rounding mode for now + uint32_t frm = 0; + if (sizeof(R) == 4) { + return rv_fnmadd_s(first, second, rv_fsgnjn_s(third, third), frm, &fflags); + } else if (sizeof(R) == 8) { + uint64_t first_d = sizeof(T) == 8 ? first : rv_ftod(first); + uint64_t second_d = sizeof(T) == 8 ? second : rv_ftod(second); + return rv_fnmadd_d(first_d, second_d, rv_fsgnjn_d(third, third), frm, &fflags); + } else { + std::cout << "Fnmsac only supports f32 and f64" << std::endl; + std::abort(); + } + } + static std::string name() {return "Fnmsac";} +}; + +template +class Fmadd { + public: + static R apply(T first, T second, R third) { + if (sizeof(T) == 4 || sizeof(T) == 8) { + return Fmacc::apply(first, third, second); + } else { + std::cout << "Fmadd only supports f32 and f64" << std::endl; + std::abort(); + } + } + static std::string name() {return "Fmadd";} +}; + +template +class Fnmadd { + public: + static R apply(T first, T second, R third) { + if (sizeof(T) == 4 || sizeof(T) == 8) { + return Fnmacc::apply(first, third, second); + } else { + std::cout << "Fnmadd only supports f32 and f64" << std::endl; + std::abort(); + } + } + static std::string name() {return "Fnmadd";} +}; + +template +class Fmsub { + public: + static R apply(T first, T second, R third) { + if (sizeof(T) == 4 || sizeof(T) == 8) { + return Fmsac::apply(first, third, second); + } else { + std::cout << "Fmsub only supports f32 and f64" << std::endl; + std::abort(); + } + } + static std::string name() {return "Fmsub";} +}; + +template +class Fnmsub { + public: + static R apply(T first, T second, R third) { + if (sizeof(T) == 4 || sizeof(T) == 8) { + return Fnmsac::apply(first, third, second); + } else { + std::cout << "Fnmsub only supports f32 and f64" << std::endl; + std::abort(); + } + } + static std::string name() {return "Fnmsub";} +}; + +template +class Fmin { + public: + static R apply(T first, T second, R) { + // ignoring rounding modes for now + uint32_t fflags = 0; + if (sizeof(T) == 4) { + return rv_fmin_s(first, second, &fflags); + } else if (sizeof(T) == 8) { + return rv_fmin_d(first, second, &fflags); + } else { + std::cout << "Fmin only supports f32 and f64" << std::endl; + std::abort(); + } + } + static std::string name() {return "Fmin";} +}; + +template +class Fmax { + public: + static R apply(T first, T second, R) { + // ignoring rounding modes for now + uint32_t fflags = 0; + if (sizeof(T) == 4) { + return rv_fmax_s(first, second, &fflags); + } else if (sizeof(T) == 8) { + return rv_fmax_d(first, second, &fflags); + } else { + std::cout << "Fmax only supports f32 and f64" << std::endl; + std::abort(); + } + } + static std::string name() {return "Fmax";} +}; + +template +class Fsgnj { + public: + static R apply(T first, T second, R) { + if (sizeof(T) == 4) { + return rv_fsgnj_s(second, first); + } else if (sizeof(T) == 8) { + return rv_fsgnj_d(second, first); + } else { + std::cout << "Fsgnj only supports f32 and f64" << std::endl; + std::abort(); + } + } + static std::string name() {return "Fsgnj";} +}; + +template +class Fsgnjn { + public: + static R apply(T first, T second, R) { + if (sizeof(T) == 4) { + return rv_fsgnjn_s(second, first); + } else if (sizeof(T) == 8) { + return rv_fsgnjn_d(second, first); + } else { + std::cout << "Fsgnjn only supports f32 and f64" << std::endl; + std::abort(); + } + } + static std::string name() {return "Fsgnjn";} +}; + +template +class Fsgnjx { + public: + static R apply(T first, T second, R) { + if (sizeof(T) == 4) { + return rv_fsgnjx_s(second, first); + } else if (sizeof(T) == 8) { + return rv_fsgnjx_d(second, first); + } else { + std::cout << "Fsgnjx only supports f32 and f64" << std::endl; + std::abort(); + } + } + static std::string name() {return "Fsgnjx";} +}; + +template +class Fcvt { + public: + static R apply(T first, T second, R) { + // ignoring flags for now + uint32_t fflags = 0; + // ignoring rounding mode for now + uint32_t frm = 0; + if (sizeof(T) == 4) { + switch (first) { + case 0b00000: // vfcvt.xu.f.v + return rv_ftou_s(second, frm, &fflags); + case 0b00001: // vfcvt.x.f.v + return rv_ftoi_s(second, frm, &fflags); + case 0b00010: // vfcvt.f.xu.v + return rv_utof_s(second, frm, &fflags); + case 0b00011: // vfcvt.f.x.v + return rv_itof_s(second, frm, &fflags); + case 0b00110: // vfcvt.rtz.xu.f.v + return rv_ftou_s(second, 1, &fflags); + case 0b00111: // vfcvt.rtz.x.f.v + return rv_ftoi_s(second, 1, &fflags); + case 0b01000: // vfwcvt.xu.f.v + return rv_ftolu_s(second, frm, &fflags); + case 0b01001: // vfwcvt.x.f.v + return rv_ftol_s(second, frm, &fflags); + case 0b01010: // vfwcvt.f.xu.v + return rv_utof_d(second, frm, &fflags); + case 0b01011: // vfwcvt.f.x.v + return rv_itof_d(second, frm, &fflags); + case 0b01100: // vfwcvt.f.f.v + return rv_ftod(second); + case 0b01110: // vfwcvt.rtz.xu.f.v + return rv_ftolu_s(second, 1, &fflags); + case 0b01111: // vfwcvt.rtz.x.f.v + return rv_ftol_s(second, 1, &fflags); + default: + std::cout << "Fcvt has unsupported value for first: " << first << std::endl; + std::abort(); + } + } else if (sizeof(T) == 8) { + switch (first) { + case 0b00000: // vfcvt.xu.f.v + return rv_ftolu_d(second, frm, &fflags); + case 0b00001: // vfcvt.x.f.v + return rv_ftol_d(second, frm, &fflags); + case 0b00010: // vfcvt.f.xu.v + return rv_lutof_d(second, frm, &fflags); + case 0b00011: // vfcvt.f.x.v + return rv_ltof_d(second, frm, &fflags); + case 0b00110: // vfcvt.rtz.xu.f.v + return rv_ftolu_d(second, 1, &fflags); + case 0b00111: // vfcvt.rtz.x.f.v + return rv_ftol_d(second, 1, &fflags); + case 0b01000: // vfwcvt.xu.f.v + case 0b01001: // vfwcvt.x.f.v + case 0b01010: // vfwcvt.f.xu.v + case 0b01011: // vfwcvt.f.x.v + case 0b01100: // vfwcvt.f.f.v + case 0b01110: // vfwcvt.rtz.xu.f.v + case 0b01111: // vfwcvt.rtz.x.f.v + std::cout << "Fwcvt only supports f32" << std::endl; + std::abort(); + default: + std::cout << "Fcvt has unsupported value for first: " << first << std::endl; + std::abort(); + } + } else { + std::cout << "Fcvt only supports f32 and f64" << std::endl; + std::abort(); + } + } + static R apply(T first, T second, uint32_t vxrm, uint32_t &) { // saturation argument is unused + // ignoring flags for now + uint32_t fflags = 0; + if (sizeof(T) == 8) { + switch (first) { + case 0b10000: // vfncvt.xu.f.w + return rv_ftou_d(second, vxrm, &fflags); + case 0b10001: // vfncvt.x.f.w + return rv_ftoi_d(second, vxrm, &fflags); + case 0b10010: // vfncvt.f.xu.w + return rv_lutof_s(second, vxrm, &fflags); + case 0b10011: // vfncvt.f.x.w + return rv_ltof_s(second, vxrm, &fflags); + case 0b10100: // vfncvt.f.f.w + return rv_dtof_r(second, vxrm); + case 0b10101: // vfncvt.rod.f.f.w + return rv_dtof_r(second, 6); + case 0b10110: // vfncvt.rtz.xu.f.w + return rv_ftou_d(second, 1, &fflags); + case 0b10111: // vfncvt.rtz.x.f.w + return rv_ftoi_d(second, 1, &fflags); + default: + std::cout << "Fncvt has unsupported value for first: " << first << std::endl; + std::abort(); + } + } else { + std::cout << "Fncvt only supports f64" << std::endl; + std::abort(); + } + } + static std::string name() {return "Fcvt";} +}; + +template +class Funary1 { + public: + static R apply(T first, T second, R) { + // ignoring flags for now + uint32_t fflags = 0; + // ignoring rounding mode for now + uint32_t frm = 0; + if (sizeof(T) == 4) { + switch (first) { + case 0b00000: // vfsqrt.v + return rv_fsqrt_s(second, frm, &fflags); + case 0b00100: // vfrsqrt7.v + return rv_frsqrt7_s(second, frm, &fflags); + case 0b00101: // vfrec7.v + return rv_frecip7_s(second, frm, &fflags); + case 0b10000: // vfclass.v + return rv_fclss_s(second); + default: + std::cout << "Funary1 has unsupported value for first: " << first << std::endl; + std::abort(); + } + } else if (sizeof(T) == 8) { + switch (first) { + case 0b00000: // vfsqrt.v + return rv_fsqrt_d(second, frm, &fflags); + case 0b00100: // vfrsqrt7.v + return rv_frsqrt7_d(second, frm, &fflags); + case 0b00101: // vfrec7.v + return rv_frecip7_d(second, frm, &fflags); + case 0b10000: // vfclass.v + return rv_fclss_d(second); + default: + std::cout << "Funary1 has unsupported value for first: " << first << std::endl; + std::abort(); + } + } else { + std::cout << "Funary1 only supports f32 and f64" << std::endl; + std::abort(); + } + } + static std::string name() {return "Funary1";} +}; + +template +class Xunary0 { + public: + static R apply(T, T second, T) { + return second; + } + static std::string name() {return "Xunary0";} +}; + +template +class Feq { + public: + static R apply(T first, T second, R) { + // ignoring flags for now + uint32_t fflags = 0; + if (sizeof(T) == 4) { + return rv_feq_s(second, first, &fflags); + } else if (sizeof(T) == 8) { + return rv_feq_d(second, first, &fflags); + } else { + std::cout << "Feq only supports f32 and f64" << std::endl; + std::abort(); + } + } + static std::string name() {return "Feq";} +}; + +template +class Fle { + public: + static R apply(T first, T second, R) { + // ignoring flags for now + uint32_t fflags = 0; + if (sizeof(T) == 4) { + return rv_fle_s(second, first, &fflags); + } else if (sizeof(T) == 8) { + return rv_fle_d(second, first, &fflags); + } else { + std::cout << "Fle only supports f32 and f64" << std::endl; + std::abort(); + } + } + static std::string name() {return "Fle";} +}; + +template +class Flt { + public: + static R apply(T first, T second, R) { + // ignoring flags for now + uint32_t fflags = 0; + if (sizeof(T) == 4) { + return rv_flt_s(second, first, &fflags); + } else if (sizeof(T) == 8) { + return rv_flt_d(second, first, &fflags); + } else { + std::cout << "Flt only supports f32 and f64" << std::endl; + std::abort(); + } + } + static std::string name() {return "Flt";} +}; + +template +class Fne { + public: + static R apply(T first, T second, R) { + // ignoring flags for now + uint32_t fflags = 0; + if (sizeof(T) == 4) { + return !rv_feq_s(second, first, &fflags); + } else if (sizeof(T) == 8) { + return !rv_feq_d(second, first, &fflags); + } else { + std::cout << "Fne only supports f32 and f64" << std::endl; + std::abort(); + } + } + static std::string name() {return "Fne";} +}; + +template +class Fgt { + public: + static R apply(T first, T second, R) { + // ignoring flags for now + uint32_t fflags = 0; + if (sizeof(T) == 4) { + return rv_flt_s(first, second, &fflags); + } else if (sizeof(T) == 8) { + return rv_flt_d(first, second, &fflags); + } else { + std::cout << "Fgt only supports f32 and f64" << std::endl; + std::abort(); + } + } + static std::string name() {return "Fgt";} +}; + +template +class Fge { + public: + static R apply(T first, T second, R) { + // ignoring flags for now + uint32_t fflags = 0; + if (sizeof(T) == 4) { + return rv_fle_s(first, second, &fflags); + } else if (sizeof(T) == 8) { + return rv_fle_d(first, second, &fflags); + } else { + std::cout << "Fge only supports f32 and f64" << std::endl; + std::abort(); + } + } + static std::string name() {return "Fge";} +}; + +template +class Fdiv { + public: + static R apply(T first, T second, R) { + // ignoring flags for now + uint32_t fflags = 0; + // ignoring rounding mode for now + uint32_t frm = 0; + if (sizeof(T) == 4) { + return rv_fdiv_s(second, first, frm, &fflags); + } else if (sizeof(T) == 8) { + return rv_fdiv_d(second, first, frm, &fflags); + } else { + std::cout << "Fdiv only supports f32 and f64" << std::endl; + std::abort(); + } + } + static std::string name() {return "Fdiv";} +}; + +template +class Frdiv { + public: + static R apply(T first, T second, R) { + // ignoring flags for now + uint32_t fflags = 0; + // ignoring rounding mode for now + uint32_t frm = 0; + if (sizeof(T) == 4) { + return rv_fdiv_s(first, second, frm, &fflags); + } else if (sizeof(T) == 8) { + return rv_fdiv_d(first, second, frm, &fflags); + } else { + std::cout << "Frdiv only supports f32 and f64" << std::endl; + std::abort(); + } + } + static std::string name() {return "Frdiv";} +}; + +template +class Fmul { + public: + static R apply(T first, T second, R) { + // ignoring flags for now + uint32_t fflags = 0; + // ignoring rounding mode for now + uint32_t frm = 0; + if (sizeof(R) == 4) { + return rv_fmul_s(first, second, frm, &fflags); + } else if (sizeof(R) == 8) { + uint64_t first_d = sizeof(T) == 8 ? first : rv_ftod(first); + uint64_t second_d = sizeof(T) == 8 ? second : rv_ftod(second); + return rv_fmul_d(first_d, second_d, frm, &fflags); + } else { + std::cout << "Fmul only supports f32 and f64" << std::endl; + std::abort(); + } + } + static std::string name() {return "Fmul";} +}; + +template +class Frsub { + public: + static R apply(T first, T second, R) { + // ignoring flags for now + uint32_t fflags = 0; + // ignoring rounding mode for now + uint32_t frm = 0; + if (sizeof(T) == 4) { + return rv_fsub_s(first, second, frm, &fflags); + } else if (sizeof(T) == 8) { + return rv_fsub_d(first, second, frm, &fflags); + } else { + std::cout << "Frsub only supports f32 and f64" << std::endl; + std::abort(); + } + } + static std::string name() {return "Frsub";} +}; + +template +class Clip { + public: + static R apply(T first, T second, uint32_t vxrm, uint32_t &vxsat_) { + // The low lg2(2*SEW) bits of the vector or scalar shift-amount value (e.g., the low 6 bits for a SEW=64-bit to + // SEW=32-bit narrowing operation) are used to control the right shift amount, which provides the scaling. + R firstValid = first & (sizeof(T) * 8 - 1); + T unclippedResult = (second >> firstValid) + roundBit(second, firstValid, vxrm); + R clippedResult = std::clamp(unclippedResult, (T)std::numeric_limits::min(), (T)std::numeric_limits::max()); + vxsat_ |= clippedResult != unclippedResult; + return clippedResult; + } + static std::string name() {return "Clip";} +}; + +template +class Smul { + public: + static R apply(T first, T second, uint32_t vxrm, uint32_t &vxsat_) { + R shift = sizeof(R) * 8 - 1; + T unshiftedResult = first * second; + T unclippedResult = (unshiftedResult >> shift) + roundBit(unshiftedResult, shift, vxrm); + R clippedResult = std::clamp(unclippedResult, (T)std::numeric_limits::min(), (T)std::numeric_limits::max()); + vxsat_ |= clippedResult != unclippedResult; + return clippedResult; + } + static std::string name() {return "Smul";} +}; + +bool isMasked(std::vector> &vreg_file, uint32_t maskVreg, uint32_t byteI, bool vmask) { + auto& mask = vreg_file.at(maskVreg); + uint8_t emask = *(uint8_t *)(mask.data() + byteI / 8); + uint8_t value = (emask >> (byteI % 8)) & 0x1; + DP(1, "Masking enabled: " << +!vmask << " mask element: " << +value); + return !vmask && value == 0; +} + +template +uint32_t getVreg(uint32_t baseVreg, uint32_t byteI) { + uint32_t vsew = sizeof(DT) * 8; + return (baseVreg + (byteI / (VLEN / vsew))) % 32; +} + +template +DT &getVregData(std::vector &baseVregVec, uint32_t byteI) { + uint32_t vsew = sizeof(DT) * 8; + return *(DT *)(baseVregVec.data() + (byteI % (VLEN / vsew)) * vsew / 8); +} + +template +DT &getVregData(std::vector> &vreg_file, uint32_t baseVreg, uint32_t byteI) { + auto& vr1 = vreg_file.at(getVreg
(baseVreg, byteI)); + return getVregData
(vr1, byteI); +} + +template +void vector_op_vix_load(std::vector> &vreg_file, vortex::Emulator *emul_, std::vector &rsdata, uint32_t rdest, uint32_t vl, bool strided, WordI stride, uint32_t nfields, uint32_t lmul, uint32_t vmask) { + uint32_t vsew = sizeof(DT) * 8; + uint32_t emul = lmul >> 2 ? 1 : 1 << (lmul & 0b11); + if (nfields * emul > 8) { + std::cout << "NFIELDS * EMUL = " << nfields * lmul << " but it should be <= 8" << std::endl; + std::abort(); + } + for (uint32_t i = 0; i < vl * nfields; i++) { + if (isMasked(vreg_file, 0, i / nfields, vmask)) continue; + + uint32_t nfields_strided = strided ? nfields : 1; + Word mem_addr = ((rsdata[0][0].i) & 0xFFFFFFFC) + (i / nfields_strided) * stride + (i % nfields_strided) * sizeof(DT); + Word mem_data = 0; + emul_->dcache_read(&mem_data, mem_addr, vsew / 8); + DP(1, "Loading data " << mem_data << " from: " << mem_addr << " to vec reg: " << getVreg
(rdest + (i % nfields) * emul, i / nfields) << " i: " << i / nfields); + DT &result = getVregData
(vreg_file, rdest + (i % nfields) * emul, i / nfields); + DP(1, "Previous data: " << +result); + result = (DT) mem_data; + } +} + +void vector_op_vix_load(std::vector> &vreg_file, vortex::Emulator *emul_, std::vector &rsdata, uint32_t rdest, uint32_t vsew, uint32_t vl, bool strided, WordI stride, uint32_t nfields, uint32_t lmul, uint32_t vmask) { + switch (vsew) { + case 8: + vector_op_vix_load(vreg_file, emul_, rsdata, rdest, vl, strided, stride, nfields, lmul, vmask); + break; + case 16: + vector_op_vix_load(vreg_file, emul_, rsdata, rdest, vl, strided, stride, nfields, lmul, vmask); + break; + case 32: + vector_op_vix_load(vreg_file, emul_, rsdata, rdest, vl, strided, stride, nfields, lmul, vmask); + break; + case 64: + vector_op_vix_load(vreg_file, emul_, rsdata, rdest, vl, strided, stride, nfields, lmul, vmask); + break; + default: + std::cout << "Failed to execute VLE for vsew: " << vsew << std::endl; + std::abort(); + } +} + +template +void vector_op_vv_load(std::vector> &vreg_file, vortex::Emulator *emul_, std::vector &rsdata, uint32_t rsrc1, uint32_t rdest, uint32_t iSew, uint32_t vl, uint32_t nfields, uint32_t lmul, uint32_t vmask) { + uint32_t vsew = sizeof(DT) * 8; + uint32_t emul = lmul >> 2 ? 1 : 1 << (lmul & 0b11); + if (nfields * emul > 8) { + std::cout << "NFIELDS * EMUL = " << nfields * lmul << " but it should be <= 8" << std::endl; + std::abort(); + } + for (uint32_t i = 0; i < vl * nfields; i++) { + if (isMasked(vreg_file, 0, i / nfields, vmask)) continue; + + Word offset = 0; + switch (iSew) { + case 8: + offset = getVregData(vreg_file, rsrc1, i / nfields); + break; + case 16: + offset = getVregData(vreg_file, rsrc1, i / nfields); + break; + case 32: + offset = getVregData(vreg_file, rsrc1, i / nfields); + break; + case 64: + offset = getVregData(vreg_file, rsrc1, i / nfields); + break; + default: + std::cout << "Unsupported iSew: " << iSew << std::endl; + std::abort(); + } + + Word mem_addr = ((rsdata[0][0].i) & 0xFFFFFFFC) + offset + (i % nfields) * sizeof(DT); + Word mem_data = 0; + emul_->dcache_read(&mem_data, mem_addr, vsew / 8); + DP(1, "VLUX/VLOX - Loading data " << mem_data << " from: " << mem_addr << " with offset: " << std::dec << offset << " to vec reg: " << getVreg
(rdest + (i % nfields) * emul, i / nfields) << " i: " << i / nfields); + DT &result = getVregData
(vreg_file, rdest + (i % nfields) * emul, i / nfields); + DP(1, "Previous data: " << +result); + result = (DT) mem_data; + } +} + +void vector_op_vv_load(std::vector> &vreg_file, vortex::Emulator *emul_, std::vector &rsdata, uint32_t rsrc1, uint32_t rdest, uint32_t vsew, uint32_t iSew, uint32_t vl, uint32_t nfields, uint32_t lmul, uint32_t vmask) { + switch (vsew) { + case 8: + vector_op_vv_load(vreg_file, emul_, rsdata, rsrc1, rdest, iSew, vl, nfields, lmul, vmask); + break; + case 16: + vector_op_vv_load(vreg_file, emul_, rsdata, rsrc1, rdest, iSew, vl, nfields, lmul, vmask); + break; + case 32: + vector_op_vv_load(vreg_file, emul_, rsdata, rsrc1, rdest, iSew, vl, nfields, lmul, vmask); + break; + case 64: + vector_op_vv_load(vreg_file, emul_, rsdata, rsrc1, rdest, iSew, vl, nfields, lmul, vmask); + break; + default: + std::cout << "Failed to execute VLUX/VLOX for vsew: " << vsew << std::endl; + std::abort(); + } +} + +void Emulator::loadVector(const Instr &instr, uint32_t wid, std::vector &rsdata) { + auto &warp = warps_.at(wid); + auto vmask = instr.getVmask(); + auto rdest = instr.getRDest(); + auto mop = instr.getVmop(); + switch (mop) { + case 0b00: { // unit-stride + auto lumop = instr.getVumop(); + switch (lumop) { + case 0b10000: // vle8ff.v, vle16ff.v, vle32ff.v, vle64ff.v - we do not support exceptions -> treat like regular unit stride + // vlseg2e8ff.v, vlseg2e16ff.v, vlseg2e32ff.v, vlseg2e64ff.v + // vlseg3e8ff.v, vlseg3e16ff.v, vlseg3e32ff.v, vlseg3e64ff.v + // vlseg4e8ff.v, vlseg4e16ff.v, vlseg4e32ff.v, vlseg4e64ff.v + // vlseg5e8ff.v, vlseg5e16ff.v, vlseg5e32ff.v, vlseg5e64ff.v + // vlseg6e8ff.v, vlseg6e16ff.v, vlseg6e32ff.v, vlseg6e64ff.v + // vlseg7e8ff.v, vlseg7e16ff.v, vlseg7e32ff.v, vlseg7e64ff.v + // vlseg8e8ff.v, vlseg8e16ff.v, vlseg8e32ff.v, vlseg8e64ff.v + case 0b0000: { // vle8.v, vle16.v, vle32.v, vle64.v + // vlseg2e8.v, vlseg2e16.v, vlseg2e32.v, vlseg2e64.v + // vlseg3e8.v, vlseg3e16.v, vlseg3e32.v, vlseg3e64.v + // vlseg4e8.v, vlseg4e16.v, vlseg4e32.v, vlseg4e64.v + // vlseg5e8.v, vlseg5e16.v, vlseg5e32.v, vlseg5e64.v + // vlseg6e8.v, vlseg6e16.v, vlseg6e32.v, vlseg6e64.v + // vlseg7e8.v, vlseg7e16.v, vlseg7e32.v, vlseg7e64.v + // vlseg8e8.v, vlseg8e16.v, vlseg8e32.v, vlseg8e64.v + WordI stride = warp.vtype.vsew / 8; + uint32_t nfields = instr.getVnf() + 1; + vector_op_vix_load(warp.vreg_file, this, rsdata, rdest, warp.vtype.vsew, warp.vl, false, stride, nfields, warp.vtype.vlmul, vmask); + break; + } + case 0b1000: { // vl1r.v, vl2r.v, vl4r.v, vl8r.v + uint32_t nreg = instr.getVnf() + 1; + if (nreg != 1 && nreg != 2 && nreg != 4 && nreg != 8) { + std::cout << "Whole vector register load - reserved value for nreg: " << nreg << std::endl; + std::abort(); + } + DP(1, "Whole vector register load with nreg: " << nreg); + uint32_t vl = nreg * VLEN / instr.getVsew(); + WordI stride = instr.getVsew() / 8; + vector_op_vix_load(warp.vreg_file, this, rsdata, rdest, instr.getVsew(), vl, false, stride, 1, 0, vmask); + break; + } + case 0b1011: { // vlm.v + if (warp.vtype.vsew != 8) { + std::cout << "vlm.v only supports EEW=8, but EEW was: " << warp.vtype.vsew << std::endl; + std::abort(); + } + WordI stride = warp.vtype.vsew / 8; + vector_op_vix_load(warp.vreg_file, this, rsdata, rdest, warp.vtype.vsew, (warp.vl + 7) / 8, false, stride, 1, 0, true); + break; + } + default: + std::cout << "Load vector - unsupported lumop: " << lumop << std::endl; + std::abort(); + } + break; + } + case 0b10: { // strided: vlse8.v, vlse16.v, vlse32.v, vlse64.v + // vlsseg2e8.v, vlsseg2e16.v, vlsseg2e32.v, vlsseg2e64.v + // vlsseg3e8.v, vlsseg3e16.v, vlsseg3e32.v, vlsseg3e64.v + // vlsseg4e8.v, vlsseg4e16.v, vlsseg4e32.v, vlsseg4e64.v + // vlsseg5e8.v, vlsseg5e16.v, vlsseg5e32.v, vlsseg5e64.v + // vlsseg6e8.v, vlsseg6e16.v, vlsseg6e32.v, vlsseg6e64.v + // vlsseg7e8.v, vlsseg7e16.v, vlsseg7e32.v, vlsseg7e64.v + // vlsseg8e8.v, vlsseg8e16.v, vlsseg8e32.v, vlsseg8e64.v + auto rsrc1 = instr.getRSrc(1); + auto rdest = instr.getRDest(); + WordI stride = warp.ireg_file.at(0).at(rsrc1); + uint32_t nfields = instr.getVnf() + 1; + vector_op_vix_load(warp.vreg_file, this, rsdata, rdest, warp.vtype.vsew, warp.vl, true, stride, nfields, warp.vtype.vlmul, vmask); + break; + } + case 0b01: // indexed - unordered, vluxei8.v, vluxei16.v, vluxei32.v, vluxei64.v + // vluxseg2e8.v, vluxseg2e16.v, vluxseg2e32.v, vluxseg2e64.v + // vluxseg3e8.v, vluxseg3e16.v, vluxseg3e32.v, vluxseg3e64.v + // vluxseg4e8.v, vluxseg4e16.v, vluxseg4e32.v, vluxseg4e64.v + // vluxseg5e8.v, vluxseg5e16.v, vluxseg5e32.v, vluxseg5e64.v + // vluxseg6e8.v, vluxseg6e16.v, vluxseg6e32.v, vluxseg6e64.v + // vluxseg7e8.v, vluxseg7e16.v, vluxseg7e32.v, vluxseg7e64.v + // vluxseg8e8.v, vluxseg8e16.v, vluxseg8e32.v, vluxseg8e64.v + case 0b11: { // indexed - ordered, vloxei8.v, vloxei16.v, vloxei32.v, vloxei64.v + // vloxseg2e8.v, vloxseg2e16.v, vloxseg2e32.v, vloxseg2e64.v + // vloxseg3e8.v, vloxseg3e16.v, vloxseg3e32.v, vloxseg3e64.v + // vloxseg4e8.v, vloxseg4e16.v, vloxseg4e32.v, vloxseg4e64.v + // vloxseg5e8.v, vloxseg5e16.v, vloxseg5e32.v, vloxseg5e64.v + // vloxseg6e8.v, vloxseg6e16.v, vloxseg6e32.v, vloxseg6e64.v + // vloxseg7e8.v, vloxseg7e16.v, vloxseg7e32.v, vloxseg7e64.v + // vloxseg8e8.v, vloxseg8e16.v, vloxseg8e32.v, vloxseg8e64.v + uint32_t nfields = instr.getVnf() + 1; + vector_op_vv_load(warp.vreg_file, this, rsdata, instr.getRSrc(1), rdest, warp.vtype.vsew, instr.getVsew(), warp.vl, nfields, warp.vtype.vlmul, vmask); + break; + } + default: + std::cout << "Load vector - unsupported mop: " << mop << std::endl; + std::abort(); + } +} + +template +void vector_op_vix_store(std::vector> &vreg_file, vortex::Emulator *emul_, std::vector &rsdata, uint32_t rsrc3, uint32_t vl, bool strided, WordI stride, uint32_t nfields, uint32_t lmul, uint32_t vmask) { + uint32_t vsew = sizeof(DT) * 8; + uint32_t emul = lmul >> 2 ? 1 : 1 << (lmul & 0b11); + for (uint32_t i = 0; i < vl * nfields; i++) { + if (isMasked(vreg_file, 0, i / nfields, vmask)) continue; + + uint32_t nfields_strided = strided ? nfields : 1; + Word mem_addr = rsdata[0][0].i + (i / nfields_strided) * stride + (i % nfields_strided) * sizeof(DT); + Word mem_data = getVregData
(vreg_file, rsrc3 + (i % nfields) * emul, i / nfields); + DP(1, "Storing: " << std::hex << mem_data << " at: " << mem_addr << " from vec reg: " << getVreg
(rsrc3 + (i % nfields) * emul, i / nfields) << " i: " << i / nfields); + emul_->dcache_write(&mem_data, mem_addr, vsew / 8); + } +} + +void vector_op_vix_store(std::vector> &vreg_file, vortex::Emulator *emul_, std::vector &rsdata, uint32_t rsrc3, uint32_t vsew, uint32_t vl, bool strided, WordI stride, uint32_t nfields, uint32_t lmul, uint32_t vmask) { + switch (vsew) { + case 8: + vector_op_vix_store(vreg_file, emul_, rsdata, rsrc3, vl, strided, stride, nfields, lmul, vmask); + break; + case 16: + vector_op_vix_store(vreg_file, emul_, rsdata, rsrc3, vl, strided, stride, nfields, lmul, vmask); + break; + case 32: + vector_op_vix_store(vreg_file, emul_, rsdata, rsrc3, vl, strided, stride, nfields, lmul, vmask); + break; + case 64: + vector_op_vix_store(vreg_file, emul_, rsdata, rsrc3, vl, strided, stride, nfields, lmul, vmask); + break; + default: + std::cout << "Failed to execute VSE for vsew: " << vsew << std::endl; + std::abort(); + } +} + +template +void vector_op_vv_store(std::vector> &vreg_file, vortex::Emulator *emul_, std::vector &rsdata, uint32_t rsrc1, uint32_t rsrc3, uint32_t iSew, uint32_t vl, uint32_t nfields, uint32_t lmul, uint32_t vmask) { + uint32_t vsew = sizeof(DT) * 8; + uint32_t emul = lmul >> 2 ? 1 : 1 << (lmul & 0b11); + for (uint32_t i = 0; i < vl * nfields; i++) { + if (isMasked(vreg_file, 0, i / nfields, vmask)) continue; + + Word offset = 0; + switch (iSew) { + case 8: + offset = getVregData(vreg_file, rsrc1, i / nfields); + break; + case 16: + offset = getVregData(vreg_file, rsrc1, i / nfields); + break; + case 32: + offset = getVregData(vreg_file, rsrc1, i / nfields); + break; + case 64: + offset = getVregData(vreg_file, rsrc1, i / nfields); + break; + default: + std::cout << "Unsupported iSew: " << iSew << std::endl; + std::abort(); + } + + Word mem_addr = rsdata[0][0].i + offset + (i % nfields) * sizeof(DT); + Word mem_data = getVregData
(vreg_file, rsrc3 + (i % nfields) * emul, i / nfields); + DP(1, "VSUX/VSOX - Storing: " << std::hex << mem_data << " at: " << mem_addr << " with offset: " << std::dec << offset << " from vec reg: " << getVreg
(rsrc3 + (i % nfields) * emul, i / nfields) << " i: " << i / nfields); + emul_->dcache_write(&mem_data, mem_addr, vsew / 8); + } +} + +void vector_op_vv_store(std::vector> &vreg_file, vortex::Emulator *emul_, std::vector &rsdata, uint32_t rsrc1, uint32_t rsrc3, uint32_t vsew, uint32_t iSew, uint32_t vl, uint32_t nfields, uint32_t lmul, uint32_t vmask) { + switch (vsew) { + case 8: + vector_op_vv_store(vreg_file, emul_, rsdata, rsrc1, rsrc3, iSew, vl, nfields, lmul, vmask); + break; + case 16: + vector_op_vv_store(vreg_file, emul_, rsdata, rsrc1, rsrc3, iSew, vl, nfields, lmul, vmask); + break; + case 32: + vector_op_vv_store(vreg_file, emul_, rsdata, rsrc1, rsrc3, iSew, vl, nfields, lmul, vmask); + break; + case 64: + vector_op_vv_store(vreg_file, emul_, rsdata, rsrc1, rsrc3, iSew, vl, nfields, lmul, vmask); + break; + default: + std::cout << "Failed to execute VSUX/VSOX for vsew: " << vsew << std::endl; + std::abort(); + } +} + +void Emulator::storeVector(const Instr &instr, uint32_t wid, std::vector &rsdata) { + auto &warp = warps_.at(wid); + auto vmask = instr.getVmask(); + auto mop = instr.getVmop(); + switch (mop) { + case 0b00: { // unit-stride + auto vs3 = instr.getRSrc(1); + auto sumop = instr.getVumop(); + WordI stride = warp.vtype.vsew / 8; + switch (sumop) { + case 0b0000: { // vse8.v, vse16.v, vse32.v, vse64.v + uint32_t nfields = instr.getVnf() + 1; + vector_op_vix_store(warp.vreg_file, this, rsdata, vs3, warp.vtype.vsew, warp.vl, false, stride, nfields, warp.vtype.vlmul, vmask); + break; + } + case 0b1000: { // vs1r.v, vs2r.v, vs4r.v, vs8r.v + uint32_t nreg = instr.getVnf() + 1; + if (nreg != 1 && nreg != 2 && nreg != 4 && nreg != 8) { + std::cout << "Whole vector register store - reserved value for nreg: " << nreg << std::endl; + std::abort(); + } + DP(1, "Whole vector register store with nreg: " << nreg); + uint32_t vl = nreg * VLEN / 8; + vector_op_vix_store(warp.vreg_file, this, rsdata, vs3, vl, false, stride, 1, 0, vmask); + break; + } + case 0b1011: { // vsm.v + if (warp.vtype.vsew != 8) { + std::cout << "vsm.v only supports EEW=8, but EEW was: " << warp.vtype.vsew << std::endl; + std::abort(); + } + vector_op_vix_store(warp.vreg_file, this, rsdata, vs3, warp.vtype.vsew, (warp.vl + 7) / 8, false, stride, 1, 0, true); + break; + } + default: + std::cout << "Store vector - unsupported sumop: " << sumop << std::endl; + std::abort(); + } + break; + } + case 0b10: { // strided: vsse8.v, vsse16.v, vsse32.v, vsse64.v + // vssseg2e8.v, vssseg2e16.v, vssseg2e32.v, vssseg2e64.v + // vssseg3e8.v, vssseg3e16.v, vssseg3e32.v, vssseg3e64.v + // vssseg4e8.v, vssseg4e16.v, vssseg4e32.v, vssseg4e64.v + // vssseg5e8.v, vssseg5e16.v, vssseg5e32.v, vssseg5e64.v + // vssseg6e8.v, vssseg6e16.v, vssseg6e32.v, vssseg6e64.v + // vssseg7e8.v, vssseg7e16.v, vssseg7e32.v, vssseg7e64.v + // vssseg8e8.v, vssseg8e16.v, vssseg8e32.v, vssseg8e64.v + auto rsrc1 = instr.getRSrc(1); + auto vs3 = instr.getRSrc(2); + WordI stride = warp.ireg_file.at(0).at(rsrc1); + uint32_t nfields = instr.getVnf() + 1; + vector_op_vix_store(warp.vreg_file, this, rsdata, vs3, warp.vtype.vsew, warp.vl, true, stride, nfields, warp.vtype.vlmul, vmask); + break; + } + case 0b01: // indexed - unordered, vsuxei8.v, vsuxei16.v, vsuxei32.v, vsuxei64.v + // vsuxseg2ei8.v, vsuxseg2ei16.v, vsuxseg2ei32.v, vsuxseg2ei64.v + // vsuxseg3ei8.v, vsuxseg3ei16.v, vsuxseg3ei32.v, vsuxseg3ei64.v + // vsuxseg4ei8.v, vsuxseg4ei16.v, vsuxseg4ei32.v, vsuxseg4ei64.v + // vsuxseg5ei8.v, vsuxseg5ei16.v, vsuxseg5ei32.v, vsuxseg5ei64.v + // vsuxseg6ei8.v, vsuxseg6ei16.v, vsuxseg6ei32.v, vsuxseg6ei64.v + // vsuxseg7ei8.v, vsuxseg7ei16.v, vsuxseg7ei32.v, vsuxseg7ei64.v + // vsuxseg8ei8.v, vsuxseg8ei16.v, vsuxseg8ei32.v, vsuxseg8ei64.v + case 0b11: { // indexed - ordered, vsoxei8.v, vsoxei16.v, vsoxei32.v, vsoxei64.v + // vsoxseg2ei8.v, vsoxseg2ei16.v, vsoxseg2ei32.v, vsoxseg2ei64.v + // vsoxseg3ei8.v, vsoxseg3ei16.v, vsoxseg3ei32.v, vsoxseg3ei64.v + // vsoxseg4ei8.v, vsoxseg4ei16.v, vsoxseg4ei32.v, vsoxseg4ei64.v + // vsoxseg5ei8.v, vsoxseg5ei16.v, vsoxseg5ei32.v, vsoxseg5ei64.v + // vsoxseg6ei8.v, vsoxseg6ei16.v, vsoxseg6ei32.v, vsoxseg6ei64.v + // vsoxseg7ei8.v, vsoxseg7ei16.v, vsoxseg7ei32.v, vsoxseg7ei64.v + // vsoxseg8ei8.v, vsoxseg8ei16.v, vsoxseg8ei32.v, vsoxseg8ei64.v + uint32_t nfields = instr.getVnf() + 1; + vector_op_vv_store(warp.vreg_file, this, rsdata, instr.getRSrc(1), instr.getRSrc(2), warp.vtype.vsew, instr.getVsew(), warp.vl, nfields, warp.vtype.vlmul, vmask); + break; + } + default: + std::cout << "Store vector - unsupported mop: " << mop << std::endl; + std::abort(); + } +} + +template