diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 71e3046..97d4781 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -59,7 +59,7 @@ jobs: mkdir -p build && cd build source ${HOME}/venv/bin/activate # Simulation across all tests. - pytest .. -v + pytest .. -v --tb=short # Tile translation. pytest ../tile/test/TileRTL_test.py -xvs --test-verilog --dump-vtb --dump-vcd # CGRA template translation. @@ -75,4 +75,6 @@ jobs: # separate crossbars (for tiles and FUs), crossbar-based data memory (for # multi-bank), and controller. pytest ../scale_out/test/RingMultiCgraRTL_test.py -xvs --test-verilog --dump-vtb --dump-vcd + # Multi-cgra with mesh topology. + pytest ../scale_out/test/MeshMultiCgraRTL_test.py -xvs --test-verilog --dump-vtb --dump-vcd diff --git a/cgra/CgraRTL.py b/cgra/CgraRTL.py index 8d8be27..a0f909f 100644 --- a/cgra/CgraRTL.py +++ b/cgra/CgraRTL.py @@ -24,20 +24,21 @@ class CgraRTL(Component): def construct(s, DataType, PredicateType, CtrlPktType, CtrlSignalType, - NocPktType, CmdType, ControllerIdType, controller_id, - width, height, ctrl_mem_size, data_mem_size_global, + NocPktType, CmdType, ControllerIdType, multi_cgra_rows, + multi_cgra_columns, controller_id, width, height, + ctrl_mem_size, data_mem_size_global, data_mem_size_per_bank, num_banks_per_cgra, num_ctrl, - total_steps, FunctionUnit, FuList, topology, - controller2addr_map, preload_data = None, + total_steps, FunctionUnit, FuList, cgra_topology, + controller2addr_map, idTo2d_map, preload_data = None, preload_const = None): # Other topology can simply modify the tiles connections, or # leverage the template for modeling. 
- assert(topology == "Mesh" or topology == "KingMesh") + assert(cgra_topology == "Mesh" or cgra_topology == "KingMesh") s.num_mesh_ports = 4 - if topology == "Mesh": + if cgra_topology == "Mesh": s.num_mesh_ports = 4 - elif topology == "KingMesh": + elif cgra_topology == "KingMesh": s.num_mesh_ports = 8 s.num_tiles = width * height @@ -81,8 +82,10 @@ def construct(s, DataType, PredicateType, CtrlPktType, CtrlSignalType, preload_data) s.controller = ControllerRTL(ControllerIdType, CmdType, CtrlPktType, NocPktType, DataType, DataAddrType, - controller_id, controller2addr_map) - s.ctrl_ring = RingNetworkRTL(CtrlPktType, CtrlRingPos, s.num_tiles, 0) + multi_cgra_rows, multi_cgra_columns, + controller_id, controller2addr_map, + idTo2d_map) + s.ctrl_ring = RingNetworkRTL(CtrlPktType, CtrlRingPos, s.num_tiles, 1) # Connections # Connects data memory with controller. @@ -123,7 +126,7 @@ def construct(s, DataType, PredicateType, CtrlPktType, CtrlSignalType, if i % width < width - 1: s.tile[i].send_data[PORT_EAST] //= s.tile[i+1].recv_data[PORT_WEST] - if topology == "KingMesh": + if cgra_topology == "KingMesh": if i % width > 0 and i // width < height - 1: s.tile[i].send_data[PORT_NORTHWEST] //= s.tile[i+width-1].recv_data[PORT_SOUTHEAST] s.tile[i+width-1].send_data[PORT_SOUTHEAST] //= s.tile[i].recv_data[PORT_NORTHWEST] diff --git a/cgra/CgraTemplateRTL.py b/cgra/CgraTemplateRTL.py index c6a2d79..d582bf3 100644 --- a/cgra/CgraTemplateRTL.py +++ b/cgra/CgraTemplateRTL.py @@ -24,12 +24,13 @@ class CgraTemplateRTL(Component): def construct(s, DataType, PredicateType, CtrlPktType, CtrlSignalType, - NocPktType, CmdType, ControllerIdType, controller_id, + NocPktType, CmdType, ControllerIdType, multi_cgra_rows, + multi_cgra_columns, controller_id, ctrl_mem_size, data_mem_size_global, data_mem_size_per_bank, num_banks_per_cgra, num_ctrl, total_steps, FunctionUnit, FuList, TileList, LinkList, - dataSPM, controller2addr_map, preload_data = None, - preload_const = None): + dataSPM, 
controller2addr_map, idTo2d_map, + preload_data = None, preload_const = None): s.num_mesh_ports = 8 s.num_tiles = len(TileList) @@ -77,8 +78,9 @@ def construct(s, DataType, PredicateType, CtrlPktType, CtrlSignalType, preload_data) s.controller = ControllerRTL(ControllerIdType, CmdType, CtrlPktType, NocPktType, DataType, DataAddrType, - controller_id, controller2addr_map) - s.ctrl_ring = RingNetworkRTL(CtrlPktType, CtrlRingPos, s.num_tiles, 0) + multi_cgra_rows, multi_cgra_columns, + controller_id, controller2addr_map, idTo2d_map) + s.ctrl_ring = RingNetworkRTL(CtrlPktType, CtrlRingPos, s.num_tiles, 1) # Connections # Connects data memory with controller. diff --git a/cgra/test/CgraRTL_test.py b/cgra/test/CgraRTL_test.py index 20892c9..ac55c7d 100644 --- a/cgra/test/CgraRTL_test.py +++ b/cgra/test/CgraRTL_test.py @@ -43,16 +43,20 @@ def construct(s, DUT, FunctionUnit, FuList, DataType, PredicateType, ControllerIdType, controller_id, width, height, ctrl_mem_size, data_mem_size_global, data_mem_size_per_bank, num_banks_per_cgra, - src_ctrl_pkt, ctrl_steps, topology, controller2addr_map): + src_ctrl_pkt, ctrl_steps, topology, controller2addr_map, + idTo2d_map): s.num_tiles = width * height s.src_ctrl_pkt = TestSrcRTL(CtrlPktType, src_ctrl_pkt) s.dut = DUT(DataType, PredicateType, CtrlPktType, CtrlSignalType, - NocPktType, CmdType, ControllerIdType, controller_id, - width, height, ctrl_mem_size, data_mem_size_global, - data_mem_size_per_bank, num_banks_per_cgra, - ctrl_steps, ctrl_steps, FunctionUnit, FuList, - topology, controller2addr_map) + NocPktType, CmdType, ControllerIdType, + # CGRA terminals on x/y. Assume in total 4, though this + # test is for single CGRA. 
+ 1, 4, + controller_id, width, height, ctrl_mem_size, + data_mem_size_global, data_mem_size_per_bank, + num_banks_per_cgra, ctrl_steps, ctrl_steps, FunctionUnit, + FuList, topology, controller2addr_map, idTo2d_map) # Connections s.src_ctrl_pkt.send //= s.dut.recv_from_cpu_ctrl_pkt @@ -127,6 +131,13 @@ def init_param(topology, FuList = [MemUnitRTL, AdderRTL]): 2: [8, 11], 3: [12, 15], } + + idTo2d_map = { + 0: [0, 0], + 1: [1, 0], + 2: [2, 0], + 3: [3, 0], + } CtrlPktType = \ mk_ring_across_tiles_pkt(width * height, @@ -144,10 +155,11 @@ def init_param(topology, FuList = [MemUnitRTL, AdderRTL]): num_tile_inports, num_tile_outports) - NocPktType = mk_ring_multi_cgra_pkt(nrouters = num_terminals, - addr_nbits = addr_nbits, - data_nbits = 32, - predicate_nbits = 1) + NocPktType = mk_multi_cgra_noc_pkt(ncols = num_terminals, + nrows = 1, + addr_nbits = addr_nbits, + data_nbits = 32, + predicate_nbits = 1) pick_register = [FuInType(x + 1) for x in range(num_fu_inports)] tile_in_code = [TileInType(max(4 - x, 0)) for x in range(num_routing_outports)] fu_out_code = [FuOutType(x % 2) for x in range(num_routing_outports)] @@ -191,7 +203,7 @@ def init_param(topology, FuList = [MemUnitRTL, AdderRTL]): ctrl_mem_size, data_mem_size_global, data_mem_size_per_bank, num_banks_per_cgra, src_ctrl_pkt, ctrl_mem_size, topology, - controller2addr_map) + controller2addr_map, idTo2d_map) return th def test_homogeneous_2x2(cmdline_opts): diff --git a/cgra/test/CgraTemplateRTL_test.py b/cgra/test/CgraTemplateRTL_test.py index c18d3c7..43e584d 100644 --- a/cgra/test/CgraTemplateRTL_test.py +++ b/cgra/test/CgraTemplateRTL_test.py @@ -58,17 +58,21 @@ def construct(s, DUT, FunctionUnit, FuList, DataType, PredicateType, ControllerIdType, controller_id, ctrl_mem_size, data_mem_size_global, data_mem_size_per_bank, num_banks_per_cgra, src_ctrl_pkt, ctrl_steps, TileList, - LinkList, dataSPM, controller2addr_map): + LinkList, dataSPM, controller2addr_map, idTo2d_map): s.num_tiles = len(TileList) 
s.src_ctrl_pkt = TestSrcRTL(CtrlPktType, src_ctrl_pkt) s.dut = DUT(DataType, PredicateType, CtrlPktType, CtrlSignalType, - NocPktType, CmdType, ControllerIdType, controller_id, - ctrl_mem_size, data_mem_size_global, + NocPktType, CmdType, ControllerIdType, + # CGRA terminals on x/y. Assume in total 4, though this + # test is for single CGRA. + 1, 4, + controller_id, ctrl_mem_size, data_mem_size_global, data_mem_size_per_bank, num_banks_per_cgra, ctrl_steps, ctrl_steps, FunctionUnit, FuList, - TileList, LinkList, dataSPM, controller2addr_map) + TileList, LinkList, dataSPM, controller2addr_map, + idTo2d_map) # Connections s.src_ctrl_pkt.send //= s.dut.recv_from_cpu_ctrl_pkt @@ -205,6 +209,13 @@ def test_cgra_universal(cmdline_opts, paramCGRA = None): 3: [12, 15], } + idTo2d_map = { + 0: [0, 0], + 1: [1, 0], + 2: [2, 0], + 3: [3, 0], + } + CtrlPktType = \ mk_ring_across_tiles_pkt(width * height, num_ctrl_actions, @@ -221,10 +232,11 @@ def test_cgra_universal(cmdline_opts, paramCGRA = None): num_tile_inports, num_tile_outports) - NocPktType = mk_ring_multi_cgra_pkt(nrouters = num_terminals, - addr_nbits = addr_nbits, - data_nbits = 32, - predicate_nbits = 1) + NocPktType = mk_multi_cgra_noc_pkt(ncols = num_terminals, + nrows = 1, + addr_nbits = addr_nbits, + data_nbits = 32, + predicate_nbits = 1) pick_register = [FuInType(x + 1) for x in range(num_fu_inports)] tile_in_code = [TileInType(max(4 - x, 0)) for x in range(num_routing_outports)] fu_out_code = [FuOutType(x % 2) for x in range(num_routing_outports)] @@ -376,7 +388,7 @@ def handleReshape( t_tiles ): ctrl_mem_size, data_mem_size_global, data_mem_size_per_bank, num_banks_per_cgra, src_ctrl_pkt, ctrl_mem_size, tiles, links, dataSPM, - controller2addr_map) + controller2addr_map, idTo2d_map) th.elaborate() th.dut.set_metadata(VerilogTranslationPass.explicit_module_name, diff --git a/controller/ControllerRTL.py b/controller/ControllerRTL.py index 11a07a3..f164f1d 100644 --- a/controller/ControllerRTL.py +++ 
b/controller/ControllerRTL.py @@ -21,8 +21,15 @@ class ControllerRTL(Component): def construct(s, ControllerIdType, CmdType, CtrlPktType, NocPktType, - CGRADataType, CGRAAddrType, controller_id, - controller2addr_map): + CGRADataType, CGRAAddrType, multi_cgra_rows, + multi_cgra_columns, controller_id, controller2addr_map, + idTo2d_map): + + assert(multi_cgra_columns >= multi_cgra_rows) + + # Used for calculating the x/y position. + XType = mk_bits(max(clog2(multi_cgra_columns), 1)) + YType = mk_bits(max(clog2(multi_cgra_rows), 1)) # Interface # Request from/to other CGRA via NoC. @@ -57,7 +64,7 @@ def construct(s, ControllerIdType, CmdType, CtrlPktType, NocPktType, # s.send_to_other_cmd_queue = ChannelRTL(CmdType, latency = 1, num_entries = 2) # Crossbar with 3 inports (load and store requests towards remote - # memory, and load response from master) and 1 outport (only + # memory, and load response from local memory) and 1 outport (only # allow one request be sent out per cycle). # TODO: Include other cmd requests, e.g., dynamic rescheduling, # termination). @@ -97,6 +104,14 @@ def construct(s, ControllerIdType, CmdType, CtrlPktType, NocPktType, s.addr2controller_lut[addr_base] //= ControllerIdType(src_controller_id) + # Constructs the idTo2d lut. + s.idTo2d_x_lut= [Wire(XType) for _ in range(multi_cgra_columns * multi_cgra_rows)] + s.idTo2d_y_lut= [Wire(YType) for _ in range(multi_cgra_columns * multi_cgra_rows)] + for cgra_id in idTo2d_map: + xy = idTo2d_map[cgra_id] + s.idTo2d_x_lut[cgra_id] //= XType(xy[0]) + s.idTo2d_y_lut[cgra_id] //= YType(xy[1]) + # Connections # Requests towards others, 1 cycle delay to improve timing. s.recv_from_tile_load_request_pkt_queue.recv //= s.recv_from_tile_load_request_pkt @@ -125,39 +140,56 @@ def update_received_msg(): kLoadResponseInportIdx = 1 kStoreRequestInportIdx = 2 - # For the load request from master. + # For the load request from local tiles. 
s.crossbar.recv[kLoadRequestInportIdx].val @= s.recv_from_tile_load_request_pkt_queue.send.val s.recv_from_tile_load_request_pkt_queue.send.rdy @= s.crossbar.recv[kLoadRequestInportIdx].rdy s.crossbar.recv[kLoadRequestInportIdx].msg @= \ NocPktType(controller_id, 0, + s.idTo2d_x_lut[controller_id], # src_x + s.idTo2d_y_lut[controller_id], # src_y + 0, # dst_x + 0, # dst_y 0, 0, CMD_LOAD_REQUEST, s.recv_from_tile_load_request_pkt_queue.send.msg.addr, 0, - 1) + 1, + 0) + - # For the store request from master. + + # For the store request from local tiles. s.crossbar.recv[kStoreRequestInportIdx].val @= s.recv_from_tile_store_request_pkt_queue.send.val s.recv_from_tile_store_request_pkt_queue.send.rdy @= s.crossbar.recv[kStoreRequestInportIdx].rdy s.crossbar.recv[kStoreRequestInportIdx].msg @= \ NocPktType(controller_id, 0, + s.idTo2d_x_lut[controller_id], # src_x + s.idTo2d_y_lut[controller_id], # src_y + 0, # dst_x + 0, # dst_y 0, 0, CMD_STORE_REQUEST, s.recv_from_tile_store_request_pkt_queue.send.msg.addr, s.recv_from_tile_store_request_pkt_queue.send.msg.data, - s.recv_from_tile_store_request_pkt_queue.send.msg.predicate) + s.recv_from_tile_store_request_pkt_queue.send.msg.predicate, + 0) + - # For the load response (i.e., the data towards other) from master. + # For the load response (i.e., the data towards other) from local memory. s.crossbar.recv[kLoadResponseInportIdx].val @= \ s.recv_from_tile_load_response_pkt_queue.send.val s.recv_from_tile_load_response_pkt_queue.send.rdy @= s.crossbar.recv[kLoadResponseInportIdx].rdy s.crossbar.recv[kLoadResponseInportIdx].msg @= \ NocPktType(controller_id, 0, + s.idTo2d_x_lut[controller_id], # src_x + s.idTo2d_y_lut[controller_id], # src_y + 0, # dst_x + 0, # dst_y 0, 0, CMD_LOAD_RESPONSE, @@ -165,7 +197,9 @@ def update_received_msg(): # The addr information is embedded in the message. 
s.recv_from_tile_load_response_pkt_queue.send.msg.addr, s.recv_from_tile_load_response_pkt_queue.send.msg.data, - s.recv_from_tile_load_response_pkt_queue.send.msg.predicate) + s.recv_from_tile_load_response_pkt_queue.send.msg.predicate, + 0) + # TODO: For the other cmd types. @@ -224,12 +258,17 @@ def update_sending_to_noc_msg(): s.send_to_noc.msg @= \ NocPktType(s.crossbar.send[0].msg.src, addr_dst_id, + s.crossbar.send[0].msg.src_x, + s.crossbar.send[0].msg.src_y, + s.idTo2d_x_lut[addr_dst_id], + s.idTo2d_y_lut[addr_dst_id], s.crossbar.send[0].msg.opaque, s.crossbar.send[0].msg.vc_id, s.crossbar.send[0].msg.cmd, s.crossbar.send[0].msg.addr, s.crossbar.send[0].msg.data, - s.crossbar.send[0].msg.predicate) + s.crossbar.send[0].msg.predicate, + s.crossbar.send[0].msg.payload) def line_trace(s): send_to_ctrl_ring_ctrl_pkt_str = "send_to_ctrl_ring_ctrl_pkt: " + str(s.send_to_ctrl_ring_ctrl_pkt.msg) diff --git a/controller/test/ControllerRTL_test.py b/controller/test/ControllerRTL_test.py index a1a6783..2942ab8 100644 --- a/controller/test/ControllerRTL_test.py +++ b/controller/test/ControllerRTL_test.py @@ -8,7 +8,6 @@ Date : Dec 15, 2024 ''' - from pymtl3 import * from pymtl3.stdlib.test_utils import TestVectorSimulator from ..ControllerRTL import ControllerRTL @@ -38,7 +37,8 @@ def construct(s, ControllerIdType, CtrlPktType, CmdType, MsgType, expected_to_tile_store_request_data_msgs, from_noc_pkts, expected_to_noc_pkts, - controller2addr_map): + controller2addr_map, + idTo2d_map, num_terminals): cmp_func = lambda a, b : a == b # a.data == b.data @@ -55,8 +55,12 @@ def construct(s, ControllerIdType, CtrlPktType, CmdType, MsgType, s.sink_to_noc_val_rdy = TestNetSinkRTL(PktType, expected_to_noc_pkts, cmp_fn = cmp_func) s.dut = ControllerRTL(ControllerIdType, CmdType, CtrlPktType, - PktType, MsgType, AddrType, controller_id, - controller2addr_map) + PktType, MsgType, AddrType, + # Number of controllers globally (x/y dimension). 
+ 1, num_terminals, + controller_id, + controller2addr_map, + idTo2d_map) # Connections s.src_from_tile_load_request_pkt_en_rdy.send //= s.dut.recv_from_tile_load_request_pkt @@ -152,6 +156,13 @@ def mk_src_pkts(nterminals, lst): controller_id = 1 +idTo2d_map = { + 0: [0, 0], + 1: [1, 0], + 2: [2, 0], + 3: [3, 0] +} + controller2addr_map = { 0: [0, 3], 1: [4, 7], @@ -168,30 +179,30 @@ def mk_src_pkts(nterminals, lst): num_tile_inports, num_tile_outports) -Pkt = mk_ring_multi_cgra_pkt(nterminals, - addr_nbits = addr_nbits, - data_nbits = data_nbits, - predicate_nbits = predicate_nbits) +Pkt = mk_multi_cgra_noc_pkt(nterminals, 1, + addr_nbits = addr_nbits, + data_nbits = data_nbits, + predicate_nbits = predicate_nbits) from_tile_load_request_pkts = [ - # src dst opq vc cmd addr data predicate - Pkt(0, 0, 0, 0, CMD_LOAD_REQUEST, 1, 0, 1), - Pkt(0, 0, 0, 0, CMD_LOAD_REQUEST, 8, 0, 1), - Pkt(0, 0, 0, 0, CMD_LOAD_REQUEST, 13, 0, 1), + # src dst src_x src_y dst_x dst_y opq vc cmd addr data predicate + Pkt(0, 0, 0, 0, 0, 0, 0, 0, CMD_LOAD_REQUEST, 1, 0, 1), + Pkt(0, 0, 0, 0, 0, 0, 0, 0, CMD_LOAD_REQUEST, 8, 0, 1), + Pkt(0, 0, 0, 0, 0, 0, 0, 0, CMD_LOAD_REQUEST, 13, 0, 1), ] from_tile_load_response_pkts = [ - # src dst opq vc cmd addr data predicate - Pkt(0, 0, 0, 0, CMD_LOAD_RESPONSE, 11, 11, 1), - Pkt(0, 0, 0, 0, CMD_LOAD_RESPONSE, 14, 14, 1), - Pkt(0, 0, 0, 0, CMD_LOAD_RESPONSE, 12, 12, 1), + # src dst src_x src_y dst_x dst_y opq vc cmd addr data predicate + Pkt(0, 0, 0, 0, 0, 0, 0, 0, CMD_LOAD_RESPONSE, 11, 11, 1), + Pkt(0, 0, 0, 0, 0, 0, 0, 0, CMD_LOAD_RESPONSE, 14, 14, 1), + Pkt(0, 0, 0, 0, 0, 0, 0, 0, CMD_LOAD_RESPONSE, 12, 12, 1), ] from_tile_store_request_pkts = [ - # src dst opq vc cmd addr data predicate - Pkt(0, 0, 0, 0, CMD_STORE_REQUEST, 11, 110, 1), - Pkt(0, 0, 0, 0, CMD_STORE_REQUEST, 3, 300, 1), - Pkt(0, 0, 0, 0, CMD_STORE_REQUEST, 15, 150, 1), + # src dst src_x src_y dst_x dst_y opq vc cmd addr data predicate + Pkt(0, 0, 0, 0, 0, 0, 0, 0, 
CMD_STORE_REQUEST, 11, 110, 1), + Pkt(0, 0, 0, 0, 0, 0, 0, 0, CMD_STORE_REQUEST, 3, 300, 1), + Pkt(0, 0, 0, 0, 0, 0, 0, 0, CMD_STORE_REQUEST, 15, 150, 1), ] expected_to_tile_load_request_addr_msgs = [AddrType(2)] @@ -201,26 +212,26 @@ def mk_src_pkts(nterminals, lst): expected_to_tile_store_request_data_msgs = [DataType(50, 1)] from_noc_pkts = [ - # src dst opq vc cmd addr data predicate - Pkt(1, 0, 0, 0, CMD_LOAD_REQUEST, 2, 0, 1), - Pkt(2, 1, 0, 0, CMD_LOAD_RESPONSE, 8, 80, 1), - Pkt(0, 1, 0, 0, CMD_STORE_REQUEST, 5, 50, 1), - Pkt(0, 1, 0, 0, CMD_LOAD_RESPONSE, 9, 90, 1), + # src dst src_x src_y dst_x dst_y opq vc cmd addr data predicate + Pkt(1, 0, 1, 0, 0, 0, 0, 0, CMD_LOAD_REQUEST, 2, 0, 1), + Pkt(2, 1, 2, 0, 1, 0, 0, 0, CMD_LOAD_RESPONSE, 8, 80, 1), + Pkt(0, 1, 0, 0, 1, 0, 0, 0, CMD_STORE_REQUEST, 5, 50, 1), + Pkt(0, 1, 0, 0, 1, 0, 0, 0, CMD_LOAD_RESPONSE, 9, 90, 1), ] expected_to_noc_pkts = [ - # src dst opq vc cmd addr data predicate - Pkt(1, 0, 0, 0, CMD_LOAD_REQUEST, 1, 0, 1), - Pkt(1, 2, 0, 0, CMD_LOAD_RESPONSE, 11, 11, 1), - Pkt(1, 2, 0, 0, CMD_STORE_REQUEST, 11, 110, 1), - - Pkt(1, 2, 0, 0, CMD_LOAD_REQUEST, 8, 0, 1), - Pkt(1, 3, 0, 0, CMD_LOAD_RESPONSE, 14, 14, 1), - Pkt(1, 0, 0, 0, CMD_STORE_REQUEST, 3, 300, 1), - - Pkt(1, 3, 0, 0, CMD_LOAD_REQUEST, 13, 0, 1), - Pkt(1, 3, 0, 0, CMD_LOAD_RESPONSE, 12, 12, 1), - Pkt(1, 3, 0, 0, CMD_STORE_REQUEST, 15, 150, 1), + # src dst src_x src_y dst_x dst_y opq vc cmd addr data predicate + Pkt(1, 0, 1, 0, 0, 0, 0, 0, CMD_LOAD_REQUEST, 1, 0, 1), + Pkt(1, 2, 1, 0, 2, 0, 0, 0, CMD_LOAD_RESPONSE, 11, 11, 1), + Pkt(1, 2, 1, 0, 2, 0, 0, 0, CMD_STORE_REQUEST, 11, 110, 1), + + Pkt(1, 2, 1, 0, 2, 0, 0, 0, CMD_LOAD_REQUEST, 8, 0, 1), + Pkt(1, 3, 1, 0, 3, 0, 0, 0, CMD_LOAD_RESPONSE, 14, 14, 1), + Pkt(1, 0, 1, 0, 0, 0, 0, 0, CMD_STORE_REQUEST, 3, 300, 1), + + Pkt(1, 3, 1, 0, 3, 0, 0, 0, CMD_LOAD_REQUEST, 13, 0, 1), + Pkt(1, 3, 1, 0, 3, 0, 0, 0, CMD_LOAD_RESPONSE, 12, 12, 1), + Pkt(1, 3, 1, 0, 3, 0, 0, 0, CMD_STORE_REQUEST, 15, 
150, 1), ] def test_simple(): @@ -240,6 +251,7 @@ def test_simple(): expected_to_tile_store_request_data_msgs, from_noc_pkts, expected_to_noc_pkts, - controller2addr_map) + controller2addr_map, idTo2d_map, + nterminals) run_sim(th) diff --git a/controller/test/TODO b/controller/test/TODO deleted file mode 100644 index 2a8d445..0000000 --- a/controller/test/TODO +++ /dev/null @@ -1,3 +0,0 @@ - - [ ] DataMemWithCrossbarRTL test with requests from NoC -> return correct response. - - [ ] E2e test with NoC. - - [ ] Translation. diff --git a/lib/messages.py b/lib/messages.py index 6bcbe42..0aab313 100644 --- a/lib/messages.py +++ b/lib/messages.py @@ -294,6 +294,76 @@ def str_func(s): namespace = {'__str__': str_func} ) +#========================================================================= +# Mesh multi-CGRA data/config/cmd packet +#========================================================================= + +def mk_multi_cgra_noc_pkt(ncols = 2, nrows = 2, opaque_nbits = 8, vc = 2, + cmd_nbits = 6, addr_nbits = 16, + data_nbits = 16, predicate_nbits = 1, + prefix="MeshMultiCGRAPacket"): + + IdType = mk_bits(max(clog2(ncols * nrows), 1)) + XType = mk_bits(max(clog2(ncols), 1)) + YType = mk_bits(max(clog2(nrows), 1)) + OpqType = mk_bits(opaque_nbits) + CmdType = mk_bits(cmd_nbits) + AddrType = mk_bits(addr_nbits) + DataType = mk_bits(data_nbits) + PredicateType = mk_bits(predicate_nbits) + PayloadType = mk_bits(1) + + new_name = f"{prefix}_{ncols*nrows}_{ncols}x{nrows}_{vc}_{opaque_nbits}_" \ + f"{cmd_nbits}_{addr_nbits}_{data_nbits}_{predicate_nbits}_1" + + if vc > 1: + VcIdType = mk_bits(clog2(vc)) + + def str_func(s): + return f"{s.src}>{s.dst}&{s.src_x},{s.src_y}>{s.dst_x},{s.dst_y}:" \ + f"{s.opaque}:{s.vc_id}:{s.cmd}.{s.addr}.{s.data}.{s.predicate}." 
\ + f"{s.payload}" + + return mk_bitstruct(new_name, { + 'src': IdType, + 'dst': IdType, + 'src_x': XType, + 'src_y': YType, + 'dst_x': XType, + 'dst_y': YType, + 'opaque': OpqType, + 'vc_id': VcIdType, + 'cmd': CmdType, + 'addr': AddrType, + 'data': DataType, + 'predicate': PredicateType, + 'payload': PayloadType, + }, + namespace = {'__str__': str_func} + ) + + else: + def str_func(s): + return f"{s.src}>{s.dst}&{s.src_x},{s.src_y}>{s.dst_x},{s.dst_y}:" \ + f"{s.opaque}:{s.cmd}.{s.addr}.{s.data}.{s.predicate}.{s.payload}" + + return mk_bitstruct(new_name, { + 'src': IdType, + 'dst': IdType, + 'src_x': XType, + 'src_y': YType, + 'dst_x': XType, + 'dst_y': YType, + 'opaque': OpqType, + 'cmd': CmdType, + 'addr': AddrType, + 'data': DataType, + 'predicate': PredicateType, + 'payload': PayloadType, + }, + namespace = {'__str__': str_func} + ) + #========================================================================= # Ring for delivering ctrl signals and commands across tiles #========================================================================= diff --git a/mem/ctrl/RingMultiCtrlMemDynamicRTL.py b/mem/ctrl/RingMultiCtrlMemDynamicRTL.py index 3be5f32..bd2c8f7 100644 --- a/mem/ctrl/RingMultiCtrlMemDynamicRTL.py +++ b/mem/ctrl/RingMultiCtrlMemDynamicRTL.py @@ -37,7 +37,7 @@ def construct(s, CtrlPktType, CtrlSignalType, width, height, num_fu_inports, num_fu_outports, num_tile_inports, num_tile_outports, ctrl_count_per_iter, total_ctrl_steps) for terminal_id in range(s.num_terminals)] - s.ctrl_ring = RingNetworkRTL(CtrlPktType, CtrlRingPos, num_terminals, 0) + s.ctrl_ring = RingNetworkRTL(CtrlPktType, CtrlRingPos, num_terminals, 1) # Connections for i in range(s.num_terminals): diff --git a/mem/data/DataMemWithCrossbarRTL.py b/mem/data/DataMemWithCrossbarRTL.py index b32ed4d..60fde6d 100644 --- a/mem/data/DataMemWithCrossbarRTL.py +++ b/mem/data/DataMemWithCrossbarRTL.py @@ -204,10 +204,11 @@ def update_all(): else: s.send_to_noc_load_response_pkt.msg @= \ 
NocPktType( - 0, 0, 0, 0, CMD_LOAD_RESPONSE, + 0, 0, 0, 0, 0, 0, 0, 0, CMD_LOAD_RESPONSE, s.read_crossbar.send[s.read_crossbar.packet_on_input_units[i].dst].msg.addr, s.reg_file[trunc(s.read_crossbar.packet_on_input_units[i].dst, LocalBankIndexType)].rdata[0].payload, - s.reg_file[trunc(s.read_crossbar.packet_on_input_units[i].dst, LocalBankIndexType)].rdata[0].predicate + s.reg_file[trunc(s.read_crossbar.packet_on_input_units[i].dst, LocalBankIndexType)].rdata[0].predicate, + 0 ) s.send_to_noc_load_response_pkt.val @= \ s.read_crossbar.send[s.read_crossbar.packet_on_input_units[i].dst].val @@ -234,12 +235,17 @@ def update_all(): s.send_to_noc_load_request_pkt.msg @= \ NocPktType(0, # src 0, # dst + 0, # src_x + 0, # src_y + 0, # dst_x + 0, # dst_y 0, # opaque 0, # vc_id CMD_LOAD_REQUEST, s.read_crossbar.send[num_banks].msg.addr, 0, # data - 1) # predicate + 1, # predicate + 0) # payload # 'send_to_noc_load_pending' avoids sending pending request multiple times. s.send_to_noc_load_request_pkt.val @= s.read_crossbar.send[num_banks].val & \ s.recv_from_noc_rdata.val @@ -272,12 +278,17 @@ def update_all(): s.send_to_noc_store_pkt.msg @= \ NocPktType(0, # src 0, # dst + 0, # src_x + 0, # src_y + 0, # dst_x + 0, # dst_y 0, # opaque 0, # vc_id CMD_STORE_REQUEST, s.write_crossbar.send[num_banks].msg.addr, s.recv_wdata_bypass_q[s.write_crossbar.send[num_banks].msg.src].send.msg.payload, - s.recv_wdata_bypass_q[s.write_crossbar.send[num_banks].msg.src].send.msg.predicate) + s.recv_wdata_bypass_q[s.write_crossbar.send[num_banks].msg.src].send.msg.predicate, + 0) s.send_to_noc_store_pkt.val @= s.write_crossbar.send[num_banks].val # & s.send_to_noc_store_pkt.rdy s.write_crossbar.send[num_banks].rdy @= s.send_to_noc_store_pkt.rdy diff --git a/mem/data/test/DataMemWithCrossbarRTL_test.py b/mem/data/test/DataMemWithCrossbarRTL_test.py index 98f8889..057a54b 100644 --- a/mem/data/test/DataMemWithCrossbarRTL_test.py +++ b/mem/data/test/DataMemWithCrossbarRTL_test.py @@ -121,10 
+121,10 @@ def test_const_queue(cmdline_opts): AddrType = mk_bits(addr_nbits) NocPktType = \ - mk_ring_multi_cgra_pkt(nterminals, - addr_nbits = addr_nbits, - data_nbits = data_nbits, - predicate_nbits = predicate_nbits) + mk_multi_cgra_noc_pkt(nterminals, 1, + addr_nbits = addr_nbits, + data_nbits = data_nbits, + predicate_nbits = predicate_nbits) test_meta_data = [ # addr: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 @@ -169,8 +169,8 @@ def test_const_queue(cmdline_opts): # Input data. # noc_send_read_addr = [AddrType(42)] send_to_noc_load_request_pkt = [ - # src dst opq vc cmd addr data predicate - NocPktType(0, 0, 0, 0, CMD_LOAD_REQUEST, 42, 0, 1), + # src dst src_x src_y dst_x dst_y opq vc cmd addr data predicate + NocPktType(0, 0, 0, 0, 0, 0, 0, 0, CMD_LOAD_REQUEST, 42, 0, 1), ] noc_recv_load_data = [DataType(0xbbbb, 1)] @@ -178,9 +178,9 @@ def test_const_queue(cmdline_opts): # noc_send_write_addr = [AddrType(40), AddrType(45)] # noc_send_write_data = [DataType(0xd040, 1), DataType(0xd545, 1)] send_to_noc_store_pkt = [ - # src dst opq vc cmd addr data predicate - NocPktType(0, 0, 0, 0, CMD_STORE_REQUEST, 40, 0xd040, 1), - NocPktType(0, 0, 0, 0, CMD_STORE_REQUEST, 45, 0xd545, 1), + # src dst src_x src_y dst_x dst_y opq vc cmd addr data predicate + NocPktType(0, 0, 0, 0, 0, 0, 0, 0, CMD_STORE_REQUEST, 40, 0xd040, 1), + NocPktType(0, 0, 0, 0, 0, 0, 0, 0, CMD_STORE_REQUEST, 45, 0xd545, 1), ] th = TestHarness(NocPktType, DataType, AddrType, data_mem_size_global, diff --git a/noc/PyOCN b/noc/PyOCN index 84c9f40..32c77e0 160000 --- a/noc/PyOCN +++ b/noc/PyOCN @@ -1 +1 @@ -Subproject commit 84c9f407e9a7086d876e40a5ae476feb77d28872 +Subproject commit 32c77e02216cc5e6110e643c60fc3abb2d210a89 diff --git a/scale_out/MeshMultiCgraRTL.py b/scale_out/MeshMultiCgraRTL.py new file mode 100644 index 0000000..f0ee900 --- /dev/null +++ b/scale_out/MeshMultiCgraRTL.py @@ -0,0 +1,115 @@ +""" +========================================================================== +MeshMultiCgraRTL.py 
+========================================================================== +Mesh connecting multiple CGRAs, each CGRA contains one controller. + +Author : Cheng Tan + Date : Jan 8, 2025 +""" + +from pymtl3 import * +from pymtl3.stdlib.primitive import RegisterFile +from ..cgra.CgraRTL import CgraRTL +from ..lib.basic.val_rdy.ifcs import ValRdyRecvIfcRTL as RecvIfcRTL +from ..lib.opt_type import * +from ..lib.util.common import * +from ..noc.PyOCN.pymtl3_net.ocnlib.ifcs.positions import mk_mesh_pos +from ..noc.PyOCN.pymtl3_net.meshnet.MeshNetworkRTL import MeshNetworkRTL + +class MeshMultiCgraRTL(Component): + def construct(s, CGRADataType, PredicateType, CtrlPktType, + CtrlSignalType, NocPktType, CmdType, cgra_rows, + cgra_columns, tile_rows, tile_columns, ctrl_mem_size, + data_mem_size_global, data_mem_size_per_bank, + num_banks_per_cgra, num_ctrl, total_steps, FunctionUnit, + FuList, controller2addr_map, preload_data = None, + preload_const = None): + + # Constant + s.num_terminals = cgra_rows * cgra_columns + idTo2d_map = {} + + # Mesh position takes column as argument first. + MeshPos = mk_mesh_pos(cgra_columns, cgra_rows) + s.num_tiles = tile_rows * tile_columns + CtrlAddrType = mk_bits(clog2(ctrl_mem_size)) + ControllerIdType = mk_bits(clog2(s.num_terminals)) + + # Interface + # Request from/to CPU. 
+ s.recv_from_cpu_ctrl_pkt = RecvIfcRTL(CtrlPktType) + + # Components + for cgra_row in range(cgra_rows): + for cgra_col in range(cgra_columns): + idTo2d_map[cgra_row * cgra_columns + cgra_col] = (cgra_col, cgra_row) + + s.cgra = [CgraRTL(CGRADataType, PredicateType, CtrlPktType, + CtrlSignalType, NocPktType, CmdType, + ControllerIdType, cgra_rows, cgra_columns, + terminal_id, tile_columns, tile_rows, + ctrl_mem_size, data_mem_size_global, + data_mem_size_per_bank, num_banks_per_cgra, + num_ctrl, total_steps, FunctionUnit, FuList, + "Mesh", controller2addr_map, idTo2d_map, + preload_data = None, preload_const = None) + for terminal_id in range(s.num_terminals)] + # Latency is 1. + s.mesh = MeshNetworkRTL(NocPktType, MeshPos, cgra_columns, cgra_rows, 1) + + # Connections + s.recv_from_cpu_ctrl_pkt //= s.cgra[0].recv_from_cpu_ctrl_pkt + for i in range(s.num_terminals): + s.mesh.send[i] //= s.cgra[i].recv_from_noc + s.mesh.recv[i] //= s.cgra[i].send_to_noc + + for i in range(1, s.num_terminals): + s.cgra[i].recv_from_cpu_ctrl_pkt.val //= 0 + s.cgra[i].recv_from_cpu_ctrl_pkt.msg //= CtrlPktType() + + # Connects the tiles on the boundary of each two adjacent CGRAs. 
+ for cgra_row in range(cgra_rows): + for cgra_col in range(cgra_columns): + if cgra_row != 0: + for tile_col in range(tile_columns): + s.cgra[cgra_row * cgra_columns + cgra_col].send_data_on_boundary_south[tile_col] //= \ + s.cgra[(cgra_row - 1) * cgra_columns + cgra_col].recv_data_on_boundary_north[tile_col] + s.cgra[cgra_row * cgra_columns + cgra_col].recv_data_on_boundary_south[tile_col] //= \ + s.cgra[(cgra_row - 1) * cgra_columns + cgra_col].send_data_on_boundary_north[tile_col] + else: + for tile_col in range(tile_columns): + s.cgra[cgra_row * cgra_columns + cgra_col].send_data_on_boundary_south[tile_col].rdy //= 0 + s.cgra[cgra_row * cgra_columns + cgra_col].recv_data_on_boundary_south[tile_col].val //= 0 + s.cgra[cgra_row * cgra_columns + cgra_col].recv_data_on_boundary_south[tile_col].msg //= CGRADataType() + + if cgra_row == cgra_rows - 1: + for tile_col in range(tile_columns): + s.cgra[cgra_row * cgra_columns + cgra_col].send_data_on_boundary_north[tile_col].rdy //= 0 + s.cgra[cgra_row * cgra_columns + cgra_col].recv_data_on_boundary_north[tile_col].val //= 0 + s.cgra[cgra_row * cgra_columns + cgra_col].recv_data_on_boundary_north[tile_col].msg //= CGRADataType() + + if cgra_col != 0: + for tile_row in range(tile_rows): + s.cgra[cgra_row * cgra_columns + cgra_col].send_data_on_boundary_west[tile_row] //= \ + s.cgra[cgra_row * cgra_columns + cgra_col - 1].recv_data_on_boundary_east[tile_row] + s.cgra[cgra_row * cgra_columns + cgra_col].recv_data_on_boundary_west[tile_row] //= \ + s.cgra[cgra_row * cgra_columns + cgra_col - 1].send_data_on_boundary_east[tile_row] + else: + for tile_row in range(tile_rows): + s.cgra[cgra_row * cgra_columns + cgra_col].send_data_on_boundary_west[tile_row].rdy //= 0 + s.cgra[cgra_row * cgra_columns + cgra_col].recv_data_on_boundary_west[tile_row].val //= 0 + s.cgra[cgra_row * cgra_columns + cgra_col].recv_data_on_boundary_west[tile_row].msg //= CGRADataType() + + if cgra_col == cgra_columns - 1: + for tile_row in 
range(tile_rows): + s.cgra[cgra_row * cgra_columns + cgra_col].send_data_on_boundary_east[tile_row].rdy //= 0 + s.cgra[cgra_row * cgra_columns + cgra_col].recv_data_on_boundary_east[tile_row].val //= 0 + s.cgra[cgra_row * cgra_columns + cgra_col].recv_data_on_boundary_east[tile_row].msg //= CGRADataType() + + def line_trace(s): + res = "||\n".join([(("[cgra["+str(i)+"]: ") + x.line_trace()) + for (i,x) in enumerate(s.cgra)]) + res += " ## mesh: " + s.mesh.line_trace() + return res + diff --git a/scale_out/RingMultiCgraRTL.py b/scale_out/RingMultiCgraRTL.py index c182768..916dd70 100644 --- a/scale_out/RingMultiCgraRTL.py +++ b/scale_out/RingMultiCgraRTL.py @@ -1,6 +1,6 @@ """ ========================================================================== -RingMultiCgraRingCtrlMemRTL.py +RingMultiCgraRTL.py ========================================================================== Ring connecting multiple CGRAs, each CGRA contains one controller. @@ -11,8 +11,7 @@ from pymtl3 import * from pymtl3.stdlib.primitive import RegisterFile from ..cgra.CgraRTL import CgraRTL -from ..lib.basic.en_rdy.ifcs import SendIfcRTL, RecvIfcRTL -from ..lib.basic.val_rdy.ifcs import ValRdyRecvIfcRTL +from ..lib.basic.val_rdy.ifcs import ValRdyRecvIfcRTL as RecvIfcRTL from ..lib.opt_type import * from ..lib.util.common import * from ..noc.PyOCN.pymtl3_net.ocnlib.ifcs.positions import mk_ring_pos @@ -28,6 +27,7 @@ def construct(s, CGRADataType, PredicateType, CtrlPktType, preload_const = None): # Constant + idTo2d_map = {} s.num_terminals = cgra_rows * cgra_columns RingPos = mk_ring_pos(s.num_terminals) s.num_tiles = tile_rows * tile_columns @@ -36,19 +36,26 @@ def construct(s, CGRADataType, PredicateType, CtrlPktType, # Interface # Request from/to CPU. - s.recv_from_cpu_ctrl_pkt = ValRdyRecvIfcRTL(CtrlPktType) + s.recv_from_cpu_ctrl_pkt = RecvIfcRTL(CtrlPktType) # Components + # Constructs the topology as 1d. 
+ for terminal_id in range(s.num_terminals): + idTo2d_map[terminal_id] = (terminal_id, 0) + s.cgra = [CgraRTL(CGRADataType, PredicateType, CtrlPktType, CtrlSignalType, NocPktType, CmdType, - ControllerIdType, terminal_id, tile_columns, - tile_rows, ctrl_mem_size, data_mem_size_global, + ControllerIdType, + # Constructs the topology as 1d. + 1, s.num_terminals, + terminal_id, tile_columns, tile_rows, + ctrl_mem_size, data_mem_size_global, data_mem_size_per_bank, num_banks_per_cgra, num_ctrl, total_steps, FunctionUnit, FuList, - "Mesh", controller2addr_map, preload_data = None, - preload_const = None) + "Mesh", controller2addr_map, idTo2d_map, + preload_data = None, preload_const = None) for terminal_id in range(s.num_terminals)] - s.ring = RingNetworkRTL(NocPktType, RingPos, s.num_terminals, 0) + s.ring = RingNetworkRTL(NocPktType, RingPos, s.num_terminals, 1) # Connections s.recv_from_cpu_ctrl_pkt //= s.cgra[0].recv_from_cpu_ctrl_pkt diff --git a/scale_out/test/MeshMultiCgraRTL_test.py b/scale_out/test/MeshMultiCgraRTL_test.py new file mode 100644 index 0000000..abe24b5 --- /dev/null +++ b/scale_out/test/MeshMultiCgraRTL_test.py @@ -0,0 +1,193 @@ +""" +========================================================================== +MeshMultiCgraRTL_test.py +========================================================================== +Test cases for multi-CGRA with mesh NoC. 
+ +Author : Cheng Tan + Date : Jan 8, 2024 +""" + +from pymtl3 import * +from pymtl3.stdlib.test_utils import (run_sim, + config_model_with_cmdline_opts) +from pymtl3.passes.backends.verilog import (VerilogTranslationPass, + VerilogVerilatorImportPass) +from ..MeshMultiCgraRTL import MeshMultiCgraRTL +from ...fu.flexible.FlexibleFuRTL import FlexibleFuRTL +from ...fu.single.AdderRTL import AdderRTL +from ...fu.single.MemUnitRTL import MemUnitRTL +from ...fu.single.ShifterRTL import ShifterRTL +from ...lib.messages import * +from ...lib.opt_type import * +from ...lib.cmd_type import * +from ...lib.basic.val_rdy.SourceRTL import SourceRTL as TestSrcRTL + +#------------------------------------------------------------------------- +# Test harness +#------------------------------------------------------------------------- + +class TestHarness(Component): + def construct(s, DUT, FunctionUnit, FuList, DataType, PredicateType, + CtrlPktType, CtrlSignalType, NocPktType, CmdType, + cgra_rows, cgra_columns, width, height, ctrl_mem_size, + data_mem_size_global, data_mem_size_per_bank, + num_banks_per_cgra, src_ctrl_pkt, ctrl_steps, + controller2addr_map): + + s.num_terminals = cgra_rows * cgra_columns + s.num_tiles = width * height + + s.src_ctrl_pkt = TestSrcRTL(CtrlPktType, src_ctrl_pkt) + + s.dut = DUT(DataType, PredicateType, CtrlPktType, CtrlSignalType, + NocPktType, CmdType, cgra_rows, cgra_columns, + height, width, ctrl_mem_size, data_mem_size_global, + data_mem_size_per_bank, num_banks_per_cgra, ctrl_steps, + ctrl_steps, FunctionUnit, FuList, controller2addr_map) + + # Connections + s.src_ctrl_pkt.send //= s.dut.recv_from_cpu_ctrl_pkt + + def done(s): + return s.src_ctrl_pkt.done() + + def line_trace(s): + return s.dut.line_trace() + +def test_homo_2x2(cmdline_opts): + num_tile_inports = 4 + num_tile_outports = 4 + num_fu_inports = 4 + num_fu_outports = 2 + num_routing_outports = num_tile_outports + num_fu_inports + ctrl_mem_size = 6 + data_mem_size_global = 32 + 
data_mem_size_per_bank = 4 + num_banks_per_cgra = 2 + cgra_rows = 2 + cgra_columns = 2 + num_terminals = cgra_rows * cgra_columns + width = 2 + height = 2 + num_ctrl_actions = 6 + num_ctrl_operations = 64 + TileInType = mk_bits(clog2(num_tile_inports + 1)) + FuInType = mk_bits(clog2(num_fu_inports + 1)) + FuOutType = mk_bits(clog2(num_fu_outports + 1)) + ctrl_addr_nbits = clog2(ctrl_mem_size) + # CtrlAddrType = mk_bits(ctrl_addr_nbits) + data_addr_nbits = clog2(data_mem_size_global) + DataAddrType = mk_bits(clog2(data_mem_size_global)) + num_tiles = width * height + DUT = MeshMultiCgraRTL + FunctionUnit = FlexibleFuRTL + FuList = [MemUnitRTL, AdderRTL] + DataType = mk_data(32, 1) + PredicateType = mk_predicate(1, 1) + cmd_nbits = 5 + CmdType = mk_bits(cmd_nbits) + controller2addr_map = { + 0: [0, 7], + 1: [8, 15], + 2: [16, 23], + 3: [24, 31], + } + CtrlPktType = \ + mk_ring_across_tiles_pkt(width * height, + num_ctrl_actions, + ctrl_mem_size, + num_ctrl_operations, + num_fu_inports, + num_fu_outports, + num_tile_inports, + num_tile_outports) + CtrlSignalType = \ + mk_separate_ctrl(num_ctrl_operations, + num_fu_inports, + num_fu_outports, + num_tile_inports, + num_tile_outports) + NocPktType = mk_multi_cgra_noc_pkt(ncols = cgra_columns, + nrows = cgra_rows, + cmd_nbits = cmd_nbits, + addr_nbits = data_addr_nbits, + data_nbits = 32, + predicate_nbits = 1) + pickRegister = [FuInType(x + 1) for x in range(num_fu_inports)] + src_opt_per_tile = [[ + # src dst vc_id opq cmd_type addr operation predicate + CtrlPktType(0, i, 0, 0, CMD_CONFIG, 0, OPT_INC, b1(0), + pickRegister, + [TileInType(4), TileInType(3), TileInType(2), TileInType(1), + # TODO: make below as TileInType(5) to double check. 
+ TileInType(0), TileInType(0), TileInType(0), TileInType(0)], + + [FuOutType(0), FuOutType(0), FuOutType(0), FuOutType(0), + FuOutType(1), FuOutType(1), FuOutType(1), FuOutType(1)]), + CtrlPktType(0, i, 0, 0, CMD_CONFIG, 1, OPT_INC, b1(0), + pickRegister, + [TileInType(4), TileInType(3), TileInType(2), TileInType(1), + TileInType(0), TileInType(0), TileInType(0), TileInType(0)], + + [FuOutType(0), FuOutType(0), FuOutType(0), FuOutType(0), + FuOutType(1), FuOutType(1), FuOutType(1), FuOutType(1)]), + + CtrlPktType(0, i, 0, 0, CMD_CONFIG, 2, OPT_ADD, b1(0), + pickRegister, + [TileInType(4), TileInType(3), TileInType(2), TileInType(1), + TileInType(0), TileInType(0), TileInType(0), TileInType(0)], + + [FuOutType(0), FuOutType(0), FuOutType(0), FuOutType(0), + FuOutType(1), FuOutType(1), FuOutType(1), FuOutType(1)]), + + CtrlPktType(0, i, 0, 0, CMD_CONFIG, 3, OPT_STR, b1(0), + pickRegister, + [TileInType(4), TileInType(3), TileInType(2), TileInType(1), + TileInType(0), TileInType(0), TileInType(0), TileInType(0)], + + [FuOutType(0), FuOutType(0), FuOutType(0), FuOutType(0), + FuOutType(1), FuOutType(1), FuOutType(1), FuOutType(1)]), + + CtrlPktType(0, i, 0, 0, CMD_CONFIG, 4, OPT_ADD, b1(0), + pickRegister, + [TileInType(4), TileInType(3), TileInType(2), TileInType(1), + TileInType(0), TileInType(0), TileInType(0), TileInType(0)], + + [FuOutType(0), FuOutType(0), FuOutType(0), FuOutType(0), + FuOutType(1), FuOutType(1), FuOutType(1), FuOutType(1)]), + + CtrlPktType(0, i, 0, 0, CMD_CONFIG, 5, OPT_ADD, b1(0), + pickRegister, + [TileInType(4), TileInType(3), TileInType(2), TileInType(1), + TileInType(0), TileInType(0), TileInType(0), TileInType(0)], + + [FuOutType(0), FuOutType(0), FuOutType(0), FuOutType(0), + FuOutType(1), FuOutType(1), FuOutType(1), FuOutType(1)]), + + # This last one is for launching kernel. 
+ CtrlPktType(0, i, 0, 0, CMD_LAUNCH, 0, OPT_ADD, b1(0), + pickRegister, + [TileInType(4), TileInType(3), TileInType(2), TileInType(1), + TileInType(0), TileInType(0), TileInType(0), TileInType(0)], + + [FuOutType(0), FuOutType(0), FuOutType(0), FuOutType(0), + FuOutType(1), FuOutType(1), FuOutType(1), FuOutType(1)]) + ] for i in range(num_tiles)] + + src_ctrl_pkt = [] + for opt_per_tile in src_opt_per_tile: + src_ctrl_pkt.extend(opt_per_tile) + + th = TestHarness(DUT, FunctionUnit, FuList, DataType, PredicateType, CtrlPktType, + CtrlSignalType, NocPktType, CmdType, cgra_rows, cgra_columns, + width, height, ctrl_mem_size, data_mem_size_global, + data_mem_size_per_bank, num_banks_per_cgra, src_ctrl_pkt, + ctrl_mem_size, controller2addr_map) + th.elaborate() + th.dut.set_metadata(VerilogVerilatorImportPass.vl_Wno_list, + ['UNSIGNED', 'UNOPTFLAT', 'WIDTH', 'WIDTHCONCAT', + 'ALWCOMBORDER']) + th = config_model_with_cmdline_opts(th, cmdline_opts, duts = ['dut']) + run_sim(th) + diff --git a/scale_out/test/RingMultiCgraRTL_test.py b/scale_out/test/RingMultiCgraRTL_test.py index f879aa4..7a49245 100644 --- a/scale_out/test/RingMultiCgraRTL_test.py +++ b/scale_out/test/RingMultiCgraRTL_test.py @@ -2,13 +2,12 @@ ========================================================================== RingMultiCgraRTL_test.py ========================================================================== -Test cases for CGRA with controller. +Test cases for multi-CGRA with ring NoC. 
Author : Cheng Tan Date : Dec 23, 2024 """ - from pymtl3 import * from pymtl3.stdlib.test_utils import (run_sim, config_model_with_cmdline_opts) @@ -86,7 +85,8 @@ def test_homo_2x2(cmdline_opts): FuList = [MemUnitRTL, AdderRTL] DataType = mk_data(32, 1) PredicateType = mk_predicate(1, 1) - CmdType = mk_bits(4) + cmd_nbits = 5 + CmdType = mk_bits(cmd_nbits) controller2addr_map = { 0: [0, 7], 1: [8, 15], @@ -108,10 +108,12 @@ def test_homo_2x2(cmdline_opts): num_fu_outports, num_tile_inports, num_tile_outports) - NocPktType = mk_ring_multi_cgra_pkt(nrouters = num_terminals, - addr_nbits = data_addr_nbits, - data_nbits = 32, - predicate_nbits = 1) + NocPktType = mk_multi_cgra_noc_pkt(ncols = num_terminals, + nrows = 1, + addr_nbits = data_addr_nbits, + cmd_nbits = cmd_nbits, + data_nbits = 32, + predicate_nbits = 1) pickRegister = [FuInType(x + 1) for x in range(num_fu_inports)] src_opt_per_tile = [[ # src dst vc_id opq cmd_type addr operation predicate diff --git a/systolic/CgraSystolicArrayRTL.py b/systolic/CgraSystolicArrayRTL.py index 639c35e..08ec167 100644 --- a/systolic/CgraSystolicArrayRTL.py +++ b/systolic/CgraSystolicArrayRTL.py @@ -77,10 +77,13 @@ def construct(s, DataType, PredicateType, CtrlPktType, CtrlSignalType, # 4 read/write from tiles and 1 read/write from NoC. 4, 4, preload_data) + idTo2d_map = {0: [0, 0]} s.controller = ControllerRTL(ControllerIdType, CmdType, CtrlPktType, NocPktType, DataType, DataAddrType, - controller_id, controller2addr_map) - s.ctrl_ring = RingNetworkRTL(CtrlPktType, CtrlRingPos, s.num_tiles, 0) + 1, 1, + controller_id, controller2addr_map, + idTo2d_map) + s.ctrl_ring = RingNetworkRTL(CtrlPktType, CtrlRingPos, s.num_tiles, 1) # Connections # Connects data memory with controller. 
diff --git a/systolic/test/Cgra3x3MemRightAndBottomRTL_matmul_2x2_test.py b/systolic/test/Cgra3x3MemRightAndBottomRTL_matmul_2x2_test.py index 6c9e1e3..f54600b 100644 --- a/systolic/test/Cgra3x3MemRightAndBottomRTL_matmul_2x2_test.py +++ b/systolic/test/Cgra3x3MemRightAndBottomRTL_matmul_2x2_test.py @@ -151,7 +151,7 @@ def test_CGRA_systolic(cmdline_opts): num_banks_per_cgra = 4 width = 3 height = 3 - num_terminals = 2 + num_terminals = 1 num_ctrl_actions = 6 num_ctrl_operations = 64 TileInType = mk_bits(clog2(num_tile_inports + 1)) @@ -168,7 +168,7 @@ def test_CGRA_systolic(cmdline_opts): FuList = [SeqMulAdderRTL, AdderRTL, MulRTL, LogicRTL, ShifterRTL, PhiRTL, CompRTL, BranchRTL, MemUnitRTL] CmdType = mk_bits(4) - ControllerIdType = mk_bits(clog2(num_terminals)) + ControllerIdType = mk_bits(max(clog2(num_terminals), 1)) controller_id = 0 controller2addr_map = { 0: [0, 15], @@ -191,10 +191,11 @@ def test_CGRA_systolic(cmdline_opts): num_tile_inports, num_tile_outports) - NocPktType = mk_ring_multi_cgra_pkt(nrouters = num_terminals, - addr_nbits = addr_nbits, - data_nbits = 32, - predicate_nbits = 1) + NocPktType = mk_multi_cgra_noc_pkt(ncols = 1, + nrows = 1, + addr_nbits = addr_nbits, + data_nbits = 32, + predicate_nbits = 1) pick_register = [FuInType(x + 1) for x in range(num_fu_inports)] src_opt_per_tile = [