Skip to content

Commit

Permalink
prov/opx: RZV RTS packet exclude empty immediate data
Browse files Browse the repository at this point in the history
Updated the RZV RTS send and receive paths so that the cacheline carrying the immediate bytes/quadwords is omitted
from the packet when the send length is a multiple of 64 bytes, since that cacheline would otherwise be empty.

Signed-off-by: Lindsay Reiser <[email protected]>
  • Loading branch information
lsavers authored and j-xiong committed Mar 14, 2024
1 parent 8026684 commit 7376f5c
Show file tree
Hide file tree
Showing 3 changed files with 44 additions and 37 deletions.
10 changes: 6 additions & 4 deletions prov/opx/include/rdma/opx/fi_opx_endpoint.h
Original file line number Diff line number Diff line change
Expand Up @@ -1535,6 +1535,7 @@ void complete_receive_operation_internal (struct fid_ep *ep,
};
const uint64_t immediate_byte_count = immediate_info.byte_count;
const uint64_t immediate_qw_count = immediate_info.qw_count;
const uint64_t immediate_fragment = ((immediate_byte_count + immediate_qw_count + 63) >> 6);
const uint64_t immediate_block_count = immediate_info.block_count;
const uint64_t immediate_total = immediate_byte_count +
immediate_qw_count * sizeof(uint64_t) +
Expand Down Expand Up @@ -1592,7 +1593,7 @@ void complete_receive_operation_internal (struct fid_ep *ep,
}

if (immediate_block_count) {
const union cacheline * const immediate_block = p->rendezvous.contiguous.immediate_block;
const union cacheline * const immediate_block = &p->rendezvous.contiguous.cache_line_1 + immediate_fragment;
union cacheline * rbuf_block = (union cacheline *)rbuf;
for (i=0; i<immediate_block_count; ++i) {
rbuf_block[i] = immediate_block[i];
Expand All @@ -1602,7 +1603,7 @@ void complete_receive_operation_internal (struct fid_ep *ep,
/* up to 1 block of immediate end data after the immediate blocks
Copy this to the end of rbuf */
if (immediate_end_block_count) {
const union cacheline * const immediate_block = p->rendezvous.contiguous.immediate_block;
const union cacheline * const immediate_block = &p->rendezvous.contiguous.cache_line_1 + immediate_fragment;
uint8_t *rbuf_start = (uint8_t *)recv_buf;
rbuf_start += xfer_len - (immediate_end_block_count << 6);
memcpy(rbuf_start, immediate_block[immediate_block_count].qw,
Expand Down Expand Up @@ -1663,6 +1664,7 @@ void complete_receive_operation_internal (struct fid_ep *ep,
};
const uint64_t immediate_byte_count = immediate_info.byte_count;
const uint64_t immediate_qw_count = immediate_info.qw_count;
const uint64_t immediate_fragment = ((immediate_byte_count + immediate_qw_count + 63) >> 6);
const uint64_t immediate_block_count = immediate_info.block_count;
const uint64_t immediate_total = immediate_byte_count +
immediate_qw_count * sizeof(uint64_t) +
Expand Down Expand Up @@ -1721,7 +1723,7 @@ void complete_receive_operation_internal (struct fid_ep *ep,
}

if (immediate_block_count) {
const union cacheline * const immediate_block = p->rendezvous.contiguous.immediate_block;
const union cacheline * const immediate_block = &p->rendezvous.contiguous.cache_line_1 + immediate_fragment;
union cacheline * rbuf_block = (union cacheline *)rbuf;
for (i=0; i<immediate_block_count; ++i) {
rbuf_block[i] = immediate_block[i];
Expand All @@ -1740,7 +1742,7 @@ void complete_receive_operation_internal (struct fid_ep *ep,
/* up to 1 block of immediate end data after the immediate blocks
Copy this to the end of rbuf */
if (immediate_end_block_count) {
const union cacheline * const immediate_block = p->rendezvous.contiguous.immediate_block;
const union cacheline * const immediate_block = &p->rendezvous.contiguous.cache_line_1 + immediate_fragment;
uint8_t *rbuf_start = (uint8_t *)recv_buf;
rbuf_start += xfer_len - (immediate_end_block_count << 6);
if (!is_hmem) {
Expand Down
19 changes: 12 additions & 7 deletions prov/opx/include/rdma/opx/fi_opx_hfi1_packet.h
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
/*
* Copyright (C) 2016 by Argonne National Laboratory.
* Copyright (C) 2021-2023 Cornelis Networks.
* Copyright (C) 2021-2024 Cornelis Networks.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
Expand Down Expand Up @@ -821,9 +821,14 @@ union fi_opx_hfi1_packet_payload {
uint64_t unused[2];

/* ==== CACHE LINE 1 ==== */

uint8_t immediate_byte[8];
uint64_t immediate_qw[7];
union {
struct {
uint8_t immediate_byte[8];
uint64_t immediate_qw[7];
};

union cacheline cache_line_1;
};

/* ==== CACHE LINE 2-127 ==== */

Expand All @@ -833,11 +838,11 @@ union fi_opx_hfi1_packet_payload {
struct {
/* ==== CACHE LINE 0 ==== */

uintptr_t origin_byte_counter_vaddr;
struct fi_opx_hmem_iov iov[2];
uintptr_t origin_byte_counter_vaddr;
struct fi_opx_hmem_iov iov[2];

/* ==== CACHE LINE 1-127 (for 8k mtu) ==== */
struct fi_opx_hmem_iov iov_ext[FI_OPX_MAX_HMEM_IOV - 2];
struct fi_opx_hmem_iov iov_ext[FI_OPX_MAX_HMEM_IOV - 2];
size_t unused;

} noncontiguous;
Expand Down
52 changes: 26 additions & 26 deletions prov/opx/src/fi_opx_hfi1.c
Original file line number Diff line number Diff line change
Expand Up @@ -2905,15 +2905,14 @@ ssize_t fi_opx_hfi1_tx_send_rzv (struct fid_ep *ep,
#endif
/* Expected tid needs to send a leading data block and a trailing
* data block for alignment. Limit this to SDMA (8K+) for now */
const bool use_immediate_blocks = len > FI_OPX_SDMA_MIN_LENGTH ? (opx_ep->use_expected_tid_rzv ? 1 : 0) : 0;
const uint64_t immediate_block_count = (len > FI_OPX_SDMA_MIN_LENGTH && opx_ep->use_expected_tid_rzv) ? 1 : 0;
FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,
"use_immediate_blocks %u *origin_byte_counter_value %#lX, origin_byte_counter_vaddr %p, "
"immediate_block_count %#lX *origin_byte_counter_value %#lX, origin_byte_counter_vaddr %p, "
"*origin_byte_counter_vaddr %lu/%#lX, len %lu/%#lX\n",
use_immediate_blocks, *origin_byte_counter_value, (uint64_t*)origin_byte_counter_vaddr,
immediate_block_count, *origin_byte_counter_value, (uint64_t*)origin_byte_counter_vaddr,
origin_byte_counter_vaddr ? *(uint64_t*)origin_byte_counter_vaddr : -1UL,
origin_byte_counter_vaddr ? *(uint64_t*)origin_byte_counter_vaddr : -1UL, len, len );

const uint64_t immediate_block_count = use_immediate_blocks ? 1 : 0;
const uint64_t immediate_end_block_count = immediate_block_count;

assert((immediate_block_count + immediate_end_block_count) <= max_immediate_block_count);
Expand All @@ -2923,6 +2922,7 @@ ssize_t fi_opx_hfi1_tx_send_rzv (struct fid_ep *ep,

const uint64_t immediate_byte_count = len & 0x0007ul;
const uint64_t immediate_qw_count = (len >> 3) & 0x0007ul;
const uint64_t immediate_fragment = (((len & 0x003Ful) + 63) >> 6);
/* Immediate total does not include trailing block */
const uint64_t immediate_total = immediate_byte_count +
immediate_qw_count * sizeof(uint64_t) +
Expand Down Expand Up @@ -2955,7 +2955,7 @@ ssize_t fi_opx_hfi1_tx_send_rzv (struct fid_ep *ep,

const uint64_t payload_blocks_total =
1 + /* rzv metadata */
1 + /* immediate data tail */
immediate_fragment +
immediate_block_count +
immediate_end_block_count;

Expand Down Expand Up @@ -3046,11 +3046,10 @@ ssize_t fi_opx_hfi1_tx_send_rzv (struct fid_ep *ep,
for (i=0; i<immediate_qw_count; ++i) {
payload->rendezvous.contiguous.immediate_qw[i] = sbuf_qw[i];
}

sbuf_qw += immediate_qw_count;

memcpy((void*)payload->rendezvous.contiguous.immediate_block,
(const void *)sbuf_qw, immediate_block_count * 64); /* immediate_end_block_count */
memcpy((void*)(&payload->rendezvous.contiguous.cache_line_1 + immediate_fragment),
(const void *)sbuf_qw, immediate_block_count << 6); /* immediate_end_block_count */
}

opx_shm_tx_advance(&opx_ep->tx->shm, (void*)hdr, pos);
Expand Down Expand Up @@ -3203,34 +3202,34 @@ ssize_t fi_opx_hfi1_tx_send_rzv (struct fid_ep *ep,
/* This would lead to more efficient packing on both sides at the expense of */
/* wasting space of a common 0 byte immediate */
/* tmp_payload_t represents the second cache line of the rts packet */
/* fi_opx_hfi1_packet_payload -> rendezvous -> contiguous */
/* fi_opx_hfi1_packet_payload -> rendezvous -> contiguous */
/*
 * Staging layout for a single 64-byte block of immediate data:
 * 8 individual bytes followed by 7 quadwords (8 + 7 * 8 = 64 bytes).
 * The field names match the immediate_byte / immediate_qw members of the
 * contiguous rendezvous payload.
 */
struct tmp_payload_t {
/* Sub-quadword bytes; the sender copies immediate_byte_count (len & 0x7) of them. */
uint8_t immediate_byte[8];
/* Whole quadwords; the sender fills immediate_qw_count ((len >> 3) & 0x7) entries. */
uint64_t immediate_qw[7];
/* NOTE(review): packed presumably so this can overlay the tmp buffer without
 * alignment assumptions -- confirm against the declaration of tmp. */
} __attribute__((packed));

struct tmp_payload_t *tmp_payload = (void*)tmp;
if (immediate_byte_count > 0) {
memcpy((void*)tmp_payload->immediate_byte, (const void*)sbuf, immediate_byte_count);
sbuf += immediate_byte_count;
}
uint64_t * sbuf_qw = (uint64_t *)(sbuf + immediate_byte_count);
if (immediate_fragment) {
struct tmp_payload_t *tmp_payload = (void*)tmp;
if (immediate_byte_count > 0) {
memcpy((void*)tmp_payload->immediate_byte, (const void*)sbuf, immediate_byte_count);
}

uint64_t * sbuf_qw = (uint64_t *)sbuf;
int i=0;
for (i=0; i<immediate_qw_count; ++i) {
tmp_payload->immediate_qw[i] = sbuf_qw[i];
}
fi_opx_copy_scb(scb_payload, tmp);
sbuf_qw += immediate_qw_count;
for (int i=0; i<immediate_qw_count; ++i) {
tmp_payload->immediate_qw[i] = sbuf_qw[i];
}
fi_opx_copy_scb(scb_payload, tmp);
sbuf_qw += immediate_qw_count;

fi_opx_copy_scb(replay_payload, tmp);
replay_payload += 8;
fi_opx_copy_scb(replay_payload, tmp);
replay_payload += 8;

/* consume one credit for the rendezvous payload immediate data */
FI_OPX_HFI1_CONSUME_SINGLE_CREDIT(pio_state);
/* consume one credit for the rendezvous payload immediate data */
FI_OPX_HFI1_CONSUME_SINGLE_CREDIT(pio_state);
#ifndef NDEBUG
++credits_consumed;
++credits_consumed;
#endif
}

if(immediate_block_count) {
#ifndef NDEBUG
Expand All @@ -3257,6 +3256,7 @@ ssize_t fi_opx_hfi1_tx_send_rzv (struct fid_ep *ep,
#endif

}

if(immediate_end_block_count) {
char* sbuf_end = (char *)buf + len - (immediate_end_block_count << 6);
FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"IMMEDIATE SEND RZV buf %p, buf end %p, sbuf immediate end block %p\n",(char *)buf, (char *)buf+len, sbuf_end);
Expand Down

0 comments on commit 7376f5c

Please sign in to comment.