Commit
Update net tile TX flush logic
riptl committed Feb 9, 2025
1 parent 1d6103a commit 71c7107
Showing 1 changed file with 126 additions and 44 deletions.
170 changes: 126 additions & 44 deletions src/app/fdctl/run/tiles/fd_net.c
@@ -81,6 +81,87 @@ typedef struct {
ulong chunk;
} fd_net_out_ctx_t;

/* fd_net_flusher_t controls the pacing of XDP sendto calls for flushing
TX batches. In the 'wakeup' XDP mode, no TX occurs unless the net
tile periodically wakes up the kernel using the sendto() syscall.
If sendto() is called too frequently, time is wasted on context
switches. If sendto() is not called often enough, packets are
delayed or dropped. sendto() calls make almost no guarantees about
how many packets are sent out, nor do they indicate when the kernel
finishes a wakeup (wakeups are dispatched asynchronously). The net
tile thus uses a combination of flush triggers that were tuned for
best performance. */

struct fd_net_flusher {

/* Packets that were enqueued after the last sendto() wakeup are
considered "pending". If more than pending_wmark packets are
pending, a wakeup is dispatched. This trigger thus scales with the
packet rate but stays idle when traffic is sparse. */
ulong pending_cnt;
ulong pending_wmark;

/* Sometimes, packets are not flushed out even after a sendto()
wakeup. This can result in the tail of a burst getting delayed or
overrun. If more than tail_flush_backoff ticks have passed since the
last sendto() wakeup and there are still unacknowledged packets in
the TX ring, another wakeup is issued. */
long next_tail_flush_ticks;
long tail_flush_backoff;

};

typedef struct fd_net_flusher fd_net_flusher_t;

FD_PROTOTYPES_BEGIN

/* fd_net_flusher_inc marks a new packet as enqueued. */

static inline void
fd_net_flusher_inc( fd_net_flusher_t * flusher,
long now ) {
flusher->pending_cnt++;
long next_flush = now + flusher->tail_flush_backoff;
flusher->next_tail_flush_ticks = fd_long_min( flusher->next_tail_flush_ticks, next_flush );
}

/* fd_net_flusher_check returns 1 if a sendto() wakeup should be issued
immediately. now is a recent fd_tickcount() value.
If tx_ring_empty is non-zero, the kernel has caught up with the net
tile on the XDP TX ring; otherwise, the kernel is still behind. */

static inline int
fd_net_flusher_check( fd_net_flusher_t * flusher,
long now,
int tx_ring_empty ) {
int flush_level = flusher->pending_cnt >= flusher->pending_wmark;
int flush_timeout = now >= flusher->next_tail_flush_ticks;
int flush = flush_level || flush_timeout;
if( !flush ) return 0;
if( FD_UNLIKELY( tx_ring_empty ) ) {
/* Flush requested but caught up */
flusher->pending_cnt = 0UL;
flusher->next_tail_flush_ticks = LONG_MAX;
return 0;
}
return 1;
}

/* fd_net_flusher_wakeup signals a sendto() wakeup was done. now is a
recent fd_tickcount() value. */

static inline void
fd_net_flusher_wakeup( fd_net_flusher_t * flusher,
long now ) {
flusher->pending_cnt = 0UL;
flusher->next_tail_flush_ticks = now + flusher->tail_flush_backoff;
}

FD_PROTOTYPES_END
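/* Editor's illustration (not part of this commit): a minimal sketch of
   how the three helpers above are meant to be chained, mirroring the
   after_frag and net_tx_periodic_wakeup changes further down.
   do_sendto_wakeup() is a hypothetical stand-in for net_tx_wakeup();
   everything else uses the declarations above. */

static void
example_flusher_usage( fd_net_flusher_t * flusher,
                       fd_xsk_t *         xsk ) {
  /* (1) Whenever a frame is enqueued onto the XDP TX ring */
  fd_net_flusher_inc( flusher, fd_tickcount() );

  /* (2) Periodically, e.g. once per before_credit iteration */
  long now           = fd_tickcount();
  uint tx_prod       = FD_VOLATILE_CONST( *xsk->ring_tx.prod );
  uint tx_cons       = FD_VOLATILE_CONST( *xsk->ring_tx.cons );
  int  tx_ring_empty = tx_prod==tx_cons;
  if( fd_net_flusher_check( flusher, now, tx_ring_empty ) ) {
    do_sendto_wakeup( xsk );               /* hypothetical sendto() wakeup */
    fd_net_flusher_wakeup( flusher, now ); /* reset pending count and backoff timer */
  }
}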

/* fd_net_free_ring is a FIFO queue that stores pointers to free XDP TX
frames. */

struct fd_net_free_ring {
ulong prod;
ulong cons;
@@ -110,8 +191,8 @@ typedef struct {
uchar mac_addrs[12]; /* First 12 bytes of Ethernet header */
} tx_op;

/* Round-robin cycle receive */
uint rx_idx;
/* Round-robin cycle of service operations */
uint rr_idx;

/* Rings tracking free packet buffers */
fd_net_free_ring_t free_tx[ 2 ];
@@ -134,16 +215,12 @@ typedef struct {
fd_net_out_ctx_t gossip_out[1];
fd_net_out_ctx_t repair_out[1];

/* Timers (measured in fd_tickcount()) */
long xdp_stats_interval_ticks;
long next_xdp_stats_refresh;

long tx_flush_interval_ticks;
long next_tx_flush;
/* XDP stats refresh timer */
long xdp_stats_interval_ticks;
long next_xdp_stats_refresh;

/* Flush every N packets */
ulong flush_pending;
ulong flush_wmark;
/* TX flush timers */
fd_net_flusher_t tx_flusher[2]; /* one per XSK */

/* Route and neighbor tables */
fd_fib4_t const * fib_local;
@@ -326,6 +403,21 @@ net_tx_wakeup( fd_net_ctx_t * ctx,
ctx->metrics.xsk_tx_wakeup_cnt++;
}

/* net_tx_periodic_wakeup issues a sendto() wakeup if the flusher indicates one is due. */

static inline void
net_tx_periodic_wakeup( fd_net_ctx_t * ctx,
uint if_idx,
long now ) {
uint tx_prod = FD_VOLATILE_CONST( *ctx->xsk[ if_idx ]->ring_tx.prod );
uint tx_cons = FD_VOLATILE_CONST( *ctx->xsk[ if_idx ]->ring_tx.cons );
int tx_ring_empty = tx_prod==tx_cons;
if( fd_net_flusher_check( ctx->tx_flusher+if_idx, now, tx_ring_empty ) ) {
net_tx_wakeup( ctx, ctx->xsk[ if_idx ] );
fd_net_flusher_wakeup( ctx->tx_flusher+if_idx, now );
}
}

static void
during_housekeeping( fd_net_ctx_t * ctx ) {
long now = fd_tickcount();
@@ -528,29 +620,15 @@ after_frag( fd_net_ctx_t * ctx,
.len = (uint)sz,
.options = 0
};
FD_VOLATILE( *xsk->ring_tx.prod ) = tx_ring->cached_prod = tx_seq+1U;

/* Frame is now owned by kernel. Clear tx_op. */
ctx->tx_op.frame = NULL;

/* Register newly enqueued packet */
FD_VOLATILE( *xsk->ring_tx.prod ) = tx_ring->cached_prod = tx_seq+1U;
ctx->metrics.tx_submit_cnt++;
ctx->metrics.tx_bytes_total += sz;

/* Periodically wake up kernel */

int flush_level = ctx->flush_pending >= ctx->flush_wmark;
long now = fd_tickcount();
long next_tx_flush = ctx->next_tx_flush;
long deadline = now + ctx->tx_flush_interval_ticks;
int flush_timeout = now > next_tx_flush;
int flush = flush_level || flush_timeout;
fd_long_store_if( next_tx_flush==LONG_MAX, &ctx->next_tx_flush, deadline ); /* first packet of batch */
fd_long_store_if( flush, &ctx->next_tx_flush, LONG_MAX ); /* last packet of batch */
ctx->flush_pending = flush ? 0UL : ctx->flush_pending+1UL;

if( flush ) {
net_tx_wakeup( ctx, xsk );
}
fd_net_flusher_inc( ctx->tx_flusher+if_idx, fd_tickcount() );

}

@@ -760,27 +838,29 @@ before_credit( fd_net_ctx_t * ctx,
/* Check if new packets are available or if TX frames are free again
(Round-robin through sockets) */

uint rx_idx = ctx->rx_idx;
fd_xsk_t * rx_xsk = ctx->xsk[ rx_idx ];
ctx->rx_idx++;
ctx->rx_idx = fd_uint_if( ctx->rx_idx>=ctx->xsk_cnt, 0, ctx->rx_idx );
uint rr_idx = ctx->rr_idx;
fd_xsk_t * rr_xsk = ctx->xsk[ rr_idx ];
ctx->rr_idx++;
ctx->rr_idx = fd_uint_if( ctx->rr_idx>=ctx->xsk_cnt, 0, ctx->rr_idx );

uint rx_cons = FD_VOLATILE_CONST( *rx_xsk->ring_rx.cons );
uint rx_prod = FD_VOLATILE_CONST( *rx_xsk->ring_rx.prod );
uint rx_cons = FD_VOLATILE_CONST( *rr_xsk->ring_rx.cons );
uint rx_prod = FD_VOLATILE_CONST( *rr_xsk->ring_rx.prod );
if( rx_cons!=rx_prod ) {
*charge_busy = 1;
rx_xsk->ring_rx.cached_prod = rx_prod;
net_rx_event( ctx, stem, rx_xsk, rx_cons );
rr_xsk->ring_rx.cached_prod = rx_prod;
net_rx_event( ctx, stem, rr_xsk, rx_cons );
}

uint comp_cons = FD_VOLATILE_CONST( *rx_xsk->ring_cr.cons );
uint comp_prod = FD_VOLATILE_CONST( *rx_xsk->ring_cr.prod );
uint comp_cons = FD_VOLATILE_CONST( *rr_xsk->ring_cr.cons );
uint comp_prod = FD_VOLATILE_CONST( *rr_xsk->ring_cr.prod );
if( comp_cons!=comp_prod ) {
*charge_busy = 1;
rx_xsk->ring_cr.cached_prod = comp_prod;
net_comp_event( ctx, rx_xsk, rx_idx, comp_cons );
rr_xsk->ring_cr.cached_prod = comp_prod;
net_comp_event( ctx, rr_xsk, rr_idx, comp_cons );
}

net_tx_periodic_wakeup( ctx, rr_idx, fd_tickcount() );

}

/* net_xsk_bootstrap does the initial UMEM frame to RX/TX ring assignments.
@@ -923,8 +1003,7 @@ privileged_init( fd_topo_t * topo,
}

double tick_per_ns = fd_tempo_tick_per_ns( NULL );
ctx->xdp_stats_interval_ticks = (long)( FD_XDP_STATS_INTERVAL_NS * tick_per_ns );
ctx->tx_flush_interval_ticks = (long)( (double)tile->net.tx_flush_timeout_ns * tick_per_ns );
ctx->xdp_stats_interval_ticks = (long)( FD_XDP_STATS_INTERVAL_NS * tick_per_ns );

ulong scratch_top = FD_SCRATCH_ALLOC_FINI( l, 1UL );
if( FD_UNLIKELY( scratch_top > (ulong)scratch + scratch_footprint( tile ) ) )
@@ -1035,8 +1114,11 @@ unprivileged_init( fd_topo_t * topo,
FD_LOG_ERR(( "netlink request link not found" ));
}

ctx->flush_wmark = (ulong)( (double)tile->net.xdp_tx_queue_size * 0.7 );
ctx->flush_pending = 0UL;
for( uint j=0U; j<2U; j++ ) {
ctx->tx_flusher[ j ].pending_wmark = (ulong)( (double)tile->net.xdp_tx_queue_size * 0.7 );
ctx->tx_flusher[ j ].tail_flush_backoff = (long)( (double)tile->net.tx_flush_timeout_ns * fd_tempo_tick_per_ns( NULL ) );
ctx->tx_flusher[ j ].next_tail_flush_ticks = LONG_MAX;
}
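/* Editor's note, illustrative arithmetic only (values are hypothetical,
   not from this commit): with xdp_tx_queue_size = 2048 and
   tx_flush_timeout_ns = 1,000,000 (1 ms) at 3 ticks per ns, each flusher
   gets pending_wmark = (ulong)(2048*0.7) = 1433 packets and
   tail_flush_backoff = 3,000,000 ticks. */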

/* Join netbase objects */
ctx->fib_local = fd_fib4_join( fd_topo_obj_laddr( topo, tile->net.fib4_local_obj_id ) );
