diff --git a/controller/physical.c b/controller/physical.c index 72e88a203a..2338561ec3 100644 --- a/controller/physical.c +++ b/controller/physical.c @@ -1969,6 +1969,57 @@ local_output_pb(int64_t tunnel_key, struct ofpbuf *ofpacts) put_resubmit(OFTABLE_CHECK_LOOPBACK, ofpacts); } +#define MC_OFPACTS_MAX_MSG_SIZE 8192 +#define MC_BUF_START_ID 0x9000 + +static void +mc_ofctrl_add_flow(const struct sbrec_multicast_group *mc, + struct match *match, struct ofpbuf *ofpacts, + struct ofpbuf *ofpacts_last, uint8_t stage, + size_t index, uint32_t *pflow_index, + uint16_t prio, struct ovn_desired_flow_table *flow_table) + +{ + /* do not overcome max netlink message size used by ovs-vswitchd to + * send netlink configuration to the kernel. */ + if (ofpacts->size < MC_OFPACTS_MAX_MSG_SIZE && index < (mc->n_ports - 1)) { + return; + } + + uint32_t flow_index = *pflow_index; + bool is_first = (flow_index == MC_BUF_START_ID); + if (!is_first) { + match_set_reg(match, MFF_REG6 - MFF_REG0, flow_index); + prio += 10; + } + + if (index == (mc->n_ports - 1)) { + ofpbuf_put(ofpacts, ofpacts_last->data, ofpacts_last->size); + } else { + /* Split multicast groups with size greater than + * MC_OFPACTS_MAX_MSG_SIZE in order to not overcome the + * MAX_ACTIONS_BUFSIZE netlink buffer size supported by the kernel. + * In order to avoid all the action buffers to be squashed together by + * ovs, add a controller action for each configured openflow. + */ + size_t oc_offset = encode_start_controller_op( + ACTION_OPCODE_MG_SPLIT_BUF, false, NX_CTLR_NO_METER, ofpacts); + ovs_be32 val = htonl(++flow_index); + ofpbuf_put(ofpacts, &val, sizeof val); + val = htonl(mc->tunnel_key); + ofpbuf_put(ofpacts, &val, sizeof val); + ofpbuf_put(ofpacts, &stage, sizeof stage); + encode_finish_controller_op(oc_offset, ofpacts); + } + + ofctrl_add_flow(flow_table, stage, prio, mc->header_.uuid.parts[0], + match, ofpacts, &mc->header_.uuid); + ofpbuf_clear(ofpacts); + /* reset MFF_REG6. */ + put_load(0, MFF_REG6, 0, 32, ofpacts); + *pflow_index = flow_index; +} + static void consider_mc_group(struct ovsdb_idl_index *sbrec_port_binding_by_name, enum mf_field_id mff_ovn_geneve, @@ -1990,9 +2041,6 @@ consider_mc_group(struct ovsdb_idl_index *sbrec_port_binding_by_name, struct sset remote_chassis = SSET_INITIALIZER(&remote_chassis); struct sset vtep_chassis = SSET_INITIALIZER(&vtep_chassis); - struct match match; - match_outport_dp_and_port_keys(&match, dp_key, mc->tunnel_key); - /* Go through all of the ports in the multicast group: * * - For remote ports, add the chassis to 'remote_chassis' or @@ -2014,9 +2062,20 @@ consider_mc_group(struct ovsdb_idl_index *sbrec_port_binding_by_name, * the redirect port was added. */ struct ofpbuf ofpacts, remote_ofpacts, remote_ofpacts_ramp; + struct ofpbuf ofpacts_last, ofpacts_ramp_last; ofpbuf_init(&ofpacts, 0); ofpbuf_init(&remote_ofpacts, 0); ofpbuf_init(&remote_ofpacts_ramp, 0); + ofpbuf_init(&ofpacts_last, 0); + ofpbuf_init(&ofpacts_ramp_last, 0); + + bool local_ports = false, remote_ports = false, remote_ramp_ports = false; + + /* local port loop. */ + uint32_t flow_index = MC_BUF_START_ID; + put_load(0, MFF_REG6, 0, 32, &ofpacts); + put_load(mc->tunnel_key, MFF_LOG_OUTPORT, 0, 32, &ofpacts_last); + for (size_t i = 0; i < mc->n_ports; i++) { struct sbrec_port_binding *port = mc->ports[i]; @@ -2040,19 +2099,15 @@ consider_mc_group(struct ovsdb_idl_index *sbrec_port_binding_by_name, if (ldp->is_transit_switch) { local_output_pb(port->tunnel_key, &ofpacts); } else { - local_output_pb(port->tunnel_key, &remote_ofpacts); - local_output_pb(port->tunnel_key, &remote_ofpacts_ramp); + remote_ramp_ports = true; + remote_ports = true; } } if (!strcmp(port->type, "remote")) { if (port->chassis) { - put_load(port->tunnel_key, MFF_LOG_OUTPORT, 0, 32, - &remote_ofpacts); - tunnel_to_chassis(mff_ovn_geneve, port->chassis->name, - chassis_tunnels, mc->datapath, - port->tunnel_key, &remote_ofpacts); + remote_ports = true; } } else if (!strcmp(port->type, "localport")) { - local_output_pb(port->tunnel_key, &remote_ofpacts); + remote_ports = true; } else if ((port->chassis == chassis || is_additional_chassis(port, chassis)) && (local_binding_get_primary_pb(local_bindings, lport_name) @@ -2095,86 +2150,101 @@ consider_mc_group(struct ovsdb_idl_index *sbrec_port_binding_by_name, } } } + + local_ports |= (ofpacts.size > 0); + if (!local_ports) { + continue; + } + + struct match match; + match_outport_dp_and_port_keys(&match, dp_key, mc->tunnel_key); + mc_ofctrl_add_flow(mc, &match, &ofpacts, &ofpacts_last, + OFTABLE_LOCAL_OUTPUT, i, &flow_index, 100, + flow_table); } - /* Table 40, priority 100. - * ======================= - * - * Handle output to the local logical ports in the multicast group, if - * any. */ - bool local_ports = ofpacts.size > 0; - if (local_ports) { - /* Following delivery to local logical ports, restore the multicast - * group as the logical output port. */ - put_load(mc->tunnel_key, MFF_LOG_OUTPORT, 0, 32, &ofpacts); + /* remote port loop. */ + ofpbuf_clear(&ofpacts_last); + if (remote_ports) { + put_load(mc->tunnel_key, MFF_LOG_OUTPORT, 0, 32, &ofpacts_last); + } - ofctrl_add_flow(flow_table, OFTABLE_LOCAL_OUTPUT, 100, - mc->header_.uuid.parts[0], - &match, &ofpacts, &mc->header_.uuid); + fanout_to_chassis(mff_ovn_geneve, &remote_chassis, chassis_tunnels, + mc->datapath, mc->tunnel_key, false, &ofpacts_last); + fanout_to_chassis(mff_ovn_geneve, &vtep_chassis, chassis_tunnels, + mc->datapath, mc->tunnel_key, true, &ofpacts_last); + + remote_ports |= (ofpacts_last.size > 0); + if (remote_ports && local_ports) { + put_resubmit(OFTABLE_LOCAL_OUTPUT, &ofpacts_last); } - /* Table 39, priority 100. - * ======================= - * - * Handle output to the remote chassis in the multicast group, if - * any. */ - if (!sset_is_empty(&remote_chassis) || - !sset_is_empty(&vtep_chassis) || remote_ofpacts.size > 0) { - if (remote_ofpacts.size > 0) { - /* Following delivery to logical patch ports, restore the - * multicast group as the logical output port. */ - put_load(mc->tunnel_key, MFF_LOG_OUTPORT, 0, 32, - &remote_ofpacts); - - if (get_vtep_port(local_datapaths, mc->datapath->tunnel_key)) { - struct match match_ramp; - match_set_reg_masked(&match, MFF_LOG_FLAGS - MFF_REG0, 0, - MLF_RCV_FROM_RAMP); - - put_load(mc->tunnel_key, MFF_LOG_OUTPORT, 0, 32, - &remote_ofpacts_ramp); - - /* MCAST traffic which was originally received from RAMP_SWITCH - * is not allowed to be re-sent to remote_chassis. - * Process "patch" port binding for routing in - * OFTABLE_REMOTE_OUTPUT and resubmit packets to - * OFTABLE_LOCAL_OUTPUT for local delivery. */ - - match_outport_dp_and_port_keys(&match_ramp, dp_key, - mc->tunnel_key); - - /* Match packets coming from RAMP_SWITCH and allowed for - * loopback processing (including routing). */ - match_set_reg_masked(&match_ramp, MFF_LOG_FLAGS - MFF_REG0, - MLF_RCV_FROM_RAMP | MLF_ALLOW_LOOPBACK, - MLF_RCV_FROM_RAMP | MLF_ALLOW_LOOPBACK); - - put_resubmit(OFTABLE_LOCAL_OUTPUT, &remote_ofpacts_ramp); - - ofctrl_add_flow(flow_table, OFTABLE_REMOTE_OUTPUT, 120, - mc->header_.uuid.parts[0], &match_ramp, - &remote_ofpacts_ramp, &mc->header_.uuid); + bool has_vtep = get_vtep_port(local_datapaths, mc->datapath->tunnel_key); + uint32_t reverse_ramp_flow_index = MC_BUF_START_ID; + flow_index = MC_BUF_START_ID; + + put_load(0, MFF_REG6, 0, 32, &remote_ofpacts); + put_load(0, MFF_REG6, 0, 32, &remote_ofpacts_ramp); + + put_load(mc->tunnel_key, MFF_LOG_OUTPORT, 0, 32, &ofpacts_ramp_last); + put_resubmit(OFTABLE_LOCAL_OUTPUT, &ofpacts_ramp_last); + + for (size_t i = 0; remote_ports && i < mc->n_ports; i++) { + struct sbrec_port_binding *port = mc->ports[i]; + + if (port->datapath != mc->datapath) { + static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 1); + VLOG_WARN_RL(&rl, UUID_FMT": multicast group contains ports " + "in wrong datapath", + UUID_ARGS(&mc->header_.uuid)); + continue; + } + + if (!strcmp(port->type, "patch")) { + if (!ldp->is_transit_switch) { + local_output_pb(port->tunnel_key, &remote_ofpacts); + local_output_pb(port->tunnel_key, &remote_ofpacts_ramp); + } + } if (!strcmp(port->type, "remote")) { + if (port->chassis) { + put_load(port->tunnel_key, MFF_LOG_OUTPORT, 0, 32, + &remote_ofpacts); + tunnel_to_chassis(mff_ovn_geneve, port->chassis->name, + chassis_tunnels, mc->datapath, + port->tunnel_key, &remote_ofpacts); } + } else if (!strcmp(port->type, "localport")) { + local_output_pb(port->tunnel_key, &remote_ofpacts); } - fanout_to_chassis(mff_ovn_geneve, &remote_chassis, chassis_tunnels, - mc->datapath, mc->tunnel_key, false, - &remote_ofpacts); - fanout_to_chassis(mff_ovn_geneve, &vtep_chassis, chassis_tunnels, - mc->datapath, mc->tunnel_key, true, - &remote_ofpacts); + struct match match; + match_outport_dp_and_port_keys(&match, dp_key, mc->tunnel_key); + if (has_vtep) { + match_set_reg_masked(&match, MFF_LOG_FLAGS - MFF_REG0, 0, + MLF_RCV_FROM_RAMP); + } + mc_ofctrl_add_flow(mc, &match, &remote_ofpacts, &ofpacts_last, + OFTABLE_REMOTE_OUTPUT, i, &flow_index, 100, + flow_table); - if (remote_ofpacts.size) { - if (local_ports) { - put_resubmit(OFTABLE_LOCAL_OUTPUT, &remote_ofpacts); - } - ofctrl_add_flow(flow_table, OFTABLE_REMOTE_OUTPUT, 100, - mc->header_.uuid.parts[0], - &match, &remote_ofpacts, &mc->header_.uuid); + if (!remote_ramp_ports || !has_vtep) { + continue; } + + struct match match_ramp; + match_outport_dp_and_port_keys(&match_ramp, dp_key, mc->tunnel_key); + match_set_reg_masked(&match_ramp, MFF_LOG_FLAGS - MFF_REG0, + MLF_RCV_FROM_RAMP | MLF_ALLOW_LOOPBACK, + MLF_RCV_FROM_RAMP | MLF_ALLOW_LOOPBACK); + mc_ofctrl_add_flow(mc, &match_ramp, &remote_ofpacts_ramp, + &ofpacts_ramp_last, OFTABLE_REMOTE_OUTPUT, i, + &reverse_ramp_flow_index, 120, flow_table); } + ofpbuf_uninit(&ofpacts); ofpbuf_uninit(&remote_ofpacts); + ofpbuf_uninit(&ofpacts_last); + ofpbuf_uninit(&ofpacts_ramp_last); ofpbuf_uninit(&remote_ofpacts_ramp); sset_destroy(&remote_chassis); sset_destroy(&vtep_chassis); diff --git a/controller/pinctrl.c b/controller/pinctrl.c index b909b301a6..d88e951a6d 100644 --- a/controller/pinctrl.c +++ b/controller/pinctrl.c @@ -211,6 +211,10 @@ static void send_mac_binding_buffered_pkts(struct rconn *swconn) static void pinctrl_rarp_activation_strategy_handler(const struct match *md); +static void pinctrl_mg_split_buff_handler( + struct rconn *swconn, struct dp_packet *pkt, + const struct match *md, struct ofpbuf *userdata); + static void init_activated_ports(void); static void destroy_activated_ports(void); static void wait_activated_ports(void); @@ -3283,6 +3287,11 @@ process_packet_in(struct rconn *swconn, const struct ofp_header *msg) ovs_mutex_unlock(&pinctrl_mutex); break; + case ACTION_OPCODE_MG_SPLIT_BUF: + pinctrl_mg_split_buff_handler(swconn, &packet, &pin.flow_metadata, + &userdata); + break; + default: VLOG_WARN_RL(&rl, "unrecognized packet-in opcode %"PRIu32, ntohl(ah->opcode)); @@ -8154,6 +8163,63 @@ pinctrl_rarp_activation_strategy_handler(const struct match *md) notify_pinctrl_main(); } +static void +pinctrl_mg_split_buff_handler(struct rconn *swconn, struct dp_packet *pkt, + const struct match *md, struct ofpbuf *userdata) +{ + ovs_be32 *index = ofpbuf_try_pull(userdata, sizeof *index); + if (!index) { + static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5); + VLOG_WARN_RL(&rl, "%s: missing index field", __func__); + return; + } + + ovs_be32 *mg = ofpbuf_try_pull(userdata, sizeof *mg); + if (!mg) { + static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5); + VLOG_WARN_RL(&rl, "%s: missing multicast group field", __func__); + return; + } + + uint8_t *table_id = ofpbuf_try_pull(userdata, sizeof *table_id); + if (!table_id) { + static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5); + VLOG_WARN_RL(&rl, "%s: missing table_id field", __func__); + return; + } + + struct ofpbuf ofpacts; + ofpbuf_init(&ofpacts, 4096); + reload_metadata(&ofpacts, md); + + /* reload pkt_mark field */ + const struct mf_field *pkt_mark_field = mf_from_id(MFF_PKT_MARK); + union mf_value pkt_mark_value; + mf_get_value(pkt_mark_field, &md->flow, &pkt_mark_value); + ofpact_put_set_field(&ofpacts, pkt_mark_field, &pkt_mark_value, NULL); + + put_load(ntohl(*index), MFF_REG6, 0, 32, &ofpacts); + put_load(ntohl(*mg), MFF_LOG_OUTPORT, 0, 32, &ofpacts); + + struct ofpact_resubmit *resubmit = ofpact_put_RESUBMIT(&ofpacts); + resubmit->in_port = OFPP_CONTROLLER; + resubmit->table_id = *table_id; + + struct ofputil_packet_out po = { + .packet = dp_packet_data(pkt), + .packet_len = dp_packet_size(pkt), + .buffer_id = UINT32_MAX, + .ofpacts = ofpacts.data, + .ofpacts_len = ofpacts.size, + }; + match_set_in_port(&po.flow_metadata, OFPP_CONTROLLER); + enum ofp_version version = rconn_get_version(swconn); + enum ofputil_protocol proto = ofputil_protocol_from_ofp_version(version); + queue_msg(swconn, ofputil_encode_packet_out(&po, proto)); + + ofpbuf_uninit(&ofpacts); +} + static struct hmap put_fdbs; /* MAC learning (fdb) related functions. Runs within the main diff --git a/include/ovn/actions.h b/include/ovn/actions.h index 04bb6ffd0e..49cfe0624c 100644 --- a/include/ovn/actions.h +++ b/include/ovn/actions.h @@ -747,6 +747,9 @@ enum action_opcode { /* activation_strategy_rarp() */ ACTION_OPCODE_ACTIVATION_STRATEGY_RARP, + + /* multicast group split buffer action. */ + ACTION_OPCODE_MG_SPLIT_BUF, }; /* Header. */ diff --git a/tests/ovn-controller.at b/tests/ovn-controller.at index 0ca65f3030..e72438fbf2 100644 --- a/tests/ovn-controller.at +++ b/tests/ovn-controller.at @@ -2655,3 +2655,48 @@ OVS_WAIT_FOR_OUTPUT([as hv1 ovs-ofctl dump-flows br-int table=0 | grep -c in_por OVN_CLEANUP([hv1]) AT_CLEANUP ]) + +OVN_FOR_EACH_NORTHD([ +AT_SETUP([ovn-controller - multicast group buffer split]) +AT_KEYWORDS([ovn]) +ovn_start + +net_add n1 + +sim_add hv1 +as hv1 +ovs-vsctl add-br br-phys +ovn_attach n1 br-phys 192.168.0.1 + +for i in $(seq 1 320); do + ovs-vsctl -- add-port br-int hv1-vif$i -- \ + set interface hv1-vif$i external-ids:iface-id=sw0-p$i \ + options:tx_pcap=hv1/vif${i}-tx.pcap \ + options:rxq_pcap=hv1/vif${i}-rx.pcap \ + ofport-request=$i +done + + +check ovn-nbctl ls-add sw0 +for i in $(seq 1 320); do + check ovn-nbctl lsp-add sw0 sw0-p$i + check ovn-nbctl lsp-set-addresses sw0-p$i "unknown" +done + +wait_for_ports_up +ovn-nbctl --wait=hv sync + +OVS_WAIT_FOR_OUTPUT([as hv1 ovs-ofctl dump-flows br-int table=40 | grep reg15=0x8000,metadata=0x1 | grep -c "controller(userdata=00.00.00.1b"], [0],[dnl +3 +]) + +for i in $(seq 1 280); do + check ovn-nbctl lsp-del sw0-p$i +done +ovn-nbctl --wait=hv sync + +AT_CHECK([as hv1 ovs-ofctl dump-flows br-int table=40 | grep -q controller], [1]) + +OVN_CLEANUP([hv1]) +AT_CLEANUP +]) diff --git a/tests/ovn.at b/tests/ovn.at index 637d92bedb..b8c61f87fb 100644 --- a/tests/ovn.at +++ b/tests/ovn.at @@ -36957,3 +36957,60 @@ AT_CHECK([grep -c "NXT_CT_FLUSH_ZONE" hv1/ovs-vswitchd.log], [0], [dnl OVN_CLEANUP([hv1]) AT_CLEANUP ]) + +OVN_FOR_EACH_NORTHD([ +AT_SETUP([multicast group buffer split]) +AT_KEYWORDS([ovn-mc-split]) +AT_SKIP_IF([test $HAVE_SCAPY = no]) +ovn_start + +net_add n +sim_add hv1 + +as hv1 +check ovs-vsctl add-br br-phys +ovn_attach n br-phys 192.168.0.1 +check ovs-vsctl set open . external-ids:ovn-bridge-mappings=phys:br-phys + +for i in $(seq 1 320); do + check ovs-vsctl -- add-port br-int hv1-vif$i -- \ + set interface hv1-vif$i external-ids:iface-id=sw0-port$i \ + options:tx_pcap=hv1/vif$i-tx.pcap \ + options:rxq_pcap=hv1/vif$i-rx.pcap +done + +check ovn-nbctl ls-add sw0 + +check ovn-nbctl lsp-add sw0 sw0-port1 +check ovn-nbctl lsp-set-addresses sw0-port1 "50:54:00:00:00:01 10.0.0.2" +for i in $(seq 2 320); do + check ovn-nbctl lsp-add sw0 sw0-port$i -- lsp-set-addresses sw0-port$i "unknown" +done + +check ovn-nbctl --wait=hv sync +wait_for_ports_up +OVN_POPULATE_ARP + +AT_CHECK([as hv1 ovs-ofctl dump-flows br-int table=40 | grep -c controller], [0],[dnl +9 +]) + +arp_req=$(fmt_pkt "Ether(dst='ff:ff:ff:ff:ff:ff', src='50:54:00:00:00:01')/ARP(pdst='10.0.0.254', psrc='10.0.0.1')") +echo $arp_req > expected +as hv1 ovs-appctl netdev-dummy/receive hv1-vif1 $arp_req + +OVS_WAIT_UNTIL([test $($PYTHON "$ovs_srcdir/utilities/ovs-pcap.in" hv1/vif2-tx.pcap | wc -l) -ge 1]) +OVN_CHECK_PACKETS([hv1/vif2-tx.pcap], [expected]) + +OVS_WAIT_UNTIL([test $($PYTHON "$ovs_srcdir/utilities/ovs-pcap.in" hv1/vif100-tx.pcap | wc -l) -ge 1]) +OVN_CHECK_PACKETS([hv1/vif100-tx.pcap], [expected]) + +OVS_WAIT_UNTIL([test $($PYTHON "$ovs_srcdir/utilities/ovs-pcap.in" hv1/vif150-tx.pcap | wc -l) -ge 1]) +OVN_CHECK_PACKETS([hv1/vif150-tx.pcap], [expected]) + +OVS_WAIT_UNTIL([test $($PYTHON "$ovs_srcdir/utilities/ovs-pcap.in" hv1/vif300-tx.pcap | wc -l) -ge 1]) +OVN_CHECK_PACKETS([hv1/vif150-tx.pcap], [expected]) + +OVN_CLEANUP([hv1]) +AT_CLEANUP +])