From a602a6db12b7fa2b9b4c182c4102473730bf3be0 Mon Sep 17 00:00:00 2001
From: weiyanhua
Date: Mon, 22 Jul 2019 18:08:36 +0800
Subject: [PATCH] make it possible to bind laddrs to lcores by adding the
 laddr_lcore_mapping option

In this mode the FDIR filter matches on the destination address only,
instead of on the destination address plus a destination port mask.

NOTICE: the number of laddrs should be greater than or equal to the number
of slave lcores; otherwise some slave lcores own no laddr and FNAT
forwarding fails on those lcores.

Co-authored-by: kldeng
Co-authored-by: lixiaoxiao
---
 conf/dpvs.bond.conf.sample        |   1 +
 conf/dpvs.conf.items              |   1 +
 conf/dpvs.conf.sample             |   1 +
 conf/dpvs.conf.single-bond.sample |   1 +
 conf/dpvs.conf.single-nic.sample  |   1 +
 include/ipvs/service.h            |   9 +
 include/sa_pool.h                 |  11 +
 src/ipvs/ip_vs_laddr.c            | 328 ++++++++++++++++++++++++++++--
 src/ipvs/ip_vs_service.c          |   6 +
 src/netif.c                       |  10 +-
 src/sa_pool.c                     | 229 +++++++++++++++------
 11 files changed, 517 insertions(+), 81 deletions(-)

diff --git a/conf/dpvs.bond.conf.sample b/conf/dpvs.bond.conf.sample
index 0281a6a5a..f68376c41 100644
--- a/conf/dpvs.bond.conf.sample
+++ b/conf/dpvs.bond.conf.sample
@@ -371,4 +371,5 @@ ipvs_defs {
     ! sa_pool config
     sa_pool {
         pool_hash_size      16
+        pool_mode           laddr_lcore_mapping
     }
diff --git a/conf/dpvs.conf.items b/conf/dpvs.conf.items
index 36cbda4dd..3ec6805a3 100644
--- a/conf/dpvs.conf.items
+++ b/conf/dpvs.conf.items
@@ -245,4 +245,5 @@ ipvs_defs {
     sa_pool {
         pool_hash_size      16      <16, 1-128>
+        pool_mode           laddr_lcore_mapping
     }
diff --git a/conf/dpvs.conf.sample b/conf/dpvs.conf.sample
index 80f525b38..0c0034641 100644
--- a/conf/dpvs.conf.sample
+++ b/conf/dpvs.conf.sample
@@ -324,4 +324,5 @@ ipvs_defs {
     ! sa_pool config
     sa_pool {
         pool_hash_size      16
+        pool_mode           laddr_lcore_mapping
     }
diff --git a/conf/dpvs.conf.single-bond.sample b/conf/dpvs.conf.single-bond.sample
index 0d67e4fcb..3940c7c26 100644
--- a/conf/dpvs.conf.single-bond.sample
+++ b/conf/dpvs.conf.single-bond.sample
@@ -276,4 +276,5 @@ ipvs_defs {
     ! sa_pool config
     sa_pool {
         pool_hash_size      16
+        pool_mode           laddr_lcore_mapping
     }
diff --git a/conf/dpvs.conf.single-nic.sample b/conf/dpvs.conf.single-nic.sample
index beba1a114..f0eb444ba 100644
--- a/conf/dpvs.conf.single-nic.sample
+++ b/conf/dpvs.conf.single-nic.sample
@@ -249,4 +249,5 @@ ipvs_defs {
     ! sa_pool config
     sa_pool {
         pool_hash_size      16
+        pool_mode           laddr_lcore_mapping
     }
diff --git a/include/ipvs/service.h b/include/ipvs/service.h
index c8d997260..862bea181 100644
--- a/include/ipvs/service.h
+++ b/include/ipvs/service.h
@@ -44,6 +44,12 @@
 rte_rwlock_t __dp_vs_svc_lock;
 
+struct laddr_list_pre_lcore {
+    struct list_head        laddr_list;     /* local address (LIP) pool */
+    struct list_head        *laddr_curr;
+    uint32_t                num_laddrs;
+};
+
 /* virtual service */
 struct dp_vs_service {
     struct list_head    s_list;     /* node for normal service table */
@@ -88,6 +94,9 @@ struct dp_vs_service {
     rte_rwlock_t        laddr_lock;
     uint32_t            num_laddrs;
 
+    struct laddr_list_pre_lcore pre_list[RTE_MAX_LCORE];
+#define this_pre_list       pre_list[rte_lcore_id()]
+
     /* ... flags, timer ... */
 } __rte_cache_aligned;
 
 #endif
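The per-lcore fields added to dp_vs_service above (pre_list[] addressed through this_pre_list) give every slave worker its own LIP list and round-robin cursor, which is what lets the laddr selection skip the random-step logic in this mode. The standalone sketch below (plain C; fixed arrays stand in for the intrusive list_head and for indexing by rte_lcore_id(), and it is not code from this patch) shows the selection pattern:

    #include <stdio.h>

    /* Standalone sketch of per-lcore laddr selection: each worker keeps its
     * own list of LIPs plus a cursor, and the next laddr is picked by simply
     * advancing that cursor round-robin. */
    #define MAX_LIPS_PER_LCORE 8

    struct lcore_laddrs {
        unsigned lips[MAX_LIPS_PER_LCORE]; /* LIPs owned by this worker */
        unsigned num_laddrs;
        unsigned curr;                     /* round-robin cursor */
    };

    static int next_laddr(struct lcore_laddrs *pl, unsigned *lip)
    {
        if (pl->num_laddrs == 0)
            return -1;                     /* no LIP bound to this lcore */
        *lip = pl->lips[pl->curr];
        pl->curr = (pl->curr + 1) % pl->num_laddrs;
        return 0;
    }

    int main(void)
    {
        struct lcore_laddrs worker = { .lips = { 0x0a000001, 0x0a000002 },
                                       .num_laddrs = 2, .curr = 0 };
        unsigned lip, i;

        for (i = 0; i < 4; i++)
            if (next_laddr(&worker, &lip) == 0)
                printf("pick LIP 0x%08x\n", lip);
        return 0;
    }

The patch still takes svc->laddr_lock around the walk in dp_vs_laddr_bind; the per-lcore split mainly guarantees that a LIP handed out on a worker actually has an sa_pool on that same worker.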
diff --git a/include/sa_pool.h b/include/sa_pool.h
index 7a136aebb..d6b827bcb 100644
--- a/include/sa_pool.h
+++ b/include/sa_pool.h
@@ -42,12 +42,23 @@
 #ifndef __DPVS_SA_POOL__
 #define __DPVS_SA_POOL__
 
+#define SAPOOL
+#define RTE_LOGTYPE_SAPOOL  RTE_LOGTYPE_USER1
+
+enum {
+    LADDR_LCORE_MAPPING_POOL_MODE,
+    LPORT_LCORE_MAPPING_POOL_MODE,
+};
+
 struct sa_pool_stats {
     uint32_t used_cnt;
     uint32_t free_cnt;
     uint32_t miss_cnt;
 };
 
+extern uint8_t sa_pool_mode;
+#define SA_POOL_MODE        sa_pool_mode
+
 int sa_pool_init(void);
 int sa_pool_term(void);
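The two pool modes declared above trade the local-port space differently: in lport_lcore_mapping every worker may use every LIP, but only the slice of ports whose masked bits match its own, while in laddr_lcore_mapping the owning worker gets the LIP's whole port range. A rough, self-contained comparison (example mask/base values only; the real mask is derived from the slave-lcore count):

    #include <stdio.h>
    #include <stdint.h>

    /* Count usable local ports per LIP per lcore in lport mode, where only
     * ports hashed to this lcore by (port & mask) == port_base are usable. */
    static unsigned ports_lport_mode(uint16_t low, uint16_t high,
                                     uint16_t mask, uint16_t port_base)
    {
        unsigned cnt = 0, port;
        for (port = low; port <= high; port++)
            if (((uint16_t)port & mask) == port_base)
                cnt++;
        return cnt;
    }

    int main(void)
    {
        const uint16_t low = 1025, high = 65535;
        /* e.g. 8 slave lcores distinguished by 3 low-order port bits */
        unsigned per_lcore = ports_lport_mode(low, high, 0x0007, 0x0003);
        unsigned addr_mode = (unsigned)high - low + 1;

        printf("lport_lcore_mapping: ~%u ports per LIP per lcore\n", per_lcore);
        printf("laddr_lcore_mapping: %u ports per LIP on its owning lcore\n",
               addr_mode);
        return 0;
    }

The full range (64511 ports) no longer fits in a signed 16-bit counter, which is presumably why the sa_entry_pool counters move from rte_atomic16_t to rte_atomic32_t later in this patch.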
diff --git a/src/ipvs/ip_vs_laddr.c b/src/ipvs/ip_vs_laddr.c
index 836d49f09..9d3fc42fe 100644
--- a/src/ipvs/ip_vs_laddr.c
+++ b/src/ipvs/ip_vs_laddr.c
@@ -108,6 +108,7 @@ struct dp_vs_laddr {
 };
 
 static uint32_t dp_vs_laddr_max_trails = 16;
+static uint64_t lcore_mask;
 
 static inline int __laddr_step(struct dp_vs_service *svc)
 {
@@ -124,7 +125,7 @@ static inline int __laddr_step(struct dp_vs_service *svc)
     return 1;
 }
 
-static inline struct dp_vs_laddr *__get_laddr(struct dp_vs_service *svc)
+static inline struct dp_vs_laddr *__get_laddr_port_mode(struct dp_vs_service *svc)
 {
     int step;
     struct dp_vs_laddr *laddr = NULL;
@@ -153,6 +154,45 @@ static inline struct dp_vs_laddr *__get_laddr(struct dp_vs_service *svc)
     return laddr;
 }
 
+static inline struct dp_vs_laddr *__get_laddr_addr_mode(struct dp_vs_service *svc)
+{
+    struct dp_vs_laddr *laddr = NULL;
+
+    /* if list not inited ? list_empty() returns true ! */
+    assert(svc->this_pre_list.laddr_list.next);
+
+    if (list_empty(&svc->this_pre_list.laddr_list)) {
+        return NULL;
+    }
+
+    /* In LADDR_LCORE_MAPPING_POOL_MODE every laddr is bound to a dedicated
+     * lcore, so the laddr_list walk does not need the random step used to
+     * decouple it from the realservers' rr/wrr scheduling; just advance the
+     * per-lcore cursor by one entry. */
+    if (unlikely(!svc->this_pre_list.laddr_curr))
+        svc->this_pre_list.laddr_curr = svc->this_pre_list.laddr_list.next;
+    else
+        svc->this_pre_list.laddr_curr = svc->this_pre_list.laddr_curr->next;
+
+    if (svc->this_pre_list.laddr_curr == &svc->this_pre_list.laddr_list)
+        svc->this_pre_list.laddr_curr = svc->this_pre_list.laddr_list.next;
+
+    laddr = list_entry(svc->this_pre_list.laddr_curr, struct dp_vs_laddr, list);
+    rte_atomic32_inc(&laddr->refcnt);
+
+    return laddr;
+}
+
+static inline struct dp_vs_laddr *__get_laddr(struct dp_vs_service *svc)
+{
+    if (SA_POOL_MODE == LADDR_LCORE_MAPPING_POOL_MODE) {
+        return __get_laddr_addr_mode(svc);
+    } else {
+        return __get_laddr_port_mode(svc);
+    }
+}
+
 static inline void put_laddr(struct dp_vs_laddr *laddr)
 {
     /* use lock if other field need by changed */
@@ -164,8 +204,10 @@ int dp_vs_laddr_bind(struct dp_vs_conn *conn, struct dp_vs_service *svc)
 {
     struct dp_vs_laddr *laddr = NULL;
     int i;
+    int num_laddrs = 0;
     uint16_t sport = 0;
     struct sockaddr_storage dsin, ssin;
+    struct inet_ifaddr *ifa;
 
     if (!conn || !conn->dest || !svc)
         return EDPVS_INVAL;
@@ -182,7 +224,11 @@ int dp_vs_laddr_bind(struct dp_vs_conn *conn, struct dp_vs_service *svc)
      * 2. we uses svc->num_laddrs;
      */
     rte_rwlock_write_lock(&svc->laddr_lock);
-    for (i = 0; i < dp_vs_laddr_max_trails && i < svc->num_laddrs; i++) {
+    if (SA_POOL_MODE == LADDR_LCORE_MAPPING_POOL_MODE)
+        num_laddrs = svc->this_pre_list.num_laddrs;
+    else
+        num_laddrs = svc->num_laddrs;
+    for (i = 0; i < dp_vs_laddr_max_trails && i < num_laddrs; i++) {
         /* select a local IP from service */
         laddr = __get_laddr(svc);
         if (!laddr) {
@@ -191,6 +237,21 @@ int dp_vs_laddr_bind(struct dp_vs_conn *conn, struct dp_vs_service *svc)
             return EDPVS_RESOURCE;
         }
 
+        if (SA_POOL_MODE == LADDR_LCORE_MAPPING_POOL_MODE) {
+            ifa = inet_addr_ifa_get(conn->af, laddr->iface, &laddr->addr);
+            assert(ifa);
+            if (!ifa->this_sa_pool) {
+#ifdef CONFIG_DPVS_IPVS_DEBUG
+                char buf[64];
+                if (inet_ntop(conn->af, &laddr->addr, buf, sizeof(buf)) == NULL)
+                    snprintf(buf, sizeof(buf), "::");
+                RTE_LOG(DEBUG, IPVS, "%s: %s is not assigned on [%d], "
+                        "try next laddr.\n", __func__, buf, rte_lcore_id());
+#endif
+                continue;
+            }
+        }
+
         memset(&dsin, 0, sizeof(struct sockaddr_storage));
         memset(&ssin, 0, sizeof(struct sockaddr_storage));
 
@@ -299,11 +360,71 @@ int dp_vs_laddr_unbind(struct dp_vs_conn *conn)
     return EDPVS_OK;
 }
 
+static int __dp_vs_laddr_add_port_mode(struct dp_vs_service *svc,
+                                       int af, struct dp_vs_laddr *new)
+{
+    struct dp_vs_laddr *curr;
+
+    rte_rwlock_write_lock(&svc->laddr_lock);
+    list_for_each_entry(curr, &svc->laddr_list, list) {
+        if (af == curr->af && inet_addr_equal(af, &curr->addr, &new->addr)) {
+            rte_rwlock_write_unlock(&svc->laddr_lock);
+            //rte_free(new);
+            return EDPVS_EXIST;
+        }
+    }
+
+    list_add_tail(&new->list, &svc->laddr_list);
+    svc->num_laddrs++;
+
+    rte_rwlock_write_unlock(&svc->laddr_lock);
+    return EDPVS_OK;
+}
+
+static int __dp_vs_laddr_add_addr_mode(struct dp_vs_service *svc,
+                                       int af, struct dp_vs_laddr *new)
+{
+    struct dp_vs_laddr *curr;
+    struct inet_ifaddr *ifa;
+    int cid = 0;
+
+    rte_rwlock_write_lock(&svc->laddr_lock);
+    for (cid = 0; cid < RTE_MAX_LCORE; cid++) {
+        list_for_each_entry(curr, &svc->pre_list[cid].laddr_list, list) {
+            if (af == curr->af && inet_addr_equal(af, &curr->addr, &new->addr)) {
+                rte_rwlock_write_unlock(&svc->laddr_lock);
+                //rte_free(new);
+                return EDPVS_EXIST;
+            }
+        }
+    }
+
+    ifa = inet_addr_ifa_get(af, new->iface, &new->addr);
+    if (!ifa) {
+        rte_rwlock_write_unlock(&svc->laddr_lock);
+        return EDPVS_NOTEXIST;
+    }
+
+    for (cid = 0; cid < RTE_MAX_LCORE; cid++) {
+        /* skip master and unused cores */
+        if ((cid == rte_get_master_lcore()) || !is_lcore_id_valid(cid))
+            continue;
+        if (ifa->sa_pools[cid]) {
+            list_add_tail(&new->list, &svc->pre_list[cid].laddr_list);
+            svc->pre_list[cid].num_laddrs++;
+        }
+    }
+
+    rte_rwlock_write_unlock(&svc->laddr_lock);
+    return EDPVS_OK;
+}
+
 int dp_vs_laddr_add(struct dp_vs_service *svc,
                     int af, const union inet_addr *addr,
                     const char *ifname)
 {
-    struct dp_vs_laddr *new, *curr;
+    struct dp_vs_laddr *new;
+    int err = 0;
 
     if (!svc || !addr)
         return EDPVS_INVAL;
@@ -325,23 +446,19 @@ int dp_vs_laddr_add(struct dp_vs_service *svc,
         return EDPVS_NOTEXIST;
     }
 
-    rte_rwlock_write_lock(&svc->laddr_lock);
-    list_for_each_entry(curr, &svc->laddr_list, list) {
-        if (af == curr->af && inet_addr_equal(af, &curr->addr, &new->addr)) {
-            rte_rwlock_write_unlock(&svc->laddr_lock);
-            rte_free(new);
-            return EDPVS_EXIST;
-        }
+    if (SA_POOL_MODE == LADDR_LCORE_MAPPING_POOL_MODE) {
+        err = __dp_vs_laddr_add_addr_mode(svc, af, new);
+    } else {
+        err = __dp_vs_laddr_add_port_mode(svc, af, new);
     }
 
-    list_add_tail(&new->list, &svc->laddr_list);
-    svc->num_laddrs++;
-
-    rte_rwlock_write_unlock(&svc->laddr_lock);
-
-    return EDPVS_OK;
+    if (err != EDPVS_OK)
+        rte_free(new);
+    return err;
 }
 
-int dp_vs_laddr_del(struct dp_vs_service *svc, int af, const union inet_addr *addr)
+static int __dp_vs_laddr_del_port_mode(struct dp_vs_service *svc, int af,
+                                       const union inet_addr *addr)
 {
     struct dp_vs_laddr *laddr, *next;
     int err = EDPVS_NOTEXIST;
@@ -373,13 +490,70 @@ int dp_vs_laddr_del(struct dp_vs_service *svc, int af, const union inet_addr *ad
     rte_rwlock_write_unlock(&svc->laddr_lock);
 
     if (err == EDPVS_BUSY)
-        RTE_LOG(DEBUG, IPVS, "%s: laddr is in use.\n", __func__);
+        RTE_LOG(DEBUG, SAPOOL, "%s: laddr is in use.\n", __func__);
 
     return err;
 }
 
-/* if success, it depend on caller to free @addrs by rte_free() */
-static int dp_vs_laddr_getall(struct dp_vs_service *svc,
+static int __dp_vs_laddr_del_addr_mode(struct dp_vs_service *svc, int af,
+                                       const union inet_addr *addr)
+{
+    struct dp_vs_laddr *laddr, *next;
+    int cid = 0;
+    int err = EDPVS_NOTEXIST;
+
+    if (!svc || !addr)
+        return EDPVS_INVAL;
+
+    rte_rwlock_write_lock(&svc->laddr_lock);
+    for (cid = 0; cid < RTE_MAX_LCORE; cid++) {
+        /* skip master and unused cores */
+        if ((cid == rte_get_master_lcore()) || !is_lcore_id_valid(cid))
+            continue;
+        list_for_each_entry_safe(laddr, next, &svc->pre_list[cid].laddr_list, list) {
+            if (!((af == laddr->af) && inet_addr_equal(af, &laddr->addr, addr)))
+                continue;
+
+            /* found */
+            if (rte_atomic32_read(&laddr->refcnt) == 0) {
+                /* update this lcore's laddr_curr */
+                if (svc->pre_list[cid].laddr_curr == &laddr->list)
+                    svc->pre_list[cid].laddr_curr = laddr->list.next;
+                list_del(&laddr->list);
+                rte_free(laddr);
+                svc->pre_list[cid].num_laddrs--;
+                err = EDPVS_OK;
+            } else {
+                /* XXX: move to a trash list and implement a garbage collector,
+                 * or just try del again? */
+                err = EDPVS_BUSY;
+            }
+            break;
+        }
+    }
+
+    rte_rwlock_write_unlock(&svc->laddr_lock);
+
+    if (err == EDPVS_BUSY)
+        RTE_LOG(DEBUG, SAPOOL, "%s: laddr is in use.\n", __func__);
+
+    return err;
+}
+
+int dp_vs_laddr_del(struct dp_vs_service *svc, int af, const union inet_addr *addr)
+{
+    int err = 0;
+
+    if (SA_POOL_MODE == LADDR_LCORE_MAPPING_POOL_MODE) {
+        err = __dp_vs_laddr_del_addr_mode(svc, af, addr);
+    } else {
+        err = __dp_vs_laddr_del_port_mode(svc, af, addr);
+    }
+
+    return err;
+}
+
+static int __dp_vs_laddr_getall_port_mode(struct dp_vs_service *svc,
                               struct dp_vs_laddr_entry **addrs, size_t *naddr)
 {
     struct dp_vs_laddr *laddr;
     int i = 0;
@@ -416,7 +590,69 @@ static int dp_vs_laddr_getall(struct dp_vs_service *svc,
     return EDPVS_OK;
 }
 
-int dp_vs_laddr_flush(struct dp_vs_service *svc)
+static int __dp_vs_laddr_getall_addr_mode(struct dp_vs_service *svc,
+                              struct dp_vs_laddr_entry **addrs, size_t *naddr)
+{
+    struct dp_vs_laddr *laddr;
+    int i = 0;
+    int cid = 0;
+    int num_laddrs = 0;
+
+    if (!svc || !addrs || !naddr)
+        return EDPVS_INVAL;
+
+    rte_rwlock_write_lock(&svc->laddr_lock);
+
+    for (cid = 0; cid < RTE_MAX_LCORE; cid++) {
+        num_laddrs += svc->pre_list[cid].num_laddrs;
+    }
+
+    if (num_laddrs > 0) {
+        *naddr = num_laddrs;
+        *addrs = rte_malloc_socket(0, sizeof(struct dp_vs_laddr_entry) * num_laddrs,
+                                   RTE_CACHE_LINE_SIZE, rte_socket_id());
+        if (!(*addrs)) {
+            rte_rwlock_write_unlock(&svc->laddr_lock);
+            return EDPVS_NOMEM;
+        }
+
+        for (cid = 0; cid < RTE_MAX_LCORE; cid++) {
+            /* skip master and unused cores */
+            if ((cid == rte_get_master_lcore()) || !is_lcore_id_valid(cid))
+                continue;
+            list_for_each_entry(laddr, &svc->pre_list[cid].laddr_list, list) {
+                assert(i < *naddr);
+                (*addrs)[i].af = laddr->af;
+                (*addrs)[i].addr = laddr->addr;
+                (*addrs)[i].nconns = rte_atomic32_read(&laddr->conn_counts);
+                i++;
+            }
+        }
+    } else {
+        *naddr = 0;
+        *addrs = NULL;
+    }
+
+    rte_rwlock_write_unlock(&svc->laddr_lock);
+    return EDPVS_OK;
+}
+
+/* on success, it is up to the caller to free @addrs with rte_free() */
+static int dp_vs_laddr_getall(struct dp_vs_service *svc,
+                              struct dp_vs_laddr_entry **addrs, size_t *naddr)
+{
+    int err = EDPVS_OK;
+
+    if (SA_POOL_MODE == LADDR_LCORE_MAPPING_POOL_MODE) {
+        err = __dp_vs_laddr_getall_addr_mode(svc, addrs, naddr);
+    } else {
+        err = __dp_vs_laddr_getall_port_mode(svc, addrs, naddr);
+    }
+
+    return err;
+}
+
+static int __dp_vs_laddr_flush_port_mode(struct dp_vs_service *svc)
 {
     struct dp_vs_laddr *laddr, *next;
     int err = EDPVS_OK;
@@ -436,11 +672,59 @@ int dp_vs_laddr_flush(struct dp_vs_service *svc)
             if (inet_ntop(laddr->af, &laddr->addr, buf, sizeof(buf)) == NULL)
                 snprintf(buf, sizeof(buf), "::");
 
-            RTE_LOG(DEBUG, IPVS, "%s: laddr %s is in use.\n", __func__, buf);
+            RTE_LOG(DEBUG, SAPOOL, "%s: laddr %s is in use.\n", __func__, buf);
             err = EDPVS_BUSY;
         }
     }
+    rte_rwlock_write_unlock(&svc->laddr_lock);
+    return err;
+}
+
+static int __dp_vs_laddr_flush_addr_mode(struct dp_vs_service *svc)
+{
+    struct dp_vs_laddr *laddr, *next;
+    int cid = 0;
+    int err = EDPVS_OK;
+
+    if (!svc)
+        return EDPVS_INVAL;
+
+    rte_rwlock_write_lock(&svc->laddr_lock);
+    for (cid = 0; cid < RTE_MAX_LCORE; cid++) {
+        /* skip master and unused cores */
+        if ((cid == rte_get_master_lcore()) || !is_lcore_id_valid(cid))
+            continue;
+        list_for_each_entry_safe(laddr, next, &svc->pre_list[cid].laddr_list, list) {
+            if (rte_atomic32_read(&laddr->refcnt) == 0) {
+                list_del(&laddr->list);
+                rte_free(laddr);
+                svc->pre_list[cid].num_laddrs--;
+            } else {
+                char buf[64];
+
+                if (inet_ntop(laddr->af, &laddr->addr, buf, sizeof(buf)) == NULL)
+                    snprintf(buf, sizeof(buf), "::");
+
+                RTE_LOG(DEBUG, SAPOOL, "%s: laddr %s is in use.\n", __func__, buf);
+                err = EDPVS_BUSY;
+            }
+        }
+    }
+
+    rte_rwlock_write_unlock(&svc->laddr_lock);
+    return err;
+}
+
+int dp_vs_laddr_flush(struct dp_vs_service *svc)
+{
+    int err = EDPVS_OK;
+
+    if (SA_POOL_MODE == LADDR_LCORE_MAPPING_POOL_MODE) {
+        err = __dp_vs_laddr_flush_addr_mode(svc);
+    } else {
+        err = __dp_vs_laddr_flush_port_mode(svc);
+    }
 
     return err;
 }
@@ -571,6 +855,8 @@ int dp_vs_laddr_init(void)
     if ((err = sockopt_register(&laddr_sockopts)) != EDPVS_OK)
         return err;
 
+    /* enabled lcores should not change after init */
+    netif_get_slave_lcores(NULL, &lcore_mask);
     return EDPVS_OK;
 }
 
diff --git a/src/ipvs/ip_vs_service.c b/src/ipvs/ip_vs_service.c
index 60a8637ee..f30b48428 100644
--- a/src/ipvs/ip_vs_service.c
+++ b/src/ipvs/ip_vs_service.c
@@ -471,6 +471,7 @@ int dp_vs_add_service(struct dp_vs_service_conf *u,
 {
     int ret = 0;
     int size;
+    int cid = 0;
     struct dp_vs_scheduler *sched = NULL;
     struct dp_vs_service *svc = NULL;
 
@@ -522,6 +523,11 @@ int dp_vs_add_service(struct dp_vs_service_conf *u,
     svc->num_laddrs = 0;
     svc->laddr_curr = &svc->laddr_list;
 
+    for (cid = 0; cid < RTE_MAX_LCORE; cid++) {
+        INIT_LIST_HEAD(&svc->pre_list[cid].laddr_list);
+        svc->pre_list[cid].laddr_curr = &svc->pre_list[cid].laddr_list;
+        svc->pre_list[cid].num_laddrs = 0;
+    }
     INIT_LIST_HEAD(&svc->dests);
 
     rte_rwlock_init(&svc->sched_lock);
diff --git a/src/netif.c b/src/netif.c
index 21e4544e2..2af2fe1e4 100644
--- a/src/netif.c
+++ b/src/netif.c
@@ -37,6 +37,7 @@
 #include "timer.h"
 #include "parser/parser.h"
 #include "neigh.h"
+#include "sa_pool.h"
 #include
 #include
 
@@ -138,6 +139,8 @@ static struct list_head worker_list;   /* lcore configurations from cfgfile */
 #define NETIF_PORT_TABLE_MASK (NETIF_PORT_TABLE_BUCKETS - 1)
 static struct list_head port_tab[NETIF_PORT_TABLE_BUCKETS];  /* hashed by id */
 static struct list_head port_ntab[NETIF_PORT_TABLE_BUCKETS]; /* hashed by name */
+uint32_t lcore_ids[RTE_MAX_LCORE];
+
 /* Note: Lockless, NIC can only be registered on initialization stage and
  *       unregistered on cleanup stage
  */
@@ -1289,6 +1292,7 @@ void netif_get_slave_lcores(uint8_t *nb, uint64_t *mask)
     while (lcore_conf[i].nports > 0) {
         slave_lcore_nb++;
         slave_lcore_mask |= (1L << lcore_conf[i].id);
+        lcore_ids[i] = lcore_conf[i].id;
         i++;
     }
 
@@ -3476,7 +3480,8 @@ int netif_port_start(struct netif_port *port)
     }
 
     // device configure
-    if ((ret = netif_port_fdir_dstport_mask_set(port)) != EDPVS_OK)
+    if (SA_POOL_MODE == LPORT_LCORE_MAPPING_POOL_MODE &&
+        (ret = netif_port_fdir_dstport_mask_set(port)) != EDPVS_OK)
         return ret;
     ret = rte_eth_dev_configure(port->id, port->nrxq, port->ntxq, &port->dev_conf);
     if (ret < 0 ) {
@@ -3768,9 +3773,10 @@ static struct rte_eth_conf default_port_conf = {
                 .dst_ip = { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF },
             },
             .src_port_mask = 0x0000,
+            .dst_port_mask = 0x0000,
 
             /* to be changed according to slave lcore number in use */
-            .dst_port_mask = 0x00F8,
+            // .dst_port_mask = 0x00F8,
 
             .mac_addr_byte_mask = 0x00,
             .tunnel_type_mask = 0,
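With the dst-port FDIR mask disabled above, return traffic in laddr_lcore_mapping mode is steered purely by destination LIP: each LIP's FDIR rule points at the rx queue of the lcore that owns its sa_pool. The toy model below contrasts the two steering schemes (illustrative only; the real rules are installed by __add_del_filter_addr_mode / __add_del_filter_port_mode in src/sa_pool.c):

    #include <stdio.h>
    #include <stdint.h>

    #define N_LCORES 4
    #define N_LIPS   4

    static uint32_t lip[N_LIPS] = { 0x0a000001, 0x0a000002, 0x0a000003, 0x0a000004 };
    static unsigned lip_owner[N_LIPS];           /* LIP -> owning lcore */

    static unsigned steer_laddr_mode(uint32_t dst_ip)
    {
        /* match on destination address only: the rule for this LIP points at
         * the rx queue of the lcore that owns the LIP's sa_pool */
        unsigned i;
        for (i = 0; i < N_LIPS; i++)
            if (lip[i] == dst_ip)
                return lip_owner[i];
        return 0;                                /* not a LIP: default queue */
    }

    static unsigned steer_lport_mode(uint16_t dst_port)
    {
        /* match on (dst_port & mask): low-order port bits pick the lcore, so
         * every lcore can use every LIP, but only "its" slice of ports */
        return dst_port & (N_LCORES - 1);
    }

    int main(void)
    {
        unsigned i;
        for (i = 0; i < N_LIPS; i++)
            lip_owner[i] = i % N_LCORES;         /* round-robin, as in sa_pool_create */

        printf("laddr mode: dst 10.0.0.3 -> lcore %u\n", steer_laddr_mode(0x0a000003));
        printf("lport mode: dst port 1027 -> lcore %u\n", steer_lport_mode(1027));
        return 0;
    }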
"lport_lcore_mapping" +#define LADDR_LCORE_MAPPING_POOL_MODE_NAME "laddr_lcore_mapping" + enum { SA_F_USED = 0x01, }; @@ -99,8 +99,8 @@ struct sa_entry_pool { /* another way is use total_used/free_cnt in sa_pool, * so that we need not travels the hash to get stats. * we use cnt here, since we may need per-pool stats. */ - rte_atomic16_t used_cnt; - rte_atomic16_t free_cnt; + rte_atomic32_t used_cnt; + rte_atomic32_t free_cnt; uint32_t miss_cnt; }; @@ -139,8 +139,74 @@ static uint8_t sa_nlcore; static uint64_t sa_lcore_mask; static uint8_t sa_pool_hash_size = SAPOOL_DEF_HASH_SZ; +uint8_t sa_pool_mode = LPORT_LCORE_MAPPING_POOL_MODE; +extern uint32_t lcore_ids[RTE_MAX_LCORE]; -static int __add_del_filter(int af, struct netif_port *dev, lcoreid_t cid, +static int __add_del_filter_addr_mode(int af, struct netif_port *dev, lcoreid_t cid, + const union inet_addr *dip, + uint32_t filter_id[MAX_FDIR_PROTO], bool add) +{ + struct rte_eth_fdir_filter filt = { + .action.behavior = RTE_ETH_FDIR_ACCEPT, + .action.report_status = RTE_ETH_FDIR_REPORT_ID, + .soft_id = filter_id[0], + }; + + if (af == AF_INET) { + filt.input.flow_type = RTE_ETH_FLOW_NONFRAG_IPV4_OTHER; + filt.input.flow.ip4_flow.dst_ip = dip->in.s_addr; + } else if (af == AF_INET6) { + filt.input.flow_type = RTE_ETH_FLOW_NONFRAG_IPV6_OTHER; + memcpy(filt.input.flow.ipv6_flow.dst_ip, &dip->in6, sizeof(struct in6_addr)); + } else { + return EDPVS_NOTSUPP; + } + + queueid_t queue; + int err; + enum rte_filter_op op; +#ifdef CONFIG_DPVS_SAPOOL_DEBUG + char ipaddr[64]; +#endif + + if (dev->netif_ops && dev->netif_ops->op_filter_supported) { + if (dev->netif_ops->op_filter_supported(dev, RTE_ETH_FILTER_FDIR) < 0) { + if (dev->nrxq <= 1) + return EDPVS_OK; + RTE_LOG(ERR, SAPOOL, "%s: FDIR is not supported by device %s. Only" + " single rxq can be configured.\n", __func__, dev->name); + return EDPVS_NOTSUPP; + } + } else { + RTE_LOG(ERR, SAPOOL, "%s: FDIR support of device %s is not known.\n", + __func__, dev->name); + return EDPVS_INVAL; + } + + err = netif_get_queue(dev, cid, &queue); + if (err != EDPVS_OK) + return err; + + filt.action.rx_queue = queue; + op = add ? RTE_ETH_FILTER_ADD : RTE_ETH_FILTER_DELETE; + + err = netif_fdir_filter_set(dev, op, &filt); + if (err != EDPVS_OK) + return err; + +#ifdef CONFIG_DPVS_SAPOOL_DEBUG + RTE_LOG(DEBUG, SAPOOL, "FDIR: %s %s %s TCP/UDP " + "ip %s queue %d lcore %2d filterID %d\n", + add ? "add" : "del", dev->name, + af == AF_INET ? "IPv4" : "IPv6", + inet_ntop(af, dip, ipaddr, sizeof(ipaddr)) ? 
: "::", + queue, cid, filter_id[0]); +#endif + + return err; +} + +static int __add_del_filter_port_mode(int af, struct netif_port *dev, lcoreid_t cid, const union inet_addr *dip, __be16 dport, uint32_t filter_id[MAX_FDIR_PROTO], bool add) { @@ -230,6 +296,16 @@ static int __add_del_filter(int af, struct netif_port *dev, lcoreid_t cid, return err; } +static int __add_del_filter(int af, struct netif_port *dev, lcoreid_t cid, + const union inet_addr *dip, __be16 dport, + uint32_t filter_id[MAX_FDIR_PROTO], bool add) +{ + if (SA_POOL_MODE == LPORT_LCORE_MAPPING_POOL_MODE) + return __add_del_filter_port_mode(af, dev, cid, dip, dport, filter_id, add); + else + return __add_del_filter_addr_mode(af, dev, cid, dip, filter_id, add); +} + static inline int sa_add_filter(int af, struct netif_port *dev, lcoreid_t cid, const union inet_addr *dip, __be16 dport, uint32_t filter_id[MAX_FDIR_PROTO]) @@ -264,13 +340,13 @@ static int sa_pool_alloc_hash(struct sa_pool *ap, uint8_t hash_sz, INIT_LIST_HEAD(&pool->used_enties); INIT_LIST_HEAD(&pool->free_enties); - rte_atomic16_set(&pool->used_cnt, 0); - rte_atomic16_set(&pool->free_cnt, 0); + rte_atomic32_set(&pool->used_cnt, 0); + rte_atomic32_set(&pool->free_cnt, 0); for (port = ap->low; port <= ap->high; port++) { struct sa_entry *sa; - - if (fdir->mask && + if (SA_POOL_MODE == LPORT_LCORE_MAPPING_POOL_MODE && + fdir->mask && ((uint16_t)port & fdir->mask) != ntohs(fdir->port_base)) continue; @@ -278,7 +354,7 @@ static int sa_pool_alloc_hash(struct sa_pool *ap, uint8_t hash_sz, sa->addr = ap->ifa->addr; sa->port = htons((uint16_t)port); list_add_tail(&sa->list, &pool->free_enties); - rte_atomic16_inc(&pool->free_cnt); + rte_atomic32_inc(&pool->free_cnt); } } @@ -292,12 +368,54 @@ static int sa_pool_free_hash(struct sa_pool *ap) return EDPVS_OK; } -int sa_pool_create(struct inet_ifaddr *ifa, uint16_t low, uint16_t high) +static int __sa_pool_create(struct inet_ifaddr *ifa, lcoreid_t cid, + uint16_t low, uint16_t high) { + uint32_t filtids[MAX_FDIR_PROTO]; + struct sa_fdir *fdir = &sa_fdirs[cid]; struct sa_pool *ap; int err; - lcoreid_t cid; + ap = rte_zmalloc(NULL, sizeof(struct sa_pool), 0); + if (!ap) { + err = EDPVS_NOMEM; + goto errout; + } + + ap->ifa = ifa; + ap->low = low; + ap->high = high; + rte_atomic32_set(&ap->refcnt, 0); + + err = sa_pool_alloc_hash(ap, sa_pool_hash_size, fdir); + if (err != EDPVS_OK) { + rte_free(ap); + goto errout; + } + + /* if add filter failed, waste some soft-id is acceptable. */ + filtids[0] = fdir->soft_id++; + filtids[1] = fdir->soft_id++; + err = sa_add_filter(ifa->af, ifa->idev->dev, cid, &ifa->addr, + fdir->port_base, filtids); + if (err != EDPVS_OK) { + sa_pool_free_hash(ap); + rte_free(ap); + goto errout; + } + ap->filter_id[0] = filtids[0]; + ap->filter_id[1] = filtids[1]; + ifa->sa_pools[cid] = ap; + return EDPVS_OK; +errout: + return err; +} + +int sa_pool_create(struct inet_ifaddr *ifa, uint16_t low, uint16_t high) +{ + int err; + lcoreid_t cid; + static unsigned idx = 0; low = low ? : DEF_MIN_PORT; high = high ? 
@@ -306,47 +424,25 @@ int sa_pool_create(struct inet_ifaddr *ifa, uint16_t low, uint16_t high)
         return EDPVS_INVAL;
     }
 
-    for (cid = 0; cid < RTE_MAX_LCORE; cid++) {
-        uint32_t filtids[MAX_FDIR_PROTO];
-        struct sa_fdir *fdir = &sa_fdirs[cid];
-
-        /* skip master and unused cores */
-        if (cid > 64 || !(sa_lcore_mask & (1L << cid)))
-            continue;
-        assert(rte_lcore_is_enabled(cid) && cid != rte_get_master_lcore());
-
-        ap = rte_zmalloc(NULL, sizeof(struct sa_pool), 0);
-        if (!ap) {
-            err = EDPVS_NOMEM;
-            goto errout;
-        }
-
-        ap->ifa = ifa;
-        ap->low = low;
-        ap->high = high;
-        rte_atomic32_set(&ap->refcnt, 0);
+    if (SA_POOL_MODE == LADDR_LCORE_MAPPING_POOL_MODE) {
+        cid = lcore_ids[(idx++) % sa_nlcore];
+        err = __sa_pool_create(ifa, cid, low, high);
 
-        err = sa_pool_alloc_hash(ap, sa_pool_hash_size, fdir);
-        if (err != EDPVS_OK) {
-            rte_free(ap);
+        if (idx >= sa_nlcore)
+            idx = 0;
+        if (err != EDPVS_OK)
             goto errout;
-        }
-
-        /* if add filter failed, waste some soft-id is acceptable. */
-        filtids[0] = fdir->soft_id++;
-        filtids[1] = fdir->soft_id++;
+    } else if (SA_POOL_MODE == LPORT_LCORE_MAPPING_POOL_MODE) {
+        for (cid = 0; cid < RTE_MAX_LCORE; cid++) {
+            /* skip master and unused cores */
+            if (cid > 64 || !(sa_lcore_mask & (1L << cid)))
+                continue;
+            assert(rte_lcore_is_enabled(cid) && cid != rte_get_master_lcore());
+            err = __sa_pool_create(ifa, cid, low, high);
 
-        err = sa_add_filter(ifa->af, ifa->idev->dev, cid, &ifa->addr,
-                            fdir->port_base, filtids);
-        if (err != EDPVS_OK) {
-            sa_pool_free_hash(ap);
-            rte_free(ap);
-            goto errout;
+            if (err != EDPVS_OK)
+                goto errout;
         }
-        ap->filter_id[0] = filtids[0];
-        ap->filter_id[1] = filtids[1];
-
-        ifa->sa_pools[cid] = ap;
     }
 
 #ifdef CONFIG_DPVS_SAPOOL_DEBUG
@@ -443,8 +539,8 @@ static inline int sa_pool_fetch(struct sa_entry_pool *pool,
     if (!ent) {
 #ifdef CONFIG_DPVS_SAPOOL_DEBUG
         RTE_LOG(DEBUG, SAPOOL, "%s: no entry (used/free %d/%d)\n", __func__,
-                rte_atomic16_read(&pool->used_cnt),
-                rte_atomic16_read(&pool->free_cnt));
+                rte_atomic32_read(&pool->used_cnt),
+                rte_atomic32_read(&pool->free_cnt));
 #endif
         pool->miss_cnt++;
         return EDPVS_RESOURCE;
@@ -464,8 +560,8 @@ static inline int sa_pool_fetch(struct sa_entry_pool *pool,
     ent->flags |= SA_F_USED;
     list_move_tail(&ent->list, &pool->used_enties);
-    rte_atomic16_inc(&pool->used_cnt);
-    rte_atomic16_dec(&pool->free_cnt);
+    rte_atomic32_inc(&pool->used_cnt);
+    rte_atomic32_dec(&pool->free_cnt);
 
 #ifdef CONFIG_DPVS_SAPOOL_DEBUG
     RTE_LOG(DEBUG, SAPOOL, "%s: %s:%d fetched!\n", __func__,
@@ -515,8 +611,8 @@ static inline int sa_pool_release(struct sa_entry_pool *pool,
     ent->flags &= (~SA_F_USED);
     list_move_tail(&ent->list, &pool->free_enties);
-    rte_atomic16_dec(&pool->used_cnt);
-    rte_atomic16_inc(&pool->free_cnt);
+    rte_atomic32_dec(&pool->used_cnt);
+    rte_atomic32_inc(&pool->free_cnt);
 
 #ifdef CONFIG_DPVS_SAPOOL_DEBUG
     RTE_LOG(DEBUG, SAPOOL, "%s: %s:%d released!\n", __func__,
@@ -830,8 +926,8 @@ static int sa_msg_get_stats(struct dpvs_msg *msg)
         pool = &ifa->this_sa_pool->pool_hash[hash];
         assert(pool);
 
-        stats->used_cnt += rte_atomic16_read(&pool->used_cnt);
-        stats->free_cnt += rte_atomic16_read(&pool->free_cnt);
+        stats->used_cnt += rte_atomic32_read(&pool->used_cnt);
+        stats->free_cnt += rte_atomic32_read(&pool->free_cnt);
         stats->miss_cnt += pool->miss_cnt;
     }
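sa_pool_create above is where a LIP actually gets pinned to one slave lcore in laddr_lcore_mapping mode: a static index walks lcore_ids[] round-robin across successive address additions, so each LIP ends up with exactly one owning worker. The sketch below replays that assignment with example values (lcore_ids/sa_nlcore stand in for the arrays filled from the configured lcore mask) and shows why the commit-message NOTICE matters when there are fewer LIPs than slave lcores:

    #include <stdio.h>

    static const unsigned lcore_ids[] = { 1, 2, 3, 4, 5, 6 };  /* example ids */
    static const unsigned sa_nlcore   = 6;

    static unsigned assign_lip_to_lcore(void)
    {
        static unsigned idx = 0;                 /* persists across additions */
        unsigned cid = lcore_ids[idx++ % sa_nlcore];

        if (idx >= sa_nlcore)
            idx = 0;
        return cid;
    }

    int main(void)
    {
        unsigned owned[sizeof(lcore_ids) / sizeof(lcore_ids[0])] = { 0 };
        unsigned i;

        for (i = 0; i < 4; i++)                  /* only 4 LIPs for 6 slaves */
            owned[assign_lip_to_lcore() - 1]++;  /* "-1" works because the
                                                    example ids are 1..N */

        for (i = 0; i < sa_nlcore; i++)
            printf("lcore %u owns %u LIP(s)%s\n", lcore_ids[i], owned[i],
                   owned[i] ? "" : "  <-- FNAT cannot pick a LIP here");
        return 0;
    }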
@@ -894,6 +990,22 @@ int sa_pool_term(void)
 /*
  * config file
  */
+static void pool_mode_handler(vector_t tokens)
+{
+    char *str = set_value(tokens);
+    assert(str);
+
+    if (!strcmp(str, LADDR_LCORE_MAPPING_POOL_MODE_NAME))
+        sa_pool_mode = LADDR_LCORE_MAPPING_POOL_MODE;
+    else if (!strcmp(str, LPORT_LCORE_MAPPING_POOL_MODE_NAME))
+        sa_pool_mode = LPORT_LCORE_MAPPING_POOL_MODE;
+    else
+        RTE_LOG(WARNING, SAPOOL, "invalid pool_mode %s, use default %s\n",
+                str, LPORT_LCORE_MAPPING_POOL_MODE_NAME);
+
+    FREE_PTR(str);
+}
+
 static void sa_pool_hash_size_conf(vector_t tokens)
 {
     char *str = set_value(tokens);
@@ -915,5 +1027,6 @@ static void sa_pool_hash_size_conf(vector_t tokens)
 void install_sa_pool_keywords(void)
 {
     install_keyword_root("sa_pool", NULL);
+    install_keyword("pool_mode", pool_mode_handler, KW_TYPE_INIT);
     install_keyword("pool_hash_size", sa_pool_hash_size_conf, KW_TYPE_INIT);
 }
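The pool_mode keyword registered above is parsed once at startup (KW_TYPE_INIT) and falls back to lport_lcore_mapping for unknown values. Before switching a deployment to laddr_lcore_mapping it is worth checking, from the configured lcore mask, that enough LIPs will be added; a hypothetical pre-flight check, not part of DPVS:

    #include <stdio.h>
    #include <stdint.h>

    /* Count enabled slave lcores from a mask (same information that
     * netif_get_slave_lcores exposes) and compare with the planned LIPs. */
    static unsigned count_slave_lcores(uint64_t lcore_mask)
    {
        unsigned n = 0;
        while (lcore_mask) {
            n += (unsigned)(lcore_mask & 1);
            lcore_mask >>= 1;
        }
        return n;
    }

    int main(void)
    {
        uint64_t mask   = 0x1fe;     /* example: slave lcores 1-8 */
        unsigned slaves = count_slave_lcores(mask);
        unsigned laddrs = 6;         /* example LIP count */

        if (laddrs < slaves)
            printf("need at least %u LIPs, only %u planned\n", slaves, laddrs);
        else
            printf("ok: %u LIPs cover %u slave lcores\n", laddrs, slaves);
        return 0;
    }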