diff --git a/do.sh b/do.sh index 1a5ba5fa..dadf14d8 100755 --- a/do.sh +++ b/do.sh @@ -449,7 +449,7 @@ function run_test() { cluster_vars="" for var in enable_ssl clustered_db monitor_all use_ovsdb_etcd \ - node_net datapath_type n_relays n_workers; do + node_net datapath_type n_relays n_workers n_az; do cluster_vars="${cluster_vars} $(get_cluster_var ${test_file} ${var})" done echo "-- Cluster vars: ${cluster_vars}" diff --git a/ovn-fake-multinode-utils/playbooks/bringup-cluster.yml b/ovn-fake-multinode-utils/playbooks/bringup-cluster.yml index 2831c33a..6c98c30d 100644 --- a/ovn-fake-multinode-utils/playbooks/bringup-cluster.yml +++ b/ovn-fake-multinode-utils/playbooks/bringup-cluster.yml @@ -22,6 +22,7 @@ tasks: - name: Start central containers environment: + CENTRAL_COUNT: '{{ n_az }}' CHASSIS_COUNT: 0 CREATE_FAKE_VMS: no ENABLE_ETCD: '{{ use_ovsdb_etcd }}' @@ -46,6 +47,7 @@ tasks: - name: Start worker containers environment: + CENTRAL_COUNT: '{{ n_az }}' CHASSIS_COUNT: 0 CREATE_FAKE_VMS: no ENABLE_ETCD: '{{ use_ovsdb_etcd }}' diff --git a/ovn-fake-multinode-utils/translate_yaml.py b/ovn-fake-multinode-utils/translate_yaml.py index 7c37cc00..63586eb0 100755 --- a/ovn-fake-multinode-utils/translate_yaml.py +++ b/ovn-fake-multinode-utils/translate_yaml.py @@ -25,28 +25,6 @@ class GlobalConfig: run_ipv6: bool = False -def calculate_node_remotes( - node_net: str, clustered_db: bool, n_relays: int, enable_ssl: bool -) -> str: - net = netaddr.IPNetwork(node_net) - - ip_gen = net.iter_hosts() - # The first IP is assigned to the tester, skip it. - next(ip_gen) - if n_relays > 0: - skip = 3 if clustered_db else 1 - for _ in range(0, skip): - next(ip_gen) - ip_range = range(0, n_relays) - else: - ip_range = range(0, 3 if clustered_db else 1) - if enable_ssl: - remotes = ["ssl:" + str(next(ip_gen)) + ":6642" for _ in ip_range] - else: - remotes = ["tcp:" + str(next(ip_gen)) + ":6642" for _ in ip_range] - return ','.join(remotes) - - DEFAULT_N_VIPS = 2 DEFAULT_VIP_PORT = 80 @@ -119,7 +97,6 @@ class ClusterConfig: db_inactivity_probe: int = 60000 node_net: str = "192.16.0.0/16" enable_ssl: bool = True - node_remote: str = None node_timeout_s: int = 20 internal_net: str = "16.0.0.0/16" internal_net6: str = "16::/64" @@ -131,6 +108,7 @@ class ClusterConfig: cluster_net6: str = "16::/32" n_workers: int = 2 n_relays: int = 0 + n_az: int = 1 vips: Dict = None vips6: Dict = None vip_subnet: str = "4.0.0.0/8" @@ -141,14 +119,6 @@ class ClusterConfig: def __post_init__(self, **kwargs): # Some defaults have to be calculated - if not self.node_remote: - self.node_remote = calculate_node_remotes( - self.node_net, - self.clustered_db, - self.n_relays, - self.enable_ssl, - ) - if self.vips is None: self.vips = calculate_vips(self.vip_subnet) diff --git a/ovn-tester/ovn_tester.py b/ovn-tester/ovn_tester.py index 27f969d0..120dcd1a 100644 --- a/ovn-tester/ovn_tester.py +++ b/ovn-tester/ovn_tester.py @@ -14,7 +14,7 @@ from ovn_sandbox import PhysicalNode from ovn_workload import BrExConfig, ClusterConfig from ovn_workload import CentralNode, WorkerNode, Cluster -from ovn_utils import DualStackSubnet +from ovn_utils import DualStackSubnet, NodeConf from ovs.stream import Stream @@ -70,8 +70,8 @@ def read_config(config): raft_election_to=cluster_args['raft_election_to'], node_net=netaddr.IPNetwork(cluster_args['node_net']), n_relays=cluster_args['n_relays'], + n_az=cluster_args['n_az'], enable_ssl=cluster_args['enable_ssl'], - node_remote=cluster_args['node_remote'], northd_probe_interval=cluster_args['northd_probe_interval'], db_inactivity_probe=cluster_args['db_inactivity_probe'], node_timeout_s=cluster_args['node_timeout_s'], @@ -182,36 +182,77 @@ def configure_tests(yaml, central_node, worker_nodes, global_cfg): def create_nodes(cluster_config, central, workers): - mgmt_net = cluster_config.node_net - mgmt_ip = mgmt_net.ip + 2 - internal_net = cluster_config.internal_net - external_net = cluster_config.external_net - gw_net = cluster_config.gw_net - db_containers = ( - ['ovn-central-1', 'ovn-central-2', 'ovn-central-3'] - if cluster_config.clustered_db - else ['ovn-central'] - ) + node_az_conf = [ + NodeConf( + cluster_config.node_net, + DualStackSubnet.next( + cluster_config.internal_net, + i * (cluster_config.n_workers // cluster_config.n_az), + ), + DualStackSubnet.next( + cluster_config.external_net, + i * (cluster_config.n_workers // cluster_config.n_az), + ), + DualStackSubnet.next( + cluster_config.gw_net, + i * (cluster_config.n_workers // cluster_config.n_az), + ), + ) + for i in range(cluster_config.n_az) + ] + + db_containers = [ + [ + f'ovn-central-az{i+1}-1', + f'ovn-central-az{i+1}-2', + f'ovn-central-az{i+1}-3' + if cluster_config.clustered_db + else f'ovn-central-az{i+1}', + ] + for i in range(cluster_config.n_az) + ] + relay_containers = [ - f'ovn-relay-{i + 1}' for i in range(cluster_config.n_relays) + [ + f'ovn-relay{i*cluster_config.n_az+j+1}' + for j in range(cluster_config.n_relays) + ] + for i in range(cluster_config.n_az) ] - central_node = CentralNode( - central, db_containers, relay_containers, mgmt_net, mgmt_ip - ) - worker_nodes = [ - WorkerNode( + + central_nodes = [ + CentralNode( + central, + db_containers[i], + relay_containers[i], + node_az_conf[i].getMgmtNet(), + node_az_conf[i].getMgmtIp(), + node_az_conf[i].getGwNet(), + i, + ) + for i in range(cluster_config.n_az) + ] + + worker_nodes = [[] for _ in range(cluster_config.n_az)] + for i in range(cluster_config.n_workers): + wn = WorkerNode( workers[i % len(workers)], f'ovn-scale-{i}', - mgmt_net, - mgmt_ip + i, - DualStackSubnet.next(internal_net, i), - DualStackSubnet.next(external_net, i), - gw_net, + node_az_conf[i % cluster_config.n_az].getMgmtNet(), + node_az_conf[i % cluster_config.n_az].getMgmtIp(), + DualStackSubnet.next( + node_az_conf[i % cluster_config.n_az].getIntNet(), + i // cluster_config.n_az, + ), + DualStackSubnet.next( + node_az_conf[i % cluster_config.n_az].getExtNet(), + i // cluster_config.n_az, + ), + node_az_conf[i % cluster_config.n_az].getGwNet(), i, ) - for i in range(cluster_config.n_workers) - ] - return central_node, worker_nodes + worker_nodes[i % cluster_config.n_az].append(wn) + return central_nodes, worker_nodes def set_ssl_keys(cluster_cfg): @@ -220,28 +261,38 @@ def set_ssl_keys(cluster_cfg): Stream.ssl_set_ca_cert_file(cluster_cfg.ssl_cacert) -def prepare_test(central_node, worker_nodes, cluster_cfg, brex_cfg): +def prepare_test(central_nodes, worker_nodes, cluster_cfg, brex_cfg): + clusters = [] if cluster_cfg.enable_ssl: set_ssl_keys(cluster_cfg) - ovn = Cluster(central_node, worker_nodes, cluster_cfg, brex_cfg) - with Context(ovn, "prepare_test"): - ovn.start() - return ovn + for i in range(0, len(central_nodes)): + ovn = Cluster(central_nodes[i], worker_nodes[i], cluster_cfg, brex_cfg) + with Context(ovn, f'prepare_test for cluster{i}'): + ovn.start() + clusters.append(ovn) + + return clusters def run_base_cluster_bringup(ovn, bringup_cfg, global_cfg): - # create ovn topology - with Context(ovn, "base_cluster_bringup", len(ovn.worker_nodes)) as ctx: - ovn.create_cluster_router("lr-cluster") - ovn.create_cluster_join_switch("ls-join") - ovn.create_cluster_load_balancer("lb-cluster", global_cfg) - for i in ctx: - worker = ovn.worker_nodes[i] - worker.provision(ovn) - ports = worker.provision_ports(ovn, bringup_cfg.n_pods_per_node) - worker.provision_load_balancers(ovn, ports, global_cfg) - worker.ping_ports(ovn, ports) - ovn.provision_lb_group() + for i in range(0, len(clusters)): + ovn = clusters[i] + # create ovn topology + with Context( + ovn, "base_cluster_bringup", len(ovn.worker_nodes) + ) as ctx: + ovn.create_cluster_router(f'lr-cluster{i}') + ovn.create_cluster_join_switch(f'ls-join{i}') + ovn.create_cluster_load_balancer(f'lb-cluster{i}', global_cfg) + for i in ctx: + worker = ovn.worker_nodes[i] + worker.provision(ovn) + ports = worker.provision_ports( + ovn, bringup_cfg.n_pods_per_node + ) + worker.provision_load_balancers(ovn, ports, global_cfg) + worker.ping_ports(ovn, ports) + ovn.provision_lb_group(f'cluster-lb-group{i}') if __name__ == '__main__': @@ -260,11 +311,14 @@ def run_base_cluster_bringup(ovn, bringup_cfg, global_cfg): raise ovn_exceptions.OvnInvalidConfigException() central, workers = read_physical_deployment(sys.argv[1], global_cfg) - central_node, worker_nodes = create_nodes(cluster_cfg, central, workers) - tests = configure_tests(config, central_node, worker_nodes, global_cfg) + central_nodes, worker_nodes = create_nodes(cluster_cfg, central, workers) + tests = configure_tests( + config, central_nodes[0], worker_nodes[0], global_cfg + ) - ovn = prepare_test(central_node, worker_nodes, cluster_cfg, brex_cfg) - run_base_cluster_bringup(ovn, bringup_cfg, global_cfg) + clusters = prepare_test(central_nodes, worker_nodes, cluster_cfg, brex_cfg) + run_base_cluster_bringup(clusters, bringup_cfg, global_cfg) + # FIXME run workloads for all clusters for test in tests: - test.run(ovn, global_cfg) + test.run(clusters[0], global_cfg) sys.exit(0) diff --git a/ovn-tester/ovn_utils.py b/ovn-tester/ovn_utils.py index a53adca6..a2989792 100644 --- a/ovn-tester/ovn_utils.py +++ b/ovn-tester/ovn_utils.py @@ -101,6 +101,29 @@ def external_host_provision(self, ip, gw, netns='ext-ns'): self.run(cmd=cmd) +class NodeConf: + def __init__(self, mgmt_net, int_net, ext_net, gw_net): + self.mgmt_net = mgmt_net + self.int_net = int_net + self.ext_net = ext_net + self.gw_net = gw_net + + def getMgmtNet(self): + return self.mgmt_net + + def getMgmtIp(self): + return self.mgmt_net.ip + 2 + + def getIntNet(self): + return self.int_net + + def getExtNet(self): + return self.ext_net + + def getGwNet(self): + return self.gw_net + + class DualStackSubnet: def __init__(self, n4=None, n6=None): self.n4 = n4 diff --git a/ovn-tester/ovn_workload.py b/ovn-tester/ovn_workload.py index 7b5ff073..1f58a201 100644 --- a/ovn-tester/ovn_workload.py +++ b/ovn-tester/ovn_workload.py @@ -27,7 +27,6 @@ 'db_inactivity_probe', 'node_net', 'enable_ssl', - 'node_remote', 'node_timeout_s', 'internal_net', 'external_net', @@ -35,6 +34,7 @@ 'cluster_net', 'n_workers', 'n_relays', + 'n_az', 'vips', 'vips6', 'vip_subnet', @@ -61,17 +61,27 @@ def __init__(self, phys_node, container, mgmt_net, mgmt_ip): class CentralNode(Node): def __init__( - self, phys_node, db_containers, relay_containers, mgmt_net, mgmt_ip + self, + phys_node, + db_containers, + relay_containers, + mgmt_net, + mgmt_ip, + gw_net, + idx, ): super(CentralNode, self).__init__( phys_node, db_containers[0], mgmt_net, mgmt_ip ) self.db_containers = db_containers self.relay_containers = relay_containers + self.id = idx + self.gw_net = gw_net def start(self, cluster_cfg): log.info('Configuring central node') - self.set_raft_election_timeout(cluster_cfg.raft_election_to) + if cluster_cfg.clustered_db: + self.set_raft_election_timeout(cluster_cfg.raft_election_to) self.enable_trim_on_compaction() self.set_northd_threads(cluster_cfg.northd_threads) @@ -121,7 +131,8 @@ def enable_trim_on_compaction(self): def get_connection_string(self, cluster_cfg, port): protocol = "ssl" if cluster_cfg.enable_ssl else "tcp" - ip = self.mgmt_ip + off = 3 * self.id if cluster_cfg.clustered_db else self.id + ip = self.mgmt_ip + off num_conns = 3 if cluster_cfg.clustered_db else 1 conns = [f"{protocol}:{ip + idx}:{port}" for idx in range(num_conns)] return ",".join(conns) @@ -163,15 +174,37 @@ def start(self, cluster_cfg): cluster_cfg.db_inactivity_probe // 1000, ) + def calculate_node_remotes( + self, node_net, clustered_db, enable_ssl, offset + ): + net = netaddr.IPNetwork(node_net) + + ip_gen = net.iter_hosts() + # The first IP is assigned to the tester, skip it. + next(ip_gen) + skip = 3 * offset if clustered_db else offset + for _ in range(0, skip): + next(ip_gen) + ip_range = range(offset, offset + 3 if clustered_db else offset + 1) + if enable_ssl: + remotes = ["ssl:" + str(next(ip_gen)) + ":6642" for _ in ip_range] + else: + remotes = ["tcp:" + str(next(ip_gen)) + ":6642" for _ in ip_range] + return ','.join(remotes) + @ovn_stats.timeit def connect(self, cluster_cfg): - log.info( - f'Connecting worker {self.container}: ' - f'ovn-remote = {cluster_cfg.node_remote}' + remote = self.calculate_node_remotes( + cluster_cfg.node_net, + cluster_cfg.clustered_db, + cluster_cfg.enable_ssl, + self.id % cluster_cfg.n_az, ) - self.vsctl.set_global_external_id( - 'ovn-remote', f'{cluster_cfg.node_remote}' + + log.info( + f'Connecting worker {self.container}: ' f'ovn-remote = {remote}' ) + self.vsctl.set_global_external_id('ovn-remote', f'{remote}') def configure_localnet(self, physical_net): log.info(f'Creating localnet on {self.container}') @@ -419,9 +452,12 @@ def ping_ports(self, cluster, ports): def get_connection_string(self, cluster_cfg, port): protocol = "ssl" if cluster_cfg.enable_ssl else "tcp" - offset = 0 - offset += 3 if cluster_cfg.clustered_db else 1 - offset += cluster_cfg.n_relays + offset = cluster_cfg.n_az + if cluster_cfg.clustered_db: + offset *= 3 + if cluster_cfg.n_relays > 0: + offset += cluster_cfg.n_relays * cluster_cfg.n_az + offset += self.id return f"{protocol}:{self.mgmt_ip + offset}:{port}" @@ -795,17 +831,17 @@ def create_cluster_load_balancer(self, lb_name, global_cfg): def create_cluster_join_switch(self, sw_name): self.join_switch = self.nbctl.ls_add( - sw_name, net_s=self.cluster_cfg.gw_net + sw_name, net_s=self.central_node.gw_net ) self.join_rp = self.nbctl.lr_port_add( self.router, - 'rtr-to-join', + f'rtr-to-{sw_name}', RandMac(), - self.cluster_cfg.gw_net.reverse(), + self.central_node.gw_net.reverse(), ) self.join_ls_rp = self.nbctl.ls_port_add( - self.join_switch, 'join-to-rtr', self.join_rp + self.join_switch, f'{sw_name}-to-rtr', self.join_rp ) def provision_ports(self, n_ports, passive=False): @@ -852,8 +888,8 @@ def select_worker_for_port(self): self.last_selected_worker %= len(self.worker_nodes) return self.worker_nodes[self.last_selected_worker] - def provision_lb_group(self): - self.lb_group = lb.OvnLoadBalancerGroup('cluster-lb-group', self.nbctl) + def provision_lb_group(self, name='cluster-lb-group'): + self.lb_group = lb.OvnLoadBalancerGroup(name, self.nbctl) for w in self.worker_nodes: self.nbctl.ls_add_lbg(w.switch, self.lb_group.lbg) self.nbctl.lr_add_lbg(w.gw_router, self.lb_group.lbg)