diff --git a/artifacts/lrms/helm_chart.yml b/artifacts/lrms/helm_chart.yml index 575bd97..429b272 100644 --- a/artifacts/lrms/helm_chart.yml +++ b/artifacts/lrms/helm_chart.yml @@ -48,7 +48,7 @@ - name: Create params fact set_fact: - helm_params: "{{ item.key }}={{ item.value }},{{ helm_params }}" + helm_params: "{{ item.key }}='{{ item.value }}',{{ helm_params }}" with_dict: "{{ helm_values }}" - name: Create values file diff --git a/artifacts/lrms/kube_fe_install.yml b/artifacts/lrms/kube_fe_install.yml index ac89b1b..2a4e625 100644 --- a/artifacts/lrms/kube_fe_install.yml +++ b/artifacts/lrms/kube_fe_install.yml @@ -6,6 +6,7 @@ export_hosts: "*.localdomain" export_line: "(fsid=0,rw,async,no_root_squash,no_subtree_check,insecure)" nfs_path: "{{ kube_nfs_path | default('/pv') }}" + csi_driver: "{{ kube_csi_driver | default('NFS') }}" pre_tasks: - name: Create dir for the NFS PV @@ -22,11 +23,15 @@ set_fact: export_hosts: "{{ ansible_default_ipv4.network }}/{{ ansible_default_ipv4.netmask }}" when: IM_NODE_CLOUD_TYPE is defined and IM_NODE_CLOUD_TYPE == 'EC2' - + - name: Set to install Longhorn CSI driver + set_fact: + kube_install_longhorn: true + when: csi_driver == 'Longhorn' roles: - role: grycap.nfs nfs_mode: 'front' nfs_exports: [{path: '{{ nfs_path }}', export: '{{ export_hosts }}{{ export_line }}'}] + when: csi_driver == 'NFS' - role: grycap.kubernetes kube_server: '{{ kube_front_end_ip }}' kube_api_server: '{{ kube_front_end_ip }}' diff --git a/artifacts/lrms/kube_wn_install.yml b/artifacts/lrms/kube_wn_install.yml index ceecfdc..5516c69 100644 --- a/artifacts/lrms/kube_wn_install.yml +++ b/artifacts/lrms/kube_wn_install.yml @@ -3,9 +3,11 @@ connection: local vars: vnode_prefix: vnode- + csi_driver: "{{ kube_csi_driver | default('NFS') }}" roles: - role: grycap.nfs nfs_mode: 'wn' + when: csi_driver == 'NFS' - role: 'grycap.kubernetes' kube_type_of_node: 'wn' kube_server: '{{ kube_front_end_ip }}' diff --git a/artifacts/lrms/nomad_ai4eosc_fe_install.yml 
b/artifacts/lrms/nomad_ai4eosc_fe_install.yml index b32da8c..604d2f0 100644 --- a/artifacts/lrms/nomad_ai4eosc_fe_install.yml +++ b/artifacts/lrms/nomad_ai4eosc_fe_install.yml @@ -9,8 +9,17 @@ pre_tasks: - name: Convert server_list to list set_fact: - server_list: [server_list] - when: server_list is string + server_list: + - "{{ server_list }}" + when: + - consul_server_join is not defined or consul_server_join == '' + - server_list is string + + - name: Use consul_server_join to join the consul cluster + set_fact: + server_list: + - "{{ consul_server_join }}" + when: consul_server_join is defined and consul_server_join != '' roles: - role: 'grycap.consul' consul_server: true diff --git a/artifacts/lrms/nomad_ai4eosc_wn_install.yml b/artifacts/lrms/nomad_ai4eosc_wn_install.yml index 52b45c9..b206df7 100644 --- a/artifacts/lrms/nomad_ai4eosc_wn_install.yml +++ b/artifacts/lrms/nomad_ai4eosc_wn_install.yml @@ -8,8 +8,18 @@ pre_tasks: - name: Convert server_list to list set_fact: - server_list: [server_list] - when: server_list is string + server_list: + - "{{ server_list }}" + when: + - consul_server_join is not defined or consul_server_join == '' + - server_list is string + + - name: Use consul_server_join to join the consul cluster + set_fact: + server_list: + - "{{ consul_server_join }}" + when: consul_server_join is defined and consul_server_join != '' + - name: Set nomad plugins var set_fact: nomad_plugins_var: diff --git a/custom_types.yaml b/custom_types.yaml index 1959d24..26e7856 100644 --- a/custom_types.yaml +++ b/custom_types.yaml @@ -584,7 +584,13 @@ node_types: type: string default: "docker" constraints: - - valid_values: [ docker, containerd ] + - valid_values: [ docker, containerd, crio ] + csi_driver: + required: no + type: string + default: "NFS" + constraints: + - valid_values: [ NFS, Longhorn ] artifacts: kube_role: file: grycap.kubernetes @@ -601,6 +607,7 @@ node_types: kube_version: { get_property: [ SELF, version ] } kube_nvidia_support: { 
get_property: [ SELF, nvidia_support ] } kube_cri_runtime: { get_property: [ SELF, cri_runtime ] } + kube_csi_driver: { get_property: [ SELF, csi_driver ] } tosca.nodes.indigo.LRMS.FrontEnd.Kubernetes: derived_from: tosca.nodes.indigo.LRMS.FrontEnd @@ -686,7 +693,13 @@ node_types: type: string default: "docker" constraints: - - valid_values: [ docker, containerd ] + - valid_values: [ docker, containerd, crio ] + csi_driver: + required: no + type: string + default: "NFS" + constraints: + - valid_values: [ NFS, Longhorn ] artifacts: kube_role: file: grycap.kubernetes @@ -728,6 +741,7 @@ node_types: kube_cri_runtime: { get_property: [ SELF, cri_runtime ] } kube_install_yunikorn: { get_property: [ SELF, install_yunikorn ] } kube_deploy_dashboard: { get_property: [ SELF, install_dashboard ] } + kube_csi_driver: { get_property: [ SELF, csi_driver ] } tosca.nodes.indigo.LRMS.WorkerNode.Slurm: derived_from: tosca.nodes.indigo.LRMS.WorkerNode diff --git a/templates/k8s_new_wn_type.yaml b/templates/k8s_new_wn_type.yaml index fa4c81d..4ebdeee 100644 --- a/templates/k8s_new_wn_type.yaml +++ b/templates/k8s_new_wn_type.yaml @@ -82,6 +82,7 @@ topology_template: version: { get_input: kube_version } nvidia_support: { get_input: wng_kube_nvidia_support } cri_runtime: { get_input: kube_cri_runtime } + csi_driver: { get_input: kube_csi_driver } requirements: - host: wng diff --git a/templates/kubernetes.yaml b/templates/kubernetes.yaml index 9e1088d..cca4bfe 100644 --- a/templates/kubernetes.yaml +++ b/templates/kubernetes.yaml @@ -1,7 +1,7 @@ tosca_definitions_version: tosca_simple_yaml_1_0 imports: - - grycap_custom_types: https://raw.githubusercontent.com/grycap/tosca/main/custom_types.yaml + - grycap_custom_types: https://raw.githubusercontent.com/grycap/tosca/devel/custom_types.yaml description: Deploy a Kubernetes Virtual Cluster. 
@@ -164,11 +164,18 @@ topology_template: description: DNS name of the public interface of the FE node to generate the certificate default: "" - allowed_cidr: + kube_allowed_cidr: type: string description: Allowed remote CIDR to extenal access default: "0.0.0.0/0" + kube_csi_driver: + type: string + description: Name of the CSI driver to install + default: NFS + constraints: + - valid_values: [ NFS, Longhorn ] + node_templates: lrms_front_end: @@ -180,11 +187,11 @@ topology_template: http_port: protocol: tcp source: 80 - remote_cidr: { get_input: allowed_cidr } + remote_cidr: { get_input: kube_allowed_cidr } https_port: protocol: tcp source: 443 - remote_cidr: { get_input: allowed_cidr } + remote_cidr: { get_input: kube_allowed_cidr } properties: admin_username: kubeuser install_nfs_client: true @@ -196,6 +203,7 @@ topology_template: cert_user_email: { get_input: kube_cert_user_email } public_dns_name: { get_input: kube_public_dns_name} cri_runtime: { get_input: kube_cri_runtime } + csi_driver: { get_input: kube_csi_driver } requirements: - host: front @@ -236,6 +244,7 @@ topology_template: version: { get_input: kube_version } nvidia_support: { get_input: wn_kube_nvidia_support } cri_runtime: { get_input: kube_cri_runtime } + csi_driver: { get_input: kube_csi_driver } requirements: - host: wn diff --git a/templates/nomad_join_ai4eosc.yaml b/templates/nomad_join_ai4eosc.yaml new file mode 100644 index 0000000..05567d6 --- /dev/null +++ b/templates/nomad_join_ai4eosc.yaml @@ -0,0 +1,373 @@ +tosca_definitions_version: tosca_simple_yaml_1_0 + +imports: + - grycap_custom_types: https://raw.githubusercontent.com/grycap/tosca/main/custom_types.yaml + +description: Deploy a Consul + Nomad Virtual Cluster joining a previously existing one (AI4EOSC). 
+ +metadata: + template_name: Nomad Join + template_version: "1.0.0" + display_name: Deploy and Join a Consul + Nomad Virtual Cluster + icon: images/nomad.png + tabs: + Server Features: fe_.* + WNs Features: wn_.* + GPU WNs Features: wn_gpu_.* + Pub WNs Features: wn_pub_.* + Nomad Data: + - launch_traefik + - consul_version + - nomad_version + - consul_cert_url + - nomad_cert_url + - consul_server_join + +topology_template: + inputs: + + fe_num: + type: integer + description: Number of Nomad Servers in the cluster. + default: 1 + required: yes + constraints: + - valid_values: [ 1, 3, 5 ] + fe_cpus: + type: integer + description: Number of CPUs for the front-end node + default: 2 + required: yes + constraints: + - valid_values: [ 2, 4, 8, 16, 32, 64 ] + fe_mem: + type: scalar-unit.size + description: Amount of Memory for the front-end node + default: 4 GB + required: yes + constraints: + - valid_values: [ 4 GB, 8 GB, 16 GB, 32 GB, 64 GB, 128 GB, 256 GB, 512 GB ] + + wn_num: + type: integer + description: Number of Nomad Clients in the cluster (without public IP) + default: 1 + required: yes + wn_cpus: + type: integer + description: Number of CPUs for the WNs (without public IP) + default: 2 + required: yes + constraints: + - valid_values: [ 2, 4, 8, 16, 32, 64 ] + wn_mem: + type: scalar-unit.size + description: Amount of Memory for the WNs (without public IP) + default: 4 GB + required: yes + constraints: + - valid_values: [ 4 GB, 8 GB, 16 GB, 32 GB, 64 GB, 128 GB, 256 GB, 512 GB ] + + wn_pub_num: + type: integer + description: Number of Nomad Clients in the cluster (with public IP) + default: 1 + required: yes + wn_pub_cpus: + type: integer + description: Number of CPUs for the WNs (with public IP) + default: 2 + required: yes + constraints: + - valid_values: [ 2, 4, 8, 16, 32, 64 ] + wn_pub_mem: + type: scalar-unit.size + description: Amount of Memory for the WNs (with public IP) + default: 4 GB + required: yes + constraints: + - valid_values: [ 4 GB, 8 GB, 16 GB, 
32 GB, 64 GB, 128 GB, 256 GB, 512 GB ] + + wn_gpu_num: + type: integer + description: Number of Nomad Clients in the cluster (with GPU) + default: 0 + required: yes + wn_gpu_cpus: + type: integer + description: Number of CPUs for the GPU WNs + default: 2 + required: yes + constraints: + - valid_values: [ 2, 4, 8, 16, 32, 64 ] + wn_gpu_mem: + type: scalar-unit.size + description: Amount of Memory for the GPU WNs + default: 4 GB + required: yes + constraints: + - valid_values: [ 4 GB, 8 GB, 16 GB, 32 GB, 64 GB, 128 GB, 256 GB, 512 GB ] + wn_gpu_num_gpus: + type: integer + description: Number of GPUs to assign to this VM + default: 1 + constraints: + - valid_values: [ 1, 2, 3, 4 ] + wn_gpu_vendor: + type: string + description: GPU Vendor + default: '' + constraints: + - valid_values: [ '', 'NVIDIA', 'AMD' ] + wn_gpu_model: + type: string + description: GPU Model + default: '' + + + launch_traefik: + type: boolean + description: Launch Traefik job as reverse proxy + default: false + constraints: + - valid_values: [ false, true ] + consul_version: + type: string + description: Consul version to install + default: 1.17.1 + required: yes + nomad_version: + type: string + description: Nomad version to install + default: 1.7.3 + required: yes + + + consul_cert_url: + type: string + description: URL to download the Consul certificates and tokens + default: '' + nomad_cert_url: + type: string + description: URL to download the Nomad certificates + default: '' + consul_server_join: + type: string + description: IP address of the Consul server to join + default: '' + + node_templates: + + lrms_front_end: + type: tosca.nodes.indigo.LRMS.FrontEnd.Nomad + capabilities: + endpoint: + properties: + ports: + port_4646: + protocol: tcp + source: 4646 + port_8501: + protocol: tcp + source: 8501 + port_80: + protocol: tcp + source: 80 + port_443: + protocol: tcp + source: 443 + artifacts: + nomad_role: + file: grycap.nomad,ai4eosc + type: tosca.artifacts.AnsibleGalaxy.role + 
consul_role: + file: grycap.consul,ai4eosc + type: tosca.artifacts.AnsibleGalaxy.role + requirements: + - host: front + interfaces: + Standard: + configure: + implementation: https://raw.githubusercontent.com/grycap/tosca/main/artifacts/lrms/nomad_ai4eosc_fe_install.yml + inputs: + nomad_server_list: { get_attribute: [ front, private_address ] } + nomad_launch_traefik: { get_input: launch_traefik } + nomad_version: { get_input: nomad_version } + consul_version: { get_input: consul_version } + consul_certs_url: { get_input: consul_cert_url } + nomad_certs_url: { get_input: nomad_cert_url } + consul_server_join: { get_input: consul_server_join } + + front: + type: tosca.nodes.indigo.Compute + capabilities: + scalable: + properties: + count: { get_input: fe_num } + endpoint: + properties: + dns_name: server#N# + network_name: PUBLIC + host: + properties: + num_cpus: { get_input: fe_cpus } + mem_size: { get_input: fe_mem } + os: + properties: + type: linux + + wn_node: + type: tosca.nodes.indigo.LRMS.WorkerNode.Nomad + properties: + front_end_ip: { get_attribute: [ front, private_address, 0 ] } + requirements: + - host: wn + artifacts: + docker_role: + file: grycap.docker + type: tosca.artifacts.AnsibleGalaxy.role + nomad_role: + file: grycap.nomad,ai4eosc + type: tosca.artifacts.AnsibleGalaxy.role + consul_role: + file: grycap.consul,ai4eosc + type: tosca.artifacts.AnsibleGalaxy.role + interfaces: + Standard: + configure: + implementation: https://raw.githubusercontent.com/grycap/tosca/main/artifacts/lrms/nomad_ai4eosc_wn_install.yml + inputs: + nomad_server_list: { get_attribute: [ front, private_address ] } + nomad_version: { get_input: nomad_version } + consul_version: { get_input: consul_version } + consul_certs_url: { get_input: consul_cert_url } + nomad_certs_url: { get_input: nomad_cert_url } + consul_server_join: { get_input: consul_server_join } + + wn: + type: tosca.nodes.indigo.Compute + capabilities: + scalable: + properties: + count: { get_input: wn_num } 
+ host: + properties: + num_cpus: { get_input: wn_cpus } + mem_size: { get_input: wn_mem } + os: + properties: + type: linux + + wn_pub_node: + type: tosca.nodes.indigo.LRMS.WorkerNode.Nomad + properties: + front_end_ip: { get_attribute: [ front, private_address, 0 ] } + requirements: + - host: wn_pub + artifacts: + docker_role: + file: grycap.docker + type: tosca.artifacts.AnsibleGalaxy.role + nomad_role: + file: grycap.nomad,ai4eosc + type: tosca.artifacts.AnsibleGalaxy.role + consul_role: + file: grycap.consul,ai4eosc + type: tosca.artifacts.AnsibleGalaxy.role + interfaces: + Standard: + configure: + implementation: https://raw.githubusercontent.com/grycap/tosca/main/artifacts/lrms/nomad_ai4eosc_wn_install.yml + inputs: + nomad_server_list: { get_attribute: [ front, private_address ] } + nomad_version: { get_input: nomad_version } + consul_version: { get_input: consul_version } + consul_certs_url: { get_input: consul_cert_url } + nomad_certs_url: { get_input: nomad_cert_url } + consul_server_join: { get_input: consul_server_join } + + wn_pub: + type: tosca.nodes.indigo.Compute + capabilities: + endpoint: + properties: + network_name: PUBLIC + ports: + port_80: + protocol: tcp + source: 80 + port_443: + protocol: tcp + source: 443 + scalable: + properties: + count: { get_input: wn_pub_num } + host: + properties: + num_cpus: { get_input: wn_pub_cpus } + mem_size: { get_input: wn_pub_mem } + os: + properties: + type: linux + + wn_gpu_node: + type: tosca.nodes.indigo.LRMS.WorkerNode.Nomad + properties: + front_end_ip: { get_attribute: [ front, private_address, 0 ] } + requirements: + - host: wn_gpu + artifacts: + docker_role: + file: grycap.docker + type: tosca.artifacts.AnsibleGalaxy.role + nomad_role: + file: grycap.nomad,ai4eosc + type: tosca.artifacts.AnsibleGalaxy.role + consul_role: + file: grycap.consul,ai4eosc + type: tosca.artifacts.AnsibleGalaxy.role + interfaces: + Standard: + configure: + implementation: 
https://raw.githubusercontent.com/grycap/tosca/main/artifacts/lrms/nomad_ai4eosc_wn_install.yml + inputs: + nomad_server_list: { get_attribute: [ front, private_address ] } + nomad_version: { get_input: nomad_version } + consul_version: { get_input: consul_version } + nomad_nvidia_support: true + consul_certs_url: { get_input: consul_cert_url } + nomad_certs_url: { get_input: nomad_cert_url } + consul_server_join: { get_input: consul_server_join } + + wn_gpu: + type: tosca.nodes.indigo.Compute + capabilities: + scalable: + properties: + count: { get_input: wn_gpu_num } + host: + properties: + num_cpus: { get_input: wn_gpu_cpus } + mem_size: { get_input: wn_gpu_mem } + num_gpus: { get_input: wn_gpu_num_gpus } + gpu_vendor: { get_input: wn_gpu_vendor } + gpu_model: { get_input: wn_gpu_model } + os: + properties: + type: linux + + outputs: + nomad_ui: + value: { concat: [ 'https://', get_attribute: [ front, public_address, 0 ], ':4646' ] } + consul_ui: + value: { concat: [ 'https://', get_attribute: [ front, public_address, 0 ], ':8501' ] } + nomad_token: + value: { get_attribute: [ front, ansible_output, lrms_front_end_front_conf_front, tasks, 'grycap.nomad : nomad_secret_id', output ] } + consul_token: + value: { get_attribute: [ front, ansible_output, lrms_front_end_front_conf_front, tasks, 'grycap.consul : consul_secret_id', output ] } + cluster_ip: + value: { get_attribute: [ front, public_address, 0 ] } + cluster_creds: + value: { get_attribute: [ front, endpoint, credential, 0 ] } + traefik_endpoint: + value: { concat: [ 'https://', get_attribute: [ wn_pub, public_address, 0 ] ] }