From 86095790aaf81e0f1b73e462b4343a9d50e24a05 Mon Sep 17 00:00:00 2001
From: Adrien Nayrat
Date: Fri, 6 Oct 2023 21:36:29 +0200
Subject: [PATCH] Add linux kernel tuning sysctl, huge pages, disable THP

---
Review notes (text in this section is ignored by git-am):
- The tmpfiles handler now applies only /etc/tmpfiles.d/thp.conf instead of
  re-running every tmpfiles.d rule present on the host.
- kernel.sched_migration_cost_ns is only set when the kernel still exposes
  it under /proc/sys (it moved to debugfs in Linux 5.13).
- The tmpfiles.d drop-in is installed with explicit owner/group/mode.
- Comments added by this patch are in English.
- Blob ids on the "index" lines are those of the original submission.

 roles/common/files/systemd-tmpfiles.conf |  2 +
 roles/common/handlers/main.yml           |  2 +
 roles/common/handlers/sysctl.yml         |  3 +
 roles/common/tasks/main.yml              | 13 ++--
 roles/common/tasks/sysctl.yml            | 94 ++++++++++++++++++++++++++++
 roles/common/tasks/sysstat.yml           |  5 +-
 6 files changed, 108 insertions(+), 11 deletions(-)
 create mode 100644 roles/common/files/systemd-tmpfiles.conf
 create mode 100644 roles/common/handlers/sysctl.yml
 create mode 100644 roles/common/tasks/sysctl.yml

diff --git a/roles/common/files/systemd-tmpfiles.conf b/roles/common/files/systemd-tmpfiles.conf
new file mode 100644
index 00000000..07a0021d
--- /dev/null
+++ b/roles/common/files/systemd-tmpfiles.conf
@@ -0,0 +1,2 @@
+w /sys/kernel/mm/transparent_hugepage/enabled - - - - never
+w /sys/kernel/mm/transparent_hugepage/defrag - - - - never
diff --git a/roles/common/handlers/main.yml b/roles/common/handlers/main.yml
index 31a1b93e..289c808a 100644
--- a/roles/common/handlers/main.yml
+++ b/roles/common/handlers/main.yml
@@ -27,6 +27,8 @@
 
 - include: sysstat.yml
 
+- include: sysctl.yml
+
 - name: restart systemd-hostnamed
   systemd:
     daemon_reload: yes
diff --git a/roles/common/handlers/sysctl.yml b/roles/common/handlers/sysctl.yml
new file mode 100644
index 00000000..3ac92fe4
--- /dev/null
+++ b/roles/common/handlers/sysctl.yml
@@ -0,0 +1,3 @@
+# Apply only the drop-in installed by this role, not every tmpfiles.d rule on the host.
+- name: systemd-tmpfiles create
+  command: systemd-tmpfiles --create /etc/tmpfiles.d/thp.conf
diff --git a/roles/common/tasks/main.yml b/roles/common/tasks/main.yml
index 41b052a8..b5c306d0 100644
--- a/roles/common/tasks/main.yml
+++ b/roles/common/tasks/main.yml
@@ -670,14 +670,6 @@
   notify:
     - generate locales
 
-# necessary to run a lot of containers, each which systemd launching several inotify
-- name: increase fs.inotify.max_user_instances on host
-  sysctl:
-    name: fs.inotify.max_user_instances
-    value: 1024
-    sysctl_file: /etc/sysctl.d/ansible.conf
-  when: "not 'vm' in group_names"
-
 # configure lxfs so that VMs get their own load-average
 - name: create systemd override directory for lxcfs
   file:
@@ -709,7 +701,10 @@
 - include: munin-node.yml
 
 - include: sysstat.yml
+  when: "'proxmox' in group_names"
+
+- include: sysctl.yml
+  when: "'proxmox' in group_names"
 
 - include: ntp.yml
   when: "not 'vm' in group_names"
-
diff --git a/roles/common/tasks/sysctl.yml b/roles/common/tasks/sysctl.yml
new file mode 100644
index 00000000..efba6d37
--- /dev/null
+++ b/roles/common/tasks/sysctl.yml
@@ -0,0 +1,94 @@
+# Necessary to run a lot of containers: each one runs its own systemd,
+# which opens several inotify instances.
+- name: increase fs.inotify.max_user_instances on host
+  sysctl:
+    name: fs.inotify.max_user_instances
+    value: 1024
+    sysctl_file: /etc/sysctl.d/ansible.conf
+
+- name: Reduce swappiness to 1
+  sysctl:
+    name: vm.swappiness
+    value: 1
+    sysctl_file: /etc/sysctl.d/ansible.conf
+
+# https://forum.proxmox.com/threads/increase-performance-with-sched_autogroup_enabled-0.41729/
+# https://www.postgresql.org/message-id/50E4AAB1.9040902@optionshouse.com
+#
+# * sched_migration_cost
+#
+# The migration cost is the total time the scheduler will consider a
+# migrated process "cache hot" and thus less likely to be re-migrated. By
+# default, this is 0.5ms (500000 ns), and as the size of the process table
+# increases, eventually causes the scheduler to break down. On our
+# systems, after a smooth degradation with increasing connection count,
+# system CPU spiked from 20 to 70% sustained and TPS was cut by 5-10x once
+# we crossed some invisible connection count threshold. For us, that was a
+# pgbench with 900 or more clients.
+#
+# The migration cost should be increased, almost universally on server
+# systems with many processes. This means systems like PostgreSQL or
+# Apache would benefit from having higher migration costs. We've had good
+# luck with a setting of 5ms (5000000 ns) instead.
+#
+# When the breakdown occurs, system CPU (as obtained from sar) increases
+# from 20% on a heavy pgbench (scale 3500 on a 72GB system) to over 70%,
+# and %nice/%user is cut by half or more. A higher migration cost
+# essentially eliminates this artificial throttle.
+#
+#
+# * sched_autogroup_enabled
+#
+# This is a relatively new patch which Linus lauded back in late 2010. It
+# basically groups tasks by TTY so perceived responsiveness is improved.
+# But on server systems, large daemons like PostgreSQL are going to be
+# launched from the same pseudo-TTY, and be effectively choked out of CPU
+# cycles in favor of less important tasks.
+#
+# The default setting is 1 (enabled) on some platforms. By setting this to
+# 0 (disabled), we saw an outright 30% performance boost on the same
+# pgbench test. A fully cached scale 3500 database on a 72GB system went
+# from 67k TPS to 82k TPS with 900 client connections.
+
+- name: Set kernel.sched_autogroup_enabled to 0
+  sysctl:
+    name: kernel.sched_autogroup_enabled
+    value: 0
+    sysctl_file: /etc/sysctl.d/ansible.conf
+
+# Since Linux 5.13 this tunable is no longer a sysctl (it moved to debugfs as
+# /sys/kernel/debug/sched/migration_cost_ns), so only set it where it exists.
+- name: Check whether kernel.sched_migration_cost_ns is available
+  stat:
+    path: /proc/sys/kernel/sched_migration_cost_ns
+  register: sched_migration_cost_ns_sysctl
+
+- name: Set kernel.sched_migration_cost_ns to 5000000
+  sysctl:
+    name: kernel.sched_migration_cost_ns
+    value: 5000000
+    sysctl_file: /etc/sysctl.d/ansible.conf
+  when: sched_migration_cost_ns_sysctl.stat.exists
+
+# We use systemd-tmpfiles mechanism to write in pseudo filesystem
+# https://sleeplessbeastie.eu/2022/11/18/how-to-create-persistent-sysfs-configuration-using-systemd/
+# https://wiki.archlinux.org/title/Systemd#systemd-tmpfiles_-_temporary_files
+- name: Disable Transparent Huge Pages
+  copy:
+    src: 'systemd-tmpfiles.conf'
+    dest: '/etc/tmpfiles.d/thp.conf'
+    owner: root
+    group: root
+    mode: '0644'
+  notify:
+    - systemd-tmpfiles create
+
+# The memory is not allocated/reserved up front. The kernel will try to allocate
+# huge pages when it can, and simply gives up otherwise.
+# This works well at boot. Once the server is running and memory is used by the
+# cache or is fragmented, it will have a harder time finding contiguous blocks.
+- name: Allow 2MB huge pages up to 60% of the RAM
+  sysctl:
+    name: vm.nr_overcommit_hugepages
+    value: "{{ ( ansible_memtotal_mb * 0.6 / 2)|int }}"
+    sysctl_file: /etc/sysctl.d/ansible.conf
diff --git a/roles/common/tasks/sysstat.yml b/roles/common/tasks/sysstat.yml
index e43237af..3ca5cafe 100644
--- a/roles/common/tasks/sysstat.yml
+++ b/roles/common/tasks/sysstat.yml
@@ -1,8 +1,9 @@
-- name: install packages for sysstat
+- name: install packages for sysstat and atop
   apt: pkg={{ item }} update_cache=yes
   with_items:
     - sysstat
-    - xz
+    - xz-utils
+    - atop
   when: ansible_distribution == 'Debian' or ansible_distribution == 'Ubuntu'
 
 - name: Enable sysstat