Home Home > GIT Browse > SLE15-SP1-AZURE
summary refs log tree commit diff
diff options
context:
space:
mode:
author Petr Tesarik <ptesarik@suse.cz> 2019-06-21 21:04:37 +0200
committer Petr Tesarik <ptesarik@suse.cz> 2019-06-21 21:04:37 +0200
commit 39708dc5de7505e844b8e0a8265c582dbc65c428 (patch)
tree 5cc9e89bf4bea2e3bbf8ba71f8c5cd925a5158c4
parent 2b8119b142fb27db7a928cd7e54a570098dab11d (diff)
parent a47b8c82ec23855bea065ff3dbecf0cefb0a572c (diff)
Merge branch 'users/mfleming/SLE15-SP1/for-next' into SLE15-SP1
Pull AMD EPYC load balancing fix from Matt Fleming
-rw-r--r-- patches.fixes/sched-topology-Improve-load-balancing-on-AMD-EPYC.patch 144
-rw-r--r-- series.conf 3
2 files changed, 147 insertions, 0 deletions
diff --git a/patches.fixes/sched-topology-Improve-load-balancing-on-AMD-EPYC.patch b/patches.fixes/sched-topology-Improve-load-balancing-on-AMD-EPYC.patch
new file mode 100644
index 0000000000..7ae7d8e588
--- /dev/null
+++ b/patches.fixes/sched-topology-Improve-load-balancing-on-AMD-EPYC.patch
@@ -0,0 +1,144 @@
+From 75e83ec36bfc93bdafaf9dda2262b7ff006781cf Mon Sep 17 00:00:00 2001
+From: Matt Fleming <mfleming@suse.de>
+Date: Fri, 24 May 2019 22:11:42 +0100
+Subject: [PATCH] sched/topology: Improve load balancing on AMD EPYC
+Patch-mainline: Not yet, under discussion on LKML
+References: bsc#1137366
+
+SD_BALANCE_{FORK,EXEC} and SD_WAKE_AFFINE are stripped in sd_init()
+for any sched domains with a NUMA distance greater than 2 hops
+(RECLAIM_DISTANCE). The idea being that it's expensive to balance
+across domains that far apart.
+
+However, as is rather unfortunately explained in
+
+ commit 32e45ff43eaf ("mm: increase RECLAIM_DISTANCE to 30")
+
+the value for RECLAIM_DISTANCE is based on node distance tables from
+2011-era hardware.
+
+Current AMD EPYC machines have the following NUMA node distances:
+
+node distances:
+node 0 1 2 3 4 5 6 7
+ 0: 10 16 16 16 32 32 32 32
+ 1: 16 10 16 16 32 32 32 32
+ 2: 16 16 10 16 32 32 32 32
+ 3: 16 16 16 10 32 32 32 32
+ 4: 32 32 32 32 10 16 16 16
+ 5: 32 32 32 32 16 10 16 16
+ 6: 32 32 32 32 16 16 10 16
+ 7: 32 32 32 32 16 16 16 10
+
+where 2 hops is 32.
+
+The result is that the scheduler fails to load balance properly across
+NUMA nodes on different sockets -- 2 hops apart.
+
+For example, pinning 16 busy threads to NUMA nodes 0 (CPUs 0-7) and 4
+(CPUs 32-39) like so,
+
+ $ numactl -C 0-7,32-39 ./spinner 16
+
+causes all threads to fork and remain on node 0 until the active
+balancer kicks in after a few seconds and forcibly moves some threads
+to node 4.
+
+Override node_reclaim_distance for AMD Zen.
+
+Signed-off-by: Matt Fleming <mfleming@suse.de>
+---
+ arch/x86/kernel/cpu/amd.c | 5 +++++
+ include/linux/topology.h | 3 +++
+ kernel/sched/topology.c | 3 ++-
+ mm/khugepaged.c | 2 +-
+ mm/page_alloc.c | 2 +-
+ 5 files changed, 12 insertions(+), 3 deletions(-)
+
+diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
+index 57bb2100e05b..bb2f3e98efbf 100644
+--- a/arch/x86/kernel/cpu/amd.c
++++ b/arch/x86/kernel/cpu/amd.c
+@@ -7,6 +7,7 @@
+ #include <linux/sched.h>
+ #include <linux/sched/clock.h>
+ #include <linux/random.h>
++#include <linux/topology.h>
+ #include <asm/processor.h>
+ #include <asm/apic.h>
+ #include <asm/cacheinfo.h>
+@@ -812,6 +813,10 @@ static void init_amd_zn(struct cpuinfo_x86 *c)
+ {
+ set_cpu_cap(c, X86_FEATURE_ZEN);
+
++#ifdef CONFIG_NUMA
++ node_reclaim_distance = 32;
++#endif
++
+ /* Fix erratum 1076: CPB feature bit not being set in CPUID. */
+ if (!cpu_has(c, X86_FEATURE_CPB))
+ set_cpu_cap(c, X86_FEATURE_CPB);
+diff --git a/include/linux/topology.h b/include/linux/topology.h
+index cb0775e1ee4b..74b484354ac9 100644
+--- a/include/linux/topology.h
++++ b/include/linux/topology.h
+@@ -59,6 +59,9 @@ int arch_update_cpu_topology(void);
+ */
+ #define RECLAIM_DISTANCE 30
+ #endif
++
++extern int __read_mostly node_reclaim_distance;
++
+ #ifndef PENALTY_FOR_NODE_WITH_CPUS
+ #define PENALTY_FOR_NODE_WITH_CPUS (1)
+ #endif
+diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
+index 8b646058fb57..57b4afe6387a 100644
+--- a/kernel/sched/topology.c
++++ b/kernel/sched/topology.c
+@@ -1070,6 +1070,7 @@ static int *sched_domains_numa_distance;
+ int sched_max_numa_distance;
+ static struct cpumask ***sched_domains_numa_masks;
+ static int sched_domains_curr_level;
++int __read_mostly node_reclaim_distance = RECLAIM_DISTANCE;
+ #endif
+
+ /*
+@@ -1191,7 +1192,7 @@ sd_init(struct sched_domain_topology_level *tl,
+ sd->idle_idx = 2;
+
+ sd->flags |= SD_SERIALIZE;
+- if (sched_domains_numa_distance[tl->numa_level] > RECLAIM_DISTANCE) {
++ if (sched_domains_numa_distance[tl->numa_level] > node_reclaim_distance) {
+ sd->flags &= ~(SD_BALANCE_EXEC |
+ SD_BALANCE_FORK |
+ SD_WAKE_AFFINE);
+diff --git a/mm/khugepaged.c b/mm/khugepaged.c
+index 2c2813d90cb2..859f6fd0cb84 100644
+--- a/mm/khugepaged.c
++++ b/mm/khugepaged.c
+@@ -690,7 +690,7 @@ static bool khugepaged_scan_abort(int nid)
+ for (i = 0; i < MAX_NUMNODES; i++) {
+ if (!khugepaged_node_load[i])
+ continue;
+- if (node_distance(nid, i) > RECLAIM_DISTANCE)
++ if (node_distance(nid, i) > node_reclaim_distance)
+ return true;
+ }
+ return false;
+diff --git a/mm/page_alloc.c b/mm/page_alloc.c
+index b8ba38dc77f4..0c8b489c0f0c 100644
+--- a/mm/page_alloc.c
++++ b/mm/page_alloc.c
+@@ -3191,7 +3191,7 @@ bool zone_watermark_ok_safe(struct zone *z, unsigned int order,
+ static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
+ {
+ return node_distance(zone_to_nid(local_zone), zone_to_nid(zone)) <=
+- RECLAIM_DISTANCE;
++ node_reclaim_distance;
+ }
+ #else /* CONFIG_NUMA */
+ static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
+--
+2.13.7
+
diff --git a/series.conf b/series.conf
index dd3f36c609..94d35bd631 100644
--- a/series.conf
+++ b/series.conf
@@ -47315,6 +47315,9 @@
patches.suse/IBRS-forbid-shooting-in-foot.patch
patches.suse/do-not-default-to-ibrs-on-skl.patch
+ # bsc#1137366
+ patches.fixes/sched-topology-Improve-load-balancing-on-AMD-EPYC.patch
+
########################################################
# locking/core
########################################################