author     Michal Kubecek <mkubecek@suse.cz>  2019-11-05 21:24:01 +0100
committer  Michal Kubecek <mkubecek@suse.cz>  2019-11-05 21:24:01 +0100
commit     c7eada4a2549120e1c9b384a126922cbc8ebf80e (patch)
tree       156ab8c121e9189be483ef815678b2986821b9c7
parent     38547d2da3f6dc34253484eb83471ae832800a27 (diff)
parent     471d8373b17701ee909a2f453ebef61babaabddb (diff)
Merge branch 'users/mgorman/SLE15-SP2/for-next' into SLE15-SP2
Pull memory management backport from Mel Gorman.
-rw-r--r--  patches.suse/hugetlbfs-don-t-retry-when-pool-page-allocations-start-to-fail.patch  236
-rw-r--r--  patches.suse/mm-compaction-raise-compaction-priority-after-it-withdrawns.patch  127
-rw-r--r--  patches.suse/mm-compaction.c-remove-unnecessary-zone-parameter-in-isolate_migratepages.patch  74
-rw-r--r--  patches.suse/mm-filemap.c-don-t-initiate-writeback-if-mapping-has-no-dirty-pages.patch  51
-rw-r--r--  patches.suse/mm-filemap.c-rewrite-mapping_needs_writeback-in-less-fancy-manner.patch  46
-rw-r--r--  patches.suse/mm-mempolicy.c-remove-unnecessary-nodemask-check-in-kernel_migrate_pages.patch  50
-rw-r--r--  patches.suse/mm-reclaim-cleanup-should_continue_reclaim.patch  110
-rw-r--r--  patches.suse/mm-reclaim-make-should_continue_reclaim-perform-dryrun-detection.patch  118
-rw-r--r--  patches.suse/mm-replace-list_move_tail-with-add_page_to_lru_list_tail.patch  77
-rw-r--r--  patches.suse/mm-sl-aou-b-guarantee-natural-alignment-for-kmalloc-power-of-two.patch  259
-rw-r--r--  patches.suse/mm-sl-ou-b-improve-memory-accounting.patch  164
-rw-r--r--  patches.suse/mm-vmscan-do-not-share-cgroup-iteration-between-reclaimers.patch  110
-rw-r--r--  patches.suse/sched-topology-Improve-load-balancing-on-AMD-EPYC.patch  3
-rw-r--r--  series.conf  12
14 files changed, 1436 insertions, 1 deletions
diff --git a/patches.suse/hugetlbfs-don-t-retry-when-pool-page-allocations-start-to-fail.patch b/patches.suse/hugetlbfs-don-t-retry-when-pool-page-allocations-start-to-fail.patch
new file mode 100644
index 0000000000..f7aaea6ad0
--- /dev/null
+++ b/patches.suse/hugetlbfs-don-t-retry-when-pool-page-allocations-start-to-fail.patch
@@ -0,0 +1,236 @@
+From 43f79a33fd1a4f2cc051e47d8635fb0e4f18efe0 Mon Sep 17 00:00:00 2001
+From: Mike Kravetz <mike.kravetz@oracle.com>
+Date: Mon, 23 Sep 2019 15:37:35 -0700
+Subject: [PATCH] hugetlbfs: don't retry when pool page allocations start to
+ fail
+
+References: bnc#1155780 (VM/FS functional and performance backports)
+Patch-mainline: v5.4-rc1
+Git-commit: f60858f9d327c4dd0c432abe9ec943a83929c229
+
+When allocating hugetlbfs pool pages via /proc/sys/vm/nr_hugepages, the
+pages will be interleaved between all nodes of the system. If nodes are
+not equal, it is quite possible for one node to fill up before the others.
+When this happens, the code still attempts to allocate pages from the
+full node. This results in calls to direct reclaim and compaction which
+slow things down considerably.
+
+When allocating pool pages, note the state of the previous allocation for
+each node. If previous allocation failed, do not use the aggressive retry
+algorithm on successive attempts. The allocation will still succeed if
+there is memory available, but it will not try as hard to free up memory.
+
+Link: http://lkml.kernel.org/r/20190806014744.15446-5-mike.kravetz@oracle.com
+Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
+Acked-by: Vlastimil Babka <vbabka@suse.cz>
+Cc: Hillf Danton <hdanton@sina.com>
+Cc: Johannes Weiner <hannes@cmpxchg.org>
+Cc: Mel Gorman <mgorman@suse.de>
+Cc: Michal Hocko <mhocko@kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Mel Gorman <mgorman@suse.de>
+---
+ mm/hugetlb.c | 89 +++++++++++++++++++++++++++++++++++++++++++++++++++++-------
+ 1 file changed, 79 insertions(+), 10 deletions(-)
+
+diff --git a/mm/hugetlb.c b/mm/hugetlb.c
+index 843ee2f8d356..b45a95363a84 100644
+--- a/mm/hugetlb.c
++++ b/mm/hugetlb.c
+@@ -1404,12 +1404,25 @@ pgoff_t __basepage_index(struct page *page)
+ }
+
+ static struct page *alloc_buddy_huge_page(struct hstate *h,
+- gfp_t gfp_mask, int nid, nodemask_t *nmask)
++ gfp_t gfp_mask, int nid, nodemask_t *nmask,
++ nodemask_t *node_alloc_noretry)
+ {
+ int order = huge_page_order(h);
+ struct page *page;
++ bool alloc_try_hard = true;
+
+- gfp_mask |= __GFP_COMP|__GFP_RETRY_MAYFAIL|__GFP_NOWARN;
++ /*
++ * By default we always try hard to allocate the page with
++ * __GFP_RETRY_MAYFAIL flag. However, if we are allocating pages in
++ * a loop (to adjust global huge page counts) and previous allocation
++ * failed, do not continue to try hard on the same node. Use the
++ * node_alloc_noretry bitmap to manage this state information.
++ */
++ if (node_alloc_noretry && node_isset(nid, *node_alloc_noretry))
++ alloc_try_hard = false;
++ gfp_mask |= __GFP_COMP|__GFP_NOWARN;
++ if (alloc_try_hard)
++ gfp_mask |= __GFP_RETRY_MAYFAIL;
+ if (nid == NUMA_NO_NODE)
+ nid = numa_mem_id();
+ page = __alloc_pages_nodemask(gfp_mask, order, nid, nmask);
+@@ -1418,6 +1431,22 @@ static struct page *alloc_buddy_huge_page(struct hstate *h,
+ else
+ __count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);
+
++ /*
++ * If we did not specify __GFP_RETRY_MAYFAIL, but still got a page this
++ * indicates an overall state change. Clear bit so that we resume
++ * normal 'try hard' allocations.
++ */
++ if (node_alloc_noretry && page && !alloc_try_hard)
++ node_clear(nid, *node_alloc_noretry);
++
++ /*
++ * If we tried hard to get a page but failed, set bit so that
++ * subsequent attempts will not try as hard until there is an
++ * overall state change.
++ */
++ if (node_alloc_noretry && !page && alloc_try_hard)
++ node_set(nid, *node_alloc_noretry);
++
+ return page;
+ }
+
+@@ -1426,7 +1455,8 @@ static struct page *alloc_buddy_huge_page(struct hstate *h,
+ * should use this function to get new hugetlb pages
+ */
+ static struct page *alloc_fresh_huge_page(struct hstate *h,
+- gfp_t gfp_mask, int nid, nodemask_t *nmask)
++ gfp_t gfp_mask, int nid, nodemask_t *nmask,
++ nodemask_t *node_alloc_noretry)
+ {
+ struct page *page;
+
+@@ -1434,7 +1464,7 @@ static struct page *alloc_fresh_huge_page(struct hstate *h,
+ page = alloc_gigantic_page(h, gfp_mask, nid, nmask);
+ else
+ page = alloc_buddy_huge_page(h, gfp_mask,
+- nid, nmask);
++ nid, nmask, node_alloc_noretry);
+ if (!page)
+ return NULL;
+
+@@ -1449,14 +1479,16 @@ static struct page *alloc_fresh_huge_page(struct hstate *h,
+ * Allocates a fresh page to the hugetlb allocator pool in the node interleaved
+ * manner.
+ */
+-static int alloc_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed)
++static int alloc_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed,
++ nodemask_t *node_alloc_noretry)
+ {
+ struct page *page;
+ int nr_nodes, node;
+ gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE;
+
+ for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) {
+- page = alloc_fresh_huge_page(h, gfp_mask, node, nodes_allowed);
++ page = alloc_fresh_huge_page(h, gfp_mask, node, nodes_allowed,
++ node_alloc_noretry);
+ if (page)
+ break;
+ }
+@@ -1600,7 +1632,7 @@ static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask,
+ goto out_unlock;
+ spin_unlock(&hugetlb_lock);
+
+- page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask);
++ page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask, NULL);
+ if (!page)
+ return NULL;
+
+@@ -1636,7 +1668,7 @@ struct page *alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask,
+ if (hstate_is_gigantic(h))
+ return NULL;
+
+- page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask);
++ page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask, NULL);
+ if (!page)
+ return NULL;
+
+@@ -2206,13 +2238,33 @@ static void __init gather_bootmem_prealloc(void)
+ static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
+ {
+ unsigned long i;
++ nodemask_t *node_alloc_noretry;
++
++ if (!hstate_is_gigantic(h)) {
++ /*
++ * Bit mask controlling how hard we retry per-node allocations.
++ * Ignore errors as lower level routines can deal with
++ * node_alloc_noretry == NULL. If this kmalloc fails at boot
++ * time, we are likely in bigger trouble.
++ */
++ node_alloc_noretry = kmalloc(sizeof(*node_alloc_noretry),
++ GFP_KERNEL);
++ } else {
++ /* allocations done at boot time */
++ node_alloc_noretry = NULL;
++ }
++
++ /* bit mask controlling how hard we retry per-node allocations */
++ if (node_alloc_noretry)
++ nodes_clear(*node_alloc_noretry);
+
+ for (i = 0; i < h->max_huge_pages; ++i) {
+ if (hstate_is_gigantic(h)) {
+ if (!alloc_bootmem_huge_page(h))
+ break;
+ } else if (!alloc_pool_huge_page(h,
+- &node_states[N_MEMORY]))
++ &node_states[N_MEMORY],
++ node_alloc_noretry))
+ break;
+ cond_resched();
+ }
+@@ -2224,6 +2276,8 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
+ h->max_huge_pages, buf, i);
+ h->max_huge_pages = i;
+ }
++
++ kfree(node_alloc_noretry);
+ }
+
+ static void __init hugetlb_init_hstates(void)
+@@ -2322,6 +2376,17 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid,
+ nodemask_t *nodes_allowed)
+ {
+ unsigned long min_count, ret;
++ NODEMASK_ALLOC(nodemask_t, node_alloc_noretry, GFP_KERNEL);
++
++ /*
++ * Bit mask controlling how hard we retry per-node allocations.
++ * If we can not allocate the bit mask, do not attempt to allocate
++ * the requested huge pages.
++ */
++ if (node_alloc_noretry)
++ nodes_clear(*node_alloc_noretry);
++ else
++ return -ENOMEM;
+
+ spin_lock(&hugetlb_lock);
+
+@@ -2355,6 +2420,7 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid,
+ if (hstate_is_gigantic(h) && !IS_ENABLED(CONFIG_CONTIG_ALLOC)) {
+ if (count > persistent_huge_pages(h)) {
+ spin_unlock(&hugetlb_lock);
++ NODEMASK_FREE(node_alloc_noretry);
+ return -EINVAL;
+ }
+ /* Fall through to decrease pool */
+@@ -2387,7 +2453,8 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid,
+ /* yield cpu to avoid soft lockup */
+ cond_resched();
+
+- ret = alloc_pool_huge_page(h, nodes_allowed);
++ ret = alloc_pool_huge_page(h, nodes_allowed,
++ node_alloc_noretry);
+ spin_lock(&hugetlb_lock);
+ if (!ret)
+ goto out;
+@@ -2428,6 +2495,8 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid,
+ h->max_huge_pages = persistent_huge_pages(h);
+ spin_unlock(&hugetlb_lock);
+
++ NODEMASK_FREE(node_alloc_noretry);
++
+ return 0;
+ }
+
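The core idea of the hugetlbfs patch above is a per-node throttle: after a hard allocation attempt fails on a node, remember that in a bitmap and stop passing __GFP_RETRY_MAYFAIL for that node until an allocation succeeds there again. Below is a minimal userspace C sketch of that state machine, not the kernel code; the node ids, the two fake allocators and the printed mask are invented stand-ins for the kernel's per-node allocation paths.

#include <stdbool.h>
#include <stdio.h>

static unsigned long node_alloc_noretry;	/* one bit per node, as in the patch */

/* Fake allocators: trying hard is expensive but more likely to succeed. */
static bool try_hard_alloc(int nid) { return nid != 1; }	/* pretend node 1 is full */
static bool cheap_alloc(int nid)    { (void)nid; return false; }

static bool alloc_one_page(int nid)
{
	bool try_hard = !(node_alloc_noretry & (1UL << nid));
	bool ok = try_hard ? try_hard_alloc(nid) : cheap_alloc(nid);

	if (ok && !try_hard)
		node_alloc_noretry &= ~(1UL << nid);	/* node recovered: try hard again */
	else if (!ok && try_hard)
		node_alloc_noretry |= 1UL << nid;	/* stop trying hard on this node */

	return ok;
}

int main(void)
{
	for (int round = 0; round < 3; round++)
		for (int nid = 0; nid < 3; nid++)
			printf("round %d node %d: %s (noretry mask %#lx)\n", round, nid,
			       alloc_one_page(nid) ? "ok" : "fail", node_alloc_noretry);
	return 0;
}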
diff --git a/patches.suse/mm-compaction-raise-compaction-priority-after-it-withdrawns.patch b/patches.suse/mm-compaction-raise-compaction-priority-after-it-withdrawns.patch
new file mode 100644
index 0000000000..0746ca53d7
--- /dev/null
+++ b/patches.suse/mm-compaction-raise-compaction-priority-after-it-withdrawns.patch
@@ -0,0 +1,127 @@
+From 9d069226d196effc5b873f24ebcd70f63338a698 Mon Sep 17 00:00:00 2001
+From: Vlastimil Babka <vbabka@suse.cz>
+Date: Mon, 23 Sep 2019 15:37:32 -0700
+Subject: [PATCH] mm, compaction: raise compaction priority after it withdrawns
+
+References: bnc#1155780 (VM/FS functional and performance backports)
+Patch-mainline: v5.4-rc1
+Git-commit: 494330855641269c8a49f1580f0d4e2ead693245
+
+Mike Kravetz reports that "hugetlb allocations could stall for minutes or
+hours when should_compact_retry() would return true more often than it
+should. Specifically, this was in the case where compact_result was
+COMPACT_DEFERRED and COMPACT_PARTIAL_SKIPPED and no progress was being
+made."
+
+The problem is that the compaction_withdrawn() test in
+should_compact_retry() includes compaction outcomes that are only possible
+on low compaction priority, and results in a retry without increasing the
+priority. This may result in further reclaim, and more incomplete
+compaction attempts.
+
+With this patch, compaction priority is raised when possible, or
+should_compact_retry() returns false.
+
+The COMPACT_SKIPPED result doesn't really fit together with the other
+outcomes in compaction_withdrawn(), as that's a result caused by
+insufficient order-0 pages, not due to low compaction priority. With this
+patch, it is moved to a new compaction_needs_reclaim() function, and for
+that outcome we keep the current logic of retrying if it looks like
+reclaim will be able to help.
+
+Link: http://lkml.kernel.org/r/20190806014744.15446-4-mike.kravetz@oracle.com
+Reported-by: Mike Kravetz <mike.kravetz@oracle.com>
+Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
+Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
+Tested-by: Mike Kravetz <mike.kravetz@oracle.com>
+Cc: Hillf Danton <hdanton@sina.com>
+Cc: Johannes Weiner <hannes@cmpxchg.org>
+Cc: Mel Gorman <mgorman@suse.de>
+Cc: Michal Hocko <mhocko@kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Mel Gorman <mgorman@suse.de>
+---
+ include/linux/compaction.h | 22 +++++++++++++++++-----
+ mm/page_alloc.c | 16 ++++++++++++----
+ 2 files changed, 29 insertions(+), 9 deletions(-)
+
+diff --git a/include/linux/compaction.h b/include/linux/compaction.h
+index 9569e7c786d3..4b898cdbdf05 100644
+--- a/include/linux/compaction.h
++++ b/include/linux/compaction.h
+@@ -129,11 +129,8 @@ static inline bool compaction_failed(enum compact_result result)
+ return false;
+ }
+
+-/*
+- * Compaction has backed off for some reason. It might be throttling or
+- * lock contention. Retrying is still worthwhile.
+- */
+-static inline bool compaction_withdrawn(enum compact_result result)
++/* Compaction needs reclaim to be performed first, so it can continue. */
++static inline bool compaction_needs_reclaim(enum compact_result result)
+ {
+ /*
+ * Compaction backed off due to watermark checks for order-0
+@@ -142,6 +139,16 @@ static inline bool compaction_withdrawn(enum compact_result result)
+ if (result == COMPACT_SKIPPED)
+ return true;
+
++ return false;
++}
++
++/*
++ * Compaction has backed off for some reason after doing some work or none
++ * at all. It might be throttling or lock contention. Retrying might be still
++ * worthwhile, but with a higher priority if allowed.
++ */
++static inline bool compaction_withdrawn(enum compact_result result)
++{
+ /*
+ * If compaction is deferred for high-order allocations, it is
+ * because sync compaction recently failed. If this is the case
+@@ -207,6 +214,11 @@ static inline bool compaction_failed(enum compact_result result)
+ return false;
+ }
+
++static inline bool compaction_needs_reclaim(enum compact_result result)
++{
++ return false;
++}
++
+ static inline bool compaction_withdrawn(enum compact_result result)
+ {
+ return true;
+diff --git a/mm/page_alloc.c b/mm/page_alloc.c
+index 0303a117702b..47338f00f994 100644
+--- a/mm/page_alloc.c
++++ b/mm/page_alloc.c
+@@ -3968,15 +3968,23 @@ should_compact_retry(struct alloc_context *ac, int order, int alloc_flags,
+ if (compaction_failed(compact_result))
+ goto check_priority;
+
++ /*
++ * compaction was skipped because there are not enough order-0 pages
++ * to work with, so we retry only if it looks like reclaim can help.
++ */
++ if (compaction_needs_reclaim(compact_result)) {
++ ret = compaction_zonelist_suitable(ac, order, alloc_flags);
++ goto out;
++ }
++
+ /*
+ * make sure the compaction wasn't deferred or didn't bail out early
+ * due to locks contention before we declare that we should give up.
+- * But do not retry if the given zonelist is not suitable for
+- * compaction.
++ * But the next retry should use a higher priority if allowed, so
++ * we don't just keep bailing out endlessly.
+ */
+ if (compaction_withdrawn(compact_result)) {
+- ret = compaction_zonelist_suitable(ac, order, alloc_flags);
+- goto out;
++ goto check_priority;
+ }
+
+ /*
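As a rough, non-authoritative sketch of the retry policy after this patch: COMPACT_SKIPPED means compaction lacked order-0 pages and is only worth retrying if reclaim can help, while the withdrawn outcomes should raise compaction priority rather than retry at the same one. The enum values and the retry loop below are simplified stand-ins for the kernel's should_compact_retry()/check_priority logic, compiled as plain userspace C.

#include <stdbool.h>
#include <stdio.h>

enum compact_result { COMPACT_SKIPPED, COMPACT_DEFERRED, COMPACT_PARTIAL_SKIPPED, COMPACT_COMPLETE };
enum compact_priority { COMPACT_PRIO_SYNC_FULL, COMPACT_PRIO_SYNC_LIGHT, COMPACT_PRIO_ASYNC };

static bool compaction_needs_reclaim(enum compact_result r) { return r == COMPACT_SKIPPED; }
static bool compaction_withdrawn(enum compact_result r)
{
	return r == COMPACT_DEFERRED || r == COMPACT_PARTIAL_SKIPPED;
}

/* Stand-in for compaction_zonelist_suitable(): assume reclaim could help. */
static bool reclaim_could_help(void) { return true; }

static bool should_retry(enum compact_result r, int *prio)
{
	if (compaction_needs_reclaim(r))
		return reclaim_could_help();	/* retry only if reclaim can add order-0 pages */

	if (compaction_withdrawn(r)) {
		if (*prio > COMPACT_PRIO_SYNC_FULL) {
			(*prio)--;		/* raise priority instead of retrying as-is */
			return true;
		}
		return false;			/* already at highest priority: give up */
	}
	return false;
}

int main(void)
{
	int prio = COMPACT_PRIO_ASYNC;	/* lower value = higher priority */

	while (should_retry(COMPACT_DEFERRED, &prio))
		printf("retrying compaction at priority %d\n", prio);
	printf("giving up at priority %d\n", prio);
	return 0;
}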
diff --git a/patches.suse/mm-compaction.c-remove-unnecessary-zone-parameter-in-isolate_migratepages.patch b/patches.suse/mm-compaction.c-remove-unnecessary-zone-parameter-in-isolate_migratepages.patch
new file mode 100644
index 0000000000..5c2ccf1f8c
--- /dev/null
+++ b/patches.suse/mm-compaction.c-remove-unnecessary-zone-parameter-in-isolate_migratepages.patch
@@ -0,0 +1,74 @@
+From 939a91bfa02ae2f24243a6d933396c6a8d3421a6 Mon Sep 17 00:00:00 2001
+From: Pengfei Li <lpf.vector@gmail.com>
+Date: Mon, 23 Sep 2019 15:36:58 -0700
+Subject: [PATCH] mm/compaction.c: remove unnecessary zone parameter in
+ isolate_migratepages()
+
+References: bnc#1155780 (VM/FS functional and performance backports)
+Patch-mainline: v5.4-rc1
+Git-commit: 32aaf0553df99cc4314f6e9f43216cd83afc6c20
+
+Like commit 40cacbcb3240 ("mm, compaction: remove unnecessary zone
+parameter in some instances"), remove unnecessary zone parameter.
+
+No functional change.
+
+Link: http://lkml.kernel.org/r/20190806151616.21107-1-lpf.vector@gmail.com
+Signed-off-by: Pengfei Li <lpf.vector@gmail.com>
+Reviewed-by: Andrew Morton <akpm@linux-foundation.org>
+Acked-by: Vlastimil Babka <vbabka@suse.cz>
+Cc: Mel Gorman <mgorman@suse.de>
+Cc: Qian Cai <cai@lca.pw>
+Cc: Andrey Ryabinin <aryabinin@virtuozzo.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Mel Gorman <mgorman@suse.de>
+---
+ mm/compaction.c | 13 ++++++-------
+ 1 file changed, 6 insertions(+), 7 deletions(-)
+
+diff --git a/mm/compaction.c b/mm/compaction.c
+index 5ab9c2b22693..fc02493e7d92 100644
+--- a/mm/compaction.c
++++ b/mm/compaction.c
+@@ -1738,8 +1738,7 @@ static unsigned long fast_find_migrateblock(struct compact_control *cc)
+ * starting at the block pointed to by the migrate scanner pfn within
+ * compact_control.
+ */
+-static isolate_migrate_t isolate_migratepages(struct zone *zone,
+- struct compact_control *cc)
++static isolate_migrate_t isolate_migratepages(struct compact_control *cc)
+ {
+ unsigned long block_start_pfn;
+ unsigned long block_end_pfn;
+@@ -1757,8 +1756,8 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
+ */
+ low_pfn = fast_find_migrateblock(cc);
+ block_start_pfn = pageblock_start_pfn(low_pfn);
+- if (block_start_pfn < zone->zone_start_pfn)
+- block_start_pfn = zone->zone_start_pfn;
++ if (block_start_pfn < cc->zone->zone_start_pfn)
++ block_start_pfn = cc->zone->zone_start_pfn;
+
+ /*
+ * fast_find_migrateblock marks a pageblock skipped so to avoid
+@@ -1788,8 +1787,8 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
+ if (!(low_pfn % (SWAP_CLUSTER_MAX * pageblock_nr_pages)))
+ cond_resched();
+
+- page = pageblock_pfn_to_page(block_start_pfn, block_end_pfn,
+- zone);
++ page = pageblock_pfn_to_page(block_start_pfn,
++ block_end_pfn, cc->zone);
+ if (!page)
+ continue;
+
+@@ -2170,7 +2169,7 @@ compact_zone(struct compact_control *cc, struct capture_control *capc)
+ cc->rescan = true;
+ }
+
+- switch (isolate_migratepages(cc->zone, cc)) {
++ switch (isolate_migratepages(cc)) {
+ case ISOLATE_ABORT:
+ ret = COMPACT_CONTENDED;
+ putback_movable_pages(&cc->migratepages);
diff --git a/patches.suse/mm-filemap.c-don-t-initiate-writeback-if-mapping-has-no-dirty-pages.patch b/patches.suse/mm-filemap.c-don-t-initiate-writeback-if-mapping-has-no-dirty-pages.patch
new file mode 100644
index 0000000000..b09417bec0
--- /dev/null
+++ b/patches.suse/mm-filemap.c-don-t-initiate-writeback-if-mapping-has-no-dirty-pages.patch
@@ -0,0 +1,51 @@
+From 85851c6f616f137979a8e9e4ee47894f9413e79e Mon Sep 17 00:00:00 2001
+From: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
+Date: Mon, 23 Sep 2019 15:34:45 -0700
+Subject: [PATCH] mm/filemap.c: don't initiate writeback if mapping has no
+ dirty pages
+
+References: bnc#1155780 (VM/FS functional and performance backports)
+Patch-mainline: v5.4-rc1
+Git-commit: c3aab9a0bd91b696a852169479b7db1ece6cbf8c
+
+Functions like filemap_write_and_wait_range() should do nothing if inode
+has no dirty pages or pages currently under writeback. But they anyway
+construct struct writeback_control and this does some atomic operations if
+CONFIG_CGROUP_WRITEBACK=y - on fast path it locks inode->i_lock and
+updates state of writeback ownership, on slow path might be more work.
+Currently this path is safely avoided only when the inode mapping has no pages.
+
+For example generic_file_read_iter() calls filemap_write_and_wait_range()
+at each O_DIRECT read - pretty hot path.
+
+This patch skips starting new writeback if mapping has no dirty tags set.
+If writeback is already in progress filemap_write_and_wait_range() will
+wait for it.
+
+Link: http://lkml.kernel.org/r/156378816804.1087.8607636317907921438.stgit@buzz
+Signed-off-by: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
+Reviewed-by: Jan Kara <jack@suse.cz>
+Cc: Tejun Heo <tj@kernel.org>
+Cc: Jens Axboe <axboe@kernel.dk>
+Cc: Johannes Weiner <hannes@cmpxchg.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Mel Gorman <mgorman@suse.de>
+---
+ mm/filemap.c | 3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+diff --git a/mm/filemap.c b/mm/filemap.c
+index d0cf700bf201..d9572593e5c7 100644
+--- a/mm/filemap.c
++++ b/mm/filemap.c
+@@ -408,7 +408,8 @@ int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
+ .range_end = end,
+ };
+
+- if (!mapping_cap_writeback_dirty(mapping))
++ if (!mapping_cap_writeback_dirty(mapping) ||
++ !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
+ return 0;
+
+ wbc_attach_fdatawrite_inode(&wbc, mapping->host);
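The optimization above is a cheap-check-before-expensive-setup guard: test whether any page is tagged dirty before constructing the writeback machinery. A small userspace C illustration follows, with the mapping structure and both helpers invented for the example; the kernel checks the PAGECACHE_TAG_DIRTY tag rather than a boolean field.

#include <stdbool.h>
#include <stdio.h>

struct mapping {
	bool cap_writeback_dirty;	/* backing device supports writeback */
	bool has_dirty_tag;		/* any page tagged dirty in the mapping */
};

static int start_writeback(struct mapping *m)
{
	(void)m;
	/* Stands in for building writeback_control and walking dirty pages. */
	printf("expensive writeback setup\n");
	return 0;
}

static int fdatawrite_range(struct mapping *m)
{
	if (!m->cap_writeback_dirty || !m->has_dirty_tag)
		return 0;		/* nothing dirty: skip all the setup */
	return start_writeback(m);
}

int main(void)
{
	struct mapping clean = { .cap_writeback_dirty = true, .has_dirty_tag = false };
	struct mapping dirty = { .cap_writeback_dirty = true, .has_dirty_tag = true };

	fdatawrite_range(&clean);	/* returns immediately, no setup cost */
	fdatawrite_range(&dirty);	/* pays for the real work */
	return 0;
}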
diff --git a/patches.suse/mm-filemap.c-rewrite-mapping_needs_writeback-in-less-fancy-manner.patch b/patches.suse/mm-filemap.c-rewrite-mapping_needs_writeback-in-less-fancy-manner.patch
new file mode 100644
index 0000000000..3c15a75d9f
--- /dev/null
+++ b/patches.suse/mm-filemap.c-rewrite-mapping_needs_writeback-in-less-fancy-manner.patch
@@ -0,0 +1,46 @@
+From 2f7f5df4107b7015113d8bedd31b4bbcca6ee61f Mon Sep 17 00:00:00 2001
+From: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
+Date: Mon, 23 Sep 2019 15:34:48 -0700
+Subject: [PATCH] mm/filemap.c: rewrite mapping_needs_writeback in less fancy
+ manner
+
+References: bnc#1155780 (VM/FS functional and performance backports)
+Patch-mainline: v5.4-rc1
+Git-commit: 875d91b11a201276ac3a9ab79f8b0fa3dc4ee8fd
+
+This actually checks that writeback is needed or in progress.
+
+Link: http://lkml.kernel.org/r/156378817069.1087.1302816672037672488.stgit@buzz
+Signed-off-by: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
+Reviewed-by: Andrew Morton <akpm@linux-foundation.org>
+Cc: Tejun Heo <tj@kernel.org>
+Cc: Jens Axboe <axboe@kernel.dk>
+Cc: Johannes Weiner <hannes@cmpxchg.org>
+Cc: Jan Kara <jack@suse.cz>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Mel Gorman <mgorman@suse.de>
+---
+ mm/filemap.c | 7 +++++--
+ 1 file changed, 5 insertions(+), 2 deletions(-)
+
+diff --git a/mm/filemap.c b/mm/filemap.c
+index d9572593e5c7..29f503ffd70b 100644
+--- a/mm/filemap.c
++++ b/mm/filemap.c
+@@ -618,10 +618,13 @@ int filemap_fdatawait_keep_errors(struct address_space *mapping)
+ }
+ EXPORT_SYMBOL(filemap_fdatawait_keep_errors);
+
++/* Returns true if writeback might be needed or already in progress. */
+ static bool mapping_needs_writeback(struct address_space *mapping)
+ {
+- return (!dax_mapping(mapping) && mapping->nrpages) ||
+- (dax_mapping(mapping) && mapping->nrexceptional);
++ if (dax_mapping(mapping))
++ return mapping->nrexceptional;
++
++ return mapping->nrpages;
+ }
+
+ int filemap_write_and_wait(struct address_space *mapping)
diff --git a/patches.suse/mm-mempolicy.c-remove-unnecessary-nodemask-check-in-kernel_migrate_pages.patch b/patches.suse/mm-mempolicy.c-remove-unnecessary-nodemask-check-in-kernel_migrate_pages.patch
new file mode 100644
index 0000000000..bfe86ca92d
--- /dev/null
+++ b/patches.suse/mm-mempolicy.c-remove-unnecessary-nodemask-check-in-kernel_migrate_pages.patch
@@ -0,0 +1,50 @@
+From 631953991185130a4a0fa7c643e8a8effe7f732d Mon Sep 17 00:00:00 2001
+From: Kefeng Wang <wangkefeng.wang@huawei.com>
+Date: Mon, 23 Sep 2019 15:37:01 -0700
+Subject: [PATCH] mm/mempolicy.c: remove unnecessary nodemask check in
+ kernel_migrate_pages()
+
+References: bnc#1155780 (VM/FS functional and performance backports)
+Patch-mainline: v5.4-rc1
+Git-commit: 4406548ee39c2268876b61a927acad45da6f9aef
+
+1) task_nodes = cpuset_mems_allowed(current);
+ -> cpuset_mems_allowed() guaranteed to return some non-empty
+ subset of node_states[N_MEMORY].
+
+2) nodes_and(*new, *new, task_nodes);
+ -> after nodes_and(), the 'new' should be empty or appropriate
+ nodemask(online node and with memory).
+
+After 1) and 2), we could remove unnecessary check whether the 'new'
+AND node_states[N_MEMORY] is empty.
+
+Link: http://lkml.kernel.org/r/20190806023634.55356-1-wangkefeng.wang@huawei.com
+Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
+Acked-by: Vlastimil Babka <vbabka@suse.cz>
+Cc: Andrea Arcangeli <aarcange@redhat.com>
+Cc: Dan Williams <dan.j.williams@intel.com>
+Cc: Michal Hocko <mhocko@suse.com>
+Cc: Oscar Salvador <osalvador@suse.de>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Mel Gorman <mgorman@suse.de>
+---
+ mm/mempolicy.c | 4 ----
+ 1 file changed, 4 deletions(-)
+
+diff --git a/mm/mempolicy.c b/mm/mempolicy.c
+index 65e0874fce17..5ff884dd25b0 100644
+--- a/mm/mempolicy.c
++++ b/mm/mempolicy.c
+@@ -1513,10 +1513,6 @@ static int kernel_migrate_pages(pid_t pid, unsigned long maxnode,
+ if (nodes_empty(*new))
+ goto out_put;
+
+- nodes_and(*new, *new, node_states[N_MEMORY]);
+- if (nodes_empty(*new))
+- goto out_put;
+-
+ err = security_task_movememory(task);
+ if (err)
+ goto out_put;
diff --git a/patches.suse/mm-reclaim-cleanup-should_continue_reclaim.patch b/patches.suse/mm-reclaim-cleanup-should_continue_reclaim.patch
new file mode 100644
index 0000000000..3f34721893
--- /dev/null
+++ b/patches.suse/mm-reclaim-cleanup-should_continue_reclaim.patch
@@ -0,0 +1,110 @@
+From 5e5a9a63a7a03fe146f2fc2ca759fba11c085d74 Mon Sep 17 00:00:00 2001
+From: Vlastimil Babka <vbabka@suse.cz>
+Date: Mon, 23 Sep 2019 15:37:29 -0700
+Subject: [PATCH] mm, reclaim: cleanup should_continue_reclaim()
+
+References: bnc#1155780 (VM/FS functional and performance backports)
+Patch-mainline: v5.4-rc1
+Git-commit: 5ee04716c46ce58989b1256a98af1af89f385db8
+
+After commit "mm, reclaim: make should_continue_reclaim perform dryrun
+detection", closer look at the function shows, that nr_reclaimed == 0
+means the function will always return false. And since non-zero
+nr_reclaimed implies non_zero nr_scanned, testing nr_scanned serves no
+purpose, and so does the testing for __GFP_RETRY_MAYFAIL.
+
+This patch thus cleans up the function to test only !nr_reclaimed upfront,
+and remove the __GFP_RETRY_MAYFAIL test and nr_scanned parameter
+completely. Comment is also updated, explaining that approximating "full
+LRU list has been scanned" with nr_scanned == 0 didn't really work.
+
+Link: http://lkml.kernel.org/r/20190806014744.15446-3-mike.kravetz@oracle.com
+Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
+Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
+Acked-by: Mike Kravetz <mike.kravetz@oracle.com>
+Cc: Hillf Danton <hdanton@sina.com>
+Cc: Johannes Weiner <hannes@cmpxchg.org>
+Cc: Mel Gorman <mgorman@suse.de>
+Cc: Michal Hocko <mhocko@kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Mel Gorman <mgorman@suse.de>
+---
+ mm/vmscan.c | 43 ++++++++++++++-----------------------------
+ 1 file changed, 14 insertions(+), 29 deletions(-)
+
+diff --git a/mm/vmscan.c b/mm/vmscan.c
+index c4be05cc681d..0a6a3dfa81c3 100644
+--- a/mm/vmscan.c
++++ b/mm/vmscan.c
+@@ -2587,7 +2587,6 @@ static bool in_reclaim_compaction(struct scan_control *sc)
+ */
+ static inline bool should_continue_reclaim(struct pglist_data *pgdat,
+ unsigned long nr_reclaimed,
+- unsigned long nr_scanned,
+ struct scan_control *sc)
+ {
+ unsigned long pages_for_compaction;
+@@ -2598,28 +2597,18 @@ static inline bool should_continue_reclaim(struct pglist_data *pgdat,
+ if (!in_reclaim_compaction(sc))
+ return false;
+
+- /* Consider stopping depending on scan and reclaim activity */
+- if (sc->gfp_mask & __GFP_RETRY_MAYFAIL) {
+- /*
+- * For __GFP_RETRY_MAYFAIL allocations, stop reclaiming if the
+- * full LRU list has been scanned and we are still failing
+- * to reclaim pages. This full LRU scan is potentially
+- * expensive but a __GFP_RETRY_MAYFAIL caller really wants to succeed
+- */
+- if (!nr_reclaimed && !nr_scanned)
+- return false;
+- } else {
+- /*
+- * For non-__GFP_RETRY_MAYFAIL allocations which can presumably
+- * fail without consequence, stop if we failed to reclaim
+- * any pages from the last SWAP_CLUSTER_MAX number of
+- * pages that were scanned. This will return to the
+- * caller faster at the risk reclaim/compaction and
+- * the resulting allocation attempt fails
+- */
+- if (!nr_reclaimed)
+- return false;
+- }
++ /*
++ * Stop if we failed to reclaim any pages from the last SWAP_CLUSTER_MAX
++ * number of pages that were scanned. This will return to the caller
++ * with the risk reclaim/compaction and the resulting allocation attempt
++ * fails. In the past we have tried harder for __GFP_RETRY_MAYFAIL
++ * allocations through requiring that the full LRU list has been scanned
++ * first, by assuming that zero delta of sc->nr_scanned means full LRU
++ * scan, but that approximation was wrong, and there were corner cases
++ * where always a non-zero amount of pages were scanned.
++ */
++ if (!nr_reclaimed)
++ return false;
+
+ /* If compaction would go ahead or the allocation would succeed, stop */
+ for (z = 0; z <= sc->reclaim_idx; z++) {
+@@ -2646,11 +2635,7 @@ static inline bool should_continue_reclaim(struct pglist_data *pgdat,
+ if (get_nr_swap_pages() > 0)
+ inactive_lru_pages += node_page_state(pgdat, NR_INACTIVE_ANON);
+
+- return inactive_lru_pages > pages_for_compaction &&
+- /*
+- * avoid dryrun with plenty of inactive pages
+- */
+- nr_scanned && nr_reclaimed;
++ return inactive_lru_pages > pages_for_compaction;
+ }
+
+ static bool pgdat_memcg_congested(pg_data_t *pgdat, struct mem_cgroup *memcg)
+@@ -2795,7 +2780,7 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
+ wait_iff_congested(BLK_RW_ASYNC, HZ/10);
+
+ } while (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed,
+- sc->nr_scanned - nr_scanned, sc));
++ sc));
+
+ /*
+ * Kswapd gives up on balancing particular nodes after too
diff --git a/patches.suse/mm-reclaim-make-should_continue_reclaim-perform-dryrun-detection.patch b/patches.suse/mm-reclaim-make-should_continue_reclaim-perform-dryrun-detection.patch
new file mode 100644
index 0000000000..8b18ae4e0a
--- /dev/null
+++ b/patches.suse/mm-reclaim-make-should_continue_reclaim-perform-dryrun-detection.patch
@@ -0,0 +1,118 @@
+From a9b39a7419c43d789fe30364f9fb322215f1e60b Mon Sep 17 00:00:00 2001
+From: Hillf Danton <hdanton@sina.com>
+Date: Mon, 23 Sep 2019 15:37:26 -0700
+Subject: [PATCH] mm, reclaim: make should_continue_reclaim perform dryrun
+ detection
+
+References: bnc#1155780 (VM/FS functional and performance backports)
+Patch-mainline: v5.4-rc1
+Git-commit: 1c6c15971e4709953f75082a5d44212536b1c2b7
+
+Patch series "address hugetlb page allocation stalls", v2.
+
+Allocation of hugetlb pages via sysctl or procfs can stall for minutes or
+hours. A simple example on a two node system with 8GB of memory is as
+follows:
+
+echo 4096 > /sys/devices/system/node/node1/hugepages/hugepages-2048kB/nr_hugepages
+echo 4096 > /proc/sys/vm/nr_hugepages
+
+Obviously, both allocation attempts will fall short of their 8GB goal.
+However, one or both of these commands may stall and not be interruptible.
+The issues were initially discussed in mail thread [1] and RFC code at
+[2].
+
+This series addresses the issues causing the stalls. There are two
+distinct fixes, a cleanup, and an optimization. The reclaim patch by
+Hillf and compaction patch by Vlastimil address corner cases in their
+respective areas. hugetlb page allocation could stall due to either of
+these issues. Vlastimil added a cleanup patch after Hillf's
+modifications. The hugetlb patch by Mike is an optimization suggested
+during the debug and development process.
+
+[1] http://lkml.kernel.org/r/d38a095e-dc39-7e82-bb76-2c9247929f07@oracle.com
+[2] http://lkml.kernel.org/r/20190724175014.9935-1-mike.kravetz@oracle.com
+
+This patch (of 4):
+
+Address the issue of should_continue_reclaim returning true too often for
+__GFP_RETRY_MAYFAIL attempts when !nr_reclaimed and nr_scanned. This was
+observed during hugetlb page allocation causing stalls for minutes or
+hours.
+
+We can stop reclaiming pages if compaction reports it can make a progress.
+There might be side-effects for other high-order allocations that would
+potentially benefit from reclaiming more before compaction so that they
+would be faster and less likely to stall. However, the consequences of
+premature/over-reclaim are considered worse.
+
+We can also bail out of reclaiming pages if we know that there are not
+enough inactive lru pages left to satisfy the costly allocation.
+
+We can give up reclaiming pages too if we see dryrun occur, with the
+certainty of plenty of inactive pages. IOW with dryrun detected, we are
+sure we have reclaimed as many pages as we could.
+
+Link: http://lkml.kernel.org/r/20190806014744.15446-2-mike.kravetz@oracle.com
+Signed-off-by: Hillf Danton <hdanton@sina.com>
+Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
+Tested-by: Mike Kravetz <mike.kravetz@oracle.com>
+Acked-by: Mel Gorman <mgorman@suse.de>
+Acked-by: Vlastimil Babka <vbabka@suse.cz>
+Cc: Mel Gorman <mgorman@suse.de>
+Cc: Michal Hocko <mhocko@kernel.org>
+Cc: Vlastimil Babka <vbabka@suse.cz>
+Cc: Johannes Weiner <hannes@cmpxchg.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Mel Gorman <mgorman@suse.de>
+---
+ mm/vmscan.c | 28 +++++++++++++++-------------
+ 1 file changed, 15 insertions(+), 13 deletions(-)
+
+diff --git a/mm/vmscan.c b/mm/vmscan.c
+index 3944acd94764..c4be05cc681d 100644
+--- a/mm/vmscan.c
++++ b/mm/vmscan.c
+@@ -2621,18 +2621,6 @@ static inline bool should_continue_reclaim(struct pglist_data *pgdat,
+ return false;
+ }
+
+- /*
+- * If we have not reclaimed enough pages for compaction and the
+- * inactive lists are large enough, continue reclaiming
+- */
+- pages_for_compaction = compact_gap(sc->order);
+- inactive_lru_pages = node_page_state(pgdat, NR_INACTIVE_FILE);
+- if (get_nr_swap_pages() > 0)
+- inactive_lru_pages += node_page_state(pgdat, NR_INACTIVE_ANON);
+- if (sc->nr_reclaimed < pages_for_compaction &&
+- inactive_lru_pages > pages_for_compaction)
+- return true;
+-
+ /* If compaction would go ahead or the allocation would succeed, stop */
+ for (z = 0; z <= sc->reclaim_idx; z++) {
+ struct zone *zone = &pgdat->node_zones[z];
+@@ -2648,7 +2636,21 @@ static inline bool should_continue_reclaim(struct pglist_data *pgdat,
+ ;
+ }
+ }
+- return true;
++
++ /*
++ * If we have not reclaimed enough pages for compaction and the
++ * inactive lists are large enough, continue reclaiming
++ */
++ pages_for_compaction = compact_gap(sc->order);
++ inactive_lru_pages = node_page_state(pgdat, NR_INACTIVE_FILE);
++ if (get_nr_swap_pages() > 0)
++ inactive_lru_pages += node_page_state(pgdat, NR_INACTIVE_ANON);
++
++ return inactive_lru_pages > pages_for_compaction &&
++ /*
++ * avoid dryrun with plenty of inactive pages
++ */
++ nr_scanned && nr_reclaimed;
+ }
+
+ static bool pgdat_memcg_congested(pg_data_t *pgdat, struct mem_cgroup *memcg)
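Putting this reclaim change together with the later cleanup, should_continue_reclaim() ends up bailing out when compaction can already run, when too few inactive pages remain to be worth it, or when a dryrun round reclaimed nothing. The self-contained C sketch below mirrors that decision with invented inputs; it is a simplified illustration, not the kernel function, and compaction_ready plus the page counts are assumptions made for the example.

#include <stdbool.h>
#include <stdio.h>

struct reclaim_round {
	unsigned long nr_reclaimed;		/* pages reclaimed this round */
	unsigned long nr_scanned;		/* pages scanned this round */
	unsigned long inactive_lru_pages;	/* inactive file (+anon if swap) pages */
	unsigned long pages_for_compaction;	/* roughly compact_gap(order) */
	bool compaction_ready;			/* compaction watermarks already met */
};

static bool should_continue_reclaim(const struct reclaim_round *r)
{
	/* If compaction could already go ahead, stop reclaiming. */
	if (r->compaction_ready)
		return false;

	/* Too few inactive pages left to make the costly allocation viable. */
	if (r->inactive_lru_pages <= r->pages_for_compaction)
		return false;

	/* Dryrun: plenty of inactive pages but nothing scanned or reclaimed. */
	return r->nr_scanned && r->nr_reclaimed;
}

int main(void)
{
	struct reclaim_round dryrun = {
		.nr_reclaimed = 0, .nr_scanned = 0,
		.inactive_lru_pages = 4096, .pages_for_compaction = 512,
		.compaction_ready = false,
	};

	printf("continue reclaim? %d\n", should_continue_reclaim(&dryrun));	/* 0: bail out */
	return 0;
}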
diff --git a/patches.suse/mm-replace-list_move_tail-with-add_page_to_lru_list_tail.patch b/patches.suse/mm-replace-list_move_tail-with-add_page_to_lru_list_tail.patch
new file mode 100644
index 0000000000..090a866e38
--- /dev/null
+++ b/patches.suse/mm-replace-list_move_tail-with-add_page_to_lru_list_tail.patch
@@ -0,0 +1,77 @@
+From 7afb6e881cdd5b45ff4ca3be2668ad35505626c5 Mon Sep 17 00:00:00 2001
+From: Yu Zhao <yuzhao@google.com>
+Date: Mon, 23 Sep 2019 15:34:33 -0700
+Subject: [PATCH] mm: replace list_move_tail() with add_page_to_lru_list_tail()
+
+References: bnc#1155780 (VM/FS functional and performance backports)
+Patch-mainline: v5.4-rc1
+Git-commit: e7a1aaf28770c1f7a06c50cbd02ca0f27ce61ec5
+
+This is a cleanup patch that replaces two historical uses of
+list_move_tail() with relatively recent add_page_to_lru_list_tail().
+
+Link: http://lkml.kernel.org/r/20190716212436.7137-1-yuzhao@google.com
+Signed-off-by: Yu Zhao <yuzhao@google.com>
+Cc: Vlastimil Babka <vbabka@suse.cz>
+Cc: Michal Hocko <mhocko@suse.com>
+Cc: Jason Gunthorpe <jgg@ziepe.ca>
+Cc: Dan Williams <dan.j.williams@intel.com>
+Cc: Matthew Wilcox <willy@infradead.org>
+Cc: Ira Weiny <ira.weiny@intel.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Mel Gorman <mgorman@suse.de>
+---
+ mm/swap.c | 14 ++++++--------
+ 1 file changed, 6 insertions(+), 8 deletions(-)
+
+diff --git a/mm/swap.c b/mm/swap.c
+index ae300397dfda..0226c5346560 100644
+--- a/mm/swap.c
++++ b/mm/swap.c
+@@ -515,7 +515,6 @@ static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec,
+ del_page_from_lru_list(page, lruvec, lru + active);
+ ClearPageActive(page);
+ ClearPageReferenced(page);
+- add_page_to_lru_list(page, lruvec, lru);
+
+ if (PageWriteback(page) || PageDirty(page)) {
+ /*
+@@ -523,13 +522,14 @@ static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec,
+ * It can make readahead confusing. But race window
+ * is _really_ small and it's non-critical problem.
+ */
++ add_page_to_lru_list(page, lruvec, lru);
+ SetPageReclaim(page);
+ } else {
+ /*
+ * The page's writeback ends up during pagevec
+ * We moves tha page into tail of inactive.
+ */
+- list_move_tail(&page->lru, &lruvec->lists[lru]);
++ add_page_to_lru_list_tail(page, lruvec, lru);
+ __count_vm_event(PGROTATED);
+ }
+
+@@ -844,17 +844,15 @@ void lru_add_page_tail(struct page *page, struct page *page_tail,
+ get_page(page_tail);
+ list_add_tail(&page_tail->lru, list);
+ } else {
+- struct list_head *list_head;
+ /*
+ * Head page has not yet been counted, as an hpage,
+ * so we must account for each subpage individually.
+ *
+- * Use the standard add function to put page_tail on the list,
+- * but then correct its position so they all end up in order.
++ * Put page_tail on the list at the correct position
++ * so they all end up in order.
+ */
+- add_page_to_lru_list(page_tail, lruvec, page_lru(page_tail));
+- list_head = page_tail->lru.prev;
+- list_move_tail(&page_tail->lru, list_head);
++ add_page_to_lru_list_tail(page_tail, lruvec,
++ page_lru(page_tail));
+ }
+
+ if (!PageUnevictable(page))
diff --git a/patches.suse/mm-sl-aou-b-guarantee-natural-alignment-for-kmalloc-power-of-two.patch b/patches.suse/mm-sl-aou-b-guarantee-natural-alignment-for-kmalloc-power-of-two.patch
new file mode 100644
index 0000000000..78effa75b8
--- /dev/null
+++ b/patches.suse/mm-sl-aou-b-guarantee-natural-alignment-for-kmalloc-power-of-two.patch
@@ -0,0 +1,259 @@
+From 963895c2d88850eeb61b4f862960c3311348aa6e Mon Sep 17 00:00:00 2001
+From: Vlastimil Babka <vbabka@suse.cz>
+Date: Sun, 6 Oct 2019 17:58:45 -0700
+Subject: [PATCH] mm, sl[aou]b: guarantee natural alignment for
+ kmalloc(power-of-two)
+
+References: bnc#1155780 (VM/FS functional and performance backports)
+Patch-mainline: v5.4-rc3
+Git-commit: 59bb47985c1db229ccff8c5deebecd54fc77d2a9
+
+In most configurations, kmalloc() happens to return naturally aligned
+(i.e. aligned to the block size itself) blocks for power of two sizes.
+
+That means some kmalloc() users might unknowingly rely on that
+alignment, until stuff breaks when the kernel is built with e.g.
+CONFIG_SLUB_DEBUG or CONFIG_SLOB, and blocks stop being aligned. Then
+developers have to devise workaround such as own kmem caches with
+specified alignment [1], which is not always practical, as recently
+evidenced in [2].
+
+The topic has been discussed at LSF/MM 2019 [3]. Adding a
+'kmalloc_aligned()' variant would not help with code unknowingly relying
+on the implicit alignment. For slab implementations it would either
+require creating more kmalloc caches, or allocate a larger size and only
+give back part of it. That would be wasteful, especially with a generic
+alignment parameter (in contrast with a fixed alignment to size).
+
+Ideally we should provide to mm users what they need without difficult
+workarounds or own reimplementations, so let's make the kmalloc()
+alignment to size explicitly guaranteed for power-of-two sizes under all
+configurations. What this means for the three available allocators?
+
+* SLAB object layout happens to be mostly unchanged by the patch. The
+ implicitly provided alignment could be compromised with
+ CONFIG_DEBUG_SLAB due to redzoning, however SLAB disables redzoning for
+ caches with alignment larger than unsigned long long. Practically on at
+ least x86 this includes kmalloc caches as they use cache line alignment,
+ which is larger than that. Still, this patch ensures alignment on all
+ arches and cache sizes.
+
+* SLUB layout is also unchanged unless redzoning is enabled through
+ CONFIG_SLUB_DEBUG and boot parameter for the particular kmalloc cache.
+ With this patch, explicit alignment is guaranteed with redzoning as
+ well. This will result in more memory being wasted, but that should be
+ acceptable in a debugging scenario.
+
+* SLOB has no implicit alignment so this patch adds it explicitly for
+ kmalloc(). The potential downside is increased fragmentation. While
+ pathological allocation scenarios are certainly possible, in my testing,
+ after booting a x86_64 kernel+userspace with virtme, around 16MB memory
+ was consumed by slab pages both before and after the patch, with
+ difference in the noise.
+
+[1] https://lore.kernel.org/linux-btrfs/c3157c8e8e0e7588312b40c853f65c02fe6c957a.1566399731.git.christophe.leroy@c-s.fr/
+[2] https://lore.kernel.org/linux-fsdevel/20190225040904.5557-1-ming.lei@redhat.com/
+[3] https://lwn.net/Articles/787740/
+
+[akpm@linux-foundation.org: documentation fixlet, per Matthew]
+Link: http://lkml.kernel.org/r/20190826111627.7505-3-vbabka@suse.cz
+Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
+Reviewed-by: Matthew Wilcox (Oracle) <willy@infradead.org>
+Acked-by: Michal Hocko <mhocko@suse.com>
+Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
+Acked-by: Christoph Hellwig <hch@lst.de>
+Cc: David Sterba <dsterba@suse.cz>
+Cc: Christoph Lameter <cl@linux.com>
+Cc: Pekka Enberg <penberg@kernel.org>
+Cc: David Rientjes <rientjes@google.com>
+Cc: Ming Lei <ming.lei@redhat.com>
+Cc: Dave Chinner <david@fromorbit.com>
+Cc: "Darrick J . Wong" <darrick.wong@oracle.com>
+Cc: Christoph Hellwig <hch@lst.de>
+Cc: James Bottomley <James.Bottomley@HansenPartnership.com>
+Cc: Vlastimil Babka <vbabka@suse.cz>
+Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Mel Gorman <mgorman@suse.de>
+---
+ Documentation/core-api/memory-allocation.rst | 4 +++
+ include/linux/slab.h | 4 +++
+ mm/slab_common.c | 11 +++++++-
+ mm/slob.c | 42 ++++++++++++++++++++--------
+ 4 files changed, 49 insertions(+), 12 deletions(-)
+
+diff --git a/Documentation/core-api/memory-allocation.rst b/Documentation/core-api/memory-allocation.rst
+index 7744aa3bf2e0..939e3dfc86e9 100644
+--- a/Documentation/core-api/memory-allocation.rst
++++ b/Documentation/core-api/memory-allocation.rst
+@@ -98,6 +98,10 @@ limited. The actual limit depends on the hardware and the kernel
+ configuration, but it is a good practice to use `kmalloc` for objects
+ smaller than page size.
+
++The address of a chunk allocated with `kmalloc` is aligned to at least
++ARCH_KMALLOC_MINALIGN bytes. For sizes which are a power of two, the
++alignment is also guaranteed to be at least the respective size.
++
+ For large allocations you can use :c:func:`vmalloc` and
+ :c:func:`vzalloc`, or directly request pages from the page
+ allocator. The memory allocated by `vmalloc` and related functions is
+diff --git a/include/linux/slab.h b/include/linux/slab.h
+index 56c9c7eed34e..0d4c26395785 100644
+--- a/include/linux/slab.h
++++ b/include/linux/slab.h
+@@ -493,6 +493,10 @@ static __always_inline void *kmalloc_large(size_t size, gfp_t flags)
+ * kmalloc is the normal method of allocating memory
+ * for objects smaller than page size in the kernel.
+ *
++ * The allocated object address is aligned to at least ARCH_KMALLOC_MINALIGN
++ * bytes. For @size of power of two bytes, the alignment is also guaranteed
++ * to be at least to the size.
++ *
+ * The @flags argument may be one of the GFP flags defined at
+ * include/linux/gfp.h and described at
+ * :ref:`Documentation/core-api/mm-api.rst <mm-api-gfp-flags>`
+diff --git a/mm/slab_common.c b/mm/slab_common.c
+index 2a827515d573..3d8f36d97242 100644
+--- a/mm/slab_common.c
++++ b/mm/slab_common.c
+@@ -994,10 +994,19 @@ void __init create_boot_cache(struct kmem_cache *s, const char *name,
+ unsigned int useroffset, unsigned int usersize)
+ {
+ int err;
++ unsigned int align = ARCH_KMALLOC_MINALIGN;
+
+ s->name = name;
+ s->size = s->object_size = size;
+- s->align = calculate_alignment(flags, ARCH_KMALLOC_MINALIGN, size);
++
++ /*
++ * For power of two sizes, guarantee natural alignment for kmalloc
++ * caches, regardless of SL*B debugging options.
++ */
++ if (is_power_of_2(size))
++ align = max(align, size);
++ s->align = calculate_alignment(flags, align, size);
++
+ s->useroffset = useroffset;
+ s->usersize = usersize;
+
+diff --git a/mm/slob.c b/mm/slob.c
+index 3dcde9cf2b17..07a39047aa54 100644
+--- a/mm/slob.c
++++ b/mm/slob.c
+@@ -224,6 +224,7 @@ static void slob_free_pages(void *b, int order)
+ * @sp: Page to look in.
+ * @size: Size of the allocation.
+ * @align: Allocation alignment.
++ * @align_offset: Offset in the allocated block that will be aligned.
+ * @page_removed_from_list: Return parameter.
+ *
+ * Tries to find a chunk of memory at least @size bytes big within @page.
+@@ -234,7 +235,7 @@ static void slob_free_pages(void *b, int order)
+ * true (set to false otherwise).
+ */
+ static void *slob_page_alloc(struct page *sp, size_t size, int align,
+- bool *page_removed_from_list)
++ int align_offset, bool *page_removed_from_list)
+ {
+ slob_t *prev, *cur, *aligned = NULL;
+ int delta = 0, units = SLOB_UNITS(size);
+@@ -243,8 +244,17 @@ static void *slob_page_alloc(struct page *sp, size_t size, int align,
+ for (prev = NULL, cur = sp->freelist; ; prev = cur, cur = slob_next(cur)) {
+ slobidx_t avail = slob_units(cur);
+
++ /*
++ * 'aligned' will hold the address of the slob block so that the
++ * address 'aligned'+'align_offset' is aligned according to the
++ * 'align' parameter. This is for kmalloc() which prepends the
++ * allocated block with its size, so that the block itself is
++ * aligned when needed.
++ */
+ if (align) {
+- aligned = (slob_t *)ALIGN((unsigned long)cur, align);
++ aligned = (slob_t *)
++ (ALIGN((unsigned long)cur + align_offset, align)
++ - align_offset);
+ delta = aligned - cur;
+ }
+ if (avail >= units + delta) { /* room enough? */
+@@ -288,7 +298,8 @@ static void *slob_page_alloc(struct page *sp, size_t size, int align,
+ /*
+ * slob_alloc: entry point into the slob allocator.
+ */
+-static void *slob_alloc(size_t size, gfp_t gfp, int align, int node)
++static void *slob_alloc(size_t size, gfp_t gfp, int align, int node,
++ int align_offset)
+ {
+ struct page *sp;
+ struct list_head *slob_list;
+@@ -319,7 +330,7 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node)
+ if (sp->units < SLOB_UNITS(size))
+ continue;
+
+- b = slob_page_alloc(sp, size, align, &page_removed_from_list);
++ b = slob_page_alloc(sp, size, align, align_offset, &page_removed_from_list);
+ if (!b)
+ continue;
+
+@@ -356,7 +367,7 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node)
+ INIT_LIST_HEAD(&sp->slab_list);
+ set_slob(b, SLOB_UNITS(PAGE_SIZE), b + SLOB_UNITS(PAGE_SIZE));
+ set_slob_page_free(sp, slob_list);
+- b = slob_page_alloc(sp, size, align, &_unused);
++ b = slob_page_alloc(sp, size, align, align_offset, &_unused);
+ BUG_ON(!b);
+ spin_unlock_irqrestore(&slob_lock, flags);
+ }
+@@ -458,7 +469,7 @@ static __always_inline void *
+ __do_kmalloc_node(size_t size, gfp_t gfp, int node, unsigned long caller)
+ {
+ unsigned int *m;
+- int align = max_t(size_t, ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
++ int minalign = max_t(size_t, ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
+ void *ret;
+
+ gfp &= gfp_allowed_mask;
+@@ -466,19 +477,28 @@ __do_kmalloc_node(size_t size, gfp_t gfp, int node, unsigned long caller)
+ fs_reclaim_acquire(gfp);
+ fs_reclaim_release(gfp);
+
+- if (size < PAGE_SIZE - align) {
++ if (size < PAGE_SIZE - minalign) {
++ int align = minalign;
++
++ /*
++ * For power of two sizes, guarantee natural alignment for
++ * kmalloc()'d objects.
++ */
++ if (is_power_of_2(size))
++ align = max(minalign, (int) size);
++
+ if (!size)
+ return ZERO_SIZE_PTR;
+
+- m = slob_alloc(size + align, gfp, align, node);
++ m = slob_alloc(size + minalign, gfp, align, node, minalign);
+
+ if (!m)
+ return NULL;
+ *m = size;
+- ret = (void *)m + align;
++ ret = (void *)m + minalign;
+
+ trace_kmalloc_node(caller, ret,
+- size, size + align, gfp, node);
++ size, size + minalign, gfp, node);
+ } else {
+ unsigned int order = get_order(size);
+
+@@ -579,7 +599,7 @@ static void *slob_alloc_node(struct kmem_cache *c, gfp_t flags, int node)
+ fs_reclaim_release(flags);
+
+ if (c->size < PAGE_SIZE) {
+- b = slob_alloc(c->size, flags, c->align, node);
++ b = slob_alloc(c->size, flags, c->align, node, 0);
+ trace_kmem_cache_alloc_node(_RET_IP_, b, c->object_size,
+ SLOB_UNITS(c->size) * SLOB_UNIT,
+ flags, node);
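The user-visible rule this patch introduces is simple to state: a kmalloc() of a power-of-two size is aligned to at least that size, and otherwise to ARCH_KMALLOC_MINALIGN. The short userspace C sketch below just demonstrates that alignment computation; the MINALIGN value is made up for the example, since the real one is architecture dependent.

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

#define ARCH_KMALLOC_MINALIGN 8		/* illustrative value; really arch dependent */

static bool is_power_of_2(size_t n)
{
	return n != 0 && (n & (n - 1)) == 0;
}

static size_t kmalloc_cache_align(size_t size)
{
	size_t align = ARCH_KMALLOC_MINALIGN;

	/* Power-of-two sizes get natural (size) alignment guaranteed. */
	if (is_power_of_2(size) && size > align)
		align = size;
	return align;
}

int main(void)
{
	size_t sizes[] = { 8, 96, 128, 192, 256, 1024 };

	for (size_t i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++)
		printf("kmalloc-%zu -> alignment %zu\n", sizes[i],
		       kmalloc_cache_align(sizes[i]));
	return 0;
}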
diff --git a/patches.suse/mm-sl-ou-b-improve-memory-accounting.patch b/patches.suse/mm-sl-ou-b-improve-memory-accounting.patch
new file mode 100644
index 0000000000..f8d5dfbb49
--- /dev/null
+++ b/patches.suse/mm-sl-ou-b-improve-memory-accounting.patch
@@ -0,0 +1,164 @@
+From ecb48cd311598ae7dde525bae08bca62f0948d98 Mon Sep 17 00:00:00 2001
+From: Vlastimil Babka <vbabka@suse.cz>
+Date: Sun, 6 Oct 2019 17:58:42 -0700
+Subject: [PATCH] mm, sl[ou]b: improve memory accounting
+
+References: bnc#1155780 (VM/FS functional and performance backports)
+Patch-mainline: v5.4-rc3
+Git-commit: 6a486c0ad4dcdee3946842c64884d2978bfe2602
+
+Patch series "guarantee natural alignment for kmalloc()", v2.
+
+This patch (of 2):
+
+SLOB currently doesn't account its pages at all, so in /proc/meminfo the
+Slab field shows zero. Modifying a counter on page allocation and
+freeing should be acceptable even for the small system scenarios SLOB is
+intended for. Since reclaimable caches are not separated in SLOB,
+account everything as unreclaimable.
+
+SLUB currently doesn't account kmalloc() and kmalloc_node() allocations
+larger than order-1 page, that are passed directly to the page
+allocator. As they also don't appear in /proc/slabinfo, it might look
+like a memory leak. For consistency, account them as well. (SLAB
+doesn't actually use page allocator directly, so no change there).
+
+Ideally SLOB and SLUB would be handled in separate patches, but due to
+the shared kmalloc_order() function and different kfree()
+implementations, it's easier to patch both at once to prevent
+inconsistencies.
+
+Link: http://lkml.kernel.org/r/20190826111627.7505-2-vbabka@suse.cz
+Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
+Cc: Christoph Lameter <cl@linux.com>
+Cc: Pekka Enberg <penberg@kernel.org>
+Cc: David Rientjes <rientjes@google.com>
+Cc: Ming Lei <ming.lei@redhat.com>
+Cc: Dave Chinner <david@fromorbit.com>
+Cc: Matthew Wilcox <willy@infradead.org>
+Cc: "Darrick J . Wong" <darrick.wong@oracle.com>
+Cc: Christoph Hellwig <hch@lst.de>
+Cc: James Bottomley <James.Bottomley@HansenPartnership.com>
+Cc: Vlastimil Babka <vbabka@suse.cz>
+Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Mel Gorman <mgorman@suse.de>
+---
+ mm/slab_common.c | 8 ++++++--
+ mm/slob.c | 20 ++++++++++++++++----
+ mm/slub.c | 14 +++++++++++---
+ 3 files changed, 33 insertions(+), 9 deletions(-)
+
+diff --git a/mm/slab_common.c b/mm/slab_common.c
+index 7f492e53a7db..2a827515d573 100644
+--- a/mm/slab_common.c
++++ b/mm/slab_common.c
+@@ -1251,12 +1251,16 @@ void __init create_kmalloc_caches(slab_flags_t flags)
+ */
+ void *kmalloc_order(size_t size, gfp_t flags, unsigned int order)
+ {
+- void *ret;
++ void *ret = NULL;
+ struct page *page;
+
+ flags |= __GFP_COMP;
+ page = alloc_pages(flags, order);
+- ret = page ? page_address(page) : NULL;
++ if (likely(page)) {
++ ret = page_address(page);
++ mod_node_page_state(page_pgdat(page), NR_SLAB_UNRECLAIMABLE,
++ 1 << order);
++ }
+ ret = kasan_kmalloc_large(ret, size, flags);
+ /* As ret might get tagged, call kmemleak hook after KASAN. */
+ kmemleak_alloc(ret, size, 1, flags);
+diff --git a/mm/slob.c b/mm/slob.c
+index 7f421d0ca9ab..3dcde9cf2b17 100644
+--- a/mm/slob.c
++++ b/mm/slob.c
+@@ -190,7 +190,7 @@ static int slob_last(slob_t *s)
+
+ static void *slob_new_pages(gfp_t gfp, int order, int node)
+ {
+- void *page;
++ struct page *page;
+
+ #ifdef CONFIG_NUMA
+ if (node != NUMA_NO_NODE)
+@@ -202,14 +202,21 @@ static void *slob_new_pages(gfp_t gfp, int order, int node)
+ if (!page)
+ return NULL;
+
++ mod_node_page_state(page_pgdat(page), NR_SLAB_UNRECLAIMABLE,
++ 1 << order);
+ return page_address(page);
+ }
+
+ static void slob_free_pages(void *b, int order)
+ {
++ struct page *sp = virt_to_page(b);
++
+ if (current->reclaim_state)
+ current->reclaim_state->reclaimed_slab += 1 << order;
+- free_pages((unsigned long)b, order);
++
++ mod_node_page_state(page_pgdat(sp), NR_SLAB_UNRECLAIMABLE,
++ -(1 << order));
++ __free_pages(sp, order);
+ }
+
+ /*
+@@ -521,8 +528,13 @@ void kfree(const void *block)
+ int align = max_t(size_t, ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
+ unsigned int *m = (unsigned int *)(block - align);
+ slob_free(m, *m + align);
+- } else
+- __free_pages(sp, compound_order(sp));
++ } else {
++ unsigned int order = compound_order(sp);
++ mod_node_page_state(page_pgdat(sp), NR_SLAB_UNRECLAIMABLE,
++ -(1 << order));
++ __free_pages(sp, order);
++
++ }
+ }
+ EXPORT_SYMBOL(kfree);
+
+diff --git a/mm/slub.c b/mm/slub.c
+index 4bd42abe257d..e251fb195e47 100644
+--- a/mm/slub.c
++++ b/mm/slub.c
+@@ -3829,11 +3829,15 @@ static void *kmalloc_large_node(size_t size, gfp_t flags, int node)
+ {
+ struct page *page;
+ void *ptr = NULL;
++ unsigned int order = get_order(size);
+
+ flags |= __GFP_COMP;
+- page = alloc_pages_node(node, flags, get_order(size));
+- if (page)
++ page = alloc_pages_node(node, flags, order);
++ if (page) {
+ ptr = page_address(page);
++ mod_node_page_state(page_pgdat(page), NR_SLAB_UNRECLAIMABLE,
++ 1 << order);
++ }
+
+ return kmalloc_large_node_hook(ptr, size, flags);
+ }
+@@ -3959,9 +3963,13 @@ void kfree(const void *x)
+
+ page = virt_to_head_page(x);
+ if (unlikely(!PageSlab(page))) {
++ unsigned int order = compound_order(page);
++
+ BUG_ON(!PageCompound(page));
+ kfree_hook(object);
+- __free_pages(page, compound_order(page));
++ mod_node_page_state(page_pgdat(page), NR_SLAB_UNRECLAIMABLE,
++ -(1 << order));
++ __free_pages(page, order);
+ return;
+ }
+ slab_free(page->slab_cache, page, object, NULL, 1, _RET_IP_);
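The accounting pattern in this patch is symmetrical: every large allocation that bypasses the slab caches adds 1 << order pages to NR_SLAB_UNRECLAIMABLE, and the matching free subtracts the same amount. Below is a hedged userspace sketch of that bookkeeping, with a plain counter and aligned_alloc() standing in for the per-node vmstat counter and the page allocator; it is an illustration of the pattern, not the kernel code.

#include <stdio.h>
#include <stdlib.h>

#define PAGE_SIZE 4096UL

static long nr_slab_unreclaimable_pages;	/* what /proc/meminfo's Slab field sums */

static void *large_kmalloc(unsigned int order)
{
	void *p = aligned_alloc(PAGE_SIZE, PAGE_SIZE << order);

	if (p)
		nr_slab_unreclaimable_pages += 1L << order;	/* account on alloc */
	return p;
}

static void large_kfree(void *p, unsigned int order)
{
	if (!p)
		return;
	nr_slab_unreclaimable_pages -= 1L << order;		/* and on free */
	free(p);
}

int main(void)
{
	void *p = large_kmalloc(3);	/* 8 pages */

	printf("accounted pages: %ld\n", nr_slab_unreclaimable_pages);
	large_kfree(p, 3);
	printf("accounted pages: %ld\n", nr_slab_unreclaimable_pages);
	return 0;
}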
diff --git a/patches.suse/mm-vmscan-do-not-share-cgroup-iteration-between-reclaimers.patch b/patches.suse/mm-vmscan-do-not-share-cgroup-iteration-between-reclaimers.patch
new file mode 100644
index 0000000000..a91380dfa7
--- /dev/null
+++ b/patches.suse/mm-vmscan-do-not-share-cgroup-iteration-between-reclaimers.patch
@@ -0,0 +1,110 @@
+From b9ec6358d338963481c7a89c4fb6b26f1c0cd54f Mon Sep 17 00:00:00 2001
+From: Johannes Weiner <hannes@cmpxchg.org>
+Date: Mon, 23 Sep 2019 15:35:01 -0700
+Subject: [PATCH] mm: vmscan: do not share cgroup iteration between reclaimers
+
+References: bnc#1155780 (VM/FS functional and performance backports)
+Patch-mainline: v5.4-rc1
+Git-commit: 1ba6fc9af35bf97c84567d9b3eeb26629d1e3af0
+
+One of our services observed a high rate of cgroup OOM kills in the
+presence of large amounts of clean cache. Debugging showed that the
+culprit is the shared cgroup iteration in page reclaim.
+
+Under high allocation concurrency, multiple threads enter reclaim at the
+same time. Fearing overreclaim when we first switched from the single
+global LRU to cgrouped LRU lists, we introduced a shared iteration state
+for reclaim invocations - whether 1 or 20 reclaimers are active
+concurrently, we only walk the cgroup tree once: the first reclaimer
+reclaims the first cgroup, the second reclaimer the next, and so on. With more
+reclaimers than cgroups, we start another walk from the top.
+
+This sounded reasonable at the time, but the problem is that reclaim
+concurrency doesn't scale with allocation concurrency. As reclaim
+concurrency increases, the amount of memory individual reclaimers get to
+scan gets smaller and smaller. Individual reclaimers may only see one
+cgroup per cycle, and that may not have much reclaimable memory. We see
+individual reclaimers declare OOM when there is plenty of reclaimable
+memory available in cgroups they didn't visit.
+
+This patch does away with the shared iterator, and every reclaimer is
+allowed to scan the full cgroup tree and see all of reclaimable memory,
+just like it would on a non-cgrouped system. This way, when OOM is
+declared, we know that the reclaimer actually had a chance.
+
+To still maintain fairness in reclaim pressure, disallow cgroup reclaim
+from bailing out of the tree walk early. Kswapd and regular direct
+reclaim already don't bail, so it's not clear why limit reclaim would have
+to, especially since it only walks subtrees to begin with.
+
+This change completely eliminates the OOM kills on our service, while
+showing no signs of overreclaim - no increased scan rates, %sys time, or
+abrupt free memory spikes. I tested across 100 machines that have 64G of
+RAM and host about 300 cgroups each.
+
+[ It's possible overreclaim never was a *practical* issue to begin
+ with - it was simply a concern we had on the mailing lists at the
+ time, with no real data to back it up. But we have also added more
+ bail-out conditions deeper inside reclaim (e.g. the proportional
+ exit in shrink_node_memcg) since. Regardless, now we have data that
+ suggests full walks are more reliable and scale just fine. ]
+
+Link: http://lkml.kernel.org/r/20190812192316.13615-1-hannes@cmpxchg.org
+Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
+Reviewed-by: Roman Gushchin <guro@fb.com>
+Acked-by: Michal Hocko <mhocko@suse.com>
+Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Mel Gorman <mgorman@suse.de>
+---
+ mm/vmscan.c | 22 ++--------------------
+ 1 file changed, 2 insertions(+), 20 deletions(-)
+
+diff --git a/mm/vmscan.c b/mm/vmscan.c
+index 8d03013b6c59..3944acd94764 100644
+--- a/mm/vmscan.c
++++ b/mm/vmscan.c
+@@ -2665,10 +2665,6 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
+
+ do {
+ struct mem_cgroup *root = sc->target_mem_cgroup;
+- struct mem_cgroup_reclaim_cookie reclaim = {
+- .pgdat = pgdat,
+- .priority = sc->priority,
+- };
+ unsigned long node_lru_pages = 0;
+ struct mem_cgroup *memcg;
+
+@@ -2677,7 +2673,7 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
+ nr_reclaimed = sc->nr_reclaimed;
+ nr_scanned = sc->nr_scanned;
+
+- memcg = mem_cgroup_iter(root, NULL, &reclaim);
++ memcg = mem_cgroup_iter(root, NULL, NULL);
+ do {
+ unsigned long lru_pages;
+ unsigned long reclaimed;
+@@ -2720,21 +2716,7 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
+ sc->nr_scanned - scanned,
+ sc->nr_reclaimed - reclaimed);
+
+- /*
+- * Kswapd have to scan all memory cgroups to fulfill
+- * the overall scan target for the node.
+- *
+- * Limit reclaim, on the other hand, only cares about
+- * nr_to_reclaim pages to be reclaimed and it will
+- * retry with decreasing priority if one round over the
+- * whole hierarchy is not sufficient.
+- */
+- if (!current_is_kswapd() &&
+- sc->nr_reclaimed >= sc->nr_to_reclaim) {
+- mem_cgroup_iter_break(root, memcg);
+- break;
+- }
+- } while ((memcg = mem_cgroup_iter(root, memcg, &reclaim)));
++ } while ((memcg = mem_cgroup_iter(root, memcg, NULL)));
+
+ if (reclaim_state) {
+ sc->nr_reclaimed += reclaim_state->reclaimed_slab;
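
Editorial note: the functional change in the hunks above is small. The shared mem_cgroup_reclaim_cookie is dropped and mem_cgroup_iter() is called with a NULL cookie, so each reclaimer starts its own walk at the root and visits every cgroup, rather than picking up wherever the previous reclaimer left off. As a rough illustration only, here is a userspace caricature of the two iteration schemes; the fixed cgroup count, shared_cursor and the reclaim functions are invented for the example and do not correspond to kernel APIs.

/*
 * Illustrative sketch (not from the patch): shared cursor vs. independent
 * full walks over a set of "cgroups".
 */
#include <stdio.h>

#define NR_CGROUPS 4

/* Old scheme: one iteration position shared by all reclaimers. */
static int shared_cursor;

static int next_cgroup_shared(void)
{
        int cg = shared_cursor;

        shared_cursor = (shared_cursor + 1) % NR_CGROUPS;
        return cg;
}

/* New scheme: every reclaimer walks all cgroups itself. */
static void reclaim_full_walk(int reclaimer)
{
        for (int cg = 0; cg < NR_CGROUPS; cg++)
                printf("reclaimer %d scans cgroup %d\n", reclaimer, cg);
}

int main(void)
{
        puts("shared iterator (old): each reclaimer sees one cgroup");
        for (int r = 0; r < 3; r++)
                printf("reclaimer %d scans cgroup %d\n", r, next_cgroup_shared());

        puts("independent walks (new): each reclaimer sees every cgroup");
        for (int r = 0; r < 3; r++)
                reclaim_full_walk(r);
        return 0;
}

With the shared cursor, three concurrent reclaimers each see a single, different cgroup and may declare OOM without ever looking at the memory held elsewhere; with independent walks, every reclaimer has seen the whole set before it can give up, which is the behaviour the commit message argues for.
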
diff --git a/patches.suse/sched-topology-Improve-load-balancing-on-AMD-EPYC.patch b/patches.suse/sched-topology-Improve-load-balancing-on-AMD-EPYC.patch
index c99c976c38..0c55d174b0 100644
--- a/patches.suse/sched-topology-Improve-load-balancing-on-AMD-EPYC.patch
+++ b/patches.suse/sched-topology-Improve-load-balancing-on-AMD-EPYC.patch
@@ -2,7 +2,8 @@ From: Matt Fleming <matt@codeblueprint.co.uk>
Date: Thu, 8 Aug 2019 20:53:01 +0100
Subject: [PATCH] sched/topology: Improve load balancing on AMD EPYC
-Patch-mainline: Not yet, under discussion on LKML
+Patch-mainline: v5.4-rc1
+Git-commit: a55c7454a8c887b226a01d7eed088ccb5374d81e
References: bsc#1137366
SD_BALANCE_{FORK,EXEC} and SD_WAKE_AFFINE are stripped in sd_init()
diff --git a/series.conf b/series.conf
index e13afc02c8..ccf512f58d 100644
--- a/series.conf
+++ b/series.conf
@@ -1959,13 +1959,23 @@
patches.suse/kbuild-clean-compressed-initramfs-image.patch
patches.suse/ocfs2-wait-for-recovering-done-after-direct-unlock-r.patch
patches.suse/kmemleak-increase-DEBUG_KMEMLEAK_EARLY_LOG_SIZE-defa.patch
+ patches.suse/mm-replace-list_move_tail-with-add_page_to_lru_list_tail.patch
patches.suse/mm-page_owner-record-page-owner-for-each-subpage.patch
patches.suse/mm-page_owner-keep-owner-info-when-freeing-the-page.patch
patches.suse/mm-page_owner-debug_pagealloc-save-and-dump-freeing-stack-trace.patch
+ patches.suse/mm-filemap.c-don-t-initiate-writeback-if-mapping-has-no-dirty-pages.patch
+ patches.suse/mm-filemap.c-rewrite-mapping_needs_writeback-in-less-fancy-manner.patch
+ patches.suse/mm-vmscan-do-not-share-cgroup-iteration-between-reclaimers.patch
patches.suse/mm-gup-add-make_dirty-arg-to-put_user_pages_dirty_lo.patch
patches.suse/z3fold-fix-memory-leak-in-kmem-cache.patch
patches.suse/mm-compaction.c-clear-total_-migrate-free-_scanned-b.patch
+ patches.suse/mm-compaction.c-remove-unnecessary-zone-parameter-in-isolate_migratepages.patch
+ patches.suse/mm-mempolicy.c-remove-unnecessary-nodemask-check-in-kernel_migrate_pages.patch
patches.suse/memcg-oom-don-t-require-__GFP_FS-when-invoking-memcg.patch
+ patches.suse/mm-reclaim-make-should_continue_reclaim-perform-dryrun-detection.patch
+ patches.suse/mm-reclaim-cleanup-should_continue_reclaim.patch
+ patches.suse/mm-compaction-raise-compaction-priority-after-it-withdrawns.patch
+ patches.suse/hugetlbfs-don-t-retry-when-pool-page-allocations-start-to-fail.patch
patches.suse/arm64-consider-stack-randomization-for-mmap-base-onl.patch
patches.suse/arm-properly-account-for-stack-randomization-and-sta.patch
patches.suse/arm-use-STACK_TOP-when-computing-mmap-base-address.patch
@@ -2398,6 +2408,8 @@
patches.suse/mm-z3fold.c-claim-page-in-the-beginning-of-free.patch
patches.suse/mm-page_alloc.c-fix-a-crash-in-free_pages_prepare.patch
patches.suse/mm-vmpressure.c-fix-a-signedness-bug-in-vmpressure_r.patch
+ patches.suse/mm-sl-ou-b-improve-memory-accounting.patch
+ patches.suse/mm-sl-aou-b-guarantee-natural-alignment-for-kmalloc-power-of-two.patch
patches.suse/selinux-fix-context-string-corruption-in-convert_con.patch
patches.suse/gpiolib-don-t-clear-FLAG_IS_OUT-when-emulating-open-.patch
patches.suse/gpio-fix-getting-nonexclusive-gpiods-from-DT.patch