summaryrefslogtreecommitdiff |
diff options
author | Jiri Slaby <jslaby@suse.cz> | 2019-01-18 07:53:27 +0100 |
---|---|---|
committer | Jiri Slaby <jslaby@suse.cz> | 2019-01-18 07:53:36 +0100 |
commit | 037f912a1a92b6c1aaa9632ff31b8a3aa7e1f71b (patch) | |
tree | b8074166687b84e630bb590cf2c4c934465a883f | |
parent | e8c868c962bad20609c8c6f7c43bf5a7caa44f49 (diff) |
mm, memcg: fix reclaim deadlock with writeback (bnc#1012628).
-rw-r--r-- | patches.kernel.org/4.20.3-030-mm-memcg-fix-reclaim-deadlock-with-writeback.patch | 152 | ||||
-rw-r--r-- | series.conf | 1 |
2 files changed, 153 insertions, 0 deletions
diff --git a/patches.kernel.org/4.20.3-030-mm-memcg-fix-reclaim-deadlock-with-writeback.patch b/patches.kernel.org/4.20.3-030-mm-memcg-fix-reclaim-deadlock-with-writeback.patch new file mode 100644 index 0000000000..a80cd17f97 --- /dev/null +++ b/patches.kernel.org/4.20.3-030-mm-memcg-fix-reclaim-deadlock-with-writeback.patch @@ -0,0 +1,152 @@ +From: Michal Hocko <mhocko@suse.com> +Date: Tue, 8 Jan 2019 15:23:07 -0800 +Subject: [PATCH] mm, memcg: fix reclaim deadlock with writeback +References: bnc#1012628 +Patch-mainline: 4.20.3 +Git-commit: 63f3655f950186752236bb88a22f8252c11ce394 + +commit 63f3655f950186752236bb88a22f8252c11ce394 upstream. + +Liu Bo has experienced a deadlock between memcg (legacy) reclaim and the +ext4 writeback + + task1: + wait_on_page_bit+0x82/0xa0 + shrink_page_list+0x907/0x960 + shrink_inactive_list+0x2c7/0x680 + shrink_node_memcg+0x404/0x830 + shrink_node+0xd8/0x300 + do_try_to_free_pages+0x10d/0x330 + try_to_free_mem_cgroup_pages+0xd5/0x1b0 + try_charge+0x14d/0x720 + memcg_kmem_charge_memcg+0x3c/0xa0 + memcg_kmem_charge+0x7e/0xd0 + __alloc_pages_nodemask+0x178/0x260 + alloc_pages_current+0x95/0x140 + pte_alloc_one+0x17/0x40 + __pte_alloc+0x1e/0x110 + alloc_set_pte+0x5fe/0xc20 + do_fault+0x103/0x970 + handle_mm_fault+0x61e/0xd10 + __do_page_fault+0x252/0x4d0 + do_page_fault+0x30/0x80 + page_fault+0x28/0x30 + + task2: + __lock_page+0x86/0xa0 + mpage_prepare_extent_to_map+0x2e7/0x310 [ext4] + ext4_writepages+0x479/0xd60 + do_writepages+0x1e/0x30 + __writeback_single_inode+0x45/0x320 + writeback_sb_inodes+0x272/0x600 + __writeback_inodes_wb+0x92/0xc0 + wb_writeback+0x268/0x300 + wb_workfn+0xb4/0x390 + process_one_work+0x189/0x420 + worker_thread+0x4e/0x4b0 + kthread+0xe6/0x100 + ret_from_fork+0x41/0x50 + +He adds + "task1 is waiting for the PageWriteback bit of the page that task2 has + collected in mpd->io_submit->io_bio, and tasks2 is waiting for the + LOCKED bit the page which tasks1 has locked" + +More precisely task1 is handling a page 
fault and it has a page locked +while it charges a new page table to a memcg. That in turn hits a +memory limit reclaim and the memcg reclaim for legacy controller is +waiting on the writeback but that is never going to finish because the +writeback itself is waiting for the page locked in the #PF path. So +this is essentially an ABBA deadlock: + + lock_page(A) + SetPageWriteback(A) + unlock_page(A) + lock_page(B) + lock_page(B) + pte_alloc_one + shrink_page_list + wait_on_page_writeback(A) + SetPageWriteback(B) + unlock_page(B) + + # flush A, B to clear the writeback + +This accumulation of more pages to flush is used by several filesystems +to generate more optimal IO patterns. + +Waiting for the writeback in legacy memcg controller is a workaround for +premature OOM killer invocations because there is no dirty IO +throttling available for the controller. There is no easy way around +that unfortunately. Therefore fix this specific issue by pre-allocating +the page table outside of the page lock. We have that handy +infrastructure for that already so simply reuse the fault-around pattern +which already does this. + +There are probably other hidden __GFP_ACCOUNT | GFP_KERNEL allocations +from under a locked fs page but they should be really rare. I am not +aware of a better solution unfortunately. + +[akpm@linux-foundation.org: fix mm/memory.c:__do_fault()] +[akpm@linux-foundation.org: coding-style fixes] +[mhocko@kernel.org: enhance comment, per Johannes] + Link: http://lkml.kernel.org/r/20181214084948.GA5624@dhcp22.suse.cz +Link: http://lkml.kernel.org/r/20181213092221.27270-1-mhocko@kernel.org +Fixes: c3b94f44fcb0 ("memcg: further prevent OOM with too many dirty pages") +Signed-off-by: Michal Hocko <mhocko@suse.com> +Reported-by: Liu Bo <bo.liu@linux.alibaba.com> +Debugged-by: Liu Bo <bo.liu@linux.alibaba.com> +Acked-by: Kirill A. 
Shutemov <kirill.shutemov@linux.intel.com> +Acked-by: Johannes Weiner <hannes@cmpxchg.org> +Reviewed-by: Liu Bo <bo.liu@linux.alibaba.com> +Cc: Jan Kara <jack@suse.cz> +Cc: Dave Chinner <david@fromorbit.com> +Cc: Theodore Ts'o <tytso@mit.edu> +Cc: Vladimir Davydov <vdavydov.dev@gmail.com> +Cc: Shakeel Butt <shakeelb@google.com> +Cc: <stable@vger.kernel.org> +Signed-off-by: Andrew Morton <akpm@linux-foundation.org> +Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +Signed-off-by: Jiri Slaby <jslaby@suse.cz> +--- + mm/memory.c | 23 +++++++++++++++++++++++ + 1 file changed, 23 insertions(+) + +diff --git a/mm/memory.c b/mm/memory.c +index 4ad2d293ddc2..59c00ae6b928 100644 +--- a/mm/memory.c ++++ b/mm/memory.c +@@ -2993,6 +2993,29 @@ static vm_fault_t __do_fault(struct vm_fault *vmf) + struct vm_area_struct *vma = vmf->vma; + vm_fault_t ret; + ++ /* ++ * Preallocate pte before we take page_lock because this might lead to ++ * deadlocks for memcg reclaim which waits for pages under writeback: ++ * lock_page(A) ++ * SetPageWriteback(A) ++ * unlock_page(A) ++ * lock_page(B) ++ * lock_page(B) ++ * pte_alloc_pne ++ * shrink_page_list ++ * wait_on_page_writeback(A) ++ * SetPageWriteback(B) ++ * unlock_page(B) ++ * # flush A, B to clear the writeback ++ */ ++ if (pmd_none(*vmf->pmd) && !vmf->prealloc_pte) { ++ vmf->prealloc_pte = pte_alloc_one(vmf->vma->vm_mm, ++ vmf->address); ++ if (!vmf->prealloc_pte) ++ return VM_FAULT_OOM; ++ smp_wmb(); /* See comment in __pte_alloc() */ ++ } ++ + ret = vma->vm_ops->fault(vmf); + if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY | + VM_FAULT_DONE_COW))) +-- +2.20.1 + diff --git a/series.conf b/series.conf index 94349af7c3..f9af9b5ff2 100644 --- a/series.conf +++ b/series.conf @@ -267,6 +267,7 @@ patches.kernel.org/4.20.3-027-fork-memcg-fix-cached_stacks-case.patch 
patches.kernel.org/4.20.3-028-slab-alien-caches-must-not-be-initialized-if-t.patch patches.kernel.org/4.20.3-029-mm-usercopy.c-no-check-page-span-for-stack-obj.patch + patches.kernel.org/4.20.3-030-mm-memcg-fix-reclaim-deadlock-with-writeback.patch ######################################################## # Build fixes that apply to the vanilla kernel too. |