author    Michal Hocko <mhocko@suse.cz>    2011-07-15 11:26:30 +0200
committer Michal Hocko <mhocko@suse.cz>    2011-07-15 11:26:30 +0200
commit    e75e37de18da4a391f575981bf5c2b663bee9353 (patch)
tree      f1a047c5eeafdd3f832ef2e4c1015d701aa427a4
parent    a74c421e4223bf4803803492cb813f7f2aa06c4c (diff)
- patches.fixes/mm-do_fault-preallocate-cow-page.patch:
mm: preallocate page before lock_page() at filemap COW (bnc#7000995).
-rw-r--r--  kernel-source.changes                                    7
-rw-r--r--  patches.fixes/mm-do_fault-preallocate-cow-page.patch   174
-rw-r--r--  series.conf                                              1
3 files changed, 182 insertions, 0 deletions
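
The problem being fixed is an ordering issue inside __do_fault(): the COW page used to be allocated and charged to the memcg while the file page returned by the filesystem's fault handler was still locked. The following is a condensed paraphrase of the pre-patch flow in mm/memory.c of this tree, with error handling and unrelated details omitted; it is a sketch for orientation, not a standalone compilable excerpt.

/*
 * Pre-patch __do_fault() flow (condensed sketch): the memcg charge
 * happens while vmf.page is still locked.  With the memcg at its
 * limit, mem_cgroup_newpage_charge() can block for an unbounded
 * time, and every other task faulting the same file page (e.g. a
 * shared library) sits in lock_page() for just as long.
 */
ret = vma->vm_ops->fault(vma, &vmf);	/* vmf.page comes back locked */

if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
	page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
	/* possibly unbounded wait, with vmf.page still locked: */
	mem_cgroup_newpage_charge(page, mm, GFP_KERNEL);
	copy_user_highpage(page, vmf.page, address, vma);
	__SetPageUptodate(page);
}
/* ... install the pte, then unlock vmf.page ... */
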
diff --git a/kernel-source.changes b/kernel-source.changes
index 6c5d1e0f27..a4d15fe31a 100644
--- a/kernel-source.changes
+++ b/kernel-source.changes
@@ -1,4 +1,11 @@
-------------------------------------------------------------------
+Fri Jul 15 11:26:24 CEST 2011 - mhocko@suse.cz
+
+- patches.fixes/mm-do_fault-preallocate-cow-page.patch:
+ mm: preallocate page before lock_page() at filemap COW
+ (bnc#7000995).
+
+-------------------------------------------------------------------
Fri Jul 15 08:29:18 CEST 2011 - tonyj@suse.de
- patches.suse/intel-perf-0001-pmu.patch: perf, x86: Add PEBS
diff --git a/patches.fixes/mm-do_fault-preallocate-cow-page.patch b/patches.fixes/mm-do_fault-preallocate-cow-page.patch
new file mode 100644
index 0000000000..77818d1a86
--- /dev/null
+++ b/patches.fixes/mm-do_fault-preallocate-cow-page.patch
@@ -0,0 +1,174 @@
+From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
+Date: Thu, 23 Jun 2011 18:50:32 +0900
+Subject: [PATCH] mm: preallocate page before lock_page() at filemap COW.
+Patch-mainline: Not yet (in mmotm tree)
+References: bnc#7000995
+
+Mhocko:
+There is an alternative way to fix this issue (https://lkml.org/lkml/2011/6/22/169)
+but this one has at least made it into Andrew's mmotm tree. There is no big
+interest in discussing the pros and cons of the two approaches. Anyway, both should work.
+
+Currently we keep the faulted page locked throughout the whole __do_fault
+call (except for the page_mkwrite code path) after calling the file
+system's fault code. If we do early COW, we allocate a new page which
+has to be charged to a memcg (mem_cgroup_newpage_charge).
+
+This function, however, might block for an unbounded amount of time if the
+memcg OOM killer is disabled or a fork bomb is running, because the only
+way out of the OOM situation is either an external event or fixing it.
+
+In the end we keep the faulted page locked and block other processes from
+faulting it in, which is not good at all because we are basically punishing
+a potentially unrelated process for an OOM condition in a different group
+(I have seen a system get stuck because ld-2.11.1.so was
+locked).
+
+We can test this easily:
+
+ % cgcreate -g memory:A
+ % cgset -r memory.limit_in_bytes=64M A
+ % cgset -r memory.memsw.limit_in_bytes=64M A
+ % cd kernel_dir; cgexec -g memory:A make -j
+
+Then the whole system will be live-locked until you kill 'make -j'
+by hand (or push the reboot button...). This is because some important
+pages of a shared library are locked.
+
+On reflection, the new page does not need to be allocated while
+lock_page() is held. So this patch moves the "charge" and the
+memory allocation for the COW page to before lock_page() is taken.
+This way we can avoid scanning the LRU while holding a lock on a
+page.
+
+With that, the livelock above disappears.
+
+Reported-by: Lutz Vieweg <lvml@5t9.de>
+Original-idea-by: Michal Hocko <mhocko@suse.cz>
+Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
+Reviewed-by: Michal Hocko <mhocko@suse.cz>
+Acked-by: Michal Hocko <mhocko@suse.cz>
+
+---
+ mm/memory.c | 62 ++++++++++++++++++++++++++++++++++----------------------------
+ 1 file changed, 34 insertions(+), 28 deletions(-)
+Index: linux-2.6.32-SLE11-SP2/mm/memory.c
+===================================================================
+--- linux-2.6.32-SLE11-SP2.orig/mm/memory.c
++++ linux-2.6.32-SLE11-SP2/mm/memory.c
+@@ -2951,14 +2951,34 @@ static int __do_fault(struct mm_struct *
+ pte_t *page_table;
+ spinlock_t *ptl;
+ struct page *page;
++ struct page *cow_page;
+ pte_t entry;
+ int anon = 0;
+- int charged = 0;
+ struct page *dirty_page = NULL;
+ struct vm_fault vmf;
+ int ret;
+ int page_mkwrite = 0;
+
++ /*
++ * If we do COW later, allocate the page before taking lock_page()
++ * on the file cache page. This will reduce lock holding time.
++ */
++ if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
++
++ if (unlikely(anon_vma_prepare(vma)))
++ return VM_FAULT_OOM;
++
++ cow_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
++ if (!cow_page)
++ return VM_FAULT_OOM;
++
++ if (mem_cgroup_newpage_charge(cow_page, mm, GFP_KERNEL)) {
++ page_cache_release(cow_page);
++ return VM_FAULT_OOM;
++ }
++ } else
++ cow_page = NULL;
++
+ vmf.virtual_address = (void __user *)(address & PAGE_MASK);
+ vmf.pgoff = pgoff;
+ vmf.flags = flags;
+@@ -2966,12 +2986,13 @@ static int __do_fault(struct mm_struct *
+
+ ret = vma->vm_ops->fault(vma, &vmf);
+ if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
+- return ret;
++ goto uncharge_out;
+
+ if (unlikely(PageHWPoison(vmf.page))) {
+ if (ret & VM_FAULT_LOCKED)
+ unlock_page(vmf.page);
+- return VM_FAULT_HWPOISON;
++ ret = VM_FAULT_HWPOISON;
++ goto uncharge_out;
+ }
+
+ /*
+@@ -2989,29 +3010,8 @@ static int __do_fault(struct mm_struct *
+ page = vmf.page;
+ if (flags & FAULT_FLAG_WRITE) {
+ if (!(vma->vm_flags & VM_SHARED)) {
++ page = cow_page;
+ anon = 1;
+- if (unlikely(anon_vma_prepare(vma))) {
+- ret = VM_FAULT_OOM;
+- goto out;
+- }
+- page = alloc_page_vma(GFP_HIGHUSER_MOVABLE,
+- vma, address);
+- if (!page) {
+- ret = VM_FAULT_OOM;
+- goto out;
+- }
+- if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL)) {
+- ret = VM_FAULT_OOM;
+- page_cache_release(page);
+- goto out;
+- }
+- charged = 1;
+- /*
+- * Don't let another task, with possibly unlocked vma,
+- * keep the mlocked page.
+- */
+- if (vma->vm_flags & VM_LOCKED)
+- clear_page_mlock(vmf.page);
+ copy_user_highpage(page, vmf.page, address, vma);
+ __SetPageUptodate(page);
+ } else {
+@@ -3080,8 +3080,8 @@ static int __do_fault(struct mm_struct *
+ /* no need to invalidate: a not-present page won't be cached */
+ update_mmu_cache(vma, address, entry);
+ } else {
+- if (charged)
+- mem_cgroup_uncharge_page(page);
++ if (cow_page)
++ mem_cgroup_uncharge_page(cow_page);
+ if (anon)
+ page_cache_release(page);
+ else
+@@ -3090,7 +3090,6 @@ static int __do_fault(struct mm_struct *
+
+ pte_unmap_unlock(page_table, ptl);
+
+-out:
+ if (dirty_page) {
+ struct address_space *mapping = page->mapping;
+
+@@ -3120,6 +3119,13 @@ out:
+ unwritable_page:
+ page_cache_release(page);
+ return ret;
++uncharge_out:
++ /* fs's fault handler returned an error */
++ if (cow_page) {
++ mem_cgroup_uncharge_page(cow_page);
++ page_cache_release(cow_page);
++ }
++ return ret;
+ }
+
+ static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
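
Two details of the patch are easy to miss in the hunks above. First, since the charge now happens before vm_ops->fault() is called, every early-error return must pass through the new uncharge_out label, otherwise the preallocated page and its memcg charge would leak. Second, the local 'charged' flag disappears: the COW page is charged exactly when it was preallocated, so testing cow_page != NULL carries the same information. A minimal sketch of that invariant, paraphrasing the error paths above (not a standalone excerpt):

/*
 * cow_page is non-NULL if and only if a page was preallocated and
 * charged up front, so it doubles as the old 'charged' flag on the
 * error paths:
 */
uncharge_out:
	if (cow_page) {
		mem_cgroup_uncharge_page(cow_page);
		page_cache_release(cow_page);
	}
	return ret;
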
diff --git a/series.conf b/series.conf
index 45e4df3a3b..6ff6d5b6bc 100644
--- a/series.conf
+++ b/series.conf
@@ -1371,6 +1371,7 @@
patches.fixes/vm-fix-vm_pgoff-wrap-in-stack-expansion.patch
patches.fixes/vm-fix-vm_pgoff-wrap-in-upward-expansion.patch
+ patches.fixes/mm-do_fault-preallocate-cow-page.patch
########################################################
# IPC patches