Home Home > GIT Browse
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Michal Hocko <mhocko@suse.cz> 2011-08-22 09:09:31 +0200
committer Michal Hocko <mhocko@suse.cz> 2011-08-22 09:10:45 +0200
commit 8cceca89009d64663e63df0b94fa265f144b6aa4 (patch)
tree 20e0f6c2835af8216b6d6962e7ccab6d610e1f06
parent 054734ac3fd20ad51c0fa6c85f5eba18c0c09af0 (diff)
- patches.fixes/mm-fix_memoryless_nodes_exact_nid.patch:
memcg: fix init_page_cgroup nid with sparsemem (bnc#708160, bnc#712316).
-rw-r--r-- kernel-source.changes 7
-rw-r--r-- patches.fixes/mm-fix_memoryless_nodes_exact_nid.patch 150
-rw-r--r-- series.conf 1
3 files changed, 158 insertions, 0 deletions
diff --git a/kernel-source.changes b/kernel-source.changes
index 77ab526868..c7719c6e3f 100644
--- a/kernel-source.changes
+++ b/kernel-source.changes
@@ -1,4 +1,11 @@
-------------------------------------------------------------------
+Mon Aug 22 09:09:31 CEST 2011 - mhocko@suse.cz
+
+- patches.fixes/mm-fix_memoryless_nodes_exact_nid.patch:
+ memcg: fix init_page_cgroup nid with sparsemem (bnc#708160,
+ bnc#712316).
+
+-------------------------------------------------------------------
Fri Aug 19 00:22:42 CEST 2011 - jack@suse.cz
- patches.fixes/jbd-remove_journal_head-oops-fix.diff: jbd:
diff --git a/patches.fixes/mm-fix_memoryless_nodes_exact_nid.patch b/patches.fixes/mm-fix_memoryless_nodes_exact_nid.patch
new file mode 100644
index 0000000000..371e25e6ed
--- /dev/null
+++ b/patches.fixes/mm-fix_memoryless_nodes_exact_nid.patch
@@ -0,0 +1,150 @@
+From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
+Subject: memcg: fix init_page_cgroup nid with sparsemem
+References: bnc#708160, bnc#712316
+Patch-Mainline: v3.0-rc4
+Git-commit: 37573e8c718277103f61f03741bdc5606d31b07e
+
+Signed-off-by: Thomas Renninger <trenn@suse.de>
+
+Commit 21a3c9646873 ("memcg: allocate memory cgroup structures in local
+nodes") made page_cgroup allocation NUMA aware, but that caused the
+problem reported at https://bugzilla.kernel.org/show_bug.cgi?id=36192.
+
+The problem was getting a NID from invalid struct pages, which were not
+initialized because they were out-of-node, i.e. outside [node_start_pfn,
+node_end_pfn).
+
+Now, with sparsemem, page_cgroup_init scans pfn from 0 to max_pfn. But
+this may scan a pfn which is not on any node and can access memmap which
+is not initialized.
+
+This makes page_cgroup_init() for SPARSEMEM node aware and removes the code
+that gets the nid from page->flags, so that we always use a valid NID.
+
+[akpm@linux-foundation.org: try to fix up comments]
+Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Acked-by: Michal Hocko <mhocko@suse.cz>
+
+---
+ mm/page_cgroup.c | 71 +++++++++++++++++++++++++++++++++++++++++++---------------
+ 1 file changed, 53 insertions(+), 18 deletions(-)
+Index: linux-2.6.32-SLE11-SP1/mm/page_cgroup.c
+===================================================================
+--- linux-2.6.32-SLE11-SP1.orig/mm/page_cgroup.c
++++ linux-2.6.32-SLE11-SP1/mm/page_cgroup.c
+@@ -158,15 +158,14 @@ static void free_page_cgroup(void *addr)
+ }
+ }
+
+-static int __init_refok init_section_page_cgroup(unsigned long pfn)
++static int __init_refok init_section_page_cgroup(unsigned long pfn, int nid)
+ {
+ struct mem_section *section = __pfn_to_section(pfn);
+ struct page_cgroup *base, *pc;
+ unsigned long table_size;
+- int nid, index;
++ int index;
+
+ if (!section->page_cgroup) {
+- nid = page_to_nid(pfn_to_page(pfn));
+ table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION;
+ base = alloc_page_cgroup(table_size, nid);
+ } else {
+@@ -191,7 +190,11 @@ static int __init_refok init_section_pag
+ pc = base + index;
+ __init_page_cgroup(pc, pfn + index);
+ }
+-
++ /*
++ * The passed "pfn" may not be aligned to SECTION. For the calculation
++ * we need to apply a mask.
++ */
++ pfn &= PAGE_SECTION_MASK;
+ section->page_cgroup = base - pfn;
+ total_usage += table_size;
+ return 0;
+@@ -220,10 +223,20 @@ int __meminit online_page_cgroup(unsigne
+ start = start_pfn & ~(PAGES_PER_SECTION - 1);
+ end = ALIGN(start_pfn + nr_pages, PAGES_PER_SECTION);
+
++ if (nid == -1) {
++ /*
++ * In this case, "nid" already exists and contains valid memory.
++ * "start_pfn" passed to us is a pfn which is an arg for
++ * online_pages(), and start_pfn should exist.
++ */
++ nid = pfn_to_nid(start_pfn);
++ VM_BUG_ON(!node_state(nid, N_ONLINE));
++ }
++
+ for (pfn = start; !fail && pfn < end; pfn += PAGES_PER_SECTION) {
+ if (!pfn_present(pfn))
+ continue;
+- fail = init_section_page_cgroup(pfn);
++ fail = init_section_page_cgroup(pfn, nid);
+ }
+ if (!fail)
+ return 0;
+@@ -284,25 +297,47 @@ static int __meminit page_cgroup_callbac
+ void __init page_cgroup_init(void)
+ {
+ unsigned long pfn;
+- int fail = 0;
++ int nid;
+
+ if (mem_cgroup_disabled())
+ return;
+
+- for (pfn = 0; !fail && pfn < max_pfn; pfn += PAGES_PER_SECTION) {
+- if (!pfn_present(pfn))
+- continue;
+- fail = init_section_page_cgroup(pfn);
+- }
+- if (fail) {
+- printk(KERN_CRIT "try 'cgroup_disable=memory' boot option\n");
+- panic("Out of memory");
+- } else {
+- hotplug_memory_notifier(page_cgroup_callback, 0);
++ for_each_node_state(nid, N_HIGH_MEMORY) {
++ unsigned long start_pfn, end_pfn;
++
++ start_pfn = node_start_pfn(nid);
++ end_pfn = node_end_pfn(nid);
++ /*
++ * start_pfn and end_pfn may not be aligned to SECTION and the
++ * page->flags of out of node pages are not initialized. So we
++ * scan [start_pfn, the biggest section's pfn < end_pfn) here.
++ */
++ for (pfn = start_pfn;
++ pfn < end_pfn;
++ pfn = ALIGN(pfn + 1, PAGES_PER_SECTION)) {
++
++ if (!pfn_valid(pfn))
++ continue;
++ /*
++ * Nodes' pfn ranges can overlap.
++ * We know some arch can have a nodes layout such as
++ * -------------pfn-------------->
++ * N0 | N1 | N2 | N0 | N1 | N2|....
++ */
++ if (pfn_to_nid(pfn) != nid)
++ continue;
++ if (init_section_page_cgroup(pfn, nid))
++ goto oom;
++ }
+ }
++ hotplug_memory_notifier(page_cgroup_callback, 0);
+ printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage);
+- printk(KERN_INFO "please try 'cgroup_disable=memory' option if you don't"
+- " want memory cgroups\n");
++ printk(KERN_INFO "please try 'cgroup_disable=memory' option if you "
++ "don't want memory cgroups\n");
++ return;
++oom:
++ printk(KERN_CRIT "try 'cgroup_disable=memory' boot option\n");
++ panic("Out of memory");
+ }
+
+ void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat)
diff --git a/series.conf b/series.conf
index 7fe5edf2cf..f827e2ddbc 100644
--- a/series.conf
+++ b/series.conf
@@ -770,6 +770,7 @@
patches.fixes/arch-mm-filter-disallowed-nodes-from-arch-specific-s.patch
patches.fixes/cpusets-randomize-node-rotor-used-in-cpuset_mem_spre.patch
patches.fixes/validate-size-of-efi-guid-partition-entries.patch
+ patches.fixes/mm-fix_memoryless_nodes_exact_nid.patch
########################################################
# IPC patches