author     Michal Hocko <mhocko@suse.cz>  2013-08-27 10:17:02 +0200
committer  Michal Hocko <mhocko@suse.cz>  2013-08-27 10:17:02 +0200
commit     925d40604ea1bb669154dbec538d61d3ee597a59 (patch)
tree       fa4f97ef2fa21e5ed7b71f875dc15787b8c256d9
parent     80500b728e81a250859ca90d8a02618f40fada61 (diff)
parent     4f6cb7d551d9ae85ea99532584fbc59f3aaa134d (diff)

Merge branch 'bnc831949' into SLE11-SP2 (tag: rpm-3.0.93-0.5)
 kernel-source.changes                                               |  31
 patches.fixes/printk-add-nmi-ring-buffer.patch                      | 101
 patches.fixes/printk-do-not-call-unlock_console-from-nmi.patch      |  67
 patches.fixes/printk-do-not-use-printk_cpu-from-finish_printk.patch |  73
 patches.fixes/printk-extract-ringbuffer-handling.patch              | 201
 patches.fixes/printk-fix-softlockups-during-heavy-printing.patch    |  25
 patches.fixes/printk-make-nmi-ringbuffer-length-independent.patch   | 115
 patches.fixes/printk-safe-nmi-handling.patch                        | 229
 patches.fixes/x86-Add-workaround-to-NMI-iret-woes.patch             | 404
 patches.fixes/x86-Do-not-schedule-while-still-in-NMI-context.patch  | 100
 series.conf                                                         |  10
 11 files changed, 1345 insertions(+), 11 deletions(-)
diff --git a/kernel-source.changes b/kernel-source.changes
index b4a5da3f5a..e0b76c751c 100644
--- a/kernel-source.changes
+++ b/kernel-source.changes
@@ -52,6 +52,14 @@ Wed Aug 21 09:26:20 CEST 2013 - jslaby@suse.de
Delete.
-------------------------------------------------------------------
+Tue Aug 20 13:58:03 CEST 2013 - mhocko@suse.cz
+
+- patches.fixes/x86-Add-workaround-to-NMI-iret-woes.patch: x86:
+ Add workaround to NMI iret woes (bnc#831949).
+- patches.fixes/x86-Do-not-schedule-while-still-in-NMI-context.patch:
+ x86: Do not schedule while still in NMI context (bnc#831949).
+
+-------------------------------------------------------------------
Sat Aug 17 22:54:15 CEST 2013 - bpoirier@suse.de
- patches.fixes/bnx2x-Avoid-sending-multiple-statistics-queries.patch:
@@ -87,6 +95,29 @@ Tue Aug 13 17:57:29 CEST 2013 - ohering@suse.de
- kabi/severities: Ignore changes in drivers/hv
-------------------------------------------------------------------
+Tue Aug 13 16:04:11 CEST 2013 - mhocko@suse.cz
+
+- patches.fixes/printk-add-nmi-ring-buffer.patch: printk: Add
+ NMI ringbuffer (bnc#831949).
+- patches.fixes/printk-extract-ringbuffer-handling.patch: printk:
+ extract ringbuffer handling from vprintk (bnc#831949).
+- patches.fixes/printk-safe-nmi-handling.patch: printk: NMI safe
+ printk (bnc#831949).
+- patches.fixes/printk-fix-softlockups-during-heavy-printing.patch:
+ Refresh.
+- patches.fixes/printk-fix-softlockups-during-heavy-printing.patch:
+ Refresh.
+- patches.fixes/printk-make-nmi-ringbuffer-length-independent.patch:
+ printk: Make NMI ringbuffer size independent on log_buf_len
+ (bnc#831949).
+- patches.fixes/printk-do-not-call-unlock_console-from-nmi.patch:
+ printk: Do not call console_unlock from nmi context
+ (bnc#831949).
+- patches.fixes/printk-do-not-use-printk_cpu-from-finish_printk.patch:
+ printk: Do not use printk_cpu from finish_printk (bnc#831949).
+
+
+-------------------------------------------------------------------
Fri Aug 9 09:16:03 CEST 2013 - jbeulich@novell.com
- patches.xen/xen-netback-generalize: Refresh (bnc#827378).
diff --git a/patches.fixes/printk-add-nmi-ring-buffer.patch b/patches.fixes/printk-add-nmi-ring-buffer.patch
new file mode 100644
index 0000000000..3b1761c33f
--- /dev/null
+++ b/patches.fixes/printk-add-nmi-ring-buffer.patch
@@ -0,0 +1,101 @@
+From: Michal Hocko <mhocko@suse.cz>
+Subject: printk: Add NMI ringbuffer
+Patch-mainline: not yet
+References: bnc#831949
+
+This is a preparatory patch for the NMI safe printk implementation. It adds a
+new ring buffer which will keep messages printed from NMI context, which cannot
+access the regular ring buffer when logbuf_lock is already held.
+
+The ringbuffer is allocated during early initialization, same as log_buf, and
+it shares the same length.
+
+Signed-off-by: Michal Hocko <mhocko@suse.cz>
+
+---
+ kernel/printk.c | 43 +++++++++++++++++++++++++++++++------------
+ 1 file changed, 31 insertions(+), 12 deletions(-)
+
+--- a/kernel/printk.c
++++ b/kernel/printk.c
+@@ -103,7 +103,8 @@ static int console_locked, console_suspe
+ static DEFINE_SPINLOCK(logbuf_lock);
+
+ #define LOG_BUF_MASK (log_buf_len-1)
+-#define LOG_BUF(idx) (log_buf[(idx) & LOG_BUF_MASK])
++#define __LOG_BUF(buf, idx) (buf[(idx) & LOG_BUF_MASK])
++#define LOG_BUF(idx) (__LOG_BUF(log_buf, idx))
+
+ /*
+ * The indices into log_buf are not constrained to log_buf_len - they
+@@ -113,6 +114,9 @@ static unsigned log_start; /* Index into
+ static unsigned con_start; /* Index into log_buf: next char to be sent to consoles */
+ static unsigned log_end; /* Index into log_buf: most-recently-written-char + 1 */
+
++static unsigned nmi_log_start; /* Index into nmi_log_buf: next char to be copied to printk ringbuf */
++static unsigned nmi_log_end; /* Index into nmi_log_buf: most-recently-written-char + 1 */
++
+ /*
+ * If exclusive_console is non-NULL then only this console is to be printed to.
+ */
+@@ -146,6 +150,7 @@ static int console_may_schedule;
+
+ static char __log_buf[__LOG_BUF_LEN];
+ static char *log_buf = __log_buf;
++static char *nmi_log_buf = NULL;
+ static int log_buf_len = __LOG_BUF_LEN;
+ static unsigned logged_chars; /* Number of chars produced since last read+clear operation */
+ static int saved_console_loglevel = -1;
+@@ -185,6 +190,20 @@ static int __init log_buf_len_setup(char
+ }
+ early_param("log_buf_len", log_buf_len_setup);
+
++char * __init alloc_log_buf(int early, unsigned len)
++{
++ if (early) {
++ unsigned long mem;
++
++ mem = memblock_alloc(len, PAGE_SIZE);
++ if (mem == MEMBLOCK_ERROR)
++ return NULL;
++ return __va(mem);
++ }
++
++ return alloc_bootmem_nopanic(len);
++}
++
+ void __init setup_log_buf(int early)
+ {
+ unsigned long flags;
+@@ -192,20 +211,20 @@ void __init setup_log_buf(int early)
+ char *new_log_buf;
+ int free;
+
++ if (!nmi_log_buf) {
++ unsigned len = (new_log_buf_len > log_buf_len) ? new_log_buf_len : log_buf_len;
++ nmi_log_buf = alloc_log_buf(early, len);
++ if (!nmi_log_buf)
++ pr_err("%ld bytes not available for nmi ring buffer\n",
++ new_log_buf_len);
++ else
++ pr_info("nmi ring buffer: %d\n", len);
++ }
++
+ if (!new_log_buf_len)
+ return;
+
+- if (early) {
+- unsigned long mem;
+-
+- mem = memblock_alloc(new_log_buf_len, PAGE_SIZE);
+- if (mem == MEMBLOCK_ERROR)
+- return;
+- new_log_buf = __va(mem);
+- } else {
+- new_log_buf = alloc_bootmem_nopanic(new_log_buf_len);
+- }
+-
++ new_log_buf = alloc_log_buf(early, new_log_buf_len);
+ if (unlikely(!new_log_buf)) {
+ pr_err("log_buf_len: %ld bytes not available\n",
+ new_log_buf_len);
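The patch keeps the log indices unconstrained and masks them only on access, so a
power-of-two buffer length is all the __LOG_BUF macro needs. A minimal user-space
sketch of that indexing scheme follows; ring_emit, ring_read_all and the constants
are illustrative names, not kernel code.

#include <stdio.h>

#define RING_LEN 16                      /* must be a power of two */
#define RING_MASK (RING_LEN - 1)
#define RING_AT(idx) (ring_buf[(idx) & RING_MASK])

static char ring_buf[RING_LEN];
static unsigned ring_start;              /* next char to consume */
static unsigned ring_end;                /* most recently written char + 1 */

/* Producer: indices grow without bound, masking happens on access only. */
static void ring_emit(char c)
{
        RING_AT(ring_end) = c;
        ring_end++;
}

/* Consumer: if the producer overran us, drop the oldest characters. */
static void ring_read_all(void)
{
        if (ring_end - ring_start > RING_LEN)
                ring_start = ring_end - RING_LEN;
        for (; ring_start != ring_end; ring_start++)
                putchar(RING_AT(ring_start));
}

int main(void)
{
        const char *msg = "NMI ring buffer demo: only the tail survives overflow\n";
        for (const char *p = msg; *p; p++)
                ring_emit(*p);
        ring_read_all();
        return 0;
}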
diff --git a/patches.fixes/printk-do-not-call-unlock_console-from-nmi.patch b/patches.fixes/printk-do-not-call-unlock_console-from-nmi.patch
new file mode 100644
index 0000000000..05e4ccd06a
--- /dev/null
+++ b/patches.fixes/printk-do-not-call-unlock_console-from-nmi.patch
@@ -0,0 +1,67 @@
+From: Michal Hocko <mhocko@suse.cz>
+Subject: printk: Do not call console_unlock from nmi context
+Patch-mainline: not yet
+References: bnc#831949
+
+console_unlock is a tricky piece of code. It re-acquires logbuf_lock and takes
+other locks (e.g. from the IRQ tracing path), which is a no-go for NMI context.
+Make sure we never call this function from NMI context. As a result we have to
+live with NMI messages potentially sitting in the NMI ringbuffer until a
+non-NMI printk happens, which might be a long time.
+
+Signed-off-by: Michal Hocko <mhocko@suse.cz>
+---
+ kernel/printk.c | 31 ++++++++++++++++++++-----------
+ 1 file changed, 20 insertions(+), 11 deletions(-)
+
+--- a/kernel/printk.c
++++ b/kernel/printk.c
+@@ -868,7 +868,6 @@ static int console_trylock_for_printk(un
+ retval = 0;
+ }
+ }
+- printk_cpu = UINT_MAX;
+ spin_unlock(&logbuf_lock);
+ return retval;
+ }
+@@ -1095,20 +1094,30 @@ asmlinkage int vprintk(const char *fmt,
+ * The console_trylock_for_printk() function
+ * will release 'logbuf_lock' regardless of whether it
+ * actually gets the semaphore or not.
++ *
++ * This whole magic is not allowed from nmi context as
++ * console_unlock re-takes logbuf_lock and other locks
++ * from follow-up paths.
+ */
+ if (!in_nmi_delayed_printk) {
+- if (console_trylock_for_printk(this_cpu))
+- console_unlock();
++ printk_cpu = UINT_MAX;
++ if (in_nmi()) {
++ spin_unlock(&logbuf_lock);
++ } else {
++ if (console_trylock_for_printk(this_cpu))
++ console_unlock();
+
+- /*
+- * We are calling this outside of the lock just to make sure
+- * that the printk which raced with NMI had a chance to do
+- * some progress since it has been interrupted.
+- * Do not try to handle pending NMI messages from NMI as
+- * we would need to take logbuf_lock and we could deadlock.
+- */
+- if (!in_nmi())
++ /*
++ * We are calling this outside of the lock just to make
++ * sure that the printk which raced with NMI had a
++ * chance to do some progress since it has been
++ * interrupted.
++ * Do not try to handle pending NMI messages from NMI as
++ * we would need to take logbuf_lock and we could
++ * deadlock.
++ */
+ handle_nmi_delayed_printk();
++ }
+ } else
+ spin_unlock(&nmi_logbuf_lock);
+
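The control flow this patch gives vprintk boils down to: flush to the console
only when not in NMI, and let a later regular printk pick up whatever an NMI
left behind. A rough user-space model of that decision is sketched below;
in_nmi_ctx and the helper functions are stand-ins for the kernel primitives,
not real APIs.

#include <stdbool.h>
#include <stdio.h>

static bool in_nmi_ctx;              /* stand-in for in_nmi() */
static bool nmi_messages_pending;

static void flush_to_console(void)
{
        /* models console_trylock_for_printk() + console_unlock() */
        printf("flushing buffered messages to the console\n");
}

static void handle_delayed_nmi_messages(void)
{
        if (nmi_messages_pending) {
                printf("copying NMI ring buffer into the regular log\n");
                nmi_messages_pending = false;
                flush_to_console();
        }
}

static void finish_message(void)
{
        if (in_nmi_ctx) {
                /* NMI: only record that data is waiting; never touch consoles */
                nmi_messages_pending = true;
                return;
        }
        flush_to_console();
        /* a later, non-NMI printk picks up whatever NMIs left behind */
        handle_delayed_nmi_messages();
}

int main(void)
{
        in_nmi_ctx = true;  finish_message();   /* message sits in the NMI buffer */
        in_nmi_ctx = false; finish_message();   /* the next printk flushes it */
        return 0;
}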
diff --git a/patches.fixes/printk-do-not-use-printk_cpu-from-finish_printk.patch b/patches.fixes/printk-do-not-use-printk_cpu-from-finish_printk.patch
new file mode 100644
index 0000000000..b028753d86
--- /dev/null
+++ b/patches.fixes/printk-do-not-use-printk_cpu-from-finish_printk.patch
@@ -0,0 +1,73 @@
+From: Michal Hocko <mhocko@suse.cz>
+Subject: printk: Do not use printk_cpu from finish_printk
+Patch-mainline: no
+References: bnc#831949
+
+finish_printk is called for both regular and nmi ringbuffers but printk_cpu is
+updated only for the first one. This means that NMI context might access an out
+of date or even reset value, which confuses cpu_clock and its access to per-cpu
+sched_clock_data and either blows up accessing invalid memory or corrupts the
+target memory.
+
+Using printk_cpu blindly doesn't make any sense anyway, so use the local CPU
+instead.
+
+Here is a spectacular crash we have seen during bnc#831949 testing:
+PID: 0 TASK: ffff8a9bedeae540 CPU: 57 COMMAND: "kworker/0:1"
+ #0 [ffff8dc01f9c78a0] machine_kexec at ffffffff810267be
+ #1 [ffff8dc01f9c78f0] crash_kexec at ffffffff810a403a
+ #2 [ffff8dc01f9c79c0] oops_end at ffffffff814487a8
+ #3 [ffff8dc01f9c79e0] __bad_area_nosemaphore at ffffffff81032575
+ #4 [ffff8dc01f9c7aa0] do_page_fault at ffffffff8144ad06
+ #5 [ffff8dc01f9c7ba0] page_fault at ffffffff81447965
+ [exception RIP: sched_clock_cpu+112]
+ RIP: ffffffff81082440 RSP: ffff8dc01f9c7c58 RFLAGS: 00010046
+ RAX: 00000488529e9648 RBX: 0000000f00011c8e RCX: 00000488529e9648
+ RDX: 00000488529e9648 RSI: 00000488529e9648 RDI: ffff8dc01f9d1c90
+ RBP: ffff8dc01f9d1c80 R8: 0000000f00011c9e R9: 0000000000000000
+ R10: 0000000000000001 R11: 0000000000000000 R12: ffffffff81059f60
+ R13: 0000000000000000 R14: ffffffff81dcfee0 R15: ffffffff81dcfee0
+ ORIG_RAX: ffffffffffffffff CS: 0010 SS: 0018
+ #6 [ffff8dc01f9c7c70] cpu_clock at ffffffff81082527
+ #7 [ffff8dc01f9c7c80] finish_printk at ffffffff8105ad21
+ #8 [ffff8dc01f9c7d20] vprintk at ffffffff8105af68
+ #9 [ffff8dc01f9c7dc0] print_modules at ffffffff81094eeb
+#10 [ffff8dc01f9c7e00] show_registers at ffffffff810045c7
+#11 [ffff8dc01f9c7e80] show_regs at ffffffff8100a859
+#12 [ffff8dc01f9c7e90] arch_trigger_all_cpu_backtrace_handler at
+ffffffff81448e4a
+#13 [ffff8dc01f9c7eb0] notifier_call_chain at ffffffff8144aec7
+#14 [ffff8dc01f9c7ee0] __atomic_notifier_call_chain at ffffffff8144af0d
+#15 [ffff8dc01f9c7ef0] notify_die at ffffffff8144af5d
+#16 [ffff8dc01f9c7f20] default_do_nmi at ffffffff8144840b
+#17 [ffff8dc01f9c7f40] do_nmi at ffffffff814485c8
+#18 [ffff8dc01f9c7f50] nmi at ffffffff81447c20
+ [exception RIP: mwait_idle+423]
+ RIP: ffffffff8100b177 RSP: ffff8a9bedeb1f18 RFLAGS: 00000246
+ RAX: 0000000000000000 RBX: ffff8a9bedeb0010 RCX: 0000000000000000
+ RDX: 0000000000000000 RSI: ffff8a9bedeb1fd8 RDI: ffffffff81d2a108
+ RBP: 0000000000000039 R8: 0000000000000000 R9: 0000000000000000
+ R10: 0000000000000000 R11: ffffffff81020060 R12: 0000000000000000
+ R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000
+ ORIG_RAX: ffffffffffffffff CS: 0010 SS: 0018
+--- <NMI exception stack> ---
+#19 [ffff8a9bedeb1f18] mwait_idle at ffffffff8100b177
+#20 [ffff8a9bedeb1f30] cpu_idle at ffffffff81002106
+
+Signed-off-by: Michal Hocko <mhocko@suse.cz>
+
+---
+ kernel/printk.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/kernel/printk.c
++++ b/kernel/printk.c
+@@ -990,7 +990,7 @@ static int finish_printk(char *msg, int
+ unsigned long long t;
+ unsigned long nanosec_rem;
+
+- t = cpu_clock(printk_cpu);
++ t = cpu_clock(smp_processor_id());
+ nanosec_rem = do_div(t, 1000000000);
+ tlen = sprintf(tbuf, "[%5lu.%06lu] ",
+ (unsigned long) t,
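The one-line fix feeds the local CPU's clock into the same
"[seconds.microseconds]" formatting as before. A small user-space sketch of
that formatting follows, with clock_gettime standing in for
cpu_clock(smp_processor_id()); it is an approximation, not the kernel path.

#include <stdio.h>
#include <time.h>

int main(void)
{
        struct timespec ts;
        unsigned long long t;
        unsigned long nanosec_rem;
        char tbuf[32];

        clock_gettime(CLOCK_MONOTONIC, &ts);
        t = (unsigned long long)ts.tv_sec * 1000000000ULL + ts.tv_nsec;

        nanosec_rem = t % 1000000000ULL;   /* user-space stand-in for do_div() */
        t /= 1000000000ULL;

        snprintf(tbuf, sizeof(tbuf), "[%5lu.%06lu] ",
                 (unsigned long)t, nanosec_rem / 1000);
        printf("%stimestamped message\n", tbuf);
        return 0;
}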
diff --git a/patches.fixes/printk-extract-ringbuffer-handling.patch b/patches.fixes/printk-extract-ringbuffer-handling.patch
new file mode 100644
index 0000000000..c1a6342b7a
--- /dev/null
+++ b/patches.fixes/printk-extract-ringbuffer-handling.patch
@@ -0,0 +1,201 @@
+From: Michal Hocko <mhocko@suse.cz>
+Subject: printk: extract ringbuffer handling from vprintk
+Patch-mainline: not yet
+References: bnc#831949
+
+This is just a preparatory patch which moves the ringbuffer handling code from
+vprintk into a helper function (finish_printk). This covers new line handling,
+copying the temporary buffer into the ring buffer, and the log level.
+
+While at it, do not use printk_buf directly but hide it behind a local
+variable, because later patches will introduce a new temporary buffer.
+
+This patch doesn't introduce any functional changes.
+
+Signed-off-by: Michal Hocko <mhocko@suse.cz>
+
+---
+ kernel/printk.c | 124 ++++++++++++++++++++++++++++++--------------------------
+ 1 file changed, 67 insertions(+), 57 deletions(-)
+
+--- a/kernel/printk.c
++++ b/kernel/printk.c
+@@ -858,61 +858,20 @@ static inline void printk_delay(void)
+ }
+ }
+
+-asmlinkage int vprintk(const char *fmt, va_list args)
++static int finish_printk(char *msg, int printed_len)
+ {
+- int printed_len = 0;
+ int current_log_level = default_message_loglevel;
+- unsigned long flags;
+- int this_cpu;
+- char *p;
++ char *msg_start = msg;
+ size_t plen;
+ char special;
++ void (*emit_char)(char c) = emit_log_char;
+
+- boot_delay_msec();
+- printk_delay();
+-
+- preempt_disable();
+- /* This stops the holder of console_sem just where we want him */
+- raw_local_irq_save(flags);
+- this_cpu = smp_processor_id();
+-
+- /*
+- * Ouch, printk recursed into itself!
+- */
+- if (unlikely(printk_cpu == this_cpu)) {
+- /*
+- * If a crash is occurring during printk() on this CPU,
+- * then try to get the crash message out but make sure
+- * we can't deadlock. Otherwise just return to avoid the
+- * recursion and return - but flag the recursion so that
+- * it can be printed at the next appropriate moment:
+- */
+- if (!oops_in_progress) {
+- recursion_bug = 1;
+- goto out_restore_irqs;
+- }
+- zap_locks();
+- }
+-
+- lockdep_off();
+- spin_lock(&logbuf_lock);
+- printk_cpu = this_cpu;
+-
+- if (recursion_bug) {
+- recursion_bug = 0;
+- strcpy(printk_buf, recursion_bug_msg);
+- printed_len = strlen(recursion_bug_msg);
+- }
+- /* Emit the output into the temporary buffer */
+- printed_len += vscnprintf(printk_buf + printed_len,
+- sizeof(printk_buf) - printed_len, fmt, args);
+-
+- p = printk_buf;
++ /* TODO new_text_line needs a special handling for nmi_ring */
+
+ /* Read log level and handle special printk prefix */
+- plen = log_prefix(p, &current_log_level, &special);
++ plen = log_prefix(msg, &current_log_level, &special);
+ if (plen) {
+- p += plen;
++ msg += plen;
+
+ switch (special) {
+ case 'c': /* Strip <c> KERN_CONT, continue line */
+@@ -922,7 +881,7 @@ asmlinkage int vprintk(const char *fmt,
+ plen = 0;
+ default:
+ if (!new_text_line) {
+- emit_log_char('\n');
++ emit_char('\n');
+ new_text_line = 1;
+ }
+ }
+@@ -932,7 +891,7 @@ asmlinkage int vprintk(const char *fmt,
+ * Copy the output into log_buf. If the caller didn't provide
+ * the appropriate log prefix, we insert them here
+ */
+- for (; *p; p++) {
++ for (; *msg; msg++) {
+ if (new_text_line) {
+ new_text_line = 0;
+
+@@ -941,13 +900,13 @@ asmlinkage int vprintk(const char *fmt,
+ int i;
+
+ for (i = 0; i < plen; i++)
+- emit_log_char(printk_buf[i]);
++ emit_char(msg_start[i]);
+ printed_len += plen;
+ } else {
+ /* Add log prefix */
+- emit_log_char('<');
+- emit_log_char(current_log_level + '0');
+- emit_log_char('>');
++ emit_char('<');
++ emit_char(current_log_level + '0');
++ emit_char('>');
+ printed_len += 3;
+ }
+
+@@ -965,19 +924,70 @@ asmlinkage int vprintk(const char *fmt,
+ nanosec_rem / 1000);
+
+ for (tp = tbuf; tp < tbuf + tlen; tp++)
+- emit_log_char(*tp);
++ emit_char(*tp);
+ printed_len += tlen;
+ }
+
+- if (!*p)
++ if (!*msg)
+ break;
+ }
+
+- emit_log_char(*p);
+- if (*p == '\n')
++ emit_char(*msg);
++ if (*msg == '\n')
+ new_text_line = 1;
+ }
+
++ return printed_len;
++}
++
++asmlinkage int vprintk(const char *fmt, va_list args)
++{
++ int printed_len = 0;
++ unsigned long flags;
++ int this_cpu;
++ char *buf = printk_buf;
++ unsigned buf_len = sizeof(printk_buf);
++
++ boot_delay_msec();
++ printk_delay();
++
++ preempt_disable();
++ /* This stops the holder of console_sem just where we want him */
++ raw_local_irq_save(flags);
++ this_cpu = smp_processor_id();
++
++ /*
++ * Ouch, printk recursed into itself!
++ */
++ if (unlikely(printk_cpu == this_cpu)) {
++ /*
++ * If a crash is occurring during printk() on this CPU,
++ * then try to get the crash message out but make sure
++ * we can't deadlock. Otherwise just return to avoid the
++ * recursion and return - but flag the recursion so that
++ * it can be printed at the next appropriate moment:
++ */
++ if (!oops_in_progress) {
++ recursion_bug = 1;
++ goto out_restore_irqs;
++ }
++ zap_locks();
++ }
++
++ lockdep_off();
++ spin_lock(&logbuf_lock);
++ printk_cpu = this_cpu;
++
++ if (recursion_bug) {
++ recursion_bug = 0;
++ strcpy(printk_buf, recursion_bug_msg);
++ printed_len = strlen(recursion_bug_msg);
++ }
++ /* Emit the output into the temporary buffer */
++ printed_len += vscnprintf(buf + printed_len,
++ buf_len - printed_len, fmt, args);
++ printed_len = finish_printk(buf, printed_len);
++
+ /*
+ * Try to acquire and then immediately release the
+ * console semaphore. The release will do all the
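The key detail of the refactoring is that finish_printk now pushes every
character through an emit_char function pointer, which a follow-up patch can
point at an NMI ring buffer writer instead of emit_log_char. A minimal sketch
of that indirection; the function names here are illustrative only.

#include <stdio.h>

static void emit_regular_char(char c) { printf("log_buf <- '%c'\n", c); }
static void emit_nmi_char(char c)     { printf("nmi_buf <- '%c'\n", c); }

/* One formatting loop, two destinations, selected by a function pointer. */
static void copy_msg(const char *msg, int nmi_ring)
{
        void (*emit_char)(char) = nmi_ring ? emit_nmi_char : emit_regular_char;

        for (; *msg; msg++)
                emit_char(*msg);
}

int main(void)
{
        copy_msg("ok", 0);   /* regular printk path */
        copy_msg("ok", 1);   /* NMI path, enabled by a follow-up patch */
        return 0;
}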
diff --git a/patches.fixes/printk-fix-softlockups-during-heavy-printing.patch b/patches.fixes/printk-fix-softlockups-during-heavy-printing.patch
index c41b388c05..d16ea34ee4 100644
--- a/patches.fixes/printk-fix-softlockups-during-heavy-printing.patch
+++ b/patches.fixes/printk-fix-softlockups-during-heavy-printing.patch
@@ -15,21 +15,24 @@ queued work.
Signed-off-by: Jan Kara <jack@suse.cz>
-diff -rupX /crypted/home/jack/.kerndiffexclude linux-3.0-SLE11-SP2/kernel/printk.c linux-3.0-SLE11-SP2-1-console_lock/kernel/printk.c
---- linux-3.0-SLE11-SP2/kernel/printk.c 2012-12-10 13:13:02.881717602 +0100
-+++ linux-3.0-SLE11-SP2-1-console_lock/kernel/printk.c 2012-12-13 10:45:08.809885871 +0100
-@@ -118,6 +118,10 @@ static unsigned log_start; /* Index into
- static unsigned con_start; /* Index into log_buf: next char to be sent to consoles */
- static unsigned log_end; /* Index into log_buf: most-recently-written-char + 1 */
+---
+ kernel/printk.c | 68 ++++++++++++++++++++++++++++++++++++++++++++++++--------
+ 1 file changed, 59 insertions(+), 9 deletions(-)
+
+--- a/kernel/printk.c
++++ b/kernel/printk.c
+@@ -114,6 +114,10 @@ static DEFINE_SPINLOCK(nmi_logbuf_lock);
+ #define __LOG_BUF(buf, len, idx) (buf[(idx) & ((len)-1)])
+ #define LOG_BUF(idx) (__LOG_BUF(log_buf, log_buf_len, idx))
+/* Worker to print accumulated data to console when there's too much of it */
+static void printk_worker(struct work_struct *work);
+static DECLARE_WORK(printk_work, printk_worker);
+
/*
- * If exclusive_console is non-NULL then only this console is to be printed to.
- */
-@@ -1241,6 +1245,13 @@ void wake_up_klogd(void)
+ * The indices into log_buf are not constrained to log_buf_len - they
+ * must be masked before subscripting
+@@ -1426,6 +1430,13 @@ void wake_up_klogd(void)
this_cpu_write(printk_pending, 1);
}
@@ -43,7 +46,7 @@ diff -rupX /crypted/home/jack/.kerndiffexclude linux-3.0-SLE11-SP2/kernel/printk
/**
* console_unlock - unlock the console system
*
-@@ -1249,39 +1260,54 @@ void wake_up_klogd(void)
+@@ -1434,39 +1445,54 @@ void wake_up_klogd(void)
*
* While the console_lock was held, console output may have been buffered
* by printk(). If this is the case, console_unlock(); emits
@@ -107,7 +110,7 @@ diff -rupX /crypted/home/jack/.kerndiffexclude linux-3.0-SLE11-SP2/kernel/printk
console_locked = 0;
/* Release the exclusive_console once it is used */
-@@ -1292,9 +1318,33 @@ void console_unlock(void)
+@@ -1477,9 +1503,33 @@ void console_unlock(void)
spin_unlock_irqrestore(&logbuf_lock, flags);
if (wake_klogd)
wake_up_klogd();
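The refreshed softlockup patch keeps its original idea: console_unlock only
prints a bounded amount of data directly and hands the rest over to a queued
worker, so one CPU is never stuck printing forever. A user-space sketch of
that bounding logic, assuming an illustrative MAX_DIRECT_CHARS threshold and
a queue_printk_worker stub in place of schedule_work(&printk_work).

#include <stdio.h>

#define MAX_DIRECT_CHARS 16

static unsigned buffered_chars = 40;     /* pretend this much is queued */

static void queue_printk_worker(void)
{
        /* models deferring the remainder to the printk worker */
        printf("deferring %u remaining chars to the printk worker\n",
               buffered_chars);
}

static void console_flush(void)
{
        unsigned printed = 0;

        while (buffered_chars && printed < MAX_DIRECT_CHARS) {
                buffered_chars--;        /* "print" one character */
                printed++;
        }
        printf("printed %u chars directly\n", printed);
        if (buffered_chars)
                queue_printk_worker();   /* too much left: let the worker finish */
}

int main(void)
{
        console_flush();
        return 0;
}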
diff --git a/patches.fixes/printk-make-nmi-ringbuffer-length-independent.patch b/patches.fixes/printk-make-nmi-ringbuffer-length-independent.patch
new file mode 100644
index 0000000000..e3dc1692b4
--- /dev/null
+++ b/patches.fixes/printk-make-nmi-ringbuffer-length-independent.patch
@@ -0,0 +1,115 @@
+From: Michal Hocko <mhocko@suse.cz>
+Subject: printk: Make NMI ringbuffer size independent on log_buf_len
+Patch-mainline: not yet
+References: bnc#831949
+
+Having the NMI ring buffer the same size as the regular ring buffer might be
+considered a waste of memory, especially when the regular ring buffer is
+increased to a big value. So create a separate kernel parameter to set nmi_log_buf_len.
+
+Signed-off-by: Michal Hocko <mhocko@suse.cz>
+
+---
+ kernel/printk.c | 47 ++++++++++++++++++++++++++++++++++-------------
+ 1 file changed, 34 insertions(+), 13 deletions(-)
+
+--- a/kernel/printk.c
++++ b/kernel/printk.c
+@@ -112,8 +112,8 @@ static DEFINE_SPINLOCK(logbuf_lock);
+ static DEFINE_SPINLOCK(nmi_logbuf_lock);
+
+ #define LOG_BUF_MASK (log_buf_len-1)
+-#define __LOG_BUF(buf, idx) (buf[(idx) & LOG_BUF_MASK])
+-#define LOG_BUF(idx) (__LOG_BUF(log_buf, idx))
++#define __LOG_BUF(buf, len, idx) (buf[(idx) & ((len)-1)])
++#define LOG_BUF(idx) (__LOG_BUF(log_buf, log_buf_len, idx))
+
+ /*
+ * The indices into log_buf are not constrained to log_buf_len - they
+@@ -161,6 +161,7 @@ static char __log_buf[__LOG_BUF_LEN];
+ static char *log_buf = __log_buf;
+ static char *nmi_log_buf = NULL;
+ static int log_buf_len = __LOG_BUF_LEN;
++static int nmi_log_buf_len = __LOG_BUF_LEN;
+ static unsigned logged_chars; /* Number of chars produced since last read+clear operation */
+ static int saved_console_loglevel = -1;
+
+@@ -184,6 +185,7 @@ void log_buf_kexec_setup(void)
+
+ /* requested log_buf_len from kernel cmdline */
+ static unsigned long __initdata new_log_buf_len;
++static unsigned long __initdata new_nmi_log_buf_len;
+
+ /* save requested log_buf_len since it's too early to process it */
+ static int __init log_buf_len_setup(char *str)
+@@ -199,6 +201,19 @@ static int __init log_buf_len_setup(char
+ }
+ early_param("log_buf_len", log_buf_len_setup);
+
++static int __init nmi_log_buf_len_setup(char *str)
++{
++ unsigned size = memparse(str, &str);
++
++ if (size)
++ size = roundup_pow_of_two(size);
++ if (size > nmi_log_buf_len)
++ new_nmi_log_buf_len = size;
++
++ return 0;
++}
++early_param("nmi_log_buf_len", nmi_log_buf_len_setup);
++
+ char * __init alloc_log_buf(int early, unsigned len)
+ {
+ if (early) {
+@@ -221,13 +236,19 @@ void __init setup_log_buf(int early)
+ int free;
+
+ if (!nmi_log_buf) {
+- unsigned len = (new_log_buf_len > log_buf_len) ? new_log_buf_len : log_buf_len;
+- nmi_log_buf = alloc_log_buf(early, len);
+- if (!nmi_log_buf)
+- pr_err("%ld bytes not available for nmi ring buffer\n",
+- new_log_buf_len);
+- else
+- pr_info("nmi ring buffer: %d\n", len);
++ unsigned len = (nmi_log_buf_len > new_nmi_log_buf_len) ?
++ nmi_log_buf_len: new_nmi_log_buf_len;
++
++ if (len) {
++ nmi_log_buf = alloc_log_buf(early, len);
++ if (!nmi_log_buf)
++ pr_err("%ld bytes not available for nmi ring buffer\n",
++ len);
++ else {
++ nmi_log_buf_len = len;
++ pr_info("nmi ring buffer: %d\n", len);
++ }
++ }
+ }
+
+ if (!new_log_buf_len)
+@@ -709,7 +730,7 @@ static void emit_log_char(char c)
+
+ static void emit_nmi_log_char(char c)
+ {
+- __LOG_BUF(nmi_log_buf, nmi_log_end) = c;
++ __LOG_BUF(nmi_log_buf, nmi_log_buf_len, nmi_log_end) = c;
+ /*
+ * Make sure that the buffer content is visible before nmi_log_end
+ * for out of lock access so that we can be sure that the content
+@@ -897,11 +918,11 @@ static void handle_nmi_delayed_printk(vo
+ break;
+
+ /* Make sure the ring buffer doesn't overflow */
+- if (end_idx - idx > log_buf_len)
+- idx = end_idx - log_buf_len;
++ if (end_idx - idx > nmi_log_buf_len)
++ idx = end_idx - nmi_log_buf_len;
+
+ smp_rmb();
+- emit_log_char(__LOG_BUF(nmi_log_buf, idx));
++ emit_log_char(__LOG_BUF(nmi_log_buf, nmi_log_buf_len, idx));
+ }
+ /* Nobody touches nmi_log_buf except for us and we are locked */
+ nmi_log_start = idx;
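The new nmi_log_buf_len= parameter is parsed with memparse and rounded up to a
power of two before the buffer is sized. A user-space approximation of that
parsing follows; parse_size and round_up_pow2 mimic, but are not,
memparse() and roundup_pow_of_two().

#include <stdio.h>
#include <stdlib.h>

static unsigned long parse_size(const char *s)
{
        char *end;
        unsigned long v = strtoul(s, &end, 0);

        switch (*end) {
        case 'G': case 'g': v <<= 10; /* fall through */
        case 'M': case 'm': v <<= 10; /* fall through */
        case 'K': case 'k': v <<= 10; break;
        }
        return v;
}

static unsigned long round_up_pow2(unsigned long v)
{
        unsigned long r = 1;

        while (r < v)
                r <<= 1;
        return r;
}

int main(void)
{
        const char *cmdline_value = "96K";     /* e.g. nmi_log_buf_len=96K */
        unsigned long len = round_up_pow2(parse_size(cmdline_value));

        printf("nmi ring buffer would be %lu bytes\n", len);   /* 131072 */
        return 0;
}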
diff --git a/patches.fixes/printk-safe-nmi-handling.patch b/patches.fixes/printk-safe-nmi-handling.patch
new file mode 100644
index 0000000000..29aa746eac
--- /dev/null
+++ b/patches.fixes/printk-safe-nmi-handling.patch
@@ -0,0 +1,229 @@
+From: Michal Hocko <mhocko@suse.cz>
+Subject: printk: NMI safe printk
+Patch-mainline: not yet
+References: bnc#831949
+
+"Never ever even think about calling printk from NMI context", they said. Yes,
+calling anything from NMI that uses locks internally and it might be called
+from outside of NMI context as well is broken by definition. Printk is one such
+example. Unfortunately there are circumstances when calling printk from NMI is
+very useful. E.g. all WARN.*(in_nmi()) would be much more helpful if they
+didn't lockup the machine.
+
+Another example would be arch_trigger_all_cpu_backtrace for x86, which uses NMI
+to dump traces on all CPUs (either triggered by sysrq+l or from the RCU stall
+detector).
+
+This patch prevents a deadlock on logbuf_lock by using trylock rather than
+spin_lock and falling back to delayed message dumping from NMI context. It
+uses an NMI-specific ring buffer to store the message and relies on the current
+logbuf_lock holder to copy the content from nmi_log_buf to the standard
+log_buf after it is done with its own business.
+
+In order to synchronize parallel printks from NMI context a new lock is
+introduced. It is held only from NMI context and doesn't nest inside any
+other lock, so it is safe against deadlocks.
+
+handle_nmi_delayed_printk, which is responsible for the nmi->regular ring buffer
+copying, has to be prepared to race with an ongoing NMI updating the NMI ring
+buffer. This is possible because emit_nmi_log_char only advances the index to
+the end of the buffer. nmi_log_start is updated only from !NMI context with
+logbuf_lock held.
+
+Signed-off-by: Michal Hocko <mhocko@suse.cz>
+
+---
+ kernel/printk.c | 124 ++++++++++++++++++++++++++++++++++++++++++++++++++------
+ 1 file changed, 111 insertions(+), 13 deletions(-)
+
+--- a/kernel/printk.c
++++ b/kernel/printk.c
+@@ -102,6 +102,15 @@ static int console_locked, console_suspe
+ */
+ static DEFINE_SPINLOCK(logbuf_lock);
+
++/*
++ * nmi_logbuf_lock protects nmi_log_buf, log_start and log_end from NMI context
++ * when logbuf_lock is held to synchronize NMI contexts which try to do printk
++ * at the same time. NEVER EVER take this lock outside of NMI context.
++ * non NMI consumer of nmi_log_buf has to take logbuf_lock and be careful about
++ * racing with NMI context (see handle_nmi_delayed_printk).
++ */
++static DEFINE_SPINLOCK(nmi_logbuf_lock);
++
+ #define LOG_BUF_MASK (log_buf_len-1)
+ #define __LOG_BUF(buf, idx) (buf[(idx) & LOG_BUF_MASK])
+ #define LOG_BUF(idx) (__LOG_BUF(log_buf, idx))
+@@ -698,6 +707,18 @@ static void emit_log_char(char c)
+ logged_chars++;
+ }
+
++static void emit_nmi_log_char(char c)
++{
++ __LOG_BUF(nmi_log_buf, nmi_log_end) = c;
++ /*
++ * Make sure that the buffer content is visible before nmi_log_end
++ * for out of lock access so that we can be sure that the content
++ * is up-to-date
++ */
++ smp_wmb();
++ nmi_log_end++;
++}
++
+ /*
+ * Zap console related locks when oopsing. Only zap at most once
+ * every 10 seconds, to leave time for slow consoles to print a
+@@ -835,6 +856,7 @@ static const char recursion_bug_msg [] =
+ static int recursion_bug;
+ static int new_text_line = 1;
+ static char printk_buf[1024];
++static char nmi_printk_buf[1024];
+
+ int printk_delay_msec __read_mostly;
+
+@@ -850,13 +872,50 @@ static inline void printk_delay(void)
+ }
+ }
+
+-static int finish_printk(char *msg, int printed_len)
++/*
++ * Called from non-NMI context to move nmi ring buffer into the regular printk
++ * ring buffer
++ */
++static void handle_nmi_delayed_printk(void)
++{
++ unsigned end_idx, start_idx, idx;
++
++ end_idx = ACCESS_ONCE(nmi_log_end);
++ start_idx = ACCESS_ONCE(nmi_log_start);
++
++ if (likely(end_idx == start_idx))
++ return;
++
++ spin_lock(&logbuf_lock);
++ for (idx = nmi_log_start; ; idx++) {
++ /*
++ * nmi_log_end might be updated from NMI context. Make
++ * sure we refetch a new value every loop invocation
++ */
++ end_idx = ACCESS_ONCE(nmi_log_end);
++ if (idx == end_idx)
++ break;
++
++ /* Make sure the ring buffer doesn't overflow */
++ if (end_idx - idx > log_buf_len)
++ idx = end_idx - log_buf_len;
++
++ smp_rmb();
++ emit_log_char(__LOG_BUF(nmi_log_buf, idx));
++ }
++ /* Nobody touches nmi_log_buf except for us and we are locked */
++ nmi_log_start = idx;
++ if (console_trylock_for_printk(smp_processor_id()))
++ console_unlock();
++}
++
++static int finish_printk(char *msg, int printed_len, bool nmi_ring)
+ {
+ int current_log_level = default_message_loglevel;
+ char *msg_start = msg;
+ size_t plen;
+ char special;
+- void (*emit_char)(char c) = emit_log_char;
++ void (*emit_char)(char c) = (nmi_ring) ? emit_nmi_log_char : emit_log_char;
+
+ /* TODO new_text_line needs a special handling for nmi_ring */
+
+@@ -939,6 +998,7 @@ asmlinkage int vprintk(const char *fmt,
+ int this_cpu;
+ char *buf = printk_buf;
+ unsigned buf_len = sizeof(printk_buf);
++ bool in_nmi_delayed_printk = false;
+
+ boot_delay_msec();
+ printk_delay();
+@@ -951,7 +1011,7 @@ asmlinkage int vprintk(const char *fmt,
+ /*
+ * Ouch, printk recursed into itself!
+ */
+- if (unlikely(printk_cpu == this_cpu)) {
++ if (!in_nmi() && unlikely(printk_cpu == this_cpu)) {
+ /*
+ * If a crash is occurring during printk() on this CPU,
+ * then try to get the crash message out but make sure
+@@ -967,18 +1027,43 @@ asmlinkage int vprintk(const char *fmt,
+ }
+
+ lockdep_off();
+- spin_lock(&logbuf_lock);
+- printk_cpu = this_cpu;
+-
+- if (recursion_bug) {
+- recursion_bug = 0;
+- strcpy(printk_buf, recursion_bug_msg);
+- printed_len = strlen(recursion_bug_msg);
++ /*
++ * Make sure we are not going to deadlock when we managed to preempt the
++ * currently running printk from NMI. Copy the current message into nmi
++ * ring buffer and let the current lock owner to print the message after
++ * he is back on CPU.
++ */
++ if (!spin_trylock(&logbuf_lock)) {
++ if (!in_nmi()) {
++ spin_lock(&logbuf_lock);
++ } else {
++ if (!nmi_log_buf) {
++ lockdep_on();
++ goto out_restore_irqs;
++ }
++ /*
++ * The lock is allowed to be taken only from NMI context
++ * to synchronize NMI printk callers.
++ */
++ spin_lock(&nmi_logbuf_lock);
++ buf = nmi_printk_buf;
++ buf_len = sizeof(nmi_printk_buf);
++ in_nmi_delayed_printk = true;
++ }
+ }
++ if (!in_nmi_delayed_printk) {
++ printk_cpu = this_cpu;
++ if (recursion_bug) {
++ recursion_bug = 0;
++ strcpy(buf, recursion_bug_msg);
++ printed_len = strlen(recursion_bug_msg);
++ }
++ }
++
+ /* Emit the output into the temporary buffer */
+ printed_len += vscnprintf(buf + printed_len,
+ buf_len - printed_len, fmt, args);
+- printed_len = finish_printk(buf, printed_len);
++ printed_len = finish_printk(buf, printed_len, in_nmi_delayed_printk);
+
+ /*
+ * Try to acquire and then immediately release the
+@@ -990,8 +1075,21 @@ asmlinkage int vprintk(const char *fmt,
+ * will release 'logbuf_lock' regardless of whether it
+ * actually gets the semaphore or not.
+ */
+- if (console_trylock_for_printk(this_cpu))
+- console_unlock();
++ if (!in_nmi_delayed_printk) {
++ if (console_trylock_for_printk(this_cpu))
++ console_unlock();
++
++ /*
++ * We are calling this outside of the lock just to make sure
++ * that the printk which raced with NMI had a chance to do
++ * some progress since it has been interrupted.
++ * Do not try to handle pending NMI messages from NMI as
++ * we would need to take logbuf_lock and we could deadlock.
++ */
++ if (!in_nmi())
++ handle_nmi_delayed_printk();
++ } else
++ spin_unlock(&nmi_logbuf_lock);
+
+ lockdep_on();
+ out_restore_irqs:
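The heart of the patch is the lock-free hand-off between emit_nmi_log_char
(producer, running in NMI context) and handle_nmi_delayed_printk (consumer,
running under logbuf_lock): write the character, then publish the new end
index; read the end index, then the characters. A self-contained sketch of
that ordering, using C11 release/acquire in place of the smp_wmb()/smp_rmb()
pairing; all names and constants here are illustrative, not kernel code.

#include <stdatomic.h>
#include <stdio.h>

#define NMI_RING_LEN 64                          /* power of two */
#define NMI_RING_AT(idx) (nmi_ring[(idx) & (NMI_RING_LEN - 1)])

static char nmi_ring[NMI_RING_LEN];
static unsigned nmi_start;                       /* consumer-owned */
static _Atomic unsigned nmi_end;                 /* producer-owned */

/* NMI side: store the character first, only then make the index visible. */
static void emit_nmi_char(char c)
{
        unsigned end = atomic_load_explicit(&nmi_end, memory_order_relaxed);

        NMI_RING_AT(end) = c;
        /* release ordering plays the role of smp_wmb() before nmi_log_end++ */
        atomic_store_explicit(&nmi_end, end + 1, memory_order_release);
}

/* Non-NMI side: drain whatever has been published so far. */
static void drain_nmi_ring(void)
{
        unsigned end = atomic_load_explicit(&nmi_end, memory_order_acquire);

        if (end - nmi_start > NMI_RING_LEN)      /* producer overran us */
                nmi_start = end - NMI_RING_LEN;
        for (; nmi_start != end; nmi_start++)
                putchar(NMI_RING_AT(nmi_start));
}

int main(void)
{
        const char *msg = "deferred NMI message\n";

        for (const char *p = msg; *p; p++)
                emit_nmi_char(*p);               /* "from NMI" */
        drain_nmi_ring();                        /* later, from a regular printk */
        return 0;
}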
diff --git a/patches.fixes/x86-Add-workaround-to-NMI-iret-woes.patch b/patches.fixes/x86-Add-workaround-to-NMI-iret-woes.patch
new file mode 100644
index 0000000000..3e29234b94
--- /dev/null
+++ b/patches.fixes/x86-Add-workaround-to-NMI-iret-woes.patch
@@ -0,0 +1,404 @@
+From 3f3c8b8c4b2a34776c3470142a7c8baafcda6eb0 Mon Sep 17 00:00:00 2001
+From: Steven Rostedt <srostedt@redhat.com>
+Date: Thu, 8 Dec 2011 12:36:23 -0500
+Subject: [PATCH] x86: Add workaround to NMI iret woes
+Git-commit: 3f3c8b8c4b2a34776c3470142a7c8baafcda6eb0
+Patch-mainline: 3.3-rc1
+References: bnc#831949
+
+mhocko@suse.cz:
+This patch is needed because the stack dumper (show_regs to be more precise)
+might trigger a page fault when printing the userspace opcode. This would
+trigger an iret, and if we are within NMI context we can easily blow up.
+
+In x86, when an NMI goes off, the CPU goes into an NMI context that
+prevents other NMIs to trigger on that CPU. If an NMI is suppose to
+trigger, it has to wait till the previous NMI leaves NMI context.
+At that time, the next NMI can trigger (note, only one more NMI will
+trigger, as only one can be latched at a time).
+
+The way x86 gets out of NMI context is by calling iret. The problem
+with this is that this causes problems if the NMI handle either
+triggers an exception, or a breakpoint. Both the exception and the
+breakpoint handlers will finish with an iret. If this happens while
+in NMI context, the CPU will leave NMI context and a new NMI may come
+in. As NMI handlers are not made to be re-entrant, this can cause
+havoc with the system, not to mention, the nested NMI will write
+all over the previous NMI's stack.
+
+Linus Torvalds proposed the following workaround to this problem:
+
+https://lkml.org/lkml/2010/7/14/264
+
+"In fact, I wonder if we couldn't just do a software NMI disable
+instead? Hav ea per-cpu variable (in the _core_ percpu areas that get
+allocated statically) that points to the NMI stack frame, and just
+make the NMI code itself do something like
+
+ NMI entry:
+ - load percpu NMI stack frame pointer
+ - if non-zero we know we're nested, and should ignore this NMI:
+ - we're returning to kernel mode, so return immediately by using
+"popf/ret", which also keeps NMI's disabled in the hardware until the
+"real" NMI iret happens.
+ - before the popf/iret, use the NMI stack pointer to make the NMI
+return stack be invalid and cause a fault
+ - set the NMI stack pointer to the current stack pointer
+
+ NMI exit (not the above "immediate exit because we nested"):
+ clear the percpu NMI stack pointer
+ Just do the iret.
+
+Now, the thing is, now the "iret" is atomic. If we had a nested NMI,
+we'll take a fault, and that re-does our "delayed" NMI - and NMI's
+will stay masked.
+
+And if we didn't have a nested NMI, that iret will now unmask NMI's,
+and everything is happy."
+
+I first tried to follow this advice but as I started implementing this
+code, a few gotchas showed up.
+
+One, is accessing per-cpu variables in the NMI handler.
+
+The problem is that per-cpu variables use the %gs register to get the
+variable for the given CPU. But as the NMI may happen in userspace,
+we must first perform a SWAPGS to get to it. The NMI handler already
+does this later in the code, but its too late as we have saved off
+all the registers and we don't want to do that for a disabled NMI.
+
+Peter Zijlstra suggested to keep all variables on the stack. This
+simplifies things greatly and it has the added benefit of cache locality.
+
+Two, faulting on the iret.
+
+I really wanted to make this work, but it was becoming very hacky, and
+I never got it to be stable. The iret already had a fault handler for
+userspace faulting with bad segment registers, and getting NMI to trigger
+a fault and detect it was very tricky. But for strange reasons, the system
+would usually take a double fault and crash. I never figured out why
+and decided to go with a simple "jmp" approach. The new approach I took
+also simplified things.
+
+Finally, the last problem with Linus's approach was to have the nested
+NMI handler do a ret instead of an iret to give the first NMI NMI-context
+again.
+
+The problem is that ret is much more limited than an iret. I couldn't figure
+out how to get the stack back where it belonged. I could have copied the
+current stack, pushed the return onto it, but my fear here is that there
+may be some place that writes data below the stack pointer. I know that
+is not something code should depend on, but I don't want to chance it.
+I may add this feature later, but for now, an NMI handler that loses NMI
+context will not get it back.
+
+Here's what is done:
+
+When an NMI comes in, the HW pushes the interrupt stack frame onto the
+per cpu NMI stack that is selected by the IST.
+
+A special location on the NMI stack holds a variable that is set when
+the first NMI handler runs. If this variable is set then we know that
+this is a nested NMI and we process the nested NMI code.
+
+There is still a race when this variable is cleared and an NMI comes
+in just before the first NMI does the return. For this case, if the
+variable is cleared, we also check if the interrupted stack is the
+NMI stack. If it is, then we process the nested NMI code.
+
+Why the two tests and not just test the interrupted stack?
+
+If the first NMI hits a breakpoint and loses NMI context, and then it
+hits another breakpoint and while processing that breakpoint we get a
+nested NMI. When processing a breakpoint, the stack changes to the
+breakpoint stack. If another NMI comes in here we can't rely on the
+interrupted stack to be the NMI stack.
+
+If the variable is not set and the interrupted task's stack is not the
+NMI stack, then we know this is the first NMI and we can process things
+normally. But in order to do so, we need to do a few things first.
+
+1) Set the stack variable that tells us that we are in an NMI handler
+
+2) Make two copies of the interrupt stack frame.
+ One copy is used to return on iret
+ The other is used to restore the first one if we have a nested NMI.
+
+This is what the stack will look like:
+
+ +-------------------------+
+ | original SS |
+ | original Return RSP |
+ | original RFLAGS |
+ | original CS |
+ | original RIP |
+ +-------------------------+
+ | temp storage for rdx |
+ +-------------------------+
+ | NMI executing variable |
+ +-------------------------+
+ | Saved SS |
+ | Saved Return RSP |
+ | Saved RFLAGS |
+ | Saved CS |
+ | Saved RIP |
+ +-------------------------+
+ | copied SS |
+ | copied Return RSP |
+ | copied RFLAGS |
+ | copied CS |
+ | copied RIP |
+ +-------------------------+
+ | pt_regs |
+ +-------------------------+
+
+The original stack frame contains what the HW put in when we entered
+the NMI.
+
+We store %rdx as a temp variable to use. Both the original HW stack
+frame and this %rdx storage will be clobbered by nested NMIs so we
+can not rely on them later in the first NMI handler.
+
+The next item is the special stack variable that is set when we execute
+the rest of the NMI handler.
+
+Then we have two copies of the interrupt stack. The second copy is
+modified by any nested NMIs to let the first NMI know that we triggered
+a second NMI (latched) and that we should repeat the NMI handler.
+
+If the first NMI hits an exception or breakpoint that takes it out of
+NMI context, if a second NMI comes in before the first one finishes,
+it will update the copied interrupt stack to point to a fix up location
+to trigger another NMI.
+
+When the first NMI calls iret, it will instead jump to the fix up
+location. This fix up location will copy the saved interrupt stack back
+to the copy and execute the nmi handler again.
+
+Note, the nested NMI knows enough to check if it preempted a previous
+NMI handler while it is in the fixup location. If it has, it will not
+modify the copied interrupt stack and will just leave as if nothing
+happened. As the NMI handle is about to execute again, there's no reason
+to latch now.
+
+To test all this, I forced the NMI handler to call iret and take itself
+out of NMI context. I also added assemble code to write to the serial to
+make sure that it hits the nested path as well as the fix up path.
+Everything seems to be working fine.
+
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: H. Peter Anvin <hpa@linux.intel.com>
+Cc: Thomas Gleixner <tglx@linutronix.de>
+Cc: Paul Turner <pjt@google.com>
+Cc: Frederic Weisbecker <fweisbec@gmail.com>
+Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
+Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
+Acked-by: Michal Hocko <mhocko@suse.cz>
+
+---
+ arch/x86/kernel/entry_64.S | 177 +++++++++++++++++++++++++++++++++++++++++++++
+ 1 file changed, 177 insertions(+)
+
+--- a/arch/x86/kernel/entry_64.S
++++ b/arch/x86/kernel/entry_64.S
+@@ -1480,11 +1480,166 @@ ENTRY(error_exit)
+ CFI_ENDPROC
+ END(error_exit)
+
++/*
++ * Test if a given stack is an NMI stack or not.
++ */
++ .macro test_in_nmi reg stack nmi_ret normal_ret
++ cmpq %\reg, \stack
++ ja \normal_ret
++ subq $EXCEPTION_STKSZ, %\reg
++ cmpq %\reg, \stack
++ jb \normal_ret
++ jmp \nmi_ret
++ .endm
+
+ /* runs on exception stack */
+ ENTRY(nmi)
+ INTR_FRAME
+ PARAVIRT_ADJUST_EXCEPTION_FRAME
++ /*
++ * We allow breakpoints in NMIs. If a breakpoint occurs, then
++ * the iretq it performs will take us out of NMI context.
++ * This means that we can have nested NMIs where the next
++ * NMI is using the top of the stack of the previous NMI. We
++ * can't let it execute because the nested NMI will corrupt the
++ * stack of the previous NMI. NMI handlers are not re-entrant
++ * anyway.
++ *
++ * To handle this case we do the following:
++ * Check the a special location on the stack that contains
++ * a variable that is set when NMIs are executing.
++ * The interrupted task's stack is also checked to see if it
++ * is an NMI stack.
++ * If the variable is not set and the stack is not the NMI
++ * stack then:
++ * o Set the special variable on the stack
++ * o Copy the interrupt frame into a "saved" location on the stack
++ * o Copy the interrupt frame into a "copy" location on the stack
++ * o Continue processing the NMI
++ * If the variable is set or the previous stack is the NMI stack:
++ * o Modify the "copy" location to jump to the repeate_nmi
++ * o return back to the first NMI
++ *
++ * Now on exit of the first NMI, we first clear the stack variable
++ * The NMI stack will tell any nested NMIs at that point that it is
++ * nested. Then we pop the stack normally with iret, and if there was
++ * a nested NMI that updated the copy interrupt stack frame, a
++ * jump will be made to the repeat_nmi code that will handle the second
++ * NMI.
++ */
++
++ /* Use %rdx as out temp variable throughout */
++ pushq_cfi %rdx
++
++ /*
++ * Check the special variable on the stack to see if NMIs are
++ * executing.
++ */
++ cmp $1, -8(%rsp)
++ je nested_nmi
++
++ /*
++ * Now test if the previous stack was an NMI stack.
++ * We need the double check. We check the NMI stack to satisfy the
++ * race when the first NMI clears the variable before returning.
++ * We check the variable because the first NMI could be in a
++ * breakpoint routine using a breakpoint stack.
++ */
++ lea 6*8(%rsp), %rdx
++ test_in_nmi rdx, 4*8(%rsp), nested_nmi, first_nmi
++
++nested_nmi:
++ /*
++ * Do nothing if we interrupted the fixup in repeat_nmi.
++ * It's about to repeat the NMI handler, so we are fine
++ * with ignoring this one.
++ */
++ movq $repeat_nmi, %rdx
++ cmpq 8(%rsp), %rdx
++ ja 1f
++ movq $end_repeat_nmi, %rdx
++ cmpq 8(%rsp), %rdx
++ ja nested_nmi_out
++
++1:
++ /* Set up the interrupted NMIs stack to jump to repeat_nmi */
++ leaq -6*8(%rsp), %rdx
++ movq %rdx, %rsp
++ CFI_ADJUST_CFA_OFFSET 6*8
++ pushq_cfi $__KERNEL_DS
++ pushq_cfi %rdx
++ pushfq_cfi
++ pushq_cfi $__KERNEL_CS
++ pushq_cfi $repeat_nmi
++
++ /* Put stack back */
++ addq $(11*8), %rsp
++ CFI_ADJUST_CFA_OFFSET -11*8
++
++nested_nmi_out:
++ popq_cfi %rdx
++
++ /* No need to check faults here */
++ INTERRUPT_RETURN
++
++first_nmi:
++ /*
++ * Because nested NMIs will use the pushed location that we
++ * stored in rdx, we must keep that space available.
++ * Here's what our stack frame will look like:
++ * +-------------------------+
++ * | original SS |
++ * | original Return RSP |
++ * | original RFLAGS |
++ * | original CS |
++ * | original RIP |
++ * +-------------------------+
++ * | temp storage for rdx |
++ * +-------------------------+
++ * | NMI executing variable |
++ * +-------------------------+
++ * | Saved SS |
++ * | Saved Return RSP |
++ * | Saved RFLAGS |
++ * | Saved CS |
++ * | Saved RIP |
++ * +-------------------------+
++ * | copied SS |
++ * | copied Return RSP |
++ * | copied RFLAGS |
++ * | copied CS |
++ * | copied RIP |
++ * +-------------------------+
++ * | pt_regs |
++ * +-------------------------+
++ *
++ * The saved RIP is used to fix up the copied RIP that a nested
++ * NMI may zero out. The original stack frame and the temp storage
++ * is also used by nested NMIs and can not be trusted on exit.
++ */
++ /* Set the NMI executing variable on the stack. */
++ pushq_cfi $1
++
++ /* Copy the stack frame to the Saved frame */
++ .rept 5
++ pushq_cfi 6*8(%rsp)
++ .endr
++
++ /* Make another copy, this one may be modified by nested NMIs */
++ .rept 5
++ pushq_cfi 4*8(%rsp)
++ .endr
++
++ /* Do not pop rdx, nested NMIs will corrupt it */
++ movq 11*8(%rsp), %rdx
++
++ /*
++ * Everything below this point can be preempted by a nested
++ * NMI if the first NMI took an exception. Repeated NMIs
++ * caused by an exception and nested NMI will start here, and
++ * can still be preempted by another NMI.
++ */
++restart_nmi:
+ pushq_cfi $-1
+ subq $ORIG_RAX-R15, %rsp
+ CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
+@@ -1500,10 +1655,32 @@ nmi_swapgs:
+ SWAPGS_UNSAFE_STACK
+ nmi_restore:
+ RESTORE_ALL 8
++ /* Clear the NMI executing stack variable */
++ movq $0, 10*8(%rsp)
+ jmp irq_return
+ CFI_ENDPROC
+ END(nmi)
+
++ /*
++ * If an NMI hit an iret because of an exception or breakpoint,
++ * it can lose its NMI context, and a nested NMI may come in.
++ * In that case, the nested NMI will change the preempted NMI's
++ * stack to jump to here when it does the final iret.
++ */
++repeat_nmi:
++ INTR_FRAME
++ /* Update the stack variable to say we are still in NMI */
++ movq $1, 5*8(%rsp)
++
++ /* copy the saved stack back to copy stack */
++ .rept 5
++ pushq_cfi 4*8(%rsp)
++ .endr
++
++ jmp restart_nmi
++ CFI_ENDPROC
++end_repeat_nmi:
++
+ ENTRY(ignore_sysret)
+ CFI_STARTPROC
+ mov $-ENOSYS,%eax
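The test_in_nmi assembler macro simply checks whether the interrupted stack
pointer falls inside the per-CPU NMI stack, which is how a nested NMI is
detected. A small C sketch of that range check; the constant value and the
helper name are illustrative only.

#include <stdio.h>

#define EXCEPTION_STKSZ 4096UL

/* inside [top - EXCEPTION_STKSZ, top] -> the interrupted stack is the NMI stack */
static int stack_in_nmi_region(unsigned long sp, unsigned long nmi_stack_top)
{
        return sp <= nmi_stack_top && sp >= nmi_stack_top - EXCEPTION_STKSZ;
}

int main(void)
{
        unsigned long nmi_top = 0x10000;

        printf("sp just below top: %d (nested NMI)\n",
               stack_in_nmi_region(nmi_top - 64, nmi_top));
        printf("sp elsewhere:      %d (first NMI)\n",
               stack_in_nmi_region(nmi_top - 2 * EXCEPTION_STKSZ, nmi_top));
        return 0;
}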
diff --git a/patches.fixes/x86-Do-not-schedule-while-still-in-NMI-context.patch b/patches.fixes/x86-Do-not-schedule-while-still-in-NMI-context.patch
new file mode 100644
index 0000000000..1f55b51f92
--- /dev/null
+++ b/patches.fixes/x86-Do-not-schedule-while-still-in-NMI-context.patch
@@ -0,0 +1,100 @@
+From 549c89b98c4530b278dde1a3f68ce5ebbb1e6304 Mon Sep 17 00:00:00 2001
+From: Linus Torvalds <torvalds@linux-foundation.org>
+Date: Tue, 29 Nov 2011 12:44:55 -0800
+Subject: [PATCH] x86: Do not schedule while still in NMI context
+Git-commit: 549c89b98c4530b278dde1a3f68ce5ebbb1e6304
+Patch-mainline: 3.3-rc1
+References: bnc#831949
+
+mhocko@suse.cz:
+This patch has been backported mainly as a dependency for
+patches.fixes/x86-Add-workaround-to-NMI-iret-woes.patch as the direct
+backport would be more complicated - friends do not let friends patch .S
+files.
+The patch itself should be harmless as NMIs are rare events and skipping the
+reschedule shouldn't matter even for PREEMPT_VOLUNTARY kernels.
+
+The NMI handler uses the paranoid_exit routine that checks the
+NEED_RESCHED flag, and if it is set and the return is for userspace,
+then interrupts are enabled, the stack is swapped to the thread's stack,
+and schedule is called. The problem with this is that we are still in an
+NMI context until an iret is executed. This means that any new NMIs are
+now starved until an interrupt or exception occurs and does the iret.
+
+As NMIs can not be masked and can interrupt any location, they are
+treated as a special case. NEED_RESCHED should not be set in an NMI
+handler. The interruption by the NMI should not disturb the work flow
+for scheduling. Any IPI sent to a processor after sending the
+NEED_RESCHED would have to wait for the NMI anyway, and after the IPI
+finishes the schedule would be called as required.
+
+There is no reason to do anything special leaving an NMI. Remove the
+call to paranoid_exit and do a simple return. This not only fixes the
+bug of starved NMIs, but it also cleans up the code.
+
+Link: http://lkml.kernel.org/r/CA+55aFzgM55hXTs4griX5e9=v_O+=ue+7Rj0PTD=M7hFYpyULQ@mail.gmail.com
+
+Acked-by: Andi Kleen <ak@linux.intel.com>
+Cc: Ingo Molnar <mingo@redhat.com>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: "H. Peter Anvin" <hpa@linux.intel.com>
+Cc: Frederic Weisbecker <fweisbec@gmail.com>
+Cc: Thomas Gleixner <tglx@linutronix.de>
+Cc: Paul Turner <pjt@google.com>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
+Acked-by: Michal Hocko <mhocko@suse.cz>
+
+---
+ arch/x86/kernel/entry_64.S | 32 --------------------------------
+ 1 file changed, 32 deletions(-)
+
+--- a/arch/x86/kernel/entry_64.S
++++ b/arch/x86/kernel/entry_64.S
+@@ -1494,46 +1494,14 @@ ENTRY(nmi)
+ movq %rsp,%rdi
+ movq $-1,%rsi
+ call do_nmi
+-#ifdef CONFIG_TRACE_IRQFLAGS
+- /* paranoidexit; without TRACE_IRQS_OFF */
+- /* ebx: no swapgs flag */
+- DISABLE_INTERRUPTS(CLBR_NONE)
+ testl %ebx,%ebx /* swapgs needed? */
+ jnz nmi_restore
+- testl $3,CS(%rsp)
+- jnz nmi_userspace
+ nmi_swapgs:
+ SWAPGS_UNSAFE_STACK
+ nmi_restore:
+ RESTORE_ALL 8
+ jmp irq_return
+-nmi_userspace:
+- GET_THREAD_INFO(%rcx)
+- movl TI_flags(%rcx),%ebx
+- andl $_TIF_WORK_MASK,%ebx
+- jz nmi_swapgs
+- movq %rsp,%rdi /* &pt_regs */
+- call sync_regs
+- movq %rax,%rsp /* switch stack for scheduling */
+- testl $_TIF_NEED_RESCHED,%ebx
+- jnz nmi_schedule
+- movl %ebx,%edx /* arg3: thread flags */
+- ENABLE_INTERRUPTS(CLBR_NONE)
+- xorl %esi,%esi /* arg2: oldset */
+- movq %rsp,%rdi /* arg1: &pt_regs */
+- call do_notify_resume
+- DISABLE_INTERRUPTS(CLBR_NONE)
+- jmp nmi_userspace
+-nmi_schedule:
+- ENABLE_INTERRUPTS(CLBR_ANY)
+- call schedule
+- DISABLE_INTERRUPTS(CLBR_ANY)
+- jmp nmi_userspace
+ CFI_ENDPROC
+-#else
+- jmp paranoid_exit
+- CFI_ENDPROC
+-#endif
+ END(nmi)
+
+ ENTRY(ignore_sysret)
diff --git a/series.conf b/series.conf
index 6f541dde42..ed75435d67 100644
--- a/series.conf
+++ b/series.conf
@@ -388,6 +388,16 @@
patches.fixes/sched-harden-rq-rt-usage-accounting.patch
+ # NMI safe printk
+ patches.fixes/printk-extract-ringbuffer-handling.patch
+ patches.fixes/printk-add-nmi-ring-buffer.patch
+ patches.fixes/printk-safe-nmi-handling.patch
+ patches.fixes/printk-make-nmi-ringbuffer-length-independent.patch
+ patches.fixes/printk-do-not-call-unlock_console-from-nmi.patch
+ patches.fixes/printk-do-not-use-printk_cpu-from-finish_printk.patch
+ patches.fixes/x86-Do-not-schedule-while-still-in-NMI-context.patch
+ patches.fixes/x86-Add-workaround-to-NMI-iret-woes.patch
+
########################################################
# ia64
########################################################