author     Jiri Kosina <jkosina@suse.cz>   2017-12-29 11:25:37 +0100
committer  Jiri Kosina <jkosina@suse.cz>   2017-12-29 11:25:37 +0100
commit     5f5299b1066bc526314a409fa40cd5bbc86c43dd (patch)
tree       306503e6abe1d4d4652614f6680ba077ab7caaf1
parent     cc90f8a0f474a3f6f0649f3274a0486c79cee1dd (diff)
parent     1b24463daaed9f43d66e7eb8e9a4efbd7e320097 (diff)
Merge branch 'users/jkosina/SLE11-SP4/pti' into SLE11-SP4_EMBARGO  (rpm-3.0.101-108.21)
Conflicts:
	blacklist.conf
	series.conf
-rw-r--r--  blacklist.conf | 5
-rw-r--r--  config/x86_64/debug | 1
-rw-r--r--  config/x86_64/default | 1
-rw-r--r--  config/x86_64/trace | 1
-rw-r--r--  patches.arch/s390-sles11sp4-99-01-cpu-alternatives.patch | 395
-rw-r--r--  patches.arch/s390-sles11sp4-99-02-gmb.patch | 35
-rw-r--r--  patches.arch/s390-sles11sp4-99-03-nobp.patch | 397
-rw-r--r--  patches.arch/s390-virtio-ccw-0001-KVM-s390-Perform-early-event-mask-processing-durin.patch | 67
-rw-r--r--  patches.kabi/kaiser-preserve-kabi.patch | 146
-rw-r--r--  patches.kabi/xen3-kaiser-preserve-kabi.patch | 79
-rw-r--r--  patches.suse/0001-locking-barriers-introduce-new-memory-barrier-gmb.patch | 48
-rw-r--r--  patches.suse/0001-x86-64-Give-vvars-their-own-page.patch | 186
-rw-r--r--  patches.suse/0001-x86-64-Map-the-HPET-NX.patch | 66
-rw-r--r--  patches.suse/0001-x86-boot-Carve-out-early-cmdline-parsing-function.patch | 131
-rw-r--r--  patches.suse/0002-bpf-prevent-speculative-execution-in-eBPF-interprete.patch | 44
-rw-r--r--  patches.suse/0003-uvcvideo-prevent-speculative-execution.patch | 34
-rw-r--r--  patches.suse/0004-carl9170-prevent-speculative-execution.patch | 34
-rw-r--r--  patches.suse/0004-sk_run_filter-add-bpf_s_anc_seccomp_ld_w.patch | 15
-rw-r--r--  patches.suse/0005-p54-prevent-speculative-execution.patch | 34
-rw-r--r--  patches.suse/0006-fs-prevent-speculative-execution.patch | 40
-rw-r--r--  patches.suse/0007-udf-prevent-speculative-execution.patch | 53
-rw-r--r--  patches.suse/01-x86-feature-enable-the-x86-feature-to-control-speculation.patch | 63
-rw-r--r--  patches.suse/02-x86-enter-add-macros-to-set-clear-ibrs-and-set-ibpb.patch | 87
-rw-r--r--  patches.suse/03-x86-entry-use-ibrs-on-entry-to-kernel-space.patch | 206
-rw-r--r--  patches.suse/04-x86-msr-move-native_-msr-u64-to-msr-h.patch | 41
-rw-r--r--  patches.suse/05-x86-spec-add-ibrs-control-functions.patch | 64
-rw-r--r--  patches.suse/06-x86-idle-toggle-ibrs-when-going-idle.patch | 57
-rw-r--r--  patches.suse/07-x86-idle-disable-ibrs-when-offlining-a-cpu-and-re-enable-on-wakeup.patch | 38
-rw-r--r--  patches.suse/08-x86-spec_ctrl-add-an-indirect-branch-predictor-barrier.patch | 33
-rw-r--r--  patches.suse/09-x86-mm-set-ibpb-upon-context-switch.patch | 34
-rw-r--r--  patches.suse/10-ptrace-add-a-new-thread-access-check.patch | 83
-rw-r--r--  patches.suse/11-x86-mm-only-set-ibpb-when-the-new-thread-cannot-ptrace-current-thread.patch | 38
-rw-r--r--  patches.suse/12-x86-entry-add-a-function-to-overwrite-the-rsb.patch | 108
-rw-r--r--  patches.suse/13-x86-entry-stuff-rsb-for-entry-to-kernel-for-non-smep-platform.patch | 88
-rw-r--r--  patches.suse/14-x86-kvm-add-msr_ia32_spec_ctrl-and-msr_ia32_pred_cmd-to-kvm.patch | 118
-rw-r--r--  patches.suse/15-x86-kvm-flush-ibp-when-switching-vms.patch | 25
-rw-r--r--  patches.suse/16-x86-kvm-toggle-ibrs-on-vm-entry-and-exit.patch | 28
-rw-r--r--  patches.suse/17-x86-kvm-pad-rsb-on-vm-transition.patch | 102
-rw-r--r--  patches.suse/18-x86-spec_ctrl-check-whether-ibrs-is-enabled-before-using-it.patch | 142
-rw-r--r--  patches.suse/19-x86-spec_ctrl-check-whether-ibpb-is-enabled-before-using-it.patch | 54
-rw-r--r--  patches.suse/20-x86-cpu-check-speculation-control-cpuid-bit.patch | 115
-rw-r--r--  patches.suse/21-x86-spec-add-nospec-chicken-bit.patch | 47
-rw-r--r--  patches.suse/22-x86-cpu-amd-add-speculative-control-support-for-amd.patch | 117
-rw-r--r--  patches.suse/23-x86-spec-check-cpuid-direclty-post-microcode-reload-to-support-ibpb-feature.patch | 53
-rw-r--r--  patches.suse/24-kvm-svm-do-not-intercept-new-speculative-control-msrs.patch | 27
-rw-r--r--  patches.suse/25-x86-svm-set-ibrs-value-on-vm-entry-and-exit.patch | 78
-rw-r--r--  patches.suse/26-x86-svm-set-ibpb-when-running-a-different-vcpu.patch | 58
-rw-r--r--  patches.suse/27-kvm-x86-add-speculative-control-cpuid-support-for-guests.patch | 59
-rw-r--r--  patches.suse/28-x86-svm-clobber-the-rsb-on-vm-exit.patch | 26
-rw-r--r--  patches.suse/29-x86-svm-add-code-to-clear-registers-on-vm-exit.patch | 39
-rw-r--r--  patches.suse/30-x86-cpu-amd-make-the-lfence-instruction-serialized.patch | 53
-rw-r--r--  patches.suse/31-x86-cpu-amd-remove-now-unused-definition-of-mfence_rdtsc-feature.patch | 60
-rw-r--r--  patches.suse/kaiser-0002-x86-mm-Add-INVPCID-helpers.patch | 93
-rw-r--r--  patches.suse/kaiser-0003-x86-mm-Fix-INVPCID-asm-constraint.patch | 67
-rw-r--r--  patches.suse/kaiser-0004-x86-mm-Add-a-noinvpcid-boot-option-to-turn-off-INVPC.patch | 75
-rw-r--r--  patches.suse/kaiser-0005-x86-mm-If-INVPCID-is-available-use-it-to-flush-globa.patch | 56
-rw-r--r--  patches.suse/kaiser-0006-mm-mmu_context-sched-core-Fix-mmu_context.h-assumpti.patch | 40
-rw-r--r--  patches.suse/kaiser-0007-sched-core-Add-switch_mm_irqs_off-and-use-it-in-the-.patch | 76
-rw-r--r--  patches.suse/kaiser-0008-x86-mm-Build-arch-x86-mm-tlb.c-even-on-SMP.patch | 64
-rw-r--r--  patches.suse/kaiser-0009-x86-mm-sched-core-Uninline-switch_mm.patch | 193
-rw-r--r--  patches.suse/kaiser-0010-x86-mm-sched-core-Turn-off-IRQs-in-switch_mm.patch | 66
-rw-r--r--  patches.suse/kaiser-0011-sched-core-Idle_task_exit-shouldn-t-use-switch_mm_ir.patch | 44
-rw-r--r--  patches.suse/kaiser-0012-x86-mm-Remove-the-UP-asm-tlbflush.h-code-always-use-.patch | 240
-rw-r--r--  patches.suse/kaiser-0013-x86-mm-Disable-PCID-on-32-bit-kernels.patch | 65
-rw-r--r--  patches.suse/kaiser-0014-x86-mm-Add-the-nopcid-boot-option-to-turn-off-PCID.patch | 74
-rw-r--r--  patches.suse/kaiser-0015-x86-mm-Enable-CR4.PCIDE-on-supported-systems.patch | 128
-rw-r--r--  patches.suse/kaiser-0016-x86-mm-64-Fix-reboot-interaction-with-CR4.PCIDE.patch | 44
-rw-r--r--  patches.suse/kaiser-0017-x86-mm-fix-bad-backport-to-disable-PCID-on-Xen.patch | 37
-rw-r--r--  patches.suse/kaiser-0018-KAISER-Kernel-Address-Isolation.patch | 1906
-rw-r--r--  patches.suse/kaiser-0019-x86-mm-kaiser-re-enable-vsyscalls.patch | 130
-rw-r--r--  patches.suse/kaiser-0020-fix-ldt-freeing.patch | 42
-rw-r--r--  patches.suse/kaiser-0021-disable-vmstat.patch | 75
-rw-r--r--  patches.suse/kaiser-nokaiser-0005-kaiser-add-nokaiser-boot-option-using-ALTERNATIVE.patch | 655
-rw-r--r--  patches.suse/kaiser-nokaiser-0006-kaiser-use-ALTERNATIVE-instead-of-x86_cr3_pcid_noflu.patch | 133
-rw-r--r--  patches.suse/kaiser-nokaiser-0007-rename-and-simplify-feature-setting.patch | 95
-rw-r--r--  patches.suse/kaiser-nokaiser-0008-x86-boot-add-early-cmdline-parsing-for-options-with-arguments.patch | 171
-rw-r--r--  patches.suse/kaiser-nokaiser-0009-kaiser-add_pti_cmdline_option_and_documentation.patch | 119
-rw-r--r--  patches.suse/kaiser-nokaiser-0010-move-pti-feature-check-up.patch | 77
-rw-r--r--  patches.suse/powerpc-add-gmb.patch | 18
-rw-r--r--  patches.suse/powerpc-rfi-flush.patch | 1055
-rw-r--r--  patches.xen/xen-x86-pmd-handling | 56
-rw-r--r--  patches.xen/xen-x86_64-pgd-alloc-order | 86
-rw-r--r--  patches.xen/xen3-0001-x86-64-Give-vvars-their-own-page.patch | 85
-rw-r--r--  patches.xen/xen3-0001-x86-64-Map-the-HPET-NX.patch | 50
-rw-r--r--  patches.xen/xen3-kaiser-0009-x86-mm-sched-core-Uninline-switch_mm.patch | 259
-rw-r--r--  patches.xen/xen3-kaiser-0010-x86-mm-sched-core-Turn-off-IRQs-in-switch_mm.patch | 62
-rw-r--r--  patches.xen/xen3-kaiser-0012-x86-mm-Remove-the-UP-asm-tlbflush.h-code-always-use-.patch | 123
-rw-r--r--  patches.xen/xen3-kaiser-0015-x86-mm-Enable-CR4.PCIDE-on-supported-systems.patch | 80
-rw-r--r--  patches.xen/xen3-kaiser-0018-KAISER-Kernel-Address-Isolation.patch | 509
-rw-r--r--  patches.xen/xen3-kaiser-0019-x86-mm-kaiser-re-enable-vsyscalls.patch | 37
-rw-r--r--  patches.xen/xen3-kaiser-build-fix.patch | 32
-rw-r--r--  patches.xen/xen3-kaiser-nokaiser-0005-kaiser-add-nokaiser-boot-option-using-ALTERNATIVE.patch | 244
-rw-r--r--  patches.xen/xen3-kaiser-nokaiser-0006-kaiser-use-ALTERNATIVE-instead-of-x86_cr3_pcid_noflu.patch | 37
-rw-r--r--  patches.xen/xen3-kaiser-nokaiser-0007-rename-and-simplify-feature-setting.patch | 22
-rw-r--r--  patches.xen/xen3-patch-2.6.25 | 41
-rw-r--r--  patches.xen/xen3-patch-2.6.29 | 4744
-rw-r--r--  patches.xen/xen3-patch-2.6.30 | 4288
-rw-r--r--  patches.xen/xen3-patch-2.6.31 | 1241
-rw-r--r--  patches.xen/xen3-patch-2.6.34 | 26
-rw-r--r--  patches.xen/xen3-patch-2.6.36 | 10
-rw-r--r--  patches.xen/xen3-patch-2.6.39 | 4
-rw-r--r--  series.conf | 97
102 files changed, 16800 insertions, 5124 deletions
diff --git a/blacklist.conf b/blacklist.conf
index b5ac73f083..9f152f56d0 100644
--- a/blacklist.conf
+++ b/blacklist.conf
@@ -197,3 +197,8 @@ a9f8553e935f26cb5447f67e280946b0923cd2dc # depends on features we don't have
c98769475575c8a585f5b3952f4b5f90266f699b # udlfb not supported
5c1ac56b51b9d222ab202dec1ac2f4215346129d # DMI, not needed because 9f9c9cbb6057 is reverted
ff4319dc7cd58c92b389960e375038335d157a60 # DMI, not needed because 9f9c9cbb6057 is reverted
+ef0491ea17f8019821c7e9c8e801184ecf17f85a # no ARM at 11-SP4
+5dd0b16cdaff9b94da06074d5888b03235c0bf17 # no UP build
+f34902c5c6c08024371202a680ce69f2d488776d # we use PCID only for distinguishing ring0 vs ring3
+c7ad5ad297e644601747d6dbee978bf85e14f7bc # we use PCID only for distinguishing ring0 vs ring3
+b8b7abaed7a49b350f8ba659ddc264b04931d581 # This is just for robustness. I haven't seen any actual bugs here
diff --git a/config/x86_64/debug b/config/x86_64/debug
index 6511777518..aa9b3aebba 100644
--- a/config/x86_64/debug
+++ b/config/x86_64/debug
@@ -5764,6 +5764,7 @@ CONFIG_ENCRYPTED_KEYS=m
CONFIG_KEYS_DEBUG_PROC_KEYS=y
# CONFIG_SECURITY_DMESG_RESTRICT is not set
CONFIG_SECURITY=y
+CONFIG_KAISER=y
CONFIG_SECURITYFS=y
CONFIG_SECURITY_NETWORK=y
CONFIG_SECURITY_NETWORK_XFRM=y
diff --git a/config/x86_64/default b/config/x86_64/default
index 8d2c8b4184..89542784b0 100644
--- a/config/x86_64/default
+++ b/config/x86_64/default
@@ -5736,6 +5736,7 @@ CONFIG_ENCRYPTED_KEYS=m
# CONFIG_KEYS_DEBUG_PROC_KEYS is not set
# CONFIG_SECURITY_DMESG_RESTRICT is not set
CONFIG_SECURITY=y
+CONFIG_KAISER=y
CONFIG_SECURITYFS=y
CONFIG_SECURITY_NETWORK=y
CONFIG_SECURITY_NETWORK_XFRM=y
diff --git a/config/x86_64/trace b/config/x86_64/trace
index 8cf8705c0b..6208555b06 100644
--- a/config/x86_64/trace
+++ b/config/x86_64/trace
@@ -5746,6 +5746,7 @@ CONFIG_ENCRYPTED_KEYS=m
# CONFIG_KEYS_DEBUG_PROC_KEYS is not set
# CONFIG_SECURITY_DMESG_RESTRICT is not set
CONFIG_SECURITY=y
+CONFIG_KAISER=y
CONFIG_SECURITYFS=y
CONFIG_SECURITY_NETWORK=y
CONFIG_SECURITY_NETWORK_XFRM=y
diff --git a/patches.arch/s390-sles11sp4-99-01-cpu-alternatives.patch b/patches.arch/s390-sles11sp4-99-01-cpu-alternatives.patch
new file mode 100644
index 0000000000..0247dde589
--- /dev/null
+++ b/patches.arch/s390-sles11sp4-99-01-cpu-alternatives.patch
@@ -0,0 +1,395 @@
+From: Vasily Gorbik <gor@linux.vnet.ibm.com>
+Subject: s390: introduce CPU alternatives
+Git-commit: 686140a1a9c41d85a4212a1c26d671139b76404b
+Patch-mainline: v4.15-rc1
+References: bsc#1068032
+
+Implement CPU alternatives, which allow newer instructions to be
+patched in at runtime, based on the availability of CPU facilities.
+
+A new kernel boot parameter "noaltinstr" disables patching.
+
+The current implementation is derived from the x86 alternatives, though
+the ideal instruction padding (when altinstr is longer than oldinstr)
+is added at compile time, so no oldinstr nop optimization has to be
+done at runtime. A couple of compile-time sanity checks are also done:
+1. oldinstr and altinstr must be <= 254 bytes long,
+2. oldinstr and altinstr must not have an odd length.
+
+alternative(oldinstr, altinstr, facility);
+alternative_2(oldinstr, altinstr1, facility1, altinstr2, facility2);
+
+Both compile-time and runtime padding consist of either a 6/4/2 byte nop
+or a jump (brcl) plus a 2-byte nop filler if the padding is longer than 6 bytes.
+
+.altinstructions and .altinstr_replacement sections are part of
+__init_begin : __init_end region and are freed after initialization.
+
+Signed-off-by: Vasily Gorbik <gor@linux.vnet.ibm.com>
+Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
+Signed-off-by: Jiri Kosina <jkosina@suse.cz>
+---
+ arch/s390/include/asm/alternative.h | 149 ++++++++++++++++++++++++++++++++++++
+ arch/s390/kernel/Makefile | 2
+ arch/s390/kernel/alternative.c | 110 ++++++++++++++++++++++++++
+ arch/s390/kernel/module.c | 13 +++
+ arch/s390/kernel/setup.c | 3
+ arch/s390/kernel/vmlinux.lds.S | 23 +++++
+ 6 files changed, 299 insertions(+), 1 deletion(-)
+
+--- /dev/null
++++ b/arch/s390/include/asm/alternative.h
+@@ -0,0 +1,149 @@
++#ifndef _ASM_S390_ALTERNATIVE_H
++#define _ASM_S390_ALTERNATIVE_H
++
++#ifndef __ASSEMBLY__
++
++#include <linux/types.h>
++#include <linux/stddef.h>
++#include <linux/stringify.h>
++
++struct alt_instr {
++ s32 instr_offset; /* original instruction */
++ s32 repl_offset; /* offset to replacement instruction */
++ u16 facility; /* facility bit set for replacement */
++ u8 instrlen; /* length of original instruction */
++ u8 replacementlen; /* length of new instruction */
++} __packed;
++
++extern void apply_alternative_instructions(void);
++extern void apply_alternatives(struct alt_instr *start, struct alt_instr *end);
++
++/*
++ * |661: |662: |6620 |663:
++ * +-----------+---------------------+
++ * | oldinstr | oldinstr_padding |
++ * | +----------+----------+
++ * | | | |
++ * | | >6 bytes |6/4/2 nops|
++ * | |6 bytes jg----------->
++ * +-----------+---------------------+
++ * ^^ static padding ^^
++ *
++ * .altinstr_replacement section
++ * +---------------------+-----------+
++ * |6641: |6651:
++ * | alternative instr 1 |
++ * +-----------+---------+- - - - - -+
++ * |6642: |6652: |
++ * | alternative instr 2 | padding
++ * +---------------------+- - - - - -+
++ * ^ runtime ^
++ *
++ * .altinstructions section
++ * +---------------------------------+
++ * | alt_instr entries for each |
++ * | alternative instr |
++ * +---------------------------------+
++ */
++
++#define b_altinstr(num) "664"#num
++#define e_altinstr(num) "665"#num
++
++#define e_oldinstr_pad_end "663"
++#define oldinstr_len "662b-661b"
++#define oldinstr_total_len e_oldinstr_pad_end"b-661b"
++#define altinstr_len(num) e_altinstr(num)"b-"b_altinstr(num)"b"
++#define oldinstr_pad_len(num) \
++ "-(((" altinstr_len(num) ")-(" oldinstr_len ")) > 0) * " \
++ "((" altinstr_len(num) ")-(" oldinstr_len "))"
++
++#define INSTR_LEN_SANITY_CHECK(len) \
++ ".if " len " > 254\n" \
++ "\t.error \"cpu alternatives does not support instructions " \
++ "blocks > 254 bytes\"\n" \
++ ".endif\n" \
++ ".if (" len ") %% 2\n" \
++ "\t.error \"cpu alternatives instructions length is odd\"\n" \
++ ".endif\n"
++
++#define OLDINSTR_PADDING(oldinstr, num) \
++ ".if " oldinstr_pad_len(num) " > 6\n" \
++ "\tjg " e_oldinstr_pad_end "f\n" \
++ "6620:\n" \
++ "\t.fill (" oldinstr_pad_len(num) " - (6620b-662b)) / 2, 2, 0x0700\n" \
++ ".else\n" \
++ "\t.fill " oldinstr_pad_len(num) " / 6, 6, 0xc0040000\n" \
++ "\t.fill " oldinstr_pad_len(num) " %% 6 / 4, 4, 0x47000000\n" \
++ "\t.fill " oldinstr_pad_len(num) " %% 6 %% 4 / 2, 2, 0x0700\n" \
++ ".endif\n"
++
++#define OLDINSTR(oldinstr, num) \
++ "661:\n\t" oldinstr "\n662:\n" \
++ OLDINSTR_PADDING(oldinstr, num) \
++ e_oldinstr_pad_end ":\n" \
++ INSTR_LEN_SANITY_CHECK(oldinstr_len)
++
++#define OLDINSTR_2(oldinstr, num1, num2) \
++ "661:\n\t" oldinstr "\n662:\n" \
++ ".if " altinstr_len(num1) " < " altinstr_len(num2) "\n" \
++ OLDINSTR_PADDING(oldinstr, num2) \
++ ".else\n" \
++ OLDINSTR_PADDING(oldinstr, num1) \
++ ".endif\n" \
++ e_oldinstr_pad_end ":\n" \
++ INSTR_LEN_SANITY_CHECK(oldinstr_len)
++
++#define ALTINSTR_ENTRY(facility, num) \
++ "\t.long 661b - .\n" /* old instruction */ \
++ "\t.long " b_altinstr(num)"b - .\n" /* alt instruction */ \
++ "\t.word " __stringify(facility) "\n" /* facility bit */ \
++ "\t.byte " oldinstr_total_len "\n" /* source len */ \
++ "\t.byte " altinstr_len(num) "\n" /* alt instruction len */
++
++#define ALTINSTR_REPLACEMENT(altinstr, num) /* replacement */ \
++ b_altinstr(num)":\n\t" altinstr "\n" e_altinstr(num) ":\n" \
++ INSTR_LEN_SANITY_CHECK(altinstr_len(num))
++
++/* alternative assembly primitive: */
++#define ALTERNATIVE(oldinstr, altinstr, facility) \
++ ".pushsection .altinstr_replacement, \"ax\"\n" \
++ ALTINSTR_REPLACEMENT(altinstr, 1) \
++ ".popsection\n" \
++ OLDINSTR(oldinstr, 1) \
++ ".pushsection .altinstructions,\"a\"\n" \
++ ALTINSTR_ENTRY(facility, 1) \
++ ".popsection\n"
++
++#define ALTERNATIVE_2(oldinstr, altinstr1, facility1, altinstr2, facility2)\
++ ".pushsection .altinstr_replacement, \"ax\"\n" \
++ ALTINSTR_REPLACEMENT(altinstr1, 1) \
++ ALTINSTR_REPLACEMENT(altinstr2, 2) \
++ ".popsection\n" \
++ OLDINSTR_2(oldinstr, 1, 2) \
++ ".pushsection .altinstructions,\"a\"\n" \
++ ALTINSTR_ENTRY(facility1, 1) \
++ ALTINSTR_ENTRY(facility2, 2) \
++ ".popsection\n"
++
++/*
++ * Alternative instructions for different CPU types or capabilities.
++ *
++ * This allows to use optimized instructions even on generic binary
++ * kernels.
++ *
++ * oldinstr is padded with jump and nops at compile time if altinstr is
++ * longer. altinstr is padded with jump and nops at run-time during patching.
++ *
++ * For non barrier like inlines please define new variants
++ * without volatile and memory clobber.
++ */
++#define alternative(oldinstr, altinstr, facility) \
++ asm volatile(ALTERNATIVE(oldinstr, altinstr, facility) : : : "memory")
++
++#define alternative_2(oldinstr, altinstr1, facility1, altinstr2, facility2) \
++ asm volatile(ALTERNATIVE_2(oldinstr, altinstr1, facility1, \
++ altinstr2, facility2) ::: "memory")
++
++#endif /* __ASSEMBLY__ */
++
++#endif /* _ASM_S390_ALTERNATIVE_H */
+--- a/arch/s390/kernel/Makefile
++++ b/arch/s390/kernel/Makefile
+@@ -23,7 +23,7 @@ CFLAGS_sysinfo.o += -Iinclude/math-emu -
+ obj-y := bitmap.o traps.o time.o process.o base.o early.o setup.o vtime.o \
+ processor.o sys_s390.o ptrace.o signal.o cpcmd.o ebcdic.o nmi.o \
+ debug.o irq.o ipl.o dis.o diag.o mem_detect.o sclp.o vdso.o \
+- sysinfo.o jump_label.o lgr.o os_info.o
++ sysinfo.o jump_label.o lgr.o os_info.o alternative.o
+
+ obj-y += $(if $(CONFIG_64BIT),entry64.o,entry.o)
+ obj-y += $(if $(CONFIG_64BIT),reipl64.o,reipl.o)
+--- /dev/null
++++ b/arch/s390/kernel/alternative.c
+@@ -0,0 +1,110 @@
++#include <linux/module.h>
++#include <linux/uaccess.h>
++#include <asm/alternative.h>
++
++#define MAX_PATCH_LEN (255 - 1)
++
++static int __initdata_or_module alt_instr_disabled;
++
++static int __init disable_alternative_instructions(char *str)
++{
++ alt_instr_disabled = 1;
++ return 0;
++}
++
++early_param("noaltinstr", disable_alternative_instructions);
++
++struct brcl_insn {
++ u16 opc;
++ s32 disp;
++} __packed;
++
++static u16 __initdata_or_module nop16 = 0x0700;
++static u32 __initdata_or_module nop32 = 0x47000000;
++static struct brcl_insn __initdata_or_module nop48 = {
++ 0xc004, 0
++};
++
++static const void * __initdata_or_module nops[] = {
++ &nop16,
++ &nop32,
++ &nop48
++};
++
++static void __init_or_module add_jump_padding(void *insns, unsigned int len)
++{
++ struct brcl_insn brcl = {
++ 0xc0f4,
++ len / 2
++ };
++
++ memcpy(insns, &brcl, sizeof(brcl));
++ insns += sizeof(brcl);
++ len -= sizeof(brcl);
++
++ while (len > 0) {
++ memcpy(insns, &nop16, 2);
++ insns += 2;
++ len -= 2;
++ }
++}
++
++static void __init_or_module add_padding(void *insns, unsigned int len)
++{
++ if (len > 6)
++ add_jump_padding(insns, len);
++ else if (len >= 2)
++ memcpy(insns, nops[len / 2 - 1], len);
++}
++
++static void __init_or_module __apply_alternatives(struct alt_instr *start,
++ struct alt_instr *end)
++{
++ struct alt_instr *a;
++ u8 *instr, *replacement;
++ u8 insnbuf[MAX_PATCH_LEN];
++
++ /*
++ * The scan order should be from start to end. A later scanned
++ * alternative code can overwrite previously scanned alternative code.
++ */
++ for (a = start; a < end; a++) {
++ int insnbuf_sz = 0;
++
++ instr = (u8 *)&a->instr_offset + a->instr_offset;
++ replacement = (u8 *)&a->repl_offset + a->repl_offset;
++
++ if (!test_facility(a->facility))
++ continue;
++
++ if (unlikely(a->instrlen % 2 || a->replacementlen % 2)) {
++ WARN_ONCE(1, "cpu alternatives instructions length is "
++ "odd, skipping patching\n");
++ continue;
++ }
++
++ memcpy(insnbuf, replacement, a->replacementlen);
++ insnbuf_sz = a->replacementlen;
++
++ if (a->instrlen > a->replacementlen) {
++ add_padding(insnbuf + a->replacementlen,
++ a->instrlen - a->replacementlen);
++ insnbuf_sz += a->instrlen - a->replacementlen;
++ }
++
++ probe_kernel_write(instr, insnbuf, insnbuf_sz);
++ }
++}
++
++void __init_or_module apply_alternatives(struct alt_instr *start,
++ struct alt_instr *end)
++{
++ if (!alt_instr_disabled)
++ __apply_alternatives(start, end);
++}
++
++extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
++void __init apply_alternative_instructions(void)
++{
++ apply_alternatives(__alt_instructions, __alt_instructions_end);
++}
+--- a/arch/s390/kernel/module.c
++++ b/arch/s390/kernel/module.c
+@@ -32,6 +32,7 @@
+ #include <linux/kernel.h>
+ #include <linux/moduleloader.h>
+ #include <linux/bug.h>
++#include <asm/alternative.h>
+
+ #if 0
+ #define DEBUGP printk
+@@ -407,6 +408,18 @@ int module_finalize(const Elf_Ehdr *hdr,
+ const Elf_Shdr *sechdrs,
+ struct module *me)
+ {
++ const Elf_Shdr *s;
++ char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset;
++
++ for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) {
++ if (!strcmp(".altinstructions", secstrings + s->sh_name)) {
++ /* patch .altinstructions */
++ void *aseg = (void *)s->sh_addr;
++
++ apply_alternatives(aseg, aseg + s->sh_size);
++ }
++ }
++
+ vfree(me->arch.syminfo);
+ me->arch.syminfo = NULL;
+ return 0;
+--- a/arch/s390/kernel/setup.c
++++ b/arch/s390/kernel/setup.c
+@@ -47,6 +47,7 @@
+ #include <linux/memory.h>
+ #include <linux/compat.h>
+
++#include <asm/alternative.h>
+ #include <asm/ipl.h>
+ #include <asm/uaccess.h>
+ #include <asm/system.h>
+@@ -1115,6 +1116,8 @@ setup_arch(char **cmdline_p)
+ conmode_default();
+ set_preferred_console();
+
++ apply_alternative_instructions();
++
+ /* Setup zfcpdump support */
+ setup_zfcpdump(console_devno);
+ }
+--- a/arch/s390/kernel/vmlinux.lds.S
++++ b/arch/s390/kernel/vmlinux.lds.S
+@@ -68,6 +68,29 @@ SECTIONS
+ INIT_TEXT_SECTION(PAGE_SIZE)
+
+ /*
++ * struct alt_inst entries. From the header (alternative.h):
++ * "Alternative instructions for different CPU types or capabilities"
++ * Think locking instructions on spinlocks.
++ * Note, that it is a part of __init region.
++ */
++ . = ALIGN(8);
++ .altinstructions : {
++ __alt_instructions = .;
++ *(.altinstructions)
++ __alt_instructions_end = .;
++ }
++
++ /*
++ * And here are the replacement instructions. The linker sticks
++ * them as binary blobs. The .altinstructions has enough data to
++ * get the address and the length of them to patch the kernel safely.
++ * Note, that it is a part of __init region.
++ */
++ .altinstr_replacement : {
++ *(.altinstr_replacement)
++ }
++
++ /*
+ * .exit.text is discarded at runtime, not link time,
+ * to deal with references from __bug_table
+ */
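For orientation, a minimal usage sketch of the alternative() macro added above. The sketch is not part of the patch; the replacement word 0xb2e8f000 and facility bit 81 are simply the ones used by the gmb() patch that follows in this series:

    #include <asm/alternative.h>

    static inline void speculation_barrier(void)
    {
    	/*
    	 * oldinstr is empty, so older machines execute only the padding
    	 * nop generated at compile time.  When facility bit 81 is
    	 * installed, apply_alternatives() patches the 4-byte word
    	 * 0xb2e8f000 over that padding at boot (or at module load time
    	 * via module_finalize()).
    	 */
    	alternative("", ".long 0xb2e8f000", 81);
    }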
diff --git a/patches.arch/s390-sles11sp4-99-02-gmb.patch b/patches.arch/s390-sles11sp4-99-02-gmb.patch
new file mode 100644
index 0000000000..996eded0fb
--- /dev/null
+++ b/patches.arch/s390-sles11sp4-99-02-gmb.patch
@@ -0,0 +1,35 @@
+From: Martin Schwidefsky <schwidefsky@de.ibm.com>
+Subject: s390/spinlock: add gmb memory barrier
+References: bsc#1068032
+Patch-mainline: Not yet, under development
+
+Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
+Signed-off-by: Jiri Kosina <jkosina@suse.cz>
+---
+ arch/s390/include/asm/system.h | 8 ++++++++
+ 1 file changed, 8 insertions(+)
+
+--- a/arch/s390/include/asm/system.h
++++ b/arch/s390/include/asm/system.h
+@@ -11,6 +11,7 @@
+ #include <linux/kernel.h>
+ #include <linux/errno.h>
+ #include <linux/string.h>
++#include <asm/alternative.h>
+ #include <asm/types.h>
+ #include <asm/ptrace.h>
+ #include <asm/setup.h>
+@@ -176,6 +177,13 @@ extern int copy_from_user_real(void *des
+ * all memory ops have completed wrt other CPU's ( see 7-15 POP DJB ).
+ */
+
++static inline void gmb(void)
++{
++ asm volatile(
++ ALTERNATIVE("", ".long 0xb2e8f000", 81)
++ : : : "memory");
++}
++
+ #define eieio() asm volatile("bcr 15,0" : : : "memory")
+ #define SYNC_OTHER_CORES(x) eieio()
+ #define mb() eieio()
diff --git a/patches.arch/s390-sles11sp4-99-03-nobp.patch b/patches.arch/s390-sles11sp4-99-03-nobp.patch
new file mode 100644
index 0000000000..81ad06f195
--- /dev/null
+++ b/patches.arch/s390-sles11sp4-99-03-nobp.patch
@@ -0,0 +1,397 @@
+From: Martin Schwidefsky <schwidefsky@de.ibm.com>
+Subject: s390: add ppa to system call and program check path
+References: bsc#1068032
+Patch-mainline: Not yet, under development
+
+Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
+Signed-off-by: Jiri Kosina <jkosina@suse.cz>
+---
+ arch/s390/include/asm/processor.h | 1
+ arch/s390/kernel/alternative.c | 13 ++++++
+ arch/s390/kernel/entry.S | 73 +++++++++++++++++++++++++++---------
+ arch/s390/kernel/entry64.S | 76 ++++++++++++++++++++++++++++++--------
+ arch/s390/kernel/ipl.c | 1
+ arch/s390/kernel/smp.c | 2 +
+ arch/s390/kernel/vmlinux.lds.S | 3 +
+ 7 files changed, 136 insertions(+), 33 deletions(-)
+
+--- a/arch/s390/include/asm/processor.h
++++ b/arch/s390/include/asm/processor.h
+@@ -34,6 +34,7 @@ static inline void get_cpu_id(struct cpu
+
+ extern void s390_adjust_jiffies(void);
+ extern int get_cpu_capability(unsigned int *);
++extern void __bpon(void);
+
+ /*
+ * User space process size: 2GB for 31 bit, 4TB or 8PT for 64 bit.
+--- a/arch/s390/kernel/alternative.c
++++ b/arch/s390/kernel/alternative.c
+@@ -14,6 +14,19 @@ static int __init disable_alternative_in
+
+ early_param("noaltinstr", disable_alternative_instructions);
+
++extern struct alt_instr __alt_nobp[], __alt_nobp_end[];
++static int __init nobp_setup(char *str)
++{
++ bool enabled;
++ int rc;
++
++ rc = strtobool(str, &enabled);
++ if (!rc && enabled)
++ apply_alternatives(__alt_nobp, __alt_nobp_end);
++ return rc;
++}
++__setup("nobp=", nobp_setup);
++
+ struct brcl_insn {
+ u16 opc;
+ s32 disp;
+--- a/arch/s390/kernel/entry.S
++++ b/arch/s390/kernel/entry.S
+@@ -172,24 +172,47 @@ STACK_SIZE = 1 << STACK_SHIFT
+ stm %r0,%r11,SP_R0(%r15) # store gprs %r0-%r11 to kernel stack
+ .endm
+
+- .macro RESTORE_ALL psworg,sync
+- mvc \psworg(8),SP_PSW(%r15) # move user PSW to lowcore
+- .if !\sync
+- ni \psworg+1,0xfd # clear wait state bit
+- .endif
+- lm %r0,%r15,SP_R0(%r15) # load gprs 0-15 of user
+- stpt __LC_EXIT_TIMER
+- lpsw \psworg # back to caller
+- .endm
+-
+ .macro REENABLE_IRQS
+ mvc __SF_EMPTY(1,%r15),SP_PSW(%r15)
+ ni __SF_EMPTY(%r15),0xbf
+ ssm __SF_EMPTY(%r15)
+ .endm
+
++ .macro BPOFF
++ .pushsection .altinstr_replacement, "ax"
++660: .long 0xb2e8c000
++ .popsection
++661: .long 0x47000000
++ .pushsection .altnobp, "a"
++ .long 661b - .
++ .long 660b - .
++ .word 82
++ .byte 4
++ .byte 4
++ .popsection
++ .endm
++
++ .macro BPON
++ .pushsection .altinstr_replacement, "ax"
++662: .long 0xb2e8d000
++ .popsection
++663: .long 0x47000000
++ .pushsection .altnobp, "a"
++ .long 663b - .
++ .long 662b - .
++ .word 82
++ .byte 4
++ .byte 4
++ .popsection
++ .endm
++
+ .section .kprobes.text, "ax"
+
++ENTRY(__bpon)
++ .globl __bpon
++ BPON
++ br %r14
++
+ /*
+ * Scheduler resume function, called by switch_to
+ * gpr2 = (task_struct *) prev
+@@ -229,6 +252,7 @@ system_call:
+ stpt __LC_SYNC_ENTER_TIMER
+ sysc_saveall:
+ SAVE_ALL_SVC __LC_SVC_OLD_PSW,__LC_SAVE_AREA
++ BPOFF
+ CREATE_STACK_FRAME __LC_SAVE_AREA
+ mvc SP_PSW(8,%r15),__LC_SVC_OLD_PSW
+ mvc SP_ILC(4,%r15),__LC_SVC_ILC
+@@ -264,7 +288,11 @@ sysc_tif:
+ tm __TI_flags+3(%r12),_TIF_WORK_SVC
+ bnz BASED(sysc_work) # there is work to do (signals etc.)
+ sysc_restore:
+- RESTORE_ALL __LC_RETURN_PSW,1
++ mvc __LC_RETURN_PSW(8),SP_PSW(%r15)
++ lm %r0,%r15,SP_R0(%r15)
++ BPON
++ stpt __LC_EXIT_TIMER
++ lpsw __LC_RETURN_PSW
+ sysc_done:
+
+ #
+@@ -454,6 +482,7 @@ pgm_check_handler:
+ * for LPSW?).
+ */
+ stpt __LC_SYNC_ENTER_TIMER
++ BPOFF
+ SAVE_ALL_BASE __LC_SAVE_AREA
+ tm __LC_PGM_INT_CODE+1,0x80 # check whether we got a per exception
+ bnz BASED(pgm_per) # got per exception -> special case
+@@ -569,6 +598,7 @@ kernel_per:
+ io_int_handler:
+ stck __LC_INT_CLOCK
+ stpt __LC_ASYNC_ENTER_TIMER
++ BPOFF
+ SAVE_ALL_ASYNC __LC_IO_OLD_PSW,__LC_SAVE_AREA+16
+ CREATE_STACK_FRAME __LC_SAVE_AREA+16
+ mvc SP_PSW(8,%r15),0(%r12) # move user PSW to stack
+@@ -590,7 +620,15 @@ io_tif:
+ tm __TI_flags+3(%r12),_TIF_WORK_INT
+ bnz BASED(io_work) # there is work to do (signals etc.)
+ io_restore:
+- RESTORE_ALL __LC_RETURN_PSW,0
++ mvc __LC_RETURN_PSW(8),SP_PSW(%r15)
++ ni __LC_RETURN_PSW+1,0xfd
++ tm SP_PSW+1(%r15),0x01 # interrupting from user ?
++ bno BASED(io_exit_kernel)
++ BPON
++io_exit_kernel:
++ lm %r0,%r15,SP_R0(%r15)
++ stpt __LC_EXIT_TIMER
++ lpsw __LC_RETURN_PSW
+ io_done:
+
+ #
+@@ -707,6 +745,7 @@ io_notify_resume:
+ ext_int_handler:
+ stck __LC_INT_CLOCK
+ stpt __LC_ASYNC_ENTER_TIMER
++ BPOFF
+ SAVE_ALL_ASYNC __LC_EXT_OLD_PSW,__LC_SAVE_AREA+16
+ CREATE_STACK_FRAME __LC_SAVE_AREA+16
+ mvc SP_PSW(8,%r15),0(%r12) # move user PSW to stack
+@@ -734,6 +773,7 @@ __critical_end:
+ .globl mcck_int_handler
+ mcck_int_handler:
+ stck __LC_MCCK_CLOCK
++ BPOFF
+ spt __LC_CPU_TIMER_SAVE_AREA # revalidate cpu timer
+ lm %r0,%r15,__LC_GPREGS_SAVE_AREA # revalidate gprs
+ SAVE_ALL_BASE __LC_SAVE_AREA+32
+@@ -803,15 +843,12 @@ mcck_no_vtime:
+ mcck_return:
+ mvc __LC_RETURN_MCCK_PSW(8),SP_PSW(%r15) # move return PSW
+ ni __LC_RETURN_MCCK_PSW+1,0xfd # clear wait state bit
++ lm %r0,%r15,SP_R0(%r15) # load gprs 0-15
+ tm __LC_RETURN_MCCK_PSW+1,0x01 # returning to user ?
+ bno BASED(0f)
+- lm %r0,%r15,SP_R0(%r15) # load gprs 0-15
++ BPON
+ stpt __LC_EXIT_TIMER
+- lpsw __LC_RETURN_MCCK_PSW # back to caller
+-0: lm %r0,%r15,SP_R0(%r15) # load gprs 0-15
+- lpsw __LC_RETURN_MCCK_PSW # back to caller
+-
+- RESTORE_ALL __LC_RETURN_MCCK_PSW,0
++0: lpsw __LC_RETURN_MCCK_PSW
+
+ /*
+ * Restart interruption handler, kick starter for additional CPUs
+--- a/arch/s390/kernel/entry64.S
++++ b/arch/s390/kernel/entry64.S
+@@ -184,19 +184,6 @@ _TIF_EXIT_SIE = (_TIF_SIGPENDING | _TIF_
+ stmg %r0,%r10,SP_R0(%r15) # store gprs %r0-%r10 to kernel stack
+ .endm
+
+- .macro RESTORE_ALL psworg,sync
+- mvc \psworg(16),SP_PSW(%r15) # move user PSW to lowcore
+- .if !\sync
+- ni \psworg+1,0xfd # clear wait state bit
+- .endif
+- lg %r14,__LC_VDSO_PER_CPU
+- lmg %r0,%r13,SP_R0(%r15) # load gprs 0-13 of user
+- stpt __LC_EXIT_TIMER
+- mvc __VDSO_ECTG_BASE(16,%r14),__LC_EXIT_TIMER
+- lmg %r14,%r15,SP_R14(%r15) # load grps 14-15 of user
+- lpswe \psworg # back to caller
+- .endm
+-
+ .macro LAST_BREAK
+ srag %r10,%r11,23
+ jz 0f
+@@ -210,8 +197,41 @@ _TIF_EXIT_SIE = (_TIF_SIGPENDING | _TIF_
+ ssm __SF_EMPTY(%r15)
+ .endm
+
++ .macro BPOFF
++ .pushsection .altinstr_replacement, "ax"
++660: .long 0xb2e8c000
++ .popsection
++661: .long 0x47000000
++ .pushsection .altnobp, "a"
++ .long 661b - .
++ .long 660b - .
++ .word 82
++ .byte 4
++ .byte 4
++ .popsection
++ .endm
++
++ .macro BPON
++ .pushsection .altinstr_replacement, "ax"
++662: .long 0xb2e8d000
++ .popsection
++663: .long 0x47000000
++ .pushsection .altnobp, "a"
++ .long 663b - .
++ .long 662b - .
++ .word 82
++ .byte 4
++ .byte 4
++ .popsection
++ .endm
++
+ .section .kprobes.text, "ax"
+
++ENTRY(__bpon)
++ .globl __bpon
++ BPON
++ br %r14
++
+ /*
+ * Scheduler resume function, called by switch_to
+ * gpr2 = (task_struct *) prev
+@@ -250,6 +270,7 @@ system_call:
+ stpt __LC_SYNC_ENTER_TIMER
+ sysc_saveall:
+ SAVE_ALL_SVC __LC_SVC_OLD_PSW,__LC_SAVE_AREA
++ BPOFF
+ CREATE_STACK_FRAME __LC_SAVE_AREA
+ mvc SP_PSW(16,%r15),__LC_SVC_OLD_PSW
+ mvc SP_ILC(4,%r15),__LC_SVC_ILC
+@@ -292,7 +313,14 @@ sysc_tif:
+ tm __TI_flags+7(%r12),_TIF_WORK_SVC
+ jnz sysc_work # there is work to do (signals etc.)
+ sysc_restore:
+- RESTORE_ALL __LC_RETURN_PSW,1
++ mvc __LC_RETURN_PSW(16),SP_PSW(%r15)
++ lg %r14,__LC_VDSO_PER_CPU
++ lmg %r0,%r13,SP_R0(%r15)
++ BPON
++ stpt __LC_EXIT_TIMER
++ mvc __VDSO_ECTG_BASE(16,%r14),__LC_EXIT_TIMER
++ lmg %r14,%r15,SP_R14(%r15)
++ lpswe __LC_RETURN_PSW
+ sysc_done:
+
+ #
+@@ -471,6 +499,7 @@ pgm_check_handler:
+ * for LPSW?).
+ */
+ stpt __LC_SYNC_ENTER_TIMER
++ BPOFF
+ tm __LC_PGM_INT_CODE+1,0x80 # check whether we got a per exception
+ jnz pgm_per # got per exception -> special case
+ SAVE_ALL_PGM __LC_PGM_OLD_PSW,__LC_SAVE_AREA
+@@ -598,6 +627,7 @@ kernel_per:
+ io_int_handler:
+ stck __LC_INT_CLOCK
+ stpt __LC_ASYNC_ENTER_TIMER
++ BPOFF
+ SAVE_ALL_ASYNC __LC_IO_OLD_PSW,__LC_SAVE_AREA+40
+ CREATE_STACK_FRAME __LC_SAVE_AREA+40
+ mvc SP_PSW(16,%r15),0(%r12) # move user PSW to stack
+@@ -620,7 +650,18 @@ io_tif:
+ tm __TI_flags+7(%r12),_TIF_WORK_INT
+ jnz io_work # there is work to do (signals etc.)
+ io_restore:
+- RESTORE_ALL __LC_RETURN_PSW,0
++ mvc __LC_RETURN_PSW(16),SP_PSW(%r15)
++ ni __LC_RETURN_PSW+1,0xfd
++ lg %r14,__LC_VDSO_PER_CPU
++ lmg %r0,%r13,SP_R0(%r15)
++ tm SP_PSW+1(%r15),0x01 # returning to user ?
++ jno io_exit_kernel
++ BPON
++ stpt __LC_EXIT_TIMER
++ mvc __VDSO_ECTG_BASE(16,%r14),__LC_EXIT_TIMER
++io_exit_kernel:
++ lmg %r14,%r15,SP_R14(%r15)
++ lpswe __LC_RETURN_PSW
+ io_done:
+
+ #
+@@ -733,6 +774,7 @@ io_notify_resume:
+ ext_int_handler:
+ stck __LC_INT_CLOCK
+ stpt __LC_ASYNC_ENTER_TIMER
++ BPOFF
+ SAVE_ALL_ASYNC __LC_EXT_OLD_PSW,__LC_SAVE_AREA+40
+ CREATE_STACK_FRAME __LC_SAVE_AREA+40
+ mvc SP_PSW(16,%r15),0(%r12) # move user PSW to stack
+@@ -763,6 +805,7 @@ __critical_end:
+ .globl mcck_int_handler
+ mcck_int_handler:
+ stck __LC_MCCK_CLOCK
++ BPOFF
+ la %r1,4095 # revalidate r1
+ spt __LC_CPU_TIMER_SAVE_AREA-4095(%r1) # revalidate cpu timer
+ lmg %r0,%r15,__LC_GPREGS_SAVE_AREA-4095(%r1)# revalidate gprs
+@@ -838,6 +881,7 @@ mcck_return:
+ lmg %r0,%r15,SP_R0(%r15) # load gprs 0-15
+ tm __LC_RETURN_MCCK_PSW+1,0x01 # returning to user ?
+ jno 0f
++ BPON
+ stpt __LC_EXIT_TIMER
+ 0: lpswe __LC_RETURN_MCCK_PSW # back to caller
+ mcck_done:
+@@ -1100,8 +1144,10 @@ sie_loop:
+ jnz sie_exit
+ lg %r14,__SF_EMPTY(%r15) # get control block pointer
+ SPP __SF_EMPTY(%r15) # set guest id
++ BPON
+ sie 0(%r14)
+ sie_done:
++ BPOFF
+ SPP __LC_CMF_HPP # set host id
+ lg %r14,__LC_THREAD_INFO # pointer thread_info struct
+ .globl sie_exit
+--- a/arch/s390/kernel/ipl.c
++++ b/arch/s390/kernel/ipl.c
+@@ -562,6 +562,7 @@ out:
+
+ static void __ipl_run(void *unused)
+ {
++ __bpon();
+ diag308(DIAG308_IPL, NULL);
+ if (MACHINE_IS_VM)
+ __cpcmd("IPL", NULL, 0, NULL);
+--- a/arch/s390/kernel/smp.c
++++ b/arch/s390/kernel/smp.c
+@@ -130,6 +130,7 @@ void smp_switch_to_ipl_cpu(void (*func)(
+ struct pt_regs *regs;
+ unsigned long sp;
+
++ __bpon();
+ if (smp_processor_id() == 0)
+ func(data);
+ __load_psw_mask(PSW_BASE_BITS | PSW_DEFAULT_KEY);
+@@ -784,6 +785,7 @@ void __cpu_die(unsigned int cpu)
+ void __noreturn cpu_die(void)
+ {
+ idle_task_exit();
++ __bpon();
+ while (sigp(smp_processor_id(), sigp_stop) == sigp_busy)
+ cpu_relax();
+ for (;;);
+--- a/arch/s390/kernel/vmlinux.lds.S
++++ b/arch/s390/kernel/vmlinux.lds.S
+@@ -78,6 +78,9 @@ SECTIONS
+ __alt_instructions = .;
+ *(.altinstructions)
+ __alt_instructions_end = .;
++ __alt_nobp = .;
++ *(.altnobp)
++ __alt_nobp_end = . ;
+ }
+
+ /*
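As a reading aid (not part of the patch): the records that the BPOFF/BPON macros emit into the new .altnobp section have exactly the layout of struct alt_instr from the cpu-alternatives patch above, which is why nobp_setup() can pass the __alt_nobp[] array straight to apply_alternatives():

    struct alt_instr {
    	s32 instr_offset;	/* .long 661b/663b - . : the inline 4-byte nop (0x47000000) */
    	s32 repl_offset;	/* .long 660b/662b - . : the replacement word 0xb2e8c000 / 0xb2e8d000 */
    	u16 facility;		/* .word 82            : facility bit checked via test_facility() */
    	u8  instrlen;		/* .byte 4             : length of the original instruction */
    	u8  replacementlen;	/* .byte 4             : length of the replacement */
    } __packed;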
diff --git a/patches.arch/s390-virtio-ccw-0001-KVM-s390-Perform-early-event-mask-processing-durin.patch b/patches.arch/s390-virtio-ccw-0001-KVM-s390-Perform-early-event-mask-processing-durin.patch
index d009518cc6..595aeea34d 100644
--- a/patches.arch/s390-virtio-ccw-0001-KVM-s390-Perform-early-event-mask-processing-durin.patch
+++ b/patches.arch/s390-virtio-ccw-0001-KVM-s390-Perform-early-event-mask-processing-durin.patch
@@ -26,11 +26,9 @@ Signed-off-by: Alexander Graf <agraf@suse.de>
drivers/s390/kvm/kvm_virtio.c | 3 ++-
6 files changed, 61 insertions(+), 14 deletions(-)
-diff --git a/arch/s390/include/asm/sclp.h b/arch/s390/include/asm/sclp.h
-index fed7bee..6e08aff 100644
--- a/arch/s390/include/asm/sclp.h
+++ b/arch/s390/include/asm/sclp.h
-@@ -54,6 +54,8 @@ int sclp_chp_configure(struct chp_id chpid);
+@@ -54,6 +54,8 @@ int sclp_chp_configure(struct chp_id chp
int sclp_chp_deconfigure(struct chp_id chpid);
int sclp_chp_read_info(struct sclp_chp_info *info);
void sclp_get_ipl_info(struct sclp_ipl_info *info);
@@ -39,8 +37,6 @@ index fed7bee..6e08aff 100644
void sclp_hsa_size_detect(void);
unsigned long sclp_get_hsa_size(void);
#endif /* _ASM_S390_SCLP_H */
-diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c
-index f876a24..c82b357 100644
--- a/arch/s390/kernel/setup.c
+++ b/arch/s390/kernel/setup.c
@@ -63,6 +63,7 @@
@@ -51,7 +47,7 @@ index f876a24..c82b357 100644
long psw_kernel_bits = (PSW_BASE_BITS | PSW_MASK_DAT | PSW_ASC_PRIMARY |
PSW_MASK_MCHECK | PSW_DEFAULT_KEY);
-@@ -128,9 +129,14 @@ __setup("condev=", condev_setup);
+@@ -137,9 +138,14 @@ __setup("condev=", condev_setup);
static void __init set_preferred_console(void)
{
@@ -69,11 +65,9 @@ index f876a24..c82b357 100644
add_preferred_console("ttyS", 0, NULL);
else if (CONSOLE_IS_3270)
add_preferred_console("tty3270", 0, NULL);
-diff --git a/drivers/s390/char/sclp.c b/drivers/s390/char/sclp.c
-index eaa7e78..97bf174 100644
--- a/drivers/s390/char/sclp.c
+++ b/drivers/s390/char/sclp.c
-@@ -654,16 +654,6 @@ sclp_remove_processed(struct sccb_header *sccb)
+@@ -685,16 +685,6 @@ sclp_remove_processed(struct sccb_header
EXPORT_SYMBOL(sclp_remove_processed);
@@ -90,32 +84,9 @@ index eaa7e78..97bf174 100644
/* Prepare init mask request. Called while sclp_lock is locked. */
static inline void
__sclp_make_init_req(u32 receive_mask, u32 send_mask)
-diff --git a/drivers/s390/char/sclp.h b/drivers/s390/char/sclp.h
-index 49a1bb5..d7e97ae 100644
---- a/drivers/s390/char/sclp.h
-+++ b/drivers/s390/char/sclp.h
-@@ -88,6 +88,16 @@ struct sccb_header {
- u16 response_code;
- } __attribute__((packed));
-
-+struct init_sccb {
-+ struct sccb_header header;
-+ u16 _reserved;
-+ u16 mask_length;
-+ sccb_mask_t receive_mask;
-+ sccb_mask_t send_mask;
-+ sccb_mask_t sclp_receive_mask;
-+ sccb_mask_t sclp_send_mask;
-+} __attribute__((packed));
-+
- extern u64 sclp_facilities;
- #define SCLP_HAS_CHP_INFO (sclp_facilities & 0x8000000000000000ULL)
- #define SCLP_HAS_CHP_RECONFIG (sclp_facilities & 0x2000000000000000ULL)
-diff --git a/drivers/s390/char/sclp_cmd.c b/drivers/s390/char/sclp_cmd.c
-index be55fb2..dda9b9e 100644
--- a/drivers/s390/char/sclp_cmd.c
+++ b/drivers/s390/char/sclp_cmd.c
-@@ -45,6 +45,7 @@ struct read_info_sccb {
+@@ -46,6 +46,7 @@ struct read_info_sccb {
u8 _reserved5[4096 - 112]; /* 112-4095 */
} __attribute__((packed, aligned(PAGE_SIZE)));
@@ -123,7 +94,7 @@ index be55fb2..dda9b9e 100644
static struct read_info_sccb __initdata early_read_info_sccb;
static int __initdata early_read_info_sccb_valid;
-@@ -100,6 +101,19 @@ static void __init sclp_read_info_early(void)
+@@ -101,6 +102,19 @@ static void __init sclp_read_info_early(
}
}
@@ -143,7 +114,7 @@ index be55fb2..dda9b9e 100644
void __init sclp_facilities_detect(void)
{
struct read_info_sccb *sccb;
-@@ -114,6 +128,30 @@ void __init sclp_facilities_detect(void)
+@@ -115,6 +129,30 @@ void __init sclp_facilities_detect(void)
rnmax = sccb->rnmax ? sccb->rnmax : sccb->rnmax2;
rzm = sccb->rnsize ? sccb->rnsize : sccb->rnsize2;
rzm <<= 20;
@@ -174,8 +145,25 @@ index be55fb2..dda9b9e 100644
}
unsigned long long sclp_get_rnmax(void)
-diff --git a/drivers/s390/kvm/kvm_virtio.c b/drivers/s390/kvm/kvm_virtio.c
-index c967d42..5439ab3 100644
+--- a/drivers/s390/char/sclp.h
++++ b/drivers/s390/char/sclp.h
+@@ -88,6 +88,16 @@ struct sccb_header {
+ u16 response_code;
+ } __attribute__((packed));
+
++struct init_sccb {
++ struct sccb_header header;
++ u16 _reserved;
++ u16 mask_length;
++ sccb_mask_t receive_mask;
++ sccb_mask_t send_mask;
++ sccb_mask_t sclp_receive_mask;
++ sccb_mask_t sclp_send_mask;
++} __attribute__((packed));
++
+ extern u64 sclp_facilities;
+ #define SCLP_HAS_CHP_INFO (sclp_facilities & 0x8000000000000000ULL)
+ #define SCLP_HAS_CHP_RECONFIG (sclp_facilities & 0x2000000000000000ULL)
--- a/drivers/s390/kvm/kvm_virtio.c
+++ b/drivers/s390/kvm/kvm_virtio.c
@@ -24,6 +24,7 @@
@@ -186,7 +174,7 @@ index c967d42..5439ab3 100644
#include <asm/setup.h>
#include <asm/irq.h>
-@@ -472,7 +473,7 @@ static __init int early_put_chars(u32 vtermno, const char *buf, int count)
+@@ -494,7 +495,7 @@ static __init int early_put_chars(u32 vt
static int __init s390_virtio_console_init(void)
{
@@ -195,6 +183,3 @@ index c967d42..5439ab3 100644
return -ENODEV;
return virtio_cons_early_init(early_put_chars);
}
---
-1.6.0.2
-
diff --git a/patches.kabi/kaiser-preserve-kabi.patch b/patches.kabi/kaiser-preserve-kabi.patch
new file mode 100644
index 0000000000..384fffea63
--- /dev/null
+++ b/patches.kabi/kaiser-preserve-kabi.patch
@@ -0,0 +1,146 @@
+From: Jiri Kosina <jkosina@suse.cz>
+Subject: [PATCH] kaiser: work around kABI
+References: bsc#1068032 CVE-2017-5754
+Patch-mainline: Never, SUSE-specific
+
+The most potentially dangerous one is the vmstats one. I can't imagine what
+3rd party module would realistically be directly allocating pglist_data,
+per_cpu_nodestat, memcg_stat_item, lruvec_stat, etc, but the potential
+non-zero risk is there.
+
+Signed-off-by: Jiri Kosina <jkosina@suse.cz>
+
+---
+ arch/x86/include/asm/desc.h | 4 ++++
+ arch/x86/include/asm/fixmap.h | 2 ++
+ arch/x86/include/asm/hw_irq.h | 4 ++++
+ arch/x86/include/asm/pgtable_types.h | 4 ++++
+ arch/x86/include/asm/processor.h | 4 ++++
+ arch/x86/include/asm/tlbflush.h | 2 ++
+ arch/x86/kernel/cpu/common.c | 4 ++++
+ arch/x86/kernel/init_task.c | 4 ++++
+ include/linux/mmu_context.h | 2 ++
+ include/linux/mmzone.h | 4 +++-
+ kernel/sched.c | 4 ++++
+ mm/vmstat.c | 4 +++-
+ 12 files changed, 40 insertions(+), 2 deletions(-)
+
+--- a/arch/x86/include/asm/desc.h
++++ b/arch/x86/include/asm/desc.h
+@@ -40,7 +40,11 @@ struct gdt_page {
+ struct desc_struct gdt[GDT_ENTRIES];
+ } __attribute__((aligned(PAGE_SIZE)));
+
++#ifdef __GENKSYMS__
++DECLARE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page);
++#else
+ DECLARE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(struct gdt_page, gdt_page);
++#endif
+
+ static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu)
+ {
+--- a/arch/x86/include/asm/fixmap.h
++++ b/arch/x86/include/asm/fixmap.h
+@@ -78,7 +78,9 @@ enum fixed_addresses {
+ VSYSCALL_LAST_PAGE,
+ VSYSCALL_FIRST_PAGE = VSYSCALL_LAST_PAGE
+ + ((VSYSCALL_END-VSYSCALL_START) >> PAGE_SHIFT) - 1,
++#ifndef __GENKSYMS__
+ VVAR_PAGE,
++#endif
+ VSYSCALL_HPET,
+ #endif
+ FIX_DBGP_BASE,
+--- a/arch/x86/include/asm/hw_irq.h
++++ b/arch/x86/include/asm/hw_irq.h
+@@ -166,7 +166,11 @@ extern asmlinkage void smp_invalidate_in
+ extern void (*__initconst interrupt[NR_VECTORS-FIRST_EXTERNAL_VECTOR])(void);
+
+ typedef int vector_irq_t[NR_VECTORS];
++#ifndef __GENKSYMS__
+ DECLARE_PER_CPU_USER_MAPPED(vector_irq_t, vector_irq);
++#else
++DECLARE_PER_CPU(vector_irq_t, vector_irq);
++#endif
+ extern void setup_vector_irq(int cpu);
+
+ #ifdef CONFIG_X86_IO_APIC
+--- a/arch/x86/include/asm/processor.h
++++ b/arch/x86/include/asm/processor.h
+@@ -266,7 +266,11 @@ struct tss_struct {
+
+ } ____cacheline_aligned;
+
++#ifndef __GENKSYMS__
+ DECLARE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(struct tss_struct, init_tss);
++#else
++DECLARE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss);
++#endif
+
+ /*
+ * Save the original ist values for checking stack pointers during debugging
+--- a/arch/x86/include/asm/tlbflush.h
++++ b/arch/x86/include/asm/tlbflush.h
+@@ -6,7 +6,9 @@
+
+ #include <asm/processor.h>
+ #include <asm/system.h>
++#ifndef __GENKSYMS__
+ #include <asm/smp.h>
++#endif
+
+ static inline void __invpcid(unsigned long pcid, unsigned long addr,
+ unsigned long type)
+--- a/arch/x86/kernel/cpu/common.c
++++ b/arch/x86/kernel/cpu/common.c
+@@ -84,7 +84,11 @@ static const struct cpu_dev __cpuinitcon
+
+ static const struct cpu_dev *this_cpu __cpuinitdata = &default_cpu;
+
++#ifndef __GENKSYMS__
+ DEFINE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(struct gdt_page, gdt_page) = { .gdt = {
++#else
++DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
++#endif
+ #ifdef CONFIG_X86_64
+ /*
+ * We need valid kernel segments for data and code in long mode too
+--- a/arch/x86/kernel/init_task.c
++++ b/arch/x86/kernel/init_task.c
+@@ -38,5 +38,9 @@ EXPORT_SYMBOL(init_task);
+ * section. Since TSS's are completely CPU-local, we want them
+ * on exact cacheline boundaries, to eliminate cacheline ping-pong.
+ */
++#if defined(CONFIG_GENKSYMS)
++DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss) = INIT_TSS;
++#else
+ DEFINE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(struct tss_struct, init_tss) = INIT_TSS;
++#endif
+
+--- a/include/linux/mmu_context.h
++++ b/include/linux/mmu_context.h
+@@ -1,7 +1,9 @@
+ #ifndef _LINUX_MMU_CONTEXT_H
+ #define _LINUX_MMU_CONTEXT_H
+
++#ifndef __GENKSYMS__
+ #include <asm/mmu_context.h>
++#endif
+
+ struct mm_struct;
+
+--- a/kernel/sched.c
++++ b/kernel/sched.c
+@@ -32,7 +32,11 @@
+ #include <linux/init.h>
+ #include <linux/uaccess.h>
+ #include <linux/highmem.h>
++#ifdef __GENKSYMS__
++#include <asm/mmu_context.h>
++#else
+ #include <linux/mmu_context.h>
++#endif
+ #include <linux/interrupt.h>
+ #include <linux/capability.h>
+ #include <linux/completion.h>
+
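The trick used in every hunk above, spelled out once (illustrative only, not an additional hunk): genksyms defines __GENKSYMS__ while it computes the exported-symbol CRCs, so the checker is shown the pre-KAISER declaration and the kABI checksum stays stable, while the real build picks up the user-mapped variant:

    #ifdef __GENKSYMS__
    /* what the kABI checker sees: the original declaration */
    DECLARE_PER_CPU(vector_irq_t, vector_irq);
    #else
    /* what the compiled kernel sees: the KAISER user-mapped variant */
    DECLARE_PER_CPU_USER_MAPPED(vector_irq_t, vector_irq);
    #endif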
diff --git a/patches.kabi/xen3-kaiser-preserve-kabi.patch b/patches.kabi/xen3-kaiser-preserve-kabi.patch
new file mode 100644
index 0000000000..14c59c8490
--- /dev/null
+++ b/patches.kabi/xen3-kaiser-preserve-kabi.patch
@@ -0,0 +1,79 @@
+From: Jiri Kosina <jkosina@suse.cz>
+Subject: [PATCH] xen/kaiser: work around kABI
+Patch-mainline: Never, SUSE-Xen specific
+
+The most potentially dangerous one is the vmstats one. I can't imagine what
+3rd party module would realistically be directly allocating pglist_data,
+per_cpu_nodestat, memcg_stat_item, lruvec_stat, etc, but the potential
+non-zero risk is there.
+
+Signed-off-by: Jiri Kosina <jkosina@suse.cz>
+
+Automatically created from "patches.kabi/kaiser-preserve-kabi.patch" by xen-port-patches.py
+
+--- a/arch/x86/include/mach-xen/asm/desc.h
++++ b/arch/x86/include/mach-xen/asm/desc.h
+@@ -42,7 +42,11 @@ struct gdt_page {
+ struct desc_struct gdt[GDT_ENTRIES];
+ } __attribute__((aligned(PAGE_SIZE)));
+
++#ifdef __GENKSYMS__
++DECLARE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page);
++#else
+ DECLARE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(struct gdt_page, gdt_page);
++#endif
+
+ static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu)
+ {
+--- a/arch/x86/include/mach-xen/asm/fixmap.h
++++ b/arch/x86/include/mach-xen/asm/fixmap.h
+@@ -78,7 +78,9 @@ enum fixed_addresses {
+ VSYSCALL_LAST_PAGE,
+ VSYSCALL_FIRST_PAGE = VSYSCALL_LAST_PAGE
+ + ((VSYSCALL_END-VSYSCALL_START) >> PAGE_SHIFT) - 1,
++#ifndef __GENKSYMS__
+ VVAR_PAGE,
++#endif
+ VSYSCALL_HPET,
+ #endif
+ FIX_DBGP_BASE,
+--- a/arch/x86/include/mach-xen/asm/processor.h
++++ b/arch/x86/include/mach-xen/asm/processor.h
+@@ -283,7 +283,11 @@ struct tss_struct {
+
+ } ____cacheline_aligned;
+
++#ifndef __GENKSYMS__
+ DECLARE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(struct tss_struct, init_tss);
++#else
++DECLARE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss);
++#endif
+
+ /*
+ * Save the original ist values for checking stack pointers during debugging
+--- a/arch/x86/include/mach-xen/asm/tlbflush.h
++++ b/arch/x86/include/mach-xen/asm/tlbflush.h
+@@ -6,7 +6,9 @@
+
+ #include <asm/processor.h>
+ #include <asm/system.h>
++#ifndef __GENKSYMS__
+ #include <asm/smp.h>
++#endif
+
+ /*
+ * Declare a couple of kaiser interfaces here for convenience,
+--- a/arch/x86/kernel/cpu/common-xen.c
++++ b/arch/x86/kernel/cpu/common-xen.c
+@@ -92,7 +92,11 @@ static const struct cpu_dev __cpuinitcon
+
+ static const struct cpu_dev *this_cpu __cpuinitdata = &default_cpu;
+
++#ifndef __GENKSYMS__
+ DEFINE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(struct gdt_page, gdt_page) = { .gdt = {
++#else
++DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
++#endif
+ #ifdef CONFIG_X86_64
+ /*
+ * We need valid kernel segments for data and code in long mode too
diff --git a/patches.suse/0001-locking-barriers-introduce-new-memory-barrier-gmb.patch b/patches.suse/0001-locking-barriers-introduce-new-memory-barrier-gmb.patch
new file mode 100644
index 0000000000..a3d90ca2f1
--- /dev/null
+++ b/patches.suse/0001-locking-barriers-introduce-new-memory-barrier-gmb.patch
@@ -0,0 +1,48 @@
+From: Elena Reshetova <elena.reshetova@intel.com>
+Date: Mon, 4 Sep 2017 13:11:43 +0300
+Subject: locking/barriers: introduce new memory barrier gmb()
+References: bsc#1068032 CVE-2017-5753
+Patch-mainline: submitted on 2018/1/9
+References: bnc#1068032
+
+In contrast to the existing mb() and rmb() barriers,
+the gmb() barrier is arch-independent and can be used to
+implement any type of memory barrier.
+In the x86 case, it is either lfence or mfence, based on
+the processor type. ARM and others can define it according
+to their needs.
+
+Suggested-by: Arjan van de Ven <arjan@linux.intel.com>
+Signed-off-by: Elena Reshetova <elena.reshetova@intel.com>
+Signed-off-by: Tim Chen <tim.c.chen@linux.intel.com>
+Signed-off-by: Jiri Slaby <jslaby@suse.cz>
+---
+ arch/x86/include/asm/system.h | 3 +++
+ include/asm-generic/system.h | 4 ++++
+ 2 files changed, 7 insertions(+)
+
+--- a/arch/x86/include/asm/system.h
++++ b/arch/x86/include/asm/system.h
+@@ -416,6 +416,9 @@
+ #define wmb() asm volatile("sfence" ::: "memory")
+ #endif
+
++/* there is no alternative_2 in 3.0, so reuse rdtsc_barrier */
++#define gmb() rdtsc_barrier()
++
+ /**
+ * read_barrier_depends - Flush all pending reads that subsequents reads
+ * depend on.
+--- /dev/null
++++ b/include/linux/system.h
+@@ -0,0 +1,10 @@
++#ifndef _LINUX_SYSTEM_H
++#define _LINUX_SYSTEM_H
++
++#include <asm/system.h>
++
++#ifndef gmb
++#define gmb() do { } while (0)
++#endif
++
++#endif
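A hypothetical caller of the new barrier (not part of this patch; all names are made up), showing the pattern that the 000x-*-prevent-speculative-execution patches in this series apply. gmb() sits between the bounds check on an untrusted index and the dependent load, so the CPU cannot speculatively fetch out of bounds:

    #include <linux/errno.h>
    #include <linux/kernel.h>	/* ARRAY_SIZE() */
    #include <linux/system.h>	/* gmb(), added by this patch */

    static int example_table[16];

    static int example_read(unsigned long idx)
    {
    	if (idx >= ARRAY_SIZE(example_table))
    		return -EINVAL;
    	gmb();		/* block speculation past the bounds check */
    	return example_table[idx];
    }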
diff --git a/patches.suse/0001-x86-64-Give-vvars-their-own-page.patch b/patches.suse/0001-x86-64-Give-vvars-their-own-page.patch
new file mode 100644
index 0000000000..e60dbda79f
--- /dev/null
+++ b/patches.suse/0001-x86-64-Give-vvars-their-own-page.patch
@@ -0,0 +1,186 @@
+From: Andy Lutomirski <luto@MIT.EDU>
+Date: Sun, 5 Jun 2011 13:50:19 -0400
+Subject: [PATCH] x86-64: Give vvars their own page
+References: bsc#1068032 CVE-2017-5754
+Git-commit: 9fd67b4ed0714ab718f1f9bd14c344af336a6df7
+Patch-mainline: v3.2
+
+Move vvars out of the vsyscall page into their own page and mark
+it NX.
+
+Without this patch, an attacker who can force a daemon to call
+some fixed address could wait until the time contains, say,
+0xCD80, and then execute the current time.
+
+Signed-off-by: Andy Lutomirski <luto@mit.edu>
+Cc: Jesper Juhl <jj@chaosbits.net>
+Cc: Borislav Petkov <bp@alien8.de>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Arjan van de Ven <arjan@infradead.org>
+Cc: Jan Beulich <JBeulich@novell.com>
+Cc: richard -rw- weinberger <richard.weinberger@gmail.com>
+Cc: Mikael Pettersson <mikpe@it.uu.se>
+Cc: Andi Kleen <andi@firstfloor.org>
+Cc: Brian Gerst <brgerst@gmail.com>
+Cc: Louis Rilling <Louis.Rilling@kerlabs.com>
+Cc: Valdis.Kletnieks@vt.edu
+Cc: pageexec@freemail.hu
+Link: http://lkml.kernel.org/r/b1460f81dc4463d66ea3f2b5ce240f58d48effec.1307292171.git.luto@mit.edu
+Signed-off-by: Ingo Molnar <mingo@elte.hu>
+Signed-off-by: Jiri Kosina <jkosina@suse.cz>
+---
+ arch/x86/include/asm/fixmap.h | 1 +
+ arch/x86/include/asm/pgtable_types.h | 2 ++
+ arch/x86/include/asm/vvar.h | 22 ++++++++++------------
+ arch/x86/kernel/vmlinux.lds.S | 28 +++++++++++++++++-----------
+ arch/x86/kernel/vsyscall_64.c | 5 +++++
+ 5 files changed, 35 insertions(+), 23 deletions(-)
+
+--- a/arch/x86/include/asm/fixmap.h
++++ b/arch/x86/include/asm/fixmap.h
+@@ -78,6 +78,7 @@ enum fixed_addresses {
+ VSYSCALL_LAST_PAGE,
+ VSYSCALL_FIRST_PAGE = VSYSCALL_LAST_PAGE
+ + ((VSYSCALL_END-VSYSCALL_START) >> PAGE_SHIFT) - 1,
++ VVAR_PAGE,
+ VSYSCALL_HPET,
+ #endif
+ FIX_DBGP_BASE,
+--- a/arch/x86/include/asm/pgtable_types.h
++++ b/arch/x86/include/asm/pgtable_types.h
+@@ -108,6 +108,7 @@
+ #define __PAGE_KERNEL_UC_MINUS (__PAGE_KERNEL | _PAGE_PCD)
+ #define __PAGE_KERNEL_VSYSCALL (__PAGE_KERNEL_RX | _PAGE_USER)
+ #define __PAGE_KERNEL_VSYSCALL_NOCACHE (__PAGE_KERNEL_VSYSCALL | _PAGE_PCD | _PAGE_PWT)
++#define __PAGE_KERNEL_VVAR (__PAGE_KERNEL_RO | _PAGE_USER)
+ #define __PAGE_KERNEL_LARGE (__PAGE_KERNEL | _PAGE_PSE)
+ #define __PAGE_KERNEL_LARGE_NOCACHE (__PAGE_KERNEL | _PAGE_CACHE_UC | _PAGE_PSE)
+ #define __PAGE_KERNEL_LARGE_EXEC (__PAGE_KERNEL_EXEC | _PAGE_PSE)
+@@ -130,6 +131,7 @@
+ #define PAGE_KERNEL_LARGE_EXEC __pgprot(__PAGE_KERNEL_LARGE_EXEC)
+ #define PAGE_KERNEL_VSYSCALL __pgprot(__PAGE_KERNEL_VSYSCALL)
+ #define PAGE_KERNEL_VSYSCALL_NOCACHE __pgprot(__PAGE_KERNEL_VSYSCALL_NOCACHE)
++#define PAGE_KERNEL_VVAR __pgprot(__PAGE_KERNEL_VVAR)
+
+ #define PAGE_KERNEL_IO __pgprot(__PAGE_KERNEL_IO)
+ #define PAGE_KERNEL_IO_NOCACHE __pgprot(__PAGE_KERNEL_IO_NOCACHE)
+--- a/arch/x86/include/asm/vvar.h
++++ b/arch/x86/include/asm/vvar.h
+@@ -10,15 +10,14 @@
+ * In normal kernel code, they are used like any other variable.
+ * In user code, they are accessed through the VVAR macro.
+ *
+- * Each of these variables lives in the vsyscall page, and each
+- * one needs a unique offset within the little piece of the page
+- * reserved for vvars. Specify that offset in DECLARE_VVAR.
+- * (There are 896 bytes available. If you mess up, the linker will
+- * catch it.)
++ * These variables live in a page of kernel data that has an extra RO
++ * mapping for userspace. Each variable needs a unique offset within
++ * that page; specify that offset with the DECLARE_VVAR macro. (If
++ * you mess up, the linker will catch it.)
+ */
+
+-/* Offset of vars within vsyscall page */
+-#define VSYSCALL_VARS_OFFSET (3072 + 128)
++/* Base address of vvars. This is not ABI. */
++#define VVAR_ADDRESS (-10*1024*1024 - 4096)
+
+ #if defined(__VVAR_KERNEL_LDS)
+
+@@ -26,17 +25,17 @@
+ * right place.
+ */
+ #define DECLARE_VVAR(offset, type, name) \
+- EMIT_VVAR(name, VSYSCALL_VARS_OFFSET + offset)
++ EMIT_VVAR(name, offset)
+
+ #else
+
+ #define DECLARE_VVAR(offset, type, name) \
+ static type const * const vvaraddr_ ## name = \
+- (void *)(VSYSCALL_START + VSYSCALL_VARS_OFFSET + (offset));
++ (void *)(VVAR_ADDRESS + (offset));
+
+ #define DEFINE_VVAR(type, name) \
+- type __vvar_ ## name \
+- __attribute__((section(".vsyscall_var_" #name), aligned(16)))
++ type name \
++ __attribute__((section(".vvar_" #name), aligned(16)))
+
+ #define VVAR(name) (*vvaraddr_ ## name)
+
+@@ -49,4 +48,3 @@ DECLARE_VVAR(8, int, vgetcpu_mode)
+ DECLARE_VVAR(128, struct vsyscall_gtod_data, vsyscall_gtod_data)
+
+ #undef DECLARE_VVAR
+-#undef VSYSCALL_VARS_OFFSET
+--- a/arch/x86/kernel/vmlinux.lds.S
++++ b/arch/x86/kernel/vmlinux.lds.S
+@@ -161,12 +161,6 @@ SECTIONS
+
+ #define VVIRT_OFFSET (VSYSCALL_ADDR - __vsyscall_0)
+ #define VVIRT(x) (ADDR(x) - VVIRT_OFFSET)
+-#define EMIT_VVAR(x, offset) .vsyscall_var_ ## x \
+- ADDR(.vsyscall_0) + offset \
+- : AT(VLOAD(.vsyscall_var_ ## x)) { \
+- *(.vsyscall_var_ ## x) \
+- } \
+- x = VVIRT(.vsyscall_var_ ## x);
+
+ . = ALIGN(4096);
+ __vsyscall_0 = .;
+@@ -192,19 +186,31 @@ SECTIONS
+ *(.vsyscall_3)
+ }
+
+-#define __VVAR_KERNEL_LDS
+-#include <asm/vvar.h>
+-#undef __VVAR_KERNEL_LDS
+-
+- . = __vsyscall_0 + PAGE_SIZE;
++ . = ALIGN(__vsyscall_0 + PAGE_SIZE, PAGE_SIZE);
+
+ #undef VSYSCALL_ADDR
+ #undef VLOAD_OFFSET
+ #undef VLOAD
+ #undef VVIRT_OFFSET
+ #undef VVIRT
++
++ __vvar_page = .;
++
++ .vvar : AT(ADDR(.vvar) - LOAD_OFFSET) {
++
++ /* Place all vvars at the offsets in asm/vvar.h. */
++#define EMIT_VVAR(name, offset) \
++ . = offset; \
++ *(.vvar_ ## name)
++#define __VVAR_KERNEL_LDS
++#include <asm/vvar.h>
++#undef __VVAR_KERNEL_LDS
+ #undef EMIT_VVAR
+
++ } :data
++
++ . = ALIGN(__vvar_page + PAGE_SIZE, PAGE_SIZE);
++
+ #endif /* CONFIG_X86_64 */
+
+ /* Init code and data - will be freed after init */
+--- a/arch/x86/kernel/vsyscall_64.c
++++ b/arch/x86/kernel/vsyscall_64.c
+@@ -284,9 +284,14 @@ void __init map_vsyscall(void)
+ {
+ extern char __vsyscall_0;
+ unsigned long physaddr_page0 = __pa_symbol(&__vsyscall_0);
++ extern char __vvar_page;
++ unsigned long physaddr_vvar_page = __pa_symbol(&__vvar_page);
+
+ /* Note that VSYSCALL_MAPPED_PAGES must agree with the code below. */
+ __set_fixmap(VSYSCALL_FIRST_PAGE, physaddr_page0, PAGE_KERNEL_VSYSCALL);
++ __set_fixmap(VVAR_PAGE, physaddr_vvar_page, PAGE_KERNEL_VVAR);
++ BUILD_BUG_ON((unsigned long)__fix_to_virt(VVAR_PAGE) !=
++ (unsigned long)VVAR_ADDRESS);
+ }
+
+ static int __init vsyscall_init(void)
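To illustrate the DECLARE_VVAR/DEFINE_VVAR/VVAR machinery this patch reworks (the variable name and the offset 16 are invented for the example and are not part of the patch): the kernel defines and writes the variable as usual, while vsyscall/vDSO code reads it through the fixed read-only alias at VVAR_ADDRESS:

    /* in asm/vvar.h, at a free, 16-byte aligned offset inside the vvar page: */
    DECLARE_VVAR(16, int, vexample_mode)

    /* in kernel code: the definition lands in section .vvar_vexample_mode,
     * which the linker script collects into the single __vvar_page */
    DEFINE_VVAR(int, vexample_mode);

    void set_example_mode(int mode)
    {
    	vexample_mode = mode;		/* kernel side: ordinary write */
    }

    /* in vsyscall/vDSO code: read-only access through the user mapping */
    static inline int get_example_mode(void)
    {
    	return VVAR(vexample_mode);
    }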
diff --git a/patches.suse/0001-x86-64-Map-the-HPET-NX.patch b/patches.suse/0001-x86-64-Map-the-HPET-NX.patch
new file mode 100644
index 0000000000..9689ba1754
--- /dev/null
+++ b/patches.suse/0001-x86-64-Map-the-HPET-NX.patch
@@ -0,0 +1,66 @@
+From: Andy Lutomirski <luto@mit.edu>
+Date: Sun, 5 Jun 2011 13:50:21 -0400
+Subject: [PATCH] x86-64: Map the HPET NX
+References: bsc#1068032 CVE-2017-5754
+Git-commit: d319bb79afa4039bda6f85661d6bf0c13299ce93
+Patch-mainline: 4.0-rc1
+
+Currently the HPET mapping is a user-accessible syscall
+instruction at a fixed address some of the time.
+
+A sufficiently determined hacker might be able to guess when.
+
+Signed-off-by: Andy Lutomirski <luto@mit.edu>
+Cc: Jesper Juhl <jj@chaosbits.net>
+Cc: Borislav Petkov <bp@alien8.de>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Arjan van de Ven <arjan@infradead.org>
+Cc: Jan Beulich <JBeulich@novell.com>
+Cc: richard -rw- weinberger <richard.weinberger@gmail.com>
+Cc: Mikael Pettersson <mikpe@it.uu.se>
+Cc: Andi Kleen <andi@firstfloor.org>
+Cc: Brian Gerst <brgerst@gmail.com>
+Cc: Louis Rilling <Louis.Rilling@kerlabs.com>
+Cc: Valdis.Kletnieks@vt.edu
+Cc: pageexec@freemail.hu
+Link: http://lkml.kernel.org/r/ab41b525a4ca346b1ca1145d16fb8d181861a8aa.1307292171.git.luto@mit.edu
+Signed-off-by: Ingo Molnar <mingo@elte.hu>
+Signed-off-by: Jiri Kosina <jkosina@suse.cz>
+---
+ arch/x86/include/asm/pgtable_types.h | 4 ++--
+ arch/x86/kernel/hpet.c | 2 +-
+ 2 files changed, 3 insertions(+), 3 deletions(-)
+
+--- a/arch/x86/include/asm/pgtable_types.h
++++ b/arch/x86/include/asm/pgtable_types.h
+@@ -107,8 +107,8 @@
+ #define __PAGE_KERNEL_NOCACHE (__PAGE_KERNEL | _PAGE_PCD | _PAGE_PWT)
+ #define __PAGE_KERNEL_UC_MINUS (__PAGE_KERNEL | _PAGE_PCD)
+ #define __PAGE_KERNEL_VSYSCALL (__PAGE_KERNEL_RX | _PAGE_USER)
+-#define __PAGE_KERNEL_VSYSCALL_NOCACHE (__PAGE_KERNEL_VSYSCALL | _PAGE_PCD | _PAGE_PWT)
+ #define __PAGE_KERNEL_VVAR (__PAGE_KERNEL_RO | _PAGE_USER)
++#define __PAGE_KERNEL_VVAR_NOCACHE (__PAGE_KERNEL_VVAR | _PAGE_PCD | _PAGE_PWT)
+ #define __PAGE_KERNEL_LARGE (__PAGE_KERNEL | _PAGE_PSE)
+ #define __PAGE_KERNEL_LARGE_NOCACHE (__PAGE_KERNEL | _PAGE_CACHE_UC | _PAGE_PSE)
+ #define __PAGE_KERNEL_LARGE_EXEC (__PAGE_KERNEL_EXEC | _PAGE_PSE)
+@@ -130,8 +130,8 @@
+ #define PAGE_KERNEL_LARGE_NOCACHE __pgprot(__PAGE_KERNEL_LARGE_NOCACHE)
+ #define PAGE_KERNEL_LARGE_EXEC __pgprot(__PAGE_KERNEL_LARGE_EXEC)
+ #define PAGE_KERNEL_VSYSCALL __pgprot(__PAGE_KERNEL_VSYSCALL)
+-#define PAGE_KERNEL_VSYSCALL_NOCACHE __pgprot(__PAGE_KERNEL_VSYSCALL_NOCACHE)
+ #define PAGE_KERNEL_VVAR __pgprot(__PAGE_KERNEL_VVAR)
++#define PAGE_KERNEL_VVAR_NOCACHE __pgprot(__PAGE_KERNEL_VVAR_NOCACHE)
+
+ #define PAGE_KERNEL_IO __pgprot(__PAGE_KERNEL_IO)
+ #define PAGE_KERNEL_IO_NOCACHE __pgprot(__PAGE_KERNEL_IO_NOCACHE)
+--- a/arch/x86/kernel/hpet.c
++++ b/arch/x86/kernel/hpet.c
+@@ -99,7 +99,7 @@ static inline void hpet_set_mapping(void
+ {
+ hpet_virt_address = ioremap_nocache(hpet_address, HPET_MMAP_SIZE);
+ #ifdef CONFIG_X86_64
+- __set_fixmap(VSYSCALL_HPET, hpet_address, PAGE_KERNEL_VSYSCALL_NOCACHE);
++ __set_fixmap(VSYSCALL_HPET, hpet_address, PAGE_KERNEL_VVAR_NOCACHE);
+ #endif
+ }
+
diff --git a/patches.suse/0001-x86-boot-Carve-out-early-cmdline-parsing-function.patch b/patches.suse/0001-x86-boot-Carve-out-early-cmdline-parsing-function.patch
new file mode 100644
index 0000000000..5efa54b7b1
--- /dev/null
+++ b/patches.suse/0001-x86-boot-Carve-out-early-cmdline-parsing-function.patch
@@ -0,0 +1,131 @@
+From 1b1ded57a4f2f4420b4de7c395d1b841d8b3c41a Mon Sep 17 00:00:00 2001
+From: Borislav Petkov <bp@suse.de>
+Date: Mon, 19 May 2014 20:59:16 +0200
+Subject: [PATCH] x86, boot: Carve out early cmdline parsing function
+References: bsc#1068032 CVE-2017-5754
+Git-commit: 1b1ded57a4f2f4420b4de7c395d1b841d8b3c41a
+Patch-mainline: 3.16-rc1
+
+Carve out early cmdline parsing function into .../lib/cmdline.c so it
+can be used by early code in the kernel proper as well.
+
+Adapted from arch/x86/boot/cmdline.c.
+
+Signed-off-by: Borislav Petkov <bp@suse.de>
+Link: http://lkml.kernel.org/r/1400525957-11525-2-git-send-email-bp@alien8.de
+Signed-off-by: H. Peter Anvin <hpa@zytor.com>
+---
+ arch/x86/include/asm/cmdline.h | 6 ++
+ arch/x86/lib/Makefile | 2
+ arch/x86/lib/cmdline.c | 84 +++++++++++++++++++++++++++++++++++++++++
+ 3 files changed, 91 insertions(+), 1 deletion(-)
+ create mode 100644 arch/x86/include/asm/cmdline.h
+ create mode 100644 arch/x86/lib/cmdline.c
+
+--- /dev/null
++++ b/arch/x86/include/asm/cmdline.h
+@@ -0,0 +1,6 @@
++#ifndef _ASM_X86_CMDLINE_H
++#define _ASM_X86_CMDLINE_H
++
++int cmdline_find_option_bool(const char *cmdline_ptr, const char *option);
++
++#endif /* _ASM_X86_CMDLINE_H */
+--- /dev/null
++++ b/arch/x86/lib/cmdline.c
+@@ -0,0 +1,84 @@
++/*
++ * This file is part of the Linux kernel, and is made available under
++ * the terms of the GNU General Public License version 2.
++ *
++ * Misc librarized functions for cmdline poking.
++ */
++#include <linux/kernel.h>
++#include <linux/string.h>
++#include <linux/ctype.h>
++#include <asm/setup.h>
++
++static inline int myisspace(u8 c)
++{
++ return c <= ' '; /* Close enough approximation */
++}
++
++/**
++ * Find a boolean option (like quiet,noapic,nosmp....)
++ *
++ * @cmdline: the cmdline string
++ * @option: option string to look for
++ *
++ * Returns the position of that @option (starts counting with 1)
++ * or 0 on not found.
++ */
++int cmdline_find_option_bool(const char *cmdline, const char *option)
++{
++ char c;
++ int len, pos = 0, wstart = 0;
++ const char *opptr = NULL;
++ enum {
++ st_wordstart = 0, /* Start of word/after whitespace */
++ st_wordcmp, /* Comparing this word */
++ st_wordskip, /* Miscompare, skip */
++ } state = st_wordstart;
++
++ if (!cmdline)
++ return -1; /* No command line */
++
++ len = min_t(int, strlen(cmdline), COMMAND_LINE_SIZE);
++ if (!len)
++ return 0;
++
++ while (len--) {
++ c = *(char *)cmdline++;
++ pos++;
++
++ switch (state) {
++ case st_wordstart:
++ if (!c)
++ return 0;
++ else if (myisspace(c))
++ break;
++
++ state = st_wordcmp;
++ opptr = option;
++ wstart = pos;
++ /* fall through */
++
++ case st_wordcmp:
++ if (!*opptr)
++ if (!c || myisspace(c))
++ return wstart;
++ else
++ state = st_wordskip;
++ else if (!c)
++ return 0;
++ else if (c != *opptr++)
++ state = st_wordskip;
++ else if (!len) /* last word and is matching */
++ return wstart;
++ break;
++
++ case st_wordskip:
++ if (!c)
++ return 0;
++ else if (myisspace(c))
++ state = st_wordstart;
++ break;
++ }
++ }
++
++ return 0; /* Buffer overrun */
++}
+--- a/arch/x86/lib/Makefile
++++ b/arch/x86/lib/Makefile
+@@ -16,7 +16,7 @@ clean-files := inat-tables.c
+
+ obj-$(CONFIG_SMP) += msr-smp.o cache-smp.o
+
+-lib-y := delay.o
++lib-y := delay.o cmdline.o
+ lib-y += thunk_$(BITS).o
+ lib-y += usercopy_$(BITS).o usercopy.o getuser.o putuser.o
+ lib-y += memcpy_$(BITS).o
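A hedged usage sketch for the helper carved out above (the call site and the option string are illustrative and not part of the patch; boot_command_line comes from <linux/init.h>):

    #include <linux/kernel.h>
    #include <linux/init.h>
    #include <asm/cmdline.h>

    static void __init example_early_parse(void)
    {
            /* Returns the 1-based position of the matching word, or 0 if absent. */
            if (cmdline_find_option_bool(boot_command_line, "noexample"))
                    pr_info("'noexample' given on the kernel command line\n");
    }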
diff --git a/patches.suse/0002-bpf-prevent-speculative-execution-in-eBPF-interprete.patch b/patches.suse/0002-bpf-prevent-speculative-execution-in-eBPF-interprete.patch
new file mode 100644
index 0000000000..3a04db5948
--- /dev/null
+++ b/patches.suse/0002-bpf-prevent-speculative-execution-in-eBPF-interprete.patch
@@ -0,0 +1,44 @@
+From: Elena Reshetova <elena.reshetova@intel.com>
+Date: Mon, 4 Sep 2017 13:11:44 +0300
+Subject: bpf: prevent speculative execution in eBPF interpreter
+References: bsc#1068032 CVE-2017-5753
+Patch-mainline: submitted on 2018/1/9
+References: bnc#1068032
+
+This adds a generic memory barrier before LD_IMM_DW and
+LDX_MEM_B/H/W/DW eBPF instructions during eBPF program
+execution in order to prevent speculative execution on
+out-of-bounds BPF_MAP array indexes. This way arbitrary kernel
+memory is not exposed through side-channel attacks.
+
+For more details, please see this Google Project Zero report: tbd
+
+Signed-off-by: Elena Reshetova <elena.reshetova@intel.com>
+Signed-off-by: Tim Chen <tim.c.chen@linux.intel.com>
+Signed-off-by: Jiri Slaby <jslaby@suse.cz>
+---
+ net/core/filter.c | 2 ++
+ 1 file changed, 2 insertions(+)
+
+--- a/net/core/filter.c
++++ b/net/core/filter.c
+@@ -39,6 +39,7 @@
+ #include <linux/filter.h>
+ #include <linux/reciprocal_div.h>
+ #include <linux/ratelimit.h>
++#include <linux/system.h>
+
+ /* No hurry in this branch */
+ static void *__load_pointer(const struct sk_buff *skb, int k, unsigned int size)
+@@ -266,9 +267,11 @@
+ X = K;
+ continue;
+ case BPF_S_LD_MEM:
++ gmb();
+ A = mem[K];
+ continue;
+ case BPF_S_LDX_MEM:
++ gmb();
+ X = mem[K];
+ continue;
+ case BPF_S_MISC_TAX:
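The same idiom recurs in the uvcvideo, carl9170, p54, fs and udf patches below: a user-controllable index passes its bounds check, then gmb() sits between the check and the dependent load so the load cannot be issued speculatively with an out-of-range index. A minimal sketch of the pattern (function and variable names are illustrative, not taken from any of these patches):

    #include <linux/system.h>   /* gmb(), introduced earlier in this series */

    static int read_entry(const int *table, unsigned int nr_entries,
                          unsigned int idx)
    {
            int val = 0;

            if (idx < nr_entries) {
                    gmb();              /* fence off speculation past the check */
                    val = table[idx];   /* dependent load, index is now architectural */
            }
            return val;
    }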
diff --git a/patches.suse/0003-uvcvideo-prevent-speculative-execution.patch b/patches.suse/0003-uvcvideo-prevent-speculative-execution.patch
new file mode 100644
index 0000000000..97f858be17
--- /dev/null
+++ b/patches.suse/0003-uvcvideo-prevent-speculative-execution.patch
@@ -0,0 +1,34 @@
+From: Elena Reshetova <elena.reshetova@intel.com>
+Date: Mon, 4 Sep 2017 13:11:46 +0300
+Subject: uvcvideo: prevent speculative execution
+References: bsc#1068032 CVE-2017-5753
+Patch-mainline: submitted on 2018/1/9
+References: bnc#1068032
+
+real commit text tbd
+
+Signed-off-by: Elena Reshetova <elena.reshetova@intel.com>
+Signed-off-by: Tim Chen <tim.c.chen@linux.intel.com>
+Signed-off-by: Jiri Slaby <jslaby@suse.cz>
+---
+ drivers/media/video/uvc/uvc_v4l2.c | 1 +
+ 1 file changed, 1 insertion(+)
+
+--- a/drivers/media/video/uvc/uvc_v4l2.c
++++ b/drivers/media/video/uvc/uvc_v4l2.c
+@@ -21,6 +21,7 @@
+ #include <linux/vmalloc.h>
+ #include <linux/mm.h>
+ #include <linux/wait.h>
++#include <linux/system.h>
+ #include <asm/atomic.h>
+
+ #include <media/v4l2-common.h>
+@@ -711,6 +712,7 @@
+ }
+ pin = iterm->id;
+ } else if (index < selector->bNrInPins) {
++ gmb();
+ pin = selector->baSourceID[index];
+ list_for_each_entry(iterm, &chain->entities, chain) {
+ if (!UVC_ENTITY_IS_ITERM(iterm))
diff --git a/patches.suse/0004-carl9170-prevent-speculative-execution.patch b/patches.suse/0004-carl9170-prevent-speculative-execution.patch
new file mode 100644
index 0000000000..403355b277
--- /dev/null
+++ b/patches.suse/0004-carl9170-prevent-speculative-execution.patch
@@ -0,0 +1,34 @@
+From: Elena Reshetova <elena.reshetova@intel.com>
+Date: Mon, 4 Sep 2017 13:11:47 +0300
+Subject: carl9170: prevent speculative execution
+References: bsc#1068032 CVE-2017-5753
+Patch-mainline: submitted on 2018/1/9
+References: bnc#1068032
+
+Real commit text tbd
+
+Signed-off-by: Elena Reshetova <elena.reshetova@intel.com>
+Signed-off-by: Tim Chen <tim.c.chen@linux.intel.com>
+Signed-off-by: Jiri Slaby <jslaby@suse.cz>
+---
+ drivers/net/wireless/ath/carl9170/main.c | 1 +
+ 1 file changed, 1 insertion(+)
+
+--- a/drivers/net/wireless/ath/carl9170/main.c
++++ b/drivers/net/wireless/ath/carl9170/main.c
+@@ -42,6 +42,7 @@
+ #include <linux/module.h>
+ #include <linux/etherdevice.h>
+ #include <linux/random.h>
++#include <linux/system.h>
+ #include <net/mac80211.h>
+ #include <net/cfg80211.h>
+ #include "hw.h"
+@@ -1263,6 +1264,7 @@
+
+ mutex_lock(&ar->mutex);
+ if (queue < ar->hw->queues) {
++ gmb();
+ memcpy(&ar->edcf[ar9170_qmap[queue]], param, sizeof(*param));
+ ret = carl9170_set_qos(ar);
+ } else {
diff --git a/patches.suse/0004-sk_run_filter-add-bpf_s_anc_seccomp_ld_w.patch b/patches.suse/0004-sk_run_filter-add-bpf_s_anc_seccomp_ld_w.patch
index 5bb2bd9a14..84f526c651 100644
--- a/patches.suse/0004-sk_run_filter-add-bpf_s_anc_seccomp_ld_w.patch
+++ b/patches.suse/0004-sk_run_filter-add-bpf_s_anc_seccomp_ld_w.patch
@@ -27,11 +27,9 @@ Acked-by: Bruce Rogers <brogers@suse.com>
net/core/filter.c | 6 ++++++
2 files changed, 7 insertions(+)
-diff --git a/include/linux/filter.h b/include/linux/filter.h
-index 9ee3f9f..d274c84 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
-@@ -228,6 +228,7 @@ enum {
+@@ -228,6 +228,7 @@
BPF_S_ANC_HATYPE,
BPF_S_ANC_RXHASH,
BPF_S_ANC_CPU,
@@ -39,19 +37,17 @@ index 9ee3f9f..d274c84 100644
};
#endif /* __KERNEL__ */
-diff --git a/net/core/filter.c b/net/core/filter.c
-index 4ccf6f4..55fec64 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
-@@ -39,6 +39,7 @@
- #include <linux/filter.h>
+@@ -40,6 +40,7 @@
#include <linux/reciprocal_div.h>
#include <linux/ratelimit.h>
+ #include <linux/system.h>
+#include <linux/seccomp.h>
/* No hurry in this branch */
static void *__load_pointer(const struct sk_buff *skb, int k, unsigned int size)
-@@ -358,6 +359,11 @@ load_b:
+@@ -365,6 +366,11 @@
A = 0;
continue;
}
@@ -63,6 +59,3 @@ index 4ccf6f4..55fec64 100644
default:
WARN_RATELIMIT(1, "Unknown code:%u jt:%u tf:%u k:%u\n",
fentry->code, fentry->jt,
---
-1.7.12.4
-
diff --git a/patches.suse/0005-p54-prevent-speculative-execution.patch b/patches.suse/0005-p54-prevent-speculative-execution.patch
new file mode 100644
index 0000000000..9556336ee8
--- /dev/null
+++ b/patches.suse/0005-p54-prevent-speculative-execution.patch
@@ -0,0 +1,34 @@
+From: Elena Reshetova <elena.reshetova@intel.com>
+Date: Mon, 4 Sep 2017 13:11:48 +0300
+Subject: p54: prevent speculative execution
+References: bsc#1068032 CVE-2017-5753
+Patch-mainline: submitted on 2018/1/9
+References: bnc#1068032
+
+Real commit text tbd
+
+Signed-off-by: Elena Reshetova <elena.reshetova@intel.com>
+Signed-off-by: Tim Chen <tim.c.chen@linux.intel.com>
+Signed-off-by: Jiri Slaby <jslaby@suse.cz>
+---
+ drivers/net/wireless/p54/main.c | 1 +
+ 1 file changed, 1 insertion(+)
+
+--- a/drivers/net/wireless/p54/main.c
++++ b/drivers/net/wireless/p54/main.c
+@@ -20,6 +20,7 @@
+ #include <linux/slab.h>
+ #include <linux/firmware.h>
+ #include <linux/etherdevice.h>
++#include <linux/system.h>
+
+ #include <net/mac80211.h>
+
+@@ -361,6 +362,7 @@
+
+ mutex_lock(&priv->conf_mutex);
+ if (queue < dev->queues) {
++ gmb();
+ P54_SET_QUEUE(priv->qos_params[queue], params->aifs,
+ params->cw_min, params->cw_max, params->txop);
+ ret = p54_set_edcf(priv);
diff --git a/patches.suse/0006-fs-prevent-speculative-execution.patch b/patches.suse/0006-fs-prevent-speculative-execution.patch
new file mode 100644
index 0000000000..28c0299755
--- /dev/null
+++ b/patches.suse/0006-fs-prevent-speculative-execution.patch
@@ -0,0 +1,40 @@
+From: Elena Reshetova <elena.reshetova@intel.com>
+Date: Mon, 4 Sep 2017 13:11:54 +0300
+Subject: fs: prevent speculative execution
+References: bsc#1068032 CVE-2017-5753
+Patch-mainline: submitted on 2018/1/9
+References: bnc#1068032
+
+Real commit text tbd
+
+Signed-off-by: Elena Reshetova <elena.reshetova@intel.com>
+Signed-off-by: Tim Chen <tim.c.chen@linux.intel.com>
+Signed-off-by: Jiri Slaby <jslaby@suse.cz>
+---
+ include/linux/fdtable.h | 4 +++-
+ 1 file changed, 3 insertions(+), 1 deletion(-)
+
+--- a/include/linux/fdtable.h
++++ b/include/linux/fdtable.h
+@@ -12,6 +12,9 @@
+ #include <linux/types.h>
+ #include <linux/init.h>
+ #include <linux/fs.h>
++#ifndef __GENKSYMS__
++#include <linux/system.h>
++#endif
+
+ #include <asm/atomic.h>
+
+@@ -86,8 +89,10 @@
+ struct file * file = NULL;
+ struct fdtable *fdt = files_fdtable(files);
+
+- if (fd < fdt->max_fds)
++ if (fd < fdt->max_fds) {
++ gmb();
+ file = rcu_dereference_check_fdtable(files, fdt->fd[fd]);
++ }
+ return file;
+ }
+
diff --git a/patches.suse/0007-udf-prevent-speculative-execution.patch b/patches.suse/0007-udf-prevent-speculative-execution.patch
new file mode 100644
index 0000000000..dbb2cf2613
--- /dev/null
+++ b/patches.suse/0007-udf-prevent-speculative-execution.patch
@@ -0,0 +1,53 @@
+From: Elena Reshetova <elena.reshetova@intel.com>
+Date: Mon, 4 Sep 2017 13:11:56 +0300
+Subject: udf: prevent speculative execution
+References: bsc#1068032 CVE-2017-5753
+Patch-mainline: submitted on 2018/1/9
+References: bnc#1068032
+
+Real commit text tbd
+
+Signed-off-by: Elena Reshetova <elena.reshetova@intel.com>
+Signed-off-by: Tim Chen <tim.c.chen@linux.intel.com>
+Signed-off-by: Jiri Slaby <jslaby@suse.cz>
+---
+ fs/udf/misc.c | 6 ++++++
+ 1 file changed, 6 insertions(+)
+
+--- a/fs/udf/misc.c
++++ b/fs/udf/misc.c
+@@ -25,6 +25,7 @@
+ #include <linux/string.h>
+ #include <linux/buffer_head.h>
+ #include <linux/crc-itu-t.h>
++#include <linux/system.h>
+
+ #include "udf_i.h"
+ #include "udf_sb.h"
+@@ -105,6 +106,8 @@
+ iinfo->i_lenEAttr) {
+ uint32_t aal =
+ le32_to_cpu(eahd->appAttrLocation);
++
++ gmb();
+ memmove(&ea[offset - aal + size],
+ &ea[aal], offset - aal);
+ offset -= aal;
+@@ -115,6 +118,8 @@
+ iinfo->i_lenEAttr) {
+ uint32_t ial =
+ le32_to_cpu(eahd->impAttrLocation);
++
++ gmb();
+ memmove(&ea[offset - ial + size],
+ &ea[ial], offset - ial);
+ offset -= ial;
+@@ -126,6 +131,8 @@
+ iinfo->i_lenEAttr) {
+ uint32_t aal =
+ le32_to_cpu(eahd->appAttrLocation);
++
++ gmb();
+ memmove(&ea[offset - aal + size],
+ &ea[aal], offset - aal);
+ offset -= aal;
diff --git a/patches.suse/01-x86-feature-enable-the-x86-feature-to-control-speculation.patch b/patches.suse/01-x86-feature-enable-the-x86-feature-to-control-speculation.patch
new file mode 100644
index 0000000000..357432c75e
--- /dev/null
+++ b/patches.suse/01-x86-feature-enable-the-x86-feature-to-control-speculation.patch
@@ -0,0 +1,63 @@
+From: Tim Chen <tim.c.chen@linux.intel.com>
+Date: Thu, 24 Aug 2017 09:34:41 -0700
+Subject: x86/feature: Enable the x86 feature to control Speculation
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+References: bsc#1068032
+Patch-mainline: submitted on 2018/1/9
+
+CPUID leaf 0x7 returns EDX bit 26 to indicate the presence of this feature:
+IA32_SPEC_CTRL (0x48) and IA32_PRED_CMD (0x49)
+IA32_SPEC_CTRL, bit0 – Indirect Branch Restricted Speculation (IBRS)
+IA32_PRED_CMD, bit0 – Indirect Branch Prediction Barrier (IBPB)
+
+Signed-off-by: Tim Chen <tim.c.chen@linux.intel.com>
+Signed-off-by: Borislav Petkov <bp@suse.de>
+---
+ arch/x86/include/asm/cpufeature.h | 1 +
+ arch/x86/include/asm/msr-index.h | 5 +++++
+ arch/x86/kernel/cpu/scattered.c | 1 +
+ 3 files changed, 7 insertions(+)
+
+--- a/arch/x86/include/asm/cpufeature.h
++++ b/arch/x86/include/asm/cpufeature.h
+@@ -178,6 +178,7 @@
+ #define X86_FEATURE_PTS (7*32+ 6) /* Intel Package Thermal Status */
+ #define X86_FEATURE_DTHERM (7*32+ 7) /* Digital Thermal Sensor */
+ #define X86_FEATURE_INVPCID_SINGLE (7*32+ 8) /* Effectively INVPCID && CR4.PCIDE=1 */
++#define X86_FEATURE_SPEC_CTRL ( 7*32+19) /* Control Speculation Control */
+
+ /* Because the ALTERNATIVE scheme is for members of the X86_FEATURE club... */
+ #define X86_FEATURE_KAISER ( 7*32+31) /* CONFIG_KAISER w/o nokaiser */
+--- a/arch/x86/include/asm/msr-index.h
++++ b/arch/x86/include/asm/msr-index.h
+@@ -32,6 +32,9 @@
+ #define EFER_FFXSR (1<<_EFER_FFXSR)
+
+ /* Intel MSRs. Some also available on other CPUs */
++#define MSR_IA32_SPEC_CTRL 0x00000048
++#define MSR_IA32_PRED_CMD 0x00000049
++
+ #define MSR_IA32_PERFCTR0 0x000000c1
+ #define MSR_IA32_PERFCTR1 0x000000c2
+ #define MSR_FSB_FREQ 0x000000cd
+@@ -233,6 +236,8 @@
+ #define FEATURE_CONTROL_LOCKED (1<<0)
+ #define FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX (1<<1)
+ #define FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX (1<<2)
++#define FEATURE_ENABLE_IBRS (1<<0)
++#define FEATURE_SET_IBPB (1<<0)
+
+ #define MSR_IA32_APICBASE 0x0000001b
+ #define MSR_IA32_APICBASE_BSP (1<<8)
+--- a/arch/x86/kernel/cpu/scattered.c
++++ b/arch/x86/kernel/cpu/scattered.c
+@@ -38,6 +38,7 @@ void __cpuinit init_scattered_cpuid_feat
+ { X86_FEATURE_PTS, CR_EAX, 6, 0x00000006, 0 },
+ { X86_FEATURE_APERFMPERF, CR_ECX, 0, 0x00000006, 0 },
+ { X86_FEATURE_EPB, CR_ECX, 3, 0x00000006, 0 },
++ { X86_FEATURE_SPEC_CTRL, CR_EDX,26, 0x00000007, 0 },
+ { X86_FEATURE_XSAVEOPT, CR_EAX, 0, 0x0000000d, 1 },
+ { X86_FEATURE_CPB, CR_EDX, 9, 0x80000007, 0 },
+ { X86_FEATURE_NPT, CR_EDX, 0, 0x8000000a, 0 },
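For reference, the bit picked up by the scattered-feature table above is CPUID.(EAX=7,ECX=0):EDX[26]. A small user-space probe for the same bit (a sketch, assuming the CPU exposes leaf 7; built with gcc on x86, not part of the patch):

    #include <stdio.h>
    #include <cpuid.h>

    int main(void)
    {
            unsigned int eax, ebx, ecx, edx;

            /* Leaf 7, subleaf 0: structured extended feature flags. */
            __cpuid_count(7, 0, eax, ebx, ecx, edx);

            printf("SPEC_CTRL (IBRS/IBPB) supported: %s\n",
                   (edx & (1u << 26)) ? "yes" : "no");
            return 0;
    }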
diff --git a/patches.suse/02-x86-enter-add-macros-to-set-clear-ibrs-and-set-ibpb.patch b/patches.suse/02-x86-enter-add-macros-to-set-clear-ibrs-and-set-ibpb.patch
new file mode 100644
index 0000000000..c2569e1a12
--- /dev/null
+++ b/patches.suse/02-x86-enter-add-macros-to-set-clear-ibrs-and-set-ibpb.patch
@@ -0,0 +1,87 @@
+From: Tim Chen <tim.c.chen@linux.intel.com>
+Date: Fri, 15 Sep 2017 18:04:53 -0700
+Subject: x86/enter: Add macros to set/clear IBRS and set IBPB
+Patch-mainline: submitted on 2018/1/9
+References: bsc#1068032
+
+Add setup macros to control IBRS and IBPB.
+
+Boris:
+
+Change the alternatives to jump over the code so that backports
+to older versions are easier, since ALTERNATIVEs padding only
+arrived in v4.1.
+
+Also, make them proper asm macros.
+
+Also, use XOR to zero out regs.
+
+Also, fold in __ENABLE_IBRS_CLOBBER into the other macros.
+
+Signed-off-by: Tim Chen <tim.c.chen@linux.intel.com>
+Signed-off-by: Borislav Petkov <bp@suse.de>
+---
+ arch/x86/include/asm/spec_ctrl.h | 54 ++++++++++++++++++++++++++++++++++++++++
+ 1 file changed, 54 insertions(+)
+
+diff --git a/arch/x86/include/asm/spec_ctrl.h b/arch/x86/include/asm/spec_ctrl.h
+new file mode 100644
+index 000000000000..5e8c4124abed
+--- /dev/null
++++ b/arch/x86/include/asm/spec_ctrl.h
+@@ -0,0 +1,54 @@
++#ifndef _ASM_X86_SPEC_CTRL_H
++#define _ASM_X86_SPEC_CTRL_H
++
++#include <linux/stringify.h>
++#include <asm/msr-index.h>
++#include <asm/cpufeature.h>
++#include <asm/alternative-asm.h>
++
++#ifdef __ASSEMBLY__
++
++.macro __ENABLE_IBRS_CLOBBER
++ movl $MSR_IA32_SPEC_CTRL, %ecx
++ xorl %edx, %edx
++ movl $FEATURE_ENABLE_IBRS, %eax
++ wrmsr
++.endm
++
++.macro ENABLE_IBRS_CLOBBER
++ ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_SPEC_CTRL
++ __ENABLE_IBRS_CLOBBER
++.Lend_\@:
++.endm
++
++
++.macro ENABLE_IBRS
++ ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_SPEC_CTRL
++ pushq %rax
++ pushq %rcx
++ pushq %rdx
++ __ENABLE_IBRS_CLOBBER
++ popq %rdx
++ popq %rcx
++ popq %rax
++.Lend_\@:
++.endm
++
++
++.macro DISABLE_IBRS
++ ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_SPEC_CTRL
++ pushq %rax
++ pushq %rcx
++ pushq %rdx
++ movl $MSR_IA32_SPEC_CTRL, %ecx
++ xorl %edx, %edx
++ xorl %eax, %eax
++ wrmsr
++ popq %rdx
++ popq %rcx
++ popq %rax
++.Lend_\@:
++.endm
++
++#endif /* __ASSEMBLY__ */
++#endif /* _ASM_X86_SPEC_CTRL_H */
+
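At the instruction level the macros above follow the WRMSR convention: %ecx selects the MSR and %edx:%eax carries the 64-bit value, so writing FEATURE_ENABLE_IBRS only needs the low half. A rough C rendering of the two MSR writes (a sketch only; it omits the register save/restore and the X86_FEATURE_SPEC_CTRL alternative that jumps over the write, and the series adds its own C accessors in a later patch):

    #include <asm/msr.h>
    #include <asm/msr-index.h>

    static inline void ibrs_on(void)    /* roughly __ENABLE_IBRS_CLOBBER */
    {
            native_write_msr(MSR_IA32_SPEC_CTRL, FEATURE_ENABLE_IBRS, 0);
    }

    static inline void ibrs_off(void)   /* roughly the DISABLE_IBRS body */
    {
            native_write_msr(MSR_IA32_SPEC_CTRL, 0, 0);
    }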
diff --git a/patches.suse/03-x86-entry-use-ibrs-on-entry-to-kernel-space.patch b/patches.suse/03-x86-entry-use-ibrs-on-entry-to-kernel-space.patch
new file mode 100644
index 0000000000..bf019f25ee
--- /dev/null
+++ b/patches.suse/03-x86-entry-use-ibrs-on-entry-to-kernel-space.patch
@@ -0,0 +1,206 @@
+From: Borislav Petkov <bp@suse.de>
+Date: Fri, 15 Dec 2017 19:56:13 +0100
+Subject: x86/entry: Use IBRS on entry to kernel space
+Patch-mainline: submitted on 2018/1/9
+References: bsc#1068032
+
+Toggle IBRS on entry to kernel space: enable *after* CR3 write and
+disable *before* CR3 write.
+
+Originally-by: Tim Chen <tim.c.chen@linux.intel.com>
+Signed-off-by: Borislav Petkov <bp@suse.de>
+---
+ arch/x86/ia32/ia32entry.S | 17 ++++++++++++++++-
+ arch/x86/kernel/entry_64.S | 30 +++++++++++++++++++++++++++++-
+ 2 files changed, 45 insertions(+), 2 deletions(-)
+
+--- a/arch/x86/ia32/ia32entry.S
++++ b/arch/x86/ia32/ia32entry.S
+@@ -13,11 +13,11 @@
+ #include <asm/thread_info.h>
+ #include <asm/segment.h>
+ #include <asm/pgtable_types.h>
+-#include <asm/alternative-asm.h>
+ #include <asm/cpufeature.h>
+ #include <asm/kaiser.h>
+ #include <asm/irqflags.h>
+ #include <linux/linkage.h>
++#include <asm/spec_ctrl.h>
+
+ /* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
+ #include <linux/elf-em.h>
+@@ -148,6 +148,9 @@ ENTRY(ia32_sysenter_target)
+ CFI_REL_OFFSET rip,0
+ pushq_cfi %rax
+ cld
++
++ ENABLE_IBRS
++
+ SAVE_ARGS 0,0,1
+ /* no need to do an access_ok check here because rbp has been
+ 32bit zero extended */
+@@ -188,6 +191,9 @@ sysexit_from_sys_call:
+ popq_cfi %rcx /* User %esp */
+ CFI_REGISTER rsp,rcx
+ TRACE_IRQS_ON
++
++ DISABLE_IBRS
++
+ SWITCH_USER_CR3
+ ENABLE_INTERRUPTS_SYSEXIT32
+
+@@ -309,6 +315,9 @@ ENTRY(ia32_cstar_target)
+ /*CFI_REL_OFFSET rflags,EFLAGS-ARGOFFSET*/
+ movq %r8,RSP-ARGOFFSET(%rsp)
+ CFI_REL_OFFSET rsp,RSP-ARGOFFSET
++
++ ENABLE_IBRS
++
+ /* no need to do an access_ok check here because r8 has been
+ 32bit zero extended */
+ /* hardware stack frame is complete now */
+@@ -344,6 +353,9 @@ sysretl_from_sys_call:
+ xorq %r9,%r9
+ xorq %r8,%r8
+ TRACE_IRQS_ON
++
++ DISABLE_IBRS
++
+ SWITCH_USER_CR3
+ movl RSP-ARGOFFSET(%rsp),%esp
+ CFI_RESTORE rsp
+@@ -432,6 +444,9 @@ ENTRY(ia32_syscall)
+ GET_THREAD_INFO(%r10)
+ orl $TS_COMPAT,TI_status(%r10)
+ testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10)
++
++ ENABLE_IBRS
++
+ jnz ia32_tracesys
+ cmpq $(IA32_NR_syscalls-1),%rax
+ ja ia32_badsys
+--- a/arch/x86/kernel/entry_64.S
++++ b/arch/x86/kernel/entry_64.S
+@@ -55,9 +55,9 @@
+ #include <asm/percpu.h>
+ #include <asm/asm.h>
+ #include <asm/pgtable_types.h>
+-#include <asm/alternative-asm.h>
+ #include <asm/cpufeature.h>
+ #include <asm/kaiser.h>
++#include <asm/spec_ctrl.h>
+
+ /* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
+ #include <linux/elf-em.h>
+@@ -332,6 +332,7 @@ ENTRY(save_args)
+ je 1f
+ SWAPGS
+ SWITCH_KERNEL_CR3
++ ENABLE_IBRS
+ /*
+ * irq_count is used to check if a CPU is already on an interrupt stack
+ * or not. While this is essentially redundant with preempt_count it is
+@@ -421,6 +422,8 @@ ENTRY(save_paranoid)
+ movq %rax, %cr3
+ 2:
+ #endif
++
++ ENABLE_IBRS
+ ret
+ CFI_ENDPROC
+ END(save_paranoid)
+@@ -518,6 +521,9 @@ ENTRY(system_call_after_swapgs)
+ movq %rcx,RIP-ARGOFFSET(%rsp)
+ CFI_REL_OFFSET rip,RIP-ARGOFFSET
+ GET_THREAD_INFO(%rcx)
++
++ ENABLE_IBRS
++
+ testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%rcx)
+ jnz tracesys
+ system_call_fastpath:
+@@ -550,6 +556,8 @@ sysret_check:
+ CFI_REGISTER rip,rcx
+ RESTORE_ARGS 1,-ARG_SKIP,0
+ /*CFI_REGISTER rflags,r11*/
++
++ DISABLE_IBRS
+ /*
+ * This opens a window where we have a user CR3, but are
+ * running in the kernel. This makes using the CS
+@@ -904,6 +912,9 @@ retint_swapgs: /* return to user-space
+ * switch CR3 in NMIs. Normal interrupts are OK because
+ * they are off here.
+ */
++
++ DISABLE_IBRS
++
+ SWITCH_USER_CR3
+ SWAPGS
+ jmp restore_args
+@@ -946,6 +957,9 @@ native_irq_return_ldt:
+ pushq_cfi %rdi
+ SWAPGS
+ SWITCH_KERNEL_CR3
++
++ ENABLE_IBRS
++
+ movq PER_CPU_VAR(espfix_waddr),%rdi
+ movq %rax,(0*8)(%rdi) /* RAX */
+ movq (2*8)(%rsp),%rax /* RIP */
+@@ -961,6 +975,9 @@ native_irq_return_ldt:
+ andl $0xffff0000,%eax
+ popq_cfi %rdi
+ orq PER_CPU_VAR(espfix_stack),%rax
++
++ DISABLE_IBRS
++
+ SWITCH_USER_CR3
+ SWAPGS
+ movq %rax,%rsp
+@@ -1512,6 +1529,9 @@ ENTRY(paranoid_exit)
+ paranoid_kernel:
+ movq %r12, %rbx /* restore after paranoid_userspace */
+ TRACE_IRQS_IRETQ 0
++
++ DISABLE_IBRS
++
+ #ifdef CONFIG_KAISER
+ /* No ALTERNATIVE for X86_FEATURE_KAISER: save_paranoid sets %ebx */
+ testl $2, %ebx /* SWITCH_USER_CR3 needed? */
+@@ -1585,6 +1605,9 @@ ENTRY(error_entry)
+ * the kernel CR3 here.
+ */
+ SWITCH_KERNEL_CR3
++
++ ENABLE_IBRS
++
+ xorl %ebx,%ebx
+ testl $3,CS+8(%rsp)
+ je error_kernelspace
+@@ -1732,6 +1755,7 @@ ENTRY(nmi)
+
+ SWAPGS_UNSAFE_STACK
+ SWITCH_KERNEL_CR3_NO_STACK
++
+ cld
+ movq %rsp, %rdx
+ movq PER_CPU_VAR(kernel_stack), %rsp
+@@ -1758,6 +1782,8 @@ ENTRY(nmi)
+ pushq %r14 /* pt_regs->r14 */
+ pushq %r15 /* pt_regs->r15 */
+
++ ENABLE_IBRS
++
+ /*
+ * At this point we no longer need to worry about stack damage
+ * due to nesting -- we're on the normal thread stack and we're
+@@ -1768,6 +1794,8 @@ ENTRY(nmi)
+ movq $-1, %rsi
+ call do_nmi
+
++ DISABLE_IBRS
++
+ /*
+ * Return back to user mode. We must *not* do the normal exit
+ * work, because we don't want to enable interrupts. Fortunately,
diff --git a/patches.suse/04-x86-msr-move-native_-msr-u64-to-msr-h.patch b/patches.suse/04-x86-msr-move-native_-msr-u64-to-msr-h.patch
new file mode 100644
index 0000000000..5679ef1ae8
--- /dev/null
+++ b/patches.suse/04-x86-msr-move-native_-msr-u64-to-msr-h.patch
@@ -0,0 +1,41 @@
+From: Borislav Petkov <bp@suse.de>
+Date: Sat, 16 Dec 2017 12:57:44 +0100
+Subject: x86/MSR: Move native_*msr(.. u64) to msr.h
+Patch-mainline: submitted on 2018/1/9
+References: bsc#1068032
+
+Move them to the msr.h header for a wider use.
+
+No functionality change.
+
+Well, we don't have them in 3.x so add them.
+
+Signed-off-by: Borislav Petkov <bp@suse.de>
+---
+ arch/x86/include/asm/msr.h | 15 +++++++++++++++
+ 1 file changed, 15 insertions(+)
+
+--- a/arch/x86/include/asm/msr.h
++++ b/arch/x86/include/asm/msr.h
+@@ -135,6 +135,21 @@ static inline unsigned long long native_
+ return EAX_EDX_VAL(val, low, high);
+ }
+
++#define native_rdmsr(msr, val1, val2) \
++do { \
++ u64 __val = native_read_msr((msr)); \
++ (void)((val1) = (u32)__val); \
++ (void)((val2) = (u32)(__val >> 32)); \
++} while (0)
++
++#define native_wrmsr(msr, low, high) \
++ native_write_msr(msr, low, high)
++
++#define native_wrmsrl(msr, val) \
++ native_write_msr((msr), \
++ (u32)((u64)(val)), \
++ (u32)((u64)(val) >> 32))
++
+ #ifdef CONFIG_PARAVIRT
+ #include <asm/paravirt.h>
+ #else
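A hedged usage sketch for the accessors added above (the call site is illustrative; the spec_ctrl code introduced later in this series is the real consumer of native_wrmsrl):

    #include <linux/types.h>
    #include <asm/msr.h>
    #include <asm/msr-index.h>

    static void example_msr_roundtrip(void)
    {
            u32 lo, hi;

            /* native_rdmsr() splits the 64-bit result into two 32-bit halves. */
            native_rdmsr(MSR_EFER, lo, hi);

            /* native_wrmsrl() takes a 64-bit value and splits it for WRMSR;
             * writing back the value just read makes this a no-op round trip. */
            native_wrmsrl(MSR_EFER, ((u64)hi << 32) | lo);
    }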
diff --git a/patches.suse/05-x86-spec-add-ibrs-control-functions.patch b/patches.suse/05-x86-spec-add-ibrs-control-functions.patch
new file mode 100644
index 0000000000..babac48ec8
--- /dev/null
+++ b/patches.suse/05-x86-spec-add-ibrs-control-functions.patch
@@ -0,0 +1,64 @@
+From: Borislav Petkov <bp@suse.de>
+Date: Sat, 16 Dec 2017 17:50:52 +0100
+Subject: x86/spec: Add IBRS control functions
+Patch-mainline: submitted on 2018/1/9
+References: bsc#1068032
+
+... into a separate compilation unit.
+
+Carved out from a patch by Tim Chen <tim.c.chen@linux.intel.com>.
+
+Signed-off-by: Borislav Petkov <bp@suse.de>
+---
+ arch/x86/include/asm/spec_ctrl.h | 3 +++
+ arch/x86/kernel/cpu/Makefile | 1 +
+ arch/x86/kernel/cpu/spec_ctrl.c | 23 +++++++++++++++++++++++
+ 3 files changed, 27 insertions(+)
+
+--- a/arch/x86/include/asm/spec_ctrl.h
++++ b/arch/x86/include/asm/spec_ctrl.h
+@@ -50,5 +50,8 @@
+ .Lend_\@:
+ .endm
+
++#else /* __ASSEMBLY__ */
++void x86_enable_ibrs(void);
++void x86_disable_ibrs(void);
+ #endif /* __ASSEMBLY__ */
+ #endif /* _ASM_X86_SPEC_CTRL_H */
+--- a/arch/x86/kernel/cpu/Makefile
++++ b/arch/x86/kernel/cpu/Makefile
+@@ -16,6 +16,7 @@ obj-y := intel_cacheinfo.o scattered.o
+ obj-y += proc.o capflags.o powerflags.o common.o
+ obj-y += vmware.o hypervisor.o sched.o mshyperv.o
+ obj-y += rdrand.o
++obj-y += spec_ctrl.o
+
+ obj-$(CONFIG_X86_32) += bugs.o
+ obj-$(CONFIG_X86_64) += bugs_64.o
+--- /dev/null
++++ b/arch/x86/kernel/cpu/spec_ctrl.c
+@@ -0,0 +1,23 @@
++/*
++ * Speculation control stuff
++ *
++ */
++#include <linux/module.h>
++
++#include <asm/msr.h>
++#include <asm/processor.h>
++#include <asm/spec_ctrl.h>
++
++void x86_disable_ibrs(void)
++{
++ if (boot_cpu_has(X86_FEATURE_SPEC_CTRL))
++ native_wrmsrl(MSR_IA32_SPEC_CTRL, 0);
++}
++EXPORT_SYMBOL_GPL(x86_disable_ibrs);
++
++void x86_enable_ibrs(void)
++{
++ if (boot_cpu_has(X86_FEATURE_SPEC_CTRL))
++ native_wrmsrl(MSR_IA32_SPEC_CTRL, FEATURE_ENABLE_IBRS);
++}
++EXPORT_SYMBOL_GPL(x86_enable_ibrs);
diff --git a/patches.suse/06-x86-idle-toggle-ibrs-when-going-idle.patch b/patches.suse/06-x86-idle-toggle-ibrs-when-going-idle.patch
new file mode 100644
index 0000000000..47736b3257
--- /dev/null
+++ b/patches.suse/06-x86-idle-toggle-ibrs-when-going-idle.patch
@@ -0,0 +1,57 @@
+From: Borislav Petkov <bp@suse.de>
+Date: Sat, 16 Dec 2017 17:59:42 +0100
+Subject: x86/idle: Toggle IBRS when going idle
+Patch-mainline: submitted on 2018/1/9
+References: bsc#1068032
+
+Disable IBRS when entering idle and re-enable it on exit.
+
+Originally-by: Tim Chen <tim.c.chen@linux.intel.com>
+Signed-off-by: Borislav Petkov <bp@suse.de>
+---
+ arch/x86/kernel/process.c | 10 ++++++++++
+ 1 file changed, 10 insertions(+)
+
+--- a/arch/x86/kernel/process.c
++++ b/arch/x86/kernel/process.c
+@@ -22,6 +22,7 @@
+ #include <asm/uaccess.h>
+ #include <asm/i387.h>
+ #include <asm/debugreg.h>
++#include <asm/spec_ctrl.h>
+
+ struct kmem_cache *task_xstate_cachep;
+ EXPORT_SYMBOL_GPL(task_xstate_cachep);
+@@ -432,10 +433,14 @@ void mwait_idle_with_hints(unsigned long
+ if (this_cpu_has(X86_FEATURE_CLFLUSH_MONITOR))
+ clflush((void *)&current_thread_info()->flags);
+
++ x86_disable_ibrs();
++
+ __monitor((void *)&current_thread_info()->flags, 0, 0);
+ smp_mb();
+ if (!need_resched())
+ __mwait(ax, cx);
++
++ x86_enable_ibrs();
+ }
+ }
+
+@@ -448,12 +453,17 @@ static void mwait_idle(void)
+ if (this_cpu_has(X86_FEATURE_CLFLUSH_MONITOR))
+ clflush((void *)&current_thread_info()->flags);
+
++ x86_disable_ibrs();
++
+ __monitor((void *)&current_thread_info()->flags, 0, 0);
+ smp_mb();
+ if (!need_resched())
+ __sti_mwait(0, 0);
+ else
+ local_irq_enable();
++
++ x86_enable_ibrs();
++
+ trace_power_end(smp_processor_id());
+ trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id());
+ } else
diff --git a/patches.suse/07-x86-idle-disable-ibrs-when-offlining-a-cpu-and-re-enable-on-wakeup.patch b/patches.suse/07-x86-idle-disable-ibrs-when-offlining-a-cpu-and-re-enable-on-wakeup.patch
new file mode 100644
index 0000000000..d489799447
--- /dev/null
+++ b/patches.suse/07-x86-idle-disable-ibrs-when-offlining-a-cpu-and-re-enable-on-wakeup.patch
@@ -0,0 +1,38 @@
+From: Tim Chen <tim.c.chen@linux.intel.com>
+Date: Wed, 15 Nov 2017 12:24:19 -0800
+Subject: x86/idle: Disable IBRS when offlining a CPU and re-enable on wakeup
+Patch-mainline: submitted on 2018/1/9
+References: bsc#1068032
+
+Clear IBRS when a CPU is offlined and set it again when bringing it back online.
+
+Signed-off-by: Tim Chen <tim.c.chen@linux.intel.com>
+[ Switch to accessors. ]
+Signed-off-by: Borislav Petkov <bp@suse.de>
+---
+ arch/x86/kernel/smpboot.c | 5 +++++
+ 1 file changed, 5 insertions(+)
+
+--- a/arch/x86/kernel/smpboot.c
++++ b/arch/x86/kernel/smpboot.c
+@@ -71,6 +71,7 @@
+
+ #include <asm/smpboot_hooks.h>
+ #include <asm/i8259.h>
++#include <asm/spec_ctrl.h>
+
+ /* State of each CPU */
+ DEFINE_PER_CPU(int, cpu_state) = { 0 };
+@@ -1432,8 +1433,12 @@ void native_play_dead(void)
+ play_dead_common();
+ tboot_shutdown(TB_SHUTDOWN_WFS);
+
++ x86_disable_ibrs();
++
+ mwait_play_dead(); /* Only returns on failure */
+ hlt_play_dead();
++
++ x86_enable_ibrs();
+ }
+
+ #else /* ... !CONFIG_HOTPLUG_CPU */
diff --git a/patches.suse/08-x86-spec_ctrl-add-an-indirect-branch-predictor-barrier.patch b/patches.suse/08-x86-spec_ctrl-add-an-indirect-branch-predictor-barrier.patch
new file mode 100644
index 0000000000..6d0c991157
--- /dev/null
+++ b/patches.suse/08-x86-spec_ctrl-add-an-indirect-branch-predictor-barrier.patch
@@ -0,0 +1,33 @@
+From: Borislav Petkov <bp@suse.de>
+Date: Sat, 16 Dec 2017 18:18:34 +0100
+Subject: x86/spec_ctrl: Add an Indirect Branch Predictor barrier
+Patch-mainline: submitted on 2018/1/9
+References: bsc#1068032
+
+... to call when code is context-switching to a separate address space
+and needs to prevent earlier code from having influence on later branch
+prediction.
+
+Signed-off-by: Borislav Petkov <bp@suse.de>
+---
+ arch/x86/include/asm/spec_ctrl.h | 7 +++++++
+ 1 file changed, 7 insertions(+)
+
+diff --git a/arch/x86/include/asm/spec_ctrl.h b/arch/x86/include/asm/spec_ctrl.h
+index cae607a2fb6c..a7355c87d34b 100644
+--- a/arch/x86/include/asm/spec_ctrl.h
++++ b/arch/x86/include/asm/spec_ctrl.h
+@@ -53,5 +53,12 @@
+ #else /* __ASSEMBLY__ */
+ void x86_enable_ibrs(void);
+ void x86_disable_ibrs(void);
++
++static inline void x86_ibp_barrier(void)
++{
++ if (static_cpu_has(X86_FEATURE_SPEC_CTRL))
++ native_wrmsrl(MSR_IA32_PRED_CMD, FEATURE_SET_IBPB);
++}
++
+ #endif /* __ASSEMBLY__ */
+ #endif /* _ASM_X86_SPEC_CTRL_H */
+
diff --git a/patches.suse/09-x86-mm-set-ibpb-upon-context-switch.patch b/patches.suse/09-x86-mm-set-ibpb-upon-context-switch.patch
new file mode 100644
index 0000000000..6272880659
--- /dev/null
+++ b/patches.suse/09-x86-mm-set-ibpb-upon-context-switch.patch
@@ -0,0 +1,34 @@
+From: Tim Chen <tim.c.chen@linux.intel.com>
+Date: Sat, 16 Dec 2017 18:25:12 +0100
+Subject: x86/mm: Set IBPB upon context switch
+Patch-mainline: submitted on 2018/1/9
+References: bsc#1068032
+
+Set IBPB on context switch when writing CR3.
+
+Signed-off-by: Tim Chen <tim.c.chen@linux.intel.com>
+[ Convert to do x86_ibp_barrier(). ]
+Signed-off-by: Borislav Petkov <bp@suse.de>
+---
+ arch/x86/mm/tlb.c | 3 +++
+ 1 file changed, 3 insertions(+)
+
+--- a/arch/x86/mm/tlb.c
++++ b/arch/x86/mm/tlb.c
+@@ -13,6 +13,7 @@
+ #include <asm/apic.h>
+ #include <asm/uv/uv.h>
+ #include <asm/kaiser.h>
++#include <asm/spec_ctrl.h>
+
+ DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate)
+ = { &init_mm, 0, };
+@@ -116,6 +117,8 @@ void switch_mm_irqs_off(struct mm_struct
+ unsigned cpu = smp_processor_id();
+
+ if (likely(prev != next)) {
++ x86_ibp_barrier();
++
+ percpu_write(cpu_tlbstate.state, TLBSTATE_OK);
+ percpu_write(cpu_tlbstate.active_mm, next);
+ cpumask_set_cpu(cpu, mm_cpumask(next));
diff --git a/patches.suse/10-ptrace-add-a-new-thread-access-check.patch b/patches.suse/10-ptrace-add-a-new-thread-access-check.patch
new file mode 100644
index 0000000000..4b973bb9d7
--- /dev/null
+++ b/patches.suse/10-ptrace-add-a-new-thread-access-check.patch
@@ -0,0 +1,83 @@
+From: Borislav Petkov <bp@suse.de>
+Date: Sat, 16 Dec 2017 18:32:52 +0100
+Subject: ptrace: Add a new thread access check
+Patch-mainline: submitted on 2018/1/9
+References: bsc#1068032
+
+... which checks current and a target task. Add IBPB ptrace mode too.
+
+Carved out from a patch by Tim Chen <tim.c.chen@linux.intel.com>
+
+Boris:
+
+ - shorten PTRACE_MODE_IBPB as the other defines are not here yet.
+
+Signed-off-by: Borislav Petkov <bp@suse.de>
+---
+ include/linux/ptrace.h | 9 +++++++--
+ kernel/ptrace.c | 18 ++++++++++++++----
+ 2 files changed, 21 insertions(+), 6 deletions(-)
+
+--- a/include/linux/ptrace.h
++++ b/include/linux/ptrace.h
+@@ -118,11 +118,16 @@ extern void __ptrace_unlink(struct task_
+ extern void exit_ptrace(struct task_struct *tracer);
+ #define PTRACE_MODE_READ 1
+ #define PTRACE_MODE_ATTACH 2
+-/* Returns 0 on success, -errno on denial. */
+-extern int __ptrace_may_access(struct task_struct *task, unsigned int mode);
++#define PTRACE_MODE_NOACCESS_CHK 0x20
++
++#define PTRACE_MODE_IBPB (PTRACE_MODE_ATTACH | PTRACE_MODE_NOACCESS_CHK )
++
+ /* Returns true on success, false on denial. */
+ extern bool ptrace_may_access(struct task_struct *task, unsigned int mode);
+
++extern int ___ptrace_may_access(struct task_struct *cur, struct task_struct *task,
++ unsigned int mode);
++
+ static inline int ptrace_reparented(struct task_struct *child)
+ {
+ return child->real_parent != child->parent;
+--- a/kernel/ptrace.c
++++ b/kernel/ptrace.c
+@@ -169,9 +169,10 @@ int ptrace_check_attach(struct task_stru
+ return ret;
+ }
+
+-int __ptrace_may_access(struct task_struct *task, unsigned int mode)
++int ___ptrace_may_access(struct task_struct *cur, struct task_struct *task,
++ unsigned int mode)
+ {
+- const struct cred *cred = current_cred(), *tcred;
++ const struct cred *cred = __task_cred(cur), *tcred;
+
+ /* May we inspect the given task?
+ * This check is used both for attaching with ptrace
+@@ -183,7 +184,7 @@ int __ptrace_may_access(struct task_stru
+ */
+ int dumpable = 0;
+ /* Don't let security modules deny introspection */
+- if (same_thread_group(task, current))
++ if (same_thread_group(task, cur))
+ return 0;
+ rcu_read_lock();
+ tcred = __task_cred(task);
+@@ -207,7 +208,16 @@ ok:
+ if (dumpable != SUID_DUMP_USER && !task_ns_capable(task, CAP_SYS_PTRACE))
+ return -EPERM;
+
+- return security_ptrace_access_check(task, mode);
++ if (!(mode & PTRACE_MODE_NOACCESS_CHK))
++ return security_ptrace_access_check(task, mode);
++
++ return 0;
++}
++EXPORT_SYMBOL_GPL(___ptrace_may_access);
++
++static int __ptrace_may_access(struct task_struct *task, unsigned int mode)
++{
++ return ___ptrace_may_access(current, task, mode);
+ }
+
+ bool ptrace_may_access(struct task_struct *task, unsigned int mode)
diff --git a/patches.suse/11-x86-mm-only-set-ibpb-when-the-new-thread-cannot-ptrace-current-thread.patch b/patches.suse/11-x86-mm-only-set-ibpb-when-the-new-thread-cannot-ptrace-current-thread.patch
new file mode 100644
index 0000000000..4dfa0f178c
--- /dev/null
+++ b/patches.suse/11-x86-mm-only-set-ibpb-when-the-new-thread-cannot-ptrace-current-thread.patch
@@ -0,0 +1,38 @@
+From: Tim Chen <tim.c.chen@linux.intel.com>
+Date: Sat, 16 Dec 2017 18:37:19 +0100
+Subject: x86/mm: Only set IBPB when the new thread cannot ptrace current thread
+Patch-mainline: submitted on 2018/1/9
+References: bsc#1068032
+
+To reduce overhead of setting IBPB, we only do that when the new thread
+cannot ptrace the current one. If the new thread has ptrace capability
+on the current thread, it is safe.
+
+Signed-off-by: Tim Chen <tim.c.chen@linux.intel.com>
+Signed-off-by: Borislav Petkov <bp@suse.de>
+---
+ arch/x86/mm/tlb.c | 6 +++++-
+ 1 file changed, 5 insertions(+), 1 deletion(-)
+
+--- a/arch/x86/mm/tlb.c
++++ b/arch/x86/mm/tlb.c
+@@ -6,6 +6,7 @@
+ #include <linux/interrupt.h>
+ #include <linux/module.h>
+ #include <linux/cpu.h>
++#include <linux/ptrace.h>
+
+ #include <asm/tlbflush.h>
+ #include <asm/mmu_context.h>
+@@ -117,7 +118,10 @@ void switch_mm_irqs_off(struct mm_struct
+ unsigned cpu = smp_processor_id();
+
+ if (likely(prev != next)) {
+- x86_ibp_barrier();
++
++ /* Null tsk means switching to kernel, so that's safe */
++ if (tsk && ___ptrace_may_access(tsk, current, PTRACE_MODE_IBPB))
++ x86_ibp_barrier();
+
+ percpu_write(cpu_tlbstate.state, TLBSTATE_OK);
+ percpu_write(cpu_tlbstate.active_mm, next);
diff --git a/patches.suse/12-x86-entry-add-a-function-to-overwrite-the-rsb.patch b/patches.suse/12-x86-entry-add-a-function-to-overwrite-the-rsb.patch
new file mode 100644
index 0000000000..d6fdb6dc89
--- /dev/null
+++ b/patches.suse/12-x86-entry-add-a-function-to-overwrite-the-rsb.patch
@@ -0,0 +1,108 @@
+From: Borislav Petkov <bp@suse.de>
+Date: Sat, 16 Dec 2017 18:45:35 +0100
+Subject: x86/entry: Add a function to overwrite the RSB
+Patch-mainline: submitted on 2018/1/9
+References: bsc#1068032
+
+Fill up the 32-entry Return Stack Buffer.
+
+Carved out from a patch by Tim Chen <tim.c.chen@linux.intel.com>
+
+Signed-off-by: Borislav Petkov <bp@suse.de>
+---
+ arch/x86/include/asm/spec_ctrl.h | 4 ++
+ arch/x86/kernel/entry_64.S | 70 +++++++++++++++++++++++++++++++++++++++
+ 2 files changed, 74 insertions(+)
+
+--- a/arch/x86/include/asm/spec_ctrl.h
++++ b/arch/x86/include/asm/spec_ctrl.h
+@@ -50,6 +50,10 @@
+ .Lend_\@:
+ .endm
+
++.macro STUFF_RSB
++ ALTERNATIVE "call stuff_rsb", "", X86_FEATURE_SMEP
++.endm
++
+ #else /* __ASSEMBLY__ */
+ void x86_enable_ibrs(void);
+ void x86_disable_ibrs(void);
+--- a/arch/x86/kernel/entry_64.S
++++ b/arch/x86/kernel/entry_64.S
+@@ -2006,6 +2006,76 @@ ENTRY(ignore_sysret)
+ CFI_ENDPROC
+ END(ignore_sysret)
+
++ENTRY(stuff_rsb)
++ call 1f
++ pause
++1: call 2f
++ pause
++2: call 3f;
++ pause
++3: call 4f
++ pause
++4: call 5f
++ pause
++5: call 6f
++ pause
++6: call 7f
++ pause
++7: call 8f
++ pause
++8: call 9f
++ pause
++9: call 10f
++ pause
++10: call 11f
++ pause
++11: call 12f
++ pause
++12: call 13f
++ pause
++13: call 14f
++ pause
++14: call 15f
++ pause
++15: call 16f
++ pause
++16: call 17f
++ pause
++17: call 18f
++ pause
++18: call 19f
++ pause
++19: call 20f
++ pause
++20: call 21f
++ pause
++21: call 22f
++ pause
++22: call 23f
++ pause
++23: call 24f
++ pause
++24: call 25f
++ pause
++25: call 26f
++ pause
++26: call 27f
++ pause
++27: call 28f
++ pause
++28: call 29f
++ pause
++29: call 30f
++ pause
++30: call 31f
++ pause
++31: call 32f
++ pause
++32:
++ add $(32*8), %rsp
++ ret
++END(stuff_rsb)
++
+ /*
+ * End of kprobes section
+ */
diff --git a/patches.suse/13-x86-entry-stuff-rsb-for-entry-to-kernel-for-non-smep-platform.patch b/patches.suse/13-x86-entry-stuff-rsb-for-entry-to-kernel-for-non-smep-platform.patch
new file mode 100644
index 0000000000..01cd6f0e7c
--- /dev/null
+++ b/patches.suse/13-x86-entry-stuff-rsb-for-entry-to-kernel-for-non-smep-platform.patch
@@ -0,0 +1,88 @@
+From: Tim Chen <tim.c.chen@linux.intel.com>
+Date: Sat, 16 Dec 2017 19:01:26 +0100
+Subject: x86/entry: Stuff RSB for entry to kernel for non-SMEP platform
+Patch-mainline: submitted on 2018/1/9
+References: bsc#1068032
+
+Stuff RSB to prevent RSB underflow on non-SMEP platforms.
+
+Signed-off-by: Tim Chen <tim.c.chen@linux.intel.com>
+Signed-off-by: Borislav Petkov <bp@suse.de>
+---
+ arch/x86/ia32/ia32entry.S | 3 +++
+ arch/x86/kernel/entry_64.S | 10 ++++++++++
+ 2 files changed, 13 insertions(+)
+
+--- a/arch/x86/ia32/ia32entry.S
++++ b/arch/x86/ia32/ia32entry.S
+@@ -150,6 +150,7 @@ ENTRY(ia32_sysenter_target)
+ cld
+
+ ENABLE_IBRS
++ STUFF_RSB
+
+ SAVE_ARGS 0,0,1
+ /* no need to do an access_ok check here because rbp has been
+@@ -317,6 +318,7 @@ ENTRY(ia32_cstar_target)
+ CFI_REL_OFFSET rsp,RSP-ARGOFFSET
+
+ ENABLE_IBRS
++ STUFF_RSB
+
+ /* no need to do an access_ok check here because r8 has been
+ 32bit zero extended */
+@@ -446,6 +448,7 @@ ENTRY(ia32_syscall)
+ testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10)
+
+ ENABLE_IBRS
++ STUFF_RSB
+
+ jnz ia32_tracesys
+ cmpq $(IA32_NR_syscalls-1),%rax
+--- a/arch/x86/kernel/entry_64.S
++++ b/arch/x86/kernel/entry_64.S
+@@ -333,6 +333,7 @@ ENTRY(save_args)
+ SWAPGS
+ SWITCH_KERNEL_CR3
+ ENABLE_IBRS
++ STUFF_RSB
+ /*
+ * irq_count is used to check if a CPU is already on an interrupt stack
+ * or not. While this is essentially redundant with preempt_count it is
+@@ -395,6 +396,10 @@ ENTRY(save_paranoid)
+ movq %r13, R13+8(%rsp)
+ movq %r14, R14+8(%rsp)
+ movq %r15, R15+8(%rsp)
++
++ /* Do the stuffing unconditionally from user/kernel to be safe */
++ STUFF_RSB
++
+ movl $1,%ebx
+ movl $MSR_GS_BASE,%ecx
+ rdmsr
+@@ -523,6 +528,7 @@ ENTRY(system_call_after_swapgs)
+ GET_THREAD_INFO(%rcx)
+
+ ENABLE_IBRS
++ STUFF_RSB
+
+ testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%rcx)
+ jnz tracesys
+@@ -959,6 +965,7 @@ native_irq_return_ldt:
+ SWITCH_KERNEL_CR3
+
+ ENABLE_IBRS
++ STUFF_RSB
+
+ movq PER_CPU_VAR(espfix_waddr),%rdi
+ movq %rax,(0*8)(%rdi) /* RAX */
+@@ -1598,6 +1605,9 @@ ENTRY(error_entry)
+ movq %r13, R13+8(%rsp)
+ movq %r14, R14+8(%rsp)
+ movq %r15, R15+8(%rsp)
++
++ STUFF_RSB
++
+ /*
+ * error_entry() always returns with a kernel gsbase and
+ * CR3. We must also have a kernel CR3/gsbase before
diff --git a/patches.suse/14-x86-kvm-add-msr_ia32_spec_ctrl-and-msr_ia32_pred_cmd-to-kvm.patch b/patches.suse/14-x86-kvm-add-msr_ia32_spec_ctrl-and-msr_ia32_pred_cmd-to-kvm.patch
new file mode 100644
index 0000000000..3f77a1f297
--- /dev/null
+++ b/patches.suse/14-x86-kvm-add-msr_ia32_spec_ctrl-and-msr_ia32_pred_cmd-to-kvm.patch
@@ -0,0 +1,118 @@
+From: Wei Wang <wei.w.wang@intel.com>
+Date: Sat, 16 Dec 2017 19:18:48 +0100
+Subject: x86/kvm: Add MSR_IA32_SPEC_CTRL and MSR_IA32_PRED_CMD to kvm
+Patch-mainline: submitted on 2018/1/9
+References: bsc#1068032
+
+Add a field to access guest MSR_IA32_SPEC_CTRL and MSR_IA32_PRED_CMD state.
+
+Signed-off-by: Wei Wang <wei.w.wang@intel.com>
+Signed-off-by: Tim Chen <tim.c.chen@linux.intel.com>
+[ Move MSR u64 to struct vcpu_vmx so as not to break kABI. ]
+Signed-off-by: Borislav Petkov <bp@suse.de>
+---
+ arch/x86/kvm/vmx.c | 25 +++++++++++++++++++------
+ arch/x86/kvm/x86.c | 3 ++-
+ 2 files changed, 21 insertions(+), 7 deletions(-)
+
+--- a/arch/x86/kvm/vmx.c
++++ b/arch/x86/kvm/vmx.c
+@@ -39,6 +39,8 @@
+ #include <asm/mce.h>
+ #include <asm/i387.h>
+ #include <asm/xcr.h>
++#include <asm/msr-index.h>
++#include <asm/spec_ctrl.h>
+
+ #include "trace.h"
+
+@@ -463,6 +465,8 @@ struct vcpu_vmx {
+
+ /* Support for a guest hypervisor (nested VMX) */
+ struct nested_vmx nested;
++
++ u64 spec_ctrl;
+ };
+
+ enum segment_cache_field {
+@@ -2266,6 +2270,7 @@ static int vmx_set_vmx_msr(struct kvm_vc
+ */
+ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
+ {
++ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ u64 data;
+ struct shared_msr_entry *msr;
+
+@@ -2283,8 +2288,8 @@ static int vmx_get_msr(struct kvm_vcpu *
+ data = vmcs_readl(GUEST_GS_BASE);
+ break;
+ case MSR_KERNEL_GS_BASE:
+- vmx_load_host_state(to_vmx(vcpu));
+- data = to_vmx(vcpu)->msr_guest_kernel_gs_base;
++ vmx_load_host_state(vmx);
++ data = vmx->msr_guest_kernel_gs_base;
+ break;
+ #endif
+ case MSR_EFER:
+@@ -2292,6 +2297,9 @@ static int vmx_get_msr(struct kvm_vcpu *
+ case MSR_IA32_TSC:
+ data = guest_read_tsc();
+ break;
++ case MSR_IA32_SPEC_CTRL:
++ data = vmx->spec_ctrl;
++ break;
+ case MSR_IA32_SYSENTER_CS:
+ data = vmcs_read32(GUEST_SYSENTER_CS);
+ break;
+@@ -2302,16 +2310,16 @@ static int vmx_get_msr(struct kvm_vcpu *
+ data = vmcs_readl(GUEST_SYSENTER_ESP);
+ break;
+ case MSR_TSC_AUX:
+- if (!to_vmx(vcpu)->rdtscp_enabled)
++ if (!vmx->rdtscp_enabled)
+ return 1;
+ /* Otherwise falls through */
+ default:
+- vmx_load_host_state(to_vmx(vcpu));
++ vmx_load_host_state(vmx);
+ if (vmx_get_vmx_msr(vcpu, msr_index, pdata))
+ return 0;
+- msr = find_msr_entry(to_vmx(vcpu), msr_index);
++ msr = find_msr_entry(vmx, msr_index);
+ if (msr) {
+- vmx_load_host_state(to_vmx(vcpu));
++ vmx_load_host_state(vmx);
+ data = msr->data;
+ break;
+ }
+@@ -2366,6 +2374,9 @@ static int vmx_set_msr(struct kvm_vcpu *
+ case MSR_IA32_TSC:
+ kvm_write_tsc(vcpu, msr_info);
+ break;
++ case MSR_IA32_SPEC_CTRL:
++ vmx->spec_ctrl = msr_info->data;
++ break;
+ case MSR_IA32_CR_PAT:
+ if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
+ if (!kvm_mtrr_valid(vcpu, MSR_IA32_CR_PAT, data))
+@@ -8082,6 +8093,8 @@ static int __init vmx_init(void)
+ vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_CS, false);
+ vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_ESP, false);
+ vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false);
++ vmx_disable_intercept_for_msr(MSR_IA32_SPEC_CTRL, false);
++ vmx_disable_intercept_for_msr(MSR_IA32_PRED_CMD, false);
+ memcpy(vmx_msr_bitmap_legacy_x2apic,
+ vmx_msr_bitmap_legacy, PAGE_SIZE);
+ memcpy(vmx_msr_bitmap_longmode_x2apic,
+--- a/arch/x86/kvm/x86.c
++++ b/arch/x86/kvm/x86.c
+@@ -904,7 +904,8 @@ static u32 msrs_to_save[] = {
+ #ifdef CONFIG_X86_64
+ MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
+ #endif
+- MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA
++ MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA,
++ MSR_IA32_SPEC_CTRL,
+ };
+
+ static unsigned num_msrs_to_save;
diff --git a/patches.suse/15-x86-kvm-flush-ibp-when-switching-vms.patch b/patches.suse/15-x86-kvm-flush-ibp-when-switching-vms.patch
new file mode 100644
index 0000000000..399f5efee3
--- /dev/null
+++ b/patches.suse/15-x86-kvm-flush-ibp-when-switching-vms.patch
@@ -0,0 +1,25 @@
+From: Tim Chen <tim.c.chen@linux.intel.com>
+Date: Fri, 13 Oct 2017 14:31:46 -0700
+Subject: x86/kvm: Flush IBP when switching VMs
+Patch-mainline: submitted on 2018/1/9
+References: bsc#1068032
+
+Set IBPB (Indirect branch prediction barrier) when switching VMs.
+
+Signed-off-by: Tim Chen <tim.c.chen@linux.intel.com>
+Signed-off-by: Borislav Petkov <bp@suse.de>
+---
+ arch/x86/kvm/vmx.c | 2 ++
+ 1 file changed, 2 insertions(+)
+
+--- a/arch/x86/kvm/vmx.c
++++ b/arch/x86/kvm/vmx.c
+@@ -1609,6 +1609,8 @@ static void vmx_vcpu_load(struct kvm_vcp
+ if (per_cpu(current_vmcs, cpu) != vmx->loaded_vmcs->vmcs) {
+ per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs;
+ vmcs_load(vmx->loaded_vmcs->vmcs);
++
++ x86_ibp_barrier();
+ }
+
+ if (vmx->loaded_vmcs->cpu != cpu) {
diff --git a/patches.suse/16-x86-kvm-toggle-ibrs-on-vm-entry-and-exit.patch b/patches.suse/16-x86-kvm-toggle-ibrs-on-vm-entry-and-exit.patch
new file mode 100644
index 0000000000..6fb1bc0f8d
--- /dev/null
+++ b/patches.suse/16-x86-kvm-toggle-ibrs-on-vm-entry-and-exit.patch
@@ -0,0 +1,28 @@
+From: Tim Chen <tim.c.chen@linux.intel.com>
+Date: Fri, 20 Oct 2017 17:04:35 -0700
+Subject: x86/kvm: Toggle IBRS on VM entry and exit
+Patch-mainline: submitted on 2018/1/9
+References: bsc#1068032
+
+Restore guest IBRS on VM entry and set it to 1 on VM exit
+back to the kernel.
+
+Signed-off-by: Tim Chen <tim.c.chen@linux.intel.com>
+Signed-off-by: Borislav Petkov <bp@suse.de>
+---
+ arch/x86/kvm/vmx.c | 4 ++++
+ 1 file changed, 4 insertions(+)
+
+--- a/arch/x86/kvm/vmx.c
++++ b/arch/x86/kvm/vmx.c
+@@ -6738,6 +6738,10 @@ static void __noclone vmx_vcpu_run(struc
+ if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
+ vmx_set_interrupt_shadow(vcpu, 0);
+
++ if (boot_cpu_has(X86_FEATURE_SPEC_CTRL))
++ add_atomic_switch_msr(vmx, MSR_IA32_SPEC_CTRL,
++ vmx->spec_ctrl, FEATURE_ENABLE_IBRS);
++
+ vmx->__launched = vmx->loaded_vmcs->launched;
+ asm(
+ /* Store host registers */
diff --git a/patches.suse/17-x86-kvm-pad-rsb-on-vm-transition.patch b/patches.suse/17-x86-kvm-pad-rsb-on-vm-transition.patch
new file mode 100644
index 0000000000..0329ad30ea
--- /dev/null
+++ b/patches.suse/17-x86-kvm-pad-rsb-on-vm-transition.patch
@@ -0,0 +1,102 @@
+From: Tim Chen <tim.c.chen@linux.intel.com>
+Date: Sat, 16 Dec 2017 19:35:49 +0100
+Subject: x86/kvm: Pad RSB on VM transition
+Patch-mainline: submitted on 2018/1/9
+References: bsc#1068032
+
+Add code to pad the local CPU's RSB entries to protect
+from the previous, less privileged mode.
+
+Boris:
+
+ - Use asm function instead of duplicating a C function.
+ - Add indirection to stuff_rsb() so that EXPORT_SYMBOL_GPL works.
+ Otherwise we'd need to backport the asm versions of those from 4.9.
+
+ - Also, that stuff_rsb() dummy for 32-bit should probably be present
+ there too, as we want to do that on 32-bit too but we'll address that
+ properly once the pile goes upstream.
+
+Signed-off-by: Tim Chen <tim.c.chen@linux.intel.com>
+Signed-off-by: Borislav Petkov <bp@suse.de>
+---
+ arch/x86/include/asm/processor.h | 1 +
+ arch/x86/include/asm/proto.h | 1 +
+ arch/x86/include/asm/spec_ctrl.h | 1 +
+ arch/x86/kernel/cpu/spec_ctrl.c | 11 +++++++++++
+ arch/x86/kvm/vmx.c | 3 +++
+ 5 files changed, 17 insertions(+)
+
+--- a/arch/x86/include/asm/processor.h
++++ b/arch/x86/include/asm/processor.h
+@@ -418,6 +418,7 @@ struct stack_canary {
+ };
+ DECLARE_PER_CPU_ALIGNED(struct stack_canary, stack_canary);
+ #endif
++static inline void stuff_rsb(void) {}
+ #endif /* X86_64 */
+
+ extern unsigned int xstate_size;
+--- a/arch/x86/include/asm/proto.h
++++ b/arch/x86/include/asm/proto.h
+@@ -18,6 +18,7 @@ void syscall32_cpu_init(void);
+
+ void x86_configure_nx(void);
+ void x86_report_nx(void);
++void stuff_rsb(void);
+
+ extern int reboot_force;
+
+--- a/arch/x86/include/asm/spec_ctrl.h
++++ b/arch/x86/include/asm/spec_ctrl.h
+@@ -57,6 +57,7 @@
+ #else /* __ASSEMBLY__ */
+ void x86_enable_ibrs(void);
+ void x86_disable_ibrs(void);
++void stuff_RSB(void);
+
+ static inline void x86_ibp_barrier(void)
+ {
+--- a/arch/x86/kernel/cpu/spec_ctrl.c
++++ b/arch/x86/kernel/cpu/spec_ctrl.c
+@@ -5,6 +5,7 @@
+ #include <linux/module.h>
+
+ #include <asm/msr.h>
++#include <asm/proto.h>
+ #include <asm/processor.h>
+ #include <asm/spec_ctrl.h>
+
+@@ -21,3 +22,13 @@ void x86_enable_ibrs(void)
+ native_wrmsrl(MSR_IA32_SPEC_CTRL, FEATURE_ENABLE_IBRS);
+ }
+ EXPORT_SYMBOL_GPL(x86_enable_ibrs);
++
++/*
++ * Do this indirection as otherwise we'd need to backport the
++ * EXPORT_SYMBOL_GPL() for asm stuff.
++ */
++void stuff_RSB(void)
++{
++ stuff_rsb();
++}
++EXPORT_SYMBOL_GPL(stuff_RSB);
+--- a/arch/x86/kvm/vmx.c
++++ b/arch/x86/kvm/vmx.c
+@@ -41,6 +41,7 @@
+ #include <asm/xcr.h>
+ #include <asm/msr-index.h>
+ #include <asm/spec_ctrl.h>
++#include <asm/proto.h>
+
+ #include "trace.h"
+
+@@ -6842,6 +6843,8 @@ static void __noclone vmx_vcpu_run(struc
+ #endif
+ );
+
++ stuff_RSB();
++
+ vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)
+ | (1 << VCPU_EXREG_RFLAGS)
+ | (1 << VCPU_EXREG_CPL)
diff --git a/patches.suse/18-x86-spec_ctrl-check-whether-ibrs-is-enabled-before-using-it.patch b/patches.suse/18-x86-spec_ctrl-check-whether-ibrs-is-enabled-before-using-it.patch
new file mode 100644
index 0000000000..341e28420b
--- /dev/null
+++ b/patches.suse/18-x86-spec_ctrl-check-whether-ibrs-is-enabled-before-using-it.patch
@@ -0,0 +1,142 @@
+From: Borislav Petkov <bp@suse.de>
+Date: Sun, 17 Dec 2017 16:01:57 +0100
+Subject: x86/spec_ctrl: Check whether IBRS is enabled before using it
+Patch-mainline: submitted on 2018/1/9
+References: bsc#1068032
+
+Check whether IBRS is enabled before using it.
+
+Carved out from a patch by Tim Chen <tim.c.chen@linux.intel.com>
+
+Signed-off-by: Borislav Petkov <bp@suse.de>
+---
+ arch/x86/include/asm/spec_ctrl.h | 37 +++++++++++++++++++++++++++++++++++++
+ arch/x86/kernel/cpu/spec_ctrl.c | 15 +++++++++++++--
+ arch/x86/kvm/vmx.c | 2 +-
+ 3 files changed, 51 insertions(+), 3 deletions(-)
+
+--- a/arch/x86/include/asm/spec_ctrl.h
++++ b/arch/x86/include/asm/spec_ctrl.h
+@@ -17,27 +17,55 @@
+
+ .macro ENABLE_IBRS_CLOBBER
+ ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_SPEC_CTRL
++ call x86_ibrs_enabled
++ test %eax, %eax
++ jz .Llfence_\@
++
+ __ENABLE_IBRS_CLOBBER
++ jmp .Lend_\@
++
++.Llfence_\@:
++ lfence
+ .Lend_\@:
+ .endm
+
+
+ .macro ENABLE_IBRS
+ ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_SPEC_CTRL
++
+ pushq %rax
++
++ call x86_ibrs_enabled
++ test %eax, %eax
++ jz .Llfence_\@
++
+ pushq %rcx
+ pushq %rdx
+ __ENABLE_IBRS_CLOBBER
+ popq %rdx
+ popq %rcx
++
++ jmp .Lpop_\@
++
++.Llfence_\@:
++ lfence
++
++.Lpop_\@:
+ popq %rax
++
+ .Lend_\@:
+ .endm
+
+
+ .macro DISABLE_IBRS
+ ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_SPEC_CTRL
++
+ pushq %rax
++
++ call x86_ibrs_enabled
++ test %eax, %eax
++ jz .Llfence_\@
++
+ pushq %rcx
+ pushq %rdx
+ movl $MSR_IA32_SPEC_CTRL, %ecx
+@@ -46,7 +74,15 @@
+ wrmsr
+ popq %rdx
+ popq %rcx
++
++ jmp .Lpop_\@
++
++.Llfence_\@:
++ lfence
++
++.Lpop_\@:
+ popq %rax
++
+ .Lend_\@:
+ .endm
+
+@@ -58,6 +94,7 @@
+ void x86_enable_ibrs(void);
+ void x86_disable_ibrs(void);
+ void stuff_RSB(void);
++unsigned int x86_ibrs_enabled(void);
+
+ static inline void x86_ibp_barrier(void)
+ {
+--- a/arch/x86/kernel/cpu/spec_ctrl.c
++++ b/arch/x86/kernel/cpu/spec_ctrl.c
+@@ -9,16 +9,27 @@
+ #include <asm/processor.h>
+ #include <asm/spec_ctrl.h>
+
++/*
++ * Keep it open for more flags in case needed.
++ */
++static unsigned int ibrs_state = 0;
++
++unsigned int notrace x86_ibrs_enabled(void)
++{
++ return ibrs_state;
++}
++EXPORT_SYMBOL_GPL(x86_ibrs_enabled);
++
+ void x86_disable_ibrs(void)
+ {
+- if (boot_cpu_has(X86_FEATURE_SPEC_CTRL))
++ if (x86_ibrs_enabled())
+ native_wrmsrl(MSR_IA32_SPEC_CTRL, 0);
+ }
+ EXPORT_SYMBOL_GPL(x86_disable_ibrs);
+
+ void x86_enable_ibrs(void)
+ {
+- if (boot_cpu_has(X86_FEATURE_SPEC_CTRL))
++ if (x86_ibrs_enabled())
+ native_wrmsrl(MSR_IA32_SPEC_CTRL, FEATURE_ENABLE_IBRS);
+ }
+ EXPORT_SYMBOL_GPL(x86_enable_ibrs);
+--- a/arch/x86/kvm/vmx.c
++++ b/arch/x86/kvm/vmx.c
+@@ -6739,7 +6739,7 @@ static void __noclone vmx_vcpu_run(struc
+ if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
+ vmx_set_interrupt_shadow(vcpu, 0);
+
+- if (boot_cpu_has(X86_FEATURE_SPEC_CTRL))
++ if (x86_ibrs_enabled())
+ add_atomic_switch_msr(vmx, MSR_IA32_SPEC_CTRL,
+ vmx->spec_ctrl, FEATURE_ENABLE_IBRS);
+
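In C terms, the ENABLE_IBRS macro introduced above boils down to the following logic. This is a stand-alone sketch for readers skimming the assembly: wrmsr_stub(), cpu_has_spec_ctrl and ibrs_state are illustrative stand-ins rather than kernel symbols, and the real code stays in assembly because it runs on the entry paths before the normal C environment is usable.

    #include <stdio.h>
    #include <emmintrin.h>  /* _mm_lfence() */

    #define MSR_IA32_SPEC_CTRL   0x48
    #define FEATURE_ENABLE_IBRS  (1 << 0)

    static int cpu_has_spec_ctrl = 1;  /* models the ALTERNATIVE/X86_FEATURE_SPEC_CTRL patching */
    static int ibrs_state = 1;         /* models what x86_ibrs_enabled() returns */

    static void wrmsr_stub(unsigned int msr, unsigned long long val)
    {
        printf("wrmsr 0x%x <- 0x%llx\n", msr, val);
    }

    /* Rough C equivalent of the ENABLE_IBRS assembly macro. */
    static void enable_ibrs(void)
    {
        if (!cpu_has_spec_ctrl)         /* ALTERNATIVE: "jmp .Lend_\@" */
            return;

        if (ibrs_state)                 /* call x86_ibrs_enabled; test %eax */
            wrmsr_stub(MSR_IA32_SPEC_CTRL, FEATURE_ENABLE_IBRS);
        else
            _mm_lfence();               /* .Llfence_\@: stop speculation past the check */
    }

    int main(void)
    {
        enable_ibrs();
        return 0;
    }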
diff --git a/patches.suse/19-x86-spec_ctrl-check-whether-ibpb-is-enabled-before-using-it.patch b/patches.suse/19-x86-spec_ctrl-check-whether-ibpb-is-enabled-before-using-it.patch
new file mode 100644
index 0000000000..2067d30dad
--- /dev/null
+++ b/patches.suse/19-x86-spec_ctrl-check-whether-ibpb-is-enabled-before-using-it.patch
@@ -0,0 +1,54 @@
+From: Borislav Petkov <bp@suse.de>
+Date: Sun, 17 Dec 2017 16:01:57 +0100
+Subject: x86/spec_ctrl: Check whether IBPB is enabled before using it
+Patch-mainline: submitted on 2018/1/9
+References: bsc#1068032
+
+Check whether IBPB is enabled before using it.
+
+Carved out from a patch by Tim Chen <tim.c.chen@linux.intel.com>
+
+Signed-off-by: Borislav Petkov <bp@suse.de>
+---
+ arch/x86/include/asm/spec_ctrl.h | 3 ++-
+ arch/x86/kernel/cpu/spec_ctrl.c | 7 +++++++
+ 2 files changed, 9 insertions(+), 1 deletion(-)
+
+--- a/arch/x86/include/asm/spec_ctrl.h
++++ b/arch/x86/include/asm/spec_ctrl.h
+@@ -95,10 +95,11 @@ void x86_enable_ibrs(void);
+ void x86_disable_ibrs(void);
+ void stuff_RSB(void);
+ unsigned int x86_ibrs_enabled(void);
++unsigned int x86_ibpb_enabled(void);
+
+ static inline void x86_ibp_barrier(void)
+ {
+- if (static_cpu_has(X86_FEATURE_SPEC_CTRL))
++ if (x86_ibpb_enabled())
+ native_wrmsrl(MSR_IA32_PRED_CMD, FEATURE_SET_IBPB);
+ }
+
+--- a/arch/x86/kernel/cpu/spec_ctrl.c
++++ b/arch/x86/kernel/cpu/spec_ctrl.c
+@@ -13,6 +13,7 @@
+ * Keep it open for more flags in case needed.
+ */
+ static unsigned int ibrs_state = 0;
++static unsigned int ibpb_state = 0;
+
+ unsigned int notrace x86_ibrs_enabled(void)
+ {
+@@ -20,6 +21,12 @@ unsigned int notrace x86_ibrs_enabled(vo
+ }
+ EXPORT_SYMBOL_GPL(x86_ibrs_enabled);
+
++unsigned int notrace x86_ibpb_enabled(void)
++{
++ return ibpb_state;
++}
++EXPORT_SYMBOL_GPL(x86_ibpb_enabled);
++
+ void x86_disable_ibrs(void)
+ {
+ if (x86_ibrs_enabled())
diff --git a/patches.suse/20-x86-cpu-check-speculation-control-cpuid-bit.patch b/patches.suse/20-x86-cpu-check-speculation-control-cpuid-bit.patch
new file mode 100644
index 0000000000..ffc87a6837
--- /dev/null
+++ b/patches.suse/20-x86-cpu-check-speculation-control-cpuid-bit.patch
@@ -0,0 +1,115 @@
+From: Borislav Petkov <bp@suse.de>
+Date: Sun, 17 Dec 2017 16:37:58 +0100
+Subject: x86/CPU: Check speculation control CPUID bit
+Patch-mainline: submitted on 2018/1/9
+References: bsc#1068032
+
+... and enable the corresponding flags.
+
+Carved out from a patch by Tim Chen <tim.c.chen@linux.intel.com> and
+improved.
+
+After microcode reload, we need to check CPUID directly as we don't
+update the X86_FEATURE flags after a reload.
+
+While at it, remove the __cpuinitdata annotation of cpu_caps_set which
+is just silly and shuts up an annoying warning. See:
+
+ 148f9bb87745 ("x86: delete __cpuinit usage from all x86 files")
+
+Signed-off-by: Borislav Petkov <bp@suse.de>
+---
+ arch/x86/include/asm/spec_ctrl.h | 1 +
+ arch/x86/kernel/cpu/common.c | 4 ++--
+ arch/x86/kernel/cpu/intel.c | 3 +++
+ arch/x86/kernel/cpu/spec_ctrl.c | 16 ++++++++++++++++
+ arch/x86/kernel/microcode_core.c | 6 +++++-
+ 5 files changed, 27 insertions(+), 3 deletions(-)
+
+--- a/arch/x86/include/asm/spec_ctrl.h
++++ b/arch/x86/include/asm/spec_ctrl.h
+@@ -96,6 +96,7 @@ void x86_disable_ibrs(void);
+ void stuff_RSB(void);
+ unsigned int x86_ibrs_enabled(void);
+ unsigned int x86_ibpb_enabled(void);
++void x86_spec_check(void);
+
+ static inline void x86_ibp_barrier(void)
+ {
+--- a/arch/x86/kernel/cpu/common.c
++++ b/arch/x86/kernel/cpu/common.c
+@@ -428,8 +428,8 @@ static const char *__cpuinit table_looku
+ return NULL; /* Not found */
+ }
+
+-__u32 cpu_caps_cleared[NCAPINTS] __cpuinitdata;
+-__u32 cpu_caps_set[NCAPINTS] __cpuinitdata;
++__u32 cpu_caps_cleared[NCAPINTS];
++__u32 cpu_caps_set[NCAPINTS];
+
+ void load_percpu_segment(int cpu)
+ {
+--- a/arch/x86/kernel/cpu/intel.c
++++ b/arch/x86/kernel/cpu/intel.c
+@@ -14,6 +14,7 @@
+ #include <asm/msr.h>
+ #include <asm/bugs.h>
+ #include <asm/cpu.h>
++#include <asm/spec_ctrl.h>
+
+ #ifdef CONFIG_X86_64
+ #include <linux/topology.h>
+@@ -493,6 +494,8 @@ static void __cpuinit init_intel(struct
+ "ENERGY_PERF_BIAS: View and update with"
+ " cpupower-set(8)\n");
+ }
++
++ x86_spec_check();
+ }
+
+ #ifdef CONFIG_X86_32
+--- a/arch/x86/kernel/cpu/spec_ctrl.c
++++ b/arch/x86/kernel/cpu/spec_ctrl.c
+@@ -50,3 +50,19 @@ void stuff_RSB(void)
+ stuff_rsb();
+ }
+ EXPORT_SYMBOL_GPL(stuff_RSB);
++
++/*
++ * Called after upgrading microcode, check CPUID directly.
++ */
++void x86_spec_check(void)
++{
++ if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) {
++ if (cpuid_edx(7) & BIT(26)) {
++ ibrs_state = 1;
++ ibpb_state = 1;
++
++ setup_force_cpu_cap(X86_FEATURE_SPEC_CTRL);
++ }
++ }
++}
++EXPORT_SYMBOL_GPL(x86_spec_check);
+--- a/arch/x86/kernel/microcode_core.c
++++ b/arch/x86/kernel/microcode_core.c
+@@ -87,6 +87,7 @@
+ #include <asm/microcode.h>
+ #include <asm/processor.h>
+ #include <asm/perf_event.h>
++#include <asm/spec_ctrl.h>
+
+ MODULE_DESCRIPTION("Microcode Update Driver");
+ MODULE_AUTHOR("Tigran Aivazian <tigran@aivazian.fsnet.co.uk>");
+@@ -321,8 +322,11 @@ static ssize_t reload_store(struct sys_d
+ if (!ret)
+ ret = tmp_ret;
+ }
+- if (!ret)
++ if (!ret) {
+ perf_check_microcode();
++ x86_spec_check();
++ }
++
+ mutex_unlock(&microcode_mutex);
+ put_online_cpus();
+
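The bit that x86_spec_check() tests above, CPUID.(EAX=7,ECX=0):EDX[26], is readable from user space too, which is handy for verifying that a microcode update actually advertises IBRS/IBPB. A minimal check using GCC's <cpuid.h>:

    #include <stdio.h>
    #include <cpuid.h>

    int main(void)
    {
        unsigned int eax, ebx, ecx, edx;

        if (__get_cpuid_max(0, NULL) < 7) {
            puts("CPUID leaf 7 not available");
            return 1;
        }

        __cpuid_count(7, 0, eax, ebx, ecx, edx);
        (void)eax; (void)ebx; (void)ecx;    /* only EDX is interesting here */

        printf("CPUID.(EAX=7,ECX=0):EDX[26] (IBRS/IBPB) = %u\n",
               (edx >> 26) & 1);
        return 0;
    }

If this prints 1 while the kernel still lacks the feature flag, the microcode was most likely loaded after boot, which is exactly the case the reload_store() hook above handles.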
diff --git a/patches.suse/21-x86-spec-add-nospec-chicken-bit.patch b/patches.suse/21-x86-spec-add-nospec-chicken-bit.patch
new file mode 100644
index 0000000000..04ab29d57b
--- /dev/null
+++ b/patches.suse/21-x86-spec-add-nospec-chicken-bit.patch
@@ -0,0 +1,47 @@
+From: Borislav Petkov <bp@suse.de>
+Date: Sun, 17 Dec 2017 16:45:58 +0100
+Subject: x86/spec: Add "nospec" chicken bit
+Patch-mainline: submitted on 2018/1/9
+References: bsc#1068032
+
+... which disables the speculation features toggle and avoids the
+performance overhead from them.
+
+Carved out from a patch by Tim Chen <tim.c.chen@linux.intel.com>
+
+Signed-off-by: Borislav Petkov <bp@suse.de>
+---
+ Documentation/kernel-parameters.txt | 5 +++++
+ arch/x86/kernel/cpu/spec_ctrl.c | 10 ++++++++++
+ 2 files changed, 15 insertions(+)
+
+--- a/arch/x86/kernel/cpu/spec_ctrl.c
++++ b/arch/x86/kernel/cpu/spec_ctrl.c
+@@ -66,3 +66,13 @@ void x86_spec_check(void)
+ }
+ }
+ EXPORT_SYMBOL_GPL(x86_spec_check);
++
++static int __init nospec(char *str)
++{
++ setup_clear_cpu_cap(X86_FEATURE_SPEC_CTRL);
++ ibrs_state = 0;
++ ibpb_state = 0;
++
++ return 0;
++}
++early_param("nospec", nospec);
+--- a/Documentation/kernel-parameters.txt
++++ b/Documentation/kernel-parameters.txt
+@@ -1775,6 +1775,11 @@ bytes respectively. Such letter suffixes
+ Disable SMEP (Supervisor Mode Execution Protection)
+ even if it is supported by processor.
+
++ nospec [X86]
++ Disable indirect branch restricted speculation and
++ indirect branch prediction barrier to avoid performance
++ penalties in trusted environments.
++
+ noexec32 [X86-64]
+ This affects only 32-bit executables.
+ noexec32=on: enable non-executable mappings (default)
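Whether a running system was booted with this chicken bit can be confirmed by looking for the exact "nospec" token on the command line. A minimal user-space sketch; it reads /proc/cmdline, which reflects what the bootloader passed:

    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
        char cmdline[4096], *tok;
        int found = 0;
        FILE *f = fopen("/proc/cmdline", "r");

        if (!f || !fgets(cmdline, sizeof(cmdline), f)) {
            perror("/proc/cmdline");
            return 1;
        }
        fclose(f);

        /* Match whole tokens only, so unrelated options are not picked up. */
        for (tok = strtok(cmdline, " \n"); tok; tok = strtok(NULL, " \n"))
            if (!strcmp(tok, "nospec"))
                found = 1;

        printf("booted with \"nospec\": %s\n", found ? "yes" : "no");
        return 0;
    }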
diff --git a/patches.suse/22-x86-cpu-amd-add-speculative-control-support-for-amd.patch b/patches.suse/22-x86-cpu-amd-add-speculative-control-support-for-amd.patch
new file mode 100644
index 0000000000..b41e67410b
--- /dev/null
+++ b/patches.suse/22-x86-cpu-amd-add-speculative-control-support-for-amd.patch
@@ -0,0 +1,117 @@
+From: Tom Lendacky <thomas.lendacky@amd.com>
+Date: Mon, 18 Dec 2017 11:50:09 +0100
+Subject: x86/CPU/AMD: Add speculative control support for AMD
+Patch-mainline: submitted on 2018/1/9
+References: bsc#1068032
+
+Add speculative control support for AMD processors. For AMD, speculative
+control is indicated as follows:
+
+ CPUID EAX=0x00000007, ECX=0x00 return EDX[26] indicates support for
+ both IBRS and IBPB.
+
+ CPUID EAX=0x80000008, ECX=0x00 return EBX[12] indicates support for
+ just IBPB.
+
+On AMD family 0x10, 0x12 and 0x16 processors where either of the above
+features is not supported, IBPB can be achieved by disabling
+indirect branch predictor support in MSR 0xc0011021[14] at boot.
+
+Boris:
+- make CPUID_0x80000008[EBX], cap leaf 10 as the others are missing
+- use rdmsrl/wrmsrl because we don't have msr_set_bit() yet
+
+Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
+[ Move everything to spec_ctrl.c ]
+Signed-off-by: Borislav Petkov <bp@suse.de>
+---
+ arch/x86/include/asm/cpufeature.h | 5 ++++-
+ arch/x86/include/asm/msr-index.h | 2 ++
+ arch/x86/kernel/cpu/amd.c | 3 +++
+ arch/x86/kernel/cpu/scattered.c | 1 +
+ arch/x86/kernel/cpu/spec_ctrl.c | 20 ++++++++++++++++++++
+ 5 files changed, 30 insertions(+), 1 deletion(-)
+
+--- a/arch/x86/include/asm/cpufeature.h
++++ b/arch/x86/include/asm/cpufeature.h
+@@ -89,7 +89,10 @@
+ #define X86_FEATURE_LFENCE_RDTSC (3*32+18) /* "" Lfence synchronizes RDTSC */
+ #define X86_FEATURE_11AP (3*32+19) /* "" Bad local APIC aka 11AP */
+ #define X86_FEATURE_NOPL (3*32+20) /* The NOPL (0F 1F) instructions */
+- /* 21 available, was AMD_C1E */
++
++/* It is in word 13 upstream - move it here due to kABI breakage. */
++#define X86_FEATURE_IBPB (3*32+21) /* Indirect Branch Prediction Barrier */
++
+ #define X86_FEATURE_XTOPOLOGY (3*32+22) /* cpu topology enum extensions */
+ #define X86_FEATURE_TSC_RELIABLE (3*32+23) /* TSC is known to be reliable */
+ #define X86_FEATURE_NONSTOP_TSC (3*32+24) /* TSC does not stop in C states */
+--- a/arch/x86/include/asm/msr-index.h
++++ b/arch/x86/include/asm/msr-index.h
+@@ -152,6 +152,8 @@
+ /* Fam 15h MSRs */
+ #define MSR_F15H_PERF_CTL 0xc0010200
+ #define MSR_F15H_PERF_CTR 0xc0010201
++#define MSR_F15H_IC_CFG 0xc0011021
++#define MSR_F15H_IC_CFG_DIS_IND (1 << 14)
+
+ /* Fam 10h MSRs */
+ #define MSR_FAM10H_MMIO_CONF_BASE 0xc0010058
+--- a/arch/x86/kernel/cpu/amd.c
++++ b/arch/x86/kernel/cpu/amd.c
+@@ -8,6 +8,7 @@
+ #include <asm/apic.h>
+ #include <asm/cpu.h>
+ #include <asm/pci-direct.h>
++#include <asm/spec_ctrl.h>
+
+ #ifdef CONFIG_X86_64
+ # include <asm/numa_64.h>
+@@ -680,6 +681,8 @@ static void __cpuinit init_amd(struct cp
+ wrmsrl_safe(MSR_AMD64_MCx_MASK(4), mask);
+ }
+ }
++
++ x86_spec_check();
+ }
+
+ #ifdef CONFIG_X86_32
+--- a/arch/x86/kernel/cpu/scattered.c
++++ b/arch/x86/kernel/cpu/scattered.c
+@@ -41,6 +41,7 @@ void __cpuinit init_scattered_cpuid_feat
+ { X86_FEATURE_SPEC_CTRL, CR_EDX,26, 0x00000007, 0 },
+ { X86_FEATURE_XSAVEOPT, CR_EAX, 0, 0x0000000d, 1 },
+ { X86_FEATURE_CPB, CR_EDX, 9, 0x80000007, 0 },
++ { X86_FEATURE_IBPB, CR_EBX,12, 0x80000008, 0 },
+ { X86_FEATURE_NPT, CR_EDX, 0, 0x8000000a, 0 },
+ { X86_FEATURE_LBRV, CR_EDX, 1, 0x8000000a, 0 },
+ { X86_FEATURE_SVML, CR_EDX, 2, 0x8000000a, 0 },
+--- a/arch/x86/kernel/cpu/spec_ctrl.c
++++ b/arch/x86/kernel/cpu/spec_ctrl.c
+@@ -63,6 +63,26 @@ void x86_spec_check(void)
+
+ setup_force_cpu_cap(X86_FEATURE_SPEC_CTRL);
+ }
++ } else if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
++ if (boot_cpu_has(X86_FEATURE_SPEC_CTRL))
++ ibrs_state = 1;
++
++ if (boot_cpu_has(X86_FEATURE_IBPB)) {
++ ibpb_state = 1;
++ } else {
++ u64 val;
++
++ switch (boot_cpu_data.x86) {
++ case 0x10:
++ case 0x12:
++ case 0x16:
++ pr_info_once("Disabling indirect branch predictor support\n");
++ rdmsrl(MSR_F15H_IC_CFG, val);
++ val |= MSR_F15H_IC_CFG_DIS_IND;
++ wrmsrl(MSR_F15H_IC_CFG, val);
++ break;
++ }
++ }
+ }
+ }
+ EXPORT_SYMBOL_GPL(x86_spec_check);
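The second detection path described in the message, CPUID Fn8000_0008 EBX[12] for standalone IBPB, can likewise be queried from user space. A minimal sketch with GCC's <cpuid.h>; it checks that the extended leaf exists before reading it:

    #include <stdio.h>
    #include <cpuid.h>

    int main(void)
    {
        unsigned int eax, ebx, ecx, edx;

        /* Make sure the extended leaf exists before querying it. */
        if (__get_cpuid_max(0x80000000, NULL) < 0x80000008) {
            puts("CPUID leaf 0x80000008 not available");
            return 1;
        }

        __cpuid(0x80000008, eax, ebx, ecx, edx);
        (void)eax; (void)ecx; (void)edx;    /* only EBX is interesting here */

        printf("CPUID.0x80000008:EBX[12] (IBPB) = %u\n", (ebx >> 12) & 1);
        return 0;
    }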
diff --git a/patches.suse/23-x86-spec-check-cpuid-direclty-post-microcode-reload-to-support-ibpb-feature.patch b/patches.suse/23-x86-spec-check-cpuid-direclty-post-microcode-reload-to-support-ibpb-feature.patch
new file mode 100644
index 0000000000..2d4e47fb21
--- /dev/null
+++ b/patches.suse/23-x86-spec-check-cpuid-direclty-post-microcode-reload-to-support-ibpb-feature.patch
@@ -0,0 +1,53 @@
+From: Tom Lendacky <thomas.lendacky@amd.com>
+Date: Mon, 18 Dec 2017 11:55:18 +0100
+Subject: x86/spec: Check CPUID directly post microcode reload to support IBPB
+ feature
+Patch-mainline: submitted on 2018/1/9
+References: bsc#1068032
+
+Add an IBPB feature check to the speculative control update check after
+a microcode reload.
+
+Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
+[ Check CPUID directly. ]
+Signed-off-by: Borislav Petkov <bp@suse.de>
+---
+ arch/x86/kernel/cpu/spec_ctrl.c | 18 ++++++++----------
+ 1 file changed, 8 insertions(+), 10 deletions(-)
+
+--- a/arch/x86/kernel/cpu/spec_ctrl.c
++++ b/arch/x86/kernel/cpu/spec_ctrl.c
+@@ -56,18 +56,15 @@ EXPORT_SYMBOL_GPL(stuff_RSB);
+ */
+ void x86_spec_check(void)
+ {
+- if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) {
+- if (cpuid_edx(7) & BIT(26)) {
+- ibrs_state = 1;
+- ibpb_state = 1;
++ if (cpuid_edx(7) & BIT(26)) {
++ ibrs_state = 1;
++ ibpb_state = 1;
+
+- setup_force_cpu_cap(X86_FEATURE_SPEC_CTRL);
+- }
+- } else if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
+- if (boot_cpu_has(X86_FEATURE_SPEC_CTRL))
+- ibrs_state = 1;
++ setup_force_cpu_cap(X86_FEATURE_SPEC_CTRL);
++ }
+
+- if (boot_cpu_has(X86_FEATURE_IBPB)) {
++ if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
++ if (cpuid_ebx(0x80000008) & BIT(12)) {
+ ibpb_state = 1;
+ } else {
+ u64 val;
+@@ -82,6 +79,7 @@ void x86_spec_check(void)
+ wrmsrl(MSR_F15H_IC_CFG, val);
+ break;
+ }
++ ibpb_state = 0;
+ }
+ }
+ }
diff --git a/patches.suse/24-kvm-svm-do-not-intercept-new-speculative-control-msrs.patch b/patches.suse/24-kvm-svm-do-not-intercept-new-speculative-control-msrs.patch
new file mode 100644
index 0000000000..cc54d438cb
--- /dev/null
+++ b/patches.suse/24-kvm-svm-do-not-intercept-new-speculative-control-msrs.patch
@@ -0,0 +1,27 @@
+From: Tom Lendacky <thomas.lendacky@amd.com>
+Date: Mon, 18 Dec 2017 12:06:31 +0100
+Subject: KVM: SVM: Do not intercept new speculative control MSRs
+Patch-mainline: submitted on 2018/1/9
+References: bsc#1068032
+
+Allow guest access to the speculative control MSRs without being
+intercepted.
+
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
+Signed-off-by: Borislav Petkov <bp@suse.de>
+---
+ arch/x86/kvm/svm.c | 2 ++
+ 1 file changed, 2 insertions(+)
+
+--- a/arch/x86/kvm/svm.c
++++ b/arch/x86/kvm/svm.c
+@@ -170,6 +170,8 @@ static struct svm_direct_access_msrs {
+ { .index = MSR_CSTAR, .always = true },
+ { .index = MSR_SYSCALL_MASK, .always = true },
+ #endif
++ { .index = MSR_IA32_SPEC_CTRL, .always = true },
++ { .index = MSR_IA32_PRED_CMD, .always = true },
+ { .index = MSR_IA32_LASTBRANCHFROMIP, .always = false },
+ { .index = MSR_IA32_LASTBRANCHTOIP, .always = false },
+ { .index = MSR_IA32_LASTINTFROMIP, .always = false },
diff --git a/patches.suse/25-x86-svm-set-ibrs-value-on-vm-entry-and-exit.patch b/patches.suse/25-x86-svm-set-ibrs-value-on-vm-entry-and-exit.patch
new file mode 100644
index 0000000000..4484a5965f
--- /dev/null
+++ b/patches.suse/25-x86-svm-set-ibrs-value-on-vm-entry-and-exit.patch
@@ -0,0 +1,78 @@
+From: Tom Lendacky <thomas.lendacky@amd.com>
+Date: Mon, 18 Dec 2017 12:23:33 +0100
+Subject: x86/svm: Set IBRS value on VM entry and exit
+Patch-mainline: submitted on 2018/1/9
+References: bsc#1068032
+
+Set/restore the guest's IBRS value on VM entry. On VM exit back to the
+kernel, save the guest's IBRS value and then set IBRS to 1.
+
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
+Signed-off-by: Borislav Petkov <bp@suse.de>
+---
+ arch/x86/kvm/svm.c | 18 ++++++++++++++++++
+ 1 file changed, 18 insertions(+)
+
+--- a/arch/x86/kvm/svm.c
++++ b/arch/x86/kvm/svm.c
+@@ -33,6 +33,7 @@
+ #include <asm/tlbflush.h>
+ #include <asm/desc.h>
+ #include <asm/kvm_para.h>
++#include <asm/spec_ctrl.h>
+
+ #include <asm/virtext.h>
+ #include "trace.h"
+@@ -128,6 +129,8 @@ struct vcpu_svm {
+
+ u64 next_rip;
+
++ u64 spec_ctrl;
++
+ u64 host_user_msrs[NR_HOST_SAVE_USER_MSRS];
+ struct {
+ u16 fs;
+@@ -3055,6 +3058,9 @@ static int svm_get_msr(struct kvm_vcpu *
+ case MSR_VM_CR:
+ *data = svm->nested.vm_cr_msr;
+ break;
++ case MSR_IA32_SPEC_CTRL:
++ *data = svm->spec_ctrl;
++ break;
+ case MSR_IA32_UCODE_REV:
+ *data = 0x01000065;
+ break;
+@@ -3170,6 +3176,9 @@ static int svm_set_msr(struct kvm_vcpu *
+ case MSR_VM_IGNNE:
+ pr_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data);
+ break;
++ case MSR_IA32_SPEC_CTRL:
++ svm->spec_ctrl = data;
++ break;
+ default:
+ return kvm_set_msr_common(vcpu, msr);
+ }
+@@ -3807,6 +3816,9 @@ static void svm_vcpu_run(struct kvm_vcpu
+
+ local_irq_enable();
+
++ if (x86_ibrs_enabled() && (svm->spec_ctrl != FEATURE_ENABLE_IBRS))
++ wrmsrl(MSR_IA32_SPEC_CTRL, svm->spec_ctrl);
++
+ asm volatile (
+ "push %%"R"bp; \n\t"
+ "mov %c[rbx](%[svm]), %%"R"bx \n\t"
+@@ -3878,6 +3890,12 @@ static void svm_vcpu_run(struct kvm_vcpu
+ #endif
+ );
+
++ if (x86_ibrs_enabled()) {
++ rdmsrl(MSR_IA32_SPEC_CTRL, svm->spec_ctrl);
++ if (svm->spec_ctrl != FEATURE_ENABLE_IBRS)
++ wrmsrl(MSR_IA32_SPEC_CTRL, FEATURE_ENABLE_IBRS);
++ }
++
+ #ifdef CONFIG_X86_64
+ wrmsrl(MSR_GS_BASE, svm->host.gs_base);
+ #else
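On a host with IBRS-capable microcode, the MSR that svm_vcpu_run() now saves and restores can be inspected through the msr driver. A minimal sketch, assuming root privileges and a prior `modprobe msr`; the read fails with EIO when the CPU or microcode does not expose IA32_SPEC_CTRL:

    #include <stdio.h>
    #include <stdint.h>
    #include <fcntl.h>
    #include <unistd.h>

    #define MSR_IA32_SPEC_CTRL 0x48    /* IA32_SPEC_CTRL, IBRS is bit 0 */

    int main(void)
    {
        uint64_t val;
        int fd = open("/dev/cpu/0/msr", O_RDONLY);

        if (fd < 0) {
            perror("open /dev/cpu/0/msr (need root and the msr module)");
            return 1;
        }

        /* The msr device reads the MSR whose number equals the file offset. */
        if (pread(fd, &val, sizeof(val), MSR_IA32_SPEC_CTRL) != sizeof(val)) {
            perror("read IA32_SPEC_CTRL (CPU/microcode may not expose it)");
            close(fd);
            return 1;
        }

        printf("IA32_SPEC_CTRL = 0x%llx (IBRS bit = %llu)\n",
               (unsigned long long)val, (unsigned long long)(val & 1));
        close(fd);
        return 0;
    }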
diff --git a/patches.suse/26-x86-svm-set-ibpb-when-running-a-different-vcpu.patch b/patches.suse/26-x86-svm-set-ibpb-when-running-a-different-vcpu.patch
new file mode 100644
index 0000000000..1a1538ab29
--- /dev/null
+++ b/patches.suse/26-x86-svm-set-ibpb-when-running-a-different-vcpu.patch
@@ -0,0 +1,58 @@
+From: Tom Lendacky <thomas.lendacky@amd.com>
+Date: Mon, 18 Dec 2017 12:47:01 +0100
+Subject: x86/svm: Set IBPB when running a different VCPU
+Patch-mainline: submitted on 2018/1/9
+References: bsc#1068032
+
+Set IBPB (Indirect Branch Prediction Barrier) when the current CPU is
+going to run a VCPU different from what was previously run.
+
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
+Signed-off-by: Borislav Petkov <bp@suse.de>
+---
+ arch/x86/kvm/svm.c | 14 ++++++++++++++
+ 1 file changed, 14 insertions(+)
+
+--- a/arch/x86/kvm/svm.c
++++ b/arch/x86/kvm/svm.c
+@@ -387,6 +387,8 @@ struct svm_cpu_data {
+ struct kvm_ldttss_desc *tss_desc;
+
+ struct page *save_area;
++
++ struct vmcb *current_vmcb;
+ };
+
+ static DEFINE_PER_CPU(struct svm_cpu_data *, svm_data);
+@@ -1291,11 +1293,18 @@ static void svm_free_vcpu(struct kvm_vcp
+ __free_pages(virt_to_page(svm->nested.msrpm), MSRPM_ALLOC_ORDER);
+ kvm_vcpu_uninit(vcpu);
+ kmem_cache_free(kvm_vcpu_cache, svm);
++
++ /*
++ * The VMCB could be recycled, causing a false negative in svm_vcpu_load;
++ * block speculative execution.
++ */
++ x86_ibp_barrier();
+ }
+
+ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
+ {
+ struct vcpu_svm *svm = to_svm(vcpu);
++ struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
+ int i;
+
+ if (unlikely(cpu != vcpu->cpu)) {
+@@ -1318,6 +1327,11 @@ static void svm_vcpu_load(struct kvm_vcp
+ __get_cpu_var(current_tsc_ratio) = svm->tsc_ratio;
+ wrmsrl(MSR_AMD64_TSC_RATIO, svm->tsc_ratio);
+ }
++
++ if (sd->current_vmcb != svm->vmcb) {
++ sd->current_vmcb = svm->vmcb;
++ x86_ibp_barrier();
++ }
+ }
+
+ static void svm_vcpu_put(struct kvm_vcpu *vcpu)
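The sd->current_vmcb comparison added above is a per-CPU "last pointer seen" cache, so the prediction barrier is only paid when the CPU really switches to a different guest context. The following stand-alone model shows just that caching pattern; struct vmcb, ibp_barrier() and vcpu_load() here are simplified stand-ins, not KVM code:

    #include <stdio.h>

    struct vmcb { int id; };

    static struct vmcb *current_vmcb;   /* per-CPU in the real code */

    static void ibp_barrier(void)
    {
        printf("IBPB issued\n");        /* the kernel writes MSR_IA32_PRED_CMD here */
    }

    static void vcpu_load(struct vmcb *vmcb)
    {
        if (current_vmcb != vmcb) {     /* only pay for the barrier on a real switch */
            current_vmcb = vmcb;
            ibp_barrier();
        }
    }

    int main(void)
    {
        struct vmcb a = { 1 }, b = { 2 };

        vcpu_load(&a);  /* barrier */
        vcpu_load(&a);  /* no barrier: same VMCB as last time */
        vcpu_load(&b);  /* barrier */
        return 0;
    }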
diff --git a/patches.suse/27-kvm-x86-add-speculative-control-cpuid-support-for-guests.patch b/patches.suse/27-kvm-x86-add-speculative-control-cpuid-support-for-guests.patch
new file mode 100644
index 0000000000..2000824444
--- /dev/null
+++ b/patches.suse/27-kvm-x86-add-speculative-control-cpuid-support-for-guests.patch
@@ -0,0 +1,59 @@
+From: Tom Lendacky <thomas.lendacky@amd.com>
+Date: Mon, 18 Dec 2017 12:53:04 +0100
+Subject: KVM: x86: Add speculative control CPUID support for guests
+Patch-mainline: submitted on 2018/1/9
+References: bsc#1068032
+
+Provide the guest with the speculative control CPUID related values.
+
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
+Signed-off-by: Borislav Petkov <bp@suse.de>
+
+---
+ arch/x86/kvm/x86.c | 20 +++++++++++++++++++-
+ 1 file changed, 19 insertions(+), 1 deletion(-)
+
+--- a/arch/x86/kvm/x86.c
++++ b/arch/x86/kvm/x86.c
+@@ -2550,6 +2550,8 @@ static void do_cpuid_ent(struct kvm_cpui
+ F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) | F(BMI2) | F(RTM) | F(FSGSBASE) | F(ERMS) |
+ f_invpcid;
+
++ const u32 kvm_cpuid_7_0_edx_x86_features = F(SPEC_CTRL);
++
+ /* all calls to cpuid_count() should be made on the same cpu */
+ get_cpu();
+ do_cpuid_1_ent(entry, function, index);
+@@ -2613,7 +2615,7 @@ static void do_cpuid_ent(struct kvm_cpui
+ entry->ebx = 0;
+ entry->eax = 0;
+ entry->ecx = 0;
+- entry->edx = 0;
++ entry->edx &= kvm_cpuid_7_0_edx_x86_features;
+ break;
+ }
+ /* function 0xb has additional index. */
+@@ -2675,6 +2677,22 @@ static void do_cpuid_ent(struct kvm_cpui
+ entry->ecx &= kvm_supported_word6_x86_features;
+ cpuid_mask(&entry->ecx, 6);
+ break;
++
++ case 0x80000008:
++ entry->eax = 0;
++ entry->ecx = 0;
++ entry->edx = 0;
++
++ /*
++ * cpuid 0x80000008.0.ebx
++ *
++ * Boris: hardcode due to prior kABI fix.
++ */
++ if (boot_cpu_has(X86_FEATURE_IBPB))
++ entry->ebx |= (1 << 12);
++
++ break;
++
+ /*Add support for Centaur's CPUID instruction*/
+ case 0xC0000000:
+ /*Just support up to 0xC0000004 now*/
diff --git a/patches.suse/28-x86-svm-clobber-the-rsb-on-vm-exit.patch b/patches.suse/28-x86-svm-clobber-the-rsb-on-vm-exit.patch
new file mode 100644
index 0000000000..cf3424bed8
--- /dev/null
+++ b/patches.suse/28-x86-svm-clobber-the-rsb-on-vm-exit.patch
@@ -0,0 +1,26 @@
+From: Tom Lendacky <thomas.lendacky@amd.com>
+Date: Mon, 18 Dec 2017 13:06:12 +0100
+Subject: x86/svm: Clobber the RSB on VM exit
+Patch-mainline: submitted on 2018/1/9
+References: bsc#1068032
+
+Overwrite the local CPU RSB entries from the previous less privileged
+mode.
+
+Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
+Signed-off-by: Borislav Petkov <bp@suse.de>
+---
+ arch/x86/kvm/svm.c | 2 ++
+ 1 file changed, 2 insertions(+)
+
+--- a/arch/x86/kvm/svm.c
++++ b/arch/x86/kvm/svm.c
+@@ -3910,6 +3910,8 @@ static void svm_vcpu_run(struct kvm_vcpu
+ wrmsrl(MSR_IA32_SPEC_CTRL, FEATURE_ENABLE_IBRS);
+ }
+
++ stuff_RSB();
++
+ #ifdef CONFIG_X86_64
+ wrmsrl(MSR_GS_BASE, svm->host.gs_base);
+ #else
diff --git a/patches.suse/29-x86-svm-add-code-to-clear-registers-on-vm-exit.patch b/patches.suse/29-x86-svm-add-code-to-clear-registers-on-vm-exit.patch
new file mode 100644
index 0000000000..280ed08378
--- /dev/null
+++ b/patches.suse/29-x86-svm-add-code-to-clear-registers-on-vm-exit.patch
@@ -0,0 +1,39 @@
+From: Tom Lendacky <thomas.lendacky@amd.com>
+Date: Mon, 18 Dec 2017 14:05:00 +0100
+Subject: x86/svm: Add code to clear registers on VM exit
+Patch-mainline: submitted on 2018/1/9
+References: bsc#1068032
+
+Clear registers on VM exit to prevent speculative use of them.
+
+Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
+Signed-off-by: Borislav Petkov <bp@suse.de>
+---
+ arch/x86/kvm/svm.c | 16 ++++++++++++++++
+ 1 file changed, 16 insertions(+)
+
+--- a/arch/x86/kvm/svm.c
++++ b/arch/x86/kvm/svm.c
+@@ -3877,6 +3877,22 @@ static void svm_vcpu_run(struct kvm_vcpu
+ "mov %%r14, %c[r14](%[svm]) \n\t"
+ "mov %%r15, %c[r15](%[svm]) \n\t"
+ #endif
++ /* Clear host registers (marked as clobbered so it's safe) */
++ "xor %%" _ASM_BX ", %%" _ASM_BX " \n\t"
++ "xor %%" _ASM_CX ", %%" _ASM_CX " \n\t"
++ "xor %%" _ASM_DX ", %%" _ASM_DX " \n\t"
++ "xor %%" _ASM_SI ", %%" _ASM_SI " \n\t"
++ "xor %%" _ASM_DI ", %%" _ASM_DI " \n\t"
++#ifdef CONFIG_X86_64
++ "xor %%r8, %%r8 \n\t"
++ "xor %%r9, %%r9 \n\t"
++ "xor %%r10, %%r10 \n\t"
++ "xor %%r11, %%r11 \n\t"
++ "xor %%r12, %%r12 \n\t"
++ "xor %%r13, %%r13 \n\t"
++ "xor %%r14, %%r14 \n\t"
++ "xor %%r15, %%r15 \n\t"
++#endif
+ "pop %%"R"bp"
+ :
+ : [svm]"a"(svm),
diff --git a/patches.suse/30-x86-cpu-amd-make-the-lfence-instruction-serialized.patch b/patches.suse/30-x86-cpu-amd-make-the-lfence-instruction-serialized.patch
new file mode 100644
index 0000000000..58c0d43715
--- /dev/null
+++ b/patches.suse/30-x86-cpu-amd-make-the-lfence-instruction-serialized.patch
@@ -0,0 +1,54 @@
+From: Tom Lendacky <thomas.lendacky@amd.com>
+Date: Mon, 18 Dec 2017 14:13:37 +0100
+Subject: x86/CPU/AMD: Make the LFENCE instruction serialized
+Patch-mainline: submitted on 2018/1/9
+References: bsc#1068032
+
+In order to reduce the impact of using MFENCE, make the execution of the
+LFENCE instruction serialized. This is done by setting bit 1 of MSR
+0xc0011029 (DE_CFG).
+
+Some families that support LFENCE do not have this MSR. For these
+families, the LFENCE instruction is already serialized.
+
+Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
+Signed-off-by: Borislav Petkov <bp@suse.de>
+---
+ arch/x86/include/asm/msr-index.h | 2 ++
+ arch/x86/kernel/cpu/amd.c        | 14 ++++++++++++--
+ 2 files changed, 14 insertions(+), 2 deletions(-)
+
+--- a/arch/x86/include/asm/msr-index.h
++++ b/arch/x86/include/asm/msr-index.h
+@@ -163,6 +163,8 @@
+ #define FAM10H_MMIO_CONF_BASE_MASK 0xfffffffULL
+ #define FAM10H_MMIO_CONF_BASE_SHIFT 20
+ #define MSR_FAM10H_NODE_ID 0xc001100c
++#define MSR_F10H_DECFG 0xc0011029
++#define MSR_F10H_DECFG_LFENCE_SERIALIZE_BIT 1
+
+ /* K8 MSRs */
+ #define MSR_K8_TOP_MEM1 0xc001001a
+--- a/arch/x86/kernel/cpu/amd.c
++++ b/arch/x86/kernel/cpu/amd.c
+@@ -630,8 +630,18 @@ static void __cpuinit init_amd(struct cp
+ set_cpu_cap(c, X86_FEATURE_K8);
+
+ if (cpu_has_xmm2) {
+- /* MFENCE stops RDTSC speculation */
+- set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC);
++ /*
++ * Use LFENCE for execution serialization. On families which
++ * don't have that MSR, LFENCE is already serialized.
++ */
++		if (c->x86 > 0xf) {
++			rdmsrl(MSR_F10H_DECFG, value);
++			value |= 1ULL << MSR_F10H_DECFG_LFENCE_SERIALIZE_BIT;
++			wrmsrl(MSR_F10H_DECFG, value);
++		}
++
++ /* LFENCE with MSR_F10H_DECFG[1]=1 stops RDTSC speculation */
++ set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
+ }
+
+ #ifdef CONFIG_X86_64
diff --git a/patches.suse/31-x86-cpu-amd-remove-now-unused-definition-of-mfence_rdtsc-feature.patch b/patches.suse/31-x86-cpu-amd-remove-now-unused-definition-of-mfence_rdtsc-feature.patch
new file mode 100644
index 0000000000..e695ef0cec
--- /dev/null
+++ b/patches.suse/31-x86-cpu-amd-remove-now-unused-definition-of-mfence_rdtsc-feature.patch
@@ -0,0 +1,60 @@
+From: Tom Lendacky <thomas.lendacky@amd.com>
+Date: Mon, 18 Dec 2017 14:47:53 +0100
+Subject: x86/CPU/AMD: Remove now unused definition of MFENCE_RDTSC feature
+Patch-mainline: submitted on 2018/1/9
+References: bsc#1068032
+
+With the switch to using LFENCE_RDTSC on AMD platforms there is no longer
+a need for the MFENCE_RDTSC feature. Remove its usage and definition.
+
+Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
+Signed-off-by: Borislav Petkov <bp@suse.de>
+---
+ arch/um/sys-i386/shared/sysdep/system.h | 1 -
+ arch/um/sys-x86_64/shared/sysdep/system.h | 1 -
+ arch/x86/include/asm/cpufeature.h | 2 +-
+ arch/x86/include/asm/system.h | 1 -
+ arch/x86/include/mach-xen/asm/system.h | 1 -
+ 5 files changed, 1 insertion(+), 5 deletions(-)
+
+--- a/arch/um/sys-i386/shared/sysdep/system.h
++++ b/arch/um/sys-i386/shared/sysdep/system.h
+@@ -125,7 +125,6 @@ void default_idle(void);
+ */
+ static inline void rdtsc_barrier(void)
+ {
+- alternative(ASM_NOP3, "mfence", X86_FEATURE_MFENCE_RDTSC);
+ alternative(ASM_NOP3, "lfence", X86_FEATURE_LFENCE_RDTSC);
+ }
+
+--- a/arch/um/sys-x86_64/shared/sysdep/system.h
++++ b/arch/um/sys-x86_64/shared/sysdep/system.h
+@@ -125,7 +125,6 @@ void default_idle(void);
+ */
+ static inline void rdtsc_barrier(void)
+ {
+- alternative(ASM_NOP3, "mfence", X86_FEATURE_MFENCE_RDTSC);
+ alternative(ASM_NOP3, "lfence", X86_FEATURE_LFENCE_RDTSC);
+ }
+
+--- a/arch/x86/include/asm/cpufeature.h
++++ b/arch/x86/include/asm/cpufeature.h
+@@ -85,7 +85,7 @@
+ #define X86_FEATURE_SYSCALL32 (3*32+14) /* "" syscall in ia32 userspace */
+ #define X86_FEATURE_SYSENTER32 (3*32+15) /* "" sysenter in ia32 userspace */
+ #define X86_FEATURE_REP_GOOD (3*32+16) /* rep microcode works well */
+-#define X86_FEATURE_MFENCE_RDTSC (3*32+17) /* "" Mfence synchronizes RDTSC */
++/* free, was #define X86_FEATURE_MFENCE_RDTSC (3*32+17) * "" Mfence synchronizes RDTSC */
+ #define X86_FEATURE_LFENCE_RDTSC (3*32+18) /* "" Lfence synchronizes RDTSC */
+ #define X86_FEATURE_11AP (3*32+19) /* "" Bad local APIC aka 11AP */
+ #define X86_FEATURE_NOPL (3*32+20) /* The NOPL (0F 1F) instructions */
+--- a/arch/x86/include/asm/system.h
++++ b/arch/x86/include/asm/system.h
+@@ -519,7 +519,6 @@ void stop_this_cpu(void *dummy);
+ */
+ static __always_inline void rdtsc_barrier(void)
+ {
+- alternative(ASM_NOP3, "mfence", X86_FEATURE_MFENCE_RDTSC);
+ alternative(ASM_NOP3, "lfence", X86_FEATURE_LFENCE_RDTSC);
+ }
+
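The rdtsc_barrier()+RDTSC pattern these two patches converge on is usable from user space as well; on AMD it relies on the DE_CFG bit set two patches earlier to make LFENCE dispatch-serializing. A minimal sketch using the compiler intrinsics:

    #include <stdio.h>
    #include <stdint.h>
    #include <x86intrin.h>  /* __rdtsc(), _mm_lfence() */

    /* User-space version of the rdtsc_barrier()+rdtsc pattern: the LFENCE keeps
     * the TSC read from being speculated ahead of earlier loads. */
    static inline uint64_t rdtsc_ordered_sketch(void)
    {
        _mm_lfence();
        return __rdtsc();
    }

    int main(void)
    {
        uint64_t t0 = rdtsc_ordered_sketch();
        uint64_t t1 = rdtsc_ordered_sketch();

        printf("delta = %llu cycles\n", (unsigned long long)(t1 - t0));
        return 0;
    }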
diff --git a/patches.suse/kaiser-0002-x86-mm-Add-INVPCID-helpers.patch b/patches.suse/kaiser-0002-x86-mm-Add-INVPCID-helpers.patch
new file mode 100644
index 0000000000..5b2df46c7f
--- /dev/null
+++ b/patches.suse/kaiser-0002-x86-mm-Add-INVPCID-helpers.patch
@@ -0,0 +1,93 @@
+From 07fbca3a5f1bbe42a39326daac0b7877a9f43eb2 Mon Sep 17 00:00:00 2001
+From: Andy Lutomirski <luto@kernel.org>
+Date: Fri, 29 Jan 2016 11:42:57 -0800
+Subject: [PATCH 02/19] x86/mm: Add INVPCID helpers
+References: bsc#1068032 CVE-2017-5754
+Git-commit: 060a402a1ddb551455ee410de2eadd3349f2801b
+Patch-mainline: v4.6-rc1
+
+This adds helpers for each of the four currently-specified INVPCID
+modes.
+
+Signed-off-by: Andy Lutomirski <luto@kernel.org>
+Reviewed-by: Borislav Petkov <bp@suse.de>
+Cc: Andrew Morton <akpm@linux-foundation.org>
+Cc: Andrey Ryabinin <aryabinin@virtuozzo.com>
+Cc: Andy Lutomirski <luto@amacapital.net>
+Cc: Borislav Petkov <bp@alien8.de>
+Cc: Brian Gerst <brgerst@gmail.com>
+Cc: Dave Hansen <dave.hansen@linux.intel.com>
+Cc: Denys Vlasenko <dvlasenk@redhat.com>
+Cc: H. Peter Anvin <hpa@zytor.com>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Luis R. Rodriguez <mcgrof@suse.com>
+Cc: Oleg Nesterov <oleg@redhat.com>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Thomas Gleixner <tglx@linutronix.de>
+Cc: Toshi Kani <toshi.kani@hp.com>
+Cc: linux-mm@kvack.org
+Link: http://lkml.kernel.org/r/8a62b23ad686888cee01da134c91409e22064db9.1454096309.git.luto@kernel.org
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Jiri Kosina <jkosina@suse.cz>
+(cherry picked from commit 060a402a1ddb551455ee410de2eadd3349f2801b)
+---
+ arch/x86/include/asm/tlbflush.h | 48 ++++++++++++++++++++++++++++++++++++++++
+ 1 file changed, 48 insertions(+)
+
+--- a/arch/x86/include/asm/tlbflush.h
++++ b/arch/x86/include/asm/tlbflush.h
+@@ -7,6 +7,54 @@
+ #include <asm/processor.h>
+ #include <asm/system.h>
+
++static inline void __invpcid(unsigned long pcid, unsigned long addr,
++ unsigned long type)
++{
++ u64 desc[2] = { pcid, addr };
++
++ /*
++ * The memory clobber is because the whole point is to invalidate
++ * stale TLB entries and, especially if we're flushing global
++ * mappings, we don't want the compiler to reorder any subsequent
++ * memory accesses before the TLB flush.
++ *
++ * The hex opcode is invpcid (%ecx), %eax in 32-bit mode and
++ * invpcid (%rcx), %rax in long mode.
++ */
++ asm volatile (".byte 0x66, 0x0f, 0x38, 0x82, 0x01"
++ : : "m" (desc), "a" (type), "c" (desc) : "memory");
++}
++
++#define INVPCID_TYPE_INDIV_ADDR 0
++#define INVPCID_TYPE_SINGLE_CTXT 1
++#define INVPCID_TYPE_ALL_INCL_GLOBAL 2
++#define INVPCID_TYPE_ALL_NON_GLOBAL 3
++
++/* Flush all mappings for a given pcid and addr, not including globals. */
++static inline void invpcid_flush_one(unsigned long pcid,
++ unsigned long addr)
++{
++ __invpcid(pcid, addr, INVPCID_TYPE_INDIV_ADDR);
++}
++
++/* Flush all mappings for a given PCID, not including globals. */
++static inline void invpcid_flush_single_context(unsigned long pcid)
++{
++ __invpcid(pcid, 0, INVPCID_TYPE_SINGLE_CTXT);
++}
++
++/* Flush all mappings, including globals, for all PCIDs. */
++static inline void invpcid_flush_all(void)
++{
++ __invpcid(0, 0, INVPCID_TYPE_ALL_INCL_GLOBAL);
++}
++
++/* Flush all mappings for all PCIDs except globals. */
++static inline void invpcid_flush_all_nonglobals(void)
++{
++ __invpcid(0, 0, INVPCID_TYPE_ALL_NON_GLOBAL);
++}
++
+ #ifdef CONFIG_PARAVIRT_MMU
+ #include <asm/paravirt.h>
+ #else
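INVPCID is a privileged instruction, so the helpers above can only ever run in the kernel, but whether the CPU offers it at all is visible from user space as CPUID.(EAX=7,ECX=0):EBX[10]. A minimal check with GCC's <cpuid.h>:

    #include <stdio.h>
    #include <cpuid.h>

    int main(void)
    {
        unsigned int eax, ebx, ecx, edx;

        if (__get_cpuid_max(0, NULL) < 7) {
            puts("CPUID leaf 7 not available");
            return 1;
        }

        __cpuid_count(7, 0, eax, ebx, ecx, edx);
        (void)eax; (void)ecx; (void)edx;    /* only EBX is interesting here */

        printf("INVPCID supported: %s\n", (ebx >> 10) & 1 ? "yes" : "no");
        return 0;
    }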
diff --git a/patches.suse/kaiser-0003-x86-mm-Fix-INVPCID-asm-constraint.patch b/patches.suse/kaiser-0003-x86-mm-Fix-INVPCID-asm-constraint.patch
new file mode 100644
index 0000000000..d64f7a40ca
--- /dev/null
+++ b/patches.suse/kaiser-0003-x86-mm-Fix-INVPCID-asm-constraint.patch
@@ -0,0 +1,67 @@
+From c50a6e7cdbe1169d0f37b42ed4d626e7b9d7e8b9 Mon Sep 17 00:00:00 2001
+From: Borislav Petkov <bp@suse.de>
+Date: Wed, 10 Feb 2016 15:51:16 +0100
+Subject: [PATCH 03/19] x86/mm: Fix INVPCID asm constraint
+References: bsc#1068032 CVE-2017-5754
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+Git-commit: e2c7698cd61f11d4077fdb28148b2d31b82ac848
+Patch-mainline: v4.6-rc1
+
+So we want to specify the dependency on both @pcid and @addr so that the
+compiler doesn't reorder accesses to them *before* the TLB flush. But
+for that to work, we need to express this properly in the inline asm and
+deref the whole desc array, not the pointer to it. See clwb() for an
+example.
+
+This fixes the build error on 32-bit:
+
+ arch/x86/include/asm/tlbflush.h: In function ‘__invpcid’:
+ arch/x86/include/asm/tlbflush.h:26:18: error: memory input 0 is not directly addressable
+
+which gcc4.7 caught but 5.x didn't. Which is strange. :-\
+
+Signed-off-by: Borislav Petkov <bp@suse.de>
+Cc: Andrew Morton <akpm@linux-foundation.org>
+Cc: Andrey Ryabinin <aryabinin@virtuozzo.com>
+Cc: Andy Lutomirski <luto@amacapital.net>
+Cc: Borislav Petkov <bp@alien8.de>
+Cc: Brian Gerst <brgerst@gmail.com>
+Cc: Dave Hansen <dave.hansen@linux.intel.com>
+Cc: Denys Vlasenko <dvlasenk@redhat.com>
+Cc: H. Peter Anvin <hpa@zytor.com>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Luis R. Rodriguez <mcgrof@suse.com>
+Cc: Michael Matz <matz@suse.de>
+Cc: Oleg Nesterov <oleg@redhat.com>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Thomas Gleixner <tglx@linutronix.de>
+Cc: Toshi Kani <toshi.kani@hp.com>
+Cc: linux-mm@kvack.org
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+(cherry picked from commit e2c7698cd61f11d4077fdb28148b2d31b82ac848)
+---
+ arch/x86/include/asm/tlbflush.h | 4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+--- a/arch/x86/include/asm/tlbflush.h
++++ b/arch/x86/include/asm/tlbflush.h
+@@ -10,7 +10,7 @@
+ static inline void __invpcid(unsigned long pcid, unsigned long addr,
+ unsigned long type)
+ {
+- u64 desc[2] = { pcid, addr };
++ struct { u64 d[2]; } desc = { { pcid, addr } };
+
+ /*
+ * The memory clobber is because the whole point is to invalidate
+@@ -22,7 +22,7 @@ static inline void __invpcid(unsigned lo
+ * invpcid (%rcx), %rax in long mode.
+ */
+ asm volatile (".byte 0x66, 0x0f, 0x38, 0x82, 0x01"
+- : : "m" (desc), "a" (type), "c" (desc) : "memory");
++ : : "m" (desc), "a" (type), "c" (&desc) : "memory");
+ }
+
+ #define INVPCID_TYPE_INDIV_ADDR 0
diff --git a/patches.suse/kaiser-0004-x86-mm-Add-a-noinvpcid-boot-option-to-turn-off-INVPC.patch b/patches.suse/kaiser-0004-x86-mm-Add-a-noinvpcid-boot-option-to-turn-off-INVPC.patch
new file mode 100644
index 0000000000..5b73b12cf4
--- /dev/null
+++ b/patches.suse/kaiser-0004-x86-mm-Add-a-noinvpcid-boot-option-to-turn-off-INVPC.patch
@@ -0,0 +1,75 @@
+From 86ff7797d0a9061d4d25c74a422a2ba64191edb2 Mon Sep 17 00:00:00 2001
+From: Andy Lutomirski <luto@kernel.org>
+Date: Fri, 29 Jan 2016 11:42:58 -0800
+Subject: [PATCH 04/19] x86/mm: Add a 'noinvpcid' boot option to turn off
+ INVPCID
+References: bsc#1068032 CVE-2017-5754
+Git-commit: d12a72b844a49d4162f24cefdab30bed3f86730e
+Patch-mainline: v4.6-rc1
+
+This adds a chicken bit to turn off INVPCID in case something goes
+wrong. It's an early_param() because we do TLB flushes before we
+parse __setup() parameters.
+
+Signed-off-by: Andy Lutomirski <luto@kernel.org>
+Reviewed-by: Borislav Petkov <bp@suse.de>
+Cc: Andrew Morton <akpm@linux-foundation.org>
+Cc: Andrey Ryabinin <aryabinin@virtuozzo.com>
+Cc: Andy Lutomirski <luto@amacapital.net>
+Cc: Borislav Petkov <bp@alien8.de>
+Cc: Brian Gerst <brgerst@gmail.com>
+Cc: Dave Hansen <dave.hansen@linux.intel.com>
+Cc: Denys Vlasenko <dvlasenk@redhat.com>
+Cc: H. Peter Anvin <hpa@zytor.com>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Luis R. Rodriguez <mcgrof@suse.com>
+Cc: Oleg Nesterov <oleg@redhat.com>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Thomas Gleixner <tglx@linutronix.de>
+Cc: Toshi Kani <toshi.kani@hp.com>
+Cc: linux-mm@kvack.org
+Link: http://lkml.kernel.org/r/f586317ed1bc2b87aee652267e515b90051af385.1454096309.git.luto@kernel.org
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Jiri Kosina <jkosina@suse.cz>
+(cherry picked from commit d12a72b844a49d4162f24cefdab30bed3f86730e)
+---
+ Documentation/kernel-parameters.txt | 2 ++
+ arch/x86/kernel/cpu/common.c | 16 ++++++++++++++++
+ 2 files changed, 18 insertions(+)
+
+--- a/arch/x86/kernel/cpu/common.c
++++ b/arch/x86/kernel/cpu/common.c
+@@ -155,6 +155,22 @@ static int __init x86_xsaveopt_setup(cha
+ }
+ __setup("noxsaveopt", x86_xsaveopt_setup);
+
++static int __init x86_noinvpcid_setup(char *s)
++{
++ /* noinvpcid doesn't accept parameters */
++ if (s)
++ return -EINVAL;
++
++ /* do not emit a message if the feature is not present */
++ if (!boot_cpu_has(X86_FEATURE_INVPCID))
++ return 0;
++
++ setup_clear_cpu_cap(X86_FEATURE_INVPCID);
++ pr_info("noinvpcid: INVPCID feature disabled\n");
++ return 0;
++}
++early_param("noinvpcid", x86_noinvpcid_setup);
++
+ #ifdef CONFIG_X86_32
+ static int cachesize_override __cpuinitdata = -1;
+ static int disable_x86_serial_nr __cpuinitdata = 1;
+--- a/Documentation/kernel-parameters.txt
++++ b/Documentation/kernel-parameters.txt
+@@ -1827,6 +1827,8 @@ bytes respectively. Such letter suffixes
+
+ nointroute [IA-64]
+
++ noinvpcid [X86] Disable the INVPCID cpu feature.
++
+ nojitter [IA64] Disables jitter checking for ITC timers.
+
+ no-kvmclock [X86,KVM] Disable paravirtualized KVM clock driver
diff --git a/patches.suse/kaiser-0005-x86-mm-If-INVPCID-is-available-use-it-to-flush-globa.patch b/patches.suse/kaiser-0005-x86-mm-If-INVPCID-is-available-use-it-to-flush-globa.patch
new file mode 100644
index 0000000000..fe0e4fcebb
--- /dev/null
+++ b/patches.suse/kaiser-0005-x86-mm-If-INVPCID-is-available-use-it-to-flush-globa.patch
@@ -0,0 +1,56 @@
+From 69d7c1048ae0d4f5315f569209396090cfc36bdd Mon Sep 17 00:00:00 2001
+From: Andy Lutomirski <luto@kernel.org>
+Date: Fri, 29 Jan 2016 11:42:59 -0800
+Subject: [PATCH 05/19] x86/mm: If INVPCID is available, use it to flush global
+ mappings
+References: bsc#1068032 CVE-2017-5754
+Git-commit: d8bced79af1db6734f66b42064cc773cada2ce99
+Patch-mainline: v4.6-rc1
+
+On my Skylake laptop, INVPCID function 2 (flush absolutely
+everything) takes about 376ns, whereas saving flags, twiddling
+CR4.PGE to flush global mappings, and restoring flags takes about
+539ns.
+
+Signed-off-by: Andy Lutomirski <luto@kernel.org>
+Reviewed-by: Borislav Petkov <bp@suse.de>
+Cc: Andrew Morton <akpm@linux-foundation.org>
+Cc: Andrey Ryabinin <aryabinin@virtuozzo.com>
+Cc: Andy Lutomirski <luto@amacapital.net>
+Cc: Borislav Petkov <bp@alien8.de>
+Cc: Brian Gerst <brgerst@gmail.com>
+Cc: Dave Hansen <dave.hansen@linux.intel.com>
+Cc: Denys Vlasenko <dvlasenk@redhat.com>
+Cc: H. Peter Anvin <hpa@zytor.com>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Luis R. Rodriguez <mcgrof@suse.com>
+Cc: Oleg Nesterov <oleg@redhat.com>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Thomas Gleixner <tglx@linutronix.de>
+Cc: Toshi Kani <toshi.kani@hp.com>
+Cc: linux-mm@kvack.org
+Link: http://lkml.kernel.org/r/ed0ef62581c0ea9c99b9bf6df726015e96d44743.1454096309.git.luto@kernel.org
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+(cherry picked from commit d8bced79af1db6734f66b42064cc773cada2ce99)
+---
+ arch/x86/include/asm/tlbflush.h | 9 +++++++++
+ 1 file changed, 9 insertions(+)
+
+--- a/arch/x86/include/asm/tlbflush.h
++++ b/arch/x86/include/asm/tlbflush.h
+@@ -73,6 +73,15 @@ static inline void __native_flush_tlb_gl
+ unsigned long flags;
+ unsigned long cr4;
+
++ if (static_cpu_has(X86_FEATURE_INVPCID)) {
++ /*
++ * Using INVPCID is considerably faster than a pair of writes
++ * to CR4 sandwiched inside an IRQ flag save/restore.
++ */
++ invpcid_flush_all();
++ return;
++ }
++
+ /*
+ * Read-modify-write to CR4 - protect it from preemption and
+ * from interrupts. (Use the raw variant because this code can
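The 376 ns vs. 539 ns comparison quoted in the message is the kind of figure a simple TSC loop produces. Below is a user-space sketch of that measurement approach only; the privileged INVPCID and CR4 sequences cannot run in ring 3, so a compiler-barrier no-op stands in for the flush being timed, and converting cycles to nanoseconds still requires the TSC frequency:

    #include <stdio.h>
    #include <stdint.h>
    #include <x86intrin.h>  /* __rdtsc() */

    /* Placeholder for the privileged operation being measured. */
    static inline void flush_under_test(void)
    {
        asm volatile("" ::: "memory");
    }

    int main(void)
    {
        enum { ITERS = 1000000 };
        uint64_t start, end;
        int i;

        start = __rdtsc();
        for (i = 0; i < ITERS; i++)
            flush_under_test();
        end = __rdtsc();

        printf("%.1f cycles/iteration\n", (double)(end - start) / ITERS);
        return 0;
    }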
diff --git a/patches.suse/kaiser-0006-mm-mmu_context-sched-core-Fix-mmu_context.h-assumpti.patch b/patches.suse/kaiser-0006-mm-mmu_context-sched-core-Fix-mmu_context.h-assumpti.patch
new file mode 100644
index 0000000000..af10c42b9c
--- /dev/null
+++ b/patches.suse/kaiser-0006-mm-mmu_context-sched-core-Fix-mmu_context.h-assumpti.patch
@@ -0,0 +1,40 @@
+From 8918faa890790aa8c144c780afcc2a2848aaf407 Mon Sep 17 00:00:00 2001
+From: Ingo Molnar <mingo@kernel.org>
+Date: Thu, 28 Apr 2016 11:39:12 +0200
+Subject: [PATCH 06/19] mm/mmu_context, sched/core: Fix mmu_context.h
+ assumption
+References: bsc#1068032 CVE-2017-5754
+Git-commit: 8efd755ac2fe262d4c8d5c9bbe054bb67dae93da
+Patch-mainline: v4.7-rc1
+
+Some architectures (such as Alpha) rely on include/linux/sched.h definitions
+in their mmu_context.h files.
+
+So include sched.h before mmu_context.h.
+
+Cc: Andy Lutomirski <luto@kernel.org>
+Cc: Borislav Petkov <bp@alien8.de>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: linux-kernel@vger.kernel.org
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Thomas Gleixner <tglx@linutronix.de>
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Jiri Kosina <jkosina@suse.cz>
+(cherry picked from commit 8efd755ac2fe262d4c8d5c9bbe054bb67dae93da)
+---
+ mm/mmu_context.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/mm/mmu_context.c
++++ b/mm/mmu_context.c
+@@ -4,9 +4,9 @@
+ */
+
+ #include <linux/mm.h>
++#include <linux/sched.h>
+ #include <linux/mmu_context.h>
+ #include <linux/module.h>
+-#include <linux/sched.h>
+
+ #include <asm/mmu_context.h>
+
diff --git a/patches.suse/kaiser-0007-sched-core-Add-switch_mm_irqs_off-and-use-it-in-the-.patch b/patches.suse/kaiser-0007-sched-core-Add-switch_mm_irqs_off-and-use-it-in-the-.patch
new file mode 100644
index 0000000000..9435689182
--- /dev/null
+++ b/patches.suse/kaiser-0007-sched-core-Add-switch_mm_irqs_off-and-use-it-in-the-.patch
@@ -0,0 +1,76 @@
+From 5d6a097bb91c179eaf82b81f17546f5e3db0e1a5 Mon Sep 17 00:00:00 2001
+From: Andy Lutomirski <luto@kernel.org>
+Date: Tue, 26 Apr 2016 09:39:06 -0700
+Subject: [PATCH 07/19] sched/core: Add switch_mm_irqs_off() and use it in the
+ scheduler
+References: bsc#1068032 CVE-2017-5754
+Git-commit: f98db6013c557c216da5038d9c52045be55cd039
+Patch-mainline: v4.7-rc1
+
+By default, this is the same thing as switch_mm().
+
+x86 will override it as an optimization.
+
+Signed-off-by: Andy Lutomirski <luto@kernel.org>
+Reviewed-by: Borislav Petkov <bp@suse.de>
+Cc: Borislav Petkov <bp@alien8.de>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Thomas Gleixner <tglx@linutronix.de>
+Link: http://lkml.kernel.org/r/df401df47bdd6be3e389c6f1e3f5310d70e81b2c.1461688545.git.luto@kernel.org
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Jiri Kosina <jkosina@suse.cz>
+(cherry picked from commit f98db6013c557c216da5038d9c52045be55cd039)
+---
+ include/linux/mmu_context.h | 7 +++++++
+ kernel/sched.c | 6 +++---
+ 2 files changed, 10 insertions(+), 3 deletions(-)
+
+--- a/include/linux/mmu_context.h
++++ b/include/linux/mmu_context.h
+@@ -1,9 +1,16 @@
+ #ifndef _LINUX_MMU_CONTEXT_H
+ #define _LINUX_MMU_CONTEXT_H
+
++#include <asm/mmu_context.h>
++
+ struct mm_struct;
+
+ void use_mm(struct mm_struct *mm);
+ void unuse_mm(struct mm_struct *mm);
+
++/* Architectures that care about IRQ state in switch_mm can override this. */
++#ifndef switch_mm_irqs_off
++# define switch_mm_irqs_off switch_mm
++#endif
++
+ #endif
+--- a/kernel/sched.c
++++ b/kernel/sched.c
+@@ -32,7 +32,7 @@
+ #include <linux/init.h>
+ #include <linux/uaccess.h>
+ #include <linux/highmem.h>
+-#include <asm/mmu_context.h>
++#include <linux/mmu_context.h>
+ #include <linux/interrupt.h>
+ #include <linux/capability.h>
+ #include <linux/completion.h>
+@@ -3444,7 +3444,7 @@ context_switch(struct rq *rq, struct tas
+ atomic_inc(&oldmm->mm_count);
+ enter_lazy_tlb(oldmm, next);
+ } else
+- switch_mm(oldmm, mm, next);
++ switch_mm_irqs_off(oldmm, mm, next);
+
+ if (!prev->mm) {
+ prev->active_mm = NULL;
+@@ -6554,7 +6554,7 @@ void idle_task_exit(void)
+ BUG_ON(cpu_online(smp_processor_id()));
+
+ if (mm != &init_mm)
+- switch_mm(mm, &init_mm, current);
++ switch_mm_irqs_off(mm, &init_mm, current);
+ mmdrop(mm);
+ }
+
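The `#ifndef switch_mm_irqs_off` fallback added to the generic header relies on a common pattern: the generic header supplies a default only when no architecture header has already defined the name, which is how the x86 override in the later patches takes effect. A small stand-alone illustration; op_irqs_off, arch_op and generic_op are made-up names:

    #include <stdio.h>

    static void generic_op(void) { printf("generic implementation\n"); }
    static void arch_op(void)    { printf("arch-optimized implementation\n"); }

    /* "Architecture header": comment this out to fall back to the default. */
    #define op_irqs_off arch_op

    /* "Generic header": only provides a default when nobody overrode it. */
    #ifndef op_irqs_off
    # define op_irqs_off generic_op
    #endif

    int main(void)
    {
        op_irqs_off();
        return 0;
    }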
diff --git a/patches.suse/kaiser-0008-x86-mm-Build-arch-x86-mm-tlb.c-even-on-SMP.patch b/patches.suse/kaiser-0008-x86-mm-Build-arch-x86-mm-tlb.c-even-on-SMP.patch
new file mode 100644
index 0000000000..6b5fd25558
--- /dev/null
+++ b/patches.suse/kaiser-0008-x86-mm-Build-arch-x86-mm-tlb.c-even-on-SMP.patch
@@ -0,0 +1,64 @@
+From 5398f46bba118518895700ff76a3ebafb5e62ce2 Mon Sep 17 00:00:00 2001
+From: Andy Lutomirski <luto@kernel.org>
+Date: Tue, 26 Apr 2016 09:39:07 -0700
+Subject: [PATCH 08/19] x86/mm: Build arch/x86/mm/tlb.c even on !SMP
+References: bsc#1068032 CVE-2017-5754
+Git-commit: e1074888c326038340a1ada9129d679e661f2ea6
+Patch-mainline: v4.7-rc1
+
+Currently all of the functions that live in tlb.c are inlined on
+!SMP builds. One can debate whether this is a good idea (in many
+respects the code in tlb.c is better than the inlined UP code).
+
+Regardless, I want to add code that needs to be built on UP and SMP
+kernels and relates to tlb flushing, so arrange for tlb.c to be
+compiled unconditionally.
+
+Signed-off-by: Andy Lutomirski <luto@kernel.org>
+Reviewed-by: Borislav Petkov <bp@suse.de>
+Cc: Borislav Petkov <bp@alien8.de>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Thomas Gleixner <tglx@linutronix.de>
+Link: http://lkml.kernel.org/r/f0d778f0d828fc46e5d1946bca80f0aaf9abf032.1461688545.git.luto@kernel.org
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+(cherry picked from commit e1074888c326038340a1ada9129d679e661f2ea6)
+---
+ arch/x86/mm/Makefile | 3 +--
+ arch/x86/mm/tlb.c | 4 ++++
+ 2 files changed, 5 insertions(+), 2 deletions(-)
+
+--- a/arch/x86/mm/Makefile
++++ b/arch/x86/mm/Makefile
+@@ -1,5 +1,5 @@
+ obj-y := init.o init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \
+- pat.o pgtable.o physaddr.o gup.o setup_nx.o
++ pat.o pgtable.o physaddr.o gup.o setup_nx.o tlb.o
+
+ # Make sure __phys_addr has no stackprotector
+ nostackp := $(call cc-option, -fno-stack-protector)
+@@ -7,7 +7,6 @@ CFLAGS_physaddr.o := $(nostackp)
+ CFLAGS_setup_nx.o := $(nostackp)
+
+ obj-$(CONFIG_X86_PAT) += pat_rbtree.o
+-obj-$(CONFIG_SMP) += tlb.o
+
+ obj-$(CONFIG_X86_32) += pgtable_32.o iomap_32.o
+
+--- a/arch/x86/mm/tlb.c
++++ b/arch/x86/mm/tlb.c
+@@ -38,6 +38,8 @@ DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb
+ * fixed, at the cost of triggering multiple IPIs in some cases.
+ */
+
++#ifdef CONFIG_SMP
++
+ union smp_flush_state {
+ struct {
+ struct mm_struct *flush_mm;
+@@ -350,3 +352,5 @@ void flush_tlb_all(void)
+ {
+ on_each_cpu(do_flush_tlb_all, NULL, 1);
+ }
++
++#endif /* CONFIG_SMP */
diff --git a/patches.suse/kaiser-0009-x86-mm-sched-core-Uninline-switch_mm.patch b/patches.suse/kaiser-0009-x86-mm-sched-core-Uninline-switch_mm.patch
new file mode 100644
index 0000000000..46a3171d4d
--- /dev/null
+++ b/patches.suse/kaiser-0009-x86-mm-sched-core-Uninline-switch_mm.patch
@@ -0,0 +1,193 @@
+From 74e28ebd9c306dd4e3101233a25f2ce2294ec49b Mon Sep 17 00:00:00 2001
+From: Andy Lutomirski <luto@kernel.org>
+Date: Tue, 26 Apr 2016 09:39:08 -0700
+Subject: [PATCH 09/19] x86/mm, sched/core: Uninline switch_mm()
+References: bsc#1068032 CVE-2017-5754
+Git-commit: 69c0319aabba45bcf33178916a2f06967b4adede
+Patch-mainline: v4.7-rc1
+
+It's fairly large and it has quite a few callers. This may also
+help untangle some headers down the road.
+
+Signed-off-by: Andy Lutomirski <luto@kernel.org>
+Reviewed-by: Borislav Petkov <bp@suse.de>
+Cc: Borislav Petkov <bp@alien8.de>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Thomas Gleixner <tglx@linutronix.de>
+Link: http://lkml.kernel.org/r/54f3367803e7f80b2be62c8a21879aa74b1a5f57.1461688545.git.luto@kernel.org
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Jiri Kosina <jkosina@suse.cz>
+(cherry picked from commit 69c0319aabba45bcf33178916a2f06967b4adede)
+
+Conflicts:
+arch/x86/include/asm/mmu_context.h
+---
+ arch/x86/include/asm/mmu_context.h | 71 -----------------------------------
+ arch/x86/mm/tlb.c | 75 +++++++++++++++++++++++++++++++++++++
+ 2 files changed, 77 insertions(+), 69 deletions(-)
+
+--- a/arch/x86/include/asm/mmu_context.h
++++ b/arch/x86/include/asm/mmu_context.h
+@@ -74,75 +74,8 @@ static inline void enter_lazy_tlb(struct
+ #endif
+ }
+
+-static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
+- struct task_struct *tsk)
+-{
+- unsigned cpu = smp_processor_id();
+-
+- if (likely(prev != next)) {
+-#ifdef CONFIG_SMP
+- percpu_write(cpu_tlbstate.state, TLBSTATE_OK);
+- percpu_write(cpu_tlbstate.active_mm, next);
+-#endif
+- cpumask_set_cpu(cpu, mm_cpumask(next));
+-
+- /*
+- * Re-load page tables.
+- *
+- * This logic has an ordering constraint:
+- *
+- * CPU 0: Write to a PTE for 'next'
+- * CPU 0: load bit 1 in mm_cpumask. if nonzero, send IPI.
+- * CPU 1: set bit 1 in next's mm_cpumask
+- * CPU 1: load from the PTE that CPU 0 writes (implicit)
+- *
+- * We need to prevent an outcome in which CPU 1 observes
+- * the new PTE value and CPU 0 observes bit 1 clear in
+- * mm_cpumask. (If that occurs, then the IPI will never
+- * be sent, and CPU 0's TLB will contain a stale entry.)
+- *
+- * The bad outcome can occur if either CPU's load is
+- * reordered before that CPU's store, so both CPUs much
+- * execute full barriers to prevent this from happening.
+- *
+- * Thus, switch_mm needs a full barrier between the
+- * store to mm_cpumask and any operation that could load
+- * from next->pgd. This barrier synchronizes with
+- * remote TLB flushers. Fortunately, load_cr3 is
+- * serializing and thus acts as a full barrier.
+- *
+- */
+- load_cr3(next->pgd);
+-
+- /* stop flush ipis for the previous mm */
+- cpumask_clear_cpu(cpu, mm_cpumask(prev));
+-
+- /*
+- * load the LDT, if the LDT is different:
+- */
+- if (unlikely(prev->context.ldt != next->context.ldt))
+- load_mm_ldt(next);
+- }
+-#ifdef CONFIG_SMP
+- else {
+- percpu_write(cpu_tlbstate.state, TLBSTATE_OK);
+- BUG_ON(percpu_read(cpu_tlbstate.active_mm) != next);
+-
+- if (!cpumask_test_and_set_cpu(cpu, mm_cpumask(next))) {
+- /* We were in lazy tlb mode and leave_mm disabled
+- * tlb flush IPI delivery. We must reload CR3
+- * to make sure to use no freed page tables.
+- *
+- * As above, this is a barrier that forces
+- * TLB repopulation to be ordered after the
+- * store to mm_cpumask.
+- */
+- load_cr3(next->pgd);
+- load_mm_ldt(next);
+- }
+- }
+-#endif
+-}
++extern void switch_mm(struct mm_struct *prev, struct mm_struct *next,
++ struct task_struct *tsk);
+
+ #define activate_mm(prev, next) \
+ do { \
+--- a/arch/x86/mm/tlb.c
++++ b/arch/x86/mm/tlb.c
+@@ -71,6 +71,81 @@ void leave_mm(int cpu)
+ }
+ EXPORT_SYMBOL_GPL(leave_mm);
+
++#endif /* CONFIG_SMP */
++
++void switch_mm(struct mm_struct *prev, struct mm_struct *next,
++ struct task_struct *tsk)
++{
++ unsigned cpu = smp_processor_id();
++
++ if (likely(prev != next)) {
++#ifdef CONFIG_SMP
++ percpu_write(cpu_tlbstate.state, TLBSTATE_OK);
++ percpu_write(cpu_tlbstate.active_mm, next);
++#endif
++ cpumask_set_cpu(cpu, mm_cpumask(next));
++
++ /*
++ * Re-load page tables.
++ *
++ * This logic has an ordering constraint:
++ *
++ * CPU 0: Write to a PTE for 'next'
++ * CPU 0: load bit 1 in mm_cpumask. if nonzero, send IPI.
++ * CPU 1: set bit 1 in next's mm_cpumask
++ * CPU 1: load from the PTE that CPU 0 writes (implicit)
++ *
++ * We need to prevent an outcome in which CPU 1 observes
++ * the new PTE value and CPU 0 observes bit 1 clear in
++ * mm_cpumask. (If that occurs, then the IPI will never
++ * be sent, and CPU 0's TLB will contain a stale entry.)
++ *
++ * The bad outcome can occur if either CPU's load is
++ * reordered before that CPU's store, so both CPUs must
++ * execute full barriers to prevent this from happening.
++ *
++ * Thus, switch_mm needs a full barrier between the
++ * store to mm_cpumask and any operation that could load
++ * from next->pgd. TLB fills are special and can happen
++ * due to instruction fetches or for no reason at all,
++ * and neither LOCK nor MFENCE orders them.
++ * Fortunately, load_cr3() is serializing and gives the
++ * ordering guarantee we need.
++ *
++ */
++ load_cr3(next->pgd);
++
++ /* stop flush ipis for the previous mm */
++ cpumask_clear_cpu(cpu, mm_cpumask(prev));
++
++ /*
++ * load the LDT, if the LDT is different:
++ */
++ if (unlikely(prev->context.ldt != next->context.ldt))
++ load_mm_ldt(next);
++ }
++#ifdef CONFIG_SMP
++ else {
++ percpu_write(cpu_tlbstate.state, TLBSTATE_OK);
++ BUG_ON(percpu_read(cpu_tlbstate.active_mm) != next);
++
++ if (!cpumask_test_and_set_cpu(cpu, mm_cpumask(next))) {
++ /* We were in lazy tlb mode and leave_mm disabled
++ * tlb flush IPI delivery. We must reload CR3
++ * to make sure to use no freed page tables.
++ *
++ * As above, load_cr3() is serializing and orders TLB
++ * fills with respect to the mm_cpumask write.
++ */
++ load_cr3(next->pgd);
++ load_mm_ldt(next);
++ }
++ }
++#endif
++}
++
++#ifdef CONFIG_SMP
++
+ /*
+ *
+ * The flush IPI assumes that a thread switch happens in this order:
diff --git a/patches.suse/kaiser-0010-x86-mm-sched-core-Turn-off-IRQs-in-switch_mm.patch b/patches.suse/kaiser-0010-x86-mm-sched-core-Turn-off-IRQs-in-switch_mm.patch
new file mode 100644
index 0000000000..77703f451c
--- /dev/null
+++ b/patches.suse/kaiser-0010-x86-mm-sched-core-Turn-off-IRQs-in-switch_mm.patch
@@ -0,0 +1,66 @@
+From 656ec5851bc05fc918f99ad557e1a113b7267792 Mon Sep 17 00:00:00 2001
+From: Andy Lutomirski <luto@kernel.org>
+Date: Tue, 26 Apr 2016 09:39:09 -0700
+Subject: [PATCH 10/19] x86/mm, sched/core: Turn off IRQs in switch_mm()
+References: bsc#1068032 CVE-2017-5754
+Git-commit: 078194f8e9fe3cf54c8fd8bded48a1db5bd8eb8a
+Patch-mainline: v4.7-rc1
+
+Potential races between switch_mm() and TLB-flush or LDT-flush IPIs
+could be very messy. AFAICT the code is currently okay, whether by
+accident or by careful design, but enabling PCID will make it
+considerably more complicated and will no longer be obviously safe.
+
+Fix it with a big hammer: run switch_mm() with IRQs off.
+
+To avoid a performance hit in the scheduler, we take advantage of
+our knowledge that the scheduler already has IRQs disabled when it
+calls switch_mm().
+
+Signed-off-by: Andy Lutomirski <luto@kernel.org>
+Reviewed-by: Borislav Petkov <bp@suse.de>
+Cc: Borislav Petkov <bp@alien8.de>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Thomas Gleixner <tglx@linutronix.de>
+Link: http://lkml.kernel.org/r/f19baf759693c9dcae64bbff76189db77cb13398.1461688545.git.luto@kernel.org
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Jiri Kosina <jkosina@suse.cz>
+(cherry picked from commit 078194f8e9fe3cf54c8fd8bded48a1db5bd8eb8a)
+---
+ arch/x86/include/asm/mmu_context.h | 4 ++++
+ arch/x86/mm/tlb.c | 10 ++++++++++
+ 2 files changed, 14 insertions(+)
+
+--- a/arch/x86/include/asm/mmu_context.h
++++ b/arch/x86/include/asm/mmu_context.h
+@@ -77,6 +77,10 @@ static inline void enter_lazy_tlb(struct
+ extern void switch_mm(struct mm_struct *prev, struct mm_struct *next,
+ struct task_struct *tsk);
+
++extern void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
++ struct task_struct *tsk);
++#define switch_mm_irqs_off switch_mm_irqs_off
++
+ #define activate_mm(prev, next) \
+ do { \
+ paravirt_activate_mm((prev), (next)); \
+--- a/arch/x86/mm/tlb.c
++++ b/arch/x86/mm/tlb.c
+@@ -76,6 +76,16 @@ EXPORT_SYMBOL_GPL(leave_mm);
+ void switch_mm(struct mm_struct *prev, struct mm_struct *next,
+ struct task_struct *tsk)
+ {
++ unsigned long flags;
++
++ local_irq_save(flags);
++ switch_mm_irqs_off(prev, next, tsk);
++ local_irq_restore(flags);
++}
++
++void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
++ struct task_struct *tsk)
++{
+ unsigned cpu = smp_processor_id();
+
+ if (likely(prev != next)) {
diff --git a/patches.suse/kaiser-0011-sched-core-Idle_task_exit-shouldn-t-use-switch_mm_ir.patch b/patches.suse/kaiser-0011-sched-core-Idle_task_exit-shouldn-t-use-switch_mm_ir.patch
new file mode 100644
index 0000000000..05cf5058c4
--- /dev/null
+++ b/patches.suse/kaiser-0011-sched-core-Idle_task_exit-shouldn-t-use-switch_mm_ir.patch
@@ -0,0 +1,44 @@
+From 4a6217cb06ee7ea3ceaeb1e77f07f515e5cc7fae Mon Sep 17 00:00:00 2001
+From: Andy Lutomirski <luto@kernel.org>
+Date: Fri, 9 Jun 2017 11:49:15 -0700
+Subject: [PATCH 11/19] sched/core: Idle_task_exit() shouldn't use
+References: bsc#1068032 CVE-2017-5754
+ switch_mm_irqs_off()
+Git-commit: 252d2a4117bc181b287eeddf848863788da733ae
+Patch-mainline: v4.12-rc6
+
+idle_task_exit() can be called on x86 with IRQs on and therefore
+should use switch_mm(), not switch_mm_irqs_off().
+
+This doesn't seem to cause any problems right now, but it will
+confuse my upcoming TLB flush changes. Nonetheless, I think it
+should be backported because it's trivial. There won't be any
+meaningful performance impact because idle_task_exit() is only
+used when offlining a CPU.
+
+Signed-off-by: Andy Lutomirski <luto@kernel.org>
+Cc: Borislav Petkov <bp@suse.de>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Thomas Gleixner <tglx@linutronix.de>
+Cc: stable@vger.kernel.org
+Fixes: f98db6013c55 ("sched/core: Add switch_mm_irqs_off() and use it in the scheduler")
+Link: http://lkml.kernel.org/r/ca3d1a9fa93a0b49f5a8ff729eda3640fb6abdf9.1497034141.git.luto@kernel.org
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Jiri Kosina <jkosina@suse.cz>
+(cherry picked from commit 252d2a4117bc181b287eeddf848863788da733ae)
+---
+ kernel/sched.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/kernel/sched.c
++++ b/kernel/sched.c
+@@ -6554,7 +6554,7 @@ void idle_task_exit(void)
+ BUG_ON(cpu_online(smp_processor_id()));
+
+ if (mm != &init_mm)
+- switch_mm_irqs_off(mm, &init_mm, current);
++ switch_mm(mm, &init_mm, current);
+ mmdrop(mm);
+ }
+
diff --git a/patches.suse/kaiser-0012-x86-mm-Remove-the-UP-asm-tlbflush.h-code-always-use-.patch b/patches.suse/kaiser-0012-x86-mm-Remove-the-UP-asm-tlbflush.h-code-always-use-.patch
new file mode 100644
index 0000000000..388e165fad
--- /dev/null
+++ b/patches.suse/kaiser-0012-x86-mm-Remove-the-UP-asm-tlbflush.h-code-always-use-.patch
@@ -0,0 +1,240 @@
+From b9731f42739fa8378edd64342a4529a76cb4aec2 Mon Sep 17 00:00:00 2001
+From: Andy Lutomirski <luto@kernel.org>
+Date: Sun, 28 May 2017 10:00:14 -0700
+Subject: [PATCH 12/19] x86/mm: Remove the UP asm/tlbflush.h code, always use
+References: bsc#1068032 CVE-2017-5754
+ the (formerly) SMP code
+Git-commit: ce4a4e565f5264909a18c733b864c3f74467f69e
+Patch-mainline: v4.13-rc1
+
+The UP asm/tlbflush.h generates somewhat nicer code than the SMP version.
+Aside from that, it's fallen quite a bit behind the SMP code:
+
+ - flush_tlb_mm_range() didn't flush individual pages if the range
+ was small.
+
+ - The lazy TLB code was much weaker. This usually wouldn't matter,
+ but, if a kernel thread flushed its lazy "active_mm" more than
+ once (due to reclaim or similar), it wouldn't be unlazied and
+ would instead pointlessly flush repeatedly.
+
+ - Tracepoints were missing.
+
+Aside from that, simply having the UP code around was a maintenance
+burden, since it meant that any change to the TLB flush code had to
+make sure not to break it.
+
+Simplify everything by deleting the UP code.
+
+Signed-off-by: Andy Lutomirski <luto@kernel.org>
+Cc: Andrew Morton <akpm@linux-foundation.org>
+Cc: Arjan van de Ven <arjan@linux.intel.com>
+Cc: Borislav Petkov <bpetkov@suse.de>
+Cc: Dave Hansen <dave.hansen@intel.com>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Mel Gorman <mgorman@suse.de>
+Cc: Michal Hocko <mhocko@suse.com>
+Cc: Nadav Amit <nadav.amit@gmail.com>
+Cc: Nadav Amit <namit@vmware.com>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Rik van Riel <riel@redhat.com>
+Cc: Thomas Gleixner <tglx@linutronix.de>
+Cc: linux-mm@kvack.org
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Jiri Kosina <jkosina@suse.cz>
+(cherry picked from commit ce4a4e565f5264909a18c733b864c3f74467f69e)
+
+Conflicts:
+arch/x86/Kconfig
+arch/x86/include/asm/tlbbatch.h
+arch/x86/include/asm/tlbflush.h
+arch/x86/mm/tlb.c
+---
+ arch/x86/include/asm/hardirq.h | 2 -
+ arch/x86/include/asm/mmu.h | 6 ----
+ arch/x86/include/asm/mmu_context.h | 2 -
+ arch/x86/include/asm/tlbflush.h | 47 -------------------------------------
+ arch/x86/mm/tlb.c | 17 +------------
+ 5 files changed, 4 insertions(+), 70 deletions(-)
+
+--- a/arch/x86/include/asm/hardirq.h
++++ b/arch/x86/include/asm/hardirq.h
+@@ -21,8 +21,8 @@ typedef struct {
+ #ifdef CONFIG_SMP
+ unsigned int irq_resched_count;
+ unsigned int irq_call_count;
+- unsigned int irq_tlb_count;
+ #endif
++ unsigned int irq_tlb_count;
+ #ifdef CONFIG_X86_THERMAL_VECTOR
+ unsigned int irq_thermal_count;
+ #endif
+--- a/arch/x86/include/asm/mmu_context.h
++++ b/arch/x86/include/asm/mmu_context.h
+@@ -68,10 +68,8 @@ void destroy_context(struct mm_struct *m
+
+ static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
+ {
+-#ifdef CONFIG_SMP
+ if (percpu_read(cpu_tlbstate.state) == TLBSTATE_OK)
+ percpu_write(cpu_tlbstate.state, TLBSTATE_LAZY);
+-#endif
+ }
+
+ extern void switch_mm(struct mm_struct *prev, struct mm_struct *next,
+--- a/arch/x86/include/asm/mmu.h
++++ b/arch/x86/include/asm/mmu.h
+@@ -21,12 +21,6 @@ typedef struct {
+ void *vdso;
+ } mm_context_t;
+
+-#ifdef CONFIG_SMP
+ void leave_mm(int cpu);
+-#else
+-static inline void leave_mm(int cpu)
+-{
+-}
+-#endif
+
+ #endif /* _ASM_X86_MMU_H */
+--- a/arch/x86/include/asm/tlbflush.h
++++ b/arch/x86/include/asm/tlbflush.h
+@@ -6,6 +6,7 @@
+
+ #include <asm/processor.h>
+ #include <asm/system.h>
++#include <asm/smp.h>
+
+ static inline void __invpcid(unsigned long pcid, unsigned long addr,
+ unsigned long type)
+@@ -138,52 +139,8 @@ static inline void __flush_tlb_one(unsig
+ *
+ * ..but the i386 has somewhat limited tlb flushing capabilities,
+ * and page-granular flushes are available only on i486 and up.
+- *
+- * x86-64 can only flush individual pages or full VMs. For a range flush
+- * we always do the full VM. Might be worth trying if for a small
+- * range a few INVLPGs in a row are a win.
+ */
+
+-#ifndef CONFIG_SMP
+-
+-#define flush_tlb() __flush_tlb()
+-#define flush_tlb_all() __flush_tlb_all()
+-#define local_flush_tlb() __flush_tlb()
+-
+-static inline void flush_tlb_mm(struct mm_struct *mm)
+-{
+- if (mm == current->active_mm)
+- __flush_tlb();
+-}
+-
+-static inline void flush_tlb_page(struct vm_area_struct *vma,
+- unsigned long addr)
+-{
+- if (vma->vm_mm == current->active_mm)
+- __flush_tlb_one(addr);
+-}
+-
+-static inline void flush_tlb_range(struct vm_area_struct *vma,
+- unsigned long start, unsigned long end)
+-{
+- if (vma->vm_mm == current->active_mm)
+- __flush_tlb();
+-}
+-
+-static inline void native_flush_tlb_others(const struct cpumask *cpumask,
+- struct mm_struct *mm,
+- unsigned long va)
+-{
+-}
+-
+-static inline void reset_lazy_tlbstate(void)
+-{
+-}
+-
+-#else /* SMP */
+-
+-#include <asm/smp.h>
+-
+ #define local_flush_tlb() __flush_tlb()
+
+ extern void flush_tlb_all(void);
+@@ -217,8 +174,6 @@ static inline void reset_lazy_tlbstate(v
+ percpu_write(cpu_tlbstate.active_mm, &init_mm);
+ }
+
+-#endif /* SMP */
+-
+ #ifndef CONFIG_PARAVIRT_MMU
+ #define flush_tlb_others(mask, mm, va) native_flush_tlb_others(mask, mm, va)
+ #endif
+--- a/arch/x86/mm/tlb.c
++++ b/arch/x86/mm/tlb.c
+@@ -17,7 +17,7 @@ DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb
+ = { &init_mm, 0, };
+
+ /*
+- * Smarter SMP flushing macros.
++ * TLB flushing, formerly SMP-only
+ * c/o Linus Torvalds.
+ *
+ * These mean you can really definitely utterly forget about
+@@ -38,8 +38,6 @@ DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb
+ * fixed, at the cost of triggering multiple IPIs in some cases.
+ */
+
+-#ifdef CONFIG_SMP
+-
+ union smp_flush_state {
+ struct {
+ struct mm_struct *flush_mm;
+@@ -71,8 +69,6 @@ void leave_mm(int cpu)
+ }
+ EXPORT_SYMBOL_GPL(leave_mm);
+
+-#endif /* CONFIG_SMP */
+-
+ void switch_mm(struct mm_struct *prev, struct mm_struct *next,
+ struct task_struct *tsk)
+ {
+@@ -89,10 +85,8 @@ void switch_mm_irqs_off(struct mm_struct
+ unsigned cpu = smp_processor_id();
+
+ if (likely(prev != next)) {
+-#ifdef CONFIG_SMP
+ percpu_write(cpu_tlbstate.state, TLBSTATE_OK);
+ percpu_write(cpu_tlbstate.active_mm, next);
+-#endif
+ cpumask_set_cpu(cpu, mm_cpumask(next));
+
+ /*
+@@ -133,9 +127,7 @@ void switch_mm_irqs_off(struct mm_struct
+ */
+ if (unlikely(prev->context.ldt != next->context.ldt))
+ load_mm_ldt(next);
+- }
+-#ifdef CONFIG_SMP
+- else {
++ } else {
+ percpu_write(cpu_tlbstate.state, TLBSTATE_OK);
+ BUG_ON(percpu_read(cpu_tlbstate.active_mm) != next);
+
+@@ -151,11 +143,8 @@ void switch_mm_irqs_off(struct mm_struct
+ load_mm_ldt(next);
+ }
+ }
+-#endif
+ }
+
+-#ifdef CONFIG_SMP
+-
+ /*
+ *
+ * The flush IPI assumes that a thread switch happens in this order:
+@@ -437,5 +426,3 @@ void flush_tlb_all(void)
+ {
+ on_each_cpu(do_flush_tlb_all, NULL, 1);
+ }
+-
+-#endif /* CONFIG_SMP */
diff --git a/patches.suse/kaiser-0013-x86-mm-Disable-PCID-on-32-bit-kernels.patch b/patches.suse/kaiser-0013-x86-mm-Disable-PCID-on-32-bit-kernels.patch
new file mode 100644
index 0000000000..90d13cb5ae
--- /dev/null
+++ b/patches.suse/kaiser-0013-x86-mm-Disable-PCID-on-32-bit-kernels.patch
@@ -0,0 +1,65 @@
+From fa16a2b1ba3a08b9fbaa1fd7261b5a79c69427d5 Mon Sep 17 00:00:00 2001
+From: Andy Lutomirski <luto@kernel.org>
+Date: Thu, 29 Jun 2017 08:53:19 -0700
+Subject: [PATCH 13/19] x86/mm: Disable PCID on 32-bit kernels
+References: bsc#1068032 CVE-2017-5754
+Git-commit: cba4671af7550e008f7a7835f06df0763825bf3e
+Patch-mainline: v4.14-rc1
+
+32-bit kernels on new hardware will see PCID in CPUID, but PCID can
+only be used in 64-bit mode. Rather than making all PCID code
+conditional, just disable the feature on 32-bit builds.
+
+Signed-off-by: Andy Lutomirski <luto@kernel.org>
+Reviewed-by: Nadav Amit <nadav.amit@gmail.com>
+Reviewed-by: Borislav Petkov <bp@suse.de>
+Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
+Cc: Andrew Morton <akpm@linux-foundation.org>
+Cc: Arjan van de Ven <arjan@linux.intel.com>
+Cc: Borislav Petkov <bp@alien8.de>
+Cc: Dave Hansen <dave.hansen@intel.com>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Mel Gorman <mgorman@suse.de>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Rik van Riel <riel@redhat.com>
+Cc: linux-mm@kvack.org
+Link: http://lkml.kernel.org/r/2e391769192a4d31b808410c383c6bf0734bc6ea.1498751203.git.luto@kernel.org
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Jiri Kosina <jkosina@suse.cz>
+(cherry picked from commit cba4671af7550e008f7a7835f06df0763825bf3e)
+---
+ arch/x86/kernel/cpu/bugs.c | 8 ++++++++
+ arch/x86/kernel/cpu/common.c | 5 +++++
+ 2 files changed, 13 insertions(+)
+
+--- a/arch/x86/kernel/cpu/bugs.c
++++ b/arch/x86/kernel/cpu/bugs.c
+@@ -155,6 +155,14 @@ static void __init check_config(void)
+
+ void __init check_bugs(void)
+ {
++#ifdef CONFIG_X86_32
++ /*
++ * Regardless of whether PCID is enumerated, the SDM says
++ * that it can't be enabled in 32-bit mode.
++ */
++ setup_clear_cpu_cap(X86_FEATURE_PCID);
++#endif
++
+ identify_boot_cpu();
+ #ifndef CONFIG_SMP
+ printk(KERN_INFO "CPU: ");
+--- a/arch/x86/kernel/cpu/common.c
++++ b/arch/x86/kernel/cpu/common.c
+@@ -934,6 +934,11 @@ void __cpuinit identify_secondary_cpu(st
+ BUG_ON(c == &boot_cpu_data);
+ identify_cpu(c);
+ #ifdef CONFIG_X86_32
++ /*
++ * Regardless of whether PCID is enumerated, the SDM says
++ * that it can't be enabled in 32-bit mode.
++ */
++ clear_cpu_cap(c, X86_FEATURE_PCID);
+ enable_sep_cpu();
+ #endif
+ mtrr_ap_init();
diff --git a/patches.suse/kaiser-0014-x86-mm-Add-the-nopcid-boot-option-to-turn-off-PCID.patch b/patches.suse/kaiser-0014-x86-mm-Add-the-nopcid-boot-option-to-turn-off-PCID.patch
new file mode 100644
index 0000000000..5fccbaedb1
--- /dev/null
+++ b/patches.suse/kaiser-0014-x86-mm-Add-the-nopcid-boot-option-to-turn-off-PCID.patch
@@ -0,0 +1,74 @@
+From aa02d6b6235e208b0884f6f40456b60049cc2e53 Mon Sep 17 00:00:00 2001
+From: Andy Lutomirski <luto@kernel.org>
+Date: Thu, 29 Jun 2017 08:53:20 -0700
+Subject: [PATCH 14/19] x86/mm: Add the 'nopcid' boot option to turn off PCID
+References: bsc#1068032 CVE-2017-5754
+Git-commit: 0790c9aad84901ca1bdc14746175549c8b5da215
+Patch-mainline: v4.14-rc1
+
+The parameter is only present on x86_64 systems to save a few bytes,
+as PCID is always disabled on x86_32.
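+
+Usage note (not part of the change itself): booting a 64-bit kernel with
+
+	nopcid
+
+on the kernel command line clears X86_FEATURE_PCID early, so any later
+PCID setup treats the feature as if the CPU did not advertise it.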
+
+Signed-off-by: Andy Lutomirski <luto@kernel.org>
+Reviewed-by: Nadav Amit <nadav.amit@gmail.com>
+Reviewed-by: Borislav Petkov <bp@suse.de>
+Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
+Cc: Andrew Morton <akpm@linux-foundation.org>
+Cc: Arjan van de Ven <arjan@linux.intel.com>
+Cc: Borislav Petkov <bp@alien8.de>
+Cc: Dave Hansen <dave.hansen@intel.com>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Mel Gorman <mgorman@suse.de>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Rik van Riel <riel@redhat.com>
+Cc: linux-mm@kvack.org
+Link: http://lkml.kernel.org/r/8bbb2e65bcd249a5f18bfb8128b4689f08ac2b60.1498751203.git.luto@kernel.org
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+(cherry picked from commit 0790c9aad84901ca1bdc14746175549c8b5da215)
+
+Conflicts:
+Documentation/admin-guide/kernel-parameters.txt (not in this tree)
+Documentation/kernel-parameters.txt (patched instead of that)
+---
+ Documentation/kernel-parameters.txt | 2 ++
+ arch/x86/kernel/cpu/common.c | 18 ++++++++++++++++++
+ 2 files changed, 20 insertions(+)
+
+--- a/arch/x86/kernel/cpu/common.c
++++ b/arch/x86/kernel/cpu/common.c
+@@ -155,6 +155,24 @@ static int __init x86_xsaveopt_setup(cha
+ }
+ __setup("noxsaveopt", x86_xsaveopt_setup);
+
++#ifdef CONFIG_X86_64
++static int __init x86_pcid_setup(char *s)
++{
++ /* require an exact match without trailing characters */
++ if (strlen(s))
++ return 0;
++
++ /* do not emit a message if the feature is not present */
++ if (!boot_cpu_has(X86_FEATURE_PCID))
++ return 1;
++
++ setup_clear_cpu_cap(X86_FEATURE_PCID);
++ pr_info("nopcid: PCID feature disabled\n");
++ return 1;
++}
++__setup("nopcid", x86_pcid_setup);
++#endif
++
+ static int __init x86_noinvpcid_setup(char *s)
+ {
+ /* noinvpcid doesn't accept parameters */
+--- a/Documentation/kernel-parameters.txt
++++ b/Documentation/kernel-parameters.txt
+@@ -1853,6 +1853,8 @@ bytes respectively. Such letter suffixes
+ nopat [X86] Disable PAT (page attribute table extension of
+ pagetables) support.
+
++ nopcid [X86-64] Disable the PCID cpu feature.
++
+ norandmaps Don't use address space randomization. Equivalent to
+ echo 0 > /proc/sys/kernel/randomize_va_space
+
diff --git a/patches.suse/kaiser-0015-x86-mm-Enable-CR4.PCIDE-on-supported-systems.patch b/patches.suse/kaiser-0015-x86-mm-Enable-CR4.PCIDE-on-supported-systems.patch
new file mode 100644
index 0000000000..8f59f8920b
--- /dev/null
+++ b/patches.suse/kaiser-0015-x86-mm-Enable-CR4.PCIDE-on-supported-systems.patch
@@ -0,0 +1,128 @@
+From fa472b69062008c4d0ef4aa2ac5e660252c6234e Mon Sep 17 00:00:00 2001
+From: Andy Lutomirski <luto@kernel.org>
+Date: Thu, 29 Jun 2017 08:53:21 -0700
+Subject: [PATCH 15/19] x86/mm: Enable CR4.PCIDE on supported systems
+References: bsc#1068032 CVE-2017-5754
+Git-commit: 660da7c9228f685b2ebe664f9fd69aaddcc420b5
+Patch-mainline: v4.14-rc1
+
+We can use PCID if the CPU has PCID and PGE and we're not on Xen.
+
+By itself, this has no effect. A followup patch will start using PCID.
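+
+The net effect, roughly (a sketch, not the literal hunks below; the Xen
+case is really handled by masking the CPUID bit in enlighten.c, so Xen
+PV guests never see X86_FEATURE_PCID at all):
+
+	if (cpu_has(c, X86_FEATURE_PCID) && cpu_has(c, X86_FEATURE_PGE))
+		set_in_cr4(X86_CR4_PCIDE);	/* 64-bit kernels only */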
+
+Signed-off-by: Andy Lutomirski <luto@kernel.org>
+Reviewed-by: Nadav Amit <nadav.amit@gmail.com>
+Reviewed-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
+Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
+Cc: Andrew Morton <akpm@linux-foundation.org>
+Cc: Arjan van de Ven <arjan@linux.intel.com>
+Cc: Borislav Petkov <bp@alien8.de>
+Cc: Dave Hansen <dave.hansen@intel.com>
+Cc: Juergen Gross <jgross@suse.com>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Mel Gorman <mgorman@suse.de>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Rik van Riel <riel@redhat.com>
+Cc: linux-mm@kvack.org
+Link: http://lkml.kernel.org/r/6327ecd907b32f79d5aa0d466f04503bbec5df88.1498751203.git.luto@kernel.org
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Jiri Kosina <jkosina@suse.cz>
+(cherry picked from commit 660da7c9228f685b2ebe664f9fd69aaddcc420b5)
+
+Conflicts:
+arch/x86/xen/enlighten_pv.c (not in this tree)
+arch/x86/xen/enlighten.c (patched instead of that)
+---
+ arch/x86/include/asm/tlbflush.h | 8 ++++++++
+ arch/x86/kernel/cpu/common.c | 33 ++++++++++++++++++++++++++++-----
+ arch/x86/xen/enlighten.c | 6 ++++++
+ 3 files changed, 42 insertions(+), 5 deletions(-)
+
+--- a/arch/x86/include/asm/tlbflush.h
++++ b/arch/x86/include/asm/tlbflush.h
+@@ -110,6 +110,14 @@ static inline void __flush_tlb_all(void)
+ __flush_tlb_global();
+ else
+ __flush_tlb();
++
++ /*
++ * Note: if we somehow had PCID but not PGE, then this wouldn't work --
++ * we'd end up flushing kernel translations for the current ASID but
++ * we might fail to flush kernel translations for other cached ASIDs.
++ *
++ * To avoid this issue, we force PCID off if PGE is off.
++ */
+ }
+
+ static inline void __flush_tlb_one(unsigned long addr)
+--- a/arch/x86/kernel/cpu/common.c
++++ b/arch/x86/kernel/cpu/common.c
+@@ -310,6 +310,31 @@ static __cpuinit void setup_smep(struct
+ }
+ }
+
++static void setup_pcid(struct cpuinfo_x86 *c)
++{
++ if (cpu_has(c, X86_FEATURE_PCID)) {
++#ifdef CONFIG_X86_64
++ if (cpu_has(c, X86_FEATURE_PGE)) {
++ /*
++ * Regardless of whether PCID is enumerated, the
++ * SDM says that it can't be enabled in 32-bit mode.
++ */
++ set_in_cr4(X86_CR4_PCIDE);
++ }
++#else
++ /*
++ * flush_tlb_all(), as currently implemented, won't
++ * work if PCID is on but PGE is not. Since that
++ * combination doesn't exist on real hardware, there's
++ * no reason to try to fully support it, but it's
++ * polite to avoid corrupting data if we're on
++ * an improperly configured VM.
++ */
++ clear_cpu_cap(c, X86_FEATURE_PCID);
++#endif
++ }
++}
++
+ /*
+ * Some CPU features depend on higher CPUID levels, which may not always
+ * be available due to CPUID level capping or broken virtualization
+@@ -867,6 +892,9 @@ static void __cpuinit identify_cpu(struc
+ /* Disable the PN if appropriate */
+ squash_the_stupid_serial_number(c);
+
++ /* Set up PCID */
++ setup_pcid(c);
++
+ /*
+ * The vendor-specific functions might have changed features.
+ * Now we do "generic changes."
+@@ -952,11 +980,6 @@ void __cpuinit identify_secondary_cpu(st
+ BUG_ON(c == &boot_cpu_data);
+ identify_cpu(c);
+ #ifdef CONFIG_X86_32
+- /*
+- * Regardless of whether PCID is enumerated, the SDM says
+- * that it can't be enabled in 32-bit mode.
+- */
+- clear_cpu_cap(c, X86_FEATURE_PCID);
+ enable_sep_cpu();
+ #endif
+ mtrr_ap_init();
+--- a/arch/x86/xen/enlighten.c
++++ b/arch/x86/xen/enlighten.c
+@@ -270,6 +270,12 @@ static void __init xen_init_cpuid_mask(v
+ (1 << X86_FEATURE_MTRR) | /* disable MTRR */
+ (1 << X86_FEATURE_ACC)); /* thermal monitoring */
+
++ /*
++ * Xen PV would need some work to support PCID: CR3 handling as well
++ * as xen_flush_tlb_others() would need updating.
++ */
++ cpuid_leaf1_ecx_mask &= ~(1 << X86_FEATURE_PCID); /* disable PCID */
++
+ if (!xen_initial_domain())
+ cpuid_leaf1_edx_mask &=
+ ~((1 << X86_FEATURE_APIC) | /* disable local APIC */
diff --git a/patches.suse/kaiser-0016-x86-mm-64-Fix-reboot-interaction-with-CR4.PCIDE.patch b/patches.suse/kaiser-0016-x86-mm-64-Fix-reboot-interaction-with-CR4.PCIDE.patch
new file mode 100644
index 0000000000..27728f51d8
--- /dev/null
+++ b/patches.suse/kaiser-0016-x86-mm-64-Fix-reboot-interaction-with-CR4.PCIDE.patch
@@ -0,0 +1,44 @@
+From ce87292fa056fff7bcf330c0a48678e9ae275efe Mon Sep 17 00:00:00 2001
+From: Andy Lutomirski <luto@kernel.org>
+Date: Sun, 8 Oct 2017 21:53:05 -0700
+Subject: [PATCH 16/19] x86/mm/64: Fix reboot interaction with CR4.PCIDE
+References: bsc#1068032 CVE-2017-5754
+Git-commit: 924c6b900cfdf376b07bccfd80e62b21914f8a5a
+Patch-mainline: v4.14-rc5
+
+Trying to reboot via real mode fails with PCID on: long mode cannot
+be exited while CR4.PCIDE is set. (No, I have no idea why, but the
+SDM and actual CPUs are in agreement here.) The result is a GPF and
+a hang instead of a reboot.
+
+I didn't catch this in testing because neither my computer nor my VM
+reboots this way. I can trigger it with reboot=bios, though.
+
+Fixes: 660da7c9228f ("x86/mm: Enable CR4.PCIDE on supported systems")
+Reported-and-tested-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
+Signed-off-by: Andy Lutomirski <luto@kernel.org>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Cc: Borislav Petkov <bp@alien8.de>
+Link: https://lkml.kernel.org/r/f1e7d965998018450a7a70c2823873686a8b21c0.1507524746.git.luto@kernel.org
+Signed-off-by: Jiri Kosina <jkosina@suse.cz>
+
+(cherry picked from commit 924c6b900cfdf376b07bccfd80e62b21914f8a5a)
+---
+ arch/x86/kernel/reboot.c | 6 ++++++
+ 1 file changed, 6 insertions(+)
+
+--- a/arch/x86/kernel/reboot.c
++++ b/arch/x86/kernel/reboot.c
+@@ -377,6 +377,12 @@ void machine_real_restart(unsigned int t
+ lowmem_gdt[1] =
+ GDT_ENTRY(0x009b, restart_pa, 0xffff);
+
++#ifdef CONFIG_X86_64
++ /* Exiting long mode will fail if CR4.PCIDE is set. */
++ if (static_cpu_has(X86_FEATURE_PCID))
++ clear_in_cr4(X86_CR4_PCIDE);
++#endif
++
+ /* Jump to the identity-mapped low memory code */
+ restart_lowmem(type);
+ }
diff --git a/patches.suse/kaiser-0017-x86-mm-fix-bad-backport-to-disable-PCID-on-Xen.patch b/patches.suse/kaiser-0017-x86-mm-fix-bad-backport-to-disable-PCID-on-Xen.patch
new file mode 100644
index 0000000000..f234d3e4bb
--- /dev/null
+++ b/patches.suse/kaiser-0017-x86-mm-fix-bad-backport-to-disable-PCID-on-Xen.patch
@@ -0,0 +1,37 @@
+From 6048713fafc0183f6b9a4671dff81c252385c39b Mon Sep 17 00:00:00 2001
+From: Borislav Petkov <bp@alien8.de>
+Date: Wed, 6 Dec 2017 22:08:43 +0100
+Subject: [PATCH 17/19] x86/mm: fix bad backport to disable PCID on Xen
+References: bsc#1068032 CVE-2017-5754
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+Patch-mainline: No, backport-specific
+
+arch/x86/xen/enlighten.c: In function ‘xen_init_cpuid_mask’:
+arch/x86/xen/enlighten.c:450:2: warning: left shift count >= width of type
+ cpuid_leaf1_ecx_mask &= ~(1 << X86_FEATURE_PCID); /* disable PCID */
+
+That hunk in xen_init_cpuid_mask() should look like the version below,
+because those X86_FEATURE_* values are bit numbers in a u32 array.
+
+X86_FEATURE_ACC and X86_FEATURE_MTRR happen to work, for example,
+because they live in word 0, but they should have been done % 32
+anyway, just to be consistent when converting cap bits into CPUID bits.
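+
+A worked example (illustration; bit numbers as defined in
+arch/x86/include/asm/cpufeature.h): X86_FEATURE_PCID is (4*32+17),
+i.e. bit 17 of CPUID word 4, the leaf-1 ECX word. Since
+cpuid_leaf1_ecx_mask is a plain u32:
+
+	~(1 << X86_FEATURE_PCID)		/* shift by 145: undefined  */
+	~(1 << (X86_FEATURE_PCID % 32))		/* shift by 17:  ECX bit 17 */
+
+only the second form clears the bit that CPUID leaf 1 actually reports.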
+
+Signed-off-by: Jiri Kosina <jkosina@suse.cz>
+---
+ arch/x86/xen/enlighten.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/arch/x86/xen/enlighten.c
++++ b/arch/x86/xen/enlighten.c
+@@ -274,7 +274,7 @@ static void __init xen_init_cpuid_mask(v
+ * Xen PV would need some work to support PCID: CR3 handling as well
+ * as xen_flush_tlb_others() would need updating.
+ */
+- cpuid_leaf1_ecx_mask &= ~(1 << X86_FEATURE_PCID); /* disable PCID */
++ cpuid_leaf1_ecx_mask &= ~(1 << (X86_FEATURE_PCID % 32)); /* disable PCID */
+
+ if (!xen_initial_domain())
+ cpuid_leaf1_edx_mask &=
diff --git a/patches.suse/kaiser-0018-KAISER-Kernel-Address-Isolation.patch b/patches.suse/kaiser-0018-KAISER-Kernel-Address-Isolation.patch
new file mode 100644
index 0000000000..be690fcfd0
--- /dev/null
+++ b/patches.suse/kaiser-0018-KAISER-Kernel-Address-Isolation.patch
@@ -0,0 +1,1906 @@
+From 2622b8382cf1aa97ec9cf6d194f1389585f029b2 Mon Sep 17 00:00:00 2001
+From: Hugh Dickins <hughd@google.com>
+Date: Mon, 11 Dec 2017 17:59:50 -0800
+Subject: [PATCH 18/19] KAISER: Kernel Address Isolation
+References: bsc#1068032 CVE-2017-5754
+Patch-mainline: No, PTI under development
+
+This patch introduces our implementation of KAISER (Kernel Address
+Isolation to have Side-channels Efficiently Removed), a kernel isolation
+technique to close hardware side channels on kernel address information.
+
+More information about the original patch can be found at:
+https://github.com/IAIK/KAISER
+http://marc.info/?l=linux-kernel&m=149390087310405&w=2
+
+Daniel Gruss <daniel.gruss@iaik.tugraz.at>
+Richard Fellner <richard.fellner@student.tugraz.at>
+Michael Schwarz <michael.schwarz@iaik.tugraz.at>
+<clementine.maurice@iaik.tugraz.at>
+<moritz.lipp@iaik.tugraz.at>
+
+That original was then developed further by
+Dave Hansen <dave.hansen@intel.com>
+Hugh Dickins <hughd@google.com>
+then others after this snapshot.
+
+This combined patch for 3.2.96 was derived from hughd's patches below
+for 3.18.72, in 2017-12-04's kaiser-3.18.72.tar; except for the last,
+which was sent in 2017-12-09's nokaiser-3.18.72.tar. They have been
+combined in order to minimize the effort of rebasing: most of the
+patches in the 3.18.72 series were small fixes and cleanups and
+enhancements to three large patches. About the only new work in this
+backport is a simple reimplementation of kaiser_remove_mapping(),
+since mm/pageattr.c changed a lot between 3.2 and 3.18 and the
+Kaiser mods there never seemed necessary.
+
+Backported to 3.0 (11-SP4 variant of it) by Jiri Kosina.
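+
+As a rough sketch of the layout the patch below uses (illustration,
+not authoritative): every pgd allocation becomes 8k; the lower 4k page
+is the kernel pgd and the upper 4k page, at KAISER_SHADOW_PGD_OFFSET
+(0x1000), is the shadow pgd used while in user mode, so the entry/exit
+code can switch address spaces by flipping that offset (plus the PCID
+bits) in CR3:
+
+	kernel_cr3 = __pa(pgd);				     /* lower page */
+	user_cr3   = kernel_cr3 | KAISER_SHADOW_PGD_OFFSET; /* upper page */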
+
+KAISER: Kernel Address Isolation
+kaiser: merged update
+kaiser: do not set _PAGE_NX on pgd_none
+kaiser: stack map PAGE_SIZE at THREAD_SIZE-PAGE_SIZE
+kaiser: fix build and FIXME in alloc_ldt_struct()
+kaiser: KAISER depends on SMP
+kaiser: fix regs to do_nmi() ifndef CONFIG_KAISER
+kaiser: fix perf crashes
+kaiser: ENOMEM if kaiser_pagetable_walk() NULL
+kaiser: tidied up asm/kaiser.h somewhat
+kaiser: tidied up kaiser_add/remove_mapping slightly
+kaiser: kaiser_remove_mapping() move along the pgd
+kaiser: align addition to x86/mm/Makefile
+kaiser: cleanups while trying for gold link
+kaiser: name that 0x1000 KAISER_SHADOW_PGD_OFFSET
+kaiser: delete KAISER_REAL_SWITCH option
+kaiser: vmstat show NR_KAISERTABLE as nr_overhead
+kaiser: enhanced by kernel and user PCIDs
+kaiser: load_new_mm_cr3() let SWITCH_USER_CR3 flush user
+kaiser: PCID 0 for kernel and 128 for user
+kaiser: x86_cr3_pcid_noflush and x86_cr3_pcid_user
+kaiser: paranoid_entry pass cr3 need to paranoid_exit
+kaiser: _pgd_alloc() without __GFP_REPEAT to avoid stalls
+kaiser: fix unlikely error in alloc_ldt_struct()
+kaiser: drop is_atomic arg to kaiser_pagetable_walk()
+kaiser: extend maping to sched+kprobes+entry sections [jkosina@suse.cz]
+kaiser: port entry code to reentrant NMI support [jkosina@suse.cz]
+kaiser: remove !paravirt dependency [jkosina@suse.cz]
+
+Signed-off-by: Jiri Kosina <jkosina@suse.cz>
+---
+ arch/x86/boot/compressed/misc.h | 1
+ arch/x86/ia32/ia32entry.S | 7
+ arch/x86/include/asm/cpufeature.h | 1
+ arch/x86/include/asm/desc.h | 2
+ arch/x86/include/asm/hw_irq.h | 2
+ arch/x86/include/asm/kaiser.h | 126 +++++++++
+ arch/x86/include/asm/pgtable.h | 19 +
+ arch/x86/include/asm/pgtable_64.h | 29 ++
+ arch/x86/include/asm/pgtable_types.h | 33 ++
+ arch/x86/include/asm/processor-flags.h | 3
+ arch/x86/include/asm/processor.h | 2
+ arch/x86/include/asm/tlbflush.h | 64 ++++
+ arch/x86/kernel/cpu/common.c | 18 +
+ arch/x86/kernel/cpu/perf_event_intel_ds.c | 54 +++-
+ arch/x86/kernel/entry_64.S | 103 ++++++--
+ arch/x86/kernel/espfix_64.c | 11
+ arch/x86/kernel/head_64.S | 25 +
+ arch/x86/kernel/init_task.c | 2
+ arch/x86/kernel/irqinit.c | 2
+ arch/x86/kernel/ldt.c | 25 +
+ arch/x86/kernel/process_64.c | 2
+ arch/x86/mm/Makefile | 1
+ arch/x86/mm/kaiser.c | 385 ++++++++++++++++++++++++++++++
+ arch/x86/mm/pgtable.c | 31 ++
+ arch/x86/mm/tlb.c | 48 +++
+ include/asm-generic/vmlinux.lds.h | 7
+ include/linux/kaiser.h | 52 ++++
+ include/linux/mmzone.h | 3
+ include/linux/percpu-defs.h | 32 ++
+ init/main.c | 2
+ kernel/fork.c | 6
+ mm/vmstat.c | 1
+ security/Kconfig | 10
+ 33 files changed, 1046 insertions(+), 63 deletions(-)
+ create mode 100644 arch/x86/include/asm/kaiser.h
+ create mode 100644 arch/x86/mm/kaiser.c
+ create mode 100644 include/linux/kaiser.h
+
+--- a/arch/x86/boot/compressed/misc.h
++++ b/arch/x86/boot/compressed/misc.h
+@@ -7,6 +7,7 @@
+ * we just keep it from happening
+ */
+ #undef CONFIG_PARAVIRT
++#undef KAISER
+ #ifdef CONFIG_X86_32
+ #define _ASM_X86_DESC_H 1
+ #endif
+--- a/arch/x86/ia32/ia32entry.S
++++ b/arch/x86/ia32/ia32entry.S
+@@ -12,6 +12,8 @@
+ #include <asm/ia32_unistd.h>
+ #include <asm/thread_info.h>
+ #include <asm/segment.h>
++#include <asm/pgtable_types.h>
++#include <asm/kaiser.h>
+ #include <asm/irqflags.h>
+ #include <linux/linkage.h>
+
+@@ -120,6 +122,7 @@ ENTRY(ia32_sysenter_target)
+ CFI_DEF_CFA rsp,0
+ CFI_REGISTER rsp,rbp
+ SWAPGS_UNSAFE_STACK
++ SWITCH_KERNEL_CR3_NO_STACK
+ movq PER_CPU_VAR(kernel_stack), %rsp
+ addq $(KERNEL_STACK_OFFSET),%rsp
+ /*
+@@ -183,6 +186,7 @@ sysexit_from_sys_call:
+ popq_cfi %rcx /* User %esp */
+ CFI_REGISTER rsp,rcx
+ TRACE_IRQS_ON
++ SWITCH_USER_CR3
+ ENABLE_INTERRUPTS_SYSEXIT32
+
+ #ifdef CONFIG_AUDITSYSCALL
+@@ -281,6 +285,7 @@ ENTRY(ia32_cstar_target)
+ CFI_REGISTER rip,rcx
+ /*CFI_REGISTER rflags,r11*/
+ SWAPGS_UNSAFE_STACK
++ SWITCH_KERNEL_CR3_NO_STACK
+ movl %esp,%r8d
+ CFI_REGISTER rsp,r8
+ movq PER_CPU_VAR(kernel_stack),%rsp
+@@ -337,6 +342,7 @@ sysretl_from_sys_call:
+ xorq %r9,%r9
+ xorq %r8,%r8
+ TRACE_IRQS_ON
++ SWITCH_USER_CR3
+ movl RSP-ARGOFFSET(%rsp),%esp
+ CFI_RESTORE rsp
+ USERGS_SYSRET32
+@@ -409,6 +415,7 @@ ENTRY(ia32_syscall)
+ CFI_REL_OFFSET rip,RIP-RIP
+ PARAVIRT_ADJUST_EXCEPTION_FRAME
+ SWAPGS
++ SWITCH_KERNEL_CR3_NO_STACK
+ /*
+ * No need to follow this irqs on/off section: the syscall
+ * disabled irqs and here we enable it straight after entry:
+--- a/arch/x86/include/asm/cpufeature.h
++++ b/arch/x86/include/asm/cpufeature.h
+@@ -177,6 +177,7 @@
+ #define X86_FEATURE_PLN (7*32+ 5) /* Intel Power Limit Notification */
+ #define X86_FEATURE_PTS (7*32+ 6) /* Intel Package Thermal Status */
+ #define X86_FEATURE_DTHERM (7*32+ 7) /* Digital Thermal Sensor */
++#define X86_FEATURE_INVPCID_SINGLE (7*32+ 8) /* Effectively INVPCID && CR4.PCIDE=1 */
+
+ /* Virtualization flags: Linux defined, word 8 */
+ #define X86_FEATURE_TPR_SHADOW (8*32+ 0) /* Intel TPR Shadow */
+--- a/arch/x86/include/asm/desc.h
++++ b/arch/x86/include/asm/desc.h
+@@ -40,7 +40,7 @@ struct gdt_page {
+ struct desc_struct gdt[GDT_ENTRIES];
+ } __attribute__((aligned(PAGE_SIZE)));
+
+-DECLARE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page);
++DECLARE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(struct gdt_page, gdt_page);
+
+ static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu)
+ {
+--- a/arch/x86/include/asm/hw_irq.h
++++ b/arch/x86/include/asm/hw_irq.h
+@@ -166,7 +166,7 @@ extern asmlinkage void smp_invalidate_in
+ extern void (*__initconst interrupt[NR_VECTORS-FIRST_EXTERNAL_VECTOR])(void);
+
+ typedef int vector_irq_t[NR_VECTORS];
+-DECLARE_PER_CPU(vector_irq_t, vector_irq);
++DECLARE_PER_CPU_USER_MAPPED(vector_irq_t, vector_irq);
+ extern void setup_vector_irq(int cpu);
+
+ #ifdef CONFIG_X86_IO_APIC
+--- /dev/null
++++ b/arch/x86/include/asm/kaiser.h
+@@ -0,0 +1,126 @@
++#ifndef _ASM_X86_KAISER_H
++#define _ASM_X86_KAISER_H
++
++#include <asm/processor-flags.h> /* For PCID constants */
++
++/*
++ * This file includes the definitions for the KAISER feature.
++ * KAISER is a counter measure against x86_64 side channel attacks on
++ * the kernel virtual memory. It has a shadow pgd for every process: the
++ * shadow pgd has a minimalistic kernel-set mapped, but includes the whole
++ * user memory. Within a kernel context switch, or when an interrupt is handled,
++ * the pgd is switched to the normal one. When the system switches to user mode,
++ * the shadow pgd is enabled. By this, the virtual memory caches are freed,
++ * and the user may not attack the whole kernel memory.
++ *
++ * A minimalistic kernel mapping holds the parts needed to be mapped in user
++ * mode, such as the entry/exit functions of the user space, or the stacks.
++ */
++
++#define KAISER_SHADOW_PGD_OFFSET 0x1000
++
++#ifdef __ASSEMBLY__
++#ifdef CONFIG_KAISER
++
++.macro _SWITCH_TO_KERNEL_CR3 reg
++movq %cr3, \reg
++andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), \reg
++orq x86_cr3_pcid_noflush, \reg
++movq \reg, %cr3
++.endm
++
++.macro _SWITCH_TO_USER_CR3 reg regb
++/*
++ * regb must be the low byte portion of reg: because we have arranged
++ * for the low byte of the user PCID to serve as the high byte of NOFLUSH
++ * (0x80 for each when PCID is enabled, or 0x00 when PCID and NOFLUSH are
++ * not enabled): so that the one register can update both memory and cr3.
++ */
++movq %cr3, \reg
++orq PER_CPU_VAR(x86_cr3_pcid_user), \reg
++js 9f
++/* FLUSH this time, reset to NOFLUSH for next time (if PCID enabled) */
++movb \regb, PER_CPU_VAR(x86_cr3_pcid_user+7)
++9:
++movq \reg, %cr3
++.endm
++
++.macro SWITCH_KERNEL_CR3
++pushq %rax
++_SWITCH_TO_KERNEL_CR3 %rax
++popq %rax
++.endm
++
++.macro SWITCH_USER_CR3
++pushq %rax
++_SWITCH_TO_USER_CR3 %rax %al
++popq %rax
++.endm
++
++.macro SWITCH_KERNEL_CR3_NO_STACK
++movq %rax, PER_CPU_VAR(unsafe_stack_register_backup)
++_SWITCH_TO_KERNEL_CR3 %rax
++movq PER_CPU_VAR(unsafe_stack_register_backup), %rax
++.endm
++
++#else /* CONFIG_KAISER */
++
++.macro SWITCH_KERNEL_CR3 reg
++.endm
++.macro SWITCH_USER_CR3 reg regb
++.endm
++.macro SWITCH_KERNEL_CR3_NO_STACK
++.endm
++
++#endif /* CONFIG_KAISER */
++
++#else /* __ASSEMBLY__ */
++
++#ifdef CONFIG_KAISER
++/*
++ * Upon kernel/user mode switch, it may happen that the address
++ * space has to be switched before the registers have been
++ * stored. To change the address space, another register is
++ * needed. A register therefore has to be stored/restored.
++*/
++DECLARE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup);
++
++extern unsigned long x86_cr3_pcid_noflush;
++DECLARE_PER_CPU(unsigned long, x86_cr3_pcid_user);
++
++extern char __per_cpu_user_mapped_start[], __per_cpu_user_mapped_end[];
++
++/**
++ * kaiser_add_mapping - map a virtual memory part to the shadow (user) mapping
++ * @addr: the start address of the range
++ * @size: the size of the range
++ * @flags: The mapping flags of the pages
++ *
++ * The mapping is done on a global scope, so no bigger
++ * synchronization has to be done. the pages have to be
++ * manually unmapped again when they are not needed any longer.
++ */
++extern int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags);
++
++/**
++ * kaiser_remove_mapping - unmap a virtual memory part of the shadow mapping
++ * @addr: the start address of the range
++ * @size: the size of the range
++ */
++extern void kaiser_remove_mapping(unsigned long start, unsigned long size);
++
++/**
++ * kaiser_init - Initialize the shadow mapping
++ *
++ * Most parts of the shadow mapping can be mapped upon boot
++ * time. Only per-process things like the thread stacks
++ * or a new LDT have to be mapped at runtime. These boot-
++ * time mappings are permanent and never unmapped.
++ */
++extern void kaiser_init(void);
++
++#endif /* CONFIG_KAISER */
++
++#endif /* __ASSEMBLY */
++
++#endif /* _ASM_X86_KAISER_H */
+--- a/arch/x86/include/asm/pgtable_64.h
++++ b/arch/x86/include/asm/pgtable_64.h
+@@ -105,9 +105,36 @@ static inline void native_pud_clear(pud_
+ native_set_pud(pud, native_make_pud(0));
+ }
+
++#ifdef CONFIG_KAISER
++extern pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd);
++
++static inline pgd_t *native_get_shadow_pgd(pgd_t *pgdp)
++{
++ return (pgd_t *)((unsigned long)pgdp | (unsigned long)PAGE_SIZE);
++}
++
++static inline pgd_t *native_get_normal_pgd(pgd_t *pgdp)
++{
++ return (pgd_t *)((unsigned long)pgdp & ~(unsigned long)PAGE_SIZE);
++}
++#else
++static inline pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd)
++{
++ return pgd;
++}
++static inline pgd_t *native_get_shadow_pgd(pgd_t *pgdp)
++{
++ return NULL;
++}
++static inline pgd_t *native_get_normal_pgd(pgd_t *pgdp)
++{
++ return pgdp;
++}
++#endif /* CONFIG_KAISER */
++
+ static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd)
+ {
+- *pgdp = pgd;
++ *pgdp = kaiser_set_shadow_pgd(pgdp, pgd);
+ }
+
+ static inline void native_pgd_clear(pgd_t *pgd)
+--- a/arch/x86/include/asm/pgtable.h
++++ b/arch/x86/include/asm/pgtable.h
+@@ -571,7 +571,18 @@ static inline pud_t *pud_offset(pgd_t *p
+
+ static inline int pgd_bad(pgd_t pgd)
+ {
+- return (pgd_flags(pgd) & ~_PAGE_USER) != _KERNPG_TABLE;
++ pgdval_t ignore_flags = _PAGE_USER;
++ /*
++ * We set NX on KAISER pgds that map userspace memory so
++ * that userspace can not meaningfully use the kernel
++ * page table by accident; it will fault on the first
++ * instruction it tries to run. See native_set_pgd().
++ */
++#ifdef CONFIG_KAISER
++ ignore_flags |= _PAGE_NX;
++#endif
++
++ return (pgd_flags(pgd) & ~ignore_flags) != _KERNPG_TABLE;
+ }
+
+ static inline int pgd_none(pgd_t pgd)
+@@ -772,6 +783,12 @@ static inline void pmdp_set_wrprotect(st
+ static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count)
+ {
+ memcpy(dst, src, count * sizeof(pgd_t));
++#ifdef CONFIG_KAISER
++ /* Clone the shadow pgd part as well */
++ memcpy(native_get_shadow_pgd(dst),
++ native_get_shadow_pgd(src),
++ count * sizeof(pgd_t));
++#endif
+ }
+
+ #define PTE_SHIFT ilog2(PTRS_PER_PTE)
+--- a/arch/x86/include/asm/pgtable_types.h
++++ b/arch/x86/include/asm/pgtable_types.h
+@@ -39,7 +39,11 @@
+ #define _PAGE_ACCESSED (_AT(pteval_t, 1) << _PAGE_BIT_ACCESSED)
+ #define _PAGE_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_DIRTY)
+ #define _PAGE_PSE (_AT(pteval_t, 1) << _PAGE_BIT_PSE)
++#ifdef CONFIG_KAISER
++#define _PAGE_GLOBAL (_AT(pteval_t, 0))
++#else
+ #define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL)
++#endif
+ #define _PAGE_UNUSED1 (_AT(pteval_t, 1) << _PAGE_BIT_UNUSED1)
+ #define _PAGE_IOMAP (_AT(pteval_t, 1) << _PAGE_BIT_IOMAP)
+ #define _PAGE_PAT (_AT(pteval_t, 1) << _PAGE_BIT_PAT)
+@@ -62,7 +66,7 @@
+ #endif
+
+ #define _PAGE_FILE (_AT(pteval_t, 1) << _PAGE_BIT_FILE)
+-#define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE)
++#define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE)
+
+ #define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \
+ _PAGE_ACCESSED | _PAGE_DIRTY)
+@@ -74,6 +78,33 @@
+ _PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY)
+ #define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE)
+
++/* The ASID is the lower 12 bits of CR3 */
++#define X86_CR3_PCID_ASID_MASK (_AC((1<<12)-1,UL))
++
++/* Mask for all the PCID-related bits in CR3: */
++#define X86_CR3_PCID_MASK (X86_CR3_PCID_NOFLUSH | X86_CR3_PCID_ASID_MASK)
++#define X86_CR3_PCID_ASID_KERN (_AC(0x0,UL))
++
++#if defined(CONFIG_KAISER) && defined(CONFIG_X86_64)
++/* Let X86_CR3_PCID_ASID_USER be usable for the X86_CR3_PCID_NOFLUSH bit */
++#define X86_CR3_PCID_ASID_USER (_AC(0x80,UL))
++
++#define X86_CR3_PCID_KERN_FLUSH (X86_CR3_PCID_ASID_KERN)
++#define X86_CR3_PCID_USER_FLUSH (X86_CR3_PCID_ASID_USER)
++#define X86_CR3_PCID_KERN_NOFLUSH (X86_CR3_PCID_NOFLUSH | X86_CR3_PCID_ASID_KERN)
++#define X86_CR3_PCID_USER_NOFLUSH (X86_CR3_PCID_NOFLUSH | X86_CR3_PCID_ASID_USER)
++#else
++#define X86_CR3_PCID_ASID_USER (_AC(0x0,UL))
++/*
++ * PCIDs are unsupported on 32-bit and none of these bits can be
++ * set in CR3:
++ */
++#define X86_CR3_PCID_KERN_FLUSH (0)
++#define X86_CR3_PCID_USER_FLUSH (0)
++#define X86_CR3_PCID_KERN_NOFLUSH (0)
++#define X86_CR3_PCID_USER_NOFLUSH (0)
++#endif
++
+ #define _PAGE_CACHE_MASK (_PAGE_PCD | _PAGE_PWT)
+ #define _PAGE_CACHE_WB (0)
+ #define _PAGE_CACHE_WC (_PAGE_PWT)
+--- a/arch/x86/include/asm/processor-flags.h
++++ b/arch/x86/include/asm/processor-flags.h
+@@ -43,7 +43,8 @@
+ */
+ #define X86_CR3_PWT 0x00000008 /* Page Write Through */
+ #define X86_CR3_PCD 0x00000010 /* Page Cache Disable */
+-#define X86_CR3_PCID_MASK 0x00000fff /* PCID Mask */
++#define X86_CR3_PCID_NOFLUSH_BIT 63 /* Preserve old PCID */
++#define X86_CR3_PCID_NOFLUSH (_AC(1,ULL) << X86_CR3_PCID_NOFLUSH_BIT)
+
+ /*
+ * Intel CPU features in CR4
+--- a/arch/x86/include/asm/processor.h
++++ b/arch/x86/include/asm/processor.h
+@@ -266,7 +266,7 @@ struct tss_struct {
+
+ } ____cacheline_aligned;
+
+-DECLARE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss);
++DECLARE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(struct tss_struct, init_tss);
+
+ /*
+ * Save the original ist values for checking stack pointers during debugging
+--- a/arch/x86/include/asm/tlbflush.h
++++ b/arch/x86/include/asm/tlbflush.h
+@@ -64,20 +64,52 @@ static inline void invpcid_flush_all_non
+ #define __flush_tlb_single(addr) __native_flush_tlb_single(addr)
+ #endif
+
++/*
++ * Declare a couple of kaiser interfaces here for convenience,
++ * to avoid the need for asm/kaiser.h in unexpected places.
++ */
++#ifdef CONFIG_KAISER
++extern void kaiser_setup_pcid(void);
++extern void kaiser_flush_tlb_on_return_to_user(void);
++#else
++static inline void kaiser_setup_pcid(void)
++{
++}
++static inline void kaiser_flush_tlb_on_return_to_user(void)
++{
++}
++#endif
++
++
+ static inline void __native_flush_tlb(void)
+ {
++ if (this_cpu_has(X86_FEATURE_INVPCID)) {
++ /*
++ * Note, this works with CR4.PCIDE=0 or 1.
++ */
++ invpcid_flush_all_nonglobals();
++ return;
++ }
++ if (this_cpu_has(X86_FEATURE_PCID))
++ kaiser_flush_tlb_on_return_to_user();
+ native_write_cr3(native_read_cr3());
+ }
+
+ static inline void __native_flush_tlb_global(void)
+ {
++#ifdef CONFIG_KAISER
++ /* Globals are not used at all */
++ __native_flush_tlb();
++#else
+ unsigned long flags;
+ unsigned long cr4;
+
+- if (static_cpu_has(X86_FEATURE_INVPCID)) {
++ if (this_cpu_has(X86_FEATURE_INVPCID)) {
+ /*
+ * Using INVPCID is considerably faster than a pair of writes
+ * to CR4 sandwiched inside an IRQ flag save/restore.
++ *
++ * Note, this works with CR4.PCIDE=0 or 1.
+ */
+ invpcid_flush_all();
+ return;
+@@ -97,11 +129,39 @@ static inline void __native_flush_tlb_gl
+ native_write_cr4(cr4);
+
+ raw_local_irq_restore(flags);
++#endif
+ }
+
+ static inline void __native_flush_tlb_single(unsigned long addr)
+ {
+- asm volatile("invlpg (%0)" ::"r" (addr) : "memory");
++ /*
++ * SIMICS #GP's if you run INVPCID with type 2/3
++ * and X86_CR4_PCIDE clear. Shame!
++ *
++ * The ASIDs used below are hard-coded. But, we must not
++ * call invpcid(type=1/2) before CR4.PCIDE=1. Just call
++ * invlpg in the case we are called early.
++ */
++
++ if (!this_cpu_has(X86_FEATURE_INVPCID_SINGLE)) {
++ if (this_cpu_has(X86_FEATURE_PCID))
++ kaiser_flush_tlb_on_return_to_user();
++ asm volatile("invlpg (%0)" ::"r" (addr) : "memory");
++ return;
++ }
++ /* Flush the address out of both PCIDs. */
++ /*
++ * An optimization here might be to determine addresses
++ * that are only kernel-mapped and only flush the kernel
++ * ASID. But, userspace flushes are probably much more
++ * important performance-wise.
++ *
++ * Make sure to do only a single invpcid when KAISER is
++ * disabled and we have only a single ASID.
++ */
++ if (X86_CR3_PCID_ASID_KERN != X86_CR3_PCID_ASID_USER)
++ invpcid_flush_one(X86_CR3_PCID_ASID_KERN, addr);
++ invpcid_flush_one(X86_CR3_PCID_ASID_USER, addr);
+ }
+
+ static inline void __flush_tlb_all(void)
+--- a/arch/x86/kernel/cpu/common.c
++++ b/arch/x86/kernel/cpu/common.c
+@@ -84,7 +84,7 @@ static const struct cpu_dev __cpuinitcon
+
+ static const struct cpu_dev *this_cpu __cpuinitdata = &default_cpu;
+
+-DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
++DEFINE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(struct gdt_page, gdt_page) = { .gdt = {
+ #ifdef CONFIG_X86_64
+ /*
+ * We need valid kernel segments for data and code in long mode too
+@@ -320,6 +320,19 @@ static void setup_pcid(struct cpuinfo_x8
+ * SDM says that it can't be enabled in 32-bit mode.
+ */
+ set_in_cr4(X86_CR4_PCIDE);
++ /*
++ * INVPCID has two "groups" of types:
++ * 1/2: Invalidate an individual address
++ * 3/4: Invalidate all contexts
++ *
++ * 1/2 take a PCID, but 3/4 do not. So, 3/4
++ * ignore the PCID argument in the descriptor.
++ * But, we have to be careful not to call 1/2
++ * with an actual non-zero PCID in them before
++ * we do the above set_in_cr4().
++ */
++ if (cpu_has(c, X86_FEATURE_INVPCID))
++ set_cpu_cap(c, X86_FEATURE_INVPCID_SINGLE);
+ }
+ #else
+ /*
+@@ -617,6 +630,7 @@ void __cpuinit cpu_detect(struct cpuinfo
+ c->x86_cache_alignment = c->x86_clflush_size;
+ }
+ }
++ kaiser_setup_pcid();
+ }
+
+ void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c)
+@@ -1117,7 +1131,7 @@ static const unsigned int exception_stac
+ [DEBUG_STACK - 1] = DEBUG_STKSZ
+ };
+
+-static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks
++DEFINE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(char, exception_stacks
+ [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]);
+
+ /* May not be marked __init: used by software suspend */
+--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
++++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
+@@ -2,11 +2,15 @@
+ #include <linux/types.h>
+ #include <linux/slab.h>
+
++#include <asm/kaiser.h>
+ #include <asm/perf_event.h>
+ #include <asm/insn.h>
+
+ #include "perf_event.h"
+
++static
++DEFINE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(struct debug_store, cpu_debug_store);
++
+ /* The size of a BTS record in bytes: */
+ #define BTS_RECORD_SIZE 24
+
+@@ -147,6 +151,39 @@ void fini_debug_store_on_cpu(int cpu)
+ wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA, 0, 0);
+ }
+
++static void *dsalloc(size_t size, gfp_t flags, int node)
++{
++#ifdef CONFIG_KAISER
++ unsigned int order = get_order(size);
++ struct page *page;
++ unsigned long addr;
++
++ page = alloc_pages_node(node, flags | __GFP_ZERO, order);
++ if (!page)
++ return NULL;
++ addr = (unsigned long)page_address(page);
++ if (kaiser_add_mapping(addr, size, __PAGE_KERNEL) < 0) {
++ __free_pages(page, order);
++ addr = 0;
++ }
++ return (void *)addr;
++#else
++ return kmalloc_node(size, flags | __GFP_ZERO, node);
++#endif
++}
++
++static void dsfree(const void *buffer, size_t size)
++{
++#ifdef CONFIG_KAISER
++ if (!buffer)
++ return;
++ kaiser_remove_mapping((unsigned long)buffer, size);
++ free_pages((unsigned long)buffer, get_order(size));
++#else
++ kfree(buffer);
++#endif
++}
++
+ static int alloc_pebs_buffer(int cpu)
+ {
+ struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
+@@ -157,7 +194,7 @@ static int alloc_pebs_buffer(int cpu)
+ if (!x86_pmu.pebs)
+ return 0;
+
+- buffer = kmalloc_node(PEBS_BUFFER_SIZE, GFP_KERNEL | __GFP_ZERO, node);
++ buffer = dsalloc(PEBS_BUFFER_SIZE, GFP_KERNEL, node);
+ if (unlikely(!buffer))
+ return -ENOMEM;
+
+@@ -181,7 +218,7 @@ static void release_pebs_buffer(int cpu)
+ if (!ds || !x86_pmu.pebs)
+ return;
+
+- kfree((void *)(unsigned long)ds->pebs_buffer_base);
++ dsfree((void *)(unsigned long)ds->pebs_buffer_base, PEBS_BUFFER_SIZE);
+ ds->pebs_buffer_base = 0;
+ }
+
+@@ -195,7 +232,7 @@ static int alloc_bts_buffer(int cpu)
+ if (!x86_pmu.bts)
+ return 0;
+
+- buffer = kmalloc_node(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_ZERO, node);
++ buffer = dsalloc(BTS_BUFFER_SIZE, GFP_KERNEL, node);
+ if (unlikely(!buffer))
+ return -ENOMEM;
+
+@@ -219,19 +256,15 @@ static void release_bts_buffer(int cpu)
+ if (!ds || !x86_pmu.bts)
+ return;
+
+- kfree((void *)(unsigned long)ds->bts_buffer_base);
++ dsfree((void *)(unsigned long)ds->bts_buffer_base, BTS_BUFFER_SIZE);
+ ds->bts_buffer_base = 0;
+ }
+
+ static int alloc_ds_buffer(int cpu)
+ {
+- int node = cpu_to_node(cpu);
+- struct debug_store *ds;
+-
+- ds = kmalloc_node(sizeof(*ds), GFP_KERNEL | __GFP_ZERO, node);
+- if (unlikely(!ds))
+- return -ENOMEM;
++ struct debug_store *ds = per_cpu_ptr(&cpu_debug_store, cpu);
+
++ memset(ds, 0, sizeof(*ds));
+ per_cpu(cpu_hw_events, cpu).ds = ds;
+
+ return 0;
+@@ -245,7 +278,6 @@ static void release_ds_buffer(int cpu)
+ return;
+
+ per_cpu(cpu_hw_events, cpu).ds = NULL;
+- kfree(ds);
+ }
+
+ void release_ds_buffers(void)
+--- a/arch/x86/kernel/entry_64.S
++++ b/arch/x86/kernel/entry_64.S
+@@ -55,6 +55,7 @@
+ #include <asm/percpu.h>
+ #include <asm/asm.h>
+ #include <asm/pgtable_types.h>
++#include <asm/kaiser.h>
+
+ /* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
+ #include <linux/elf-em.h>
+@@ -328,6 +329,7 @@ ENTRY(save_args)
+ testl $3, CS(%rdi)
+ je 1f
+ SWAPGS
++ SWITCH_KERNEL_CR3
+ /*
+ * irq_count is used to check if a CPU is already on an interrupt stack
+ * or not. While this is essentially redundant with preempt_count it is
+@@ -366,6 +368,12 @@ END(save_rest)
+
+ /* save complete stack frame */
+ .pushsection .kprobes.text, "ax"
++/*
++ * Return: ebx=0: needs swapgs but not SWITCH_USER_CR3 in paranoid_exit
++ * ebx=1: needs neither swapgs nor SWITCH_USER_CR3 in paranoid_exit
++ * ebx=2: needs both swapgs and SWITCH_USER_CR3 in paranoid_exit
++ * ebx=3: needs SWITCH_USER_CR3 but not swapgs in paranoid_exit
++ */
+ ENTRY(save_paranoid)
+ XCPT_FRAME offset=ORIG_RAX-R15+8
+ cld
+@@ -391,7 +399,25 @@ ENTRY(save_paranoid)
+ js 1f /* negative -> in kernel */
+ SWAPGS
+ xorl %ebx,%ebx
+-1: ret
++1:
++#ifdef CONFIG_KAISER
++ /*
++ * We might have come in between a swapgs and a SWITCH_KERNEL_CR3
++ * on entry, or between a SWITCH_USER_CR3 and a swapgs on exit.
++ * Do a conditional SWITCH_KERNEL_CR3: this could safely be done
++ * unconditionally, but we need to find out whether the reverse
++ * should be done on return (conveyed to paranoid_exit in %ebx).
++ */
++ movq %cr3, %rax
++ testl $KAISER_SHADOW_PGD_OFFSET, %eax
++ jz 2f
++ orl $2, %ebx
++ andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), %rax
++ orq x86_cr3_pcid_noflush, %rax
++ movq %rax, %cr3
++2:
++#endif
++ ret
+ CFI_ENDPROC
+ END(save_paranoid)
+ .popsection
+@@ -468,6 +494,7 @@ ENTRY(system_call)
+ CFI_REGISTER rip,rcx
+ /*CFI_REGISTER rflags,r11*/
+ SWAPGS_UNSAFE_STACK
++ SWITCH_KERNEL_CR3_NO_STACK
+ /*
+ * A hypervisor implementation might want to use a label
+ * after the swapgs, so that it can do the swapgs
+@@ -519,6 +546,14 @@ sysret_check:
+ CFI_REGISTER rip,rcx
+ RESTORE_ARGS 1,-ARG_SKIP,0
+ /*CFI_REGISTER rflags,r11*/
++ /*
++ * This opens a window where we have a user CR3, but are
++ * running in the kernel. This makes using the CS
++ * register useless for telling whether or not we need to
++ * switch CR3 in NMIs. Normal interrupts are OK because
++ * they are off here.
++ */
++ SWITCH_USER_CR3
+ movq PER_CPU_VAR(old_rsp), %rsp
+ USERGS_SYSRET64
+
+@@ -858,6 +893,14 @@ retint_swapgs: /* return to user-space
+ */
+ DISABLE_INTERRUPTS(CLBR_ANY)
+ TRACE_IRQS_IRETQ
++ /*
++ * This opens a window where we have a user CR3, but are
++ * running in the kernel. This makes using the CS
++ * register useless for telling whether or not we need to
++ * switch CR3 in NMIs. Normal interrupts are OK because
++ * they are off here.
++ */
++ SWITCH_USER_CR3
+ SWAPGS
+ jmp restore_args
+
+@@ -898,6 +941,7 @@ native_irq_return_ldt:
+ pushq_cfi %rax
+ pushq_cfi %rdi
+ SWAPGS
++ SWITCH_KERNEL_CR3
+ movq PER_CPU_VAR(espfix_waddr),%rdi
+ movq %rax,(0*8)(%rdi) /* RAX */
+ movq (2*8)(%rsp),%rax /* RIP */
+@@ -913,6 +957,7 @@ native_irq_return_ldt:
+ andl $0xffff0000,%eax
+ popq_cfi %rdi
+ orq PER_CPU_VAR(espfix_stack),%rax
++ SWITCH_USER_CR3
+ SWAPGS
+ movq %rax,%rsp
+ popq_cfi %rax
+@@ -1447,30 +1492,40 @@ paranoidzeroentry machine_check *machine
+ * is fundamentally NMI-unsafe. (we cannot change the soft and
+ * hard flags at once, atomically)
+ */
+-
+- /* ebx: no swapgs flag */
++/*
++ * On entry: ebx=0: needs swapgs but not SWITCH_USER_CR3
++ * ebx=1: needs neither swapgs nor SWITCH_USER_CR3
++ * ebx=2: needs both swapgs and SWITCH_USER_CR3
++ * ebx=3: needs SWITCH_USER_CR3 but not swapgs
++ */
+ ENTRY(paranoid_exit)
+ DEFAULT_FRAME
+ DISABLE_INTERRUPTS(CLBR_NONE)
+ TRACE_IRQS_OFF
+- testl %ebx,%ebx /* swapgs needed? */
+- jnz paranoid_restore
+- testl $3,CS(%rsp)
+- jnz paranoid_userspace
+-paranoid_swapgs:
++ movq %rbx, %r12 /* paranoid_userspace uses %ebx */
++ testl $3, CS(%rsp)
++ jnz paranoid_userspace
++paranoid_kernel:
++ movq %r12, %rbx /* restore after paranoid_userspace */
+ TRACE_IRQS_IRETQ 0
++#ifdef CONFIG_KAISER
++ testl $2, %ebx /* SWITCH_USER_CR3 needed? */
++ jz paranoid_exit_no_switch
++ SWITCH_USER_CR3
++paranoid_exit_no_switch:
++#endif
++ testl $1, %ebx /* swapgs needed? */
++ jnz paranoid_exit_no_swapgs
+ SWAPGS_UNSAFE_STACK
++paranoid_exit_no_swapgs:
+ RESTORE_ALL 8
+- jmp irq_return
+-paranoid_restore:
+- TRACE_IRQS_IRETQ 0
+- RESTORE_ALL 8
+- jmp irq_return
++ jmp irq_return
++
+ paranoid_userspace:
+ GET_THREAD_INFO(%rcx)
+ movl TI_flags(%rcx),%ebx
+ andl $_TIF_WORK_MASK,%ebx
+- jz paranoid_swapgs
++ jz paranoid_kernel
+ movq %rsp,%rdi /* &pt_regs */
+ call sync_regs
+ movq %rax,%rsp /* switch stack for scheduling */
+@@ -1518,6 +1573,13 @@ ENTRY(error_entry)
+ movq %r13, R13+8(%rsp)
+ movq %r14, R14+8(%rsp)
+ movq %r15, R15+8(%rsp)
++ /*
++ * error_entry() always returns with a kernel gsbase and
++ * CR3. We must also have a kernel CR3/gsbase before
++ * calling TRACE_IRQS_*. Just unconditionally switch to
++ * the kernel CR3 here.
++ */
++ SWITCH_KERNEL_CR3
+ xorl %ebx,%ebx
+ testl $3,CS+8(%rsp)
+ je error_kernelspace
+@@ -1664,6 +1726,7 @@ ENTRY(nmi)
+ */
+
+ SWAPGS_UNSAFE_STACK
++ SWITCH_KERNEL_CR3_NO_STACK
+ cld
+ movq %rsp, %rdx
+ movq PER_CPU_VAR(kernel_stack), %rsp
+@@ -1705,6 +1768,7 @@ ENTRY(nmi)
+ * work, because we don't want to enable interrupts. Fortunately,
+ * do_nmi doesn't modify pt_regs.
+ */
++ SWITCH_USER_CR3
+ SWAPGS
+
+ /*
+@@ -1863,10 +1927,15 @@ restart_nmi:
+ je 1f
+ movq %r12, %cr2
+ 1:
+-
+- testl %ebx,%ebx /* swapgs needed? */
+- jnz nmi_restore
++
++#ifdef CONFIG_KAISER
++ testl $2, %ebx /* SWITCH_USER_CR3 needed? */
++ jz nmi_swapgs
++ SWITCH_USER_CR3
+ nmi_swapgs:
++#endif
++ testl $1,%ebx /* swapgs needed? */
++ jnz nmi_restore
+ SWAPGS_UNSAFE_STACK
+ nmi_restore:
+ RESTORE_ALL 8
+--- a/arch/x86/kernel/espfix_64.c
++++ b/arch/x86/kernel/espfix_64.c
+@@ -40,6 +40,7 @@
+ #include <asm/pgtable.h>
+ #include <asm/pgalloc.h>
+ #include <asm/setup.h>
++#include <asm/kaiser.h>
+
+ /*
+ * Note: we only need 6*8 = 48 bytes for the espfix stack, but round
+@@ -128,7 +129,15 @@ void __init init_espfix_bsp(void)
+ /* Install the espfix pud into the kernel page directory */
+ pgd_p = &init_level4_pgt[pgd_index(ESPFIX_BASE_ADDR)];
+ pgd_populate(&init_mm, pgd_p, (pud_t *)espfix_pud_page);
+-
++ /*
++ * Just copy the top-level PGD that is mapping the espfix
++ * area to ensure it is mapped into the shadow user page
++ * tables.
++ */
++#ifdef CONFIG_KAISER
++ set_pgd(native_get_shadow_pgd(pgd_p),
++ __pgd(_KERNPG_TABLE | __pa((pud_t *)espfix_pud_page)));
++#endif
+ /* Randomize the locations */
+ init_espfix_random();
+
+--- a/arch/x86/kernel/head_64.S
++++ b/arch/x86/kernel/head_64.S
+@@ -343,6 +343,27 @@ early_idt_ripmsg:
+ .balign PAGE_SIZE; \
+ ENTRY(name)
+
++#ifdef CONFIG_KAISER
++/*
++ * Each PGD needs to be 8k long and 8k aligned. We do not
++ * ever go out to userspace with these, so we do not
++ * strictly *need* the second page, but this allows us to
++ * have a single set_pgd() implementation that does not
++ * need to worry about whether it has 4k or 8k to work
++ * with.
++ *
++ * This ensures PGDs are 8k long:
++ */
++#define KAISER_USER_PGD_FILL 512
++/* This ensures they are 8k-aligned: */
++#define NEXT_PGD_PAGE(name) \
++ .balign 2 * PAGE_SIZE; \
++GLOBAL(name)
++#else
++#define NEXT_PGD_PAGE(name) NEXT_PAGE(name)
++#define KAISER_USER_PGD_FILL 0
++#endif
++
+ /* Automate the creation of 1 to 1 mapping pmd entries */
+ #define PMDS(START, PERM, COUNT) \
+ i = 0 ; \
+@@ -358,13 +379,14 @@ ENTRY(name)
+ * 0xffffffff80000000 to physical address 0x000000. (always using
+ * 2Mbyte large pages provided by PAE mode)
+ */
+-NEXT_PAGE(init_level4_pgt)
++NEXT_PGD_PAGE(init_level4_pgt)
+ .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
+ .org init_level4_pgt + L4_PAGE_OFFSET*8, 0
+ .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
+ .org init_level4_pgt + L4_START_KERNEL*8, 0
+ /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */
+ .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
++ .fill KAISER_USER_PGD_FILL,8,0
+
+ NEXT_PAGE(level3_ident_pgt)
+ .quad level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
+@@ -390,6 +412,7 @@ NEXT_PAGE(level2_ident_pgt)
+ * Don't set NX because code runs from these pages.
+ */
+ PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD)
++ .fill KAISER_USER_PGD_FILL,8,0
+
+ NEXT_PAGE(level2_kernel_pgt)
+ /*
+--- a/arch/x86/kernel/init_task.c
++++ b/arch/x86/kernel/init_task.c
+@@ -38,5 +38,5 @@ EXPORT_SYMBOL(init_task);
+ * section. Since TSS's are completely CPU-local, we want them
+ * on exact cacheline boundaries, to eliminate cacheline ping-pong.
+ */
+-DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss) = INIT_TSS;
++DEFINE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(struct tss_struct, init_tss) = INIT_TSS;
+
+--- a/arch/x86/kernel/irqinit.c
++++ b/arch/x86/kernel/irqinit.c
+@@ -85,7 +85,7 @@ static struct irqaction irq2 = {
+ .flags = IRQF_NO_THREAD,
+ };
+
+-DEFINE_PER_CPU(vector_irq_t, vector_irq) = {
++DEFINE_PER_CPU_USER_MAPPED(vector_irq_t, vector_irq) = {
+ [0 ... NR_VECTORS - 1] = -1,
+ };
+
+--- a/arch/x86/kernel/ldt.c
++++ b/arch/x86/kernel/ldt.c
+@@ -15,6 +15,7 @@
+ #include <linux/slab.h>
+ #include <linux/vmalloc.h>
+ #include <linux/uaccess.h>
++#include <linux/kaiser.h>
+
+ #include <asm/system.h>
+ #include <asm/ldt.h>
+@@ -34,11 +35,21 @@ static void flush_ldt(void *current_mm)
+ set_ldt(ldt->entries, ldt->size);
+ }
+
++static void __free_ldt_struct(struct ldt_struct *ldt)
++{
++ if (ldt->size * LDT_ENTRY_SIZE > PAGE_SIZE)
++ vfree(ldt->entries);
++ else
++ free_page((unsigned long)ldt->entries);
++ kfree(ldt);
++}
++
+ /* The caller must call finalize_ldt_struct on the result. LDT starts zeroed. */
+ static struct ldt_struct *alloc_ldt_struct(int size)
+ {
+ struct ldt_struct *new_ldt;
+ int alloc_size;
++ int ret;
+
+ if (size > LDT_ENTRIES)
+ return NULL;
+@@ -65,7 +76,13 @@ static struct ldt_struct *alloc_ldt_stru
+ return NULL;
+ }
+
++ ret = kaiser_add_mapping((unsigned long)new_ldt->entries, alloc_size,
++ __PAGE_KERNEL);
+ new_ldt->size = size;
++ if (ret) {
++ __free_ldt_struct(new_ldt);
++ return NULL;
++ }
+ return new_ldt;
+ }
+
+@@ -95,12 +112,10 @@ static void free_ldt_struct(struct ldt_s
+ if (likely(!ldt))
+ return;
+
++ kaiser_remove_mapping((unsigned long)ldt->entries,
++ ldt->size * LDT_ENTRY_SIZE);
+ paravirt_free_ldt(ldt->entries, ldt->size);
+- if (ldt->size * LDT_ENTRY_SIZE > PAGE_SIZE)
+- vfree(ldt->entries);
+- else
+- kfree(ldt->entries);
+- kfree(ldt);
++ __free_ldt_struct(ldt);
+ }
+
+ /*
+--- a/arch/x86/kernel/process_64.c
++++ b/arch/x86/kernel/process_64.c
+@@ -53,7 +53,7 @@
+
+ asmlinkage extern void ret_from_fork(void);
+
+-DEFINE_PER_CPU(unsigned long, old_rsp);
++DEFINE_PER_CPU_USER_MAPPED(unsigned long, old_rsp);
+ static DEFINE_PER_CPU(unsigned char, is_idle);
+
+ static ATOMIC_NOTIFIER_HEAD(idle_notifier);
+--- /dev/null
++++ b/arch/x86/mm/kaiser.c
+@@ -0,0 +1,385 @@
++#include <linux/bug.h>
++#include <linux/kernel.h>
++#include <linux/errno.h>
++#include <linux/string.h>
++#include <linux/types.h>
++#include <linux/bug.h>
++#include <linux/init.h>
++#include <linux/interrupt.h>
++#include <linux/spinlock.h>
++#include <linux/mm.h>
++#include <linux/module.h>
++#include <linux/uaccess.h>
++
++extern struct mm_struct init_mm;
++
++#include <asm/kaiser.h>
++#include <asm/tlbflush.h> /* to verify its kaiser declarations */
++#include <asm/pgtable.h>
++#include <asm/pgalloc.h>
++#include <asm/desc.h>
++#include <asm/asm-offsets.h>
++
++#ifdef CONFIG_KAISER
++DEFINE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup);
++
++extern char __sched_text_start[], __irqentry_text_start[], __irqentry_text_end[];
++
++/*
++ * These can have bit 63 set, so we can not just use a plain "or"
++ * instruction to get their value or'd into CR3. It would take
++ * another register. So, we use a memory reference to these instead.
++ *
++ * This is also handy because systems that do not support PCIDs
++ * just end up or'ing a 0 into their CR3, which does no harm.
++ */
++unsigned long x86_cr3_pcid_noflush __read_mostly;
++DEFINE_PER_CPU(unsigned long, x86_cr3_pcid_user);
++
++/*
++ * At runtime, the only things we map are some things for CPU
++ * hotplug, and stacks for new processes. No two CPUs will ever
++ * be populating the same addresses, so we only need to ensure
++ * that we protect between two CPUs trying to allocate and
++ * populate the same page table page.
++ *
++ * Only take this lock when doing a set_p[4um]d(), but it is not
++ * needed for doing a set_pte(). We assume that only the *owner*
++ * of a given allocation will be doing this for _their_
++ * allocation.
++ *
++ * This ensures that once a system has been running for a while
++ * and there have been stacks all over and these page tables
++ * are fully populated, there will be no further acquisitions of
++ * this lock.
++ */
++static DEFINE_SPINLOCK(shadow_table_allocation_lock);
++
++/*
++ * Returns -1 on error.
++ */
++static inline unsigned long get_pa_from_mapping(unsigned long vaddr)
++{
++ pgd_t *pgd;
++ pud_t *pud;
++ pmd_t *pmd;
++ pte_t *pte;
++
++ pgd = pgd_offset_k(vaddr);
++ /*
++ * We made all the kernel PGDs present in kaiser_init().
++ * We expect them to stay that way.
++ */
++ BUG_ON(pgd_none(*pgd));
++ /*
++ * PGDs are either 512GB or 128TB on all x86_64
++ * configurations. We don't handle these.
++ */
++ BUG_ON(pgd_large(*pgd));
++
++ pud = pud_offset(pgd, vaddr);
++ if (pud_none(*pud)) {
++ WARN_ON_ONCE(1);
++ return -1;
++ }
++
++ if (pud_large(*pud))
++ return (pud_pfn(*pud) << PAGE_SHIFT) | (vaddr & ~PUD_PAGE_MASK);
++
++ pmd = pmd_offset(pud, vaddr);
++ if (pmd_none(*pmd)) {
++ WARN_ON_ONCE(1);
++ return -1;
++ }
++
++ if (pmd_large(*pmd))
++ return (pmd_pfn(*pmd) << PAGE_SHIFT) | (vaddr & ~PMD_PAGE_MASK);
++
++ pte = pte_offset_kernel(pmd, vaddr);
++ if (pte_none(*pte)) {
++ WARN_ON_ONCE(1);
++ return -1;
++ }
++
++ return (pte_pfn(*pte) << PAGE_SHIFT) | (vaddr & ~PAGE_MASK);
++}
++
++/*
++ * This is a relatively normal page table walk, except that it
++ * also tries to allocate page tables pages along the way.
++ *
++ * Returns a pointer to a PTE on success, or NULL on failure.
++ */
++static pte_t *kaiser_pagetable_walk(unsigned long address)
++{
++ pmd_t *pmd;
++ pud_t *pud;
++ pgd_t *pgd = native_get_shadow_pgd(pgd_offset_k(address));
++ gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
++
++ if (pgd_none(*pgd)) {
++ WARN_ONCE(1, "All shadow pgds should have been populated");
++ return NULL;
++ }
++ BUILD_BUG_ON(pgd_large(*pgd) != 0);
++
++ pud = pud_offset(pgd, address);
++ /* The shadow page tables do not use large mappings: */
++ if (pud_large(*pud)) {
++ WARN_ON(1);
++ return NULL;
++ }
++ if (pud_none(*pud)) {
++ unsigned long new_pmd_page = __get_free_page(gfp);
++ if (!new_pmd_page)
++ return NULL;
++ spin_lock(&shadow_table_allocation_lock);
++ if (pud_none(*pud)) {
++ set_pud(pud, __pud(_KERNPG_TABLE | __pa(new_pmd_page)));
++ __inc_zone_page_state(virt_to_page((void *)
++ new_pmd_page), NR_KAISERTABLE);
++ } else
++ free_page(new_pmd_page);
++ spin_unlock(&shadow_table_allocation_lock);
++ }
++
++ pmd = pmd_offset(pud, address);
++ /* The shadow page tables do not use large mappings: */
++ if (pmd_large(*pmd)) {
++ WARN_ON(1);
++ return NULL;
++ }
++ if (pmd_none(*pmd)) {
++ unsigned long new_pte_page = __get_free_page(gfp);
++ if (!new_pte_page)
++ return NULL;
++ spin_lock(&shadow_table_allocation_lock);
++ if (pmd_none(*pmd)) {
++ set_pmd(pmd, __pmd(_KERNPG_TABLE | __pa(new_pte_page)));
++ __inc_zone_page_state(virt_to_page((void *)
++ new_pte_page), NR_KAISERTABLE);
++ } else
++ free_page(new_pte_page);
++ spin_unlock(&shadow_table_allocation_lock);
++ }
++
++ return pte_offset_kernel(pmd, address);
++}
++
++int kaiser_add_user_map(const void *__start_addr, unsigned long size,
++ unsigned long flags)
++{
++ int ret = 0;
++ pte_t *pte;
++ unsigned long start_addr = (unsigned long )__start_addr;
++ unsigned long address = start_addr & PAGE_MASK;
++ unsigned long end_addr = PAGE_ALIGN(start_addr + size);
++ unsigned long target_address;
++
++ for (; address < end_addr; address += PAGE_SIZE) {
++ target_address = get_pa_from_mapping(address);
++ if (target_address == -1) {
++ ret = -EIO;
++ break;
++ }
++ pte = kaiser_pagetable_walk(address);
++ if (!pte) {
++ ret = -ENOMEM;
++ break;
++ }
++ if (pte_none(*pte)) {
++ set_pte(pte, __pte(flags | target_address));
++ } else {
++ pte_t tmp;
++ set_pte(&tmp, __pte(flags | target_address));
++ WARN_ON_ONCE(!pte_same(*pte, tmp));
++ }
++ }
++ return ret;
++}
++
++static int kaiser_add_user_map_ptrs(const void *start, const void *end, unsigned long flags)
++{
++ unsigned long size = end - start;
++
++ return kaiser_add_user_map(start, size, flags);
++}
++
++/*
++ * Ensure that the top level of the (shadow) page tables are
++ * entirely populated. This ensures that all processes that get
++ * forked have the same entries. This way, we do not have to
++ * ever go set up new entries in older processes.
++ *
++ * Note: we never free these, so there are no updates to them
++ * after this.
++ */
++static void __init kaiser_init_all_pgds(void)
++{
++ pgd_t *pgd;
++ int i = 0;
++
++ pgd = native_get_shadow_pgd(pgd_offset_k((unsigned long )0));
++ for (i = PTRS_PER_PGD / 2; i < PTRS_PER_PGD; i++) {
++ pgd_t new_pgd;
++ pud_t *pud = pud_alloc_one(&init_mm,
++ PAGE_OFFSET + i * PGDIR_SIZE);
++ if (!pud) {
++ WARN_ON(1);
++ break;
++ }
++ inc_zone_page_state(virt_to_page(pud), NR_KAISERTABLE);
++ new_pgd = __pgd(_KERNPG_TABLE |__pa(pud));
++ /*
++ * Make sure not to stomp on some other pgd entry.
++ */
++ if (!pgd_none(pgd[i])) {
++ WARN_ON(1);
++ continue;
++ }
++ set_pgd(pgd + i, new_pgd);
++ }
++}
++
++#define kaiser_add_user_map_early(start, size, flags) do { \
++ int __ret = kaiser_add_user_map(start, size, flags); \
++ WARN_ON(__ret); \
++} while (0)
++
++#define kaiser_add_user_map_ptrs_early(start, end, flags) do { \
++ int __ret = kaiser_add_user_map_ptrs(start, end, flags); \
++ WARN_ON(__ret); \
++} while (0)
++
++/*
++ * If anything in here fails, we will likely die on one of the
++ * first kernel->user transitions and init will die. But, we
++ * will have most of the kernel up by then and should be able to
++ * get a clean warning out of it. If we BUG_ON() here, we run
++ * the risk of being before we have good console output.
++ */
++void __init kaiser_init(void)
++{
++ int cpu;
++
++ kaiser_init_all_pgds();
++
++ for_each_possible_cpu(cpu) {
++ void *percpu_vaddr = __per_cpu_user_mapped_start +
++ per_cpu_offset(cpu);
++ unsigned long percpu_sz = __per_cpu_user_mapped_end -
++ __per_cpu_user_mapped_start;
++ kaiser_add_user_map_early(percpu_vaddr, percpu_sz,
++ __PAGE_KERNEL);
++ }
++
++ /*
++ * Map the entry/exit text section (and things it transitively needs
++ * before switching CR3), which is needed at switches from user to and
++ * from kernel.
++ */
++ kaiser_add_user_map_ptrs_early(__sched_text_start, __entry_text_end,
++ __PAGE_KERNEL_RX);
++#ifdef CONFIG_FUNCTION_GRAPH_TRACER
++ kaiser_add_user_map_ptrs_early(__irqentry_text_start,
++ __irqentry_text_end,
++ __PAGE_KERNEL_RX);
++#endif
++ kaiser_add_user_map_early((void *)idt_descr.address,
++ sizeof(gate_desc) * NR_VECTORS,
++ __PAGE_KERNEL_RO);
++ kaiser_add_user_map_early(&x86_cr3_pcid_noflush,
++ sizeof(x86_cr3_pcid_noflush),
++ __PAGE_KERNEL);
++}
++
++/* Add a mapping to the shadow mapping, and synchronize the mappings */
++int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags)
++{
++ return kaiser_add_user_map((const void *)addr, size, flags);
++}
++
++void kaiser_remove_mapping(unsigned long start, unsigned long size)
++{
++ unsigned long end = start + size;
++ unsigned long addr;
++ pte_t *pte;
++
++ for (addr = start; addr < end; addr += PAGE_SIZE) {
++ pte = kaiser_pagetable_walk(addr);
++ if (pte)
++ set_pte(pte, __pte(0));
++ }
++}
++
++/*
++ * Page table pages are page-aligned. The lower half of the top
++ * level is used for userspace and the top half for the kernel.
++ * This returns true for user pages that need to get copied into
++ * both the user and kernel copies of the page tables, and false
++ * for kernel pages that should only be in the kernel copy.
++ */
++static inline bool is_userspace_pgd(pgd_t *pgdp)
++{
++ return ((unsigned long)pgdp % PAGE_SIZE) < (PAGE_SIZE / 2);
++}
++
++pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd)
++{
++ /*
++ * Do we need to also populate the shadow pgd? Check _PAGE_USER to
++ * skip cases like kexec and EFI which make temporary low mappings.
++ */
++ if (pgd.pgd & _PAGE_USER) {
++ if (is_userspace_pgd(pgdp)) {
++ native_get_shadow_pgd(pgdp)->pgd = pgd.pgd;
++ /*
++ * Even if the entry is *mapping* userspace, ensure
++ * that userspace can not use it. This way, if we
++ * get out to userspace running on the kernel CR3,
++ * userspace will crash instead of running.
++ */
++ pgd.pgd |= _PAGE_NX;
++ }
++ } else if (!pgd.pgd) {
++ /*
++ * pgd_clear() cannot check _PAGE_USER, and is even used to
++ * clear corrupted pgd entries: so just rely on cases like
++ * kexec and EFI never to be using pgd_clear().
++ */
++ if (!WARN_ON_ONCE((unsigned long)pgdp & PAGE_SIZE) &&
++ is_userspace_pgd(pgdp))
++ native_get_shadow_pgd(pgdp)->pgd = pgd.pgd;
++ }
++ return pgd;
++}
++
++void kaiser_setup_pcid(void)
++{
++ unsigned long kern_cr3 = 0;
++ unsigned long user_cr3 = KAISER_SHADOW_PGD_OFFSET;
++
++ if (this_cpu_has(X86_FEATURE_PCID)) {
++ kern_cr3 |= X86_CR3_PCID_KERN_NOFLUSH;
++ user_cr3 |= X86_CR3_PCID_USER_NOFLUSH;
++ }
++ /*
++ * These variables are used by the entry/exit
++ * code to change PCID and pgd and TLB flushing.
++ */
++ x86_cr3_pcid_noflush = kern_cr3;
++ this_cpu_write(x86_cr3_pcid_user, user_cr3);
++}
++
++/*
++ * Make a note that this cpu will need to flush USER tlb on return to user.
++ * Caller checks whether this_cpu_has(X86_FEATURE_PCID) before calling:
++ * if cpu does not, then the NOFLUSH bit will never have been set.
++ */
++void kaiser_flush_tlb_on_return_to_user(void)
++{
++ this_cpu_write(x86_cr3_pcid_user,
++ X86_CR3_PCID_USER_FLUSH | KAISER_SHADOW_PGD_OFFSET);
++}
++EXPORT_SYMBOL(kaiser_flush_tlb_on_return_to_user);
++#endif /* CONFIG_KAISER */
+--- a/arch/x86/mm/Makefile
++++ b/arch/x86/mm/Makefile
+@@ -29,3 +29,4 @@ obj-$(CONFIG_NUMA_EMU) += numa_emulatio
+ obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o
+
+ obj-$(CONFIG_MEMTEST) += memtest.o
++obj-$(CONFIG_KAISER) += kaiser.o
+--- a/arch/x86/mm/pgtable.c
++++ b/arch/x86/mm/pgtable.c
+@@ -5,7 +5,7 @@
+ #include <asm/tlb.h>
+ #include <asm/fixmap.h>
+
+-#define PGALLOC_GFP GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO
++#define PGALLOC_GFP (GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO)
+
+ #ifdef CONFIG_HIGHPTE
+ #define PGALLOC_USER_GFP __GFP_HIGHMEM
+@@ -253,12 +253,35 @@ static void pgd_prepopulate_pmd(struct m
+ }
+ }
+
++#ifdef CONFIG_KAISER
++/*
++ * Instead of one pmd, we aquire two pmds. Being order-1, it is
++ * both 8k in size and 8k-aligned. That lets us just flip bit 12
++ * in a pointer to swap between the two 4k halves.
++ */
++#define PGD_ALLOCATION_ORDER 1
++#else
++#define PGD_ALLOCATION_ORDER 0
++#endif
++
++static inline pgd_t *_pgd_alloc(void)
++{
++ /* No __GFP_REPEAT: to avoid page allocation stalls in order-1 case */
++ return (pgd_t *)__get_free_pages(PGALLOC_GFP & ~__GFP_REPEAT,
++ PGD_ALLOCATION_ORDER);
++}
++
++static inline void _pgd_free(pgd_t *pgd)
++{
++ free_pages((unsigned long)pgd, PGD_ALLOCATION_ORDER);
++}
++
+ pgd_t *pgd_alloc(struct mm_struct *mm)
+ {
+ pgd_t *pgd;
+ pmd_t *pmds[PREALLOCATED_PMDS];
+
+- pgd = (pgd_t *)__get_free_page(PGALLOC_GFP);
++ pgd = _pgd_alloc();
+
+ if (pgd == NULL)
+ goto out;
+@@ -288,7 +311,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
+ out_free_pmds:
+ free_pmds(pmds);
+ out_free_pgd:
+- free_page((unsigned long)pgd);
++ _pgd_free(pgd);
+ out:
+ return NULL;
+ }
+@@ -298,7 +321,7 @@ void pgd_free(struct mm_struct *mm, pgd_
+ pgd_mop_up_pmds(mm, pgd);
+ pgd_dtor(pgd);
+ paravirt_pgd_free(mm, pgd);
+- free_page((unsigned long)pgd);
++ _pgd_free(pgd);
+ }
+
+ /*
+--- a/arch/x86/mm/tlb.c
++++ b/arch/x86/mm/tlb.c
+@@ -12,10 +12,43 @@
+ #include <asm/cache.h>
+ #include <asm/apic.h>
+ #include <asm/uv/uv.h>
++#include <asm/kaiser.h>
+
+ DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate)
+ = { &init_mm, 0, };
+
++static void load_new_mm_cr3(pgd_t *pgdir)
++{
++ unsigned long new_mm_cr3 = __pa(pgdir);
++
++#ifdef CONFIG_KAISER
++ if (this_cpu_has(X86_FEATURE_PCID)) {
++ /*
++ * We reuse the same PCID for different tasks, so we must
++ * flush all the entries for the PCID out when we change tasks.
++ * Flush KERN below, flush USER when returning to userspace in
++ * kaiser's SWITCH_USER_CR3 (_SWITCH_TO_USER_CR3) macro.
++ *
++ * invpcid_flush_single_context(X86_CR3_PCID_ASID_USER) could
++ * do it here, but can only be used if X86_FEATURE_INVPCID is
++ * available - and many machines support pcid without invpcid.
++ *
++ * The line below is a no-op: X86_CR3_PCID_KERN_FLUSH is now 0;
++ * but keep that line in there in case something changes.
++ */
++ new_mm_cr3 |= X86_CR3_PCID_KERN_FLUSH;
++ kaiser_flush_tlb_on_return_to_user();
++ }
++#endif /* CONFIG_KAISER */
++
++ /*
++ * Caution: many callers of this function expect
++ * that load_new_mm_cr3() is serializing and orders TLB
++ * fills with respect to the mm_cpumask writes.
++ */
++ write_cr3(new_mm_cr3);
++}
++
+ /*
+ * TLB flushing, formerly SMP-only
+ * c/o Linus Torvalds.
+@@ -65,7 +98,7 @@ void leave_mm(int cpu)
+ BUG();
+ cpumask_clear_cpu(cpu,
+ mm_cpumask(percpu_read(cpu_tlbstate.active_mm)));
+- load_cr3(swapper_pg_dir);
++ load_new_mm_cr3(swapper_pg_dir);
+ }
+ EXPORT_SYMBOL_GPL(leave_mm);
+
+@@ -113,11 +146,10 @@ void switch_mm_irqs_off(struct mm_struct
+ * from next->pgd. TLB fills are special and can happen
+ * due to instruction fetches or for no reason at all,
+ * and neither LOCK nor MFENCE orders them.
+- * Fortunately, load_cr3() is serializing and gives the
+- * ordering guarantee we need.
+- *
++ * Fortunately, load_new_mm_cr3() is serializing
++ * and gives the ordering guarantee we need.
+ */
+- load_cr3(next->pgd);
++ load_new_mm_cr3(next->pgd);
+
+ /* stop flush ipis for the previous mm */
+ cpumask_clear_cpu(cpu, mm_cpumask(prev));
+@@ -136,10 +168,10 @@ void switch_mm_irqs_off(struct mm_struct
+ * tlb flush IPI delivery. We must reload CR3
+ * to make sure to use no freed page tables.
+ *
+- * As above, load_cr3() is serializing and orders TLB
+- * fills with respect to the mm_cpumask write.
++ * As above, load_new_mm_cr3() is serializing and orders
++ * TLB fills with respect to the mm_cpumask write.
+ */
+- load_cr3(next->pgd);
++ load_new_mm_cr3(next->pgd);
+ load_mm_ldt(next);
+ }
+ }
+--- a/include/asm-generic/vmlinux.lds.h
++++ b/include/asm-generic/vmlinux.lds.h
+@@ -695,7 +695,14 @@
+ */
+ #define PERCPU_INPUT(cacheline) \
+ VMLINUX_SYMBOL(__per_cpu_start) = .; \
++ VMLINUX_SYMBOL(__per_cpu_user_mapped_start) = .; \
+ *(.data..percpu..first) \
++ . = ALIGN(cacheline); \
++ *(.data..percpu..user_mapped) \
++ *(.data..percpu..user_mapped..shared_aligned) \
++ . = ALIGN(PAGE_SIZE); \
++ *(.data..percpu..user_mapped..page_aligned) \
++ VMLINUX_SYMBOL(__per_cpu_user_mapped_end) = .; \
+ . = ALIGN(PAGE_SIZE); \
+ *(.data..percpu..page_aligned) \
+ . = ALIGN(cacheline); \
+--- /dev/null
++++ b/include/linux/kaiser.h
+@@ -0,0 +1,52 @@
++#ifndef _LINUX_KAISER_H
++#define _LINUX_KAISER_H
++
++#ifdef CONFIG_KAISER
++#include <asm/kaiser.h>
++
++static inline int kaiser_map_thread_stack(void *stack)
++{
++ /*
++ * Map that page of kernel stack on which we enter from user context.
++ */
++ return kaiser_add_mapping((unsigned long)stack +
++ THREAD_SIZE - PAGE_SIZE, PAGE_SIZE, __PAGE_KERNEL);
++}
++
++static inline void kaiser_unmap_thread_stack(void *stack)
++{
++ /*
++ * Note: may be called even when kaiser_map_thread_stack() failed.
++ */
++ kaiser_remove_mapping((unsigned long)stack +
++ THREAD_SIZE - PAGE_SIZE, PAGE_SIZE);
++}
++#else
++
++/*
++ * These stubs are used whenever CONFIG_KAISER is off, which
++ * includes architectures that support KAISER, but have it disabled.
++ */
++
++static inline void kaiser_init(void)
++{
++}
++static inline int kaiser_add_mapping(unsigned long addr,
++ unsigned long size, unsigned long flags)
++{
++ return 0;
++}
++static inline void kaiser_remove_mapping(unsigned long start,
++ unsigned long size)
++{
++}
++static inline int kaiser_map_thread_stack(void *stack)
++{
++ return 0;
++}
++static inline void kaiser_unmap_thread_stack(void *stack)
++{
++}
++
++#endif /* !CONFIG_KAISER */
++#endif /* _LINUX_KAISER_H */
+--- a/include/linux/mmzone.h
++++ b/include/linux/mmzone.h
+@@ -95,8 +95,9 @@ enum zone_stat_item {
+ NR_SLAB_RECLAIMABLE,
+ NR_SLAB_UNRECLAIMABLE,
+ NR_PAGETABLE, /* used for pagetables */
+- NR_KERNEL_STACK,
+ /* Second 128 byte cacheline */
++ NR_KERNEL_STACK,
++ NR_KAISERTABLE,
+ NR_UNSTABLE_NFS, /* NFS unstable pages */
+ NR_BOUNCE,
+ NR_VMSCAN_WRITE,
+--- a/include/linux/percpu-defs.h
++++ b/include/linux/percpu-defs.h
+@@ -28,6 +28,12 @@
+ (void)__vpp_verify; \
+ } while (0)
+
++#ifdef CONFIG_KAISER
++#define USER_MAPPED_SECTION "..user_mapped"
++#else
++#define USER_MAPPED_SECTION ""
++#endif
++
+ /*
+ * s390 and alpha modules require percpu variables to be defined as
+ * weak to force the compiler to generate GOT based external
+@@ -90,6 +96,12 @@
+ #define DEFINE_PER_CPU(type, name) \
+ DEFINE_PER_CPU_SECTION(type, name, "")
+
++#define DECLARE_PER_CPU_USER_MAPPED(type, name) \
++ DECLARE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION)
++
++#define DEFINE_PER_CPU_USER_MAPPED(type, name) \
++ DEFINE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION)
++
+ /*
+ * Declaration/definition used for per-CPU variables that must come first in
+ * the set of variables.
+@@ -119,6 +131,14 @@
+ DEFINE_PER_CPU_SECTION(type, name, PER_CPU_SHARED_ALIGNED_SECTION) \
+ ____cacheline_aligned_in_smp
+
++#define DECLARE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(type, name) \
++ DECLARE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION PER_CPU_SHARED_ALIGNED_SECTION) \
++ ____cacheline_aligned_in_smp
++
++#define DEFINE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(type, name) \
++ DEFINE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION PER_CPU_SHARED_ALIGNED_SECTION) \
++ ____cacheline_aligned_in_smp
++
+ #define DECLARE_PER_CPU_ALIGNED(type, name) \
+ DECLARE_PER_CPU_SECTION(type, name, PER_CPU_ALIGNED_SECTION) \
+ ____cacheline_aligned
+@@ -137,11 +157,21 @@
+ #define DEFINE_PER_CPU_PAGE_ALIGNED(type, name) \
+ DEFINE_PER_CPU_SECTION(type, name, "..page_aligned") \
+ __aligned(PAGE_SIZE)
++/*
++ * Declaration/definition used for per-CPU variables that must be page aligned and need to be mapped in user mode.
++ */
++#define DECLARE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(type, name) \
++ DECLARE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION"..page_aligned") \
++ __aligned(PAGE_SIZE)
++
++#define DEFINE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(type, name) \
++ DEFINE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION"..page_aligned") \
++ __aligned(PAGE_SIZE)
+
+ /*
+ * Declaration/definition used for per-CPU variables that must be read mostly.
+ */
+-#define DECLARE_PER_CPU_READ_MOSTLY(type, name) \
++#define DECLARE_PER_CPU_READ_MOSTLY(type, name) \
+ DECLARE_PER_CPU_SECTION(type, name, "..readmostly")
+
+ #define DEFINE_PER_CPU_READ_MOSTLY(type, name) \
+--- a/init/main.c
++++ b/init/main.c
+@@ -70,6 +70,7 @@
+ #include <linux/slab.h>
+ #include <linux/perf_event.h>
+ #include <linux/random.h>
++#include <linux/kaiser.h>
+
+ #include <asm/io.h>
+ #include <asm/bugs.h>
+@@ -477,6 +478,7 @@ static void __init mm_init(void)
+ percpu_init_late();
+ pgtable_cache_init();
+ vmalloc_init();
++ kaiser_init();
+ }
+
+ asmlinkage void __init start_kernel(void)
+--- a/kernel/fork.c
++++ b/kernel/fork.c
+@@ -59,6 +59,7 @@
+ #include <linux/tsacct_kern.h>
+ #include <linux/cn_proc.h>
+ #include <linux/freezer.h>
++#include <linux/kaiser.h>
+ #include <linux/delayacct.h>
+ #include <linux/taskstats_kern.h>
+ #include <linux/random.h>
+@@ -137,6 +138,7 @@ static struct thread_info *alloc_thread_
+
+ static inline void free_thread_info(struct thread_info *ti)
+ {
++ kaiser_unmap_thread_stack(ti);
+ free_pages((unsigned long)ti, THREAD_SIZE_ORDER);
+ }
+ #endif
+@@ -286,6 +288,10 @@ static struct task_struct *dup_task_stru
+ if (err)
+ goto out;
+
++ err = kaiser_map_thread_stack(tsk->stack);
++ if (err)
++ goto out;
++
+ setup_thread_stack(tsk, orig);
+ clear_user_return_notifier(tsk);
+ clear_tsk_need_resched(tsk);
+--- a/mm/vmstat.c
++++ b/mm/vmstat.c
+@@ -699,6 +699,7 @@ const char * const vmstat_text[] = {
+ "nr_slab_unreclaimable",
+ "nr_page_table_pages",
+ "nr_kernel_stack",
++ "nr_overhead",
+ "nr_unstable",
+ "nr_bounce",
+ "nr_vmscan_write",
+--- a/security/Kconfig
++++ b/security/Kconfig
+@@ -96,6 +96,16 @@ config SECURITY
+
+ If you are unsure how to answer this question, answer N.
+
++config KAISER
++ bool "Remove the kernel mapping in user mode"
++ default y
++ depends on X86_64 && SMP
++ help
++ This enforces a strict kernel and user space isolation, in order
++ to close hardware side channels on kernel address information.
++
++ If you are unsure how to answer this question, answer Y.
++
+ config SECURITYFS
+ bool "Enable the securityfs filesystem"
+ help
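
The addressing trick used throughout the KAISER patch above: each process
PGD is an order-1, 8k-aligned allocation, with the kernel half at offset 0
and the shadow (user) half at offset 4k, so the shadow entry is reached by
setting bit 12 of the kernel PGD pointer. A minimal standalone sketch of
that arithmetic (illustrative values and names, not kernel code):

#include <stdio.h>

#define PAGE_SIZE 4096UL

/* kernel half at offset 0 of the 8k PGD allocation, shadow half at +4k */
static unsigned long shadow_pgd(unsigned long kernel_pgd)
{
	/* the kernel pointer is 8k-aligned, so bit 12 is clear; set it */
	return kernel_pgd | PAGE_SIZE;
}

int main(void)
{
	unsigned long pgd = 0x12340000UL;	/* pretend 8k-aligned PGD */

	printf("kernel pgd %#lx -> shadow pgd %#lx\n", pgd, shadow_pgd(pgd));
	return 0;
}
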
diff --git a/patches.suse/kaiser-0019-x86-mm-kaiser-re-enable-vsyscalls.patch b/patches.suse/kaiser-0019-x86-mm-kaiser-re-enable-vsyscalls.patch
new file mode 100644
index 0000000000..90b60d53c5
--- /dev/null
+++ b/patches.suse/kaiser-0019-x86-mm-kaiser-re-enable-vsyscalls.patch
@@ -0,0 +1,130 @@
+From ecd5a23ba4031f4cd1c9225d30ae5d210d65fc1b Mon Sep 17 00:00:00 2001
+From: Andrea Arcangeli <aarcange@redhat.com>
+Date: Tue, 5 Dec 2017 21:15:07 +0100
+Subject: [PATCH 19/19] x86/mm/kaiser: re-enable vsyscalls
+References: bsc#1068032 CVE-2017-5754
+Patch-mainline: No, backport-specific
+
+To avoid breaking the kernel ABI.
+
+Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
+
+hughd adjusted it to the 3.2.96 tree (leaving out the PVCLOCK_FIXMAP
+user mapping, which does not apply to this tree); and for safety
+added vsyscall_pgprot, and a BUG_ON if _PAGE_USER is used outside of FIXMAP.
+
+Signed-off-by: Jiri Kosina <jkosina@suse.cz>
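+
+The safety check is simple: anything requested with _PAGE_USER must fall
+entirely inside the fixmap window, which covers the vsyscall and VVAR
+pages. A sketch of that rule, with the names and boundary values made up
+for illustration (the real FIXADDR_START/FIXADDR_TOP come from
+asm/fixmap.h):
+
+#include <assert.h>
+
+/* Boundaries assumed for illustration only */
+#define FIXADDR_START	0xffffffffff500000UL
+#define FIXADDR_TOP	0xffffffffff600000UL
+#define _PAGE_USER	0x004UL
+
+static void check_user_mapping(unsigned long start, unsigned long end,
+			       unsigned long flags)
+{
+	/* only fixmap addresses may be mapped user-visible into the
+	 * shadow tables */
+	if (flags & _PAGE_USER)
+		assert(start >= FIXADDR_START && end < FIXADDR_TOP);
+}
+
+int main(void)
+{
+	check_user_mapping(FIXADDR_START, FIXADDR_START + 4096, _PAGE_USER);
+	return 0;
+}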
+---
+ arch/x86/include/asm/vsyscall.h | 1 +
+ arch/x86/kernel/hpet.c | 3 +++
+ arch/x86/kernel/vsyscall_64.c | 4 +++-
+ arch/x86/mm/kaiser.c | 14 +++++++++++---
+ 4 files changed, 18 insertions(+), 4 deletions(-)
+
+--- a/arch/x86/include/asm/vsyscall.h
++++ b/arch/x86/include/asm/vsyscall.h
+@@ -26,6 +26,7 @@ enum vsyscall_num {
+ /* kernel space (writeable) */
+ extern int vgetcpu_mode;
+ extern struct timezone sys_tz;
++extern unsigned long vsyscall_pgprot;
+
+ #include <asm/vvar.h>
+
+--- a/arch/x86/kernel/hpet.c
++++ b/arch/x86/kernel/hpet.c
+@@ -10,6 +10,7 @@
+ #include <linux/cpu.h>
+ #include <linux/pm.h>
+ #include <linux/io.h>
++#include <linux/kaiser.h>
+
+ #include <asm/fixmap.h>
+ #include <asm/i8253.h>
+@@ -100,6 +101,8 @@ static inline void hpet_set_mapping(void
+ hpet_virt_address = ioremap_nocache(hpet_address, HPET_MMAP_SIZE);
+ #ifdef CONFIG_X86_64
+ __set_fixmap(VSYSCALL_HPET, hpet_address, PAGE_KERNEL_VVAR_NOCACHE);
++ kaiser_add_mapping(__fix_to_virt(VSYSCALL_HPET), PAGE_SIZE,
++ __PAGE_KERNEL_VVAR_NOCACHE);
+ #endif
+ }
+
+--- a/arch/x86/kernel/vsyscall_64.c
++++ b/arch/x86/kernel/vsyscall_64.c
+@@ -49,6 +49,8 @@
+ __attribute__ ((unused, __section__(".vsyscall_" #nr))) notrace
+ #define __syscall_clobber "r11","cx","memory"
+
++unsigned long vsyscall_pgprot = __PAGE_KERNEL_VSYSCALL;
++
+ DEFINE_VVAR(int, vgetcpu_mode);
+ DEFINE_VVAR(struct vsyscall_gtod_data, vsyscall_gtod_data) =
+ {
+@@ -288,7 +290,7 @@ void __init map_vsyscall(void)
+ unsigned long physaddr_vvar_page = __pa_symbol(&__vvar_page);
+
+ /* Note that VSYSCALL_MAPPED_PAGES must agree with the code below. */
+- __set_fixmap(VSYSCALL_FIRST_PAGE, physaddr_page0, PAGE_KERNEL_VSYSCALL);
++ __set_fixmap(VSYSCALL_FIRST_PAGE, physaddr_page0, __pgprot(vsyscall_pgprot));
+ __set_fixmap(VVAR_PAGE, physaddr_vvar_page, PAGE_KERNEL_VVAR);
+ BUILD_BUG_ON((unsigned long)__fix_to_virt(VVAR_PAGE) !=
+ (unsigned long)VVAR_ADDRESS);
+--- a/arch/x86/mm/kaiser.c
++++ b/arch/x86/mm/kaiser.c
+@@ -15,6 +15,7 @@ extern struct mm_struct init_mm;
+
+ #include <asm/kaiser.h>
+ #include <asm/tlbflush.h> /* to verify its kaiser declarations */
++#include <asm/vsyscall.h>
+ #include <asm/pgtable.h>
+ #include <asm/pgalloc.h>
+ #include <asm/desc.h>
+@@ -135,7 +136,7 @@ static pte_t *kaiser_pagetable_walk(unsi
+ return NULL;
+ spin_lock(&shadow_table_allocation_lock);
+ if (pud_none(*pud)) {
+- set_pud(pud, __pud(_KERNPG_TABLE | __pa(new_pmd_page)));
++ set_pud(pud, __pud(_PAGE_TABLE | __pa(new_pmd_page)));
+ __inc_zone_page_state(virt_to_page((void *)
+ new_pmd_page), NR_KAISERTABLE);
+ } else
+@@ -155,7 +156,7 @@ static pte_t *kaiser_pagetable_walk(unsi
+ return NULL;
+ spin_lock(&shadow_table_allocation_lock);
+ if (pmd_none(*pmd)) {
+- set_pmd(pmd, __pmd(_KERNPG_TABLE | __pa(new_pte_page)));
++ set_pmd(pmd, __pmd(_PAGE_TABLE | __pa(new_pte_page)));
+ __inc_zone_page_state(virt_to_page((void *)
+ new_pte_page), NR_KAISERTABLE);
+ } else
+@@ -176,6 +177,9 @@ int kaiser_add_user_map(const void *__st
+ unsigned long end_addr = PAGE_ALIGN(start_addr + size);
+ unsigned long target_address;
+
++ if (flags & _PAGE_USER)
++ BUG_ON(address < FIXADDR_START || end_addr >= FIXADDR_TOP);
++
+ for (; address < end_addr; address += PAGE_SIZE) {
+ target_address = get_pa_from_mapping(address);
+ if (target_address == -1) {
+@@ -229,7 +233,7 @@ static void __init kaiser_init_all_pgds(
+ break;
+ }
+ inc_zone_page_state(virt_to_page(pud), NR_KAISERTABLE);
+- new_pgd = __pgd(_KERNPG_TABLE |__pa(pud));
++ new_pgd = __pgd(_PAGE_TABLE |__pa(pud));
+ /*
+ * Make sure not to stomp on some other pgd entry.
+ */
+@@ -288,6 +292,10 @@ void __init kaiser_init(void)
+ kaiser_add_user_map_early((void *)idt_descr.address,
+ sizeof(gate_desc) * NR_VECTORS,
+ __PAGE_KERNEL_RO);
++ kaiser_add_user_map_early((void *)VVAR_ADDRESS, PAGE_SIZE,
++ __PAGE_KERNEL_VVAR);
++ kaiser_add_user_map_early((void *)VSYSCALL_START, PAGE_SIZE,
++ vsyscall_pgprot);
+ kaiser_add_user_map_early(&x86_cr3_pcid_noflush,
+ sizeof(x86_cr3_pcid_noflush),
+ __PAGE_KERNEL);
diff --git a/patches.suse/kaiser-0020-fix-ldt-freeing.patch b/patches.suse/kaiser-0020-fix-ldt-freeing.patch
new file mode 100644
index 0000000000..7f63c88b28
--- /dev/null
+++ b/patches.suse/kaiser-0020-fix-ldt-freeing.patch
@@ -0,0 +1,42 @@
+From: Jiri Kosina <jkosina@suse.cz>
+Subject: KAISER: fix ldt freeing
+References: bsc#1068032 CVE-2017-5754
+Patch-mainline: No, backport-specific
+
+3.2.96 version of 0018-KAISER-Kernel-Address-Isolation.patch changed
+the hunk that frees ldt while moving it over to __free_ldt_struct(), so
+that the ldt->entries freeing mechanism is changed from kfree() to free_page(),
+which leads to
+
+ BUG: Bad page state in process ldt_gdt_32 pfn:803852
+ page:ffffea001c0c51f0 count:0 mapcount:0 mapping: (null) index:0x0
+ page flags: 0x20000000000100(slab)
+ Pid: 7815, comm: ldt_gdt_32 Tainted: G B E X 3.0.101-0-default #1
+ Call Trace:
+ [<ffffffff81004b35>] dump_trace+0x75/0x300
+ [<ffffffff814690d3>] dump_stack+0x69/0x6f
+ [<ffffffff811026d1>] bad_page+0xb1/0x120
+ [<ffffffff81102c36>] free_pages_prepare+0xe6/0x110
+ [<ffffffff81106679>] free_hot_cold_page+0x49/0x1f0
+ [<ffffffff8100513b>] __free_ldt_struct+0x1b/0x40
+ [<ffffffff810053f2>] write_ldt+0x172/0x270
+ [<ffffffff81475956>] sysenter_dispatch+0x7/0x32
+
+Bring back the original behavior.
+
+Signed-off-by: Jiri Kosina <jkosina@suse.cz>
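+
+After this fix the free path mirrors the allocation again: entries larger
+than a page were vmalloc()ed and go back through vfree(), smaller ones were
+kmalloc()ed and must be kfree()d, never handed to free_page(). For
+reference, a sketch of the net result on top of patch 0018 (names and
+layout as in this tree; not an additional hunk):
+
+static void __free_ldt_struct(struct ldt_struct *ldt)
+{
+	if (ldt->size * LDT_ENTRY_SIZE > PAGE_SIZE)
+		vfree(ldt->entries);		/* was vmalloc()ed */
+	else
+		kfree(ldt->entries);		/* was kmalloc()ed, not a raw page */
+	kfree(ldt);
+}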
+---
+ arch/x86/kernel/ldt.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/arch/x86/kernel/ldt.c
++++ b/arch/x86/kernel/ldt.c
+@@ -40,7 +40,7 @@ static void __free_ldt_struct(struct ldt
+ if (ldt->size * LDT_ENTRY_SIZE > PAGE_SIZE)
+ vfree(ldt->entries);
+ else
+- free_page((unsigned long)ldt->entries);
++ kfree(ldt->entries);
+ kfree(ldt);
+ }
+
diff --git a/patches.suse/kaiser-0021-disable-vmstat.patch b/patches.suse/kaiser-0021-disable-vmstat.patch
new file mode 100644
index 0000000000..33b7f308c9
--- /dev/null
+++ b/patches.suse/kaiser-0021-disable-vmstat.patch
@@ -0,0 +1,75 @@
+From: Jiri Kosina <jkosina@suse.cz>
+Subject: x86/kaiser: disable vmstat accounting
+References: bsc#1068032 CVE-2017-5754
+Patch-mainline: No, backport-specific
+
+This is a debugging facility, and an unnecessary kABI hazard.
+
+Signed-off-by: Jiri Kosina <jkosina@suse.cz>
+---
+ arch/x86/mm/kaiser.c | 13 ++++---------
+ include/linux/mmzone.h | 3 +--
+ mm/vmstat.c | 1 -
+ 3 files changed, 5 insertions(+), 12 deletions(-)
+
+--- a/arch/x86/mm/kaiser.c
++++ b/arch/x86/mm/kaiser.c
+@@ -135,11 +135,9 @@ static pte_t *kaiser_pagetable_walk(unsi
+ if (!new_pmd_page)
+ return NULL;
+ spin_lock(&shadow_table_allocation_lock);
+- if (pud_none(*pud)) {
++ if (pud_none(*pud))
+ set_pud(pud, __pud(_PAGE_TABLE | __pa(new_pmd_page)));
+- __inc_zone_page_state(virt_to_page((void *)
+- new_pmd_page), NR_KAISERTABLE);
+- } else
++ else
+ free_page(new_pmd_page);
+ spin_unlock(&shadow_table_allocation_lock);
+ }
+@@ -155,11 +153,9 @@ static pte_t *kaiser_pagetable_walk(unsi
+ if (!new_pte_page)
+ return NULL;
+ spin_lock(&shadow_table_allocation_lock);
+- if (pmd_none(*pmd)) {
++ if (pmd_none(*pmd))
+ set_pmd(pmd, __pmd(_PAGE_TABLE | __pa(new_pte_page)));
+- __inc_zone_page_state(virt_to_page((void *)
+- new_pte_page), NR_KAISERTABLE);
+- } else
++ else
+ free_page(new_pte_page);
+ spin_unlock(&shadow_table_allocation_lock);
+ }
+@@ -232,7 +228,6 @@ static void __init kaiser_init_all_pgds(
+ WARN_ON(1);
+ break;
+ }
+- inc_zone_page_state(virt_to_page(pud), NR_KAISERTABLE);
+ new_pgd = __pgd(_PAGE_TABLE |__pa(pud));
+ /*
+ * Make sure not to stomp on some other pgd entry.
+--- a/include/linux/mmzone.h
++++ b/include/linux/mmzone.h
+@@ -95,9 +95,8 @@ enum zone_stat_item {
+ NR_SLAB_RECLAIMABLE,
+ NR_SLAB_UNRECLAIMABLE,
+ NR_PAGETABLE, /* used for pagetables */
+- /* Second 128 byte cacheline */
+ NR_KERNEL_STACK,
+- NR_KAISERTABLE,
++ /* Second 128 byte cacheline */
+ NR_UNSTABLE_NFS, /* NFS unstable pages */
+ NR_BOUNCE,
+ NR_VMSCAN_WRITE,
+--- a/mm/vmstat.c
++++ b/mm/vmstat.c
+@@ -699,7 +699,6 @@ const char * const vmstat_text[] = {
+ "nr_slab_unreclaimable",
+ "nr_page_table_pages",
+ "nr_kernel_stack",
+- "nr_overhead",
+ "nr_unstable",
+ "nr_bounce",
+ "nr_vmscan_write",
diff --git a/patches.suse/kaiser-nokaiser-0005-kaiser-add-nokaiser-boot-option-using-ALTERNATIVE.patch b/patches.suse/kaiser-nokaiser-0005-kaiser-add-nokaiser-boot-option-using-ALTERNATIVE.patch
new file mode 100644
index 0000000000..0bf3b65192
--- /dev/null
+++ b/patches.suse/kaiser-nokaiser-0005-kaiser-add-nokaiser-boot-option-using-ALTERNATIVE.patch
@@ -0,0 +1,655 @@
+From 70eba0679e1004bb544b5d780f1c6233c60da2f2 Mon Sep 17 00:00:00 2001
+From: Hugh Dickins <hughd@google.com>
+Date: Sun, 24 Sep 2017 16:59:49 -0700
+Subject: [PATCH 05/14] kaiser: add "nokaiser" boot option, using ALTERNATIVE
+References: bsc#1068032 CVE-2017-5754
+Patch-mainline: No, backport specific + PTI under development
+
+Added "nokaiser" boot option: an early param like "noinvpcid".
+Most places now check int kaiser_enabled (#defined 0 when not
+CONFIG_KAISER) instead of #ifdef CONFIG_KAISER; but entry_64.S
+and entry_64_compat.S are using the ALTERNATIVE technique, which
+patches in the preferred instructions at runtime. That technique
+is tied to x86 cpu features, so X86_FEATURE_KAISER fabricated
+("" in its comment so "kaiser" not magicked into /proc/cpuinfo).
+
+Prior to "nokaiser", Kaiser #defined _PAGE_GLOBAL 0: revert that,
+but be careful with both _PAGE_GLOBAL and CR4.PGE: setting them when
+nokaiser like when !CONFIG_KAISER, but not setting either when kaiser -
+neither matters on its own, but it's hard to be sure that _PAGE_GLOBAL
+won't get set in some obscure corner, or something add PGE into CR4.
+By omitting _PAGE_GLOBAL from __supported_pte_mask when kaiser_enabled,
+all page table setup which uses pte_pfn() masks it out of the ptes.
+
+It's slightly shameful that the same declaration versus definition of
+kaiser_enabled appears in not one, not two, but in three header files
+(asm/kaiser.h, asm/pgtable.h, asm/tlbflush.h). I felt safer that way,
+than with #including any of those in any of the others; and did not
+feel it worth an asm/kaiser_enabled.h - kernel/cpu/common.c includes
+them all, so we shall hear about it if they get out of synch.
+
+Cleanups while in the area: removed the silly #ifdef CONFIG_KAISER
+from kaiser.c; removed the unused native_get_normal_pgd(); removed
+the spurious reg clutter from SWITCH_*_CR3 macro stubs; corrected some
+comments. But more interestingly, set CR4.PSE in secondary_startup_64:
+the manual is clear that it does not matter whether it's 0 or 1 when
+4-level-pts are enabled, but I was distracted to find cr4 different on
+BSP and auxiliaries - BSP alone was adding PSE, in init_memory_mapping().
+
+(cherry picked from Change-Id: I8e5bec716944444359cbd19f6729311eff943e9a)
+
+Signed-off-by: Jiri Kosina <jkosina@suse.cz>
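+
+The boot option follows the usual early_param pattern: it is parsed before
+alternatives are applied, clears kaiser_enabled and the synthetic feature
+bit, and every ALTERNATIVE site then keeps its original "jmp" and skips the
+CR3 switching. A commented sketch of the handler added below (all names as
+in the hunk for arch/x86/kernel/cpu/common.c):
+
+static int __init x86_nokaiser_setup(char *s)
+{
+	if (s)				/* "nokaiser" takes no argument */
+		return -EINVAL;
+#ifdef CONFIG_KAISER
+	kaiser_enabled = 0;		/* C code tests this instead of #ifdef */
+	/* ALTERNATIVE sites keep their "jmp" and skip the CR3 switches */
+	setup_clear_cpu_cap(X86_FEATURE_KAISER);
+	pr_info("nokaiser: KAISER feature disabled\n");
+#endif
+	return 0;
+}
+early_param("nokaiser", x86_nokaiser_setup);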
+---
+ Documentation/kernel-parameters.txt | 2 +
+ arch/x86/ia32/ia32entry.S | 2 +
+ arch/x86/include/asm/alternative-asm.h | 35 +++++++++++++++++++++++++++++++++
+ arch/x86/include/asm/cpufeature.h | 3 ++
+ arch/x86/include/asm/kaiser.h | 22 ++++++++++++++++----
+ arch/x86/include/asm/pgtable.h | 22 +++++++++++++-------
+ arch/x86/include/asm/pgtable_64.h | 13 +++---------
+ arch/x86/include/asm/pgtable_types.h | 4 ---
+ arch/x86/include/asm/tlbflush.h | 35 +++++++++++++++++++--------------
+ arch/x86/kernel/cpu/common.c | 29 ++++++++++++++++++++++++++-
+ arch/x86/kernel/entry_64.S | 7 +++++-
+ arch/x86/kernel/espfix_64.c | 3 --
+ arch/x86/kernel/head_64.S | 4 +--
+ arch/x86/mm/init.c | 2 -
+ arch/x86/mm/init_64.c | 10 +++++++++
+ arch/x86/mm/kaiser.c | 26 ++++++++++++++++++++----
+ arch/x86/mm/pgtable.c | 8 +------
+ arch/x86/mm/tlb.c | 4 ---
+ 18 files changed, 172 insertions(+), 59 deletions(-)
+
+--- a/arch/x86/ia32/ia32entry.S
++++ b/arch/x86/ia32/ia32entry.S
+@@ -13,6 +13,8 @@
+ #include <asm/thread_info.h>
+ #include <asm/segment.h>
+ #include <asm/pgtable_types.h>
++#include <asm/alternative-asm.h>
++#include <asm/cpufeature.h>
+ #include <asm/kaiser.h>
+ #include <asm/irqflags.h>
+ #include <linux/linkage.h>
+--- a/arch/x86/include/asm/alternative-asm.h
++++ b/arch/x86/include/asm/alternative-asm.h
+@@ -24,4 +24,39 @@
+ .byte \alt_len
+ .endm
+
++.macro ALTERNATIVE oldinstr, newinstr, feature
++140:
++ \oldinstr
++141:
++
++ .pushsection .altinstructions,"a"
++ altinstruction_entry 140b,143f,\feature,141b-140b,144f-143f
++ .popsection
++
++ .pushsection .altinstr_replacement,"ax"
++143:
++ \newinstr
++144:
++ .popsection
++.endm
++
++.macro ALTERNATIVE_2 oldinstr, newinstr1, feature1, newinstr2, feature2
++140:
++ \oldinstr
++141:
++
++ .pushsection .altinstructions,"a"
++ altinstruction_entry 140b,143f,\feature1,141b-140b,144f-143f
++ altinstruction_entry 140b,144f,\feature2,141b-140b,145f-144f
++ .popsection
++
++ .pushsection .altinstr_replacement,"ax"
++143:
++ \newinstr1
++144:
++ \newinstr2
++145:
++ .popsection
++.endm
++
+ #endif /* __ASSEMBLY__ */
+--- a/arch/x86/include/asm/cpufeature.h
++++ b/arch/x86/include/asm/cpufeature.h
+@@ -179,6 +179,9 @@
+ #define X86_FEATURE_DTHERM (7*32+ 7) /* Digital Thermal Sensor */
+ #define X86_FEATURE_INVPCID_SINGLE (7*32+ 8) /* Effectively INVPCID && CR4.PCIDE=1 */
+
++/* Because the ALTERNATIVE scheme is for members of the X86_FEATURE club... */
++#define X86_FEATURE_KAISER ( 7*32+31) /* CONFIG_KAISER w/o nokaiser */
++
+ /* Virtualization flags: Linux defined, word 8 */
+ #define X86_FEATURE_TPR_SHADOW (8*32+ 0) /* Intel TPR Shadow */
+ #define X86_FEATURE_VNMI (8*32+ 1) /* Intel Virtual NMI */
+--- a/arch/x86/include/asm/kaiser.h
++++ b/arch/x86/include/asm/kaiser.h
+@@ -46,28 +46,34 @@ movq \reg, %cr3
+ .endm
+
+ .macro SWITCH_KERNEL_CR3
++ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_KAISER
+ pushq %rax
+ _SWITCH_TO_KERNEL_CR3 %rax
+ popq %rax
++.Lend_\@:
+ .endm
+
+ .macro SWITCH_USER_CR3
++ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_KAISER
+ pushq %rax
+ _SWITCH_TO_USER_CR3 %rax %al
+ popq %rax
++.Lend_\@:
+ .endm
+
+ .macro SWITCH_KERNEL_CR3_NO_STACK
++ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_KAISER
+ movq %rax, PER_CPU_VAR(unsafe_stack_register_backup)
+ _SWITCH_TO_KERNEL_CR3 %rax
+ movq PER_CPU_VAR(unsafe_stack_register_backup), %rax
++.Lend_\@:
+ .endm
+
+ #else /* CONFIG_KAISER */
+
+-.macro SWITCH_KERNEL_CR3 reg
++.macro SWITCH_KERNEL_CR3
+ .endm
+-.macro SWITCH_USER_CR3 reg regb
++.macro SWITCH_USER_CR3
+ .endm
+ .macro SWITCH_KERNEL_CR3_NO_STACK
+ .endm
+@@ -90,6 +96,16 @@ DECLARE_PER_CPU(unsigned long, x86_cr3_p
+
+ extern char __per_cpu_user_mapped_start[], __per_cpu_user_mapped_end[];
+
++extern int kaiser_enabled;
++#else
++#define kaiser_enabled 0
++#endif /* CONFIG_KAISER */
++
++/*
++ * Kaiser function prototypes are needed even when CONFIG_KAISER is not set,
++ * so as to build with tests on kaiser_enabled instead of #ifdefs.
++ */
++
+ /**
+ * kaiser_add_mapping - map a virtual memory part to the shadow (user) mapping
+ * @addr: the start address of the range
+@@ -119,8 +135,6 @@ extern void kaiser_remove_mapping(unsign
+ */
+ extern void kaiser_init(void);
+
+-#endif /* CONFIG_KAISER */
+-
+ #endif /* __ASSEMBLY */
+
+ #endif /* _ASM_X86_KAISER_H */
+--- a/arch/x86/include/asm/pgtable_64.h
++++ b/arch/x86/include/asm/pgtable_64.h
+@@ -110,13 +110,12 @@ extern pgd_t kaiser_set_shadow_pgd(pgd_t
+
+ static inline pgd_t *native_get_shadow_pgd(pgd_t *pgdp)
+ {
++#ifdef CONFIG_DEBUG_VM
++ /* linux/mmdebug.h may not have been included at this point */
++ BUG_ON(!kaiser_enabled);
++#endif
+ return (pgd_t *)((unsigned long)pgdp | (unsigned long)PAGE_SIZE);
+ }
+-
+-static inline pgd_t *native_get_normal_pgd(pgd_t *pgdp)
+-{
+- return (pgd_t *)((unsigned long)pgdp & ~(unsigned long)PAGE_SIZE);
+-}
+ #else
+ static inline pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd)
+ {
+@@ -126,10 +125,6 @@ static inline pgd_t *native_get_shadow_p
+ {
+ return NULL;
+ }
+-static inline pgd_t *native_get_normal_pgd(pgd_t *pgdp)
+-{
+- return pgdp;
+-}
+ #endif /* CONFIG_KAISER */
+
+ static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd)
+--- a/arch/x86/include/asm/pgtable.h
++++ b/arch/x86/include/asm/pgtable.h
+@@ -17,6 +17,11 @@
+ #ifndef __ASSEMBLY__
+
+ #include <asm/x86_init.h>
++#ifdef CONFIG_KAISER
++extern int kaiser_enabled;
++#else
++#define kaiser_enabled 0
++#endif
+
+ /*
+ * ZERO_PAGE is a global shared page that is always zero: used
+@@ -578,9 +583,8 @@ static inline int pgd_bad(pgd_t pgd)
+ * page table by accident; it will fault on the first
+ * instruction it tries to run. See native_set_pgd().
+ */
+-#ifdef CONFIG_KAISER
+- ignore_flags |= _PAGE_NX;
+-#endif
++ if (kaiser_enabled)
++ ignore_flags |= _PAGE_NX;
+
+ return (pgd_flags(pgd) & ~ignore_flags) != _KERNPG_TABLE;
+ }
+@@ -782,12 +786,14 @@ static inline void pmdp_set_wrprotect(st
+ */
+ static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count)
+ {
+- memcpy(dst, src, count * sizeof(pgd_t));
++ memcpy(dst, src, count * sizeof(pgd_t));
+ #ifdef CONFIG_KAISER
+- /* Clone the shadow pgd part as well */
+- memcpy(native_get_shadow_pgd(dst),
+- native_get_shadow_pgd(src),
+- count * sizeof(pgd_t));
++ if (kaiser_enabled) {
++ /* Clone the shadow pgd part as well */
++ memcpy(native_get_shadow_pgd(dst),
++ native_get_shadow_pgd(src),
++ count * sizeof(pgd_t));
++ }
+ #endif
+ }
+
+--- a/arch/x86/include/asm/pgtable_types.h
++++ b/arch/x86/include/asm/pgtable_types.h
+@@ -39,11 +39,7 @@
+ #define _PAGE_ACCESSED (_AT(pteval_t, 1) << _PAGE_BIT_ACCESSED)
+ #define _PAGE_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_DIRTY)
+ #define _PAGE_PSE (_AT(pteval_t, 1) << _PAGE_BIT_PSE)
+-#ifdef CONFIG_KAISER
+-#define _PAGE_GLOBAL (_AT(pteval_t, 0))
+-#else
+ #define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL)
+-#endif
+ #define _PAGE_UNUSED1 (_AT(pteval_t, 1) << _PAGE_BIT_UNUSED1)
+ #define _PAGE_IOMAP (_AT(pteval_t, 1) << _PAGE_BIT_IOMAP)
+ #define _PAGE_PAT (_AT(pteval_t, 1) << _PAGE_BIT_PAT)
+--- a/arch/x86/include/asm/tlbflush.h
++++ b/arch/x86/include/asm/tlbflush.h
+@@ -69,9 +69,11 @@ static inline void invpcid_flush_all_non
+ * to avoid the need for asm/kaiser.h in unexpected places.
+ */
+ #ifdef CONFIG_KAISER
++extern int kaiser_enabled;
+ extern void kaiser_setup_pcid(void);
+ extern void kaiser_flush_tlb_on_return_to_user(void);
+ #else
++#define kaiser_enabled 0
+ static inline void kaiser_setup_pcid(void)
+ {
+ }
+@@ -90,20 +92,22 @@ static inline void __native_flush_tlb(vo
+ invpcid_flush_all_nonglobals();
+ return;
+ }
+- if (this_cpu_has(X86_FEATURE_PCID))
++ if (kaiser_enabled && this_cpu_has(X86_FEATURE_PCID))
+ kaiser_flush_tlb_on_return_to_user();
+ native_write_cr3(native_read_cr3());
+ }
+
+ static inline void __native_flush_tlb_global(void)
+ {
+-#ifdef CONFIG_KAISER
+- /* Globals are not used at all */
+- __native_flush_tlb();
+-#else
+ unsigned long flags;
+ unsigned long cr4;
+
++ if (kaiser_enabled) {
++ /* Globals are not used at all */
++ __native_flush_tlb();
++ return;
++ }
++
+ if (this_cpu_has(X86_FEATURE_INVPCID)) {
+ /*
+ * Using INVPCID is considerably faster than a pair of writes
+@@ -123,13 +127,16 @@ static inline void __native_flush_tlb_gl
+ raw_local_irq_save(flags);
+
+ cr4 = native_read_cr4();
+- /* clear PGE */
+- native_write_cr4(cr4 & ~X86_CR4_PGE);
+- /* write old PGE again and flush TLBs */
+- native_write_cr4(cr4);
++ if (cr4 & X86_CR4_PGE) {
++ /* clear PGE and flush TLB of all entries */
++ native_write_cr4(cr4 & ~X86_CR4_PGE);
++ /* restore PGE as it was before */
++ native_write_cr4(cr4);
++ } else {
++ native_write_cr3(native_read_cr3());
++ }
+
+ raw_local_irq_restore(flags);
+-#endif
+ }
+
+ static inline void __native_flush_tlb_single(unsigned long addr)
+@@ -144,7 +151,7 @@ static inline void __native_flush_tlb_si
+ */
+
+ if (!this_cpu_has(X86_FEATURE_INVPCID_SINGLE)) {
+- if (this_cpu_has(X86_FEATURE_PCID))
++ if (kaiser_enabled && this_cpu_has(X86_FEATURE_PCID))
+ kaiser_flush_tlb_on_return_to_user();
+ asm volatile("invlpg (%0)" ::"r" (addr) : "memory");
+ return;
+@@ -159,9 +166,9 @@ static inline void __native_flush_tlb_si
+ * Make sure to do only a single invpcid when KAISER is
+ * disabled and we have only a single ASID.
+ */
+- if (X86_CR3_PCID_ASID_KERN != X86_CR3_PCID_ASID_USER)
+- invpcid_flush_one(X86_CR3_PCID_ASID_KERN, addr);
+- invpcid_flush_one(X86_CR3_PCID_ASID_USER, addr);
++ if (kaiser_enabled)
++ invpcid_flush_one(X86_CR3_PCID_ASID_USER, addr);
++ invpcid_flush_one(X86_CR3_PCID_ASID_KERN, addr);
+ }
+
+ static inline void __flush_tlb_all(void)
+--- a/arch/x86/kernel/cpu/common.c
++++ b/arch/x86/kernel/cpu/common.c
+@@ -171,6 +171,20 @@ static int __init x86_pcid_setup(char *s
+ return 1;
+ }
+ __setup("nopcid", x86_pcid_setup);
++
++static int __init x86_nokaiser_setup(char *s)
++{
++ /* nokaiser doesn't accept parameters */
++ if (s)
++ return -EINVAL;
++#ifdef CONFIG_KAISER
++ kaiser_enabled = 0;
++ setup_clear_cpu_cap(X86_FEATURE_KAISER);
++ pr_info("nokaiser: KAISER feature disabled\n");
++#endif
++ return 0;
++}
++early_param("nokaiser", x86_nokaiser_setup);
+ #endif
+
+ static int __init x86_noinvpcid_setup(char *s)
+@@ -314,7 +328,7 @@ static void setup_pcid(struct cpuinfo_x8
+ {
+ if (cpu_has(c, X86_FEATURE_PCID)) {
+ #ifdef CONFIG_X86_64
+- if (cpu_has(c, X86_FEATURE_PGE)) {
++ if (cpu_has(c, X86_FEATURE_PGE) || kaiser_enabled) {
+ /*
+ * Regardless of whether PCID is enumerated, the
+ * SDM says that it can't be enabled in 32-bit mode.
+@@ -682,6 +696,10 @@ void __cpuinit get_cpu_cap(struct cpuinf
+ c->x86_power = cpuid_edx(0x80000007);
+
+ init_scattered_cpuid_features(c);
++#ifdef CONFIG_KAISER
++ if (kaiser_enabled)
++ set_cpu_cap(c, X86_FEATURE_KAISER);
++#endif
+ }
+
+ static void __cpuinit identify_cpu_without_cpuid(struct cpuinfo_x86 *c)
+@@ -1231,6 +1249,15 @@ void __cpuinit cpu_init(void)
+ int cpu;
+ int i;
+
++ if (!kaiser_enabled) {
++ /*
++ * secondary_startup_64() deferred setting PGE in cr4:
++ * init_memory_mapping() sets it on the boot cpu,
++ * but it needs to be set on each secondary cpu.
++ */
++ set_in_cr4(X86_CR4_PGE);
++ }
++
+ cpu = stack_smp_processor_id();
+ t = &per_cpu(init_tss, cpu);
+ oist = &per_cpu(orig_ist, cpu);
+--- a/arch/x86/kernel/entry_64.S
++++ b/arch/x86/kernel/entry_64.S
+@@ -55,6 +55,8 @@
+ #include <asm/percpu.h>
+ #include <asm/asm.h>
+ #include <asm/pgtable_types.h>
++#include <asm/alternative-asm.h>
++#include <asm/cpufeature.h>
+ #include <asm/kaiser.h>
+
+ /* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
+@@ -408,7 +410,8 @@ ENTRY(save_paranoid)
+ * unconditionally, but we need to find out whether the reverse
+ * should be done on return (conveyed to paranoid_exit in %ebx).
+ */
+- movq %cr3, %rax
++ ALTERNATIVE "jmp 2f", "", X86_FEATURE_KAISER
++ movq %cr3, %rax
+ testl $KAISER_SHADOW_PGD_OFFSET, %eax
+ jz 2f
+ orl $2, %ebx
+@@ -1509,6 +1512,7 @@ paranoid_kernel:
+ movq %r12, %rbx /* restore after paranoid_userspace */
+ TRACE_IRQS_IRETQ 0
+ #ifdef CONFIG_KAISER
++ /* No ALTERNATIVE for X86_FEATURE_KAISER: save_paranoid sets %ebx */
+ testl $2, %ebx /* SWITCH_USER_CR3 needed? */
+ jz paranoid_exit_no_switch
+ SWITCH_USER_CR3
+@@ -1929,6 +1933,7 @@ restart_nmi:
+ 1:
+
+ #ifdef CONFIG_KAISER
++ /* No ALTERNATIVE for X86_FEATURE_KAISER: save_paranoid sets %ebx */
+ testl $2, %ebx /* SWITCH_USER_CR3 needed? */
+ jz nmi_swapgs
+ SWITCH_USER_CR3
+--- a/arch/x86/kernel/espfix_64.c
++++ b/arch/x86/kernel/espfix_64.c
+@@ -134,10 +134,9 @@ void __init init_espfix_bsp(void)
+ * area to ensure it is mapped into the shadow user page
+ * tables.
+ */
+-#ifdef CONFIG_KAISER
++ if (kaiser_enabled)
+ set_pgd(native_get_shadow_pgd(pgd_p),
+ __pgd(_KERNPG_TABLE | __pa((pud_t *)espfix_pud_page)));
+-#endif
+ /* Randomize the locations */
+ init_espfix_random();
+
+--- a/arch/x86/kernel/head_64.S
++++ b/arch/x86/kernel/head_64.S
+@@ -160,8 +160,8 @@ ENTRY(secondary_startup_64)
+ * after the boot processor executes this code.
+ */
+
+- /* Enable PAE mode and PGE */
+- movl $(X86_CR4_PAE | X86_CR4_PGE), %eax
++ /* Enable PAE and PSE, but defer PGE until kaiser_enabled is decided */
++ movl $(X86_CR4_PAE | X86_CR4_PSE), %eax
+ movq %rax, %cr4
+
+ /* Setup early boot stage 4 level pagetables. */
+--- a/arch/x86/mm/init_64.c
++++ b/arch/x86/mm/init_64.c
+@@ -312,6 +312,16 @@ void __init cleanup_highmap(void)
+ continue;
+ if (vaddr < (unsigned long) _text || vaddr > end)
+ set_pmd(pmd, __pmd(0));
++ else if (kaiser_enabled) {
++ /*
++ * level2_kernel_pgt is initialized with _PAGE_GLOBAL:
++ * clear that now. This is not important, so long as
++ * CR4.PGE remains clear, but it removes an anomaly.
++ * Physical mapping setup below avoids _PAGE_GLOBAL
++ * by use of massage_pgprot() inside pfn_pte() etc.
++ */
++ set_pmd(pmd, pmd_clear_flags(*pmd, _PAGE_GLOBAL));
++ }
+ }
+ }
+
+--- a/arch/x86/mm/init.c
++++ b/arch/x86/mm/init.c
+@@ -187,7 +187,7 @@ unsigned long __init_refok init_memory_m
+ set_in_cr4(X86_CR4_PSE);
+
+ /* Enable PGE if available */
+- if (cpu_has_pge) {
++ if (cpu_has_pge && !kaiser_enabled) {
+ set_in_cr4(X86_CR4_PGE);
+ __supported_pte_mask |= _PAGE_GLOBAL;
+ }
+--- a/arch/x86/mm/kaiser.c
++++ b/arch/x86/mm/kaiser.c
+@@ -21,7 +21,9 @@ extern struct mm_struct init_mm;
+ #include <asm/desc.h>
+ #include <asm/asm-offsets.h>
+
+-#ifdef CONFIG_KAISER
++int kaiser_enabled __read_mostly = 1;
++EXPORT_SYMBOL(kaiser_enabled); /* for inlined TLB flush functions */
++
+ DEFINE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup);
+
+ extern char __sched_text_start[], __irqentry_text_start[], __irqentry_text_end[];
+@@ -163,8 +165,8 @@ static pte_t *kaiser_pagetable_walk(unsi
+ return pte_offset_kernel(pmd, address);
+ }
+
+-int kaiser_add_user_map(const void *__start_addr, unsigned long size,
+- unsigned long flags)
++static int kaiser_add_user_map(const void *__start_addr, unsigned long size,
++ unsigned long flags)
+ {
+ int ret = 0;
+ pte_t *pte;
+@@ -173,6 +175,15 @@ int kaiser_add_user_map(const void *__st
+ unsigned long end_addr = PAGE_ALIGN(start_addr + size);
+ unsigned long target_address;
+
++ /*
++ * It is convenient for callers to pass in __PAGE_KERNEL etc,
++ * and there is no actual harm from setting _PAGE_GLOBAL, so
++ * long as CR4.PGE is not set. But it is nonetheless troubling
++ * to see Kaiser itself setting _PAGE_GLOBAL (now that "nokaiser"
++ * requires that not to be #defined to 0): so mask it off here.
++ */
++ flags &= ~_PAGE_GLOBAL;
++
+ if (flags & _PAGE_USER)
+ BUG_ON(address < FIXADDR_START || end_addr >= FIXADDR_TOP);
+
+@@ -261,6 +272,8 @@ void __init kaiser_init(void)
+ {
+ int cpu;
+
++ if (!kaiser_enabled)
++ return;
+ kaiser_init_all_pgds();
+
+ for_each_possible_cpu(cpu) {
+@@ -299,6 +312,8 @@ void __init kaiser_init(void)
+ /* Add a mapping to the shadow mapping, and synchronize the mappings */
+ int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags)
+ {
++ if (!kaiser_enabled)
++ return 0;
+ return kaiser_add_user_map((const void *)addr, size, flags);
+ }
+
+@@ -308,6 +323,8 @@ void kaiser_remove_mapping(unsigned long
+ unsigned long addr;
+ pte_t *pte;
+
++ if (!kaiser_enabled)
++ return;
+ for (addr = start; addr < end; addr += PAGE_SIZE) {
+ pte = kaiser_pagetable_walk(addr);
+ if (pte)
+@@ -329,6 +346,8 @@ static inline bool is_userspace_pgd(pgd_
+
+ pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd)
+ {
++ if (!kaiser_enabled)
++ return pgd;
+ /*
+ * Do we need to also populate the shadow pgd? Check _PAGE_USER to
+ * skip cases like kexec and EFI which make temporary low mappings.
+@@ -385,4 +404,3 @@ void kaiser_flush_tlb_on_return_to_user(
+ X86_CR3_PCID_USER_FLUSH | KAISER_SHADOW_PGD_OFFSET);
+ }
+ EXPORT_SYMBOL(kaiser_flush_tlb_on_return_to_user);
+-#endif /* CONFIG_KAISER */
+--- a/arch/x86/mm/pgtable.c
++++ b/arch/x86/mm/pgtable.c
+@@ -253,16 +253,12 @@ static void pgd_prepopulate_pmd(struct m
+ }
+ }
+
+-#ifdef CONFIG_KAISER
+ /*
+- * Instead of one pmd, we aquire two pmds. Being order-1, it is
++ * Instead of one pgd, Kaiser acquires two pgds. Being order-1, it is
+ * both 8k in size and 8k-aligned. That lets us just flip bit 12
+ * in a pointer to swap between the two 4k halves.
+ */
+-#define PGD_ALLOCATION_ORDER 1
+-#else
+-#define PGD_ALLOCATION_ORDER 0
+-#endif
++#define PGD_ALLOCATION_ORDER kaiser_enabled
+
+ static inline pgd_t *_pgd_alloc(void)
+ {
+--- a/arch/x86/mm/tlb.c
++++ b/arch/x86/mm/tlb.c
+@@ -21,8 +21,7 @@ static void load_new_mm_cr3(pgd_t *pgdir
+ {
+ unsigned long new_mm_cr3 = __pa(pgdir);
+
+-#ifdef CONFIG_KAISER
+- if (this_cpu_has(X86_FEATURE_PCID)) {
++ if (kaiser_enabled && this_cpu_has(X86_FEATURE_PCID)) {
+ /*
+ * We reuse the same PCID for different tasks, so we must
+ * flush all the entries for the PCID out when we change tasks.
+@@ -39,7 +38,6 @@ static void load_new_mm_cr3(pgd_t *pgdir
+ new_mm_cr3 |= X86_CR3_PCID_KERN_FLUSH;
+ kaiser_flush_tlb_on_return_to_user();
+ }
+-#endif /* CONFIG_KAISER */
+
+ /*
+ * Caution: many callers of this function expect
+--- a/Documentation/kernel-parameters.txt
++++ b/Documentation/kernel-parameters.txt
+@@ -1831,6 +1831,8 @@ bytes respectively. Such letter suffixes
+
+ nojitter [IA64] Disables jitter checking for ITC timers.
+
++ nokaiser [X86-64] Disable KAISER isolation of kernel from user.
++
+ no-kvmclock [X86,KVM] Disable paravirtualized KVM clock driver
+
+ no-kvmapf [X86,KVM] Disable paravirtualized asynchronous page
diff --git a/patches.suse/kaiser-nokaiser-0006-kaiser-use-ALTERNATIVE-instead-of-x86_cr3_pcid_noflu.patch b/patches.suse/kaiser-nokaiser-0006-kaiser-use-ALTERNATIVE-instead-of-x86_cr3_pcid_noflu.patch
new file mode 100644
index 0000000000..153dd95010
--- /dev/null
+++ b/patches.suse/kaiser-nokaiser-0006-kaiser-use-ALTERNATIVE-instead-of-x86_cr3_pcid_noflu.patch
@@ -0,0 +1,133 @@
+From 78e37b25da902ecb56124427ed682f3846f7a191 Mon Sep 17 00:00:00 2001
+From: Hugh Dickins <hughd@google.com>
+Date: Tue, 3 Oct 2017 20:49:04 -0700
+Subject: [PATCH 06/14] kaiser: use ALTERNATIVE instead of x86_cr3_pcid_noflush
+References: bsc#1068032 CVE-2017-5754
+Patch-mainline: No, backport specific
+
+Now that we're playing the ALTERNATIVE game, use that more efficient
+method: instead of user-mapping an extra page, and reading an extra
+cacheline each time for x86_cr3_pcid_noflush.
+
+Neel has found that __stringify(bts $X86_CR3_PCID_NOFLUSH_BIT, %rax)
+is a working substitute for the "bts $63, %rax" in these ALTERNATIVEs;
+but the one line with $63 in it looks clearer, so let's stick with that.
+
+Worried about what happens with an ALTERNATIVE between the jump and
+jump label in another ALTERNATIVE? I was, but have checked the
+combinations in SWITCH_KERNEL_CR3_NO_STACK at entry_SYSCALL_64,
+and it does a good job.
+
+(cherry picked from Change-Id: I46d06167615aa8d628eed9972125ab2faca93f05)
+
+Signed-off-by: Jiri Kosina <jkosina@suse.cz>
+---
+ arch/x86/include/asm/kaiser.h | 7 ++++---
+ arch/x86/include/asm/nops.h | 4 ++++
+ arch/x86/kernel/entry_64.S | 3 ++-
+ arch/x86/mm/kaiser.c | 10 +---------
+ 4 files changed, 11 insertions(+), 13 deletions(-)
+
+--- a/arch/x86/include/asm/kaiser.h
++++ b/arch/x86/include/asm/kaiser.h
+@@ -1,6 +1,7 @@
+ #ifndef _ASM_X86_KAISER_H
+ #define _ASM_X86_KAISER_H
+
++#include <asm/nops.h>
+ #include <asm/processor-flags.h> /* For PCID constants */
+
+ /*
+@@ -25,7 +26,8 @@
+ .macro _SWITCH_TO_KERNEL_CR3 reg
+ movq %cr3, \reg
+ andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), \reg
+-orq x86_cr3_pcid_noflush, \reg
++/* If PCID enabled, set X86_CR3_PCID_NOFLUSH_BIT */
++ALTERNATIVE ASM_NOP5, "bts $63, \reg", X86_FEATURE_PCID
+ movq \reg, %cr3
+ .endm
+
+@@ -39,7 +41,7 @@ movq \reg, %cr3
+ movq %cr3, \reg
+ orq PER_CPU_VAR(x86_cr3_pcid_user), \reg
+ js 9f
+-/* FLUSH this time, reset to NOFLUSH for next time (if PCID enabled) */
++/* If PCID enabled, FLUSH this time, reset to NOFLUSH for next time */
+ movb \regb, PER_CPU_VAR(x86_cr3_pcid_user+7)
+ 9:
+ movq \reg, %cr3
+@@ -91,7 +93,6 @@ movq PER_CPU_VAR(unsafe_stack_register_b
+ */
+ DECLARE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup);
+
+-extern unsigned long x86_cr3_pcid_noflush;
+ DECLARE_PER_CPU(unsigned long, x86_cr3_pcid_user);
+
+ extern char __per_cpu_user_mapped_start[], __per_cpu_user_mapped_end[];
+--- a/arch/x86/include/asm/nops.h
++++ b/arch/x86/include/asm/nops.h
+@@ -87,7 +87,11 @@
+ #define P6_NOP8 0x0f,0x1f,0x84,0x00,0,0,0,0
+ #define P6_NOP5_ATOMIC P6_NOP5
+
++#ifdef __ASSEMBLY__
++#define _ASM_MK_NOP(x) __stringify(.byte x)
++#else
+ #define _ASM_MK_NOP(x) ".byte " __stringify(x) "\n"
++#endif
+
+ #if defined(CONFIG_MK7)
+ #define ASM_NOP1 _ASM_MK_NOP(K7_NOP1)
+--- a/arch/x86/kernel/entry_64.S
++++ b/arch/x86/kernel/entry_64.S
+@@ -416,7 +416,8 @@ ENTRY(save_paranoid)
+ jz 2f
+ orl $2, %ebx
+ andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), %rax
+- orq x86_cr3_pcid_noflush, %rax
++ /* If PCID enabled, set X86_CR3_PCID_NOFLUSH_BIT */
++ ALTERNATIVE ASM_NOP5, "bts $63, %rax", X86_FEATURE_PCID
+ movq %rax, %cr3
+ 2:
+ #endif
+--- a/arch/x86/mm/kaiser.c
++++ b/arch/x86/mm/kaiser.c
+@@ -36,7 +36,6 @@ extern char __sched_text_start[];
+ * This is also handy because systems that do not support PCIDs
+ * just end up or'ing a 0 into their CR3, which does no harm.
+ */
+-unsigned long x86_cr3_pcid_noflush __read_mostly;
+ DEFINE_PER_CPU(unsigned long, x86_cr3_pcid_user);
+
+ /*
+@@ -304,9 +303,6 @@ void __init kaiser_init(void)
+ __PAGE_KERNEL_VVAR);
+ kaiser_add_user_map_early((void *)VSYSCALL_START, PAGE_SIZE,
+ vsyscall_pgprot);
+- kaiser_add_user_map_early(&x86_cr3_pcid_noflush,
+- sizeof(x86_cr3_pcid_noflush),
+- __PAGE_KERNEL);
+ }
+
+ /* Add a mapping to the shadow mapping, and synchronize the mappings */
+@@ -378,18 +374,14 @@ pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp,
+
+ void kaiser_setup_pcid(void)
+ {
+- unsigned long kern_cr3 = 0;
+ unsigned long user_cr3 = KAISER_SHADOW_PGD_OFFSET;
+
+- if (this_cpu_has(X86_FEATURE_PCID)) {
+- kern_cr3 |= X86_CR3_PCID_KERN_NOFLUSH;
++ if (this_cpu_has(X86_FEATURE_PCID))
+ user_cr3 |= X86_CR3_PCID_USER_NOFLUSH;
+- }
+ /*
+ * These variables are used by the entry/exit
+ * code to change PCID and pgd and TLB flushing.
+ */
+- x86_cr3_pcid_noflush = kern_cr3;
+ this_cpu_write(x86_cr3_pcid_user, user_cr3);
+ }
+
diff --git a/patches.suse/kaiser-nokaiser-0007-rename-and-simplify-feature-setting.patch b/patches.suse/kaiser-nokaiser-0007-rename-and-simplify-feature-setting.patch
new file mode 100644
index 0000000000..f5ed36242a
--- /dev/null
+++ b/patches.suse/kaiser-nokaiser-0007-rename-and-simplify-feature-setting.patch
@@ -0,0 +1,95 @@
+From: Borislav Petkov <bp@suse.de>
+Subject: x86/kaiser: Rename and simplify X86_FEATURE_KAISER handling
+References: bsc#1068032 CVE-2017-5754
+Patch-mainline: Not yet, under development
+
+Concentrate it in arch/x86/mm/kaiser.c and use the upstream string "nopti".
+
+Signed-off-by: Borislav Petkov <bp@suse.de>
+---
+
+---
+ Documentation/kernel-parameters.txt | 2 +-
+ arch/x86/kernel/cpu/common.c | 18 ------------------
+ arch/x86/mm/kaiser.c | 20 +++++++++++++++++++-
+ 3 files changed, 20 insertions(+), 20 deletions(-)
+
+--- a/arch/x86/kernel/cpu/common.c
++++ b/arch/x86/kernel/cpu/common.c
+@@ -171,20 +171,6 @@ static int __init x86_pcid_setup(char *s
+ return 1;
+ }
+ __setup("nopcid", x86_pcid_setup);
+-
+-static int __init x86_nokaiser_setup(char *s)
+-{
+- /* nokaiser doesn't accept parameters */
+- if (s)
+- return -EINVAL;
+-#ifdef CONFIG_KAISER
+- kaiser_enabled = 0;
+- setup_clear_cpu_cap(X86_FEATURE_KAISER);
+- pr_info("nokaiser: KAISER feature disabled\n");
+-#endif
+- return 0;
+-}
+-early_param("nokaiser", x86_nokaiser_setup);
+ #endif
+
+ static int __init x86_noinvpcid_setup(char *s)
+@@ -696,10 +682,6 @@ void __cpuinit get_cpu_cap(struct cpuinf
+ c->x86_power = cpuid_edx(0x80000007);
+
+ init_scattered_cpuid_features(c);
+-#ifdef CONFIG_KAISER
+- if (kaiser_enabled)
+- set_cpu_cap(c, X86_FEATURE_KAISER);
+-#endif
+ }
+
+ static void __cpuinit identify_cpu_without_cpuid(struct cpuinfo_x86 *c)
+--- a/arch/x86/mm/kaiser.c
++++ b/arch/x86/mm/kaiser.c
+@@ -276,8 +276,13 @@ void __init kaiser_init(void)
+ {
+ int cpu;
+
+- if (!kaiser_enabled)
++ if (!kaiser_enabled) {
++ setup_clear_cpu_cap(X86_FEATURE_KAISER);
+ return;
++ }
++
++ setup_force_cpu_cap(X86_FEATURE_KAISER);
++
+ kaiser_init_all_pgds();
+
+ for_each_possible_cpu(cpu) {
+@@ -401,3 +406,16 @@ void kaiser_flush_tlb_on_return_to_user(
+ X86_CR3_PCID_USER_FLUSH | KAISER_SHADOW_PGD_OFFSET);
+ }
+ EXPORT_SYMBOL(kaiser_flush_tlb_on_return_to_user);
++
++static int __init x86_nokaiser_setup(char *s)
++{
++ /* nopti doesn't accept parameters */
++ if (s)
++ return -EINVAL;
++
++ kaiser_enabled = 0;
++ pr_info("Kernel/User page tables isolation: disabled\n");
++
++ return 0;
++}
++early_param("nopti", x86_nokaiser_setup);
+--- a/Documentation/kernel-parameters.txt
++++ b/Documentation/kernel-parameters.txt
+@@ -1831,7 +1831,7 @@ bytes respectively. Such letter suffixes
+
+ nojitter [IA64] Disables jitter checking for ITC timers.
+
+- nokaiser [X86-64] Disable KAISER isolation of kernel from user.
++ nopti [X86-64] Disable KAISER isolation of kernel from user.
+
+ no-kvmclock [X86,KVM] Disable paravirtualized KVM clock driver
+
diff --git a/patches.suse/kaiser-nokaiser-0008-x86-boot-add-early-cmdline-parsing-for-options-with-arguments.patch b/patches.suse/kaiser-nokaiser-0008-x86-boot-add-early-cmdline-parsing-for-options-with-arguments.patch
new file mode 100644
index 0000000000..e50ea84e19
--- /dev/null
+++ b/patches.suse/kaiser-nokaiser-0008-x86-boot-add-early-cmdline-parsing-for-options-with-arguments.patch
@@ -0,0 +1,171 @@
+From: Tom Lendacky <thomas.lendacky@amd.com>
+Date: Mon, 17 Jul 2017 16:10:33 -0500
+Subject: x86/boot: Add early cmdline parsing for options with arguments
+References: bsc#1068032 CVE-2017-5754
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+Git-commit: e505371dd83963caae1a37ead9524e8d997341be
+Patch-mainline: v4.14-rc1
+
+Add a cmdline_find_option() function to look for cmdline options that
+take arguments. The argument is returned in a supplied buffer and the
+argument length (regardless of whether it fits in the supplied buffer)
+is returned, with -1 indicating not found.
+
+Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
+Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
+Cc: Alexander Potapenko <glider@google.com>
+Cc: Andrey Ryabinin <aryabinin@virtuozzo.com>
+Cc: Andy Lutomirski <luto@kernel.org>
+Cc: Arnd Bergmann <arnd@arndb.de>
+Cc: Borislav Petkov <bp@alien8.de>
+Cc: Brijesh Singh <brijesh.singh@amd.com>
+Cc: Dave Young <dyoung@redhat.com>
+Cc: Dmitry Vyukov <dvyukov@google.com>
+Cc: Jonathan Corbet <corbet@lwn.net>
+Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
+Cc: Larry Woodman <lwoodman@redhat.com>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Matt Fleming <matt@codeblueprint.co.uk>
+Cc: Michael S. Tsirkin <mst@redhat.com>
+Cc: Paolo Bonzini <pbonzini@redhat.com>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Radim Krčmář <rkrcmar@redhat.com>
+Cc: Rik van Riel <riel@redhat.com>
+Cc: Toshimitsu Kani <toshi.kani@hpe.com>
+Cc: kasan-dev@googlegroups.com
+Cc: kvm@vger.kernel.org
+Cc: linux-arch@vger.kernel.org
+Cc: linux-doc@vger.kernel.org
+Cc: linux-efi@vger.kernel.org
+Cc: linux-mm@kvack.org
+Link: http://lkml.kernel.org/r/36b5f97492a9745dce27682305f990fc20e5cf8a.1500319216.git.thomas.lendacky@amd.com
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Acked-by: Borislav Petkov <bp@suse.de>
+---
+ arch/x86/include/asm/cmdline.h | 2
+ arch/x86/lib/cmdline.c | 105 +++++++++++++++++++++++++++++++++++++++++
+ 2 files changed, 107 insertions(+)
+
+--- a/arch/x86/include/asm/cmdline.h
++++ b/arch/x86/include/asm/cmdline.h
+@@ -2,5 +2,7 @@
+ #define _ASM_X86_CMDLINE_H
+
+ int cmdline_find_option_bool(const char *cmdline_ptr, const char *option);
++int cmdline_find_option(const char *cmdline_ptr, const char *option,
++ char *buffer, int bufsize);
+
+ #endif /* _ASM_X86_CMDLINE_H */
+--- a/arch/x86/lib/cmdline.c
++++ b/arch/x86/lib/cmdline.c
+@@ -82,3 +82,108 @@ int cmdline_find_option_bool(const char
+
+ return 0; /* Buffer overrun */
+ }
++
++/*
++ * Find a non-boolean option (i.e. option=argument). In accordance with
++ * standard Linux practice, if this option is repeated, this returns the
++ * last instance on the command line.
++ *
++ * @cmdline: the cmdline string
++ * @max_cmdline_size: the maximum size of cmdline
++ * @option: option string to look for
++ * @buffer: memory buffer to return the option argument
++ * @bufsize: size of the supplied memory buffer
++ *
++ * Returns the length of the argument (regardless of if it was
++ * truncated to fit in the buffer), or -1 on not found.
++ */
++static int
++__cmdline_find_option(const char *cmdline, int max_cmdline_size,
++ const char *option, char *buffer, int bufsize)
++{
++ char c;
++ int pos = 0, len = -1;
++ const char *opptr = NULL;
++ char *bufptr = buffer;
++ enum {
++ st_wordstart = 0, /* Start of word/after whitespace */
++ st_wordcmp, /* Comparing this word */
++ st_wordskip, /* Miscompare, skip */
++ st_bufcpy, /* Copying this to buffer */
++ } state = st_wordstart;
++
++ if (!cmdline)
++ return -1; /* No command line */
++
++ /*
++ * This 'pos' check ensures we do not overrun
++ * a non-NULL-terminated 'cmdline'
++ */
++ while (pos++ < max_cmdline_size) {
++ c = *(char *)cmdline++;
++ if (!c)
++ break;
++
++ switch (state) {
++ case st_wordstart:
++ if (myisspace(c))
++ break;
++
++ state = st_wordcmp;
++ opptr = option;
++ /* fall through */
++
++ case st_wordcmp:
++ if ((c == '=') && !*opptr) {
++ /*
++ * We matched all the way to the end of the
++ * option we were looking for, prepare to
++ * copy the argument.
++ */
++ len = 0;
++ bufptr = buffer;
++ state = st_bufcpy;
++ break;
++ } else if (c == *opptr++) {
++ /*
++ * We are currently matching, so continue
++ * to the next character on the cmdline.
++ */
++ break;
++ }
++ state = st_wordskip;
++ /* fall through */
++
++ case st_wordskip:
++ if (myisspace(c))
++ state = st_wordstart;
++ break;
++
++ case st_bufcpy:
++ if (myisspace(c)) {
++ state = st_wordstart;
++ } else {
++ /*
++ * Increment len, but don't overrun the
++ * supplied buffer and leave room for the
++ * NULL terminator.
++ */
++ if (++len < bufsize)
++ *bufptr++ = c;
++ }
++ break;
++ }
++ }
++
++ if (bufsize)
++ *bufptr = '\0';
++
++ return len;
++}
++
++int cmdline_find_option(const char *cmdline, const char *option, char *buffer,
++ int bufsize)
++{
++ return __cmdline_find_option(cmdline, COMMAND_LINE_SIZE, option,
++ buffer, bufsize);
++}
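
A short usage sketch of the new helper; the wrapper function and option name are illustrative only, but the next patch in this series uses cmdline_find_option() in essentially this way.

    static bool __init pti_requested_off(void)
    {
            char arg[5];
            int len = cmdline_find_option(boot_command_line, "pti",
                                          arg, sizeof(arg));

            /* len is -1 when "pti=" is absent; otherwise it is the argument
             * length, even if the argument was truncated to fit arg[]. */
            return len > 0 && !strncmp(arg, "off", 3);
    }
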
diff --git a/patches.suse/kaiser-nokaiser-0009-kaiser-add_pti_cmdline_option_and_documentation.patch b/patches.suse/kaiser-nokaiser-0009-kaiser-add_pti_cmdline_option_and_documentation.patch
new file mode 100644
index 0000000000..252be25110
--- /dev/null
+++ b/patches.suse/kaiser-nokaiser-0009-kaiser-add_pti_cmdline_option_and_documentation.patch
@@ -0,0 +1,119 @@
+From: Borislav Petkov <bp@suse.de>
+Subject: x86/kaiser: Check boottime cmdline params
+References: bsc#1068032 CVE-2017-5754
+Patch-mainline: Not yet, under development
+
+AMD (and possibly other vendors) are not affected by the leak
+KAISER is protecting against.
+
+Keep the "nopti" for traditional reasons and add pti=<on|off|auto>
+like upstream.
+
+Signed-off-by: Borislav Petkov <bp@suse.de>
+---
+ Documentation/kernel-parameters.txt | 6 +++
+ arch/x86/mm/kaiser.c | 59 +++++++++++++++++++++++++-----------
+ 2 files changed, 47 insertions(+), 18 deletions(-)
+
+--- a/arch/x86/mm/kaiser.c
++++ b/arch/x86/mm/kaiser.c
+@@ -20,6 +20,7 @@ extern struct mm_struct init_mm;
+ #include <asm/pgalloc.h>
+ #include <asm/desc.h>
+ #include <asm/asm-offsets.h>
++#include <asm/cmdline.h>
+
+ int kaiser_enabled __read_mostly = 1;
+ EXPORT_SYMBOL(kaiser_enabled); /* for inlined TLB flush functions */
+@@ -260,6 +261,43 @@ static void __init kaiser_init_all_pgds(
+ WARN_ON(__ret); \
+ } while (0)
+
++void __init kaiser_check_boottime_disable(void)
++{
++ bool enable = true;
++ char arg[5];
++ int ret;
++
++ ret = cmdline_find_option(boot_command_line, "pti", arg, sizeof(arg));
++ if (ret > 0) {
++ if (!strncmp(arg, "on", 2))
++ goto enable;
++
++ if (!strncmp(arg, "off", 3))
++ goto disable;
++
++ if (!strncmp(arg, "auto", 4))
++ goto skip;
++ }
++
++ if (cmdline_find_option_bool(boot_command_line, "nopti"))
++ goto disable;
++
++skip:
++ if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
++ goto disable;
++
++enable:
++ if (enable)
++ setup_force_cpu_cap(X86_FEATURE_KAISER);
++
++ return;
++
++disable:
++ pr_info("Kernel/User page tables isolation: disabled\n");
++ kaiser_enabled = 0;
++ setup_clear_cpu_cap(X86_FEATURE_KAISER);
++}
++
+ /*
+ * If anything in here fails, we will likely die on one of the
+ * first kernel->user transitions and init will die. But, we
+@@ -271,12 +309,10 @@ void __init kaiser_init(void)
+ {
+ int cpu;
+
+- if (!kaiser_enabled) {
+- setup_clear_cpu_cap(X86_FEATURE_KAISER);
+- return;
+- }
++ kaiser_check_boottime_disable();
+
+- setup_force_cpu_cap(X86_FEATURE_KAISER);
++ if (!kaiser_enabled)
++ return;
+
+ kaiser_init_all_pgds();
+
+@@ -401,16 +437,3 @@ void kaiser_flush_tlb_on_return_to_user(
+ X86_CR3_PCID_USER_FLUSH | KAISER_SHADOW_PGD_OFFSET);
+ }
+ EXPORT_SYMBOL(kaiser_flush_tlb_on_return_to_user);
+-
+-static int __init x86_nokaiser_setup(char *s)
+-{
+- /* nopti doesn't accept parameters */
+- if (s)
+- return -EINVAL;
+-
+- kaiser_enabled = 0;
+- pr_info("Kernel/User page tables isolation: disabled\n");
+-
+- return 0;
+-}
+-early_param("nopti", x86_nokaiser_setup);
+--- a/Documentation/kernel-parameters.txt
++++ b/Documentation/kernel-parameters.txt
+@@ -2277,6 +2277,12 @@ bytes respectively. Such letter suffixes
+ pt. [PARIDE]
+ See Documentation/blockdev/paride.txt.
+
++ pti= [X86_64]
++ Control KAISER user/kernel address space isolation:
++ on - enable
++ off - disable
++ auto - default setting
++
+ pty.legacy_count=
+ [KNL] Number of legacy pty's. Overwrites compiled-in
+ default number.
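
Condensed into a hedged sketch, the precedence implemented by kaiser_check_boottime_disable() above is the following; the helper and its arguments are purely illustrative.

    static bool want_kaiser(const char *pti_arg, bool nopti_given, bool is_amd)
    {
            if (pti_arg && !strcmp(pti_arg, "on"))
                    return true;
            if (pti_arg && !strcmp(pti_arg, "off"))
                    return false;
            if (pti_arg && !strcmp(pti_arg, "auto"))
                    return !is_amd;         /* auto: off on AMD, on elsewhere */
            if (nopti_given)
                    return false;           /* legacy spelling of pti=off */
            return !is_amd;                 /* nothing given: same as auto */
    }
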
diff --git a/patches.suse/kaiser-nokaiser-0010-move-pti-feature-check-up.patch b/patches.suse/kaiser-nokaiser-0010-move-pti-feature-check-up.patch
new file mode 100644
index 0000000000..65764cd85b
--- /dev/null
+++ b/patches.suse/kaiser-nokaiser-0010-move-pti-feature-check-up.patch
@@ -0,0 +1,77 @@
+From: Borislav Petkov <bp@suse.de>
+Date: Mon Dec 25 13:57:16 CET 2017
+Subject: x86/kaiser: Move feature detection up
+Patch-mainline: Not yet, under development
+References: bsc#1068032
+
+... before the first use of kaiser_enabled as otherwise funky
+things happen:
+
+ about to get started...
+ (XEN) d0v0 Unhandled page fault fault/trap [#14, ec=0000]
+ (XEN) Pagetable walk from ffff88022a449090:
+ (XEN) L4[0x110] = 0000000229e0e067 0000000000001e0e
+ (XEN) L3[0x008] = 0000000000000000 ffffffffffffffff
+ (XEN) domain_crash_sync called from entry.S: fault at ffff82d08033fd08
+ entry.o#create_bounce_frame+0x135/0x14d
+ (XEN) Domain 0 (vcpu#0) crashed on cpu#0:
+ (XEN) ----[ Xen-4.9.1_02-3.21 x86_64 debug=n Not tainted ]----
+ (XEN) CPU: 0
+ (XEN) RIP: e033:[<ffffffff81007460>]
+ (XEN) RFLAGS: 0000000000000286 EM: 1 CONTEXT: pv guest (d0v0)
+
+Signed-off-by: Borislav Petkov <bp@suse.de>
+
+---
+ arch/x86/include/asm/kaiser.h | 2 ++
+ arch/x86/kernel/setup.c | 7 +++++++
+ arch/x86/mm/kaiser.c | 2 --
+ 3 files changed, 9 insertions(+), 2 deletions(-)
+
+--- a/arch/x86/include/asm/kaiser.h
++++ b/arch/x86/include/asm/kaiser.h
+@@ -98,8 +98,10 @@ DECLARE_PER_CPU(unsigned long, x86_cr3_p
+ extern char __per_cpu_user_mapped_start[], __per_cpu_user_mapped_end[];
+
+ extern int kaiser_enabled;
++extern void __init kaiser_check_boottime_disable(void);
+ #else
+ #define kaiser_enabled 0
++static inline void __init kaiser_check_boottime_disable(void) {}
+ #endif /* CONFIG_KAISER */
+
+ /*
+--- a/arch/x86/kernel/setup.c
++++ b/arch/x86/kernel/setup.c
+@@ -114,6 +114,7 @@
+ #include <asm/mce.h>
+ #include <asm/alternative.h>
+ #include <asm/prom.h>
++#include <asm/kaiser.h>
+
+ /*
+ * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries.
+@@ -915,6 +916,12 @@ void __init setup_arch(char **cmdline_p)
+ */
+ init_hypervisor_platform();
+
++ /*
++	 * This needs to happen right after XENPV is set on Xen and
++	 * before kaiser_enabled is first checked below in cleanup_highmap().
++ */
++ kaiser_check_boottime_disable();
++
+ x86_init.resources.probe_roms();
+
+ /* after parse_early_param, so could debug it */
+--- a/arch/x86/mm/kaiser.c
++++ b/arch/x86/mm/kaiser.c
+@@ -309,8 +309,6 @@ void __init kaiser_init(void)
+ {
+ int cpu;
+
+- kaiser_check_boottime_disable();
+-
+ if (!kaiser_enabled)
+ return;
+
diff --git a/patches.suse/powerpc-add-gmb.patch b/patches.suse/powerpc-add-gmb.patch
new file mode 100644
index 0000000000..a91be20e5b
--- /dev/null
+++ b/patches.suse/powerpc-add-gmb.patch
@@ -0,0 +1,18 @@
+From: Jiri Kosina <jkosina@suse.cz>
+Subject: powerpc/barrier: add gmb
+Patch-mainline: Not yet, under development
+References: bsc#1068032 CVE-2017-5753
+
+Signed-off-by: Jiri Kosina <jkosina@suse.cz>
+
+--- a/arch/powerpc/include/asm/system.h
++++ b/arch/powerpc/include/asm/system.h
+@@ -42,6 +42,8 @@
+
+ #define set_mb(var, value) do { var = value; mb(); } while (0)
+
++#define gmb() asm volatile("ori 31,31,0")
++
+ #ifdef __KERNEL__
+ #define AT_VECTOR_SIZE_ARCH 6 /* entries in ARCH_DLINFO */
+ #ifdef CONFIG_SMP
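
gmb() is consumed by the speculation fixes earlier in this series (the 0002-0007 "prevent speculative execution" patches); the usage pattern there is roughly the following hedged sketch, with made-up array and index names.

    /* Keep the CPU from speculatively dereferencing table[idx] before the
     * bounds check has actually been resolved. */
    static int lookup(const int *table, unsigned long nr, unsigned long idx)
    {
            if (idx >= nr)
                    return -EINVAL;
            gmb();
            return table[idx];
    }
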
diff --git a/patches.suse/powerpc-rfi-flush.patch b/patches.suse/powerpc-rfi-flush.patch
new file mode 100644
index 0000000000..a22e12e601
--- /dev/null
+++ b/patches.suse/powerpc-rfi-flush.patch
@@ -0,0 +1,1055 @@
+From: Nicholas Piggin <npiggin@gmail.com>
+Date: Fri, 15 Dec 2017 16:34:38 +1100
+Subject: [PATCH] powerpc: Secure memory rfi flush
+Patch-mainline: Not yet, under development
+References: bsc#1068032
+
+This puts a nop before each rfid/hrfid and patches in an L1-D
+cache flush instruction where possible.
+
+It provides /sys/devices/system/cpu/rfi_flush, which reports the current
+state and can patch the rfi flushes at runtime.
+
+This has some debug checking in the rfi instructions to make sure
+we're returning to the context we think we are, so we can avoid
+some flushes.
+
+[duwe@suse.de: fix boot crash in fast_exception_return; needs to be
+ cross-checked with IBM still]
+[msuchanek@suse.de: fix boot crash in slb_miss_realmode]
+Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
+Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
+Signed-off-by: Jiri Kosina <jkosina@suse.cz>
+---
+ arch/powerpc/include/asm/exception-64s.h | 58 ++++-
+ arch/powerpc/include/asm/feature-fixups.h | 16 ++
+ arch/powerpc/include/asm/reg.h | 2 +
+ arch/powerpc/include/asm/setup.h | 7 +
+ arch/powerpc/kernel/entry_64.S | 37 ++-
+ arch/powerpc/kernel/exceptions-64s.S | 57 ++++-
+ arch/powerpc/kernel/setup.h | 1 +
+ arch/powerpc/kernel/setup_64.c | 76 ++++++
+ arch/powerpc/kernel/sysfs.c | 40 ++++
+ arch/powerpc/kernel/vmlinux.lds.S | 9 +
+ arch/powerpc/kvm/book3s_rmhandlers.S | 4 +-
+ arch/powerpc/lib/feature-fixups.c | 27 +++
+ arch/powerpc/platforms/powernv/setup.c | 376 ++++++++++++++++++++++++++++++
+ arch/powerpc/platforms/pseries/setup.c | 2 +
+ 14 files changed, 693 insertions(+), 19 deletions(-)
+ create mode 100644 arch/powerpc/platforms/powernv/setup.c
+
+diff --git a/arch/powerpc/include/asm/exception-64s.h b/arch/powerpc/include/asm/exception-64s.h
+index f9fa656e141b..1a91dec97b67 100644
+--- a/arch/powerpc/include/asm/exception-64s.h
++++ b/arch/powerpc/include/asm/exception-64s.h
+@@ -35,6 +35,8 @@
+ * implementations as possible.
+ */
+
++#include <asm/bug.h>
++
+ #define EX_R9 0
+ #define EX_R10 8
+ #define EX_R11 16
+@@ -50,6 +52,58 @@
+ #define EX_PPR 88 /* SMT thread status register (priority) */
+
+ /*
++ * The nop instruction allows a secure memory protection instruction to be
++ * inserted with the rfi flush fixup.
++ */
++#define PREPARE_RFI_TO_USER \
++ RFI_FLUSH_FIXUP_SECTION; \
++ nop
++
++#define PREPARE_RFI_TO_GUEST \
++ RFI_FLUSH_FIXUP_SECTION; \
++ nop
++
++#define DEBUG_RFI
++
++#ifdef DEBUG_RFI
++#define CHECK_TARGET_MSR_PR(srr_reg, expected_pr) \
++ SET_SCRATCH0(r3); \
++ mfspr r3,srr_reg; \
++ extrdi r3,r3,1,63-MSR_PR_LG; \
++666: tdnei r3,expected_pr; \
++ EMIT_BUG_ENTRY 666b,__FILE__,__LINE__,0; \
++ GET_SCRATCH0(r3);
++#else
++#define CHECK_TARGET_MSR_PR(expected)
++#endif
++
++#define RFI_TO_KERNEL \
++ CHECK_TARGET_MSR_PR(SPRN_SRR1, 0); \
++ rfid
++
++#define RFI_TO_USER \
++ CHECK_TARGET_MSR_PR(SPRN_SRR1, 1); \
++ PREPARE_RFI_TO_USER; \
++ rfid
++
++#define RFI_TO_GUEST \
++ PREPARE_RFI_TO_GUEST; \
++ rfid
++
++#define HRFI_TO_KERNEL \
++ CHECK_TARGET_MSR_PR(SPRN_HSRR1, 0); \
++ hrfid
++
++#define HRFI_TO_USER \
++ CHECK_TARGET_MSR_PR(SPRN_HSRR1, 1); \
++ PREPARE_RFI_TO_USER; \
++ hrfid
++
++#define HRFI_TO_GUEST \
++ PREPARE_RFI_TO_GUEST; \
++ hrfid
++
++/*
+ * We're short on space and time in the exception prolog, so we can't
+ * use the normal SET_REG_IMMEDIATE macro. Normally we just need the
+ * low halfword of the address, but for Kdump we need the whole low
+@@ -122,7 +176,7 @@ END_FTR_SECTION_NESTED(CPU_FTR_HAS_PPR,CPU_FTR_HAS_PPR,943)
+ mtspr SPRN_##h##SRR0,r12; \
+ mfspr r12,SPRN_##h##SRR1; /* and SRR1 */ \
+ mtspr SPRN_##h##SRR1,r10; \
+- h##rfid; \
++ h##rfid; /* should be h##RFI_TO_KERNEL but run out of space */ \
+ b . /* prevent speculative execution */
+ #define EXCEPTION_PROLOG_PSERIES_1(label, h) \
+ __EXCEPTION_PROLOG_PSERIES_1(label, h)
+@@ -241,7 +295,7 @@ label##_hv: \
+ mtspr SPRN_##h##SRR0,r12; \
+ mfspr r12,SPRN_##h##SRR1; /* and SRR1 */ \
+ mtspr SPRN_##h##SRR1,r10; \
+- h##rfid; \
++ h##RFI_TO_KERNEL; \
+ b . /* prevent speculative execution */
+ #define _MASKABLE_EXCEPTION_PSERIES(vec, label, h) \
+ __MASKABLE_EXCEPTION_PSERIES(vec, label, h)
+diff --git a/arch/powerpc/include/asm/feature-fixups.h b/arch/powerpc/include/asm/feature-fixups.h
+index 9a67a38bf7b9..6ef97605fa64 100644
+--- a/arch/powerpc/include/asm/feature-fixups.h
++++ b/arch/powerpc/include/asm/feature-fixups.h
+@@ -184,4 +184,20 @@ label##3: \
+ FTR_ENTRY_OFFSET label##1b-label##3b; \
+ .popsection;
+
++#define RFI_FLUSH_FIXUP_SECTION \
++951: \
++ .pushsection __rfi_flush_fixup,"a"; \
++ .align 2; \
++952: \
++ FTR_ENTRY_OFFSET 951b-952b; \
++ .popsection;
++
++
++#ifndef __ASSEMBLY__
++#include <linux/types.h>
++
++extern long __start___rfi_flush_fixup, __stop___rfi_flush_fixup;
++extern void do_rfi_flush_fixups(bool enable, unsigned int insn);
++#endif
++
+ #endif /* __ASM_POWERPC_FEATURE_FIXUPS_H */
+diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
+index d0e3509e3119..b236b036bdd4 100644
+--- a/arch/powerpc/include/asm/reg.h
++++ b/arch/powerpc/include/asm/reg.h
+@@ -1034,8 +1034,10 @@
+ #define PV_630p 0x0041
+ #define PV_970MP 0x0044
+ #define PV_970GX 0x0045
++#define PVR_POWER7 0x003F
+ #define PVR_POWER7p 0x004A
+ #define PVR_POWER8E 0x004B
++#define PVR_POWER8NVL 0x004C
+ #define PVR_POWER8 0x004D
+ #define PV_BE 0x0070
+ #define PV_PA6T 0x0090
+diff --git a/arch/powerpc/include/asm/setup.h b/arch/powerpc/include/asm/setup.h
+index dae19342f0b9..4130653083b5 100644
+--- a/arch/powerpc/include/asm/setup.h
++++ b/arch/powerpc/include/asm/setup.h
+@@ -3,4 +3,11 @@
+
+ #include <asm-generic/setup.h>
+
++#ifndef __ASSEMBLY__
++
++void rfi_flush_enable(bool enable);
++void __init setup_rfi_flush(void);
++
++#endif /* !__ASSEMBLY__ */
++
+ #endif /* _ASM_POWERPC_SETUP_H */
+diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S
+index d8a74c7a3489..5226630a109f 100644
+--- a/arch/powerpc/kernel/entry_64.S
++++ b/arch/powerpc/kernel/entry_64.S
+@@ -32,6 +32,9 @@
+ #include <asm/ptrace.h>
+ #include <asm/irqflags.h>
+ #include <asm/ftrace.h>
++#ifdef CONFIG_PPC_BOOK3S
++#include <asm/exception-64s.h>
++#endif
+
+ /*
+ * System calls.
+@@ -242,13 +245,23 @@ END_FTR_SECTION_IFCLR(CPU_FTR_STCX_CHECKS_ADDRESS)
+ ACCOUNT_CPU_USER_EXIT(r11, r12)
+ HMT_MEDIUM_LOW_HAS_PPR
+ ld r13,GPR13(r1) /* only restore r13 if returning to usermode */
++ ld r2,GPR2(r1)
++ ld r1,GPR1(r1)
++ mtlr r4
++ mtcr r5
++ mtspr SPRN_SRR0,r7
++ mtspr SPRN_SRR1,r8
++ RFI_TO_USER
++ b . /* prevent speculative execution */
++
++ /* exit to kernel */
+ 1: ld r2,GPR2(r1)
+ ld r1,GPR1(r1)
+ mtlr r4
+ mtcr r5
+ mtspr SPRN_SRR0,r7
+ mtspr SPRN_SRR1,r8
+- RFI
++ RFI_TO_KERNEL
+ b . /* prevent speculative execution */
+
+ syscall_error:
+@@ -708,7 +721,7 @@ BEGIN_FTR_SECTION
+ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
+ ACCOUNT_CPU_USER_EXIT(r2, r4)
+ REST_GPR(13, r1)
+-1:
++
+ mtspr SPRN_SRR1,r3
+
+ ld r2,_CCR(r1)
+@@ -721,8 +734,22 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
+ ld r3,GPR3(r1)
+ ld r4,GPR4(r1)
+ ld r1,GPR1(r1)
++ RFI_TO_USER
++ b . /* prevent speculative execution */
++
++1: mtspr SPRN_SRR1,r3
++
++ ld r2,_CCR(r1)
++ mtcrf 0xFF,r2
++ ld r2,_NIP(r1)
++ mtspr SPRN_SRR0,r2
+
+- rfid
++ ld r0,GPR0(r1)
++ ld r2,GPR2(r1)
++ ld r3,GPR3(r1)
++ ld r4,GPR4(r1)
++ ld r1,GPR1(r1)
++ RFI_TO_KERNEL
+ b . /* prevent speculative execution */
+
+ #endif /* CONFIG_PPC_BOOK3E */
+@@ -906,7 +933,7 @@ _GLOBAL(enter_rtas)
+
+ mtspr SPRN_SRR0,r5
+ mtspr SPRN_SRR1,r6
+- rfid
++ RFI_TO_KERNEL
+ b . /* prevent speculative execution */
+
+ _STATIC(rtas_return_loc)
+@@ -929,7 +956,7 @@ _STATIC(rtas_return_loc)
+
+ mtspr SPRN_SRR0,r3
+ mtspr SPRN_SRR1,r4
+- rfid
++ RFI_TO_KERNEL
+ b . /* prevent speculative execution */
+
+ .align 3
+diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
+index 2c4ffe3bdd34..763b3db5dd73 100644
+--- a/arch/powerpc/kernel/exceptions-64s.S
++++ b/arch/powerpc/kernel/exceptions-64s.S
+@@ -224,14 +224,14 @@ END_FTR_SECTION_IFSET(CPU_FTR_REAL_LE)
+ mtspr SPRN_SRR0,r10
+ ld r10,PACAKMSR(r13)
+ mtspr SPRN_SRR1,r10
+- rfid
++ RFI_TO_KERNEL
+ b . /* prevent speculative execution */
+
+ /* Fast LE/BE switch system call */
+ 1: mfspr r12,SPRN_SRR1
+ xori r12,r12,MSR_LE
+ mtspr SPRN_SRR1,r12
+- rfid /* return to userspace */
++ RFI_TO_USER /* return to userspace */
+ b .
+
+ STD_EXCEPTION_PSERIES(0xd00, 0xd00, single_step)
+@@ -312,7 +312,7 @@ masked_interrupt:
+ mtspr SPRN_SRR1,r10
+ ld r10,PACA_EXGEN+EX_R10(r13)
+ GET_SCRATCH0(r13)
+- rfid
++ RFI_TO_KERNEL
+ b .
+
+ masked_Hinterrupt:
+@@ -325,7 +325,7 @@ masked_Hinterrupt:
+ mtspr SPRN_HSRR1,r10
+ ld r10,PACA_EXGEN+EX_R10(r13)
+ GET_SCRATCH0(r13)
+- hrfid
++ HRFI_TO_KERNEL
+ b .
+
+ #ifndef CONFIG_BIGMEM
+@@ -382,7 +382,7 @@ slb_miss_user_pseries:
+ mtspr SRR0,r12
+ mfspr r12,SRR1 /* and SRR1 */
+ mtspr SRR1,r10
+- rfid
++ RFI_TO_KERNEL
+ b . /* prevent spec. execution */
+ #endif /* __DISABLED__ */
+
+@@ -591,7 +591,7 @@ slb_miss_user_common:
+ ld r11,PACA_EXGEN+EX_R11(r13)
+ ld r12,PACA_EXGEN+EX_R12(r13)
+ ld r13,PACA_EXGEN+EX_R13(r13)
+- rfid
++ RFI_TO_USER
+ b .
+
+ slb_miss_fault:
+@@ -649,6 +649,31 @@ END_FW_FTR_SECTION_IFSET(FW_FEATURE_ISERIES)
+ andi. r10,r12,MSR_RI /* check for unrecoverable exception */
+ beq- 2f
+
++ andi. r10,r12,MSR_PR /* check for userspace exception */
++ beq 1f /* returning to kernel */
++
++.machine push
++.machine "power4"
++ mtcrf 0x80,r9
++ mtcrf 0x01,r9 /* slb_allocate uses cr0 and cr7 */
++.machine pop
++
++#ifdef CONFIG_PPC_ISERIES
++BEGIN_FW_FTR_SECTION
++ mtspr SPRN_SRR0,r11
++ mtspr SPRN_SRR1,r12
++END_FW_FTR_SECTION_IFSET(FW_FEATURE_ISERIES)
++#endif /* CONFIG_PPC_ISERIES */
++ RESTORE_PPR_PACA(PACA_EXSLB, r9)
++ ld r9,PACA_EXSLB+EX_R9(r13)
++ ld r10,PACA_EXSLB+EX_R10(r13)
++ ld r11,PACA_EXSLB+EX_R11(r13)
++ ld r12,PACA_EXSLB+EX_R12(r13)
++ ld r13,PACA_EXSLB+EX_R13(r13)
++ RFI_TO_USER
++ b . /* prevent speculative execution */
++
++1:
+ .machine push
+ .machine "power4"
+ mtcrf 0x80,r9
+@@ -667,9 +692,10 @@ END_FW_FTR_SECTION_IFSET(FW_FEATURE_ISERIES)
+ ld r11,PACA_EXSLB+EX_R11(r13)
+ ld r12,PACA_EXSLB+EX_R12(r13)
+ ld r13,PACA_EXSLB+EX_R13(r13)
+- rfid
++ RFI_TO_KERNEL
+ b . /* prevent speculative execution */
+
++
+ 2:
+ #ifdef CONFIG_PPC_ISERIES
+ BEGIN_FW_FTR_SECTION
+@@ -682,7 +708,7 @@ END_FW_FTR_SECTION_IFSET(FW_FEATURE_ISERIES)
+ mtspr SPRN_SRR0,r10
+ ld r10,PACAKMSR(r13)
+ mtspr SPRN_SRR1,r10
+- rfid
++ RFI_TO_KERNEL
+ b .
+
+ unrecov_slb:
+@@ -850,9 +876,20 @@ fast_exception_return:
+
+ mtspr SPRN_SRR1,r12
+ mtspr SPRN_SRR0,r11
++ mfcr r11
++ andi. r12,r12,__MASK(MSR_PR_LG)
++ beq 3f
++
++ mtcr r11
++ REST_4GPRS(10, r1)
++ ld r1,GPR1(r1)
++ RFI_TO_USER
++ b . /* prevent speculative execution */
++
++3: mtcr r11
+ REST_4GPRS(10, r1)
+ ld r1,GPR1(r1)
+- rfid
++ RFI_TO_KERNEL
+ b . /* prevent speculative execution */
+
+ unrecov_fer:
+@@ -1105,7 +1142,7 @@ _GLOBAL(do_stab_bolted)
+ ld r11,PACA_EXSLB+EX_R11(r13)
+ ld r12,PACA_EXSLB+EX_R12(r13)
+ ld r13,PACA_EXSLB+EX_R13(r13)
+- rfid
++ RFI_TO_USER
+ b . /* prevent speculative execution */
+
+ #endif
+diff --git a/arch/powerpc/kernel/setup.h b/arch/powerpc/kernel/setup.h
+index 4c67ad7fae08..134a46183b78 100644
+--- a/arch/powerpc/kernel/setup.h
++++ b/arch/powerpc/kernel/setup.h
+@@ -5,5 +5,6 @@ void check_for_initrd(void);
+ void do_init_bootmem(void);
+ void setup_panic(void);
+ extern int do_early_xmon;
++extern bool rfi_flush;
+
+ #endif /* _POWERPC_KERNEL_SETUP_H */
+diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c
+index 1408118d5cb1..bffd0c6c115a 100644
+--- a/arch/powerpc/kernel/setup_64.c
++++ b/arch/powerpc/kernel/setup_64.c
+@@ -673,3 +673,79 @@ struct ppc_pci_io ppc_pci_io;
+ EXPORT_SYMBOL(ppc_pci_io);
+ #endif /* CONFIG_PPC_INDIRECT_IO */
+
++#ifdef CONFIG_PPC_BOOK3S_64
++enum l1d_flush_type {
++ L1D_FLUSH_NONE,
++ L1D_FLUSH_ORI,
++ L1D_FLUSH_MTTRIG,
++};
++
++enum l1d_flush_type l1d_flush_type;
++
++bool rfi_flush;
++
++static void do_rfi_flush(void *val)
++{
++ switch (l1d_flush_type) {
++ case L1D_FLUSH_ORI:
++ asm volatile("ori 30,30,0" ::: "memory");
++ break;
++ case L1D_FLUSH_MTTRIG:
++ asm volatile("mtspr 882,0" ::: "memory");
++ break;
++ default:
++ break;
++ }
++}
++
++void rfi_flush_enable(bool enable)
++{
++ unsigned int insn;
++
++ if (rfi_flush == enable)
++ return;
++
++ switch (l1d_flush_type) {
++ case L1D_FLUSH_ORI:
++ insn = 0x63de0000;
++ break;
++ case L1D_FLUSH_MTTRIG:
++ insn = 0x7c12dba6;
++ break;
++ default:
++ printk("Secure memory protection not enabled! System is vulnerable to local exploit. Update firmware.\n");
++ return;
++ }
++
++ do_rfi_flush_fixups(enable, insn);
++
++ if (enable)
++ on_each_cpu(do_rfi_flush, NULL, 1);
++
++ rfi_flush = enable;
++}
++
++/* This tries to guess the cpu characteristics based on the PVR. */
++static bool get_cpu_characteristics(void)
++{
++ if (__is_processor(PVR_POWER7) || __is_processor(PVR_POWER7p))
++ l1d_flush_type = L1D_FLUSH_NONE;
++ else if (__is_processor(PVR_POWER8E) ||
++ __is_processor(PVR_POWER8))
++ l1d_flush_type = L1D_FLUSH_ORI;
++ else {
++ /* unknown CPU */
++ l1d_flush_type = L1D_FLUSH_NONE;
++ return false;
++ }
++
++ return true;
++}
++
++void __init setup_rfi_flush(void)
++{
++ if (get_cpu_characteristics())
++ rfi_flush_enable(true);
++}
++#endif /* CONFIG_PPC_BOOK3S_64 */
++
+diff --git a/arch/powerpc/kernel/sysfs.c b/arch/powerpc/kernel/sysfs.c
+index 1433c0a4f2da..e499370a0e55 100644
+--- a/arch/powerpc/kernel/sysfs.c
++++ b/arch/powerpc/kernel/sysfs.c
+@@ -21,6 +21,7 @@
+ #include <asm/system.h>
+
+ #include "cacheinfo.h"
++#include "setup.h"
+
+ #ifdef CONFIG_PPC64
+ #include <asm/paca.h>
+@@ -196,6 +197,44 @@ static SYSDEV_ATTR(spurr, 0600, show_spurr, NULL);
+ static SYSDEV_ATTR(purr, 0600, show_purr, store_purr);
+ static SYSDEV_ATTR(pir, 0400, show_pir, NULL);
+
++#ifdef CONFIG_PPC_BOOK3S_64
++static ssize_t show_rfi_flush(struct sysdev_class *class,
++ struct sysdev_class_attribute *attr, char *buf)
++{
++ return sprintf(buf, "%d\n", rfi_flush ? 1 : 0);
++}
++
++static ssize_t __used store_rfi_flush(struct sysdev_class *class,
++ struct sysdev_class_attribute *attr, const char *buf,
++ size_t count)
++{
++ int val;
++ int ret = 0;
++
++ ret = sscanf(buf, "%d", &val);
++ if (ret != 1)
++ return -EINVAL;
++
++ if (val == 1)
++ rfi_flush_enable(true);
++ else if (val == 0)
++ rfi_flush_enable(false);
++ else
++ return -EINVAL;
++
++ return count;
++}
++
++static SYSDEV_CLASS_ATTR(rfi_flush, 0600,
++ show_rfi_flush, store_rfi_flush);
++
++static void sysfs_create_rfi_flush(void)
++{
++ sysfs_create_file(&cpu_sysdev_class.kset.kobj,
++ &attr_rfi_flush.attr);
++}
++#endif /* CONFIG_PPC_BOOK3S_64 */
++
+ static unsigned long dscr_default;
+
+ static void read_dscr(void *val)
+@@ -691,6 +730,7 @@ static int __init topology_init(void)
+ }
+ #ifdef CONFIG_PPC64
+ sysfs_create_dscr_default();
++ sysfs_create_rfi_flush();
+ #endif /* CONFIG_PPC64 */
+
+ return 0;
+diff --git a/arch/powerpc/kernel/vmlinux.lds.S b/arch/powerpc/kernel/vmlinux.lds.S
+index 3e8fe4b832fd..a4916671de52 100644
+--- a/arch/powerpc/kernel/vmlinux.lds.S
++++ b/arch/powerpc/kernel/vmlinux.lds.S
+@@ -72,6 +72,15 @@ SECTIONS
+ /* Read-only data */
+ RODATA
+
++#ifdef CONFIG_PPC64
++ . = ALIGN(8);
++ __rfi_flush_fixup : AT(ADDR(__rfi_flush_fixup) - LOAD_OFFSET) {
++ __start___rfi_flush_fixup = .;
++ *(__rfi_flush_fixup)
++ __stop___rfi_flush_fixup = .;
++ }
++#endif
++
+ EXCEPTION_TABLE(0)
+
+ NOTES :kernel :notes
+diff --git a/arch/powerpc/kvm/book3s_rmhandlers.S b/arch/powerpc/kvm/book3s_rmhandlers.S
+index 1a1b34487e71..3365c9219892 100644
+--- a/arch/powerpc/kvm/book3s_rmhandlers.S
++++ b/arch/powerpc/kvm/book3s_rmhandlers.S
+@@ -162,7 +162,7 @@ kvmppc_handler_skip_ins:
+ GET_SCRATCH0(r13)
+
+ /* And get back into the code */
+- RFI
++ RFI_TO_GUEST
+
+ /*
+ * This trampoline brings us back to a real mode handler
+@@ -199,7 +199,7 @@ _GLOBAL(kvmppc_rmcall)
+ sync
+ mtsrr0 r3
+ mtsrr1 r4
+- RFI
++ RFI_TO_KERNEL
+
+ #if defined(CONFIG_PPC_BOOK3S_32)
+ #define STACK_LR INT_FRAME_SIZE+4
+diff --git a/arch/powerpc/lib/feature-fixups.c b/arch/powerpc/lib/feature-fixups.c
+index 7a8a7487cee8..3e91e2ffc1d0 100644
+--- a/arch/powerpc/lib/feature-fixups.c
++++ b/arch/powerpc/lib/feature-fixups.c
+@@ -113,6 +113,33 @@ void do_feature_fixups(unsigned long value, void *fixup_start, void *fixup_end)
+ }
+ }
+
++#ifdef CONFIG_PPC_BOOK3S_64
++void do_rfi_flush_fixups(bool enable, unsigned int insn)
++{
++ long *start, *end;
++ unsigned int *dest;
++ int i;
++
++ start = PTRRELOC(&__start___rfi_flush_fixup),
++ end = PTRRELOC(&__stop___rfi_flush_fixup);
++
++ for (i = 0; start < end; start++, i++) {
++ dest = (void *)start + *start;
++
++ pr_devel("RFI FLUSH FIXUP %s %lx\n", enable ? "enable" : "disable", (unsigned long)start);
++ if (!enable) {
++ pr_devel("patching dest %lx\n", (unsigned long)dest);
++ patch_instruction(dest, PPC_INST_NOP);
++ } else {
++ pr_devel("patching dest %lx\n", (unsigned long)dest);
++ patch_instruction(dest, insn);
++ }
++ }
++
++ printk(KERN_DEBUG "rfi-fixups: patched %d locations\n", i);
++}
++#endif /* CONFIG_PPC_BOOK3S_64 */
++
+ void do_lwsync_fixups(unsigned long value, void *fixup_start, void *fixup_end)
+ {
+ long *start, *end;
+diff --git a/arch/powerpc/platforms/powernv/setup.c b/arch/powerpc/platforms/powernv/setup.c
+new file mode 100644
+index 000000000000..320a6a74af3f
+--- /dev/null
++++ b/arch/powerpc/platforms/powernv/setup.c
+@@ -0,0 +1,376 @@
++/*
++ * PowerNV setup code.
++ *
++ * Copyright 2011 IBM Corp.
++ *
++ * This program is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU General Public License
++ * as published by the Free Software Foundation; either version
++ * 2 of the License, or (at your option) any later version.
++ */
++
++#undef DEBUG
++
++#include <linux/cpu.h>
++#include <linux/errno.h>
++#include <linux/sched.h>
++#include <linux/kernel.h>
++#include <linux/tty.h>
++#include <linux/reboot.h>
++#include <linux/init.h>
++#include <linux/console.h>
++#include <linux/delay.h>
++#include <linux/irq.h>
++#include <linux/seq_file.h>
++#include <linux/of.h>
++#include <linux/of_fdt.h>
++#include <linux/interrupt.h>
++#include <linux/bug.h>
++#include <linux/pci.h>
++#include <linux/cpufreq.h>
++
++#include <asm/machdep.h>
++#include <asm/firmware.h>
++#include <asm/xics.h>
++#include <asm/xive.h>
++#include <asm/opal.h>
++#include <asm/kexec.h>
++#include <asm/smp.h>
++#include <asm/tm.h>
++
++#include "powernv.h"
++
++static void __init pnv_setup_arch(void)
++{
++ set_arch_panic_timeout(10, ARCH_PANIC_TIMEOUT);
++
++ setup_rfi_flush();
++
++ /* Initialize SMP */
++ pnv_smp_init();
++
++ /* Setup PCI */
++ pnv_pci_init();
++
++ /* Setup RTC and NVRAM callbacks */
++ if (firmware_has_feature(FW_FEATURE_OPAL))
++ opal_nvram_init();
++
++ /* Enable NAP mode */
++ powersave_nap = 1;
++
++ /* XXX PMCS */
++}
++
++static void __init pnv_init(void)
++{
++ /*
++ * Initialize the LPC bus now so that legacy serial
++ * ports can be found on it
++ */
++ opal_lpc_init();
++
++#ifdef CONFIG_HVC_OPAL
++ if (firmware_has_feature(FW_FEATURE_OPAL))
++ hvc_opal_init_early();
++ else
++#endif
++ add_preferred_console("hvc", 0, NULL);
++}
++
++static void __init pnv_init_IRQ(void)
++{
++ /* Try using a XIVE if available, otherwise use a XICS */
++ if (!xive_native_init())
++ xics_init();
++
++ WARN_ON(!ppc_md.get_irq);
++}
++
++static void pnv_show_cpuinfo(struct seq_file *m)
++{
++ struct device_node *root;
++ const char *model = "";
++
++ root = of_find_node_by_path("/");
++ if (root)
++ model = of_get_property(root, "model", NULL);
++ seq_printf(m, "machine\t\t: PowerNV %s\n", model);
++ if (firmware_has_feature(FW_FEATURE_OPAL))
++ seq_printf(m, "firmware\t: OPAL\n");
++ else
++ seq_printf(m, "firmware\t: BML\n");
++ of_node_put(root);
++ if (radix_enabled())
++ seq_printf(m, "MMU\t\t: Radix\n");
++ else
++ seq_printf(m, "MMU\t\t: Hash\n");
++}
++
++static void pnv_prepare_going_down(void)
++{
++ /*
++ * Disable all notifiers from OPAL, we can't
++ * service interrupts anymore anyway
++ */
++ opal_event_shutdown();
++
++ /* Soft disable interrupts */
++ local_irq_disable();
++
++ /*
++	 * Return secondary CPUs to firmware if a flash update
++	 * is pending, otherwise we will get all sorts of error
++	 * messages about CPU being stuck etc. This will also
++ * have the side effect of hard disabling interrupts so
++ * past this point, the kernel is effectively dead.
++ */
++ opal_flash_term_callback();
++}
++
++static void __noreturn pnv_restart(char *cmd)
++{
++ long rc = OPAL_BUSY;
++
++ pnv_prepare_going_down();
++
++ while (rc == OPAL_BUSY || rc == OPAL_BUSY_EVENT) {
++ rc = opal_cec_reboot();
++ if (rc == OPAL_BUSY_EVENT)
++ opal_poll_events(NULL);
++ else
++ mdelay(10);
++ }
++ for (;;)
++ opal_poll_events(NULL);
++}
++
++static void __noreturn pnv_power_off(void)
++{
++ long rc = OPAL_BUSY;
++
++ pnv_prepare_going_down();
++
++ while (rc == OPAL_BUSY || rc == OPAL_BUSY_EVENT) {
++ rc = opal_cec_power_down(0);
++ if (rc == OPAL_BUSY_EVENT)
++ opal_poll_events(NULL);
++ else
++ mdelay(10);
++ }
++ for (;;)
++ opal_poll_events(NULL);
++}
++
++static void __noreturn pnv_halt(void)
++{
++ pnv_power_off();
++}
++
++static void pnv_progress(char *s, unsigned short hex)
++{
++}
++
++static void pnv_shutdown(void)
++{
++ /* Let the PCI code clear up IODA tables */
++ pnv_pci_shutdown();
++
++ /*
++ * Stop OPAL activity: Unregister all OPAL interrupts so they
++ * don't fire up while we kexec and make sure all potentially
++ * DMA'ing ops are complete (such as dump retrieval).
++ */
++ opal_shutdown();
++}
++
++#ifdef CONFIG_KEXEC_CORE
++static void pnv_kexec_wait_secondaries_down(void)
++{
++ int my_cpu, i, notified = -1;
++
++ my_cpu = get_cpu();
++
++ for_each_online_cpu(i) {
++ uint8_t status;
++ int64_t rc, timeout = 1000;
++
++ if (i == my_cpu)
++ continue;
++
++ for (;;) {
++ rc = opal_query_cpu_status(get_hard_smp_processor_id(i),
++ &status);
++ if (rc != OPAL_SUCCESS || status != OPAL_THREAD_STARTED)
++ break;
++ barrier();
++ if (i != notified) {
++ printk(KERN_INFO "kexec: waiting for cpu %d "
++ "(physical %d) to enter OPAL\n",
++ i, paca[i].hw_cpu_id);
++ notified = i;
++ }
++
++ /*
++ * On crash secondaries might be unreachable or hung,
++ * so timeout if we've waited too long
++ * */
++ mdelay(1);
++ if (timeout-- == 0) {
++ printk(KERN_ERR "kexec: timed out waiting for "
++ "cpu %d (physical %d) to enter OPAL\n",
++ i, paca[i].hw_cpu_id);
++ break;
++ }
++ }
++ }
++}
++
++static void pnv_kexec_cpu_down(int crash_shutdown, int secondary)
++{
++ u64 reinit_flags;
++
++ if (xive_enabled())
++ xive_kexec_teardown_cpu(secondary);
++ else
++ xics_kexec_teardown_cpu(secondary);
++
++ /* On OPAL, we return all CPUs to firmware */
++ if (!firmware_has_feature(FW_FEATURE_OPAL))
++ return;
++
++ if (secondary) {
++ /* Return secondary CPUs to firmware on OPAL v3 */
++ mb();
++ get_paca()->kexec_state = KEXEC_STATE_REAL_MODE;
++ mb();
++
++ /* Return the CPU to OPAL */
++ opal_return_cpu();
++ } else {
++ /* Primary waits for the secondaries to have reached OPAL */
++ pnv_kexec_wait_secondaries_down();
++
++ /* Switch XIVE back to emulation mode */
++ if (xive_enabled())
++ xive_shutdown();
++
++ /*
++ * We might be running as little-endian - now that interrupts
++ * are disabled, reset the HILE bit to big-endian so we don't
++ * take interrupts in the wrong endian later
++ *
++ * We reinit to enable both radix and hash on P9 to ensure
++ * the mode used by the next kernel is always supported.
++ */
++ reinit_flags = OPAL_REINIT_CPUS_HILE_BE;
++ if (cpu_has_feature(CPU_FTR_ARCH_300))
++ reinit_flags |= OPAL_REINIT_CPUS_MMU_RADIX |
++ OPAL_REINIT_CPUS_MMU_HASH;
++ opal_reinit_cpus(reinit_flags);
++ }
++}
++#endif /* CONFIG_KEXEC_CORE */
++
++#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
++static unsigned long pnv_memory_block_size(void)
++{
++ /*
++ * We map the kernel linear region with 1GB large pages on radix. For
++ * memory hot unplug to work our memory block size must be at least
++ * this size.
++ */
++ if (radix_enabled())
++ return 1UL * 1024 * 1024 * 1024;
++ else
++ return 256UL * 1024 * 1024;
++}
++#endif
++
++static void __init pnv_setup_machdep_opal(void)
++{
++ ppc_md.get_boot_time = opal_get_boot_time;
++ ppc_md.restart = pnv_restart;
++ pm_power_off = pnv_power_off;
++ ppc_md.halt = pnv_halt;
++ /* ppc_md.system_reset_exception gets filled in by pnv_smp_init() */
++ ppc_md.machine_check_exception = opal_machine_check;
++ ppc_md.mce_check_early_recovery = opal_mce_check_early_recovery;
++ ppc_md.hmi_exception_early = opal_hmi_exception_early;
++ ppc_md.handle_hmi_exception = opal_handle_hmi_exception;
++}
++
++static int __init pnv_probe(void)
++{
++ if (!of_machine_is_compatible("ibm,powernv"))
++ return 0;
++
++ if (firmware_has_feature(FW_FEATURE_OPAL))
++ pnv_setup_machdep_opal();
++
++ pr_debug("PowerNV detected !\n");
++
++ pnv_init();
++
++ return 1;
++}
++
++#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
++void __init pnv_tm_init(void)
++{
++ if (!firmware_has_feature(FW_FEATURE_OPAL) ||
++ !pvr_version_is(PVR_POWER9) ||
++ early_cpu_has_feature(CPU_FTR_TM))
++ return;
++
++ if (opal_reinit_cpus(OPAL_REINIT_CPUS_TM_SUSPEND_DISABLED) != OPAL_SUCCESS)
++ return;
++
++ pr_info("Enabling TM (Transactional Memory) with Suspend Disabled\n");
++ cur_cpu_spec->cpu_features |= CPU_FTR_TM;
++ /* Make sure "normal" HTM is off (it should be) */
++ cur_cpu_spec->cpu_user_features2 &= ~PPC_FEATURE2_HTM;
++ /* Turn on no suspend mode, and HTM no SC */
++ cur_cpu_spec->cpu_user_features2 |= PPC_FEATURE2_HTM_NO_SUSPEND | \
++ PPC_FEATURE2_HTM_NOSC;
++ tm_suspend_disabled = true;
++}
++#endif /* CONFIG_PPC_TRANSACTIONAL_MEM */
++
++/*
++ * Returns the cpu frequency for 'cpu' in Hz. This is used by
++ * /proc/cpuinfo
++ */
++static unsigned long pnv_get_proc_freq(unsigned int cpu)
++{
++ unsigned long ret_freq;
++
++ ret_freq = cpufreq_get(cpu) * 1000ul;
++
++ /*
++ * If the backend cpufreq driver does not exist,
++ * then fallback to old way of reporting the clockrate.
++ */
++ if (!ret_freq)
++ ret_freq = ppc_proc_freq;
++ return ret_freq;
++}
++
++define_machine(powernv) {
++ .name = "PowerNV",
++ .probe = pnv_probe,
++ .setup_arch = pnv_setup_arch,
++ .init_IRQ = pnv_init_IRQ,
++ .show_cpuinfo = pnv_show_cpuinfo,
++ .get_proc_freq = pnv_get_proc_freq,
++ .progress = pnv_progress,
++ .machine_shutdown = pnv_shutdown,
++ .power_save = NULL,
++ .calibrate_decr = generic_calibrate_decr,
++#ifdef CONFIG_KEXEC_CORE
++ .kexec_cpu_down = pnv_kexec_cpu_down,
++#endif
++#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
++ .memory_block_size = pnv_memory_block_size,
++#endif
++};
+diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c
+index 1b288d678235..72aa85ffb371 100644
+--- a/arch/powerpc/platforms/pseries/setup.c
++++ b/arch/powerpc/platforms/pseries/setup.c
+@@ -386,6 +386,8 @@ static void __init pSeries_setup_arch(void)
+
+ fwnmi_init();
+
++ setup_rfi_flush();
++
+ /* Find and initialize PCI host bridges */
+ init_pci_config_tokens();
+ find_and_init_phbs();
+--
+2.13.6
+
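
A hedged userspace sketch of poking the runtime knob that the sysfs.c hunk above registers; the path is derived from the cpu sysdev class attribute and may need adjusting if the attribute is registered elsewhere.

    #include <stdio.h>

    int main(void)
    {
            FILE *f = fopen("/sys/devices/system/cpu/rfi_flush", "w");

            if (!f)
                    return 1;
            fputs("1\n", f);        /* 1 patches the L1-D flush in, 0 nops it out */
            return fclose(f) ? 1 : 0;
    }
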
diff --git a/patches.xen/xen-x86-pmd-handling b/patches.xen/xen-x86-pmd-handling
index 713ee8e703..735c2fd5e2 100644
--- a/patches.xen/xen-x86-pmd-handling
+++ b/patches.xen/xen-x86-pmd-handling
@@ -1,9 +1,9 @@
From: jbeulich@novell.com
Subject: consolidate pmd/pud/pgd entry handling
-Patch-mainline: obsolete
+Patch-mainline: No, obsolete
---- sle11sp4.orig/arch/x86/include/mach-xen/asm/hypervisor.h 2014-01-15 13:02:20.000000000 +0100
-+++ sle11sp4/arch/x86/include/mach-xen/asm/hypervisor.h 2012-10-19 15:50:26.000000000 +0200
+--- a/arch/x86/include/mach-xen/asm/hypervisor.h
++++ b/arch/x86/include/mach-xen/asm/hypervisor.h
@@ -95,10 +95,12 @@ void xen_invlpg(unsigned long ptr);
void xen_l1_entry_update(pte_t *ptr, pte_t val);
void xen_l2_entry_update(pmd_t *ptr, pmd_t val);
@@ -18,8 +18,8 @@ Patch-mainline: obsolete
void xen_set_ldt(const void *ptr, unsigned int ents);
#ifdef CONFIG_SMP
---- sle11sp4.orig/arch/x86/include/mach-xen/asm/pgalloc.h 2011-02-01 15:41:35.000000000 +0100
-+++ sle11sp4/arch/x86/include/mach-xen/asm/pgalloc.h 2011-02-03 14:41:13.000000000 +0100
+--- a/arch/x86/include/mach-xen/asm/pgalloc.h
++++ b/arch/x86/include/mach-xen/asm/pgalloc.h
@@ -75,20 +75,16 @@ static inline void pmd_populate(struct m
struct page *pte)
{
@@ -99,9 +99,9 @@ Patch-mainline: obsolete
}
static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
---- sle11sp4.orig/arch/x86/include/mach-xen/asm/pgtable.h 2013-02-19 15:59:02.000000000 +0100
-+++ sle11sp4/arch/x86/include/mach-xen/asm/pgtable.h 2013-02-19 15:58:01.000000000 +0100
-@@ -629,7 +629,7 @@ static inline pmd_t xen_local_pmdp_get_a
+--- a/arch/x86/include/mach-xen/asm/pgtable.h
++++ b/arch/x86/include/mach-xen/asm/pgtable.h
+@@ -644,7 +644,7 @@ static inline pmd_t xen_local_pmdp_get_a
{
pmd_t res = *pmdp;
@@ -110,8 +110,8 @@ Patch-mainline: obsolete
return res;
}
---- sle11sp4.orig/arch/x86/include/mach-xen/asm/pgtable-3level.h 2012-06-21 13:49:18.000000000 +0200
-+++ sle11sp4/arch/x86/include/mach-xen/asm/pgtable-3level.h 2012-06-21 13:50:41.000000000 +0200
+--- a/arch/x86/include/mach-xen/asm/pgtable-3level.h
++++ b/arch/x86/include/mach-xen/asm/pgtable-3level.h
@@ -110,12 +110,15 @@ static inline void __xen_pte_clear(pte_t
ptep->pte_high = 0;
}
@@ -148,8 +148,8 @@ Patch-mainline: obsolete
#ifdef CONFIG_SMP
static inline pte_t xen_ptep_get_and_clear(pte_t *ptep, pte_t res)
{
---- sle11sp4.orig/arch/x86/include/mach-xen/asm/pgtable_64.h 2011-04-12 15:59:10.000000000 +0200
-+++ sle11sp4/arch/x86/include/mach-xen/asm/pgtable_64.h 2011-03-23 10:11:13.000000000 +0100
+--- a/arch/x86/include/mach-xen/asm/pgtable_64.h
++++ b/arch/x86/include/mach-xen/asm/pgtable_64.h
@@ -65,10 +65,13 @@ static inline void xen_set_pmd(pmd_t *pm
xen_l2_entry_update(pmdp, pmd);
}
@@ -168,7 +168,7 @@ Patch-mainline: obsolete
#ifdef CONFIG_SMP
static inline pte_t xen_ptep_get_and_clear(pte_t *xp, pte_t ret)
-@@ -95,23 +98,28 @@ static inline void xen_set_pud(pud_t *pu
+@@ -95,10 +98,13 @@ static inline void xen_set_pud(pud_t *pu
xen_l3_entry_update(pudp, pud);
}
@@ -186,10 +186,12 @@ Patch-mainline: obsolete
#define __user_pgd(pgd) ((pgd) + PTRS_PER_PGD)
+@@ -126,14 +132,16 @@ static inline pgd_t *xen_get_shadow_pgd(
+
static inline void xen_set_pgd(pgd_t *pgdp, pgd_t pgd)
{
-- xen_l4_entry_update(pgdp, pgd);
-+ xen_l4_entry_update(pgdp, 0, pgd);
+- xen_l4_entry_update(pgdp, kaiser_set_shadow_pgd(pgdp, pgd));
++ xen_l4_entry_update(pgdp, 0, kaiser_set_shadow_pgd(pgdp, pgd));
}
-static inline void xen_pgd_clear(pgd_t *pgd)
@@ -207,8 +209,8 @@ Patch-mainline: obsolete
#define __pte_mfn(_pte) (((_pte).pte & PTE_PFN_MASK) >> PAGE_SHIFT)
---- sle11sp4.orig/arch/x86/mm/hypervisor.c 2012-10-19 15:49:40.000000000 +0200
-+++ sle11sp4/arch/x86/mm/hypervisor.c 2012-10-19 15:50:17.000000000 +0200
+--- a/arch/x86/mm/hypervisor.c
++++ b/arch/x86/mm/hypervisor.c
@@ -356,31 +356,91 @@ void xen_l1_entry_update(pte_t *ptr, pte
}
EXPORT_SYMBOL_GPL(xen_l1_entry_update);
@@ -308,8 +310,8 @@ Patch-mainline: obsolete
}
#endif /* CONFIG_X86_64 */
---- sle11sp4.orig/arch/x86/mm/init_32-xen.c 2011-08-22 11:25:03.000000000 +0200
-+++ sle11sp4/arch/x86/mm/init_32-xen.c 2011-07-04 12:29:02.000000000 +0200
+--- a/arch/x86/mm/init_32-xen.c
++++ b/arch/x86/mm/init_32-xen.c
@@ -690,6 +690,8 @@ static void __init zone_sizes_init(void)
#endif
@@ -328,8 +330,8 @@ Patch-mainline: obsolete
}
#ifdef CONFIG_MEMORY_HOTPLUG
---- sle11sp4.orig/arch/x86/mm/init_64-xen.c 2011-08-22 11:25:03.000000000 +0200
-+++ sle11sp4/arch/x86/mm/init_64-xen.c 2011-08-09 11:17:07.000000000 +0200
+--- a/arch/x86/mm/init_64-xen.c
++++ b/arch/x86/mm/init_64-xen.c
@@ -232,8 +232,11 @@ static pud_t *fill_pud(pgd_t *pgd, unsig
{
if (pgd_none(*pgd)) {
@@ -358,7 +360,7 @@ Patch-mainline: obsolete
if (pmd != pmd_offset(pud, 0))
printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n",
pmd, pmd_offset(pud, 0));
-@@ -596,7 +602,6 @@ phys_pmd_init(pmd_t *pmd_page, unsigned
+@@ -606,7 +612,6 @@ phys_pmd_init(pmd_t *pmd_page, unsigned
} else
*pmd = __pmd(pte_phys | _PAGE_TABLE);
} else {
@@ -366,7 +368,7 @@ Patch-mainline: obsolete
spin_lock(&init_mm.page_table_lock);
pmd_populate_kernel(&init_mm, pmd, __va(pte_phys));
spin_unlock(&init_mm.page_table_lock);
-@@ -683,7 +688,6 @@ phys_pud_init(pud_t *pud_page, unsigned
+@@ -693,7 +698,6 @@ phys_pud_init(pud_t *pud_page, unsigned
} else
*pud = __pud(pmd_phys | _PAGE_TABLE);
} else {
@@ -374,7 +376,7 @@ Patch-mainline: obsolete
spin_lock(&init_mm.page_table_lock);
pud_populate(&init_mm, pud, __va(pmd_phys));
spin_unlock(&init_mm.page_table_lock);
-@@ -850,7 +854,6 @@ kernel_physical_mapping_init(unsigned lo
+@@ -860,7 +864,6 @@ kernel_physical_mapping_init(unsigned lo
XENFEAT_writable_page_tables);
xen_l4_entry_update(pgd, __pgd(pud_phys | _PAGE_TABLE));
} else {
@@ -382,7 +384,7 @@ Patch-mainline: obsolete
spin_lock(&init_mm.page_table_lock);
pgd_populate(&init_mm, pgd, __va(pud_phys));
spin_unlock(&init_mm.page_table_lock);
-@@ -895,7 +898,7 @@ void __init paging_init(void)
+@@ -905,7 +908,7 @@ void __init paging_init(void)
free_area_init_nodes(max_zone_pfns);
@@ -391,8 +393,8 @@ Patch-mainline: obsolete
}
/*
---- sle11sp4.orig/arch/x86/mm/pgtable-xen.c 2011-04-12 15:59:10.000000000 +0200
-+++ sle11sp4/arch/x86/mm/pgtable-xen.c 2011-04-11 16:12:44.000000000 +0200
+--- a/arch/x86/mm/pgtable-xen.c
++++ b/arch/x86/mm/pgtable-xen.c
@@ -66,16 +66,16 @@ early_param("userpte", setup_userpte);
void __pte_free(pgtable_t pte)
{
diff --git a/patches.xen/xen-x86_64-pgd-alloc-order b/patches.xen/xen-x86_64-pgd-alloc-order
index cbf1819cc0..75f177b79f 100644
--- a/patches.xen/xen-x86_64-pgd-alloc-order
+++ b/patches.xen/xen-x86_64-pgd-alloc-order
@@ -26,41 +26,6 @@ At the same time remove the useless user mode pair of init_level4_pgt.
void xen_pgd_pin(pgd_t *);
void xen_pgd_unpin(pgd_t *);
---- a/arch/x86/include/mach-xen/asm/mmu_context.h
-+++ b/arch/x86/include/mach-xen/asm/mmu_context.h
-@@ -126,6 +126,9 @@ static inline void switch_mm(struct mm_s
- {
- unsigned cpu = smp_processor_id();
- struct mmuext_op _op[2 + (sizeof(long) > 4)], *op = _op;
-+#ifdef CONFIG_X86_64
-+ pgd_t *upgd;
-+#endif
-
- if (likely(prev != next)) {
- BUG_ON(!xen_feature(XENFEAT_writable_page_tables) &&
-@@ -167,10 +170,11 @@ static inline void switch_mm(struct mm_s
- op->arg1.mfn = virt_to_mfn(next->pgd);
- op++;
-
-- /* xen_new_user_pt(__pa(__user_pgd(next->pgd))) */
-+ /* xen_new_user_pt(next->pgd) */
- #ifdef CONFIG_X86_64
- op->cmd = MMUEXT_NEW_USER_BASEPTR;
-- op->arg1.mfn = virt_to_mfn(__user_pgd(next->pgd));
-+ upgd = __user_pgd(next->pgd);
-+ op->arg1.mfn = likely(upgd) ? virt_to_mfn(upgd) : 0;
- op++;
- #endif
-
-@@ -214,7 +218,7 @@ static inline void switch_mm(struct mm_s
- * store to mm_cpumask.
- */
- load_cr3(next->pgd);
-- xen_new_user_pt(__pa(__user_pgd(next->pgd)));
-+ xen_new_user_pt(next->pgd);
- load_mm_ldt(next);
- }
- }
--- a/arch/x86/include/mach-xen/asm/pgalloc.h
+++ b/arch/x86/include/mach-xen/asm/pgalloc.h
@@ -123,15 +123,13 @@ static inline void pud_populate(struct m
@@ -82,7 +47,7 @@ At the same time remove the useless user mode pair of init_level4_pgt.
}
--- a/arch/x86/include/mach-xen/asm/pgtable_64.h
+++ b/arch/x86/include/mach-xen/asm/pgtable_64.h
-@@ -106,18 +106,25 @@ static inline void xen_set_pud(pud_t *pu
+@@ -106,7 +106,14 @@ static inline void xen_set_pud(pud_t *pu
: (void)(*__pudp = xen_make_pud(0)); \
})
@@ -96,10 +61,14 @@ At the same time remove the useless user mode pair of init_level4_pgt.
+ + ((unsigned long)pgd & ~PAGE_MASK));
+}
+ #ifdef CONFIG_KAISER
+ extern pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd);
+@@ -132,14 +139,14 @@ static inline pgd_t *xen_get_shadow_pgd(
+
static inline void xen_set_pgd(pgd_t *pgdp, pgd_t pgd)
{
-- xen_l4_entry_update(pgdp, 0, pgd);
-+ xen_l4_entry_update(pgdp, pgd);
+- xen_l4_entry_update(pgdp, 0, kaiser_set_shadow_pgd(pgdp, pgd));
++ xen_l4_entry_update(pgdp, kaiser_set_shadow_pgd(pgdp, pgd));
}
#define xen_pgd_clear(pgd) \
@@ -113,7 +82,7 @@ At the same time remove the useless user mode pair of init_level4_pgt.
--- a/arch/x86/kernel/cpu/common-xen.c
+++ b/arch/x86/kernel/cpu/common-xen.c
-@@ -1099,8 +1099,7 @@ DEFINE_PER_CPU_FIRST(union irq_stack_uni
+@@ -1146,8 +1146,7 @@ DEFINE_PER_CPU_FIRST(union irq_stack_uni
void xen_switch_pt(void)
{
#ifdef CONFIG_XEN
@@ -225,7 +194,7 @@ At the same time remove the useless user mode pair of init_level4_pgt.
BUG();
--- a/arch/x86/mm/init_64-xen.c
+++ b/arch/x86/mm/init_64-xen.c
-@@ -766,9 +766,6 @@ void __init xen_init_pt(void)
+@@ -776,9 +776,6 @@ void __init xen_init_pt(void)
(PTRS_PER_PUD - pud_index(__START_KERNEL_map))
* sizeof(*level3_kernel_pgt));
@@ -235,7 +204,7 @@ At the same time remove the useless user mode pair of init_level4_pgt.
/* Do an early initialization of the fixmap area. */
addr = __fix_to_virt(FIX_EARLYCON_MEM_BASE);
if (pud_present(level3_kernel_pgt[pud_index(addr)])) {
-@@ -784,8 +781,6 @@ void __init xen_init_pt(void)
+@@ -794,8 +791,6 @@ void __init xen_init_pt(void)
early_make_page_readonly(init_level4_pgt,
XENFEAT_writable_page_tables);
@@ -323,6 +292,41 @@ At the same time remove the useless user mode pair of init_level4_pgt.
}
/* blktap and gntdev need this, as otherwise they would implicitly (and
+--- a/arch/x86/mm/tlb-xen.c
++++ b/arch/x86/mm/tlb-xen.c
+@@ -26,6 +26,9 @@ void switch_mm_irqs_off(struct mm_struct
+ {
+ unsigned cpu = smp_processor_id();
+ struct mmuext_op _op[2 + (sizeof(long) > 4)], *op = _op;
++#ifdef CONFIG_X86_64
++ pgd_t *upgd;
++#endif
+
+ if (likely(prev != next)) {
+ BUG_ON(!xen_feature(XENFEAT_writable_page_tables) &&
+@@ -67,10 +70,11 @@ void switch_mm_irqs_off(struct mm_struct
+ op->arg1.mfn = virt_to_mfn(next->pgd);
+ op++;
+
+- /* xen_new_user_pt(__pa(__user_pgd(next->pgd))) */
++ /* xen_new_user_pt(next->pgd) */
+ #ifdef CONFIG_X86_64
+ op->cmd = MMUEXT_NEW_USER_BASEPTR;
+- op->arg1.mfn = virt_to_mfn(__user_pgd(next->pgd));
++ upgd = __user_pgd(next->pgd);
++ op->arg1.mfn = likely(upgd) ? virt_to_mfn(upgd) : 0;
+ op++;
+ #endif
+
+@@ -115,7 +119,7 @@ void switch_mm_irqs_off(struct mm_struct
+ * store to mm_cpumask.
+ */
+ load_cr3(next->pgd);
+- xen_new_user_pt(__pa(__user_pgd(next->pgd)));
++ xen_new_user_pt(next->pgd);
+ load_mm_ldt(next);
+ }
+ }
--- a/drivers/xen/core/machine_reboot.c
+++ b/drivers/xen/core/machine_reboot.c
@@ -175,8 +175,7 @@ static int take_machine_down(void *_susp
diff --git a/patches.xen/xen3-0001-x86-64-Give-vvars-their-own-page.patch b/patches.xen/xen3-0001-x86-64-Give-vvars-their-own-page.patch
new file mode 100644
index 0000000000..0ab339eed3
--- /dev/null
+++ b/patches.xen/xen3-0001-x86-64-Give-vvars-their-own-page.patch
@@ -0,0 +1,85 @@
+From: Andy Lutomirski <luto@MIT.EDU>
+Date: Sun, 5 Jun 2011 13:50:19 -0400
+Subject: [PATCH] xen/x86-64: Give vvars their own page
+Patch-mainline: Never, SUSE-Xen specific
+
+Move vvars out of the vsyscall page into their own page and mark
+it NX.
+
+Without this patch, an attacker who can force a daemon to call
+some fixed address could wait until the time contains, say,
+0xCD80, and then execute the current time.
+
+Signed-off-by: Andy Lutomirski <luto@mit.edu>
+Cc: Jesper Juhl <jj@chaosbits.net>
+Cc: Borislav Petkov <bp@alien8.de>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Arjan van de Ven <arjan@infradead.org>
+Cc: Jan Beulich <JBeulich@novell.com>
+Cc: richard -rw- weinberger <richard.weinberger@gmail.com>
+Cc: Mikael Pettersson <mikpe@it.uu.se>
+Cc: Andi Kleen <andi@firstfloor.org>
+Cc: Brian Gerst <brgerst@gmail.com>
+Cc: Louis Rilling <Louis.Rilling@kerlabs.com>
+Cc: Valdis.Kletnieks@vt.edu
+Cc: pageexec@freemail.hu
+Link: http://lkml.kernel.org/r/b1460f81dc4463d66ea3f2b5ce240f58d48effec.1307292171.git.luto@mit.edu
+Signed-off-by: Ingo Molnar <mingo@elte.hu>
+Signed-off-by: Jiri Kosina <jkosina@suse.cz>
+Automatically created from "patches.kaiser/0001-x86-64-Give-vvars-their-own-page.patch" by xen-port-patches.py
+
+--- a/arch/x86/include/mach-xen/asm/fixmap.h
++++ b/arch/x86/include/mach-xen/asm/fixmap.h
+@@ -78,6 +78,7 @@ enum fixed_addresses {
+ VSYSCALL_LAST_PAGE,
+ VSYSCALL_FIRST_PAGE = VSYSCALL_LAST_PAGE
+ + ((VSYSCALL_END-VSYSCALL_START) >> PAGE_SHIFT) - 1,
++ VVAR_PAGE,
+ VSYSCALL_HPET,
+ #endif
+ FIX_DBGP_BASE,
+--- a/arch/x86/include/mach-xen/asm/pgtable_types.h
++++ b/arch/x86/include/mach-xen/asm/pgtable_types.h
+@@ -122,6 +122,7 @@ extern unsigned int __kernel_page_user;
+ #define __PAGE_KERNEL_UC_MINUS (__PAGE_KERNEL | _PAGE_PCD)
+ #define __PAGE_KERNEL_VSYSCALL (__PAGE_KERNEL_RX | _PAGE_USER)
+ #define __PAGE_KERNEL_VSYSCALL_NOCACHE (__PAGE_KERNEL_VSYSCALL | _PAGE_PCD | _PAGE_PWT)
++#define __PAGE_KERNEL_VVAR (__PAGE_KERNEL_RO | _PAGE_USER)
+ #define __PAGE_KERNEL_LARGE (__PAGE_KERNEL | _PAGE_PSE)
+ #define __PAGE_KERNEL_LARGE_NOCACHE (__PAGE_KERNEL | _PAGE_CACHE_UC | _PAGE_PSE)
+ #define __PAGE_KERNEL_LARGE_EXEC (__PAGE_KERNEL_EXEC | _PAGE_PSE)
+@@ -144,6 +145,7 @@ extern unsigned int __kernel_page_user;
+ #define PAGE_KERNEL_LARGE_EXEC __pgprot(__PAGE_KERNEL_LARGE_EXEC)
+ #define PAGE_KERNEL_VSYSCALL __pgprot(__PAGE_KERNEL_VSYSCALL)
+ #define PAGE_KERNEL_VSYSCALL_NOCACHE __pgprot(__PAGE_KERNEL_VSYSCALL_NOCACHE)
++#define PAGE_KERNEL_VVAR __pgprot(__PAGE_KERNEL_VVAR)
+
+ #define PAGE_KERNEL_IO __pgprot(__PAGE_KERNEL_IO)
+ #define PAGE_KERNEL_IO_NOCACHE __pgprot(__PAGE_KERNEL_IO_NOCACHE)
+--- a/arch/x86/kernel/vsyscall_64-xen.c
++++ b/arch/x86/kernel/vsyscall_64-xen.c
+@@ -284,9 +284,14 @@ void __init map_vsyscall(void)
+ {
+ extern char __vsyscall_0;
+ unsigned long physaddr_page0 = __pa_symbol(&__vsyscall_0);
++ extern char __vvar_page;
++ unsigned long physaddr_vvar_page = __pa_symbol(&__vvar_page);
+
+ /* Note that VSYSCALL_MAPPED_PAGES must agree with the code below. */
+ __set_fixmap(VSYSCALL_FIRST_PAGE, physaddr_page0, PAGE_KERNEL_VSYSCALL);
++ __set_fixmap(VVAR_PAGE, physaddr_vvar_page, PAGE_KERNEL_VVAR);
++ BUILD_BUG_ON((unsigned long)__fix_to_virt(VVAR_PAGE) !=
++ (unsigned long)VVAR_ADDRESS);
+ }
+
+ static int __init vsyscall_init(void)
+--- a/arch/x86/mm/pgtable-xen.c
++++ b/arch/x86/mm/pgtable-xen.c
+@@ -868,6 +868,7 @@ void xen_set_fixmap(enum fixed_addresses
+ extern pte_t level1_fixmap_pgt[PTRS_PER_PTE];
+
+ case VSYSCALL_LAST_PAGE ... VSYSCALL_FIRST_PAGE:
++ case VVAR_PAGE:
+ pte = pfn_pte(phys >> PAGE_SHIFT, flags);
+ set_pte_vaddr_pud(level3_user_pgt, address, pte);
+ break;
diff --git a/patches.xen/xen3-0001-x86-64-Map-the-HPET-NX.patch b/patches.xen/xen3-0001-x86-64-Map-the-HPET-NX.patch
new file mode 100644
index 0000000000..c2de0b9d15
--- /dev/null
+++ b/patches.xen/xen3-0001-x86-64-Map-the-HPET-NX.patch
@@ -0,0 +1,50 @@
+From: Andy Lutomirski <luto@mit.edu>
+Date: Sun, 5 Jun 2011 13:50:21 -0400
+Subject: [PATCH] xen/x86-64: Map the HPET NX
+Patch-mainline: Never, SUSE-Xen specific
+
+Currently the HPET mapping is a user-accessible syscall
+instruction at a fixed address some of the time.
+
+A sufficiently determined hacker might be able to guess when.
+
+Signed-off-by: Andy Lutomirski <luto@mit.edu>
+Cc: Jesper Juhl <jj@chaosbits.net>
+Cc: Borislav Petkov <bp@alien8.de>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Arjan van de Ven <arjan@infradead.org>
+Cc: Jan Beulich <JBeulich@novell.com>
+Cc: richard -rw- weinberger <richard.weinberger@gmail.com>
+Cc: Mikael Pettersson <mikpe@it.uu.se>
+Cc: Andi Kleen <andi@firstfloor.org>
+Cc: Brian Gerst <brgerst@gmail.com>
+Cc: Louis Rilling <Louis.Rilling@kerlabs.com>
+Cc: Valdis.Kletnieks@vt.edu
+Cc: pageexec@freemail.hu
+Link: http://lkml.kernel.org/r/ab41b525a4ca346b1ca1145d16fb8d181861a8aa.1307292171.git.luto@mit.edu
+Signed-off-by: Ingo Molnar <mingo@elte.hu>
+Signed-off-by: Jiri Kosina <jkosina@suse.cz>
+Automatically created from "patches.kaiser/0001-x86-64-Map-the-HPET-NX.patch" by xen-port-patches.py
+
+--- a/arch/x86/include/mach-xen/asm/pgtable_types.h
++++ b/arch/x86/include/mach-xen/asm/pgtable_types.h
+@@ -121,8 +121,8 @@ extern unsigned int __kernel_page_user;
+ #define __PAGE_KERNEL_NOCACHE (__PAGE_KERNEL | _PAGE_PCD | _PAGE_PWT)
+ #define __PAGE_KERNEL_UC_MINUS (__PAGE_KERNEL | _PAGE_PCD)
+ #define __PAGE_KERNEL_VSYSCALL (__PAGE_KERNEL_RX | _PAGE_USER)
+-#define __PAGE_KERNEL_VSYSCALL_NOCACHE (__PAGE_KERNEL_VSYSCALL | _PAGE_PCD | _PAGE_PWT)
+ #define __PAGE_KERNEL_VVAR (__PAGE_KERNEL_RO | _PAGE_USER)
++#define __PAGE_KERNEL_VVAR_NOCACHE (__PAGE_KERNEL_VVAR | _PAGE_PCD | _PAGE_PWT)
+ #define __PAGE_KERNEL_LARGE (__PAGE_KERNEL | _PAGE_PSE)
+ #define __PAGE_KERNEL_LARGE_NOCACHE (__PAGE_KERNEL | _PAGE_CACHE_UC | _PAGE_PSE)
+ #define __PAGE_KERNEL_LARGE_EXEC (__PAGE_KERNEL_EXEC | _PAGE_PSE)
+@@ -144,8 +144,8 @@ extern unsigned int __kernel_page_user;
+ #define PAGE_KERNEL_LARGE_NOCACHE __pgprot(__PAGE_KERNEL_LARGE_NOCACHE)
+ #define PAGE_KERNEL_LARGE_EXEC __pgprot(__PAGE_KERNEL_LARGE_EXEC)
+ #define PAGE_KERNEL_VSYSCALL __pgprot(__PAGE_KERNEL_VSYSCALL)
+-#define PAGE_KERNEL_VSYSCALL_NOCACHE __pgprot(__PAGE_KERNEL_VSYSCALL_NOCACHE)
+ #define PAGE_KERNEL_VVAR __pgprot(__PAGE_KERNEL_VVAR)
++#define PAGE_KERNEL_VVAR_NOCACHE __pgprot(__PAGE_KERNEL_VVAR_NOCACHE)
+
+ #define PAGE_KERNEL_IO __pgprot(__PAGE_KERNEL_IO)
+ #define PAGE_KERNEL_IO_NOCACHE __pgprot(__PAGE_KERNEL_IO_NOCACHE)
diff --git a/patches.xen/xen3-kaiser-0009-x86-mm-sched-core-Uninline-switch_mm.patch b/patches.xen/xen3-kaiser-0009-x86-mm-sched-core-Uninline-switch_mm.patch
new file mode 100644
index 0000000000..c49739fd42
--- /dev/null
+++ b/patches.xen/xen3-kaiser-0009-x86-mm-sched-core-Uninline-switch_mm.patch
@@ -0,0 +1,259 @@
+From 74e28ebd9c306dd4e3101233a25f2ce2294ec49b Mon Sep 17 00:00:00 2001
+From: Andy Lutomirski <luto@kernel.org>
+Date: Tue, 26 Apr 2016 09:39:08 -0700
+Subject: [PATCH 09/19] xen: x86/mm, sched/core: Uninline switch_mm()
+Patch-mainline: Never, SUSE-specific
+
+It's fairly large and it has quite a few callers. This may also
+help untangle some headers down the road.
+
+Signed-off-by: Andy Lutomirski <luto@kernel.org>
+Reviewed-by: Borislav Petkov <bp@suse.de>
+Cc: Borislav Petkov <bp@alien8.de>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Thomas Gleixner <tglx@linutronix.de>
+Link: http://lkml.kernel.org/r/54f3367803e7f80b2be62c8a21879aa74b1a5f57.1461688545.git.luto@kernel.org
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Jiri Kosina <jkosina@suse.cz>
+(cherry picked from commit 69c0319aabba45bcf33178916a2f06967b4adede)
+
+Automatically created from "patches.kaiser/kaiser-0009-x86-mm-sched-core-Uninline-switch_mm.patch" by xen-port-patches.py
+
+--- a/arch/x86/include/mach-xen/asm/mmu_context.h
++++ b/arch/x86/include/mach-xen/asm/mmu_context.h
+@@ -121,105 +121,8 @@ static inline void __prepare_arch_switch
+ #endif
+ }
+
+-static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
+- struct task_struct *tsk)
+-{
+- unsigned cpu = smp_processor_id();
+- struct mmuext_op _op[2 + (sizeof(long) > 4)], *op = _op;
+-
+- if (likely(prev != next)) {
+- BUG_ON(!xen_feature(XENFEAT_writable_page_tables) &&
+- !PagePinned(virt_to_page(next->pgd)));
+-
+-#if defined(CONFIG_SMP) && !defined(CONFIG_XEN) /* XEN: no lazy tlb */
+- percpu_write(cpu_tlbstate.state, TLBSTATE_OK);
+- percpu_write(cpu_tlbstate.active_mm, next);
+-#endif
+- cpumask_set_cpu(cpu, mm_cpumask(next));
+-
+- /*
+- * Re-load page tables: load_cr3(next->pgd).
+- *
+- * This logic has an ordering constraint:
+- *
+- * CPU 0: Write to a PTE for 'next'
+- * CPU 0: load bit 1 in mm_cpumask. if nonzero, send IPI.
+- * CPU 1: set bit 1 in next's mm_cpumask
+- * CPU 1: load from the PTE that CPU 0 writes (implicit)
+- *
+- * We need to prevent an outcome in which CPU 1 observes
+- * the new PTE value and CPU 0 observes bit 1 clear in
+- * mm_cpumask. (If that occurs, then the IPI will never
+- * be sent, and CPU 0's TLB will contain a stale entry.)
+- *
+- * The bad outcome can occur if either CPU's load is
+- * reordered before that CPU's store, so both CPUs much
+- * execute full barriers to prevent this from happening.
+- *
+- * Thus, switch_mm needs a full barrier between the
+- * store to mm_cpumask and any operation that could load
+- * from next->pgd. This barrier synchronizes with
+- * remote TLB flushers. Fortunately, load_cr3 is
+- * serializing and thus acts as a full barrier.
+- *
+- */
+- op->cmd = MMUEXT_NEW_BASEPTR;
+- op->arg1.mfn = virt_to_mfn(next->pgd);
+- op++;
+-
+- /* xen_new_user_pt(__pa(__user_pgd(next->pgd))) */
+-#ifdef CONFIG_X86_64
+- op->cmd = MMUEXT_NEW_USER_BASEPTR;
+- op->arg1.mfn = virt_to_mfn(__user_pgd(next->pgd));
+- op++;
+-#endif
+-
+- /*
+- * load the LDT, if the LDT is different:
+- */
+- if (unlikely(prev->context.ldt != next->context.ldt)) {
+- /* load_mm_ldt(next) */
+- const struct ldt_struct *ldt;
+-
+- /* lockless_dereference synchronizes with smp_store_release */
+- ldt = lockless_dereference(next->context.ldt);
+- op->cmd = MMUEXT_SET_LDT;
+- if (unlikely(ldt)) {
+- op->arg1.linear_addr = (long)ldt->entries;
+- op->arg2.nr_ents = ldt->size;
+- } else {
+- op->arg1.linear_addr = 0;
+- op->arg2.nr_ents = 0;
+- }
+- op++;
+- }
+-
+- BUG_ON(HYPERVISOR_mmuext_op(_op, op-_op, NULL, DOMID_SELF));
+-
+- /* stop TLB flushes for the previous mm */
+- cpumask_clear_cpu(cpu, mm_cpumask(prev));
+- }
+-#if defined(CONFIG_SMP) && !defined(CONFIG_XEN) /* XEN: no lazy tlb */
+- else {
+- percpu_write(cpu_tlbstate.state, TLBSTATE_OK);
+- BUG_ON(percpu_read(cpu_tlbstate.active_mm) != next);
+-
+- if (!cpumask_test_and_set_cpu(cpu, mm_cpumask(next))) {
+- /* We were in lazy tlb mode and leave_mm disabled
+- * tlb flush IPI delivery. We must reload CR3
+- * to make sure to use no freed page tables.
+- *
+- * As above, this is a barrier that forces
+- * TLB repopulation to be ordered after the
+- * store to mm_cpumask.
+- */
+- load_cr3(next->pgd);
+- xen_new_user_pt(__pa(__user_pgd(next->pgd)));
+- load_mm_ldt(next);
+- }
+- }
+-#endif
+-}
++extern void switch_mm(struct mm_struct *prev, struct mm_struct *next,
++ struct task_struct *tsk);
+
+ #define activate_mm(prev, next) \
+ do { \
+--- a/arch/x86/mm/Makefile
++++ b/arch/x86/mm/Makefile
+@@ -27,7 +27,7 @@ obj-$(CONFIG_ACPI_NUMA) += srat.o
+ obj-$(CONFIG_NUMA_EMU) += numa_emulation.o
+
+ obj-$(CONFIG_XEN) += hypervisor.o
+-disabled-obj-$(CONFIG_XEN) := gup.o tlb.o
++disabled-obj-$(CONFIG_XEN) := gup.o
+
+ obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o
+
+--- /dev/null
++++ b/arch/x86/mm/tlb-xen.c
+@@ -0,0 +1,113 @@
++#include <linux/init.h>
++
++#include <linux/mm.h>
++#include <linux/spinlock.h>
++#include <linux/smp.h>
++#include <linux/interrupt.h>
++#include <linux/module.h>
++#include <linux/cpu.h>
++
++#include <asm/tlbflush.h>
++#include <asm/mmu_context.h>
++#include <asm/cache.h>
++
++void switch_mm(struct mm_struct *prev, struct mm_struct *next,
++ struct task_struct *tsk)
++{
++ unsigned cpu = smp_processor_id();
++ struct mmuext_op _op[2 + (sizeof(long) > 4)], *op = _op;
++
++ if (likely(prev != next)) {
++ BUG_ON(!xen_feature(XENFEAT_writable_page_tables) &&
++ !PagePinned(virt_to_page(next->pgd)));
++
++#if defined(CONFIG_SMP) && !defined(CONFIG_XEN) /* XEN: no lazy tlb */
++ percpu_write(cpu_tlbstate.state, TLBSTATE_OK);
++ percpu_write(cpu_tlbstate.active_mm, next);
++#endif
++ cpumask_set_cpu(cpu, mm_cpumask(next));
++
++ /*
++ * Re-load page tables: load_cr3(next->pgd).
++ *
++ * This logic has an ordering constraint:
++ *
++ * CPU 0: Write to a PTE for 'next'
++ * CPU 0: load bit 1 in mm_cpumask. if nonzero, send IPI.
++ * CPU 1: set bit 1 in next's mm_cpumask
++ * CPU 1: load from the PTE that CPU 0 writes (implicit)
++ *
++ * We need to prevent an outcome in which CPU 1 observes
++ * the new PTE value and CPU 0 observes bit 1 clear in
++ * mm_cpumask. (If that occurs, then the IPI will never
++ * be sent, and CPU 0's TLB will contain a stale entry.)
++ *
++ * The bad outcome can occur if either CPU's load is
++ * reordered before that CPU's store, so both CPUs much
++ * execute full barriers to prevent this from happening.
++ *
++ * Thus, switch_mm needs a full barrier between the
++ * store to mm_cpumask and any operation that could load
++ * from next->pgd. This barrier synchronizes with
++ * remote TLB flushers. Fortunately, load_cr3 is
++ * serializing and thus acts as a full barrier.
++ *
++ */
++ op->cmd = MMUEXT_NEW_BASEPTR;
++ op->arg1.mfn = virt_to_mfn(next->pgd);
++ op++;
++
++ /* xen_new_user_pt(__pa(__user_pgd(next->pgd))) */
++#ifdef CONFIG_X86_64
++ op->cmd = MMUEXT_NEW_USER_BASEPTR;
++ op->arg1.mfn = virt_to_mfn(__user_pgd(next->pgd));
++ op++;
++#endif
++
++ /*
++ * load the LDT, if the LDT is different:
++ */
++ if (unlikely(prev->context.ldt != next->context.ldt)) {
++ /* load_mm_ldt(next) */
++ const struct ldt_struct *ldt;
++
++ /* lockless_dereference synchronizes with smp_store_release */
++ ldt = lockless_dereference(next->context.ldt);
++ op->cmd = MMUEXT_SET_LDT;
++ if (unlikely(ldt)) {
++ op->arg1.linear_addr = (long)ldt->entries;
++ op->arg2.nr_ents = ldt->size;
++ } else {
++ op->arg1.linear_addr = 0;
++ op->arg2.nr_ents = 0;
++ }
++ op++;
++ }
++
++ BUG_ON(HYPERVISOR_mmuext_op(_op, op-_op, NULL, DOMID_SELF));
++
++ /* stop TLB flushes for the previous mm */
++ cpumask_clear_cpu(cpu, mm_cpumask(prev));
++ }
++#if defined(CONFIG_SMP) && !defined(CONFIG_XEN) /* XEN: no lazy tlb */
++ else {
++ percpu_write(cpu_tlbstate.state, TLBSTATE_OK);
++ BUG_ON(percpu_read(cpu_tlbstate.active_mm) != next);
++
++ if (!cpumask_test_and_set_cpu(cpu, mm_cpumask(next))) {
++ /*
++ * We were in lazy tlb mode and leave_mm disabled
++ * tlb flush IPI delivery. We must reload CR3
++ * to make sure to use no freed page tables.
++ *
++ * As above, this is a barrier that forces
++ * TLB repopulation to be ordered after the
++ * store to mm_cpumask.
++ */
++ load_cr3(next->pgd);
++ xen_new_user_pt(__pa(__user_pgd(next->pgd)));
++ load_mm_ldt(next);
++ }
++ }
++#endif
++}
diff --git a/patches.xen/xen3-kaiser-0010-x86-mm-sched-core-Turn-off-IRQs-in-switch_mm.patch b/patches.xen/xen3-kaiser-0010-x86-mm-sched-core-Turn-off-IRQs-in-switch_mm.patch
new file mode 100644
index 0000000000..e8cb1d627e
--- /dev/null
+++ b/patches.xen/xen3-kaiser-0010-x86-mm-sched-core-Turn-off-IRQs-in-switch_mm.patch
@@ -0,0 +1,62 @@
+From 656ec5851bc05fc918f99ad557e1a113b7267792 Mon Sep 17 00:00:00 2001
+From: Andy Lutomirski <luto@kernel.org>
+Date: Tue, 26 Apr 2016 09:39:09 -0700
+Subject: [PATCH 10/19] xen: x86/mm, sched/core: Turn off IRQs in switch_mm()
+Patch-mainline: Never, SUSE-specific
+
+Potential races between switch_mm() and TLB-flush or LDT-flush IPIs
+could be very messy. AFAICT the code is currently okay, whether by
+accident or by careful design, but enabling PCID will make it
+considerably more complicated and will no longer be obviously safe.
+
+Fix it with a big hammer: run switch_mm() with IRQs off.
+
+To avoid a performance hit in the scheduler, we take advantage of
+our knowledge that the scheduler already has IRQs disabled when it
+calls switch_mm().
+
+Signed-off-by: Andy Lutomirski <luto@kernel.org>
+Reviewed-by: Borislav Petkov <bp@suse.de>
+Cc: Borislav Petkov <bp@alien8.de>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Thomas Gleixner <tglx@linutronix.de>
+Link: http://lkml.kernel.org/r/f19baf759693c9dcae64bbff76189db77cb13398.1461688545.git.luto@kernel.org
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Jiri Kosina <jkosina@suse.cz>
+(cherry picked from commit 078194f8e9fe3cf54c8fd8bded48a1db5bd8eb8a)
+
+Automatically created from "patches.kaiser/kaiser-0010-x86-mm-sched-core-Turn-off-IRQs-in-switch_mm.patch" by xen-port-patches.py
+
+--- a/arch/x86/include/mach-xen/asm/mmu_context.h
++++ b/arch/x86/include/mach-xen/asm/mmu_context.h
+@@ -124,6 +124,10 @@ static inline void __prepare_arch_switch
+ extern void switch_mm(struct mm_struct *prev, struct mm_struct *next,
+ struct task_struct *tsk);
+
++extern void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
++ struct task_struct *tsk);
++#define switch_mm_irqs_off switch_mm_irqs_off
++
+ #define activate_mm(prev, next) \
+ do { \
+ xen_activate_mm(prev, next); \
+--- a/arch/x86/mm/tlb-xen.c
++++ b/arch/x86/mm/tlb-xen.c
+@@ -14,6 +14,16 @@
+ void switch_mm(struct mm_struct *prev, struct mm_struct *next,
+ struct task_struct *tsk)
+ {
++ unsigned long flags;
++
++ local_irq_save(flags);
++ switch_mm_irqs_off(prev, next, tsk);
++ local_irq_restore(flags);
++}
++
++void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
++ struct task_struct *tsk)
++{
+ unsigned cpu = smp_processor_id();
+ struct mmuext_op _op[2 + (sizeof(long) > 4)], *op = _op;
+
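
For readers skimming this hunk: the change above is the standard interrupts-off wrapper idiom, where switch_mm() becomes a thin shell that saves the IRQ flags, calls switch_mm_irqs_off(), and restores them. The stand-alone C sketch below only models that shape; local_irq_save()/local_irq_restore() and the function bodies are hypothetical stubs, not the kernel implementations.

#include <stdio.h>

/* Hypothetical stand-ins for the kernel's IRQ-flag helpers. */
static unsigned long local_irq_save_stub(void)
{
	puts("irqs off");
	return 0;
}

static void local_irq_restore_stub(unsigned long flags)
{
	(void)flags;
	puts("irqs on");
}

/* Stub body: the real one runs with interrupts disabled, so it cannot
 * race against TLB-flush or LDT-flush IPIs. */
static void switch_mm_irqs_off_stub(void)
{
	puts("switching mm");
}

/* Shape of the wrapper added to arch/x86/mm/tlb-xen.c above. */
static void switch_mm_stub(void)
{
	unsigned long flags = local_irq_save_stub();

	switch_mm_irqs_off_stub();
	local_irq_restore_stub(flags);
}

int main(void)
{
	switch_mm_stub();
	return 0;
}
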
diff --git a/patches.xen/xen3-kaiser-0012-x86-mm-Remove-the-UP-asm-tlbflush.h-code-always-use-.patch b/patches.xen/xen3-kaiser-0012-x86-mm-Remove-the-UP-asm-tlbflush.h-code-always-use-.patch
new file mode 100644
index 0000000000..1e9ca8055b
--- /dev/null
+++ b/patches.xen/xen3-kaiser-0012-x86-mm-Remove-the-UP-asm-tlbflush.h-code-always-use-.patch
@@ -0,0 +1,123 @@
+From b9731f42739fa8378edd64342a4529a76cb4aec2 Mon Sep 17 00:00:00 2001
+From: Andy Lutomirski <luto@kernel.org>
+Date: Sun, 28 May 2017 10:00:14 -0700
+Subject: [PATCH 12/19] xen/x86/mm: Remove the UP asm/tlbflush.h code, always use
+ the (formerly) SMP code
+Patch-mainline: Never, SUSE-specific
+
+The UP asm/tlbflush.h generates somewhat nicer code than the SMP version.
+Aside from that, it's fallen quite a bit behind the SMP code:
+
+ - flush_tlb_mm_range() didn't flush individual pages if the range
+ was small.
+
+ - The lazy TLB code was much weaker. This usually wouldn't matter,
+ but, if a kernel thread flushed its lazy "active_mm" more than
+ once (due to reclaim or similar), it wouldn't be unlazied and
+ would instead pointlessly flush repeatedly.
+
+ - Tracepoints were missing.
+
+Aside from that, simply having the UP code around was a maintenance
+burden, since it meant that any change to the TLB flush code had to
+make sure not to break it.
+
+Simplify everything by deleting the UP code.
+
+Signed-off-by: Andy Lutomirski <luto@kernel.org>
+Cc: Andrew Morton <akpm@linux-foundation.org>
+Cc: Arjan van de Ven <arjan@linux.intel.com>
+Cc: Borislav Petkov <bpetkov@suse.de>
+Cc: Dave Hansen <dave.hansen@intel.com>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Mel Gorman <mgorman@suse.de>
+Cc: Michal Hocko <mhocko@suse.com>
+Cc: Nadav Amit <nadav.amit@gmail.com>
+Cc: Nadav Amit <namit@vmware.com>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Rik van Riel <riel@redhat.com>
+Cc: Thomas Gleixner <tglx@linutronix.de>
+Cc: linux-mm@kvack.org
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Jiri Kosina <jkosina@suse.cz>
+(cherry picked from commit ce4a4e565f5264909a18c733b864c3f74467f69e)
+
+Automatically created from "patches.kaiser/kaiser-0012-x86-mm-Remove-the-UP-asm-tlbflush.h-code-always-use-.patch" by xen-port-patches.py
+
+--- a/arch/x86/include/mach-xen/asm/mmu_context.h
++++ b/arch/x86/include/mach-xen/asm/mmu_context.h
+@@ -73,7 +73,7 @@ void destroy_context(struct mm_struct *m
+
+ static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
+ {
+-#if defined(CONFIG_SMP) && !defined(CONFIG_XEN) /* XEN: no lazy tlb */
++#ifndef CONFIG_XEN /* XEN: no lazy tlb */
+ if (percpu_read(cpu_tlbstate.state) == TLBSTATE_OK)
+ percpu_write(cpu_tlbstate.state, TLBSTATE_LAZY);
+ #endif
+--- a/arch/x86/include/mach-xen/asm/tlbflush.h
++++ b/arch/x86/include/mach-xen/asm/tlbflush.h
+@@ -6,6 +6,7 @@
+
+ #include <asm/processor.h>
+ #include <asm/system.h>
++#include <asm/smp.h>
+
+ #define __flush_tlb() xen_tlb_flush()
+ #define __flush_tlb_global() xen_tlb_flush()
+@@ -39,46 +40,8 @@ static inline void __flush_tlb_one(unsig
+ *
+ * ..but the i386 has somewhat limited tlb flushing capabilities,
+ * and page-granular flushes are available only on i486 and up.
+- *
+- * x86-64 can only flush individual pages or full VMs. For a range flush
+- * we always do the full VM. Might be worth trying if for a small
+- * range a few INVLPGs in a row are a win.
+ */
+
+-#ifndef CONFIG_SMP
+-
+-#define flush_tlb() __flush_tlb()
+-#define flush_tlb_all() __flush_tlb_all()
+-#define local_flush_tlb() __flush_tlb()
+-
+-static inline void flush_tlb_mm(struct mm_struct *mm)
+-{
+- if (mm == current->active_mm)
+- __flush_tlb();
+-}
+-
+-static inline void flush_tlb_page(struct vm_area_struct *vma,
+- unsigned long addr)
+-{
+- if (vma->vm_mm == current->active_mm)
+- __flush_tlb_one(addr);
+-}
+-
+-static inline void flush_tlb_range(struct vm_area_struct *vma,
+- unsigned long start, unsigned long end)
+-{
+- if (vma->vm_mm == current->active_mm)
+- __flush_tlb();
+-}
+-
+-static inline void reset_lazy_tlbstate(void)
+-{
+-}
+-
+-#else /* SMP */
+-
+-#include <asm/smp.h>
+-
+ #define local_flush_tlb() __flush_tlb()
+
+ #define flush_tlb_all xen_tlb_flush_all
+@@ -111,8 +74,6 @@ static inline void reset_lazy_tlbstate(v
+ }
+ #endif
+
+-#endif /* SMP */
+-
+ static inline void flush_tlb_kernel_range(unsigned long start,
+ unsigned long end)
+ {
diff --git a/patches.xen/xen3-kaiser-0015-x86-mm-Enable-CR4.PCIDE-on-supported-systems.patch b/patches.xen/xen3-kaiser-0015-x86-mm-Enable-CR4.PCIDE-on-supported-systems.patch
new file mode 100644
index 0000000000..a93b996698
--- /dev/null
+++ b/patches.xen/xen3-kaiser-0015-x86-mm-Enable-CR4.PCIDE-on-supported-systems.patch
@@ -0,0 +1,80 @@
+From fa472b69062008c4d0ef4aa2ac5e660252c6234e Mon Sep 17 00:00:00 2001
+From: Andy Lutomirski <luto@kernel.org>
+Date: Thu, 29 Jun 2017 08:53:21 -0700
+Subject: [PATCH 15/19] xen/x86/mm: Enable CR4.PCIDE on supported systems
+Patch-mainline: Never, SUSE-specific
+
+We can use PCID if the CPU has PCID and PGE and we're not on Xen.
+
+By itself, this has no effect. A followup patch will start using PCID.
+
+Signed-off-by: Andy Lutomirski <luto@kernel.org>
+Reviewed-by: Nadav Amit <nadav.amit@gmail.com>
+Reviewed-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
+Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
+Cc: Andrew Morton <akpm@linux-foundation.org>
+Cc: Arjan van de Ven <arjan@linux.intel.com>
+Cc: Borislav Petkov <bp@alien8.de>
+Cc: Dave Hansen <dave.hansen@intel.com>
+Cc: Juergen Gross <jgross@suse.com>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Mel Gorman <mgorman@suse.de>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Rik van Riel <riel@redhat.com>
+Cc: linux-mm@kvack.org
+Link: http://lkml.kernel.org/r/6327ecd907b32f79d5aa0d466f04503bbec5df88.1498751203.git.luto@kernel.org
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Jiri Kosina <jkosina@suse.cz>
+(cherry picked from commit 660da7c9228f685b2ebe664f9fd69aaddcc420b5)
+
+Automatically created from "patches.kaiser/kaiser-0015-x86-mm-Enable-CR4.PCIDE-on-supported-systems.patch" by xen-port-patches.py
+
+--- a/arch/x86/kernel/cpu/common-xen.c
++++ b/arch/x86/kernel/cpu/common-xen.c
+@@ -296,6 +296,36 @@ static __cpuinit void setup_smep(struct
+ }
+ }
+
++static void setup_pcid(struct cpuinfo_x86 *c)
++{
++#ifndef CONFIG_XEN
++ if (cpu_has(c, X86_FEATURE_PCID)) {
++#ifdef CONFIG_X86_64
++ if (cpu_has(c, X86_FEATURE_PGE)) {
++ /*
++ * Regardless of whether PCID is enumerated, the
++ * SDM says that it can't be enabled in 32-bit mode.
++ */
++ set_in_cr4(X86_CR4_PCIDE);
++ }
++#else
++ /*
++ * flush_tlb_all(), as currently implemented, won't
++ * work if PCID is on but PGE is not. Since that
++ * combination doesn't exist on real hardware, there's
++ * no reason to try to fully support it, but it's
++ * polite to avoid corrupting data if we're on
++ * an improperly configured VM.
++ */
++ clear_cpu_cap(c, X86_FEATURE_PCID);
++#endif
++ }
++#else
++ setup_clear_cpu_cap(X86_FEATURE_PCID);
++ setup_clear_cpu_cap(X86_FEATURE_INVPCID);
++#endif
++}
++
+ /*
+ * Some CPU features depend on higher CPUID levels, which may not always
+ * be available due to CPUID level capping or broken virtualization
+@@ -888,6 +918,9 @@ static void __cpuinit identify_cpu(struc
+ /* Disable the PN if appropriate */
+ squash_the_stupid_serial_number(c);
+
++ /* Set up PCID */
++ setup_pcid(c);
++
+ /*
+ * The vendor-specific functions might have changed features.
+ * Now we do "generic changes."
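
The setup_pcid() hunk above boils down to a small decision tree: on Xen, mask out PCID and INVPCID entirely; on bare metal, enable CR4.PCIDE only on 64-bit CPUs that also have PGE, and drop the PCID capability on 32-bit. A minimal stand-alone sketch of that logic follows; the boolean parameters and the helper functions are stand-ins for cpu_has(), set_in_cr4() and the clear-cap calls, not kernel APIs.

#include <stdbool.h>
#include <stdio.h>

/* Stand-ins for set_in_cr4(X86_CR4_PCIDE) and the clear-cap helpers. */
static void enable_pcide(void)   { puts("set CR4.PCIDE"); }
static void clear_pcid_cap(void) { puts("clear X86_FEATURE_PCID"); }
static void clear_pcid_and_invpcid(void)
{
	puts("clear X86_FEATURE_PCID and X86_FEATURE_INVPCID");
}

/* Mirrors the branch structure of setup_pcid() in the hunk above. */
static void setup_pcid_sketch(bool xen_guest, bool is_64bit,
			      bool has_pcid, bool has_pge)
{
	if (xen_guest) {
		clear_pcid_and_invpcid();
		return;
	}
	if (!has_pcid)
		return;
	if (is_64bit) {
		if (has_pge)
			enable_pcide();	/* PCIDE is only legal in long mode */
	} else {
		/* flush_tlb_all() cannot cope with PCID-but-no-PGE. */
		clear_pcid_cap();
	}
}

int main(void)
{
	setup_pcid_sketch(false, true, true, true);	/* bare metal x86-64 */
	setup_pcid_sketch(true, true, true, true);	/* Xen guest */
	return 0;
}
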
diff --git a/patches.xen/xen3-kaiser-0018-KAISER-Kernel-Address-Isolation.patch b/patches.xen/xen3-kaiser-0018-KAISER-Kernel-Address-Isolation.patch
new file mode 100644
index 0000000000..22965b4ef1
--- /dev/null
+++ b/patches.xen/xen3-kaiser-0018-KAISER-Kernel-Address-Isolation.patch
@@ -0,0 +1,509 @@
+From 2622b8382cf1aa97ec9cf6d194f1389585f029b2 Mon Sep 17 00:00:00 2001
+From: Hugh Dickins <hughd@google.com>
+Date: Mon, 11 Dec 2017 17:59:50 -0800
+Subject: [PATCH 18/19] xen/KAISER: Kernel Address Isolation
+Patch-mainline: Never, SUSE-specific
+
+This patch introduces our implementation of KAISER (Kernel Address
+Isolation to have Side-channels Efficiently Removed), a kernel isolation
+technique to close hardware side channels on kernel address information.
+
+More information about the original patch can be found at:
+https://github.com/IAIK/KAISER
+http://marc.info/?l=linux-kernel&m=149390087310405&w=2
+
+Daniel Gruss <daniel.gruss@iaik.tugraz.at>
+Richard Fellner <richard.fellner@student.tugraz.at>
+Michael Schwarz <michael.schwarz@iaik.tugraz.at>
+<clementine.maurice@iaik.tugraz.at>
+<moritz.lipp@iaik.tugraz.at>
+
+That original was then developed further by
+Dave Hansen <dave.hansen@intel.com>
+Hugh Dickins <hughd@google.com>
+then others after this snapshot.
+
+This combined patch for 3.2.96 was derived from hughd's patches below
+for 3.18.72, in 2017-12-04's kaiser-3.18.72.tar; except for the last,
+which was sent in 2017-12-09's nokaiser-3.18.72.tar. They have been
+combined in order to minimize the effort of rebasing: most of the
+patches in the 3.18.72 series were small fixes and cleanups and
+enhancements to three large patches. About the only new work in this
+backport is a simple reimplementation of kaiser_remove_mapping():
+since mm/pageattr.c changed a lot between 3.2 and 3.18, and the
+mods there for Kaiser never seemed necessary.
+
+Backported to 3.0 (the 11-SP4 variant of it) by Jiri Kosina.
+
+KAISER: Kernel Address Isolation
+kaiser: merged update
+kaiser: do not set _PAGE_NX on pgd_none
+kaiser: stack map PAGE_SIZE at THREAD_SIZE-PAGE_SIZE
+kaiser: fix build and FIXME in alloc_ldt_struct()
+kaiser: KAISER depends on SMP
+kaiser: fix regs to do_nmi() ifndef CONFIG_KAISER
+kaiser: fix perf crashes
+kaiser: ENOMEM if kaiser_pagetable_walk() NULL
+kaiser: tidied up asm/kaiser.h somewhat
+kaiser: tidied up kaiser_add/remove_mapping slightly
+kaiser: kaiser_remove_mapping() move along the pgd
+kaiser: align addition to x86/mm/Makefile
+kaiser: cleanups while trying for gold link
+kaiser: name that 0x1000 KAISER_SHADOW_PGD_OFFSET
+kaiser: delete KAISER_REAL_SWITCH option
+kaiser: vmstat show NR_KAISERTABLE as nr_overhead
+kaiser: enhanced by kernel and user PCIDs
+kaiser: load_new_mm_cr3() let SWITCH_USER_CR3 flush user
+kaiser: PCID 0 for kernel and 128 for user
+kaiser: x86_cr3_pcid_noflush and x86_cr3_pcid_user
+kaiser: paranoid_entry pass cr3 need to paranoid_exit
+kaiser: _pgd_alloc() without __GFP_REPEAT to avoid stalls
+kaiser: fix unlikely error in alloc_ldt_struct()
+kaiser: drop is_atomic arg to kaiser_pagetable_walk()
+kaiser: extend maping to sched+kprobes+entry sections [jkosina@suse.cz]
+kaiser: port entry code to reentrant NMI support [jkosina@suse.cz]
+kaiser: remove !paravirt dependency [jkosina@suse.cz]
+
+Signed-off-by: Jiri Kosina <jkosina@suse.cz>
+
+Automatically created from "patches.kaiser/kaiser-0018-KAISER-Kernel-Address-Isolation.patch" by xen-port-patches.py
+
+--- a/arch/x86/include/mach-xen/asm/desc.h
++++ b/arch/x86/include/mach-xen/asm/desc.h
+@@ -42,7 +42,7 @@ struct gdt_page {
+ struct desc_struct gdt[GDT_ENTRIES];
+ } __attribute__((aligned(PAGE_SIZE)));
+
+-DECLARE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page);
++DECLARE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(struct gdt_page, gdt_page);
+
+ static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu)
+ {
+--- a/arch/x86/include/mach-xen/asm/pgtable_64.h
++++ b/arch/x86/include/mach-xen/asm/pgtable_64.h
+@@ -102,9 +102,36 @@ static inline void xen_pud_clear(pud_t *
+
+ #define __user_pgd(pgd) ((pgd) + PTRS_PER_PGD)
+
++#ifdef CONFIG_KAISER
++extern pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd);
++
++static inline pgd_t *xen_get_shadow_pgd(pgd_t *pgdp)
++{
++ return (pgd_t *)((unsigned long)pgdp | (unsigned long)PAGE_SIZE);
++}
++
++static inline pgd_t *xen_get_normal_pgd(pgd_t *pgdp)
++{
++ return (pgd_t *)((unsigned long)pgdp & ~(unsigned long)PAGE_SIZE);
++}
++#else
++static inline pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd)
++{
++ return pgd;
++}
++static inline pgd_t *xen_get_shadow_pgd(pgd_t *pgdp)
++{
++ return NULL;
++}
++static inline pgd_t *xen_get_normal_pgd(pgd_t *pgdp)
++{
++ return pgdp;
++}
++#endif /* CONFIG_KAISER */
++
+ static inline void xen_set_pgd(pgd_t *pgdp, pgd_t pgd)
+ {
+- xen_l4_entry_update(pgdp, pgd);
++ xen_l4_entry_update(pgdp, kaiser_set_shadow_pgd(pgdp, pgd));
+ }
+
+ static inline void xen_pgd_clear(pgd_t *pgd)
+--- a/arch/x86/include/mach-xen/asm/pgtable.h
++++ b/arch/x86/include/mach-xen/asm/pgtable.h
+@@ -580,7 +580,18 @@ static inline pud_t *pud_offset(pgd_t *p
+
+ static inline int pgd_bad(pgd_t pgd)
+ {
+- return (pgd_flags(pgd) & ~_PAGE_USER) != _KERNPG_TABLE;
++ pgdval_t ignore_flags = _PAGE_USER;
++ /*
++ * We set NX on KAISER pgds that map userspace memory so
++ * that userspace can not meaningfully use the kernel
++ * page table by accident; it will fault on the first
++ * instruction it tries to run. See xen_set_pgd().
++ */
++#ifdef CONFIG_KAISER
++ ignore_flags |= _PAGE_NX;
++#endif
++
++ return (pgd_flags(pgd) & ~ignore_flags) != _KERNPG_TABLE;
+ }
+
+ static inline int pgd_none(pgd_t pgd)
+@@ -811,6 +822,12 @@ static inline void pmdp_set_wrprotect(st
+ static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count)
+ {
+ memcpy(dst, src, count * sizeof(pgd_t));
++#ifdef CONFIG_KAISER
++ /* Clone the shadow pgd part as well */
++ memcpy(xen_get_shadow_pgd(dst),
++ xen_get_shadow_pgd(src),
++ count * sizeof(pgd_t));
++#endif
+ }
+
+ #define PTE_SHIFT ilog2(PTRS_PER_PTE)
+--- a/arch/x86/include/mach-xen/asm/pgtable_types.h
++++ b/arch/x86/include/mach-xen/asm/pgtable_types.h
+@@ -39,7 +39,11 @@
+ #define _PAGE_ACCESSED (_AT(pteval_t, 1) << _PAGE_BIT_ACCESSED)
+ #define _PAGE_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_DIRTY)
+ #define _PAGE_PSE (_AT(pteval_t, 1) << _PAGE_BIT_PSE)
++#ifdef CONFIG_KAISER
++#define _PAGE_GLOBAL (_AT(pteval_t, 0))
++#else
+ #define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL)
++#endif
+ #define _PAGE_UNUSED1 (_AT(pteval_t, 1) << _PAGE_BIT_UNUSED1)
+ #define _PAGE_IOMAP (_AT(pteval_t, 1) << _PAGE_BIT_IOMAP)
+ #define _PAGE_PAT (_AT(pteval_t, 1) << _PAGE_BIT_PAT)
+@@ -62,7 +66,7 @@
+ #endif
+
+ #define _PAGE_FILE (_AT(pteval_t, 1) << _PAGE_BIT_FILE)
+-#define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE)
++#define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE)
+
+ #ifndef __ASSEMBLY__
+ #if defined(CONFIG_X86_64) && CONFIG_XEN_COMPAT <= 0x030002
+@@ -82,6 +86,33 @@ extern unsigned int __kernel_page_user;
+ _PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY)
+ #define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE)
+
++/* The ASID is the lower 12 bits of CR3 */
++#define X86_CR3_PCID_ASID_MASK (_AC((1<<12)-1,UL))
++
++/* Mask for all the PCID-related bits in CR3: */
++#define X86_CR3_PCID_MASK (X86_CR3_PCID_NOFLUSH | X86_CR3_PCID_ASID_MASK)
++#define X86_CR3_PCID_ASID_KERN (_AC(0x0,UL))
++
++#if defined(CONFIG_KAISER) && defined(CONFIG_X86_64)
++/* Let X86_CR3_PCID_ASID_USER be usable for the X86_CR3_PCID_NOFLUSH bit */
++#define X86_CR3_PCID_ASID_USER (_AC(0x80,UL))
++
++#define X86_CR3_PCID_KERN_FLUSH (X86_CR3_PCID_ASID_KERN)
++#define X86_CR3_PCID_USER_FLUSH (X86_CR3_PCID_ASID_USER)
++#define X86_CR3_PCID_KERN_NOFLUSH (X86_CR3_PCID_NOFLUSH | X86_CR3_PCID_ASID_KERN)
++#define X86_CR3_PCID_USER_NOFLUSH (X86_CR3_PCID_NOFLUSH | X86_CR3_PCID_ASID_USER)
++#else
++#define X86_CR3_PCID_ASID_USER (_AC(0x0,UL))
++/*
++ * PCIDs are unsupported on 32-bit and none of these bits can be
++ * set in CR3:
++ */
++#define X86_CR3_PCID_KERN_FLUSH (0)
++#define X86_CR3_PCID_USER_FLUSH (0)
++#define X86_CR3_PCID_KERN_NOFLUSH (0)
++#define X86_CR3_PCID_USER_NOFLUSH (0)
++#endif
++
+ /*
+ * PAT settings are part of the hypervisor interface, which sets the
+ * MSR to 0x050100070406 (i.e. WB, WT, UC-, UC, WC, WP [, UC, UC]).
+--- a/arch/x86/include/mach-xen/asm/processor.h
++++ b/arch/x86/include/mach-xen/asm/processor.h
+@@ -283,7 +283,7 @@ struct tss_struct {
+
+ } ____cacheline_aligned;
+
+-DECLARE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss);
++DECLARE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(struct tss_struct, init_tss);
+
+ /*
+ * Save the original ist values for checking stack pointers during debugging
+--- a/arch/x86/include/mach-xen/asm/tlbflush.h
++++ b/arch/x86/include/mach-xen/asm/tlbflush.h
+@@ -8,6 +8,22 @@
+ #include <asm/system.h>
+ #include <asm/smp.h>
+
++/*
++ * Declare a couple of kaiser interfaces here for convenience,
++ * to avoid the need for asm/kaiser.h in unexpected places.
++ */
++#ifdef CONFIG_KAISER
++extern void kaiser_setup_pcid(void);
++extern void kaiser_flush_tlb_on_return_to_user(void);
++#else
++static inline void kaiser_setup_pcid(void)
++{
++}
++static inline void kaiser_flush_tlb_on_return_to_user(void)
++{
++}
++#endif
++
+ #define __flush_tlb() xen_tlb_flush()
+ #define __flush_tlb_global() xen_tlb_flush()
+ #define __flush_tlb_single(addr) xen_invlpg(addr)
+--- a/arch/x86/kernel/cpu/common-xen.c
++++ b/arch/x86/kernel/cpu/common-xen.c
+@@ -92,7 +92,7 @@ static const struct cpu_dev __cpuinitcon
+
+ static const struct cpu_dev *this_cpu __cpuinitdata = &default_cpu;
+
+-DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
++DEFINE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(struct gdt_page, gdt_page) = { .gdt = {
+ #ifdef CONFIG_X86_64
+ /*
+ * We need valid kernel segments for data and code in long mode too
+@@ -307,6 +307,19 @@ static void setup_pcid(struct cpuinfo_x8
+ * SDM says that it can't be enabled in 32-bit mode.
+ */
+ set_in_cr4(X86_CR4_PCIDE);
++ /*
++ * INVPCID has two "groups" of types:
++ * 1/2: Invalidate an individual address
++ * 3/4: Invalidate all contexts
++ *
++ * 1/2 take a PCID, but 3/4 do not. So, 3/4
++ * ignore the PCID argument in the descriptor.
++ * But, we have to be careful not to call 1/2
++ * with an actual non-zero PCID in them before
++ * we do the above set_in_cr4().
++ */
++ if (cpu_has(c, X86_FEATURE_INVPCID))
++ set_cpu_cap(c, X86_FEATURE_INVPCID_SINGLE);
+ }
+ #else
+ /*
+@@ -630,6 +643,7 @@ void __cpuinit cpu_detect(struct cpuinfo
+ c->x86_cache_alignment = c->x86_clflush_size;
+ }
+ }
++ kaiser_setup_pcid();
+ }
+
+ void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c)
+@@ -1158,7 +1172,7 @@ static const unsigned int exception_stac
+ [DEBUG_STACK - 1] = DEBUG_STKSZ
+ };
+
+-static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks
++DEFINE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(char, exception_stacks
+ [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]);
+ #endif
+
+--- a/arch/x86/kernel/entry_64-xen.S
++++ b/arch/x86/kernel/entry_64-xen.S
+@@ -57,6 +57,7 @@
+ #include <asm/ftrace.h>
+ #include <asm/percpu.h>
+ #include <asm/pgtable_types.h>
++#include <asm/kaiser.h>
+ #include <xen/interface/xen.h>
+ #include <xen/interface/features.h>
+
+@@ -354,6 +355,7 @@ ENTRY(save_args)
+ testl $3, CS(%rdi)
+ je 1f
+ SWAPGS
++ SWITCH_KERNEL_CR3
+ /*
+ * irq_count is used to check if a CPU is already on an interrupt stack
+ * or not. While this is essentially redundant with preempt_count it is
+@@ -394,6 +396,12 @@ END(save_rest)
+ #ifndef CONFIG_XEN
+ /* save complete stack frame */
+ .pushsection .kprobes.text, "ax"
++/*
++ * Return: ebx=0: needs swapgs but not SWITCH_USER_CR3 in paranoid_exit
++ * ebx=1: needs neither swapgs nor SWITCH_USER_CR3 in paranoid_exit
++ * ebx=2: needs both swapgs and SWITCH_USER_CR3 in paranoid_exit
++ * ebx=3: needs SWITCH_USER_CR3 but not swapgs in paranoid_exit
++ */
+ ENTRY(save_paranoid)
+ XCPT_FRAME offset=ORIG_RAX-R15+8
+ cld
+@@ -419,7 +427,25 @@ ENTRY(save_paranoid)
+ js 1f /* negative -> in kernel */
+ SWAPGS
+ xorl %ebx,%ebx
+-1: ret
++1:
++#ifdef CONFIG_KAISER
++ /*
++ * We might have come in between a swapgs and a SWITCH_KERNEL_CR3
++ * on entry, or between a SWITCH_USER_CR3 and a swapgs on exit.
++ * Do a conditional SWITCH_KERNEL_CR3: this could safely be done
++ * unconditionally, but we need to find out whether the reverse
++ * should be done on return (conveyed to paranoid_exit in %ebx).
++ */
++ movq %cr3, %rax
++ testl $KAISER_SHADOW_PGD_OFFSET, %eax
++ jz 2f
++ orl $2, %ebx
++ andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), %rax
++ orq x86_cr3_pcid_noflush, %rax
++ movq %rax, %cr3
++2:
++#endif
++ ret
+ CFI_ENDPROC
+ END(save_paranoid)
+ .popsection
+@@ -1138,30 +1164,40 @@ paranoidzeroentry machine_check *machine
+ * is fundamentally NMI-unsafe. (we cannot change the soft and
+ * hard flags at once, atomically)
+ */
+-
+- /* ebx: no swapgs flag */
++/*
++ * On entry: ebx=0: needs swapgs but not SWITCH_USER_CR3
++ * ebx=1: needs neither swapgs nor SWITCH_USER_CR3
++ * ebx=2: needs both swapgs and SWITCH_USER_CR3
++ * ebx=3: needs SWITCH_USER_CR3 but not swapgs
++ */
+ ENTRY(paranoid_exit)
+ DEFAULT_FRAME
+ DISABLE_INTERRUPTS(CLBR_NONE)
+ TRACE_IRQS_OFF
+- testl %ebx,%ebx /* swapgs needed? */
+- jnz paranoid_restore
+- testl $3,CS(%rsp)
+- jnz paranoid_userspace
+-paranoid_swapgs:
++ movq %rbx, %r12 /* paranoid_userspace uses %ebx */
++ testl $3, CS(%rsp)
++ jnz paranoid_userspace
++paranoid_kernel:
++ movq %r12, %rbx /* restore after paranoid_userspace */
+ TRACE_IRQS_IRETQ 0
++#ifdef CONFIG_KAISER
++ testl $2, %ebx /* SWITCH_USER_CR3 needed? */
++ jz paranoid_exit_no_switch
++ SWITCH_USER_CR3
++paranoid_exit_no_switch:
++#endif
++ testl $1, %ebx /* swapgs needed? */
++ jnz paranoid_exit_no_swapgs
+ SWAPGS_UNSAFE_STACK
++paranoid_exit_no_swapgs:
+ RESTORE_ALL 8
+- jmp irq_return
+-paranoid_restore:
+- TRACE_IRQS_IRETQ 0
+- RESTORE_ALL 8
+- jmp irq_return
++ jmp irq_return
++
+ paranoid_userspace:
+ GET_THREAD_INFO(%rcx)
+ movl TI_flags(%rcx),%ebx
+ andl $_TIF_WORK_MASK,%ebx
+- jz paranoid_swapgs
++ jz paranoid_kernel
+ movq %rsp,%rdi /* &pt_regs */
+ call sync_regs
+ movq %rax,%rsp /* switch stack for scheduling */
+@@ -1211,6 +1247,13 @@ ENTRY(error_entry)
+ movq %r14, R14+8(%rsp)
+ movq %r15, R15+8(%rsp)
+ #ifndef CONFIG_XEN
++ /*
++ * error_entry() always returns with a kernel gsbase and
++ * CR3. We must also have a kernel CR3/gsbase before
++ * calling TRACE_IRQS_*. Just unconditionally switch to
++ * the kernel CR3 here.
++ */
++ SWITCH_KERNEL_CR3
+ xorl %ebx,%ebx
+ testl $3,CS+8(%rsp)
+ je error_kernelspace
+--- a/arch/x86/kernel/ldt-xen.c
++++ b/arch/x86/kernel/ldt-xen.c
+@@ -15,6 +15,7 @@
+ #include <linux/slab.h>
+ #include <linux/vmalloc.h>
+ #include <linux/uaccess.h>
++#include <linux/kaiser.h>
+
+ #include <asm/system.h>
+ #include <asm/ldt.h>
+@@ -34,11 +35,21 @@ static void flush_ldt(void *current_mm)
+ set_ldt(ldt->entries, ldt->size);
+ }
+
++static void __free_ldt_struct(struct ldt_struct *ldt)
++{
++ if (ldt->size * LDT_ENTRY_SIZE > PAGE_SIZE)
++ vfree(ldt->entries);
++ else
++ free_page((unsigned long)ldt->entries);
++ kfree(ldt);
++}
++
+ /* The caller must call finalize_ldt_struct on the result. LDT starts zeroed. */
+ static struct ldt_struct *alloc_ldt_struct(int size)
+ {
+ struct ldt_struct *new_ldt;
+ int alloc_size;
++ int ret;
+
+ if (size > LDT_ENTRIES)
+ return NULL;
+@@ -65,7 +76,14 @@ static struct ldt_struct *alloc_ldt_stru
+ kfree(new_ldt);
+ return NULL;
+ }
++
++ ret = kaiser_add_mapping((unsigned long)new_ldt->entries, alloc_size,
++ __PAGE_KERNEL);
+ new_ldt->size = size;
++ if (ret) {
++ __free_ldt_struct(new_ldt);
++ return NULL;
++ }
+ return new_ldt;
+ }
+
+@@ -96,13 +114,11 @@ static void free_ldt_struct(struct ldt_s
+ if (likely(!ldt))
+ return;
+
++ kaiser_remove_mapping((unsigned long)ldt->entries,
++ ldt->size * LDT_ENTRY_SIZE);
+ make_pages_writable(ldt->entries, PFN_UP(ldt->size * LDT_ENTRY_SIZE),
+ XENFEAT_writable_descriptor_tables);
+- if (ldt->size * LDT_ENTRY_SIZE > PAGE_SIZE)
+- vfree(ldt->entries);
+- else
+- free_page((unsigned long)ldt->entries);
+- kfree(ldt);
++ __free_ldt_struct(ldt);
+ }
+
+ /*
+--- a/arch/x86/mm/pgtable-xen.c
++++ b/arch/x86/mm/pgtable-xen.c
+@@ -9,7 +9,7 @@
+ #include <asm/hypervisor.h>
+ #include <asm/mmu_context.h>
+
+-#define PGALLOC_GFP GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO
++#define PGALLOC_GFP (GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO)
+
+ #ifdef CONFIG_HIGHPTE
+ #define PGALLOC_USER_GFP __GFP_HIGHMEM
+--- a/security/Kconfig
++++ b/security/Kconfig
+@@ -99,7 +99,7 @@ config SECURITY
+ config KAISER
+ bool "Remove the kernel mapping in user mode"
+ default y
+- depends on X86_64 && SMP
++ depends on X86_64 && SMP && !XEN
+ help
+ This enforces a strict kernel and user space isolation, in order
+ to close hardware side channels on kernel address information.
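
One detail worth spelling out: xen_get_shadow_pgd() and xen_get_normal_pgd() above work only because the pgd is allocated as a two-page block, kernel pgd in the first page and shadow pgd in the second, so flipping the PAGE_SIZE bit of the pointer switches between them. The user-space sketch below demonstrates just that address arithmetic; the 4096-byte PAGE_SIZE and the aligned_alloc() call are stand-ins for the kernel's order-1 pgd allocation.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define PAGE_SIZE 4096UL	/* assumed x86 page size */

int main(void)
{
	/* Stand-in for the order-1 pgd allocation: two adjacent pages,
	 * aligned to 2 * PAGE_SIZE so the PAGE_SIZE bit of the base
	 * address is guaranteed to start out clear. */
	unsigned char *pgd = aligned_alloc(2 * PAGE_SIZE, 2 * PAGE_SIZE);

	if (!pgd)
		return 1;

	/* xen_get_shadow_pgd(): set the PAGE_SIZE bit -> second page. */
	void *shadow = (void *)((uintptr_t)pgd | PAGE_SIZE);
	/* xen_get_normal_pgd(): clear it again -> first page. */
	void *normal = (void *)((uintptr_t)shadow & ~(uintptr_t)PAGE_SIZE);

	printf("pgd=%p shadow=%p normal=%p\n", (void *)pgd, shadow, normal);
	free(pgd);
	return 0;
}
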
diff --git a/patches.xen/xen3-kaiser-0019-x86-mm-kaiser-re-enable-vsyscalls.patch b/patches.xen/xen3-kaiser-0019-x86-mm-kaiser-re-enable-vsyscalls.patch
new file mode 100644
index 0000000000..5cac49d326
--- /dev/null
+++ b/patches.xen/xen3-kaiser-0019-x86-mm-kaiser-re-enable-vsyscalls.patch
@@ -0,0 +1,37 @@
+From ecd5a23ba4031f4cd1c9225d30ae5d210d65fc1b Mon Sep 17 00:00:00 2001
+From: Andrea Arcangeli <aarcange@redhat.com>
+Date: Tue, 5 Dec 2017 21:15:07 +0100
+Subject: [PATCH 19/19] xen/x86/mm/kaiser: re-enable vsyscalls
+Patch-mainline: Never, SUSE-specific
+
+To avoid breaking the kernel ABI.
+
+Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
+
+hughd adjusted it to the 3.2.96 tree (leaving out the PVCLOCK_FIXMAP
+user mapping, which does not apply to this tree); and for safety
+added vsyscall_pgprot, and a BUG_ON if _PAGE_USER outside of FIXMAP.
+Automatically created from "patches.kaiser/kaiser-0019-x86-mm-kaiser-re-enable-vsyscalls.patch" by xen-port-patches.py
+
+Signed-off-by: Jiri Kosina <jkosina@suse.cz>
+
+--- a/arch/x86/kernel/vsyscall_64-xen.c
++++ b/arch/x86/kernel/vsyscall_64-xen.c
+@@ -49,6 +49,8 @@
+ __attribute__ ((unused, __section__(".vsyscall_" #nr))) notrace
+ #define __syscall_clobber "r11","cx","memory"
+
++unsigned long vsyscall_pgprot = __PAGE_KERNEL_VSYSCALL;
++
+ DEFINE_VVAR(int, vgetcpu_mode);
+ DEFINE_VVAR(struct vsyscall_gtod_data, vsyscall_gtod_data) =
+ {
+@@ -288,7 +290,7 @@ void __init map_vsyscall(void)
+ unsigned long physaddr_vvar_page = __pa_symbol(&__vvar_page);
+
+ /* Note that VSYSCALL_MAPPED_PAGES must agree with the code below. */
+- __set_fixmap(VSYSCALL_FIRST_PAGE, physaddr_page0, PAGE_KERNEL_VSYSCALL);
++ __set_fixmap(VSYSCALL_FIRST_PAGE, physaddr_page0, __pgprot(vsyscall_pgprot));
+ __set_fixmap(VVAR_PAGE, physaddr_vvar_page, PAGE_KERNEL_VVAR);
+ BUILD_BUG_ON((unsigned long)__fix_to_virt(VVAR_PAGE) !=
+ (unsigned long)VVAR_ADDRESS);
diff --git a/patches.xen/xen3-kaiser-build-fix.patch b/patches.xen/xen3-kaiser-build-fix.patch
new file mode 100644
index 0000000000..29bf0ce507
--- /dev/null
+++ b/patches.xen/xen3-kaiser-build-fix.patch
@@ -0,0 +1,32 @@
+From: bp@suse.de
+Subject: temporary fix
+Patch-mainline: never, hell no
+References: bsc#1068032
+
+---
+ arch/x86/include/mach-xen/asm/system.h | 1 -
+ arch/x86/kernel/cpu/spec_ctrl.c | 2 ++
+ 2 files changed, 2 insertions(+), 1 deletion(-)
+
+--- a/arch/x86/include/mach-xen/asm/system.h
++++ b/arch/x86/include/mach-xen/asm/system.h
+@@ -554,7 +554,6 @@ void stop_this_cpu(void *dummy);
+ */
+ static __always_inline void rdtsc_barrier(void)
+ {
+- alternative(ASM_NOP3, "mfence", X86_FEATURE_MFENCE_RDTSC);
+ alternative(ASM_NOP3, "lfence", X86_FEATURE_LFENCE_RDTSC);
+ }
+
+--- a/arch/x86/kernel/cpu/spec_ctrl.c
++++ b/arch/x86/kernel/cpu/spec_ctrl.c
+@@ -47,7 +47,9 @@ EXPORT_SYMBOL_GPL(x86_enable_ibrs);
+ */
+ void stuff_RSB(void)
+ {
++#ifndef CONFIG_XEN
+ stuff_rsb();
++#endif
+ }
+ EXPORT_SYMBOL_GPL(stuff_RSB);
+
diff --git a/patches.xen/xen3-kaiser-nokaiser-0005-kaiser-add-nokaiser-boot-option-using-ALTERNATIVE.patch b/patches.xen/xen3-kaiser-nokaiser-0005-kaiser-add-nokaiser-boot-option-using-ALTERNATIVE.patch
new file mode 100644
index 0000000000..36178dd7c9
--- /dev/null
+++ b/patches.xen/xen3-kaiser-nokaiser-0005-kaiser-add-nokaiser-boot-option-using-ALTERNATIVE.patch
@@ -0,0 +1,244 @@
+From 70eba0679e1004bb544b5d780f1c6233c60da2f2 Mon Sep 17 00:00:00 2001
+From: Hugh Dickins <hughd@google.com>
+Date: Sun, 24 Sep 2017 16:59:49 -0700
+Subject: [PATCH 05/14] xen/kaiser: add "nokaiser" boot option, using ALTERNATIVE
+Patch-mainline: Never, SUSE-specific
+
+Added "nokaiser" boot option: an early param like "noinvpcid".
+Most places now check int kaiser_enabled (#defined 0 when not
+CONFIG_KAISER) instead of #ifdef CONFIG_KAISER; but entry_64.S
+and entry_64_compat.S are using the ALTERNATIVE technique, which
+patches in the preferred instructions at runtime. That technique
+is tied to x86 cpu features, so X86_FEATURE_KAISER fabricated
+("" in its comment so "kaiser" not magicked into /proc/cpuinfo).
+
+Prior to "nokaiser", Kaiser #defined _PAGE_GLOBAL 0: revert that,
+but be careful with both _PAGE_GLOBAL and CR4.PGE: setting them when
+nokaiser like when !CONFIG_KAISER, but not setting either when kaiser -
+neither matters on its own, but it's hard to be sure that _PAGE_GLOBAL
+won't get set in some obscure corner, or something adds PGE into CR4.
+By omitting _PAGE_GLOBAL from __supported_pte_mask when kaiser_enabled,
+all page table setup which uses pte_pfn() masks it out of the ptes.
+
+It's slightly shameful that the same declaration versus definition of
+kaiser_enabled appears in not one, not two, but in three header files
+(asm/kaiser.h, asm/pgtable.h, asm/tlbflush.h). I felt safer that way,
+than with #including any of those in any of the others; and did not
+feel it worth an asm/kaiser_enabled.h - kernel/cpu/common.c includes
+them all, so we shall hear about it if they get out of synch.
+
+Cleanups while in the area: removed the silly #ifdef CONFIG_KAISER
+from kaiser.c; removed the unused native_get_normal_pgd(); removed
+the spurious reg clutter from SWITCH_*_CR3 macro stubs; corrected some
+comments. But more interestingly, set CR4.PSE in secondary_startup_64:
+the manual is clear that it does not matter whether it's 0 or 1 when
+4-level-pts are enabled, but I was distracted to find cr4 different on
+BSP and auxiliaries - BSP alone was adding PSE, in init_memory_mapping().
+
+(cherry picked from Change-Id: I8e5bec716944444359cbd19f6729311eff943e9a)
+
+Signed-off-by: Jiri Kosina <jkosina@suse.cz>
+
+Automatically created from "patches.kaiser/kaiser-nokaiser-0005-kaiser-add-nokaiser-boot-option-using-ALTERNATIVE.patch" by xen-port-patches.py
+
+--- a/arch/x86/include/mach-xen/asm/pgtable_64.h
++++ b/arch/x86/include/mach-xen/asm/pgtable_64.h
+@@ -107,13 +107,12 @@ extern pgd_t kaiser_set_shadow_pgd(pgd_t
+
+ static inline pgd_t *xen_get_shadow_pgd(pgd_t *pgdp)
+ {
++#ifdef CONFIG_DEBUG_VM
++ /* linux/mmdebug.h may not have been included at this point */
++ BUG_ON(!kaiser_enabled);
++#endif
+ return (pgd_t *)((unsigned long)pgdp | (unsigned long)PAGE_SIZE);
+ }
+-
+-static inline pgd_t *xen_get_normal_pgd(pgd_t *pgdp)
+-{
+- return (pgd_t *)((unsigned long)pgdp & ~(unsigned long)PAGE_SIZE);
+-}
+ #else
+ static inline pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd)
+ {
+@@ -123,10 +122,6 @@ static inline pgd_t *xen_get_shadow_pgd(
+ {
+ return NULL;
+ }
+-static inline pgd_t *xen_get_normal_pgd(pgd_t *pgdp)
+-{
+- return pgdp;
+-}
+ #endif /* CONFIG_KAISER */
+
+ static inline void xen_set_pgd(pgd_t *pgdp, pgd_t pgd)
+--- a/arch/x86/include/mach-xen/asm/pgtable.h
++++ b/arch/x86/include/mach-xen/asm/pgtable.h
+@@ -17,6 +17,11 @@
+ #ifndef __ASSEMBLY__
+
+ #include <asm/x86_init.h>
++#ifdef CONFIG_KAISER
++extern int kaiser_enabled;
++#else
++#define kaiser_enabled 0
++#endif
+
+ /*
+ * ZERO_PAGE is a global shared page that is always zero: used
+@@ -587,9 +592,8 @@ static inline int pgd_bad(pgd_t pgd)
+ * page table by accident; it will fault on the first
+ * instruction it tries to run. See xen_set_pgd().
+ */
+-#ifdef CONFIG_KAISER
+- ignore_flags |= _PAGE_NX;
+-#endif
++ if (kaiser_enabled)
++ ignore_flags |= _PAGE_NX;
+
+ return (pgd_flags(pgd) & ~ignore_flags) != _KERNPG_TABLE;
+ }
+@@ -821,12 +825,14 @@ static inline void pmdp_set_wrprotect(st
+ */
+ static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count)
+ {
+- memcpy(dst, src, count * sizeof(pgd_t));
++ memcpy(dst, src, count * sizeof(pgd_t));
+ #ifdef CONFIG_KAISER
+- /* Clone the shadow pgd part as well */
+- memcpy(xen_get_shadow_pgd(dst),
+- xen_get_shadow_pgd(src),
+- count * sizeof(pgd_t));
++ if (kaiser_enabled) {
++ /* Clone the shadow pgd part as well */
++ memcpy(xen_get_shadow_pgd(dst),
++ xen_get_shadow_pgd(src),
++ count * sizeof(pgd_t));
++ }
+ #endif
+ }
+
+--- a/arch/x86/include/mach-xen/asm/pgtable_types.h
++++ b/arch/x86/include/mach-xen/asm/pgtable_types.h
+@@ -39,11 +39,7 @@
+ #define _PAGE_ACCESSED (_AT(pteval_t, 1) << _PAGE_BIT_ACCESSED)
+ #define _PAGE_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_DIRTY)
+ #define _PAGE_PSE (_AT(pteval_t, 1) << _PAGE_BIT_PSE)
+-#ifdef CONFIG_KAISER
+-#define _PAGE_GLOBAL (_AT(pteval_t, 0))
+-#else
+ #define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL)
+-#endif
+ #define _PAGE_UNUSED1 (_AT(pteval_t, 1) << _PAGE_BIT_UNUSED1)
+ #define _PAGE_IOMAP (_AT(pteval_t, 1) << _PAGE_BIT_IOMAP)
+ #define _PAGE_PAT (_AT(pteval_t, 1) << _PAGE_BIT_PAT)
+--- a/arch/x86/include/mach-xen/asm/tlbflush.h
++++ b/arch/x86/include/mach-xen/asm/tlbflush.h
+@@ -13,9 +13,11 @@
+ * to avoid the need for asm/kaiser.h in unexpected places.
+ */
+ #ifdef CONFIG_KAISER
++extern int kaiser_enabled;
+ extern void kaiser_setup_pcid(void);
+ extern void kaiser_flush_tlb_on_return_to_user(void);
+ #else
++#define kaiser_enabled 0
+ static inline void kaiser_setup_pcid(void)
+ {
+ }
+--- a/arch/x86/kernel/cpu/common-xen.c
++++ b/arch/x86/kernel/cpu/common-xen.c
+@@ -301,7 +301,7 @@ static void setup_pcid(struct cpuinfo_x8
+ #ifndef CONFIG_XEN
+ if (cpu_has(c, X86_FEATURE_PCID)) {
+ #ifdef CONFIG_X86_64
+- if (cpu_has(c, X86_FEATURE_PGE)) {
++ if (cpu_has(c, X86_FEATURE_PGE) || kaiser_enabled) {
+ /*
+ * Regardless of whether PCID is enumerated, the
+ * SDM says that it can't be enabled in 32-bit mode.
+@@ -698,6 +698,10 @@ void __cpuinit get_cpu_cap(struct cpuinf
+ #endif
+
+ init_scattered_cpuid_features(c);
++#ifdef CONFIG_KAISER
++ if (kaiser_enabled)
++ set_cpu_cap(c, X86_FEATURE_KAISER);
++#endif
+ }
+
+ static void __cpuinit identify_cpu_without_cpuid(struct cpuinfo_x86 *c)
+@@ -1291,6 +1295,15 @@ void __cpuinit cpu_init(void)
+ struct task_struct *me;
+ int cpu;
+
++ if (!kaiser_enabled) {
++ /*
++ * secondary_startup_64() deferred setting PGE in cr4:
++ * init_memory_mapping() sets it on the boot cpu,
++ * but it needs to be set on each secondary cpu.
++ */
++ set_in_cr4(X86_CR4_PGE);
++ }
++
+ cpu = stack_smp_processor_id();
+ /* CPU 0 is initialised in head64.c */
+ if (cpu != 0)
+--- a/arch/x86/kernel/entry_64-xen.S
++++ b/arch/x86/kernel/entry_64-xen.S
+@@ -57,6 +57,8 @@
+ #include <asm/ftrace.h>
+ #include <asm/percpu.h>
+ #include <asm/pgtable_types.h>
++#include <asm/alternative-asm.h>
++#include <asm/cpufeature.h>
+ #include <asm/kaiser.h>
+ #include <xen/interface/xen.h>
+ #include <xen/interface/features.h>
+@@ -436,7 +438,7 @@ ENTRY(save_paranoid)
+ * unconditionally, but we need to find out whether the reverse
+ * should be done on return (conveyed to paranoid_exit in %ebx).
+ */
+- movq %cr3, %rax
++ ALTERNATIVE "jmp 2f", "movq %cr3, %rax", X86_FEATURE_KAISER
+ testl $KAISER_SHADOW_PGD_OFFSET, %eax
+ jz 2f
+ orl $2, %ebx
+@@ -1181,6 +1183,7 @@ paranoid_kernel:
+ movq %r12, %rbx /* restore after paranoid_userspace */
+ TRACE_IRQS_IRETQ 0
+ #ifdef CONFIG_KAISER
++ /* No ALTERNATIVE for X86_FEATURE_KAISER: save_paranoid sets %ebx */
+ testl $2, %ebx /* SWITCH_USER_CR3 needed? */
+ jz paranoid_exit_no_switch
+ SWITCH_USER_CR3
+--- a/arch/x86/mm/init_64-xen.c
++++ b/arch/x86/mm/init_64-xen.c
+@@ -386,6 +386,16 @@ void __init cleanup_highmap(void)
+ continue;
+ if (vaddr < (unsigned long) _text || vaddr > end)
+ set_pmd(pmd, __pmd(0));
++ else if (kaiser_enabled) {
++ /*
++ * level2_kernel_pgt is initialized with _PAGE_GLOBAL:
++ * clear that now. This is not important, so long as
++ * CR4.PGE remains clear, but it removes an anomaly.
++ * Physical mapping setup below avoids _PAGE_GLOBAL
++ * by use of massage_pgprot() inside pfn_pte() etc.
++ */
++ set_pmd(pmd, pmd_clear_flags(*pmd, _PAGE_GLOBAL));
++ }
+ }
+ }
+ #endif
+--- a/arch/x86/mm/init-xen.c
++++ b/arch/x86/mm/init-xen.c
+@@ -179,7 +179,7 @@ unsigned long __init_refok init_memory_m
+ set_in_cr4(X86_CR4_PSE);
+
+ /* Enable PGE if available */
+- if (cpu_has_pge) {
++ if (cpu_has_pge && !kaiser_enabled) {
+ set_in_cr4(X86_CR4_PGE);
+ __supported_pte_mask |= _PAGE_GLOBAL;
+ }
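
The description above mentions that "nokaiser" is handled like "noinvpcid": an early boot parameter that clears kaiser_enabled before the flag is consulted in the hunks here. The real early_param() hook lives in arch/x86/mm/kaiser.c, which this Xen-specific diff does not touch; the sketch below is only a rough, user-space model of that command-line check.

#include <stdio.h>
#include <string.h>

/* Model of the flag the hunks above test via kaiser_enabled. */
static int kaiser_enabled = 1;

/* Rough stand-in for an early_param("nokaiser", ...) handler. */
static void parse_early_options(const char *cmdline)
{
	if (strstr(cmdline, "nokaiser"))
		kaiser_enabled = 0;
}

int main(void)
{
	parse_early_options("root=/dev/sda1 nokaiser quiet");
	printf("kaiser_enabled=%d\n", kaiser_enabled);
	return 0;
}
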
diff --git a/patches.xen/xen3-kaiser-nokaiser-0006-kaiser-use-ALTERNATIVE-instead-of-x86_cr3_pcid_noflu.patch b/patches.xen/xen3-kaiser-nokaiser-0006-kaiser-use-ALTERNATIVE-instead-of-x86_cr3_pcid_noflu.patch
new file mode 100644
index 0000000000..108dda65fe
--- /dev/null
+++ b/patches.xen/xen3-kaiser-nokaiser-0006-kaiser-use-ALTERNATIVE-instead-of-x86_cr3_pcid_noflu.patch
@@ -0,0 +1,37 @@
+From 78e37b25da902ecb56124427ed682f3846f7a191 Mon Sep 17 00:00:00 2001
+From: Hugh Dickins <hughd@google.com>
+Date: Tue, 3 Oct 2017 20:49:04 -0700
+Subject: [PATCH 06/14] xen/kaiser: use ALTERNATIVE instead of x86_cr3_pcid_noflush
+Patch-mainline: Never, SUSE-specific
+
+Now that we're playing the ALTERNATIVE game, use that more efficient
+method: instead of user-mapping an extra page, and reading an extra
+cacheline each time for x86_cr3_pcid_noflush.
+
+Neel has found that __stringify(bts $X86_CR3_PCID_NOFLUSH_BIT, %rax)
+is a working substitute for the "bts $63, %rax" in these ALTERNATIVEs;
+but the one line with $63 in looks clearer, so let's stick with that.
+
+Worried about what happens with an ALTERNATIVE between the jump and
+jump label in another ALTERNATIVE? I was, but I have checked the
+combinations in SWITCH_KERNEL_CR3_NO_STACK at entry_SYSCALL_64,
+and they work correctly.
+
+(cherry picked from Change-Id: I46d06167615aa8d628eed9972125ab2faca93f05)
+
+Signed-off-by: Jiri Kosina <jkosina@suse.cz>
+
+Automatically created from "patches.kaiser/kaiser-nokaiser-0006-kaiser-use-ALTERNATIVE-instead-of-x86_cr3_pcid_noflu.patch" by xen-port-patches.py
+
+--- a/arch/x86/kernel/entry_64-xen.S
++++ b/arch/x86/kernel/entry_64-xen.S
+@@ -443,7 +443,8 @@ ENTRY(save_paranoid)
+ jz 2f
+ orl $2, %ebx
+ andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), %rax
+- orq x86_cr3_pcid_noflush, %rax
++ /* If PCID enabled, set X86_CR3_PCID_NOFLUSH_BIT */
++ ALTERNATIVE "", "bts $63, %rax", X86_FEATURE_PCID
+ movq %rax, %cr3
+ 2:
+ #endif
diff --git a/patches.xen/xen3-kaiser-nokaiser-0007-rename-and-simplify-feature-setting.patch b/patches.xen/xen3-kaiser-nokaiser-0007-rename-and-simplify-feature-setting.patch
new file mode 100644
index 0000000000..799a637d72
--- /dev/null
+++ b/patches.xen/xen3-kaiser-nokaiser-0007-rename-and-simplify-feature-setting.patch
@@ -0,0 +1,22 @@
+From: Borislav Petkov <bp@suse.de>
+Subject: xen/x86/kaiser: Rename and simplify X86_FEATURE_KAISER handling
+Patch-mainline: Never, SUSE-Xen specific
+
+Concentrate it in arch/x86/mm/kaiser.c and use the upstream string "nopti".
+
+Signed-off-by: Borislav Petkov <bp@suse.de>
+Automatically created from "patches.kaiser/kaiser-nokaiser-0007-rename-and-simplify-feature-setting.patch" by xen-port-patches.py
+
+--- a/arch/x86/kernel/cpu/common-xen.c
++++ b/arch/x86/kernel/cpu/common-xen.c
+@@ -698,10 +698,6 @@ void __cpuinit get_cpu_cap(struct cpuinf
+ #endif
+
+ init_scattered_cpuid_features(c);
+-#ifdef CONFIG_KAISER
+- if (kaiser_enabled)
+- set_cpu_cap(c, X86_FEATURE_KAISER);
+-#endif
+ }
+
+ static void __cpuinit identify_cpu_without_cpuid(struct cpuinfo_x86 *c)
diff --git a/patches.xen/xen3-patch-2.6.25 b/patches.xen/xen3-patch-2.6.25
index 16ece6d015..2e1ffe94f2 100644
--- a/patches.xen/xen3-patch-2.6.25
+++ b/patches.xen/xen3-patch-2.6.25
@@ -1,6 +1,7 @@
From: kernel.org
Subject: 2.6.25
-Patch-mainline: 2.6.25
+Git-commit: 4b119e21d0c66c22e8ca03df05d9de623d0eb50f
+Patch-mainline: v2.6.25
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
@@ -18559,7 +18560,7 @@ Automatically created from "patches.kernel.org/patch-2.6.25" by xen-port-patches
}
/*
-@@ -311,8 +315,8 @@ blktap_unmap(struct blktap *tap, struct
+@@ -310,8 +314,8 @@ blktap_unmap(struct blktap *tap, struct
if (!xen_feature(XENFEAT_auto_translated_physmap) &&
request->handles[i].kernel == INVALID_GRANT_HANDLE) {
@@ -18570,7 +18571,7 @@ Automatically created from "patches.kernel.org/patch-2.6.25" by xen-port-patches
set_phys_to_machine(__pa(kvaddr) >> PAGE_SHIFT,
INVALID_P2M_ENTRY);
}
-@@ -352,7 +356,7 @@ blktap_device_fail_pending_requests(stru
+@@ -356,7 +360,7 @@ blktap_device_fail_pending_requests(stru
blktap_unmap(tap, request);
req = (struct request *)(unsigned long)request->id;
@@ -18579,7 +18580,7 @@ Automatically created from "patches.kernel.org/patch-2.6.25" by xen-port-patches
blktap_request_free(tap, request);
}
-@@ -375,16 +379,11 @@ blktap_device_finish_request(struct blkt
+@@ -379,16 +383,11 @@ blktap_device_finish_request(struct blkt
blkif_response_t *res,
struct blktap_request *request)
{
@@ -18596,7 +18597,7 @@ Automatically created from "patches.kernel.org/patch-2.6.25" by xen-port-patches
BTDBG("req %p res status %d operation %d/%d id %lld\n", req,
res->status, res->operation, request->operation,
-@@ -396,7 +395,8 @@ blktap_device_finish_request(struct blkt
+@@ -400,7 +399,8 @@ blktap_device_finish_request(struct blkt
if (unlikely(res->status != BLKIF_RSP_OKAY))
BTERR("Bad return from device data "
"request: %x\n", res->status);
@@ -18606,7 +18607,7 @@ Automatically created from "patches.kernel.org/patch-2.6.25" by xen-port-patches
break;
default:
BUG();
-@@ -545,9 +545,9 @@ blktap_map(struct blktap *tap,
+@@ -549,9 +549,9 @@ blktap_map(struct blktap *tap,
if (!xen_feature(XENFEAT_auto_translated_physmap)) {
pte = mk_pte(page, ring->vma->vm_page_prot);
@@ -18618,7 +18619,7 @@ Automatically created from "patches.kernel.org/patch-2.6.25" by xen-port-patches
flush_tlb_kernel_page(kvaddr);
set_phys_to_machine(__pa(kvaddr) >> PAGE_SHIFT, pte_mfn(pte));
-@@ -875,7 +875,7 @@ blktap_device_run_queue(struct blktap *t
+@@ -879,7 +879,7 @@ blktap_device_run_queue(struct blktap *t
if (!err)
queued++;
else {
@@ -18629,7 +18630,7 @@ Automatically created from "patches.kernel.org/patch-2.6.25" by xen-port-patches
--- a/drivers/xen/blktap2/ring.c
+++ b/drivers/xen/blktap2/ring.c
-@@ -103,8 +103,8 @@ blktap_ring_clear_pte(struct vm_area_str
+@@ -116,8 +116,8 @@ blktap_ring_clear_pte(struct vm_area_str
* mapped region.
*/
if (uvaddr < ring->user_vstart)
@@ -18640,7 +18641,7 @@ Automatically created from "patches.kernel.org/patch-2.6.25" by xen-port-patches
offset = (int)((uvaddr - ring->user_vstart) >> PAGE_SHIFT);
usr_idx = offset / BLKIF_MAX_SEGMENTS_PER_REQUEST;
-@@ -143,8 +143,8 @@ blktap_ring_clear_pte(struct vm_area_str
+@@ -156,8 +156,8 @@ blktap_ring_clear_pte(struct vm_area_str
khandle->user);
count++;
} else
@@ -19455,18 +19456,18 @@ Automatically created from "patches.kernel.org/patch-2.6.25" by xen-port-patches
#define BIOS_BEGIN 0x000a0000
--- a/arch/x86/include/asm/hardirq.h
+++ b/arch/x86/include/asm/hardirq.h
-@@ -21,7 +21,11 @@ typedef struct {
- #ifdef CONFIG_SMP
+@@ -22,7 +22,11 @@ typedef struct {
unsigned int irq_resched_count;
unsigned int irq_call_count;
-+#ifndef CONFIG_XEN
+ #endif
++#if !defined(CONFIG_XEN)
unsigned int irq_tlb_count;
-+#else
++#elif defined(CONFIG_SMP)
+ unsigned int irq_lock_count;
+#endif
- #endif
#ifdef CONFIG_X86_THERMAL_VECTOR
unsigned int irq_thermal_count;
+ #endif
--- a/arch/x86/include/mach-xen/asm/agp.h
+++ b/arch/x86/include/mach-xen/asm/agp.h
@@ -13,18 +13,13 @@
@@ -27360,15 +27361,15 @@ Automatically created from "patches.kernel.org/patch-2.6.25" by xen-port-patches
-#endif /* _X8664_TLBFLUSH_H */
--- a/arch/x86/include/asm/mmu.h
+++ b/arch/x86/include/asm/mmu.h
-@@ -24,7 +24,7 @@ typedef struct {
+@@ -24,6 +24,8 @@ typedef struct {
void *vdso;
} mm_context_t;
--#ifdef CONFIG_SMP
-+#if defined(CONFIG_SMP) && !defined(CONFIG_XEN)
++#ifndef CONFIG_XEN
void leave_mm(int cpu);
- #else
- static inline void leave_mm(int cpu)
++#endif
+
+ #endif /* _ASM_X86_MMU_H */
--- a/arch/x86/include/asm/ptrace.h
+++ b/arch/x86/include/asm/ptrace.h
@@ -283,7 +283,9 @@ static inline unsigned long regs_get_ker
@@ -27568,7 +27569,7 @@ Automatically created from "patches.kernel.org/patch-2.6.25" by xen-port-patches
}
--- a/mm/memory.c
+++ b/mm/memory.c
-@@ -2483,6 +2483,10 @@ int apply_to_page_range(struct mm_struct
+@@ -2493,6 +2493,10 @@ int apply_to_page_range(struct mm_struct
unsigned long end = addr + size;
int err;
diff --git a/patches.xen/xen3-patch-2.6.29 b/patches.xen/xen3-patch-2.6.29
index 42aa002c31..9afe734188 100644
--- a/patches.xen/xen3-patch-2.6.29
+++ b/patches.xen/xen3-patch-2.6.29
@@ -1,61 +1,110 @@
From: Linux Kernel Mailing List <linux-kernel@vger.kernel.org>
Subject: Linux: 2.6.29
-Patch-mainline: 2.6.29
+Patch-mainline: Never, SUSE-Xen specific
This patch contains the differences between 2.6.28 and 2.6.29.
Acked-by: Jeff Mahoney <jeffm@suse.com>
Automatically created from "patches.kernel.org/patch-2.6.29" by xen-port-patches.py
---- a/arch/x86/Kconfig
-+++ b/arch/x86/Kconfig
-@@ -318,7 +318,6 @@ config X86_XEN
- depends on X86_32
- select XEN
- select X86_PAE
-- select SWIOTLB
- help
- Choose this option if you plan to run this kernel on top of the
- Xen Hypervisor.
-@@ -356,7 +355,6 @@ config X86_64_XEN
- bool "Enable Xen compatible kernel"
- depends on X86_64
- select XEN
-- select SWIOTLB
- help
- This option will compile a kernel compatible with Xen hypervisor
-
-@@ -761,7 +759,7 @@ config AMD_IOMMU_STATS
-
- # need this always selected by IOMMU for the VIA workaround
- config SWIOTLB
-- def_bool y if X86_64
-+ def_bool y if X86_64 || XEN
- ---help---
- Support for software bounce buffers used on x86-64 systems
- which don't have a hardware IOMMU (e.g. the current generation
-@@ -876,7 +874,7 @@ config X86_XEN_GENAPIC
-
- config X86_REROUTE_FOR_BROKEN_BOOT_IRQS
- bool "Reroute for broken boot IRQs"
-- depends on X86_IO_APIC
-+ depends on X86_IO_APIC && !XEN
- ---help---
- This option enables a workaround that fixes a source of
- spurious interrupts. This is recommended when threaded
---- a/arch/x86/Makefile
-+++ b/arch/x86/Makefile
-@@ -162,8 +162,8 @@ BOOT_TARGETS = bzlilo bzdisk fdimage fdi
- PHONY += bzImage vmlinuz $(BOOT_TARGETS)
-
- ifdef CONFIG_XEN
--KBUILD_CPPFLAGS := -D__XEN_INTERFACE_VERSION__=$(CONFIG_XEN_INTERFACE_VERSION) \
-- -I$(srctree)/arch/x86/include/mach-xen $(KBUILD_CPPFLAGS)
-+LINUXINCLUDE := -D__XEN_INTERFACE_VERSION__=$(CONFIG_XEN_INTERFACE_VERSION) \
-+ -I$(srctree)/arch/x86/include/mach-xen $(LINUXINCLUDE)
-
- ifdef CONFIG_X86_64
- LDFLAGS_vmlinux := -e startup_64
+---
+ arch/x86/Kconfig | 6
+ arch/x86/Makefile | 4
+ arch/x86/ia32/ia32entry-xen.S | 8
+ arch/x86/include/asm/hw_irq.h | 2
+ arch/x86/include/asm/hypervisor.h | 4
+ arch/x86/include/asm/kexec.h | 9
+ arch/x86/include/asm/thread_info.h | 2
+ arch/x86/include/mach-xen/asm/desc.h | 10
+ arch/x86/include/mach-xen/asm/fixmap_64.h | 6
+ arch/x86/include/mach-xen/asm/highmem.h | 1
+ arch/x86/include/mach-xen/asm/hypervisor.h | 4
+ arch/x86/include/mach-xen/asm/io.h | 40
+ arch/x86/include/mach-xen/asm/irq_vectors.h | 6
+ arch/x86/include/mach-xen/asm/mmu_context_32.h | 19
+ arch/x86/include/mach-xen/asm/pci.h | 14
+ arch/x86/include/mach-xen/asm/pgtable-3level.h | 1
+ arch/x86/include/mach-xen/asm/pgtable.h | 86 +
+ arch/x86/include/mach-xen/asm/pgtable_32.h | 9
+ arch/x86/include/mach-xen/asm/pgtable_64.h | 12
+ arch/x86/include/mach-xen/asm/processor.h | 20
+ arch/x86/include/mach-xen/asm/smp.h | 40
+ arch/x86/include/mach-xen/asm/spinlock.h | 1
+ arch/x86/include/mach-xen/asm/system.h | 6
+ arch/x86/kernel/apic/apic-xen.c | 2
+ arch/x86/kernel/apic/io_apic-xen.c | 1090 ++++++++++++++--------
+ arch/x86/kernel/apic/ipi-xen.c | 24
+ arch/x86/kernel/cpu/Makefile | 2
+ arch/x86/kernel/cpu/common-xen.c | 45
+ arch/x86/kernel/cpu/intel.c | 5
+ arch/x86/kernel/cpu/mtrr/main-xen.c | 2
+ arch/x86/kernel/e820-xen.c | 37
+ arch/x86/kernel/early_printk-xen.c | 49 -
+ arch/x86/kernel/entry_32-xen.S | 559 ++++++-----
+ arch/x86/kernel/entry_64-xen.S | 1203 +++++++++++--------------
+ arch/x86/kernel/head-xen.c | 1
+ arch/x86/kernel/head32-xen.c | 3
+ arch/x86/kernel/head64-xen.c | 5
+ arch/x86/kernel/ioport-xen.c | 2
+ arch/x86/kernel/irq-xen.c | 45
+ arch/x86/kernel/ldt-xen.c | 2
+ arch/x86/kernel/machine_kexec_32.c | 19
+ arch/x86/kernel/mpparse-xen.c | 382 +++----
+ arch/x86/kernel/pci-dma-xen.c | 27
+ arch/x86/kernel/process-xen.c | 42
+ arch/x86/kernel/process_32-xen.c | 88 -
+ arch/x86/kernel/process_64-xen.c | 66 -
+ arch/x86/kernel/setup-xen.c | 183 ---
+ arch/x86/kernel/smp-xen.c | 41
+ arch/x86/kernel/time-xen.c | 31
+ arch/x86/kernel/traps-xen.c | 103 +-
+ arch/x86/kernel/vsyscall_64-xen.c | 12
+ arch/x86/mm/fault-xen.c | 52 -
+ arch/x86/mm/hypervisor.c | 19
+ arch/x86/mm/init_32-xen.c | 102 +-
+ arch/x86/mm/init_64-xen.c | 10
+ arch/x86/mm/iomap_32-xen.c | 21
+ arch/x86/mm/ioremap-xen.c | 50 -
+ arch/x86/mm/pageattr-xen.c | 93 +
+ arch/x86/mm/pat-xen.c | 292 +++++-
+ arch/x86/pci/pcifront.c | 2
+ arch/x86/vdso/vdso32-setup-xen.c | 2
+ drivers/acpi/Kconfig | 2
+ drivers/acpi/processor_core.c | 2
+ drivers/acpi/processor_idle.c | 16
+ drivers/gpu/drm/i915/i915_drv.c | 2
+ drivers/gpu/drm/i915/i915_drv.h | 5
+ drivers/gpu/drm/i915/i915_gem.c | 11
+ drivers/oprofile/buffer_sync.c | 17
+ drivers/oprofile/cpu_buffer.c | 40
+ drivers/pci/msi-xen.c | 31
+ drivers/xen/Kconfig | 1
+ drivers/xen/Makefile | 1
+ drivers/xen/balloon/sysfs.c | 39
+ drivers/xen/blkfront/vbd.c | 4
+ drivers/xen/core/cpu_hotplug.c | 27
+ drivers/xen/core/evtchn.c | 232 +++-
+ drivers/xen/core/machine_reboot.c | 13
+ drivers/xen/core/smpboot.c | 42
+ drivers/xen/core/spinlock.c | 14
+ drivers/xen/netback/interface.c | 14
+ drivers/xen/netback/loopback.c | 16
+ drivers/xen/netback/netback.c | 3
+ drivers/xen/netfront/netfront.c | 33
+ drivers/xen/sfc_netfront/accel_msg.c | 4
+ drivers/xen/xenbus/xenbus_client.c | 3
+ drivers/xen/xenbus/xenbus_probe.c | 54 -
+ drivers/xen/xenbus/xenbus_probe.h | 2
+ drivers/xen/xenbus/xenbus_probe_backend.c | 18
+ drivers/xen/xenbus/xenbus_xs.c | 3
+ drivers/xen/xenoprof/xenoprofile.c | 8
+ include/acpi/processor.h | 11
+ include/xen/cpu_hotplug.h | 2
+ include/xen/evtchn.h | 12
+ include/xen/xenbus.h | 2
+ lib/swiotlb-xen.c | 354 ++++---
+ 95 files changed, 3457 insertions(+), 2604 deletions(-)
+
--- a/arch/x86/ia32/ia32entry-xen.S
+++ b/arch/x86/ia32/ia32entry-xen.S
@@ -363,9 +363,9 @@ ENTRY(ia32_syscall)
@@ -92,7 +141,7 @@ Automatically created from "patches.kernel.org/patch-2.6.29" by xen-port-patches
+#endif
typedef int vector_irq_t[NR_VECTORS];
- DECLARE_PER_CPU(vector_irq_t, vector_irq);
+ #ifndef __GENKSYMS__
--- a/arch/x86/include/asm/hypervisor.h
+++ b/arch/x86/include/asm/hypervisor.h
@@ -56,3 +56,7 @@ extern const struct hypervisor_x86 x86_h
@@ -387,6 +436,69 @@ Automatically created from "patches.kernel.org/patch-2.6.29" by xen-port-patches
#endif
#endif /* _ASM_X86_PCI_H */
+--- a/arch/x86/include/mach-xen/asm/pgtable_32.h
++++ b/arch/x86/include/mach-xen/asm/pgtable_32.h
+@@ -107,15 +107,6 @@ extern unsigned long pg0[];
+ #endif
+
+ /*
+- * Macro to mark a page protection value as "uncacheable".
+- * On processors which do not support it, this is a no-op.
+- */
+-#define pgprot_noncached(prot) \
+- ((boot_cpu_data.x86 > 3) \
+- ? (__pgprot(pgprot_val(prot) | _PAGE_PCD | _PAGE_PWT)) \
+- : (prot))
+-
+-/*
+ * Conversion functions: convert a page and protection to a page entry,
+ * and a page entry and page directory to the page they refer to.
+ */
+--- a/arch/x86/include/mach-xen/asm/pgtable-3level.h
++++ b/arch/x86/include/mach-xen/asm/pgtable-3level.h
+@@ -146,6 +146,7 @@ static inline int pte_none(pte_t pte)
+ #define PTE_FILE_MAX_BITS 32
+
+ /* Encode and de-code a swap entry */
++#define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > 5)
+ #define __swp_type(x) (((x).val) & 0x1f)
+ #define __swp_offset(x) ((x).val >> 5)
+ #define __swp_entry(type, offset) ((swp_entry_t){(type) | (offset) << 5})
+--- a/arch/x86/include/mach-xen/asm/pgtable_64.h
++++ b/arch/x86/include/mach-xen/asm/pgtable_64.h
+@@ -144,8 +144,8 @@ static inline void xen_pgd_clear(pgd_t *
+ #define PGDIR_SIZE (_AC(1, UL) << PGDIR_SHIFT)
+ #define PGDIR_MASK (~(PGDIR_SIZE - 1))
+
+-
+-#define MAXMEM _AC(0x000004ffffffffff, UL)
++#define MAX_PHYSMEM_BITS 43
++#define MAXMEM _AC(__AC(1, UL) << MAX_PHYSMEM_BITS, UL)
+ #define VMALLOC_START _AC(0xffffc20000000000, UL)
+ #define VMALLOC_END _AC(0xffffe1ffffffffff, UL)
+ #define VMEMMAP_START _AC(0xffffe20000000000, UL)
+@@ -178,12 +178,6 @@ static inline int pmd_bad(pmd_t pmd)
+ #define __pte_mfn(_pte) (((_pte).pte & PTE_PFN_MASK) >> PAGE_SHIFT)
+
+ /*
+- * Macro to mark a page protection value as "uncacheable".
+- */
+-#define pgprot_noncached(prot) \
+- (__pgprot(pgprot_val((prot)) | _PAGE_PCD | _PAGE_PWT))
+-
+-/*
+ * Conversion functions: convert a page and protection to a page entry,
+ * and a page entry and page directory to the page they refer to.
+ */
+@@ -265,6 +259,8 @@ static inline int pud_large(pud_t pte)
+ #define SWP_OFFSET_SHIFT (_PAGE_BIT_FILE + 1)
+ #endif
+
++#define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > SWP_TYPE_BITS)
++
+ #define __swp_type(x) (((x).val >> (_PAGE_BIT_PRESENT + 1)) \
+ & ((1U << SWP_TYPE_BITS) - 1))
+ #define __swp_offset(x) ((x).val >> SWP_OFFSET_SHIFT)
--- a/arch/x86/include/mach-xen/asm/pgtable.h
+++ b/arch/x86/include/mach-xen/asm/pgtable.h
@@ -22,12 +22,10 @@
@@ -524,69 +636,6 @@ Automatically created from "patches.kernel.org/patch-2.6.29" by xen-port-patches
#define __HAVE_PHYS_MEM_ACCESS_PROT
struct file;
pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
---- a/arch/x86/include/mach-xen/asm/pgtable-3level.h
-+++ b/arch/x86/include/mach-xen/asm/pgtable-3level.h
-@@ -146,6 +146,7 @@ static inline int pte_none(pte_t pte)
- #define PTE_FILE_MAX_BITS 32
-
- /* Encode and de-code a swap entry */
-+#define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > 5)
- #define __swp_type(x) (((x).val) & 0x1f)
- #define __swp_offset(x) ((x).val >> 5)
- #define __swp_entry(type, offset) ((swp_entry_t){(type) | (offset) << 5})
---- a/arch/x86/include/mach-xen/asm/pgtable_32.h
-+++ b/arch/x86/include/mach-xen/asm/pgtable_32.h
-@@ -107,15 +107,6 @@ extern unsigned long pg0[];
- #endif
-
- /*
-- * Macro to mark a page protection value as "uncacheable".
-- * On processors which do not support it, this is a no-op.
-- */
--#define pgprot_noncached(prot) \
-- ((boot_cpu_data.x86 > 3) \
-- ? (__pgprot(pgprot_val(prot) | _PAGE_PCD | _PAGE_PWT)) \
-- : (prot))
--
--/*
- * Conversion functions: convert a page and protection to a page entry,
- * and a page entry and page directory to the page they refer to.
- */
---- a/arch/x86/include/mach-xen/asm/pgtable_64.h
-+++ b/arch/x86/include/mach-xen/asm/pgtable_64.h
-@@ -144,8 +144,8 @@ static inline void xen_pgd_clear(pgd_t *
- #define PGDIR_SIZE (_AC(1, UL) << PGDIR_SHIFT)
- #define PGDIR_MASK (~(PGDIR_SIZE - 1))
-
--
--#define MAXMEM _AC(0x000004ffffffffff, UL)
-+#define MAX_PHYSMEM_BITS 43
-+#define MAXMEM _AC(__AC(1, UL) << MAX_PHYSMEM_BITS, UL)
- #define VMALLOC_START _AC(0xffffc20000000000, UL)
- #define VMALLOC_END _AC(0xffffe1ffffffffff, UL)
- #define VMEMMAP_START _AC(0xffffe20000000000, UL)
-@@ -178,12 +178,6 @@ static inline int pmd_bad(pmd_t pmd)
- #define __pte_mfn(_pte) (((_pte).pte & PTE_PFN_MASK) >> PAGE_SHIFT)
-
- /*
-- * Macro to mark a page protection value as "uncacheable".
-- */
--#define pgprot_noncached(prot) \
-- (__pgprot(pgprot_val((prot)) | _PAGE_PCD | _PAGE_PWT))
--
--/*
- * Conversion functions: convert a page and protection to a page entry,
- * and a page entry and page directory to the page they refer to.
- */
-@@ -265,6 +259,8 @@ static inline int pud_large(pud_t pte)
- #define SWP_OFFSET_SHIFT (_PAGE_BIT_FILE + 1)
- #endif
-
-+#define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > SWP_TYPE_BITS)
-+
- #define __swp_type(x) (((x).val >> (_PAGE_BIT_PRESENT + 1)) \
- & ((1U << SWP_TYPE_BITS) - 1))
- #define __swp_offset(x) ((x).val >> SWP_OFFSET_SHIFT)
--- a/arch/x86/include/mach-xen/asm/processor.h
+++ b/arch/x86/include/mach-xen/asm/processor.h
@@ -121,6 +121,7 @@ struct cpuinfo_x86 {
@@ -771,6 +820,42 @@ Automatically created from "patches.kernel.org/patch-2.6.29" by xen-port-patches
/**
* read_barrier_depends - Flush all pending reads that subsequents reads
* depend on.
+--- a/arch/x86/Kconfig
++++ b/arch/x86/Kconfig
+@@ -318,7 +318,6 @@ config X86_XEN
+ depends on X86_32
+ select XEN
+ select X86_PAE
+- select SWIOTLB
+ help
+ Choose this option if you plan to run this kernel on top of the
+ Xen Hypervisor.
+@@ -356,7 +355,6 @@ config X86_64_XEN
+ bool "Enable Xen compatible kernel"
+ depends on X86_64
+ select XEN
+- select SWIOTLB
+ help
+ This option will compile a kernel compatible with Xen hypervisor
+
+@@ -761,7 +759,7 @@ config AMD_IOMMU_STATS
+
+ # need this always selected by IOMMU for the VIA workaround
+ config SWIOTLB
+- def_bool y if X86_64
++ def_bool y if X86_64 || XEN
+ ---help---
+ Support for software bounce buffers used on x86-64 systems
+ which don't have a hardware IOMMU (e.g. the current generation
+@@ -876,7 +874,7 @@ config X86_XEN_GENAPIC
+
+ config X86_REROUTE_FOR_BROKEN_BOOT_IRQS
+ bool "Reroute for broken boot IRQs"
+- depends on X86_IO_APIC
++ depends on X86_IO_APIC && !XEN
+ ---help---
+ This option enables a workaround that fixes a source of
+ spurious interrupts. This is recommended when threaded
--- a/arch/x86/kernel/apic/apic-xen.c
+++ b/arch/x86/kernel/apic/apic-xen.c
@@ -32,7 +32,7 @@ static int __init apic_set_verbosity(cha
@@ -782,17 +867,2017 @@ Automatically created from "patches.kernel.org/patch-2.6.29" by xen-port-patches
" use apic=verbose or apic=debug\n", arg);
return -EINVAL;
}
---- a/arch/x86/kernel/cpu/Makefile
-+++ b/arch/x86/kernel/cpu/Makefile
-@@ -41,7 +41,7 @@ obj-$(CONFIG_MTRR) += mtrr/
+--- a/arch/x86/kernel/apic/io_apic-xen.c
++++ b/arch/x86/kernel/apic/io_apic-xen.c
+@@ -112,102 +112,276 @@ static int __init parse_noapic(char *str
+ }
+ early_param("noapic", parse_noapic);
- obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o
++#ifndef CONFIG_XEN
+ struct irq_pin_list;
++
++/*
++ * This is performance-critical, we want to do it O(1)
++ *
++ * the indexing order of this array favors 1:1 mappings
++ * between pins and IRQs.
++ */
++
++struct irq_pin_list {
++ int apic, pin;
++ struct irq_pin_list *next;
++};
++
++static struct irq_pin_list *get_one_free_irq_2_pin(int cpu)
++{
++ struct irq_pin_list *pin;
++ int node;
++
++ node = cpu_to_node(cpu);
++
++ pin = kzalloc_node(sizeof(*pin), GFP_ATOMIC, node);
++
++ return pin;
++}
++
+ struct irq_cfg {
+-#ifndef CONFIG_XEN
+- unsigned int irq;
+ struct irq_pin_list *irq_2_pin;
+- cpumask_t domain;
+- cpumask_t old_domain;
++ cpumask_var_t domain;
++ cpumask_var_t old_domain;
+ unsigned move_cleanup_count;
+-#endif
+ u8 vector;
+-#ifndef CONFIG_XEN
+ u8 move_in_progress : 1;
++#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC
++ u8 move_desc_pending : 1;
+ #endif
+ };
--disabled-obj-$(CONFIG_XEN) := perfctr-watchdog.o
-+disabled-obj-$(CONFIG_XEN) := hypervisor.o perfctr-watchdog.o vmware.o
+ /* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */
++#ifdef CONFIG_SPARSE_IRQ
++static struct irq_cfg irq_cfgx[] = {
++#else
+ static struct irq_cfg irq_cfgx[NR_IRQS] = {
+- [0] = { .irq = 0 },
+- [1] = { .irq = 1 },
+- [2] = { .irq = 2 },
+- [3] = { .irq = 3 },
+- [4] = { .irq = 4 },
+- [5] = { .irq = 5 },
+- [6] = { .irq = 6 },
+- [7] = { .irq = 7 },
+- [8] = { .irq = 8 },
+- [9] = { .irq = 9 },
+- [10] = { .irq = 10 },
+- [11] = { .irq = 11 },
+- [12] = { .irq = 12 },
+- [13] = { .irq = 13 },
+- [14] = { .irq = 14 },
+- [15] = { .irq = 15 },
++#endif
++ [0] = { .vector = IRQ0_VECTOR, },
++ [1] = { .vector = IRQ1_VECTOR, },
++ [2] = { .vector = IRQ2_VECTOR, },
++ [3] = { .vector = IRQ3_VECTOR, },
++ [4] = { .vector = IRQ4_VECTOR, },
++ [5] = { .vector = IRQ5_VECTOR, },
++ [6] = { .vector = IRQ6_VECTOR, },
++ [7] = { .vector = IRQ7_VECTOR, },
++ [8] = { .vector = IRQ8_VECTOR, },
++ [9] = { .vector = IRQ9_VECTOR, },
++ [10] = { .vector = IRQ10_VECTOR, },
++ [11] = { .vector = IRQ11_VECTOR, },
++ [12] = { .vector = IRQ12_VECTOR, },
++ [13] = { .vector = IRQ13_VECTOR, },
++ [14] = { .vector = IRQ14_VECTOR, },
++ [15] = { .vector = IRQ15_VECTOR, },
+ };
- quiet_cmd_mkcapflags = MKCAP $@
- cmd_mkcapflags = $(PERL) $(srctree)/$(src)/mkcapflags.pl $< $@
+-#define for_each_irq_cfg(irq, cfg) \
+- for (irq = 0, cfg = irq_cfgx; irq < nr_irqs; irq++, cfg++)
++int __init arch_early_irq_init(void)
++{
++ struct irq_cfg *cfg;
++ struct irq_desc *desc;
++ int count;
++ int i;
++
++ cfg = irq_cfgx;
++ count = ARRAY_SIZE(irq_cfgx);
+
++ for (i = 0; i < count; i++) {
++ desc = irq_to_desc(i);
++ desc->chip_data = &cfg[i];
++ alloc_bootmem_cpumask_var(&cfg[i].domain);
++ alloc_bootmem_cpumask_var(&cfg[i].old_domain);
++ if (i < NR_IRQS_LEGACY)
++ cpumask_setall(cfg[i].domain);
++ }
++
++ return 0;
++}
++
++#ifdef CONFIG_SPARSE_IRQ
+ static struct irq_cfg *irq_cfg(unsigned int irq)
+ {
+- return irq < nr_irqs ? irq_cfgx + irq : NULL;
++ struct irq_cfg *cfg = NULL;
++ struct irq_desc *desc;
++
++ desc = irq_to_desc(irq);
++ if (desc)
++ cfg = desc->chip_data;
++
++ return cfg;
+ }
+
+-static struct irq_cfg *irq_cfg_alloc(unsigned int irq)
++static struct irq_cfg *get_one_free_irq_cfg(int cpu)
+ {
+- return irq_cfg(irq);
++ struct irq_cfg *cfg;
++ int node;
++
++ node = cpu_to_node(cpu);
++
++ cfg = kzalloc_node(sizeof(*cfg), GFP_ATOMIC, node);
++ if (cfg) {
++ if (!alloc_cpumask_var_node(&cfg->domain, GFP_ATOMIC, node)) {
++ kfree(cfg);
++ cfg = NULL;
++ } else if (!alloc_cpumask_var_node(&cfg->old_domain,
++ GFP_ATOMIC, node)) {
++ free_cpumask_var(cfg->domain);
++ kfree(cfg);
++ cfg = NULL;
++ } else {
++ cpumask_clear(cfg->domain);
++ cpumask_clear(cfg->old_domain);
++ }
++ }
++
++ return cfg;
+ }
+
+-#ifdef CONFIG_XEN
+-#define irq_2_pin_init()
+-#define add_pin_to_irq(irq, apic, pin)
+-#else
+-/*
+- * Rough estimation of how many shared IRQs there are, can be changed
+- * anytime.
+- */
+-#define MAX_PLUS_SHARED_IRQS NR_IRQS
+-#define PIN_MAP_SIZE (MAX_PLUS_SHARED_IRQS + NR_IRQS)
++int arch_init_chip_data(struct irq_desc *desc, int cpu)
++{
++ struct irq_cfg *cfg;
+
+-/*
+- * This is performance-critical, we want to do it O(1)
+- *
+- * the indexing order of this array favors 1:1 mappings
+- * between pins and IRQs.
+- */
++ cfg = desc->chip_data;
++ if (!cfg) {
++ desc->chip_data = get_one_free_irq_cfg(cpu);
++ if (!desc->chip_data) {
++ printk(KERN_ERR "can not alloc irq_cfg\n");
++ BUG_ON(1);
++ }
++ }
+
+-struct irq_pin_list {
+- int apic, pin;
+- struct irq_pin_list *next;
+-};
++ return 0;
++}
+
+-static struct irq_pin_list irq_2_pin_head[PIN_MAP_SIZE];
+-static struct irq_pin_list *irq_2_pin_ptr;
++#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC
+
+-static void __init irq_2_pin_init(void)
++static void
++init_copy_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg, int cpu)
+ {
+- struct irq_pin_list *pin = irq_2_pin_head;
+- int i;
++ struct irq_pin_list *old_entry, *head, *tail, *entry;
++
++ cfg->irq_2_pin = NULL;
++ old_entry = old_cfg->irq_2_pin;
++ if (!old_entry)
++ return;
++
++ entry = get_one_free_irq_2_pin(cpu);
++ if (!entry)
++ return;
++
++ entry->apic = old_entry->apic;
++ entry->pin = old_entry->pin;
++ head = entry;
++ tail = entry;
++ old_entry = old_entry->next;
++ while (old_entry) {
++ entry = get_one_free_irq_2_pin(cpu);
++ if (!entry) {
++ entry = head;
++ while (entry) {
++ head = entry->next;
++ kfree(entry);
++ entry = head;
++ }
++ /* still use the old one */
++ return;
++ }
++ entry->apic = old_entry->apic;
++ entry->pin = old_entry->pin;
++ tail->next = entry;
++ tail = entry;
++ old_entry = old_entry->next;
++ }
+
+- for (i = 1; i < PIN_MAP_SIZE; i++)
+- pin[i-1].next = &pin[i];
++ tail->next = NULL;
++ cfg->irq_2_pin = head;
++}
++
++static void free_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg)
++{
++ struct irq_pin_list *entry, *next;
++
++ if (old_cfg->irq_2_pin == cfg->irq_2_pin)
++ return;
++
++ entry = old_cfg->irq_2_pin;
+
+- irq_2_pin_ptr = &pin[0];
++ while (entry) {
++ next = entry->next;
++ kfree(entry);
++ entry = next;
++ }
++ old_cfg->irq_2_pin = NULL;
+ }
+
+-static struct irq_pin_list *get_one_free_irq_2_pin(void)
++void arch_init_copy_chip_data(struct irq_desc *old_desc,
++ struct irq_desc *desc, int cpu)
+ {
+- struct irq_pin_list *pin = irq_2_pin_ptr;
++ struct irq_cfg *cfg;
++ struct irq_cfg *old_cfg;
+
+- if (!pin)
+- panic("can not get more irq_2_pin\n");
++ cfg = get_one_free_irq_cfg(cpu);
+
+- irq_2_pin_ptr = pin->next;
+- pin->next = NULL;
+- return pin;
++ if (!cfg)
++ return;
++
++ desc->chip_data = cfg;
++
++ old_cfg = old_desc->chip_data;
++
++ memcpy(cfg, old_cfg, sizeof(struct irq_cfg));
++
++ init_copy_irq_2_pin(old_cfg, cfg, cpu);
++}
++
++static void free_irq_cfg(struct irq_cfg *old_cfg)
++{
++ kfree(old_cfg);
++}
++
++void arch_free_chip_data(struct irq_desc *old_desc, struct irq_desc *desc)
++{
++ struct irq_cfg *old_cfg, *cfg;
++
++ old_cfg = old_desc->chip_data;
++ cfg = desc->chip_data;
++
++ if (old_cfg == cfg)
++ return;
++
++ if (old_cfg) {
++ free_irq_2_pin(old_cfg, cfg);
++ free_irq_cfg(old_cfg);
++ old_desc->chip_data = NULL;
++ }
++}
++
++static void
++set_extra_move_desc(struct irq_desc *desc, const struct cpumask *mask)
++{
++ struct irq_cfg *cfg = desc->chip_data;
++
++ if (!cfg->move_in_progress) {
++ /* it means that domain is not changed */
++ if (!cpumask_intersects(&desc->affinity, mask))
++ cfg->move_desc_pending = 1;
++ }
++}
++#endif
++
++#else
++static struct irq_cfg *irq_cfg(unsigned int irq)
++{
++ return irq < nr_irqs ? irq_cfgx + irq : NULL;
++}
++
++#endif
++
++#ifndef CONFIG_NUMA_MIGRATE_IRQ_DESC
++static inline void
++set_extra_move_desc(struct irq_desc *desc, const struct cpumask *mask)
++{
+ }
++#endif
+
+ struct io_apic {
+ unsigned int index;
+@@ -220,7 +394,7 @@ static __attribute_const__ struct io_api
+ return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx)
+ + (mp_ioapics[idx].mp_apicaddr & ~PAGE_MASK);
+ }
+-#endif
++#endif /* !CONFIG_XEN */
+
+ static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg)
+ {
+@@ -275,11 +449,10 @@ static inline void io_apic_modify(unsign
+ writel(value, &io_apic->data);
+ }
+
+-static bool io_apic_level_ack_pending(unsigned int irq)
++static bool io_apic_level_ack_pending(struct irq_cfg *cfg)
+ {
+ struct irq_pin_list *entry;
+ unsigned long flags;
+- struct irq_cfg *cfg = irq_cfg(irq);
+
+ spin_lock_irqsave(&ioapic_lock, flags);
+ entry = cfg->irq_2_pin;
+@@ -369,13 +542,32 @@ static void ioapic_mask_entry(int apic,
+ }
+
+ #ifdef CONFIG_SMP
+-static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, u8 vector)
++static void send_cleanup_vector(struct irq_cfg *cfg)
++{
++ cpumask_var_t cleanup_mask;
++
++ if (unlikely(!alloc_cpumask_var(&cleanup_mask, GFP_ATOMIC))) {
++ unsigned int i;
++ cfg->move_cleanup_count = 0;
++ for_each_cpu_and(i, cfg->old_domain, cpu_online_mask)
++ cfg->move_cleanup_count++;
++ for_each_cpu_and(i, cfg->old_domain, cpu_online_mask)
++ send_IPI_mask(cpumask_of(i), IRQ_MOVE_CLEANUP_VECTOR);
++ } else {
++ cpumask_and(cleanup_mask, cfg->old_domain, cpu_online_mask);
++ cfg->move_cleanup_count = cpumask_weight(cleanup_mask);
++ send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
++ free_cpumask_var(cleanup_mask);
++ }
++ cfg->move_in_progress = 0;
++}
++
++static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq_cfg *cfg)
+ {
+ int apic, pin;
+- struct irq_cfg *cfg;
+ struct irq_pin_list *entry;
++ u8 vector = cfg->vector;
+
+- cfg = irq_cfg(irq);
+ entry = cfg->irq_2_pin;
+ for (;;) {
+ unsigned int reg;
+@@ -405,36 +597,61 @@ static void __target_IO_APIC_irq(unsigne
+ }
+ }
+
+-static int assign_irq_vector(int irq, cpumask_t mask);
++static int
++assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask);
++
++/*
++ * Either sets desc->affinity to a valid value, and returns cpu_mask_to_apicid
++ * of that, or returns BAD_APICID and leaves desc->affinity untouched.
++ */
++static unsigned int
++set_desc_affinity(struct irq_desc *desc, const struct cpumask *mask)
++{
++ struct irq_cfg *cfg;
++ unsigned int irq;
++
++ if (!cpumask_intersects(mask, cpu_online_mask))
++ return BAD_APICID;
++
++ irq = desc->irq;
++ cfg = desc->chip_data;
++ if (assign_irq_vector(irq, cfg, mask))
++ return BAD_APICID;
++
++ cpumask_and(&desc->affinity, cfg->domain, mask);
++ set_extra_move_desc(desc, mask);
++ return cpu_mask_to_apicid_and(&desc->affinity, cpu_online_mask);
++}
+
+-static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask)
++static void
++set_ioapic_affinity_irq_desc(struct irq_desc *desc, const struct cpumask *mask)
+ {
+ struct irq_cfg *cfg;
+ unsigned long flags;
+ unsigned int dest;
+- cpumask_t tmp;
+- struct irq_desc *desc;
++ unsigned int irq;
+
+- cpus_and(tmp, mask, cpu_online_map);
+- if (cpus_empty(tmp))
+- return;
++ irq = desc->irq;
++ cfg = desc->chip_data;
+
+- cfg = irq_cfg(irq);
+- if (assign_irq_vector(irq, mask))
+- return;
++ spin_lock_irqsave(&ioapic_lock, flags);
++ dest = set_desc_affinity(desc, mask);
++ if (dest != BAD_APICID) {
++ /* Only the high 8 bits are valid. */
++ dest = SET_APIC_LOGICAL_ID(dest);
++ __target_IO_APIC_irq(irq, dest, cfg);
++ }
++ spin_unlock_irqrestore(&ioapic_lock, flags);
++}
+
+- cpus_and(tmp, cfg->domain, mask);
+- dest = cpu_mask_to_apicid(tmp);
+- /*
+- * Only the high 8 bits are valid.
+- */
+- dest = SET_APIC_LOGICAL_ID(dest);
++static void
++set_ioapic_affinity_irq(unsigned int irq, const struct cpumask *mask)
++{
++ struct irq_desc *desc;
+
+ desc = irq_to_desc(irq);
+- spin_lock_irqsave(&ioapic_lock, flags);
+- __target_IO_APIC_irq(irq, dest, cfg->vector);
+- desc->affinity = mask;
+- spin_unlock_irqrestore(&ioapic_lock, flags);
++
++ set_ioapic_affinity_irq_desc(desc, mask);
+ }
+ #endif /* CONFIG_SMP */
+
+@@ -443,16 +660,18 @@ static void set_ioapic_affinity_irq(unsi
+ * shared ISA-space IRQs, so we have to support them. We are super
+ * fast in the common case, and fast for shared ISA-space IRQs.
+ */
+-static void add_pin_to_irq(unsigned int irq, int apic, int pin)
++static void add_pin_to_irq_cpu(struct irq_cfg *cfg, int cpu, int apic, int pin)
+ {
+- struct irq_cfg *cfg;
+ struct irq_pin_list *entry;
+
+- /* first time to refer irq_cfg, so with new */
+- cfg = irq_cfg_alloc(irq);
+ entry = cfg->irq_2_pin;
+ if (!entry) {
+- entry = get_one_free_irq_2_pin();
++ entry = get_one_free_irq_2_pin(cpu);
++ if (!entry) {
++ printk(KERN_ERR "can not alloc irq_2_pin to add %d - %d\n",
++ apic, pin);
++ return;
++ }
+ cfg->irq_2_pin = entry;
+ entry->apic = apic;
+ entry->pin = pin;
+@@ -467,7 +686,7 @@ static void add_pin_to_irq(unsigned int
+ entry = entry->next;
+ }
+
+- entry->next = get_one_free_irq_2_pin();
++ entry->next = get_one_free_irq_2_pin(cpu);
+ entry = entry->next;
+ entry->apic = apic;
+ entry->pin = pin;
+@@ -476,11 +695,10 @@ static void add_pin_to_irq(unsigned int
+ /*
+ * Reroute an IRQ to a different pin.
+ */
+-static void __init replace_pin_at_irq(unsigned int irq,
++static void __init replace_pin_at_irq_cpu(struct irq_cfg *cfg, int cpu,
+ int oldapic, int oldpin,
+ int newapic, int newpin)
+ {
+- struct irq_cfg *cfg = irq_cfg(irq);
+ struct irq_pin_list *entry = cfg->irq_2_pin;
+ int replaced = 0;
+
+@@ -497,18 +715,16 @@ static void __init replace_pin_at_irq(un
+
+ /* why? call replace before add? */
+ if (!replaced)
+- add_pin_to_irq(irq, newapic, newpin);
++ add_pin_to_irq_cpu(cfg, cpu, newapic, newpin);
+ }
+
+-static inline void io_apic_modify_irq(unsigned int irq,
++static inline void io_apic_modify_irq(struct irq_cfg *cfg,
+ int mask_and, int mask_or,
+ void (*final)(struct irq_pin_list *entry))
+ {
+ int pin;
+- struct irq_cfg *cfg;
+ struct irq_pin_list *entry;
+
+- cfg = irq_cfg(irq);
+ for (entry = cfg->irq_2_pin; entry != NULL; entry = entry->next) {
+ unsigned int reg;
+ pin = entry->pin;
+@@ -521,13 +737,13 @@ static inline void io_apic_modify_irq(un
+ }
+ }
+
+-static void __unmask_IO_APIC_irq(unsigned int irq)
++static void __unmask_IO_APIC_irq(struct irq_cfg *cfg)
+ {
+- io_apic_modify_irq(irq, ~IO_APIC_REDIR_MASKED, 0, NULL);
++ io_apic_modify_irq(cfg, ~IO_APIC_REDIR_MASKED, 0, NULL);
+ }
+
+ #ifdef CONFIG_X86_64
+-void io_apic_sync(struct irq_pin_list *entry)
++static void io_apic_sync(struct irq_pin_list *entry)
+ {
+ /*
+ * Synchronize the IO-APIC and the CPU by doing
+@@ -538,47 +754,64 @@ void io_apic_sync(struct irq_pin_list *e
+ readl(&io_apic->data);
+ }
+
+-static void __mask_IO_APIC_irq(unsigned int irq)
++static void __mask_IO_APIC_irq(struct irq_cfg *cfg)
+ {
+- io_apic_modify_irq(irq, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync);
++ io_apic_modify_irq(cfg, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync);
+ }
+ #else /* CONFIG_X86_32 */
+-static void __mask_IO_APIC_irq(unsigned int irq)
++static void __mask_IO_APIC_irq(struct irq_cfg *cfg)
+ {
+- io_apic_modify_irq(irq, ~0, IO_APIC_REDIR_MASKED, NULL);
++ io_apic_modify_irq(cfg, ~0, IO_APIC_REDIR_MASKED, NULL);
+ }
+
+-static void __mask_and_edge_IO_APIC_irq(unsigned int irq)
++static void __mask_and_edge_IO_APIC_irq(struct irq_cfg *cfg)
+ {
+- io_apic_modify_irq(irq, ~IO_APIC_REDIR_LEVEL_TRIGGER,
++ io_apic_modify_irq(cfg, ~IO_APIC_REDIR_LEVEL_TRIGGER,
+ IO_APIC_REDIR_MASKED, NULL);
+ }
+
+-static void __unmask_and_level_IO_APIC_irq(unsigned int irq)
++static void __unmask_and_level_IO_APIC_irq(struct irq_cfg *cfg)
+ {
+- io_apic_modify_irq(irq, ~IO_APIC_REDIR_MASKED,
++ io_apic_modify_irq(cfg, ~IO_APIC_REDIR_MASKED,
+ IO_APIC_REDIR_LEVEL_TRIGGER, NULL);
+ }
+ #endif /* CONFIG_X86_32 */
+
+-static void mask_IO_APIC_irq (unsigned int irq)
++static void mask_IO_APIC_irq_desc(struct irq_desc *desc)
+ {
++ struct irq_cfg *cfg = desc->chip_data;
+ unsigned long flags;
+
++ BUG_ON(!cfg);
++
+ spin_lock_irqsave(&ioapic_lock, flags);
+- __mask_IO_APIC_irq(irq);
++ __mask_IO_APIC_irq(cfg);
+ spin_unlock_irqrestore(&ioapic_lock, flags);
+ }
+
+-static void unmask_IO_APIC_irq (unsigned int irq)
++static void unmask_IO_APIC_irq_desc(struct irq_desc *desc)
+ {
++ struct irq_cfg *cfg = desc->chip_data;
+ unsigned long flags;
+
+ spin_lock_irqsave(&ioapic_lock, flags);
+- __unmask_IO_APIC_irq(irq);
++ __unmask_IO_APIC_irq(cfg);
+ spin_unlock_irqrestore(&ioapic_lock, flags);
+ }
+
++static void mask_IO_APIC_irq(unsigned int irq)
++{
++ struct irq_desc *desc = irq_to_desc(irq);
++
++ mask_IO_APIC_irq_desc(desc);
++}
++static void unmask_IO_APIC_irq(unsigned int irq)
++{
++ struct irq_desc *desc = irq_to_desc(irq);
++
++ unmask_IO_APIC_irq_desc(desc);
++}
++
+ static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
+ {
+ struct IO_APIC_route_entry entry;
+@@ -618,6 +851,8 @@ void send_IPI_self(int vector)
+ apic_write(APIC_ICR, cfg);
+ }
+ #endif /* !CONFIG_SMP && CONFIG_X86_32*/
++#else
++#define add_pin_to_irq_cpu(cfg, cpu, apic, pin)
+ #endif /* !CONFIG_XEN */
+
+ #if defined(CONFIG_X86_32) && !defined(CONFIG_XEN)
+@@ -858,7 +1093,7 @@ EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector
+ */
+ static int EISA_ELCR(unsigned int irq)
+ {
+- if (irq < 16) {
++ if (irq < NR_IRQS_LEGACY) {
+ unsigned int port = 0x4d0 + (irq >> 3);
+ return (inb(port) >> (irq & 7)) & 1;
+ }
+@@ -1083,52 +1318,114 @@ void unlock_vector_lock(void)
+ {
+ spin_unlock(&vector_lock);
+ }
+-#endif
+
+-static int assign_irq_vector(int irq, cpumask_t mask)
++static int
++__assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)
+ {
+- struct physdev_irq irq_op;
+- struct irq_cfg *cfg;
+-
+- if (irq < PIRQ_BASE || irq - PIRQ_BASE >= NR_PIRQS)
+- return -EINVAL;
++ /*
++ * NOTE! The local APIC isn't very good at handling
++ * multiple interrupts at the same interrupt level.
++ * As the interrupt level is determined by taking the
++ * vector number and shifting that right by 4, we
++ * want to spread these out a bit so that they don't
++ * all fall in the same interrupt level.
++ *
++ * Also, we've got to be careful not to trash gate
++ * 0x80, because int 0x80 is hm, kind of importantish. ;)
++ */
++ static int current_vector = FIRST_DEVICE_VECTOR, current_offset = 0;
++ unsigned int old_vector;
++ int cpu, err;
++ cpumask_var_t tmp_mask;
++
++ if ((cfg->move_in_progress) || cfg->move_cleanup_count)
++ return -EBUSY;
++
++ if (!alloc_cpumask_var(&tmp_mask, GFP_ATOMIC))
++ return -ENOMEM;
++
++ old_vector = cfg->vector;
++ if (old_vector) {
++ cpumask_and(tmp_mask, mask, cpu_online_mask);
++ cpumask_and(tmp_mask, cfg->domain, tmp_mask);
++ if (!cpumask_empty(tmp_mask)) {
++ free_cpumask_var(tmp_mask);
++ return 0;
++ }
++ }
+
+- cfg = irq_cfg(irq);
++ /* Only try and allocate irqs on cpus that are present */
++ err = -ENOSPC;
++ for_each_cpu_and(cpu, mask, cpu_online_mask) {
++ int new_cpu;
++ int vector, offset;
++
++ vector_allocation_domain(cpu, tmp_mask);
++
++ vector = current_vector;
++ offset = current_offset;
++next:
++ vector += 8;
++ if (vector >= first_system_vector) {
++ /* If out of vectors on large boxen, must share them. */
++ offset = (offset + 1) % 8;
++ vector = FIRST_DEVICE_VECTOR + offset;
++ }
++ if (unlikely(current_vector == vector))
++ continue;
+
+- if (cfg->vector)
+- return 0;
++ if (test_bit(vector, used_vectors))
++ goto next;
+
+- irq_op.irq = irq;
+- if (HYPERVISOR_physdev_op(PHYSDEVOP_alloc_irq_vector, &irq_op))
+- return -ENOSPC;
++ for_each_cpu_and(new_cpu, tmp_mask, cpu_online_mask)
++ if (per_cpu(vector_irq, new_cpu)[vector] != -1)
++ goto next;
++ /* Found one! */
++ current_vector = vector;
++ current_offset = offset;
++ if (old_vector) {
++ cfg->move_in_progress = 1;
++ cpumask_copy(cfg->old_domain, cfg->domain);
++ }
++ for_each_cpu_and(new_cpu, tmp_mask, cpu_online_mask)
++ per_cpu(vector_irq, new_cpu)[vector] = irq;
++ cfg->vector = vector;
++ cpumask_copy(cfg->domain, tmp_mask);
++ err = 0;
++ break;
++ }
++ free_cpumask_var(tmp_mask);
++ return err;
++}
+
+- cfg->vector = irq_op.vector;
++static int
++assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)
++{
++ int err;
++ unsigned long flags;
+
+- return 0;
++ spin_lock_irqsave(&vector_lock, flags);
++ err = __assign_irq_vector(irq, cfg, mask);
++ spin_unlock_irqrestore(&vector_lock, flags);
++ return err;
+ }
+
+-#ifndef CONFIG_XEN
+-static void __clear_irq_vector(int irq)
++static void __clear_irq_vector(int irq, struct irq_cfg *cfg)
+ {
+- struct irq_cfg *cfg;
+- cpumask_t mask;
+ int cpu, vector;
+
+- cfg = irq_cfg(irq);
+ BUG_ON(!cfg->vector);
+
+ vector = cfg->vector;
+- cpus_and(mask, cfg->domain, cpu_online_map);
+- for_each_cpu_mask_nr(cpu, mask)
++ for_each_cpu_and(cpu, cfg->domain, cpu_online_mask)
+ per_cpu(vector_irq, cpu)[vector] = -1;
+
+ cfg->vector = 0;
+- cpus_clear(cfg->domain);
++ cpumask_clear(cfg->domain);
+
+ if (likely(!cfg->move_in_progress))
+ return;
+- cpus_and(mask, cfg->old_domain, cpu_online_map);
+- for_each_cpu_mask_nr(cpu, mask) {
++ for_each_cpu_and(cpu, cfg->old_domain, cpu_online_mask) {
+ for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS;
+ vector++) {
+ if (per_cpu(vector_irq, cpu)[vector] != irq)
+@@ -1146,10 +1443,12 @@ void __setup_vector_irq(int cpu)
+ /* This function must be called with vector_lock held */
+ int irq, vector;
+ struct irq_cfg *cfg;
++ struct irq_desc *desc;
+
+ /* Mark the inuse vectors */
+- for_each_irq_cfg(irq, cfg) {
+- if (!cpu_isset(cpu, cfg->domain))
++ for_each_irq_desc(irq, desc) {
++ cfg = desc->chip_data;
++ if (!cpumask_test_cpu(cpu, cfg->domain))
+ continue;
+ vector = cfg->vector;
+ per_cpu(vector_irq, cpu)[vector] = irq;
+@@ -1161,7 +1460,7 @@ void __setup_vector_irq(int cpu)
+ continue;
+
+ cfg = irq_cfg(irq);
+- if (!cpu_isset(cpu, cfg->domain))
++ if (!cpumask_test_cpu(cpu, cfg->domain))
+ per_cpu(vector_irq, cpu)[vector] = -1;
+ }
+ }
+@@ -1199,11 +1498,8 @@ static inline int IO_APIC_irq_trigger(in
+ }
+ #endif
+
+-static void ioapic_register_intr(int irq, unsigned long trigger)
++static void ioapic_register_intr(int irq, struct irq_desc *desc, unsigned long trigger)
+ {
+- struct irq_desc *desc;
+-
+- desc = irq_to_desc(irq);
+
+ if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
+ trigger == IOAPIC_LEVEL)
+@@ -1234,8 +1530,8 @@ static void ioapic_register_intr(int irq
+ handle_edge_irq, "edge");
+ }
+ #else /* !CONFIG_XEN */
+-#define __clear_irq_vector(irq) ((void)(irq))
+-#define ioapic_register_intr(irq, trigger) evtchn_register_pirq(irq)
++#define __clear_irq_vector(irq, cfg) ((void)0)
++#define ioapic_register_intr(irq, desc, trigger) evtchn_register_pirq(irq)
+ #endif
+
+ static int setup_ioapic_entry(int apic, int irq,
+@@ -1299,24 +1595,25 @@ static int setup_ioapic_entry(int apic,
+ return 0;
+ }
+
+-static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq,
++static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq, struct irq_desc *desc,
+ int trigger, int polarity)
+ {
+ struct irq_cfg *cfg;
+ struct IO_APIC_route_entry entry;
+- cpumask_t mask;
++ unsigned int dest;
+
+ if (!IO_APIC_IRQ(irq))
+ return;
+
+- cfg = irq_cfg(irq);
++ cfg = desc->chip_data;
+
+- mask = TARGET_CPUS;
+- if (assign_irq_vector(irq, mask))
++ if (assign_irq_vector(irq, cfg, TARGET_CPUS))
+ return;
+
+ #ifndef CONFIG_XEN
+- cpus_and(mask, cfg->domain, mask);
++ dest = cpu_mask_to_apicid_and(cfg->domain, TARGET_CPUS);
++#else
++ dest = cpu_mask_to_apicid(TARGET_CPUS);
+ #endif
+
+ apic_printk(APIC_VERBOSE,KERN_DEBUG
+@@ -1327,16 +1624,15 @@ static void setup_IO_APIC_irq(int apic,
+
+
+ if (setup_ioapic_entry(mp_ioapics[apic].mp_apicid, irq, &entry,
+- cpu_mask_to_apicid(mask), trigger, polarity,
+- cfg->vector)) {
++ dest, trigger, polarity, cfg->vector)) {
+ printk("Failed to setup ioapic entry for ioapic %d, pin %d\n",
+ mp_ioapics[apic].mp_apicid, pin);
+- __clear_irq_vector(irq);
++ __clear_irq_vector(irq, cfg);
+ return;
+ }
+
+- ioapic_register_intr(irq, trigger);
+- if (irq < 16)
++ ioapic_register_intr(irq, desc, trigger);
++ if (irq < NR_IRQS_LEGACY)
+ disable_8259A_irq(irq);
+
+ ioapic_write_entry(apic, pin, entry);
+@@ -1346,6 +1642,9 @@ static void __init setup_IO_APIC_irqs(vo
+ {
+ int apic, pin, idx, irq;
+ int notcon = 0;
++ struct irq_desc *desc;
++ struct irq_cfg *cfg;
++ int cpu = boot_cpu_id;
+
+ apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
+
+@@ -1380,9 +1679,15 @@ static void __init setup_IO_APIC_irqs(vo
+ if (multi_timer_check(apic, irq))
+ continue;
+ #endif
+- add_pin_to_irq(irq, apic, pin);
++ desc = irq_to_desc_alloc_cpu(irq, cpu);
++ if (!desc) {
++ printk(KERN_INFO "can not get irq_desc for %d\n", irq);
++ continue;
++ }
++ cfg = desc->chip_data;
++ add_pin_to_irq_cpu(cfg, cpu, apic, pin);
+
+- setup_IO_APIC_irq(apic, pin, irq,
++ setup_IO_APIC_irq(apic, pin, irq, desc,
+ irq_trigger(idx), irq_polarity(idx));
+ }
+ }
+@@ -1442,6 +1747,7 @@ __apicdebuginit(void) print_IO_APIC(void
+ union IO_APIC_reg_03 reg_03;
+ unsigned long flags;
+ struct irq_cfg *cfg;
++ struct irq_desc *desc;
+ unsigned int irq;
+
+ if (apic_verbosity == APIC_QUIET)
+@@ -1531,8 +1837,11 @@ __apicdebuginit(void) print_IO_APIC(void
+ }
+ }
+ printk(KERN_DEBUG "IRQ to pin mappings:\n");
+- for_each_irq_cfg(irq, cfg) {
+- struct irq_pin_list *entry = cfg->irq_2_pin;
++ for_each_irq_desc(irq, desc) {
++ struct irq_pin_list *entry;
++
++ cfg = desc->chip_data;
++ entry = cfg->irq_2_pin;
+ if (!entry)
+ continue;
+ printk(KERN_DEBUG "IRQ%d ", irq);
+@@ -2022,14 +2331,16 @@ static unsigned int startup_ioapic_irq(u
+ {
+ int was_pending = 0;
+ unsigned long flags;
++ struct irq_cfg *cfg;
+
+ spin_lock_irqsave(&ioapic_lock, flags);
+- if (irq < 16) {
++ if (irq < NR_IRQS_LEGACY) {
+ disable_8259A_irq(irq);
+ if (i8259A_irq_pending(irq))
+ was_pending = 1;
+ }
+- __unmask_IO_APIC_irq(irq);
++ cfg = irq_cfg(irq);
++ __unmask_IO_APIC_irq(cfg);
+ spin_unlock_irqrestore(&ioapic_lock, flags);
+
+ return was_pending;
+@@ -2043,7 +2354,7 @@ static int ioapic_retrigger_irq(unsigned
+ unsigned long flags;
+
+ spin_lock_irqsave(&vector_lock, flags);
+- send_IPI_mask(cpumask_of_cpu(first_cpu(cfg->domain)), cfg->vector);
++ send_IPI_mask(cpumask_of(cpumask_first(cfg->domain)), cfg->vector);
+ spin_unlock_irqrestore(&vector_lock, flags);
+
+ return 1;
+@@ -2092,35 +2403,35 @@ static DECLARE_DELAYED_WORK(ir_migration
+ * as simple as edge triggered migration and we can do the irq migration
+ * with a simple atomic update to IO-APIC RTE.
+ */
+-static void migrate_ioapic_irq(int irq, cpumask_t mask)
++static void
++migrate_ioapic_irq_desc(struct irq_desc *desc, const struct cpumask *mask)
+ {
+ struct irq_cfg *cfg;
+- struct irq_desc *desc;
+- cpumask_t tmp, cleanup_mask;
+ struct irte irte;
+ int modify_ioapic_rte;
+ unsigned int dest;
+ unsigned long flags;
++ unsigned int irq;
+
+- cpus_and(tmp, mask, cpu_online_map);
+- if (cpus_empty(tmp))
++ if (!cpumask_intersects(mask, cpu_online_mask))
+ return;
+
++ irq = desc->irq;
+ if (get_irte(irq, &irte))
+ return;
+
+- if (assign_irq_vector(irq, mask))
++ cfg = desc->chip_data;
++ if (assign_irq_vector(irq, cfg, mask))
+ return;
+
+- cfg = irq_cfg(irq);
+- cpus_and(tmp, cfg->domain, mask);
+- dest = cpu_mask_to_apicid(tmp);
++ set_extra_move_desc(desc, mask);
++
++ dest = cpu_mask_to_apicid_and(cfg->domain, mask);
+
+- desc = irq_to_desc(irq);
+ modify_ioapic_rte = desc->status & IRQ_LEVEL;
+ if (modify_ioapic_rte) {
+ spin_lock_irqsave(&ioapic_lock, flags);
+- __target_IO_APIC_irq(irq, dest, cfg->vector);
++ __target_IO_APIC_irq(irq, dest, cfg);
+ spin_unlock_irqrestore(&ioapic_lock, flags);
+ }
+
+@@ -2132,24 +2443,20 @@ static void migrate_ioapic_irq(int irq,
+ */
+ modify_irte(irq, &irte);
+
+- if (cfg->move_in_progress) {
+- cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map);
+- cfg->move_cleanup_count = cpus_weight(cleanup_mask);
+- send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
+- cfg->move_in_progress = 0;
+- }
++ if (cfg->move_in_progress)
++ send_cleanup_vector(cfg);
+
+- desc->affinity = mask;
++ cpumask_copy(&desc->affinity, mask);
+ }
+
+-static int migrate_irq_remapped_level(int irq)
++static int migrate_irq_remapped_level_desc(struct irq_desc *desc)
+ {
+ int ret = -1;
+- struct irq_desc *desc = irq_to_desc(irq);
++ struct irq_cfg *cfg = desc->chip_data;
+
+- mask_IO_APIC_irq(irq);
++ mask_IO_APIC_irq_desc(desc);
+
+- if (io_apic_level_ack_pending(irq)) {
++ if (io_apic_level_ack_pending(cfg)) {
+ /*
+ * Interrupt in progress. Migrating irq now will change the
+ * vector information in the IO-APIC RTE and that will confuse
+@@ -2161,14 +2468,15 @@ static int migrate_irq_remapped_level(in
+ }
+
+ /* everthing is clear. we have right of way */
+- migrate_ioapic_irq(irq, desc->pending_mask);
++ migrate_ioapic_irq_desc(desc, &desc->pending_mask);
+
+ ret = 0;
+ desc->status &= ~IRQ_MOVE_PENDING;
+- cpus_clear(desc->pending_mask);
++ cpumask_clear(&desc->pending_mask);
+
+ unmask:
+- unmask_IO_APIC_irq(irq);
++ unmask_IO_APIC_irq_desc(desc);
++
+ return ret;
+ }
+
+@@ -2189,7 +2497,7 @@ static void ir_irq_migration(struct work
+ continue;
+ }
+
+- desc->chip->set_affinity(irq, desc->pending_mask);
++ desc->chip->set_affinity(irq, &desc->pending_mask);
+ spin_unlock_irqrestore(&desc->lock, flags);
+ }
+ }
+@@ -2198,28 +2506,33 @@ static void ir_irq_migration(struct work
+ /*
+ * Migrates the IRQ destination in the process context.
+ */
+-static void set_ir_ioapic_affinity_irq(unsigned int irq, cpumask_t mask)
++static void set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc,
++ const struct cpumask *mask)
+ {
+- struct irq_desc *desc = irq_to_desc(irq);
+-
+ if (desc->status & IRQ_LEVEL) {
+ desc->status |= IRQ_MOVE_PENDING;
+- desc->pending_mask = mask;
+- migrate_irq_remapped_level(irq);
++ cpumask_copy(&desc->pending_mask, mask);
++ migrate_irq_remapped_level_desc(desc);
+ return;
+ }
+
+- migrate_ioapic_irq(irq, mask);
++ migrate_ioapic_irq_desc(desc, mask);
++}
++static void set_ir_ioapic_affinity_irq(unsigned int irq,
++ const struct cpumask *mask)
++{
++ struct irq_desc *desc = irq_to_desc(irq);
++
++ set_ir_ioapic_affinity_irq_desc(desc, mask);
+ }
+ #endif
+
+ asmlinkage void smp_irq_move_cleanup_interrupt(void)
+ {
+ unsigned vector, me;
++
+ ack_APIC_irq();
+-#ifdef CONFIG_X86_64
+ exit_idle();
+-#endif
+ irq_enter();
+
+ me = smp_processor_id();
+@@ -2229,6 +2542,9 @@ asmlinkage void smp_irq_move_cleanup_int
+ struct irq_cfg *cfg;
+ irq = __get_cpu_var(vector_irq)[vector];
+
++ if (irq == -1)
++ continue;
++
+ desc = irq_to_desc(irq);
+ if (!desc)
+ continue;
+@@ -2238,7 +2554,7 @@ asmlinkage void smp_irq_move_cleanup_int
+ if (!cfg->move_cleanup_count)
+ goto unlock;
+
+- if ((vector == cfg->vector) && cpu_isset(me, cfg->domain))
++ if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain))
+ goto unlock;
+
+ __get_cpu_var(vector_irq)[vector] = -1;
+@@ -2250,28 +2566,45 @@ unlock:
+ irq_exit();
+ }
+
+-static void irq_complete_move(unsigned int irq)
++static void irq_complete_move(struct irq_desc **descp)
+ {
+- struct irq_cfg *cfg = irq_cfg(irq);
++ struct irq_desc *desc = *descp;
++ struct irq_cfg *cfg = desc->chip_data;
+ unsigned vector, me;
+
+- if (likely(!cfg->move_in_progress))
++ if (likely(!cfg->move_in_progress)) {
++#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC
++ if (likely(!cfg->move_desc_pending))
++ return;
++
++ /* domain has not changed, but affinity did */
++ me = smp_processor_id();
++ if (cpu_isset(me, desc->affinity)) {
++ *descp = desc = move_irq_desc(desc, me);
++ /* get the new one */
++ cfg = desc->chip_data;
++ cfg->move_desc_pending = 0;
++ }
++#endif
+ return;
++ }
+
+ vector = ~get_irq_regs()->orig_ax;
+ me = smp_processor_id();
+- if ((vector == cfg->vector) && cpu_isset(me, cfg->domain)) {
+- cpumask_t cleanup_mask;
+
+- cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map);
+- cfg->move_cleanup_count = cpus_weight(cleanup_mask);
+- send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
+- cfg->move_in_progress = 0;
++ if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain)) {
++#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC
++ *descp = desc = move_irq_desc(desc, me);
++ /* get the new one */
++ cfg = desc->chip_data;
++#endif
++ send_cleanup_vector(cfg);
+ }
+ }
+ #else
+-static inline void irq_complete_move(unsigned int irq) {}
++static inline void irq_complete_move(struct irq_desc **descp) {}
+ #endif
++
+ #ifdef CONFIG_INTR_REMAP
+ static void ack_x2apic_level(unsigned int irq)
+ {
+@@ -2282,11 +2615,14 @@ static void ack_x2apic_edge(unsigned int
+ {
+ ack_x2APIC_irq();
+ }
++
+ #endif
+
+ static void ack_apic_edge(unsigned int irq)
+ {
+- irq_complete_move(irq);
++ struct irq_desc *desc = irq_to_desc(irq);
++
++ irq_complete_move(&desc);
+ move_native_irq(irq);
+ ack_APIC_irq();
+ }
+@@ -2295,18 +2631,21 @@ atomic_t irq_mis_count;
+
+ static void ack_apic_level(unsigned int irq)
+ {
++ struct irq_desc *desc = irq_to_desc(irq);
++
+ #ifdef CONFIG_X86_32
+ unsigned long v;
+ int i;
+ #endif
++ struct irq_cfg *cfg;
+ int do_unmask_irq = 0;
+
+- irq_complete_move(irq);
++ irq_complete_move(&desc);
+ #ifdef CONFIG_GENERIC_PENDING_IRQ
+ /* If we are moving the irq we need to mask it */
+- if (unlikely(irq_to_desc(irq)->status & IRQ_MOVE_PENDING)) {
++ if (unlikely(desc->status & IRQ_MOVE_PENDING)) {
+ do_unmask_irq = 1;
+- mask_IO_APIC_irq(irq);
++ mask_IO_APIC_irq_desc(desc);
+ }
+ #endif
+
+@@ -2330,7 +2669,8 @@ static void ack_apic_level(unsigned int
+ * operation to prevent an edge-triggered interrupt escaping meanwhile.
+ * The idea is from Manfred Spraul. --macro
+ */
+- i = irq_cfg(irq)->vector;
++ cfg = desc->chip_data;
++ i = cfg->vector;
+
+ v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1));
+ #endif
+@@ -2369,17 +2709,18 @@ static void ack_apic_level(unsigned int
+ * accurate and is causing problems then it is a hardware bug
+ * and you can go talk to the chipset vendor about it.
+ */
+- if (!io_apic_level_ack_pending(irq))
++ cfg = desc->chip_data;
++ if (!io_apic_level_ack_pending(cfg))
+ move_masked_irq(irq);
+- unmask_IO_APIC_irq(irq);
++ unmask_IO_APIC_irq_desc(desc);
+ }
+
+ #ifdef CONFIG_X86_32
+ if (!(v & (1 << (i & 0x1f)))) {
+ atomic_inc(&irq_mis_count);
+ spin_lock(&ioapic_lock);
+- __mask_and_edge_IO_APIC_irq(irq);
+- __unmask_and_level_IO_APIC_irq(irq);
++ __mask_and_edge_IO_APIC_irq(cfg);
++ __unmask_and_level_IO_APIC_irq(cfg);
+ spin_unlock(&ioapic_lock);
+ }
+ #endif
+@@ -2431,24 +2772,23 @@ static inline void init_IO_APIC_traps(vo
+ * Also, we've got to be careful not to trash gate
+ * 0x80, because int 0x80 is hm, kind of importantish. ;)
+ */
+- for_each_irq_cfg(irq, cfg) {
++ for_each_irq_desc(irq, desc) {
+ #ifdef CONFIG_XEN
+ if (irq < PIRQ_BASE || irq >= PIRQ_BASE + NR_PIRQS)
+ continue;
+ #endif
+- if (IO_APIC_IRQ(irq) && !cfg->vector) {
++ cfg = desc->chip_data;
++ if (IO_APIC_IRQ(irq) && cfg && !cfg->vector) {
+ /*
+ * Hmm.. We don't have an entry for this,
+ * so default to an old-fashioned 8259
+ * interrupt if we can..
+ */
+- if (irq < 16)
++ if (irq < NR_IRQS_LEGACY)
+ make_8259A_irq(irq);
+- else {
+- desc = irq_to_desc(irq);
++ else
+ /* Strange. Oh, well.. */
+ desc->chip = &no_irq_chip;
+- }
+ }
+ }
+ }
+@@ -2474,7 +2814,7 @@ static void unmask_lapic_irq(unsigned in
+ apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED);
+ }
+
+-static void ack_lapic_irq (unsigned int irq)
++static void ack_lapic_irq(unsigned int irq)
+ {
+ ack_APIC_irq();
+ }
+@@ -2486,11 +2826,8 @@ static struct irq_chip lapic_chip __read
+ .ack = ack_lapic_irq,
+ };
+
+-static void lapic_register_intr(int irq)
++static void lapic_register_intr(int irq, struct irq_desc *desc)
+ {
+- struct irq_desc *desc;
+-
+- desc = irq_to_desc(irq);
+ desc->status &= ~IRQ_LEVEL;
+ set_irq_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq,
+ "edge");
+@@ -2594,7 +2931,9 @@ int timer_through_8259 __initdata;
+ */
+ static inline void __init check_timer(void)
+ {
+- struct irq_cfg *cfg = irq_cfg(0);
++ struct irq_desc *desc = irq_to_desc(0);
++ struct irq_cfg *cfg = desc->chip_data;
++ int cpu = boot_cpu_id;
+ int apic1, pin1, apic2, pin2;
+ unsigned long flags;
+ unsigned int ver;
+@@ -2609,7 +2948,7 @@ static inline void __init check_timer(vo
+ * get/set the timer IRQ vector:
+ */
+ disable_8259A_irq(0);
+- assign_irq_vector(0, TARGET_CPUS);
++ assign_irq_vector(0, cfg, TARGET_CPUS);
+
+ /*
+ * As IRQ0 is to be enabled in the 8259A, the virtual
+@@ -2660,10 +2999,10 @@ static inline void __init check_timer(vo
+ * Ok, does IRQ0 through the IOAPIC work?
+ */
+ if (no_pin1) {
+- add_pin_to_irq(0, apic1, pin1);
++ add_pin_to_irq_cpu(cfg, cpu, apic1, pin1);
+ setup_timer_IRQ0_pin(apic1, pin1, cfg->vector);
+ }
+- unmask_IO_APIC_irq(0);
++ unmask_IO_APIC_irq_desc(desc);
+ if (timer_irq_works()) {
+ if (nmi_watchdog == NMI_IO_APIC) {
+ setup_nmi();
+@@ -2689,9 +3028,9 @@ static inline void __init check_timer(vo
+ /*
+ * legacy devices should be connected to IO APIC #0
+ */
+- replace_pin_at_irq(0, apic1, pin1, apic2, pin2);
++ replace_pin_at_irq_cpu(cfg, cpu, apic1, pin1, apic2, pin2);
+ setup_timer_IRQ0_pin(apic2, pin2, cfg->vector);
+- unmask_IO_APIC_irq(0);
++ unmask_IO_APIC_irq_desc(desc);
+ enable_8259A_irq(0);
+ if (timer_irq_works()) {
+ apic_printk(APIC_QUIET, KERN_INFO "....... works.\n");
+@@ -2723,7 +3062,7 @@ static inline void __init check_timer(vo
+ apic_printk(APIC_QUIET, KERN_INFO
+ "...trying to set up timer as Virtual Wire IRQ...\n");
+
+- lapic_register_intr(0);
++ lapic_register_intr(0, desc);
+ apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector); /* Fixed mode */
+ enable_8259A_irq(0);
+
+@@ -2922,22 +3261,26 @@ unsigned int create_irq_nr(unsigned int
+ unsigned int irq;
+ unsigned int new;
+ unsigned long flags;
+- struct irq_cfg *cfg_new;
+-
+- irq_want = nr_irqs - 1;
++ struct irq_cfg *cfg_new = NULL;
++ int cpu = boot_cpu_id;
++ struct irq_desc *desc_new = NULL;
+
+ irq = 0;
+ spin_lock_irqsave(&vector_lock, flags);
+- for (new = irq_want; new > 0; new--) {
++ for (new = irq_want; new < NR_IRQS; new++) {
+ if (platform_legacy_irq(new))
+ continue;
+- cfg_new = irq_cfg(new);
+- if (cfg_new && cfg_new->vector != 0)
++
++ desc_new = irq_to_desc_alloc_cpu(new, cpu);
++ if (!desc_new) {
++ printk(KERN_INFO "can not get irq_desc for %d\n", new);
++ continue;
++ }
++ cfg_new = desc_new->chip_data;
++
++ if (cfg_new->vector != 0)
+ continue;
+- /* check if need to create one */
+- if (!cfg_new)
+- cfg_new = irq_cfg_alloc(new);
+- if (__assign_irq_vector(new, TARGET_CPUS) == 0)
++ if (__assign_irq_vector(new, cfg_new, TARGET_CPUS) == 0)
+ irq = new;
+ break;
+ }
+@@ -2945,15 +3288,21 @@ unsigned int create_irq_nr(unsigned int
+
+ if (irq > 0) {
+ dynamic_irq_init(irq);
++ /* restore it, in case dynamic_irq_init clear it */
++ if (desc_new)
++ desc_new->chip_data = cfg_new;
+ }
+ return irq;
+ }
+
++static int nr_irqs_gsi = NR_IRQS_LEGACY;
+ int create_irq(void)
+ {
++ unsigned int irq_want;
+ int irq;
+
+- irq = create_irq_nr(nr_irqs - 1);
++ irq_want = nr_irqs_gsi;
++ irq = create_irq_nr(irq_want);
+
+ if (irq == 0)
+ irq = -1;
+@@ -2964,14 +3313,22 @@ int create_irq(void)
+ void destroy_irq(unsigned int irq)
+ {
+ unsigned long flags;
++ struct irq_cfg *cfg;
++ struct irq_desc *desc;
+
++ /* store it, in case dynamic_irq_cleanup clear it */
++ desc = irq_to_desc(irq);
++ cfg = desc->chip_data;
+ dynamic_irq_cleanup(irq);
++ /* connect back irq_cfg */
++ if (desc)
++ desc->chip_data = cfg;
+
+ #ifdef CONFIG_INTR_REMAP
+ free_irte(irq);
+ #endif
+ spin_lock_irqsave(&vector_lock, flags);
+- __clear_irq_vector(irq);
++ __clear_irq_vector(irq, cfg);
+ spin_unlock_irqrestore(&vector_lock, flags);
+ }
+ #endif /* !CONFIG_XEN */
+@@ -2985,16 +3342,13 @@ static int msi_compose_msg(struct pci_de
+ struct irq_cfg *cfg;
+ int err;
+ unsigned dest;
+- cpumask_t tmp;
+
+- tmp = TARGET_CPUS;
+- err = assign_irq_vector(irq, tmp);
++ cfg = irq_cfg(irq);
++ err = assign_irq_vector(irq, cfg, TARGET_CPUS);
+ if (err)
+ return err;
+
+- cfg = irq_cfg(irq);
+- cpus_and(tmp, cfg->domain, tmp);
+- dest = cpu_mask_to_apicid(tmp);
++ dest = cpu_mask_to_apicid_and(cfg->domain, TARGET_CPUS);
+
+ #ifdef CONFIG_INTR_REMAP
+ if (irq_remapped(irq)) {
+@@ -3048,64 +3402,48 @@ static int msi_compose_msg(struct pci_de
+ }
+
+ #ifdef CONFIG_SMP
+-static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask)
++static void set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)
+ {
++ struct irq_desc *desc = irq_to_desc(irq);
+ struct irq_cfg *cfg;
+ struct msi_msg msg;
+ unsigned int dest;
+- cpumask_t tmp;
+- struct irq_desc *desc;
+
+- cpus_and(tmp, mask, cpu_online_map);
+- if (cpus_empty(tmp))
++ dest = set_desc_affinity(desc, mask);
++ if (dest == BAD_APICID)
+ return;
+
+- if (assign_irq_vector(irq, mask))
+- return;
+-
+- cfg = irq_cfg(irq);
+- cpus_and(tmp, cfg->domain, mask);
+- dest = cpu_mask_to_apicid(tmp);
++ cfg = desc->chip_data;
+
+- read_msi_msg(irq, &msg);
++ read_msi_msg_desc(desc, &msg);
+
+ msg.data &= ~MSI_DATA_VECTOR_MASK;
+ msg.data |= MSI_DATA_VECTOR(cfg->vector);
+ msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK;
+ msg.address_lo |= MSI_ADDR_DEST_ID(dest);
+
+- write_msi_msg(irq, &msg);
+- desc = irq_to_desc(irq);
+- desc->affinity = mask;
++ write_msi_msg_desc(desc, &msg);
+ }
+-
+ #ifdef CONFIG_INTR_REMAP
+ /*
+ * Migrate the MSI irq to another cpumask. This migration is
+ * done in the process context using interrupt-remapping hardware.
+ */
+-static void ir_set_msi_irq_affinity(unsigned int irq, cpumask_t mask)
++static void
++ir_set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)
+ {
+- struct irq_cfg *cfg;
++ struct irq_desc *desc = irq_to_desc(irq);
++ struct irq_cfg *cfg = desc->chip_data;
+ unsigned int dest;
+- cpumask_t tmp, cleanup_mask;
+ struct irte irte;
+- struct irq_desc *desc;
+-
+- cpus_and(tmp, mask, cpu_online_map);
+- if (cpus_empty(tmp))
+- return;
+
+ if (get_irte(irq, &irte))
+ return;
+
+- if (assign_irq_vector(irq, mask))
++ dest = set_desc_affinity(desc, mask);
++ if (dest == BAD_APICID)
+ return;
+
+- cfg = irq_cfg(irq);
+- cpus_and(tmp, cfg->domain, mask);
+- dest = cpu_mask_to_apicid(tmp);
+-
+ irte.vector = cfg->vector;
+ irte.dest_id = IRTE_DEST(dest);
+
+@@ -3119,16 +3457,10 @@ static void ir_set_msi_irq_affinity(unsi
+ * at the new destination. So, time to cleanup the previous
+ * vector allocation.
+ */
+- if (cfg->move_in_progress) {
+- cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map);
+- cfg->move_cleanup_count = cpus_weight(cleanup_mask);
+- send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
+- cfg->move_in_progress = 0;
+- }
+-
+- desc = irq_to_desc(irq);
+- desc->affinity = mask;
++ if (cfg->move_in_progress)
++ send_cleanup_vector(cfg);
+ }
++
+ #endif
+ #endif /* CONFIG_SMP */
+
+@@ -3187,7 +3519,7 @@ static int msi_alloc_irte(struct pci_dev
+ }
+ #endif
+
+-static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc, int irq)
++static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq)
+ {
+ int ret;
+ struct msi_msg msg;
+@@ -3196,7 +3528,7 @@ static int setup_msi_irq(struct pci_dev
+ if (ret < 0)
+ return ret;
+
+- set_irq_msi(irq, desc);
++ set_irq_msi(irq, msidesc);
+ write_msi_msg(irq, &msg);
+
+ #ifdef CONFIG_INTR_REMAP
+@@ -3216,26 +3548,13 @@ static int setup_msi_irq(struct pci_dev
+ return 0;
+ }
+
+-static unsigned int build_irq_for_pci_dev(struct pci_dev *dev)
+-{
+- unsigned int irq;
+-
+- irq = dev->bus->number;
+- irq <<= 8;
+- irq |= dev->devfn;
+- irq <<= 12;
+-
+- return irq;
+-}
+-
+-int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc)
++int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc)
+ {
+ unsigned int irq;
+ int ret;
+ unsigned int irq_want;
+
+- irq_want = build_irq_for_pci_dev(dev) + 0x100;
+-
++ irq_want = nr_irqs_gsi;
+ irq = create_irq_nr(irq_want);
+ if (irq == 0)
+ return -1;
+@@ -3249,7 +3568,7 @@ int arch_setup_msi_irq(struct pci_dev *d
+ goto error;
+ no_ir:
+ #endif
+- ret = setup_msi_irq(dev, desc, irq);
++ ret = setup_msi_irq(dev, msidesc, irq);
+ if (ret < 0) {
+ destroy_irq(irq);
+ return ret;
+@@ -3267,7 +3586,7 @@ int arch_setup_msi_irqs(struct pci_dev *
+ {
+ unsigned int irq;
+ int ret, sub_handle;
+- struct msi_desc *desc;
++ struct msi_desc *msidesc;
+ unsigned int irq_want;
+
+ #ifdef CONFIG_INTR_REMAP
+@@ -3275,10 +3594,11 @@ int arch_setup_msi_irqs(struct pci_dev *
+ int index = 0;
+ #endif
+
+- irq_want = build_irq_for_pci_dev(dev) + 0x100;
++ irq_want = nr_irqs_gsi;
+ sub_handle = 0;
+- list_for_each_entry(desc, &dev->msi_list, list) {
+- irq = create_irq_nr(irq_want--);
++ list_for_each_entry(msidesc, &dev->msi_list, list) {
++ irq = create_irq_nr(irq_want);
++ irq_want++;
+ if (irq == 0)
+ return -1;
+ #ifdef CONFIG_INTR_REMAP
+@@ -3310,7 +3630,7 @@ int arch_setup_msi_irqs(struct pci_dev *
+ }
+ no_ir:
+ #endif
+- ret = setup_msi_irq(dev, desc, irq);
++ ret = setup_msi_irq(dev, msidesc, irq);
+ if (ret < 0)
+ goto error;
+ sub_handle++;
+@@ -3329,24 +3649,18 @@ void arch_teardown_msi_irq(unsigned int
+
+ #ifdef CONFIG_DMAR
+ #ifdef CONFIG_SMP
+-static void dmar_msi_set_affinity(unsigned int irq, cpumask_t mask)
++static void dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
+ {
++ struct irq_desc *desc = irq_to_desc(irq);
+ struct irq_cfg *cfg;
+ struct msi_msg msg;
+ unsigned int dest;
+- cpumask_t tmp;
+- struct irq_desc *desc;
+
+- cpus_and(tmp, mask, cpu_online_map);
+- if (cpus_empty(tmp))
++ dest = set_desc_affinity(desc, mask);
++ if (dest == BAD_APICID)
+ return;
+
+- if (assign_irq_vector(irq, mask))
+- return;
+-
+- cfg = irq_cfg(irq);
+- cpus_and(tmp, cfg->domain, mask);
+- dest = cpu_mask_to_apicid(tmp);
++ cfg = desc->chip_data;
+
+ dmar_msi_read(irq, &msg);
+
+@@ -3356,9 +3670,8 @@ static void dmar_msi_set_affinity(unsign
+ msg.address_lo |= MSI_ADDR_DEST_ID(dest);
+
+ dmar_msi_write(irq, &msg);
+- desc = irq_to_desc(irq);
+- desc->affinity = mask;
+ }
++
+ #endif /* CONFIG_SMP */
+
+ struct irq_chip dmar_msi_type = {
+@@ -3390,24 +3703,18 @@ int arch_setup_dmar_msi(unsigned int irq
+ #ifdef CONFIG_HPET_TIMER
+
+ #ifdef CONFIG_SMP
+-static void hpet_msi_set_affinity(unsigned int irq, cpumask_t mask)
++static void hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
+ {
++ struct irq_desc *desc = irq_to_desc(irq);
+ struct irq_cfg *cfg;
+- struct irq_desc *desc;
+ struct msi_msg msg;
+ unsigned int dest;
+- cpumask_t tmp;
+
+- cpus_and(tmp, mask, cpu_online_map);
+- if (cpus_empty(tmp))
++ dest = set_desc_affinity(desc, mask);
++ if (dest == BAD_APICID)
+ return;
+
+- if (assign_irq_vector(irq, mask))
+- return;
+-
+- cfg = irq_cfg(irq);
+- cpus_and(tmp, cfg->domain, mask);
+- dest = cpu_mask_to_apicid(tmp);
++ cfg = desc->chip_data;
+
+ hpet_msi_read(irq, &msg);
+
+@@ -3417,9 +3724,8 @@ static void hpet_msi_set_affinity(unsign
+ msg.address_lo |= MSI_ADDR_DEST_ID(dest);
+
+ hpet_msi_write(irq, &msg);
+- desc = irq_to_desc(irq);
+- desc->affinity = mask;
+ }
++
+ #endif /* CONFIG_SMP */
+
+ struct irq_chip hpet_msi_type = {
+@@ -3472,28 +3778,21 @@ static void target_ht_irq(unsigned int i
+ write_ht_irq_msg(irq, &msg);
+ }
+
+-static void set_ht_irq_affinity(unsigned int irq, cpumask_t mask)
++static void set_ht_irq_affinity(unsigned int irq, const struct cpumask *mask)
+ {
++ struct irq_desc *desc = irq_to_desc(irq);
+ struct irq_cfg *cfg;
+ unsigned int dest;
+- cpumask_t tmp;
+- struct irq_desc *desc;
+
+- cpus_and(tmp, mask, cpu_online_map);
+- if (cpus_empty(tmp))
++ dest = set_desc_affinity(desc, mask);
++ if (dest == BAD_APICID)
+ return;
+
+- if (assign_irq_vector(irq, mask))
+- return;
+-
+- cfg = irq_cfg(irq);
+- cpus_and(tmp, cfg->domain, mask);
+- dest = cpu_mask_to_apicid(tmp);
++ cfg = desc->chip_data;
+
+ target_ht_irq(irq, dest, cfg->vector);
+- desc = irq_to_desc(irq);
+- desc->affinity = mask;
+ }
++
+ #endif
+
+ static struct irq_chip ht_irq_chip = {
+@@ -3511,17 +3810,14 @@ int arch_setup_ht_irq(unsigned int irq,
+ {
+ struct irq_cfg *cfg;
+ int err;
+- cpumask_t tmp;
+
+- tmp = TARGET_CPUS;
+- err = assign_irq_vector(irq, tmp);
++ cfg = irq_cfg(irq);
++ err = assign_irq_vector(irq, cfg, TARGET_CPUS);
+ if (!err) {
+ struct ht_irq_msg msg;
+ unsigned dest;
+
+- cfg = irq_cfg(irq);
+- cpus_and(tmp, cfg->domain, tmp);
+- dest = cpu_mask_to_apicid(tmp);
++ dest = cpu_mask_to_apicid_and(cfg->domain, TARGET_CPUS);
+
+ msg.address_hi = HT_IRQ_HIGH_DEST_ID(dest);
+
+@@ -3557,7 +3853,7 @@ int arch_setup_ht_irq(unsigned int irq,
+ int arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,
+ unsigned long mmr_offset)
+ {
+- const cpumask_t *eligible_cpu = get_cpu_mask(cpu);
++ const struct cpumask *eligible_cpu = cpumask_of(cpu);
+ struct irq_cfg *cfg;
+ int mmr_pnode;
+ unsigned long mmr_value;
+@@ -3565,7 +3861,9 @@ int arch_enable_uv_irq(char *irq_name, u
+ unsigned long flags;
+ int err;
+
+- err = assign_irq_vector(irq, *eligible_cpu);
++ cfg = irq_cfg(irq);
++
++ err = assign_irq_vector(irq, cfg, eligible_cpu);
+ if (err != 0)
+ return err;
+
+@@ -3574,8 +3872,6 @@ int arch_enable_uv_irq(char *irq_name, u
+ irq_name);
+ spin_unlock_irqrestore(&vector_lock, flags);
+
+- cfg = irq_cfg(irq);
+-
+ mmr_value = 0;
+ entry = (struct uv_IO_APIC_route_entry *)&mmr_value;
+ BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long));
+@@ -3586,7 +3882,7 @@ int arch_enable_uv_irq(char *irq_name, u
+ entry->polarity = 0;
+ entry->trigger = 0;
+ entry->mask = 0;
+- entry->dest = cpu_mask_to_apicid(*eligible_cpu);
++ entry->dest = cpu_mask_to_apicid(eligible_cpu);
+
+ mmr_pnode = uv_blade_to_pnode(mmr_blade);
+ uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value);
+@@ -3627,10 +3923,29 @@ int __init io_apic_get_redir_entries (in
+ return reg_01.bits.entries;
+ }
+
+-int __init probe_nr_irqs(void)
++#ifndef CONFIG_XEN
++void __init probe_nr_irqs_gsi(void)
+ {
+- return NR_IRQS;
++ int nr = 0;
++
++ nr = acpi_probe_gsi();
++ if (nr > nr_irqs_gsi) {
++ nr_irqs_gsi = nr;
++ } else {
++ /* for acpi=off or acpi is not compiled in */
++ int idx;
++
++ nr = 0;
++ for (idx = 0; idx < nr_ioapics; idx++)
++ nr += io_apic_get_redir_entries(idx) + 1;
++
++ if (nr > nr_irqs_gsi)
++ nr_irqs_gsi = nr;
++ }
++
++ printk(KERN_DEBUG "nr_irqs_gsi: %d\n", nr_irqs_gsi);
+ }
++#endif
+
+ /* --------------------------------------------------------------------------
+ ACPI-based IOAPIC Configuration
+@@ -3730,6 +4045,10 @@ int __init io_apic_get_version(int ioapi
+
+ int io_apic_set_pci_routing (int ioapic, int pin, int irq, int triggering, int polarity)
+ {
++ struct irq_desc *desc;
++ struct irq_cfg *cfg;
++ int cpu = boot_cpu_id;
++
+ #ifdef CONFIG_XEN
+ if (irq < PIRQ_BASE || irq >= PIRQ_BASE + NR_PIRQS) {
+ apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ %d\n",
+@@ -3744,13 +4063,21 @@ int io_apic_set_pci_routing (int ioapic,
+ return -EINVAL;
+ }
+
++ desc = irq_to_desc_alloc_cpu(irq, cpu);
++ if (!desc) {
++ printk(KERN_INFO "can not get irq_desc %d\n", irq);
++ return 0;
++ }
++
+ /*
+ * IRQs < 16 are already in the irq_2_pin[] map
+ */
+- if (irq >= 16)
+- add_pin_to_irq(irq, ioapic, pin);
++ if (irq >= NR_IRQS_LEGACY) {
++ cfg = desc->chip_data;
++ add_pin_to_irq_cpu(cfg, cpu, ioapic, pin);
++ }
+
+- setup_IO_APIC_irq(ioapic, pin, irq, triggering, polarity);
++ setup_IO_APIC_irq(ioapic, pin, irq, desc, triggering, polarity);
+
+ return 0;
+ }
+@@ -3789,7 +4116,7 @@ void __init setup_ioapic_dest(void)
+ int pin, ioapic, irq, irq_entry;
+ struct irq_desc *desc;
+ struct irq_cfg *cfg;
+- cpumask_t mask;
++ const struct cpumask *mask;
+
+ if (skip_ioapic_setup == 1)
+ return;
+@@ -3805,9 +4132,10 @@ void __init setup_ioapic_dest(void)
+ * when you have too many devices, because at that time only boot
+ * cpu is online.
+ */
+- cfg = irq_cfg(irq);
++ desc = irq_to_desc(irq);
++ cfg = desc->chip_data;
+ if (!cfg->vector) {
+- setup_IO_APIC_irq(ioapic, pin, irq,
++ setup_IO_APIC_irq(ioapic, pin, irq, desc,
+ irq_trigger(irq_entry),
+ irq_polarity(irq_entry));
+ continue;
+@@ -3817,19 +4145,18 @@ void __init setup_ioapic_dest(void)
+ /*
+ * Honour affinities which have been set in early boot
+ */
+- desc = irq_to_desc(irq);
+ if (desc->status &
+ (IRQ_NO_BALANCING | IRQ_AFFINITY_SET))
+- mask = desc->affinity;
++ mask = &desc->affinity;
+ else
+ mask = TARGET_CPUS;
+
+ #ifdef CONFIG_INTR_REMAP
+ if (intr_remapping_enabled)
+- set_ir_ioapic_affinity_irq(irq, mask);
++ set_ir_ioapic_affinity_irq_desc(desc, mask);
+ else
+ #endif
+- set_ioapic_affinity_irq(irq, mask);
++ set_ioapic_affinity_irq_desc(desc, mask);
+ }
+
+ }
+@@ -3878,7 +4205,6 @@ void __init ioapic_init_mappings(void)
+ struct resource *ioapic_res;
+ int i;
+
+- irq_2_pin_init();
+ ioapic_res = ioapic_setup_resources();
+ for (i = 0; i < nr_ioapics; i++) {
+ if (smp_found_config) {
+--- a/arch/x86/kernel/apic/ipi-xen.c
++++ b/arch/x86/kernel/apic/ipi-xen.c
+@@ -40,21 +40,29 @@ void send_IPI_self(int vector)
+ __send_IPI_shortcut(APIC_DEST_SELF, vector);
+ }
+
+-void send_IPI_mask_bitmask(cpumask_t cpumask, int vector)
++void send_IPI_mask_bitmask(const struct cpumask *cpumask, int vector)
+ {
+- cpumask_t mask;
+ unsigned int cpu;
+
+- cpus_andnot(mask, cpumask, cpu_online_map);
+- WARN_ON(!cpus_empty(mask));
+- for_each_online_cpu(cpu)
+- if (cpu_isset(cpu, cpumask))
+- __send_IPI_one(cpu, vector);
++ WARN_ON(!cpumask_subset(cpumask, cpu_online_mask));
++ for_each_cpu_and(cpu, cpumask, cpu_online_mask)
++ __send_IPI_one(cpu, vector);
+ }
+
+-void send_IPI_mask_sequence(cpumask_t mask, int vector)
++void send_IPI_mask_sequence(const struct cpumask *mask, int vector)
+ {
+ send_IPI_mask_bitmask(mask, vector);
+ }
+
++void send_IPI_mask_allbutself(const struct cpumask *mask, int vector)
++{
++ unsigned int query_cpu;
++ unsigned int this_cpu = smp_processor_id();
++
++ WARN_ON(!cpumask_subset(mask, cpu_online_mask));
++ for_each_cpu_and(query_cpu, mask, cpu_online_mask)
++ if (query_cpu != this_cpu)
++ __send_IPI_one(query_cpu, vector);
++}
++
+ #endif
--- a/arch/x86/kernel/cpu/common-xen.c
+++ b/arch/x86/kernel/cpu/common-xen.c
@@ -38,17 +38,45 @@
@@ -924,9 +3009,20 @@ Automatically created from "patches.kernel.org/patch-2.6.29" by xen-port-patches
}
}
+--- a/arch/x86/kernel/cpu/Makefile
++++ b/arch/x86/kernel/cpu/Makefile
+@@ -41,7 +41,7 @@ obj-$(CONFIG_MTRR) += mtrr/
+
+ obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o
+
+-disabled-obj-$(CONFIG_XEN) := perfctr-watchdog.o
++disabled-obj-$(CONFIG_XEN) := hypervisor.o perfctr-watchdog.o vmware.o
+
+ quiet_cmd_mkcapflags = MKCAP $@
+ cmd_mkcapflags = $(PERL) $(srctree)/$(src)/mkcapflags.pl $< $@
--- a/arch/x86/kernel/cpu/mtrr/main-xen.c
+++ b/arch/x86/kernel/cpu/mtrr/main-xen.c
-@@ -33,7 +33,7 @@ struct mtrr_ops generic_mtrr_ops = {
+@@ -34,7 +34,7 @@ struct mtrr_ops generic_mtrr_ops = {
struct mtrr_ops *mtrr_if = &generic_mtrr_ops;
unsigned int num_var_ranges;
@@ -3314,16 +5410,6 @@ Automatically created from "patches.kernel.org/patch-2.6.29" by xen-port-patches
+ * End of kprobes section
+ */
+ .popsection
---- a/arch/x86/kernel/head-xen.c
-+++ b/arch/x86/kernel/head-xen.c
-@@ -36,7 +36,6 @@ void __init reserve_ebda_region(void)
-
- /* start of EBDA area */
- ebda_addr = get_bios_ebda();
-- printk(KERN_INFO "BIOS EBDA/lowmem at: %08x/%08x\n", ebda_addr, lowmem);
-
- /* Fixup: bios puts an EBDA in the top 64K segment */
- /* of conventional memory, but does not adjust lowmem. */
--- a/arch/x86/kernel/head32-xen.c
+++ b/arch/x86/kernel/head32-xen.c
@@ -11,9 +11,12 @@
@@ -3362,1977 +5448,16 @@ Automatically created from "patches.kernel.org/patch-2.6.29" by xen-port-patches
reserve_early(__pa_symbol(&_text), __pa_symbol(&_end), "TEXT DATA BSS");
reserve_early(round_up(__pa_symbol(&_end), PAGE_SIZE),
---- a/arch/x86/kernel/apic/io_apic-xen.c
-+++ b/arch/x86/kernel/apic/io_apic-xen.c
-@@ -112,102 +112,276 @@ static int __init parse_noapic(char *str
- }
- early_param("noapic", parse_noapic);
-
-+#ifndef CONFIG_XEN
- struct irq_pin_list;
-+
-+/*
-+ * This is performance-critical, we want to do it O(1)
-+ *
-+ * the indexing order of this array favors 1:1 mappings
-+ * between pins and IRQs.
-+ */
-+
-+struct irq_pin_list {
-+ int apic, pin;
-+ struct irq_pin_list *next;
-+};
-+
-+static struct irq_pin_list *get_one_free_irq_2_pin(int cpu)
-+{
-+ struct irq_pin_list *pin;
-+ int node;
-+
-+ node = cpu_to_node(cpu);
-+
-+ pin = kzalloc_node(sizeof(*pin), GFP_ATOMIC, node);
-+
-+ return pin;
-+}
-+
- struct irq_cfg {
--#ifndef CONFIG_XEN
-- unsigned int irq;
- struct irq_pin_list *irq_2_pin;
-- cpumask_t domain;
-- cpumask_t old_domain;
-+ cpumask_var_t domain;
-+ cpumask_var_t old_domain;
- unsigned move_cleanup_count;
--#endif
- u8 vector;
--#ifndef CONFIG_XEN
- u8 move_in_progress : 1;
-+#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC
-+ u8 move_desc_pending : 1;
- #endif
- };
-
- /* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */
-+#ifdef CONFIG_SPARSE_IRQ
-+static struct irq_cfg irq_cfgx[] = {
-+#else
- static struct irq_cfg irq_cfgx[NR_IRQS] = {
-- [0] = { .irq = 0 },
-- [1] = { .irq = 1 },
-- [2] = { .irq = 2 },
-- [3] = { .irq = 3 },
-- [4] = { .irq = 4 },
-- [5] = { .irq = 5 },
-- [6] = { .irq = 6 },
-- [7] = { .irq = 7 },
-- [8] = { .irq = 8 },
-- [9] = { .irq = 9 },
-- [10] = { .irq = 10 },
-- [11] = { .irq = 11 },
-- [12] = { .irq = 12 },
-- [13] = { .irq = 13 },
-- [14] = { .irq = 14 },
-- [15] = { .irq = 15 },
-+#endif
-+ [0] = { .vector = IRQ0_VECTOR, },
-+ [1] = { .vector = IRQ1_VECTOR, },
-+ [2] = { .vector = IRQ2_VECTOR, },
-+ [3] = { .vector = IRQ3_VECTOR, },
-+ [4] = { .vector = IRQ4_VECTOR, },
-+ [5] = { .vector = IRQ5_VECTOR, },
-+ [6] = { .vector = IRQ6_VECTOR, },
-+ [7] = { .vector = IRQ7_VECTOR, },
-+ [8] = { .vector = IRQ8_VECTOR, },
-+ [9] = { .vector = IRQ9_VECTOR, },
-+ [10] = { .vector = IRQ10_VECTOR, },
-+ [11] = { .vector = IRQ11_VECTOR, },
-+ [12] = { .vector = IRQ12_VECTOR, },
-+ [13] = { .vector = IRQ13_VECTOR, },
-+ [14] = { .vector = IRQ14_VECTOR, },
-+ [15] = { .vector = IRQ15_VECTOR, },
- };
-
--#define for_each_irq_cfg(irq, cfg) \
-- for (irq = 0, cfg = irq_cfgx; irq < nr_irqs; irq++, cfg++)
-+int __init arch_early_irq_init(void)
-+{
-+ struct irq_cfg *cfg;
-+ struct irq_desc *desc;
-+ int count;
-+ int i;
-+
-+ cfg = irq_cfgx;
-+ count = ARRAY_SIZE(irq_cfgx);
-
-+ for (i = 0; i < count; i++) {
-+ desc = irq_to_desc(i);
-+ desc->chip_data = &cfg[i];
-+ alloc_bootmem_cpumask_var(&cfg[i].domain);
-+ alloc_bootmem_cpumask_var(&cfg[i].old_domain);
-+ if (i < NR_IRQS_LEGACY)
-+ cpumask_setall(cfg[i].domain);
-+ }
-+
-+ return 0;
-+}
-+
-+#ifdef CONFIG_SPARSE_IRQ
- static struct irq_cfg *irq_cfg(unsigned int irq)
- {
-- return irq < nr_irqs ? irq_cfgx + irq : NULL;
-+ struct irq_cfg *cfg = NULL;
-+ struct irq_desc *desc;
-+
-+ desc = irq_to_desc(irq);
-+ if (desc)
-+ cfg = desc->chip_data;
-+
-+ return cfg;
- }
-
--static struct irq_cfg *irq_cfg_alloc(unsigned int irq)
-+static struct irq_cfg *get_one_free_irq_cfg(int cpu)
- {
-- return irq_cfg(irq);
-+ struct irq_cfg *cfg;
-+ int node;
-+
-+ node = cpu_to_node(cpu);
-+
-+ cfg = kzalloc_node(sizeof(*cfg), GFP_ATOMIC, node);
-+ if (cfg) {
-+ if (!alloc_cpumask_var_node(&cfg->domain, GFP_ATOMIC, node)) {
-+ kfree(cfg);
-+ cfg = NULL;
-+ } else if (!alloc_cpumask_var_node(&cfg->old_domain,
-+ GFP_ATOMIC, node)) {
-+ free_cpumask_var(cfg->domain);
-+ kfree(cfg);
-+ cfg = NULL;
-+ } else {
-+ cpumask_clear(cfg->domain);
-+ cpumask_clear(cfg->old_domain);
-+ }
-+ }
-+
-+ return cfg;
- }
-
--#ifdef CONFIG_XEN
--#define irq_2_pin_init()
--#define add_pin_to_irq(irq, apic, pin)
--#else
--/*
-- * Rough estimation of how many shared IRQs there are, can be changed
-- * anytime.
-- */
--#define MAX_PLUS_SHARED_IRQS NR_IRQS
--#define PIN_MAP_SIZE (MAX_PLUS_SHARED_IRQS + NR_IRQS)
-+int arch_init_chip_data(struct irq_desc *desc, int cpu)
-+{
-+ struct irq_cfg *cfg;
-
--/*
-- * This is performance-critical, we want to do it O(1)
-- *
-- * the indexing order of this array favors 1:1 mappings
-- * between pins and IRQs.
-- */
-+ cfg = desc->chip_data;
-+ if (!cfg) {
-+ desc->chip_data = get_one_free_irq_cfg(cpu);
-+ if (!desc->chip_data) {
-+ printk(KERN_ERR "can not alloc irq_cfg\n");
-+ BUG_ON(1);
-+ }
-+ }
-
--struct irq_pin_list {
-- int apic, pin;
-- struct irq_pin_list *next;
--};
-+ return 0;
-+}
-
--static struct irq_pin_list irq_2_pin_head[PIN_MAP_SIZE];
--static struct irq_pin_list *irq_2_pin_ptr;
-+#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC
-
--static void __init irq_2_pin_init(void)
-+static void
-+init_copy_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg, int cpu)
- {
-- struct irq_pin_list *pin = irq_2_pin_head;
-- int i;
-+ struct irq_pin_list *old_entry, *head, *tail, *entry;
-+
-+ cfg->irq_2_pin = NULL;
-+ old_entry = old_cfg->irq_2_pin;
-+ if (!old_entry)
-+ return;
-+
-+ entry = get_one_free_irq_2_pin(cpu);
-+ if (!entry)
-+ return;
-+
-+ entry->apic = old_entry->apic;
-+ entry->pin = old_entry->pin;
-+ head = entry;
-+ tail = entry;
-+ old_entry = old_entry->next;
-+ while (old_entry) {
-+ entry = get_one_free_irq_2_pin(cpu);
-+ if (!entry) {
-+ entry = head;
-+ while (entry) {
-+ head = entry->next;
-+ kfree(entry);
-+ entry = head;
-+ }
-+ /* still use the old one */
-+ return;
-+ }
-+ entry->apic = old_entry->apic;
-+ entry->pin = old_entry->pin;
-+ tail->next = entry;
-+ tail = entry;
-+ old_entry = old_entry->next;
-+ }
-
-- for (i = 1; i < PIN_MAP_SIZE; i++)
-- pin[i-1].next = &pin[i];
-+ tail->next = NULL;
-+ cfg->irq_2_pin = head;
-+}
-+
-+static void free_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg)
-+{
-+ struct irq_pin_list *entry, *next;
-+
-+ if (old_cfg->irq_2_pin == cfg->irq_2_pin)
-+ return;
-+
-+ entry = old_cfg->irq_2_pin;
-
-- irq_2_pin_ptr = &pin[0];
-+ while (entry) {
-+ next = entry->next;
-+ kfree(entry);
-+ entry = next;
-+ }
-+ old_cfg->irq_2_pin = NULL;
- }
-
--static struct irq_pin_list *get_one_free_irq_2_pin(void)
-+void arch_init_copy_chip_data(struct irq_desc *old_desc,
-+ struct irq_desc *desc, int cpu)
- {
-- struct irq_pin_list *pin = irq_2_pin_ptr;
-+ struct irq_cfg *cfg;
-+ struct irq_cfg *old_cfg;
-
-- if (!pin)
-- panic("can not get more irq_2_pin\n");
-+ cfg = get_one_free_irq_cfg(cpu);
-
-- irq_2_pin_ptr = pin->next;
-- pin->next = NULL;
-- return pin;
-+ if (!cfg)
-+ return;
-+
-+ desc->chip_data = cfg;
-+
-+ old_cfg = old_desc->chip_data;
-+
-+ memcpy(cfg, old_cfg, sizeof(struct irq_cfg));
-+
-+ init_copy_irq_2_pin(old_cfg, cfg, cpu);
-+}
-+
-+static void free_irq_cfg(struct irq_cfg *old_cfg)
-+{
-+ kfree(old_cfg);
-+}
-+
-+void arch_free_chip_data(struct irq_desc *old_desc, struct irq_desc *desc)
-+{
-+ struct irq_cfg *old_cfg, *cfg;
-+
-+ old_cfg = old_desc->chip_data;
-+ cfg = desc->chip_data;
-+
-+ if (old_cfg == cfg)
-+ return;
-+
-+ if (old_cfg) {
-+ free_irq_2_pin(old_cfg, cfg);
-+ free_irq_cfg(old_cfg);
-+ old_desc->chip_data = NULL;
-+ }
-+}
-+
-+static void
-+set_extra_move_desc(struct irq_desc *desc, const struct cpumask *mask)
-+{
-+ struct irq_cfg *cfg = desc->chip_data;
-+
-+ if (!cfg->move_in_progress) {
-+ /* it means that domain is not changed */
-+ if (!cpumask_intersects(&desc->affinity, mask))
-+ cfg->move_desc_pending = 1;
-+ }
-+}
-+#endif
-+
-+#else
-+static struct irq_cfg *irq_cfg(unsigned int irq)
-+{
-+ return irq < nr_irqs ? irq_cfgx + irq : NULL;
-+}
-+
-+#endif
-+
-+#ifndef CONFIG_NUMA_MIGRATE_IRQ_DESC
-+static inline void
-+set_extra_move_desc(struct irq_desc *desc, const struct cpumask *mask)
-+{
- }
-+#endif
-
- struct io_apic {
- unsigned int index;
-@@ -220,7 +394,7 @@ static __attribute_const__ struct io_api
- return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx)
- + (mp_ioapics[idx].mp_apicaddr & ~PAGE_MASK);
- }
--#endif
-+#endif /* !CONFIG_XEN */
-
- static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg)
- {
-@@ -275,11 +449,10 @@ static inline void io_apic_modify(unsign
- writel(value, &io_apic->data);
- }
-
--static bool io_apic_level_ack_pending(unsigned int irq)
-+static bool io_apic_level_ack_pending(struct irq_cfg *cfg)
- {
- struct irq_pin_list *entry;
- unsigned long flags;
-- struct irq_cfg *cfg = irq_cfg(irq);
-
- spin_lock_irqsave(&ioapic_lock, flags);
- entry = cfg->irq_2_pin;
-@@ -369,13 +542,32 @@ static void ioapic_mask_entry(int apic,
- }
-
- #ifdef CONFIG_SMP
--static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, u8 vector)
-+static void send_cleanup_vector(struct irq_cfg *cfg)
-+{
-+ cpumask_var_t cleanup_mask;
-+
-+ if (unlikely(!alloc_cpumask_var(&cleanup_mask, GFP_ATOMIC))) {
-+ unsigned int i;
-+ cfg->move_cleanup_count = 0;
-+ for_each_cpu_and(i, cfg->old_domain, cpu_online_mask)
-+ cfg->move_cleanup_count++;
-+ for_each_cpu_and(i, cfg->old_domain, cpu_online_mask)
-+ send_IPI_mask(cpumask_of(i), IRQ_MOVE_CLEANUP_VECTOR);
-+ } else {
-+ cpumask_and(cleanup_mask, cfg->old_domain, cpu_online_mask);
-+ cfg->move_cleanup_count = cpumask_weight(cleanup_mask);
-+ send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
-+ free_cpumask_var(cleanup_mask);
-+ }
-+ cfg->move_in_progress = 0;
-+}
-+
-+static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq_cfg *cfg)
- {
- int apic, pin;
-- struct irq_cfg *cfg;
- struct irq_pin_list *entry;
-+ u8 vector = cfg->vector;
-
-- cfg = irq_cfg(irq);
- entry = cfg->irq_2_pin;
- for (;;) {
- unsigned int reg;
-@@ -405,36 +597,61 @@ static void __target_IO_APIC_irq(unsigne
- }
- }
-
--static int assign_irq_vector(int irq, cpumask_t mask);
-+static int
-+assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask);
-+
-+/*
-+ * Either sets desc->affinity to a valid value, and returns cpu_mask_to_apicid
-+ * of that, or returns BAD_APICID and leaves desc->affinity untouched.
-+ */
-+static unsigned int
-+set_desc_affinity(struct irq_desc *desc, const struct cpumask *mask)
-+{
-+ struct irq_cfg *cfg;
-+ unsigned int irq;
-+
-+ if (!cpumask_intersects(mask, cpu_online_mask))
-+ return BAD_APICID;
-+
-+ irq = desc->irq;
-+ cfg = desc->chip_data;
-+ if (assign_irq_vector(irq, cfg, mask))
-+ return BAD_APICID;
-+
-+ cpumask_and(&desc->affinity, cfg->domain, mask);
-+ set_extra_move_desc(desc, mask);
-+ return cpu_mask_to_apicid_and(&desc->affinity, cpu_online_mask);
-+}
-
--static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask)
-+static void
-+set_ioapic_affinity_irq_desc(struct irq_desc *desc, const struct cpumask *mask)
- {
- struct irq_cfg *cfg;
- unsigned long flags;
- unsigned int dest;
-- cpumask_t tmp;
-- struct irq_desc *desc;
-+ unsigned int irq;
-
-- cpus_and(tmp, mask, cpu_online_map);
-- if (cpus_empty(tmp))
-- return;
-+ irq = desc->irq;
-+ cfg = desc->chip_data;
-
-- cfg = irq_cfg(irq);
-- if (assign_irq_vector(irq, mask))
-- return;
-+ spin_lock_irqsave(&ioapic_lock, flags);
-+ dest = set_desc_affinity(desc, mask);
-+ if (dest != BAD_APICID) {
-+ /* Only the high 8 bits are valid. */
-+ dest = SET_APIC_LOGICAL_ID(dest);
-+ __target_IO_APIC_irq(irq, dest, cfg);
-+ }
-+ spin_unlock_irqrestore(&ioapic_lock, flags);
-+}
-
-- cpus_and(tmp, cfg->domain, mask);
-- dest = cpu_mask_to_apicid(tmp);
-- /*
-- * Only the high 8 bits are valid.
-- */
-- dest = SET_APIC_LOGICAL_ID(dest);
-+static void
-+set_ioapic_affinity_irq(unsigned int irq, const struct cpumask *mask)
-+{
-+ struct irq_desc *desc;
-
- desc = irq_to_desc(irq);
-- spin_lock_irqsave(&ioapic_lock, flags);
-- __target_IO_APIC_irq(irq, dest, cfg->vector);
-- desc->affinity = mask;
-- spin_unlock_irqrestore(&ioapic_lock, flags);
-+
-+ set_ioapic_affinity_irq_desc(desc, mask);
- }
- #endif /* CONFIG_SMP */
-
-@@ -443,16 +660,18 @@ static void set_ioapic_affinity_irq(unsi
- * shared ISA-space IRQs, so we have to support them. We are super
- * fast in the common case, and fast for shared ISA-space IRQs.
- */
--static void add_pin_to_irq(unsigned int irq, int apic, int pin)
-+static void add_pin_to_irq_cpu(struct irq_cfg *cfg, int cpu, int apic, int pin)
- {
-- struct irq_cfg *cfg;
- struct irq_pin_list *entry;
-
-- /* first time to refer irq_cfg, so with new */
-- cfg = irq_cfg_alloc(irq);
- entry = cfg->irq_2_pin;
- if (!entry) {
-- entry = get_one_free_irq_2_pin();
-+ entry = get_one_free_irq_2_pin(cpu);
-+ if (!entry) {
-+ printk(KERN_ERR "can not alloc irq_2_pin to add %d - %d\n",
-+ apic, pin);
-+ return;
-+ }
- cfg->irq_2_pin = entry;
- entry->apic = apic;
- entry->pin = pin;
-@@ -467,7 +686,7 @@ static void add_pin_to_irq(unsigned int
- entry = entry->next;
- }
-
-- entry->next = get_one_free_irq_2_pin();
-+ entry->next = get_one_free_irq_2_pin(cpu);
- entry = entry->next;
- entry->apic = apic;
- entry->pin = pin;
-@@ -476,11 +695,10 @@ static void add_pin_to_irq(unsigned int
- /*
- * Reroute an IRQ to a different pin.
- */
--static void __init replace_pin_at_irq(unsigned int irq,
-+static void __init replace_pin_at_irq_cpu(struct irq_cfg *cfg, int cpu,
- int oldapic, int oldpin,
- int newapic, int newpin)
- {
-- struct irq_cfg *cfg = irq_cfg(irq);
- struct irq_pin_list *entry = cfg->irq_2_pin;
- int replaced = 0;
-
-@@ -497,18 +715,16 @@ static void __init replace_pin_at_irq(un
-
- /* why? call replace before add? */
- if (!replaced)
-- add_pin_to_irq(irq, newapic, newpin);
-+ add_pin_to_irq_cpu(cfg, cpu, newapic, newpin);
- }
-
--static inline void io_apic_modify_irq(unsigned int irq,
-+static inline void io_apic_modify_irq(struct irq_cfg *cfg,
- int mask_and, int mask_or,
- void (*final)(struct irq_pin_list *entry))
- {
- int pin;
-- struct irq_cfg *cfg;
- struct irq_pin_list *entry;
-
-- cfg = irq_cfg(irq);
- for (entry = cfg->irq_2_pin; entry != NULL; entry = entry->next) {
- unsigned int reg;
- pin = entry->pin;
-@@ -521,13 +737,13 @@ static inline void io_apic_modify_irq(un
- }
- }
-
--static void __unmask_IO_APIC_irq(unsigned int irq)
-+static void __unmask_IO_APIC_irq(struct irq_cfg *cfg)
- {
-- io_apic_modify_irq(irq, ~IO_APIC_REDIR_MASKED, 0, NULL);
-+ io_apic_modify_irq(cfg, ~IO_APIC_REDIR_MASKED, 0, NULL);
- }
-
- #ifdef CONFIG_X86_64
--void io_apic_sync(struct irq_pin_list *entry)
-+static void io_apic_sync(struct irq_pin_list *entry)
- {
- /*
- * Synchronize the IO-APIC and the CPU by doing
-@@ -538,47 +754,64 @@ void io_apic_sync(struct irq_pin_list *e
- readl(&io_apic->data);
- }
-
--static void __mask_IO_APIC_irq(unsigned int irq)
-+static void __mask_IO_APIC_irq(struct irq_cfg *cfg)
- {
-- io_apic_modify_irq(irq, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync);
-+ io_apic_modify_irq(cfg, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync);
- }
- #else /* CONFIG_X86_32 */
--static void __mask_IO_APIC_irq(unsigned int irq)
-+static void __mask_IO_APIC_irq(struct irq_cfg *cfg)
- {
-- io_apic_modify_irq(irq, ~0, IO_APIC_REDIR_MASKED, NULL);
-+ io_apic_modify_irq(cfg, ~0, IO_APIC_REDIR_MASKED, NULL);
- }
-
--static void __mask_and_edge_IO_APIC_irq(unsigned int irq)
-+static void __mask_and_edge_IO_APIC_irq(struct irq_cfg *cfg)
- {
-- io_apic_modify_irq(irq, ~IO_APIC_REDIR_LEVEL_TRIGGER,
-+ io_apic_modify_irq(cfg, ~IO_APIC_REDIR_LEVEL_TRIGGER,
- IO_APIC_REDIR_MASKED, NULL);
- }
-
--static void __unmask_and_level_IO_APIC_irq(unsigned int irq)
-+static void __unmask_and_level_IO_APIC_irq(struct irq_cfg *cfg)
- {
-- io_apic_modify_irq(irq, ~IO_APIC_REDIR_MASKED,
-+ io_apic_modify_irq(cfg, ~IO_APIC_REDIR_MASKED,
- IO_APIC_REDIR_LEVEL_TRIGGER, NULL);
- }
- #endif /* CONFIG_X86_32 */
-
--static void mask_IO_APIC_irq (unsigned int irq)
-+static void mask_IO_APIC_irq_desc(struct irq_desc *desc)
- {
-+ struct irq_cfg *cfg = desc->chip_data;
- unsigned long flags;
-
-+ BUG_ON(!cfg);
-+
- spin_lock_irqsave(&ioapic_lock, flags);
-- __mask_IO_APIC_irq(irq);
-+ __mask_IO_APIC_irq(cfg);
- spin_unlock_irqrestore(&ioapic_lock, flags);
- }
-
--static void unmask_IO_APIC_irq (unsigned int irq)
-+static void unmask_IO_APIC_irq_desc(struct irq_desc *desc)
- {
-+ struct irq_cfg *cfg = desc->chip_data;
- unsigned long flags;
-
- spin_lock_irqsave(&ioapic_lock, flags);
-- __unmask_IO_APIC_irq(irq);
-+ __unmask_IO_APIC_irq(cfg);
- spin_unlock_irqrestore(&ioapic_lock, flags);
- }
-
-+static void mask_IO_APIC_irq(unsigned int irq)
-+{
-+ struct irq_desc *desc = irq_to_desc(irq);
-+
-+ mask_IO_APIC_irq_desc(desc);
-+}
-+static void unmask_IO_APIC_irq(unsigned int irq)
-+{
-+ struct irq_desc *desc = irq_to_desc(irq);
-+
-+ unmask_IO_APIC_irq_desc(desc);
-+}
-+
- static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
- {
- struct IO_APIC_route_entry entry;
-@@ -618,6 +851,8 @@ void send_IPI_self(int vector)
- apic_write(APIC_ICR, cfg);
- }
- #endif /* !CONFIG_SMP && CONFIG_X86_32*/
-+#else
-+#define add_pin_to_irq_cpu(cfg, cpu, apic, pin)
- #endif /* !CONFIG_XEN */
-
- #if defined(CONFIG_X86_32) && !defined(CONFIG_XEN)
-@@ -858,7 +1093,7 @@ EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector
- */
- static int EISA_ELCR(unsigned int irq)
- {
-- if (irq < 16) {
-+ if (irq < NR_IRQS_LEGACY) {
- unsigned int port = 0x4d0 + (irq >> 3);
- return (inb(port) >> (irq & 7)) & 1;
- }
-@@ -1083,52 +1318,114 @@ void unlock_vector_lock(void)
- {
- spin_unlock(&vector_lock);
- }
--#endif
-
--static int assign_irq_vector(int irq, cpumask_t mask)
-+static int
-+__assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)
- {
-- struct physdev_irq irq_op;
-- struct irq_cfg *cfg;
--
-- if (irq < PIRQ_BASE || irq - PIRQ_BASE >= NR_PIRQS)
-- return -EINVAL;
-+ /*
-+ * NOTE! The local APIC isn't very good at handling
-+ * multiple interrupts at the same interrupt level.
-+ * As the interrupt level is determined by taking the
-+ * vector number and shifting that right by 4, we
-+ * want to spread these out a bit so that they don't
-+ * all fall in the same interrupt level.
-+ *
-+ * Also, we've got to be careful not to trash gate
-+ * 0x80, because int 0x80 is hm, kind of importantish. ;)
-+ */
-+ static int current_vector = FIRST_DEVICE_VECTOR, current_offset = 0;
-+ unsigned int old_vector;
-+ int cpu, err;
-+ cpumask_var_t tmp_mask;
-+
-+ if ((cfg->move_in_progress) || cfg->move_cleanup_count)
-+ return -EBUSY;
-+
-+ if (!alloc_cpumask_var(&tmp_mask, GFP_ATOMIC))
-+ return -ENOMEM;
-+
-+ old_vector = cfg->vector;
-+ if (old_vector) {
-+ cpumask_and(tmp_mask, mask, cpu_online_mask);
-+ cpumask_and(tmp_mask, cfg->domain, tmp_mask);
-+ if (!cpumask_empty(tmp_mask)) {
-+ free_cpumask_var(tmp_mask);
-+ return 0;
-+ }
-+ }
-
-- cfg = irq_cfg(irq);
-+ /* Only try and allocate irqs on cpus that are present */
-+ err = -ENOSPC;
-+ for_each_cpu_and(cpu, mask, cpu_online_mask) {
-+ int new_cpu;
-+ int vector, offset;
-+
-+ vector_allocation_domain(cpu, tmp_mask);
-+
-+ vector = current_vector;
-+ offset = current_offset;
-+next:
-+ vector += 8;
-+ if (vector >= first_system_vector) {
-+ /* If out of vectors on large boxen, must share them. */
-+ offset = (offset + 1) % 8;
-+ vector = FIRST_DEVICE_VECTOR + offset;
-+ }
-+ if (unlikely(current_vector == vector))
-+ continue;
-
-- if (cfg->vector)
-- return 0;
-+ if (test_bit(vector, used_vectors))
-+ goto next;
-
-- irq_op.irq = irq;
-- if (HYPERVISOR_physdev_op(PHYSDEVOP_alloc_irq_vector, &irq_op))
-- return -ENOSPC;
-+ for_each_cpu_and(new_cpu, tmp_mask, cpu_online_mask)
-+ if (per_cpu(vector_irq, new_cpu)[vector] != -1)
-+ goto next;
-+ /* Found one! */
-+ current_vector = vector;
-+ current_offset = offset;
-+ if (old_vector) {
-+ cfg->move_in_progress = 1;
-+ cpumask_copy(cfg->old_domain, cfg->domain);
-+ }
-+ for_each_cpu_and(new_cpu, tmp_mask, cpu_online_mask)
-+ per_cpu(vector_irq, new_cpu)[vector] = irq;
-+ cfg->vector = vector;
-+ cpumask_copy(cfg->domain, tmp_mask);
-+ err = 0;
-+ break;
-+ }
-+ free_cpumask_var(tmp_mask);
-+ return err;
-+}
-
-- cfg->vector = irq_op.vector;
-+static int
-+assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)
-+{
-+ int err;
-+ unsigned long flags;
-
-- return 0;
-+ spin_lock_irqsave(&vector_lock, flags);
-+ err = __assign_irq_vector(irq, cfg, mask);
-+ spin_unlock_irqrestore(&vector_lock, flags);
-+ return err;
- }
-
--#ifndef CONFIG_XEN
--static void __clear_irq_vector(int irq)
-+static void __clear_irq_vector(int irq, struct irq_cfg *cfg)
- {
-- struct irq_cfg *cfg;
-- cpumask_t mask;
- int cpu, vector;
-
-- cfg = irq_cfg(irq);
- BUG_ON(!cfg->vector);
-
- vector = cfg->vector;
-- cpus_and(mask, cfg->domain, cpu_online_map);
-- for_each_cpu_mask_nr(cpu, mask)
-+ for_each_cpu_and(cpu, cfg->domain, cpu_online_mask)
- per_cpu(vector_irq, cpu)[vector] = -1;
-
- cfg->vector = 0;
-- cpus_clear(cfg->domain);
-+ cpumask_clear(cfg->domain);
-
- if (likely(!cfg->move_in_progress))
- return;
-- cpus_and(mask, cfg->old_domain, cpu_online_map);
-- for_each_cpu_mask_nr(cpu, mask) {
-+ for_each_cpu_and(cpu, cfg->old_domain, cpu_online_mask) {
- for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS;
- vector++) {
- if (per_cpu(vector_irq, cpu)[vector] != irq)
-@@ -1146,10 +1443,12 @@ void __setup_vector_irq(int cpu)
- /* This function must be called with vector_lock held */
- int irq, vector;
- struct irq_cfg *cfg;
-+ struct irq_desc *desc;
-
- /* Mark the inuse vectors */
-- for_each_irq_cfg(irq, cfg) {
-- if (!cpu_isset(cpu, cfg->domain))
-+ for_each_irq_desc(irq, desc) {
-+ cfg = desc->chip_data;
-+ if (!cpumask_test_cpu(cpu, cfg->domain))
- continue;
- vector = cfg->vector;
- per_cpu(vector_irq, cpu)[vector] = irq;
-@@ -1161,7 +1460,7 @@ void __setup_vector_irq(int cpu)
- continue;
-
- cfg = irq_cfg(irq);
-- if (!cpu_isset(cpu, cfg->domain))
-+ if (!cpumask_test_cpu(cpu, cfg->domain))
- per_cpu(vector_irq, cpu)[vector] = -1;
- }
- }
-@@ -1199,11 +1498,8 @@ static inline int IO_APIC_irq_trigger(in
- }
- #endif
-
--static void ioapic_register_intr(int irq, unsigned long trigger)
-+static void ioapic_register_intr(int irq, struct irq_desc *desc, unsigned long trigger)
- {
-- struct irq_desc *desc;
--
-- desc = irq_to_desc(irq);
-
- if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
- trigger == IOAPIC_LEVEL)
-@@ -1234,8 +1530,8 @@ static void ioapic_register_intr(int irq
- handle_edge_irq, "edge");
- }
- #else /* !CONFIG_XEN */
--#define __clear_irq_vector(irq) ((void)(irq))
--#define ioapic_register_intr(irq, trigger) evtchn_register_pirq(irq)
-+#define __clear_irq_vector(irq, cfg) ((void)0)
-+#define ioapic_register_intr(irq, desc, trigger) evtchn_register_pirq(irq)
- #endif
-
- static int setup_ioapic_entry(int apic, int irq,
-@@ -1299,24 +1595,25 @@ static int setup_ioapic_entry(int apic,
- return 0;
- }
-
--static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq,
-+static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq, struct irq_desc *desc,
- int trigger, int polarity)
- {
- struct irq_cfg *cfg;
- struct IO_APIC_route_entry entry;
-- cpumask_t mask;
-+ unsigned int dest;
-
- if (!IO_APIC_IRQ(irq))
- return;
-
-- cfg = irq_cfg(irq);
-+ cfg = desc->chip_data;
-
-- mask = TARGET_CPUS;
-- if (assign_irq_vector(irq, mask))
-+ if (assign_irq_vector(irq, cfg, TARGET_CPUS))
- return;
-
- #ifndef CONFIG_XEN
-- cpus_and(mask, cfg->domain, mask);
-+ dest = cpu_mask_to_apicid_and(cfg->domain, TARGET_CPUS);
-+#else
-+ dest = cpu_mask_to_apicid(TARGET_CPUS);
- #endif
-
- apic_printk(APIC_VERBOSE,KERN_DEBUG
-@@ -1327,16 +1624,15 @@ static void setup_IO_APIC_irq(int apic,
-
-
- if (setup_ioapic_entry(mp_ioapics[apic].mp_apicid, irq, &entry,
-- cpu_mask_to_apicid(mask), trigger, polarity,
-- cfg->vector)) {
-+ dest, trigger, polarity, cfg->vector)) {
- printk("Failed to setup ioapic entry for ioapic %d, pin %d\n",
- mp_ioapics[apic].mp_apicid, pin);
-- __clear_irq_vector(irq);
-+ __clear_irq_vector(irq, cfg);
- return;
- }
-
-- ioapic_register_intr(irq, trigger);
-- if (irq < 16)
-+ ioapic_register_intr(irq, desc, trigger);
-+ if (irq < NR_IRQS_LEGACY)
- disable_8259A_irq(irq);
-
- ioapic_write_entry(apic, pin, entry);
-@@ -1346,6 +1642,9 @@ static void __init setup_IO_APIC_irqs(vo
- {
- int apic, pin, idx, irq;
- int notcon = 0;
-+ struct irq_desc *desc;
-+ struct irq_cfg *cfg;
-+ int cpu = boot_cpu_id;
-
- apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
-
-@@ -1380,9 +1679,15 @@ static void __init setup_IO_APIC_irqs(vo
- if (multi_timer_check(apic, irq))
- continue;
- #endif
-- add_pin_to_irq(irq, apic, pin);
-+ desc = irq_to_desc_alloc_cpu(irq, cpu);
-+ if (!desc) {
-+ printk(KERN_INFO "can not get irq_desc for %d\n", irq);
-+ continue;
-+ }
-+ cfg = desc->chip_data;
-+ add_pin_to_irq_cpu(cfg, cpu, apic, pin);
-
-- setup_IO_APIC_irq(apic, pin, irq,
-+ setup_IO_APIC_irq(apic, pin, irq, desc,
- irq_trigger(idx), irq_polarity(idx));
- }
- }
-@@ -1442,6 +1747,7 @@ __apicdebuginit(void) print_IO_APIC(void
- union IO_APIC_reg_03 reg_03;
- unsigned long flags;
- struct irq_cfg *cfg;
-+ struct irq_desc *desc;
- unsigned int irq;
-
- if (apic_verbosity == APIC_QUIET)
-@@ -1531,8 +1837,11 @@ __apicdebuginit(void) print_IO_APIC(void
- }
- }
- printk(KERN_DEBUG "IRQ to pin mappings:\n");
-- for_each_irq_cfg(irq, cfg) {
-- struct irq_pin_list *entry = cfg->irq_2_pin;
-+ for_each_irq_desc(irq, desc) {
-+ struct irq_pin_list *entry;
-+
-+ cfg = desc->chip_data;
-+ entry = cfg->irq_2_pin;
- if (!entry)
- continue;
- printk(KERN_DEBUG "IRQ%d ", irq);
-@@ -2022,14 +2331,16 @@ static unsigned int startup_ioapic_irq(u
- {
- int was_pending = 0;
- unsigned long flags;
-+ struct irq_cfg *cfg;
-
- spin_lock_irqsave(&ioapic_lock, flags);
-- if (irq < 16) {
-+ if (irq < NR_IRQS_LEGACY) {
- disable_8259A_irq(irq);
- if (i8259A_irq_pending(irq))
- was_pending = 1;
- }
-- __unmask_IO_APIC_irq(irq);
-+ cfg = irq_cfg(irq);
-+ __unmask_IO_APIC_irq(cfg);
- spin_unlock_irqrestore(&ioapic_lock, flags);
-
- return was_pending;
-@@ -2043,7 +2354,7 @@ static int ioapic_retrigger_irq(unsigned
- unsigned long flags;
-
- spin_lock_irqsave(&vector_lock, flags);
-- send_IPI_mask(cpumask_of_cpu(first_cpu(cfg->domain)), cfg->vector);
-+ send_IPI_mask(cpumask_of(cpumask_first(cfg->domain)), cfg->vector);
- spin_unlock_irqrestore(&vector_lock, flags);
-
- return 1;
-@@ -2092,35 +2403,35 @@ static DECLARE_DELAYED_WORK(ir_migration
- * as simple as edge triggered migration and we can do the irq migration
- * with a simple atomic update to IO-APIC RTE.
- */
--static void migrate_ioapic_irq(int irq, cpumask_t mask)
-+static void
-+migrate_ioapic_irq_desc(struct irq_desc *desc, const struct cpumask *mask)
- {
- struct irq_cfg *cfg;
-- struct irq_desc *desc;
-- cpumask_t tmp, cleanup_mask;
- struct irte irte;
- int modify_ioapic_rte;
- unsigned int dest;
- unsigned long flags;
-+ unsigned int irq;
-
-- cpus_and(tmp, mask, cpu_online_map);
-- if (cpus_empty(tmp))
-+ if (!cpumask_intersects(mask, cpu_online_mask))
- return;
-
-+ irq = desc->irq;
- if (get_irte(irq, &irte))
- return;
-
-- if (assign_irq_vector(irq, mask))
-+ cfg = desc->chip_data;
-+ if (assign_irq_vector(irq, cfg, mask))
- return;
-
-- cfg = irq_cfg(irq);
-- cpus_and(tmp, cfg->domain, mask);
-- dest = cpu_mask_to_apicid(tmp);
-+ set_extra_move_desc(desc, mask);
-+
-+ dest = cpu_mask_to_apicid_and(cfg->domain, mask);
-
-- desc = irq_to_desc(irq);
- modify_ioapic_rte = desc->status & IRQ_LEVEL;
- if (modify_ioapic_rte) {
- spin_lock_irqsave(&ioapic_lock, flags);
-- __target_IO_APIC_irq(irq, dest, cfg->vector);
-+ __target_IO_APIC_irq(irq, dest, cfg);
- spin_unlock_irqrestore(&ioapic_lock, flags);
- }
-
-@@ -2132,24 +2443,20 @@ static void migrate_ioapic_irq(int irq,
- */
- modify_irte(irq, &irte);
-
-- if (cfg->move_in_progress) {
-- cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map);
-- cfg->move_cleanup_count = cpus_weight(cleanup_mask);
-- send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
-- cfg->move_in_progress = 0;
-- }
-+ if (cfg->move_in_progress)
-+ send_cleanup_vector(cfg);
-
-- desc->affinity = mask;
-+ cpumask_copy(&desc->affinity, mask);
- }
-
--static int migrate_irq_remapped_level(int irq)
-+static int migrate_irq_remapped_level_desc(struct irq_desc *desc)
- {
- int ret = -1;
-- struct irq_desc *desc = irq_to_desc(irq);
-+ struct irq_cfg *cfg = desc->chip_data;
-
-- mask_IO_APIC_irq(irq);
-+ mask_IO_APIC_irq_desc(desc);
-
-- if (io_apic_level_ack_pending(irq)) {
-+ if (io_apic_level_ack_pending(cfg)) {
- /*
- * Interrupt in progress. Migrating irq now will change the
- * vector information in the IO-APIC RTE and that will confuse
-@@ -2161,14 +2468,15 @@ static int migrate_irq_remapped_level(in
- }
-
- /* everthing is clear. we have right of way */
-- migrate_ioapic_irq(irq, desc->pending_mask);
-+ migrate_ioapic_irq_desc(desc, &desc->pending_mask);
-
- ret = 0;
- desc->status &= ~IRQ_MOVE_PENDING;
-- cpus_clear(desc->pending_mask);
-+ cpumask_clear(&desc->pending_mask);
-
- unmask:
-- unmask_IO_APIC_irq(irq);
-+ unmask_IO_APIC_irq_desc(desc);
-+
- return ret;
- }
-
-@@ -2189,7 +2497,7 @@ static void ir_irq_migration(struct work
- continue;
- }
-
-- desc->chip->set_affinity(irq, desc->pending_mask);
-+ desc->chip->set_affinity(irq, &desc->pending_mask);
- spin_unlock_irqrestore(&desc->lock, flags);
- }
- }
-@@ -2198,28 +2506,33 @@ static void ir_irq_migration(struct work
- /*
- * Migrates the IRQ destination in the process context.
- */
--static void set_ir_ioapic_affinity_irq(unsigned int irq, cpumask_t mask)
-+static void set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc,
-+ const struct cpumask *mask)
- {
-- struct irq_desc *desc = irq_to_desc(irq);
--
- if (desc->status & IRQ_LEVEL) {
- desc->status |= IRQ_MOVE_PENDING;
-- desc->pending_mask = mask;
-- migrate_irq_remapped_level(irq);
-+ cpumask_copy(&desc->pending_mask, mask);
-+ migrate_irq_remapped_level_desc(desc);
- return;
- }
-
-- migrate_ioapic_irq(irq, mask);
-+ migrate_ioapic_irq_desc(desc, mask);
-+}
-+static void set_ir_ioapic_affinity_irq(unsigned int irq,
-+ const struct cpumask *mask)
-+{
-+ struct irq_desc *desc = irq_to_desc(irq);
-+
-+ set_ir_ioapic_affinity_irq_desc(desc, mask);
- }
- #endif
-
- asmlinkage void smp_irq_move_cleanup_interrupt(void)
- {
- unsigned vector, me;
-+
- ack_APIC_irq();
--#ifdef CONFIG_X86_64
- exit_idle();
--#endif
- irq_enter();
-
- me = smp_processor_id();
-@@ -2229,6 +2542,9 @@ asmlinkage void smp_irq_move_cleanup_int
- struct irq_cfg *cfg;
- irq = __get_cpu_var(vector_irq)[vector];
-
-+ if (irq == -1)
-+ continue;
-+
- desc = irq_to_desc(irq);
- if (!desc)
- continue;
-@@ -2238,7 +2554,7 @@ asmlinkage void smp_irq_move_cleanup_int
- if (!cfg->move_cleanup_count)
- goto unlock;
-
-- if ((vector == cfg->vector) && cpu_isset(me, cfg->domain))
-+ if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain))
- goto unlock;
-
- __get_cpu_var(vector_irq)[vector] = -1;
-@@ -2250,28 +2566,45 @@ unlock:
- irq_exit();
- }
-
--static void irq_complete_move(unsigned int irq)
-+static void irq_complete_move(struct irq_desc **descp)
- {
-- struct irq_cfg *cfg = irq_cfg(irq);
-+ struct irq_desc *desc = *descp;
-+ struct irq_cfg *cfg = desc->chip_data;
- unsigned vector, me;
-
-- if (likely(!cfg->move_in_progress))
-+ if (likely(!cfg->move_in_progress)) {
-+#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC
-+ if (likely(!cfg->move_desc_pending))
-+ return;
-+
-+ /* domain has not changed, but affinity did */
-+ me = smp_processor_id();
-+ if (cpu_isset(me, desc->affinity)) {
-+ *descp = desc = move_irq_desc(desc, me);
-+ /* get the new one */
-+ cfg = desc->chip_data;
-+ cfg->move_desc_pending = 0;
-+ }
-+#endif
- return;
-+ }
-
- vector = ~get_irq_regs()->orig_ax;
- me = smp_processor_id();
-- if ((vector == cfg->vector) && cpu_isset(me, cfg->domain)) {
-- cpumask_t cleanup_mask;
-
-- cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map);
-- cfg->move_cleanup_count = cpus_weight(cleanup_mask);
-- send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
-- cfg->move_in_progress = 0;
-+ if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain)) {
-+#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC
-+ *descp = desc = move_irq_desc(desc, me);
-+ /* get the new one */
-+ cfg = desc->chip_data;
-+#endif
-+ send_cleanup_vector(cfg);
- }
- }
- #else
--static inline void irq_complete_move(unsigned int irq) {}
-+static inline void irq_complete_move(struct irq_desc **descp) {}
- #endif
-+
- #ifdef CONFIG_INTR_REMAP
- static void ack_x2apic_level(unsigned int irq)
- {
-@@ -2282,11 +2615,14 @@ static void ack_x2apic_edge(unsigned int
- {
- ack_x2APIC_irq();
- }
-+
- #endif
-
- static void ack_apic_edge(unsigned int irq)
- {
-- irq_complete_move(irq);
-+ struct irq_desc *desc = irq_to_desc(irq);
-+
-+ irq_complete_move(&desc);
- move_native_irq(irq);
- ack_APIC_irq();
- }
-@@ -2295,18 +2631,21 @@ atomic_t irq_mis_count;
-
- static void ack_apic_level(unsigned int irq)
- {
-+ struct irq_desc *desc = irq_to_desc(irq);
-+
- #ifdef CONFIG_X86_32
- unsigned long v;
- int i;
- #endif
-+ struct irq_cfg *cfg;
- int do_unmask_irq = 0;
-
-- irq_complete_move(irq);
-+ irq_complete_move(&desc);
- #ifdef CONFIG_GENERIC_PENDING_IRQ
- /* If we are moving the irq we need to mask it */
-- if (unlikely(irq_to_desc(irq)->status & IRQ_MOVE_PENDING)) {
-+ if (unlikely(desc->status & IRQ_MOVE_PENDING)) {
- do_unmask_irq = 1;
-- mask_IO_APIC_irq(irq);
-+ mask_IO_APIC_irq_desc(desc);
- }
- #endif
-
-@@ -2330,7 +2669,8 @@ static void ack_apic_level(unsigned int
- * operation to prevent an edge-triggered interrupt escaping meanwhile.
- * The idea is from Manfred Spraul. --macro
- */
-- i = irq_cfg(irq)->vector;
-+ cfg = desc->chip_data;
-+ i = cfg->vector;
-
- v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1));
- #endif
-@@ -2369,17 +2709,18 @@ static void ack_apic_level(unsigned int
- * accurate and is causing problems then it is a hardware bug
- * and you can go talk to the chipset vendor about it.
- */
-- if (!io_apic_level_ack_pending(irq))
-+ cfg = desc->chip_data;
-+ if (!io_apic_level_ack_pending(cfg))
- move_masked_irq(irq);
-- unmask_IO_APIC_irq(irq);
-+ unmask_IO_APIC_irq_desc(desc);
- }
-
- #ifdef CONFIG_X86_32
- if (!(v & (1 << (i & 0x1f)))) {
- atomic_inc(&irq_mis_count);
- spin_lock(&ioapic_lock);
-- __mask_and_edge_IO_APIC_irq(irq);
-- __unmask_and_level_IO_APIC_irq(irq);
-+ __mask_and_edge_IO_APIC_irq(cfg);
-+ __unmask_and_level_IO_APIC_irq(cfg);
- spin_unlock(&ioapic_lock);
- }
- #endif
-@@ -2431,24 +2772,23 @@ static inline void init_IO_APIC_traps(vo
- * Also, we've got to be careful not to trash gate
- * 0x80, because int 0x80 is hm, kind of importantish. ;)
- */
-- for_each_irq_cfg(irq, cfg) {
-+ for_each_irq_desc(irq, desc) {
- #ifdef CONFIG_XEN
- if (irq < PIRQ_BASE || irq >= PIRQ_BASE + NR_PIRQS)
- continue;
- #endif
-- if (IO_APIC_IRQ(irq) && !cfg->vector) {
-+ cfg = desc->chip_data;
-+ if (IO_APIC_IRQ(irq) && cfg && !cfg->vector) {
- /*
- * Hmm.. We don't have an entry for this,
- * so default to an old-fashioned 8259
- * interrupt if we can..
- */
-- if (irq < 16)
-+ if (irq < NR_IRQS_LEGACY)
- make_8259A_irq(irq);
-- else {
-- desc = irq_to_desc(irq);
-+ else
- /* Strange. Oh, well.. */
- desc->chip = &no_irq_chip;
-- }
- }
- }
- }
-@@ -2474,7 +2814,7 @@ static void unmask_lapic_irq(unsigned in
- apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED);
- }
-
--static void ack_lapic_irq (unsigned int irq)
-+static void ack_lapic_irq(unsigned int irq)
- {
- ack_APIC_irq();
- }
-@@ -2486,11 +2826,8 @@ static struct irq_chip lapic_chip __read
- .ack = ack_lapic_irq,
- };
-
--static void lapic_register_intr(int irq)
-+static void lapic_register_intr(int irq, struct irq_desc *desc)
- {
-- struct irq_desc *desc;
--
-- desc = irq_to_desc(irq);
- desc->status &= ~IRQ_LEVEL;
- set_irq_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq,
- "edge");
-@@ -2594,7 +2931,9 @@ int timer_through_8259 __initdata;
- */
- static inline void __init check_timer(void)
- {
-- struct irq_cfg *cfg = irq_cfg(0);
-+ struct irq_desc *desc = irq_to_desc(0);
-+ struct irq_cfg *cfg = desc->chip_data;
-+ int cpu = boot_cpu_id;
- int apic1, pin1, apic2, pin2;
- unsigned long flags;
- unsigned int ver;
-@@ -2609,7 +2948,7 @@ static inline void __init check_timer(vo
- * get/set the timer IRQ vector:
- */
- disable_8259A_irq(0);
-- assign_irq_vector(0, TARGET_CPUS);
-+ assign_irq_vector(0, cfg, TARGET_CPUS);
-
- /*
- * As IRQ0 is to be enabled in the 8259A, the virtual
-@@ -2660,10 +2999,10 @@ static inline void __init check_timer(vo
- * Ok, does IRQ0 through the IOAPIC work?
- */
- if (no_pin1) {
-- add_pin_to_irq(0, apic1, pin1);
-+ add_pin_to_irq_cpu(cfg, cpu, apic1, pin1);
- setup_timer_IRQ0_pin(apic1, pin1, cfg->vector);
- }
-- unmask_IO_APIC_irq(0);
-+ unmask_IO_APIC_irq_desc(desc);
- if (timer_irq_works()) {
- if (nmi_watchdog == NMI_IO_APIC) {
- setup_nmi();
-@@ -2689,9 +3028,9 @@ static inline void __init check_timer(vo
- /*
- * legacy devices should be connected to IO APIC #0
- */
-- replace_pin_at_irq(0, apic1, pin1, apic2, pin2);
-+ replace_pin_at_irq_cpu(cfg, cpu, apic1, pin1, apic2, pin2);
- setup_timer_IRQ0_pin(apic2, pin2, cfg->vector);
-- unmask_IO_APIC_irq(0);
-+ unmask_IO_APIC_irq_desc(desc);
- enable_8259A_irq(0);
- if (timer_irq_works()) {
- apic_printk(APIC_QUIET, KERN_INFO "....... works.\n");
-@@ -2723,7 +3062,7 @@ static inline void __init check_timer(vo
- apic_printk(APIC_QUIET, KERN_INFO
- "...trying to set up timer as Virtual Wire IRQ...\n");
-
-- lapic_register_intr(0);
-+ lapic_register_intr(0, desc);
- apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector); /* Fixed mode */
- enable_8259A_irq(0);
-
-@@ -2922,22 +3261,26 @@ unsigned int create_irq_nr(unsigned int
- unsigned int irq;
- unsigned int new;
- unsigned long flags;
-- struct irq_cfg *cfg_new;
--
-- irq_want = nr_irqs - 1;
-+ struct irq_cfg *cfg_new = NULL;
-+ int cpu = boot_cpu_id;
-+ struct irq_desc *desc_new = NULL;
-
- irq = 0;
- spin_lock_irqsave(&vector_lock, flags);
-- for (new = irq_want; new > 0; new--) {
-+ for (new = irq_want; new < NR_IRQS; new++) {
- if (platform_legacy_irq(new))
- continue;
-- cfg_new = irq_cfg(new);
-- if (cfg_new && cfg_new->vector != 0)
-+
-+ desc_new = irq_to_desc_alloc_cpu(new, cpu);
-+ if (!desc_new) {
-+ printk(KERN_INFO "can not get irq_desc for %d\n", new);
-+ continue;
-+ }
-+ cfg_new = desc_new->chip_data;
-+
-+ if (cfg_new->vector != 0)
- continue;
-- /* check if need to create one */
-- if (!cfg_new)
-- cfg_new = irq_cfg_alloc(new);
-- if (__assign_irq_vector(new, TARGET_CPUS) == 0)
-+ if (__assign_irq_vector(new, cfg_new, TARGET_CPUS) == 0)
- irq = new;
- break;
- }
-@@ -2945,15 +3288,21 @@ unsigned int create_irq_nr(unsigned int
-
- if (irq > 0) {
- dynamic_irq_init(irq);
-+ /* restore it, in case dynamic_irq_init clear it */
-+ if (desc_new)
-+ desc_new->chip_data = cfg_new;
- }
- return irq;
- }
-
-+static int nr_irqs_gsi = NR_IRQS_LEGACY;
- int create_irq(void)
- {
-+ unsigned int irq_want;
- int irq;
-
-- irq = create_irq_nr(nr_irqs - 1);
-+ irq_want = nr_irqs_gsi;
-+ irq = create_irq_nr(irq_want);
-
- if (irq == 0)
- irq = -1;
-@@ -2964,14 +3313,22 @@ int create_irq(void)
- void destroy_irq(unsigned int irq)
- {
- unsigned long flags;
-+ struct irq_cfg *cfg;
-+ struct irq_desc *desc;
-
-+ /* store it, in case dynamic_irq_cleanup clear it */
-+ desc = irq_to_desc(irq);
-+ cfg = desc->chip_data;
- dynamic_irq_cleanup(irq);
-+ /* connect back irq_cfg */
-+ if (desc)
-+ desc->chip_data = cfg;
-
- #ifdef CONFIG_INTR_REMAP
- free_irte(irq);
- #endif
- spin_lock_irqsave(&vector_lock, flags);
-- __clear_irq_vector(irq);
-+ __clear_irq_vector(irq, cfg);
- spin_unlock_irqrestore(&vector_lock, flags);
- }
- #endif /* !CONFIG_XEN */
-@@ -2985,16 +3342,13 @@ static int msi_compose_msg(struct pci_de
- struct irq_cfg *cfg;
- int err;
- unsigned dest;
-- cpumask_t tmp;
-
-- tmp = TARGET_CPUS;
-- err = assign_irq_vector(irq, tmp);
-+ cfg = irq_cfg(irq);
-+ err = assign_irq_vector(irq, cfg, TARGET_CPUS);
- if (err)
- return err;
-
-- cfg = irq_cfg(irq);
-- cpus_and(tmp, cfg->domain, tmp);
-- dest = cpu_mask_to_apicid(tmp);
-+ dest = cpu_mask_to_apicid_and(cfg->domain, TARGET_CPUS);
-
- #ifdef CONFIG_INTR_REMAP
- if (irq_remapped(irq)) {
-@@ -3048,64 +3402,48 @@ static int msi_compose_msg(struct pci_de
- }
-
- #ifdef CONFIG_SMP
--static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask)
-+static void set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)
- {
-+ struct irq_desc *desc = irq_to_desc(irq);
- struct irq_cfg *cfg;
- struct msi_msg msg;
- unsigned int dest;
-- cpumask_t tmp;
-- struct irq_desc *desc;
-
-- cpus_and(tmp, mask, cpu_online_map);
-- if (cpus_empty(tmp))
-+ dest = set_desc_affinity(desc, mask);
-+ if (dest == BAD_APICID)
- return;
-
-- if (assign_irq_vector(irq, mask))
-- return;
--
-- cfg = irq_cfg(irq);
-- cpus_and(tmp, cfg->domain, mask);
-- dest = cpu_mask_to_apicid(tmp);
-+ cfg = desc->chip_data;
-
-- read_msi_msg(irq, &msg);
-+ read_msi_msg_desc(desc, &msg);
-
- msg.data &= ~MSI_DATA_VECTOR_MASK;
- msg.data |= MSI_DATA_VECTOR(cfg->vector);
- msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK;
- msg.address_lo |= MSI_ADDR_DEST_ID(dest);
-
-- write_msi_msg(irq, &msg);
-- desc = irq_to_desc(irq);
-- desc->affinity = mask;
-+ write_msi_msg_desc(desc, &msg);
- }
--
- #ifdef CONFIG_INTR_REMAP
- /*
- * Migrate the MSI irq to another cpumask. This migration is
- * done in the process context using interrupt-remapping hardware.
- */
--static void ir_set_msi_irq_affinity(unsigned int irq, cpumask_t mask)
-+static void
-+ir_set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)
- {
-- struct irq_cfg *cfg;
-+ struct irq_desc *desc = irq_to_desc(irq);
-+ struct irq_cfg *cfg = desc->chip_data;
- unsigned int dest;
-- cpumask_t tmp, cleanup_mask;
- struct irte irte;
-- struct irq_desc *desc;
--
-- cpus_and(tmp, mask, cpu_online_map);
-- if (cpus_empty(tmp))
-- return;
-
- if (get_irte(irq, &irte))
- return;
-
-- if (assign_irq_vector(irq, mask))
-+ dest = set_desc_affinity(desc, mask);
-+ if (dest == BAD_APICID)
- return;
-
-- cfg = irq_cfg(irq);
-- cpus_and(tmp, cfg->domain, mask);
-- dest = cpu_mask_to_apicid(tmp);
--
- irte.vector = cfg->vector;
- irte.dest_id = IRTE_DEST(dest);
-
-@@ -3119,16 +3457,10 @@ static void ir_set_msi_irq_affinity(unsi
- * at the new destination. So, time to cleanup the previous
- * vector allocation.
- */
-- if (cfg->move_in_progress) {
-- cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map);
-- cfg->move_cleanup_count = cpus_weight(cleanup_mask);
-- send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
-- cfg->move_in_progress = 0;
-- }
--
-- desc = irq_to_desc(irq);
-- desc->affinity = mask;
-+ if (cfg->move_in_progress)
-+ send_cleanup_vector(cfg);
- }
-+
- #endif
- #endif /* CONFIG_SMP */
-
-@@ -3187,7 +3519,7 @@ static int msi_alloc_irte(struct pci_dev
- }
- #endif
-
--static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc, int irq)
-+static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq)
- {
- int ret;
- struct msi_msg msg;
-@@ -3196,7 +3528,7 @@ static int setup_msi_irq(struct pci_dev
- if (ret < 0)
- return ret;
-
-- set_irq_msi(irq, desc);
-+ set_irq_msi(irq, msidesc);
- write_msi_msg(irq, &msg);
-
- #ifdef CONFIG_INTR_REMAP
-@@ -3216,26 +3548,13 @@ static int setup_msi_irq(struct pci_dev
- return 0;
- }
-
--static unsigned int build_irq_for_pci_dev(struct pci_dev *dev)
--{
-- unsigned int irq;
--
-- irq = dev->bus->number;
-- irq <<= 8;
-- irq |= dev->devfn;
-- irq <<= 12;
--
-- return irq;
--}
--
--int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc)
-+int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc)
- {
- unsigned int irq;
- int ret;
- unsigned int irq_want;
-
-- irq_want = build_irq_for_pci_dev(dev) + 0x100;
--
-+ irq_want = nr_irqs_gsi;
- irq = create_irq_nr(irq_want);
- if (irq == 0)
- return -1;
-@@ -3249,7 +3568,7 @@ int arch_setup_msi_irq(struct pci_dev *d
- goto error;
- no_ir:
- #endif
-- ret = setup_msi_irq(dev, desc, irq);
-+ ret = setup_msi_irq(dev, msidesc, irq);
- if (ret < 0) {
- destroy_irq(irq);
- return ret;
-@@ -3267,7 +3586,7 @@ int arch_setup_msi_irqs(struct pci_dev *
- {
- unsigned int irq;
- int ret, sub_handle;
-- struct msi_desc *desc;
-+ struct msi_desc *msidesc;
- unsigned int irq_want;
-
- #ifdef CONFIG_INTR_REMAP
-@@ -3275,10 +3594,11 @@ int arch_setup_msi_irqs(struct pci_dev *
- int index = 0;
- #endif
-
-- irq_want = build_irq_for_pci_dev(dev) + 0x100;
-+ irq_want = nr_irqs_gsi;
- sub_handle = 0;
-- list_for_each_entry(desc, &dev->msi_list, list) {
-- irq = create_irq_nr(irq_want--);
-+ list_for_each_entry(msidesc, &dev->msi_list, list) {
-+ irq = create_irq_nr(irq_want);
-+ irq_want++;
- if (irq == 0)
- return -1;
- #ifdef CONFIG_INTR_REMAP
-@@ -3310,7 +3630,7 @@ int arch_setup_msi_irqs(struct pci_dev *
- }
- no_ir:
- #endif
-- ret = setup_msi_irq(dev, desc, irq);
-+ ret = setup_msi_irq(dev, msidesc, irq);
- if (ret < 0)
- goto error;
- sub_handle++;
-@@ -3329,24 +3649,18 @@ void arch_teardown_msi_irq(unsigned int
-
- #ifdef CONFIG_DMAR
- #ifdef CONFIG_SMP
--static void dmar_msi_set_affinity(unsigned int irq, cpumask_t mask)
-+static void dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
- {
-+ struct irq_desc *desc = irq_to_desc(irq);
- struct irq_cfg *cfg;
- struct msi_msg msg;
- unsigned int dest;
-- cpumask_t tmp;
-- struct irq_desc *desc;
-
-- cpus_and(tmp, mask, cpu_online_map);
-- if (cpus_empty(tmp))
-+ dest = set_desc_affinity(desc, mask);
-+ if (dest == BAD_APICID)
- return;
-
-- if (assign_irq_vector(irq, mask))
-- return;
--
-- cfg = irq_cfg(irq);
-- cpus_and(tmp, cfg->domain, mask);
-- dest = cpu_mask_to_apicid(tmp);
-+ cfg = desc->chip_data;
-
- dmar_msi_read(irq, &msg);
-
-@@ -3356,9 +3670,8 @@ static void dmar_msi_set_affinity(unsign
- msg.address_lo |= MSI_ADDR_DEST_ID(dest);
-
- dmar_msi_write(irq, &msg);
-- desc = irq_to_desc(irq);
-- desc->affinity = mask;
- }
-+
- #endif /* CONFIG_SMP */
-
- struct irq_chip dmar_msi_type = {
-@@ -3390,24 +3703,18 @@ int arch_setup_dmar_msi(unsigned int irq
- #ifdef CONFIG_HPET_TIMER
-
- #ifdef CONFIG_SMP
--static void hpet_msi_set_affinity(unsigned int irq, cpumask_t mask)
-+static void hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
- {
-+ struct irq_desc *desc = irq_to_desc(irq);
- struct irq_cfg *cfg;
-- struct irq_desc *desc;
- struct msi_msg msg;
- unsigned int dest;
-- cpumask_t tmp;
-
-- cpus_and(tmp, mask, cpu_online_map);
-- if (cpus_empty(tmp))
-+ dest = set_desc_affinity(desc, mask);
-+ if (dest == BAD_APICID)
- return;
-
-- if (assign_irq_vector(irq, mask))
-- return;
--
-- cfg = irq_cfg(irq);
-- cpus_and(tmp, cfg->domain, mask);
-- dest = cpu_mask_to_apicid(tmp);
-+ cfg = desc->chip_data;
-
- hpet_msi_read(irq, &msg);
-
-@@ -3417,9 +3724,8 @@ static void hpet_msi_set_affinity(unsign
- msg.address_lo |= MSI_ADDR_DEST_ID(dest);
-
- hpet_msi_write(irq, &msg);
-- desc = irq_to_desc(irq);
-- desc->affinity = mask;
- }
-+
- #endif /* CONFIG_SMP */
-
- struct irq_chip hpet_msi_type = {
-@@ -3472,28 +3778,21 @@ static void target_ht_irq(unsigned int i
- write_ht_irq_msg(irq, &msg);
- }
-
--static void set_ht_irq_affinity(unsigned int irq, cpumask_t mask)
-+static void set_ht_irq_affinity(unsigned int irq, const struct cpumask *mask)
- {
-+ struct irq_desc *desc = irq_to_desc(irq);
- struct irq_cfg *cfg;
- unsigned int dest;
-- cpumask_t tmp;
-- struct irq_desc *desc;
-
-- cpus_and(tmp, mask, cpu_online_map);
-- if (cpus_empty(tmp))
-+ dest = set_desc_affinity(desc, mask);
-+ if (dest == BAD_APICID)
- return;
-
-- if (assign_irq_vector(irq, mask))
-- return;
--
-- cfg = irq_cfg(irq);
-- cpus_and(tmp, cfg->domain, mask);
-- dest = cpu_mask_to_apicid(tmp);
-+ cfg = desc->chip_data;
-
- target_ht_irq(irq, dest, cfg->vector);
-- desc = irq_to_desc(irq);
-- desc->affinity = mask;
- }
-+
- #endif
-
- static struct irq_chip ht_irq_chip = {
-@@ -3511,17 +3810,14 @@ int arch_setup_ht_irq(unsigned int irq,
- {
- struct irq_cfg *cfg;
- int err;
-- cpumask_t tmp;
-
-- tmp = TARGET_CPUS;
-- err = assign_irq_vector(irq, tmp);
-+ cfg = irq_cfg(irq);
-+ err = assign_irq_vector(irq, cfg, TARGET_CPUS);
- if (!err) {
- struct ht_irq_msg msg;
- unsigned dest;
-
-- cfg = irq_cfg(irq);
-- cpus_and(tmp, cfg->domain, tmp);
-- dest = cpu_mask_to_apicid(tmp);
-+ dest = cpu_mask_to_apicid_and(cfg->domain, TARGET_CPUS);
-
- msg.address_hi = HT_IRQ_HIGH_DEST_ID(dest);
-
-@@ -3557,7 +3853,7 @@ int arch_setup_ht_irq(unsigned int irq,
- int arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,
- unsigned long mmr_offset)
- {
-- const cpumask_t *eligible_cpu = get_cpu_mask(cpu);
-+ const struct cpumask *eligible_cpu = cpumask_of(cpu);
- struct irq_cfg *cfg;
- int mmr_pnode;
- unsigned long mmr_value;
-@@ -3565,7 +3861,9 @@ int arch_enable_uv_irq(char *irq_name, u
- unsigned long flags;
- int err;
-
-- err = assign_irq_vector(irq, *eligible_cpu);
-+ cfg = irq_cfg(irq);
-+
-+ err = assign_irq_vector(irq, cfg, eligible_cpu);
- if (err != 0)
- return err;
-
-@@ -3574,8 +3872,6 @@ int arch_enable_uv_irq(char *irq_name, u
- irq_name);
- spin_unlock_irqrestore(&vector_lock, flags);
-
-- cfg = irq_cfg(irq);
--
- mmr_value = 0;
- entry = (struct uv_IO_APIC_route_entry *)&mmr_value;
- BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long));
-@@ -3586,7 +3882,7 @@ int arch_enable_uv_irq(char *irq_name, u
- entry->polarity = 0;
- entry->trigger = 0;
- entry->mask = 0;
-- entry->dest = cpu_mask_to_apicid(*eligible_cpu);
-+ entry->dest = cpu_mask_to_apicid(eligible_cpu);
-
- mmr_pnode = uv_blade_to_pnode(mmr_blade);
- uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value);
-@@ -3627,10 +3923,29 @@ int __init io_apic_get_redir_entries (in
- return reg_01.bits.entries;
- }
-
--int __init probe_nr_irqs(void)
-+#ifndef CONFIG_XEN
-+void __init probe_nr_irqs_gsi(void)
- {
-- return NR_IRQS;
-+ int nr = 0;
-+
-+ nr = acpi_probe_gsi();
-+ if (nr > nr_irqs_gsi) {
-+ nr_irqs_gsi = nr;
-+ } else {
-+ /* for acpi=off or acpi is not compiled in */
-+ int idx;
-+
-+ nr = 0;
-+ for (idx = 0; idx < nr_ioapics; idx++)
-+ nr += io_apic_get_redir_entries(idx) + 1;
-+
-+ if (nr > nr_irqs_gsi)
-+ nr_irqs_gsi = nr;
-+ }
-+
-+ printk(KERN_DEBUG "nr_irqs_gsi: %d\n", nr_irqs_gsi);
- }
-+#endif
-
- /* --------------------------------------------------------------------------
- ACPI-based IOAPIC Configuration
-@@ -3730,6 +4045,10 @@ int __init io_apic_get_version(int ioapi
-
- int io_apic_set_pci_routing (int ioapic, int pin, int irq, int triggering, int polarity)
- {
-+ struct irq_desc *desc;
-+ struct irq_cfg *cfg;
-+ int cpu = boot_cpu_id;
-+
- #ifdef CONFIG_XEN
- if (irq < PIRQ_BASE || irq >= PIRQ_BASE + NR_PIRQS) {
- apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ %d\n",
-@@ -3744,13 +4063,21 @@ int io_apic_set_pci_routing (int ioapic,
- return -EINVAL;
- }
-
-+ desc = irq_to_desc_alloc_cpu(irq, cpu);
-+ if (!desc) {
-+ printk(KERN_INFO "can not get irq_desc %d\n", irq);
-+ return 0;
-+ }
-+
- /*
- * IRQs < 16 are already in the irq_2_pin[] map
- */
-- if (irq >= 16)
-- add_pin_to_irq(irq, ioapic, pin);
-+ if (irq >= NR_IRQS_LEGACY) {
-+ cfg = desc->chip_data;
-+ add_pin_to_irq_cpu(cfg, cpu, ioapic, pin);
-+ }
-
-- setup_IO_APIC_irq(ioapic, pin, irq, triggering, polarity);
-+ setup_IO_APIC_irq(ioapic, pin, irq, desc, triggering, polarity);
-
- return 0;
- }
-@@ -3789,7 +4116,7 @@ void __init setup_ioapic_dest(void)
- int pin, ioapic, irq, irq_entry;
- struct irq_desc *desc;
- struct irq_cfg *cfg;
-- cpumask_t mask;
-+ const struct cpumask *mask;
-
- if (skip_ioapic_setup == 1)
- return;
-@@ -3805,9 +4132,10 @@ void __init setup_ioapic_dest(void)
- * when you have too many devices, because at that time only boot
- * cpu is online.
- */
-- cfg = irq_cfg(irq);
-+ desc = irq_to_desc(irq);
-+ cfg = desc->chip_data;
- if (!cfg->vector) {
-- setup_IO_APIC_irq(ioapic, pin, irq,
-+ setup_IO_APIC_irq(ioapic, pin, irq, desc,
- irq_trigger(irq_entry),
- irq_polarity(irq_entry));
- continue;
-@@ -3817,19 +4145,18 @@ void __init setup_ioapic_dest(void)
- /*
- * Honour affinities which have been set in early boot
- */
-- desc = irq_to_desc(irq);
- if (desc->status &
- (IRQ_NO_BALANCING | IRQ_AFFINITY_SET))
-- mask = desc->affinity;
-+ mask = &desc->affinity;
- else
- mask = TARGET_CPUS;
-
- #ifdef CONFIG_INTR_REMAP
- if (intr_remapping_enabled)
-- set_ir_ioapic_affinity_irq(irq, mask);
-+ set_ir_ioapic_affinity_irq_desc(desc, mask);
- else
- #endif
-- set_ioapic_affinity_irq(irq, mask);
-+ set_ioapic_affinity_irq_desc(desc, mask);
- }
+--- a/arch/x86/kernel/head-xen.c
++++ b/arch/x86/kernel/head-xen.c
+@@ -36,7 +36,6 @@ void __init reserve_ebda_region(void)
- }
-@@ -3878,7 +4205,6 @@ void __init ioapic_init_mappings(void)
- struct resource *ioapic_res;
- int i;
+ /* start of EBDA area */
+ ebda_addr = get_bios_ebda();
+- printk(KERN_INFO "BIOS EBDA/lowmem at: %08x/%08x\n", ebda_addr, lowmem);
-- irq_2_pin_init();
- ioapic_res = ioapic_setup_resources();
- for (i = 0; i < nr_ioapics; i++) {
- if (smp_found_config) {
+ /* Fixup: bios puts an EBDA in the top 64K segment */
+ /* of conventional memory, but does not adjust lowmem. */
--- a/arch/x86/kernel/ioport-xen.c
+++ b/arch/x86/kernel/ioport-xen.c
@@ -36,7 +36,7 @@ static void set_bitmap(unsigned long *bi
@@ -5344,46 +5469,6 @@ Automatically created from "patches.kernel.org/patch-2.6.29" by xen-port-patches
struct physdev_set_iobitmap set_iobitmap;
if ((from + num <= from) || (from + num > IO_BITMAP_BITS))
---- a/arch/x86/kernel/apic/ipi-xen.c
-+++ b/arch/x86/kernel/apic/ipi-xen.c
-@@ -40,21 +40,29 @@ void send_IPI_self(int vector)
- __send_IPI_shortcut(APIC_DEST_SELF, vector);
- }
-
--void send_IPI_mask_bitmask(cpumask_t cpumask, int vector)
-+void send_IPI_mask_bitmask(const struct cpumask *cpumask, int vector)
- {
-- cpumask_t mask;
- unsigned int cpu;
-
-- cpus_andnot(mask, cpumask, cpu_online_map);
-- WARN_ON(!cpus_empty(mask));
-- for_each_online_cpu(cpu)
-- if (cpu_isset(cpu, cpumask))
-- __send_IPI_one(cpu, vector);
-+ WARN_ON(!cpumask_subset(cpumask, cpu_online_mask));
-+ for_each_cpu_and(cpu, cpumask, cpu_online_mask)
-+ __send_IPI_one(cpu, vector);
- }
-
--void send_IPI_mask_sequence(cpumask_t mask, int vector)
-+void send_IPI_mask_sequence(const struct cpumask *mask, int vector)
- {
- send_IPI_mask_bitmask(mask, vector);
- }
-
-+void send_IPI_mask_allbutself(const struct cpumask *mask, int vector)
-+{
-+ unsigned int query_cpu;
-+ unsigned int this_cpu = smp_processor_id();
-+
-+ WARN_ON(!cpumask_subset(mask, cpu_online_mask));
-+ for_each_cpu_and(query_cpu, mask, cpu_online_mask)
-+ if (query_cpu != this_cpu)
-+ __send_IPI_one(query_cpu, vector);
-+}
-+
- #endif
--- a/arch/x86/kernel/irq-xen.c
+++ b/arch/x86/kernel/irq-xen.c
@@ -5,10 +5,11 @@
@@ -6482,125 +6567,6 @@ Automatically created from "patches.kernel.org/patch-2.6.29" by xen-port-patches
forbid_dac = 1;
}
}
---- a/arch/x86/kernel/process-xen.c
-+++ b/arch/x86/kernel/process-xen.c
-@@ -1,13 +1,17 @@
- #include <linux/errno.h>
- #include <linux/kernel.h>
- #include <linux/mm.h>
-+#include <asm/idle.h>
- #include <linux/smp.h>
- #include <linux/slab.h>
- #include <linux/sched.h>
- #include <linux/module.h>
- #include <linux/pm.h>
- #include <linux/clockchips.h>
-+#include <linux/ftrace.h>
- #include <asm/system.h>
-+#include <asm/apic.h>
-+#include <xen/evtchn.h>
-
- unsigned long idle_halt;
- EXPORT_SYMBOL(idle_halt);
-@@ -70,6 +74,9 @@ EXPORT_SYMBOL(pm_idle);
- */
- void xen_idle(void)
- {
-+ struct power_trace it;
-+
-+ trace_power_start(&it, POWER_CSTATE, 1);
- current_thread_info()->status &= ~TS_POLLING;
- /*
- * TS_POLLING-cleared state must be visible before we
-@@ -82,11 +89,27 @@ void xen_idle(void)
- else
- local_irq_enable();
- current_thread_info()->status |= TS_POLLING;
-+ trace_power_end(&it);
- }
- #ifdef CONFIG_APM_MODULE
- EXPORT_SYMBOL(default_idle);
- #endif
-
-+void stop_this_cpu(void *dummy)
-+{
-+ local_irq_disable();
-+ /*
-+ * Remove this CPU:
-+ */
-+ cpu_clear(smp_processor_id(), cpu_online_map);
-+ disable_all_local_evtchn();
-+
-+ for (;;) {
-+ if (hlt_works(smp_processor_id()))
-+ halt();
-+ }
-+}
-+
- static void do_nothing(void *unused)
- {
- }
-@@ -120,24 +143,37 @@ EXPORT_SYMBOL_GPL(cpu_idle_wait);
- */
- void mwait_idle_with_hints(unsigned long ax, unsigned long cx)
- {
-+ struct power_trace it;
-+
-+ trace_power_start(&it, POWER_CSTATE, (ax>>4)+1);
- if (!need_resched()) {
-+ if (cpu_has(&current_cpu_data, X86_FEATURE_CLFLUSH_MONITOR))
-+ clflush((void *)&current_thread_info()->flags);
-+
- __monitor((void *)&current_thread_info()->flags, 0, 0);
- smp_mb();
- if (!need_resched())
- __mwait(ax, cx);
- }
-+ trace_power_end(&it);
- }
-
- /* Default MONITOR/MWAIT with no hints, used for default C1 state */
- static void mwait_idle(void)
- {
-+ struct power_trace it;
- if (!need_resched()) {
-+ trace_power_start(&it, POWER_CSTATE, 1);
-+ if (cpu_has(&current_cpu_data, X86_FEATURE_CLFLUSH_MONITOR))
-+ clflush((void *)&current_thread_info()->flags);
-+
- __monitor((void *)&current_thread_info()->flags, 0, 0);
- smp_mb();
- if (!need_resched())
- __sti_mwait(0, 0);
- else
- local_irq_enable();
-+ trace_power_end(&it);
- } else
- local_irq_enable();
- }
-@@ -150,9 +186,13 @@ static void mwait_idle(void)
- */
- static void poll_idle(void)
- {
-+ struct power_trace it;
-+
-+ trace_power_start(&it, POWER_CSTATE, 0);
- local_irq_enable();
- while (!need_resched())
- cpu_relax();
-+ trace_power_end(&it);
- }
-
- #ifndef CONFIG_XEN
-@@ -238,7 +278,7 @@ static void c1e_idle(void)
- rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi);
- if (lo & K8_INTP_C1E_ACTIVE_MASK) {
- c1e_detected = 1;
-- if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
-+ if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC))
- mark_tsc_unstable("TSC halt in AMD C1E");
- printk(KERN_INFO "System has AMD C1E enabled\n");
- set_cpu_cap(&boot_cpu_data, X86_FEATURE_AMDC1E);
--- a/arch/x86/kernel/process_32-xen.c
+++ b/arch/x86/kernel/process_32-xen.c
@@ -38,11 +38,13 @@
@@ -6930,6 +6896,125 @@ Automatically created from "patches.kernel.org/patch-2.6.29" by xen-port-patches
__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
struct thread_struct *prev = &prev_p->thread;
+--- a/arch/x86/kernel/process-xen.c
++++ b/arch/x86/kernel/process-xen.c
+@@ -1,13 +1,17 @@
+ #include <linux/errno.h>
+ #include <linux/kernel.h>
+ #include <linux/mm.h>
++#include <asm/idle.h>
+ #include <linux/smp.h>
+ #include <linux/slab.h>
+ #include <linux/sched.h>
+ #include <linux/module.h>
+ #include <linux/pm.h>
+ #include <linux/clockchips.h>
++#include <linux/ftrace.h>
+ #include <asm/system.h>
++#include <asm/apic.h>
++#include <xen/evtchn.h>
+
+ unsigned long idle_halt;
+ EXPORT_SYMBOL(idle_halt);
+@@ -70,6 +74,9 @@ EXPORT_SYMBOL(pm_idle);
+ */
+ void xen_idle(void)
+ {
++ struct power_trace it;
++
++ trace_power_start(&it, POWER_CSTATE, 1);
+ current_thread_info()->status &= ~TS_POLLING;
+ /*
+ * TS_POLLING-cleared state must be visible before we
+@@ -82,11 +89,27 @@ void xen_idle(void)
+ else
+ local_irq_enable();
+ current_thread_info()->status |= TS_POLLING;
++ trace_power_end(&it);
+ }
+ #ifdef CONFIG_APM_MODULE
+ EXPORT_SYMBOL(default_idle);
+ #endif
+
++void stop_this_cpu(void *dummy)
++{
++ local_irq_disable();
++ /*
++ * Remove this CPU:
++ */
++ cpu_clear(smp_processor_id(), cpu_online_map);
++ disable_all_local_evtchn();
++
++ for (;;) {
++ if (hlt_works(smp_processor_id()))
++ halt();
++ }
++}
++
+ static void do_nothing(void *unused)
+ {
+ }
+@@ -120,24 +143,37 @@ EXPORT_SYMBOL_GPL(cpu_idle_wait);
+ */
+ void mwait_idle_with_hints(unsigned long ax, unsigned long cx)
+ {
++ struct power_trace it;
++
++ trace_power_start(&it, POWER_CSTATE, (ax>>4)+1);
+ if (!need_resched()) {
++ if (cpu_has(&current_cpu_data, X86_FEATURE_CLFLUSH_MONITOR))
++ clflush((void *)&current_thread_info()->flags);
++
+ __monitor((void *)&current_thread_info()->flags, 0, 0);
+ smp_mb();
+ if (!need_resched())
+ __mwait(ax, cx);
+ }
++ trace_power_end(&it);
+ }
+
+ /* Default MONITOR/MWAIT with no hints, used for default C1 state */
+ static void mwait_idle(void)
+ {
++ struct power_trace it;
+ if (!need_resched()) {
++ trace_power_start(&it, POWER_CSTATE, 1);
++ if (cpu_has(&current_cpu_data, X86_FEATURE_CLFLUSH_MONITOR))
++ clflush((void *)&current_thread_info()->flags);
++
+ __monitor((void *)&current_thread_info()->flags, 0, 0);
+ smp_mb();
+ if (!need_resched())
+ __sti_mwait(0, 0);
+ else
+ local_irq_enable();
++ trace_power_end(&it);
+ } else
+ local_irq_enable();
+ }
+@@ -150,9 +186,13 @@ static void mwait_idle(void)
+ */
+ static void poll_idle(void)
+ {
++ struct power_trace it;
++
++ trace_power_start(&it, POWER_CSTATE, 0);
+ local_irq_enable();
+ while (!need_resched())
+ cpu_relax();
++ trace_power_end(&it);
+ }
+
+ #ifndef CONFIG_XEN
+@@ -238,7 +278,7 @@ static void c1e_idle(void)
+ rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi);
+ if (lo & K8_INTP_C1E_ACTIVE_MASK) {
+ c1e_detected = 1;
+- if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
++ if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC))
+ mark_tsc_unstable("TSC halt in AMD C1E");
+ printk(KERN_INFO "System has AMD C1E enabled\n");
+ set_cpu_cap(&boot_cpu_data, X86_FEATURE_AMDC1E);
--- a/arch/x86/kernel/setup-xen.c
+++ b/arch/x86/kernel/setup-xen.c
@@ -93,11 +93,13 @@
@@ -7637,6 +7722,19 @@ Automatically created from "patches.kernel.org/patch-2.6.29" by xen-port-patches
base = __vsyscall_gtod_data.clock.cycle_last;
mask = __vsyscall_gtod_data.clock.mask;
mult = __vsyscall_gtod_data.clock.mult;
+--- a/arch/x86/Makefile
++++ b/arch/x86/Makefile
+@@ -162,8 +162,8 @@ BOOT_TARGETS = bzlilo bzdisk fdimage fdi
+ PHONY += bzImage vmlinuz $(BOOT_TARGETS)
+
+ ifdef CONFIG_XEN
+-KBUILD_CPPFLAGS := -D__XEN_INTERFACE_VERSION__=$(CONFIG_XEN_INTERFACE_VERSION) \
+- -I$(srctree)/arch/x86/include/mach-xen $(KBUILD_CPPFLAGS)
++LINUXINCLUDE := -D__XEN_INTERFACE_VERSION__=$(CONFIG_XEN_INTERFACE_VERSION) \
++ -I$(srctree)/arch/x86/include/mach-xen $(LINUXINCLUDE)
+
+ ifdef CONFIG_X86_64
+ LDFLAGS_vmlinux := -e startup_64
--- a/arch/x86/mm/fault-xen.c
+++ b/arch/x86/mm/fault-xen.c
@@ -53,7 +53,7 @@
@@ -9030,26 +9128,6 @@ Automatically created from "patches.kernel.org/patch-2.6.29" by xen-port-patches
- msi_acpi_init();
+ INIT_LIST_HEAD(&dev->msi_list);
}
---- a/drivers/xen/Kconfig
-+++ b/drivers/xen/Kconfig
-@@ -457,6 +457,7 @@ config XEN_BACKEND
-
- config XENFS
- tristate "Xen filesystem"
-+ depends on PARAVIRT_XEN
- default y
- help
- The xen filesystem provides a way for domains to share
---- a/drivers/xen/Makefile
-+++ b/drivers/xen/Makefile
-@@ -15,6 +15,7 @@ obj-$(CONFIG_XEN) += features.o $(xen-
- obj-$(CONFIG_HOTPLUG_CPU) += $(xen-hotplug-y)
- obj-$(CONFIG_XEN_XENCOMM) += xencomm.o
- obj-$(CONFIG_XEN_BALLOON) += $(xen-balloon-y)
-+obj-$(CONFIG_XENFS) += xenfs/
- obj-$(CONFIG_XEN_BLKDEV_BACKEND) += blkback/
- obj-$(CONFIG_XEN_BLKDEV_TAP) += blktap/
- obj-$(filter m,$(CONFIG_XEN_BLKDEV_TAP2)) += blktap2/ blktap2-new/
--- a/drivers/xen/balloon/sysfs.c
+++ b/drivers/xen/balloon/sysfs.c
@@ -67,7 +67,7 @@ static ssize_t store_target_kb(struct sy
@@ -9952,6 +10030,26 @@ Automatically created from "patches.kernel.org/patch-2.6.29" by xen-port-patches
if (cpu == stop)
break;
}
+--- a/drivers/xen/Kconfig
++++ b/drivers/xen/Kconfig
+@@ -457,6 +457,7 @@ config XEN_BACKEND
+
+ config XENFS
+ tristate "Xen filesystem"
++ depends on PARAVIRT_XEN
+ default y
+ help
+ The xen filesystem provides a way for domains to share
+--- a/drivers/xen/Makefile
++++ b/drivers/xen/Makefile
+@@ -15,6 +15,7 @@ obj-$(CONFIG_XEN) += features.o $(xen-
+ obj-$(CONFIG_HOTPLUG_CPU) += $(xen-hotplug-y)
+ obj-$(CONFIG_XEN_XENCOMM) += xencomm.o
+ obj-$(CONFIG_XEN_BALLOON) += $(xen-balloon-y)
++obj-$(CONFIG_XENFS) += xenfs/
+ obj-$(CONFIG_XEN_BLKDEV_BACKEND) += blkback/
+ obj-$(CONFIG_XEN_BLKDEV_TAP) += blktap/
+ obj-$(filter m,$(CONFIG_XEN_BLKDEV_TAP2)) += blktap2/ blktap2-new/
--- a/drivers/xen/netback/interface.c
+++ b/drivers/xen/netback/interface.c
@@ -229,6 +229,15 @@ static struct ethtool_ops network_ethtoo
@@ -10032,7 +10130,7 @@ Automatically created from "patches.kernel.org/patch-2.6.29" by xen-port-patches
}
/*
* Add following to poll() function in NAPI driver (Tigon3 is example):
-@@ -1551,7 +1551,6 @@ static void net_tx_action(unsigned long
+@@ -1552,7 +1552,6 @@ static void net_tx_action(unsigned long
dev->stats.rx_packets++;
netif_rx(skb);
@@ -10171,6 +10269,47 @@ Automatically created from "patches.kernel.org/patch-2.6.29" by xen-port-patches
* closedown of this driver and its peer.
*/
void xenbus_dev_fatal(struct xenbus_device *dev, int err, const char *fmt, ...)
+--- a/drivers/xen/xenbus/xenbus_probe_backend.c
++++ b/drivers/xen/xenbus/xenbus_probe_backend.c
+@@ -36,6 +36,7 @@
+ __FUNCTION__, __LINE__, ##args)
+
+ #include <linux/kernel.h>
++#include <linux/version.h>
+ #include <linux/err.h>
+ #include <linux/string.h>
+ #include <linux/ctype.h>
+@@ -108,6 +109,10 @@ static int backend_bus_id(char bus_id[XE
+ return 0;
+ }
+
++static struct device_attribute xenbus_backend_attrs[] = {
++ __ATTR_NULL
++};
++
+ static struct xen_bus_type xenbus_backend = {
+ .root = "backend",
+ .levels = 3, /* backend/type/<frontend>/<id> */
+@@ -115,12 +120,13 @@ static struct xen_bus_type xenbus_backen
+ .probe = xenbus_probe_backend,
+ .error = -ENODEV,
+ .bus = {
+- .name = "xen-backend",
+- .match = xenbus_match,
+- .probe = xenbus_dev_probe,
+- .remove = xenbus_dev_remove,
+-// .shutdown = xenbus_dev_shutdown,
+- .uevent = xenbus_uevent_backend,
++ .name = "xen-backend",
++ .match = xenbus_match,
++ .probe = xenbus_dev_probe,
++ .remove = xenbus_dev_remove,
++// .shutdown = xenbus_dev_shutdown,
++ .uevent = xenbus_uevent_backend,
++ .dev_attrs = xenbus_backend_attrs,
+ },
+ .dev = {
+ .bus_id = "xen-backend",
--- a/drivers/xen/xenbus/xenbus_probe.c
+++ b/drivers/xen/xenbus/xenbus_probe.c
@@ -42,6 +42,7 @@
@@ -10317,47 +10456,6 @@ Automatically created from "patches.kernel.org/patch-2.6.29" by xen-port-patches
#endif
#if defined(CONFIG_XEN_BACKEND) || defined(CONFIG_XEN_BACKEND_MODULE)
---- a/drivers/xen/xenbus/xenbus_probe_backend.c
-+++ b/drivers/xen/xenbus/xenbus_probe_backend.c
-@@ -36,6 +36,7 @@
- __FUNCTION__, __LINE__, ##args)
-
- #include <linux/kernel.h>
-+#include <linux/version.h>
- #include <linux/err.h>
- #include <linux/string.h>
- #include <linux/ctype.h>
-@@ -108,6 +109,10 @@ static int backend_bus_id(char bus_id[XE
- return 0;
- }
-
-+static struct device_attribute xenbus_backend_attrs[] = {
-+ __ATTR_NULL
-+};
-+
- static struct xen_bus_type xenbus_backend = {
- .root = "backend",
- .levels = 3, /* backend/type/<frontend>/<id> */
-@@ -115,12 +120,13 @@ static struct xen_bus_type xenbus_backen
- .probe = xenbus_probe_backend,
- .error = -ENODEV,
- .bus = {
-- .name = "xen-backend",
-- .match = xenbus_match,
-- .probe = xenbus_dev_probe,
-- .remove = xenbus_dev_remove,
--// .shutdown = xenbus_dev_shutdown,
-- .uevent = xenbus_uevent_backend,
-+ .name = "xen-backend",
-+ .match = xenbus_match,
-+ .probe = xenbus_dev_probe,
-+ .remove = xenbus_dev_remove,
-+// .shutdown = xenbus_dev_shutdown,
-+ .uevent = xenbus_uevent_backend,
-+ .dev_attrs = xenbus_backend_attrs,
- },
- .dev = {
- .bus_id = "xen-backend",
--- a/drivers/xen/xenbus/xenbus_xs.c
+++ b/drivers/xen/xenbus/xenbus_xs.c
@@ -227,6 +227,9 @@ void *xenbus_dev_request_and_reply(struc
diff --git a/patches.xen/xen3-patch-2.6.30 b/patches.xen/xen3-patch-2.6.30
index 0d84456966..2c428c2b77 100644
--- a/patches.xen/xen3-patch-2.6.30
+++ b/patches.xen/xen3-patch-2.6.30
@@ -7,6 +7,120 @@ Patch-mainline: Never, SUSE-Xen specific
Acked-by: Jeff Mahoney <jeffm@suse.com>
Automatically created from "patches.kernel.org/patch-2.6.30" by xen-port-patches.py
+---
+ arch/ia64/include/asm/xen/hypervisor.h | 2
+ arch/ia64/kernel/vmlinux.lds.S | 2
+ arch/x86/Kconfig | 16
+ arch/x86/Makefile | 6
+ arch/x86/boot/Makefile | 8
+ arch/x86/ia32/ia32entry-xen.S | 4
+ arch/x86/include/asm/kexec.h | 6
+ arch/x86/include/asm/page_64_types.h | 8
+ arch/x86/include/mach-xen/asm/desc.h | 12
+ arch/x86/include/mach-xen/asm/fixmap.h | 152 +
+ arch/x86/include/mach-xen/asm/fixmap_32.h | 125 -
+ arch/x86/include/mach-xen/asm/fixmap_64.h | 90 -
+ arch/x86/include/mach-xen/asm/highmem.h | 1
+ arch/x86/include/mach-xen/asm/hypervisor.h | 13
+ arch/x86/include/mach-xen/asm/io.h | 130 +
+ arch/x86/include/mach-xen/asm/ipi.h | 13
+ arch/x86/include/mach-xen/asm/irq_vectors.h | 81 -
+ arch/x86/include/mach-xen/asm/irqflags.h | 3
+ arch/x86/include/mach-xen/asm/mmu_context.h | 122 +
+ arch/x86/include/mach-xen/asm/mmu_context_32.h | 83 -
+ arch/x86/include/mach-xen/asm/mmu_context_64.h | 106 -
+ arch/x86/include/mach-xen/asm/mutex.h | 3
+ arch/x86/include/mach-xen/asm/pci.h | 46
+ arch/x86/include/mach-xen/asm/pgtable-3level-defs.h | 24
+ arch/x86/include/mach-xen/asm/pgtable-3level.h | 35
+ arch/x86/include/mach-xen/asm/pgtable-3level_types.h | 44
+ arch/x86/include/mach-xen/asm/pgtable.h | 520 +++---
+ arch/x86/include/mach-xen/asm/pgtable_32.h | 100 -
+ arch/x86/include/mach-xen/asm/pgtable_64.h | 122 -
+ arch/x86/include/mach-xen/asm/pgtable_64_types.h | 63
+ arch/x86/include/mach-xen/asm/pgtable_types.h | 388 ++++
+ arch/x86/include/mach-xen/asm/processor.h | 67
+ arch/x86/include/mach-xen/asm/smp.h | 86 -
+ arch/x86/include/mach-xen/asm/spinlock.h | 40
+ arch/x86/include/mach-xen/asm/spinlock_types.h | 14
+ arch/x86/include/mach-xen/asm/system.h | 90 -
+ arch/x86/include/mach-xen/asm/tlbflush.h | 11
+ arch/x86/kernel/Makefile | 8
+ arch/x86/kernel/acpi/boot.c | 9
+ arch/x86/kernel/apic/Makefile | 6
+ arch/x86/kernel/apic/apic-xen.c | 2
+ arch/x86/kernel/apic/io_apic-xen.c | 855 +++++-----
+ arch/x86/kernel/apic/ipi-xen.c | 50
+ arch/x86/kernel/apic/probe_32-xen.c | 68
+ arch/x86/kernel/asm-offsets_32.c | 5
+ arch/x86/kernel/cpu/common-xen.c | 644 ++++----
+ arch/x86/kernel/cpu/intel.c | 4
+ arch/x86/kernel/e820-xen.c | 141 +
+ arch/x86/kernel/early_printk-xen.c | 22
+ arch/x86/kernel/entry_32-xen.S | 503 +++---
+ arch/x86/kernel/entry_64-xen.S | 53
+ arch/x86/kernel/head-xen.c | 156 +
+ arch/x86/kernel/head32-xen.c | 33
+ arch/x86/kernel/head64-xen.c | 54
+ arch/x86/kernel/head_32-xen.S | 51
+ arch/x86/kernel/head_64-xen.S | 17
+ arch/x86/kernel/ioport-xen.c | 3
+ arch/x86/kernel/irq-xen.c | 106 +
+ arch/x86/kernel/machine_kexec_64.c | 11
+ arch/x86/kernel/microcode_core-xen.c | 36
+ arch/x86/kernel/mpparse-xen.c | 543 +++---
+ arch/x86/kernel/pci-dma-xen.c | 35
+ arch/x86/kernel/pci-nommu-xen.c | 77
+ arch/x86/kernel/process-xen.c | 207 ++
+ arch/x86/kernel/process_32-xen.c | 210 --
+ arch/x86/kernel/process_64-xen.c | 230 --
+ arch/x86/kernel/setup-xen.c | 224 +-
+ arch/x86/kernel/setup_percpu.c | 4
+ arch/x86/kernel/smp-xen.c | 10
+ arch/x86/kernel/time-xen.c | 5
+ arch/x86/kernel/traps-xen.c | 66
+ arch/x86/kernel/vmlinux.lds.S | 4
+ arch/x86/mach-xen/Makefile | 5
+ arch/x86/mach-xen/setup.c | 186 --
+ arch/x86/mm/Makefile | 2
+ arch/x86/mm/fault-xen.c | 1519 ++++++++++---------
+ arch/x86/mm/highmem_32-xen.c | 93 -
+ arch/x86/mm/hypervisor.c | 8
+ arch/x86/mm/init-xen.c | 461 +++++
+ arch/x86/mm/init_32-xen.c | 509 ++----
+ arch/x86/mm/init_64-xen.c | 504 +-----
+ arch/x86/mm/iomap_32-xen.c | 30
+ arch/x86/mm/ioremap-xen.c | 102 -
+ arch/x86/mm/pageattr-xen.c | 290 ++-
+ arch/x86/mm/pat-xen.c | 246 ---
+ arch/x86/mm/pgtable-xen.c | 33
+ arch/x86/mm/pgtable_32-xen.c | 19
+ drivers/acpi/Makefile | 4
+ drivers/acpi/acpica/hwsleep.c | 2
+ drivers/oprofile/oprofile_files.c | 1
+ drivers/pci/msi-xen.c | 86 -
+ drivers/xen/Kconfig | 5
+ drivers/xen/char/mem.c | 14
+ drivers/xen/core/Makefile | 2
+ drivers/xen/core/evtchn.c | 245 ++-
+ drivers/xen/core/smpboot.c | 22
+ drivers/xen/core/spinlock.c | 34
+ drivers/xen/netback/interface.c | 3
+ drivers/xen/netback/netback.c | 7
+ drivers/xen/netfront/netfront.c | 13
+ drivers/xen/pciback/conf_space_header.c | 4
+ drivers/xen/pciback/pci_stub.c | 46
+ drivers/xen/sfc_netfront/accel_msg.c | 4
+ drivers/xen/usbback/usbstub.c | 2
+ drivers/xen/usbfront/usbfront-dbg.c | 2
+ drivers/xen/usbfront/xenbus.c | 4
+ drivers/xen/xenbus/xenbus_probe.c | 6
+ drivers/xen/xenbus/xenbus_probe_backend.c | 2
+ kernel/sched.c | 8
+ lib/swiotlb-xen.c | 79
+ mm/page_alloc.c | 4
+ 111 files changed, 6339 insertions(+), 5355 deletions(-)
+
--- a/arch/ia64/include/asm/xen/hypervisor.h
+++ b/arch/ia64/include/asm/xen/hypervisor.h
@@ -34,13 +34,13 @@
@@ -35,87 +149,6 @@ Automatically created from "patches.kernel.org/patch-2.6.30" by xen-port-patches
. = ALIGN(PAGE_SIZE);
__xen_start_gate_section = .;
*(.data..gate.xen)
---- a/arch/x86/Kconfig
-+++ b/arch/x86/Kconfig
-@@ -49,8 +49,8 @@ config X86
- select HAVE_REGS_AND_STACK_ACCESS_API
- select HAVE_DMA_API_DEBUG
- select HAVE_KERNEL_GZIP
-- select HAVE_KERNEL_BZIP2
-- select HAVE_KERNEL_LZMA
-+ select HAVE_KERNEL_BZIP2 if !XEN
-+ select HAVE_KERNEL_LZMA if !XEN
- select HAVE_KERNEL_XZ
- select HAVE_KERNEL_LZO
- select HAVE_HW_BREAKPOINT
-@@ -324,11 +324,11 @@ config X86_XEN
-
- config X86_BIGSMP
- bool "Support for big SMP systems with more than 8 CPUs"
-- depends on X86_32 && SMP
-+ depends on X86_32 && SMP && !XEN
- ---help---
- This option is needed for the systems that have more than 8 CPUs
-
--if X86_32
-+if X86_32 && !XEN
- config X86_EXTENDED_PLATFORM
- bool "Support for extended (non-PC) x86 platforms"
- default y
-@@ -358,7 +358,7 @@ config X86_64_XEN
- help
- This option will compile a kernel compatible with Xen hypervisor
-
--if X86_64
-+if X86_64 && !XEN
- config X86_EXTENDED_PLATFORM
- bool "Support for extended (non-PC) x86 platforms"
- default y
-@@ -783,7 +783,7 @@ config MAXSMP
-
- config NR_CPUS
- int "Maximum number of CPUs" if SMP && !MAXSMP
-- range 2 8 if SMP && X86_32 && !X86_BIGSMP
-+ range 2 8 if SMP && X86_32 && !X86_BIGSMP && !X86_XEN
- range 2 512 if SMP && !MAXSMP
- default "1" if !SMP
- default "4096" if MAXSMP
-@@ -868,10 +868,6 @@ config X86_VISWS_APIC
- def_bool y
- depends on X86_32 && X86_VISWS
-
--config X86_XEN_GENAPIC
-- def_bool y
-- depends on X86_64_XEN
--
- config X86_REROUTE_FOR_BROKEN_BOOT_IRQS
- bool "Reroute for broken boot IRQs"
- depends on X86_IO_APIC && !XEN
---- a/arch/x86/Makefile
-+++ b/arch/x86/Makefile
-@@ -117,10 +117,6 @@ endif
- # prevent gcc from generating any FP code by mistake
- KBUILD_CFLAGS += $(call cc-option,-mno-sse -mno-mmx -mno-sse2 -mno-3dnow,)
-
--# Xen subarch support
--mflags-$(CONFIG_XEN) := -Iarch/x86/include/mach-xen
--mcore-$(CONFIG_XEN) := arch/x86/mach-xen/
--
- KBUILD_CFLAGS += $(mflags-y)
- KBUILD_AFLAGS += $(mflags-y)
-
-@@ -193,10 +189,10 @@ endif
- $(Q)$(MAKE) $(build)=$(boot) $(KBUILD_IMAGE)
- $(Q)mkdir -p $(objtree)/arch/$(UTS_MACHINE)/boot
- $(Q)ln -fsn ../../x86/boot/bzImage $(objtree)/arch/$(UTS_MACHINE)/boot/$@
-+endif
-
- $(BOOT_TARGETS): vmlinux
- $(Q)$(MAKE) $(build)=$(boot) $@
--endif
-
- PHONY += install
- install:
--- a/arch/x86/boot/Makefile
+++ b/arch/x86/boot/Makefile
@@ -198,6 +198,12 @@ $(obj)/vmlinux-stripped: OBJCOPYFLAGS :=
@@ -234,184 +267,6 @@ Automatically created from "patches.kernel.org/patch-2.6.30" by xen-port-patches
BUG();
}
#endif
---- a/arch/x86/include/mach-xen/asm/fixmap.h
-+++ b/arch/x86/include/mach-xen/asm/fixmap.h
-@@ -1,11 +1,154 @@
-+/*
-+ * fixmap.h: compile-time virtual memory allocation
-+ *
-+ * This file is subject to the terms and conditions of the GNU General Public
-+ * License. See the file "COPYING" in the main directory of this archive
-+ * for more details.
-+ *
-+ * Copyright (C) 1998 Ingo Molnar
-+ *
-+ * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
-+ * x86_32 and x86_64 integration by Gustavo F. Padovan, February 2009
-+ */
-+
- #ifndef _ASM_X86_FIXMAP_H
- #define _ASM_X86_FIXMAP_H
-
-+#ifndef __ASSEMBLY__
-+#include <linux/kernel.h>
-+#include <asm/acpi.h>
-+#include <asm/apicdef.h>
-+#include <asm/page.h>
-+#ifdef CONFIG_X86_32
-+#include <linux/threads.h>
-+#include <asm/kmap_types.h>
-+#else
-+#include <asm/vsyscall.h>
-+#endif
-+
-+/*
-+ * We can't declare FIXADDR_TOP as variable for x86_64 because vsyscall
-+ * uses fixmaps that relies on FIXADDR_TOP for proper address calculation.
-+ * Because of this, FIXADDR_TOP x86 integration was left as later work.
-+ */
-+#ifdef CONFIG_X86_32
-+/* used by vmalloc.c, vsyscall.lds.S.
-+ *
-+ * Leave one empty page between vmalloc'ed areas and
-+ * the start of the fixmap.
-+ */
-+extern unsigned long __FIXADDR_TOP;
-+#define FIXADDR_TOP ((unsigned long)__FIXADDR_TOP)
-+
-+#define FIXADDR_USER_START __fix_to_virt(FIX_VDSO)
-+#define FIXADDR_USER_END __fix_to_virt(FIX_VDSO - 1)
-+#else
-+#define FIXADDR_TOP (VSYSCALL_END-PAGE_SIZE)
-+
-+/* Only covers 32bit vsyscalls currently. Need another set for 64bit. */
-+#define FIXADDR_USER_START ((unsigned long)VSYSCALL32_VSYSCALL)
-+#define FIXADDR_USER_END (FIXADDR_USER_START + PAGE_SIZE)
-+#endif
-+
-+
-+/*
-+ * Here we define all the compile-time 'special' virtual
-+ * addresses. The point is to have a constant address at
-+ * compile time, but to set the physical address only
-+ * in the boot process.
-+ * for x86_32: We allocate these special addresses
-+ * from the end of virtual memory (0xfffff000) backwards.
-+ * Also this lets us do fail-safe vmalloc(), we
-+ * can guarantee that these special addresses and
-+ * vmalloc()-ed addresses never overlap.
-+ *
-+ * These 'compile-time allocated' memory buffers are
-+ * fixed-size 4k pages (or larger if used with an increment
-+ * higher than 1). Use set_fixmap(idx,phys) to associate
-+ * physical memory with fixmap indices.
-+ *
-+ * TLB entries of such buffers will not be flushed across
-+ * task switches.
-+ */
-+enum fixed_addresses {
- #ifdef CONFIG_X86_32
--# include "fixmap_32.h"
-+ FIX_HOLE,
-+ FIX_VDSO,
- #else
--# include "fixmap_64.h"
-+ VSYSCALL_LAST_PAGE,
-+ VSYSCALL_FIRST_PAGE = VSYSCALL_LAST_PAGE
-+ + ((VSYSCALL_END-VSYSCALL_START) >> PAGE_SHIFT) - 1,
-+ VSYSCALL_HPET,
-+#endif
-+ FIX_DBGP_BASE,
-+ FIX_EARLYCON_MEM_BASE,
-+#ifdef CONFIG_X86_LOCAL_APIC
-+ FIX_APIC_BASE, /* local (CPU) APIC) -- required for SMP or not */
- #endif
-+#ifndef CONFIG_XEN
-+#ifdef CONFIG_X86_IO_APIC
-+ FIX_IO_APIC_BASE_0,
-+ FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS - 1,
-+#endif
-+#else
-+ FIX_SHARED_INFO,
-+#define NR_FIX_ISAMAPS 256
-+ FIX_ISAMAP_END,
-+ FIX_ISAMAP_BEGIN = FIX_ISAMAP_END + NR_FIX_ISAMAPS - 1,
-+#endif
-+#ifdef CONFIG_X86_VISWS_APIC
-+ FIX_CO_CPU, /* Cobalt timer */
-+ FIX_CO_APIC, /* Cobalt APIC Redirection Table */
-+ FIX_LI_PCIA, /* Lithium PCI Bridge A */
-+ FIX_LI_PCIB, /* Lithium PCI Bridge B */
-+#endif
-+#ifdef CONFIG_X86_F00F_BUG
-+ FIX_F00F_IDT, /* Virtual mapping for IDT */
-+#endif
-+#ifdef CONFIG_X86_CYCLONE_TIMER
-+ FIX_CYCLONE_TIMER, /*cyclone timer register*/
-+#endif
-+#ifdef CONFIG_X86_32
-+ FIX_KMAP_BEGIN, /* reserved pte's for temporary kernel mappings */
-+ FIX_KMAP_END = FIX_KMAP_BEGIN+(KM_TYPE_NR*NR_CPUS)-1,
-+#ifdef CONFIG_PCI_MMCONFIG
-+ FIX_PCIE_MCFG,
-+#endif
-+#endif
-+#ifdef CONFIG_PARAVIRT
-+ FIX_PARAVIRT_BOOTMAP,
-+#endif
-+ FIX_TEXT_POKE0, /* reserve 2 pages for text_poke() */
-+ FIX_TEXT_POKE1,
-+ __end_of_permanent_fixed_addresses,
-+#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
-+ FIX_OHCI1394_BASE,
-+#endif
-+ /*
-+ * 256 temporary boot-time mappings, used by early_ioremap(),
-+ * before ioremap() is functional.
-+ *
-+ * We round it up to the next 256 pages boundary so that we
-+ * can have a single pgd entry and a single pte table:
-+ */
-+#define NR_FIX_BTMAPS 64
-+#define FIX_BTMAPS_SLOTS 4
-+ FIX_BTMAP_END = __end_of_permanent_fixed_addresses + 256 -
-+ (__end_of_permanent_fixed_addresses & 255),
-+ FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS*FIX_BTMAPS_SLOTS - 1,
-+#ifdef CONFIG_X86_32
-+ FIX_WP_TEST,
-+#endif
-+ __end_of_fixed_addresses
-+};
-+
-+
-+extern void reserve_top_address(unsigned long reserve);
-+
-+#define FIXADDR_SIZE (__end_of_permanent_fixed_addresses << PAGE_SHIFT)
-+#define FIXADDR_BOOT_SIZE (__end_of_fixed_addresses << PAGE_SHIFT)
-+#define FIXADDR_START (FIXADDR_TOP - FIXADDR_SIZE)
-+#define FIXADDR_BOOT_START (FIXADDR_TOP - FIXADDR_BOOT_SIZE)
-
- extern int fixmaps_set;
-
-@@ -13,10 +156,10 @@ extern pte_t *kmap_pte;
- extern pgprot_t kmap_prot;
- extern pte_t *pkmap_page_table;
-
--void xen_set_fixmap(enum fixed_addresses, maddr_t, pgprot_t);
-+void xen_set_fixmap(enum fixed_addresses, phys_addr_t, pgprot_t);
-
- static inline void __set_fixmap(enum fixed_addresses idx,
-- maddr_t phys, pgprot_t flags)
-+ phys_addr_t phys, pgprot_t flags)
- {
- xen_set_fixmap(idx, phys, flags);
- }
-@@ -65,4 +208,5 @@ static inline unsigned long virt_to_fix(
- BUG_ON(vaddr >= FIXADDR_TOP || vaddr < FIXADDR_START);
- return __virt_to_fix(vaddr);
- }
-+#endif /* !__ASSEMBLY__ */
- #endif /* _ASM_X86_FIXMAP_H */
--- a/arch/x86/include/mach-xen/asm/fixmap_32.h
+++ /dev/null
@@ -1,125 +0,0 @@
@@ -633,6 +488,184 @@ Automatically created from "patches.kernel.org/patch-2.6.30" by xen-port-patches
-#define FIXADDR_USER_END (FIXADDR_USER_START + PAGE_SIZE)
-
-#endif /* _ASM_X86_FIXMAP_64_H */
+--- a/arch/x86/include/mach-xen/asm/fixmap.h
++++ b/arch/x86/include/mach-xen/asm/fixmap.h
+@@ -1,11 +1,154 @@
++/*
++ * fixmap.h: compile-time virtual memory allocation
++ *
++ * This file is subject to the terms and conditions of the GNU General Public
++ * License. See the file "COPYING" in the main directory of this archive
++ * for more details.
++ *
++ * Copyright (C) 1998 Ingo Molnar
++ *
++ * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
++ * x86_32 and x86_64 integration by Gustavo F. Padovan, February 2009
++ */
++
+ #ifndef _ASM_X86_FIXMAP_H
+ #define _ASM_X86_FIXMAP_H
+
++#ifndef __ASSEMBLY__
++#include <linux/kernel.h>
++#include <asm/acpi.h>
++#include <asm/apicdef.h>
++#include <asm/page.h>
++#ifdef CONFIG_X86_32
++#include <linux/threads.h>
++#include <asm/kmap_types.h>
++#else
++#include <asm/vsyscall.h>
++#endif
++
++/*
++ * We can't declare FIXADDR_TOP as variable for x86_64 because vsyscall
++ * uses fixmaps that relies on FIXADDR_TOP for proper address calculation.
++ * Because of this, FIXADDR_TOP x86 integration was left as later work.
++ */
++#ifdef CONFIG_X86_32
++/* used by vmalloc.c, vsyscall.lds.S.
++ *
++ * Leave one empty page between vmalloc'ed areas and
++ * the start of the fixmap.
++ */
++extern unsigned long __FIXADDR_TOP;
++#define FIXADDR_TOP ((unsigned long)__FIXADDR_TOP)
++
++#define FIXADDR_USER_START __fix_to_virt(FIX_VDSO)
++#define FIXADDR_USER_END __fix_to_virt(FIX_VDSO - 1)
++#else
++#define FIXADDR_TOP (VSYSCALL_END-PAGE_SIZE)
++
++/* Only covers 32bit vsyscalls currently. Need another set for 64bit. */
++#define FIXADDR_USER_START ((unsigned long)VSYSCALL32_VSYSCALL)
++#define FIXADDR_USER_END (FIXADDR_USER_START + PAGE_SIZE)
++#endif
++
++
++/*
++ * Here we define all the compile-time 'special' virtual
++ * addresses. The point is to have a constant address at
++ * compile time, but to set the physical address only
++ * in the boot process.
++ * for x86_32: We allocate these special addresses
++ * from the end of virtual memory (0xfffff000) backwards.
++ * Also this lets us do fail-safe vmalloc(), we
++ * can guarantee that these special addresses and
++ * vmalloc()-ed addresses never overlap.
++ *
++ * These 'compile-time allocated' memory buffers are
++ * fixed-size 4k pages (or larger if used with an increment
++ * higher than 1). Use set_fixmap(idx,phys) to associate
++ * physical memory with fixmap indices.
++ *
++ * TLB entries of such buffers will not be flushed across
++ * task switches.
++ */
++enum fixed_addresses {
+ #ifdef CONFIG_X86_32
+-# include "fixmap_32.h"
++ FIX_HOLE,
++ FIX_VDSO,
+ #else
+-# include "fixmap_64.h"
++ VSYSCALL_LAST_PAGE,
++ VSYSCALL_FIRST_PAGE = VSYSCALL_LAST_PAGE
++ + ((VSYSCALL_END-VSYSCALL_START) >> PAGE_SHIFT) - 1,
++ VSYSCALL_HPET,
++#endif
++ FIX_DBGP_BASE,
++ FIX_EARLYCON_MEM_BASE,
++#ifdef CONFIG_X86_LOCAL_APIC
++ FIX_APIC_BASE, /* local (CPU) APIC) -- required for SMP or not */
+ #endif
++#ifndef CONFIG_XEN
++#ifdef CONFIG_X86_IO_APIC
++ FIX_IO_APIC_BASE_0,
++ FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS - 1,
++#endif
++#else
++ FIX_SHARED_INFO,
++#define NR_FIX_ISAMAPS 256
++ FIX_ISAMAP_END,
++ FIX_ISAMAP_BEGIN = FIX_ISAMAP_END + NR_FIX_ISAMAPS - 1,
++#endif
++#ifdef CONFIG_X86_VISWS_APIC
++ FIX_CO_CPU, /* Cobalt timer */
++ FIX_CO_APIC, /* Cobalt APIC Redirection Table */
++ FIX_LI_PCIA, /* Lithium PCI Bridge A */
++ FIX_LI_PCIB, /* Lithium PCI Bridge B */
++#endif
++#ifdef CONFIG_X86_F00F_BUG
++ FIX_F00F_IDT, /* Virtual mapping for