author     Takashi Iwai <tiwai@suse.de>   2018-01-04 09:11:03 +0100
committer  Takashi Iwai <tiwai@suse.de>   2018-01-04 09:11:03 +0100
commit     23990647d9f98086679c537d0550c1fccaf04d63 (patch)
tree       d8e54d53914d835ab2ac5c9b71f16d04000e7892
parent     0298faa18daedbea35fccd6e88faaec5b1ecfe3f (diff)
parent     fc84c7924cd045f96c47a0c888baf55871b1c5bb (diff)
Merge branch 'SLE12-SP3' into openSUSE-42.3 (rpm-4.4.104-39)

Conflicts:
	series.conf

suse-commit: 7db19124eed3ca155d26fcbf3a2d3c0329e4ca60
-rw-r--r--  Documentation/kernel-parameters.txt | 18
-rw-r--r--  arch/arm/include/asm/mmu_context.h | 2
-rw-r--r--  arch/powerpc/include/asm/barrier.h | 1
-rw-r--r--  arch/powerpc/include/asm/exception-64s.h | 55
-rw-r--r--  arch/powerpc/include/asm/feature-fixups.h | 15
-rw-r--r--  arch/powerpc/include/asm/setup.h | 4
-rw-r--r--  arch/powerpc/kernel/entry_64.S | 37
-rw-r--r--  arch/powerpc/kernel/exceptions-64s.S | 12
-rw-r--r--  arch/powerpc/kernel/setup_64.c | 77
-rw-r--r--  arch/powerpc/kernel/sysfs.c | 40
-rw-r--r--  arch/powerpc/kernel/vmlinux.lds.S | 9
-rw-r--r--  arch/powerpc/kvm/book3s_hv_rmhandlers.S | 7
-rw-r--r--  arch/powerpc/kvm/book3s_rmhandlers.S | 4
-rw-r--r--  arch/powerpc/lib/feature-fixups.c | 28
-rw-r--r--  arch/powerpc/platforms/powernv/setup.c | 2
-rw-r--r--  arch/powerpc/platforms/pseries/setup.c | 2
-rw-r--r--  arch/s390/include/asm/alternative.h | 149
-rw-r--r--  arch/s390/include/asm/barrier.h | 9
-rw-r--r--  arch/s390/include/asm/processor.h | 1
-rw-r--r--  arch/s390/kernel/Makefile | 2
-rw-r--r--  arch/s390/kernel/alternative.c | 123
-rw-r--r--  arch/s390/kernel/entry.S | 47
-rw-r--r--  arch/s390/kernel/ipl.c | 1
-rw-r--r--  arch/s390/kernel/module.c | 13
-rw-r--r--  arch/s390/kernel/setup.c | 3
-rw-r--r--  arch/s390/kernel/smp.c | 2
-rw-r--r--  arch/s390/kernel/vmlinux.lds.S | 26
-rw-r--r--  arch/x86/Kconfig | 2
-rw-r--r--  arch/x86/boot/compressed/misc.h | 1
-rw-r--r--  arch/x86/entry/entry_64.S | 266
-rw-r--r--  arch/x86/entry/entry_64_compat.S | 19
-rw-r--r--  arch/x86/include/asm/barrier.h | 2
-rw-r--r--  arch/x86/include/asm/cmdline.h | 2
-rw-r--r--  arch/x86/include/asm/cpufeature.h | 8
-rw-r--r--  arch/x86/include/asm/desc.h | 4
-rw-r--r--  arch/x86/include/asm/hardirq.h | 2
-rw-r--r--  arch/x86/include/asm/hw_irq.h | 2
-rw-r--r--  arch/x86/include/asm/kaiser.h | 141
-rw-r--r--  arch/x86/include/asm/microcode.h | 15
-rw-r--r--  arch/x86/include/asm/mmu.h | 6
-rw-r--r--  arch/x86/include/asm/mmu_context.h | 103
-rw-r--r--  arch/x86/include/asm/msr-index.h | 8
-rw-r--r--  arch/x86/include/asm/msr.h | 18
-rw-r--r--  arch/x86/include/asm/mwait.h | 6
-rw-r--r--  arch/x86/include/asm/pgtable.h | 28
-rw-r--r--  arch/x86/include/asm/pgtable_64.h | 25
-rw-r--r--  arch/x86/include/asm/pgtable_types.h | 29
-rw-r--r--  arch/x86/include/asm/processor.h | 4
-rw-r--r--  arch/x86/include/asm/proto.h | 1
-rw-r--r--  arch/x86/include/asm/spec_ctrl.h | 108
-rw-r--r--  arch/x86/include/asm/tlbflush.h | 228
-rw-r--r--  arch/x86/include/uapi/asm/processor-flags.h | 3
-rw-r--r--  arch/x86/kernel/cpu/Makefile | 1
-rw-r--r--  arch/x86/kernel/cpu/amd.c | 15
-rw-r--r--  arch/x86/kernel/cpu/common.c | 84
-rw-r--r--  arch/x86/kernel/cpu/intel.c | 3
-rw-r--r--  arch/x86/kernel/cpu/microcode/core.c | 6
-rw-r--r--  arch/x86/kernel/cpu/perf_event_intel_ds.c | 57
-rw-r--r--  arch/x86/kernel/cpu/scattered.c | 1
-rw-r--r--  arch/x86/kernel/cpu/spec_ctrl.c | 91
-rw-r--r--  arch/x86/kernel/espfix_64.c | 10
-rw-r--r--  arch/x86/kernel/head_64.S | 35
-rw-r--r--  arch/x86/kernel/irqinit.c | 2
-rw-r--r--  arch/x86/kernel/ldt.c | 25
-rw-r--r--  arch/x86/kernel/paravirt_patch_64.c | 2
-rw-r--r--  arch/x86/kernel/process.c | 9
-rw-r--r--  arch/x86/kernel/reboot.c | 4
-rw-r--r--  arch/x86/kernel/setup.c | 7
-rw-r--r--  arch/x86/kernel/smpboot.c | 5
-rw-r--r--  arch/x86/kernel/tracepoint.c | 2
-rw-r--r--  arch/x86/kvm/cpuid.c | 12
-rw-r--r--  arch/x86/kvm/svm.c | 52
-rw-r--r--  arch/x86/kvm/vmx.c | 33
-rw-r--r--  arch/x86/kvm/x86.c | 5
-rw-r--r--  arch/x86/lib/cmdline.c | 105
-rw-r--r--  arch/x86/lib/delay.c | 8
-rw-r--r--  arch/x86/mm/Makefile | 5
-rw-r--r--  arch/x86/mm/init.c | 4
-rw-r--r--  arch/x86/mm/init_64.c | 10
-rw-r--r--  arch/x86/mm/kaiser.c | 444
-rw-r--r--  arch/x86/mm/pageattr.c | 63
-rw-r--r--  arch/x86/mm/pgtable.c | 16
-rw-r--r--  arch/x86/mm/tlb.c | 203
-rw-r--r--  arch/x86/xen/enlighten.c | 6
-rw-r--r--  drivers/media/usb/uvc/uvc_v4l2.c | 1
-rw-r--r--  drivers/net/wireless/ath/carl9170/main.c | 1
-rw-r--r--  drivers/net/wireless/cw1200/sta.c | 1
-rw-r--r--  drivers/net/wireless/p54/main.c | 1
-rw-r--r--  drivers/scsi/qla2xxx/qla_mr.c | 12
-rw-r--r--  drivers/thermal/int340x_thermal/int340x_thermal_zone.c | 11
-rw-r--r--  fs/nfs/dir.c | 21
-rw-r--r--  fs/udf/misc.c | 6
-rw-r--r--  include/asm-generic/barrier.h | 4
-rw-r--r--  include/asm-generic/vmlinux.lds.h | 7
-rw-r--r--  include/linux/fdtable.h | 4
-rw-r--r--  include/linux/kaiser.h | 52
-rw-r--r--  include/linux/mmu_context.h | 9
-rw-r--r--  include/linux/percpu-defs.h | 32
-rw-r--r--  include/linux/ptrace.h | 6
-rw-r--r--  include/linux/sunrpc/auth.h | 3
-rw-r--r--  init/main.c | 2
-rw-r--r--  kernel/bpf/core.c | 3
-rw-r--r--  kernel/fork.c | 6
-rw-r--r--  kernel/ptrace.c | 18
-rw-r--r--  kernel/sched/core.c | 6
-rw-r--r--  kernel/user_namespace.c | 4
-rw-r--r--  mm/mmu_context.c | 2
-rw-r--r--  net/ipv6/raw.c | 1
-rw-r--r--  net/mpls/af_mpls.c | 2
-rw-r--r--  net/sunrpc/auth.c | 2
-rw-r--r--  net/sunrpc/auth_generic.c | 21
-rw-r--r--  net/sunrpc/auth_gss/auth_gss.c | 8
-rw-r--r--  net/sunrpc/auth_unix.c | 21
-rw-r--r--  security/Kconfig | 10
114 files changed, 2924 insertions, 398 deletions
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 1c259ede391c..47de3730509b 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -2521,6 +2521,12 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
nosmep [X86]
Disable SMEP (Supervisor Mode Execution Prevention)
even if it is supported by processor.
+
+ nospec [X86]
+ Disable indirect branch restricted speculation and
+ indirect branch prediction barrier to avoid performance
+ penalties in trusted environments.
+
noexec32 [X86-64]
This affects only 32-bit executables.
@@ -2604,8 +2610,12 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
nointroute [IA-64]
+ noinvpcid [X86] Disable the INVPCID cpu feature.
+
nojitter [IA-64] Disables jitter checking for ITC timers.
+ nopti [X86-64] Disable KAISER isolation of kernel from user.
+
no-kvmclock [X86,KVM] Disable paravirtualized KVM clock driver
no-kvmapf [X86,KVM] Disable paravirtualized asynchronous page
@@ -2638,6 +2648,8 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
nopat [X86] Disable PAT (page attribute table extension of
pagetables) support.
+ nopcid [X86-64] Disable the PCID cpu feature.
+
norandmaps Don't use address space randomization. Equivalent to
echo 0 > /proc/sys/kernel/randomize_va_space
@@ -3159,6 +3171,12 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
pt. [PARIDE]
See Documentation/blockdev/paride.txt.
+ pti= [X86_64]
+ Control KAISER user/kernel address space isolation:
+ on - enable
+ off - disable
+ auto - default setting
+
pty.legacy_count=
[KNL] Number of legacy pty's. Overwrites compiled-in
default number.
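
Note: the pti= and nospec switches documented above are ordinary boot parameters, so whether they were passed can be checked from user space by scanning /proc/cmdline. The following C program is a sketch only, not part of the patch; the helper name cmdline_has_option is invented for the example and does a naive substring match.

    /* Sketch: report whether KAISER/PTI was disabled on the command line. */
    #include <stdio.h>
    #include <string.h>

    /* hypothetical helper, not a kernel API: 1 if found, 0 if not, -1 on error */
    static int cmdline_has_option(const char *opt)
    {
        char cmdline[4096];
        FILE *f = fopen("/proc/cmdline", "r");

        if (!f)
            return -1;
        if (!fgets(cmdline, sizeof(cmdline), f)) {
            fclose(f);
            return -1;
        }
        fclose(f);
        return strstr(cmdline, opt) != NULL;
    }

    int main(void)
    {
        int nopti = cmdline_has_option("nopti");
        int pti_off = cmdline_has_option("pti=off");

        if (nopti < 0 || pti_off < 0) {
            fprintf(stderr, "could not read /proc/cmdline\n");
            return 1;
        }
        if (nopti || pti_off)
            printf("KAISER/PTI disabled on the kernel command line\n");
        else
            printf("KAISER/PTI not explicitly disabled\n");
        return 0;
    }
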
diff --git a/arch/arm/include/asm/mmu_context.h b/arch/arm/include/asm/mmu_context.h
index 432ce8176498..340be7a4b40f 100644
--- a/arch/arm/include/asm/mmu_context.h
+++ b/arch/arm/include/asm/mmu_context.h
@@ -61,6 +61,7 @@ static inline void check_and_switch_context(struct mm_struct *mm,
cpu_switch_mm(mm->pgd, mm);
}
+#ifndef MODULE
#define finish_arch_post_lock_switch \
finish_arch_post_lock_switch
static inline void finish_arch_post_lock_switch(void)
@@ -82,6 +83,7 @@ static inline void finish_arch_post_lock_switch(void)
preempt_enable_no_resched();
}
}
+#endif /* !MODULE */
#endif /* CONFIG_MMU */
diff --git a/arch/powerpc/include/asm/barrier.h b/arch/powerpc/include/asm/barrier.h
index c0deafc212b8..9711556b1684 100644
--- a/arch/powerpc/include/asm/barrier.h
+++ b/arch/powerpc/include/asm/barrier.h
@@ -75,6 +75,7 @@ do { \
})
#define smp_mb__before_spinlock() smp_mb()
+#define gmb() asm volatile("ori 31,31,0");
#include <asm-generic/barrier.h>
diff --git a/arch/powerpc/include/asm/exception-64s.h b/arch/powerpc/include/asm/exception-64s.h
index 77f52b26dad6..9c1102cbbd11 100644
--- a/arch/powerpc/include/asm/exception-64s.h
+++ b/arch/powerpc/include/asm/exception-64s.h
@@ -34,6 +34,7 @@
* exception handlers (including pSeries LPAR) and iSeries LPAR
* implementations as possible.
*/
+#include <asm/bug.h>
#define EX_R9 0
#define EX_R10 8
@@ -50,6 +51,58 @@
#define EX_PPR 88 /* SMT thread status register (priority) */
#define EX_CTR 96
+/*
+ * The nop instruction allows a secure memory protection instruction to be
+ * inserted with the rfi flush fixup.
+ */
+#define PREPARE_RFI_TO_USER \
+ RFI_FLUSH_FIXUP_SECTION; \
+ nop
+
+#define PREPARE_RFI_TO_GUEST \
+ RFI_FLUSH_FIXUP_SECTION; \
+ nop
+
+#define DEBUG_RFI
+
+#ifdef DEBUG_RFI
+#define CHECK_TARGET_MSR_PR(srr_reg, expected_pr) \
+ SET_SCRATCH0(r3); \
+ mfspr r3,srr_reg; \
+ extrdi r3,r3,1,63-MSR_PR_LG; \
+666: tdnei r3,expected_pr; \
+ EMIT_BUG_ENTRY 666b,__FILE__,__LINE__,0; \
+ GET_SCRATCH0(r3);
+#else
+#define CHECK_TARGET_MSR_PR(expected)
+#endif
+
+#define RFI_TO_KERNEL \
+ CHECK_TARGET_MSR_PR(SPRN_SRR1, 0); \
+ rfid
+
+#define RFI_TO_USER \
+ CHECK_TARGET_MSR_PR(SPRN_SRR1, 1); \
+ PREPARE_RFI_TO_USER; \
+ rfid
+
+#define RFI_TO_GUEST \
+ PREPARE_RFI_TO_GUEST; \
+ rfid
+
+#define HRFI_TO_KERNEL \
+ CHECK_TARGET_MSR_PR(SPRN_HSRR1, 0); \
+ hrfid
+
+#define HRFI_TO_USER \
+ CHECK_TARGET_MSR_PR(SPRN_HSRR1, 1); \
+ PREPARE_RFI_TO_USER; \
+ hrfid
+
+#define HRFI_TO_GUEST \
+ PREPARE_RFI_TO_GUEST; \
+ hrfid
+
#ifdef CONFIG_RELOCATABLE
#define __EXCEPTION_RELON_PROLOG_PSERIES_1(label, h) \
ld r12,PACAKBASE(r13); /* get high part of &label */ \
@@ -191,7 +244,7 @@ END_FTR_SECTION_NESTED(ftr,ftr,943)
mtspr SPRN_##h##SRR0,r12; \
mfspr r12,SPRN_##h##SRR1; /* and SRR1 */ \
mtspr SPRN_##h##SRR1,r10; \
- h##rfid; \
+ h##rfid; /* h##RFI_TO_KERNEL runs out of space */ \
b . /* prevent speculative execution */
#define EXCEPTION_PROLOG_PSERIES_1(label, h) \
__EXCEPTION_PROLOG_PSERIES_1(label, h)
diff --git a/arch/powerpc/include/asm/feature-fixups.h b/arch/powerpc/include/asm/feature-fixups.h
index 9a67a38bf7b9..7817cb8429c7 100644
--- a/arch/powerpc/include/asm/feature-fixups.h
+++ b/arch/powerpc/include/asm/feature-fixups.h
@@ -184,4 +184,19 @@ label##3: \
FTR_ENTRY_OFFSET label##1b-label##3b; \
.popsection;
+#define RFI_FLUSH_FIXUP_SECTION \
+951: \
+ .pushsection __rfi_flush_fixup,"a"; \
+ .align 2; \
+952: \
+ FTR_ENTRY_OFFSET 951b-952b; \
+ .popsection;
+
+#ifndef __ASSEMBLY__
+#include <linux/types.h>
+
+extern long __start___rfi_flush_fixup, __stop___rfi_flush_fixup;
+extern void do_rfi_flush_fixups(bool enable, unsigned int insn);
+
+#endif
#endif /* __ASM_POWERPC_FEATURE_FIXUPS_H */
diff --git a/arch/powerpc/include/asm/setup.h b/arch/powerpc/include/asm/setup.h
index e9d384cbd021..900c02b23453 100644
--- a/arch/powerpc/include/asm/setup.h
+++ b/arch/powerpc/include/asm/setup.h
@@ -26,6 +26,10 @@ void initmem_init(void);
void setup_panic(void);
#define ARCH_PANIC_TIMEOUT 180
+extern bool rfi_flush;
+void rfi_flush_enable(bool enable);
+void __init setup_rfi_flush(void);
+
#endif /* !__ASSEMBLY__ */
#endif /* _ASM_POWERPC_SETUP_H */
diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S
index 472aa8023276..70eedfed7f3c 100644
--- a/arch/powerpc/kernel/entry_64.S
+++ b/arch/powerpc/kernel/entry_64.S
@@ -37,6 +37,9 @@
#include <asm/hw_irq.h>
#include <asm/context_tracking.h>
#include <asm/tm.h>
+#ifdef CONFIG_PPC_BOOK3S
+#include <asm/exception-64s.h>
+#endif
/*
* System calls.
@@ -226,13 +229,23 @@ END_FTR_SECTION_IFCLR(CPU_FTR_STCX_CHECKS_ADDRESS)
ACCOUNT_CPU_USER_EXIT(r11, r12)
HMT_MEDIUM_LOW_HAS_PPR
ld r13,GPR13(r1) /* only restore r13 if returning to usermode */
+ ld r2,GPR2(r1)
+ ld r1,GPR1(r1)
+ mtlr r4
+ mtcr r5
+ mtspr SPRN_SRR0,r7
+ mtspr SPRN_SRR1,r8
+ RFI_TO_USER
+ b . /* prevent speculative execution */
+
+ /* exit to kernel */
1: ld r2,GPR2(r1)
ld r1,GPR1(r1)
mtlr r4
mtcr r5
mtspr SPRN_SRR0,r7
mtspr SPRN_SRR1,r8
- RFI
+ RFI_TO_KERNEL
b . /* prevent speculative execution */
syscall_error:
@@ -888,7 +901,7 @@ BEGIN_FTR_SECTION
END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
ACCOUNT_CPU_USER_EXIT(r2, r4)
REST_GPR(13, r1)
-1:
+
mtspr SPRN_SRR1,r3
ld r2,_CCR(r1)
@@ -901,8 +914,22 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
ld r3,GPR3(r1)
ld r4,GPR4(r1)
ld r1,GPR1(r1)
+ RFI_TO_USER
+ b . /* prevent speculative execution */
- rfid
+1: mtspr SPRN_SRR1,r3
+
+ ld r2,_CCR(r1)
+ mtcrf 0xFF,r2
+ ld r2,_NIP(r1)
+ mtspr SPRN_SRR0,r2
+
+ ld r0,GPR0(r1)
+ ld r2,GPR2(r1)
+ ld r3,GPR3(r1)
+ ld r4,GPR4(r1)
+ ld r1,GPR1(r1)
+ RFI_TO_KERNEL
b . /* prevent speculative execution */
#endif /* CONFIG_PPC_BOOK3E */
@@ -1078,7 +1105,7 @@ _GLOBAL(enter_rtas)
mtspr SPRN_SRR0,r5
mtspr SPRN_SRR1,r6
- rfid
+ RFI_TO_KERNEL
b . /* prevent speculative execution */
rtas_return_loc:
@@ -1103,7 +1130,7 @@ rtas_return_loc:
mtspr SPRN_SRR0,r3
mtspr SPRN_SRR1,r4
- rfid
+ RFI_TO_KERNEL
b . /* prevent speculative execution */
.align 3
diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index 4185fc303cc6..bf47cbe0f504 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -54,7 +54,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_REAL_LE) \
1: mfspr r12,SPRN_SRR1 ; \
xori r12,r12,MSR_LE ; \
mtspr SPRN_SRR1,r12 ; \
- rfid ; /* return to userspace */ \
+ RFI_TO_USER ; /* return to userspace */ \
b . ; /* prevent speculative execution */
#if defined(CONFIG_RELOCATABLE)
@@ -671,7 +671,7 @@ masked_##_H##interrupt: \
ld r10,PACA_EXGEN+EX_R10(r13); \
ld r11,PACA_EXGEN+EX_R11(r13); \
GET_SCRATCH0(r13); \
- ##_H##rfid; \
+ ##_H##RFI_TO_KERNEL; \
b .
MASKED_INTERRUPT()
@@ -747,7 +747,7 @@ slb_miss_user_pseries:
mtspr SRR0,r12
mfspr r12,SRR1 /* and SRR1 */
mtspr SRR1,r10
- rfid
+ RFI_TO_KERNEL
b . /* prevent spec. execution */
#endif /* __DISABLED__ */
@@ -761,7 +761,7 @@ kvmppc_skip_interrupt:
addi r13, r13, 4
mtspr SPRN_SRR0, r13
GET_SCRATCH0(r13)
- rfid
+ RFI_TO_GUEST
b .
kvmppc_skip_Hinterrupt:
@@ -773,7 +773,7 @@ kvmppc_skip_Hinterrupt:
addi r13, r13, 4
mtspr SPRN_HSRR0, r13
GET_SCRATCH0(r13)
- hrfid
+ HRFI_TO_GUEST
b .
#endif
@@ -1525,7 +1525,7 @@ slb_miss_realmode:
mtspr SPRN_SRR0,r10
ld r10,PACAKMSR(r13)
mtspr SPRN_SRR1,r10
- rfid
+ RFI_TO_KERNEL
b .
unrecov_slb:
diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c
index a149f6a5907f..c2e11158fdce 100644
--- a/arch/powerpc/kernel/setup_64.c
+++ b/arch/powerpc/kernel/setup_64.c
@@ -847,3 +847,80 @@ static int __init disable_hardlockup_detector(void)
}
early_initcall(disable_hardlockup_detector);
#endif
+
+#ifdef CONFIG_PPC_BOOK3S_64
+enum l1d_flush_type {
+ L1D_FLUSH_NONE,
+ L1D_FLUSH_ORI,
+ L1D_FLUSH_MTTRIG,
+};
+
+enum l1d_flush_type l1d_flush_type;
+
+bool rfi_flush;
+
+static void do_rfi_flush(void *val)
+{
+ switch (l1d_flush_type) {
+ case L1D_FLUSH_ORI:
+ asm volatile("ori 30,30,0" ::: "memory");
+ break;
+ case L1D_FLUSH_MTTRIG:
+ asm volatile("mtspr 882,0" ::: "memory");
+ break;
+ default:
+ break;
+ }
+}
+
+void rfi_flush_enable(bool enable)
+{
+ unsigned int insn;
+
+ if (rfi_flush == enable)
+ return;
+
+ switch (l1d_flush_type) {
+ case L1D_FLUSH_ORI:
+ insn = 0x63de0000;
+ break;
+ case L1D_FLUSH_MTTRIG:
+ insn = 0x7c12dba6;
+ break;
+ default:
+ printk("Secure memory protection not enabled! System is vulnerable to local exploit. Update firmware.\n");
+ return;
+ }
+
+ do_rfi_flush_fixups(enable, insn);
+
+ if (enable)
+ on_each_cpu(do_rfi_flush, NULL, 1);
+
+ rfi_flush = enable;
+}
+
+/* This tries to guess the cpu characteristics based on the PVR. */
+static bool get_cpu_characteristics(void)
+{
+ if (pvr_version_is(PVR_POWER7) || pvr_version_is(PVR_POWER7p))
+ l1d_flush_type = L1D_FLUSH_NONE;
+ else if (pvr_version_is(PVR_POWER8E) ||
+ pvr_version_is(PVR_POWER8NVL) ||
+ pvr_version_is(PVR_POWER8))
+ l1d_flush_type = L1D_FLUSH_ORI;
+ else {
+ /* unknown CPU */
+ l1d_flush_type = L1D_FLUSH_NONE;
+ return false;
+ }
+
+ return true;
+}
+
+void __init setup_rfi_flush(void)
+{
+ if (get_cpu_characteristics())
+ rfi_flush_enable(true);
+}
+#endif /* CONFIG_PPC_BOOK3S_64 */
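
A worked decode of the two literal encodings above: 0x63de0000 is the PowerPC instruction "ori 30,30,0" (opcode 24, base 0x60000000, plus RS and RA fields of 30, i.e. 30<<21 | 30<<16 = 0x03de0000), and 0x7c12dba6 is "mtspr 882,0" (base mtspr 0x7c0003a6 with the split SPR field for 882, i.e. 18<<16 | 27<<11). These are the same two instructions that do_rfi_flush() issues inline, so the fixup sites get patched with exactly the flush sequence the runtime path uses.
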
diff --git a/arch/powerpc/kernel/sysfs.c b/arch/powerpc/kernel/sysfs.c
index 692873bff334..2584f0df3e49 100644
--- a/arch/powerpc/kernel/sysfs.c
+++ b/arch/powerpc/kernel/sysfs.c
@@ -18,8 +18,10 @@
#include <asm/smp.h>
#include <asm/pmc.h>
#include <asm/firmware.h>
+// #include <asm/ppc_asm.h>
#include "cacheinfo.h"
+// #include "setup.h"
#ifdef CONFIG_PPC64
#include <asm/paca.h>
@@ -496,6 +498,43 @@ static DEVICE_ATTR(spurr, 0400, show_spurr, NULL);
static DEVICE_ATTR(purr, 0400, show_purr, store_purr);
static DEVICE_ATTR(pir, 0400, show_pir, NULL);
+#ifdef CONFIG_PPC_BOOK3S_64
+static ssize_t show_rfi_flush(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ return sprintf(buf, "%d\n", rfi_flush ? 1 : 0);
+}
+
+static ssize_t __used store_rfi_flush(struct device *dev,
+ struct device_attribute *attr, const char *buf,
+ size_t count)
+{
+ int val;
+ int ret = 0;
+
+ ret = sscanf(buf, "%d", &val);
+ if (ret != 1)
+ return -EINVAL;
+
+ if (val == 1)
+ rfi_flush_enable(true);
+ else if (val == 0)
+ rfi_flush_enable(false);
+ else
+ return -EINVAL;
+
+ return count;
+}
+
+static DEVICE_ATTR(rfi_flush, 0600,
+ show_rfi_flush, store_rfi_flush);
+
+static void sysfs_create_rfi_flush(void)
+{
+ device_create_file(cpu_subsys.dev_root, &dev_attr_rfi_flush);
+}
+#endif /* CONFIG_PPC_BOOK3S_64 */
+
/*
* This is the system wide DSCR register default value. Any
* change to this default value through the sysfs interface
@@ -1058,6 +1097,7 @@ static int __init topology_init(void)
#ifdef CONFIG_PPC64
sysfs_create_dscr_default();
+ sysfs_create_rfi_flush();
#endif /* CONFIG_PPC64 */
return 0;
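
The rfi_flush attribute above is created on cpu_subsys.dev_root, so with the usual sysfs layout it should surface as /sys/devices/system/cpu/rfi_flush. The following user-space toggle is offered only as a sketch under that assumption.

    /* Sketch: flip the RFI flush via the sysfs attribute added above.
     * Assumes it is exposed at /sys/devices/system/cpu/rfi_flush.
     */
    #include <stdio.h>

    int main(int argc, char **argv)
    {
        const char *path = "/sys/devices/system/cpu/rfi_flush";
        FILE *f;

        if (argc != 2 || (argv[1][0] != '0' && argv[1][0] != '1')) {
            fprintf(stderr, "usage: %s 0|1\n", argv[0]);
            return 1;
        }
        f = fopen(path, "w");
        if (!f) {
            perror(path);
            return 1;
        }
        /* the store hook accepts exactly "0" or "1" */
        fprintf(f, "%c\n", argv[1][0]);
        fclose(f);
        return 0;
    }
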
diff --git a/arch/powerpc/kernel/vmlinux.lds.S b/arch/powerpc/kernel/vmlinux.lds.S
index 4a49b12d8f78..aee82bbe63c7 100644
--- a/arch/powerpc/kernel/vmlinux.lds.S
+++ b/arch/powerpc/kernel/vmlinux.lds.S
@@ -72,6 +72,15 @@ SECTIONS
/* Read-only data */
RODATA
+#ifdef CONFIG_PPC64
+ . = ALIGN(8);
+ __rfi_flush_fixup : AT(ADDR(__rfi_flush_fixup) - LOAD_OFFSET) {
+ __start___rfi_flush_fixup = .;
+ *(__rfi_flush_fixup)
+ __stop___rfi_flush_fixup = .;
+ }
+#endif
+
EXCEPTION_TABLE(0)
NOTES :kernel :notes
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index 2e9742b89b40..5b1fa8585613 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -64,7 +64,7 @@ _GLOBAL_TOC(kvmppc_hv_entry_trampoline)
mtmsrd r0,1 /* clear RI in MSR */
mtsrr0 r5
mtsrr1 r6
- RFI
+ RFI_TO_KERNEL
kvmppc_call_hv_entry:
ld r4, HSTATE_KVM_VCPU(r13)
@@ -170,7 +170,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
mtsrr0 r8
mtsrr1 r7
beq cr1, 13f /* machine check */
- RFI
+ RFI_TO_KERNEL
/* On POWER7, we have external interrupts set to use HSRR0/1 */
11: mtspr SPRN_HSRR0, r8
@@ -965,8 +965,7 @@ BEGIN_FTR_SECTION
END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
ld r0, VCPU_GPR(R0)(r4)
ld r4, VCPU_GPR(R4)(r4)
-
- hrfid
+ HRFI_TO_GUEST
b .
secondary_too_late:
diff --git a/arch/powerpc/kvm/book3s_rmhandlers.S b/arch/powerpc/kvm/book3s_rmhandlers.S
index 16c4d88ba27d..53efd6d709b0 100644
--- a/arch/powerpc/kvm/book3s_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_rmhandlers.S
@@ -141,7 +141,7 @@ kvmppc_handler_skip_ins:
GET_SCRATCH0(r13)
/* And get back into the code */
- RFI
+ RFI_TO_GUEST
#endif
/*
@@ -164,6 +164,6 @@ _GLOBAL_TOC(kvmppc_entry_trampoline)
ori r5, r5, MSR_EE
mtsrr0 r7
mtsrr1 r6
- RFI
+ RFI_TO_KERNEL
#include "book3s_segment.S"
diff --git a/arch/powerpc/lib/feature-fixups.c b/arch/powerpc/lib/feature-fixups.c
index 6b7172597b50..35075de87239 100644
--- a/arch/powerpc/lib/feature-fixups.c
+++ b/arch/powerpc/lib/feature-fixups.c
@@ -20,6 +20,7 @@
#include <asm/code-patching.h>
#include <asm/page.h>
#include <asm/sections.h>
+#include <asm/setup.h>
struct fixup_entry {
@@ -113,6 +114,33 @@ void do_feature_fixups(unsigned long value, void *fixup_start, void *fixup_end)
}
}
+#ifdef CONFIG_PPC_BOOK3S_64
+void do_rfi_flush_fixups(bool enable, unsigned int insn)
+{
+ long *start, *end;
+ unsigned int *dest;
+ int i;
+
+ start = PTRRELOC(&__start___rfi_flush_fixup),
+ end = PTRRELOC(&__stop___rfi_flush_fixup);
+
+ for (i = 0; start < end; start++, i++) {
+ dest = (void *)start + *start;
+
+ pr_devel("RFI FLUSH FIXUP %s %lx\n", enable ? "enable" : "disable", (unsigned long)start);
+ if (!enable) {
+ pr_devel("patching dest %lx\n", (unsigned long)dest);
+ patch_instruction(dest, PPC_INST_NOP);
+ } else {
+ pr_devel("patching dest %lx\n", (unsigned long)dest);
+ patch_instruction(dest, insn);
+ }
+ }
+
+ printk(KERN_DEBUG "rfi-fixups: patched %d locations\n", i);
+}
+#endif /* CONFIG_PPC_BOOK3S_64 */
+
void do_lwsync_fixups(unsigned long value, void *fixup_start, void *fixup_end)
{
long *start, *end;
diff --git a/arch/powerpc/platforms/powernv/setup.c b/arch/powerpc/platforms/powernv/setup.c
index 1acb0c72d923..430c27a9ea26 100644
--- a/arch/powerpc/platforms/powernv/setup.c
+++ b/arch/powerpc/platforms/powernv/setup.c
@@ -42,6 +42,8 @@ static void __init pnv_setup_arch(void)
{
set_arch_panic_timeout(10, ARCH_PANIC_TIMEOUT);
+ setup_rfi_flush();
+
/* Initialize SMP */
pnv_smp_init();
diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c
index 90ecb3f148df..005c330ce3e1 100644
--- a/arch/powerpc/platforms/pseries/setup.c
+++ b/arch/powerpc/platforms/pseries/setup.c
@@ -561,6 +561,8 @@ static void __init pSeries_setup_arch(void)
fwnmi_init();
+ setup_rfi_flush();
+
/* By default, only probe PCI (can be overriden by rtas_pci) */
pci_add_flags(PCI_PROBE_ONLY);
diff --git a/arch/s390/include/asm/alternative.h b/arch/s390/include/asm/alternative.h
new file mode 100644
index 000000000000..da7e097fbaf4
--- /dev/null
+++ b/arch/s390/include/asm/alternative.h
@@ -0,0 +1,149 @@
+#ifndef _ASM_S390_ALTERNATIVE_H
+#define _ASM_S390_ALTERNATIVE_H
+
+#ifndef __ASSEMBLY__
+
+#include <linux/types.h>
+#include <linux/stddef.h>
+#include <linux/stringify.h>
+
+struct alt_instr {
+ s32 instr_offset; /* original instruction */
+ s32 repl_offset; /* offset to replacement instruction */
+ u16 facility; /* facility bit set for replacement */
+ u8 instrlen; /* length of original instruction */
+ u8 replacementlen; /* length of new instruction */
+} __packed;
+
+extern void apply_alternative_instructions(void);
+extern void apply_alternatives(struct alt_instr *start, struct alt_instr *end);
+
+/*
+ * |661: |662: |6620 |663:
+ * +-----------+---------------------+
+ * | oldinstr | oldinstr_padding |
+ * | +----------+----------+
+ * | | | |
+ * | | >6 bytes |6/4/2 nops|
+ * | |6 bytes jg----------->
+ * +-----------+---------------------+
+ * ^^ static padding ^^
+ *
+ * .altinstr_replacement section
+ * +---------------------+-----------+
+ * |6641: |6651:
+ * | alternative instr 1 |
+ * +-----------+---------+- - - - - -+
+ * |6642: |6652: |
+ * | alternative instr 2 | padding
+ * +---------------------+- - - - - -+
+ * ^ runtime ^
+ *
+ * .altinstructions section
+ * +---------------------------------+
+ * | alt_instr entries for each |
+ * | alternative instr |
+ * +---------------------------------+
+ */
+
+#define b_altinstr(num) "664"#num
+#define e_altinstr(num) "665"#num
+
+#define e_oldinstr_pad_end "663"
+#define oldinstr_len "662b-661b"
+#define oldinstr_total_len e_oldinstr_pad_end"b-661b"
+#define altinstr_len(num) e_altinstr(num)"b-"b_altinstr(num)"b"
+#define oldinstr_pad_len(num) \
+ "-(((" altinstr_len(num) ")-(" oldinstr_len ")) > 0) * " \
+ "((" altinstr_len(num) ")-(" oldinstr_len "))"
+
+#define INSTR_LEN_SANITY_CHECK(len) \
+ ".if " len " > 254\n" \
+ "\t.error \"cpu alternatives does not support instructions " \
+ "blocks > 254 bytes\"\n" \
+ ".endif\n" \
+ ".if (" len ") %% 2\n" \
+ "\t.error \"cpu alternatives instructions length is odd\"\n" \
+ ".endif\n"
+
+#define OLDINSTR_PADDING(oldinstr, num) \
+ ".if " oldinstr_pad_len(num) " > 6\n" \
+ "\tjg " e_oldinstr_pad_end "f\n" \
+ "6620:\n" \
+ "\t.fill (" oldinstr_pad_len(num) " - (6620b-662b)) / 2, 2, 0x0700\n" \
+ ".else\n" \
+ "\t.fill " oldinstr_pad_len(num) " / 6, 6, 0xc0040000\n" \
+ "\t.fill " oldinstr_pad_len(num) " %% 6 / 4, 4, 0x47000000\n" \
+ "\t.fill " oldinstr_pad_len(num) " %% 6 %% 4 / 2, 2, 0x0700\n" \
+ ".endif\n"
+
+#define OLDINSTR(oldinstr, num) \
+ "661:\n\t" oldinstr "\n662:\n" \
+ OLDINSTR_PADDING(oldinstr, num) \
+ e_oldinstr_pad_end ":\n" \
+ INSTR_LEN_SANITY_CHECK(oldinstr_len)
+
+#define OLDINSTR_2(oldinstr, num1, num2) \
+ "661:\n\t" oldinstr "\n662:\n" \
+ ".if " altinstr_len(num1) " < " altinstr_len(num2) "\n" \
+ OLDINSTR_PADDING(oldinstr, num2) \
+ ".else\n" \
+ OLDINSTR_PADDING(oldinstr, num1) \
+ ".endif\n" \
+ e_oldinstr_pad_end ":\n" \
+ INSTR_LEN_SANITY_CHECK(oldinstr_len)
+
+#define ALTINSTR_ENTRY(facility, num) \
+ "\t.long 661b - .\n" /* old instruction */ \
+ "\t.long " b_altinstr(num)"b - .\n" /* alt instruction */ \
+ "\t.word " __stringify(facility) "\n" /* facility bit */ \
+ "\t.byte " oldinstr_total_len "\n" /* source len */ \
+ "\t.byte " altinstr_len(num) "\n" /* alt instruction len */
+
+#define ALTINSTR_REPLACEMENT(altinstr, num) /* replacement */ \
+ b_altinstr(num)":\n\t" altinstr "\n" e_altinstr(num) ":\n" \
+ INSTR_LEN_SANITY_CHECK(altinstr_len(num))
+
+/* alternative assembly primitive: */
+#define ALTERNATIVE(oldinstr, altinstr, facility) \
+ ".pushsection .altinstr_replacement, \"ax\"\n" \
+ ALTINSTR_REPLACEMENT(altinstr, 1) \
+ ".popsection\n" \
+ OLDINSTR(oldinstr, 1) \
+ ".pushsection .altinstructions,\"a\"\n" \
+ ALTINSTR_ENTRY(facility, 1) \
+ ".popsection\n"
+
+#define ALTERNATIVE_2(oldinstr, altinstr1, facility1, altinstr2, facility2)\
+ ".pushsection .altinstr_replacement, \"ax\"\n" \
+ ALTINSTR_REPLACEMENT(altinstr1, 1) \
+ ALTINSTR_REPLACEMENT(altinstr2, 2) \
+ ".popsection\n" \
+ OLDINSTR_2(oldinstr, 1, 2) \
+ ".pushsection .altinstructions,\"a\"\n" \
+ ALTINSTR_ENTRY(facility1, 1) \
+ ALTINSTR_ENTRY(facility2, 2) \
+ ".popsection\n"
+
+/*
+ * Alternative instructions for different CPU types or capabilities.
+ *
+ * This allows to use optimized instructions even on generic binary
+ * kernels.
+ *
+ * oldinstr is padded with jump and nops at compile time if altinstr is
+ * longer. altinstr is padded with jump and nops at run-time during patching.
+ *
+ * For non barrier like inlines please define new variants
+ * without volatile and memory clobber.
+ */
+#define alternative(oldinstr, altinstr, facility) \
+ asm volatile(ALTERNATIVE(oldinstr, altinstr, facility) : : : "memory")
+
+#define alternative_2(oldinstr, altinstr1, facility1, altinstr2, facility2) \
+ asm volatile(ALTERNATIVE_2(oldinstr, altinstr1, facility1, \
+ altinstr2, facility2) ::: "memory")
+
+#endif /* __ASSEMBLY__ */
+
+#endif /* _ASM_S390_ALTERNATIVE_H */
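
The C-level alternative() convenience macro at the end of this header is what callers use; the gmb() hunk a few files below is the in-tree example. A hypothetical caller (this wrapper itself is not part of the patch) could look like the following sketch.

    /* Hypothetical user of the s390 alternative() macro defined above.
     * Facility bit 81 and the 0xb2e8f000 encoding are taken from the
     * gmb() definition added to barrier.h later in this diff.
     */
    #include <asm/alternative.h>

    static inline void example_speculation_barrier(void)
    {
        /* no-op on machines without facility 81, patched in otherwise */
        alternative("", ".long 0xb2e8f000", 81);
    }
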
diff --git a/arch/s390/include/asm/barrier.h b/arch/s390/include/asm/barrier.h
index 5c8db3ce61c8..02104d8c7175 100644
--- a/arch/s390/include/asm/barrier.h
+++ b/arch/s390/include/asm/barrier.h
@@ -7,6 +7,8 @@
#ifndef __ASM_BARRIER_H
#define __ASM_BARRIER_H
+#include <asm/alternative.h>
+
/*
* Force strict CPU ordering.
* And yes, this is required on UP too when we're talking
@@ -22,6 +24,13 @@
#define mb() do { asm volatile(__ASM_BARRIER : : : "memory"); } while (0)
+static inline void gmb(void)
+{
+ asm volatile(
+ ALTERNATIVE("", ".long 0xb2e8f000", 81)
+ : : : "memory");
+}
+
#define rmb() barrier()
#define wmb() barrier()
#define dma_rmb() mb()
diff --git a/arch/s390/include/asm/processor.h b/arch/s390/include/asm/processor.h
index c61ed7890cef..d5431b366e1a 100644
--- a/arch/s390/include/asm/processor.h
+++ b/arch/s390/include/asm/processor.h
@@ -69,6 +69,7 @@ extern void s390_adjust_jiffies(void);
extern const struct seq_operations cpuinfo_op;
extern int sysctl_ieee_emulation_warnings;
extern void execve_tail(void);
+extern void __bpon(void);
/*
* User space process size: 2GB for 31 bit, 4TB or 8PT for 64 bit.
diff --git a/arch/s390/kernel/Makefile b/arch/s390/kernel/Makefile
index 1cec329aa351..b282ae6c8964 100644
--- a/arch/s390/kernel/Makefile
+++ b/arch/s390/kernel/Makefile
@@ -44,7 +44,7 @@ obj-y += processor.o sys_s390.o ptrace.o signal.o cpcmd.o ebcdic.o nmi.o
obj-y += debug.o irq.o ipl.o dis.o diag.o sclp.o vdso.o
obj-y += sysinfo.o jump_label.o lgr.o os_info.o machine_kexec.o pgm_check.o
obj-y += runtime_instr.o cache.o dumpstack.o
-obj-y += entry.o reipl.o relocate_kernel.o
+obj-y += entry.o reipl.o relocate_kernel.o alternative.o
extra-y += head.o head64.o vmlinux.lds
diff --git a/arch/s390/kernel/alternative.c b/arch/s390/kernel/alternative.c
new file mode 100644
index 000000000000..39a530f1f816
--- /dev/null
+++ b/arch/s390/kernel/alternative.c
@@ -0,0 +1,123 @@
+#include <linux/module.h>
+#include <asm/alternative.h>
+#include <asm/facility.h>
+
+#define MAX_PATCH_LEN (255 - 1)
+
+static int __initdata_or_module alt_instr_disabled;
+
+static int __init disable_alternative_instructions(char *str)
+{
+ alt_instr_disabled = 1;
+ return 0;
+}
+
+early_param("noaltinstr", disable_alternative_instructions);
+
+extern struct alt_instr __alt_nobp[], __alt_nobp_end[];
+static int __init nobp_setup(char *str)
+{
+ bool enabled;
+ int rc;
+
+ rc = kstrtobool(str, &enabled);
+ if (!rc && enabled)
+ apply_alternatives(__alt_nobp, __alt_nobp_end);
+ return rc;
+}
+__setup("nobp=", nobp_setup);
+
+struct brcl_insn {
+ u16 opc;
+ s32 disp;
+} __packed;
+
+static u16 __initdata_or_module nop16 = 0x0700;
+static u32 __initdata_or_module nop32 = 0x47000000;
+static struct brcl_insn __initdata_or_module nop48 = {
+ 0xc004, 0
+};
+
+static const void * __initdata_or_module nops[] = {
+ &nop16,
+ &nop32,
+ &nop48
+};
+
+static void __init_or_module add_jump_padding(void *insns, unsigned int len)
+{
+ struct brcl_insn brcl = {
+ 0xc0f4,
+ len / 2
+ };
+
+ memcpy(insns, &brcl, sizeof(brcl));
+ insns += sizeof(brcl);
+ len -= sizeof(brcl);
+
+ while (len > 0) {
+ memcpy(insns, &nop16, 2);
+ insns += 2;
+ len -= 2;
+ }
+}
+
+static void __init_or_module add_padding(void *insns, unsigned int len)
+{
+ if (len > 6)
+ add_jump_padding(insns, len);
+ else if (len >= 2)
+ memcpy(insns, nops[len / 2 - 1], len);
+}
+
+static void __init_or_module __apply_alternatives(struct alt_instr *start,
+ struct alt_instr *end)
+{
+ struct alt_instr *a;
+ u8 *instr, *replacement;
+ u8 insnbuf[MAX_PATCH_LEN];
+
+ /*
+ * The scan order should be from start to end. A later scanned
+ * alternative code can overwrite previously scanned alternative code.
+ */
+ for (a = start; a < end; a++) {
+ int insnbuf_sz = 0;
+
+ instr = (u8 *)&a->instr_offset + a->instr_offset;
+ replacement = (u8 *)&a->repl_offset + a->repl_offset;
+
+ if (!test_facility(a->facility))
+ continue;
+
+ if (unlikely(a->instrlen % 2 || a->replacementlen % 2)) {
+ WARN_ONCE(1, "cpu alternatives instructions length is "
+ "odd, skipping patching\n");
+ continue;
+ }
+
+ memcpy(insnbuf, replacement, a->replacementlen);
+ insnbuf_sz = a->replacementlen;
+
+ if (a->instrlen > a->replacementlen) {
+ add_padding(insnbuf + a->replacementlen,
+ a->instrlen - a->replacementlen);
+ insnbuf_sz += a->instrlen - a->replacementlen;
+ }
+
+ s390_kernel_write(instr, insnbuf, insnbuf_sz);
+ }
+}
+
+void __init_or_module apply_alternatives(struct alt_instr *start,
+ struct alt_instr *end)
+{
+ if (!alt_instr_disabled)
+ __apply_alternatives(start, end);
+}
+
+extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
+void __init apply_alternative_instructions(void)
+{
+ apply_alternatives(__alt_instructions, __alt_instructions_end);
+}
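
The address arithmetic in __apply_alternatives() works because every alt_instr field stores a self-relative offset (the ".long 661b - ." emitted by the header). The following stand-alone user-space model of that recovery is purely illustrative; in the kernel it is the link-time layout that guarantees the 32-bit offsets fit.

    /* Model of the self-relative offsets used by struct alt_instr:
     * each field holds "target - &field", so the target is recovered
     * by adding the field's own address back in, just as
     * __apply_alternatives() does with instr_offset/repl_offset.
     */
    #include <stdint.h>
    #include <stdio.h>

    struct rel_entry {
        int32_t instr_offset;   /* original instruction site */
        int32_t repl_offset;    /* replacement instructions */
    };

    static uint8_t old_insn[4];
    static uint8_t new_insn[4];
    static struct rel_entry entry;

    static void *resolve(int32_t *field)
    {
        return (uint8_t *)field + *field;   /* instr = &offset + offset */
    }

    int main(void)
    {
        entry.instr_offset = (int32_t)(old_insn - (uint8_t *)&entry.instr_offset);
        entry.repl_offset  = (int32_t)(new_insn - (uint8_t *)&entry.repl_offset);

        printf("old_insn %p -> %p\n", (void *)old_insn, resolve(&entry.instr_offset));
        printf("new_insn %p -> %p\n", (void *)new_insn, resolve(&entry.repl_offset));
        return 0;
    }
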
diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S
index 93e7422d2acc..3e1266d6ef2f 100644
--- a/arch/s390/kernel/entry.S
+++ b/arch/s390/kernel/entry.S
@@ -171,8 +171,41 @@ _PIF_WORK = (_PIF_PER_TRAP)
tm off+\addr, \mask
.endm
+ .macro BPOFF
+ .pushsection .altinstr_replacement, "ax"
+660: .long 0xb2e8c000
+ .popsection
+661: .long 0x47000000
+ .pushsection .altnobp, "a"
+ .long 661b - .
+ .long 660b - .
+ .word 82
+ .byte 4
+ .byte 4
+ .popsection
+ .endm
+
+ .macro BPON
+ .pushsection .altinstr_replacement, "ax"
+662: .long 0xb2e8d000
+ .popsection
+663: .long 0x47000000
+ .pushsection .altnobp, "a"
+ .long 663b - .
+ .long 662b - .
+ .word 82
+ .byte 4
+ .byte 4
+ .popsection
+ .endm
+
.section .kprobes.text, "ax"
+ENTRY(__bpon)
+ .globl __bpon
+ BPON
+ br %r14
+
/*
* Scheduler resume function, called by switch_to
* gpr2 = (task_struct *) prev
@@ -233,7 +266,9 @@ ENTRY(sie64a)
jnz .Lsie_skip
TSTMSK __LC_CPU_FLAGS,_CIF_FPU
jo .Lsie_skip # exit if fp/vx regs changed
+ BPON
sie 0(%r14)
+ BPOFF
.Lsie_skip:
ni __SIE_PROG0C+3(%r14),0xfe # no longer in SIE
lctlg %c1,%c1,__LC_USER_ASCE # load primary asce
@@ -277,6 +312,7 @@ ENTRY(system_call)
stpt __LC_SYNC_ENTER_TIMER
.Lsysc_stmg:
stmg %r8,%r15,__LC_SAVE_AREA_SYNC
+ BPOFF
lg %r10,__LC_LAST_BREAK
lg %r12,__LC_THREAD_INFO
lghi %r14,_PIF_SYSCALL
@@ -326,6 +362,7 @@ ENTRY(system_call)
lg %r14,__LC_VDSO_PER_CPU
lmg %r0,%r10,__PT_R0(%r11)
mvc __LC_RETURN_PSW(16),__PT_PSW(%r11)
+ BPON
.Lsysc_exit_timer:
stpt __LC_EXIT_TIMER
mvc __VDSO_ECTG_BASE(16,%r14),__LC_EXIT_TIMER
@@ -484,6 +521,7 @@ ENTRY(kernel_thread_starter)
ENTRY(pgm_check_handler)
stpt __LC_SYNC_ENTER_TIMER
+ BPOFF
stmg %r8,%r15,__LC_SAVE_AREA_SYNC
lg %r10,__LC_LAST_BREAK
lg %r12,__LC_THREAD_INFO
@@ -573,6 +611,7 @@ ENTRY(pgm_check_handler)
ENTRY(io_int_handler)
STCK __LC_INT_CLOCK
stpt __LC_ASYNC_ENTER_TIMER
+ BPOFF
stmg %r8,%r15,__LC_SAVE_AREA_ASYNC
lg %r10,__LC_LAST_BREAK
lg %r12,__LC_THREAD_INFO
@@ -614,9 +653,13 @@ ENTRY(io_int_handler)
lg %r14,__LC_VDSO_PER_CPU
lmg %r0,%r10,__PT_R0(%r11)
mvc __LC_RETURN_PSW(16),__PT_PSW(%r11)
+ tm __PT_PSW+1(%r11),0x01 # returning to user ?
+ jno .Lio_exit_kernel
+ BPON
.Lio_exit_timer:
stpt __LC_EXIT_TIMER
mvc __VDSO_ECTG_BASE(16,%r14),__LC_EXIT_TIMER
+.Lio_exit_kernel:
lmg %r11,%r15,__PT_R11(%r11)
lpswe __LC_RETURN_PSW
.Lio_done:
@@ -749,6 +792,7 @@ ENTRY(io_int_handler)
ENTRY(ext_int_handler)
STCK __LC_INT_CLOCK
stpt __LC_ASYNC_ENTER_TIMER
+ BPOFF
stmg %r8,%r15,__LC_SAVE_AREA_ASYNC
lg %r10,__LC_LAST_BREAK
lg %r12,__LC_THREAD_INFO
@@ -787,6 +831,7 @@ ENTRY(psw_idle)
.insn rsy,0xeb0000000017,%r1,5,__SF_EMPTY+16(%r15)
.Lpsw_idle_stcctm:
#endif
+ BPON
STCK __CLOCK_IDLE_ENTER(%r2)
stpt __TIMER_IDLE_ENTER(%r2)
.Lpsw_idle_lpsw:
@@ -891,6 +936,7 @@ load_fpu_regs:
*/
ENTRY(mcck_int_handler)
STCK __LC_MCCK_CLOCK
+ BPOFF
la %r1,4095 # revalidate r1
spt __LC_CPU_TIMER_SAVE_AREA-4095(%r1) # revalidate cpu timer
lmg %r0,%r15,__LC_GPREGS_SAVE_AREA-4095(%r1)# revalidate gprs
@@ -947,6 +993,7 @@ ENTRY(mcck_int_handler)
mvc __LC_RETURN_MCCK_PSW(16),__PT_PSW(%r11) # move return PSW
tm __LC_RETURN_MCCK_PSW+1,0x01 # returning to user ?
jno 0f
+ BPON
stpt __LC_EXIT_TIMER
mvc __VDSO_ECTG_BASE(16,%r14),__LC_EXIT_TIMER
0: lmg %r11,%r15,__PT_R11(%r11)
diff --git a/arch/s390/kernel/ipl.c b/arch/s390/kernel/ipl.c
index 42570d8fb265..6fff0c827558 100644
--- a/arch/s390/kernel/ipl.c
+++ b/arch/s390/kernel/ipl.c
@@ -563,6 +563,7 @@ static struct kset *ipl_kset;
static void __ipl_run(void *unused)
{
+ __bpon();
diag308(DIAG308_IPL, NULL);
if (MACHINE_IS_VM)
__cpcmd("IPL", NULL, 0, NULL);
diff --git a/arch/s390/kernel/module.c b/arch/s390/kernel/module.c
index 0c1a679314dd..c1b05b46282b 100644
--- a/arch/s390/kernel/module.c
+++ b/arch/s390/kernel/module.c
@@ -31,6 +31,7 @@
#include <linux/kernel.h>
#include <linux/moduleloader.h>
#include <linux/bug.h>
+#include <asm/alternative.h>
#if 0
#define DEBUGP printk
@@ -424,6 +425,18 @@ int module_finalize(const Elf_Ehdr *hdr,
const Elf_Shdr *sechdrs,
struct module *me)
{
+ const Elf_Shdr *s;
+ char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset;
+
+ for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) {
+ if (!strcmp(".altinstructions", secstrings + s->sh_name)) {
+ /* patch .altinstructions */
+ void *aseg = (void *)s->sh_addr;
+
+ apply_alternatives(aseg, aseg + s->sh_size);
+ }
+ }
+
jump_label_apply_nops(me);
vfree(me->arch.syminfo);
me->arch.syminfo = NULL;
diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c
index 38bb024fbb76..c1cad2c98e10 100644
--- a/arch/s390/kernel/setup.c
+++ b/arch/s390/kernel/setup.c
@@ -63,6 +63,7 @@
#include <asm/sclp.h>
#include <asm/sysinfo.h>
#include <asm/numa.h>
+#include <asm/alternative.h>
#include "entry.h"
/*
@@ -897,6 +898,8 @@ void __init setup_arch(char **cmdline_p)
conmode_default();
set_preferred_console();
+ apply_alternative_instructions();
+
/* Setup zfcpdump support */
setup_zfcpdump();
diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c
index 7ad070e984f2..72c2e5437d4d 100644
--- a/arch/s390/kernel/smp.c
+++ b/arch/s390/kernel/smp.c
@@ -299,6 +299,7 @@ static void pcpu_delegate(struct pcpu *pcpu, void (*func)(void *),
mem_assign_absolute(lc->restart_fn, (unsigned long) func);
mem_assign_absolute(lc->restart_data, (unsigned long) data);
mem_assign_absolute(lc->restart_source, source_cpu);
+ __bpon();
asm volatile(
"0: sigp 0,%0,%2 # sigp restart to target cpu\n"
" brc 2,0b # busy, try again\n"
@@ -884,6 +885,7 @@ void __cpu_die(unsigned int cpu)
void __noreturn cpu_die(void)
{
idle_task_exit();
+ __bpon();
pcpu_sigp_retry(pcpu_devices + smp_processor_id(), SIGP_STOP, 0);
for (;;) ;
}
diff --git a/arch/s390/kernel/vmlinux.lds.S b/arch/s390/kernel/vmlinux.lds.S
index 445657fe658c..5b96276f7b10 100644
--- a/arch/s390/kernel/vmlinux.lds.S
+++ b/arch/s390/kernel/vmlinux.lds.S
@@ -72,6 +72,32 @@ SECTIONS
EXIT_DATA
}
+ /*
+ * struct alt_inst entries. From the header (alternative.h):
+ * "Alternative instructions for different CPU types or capabilities"
+ * Think locking instructions on spinlocks.
+ * Note, that it is a part of __init region.
+ */
+ . = ALIGN(8);
+ .altinstructions : {
+ __alt_instructions = .;
+ *(.altinstructions)
+ __alt_instructions_end = .;
+ __alt_nobp = .;
+ *(.altnobp)
+ __alt_nobp_end = .;
+ }
+
+ /*
+ * And here are the replacement instructions. The linker sticks
+ * them as binary blobs. The .altinstructions has enough data to
+ * get the address and the length of them to patch the kernel safely.
+ * Note, that it is a part of __init region.
+ */
+ .altinstr_replacement : {
+ *(.altinstr_replacement)
+ }
+
/* early.c uses stsi, which requires page aligned data. */
. = ALIGN(PAGE_SIZE);
INIT_DATA_SECTION(0x100)
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index f47220a91808..cb1b9f22a04e 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -42,7 +42,7 @@ config X86
select ARCH_USE_CMPXCHG_LOCKREF if X86_64
select ARCH_USE_QUEUED_RWLOCKS
select ARCH_USE_QUEUED_SPINLOCKS
- select ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH if SMP
+ select ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
select ARCH_WANTS_DYNAMIC_TASK_STRUCT
select ARCH_WANT_FRAME_POINTERS
select ARCH_WANT_IPC_PARSE_VERSION if X86_32
diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h
index 3aebd11e4aa9..23f1a19f8aec 100644
--- a/arch/x86/boot/compressed/misc.h
+++ b/arch/x86/boot/compressed/misc.h
@@ -9,6 +9,7 @@
*/
#undef CONFIG_PARAVIRT
#undef CONFIG_PARAVIRT_SPINLOCKS
+#undef CONFIG_KAISER
#undef CONFIG_KASAN
#include <linux/linkage.h>
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index cb08e7aa3b8d..ab2686eae08e 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -35,6 +35,8 @@
#include <asm/asm.h>
#include <asm/smap.h>
#include <asm/pgtable_types.h>
+#include <asm/kaiser.h>
+#include <asm/spec_ctrl.h>
#include <linux/err.h>
/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
@@ -135,6 +137,7 @@ ENTRY(entry_SYSCALL_64)
* it is too small to ever cause noticeable irq latency.
*/
SWAPGS_UNSAFE_STACK
+ SWITCH_KERNEL_CR3_NO_STACK
/*
* A hypervisor implementation might want to use a label
* after the swapgs, so that it can do the swapgs
@@ -171,6 +174,9 @@ GLOBAL(entry_SYSCALL_64_after_swapgs)
pushq %r11 /* pt_regs->r11 */
sub $(6*8), %rsp /* pt_regs->bp, bx, r12-15 not saved */
+ ENABLE_IBRS
+ STUFF_RSB
+
testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
jnz tracesys
entry_SYSCALL_64_fastpath:
@@ -207,9 +213,20 @@ entry_SYSCALL_64_fastpath:
testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
jnz int_ret_from_sys_call_irqs_off /* Go to the slow path */
- RESTORE_C_REGS_EXCEPT_RCX_R11
movq RIP(%rsp), %rcx
movq EFLAGS(%rsp), %r11
+
+ DISABLE_IBRS
+
+ RESTORE_C_REGS_EXCEPT_RCX_R11
+ /*
+ * This opens a window where we have a user CR3, but are
+ * running in the kernel. This makes using the CS
+ * register useless for telling whether or not we need to
+ * switch CR3 in NMIs. Normal interrupts are OK because
+ * they are off here.
+ */
+ SWITCH_USER_CR3
movq RSP(%rsp), %rsp
/*
* 64-bit SYSRET restores rip from rcx,
@@ -345,12 +362,32 @@ GLOBAL(int_ret_from_sys_call)
* perf profiles. Nothing jumps here.
*/
syscall_return_via_sysret:
+ DISABLE_IBRS
+
/* rcx and r11 are already restored (see code above) */
RESTORE_C_REGS_EXCEPT_RCX_R11
+ /*
+ * This opens a window where we have a user CR3, but are
+ * running in the kernel. This makes using the CS
+ * register useless for telling whether or not we need to
+ * switch CR3 in NMIs. Normal interrupts are OK because
+ * they are off here.
+ */
+ SWITCH_USER_CR3
movq RSP(%rsp), %rsp
USERGS_SYSRET64
opportunistic_sysret_failed:
+ DISABLE_IBRS
+
+ /*
+ * This opens a window where we have a user CR3, but are
+ * running in the kernel. This makes using the CS
+ * register useless for telling whether or not we need to
+ * switch CR3 in NMIs. Normal interrupts are OK because
+ * they are off here.
+ */
+ SWITCH_USER_CR3
SWAPGS
jmp restore_c_regs_and_iret
END(entry_SYSCALL_64)
@@ -501,6 +538,8 @@ END(irq_entries_start)
SAVE_C_REGS
SAVE_EXTRA_REGS
+ STUFF_RSB
+
testb $3, CS(%rsp)
jz 1f
@@ -509,6 +548,8 @@ END(irq_entries_start)
* tracking that we're in kernel mode.
*/
SWAPGS
+ SWITCH_KERNEL_CR3
+ ENABLE_IBRS
/*
* We need to tell lockdep that IRQs are off. We can't do this until
@@ -566,6 +607,8 @@ GLOBAL(retint_user)
mov %rsp,%rdi
call prepare_exit_to_usermode
TRACE_IRQS_IRETQ
+ DISABLE_IBRS
+ SWITCH_USER_CR3
SWAPGS
jmp restore_regs_and_iret
@@ -623,6 +666,7 @@ native_irq_return_ldt:
pushq %rax
pushq %rdi
SWAPGS
+ SWITCH_KERNEL_CR3
movq PER_CPU_VAR(espfix_waddr), %rdi
movq %rax, (0*8)(%rdi) /* RAX */
movq (2*8)(%rsp), %rax /* RIP */
@@ -638,6 +682,7 @@ native_irq_return_ldt:
andl $0xffff0000, %eax
popq %rdi
orq PER_CPU_VAR(espfix_stack), %rax
+ SWITCH_USER_CR3
SWAPGS
movq %rax, %rsp
popq %rax
@@ -1025,12 +1070,20 @@ idtentry machine_check has_error_code=0 paranoid=1 do_sym=*machine_check_vec
/*
* Save all registers in pt_regs, and switch gs if needed.
* Use slow, but surefire "are we in kernel?" check.
- * Return: ebx=0: need swapgs on exit, ebx=1: otherwise
+ *
+ * Return: ebx=0: needs swapgs but not SWITCH_USER_CR3 in paranoid_exit
+ * ebx=1: needs neither swapgs nor SWITCH_USER_CR3 in paranoid_exit
+ * ebx=2: needs both swapgs and SWITCH_USER_CR3 in paranoid_exit
+ * ebx=3: needs SWITCH_USER_CR3 but not swapgs in paranoid_exit
*/
ENTRY(paranoid_entry)
cld
SAVE_C_REGS 8
SAVE_EXTRA_REGS 8
+
+ /* Do the stuffing unconditionally from user/kernel to be safe */
+ STUFF_RSB
+
movl $1, %ebx
movl $MSR_GS_BASE, %ecx
rdmsr
@@ -1038,7 +1091,27 @@ ENTRY(paranoid_entry)
js 1f /* negative -> in kernel */
SWAPGS
xorl %ebx, %ebx
-1: ret
+1:
+#ifdef CONFIG_KAISER
+ /*
+ * We might have come in between a swapgs and a SWITCH_KERNEL_CR3
+ * on entry, or between a SWITCH_USER_CR3 and a swapgs on exit.
+ * Do a conditional SWITCH_KERNEL_CR3: this could safely be done
+ * unconditionally, but we need to find out whether the reverse
+ * should be done on return (conveyed to paranoid_exit in %ebx).
+ */
+ ALTERNATIVE "jmp 2f", "movq %cr3, %rax", X86_FEATURE_KAISER
+ testl $KAISER_SHADOW_PGD_OFFSET, %eax
+ jz 2f
+ orl $2, %ebx
+ andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), %rax
+ /* If PCID enabled, set X86_CR3_PCID_NOFLUSH_BIT */
+ ALTERNATIVE "", "bts $63, %rax", X86_FEATURE_PCID
+ movq %rax, %cr3
+2:
+#endif
+ ENABLE_IBRS
+ ret
END(paranoid_entry)
/*
@@ -1051,19 +1124,27 @@ END(paranoid_entry)
* be complicated. Fortunately, we there's no good reason
* to try to handle preemption here.
*
- * On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it)
+ * On entry: ebx=0: needs swapgs but not SWITCH_USER_CR3
+ * ebx=1: needs neither swapgs nor SWITCH_USER_CR3
+ * ebx=2: needs both swapgs and SWITCH_USER_CR3
+ * ebx=3: needs SWITCH_USER_CR3 but not swapgs
*/
ENTRY(paranoid_exit)
DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF_DEBUG
- testl %ebx, %ebx /* swapgs needed? */
+ TRACE_IRQS_IRETQ_DEBUG
+ DISABLE_IBRS
+#ifdef CONFIG_KAISER
+ /* No ALTERNATIVE for X86_FEATURE_KAISER: paranoid_entry sets %ebx */
+ testl $2, %ebx /* SWITCH_USER_CR3 needed? */
+ jz paranoid_exit_no_switch
+ SWITCH_USER_CR3
+paranoid_exit_no_switch:
+#endif
+ testl $1, %ebx /* swapgs needed? */
jnz paranoid_exit_no_swapgs
- TRACE_IRQS_IRETQ
SWAPGS_UNSAFE_STACK
- jmp paranoid_exit_restore
paranoid_exit_no_swapgs:
- TRACE_IRQS_IRETQ_DEBUG
-paranoid_exit_restore:
RESTORE_EXTRA_REGS
RESTORE_C_REGS
REMOVE_PT_GPREGS_FROM_STACK 8
@@ -1078,6 +1159,16 @@ ENTRY(error_entry)
cld
SAVE_C_REGS 8
SAVE_EXTRA_REGS 8
+
+ STUFF_RSB
+
+ /*
+ * error_entry() always returns with a kernel gsbase and
+ * CR3. We must also have a kernel CR3/gsbase before
+ * calling TRACE_IRQS_*. Just unconditionally switch to
+ * the kernel CR3 here.
+ */
+ SWITCH_KERNEL_CR3
xorl %ebx, %ebx
testb $3, CS+8(%rsp)
jz .Lerror_kernelspace
@@ -1089,6 +1180,8 @@ ENTRY(error_entry)
*/
SWAPGS
+ ENABLE_IBRS
+
.Lerror_entry_from_usermode_after_swapgs:
/*
* We need to tell lockdep that IRQs are off. We can't do this until
@@ -1133,6 +1226,7 @@ ENTRY(error_entry)
/* fall through */
.Lerror_bad_iret:
+ ENABLE_IBRS_CLOBBER
/*
* We came from an IRET to user mode, so we have user gsbase.
* Switch to kernel gsbase:
@@ -1238,6 +1332,10 @@ ENTRY(nmi)
*/
SWAPGS_UNSAFE_STACK
+ /*
+ * percpu variables are mapped with user CR3, so no need
+ * to switch CR3 here.
+ */
cld
movq %rsp, %rdx
movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
@@ -1271,12 +1369,39 @@ ENTRY(nmi)
movq %rsp, %rdi
movq $-1, %rsi
+#ifdef CONFIG_KAISER
+ /* Unconditionally use kernel CR3 for do_nmi() */
+ /* %rax is saved above, so OK to clobber here */
+ ALTERNATIVE "jmp 2f", "movq %cr3, %rax", X86_FEATURE_KAISER
+ /* If PCID enabled, NOFLUSH now and NOFLUSH on return */
+ ALTERNATIVE "", "bts $63, %rax", X86_FEATURE_PCID
+ pushq %rax
+ /* mask off "user" bit of pgd address and 12 PCID bits: */
+ andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), %rax
+ movq %rax, %cr3
+2:
+#endif
+
+ ENABLE_IBRS
+
call do_nmi
+ DISABLE_IBRS
+
+#ifdef CONFIG_KAISER
+ /*
+ * Unconditionally restore CR3. I know we return to
+ * kernel code that needs user CR3, but do we ever return
+ * to "user mode" where we need the kernel CR3?
+ */
+ ALTERNATIVE "", "popq %rax; movq %rax, %cr3", X86_FEATURE_KAISER
+#endif
+
/*
* Return back to user mode. We must *not* do the normal exit
- * work, because we don't want to enable interrupts. Fortunately,
- * do_nmi doesn't modify pt_regs.
+ * work, because we don't want to enable interrupts. Do not
+ * switch to user CR3: we might be going back to kernel code
+ * that had a user CR3 set.
*/
SWAPGS
jmp restore_c_regs_and_iret
@@ -1473,22 +1598,55 @@ end_repeat_nmi:
ALLOC_PT_GPREGS_ON_STACK
/*
- * Use paranoid_entry to handle SWAPGS, but no need to use paranoid_exit
- * as we should not be calling schedule in NMI context.
- * Even with normal interrupts enabled. An NMI should not be
- * setting NEED_RESCHED or anything that normal interrupts and
- * exceptions might do.
+ * Use the same approach as paranoid_entry to handle SWAPGS, but
+ * without CR3 handling since we do that differently in NMIs. No
+ * need to use paranoid_exit as we should not be calling schedule
+ * in NMI context. Even with normal interrupts enabled. An NMI
+ * should not be setting NEED_RESCHED or anything that normal
+ * interrupts and exceptions might do.
*/
- call paranoid_entry
-
- /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */
+ cld
+ SAVE_C_REGS
+ SAVE_EXTRA_REGS
+ movl $1, %ebx
+ movl $MSR_GS_BASE, %ecx
+ rdmsr
+ testl %edx, %edx
+ js 1f /* negative -> in kernel */
+ SWAPGS
+ xorl %ebx, %ebx
+1:
movq %rsp, %rdi
movq $-1, %rsi
+#ifdef CONFIG_KAISER
+ /* Unconditionally use kernel CR3 for do_nmi() */
+ /* %rax is saved above, so OK to clobber here */
+ ALTERNATIVE "jmp 2f", "movq %cr3, %rax", X86_FEATURE_KAISER
+ /* If PCID enabled, NOFLUSH now and NOFLUSH on return */
+ ALTERNATIVE "", "bts $63, %rax", X86_FEATURE_PCID
+ pushq %rax
+ /* mask off "user" bit of pgd address and 12 PCID bits: */
+ andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), %rax
+ movq %rax, %cr3
+2:
+#endif
+
+ /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */
call do_nmi
+#ifdef CONFIG_KAISER
+ /*
+ * Unconditionally restore CR3. We might be returning to
+ * kernel code that needs user CR3, like just just before
+ * a sysret.
+ */
+ ALTERNATIVE "", "popq %rax; movq %rax, %cr3", X86_FEATURE_KAISER
+#endif
+
testl %ebx, %ebx /* swapgs needed? */
jnz nmi_restore
nmi_swapgs:
+ /* We fixed up CR3 above, so no need to switch it here */
SWAPGS_UNSAFE_STACK
nmi_restore:
RESTORE_EXTRA_REGS
@@ -1520,3 +1678,73 @@ ENTRY(ignore_sysret)
mov $-ENOSYS, %eax
sysret
END(ignore_sysret)
+
+ENTRY(stuff_rsb)
+ call 1f
+ pause
+1: call 2f
+ pause
+2: call 3f;
+ pause
+3: call 4f
+ pause
+4: call 5f
+ pause
+5: call 6f
+ pause
+6: call 7f
+ pause
+7: call 8f
+ pause
+8: call 9f
+ pause
+9: call 10f
+ pause
+10: call 11f
+ pause
+11: call 12f
+ pause
+12: call 13f
+ pause
+13: call 14f
+ pause
+14: call 15f
+ pause
+15: call 16f
+ pause
+16: call 17f
+ pause
+17: call 18f
+ pause
+18: call 19f
+ pause
+19: call 20f
+ pause
+20: call 21f
+ pause
+21: call 22f
+ pause
+22: call 23f
+ pause
+23: call 24f
+ pause
+24: call 25f
+ pause
+25: call 26f
+ pause
+26: call 27f
+ pause
+27: call 28f
+ pause
+28: call 29f
+ pause
+29: call 30f
+ pause
+30: call 31f
+ pause
+31: call 32f
+ pause
+32:
+ add $(32*8), %rsp
+ ret
+END(stuff_rsb)
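
As a worked check on the stack arithmetic above: each of the 32 call instructions pushes an 8-byte return address, so the routine finishes by discarding 32*8 = 256 bytes with "add $(32*8), %rsp". The return addresses are dropped from the stack, but the 32 benign entries they planted remain in the CPU's return stack buffer, which is what the stuffing is for.
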
diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S
index 051d86fbc00c..5c4270f7d5b2 100644
--- a/arch/x86/entry/entry_64_compat.S
+++ b/arch/x86/entry/entry_64_compat.S
@@ -13,6 +13,9 @@
#include <asm/irqflags.h>
#include <asm/asm.h>
#include <asm/smap.h>
+#include <asm/pgtable_types.h>
+#include <asm/kaiser.h>
+#include <asm/spec_ctrl.h>
#include <linux/linkage.h>
#include <linux/err.h>
@@ -50,6 +53,7 @@ ENDPROC(native_usergs_sysret32)
ENTRY(entry_SYSENTER_compat)
/* Interrupts are off on entry. */
SWAPGS_UNSAFE_STACK
+ SWITCH_KERNEL_CR3_NO_STACK
movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
/*
@@ -96,6 +100,9 @@ ENTRY(entry_SYSENTER_compat)
pushq %r8 /* pt_regs->r15 = 0 */
cld
+ ENABLE_IBRS
+ STUFF_RSB
+
/*
* Sysenter doesn't filter flags, so we need to clear NT
* ourselves. To save a few cycles, we can check whether
@@ -161,6 +168,7 @@ ENDPROC(entry_SYSENTER_compat)
ENTRY(entry_SYSCALL_compat)
/* Interrupts are off on entry. */
SWAPGS_UNSAFE_STACK
+ SWITCH_KERNEL_CR3_NO_STACK
/* Stash user ESP and switch to the kernel stack. */
movl %esp, %r8d
@@ -193,6 +201,9 @@ ENTRY(entry_SYSCALL_compat)
pushq %r8 /* pt_regs->r14 = 0 */
pushq %r8 /* pt_regs->r15 = 0 */
+ ENABLE_IBRS
+ STUFF_RSB
+
/*
* User mode is traced as though IRQs are on, and SYSENTER
* turned them off.
@@ -208,6 +219,8 @@ ENTRY(entry_SYSCALL_compat)
/* Opportunistic SYSRET */
sysret32_from_system_call:
TRACE_IRQS_ON /* User mode traces as IRQs on. */
+ DISABLE_IBRS
+ SWITCH_USER_CR3
movq RBX(%rsp), %rbx /* pt_regs->rbx */
movq RBP(%rsp), %rbp /* pt_regs->rbp */
movq EFLAGS(%rsp), %r11 /* pt_regs->flags (in r11) */
@@ -269,6 +282,7 @@ ENTRY(entry_INT80_compat)
PARAVIRT_ADJUST_EXCEPTION_FRAME
ASM_CLAC /* Do this early to minimize exposure */
SWAPGS
+ SWITCH_KERNEL_CR3_NO_STACK
/*
* User tracing code (ptrace or signal handlers) might assume that
@@ -299,6 +313,9 @@ ENTRY(entry_INT80_compat)
pushq %r15 /* pt_regs->r15 */
cld
+ ENABLE_IBRS
+ STUFF_RSB
+
/*
* User mode is traced as though IRQs are on, and the interrupt
* gate turned them off.
@@ -311,6 +328,8 @@ ENTRY(entry_INT80_compat)
/* Go back to user mode. */
TRACE_IRQS_ON
+ DISABLE_IBRS
+ SWITCH_USER_CR3
SWAPGS
jmp restore_regs_and_iret
END(entry_INT80_compat)
diff --git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h
index bfb28caf97b1..d00432579444 100644
--- a/arch/x86/include/asm/barrier.h
+++ b/arch/x86/include/asm/barrier.h
@@ -23,6 +23,8 @@
#define wmb() asm volatile("sfence" ::: "memory")
#endif
+#define gmb() alternative("", "lfence", X86_FEATURE_LFENCE_RDTSC);
+
#ifdef CONFIG_X86_PPRO_FENCE
#define dma_rmb() rmb()
#else
diff --git a/arch/x86/include/asm/cmdline.h b/arch/x86/include/asm/cmdline.h
index e01f7f7ccb0c..84ae170bc3d0 100644
--- a/arch/x86/include/asm/cmdline.h
+++ b/arch/x86/include/asm/cmdline.h
@@ -2,5 +2,7 @@
#define _ASM_X86_CMDLINE_H
int cmdline_find_option_bool(const char *cmdline_ptr, const char *option);
+int cmdline_find_option(const char *cmdline_ptr, const char *option,
+ char *buffer, int bufsize);
#endif /* _ASM_X86_CMDLINE_H */
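
cmdline_find_option() is only declared here; its implementation (arch/x86/lib/cmdline.c) and its real callers are elsewhere in this series. The sketch below shows how such a helper could be consumed at boot. The wrapper and the positive-return-length convention are assumptions made for illustration, not taken from this patch.

    /* Hypothetical boot-time consumer; the pti= values mirror the
     * kernel-parameters.txt hunk earlier in this diff.
     */
    #include <linux/init.h>     /* __init, boot_command_line */
    #include <linux/string.h>
    #include <asm/cmdline.h>

    static int __init example_parse_pti(void)
    {
        char arg[5];
        int len;

        if (cmdline_find_option_bool(boot_command_line, "nopti"))
            return 0;                   /* disabled */

        len = cmdline_find_option(boot_command_line, "pti", arg, sizeof(arg));
        if (len == 3 && !strncmp(arg, "off", 3))
            return 0;                   /* disabled */
        if (len == 2 && !strncmp(arg, "on", 2))
            return 1;                   /* forced on */

        return -1;                      /* auto (default) */
    }
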
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index c6adce7aa97b..2ddcaf350dc5 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -92,7 +92,7 @@
#define X86_FEATURE_SYSCALL32 ( 3*32+14) /* "" syscall in ia32 userspace */
#define X86_FEATURE_SYSENTER32 ( 3*32+15) /* "" sysenter in ia32 userspace */
#define X86_FEATURE_REP_GOOD ( 3*32+16) /* rep microcode works well */
-#define X86_FEATURE_MFENCE_RDTSC ( 3*32+17) /* "" Mfence synchronizes RDTSC */
+/* free, was #define X86_FEATURE_MFENCE_RDTSC ( 3*32+17) * "" Mfence synchronizes RDTSC */
#define X86_FEATURE_LFENCE_RDTSC ( 3*32+18) /* "" Lfence synchronizes RDTSC */
/* free, was #define X86_FEATURE_11AP ( 3*32+19) * "" Bad local APIC aka 11AP */
#define X86_FEATURE_NOPL ( 3*32+20) /* The NOPL (0F 1F) instructions */
@@ -189,6 +189,7 @@
#define X86_FEATURE_ARAT ( 7*32+ 1) /* Always Running APIC Timer */
#define X86_FEATURE_CPB ( 7*32+ 2) /* AMD Core Performance Boost */
#define X86_FEATURE_EPB ( 7*32+ 3) /* IA32_ENERGY_PERF_BIAS support */
+#define X86_FEATURE_INVPCID_SINGLE ( 7*32+ 4) /* Effectively INVPCID && CR4.PCIDE=1 */
#define X86_FEATURE_PLN ( 7*32+ 5) /* Intel Power Limit Notification */
#define X86_FEATURE_PTS ( 7*32+ 6) /* Intel Package Thermal Status */
#define X86_FEATURE_DTHERM ( 7*32+ 7) /* Digital Thermal Sensor */
@@ -202,6 +203,10 @@
#define X86_FEATURE_INTEL_PT ( 7*32+15) /* Intel Processor Trace */
#define X86_FEATURE_AVX512_4VNNIW (7*32+16) /* AVX-512 Neural Network Instructions */
#define X86_FEATURE_AVX512_4FMAPS (7*32+17) /* AVX-512 Multiply Accumulation Single precision */
+#define X86_FEATURE_SPEC_CTRL ( 7*32+19) /* Control Speculation Control */
+
+/* Because the ALTERNATIVE scheme is for members of the X86_FEATURE club... */
+#define X86_FEATURE_KAISER ( 7*32+31) /* CONFIG_KAISER w/o nokaiser */
/* Virtualization flags: Linux defined, word 8 */
#define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */
@@ -267,6 +272,7 @@
/* AMD-defined CPU features, CPUID level 0x80000008 (ebx), word 13 */
#define X86_FEATURE_CLZERO (13*32+0) /* CLZERO instruction */
#define X86_FEATURE_IRPERF (13*32+1) /* Instructions Retired Count */
+#define X86_FEATURE_IBPB (13*32+12) /* Indirect Branch Prediction Barrier */
/* Intel-defined CPU features, CPUID level 0x00000007:0 (ecx), word 16 */
#define X86_FEATURE_PKU (16*32+ 3) /* Protection Keys for Userspace */
diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
index 4e10d73cf018..4ffe9c782633 100644
--- a/arch/x86/include/asm/desc.h
+++ b/arch/x86/include/asm/desc.h
@@ -43,7 +43,11 @@ struct gdt_page {
struct desc_struct gdt[GDT_ENTRIES];
} __attribute__((aligned(PAGE_SIZE)));
+#ifdef __GENKSYMS__
DECLARE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page);
+#else
+DECLARE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(struct gdt_page, gdt_page);
+#endif
static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu)
{
diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h
index 59405a248fc2..9b76cd331990 100644
--- a/arch/x86/include/asm/hardirq.h
+++ b/arch/x86/include/asm/hardirq.h
@@ -22,8 +22,8 @@ typedef struct {
#ifdef CONFIG_SMP
unsigned int irq_resched_count;
unsigned int irq_call_count;
- unsigned int irq_tlb_count;
#endif
+ unsigned int irq_tlb_count;
#ifdef CONFIG_X86_THERMAL_VECTOR
unsigned int irq_thermal_count;
#endif
diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index 84b3d194a958..60353c020f61 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -192,7 +192,7 @@ extern char irq_entries_start[];
#define VECTOR_RETRIGGERED ((void *)~0UL)
typedef struct irq_desc* vector_irq_t[NR_VECTORS];
-DECLARE_PER_CPU(vector_irq_t, vector_irq);
+DECLARE_PER_CPU_USER_MAPPED(vector_irq_t, vector_irq);
#endif /* !ASSEMBLY_ */
diff --git a/arch/x86/include/asm/kaiser.h b/arch/x86/include/asm/kaiser.h
new file mode 100644
index 000000000000..b5e46aa683f4
--- /dev/null
+++ b/arch/x86/include/asm/kaiser.h
@@ -0,0 +1,141 @@
+#ifndef _ASM_X86_KAISER_H
+#define _ASM_X86_KAISER_H
+
+#include <uapi/asm/processor-flags.h> /* For PCID constants */
+
+/*
+ * This file contains the definitions for the KAISER feature.
+ * KAISER is a countermeasure against x86_64 side-channel attacks on
+ * kernel virtual memory. It keeps a shadow pgd for every process: the
+ * shadow pgd maps the whole of user memory but only a minimal set of
+ * kernel code and data. On entry to the kernel (context switch or
+ * interrupt) the pgd is switched to the normal kernel one; on return to
+ * user mode the shadow pgd is installed, keeping kernel addresses out of
+ * the user-visible translation caches and out of reach of side channels.
+ *
+ * The minimal kernel mapping holds only the parts that must be visible
+ * in user mode, such as the entry/exit code and the stacks.
+ */
+
+#define KAISER_SHADOW_PGD_OFFSET 0x1000
+
+#ifdef __ASSEMBLY__
+#ifdef CONFIG_KAISER
+
+.macro _SWITCH_TO_KERNEL_CR3 reg
+movq %cr3, \reg
+andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), \reg
+/* If PCID enabled, set X86_CR3_PCID_NOFLUSH_BIT */
+ALTERNATIVE "", "bts $63, \reg", X86_FEATURE_PCID
+movq \reg, %cr3
+.endm
+
+.macro _SWITCH_TO_USER_CR3 reg regb
+/*
+ * regb must be the low byte portion of reg: because we have arranged
+ * for the low byte of the user PCID to serve as the high byte of NOFLUSH
+ * (0x80 for each when PCID is enabled, or 0x00 when PCID and NOFLUSH are
+ * not enabled): so that the one register can update both memory and cr3.
+ */
+movq %cr3, \reg
+orq PER_CPU_VAR(x86_cr3_pcid_user), \reg
+js 9f
+/* If PCID enabled, FLUSH this time, reset to NOFLUSH for next time */
+movb \regb, PER_CPU_VAR(x86_cr3_pcid_user+7)
+9:
+movq \reg, %cr3
+.endm
+
+.macro SWITCH_KERNEL_CR3
+ALTERNATIVE "jmp 8f", "pushq %rax", X86_FEATURE_KAISER
+_SWITCH_TO_KERNEL_CR3 %rax
+popq %rax
+8:
+.endm
+
+.macro SWITCH_USER_CR3
+ALTERNATIVE "jmp 8f", "pushq %rax", X86_FEATURE_KAISER
+_SWITCH_TO_USER_CR3 %rax %al
+popq %rax
+8:
+.endm
+
+.macro SWITCH_KERNEL_CR3_NO_STACK
+ALTERNATIVE "jmp 8f", \
+ __stringify(movq %rax, PER_CPU_VAR(unsafe_stack_register_backup)), \
+ X86_FEATURE_KAISER
+_SWITCH_TO_KERNEL_CR3 %rax
+movq PER_CPU_VAR(unsafe_stack_register_backup), %rax
+8:
+.endm
+
+#else /* CONFIG_KAISER */
+
+.macro SWITCH_KERNEL_CR3
+.endm
+.macro SWITCH_USER_CR3
+.endm
+.macro SWITCH_KERNEL_CR3_NO_STACK
+.endm
+
+#endif /* CONFIG_KAISER */
+
+#else /* __ASSEMBLY__ */
+
+#ifdef CONFIG_KAISER
+/*
+ * On a kernel/user mode switch the address space may have to be
+ * switched before the registers have been saved. Changing the
+ * address space needs a scratch register, so one register is
+ * saved to and restored from this per-cpu backup location.
+ */
+DECLARE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup);
+
+DECLARE_PER_CPU(unsigned long, x86_cr3_pcid_user);
+
+extern char __per_cpu_user_mapped_start[], __per_cpu_user_mapped_end[];
+
+extern int kaiser_enabled;
+extern void __init kaiser_check_boottime_disable(void);
+#else
+#define kaiser_enabled 0
+static inline void __init kaiser_check_boottime_disable(void) {}
+#endif /* CONFIG_KAISER */
+
+/*
+ * Kaiser function prototypes are needed even when CONFIG_KAISER is not set,
+ * so as to build with tests on kaiser_enabled instead of #ifdefs.
+ */
+
+/**
+ * kaiser_add_mapping - map a virtual memory part to the shadow (user) mapping
+ * @addr: the start address of the range
+ * @size: the size of the range
+ * @flags: the mapping flags of the pages
+ *
+ * The mapping applies to the globally shared shadow page tables, so no
+ * further per-process synchronization is needed. The pages have to be
+ * unmapped manually once they are no longer needed.
+ */
+extern int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags);
+
+/**
+ * kaiser_remove_mapping - unmap a virtual memory part of the shadow mapping
+ * @start: the start address of the range
+ * @size: the size of the range
+ */
+extern void kaiser_remove_mapping(unsigned long start, unsigned long size);
+
+/**
+ * kaiser_init - Initialize the shadow mapping
+ *
+ * Most parts of the shadow mapping can be set up at boot
+ * time. Only per-process things like the thread stacks
+ * or a new LDT have to be mapped at runtime. The boot-
+ * time mappings are permanent and never unmapped.
+ */
+extern void kaiser_init(void);
+
+#endif /* __ASSEMBLY */
+
+#endif /* _ASM_X86_KAISER_H */
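The CR3 arithmetic hidden in the macros above is easier to see in plain C. Below is a minimal illustrative sketch, not part of the patch: the constants mirror this header and the pgtable_types.h hunk further down, and the pgd physical address is an invented example.

/*
 * Illustrative only: a user-space model of the two CR3 values the
 * _SWITCH_TO_KERNEL_CR3 / _SWITCH_TO_USER_CR3 macros switch between.
 */
#include <stdint.h>
#include <stdio.h>

#define KAISER_SHADOW_PGD_OFFSET 0x1000ULL              /* shadow pgd = pgd + 4k */
#define X86_CR3_PCID_ASID_MASK   ((1ULL << 12) - 1)     /* ASID lives in bits 0-11 */
#define X86_CR3_PCID_ASID_USER   0x80ULL                /* user ASID when PCID is on */
#define X86_CR3_PCID_NOFLUSH     (1ULL << 63)           /* keep TLB entries on write */

/* What _SWITCH_TO_KERNEL_CR3 computes: kernel pgd, kernel ASID 0. */
static uint64_t kernel_cr3(uint64_t cr3)
{
	return cr3 & ~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET);
}

/* What _SWITCH_TO_USER_CR3 computes: shadow pgd, user ASID, maybe NOFLUSH. */
static uint64_t user_cr3(uint64_t cr3, int pcid, int noflush)
{
	uint64_t user = cr3 | KAISER_SHADOW_PGD_OFFSET;

	if (pcid)
		user |= X86_CR3_PCID_ASID_USER | (noflush ? X86_CR3_PCID_NOFLUSH : 0);
	return user;
}

int main(void)
{
	uint64_t cr3 = 0x12340000ULL;	/* hypothetical, 8k-aligned kernel pgd */

	printf("kernel CR3: %#llx\n", (unsigned long long)kernel_cr3(cr3));
	printf("user   CR3: %#llx\n", (unsigned long long)user_cr3(cr3, 1, 1));
	return 0;
}

The 8k-aligned pgd allocation set up via NEXT_PGD_PAGE in the head_64.S hunk below is what makes addressing the shadow pgd as "kernel pgd | KAISER_SHADOW_PGD_OFFSET" possible in the first place.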
diff --git a/arch/x86/include/asm/microcode.h b/arch/x86/include/asm/microcode.h
index 712b24ed3a64..b811c9efdc98 100644
--- a/arch/x86/include/asm/microcode.h
+++ b/arch/x86/include/asm/microcode.h
@@ -4,20 +4,7 @@
#include <linux/earlycpio.h>
#include <linux/initrd.h>
-#define native_rdmsr(msr, val1, val2) \
-do { \
- u64 __val = native_read_msr((msr)); \
- (void)((val1) = (u32)__val); \
- (void)((val2) = (u32)(__val >> 32)); \
-} while (0)
-
-#define native_wrmsr(msr, low, high) \
- native_write_msr(msr, low, high)
-
-#define native_wrmsrl(msr, val) \
- native_write_msr((msr), \
- (u32)((u64)(val)), \
- (u32)((u64)(val) >> 32))
+#include <asm/msr.h>
struct cpu_signature {
unsigned int sig;
diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h
index 17d048d2fbab..ae92b25c691c 100644
--- a/arch/x86/include/asm/mmu.h
+++ b/arch/x86/include/asm/mmu.h
@@ -36,12 +36,6 @@ typedef struct {
#endif
} mm_context_t;
-#ifdef CONFIG_SMP
void leave_mm(int cpu);
-#else
-static inline void leave_mm(int cpu)
-{
-}
-#endif
#endif /* _ASM_X86_MMU_H */
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index 25d3a2666e57..8a3c05223297 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -99,10 +99,8 @@ static inline void load_mm_ldt(struct mm_struct *mm)
static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
{
-#ifdef CONFIG_SMP
if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK)
this_cpu_write(cpu_tlbstate.state, TLBSTATE_LAZY);
-#endif
}
static inline int init_new_context(struct task_struct *tsk,
@@ -123,103 +121,12 @@ static inline void destroy_context(struct mm_struct *mm)
destroy_context_ldt(mm);
}
-static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
- struct task_struct *tsk)
-{
- unsigned cpu = smp_processor_id();
+extern void switch_mm(struct mm_struct *prev, struct mm_struct *next,
+ struct task_struct *tsk);
- if (likely(prev != next)) {
-#ifdef CONFIG_SMP
- this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK);
- this_cpu_write(cpu_tlbstate.active_mm, next);
-#endif
- cpumask_set_cpu(cpu, mm_cpumask(next));
-
- /*
- * Re-load page tables.
- *
- * This logic has an ordering constraint:
- *
- * CPU 0: Write to a PTE for 'next'
- * CPU 0: load bit 1 in mm_cpumask. if nonzero, send IPI.
- * CPU 1: set bit 1 in next's mm_cpumask
- * CPU 1: load from the PTE that CPU 0 writes (implicit)
- *
- * We need to prevent an outcome in which CPU 1 observes
- * the new PTE value and CPU 0 observes bit 1 clear in
- * mm_cpumask. (If that occurs, then the IPI will never
- * be sent, and CPU 0's TLB will contain a stale entry.)
- *
- * The bad outcome can occur if either CPU's load is
- * reordered before that CPU's store, so both CPUs must
- * execute full barriers to prevent this from happening.
- *
- * Thus, switch_mm needs a full barrier between the
- * store to mm_cpumask and any operation that could load
- * from next->pgd. TLB fills are special and can happen
- * due to instruction fetches or for no reason at all,
- * and neither LOCK nor MFENCE orders them.
- * Fortunately, load_cr3() is serializing and gives the
- * ordering guarantee we need.
- *
- */
- load_cr3(next->pgd);
-
- trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
-
- /* Stop flush ipis for the previous mm */
- cpumask_clear_cpu(cpu, mm_cpumask(prev));
-
- /* Load per-mm CR4 state */
- load_mm_cr4(next);
-
-#ifdef CONFIG_MODIFY_LDT_SYSCALL
- /*
- * Load the LDT, if the LDT is different.
- *
- * It's possible that prev->context.ldt doesn't match
- * the LDT register. This can happen if leave_mm(prev)
- * was called and then modify_ldt changed
- * prev->context.ldt but suppressed an IPI to this CPU.
- * In this case, prev->context.ldt != NULL, because we
- * never set context.ldt to NULL while the mm still
- * exists. That means that next->context.ldt !=
- * prev->context.ldt, because mms never share an LDT.
- */
- if (unlikely(prev->context.ldt != next->context.ldt))
- load_mm_ldt(next);
-#endif
- }
-#ifdef CONFIG_SMP
- else {
- this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK);
- BUG_ON(this_cpu_read(cpu_tlbstate.active_mm) != next);
-
- if (!cpumask_test_cpu(cpu, mm_cpumask(next))) {
- /*
- * On established mms, the mm_cpumask is only changed
- * from irq context, from ptep_clear_flush() while in
- * lazy tlb mode, and here. Irqs are blocked during
- * schedule, protecting us from simultaneous changes.
- */
- cpumask_set_cpu(cpu, mm_cpumask(next));
-
- /*
- * We were in lazy tlb mode and leave_mm disabled
- * tlb flush IPI delivery. We must reload CR3
- * to make sure to use no freed page tables.
- *
- * As above, load_cr3() is serializing and orders TLB
- * fills with respect to the mm_cpumask write.
- */
- load_cr3(next->pgd);
- trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
- load_mm_cr4(next);
- load_mm_ldt(next);
- }
- }
-#endif
-}
+extern void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
+ struct task_struct *tsk);
+#define switch_mm_irqs_off switch_mm_irqs_off
#define activate_mm(prev, next) \
do { \
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index c7258ca9d4f4..38fedaa5e131 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -32,6 +32,9 @@
#define EFER_FFXSR (1<<_EFER_FFXSR)
/* Intel MSRs. Some also available on other CPUs */
+#define MSR_IA32_SPEC_CTRL 0x00000048
+#define MSR_IA32_PRED_CMD 0x00000049
+
#define MSR_IA32_PERFCTR0 0x000000c1
#define MSR_IA32_PERFCTR1 0x000000c2
#define MSR_FSB_FREQ 0x000000cd
@@ -333,6 +336,7 @@
#define MSR_F15H_NB_PERF_CTL 0xc0010240
#define MSR_F15H_NB_PERF_CTR 0xc0010241
#define MSR_F15H_PTSC 0xc0010280
+#define MSR_F15H_IC_CFG 0xc0011021
/* Fam 10h MSRs */
#define MSR_FAM10H_MMIO_CONF_BASE 0xc0010058
@@ -342,6 +346,8 @@
#define FAM10H_MMIO_CONF_BASE_MASK 0xfffffffULL
#define FAM10H_MMIO_CONF_BASE_SHIFT 20
#define MSR_FAM10H_NODE_ID 0xc001100c
+#define MSR_F10H_DECFG 0xc0011029
+#define MSR_F10H_DECFG_LFENCE_SERIALIZE_BIT 1
/* K8 MSRs */
#define MSR_K8_TOP_MEM1 0xc001001a
@@ -425,6 +431,8 @@
#define FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX (1<<1)
#define FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX (1<<2)
#define FEATURE_CONTROL_LMCE (1<<20)
+#define FEATURE_ENABLE_IBRS (1<<0)
+#define FEATURE_SET_IBPB (1<<0)
#define MSR_IA32_APICBASE 0x0000001b
#define MSR_IA32_APICBASE_BSP (1<<8)
diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h
index 77d8b284e4a7..9f900ef276cf 100644
--- a/arch/x86/include/asm/msr.h
+++ b/arch/x86/include/asm/msr.h
@@ -147,8 +147,7 @@ static __always_inline unsigned long long rdtsc_ordered(void)
* that some other imaginary CPU is updating continuously with a
* time stamp.
*/
- alternative_2("", "mfence", X86_FEATURE_MFENCE_RDTSC,
- "lfence", X86_FEATURE_LFENCE_RDTSC);
+ alternative("", "lfence", X86_FEATURE_LFENCE_RDTSC);
return rdtsc();
}
@@ -163,6 +162,21 @@ static inline unsigned long long native_read_pmc(int counter)
return EAX_EDX_VAL(val, low, high);
}
+#define native_rdmsr(msr, val1, val2) \
+do { \
+ u64 __val = native_read_msr((msr)); \
+ (void)((val1) = (u32)__val); \
+ (void)((val2) = (u32)(__val >> 32)); \
+} while (0)
+
+#define native_wrmsr(msr, low, high) \
+ native_write_msr(msr, low, high)
+
+#define native_wrmsrl(msr, val) \
+ native_write_msr((msr), \
+ (u32)((u64)(val)), \
+ (u32)((u64)(val) >> 32))
+
#ifdef CONFIG_PARAVIRT
#include <asm/paravirt.h>
#else
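The native_rdmsr()/native_wrmsrl() macros moved into this header just split a 64-bit MSR value into the EDX:EAX register halves used by the rdmsr/wrmsr instructions. A small illustrative user-space model of that split, not part of the patch:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t val = 0x1122334455667788ULL;	/* arbitrary example MSR value */
	uint32_t low  = (uint32_t)val;		/* goes into EAX */
	uint32_t high = (uint32_t)(val >> 32);	/* goes into EDX */

	/* wrmsr consumes EDX:EAX; rdmsr reassembles them the same way. */
	uint64_t back = ((uint64_t)high << 32) | low;

	printf("low=%#x high=%#x reassembled=%#llx\n",
	       low, high, (unsigned long long)back);
	return 0;
}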
diff --git a/arch/x86/include/asm/mwait.h b/arch/x86/include/asm/mwait.h
index c70689b5e5aa..d1def1422510 100644
--- a/arch/x86/include/asm/mwait.h
+++ b/arch/x86/include/asm/mwait.h
@@ -3,6 +3,8 @@
#include <linux/sched.h>
+#include <asm/spec_ctrl.h>
+
#define MWAIT_SUBSTATE_MASK 0xf
#define MWAIT_CSTATE_MASK 0xf
#define MWAIT_SUBSTATE_SIZE 4
@@ -102,9 +104,13 @@ static inline void mwait_idle_with_hints(unsigned long eax, unsigned long ecx)
mb();
}
+ x86_disable_ibrs();
+
__monitor((void *)&current_thread_info()->flags, 0, 0);
if (!need_resched())
__mwait(eax, ecx);
+
+ x86_enable_ibrs();
}
current_clr_polling();
}
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 20e9f9594db8..d45dc0baaca9 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -18,6 +18,12 @@
#ifndef __ASSEMBLY__
#include <asm/x86_init.h>
+#ifdef CONFIG_KAISER
+extern int kaiser_enabled;
+#else
+#define kaiser_enabled 0
+#endif
+
void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd);
void ptdump_walk_pgd_level_checkwx(void);
@@ -692,7 +698,17 @@ static inline pud_t *pud_offset(pgd_t *pgd, unsigned long address)
static inline int pgd_bad(pgd_t pgd)
{
- return (pgd_flags(pgd) & ~_PAGE_USER) != _KERNPG_TABLE;
+ pgdval_t ignore_flags = _PAGE_USER;
+ /*
+ * We set NX on KAISER pgds that map userspace memory so
+ * that userspace can not meaningfully use the kernel
+ * page table by accident; it will fault on the first
+ * instruction it tries to run. See native_set_pgd().
+ */
+ if (kaiser_enabled)
+ ignore_flags |= _PAGE_NX;
+
+ return (pgd_flags(pgd) & ~ignore_flags) != _KERNPG_TABLE;
}
static inline int pgd_none(pgd_t pgd)
@@ -900,7 +916,15 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm,
*/
static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count)
{
- memcpy(dst, src, count * sizeof(pgd_t));
+ memcpy(dst, src, count * sizeof(pgd_t));
+#ifdef CONFIG_KAISER
+ if (kaiser_enabled) {
+ /* Clone the shadow pgd part as well */
+ memcpy(native_get_shadow_pgd(dst),
+ native_get_shadow_pgd(src),
+ count * sizeof(pgd_t));
+ }
+#endif
}
#define PTE_SHIFT ilog2(PTRS_PER_PTE)
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index 1cc82ece9ac1..cf68b5c1cb74 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -106,9 +106,32 @@ static inline void native_pud_clear(pud_t *pud)
native_set_pud(pud, native_make_pud(0));
}
+#ifdef CONFIG_KAISER
+extern pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd);
+
+static inline pgd_t *native_get_shadow_pgd(pgd_t *pgdp)
+{
+#ifdef CONFIG_DEBUG_VM
+ /* linux/mmdebug.h may not have been included at this point */
+ BUG_ON(!kaiser_enabled);
+#endif
+ return (pgd_t *)((unsigned long)pgdp | (unsigned long)PAGE_SIZE);
+}
+#else
+static inline pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd)
+{
+ return pgd;
+}
+static inline pgd_t *native_get_shadow_pgd(pgd_t *pgdp)
+{
+ BUILD_BUG_ON(1);
+ return NULL;
+}
+#endif /* CONFIG_KAISER */
+
static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd)
{
- *pgdp = pgd;
+ *pgdp = kaiser_set_shadow_pgd(pgdp, pgd);
}
static inline void native_pgd_clear(pgd_t *pgd)
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index 2fd7194d2144..7ce43cc540db 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -120,7 +120,7 @@
#define _PAGE_DEVMAP (_AT(pteval_t, 0))
#endif
-#define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE)
+#define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE)
#define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \
_PAGE_ACCESSED | _PAGE_DIRTY)
@@ -138,6 +138,33 @@
_PAGE_SOFT_DIRTY)
#define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE)
+/* The ASID is the lower 12 bits of CR3 */
+#define X86_CR3_PCID_ASID_MASK (_AC((1<<12)-1,UL))
+
+/* Mask for all the PCID-related bits in CR3: */
+#define X86_CR3_PCID_MASK (X86_CR3_PCID_NOFLUSH | X86_CR3_PCID_ASID_MASK)
+#define X86_CR3_PCID_ASID_KERN (_AC(0x0,UL))
+
+#if defined(CONFIG_KAISER) && defined(CONFIG_X86_64)
+/* Let X86_CR3_PCID_ASID_USER be usable for the X86_CR3_PCID_NOFLUSH bit */
+#define X86_CR3_PCID_ASID_USER (_AC(0x80,UL))
+
+#define X86_CR3_PCID_KERN_FLUSH (X86_CR3_PCID_ASID_KERN)
+#define X86_CR3_PCID_USER_FLUSH (X86_CR3_PCID_ASID_USER)
+#define X86_CR3_PCID_KERN_NOFLUSH (X86_CR3_PCID_NOFLUSH | X86_CR3_PCID_ASID_KERN)
+#define X86_CR3_PCID_USER_NOFLUSH (X86_CR3_PCID_NOFLUSH | X86_CR3_PCID_ASID_USER)
+#else
+#define X86_CR3_PCID_ASID_USER (_AC(0x0,UL))
+/*
+ * PCIDs are unsupported on 32-bit and none of these bits can be
+ * set in CR3:
+ */
+#define X86_CR3_PCID_KERN_FLUSH (0)
+#define X86_CR3_PCID_USER_FLUSH (0)
+#define X86_CR3_PCID_KERN_NOFLUSH (0)
+#define X86_CR3_PCID_USER_NOFLUSH (0)
+#endif
+
/*
* The cache modes defined here are used to translate between pure SW usage
* and the HW defined cache mode bits and/or PAT entries.
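The choice of 0x80 for X86_CR3_PCID_ASID_USER is what makes the single-byte store in kaiser.h's _SWITCH_TO_USER_CR3 work: storing the low byte of the user CR3 value at byte offset 7 of the per-cpu x86_cr3_pcid_user variable sets bit 63 (X86_CR3_PCID_NOFLUSH) when PCID is in use and leaves it clear otherwise. An illustrative little-endian demo, not part of the patch:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
	uint64_t x86_cr3_pcid_user = 0x1080;	/* shadow offset | user ASID */
	uint8_t low_byte = (uint8_t)x86_cr3_pcid_user;	/* 0x80 when PCID is on */

	/* models "movb %al, x86_cr3_pcid_user+7" on a little-endian machine */
	memcpy((uint8_t *)&x86_cr3_pcid_user + 7, &low_byte, 1);

	printf("after store: %#llx (bit 63 = NOFLUSH is now set)\n",
	       (unsigned long long)x86_cr3_pcid_user);
	return 0;
}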
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 88b0f5974d90..9b04acbcd5ba 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -319,7 +319,11 @@ struct tss_struct {
} ____cacheline_aligned;
+#ifndef __GENKSYMS__
+DECLARE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(struct tss_struct, cpu_tss);
+#else
DECLARE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss);
+#endif
#ifdef CONFIG_X86_32
DECLARE_PER_CPU(unsigned long, cpu_current_top_of_stack);
diff --git a/arch/x86/include/asm/proto.h b/arch/x86/include/asm/proto.h
index a4a77286cb1d..6cd9c49ba651 100644
--- a/arch/x86/include/asm/proto.h
+++ b/arch/x86/include/asm/proto.h
@@ -16,6 +16,7 @@ void entry_SYSENTER_compat(void);
void x86_configure_nx(void);
void x86_report_nx(void);
+void stuff_rsb(void);
extern int reboot_force;
diff --git a/arch/x86/include/asm/spec_ctrl.h b/arch/x86/include/asm/spec_ctrl.h
new file mode 100644
index 000000000000..96f61a82396d
--- /dev/null
+++ b/arch/x86/include/asm/spec_ctrl.h
@@ -0,0 +1,108 @@
+#ifndef _ASM_X86_SPEC_CTRL_H
+#define _ASM_X86_SPEC_CTRL_H
+
+#include <linux/stringify.h>
+#include <asm/msr-index.h>
+#include <asm/cpufeature.h>
+#include <asm/alternative-asm.h>
+
+#ifdef __ASSEMBLY__
+
+.macro __ENABLE_IBRS_CLOBBER
+ movl $MSR_IA32_SPEC_CTRL, %ecx
+ xorl %edx, %edx
+ movl $FEATURE_ENABLE_IBRS, %eax
+ wrmsr
+.endm
+
+.macro ENABLE_IBRS_CLOBBER
+ ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_SPEC_CTRL
+ call x86_ibrs_enabled
+ test %eax, %eax
+ jz .Llfence_\@
+
+ __ENABLE_IBRS_CLOBBER
+ jmp .Lend_\@
+
+.Llfence_\@:
+ lfence
+.Lend_\@:
+.endm
+
+
+.macro ENABLE_IBRS
+ ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_SPEC_CTRL
+
+ pushq %rax
+
+ call x86_ibrs_enabled
+ test %eax, %eax
+ jz .Llfence_\@
+
+ pushq %rcx
+ pushq %rdx
+ __ENABLE_IBRS_CLOBBER
+ popq %rdx
+ popq %rcx
+
+ jmp .Lpop_\@
+
+.Llfence_\@:
+ lfence
+
+.Lpop_\@:
+ popq %rax
+
+.Lend_\@:
+.endm
+
+
+.macro DISABLE_IBRS
+ ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_SPEC_CTRL
+
+ pushq %rax
+
+ call x86_ibrs_enabled
+ test %eax, %eax
+ jz .Llfence_\@
+
+ pushq %rcx
+ pushq %rdx
+ movl $MSR_IA32_SPEC_CTRL, %ecx
+ xorl %edx, %edx
+ xorl %eax, %eax
+ wrmsr
+ popq %rdx
+ popq %rcx
+
+ jmp .Lpop_\@
+
+.Llfence_\@:
+ lfence
+
+.Lpop_\@:
+ popq %rax
+
+.Lend_\@:
+.endm
+
+.macro STUFF_RSB
+ ALTERNATIVE "call stuff_rsb", "", X86_FEATURE_SMEP
+.endm
+
+#else /* __ASSEMBLY__ */
+void x86_enable_ibrs(void);
+void x86_disable_ibrs(void);
+void stuff_RSB(void);
+unsigned int x86_ibrs_enabled(void);
+unsigned int x86_ibpb_enabled(void);
+void x86_spec_check(void);
+
+static inline void x86_ibp_barrier(void)
+{
+ if (x86_ibpb_enabled())
+ native_wrmsrl(MSR_IA32_PRED_CMD, FEATURE_SET_IBPB);
+}
+
+#endif /* __ASSEMBLY__ */
+#endif /* _ASM_X86_SPEC_CTRL_H */
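For readers less used to the ALTERNATIVE-based assembly above, here is a rough user-space model, not part of the patch, of the decision ENABLE_IBRS/DISABLE_IBRS implement: skip everything unless X86_FEATURE_SPEC_CTRL was detected, write the MSR only when IBRS is enabled, and otherwise fall back to an lfence. The wrmsr itself is stubbed out with a printf.

#include <stdio.h>

#define MSR_IA32_SPEC_CTRL  0x48
#define FEATURE_ENABLE_IBRS 0x1

static int have_spec_ctrl = 1;	/* models the ALTERNATIVE patch site */
static int ibrs_enabled = 1;	/* models x86_ibrs_enabled() */

static void wrmsr_stub(unsigned int msr, unsigned long long val)
{
	printf("wrmsr %#x <- %#llx\n", msr, val);
}

static void enable_ibrs_model(void)	/* where the kernel wants IBRS on */
{
	if (!have_spec_ctrl)
		return;
	if (ibrs_enabled)
		wrmsr_stub(MSR_IA32_SPEC_CTRL, FEATURE_ENABLE_IBRS);
	/* else: the macro only executes an lfence to stop speculation */
}

static void disable_ibrs_model(void)	/* e.g. around mwait idle, as in mwait.h above */
{
	if (have_spec_ctrl && ibrs_enabled)
		wrmsr_stub(MSR_IA32_SPEC_CTRL, 0);
}

int main(void)
{
	enable_ibrs_model();
	disable_ibrs_model();
	return 0;
}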
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 52965d38d7b9..d9b705f0786a 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -6,6 +6,57 @@
#include <asm/processor.h>
#include <asm/special_insns.h>
+#ifndef __GENKSYMS__
+#include <asm/smp.h>
+#endif
+
+static inline void __invpcid(unsigned long pcid, unsigned long addr,
+ unsigned long type)
+{
+ struct { u64 d[2]; } desc = { { pcid, addr } };
+
+ /*
+ * The memory clobber is because the whole point is to invalidate
+ * stale TLB entries and, especially if we're flushing global
+ * mappings, we don't want the compiler to reorder any subsequent
+ * memory accesses before the TLB flush.
+ *
+ * The hex opcode is invpcid (%ecx), %eax in 32-bit mode and
+ * invpcid (%rcx), %rax in long mode.
+ */
+ asm volatile (".byte 0x66, 0x0f, 0x38, 0x82, 0x01"
+ : : "m" (desc), "a" (type), "c" (&desc) : "memory");
+}
+
+#define INVPCID_TYPE_INDIV_ADDR 0
+#define INVPCID_TYPE_SINGLE_CTXT 1
+#define INVPCID_TYPE_ALL_INCL_GLOBAL 2
+#define INVPCID_TYPE_ALL_NON_GLOBAL 3
+
+/* Flush all mappings for a given pcid and addr, not including globals. */
+static inline void invpcid_flush_one(unsigned long pcid,
+ unsigned long addr)
+{
+ __invpcid(pcid, addr, INVPCID_TYPE_INDIV_ADDR);
+}
+
+/* Flush all mappings for a given PCID, not including globals. */
+static inline void invpcid_flush_single_context(unsigned long pcid)
+{
+ __invpcid(pcid, 0, INVPCID_TYPE_SINGLE_CTXT);
+}
+
+/* Flush all mappings, including globals, for all PCIDs. */
+static inline void invpcid_flush_all(void)
+{
+ __invpcid(0, 0, INVPCID_TYPE_ALL_INCL_GLOBAL);
+}
+
+/* Flush all mappings for all PCIDs except globals. */
+static inline void invpcid_flush_all_nonglobals(void)
+{
+ __invpcid(0, 0, INVPCID_TYPE_ALL_NON_GLOBAL);
+}
#ifdef CONFIG_PARAVIRT
#include <asm/paravirt.h>
@@ -16,10 +67,8 @@
#endif
struct tlb_state {
-#ifdef CONFIG_SMP
struct mm_struct *active_mm;
int state;
-#endif
/*
* Access to this CR4 shadow and to H/W CR4 is protected by
@@ -84,6 +133,24 @@ static inline void cr4_set_bits_and_update_boot(unsigned long mask)
cr4_set_bits(mask);
}
+/*
+ * Declare a couple of kaiser interfaces here for convenience,
+ * to avoid the need for asm/kaiser.h in unexpected places.
+ */
+#ifdef CONFIG_KAISER
+extern int kaiser_enabled;
+extern void kaiser_setup_pcid(void);
+extern void kaiser_flush_tlb_on_return_to_user(void);
+#else
+#define kaiser_enabled 0
+static inline void kaiser_setup_pcid(void)
+{
+}
+static inline void kaiser_flush_tlb_on_return_to_user(void)
+{
+}
+#endif
+
static inline void __native_flush_tlb(void)
{
/*
@@ -92,6 +159,8 @@ static inline void __native_flush_tlb(void)
* back:
*/
preempt_disable();
+ if (kaiser_enabled)
+ kaiser_flush_tlb_on_return_to_user();
native_write_cr3(native_read_cr3());
preempt_enable();
}
@@ -101,39 +170,84 @@ static inline void __native_flush_tlb_global_irq_disabled(void)
unsigned long cr4;
cr4 = this_cpu_read(cpu_tlbstate.cr4);
- /* clear PGE */
- native_write_cr4(cr4 & ~X86_CR4_PGE);
- /* write old PGE again and flush TLBs */
- native_write_cr4(cr4);
+ if (cr4 & X86_CR4_PGE) {
+ /* clear PGE and flush TLB of all entries */
+ native_write_cr4(cr4 & ~X86_CR4_PGE);
+ /* restore PGE as it was before */
+ native_write_cr4(cr4);
+ } else {
+ /* do it with cr3, letting kaiser flush user PCID */
+ __native_flush_tlb();
+ }
}
static inline void __native_flush_tlb_global(void)
{
unsigned long flags;
+ if (this_cpu_has(X86_FEATURE_INVPCID)) {
+ /*
+ * Using INVPCID is considerably faster than a pair of writes
+ * to CR4 sandwiched inside an IRQ flag save/restore.
+ *
+ * Note, this works with CR4.PCIDE=0 or 1.
+ */
+ invpcid_flush_all();
+ return;
+ }
+
/*
* Read-modify-write to CR4 - protect it from preemption and
* from interrupts. (Use the raw variant because this code can
* be called from deep inside debugging code.)
*/
raw_local_irq_save(flags);
-
__native_flush_tlb_global_irq_disabled();
-
raw_local_irq_restore(flags);
}
static inline void __native_flush_tlb_single(unsigned long addr)
{
- asm volatile("invlpg (%0)" ::"r" (addr) : "memory");
+ /*
+ * SIMICS #GP's if you run INVPCID with type 2/3
+ * and X86_CR4_PCIDE clear. Shame!
+ *
+ * The ASIDs used below are hard-coded. But, we must not
+ * call invpcid(type=1/2) before CR4.PCIDE=1. Just call
+ * invlpg in the case we are called early.
+ */
+
+ if (!this_cpu_has(X86_FEATURE_INVPCID_SINGLE)) {
+ if (kaiser_enabled)
+ kaiser_flush_tlb_on_return_to_user();
+ asm volatile("invlpg (%0)" ::"r" (addr) : "memory");
+ return;
+ }
+ /* Flush the address out of both PCIDs. */
+ /*
+ * An optimization here might be to determine addresses
+ * that are only kernel-mapped and only flush the kernel
+ * ASID. But, userspace flushes are probably much more
+ * important performance-wise.
+ *
+ * Make sure to do only a single invpcid when KAISER is
+ * disabled and we have only a single ASID.
+ */
+ if (kaiser_enabled)
+ invpcid_flush_one(X86_CR3_PCID_ASID_USER, addr);
+ invpcid_flush_one(X86_CR3_PCID_ASID_KERN, addr);
}
static inline void __flush_tlb_all(void)
{
- if (cpu_has_pge)
- __flush_tlb_global();
- else
- __flush_tlb();
+ __flush_tlb_global();
+ /*
+ * Note: if we somehow had PCID but not PGE, then this wouldn't work --
+ * we'd end up flushing kernel translations for the current ASID but
+ * we might fail to flush kernel translations for other cached ASIDs.
+ *
+ * To avoid this issue, we force PCID off if PGE is off.
+ */
}
static inline void __flush_tlb_one(unsigned long addr)
@@ -147,7 +261,6 @@ static inline void __flush_tlb_one(unsigned long addr)
/*
* TLB flushing:
*
- * - flush_tlb() flushes the current mm struct TLBs
* - flush_tlb_all() flushes all processes TLBs
* - flush_tlb_mm(mm) flushes the specified mm context TLB's
* - flush_tlb_page(vma, vmaddr) flushes one page
@@ -159,84 +272,6 @@ static inline void __flush_tlb_one(unsigned long addr)
* and page-granular flushes are available only on i486 and up.
*/
-#ifndef CONFIG_SMP
-
-/* "_up" is for UniProcessor.
- *
- * This is a helper for other header functions. *Not* intended to be called
- * directly. All global TLB flushes need to either call this, or to bump the
- * vm statistics themselves.
- */
-static inline void __flush_tlb_up(void)
-{
- count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
- __flush_tlb();
-}
-
-static inline void flush_tlb_all(void)
-{
- count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
- __flush_tlb_all();
-}
-
-static inline void flush_tlb(void)
-{
- __flush_tlb_up();
-}
-
-static inline void local_flush_tlb(void)
-{
- __flush_tlb_up();
-}
-
-static inline void flush_tlb_mm(struct mm_struct *mm)
-{
- if (mm == current->active_mm)
- __flush_tlb_up();
-}
-
-static inline void flush_tlb_page(struct vm_area_struct *vma,
- unsigned long addr)
-{
- if (vma->vm_mm == current->active_mm)
- __flush_tlb_one(addr);
-}
-
-static inline void flush_tlb_range(struct vm_area_struct *vma,
- unsigned long start, unsigned long end)
-{
- if (vma->vm_mm == current->active_mm)
- __flush_tlb_up();
-}
-
-static inline void flush_tlb_mm_range(struct mm_struct *mm,
- unsigned long start, unsigned long end, unsigned long vmflag)
-{
- if (mm == current->active_mm)
- __flush_tlb_up();
-}
-
-static inline void native_flush_tlb_others(const struct cpumask *cpumask,
- struct mm_struct *mm,
- unsigned long start,
- unsigned long end)
-{
-}
-
-static inline void reset_lazy_tlbstate(void)
-{
-}
-
-static inline void flush_tlb_kernel_range(unsigned long start,
- unsigned long end)
-{
- flush_tlb_all();
-}
-
-#else /* SMP */
-
-#include <asm/smp.h>
-
#define local_flush_tlb() __flush_tlb()
#define flush_tlb_mm(mm) flush_tlb_mm_range(mm, 0UL, TLB_FLUSH_ALL, 0UL)
@@ -245,13 +280,14 @@ static inline void flush_tlb_kernel_range(unsigned long start,
flush_tlb_mm_range(vma->vm_mm, start, end, vma->vm_flags)
extern void flush_tlb_all(void);
-extern void flush_tlb_current_task(void);
-extern void flush_tlb_page(struct vm_area_struct *, unsigned long);
extern void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
unsigned long end, unsigned long vmflag);
extern void flush_tlb_kernel_range(unsigned long start, unsigned long end);
-#define flush_tlb() flush_tlb_current_task()
+static inline void flush_tlb_page(struct vm_area_struct *vma, unsigned long a)
+{
+ flush_tlb_mm_range(vma->vm_mm, a, a + PAGE_SIZE, VM_NONE);
+}
void native_flush_tlb_others(struct cpumask *cpumask,
struct mm_struct *mm,
@@ -267,8 +303,6 @@ static inline void reset_lazy_tlbstate(void)
this_cpu_write(cpu_tlbstate.active_mm, &init_mm);
}
-#endif /* SMP */
-
#ifndef CONFIG_PARAVIRT
#define flush_tlb_others(mask, mm, start, end) \
native_flush_tlb_others(mask, mm, start, end)
diff --git a/arch/x86/include/uapi/asm/processor-flags.h b/arch/x86/include/uapi/asm/processor-flags.h
index 567de50a4c2a..6768d1321016 100644
--- a/arch/x86/include/uapi/asm/processor-flags.h
+++ b/arch/x86/include/uapi/asm/processor-flags.h
@@ -77,7 +77,8 @@
#define X86_CR3_PWT _BITUL(X86_CR3_PWT_BIT)
#define X86_CR3_PCD_BIT 4 /* Page Cache Disable */
#define X86_CR3_PCD _BITUL(X86_CR3_PCD_BIT)
-#define X86_CR3_PCID_MASK _AC(0x00000fff,UL) /* PCID Mask */
+#define X86_CR3_PCID_NOFLUSH_BIT 63 /* Preserve old PCID */
+#define X86_CR3_PCID_NOFLUSH _BITULL(X86_CR3_PCID_NOFLUSH_BIT)
/*
* Intel CPU features in CR4
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 58031303e304..c8205ff2a0f3 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -16,6 +16,7 @@ obj-y := intel_cacheinfo.o scattered.o topology.o
obj-y += common.o
obj-y += rdrand.o
obj-y += match.o
+obj-y += spec_ctrl.o
obj-$(CONFIG_PROC_FS) += proc.o
obj-$(CONFIG_X86_FEATURE_NAMES) += capflags.o powerflags.o
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index c9cbdd48f094..5957b15cb54e 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -12,6 +12,7 @@
#include <asm/smp.h>
#include <asm/pci-direct.h>
#include <asm/delay.h>
+#include <asm/spec_ctrl.h>
#ifdef CONFIG_X86_64
# include <asm/mmconfig.h>
@@ -766,8 +767,16 @@ static void init_amd(struct cpuinfo_x86 *c)
set_cpu_cap(c, X86_FEATURE_K8);
if (cpu_has_xmm2) {
- /* MFENCE stops RDTSC speculation */
- set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC);
+ /*
+ * Use LFENCE for execution serialization. On families which
+ * don't have that MSR, LFENCE is already serialized.
+ */
+ if (c->x86 > 0xf)
+ msr_set_bit(MSR_F10H_DECFG,
+ MSR_F10H_DECFG_LFENCE_SERIALIZE_BIT);
+
+ /* LFENCE with MSR_F10H_DECFG[1]=1 stops RDTSC speculation */
+ set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
}
/*
@@ -789,6 +798,8 @@ static void init_amd(struct cpuinfo_x86 *c)
/* AMD CPUs don't reset SS attributes on SYSRET */
set_cpu_bug(c, X86_BUG_SYSRET_SS_ATTRS);
+
+ x86_spec_check();
}
#ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index cd914e4ef24f..8837f63e045a 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -95,7 +95,11 @@ static const struct cpu_dev default_cpu = {
static const struct cpu_dev *this_cpu = &default_cpu;
+#ifndef __GENKSYMS__
+DEFINE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(struct gdt_page, gdt_page) = { .gdt = {
+#else
DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
+#endif
#ifdef CONFIG_X86_64
/*
* We need valid kernel segments for data and code in long mode too
@@ -165,6 +169,40 @@ static int __init x86_mpx_setup(char *s)
}
__setup("nompx", x86_mpx_setup);
+#ifdef CONFIG_X86_64
+static int __init x86_pcid_setup(char *s)
+{
+ /* require an exact match without trailing characters */
+ if (strlen(s))
+ return 0;
+
+ /* do not emit a message if the feature is not present */
+ if (!boot_cpu_has(X86_FEATURE_PCID))
+ return 1;
+
+ setup_clear_cpu_cap(X86_FEATURE_PCID);
+ pr_info("nopcid: PCID feature disabled\n");
+ return 1;
+}
+__setup("nopcid", x86_pcid_setup);
+#endif
+
+static int __init x86_noinvpcid_setup(char *s)
+{
+ /* noinvpcid doesn't accept parameters */
+ if (s)
+ return -EINVAL;
+
+ /* do not emit a message if the feature is not present */
+ if (!boot_cpu_has(X86_FEATURE_INVPCID))
+ return 0;
+
+ setup_clear_cpu_cap(X86_FEATURE_INVPCID);
+ pr_info("noinvpcid: INVPCID feature disabled\n");
+ return 0;
+}
+early_param("noinvpcid", x86_noinvpcid_setup);
+
#ifdef CONFIG_X86_32
static int cachesize_override = -1;
static int disable_x86_serial_nr = 1;
@@ -292,6 +330,39 @@ static __always_inline void setup_smap(struct cpuinfo_x86 *c)
}
}
+static void setup_pcid(struct cpuinfo_x86 *c)
+{
+ if (cpu_has(c, X86_FEATURE_PCID)) {
+ if (cpu_has(c, X86_FEATURE_PGE) || kaiser_enabled) {
+ cr4_set_bits(X86_CR4_PCIDE);
+ /*
+ * INVPCID has two "groups" of types:
+ * 1/2: Invalidate an individual address
+ * 3/4: Invalidate all contexts
+ *
+ * 1/2 take a PCID, but 3/4 do not. So, 3/4
+ * ignore the PCID argument in the descriptor.
+ * But, we have to be careful not to call 1/2
+ * with an actual non-zero PCID in them before
+ * we do the above cr4_set_bits().
+ */
+ if (cpu_has(c, X86_FEATURE_INVPCID))
+ set_cpu_cap(c, X86_FEATURE_INVPCID_SINGLE);
+ } else {
+ /*
+ * flush_tlb_all(), as currently implemented, won't
+ * work if PCID is on but PGE is not. Since that
+ * combination doesn't exist on real hardware, there's
+ * no reason to try to fully support it, but it's
+ * polite to avoid corrupting data if we're on
+ * an improperly configured VM.
+ */
+ clear_cpu_cap(c, X86_FEATURE_PCID);
+ }
+ }
+ kaiser_setup_pcid();
+}
+
/*
* Protection Keys are not available in 32-bit mode.
*/
@@ -971,6 +1042,9 @@ static void identify_cpu(struct cpuinfo_x86 *c)
setup_smep(c);
setup_smap(c);
+ /* Set up PCID */
+ setup_pcid(c);
+
/*
* The vendor-specific functions might have changed features.
* Now we do "generic changes."
@@ -1227,7 +1301,7 @@ static const unsigned int exception_stack_sizes[N_EXCEPTION_STACKS] = {
[DEBUG_STACK - 1] = DEBUG_STKSZ
};
-static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks
+DEFINE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(char, exception_stacks
[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]);
/* May not be marked __init: used by software suspend */
@@ -1390,6 +1464,14 @@ void cpu_init(void)
* try to read it.
*/
cr4_init_shadow();
+ if (!kaiser_enabled) {
+ /*
+ * secondary_startup_64() deferred setting PGE in cr4:
+ * probe_page_size_mask() sets it on the boot cpu,
+ * but it needs to be set on each secondary cpu.
+ */
+ cr4_set_bits(X86_CR4_PGE);
+ }
/*
* Load microcode on this cpu if a valid microcode is available.
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 176ccd2964ff..f68e0bc85f2f 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -16,6 +16,7 @@
#include <asm/intel-family.h>
#include <asm/hwcap2.h>
#include <asm/elf.h>
+#include <asm/spec_ctrl.h>
#ifdef CONFIG_X86_64
#include <linux/topology.h>
@@ -560,6 +561,8 @@ static void init_intel(struct cpuinfo_x86 *c)
detect_vmx_virtcap(c);
probe_xeon_phi_r3mwait(c);
+
+ x86_spec_check();
}
#ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c
index b3e94ef461fd..b7dc3ca4518a 100644
--- a/arch/x86/kernel/cpu/microcode/core.c
+++ b/arch/x86/kernel/cpu/microcode/core.c
@@ -39,6 +39,7 @@
#include <asm/microcode.h>
#include <asm/processor.h>
#include <asm/cmdline.h>
+#include <asm/spec_ctrl.h>
#define MICROCODE_VERSION "2.01"
@@ -417,8 +418,11 @@ static ssize_t reload_store(struct device *dev,
if (!ret)
ret = tmp_ret;
}
- if (!ret)
+ if (!ret) {
perf_check_microcode();
+ x86_spec_check();
+ }
+
mutex_unlock(&microcode_mutex);
put_online_cpus();
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index edfa6aba10d3..1d4090816f5e 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -2,11 +2,15 @@
#include <linux/types.h>
#include <linux/slab.h>
+#include <asm/kaiser.h>
#include <asm/perf_event.h>
#include <asm/insn.h>
#include "perf_event.h"
+static
+DEFINE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(struct debug_store, cpu_debug_store);
+
/* The size of a BTS record in bytes: */
#define BTS_RECORD_SIZE 24
@@ -268,6 +272,39 @@ void fini_debug_store_on_cpu(int cpu)
static DEFINE_PER_CPU(void *, insn_buffer);
+static void *dsalloc(size_t size, gfp_t flags, int node)
+{
+#ifdef CONFIG_KAISER
+ unsigned int order = get_order(size);
+ struct page *page;
+ unsigned long addr;
+
+ page = __alloc_pages_node(node, flags | __GFP_ZERO, order);
+ if (!page)
+ return NULL;
+ addr = (unsigned long)page_address(page);
+ if (kaiser_add_mapping(addr, size, __PAGE_KERNEL) < 0) {
+ __free_pages(page, order);
+ addr = 0;
+ }
+ return (void *)addr;
+#else
+ return kmalloc_node(size, flags | __GFP_ZERO, node);
+#endif
+}
+
+static void dsfree(const void *buffer, size_t size)
+{
+#ifdef CONFIG_KAISER
+ if (!buffer)
+ return;
+ kaiser_remove_mapping((unsigned long)buffer, size);
+ free_pages((unsigned long)buffer, get_order(size));
+#else
+ kfree(buffer);
+#endif
+}
+
static int alloc_pebs_buffer(int cpu)
{
struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
@@ -278,7 +315,7 @@ static int alloc_pebs_buffer(int cpu)
if (!x86_pmu.pebs)
return 0;
- buffer = kzalloc_node(x86_pmu.pebs_buffer_size, GFP_KERNEL, node);
+ buffer = dsalloc(x86_pmu.pebs_buffer_size, GFP_KERNEL, node);
if (unlikely(!buffer))
return -ENOMEM;
@@ -289,7 +326,7 @@ static int alloc_pebs_buffer(int cpu)
if (x86_pmu.intel_cap.pebs_format < 2) {
ibuffer = kzalloc_node(PEBS_FIXUP_SIZE, GFP_KERNEL, node);
if (!ibuffer) {
- kfree(buffer);
+ dsfree(buffer, x86_pmu.pebs_buffer_size);
return -ENOMEM;
}
per_cpu(insn_buffer, cpu) = ibuffer;
@@ -315,7 +352,8 @@ static void release_pebs_buffer(int cpu)
kfree(per_cpu(insn_buffer, cpu));
per_cpu(insn_buffer, cpu) = NULL;
- kfree((void *)(unsigned long)ds->pebs_buffer_base);
+ dsfree((void *)(unsigned long)ds->pebs_buffer_base,
+ x86_pmu.pebs_buffer_size);
ds->pebs_buffer_base = 0;
}
@@ -329,7 +367,7 @@ static int alloc_bts_buffer(int cpu)
if (!x86_pmu.bts)
return 0;
- buffer = kzalloc_node(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_NOWARN, node);
+ buffer = dsalloc(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_NOWARN, node);
if (unlikely(!buffer)) {
WARN_ONCE(1, "%s: BTS buffer allocation failure\n", __func__);
return -ENOMEM;
@@ -355,19 +393,15 @@ static void release_bts_buffer(int cpu)
if (!ds || !x86_pmu.bts)
return;
- kfree((void *)(unsigned long)ds->bts_buffer_base);
+ dsfree((void *)(unsigned long)ds->bts_buffer_base, BTS_BUFFER_SIZE);
ds->bts_buffer_base = 0;
}
static int alloc_ds_buffer(int cpu)
{
- int node = cpu_to_node(cpu);
- struct debug_store *ds;
-
- ds = kzalloc_node(sizeof(*ds), GFP_KERNEL, node);
- if (unlikely(!ds))
- return -ENOMEM;
+ struct debug_store *ds = per_cpu_ptr(&cpu_debug_store, cpu);
+ memset(ds, 0, sizeof(*ds));
per_cpu(cpu_hw_events, cpu).ds = ds;
return 0;
@@ -381,7 +415,6 @@ static void release_ds_buffer(int cpu)
return;
per_cpu(cpu_hw_events, cpu).ds = NULL;
- kfree(ds);
}
void release_ds_buffers(void)
diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c
index dc847057c83c..ce84c60b227c 100644
--- a/arch/x86/kernel/cpu/scattered.c
+++ b/arch/x86/kernel/cpu/scattered.c
@@ -34,6 +34,7 @@ static const struct cpuid_bit cpuid_bits[] = {
{ X86_FEATURE_INTEL_PT, CPUID_EBX,25, 0x00000007, 0 },
{ X86_FEATURE_AVX512_4VNNIW, CPUID_EDX, 2, 0x00000007, 0 },
{ X86_FEATURE_AVX512_4FMAPS, CPUID_EDX, 3, 0x00000007, 0 },
+ { X86_FEATURE_SPEC_CTRL, CPUID_EDX,26, 0x00000007, 0 },
{ X86_FEATURE_HW_PSTATE, CPUID_EDX, 7, 0x80000007, 0 },
{ X86_FEATURE_CPB, CPUID_EDX, 9, 0x80000007, 0 },
{ X86_FEATURE_PROC_FEEDBACK, CPUID_EDX,11, 0x80000007, 0 },
diff --git a/arch/x86/kernel/cpu/spec_ctrl.c b/arch/x86/kernel/cpu/spec_ctrl.c
new file mode 100644
index 000000000000..0e8badea5f0e
--- /dev/null
+++ b/arch/x86/kernel/cpu/spec_ctrl.c
@@ -0,0 +1,91 @@
+/*
+ * Speculation control stuff
+ *
+ */
+
+#include <asm/msr.h>
+#include <asm/proto.h>
+#include <asm/processor.h>
+#include <asm/spec_ctrl.h>
+
+/*
+ * Keep it open for more flags in case needed.
+ */
+static unsigned int ibrs_state = 0;
+static unsigned int ibpb_state = 0;
+
+unsigned int notrace x86_ibrs_enabled(void)
+{
+ return ibrs_state;
+}
+EXPORT_SYMBOL_GPL(x86_ibrs_enabled);
+
+unsigned int notrace x86_ibpb_enabled(void)
+{
+ return ibpb_state;
+}
+EXPORT_SYMBOL_GPL(x86_ibpb_enabled);
+
+void x86_disable_ibrs(void)
+{
+ if (x86_ibrs_enabled())
+ native_wrmsrl(MSR_IA32_SPEC_CTRL, 0);
+}
+EXPORT_SYMBOL_GPL(x86_disable_ibrs);
+
+void x86_enable_ibrs(void)
+{
+ if (x86_ibrs_enabled())
+ native_wrmsrl(MSR_IA32_SPEC_CTRL, FEATURE_ENABLE_IBRS);
+}
+EXPORT_SYMBOL_GPL(x86_enable_ibrs);
+
+/*
+ * Do this indirection as otherwise we'd need to backport the
+ * EXPORT_SYMBOL_GPL() for asm stuff.
+ */
+void stuff_RSB(void)
+{
+ stuff_rsb();
+}
+EXPORT_SYMBOL_GPL(stuff_RSB);
+
+/*
+ * Called after upgrading the microcode; check CPUID directly.
+ */
+void x86_spec_check(void)
+{
+ if (cpuid_edx(7) & BIT(26)) {
+ ibrs_state = 1;
+ ibpb_state = 1;
+
+ setup_force_cpu_cap(X86_FEATURE_SPEC_CTRL);
+ }
+
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
+ if (cpuid_ebx(0x80000008) & BIT(12)) {
+ ibpb_state = 1;
+ } else {
+ switch (boot_cpu_data.x86) {
+ case 0x10:
+ case 0x12:
+ case 0x16:
+ pr_info_once("Disabling indirect branch predictor support\n");
+ msr_set_bit(MSR_F15H_IC_CFG, 14);
+ break;
+ }
+ ibpb_state = 0;
+ }
+ }
+}
+EXPORT_SYMBOL_GPL(x86_spec_check);
+
+static int __init nospec(char *str)
+{
+ setup_clear_cpu_cap(X86_FEATURE_SPEC_CTRL);
+ ibrs_state = 0;
+ ibpb_state = 0;
+
+ return 0;
+}
+early_param("nospec", nospec);
diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c
index 4d38416e2a7f..b02cb2ec6726 100644
--- a/arch/x86/kernel/espfix_64.c
+++ b/arch/x86/kernel/espfix_64.c
@@ -41,6 +41,7 @@
#include <asm/pgalloc.h>
#include <asm/setup.h>
#include <asm/espfix.h>
+#include <asm/kaiser.h>
/*
* Note: we only need 6*8 = 48 bytes for the espfix stack, but round
@@ -126,6 +127,15 @@ void __init init_espfix_bsp(void)
/* Install the espfix pud into the kernel page directory */
pgd_p = &init_level4_pgt[pgd_index(ESPFIX_BASE_ADDR)];
pgd_populate(&init_mm, pgd_p, (pud_t *)espfix_pud_page);
+ /*
+ * Just copy the top-level PGD that is mapping the espfix
+ * area to ensure it is mapped into the shadow user page
+ * tables.
+ */
+ if (kaiser_enabled) {
+ set_pgd(native_get_shadow_pgd(pgd_p),
+ __pgd(_KERNPG_TABLE | __pa((pud_t *)espfix_pud_page)));
+ }
/* Randomize the locations */
init_espfix_random();
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index ffdc0e860390..9af949ce8a69 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -183,8 +183,8 @@ ENTRY(secondary_startup_64)
movq $(init_level4_pgt - __START_KERNEL_map), %rax
1:
- /* Enable PAE mode and PGE */
- movl $(X86_CR4_PAE | X86_CR4_PGE), %ecx
+ /* Enable PAE and PSE, but defer PGE until kaiser_enabled is decided */
+ movl $(X86_CR4_PAE | X86_CR4_PSE), %ecx
movq %rcx, %cr4
/* Setup early boot stage 4 level pagetables. */
@@ -441,6 +441,27 @@ early_idt_ripmsg:
.balign PAGE_SIZE; \
GLOBAL(name)
+#ifdef CONFIG_KAISER
+/*
+ * Each PGD needs to be 8k long and 8k aligned. We do not
+ * ever go out to userspace with these, so we do not
+ * strictly *need* the second page, but this allows us to
+ * have a single set_pgd() implementation that does not
+ * need to worry about whether it has 4k or 8k to work
+ * with.
+ *
+ * This ensures PGDs are 8k long:
+ */
+#define KAISER_USER_PGD_FILL 512
+/* This ensures they are 8k-aligned: */
+#define NEXT_PGD_PAGE(name) \
+ .balign 2 * PAGE_SIZE; \
+GLOBAL(name)
+#else
+#define NEXT_PGD_PAGE(name) NEXT_PAGE(name)
+#define KAISER_USER_PGD_FILL 0
+#endif
+
/* Automate the creation of 1 to 1 mapping pmd entries */
#define PMDS(START, PERM, COUNT) \
i = 0 ; \
@@ -450,9 +471,10 @@ GLOBAL(name)
.endr
__INITDATA
-NEXT_PAGE(early_level4_pgt)
+NEXT_PGD_PAGE(early_level4_pgt)
.fill 511,8,0
.quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
+ .fill KAISER_USER_PGD_FILL,8,0
NEXT_PAGE(early_dynamic_pgts)
.fill 512*EARLY_DYNAMIC_PAGE_TABLES,8,0
@@ -460,16 +482,18 @@ NEXT_PAGE(early_dynamic_pgts)
.data
#ifndef CONFIG_XEN
-NEXT_PAGE(init_level4_pgt)
+NEXT_PGD_PAGE(init_level4_pgt)
.fill 512,8,0
+ .fill KAISER_USER_PGD_FILL,8,0
#else
-NEXT_PAGE(init_level4_pgt)
+NEXT_PGD_PAGE(init_level4_pgt)
.quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
.org init_level4_pgt + L4_PAGE_OFFSET*8, 0
.quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
.org init_level4_pgt + L4_START_KERNEL*8, 0
/* (2^48-(2*1024*1024*1024))/(2^39) = 511 */
.quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
+ .fill KAISER_USER_PGD_FILL,8,0
NEXT_PAGE(level3_ident_pgt)
.quad level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
@@ -480,6 +504,7 @@ NEXT_PAGE(level2_ident_pgt)
*/
PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD)
#endif
+ .fill KAISER_USER_PGD_FILL,8,0
NEXT_PAGE(level3_kernel_pgt)
.fill L3_START_KERNEL,8,0
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index 1423ab1b0312..f480b38a03c3 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -51,7 +51,7 @@ static struct irqaction irq2 = {
.flags = IRQF_NO_THREAD,
};
-DEFINE_PER_CPU(vector_irq_t, vector_irq) = {
+DEFINE_PER_CPU_USER_MAPPED(vector_irq_t, vector_irq) = {
[0 ... NR_VECTORS - 1] = VECTOR_UNUSED,
};
diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index 6707039b9032..536e6ab783cf 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -15,6 +15,7 @@
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/uaccess.h>
+#include <linux/kaiser.h>
#include <asm/ldt.h>
#include <asm/desc.h>
@@ -33,11 +34,21 @@ static void flush_ldt(void *current_mm)
set_ldt(pc->ldt->entries, pc->ldt->size);
}
+static void __free_ldt_struct(struct ldt_struct *ldt)
+{
+ if (ldt->size * LDT_ENTRY_SIZE > PAGE_SIZE)
+ vfree(ldt->entries);
+ else
+ free_page((unsigned long)ldt->entries);
+ kfree(ldt);
+}
+
/* The caller must call finalize_ldt_struct on the result. LDT starts zeroed. */
static struct ldt_struct *alloc_ldt_struct(int size)
{
struct ldt_struct *new_ldt;
int alloc_size;
+ int ret;
if (size > LDT_ENTRIES)
return NULL;
@@ -65,7 +76,13 @@ static struct ldt_struct *alloc_ldt_struct(int size)
return NULL;
}
+ ret = kaiser_add_mapping((unsigned long)new_ldt->entries, alloc_size,
+ __PAGE_KERNEL);
new_ldt->size = size;
+ if (ret) {
+ __free_ldt_struct(new_ldt);
+ return NULL;
+ }
return new_ldt;
}
@@ -91,12 +108,10 @@ static void free_ldt_struct(struct ldt_struct *ldt)
if (likely(!ldt))
return;
+ kaiser_remove_mapping((unsigned long)ldt->entries,
+ ldt->size * LDT_ENTRY_SIZE);
paravirt_free_ldt(ldt->entries, ldt->size);
- if (ldt->size * LDT_ENTRY_SIZE > PAGE_SIZE)
- vfree(ldt->entries);
- else
- free_page((unsigned long)ldt->entries);
- kfree(ldt);
+ __free_ldt_struct(ldt);
}
/*
diff --git a/arch/x86/kernel/paravirt_patch_64.c b/arch/x86/kernel/paravirt_patch_64.c
index 8aa05583bc42..0677bf8d3a42 100644
--- a/arch/x86/kernel/paravirt_patch_64.c
+++ b/arch/x86/kernel/paravirt_patch_64.c
@@ -9,7 +9,6 @@ DEF_NATIVE(pv_irq_ops, save_fl, "pushfq; popq %rax");
DEF_NATIVE(pv_mmu_ops, read_cr2, "movq %cr2, %rax");
DEF_NATIVE(pv_mmu_ops, read_cr3, "movq %cr3, %rax");
DEF_NATIVE(pv_mmu_ops, write_cr3, "movq %rdi, %cr3");
-DEF_NATIVE(pv_mmu_ops, flush_tlb_single, "invlpg (%rdi)");
DEF_NATIVE(pv_cpu_ops, clts, "clts");
DEF_NATIVE(pv_cpu_ops, wbinvd, "wbinvd");
@@ -62,7 +61,6 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
PATCH_SITE(pv_mmu_ops, read_cr3);
PATCH_SITE(pv_mmu_ops, write_cr3);
PATCH_SITE(pv_cpu_ops, clts);
- PATCH_SITE(pv_mmu_ops, flush_tlb_single);
PATCH_SITE(pv_cpu_ops, wbinvd);
#if defined(CONFIG_PARAVIRT_SPINLOCKS) && defined(CONFIG_QUEUED_SPINLOCKS)
case PARAVIRT_PATCH(pv_lock_ops.queued_spin_unlock):
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 863bc6c902a7..c941bdedd62c 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -39,7 +39,11 @@
* section. Since TSS's are completely CPU-local, we want them
* on exact cacheline boundaries, to eliminate cacheline ping-pong.
*/
+#if defined(CONFIG_GENKSYMS)
__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = {
+#else
+__visible DEFINE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(struct tss_struct, cpu_tss) = {
+#endif
.x86_tss = {
.sp0 = TOP_OF_INIT_STACK,
#ifdef CONFIG_X86_32
@@ -422,11 +426,16 @@ static void mwait_idle(void)
smp_mb(); /* quirk */
}
+ x86_disable_ibrs();
+
__monitor((void *)&current_thread_info()->flags, 0, 0);
if (!need_resched())
__sti_mwait(0, 0);
else
local_irq_enable();
+
+ x86_enable_ibrs();
+
trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
} else {
local_irq_enable();
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index a9b31eb815f2..e66e5c9e2ba1 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -93,6 +93,10 @@ void __noreturn machine_real_restart(unsigned int type)
load_cr3(initial_page_table);
#else
write_cr3(real_mode_header->trampoline_pgd);
+
+ /* Exiting long mode will fail if CR4.PCIDE is set. */
+ if (static_cpu_has(X86_FEATURE_PCID))
+ cr4_clear_bits(X86_CR4_PCIDE);
#endif
/* Jump to the identity-mapped low memory code */
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 48fbe854dd06..ec79c834b0bd 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -116,6 +116,7 @@
#include <asm/mmu_context.h>
#include <asm/suspend.h>
+#include <asm/kaiser.h>
/*
* max_low_pfn_mapped: highest direct mapped pfn under 4GB
@@ -1037,6 +1038,12 @@ void __init setup_arch(char **cmdline_p)
*/
init_hypervisor_platform();
+ /*
+ * This needs to happen right after XENPV is set on xen and
+ * kaiser_enabled is checked below in cleanup_highmap().
+ */
+ kaiser_check_boottime_disable();
+
x86_init.resources.probe_roms();
/* after parse_early_param, so could debug it */
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 0ff1d9aaa2ac..50c8ea4a73c3 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -75,6 +75,7 @@
#include <asm/i8259.h>
#include <asm/realmode.h>
#include <asm/misc.h>
+#include <asm/spec_ctrl.h>
/* Number of siblings per CPU package */
int smp_num_siblings = 1;
@@ -1544,9 +1545,13 @@ void native_play_dead(void)
play_dead_common();
tboot_shutdown(TB_SHUTDOWN_WFS);
+ x86_disable_ibrs();
+
mwait_play_dead(); /* Only returns on failure */
if (cpuidle_play_dead())
hlt_play_dead();
+
+ x86_enable_ibrs();
}
#else /* ... !CONFIG_HOTPLUG_CPU */
diff --git a/arch/x86/kernel/tracepoint.c b/arch/x86/kernel/tracepoint.c
index 1c113db9ed57..2bb5ee464df3 100644
--- a/arch/x86/kernel/tracepoint.c
+++ b/arch/x86/kernel/tracepoint.c
@@ -9,10 +9,12 @@
#include <linux/atomic.h>
atomic_t trace_idt_ctr = ATOMIC_INIT(0);
+__aligned(PAGE_SIZE)
struct desc_ptr trace_idt_descr = { NR_VECTORS * 16 - 1,
(unsigned long) trace_idt_table };
/* No need to be aligned, but done to keep all IDTs defined the same way. */
+__aligned(PAGE_SIZE)
gate_desc trace_idt_table[NR_VECTORS] __page_aligned_bss;
static int trace_irq_vector_refcount;
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 22f0e88f8abd..9f550afedf66 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -69,6 +69,7 @@ u64 kvm_supported_xcr0(void)
/* These are scattered features in cpufeatures.h. */
#define KVM_CPUID_BIT_AVX512_4VNNIW 2
#define KVM_CPUID_BIT_AVX512_4FMAPS 3
+#define KVM_CPUID_BIT_SPEC_CTRL 26
#define KF(x) bit(KVM_CPUID_BIT_##x)
int kvm_update_cpuid(struct kvm_vcpu *vcpu)
@@ -384,7 +385,12 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
/* cpuid 7.0.edx*/
const u32 kvm_cpuid_7_0_edx_x86_features =
- KF(AVX512_4VNNIW) | KF(AVX512_4FMAPS);
+ KF(AVX512_4VNNIW) | KF(AVX512_4FMAPS) |
+ KF(SPEC_CTRL);
+
+ /* cpuid 0x80000008.0.ebx */
+ const u32 kvm_cpuid_80000008_0_ebx_x86_features =
+ F(IBPB);
/* all calls to cpuid_count() should be made on the same cpu */
get_cpu();
@@ -619,7 +625,9 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
if (!g_phys_as)
g_phys_as = phys_as;
entry->eax = g_phys_as | (virt_as << 8);
- entry->ebx = entry->edx = 0;
+ entry->ebx &= kvm_cpuid_80000008_0_ebx_x86_features;
+ cpuid_mask(&entry->ebx, CPUID_8000_0008_EBX);
+ entry->edx = 0;
break;
}
case 0x80000019:
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 11ce7945f97d..83f1a58b3f3e 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -44,6 +44,7 @@
#include <asm/debugreg.h>
#include <asm/kvm_para.h>
#include <asm/irq_remapping.h>
+#include <asm/spec_ctrl.h>
#include <asm/virtext.h>
#include "trace.h"
@@ -174,6 +175,8 @@ struct vcpu_svm {
u64 next_rip;
+ u64 spec_ctrl;
+
u64 host_user_msrs[NR_HOST_SAVE_USER_MSRS];
struct {
u16 fs;
@@ -247,6 +250,8 @@ static const struct svm_direct_access_msrs {
{ .index = MSR_CSTAR, .always = true },
{ .index = MSR_SYSCALL_MASK, .always = true },
#endif
+ { .index = MSR_IA32_SPEC_CTRL, .always = true },
+ { .index = MSR_IA32_PRED_CMD, .always = true },
{ .index = MSR_IA32_LASTBRANCHFROMIP, .always = false },
{ .index = MSR_IA32_LASTBRANCHTOIP, .always = false },
{ .index = MSR_IA32_LASTINTFROMIP, .always = false },
@@ -505,6 +510,8 @@ struct svm_cpu_data {
struct kvm_ldttss_desc *tss_desc;
struct page *save_area;
+
+ struct vmcb *current_vmcb;
};
static DEFINE_PER_CPU(struct svm_cpu_data *, svm_data);
@@ -1657,11 +1664,18 @@ static void svm_free_vcpu(struct kvm_vcpu *vcpu)
__free_pages(virt_to_page(svm->nested.msrpm), MSRPM_ALLOC_ORDER);
kvm_vcpu_uninit(vcpu);
kmem_cache_free(kvm_vcpu_cache, svm);
+
+ /*
+ * The VMCB could be recycled, causing a false negative in svm_vcpu_load;
+ * block speculative execution.
+ */
+ x86_ibp_barrier();
}
static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
+ struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
int i;
if (unlikely(cpu != vcpu->cpu)) {
@@ -1690,6 +1704,11 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
if (static_cpu_has(X86_FEATURE_RDTSCP))
wrmsrl(MSR_TSC_AUX, svm->tsc_aux);
+ if (sd->current_vmcb != svm->vmcb) {
+ sd->current_vmcb = svm->vmcb;
+ x86_ibp_barrier();
+ }
+
avic_vcpu_load(vcpu, cpu);
}
@@ -3533,6 +3552,9 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case MSR_VM_CR:
msr_info->data = svm->nested.vm_cr_msr;
break;
+ case MSR_IA32_SPEC_CTRL:
+ msr_info->data = svm->spec_ctrl;
+ break;
case MSR_IA32_UCODE_REV:
msr_info->data = 0x01000065;
break;
@@ -3671,6 +3693,9 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
case MSR_VM_IGNNE:
vcpu_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data);
break;
+ case MSR_IA32_SPEC_CTRL:
+ svm->spec_ctrl = data;
+ break;
case MSR_IA32_APICBASE:
if (kvm_vcpu_apicv_active(vcpu))
avic_update_vapic_bar(to_svm(vcpu), data);
@@ -4834,6 +4859,9 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
local_irq_enable();
+ if (x86_ibrs_enabled() && (svm->spec_ctrl != FEATURE_ENABLE_IBRS))
+ wrmsrl(MSR_IA32_SPEC_CTRL, svm->spec_ctrl);
+
asm volatile (
"push %%" _ASM_BP "; \n\t"
"mov %c[rbx](%[svm]), %%" _ASM_BX " \n\t"
@@ -4878,6 +4906,22 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
"mov %%r14, %c[r14](%[svm]) \n\t"
"mov %%r15, %c[r15](%[svm]) \n\t"
#endif
+ /* Clear host registers (marked as clobbered so it's safe) */
+ "xor %%" _ASM_BX ", %%" _ASM_BX " \n\t"
+ "xor %%" _ASM_CX ", %%" _ASM_CX " \n\t"
+ "xor %%" _ASM_DX ", %%" _ASM_DX " \n\t"
+ "xor %%" _ASM_SI ", %%" _ASM_SI " \n\t"
+ "xor %%" _ASM_DI ", %%" _ASM_DI " \n\t"
+#ifdef CONFIG_X86_64
+ "xor %%r8, %%r8 \n\t"
+ "xor %%r9, %%r9 \n\t"
+ "xor %%r10, %%r10 \n\t"
+ "xor %%r11, %%r11 \n\t"
+ "xor %%r12, %%r12 \n\t"
+ "xor %%r13, %%r13 \n\t"
+ "xor %%r14, %%r14 \n\t"
+ "xor %%r15, %%r15 \n\t"
+#endif
"pop %%" _ASM_BP
:
: [svm]"a"(svm),
@@ -4907,6 +4951,14 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
#endif
);
+ if (x86_ibrs_enabled()) {
+ rdmsrl(MSR_IA32_SPEC_CTRL, svm->spec_ctrl);
+ if (svm->spec_ctrl != FEATURE_ENABLE_IBRS)
+ wrmsrl(MSR_IA32_SPEC_CTRL, FEATURE_ENABLE_IBRS);
+ }
+
+ stuff_RSB();
+
#ifdef CONFIG_X86_64
wrmsrl(MSR_GS_BASE, svm->host.gs_base);
#else
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 87e6a45e5e10..65715783b19d 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -48,6 +48,8 @@
#include <asm/kexec.h>
#include <asm/apic.h>
#include <asm/irq_remapping.h>
+#include <asm/spec_ctrl.h>
+#include <asm/proto.h>
#include "trace.h"
#include "pmu.h"
@@ -611,6 +613,8 @@ struct vcpu_vmx {
*/
u64 msr_ia32_feature_control;
u64 msr_ia32_feature_control_valid_bits;
+
+ u64 spec_ctrl;
};
enum segment_cache_field {
@@ -2061,6 +2065,8 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
if (per_cpu(current_vmcs, cpu) != vmx->loaded_vmcs->vmcs) {
per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs;
vmcs_load(vmx->loaded_vmcs->vmcs);
+
+ x86_ibp_barrier();
}
if (vmx->loaded_vmcs->cpu != cpu) {
@@ -2815,6 +2821,7 @@ static inline bool vmx_feature_control_msr_valid(struct kvm_vcpu *vcpu,
*/
static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
struct shared_msr_entry *msr;
switch (msr_info->index) {
@@ -2826,8 +2833,8 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
msr_info->data = vmcs_readl(GUEST_GS_BASE);
break;
case MSR_KERNEL_GS_BASE:
- vmx_load_host_state(to_vmx(vcpu));
- msr_info->data = to_vmx(vcpu)->msr_guest_kernel_gs_base;
+ vmx_load_host_state(vmx);
+ msr_info->data = vmx->msr_guest_kernel_gs_base;
break;
#endif
case MSR_EFER:
@@ -2835,6 +2842,9 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case MSR_IA32_TSC:
msr_info->data = guest_read_tsc(vcpu);
break;
+ case MSR_IA32_SPEC_CTRL:
+ msr_info->data = vmx->spec_ctrl;
+ break;
case MSR_IA32_SYSENTER_CS:
msr_info->data = vmcs_read32(GUEST_SYSENTER_CS);
break;
@@ -2852,13 +2862,13 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
break;
case MSR_IA32_MCG_EXT_CTL:
if (!msr_info->host_initiated &&
- !(to_vmx(vcpu)->msr_ia32_feature_control &
+ !(vmx->msr_ia32_feature_control &
FEATURE_CONTROL_LMCE))
return 1;
msr_info->data = vcpu->arch.mcg_ext_ctl;
break;
case MSR_IA32_FEATURE_CONTROL:
- msr_info->data = to_vmx(vcpu)->msr_ia32_feature_control;
+ msr_info->data = vmx->msr_ia32_feature_control;
break;
case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
if (!nested_vmx_allowed(vcpu))
@@ -2874,7 +2884,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
return 1;
/* Otherwise falls through */
default:
- msr = find_msr_entry(to_vmx(vcpu), msr_info->index);
+ msr = find_msr_entry(vmx, msr_info->index);
if (msr) {
msr_info->data = msr->data;
break;
@@ -2939,6 +2949,9 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case MSR_IA32_TSC:
kvm_write_tsc(vcpu, msr_info);
break;
+ case MSR_IA32_SPEC_CTRL:
+ vmx->spec_ctrl = msr_info->data;
+ break;
case MSR_IA32_CR_PAT:
if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
if (!kvm_mtrr_valid(vcpu, MSR_IA32_CR_PAT, data))
@@ -5725,6 +5738,7 @@ static int handle_rdmsr(struct kvm_vcpu *vcpu)
msr_info.index = ecx;
msr_info.host_initiated = false;
+
if (vmx_get_msr(vcpu, &msr_info)) {
trace_kvm_msr_read_ex(ecx);
kvm_inject_gp(vcpu, 0);
@@ -6309,6 +6323,8 @@ static __init int hardware_setup(void)
vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_CS, false);
vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_ESP, false);
vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false);
+ vmx_disable_intercept_for_msr(MSR_IA32_SPEC_CTRL, false);
+ vmx_disable_intercept_for_msr(MSR_IA32_PRED_CMD, false);
memcpy(vmx_msr_bitmap_legacy_x2apic,
vmx_msr_bitmap_legacy, PAGE_SIZE);
@@ -8640,6 +8656,11 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
__write_pkru(vmx->guest_pkru);
atomic_switch_perf_msrs(vmx);
+
+ if (x86_ibrs_enabled())
+ add_atomic_switch_msr(vmx, MSR_IA32_SPEC_CTRL,
+ vmx->spec_ctrl, FEATURE_ENABLE_IBRS);
+
debugctlmsr = get_debugctlmsr();
vmx->__launched = vmx->loaded_vmcs->launched;
@@ -8748,6 +8769,8 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
#endif
);
+ stuff_RSB();
+
/* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */
if (debugctlmsr)
update_debugctlmsr(debugctlmsr);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index a1384e020ce2..4eef3d94496b 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -770,7 +770,8 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
return 1;
/* PCID can not be enabled when cr3[11:0]!=000H or EFER.LMA=0 */
- if ((kvm_read_cr3(vcpu) & X86_CR3_PCID_MASK) || !is_long_mode(vcpu))
+ if ((kvm_read_cr3(vcpu) & X86_CR3_PCID_ASID_MASK) ||
+ !is_long_mode(vcpu))
return 1;
}
@@ -970,7 +971,7 @@ static u32 msrs_to_save[] = {
MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
#endif
MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA,
- MSR_IA32_FEATURE_CONTROL, MSR_IA32_BNDCFGS, MSR_TSC_AUX,
+ MSR_IA32_FEATURE_CONTROL, MSR_IA32_BNDCFGS, MSR_TSC_AUX, MSR_IA32_SPEC_CTRL,
};
static unsigned num_msrs_to_save;
diff --git a/arch/x86/lib/cmdline.c b/arch/x86/lib/cmdline.c
index 422db000d727..a744506856b1 100644
--- a/arch/x86/lib/cmdline.c
+++ b/arch/x86/lib/cmdline.c
@@ -82,3 +82,108 @@ int cmdline_find_option_bool(const char *cmdline, const char *option)
return 0; /* Buffer overrun */
}
+
+/*
+ * Find a non-boolean option (i.e. option=argument). In accordance with
+ * standard Linux practice, if this option is repeated, this returns the
+ * last instance on the command line.
+ *
+ * @cmdline: the cmdline string
+ * @max_cmdline_size: the maximum size of cmdline
+ * @option: option string to look for
+ * @buffer: memory buffer to return the option argument
+ * @bufsize: size of the supplied memory buffer
+ *
+ * Returns the length of the argument (regardless of whether it was
+ * truncated to fit in the buffer), or -1 if the option is not found.
+ */
+static int
+__cmdline_find_option(const char *cmdline, int max_cmdline_size,
+ const char *option, char *buffer, int bufsize)
+{
+ char c;
+ int pos = 0, len = -1;
+ const char *opptr = NULL;
+ char *bufptr = buffer;
+ enum {
+ st_wordstart = 0, /* Start of word/after whitespace */
+ st_wordcmp, /* Comparing this word */
+ st_wordskip, /* Miscompare, skip */
+ st_bufcpy, /* Copying this to buffer */
+ } state = st_wordstart;
+
+ if (!cmdline)
+ return -1; /* No command line */
+
+ /*
+ * This 'pos' check ensures we do not overrun
+ * a non-NULL-terminated 'cmdline'
+ */
+ while (pos++ < max_cmdline_size) {
+ c = *(char *)cmdline++;
+ if (!c)
+ break;
+
+ switch (state) {
+ case st_wordstart:
+ if (myisspace(c))
+ break;
+
+ state = st_wordcmp;
+ opptr = option;
+ /* fall through */
+
+ case st_wordcmp:
+ if ((c == '=') && !*opptr) {
+ /*
+ * We matched all the way to the end of the
+ * option we were looking for, prepare to
+ * copy the argument.
+ */
+ len = 0;
+ bufptr = buffer;
+ state = st_bufcpy;
+ break;
+ } else if (c == *opptr++) {
+ /*
+ * We are currently matching, so continue
+ * to the next character on the cmdline.
+ */
+ break;
+ }
+ state = st_wordskip;
+ /* fall through */
+
+ case st_wordskip:
+ if (myisspace(c))
+ state = st_wordstart;
+ break;
+
+ case st_bufcpy:
+ if (myisspace(c)) {
+ state = st_wordstart;
+ } else {
+ /*
+ * Increment len, but don't overrun the
+ * supplied buffer and leave room for the
+ * NULL terminator.
+ */
+ if (++len < bufsize)
+ *bufptr++ = c;
+ }
+ break;
+ }
+ }
+
+ if (bufsize)
+ *bufptr = '\0';
+
+ return len;
+}
+
+int cmdline_find_option(const char *cmdline, const char *option, char *buffer,
+ int bufsize)
+{
+ return __cmdline_find_option(cmdline, COMMAND_LINE_SIZE, option,
+ buffer, bufsize);
+}
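
A minimal usage sketch of the new helper (the caller below is hypothetical, not part of this patch); kaiser_check_boottime_disable() further down in this diff uses the same call for the real "pti=" handling:

/* Hypothetical caller: read "pti=on|off|auto" from the boot command line. */
static bool __init want_pti(void)
{
	char arg[5];	/* "auto" plus the NUL terminator */
	int len;

	len = cmdline_find_option(boot_command_line, "pti", arg, sizeof(arg));
	if (len < 0)
		return true;	/* option absent: keep the default */
	return strncmp(arg, "off", 3) != 0;
}
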
diff --git a/arch/x86/lib/delay.c b/arch/x86/lib/delay.c
index 45772560aceb..ee269309efcc 100644
--- a/arch/x86/lib/delay.c
+++ b/arch/x86/lib/delay.c
@@ -26,6 +26,8 @@
# include <asm/smp.h>
#endif
+#define IBRS_DISABLE_THRESHOLD 1000
+
/* simple loop based delay: */
static void delay_loop(unsigned long loops)
{
@@ -105,6 +107,9 @@ static void delay_mwaitx(unsigned long __loops)
for (;;) {
delay = min_t(u64, MWAITX_MAX_LOOPS, loops);
+ if (delay > IBRS_DISABLE_THRESHOLD)
+ x86_disable_ibrs();
+
/*
* Use cpu_tss as a cacheline-aligned, seldomly
* accessed per-cpu variable as the monitor target.
@@ -118,6 +123,9 @@ static void delay_mwaitx(unsigned long __loops)
*/
__mwaitx(MWAITX_DISABLE_CSTATES, delay, MWAITX_ECX_TIMER_ENABLE);
+ if (delay > IBRS_DISABLE_THRESHOLD)
+ x86_enable_ibrs();
+
end = rdtsc_ordered();
if (loops <= end - start)
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index ab1b644d2eec..e21cc266c61d 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -1,5 +1,5 @@
obj-y := init.o init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \
- pat.o pgtable.o physaddr.o gup.o setup_nx.o
+ pat.o pgtable.o physaddr.o gup.o setup_nx.o tlb.o
# Make sure __phys_addr has no stackprotector
nostackp := $(call cc-option, -fno-stack-protector)
@@ -9,7 +9,6 @@ CFLAGS_setup_nx.o := $(nostackp)
CFLAGS_fault.o := -I$(src)/../include/asm/trace
obj-$(CONFIG_X86_PAT) += pat_rbtree.o
-obj-$(CONFIG_SMP) += tlb.o
obj-$(CONFIG_X86_32) += pgtable_32.o iomap_32.o
@@ -34,4 +33,4 @@ obj-$(CONFIG_NUMA_EMU) += numa_emulation.o
obj-$(CONFIG_X86_INTEL_MPX) += mpx.o
obj-$(CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS) += pkeys.o
-
+obj-$(CONFIG_KAISER) += kaiser.o
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 3aebbd6c6f5f..2bd45ae91eb3 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -165,7 +165,7 @@ static void __init probe_page_size_mask(void)
cr4_set_bits_and_update_boot(X86_CR4_PSE);
/* Enable PGE if available */
- if (cpu_has_pge) {
+ if (cpu_has_pge && !kaiser_enabled) {
cr4_set_bits_and_update_boot(X86_CR4_PGE);
__supported_pte_mask |= _PAGE_GLOBAL;
} else
@@ -753,10 +753,8 @@ void __init zone_sizes_init(void)
}
DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = {
-#ifdef CONFIG_SMP
.active_mm = &init_mm,
.state = 0,
-#endif
.cr4 = ~0UL, /* fail hard if we screw up cr4 shadow initialization */
};
EXPORT_SYMBOL_GPL(cpu_tlbstate);
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index c34736707b56..9c91875a3474 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -397,6 +397,16 @@ void __init cleanup_highmap(void)
continue;
if (vaddr < (unsigned long) _text || vaddr > end)
set_pmd(pmd, __pmd(0));
+ else if (kaiser_enabled) {
+ /*
+ * level2_kernel_pgt is initialized with _PAGE_GLOBAL:
+ * clear that now. This is not important, so long as
+ * CR4.PGE remains clear, but it removes an anomaly.
+ * Physical mapping setup below avoids _PAGE_GLOBAL
+ * by use of massage_pgprot() inside pfn_pte() etc.
+ */
+ set_pmd(pmd, pmd_clear_flags(*pmd, _PAGE_GLOBAL));
+ }
}
}
diff --git a/arch/x86/mm/kaiser.c b/arch/x86/mm/kaiser.c
new file mode 100644
index 000000000000..acfbf8c69eef
--- /dev/null
+++ b/arch/x86/mm/kaiser.c
@@ -0,0 +1,444 @@
+#include <linux/bug.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/bug.h>
+#include <linux/init.h>
+#include <linux/interrupt.h>
+#include <linux/spinlock.h>
+#include <linux/mm.h>
+#include <linux/uaccess.h>
+#include <linux/ftrace.h>
+
+#include <asm/kaiser.h>
+#include <asm/tlbflush.h> /* to verify its kaiser declarations */
+#include <asm/pgtable.h>
+#include <asm/pgalloc.h>
+#include <asm/desc.h>
+#include <asm/cmdline.h>
+
+int kaiser_enabled __read_mostly = 1;
+EXPORT_SYMBOL(kaiser_enabled); /* for inlined TLB flush functions */
+
+__visible
+DEFINE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup);
+
+/*
+ * These can have bit 63 set, so we can not just use a plain "or"
+ * instruction to get their value or'd into CR3. It would take
+ * another register. So, we use a memory reference to these instead.
+ *
+ * This is also handy because systems that do not support PCIDs
+ * just end up or'ing a 0 into their CR3, which does no harm.
+ */
+DEFINE_PER_CPU(unsigned long, x86_cr3_pcid_user);
+
+/*
+ * At runtime, the only things we map are some things for CPU
+ * hotplug, and stacks for new processes. No two CPUs will ever
+ * be populating the same addresses, so we only need to ensure
+ * that we protect between two CPUs trying to allocate and
+ * populate the same page table page.
+ *
+ * Only take this lock when doing a set_p[4um]d(), but it is not
+ * needed for doing a set_pte(). We assume that only the *owner*
+ * of a given allocation will be doing this for _their_
+ * allocation.
+ *
+ * This ensures that once a system has been running for a while
+ * and there have been stacks all over and these page tables
+ * are fully populated, there will be no further acquisitions of
+ * this lock.
+ */
+static DEFINE_SPINLOCK(shadow_table_allocation_lock);
+
+/*
+ * Returns -1 on error.
+ */
+static inline unsigned long get_pa_from_mapping(unsigned long vaddr)
+{
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+
+ pgd = pgd_offset_k(vaddr);
+ /*
+ * We made all the kernel PGDs present in kaiser_init().
+ * We expect them to stay that way.
+ */
+ BUG_ON(pgd_none(*pgd));
+ /*
+ * PGDs are either 512GB or 128TB on all x86_64
+ * configurations. We don't handle these.
+ */
+ BUG_ON(pgd_large(*pgd));
+
+ pud = pud_offset(pgd, vaddr);
+ if (pud_none(*pud)) {
+ WARN_ON_ONCE(1);
+ return -1;
+ }
+
+ if (pud_large(*pud))
+ return (pud_pfn(*pud) << PAGE_SHIFT) | (vaddr & ~PUD_PAGE_MASK);
+
+ pmd = pmd_offset(pud, vaddr);
+ if (pmd_none(*pmd)) {
+ WARN_ON_ONCE(1);
+ return -1;
+ }
+
+ if (pmd_large(*pmd))
+ return (pmd_pfn(*pmd) << PAGE_SHIFT) | (vaddr & ~PMD_PAGE_MASK);
+
+ pte = pte_offset_kernel(pmd, vaddr);
+ if (pte_none(*pte)) {
+ WARN_ON_ONCE(1);
+ return -1;
+ }
+
+ return (pte_pfn(*pte) << PAGE_SHIFT) | (vaddr & ~PAGE_MASK);
+}
+
+/*
+ * This is a relatively normal page table walk, except that it
+ * also tries to allocate page tables pages along the way.
+ *
+ * Returns a pointer to a PTE on success, or NULL on failure.
+ */
+static pte_t *kaiser_pagetable_walk(unsigned long address)
+{
+ pmd_t *pmd;
+ pud_t *pud;
+ pgd_t *pgd = native_get_shadow_pgd(pgd_offset_k(address));
+ gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
+
+ if (pgd_none(*pgd)) {
+ WARN_ONCE(1, "All shadow pgds should have been populated");
+ return NULL;
+ }
+ BUILD_BUG_ON(pgd_large(*pgd) != 0);
+
+ pud = pud_offset(pgd, address);
+ /* The shadow page tables do not use large mappings: */
+ if (pud_large(*pud)) {
+ WARN_ON(1);
+ return NULL;
+ }
+ if (pud_none(*pud)) {
+ unsigned long new_pmd_page = __get_free_page(gfp);
+ if (!new_pmd_page)
+ return NULL;
+ spin_lock(&shadow_table_allocation_lock);
+ if (pud_none(*pud))
+ set_pud(pud, __pud(_KERNPG_TABLE | __pa(new_pmd_page)));
+ else
+ free_page(new_pmd_page);
+ spin_unlock(&shadow_table_allocation_lock);
+ }
+
+ pmd = pmd_offset(pud, address);
+ /* The shadow page tables do not use large mappings: */
+ if (pmd_large(*pmd)) {
+ WARN_ON(1);
+ return NULL;
+ }
+ if (pmd_none(*pmd)) {
+ unsigned long new_pte_page = __get_free_page(gfp);
+ if (!new_pte_page)
+ return NULL;
+ spin_lock(&shadow_table_allocation_lock);
+ if (pmd_none(*pmd))
+ set_pmd(pmd, __pmd(_KERNPG_TABLE | __pa(new_pte_page)));
+ else
+ free_page(new_pte_page);
+ spin_unlock(&shadow_table_allocation_lock);
+ }
+
+ return pte_offset_kernel(pmd, address);
+}
+
+static int kaiser_add_user_map(const void *__start_addr, unsigned long size,
+ unsigned long flags)
+{
+ int ret = 0;
+ pte_t *pte;
+ unsigned long start_addr = (unsigned long )__start_addr;
+ unsigned long address = start_addr & PAGE_MASK;
+ unsigned long end_addr = PAGE_ALIGN(start_addr + size);
+ unsigned long target_address;
+
+ /*
+ * It is convenient for callers to pass in __PAGE_KERNEL etc,
+ * and there is no actual harm from setting _PAGE_GLOBAL, so
+ * long as CR4.PGE is not set. But it is nonetheless troubling
+ * to see Kaiser itself setting _PAGE_GLOBAL (now that "nokaiser"
+ * requires that not to be #defined to 0): so mask it off here.
+ */
+ flags &= ~_PAGE_GLOBAL;
+
+ for (; address < end_addr; address += PAGE_SIZE) {
+ target_address = get_pa_from_mapping(address);
+ if (target_address == -1) {
+ ret = -EIO;
+ break;
+ }
+ pte = kaiser_pagetable_walk(address);
+ if (!pte) {
+ ret = -ENOMEM;
+ break;
+ }
+ if (pte_none(*pte)) {
+ set_pte(pte, __pte(flags | target_address));
+ } else {
+ pte_t tmp;
+ set_pte(&tmp, __pte(flags | target_address));
+ WARN_ON_ONCE(!pte_same(*pte, tmp));
+ }
+ }
+ return ret;
+}
+
+static int kaiser_add_user_map_ptrs(const void *start, const void *end, unsigned long flags)
+{
+ unsigned long size = end - start;
+
+ return kaiser_add_user_map(start, size, flags);
+}
+
+/*
+ * Ensure that the top level of the (shadow) page tables is
+ * entirely populated. This ensures that all processes that get
+ * forked have the same entries. This way, we do not have to
+ * ever go set up new entries in older processes.
+ *
+ * Note: we never free these, so there are no updates to them
+ * after this.
+ */
+static void __init kaiser_init_all_pgds(void)
+{
+ pgd_t *pgd;
+ int i = 0;
+
+ pgd = native_get_shadow_pgd(pgd_offset_k((unsigned long )0));
+ for (i = PTRS_PER_PGD / 2; i < PTRS_PER_PGD; i++) {
+ pgd_t new_pgd;
+ pud_t *pud = pud_alloc_one(&init_mm, PAGE_OFFSET + i * PGDIR_SIZE);
+ if (!pud) {
+ WARN_ON(1);
+ break;
+ }
+ new_pgd = __pgd(_KERNPG_TABLE |__pa(pud));
+ /*
+ * Make sure not to stomp on some other pgd entry.
+ */
+ if (!pgd_none(pgd[i])) {
+ WARN_ON(1);
+ continue;
+ }
+ set_pgd(pgd + i, new_pgd);
+ }
+}
+
+#define kaiser_add_user_map_early(start, size, flags) do { \
+ int __ret = kaiser_add_user_map(start, size, flags); \
+ WARN_ON(__ret); \
+} while (0)
+
+#define kaiser_add_user_map_ptrs_early(start, end, flags) do { \
+ int __ret = kaiser_add_user_map_ptrs(start, end, flags); \
+ WARN_ON(__ret); \
+} while (0)
+
+void __init kaiser_check_boottime_disable(void)
+{
+ bool enable = true;
+ char arg[5];
+ int ret;
+
+ if (boot_cpu_has(X86_FEATURE_XENPV))
+ goto silent_disable;
+
+ ret = cmdline_find_option(boot_command_line, "pti", arg, sizeof(arg));
+ if (ret > 0) {
+ if (!strncmp(arg, "on", 2))
+ goto enable;
+
+ if (!strncmp(arg, "off", 3))
+ goto disable;
+
+ if (!strncmp(arg, "auto", 4))
+ goto skip;
+ }
+
+ if (cmdline_find_option_bool(boot_command_line, "nopti"))
+ goto disable;
+
+skip:
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
+ goto disable;
+
+enable:
+ if (enable)
+ setup_force_cpu_cap(X86_FEATURE_KAISER);
+
+ return;
+
+disable:
+ pr_info("Kernel/User page tables isolation: disabled\n");
+
+silent_disable:
+ kaiser_enabled = 0;
+ setup_clear_cpu_cap(X86_FEATURE_KAISER);
+}
+
+/*
+ * If anything in here fails, we will likely die on one of the
+ * first kernel->user transitions and init will die. But, we
+ * will have most of the kernel up by then and should be able to
+ * get a clean warning out of it. If we BUG_ON() here, we run
+ * the risk of crashing before we have good console output.
+ */
+void __init kaiser_init(void)
+{
+ int cpu;
+
+ if (!kaiser_enabled)
+ return;
+
+ kaiser_init_all_pgds();
+
+ for_each_possible_cpu(cpu) {
+ void *percpu_vaddr = __per_cpu_user_mapped_start +
+ per_cpu_offset(cpu);
+ unsigned long percpu_sz = __per_cpu_user_mapped_end -
+ __per_cpu_user_mapped_start;
+ kaiser_add_user_map_early(percpu_vaddr, percpu_sz,
+ __PAGE_KERNEL);
+ }
+
+ /*
+ * Map the entry/exit text section, which is needed at
+ * switches from user to and from kernel.
+ */
+ kaiser_add_user_map_ptrs_early(__entry_text_start, __entry_text_end,
+ __PAGE_KERNEL_RX);
+
+#if defined(CONFIG_FUNCTION_GRAPH_TRACER) || defined(CONFIG_KASAN)
+ kaiser_add_user_map_ptrs_early(__irqentry_text_start,
+ __irqentry_text_end,
+ __PAGE_KERNEL_RX);
+#endif
+ kaiser_add_user_map_early((void *)idt_descr.address,
+ sizeof(gate_desc) * NR_VECTORS,
+ __PAGE_KERNEL_RO);
+#ifdef CONFIG_TRACING
+ kaiser_add_user_map_early(&trace_idt_descr,
+ sizeof(trace_idt_descr),
+ __PAGE_KERNEL);
+ kaiser_add_user_map_early(&trace_idt_table,
+ sizeof(gate_desc) * NR_VECTORS,
+ __PAGE_KERNEL);
+#endif
+ kaiser_add_user_map_early(&debug_idt_descr, sizeof(debug_idt_descr),
+ __PAGE_KERNEL);
+ kaiser_add_user_map_early(&debug_idt_table,
+ sizeof(gate_desc) * NR_VECTORS,
+ __PAGE_KERNEL);
+}
+
+/* Add a mapping to the shadow mapping, and synchronize the mappings */
+int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags)
+{
+ if (!kaiser_enabled)
+ return 0;
+ return kaiser_add_user_map((const void *)addr, size, flags);
+}
+
+void kaiser_remove_mapping(unsigned long start, unsigned long size)
+{
+ extern void unmap_pud_range_nofree(pgd_t *pgd,
+ unsigned long start, unsigned long end);
+ unsigned long end = start + size;
+ unsigned long addr, next;
+ pgd_t *pgd;
+
+ if (!kaiser_enabled)
+ return;
+ pgd = native_get_shadow_pgd(pgd_offset_k(start));
+ for (addr = start; addr < end; pgd++, addr = next) {
+ next = pgd_addr_end(addr, end);
+ unmap_pud_range_nofree(pgd, addr, next);
+ }
+}
+
+/*
+ * Page table pages are page-aligned. The lower half of the top
+ * level is used for userspace and the top half for the kernel.
+ * This returns true for user pages that need to get copied into
+ * both the user and kernel copies of the page tables, and false
+ * for kernel pages that should only be in the kernel copy.
+ */
+static inline bool is_userspace_pgd(pgd_t *pgdp)
+{
+ return ((unsigned long)pgdp % PAGE_SIZE) < (PAGE_SIZE / 2);
+}
+
+pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd)
+{
+ if (!kaiser_enabled)
+ return pgd;
+ /*
+ * Do we need to also populate the shadow pgd? Check _PAGE_USER to
+ * skip cases like kexec and EFI which make temporary low mappings.
+ */
+ if (pgd.pgd & _PAGE_USER) {
+ if (is_userspace_pgd(pgdp)) {
+ native_get_shadow_pgd(pgdp)->pgd = pgd.pgd;
+ /*
+ * Even if the entry is *mapping* userspace, ensure
+ * that userspace can not use it. This way, if we
+ * get out to userspace running on the kernel CR3,
+ * userspace will crash instead of running.
+ */
+ pgd.pgd |= _PAGE_NX;
+ }
+ } else if (!pgd.pgd) {
+ /*
+ * pgd_clear() cannot check _PAGE_USER, and is even used to
+ * clear corrupted pgd entries: so just rely on cases like
+ * kexec and EFI never to be using pgd_clear().
+ */
+ if (!WARN_ON_ONCE((unsigned long)pgdp & PAGE_SIZE) &&
+ is_userspace_pgd(pgdp))
+ native_get_shadow_pgd(pgdp)->pgd = pgd.pgd;
+ }
+ return pgd;
+}
+
+void kaiser_setup_pcid(void)
+{
+ unsigned long user_cr3 = KAISER_SHADOW_PGD_OFFSET;
+
+ if (this_cpu_has(X86_FEATURE_PCID))
+ user_cr3 |= X86_CR3_PCID_USER_NOFLUSH;
+ /*
+ * These variables are used by the entry/exit
+ * code to change PCID and pgd and TLB flushing.
+ */
+ this_cpu_write(x86_cr3_pcid_user, user_cr3);
+}
+
+/*
+ * Make a note that this cpu will need to flush USER tlb on return to user.
+ * If cpu does not have PCID, then the NOFLUSH bit will never have been set.
+ */
+void kaiser_flush_tlb_on_return_to_user(void)
+{
+ if (this_cpu_has(X86_FEATURE_PCID))
+ this_cpu_write(x86_cr3_pcid_user,
+ X86_CR3_PCID_USER_FLUSH | KAISER_SHADOW_PGD_OFFSET);
+}
+EXPORT_SYMBOL(kaiser_flush_tlb_on_return_to_user);
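
For reference, the exported kaiser_add_mapping()/kaiser_remove_mapping() pair is how the rest of this series exposes individual kernel pages to the user-mode page tables; a hypothetical caller (not part of the patch) looks like the sketch below, and the thread-stack helpers added in include/linux/kaiser.h later in this diff are the real in-tree example:

/* Hypothetical example: map one kernel page into the shadow (user) page tables. */
static int expose_entry_page(void *page_vaddr)
{
	/* kaiser_add_user_map() masks _PAGE_GLOBAL out of the flags itself. */
	return kaiser_add_mapping((unsigned long)page_vaddr, PAGE_SIZE, __PAGE_KERNEL);
}

static void unexpose_entry_page(void *page_vaddr)
{
	kaiser_remove_mapping((unsigned long)page_vaddr, PAGE_SIZE);
}
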
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 32e0bb5a6718..e77ac22491da 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -52,6 +52,7 @@ static DEFINE_SPINLOCK(cpa_lock);
#define CPA_FLUSHTLB 1
#define CPA_ARRAY 2
#define CPA_PAGES_ARRAY 4
+#define CPA_FREE_PAGETABLES 8
#ifdef CONFIG_PROC_FS
static unsigned long direct_pages_count[PG_LEVEL_NUM];
@@ -723,10 +724,13 @@ static int split_large_page(struct cpa_data *cpa, pte_t *kpte,
return 0;
}
-static bool try_to_free_pte_page(pte_t *pte)
+static bool try_to_free_pte_page(struct cpa_data *cpa, pte_t *pte)
{
int i;
+ if (!(cpa->flags & CPA_FREE_PAGETABLES))
+ return false;
+
for (i = 0; i < PTRS_PER_PTE; i++)
if (!pte_none(pte[i]))
return false;
@@ -735,10 +739,13 @@ static bool try_to_free_pte_page(pte_t *pte)
return true;
}
-static bool try_to_free_pmd_page(pmd_t *pmd)
+static bool try_to_free_pmd_page(struct cpa_data *cpa, pmd_t *pmd)
{
int i;
+ if (!(cpa->flags & CPA_FREE_PAGETABLES))
+ return false;
+
for (i = 0; i < PTRS_PER_PMD; i++)
if (!pmd_none(pmd[i]))
return false;
@@ -759,7 +766,9 @@ static bool try_to_free_pud_page(pud_t *pud)
return true;
}
-static bool unmap_pte_range(pmd_t *pmd, unsigned long start, unsigned long end)
+static bool unmap_pte_range(struct cpa_data *cpa, pmd_t *pmd,
+ unsigned long start,
+ unsigned long end)
{
pte_t *pte = pte_offset_kernel(pmd, start);
@@ -770,22 +779,23 @@ static bool unmap_pte_range(pmd_t *pmd, unsigned long start, unsigned long end)
pte++;
}
- if (try_to_free_pte_page((pte_t *)pmd_page_vaddr(*pmd))) {
+ if (try_to_free_pte_page(cpa, (pte_t *)pmd_page_vaddr(*pmd))) {
pmd_clear(pmd);
return true;
}
return false;
}
-static void __unmap_pmd_range(pud_t *pud, pmd_t *pmd,
+static void __unmap_pmd_range(struct cpa_data *cpa, pud_t *pud, pmd_t *pmd,
unsigned long start, unsigned long end)
{
- if (unmap_pte_range(pmd, start, end))
- if (try_to_free_pmd_page((pmd_t *)pud_page_vaddr(*pud)))
+ if (unmap_pte_range(cpa, pmd, start, end))
+ if (try_to_free_pmd_page(cpa, (pmd_t *)pud_page_vaddr(*pud)))
pud_clear(pud);
}
-static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end)
+static void unmap_pmd_range(struct cpa_data *cpa, pud_t *pud,
+ unsigned long start, unsigned long end)
{
pmd_t *pmd = pmd_offset(pud, start);
@@ -796,7 +806,7 @@ static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end)
unsigned long next_page = (start + PMD_SIZE) & PMD_MASK;
unsigned long pre_end = min_t(unsigned long, end, next_page);
- __unmap_pmd_range(pud, pmd, start, pre_end);
+ __unmap_pmd_range(cpa, pud, pmd, start, pre_end);
start = pre_end;
pmd++;
@@ -809,7 +819,8 @@ static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end)
if (pmd_large(*pmd))
pmd_clear(pmd);
else
- __unmap_pmd_range(pud, pmd, start, start + PMD_SIZE);
+ __unmap_pmd_range(cpa, pud, pmd,
+ start, start + PMD_SIZE);
start += PMD_SIZE;
pmd++;
@@ -819,17 +830,19 @@ static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end)
* 4K leftovers?
*/
if (start < end)
- return __unmap_pmd_range(pud, pmd, start, end);
+ return __unmap_pmd_range(cpa, pud, pmd, start, end);
/*
* Try again to free the PMD page if haven't succeeded above.
*/
if (!pud_none(*pud))
- if (try_to_free_pmd_page((pmd_t *)pud_page_vaddr(*pud)))
+ if (try_to_free_pmd_page(cpa, (pmd_t *)pud_page_vaddr(*pud)))
pud_clear(pud);
}
-static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
+static void __unmap_pud_range(struct cpa_data *cpa, pgd_t *pgd,
+ unsigned long start,
+ unsigned long end)
{
pud_t *pud = pud_offset(pgd, start);
@@ -840,7 +853,7 @@ static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
unsigned long next_page = (start + PUD_SIZE) & PUD_MASK;
unsigned long pre_end = min_t(unsigned long, end, next_page);
- unmap_pmd_range(pud, start, pre_end);
+ unmap_pmd_range(cpa, pud, start, pre_end);
start = pre_end;
pud++;
@@ -854,7 +867,7 @@ static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
if (pud_large(*pud))
pud_clear(pud);
else
- unmap_pmd_range(pud, start, start + PUD_SIZE);
+ unmap_pmd_range(cpa, pud, start, start + PUD_SIZE);
start += PUD_SIZE;
pud++;
@@ -864,7 +877,7 @@ static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
* 2M leftovers?
*/
if (start < end)
- unmap_pmd_range(pud, start, end);
+ unmap_pmd_range(cpa, pud, start, end);
/*
* No need to try to free the PUD page because we'll free it in
@@ -872,6 +885,24 @@ static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
*/
}
+static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
+{
+ struct cpa_data cpa = {
+ .flags = CPA_FREE_PAGETABLES,
+ };
+
+ __unmap_pud_range(&cpa, pgd, start, end);
+}
+
+void unmap_pud_range_nofree(pgd_t *pgd, unsigned long start, unsigned long end)
+{
+ struct cpa_data cpa = {
+ .flags = 0,
+ };
+
+ __unmap_pud_range(&cpa, pgd, start, end);
+}
+
static void unmap_pgd_range(pgd_t *root, unsigned long addr, unsigned long end)
{
pgd_t *pgd_entry = root + pgd_index(addr);
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index fb0a9dd1d6e4..dbc27a2b4ad5 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -6,7 +6,7 @@
#include <asm/fixmap.h>
#include <asm/mtrr.h>
-#define PGALLOC_GFP GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO
+#define PGALLOC_GFP (GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO)
#ifdef CONFIG_HIGHPTE
#define PGALLOC_USER_GFP __GFP_HIGHMEM
@@ -340,14 +340,24 @@ static inline void _pgd_free(pgd_t *pgd)
kmem_cache_free(pgd_cache, pgd);
}
#else
+
+/*
+ * Instead of one pgd, Kaiser acquires two pgds. Being order-1, it is
+ * both 8k in size and 8k-aligned. That lets us just flip bit 12
+ * in a pointer to swap between the two 4k halves.
+ */
+#define PGD_ALLOCATION_ORDER kaiser_enabled
+
static inline pgd_t *_pgd_alloc(void)
{
- return (pgd_t *)__get_free_page(PGALLOC_GFP);
+ /* No __GFP_REPEAT: to avoid page allocation stalls in order-1 case */
+ return (pgd_t *)__get_free_pages(PGALLOC_GFP & ~__GFP_REPEAT,
+ PGD_ALLOCATION_ORDER);
}
static inline void _pgd_free(pgd_t *pgd)
{
- free_page((unsigned long)pgd);
+ free_pages((unsigned long)pgd, PGD_ALLOCATION_ORDER);
}
#endif /* CONFIG_X86_PAE */
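
The order-1, 8k-aligned layout described in the comment above is what lets the entry code and kaiser.c locate the shadow copy with nothing more than a bit flip. A sketch of that lookup, under an invented name (the real helper is native_get_shadow_pgd() in asm/pgtable_64.h and may differ in detail):

/* Sketch: the kernel pgd and its shadow are the two 4k halves of one 8k page. */
static inline pgd_t *shadow_pgd_of(pgd_t *pgdp)
{
	/* Setting bit 12 moves a pointer from the kernel half to the user half. */
	return (pgd_t *)((unsigned long)pgdp | (unsigned long)PAGE_SIZE);
}
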
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index eae22d82998d..a5858460a730 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -6,16 +6,19 @@
#include <linux/interrupt.h>
#include <linux/module.h>
#include <linux/cpu.h>
+#include <linux/debugfs.h>
+#include <linux/ptrace.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
#include <asm/cache.h>
#include <asm/apic.h>
#include <asm/uv/uv.h>
-#include <linux/debugfs.h>
+#include <asm/kaiser.h>
+#include <asm/spec_ctrl.h>
/*
- * Smarter SMP flushing macros.
+ * TLB flushing, formerly SMP-only
* c/o Linus Torvalds.
*
* These mean you can really definitely utterly forget about
@@ -34,6 +37,36 @@ struct flush_tlb_info {
unsigned long flush_end;
};
+static void load_new_mm_cr3(pgd_t *pgdir)
+{
+ unsigned long new_mm_cr3 = __pa(pgdir);
+
+ if (kaiser_enabled) {
+ /*
+ * We reuse the same PCID for different tasks, so we must
+ * flush all the entries for the PCID out when we change tasks.
+ * Flush KERN below, flush USER when returning to userspace in
+ * kaiser's SWITCH_USER_CR3 (_SWITCH_TO_USER_CR3) macro.
+ *
+ * invpcid_flush_single_context(X86_CR3_PCID_ASID_USER) could
+ * do it here, but can only be used if X86_FEATURE_INVPCID is
+ * available - and many machines support pcid without invpcid.
+ *
+ * If X86_CR3_PCID_KERN_FLUSH actually added something, then it
+ * would be needed in the write_cr3() below - if PCIDs enabled.
+ */
+ BUILD_BUG_ON(X86_CR3_PCID_KERN_FLUSH);
+ kaiser_flush_tlb_on_return_to_user();
+ }
+
+ /*
+ * Caution: many callers of this function expect
+ * that load_cr3() is serializing and orders TLB
+ * fills with respect to the mm_cpumask writes.
+ */
+ write_cr3(new_mm_cr3);
+}
+
/*
* We cannot call mmdrop() because we are in interrupt context,
* instead update mm->cpu_vm_mask.
@@ -45,7 +78,7 @@ void leave_mm(int cpu)
BUG();
if (cpumask_test_cpu(cpu, mm_cpumask(active_mm))) {
cpumask_clear_cpu(cpu, mm_cpumask(active_mm));
- load_cr3(swapper_pg_dir);
+ load_new_mm_cr3(swapper_pg_dir);
/*
* This gets called in the idle path where RCU
* functions differently. Tracing normally
@@ -57,6 +90,114 @@ void leave_mm(int cpu)
}
EXPORT_SYMBOL_GPL(leave_mm);
+void switch_mm(struct mm_struct *prev, struct mm_struct *next,
+ struct task_struct *tsk)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ switch_mm_irqs_off(prev, next, tsk);
+ local_irq_restore(flags);
+}
+
+void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
+ struct task_struct *tsk)
+{
+ unsigned cpu = smp_processor_id();
+
+ if (likely(prev != next)) {
+
+ /* Null tsk means switching to kernel, so that's safe */
+ if (tsk && ___ptrace_may_access(tsk, current, PTRACE_MODE_IBPB))
+ x86_ibp_barrier();
+
+ this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK);
+ this_cpu_write(cpu_tlbstate.active_mm, next);
+ cpumask_set_cpu(cpu, mm_cpumask(next));
+
+ /*
+ * Re-load page tables.
+ *
+ * This logic has an ordering constraint:
+ *
+ * CPU 0: Write to a PTE for 'next'
+ * CPU 0: load bit 1 in mm_cpumask. if nonzero, send IPI.
+ * CPU 1: set bit 1 in next's mm_cpumask
+ * CPU 1: load from the PTE that CPU 0 writes (implicit)
+ *
+ * We need to prevent an outcome in which CPU 1 observes
+ * the new PTE value and CPU 0 observes bit 1 clear in
+ * mm_cpumask. (If that occurs, then the IPI will never
+ * be sent, and CPU 0's TLB will contain a stale entry.)
+ *
+ * The bad outcome can occur if either CPU's load is
+ * reordered before that CPU's store, so both CPUs must
+ * execute full barriers to prevent this from happening.
+ *
+ * Thus, switch_mm needs a full barrier between the
+ * store to mm_cpumask and any operation that could load
+ * from next->pgd. TLB fills are special and can happen
+ * due to instruction fetches or for no reason at all,
+ * and neither LOCK nor MFENCE orders them.
+ * Fortunately, load_cr3() is serializing and gives the
+ * ordering guarantee we need.
+ *
+ */
+ load_new_mm_cr3(next->pgd);
+
+ trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
+
+ /* Stop flush ipis for the previous mm */
+ cpumask_clear_cpu(cpu, mm_cpumask(prev));
+
+ /* Load per-mm CR4 state */
+ load_mm_cr4(next);
+
+#ifdef CONFIG_MODIFY_LDT_SYSCALL
+ /*
+ * Load the LDT, if the LDT is different.
+ *
+ * It's possible that prev->context.ldt doesn't match
+ * the LDT register. This can happen if leave_mm(prev)
+ * was called and then modify_ldt changed
+ * prev->context.ldt but suppressed an IPI to this CPU.
+ * In this case, prev->context.ldt != NULL, because we
+ * never set context.ldt to NULL while the mm still
+ * exists. That means that next->context.ldt !=
+ * prev->context.ldt, because mms never share an LDT.
+ */
+ if (unlikely(prev->context.ldt != next->context.ldt))
+ load_mm_ldt(next);
+#endif
+ } else {
+ this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK);
+ BUG_ON(this_cpu_read(cpu_tlbstate.active_mm) != next);
+
+ if (!cpumask_test_cpu(cpu, mm_cpumask(next))) {
+ /*
+ * On established mms, the mm_cpumask is only changed
+ * from irq context, from ptep_clear_flush() while in
+ * lazy tlb mode, and here. Irqs are blocked during
+ * schedule, protecting us from simultaneous changes.
+ */
+ cpumask_set_cpu(cpu, mm_cpumask(next));
+
+ /*
+ * We were in lazy tlb mode and leave_mm disabled
+ * tlb flush IPI delivery. We must reload CR3
+ * to make sure to use no freed page tables.
+ *
+ * As above, load_cr3() is serializing and orders TLB
+ * fills with respect to the mm_cpumask write.
+ */
+ load_new_mm_cr3(next->pgd);
+ trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
+ load_mm_cr4(next);
+ load_mm_ldt(next);
+ }
+ }
+}
+
/*
* The flush IPI assumes that a thread switch happens in this order:
* [cpu0: the cpu that switches]
@@ -158,23 +299,6 @@ void native_flush_tlb_others(struct cpumask *cpumask,
smp_call_function_many(cpumask, flush_tlb_func, &info, 1);
}
-void flush_tlb_current_task(void)
-{
- struct mm_struct *mm = current->mm;
-
- preempt_disable();
-
- count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
-
- /* This is an implicit full barrier that synchronizes with switch_mm. */
- local_flush_tlb();
-
- trace_tlb_flush(TLB_LOCAL_SHOOTDOWN, TLB_FLUSH_ALL);
- if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
- flush_tlb_others(mm_cpumask(mm), mm, 0UL, TLB_FLUSH_ALL);
- preempt_enable();
-}
-
/*
* See Documentation/x86/tlb.txt for details. We choose 33
* because it is large enough to cover the vast majority (at
@@ -195,6 +319,12 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
unsigned long base_pages_to_flush = TLB_FLUSH_ALL;
preempt_disable();
+
+ if ((end != TLB_FLUSH_ALL) && !(vmflag & VM_HUGETLB))
+ base_pages_to_flush = (end - start) >> PAGE_SHIFT;
+ if (base_pages_to_flush > tlb_single_page_flush_ceiling)
+ base_pages_to_flush = TLB_FLUSH_ALL;
+
if (current->active_mm != mm) {
/* Synchronize with switch_mm. */
smp_mb();
@@ -211,15 +341,11 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
goto out;
}
- if ((end != TLB_FLUSH_ALL) && !(vmflag & VM_HUGETLB))
- base_pages_to_flush = (end - start) >> PAGE_SHIFT;
-
/*
* Both branches below are implicit full barriers (MOV to CR or
* INVLPG) that synchronize with switch_mm.
*/
- if (base_pages_to_flush > tlb_single_page_flush_ceiling) {
- base_pages_to_flush = TLB_FLUSH_ALL;
+ if (base_pages_to_flush == TLB_FLUSH_ALL) {
count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
local_flush_tlb();
} else {
@@ -240,33 +366,6 @@ out:
preempt_enable();
}
-void flush_tlb_page(struct vm_area_struct *vma, unsigned long start)
-{
- struct mm_struct *mm = vma->vm_mm;
-
- preempt_disable();
-
- if (current->active_mm == mm) {
- if (current->mm) {
- /*
- * Implicit full barrier (INVLPG) that synchronizes
- * with switch_mm.
- */
- __flush_tlb_one(start);
- } else {
- leave_mm(smp_processor_id());
-
- /* Synchronize with switch_mm. */
- smp_mb();
- }
- }
-
- if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
- flush_tlb_others(mm_cpumask(mm), mm, start, start + PAGE_SIZE);
-
- preempt_enable();
-}
-
static void do_flush_tlb_all(void *info)
{
count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 18750ea64c29..1957ab150549 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -443,6 +443,12 @@ static void __init xen_init_cpuid_mask(void)
~((1 << X86_FEATURE_MTRR) | /* disable MTRR */
(1 << X86_FEATURE_ACC)); /* thermal monitoring */
+ /*
+ * Xen PV would need some work to support PCID: CR3 handling as well
+ * as xen_flush_tlb_others() would need updating.
+ */
+ cpuid_leaf1_ecx_mask &= ~(1 << (X86_FEATURE_PCID % 32)); /* disable PCID */
+
if (!xen_initial_domain())
cpuid_leaf1_edx_mask &=
~((1 << X86_FEATURE_ACPI)); /* disable ACPI */
diff --git a/drivers/media/usb/uvc/uvc_v4l2.c b/drivers/media/usb/uvc/uvc_v4l2.c
index 0e7d16fe84d4..125c0bded9b2 100644
--- a/drivers/media/usb/uvc/uvc_v4l2.c
+++ b/drivers/media/usb/uvc/uvc_v4l2.c
@@ -811,6 +811,7 @@ static int uvc_ioctl_enum_input(struct file *file, void *fh,
}
pin = iterm->id;
} else if (index < selector->bNrInPins) {
+ gmb();
pin = selector->baSourceID[index];
list_for_each_entry(iterm, &chain->entities, chain) {
if (!UVC_ENTITY_IS_ITERM(iterm))
diff --git a/drivers/net/wireless/ath/carl9170/main.c b/drivers/net/wireless/ath/carl9170/main.c
index 4d1527a2e292..fa41bf2b2a0c 100644
--- a/drivers/net/wireless/ath/carl9170/main.c
+++ b/drivers/net/wireless/ath/carl9170/main.c
@@ -1388,6 +1388,7 @@ static int carl9170_op_conf_tx(struct ieee80211_hw *hw,
mutex_lock(&ar->mutex);
if (queue < ar->hw->queues) {
+ gmb();
memcpy(&ar->edcf[ar9170_qmap[queue]], param, sizeof(*param));
ret = carl9170_set_qos(ar);
} else {
diff --git a/drivers/net/wireless/cw1200/sta.c b/drivers/net/wireless/cw1200/sta.c
index c602a1e674ca..71a3c3aa3fa4 100644
--- a/drivers/net/wireless/cw1200/sta.c
+++ b/drivers/net/wireless/cw1200/sta.c
@@ -619,6 +619,7 @@ int cw1200_conf_tx(struct ieee80211_hw *dev, struct ieee80211_vif *vif,
mutex_lock(&priv->conf_mutex);
if (queue < dev->queues) {
+ gmb();
old_uapsd_flags = le16_to_cpu(priv->uapsd_info.uapsd_flags);
WSM_TX_QUEUE_SET(&priv->tx_queue_params, queue, 0, 0, 0);
diff --git a/drivers/net/wireless/p54/main.c b/drivers/net/wireless/p54/main.c
index 7805864e76f9..631aa53604ee 100644
--- a/drivers/net/wireless/p54/main.c
+++ b/drivers/net/wireless/p54/main.c
@@ -415,6 +415,7 @@ static int p54_conf_tx(struct ieee80211_hw *dev,
mutex_lock(&priv->conf_mutex);
if (queue < dev->queues) {
+ gmb();
P54_SET_QUEUE(priv->qos_params[queue], params->aifs,
params->cw_min, params->cw_max, params->txop);
ret = p54_set_edcf(priv);
diff --git a/drivers/scsi/qla2xxx/qla_mr.c b/drivers/scsi/qla2xxx/qla_mr.c
index b9d1e2a9e326..11c20636ab60 100644
--- a/drivers/scsi/qla2xxx/qla_mr.c
+++ b/drivers/scsi/qla2xxx/qla_mr.c
@@ -2304,10 +2304,12 @@ qlafx00_status_entry(scsi_qla_host_t *vha, struct rsp_que *rsp, void *pkt)
req = ha->req_q_map[que];
/* Validate handle. */
- if (handle < req->num_outstanding_cmds)
+ if (handle < req->num_outstanding_cmds) {
+ gmb();
sp = req->outstanding_cmds[handle];
- else
+ } else {
sp = NULL;
+ }
if (sp == NULL) {
ql_dbg(ql_dbg_io, vha, 0x3034,
@@ -2655,10 +2657,12 @@ qlafx00_multistatus_entry(struct scsi_qla_host *vha,
req = ha->req_q_map[que];
/* Validate handle. */
- if (handle < req->num_outstanding_cmds)
+ if (handle < req->num_outstanding_cmds) {
+ gmb();
sp = req->outstanding_cmds[handle];
- else
+ } else {
sp = NULL;
+ }
if (sp == NULL) {
ql_dbg(ql_dbg_io, vha, 0x3044,
diff --git a/drivers/thermal/int340x_thermal/int340x_thermal_zone.c b/drivers/thermal/int340x_thermal/int340x_thermal_zone.c
index b9b2666aa94c..7a42fab4a413 100644
--- a/drivers/thermal/int340x_thermal/int340x_thermal_zone.c
+++ b/drivers/thermal/int340x_thermal/int340x_thermal_zone.c
@@ -57,15 +57,16 @@ static int int340x_thermal_get_trip_temp(struct thermal_zone_device *zone,
if (d->override_ops && d->override_ops->get_trip_temp)
return d->override_ops->get_trip_temp(zone, trip, temp);
- if (trip < d->aux_trip_nr)
+ if (trip < d->aux_trip_nr) {
+ gmb();
*temp = d->aux_trips[trip];
- else if (trip == d->crt_trip_id)
+ } else if (trip == d->crt_trip_id) {
*temp = d->crt_temp;
- else if (trip == d->psv_trip_id)
+ } else if (trip == d->psv_trip_id) {
*temp = d->psv_temp;
- else if (trip == d->hot_trip_id)
+ } else if (trip == d->hot_trip_id) {
*temp = d->hot_temp;
- else {
+ } else {
for (i = 0; i < INT340X_THERMAL_MAX_ACT_TRIP_COUNT; i++) {
if (d->act_trips[i].valid &&
d->act_trips[i].id == trip) {
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 93e80b3cda03..436f5774c2b0 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -2328,6 +2328,27 @@ void nfs_access_add_cache(struct inode *inode, struct nfs_access_entry *set)
struct nfs_access_entry *cache = kmalloc(sizeof(*cache), GFP_KERNEL);
if (cache == NULL)
return;
+ /* If there is an old entry, remove it first to keep the cache
+ * from growing too large.
+ */
+ if (!list_empty(&NFS_I(inode)->access_cache_entry_lru)) {
+ struct nfs_access_entry *old;
+ spin_lock(&inode->i_lock);
+ old = list_first_entry_or_null(&NFS_I(inode)->access_cache_entry_lru,
+ struct nfs_access_entry, lru);
+ if (old &&
+ !nfs_have_delegated_attributes(inode) &&
+ !time_in_range_open(jiffies, old->jiffies,
+ old->jiffies + NFS_I(inode)->attrtimeo)) {
+ list_del_init(&old->lru);
+ rb_erase(&old->rb_node, &NFS_I(inode)->access_cache);
+ } else
+ old = NULL;
+ spin_unlock(&inode->i_lock);
+ if (old)
+ nfs_access_free_entry(old);
+ }
+
RB_CLEAR_NODE(&cache->rb_node);
cache->jiffies = set->jiffies;
cache->cred = get_rpccred(set->cred);
diff --git a/fs/udf/misc.c b/fs/udf/misc.c
index 71d1c25f360d..1b1beeab39b3 100644
--- a/fs/udf/misc.c
+++ b/fs/udf/misc.c
@@ -104,6 +104,8 @@ struct genericFormat *udf_add_extendedattr(struct inode *inode, uint32_t size,
iinfo->i_lenEAttr) {
uint32_t aal =
le32_to_cpu(eahd->appAttrLocation);
+
+ gmb();
memmove(&ea[offset - aal + size],
&ea[aal], offset - aal);
offset -= aal;
@@ -114,6 +116,8 @@ struct genericFormat *udf_add_extendedattr(struct inode *inode, uint32_t size,
iinfo->i_lenEAttr) {
uint32_t ial =
le32_to_cpu(eahd->impAttrLocation);
+
+ gmb();
memmove(&ea[offset - ial + size],
&ea[ial], offset - ial);
offset -= ial;
@@ -125,6 +129,8 @@ struct genericFormat *udf_add_extendedattr(struct inode *inode, uint32_t size,
iinfo->i_lenEAttr) {
uint32_t aal =
le32_to_cpu(eahd->appAttrLocation);
+
+ gmb();
memmove(&ea[offset - aal + size],
&ea[aal], offset - aal);
offset -= aal;
diff --git a/include/asm-generic/barrier.h b/include/asm-generic/barrier.h
index fe297b599b0a..0ee1345c9222 100644
--- a/include/asm-generic/barrier.h
+++ b/include/asm-generic/barrier.h
@@ -42,6 +42,10 @@
#define wmb() mb()
#endif
+#ifndef gmb
+#define gmb() do { } while (0)
+#endif
+
#ifndef dma_rmb
#define dma_rmb() rmb()
#endif
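
Every gmb() call added elsewhere in this diff follows the same pattern: an attacker-influenced index is bounds-checked, and gmb() sits between the check and the dependent load so the load cannot be issued speculatively on a mispredicted branch. A generic sketch with hypothetical names (not taken from any of the files touched here):

/* Hypothetical example of the bounds-check hardening pattern used in this series. */
static int read_slot(const int *slots, unsigned int nr_slots, unsigned int idx)
{
	if (idx < nr_slots) {
		gmb();			/* no speculation past the bounds check */
		return slots[idx];
	}
	return -EINVAL;
}
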
diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index 176aaa118cc9..4850d66bca34 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -728,7 +728,14 @@
*/
#define PERCPU_INPUT(cacheline) \
VMLINUX_SYMBOL(__per_cpu_start) = .; \
+ VMLINUX_SYMBOL(__per_cpu_user_mapped_start) = .; \
*(.data..percpu..first) \
+ . = ALIGN(cacheline); \
+ *(.data..percpu..user_mapped) \
+ *(.data..percpu..user_mapped..shared_aligned) \
+ . = ALIGN(PAGE_SIZE); \
+ *(.data..percpu..user_mapped..page_aligned) \
+ VMLINUX_SYMBOL(__per_cpu_user_mapped_end) = .; \
. = ALIGN(PAGE_SIZE); \
*(.data..percpu..page_aligned) \
. = ALIGN(cacheline); \
diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h
index 5295535b60c6..4f2c526d5683 100644
--- a/include/linux/fdtable.h
+++ b/include/linux/fdtable.h
@@ -81,8 +81,10 @@ static inline struct file *__fcheck_files(struct files_struct *files, unsigned i
{
struct fdtable *fdt = rcu_dereference_raw(files->fdt);
- if (fd < fdt->max_fds)
+ if (fd < fdt->max_fds) {
+ gmb();
return rcu_dereference_raw(fdt->fd[fd]);
+ }
return NULL;
}
diff --git a/include/linux/kaiser.h b/include/linux/kaiser.h
new file mode 100644
index 000000000000..4a4d6d911a14
--- /dev/null
+++ b/include/linux/kaiser.h
@@ -0,0 +1,52 @@
+#ifndef _LINUX_KAISER_H
+#define _LINUX_KAISER_H
+
+#ifdef CONFIG_KAISER
+#include <asm/kaiser.h>
+
+static inline int kaiser_map_thread_stack(void *stack)
+{
+ /*
+ * Map that page of kernel stack on which we enter from user context.
+ */
+ return kaiser_add_mapping((unsigned long)stack +
+ THREAD_SIZE - PAGE_SIZE, PAGE_SIZE, __PAGE_KERNEL);
+}
+
+static inline void kaiser_unmap_thread_stack(void *stack)
+{
+ /*
+ * Note: may be called even when kaiser_map_thread_stack() failed.
+ */
+ kaiser_remove_mapping((unsigned long)stack +
+ THREAD_SIZE - PAGE_SIZE, PAGE_SIZE);
+}
+#else
+
+/*
+ * These stubs are used whenever CONFIG_KAISER is off, which
+ * includes architectures that support KAISER, but have it disabled.
+ */
+
+static inline void kaiser_init(void)
+{
+}
+static inline int kaiser_add_mapping(unsigned long addr,
+ unsigned long size, unsigned long flags)
+{
+ return 0;
+}
+static inline void kaiser_remove_mapping(unsigned long start,
+ unsigned long size)
+{
+}
+static inline int kaiser_map_thread_stack(void *stack)
+{
+ return 0;
+}
+static inline void kaiser_unmap_thread_stack(void *stack)
+{
+}
+
+#endif /* !CONFIG_KAISER */
+#endif /* _LINUX_KAISER_H */
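
Because the !CONFIG_KAISER stubs compile to no-ops, common code can call these helpers unconditionally; the kernel/fork.c hunk further below does exactly that around thread-stack allocation, roughly as in this sketch (the wrapper names are invented for illustration):

/* Hypothetical wrappers showing the pairing over a thread stack's lifetime. */
static int example_map_stack(struct task_struct *tsk, struct thread_info *ti)
{
	tsk->stack = ti;
	/* Only the top page of the stack, the one touched on kernel entry, is mapped. */
	return kaiser_map_thread_stack(tsk->stack);
}

static void example_unmap_stack(struct thread_info *ti)
{
	/* Safe even if the map above failed, or if CONFIG_KAISER is off. */
	kaiser_unmap_thread_stack(ti);
}
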
diff --git a/include/linux/mmu_context.h b/include/linux/mmu_context.h
index 70fffeba7495..66be4424daf0 100644
--- a/include/linux/mmu_context.h
+++ b/include/linux/mmu_context.h
@@ -1,9 +1,18 @@
#ifndef _LINUX_MMU_CONTEXT_H
#define _LINUX_MMU_CONTEXT_H
+#ifndef __GENKSYMS__
+#include <asm/mmu_context.h>
+#endif
+
struct mm_struct;
void use_mm(struct mm_struct *mm);
void unuse_mm(struct mm_struct *mm);
+/* Architectures that care about IRQ state in switch_mm can override this. */
+#ifndef switch_mm_irqs_off
+# define switch_mm_irqs_off switch_mm
+#endif
+
#endif
diff --git a/include/linux/percpu-defs.h b/include/linux/percpu-defs.h
index 8f16299ca068..cfe13cb4ec63 100644
--- a/include/linux/percpu-defs.h
+++ b/include/linux/percpu-defs.h
@@ -35,6 +35,12 @@
#endif
+#ifdef CONFIG_KAISER
+#define USER_MAPPED_SECTION "..user_mapped"
+#else
+#define USER_MAPPED_SECTION ""
+#endif
+
/*
* Base implementations of per-CPU variable declarations and definitions, where
* the section in which the variable is to be placed is provided by the
@@ -115,6 +121,12 @@
#define DEFINE_PER_CPU(type, name) \
DEFINE_PER_CPU_SECTION(type, name, "")
+#define DECLARE_PER_CPU_USER_MAPPED(type, name) \
+ DECLARE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION)
+
+#define DEFINE_PER_CPU_USER_MAPPED(type, name) \
+ DEFINE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION)
+
/*
* Declaration/definition used for per-CPU variables that must come first in
* the set of variables.
@@ -144,6 +156,14 @@
DEFINE_PER_CPU_SECTION(type, name, PER_CPU_SHARED_ALIGNED_SECTION) \
____cacheline_aligned_in_smp
+#define DECLARE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(type, name) \
+ DECLARE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION PER_CPU_SHARED_ALIGNED_SECTION) \
+ ____cacheline_aligned_in_smp
+
+#define DEFINE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(type, name) \
+ DEFINE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION PER_CPU_SHARED_ALIGNED_SECTION) \
+ ____cacheline_aligned_in_smp
+
#define DECLARE_PER_CPU_ALIGNED(type, name) \
DECLARE_PER_CPU_SECTION(type, name, PER_CPU_ALIGNED_SECTION) \
____cacheline_aligned
@@ -162,11 +182,21 @@
#define DEFINE_PER_CPU_PAGE_ALIGNED(type, name) \
DEFINE_PER_CPU_SECTION(type, name, "..page_aligned") \
__aligned(PAGE_SIZE)
+/*
+ * Declaration/definition used for per-CPU variables that must be page aligned and need to be mapped in user mode.
+ */
+#define DECLARE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(type, name) \
+ DECLARE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION"..page_aligned") \
+ __aligned(PAGE_SIZE)
+
+#define DEFINE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(type, name) \
+ DEFINE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION"..page_aligned") \
+ __aligned(PAGE_SIZE)
/*
* Declaration/definition used for per-CPU variables that must be read mostly.
*/
-#define DECLARE_PER_CPU_READ_MOSTLY(type, name) \
+#define DECLARE_PER_CPU_READ_MOSTLY(type, name) \
DECLARE_PER_CPU_SECTION(type, name, "..read_mostly")
#define DEFINE_PER_CPU_READ_MOSTLY(type, name) \
diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h
index 81fdf4b8aba4..c84b8817755b 100644
--- a/include/linux/ptrace.h
+++ b/include/linux/ptrace.h
@@ -59,12 +59,15 @@ extern void exit_ptrace(struct task_struct *tracer, struct list_head *dead);
#define PTRACE_MODE_NOAUDIT 0x04
#define PTRACE_MODE_FSCREDS 0x08
#define PTRACE_MODE_REALCREDS 0x10
+#define PTRACE_MODE_NOACCESS_CHK 0x20
/* shorthands for READ/ATTACH and FSCREDS/REALCREDS combinations */
#define PTRACE_MODE_READ_FSCREDS (PTRACE_MODE_READ | PTRACE_MODE_FSCREDS)
#define PTRACE_MODE_READ_REALCREDS (PTRACE_MODE_READ | PTRACE_MODE_REALCREDS)
#define PTRACE_MODE_ATTACH_FSCREDS (PTRACE_MODE_ATTACH | PTRACE_MODE_FSCREDS)
#define PTRACE_MODE_ATTACH_REALCREDS (PTRACE_MODE_ATTACH | PTRACE_MODE_REALCREDS)
+#define PTRACE_MODE_IBPB (PTRACE_MODE_ATTACH | PTRACE_MODE_NOAUDIT \
+ | PTRACE_MODE_NOACCESS_CHK | PTRACE_MODE_REALCREDS)
/**
* ptrace_may_access - check whether the caller is permitted to access
@@ -82,6 +85,9 @@ extern void exit_ptrace(struct task_struct *tracer, struct list_head *dead);
*/
extern bool ptrace_may_access(struct task_struct *task, unsigned int mode);
+extern int ___ptrace_may_access(struct task_struct *cur, struct task_struct *task,
+ unsigned int mode);
+
static inline int ptrace_reparented(struct task_struct *child)
{
return !same_thread_group(child->real_parent, child->parent);
diff --git a/include/linux/sunrpc/auth.h b/include/linux/sunrpc/auth.h
index 1ecf13e148b8..b3e4cd7d1318 100644
--- a/include/linux/sunrpc/auth.h
+++ b/include/linux/sunrpc/auth.h
@@ -127,6 +127,9 @@ struct rpc_authops {
struct rpcsec_gss_info *);
int (*key_timeout)(struct rpc_auth *,
struct rpc_cred *);
+#ifndef __GENKSYMS__
+ int (*hash_cred)(struct auth_cred *, unsigned int);
+#endif
};
struct rpc_credops {
diff --git a/init/main.c b/init/main.c
index 63ddeaa26406..38336b48bb88 100644
--- a/init/main.c
+++ b/init/main.c
@@ -82,6 +82,7 @@
#include <linux/integrity.h>
#include <linux/proc_ns.h>
#include <linux/io.h>
+#include <linux/kaiser.h>
#include <asm/io.h>
#include <asm/bugs.h>
@@ -493,6 +494,7 @@ static void __init mm_init(void)
pgtable_init();
vmalloc_init();
ioremap_huge_init();
+ kaiser_init();
}
asmlinkage __visible void __init start_kernel(void)
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 334b1bdd572c..9e784b389c0a 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -29,6 +29,7 @@
#include <linux/bpf.h>
#include <asm/unaligned.h>
+#include <asm/barrier.h>
/* Registers */
#define BPF_R0 regs[BPF_REG_0]
@@ -356,6 +357,7 @@ select_insn:
DST = IMM;
CONT;
LD_IMM_DW:
+ gmb();
DST = (u64) (u32) insn[0].imm | ((u64) (u32) insn[1].imm) << 32;
insn++;
CONT;
@@ -570,6 +572,7 @@ out:
*(SIZE *)(unsigned long) (DST + insn->off) = IMM; \
CONT; \
LDX_MEM_##SIZEOP: \
+ gmb(); \
DST = *(SIZE *)(unsigned long) (SRC + insn->off); \
CONT;
diff --git a/kernel/fork.c b/kernel/fork.c
index f96762e31aae..67c7c5c19a81 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -58,6 +58,7 @@
#include <linux/tsacct_kern.h>
#include <linux/cn_proc.h>
#include <linux/freezer.h>
+#include <linux/kaiser.h>
#include <linux/delayacct.h>
#include <linux/taskstats_kern.h>
#include <linux/random.h>
@@ -169,6 +170,7 @@ static struct thread_info *alloc_thread_info_node(struct task_struct *tsk,
static inline void free_thread_info(struct thread_info *ti)
{
+ kaiser_unmap_thread_stack(ti);
free_kmem_pages((unsigned long)ti, THREAD_SIZE_ORDER);
}
# else
@@ -352,6 +354,10 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
goto free_ti;
tsk->stack = ti;
+
+ err = kaiser_map_thread_stack(tsk->stack);
+ if (err)
+ goto free_ti;
#ifdef CONFIG_SECCOMP
/*
* We must handle setting up seccomp filters once we're under
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 5e2cd1030702..14cc49a52881 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -235,9 +235,10 @@ static int ptrace_has_cap(struct user_namespace *ns, unsigned int mode)
}
/* Returns 0 on success, -errno on denial. */
-static int __ptrace_may_access(struct task_struct *task, unsigned int mode)
+int ___ptrace_may_access(struct task_struct *cur, struct task_struct *task,
+ unsigned int mode)
{
- const struct cred *cred = current_cred(), *tcred;
+ const struct cred *cred = __task_cred(cur), *tcred;
struct mm_struct *mm;
kuid_t caller_uid;
kgid_t caller_gid;
@@ -257,7 +258,7 @@ static int __ptrace_may_access(struct task_struct *task, unsigned int mode)
*/
/* Don't let security modules deny introspection */
- if (same_thread_group(task, current))
+ if (same_thread_group(task, cur))
return 0;
rcu_read_lock();
if (mode & PTRACE_MODE_FSCREDS) {
@@ -295,7 +296,16 @@ ok:
!ptrace_has_cap(mm->user_ns, mode)))
return -EPERM;
- return security_ptrace_access_check(task, mode);
+ if (!(mode & PTRACE_MODE_NOACCESS_CHK))
+ return security_ptrace_access_check(task, mode);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(___ptrace_may_access);
+
+static int __ptrace_may_access(struct task_struct *task, unsigned int mode)
+{
+ return ___ptrace_may_access(current, task, mode);
}
bool ptrace_may_access(struct task_struct *task, unsigned int mode)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 620f6801a1a6..21153f9332a5 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -32,7 +32,11 @@
#include <linux/init.h>
#include <linux/uaccess.h>
#include <linux/highmem.h>
+#ifndef __GENKSYMS__
+#include <linux/mmu_context.h>
+#else
#include <asm/mmu_context.h>
+#endif
#include <linux/interrupt.h>
#include <linux/capability.h>
#include <linux/completion.h>
@@ -2994,7 +2998,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
atomic_inc(&oldmm->mm_count);
enter_lazy_tlb(oldmm, next);
} else
- switch_mm(oldmm, mm, next);
+ switch_mm_irqs_off(oldmm, mm, next);
if (!prev->mm) {
prev->active_mm = NULL;
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 9bafc211930c..7c1771800d90 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -496,8 +496,10 @@ static void *m_start(struct seq_file *seq, loff_t *ppos,
struct uid_gid_extent *extent = NULL;
loff_t pos = *ppos;
- if (pos < map->nr_extents)
+ if (pos < map->nr_extents) {
+ gmb();
extent = &map->extent[pos];
+ }
return extent;
}
diff --git a/mm/mmu_context.c b/mm/mmu_context.c
index f802c2d216a7..6f4d27c5bb32 100644
--- a/mm/mmu_context.c
+++ b/mm/mmu_context.c
@@ -4,9 +4,9 @@
*/
#include <linux/mm.h>
+#include <linux/sched.h>
#include <linux/mmu_context.h>
#include <linux/export.h>
-#include <linux/sched.h>
#include <asm/mmu_context.h>
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index 4d52a0e2f60d..cb0e28e16629 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -714,6 +714,7 @@ static int raw6_getfrag(void *from, char *to, int offset, int len, int odd,
if (offset < rfv->hlen) {
int copy = min(rfv->hlen - offset, len);
+ gmb();
if (skb->ip_summed == CHECKSUM_PARTIAL)
memcpy(to, rfv->c + offset, copy);
else
diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c
index 52cfc4478511..be8b78fedc61 100644
--- a/net/mpls/af_mpls.c
+++ b/net/mpls/af_mpls.c
@@ -43,6 +43,8 @@ static struct mpls_route *mpls_route_input_rcu(struct net *net, unsigned index)
if (index < net->mpls.platform_labels) {
struct mpls_route __rcu **platform_label =
rcu_dereference(net->mpls.platform_label);
+
+ gmb();
rt = rcu_dereference(platform_label[index]);
}
return rt;
diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c
index 02f53674dc39..2883527fa867 100644
--- a/net/sunrpc/auth.c
+++ b/net/sunrpc/auth.c
@@ -551,7 +551,7 @@ rpcauth_lookup_credcache(struct rpc_auth *auth, struct auth_cred * acred,
*entry, *new;
unsigned int nr;
- nr = hash_long(from_kuid(&init_user_ns, acred->uid), cache->hashbits);
+ nr = auth->au_ops->hash_cred(acred, cache->hashbits);
rcu_read_lock();
hlist_for_each_entry_rcu(entry, &cache->hashtable[nr], cr_hash) {
diff --git a/net/sunrpc/auth_generic.c b/net/sunrpc/auth_generic.c
index 41248b1820c7..68e526732a0f 100644
--- a/net/sunrpc/auth_generic.c
+++ b/net/sunrpc/auth_generic.c
@@ -13,6 +13,7 @@
#include <linux/sunrpc/clnt.h>
#include <linux/sunrpc/debug.h>
#include <linux/sunrpc/sched.h>
+#include <linux/hash.h>
#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
# define RPCDBG_FACILITY RPCDBG_AUTH
@@ -71,6 +72,25 @@ static struct rpc_cred *generic_bind_cred(struct rpc_task *task,
return auth->au_ops->lookup_cred(auth, acred, lookupflags);
}
+static int
+generic_hash_cred(struct auth_cred *acred, unsigned int hashbits)
+{
+ u32 uid = from_kuid(&init_user_ns, acred->uid);
+ u32 gid;
+ int ret = hash_32(uid, 32);
+
+ if (acred->group_info) {
+ int g;
+
+ for (g = 0; g < acred->group_info->ngroups; g++) {
+ gid = from_kgid(&init_user_ns, GROUP_AT(acred->group_info, g));
+ ret = hash_32(ret ^ gid, 32);
+ }
+ }
+ gid = from_kgid(&init_user_ns, acred->gid);
+ return hash_32(ret ^ gid, hashbits);
+}
+
/*
* Lookup generic creds for current process
*/
@@ -258,6 +278,7 @@ out_put:
static const struct rpc_authops generic_auth_ops = {
.owner = THIS_MODULE,
.au_name = "Generic",
+ .hash_cred = generic_hash_cred,
.lookup_cred = generic_lookup_cred,
.crcreate = generic_create_cred,
.key_timeout = generic_key_timeout,
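rpcauth_lookup_credcache() now asks each flavor for its bucket via the new .hash_cred operation (see the net/sunrpc/auth.c hunk above); generic_hash_cred() folds the uid and gids through hash_32(), unx_hash_cred() below does the same capped at NFS_NGROUPS, and gss_hash_cred() hashes the uid alone with hash_64(). A minimal sketch of the folding scheme with illustrative parameter names; only hash_32() and the mixing order come from the hunks.

#include <linux/hash.h>

/* Hedged sketch: start from the uid, mix in each supplementary gid,
 * then the primary gid, reducing to the cache's bucket width last. */
static unsigned int example_hash_cred(u32 uid, const u32 *gids, int ngids,
				      u32 primary_gid, unsigned int hashbits)
{
	u32 ret = hash_32(uid, 32);
	int g;

	for (g = 0; g < ngids; g++)
		ret = hash_32(ret ^ gids[g], 32);

	return hash_32(ret ^ primary_gid, hashbits);
}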
diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c
index 54c1be5e747d..6e340b23b907 100644
--- a/net/sunrpc/auth_gss/auth_gss.c
+++ b/net/sunrpc/auth_gss/auth_gss.c
@@ -1300,6 +1300,13 @@ gss_destroy_cred(struct rpc_cred *cred)
gss_destroy_nullcred(cred);
}
+static int
+gss_hash_cred(struct auth_cred *acred, unsigned int hashbits)
+{
+ u64 uid = from_kuid(&init_user_ns, acred->uid);
+ return hash_64(uid, hashbits);
+}
+
/*
* Lookup RPCSEC_GSS cred for the current process
*/
@@ -1984,6 +1991,7 @@ static const struct rpc_authops authgss_ops = {
.au_name = "RPCSEC_GSS",
.create = gss_create,
.destroy = gss_destroy,
+ .hash_cred = gss_hash_cred,
.lookup_cred = gss_lookup_cred,
.crcreate = gss_create_cred,
.list_pseudoflavors = gss_mech_list_pseudoflavors,
diff --git a/net/sunrpc/auth_unix.c b/net/sunrpc/auth_unix.c
index 548240dd15fc..7ddb58621e87 100644
--- a/net/sunrpc/auth_unix.c
+++ b/net/sunrpc/auth_unix.c
@@ -13,6 +13,7 @@
#include <linux/sunrpc/clnt.h>
#include <linux/sunrpc/auth.h>
#include <linux/user_namespace.h>
+#include <linux/hash.h>
#define NFS_NGROUPS 16
@@ -48,6 +49,25 @@ unx_destroy(struct rpc_auth *auth)
rpcauth_clear_credcache(auth->au_credcache);
}
+static int
+unx_hash_cred(struct auth_cred *acred, unsigned int hashbits)
+{
+ u32 uid = from_kuid(&init_user_ns, acred->uid);
+ u32 gid;
+ int ret = hash_32(uid, 32);
+
+ if (acred->group_info) {
+ int g;
+
+ for (g = 0; g < acred->group_info->ngroups && g < NFS_NGROUPS; g++) {
+ gid = from_kgid(&init_user_ns, GROUP_AT(acred->group_info, g));
+ ret = hash_32(ret ^ gid, 32);
+ }
+ }
+ gid = from_kgid(&init_user_ns, acred->gid);
+ return hash_32(ret ^ gid, hashbits);
+}
+
/*
* Lookup AUTH_UNIX creds for current process
*/
@@ -222,6 +242,7 @@ const struct rpc_authops authunix_ops = {
.au_name = "UNIX",
.create = unx_create,
.destroy = unx_destroy,
+ .hash_cred = unx_hash_cred,
.lookup_cred = unx_lookup_cred,
.crcreate = unx_create_cred,
};
diff --git a/security/Kconfig b/security/Kconfig
index ca358bd671ba..5565288746f5 100644
--- a/security/Kconfig
+++ b/security/Kconfig
@@ -31,6 +31,16 @@ config SECURITY
If you are unsure how to answer this question, answer N.
+config KAISER
+ bool "Remove the kernel mapping in user mode"
+ default y
+ depends on X86_64 && SMP
+ help
+ This enforces a strict kernel and user space isolation, in order
+ to close hardware side channels on kernel address information.
+
+ If you are unsure how to answer this question, answer Y.
+
config SECURITYFS
bool "Enable the securityfs filesystem"
help
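A minimal .config fragment for the KAISER entry added above, assuming an x86_64 SMP build as its depends line requires; the option defaults to y, so this is only illustrative of how it appears when set explicitly.

CONFIG_SMP=y
CONFIG_KAISER=y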