author	Borislav Petkov <bp@suse.de>	2018-01-12 21:55:29 +0100
committer	Borislav Petkov <bp@suse.de>	2018-01-12 21:55:38 +0100
commit	8342aabecb4645621f68553598b779741fdee45d (patch)
tree	12788e6acfa58acd2b4c7968ef40dbf4bb98e8c6
parent	5b26955607c14607381b9ab7f0f933474ee79b4e (diff)
x86/entry/64: Create a per-CPU SYSCALL entry trampoline (bsc#1068032 CVE-2017-5754).
-rw-r--r--	patches.arch/18-x86-entry-64-create-a-per-cpu-syscall-entry-trampoline.patch	221
-rw-r--r--	series.conf	1
2 files changed, 222 insertions, 0 deletions
diff --git a/patches.arch/18-x86-entry-64-create-a-per-cpu-syscall-entry-trampoline.patch b/patches.arch/18-x86-entry-64-create-a-per-cpu-syscall-entry-trampoline.patch
new file mode 100644
index 0000000000..b0dff7bf4f
--- /dev/null
+++ b/patches.arch/18-x86-entry-64-create-a-per-cpu-syscall-entry-trampoline.patch
@@ -0,0 +1,221 @@
+From: Andy Lutomirski <luto@kernel.org>
+Date: Mon, 4 Dec 2017 15:07:25 +0100
+Subject: x86/entry/64: Create a per-CPU SYSCALL entry trampoline
+Git-commit: 3386bc8aed825e9f1f65ce38df4b109b2019b71a
+Patch-mainline: v4.15-rc5
+References: bsc#1068032 CVE-2017-5754
+
+Handling SYSCALL is tricky: the SYSCALL handler is entered with every
+single register (except FLAGS), including RSP, live. It somehow needs
+to set RSP to point to a valid stack, which means it needs to save the
+user RSP somewhere and find its own stack pointer. The canonical way
+to do this is with SWAPGS, which lets us access percpu data using the
+%gs prefix.
+
+With PAGE_TABLE_ISOLATION-like pagetable switching, this is
+problematic. Without a scratch register, switching CR3 is impossible, so
+%gs-based percpu memory would need to be mapped in the user pagetables.
+Doing that without information leaks is difficult or impossible.
+
+Instead, use a different sneaky trick. Map a copy of the first part
+of the SYSCALL asm at a different address for each CPU. Now RIP
+varies depending on the CPU, so we can use RIP-relative memory access
+to access percpu memory. By putting the relevant information (one
+scratch slot and the stack address) at a constant offset relative to
+RIP, we can make SYSCALL work without relying on %gs.
+
+A nice thing about this approach is that we can easily switch it on
+and off if we want pagetable switching to be configurable.
+
+The compat variant of SYSCALL doesn't have this problem in the first
+place -- there are plenty of scratch registers, since we don't care
+about preserving r8-r15. This patch therefore doesn't touch SYSCALL32
+at all.
+
+This patch actually seems to be a small speedup. With this patch,
+SYSCALL touches an extra cache line and an extra virtual page, but
+the pipeline no longer stalls waiting for SWAPGS. It seems that, at
+least in a tight loop, the latter outweighs the former.
+
+Thanks to David Laight for an optimization tip.
+
+Signed-off-by: Andy Lutomirski <luto@kernel.org>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
+Reviewed-by: Borislav Petkov <bpetkov@suse.de>
+Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
+Cc: Borislav Petkov <bp@alien8.de>
+Cc: Brian Gerst <brgerst@gmail.com>
+Cc: Dave Hansen <dave.hansen@intel.com>
+Cc: Dave Hansen <dave.hansen@linux.intel.com>
+Cc: David Laight <David.Laight@aculab.com>
+Cc: Denys Vlasenko <dvlasenk@redhat.com>
+Cc: Eduardo Valentin <eduval@amazon.com>
+Cc: Greg KH <gregkh@linuxfoundation.org>
+Cc: H. Peter Anvin <hpa@zytor.com>
+Cc: Josh Poimboeuf <jpoimboe@redhat.com>
+Cc: Juergen Gross <jgross@suse.com>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Rik van Riel <riel@redhat.com>
+Cc: Will Deacon <will.deacon@arm.com>
+Cc: aliguori@amazon.com
+Cc: daniel.gruss@iaik.tugraz.at
+Cc: hughd@google.com
+Cc: keescook@google.com
+Link: https://lkml.kernel.org/r/20171204150606.403607157@linutronix.de
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Acked-by: Borislav Petkov <bp@suse.de>
+---
+ arch/x86/entry/entry_64.S | 58 ++++++++++++++++++++++++++++++++++++++++++
+ arch/x86/include/asm/fixmap.h | 2 +
+ arch/x86/kernel/asm-offsets.c | 1
+ arch/x86/kernel/cpu/common.c | 15 ++++++++++
+ arch/x86/kernel/vmlinux.lds.S | 9 ++++++
+ 5 files changed, 84 insertions(+), 1 deletion(-)
+
+--- a/arch/x86/entry/entry_64.S
++++ b/arch/x86/entry/entry_64.S
+@@ -135,6 +135,64 @@ END(native_usergs_sysret64)
+ * with them due to bugs in both AMD and Intel CPUs.
+ */
+
++ .pushsection .entry_trampoline, "ax"
++
++/*
++ * The code in here gets remapped into cpu_entry_area's trampoline. This means
++ * that the assembler and linker have the wrong idea as to where this code
++ * lives (and, in fact, it's mapped more than once, so it's not even at a
++ * fixed address). So we can't reference any symbols outside the entry
++ * trampoline and expect it to work.
++ *
++ * Instead, we carefully abuse %rip-relative addressing.
++ * _entry_trampoline(%rip) refers to the start of the remapped entry
++ * trampoline. We can thus find cpu_entry_area with this macro:
++ */
++
++#define CPU_ENTRY_AREA \
++ _entry_trampoline - CPU_ENTRY_AREA_entry_trampoline(%rip)
++
++/* The top word of the SYSENTER stack is hot and is usable as scratch space. */
++#define RSP_SCRATCH CPU_ENTRY_AREA_tss + CPU_TSS_SYSENTER_stack + \
++ SIZEOF_SYSENTER_stack - 8 + CPU_ENTRY_AREA
++
++ENTRY(entry_SYSCALL_64_trampoline)
++ UNWIND_HINT_EMPTY
++ swapgs
++
++ /* Stash the user RSP. */
++ movq %rsp, RSP_SCRATCH
++
++ /* Load the top of the task stack into RSP */
++ movq CPU_ENTRY_AREA_tss + TSS_sp1 + CPU_ENTRY_AREA, %rsp
++
++ /* Start building the simulated IRET frame. */
++ pushq $__USER_DS /* pt_regs->ss */
++ pushq RSP_SCRATCH /* pt_regs->sp */
++ pushq %r11 /* pt_regs->flags */
++ pushq $__USER_CS /* pt_regs->cs */
++ pushq %rcx /* pt_regs->ip */
++
++ /*
++ * x86 lacks a near absolute jump, and we can't jump to the real
++ * entry text with a relative jump. We could push the target
++ * address and then use retq, but this destroys the pipeline on
++ * many CPUs (wasting over 20 cycles on Sandy Bridge). Instead,
++ * spill RDI and restore it in a second-stage trampoline.
++ */
++ pushq %rdi
++ movq $entry_SYSCALL_64_stage2, %rdi
++ jmp *%rdi
++END(entry_SYSCALL_64_trampoline)
++
++ .popsection
++
++ENTRY(entry_SYSCALL_64_stage2)
++ UNWIND_HINT_EMPTY
++ popq %rdi
++ jmp entry_SYSCALL_64_after_hwframe
++END(entry_SYSCALL_64_stage2)
++
+ ENTRY(entry_SYSCALL_64)
+ UNWIND_HINT_EMPTY
+ /*
+--- a/arch/x86/include/asm/fixmap.h
++++ b/arch/x86/include/asm/fixmap.h
+@@ -61,6 +61,8 @@ struct cpu_entry_area {
+ * of the TSS region.
+ */
+ struct tss_struct tss;
++
++ char entry_trampoline[PAGE_SIZE];
+ };
+
+ #define CPU_ENTRY_AREA_PAGES (sizeof(struct cpu_entry_area) / PAGE_SIZE)
+--- a/arch/x86/kernel/asm-offsets.c
++++ b/arch/x86/kernel/asm-offsets.c
+@@ -100,4 +100,5 @@ void common(void) {
+
+ /* Layout info for cpu_entry_area */
+ OFFSET(CPU_ENTRY_AREA_tss, cpu_entry_area, tss);
++ OFFSET(CPU_ENTRY_AREA_entry_trampoline, cpu_entry_area, entry_trampoline);
+ }
+--- a/arch/x86/kernel/cpu/common.c
++++ b/arch/x86/kernel/cpu/common.c
+@@ -468,6 +468,8 @@ DEFINE_PER_CPU(struct cpu_entry_area *,
+ static inline void setup_cpu_entry_area(int cpu)
+ {
+ #ifdef CONFIG_X86_64
++ extern char _entry_trampoline[];
++
+ /* On 64-bit systems, we use a read-only fixmap GDT. */
+ pgprot_t gdt_prot = PAGE_KERNEL_RO;
+ #else
+@@ -514,6 +516,11 @@ static inline void setup_cpu_entry_area(
+ #ifdef CONFIG_X86_32
+ this_cpu_write(cpu_entry_area, get_cpu_entry_area(cpu));
+ #endif
++
++#ifdef CONFIG_X86_64
++ __set_fixmap(get_cpu_entry_area_index(cpu, entry_trampoline),
++ __pa_symbol(_entry_trampoline), PAGE_KERNEL_RX);
++#endif
+ }
+
+ /* Load the original GDT from the per-cpu structure */
+@@ -1380,10 +1387,16 @@ static DEFINE_PER_CPU_PAGE_ALIGNED(char,
+ /* May not be marked __init: used by software suspend */
+ void syscall_init(void)
+ {
++ extern char _entry_trampoline[];
++ extern char entry_SYSCALL_64_trampoline[];
++
+ int cpu = smp_processor_id();
++ unsigned long SYSCALL64_entry_trampoline =
++ (unsigned long)get_cpu_entry_area(cpu)->entry_trampoline +
++ (entry_SYSCALL_64_trampoline - _entry_trampoline);
+
+ wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS);
+- wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64);
++ wrmsrl(MSR_LSTAR, SYSCALL64_entry_trampoline);
+
+ #ifdef CONFIG_IA32_EMULATION
+ wrmsrl(MSR_CSTAR, (unsigned long)entry_SYSCALL_compat);
+--- a/arch/x86/kernel/vmlinux.lds.S
++++ b/arch/x86/kernel/vmlinux.lds.S
+@@ -106,6 +106,15 @@ SECTIONS
+ SOFTIRQENTRY_TEXT
+ *(.fixup)
+ *(.gnu.warning)
++
++#ifdef CONFIG_X86_64
++ . = ALIGN(PAGE_SIZE);
++ _entry_trampoline = .;
++ *(.entry_trampoline)
++ . = ALIGN(PAGE_SIZE);
++ ASSERT(. - _entry_trampoline == PAGE_SIZE, "entry trampoline is too big");
++#endif
++
+ /* End of text section */
+ _etext = .;
+ } :text = 0x9090
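
Note: the most subtle step in the patch above is the address arithmetic in syscall_init(), which must program MSR_LSTAR with this CPU's aliased copy of the trampoline rather than the link-time address of entry_SYSCALL_64_trampoline. The standalone C sketch below reproduces only that arithmetic outside the kernel; the symbol names mirror the patch, while the concrete addresses are made-up assumptions for illustration.

#include <stdio.h>

int main(void)
{
	/*
	 * Made-up example addresses (assumptions, not real kernel values):
	 * the link-time location of the .entry_trampoline section, the
	 * location of entry_SYSCALL_64_trampoline inside it, and the fixmap
	 * alias of this CPU's cpu_entry_area->entry_trampoline page.
	 */
	unsigned long _entry_trampoline           = 0xffffffff81a00000UL;
	unsigned long entry_SYSCALL_64_trampoline = 0xffffffff81a00040UL;
	unsigned long cpu_area_trampoline         = 0xffffffffff578000UL;

	/*
	 * Same computation as syscall_init() in the patch: keep the symbol's
	 * offset within the trampoline page, but rebase it onto the per-CPU
	 * alias before it would be written to MSR_LSTAR.
	 */
	unsigned long lstar = cpu_area_trampoline +
			      (entry_SYSCALL_64_trampoline - _entry_trampoline);

	printf("MSR_LSTAR for this CPU: %#lx\n", lstar);
	return 0;
}

Because every CPU maps the same physical trampoline page (via __set_fixmap and __pa_symbol(_entry_trampoline)) at a different virtual address inside its own cpu_entry_area, each CPU ends up with a distinct MSR_LSTAR value even though the trampoline text exists only once.
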
diff --git a/series.conf b/series.conf
index b31f12c5a7..e7e38012af 100644
--- a/series.conf
+++ b/series.conf
@@ -7389,6 +7389,7 @@
patches.arch/15-x86-entry-64-use-a-per-cpu-trampoline-stack-for-idt-entries.patch
patches.arch/16-x86-entry-64-return-to-userspace-from-the-trampoline-stack.patch
patches.arch/17-x86-xen-64-rearrange-the-syscall-entries.patch
+ patches.arch/18-x86-entry-64-create-a-per-cpu-syscall-entry-trampoline.patch
########################################################
# Staging tree patches
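
For completeness, a similar standalone sketch of the idea behind the CPU_ENTRY_AREA macro in the trampoline asm: entry_trampoline sits at a fixed offset inside struct cpu_entry_area, so subtracting that offset from the %rip-relative address of the currently executing trampoline copy yields the base of this CPU's cpu_entry_area. The struct below is an invented stand-in, not the real cpu_entry_area layout, and the base address is an assumption.

#include <stdio.h>
#include <stddef.h>

/* Invented stand-in layout; only the fixed offset of the field matters. */
struct cpu_entry_area_sketch {
	char earlier_fields[3 * 4096];   /* placeholder for gdt/tss/etc.        */
	char entry_trampoline[4096];     /* mirrors the field added by the patch */
};

int main(void)
{
	/* Assumed fixmap base of this CPU's cpu_entry_area. */
	unsigned long area_base = 0xffffffffff575000UL;

	/*
	 * The address the CPU observes for _entry_trampoline while running
	 * from this CPU's aliased copy, i.e. what %rip-relative addressing
	 * resolves to inside the trampoline.
	 */
	unsigned long rip_view_of_trampoline =
		area_base + offsetof(struct cpu_entry_area_sketch, entry_trampoline);

	/* CPU_ENTRY_AREA == _entry_trampoline - CPU_ENTRY_AREA_entry_trampoline(%rip) */
	unsigned long derived_area_base =
		rip_view_of_trampoline -
		offsetof(struct cpu_entry_area_sketch, entry_trampoline);

	printf("derived cpu_entry_area base: %#lx (expected %#lx)\n",
	       derived_area_base, area_base);
	return 0;
}

In the real patch the offset constant comes from asm-offsets.c (OFFSET(CPU_ENTRY_AREA_entry_trampoline, ...)), so the assembler sees a plain immediate and the trampoline can locate per-CPU data without any %gs-based access.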