author    Kernel Build Daemon <kbuild@suse.de>    2019-05-07 07:01:24 +0200
committer Kernel Build Daemon <kbuild@suse.de>    2019-05-07 07:01:24 +0200
commit    48ff8175a87b8ceeb9b34a71368aa5e7d08b3f31 (patch)
tree      485656599f219be5c44023320380f6cdb2c7d9a5
parent    36f7f6b0f3a2f26389e7907040de75afd6424f82 (diff)
parent    c22e3cf2aa5d8933b587945d3bc7ebdccfa84d0e (diff)
Merge branch 'SLE15' into SLE12-SP4
-rw-r--r--  blacklist.conf                                                                  2
-rw-r--r--  patches.arch/kvm-fix-uaf-in-nested-posted-interrupt-processing                  60
-rw-r--r--  patches.arch/kvm-nvmx-clear-reserved-bits-of-db-exit-qualification              66
-rw-r--r--  patches.arch/kvm-nvmx-restore-host-state-in-nested_vmx_vmexit-for-vmfail        324
-rw-r--r--  patches.arch/kvm-vmx-zero-out-all-general-purpose-registers-after-vm-exit       61
-rw-r--r--  patches.arch/kvm-x86-always-use-32-bit-smram-save-state-for-32-bit-kernels      153
-rw-r--r--  patches.arch/kvm-x86-don-t-clear-efer-during-smm-transitions-for-32-bit-vcpu    78
-rw-r--r--  patches.arch/kvm-x86-svm-make-sure-nmi-is-injected-after-nmi_singlestep         50
-rw-r--r--  patches.fixes/0001-btrfs-qgroup-Move-reserved-data-accounting-from-btrf.patch   307
-rw-r--r--  patches.fixes/0001-btrfs-qgroup-Remove-duplicated-trace-points-for-qgro.patch   46
-rw-r--r--  series.conf                                                                     9
11 files changed, 1156 insertions, 0 deletions
diff --git a/blacklist.conf b/blacklist.conf
index 726107fbf7..d4b06efdca 100644
--- a/blacklist.conf
+++ b/blacklist.conf
@@ -1097,3 +1097,5 @@ c8b1917c8987a6fa3695d479b4d60fbbbc3e537b # acpica: causes a regression (bsc#1132
2c2a2fb1e2a9256714338875bede6b7cbd4b9542 # acpica: reverting the above
45bb8d802742842fa974b0d7d474d115df1d07db # not a bug
379d98ddf41344273d9718556f761420f4dc80b3 # we don't use clang
+9fa246256e09dc30820524401cdbeeaadee94025 # We don't have the reverted commit: Revert "drm/i915/fbdev: Actually configure untiled displays"
+4a58038b9e420276157785afa0a0bbb4b9bc2265 # Reverted patch is not in SLE15
diff --git a/patches.arch/kvm-fix-uaf-in-nested-posted-interrupt-processing b/patches.arch/kvm-fix-uaf-in-nested-posted-interrupt-processing
new file mode 100644
index 0000000000..cb30c78eea
--- /dev/null
+++ b/patches.arch/kvm-fix-uaf-in-nested-posted-interrupt-processing
@@ -0,0 +1,60 @@
+From: Cfir Cohen <cfir@google.com>
+Date: Tue, 18 Dec 2018 08:18:41 -0800
+Subject: KVM: Fix UAF in nested posted interrupt processing
+Git-commit: c2dd5146e9fe1f22c77c1b011adf84eea0245806
+Patch-mainline: v4.20
+References: bsc#1134199
+
+nested_get_vmcs12_pages() processes the posted_intr address in vmcs12. It
+caches the kmap()ed page object and pointer, however, it doesn't handle
+errors correctly: it's possible to cache a valid pointer, then release
+the page and later dereference the dangling pointer.
+
+I was able to reproduce with the following steps:
+
+1. Call vmlaunch with valid posted_intr_desc_addr but an invalid
+MSR_EFER. This causes nested_get_vmcs12_pages() to cache the kmap()ed
+pi_desc_page and pi_desc. Later the invalid EFER value fails
+check_vmentry_postreqs() which fails the first vmlaunch.
+
+2. Call vmlaunch with a valid EFER but an invalid posted_intr_desc_addr
+(I set it to 2G - 0x80). The second time we call nested_get_vmcs12_pages
+pi_desc_page is unmapped and released and pi_desc_page is set to NULL
+(the "shouldn't happen" clause). Due to the invalid
+posted_intr_desc_addr, kvm_vcpu_gpa_to_page() fails and
+nested_get_vmcs12_pages() returns. It doesn't return an error value so
+vmlaunch proceeds. Note that at this time we have a dangling pointer in
+vmx->nested.pi_desc and POSTED_INTR_DESC_ADDR in L0's vmcs.
+
+3. Issue an IPI in L2 guest code. This triggers a call to
+vmx_complete_nested_posted_interrupt() and pi_test_and_clear_on() which
+dereferences the dangling pointer.
+
+Vulnerable code requires nested and enable_apicv variables to be set to
+true. The host CPU must also support posted interrupts.
+
+Fixes: 5e2f30b756a37 "KVM: nVMX: get rid of nested_get_page()"
+Cc: stable@vger.kernel.org
+Reviewed-by: Andy Honig <ahonig@google.com>
+Signed-off-by: Cfir Cohen <cfir@google.com>
+Reviewed-by: Liran Alon <liran.alon@oracle.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Acked-by: Joerg Roedel <jroedel@suse.de>
+---
+ arch/x86/kvm/vmx.c | 2 ++
+ 1 file changed, 2 insertions(+)
+
+diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
+index 02edd9960e9d..8d5d984541be 100644
+--- a/arch/x86/kvm/vmx.c
++++ b/arch/x86/kvm/vmx.c
+@@ -11985,6 +11985,8 @@ static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
+ kunmap(vmx->nested.pi_desc_page);
+ kvm_release_page_dirty(vmx->nested.pi_desc_page);
+ vmx->nested.pi_desc_page = NULL;
++ vmx->nested.pi_desc = NULL;
++ vmcs_write64(POSTED_INTR_DESC_ADDR, -1ull);
+ }
+ page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->posted_intr_desc_addr);
+ if (is_error_page(page))
+
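The bug described in the message above is the classic pattern of a cached pointer outliving the mapping it points into. As a rough, self-contained illustration (ordinary userspace C with made-up names, not the KVM code), the corrected teardown looks like this:

    #include <stdio.h>
    #include <stdlib.h>

    struct ctx {
        void *pi_desc_page;  /* the mapping that backs the descriptor */
        int  *pi_desc;       /* cached pointer into that mapping      */
    };

    /* Teardown mirroring the hunk above.  The bug was releasing the page
     * while leaving c->pi_desc pointing into it; the fix clears every
     * cached reference at the same time the mapping goes away. */
    static void release_mapping(struct ctx *c)
    {
        free(c->pi_desc_page);
        c->pi_desc_page = NULL;
        c->pi_desc = NULL;      /* without this line: dangling pointer */
    }

    int main(void)
    {
        struct ctx c;

        c.pi_desc_page = malloc(64);
        c.pi_desc = (int *)c.pi_desc_page;

        /* Step 2 of the reproducer: the mapping is torn down early. */
        release_mapping(&c);

        /* Step 3 would dereference the cached pointer; after the fixed
         * teardown it is NULL rather than pointing at freed memory. */
        printf("pi_desc = %p\n", (void *)c.pi_desc);
        return 0;
    }

The hunk additionally writes -1ull to POSTED_INTR_DESC_ADDR, so the stale guest-physical address left in L0's VMCS is invalidated as well.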
diff --git a/patches.arch/kvm-nvmx-clear-reserved-bits-of-db-exit-qualification b/patches.arch/kvm-nvmx-clear-reserved-bits-of-db-exit-qualification
new file mode 100644
index 0000000000..89065e6309
--- /dev/null
+++ b/patches.arch/kvm-nvmx-clear-reserved-bits-of-db-exit-qualification
@@ -0,0 +1,66 @@
+From: Jim Mattson <jmattson@google.com>
+Date: Fri, 21 Sep 2018 10:36:17 -0700
+Subject: KVM: nVMX: Clear reserved bits of #DB exit qualification
+Git-commit: cfb634fe3052aefc4e1360fa322018c9a0b49755
+Patch-mainline: v4.20-rc1
+References: bsc#1134200
+
+According to volume 3 of the SDM, bits 63:15 and 12:4 of the exit
+qualification field for debug exceptions are reserved (cleared to
+0). However, the SDM is incorrect about bit 16 (corresponding to
+DR6.RTM). This bit should be set if a debug exception (#DB) or a
+breakpoint exception (#BP) occurred inside an RTM region while
+advanced debugging of RTM transactional regions was enabled. Note that
+this is the opposite of DR6.RTM, which "indicates (when clear) that a
+debug exception (#DB) or breakpoint exception (#BP) occurred inside an
+RTM region while advanced debugging of RTM transactional regions was
+enabled."
+
+There is still an issue with stale DR6 bits potentially being
+misreported for the current debug exception. DR6 should not have been
+modified before vectoring the #DB exception, and the "new DR6 bits"
+should be available somewhere, but it was and they aren't.
+
+Fixes: b96fb439774e1 ("KVM: nVMX: fixes to nested virt interrupt injection")
+Signed-off-by: Jim Mattson <jmattson@google.com>
+Reviewed-by: Sean Christopherson <sean.j.christopherson@intel.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Acked-by: Joerg Roedel <jroedel@suse.de>
+---
+ arch/x86/include/asm/kvm_host.h | 1 +
+ arch/x86/kvm/vmx.c | 7 +++++--
+ 2 files changed, 6 insertions(+), 2 deletions(-)
+
+diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
+index 09b2e3e2cf1b..1c09a0d1771f 100644
+--- a/arch/x86/include/asm/kvm_host.h
++++ b/arch/x86/include/asm/kvm_host.h
+@@ -177,6 +177,7 @@ enum {
+
+ #define DR6_BD (1 << 13)
+ #define DR6_BS (1 << 14)
++#define DR6_BT (1 << 15)
+ #define DR6_RTM (1 << 16)
+ #define DR6_FIXED_1 0xfffe0ff0
+ #define DR6_INIT 0xffff0ff0
+diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
+index b3c5517a89b3..14d446366ca5 100644
+--- a/arch/x86/kvm/vmx.c
++++ b/arch/x86/kvm/vmx.c
+@@ -3290,10 +3290,13 @@ static int nested_vmx_check_exception(struct kvm_vcpu *vcpu, unsigned long *exit
+ }
+ } else {
+ if (vmcs12->exception_bitmap & (1u << nr)) {
+- if (nr == DB_VECTOR)
++ if (nr == DB_VECTOR) {
+ *exit_qual = vcpu->arch.dr6;
+- else
++ *exit_qual &= ~(DR6_FIXED_1 | DR6_BT);
++ *exit_qual ^= DR6_RTM;
++ } else {
+ *exit_qual = 0;
++ }
+ return 1;
+ }
+ }
+
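To see what the new masking in nested_vmx_check_exception() does to a concrete value, here is a small standalone check (plain C; the constants are copied from the hunk above, nothing else is kernel code):

    #include <stdio.h>
    #include <stdint.h>

    /* Constants copied from the hunk above. */
    #define DR6_BT       (1u << 15)
    #define DR6_RTM      (1u << 16)
    #define DR6_FIXED_1  0xfffe0ff0u

    int main(void)
    {
        /* Sample DR6: B0 (bit 0) and BS (bit 14) set, fixed-1 bits set,
         * DR6.RTM clear -- i.e. the #DB hit inside an RTM region with
         * advanced debugging enabled. */
        uint32_t dr6 = DR6_FIXED_1 | (1u << 0) | (1u << 14);

        /* The patched path: strip the fixed/reserved bits and BT, then
         * invert RTM because the exit qualification reports the opposite
         * sense of DR6.RTM. */
        uint32_t exit_qual = dr6;
        exit_qual &= ~(DR6_FIXED_1 | DR6_BT);
        exit_qual ^= DR6_RTM;

        /* Prints dr6=0xfffe4ff1 -> exit_qual=0x14001 (B0, BS, bit 16). */
        printf("dr6=%#x -> exit_qual=%#x\n", dr6, exit_qual);
        return 0;
    }

The XOR is what implements the "opposite of DR6.RTM" behaviour described in the message: with DR6.RTM clear (exception inside an RTM region), bit 16 of the exit qualification comes out set, and vice versa.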
diff --git a/patches.arch/kvm-nvmx-restore-host-state-in-nested_vmx_vmexit-for-vmfail b/patches.arch/kvm-nvmx-restore-host-state-in-nested_vmx_vmexit-for-vmfail
new file mode 100644
index 0000000000..5cfc1f78a1
--- /dev/null
+++ b/patches.arch/kvm-nvmx-restore-host-state-in-nested_vmx_vmexit-for-vmfail
@@ -0,0 +1,324 @@
+From: Sean Christopherson <sean.j.christopherson@intel.com>
+Date: Wed, 22 Aug 2018 14:57:07 -0700
+Subject: KVM: nVMX: restore host state in nested_vmx_vmexit for VMFail
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+Git-commit: bd18bffca35397214ae68d85cf7203aca25c3c1d
+Patch-mainline: v4.20-rc1
+References: bsc#1134201
+
+A VMEnter that VMFails (as opposed to VMExits) does not touch host
+state beyond registers that are explicitly noted in the VMFail path,
+e.g. EFLAGS. Host state does not need to be loaded because VMFail
+is only signaled for consistency checks that occur before the CPU
+starts to load guest state, i.e. there is no need to restore any
+state as nothing has been modified. But in the case where a VMFail
+is detected by hardware and not by KVM (due to deferring consistency
+checks to hardware), KVM has already loaded some amount of guest
+state. Luckily, "loaded" only means loaded to KVM's software model,
+i.e. vmcs01 has not been modified. So, unwind our software model to
+the pre-VMEntry host state.
+
+Not restoring host state in this VMFail path leads to a variety of
+failures because we end up with stale data in vcpu->arch, e.g. CR0,
+CR4, EFER, etc... will all be out of sync relative to vmcs01. Any
+significant delta in the stale data is all but guaranteed to crash
+L1, e.g. emulation of SMEP, SMAP, UMIP, WP, etc... will be wrong.
+
+An alternative to this "soft" reload would be to load host state from
+vmcs12 as if we triggered a VMExit (as opposed to VMFail), but that is
+wildly inconsistent with respect to the VMX architecture, e.g. an L1
+VMM with separate VMExit and VMFail paths would explode.
+
+Note that this approach does not mean KVM is 100% accurate with
+respect to VMX hardware behavior, even at an architectural level
+(the exact order of consistency checks is microarchitecture specific).
+But 100% emulation accuracy isn't the goal (with this patch), rather
+the goal is to be consistent in the information delivered to L1, e.g.
+a VMExit should not fall-through VMENTER, and a VMFail should not jump
+to HOST_RIP.
+
+This technically reverts commit "5af4157388ad (KVM: nVMX: Fix mmu
+context after VMLAUNCH/VMRESUME failure)", but retains the core
+aspects of that patch, just in an open coded form due to the need to
+pull state from vmcs01 instead of vmcs12. Restoring host state
+resolves a variety of issues introduced by commit "4f350c6dbcb9
+(kvm: nVMX: Handle deferred early VMLAUNCH/VMRESUME failure properly)",
+which remedied the incorrect behavior of treating VMFail like VMExit
+but in doing so neglected to restore arch state that had been modified
+prior to attempting nested VMEnter.
+
+A sample failure that occurs due to stale vcpu.arch state is a fault
+of some form while emulating an LGDT (due to emulated UMIP) from L1
+after a failed VMEntry to L3, in this case when running the KVM unit
+test test_tpr_threshold_values in L1. L0 also hits a WARN in this
+case due to a stale arch.cr4.UMIP.
+
+L1:
+ BUG: unable to handle kernel paging request at ffffc90000663b9e
+ PGD 276512067 P4D 276512067 PUD 276513067 PMD 274efa067 PTE 8000000271de2163
+ Oops: 0009 [#1] SMP
+ CPU: 5 PID: 12495 Comm: qemu-system-x86 Tainted: G W 4.18.0-rc2+ #2
+ Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 0.0.0 02/06/2015
+ RIP: 0010:native_load_gdt+0x0/0x10
+
+ ...
+
+ Call Trace:
+ load_fixmap_gdt+0x22/0x30
+ __vmx_load_host_state+0x10e/0x1c0 [kvm_intel]
+ vmx_switch_vmcs+0x2d/0x50 [kvm_intel]
+ nested_vmx_vmexit+0x222/0x9c0 [kvm_intel]
+ vmx_handle_exit+0x246/0x15a0 [kvm_intel]
+ kvm_arch_vcpu_ioctl_run+0x850/0x1830 [kvm]
+ kvm_vcpu_ioctl+0x3a1/0x5c0 [kvm]
+ do_vfs_ioctl+0x9f/0x600
+ ksys_ioctl+0x66/0x70
+ __x64_sys_ioctl+0x16/0x20
+ do_syscall_64+0x4f/0x100
+ entry_SYSCALL_64_after_hwframe+0x44/0xa9
+
+L0:
+ WARNING: CPU: 2 PID: 3529 at arch/x86/kvm/vmx.c:6618 handle_desc+0x28/0x30 [kvm_intel]
+ ...
+ CPU: 2 PID: 3529 Comm: qemu-system-x86 Not tainted 4.17.2-coffee+ #76
+ Hardware name: Intel Corporation Kabylake Client platform/KBL S
+ RIP: 0010:handle_desc+0x28/0x30 [kvm_intel]
+
+ ...
+
+ Call Trace:
+ kvm_arch_vcpu_ioctl_run+0x863/0x1840 [kvm]
+ kvm_vcpu_ioctl+0x3a1/0x5c0 [kvm]
+ do_vfs_ioctl+0x9f/0x5e0
+ ksys_ioctl+0x66/0x70
+ __x64_sys_ioctl+0x16/0x20
+ do_syscall_64+0x49/0xf0
+ entry_SYSCALL_64_after_hwframe+0x44/0xa9
+
+Fixes: 5af4157388ad (KVM: nVMX: Fix mmu context after VMLAUNCH/VMRESUME failure)
+Fixes: 4f350c6dbcb9 (kvm: nVMX: Handle deferred early VMLAUNCH/VMRESUME failure properly)
+Cc: Jim Mattson <jmattson@google.com>
+Cc: Krish Sadhukhan <krish.sadhukhan@oracle.com>
+Cc: Paolo Bonzini <pbonzini@redhat.com>
+Cc: Radim Krčmář <rkrcmar@redhat.com>
+Cc: Wanpeng Li <wanpeng.li@hotmail.com>
+Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Acked-by: Joerg Roedel <jroedel@suse.de>
+---
+ arch/x86/kvm/vmx.c | 173 ++++++++++++++++++++++++++++++++++++++++++++++-------
+ 1 file changed, 153 insertions(+), 20 deletions(-)
+
+--- a/arch/x86/kvm/vmx.c
++++ b/arch/x86/kvm/vmx.c
+@@ -11802,24 +11802,6 @@ static void prepare_vmcs12(struct kvm_vc
+ kvm_clear_interrupt_queue(vcpu);
+ }
+
+-static void load_vmcs12_mmu_host_state(struct kvm_vcpu *vcpu,
+- struct vmcs12 *vmcs12)
+-{
+- u32 entry_failure_code;
+-
+- nested_ept_uninit_mmu_context(vcpu);
+-
+- /*
+- * Only PDPTE load can fail as the value of cr3 was checked on entry and
+- * couldn't have changed.
+- */
+- if (nested_vmx_load_cr3(vcpu, vmcs12->host_cr3, false, &entry_failure_code))
+- nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_PDPTE_FAIL);
+-
+- if (!enable_ept)
+- vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault;
+-}
+-
+ /*
+ * A part of what we need to when the nested L2 guest exits and we want to
+ * run its L1 parent, is to reset L1's guest state to the host state specified
+@@ -11833,6 +11815,7 @@ static void load_vmcs12_host_state(struc
+ struct vmcs12 *vmcs12)
+ {
+ struct kvm_segment seg;
++ u32 entry_failure_code;
+
+ if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER)
+ vcpu->arch.efer = vmcs12->host_ia32_efer;
+@@ -11859,7 +11842,17 @@ static void load_vmcs12_host_state(struc
+ vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK);
+ vmx_set_cr4(vcpu, vmcs12->host_cr4);
+
+- load_vmcs12_mmu_host_state(vcpu, vmcs12);
++ nested_ept_uninit_mmu_context(vcpu);
++
++ /*
++ * Only PDPTE load can fail as the value of cr3 was checked on entry and
++ * couldn't have changed.
++ */
++ if (nested_vmx_load_cr3(vcpu, vmcs12->host_cr3, false, &entry_failure_code))
++ nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_PDPTE_FAIL);
++
++ if (!enable_ept)
++ vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault;
+
+ /*
+ * If vmcs01 don't use VPID, CPU flushes TLB on every
+@@ -11958,6 +11951,140 @@ static void load_vmcs12_host_state(struc
+ nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL);
+ }
+
++static inline u64 nested_vmx_get_vmcs01_guest_efer(struct vcpu_vmx *vmx)
++{
++ struct shared_msr_entry *efer_msr;
++ unsigned int i;
++
++ if (vm_entry_controls_get(vmx) & VM_ENTRY_LOAD_IA32_EFER)
++ return vmcs_read64(GUEST_IA32_EFER);
++
++ if (cpu_has_load_ia32_efer)
++ return host_efer;
++
++ for (i = 0; i < vmx->msr_autoload.guest.nr; ++i) {
++ if (vmx->msr_autoload.guest.val[i].index == MSR_EFER)
++ return vmx->msr_autoload.guest.val[i].value;
++ }
++
++ efer_msr = find_msr_entry(vmx, MSR_EFER);
++ if (efer_msr)
++ return efer_msr->data;
++
++ return host_efer;
++}
++
++static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu)
++{
++ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
++ struct vcpu_vmx *vmx = to_vmx(vcpu);
++ struct vmx_msr_entry g, h;
++ struct msr_data msr;
++ gpa_t gpa;
++ u32 i, j;
++
++ vcpu->arch.pat = vmcs_read64(GUEST_IA32_PAT);
++
++ if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) {
++ /*
++ * L1's host DR7 is lost if KVM_GUESTDBG_USE_HW_BP is set
++ * as vmcs01.GUEST_DR7 contains a userspace defined value
++ * and vcpu->arch.dr7 is not squirreled away before the
++ * nested VMENTER (not worth adding a variable in nested_vmx).
++ */
++ if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
++ kvm_set_dr(vcpu, 7, DR7_FIXED_1);
++ else
++ WARN_ON(kvm_set_dr(vcpu, 7, vmcs_readl(GUEST_DR7)));
++ }
++
++ /*
++ * Note that calling vmx_set_{efer,cr0,cr4} is important as they
++ * handle a variety of side effects to KVM's software model.
++ */
++ vmx_set_efer(vcpu, nested_vmx_get_vmcs01_guest_efer(vmx));
++
++ vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS;
++ vmx_set_cr0(vcpu, vmcs_readl(CR0_READ_SHADOW));
++
++ vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK);
++ vmx_set_cr4(vcpu, vmcs_readl(CR4_READ_SHADOW));
++
++ nested_ept_uninit_mmu_context(vcpu);
++ vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
++ __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
++
++ /*
++ * Use ept_save_pdptrs(vcpu) to load the MMU's cached PDPTRs
++ * from vmcs01 (if necessary). The PDPTRs are not loaded on
++ * VMFail, like everything else we just need to ensure our
++ * software model is up-to-date.
++ */
++ ept_save_pdptrs(vcpu);
++
++ kvm_mmu_reset_context(vcpu);
++
++ if (cpu_has_vmx_msr_bitmap())
++ vmx_update_msr_bitmap(vcpu);
++
++ /*
++ * This nasty bit of open coding is a compromise between blindly
++ * loading L1's MSRs using the exit load lists (incorrect emulation
++ * of VMFail), leaving the nested VM's MSRs in the software model
++ * (incorrect behavior) and snapshotting the modified MSRs (too
++ * expensive since the lists are unbound by hardware). For each
++ * MSR that was (prematurely) loaded from the nested VMEntry load
++ * list, reload it from the exit load list if it exists and differs
++ * from the guest value. The intent is to stuff host state as
++ * silently as possible, not to fully process the exit load list.
++ */
++ msr.host_initiated = false;
++ for (i = 0; i < vmcs12->vm_entry_msr_load_count; i++) {
++ gpa = vmcs12->vm_entry_msr_load_addr + (i * sizeof(g));
++ if (kvm_vcpu_read_guest(vcpu, gpa, &g, sizeof(g))) {
++ pr_debug_ratelimited(
++ "%s read MSR index failed (%u, 0x%08llx)\n",
++ __func__, i, gpa);
++ goto vmabort;
++ }
++
++ for (j = 0; j < vmcs12->vm_exit_msr_load_count; j++) {
++ gpa = vmcs12->vm_exit_msr_load_addr + (j * sizeof(h));
++ if (kvm_vcpu_read_guest(vcpu, gpa, &h, sizeof(h))) {
++ pr_debug_ratelimited(
++ "%s read MSR failed (%u, 0x%08llx)\n",
++ __func__, j, gpa);
++ goto vmabort;
++ }
++ if (h.index != g.index)
++ continue;
++ if (h.value == g.value)
++ break;
++
++ if (nested_vmx_load_msr_check(vcpu, &h)) {
++ pr_debug_ratelimited(
++ "%s check failed (%u, 0x%x, 0x%x)\n",
++ __func__, j, h.index, h.reserved);
++ goto vmabort;
++ }
++
++ msr.index = h.index;
++ msr.data = h.value;
++ if (kvm_set_msr(vcpu, &msr)) {
++ pr_debug_ratelimited(
++ "%s WRMSR failed (%u, 0x%x, 0x%llx)\n",
++ __func__, j, h.index, h.value);
++ goto vmabort;
++ }
++ }
++ }
++
++ return;
++
++vmabort:
++ nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL);
++}
++
+ /*
+ * Emulate an exit from nested guest (L2) to L1, i.e., prepare to run L1
+ * and modify vmcs12 to make it see what it would expect to see there if
+@@ -12094,7 +12221,13 @@ static void nested_vmx_vmexit(struct kvm
+ */
+ nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
+
+- load_vmcs12_mmu_host_state(vcpu, vmcs12);
++ /*
++ * Restore L1's host state to KVM's software model. We're here
++ * because a consistency check was caught by hardware, which
++ * means some amount of guest state has been propagated to KVM's
++ * model and needs to be unwound to the host's state.
++ */
++ nested_vmx_restore_host_state(vcpu);
+
+ /*
+ * The emulated instruction was already skipped in
+
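The MSR handling in nested_vmx_restore_host_state() above is essentially a cross-reference between the VM-entry and VM-exit MSR load lists. A simplified standalone sketch of that matching logic (plain arrays instead of guest memory, invented names, error handling omitted) is:

    #include <stdio.h>
    #include <stdint.h>

    struct msr_entry {
        uint32_t index;
        uint64_t value;
    };

    /* For every MSR the (failed) nested VM-entry already loaded, write the
     * host value back only if the VM-exit load list names the same MSR with
     * a different value -- the goal is to quietly restore host state, not
     * to fully process the exit load list. */
    static void restore_host_msrs(const struct msr_entry *entry_list, int entry_cnt,
                                  const struct msr_entry *exit_list, int exit_cnt)
    {
        for (int i = 0; i < entry_cnt; i++) {
            for (int j = 0; j < exit_cnt; j++) {
                if (exit_list[j].index != entry_list[i].index)
                    continue;                 /* different MSR, keep looking */
                if (exit_list[j].value == entry_list[i].value)
                    break;                    /* value already matches host  */
                /* Stand-in for kvm_set_msr(): reload the host value. */
                printf("WRMSR %#x <- %#llx\n", exit_list[j].index,
                       (unsigned long long)exit_list[j].value);
                break;
            }
        }
    }

    int main(void)
    {
        /* Hypothetical lists: the entry list loaded EFER with the guest's
         * value, the exit list carries the host value to restore. */
        const struct msr_entry entry_list[] = { { 0xc0000080u, 0x500 } };
        const struct msr_entry exit_list[]  = { { 0xc0000080u, 0xd01 } };

        restore_host_msrs(entry_list, 1, exit_list, 1);
        return 0;
    }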
diff --git a/patches.arch/kvm-vmx-zero-out-all-general-purpose-registers-after-vm-exit b/patches.arch/kvm-vmx-zero-out-all-general-purpose-registers-after-vm-exit
new file mode 100644
index 0000000000..17e541bc71
--- /dev/null
+++ b/patches.arch/kvm-vmx-zero-out-all-general-purpose-registers-after-vm-exit
@@ -0,0 +1,61 @@
+From: Sean Christopherson <sean.j.christopherson@intel.com>
+Date: Fri, 25 Jan 2019 07:40:50 -0800
+Subject: KVM: VMX: Zero out *all* general purpose registers after VM-Exit
+Git-commit: 0e0ab73c9a0243736bcd779b30b717e23ba9a56d
+Patch-mainline: v5.1-rc1
+References: bsc#1134202
+
+...except RSP, which is restored by hardware as part of VM-Exit.
+
+Paolo theorized that restoring registers from the stack after a VM-Exit
+in lieu of zeroing them could lead to speculative execution with the
+guest's values, e.g. if the stack accesses miss the L1 cache[1].
+Zeroing XORs are dirt cheap, so just be ultra-paranoid.
+
+Note that the scratch register (currently RCX) used to save/restore the
+guest state is also zeroed as its host-defined value is loaded via the
+stack, just with a MOV instead of a POP.
+
+[1] https://patchwork.kernel.org/patch/10771539/#22441255
+
+Fixes: 0cb5b30698fd ("kvm: vmx: Scrub hardware GPRs at VM-exit")
+Cc: <stable@vger.kernel.org>
+Cc: Jim Mattson <jmattson@google.com>
+Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Acked-by: Joerg Roedel <jroedel@suse.de>
+---
+ arch/x86/kvm/vmx/vmx.c | 14 +++++++++++---
+ 1 file changed, 11 insertions(+), 3 deletions(-)
+
+--- a/arch/x86/kvm/vmx.c
++++ b/arch/x86/kvm/vmx.c
+@@ -9764,6 +9764,15 @@ static void __noclone vmx_vcpu_run(struc
+ "mov %%r13, %c[r13](%0) \n\t"
+ "mov %%r14, %c[r14](%0) \n\t"
+ "mov %%r15, %c[r15](%0) \n\t"
++
++ /*
++ * Clear all general purpose registers (except RSP, which is loaded by
++ * the CPU during VM-Exit) to prevent speculative use of the guest's
++ * values, even those that are saved/loaded via the stack. In theory,
++ * an L1 cache miss when restoring registers could lead to speculative
++ * execution with the guest's values. Zeroing XORs are dirt cheap,
++ * i.e. the extra paranoia is essentially free.
++ */
+ "xor %%r8d, %%r8d \n\t"
+ "xor %%r9d, %%r9d \n\t"
+ "xor %%r10d, %%r10d \n\t"
+@@ -9778,8 +9787,11 @@ static void __noclone vmx_vcpu_run(struc
+
+ "xor %%eax, %%eax \n\t"
+ "xor %%ebx, %%ebx \n\t"
++ "xor %%ecx, %%ecx \n\t"
++ "xor %%edx, %%edx \n\t"
+ "xor %%esi, %%esi \n\t"
+ "xor %%edi, %%edi \n\t"
++ "xor %%ebp, %%ebp \n\t"
+ "pop %%" _ASM_BP "; pop %%" _ASM_DX " \n\t"
+ ".pushsection .rodata \n\t"
+ ".global vmx_return \n\t"
+
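A side note on the XORs above (my observation, not part of the commit message): the added instructions clear the 64-bit registers through their 32-bit aliases, e.g. "xor %%r8d, %%r8d". That is sufficient because 32-bit operations on x86-64 zero-extend into the full register, and the 32-bit form has a shorter encoding. A minimal check, assuming GCC-style inline asm (the %k operand modifier selects the 32-bit register name):

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
        uint64_t val = 0xdeadbeefcafef00dULL;

        /* XORing the 32-bit alias of a register with itself zero-extends,
         * wiping all 64 bits -- which is why the VM-Exit path can use the
         * shorter r8d..r15d forms and still scrub r8..r15 completely. */
        asm("xor %k0, %k0" : "+r"(val));

        printf("after 32-bit xor: %#llx\n", (unsigned long long)val); /* 0 */
        return 0;
    }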
diff --git a/patches.arch/kvm-x86-always-use-32-bit-smram-save-state-for-32-bit-kernels b/patches.arch/kvm-x86-always-use-32-bit-smram-save-state-for-32-bit-kernels
new file mode 100644
index 0000000000..c6093821f9
--- /dev/null
+++ b/patches.arch/kvm-x86-always-use-32-bit-smram-save-state-for-32-bit-kernels
@@ -0,0 +1,153 @@
+From: Sean Christopherson <sean.j.christopherson@intel.com>
+Date: Tue, 2 Apr 2019 08:10:48 -0700
+Subject: KVM: x86: Always use 32-bit SMRAM save state for 32-bit kernels
+Git-commit: b68f3cc7d978943fcf85148165b00594c38db776
+Patch-mainline: v5.1-rc6
+References: bsc#1134203
+
+Invoking the 64-bit variation on a 32-bit kernel will crash the guest,
+trigger a WARN, and/or lead to a buffer overrun in the host, e.g.
+rsm_load_state_64() writes r8-r15 unconditionally, but enum kvm_reg and
+thus x86_emulate_ctxt._regs only define r8-r15 for CONFIG_X86_64.
+
+KVM allows userspace to report long mode support via CPUID, even though
+the guest is all but guaranteed to crash if it actually tries to enable
+long mode. But, a pure 32-bit guest that is ignorant of long mode will
+happily plod along.
+
+SMM complicates things as 64-bit CPUs use a different SMRAM save state
+area. KVM handles this correctly for 64-bit kernels, e.g. uses the
+legacy save state map if userspace has hidden long mode from the guest,
+but doesn't fare well when userspace reports long mode support on a
+32-bit host kernel (32-bit KVM doesn't support 64-bit guests).
+
+Since the alternative is to crash the guest, e.g. by not loading state
+or explicitly requesting shutdown, unconditionally use the legacy SMRAM
+save state map for 32-bit KVM. If a guest has managed to get far enough
+to handle SMIs when running under a weird/buggy userspace hypervisor,
+then don't deliberately crash the guest since there are no downsides
+(from KVM's perspective) to allow it to continue running.
+
+Fixes: 660a5d517aaab ("KVM: x86: save/load state on SMM switch")
+Cc: stable@vger.kernel.org
+Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Acked-by: Joerg Roedel <jroedel@suse.de>
+---
+ arch/x86/kvm/emulate.c | 10 ++++++++++
+ arch/x86/kvm/x86.c | 10 ++++++----
+ 2 files changed, 16 insertions(+), 4 deletions(-)
+
+--- a/arch/x86/kvm/emulate.c
++++ b/arch/x86/kvm/emulate.c
+@@ -2330,12 +2330,16 @@ static int em_lseg(struct x86_emulate_ct
+
+ static int emulator_has_longmode(struct x86_emulate_ctxt *ctxt)
+ {
++#ifdef CONFIG_X86_64
+ u32 eax, ebx, ecx, edx;
+
+ eax = 0x80000001;
+ ecx = 0;
+ ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx, false);
+ return edx & bit(X86_FEATURE_LM);
++#else
++ return false;
++#endif
+ }
+
+ #define GET_SMSTATE(type, smbase, offset) \
+@@ -2380,6 +2384,7 @@ static int rsm_load_seg_32(struct x86_em
+ return X86EMUL_CONTINUE;
+ }
+
++#ifdef CONFIG_X86_64
+ static int rsm_load_seg_64(struct x86_emulate_ctxt *ctxt, u64 smbase, int n)
+ {
+ struct desc_struct desc;
+@@ -2398,6 +2403,7 @@ static int rsm_load_seg_64(struct x86_em
+ ctxt->ops->set_segment(ctxt, selector, &desc, base3, n);
+ return X86EMUL_CONTINUE;
+ }
++#endif
+
+ static int rsm_enter_protected_mode(struct x86_emulate_ctxt *ctxt,
+ u64 cr0, u64 cr3, u64 cr4)
+@@ -2498,6 +2504,8 @@ static int rsm_load_state_32(struct x86_
+ return rsm_enter_protected_mode(ctxt, cr0, cr3, cr4);
+ }
+
++
++#ifdef CONFIG_X86_64
+ static int rsm_load_state_64(struct x86_emulate_ctxt *ctxt, u64 smbase)
+ {
+ struct desc_struct desc;
+@@ -2559,6 +2567,7 @@ static int rsm_load_state_64(struct x86_
+
+ return X86EMUL_CONTINUE;
+ }
++#endif
+
+ static int em_rsm(struct x86_emulate_ctxt *ctxt)
+ {
+@@ -2615,9 +2624,11 @@ static int em_rsm(struct x86_emulate_ctx
+ if (ctxt->ops->pre_leave_smm(ctxt, smbase))
+ return X86EMUL_UNHANDLEABLE;
+
++#ifdef CONFIG_X86_64
+ if (emulator_has_longmode(ctxt))
+ ret = rsm_load_state_64(ctxt, smbase + 0x8000);
+ else
++#endif
+ ret = rsm_load_state_32(ctxt, smbase + 0x8000);
+
+ if (ret != X86EMUL_CONTINUE) {
+--- a/arch/x86/kvm/x86.c
++++ b/arch/x86/kvm/x86.c
+@@ -6787,9 +6787,9 @@ static void enter_smm_save_state_32(stru
+ put_smstate(u32, buf, 0x7ef8, vcpu->arch.smbase);
+ }
+
++#ifdef CONFIG_X86_64
+ static void enter_smm_save_state_64(struct kvm_vcpu *vcpu, char *buf)
+ {
+-#ifdef CONFIG_X86_64
+ struct desc_ptr dt;
+ struct kvm_segment seg;
+ unsigned long val;
+@@ -6839,10 +6839,8 @@ static void enter_smm_save_state_64(stru
+
+ for (i = 0; i < 6; i++)
+ enter_smm_save_seg_64(vcpu, buf, i);
+-#else
+- WARN_ON_ONCE(1);
+-#endif
+ }
++#endif
+
+ static void enter_smm(struct kvm_vcpu *vcpu)
+ {
+@@ -6853,9 +6851,11 @@ static void enter_smm(struct kvm_vcpu *v
+
+ trace_kvm_enter_smm(vcpu->vcpu_id, vcpu->arch.smbase, true);
+ memset(buf, 0, 512);
++#ifdef CONFIG_X86_64
+ if (guest_cpuid_has(vcpu, X86_FEATURE_LM))
+ enter_smm_save_state_64(vcpu, buf);
+ else
++#endif
+ enter_smm_save_state_32(vcpu, buf);
+
+ /*
+@@ -6913,8 +6913,10 @@ static void enter_smm(struct kvm_vcpu *v
+ kvm_set_segment(vcpu, &ds, VCPU_SREG_GS);
+ kvm_set_segment(vcpu, &ds, VCPU_SREG_SS);
+
++#ifdef CONFIG_X86_64
+ if (guest_cpuid_has(vcpu, X86_FEATURE_LM))
+ kvm_x86_ops->set_efer(vcpu, 0);
++#endif
+
+ kvm_update_cpuid(vcpu);
+ kvm_mmu_reset_context(vcpu);
+
diff --git a/patches.arch/kvm-x86-don-t-clear-efer-during-smm-transitions-for-32-bit-vcpu b/patches.arch/kvm-x86-don-t-clear-efer-during-smm-transitions-for-32-bit-vcpu
new file mode 100644
index 0000000000..cae18225b6
--- /dev/null
+++ b/patches.arch/kvm-x86-don-t-clear-efer-during-smm-transitions-for-32-bit-vcpu
@@ -0,0 +1,78 @@
+From: Sean Christopherson <sean.j.christopherson@intel.com>
+Date: Tue, 2 Apr 2019 08:10:47 -0700
+Subject: KVM: x86: Don't clear EFER during SMM transitions for 32-bit vCPU
+Git-commit: 8f4dc2e77cdfaf7e644ef29693fa229db29ee1de
+Patch-mainline: v5.1-rc6
+References: bsc#1134204
+
+Neither AMD nor Intel CPUs have an EFER field in the legacy SMRAM save
+state area, i.e. don't save/restore EFER across SMM transitions. KVM
+somewhat models this, e.g. doesn't clear EFER on entry to SMM if the
+guest doesn't support long mode. But during RSM, KVM unconditionally
+clears EFER so that it can get back to pure 32-bit mode in order to
+start loading CRs with their actual non-SMM values.
+
+Clear EFER only when it will be written when loading the non-SMM state
+so as to preserve bits that can theoretically be set on 32-bit vCPUs,
+e.g. KVM always emulates EFER_SCE.
+
+And because CR4.PAE is cleared only to play nice with EFER, wrap that
+code in the long mode check as well. Note, this may result in a
+compiler warning about cr4 being consumed uninitialized. Re-read CR4
+even though it's technically unnecessary, as doing so allows for more
+readable code and RSM emulation is not a performance critical path.
+
+Fixes: 660a5d517aaab ("KVM: x86: save/load state on SMM switch")
+Cc: stable@vger.kernel.org
+Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Acked-by: Joerg Roedel <jroedel@suse.de>
+---
+ arch/x86/kvm/emulate.c | 21 +++++++++++----------
+ 1 file changed, 11 insertions(+), 10 deletions(-)
+
+--- a/arch/x86/kvm/emulate.c
++++ b/arch/x86/kvm/emulate.c
+@@ -2574,15 +2574,13 @@ static int em_rsm(struct x86_emulate_ctx
+ * CR0/CR3/CR4/EFER. It's all a bit more complicated if the vCPU
+ * supports long mode.
+ */
+- cr4 = ctxt->ops->get_cr(ctxt, 4);
+ if (emulator_has_longmode(ctxt)) {
+ struct desc_struct cs_desc;
+
+ /* Zero CR4.PCIDE before CR0.PG. */
+- if (cr4 & X86_CR4_PCIDE) {
++ cr4 = ctxt->ops->get_cr(ctxt, 4);
++ if (cr4 & X86_CR4_PCIDE)
+ ctxt->ops->set_cr(ctxt, 4, cr4 & ~X86_CR4_PCIDE);
+- cr4 &= ~X86_CR4_PCIDE;
+- }
+
+ /* A 32-bit code segment is required to clear EFER.LMA. */
+ memset(&cs_desc, 0, sizeof(cs_desc));
+@@ -2596,13 +2594,16 @@ static int em_rsm(struct x86_emulate_ctx
+ if (cr0 & X86_CR0_PE)
+ ctxt->ops->set_cr(ctxt, 0, cr0 & ~(X86_CR0_PG | X86_CR0_PE));
+
+- /* Now clear CR4.PAE (which must be done before clearing EFER.LME). */
+- if (cr4 & X86_CR4_PAE)
+- ctxt->ops->set_cr(ctxt, 4, cr4 & ~X86_CR4_PAE);
+-
+- /* And finally go back to 32-bit mode. */
+- efer = 0;
+- ctxt->ops->set_msr(ctxt, MSR_EFER, efer);
++ if (emulator_has_longmode(ctxt)) {
++ /* Clear CR4.PAE before clearing EFER.LME. */
++ cr4 = ctxt->ops->get_cr(ctxt, 4);
++ if (cr4 & X86_CR4_PAE)
++ ctxt->ops->set_cr(ctxt, 4, cr4 & ~X86_CR4_PAE);
++
++ /* And finally go back to 32-bit mode. */
++ efer = 0;
++ ctxt->ops->set_msr(ctxt, MSR_EFER, efer);
++ }
+
+ smbase = ctxt->ops->get_smbase(ctxt);
+
+
diff --git a/patches.arch/kvm-x86-svm-make-sure-nmi-is-injected-after-nmi_singlestep b/patches.arch/kvm-x86-svm-make-sure-nmi-is-injected-after-nmi_singlestep
new file mode 100644
index 0000000000..64f4253cef
--- /dev/null
+++ b/patches.arch/kvm-x86-svm-make-sure-nmi-is-injected-after-nmi_singlestep
@@ -0,0 +1,50 @@
+From: Vitaly Kuznetsov <vkuznets@redhat.com>
+Date: Wed, 3 Apr 2019 16:06:42 +0200
+Subject: KVM: x86: svm: make sure NMI is injected after nmi_singlestep
+Git-commit: 99c221796a810055974b54c02e8f53297e48d146
+Patch-mainline: v5.1-rc6
+References: bsc#1134205
+
+I noticed that the apic test from kvm-unit-tests always hangs on my EPYC 7401P;
+the hanging test, nmi-after-sti, tries to deliver 30000 NMIs and tracing
+shows that we're sometimes able to deliver a few but never all.
+
+When we're trying to inject an NMI we may fail to do so immediately for
+various reasons, however, we still need to inject it so enable_nmi_window()
+arms nmi_singlestep mode. #DB occurs as expected, but we're not checking
+for pending NMIs before entering the guest and unless there's a different
+event to process, the NMI will never get delivered.
+
+Make KVM_REQ_EVENT request on the vCPU from db_interception() to make sure
+pending NMIs are checked and possibly injected.
+
+Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com>
+Cc: stable@vger.kernel.org
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Acked-by: Joerg Roedel <jroedel@suse.de>
+---
+ arch/x86/kvm/svm.c | 3 +++
+ 1 file changed, 3 insertions(+)
+
+diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
+index 933f19d840fe..c6815aef2cac 100644
+--- a/arch/x86/kvm/svm.c
++++ b/arch/x86/kvm/svm.c
+@@ -2693,6 +2693,7 @@ static int npf_interception(struct vcpu_svm *svm)
+ static int db_interception(struct vcpu_svm *svm)
+ {
+ struct kvm_run *kvm_run = svm->vcpu.run;
++ struct kvm_vcpu *vcpu = &svm->vcpu;
+
+ if (!(svm->vcpu.guest_debug &
+ (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) &&
+@@ -2703,6 +2704,8 @@ static int db_interception(struct vcpu_svm *svm)
+
+ if (svm->nmi_singlestep) {
+ disable_nmi_singlestep(svm);
++ /* Make sure we check for pending NMIs upon entry */
++ kvm_make_request(KVM_REQ_EVENT, vcpu);
+ }
+
+ if (svm->vcpu.guest_debug &
+
diff --git a/patches.fixes/0001-btrfs-qgroup-Move-reserved-data-accounting-from-btrf.patch b/patches.fixes/0001-btrfs-qgroup-Move-reserved-data-accounting-from-btrf.patch
new file mode 100644
index 0000000000..a1afb3c4ac
--- /dev/null
+++ b/patches.fixes/0001-btrfs-qgroup-Move-reserved-data-accounting-from-btrf.patch
@@ -0,0 +1,307 @@
+From 1418bae1c22951aad9883bc8f8f4dccb272cce1e Mon Sep 17 00:00:00 2001
+From: Qu Wenruo <wqu@suse.com>
+Date: Wed, 23 Jan 2019 15:15:12 +0800
+Git-commit: 1418bae1c22951aad9883bc8f8f4dccb272cce1e
+References: bsc#1134162
+Patch-mainline: v5.1-rc1
+Subject: [PATCH] btrfs: qgroup: Move reserved data accounting from
+ btrfs_delayed_ref_head to btrfs_qgroup_extent_record
+
+[BUG]
+Btrfs/139 will fail with a high probability if the testing machine (VM)
+has only 2G RAM.
+
+The final write succeeds when it should fail with EDQUOT, and the fs ends
+up exceeding its quota limit by 16K.
+
+The simplified reproducer will be: (needs a 2G ram VM)
+
+ $ mkfs.btrfs -f $dev
+ $ mount $dev $mnt
+
+ $ btrfs subv create $mnt/subv
+ $ btrfs quota enable $mnt
+ $ btrfs quota rescan -w $mnt
+ $ btrfs qgroup limit -e 1G $mnt/subv
+
+ $ for i in $(seq -w 1 8); do
+ xfs_io -f -c "pwrite 0 128M" $mnt/subv/file_$i > /dev/null
+ echo "file $i written" > /dev/kmsg
+ done
+ $ sync
+ $ btrfs qgroup show -pcre --raw $mnt
+
+The last pwrite will not trigger EDQUOT and final 'qgroup show' will
+show something like:
+
+ qgroupid         rfer         excl     max_rfer     max_excl parent  child
+ --------         ----         ----     --------     -------- ------  -----
+ 0/5             16384        16384         none         none ---     ---
+ 0/256      1073758208   1073758208         none   1073741824 ---     ---
+
+And 1073758208 is larger than
+1073741824, the 1 GiB limit.
+
+[CAUSE]
+It's a bug in btrfs qgroup data reserved space management.
+
+For quota limit, we must ensure that:
+ reserved (data + metadata) + rfer/excl <= limit
+
+Since rfer/excl is only updated at transaction commit time, reserved
+space needs to be taken special care.
+
+One important part of reserved space is data, and for a new data extent
+written to disk, we still need to take the reserved space until
+rfer/excl numbers get updated.
+
+Originally when an ordered extent finishes, we migrate the reserved
+qgroup data space from extent_io tree to delayed ref head of the data
+extent, expecting delayed ref will only be cleaned up at commit
+transaction time.
+
+However, on machines with little RAM, memory pressure can cause dirty pages to be
+flushed back to disk without committing a transaction.
+
+The related events will be something like:
+
+ file 1 written
+ btrfs_finish_ordered_io: ino=258 ordered offset=0 len=54947840
+ btrfs_finish_ordered_io: ino=258 ordered offset=54947840 len=5636096
+ btrfs_finish_ordered_io: ino=258 ordered offset=61153280 len=57344
+ btrfs_finish_ordered_io: ino=258 ordered offset=61210624 len=8192
+ btrfs_finish_ordered_io: ino=258 ordered offset=60583936 len=569344
+ cleanup_ref_head: num_bytes=54947840
+ cleanup_ref_head: num_bytes=5636096
+ cleanup_ref_head: num_bytes=569344
+ cleanup_ref_head: num_bytes=57344
+ cleanup_ref_head: num_bytes=8192
+ ^^^^^^^^^^^^^^^^ This will free qgroup data reserved space
+ file 2 written
+ ...
+ file 8 written
+ cleanup_ref_head: num_bytes=8192
+ ...
+ btrfs_commit_transaction <<< the only transaction committed during
+ the test
+
+When file 2 is written, we have already freed 128M reserved qgroup data
+space for ino 258, so later writes won't trigger EDQUOT.
+
+This allows us to write more data beyond the qgroup limit.
+
+In my 2G ram VM, it could reach about 1.2G before hitting EDQUOT.
+
+[FIX]
+By moving reserved qgroup data space from btrfs_delayed_ref_head to
+btrfs_qgroup_extent_record, we can ensure that reserved qgroup data
+space won't be freed half way before commit transaction, thus fix the
+problem.
+
+Fixes: f64d5ca86821 ("btrfs: delayed_ref: Add new function to record reserved space into delayed ref")
+Signed-off-by: Qu Wenruo <wqu@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+---
+ fs/btrfs/delayed-ref.c | 12 ++++--------
+ fs/btrfs/delayed-ref.h | 11 -----------
+ fs/btrfs/extent-tree.c | 3 ---
+ fs/btrfs/qgroup.c | 19 +++++++++++++++----
+ fs/btrfs/qgroup.h | 18 +++++++++++-------
+ include/trace/events/btrfs.h | 29 -----------------------------
+ 6 files changed, 30 insertions(+), 62 deletions(-)
+
+--- a/fs/btrfs/delayed-ref.c
++++ b/fs/btrfs/delayed-ref.c
+@@ -618,14 +618,12 @@ add_delayed_ref_head(struct btrfs_fs_inf
+ INIT_LIST_HEAD(&head_ref->ref_add_list);
+ head_ref->processing = 0;
+ head_ref->total_ref_mod = count_mod;
+- head_ref->qgroup_reserved = 0;
+- head_ref->qgroup_ref_root = 0;
+
+ /* Record qgroup extent info if provided */
+ if (qrecord) {
+ if (ref_root && reserved) {
+- head_ref->qgroup_ref_root = ref_root;
+- head_ref->qgroup_reserved = reserved;
++ qrecord->data_rsv = reserved;
++ qrecord->data_rsv_refroot = ref_root;
+ }
+
+ qrecord->bytenr = bytenr;
+@@ -647,8 +645,6 @@ add_delayed_ref_head(struct btrfs_fs_inf
+ existing = htree_insert(&delayed_refs->href_root,
+ &head_ref->href_node);
+ if (existing) {
+- WARN_ON(ref_root && reserved && existing->qgroup_ref_root
+- && existing->qgroup_reserved);
+ update_existing_head_ref(delayed_refs, &existing->node, ref,
+ old_ref_mod);
+ /*
+@@ -815,7 +811,7 @@ int btrfs_add_delayed_tree_ref(struct bt
+
+ if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) &&
+ is_fstree(ref_root)) {
+- record = kmalloc(sizeof(*record), GFP_NOFS);
++ record = kzalloc(sizeof(*record), GFP_NOFS);
+ if (!record)
+ goto free_head_ref;
+ }
+@@ -878,7 +874,7 @@ int btrfs_add_delayed_data_ref(struct bt
+
+ if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) &&
+ is_fstree(ref_root)) {
+- record = kmalloc(sizeof(*record), GFP_NOFS);
++ record = kzalloc(sizeof(*record), GFP_NOFS);
+ if (!record) {
+ kmem_cache_free(btrfs_delayed_data_ref_cachep, ref);
+ kmem_cache_free(btrfs_delayed_ref_head_cachep,
+--- a/fs/btrfs/delayed-ref.h
++++ b/fs/btrfs/delayed-ref.h
+@@ -116,17 +116,6 @@ struct btrfs_delayed_ref_head {
+ int total_ref_mod;
+
+ /*
+- * For qgroup reserved space freeing.
+- *
+- * ref_root and reserved will be recorded after
+- * BTRFS_ADD_DELAYED_EXTENT is called.
+- * And will be used to free reserved qgroup space at
+- * run_delayed_refs() time.
+- */
+- u64 qgroup_ref_root;
+- u64 qgroup_reserved;
+-
+- /*
+ * when a new extent is allocated, it is just reserved in memory
+ * The actual extent isn't inserted into the extent allocation tree
+ * until the delayed ref is processed. must_insert_reserved is
+--- a/fs/btrfs/extent-tree.c
++++ b/fs/btrfs/extent-tree.c
+@@ -2465,9 +2465,6 @@ static int run_one_delayed_ref(struct bt
+ }
+ }
+
+- /* Also free its reserved qgroup space */
+- btrfs_qgroup_free_delayed_ref(fs_info, head->qgroup_ref_root,
+- head->qgroup_reserved);
+ return ret;
+ }
+
+--- a/fs/btrfs/qgroup.c
++++ b/fs/btrfs/qgroup.c
+@@ -1488,12 +1488,18 @@ int btrfs_qgroup_trace_extent_nolock(str
+ parent_node = *p;
+ entry = rb_entry(parent_node, struct btrfs_qgroup_extent_record,
+ node);
+- if (bytenr < entry->bytenr)
++ if (bytenr < entry->bytenr) {
+ p = &(*p)->rb_left;
+- else if (bytenr > entry->bytenr)
++ } else if (bytenr > entry->bytenr) {
+ p = &(*p)->rb_right;
+- else
++ } else {
++ if (record->data_rsv && !entry->data_rsv) {
++ entry->data_rsv = record->data_rsv;
++ entry->data_rsv_refroot =
++ record->data_rsv_refroot;
++ }
+ return 1;
++ }
+ }
+
+ rb_link_node(&record->node, parent_node, p);
+@@ -1536,7 +1542,7 @@ int btrfs_qgroup_trace_extent(struct btr
+ return 0;
+ if (WARN_ON(trans == NULL))
+ return -EINVAL;
+- record = kmalloc(sizeof(*record), gfp_flag);
++ record = kzalloc(sizeof(*record), gfp_flag);
+ if (!record)
+ return -ENOMEM;
+
+@@ -2460,6 +2466,11 @@ int btrfs_qgroup_account_extents(struct
+ goto cleanup;
+ }
+
++ /* Free the reserved data space */
++ btrfs_qgroup_free_refroot(fs_info,
++ record->data_rsv_refroot,
++ record->data_rsv,
++ BTRFS_QGROUP_RSV_DATA);
+ /*
+ * Use SEQ_LAST as time_seq to do special search, which
+ * doesn't lock tree or delayed_refs and search current
+--- a/fs/btrfs/qgroup.h
++++ b/fs/btrfs/qgroup.h
+@@ -120,6 +120,17 @@ struct btrfs_qgroup_extent_record {
+ struct rb_node node;
+ u64 bytenr;
+ u64 num_bytes;
++
++ /*
++ * For qgroup reserved data space freeing.
++ *
++ * @data_rsv_refroot and @data_rsv will be recorded after
++ * BTRFS_ADD_DELAYED_EXTENT is called.
++ * And will be used to free reserved qgroup space at
++ * transaction commit time.
++ */
++ u32 data_rsv; /* reserved data space needs to be freed */
++ u64 data_rsv_refroot; /* which root the reserved data belongs to */
+ struct ulist *old_roots;
+ };
+
+@@ -353,13 +364,6 @@ int btrfs_qgroup_inherit(struct btrfs_tr
+ void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info,
+ u64 ref_root, u64 num_bytes,
+ enum btrfs_qgroup_rsv_type type);
+-static inline void btrfs_qgroup_free_delayed_ref(struct btrfs_fs_info *fs_info,
+- u64 ref_root, u64 num_bytes)
+-{
+- trace_btrfs_qgroup_free_delayed_ref(fs_info, ref_root, num_bytes);
+- btrfs_qgroup_free_refroot(fs_info, ref_root, num_bytes,
+- BTRFS_QGROUP_RSV_DATA);
+-}
+
+ #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+ int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid,
+--- a/include/trace/events/btrfs.h
++++ b/include/trace/events/btrfs.h
+@@ -1516,35 +1516,6 @@ DEFINE_EVENT(btrfs__qgroup_rsv_data, btr
+ TP_ARGS(inode, start, len, reserved, op)
+ );
+
+-DECLARE_EVENT_CLASS(btrfs__qgroup_delayed_ref,
+-
+- TP_PROTO(const struct btrfs_fs_info *fs_info,
+- u64 ref_root, u64 reserved),
+-
+- TP_ARGS(fs_info, ref_root, reserved),
+-
+- TP_STRUCT__entry_btrfs(
+- __field( u64, ref_root )
+- __field( u64, reserved )
+- ),
+-
+- TP_fast_assign_btrfs(fs_info,
+- __entry->ref_root = ref_root;
+- __entry->reserved = reserved;
+- ),
+-
+- TP_printk_btrfs("root=%llu reserved=%llu op=free",
+- __entry->ref_root, __entry->reserved)
+-);
+-
+-DEFINE_EVENT(btrfs__qgroup_delayed_ref, btrfs_qgroup_free_delayed_ref,
+-
+- TP_PROTO(const struct btrfs_fs_info *fs_info,
+- u64 ref_root, u64 reserved),
+-
+- TP_ARGS(fs_info, ref_root, reserved)
+-);
+-
+ DECLARE_EVENT_CLASS(btrfs_qgroup_extent,
+ TP_PROTO(const struct btrfs_fs_info *fs_info,
+ const struct btrfs_qgroup_extent_record *rec),
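The invariant quoted in the commit message, reserved (data + metadata) + rfer/excl <= limit, can be sanity-checked against the numbers from the reproducer. A standalone sketch (not btrfs code; qgroup_reserve() here is an invented stand-in for the real enforcement):

    #include <stdio.h>
    #include <stdint.h>

    /* Invented stand-in for the qgroup limit check: a reservation must
     * keep reserved + rfer within the configured limit, otherwise the
     * caller gets an EDQUOT-style failure. */
    static int qgroup_reserve(uint64_t rfer, uint64_t reserved,
                              uint64_t limit, uint64_t request)
    {
        if (rfer + reserved + request > limit)
            return -1;   /* over the limit -> refuse (EDQUOT) */
        return 0;
    }

    int main(void)
    {
        const uint64_t limit = 1073741824ULL;   /* 1 GiB max_excl         */
        const uint64_t rfer  = 1073758208ULL;   /* value from qgroup show */

        /* 1073758208 - 1073741824 = 16384: the 16K overshoot reported in
         * the message.  It happens because the reserved data space was
         * dropped at cleanup_ref_head time, before the transaction commit
         * updated rfer/excl, so checks like the one below ran against a
         * rfer that was still too small. */
        printf("overshoot: %llu bytes\n",
               (unsigned long long)(rfer - limit));

        /* With the reservation kept until commit, as the patch does, the
         * last 16K write is refused once 1 GiB is already accounted: */
        printf("reserve 16K at the limit -> %d\n",
               qgroup_reserve(limit, 0, limit, 16384));
        return 0;
    }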
diff --git a/patches.fixes/0001-btrfs-qgroup-Remove-duplicated-trace-points-for-qgro.patch b/patches.fixes/0001-btrfs-qgroup-Remove-duplicated-trace-points-for-qgro.patch
new file mode 100644
index 0000000000..dbfe3755cf
--- /dev/null
+++ b/patches.fixes/0001-btrfs-qgroup-Remove-duplicated-trace-points-for-qgro.patch
@@ -0,0 +1,46 @@
+From c8f72b98b65e012d82a731d8e5f42e4bce006ccb Mon Sep 17 00:00:00 2001
+From: Qu Wenruo <wqu@suse.com>
+Date: Tue, 13 Nov 2018 15:05:08 +0800
+Git-commit: c8f72b98b65e012d82a731d8e5f42e4bce006ccb
+References: bsc#1134160
+Patch-mainline: v5.1-rc1
+Subject: [PATCH] btrfs: qgroup: Remove duplicated trace points for
+ qgroup_rsv_add/release
+
+Inside qgroup_rsv_add/release(), we have trace events
+trace_qgroup_update_reserve() to catch reserved space update.
+
+However we still have two manual trace_qgroup_update_reserve() calls
+just outside these functions. Remove these duplicated calls.
+
+Fixes: 64ee4e751a1c ("btrfs: qgroup: Update trace events to use new separate rsv types")
+Reviewed-by: Nikolay Borisov <nborisov@suse.com>
+Signed-off-by: Qu Wenruo <wqu@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+---
+ fs/btrfs/qgroup.c | 2 --
+ 1 file changed, 2 deletions(-)
+
+diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
+index d316df95bec4..9a2f8c4c0fb9 100644
+--- a/fs/btrfs/qgroup.c
++++ b/fs/btrfs/qgroup.c
+@@ -2900,7 +2900,6 @@ static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes, bool enforce,
+
+ qg = unode_aux_to_qgroup(unode);
+
+- trace_qgroup_update_reserve(fs_info, qg, num_bytes, type);
+ qgroup_rsv_add(fs_info, qg, num_bytes, type);
+ }
+
+@@ -2967,7 +2966,6 @@ void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info,
+
+ qg = unode_aux_to_qgroup(unode);
+
+- trace_qgroup_update_reserve(fs_info, qg, -(s64)num_bytes, type);
+ qgroup_rsv_release(fs_info, qg, num_bytes, type);
+
+ list_for_each_entry(glist, &qg->groups, next_group) {
+--
+2.21.0
+
diff --git a/series.conf b/series.conf
index f81ff4a07f..877cfb2036 100644
--- a/series.conf
+++ b/series.conf
@@ -19775,6 +19775,8 @@
patches.fixes/0001-cpufeature-avoid-warning-when-compiling-with-clang.patch
patches.drivers/hwrng-core-document-the-quality-field.patch
patches.fixes/printk-Fix-panic-caused-by-passing-log_buf_len-to-co.patch
+ patches.arch/kvm-nvmx-clear-reserved-bits-of-db-exit-qualification
+ patches.arch/kvm-nvmx-restore-host-state-in-nested_vmx_vmexit-for-vmfail
patches.arch/kvm-nvmx-always-reflect-nm-vm-exits-to-l1
patches.arch/kvm-nvmx-move-check_vmentry_postreqs-call-to-nested_vmx_enter_non_root_mode
patches.fixes/arm-arm64-KVM-Rename-function-kvm_arch_dev_ioctl_che.patch
@@ -20430,6 +20432,7 @@
patches.fixes/userfaultfd-check-vm_maywrite-was-set-after-verifying-the-uffd-is-registered.patch
patches.fixes/checkstack.pl-fix-for-aarch64.patch
patches.drivers/Revert-scsi-qla2xxx-Fix-NVMe-Target-discovery.patch
+ patches.arch/kvm-fix-uaf-in-nested-posted-interrupt-processing
patches.fixes/kvm-x86-Add-AMD-s-EX_CFG-to-the-list-of-ignored-MSRs.patch
patches.suse/sctp-initialize-sin6_flowinfo-for-ipv6-addrs-in-sctp.patch
patches.suse/ipv4-Fix-potential-Spectre-v1-vulnerability.patch
@@ -21387,6 +21390,8 @@
patches.suse/0003-btrfs-qgroup-Introduce-per-root-swapped-blocks-infra.patch
patches.suse/0004-btrfs-qgroup-Use-delayed-subtree-rescan-for-balance.patch
patches.suse/0005-btrfs-qgroup-Cleanup-old-subtree-swap-code.patch
+ patches.fixes/0001-btrfs-qgroup-Remove-duplicated-trace-points-for-qgro.patch
+ patches.fixes/0001-btrfs-qgroup-Move-reserved-data-accounting-from-btrf.patch
patches.suse/btrfs-fix-fsync-after-succession-of-renames-of-diffe.patch
patches.suse/btrfs-fix-fsync-after-succession-of-renames-and-unli.patch
patches.suse/btrfs-ensure-that-a-dup-or-raid1-block-group-has-exactly-two-stripes.patch
@@ -21614,6 +21619,7 @@
patches.drm/fbdev-fbmem-fix-memory-access-if-logo-is-bigger-than.patch
patches.drivers/iommu-amd-fix-null-dereference-bug-in-match_hid_uid
patches.arch/kvm-vmx-compare-only-a-single-byte-for-vmcs-launched-in-vcpu-run
+ patches.arch/kvm-vmx-zero-out-all-general-purpose-registers-after-vm-exit
patches.arch/svm-fix-avic-dfr-and-ldr-handling
patches.arch/svm-fix-improper-check-when-deactivate-avic
patches.arch/kvm-nvmx-sign-extend-displacements-of-vmx-instr-s-mem-operands
@@ -21757,6 +21763,9 @@
patches.fixes/0001-PCI-pciehp-Ignore-Link-State-Changes-after-powering-.patch
patches.fixes/clk-x86-Add-system-specific-quirk-to-mark-clocks-as-.patch
patches.arch/svm-avic-fix-invalidate-logical-apic-id-entry
+ patches.arch/kvm-x86-svm-make-sure-nmi-is-injected-after-nmi_singlestep
+ patches.arch/kvm-x86-don-t-clear-efer-during-smm-transitions-for-32-bit-vcpu
+ patches.arch/kvm-x86-always-use-32-bit-smram-save-state-for-32-bit-kernels
patches.fixes/mac80211-do-not-call-driver-wake_tx_queue-op-during-.patch
patches.drivers/ibmvnic-Enable-GRO.patch
patches.drivers/ibmvnic-Fix-netdev-feature-clobbering-during-a-reset.patch