KVM: x86: ratelimit and decrease severity for guest-triggered printk
[cascardo/linux.git] arch/x86/kvm/vmx.c
index 151d261..2029c00 100644
@@ -422,6 +422,7 @@ struct nested_vmx {
        struct list_head vmcs02_pool;
        int vmcs02_num;
        u64 vmcs01_tsc_offset;
+       bool change_vmcs01_virtual_x2apic_mode;
        /* L2 must run next, and mustn't decide to exit to L1. */
        bool nested_run_pending;
        /*
@@ -435,6 +436,8 @@ struct nested_vmx {
        bool pi_pending;
        u16 posted_intr_nv;
 
+       unsigned long *msr_bitmap;
+
        struct hrtimer preemption_timer;
        bool preemption_timer_expired;
 
@@ -924,7 +927,6 @@ static unsigned long *vmx_msr_bitmap_legacy;
 static unsigned long *vmx_msr_bitmap_longmode;
 static unsigned long *vmx_msr_bitmap_legacy_x2apic;
 static unsigned long *vmx_msr_bitmap_longmode_x2apic;
-static unsigned long *vmx_msr_bitmap_nested;
 static unsigned long *vmx_vmread_bitmap;
 static unsigned long *vmx_vmwrite_bitmap;
 
@@ -2157,7 +2159,8 @@ static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
        unsigned int dest;
 
        if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
-               !irq_remapping_cap(IRQ_POSTING_CAP))
+               !irq_remapping_cap(IRQ_POSTING_CAP)  ||
+               !kvm_vcpu_apicv_active(vcpu))
                return;
 
        do {
@@ -2197,6 +2200,12 @@ static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
                        new.control) != old.control);
 }
 
+static void decache_tsc_multiplier(struct vcpu_vmx *vmx)
+{
+       vmx->current_tsc_ratio = vmx->vcpu.arch.tsc_scaling_ratio;
+       vmcs_write64(TSC_MULTIPLIER, vmx->current_tsc_ratio);
+}
+
 /*
  * Switches to specified vcpu, until a matching vcpu_put(), but assumes
  * vcpu mutex is already taken.
@@ -2205,22 +2214,14 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
+       bool already_loaded = vmx->loaded_vmcs->cpu == cpu;
 
        if (!vmm_exclusive)
                kvm_cpu_vmxon(phys_addr);
-       else if (vmx->loaded_vmcs->cpu != cpu)
+       else if (!already_loaded)
                loaded_vmcs_clear(vmx->loaded_vmcs);
 
-       if (per_cpu(current_vmcs, cpu) != vmx->loaded_vmcs->vmcs) {
-               per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs;
-               vmcs_load(vmx->loaded_vmcs->vmcs);
-       }
-
-       if (vmx->loaded_vmcs->cpu != cpu) {
-               struct desc_ptr *gdt = this_cpu_ptr(&host_gdt);
-               unsigned long sysenter_esp;
-
-               kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
+       if (!already_loaded) {
                local_irq_disable();
                crash_disable_local_vmclear(cpu);
 
@@ -2235,6 +2236,18 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
                         &per_cpu(loaded_vmcss_on_cpu, cpu));
                crash_enable_local_vmclear(cpu);
                local_irq_enable();
+       }
+
+       if (per_cpu(current_vmcs, cpu) != vmx->loaded_vmcs->vmcs) {
+               per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs;
+               vmcs_load(vmx->loaded_vmcs->vmcs);
+       }
+
+       if (!already_loaded) {
+               struct desc_ptr *gdt = this_cpu_ptr(&host_gdt);
+               unsigned long sysenter_esp;
+
+               kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
 
                /*
                 * Linux uses per-cpu TSS and GDT, so set these when switching
@@ -2251,10 +2264,8 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 
        /* Setup TSC multiplier */
        if (kvm_has_tsc_control &&
-           vmx->current_tsc_ratio != vcpu->arch.tsc_scaling_ratio) {
-               vmx->current_tsc_ratio = vcpu->arch.tsc_scaling_ratio;
-               vmcs_write64(TSC_MULTIPLIER, vmx->current_tsc_ratio);
-       }
+           vmx->current_tsc_ratio != vcpu->arch.tsc_scaling_ratio)
+               decache_tsc_multiplier(vmx);
 
        vmx_vcpu_pi_load(vcpu, cpu);
        vmx->host_pkru = read_pkru();
@@ -2265,7 +2276,8 @@ static void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu)
        struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
 
        if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
-               !irq_remapping_cap(IRQ_POSTING_CAP))
+               !irq_remapping_cap(IRQ_POSTING_CAP)  ||
+               !kvm_vcpu_apicv_active(vcpu))
                return;
 
        /* Set SN when the vCPU is preempted */
@@ -2502,7 +2514,7 @@ static void vmx_set_msr_bitmap(struct kvm_vcpu *vcpu)
        unsigned long *msr_bitmap;
 
        if (is_guest_mode(vcpu))
-               msr_bitmap = vmx_msr_bitmap_nested;
+               msr_bitmap = to_vmx(vcpu)->nested.msr_bitmap;
        else if (cpu_has_secondary_exec_ctrls() &&
                 (vmcs_read32(SECONDARY_VM_EXEC_CONTROL) &
                  SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) {
@@ -2790,8 +2802,7 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
                SECONDARY_EXEC_APIC_REGISTER_VIRT |
                SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
                SECONDARY_EXEC_WBINVD_EXITING |
-               SECONDARY_EXEC_XSAVES |
-               SECONDARY_EXEC_PCOMMIT;
+               SECONDARY_EXEC_XSAVES;
 
        if (enable_ept) {
                /* nested EPT: emulate EPT also to L1 */
@@ -2804,12 +2815,8 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
                        vmx->nested.nested_vmx_ept_caps |=
                                VMX_EPT_EXECUTE_ONLY_BIT;
                vmx->nested.nested_vmx_ept_caps &= vmx_capability.ept;
-               /*
-                * For nested guests, we don't do anything specific
-                * for single context invalidation. Hence, only advertise
-                * support for global context invalidation.
-                */
-               vmx->nested.nested_vmx_ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT;
+               vmx->nested.nested_vmx_ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT |
+                       VMX_EPT_EXTENT_CONTEXT_BIT;
        } else
                vmx->nested.nested_vmx_ept_caps = 0;
 
@@ -2940,7 +2947,6 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
                        vmx->nested.nested_vmx_secondary_ctls_high);
                break;
        case MSR_IA32_VMX_EPT_VPID_CAP:
-               /* Currently, no nested vpid support */
                *pdata = vmx->nested.nested_vmx_ept_caps |
                        ((u64)vmx->nested.nested_vmx_vpid_caps << 32);
                break;
@@ -3377,7 +3383,6 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
                        SECONDARY_EXEC_SHADOW_VMCS |
                        SECONDARY_EXEC_XSAVES |
                        SECONDARY_EXEC_ENABLE_PML |
-                       SECONDARY_EXEC_PCOMMIT |
                        SECONDARY_EXEC_TSC_SCALING;
                if (adjust_vmx_controls(min2, opt2,
                                        MSR_IA32_VMX_PROCBASED_CTLS2,
@@ -4969,9 +4974,6 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
        if (!enable_pml)
                exec_control &= ~SECONDARY_EXEC_ENABLE_PML;
 
-       /* Currently, we allow L1 guest to directly run pcommit instruction. */
-       exec_control &= ~SECONDARY_EXEC_PCOMMIT;
-
        return exec_control;
 }
 
@@ -5016,9 +5018,10 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
 
        vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, vmx_exec_control(vmx));
 
-       if (cpu_has_secondary_exec_ctrls())
+       if (cpu_has_secondary_exec_ctrls()) {
                vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
                                vmx_secondary_exec_control(vmx));
+       }
 
        if (kvm_vcpu_apicv_active(&vmx->vcpu)) {
                vmcs_write64(EOI_EXIT_BITMAP0, 0);
@@ -5091,6 +5094,12 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
        if (vmx_xsaves_supported())
                vmcs_write64(XSS_EXIT_BITMAP, VMX_XSS_EXIT_BITMAP);
 
+       if (enable_pml) {
+               ASSERT(vmx->pml_pg);
+               vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg));
+               vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
+       }
+
        return 0;
 }
 
@@ -6100,7 +6109,7 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
        exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
 
        gla_validity = (exit_qualification >> 7) & 0x3;
-       if (gla_validity != 0x3 && gla_validity != 0x1 && gla_validity != 0) {
+       if (gla_validity == 0x2) {
                printk(KERN_ERR "EPT: Handling EPT violation failed!\n");
                printk(KERN_ERR "EPT: GPA: 0x%lx, GVA: 0x%lx\n",
                        (long unsigned int)vmcs_read64(GUEST_PHYSICAL_ADDRESS),
@@ -6360,13 +6369,6 @@ static __init int hardware_setup(void)
        if (!vmx_msr_bitmap_longmode_x2apic)
                goto out4;
 
-       if (nested) {
-               vmx_msr_bitmap_nested =
-                       (unsigned long *)__get_free_page(GFP_KERNEL);
-               if (!vmx_msr_bitmap_nested)
-                       goto out5;
-       }
-
        vmx_vmread_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL);
        if (!vmx_vmread_bitmap)
                goto out6;
@@ -6389,8 +6391,6 @@ static __init int hardware_setup(void)
 
        memset(vmx_msr_bitmap_legacy, 0xff, PAGE_SIZE);
        memset(vmx_msr_bitmap_longmode, 0xff, PAGE_SIZE);
-       if (nested)
-               memset(vmx_msr_bitmap_nested, 0xff, PAGE_SIZE);
 
        if (setup_vmcs_config(&vmcs_config) < 0) {
                r = -EIO;
@@ -6526,9 +6526,6 @@ out8:
 out7:
        free_page((unsigned long)vmx_vmread_bitmap);
 out6:
-       if (nested)
-               free_page((unsigned long)vmx_msr_bitmap_nested);
-out5:
        free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic);
 out4:
        free_page((unsigned long)vmx_msr_bitmap_longmode);
@@ -6554,8 +6551,6 @@ static __exit void hardware_unsetup(void)
        free_page((unsigned long)vmx_io_bitmap_a);
        free_page((unsigned long)vmx_vmwrite_bitmap);
        free_page((unsigned long)vmx_vmread_bitmap);
-       if (nested)
-               free_page((unsigned long)vmx_msr_bitmap_nested);
 
        free_kvm_area();
 }
@@ -6731,7 +6726,7 @@ static void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 indicator)
 {
        /* TODO: not to reset guest simply here. */
        kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
-       pr_warn("kvm: nested vmx abort, indicator %d\n", indicator);
+       pr_debug_ratelimited("kvm: nested vmx abort, indicator %d\n", indicator);
 }
 
 static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer)
@@ -6797,7 +6792,13 @@ static int get_vmx_mem_address(struct kvm_vcpu *vcpu,
 
        /* Checks for #GP/#SS exceptions. */
        exn = false;
-       if (is_protmode(vcpu)) {
+       if (is_long_mode(vcpu)) {
+               /* Long mode: #GP(0)/#SS(0) if the memory address is in a
+                * non-canonical form. This is the only check on the memory
+                * destination for long mode!
+                */
+               exn = is_noncanonical_address(*ret);
+       } else if (is_protmode(vcpu)) {
                /* Protected mode: apply checks for segment validity in the
                 * following order:
                 * - segment type check (#GP(0) may be thrown)
@@ -6814,17 +6815,10 @@ static int get_vmx_mem_address(struct kvm_vcpu *vcpu,
                         * execute-only code segment
                         */
                        exn = ((s.type & 0xa) == 8);
-       }
-       if (exn) {
-               kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
-               return 1;
-       }
-       if (is_long_mode(vcpu)) {
-               /* Long mode: #GP(0)/#SS(0) if the memory address is in a
-                * non-canonical form. This is an only check for long mode.
-                */
-               exn = is_noncanonical_address(*ret);
-       } else if (is_protmode(vcpu)) {
+               if (exn) {
+                       kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
+                       return 1;
+               }
                /* Protected mode: #GP(0)/#SS(0) if the segment is unusable.
                 */
                exn = (s.unusable != 0);
@@ -6993,16 +6987,21 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
                return 1;
        }
 
+       if (cpu_has_vmx_msr_bitmap()) {
+               vmx->nested.msr_bitmap =
+                               (unsigned long *)__get_free_page(GFP_KERNEL);
+               if (!vmx->nested.msr_bitmap)
+                       goto out_msr_bitmap;
+       }
+
        vmx->nested.cached_vmcs12 = kmalloc(VMCS12_SIZE, GFP_KERNEL);
        if (!vmx->nested.cached_vmcs12)
-               return -ENOMEM;
+               goto out_cached_vmcs12;
 
        if (enable_shadow_vmcs) {
                shadow_vmcs = alloc_vmcs();
-               if (!shadow_vmcs) {
-                       kfree(vmx->nested.cached_vmcs12);
-                       return -ENOMEM;
-               }
+               if (!shadow_vmcs)
+                       goto out_shadow_vmcs;
                /* mark vmcs as shadow */
                shadow_vmcs->revision_id |= (1u << 31);
                /* init shadow vmcs */
@@ -7014,7 +7013,7 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
        vmx->nested.vmcs02_num = 0;
 
        hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC,
-                    HRTIMER_MODE_REL);
+                    HRTIMER_MODE_REL_PINNED);
        vmx->nested.preemption_timer.function = vmx_preemption_timer_fn;
 
        vmx->nested.vmxon = true;
@@ -7022,6 +7021,15 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
        skip_emulated_instruction(vcpu);
        nested_vmx_succeed(vcpu);
        return 1;
+
+out_shadow_vmcs:
+       kfree(vmx->nested.cached_vmcs12);
+
+out_cached_vmcs12:
+       free_page((unsigned long)vmx->nested.msr_bitmap);
+
+out_msr_bitmap:
+       return -ENOMEM;
 }
 
 /*
@@ -7096,6 +7104,10 @@ static void free_nested(struct vcpu_vmx *vmx)
        vmx->nested.vmxon = false;
        free_vpid(vmx->nested.vpid02);
        nested_release_vmcs12(vmx);
+       if (vmx->nested.msr_bitmap) {
+               free_page((unsigned long)vmx->nested.msr_bitmap);
+               vmx->nested.msr_bitmap = NULL;
+       }
        if (enable_shadow_vmcs)
                free_vmcs(vmx->nested.current_shadow_vmcs);
        kfree(vmx->nested.cached_vmcs12);
@@ -7602,12 +7614,16 @@ static int handle_invept(struct kvm_vcpu *vcpu)
 
        switch (type) {
        case VMX_EPT_EXTENT_GLOBAL:
+       /*
+        * TODO: track mappings and invalidate
+        * single context requests appropriately
+        */
+       case VMX_EPT_EXTENT_CONTEXT:
                kvm_mmu_sync_roots(vcpu);
                kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
                nested_vmx_succeed(vcpu);
                break;
        default:
-               /* Trap single context invalidation invept calls */
                BUG_ON(1);
                break;
        }
@@ -7704,13 +7720,6 @@ static int handle_pml_full(struct kvm_vcpu *vcpu)
        return 1;
 }
 
-static int handle_pcommit(struct kvm_vcpu *vcpu)
-{
-       /* we never catch pcommit instruct for L1 guest. */
-       WARN_ON(1);
-       return 1;
-}
-
 static int handle_preemption_timer(struct kvm_vcpu *vcpu)
 {
        kvm_lapic_expired_hv_timer(vcpu);
@@ -7767,7 +7776,6 @@ static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
        [EXIT_REASON_XSAVES]                  = handle_xsaves,
        [EXIT_REASON_XRSTORS]                 = handle_xrstors,
        [EXIT_REASON_PML_FULL]                = handle_pml_full,
-       [EXIT_REASON_PCOMMIT]                 = handle_pcommit,
        [EXIT_REASON_PREEMPTION_TIMER]        = handle_preemption_timer,
 };
 
@@ -8077,8 +8085,6 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
                 * the XSS exit bitmap in vmcs12.
                 */
                return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES);
-       case EXIT_REASON_PCOMMIT:
-               return nested_cpu_has2(vmcs12, SECONDARY_EXEC_PCOMMIT);
        case EXIT_REASON_PREEMPTION_TIMER:
                return false;
        default:
@@ -8092,22 +8098,6 @@ static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2)
        *info2 = vmcs_read32(VM_EXIT_INTR_INFO);
 }
 
-static int vmx_create_pml_buffer(struct vcpu_vmx *vmx)
-{
-       struct page *pml_pg;
-
-       pml_pg = alloc_page(GFP_KERNEL | __GFP_ZERO);
-       if (!pml_pg)
-               return -ENOMEM;
-
-       vmx->pml_pg = pml_pg;
-
-       vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg));
-       vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
-
-       return 0;
-}
-
 static void vmx_destroy_pml_buffer(struct vcpu_vmx *vmx)
 {
        if (vmx->pml_pg) {
@@ -8379,6 +8369,7 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
        if ((vectoring_info & VECTORING_INFO_VALID_MASK) &&
                        (exit_reason != EXIT_REASON_EXCEPTION_NMI &&
                        exit_reason != EXIT_REASON_EPT_VIOLATION &&
+                       exit_reason != EXIT_REASON_PML_FULL &&
                        exit_reason != EXIT_REASON_TASK_SWITCH)) {
                vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
                vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_DELIVERY_EV;
@@ -8438,6 +8429,12 @@ static void vmx_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set)
 {
        u32 sec_exec_control;
 
+       /* Postpone execution until vmcs01 is the current VMCS. */
+       if (is_guest_mode(vcpu)) {
+               to_vmx(vcpu)->nested.change_vmcs01_virtual_x2apic_mode = true;
+               return;
+       }
+
        /*
         * There is not point to enable virtualize x2apic without enable
         * apicv
@@ -9029,6 +9026,22 @@ static void vmx_load_vmcs01(struct kvm_vcpu *vcpu)
        put_cpu();
 }
 
+/*
+ * Ensure that the current vmcs of the logical processor is the
+ * vmcs01 of the vcpu before calling free_nested().
+ */
+static void vmx_free_vcpu_nested(struct kvm_vcpu *vcpu)
+{
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+       int r;
+
+       r = vcpu_load(vcpu);
+       BUG_ON(r);
+       vmx_load_vmcs01(vcpu);
+       free_nested(vmx);
+       vcpu_put(vcpu);
+}
+
 static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -9037,8 +9050,7 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
                vmx_destroy_pml_buffer(vmx);
        free_vpid(vmx->vpid);
        leave_guest_mode(vcpu);
-       vmx_load_vmcs01(vcpu);
-       free_nested(vmx);
+       vmx_free_vcpu_nested(vcpu);
        free_loaded_vmcs(vmx->loaded_vmcs);
        kfree(vmx->guest_msrs);
        kvm_vcpu_uninit(vcpu);
@@ -9060,14 +9072,26 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
        if (err)
                goto free_vcpu;
 
+       err = -ENOMEM;
+
+       /*
+        * If PML is turned on, failure on enabling PML just results in failure
+        * of creating the vcpu, therefore we can simplify PML logic (by
+        * avoiding dealing with cases, such as enabling PML partially on vcpus
+        * for the guest, etc.
+        */
+       if (enable_pml) {
+               vmx->pml_pg = alloc_page(GFP_KERNEL | __GFP_ZERO);
+               if (!vmx->pml_pg)
+                       goto uninit_vcpu;
+       }
+
        vmx->guest_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL);
        BUILD_BUG_ON(ARRAY_SIZE(vmx_msr_index) * sizeof(vmx->guest_msrs[0])
                     > PAGE_SIZE);
 
-       err = -ENOMEM;
-       if (!vmx->guest_msrs) {
-               goto uninit_vcpu;
-       }
+       if (!vmx->guest_msrs)
+               goto free_pml;
 
        vmx->loaded_vmcs = &vmx->vmcs01;
        vmx->loaded_vmcs->vmcs = alloc_vmcs();
@@ -9111,18 +9135,6 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
        vmx->nested.current_vmptr = -1ull;
        vmx->nested.current_vmcs12 = NULL;
 
-       /*
-        * If PML is turned on, failure on enabling PML just results in failure
-        * of creating the vcpu, therefore we can simplify PML logic (by
-        * avoiding dealing with cases, such as enabling PML partially on vcpus
-        * for the guest, etc.
-        */
-       if (enable_pml) {
-               err = vmx_create_pml_buffer(vmx);
-               if (err)
-                       goto free_vmcs;
-       }
-
        vmx->msr_ia32_feature_control_valid_bits = FEATURE_CONTROL_LOCKED;
 
        return &vmx->vcpu;
@@ -9132,6 +9144,8 @@ free_vmcs:
        free_loaded_vmcs(vmx->loaded_vmcs);
 free_msrs:
        kfree(vmx->guest_msrs);
+free_pml:
+       vmx_destroy_pml_buffer(vmx);
 uninit_vcpu:
        kvm_vcpu_uninit(&vmx->vcpu);
 free_vcpu:
@@ -9264,15 +9278,6 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
        if (cpu_has_secondary_exec_ctrls())
                vmcs_set_secondary_exec_control(secondary_exec_ctl);
 
-       if (static_cpu_has(X86_FEATURE_PCOMMIT) && nested) {
-               if (guest_cpuid_has_pcommit(vcpu))
-                       vmx->nested.nested_vmx_secondary_ctls_high |=
-                               SECONDARY_EXEC_PCOMMIT;
-               else
-                       vmx->nested.nested_vmx_secondary_ctls_high &=
-                               ~SECONDARY_EXEC_PCOMMIT;
-       }
-
        if (nested_vmx_allowed(vcpu))
                to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |=
                        FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
@@ -9483,8 +9488,10 @@ static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu,
 {
        int msr;
        struct page *page;
-       unsigned long *msr_bitmap;
+       unsigned long *msr_bitmap_l1;
+       unsigned long *msr_bitmap_l0 = to_vmx(vcpu)->nested.msr_bitmap;
 
+       /* This shortcut is ok because we support only x2APIC MSRs so far. */
        if (!nested_cpu_has_virt_x2apic_mode(vmcs12))
                return false;
 
@@ -9493,63 +9500,37 @@ static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu,
                WARN_ON(1);
                return false;
        }
-       msr_bitmap = (unsigned long *)kmap(page);
-       if (!msr_bitmap) {
+       msr_bitmap_l1 = (unsigned long *)kmap(page);
+       if (!msr_bitmap_l1) {
                nested_release_page_clean(page);
                WARN_ON(1);
                return false;
        }
 
+       memset(msr_bitmap_l0, 0xff, PAGE_SIZE);
+
        if (nested_cpu_has_virt_x2apic_mode(vmcs12)) {
                if (nested_cpu_has_apic_reg_virt(vmcs12))
                        for (msr = 0x800; msr <= 0x8ff; msr++)
                                nested_vmx_disable_intercept_for_msr(
-                                       msr_bitmap,
-                                       vmx_msr_bitmap_nested,
+                                       msr_bitmap_l1, msr_bitmap_l0,
                                        msr, MSR_TYPE_R);
-               /* TPR is allowed */
-               nested_vmx_disable_intercept_for_msr(msr_bitmap,
-                               vmx_msr_bitmap_nested,
+
+               nested_vmx_disable_intercept_for_msr(
+                               msr_bitmap_l1, msr_bitmap_l0,
                                APIC_BASE_MSR + (APIC_TASKPRI >> 4),
                                MSR_TYPE_R | MSR_TYPE_W);
+
                if (nested_cpu_has_vid(vmcs12)) {
-                       /* EOI and self-IPI are allowed */
                        nested_vmx_disable_intercept_for_msr(
-                               msr_bitmap,
-                               vmx_msr_bitmap_nested,
+                               msr_bitmap_l1, msr_bitmap_l0,
                                APIC_BASE_MSR + (APIC_EOI >> 4),
                                MSR_TYPE_W);
                        nested_vmx_disable_intercept_for_msr(
-                               msr_bitmap,
-                               vmx_msr_bitmap_nested,
+                               msr_bitmap_l1, msr_bitmap_l0,
                                APIC_BASE_MSR + (APIC_SELF_IPI >> 4),
                                MSR_TYPE_W);
                }
-       } else {
-               /*
-                * Enable reading intercept of all the x2apic
-                * MSRs. We should not rely on vmcs12 to do any
-                * optimizations here, it may have been modified
-                * by L1.
-                */
-               for (msr = 0x800; msr <= 0x8ff; msr++)
-                       __vmx_enable_intercept_for_msr(
-                               vmx_msr_bitmap_nested,
-                               msr,
-                               MSR_TYPE_R);
-
-               __vmx_enable_intercept_for_msr(
-                               vmx_msr_bitmap_nested,
-                               APIC_BASE_MSR + (APIC_TASKPRI >> 4),
-                               MSR_TYPE_W);
-               __vmx_enable_intercept_for_msr(
-                               vmx_msr_bitmap_nested,
-                               APIC_BASE_MSR + (APIC_EOI >> 4),
-                               MSR_TYPE_W);
-               __vmx_enable_intercept_for_msr(
-                               vmx_msr_bitmap_nested,
-                               APIC_BASE_MSR + (APIC_SELF_IPI >> 4),
-                               MSR_TYPE_W);
        }
        kunmap(page);
        nested_release_page_clean(page);
@@ -9617,7 +9598,7 @@ static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu,
        maxphyaddr = cpuid_maxphyaddr(vcpu);
        if (!IS_ALIGNED(addr, 16) || addr >> maxphyaddr ||
            (addr + count * sizeof(struct vmx_msr_entry) - 1) >> maxphyaddr) {
-               pr_warn_ratelimited(
+               pr_debug_ratelimited(
                        "nVMX: invalid MSR switch (0x%lx, %d, %llu, 0x%08llx)",
                        addr_field, maxphyaddr, count, addr);
                return -EINVAL;
@@ -9690,13 +9671,13 @@ static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
        for (i = 0; i < count; i++) {
                if (kvm_vcpu_read_guest(vcpu, gpa + i * sizeof(e),
                                        &e, sizeof(e))) {
-                       pr_warn_ratelimited(
+                       pr_debug_ratelimited(
                                "%s cannot read MSR entry (%u, 0x%08llx)\n",
                                __func__, i, gpa + i * sizeof(e));
                        goto fail;
                }
                if (nested_vmx_load_msr_check(vcpu, &e)) {
-                       pr_warn_ratelimited(
+                       pr_debug_ratelimited(
                                "%s check failed (%u, 0x%x, 0x%x)\n",
                                __func__, i, e.index, e.reserved);
                        goto fail;
@@ -9704,7 +9685,7 @@ static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
                msr.index = e.index;
                msr.data = e.value;
                if (kvm_set_msr(vcpu, &msr)) {
-                       pr_warn_ratelimited(
+                       pr_debug_ratelimited(
                                "%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
                                __func__, i, e.index, e.value);
                        goto fail;
@@ -9725,13 +9706,13 @@ static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
                if (kvm_vcpu_read_guest(vcpu,
                                        gpa + i * sizeof(e),
                                        &e, 2 * sizeof(u32))) {
-                       pr_warn_ratelimited(
+                       pr_debug_ratelimited(
                                "%s cannot read MSR entry (%u, 0x%08llx)\n",
                                __func__, i, gpa + i * sizeof(e));
                        return -EINVAL;
                }
                if (nested_vmx_store_msr_check(vcpu, &e)) {
-                       pr_warn_ratelimited(
+                       pr_debug_ratelimited(
                                "%s check failed (%u, 0x%x, 0x%x)\n",
                                __func__, i, e.index, e.reserved);
                        return -EINVAL;
@@ -9739,7 +9720,7 @@ static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
                msr_info.host_initiated = false;
                msr_info.index = e.index;
                if (kvm_get_msr(vcpu, &msr_info)) {
-                       pr_warn_ratelimited(
+                       pr_debug_ratelimited(
                                "%s cannot read MSR (%u, 0x%x)\n",
                                __func__, i, e.index);
                        return -EINVAL;
@@ -9748,7 +9729,7 @@ static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
                                         gpa + i * sizeof(e) +
                                             offsetof(struct vmx_msr_entry, value),
                                         &msr_info.data, sizeof(msr_info.data))) {
-                       pr_warn_ratelimited(
+                       pr_debug_ratelimited(
                                "%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
                                __func__, i, e.index, msr_info.data);
                        return -EINVAL;
@@ -9896,8 +9877,7 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
                exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
                                  SECONDARY_EXEC_RDTSCP |
                                  SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
-                                 SECONDARY_EXEC_APIC_REGISTER_VIRT |
-                                 SECONDARY_EXEC_PCOMMIT);
+                                 SECONDARY_EXEC_APIC_REGISTER_VIRT);
                if (nested_cpu_has(vmcs12,
                                CPU_BASED_ACTIVATE_SECONDARY_CONTROLS))
                        exec_control |= vmcs12->secondary_vm_exec_control;
@@ -9969,10 +9949,10 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
        }
 
        if (cpu_has_vmx_msr_bitmap() &&
-           exec_control & CPU_BASED_USE_MSR_BITMAPS) {
-               nested_vmx_merge_msr_bitmap(vcpu, vmcs12);
-               /* MSR_BITMAP will be set by following vmx_set_efer. */
-       } else
+           exec_control & CPU_BASED_USE_MSR_BITMAPS &&
+           nested_vmx_merge_msr_bitmap(vcpu, vmcs12))
+               ; /* MSR_BITMAP will be set by following vmx_set_efer. */
+       else
                exec_control &= ~CPU_BASED_USE_MSR_BITMAPS;
 
        /*
@@ -10023,6 +10003,8 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
                        vmx->nested.vmcs01_tsc_offset + vmcs12->tsc_offset);
        else
                vmcs_write64(TSC_OFFSET, vmx->nested.vmcs01_tsc_offset);
+       if (kvm_has_tsc_control)
+               decache_tsc_multiplier(vmx);
 
        if (enable_vpid) {
                /*
@@ -10518,6 +10500,9 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
                vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3);
        }
 
+       if (nested_cpu_has_ept(vmcs12))
+               vmcs12->guest_linear_address = vmcs_readl(GUEST_LINEAR_ADDRESS);
+
        if (nested_cpu_has_vid(vmcs12))
                vmcs12->guest_intr_status = vmcs_read16(GUEST_INTR_STATUS);
 
@@ -10779,6 +10764,14 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
        else
                vmcs_set_bits(PIN_BASED_VM_EXEC_CONTROL,
                              PIN_BASED_VMX_PREEMPTION_TIMER);
+       if (kvm_has_tsc_control)
+               decache_tsc_multiplier(vmx);
+
+       if (vmx->nested.change_vmcs01_virtual_x2apic_mode) {
+               vmx->nested.change_vmcs01_virtual_x2apic_mode = false;
+               vmx_set_virtual_x2apic_mode(vcpu,
+                               vcpu->arch.apic_base & X2APIC_ENABLE);
+       }
 
        /* This is needed for same reason as it was needed in prepare_vmcs02 */
        vmx->host_rsp = 0;
@@ -10968,7 +10961,8 @@ static int pi_pre_block(struct kvm_vcpu *vcpu)
        struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
 
        if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
-               !irq_remapping_cap(IRQ_POSTING_CAP))
+               !irq_remapping_cap(IRQ_POSTING_CAP)  ||
+               !kvm_vcpu_apicv_active(vcpu))
                return 0;
 
        vcpu->pre_pcpu = vcpu->cpu;
@@ -11045,7 +11039,8 @@ static void pi_post_block(struct kvm_vcpu *vcpu)
        unsigned long flags;
 
        if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
-               !irq_remapping_cap(IRQ_POSTING_CAP))
+               !irq_remapping_cap(IRQ_POSTING_CAP)  ||
+               !kvm_vcpu_apicv_active(vcpu))
                return;
 
        do {
@@ -11106,7 +11101,8 @@ static int vmx_update_pi_irte(struct kvm *kvm, unsigned int host_irq,
        int idx, ret = -EINVAL;
 
        if (!kvm_arch_has_assigned_device(kvm) ||
-               !irq_remapping_cap(IRQ_POSTING_CAP))
+               !irq_remapping_cap(IRQ_POSTING_CAP) ||
+               !kvm_vcpu_apicv_active(kvm->vcpus[0]))
                return 0;
 
        idx = srcu_read_lock(&kvm->irq_srcu);