KVM: SVM: fix trashing of MSR_TSC_AUX
[cascardo/linux.git] / arch/x86/kvm/svm.c
1 /*
2  * Kernel-based Virtual Machine driver for Linux
3  *
4  * AMD SVM support
5  *
6  * Copyright (C) 2006 Qumranet, Inc.
7  * Copyright 2010 Red Hat, Inc. and/or its affiliates.
8  *
9  * Authors:
10  *   Yaniv Kamay  <yaniv@qumranet.com>
11  *   Avi Kivity   <avi@qumranet.com>
12  *
13  * This work is licensed under the terms of the GNU GPL, version 2.  See
14  * the COPYING file in the top-level directory.
15  *
16  */
17
18 #define pr_fmt(fmt) "SVM: " fmt
19
20 #include <linux/kvm_host.h>
21
22 #include "irq.h"
23 #include "mmu.h"
24 #include "kvm_cache_regs.h"
25 #include "x86.h"
26 #include "cpuid.h"
27 #include "pmu.h"
28
29 #include <linux/module.h>
30 #include <linux/mod_devicetable.h>
31 #include <linux/kernel.h>
32 #include <linux/vmalloc.h>
33 #include <linux/highmem.h>
34 #include <linux/sched.h>
35 #include <linux/trace_events.h>
36 #include <linux/slab.h>
37
38 #include <asm/apic.h>
39 #include <asm/perf_event.h>
40 #include <asm/tlbflush.h>
41 #include <asm/desc.h>
42 #include <asm/debugreg.h>
43 #include <asm/kvm_para.h>
44 #include <asm/vgtod.h>
45
46 #include <asm/virtext.h>
47 #include "trace.h"
48
49 #define __ex(x) __kvm_handle_fault_on_reboot(x)
50
51 MODULE_AUTHOR("Qumranet");
52 MODULE_LICENSE("GPL");
53
54 static const struct x86_cpu_id svm_cpu_id[] = {
55         X86_FEATURE_MATCH(X86_FEATURE_SVM),
56         {}
57 };
58 MODULE_DEVICE_TABLE(x86cpu, svm_cpu_id);
59
60 #define IOPM_ALLOC_ORDER 2
61 #define MSRPM_ALLOC_ORDER 1
62
63 #define SEG_TYPE_LDT 2
64 #define SEG_TYPE_BUSY_TSS16 3
65
66 #define SVM_FEATURE_NPT            (1 <<  0)
67 #define SVM_FEATURE_LBRV           (1 <<  1)
68 #define SVM_FEATURE_SVML           (1 <<  2)
69 #define SVM_FEATURE_NRIP           (1 <<  3)
70 #define SVM_FEATURE_TSC_RATE       (1 <<  4)
71 #define SVM_FEATURE_VMCB_CLEAN     (1 <<  5)
72 #define SVM_FEATURE_FLUSH_ASID     (1 <<  6)
73 #define SVM_FEATURE_DECODE_ASSIST  (1 <<  7)
74 #define SVM_FEATURE_PAUSE_FILTER   (1 << 10)
75
76 #define SVM_AVIC_DOORBELL       0xc001011b
77
78 #define NESTED_EXIT_HOST        0       /* Exit handled on host level */
79 #define NESTED_EXIT_DONE        1       /* Exit caused nested vmexit  */
80 #define NESTED_EXIT_CONTINUE    2       /* Further checks needed      */
81
82 #define DEBUGCTL_RESERVED_BITS (~(0x3fULL))
83
84 #define TSC_RATIO_RSVD          0xffffff0000000000ULL
85 #define TSC_RATIO_MIN           0x0000000000000001ULL
86 #define TSC_RATIO_MAX           0x000000ffffffffffULL
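/*
 * Editor's note: MSR_AMD64_TSC_RATIO is an 8.32 fixed-point value per AMD's
 * APM -- integer part in bits 39:32, fraction in bits 31:0 -- which is why
 * TSC_RATIO_MAX keeps only bits 39:0, TSC_RATIO_RSVD covers bits 63:40, and
 * TSC_RATIO_DEFAULT (0x0100000000, i.e. 1.0) is programmed when no scaling
 * is in effect.
 */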
87
88 #define AVIC_HPA_MASK   ~((0xFFFULL << 52) | 0xFFF)
89
90 /*
91  * 0xff is broadcast, so the max index allowed for physical APIC ID
92  * table is 0xfe.  APIC IDs above 0xff are reserved.
93  */
94 #define AVIC_MAX_PHYSICAL_ID_COUNT      255
95
96 #define AVIC_UNACCEL_ACCESS_WRITE_MASK          1
97 #define AVIC_UNACCEL_ACCESS_OFFSET_MASK         0xFF0
98 #define AVIC_UNACCEL_ACCESS_VECTOR_MASK         0xFFFFFFFF
99
100 static bool erratum_383_found __read_mostly;
101
102 static const u32 host_save_user_msrs[] = {
103 #ifdef CONFIG_X86_64
104         MSR_STAR, MSR_LSTAR, MSR_CSTAR, MSR_SYSCALL_MASK, MSR_KERNEL_GS_BASE,
105         MSR_FS_BASE,
106 #endif
107         MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
108         MSR_TSC_AUX,
109 };
110
111 #define NR_HOST_SAVE_USER_MSRS ARRAY_SIZE(host_save_user_msrs)
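/*
 * Editor's note: these host MSRs are snapshotted into svm->host_user_msrs by
 * svm_vcpu_load() and written back by svm_vcpu_put(), so the host's values --
 * including MSR_TSC_AUX, the MSR named in the patch subject above -- survive
 * while the hardware registers hold guest values.
 */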
112
113 struct kvm_vcpu;
114
115 struct nested_state {
116         struct vmcb *hsave;
117         u64 hsave_msr;
118         u64 vm_cr_msr;
119         u64 vmcb;
120
121         /* These are the merged vectors */
122         u32 *msrpm;
123
124         /* gpa pointers to the real vectors */
125         u64 vmcb_msrpm;
126         u64 vmcb_iopm;
127
128         /* A VMEXIT is required but not yet emulated */
129         bool exit_required;
130
131         /* cache for intercepts of the guest */
132         u32 intercept_cr;
133         u32 intercept_dr;
134         u32 intercept_exceptions;
135         u64 intercept;
136
137         /* Nested Paging related state */
138         u64 nested_cr3;
139 };
140
141 #define MSRPM_OFFSETS   16
142 static u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly;
143
144 /*
145  * Set osvw_len to higher value when updated Revision Guides
146  * are published and we know what the new status bits are
147  */
148 static uint64_t osvw_len = 4, osvw_status;
149
150 struct vcpu_svm {
151         struct kvm_vcpu vcpu;
152         struct vmcb *vmcb;
153         unsigned long vmcb_pa;
154         struct svm_cpu_data *svm_data;
155         uint64_t asid_generation;
156         uint64_t sysenter_esp;
157         uint64_t sysenter_eip;
158         uint64_t tsc_aux;
159
160         u64 next_rip;
161
162         u64 host_user_msrs[NR_HOST_SAVE_USER_MSRS];
163         struct {
164                 u16 fs;
165                 u16 gs;
166                 u16 ldt;
167                 u64 gs_base;
168         } host;
169
170         u32 *msrpm;
171
172         ulong nmi_iret_rip;
173
174         struct nested_state nested;
175
176         bool nmi_singlestep;
177
178         unsigned int3_injected;
179         unsigned long int3_rip;
180         u32 apf_reason;
181
182         /* cached guest cpuid flags for faster access */
183         bool nrips_enabled      : 1;
184
185         u32 ldr_reg;
186         struct page *avic_backing_page;
187         u64 *avic_physical_id_cache;
188         bool avic_is_running;
189 };
190
191 #define AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK    (0xFF)
192 #define AVIC_LOGICAL_ID_ENTRY_VALID_MASK                (1 << 31)
193
194 #define AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK    (0xFFULL)
195 #define AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK        (0xFFFFFFFFFFULL << 12)
196 #define AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK          (1ULL << 62)
197 #define AVIC_PHYSICAL_ID_ENTRY_VALID_MASK               (1ULL << 63)
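/*
 * Editor's note: per the masks above, each physical APIC ID table entry packs
 * the host physical APIC ID in bits 7:0, the backing-page physical address in
 * bits 51:12, an IsRunning flag in bit 62 and a Valid flag in bit 63.
 */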
198
199 static DEFINE_PER_CPU(u64, current_tsc_ratio);
200 #define TSC_RATIO_DEFAULT       0x0100000000ULL
201
202 #define MSR_INVALID                     0xffffffffU
203
204 static const struct svm_direct_access_msrs {
205         u32 index;   /* Index of the MSR */
206         bool always; /* True if intercept is always on */
207 } direct_access_msrs[] = {
208         { .index = MSR_STAR,                            .always = true  },
209         { .index = MSR_IA32_SYSENTER_CS,                .always = true  },
210 #ifdef CONFIG_X86_64
211         { .index = MSR_GS_BASE,                         .always = true  },
212         { .index = MSR_FS_BASE,                         .always = true  },
213         { .index = MSR_KERNEL_GS_BASE,                  .always = true  },
214         { .index = MSR_LSTAR,                           .always = true  },
215         { .index = MSR_CSTAR,                           .always = true  },
216         { .index = MSR_SYSCALL_MASK,                    .always = true  },
217 #endif
218         { .index = MSR_IA32_LASTBRANCHFROMIP,           .always = false },
219         { .index = MSR_IA32_LASTBRANCHTOIP,             .always = false },
220         { .index = MSR_IA32_LASTINTFROMIP,              .always = false },
221         { .index = MSR_IA32_LASTINTTOIP,                .always = false },
222         { .index = MSR_INVALID,                         .always = false },
223 };
224
225 /* enable NPT for AMD64 and X86 with PAE */
226 #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
227 static bool npt_enabled = true;
228 #else
229 static bool npt_enabled;
230 #endif
231
232 /* allow nested paging (virtualized MMU) for all guests */
233 static int npt = true;
234 module_param(npt, int, S_IRUGO);
235
236 /* allow nested virtualization in KVM/SVM */
237 static int nested = true;
238 module_param(nested, int, S_IRUGO);
239
240 /* enable / disable AVIC */
241 static int avic;
242 #ifdef CONFIG_X86_LOCAL_APIC
243 module_param(avic, int, S_IRUGO);
244 #endif
245
246 static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
247 static void svm_flush_tlb(struct kvm_vcpu *vcpu);
248 static void svm_complete_interrupts(struct vcpu_svm *svm);
249
250 static int nested_svm_exit_handled(struct vcpu_svm *svm);
251 static int nested_svm_intercept(struct vcpu_svm *svm);
252 static int nested_svm_vmexit(struct vcpu_svm *svm);
253 static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
254                                       bool has_error_code, u32 error_code);
255
256 enum {
257         VMCB_INTERCEPTS, /* Intercept vectors, TSC offset,
258                             pause filter count */
259         VMCB_PERM_MAP,   /* IOPM Base and MSRPM Base */
260         VMCB_ASID,       /* ASID */
261         VMCB_INTR,       /* int_ctl, int_vector */
262         VMCB_NPT,        /* npt_en, nCR3, gPAT */
263         VMCB_CR,         /* CR0, CR3, CR4, EFER */
264         VMCB_DR,         /* DR6, DR7 */
265         VMCB_DT,         /* GDT, IDT */
266         VMCB_SEG,        /* CS, DS, SS, ES, CPL */
267         VMCB_CR2,        /* CR2 only */
268         VMCB_LBR,        /* DBGCTL, BR_FROM, BR_TO, LAST_EX_FROM, LAST_EX_TO */
269         VMCB_AVIC,       /* AVIC APIC_BAR, AVIC APIC_BACKING_PAGE,
270                           * AVIC PHYSICAL_TABLE pointer,
271                           * AVIC LOGICAL_TABLE pointer
272                           */
273         VMCB_DIRTY_MAX,
274 };
275
276 /* TPR and CR2 are always written before VMRUN */
277 #define VMCB_ALWAYS_DIRTY_MASK  ((1U << VMCB_INTR) | (1U << VMCB_CR2))
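/*
 * Editor's note: each enum value above maps to a "clean" bit in
 * vmcb->control.clean.  A set bit tells the CPU it may reuse its cached copy
 * of that VMCB state on the next VMRUN; mark_dirty() clears the bit after
 * software touches the corresponding fields so the hardware reloads them.
 * int_ctl and CR2 are rewritten before every VMRUN, hence
 * VMCB_ALWAYS_DIRTY_MASK keeps those bits permanently dirty.
 */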
278
279 #define VMCB_AVIC_APIC_BAR_MASK         0xFFFFFFFFFF000ULL
280
281 static inline void mark_all_dirty(struct vmcb *vmcb)
282 {
283         vmcb->control.clean = 0;
284 }
285
286 static inline void mark_all_clean(struct vmcb *vmcb)
287 {
288         vmcb->control.clean = ((1 << VMCB_DIRTY_MAX) - 1)
289                                & ~VMCB_ALWAYS_DIRTY_MASK;
290 }
291
292 static inline void mark_dirty(struct vmcb *vmcb, int bit)
293 {
294         vmcb->control.clean &= ~(1 << bit);
295 }
296
297 static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu)
298 {
299         return container_of(vcpu, struct vcpu_svm, vcpu);
300 }
301
302 static inline void avic_update_vapic_bar(struct vcpu_svm *svm, u64 data)
303 {
304         svm->vmcb->control.avic_vapic_bar = data & VMCB_AVIC_APIC_BAR_MASK;
305         mark_dirty(svm->vmcb, VMCB_AVIC);
306 }
307
308 static inline bool avic_vcpu_is_running(struct kvm_vcpu *vcpu)
309 {
310         struct vcpu_svm *svm = to_svm(vcpu);
311         u64 *entry = svm->avic_physical_id_cache;
312
313         if (!entry)
314                 return false;
315
316         return (READ_ONCE(*entry) & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK);
317 }
318
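/*
 * Editor's note: while the vcpu is in guest mode the active VMCB has to trap
 * everything that either level asked for, so the helper below ORs the host
 * (hsave) intercept bits with the nested guest's cached intercept bits.
 */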
319 static void recalc_intercepts(struct vcpu_svm *svm)
320 {
321         struct vmcb_control_area *c, *h;
322         struct nested_state *g;
323
324         mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
325
326         if (!is_guest_mode(&svm->vcpu))
327                 return;
328
329         c = &svm->vmcb->control;
330         h = &svm->nested.hsave->control;
331         g = &svm->nested;
332
333         c->intercept_cr = h->intercept_cr | g->intercept_cr;
334         c->intercept_dr = h->intercept_dr | g->intercept_dr;
335         c->intercept_exceptions = h->intercept_exceptions | g->intercept_exceptions;
336         c->intercept = h->intercept | g->intercept;
337 }
338
339 static inline struct vmcb *get_host_vmcb(struct vcpu_svm *svm)
340 {
341         if (is_guest_mode(&svm->vcpu))
342                 return svm->nested.hsave;
343         else
344                 return svm->vmcb;
345 }
346
347 static inline void set_cr_intercept(struct vcpu_svm *svm, int bit)
348 {
349         struct vmcb *vmcb = get_host_vmcb(svm);
350
351         vmcb->control.intercept_cr |= (1U << bit);
352
353         recalc_intercepts(svm);
354 }
355
356 static inline void clr_cr_intercept(struct vcpu_svm *svm, int bit)
357 {
358         struct vmcb *vmcb = get_host_vmcb(svm);
359
360         vmcb->control.intercept_cr &= ~(1U << bit);
361
362         recalc_intercepts(svm);
363 }
364
365 static inline bool is_cr_intercept(struct vcpu_svm *svm, int bit)
366 {
367         struct vmcb *vmcb = get_host_vmcb(svm);
368
369         return vmcb->control.intercept_cr & (1U << bit);
370 }
371
372 static inline void set_dr_intercepts(struct vcpu_svm *svm)
373 {
374         struct vmcb *vmcb = get_host_vmcb(svm);
375
376         vmcb->control.intercept_dr = (1 << INTERCEPT_DR0_READ)
377                 | (1 << INTERCEPT_DR1_READ)
378                 | (1 << INTERCEPT_DR2_READ)
379                 | (1 << INTERCEPT_DR3_READ)
380                 | (1 << INTERCEPT_DR4_READ)
381                 | (1 << INTERCEPT_DR5_READ)
382                 | (1 << INTERCEPT_DR6_READ)
383                 | (1 << INTERCEPT_DR7_READ)
384                 | (1 << INTERCEPT_DR0_WRITE)
385                 | (1 << INTERCEPT_DR1_WRITE)
386                 | (1 << INTERCEPT_DR2_WRITE)
387                 | (1 << INTERCEPT_DR3_WRITE)
388                 | (1 << INTERCEPT_DR4_WRITE)
389                 | (1 << INTERCEPT_DR5_WRITE)
390                 | (1 << INTERCEPT_DR6_WRITE)
391                 | (1 << INTERCEPT_DR7_WRITE);
392
393         recalc_intercepts(svm);
394 }
395
396 static inline void clr_dr_intercepts(struct vcpu_svm *svm)
397 {
398         struct vmcb *vmcb = get_host_vmcb(svm);
399
400         vmcb->control.intercept_dr = 0;
401
402         recalc_intercepts(svm);
403 }
404
405 static inline void set_exception_intercept(struct vcpu_svm *svm, int bit)
406 {
407         struct vmcb *vmcb = get_host_vmcb(svm);
408
409         vmcb->control.intercept_exceptions |= (1U << bit);
410
411         recalc_intercepts(svm);
412 }
413
414 static inline void clr_exception_intercept(struct vcpu_svm *svm, int bit)
415 {
416         struct vmcb *vmcb = get_host_vmcb(svm);
417
418         vmcb->control.intercept_exceptions &= ~(1U << bit);
419
420         recalc_intercepts(svm);
421 }
422
423 static inline void set_intercept(struct vcpu_svm *svm, int bit)
424 {
425         struct vmcb *vmcb = get_host_vmcb(svm);
426
427         vmcb->control.intercept |= (1ULL << bit);
428
429         recalc_intercepts(svm);
430 }
431
432 static inline void clr_intercept(struct vcpu_svm *svm, int bit)
433 {
434         struct vmcb *vmcb = get_host_vmcb(svm);
435
436         vmcb->control.intercept &= ~(1ULL << bit);
437
438         recalc_intercepts(svm);
439 }
440
441 static inline void enable_gif(struct vcpu_svm *svm)
442 {
443         svm->vcpu.arch.hflags |= HF_GIF_MASK;
444 }
445
446 static inline void disable_gif(struct vcpu_svm *svm)
447 {
448         svm->vcpu.arch.hflags &= ~HF_GIF_MASK;
449 }
450
451 static inline bool gif_set(struct vcpu_svm *svm)
452 {
453         return !!(svm->vcpu.arch.hflags & HF_GIF_MASK);
454 }
455
456 static unsigned long iopm_base;
457
458 struct kvm_ldttss_desc {
459         u16 limit0;
460         u16 base0;
461         unsigned base1:8, type:5, dpl:2, p:1;
462         unsigned limit1:4, zero0:3, g:1, base2:8;
463         u32 base3;
464         u32 zero1;
465 } __attribute__((packed));
466
467 struct svm_cpu_data {
468         int cpu;
469
470         u64 asid_generation;
471         u32 max_asid;
472         u32 next_asid;
473         struct kvm_ldttss_desc *tss_desc;
474
475         struct page *save_area;
476 };
477
478 static DEFINE_PER_CPU(struct svm_cpu_data *, svm_data);
479
480 struct svm_init_data {
481         int cpu;
482         int r;
483 };
484
485 static const u32 msrpm_ranges[] = {0, 0xc0000000, 0xc0010000};
486
487 #define NUM_MSR_MAPS ARRAY_SIZE(msrpm_ranges)
488 #define MSRS_RANGE_SIZE 2048
489 #define MSRS_IN_RANGE (MSRS_RANGE_SIZE * 8 / 2)
490
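/*
 * Editor's note (illustrative example): every MSR occupies two consecutive
 * bits in the permission map (read intercept, then write intercept), giving
 * 4 MSRs per byte and 16 per u32.  For instance MSR_STAR (0xc0000081) falls
 * in the 0xc0000000 range, so its byte offset is 0x81/4 + 2048 = 2080 and
 * svm_msrpm_offset() returns the u32 offset 2080/4 = 520;
 * set_msr_interception() below then toggles bits 2 and 3 of msrpm[520]
 * (a clear bit means the guest may access the MSR directly).
 */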
491 static u32 svm_msrpm_offset(u32 msr)
492 {
493         u32 offset;
494         int i;
495
496         for (i = 0; i < NUM_MSR_MAPS; i++) {
497                 if (msr < msrpm_ranges[i] ||
498                     msr >= msrpm_ranges[i] + MSRS_IN_RANGE)
499                         continue;
500
501                 offset  = (msr - msrpm_ranges[i]) / 4; /* 4 msrs per u8 */
502                 offset += (i * MSRS_RANGE_SIZE);       /* add range offset */
503
504                 /* Now we have the u8 offset - but need the u32 offset */
505                 return offset / 4;
506         }
507
508         /* MSR not in any range */
509         return MSR_INVALID;
510 }
511
512 #define MAX_INST_SIZE 15
513
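/*
 * Editor's note: CLGI clears and STGI sets the Global Interrupt Flag.  KVM
 * performs the world switch (see the clgi()/stgi() pair around VMRUN later
 * in this file) with GIF clear, so host interrupts are held off until guest
 * state is fully in place.
 */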
514 static inline void clgi(void)
515 {
516         asm volatile (__ex(SVM_CLGI));
517 }
518
519 static inline void stgi(void)
520 {
521         asm volatile (__ex(SVM_STGI));
522 }
523
524 static inline void invlpga(unsigned long addr, u32 asid)
525 {
526         asm volatile (__ex(SVM_INVLPGA) : : "a"(addr), "c"(asid));
527 }
528
529 static int get_npt_level(void)
530 {
531 #ifdef CONFIG_X86_64
532         return PT64_ROOT_LEVEL;
533 #else
534         return PT32E_ROOT_LEVEL;
535 #endif
536 }
537
538 static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
539 {
540         vcpu->arch.efer = efer;
541         if (!npt_enabled && !(efer & EFER_LMA))
542                 efer &= ~EFER_LME;
543
544         to_svm(vcpu)->vmcb->save.efer = efer | EFER_SVME;
545         mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR);
546 }
547
548 static int is_external_interrupt(u32 info)
549 {
550         info &= SVM_EVTINJ_TYPE_MASK | SVM_EVTINJ_VALID;
551         return info == (SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR);
552 }
553
554 static u32 svm_get_interrupt_shadow(struct kvm_vcpu *vcpu)
555 {
556         struct vcpu_svm *svm = to_svm(vcpu);
557         u32 ret = 0;
558
559         if (svm->vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK)
560                 ret = KVM_X86_SHADOW_INT_STI | KVM_X86_SHADOW_INT_MOV_SS;
561         return ret;
562 }
563
564 static void svm_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
565 {
566         struct vcpu_svm *svm = to_svm(vcpu);
567
568         if (mask == 0)
569                 svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK;
570         else
571                 svm->vmcb->control.int_state |= SVM_INTERRUPT_SHADOW_MASK;
572
573 }
574
575 static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
576 {
577         struct vcpu_svm *svm = to_svm(vcpu);
578
579         if (svm->vmcb->control.next_rip != 0) {
580                 WARN_ON_ONCE(!static_cpu_has(X86_FEATURE_NRIPS));
581                 svm->next_rip = svm->vmcb->control.next_rip;
582         }
583
584         if (!svm->next_rip) {
585                 if (emulate_instruction(vcpu, EMULTYPE_SKIP) !=
586                                 EMULATE_DONE)
587                         printk(KERN_DEBUG "%s: NOP\n", __func__);
588                 return;
589         }
590         if (svm->next_rip - kvm_rip_read(vcpu) > MAX_INST_SIZE)
591                 printk(KERN_ERR "%s: ip 0x%lx next 0x%llx\n",
592                        __func__, kvm_rip_read(vcpu), svm->next_rip);
593
594         kvm_rip_write(vcpu, svm->next_rip);
595         svm_set_interrupt_shadow(vcpu, 0);
596 }
597
598 static void svm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
599                                 bool has_error_code, u32 error_code,
600                                 bool reinject)
601 {
602         struct vcpu_svm *svm = to_svm(vcpu);
603
604         /*
605          * If we are within a nested VM we'd better #VMEXIT and let the guest
606          * handle the exception
607          */
608         if (!reinject &&
609             nested_svm_check_exception(svm, nr, has_error_code, error_code))
610                 return;
611
612         if (nr == BP_VECTOR && !static_cpu_has(X86_FEATURE_NRIPS)) {
613                 unsigned long rip, old_rip = kvm_rip_read(&svm->vcpu);
614
615                 /*
616                  * For guest debugging where we have to reinject #BP if some
617                  * INT3 is guest-owned:
618                  * Emulate nRIP by moving RIP forward. Will fail if injection
619                  * raises a fault that is not intercepted. Still better than
620                  * failing in all cases.
621                  */
622                 skip_emulated_instruction(&svm->vcpu);
623                 rip = kvm_rip_read(&svm->vcpu);
624                 svm->int3_rip = rip + svm->vmcb->save.cs.base;
625                 svm->int3_injected = rip - old_rip;
626         }
627
628         svm->vmcb->control.event_inj = nr
629                 | SVM_EVTINJ_VALID
630                 | (has_error_code ? SVM_EVTINJ_VALID_ERR : 0)
631                 | SVM_EVTINJ_TYPE_EXEPT;
632         svm->vmcb->control.event_inj_err = error_code;
633 }
634
635 static void svm_init_erratum_383(void)
636 {
637         u32 low, high;
638         int err;
639         u64 val;
640
641         if (!static_cpu_has_bug(X86_BUG_AMD_TLB_MMATCH))
642                 return;
643
644         /* Use _safe variants to not break nested virtualization */
645         val = native_read_msr_safe(MSR_AMD64_DC_CFG, &err);
646         if (err)
647                 return;
648
649         val |= (1ULL << 47);
650
651         low  = lower_32_bits(val);
652         high = upper_32_bits(val);
653
654         native_write_msr_safe(MSR_AMD64_DC_CFG, low, high);
655
656         erratum_383_found = true;
657 }
658
659 static void svm_init_osvw(struct kvm_vcpu *vcpu)
660 {
661         /*
662          * Guests should see errata 400 and 415 as fixed (assuming that
663          * HLT and IO instructions are intercepted).
664          */
665         vcpu->arch.osvw.length = (osvw_len >= 3) ? (osvw_len) : 3;
666         vcpu->arch.osvw.status = osvw_status & ~(6ULL);
667
668         /*
669          * By increasing VCPU's osvw.length to 3 we are telling the guest that
670          * all osvw.status bits inside that length, including bit 0 (which is
671          * reserved for erratum 298), are valid. However, if host processor's
672          * osvw_len is 0 then osvw_status[0] carries no information. We need to
673          * be conservative here and therefore we tell the guest that erratum 298
674          * is present (because we really don't know).
675          */
676         if (osvw_len == 0 && boot_cpu_data.x86 == 0x10)
677                 vcpu->arch.osvw.status |= 1;
678 }
679
680 static int has_svm(void)
681 {
682         const char *msg;
683
684         if (!cpu_has_svm(&msg)) {
685                 printk(KERN_INFO "has_svm: %s\n", msg);
686                 return 0;
687         }
688
689         return 1;
690 }
691
692 static void svm_hardware_disable(void)
693 {
694         /* Make sure we clean up behind us */
695         if (static_cpu_has(X86_FEATURE_TSCRATEMSR))
696                 wrmsrl(MSR_AMD64_TSC_RATIO, TSC_RATIO_DEFAULT);
697
698         cpu_svm_disable();
699
700         amd_pmu_disable_virt();
701 }
702
703 static int svm_hardware_enable(void)
704 {
705
706         struct svm_cpu_data *sd;
707         uint64_t efer;
708         struct desc_ptr gdt_descr;
709         struct desc_struct *gdt;
710         int me = raw_smp_processor_id();
711
712         rdmsrl(MSR_EFER, efer);
713         if (efer & EFER_SVME)
714                 return -EBUSY;
715
716         if (!has_svm()) {
717                 pr_err("%s: err EOPNOTSUPP on %d\n", __func__, me);
718                 return -EINVAL;
719         }
720         sd = per_cpu(svm_data, me);
721         if (!sd) {
722                 pr_err("%s: svm_data is NULL on %d\n", __func__, me);
723                 return -EINVAL;
724         }
725
726         sd->asid_generation = 1;
727         sd->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1;
728         sd->next_asid = sd->max_asid + 1;
729
730         native_store_gdt(&gdt_descr);
731         gdt = (struct desc_struct *)gdt_descr.address;
732         sd->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS);
733
734         wrmsrl(MSR_EFER, efer | EFER_SVME);
735
736         wrmsrl(MSR_VM_HSAVE_PA, page_to_pfn(sd->save_area) << PAGE_SHIFT);
737
738         if (static_cpu_has(X86_FEATURE_TSCRATEMSR)) {
739                 wrmsrl(MSR_AMD64_TSC_RATIO, TSC_RATIO_DEFAULT);
740                 __this_cpu_write(current_tsc_ratio, TSC_RATIO_DEFAULT);
741         }
742
743
744         /*
745          * Get OSVW bits.
746          *
747          * Note that it is possible to have a system with mixed processor
748          * revisions and therefore different OSVW bits. If bits are not the same
749          * on different processors then choose the worst case (i.e. if erratum
750          * is present on one processor and not on another then assume that the
751          * erratum is present everywhere).
752          */
753         if (cpu_has(&boot_cpu_data, X86_FEATURE_OSVW)) {
754                 uint64_t len, status = 0;
755                 int err;
756
757                 len = native_read_msr_safe(MSR_AMD64_OSVW_ID_LENGTH, &err);
758                 if (!err)
759                         status = native_read_msr_safe(MSR_AMD64_OSVW_STATUS,
760                                                       &err);
761
762                 if (err)
763                         osvw_status = osvw_len = 0;
764                 else {
765                         if (len < osvw_len)
766                                 osvw_len = len;
767                         osvw_status |= status;
768                         osvw_status &= (1ULL << osvw_len) - 1;
769                 }
770         } else
771                 osvw_status = osvw_len = 0;
772
773         svm_init_erratum_383();
774
775         amd_pmu_enable_virt();
776
777         return 0;
778 }
779
780 static void svm_cpu_uninit(int cpu)
781 {
782         struct svm_cpu_data *sd = per_cpu(svm_data, raw_smp_processor_id());
783
784         if (!sd)
785                 return;
786
787         per_cpu(svm_data, raw_smp_processor_id()) = NULL;
788         __free_page(sd->save_area);
789         kfree(sd);
790 }
791
792 static int svm_cpu_init(int cpu)
793 {
794         struct svm_cpu_data *sd;
795         int r;
796
797         sd = kzalloc(sizeof(struct svm_cpu_data), GFP_KERNEL);
798         if (!sd)
799                 return -ENOMEM;
800         sd->cpu = cpu;
801         sd->save_area = alloc_page(GFP_KERNEL);
802         r = -ENOMEM;
803         if (!sd->save_area)
804                 goto err_1;
805
806         per_cpu(svm_data, cpu) = sd;
807
808         return 0;
809
810 err_1:
811         kfree(sd);
812         return r;
813
814 }
815
816 static bool valid_msr_intercept(u32 index)
817 {
818         int i;
819
820         for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++)
821                 if (direct_access_msrs[i].index == index)
822                         return true;
823
824         return false;
825 }
826
827 static void set_msr_interception(u32 *msrpm, unsigned msr,
828                                  int read, int write)
829 {
830         u8 bit_read, bit_write;
831         unsigned long tmp;
832         u32 offset;
833
834         /*
835          * If this warning triggers, extend the direct_access_msrs list at the
836          * beginning of the file.
837          */
838         WARN_ON(!valid_msr_intercept(msr));
839
840         offset    = svm_msrpm_offset(msr);
841         bit_read  = 2 * (msr & 0x0f);
842         bit_write = 2 * (msr & 0x0f) + 1;
843         tmp       = msrpm[offset];
844
845         BUG_ON(offset == MSR_INVALID);
846
847         read  ? clear_bit(bit_read,  &tmp) : set_bit(bit_read,  &tmp);
848         write ? clear_bit(bit_write, &tmp) : set_bit(bit_write, &tmp);
849
850         msrpm[offset] = tmp;
851 }
852
853 static void svm_vcpu_init_msrpm(u32 *msrpm)
854 {
855         int i;
856
857         memset(msrpm, 0xff, PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER));
858
859         for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) {
860                 if (!direct_access_msrs[i].always)
861                         continue;
862
863                 set_msr_interception(msrpm, direct_access_msrs[i].index, 1, 1);
864         }
865 }
866
867 static void add_msr_offset(u32 offset)
868 {
869         int i;
870
871         for (i = 0; i < MSRPM_OFFSETS; ++i) {
872
873                 /* Offset already in list? */
874                 if (msrpm_offsets[i] == offset)
875                         return;
876
877                 /* Slot used by another offset? */
878                 if (msrpm_offsets[i] != MSR_INVALID)
879                         continue;
880
881                 /* Add offset to list */
882                 msrpm_offsets[i] = offset;
883
884                 return;
885         }
886
887         /*
888          * If this BUG triggers, the msrpm_offsets table has overflowed. Just
889          * increase MSRPM_OFFSETS in this case.
890          */
891         BUG();
892 }
893
894 static void init_msrpm_offsets(void)
895 {
896         int i;
897
898         memset(msrpm_offsets, 0xff, sizeof(msrpm_offsets));
899
900         for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) {
901                 u32 offset;
902
903                 offset = svm_msrpm_offset(direct_access_msrs[i].index);
904                 BUG_ON(offset == MSR_INVALID);
905
906                 add_msr_offset(offset);
907         }
908 }
909
910 static void svm_enable_lbrv(struct vcpu_svm *svm)
911 {
912         u32 *msrpm = svm->msrpm;
913
914         svm->vmcb->control.lbr_ctl = 1;
915         set_msr_interception(msrpm, MSR_IA32_LASTBRANCHFROMIP, 1, 1);
916         set_msr_interception(msrpm, MSR_IA32_LASTBRANCHTOIP, 1, 1);
917         set_msr_interception(msrpm, MSR_IA32_LASTINTFROMIP, 1, 1);
918         set_msr_interception(msrpm, MSR_IA32_LASTINTTOIP, 1, 1);
919 }
920
921 static void svm_disable_lbrv(struct vcpu_svm *svm)
922 {
923         u32 *msrpm = svm->msrpm;
924
925         svm->vmcb->control.lbr_ctl = 0;
926         set_msr_interception(msrpm, MSR_IA32_LASTBRANCHFROMIP, 0, 0);
927         set_msr_interception(msrpm, MSR_IA32_LASTBRANCHTOIP, 0, 0);
928         set_msr_interception(msrpm, MSR_IA32_LASTINTFROMIP, 0, 0);
929         set_msr_interception(msrpm, MSR_IA32_LASTINTTOIP, 0, 0);
930 }
931
932 static __init int svm_hardware_setup(void)
933 {
934         int cpu;
935         struct page *iopm_pages;
936         void *iopm_va;
937         int r;
938
939         iopm_pages = alloc_pages(GFP_KERNEL, IOPM_ALLOC_ORDER);
940
941         if (!iopm_pages)
942                 return -ENOMEM;
943
944         iopm_va = page_address(iopm_pages);
945         memset(iopm_va, 0xff, PAGE_SIZE * (1 << IOPM_ALLOC_ORDER));
946         iopm_base = page_to_pfn(iopm_pages) << PAGE_SHIFT;
947
948         init_msrpm_offsets();
949
950         if (boot_cpu_has(X86_FEATURE_NX))
951                 kvm_enable_efer_bits(EFER_NX);
952
953         if (boot_cpu_has(X86_FEATURE_FXSR_OPT))
954                 kvm_enable_efer_bits(EFER_FFXSR);
955
956         if (boot_cpu_has(X86_FEATURE_TSCRATEMSR)) {
957                 kvm_has_tsc_control = true;
958                 kvm_max_tsc_scaling_ratio = TSC_RATIO_MAX;
959                 kvm_tsc_scaling_ratio_frac_bits = 32;
960         }
961
962         if (nested) {
963                 printk(KERN_INFO "kvm: Nested Virtualization enabled\n");
964                 kvm_enable_efer_bits(EFER_SVME | EFER_LMSLE);
965         }
966
967         for_each_possible_cpu(cpu) {
968                 r = svm_cpu_init(cpu);
969                 if (r)
970                         goto err;
971         }
972
973         if (!boot_cpu_has(X86_FEATURE_NPT))
974                 npt_enabled = false;
975
976         if (npt_enabled && !npt) {
977                 printk(KERN_INFO "kvm: Nested Paging disabled\n");
978                 npt_enabled = false;
979         }
980
981         if (npt_enabled) {
982                 printk(KERN_INFO "kvm: Nested Paging enabled\n");
983                 kvm_enable_tdp();
984         } else
985                 kvm_disable_tdp();
986
987         if (avic) {
988                 if (!npt_enabled ||
989                     !boot_cpu_has(X86_FEATURE_AVIC) ||
990                     !IS_ENABLED(CONFIG_X86_LOCAL_APIC))
991                         avic = false;
992                 else
993                         pr_info("AVIC enabled\n");
994         }
995
996         return 0;
997
998 err:
999         __free_pages(iopm_pages, IOPM_ALLOC_ORDER);
1000         iopm_base = 0;
1001         return r;
1002 }
1003
1004 static __exit void svm_hardware_unsetup(void)
1005 {
1006         int cpu;
1007
1008         for_each_possible_cpu(cpu)
1009                 svm_cpu_uninit(cpu);
1010
1011         __free_pages(pfn_to_page(iopm_base >> PAGE_SHIFT), IOPM_ALLOC_ORDER);
1012         iopm_base = 0;
1013 }
1014
1015 static void init_seg(struct vmcb_seg *seg)
1016 {
1017         seg->selector = 0;
1018         seg->attrib = SVM_SELECTOR_P_MASK | SVM_SELECTOR_S_MASK |
1019                       SVM_SELECTOR_WRITE_MASK; /* Read/Write Data Segment */
1020         seg->limit = 0xffff;
1021         seg->base = 0;
1022 }
1023
1024 static void init_sys_seg(struct vmcb_seg *seg, uint32_t type)
1025 {
1026         seg->selector = 0;
1027         seg->attrib = SVM_SELECTOR_P_MASK | type;
1028         seg->limit = 0xffff;
1029         seg->base = 0;
1030 }
1031
1032 static u64 svm_read_tsc_offset(struct kvm_vcpu *vcpu)
1033 {
1034         struct vcpu_svm *svm = to_svm(vcpu);
1035
1036         return svm->vmcb->control.tsc_offset;
1037 }
1038
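/*
 * Editor's note: when the vcpu is in guest mode, g_tsc_offset below preserves
 * the delta the L1 hypervisor programmed for its nested guest (current vmcb
 * offset minus the hsave offset).  The new host-requested offset is stored in
 * hsave and the active VMCB receives the sum, so the nested guest keeps L1's
 * adjustment on top of it.
 */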
1039 static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
1040 {
1041         struct vcpu_svm *svm = to_svm(vcpu);
1042         u64 g_tsc_offset = 0;
1043
1044         if (is_guest_mode(vcpu)) {
1045                 g_tsc_offset = svm->vmcb->control.tsc_offset -
1046                                svm->nested.hsave->control.tsc_offset;
1047                 svm->nested.hsave->control.tsc_offset = offset;
1048         } else
1049                 trace_kvm_write_tsc_offset(vcpu->vcpu_id,
1050                                            svm->vmcb->control.tsc_offset,
1051                                            offset);
1052
1053         svm->vmcb->control.tsc_offset = offset + g_tsc_offset;
1054
1055         mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
1056 }
1057
1058 static void svm_adjust_tsc_offset_guest(struct kvm_vcpu *vcpu, s64 adjustment)
1059 {
1060         struct vcpu_svm *svm = to_svm(vcpu);
1061
1062         svm->vmcb->control.tsc_offset += adjustment;
1063         if (is_guest_mode(vcpu))
1064                 svm->nested.hsave->control.tsc_offset += adjustment;
1065         else
1066                 trace_kvm_write_tsc_offset(vcpu->vcpu_id,
1067                                      svm->vmcb->control.tsc_offset - adjustment,
1068                                      svm->vmcb->control.tsc_offset);
1069
1070         mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
1071 }
1072
1073 static void avic_init_vmcb(struct vcpu_svm *svm)
1074 {
1075         struct vmcb *vmcb = svm->vmcb;
1076         struct kvm_arch *vm_data = &svm->vcpu.kvm->arch;
1077         phys_addr_t bpa = page_to_phys(svm->avic_backing_page);
1078         phys_addr_t lpa = page_to_phys(vm_data->avic_logical_id_table_page);
1079         phys_addr_t ppa = page_to_phys(vm_data->avic_physical_id_table_page);
1080
1081         vmcb->control.avic_backing_page = bpa & AVIC_HPA_MASK;
1082         vmcb->control.avic_logical_id = lpa & AVIC_HPA_MASK;
1083         vmcb->control.avic_physical_id = ppa & AVIC_HPA_MASK;
1084         vmcb->control.avic_physical_id |= AVIC_MAX_PHYSICAL_ID_COUNT;
1085         vmcb->control.int_ctl |= AVIC_ENABLE_MASK;
1086         svm->vcpu.arch.apicv_active = true;
1087 }
1088
1089 static void init_vmcb(struct vcpu_svm *svm)
1090 {
1091         struct vmcb_control_area *control = &svm->vmcb->control;
1092         struct vmcb_save_area *save = &svm->vmcb->save;
1093
1094         svm->vcpu.fpu_active = 1;
1095         svm->vcpu.arch.hflags = 0;
1096
1097         set_cr_intercept(svm, INTERCEPT_CR0_READ);
1098         set_cr_intercept(svm, INTERCEPT_CR3_READ);
1099         set_cr_intercept(svm, INTERCEPT_CR4_READ);
1100         set_cr_intercept(svm, INTERCEPT_CR0_WRITE);
1101         set_cr_intercept(svm, INTERCEPT_CR3_WRITE);
1102         set_cr_intercept(svm, INTERCEPT_CR4_WRITE);
1103         if (!kvm_vcpu_apicv_active(&svm->vcpu))
1104                 set_cr_intercept(svm, INTERCEPT_CR8_WRITE);
1105
1106         set_dr_intercepts(svm);
1107
1108         set_exception_intercept(svm, PF_VECTOR);
1109         set_exception_intercept(svm, UD_VECTOR);
1110         set_exception_intercept(svm, MC_VECTOR);
1111         set_exception_intercept(svm, AC_VECTOR);
1112         set_exception_intercept(svm, DB_VECTOR);
1113
1114         set_intercept(svm, INTERCEPT_INTR);
1115         set_intercept(svm, INTERCEPT_NMI);
1116         set_intercept(svm, INTERCEPT_SMI);
1117         set_intercept(svm, INTERCEPT_SELECTIVE_CR0);
1118         set_intercept(svm, INTERCEPT_RDPMC);
1119         set_intercept(svm, INTERCEPT_CPUID);
1120         set_intercept(svm, INTERCEPT_INVD);
1121         set_intercept(svm, INTERCEPT_HLT);
1122         set_intercept(svm, INTERCEPT_INVLPG);
1123         set_intercept(svm, INTERCEPT_INVLPGA);
1124         set_intercept(svm, INTERCEPT_IOIO_PROT);
1125         set_intercept(svm, INTERCEPT_MSR_PROT);
1126         set_intercept(svm, INTERCEPT_TASK_SWITCH);
1127         set_intercept(svm, INTERCEPT_SHUTDOWN);
1128         set_intercept(svm, INTERCEPT_VMRUN);
1129         set_intercept(svm, INTERCEPT_VMMCALL);
1130         set_intercept(svm, INTERCEPT_VMLOAD);
1131         set_intercept(svm, INTERCEPT_VMSAVE);
1132         set_intercept(svm, INTERCEPT_STGI);
1133         set_intercept(svm, INTERCEPT_CLGI);
1134         set_intercept(svm, INTERCEPT_SKINIT);
1135         set_intercept(svm, INTERCEPT_WBINVD);
1136         set_intercept(svm, INTERCEPT_MONITOR);
1137         set_intercept(svm, INTERCEPT_MWAIT);
1138         set_intercept(svm, INTERCEPT_XSETBV);
1139
1140         control->iopm_base_pa = iopm_base;
1141         control->msrpm_base_pa = __pa(svm->msrpm);
1142         control->int_ctl = V_INTR_MASKING_MASK;
1143
1144         init_seg(&save->es);
1145         init_seg(&save->ss);
1146         init_seg(&save->ds);
1147         init_seg(&save->fs);
1148         init_seg(&save->gs);
1149
1150         save->cs.selector = 0xf000;
1151         save->cs.base = 0xffff0000;
1152         /* Executable/Readable Code Segment */
1153         save->cs.attrib = SVM_SELECTOR_READ_MASK | SVM_SELECTOR_P_MASK |
1154                 SVM_SELECTOR_S_MASK | SVM_SELECTOR_CODE_MASK;
1155         save->cs.limit = 0xffff;
1156
1157         save->gdtr.limit = 0xffff;
1158         save->idtr.limit = 0xffff;
1159
1160         init_sys_seg(&save->ldtr, SEG_TYPE_LDT);
1161         init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16);
1162
1163         svm_set_efer(&svm->vcpu, 0);
1164         save->dr6 = 0xffff0ff0;
1165         kvm_set_rflags(&svm->vcpu, 2);
1166         save->rip = 0x0000fff0;
1167         svm->vcpu.arch.regs[VCPU_REGS_RIP] = save->rip;
1168
1169         /*
1170          * svm_set_cr0() sets PG and WP and clears NW and CD on save->cr0.
1171          * It also updates the guest-visible cr0 value.
1172          */
1173         svm_set_cr0(&svm->vcpu, X86_CR0_NW | X86_CR0_CD | X86_CR0_ET);
1174         kvm_mmu_reset_context(&svm->vcpu);
1175
1176         save->cr4 = X86_CR4_PAE;
1177         /* rdx = ?? */
1178
1179         if (npt_enabled) {
1180                 /* Setup VMCB for Nested Paging */
1181                 control->nested_ctl = 1;
1182                 clr_intercept(svm, INTERCEPT_INVLPG);
1183                 clr_exception_intercept(svm, PF_VECTOR);
1184                 clr_cr_intercept(svm, INTERCEPT_CR3_READ);
1185                 clr_cr_intercept(svm, INTERCEPT_CR3_WRITE);
1186                 save->g_pat = svm->vcpu.arch.pat;
1187                 save->cr3 = 0;
1188                 save->cr4 = 0;
1189         }
1190         svm->asid_generation = 0;
1191
1192         svm->nested.vmcb = 0;
1193         svm->vcpu.arch.hflags = 0;
1194
1195         if (boot_cpu_has(X86_FEATURE_PAUSEFILTER)) {
1196                 control->pause_filter_count = 3000;
1197                 set_intercept(svm, INTERCEPT_PAUSE);
1198         }
1199
1200         if (avic)
1201                 avic_init_vmcb(svm);
1202
1203         mark_all_dirty(svm->vmcb);
1204
1205         enable_gif(svm);
1206
1207 }
1208
1209 static u64 *avic_get_physical_id_entry(struct kvm_vcpu *vcpu, int index)
1210 {
1211         u64 *avic_physical_id_table;
1212         struct kvm_arch *vm_data = &vcpu->kvm->arch;
1213
1214         if (index >= AVIC_MAX_PHYSICAL_ID_COUNT)
1215                 return NULL;
1216
1217         avic_physical_id_table = page_address(vm_data->avic_physical_id_table_page);
1218
1219         return &avic_physical_id_table[index];
1220 }
1221
1222 /**
1223  * Note:
1224  * AVIC hardware walks the nested page table to check permissions,
1225  * but does not use the SPA address specified in the leaf page
1226  * table entry since it uses the address in the AVIC_BACKING_PAGE pointer
1227  * field of the VMCB. Therefore, we set up the
1228  * APIC_ACCESS_PAGE_PRIVATE_MEMSLOT (4KB) here.
1229  */
1230 static int avic_init_access_page(struct kvm_vcpu *vcpu)
1231 {
1232         struct kvm *kvm = vcpu->kvm;
1233         int ret;
1234
1235         if (kvm->arch.apic_access_page_done)
1236                 return 0;
1237
1238         ret = x86_set_memory_region(kvm,
1239                                     APIC_ACCESS_PAGE_PRIVATE_MEMSLOT,
1240                                     APIC_DEFAULT_PHYS_BASE,
1241                                     PAGE_SIZE);
1242         if (ret)
1243                 return ret;
1244
1245         kvm->arch.apic_access_page_done = true;
1246         return 0;
1247 }
1248
1249 static int avic_init_backing_page(struct kvm_vcpu *vcpu)
1250 {
1251         int ret;
1252         u64 *entry, new_entry;
1253         int id = vcpu->vcpu_id;
1254         struct vcpu_svm *svm = to_svm(vcpu);
1255
1256         ret = avic_init_access_page(vcpu);
1257         if (ret)
1258                 return ret;
1259
1260         if (id >= AVIC_MAX_PHYSICAL_ID_COUNT)
1261                 return -EINVAL;
1262
1263         if (!svm->vcpu.arch.apic->regs)
1264                 return -EINVAL;
1265
1266         svm->avic_backing_page = virt_to_page(svm->vcpu.arch.apic->regs);
1267
1268         /* Setting AVIC backing page address in the phy APIC ID table */
1269         entry = avic_get_physical_id_entry(vcpu, id);
1270         if (!entry)
1271                 return -EINVAL;
1272
1273         new_entry = READ_ONCE(*entry);
1274         new_entry = (page_to_phys(svm->avic_backing_page) &
1275                      AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK) |
1276                      AVIC_PHYSICAL_ID_ENTRY_VALID_MASK;
1277         WRITE_ONCE(*entry, new_entry);
1278
1279         svm->avic_physical_id_cache = entry;
1280
1281         return 0;
1282 }
1283
1284 static void avic_vm_destroy(struct kvm *kvm)
1285 {
1286         struct kvm_arch *vm_data = &kvm->arch;
1287
1288         if (vm_data->avic_logical_id_table_page)
1289                 __free_page(vm_data->avic_logical_id_table_page);
1290         if (vm_data->avic_physical_id_table_page)
1291                 __free_page(vm_data->avic_physical_id_table_page);
1292 }
1293
1294 static int avic_vm_init(struct kvm *kvm)
1295 {
1296         int err = -ENOMEM;
1297         struct kvm_arch *vm_data = &kvm->arch;
1298         struct page *p_page;
1299         struct page *l_page;
1300
1301         if (!avic)
1302                 return 0;
1303
1304         /* Allocating physical APIC ID table (4KB) */
1305         p_page = alloc_page(GFP_KERNEL);
1306         if (!p_page)
1307                 goto free_avic;
1308
1309         vm_data->avic_physical_id_table_page = p_page;
1310         clear_page(page_address(p_page));
1311
1312         /* Allocating logical APIC ID table (4KB) */
1313         l_page = alloc_page(GFP_KERNEL);
1314         if (!l_page)
1315                 goto free_avic;
1316
1317         vm_data->avic_logical_id_table_page = l_page;
1318         clear_page(page_address(l_page));
1319
1320         return 0;
1321
1322 free_avic:
1323         avic_vm_destroy(kvm);
1324         return err;
1325 }
1326
1327 /**
1328  * This function is called during VCPU halt/unhalt.
1329  */
1330 static void avic_set_running(struct kvm_vcpu *vcpu, bool is_run)
1331 {
1332         u64 entry;
1333         int h_physical_id = kvm_cpu_get_apicid(vcpu->cpu);
1334         struct vcpu_svm *svm = to_svm(vcpu);
1335
1336         if (!kvm_vcpu_apicv_active(vcpu))
1337                 return;
1338
1339         svm->avic_is_running = is_run;
1340
1341         /* ID = 0xff (broadcast), ID > 0xff (reserved) */
1342         if (WARN_ON(h_physical_id >= AVIC_MAX_PHYSICAL_ID_COUNT))
1343                 return;
1344
1345         entry = READ_ONCE(*(svm->avic_physical_id_cache));
1346         WARN_ON(is_run == !!(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK));
1347
1348         entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
1349         if (is_run)
1350                 entry |= AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
1351         WRITE_ONCE(*(svm->avic_physical_id_cache), entry);
1352 }
1353
1354 static void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
1355 {
1356         u64 entry;
1357         /* ID = 0xff (broadcast), ID > 0xff (reserved) */
1358         int h_physical_id = kvm_cpu_get_apicid(cpu);
1359         struct vcpu_svm *svm = to_svm(vcpu);
1360
1361         if (!kvm_vcpu_apicv_active(vcpu))
1362                 return;
1363
1364         if (WARN_ON(h_physical_id >= AVIC_MAX_PHYSICAL_ID_COUNT))
1365                 return;
1366
1367         entry = READ_ONCE(*(svm->avic_physical_id_cache));
1368         WARN_ON(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK);
1369
1370         entry &= ~AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK;
1371         entry |= (h_physical_id & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK);
1372
1373         entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
1374         if (svm->avic_is_running)
1375                 entry |= AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
1376
1377         WRITE_ONCE(*(svm->avic_physical_id_cache), entry);
1378 }
1379
1380 static void avic_vcpu_put(struct kvm_vcpu *vcpu)
1381 {
1382         u64 entry;
1383         struct vcpu_svm *svm = to_svm(vcpu);
1384
1385         if (!kvm_vcpu_apicv_active(vcpu))
1386                 return;
1387
1388         entry = READ_ONCE(*(svm->avic_physical_id_cache));
1389         entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
1390         WRITE_ONCE(*(svm->avic_physical_id_cache), entry);
1391 }
1392
1393 static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
1394 {
1395         struct vcpu_svm *svm = to_svm(vcpu);
1396         u32 dummy;
1397         u32 eax = 1;
1398
1399         if (!init_event) {
1400                 svm->vcpu.arch.apic_base = APIC_DEFAULT_PHYS_BASE |
1401                                            MSR_IA32_APICBASE_ENABLE;
1402                 if (kvm_vcpu_is_reset_bsp(&svm->vcpu))
1403                         svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP;
1404         }
1405         init_vmcb(svm);
1406
1407         kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy);
1408         kvm_register_write(vcpu, VCPU_REGS_RDX, eax);
1409
1410         if (kvm_vcpu_apicv_active(vcpu) && !init_event)
1411                 avic_update_vapic_bar(svm, APIC_DEFAULT_PHYS_BASE);
1412 }
1413
1414 static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
1415 {
1416         struct vcpu_svm *svm;
1417         struct page *page;
1418         struct page *msrpm_pages;
1419         struct page *hsave_page;
1420         struct page *nested_msrpm_pages;
1421         int err;
1422
1423         svm = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
1424         if (!svm) {
1425                 err = -ENOMEM;
1426                 goto out;
1427         }
1428
1429         err = kvm_vcpu_init(&svm->vcpu, kvm, id);
1430         if (err)
1431                 goto free_svm;
1432
1433         err = -ENOMEM;
1434         page = alloc_page(GFP_KERNEL);
1435         if (!page)
1436                 goto uninit;
1437
1438         msrpm_pages = alloc_pages(GFP_KERNEL, MSRPM_ALLOC_ORDER);
1439         if (!msrpm_pages)
1440                 goto free_page1;
1441
1442         nested_msrpm_pages = alloc_pages(GFP_KERNEL, MSRPM_ALLOC_ORDER);
1443         if (!nested_msrpm_pages)
1444                 goto free_page2;
1445
1446         hsave_page = alloc_page(GFP_KERNEL);
1447         if (!hsave_page)
1448                 goto free_page3;
1449
1450         if (avic) {
1451                 err = avic_init_backing_page(&svm->vcpu);
1452                 if (err)
1453                         goto free_page4;
1454         }
1455
1456         /* We initialize this flag to true to make sure that the is_running
1457          * bit is set the first time the vcpu is loaded.
1458          */
1459         svm->avic_is_running = true;
1460
1461         svm->nested.hsave = page_address(hsave_page);
1462
1463         svm->msrpm = page_address(msrpm_pages);
1464         svm_vcpu_init_msrpm(svm->msrpm);
1465
1466         svm->nested.msrpm = page_address(nested_msrpm_pages);
1467         svm_vcpu_init_msrpm(svm->nested.msrpm);
1468
1469         svm->vmcb = page_address(page);
1470         clear_page(svm->vmcb);
1471         svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT;
1472         svm->asid_generation = 0;
1473         init_vmcb(svm);
1474
1475         svm_init_osvw(&svm->vcpu);
1476
1477         return &svm->vcpu;
1478
1479 free_page4:
1480         __free_page(hsave_page);
1481 free_page3:
1482         __free_pages(nested_msrpm_pages, MSRPM_ALLOC_ORDER);
1483 free_page2:
1484         __free_pages(msrpm_pages, MSRPM_ALLOC_ORDER);
1485 free_page1:
1486         __free_page(page);
1487 uninit:
1488         kvm_vcpu_uninit(&svm->vcpu);
1489 free_svm:
1490         kmem_cache_free(kvm_vcpu_cache, svm);
1491 out:
1492         return ERR_PTR(err);
1493 }
1494
1495 static void svm_free_vcpu(struct kvm_vcpu *vcpu)
1496 {
1497         struct vcpu_svm *svm = to_svm(vcpu);
1498
1499         __free_page(pfn_to_page(svm->vmcb_pa >> PAGE_SHIFT));
1500         __free_pages(virt_to_page(svm->msrpm), MSRPM_ALLOC_ORDER);
1501         __free_page(virt_to_page(svm->nested.hsave));
1502         __free_pages(virt_to_page(svm->nested.msrpm), MSRPM_ALLOC_ORDER);
1503         kvm_vcpu_uninit(vcpu);
1504         kmem_cache_free(kvm_vcpu_cache, svm);
1505 }
1506
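/*
 * Editor's note: svm_vcpu_load() snapshots the host MSRs listed in
 * host_save_user_msrs and, on CPUs with TSC ratio support, programs the
 * guest's scaling ratio; the per-cpu current_tsc_ratio cache lets it skip
 * the WRMSR when the ratio is already current.
 */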
1507 static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
1508 {
1509         struct vcpu_svm *svm = to_svm(vcpu);
1510         int i;
1511
1512         if (unlikely(cpu != vcpu->cpu)) {
1513                 svm->asid_generation = 0;
1514                 mark_all_dirty(svm->vmcb);
1515         }
1516
1517 #ifdef CONFIG_X86_64
1518         rdmsrl(MSR_GS_BASE, to_svm(vcpu)->host.gs_base);
1519 #endif
1520         savesegment(fs, svm->host.fs);
1521         savesegment(gs, svm->host.gs);
1522         svm->host.ldt = kvm_read_ldt();
1523
1524         for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
1525                 rdmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
1526
1527         if (static_cpu_has(X86_FEATURE_TSCRATEMSR)) {
1528                 u64 tsc_ratio = vcpu->arch.tsc_scaling_ratio;
1529                 if (tsc_ratio != __this_cpu_read(current_tsc_ratio)) {
1530                         __this_cpu_write(current_tsc_ratio, tsc_ratio);
1531                         wrmsrl(MSR_AMD64_TSC_RATIO, tsc_ratio);
1532                 }
1533         }
1534
1535         avic_vcpu_load(vcpu, cpu);
1536 }
1537
1538 static void svm_vcpu_put(struct kvm_vcpu *vcpu)
1539 {
1540         struct vcpu_svm *svm = to_svm(vcpu);
1541         int i;
1542
1543         avic_vcpu_put(vcpu);
1544
1545         ++vcpu->stat.host_state_reload;
1546         kvm_load_ldt(svm->host.ldt);
1547 #ifdef CONFIG_X86_64
1548         loadsegment(fs, svm->host.fs);
1549         wrmsrl(MSR_KERNEL_GS_BASE, current->thread.gsbase);
1550         load_gs_index(svm->host.gs);
1551 #else
1552 #ifdef CONFIG_X86_32_LAZY_GS
1553         loadsegment(gs, svm->host.gs);
1554 #endif
1555 #endif
1556         for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
1557                 wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
1558 }
1559
1560 static void svm_vcpu_blocking(struct kvm_vcpu *vcpu)
1561 {
1562         avic_set_running(vcpu, false);
1563 }
1564
1565 static void svm_vcpu_unblocking(struct kvm_vcpu *vcpu)
1566 {
1567         avic_set_running(vcpu, true);
1568 }
1569
1570 static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu)
1571 {
1572         return to_svm(vcpu)->vmcb->save.rflags;
1573 }
1574
1575 static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
1576 {
1577         /*
1578          * Any change of EFLAGS.VM is accompanied by a reload of SS
1579          * (caused by either a task switch or an inter-privilege IRET),
1580          * so we do not need to update the CPL here.
1581          */
1582         to_svm(vcpu)->vmcb->save.rflags = rflags;
1583 }
1584
1585 static u32 svm_get_pkru(struct kvm_vcpu *vcpu)
1586 {
1587         return 0;
1588 }
1589
1590 static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
1591 {
1592         switch (reg) {
1593         case VCPU_EXREG_PDPTR:
1594                 BUG_ON(!npt_enabled);
1595                 load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu));
1596                 break;
1597         default:
1598                 BUG();
1599         }
1600 }
1601
1602 static void svm_set_vintr(struct vcpu_svm *svm)
1603 {
1604         set_intercept(svm, INTERCEPT_VINTR);
1605 }
1606
1607 static void svm_clear_vintr(struct vcpu_svm *svm)
1608 {
1609         clr_intercept(svm, INTERCEPT_VINTR);
1610 }
1611
1612 static struct vmcb_seg *svm_seg(struct kvm_vcpu *vcpu, int seg)
1613 {
1614         struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save;
1615
1616         switch (seg) {
1617         case VCPU_SREG_CS: return &save->cs;
1618         case VCPU_SREG_DS: return &save->ds;
1619         case VCPU_SREG_ES: return &save->es;
1620         case VCPU_SREG_FS: return &save->fs;
1621         case VCPU_SREG_GS: return &save->gs;
1622         case VCPU_SREG_SS: return &save->ss;
1623         case VCPU_SREG_TR: return &save->tr;
1624         case VCPU_SREG_LDTR: return &save->ldtr;
1625         }
1626         BUG();
1627         return NULL;
1628 }
1629
1630 static u64 svm_get_segment_base(struct kvm_vcpu *vcpu, int seg)
1631 {
1632         struct vmcb_seg *s = svm_seg(vcpu, seg);
1633
1634         return s->base;
1635 }
1636
1637 static void svm_get_segment(struct kvm_vcpu *vcpu,
1638                             struct kvm_segment *var, int seg)
1639 {
1640         struct vmcb_seg *s = svm_seg(vcpu, seg);
1641
1642         var->base = s->base;
1643         var->limit = s->limit;
1644         var->selector = s->selector;
1645         var->type = s->attrib & SVM_SELECTOR_TYPE_MASK;
1646         var->s = (s->attrib >> SVM_SELECTOR_S_SHIFT) & 1;
1647         var->dpl = (s->attrib >> SVM_SELECTOR_DPL_SHIFT) & 3;
1648         var->present = (s->attrib >> SVM_SELECTOR_P_SHIFT) & 1;
1649         var->avl = (s->attrib >> SVM_SELECTOR_AVL_SHIFT) & 1;
1650         var->l = (s->attrib >> SVM_SELECTOR_L_SHIFT) & 1;
1651         var->db = (s->attrib >> SVM_SELECTOR_DB_SHIFT) & 1;
1652
1653         /*
1654          * AMD CPUs circa 2014 track the G bit for all segments except CS.
1655          * However, the SVM spec states that the G bit is not observed by the
1656          * CPU, and some VMware virtual CPUs drop the G bit for all segments.
1657          * So let's synthesize a legal G bit for all segments; this helps
1658          * running KVM nested. It also helps cross-vendor migration, because
1659          * Intel's vmentry has a check on the 'G' bit.
1660          */
1661         var->g = s->limit > 0xfffff;
1662
1663         /*
1664          * AMD's VMCB does not have an explicit unusable field, so emulate it
1665          * for cross vendor migration purposes by "not present"
1666          */
1667         var->unusable = !var->present || (var->type == 0);
1668
1669         switch (seg) {
1670         case VCPU_SREG_TR:
1671                 /*
1672                  * Work around a bug where the busy flag in the tr selector
1673                  * isn't exposed
1674                  */
1675                 var->type |= 0x2;
1676                 break;
1677         case VCPU_SREG_DS:
1678         case VCPU_SREG_ES:
1679         case VCPU_SREG_FS:
1680         case VCPU_SREG_GS:
1681                 /*
1682                  * The accessed bit must always be set in the segment
1683                  * descriptor cache; although it can be cleared in the
1684                  * descriptor itself, the cached copy always remains 1.
1685                  * Since Intel checks this bit, set it here to support
1686                  * cross-vendor migration.
1687                  */
1688                 if (!var->unusable)
1689                         var->type |= 0x1;
1690                 break;
1691         case VCPU_SREG_SS:
1692                 /*
1693                  * On AMD CPUs sometimes the DB bit in the segment
1694                  * descriptor is left as 1, although the whole segment has
1695                  * been made unusable. Clear it here to pass an Intel VMX
1696                  * entry check when cross vendor migrating.
1697                  */
1698                 if (var->unusable)
1699                         var->db = 0;
1700                 var->dpl = to_svm(vcpu)->vmcb->save.cpl;
1701                 break;
1702         }
1703 }
1704
1705 static int svm_get_cpl(struct kvm_vcpu *vcpu)
1706 {
1707         struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save;
1708
1709         return save->cpl;
1710 }
1711
1712 static void svm_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
1713 {
1714         struct vcpu_svm *svm = to_svm(vcpu);
1715
1716         dt->size = svm->vmcb->save.idtr.limit;
1717         dt->address = svm->vmcb->save.idtr.base;
1718 }
1719
1720 static void svm_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
1721 {
1722         struct vcpu_svm *svm = to_svm(vcpu);
1723
1724         svm->vmcb->save.idtr.limit = dt->size;
1725         svm->vmcb->save.idtr.base = dt->address;
1726         mark_dirty(svm->vmcb, VMCB_DT);
1727 }
1728
1729 static void svm_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
1730 {
1731         struct vcpu_svm *svm = to_svm(vcpu);
1732
1733         dt->size = svm->vmcb->save.gdtr.limit;
1734         dt->address = svm->vmcb->save.gdtr.base;
1735 }
1736
1737 static void svm_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
1738 {
1739         struct vcpu_svm *svm = to_svm(vcpu);
1740
1741         svm->vmcb->save.gdtr.limit = dt->size;
1742         svm->vmcb->save.gdtr.base = dt->address;
1743         mark_dirty(svm->vmcb, VMCB_DT);
1744 }
1745
1746 static void svm_decache_cr0_guest_bits(struct kvm_vcpu *vcpu)
1747 {
1748 }
1749
1750 static void svm_decache_cr3(struct kvm_vcpu *vcpu)
1751 {
1752 }
1753
1754 static void svm_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
1755 {
1756 }
1757
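     /*
      * Decide whether CR0 accesses need to be intercepted.  If the guest's
      * view of CR0 matches the value in the VMCB and the guest FPU is
      * active, the CR0 read/write intercepts can be dropped; otherwise keep
      * them so the guest never observes the host-side differences (such as
      * the CR0 bits faked while the FPU is inactive).
      */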
1758 static void update_cr0_intercept(struct vcpu_svm *svm)
1759 {
1760         ulong gcr0 = svm->vcpu.arch.cr0;
1761         u64 *hcr0 = &svm->vmcb->save.cr0;
1762
1763         if (!svm->vcpu.fpu_active)
1764                 *hcr0 |= SVM_CR0_SELECTIVE_MASK;
1765         else
1766                 *hcr0 = (*hcr0 & ~SVM_CR0_SELECTIVE_MASK)
1767                         | (gcr0 & SVM_CR0_SELECTIVE_MASK);
1768
1769         mark_dirty(svm->vmcb, VMCB_CR);
1770
1771         if (gcr0 == *hcr0 && svm->vcpu.fpu_active) {
1772                 clr_cr_intercept(svm, INTERCEPT_CR0_READ);
1773                 clr_cr_intercept(svm, INTERCEPT_CR0_WRITE);
1774         } else {
1775                 set_cr_intercept(svm, INTERCEPT_CR0_READ);
1776                 set_cr_intercept(svm, INTERCEPT_CR0_WRITE);
1777         }
1778 }
1779
1780 static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
1781 {
1782         struct vcpu_svm *svm = to_svm(vcpu);
1783
1784 #ifdef CONFIG_X86_64
1785         if (vcpu->arch.efer & EFER_LME) {
1786                 if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
1787                         vcpu->arch.efer |= EFER_LMA;
1788                         svm->vmcb->save.efer |= EFER_LMA | EFER_LME;
1789                 }
1790
1791                 if (is_paging(vcpu) && !(cr0 & X86_CR0_PG)) {
1792                         vcpu->arch.efer &= ~EFER_LMA;
1793                         svm->vmcb->save.efer &= ~(EFER_LMA | EFER_LME);
1794                 }
1795         }
1796 #endif
1797         vcpu->arch.cr0 = cr0;
1798
1799         if (!npt_enabled)
1800                 cr0 |= X86_CR0_PG | X86_CR0_WP;
1801
1802         if (!vcpu->fpu_active)
1803                 cr0 |= X86_CR0_TS;
1804         /*
1805          * Re-enable caching here because the QEMU BIOS
1806          * does not do it; leaving the cache disabled causes a
1807          * noticeable delay at reboot.
1808          */
1809         if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
1810                 cr0 &= ~(X86_CR0_CD | X86_CR0_NW);
1811         svm->vmcb->save.cr0 = cr0;
1812         mark_dirty(svm->vmcb, VMCB_CR);
1813         update_cr0_intercept(svm);
1814 }
1815
1816 static int svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
1817 {
1818         unsigned long host_cr4_mce = cr4_read_shadow() & X86_CR4_MCE;
1819         unsigned long old_cr4 = to_svm(vcpu)->vmcb->save.cr4;
1820
1821         if (cr4 & X86_CR4_VMXE)
1822                 return 1;
1823
1824         if (npt_enabled && ((old_cr4 ^ cr4) & X86_CR4_PGE))
1825                 svm_flush_tlb(vcpu);
1826
1827         vcpu->arch.cr4 = cr4;
1828         if (!npt_enabled)
1829                 cr4 |= X86_CR4_PAE;
1830         cr4 |= host_cr4_mce;
1831         to_svm(vcpu)->vmcb->save.cr4 = cr4;
1832         mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR);
1833         return 0;
1834 }
1835
1836 static void svm_set_segment(struct kvm_vcpu *vcpu,
1837                             struct kvm_segment *var, int seg)
1838 {
1839         struct vcpu_svm *svm = to_svm(vcpu);
1840         struct vmcb_seg *s = svm_seg(vcpu, seg);
1841
1842         s->base = var->base;
1843         s->limit = var->limit;
1844         s->selector = var->selector;
1845         if (var->unusable)
1846                 s->attrib = 0;
1847         else {
1848                 s->attrib = (var->type & SVM_SELECTOR_TYPE_MASK);
1849                 s->attrib |= (var->s & 1) << SVM_SELECTOR_S_SHIFT;
1850                 s->attrib |= (var->dpl & 3) << SVM_SELECTOR_DPL_SHIFT;
1851                 s->attrib |= (var->present & 1) << SVM_SELECTOR_P_SHIFT;
1852                 s->attrib |= (var->avl & 1) << SVM_SELECTOR_AVL_SHIFT;
1853                 s->attrib |= (var->l & 1) << SVM_SELECTOR_L_SHIFT;
1854                 s->attrib |= (var->db & 1) << SVM_SELECTOR_DB_SHIFT;
1855                 s->attrib |= (var->g & 1) << SVM_SELECTOR_G_SHIFT;
1856         }
1857
1858         /*
1859          * This is always accurate, except if SYSRET returned to a segment
1860          * with SS.DPL != 3.  Intel does not have this quirk, and always
1861          * forces SS.DPL to 3 on sysret, so we ignore that case; fixing it
1862          * would entail passing the CPL to userspace and back.
1863          */
1864         if (seg == VCPU_SREG_SS)
1865                 svm->vmcb->save.cpl = (s->attrib >> SVM_SELECTOR_DPL_SHIFT) & 3;
1866
1867         mark_dirty(svm->vmcb, VMCB_SEG);
1868 }
1869
1870 static void update_bp_intercept(struct kvm_vcpu *vcpu)
1871 {
1872         struct vcpu_svm *svm = to_svm(vcpu);
1873
1874         clr_exception_intercept(svm, BP_VECTOR);
1875
1876         if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) {
1877                 if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
1878                         set_exception_intercept(svm, BP_VECTOR);
1879         } else
1880                 vcpu->guest_debug = 0;
1881 }
1882
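     /*
      * Allocate a fresh ASID for this vcpu from the per-CPU pool.  When the
      * pool (1..max_asid) is exhausted, bump the generation counter and
      * request a flush of all ASIDs on the next VMRUN, then start handing
      * out ASIDs from 1 again.
      */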
1883 static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd)
1884 {
1885         if (sd->next_asid > sd->max_asid) {
1886                 ++sd->asid_generation;
1887                 sd->next_asid = 1;
1888                 svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ALL_ASID;
1889         }
1890
1891         svm->asid_generation = sd->asid_generation;
1892         svm->vmcb->control.asid = sd->next_asid++;
1893
1894         mark_dirty(svm->vmcb, VMCB_ASID);
1895 }
1896
1897 static u64 svm_get_dr6(struct kvm_vcpu *vcpu)
1898 {
1899         return to_svm(vcpu)->vmcb->save.dr6;
1900 }
1901
1902 static void svm_set_dr6(struct kvm_vcpu *vcpu, unsigned long value)
1903 {
1904         struct vcpu_svm *svm = to_svm(vcpu);
1905
1906         svm->vmcb->save.dr6 = value;
1907         mark_dirty(svm->vmcb, VMCB_DR);
1908 }
1909
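     /*
      * Called when the guest was allowed to touch the debug registers
      * without intercepts (KVM_DEBUGREG_WONT_EXIT): read the hardware
      * DR0-DR3 and the VMCB's DR6/DR7 back into the vcpu state, then
      * re-enable the DR intercepts.
      */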
1910 static void svm_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
1911 {
1912         struct vcpu_svm *svm = to_svm(vcpu);
1913
1914         get_debugreg(vcpu->arch.db[0], 0);
1915         get_debugreg(vcpu->arch.db[1], 1);
1916         get_debugreg(vcpu->arch.db[2], 2);
1917         get_debugreg(vcpu->arch.db[3], 3);
1918         vcpu->arch.dr6 = svm_get_dr6(vcpu);
1919         vcpu->arch.dr7 = svm->vmcb->save.dr7;
1920
1921         vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT;
1922         set_dr_intercepts(svm);
1923 }
1924
1925 static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value)
1926 {
1927         struct vcpu_svm *svm = to_svm(vcpu);
1928
1929         svm->vmcb->save.dr7 = value;
1930         mark_dirty(svm->vmcb, VMCB_DR);
1931 }
1932
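     /*
      * #PF intercept.  svm->apf_reason distinguishes a real guest page
      * fault (handled through the MMU and, if needed, the emulator) from
      * the two async page fault tokens delivered via the paravirt
      * protocol: "page not present" puts the task to sleep, "page ready"
      * wakes it up again.
      */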
1933 static int pf_interception(struct vcpu_svm *svm)
1934 {
1935         u64 fault_address = svm->vmcb->control.exit_info_2;
1936         u32 error_code;
1937         int r = 1;
1938
1939         switch (svm->apf_reason) {
1940         default:
1941                 error_code = svm->vmcb->control.exit_info_1;
1942
1943                 trace_kvm_page_fault(fault_address, error_code);
1944                 if (!npt_enabled && kvm_event_needs_reinjection(&svm->vcpu))
1945                         kvm_mmu_unprotect_page_virt(&svm->vcpu, fault_address);
1946                 r = kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code,
1947                         svm->vmcb->control.insn_bytes,
1948                         svm->vmcb->control.insn_len);
1949                 break;
1950         case KVM_PV_REASON_PAGE_NOT_PRESENT:
1951                 svm->apf_reason = 0;
1952                 local_irq_disable();
1953                 kvm_async_pf_task_wait(fault_address);
1954                 local_irq_enable();
1955                 break;
1956         case KVM_PV_REASON_PAGE_READY:
1957                 svm->apf_reason = 0;
1958                 local_irq_disable();
1959                 kvm_async_pf_task_wake(fault_address);
1960                 local_irq_enable();
1961                 break;
1962         }
1963         return r;
1964 }
1965
1966 static int db_interception(struct vcpu_svm *svm)
1967 {
1968         struct kvm_run *kvm_run = svm->vcpu.run;
1969
1970         if (!(svm->vcpu.guest_debug &
1971               (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) &&
1972                 !svm->nmi_singlestep) {
1973                 kvm_queue_exception(&svm->vcpu, DB_VECTOR);
1974                 return 1;
1975         }
1976
1977         if (svm->nmi_singlestep) {
1978                 svm->nmi_singlestep = false;
1979                 if (!(svm->vcpu.guest_debug & KVM_GUESTDBG_SINGLESTEP))
1980                         svm->vmcb->save.rflags &=
1981                                 ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
1982         }
1983
1984         if (svm->vcpu.guest_debug &
1985             (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) {
1986                 kvm_run->exit_reason = KVM_EXIT_DEBUG;
1987                 kvm_run->debug.arch.pc =
1988                         svm->vmcb->save.cs.base + svm->vmcb->save.rip;
1989                 kvm_run->debug.arch.exception = DB_VECTOR;
1990                 return 0;
1991         }
1992
1993         return 1;
1994 }
1995
1996 static int bp_interception(struct vcpu_svm *svm)
1997 {
1998         struct kvm_run *kvm_run = svm->vcpu.run;
1999
2000         kvm_run->exit_reason = KVM_EXIT_DEBUG;
2001         kvm_run->debug.arch.pc = svm->vmcb->save.cs.base + svm->vmcb->save.rip;
2002         kvm_run->debug.arch.exception = BP_VECTOR;
2003         return 0;
2004 }
2005
2006 static int ud_interception(struct vcpu_svm *svm)
2007 {
2008         int er;
2009
2010         er = emulate_instruction(&svm->vcpu, EMULTYPE_TRAP_UD);
2011         if (er != EMULATE_DONE)
2012                 kvm_queue_exception(&svm->vcpu, UD_VECTOR);
2013         return 1;
2014 }
2015
2016 static int ac_interception(struct vcpu_svm *svm)
2017 {
2018         kvm_queue_exception_e(&svm->vcpu, AC_VECTOR, 0);
2019         return 1;
2020 }
2021
2022 static void svm_fpu_activate(struct kvm_vcpu *vcpu)
2023 {
2024         struct vcpu_svm *svm = to_svm(vcpu);
2025
2026         clr_exception_intercept(svm, NM_VECTOR);
2027
2028         svm->vcpu.fpu_active = 1;
2029         update_cr0_intercept(svm);
2030 }
2031
2032 static int nm_interception(struct vcpu_svm *svm)
2033 {
2034         svm_fpu_activate(&svm->vcpu);
2035         return 1;
2036 }
2037
2038 static bool is_erratum_383(void)
2039 {
2040         int err, i;
2041         u64 value;
2042
2043         if (!erratum_383_found)
2044                 return false;
2045
2046         value = native_read_msr_safe(MSR_IA32_MC0_STATUS, &err);
2047         if (err)
2048                 return false;
2049
2050         /* Bit 62 may or may not be set for this mce */
2051         value &= ~(1ULL << 62);
2052
2053         if (value != 0xb600000000010015ULL)
2054                 return false;
2055
2056         /* Clear MCi_STATUS registers */
2057         for (i = 0; i < 6; ++i)
2058                 native_write_msr_safe(MSR_IA32_MCx_STATUS(i), 0, 0);
2059
2060         value = native_read_msr_safe(MSR_IA32_MCG_STATUS, &err);
2061         if (!err) {
2062                 u32 low, high;
2063
2064                 value &= ~(1ULL << 2);
2065                 low    = lower_32_bits(value);
2066                 high   = upper_32_bits(value);
2067
2068                 native_write_msr_safe(MSR_IA32_MCG_STATUS, low, high);
2069         }
2070
2071         /* Flush tlb to evict multi-match entries */
2072         __flush_tlb_all();
2073
2074         return true;
2075 }
2076
2077 static void svm_handle_mce(struct vcpu_svm *svm)
2078 {
2079         if (is_erratum_383()) {
2080                 /*
2081                  * Erratum 383 triggered. Guest state is corrupt so kill the
2082                  * guest.
2083                  */
2084                 pr_err("KVM: Guest triggered AMD Erratum 383\n");
2085
2086                 kvm_make_request(KVM_REQ_TRIPLE_FAULT, &svm->vcpu);
2087
2088                 return;
2089         }
2090
2091         /*
2092          * On an #MC intercept the MCE handler is not called automatically in
2093          * the host. So do it by hand here.
2094          */
2095         asm volatile (
2096                 "int $0x12\n");
2097         /* not sure if we ever come back to this point */
2098
2099         return;
2100 }
2101
2102 static int mc_interception(struct vcpu_svm *svm)
2103 {
2104         return 1;
2105 }
2106
2107 static int shutdown_interception(struct vcpu_svm *svm)
2108 {
2109         struct kvm_run *kvm_run = svm->vcpu.run;
2110
2111         /*
2112          * VMCB is undefined after a SHUTDOWN intercept
2113          * so reinitialize it.
2114          */
2115         clear_page(svm->vmcb);
2116         init_vmcb(svm);
2117
2118         kvm_run->exit_reason = KVM_EXIT_SHUTDOWN;
2119         return 0;
2120 }
2121
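     /*
      * I/O intercept.  String and IN operations are forwarded to the x86
      * emulator; a plain OUT takes the fast path: exit_info_2 holds the
      * RIP of the next instruction, so the OUT can be skipped and then
      * completed via kvm_fast_pio_out().
      */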
2122 static int io_interception(struct vcpu_svm *svm)
2123 {
2124         struct kvm_vcpu *vcpu = &svm->vcpu;
2125         u32 io_info = svm->vmcb->control.exit_info_1; /* address size bug? */
2126         int size, in, string;
2127         unsigned port;
2128
2129         ++svm->vcpu.stat.io_exits;
2130         string = (io_info & SVM_IOIO_STR_MASK) != 0;
2131         in = (io_info & SVM_IOIO_TYPE_MASK) != 0;
2132         if (string || in)
2133                 return emulate_instruction(vcpu, 0) == EMULATE_DONE;
2134
2135         port = io_info >> 16;
2136         size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT;
2137         svm->next_rip = svm->vmcb->control.exit_info_2;
2138         skip_emulated_instruction(&svm->vcpu);
2139
2140         return kvm_fast_pio_out(vcpu, size, port);
2141 }
2142
2143 static int nmi_interception(struct vcpu_svm *svm)
2144 {
2145         return 1;
2146 }
2147
2148 static int intr_interception(struct vcpu_svm *svm)
2149 {
2150         ++svm->vcpu.stat.irq_exits;
2151         return 1;
2152 }
2153
2154 static int nop_on_interception(struct vcpu_svm *svm)
2155 {
2156         return 1;
2157 }
2158
2159 static int halt_interception(struct vcpu_svm *svm)
2160 {
2161         svm->next_rip = kvm_rip_read(&svm->vcpu) + 1;
2162         return kvm_emulate_halt(&svm->vcpu);
2163 }
2164
2165 static int vmmcall_interception(struct vcpu_svm *svm)
2166 {
2167         svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
2168         return kvm_emulate_hypercall(&svm->vcpu);
2169 }
2170
2171 static unsigned long nested_svm_get_tdp_cr3(struct kvm_vcpu *vcpu)
2172 {
2173         struct vcpu_svm *svm = to_svm(vcpu);
2174
2175         return svm->nested.nested_cr3;
2176 }
2177
2178 static u64 nested_svm_get_tdp_pdptr(struct kvm_vcpu *vcpu, int index)
2179 {
2180         struct vcpu_svm *svm = to_svm(vcpu);
2181         u64 cr3 = svm->nested.nested_cr3;
2182         u64 pdpte;
2183         int ret;
2184
2185         ret = kvm_vcpu_read_guest_page(vcpu, gpa_to_gfn(cr3), &pdpte,
2186                                        offset_in_page(cr3) + index * 8, 8);
2187         if (ret)
2188                 return 0;
2189         return pdpte;
2190 }
2191
2192 static void nested_svm_set_tdp_cr3(struct kvm_vcpu *vcpu,
2193                                    unsigned long root)
2194 {
2195         struct vcpu_svm *svm = to_svm(vcpu);
2196
2197         svm->vmcb->control.nested_cr3 = root;
2198         mark_dirty(svm->vmcb, VMCB_NPT);
2199         svm_flush_tlb(vcpu);
2200 }
2201
2202 static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu,
2203                                        struct x86_exception *fault)
2204 {
2205         struct vcpu_svm *svm = to_svm(vcpu);
2206
2207         if (svm->vmcb->control.exit_code != SVM_EXIT_NPF) {
2208                 /*
2209                  * TODO: track the cause of the nested page fault, and
2210                  * correctly fill in the high bits of exit_info_1.
2211                  */
2212                 svm->vmcb->control.exit_code = SVM_EXIT_NPF;
2213                 svm->vmcb->control.exit_code_hi = 0;
2214                 svm->vmcb->control.exit_info_1 = (1ULL << 32);
2215                 svm->vmcb->control.exit_info_2 = fault->address;
2216         }
2217
2218         svm->vmcb->control.exit_info_1 &= ~0xffffffffULL;
2219         svm->vmcb->control.exit_info_1 |= fault->error_code;
2220
2221         /*
2222          * The present bit is always zero for page structure faults on real
2223          * hardware.
2224          */
2225         if (svm->vmcb->control.exit_info_1 & (2ULL << 32))
2226                 svm->vmcb->control.exit_info_1 &= ~1;
2227
2228         nested_svm_vmexit(svm);
2229 }
2230
2231 static void nested_svm_init_mmu_context(struct kvm_vcpu *vcpu)
2232 {
2233         WARN_ON(mmu_is_nested(vcpu));
2234         kvm_init_shadow_mmu(vcpu);
2235         vcpu->arch.mmu.set_cr3           = nested_svm_set_tdp_cr3;
2236         vcpu->arch.mmu.get_cr3           = nested_svm_get_tdp_cr3;
2237         vcpu->arch.mmu.get_pdptr         = nested_svm_get_tdp_pdptr;
2238         vcpu->arch.mmu.inject_page_fault = nested_svm_inject_npf_exit;
2239         vcpu->arch.mmu.shadow_root_level = get_npt_level();
2240         reset_shadow_zero_bits_mask(vcpu, &vcpu->arch.mmu);
2241         vcpu->arch.walk_mmu              = &vcpu->arch.nested_mmu;
2242 }
2243
2244 static void nested_svm_uninit_mmu_context(struct kvm_vcpu *vcpu)
2245 {
2246         vcpu->arch.walk_mmu = &vcpu->arch.mmu;
2247 }
2248
2249 static int nested_svm_check_permissions(struct vcpu_svm *svm)
2250 {
2251         if (!(svm->vcpu.arch.efer & EFER_SVME)
2252             || !is_paging(&svm->vcpu)) {
2253                 kvm_queue_exception(&svm->vcpu, UD_VECTOR);
2254                 return 1;
2255         }
2256
2257         if (svm->vmcb->save.cpl) {
2258                 kvm_inject_gp(&svm->vcpu, 0);
2259                 return 1;
2260         }
2261
2260         return 0;
2263 }
2264
2265 static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
2266                                       bool has_error_code, u32 error_code)
2267 {
2268         int vmexit;
2269
2270         if (!is_guest_mode(&svm->vcpu))
2271                 return 0;
2272
2273         svm->vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + nr;
2274         svm->vmcb->control.exit_code_hi = 0;
2275         svm->vmcb->control.exit_info_1 = error_code;
2276         svm->vmcb->control.exit_info_2 = svm->vcpu.arch.cr2;
2277
2278         vmexit = nested_svm_intercept(svm);
2279         if (vmexit == NESTED_EXIT_DONE)
2280                 svm->nested.exit_required = true;
2281
2282         return vmexit;
2283 }
2284
2285 /* This function returns true if it is safe to enable the irq window */
2286 static inline bool nested_svm_intr(struct vcpu_svm *svm)
2287 {
2288         if (!is_guest_mode(&svm->vcpu))
2289                 return true;
2290
2291         if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK))
2292                 return true;
2293
2294         if (!(svm->vcpu.arch.hflags & HF_HIF_MASK))
2295                 return false;
2296
2297         /*
2298          * If a vmexit was already requested (by an intercepted exception,
2299          * for instance), do not overwrite it with an "external interrupt"
2300          * vmexit.
2301          */
2302         if (svm->nested.exit_required)
2303                 return false;
2304
2305         svm->vmcb->control.exit_code   = SVM_EXIT_INTR;
2306         svm->vmcb->control.exit_info_1 = 0;
2307         svm->vmcb->control.exit_info_2 = 0;
2308
2309         if (svm->nested.intercept & 1ULL) {
2310                 /*
2311                  * The #vmexit can't be emulated here directly because this
2312                  * code path runs with irqs and preemption disabled. A
2313                  * #vmexit emulation might sleep. Only signal the request for
2314                  * the #vmexit here.
2315                  */
2316                 svm->nested.exit_required = true;
2317                 trace_kvm_nested_intr_vmexit(svm->vmcb->save.rip);
2318                 return false;
2319         }
2320
2321         return true;
2322 }
2323
2324 /* This function returns true if it is safe to enable the nmi window */
2325 static inline bool nested_svm_nmi(struct vcpu_svm *svm)
2326 {
2327         if (!is_guest_mode(&svm->vcpu))
2328                 return true;
2329
2330         if (!(svm->nested.intercept & (1ULL << INTERCEPT_NMI)))
2331                 return true;
2332
2333         svm->vmcb->control.exit_code = SVM_EXIT_NMI;
2334         svm->nested.exit_required = true;
2335
2336         return false;
2337 }
2338
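     /*
      * Map one page of L1 guest memory (e.g. the nested VMCB or a
      * permission bitmap) into the host kernel.  On failure a #GP is
      * injected into the guest and NULL is returned; the caller must
      * release the page with nested_svm_unmap() when done.
      */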
2339 static void *nested_svm_map(struct vcpu_svm *svm, u64 gpa, struct page **_page)
2340 {
2341         struct page *page;
2342
2343         might_sleep();
2344
2345         page = kvm_vcpu_gfn_to_page(&svm->vcpu, gpa >> PAGE_SHIFT);
2346         if (is_error_page(page))
2347                 goto error;
2348
2349         *_page = page;
2350
2351         return kmap(page);
2352
2353 error:
2354         kvm_inject_gp(&svm->vcpu, 0);
2355
2356         return NULL;
2357 }
2358
2359 static void nested_svm_unmap(struct page *page)
2360 {
2361         kunmap(page);
2362         kvm_release_page_dirty(page);
2363 }
2364
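     /*
      * Check L1's I/O permission bitmap for the intercepted port.  An
      * access of 'size' bytes tests 'size' consecutive bits starting at
      * the port's bit; reading two bytes of the bitmap covers accesses
      * that straddle a byte boundary.  A failed read of guest memory is
      * treated as intercepted.
      */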
2365 static int nested_svm_intercept_ioio(struct vcpu_svm *svm)
2366 {
2367         unsigned port, size, iopm_len;
2368         u16 val, mask;
2369         u8 start_bit;
2370         u64 gpa;
2371
2372         if (!(svm->nested.intercept & (1ULL << INTERCEPT_IOIO_PROT)))
2373                 return NESTED_EXIT_HOST;
2374
2375         port = svm->vmcb->control.exit_info_1 >> 16;
2376         size = (svm->vmcb->control.exit_info_1 & SVM_IOIO_SIZE_MASK) >>
2377                 SVM_IOIO_SIZE_SHIFT;
2378         gpa  = svm->nested.vmcb_iopm + (port / 8);
2379         start_bit = port % 8;
2380         iopm_len = (start_bit + size > 8) ? 2 : 1;
2381         mask = (0xf >> (4 - size)) << start_bit;
2382         val = 0;
2383
2384         if (kvm_vcpu_read_guest(&svm->vcpu, gpa, &val, iopm_len))
2385                 return NESTED_EXIT_DONE;
2386
2387         return (val & mask) ? NESTED_EXIT_DONE : NESTED_EXIT_HOST;
2388 }
2389
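     /*
      * Check L1's MSR permission map for the intercepted RDMSR/WRMSR.
      * Each MSR occupies two bits (read, write); svm_msrpm_offset()
      * returns the chunk offset in 32-bit units, so it is multiplied by 4
      * to get a byte offset into the guest bitmap.
      */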
2390 static int nested_svm_exit_handled_msr(struct vcpu_svm *svm)
2391 {
2392         u32 offset, msr, value;
2393         int write, mask;
2394
2395         if (!(svm->nested.intercept & (1ULL << INTERCEPT_MSR_PROT)))
2396                 return NESTED_EXIT_HOST;
2397
2398         msr    = svm->vcpu.arch.regs[VCPU_REGS_RCX];
2399         offset = svm_msrpm_offset(msr);
2400         write  = svm->vmcb->control.exit_info_1 & 1;
2401         mask   = 1 << ((2 * (msr & 0xf)) + write);
2402
2403         if (offset == MSR_INVALID)
2404                 return NESTED_EXIT_DONE;
2405
2406         /* Offset is in 32-bit units but we need a byte offset */
2407         offset *= 4;
2408
2409         if (kvm_vcpu_read_guest(&svm->vcpu, svm->nested.vmcb_msrpm + offset, &value, 4))
2410                 return NESTED_EXIT_DONE;
2411
2412         return (value & mask) ? NESTED_EXIT_DONE : NESTED_EXIT_HOST;
2413 }
2414
2415 static int nested_svm_exit_special(struct vcpu_svm *svm)
2416 {
2417         u32 exit_code = svm->vmcb->control.exit_code;
2418
2419         switch (exit_code) {
2420         case SVM_EXIT_INTR:
2421         case SVM_EXIT_NMI:
2422         case SVM_EXIT_EXCP_BASE + MC_VECTOR:
2423                 return NESTED_EXIT_HOST;
2424         case SVM_EXIT_NPF:
2425                 /* For now we are always handling NPFs when using them */
2426                 if (npt_enabled)
2427                         return NESTED_EXIT_HOST;
2428                 break;
2429         case SVM_EXIT_EXCP_BASE + PF_VECTOR:
2430                 /* When we're shadowing, trap PFs, but not async PF */
2431                 if (!npt_enabled && svm->apf_reason == 0)
2432                         return NESTED_EXIT_HOST;
2433                 break;
2434         case SVM_EXIT_EXCP_BASE + NM_VECTOR:
2435                 nm_interception(svm);
2436                 break;
2437         default:
2438                 break;
2439         }
2440
2441         return NESTED_EXIT_CONTINUE;
2442 }
2443
2444 /*
2445  * Returns NESTED_EXIT_DONE if this #vmexit is to be handled by the nested guest (L1)
2446  */
2447 static int nested_svm_intercept(struct vcpu_svm *svm)
2448 {
2449         u32 exit_code = svm->vmcb->control.exit_code;
2450         int vmexit = NESTED_EXIT_HOST;
2451
2452         switch (exit_code) {
2453         case SVM_EXIT_MSR:
2454                 vmexit = nested_svm_exit_handled_msr(svm);
2455                 break;
2456         case SVM_EXIT_IOIO:
2457                 vmexit = nested_svm_intercept_ioio(svm);
2458                 break;
2459         case SVM_EXIT_READ_CR0 ... SVM_EXIT_WRITE_CR8: {
2460                 u32 bit = 1U << (exit_code - SVM_EXIT_READ_CR0);
2461                 if (svm->nested.intercept_cr & bit)
2462                         vmexit = NESTED_EXIT_DONE;
2463                 break;
2464         }
2465         case SVM_EXIT_READ_DR0 ... SVM_EXIT_WRITE_DR7: {
2466                 u32 bit = 1U << (exit_code - SVM_EXIT_READ_DR0);
2467                 if (svm->nested.intercept_dr & bit)
2468                         vmexit = NESTED_EXIT_DONE;
2469                 break;
2470         }
2471         case SVM_EXIT_EXCP_BASE ... SVM_EXIT_EXCP_BASE + 0x1f: {
2472                 u32 excp_bits = 1 << (exit_code - SVM_EXIT_EXCP_BASE);
2473                 if (svm->nested.intercept_exceptions & excp_bits)
2474                         vmexit = NESTED_EXIT_DONE;
2475                 /* async page fault always causes a vmexit */
2476                 else if ((exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR) &&
2477                          svm->apf_reason != 0)
2478                         vmexit = NESTED_EXIT_DONE;
2479                 break;
2480         }
2481         case SVM_EXIT_ERR: {
2482                 vmexit = NESTED_EXIT_DONE;
2483                 break;
2484         }
2485         default: {
2486                 u64 exit_bits = 1ULL << (exit_code - SVM_EXIT_INTR);
2487                 if (svm->nested.intercept & exit_bits)
2488                         vmexit = NESTED_EXIT_DONE;
2489         }
2490         }
2491
2492         return vmexit;
2493 }
2494
2495 static int nested_svm_exit_handled(struct vcpu_svm *svm)
2496 {
2497         int vmexit;
2498
2499         vmexit = nested_svm_intercept(svm);
2500
2501         if (vmexit == NESTED_EXIT_DONE)
2502                 nested_svm_vmexit(svm);
2503
2504         return vmexit;
2505 }
2506
2507 static inline void copy_vmcb_control_area(struct vmcb *dst_vmcb, struct vmcb *from_vmcb)
2508 {
2509         struct vmcb_control_area *dst  = &dst_vmcb->control;
2510         struct vmcb_control_area *from = &from_vmcb->control;
2511
2512         dst->intercept_cr         = from->intercept_cr;
2513         dst->intercept_dr         = from->intercept_dr;
2514         dst->intercept_exceptions = from->intercept_exceptions;
2515         dst->intercept            = from->intercept;
2516         dst->iopm_base_pa         = from->iopm_base_pa;
2517         dst->msrpm_base_pa        = from->msrpm_base_pa;
2518         dst->tsc_offset           = from->tsc_offset;
2519         dst->asid                 = from->asid;
2520         dst->tlb_ctl              = from->tlb_ctl;
2521         dst->int_ctl              = from->int_ctl;
2522         dst->int_vector           = from->int_vector;
2523         dst->int_state            = from->int_state;
2524         dst->exit_code            = from->exit_code;
2525         dst->exit_code_hi         = from->exit_code_hi;
2526         dst->exit_info_1          = from->exit_info_1;
2527         dst->exit_info_2          = from->exit_info_2;
2528         dst->exit_int_info        = from->exit_int_info;
2529         dst->exit_int_info_err    = from->exit_int_info_err;
2530         dst->nested_ctl           = from->nested_ctl;
2531         dst->event_inj            = from->event_inj;
2532         dst->event_inj_err        = from->event_inj_err;
2533         dst->nested_cr3           = from->nested_cr3;
2534         dst->lbr_ctl              = from->lbr_ctl;
2535 }
2536
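     /*
      * Emulate a #VMEXIT from L2 to L1: copy the guest state and exit
      * information from the current VMCB into the L1-provided nested VMCB,
      * then restore the L1 state that was stashed in hsave at VMRUN time
      * and switch the MMU back to the L1 context.
      */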
2537 static int nested_svm_vmexit(struct vcpu_svm *svm)
2538 {
2539         struct vmcb *nested_vmcb;
2540         struct vmcb *hsave = svm->nested.hsave;
2541         struct vmcb *vmcb = svm->vmcb;
2542         struct page *page;
2543
2544         trace_kvm_nested_vmexit_inject(vmcb->control.exit_code,
2545                                        vmcb->control.exit_info_1,
2546                                        vmcb->control.exit_info_2,
2547                                        vmcb->control.exit_int_info,
2548                                        vmcb->control.exit_int_info_err,
2549                                        KVM_ISA_SVM);
2550
2551         nested_vmcb = nested_svm_map(svm, svm->nested.vmcb, &page);
2552         if (!nested_vmcb)
2553                 return 1;
2554
2555         /* Exit Guest-Mode */
2556         leave_guest_mode(&svm->vcpu);
2557         svm->nested.vmcb = 0;
2558
2559         /* Give the current vmcb to the guest */
2560         disable_gif(svm);
2561
2562         nested_vmcb->save.es     = vmcb->save.es;
2563         nested_vmcb->save.cs     = vmcb->save.cs;
2564         nested_vmcb->save.ss     = vmcb->save.ss;
2565         nested_vmcb->save.ds     = vmcb->save.ds;
2566         nested_vmcb->save.gdtr   = vmcb->save.gdtr;
2567         nested_vmcb->save.idtr   = vmcb->save.idtr;
2568         nested_vmcb->save.efer   = svm->vcpu.arch.efer;
2569         nested_vmcb->save.cr0    = kvm_read_cr0(&svm->vcpu);
2570         nested_vmcb->save.cr3    = kvm_read_cr3(&svm->vcpu);
2571         nested_vmcb->save.cr2    = vmcb->save.cr2;
2572         nested_vmcb->save.cr4    = svm->vcpu.arch.cr4;
2573         nested_vmcb->save.rflags = kvm_get_rflags(&svm->vcpu);
2574         nested_vmcb->save.rip    = vmcb->save.rip;
2575         nested_vmcb->save.rsp    = vmcb->save.rsp;
2576         nested_vmcb->save.rax    = vmcb->save.rax;
2577         nested_vmcb->save.dr7    = vmcb->save.dr7;
2578         nested_vmcb->save.dr6    = vmcb->save.dr6;
2579         nested_vmcb->save.cpl    = vmcb->save.cpl;
2580
2581         nested_vmcb->control.int_ctl           = vmcb->control.int_ctl;
2582         nested_vmcb->control.int_vector        = vmcb->control.int_vector;
2583         nested_vmcb->control.int_state         = vmcb->control.int_state;
2584         nested_vmcb->control.exit_code         = vmcb->control.exit_code;
2585         nested_vmcb->control.exit_code_hi      = vmcb->control.exit_code_hi;
2586         nested_vmcb->control.exit_info_1       = vmcb->control.exit_info_1;
2587         nested_vmcb->control.exit_info_2       = vmcb->control.exit_info_2;
2588         nested_vmcb->control.exit_int_info     = vmcb->control.exit_int_info;
2589         nested_vmcb->control.exit_int_info_err = vmcb->control.exit_int_info_err;
2590
2591         if (svm->nrips_enabled)
2592                 nested_vmcb->control.next_rip  = vmcb->control.next_rip;
2593
2594         /*
2595          * If we emulate a VMRUN/#VMEXIT in the same host #vmexit cycle we have
2596          * to make sure that we do not lose injected events. So check event_inj
2597          * here and copy it to exit_int_info if it is valid.
2598          * Exit_int_info and event_inj can't both be valid because the case
2599          * below only happens on a VMRUN instruction intercept which has
2600          * no valid exit_int_info set.
2601          */
2602         if (vmcb->control.event_inj & SVM_EVTINJ_VALID) {
2603                 struct vmcb_control_area *nc = &nested_vmcb->control;
2604
2605                 nc->exit_int_info     = vmcb->control.event_inj;
2606                 nc->exit_int_info_err = vmcb->control.event_inj_err;
2607         }
2608
2609         nested_vmcb->control.tlb_ctl           = 0;
2610         nested_vmcb->control.event_inj         = 0;
2611         nested_vmcb->control.event_inj_err     = 0;
2612
2613         /* We always set V_INTR_MASKING and remember the old value in hflags */
2614         if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK))
2615                 nested_vmcb->control.int_ctl &= ~V_INTR_MASKING_MASK;
2616
2617         /* Restore the original control entries */
2618         copy_vmcb_control_area(vmcb, hsave);
2619
2620         kvm_clear_exception_queue(&svm->vcpu);
2621         kvm_clear_interrupt_queue(&svm->vcpu);
2622
2623         svm->nested.nested_cr3 = 0;
2624
2625         /* Restore selected save entries */
2626         svm->vmcb->save.es = hsave->save.es;
2627         svm->vmcb->save.cs = hsave->save.cs;
2628         svm->vmcb->save.ss = hsave->save.ss;
2629         svm->vmcb->save.ds = hsave->save.ds;
2630         svm->vmcb->save.gdtr = hsave->save.gdtr;
2631         svm->vmcb->save.idtr = hsave->save.idtr;
2632         kvm_set_rflags(&svm->vcpu, hsave->save.rflags);
2633         svm_set_efer(&svm->vcpu, hsave->save.efer);
2634         svm_set_cr0(&svm->vcpu, hsave->save.cr0 | X86_CR0_PE);
2635         svm_set_cr4(&svm->vcpu, hsave->save.cr4);
2636         if (npt_enabled) {
2637                 svm->vmcb->save.cr3 = hsave->save.cr3;
2638                 svm->vcpu.arch.cr3 = hsave->save.cr3;
2639         } else {
2640                 (void)kvm_set_cr3(&svm->vcpu, hsave->save.cr3);
2641         }
2642         kvm_register_write(&svm->vcpu, VCPU_REGS_RAX, hsave->save.rax);
2643         kvm_register_write(&svm->vcpu, VCPU_REGS_RSP, hsave->save.rsp);
2644         kvm_register_write(&svm->vcpu, VCPU_REGS_RIP, hsave->save.rip);
2645         svm->vmcb->save.dr7 = 0;
2646         svm->vmcb->save.cpl = 0;
2647         svm->vmcb->control.exit_int_info = 0;
2648
2649         mark_all_dirty(svm->vmcb);
2650
2651         nested_svm_unmap(page);
2652
2653         nested_svm_uninit_mmu_context(&svm->vcpu);
2654         kvm_mmu_reset_context(&svm->vcpu);
2655         kvm_mmu_load(&svm->vcpu);
2656
2657         return 0;
2658 }
2659
2660 static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm)
2661 {
2662         /*
2663          * This function merges the msr permission bitmaps of kvm and the
2664          * nested vmcb. As an optimization, it only merges the parts where
2665          * the kvm msr permission bitmap may contain zero bits.
2666          */
2667         int i;
2668
2669         if (!(svm->nested.intercept & (1ULL << INTERCEPT_MSR_PROT)))
2670                 return true;
2671
2672         for (i = 0; i < MSRPM_OFFSETS; i++) {
2673                 u32 value, p;
2674                 u64 offset;
2675
2676                 if (msrpm_offsets[i] == 0xffffffff)
2677                         break;
2678
2679                 p      = msrpm_offsets[i];
2680                 offset = svm->nested.vmcb_msrpm + (p * 4);
2681
2682                 if (kvm_vcpu_read_guest(&svm->vcpu, offset, &value, 4))
2683                         return false;
2684
2685                 svm->nested.msrpm[p] = svm->msrpm[p] | value;
2686         }
2687
2688         svm->vmcb->control.msrpm_base_pa = __pa(svm->nested.msrpm);
2689
2690         return true;
2691 }
2692
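     /*
      * Minimal consistency checks on an L1-provided VMCB: VMRUN must be
      * intercepted, the ASID must be non-zero, and nested paging may only
      * be requested when NPT is enabled on the host.  A failure makes the
      * emulated VMRUN exit with SVM_EXIT_ERR.
      */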
2693 static bool nested_vmcb_checks(struct vmcb *vmcb)
2694 {
2695         if ((vmcb->control.intercept & (1ULL << INTERCEPT_VMRUN)) == 0)
2696                 return false;
2697
2698         if (vmcb->control.asid == 0)
2699                 return false;
2700
2701         if (vmcb->control.nested_ctl && !npt_enabled)
2702                 return false;
2703
2704         return true;
2705 }
2706
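     /*
      * Emulate VMRUN: map the nested VMCB at RAX, save the current L1
      * state into hsave, load the L2 guest state and cached intercepts
      * from the nested VMCB, merge the intercept masks and enter guest
      * mode.
      */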
2707 static bool nested_svm_vmrun(struct vcpu_svm *svm)
2708 {
2709         struct vmcb *nested_vmcb;
2710         struct vmcb *hsave = svm->nested.hsave;
2711         struct vmcb *vmcb = svm->vmcb;
2712         struct page *page;
2713         u64 vmcb_gpa;
2714
2715         vmcb_gpa = svm->vmcb->save.rax;
2716
2717         nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page);
2718         if (!nested_vmcb)
2719                 return false;
2720
2721         if (!nested_vmcb_checks(nested_vmcb)) {
2722                 nested_vmcb->control.exit_code    = SVM_EXIT_ERR;
2723                 nested_vmcb->control.exit_code_hi = 0;
2724                 nested_vmcb->control.exit_info_1  = 0;
2725                 nested_vmcb->control.exit_info_2  = 0;
2726
2727                 nested_svm_unmap(page);
2728
2729                 return false;
2730         }
2731
2732         trace_kvm_nested_vmrun(svm->vmcb->save.rip, vmcb_gpa,
2733                                nested_vmcb->save.rip,
2734                                nested_vmcb->control.int_ctl,
2735                                nested_vmcb->control.event_inj,
2736                                nested_vmcb->control.nested_ctl);
2737
2738         trace_kvm_nested_intercepts(nested_vmcb->control.intercept_cr & 0xffff,
2739                                     nested_vmcb->control.intercept_cr >> 16,
2740                                     nested_vmcb->control.intercept_exceptions,
2741                                     nested_vmcb->control.intercept);
2742
2743         /* Clear internal status */
2744         kvm_clear_exception_queue(&svm->vcpu);
2745         kvm_clear_interrupt_queue(&svm->vcpu);
2746
2747         /*
2748          * Save the old vmcb, so we don't need to pick what to save, but can
2749          * restore everything when a #VMEXIT occurs.
2750          */
2751         hsave->save.es     = vmcb->save.es;
2752         hsave->save.cs     = vmcb->save.cs;
2753         hsave->save.ss     = vmcb->save.ss;
2754         hsave->save.ds     = vmcb->save.ds;
2755         hsave->save.gdtr   = vmcb->save.gdtr;
2756         hsave->save.idtr   = vmcb->save.idtr;
2757         hsave->save.efer   = svm->vcpu.arch.efer;
2758         hsave->save.cr0    = kvm_read_cr0(&svm->vcpu);
2759         hsave->save.cr4    = svm->vcpu.arch.cr4;
2760         hsave->save.rflags = kvm_get_rflags(&svm->vcpu);
2761         hsave->save.rip    = kvm_rip_read(&svm->vcpu);
2762         hsave->save.rsp    = vmcb->save.rsp;
2763         hsave->save.rax    = vmcb->save.rax;
2764         if (npt_enabled)
2765                 hsave->save.cr3    = vmcb->save.cr3;
2766         else
2767                 hsave->save.cr3    = kvm_read_cr3(&svm->vcpu);
2768
2769         copy_vmcb_control_area(hsave, vmcb);
2770
2771         if (kvm_get_rflags(&svm->vcpu) & X86_EFLAGS_IF)
2772                 svm->vcpu.arch.hflags |= HF_HIF_MASK;
2773         else
2774                 svm->vcpu.arch.hflags &= ~HF_HIF_MASK;
2775
2776         if (nested_vmcb->control.nested_ctl) {
2777                 kvm_mmu_unload(&svm->vcpu);
2778                 svm->nested.nested_cr3 = nested_vmcb->control.nested_cr3;
2779                 nested_svm_init_mmu_context(&svm->vcpu);
2780         }
2781
2782         /* Load the nested guest state */
2783         svm->vmcb->save.es = nested_vmcb->save.es;
2784         svm->vmcb->save.cs = nested_vmcb->save.cs;
2785         svm->vmcb->save.ss = nested_vmcb->save.ss;
2786         svm->vmcb->save.ds = nested_vmcb->save.ds;
2787         svm->vmcb->save.gdtr = nested_vmcb->save.gdtr;
2788         svm->vmcb->save.idtr = nested_vmcb->save.idtr;
2789         kvm_set_rflags(&svm->vcpu, nested_vmcb->save.rflags);
2790         svm_set_efer(&svm->vcpu, nested_vmcb->save.efer);
2791         svm_set_cr0(&svm->vcpu, nested_vmcb->save.cr0);
2792         svm_set_cr4(&svm->vcpu, nested_vmcb->save.cr4);
2793         if (npt_enabled) {
2794                 svm->vmcb->save.cr3 = nested_vmcb->save.cr3;
2795                 svm->vcpu.arch.cr3 = nested_vmcb->save.cr3;
2796         } else
2797                 (void)kvm_set_cr3(&svm->vcpu, nested_vmcb->save.cr3);
2798
2799         /* Guest paging mode is active - reset mmu */
2800         kvm_mmu_reset_context(&svm->vcpu);
2801
2802         svm->vmcb->save.cr2 = svm->vcpu.arch.cr2 = nested_vmcb->save.cr2;
2803         kvm_register_write(&svm->vcpu, VCPU_REGS_RAX, nested_vmcb->save.rax);
2804         kvm_register_write(&svm->vcpu, VCPU_REGS_RSP, nested_vmcb->save.rsp);
2805         kvm_register_write(&svm->vcpu, VCPU_REGS_RIP, nested_vmcb->save.rip);
2806
2807         /* In case we don't even reach vcpu_run, the fields are not updated */
2808         svm->vmcb->save.rax = nested_vmcb->save.rax;
2809         svm->vmcb->save.rsp = nested_vmcb->save.rsp;
2810         svm->vmcb->save.rip = nested_vmcb->save.rip;
2811         svm->vmcb->save.dr7 = nested_vmcb->save.dr7;
2812         svm->vmcb->save.dr6 = nested_vmcb->save.dr6;
2813         svm->vmcb->save.cpl = nested_vmcb->save.cpl;
2814
2815         svm->nested.vmcb_msrpm = nested_vmcb->control.msrpm_base_pa & ~0x0fffULL;
2816         svm->nested.vmcb_iopm  = nested_vmcb->control.iopm_base_pa  & ~0x0fffULL;
2817
2818         /* cache intercepts */
2819         svm->nested.intercept_cr         = nested_vmcb->control.intercept_cr;
2820         svm->nested.intercept_dr         = nested_vmcb->control.intercept_dr;
2821         svm->nested.intercept_exceptions = nested_vmcb->control.intercept_exceptions;
2822         svm->nested.intercept            = nested_vmcb->control.intercept;
2823
2824         svm_flush_tlb(&svm->vcpu);
2825         svm->vmcb->control.int_ctl = nested_vmcb->control.int_ctl | V_INTR_MASKING_MASK;
2826         if (nested_vmcb->control.int_ctl & V_INTR_MASKING_MASK)
2827                 svm->vcpu.arch.hflags |= HF_VINTR_MASK;
2828         else
2829                 svm->vcpu.arch.hflags &= ~HF_VINTR_MASK;
2830
2831         if (svm->vcpu.arch.hflags & HF_VINTR_MASK) {
2832                 /* We only want the cr8 intercept bits of the guest */
2833                 clr_cr_intercept(svm, INTERCEPT_CR8_READ);
2834                 clr_cr_intercept(svm, INTERCEPT_CR8_WRITE);
2835         }
2836
2837         /* We don't want to see VMMCALLs from a nested guest */
2838         clr_intercept(svm, INTERCEPT_VMMCALL);
2839
2840         svm->vmcb->control.lbr_ctl = nested_vmcb->control.lbr_ctl;
2841         svm->vmcb->control.int_vector = nested_vmcb->control.int_vector;
2842         svm->vmcb->control.int_state = nested_vmcb->control.int_state;
2843         svm->vmcb->control.tsc_offset += nested_vmcb->control.tsc_offset;
2844         svm->vmcb->control.event_inj = nested_vmcb->control.event_inj;
2845         svm->vmcb->control.event_inj_err = nested_vmcb->control.event_inj_err;
2846
2847         nested_svm_unmap(page);
2848
2849         /* Enter Guest-Mode */
2850         enter_guest_mode(&svm->vcpu);
2851
2852         /*
2853          * Merge guest and host intercepts; this must be called with the vcpu
2854          * in guest mode to take effect here.
2855          */
2856         recalc_intercepts(svm);
2857
2858         svm->nested.vmcb = vmcb_gpa;
2859
2860         enable_gif(svm);
2861
2862         mark_all_dirty(svm->vmcb);
2863
2864         return true;
2865 }
2866
2867 static void nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb)
2868 {
2869         to_vmcb->save.fs = from_vmcb->save.fs;
2870         to_vmcb->save.gs = from_vmcb->save.gs;
2871         to_vmcb->save.tr = from_vmcb->save.tr;
2872         to_vmcb->save.ldtr = from_vmcb->save.ldtr;
2873         to_vmcb->save.kernel_gs_base = from_vmcb->save.kernel_gs_base;
2874         to_vmcb->save.star = from_vmcb->save.star;
2875         to_vmcb->save.lstar = from_vmcb->save.lstar;
2876         to_vmcb->save.cstar = from_vmcb->save.cstar;
2877         to_vmcb->save.sfmask = from_vmcb->save.sfmask;
2878         to_vmcb->save.sysenter_cs = from_vmcb->save.sysenter_cs;
2879         to_vmcb->save.sysenter_esp = from_vmcb->save.sysenter_esp;
2880         to_vmcb->save.sysenter_eip = from_vmcb->save.sysenter_eip;
2881 }
2882
2883 static int vmload_interception(struct vcpu_svm *svm)
2884 {
2885         struct vmcb *nested_vmcb;
2886         struct page *page;
2887
2888         if (nested_svm_check_permissions(svm))
2889                 return 1;
2890
2891         nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page);
2892         if (!nested_vmcb)
2893                 return 1;
2894
2895         svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
2896         skip_emulated_instruction(&svm->vcpu);
2897
2898         nested_svm_vmloadsave(nested_vmcb, svm->vmcb);
2899         nested_svm_unmap(page);
2900
2901         return 1;
2902 }
2903
2904 static int vmsave_interception(struct vcpu_svm *svm)
2905 {
2906         struct vmcb *nested_vmcb;
2907         struct page *page;
2908
2909         if (nested_svm_check_permissions(svm))
2910                 return 1;
2911
2912         nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page);
2913         if (!nested_vmcb)
2914                 return 1;
2915
2916         svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
2917         skip_emulated_instruction(&svm->vcpu);
2918
2919         nested_svm_vmloadsave(svm->vmcb, nested_vmcb);
2920         nested_svm_unmap(page);
2921
2922         return 1;
2923 }
2924
2925 static int vmrun_interception(struct vcpu_svm *svm)
2926 {
2927         if (nested_svm_check_permissions(svm))
2928                 return 1;
2929
2930         /* Save rip after vmrun instruction */
2931         kvm_rip_write(&svm->vcpu, kvm_rip_read(&svm->vcpu) + 3);
2932
2933         if (!nested_svm_vmrun(svm))
2934                 return 1;
2935
2936         if (!nested_svm_vmrun_msrpm(svm))
2937                 goto failed;
2938
2939         return 1;
2940
2941 failed:
2942
2943         svm->vmcb->control.exit_code    = SVM_EXIT_ERR;
2944         svm->vmcb->control.exit_code_hi = 0;
2945         svm->vmcb->control.exit_info_1  = 0;
2946         svm->vmcb->control.exit_info_2  = 0;
2947
2948         nested_svm_vmexit(svm);
2949
2950         return 1;
2951 }
2952
2953 static int stgi_interception(struct vcpu_svm *svm)
2954 {
2955         if (nested_svm_check_permissions(svm))
2956                 return 1;
2957
2958         svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
2959         skip_emulated_instruction(&svm->vcpu);
2960         kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
2961
2962         enable_gif(svm);
2963
2964         return 1;
2965 }
2966
2967 static int clgi_interception(struct vcpu_svm *svm)
2968 {
2969         if (nested_svm_check_permissions(svm))
2970                 return 1;
2971
2972         svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
2973         skip_emulated_instruction(&svm->vcpu);
2974
2975         disable_gif(svm);
2976
2977         /* After a CLGI no interrupts should come */
2978         if (!kvm_vcpu_apicv_active(&svm->vcpu)) {
2979                 svm_clear_vintr(svm);
2980                 svm->vmcb->control.int_ctl &= ~V_IRQ_MASK;
2981                 mark_dirty(svm->vmcb, VMCB_INTR);
2982         }
2983
2984         return 1;
2985 }
2986
2987 static int invlpga_interception(struct vcpu_svm *svm)
2988 {
2989         struct kvm_vcpu *vcpu = &svm->vcpu;
2990
2991         trace_kvm_invlpga(svm->vmcb->save.rip, kvm_register_read(&svm->vcpu, VCPU_REGS_RCX),
2992                           kvm_register_read(&svm->vcpu, VCPU_REGS_RAX));
2993
2994         /* Let's treat INVLPGA the same as INVLPG (can be optimized!) */
2995         kvm_mmu_invlpg(vcpu, kvm_register_read(&svm->vcpu, VCPU_REGS_RAX));
2996
2997         svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
2998         skip_emulated_instruction(&svm->vcpu);
2999         return 1;
3000 }
3001
3002 static int skinit_interception(struct vcpu_svm *svm)
3003 {
3004         trace_kvm_skinit(svm->vmcb->save.rip, kvm_register_read(&svm->vcpu, VCPU_REGS_RAX));
3005
3006         kvm_queue_exception(&svm->vcpu, UD_VECTOR);
3007         return 1;
3008 }
3009
3010 static int wbinvd_interception(struct vcpu_svm *svm)
3011 {
3012         kvm_emulate_wbinvd(&svm->vcpu);
3013         return 1;
3014 }
3015
3016 static int xsetbv_interception(struct vcpu_svm *svm)
3017 {
3018         u64 new_bv = kvm_read_edx_eax(&svm->vcpu);
3019         u32 index = kvm_register_read(&svm->vcpu, VCPU_REGS_RCX);
3020
3021         if (kvm_set_xcr(&svm->vcpu, index, new_bv) == 0) {
3022                 svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
3023                 skip_emulated_instruction(&svm->vcpu);
3024         }
3025
3026         return 1;
3027 }
3028
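     /*
      * Task switch intercept: exit_info_1 carries the target TSS selector
      * and exit_info_2 the reason flags (IRET, far JMP, optional error
      * code).  If a valid event was being delivered through the IDT, the
      * switch came from a task gate and the pending event is cleaned up
      * before the switch is emulated.
      */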
3029 static int task_switch_interception(struct vcpu_svm *svm)
3030 {
3031         u16 tss_selector;
3032         int reason;
3033         int int_type = svm->vmcb->control.exit_int_info &
3034                 SVM_EXITINTINFO_TYPE_MASK;
3035         int int_vec = svm->vmcb->control.exit_int_info & SVM_EVTINJ_VEC_MASK;
3036         uint32_t type =
3037                 svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_TYPE_MASK;
3038         uint32_t idt_v =
3039                 svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_VALID;
3040         bool has_error_code = false;
3041         u32 error_code = 0;
3042
3043         tss_selector = (u16)svm->vmcb->control.exit_info_1;
3044
3045         if (svm->vmcb->control.exit_info_2 &
3046             (1ULL << SVM_EXITINFOSHIFT_TS_REASON_IRET))
3047                 reason = TASK_SWITCH_IRET;
3048         else if (svm->vmcb->control.exit_info_2 &
3049                  (1ULL << SVM_EXITINFOSHIFT_TS_REASON_JMP))
3050                 reason = TASK_SWITCH_JMP;
3051         else if (idt_v)
3052                 reason = TASK_SWITCH_GATE;
3053         else
3054                 reason = TASK_SWITCH_CALL;
3055
3056         if (reason == TASK_SWITCH_GATE) {
3057                 switch (type) {
3058                 case SVM_EXITINTINFO_TYPE_NMI:
3059                         svm->vcpu.arch.nmi_injected = false;
3060                         break;
3061                 case SVM_EXITINTINFO_TYPE_EXEPT:
3062                         if (svm->vmcb->control.exit_info_2 &
3063                             (1ULL << SVM_EXITINFOSHIFT_TS_HAS_ERROR_CODE)) {
3064                                 has_error_code = true;
3065                                 error_code =
3066                                         (u32)svm->vmcb->control.exit_info_2;
3067                         }
3068                         kvm_clear_exception_queue(&svm->vcpu);
3069                         break;
3070                 case SVM_EXITINTINFO_TYPE_INTR:
3071                         kvm_clear_interrupt_queue(&svm->vcpu);
3072                         break;
3073                 default:
3074                         break;
3075                 }
3076         }
3077
3078         if (reason != TASK_SWITCH_GATE ||
3079             int_type == SVM_EXITINTINFO_TYPE_SOFT ||
3080             (int_type == SVM_EXITINTINFO_TYPE_EXEPT &&
3081              (int_vec == OF_VECTOR || int_vec == BP_VECTOR)))
3082                 skip_emulated_instruction(&svm->vcpu);
3083
3084         if (int_type != SVM_EXITINTINFO_TYPE_SOFT)
3085                 int_vec = -1;
3086
3087         if (kvm_task_switch(&svm->vcpu, tss_selector, int_vec, reason,
3088                                 has_error_code, error_code) == EMULATE_FAIL) {
3089                 svm->vcpu.run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
3090                 svm->vcpu.run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
3091                 svm->vcpu.run->internal.ndata = 0;
3092                 return 0;
3093         }
3094         return 1;
3095 }
3096
3097 static int cpuid_interception(struct vcpu_svm *svm)
3098 {
3099         svm->next_rip = kvm_rip_read(&svm->vcpu) + 2;
3100         kvm_emulate_cpuid(&svm->vcpu);
3101         return 1;
3102 }
3103
3104 static int iret_interception(struct vcpu_svm *svm)
3105 {
3106         ++svm->vcpu.stat.nmi_window_exits;
3107         clr_intercept(svm, INTERCEPT_IRET);
3108         svm->vcpu.arch.hflags |= HF_IRET_MASK;
3109         svm->nmi_iret_rip = kvm_rip_read(&svm->vcpu);
3110         kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
3111         return 1;
3112 }
3113
3114 static int invlpg_interception(struct vcpu_svm *svm)
3115 {
3116         if (!static_cpu_has(X86_FEATURE_DECODEASSISTS))
3117                 return emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE;
3118
3119         kvm_mmu_invlpg(&svm->vcpu, svm->vmcb->control.exit_info_1);
3120         skip_emulated_instruction(&svm->vcpu);
3121         return 1;
3122 }
3123
3124 static int emulate_on_interception(struct vcpu_svm *svm)
3125 {
3126         return emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE;
3127 }
3128
3129 static int rdpmc_interception(struct vcpu_svm *svm)
3130 {
3131         int err;
3132
3133         if (!static_cpu_has(X86_FEATURE_NRIPS))
3134                 return emulate_on_interception(svm);
3135
3136         err = kvm_rdpmc(&svm->vcpu);
3137         kvm_complete_insn_gp(&svm->vcpu, err);
3138
3139         return 1;
3140 }
3141
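     /*
      * With the selective CR0 write intercept, L1 only wants a #VMEXIT
      * when a write changes CR0 bits outside SVM_CR0_SELECTIVE_MASK.
      * Compare the old and new values with those bits masked off and
      * reflect the exit to L1 only if anything else changed.
      */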
3142 static bool check_selective_cr0_intercepted(struct vcpu_svm *svm,
3143                                             unsigned long val)
3144 {
3145         unsigned long cr0 = svm->vcpu.arch.cr0;
3146         bool ret = false;
3147         u64 intercept;
3148
3149         intercept = svm->nested.intercept;
3150
3151         if (!is_guest_mode(&svm->vcpu) ||
3152             (!(intercept & (1ULL << INTERCEPT_SELECTIVE_CR0))))
3153                 return false;
3154
3155         cr0 &= ~SVM_CR0_SELECTIVE_MASK;
3156         val &= ~SVM_CR0_SELECTIVE_MASK;
3157
3158         if (cr0 ^ val) {
3159                 svm->vmcb->control.exit_code = SVM_EXIT_CR0_SEL_WRITE;
3160                 ret = (nested_svm_exit_handled(svm) == NESTED_EXIT_DONE);
3161         }
3162
3163         return ret;
3164 }
3165
3166 #define CR_VALID (1ULL << 63)
3167
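     /*
      * CR access intercept with decode assists: exit_info_1 bit 63 flags a
      * valid decode and the low bits name the GPR involved.  The exit code
      * encodes the CR number; codes at SVM_EXIT_READ_CR0 + 16 and above
      * are writes ("mov to cr"), anything below is a read.
      */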
3168 static int cr_interception(struct vcpu_svm *svm)
3169 {
3170         int reg, cr;
3171         unsigned long val;
3172         int err;
3173
3174         if (!static_cpu_has(X86_FEATURE_DECODEASSISTS))
3175                 return emulate_on_interception(svm);
3176
3177         if (unlikely((svm->vmcb->control.exit_info_1 & CR_VALID) == 0))
3178                 return emulate_on_interception(svm);
3179
3180         reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK;
3181         if (svm->vmcb->control.exit_code == SVM_EXIT_CR0_SEL_WRITE)
3182                 cr = SVM_EXIT_WRITE_CR0 - SVM_EXIT_READ_CR0;
3183         else
3184                 cr = svm->vmcb->control.exit_code - SVM_EXIT_READ_CR0;
3185
3186         err = 0;
3187         if (cr >= 16) { /* mov to cr */
3188                 cr -= 16;
3189                 val = kvm_register_read(&svm->vcpu, reg);
3190                 switch (cr) {
3191                 case 0:
3192                         if (!check_selective_cr0_intercepted(svm, val))
3193                                 err = kvm_set_cr0(&svm->vcpu, val);
3194                         else
3195                                 return 1;
3196
3197                         break;
3198                 case 3:
3199                         err = kvm_set_cr3(&svm->vcpu, val);
3200                         break;
3201                 case 4:
3202                         err = kvm_set_cr4(&svm->vcpu, val);
3203                         break;
3204                 case 8:
3205                         err = kvm_set_cr8(&svm->vcpu, val);
3206                         break;
3207                 default:
3208                         WARN(1, "unhandled write to CR%d", cr);
3209                         kvm_queue_exception(&svm->vcpu, UD_VECTOR);
3210                         return 1;
3211                 }
3212         } else { /* mov from cr */
3213                 switch (cr) {
3214                 case 0:
3215                         val = kvm_read_cr0(&svm->vcpu);
3216                         break;
3217                 case 2:
3218                         val = svm->vcpu.arch.cr2;
3219                         break;
3220                 case 3:
3221                         val = kvm_read_cr3(&svm->vcpu);
3222                         break;
3223                 case 4:
3224                         val = kvm_read_cr4(&svm->vcpu);
3225                         break;
3226                 case 8:
3227                         val = kvm_get_cr8(&svm->vcpu);
3228                         break;
3229                 default:
3230                         WARN(1, "unhandled read from CR%d", cr);
3231                         kvm_queue_exception(&svm->vcpu, UD_VECTOR);
3232                         return 1;
3233                 }
3234                 kvm_register_write(&svm->vcpu, reg, val);
3235         }
3236         kvm_complete_insn_gp(&svm->vcpu, err);
3237
3238         return 1;
3239 }
3240
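/*
 * Same decoding idea as for CR accesses: the DR write exit codes start
 * 16 above SVM_EXIT_READ_DR0, so "exit_code - SVM_EXIT_READ_DR0 >= 16"
 * identifies a write, and with decode assists exit_info_1 carries the
 * GPR number.
 */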
3241 static int dr_interception(struct vcpu_svm *svm)
3242 {
3243         int reg, dr;
3244         unsigned long val;
3245
3246         if (svm->vcpu.guest_debug == 0) {
3247                 /*
3248                  * No more DR vmexits; force a reload of the debug registers
3249                  * and reenter on this instruction.  The next vmexit will
3250                  * retrieve the full state of the debug registers.
3251                  */
3252                 clr_dr_intercepts(svm);
3253                 svm->vcpu.arch.switch_db_regs |= KVM_DEBUGREG_WONT_EXIT;
3254                 return 1;
3255         }
3256
3257         if (!boot_cpu_has(X86_FEATURE_DECODEASSISTS))
3258                 return emulate_on_interception(svm);
3259
3260         reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK;
3261         dr = svm->vmcb->control.exit_code - SVM_EXIT_READ_DR0;
3262
3263         if (dr >= 16) { /* mov to DRn */
3264                 if (!kvm_require_dr(&svm->vcpu, dr - 16))
3265                         return 1;
3266                 val = kvm_register_read(&svm->vcpu, reg);
3267                 kvm_set_dr(&svm->vcpu, dr - 16, val);
3268         } else {
3269                 if (!kvm_require_dr(&svm->vcpu, dr))
3270                         return 1;
3271                 kvm_get_dr(&svm->vcpu, dr, &val);
3272                 kvm_register_write(&svm->vcpu, reg, val);
3273         }
3274
3275         skip_emulated_instruction(&svm->vcpu);
3276
3277         return 1;
3278 }
3279
3280 static int cr8_write_interception(struct vcpu_svm *svm)
3281 {
3282         struct kvm_run *kvm_run = svm->vcpu.run;
3283         int r;
3284
3285         u8 cr8_prev = kvm_get_cr8(&svm->vcpu);
3286         /* instruction emulation calls kvm_set_cr8() */
3287         r = cr_interception(svm);
3288         if (lapic_in_kernel(&svm->vcpu))
3289                 return r;
3290         if (cr8_prev <= kvm_get_cr8(&svm->vcpu))
3291                 return r;
3292         kvm_run->exit_reason = KVM_EXIT_SET_TPR;
3293         return 0;
3294 }
3295
3296 static u64 svm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc)
3297 {
3298         struct vmcb *vmcb = get_host_vmcb(to_svm(vcpu));
3299         return vmcb->control.tsc_offset + host_tsc;
3300 }
3301
3302 static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
3303 {
3304         struct vcpu_svm *svm = to_svm(vcpu);
3305
3306         switch (msr_info->index) {
3307         case MSR_IA32_TSC: {
3308                 msr_info->data = svm->vmcb->control.tsc_offset +
3309                         kvm_scale_tsc(vcpu, rdtsc());
3310
3311                 break;
3312         }
3313         case MSR_STAR:
3314                 msr_info->data = svm->vmcb->save.star;
3315                 break;
3316 #ifdef CONFIG_X86_64
3317         case MSR_LSTAR:
3318                 msr_info->data = svm->vmcb->save.lstar;
3319                 break;
3320         case MSR_CSTAR:
3321                 msr_info->data = svm->vmcb->save.cstar;
3322                 break;
3323         case MSR_KERNEL_GS_BASE:
3324                 msr_info->data = svm->vmcb->save.kernel_gs_base;
3325                 break;
3326         case MSR_SYSCALL_MASK:
3327                 msr_info->data = svm->vmcb->save.sfmask;
3328                 break;
3329 #endif
3330         case MSR_IA32_SYSENTER_CS:
3331                 msr_info->data = svm->vmcb->save.sysenter_cs;
3332                 break;
3333         case MSR_IA32_SYSENTER_EIP:
3334                 msr_info->data = svm->sysenter_eip;
3335                 break;
3336         case MSR_IA32_SYSENTER_ESP:
3337                 msr_info->data = svm->sysenter_esp;
3338                 break;
3339         case MSR_TSC_AUX:
3340                 if (!boot_cpu_has(X86_FEATURE_RDTSCP))
3341                         return 1;
3342                 msr_info->data = svm->tsc_aux;
3343                 break;
3344         /*
3345          * Nobody will change the following 5 values in the VMCB so we can
3346          * safely return them on rdmsr. They will always be 0 until LBRV is
3347          * implemented.
3348          */
3349         case MSR_IA32_DEBUGCTLMSR:
3350                 msr_info->data = svm->vmcb->save.dbgctl;
3351                 break;
3352         case MSR_IA32_LASTBRANCHFROMIP:
3353                 msr_info->data = svm->vmcb->save.br_from;
3354                 break;
3355         case MSR_IA32_LASTBRANCHTOIP:
3356                 msr_info->data = svm->vmcb->save.br_to;
3357                 break;
3358         case MSR_IA32_LASTINTFROMIP:
3359                 msr_info->data = svm->vmcb->save.last_excp_from;
3360                 break;
3361         case MSR_IA32_LASTINTTOIP:
3362                 msr_info->data = svm->vmcb->save.last_excp_to;
3363                 break;
3364         case MSR_VM_HSAVE_PA:
3365                 msr_info->data = svm->nested.hsave_msr;
3366                 break;
3367         case MSR_VM_CR:
3368                 msr_info->data = svm->nested.vm_cr_msr;
3369                 break;
3370         case MSR_IA32_UCODE_REV:
3371                 msr_info->data = 0x01000065;
3372                 break;
3373         case MSR_F15H_IC_CFG: {
3374
3375                 int family, model;
3376
3377                 family = guest_cpuid_family(vcpu);
3378                 model  = guest_cpuid_model(vcpu);
3379
3380                 if (family < 0 || model < 0)
3381                         return kvm_get_msr_common(vcpu, msr_info);
3382
3383                 msr_info->data = 0;
3384
3385                 if (family == 0x15 &&
3386                     (model >= 0x2 && model < 0x20))
3387                         msr_info->data = 0x1E;
3388                 }
3389                 break;
3390         default:
3391                 return kvm_get_msr_common(vcpu, msr_info);
3392         }
3393         return 0;
3394 }
3395
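/*
 * Worked example of the register split below: if svm_get_msr() returns
 * data = 0x1122334455667788, the handler leaves RAX = 0x55667788 (low
 * 32 bits) and RDX = 0x11223344 (high 32 bits), matching the
 * architectural EDX:EAX result of RDMSR.
 */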
3396 static int rdmsr_interception(struct vcpu_svm *svm)
3397 {
3398         u32 ecx = kvm_register_read(&svm->vcpu, VCPU_REGS_RCX);
3399         struct msr_data msr_info;
3400
3401         msr_info.index = ecx;
3402         msr_info.host_initiated = false;
3403         if (svm_get_msr(&svm->vcpu, &msr_info)) {
3404                 trace_kvm_msr_read_ex(ecx);
3405                 kvm_inject_gp(&svm->vcpu, 0);
3406         } else {
3407                 trace_kvm_msr_read(ecx, msr_info.data);
3408
3409                 kvm_register_write(&svm->vcpu, VCPU_REGS_RAX,
3410                                    msr_info.data & 0xffffffff);
3411                 kvm_register_write(&svm->vcpu, VCPU_REGS_RDX,
3412                                    msr_info.data >> 32);
3413                 svm->next_rip = kvm_rip_read(&svm->vcpu) + 2;
3414                 skip_emulated_instruction(&svm->vcpu);
3415         }
3416         return 1;
3417 }
3418
3419 static int svm_set_vm_cr(struct kvm_vcpu *vcpu, u64 data)
3420 {
3421         struct vcpu_svm *svm = to_svm(vcpu);
3422         int svm_dis, chg_mask;
3423
3424         if (data & ~SVM_VM_CR_VALID_MASK)
3425                 return 1;
3426
3427         chg_mask = SVM_VM_CR_VALID_MASK;
3428
3429         if (svm->nested.vm_cr_msr & SVM_VM_CR_SVM_DIS_MASK)
3430                 chg_mask &= ~(SVM_VM_CR_SVM_LOCK_MASK | SVM_VM_CR_SVM_DIS_MASK);
3431
3432         svm->nested.vm_cr_msr &= ~chg_mask;
3433         svm->nested.vm_cr_msr |= (data & chg_mask);
3434
3435         svm_dis = svm->nested.vm_cr_msr & SVM_VM_CR_SVM_DIS_MASK;
3436
3437         /* check for svm_disable while efer.svme is set */
3438         if (svm_dis && (vcpu->arch.efer & EFER_SVME))
3439                 return 1;
3440
3441         return 0;
3442 }
3443
3444 static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
3445 {
3446         struct vcpu_svm *svm = to_svm(vcpu);
3447
3448         u32 ecx = msr->index;
3449         u64 data = msr->data;
3450         switch (ecx) {
3451         case MSR_IA32_TSC:
3452                 kvm_write_tsc(vcpu, msr);
3453                 break;
3454         case MSR_STAR:
3455                 svm->vmcb->save.star = data;
3456                 break;
3457 #ifdef CONFIG_X86_64
3458         case MSR_LSTAR:
3459                 svm->vmcb->save.lstar = data;
3460                 break;
3461         case MSR_CSTAR:
3462                 svm->vmcb->save.cstar = data;
3463                 break;
3464         case MSR_KERNEL_GS_BASE:
3465                 svm->vmcb->save.kernel_gs_base = data;
3466                 break;
3467         case MSR_SYSCALL_MASK:
3468                 svm->vmcb->save.sfmask = data;
3469                 break;
3470 #endif
3471         case MSR_IA32_SYSENTER_CS:
3472                 svm->vmcb->save.sysenter_cs = data;
3473                 break;
3474         case MSR_IA32_SYSENTER_EIP:
3475                 svm->sysenter_eip = data;
3476                 svm->vmcb->save.sysenter_eip = data;
3477                 break;
3478         case MSR_IA32_SYSENTER_ESP:
3479                 svm->sysenter_esp = data;
3480                 svm->vmcb->save.sysenter_esp = data;
3481                 break;
3482         case MSR_TSC_AUX:
3483                 if (!boot_cpu_has(X86_FEATURE_RDTSCP))
3484                         return 1;
3485
3486                 /*
3487                  * This is rare, so we update the MSR here instead of using
3488                  * direct_access_msrs.  Doing that would require a rdmsr in
3489                  * svm_vcpu_put.
3490                  */
3491                 svm->tsc_aux = data;
3492                 wrmsrl(MSR_TSC_AUX, svm->tsc_aux);
3493                 break;
3494         case MSR_IA32_DEBUGCTLMSR:
3495                 if (!boot_cpu_has(X86_FEATURE_LBRV)) {
3496                         vcpu_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTL 0x%llx, nop\n",
3497                                     __func__, data);
3498                         break;
3499                 }
3500                 if (data & DEBUGCTL_RESERVED_BITS)
3501                         return 1;
3502
3503                 svm->vmcb->save.dbgctl = data;
3504                 mark_dirty(svm->vmcb, VMCB_LBR);
3505                 if (data & (1ULL<<0))
3506                         svm_enable_lbrv(svm);
3507                 else
3508                         svm_disable_lbrv(svm);
3509                 break;
3510         case MSR_VM_HSAVE_PA:
3511                 svm->nested.hsave_msr = data;
3512                 break;
3513         case MSR_VM_CR:
3514                 return svm_set_vm_cr(vcpu, data);
3515         case MSR_VM_IGNNE:
3516                 vcpu_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data);
3517                 break;
3518         case MSR_IA32_APICBASE:
3519                 if (kvm_vcpu_apicv_active(vcpu))
3520                         avic_update_vapic_bar(to_svm(vcpu), data);
3521                 /* Fall through */
3522         default:
3523                 return kvm_set_msr_common(vcpu, msr);
3524         }
3525         return 0;
3526 }
3527
3528 static int wrmsr_interception(struct vcpu_svm *svm)
3529 {
3530         struct msr_data msr;
3531         u32 ecx = kvm_register_read(&svm->vcpu, VCPU_REGS_RCX);
3532         u64 data = kvm_read_edx_eax(&svm->vcpu);
3533
3534         msr.data = data;
3535         msr.index = ecx;
3536         msr.host_initiated = false;
3537
3538         svm->next_rip = kvm_rip_read(&svm->vcpu) + 2;
3539         if (kvm_set_msr(&svm->vcpu, &msr)) {
3540                 trace_kvm_msr_write_ex(ecx, data);
3541                 kvm_inject_gp(&svm->vcpu, 0);
3542         } else {
3543                 trace_kvm_msr_write(ecx, data);
3544                 skip_emulated_instruction(&svm->vcpu);
3545         }
3546         return 1;
3547 }
3548
3549 static int msr_interception(struct vcpu_svm *svm)
3550 {
3551         if (svm->vmcb->control.exit_info_1)
3552                 return wrmsr_interception(svm);
3553         else
3554                 return rdmsr_interception(svm);
3555 }
3556
3557 static int interrupt_window_interception(struct vcpu_svm *svm)
3558 {
3559         kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
3560         svm_clear_vintr(svm);
3561         svm->vmcb->control.int_ctl &= ~V_IRQ_MASK;
3562         mark_dirty(svm->vmcb, VMCB_INTR);
3563         ++svm->vcpu.stat.irq_window_exits;
3564         return 1;
3565 }
3566
3567 static int pause_interception(struct vcpu_svm *svm)
3568 {
3569         kvm_vcpu_on_spin(&(svm->vcpu));
3570         return 1;
3571 }
3572
3573 static int nop_interception(struct vcpu_svm *svm)
3574 {
3575         skip_emulated_instruction(&(svm->vcpu));
3576         return 1;
3577 }
3578
3579 static int monitor_interception(struct vcpu_svm *svm)
3580 {
3581         printk_once(KERN_WARNING "kvm: MONITOR instruction emulated as NOP!\n");
3582         return nop_interception(svm);
3583 }
3584
3585 static int mwait_interception(struct vcpu_svm *svm)
3586 {
3587         printk_once(KERN_WARNING "kvm: MWAIT instruction emulated as NOP!\n");
3588         return nop_interception(svm);
3589 }
3590
3591 enum avic_ipi_failure_cause {
3592         AVIC_IPI_FAILURE_INVALID_INT_TYPE,
3593         AVIC_IPI_FAILURE_TARGET_NOT_RUNNING,
3594         AVIC_IPI_FAILURE_INVALID_TARGET,
3595         AVIC_IPI_FAILURE_INVALID_BACKING_PAGE,
3596 };
3597
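/*
 * For these AVIC exits the hardware packs the faulting ICR into
 * exit_info_1 (low 32 bits = ICRL, high 32 bits = ICRH) and the
 * failure cause plus an index into exit_info_2; that is how the
 * fields are unpacked at the top of the handler below.
 */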
3598 static int avic_incomplete_ipi_interception(struct vcpu_svm *svm)
3599 {
3600         u32 icrh = svm->vmcb->control.exit_info_1 >> 32;
3601         u32 icrl = svm->vmcb->control.exit_info_1;
3602         u32 id = svm->vmcb->control.exit_info_2 >> 32;
3603         u32 index = svm->vmcb->control.exit_info_2 & 0xFF;
3604         struct kvm_lapic *apic = svm->vcpu.arch.apic;
3605
3606         trace_kvm_avic_incomplete_ipi(svm->vcpu.vcpu_id, icrh, icrl, id, index);
3607
3608         switch (id) {
3609         case AVIC_IPI_FAILURE_INVALID_INT_TYPE:
3610                 /*
3611                  * AVIC hardware handles the generation of
3612                  * IPIs when the specified Message Type is Fixed
3613                  * (also known as fixed delivery mode) and
3614                  * the Trigger Mode is edge-triggered. The hardware
3615                  * also supports self and broadcast delivery modes
3616                  * specified via the Destination Shorthand (DSH)
3617                  * field of the ICRL. Logical and physical APIC ID
3618                  * formats are supported. All other IPI types cause
3619                  * a #VMEXIT, which needs to be emulated.
3620                  */
3621                 kvm_lapic_reg_write(apic, APIC_ICR2, icrh);
3622                 kvm_lapic_reg_write(apic, APIC_ICR, icrl);
3623                 break;
3624         case AVIC_IPI_FAILURE_TARGET_NOT_RUNNING: {
3625                 int i;
3626                 struct kvm_vcpu *vcpu;
3627                 struct kvm *kvm = svm->vcpu.kvm;
3628                 struct kvm_lapic *apic = svm->vcpu.arch.apic;
3629
3630                 /*
3631                  * At this point, we expect that the AVIC HW has already
3632                  * set the appropriate IRR bits on the valid target
3633                  * vcpus. So, we just need to kick the appropriate vcpu.
3634                  */
3635                 kvm_for_each_vcpu(i, vcpu, kvm) {
3636                         bool m = kvm_apic_match_dest(vcpu, apic,
3637                                                      icrl & KVM_APIC_SHORT_MASK,
3638                                                      GET_APIC_DEST_FIELD(icrh),
3639                                                      icrl & KVM_APIC_DEST_MASK);
3640
3641                         if (m && !avic_vcpu_is_running(vcpu))
3642                                 kvm_vcpu_wake_up(vcpu);
3643                 }
3644                 break;
3645         }
3646         case AVIC_IPI_FAILURE_INVALID_TARGET:
3647                 break;
3648         case AVIC_IPI_FAILURE_INVALID_BACKING_PAGE:
3649                 WARN_ONCE(1, "Invalid backing page\n");
3650                 break;
3651         default:
3652                 pr_err("Unknown IPI interception\n");
3653         }
3654
3655         return 1;
3656 }
3657
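/*
 * Index computation below, by example: in flat mode a logical ID with
 * bit 3 set maps to table index 3; in cluster mode a logical ID of
 * 0x21 (cluster 2, APIC bit 0) maps to index (2 << 2) + 0 = 8.  A
 * logical ID with no bit set, an out-of-range bit, or cluster 0xf
 * yields no entry.
 */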
3658 static u32 *avic_get_logical_id_entry(struct kvm_vcpu *vcpu, u32 ldr, bool flat)
3659 {
3660         struct kvm_arch *vm_data = &vcpu->kvm->arch;
3661         int index;
3662         u32 *logical_apic_id_table;
3663         int dlid = GET_APIC_LOGICAL_ID(ldr);
3664
3665         if (!dlid)
3666                 return NULL;
3667
3668         if (flat) { /* flat */
3669                 index = ffs(dlid) - 1;
3670                 if (index > 7)
3671                         return NULL;
3672         } else { /* cluster */
3673                 int cluster = (dlid & 0xf0) >> 4;
3674                 int apic = ffs(dlid & 0x0f) - 1;
3675
3676                 if ((apic < 0) || (apic > 7) ||
3677                     (cluster >= 0xf))
3678                         return NULL;
3679                 index = (cluster << 2) + apic;
3680         }
3681
3682         logical_apic_id_table = (u32 *) page_address(vm_data->avic_logical_id_table_page);
3683
3684         return &logical_apic_id_table[index];
3685 }
3686
3687 static int avic_ldr_write(struct kvm_vcpu *vcpu, u8 g_physical_id, u32 ldr,
3688                           bool valid)
3689 {
3690         bool flat;
3691         u32 *entry, new_entry;
3692
3693         flat = kvm_lapic_get_reg(vcpu->arch.apic, APIC_DFR) == APIC_DFR_FLAT;
3694         entry = avic_get_logical_id_entry(vcpu, ldr, flat);
3695         if (!entry)
3696                 return -EINVAL;
3697
3698         new_entry = READ_ONCE(*entry);
3699         new_entry &= ~AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK;
3700         new_entry |= (g_physical_id & AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK);
3701         if (valid)
3702                 new_entry |= AVIC_LOGICAL_ID_ENTRY_VALID_MASK;
3703         else
3704                 new_entry &= ~AVIC_LOGICAL_ID_ENTRY_VALID_MASK;
3705         WRITE_ONCE(*entry, new_entry);
3706
3707         return 0;
3708 }
3709
3710 static int avic_handle_ldr_update(struct kvm_vcpu *vcpu)
3711 {
3712         int ret;
3713         struct vcpu_svm *svm = to_svm(vcpu);
3714         u32 ldr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_LDR);
3715
3716         if (!ldr)
3717                 return 1;
3718
3719         ret = avic_ldr_write(vcpu, vcpu->vcpu_id, ldr, true);
3720         if (ret && svm->ldr_reg) {
3721                 avic_ldr_write(vcpu, 0, svm->ldr_reg, false);
3722                 svm->ldr_reg = 0;
3723         } else {
3724                 svm->ldr_reg = ldr;
3725         }
3726         return ret;
3727 }
3728
3729 static int avic_handle_apic_id_update(struct kvm_vcpu *vcpu)
3730 {
3731         u64 *old, *new;
3732         struct vcpu_svm *svm = to_svm(vcpu);
3733         u32 apic_id_reg = kvm_lapic_get_reg(vcpu->arch.apic, APIC_ID);
3734         u32 id = (apic_id_reg >> 24) & 0xff;
3735
3736         if (vcpu->vcpu_id == id)
3737                 return 0;
3738
3739         old = avic_get_physical_id_entry(vcpu, vcpu->vcpu_id);
3740         new = avic_get_physical_id_entry(vcpu, id);
3741         if (!new || !old)
3742                 return 1;
3743
3744         /* We need to move the physical_id_entry to the new offset */
3745         *new = *old;
3746         *old = 0ULL;
3747         to_svm(vcpu)->avic_physical_id_cache = new;
3748
3749         /*
3750          * Also update the guest physical APIC ID in the logical
3751          * APIC ID table entry if the LDR has already been set up.
3752          */
3753         if (svm->ldr_reg)
3754                 avic_handle_ldr_update(vcpu);
3755
3756         return 0;
3757 }
3758
3759 static int avic_handle_dfr_update(struct kvm_vcpu *vcpu)
3760 {
3761         struct vcpu_svm *svm = to_svm(vcpu);
3762         struct kvm_arch *vm_data = &vcpu->kvm->arch;
3763         u32 dfr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_DFR);
3764         u32 mod = (dfr >> 28) & 0xf;
3765
3766         /*
3767          * We assume that all local APICs are using the same destination
3768          * format (DFR).  If this changes, we need to flush the AVIC
3769          * logical APIC ID table.
3770          */
3771         if (vm_data->ldr_mode == mod)
3772                 return 0;
3773
3774         clear_page(page_address(vm_data->avic_logical_id_table_page));
3775         vm_data->ldr_mode = mod;
3776
3777         if (svm->ldr_reg)
3778                 avic_handle_ldr_update(vcpu);
3779         return 0;
3780 }
3781
3782 static int avic_unaccel_trap_write(struct vcpu_svm *svm)
3783 {
3784         struct kvm_lapic *apic = svm->vcpu.arch.apic;
3785         u32 offset = svm->vmcb->control.exit_info_1 &
3786                                 AVIC_UNACCEL_ACCESS_OFFSET_MASK;
3787
3788         switch (offset) {
3789         case APIC_ID:
3790                 if (avic_handle_apic_id_update(&svm->vcpu))
3791                         return 0;
3792                 break;
3793         case APIC_LDR:
3794                 if (avic_handle_ldr_update(&svm->vcpu))
3795                         return 0;
3796                 break;
3797         case APIC_DFR:
3798                 avic_handle_dfr_update(&svm->vcpu);
3799                 break;
3800         default:
3801                 break;
3802         }
3803
3804         kvm_lapic_reg_write(apic, offset, kvm_lapic_get_reg(apic, offset));
3805
3806         return 1;
3807 }
3808
3809 static bool is_avic_unaccelerated_access_trap(u32 offset)
3810 {
3811         bool ret = false;
3812
3813         switch (offset) {
3814         case APIC_ID:
3815         case APIC_EOI:
3816         case APIC_RRR:
3817         case APIC_LDR:
3818         case APIC_DFR:
3819         case APIC_SPIV:
3820         case APIC_ESR:
3821         case APIC_ICR:
3822         case APIC_LVTT:
3823         case APIC_LVTTHMR:
3824         case APIC_LVTPC:
3825         case APIC_LVT0:
3826         case APIC_LVT1:
3827         case APIC_LVTERR:
3828         case APIC_TMICT:
3829         case APIC_TDCR:
3830                 ret = true;
3831                 break;
3832         default:
3833                 break;
3834         }
3835         return ret;
3836 }
3837
3838 static int avic_unaccelerated_access_interception(struct vcpu_svm *svm)
3839 {
3840         int ret = 0;
3841         u32 offset = svm->vmcb->control.exit_info_1 &
3842                      AVIC_UNACCEL_ACCESS_OFFSET_MASK;
3843         u32 vector = svm->vmcb->control.exit_info_2 &
3844                      AVIC_UNACCEL_ACCESS_VECTOR_MASK;
3845         bool write = (svm->vmcb->control.exit_info_1 >> 32) &
3846                      AVIC_UNACCEL_ACCESS_WRITE_MASK;
3847         bool trap = is_avic_unaccelerated_access_trap(offset);
3848
3849         trace_kvm_avic_unaccelerated_access(svm->vcpu.vcpu_id, offset,
3850                                             trap, write, vector);
3851         if (trap) {
3852                 /* Handling Trap */
3853                 WARN_ONCE(!write, "svm: Handling trap read.\n");
3854                 ret = avic_unaccel_trap_write(svm);
3855         } else {
3856                 /* Handling Fault */
3857                 ret = (emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE);
3858         }
3859
3860         return ret;
3861 }
3862
3863 static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = {
3864         [SVM_EXIT_READ_CR0]                     = cr_interception,
3865         [SVM_EXIT_READ_CR3]                     = cr_interception,
3866         [SVM_EXIT_READ_CR4]                     = cr_interception,
3867         [SVM_EXIT_READ_CR8]                     = cr_interception,
3868         [SVM_EXIT_CR0_SEL_WRITE]                = cr_interception,
3869         [SVM_EXIT_WRITE_CR0]                    = cr_interception,
3870         [SVM_EXIT_WRITE_CR3]                    = cr_interception,
3871         [SVM_EXIT_WRITE_CR4]                    = cr_interception,
3872         [SVM_EXIT_WRITE_CR8]                    = cr8_write_interception,
3873         [SVM_EXIT_READ_DR0]                     = dr_interception,
3874         [SVM_EXIT_READ_DR1]                     = dr_interception,
3875         [SVM_EXIT_READ_DR2]                     = dr_interception,
3876         [SVM_EXIT_READ_DR3]                     = dr_interception,
3877         [SVM_EXIT_READ_DR4]                     = dr_interception,
3878         [SVM_EXIT_READ_DR5]                     = dr_interception,
3879         [SVM_EXIT_READ_DR6]                     = dr_interception,
3880         [SVM_EXIT_READ_DR7]                     = dr_interception,
3881         [SVM_EXIT_WRITE_DR0]                    = dr_interception,
3882         [SVM_EXIT_WRITE_DR1]                    = dr_interception,
3883         [SVM_EXIT_WRITE_DR2]                    = dr_interception,
3884         [SVM_EXIT_WRITE_DR3]                    = dr_interception,
3885         [SVM_EXIT_WRITE_DR4]                    = dr_interception,
3886         [SVM_EXIT_WRITE_DR5]                    = dr_interception,
3887         [SVM_EXIT_WRITE_DR6]                    = dr_interception,
3888         [SVM_EXIT_WRITE_DR7]                    = dr_interception,
3889         [SVM_EXIT_EXCP_BASE + DB_VECTOR]        = db_interception,
3890         [SVM_EXIT_EXCP_BASE + BP_VECTOR]        = bp_interception,
3891         [SVM_EXIT_EXCP_BASE + UD_VECTOR]        = ud_interception,
3892         [SVM_EXIT_EXCP_BASE + PF_VECTOR]        = pf_interception,
3893         [SVM_EXIT_EXCP_BASE + NM_VECTOR]        = nm_interception,
3894         [SVM_EXIT_EXCP_BASE + MC_VECTOR]        = mc_interception,
3895         [SVM_EXIT_EXCP_BASE + AC_VECTOR]        = ac_interception,
3896         [SVM_EXIT_INTR]                         = intr_interception,
3897         [SVM_EXIT_NMI]                          = nmi_interception,
3898         [SVM_EXIT_SMI]                          = nop_on_interception,
3899         [SVM_EXIT_INIT]                         = nop_on_interception,
3900         [SVM_EXIT_VINTR]                        = interrupt_window_interception,
3901         [SVM_EXIT_RDPMC]                        = rdpmc_interception,
3902         [SVM_EXIT_CPUID]                        = cpuid_interception,
3903         [SVM_EXIT_IRET]                         = iret_interception,
3904         [SVM_EXIT_INVD]                         = emulate_on_interception,
3905         [SVM_EXIT_PAUSE]                        = pause_interception,
3906         [SVM_EXIT_HLT]                          = halt_interception,
3907         [SVM_EXIT_INVLPG]                       = invlpg_interception,
3908         [SVM_EXIT_INVLPGA]                      = invlpga_interception,
3909         [SVM_EXIT_IOIO]                         = io_interception,
3910         [SVM_EXIT_MSR]                          = msr_interception,
3911         [SVM_EXIT_TASK_SWITCH]                  = task_switch_interception,
3912         [SVM_EXIT_SHUTDOWN]                     = shutdown_interception,
3913         [SVM_EXIT_VMRUN]                        = vmrun_interception,
3914         [SVM_EXIT_VMMCALL]                      = vmmcall_interception,
3915         [SVM_EXIT_VMLOAD]                       = vmload_interception,
3916         [SVM_EXIT_VMSAVE]                       = vmsave_interception,
3917         [SVM_EXIT_STGI]                         = stgi_interception,
3918         [SVM_EXIT_CLGI]                         = clgi_interception,
3919         [SVM_EXIT_SKINIT]                       = skinit_interception,
3920         [SVM_EXIT_WBINVD]                       = wbinvd_interception,
3921         [SVM_EXIT_MONITOR]                      = monitor_interception,
3922         [SVM_EXIT_MWAIT]                        = mwait_interception,
3923         [SVM_EXIT_XSETBV]                       = xsetbv_interception,
3924         [SVM_EXIT_NPF]                          = pf_interception,
3925         [SVM_EXIT_RSM]                          = emulate_on_interception,
3926         [SVM_EXIT_AVIC_INCOMPLETE_IPI]          = avic_incomplete_ipi_interception,
3927         [SVM_EXIT_AVIC_UNACCELERATED_ACCESS]    = avic_unaccelerated_access_interception,
3928 };
3929
3930 static void dump_vmcb(struct kvm_vcpu *vcpu)
3931 {
3932         struct vcpu_svm *svm = to_svm(vcpu);
3933         struct vmcb_control_area *control = &svm->vmcb->control;
3934         struct vmcb_save_area *save = &svm->vmcb->save;
3935
3936         pr_err("VMCB Control Area:\n");
3937         pr_err("%-20s%04x\n", "cr_read:", control->intercept_cr & 0xffff);
3938         pr_err("%-20s%04x\n", "cr_write:", control->intercept_cr >> 16);
3939         pr_err("%-20s%04x\n", "dr_read:", control->intercept_dr & 0xffff);
3940         pr_err("%-20s%04x\n", "dr_write:", control->intercept_dr >> 16);
3941         pr_err("%-20s%08x\n", "exceptions:", control->intercept_exceptions);
3942         pr_err("%-20s%016llx\n", "intercepts:", control->intercept);
3943         pr_err("%-20s%d\n", "pause filter count:", control->pause_filter_count);
3944         pr_err("%-20s%016llx\n", "iopm_base_pa:", control->iopm_base_pa);
3945         pr_err("%-20s%016llx\n", "msrpm_base_pa:", control->msrpm_base_pa);
3946         pr_err("%-20s%016llx\n", "tsc_offset:", control->tsc_offset);
3947         pr_err("%-20s%d\n", "asid:", control->asid);
3948         pr_err("%-20s%d\n", "tlb_ctl:", control->tlb_ctl);
3949         pr_err("%-20s%08x\n", "int_ctl:", control->int_ctl);
3950         pr_err("%-20s%08x\n", "int_vector:", control->int_vector);
3951         pr_err("%-20s%08x\n", "int_state:", control->int_state);
3952         pr_err("%-20s%08x\n", "exit_code:", control->exit_code);
3953         pr_err("%-20s%016llx\n", "exit_info1:", control->exit_info_1);
3954         pr_err("%-20s%016llx\n", "exit_info2:", control->exit_info_2);
3955         pr_err("%-20s%08x\n", "exit_int_info:", control->exit_int_info);
3956         pr_err("%-20s%08x\n", "exit_int_info_err:", control->exit_int_info_err);
3957         pr_err("%-20s%lld\n", "nested_ctl:", control->nested_ctl);
3958         pr_err("%-20s%016llx\n", "nested_cr3:", control->nested_cr3);
3959         pr_err("%-20s%016llx\n", "avic_vapic_bar:", control->avic_vapic_bar);
3960         pr_err("%-20s%08x\n", "event_inj:", control->event_inj);
3961         pr_err("%-20s%08x\n", "event_inj_err:", control->event_inj_err);
3962         pr_err("%-20s%lld\n", "lbr_ctl:", control->lbr_ctl);
3963         pr_err("%-20s%016llx\n", "next_rip:", control->next_rip);
3964         pr_err("%-20s%016llx\n", "avic_backing_page:", control->avic_backing_page);
3965         pr_err("%-20s%016llx\n", "avic_logical_id:", control->avic_logical_id);
3966         pr_err("%-20s%016llx\n", "avic_physical_id:", control->avic_physical_id);
3967         pr_err("VMCB State Save Area:\n");
3968         pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3969                "es:",
3970                save->es.selector, save->es.attrib,
3971                save->es.limit, save->es.base);
3972         pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3973                "cs:",
3974                save->cs.selector, save->cs.attrib,
3975                save->cs.limit, save->cs.base);
3976         pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3977                "ss:",
3978                save->ss.selector, save->ss.attrib,
3979                save->ss.limit, save->ss.base);
3980         pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3981                "ds:",
3982                save->ds.selector, save->ds.attrib,
3983                save->ds.limit, save->ds.base);
3984         pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3985                "fs:",
3986                save->fs.selector, save->fs.attrib,
3987                save->fs.limit, save->fs.base);
3988         pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3989                "gs:",
3990                save->gs.selector, save->gs.attrib,
3991                save->gs.limit, save->gs.base);
3992         pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3993                "gdtr:",
3994                save->gdtr.selector, save->gdtr.attrib,
3995                save->gdtr.limit, save->gdtr.base);
3996         pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3997                "ldtr:",
3998                save->ldtr.selector, save->ldtr.attrib,
3999                save->ldtr.limit, save->ldtr.base);
4000         pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
4001                "idtr:",
4002                save->idtr.selector, save->idtr.attrib,
4003                save->idtr.limit, save->idtr.base);
4004         pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
4005                "tr:",
4006                save->tr.selector, save->tr.attrib,
4007                save->tr.limit, save->tr.base);
4008         pr_err("cpl:            %d                efer:         %016llx\n",
4009                 save->cpl, save->efer);
4010         pr_err("%-15s %016llx %-13s %016llx\n",
4011                "cr0:", save->cr0, "cr2:", save->cr2);
4012         pr_err("%-15s %016llx %-13s %016llx\n",
4013                "cr3:", save->cr3, "cr4:", save->cr4);
4014         pr_err("%-15s %016llx %-13s %016llx\n",
4015                "dr6:", save->dr6, "dr7:", save->dr7);
4016         pr_err("%-15s %016llx %-13s %016llx\n",
4017                "rip:", save->rip, "rflags:", save->rflags);
4018         pr_err("%-15s %016llx %-13s %016llx\n",
4019                "rsp:", save->rsp, "rax:", save->rax);
4020         pr_err("%-15s %016llx %-13s %016llx\n",
4021                "star:", save->star, "lstar:", save->lstar);
4022         pr_err("%-15s %016llx %-13s %016llx\n",
4023                "cstar:", save->cstar, "sfmask:", save->sfmask);
4024         pr_err("%-15s %016llx %-13s %016llx\n",
4025                "kernel_gs_base:", save->kernel_gs_base,
4026                "sysenter_cs:", save->sysenter_cs);
4027         pr_err("%-15s %016llx %-13s %016llx\n",
4028                "sysenter_esp:", save->sysenter_esp,
4029                "sysenter_eip:", save->sysenter_eip);
4030         pr_err("%-15s %016llx %-13s %016llx\n",
4031                "gpat:", save->g_pat, "dbgctl:", save->dbgctl);
4032         pr_err("%-15s %016llx %-13s %016llx\n",
4033                "br_from:", save->br_from, "br_to:", save->br_to);
4034         pr_err("%-15s %016llx %-13s %016llx\n",
4035                "excp_from:", save->last_excp_from,
4036                "excp_to:", save->last_excp_to);
4037 }
4038
4039 static void svm_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2)
4040 {
4041         struct vmcb_control_area *control = &to_svm(vcpu)->vmcb->control;
4042
4043         *info1 = control->exit_info_1;
4044         *info2 = control->exit_info_2;
4045 }
4046
4047 static int handle_exit(struct kvm_vcpu *vcpu)
4048 {
4049         struct vcpu_svm *svm = to_svm(vcpu);
4050         struct kvm_run *kvm_run = vcpu->run;
4051         u32 exit_code = svm->vmcb->control.exit_code;
4052
4053         trace_kvm_exit(exit_code, vcpu, KVM_ISA_SVM);
4054
4055         if (!is_cr_intercept(svm, INTERCEPT_CR0_WRITE))
4056                 vcpu->arch.cr0 = svm->vmcb->save.cr0;
4057         if (npt_enabled)
4058                 vcpu->arch.cr3 = svm->vmcb->save.cr3;
4059
4060         if (unlikely(svm->nested.exit_required)) {
4061                 nested_svm_vmexit(svm);
4062                 svm->nested.exit_required = false;
4063
4064                 return 1;
4065         }
4066
4067         if (is_guest_mode(vcpu)) {
4068                 int vmexit;
4069
4070                 trace_kvm_nested_vmexit(svm->vmcb->save.rip, exit_code,
4071                                         svm->vmcb->control.exit_info_1,
4072                                         svm->vmcb->control.exit_info_2,
4073                                         svm->vmcb->control.exit_int_info,
4074                                         svm->vmcb->control.exit_int_info_err,
4075                                         KVM_ISA_SVM);
4076
4077                 vmexit = nested_svm_exit_special(svm);
4078
4079                 if (vmexit == NESTED_EXIT_CONTINUE)
4080                         vmexit = nested_svm_exit_handled(svm);
4081
4082                 if (vmexit == NESTED_EXIT_DONE)
4083                         return 1;
4084         }
4085
4086         svm_complete_interrupts(svm);
4087
4088         if (svm->vmcb->control.exit_code == SVM_EXIT_ERR) {
4089                 kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
4090                 kvm_run->fail_entry.hardware_entry_failure_reason
4091                         = svm->vmcb->control.exit_code;
4092                 pr_err("KVM: FAILED VMRUN WITH VMCB:\n");
4093                 dump_vmcb(vcpu);
4094                 return 0;
4095         }
4096
4097         if (is_external_interrupt(svm->vmcb->control.exit_int_info) &&
4098             exit_code != SVM_EXIT_EXCP_BASE + PF_VECTOR &&
4099             exit_code != SVM_EXIT_NPF && exit_code != SVM_EXIT_TASK_SWITCH &&
4100             exit_code != SVM_EXIT_INTR && exit_code != SVM_EXIT_NMI)
4101                 printk(KERN_ERR "%s: unexpected exit_int_info 0x%x "
4102                        "exit_code 0x%x\n",
4103                        __func__, svm->vmcb->control.exit_int_info,
4104                        exit_code);
4105
4106         if (exit_code >= ARRAY_SIZE(svm_exit_handlers)
4107             || !svm_exit_handlers[exit_code]) {
4108                 WARN_ONCE(1, "svm: unexpected exit reason 0x%x\n", exit_code);
4109                 kvm_queue_exception(vcpu, UD_VECTOR);
4110                 return 1;
4111         }
4112
4113         return svm_exit_handlers[exit_code](svm);
4114 }
4115
4116 static void reload_tss(struct kvm_vcpu *vcpu)
4117 {
4118         int cpu = raw_smp_processor_id();
4119
4120         struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
4121         sd->tss_desc->type = 9; /* available 32/64-bit TSS */
4122         load_TR_desc();
4123 }
4124
4125 static void pre_svm_run(struct vcpu_svm *svm)
4126 {
4127         int cpu = raw_smp_processor_id();
4128
4129         struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
4130
4131         /* FIXME: handle wraparound of asid_generation */
4132         if (svm->asid_generation != sd->asid_generation)
4133                 new_asid(svm, sd);
4134 }
4135
4136 static void svm_inject_nmi(struct kvm_vcpu *vcpu)
4137 {
4138         struct vcpu_svm *svm = to_svm(vcpu);
4139
4140         svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI;
4141         vcpu->arch.hflags |= HF_NMI_MASK;
4142         set_intercept(svm, INTERCEPT_IRET);
4143         ++vcpu->stat.nmi_injections;
4144 }
4145
4146 static inline void svm_inject_irq(struct vcpu_svm *svm, int irq)
4147 {
4148         struct vmcb_control_area *control;
4149
4150         /* The following fields are ignored when AVIC is enabled */
4151         control = &svm->vmcb->control;
4152         control->int_vector = irq;
4153         control->int_ctl &= ~V_INTR_PRIO_MASK;
4154         control->int_ctl |= V_IRQ_MASK |
4155                 ((/*control->int_vector >> 4*/ 0xf) << V_INTR_PRIO_SHIFT);
4156         mark_dirty(svm->vmcb, VMCB_INTR);
4157 }
4158
4159 static void svm_set_irq(struct kvm_vcpu *vcpu)
4160 {
4161         struct vcpu_svm *svm = to_svm(vcpu);
4162
4163         BUG_ON(!(gif_set(svm)));
4164
4165         trace_kvm_inj_virq(vcpu->arch.interrupt.nr);
4166         ++vcpu->stat.irq_injections;
4167
4168         svm->vmcb->control.event_inj = vcpu->arch.interrupt.nr |
4169                 SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR;
4170 }
4171
4172 static inline bool svm_nested_virtualize_tpr(struct kvm_vcpu *vcpu)
4173 {
4174         return is_guest_mode(vcpu) && (vcpu->arch.hflags & HF_VINTR_MASK);
4175 }
4176
4177 static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
4178 {
4179         struct vcpu_svm *svm = to_svm(vcpu);
4180
4181         if (svm_nested_virtualize_tpr(vcpu) ||
4182             kvm_vcpu_apicv_active(vcpu))
4183                 return;
4184
4185         clr_cr_intercept(svm, INTERCEPT_CR8_WRITE);
4186
4187         if (irr == -1)
4188                 return;
4189
4190         if (tpr >= irr)
4191                 set_cr_intercept(svm, INTERCEPT_CR8_WRITE);
4192 }
4193
4194 static void svm_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set)
4195 {
4196         return;
4197 }
4198
4199 static bool svm_get_enable_apicv(void)
4200 {
4201         return avic;
4202 }
4203
4204 static void svm_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr)
4205 {
4206 }
4207
4208 static void svm_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr)
4209 {
4210 }
4211
4212 /* Note: Currently only used by Hyper-V. */
4213 static void svm_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
4214 {
4215         struct vcpu_svm *svm = to_svm(vcpu);
4216         struct vmcb *vmcb = svm->vmcb;
4217
4218         if (!avic)
4219                 return;
4220
4221         vmcb->control.int_ctl &= ~AVIC_ENABLE_MASK;
4222         mark_dirty(vmcb, VMCB_INTR);
4223 }
4224
4225 static void svm_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
4226 {
4227         return;
4228 }
4229
4230 static void svm_sync_pir_to_irr(struct kvm_vcpu *vcpu)
4231 {
4232         return;
4233 }
4234
4235 static void svm_deliver_avic_intr(struct kvm_vcpu *vcpu, int vec)
4236 {
4237         kvm_lapic_set_irr(vec, vcpu->arch.apic);
4238         smp_mb__after_atomic();
4239
4240         if (avic_vcpu_is_running(vcpu))
4241                 wrmsrl(SVM_AVIC_DOORBELL,
4242                        kvm_cpu_get_apicid(vcpu->cpu));
4243         else
4244                 kvm_vcpu_wake_up(vcpu);
4245 }
4246
4247 static int svm_nmi_allowed(struct kvm_vcpu *vcpu)
4248 {
4249         struct vcpu_svm *svm = to_svm(vcpu);
4250         struct vmcb *vmcb = svm->vmcb;
4251         int ret;
4252         ret = !(vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) &&
4253               !(svm->vcpu.arch.hflags & HF_NMI_MASK);
4254         ret = ret && gif_set(svm) && nested_svm_nmi(svm);
4255
4256         return ret;
4257 }
4258
4259 static bool svm_get_nmi_mask(struct kvm_vcpu *vcpu)
4260 {
4261         struct vcpu_svm *svm = to_svm(vcpu);
4262
4263         return !!(svm->vcpu.arch.hflags & HF_NMI_MASK);
4264 }
4265
4266 static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
4267 {
4268         struct vcpu_svm *svm = to_svm(vcpu);
4269
4270         if (masked) {
4271                 svm->vcpu.arch.hflags |= HF_NMI_MASK;
4272                 set_intercept(svm, INTERCEPT_IRET);
4273         } else {
4274                 svm->vcpu.arch.hflags &= ~HF_NMI_MASK;
4275                 clr_intercept(svm, INTERCEPT_IRET);
4276         }
4277 }
4278
4279 static int svm_interrupt_allowed(struct kvm_vcpu *vcpu)
4280 {
4281         struct vcpu_svm *svm = to_svm(vcpu);
4282         struct vmcb *vmcb = svm->vmcb;
4283         int ret;
4284
4285         if (!gif_set(svm) ||
4286              (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK))
4287                 return 0;
4288
4289         ret = !!(kvm_get_rflags(vcpu) & X86_EFLAGS_IF);
4290
4291         if (is_guest_mode(vcpu))
4292                 return ret && !(svm->vcpu.arch.hflags & HF_VINTR_MASK);
4293
4294         return ret;
4295 }
4296
4297 static void enable_irq_window(struct kvm_vcpu *vcpu)
4298 {
4299         struct vcpu_svm *svm = to_svm(vcpu);
4300
4301         if (kvm_vcpu_apicv_active(vcpu))
4302                 return;
4303
4304         /*
4305          * In case GIF=0 we can't rely on the CPU to tell us when GIF becomes
4306          * 1, because that's a separate STGI/VMRUN intercept.  The next time we
4307          * get that intercept, this function will be called again though and
4308          * we'll get the vintr intercept.
4309          */
4310         if (gif_set(svm) && nested_svm_intr(svm)) {
4311                 svm_set_vintr(svm);
4312                 svm_inject_irq(svm, 0x0);
4313         }
4314 }
4315
4316 static void enable_nmi_window(struct kvm_vcpu *vcpu)
4317 {
4318         struct vcpu_svm *svm = to_svm(vcpu);
4319
4320         if ((svm->vcpu.arch.hflags & (HF_NMI_MASK | HF_IRET_MASK))
4321             == HF_NMI_MASK)
4322                 return; /* IRET will cause a vm exit */
4323
4324         /*
4325          * Something is preventing the NMI from being injected. Single-step over
4326          * the possible problem (IRET, exception injection or interrupt shadow).
4327          */
4328         svm->nmi_singlestep = true;
4329         svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
4330 }
4331
4332 static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr)
4333 {
4334         return 0;
4335 }
4336
4337 static void svm_flush_tlb(struct kvm_vcpu *vcpu)
4338 {
4339         struct vcpu_svm *svm = to_svm(vcpu);
4340
4341         if (static_cpu_has(X86_FEATURE_FLUSHBYASID))
4342                 svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ASID;
4343         else
4344                 svm->asid_generation--;
4345 }
4346
4347 static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu)
4348 {
4349 }
4350
4351 static inline void sync_cr8_to_lapic(struct kvm_vcpu *vcpu)
4352 {
4353         struct vcpu_svm *svm = to_svm(vcpu);
4354
4355         if (svm_nested_virtualize_tpr(vcpu))
4356                 return;
4357
4358         if (!is_cr_intercept(svm, INTERCEPT_CR8_WRITE)) {
4359                 int cr8 = svm->vmcb->control.int_ctl & V_TPR_MASK;
4360                 kvm_set_cr8(vcpu, cr8);
4361         }
4362 }
4363
4364 static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu)
4365 {
4366         struct vcpu_svm *svm = to_svm(vcpu);
4367         u64 cr8;
4368
4369         if (svm_nested_virtualize_tpr(vcpu) ||
4370             kvm_vcpu_apicv_active(vcpu))
4371                 return;
4372
4373         cr8 = kvm_get_cr8(vcpu);
4374         svm->vmcb->control.int_ctl &= ~V_TPR_MASK;
4375         svm->vmcb->control.int_ctl |= cr8 & V_TPR_MASK;
4376 }
4377
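/*
 * Re-queue whatever event was being delivered when the #VMEXIT
 * occurred (recorded by hardware in exit_int_info) so that it is
 * injected again on the next VMRUN, and drop the NMI window once an
 * IRET has actually made progress.
 */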
4378 static void svm_complete_interrupts(struct vcpu_svm *svm)
4379 {
4380         u8 vector;
4381         int type;
4382         u32 exitintinfo = svm->vmcb->control.exit_int_info;
4383         unsigned int3_injected = svm->int3_injected;
4384
4385         svm->int3_injected = 0;
4386
4387         /*
4388          * If we've made progress since setting HF_IRET_MASK, we've
4389          * executed an IRET and can allow NMI injection.
4390          */
4391         if ((svm->vcpu.arch.hflags & HF_IRET_MASK)
4392             && kvm_rip_read(&svm->vcpu) != svm->nmi_iret_rip) {
4393                 svm->vcpu.arch.hflags &= ~(HF_NMI_MASK | HF_IRET_MASK);
4394                 kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
4395         }
4396
4397         svm->vcpu.arch.nmi_injected = false;
4398         kvm_clear_exception_queue(&svm->vcpu);
4399         kvm_clear_interrupt_queue(&svm->vcpu);
4400
4401         if (!(exitintinfo & SVM_EXITINTINFO_VALID))
4402                 return;
4403
4404         kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
4405
4406         vector = exitintinfo & SVM_EXITINTINFO_VEC_MASK;
4407         type = exitintinfo & SVM_EXITINTINFO_TYPE_MASK;
4408
4409         switch (type) {
4410         case SVM_EXITINTINFO_TYPE_NMI:
4411                 svm->vcpu.arch.nmi_injected = true;
4412                 break;
4413         case SVM_EXITINTINFO_TYPE_EXEPT:
4414                 /*
4415                  * In case of software exceptions, do not reinject the vector,
4416                  * but re-execute the instruction instead. Rewind RIP first
4417                  * if we emulated INT3 before.
4418                  */
4419                 if (kvm_exception_is_soft(vector)) {
4420                         if (vector == BP_VECTOR && int3_injected &&
4421                             kvm_is_linear_rip(&svm->vcpu, svm->int3_rip))
4422                                 kvm_rip_write(&svm->vcpu,
4423                                               kvm_rip_read(&svm->vcpu) -
4424                                               int3_injected);
4425                         break;
4426                 }
4427                 if (exitintinfo & SVM_EXITINTINFO_VALID_ERR) {
4428                         u32 err = svm->vmcb->control.exit_int_info_err;
4429                         kvm_requeue_exception_e(&svm->vcpu, vector, err);
4430
4431                 } else
4432                         kvm_requeue_exception(&svm->vcpu, vector);
4433                 break;
4434         case SVM_EXITINTINFO_TYPE_INTR:
4435                 kvm_queue_interrupt(&svm->vcpu, vector, false);
4436                 break;
4437         default:
4438                 break;
4439         }
4440 }
4441
4442 static void svm_cancel_injection(struct kvm_vcpu *vcpu)
4443 {
4444         struct vcpu_svm *svm = to_svm(vcpu);
4445         struct vmcb_control_area *control = &svm->vmcb->control;
4446
4447         control->exit_int_info = control->event_inj;
4448         control->exit_int_info_err = control->event_inj_err;
4449         control->event_inj = 0;
4450         svm_complete_interrupts(svm);
4451 }
4452
4453 static void svm_vcpu_run(struct kvm_vcpu *vcpu)
4454 {
4455         struct vcpu_svm *svm = to_svm(vcpu);
4456
4457         svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
4458         svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
4459         svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP];
4460
4461         /*
4462          * A vmexit emulation is required before the vcpu can be executed
4463          * again.
4464          */
4465         if (unlikely(svm->nested.exit_required))
4466                 return;
4467
4468         pre_svm_run(svm);
4469
4470         sync_lapic_to_cr8(vcpu);
4471
4472         svm->vmcb->save.cr2 = vcpu->arch.cr2;
4473
4474         clgi();
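        /*
         * MSR_TSC_AUX is not saved/restored by VMLOAD/VMSAVE, so load
         * the guest's value by hand just before entering the guest and
         * restore the host's value right after the #VMEXIT (see the
         * matching wrmsrl further down).
         */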
4475         if (static_cpu_has(X86_FEATURE_RDTSCP))
4476                 wrmsrl(MSR_TSC_AUX, svm->tsc_aux);
4477
4478         local_irq_enable();
4479
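        /*
         * Hardware only switches RAX, RSP and RIP through the VMCB, so
         * the remaining guest GPRs are loaded and saved by hand around
         * VMRUN.  VMLOAD/VMSAVE take the VMCB physical address in RAX
         * and load/store the additional guest segment and MSR state
         * that VMRUN itself does not handle.
         */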
4480         asm volatile (
4481                 "push %%" _ASM_BP "; \n\t"
4482                 "mov %c[rbx](%[svm]), %%" _ASM_BX " \n\t"
4483                 "mov %c[rcx](%[svm]), %%" _ASM_CX " \n\t"
4484                 "mov %c[rdx](%[svm]), %%" _ASM_DX " \n\t"
4485                 "mov %c[rsi](%[svm]), %%" _ASM_SI " \n\t"
4486                 "mov %c[rdi](%[svm]), %%" _ASM_DI " \n\t"
4487                 "mov %c[rbp](%[svm]), %%" _ASM_BP " \n\t"
4488 #ifdef CONFIG_X86_64
4489                 "mov %c[r8](%[svm]),  %%r8  \n\t"
4490                 "mov %c[r9](%[svm]),  %%r9  \n\t"
4491                 "mov %c[r10](%[svm]), %%r10 \n\t"
4492                 "mov %c[r11](%[svm]), %%r11 \n\t"
4493                 "mov %c[r12](%[svm]), %%r12 \n\t"
4494                 "mov %c[r13](%[svm]), %%r13 \n\t"
4495                 "mov %c[r14](%[svm]), %%r14 \n\t"
4496                 "mov %c[r15](%[svm]), %%r15 \n\t"
4497 #endif
4498
4499                 /* Enter guest mode */
4500                 "push %%" _ASM_AX " \n\t"
4501                 "mov %c[vmcb](%[svm]), %%" _ASM_AX " \n\t"
4502                 __ex(SVM_VMLOAD) "\n\t"
4503                 __ex(SVM_VMRUN) "\n\t"
4504                 __ex(SVM_VMSAVE) "\n\t"
4505                 "pop %%" _ASM_AX " \n\t"
4506
4507                 /* Save guest registers, load host registers */
4508                 "mov %%" _ASM_BX ", %c[rbx](%[svm]) \n\t"
4509                 "mov %%" _ASM_CX ", %c[rcx](%[svm]) \n\t"
4510                 "mov %%" _ASM_DX ", %c[rdx](%[svm]) \n\t"
4511                 "mov %%" _ASM_SI ", %c[rsi](%[svm]) \n\t"
4512                 "mov %%" _ASM_DI ", %c[rdi](%[svm]) \n\t"
4513                 "mov %%" _ASM_BP ", %c[rbp](%[svm]) \n\t"
4514 #ifdef CONFIG_X86_64
4515                 "mov %%r8,  %c[r8](%[svm]) \n\t"
4516                 "mov %%r9,  %c[r9](%[svm]) \n\t"
4517                 "mov %%r10, %c[r10](%[svm]) \n\t"
4518                 "mov %%r11, %c[r11](%[svm]) \n\t"
4519                 "mov %%r12, %c[r12](%[svm]) \n\t"
4520                 "mov %%r13, %c[r13](%[svm]) \n\t"
4521                 "mov %%r14, %c[r14](%[svm]) \n\t"
4522                 "mov %%r15, %c[r15](%[svm]) \n\t"
4523 #endif
4524                 "pop %%" _ASM_BP
4525                 :
4526                 : [svm]"a"(svm),
4527                   [vmcb]"i"(offsetof(struct vcpu_svm, vmcb_pa)),
4528                   [rbx]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RBX])),
4529                   [rcx]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RCX])),
4530                   [rdx]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RDX])),
4531                   [rsi]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RSI])),
4532                   [rdi]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RDI])),
4533                   [rbp]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RBP]))
4534 #ifdef CONFIG_X86_64
4535                   , [r8]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R8])),
4536                   [r9]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R9])),
4537                   [r10]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R10])),
4538                   [r11]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R11])),
4539                   [r12]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R12])),
4540                   [r13]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R13])),
4541                   [r14]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R14])),
4542                   [r15]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R15]))
4543 #endif
4544                 : "cc", "memory"
4545 #ifdef CONFIG_X86_64
4546                 , "rbx", "rcx", "rdx", "rsi", "rdi"
4547                 , "r8", "r9", "r10", "r11" , "r12", "r13", "r14", "r15"
4548 #else
4549                 , "ebx", "ecx", "edx", "esi", "edi"
4550 #endif
4551                 );
4552
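        /*
         * Restore the host's TSC_AUX; __getcpu() recomputes the
         * cpu/node encoding that the host (e.g. the vDSO getcpu())
         * expects to find in this MSR.
         */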
4553         if (static_cpu_has(X86_FEATURE_RDTSCP))
4554                 wrmsrl(MSR_TSC_AUX, __getcpu());
4555 #ifdef CONFIG_X86_64
4556         wrmsrl(MSR_GS_BASE, svm->host.gs_base);
4557 #else
4558         loadsegment(fs, svm->host.fs);
4559 #ifndef CONFIG_X86_32_LAZY_GS
4560         loadsegment(gs, svm->host.gs);
4561 #endif
4562 #endif
4563
4564         reload_tss(vcpu);
4565
4566         local_irq_disable();
4567
4568         vcpu->arch.cr2 = svm->vmcb->save.cr2;
4569         vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax;
4570         vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
4571         vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip;
4572
4573         if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
4574                 kvm_before_handle_nmi(&svm->vcpu);
4575
4576         stgi();
4577
4578         /* Any pending NMI will happen here */
4579
4580         if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
4581                 kvm_after_handle_nmi(&svm->vcpu);
4582
4583         sync_cr8_to_lapic(vcpu);
4584
4585         svm->next_rip = 0;
4586
4587         svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING;
4588
4589         /* If the exit was due to a #PF, check for an async page fault. */
4590         if (svm->vmcb->control.exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR)
4591                 svm->apf_reason = kvm_read_and_reset_pf_reason();
4592
4593         if (npt_enabled) {
4594                 vcpu->arch.regs_avail &= ~(1 << VCPU_EXREG_PDPTR);
4595                 vcpu->arch.regs_dirty &= ~(1 << VCPU_EXREG_PDPTR);
4596         }
4597
4598         /*
4599          * We need to handle MC intercepts here before the vcpu has a chance to
4600          * change the physical cpu
4601          */
4602         if (unlikely(svm->vmcb->control.exit_code ==
4603                      SVM_EXIT_EXCP_BASE + MC_VECTOR))
4604                 svm_handle_mce(svm);
4605
4606         mark_all_clean(svm->vmcb);
4607 }
4608
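     /*
      * Shadow-paging path (NPT disabled): load the new root into the VMCB
      * and flush the guest ASID, since translations for the old root may
      * still be cached.
      */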
4609 static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root)
4610 {
4611         struct vcpu_svm *svm = to_svm(vcpu);
4612
4613         svm->vmcb->save.cr3 = root;
4614         mark_dirty(svm->vmcb, VMCB_CR);
4615         svm_flush_tlb(vcpu);
4616 }
4617
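     /*
      * Nested-paging path: nested_cr3 holds the NPT root, while save.cr3
      * continues to track the guest's own CR3.
      */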
4618 static void set_tdp_cr3(struct kvm_vcpu *vcpu, unsigned long root)
4619 {
4620         struct vcpu_svm *svm = to_svm(vcpu);
4621
4622         svm->vmcb->control.nested_cr3 = root;
4623         mark_dirty(svm->vmcb, VMCB_NPT);
4624
4625         /* Also sync guest cr3 here in case we live migrate */
4626         svm->vmcb->save.cr3 = kvm_read_cr3(vcpu);
4627         mark_dirty(svm->vmcb, VMCB_CR);
4628
4629         svm_flush_tlb(vcpu);
4630 }
4631
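     /*
      * Report whether SVM has been disabled, typically by the BIOS, via
      * the SVMDIS bit in the VM_CR MSR.
      */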
4632 static int is_disabled(void)
4633 {
4634         u64 vm_cr;
4635
4636         rdmsrl(MSR_VM_CR, vm_cr);
4637         if (vm_cr & (1 << SVM_VM_CR_SVM_DISABLE))
4638                 return 1;
4639
4640         return 0;
4641 }
4642
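     /*
      * 0f 01 d9 is the VMMCALL opcode, the native hypercall instruction
      * on AMD (the Intel equivalent is VMCALL).
      */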
4643 static void
4644 svm_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
4645 {
4646         /*
4647          * Patch in the VMMCALL instruction:
4648          */
4649         hypercall[0] = 0x0f;
4650         hypercall[1] = 0x01;
4651         hypercall[2] = 0xd9;
4652 }
4653
4654 static void svm_check_processor_compat(void *rtn)
4655 {
4656         *(int *)rtn = 0;
4657 }
4658
4659 static bool svm_cpu_has_accelerated_tpr(void)
4660 {
4661         return false;
4662 }
4663
4664 static bool svm_has_high_real_mode_segbase(void)
4665 {
4666         return true;
4667 }
4668
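     /*
      * NPT has no analogue of EPT's memory-type field in the page table
      * entries (the effective type comes from the PAT machinery), so no
      * extra SPTE bits are needed for either MMIO or regular memory.
      */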
4669 static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
4670 {
4671         return 0;
4672 }
4673
4674 static void svm_cpuid_update(struct kvm_vcpu *vcpu)
4675 {
4676         struct vcpu_svm *svm = to_svm(vcpu);
4677         struct kvm_cpuid_entry2 *entry;
4678
4679         /* Update nrips enabled cache */
4680         svm->nrips_enabled = !!guest_cpuid_has_nrips(&svm->vcpu);
4681
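             /* AVIC does not support x2APIC mode, so hide x2APIC from the guest. */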
4682         if (!kvm_vcpu_apicv_active(vcpu))
4683                 return;
4684
4685         entry = kvm_find_cpuid_entry(vcpu, 1, 0);
4686         if (entry)
4687                 entry->ecx &= ~bit(X86_FEATURE_X2APIC);
4688 }
4689
4690 static void svm_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
4691 {
4692         switch (func) {
4693         case 0x1:
4694                 if (avic)
4695                         entry->ecx &= ~bit(X86_FEATURE_X2APIC);
4696                 break;
4697         case 0x80000001:
4698                 if (nested)
4699                         entry->ecx |= (1 << 2); /* Set SVM bit */
4700                 break;
4701         case 0x8000000A:
4702                 entry->eax = 1; /* SVM revision 1 */
4703                 entry->ebx = 8; /* Let's support 8 ASIDs in case we add proper
4704                                    ASID emulation to nested SVM */
4705                 entry->ecx = 0; /* Reserved */
4706                 entry->edx = 0; /* By default, do not advertise any
4707                                    additional features */
4708
4709                 /* Support next_rip if host supports it */
4710                 if (boot_cpu_has(X86_FEATURE_NRIPS))
4711                         entry->edx |= SVM_FEATURE_NRIP;
4712
4713                 /* Support NPT for the guest if enabled */
4714                 if (npt_enabled)
4715                         entry->edx |= SVM_FEATURE_NPT;
4716
4717                 break;
4718         }
4719 }
4720
4721 static int svm_get_lpage_level(void)
4722 {
4723         return PT_PDPE_LEVEL;
4724 }
4725
4726 static bool svm_rdtscp_supported(void)
4727 {
4728         return boot_cpu_has(X86_FEATURE_RDTSCP);
4729 }
4730
4731 static bool svm_invpcid_supported(void)
4732 {
4733         return false;
4734 }
4735
4736 static bool svm_mpx_supported(void)
4737 {
4738         return false;
4739 }
4740
4741 static bool svm_xsaves_supported(void)
4742 {
4743         return false;
4744 }
4745
4746 static bool svm_has_wbinvd_exit(void)
4747 {
4748         return true;
4749 }
4750
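     /*
      * Lazy FPU handling: intercept #NM so the guest's FPU state can be
      * reactivated on first use, and update the CR0 intercepts to match.
      */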
4751 static void svm_fpu_deactivate(struct kvm_vcpu *vcpu)
4752 {
4753         struct vcpu_svm *svm = to_svm(vcpu);
4754
4755         set_exception_intercept(svm, NM_VECTOR);
4756         update_cr0_intercept(svm);
4757 }
4758
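     /*
      * Pair each emulator intercept with the SVM exit code it corresponds
      * to and the emulation stage at which L1's intercept bitmap must be
      * checked.
      */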
4759 #define PRE_EX(exit)  { .exit_code = (exit), \
4760                         .stage = X86_ICPT_PRE_EXCEPT, }
4761 #define POST_EX(exit) { .exit_code = (exit), \
4762                         .stage = X86_ICPT_POST_EXCEPT, }
4763 #define POST_MEM(exit) { .exit_code = (exit), \
4764                         .stage = X86_ICPT_POST_MEMACCESS, }
4765
4766 static const struct __x86_intercept {
4767         u32 exit_code;
4768         enum x86_intercept_stage stage;
4769 } x86_intercept_map[] = {
4770         [x86_intercept_cr_read]         = POST_EX(SVM_EXIT_READ_CR0),
4771         [x86_intercept_cr_write]        = POST_EX(SVM_EXIT_WRITE_CR0),
4772         [x86_intercept_clts]            = POST_EX(SVM_EXIT_WRITE_CR0),
4773         [x86_intercept_lmsw]            = POST_EX(SVM_EXIT_WRITE_CR0),
4774         [x86_intercept_smsw]            = POST_EX(SVM_EXIT_READ_CR0),
4775         [x86_intercept_dr_read]         = POST_EX(SVM_EXIT_READ_DR0),
4776         [x86_intercept_dr_write]        = POST_EX(SVM_EXIT_WRITE_DR0),
4777         [x86_intercept_sldt]            = POST_EX(SVM_EXIT_LDTR_READ),
4778         [x86_intercept_str]             = POST_EX(SVM_EXIT_TR_READ),
4779         [x86_intercept_lldt]            = POST_EX(SVM_EXIT_LDTR_WRITE),
4780         [x86_intercept_ltr]             = POST_EX(SVM_EXIT_TR_WRITE),
4781         [x86_intercept_sgdt]            = POST_EX(SVM_EXIT_GDTR_READ),
4782         [x86_intercept_sidt]            = POST_EX(SVM_EXIT_IDTR_READ),
4783         [x86_intercept_lgdt]            = POST_EX(SVM_EXIT_GDTR_WRITE),
4784         [x86_intercept_lidt]            = POST_EX(SVM_EXIT_IDTR_WRITE),
4785         [x86_intercept_vmrun]           = POST_EX(SVM_EXIT_VMRUN),
4786         [x86_intercept_vmmcall]         = POST_EX(SVM_EXIT_VMMCALL),
4787         [x86_intercept_vmload]          = POST_EX(SVM_EXIT_VMLOAD),
4788         [x86_intercept_vmsave]          = POST_EX(SVM_EXIT_VMSAVE),
4789         [x86_intercept_stgi]            = POST_EX(SVM_EXIT_STGI),
4790         [x86_intercept_clgi]            = POST_EX(SVM_EXIT_CLGI),
4791         [x86_intercept_skinit]          = POST_EX(SVM_EXIT_SKINIT),
4792         [x86_intercept_invlpga]         = POST_EX(SVM_EXIT_INVLPGA),
4793         [x86_intercept_rdtscp]          = POST_EX(SVM_EXIT_RDTSCP),
4794         [x86_intercept_monitor]         = POST_MEM(SVM_EXIT_MONITOR),
4795         [x86_intercept_mwait]           = POST_EX(SVM_EXIT_MWAIT),
4796         [x86_intercept_invlpg]          = POST_EX(SVM_EXIT_INVLPG),
4797         [x86_intercept_invd]            = POST_EX(SVM_EXIT_INVD),
4798         [x86_intercept_wbinvd]          = POST_EX(SVM_EXIT_WBINVD),
4799         [x86_intercept_wrmsr]           = POST_EX(SVM_EXIT_MSR),
4800         [x86_intercept_rdtsc]           = POST_EX(SVM_EXIT_RDTSC),
4801         [x86_intercept_rdmsr]           = POST_EX(SVM_EXIT_MSR),
4802         [x86_intercept_rdpmc]           = POST_EX(SVM_EXIT_RDPMC),
4803         [x86_intercept_cpuid]           = PRE_EX(SVM_EXIT_CPUID),
4804         [x86_intercept_rsm]             = PRE_EX(SVM_EXIT_RSM),
4805         [x86_intercept_pause]           = PRE_EX(SVM_EXIT_PAUSE),
4806         [x86_intercept_pushf]           = PRE_EX(SVM_EXIT_PUSHF),
4807         [x86_intercept_popf]            = PRE_EX(SVM_EXIT_POPF),
4808         [x86_intercept_intn]            = PRE_EX(SVM_EXIT_SWINT),
4809         [x86_intercept_iret]            = PRE_EX(SVM_EXIT_IRET),
4810         [x86_intercept_icebp]           = PRE_EX(SVM_EXIT_ICEBP),
4811         [x86_intercept_hlt]             = POST_EX(SVM_EXIT_HLT),
4812         [x86_intercept_in]              = POST_EX(SVM_EXIT_IOIO),
4813         [x86_intercept_ins]             = POST_EX(SVM_EXIT_IOIO),
4814         [x86_intercept_out]             = POST_EX(SVM_EXIT_IOIO),
4815         [x86_intercept_outs]            = POST_EX(SVM_EXIT_IOIO),
4816 };
4817
4818 #undef PRE_EX
4819 #undef POST_EX
4820 #undef POST_MEM
4821
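     /*
      * Called by the instruction emulator when emulating on behalf of L2:
      * translate the emulator's intercept into an SVM exit code and ask
      * the nested exit handler whether L1 wants a #VMEXIT instead of
      * continued emulation.
      */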
4822 static int svm_check_intercept(struct kvm_vcpu *vcpu,
4823                                struct x86_instruction_info *info,
4824                                enum x86_intercept_stage stage)
4825 {
4826         struct vcpu_svm *svm = to_svm(vcpu);
4827         int vmexit, ret = X86EMUL_CONTINUE;
4828         struct __x86_intercept icpt_info;
4829         struct vmcb *vmcb = svm->vmcb;
4830
4831         if (info->intercept >= ARRAY_SIZE(x86_intercept_map))
4832                 goto out;
4833
4834         icpt_info = x86_intercept_map[info->intercept];
4835
4836         if (stage != icpt_info.stage)
4837                 goto out;
4838
4839         switch (icpt_info.exit_code) {
4840         case SVM_EXIT_READ_CR0:
4841                 if (info->intercept == x86_intercept_cr_read)
4842                         icpt_info.exit_code += info->modrm_reg;
4843                 break;
4844         case SVM_EXIT_WRITE_CR0: {
4845                 unsigned long cr0, val;
4846                 u64 intercept;
4847
4848                 if (info->intercept == x86_intercept_cr_write)
4849                         icpt_info.exit_code += info->modrm_reg;
4850
4851                 if (icpt_info.exit_code != SVM_EXIT_WRITE_CR0 ||
4852                     info->intercept == x86_intercept_clts)
4853                         break;
4854
4855                 intercept = svm->nested.intercept;
4856
4857                 if (!(intercept & (1ULL << INTERCEPT_SELECTIVE_CR0)))
4858                         break;
4859
4860                 cr0 = vcpu->arch.cr0 & ~SVM_CR0_SELECTIVE_MASK;
4861                 val = info->src_val  & ~SVM_CR0_SELECTIVE_MASK;
4862
4863                 if (info->intercept == x86_intercept_lmsw) {
4864                         cr0 &= 0xfUL;
4865                         val &= 0xfUL;
4866                         /* lmsw can't clear PE - catch this here */
4867                         if (cr0 & X86_CR0_PE)
4868                                 val |= X86_CR0_PE;
4869                 }
4870
4871                 if (cr0 ^ val)
4872                         icpt_info.exit_code = SVM_EXIT_CR0_SEL_WRITE;
4873
4874                 break;
4875         }
4876         case SVM_EXIT_READ_DR0:
4877         case SVM_EXIT_WRITE_DR0:
4878                 icpt_info.exit_code += info->modrm_reg;
4879                 break;
4880         case SVM_EXIT_MSR:
4881                 if (info->intercept == x86_intercept_wrmsr)
4882                         vmcb->control.exit_info_1 = 1;
4883                 else
4884                         vmcb->control.exit_info_1 = 0;
4885                 break;
4886         case SVM_EXIT_PAUSE:
4887                 /*
4888                  * The emulator only reports this for the NOP opcode; PAUSE
4889                  * is encoded as REP NOP, so require the REP prefix here.
4890                  */
4891                 if (info->rep_prefix != REPE_PREFIX)
4892                         goto out;
                     break;
4893         case SVM_EXIT_IOIO: {
4894                 u64 exit_info;
4895                 u32 bytes;
4896
4897                 if (info->intercept == x86_intercept_in ||
4898                     info->intercept == x86_intercept_ins) {
4899                         exit_info = ((info->src_val & 0xffff) << 16) |
4900                                 SVM_IOIO_TYPE_MASK;
4901                         bytes = info->dst_bytes;
4902                 } else {
4903                         exit_info = (info->dst_val & 0xffff) << 16;
4904                         bytes = info->src_bytes;
4905                 }
4906
4907                 if (info->intercept == x86_intercept_outs ||
4908                     info->intercept == x86_intercept_ins)
4909                         exit_info |= SVM_IOIO_STR_MASK;
4910
4911                 if (info->rep_prefix)
4912                         exit_info |= SVM_IOIO_REP_MASK;
4913
4914                 bytes = min(bytes, 4u);
4915
4916                 exit_info |= bytes << SVM_IOIO_SIZE_SHIFT;
4917
4918                 exit_info |= (u32)info->ad_bytes << (SVM_IOIO_ASIZE_SHIFT - 1);
4919
4920                 vmcb->control.exit_info_1 = exit_info;
4921                 vmcb->control.exit_info_2 = info->next_rip;
4922
4923                 break;
4924         }
4925         default:
4926                 break;
4927         }
4928
4929         /* TODO: Advertise NRIPS to guest hypervisor unconditionally */
4930         if (static_cpu_has(X86_FEATURE_NRIPS))
4931                 vmcb->control.next_rip  = info->next_rip;
4932         vmcb->control.exit_code = icpt_info.exit_code;
4933         vmexit = nested_svm_exit_handled(svm);
4934
4935         ret = (vmexit == NESTED_EXIT_DONE) ? X86EMUL_INTERCEPTED
4936                                            : X86EMUL_CONTINUE;
4937
4938 out:
4939         return ret;
4940 }
4941
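     /*
      * A physical interrupt that caused a #VMEXIT is not acknowledged by
      * the CPU; it is still pending and is delivered through the host IDT
      * as soon as interrupts are enabled here (GIF was already set by
      * stgi() in svm_vcpu_run()).
      */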
4942 static void svm_handle_external_intr(struct kvm_vcpu *vcpu)
4943 {
4944         local_irq_enable();
4945 }
4946
4947 static void svm_sched_in(struct kvm_vcpu *vcpu, int cpu)
4948 {
4949 }
4950
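     /*
      * After APIC register state has been restored (e.g. by userspace),
      * bring the AVIC physical and logical APIC ID tables back in sync
      * with the new APIC ID, DFR and LDR values.
      */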
4951 static inline void avic_post_state_restore(struct kvm_vcpu *vcpu)
4952 {
4953         if (avic_handle_apic_id_update(vcpu) != 0)
4954                 return;
4955         if (avic_handle_dfr_update(vcpu) != 0)
4956                 return;
4957         avic_handle_ldr_update(vcpu);
4958 }
4959
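     /*
      * SVM implementations of the vendor-specific callbacks, registered
      * with the generic x86 KVM code via kvm_init() below.
      */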
4960 static struct kvm_x86_ops svm_x86_ops = {
4961         .cpu_has_kvm_support = has_svm,
4962         .disabled_by_bios = is_disabled,
4963         .hardware_setup = svm_hardware_setup,
4964         .hardware_unsetup = svm_hardware_unsetup,
4965         .check_processor_compatibility = svm_check_processor_compat,
4966         .hardware_enable = svm_hardware_enable,
4967         .hardware_disable = svm_hardware_disable,
4968         .cpu_has_accelerated_tpr = svm_cpu_has_accelerated_tpr,
4969         .cpu_has_high_real_mode_segbase = svm_has_high_real_mode_segbase,
4970
4971         .vcpu_create = svm_create_vcpu,
4972         .vcpu_free = svm_free_vcpu,
4973         .vcpu_reset = svm_vcpu_reset,
4974
4975         .vm_init = avic_vm_init,
4976         .vm_destroy = avic_vm_destroy,
4977
4978         .prepare_guest_switch = svm_prepare_guest_switch,
4979         .vcpu_load = svm_vcpu_load,
4980         .vcpu_put = svm_vcpu_put,
4981         .vcpu_blocking = svm_vcpu_blocking,
4982         .vcpu_unblocking = svm_vcpu_unblocking,
4983
4984         .update_bp_intercept = update_bp_intercept,
4985         .get_msr = svm_get_msr,
4986         .set_msr = svm_set_msr,
4987         .get_segment_base = svm_get_segment_base,
4988         .get_segment = svm_get_segment,
4989         .set_segment = svm_set_segment,
4990         .get_cpl = svm_get_cpl,
4991         .get_cs_db_l_bits = kvm_get_cs_db_l_bits,
4992         .decache_cr0_guest_bits = svm_decache_cr0_guest_bits,
4993         .decache_cr3 = svm_decache_cr3,
4994         .decache_cr4_guest_bits = svm_decache_cr4_guest_bits,
4995         .set_cr0 = svm_set_cr0,
4996         .set_cr3 = svm_set_cr3,
4997         .set_cr4 = svm_set_cr4,
4998         .set_efer = svm_set_efer,
4999         .get_idt = svm_get_idt,
5000         .set_idt = svm_set_idt,
5001         .get_gdt = svm_get_gdt,
5002         .set_gdt = svm_set_gdt,
5003         .get_dr6 = svm_get_dr6,
5004         .set_dr6 = svm_set_dr6,
5005         .set_dr7 = svm_set_dr7,
5006         .sync_dirty_debug_regs = svm_sync_dirty_debug_regs,
5007         .cache_reg = svm_cache_reg,
5008         .get_rflags = svm_get_rflags,
5009         .set_rflags = svm_set_rflags,
5010
5011         .get_pkru = svm_get_pkru,
5012
5013         .fpu_activate = svm_fpu_activate,
5014         .fpu_deactivate = svm_fpu_deactivate,
5015
5016         .tlb_flush = svm_flush_tlb,
5017
5018         .run = svm_vcpu_run,
5019         .handle_exit = handle_exit,
5020         .skip_emulated_instruction = skip_emulated_instruction,
5021         .set_interrupt_shadow = svm_set_interrupt_shadow,
5022         .get_interrupt_shadow = svm_get_interrupt_shadow,
5023         .patch_hypercall = svm_patch_hypercall,
5024         .set_irq = svm_set_irq,
5025         .set_nmi = svm_inject_nmi,
5026         .queue_exception = svm_queue_exception,
5027         .cancel_injection = svm_cancel_injection,
5028         .interrupt_allowed = svm_interrupt_allowed,
5029         .nmi_allowed = svm_nmi_allowed,
5030         .get_nmi_mask = svm_get_nmi_mask,
5031         .set_nmi_mask = svm_set_nmi_mask,
5032         .enable_nmi_window = enable_nmi_window,
5033         .enable_irq_window = enable_irq_window,
5034         .update_cr8_intercept = update_cr8_intercept,
5035         .set_virtual_x2apic_mode = svm_set_virtual_x2apic_mode,
5036         .get_enable_apicv = svm_get_enable_apicv,
5037         .refresh_apicv_exec_ctrl = svm_refresh_apicv_exec_ctrl,
5038         .load_eoi_exitmap = svm_load_eoi_exitmap,
5039         .sync_pir_to_irr = svm_sync_pir_to_irr,
5040         .hwapic_irr_update = svm_hwapic_irr_update,
5041         .hwapic_isr_update = svm_hwapic_isr_update,
5042         .apicv_post_state_restore = avic_post_state_restore,
5043
5044         .set_tss_addr = svm_set_tss_addr,
5045         .get_tdp_level = get_npt_level,
5046         .get_mt_mask = svm_get_mt_mask,
5047
5048         .get_exit_info = svm_get_exit_info,
5049
5050         .get_lpage_level = svm_get_lpage_level,
5051
5052         .cpuid_update = svm_cpuid_update,
5053
5054         .rdtscp_supported = svm_rdtscp_supported,
5055         .invpcid_supported = svm_invpcid_supported,
5056         .mpx_supported = svm_mpx_supported,
5057         .xsaves_supported = svm_xsaves_supported,
5058
5059         .set_supported_cpuid = svm_set_supported_cpuid,
5060
5061         .has_wbinvd_exit = svm_has_wbinvd_exit,
5062
5063         .read_tsc_offset = svm_read_tsc_offset,
5064         .write_tsc_offset = svm_write_tsc_offset,
5065         .adjust_tsc_offset_guest = svm_adjust_tsc_offset_guest,
5066         .read_l1_tsc = svm_read_l1_tsc,
5067
5068         .set_tdp_cr3 = set_tdp_cr3,
5069
5070         .check_intercept = svm_check_intercept,
5071         .handle_external_intr = svm_handle_external_intr,
5072
5073         .sched_in = svm_sched_in,
5074
5075         .pmu_ops = &amd_pmu_ops,
5076         .deliver_posted_interrupt = svm_deliver_avic_intr,
5077 };
5078
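     /* Module entry point: hand the callback table and vcpu size/alignment to the KVM core. */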
5079 static int __init svm_init(void)
5080 {
5081         return kvm_init(&svm_x86_ops, sizeof(struct vcpu_svm),
5082                         __alignof__(struct vcpu_svm), THIS_MODULE);
5083 }
5084
5085 static void __exit svm_exit(void)
5086 {
5087         kvm_exit();
5088 }
5089
5090 module_init(svm_init)
5091 module_exit(svm_exit)