Merge tag 'kvm-s390-next-4.8-2' of git://git.kernel.org/pub/scm/linux/kernel/git...

author Paolo Bonzini <pbonzini@redhat.com>

Tue, 21 Jun 2016 13:21:51 +0000 (15:21 +0200)

committer Paolo Bonzini <pbonzini@redhat.com>

Tue, 21 Jun 2016 13:21:51 +0000 (15:21 +0200)
author Paolo Bonzini <pbonzini@redhat.com>
Tue, 21 Jun 2016 13:21:51 +0000 (15:21 +0200)
committer Paolo Bonzini <pbonzini@redhat.com>
Tue, 21 Jun 2016 13:21:51 +0000 (15:21 +0200)
diff --git a/arch/s390/include/asm/gmap.h b/arch/s390/include/asm/gmap.h

index d054c1b..741ddba 100644 (file)
--- a/arch/s390/include/asm/gmap.h
+++ b/arch/s390/include/asm/gmap.h
@@ -10,14 +10,25 @@
  
  /**
   * struct gmap_struct - guest address space
+ * @list: list head for the mm->context gmap list
   * @crst_list: list of all crst tables used in the guest address space
   * @mm: pointer to the parent mm_struct
   * @guest_to_host: radix tree with guest to host address translation
   * @host_to_guest: radix tree with pointer to segment table entries
   * @guest_table_lock: spinlock to protect all entries in the guest page table
+ * @ref_count: reference counter for the gmap structure
   * @table: pointer to the page directory
   * @asce: address space control element for gmap page table
   * @pfault_enabled: defines if pfaults are applicable for the guest
+ * @host_to_rmap: radix tree with gmap_rmap lists
+ * @children: list of shadow gmap structures
+ * @pt_list: list of all page tables used in the shadow guest address space
+ * @shadow_lock: spinlock to protect the shadow gmap list
+ * @parent: pointer to the parent gmap for shadow guest address spaces
+ * @orig_asce: ASCE for which the shadow page table has been created
+ * @edat_level: edat level to be used for the shadow translation
+ * @removed: flag to indicate if a shadow guest address space has been removed
+ * @initialized: flag to indicate if a shadow guest address space can be used
   */
  struct gmap {
         struct list_head list;
@@ -26,26 +37,64 @@ struct gmap {
         struct radix_tree_root guest_to_host;
         struct radix_tree_root host_to_guest;
         spinlock_t guest_table_lock;
+       atomic_t ref_count;
         unsigned long *table;
         unsigned long asce;
         unsigned long asce_end;
         void *private;
         bool pfault_enabled;
+       /* Additional data for shadow guest address spaces */
+       struct radix_tree_root host_to_rmap;
+       struct list_head children;
+       struct list_head pt_list;
+       spinlock_t shadow_lock;
+       struct gmap *parent;
+       unsigned long orig_asce;
+       int edat_level;
+       bool removed;
+       bool initialized;
  };
  
+/**
+ * struct gmap_rmap - reverse mapping for shadow page table entries
+ * @next: pointer to next rmap in the list
+ * @raddr: virtual rmap address in the shadow guest address space
+ */
+struct gmap_rmap {
+       struct gmap_rmap *next;
+       unsigned long raddr;
+};
+
+#define gmap_for_each_rmap(pos, head) \
+       for (pos = (head); pos; pos = pos->next)
+
+#define gmap_for_each_rmap_safe(pos, n, head) \
+       for (pos = (head); n = pos ? pos->next : NULL, pos; pos = n)
+
  /**
   * struct gmap_notifier - notify function block for page invalidation
   * @notifier_call: address of callback function
   */
  struct gmap_notifier {
         struct list_head list;
-       void (*notifier_call)(struct gmap *gmap, unsigned long gaddr);
+       struct rcu_head rcu;
+       void (*notifier_call)(struct gmap *gmap, unsigned long start,
+                             unsigned long end);
  };
  
-struct gmap *gmap_alloc(struct mm_struct *mm, unsigned long limit);
-void gmap_free(struct gmap *gmap);
+static inline int gmap_is_shadow(struct gmap *gmap)
+{
+       return !!gmap->parent;
+}
+
+struct gmap *gmap_create(struct mm_struct *mm, unsigned long limit);
+void gmap_remove(struct gmap *gmap);
+struct gmap *gmap_get(struct gmap *gmap);
+void gmap_put(struct gmap *gmap);
+
  void gmap_enable(struct gmap *gmap);
  void gmap_disable(struct gmap *gmap);
+struct gmap *gmap_get_enabled(void);
  int gmap_map_segment(struct gmap *gmap, unsigned long from,
                      unsigned long to, unsigned long len);
  int gmap_unmap_segment(struct gmap *gmap, unsigned long to, unsigned long len);
@@ -57,8 +106,29 @@ void gmap_discard(struct gmap *, unsigned long from, unsigned long to);
  void __gmap_zap(struct gmap *, unsigned long gaddr);
  void gmap_unlink(struct mm_struct *, unsigned long *table, unsigned long vmaddr);
  
-void gmap_register_ipte_notifier(struct gmap_notifier *);
-void gmap_unregister_ipte_notifier(struct gmap_notifier *);
-int gmap_ipte_notify(struct gmap *, unsigned long start, unsigned long len);
+int gmap_read_table(struct gmap *gmap, unsigned long gaddr, unsigned long *val);
+
+struct gmap *gmap_shadow(struct gmap *parent, unsigned long asce,
+                        int edat_level);
+int gmap_shadow_valid(struct gmap *sg, unsigned long asce, int edat_level);
+int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t,
+                   int fake);
+int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t,
+                   int fake);
+int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt,
+                   int fake);
+int gmap_shadow_pgt(struct gmap *sg, unsigned long saddr, unsigned long pgt,
+                   int fake);
+int gmap_shadow_pgt_lookup(struct gmap *sg, unsigned long saddr,
+                          unsigned long *pgt, int *dat_protection, int *fake);
+int gmap_shadow_page(struct gmap *sg, unsigned long saddr, pte_t pte);
+
+void gmap_register_pte_notifier(struct gmap_notifier *);
+void gmap_unregister_pte_notifier(struct gmap_notifier *);
+void gmap_pte_notify(struct mm_struct *, unsigned long addr, pte_t *,
+                    unsigned long bits);
+
+int gmap_mprotect_notify(struct gmap *, unsigned long start,
+                        unsigned long len, int prot);
  
  #endif /* _ASM_S390_GMAP_H */
diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h

index 9eed5c1..946fc86 100644 (file)
--- a/arch/s390/include/asm/kvm_host.h
+++ b/arch/s390/include/asm/kvm_host.h
@@ -145,7 +145,7 @@ struct kvm_s390_sie_block {
         __u64   cputm;                  /* 0x0028 */
         __u64   ckc;                    /* 0x0030 */
         __u64   epoch;                  /* 0x0038 */
-       __u8    reserved40[4];          /* 0x0040 */
+       __u32   svcc;                   /* 0x0040 */
  #define LCTL_CR0       0x8000
  #define LCTL_CR6       0x0200
  #define LCTL_CR9       0x0040
@@ -167,6 +167,9 @@ struct kvm_s390_sie_block {
  #define ICPT_INST      0x04
  #define ICPT_PROGI     0x08
  #define ICPT_INSTPROGI 0x0C
+#define ICPT_EXTINT    0x14
+#define ICPT_VALIDITY  0x20
+#define ICPT_STOP      0x28
  #define ICPT_OPEREXC   0x2C
  #define ICPT_PARTEXEC  0x38
  #define ICPT_IOINST    0x40
@@ -226,7 +229,7 @@ struct kvm_s390_sie_block {
         __u8    reserved1e6[2];         /* 0x01e6 */
         __u64   itdba;                  /* 0x01e8 */
         __u64   riccbd;                 /* 0x01f0 */
-       __u8    reserved1f8[8];         /* 0x01f8 */
+       __u64   gvrd;                   /* 0x01f8 */
  } __attribute__((packed));
  
  struct kvm_s390_itdb {
@@ -281,6 +284,7 @@ struct kvm_vcpu_stat {
         u32 instruction_stsi;
         u32 instruction_stfl;
         u32 instruction_tprot;
+       u32 instruction_sie;
         u32 instruction_essa;
         u32 instruction_sthyi;
         u32 instruction_sigp_sense;
@@ -545,12 +549,16 @@ struct kvm_guestdbg_info_arch {
  
  struct kvm_vcpu_arch {
         struct kvm_s390_sie_block *sie_block;
+       /* if vsie is active, currently executed shadow sie control block */
+       struct kvm_s390_sie_block *vsie_block;
         unsigned int      host_acrs[NUM_ACRS];
         struct fpu        host_fpregs;
         struct kvm_s390_local_interrupt local_int;
         struct hrtimer    ckc_timer;
         struct kvm_s390_pgm_info pgm;
         struct gmap *gmap;
+       /* backup location for the currently enabled gmap when scheduled out */
+       struct gmap *enabled_gmap;
         struct kvm_guestdbg_info_arch guestdbg;
         unsigned long pfault_token;
         unsigned long pfault_select;
@@ -635,6 +643,14 @@ struct sie_page2 {
         u8 reserved900[0x1000 - 0x900];                 /* 0x0900 */
  } __packed;
  
+struct kvm_s390_vsie {
+       struct mutex mutex;
+       struct radix_tree_root addr_to_page;
+       int page_count;
+       int next;
+       struct page *pages[KVM_MAX_VCPUS];
+};
+
  struct kvm_arch{
         void *sca;
         int use_esca;
@@ -659,6 +675,7 @@ struct kvm_arch{
         struct sie_page2 *sie_page2;
         struct kvm_s390_cpu_model model;
         struct kvm_s390_crypto crypto;
+       struct kvm_s390_vsie vsie;
         u64 epoch;
         /* subset of available cpu features enabled by user space */
         DECLARE_BITMAP(cpu_feat, KVM_S390_VM_CPU_FEAT_NR_BITS);
diff --git a/arch/s390/include/asm/mmu.h b/arch/s390/include/asm/mmu.h

index 081b2ad..b941528 100644 (file)
--- a/arch/s390/include/asm/mmu.h
+++ b/arch/s390/include/asm/mmu.h
@@ -8,8 +8,9 @@ typedef struct {
         cpumask_t cpu_attach_mask;
         atomic_t attach_count;
         unsigned int flush_mm;
-       spinlock_t list_lock;
+       spinlock_t pgtable_lock;
         struct list_head pgtable_list;
+       spinlock_t gmap_lock;
         struct list_head gmap_list;
         unsigned long asce;
         unsigned long asce_limit;
@@ -22,9 +23,11 @@ typedef struct {
         unsigned int use_skey:1;
  } mm_context_t;
  
-#define INIT_MM_CONTEXT(name)                                                \
-       .context.list_lock    = __SPIN_LOCK_UNLOCKED(name.context.list_lock), \
-       .context.pgtable_list = LIST_HEAD_INIT(name.context.pgtable_list),    \
+#define INIT_MM_CONTEXT(name)                                             \
+       .context.pgtable_lock =                                            \
+                       __SPIN_LOCK_UNLOCKED(name.context.pgtable_lock),   \
+       .context.pgtable_list = LIST_HEAD_INIT(name.context.pgtable_list), \
+       .context.gmap_lock = __SPIN_LOCK_UNLOCKED(name.context.gmap_lock), \
         .context.gmap_list = LIST_HEAD_INIT(name.context.gmap_list),
  
  static inline int tprot(unsigned long addr)
diff --git a/arch/s390/include/asm/mmu_context.h b/arch/s390/include/asm/mmu_context.h

index c837b79..3ce3854 100644 (file)
--- a/arch/s390/include/asm/mmu_context.h
+++ b/arch/s390/include/asm/mmu_context.h
@@ -15,8 +15,9 @@
  static inline int init_new_context(struct task_struct *tsk,
                                    struct mm_struct *mm)
  {
-       spin_lock_init(&mm->context.list_lock);
+       spin_lock_init(&mm->context.pgtable_lock);
         INIT_LIST_HEAD(&mm->context.pgtable_list);
+       spin_lock_init(&mm->context.gmap_lock);
         INIT_LIST_HEAD(&mm->context.gmap_list);
         cpumask_clear(&mm->context.cpu_attach_mask);
         atomic_set(&mm->context.attach_count, 0);
diff --git a/arch/s390/include/asm/page.h b/arch/s390/include/asm/page.h

index f874e7d..b5edff3 100644 (file)
--- a/arch/s390/include/asm/page.h
+++ b/arch/s390/include/asm/page.h
@@ -147,6 +147,8 @@ static inline int devmem_is_allowed(unsigned long pfn)
  #define virt_to_page(kaddr)    pfn_to_page(__pa(kaddr) >> PAGE_SHIFT)
  #define page_to_phys(page)     (page_to_pfn(page) << PAGE_SHIFT)
  #define virt_addr_valid(kaddr) pfn_valid(__pa(kaddr) >> PAGE_SHIFT)
+#define pfn_to_virt(pfn)       __va((pfn) << PAGE_SHIFT)
+#define page_to_virt(page)     pfn_to_virt(page_to_pfn(page))
  
  #define VM_DATA_DEFAULT_FLAGS  (VM_READ | VM_WRITE | \
                                  VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)
diff --git a/arch/s390/include/asm/pgalloc.h b/arch/s390/include/asm/pgalloc.h

index da34cb6..f4eb984 100644 (file)
--- a/arch/s390/include/asm/pgalloc.h
+++ b/arch/s390/include/asm/pgalloc.h
@@ -19,8 +19,10 @@ unsigned long *crst_table_alloc(struct mm_struct *);
  void crst_table_free(struct mm_struct *, unsigned long *);
  
  unsigned long *page_table_alloc(struct mm_struct *);
+struct page *page_table_alloc_pgste(struct mm_struct *mm);
  void page_table_free(struct mm_struct *, unsigned long *);
  void page_table_free_rcu(struct mmu_gather *, unsigned long *, unsigned long);
+void page_table_free_pgste(struct page *page);
  extern int page_table_allocate_pgste;
  
  static inline void clear_table(unsigned long *s, unsigned long val, size_t n)
diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h

index 9951e7e..c7ebba4 100644 (file)
--- a/arch/s390/include/asm/pgtable.h
+++ b/arch/s390/include/asm/pgtable.h
@@ -256,6 +256,7 @@ static inline int is_module_addr(void *addr)
  /* Bits in the region table entry */
  #define _REGION_ENTRY_ORIGIN   ~0xfffUL/* region/segment table origin      */
  #define _REGION_ENTRY_PROTECT  0x200   /* region protection bit            */
+#define _REGION_ENTRY_OFFSET   0xc0    /* region table offset              */
  #define _REGION_ENTRY_INVALID  0x20    /* invalid region table entry       */
  #define _REGION_ENTRY_TYPE_MASK        0x0c    /* region/segment table type mask   */
  #define _REGION_ENTRY_TYPE_R1  0x0c    /* region first table type          */
@@ -327,6 +328,7 @@ static inline int is_module_addr(void *addr)
  #define PGSTE_GC_BIT   0x0002000000000000UL
  #define PGSTE_UC_BIT   0x0000800000000000UL    /* user dirty (migration) */
  #define PGSTE_IN_BIT   0x0000400000000000UL    /* IPTE notify bit */
+#define PGSTE_VSIE_BIT 0x0000200000000000UL    /* ref'd in a shadow table */
  
  /* Guest Page State used for virtualization */
  #define _PGSTE_GPS_ZERO                0x0000000080000000UL
@@ -885,10 +887,16 @@ static inline int ptep_set_access_flags(struct vm_area_struct *vma,
  void ptep_set_pte_at(struct mm_struct *mm, unsigned long addr,
                      pte_t *ptep, pte_t entry);
  void ptep_set_notify(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
-void ptep_notify(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
+void ptep_notify(struct mm_struct *mm, unsigned long addr,
+                pte_t *ptep, unsigned long bits);
+int ptep_force_prot(struct mm_struct *mm, unsigned long gaddr,
+                   pte_t *ptep, int prot, unsigned long bit);
  void ptep_zap_unused(struct mm_struct *mm, unsigned long addr,
                      pte_t *ptep , int reset);
  void ptep_zap_key(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
+int ptep_shadow_pte(struct mm_struct *mm, unsigned long saddr,
+                   pte_t *sptep, pte_t *tptep, pte_t pte);
+void ptep_unshadow_pte(struct mm_struct *mm, unsigned long saddr, pte_t *ptep);
  
  bool test_and_clear_guest_dirty(struct mm_struct *mm, unsigned long address);
  int set_guest_storage_key(struct mm_struct *mm, unsigned long addr,
diff --git a/arch/s390/include/asm/processor.h b/arch/s390/include/asm/processor.h

index 9d4d311..8c2922f 100644 (file)
--- a/arch/s390/include/asm/processor.h
+++ b/arch/s390/include/asm/processor.h
@@ -109,6 +109,8 @@ struct thread_struct {
          unsigned long ksp;              /* kernel stack pointer             */
         mm_segment_t mm_segment;
         unsigned long gmap_addr;        /* address of last gmap fault. */
+       unsigned int gmap_write_flag;   /* gmap fault write indication */
+       unsigned int gmap_int_code;     /* int code of last gmap fault */
         unsigned int gmap_pfault;       /* signal of a pending guest pfault */
         struct per_regs per_user;       /* User specified PER registers */
         struct per_event per_event;     /* Cause of the last PER trap */
diff --git a/arch/s390/include/uapi/asm/kvm.h b/arch/s390/include/uapi/asm/kvm.h

index f0818d7..a2ffec4 100644 (file)
--- a/arch/s390/include/uapi/asm/kvm.h
+++ b/arch/s390/include/uapi/asm/kvm.h
@@ -98,6 +98,18 @@ struct kvm_s390_vm_cpu_machine {
  
  #define KVM_S390_VM_CPU_FEAT_NR_BITS   1024
  #define KVM_S390_VM_CPU_FEAT_ESOP      0
+#define KVM_S390_VM_CPU_FEAT_SIEF2     1
+#define KVM_S390_VM_CPU_FEAT_64BSCAO   2
+#define KVM_S390_VM_CPU_FEAT_SIIF      3
+#define KVM_S390_VM_CPU_FEAT_GPERE     4
+#define KVM_S390_VM_CPU_FEAT_GSLS      5
+#define KVM_S390_VM_CPU_FEAT_IB                6
+#define KVM_S390_VM_CPU_FEAT_CEI       7
+#define KVM_S390_VM_CPU_FEAT_IBS       8
+#define KVM_S390_VM_CPU_FEAT_SKEY      9
+#define KVM_S390_VM_CPU_FEAT_CMMA      10
+#define KVM_S390_VM_CPU_FEAT_PFMFI     11
+#define KVM_S390_VM_CPU_FEAT_SIGPIF    12
  struct kvm_s390_vm_cpu_feat {
         __u64 feat[16];
  };
diff --git a/arch/s390/kvm/Makefile b/arch/s390/kvm/Makefile

index 82e73e2..09a9e6d 100644 (file)
--- a/arch/s390/kvm/Makefile
+++ b/arch/s390/kvm/Makefile
@@ -12,6 +12,6 @@ common-objs = $(KVM)/kvm_main.o $(KVM)/eventfd.o  $(KVM)/async_pf.o $(KVM)/irqch
  ccflags-y := -Ivirt/kvm -Iarch/s390/kvm
  
  kvm-objs := $(common-objs) kvm-s390.o intercept.o interrupt.o priv.o sigp.o
-kvm-objs += diag.o gaccess.o guestdbg.o sthyi.o
+kvm-objs += diag.o gaccess.o guestdbg.o sthyi.o vsie.o
  
  obj-$(CONFIG_KVM) += kvm.o
diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c

index 8e245e7..5420020 100644 (file)
--- a/arch/s390/kvm/gaccess.c
+++ b/arch/s390/kvm/gaccess.c
@@ -8,6 +8,7 @@
  #include <linux/vmalloc.h>
  #include <linux/err.h>
  #include <asm/pgtable.h>
+#include <asm/gmap.h>
  #include "kvm-s390.h"
  #include "gaccess.h"
  #include <asm/switch_to.h>
@@ -946,3 +947,241 @@ int kvm_s390_check_low_addr_prot_real(struct kvm_vcpu *vcpu, unsigned long gra)
                 return 0;
         return trans_exc(vcpu, PGM_PROTECTION, gra, 0, GACC_STORE, PROT_TYPE_LA);
  }
+
+/**
+ * kvm_s390_shadow_tables - walk the guest page table and create shadow tables
+ * @sg: pointer to the shadow guest address space structure
+ * @saddr: faulting address in the shadow gmap
+ * @pgt: pointer to the page table address result
+ * @fake: pgt references contiguous guest memory block, not a pgtable
+ */
+static int kvm_s390_shadow_tables(struct gmap *sg, unsigned long saddr,
+                                 unsigned long *pgt, int *dat_protection,
+                                 int *fake)
+{
+       struct gmap *parent;
+       union asce asce;
+       union vaddress vaddr;
+       unsigned long ptr;
+       int rc;
+
+       *fake = 0;
+       *dat_protection = 0;
+       parent = sg->parent;
+       vaddr.addr = saddr;
+       asce.val = sg->orig_asce;
+       ptr = asce.origin * 4096;
+       if (asce.r) {
+               *fake = 1;
+               asce.dt = ASCE_TYPE_REGION1;
+       }
+       switch (asce.dt) {
+       case ASCE_TYPE_REGION1:
+               if (vaddr.rfx01 > asce.tl && !asce.r)
+                       return PGM_REGION_FIRST_TRANS;
+               break;
+       case ASCE_TYPE_REGION2:
+               if (vaddr.rfx)
+                       return PGM_ASCE_TYPE;
+               if (vaddr.rsx01 > asce.tl)
+                       return PGM_REGION_SECOND_TRANS;
+               break;
+       case ASCE_TYPE_REGION3:
+               if (vaddr.rfx || vaddr.rsx)
+                       return PGM_ASCE_TYPE;
+               if (vaddr.rtx01 > asce.tl)
+                       return PGM_REGION_THIRD_TRANS;
+               break;
+       case ASCE_TYPE_SEGMENT:
+               if (vaddr.rfx || vaddr.rsx || vaddr.rtx)
+                       return PGM_ASCE_TYPE;
+               if (vaddr.sx01 > asce.tl)
+                       return PGM_SEGMENT_TRANSLATION;
+               break;
+       }
+
+       switch (asce.dt) {
+       case ASCE_TYPE_REGION1: {
+               union region1_table_entry rfte;
+
+               if (*fake) {
+                       /* offset in 16EB guest memory block */
+                       ptr = ptr + ((unsigned long) vaddr.rsx << 53UL);
+                       rfte.val = ptr;
+                       goto shadow_r2t;
+               }
+               rc = gmap_read_table(parent, ptr + vaddr.rfx * 8, &rfte.val);
+               if (rc)
+                       return rc;
+               if (rfte.i)
+                       return PGM_REGION_FIRST_TRANS;
+               if (rfte.tt != TABLE_TYPE_REGION1)
+                       return PGM_TRANSLATION_SPEC;
+               if (vaddr.rsx01 < rfte.tf || vaddr.rsx01 > rfte.tl)
+                       return PGM_REGION_SECOND_TRANS;
+               if (sg->edat_level >= 1)
+                       *dat_protection |= rfte.p;
+               ptr = rfte.rto << 12UL;
+shadow_r2t:
+               rc = gmap_shadow_r2t(sg, saddr, rfte.val, *fake);
+               if (rc)
+                       return rc;
+               /* fallthrough */
+       }
+       case ASCE_TYPE_REGION2: {
+               union region2_table_entry rste;
+
+               if (*fake) {
+                       /* offset in 8PB guest memory block */
+                       ptr = ptr + ((unsigned long) vaddr.rtx << 42UL);
+                       rste.val = ptr;
+                       goto shadow_r3t;
+               }
+               rc = gmap_read_table(parent, ptr + vaddr.rsx * 8, &rste.val);
+               if (rc)
+                       return rc;
+               if (rste.i)
+                       return PGM_REGION_SECOND_TRANS;
+               if (rste.tt != TABLE_TYPE_REGION2)
+                       return PGM_TRANSLATION_SPEC;
+               if (vaddr.rtx01 < rste.tf || vaddr.rtx01 > rste.tl)
+                       return PGM_REGION_THIRD_TRANS;
+               if (sg->edat_level >= 1)
+                       *dat_protection |= rste.p;
+               ptr = rste.rto << 12UL;
+shadow_r3t:
+               rste.p |= *dat_protection;
+               rc = gmap_shadow_r3t(sg, saddr, rste.val, *fake);
+               if (rc)
+                       return rc;
+               /* fallthrough */
+       }
+       case ASCE_TYPE_REGION3: {
+               union region3_table_entry rtte;
+
+               if (*fake) {
+                       /* offset in 4TB guest memory block */
+                       ptr = ptr + ((unsigned long) vaddr.sx << 31UL);
+                       rtte.val = ptr;
+                       goto shadow_sgt;
+               }
+               rc = gmap_read_table(parent, ptr + vaddr.rtx * 8, &rtte.val);
+               if (rc)
+                       return rc;
+               if (rtte.i)
+                       return PGM_REGION_THIRD_TRANS;
+               if (rtte.tt != TABLE_TYPE_REGION3)
+                       return PGM_TRANSLATION_SPEC;
+               if (rtte.cr && asce.p && sg->edat_level >= 2)
+                       return PGM_TRANSLATION_SPEC;
+               if (rtte.fc && sg->edat_level >= 2) {
+                       *dat_protection |= rtte.fc0.p;
+                       *fake = 1;
+                       ptr = rtte.fc1.rfaa << 31UL;
+                       rtte.val = ptr;
+                       goto shadow_sgt;
+               }
+               if (vaddr.sx01 < rtte.fc0.tf || vaddr.sx01 > rtte.fc0.tl)
+                       return PGM_SEGMENT_TRANSLATION;
+               if (sg->edat_level >= 1)
+                       *dat_protection |= rtte.fc0.p;
+               ptr = rtte.fc0.sto << 12UL;
+shadow_sgt:
+               rtte.fc0.p |= *dat_protection;
+               rc = gmap_shadow_sgt(sg, saddr, rtte.val, *fake);
+               if (rc)
+                       return rc;
+               /* fallthrough */
+       }
+       case ASCE_TYPE_SEGMENT: {
+               union segment_table_entry ste;
+
+               if (*fake) {
+                       /* offset in 2G guest memory block */
+                       ptr = ptr + ((unsigned long) vaddr.sx << 20UL);
+                       ste.val = ptr;
+                       goto shadow_pgt;
+               }
+               rc = gmap_read_table(parent, ptr + vaddr.sx * 8, &ste.val);
+               if (rc)
+                       return rc;
+               if (ste.i)
+                       return PGM_SEGMENT_TRANSLATION;
+               if (ste.tt != TABLE_TYPE_SEGMENT)
+                       return PGM_TRANSLATION_SPEC;
+               if (ste.cs && asce.p)
+                       return PGM_TRANSLATION_SPEC;
+               *dat_protection |= ste.fc0.p;
+               if (ste.fc && sg->edat_level >= 1) {
+                       *fake = 1;
+                       ptr = ste.fc1.sfaa << 20UL;
+                       ste.val = ptr;
+                       goto shadow_pgt;
+               }
+               ptr = ste.fc0.pto << 11UL;
+shadow_pgt:
+               ste.fc0.p |= *dat_protection;
+               rc = gmap_shadow_pgt(sg, saddr, ste.val, *fake);
+               if (rc)
+                       return rc;
+       }
+       }
+       /* Return the parent address of the page table */
+       *pgt = ptr;
+       return 0;
+}
+
+/**
+ * kvm_s390_shadow_fault - handle fault on a shadow page table
+ * @vcpu: virtual cpu
+ * @sg: pointer to the shadow guest address space structure
+ * @saddr: faulting address in the shadow gmap
+ *
+ * Returns: - 0 if the shadow fault was successfully resolved
+ *         - > 0 (pgm exception code) on exceptions while faulting
+ *         - -EAGAIN if the caller can retry immediately
+ *         - -EFAULT when accessing invalid guest addresses
+ *         - -ENOMEM if out of memory
+ */
+int kvm_s390_shadow_fault(struct kvm_vcpu *vcpu, struct gmap *sg,
+                         unsigned long saddr)
+{
+       union vaddress vaddr;
+       union page_table_entry pte;
+       unsigned long pgt;
+       int dat_protection, fake;
+       int rc;
+
+       down_read(&sg->mm->mmap_sem);
+       /*
+        * We don't want any guest-2 tables to change - so the parent
+        * tables/pointers we read stay valid - unshadowing is however
+        * always possible - only guest_table_lock protects us.
+        */
+       ipte_lock(vcpu);
+
+       rc = gmap_shadow_pgt_lookup(sg, saddr, &pgt, &dat_protection, &fake);
+       if (rc)
+               rc = kvm_s390_shadow_tables(sg, saddr, &pgt, &dat_protection,
+                                           &fake);
+
+       vaddr.addr = saddr;
+       if (fake) {
+               /* offset in 1MB guest memory block */
+               pte.val = pgt + ((unsigned long) vaddr.px << 12UL);
+               goto shadow_page;
+       }
+       if (!rc)
+               rc = gmap_read_table(sg->parent, pgt + vaddr.px * 8, &pte.val);
+       if (!rc && pte.i)
+               rc = PGM_PAGE_TRANSLATION;
+       if (!rc && (pte.z || (pte.co && sg->edat_level < 1)))
+               rc = PGM_TRANSLATION_SPEC;
+shadow_page:
+       pte.p |= dat_protection;
+       if (!rc)
+               rc = gmap_shadow_page(sg, saddr, __pte(pte.val));
+       ipte_unlock(vcpu);
+       up_read(&sg->mm->mmap_sem);
+       return rc;
+}
diff --git a/arch/s390/kvm/gaccess.h b/arch/s390/kvm/gaccess.h

index df0a79d..8756569 100644 (file)
--- a/arch/s390/kvm/gaccess.h
+++ b/arch/s390/kvm/gaccess.h
@@ -361,4 +361,7 @@ void ipte_unlock(struct kvm_vcpu *vcpu);
  int ipte_lock_held(struct kvm_vcpu *vcpu);
  int kvm_s390_check_low_addr_prot_real(struct kvm_vcpu *vcpu, unsigned long gra);
  
+int kvm_s390_shadow_fault(struct kvm_vcpu *vcpu, struct gmap *shadow,
+                         unsigned long saddr);
+
  #endif /* __KVM_S390_GACCESS_H */
diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c

index d72c4a8..ca19627 100644 (file)
--- a/arch/s390/kvm/interrupt.c
+++ b/arch/s390/kvm/interrupt.c
@@ -995,6 +995,11 @@ void kvm_s390_vcpu_wakeup(struct kvm_vcpu *vcpu)
                 swake_up(&vcpu->wq);
                 vcpu->stat.halt_wakeup++;
         }
+       /*
+        * The VCPU might not be sleeping but is executing the VSIE. Let's
+        * kick it, so it leaves the SIE to process the request.
+        */
+       kvm_s390_vsie_kick(vcpu);
  }
  
  enum hrtimer_restart kvm_s390_idle_wakeup(struct hrtimer *timer)
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c

index 0dcf9b8..03eeeb0 100644 (file)
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -21,6 +21,7 @@
  #include <linux/init.h>
  #include <linux/kvm.h>
  #include <linux/kvm_host.h>
+#include <linux/mman.h>
  #include <linux/module.h>
  #include <linux/random.h>
  #include <linux/slab.h>
@@ -98,6 +99,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
         { "instruction_stfl", VCPU_STAT(instruction_stfl) },
         { "instruction_tprot", VCPU_STAT(instruction_tprot) },
         { "instruction_sthyi", VCPU_STAT(instruction_sthyi) },
+       { "instruction_sie", VCPU_STAT(instruction_sie) },
         { "instruction_sigp_sense", VCPU_STAT(instruction_sigp_sense) },
         { "instruction_sigp_sense_running", VCPU_STAT(instruction_sigp_sense_running) },
         { "instruction_sigp_external_call", VCPU_STAT(instruction_sigp_external_call) },
@@ -123,6 +125,11 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
         { NULL }
  };
  
+/* allow nested virtualization in KVM (if enabled by user space) */
+static int nested;
+module_param(nested, int, S_IRUGO);
+MODULE_PARM_DESC(nested, "Nested virtualization support");
+
  /* upper facilities limit for kvm */
  unsigned long kvm_s390_fac_list_mask[16] = {
         0xffe6000000000000UL,
@@ -141,6 +148,7 @@ static DECLARE_BITMAP(kvm_s390_available_cpu_feat, KVM_S390_VM_CPU_FEAT_NR_BITS)
  static struct kvm_s390_vm_cpu_subfunc kvm_s390_available_subfunc;
  
  static struct gmap_notifier gmap_notifier;
+static struct gmap_notifier vsie_gmap_notifier;
  debug_info_t *kvm_s390_dbf;
  
  /* Section: not file related */
@@ -150,7 +158,8 @@ int kvm_arch_hardware_enable(void)
         return 0;
  }
  
-static void kvm_gmap_notifier(struct gmap *gmap, unsigned long address);
+static void kvm_gmap_notifier(struct gmap *gmap, unsigned long start,
+                             unsigned long end);
  
  /*
   * This callback is executed during stop_machine(). All CPUs are therefore
@@ -172,6 +181,8 @@ static int kvm_clock_sync(struct notifier_block *notifier, unsigned long val,
                         vcpu->arch.sie_block->epoch -= *delta;
                         if (vcpu->arch.cputm_enabled)
                                 vcpu->arch.cputm_start += *delta;
+                       if (vcpu->arch.vsie_block)
+                               vcpu->arch.vsie_block->epoch -= *delta;
                 }
         }
         return NOTIFY_OK;
@@ -184,7 +195,9 @@ static struct notifier_block kvm_clock_notifier = {
  int kvm_arch_hardware_setup(void)
  {
         gmap_notifier.notifier_call = kvm_gmap_notifier;
-       gmap_register_ipte_notifier(&gmap_notifier);
+       gmap_register_pte_notifier(&gmap_notifier);
+       vsie_gmap_notifier.notifier_call = kvm_s390_vsie_gmap_notifier;
+       gmap_register_pte_notifier(&vsie_gmap_notifier);
         atomic_notifier_chain_register(&s390_epoch_delta_notifier,
                                        &kvm_clock_notifier);
         return 0;
@@ -192,7 +205,8 @@ int kvm_arch_hardware_setup(void)
  
  void kvm_arch_hardware_unsetup(void)
  {
-       gmap_unregister_ipte_notifier(&gmap_notifier);
+       gmap_unregister_pte_notifier(&gmap_notifier);
+       gmap_unregister_pte_notifier(&vsie_gmap_notifier);
         atomic_notifier_chain_unregister(&s390_epoch_delta_notifier,
                                          &kvm_clock_notifier);
  }
@@ -250,6 +264,46 @@ static void kvm_s390_cpu_feat_init(void)
  
         if (MACHINE_HAS_ESOP)
                 allow_cpu_feat(KVM_S390_VM_CPU_FEAT_ESOP);
+       /*
+        * We need SIE support, ESOP (PROT_READ protection for gmap_shadow),
+        * 64bit SCAO (SCA passthrough) and IDTE (for gmap_shadow unshadowing).
+        */
+       if (!sclp.has_sief2 || !MACHINE_HAS_ESOP || !sclp.has_64bscao ||
+           !test_facility(3) || !nested)
+               return;
+       allow_cpu_feat(KVM_S390_VM_CPU_FEAT_SIEF2);
+       if (sclp.has_64bscao)
+               allow_cpu_feat(KVM_S390_VM_CPU_FEAT_64BSCAO);
+       if (sclp.has_siif)
+               allow_cpu_feat(KVM_S390_VM_CPU_FEAT_SIIF);
+       if (sclp.has_gpere)
+               allow_cpu_feat(KVM_S390_VM_CPU_FEAT_GPERE);
+       if (sclp.has_gsls)
+               allow_cpu_feat(KVM_S390_VM_CPU_FEAT_GSLS);
+       if (sclp.has_ib)
+               allow_cpu_feat(KVM_S390_VM_CPU_FEAT_IB);
+       if (sclp.has_cei)
+               allow_cpu_feat(KVM_S390_VM_CPU_FEAT_CEI);
+       if (sclp.has_ibs)
+               allow_cpu_feat(KVM_S390_VM_CPU_FEAT_IBS);
+       /*
+        * KVM_S390_VM_CPU_FEAT_SKEY: Wrong shadow of PTE.I bits will make
+        * all skey handling functions read/set the skey from the PGSTE
+        * instead of the real storage key.
+        *
+        * KVM_S390_VM_CPU_FEAT_CMMA: Wrong shadow of PTE.I bits will make
+        * pages being detected as preserved although they are resident.
+        *
+        * KVM_S390_VM_CPU_FEAT_PFMFI: Wrong shadow of PTE.I bits will
+        * have the same effect as for KVM_S390_VM_CPU_FEAT_SKEY.
+        *
+        * For KVM_S390_VM_CPU_FEAT_SKEY, KVM_S390_VM_CPU_FEAT_CMMA and
+        * KVM_S390_VM_CPU_FEAT_PFMFI, all PTE.I and PGSTE bits have to be
+        * correctly shadowed. We can do that for the PGSTE but not for PTE.I.
+        *
+        * KVM_S390_VM_CPU_FEAT_SIGPIF: Wrong SCB addresses in the SCA. We
+        * cannot easily shadow the SCA because of the ipte lock.
+        */
  }
  
  int kvm_arch_init(void *opaque)
@@ -530,20 +584,20 @@ static int kvm_s390_set_mem_control(struct kvm *kvm, struct kvm_device_attr *att
                 if (!new_limit)
                         return -EINVAL;
  
-               /* gmap_alloc takes last usable address */
+               /* gmap_create takes last usable address */
                 if (new_limit != KVM_S390_NO_MEM_LIMIT)
                         new_limit -= 1;
  
                 ret = -EBUSY;
                 mutex_lock(&kvm->lock);
                 if (!kvm->created_vcpus) {
-                       /* gmap_alloc will round the limit up */
-                       struct gmap *new = gmap_alloc(current->mm, new_limit);
+                       /* gmap_create will round the limit up */
+                       struct gmap *new = gmap_create(current->mm, new_limit);
  
                         if (!new) {
                                 ret = -ENOMEM;
                         } else {
-                               gmap_free(kvm->arch.gmap);
+                               gmap_remove(kvm->arch.gmap);
                                 new->private = kvm;
                                 kvm->arch.gmap = new;
                                 ret = 0;
@@ -1392,7 +1446,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
                 else
                         kvm->arch.mem_limit = min_t(unsigned long, TASK_MAX_SIZE,
                                                     sclp.hamax + 1);
-               kvm->arch.gmap = gmap_alloc(current->mm, kvm->arch.mem_limit - 1);
+               kvm->arch.gmap = gmap_create(current->mm, kvm->arch.mem_limit - 1);
                 if (!kvm->arch.gmap)
                         goto out_err;
                 kvm->arch.gmap->private = kvm;
@@ -1404,6 +1458,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
         kvm->arch.epoch = 0;
  
         spin_lock_init(&kvm->arch.start_stop_lock);
+       kvm_s390_vsie_init(kvm);
         KVM_EVENT(3, "vm 0x%pK created by pid %u", kvm, current->pid);
  
         return 0;
@@ -1425,7 +1480,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
                 sca_del_vcpu(vcpu);
  
         if (kvm_is_ucontrol(vcpu->kvm))
-               gmap_free(vcpu->arch.gmap);
+               gmap_remove(vcpu->arch.gmap);
  
         if (vcpu->kvm->arch.use_cmma)
                 kvm_s390_vcpu_unsetup_cmma(vcpu);
@@ -1458,16 +1513,17 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
         debug_unregister(kvm->arch.dbf);
         free_page((unsigned long)kvm->arch.sie_page2);
         if (!kvm_is_ucontrol(kvm))
-               gmap_free(kvm->arch.gmap);
+               gmap_remove(kvm->arch.gmap);
         kvm_s390_destroy_adapters(kvm);
         kvm_s390_clear_float_irqs(kvm);
+       kvm_s390_vsie_destroy(kvm);
         KVM_EVENT(3, "vm 0x%pK destroyed", kvm);
  }
  
  /* Section: vcpu related */
  static int __kvm_ucontrol_vcpu_init(struct kvm_vcpu *vcpu)
  {
-       vcpu->arch.gmap = gmap_alloc(current->mm, -1UL);
+       vcpu->arch.gmap = gmap_create(current->mm, -1UL);
         if (!vcpu->arch.gmap)
                 return -ENOMEM;
         vcpu->arch.gmap->private = vcpu->kvm;
@@ -1717,7 +1773,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
  
         save_access_regs(vcpu->arch.host_acrs);
         restore_access_regs(vcpu->run->s.regs.acrs);
-       gmap_enable(vcpu->arch.gmap);
+       gmap_enable(vcpu->arch.enabled_gmap);
         atomic_or(CPUSTAT_RUNNING, &vcpu->arch.sie_block->cpuflags);
         if (vcpu->arch.cputm_enabled && !is_vcpu_idle(vcpu))
                 __start_cpu_timer_accounting(vcpu);
@@ -1730,7 +1786,8 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
         if (vcpu->arch.cputm_enabled && !is_vcpu_idle(vcpu))
                 __stop_cpu_timer_accounting(vcpu);
         atomic_andnot(CPUSTAT_RUNNING, &vcpu->arch.sie_block->cpuflags);
-       gmap_disable(vcpu->arch.gmap);
+       vcpu->arch.enabled_gmap = gmap_get_enabled();
+       gmap_disable(vcpu->arch.enabled_gmap);
  
         /* Save guest register state */
         save_fpu_regs();
@@ -1779,7 +1836,8 @@ void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
                 vcpu->arch.gmap = vcpu->kvm->arch.gmap;
                 sca_add_vcpu(vcpu);
         }
-
+       /* make vcpu_load load the right gmap on the first trigger */
+       vcpu->arch.enabled_gmap = vcpu->arch.gmap;
  }
  
  static void kvm_s390_vcpu_crypto_setup(struct kvm_vcpu *vcpu)
@@ -1976,16 +2034,25 @@ void kvm_s390_sync_request(int req, struct kvm_vcpu *vcpu)
         kvm_s390_vcpu_request(vcpu);
  }
  
-static void kvm_gmap_notifier(struct gmap *gmap, unsigned long address)
+static void kvm_gmap_notifier(struct gmap *gmap, unsigned long start,
+                             unsigned long end)
  {
-       int i;
         struct kvm *kvm = gmap->private;
         struct kvm_vcpu *vcpu;
+       unsigned long prefix;
+       int i;
  
+       if (gmap_is_shadow(gmap))
+               return;
+       if (start >= 1UL << 31)
+               /* We are only interested in prefix pages */
+               return;
         kvm_for_each_vcpu(i, vcpu, kvm) {
                 /* match against both prefix pages */
-               if (kvm_s390_get_prefix(vcpu) == (address & ~0x1000UL)) {
-                       VCPU_EVENT(vcpu, 2, "gmap notifier for %lx", address);
+               prefix = kvm_s390_get_prefix(vcpu);
+               if (prefix <= end && start <= prefix + 2*PAGE_SIZE - 1) {
+                       VCPU_EVENT(vcpu, 2, "gmap notifier for %lx-%lx",
+                                  start, end);
                         kvm_s390_sync_request(KVM_REQ_MMU_RELOAD, vcpu);
                 }
         }
@@ -2264,16 +2331,16 @@ retry:
                 return 0;
         /*
          * We use MMU_RELOAD just to re-arm the ipte notifier for the
-        * guest prefix page. gmap_ipte_notify will wait on the ptl lock.
+        * guest prefix page. gmap_mprotect_notify will wait on the ptl lock.
          * This ensures that the ipte instruction for this request has
          * already finished. We might race against a second unmapper that
          * wants to set the blocking bit. Lets just retry the request loop.
          */
         if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu)) {
                 int rc;
-               rc = gmap_ipte_notify(vcpu->arch.gmap,
-                                     kvm_s390_get_prefix(vcpu),
-                                     PAGE_SIZE * 2);
+               rc = gmap_mprotect_notify(vcpu->arch.gmap,
+                                         kvm_s390_get_prefix(vcpu),
+                                         PAGE_SIZE * 2, PROT_WRITE);
                 if (rc)
                         return rc;
                 goto retry;
diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h

index 52aa47e..031f451 100644 (file)
--- a/arch/s390/kvm/kvm-s390.h
+++ b/arch/s390/kvm/kvm-s390.h
@@ -56,7 +56,7 @@ static inline int is_vcpu_stopped(struct kvm_vcpu *vcpu)
  
  static inline int is_vcpu_idle(struct kvm_vcpu *vcpu)
  {
-       return atomic_read(&vcpu->arch.sie_block->cpuflags) & CPUSTAT_WAIT;
+       return test_bit(vcpu->vcpu_id, vcpu->arch.local_int.float_int->idle_mask);
  }
  
  static inline int kvm_is_ucontrol(struct kvm *kvm)
@@ -252,6 +252,14 @@ int kvm_s390_handle_stctl(struct kvm_vcpu *vcpu);
  int kvm_s390_handle_lctl(struct kvm_vcpu *vcpu);
  int kvm_s390_handle_eb(struct kvm_vcpu *vcpu);
  
+/* implemented in vsie.c */
+int kvm_s390_handle_vsie(struct kvm_vcpu *vcpu);
+void kvm_s390_vsie_kick(struct kvm_vcpu *vcpu);
+void kvm_s390_vsie_gmap_notifier(struct gmap *gmap, unsigned long start,
+                                unsigned long end);
+void kvm_s390_vsie_init(struct kvm *kvm);
+void kvm_s390_vsie_destroy(struct kvm *kvm);
+
  /* implemented in sigp.c */
  int kvm_s390_handle_sigp(struct kvm_vcpu *vcpu);
  int kvm_s390_handle_sigp_pei(struct kvm_vcpu *vcpu);
diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c

index 3db3be1..c77ad2d 100644 (file)
--- a/arch/s390/kvm/priv.c
+++ b/arch/s390/kvm/priv.c
@@ -719,6 +719,7 @@ static const intercept_handler_t b2_handlers[256] = {
         [0x10] = handle_set_prefix,
         [0x11] = handle_store_prefix,
         [0x12] = handle_store_cpu_address,
+       [0x14] = kvm_s390_handle_vsie,
         [0x21] = handle_ipte_interlock,
         [0x29] = handle_iske,
         [0x2a] = handle_rrbe,
diff --git a/arch/s390/kvm/sigp.c b/arch/s390/kvm/sigp.c

index 28ea0ca..1a252f5 100644 (file)
--- a/arch/s390/kvm/sigp.c
+++ b/arch/s390/kvm/sigp.c
@@ -77,18 +77,18 @@ static int __sigp_conditional_emergency(struct kvm_vcpu *vcpu,
         const u64 psw_int_mask = PSW_MASK_IO | PSW_MASK_EXT;
         u16 p_asn, s_asn;
         psw_t *psw;
-       u32 flags;
+       bool idle;
  
-       flags = atomic_read(&dst_vcpu->arch.sie_block->cpuflags);
+       idle = is_vcpu_idle(vcpu);
         psw = &dst_vcpu->arch.sie_block->gpsw;
         p_asn = dst_vcpu->arch.sie_block->gcr[4] & 0xffff;  /* Primary ASN */
         s_asn = dst_vcpu->arch.sie_block->gcr[3] & 0xffff;  /* Secondary ASN */
  
         /* Inject the emergency signal? */
-       if (!(flags & CPUSTAT_STOPPED)
+       if (!is_vcpu_stopped(vcpu)
             || (psw->mask & psw_int_mask) != psw_int_mask
-           || ((flags & CPUSTAT_WAIT) && psw->addr != 0)
-           || (!(flags & CPUSTAT_WAIT) && (asn == p_asn || asn == s_asn))) {
+           || (idle && psw->addr != 0)
+           || (!idle && (asn == p_asn || asn == s_asn))) {
                 return __inject_sigp_emergency(vcpu, dst_vcpu);
         } else {
                 *reg &= 0xffffffff00000000UL;
diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c

new file mode 100644 (file)

index 0000000..6895e7b
--- /dev/null
+++ b/arch/s390/kvm/vsie.c
@@ -0,0 +1,1091 @@
+/*
+ * kvm nested virtualization support for s390x
+ *
+ * Copyright IBM Corp. 2016
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License (version 2 only)
+ * as published by the Free Software Foundation.
+ *
+ *    Author(s): David Hildenbrand <dahi@linux.vnet.ibm.com>
+ */
+#include <linux/vmalloc.h>
+#include <linux/kvm_host.h>
+#include <linux/bug.h>
+#include <linux/list.h>
+#include <linux/bitmap.h>
+#include <asm/gmap.h>
+#include <asm/mmu_context.h>
+#include <asm/sclp.h>
+#include <asm/nmi.h>
+#include <asm/dis.h>
+#include "kvm-s390.h"
+#include "gaccess.h"
+
+struct vsie_page {
+       struct kvm_s390_sie_block scb_s;        /* 0x0000 */
+       /* the pinned originial scb */
+       struct kvm_s390_sie_block *scb_o;       /* 0x0200 */
+       /* the shadow gmap in use by the vsie_page */
+       struct gmap *gmap;                      /* 0x0208 */
+       /* address of the last reported fault to guest2 */
+       unsigned long fault_addr;               /* 0x0210 */
+       __u8 reserved[0x0700 - 0x0218];         /* 0x0218 */
+       struct kvm_s390_crypto_cb crycb;        /* 0x0700 */
+       __u8 fac[S390_ARCH_FAC_LIST_SIZE_BYTE]; /* 0x0800 */
+} __packed;
+
+/* trigger a validity icpt for the given scb */
+static int set_validity_icpt(struct kvm_s390_sie_block *scb,
+                            __u16 reason_code)
+{
+       scb->ipa = 0x1000;
+       scb->ipb = ((__u32) reason_code) << 16;
+       scb->icptcode = ICPT_VALIDITY;
+       return 1;
+}
+
+/* mark the prefix as unmapped, this will block the VSIE */
+static void prefix_unmapped(struct vsie_page *vsie_page)
+{
+       atomic_or(PROG_REQUEST, &vsie_page->scb_s.prog20);
+}
+
+/* mark the prefix as unmapped and wait until the VSIE has been left */
+static void prefix_unmapped_sync(struct vsie_page *vsie_page)
+{
+       prefix_unmapped(vsie_page);
+       if (vsie_page->scb_s.prog0c & PROG_IN_SIE)
+               atomic_or(CPUSTAT_STOP_INT, &vsie_page->scb_s.cpuflags);
+       while (vsie_page->scb_s.prog0c & PROG_IN_SIE)
+               cpu_relax();
+}
+
+/* mark the prefix as mapped, this will allow the VSIE to run */
+static void prefix_mapped(struct vsie_page *vsie_page)
+{
+       atomic_andnot(PROG_REQUEST, &vsie_page->scb_s.prog20);
+}
+
+/* test if the prefix is mapped into the gmap shadow */
+static int prefix_is_mapped(struct vsie_page *vsie_page)
+{
+       return !(atomic_read(&vsie_page->scb_s.prog20) & PROG_REQUEST);
+}
+
+/* copy the updated intervention request bits into the shadow scb */
+static void update_intervention_requests(struct vsie_page *vsie_page)
+{
+       const int bits = CPUSTAT_STOP_INT | CPUSTAT_IO_INT | CPUSTAT_EXT_INT;
+       int cpuflags;
+
+       cpuflags = atomic_read(&vsie_page->scb_o->cpuflags);
+       atomic_andnot(bits, &vsie_page->scb_s.cpuflags);
+       atomic_or(cpuflags & bits, &vsie_page->scb_s.cpuflags);
+}
+
+/* shadow (filter and validate) the cpuflags  */
+static int prepare_cpuflags(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
+{
+       struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
+       struct kvm_s390_sie_block *scb_o = vsie_page->scb_o;
+       int newflags, cpuflags = atomic_read(&scb_o->cpuflags);
+
+       /* we don't allow ESA/390 guests */
+       if (!(cpuflags & CPUSTAT_ZARCH))
+               return set_validity_icpt(scb_s, 0x0001U);
+
+       if (cpuflags & (CPUSTAT_RRF | CPUSTAT_MCDS))
+               return set_validity_icpt(scb_s, 0x0001U);
+       else if (cpuflags & (CPUSTAT_SLSV | CPUSTAT_SLSR))
+               return set_validity_icpt(scb_s, 0x0007U);
+
+       /* intervention requests will be set later */
+       newflags = CPUSTAT_ZARCH;
+       if (cpuflags & CPUSTAT_GED && test_kvm_facility(vcpu->kvm, 8))
+               newflags |= CPUSTAT_GED;
+       if (cpuflags & CPUSTAT_GED2 && test_kvm_facility(vcpu->kvm, 78)) {
+               if (cpuflags & CPUSTAT_GED)
+                       return set_validity_icpt(scb_s, 0x0001U);
+               newflags |= CPUSTAT_GED2;
+       }
+       if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_GPERE))
+               newflags |= cpuflags & CPUSTAT_P;
+       if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_GSLS))
+               newflags |= cpuflags & CPUSTAT_SM;
+       if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_IBS))
+               newflags |= cpuflags & CPUSTAT_IBS;
+
+       atomic_set(&scb_s->cpuflags, newflags);
+       return 0;
+}
+
+/*
+ * Create a shadow copy of the crycb block and setup key wrapping, if
+ * requested for guest 3 and enabled for guest 2.
+ *
+ * We only accept format-1 (no AP in g2), but convert it into format-2
+ * There is nothing to do for format-0.
+ *
+ * Returns: - 0 if shadowed or nothing to do
+ *          - > 0 if control has to be given to guest 2
+ */
+static int shadow_crycb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
+{
+       struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
+       struct kvm_s390_sie_block *scb_o = vsie_page->scb_o;
+       u32 crycb_addr = scb_o->crycbd & 0x7ffffff8U;
+       unsigned long *b1, *b2;
+       u8 ecb3_flags;
+
+       scb_s->crycbd = 0;
+       if (!(scb_o->crycbd & vcpu->arch.sie_block->crycbd & CRYCB_FORMAT1))
+               return 0;
+       /* format-1 is supported with message-security-assist extension 3 */
+       if (!test_kvm_facility(vcpu->kvm, 76))
+               return 0;
+       /* we may only allow it if enabled for guest 2 */
+       ecb3_flags = scb_o->ecb3 & vcpu->arch.sie_block->ecb3 &
+                    (ECB3_AES | ECB3_DEA);
+       if (!ecb3_flags)
+               return 0;
+
+       if ((crycb_addr & PAGE_MASK) != ((crycb_addr + 128) & PAGE_MASK))
+               return set_validity_icpt(scb_s, 0x003CU);
+       else if (!crycb_addr)
+               return set_validity_icpt(scb_s, 0x0039U);
+
+       /* copy only the wrapping keys */
+       if (read_guest_real(vcpu, crycb_addr + 72, &vsie_page->crycb, 56))
+               return set_validity_icpt(scb_s, 0x0035U);
+
+       scb_s->ecb3 |= ecb3_flags;
+       scb_s->crycbd = ((__u32)(__u64) &vsie_page->crycb) | CRYCB_FORMAT1 |
+                       CRYCB_FORMAT2;
+
+       /* xor both blocks in one run */
+       b1 = (unsigned long *) vsie_page->crycb.dea_wrapping_key_mask;
+       b2 = (unsigned long *)
+                           vcpu->kvm->arch.crypto.crycb->dea_wrapping_key_mask;
+       /* as 56%8 == 0, bitmap_xor won't overwrite any data */
+       bitmap_xor(b1, b1, b2, BITS_PER_BYTE * 56);
+       return 0;
+}
+
+/* shadow (round up/down) the ibc to avoid validity icpt */
+static void prepare_ibc(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
+{
+       struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
+       struct kvm_s390_sie_block *scb_o = vsie_page->scb_o;
+       __u64 min_ibc = (sclp.ibc >> 16) & 0x0fffU;
+
+       scb_s->ibc = 0;
+       /* ibc installed in g2 and requested for g3 */
+       if (vcpu->kvm->arch.model.ibc && (scb_o->ibc & 0x0fffU)) {
+               scb_s->ibc = scb_o->ibc & 0x0fffU;
+               /* takte care of the minimum ibc level of the machine */
+               if (scb_s->ibc < min_ibc)
+                       scb_s->ibc = min_ibc;
+               /* take care of the maximum ibc level set for the guest */
+               if (scb_s->ibc > vcpu->kvm->arch.model.ibc)
+                       scb_s->ibc = vcpu->kvm->arch.model.ibc;
+       }
+}
+
+/* unshadow the scb, copying parameters back to the real scb */
+static void unshadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
+{
+       struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
+       struct kvm_s390_sie_block *scb_o = vsie_page->scb_o;
+
+       /* interception */
+       scb_o->icptcode = scb_s->icptcode;
+       scb_o->icptstatus = scb_s->icptstatus;
+       scb_o->ipa = scb_s->ipa;
+       scb_o->ipb = scb_s->ipb;
+       scb_o->gbea = scb_s->gbea;
+
+       /* timer */
+       scb_o->cputm = scb_s->cputm;
+       scb_o->ckc = scb_s->ckc;
+       scb_o->todpr = scb_s->todpr;
+
+       /* guest state */
+       scb_o->gpsw = scb_s->gpsw;
+       scb_o->gg14 = scb_s->gg14;
+       scb_o->gg15 = scb_s->gg15;
+       memcpy(scb_o->gcr, scb_s->gcr, 128);
+       scb_o->pp = scb_s->pp;
+
+       /* interrupt intercept */
+       switch (scb_s->icptcode) {
+       case ICPT_PROGI:
+       case ICPT_INSTPROGI:
+       case ICPT_EXTINT:
+               memcpy((void *)((u64)scb_o + 0xc0),
+                      (void *)((u64)scb_s + 0xc0), 0xf0 - 0xc0);
+               break;
+       case ICPT_PARTEXEC:
+               /* MVPG only */
+               memcpy((void *)((u64)scb_o + 0xc0),
+                      (void *)((u64)scb_s + 0xc0), 0xd0 - 0xc0);
+               break;
+       }
+
+       if (scb_s->ihcpu != 0xffffU)
+               scb_o->ihcpu = scb_s->ihcpu;
+}
+
+/*
+ * Setup the shadow scb by copying and checking the relevant parts of the g2
+ * provided scb.
+ *
+ * Returns: - 0 if the scb has been shadowed
+ *          - > 0 if control has to be given to guest 2
+ */
+static int shadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
+{
+       struct kvm_s390_sie_block *scb_o = vsie_page->scb_o;
+       struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
+       bool had_tx = scb_s->ecb & 0x10U;
+       unsigned long new_mso = 0;
+       int rc;
+
+       /* make sure we don't have any leftovers when reusing the scb */
+       scb_s->icptcode = 0;
+       scb_s->eca = 0;
+       scb_s->ecb = 0;
+       scb_s->ecb2 = 0;
+       scb_s->ecb3 = 0;
+       scb_s->ecd = 0;
+       scb_s->fac = 0;
+
+       rc = prepare_cpuflags(vcpu, vsie_page);
+       if (rc)
+               goto out;
+
+       /* timer */
+       scb_s->cputm = scb_o->cputm;
+       scb_s->ckc = scb_o->ckc;
+       scb_s->todpr = scb_o->todpr;
+       scb_s->epoch = scb_o->epoch;
+
+       /* guest state */
+       scb_s->gpsw = scb_o->gpsw;
+       scb_s->gg14 = scb_o->gg14;
+       scb_s->gg15 = scb_o->gg15;
+       memcpy(scb_s->gcr, scb_o->gcr, 128);
+       scb_s->pp = scb_o->pp;
+
+       /* interception / execution handling */
+       scb_s->gbea = scb_o->gbea;
+       scb_s->lctl = scb_o->lctl;
+       scb_s->svcc = scb_o->svcc;
+       scb_s->ictl = scb_o->ictl;
+       /*
+        * SKEY handling functions can't deal with false setting of PTE invalid
+        * bits. Therefore we cannot provide interpretation and would later
+        * have to provide own emulation handlers.
+        */
+       scb_s->ictl |= ICTL_ISKE | ICTL_SSKE | ICTL_RRBE;
+       scb_s->icpua = scb_o->icpua;
+
+       if (!(atomic_read(&scb_s->cpuflags) & CPUSTAT_SM))
+               new_mso = scb_o->mso & 0xfffffffffff00000UL;
+       /* if the hva of the prefix changes, we have to remap the prefix */
+       if (scb_s->mso != new_mso || scb_s->prefix != scb_o->prefix)
+               prefix_unmapped(vsie_page);
+        /* SIE will do mso/msl validity and exception checks for us */
+       scb_s->msl = scb_o->msl & 0xfffffffffff00000UL;
+       scb_s->mso = new_mso;
+       scb_s->prefix = scb_o->prefix;
+
+       /* We have to definetly flush the tlb if this scb never ran */
+       if (scb_s->ihcpu != 0xffffU)
+               scb_s->ihcpu = scb_o->ihcpu;
+
+       /* MVPG and Protection Exception Interpretation are always available */
+       scb_s->eca |= scb_o->eca & 0x01002000U;
+       /* Host-protection-interruption introduced with ESOP */
+       if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_ESOP))
+               scb_s->ecb |= scb_o->ecb & 0x02U;
+       /* transactional execution */
+       if (test_kvm_facility(vcpu->kvm, 73)) {
+               /* remap the prefix is tx is toggled on */
+               if ((scb_o->ecb & 0x10U) && !had_tx)
+                       prefix_unmapped(vsie_page);
+               scb_s->ecb |= scb_o->ecb & 0x10U;
+       }
+       /* SIMD */
+       if (test_kvm_facility(vcpu->kvm, 129)) {
+               scb_s->eca |= scb_o->eca & 0x00020000U;
+               scb_s->ecd |= scb_o->ecd & 0x20000000U;
+       }
+       /* Run-time-Instrumentation */
+       if (test_kvm_facility(vcpu->kvm, 64))
+               scb_s->ecb3 |= scb_o->ecb3 & 0x01U;
+       if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_SIIF))
+               scb_s->eca |= scb_o->eca & 0x00000001U;
+       if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_IB))
+               scb_s->eca |= scb_o->eca & 0x40000000U;
+       if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_CEI))
+               scb_s->eca |= scb_o->eca & 0x80000000U;
+
+       prepare_ibc(vcpu, vsie_page);
+       rc = shadow_crycb(vcpu, vsie_page);
+out:
+       if (rc)
+               unshadow_scb(vcpu, vsie_page);
+       return rc;
+}
+
+void kvm_s390_vsie_gmap_notifier(struct gmap *gmap, unsigned long start,
+                                unsigned long end)
+{
+       struct kvm *kvm = gmap->private;
+       struct vsie_page *cur;
+       unsigned long prefix;
+       struct page *page;
+       int i;
+
+       if (!gmap_is_shadow(gmap))
+               return;
+       if (start >= 1UL << 31)
+               /* We are only interested in prefix pages */
+               return;
+
+       /*
+        * Only new shadow blocks are added to the list during runtime,
+        * therefore we can safely reference them all the time.
+        */
+       for (i = 0; i < kvm->arch.vsie.page_count; i++) {
+               page = READ_ONCE(kvm->arch.vsie.pages[i]);
+               if (!page)
+                       continue;
+               cur = page_to_virt(page);
+               if (READ_ONCE(cur->gmap) != gmap)
+                       continue;
+               prefix = cur->scb_s.prefix << GUEST_PREFIX_SHIFT;
+               /* with mso/msl, the prefix lies at an offset */
+               prefix += cur->scb_s.mso;
+               if (prefix <= end && start <= prefix + 2 * PAGE_SIZE - 1)
+                       prefix_unmapped_sync(cur);
+       }
+}
+
+/*
+ * Map the first prefix page and if tx is enabled also the second prefix page.
+ *
+ * The prefix will be protected, a gmap notifier will inform about unmaps.
+ * The shadow scb must not be executed until the prefix is remapped, this is
+ * guaranteed by properly handling PROG_REQUEST.
+ *
+ * Returns: - 0 on if successfully mapped or already mapped
+ *          - > 0 if control has to be given to guest 2
+ *          - -EAGAIN if the caller can retry immediately
+ *          - -ENOMEM if out of memory
+ */
+static int map_prefix(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
+{
+       struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
+       u64 prefix = scb_s->prefix << GUEST_PREFIX_SHIFT;
+       int rc;
+
+       if (prefix_is_mapped(vsie_page))
+               return 0;
+
+       /* mark it as mapped so we can catch any concurrent unmappers */
+       prefix_mapped(vsie_page);
+
+       /* with mso/msl, the prefix lies at offset *mso* */
+       prefix += scb_s->mso;
+
+       rc = kvm_s390_shadow_fault(vcpu, vsie_page->gmap, prefix);
+       if (!rc && (scb_s->ecb & 0x10U))
+               rc = kvm_s390_shadow_fault(vcpu, vsie_page->gmap,
+                                          prefix + PAGE_SIZE);
+       /*
+        * We don't have to mprotect, we will be called for all unshadows.
+        * SIE will detect if protection applies and trigger a validity.
+        */
+       if (rc)
+               prefix_unmapped(vsie_page);
+       if (rc > 0 || rc == -EFAULT)
+               rc = set_validity_icpt(scb_s, 0x0037U);
+       return rc;
+}
+
+/*
+ * Pin the guest page given by gpa and set hpa to the pinned host address.
+ * Will always be pinned writable.
+ *
+ * Returns: - 0 on success
+ *          - -EINVAL if the gpa is not valid guest storage
+ *          - -ENOMEM if out of memory
+ */
+static int pin_guest_page(struct kvm *kvm, gpa_t gpa, hpa_t *hpa)
+{
+       struct page *page;
+       hva_t hva;
+       int rc;
+
+       hva = gfn_to_hva(kvm, gpa_to_gfn(gpa));
+       if (kvm_is_error_hva(hva))
+               return -EINVAL;
+       rc = get_user_pages_fast(hva, 1, 1, &page);
+       if (rc < 0)
+               return rc;
+       else if (rc != 1)
+               return -ENOMEM;
+       *hpa = (hpa_t) page_to_virt(page) + (gpa & ~PAGE_MASK);
+       return 0;
+}
+
+/* Unpins a page previously pinned via pin_guest_page, marking it as dirty. */
+static void unpin_guest_page(struct kvm *kvm, gpa_t gpa, hpa_t hpa)
+{
+       struct page *page;
+
+       page = virt_to_page(hpa);
+       set_page_dirty_lock(page);
+       put_page(page);
+       /* mark the page always as dirty for migration */
+       mark_page_dirty(kvm, gpa_to_gfn(gpa));
+}
+
+/* unpin all blocks previously pinned by pin_blocks(), marking them dirty */
+static void unpin_blocks(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
+{
+       struct kvm_s390_sie_block *scb_o = vsie_page->scb_o;
+       struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
+       hpa_t hpa;
+       gpa_t gpa;
+
+       hpa = (u64) scb_s->scaoh << 32 | scb_s->scaol;
+       if (hpa) {
+               gpa = scb_o->scaol & ~0xfUL;
+               if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_64BSCAO))
+                       gpa |= (u64) scb_o->scaoh << 32;
+               unpin_guest_page(vcpu->kvm, gpa, hpa);
+               scb_s->scaol = 0;
+               scb_s->scaoh = 0;
+       }
+
+       hpa = scb_s->itdba;
+       if (hpa) {
+               gpa = scb_o->itdba & ~0xffUL;
+               unpin_guest_page(vcpu->kvm, gpa, hpa);
+               scb_s->itdba = 0;
+       }
+
+       hpa = scb_s->gvrd;
+       if (hpa) {
+               gpa = scb_o->gvrd & ~0x1ffUL;
+               unpin_guest_page(vcpu->kvm, gpa, hpa);
+               scb_s->gvrd = 0;
+       }
+
+       hpa = scb_s->riccbd;
+       if (hpa) {
+               gpa = scb_o->riccbd & ~0x3fUL;
+               unpin_guest_page(vcpu->kvm, gpa, hpa);
+               scb_s->riccbd = 0;
+       }
+}
+
+/*
+ * Instead of shadowing some blocks, we can simply forward them because the
+ * addresses in the scb are 64 bit long.
+ *
+ * This works as long as the data lies in one page. If blocks ever exceed one
+ * page, we have to fall back to shadowing.
+ *
+ * As we reuse the sca, the vcpu pointers contained in it are invalid. We must
+ * therefore not enable any facilities that access these pointers (e.g. SIGPIF).
+ *
+ * Returns: - 0 if all blocks were pinned.
+ *          - > 0 if control has to be given to guest 2
+ *          - -ENOMEM if out of memory
+ */
+static int pin_blocks(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
+{
+       struct kvm_s390_sie_block *scb_o = vsie_page->scb_o;
+       struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
+       hpa_t hpa;
+       gpa_t gpa;
+       int rc = 0;
+
+       gpa = scb_o->scaol & ~0xfUL;
+       if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_64BSCAO))
+               gpa |= (u64) scb_o->scaoh << 32;
+       if (gpa) {
+               if (!(gpa & ~0x1fffUL))
+                       rc = set_validity_icpt(scb_s, 0x0038U);
+               else if ((gpa & ~0x1fffUL) == kvm_s390_get_prefix(vcpu))
+                       rc = set_validity_icpt(scb_s, 0x0011U);
+               else if ((gpa & PAGE_MASK) !=
+                        ((gpa + sizeof(struct bsca_block) - 1) & PAGE_MASK))
+                       rc = set_validity_icpt(scb_s, 0x003bU);
+               if (!rc) {
+                       rc = pin_guest_page(vcpu->kvm, gpa, &hpa);
+                       if (rc == -EINVAL)
+                               rc = set_validity_icpt(scb_s, 0x0034U);
+               }
+               if (rc)
+                       goto unpin;
+               scb_s->scaoh = (u32)((u64)hpa >> 32);
+               scb_s->scaol = (u32)(u64)hpa;
+       }
+
+       gpa = scb_o->itdba & ~0xffUL;
+       if (gpa && (scb_s->ecb & 0x10U)) {
+               if (!(gpa & ~0x1fffU)) {
+                       rc = set_validity_icpt(scb_s, 0x0080U);
+                       goto unpin;
+               }
+               /* 256 bytes cannot cross page boundaries */
+               rc = pin_guest_page(vcpu->kvm, gpa, &hpa);
+               if (rc == -EINVAL)
+                       rc = set_validity_icpt(scb_s, 0x0080U);
+               if (rc)
+                       goto unpin;
+               scb_s->itdba = hpa;
+       }
+
+       gpa = scb_o->gvrd & ~0x1ffUL;
+       if (gpa && (scb_s->eca & 0x00020000U) &&
+           !(scb_s->ecd & 0x20000000U)) {
+               if (!(gpa & ~0x1fffUL)) {
+                       rc = set_validity_icpt(scb_s, 0x1310U);
+                       goto unpin;
+               }
+               /*
+                * 512 bytes vector registers cannot cross page boundaries
+                * if this block gets bigger, we have to shadow it.
+                */
+               rc = pin_guest_page(vcpu->kvm, gpa, &hpa);
+               if (rc == -EINVAL)
+                       rc = set_validity_icpt(scb_s, 0x1310U);
+               if (rc)
+                       goto unpin;
+               scb_s->gvrd = hpa;
+       }
+
+       gpa = scb_o->riccbd & ~0x3fUL;
+       if (gpa && (scb_s->ecb3 & 0x01U)) {
+               if (!(gpa & ~0x1fffUL)) {
+                       rc = set_validity_icpt(scb_s, 0x0043U);
+                       goto unpin;
+               }
+               /* 64 bytes cannot cross page boundaries */
+               rc = pin_guest_page(vcpu->kvm, gpa, &hpa);
+               if (rc == -EINVAL)
+                       rc = set_validity_icpt(scb_s, 0x0043U);
+               /* Validity 0x0044 will be checked by SIE */
+               if (rc)
+                       goto unpin;
+               scb_s->gvrd = hpa;
+       }
+       return 0;
+unpin:
+       unpin_blocks(vcpu, vsie_page);
+       return rc;
+}
+
+/* unpin the scb provided by guest 2, marking it as dirty */
+static void unpin_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page,
+                     gpa_t gpa)
+{
+       hpa_t hpa = (hpa_t) vsie_page->scb_o;
+
+       if (hpa)
+               unpin_guest_page(vcpu->kvm, gpa, hpa);
+       vsie_page->scb_o = NULL;
+}
+
+/*
+ * Pin the scb at gpa provided by guest 2 at vsie_page->scb_o.
+ *
+ * Returns: - 0 if the scb was pinned.
+ *          - > 0 if control has to be given to guest 2
+ *          - -ENOMEM if out of memory
+ */
+static int pin_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page,
+                  gpa_t gpa)
+{
+       hpa_t hpa;
+       int rc;
+
+       rc = pin_guest_page(vcpu->kvm, gpa, &hpa);
+       if (rc == -EINVAL) {
+               rc = kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
+               if (!rc)
+                       rc = 1;
+       }
+       if (!rc)
+               vsie_page->scb_o = (struct kvm_s390_sie_block *) hpa;
+       return rc;
+}
+
+/*
+ * Inject a fault into guest 2.
+ *
+ * Returns: - > 0 if control has to be given to guest 2
+ *            < 0 if an error occurred during injection.
+ */
+static int inject_fault(struct kvm_vcpu *vcpu, __u16 code, __u64 vaddr,
+                       bool write_flag)
+{
+       struct kvm_s390_pgm_info pgm = {
+               .code = code,
+               .trans_exc_code =
+                       /* 0-51: virtual address */
+                       (vaddr & 0xfffffffffffff000UL) |
+                       /* 52-53: store / fetch */
+                       (((unsigned int) !write_flag) + 1) << 10,
+                       /* 62-63: asce id (alway primary == 0) */
+               .exc_access_id = 0, /* always primary */
+               .op_access_id = 0, /* not MVPG */
+       };
+       int rc;
+
+       if (code == PGM_PROTECTION)
+               pgm.trans_exc_code |= 0x4UL;
+
+       rc = kvm_s390_inject_prog_irq(vcpu, &pgm);
+       return rc ? rc : 1;
+}
+
+/*
+ * Handle a fault during vsie execution on a gmap shadow.
+ *
+ * Returns: - 0 if the fault was resolved
+ *          - > 0 if control has to be given to guest 2
+ *          - < 0 if an error occurred
+ */
+static int handle_fault(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
+{
+       int rc;
+
+       if (current->thread.gmap_int_code == PGM_PROTECTION)
+               /* we can directly forward all protection exceptions */
+               return inject_fault(vcpu, PGM_PROTECTION,
+                                   current->thread.gmap_addr, 1);
+
+       rc = kvm_s390_shadow_fault(vcpu, vsie_page->gmap,
+                                  current->thread.gmap_addr);
+       if (rc > 0) {
+               rc = inject_fault(vcpu, rc,
+                                 current->thread.gmap_addr,
+                                 current->thread.gmap_write_flag);
+               if (rc >= 0)
+                       vsie_page->fault_addr = current->thread.gmap_addr;
+       }
+       return rc;
+}
+
+/*
+ * Retry the previous fault that required guest 2 intervention. This avoids
+ * one superfluous SIE re-entry and direct exit.
+ *
+ * Will ignore any errors. The next SIE fault will do proper fault handling.
+ */
+static void handle_last_fault(struct kvm_vcpu *vcpu,
+                             struct vsie_page *vsie_page)
+{
+       if (vsie_page->fault_addr)
+               kvm_s390_shadow_fault(vcpu, vsie_page->gmap,
+                                     vsie_page->fault_addr);
+       vsie_page->fault_addr = 0;
+}
+
+static inline void clear_vsie_icpt(struct vsie_page *vsie_page)
+{
+       vsie_page->scb_s.icptcode = 0;
+}
+
+/* rewind the psw and clear the vsie icpt, so we can retry execution */
+static void retry_vsie_icpt(struct vsie_page *vsie_page)
+{
+       struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
+       int ilen = insn_length(scb_s->ipa >> 8);
+
+       /* take care of EXECUTE instructions */
+       if (scb_s->icptstatus & 1) {
+               ilen = (scb_s->icptstatus >> 4) & 0x6;
+               if (!ilen)
+                       ilen = 4;
+       }
+       scb_s->gpsw.addr = __rewind_psw(scb_s->gpsw, ilen);
+       clear_vsie_icpt(vsie_page);
+}
+
+/*
+ * Try to shadow + enable the guest 2 provided facility list.
+ * Retry instruction execution if enabled for and provided by guest 2.
+ *
+ * Returns: - 0 if handled (retry or guest 2 icpt)
+ *          - > 0 if control has to be given to guest 2
+ */
+static int handle_stfle(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
+{
+       struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
+       __u32 fac = vsie_page->scb_o->fac & 0x7ffffff8U;
+
+       if (fac && test_kvm_facility(vcpu->kvm, 7)) {
+               retry_vsie_icpt(vsie_page);
+               if (read_guest_real(vcpu, fac, &vsie_page->fac,
+                                   sizeof(vsie_page->fac)))
+                       return set_validity_icpt(scb_s, 0x1090U);
+               scb_s->fac = (__u32)(__u64) &vsie_page->fac;
+       }
+       return 0;
+}
+
+/*
+ * Run the vsie on a shadow scb and a shadow gmap, without any further
+ * sanity checks, handling SIE faults.
+ *
+ * Returns: - 0 everything went fine
+ *          - > 0 if control has to be given to guest 2
+ *          - < 0 if an error occurred
+ */
+static int do_vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
+{
+       struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
+       struct kvm_s390_sie_block *scb_o = vsie_page->scb_o;
+       int rc;
+
+       handle_last_fault(vcpu, vsie_page);
+
+       if (need_resched())
+               schedule();
+       if (test_cpu_flag(CIF_MCCK_PENDING))
+               s390_handle_mcck();
+
+       srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
+       local_irq_disable();
+       kvm_guest_enter();
+       local_irq_enable();
+
+       rc = sie64a(scb_s, vcpu->run->s.regs.gprs);
+
+       local_irq_disable();
+       kvm_guest_exit();
+       local_irq_enable();
+       vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
+
+       if (rc > 0)
+               rc = 0; /* we could still have an icpt */
+       else if (rc == -EFAULT)
+               return handle_fault(vcpu, vsie_page);
+
+       switch (scb_s->icptcode) {
+       case ICPT_INST:
+               if (scb_s->ipa == 0xb2b0)
+                       rc = handle_stfle(vcpu, vsie_page);
+               break;
+       case ICPT_STOP:
+               /* stop not requested by g2 - must have been a kick */
+               if (!(atomic_read(&scb_o->cpuflags) & CPUSTAT_STOP_INT))
+                       clear_vsie_icpt(vsie_page);
+               break;
+       case ICPT_VALIDITY:
+               if ((scb_s->ipa & 0xf000) != 0xf000)
+                       scb_s->ipa += 0x1000;
+               break;
+       }
+       return rc;
+}
+
+static void release_gmap_shadow(struct vsie_page *vsie_page)
+{
+       if (vsie_page->gmap)
+               gmap_put(vsie_page->gmap);
+       WRITE_ONCE(vsie_page->gmap, NULL);
+       prefix_unmapped(vsie_page);
+}
+
+static int acquire_gmap_shadow(struct kvm_vcpu *vcpu,
+                              struct vsie_page *vsie_page)
+{
+       unsigned long asce;
+       union ctlreg0 cr0;
+       struct gmap *gmap;
+       int edat;
+
+       asce = vcpu->arch.sie_block->gcr[1];
+       cr0.val = vcpu->arch.sie_block->gcr[0];
+       edat = cr0.edat && test_kvm_facility(vcpu->kvm, 8);
+       edat += edat && test_kvm_facility(vcpu->kvm, 78);
+
+       /*
+        * ASCE or EDAT could have changed since last icpt, or the gmap
+        * we're holding has been unshadowed. If the gmap is still valid,
+        * we can safely reuse it.
+        */
+       if (vsie_page->gmap && gmap_shadow_valid(vsie_page->gmap, asce, edat))
+               return 0;
+
+       /* release the old shadow - if any, and mark the prefix as unmapped */
+       release_gmap_shadow(vsie_page);
+       gmap = gmap_shadow(vcpu->arch.gmap, asce, edat);
+       if (IS_ERR(gmap))
+               return PTR_ERR(gmap);
+       gmap->private = vcpu->kvm;
+       WRITE_ONCE(vsie_page->gmap, gmap);
+       return 0;
+}
+
+/*
+ * Register the shadow scb at the VCPU, e.g. for kicking out of vsie.
+ */
+static void register_shadow_scb(struct kvm_vcpu *vcpu,
+                               struct vsie_page *vsie_page)
+{
+       struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
+
+       WRITE_ONCE(vcpu->arch.vsie_block, &vsie_page->scb_s);
+       /*
+        * External calls have to lead to a kick of the vcpu and
+        * therefore the vsie -> Simulate Wait state.
+        */
+       atomic_or(CPUSTAT_WAIT, &vcpu->arch.sie_block->cpuflags);
+       /*
+        * We have to adjust the g3 epoch by the g2 epoch. The epoch will
+        * automatically be adjusted on tod clock changes via kvm_sync_clock.
+        */
+       preempt_disable();
+       scb_s->epoch += vcpu->kvm->arch.epoch;
+       preempt_enable();
+}
+
+/*
+ * Unregister a shadow scb from a VCPU.
+ */
+static void unregister_shadow_scb(struct kvm_vcpu *vcpu)
+{
+       atomic_andnot(CPUSTAT_WAIT, &vcpu->arch.sie_block->cpuflags);
+       WRITE_ONCE(vcpu->arch.vsie_block, NULL);
+}
+
+/*
+ * Run the vsie on a shadowed scb, managing the gmap shadow, handling
+ * prefix pages and faults.
+ *
+ * Returns: - 0 if no errors occurred
+ *          - > 0 if control has to be given to guest 2
+ *          - -ENOMEM if out of memory
+ */
+static int vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
+{
+       struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
+       int rc = 0;
+
+       while (1) {
+               rc = acquire_gmap_shadow(vcpu, vsie_page);
+               if (!rc)
+                       rc = map_prefix(vcpu, vsie_page);
+               if (!rc) {
+                       gmap_enable(vsie_page->gmap);
+                       update_intervention_requests(vsie_page);
+                       rc = do_vsie_run(vcpu, vsie_page);
+                       gmap_enable(vcpu->arch.gmap);
+               }
+               atomic_andnot(PROG_BLOCK_SIE, &scb_s->prog20);
+
+               if (rc == -EAGAIN)
+                       rc = 0;
+               if (rc || scb_s->icptcode || signal_pending(current) ||
+                   kvm_s390_vcpu_has_irq(vcpu, 0))
+                       break;
+       };
+
+       if (rc == -EFAULT) {
+               /*
+                * Addressing exceptions are always presentes as intercepts.
+                * As addressing exceptions are suppressing and our guest 3 PSW
+                * points at the responsible instruction, we have to
+                * forward the PSW and set the ilc. If we can't read guest 3
+                * instruction, we can use an arbitrary ilc. Let's always use
+                * ilen = 4 for now, so we can avoid reading in guest 3 virtual
+                * memory. (we could also fake the shadow so the hardware
+                * handles it).
+                */
+               scb_s->icptcode = ICPT_PROGI;
+               scb_s->iprcc = PGM_ADDRESSING;
+               scb_s->pgmilc = 4;
+               scb_s->gpsw.addr = __rewind_psw(scb_s->gpsw, 4);
+       }
+       return rc;
+}
+
+/*
+ * Get or create a vsie page for a scb address.
+ *
+ * Returns: - address of a vsie page (cached or new one)
+ *          - NULL if the same scb address is already used by another VCPU
+ *          - ERR_PTR(-ENOMEM) if out of memory
+ */
+static struct vsie_page *get_vsie_page(struct kvm *kvm, unsigned long addr)
+{
+       struct vsie_page *vsie_page;
+       struct page *page;
+       int nr_vcpus;
+
+       rcu_read_lock();
+       page = radix_tree_lookup(&kvm->arch.vsie.addr_to_page, addr >> 9);
+       rcu_read_unlock();
+       if (page) {
+               if (page_ref_inc_return(page) == 2)
+                       return page_to_virt(page);
+               page_ref_dec(page);
+       }
+
+       /*
+        * We want at least #online_vcpus shadows, so every VCPU can execute
+        * the VSIE in parallel.
+        */
+       nr_vcpus = atomic_read(&kvm->online_vcpus);
+
+       mutex_lock(&kvm->arch.vsie.mutex);
+       if (kvm->arch.vsie.page_count < nr_vcpus) {
+               page = alloc_page(GFP_KERNEL | __GFP_ZERO | GFP_DMA);
+               if (!page) {
+                       mutex_unlock(&kvm->arch.vsie.mutex);
+                       return ERR_PTR(-ENOMEM);
+               }
+               page_ref_inc(page);
+               kvm->arch.vsie.pages[kvm->arch.vsie.page_count] = page;
+               kvm->arch.vsie.page_count++;
+       } else {
+               /* reuse an existing entry that belongs to nobody */
+               while (true) {
+                       page = kvm->arch.vsie.pages[kvm->arch.vsie.next];
+                       if (page_ref_inc_return(page) == 2)
+                               break;
+                       page_ref_dec(page);
+                       kvm->arch.vsie.next++;
+                       kvm->arch.vsie.next %= nr_vcpus;
+               }
+               radix_tree_delete(&kvm->arch.vsie.addr_to_page, page->index >> 9);
+       }
+       page->index = addr;
+       /* double use of the same address */
+       if (radix_tree_insert(&kvm->arch.vsie.addr_to_page, addr >> 9, page)) {
+               page_ref_dec(page);
+               mutex_unlock(&kvm->arch.vsie.mutex);
+               return NULL;
+       }
+       mutex_unlock(&kvm->arch.vsie.mutex);
+
+       vsie_page = page_to_virt(page);
+       memset(&vsie_page->scb_s, 0, sizeof(struct kvm_s390_sie_block));
+       release_gmap_shadow(vsie_page);
+       vsie_page->fault_addr = 0;
+       vsie_page->scb_s.ihcpu = 0xffffU;
+       return vsie_page;
+}
+
+/* put a vsie page acquired via get_vsie_page */
+static void put_vsie_page(struct kvm *kvm, struct vsie_page *vsie_page)
+{
+       struct page *page = pfn_to_page(__pa(vsie_page) >> PAGE_SHIFT);
+
+       page_ref_dec(page);
+}
+
+int kvm_s390_handle_vsie(struct kvm_vcpu *vcpu)
+{
+       struct vsie_page *vsie_page;
+       unsigned long scb_addr;
+       int rc;
+
+       vcpu->stat.instruction_sie++;
+       if (!test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_SIEF2))
+               return -EOPNOTSUPP;
+       if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
+               return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
+
+       BUILD_BUG_ON(sizeof(struct vsie_page) != 4096);
+       scb_addr = kvm_s390_get_base_disp_s(vcpu, NULL);
+
+       /* 512 byte alignment */
+       if (unlikely(scb_addr & 0x1ffUL))
+               return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
+
+       if (signal_pending(current) || kvm_s390_vcpu_has_irq(vcpu, 0))
+               return 0;
+
+       vsie_page = get_vsie_page(vcpu->kvm, scb_addr);
+       if (IS_ERR(vsie_page))
+               return PTR_ERR(vsie_page);
+       else if (!vsie_page)
+               /* double use of sie control block - simply do nothing */
+               return 0;
+
+       rc = pin_scb(vcpu, vsie_page, scb_addr);
+       if (rc)
+               goto out_put;
+       rc = shadow_scb(vcpu, vsie_page);
+       if (rc)
+               goto out_unpin_scb;
+       rc = pin_blocks(vcpu, vsie_page);
+       if (rc)
+               goto out_unshadow;
+       register_shadow_scb(vcpu, vsie_page);
+       rc = vsie_run(vcpu, vsie_page);
+       unregister_shadow_scb(vcpu);
+       unpin_blocks(vcpu, vsie_page);
+out_unshadow:
+       unshadow_scb(vcpu, vsie_page);
+out_unpin_scb:
+       unpin_scb(vcpu, vsie_page, scb_addr);
+out_put:
+       put_vsie_page(vcpu->kvm, vsie_page);
+
+       return rc < 0 ? rc : 0;
+}
+
+/* Init the vsie data structures. To be called when a vm is initialized. */
+void kvm_s390_vsie_init(struct kvm *kvm)
+{
+       mutex_init(&kvm->arch.vsie.mutex);
+       INIT_RADIX_TREE(&kvm->arch.vsie.addr_to_page, GFP_KERNEL);
+}
+
+/* Destroy the vsie data structures. To be called when a vm is destroyed. */
+void kvm_s390_vsie_destroy(struct kvm *kvm)
+{
+       struct vsie_page *vsie_page;
+       struct page *page;
+       int i;
+
+       mutex_lock(&kvm->arch.vsie.mutex);
+       for (i = 0; i < kvm->arch.vsie.page_count; i++) {
+               page = kvm->arch.vsie.pages[i];
+               kvm->arch.vsie.pages[i] = NULL;
+               vsie_page = page_to_virt(page);
+               release_gmap_shadow(vsie_page);
+               /* free the radix tree entry */
+               radix_tree_delete(&kvm->arch.vsie.addr_to_page, page->index >> 9);
+               __free_page(page);
+       }
+       kvm->arch.vsie.page_count = 0;
+       mutex_unlock(&kvm->arch.vsie.mutex);
+}
+
+void kvm_s390_vsie_kick(struct kvm_vcpu *vcpu)
+{
+       struct kvm_s390_sie_block *scb = READ_ONCE(vcpu->arch.vsie_block);
+
+       /*
+        * Even if the VCPU lets go of the shadow sie block reference, it is
+        * still valid in the cache. So we can safely kick it.
+        */
+       if (scb) {
+               atomic_or(PROG_BLOCK_SIE, &scb->prog20);
+               if (scb->prog0c & PROG_IN_SIE)
+                       atomic_or(CPUSTAT_STOP_INT, &scb->cpuflags);
+       }
+}
diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c

index 19288c1..730e0d3 100644 (file)
--- a/arch/s390/mm/fault.c
+++ b/arch/s390/mm/fault.c
@@ -418,6 +418,8 @@ static inline int do_exception(struct pt_regs *regs, int access)
                 (struct gmap *) S390_lowcore.gmap : NULL;
         if (gmap) {
                 current->thread.gmap_addr = address;
+               current->thread.gmap_write_flag = !!(flags & FAULT_FLAG_WRITE);
+               current->thread.gmap_int_code = regs->int_code & 0xffff;
                 address = __gmap_translate(gmap, address);
                 if (address == -EFAULT) {
                         fault = VM_FAULT_BADMAP;
diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c

index cace818..af0ae6d 100644 (file)
--- a/arch/s390/mm/gmap.c
+++ b/arch/s390/mm/gmap.c
@@ -20,14 +20,16 @@
  #include <asm/gmap.h>
  #include <asm/tlb.h>
  
+#define GMAP_SHADOW_FAKE_TABLE 1ULL
+
  /**
- * gmap_alloc - allocate a guest address space
+ * gmap_alloc - allocate and initialize a guest address space
   * @mm: pointer to the parent mm_struct
   * @limit: maximum address of the gmap address space
   *
   * Returns a guest address space structure.
   */
-struct gmap *gmap_alloc(struct mm_struct *mm, unsigned long limit)
+static struct gmap *gmap_alloc(unsigned long limit)
  {
         struct gmap *gmap;
         struct page *page;
@@ -55,10 +57,14 @@ struct gmap *gmap_alloc(struct mm_struct *mm, unsigned long limit)
         if (!gmap)
                 goto out;
         INIT_LIST_HEAD(&gmap->crst_list);
+       INIT_LIST_HEAD(&gmap->children);
+       INIT_LIST_HEAD(&gmap->pt_list);
         INIT_RADIX_TREE(&gmap->guest_to_host, GFP_KERNEL);
         INIT_RADIX_TREE(&gmap->host_to_guest, GFP_ATOMIC);
+       INIT_RADIX_TREE(&gmap->host_to_rmap, GFP_ATOMIC);
         spin_lock_init(&gmap->guest_table_lock);
-       gmap->mm = mm;
+       spin_lock_init(&gmap->shadow_lock);
+       atomic_set(&gmap->ref_count, 1);
         page = alloc_pages(GFP_KERNEL, 2);
         if (!page)
                 goto out_free;
@@ -70,9 +76,6 @@ struct gmap *gmap_alloc(struct mm_struct *mm, unsigned long limit)
         gmap->asce = atype | _ASCE_TABLE_LENGTH |
                 _ASCE_USER_BITS | __pa(table);
         gmap->asce_end = limit;
-       down_write(&mm->mmap_sem);
-       list_add(&gmap->list, &mm->context.gmap_list);
-       up_write(&mm->mmap_sem);
         return gmap;
  
  out_free:
@@ -80,7 +83,28 @@ out_free:
  out:
         return NULL;
  }
-EXPORT_SYMBOL_GPL(gmap_alloc);
+
+/**
+ * gmap_create - create a guest address space
+ * @mm: pointer to the parent mm_struct
+ * @limit: maximum size of the gmap address space
+ *
+ * Returns a guest address space structure.
+ */
+struct gmap *gmap_create(struct mm_struct *mm, unsigned long limit)
+{
+       struct gmap *gmap;
+
+       gmap = gmap_alloc(limit);
+       if (!gmap)
+               return NULL;
+       gmap->mm = mm;
+       spin_lock(&mm->context.gmap_lock);
+       list_add_rcu(&gmap->list, &mm->context.gmap_list);
+       spin_unlock(&mm->context.gmap_lock);
+       return gmap;
+}
+EXPORT_SYMBOL_GPL(gmap_create);
  
  static void gmap_flush_tlb(struct gmap *gmap)
  {
@@ -114,31 +138,117 @@ static void gmap_radix_tree_free(struct radix_tree_root *root)
         } while (nr > 0);
  }
  
+static void gmap_rmap_radix_tree_free(struct radix_tree_root *root)
+{
+       struct gmap_rmap *rmap, *rnext, *head;
+       struct radix_tree_iter iter;
+       unsigned long indices[16];
+       unsigned long index;
+       void **slot;
+       int i, nr;
+
+       /* A radix tree is freed by deleting all of its entries */
+       index = 0;
+       do {
+               nr = 0;
+               radix_tree_for_each_slot(slot, root, &iter, index) {
+                       indices[nr] = iter.index;
+                       if (++nr == 16)
+                               break;
+               }
+               for (i = 0; i < nr; i++) {
+                       index = indices[i];
+                       head = radix_tree_delete(root, index);
+                       gmap_for_each_rmap_safe(rmap, rnext, head)
+                               kfree(rmap);
+               }
+       } while (nr > 0);
+}
+
  /**
   * gmap_free - free a guest address space
   * @gmap: pointer to the guest address space structure
+ *
+ * No locks required. There are no references to this gmap anymore.
   */
-void gmap_free(struct gmap *gmap)
+static void gmap_free(struct gmap *gmap)
  {
         struct page *page, *next;
  
-       /* Flush tlb. */
-       if (MACHINE_HAS_IDTE)
-               __tlb_flush_asce(gmap->mm, gmap->asce);
-       else
-               __tlb_flush_global();
-
+       /* Flush tlb of all gmaps (if not already done for shadows) */
+       if (!(gmap_is_shadow(gmap) && gmap->removed))
+               gmap_flush_tlb(gmap);
         /* Free all segment & region tables. */
         list_for_each_entry_safe(page, next, &gmap->crst_list, lru)
                 __free_pages(page, 2);
         gmap_radix_tree_free(&gmap->guest_to_host);
         gmap_radix_tree_free(&gmap->host_to_guest);
-       down_write(&gmap->mm->mmap_sem);
-       list_del(&gmap->list);
-       up_write(&gmap->mm->mmap_sem);
+
+       /* Free additional data for a shadow gmap */
+       if (gmap_is_shadow(gmap)) {
+               /* Free all page tables. */
+               list_for_each_entry_safe(page, next, &gmap->pt_list, lru)
+                       page_table_free_pgste(page);
+               gmap_rmap_radix_tree_free(&gmap->host_to_rmap);
+               /* Release reference to the parent */
+               gmap_put(gmap->parent);
+       }
+
         kfree(gmap);
  }
-EXPORT_SYMBOL_GPL(gmap_free);
+
+/**
+ * gmap_get - increase reference counter for guest address space
+ * @gmap: pointer to the guest address space structure
+ *
+ * Returns the gmap pointer
+ */
+struct gmap *gmap_get(struct gmap *gmap)
+{
+       atomic_inc(&gmap->ref_count);
+       return gmap;
+}
+EXPORT_SYMBOL_GPL(gmap_get);
+
+/**
+ * gmap_put - decrease reference counter for guest address space
+ * @gmap: pointer to the guest address space structure
+ *
+ * If the reference counter reaches zero the guest address space is freed.
+ */
+void gmap_put(struct gmap *gmap)
+{
+       if (atomic_dec_return(&gmap->ref_count) == 0)
+               gmap_free(gmap);
+}
+EXPORT_SYMBOL_GPL(gmap_put);
+
+/**
+ * gmap_remove - remove a guest address space but do not free it yet
+ * @gmap: pointer to the guest address space structure
+ */
+void gmap_remove(struct gmap *gmap)
+{
+       struct gmap *sg, *next;
+
+       /* Remove all shadow gmaps linked to this gmap */
+       if (!list_empty(&gmap->children)) {
+               spin_lock(&gmap->shadow_lock);
+               list_for_each_entry_safe(sg, next, &gmap->children, list) {
+                       list_del(&sg->list);
+                       gmap_put(sg);
+               }
+               spin_unlock(&gmap->shadow_lock);
+       }
+       /* Remove gmap from the pre-mm list */
+       spin_lock(&gmap->mm->context.gmap_lock);
+       list_del_rcu(&gmap->list);
+       spin_unlock(&gmap->mm->context.gmap_lock);
+       synchronize_rcu();
+       /* Put reference */
+       gmap_put(gmap);
+}
+EXPORT_SYMBOL_GPL(gmap_remove);
  
  /**
   * gmap_enable - switch primary space to the guest address space
@@ -160,6 +270,17 @@ void gmap_disable(struct gmap *gmap)
  }
  EXPORT_SYMBOL_GPL(gmap_disable);
  
+/**
+ * gmap_get_enabled - get a pointer to the currently enabled gmap
+ *
+ * Returns a pointer to the currently enabled gmap. 0 if none is enabled.
+ */
+struct gmap *gmap_get_enabled(void)
+{
+       return (struct gmap *) S390_lowcore.gmap;
+}
+EXPORT_SYMBOL_GPL(gmap_get_enabled);
+
  /*
   * gmap_alloc_table is assumed to be called with mmap_sem held
   */
@@ -175,7 +296,7 @@ static int gmap_alloc_table(struct gmap *gmap, unsigned long *table,
                 return -ENOMEM;
         new = (unsigned long *) page_to_phys(page);
         crst_table_init(new, init);
-       spin_lock(&gmap->mm->page_table_lock);
+       spin_lock(&gmap->guest_table_lock);
         if (*table & _REGION_ENTRY_INVALID) {
                 list_add(&page->lru, &gmap->crst_list);
                 *table = (unsigned long) new | _REGION_ENTRY_LENGTH |
@@ -183,7 +304,7 @@ static int gmap_alloc_table(struct gmap *gmap, unsigned long *table,
                 page->index = gaddr;
                 page = NULL;
         }
-       spin_unlock(&gmap->mm->page_table_lock);
+       spin_unlock(&gmap->guest_table_lock);
         if (page)
                 __free_pages(page, 2);
         return 0;
@@ -219,6 +340,7 @@ static int __gmap_unlink_by_vmaddr(struct gmap *gmap, unsigned long vmaddr)
         unsigned long *entry;
         int flush = 0;
  
+       BUG_ON(gmap_is_shadow(gmap));
         spin_lock(&gmap->guest_table_lock);
         entry = radix_tree_delete(&gmap->host_to_guest, vmaddr >> PMD_SHIFT);
         if (entry) {
@@ -258,6 +380,7 @@ int gmap_unmap_segment(struct gmap *gmap, unsigned long to, unsigned long len)
         unsigned long off;
         int flush;
  
+       BUG_ON(gmap_is_shadow(gmap));
         if ((to | len) & (PMD_SIZE - 1))
                 return -EINVAL;
         if (len == 0 || to + len < to)
@@ -289,6 +412,7 @@ int gmap_map_segment(struct gmap *gmap, unsigned long from,
         unsigned long off;
         int flush;
  
+       BUG_ON(gmap_is_shadow(gmap));
         if ((from | to | len) & (PMD_SIZE - 1))
                 return -EINVAL;
         if (len == 0 || from + len < from || to + len < to ||
@@ -326,6 +450,8 @@ EXPORT_SYMBOL_GPL(gmap_map_segment);
   * This function does not establish potentially missing page table entries.
   * The mmap_sem of the mm that belongs to the address space must be held
   * when this function gets called.
+ *
+ * Note: Can also be called for shadow gmaps.
   */
  unsigned long __gmap_translate(struct gmap *gmap, unsigned long gaddr)
  {
@@ -333,6 +459,7 @@ unsigned long __gmap_translate(struct gmap *gmap, unsigned long gaddr)
  
         vmaddr = (unsigned long)
                 radix_tree_lookup(&gmap->guest_to_host, gaddr >> PMD_SHIFT);
+       /* Note: guest_to_host is empty for a shadow gmap */
         return vmaddr ? (vmaddr | (gaddr & ~PMD_MASK)) : -EFAULT;
  }
  EXPORT_SYMBOL_GPL(__gmap_translate);
@@ -369,11 +496,13 @@ void gmap_unlink(struct mm_struct *mm, unsigned long *table,
         struct gmap *gmap;
         int flush;
  
-       list_for_each_entry(gmap, &mm->context.gmap_list, list) {
+       rcu_read_lock();
+       list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) {
                 flush = __gmap_unlink_by_vmaddr(gmap, vmaddr);
                 if (flush)
                         gmap_flush_tlb(gmap);
         }
+       rcu_read_unlock();
  }
  
  /**
@@ -397,6 +526,7 @@ int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr)
         pmd_t *pmd;
         int rc;
  
+       BUG_ON(gmap_is_shadow(gmap));
         /* Create higher level tables in the gmap page table */
         table = gmap->table;
         if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION1) {
@@ -549,116 +679,1412 @@ static LIST_HEAD(gmap_notifier_list);
  static DEFINE_SPINLOCK(gmap_notifier_lock);
  
  /**
- * gmap_register_ipte_notifier - register a pte invalidation callback
+ * gmap_register_pte_notifier - register a pte invalidation callback
   * @nb: pointer to the gmap notifier block
   */
-void gmap_register_ipte_notifier(struct gmap_notifier *nb)
+void gmap_register_pte_notifier(struct gmap_notifier *nb)
  {
         spin_lock(&gmap_notifier_lock);
-       list_add(&nb->list, &gmap_notifier_list);
+       list_add_rcu(&nb->list, &gmap_notifier_list);
         spin_unlock(&gmap_notifier_lock);
  }
-EXPORT_SYMBOL_GPL(gmap_register_ipte_notifier);
+EXPORT_SYMBOL_GPL(gmap_register_pte_notifier);
  
  /**
- * gmap_unregister_ipte_notifier - remove a pte invalidation callback
+ * gmap_unregister_pte_notifier - remove a pte invalidation callback
   * @nb: pointer to the gmap notifier block
   */
-void gmap_unregister_ipte_notifier(struct gmap_notifier *nb)
+void gmap_unregister_pte_notifier(struct gmap_notifier *nb)
  {
         spin_lock(&gmap_notifier_lock);
-       list_del_init(&nb->list);
+       list_del_rcu(&nb->list);
         spin_unlock(&gmap_notifier_lock);
+       synchronize_rcu();
+}
+EXPORT_SYMBOL_GPL(gmap_unregister_pte_notifier);
+
+/**
+ * gmap_call_notifier - call all registered invalidation callbacks
+ * @gmap: pointer to guest mapping meta data structure
+ * @start: start virtual address in the guest address space
+ * @end: end virtual address in the guest address space
+ */
+static void gmap_call_notifier(struct gmap *gmap, unsigned long start,
+                              unsigned long end)
+{
+       struct gmap_notifier *nb;
+
+       list_for_each_entry(nb, &gmap_notifier_list, list)
+               nb->notifier_call(gmap, start, end);
+}
+
+/**
+ * gmap_table_walk - walk the gmap page tables
+ * @gmap: pointer to guest mapping meta data structure
+ * @gaddr: virtual address in the guest address space
+ * @level: page table level to stop at
+ *
+ * Returns a table entry pointer for the given guest address and @level
+ * @level=0 : returns a pointer to a page table table entry (or NULL)
+ * @level=1 : returns a pointer to a segment table entry (or NULL)
+ * @level=2 : returns a pointer to a region-3 table entry (or NULL)
+ * @level=3 : returns a pointer to a region-2 table entry (or NULL)
+ * @level=4 : returns a pointer to a region-1 table entry (or NULL)
+ *
+ * Returns NULL if the gmap page tables could not be walked to the
+ * requested level.
+ *
+ * Note: Can also be called for shadow gmaps.
+ */
+static inline unsigned long *gmap_table_walk(struct gmap *gmap,
+                                            unsigned long gaddr, int level)
+{
+       unsigned long *table;
+
+       if ((gmap->asce & _ASCE_TYPE_MASK) + 4 < (level * 4))
+               return NULL;
+       if (gmap_is_shadow(gmap) && gmap->removed)
+               return NULL;
+       if (gaddr & (-1UL << (31 + ((gmap->asce & _ASCE_TYPE_MASK) >> 2)*11)))
+               return NULL;
+       table = gmap->table;
+       switch (gmap->asce & _ASCE_TYPE_MASK) {
+       case _ASCE_TYPE_REGION1:
+               table += (gaddr >> 53) & 0x7ff;
+               if (level == 4)
+                       break;
+               if (*table & _REGION_ENTRY_INVALID)
+                       return NULL;
+               table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
+               /* Fallthrough */
+       case _ASCE_TYPE_REGION2:
+               table += (gaddr >> 42) & 0x7ff;
+               if (level == 3)
+                       break;
+               if (*table & _REGION_ENTRY_INVALID)
+                       return NULL;
+               table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
+               /* Fallthrough */
+       case _ASCE_TYPE_REGION3:
+               table += (gaddr >> 31) & 0x7ff;
+               if (level == 2)
+                       break;
+               if (*table & _REGION_ENTRY_INVALID)
+                       return NULL;
+               table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
+               /* Fallthrough */
+       case _ASCE_TYPE_SEGMENT:
+               table += (gaddr >> 20) & 0x7ff;
+               if (level == 1)
+                       break;
+               if (*table & _REGION_ENTRY_INVALID)
+                       return NULL;
+               table = (unsigned long *)(*table & _SEGMENT_ENTRY_ORIGIN);
+               table += (gaddr >> 12) & 0xff;
+       }
+       return table;
+}
+
+/**
+ * gmap_pte_op_walk - walk the gmap page table, get the page table lock
+ *                   and return the pte pointer
+ * @gmap: pointer to guest mapping meta data structure
+ * @gaddr: virtual address in the guest address space
+ * @ptl: pointer to the spinlock pointer
+ *
+ * Returns a pointer to the locked pte for a guest address, or NULL
+ *
+ * Note: Can also be called for shadow gmaps.
+ */
+static pte_t *gmap_pte_op_walk(struct gmap *gmap, unsigned long gaddr,
+                              spinlock_t **ptl)
+{
+       unsigned long *table;
+
+       if (gmap_is_shadow(gmap))
+               spin_lock(&gmap->guest_table_lock);
+       /* Walk the gmap page table, lock and get pte pointer */
+       table = gmap_table_walk(gmap, gaddr, 1); /* get segment pointer */
+       if (!table || *table & _SEGMENT_ENTRY_INVALID) {
+               if (gmap_is_shadow(gmap))
+                       spin_unlock(&gmap->guest_table_lock);
+               return NULL;
+       }
+       if (gmap_is_shadow(gmap)) {
+               *ptl = &gmap->guest_table_lock;
+               return pte_offset_map((pmd_t *) table, gaddr);
+       }
+       return pte_alloc_map_lock(gmap->mm, (pmd_t *) table, gaddr, ptl);
+}
+
+/**
+ * gmap_pte_op_fixup - force a page in and connect the gmap page table
+ * @gmap: pointer to guest mapping meta data structure
+ * @gaddr: virtual address in the guest address space
+ * @vmaddr: address in the host process address space
+ * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE
+ *
+ * Returns 0 if the caller can retry __gmap_translate (might fail again),
+ * -ENOMEM if out of memory and -EFAULT if anything goes wrong while fixing
+ * up or connecting the gmap page table.
+ */
+static int gmap_pte_op_fixup(struct gmap *gmap, unsigned long gaddr,
+                            unsigned long vmaddr, int prot)
+{
+       struct mm_struct *mm = gmap->mm;
+       unsigned int fault_flags;
+       bool unlocked = false;
+
+       BUG_ON(gmap_is_shadow(gmap));
+       fault_flags = (prot == PROT_WRITE) ? FAULT_FLAG_WRITE : 0;
+       if (fixup_user_fault(current, mm, vmaddr, fault_flags, &unlocked))
+               return -EFAULT;
+       if (unlocked)
+               /* lost mmap_sem, caller has to retry __gmap_translate */
+               return 0;
+       /* Connect the page tables */
+       return __gmap_link(gmap, gaddr, vmaddr);
  }
-EXPORT_SYMBOL_GPL(gmap_unregister_ipte_notifier);
  
  /**
- * gmap_ipte_notify - mark a range of ptes for invalidation notification
+ * gmap_pte_op_end - release the page table lock
+ * @ptl: pointer to the spinlock pointer
+ */
+static void gmap_pte_op_end(spinlock_t *ptl)
+{
+       spin_unlock(ptl);
+}
+
+/*
+ * gmap_protect_range - remove access rights to memory and set pgste bits
   * @gmap: pointer to guest mapping meta data structure
   * @gaddr: virtual address in the guest address space
   * @len: size of area
+ * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE
+ * @bits: pgste notification bits to set
   *
- * Returns 0 if for each page in the given range a gmap mapping exists and
- * the invalidation notification could be set. If the gmap mapping is missing
- * for one or more pages -EFAULT is returned. If no memory could be allocated
- * -ENOMEM is returned. This function establishes missing page table entries.
+ * Returns 0 if successfully protected, -ENOMEM if out of memory and
+ * -EFAULT if gaddr is invalid (or mapping for shadows is missing).
+ *
+ * Called with sg->mm->mmap_sem in read.
+ *
+ * Note: Can also be called for shadow gmaps.
   */
-int gmap_ipte_notify(struct gmap *gmap, unsigned long gaddr, unsigned long len)
+static int gmap_protect_range(struct gmap *gmap, unsigned long gaddr,
+                             unsigned long len, int prot, unsigned long bits)
  {
-       unsigned long addr;
+       unsigned long vmaddr;
         spinlock_t *ptl;
         pte_t *ptep;
-       bool unlocked;
-       int rc = 0;
+       int rc;
+
+       while (len) {
+               rc = -EAGAIN;
+               ptep = gmap_pte_op_walk(gmap, gaddr, &ptl);
+               if (ptep) {
+                       rc = ptep_force_prot(gmap->mm, gaddr, ptep, prot, bits);
+                       gmap_pte_op_end(ptl);
+               }
+               if (rc) {
+                       vmaddr = __gmap_translate(gmap, gaddr);
+                       if (IS_ERR_VALUE(vmaddr))
+                               return vmaddr;
+                       rc = gmap_pte_op_fixup(gmap, gaddr, vmaddr, prot);
+                       if (rc)
+                               return rc;
+                       continue;
+               }
+               gaddr += PAGE_SIZE;
+               len -= PAGE_SIZE;
+       }
+       return 0;
+}
+
+/**
+ * gmap_mprotect_notify - change access rights for a range of ptes and
+ *                        call the notifier if any pte changes again
+ * @gmap: pointer to guest mapping meta data structure
+ * @gaddr: virtual address in the guest address space
+ * @len: size of area
+ * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE
+ *
+ * Returns 0 if for each page in the given range a gmap mapping exists,
+ * the new access rights could be set and the notifier could be armed.
+ * If the gmap mapping is missing for one or more pages -EFAULT is
+ * returned. If no memory could be allocated -ENOMEM is returned.
+ * This function establishes missing page table entries.
+ */
+int gmap_mprotect_notify(struct gmap *gmap, unsigned long gaddr,
+                        unsigned long len, int prot)
+{
+       int rc;
  
-       if ((gaddr & ~PAGE_MASK) || (len & ~PAGE_MASK))
+       if ((gaddr & ~PAGE_MASK) || (len & ~PAGE_MASK) || gmap_is_shadow(gmap))
+               return -EINVAL;
+       if (!MACHINE_HAS_ESOP && prot == PROT_READ)
                 return -EINVAL;
         down_read(&gmap->mm->mmap_sem);
-       while (len) {
-               unlocked = false;
-               /* Convert gmap address and connect the page tables */
-               addr = __gmap_translate(gmap, gaddr);
-               if (IS_ERR_VALUE(addr)) {
-                       rc = addr;
-                       break;
+       rc = gmap_protect_range(gmap, gaddr, len, prot, PGSTE_IN_BIT);
+       up_read(&gmap->mm->mmap_sem);
+       return rc;
+}
+EXPORT_SYMBOL_GPL(gmap_mprotect_notify);
+
+/**
+ * gmap_read_table - get an unsigned long value from a guest page table using
+ *                   absolute addressing, without marking the page referenced.
+ * @gmap: pointer to guest mapping meta data structure
+ * @gaddr: virtual address in the guest address space
+ * @val: pointer to the unsigned long value to return
+ *
+ * Returns 0 if the value was read, -ENOMEM if out of memory and -EFAULT
+ * if reading using the virtual address failed.
+ *
+ * Called with gmap->mm->mmap_sem in read.
+ */
+int gmap_read_table(struct gmap *gmap, unsigned long gaddr, unsigned long *val)
+{
+       unsigned long address, vmaddr;
+       spinlock_t *ptl;
+       pte_t *ptep, pte;
+       int rc;
+
+       while (1) {
+               rc = -EAGAIN;
+               ptep = gmap_pte_op_walk(gmap, gaddr, &ptl);
+               if (ptep) {
+                       pte = *ptep;
+                       if (pte_present(pte) && (pte_val(pte) & _PAGE_READ)) {
+                               address = pte_val(pte) & PAGE_MASK;
+                               address += gaddr & ~PAGE_MASK;
+                               *val = *(unsigned long *) address;
+                               pte_val(*ptep) |= _PAGE_YOUNG;
+                               /* Do *NOT* clear the _PAGE_INVALID bit! */
+                               rc = 0;
+                       }
+                       gmap_pte_op_end(ptl);
                 }
-               /* Get the page mapped */
-               if (fixup_user_fault(current, gmap->mm, addr, FAULT_FLAG_WRITE,
-                                    &unlocked)) {
-                       rc = -EFAULT;
+               if (!rc)
+                       break;
+               vmaddr = __gmap_translate(gmap, gaddr);
+               if (IS_ERR_VALUE(vmaddr)) {
+                       rc = vmaddr;
                         break;
                 }
-               /* While trying to map mmap_sem got unlocked. Let us retry */
-               if (unlocked)
-                       continue;
-               rc = __gmap_link(gmap, gaddr, addr);
+               rc = gmap_pte_op_fixup(gmap, gaddr, vmaddr, PROT_READ);
                 if (rc)
                         break;
-               /* Walk the process page table, lock and get pte pointer */
-               ptep = get_locked_pte(gmap->mm, addr, &ptl);
-               VM_BUG_ON(!ptep);
-               /* Set notification bit in the pgste of the pte */
-               if ((pte_val(*ptep) & (_PAGE_INVALID | _PAGE_PROTECT)) == 0) {
-                       ptep_set_notify(gmap->mm, addr, ptep);
-                       gaddr += PAGE_SIZE;
-                       len -= PAGE_SIZE;
-               }
-               pte_unmap_unlock(ptep, ptl);
         }
-       up_read(&gmap->mm->mmap_sem);
         return rc;
  }
-EXPORT_SYMBOL_GPL(gmap_ipte_notify);
+EXPORT_SYMBOL_GPL(gmap_read_table);
  
  /**
- * ptep_notify - call all invalidation callbacks for a specific pte.
- * @mm: pointer to the process mm_struct
- * @addr: virtual address in the process address space
- * @pte: pointer to the page table entry
+ * gmap_insert_rmap - add a rmap to the host_to_rmap radix tree
+ * @sg: pointer to the shadow guest address space structure
+ * @vmaddr: vm address associated with the rmap
+ * @rmap: pointer to the rmap structure
   *
- * This function is assumed to be called with the page table lock held
- * for the pte to notify.
+ * Called with the sg->guest_table_lock
   */
-void ptep_notify(struct mm_struct *mm, unsigned long vmaddr, pte_t *pte)
+static inline void gmap_insert_rmap(struct gmap *sg, unsigned long vmaddr,
+                                   struct gmap_rmap *rmap)
  {
-       unsigned long offset, gaddr;
-       unsigned long *table;
-       struct gmap_notifier *nb;
-       struct gmap *gmap;
+       void **slot;
  
-       offset = ((unsigned long) pte) & (255 * sizeof(pte_t));
-       offset = offset * (4096 / sizeof(pte_t));
-       spin_lock(&gmap_notifier_lock);
-       list_for_each_entry(gmap, &mm->context.gmap_list, list) {
-               table = radix_tree_lookup(&gmap->host_to_guest,
-                                         vmaddr >> PMD_SHIFT);
-               if (!table)
+       BUG_ON(!gmap_is_shadow(sg));
+       slot = radix_tree_lookup_slot(&sg->host_to_rmap, vmaddr >> PAGE_SHIFT);
+       if (slot) {
+               rmap->next = radix_tree_deref_slot_protected(slot,
+                                                       &sg->guest_table_lock);
+               radix_tree_replace_slot(slot, rmap);
+       } else {
+               rmap->next = NULL;
+               radix_tree_insert(&sg->host_to_rmap, vmaddr >> PAGE_SHIFT,
+                                 rmap);
+       }
+}
+
+/**
+ * gmap_protect_rmap - modify access rights to memory and create an rmap
+ * @sg: pointer to the shadow guest address space structure
+ * @raddr: rmap address in the shadow gmap
+ * @paddr: address in the parent guest address space
+ * @len: length of the memory area to protect
+ * @prot: indicates access rights: none, read-only or read-write
+ *
+ * Returns 0 if successfully protected and the rmap was created, -ENOMEM
+ * if out of memory and -EFAULT if paddr is invalid.
+ */
+static int gmap_protect_rmap(struct gmap *sg, unsigned long raddr,
+                            unsigned long paddr, unsigned long len, int prot)
+{
+       struct gmap *parent;
+       struct gmap_rmap *rmap;
+       unsigned long vmaddr;
+       spinlock_t *ptl;
+       pte_t *ptep;
+       int rc;
+
+       BUG_ON(!gmap_is_shadow(sg));
+       parent = sg->parent;
+       while (len) {
+               vmaddr = __gmap_translate(parent, paddr);
+               if (IS_ERR_VALUE(vmaddr))
+                       return vmaddr;
+               rmap = kzalloc(sizeof(*rmap), GFP_KERNEL);
+               if (!rmap)
+                       return -ENOMEM;
+               rmap->raddr = raddr;
+               rc = radix_tree_preload(GFP_KERNEL);
+               if (rc) {
+                       kfree(rmap);
+                       return rc;
+               }
+               rc = -EAGAIN;
+               ptep = gmap_pte_op_walk(parent, paddr, &ptl);
+               if (ptep) {
+                       spin_lock(&sg->guest_table_lock);
+                       rc = ptep_force_prot(parent->mm, paddr, ptep, prot,
+                                            PGSTE_VSIE_BIT);
+                       if (!rc)
+                               gmap_insert_rmap(sg, vmaddr, rmap);
+                       spin_unlock(&sg->guest_table_lock);
+                       gmap_pte_op_end(ptl);
+               }
+               radix_tree_preload_end();
+               if (rc) {
+                       kfree(rmap);
+                       rc = gmap_pte_op_fixup(parent, paddr, vmaddr, prot);
+                       if (rc)
+                               return rc;
                         continue;
-               gaddr = __gmap_segment_gaddr(table) + offset;
-               list_for_each_entry(nb, &gmap_notifier_list, list)
-                       nb->notifier_call(gmap, gaddr);
+               }
+               paddr += PAGE_SIZE;
+               len -= PAGE_SIZE;
         }
-       spin_unlock(&gmap_notifier_lock);
+       return 0;
+}
+
+#define _SHADOW_RMAP_MASK      0x7
+#define _SHADOW_RMAP_REGION1   0x5
+#define _SHADOW_RMAP_REGION2   0x4
+#define _SHADOW_RMAP_REGION3   0x3
+#define _SHADOW_RMAP_SEGMENT   0x2
+#define _SHADOW_RMAP_PGTABLE   0x1
+
+/**
+ * gmap_idte_one - invalidate a single region or segment table entry
+ * @asce: region or segment table *origin* + table-type bits
+ * @vaddr: virtual address to identify the table entry to flush
+ *
+ * The invalid bit of a single region or segment table entry is set
+ * and the associated TLB entries depending on the entry are flushed.
+ * The table-type of the @asce identifies the portion of the @vaddr
+ * that is used as the invalidation index.
+ */
+static inline void gmap_idte_one(unsigned long asce, unsigned long vaddr)
+{
+       asm volatile(
+               "       .insn   rrf,0xb98e0000,%0,%1,0,0"
+               : : "a" (asce), "a" (vaddr) : "cc", "memory");
+}
+
+/**
+ * gmap_unshadow_page - remove a page from a shadow page table
+ * @sg: pointer to the shadow guest address space structure
+ * @raddr: rmap address in the shadow guest address space
+ *
+ * Called with the sg->guest_table_lock
+ */
+static void gmap_unshadow_page(struct gmap *sg, unsigned long raddr)
+{
+       unsigned long *table;
+
+       BUG_ON(!gmap_is_shadow(sg));
+       table = gmap_table_walk(sg, raddr, 0); /* get page table pointer */
+       if (!table || *table & _PAGE_INVALID)
+               return;
+       gmap_call_notifier(sg, raddr, raddr + (1UL << 12) - 1);
+       ptep_unshadow_pte(sg->mm, raddr, (pte_t *) table);
+}
+
+/**
+ * __gmap_unshadow_pgt - remove all entries from a shadow page table
+ * @sg: pointer to the shadow guest address space structure
+ * @raddr: rmap address in the shadow guest address space
+ * @pgt: pointer to the start of a shadow page table
+ *
+ * Called with the sg->guest_table_lock
+ */
+static void __gmap_unshadow_pgt(struct gmap *sg, unsigned long raddr,
+                               unsigned long *pgt)
+{
+       int i;
+
+       BUG_ON(!gmap_is_shadow(sg));
+       for (i = 0; i < 256; i++, raddr += 1UL << 12)
+               pgt[i] = _PAGE_INVALID;
+}
+
+/**
+ * gmap_unshadow_pgt - remove a shadow page table from a segment entry
+ * @sg: pointer to the shadow guest address space structure
+ * @raddr: address in the shadow guest address space
+ *
+ * Called with the sg->guest_table_lock
+ */
+static void gmap_unshadow_pgt(struct gmap *sg, unsigned long raddr)
+{
+       unsigned long sto, *ste, *pgt;
+       struct page *page;
+
+       BUG_ON(!gmap_is_shadow(sg));
+       ste = gmap_table_walk(sg, raddr, 1); /* get segment pointer */
+       if (!ste || !(*ste & _SEGMENT_ENTRY_ORIGIN))
+               return;
+       gmap_call_notifier(sg, raddr, raddr + (1UL << 20) - 1);
+       sto = (unsigned long) (ste - ((raddr >> 20) & 0x7ff));
+       gmap_idte_one(sto | _ASCE_TYPE_SEGMENT, raddr);
+       pgt = (unsigned long *)(*ste & _SEGMENT_ENTRY_ORIGIN);
+       *ste = _SEGMENT_ENTRY_EMPTY;
+       __gmap_unshadow_pgt(sg, raddr, pgt);
+       /* Free page table */
+       page = pfn_to_page(__pa(pgt) >> PAGE_SHIFT);
+       list_del(&page->lru);
+       page_table_free_pgste(page);
+}
+
+/**
+ * __gmap_unshadow_sgt - remove all entries from a shadow segment table
+ * @sg: pointer to the shadow guest address space structure
+ * @raddr: rmap address in the shadow guest address space
+ * @sgt: pointer to the start of a shadow segment table
+ *
+ * Called with the sg->guest_table_lock
+ */
+static void __gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr,
+                               unsigned long *sgt)
+{
+       unsigned long asce, *pgt;
+       struct page *page;
+       int i;
+
+       BUG_ON(!gmap_is_shadow(sg));
+       asce = (unsigned long) sgt | _ASCE_TYPE_SEGMENT;
+       for (i = 0; i < 2048; i++, raddr += 1UL << 20) {
+               if (!(sgt[i] & _SEGMENT_ENTRY_ORIGIN))
+                       continue;
+               pgt = (unsigned long *)(sgt[i] & _REGION_ENTRY_ORIGIN);
+               sgt[i] = _SEGMENT_ENTRY_EMPTY;
+               __gmap_unshadow_pgt(sg, raddr, pgt);
+               /* Free page table */
+               page = pfn_to_page(__pa(pgt) >> PAGE_SHIFT);
+               list_del(&page->lru);
+               page_table_free_pgste(page);
+       }
+}
+
+/**
+ * gmap_unshadow_sgt - remove a shadow segment table from a region-3 entry
+ * @sg: pointer to the shadow guest address space structure
+ * @raddr: rmap address in the shadow guest address space
+ *
+ * Called with the shadow->guest_table_lock
+ */
+static void gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr)
+{
+       unsigned long r3o, *r3e, *sgt;
+       struct page *page;
+
+       BUG_ON(!gmap_is_shadow(sg));
+       r3e = gmap_table_walk(sg, raddr, 2); /* get region-3 pointer */
+       if (!r3e || !(*r3e & _REGION_ENTRY_ORIGIN))
+               return;
+       gmap_call_notifier(sg, raddr, raddr + (1UL << 31) - 1);
+       r3o = (unsigned long) (r3e - ((raddr >> 31) & 0x7ff));
+       gmap_idte_one(r3o | _ASCE_TYPE_REGION3, raddr);
+       sgt = (unsigned long *)(*r3e & _REGION_ENTRY_ORIGIN);
+       *r3e = _REGION3_ENTRY_EMPTY;
+       __gmap_unshadow_sgt(sg, raddr, sgt);
+       /* Free segment table */
+       page = pfn_to_page(__pa(sgt) >> PAGE_SHIFT);
+       list_del(&page->lru);
+       __free_pages(page, 2);
+}
+
+/**
+ * __gmap_unshadow_r3t - remove all entries from a shadow region-3 table
+ * @sg: pointer to the shadow guest address space structure
+ * @raddr: address in the shadow guest address space
+ * @r3t: pointer to the start of a shadow region-3 table
+ *
+ * Called with the sg->guest_table_lock
+ */
+static void __gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr,
+                               unsigned long *r3t)
+{
+       unsigned long asce, *sgt;
+       struct page *page;
+       int i;
+
+       BUG_ON(!gmap_is_shadow(sg));
+       asce = (unsigned long) r3t | _ASCE_TYPE_REGION3;
+       for (i = 0; i < 2048; i++, raddr += 1UL << 31) {
+               if (!(r3t[i] & _REGION_ENTRY_ORIGIN))
+                       continue;
+               sgt = (unsigned long *)(r3t[i] & _REGION_ENTRY_ORIGIN);
+               r3t[i] = _REGION3_ENTRY_EMPTY;
+               __gmap_unshadow_sgt(sg, raddr, sgt);
+               /* Free segment table */
+               page = pfn_to_page(__pa(sgt) >> PAGE_SHIFT);
+               list_del(&page->lru);
+               __free_pages(page, 2);
+       }
+}
+
+/**
+ * gmap_unshadow_r3t - remove a shadow region-3 table from a region-2 entry
+ * @sg: pointer to the shadow guest address space structure
+ * @raddr: rmap address in the shadow guest address space
+ *
+ * Called with the sg->guest_table_lock
+ */
+static void gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr)
+{
+       unsigned long r2o, *r2e, *r3t;
+       struct page *page;
+
+       BUG_ON(!gmap_is_shadow(sg));
+       r2e = gmap_table_walk(sg, raddr, 3); /* get region-2 pointer */
+       if (!r2e || !(*r2e & _REGION_ENTRY_ORIGIN))
+               return;
+       gmap_call_notifier(sg, raddr, raddr + (1UL << 42) - 1);
+       r2o = (unsigned long) (r2e - ((raddr >> 42) & 0x7ff));
+       gmap_idte_one(r2o | _ASCE_TYPE_REGION2, raddr);
+       r3t = (unsigned long *)(*r2e & _REGION_ENTRY_ORIGIN);
+       *r2e = _REGION2_ENTRY_EMPTY;
+       __gmap_unshadow_r3t(sg, raddr, r3t);
+       /* Free region 3 table */
+       page = pfn_to_page(__pa(r3t) >> PAGE_SHIFT);
+       list_del(&page->lru);
+       __free_pages(page, 2);
+}
+
+/**
+ * __gmap_unshadow_r2t - remove all entries from a shadow region-2 table
+ * @sg: pointer to the shadow guest address space structure
+ * @raddr: rmap address in the shadow guest address space
+ * @r2t: pointer to the start of a shadow region-2 table
+ *
+ * Called with the sg->guest_table_lock
+ */
+static void __gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr,
+                               unsigned long *r2t)
+{
+       unsigned long asce, *r3t;
+       struct page *page;
+       int i;
+
+       BUG_ON(!gmap_is_shadow(sg));
+       asce = (unsigned long) r2t | _ASCE_TYPE_REGION2;
+       for (i = 0; i < 2048; i++, raddr += 1UL << 42) {
+               if (!(r2t[i] & _REGION_ENTRY_ORIGIN))
+                       continue;
+               r3t = (unsigned long *)(r2t[i] & _REGION_ENTRY_ORIGIN);
+               r2t[i] = _REGION2_ENTRY_EMPTY;
+               __gmap_unshadow_r3t(sg, raddr, r3t);
+               /* Free region 3 table */
+               page = pfn_to_page(__pa(r3t) >> PAGE_SHIFT);
+               list_del(&page->lru);
+               __free_pages(page, 2);
+       }
+}
+
+/**
+ * gmap_unshadow_r2t - remove a shadow region-2 table from a region-1 entry
+ * @sg: pointer to the shadow guest address space structure
+ * @raddr: rmap address in the shadow guest address space
+ *
+ * Called with the sg->guest_table_lock
+ */
+static void gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr)
+{
+       unsigned long r1o, *r1e, *r2t;
+       struct page *page;
+
+       BUG_ON(!gmap_is_shadow(sg));
+       r1e = gmap_table_walk(sg, raddr, 4); /* get region-1 pointer */
+       if (!r1e || !(*r1e & _REGION_ENTRY_ORIGIN))
+               return;
+       gmap_call_notifier(sg, raddr, raddr + (1UL << 53) - 1);
+       r1o = (unsigned long) (r1e - ((raddr >> 53) & 0x7ff));
+       gmap_idte_one(r1o | _ASCE_TYPE_REGION1, raddr);
+       r2t = (unsigned long *)(*r1e & _REGION_ENTRY_ORIGIN);
+       *r1e = _REGION1_ENTRY_EMPTY;
+       __gmap_unshadow_r2t(sg, raddr, r2t);
+       /* Free region 2 table */
+       page = pfn_to_page(__pa(r2t) >> PAGE_SHIFT);
+       list_del(&page->lru);
+       __free_pages(page, 2);
+}
+
+/**
+ * __gmap_unshadow_r1t - remove all entries from a shadow region-1 table
+ * @sg: pointer to the shadow guest address space structure
+ * @raddr: rmap address in the shadow guest address space
+ * @r1t: pointer to the start of a shadow region-1 table
+ *
+ * Called with the shadow->guest_table_lock
+ */
+static void __gmap_unshadow_r1t(struct gmap *sg, unsigned long raddr,
+                               unsigned long *r1t)
+{
+       unsigned long asce, *r2t;
+       struct page *page;
+       int i;
+
+       BUG_ON(!gmap_is_shadow(sg));
+       asce = (unsigned long) r1t | _ASCE_TYPE_REGION1;
+       for (i = 0; i < 2048; i++, raddr += 1UL << 53) {
+               if (!(r1t[i] & _REGION_ENTRY_ORIGIN))
+                       continue;
+               r2t = (unsigned long *)(r1t[i] & _REGION_ENTRY_ORIGIN);
+               __gmap_unshadow_r2t(sg, raddr, r2t);
+               /* Clear entry and flush translation r1t -> r2t */
+               gmap_idte_one(asce, raddr);
+               r1t[i] = _REGION1_ENTRY_EMPTY;
+               /* Free region 2 table */
+               page = pfn_to_page(__pa(r2t) >> PAGE_SHIFT);
+               list_del(&page->lru);
+               __free_pages(page, 2);
+       }
+}
+
+/**
+ * gmap_unshadow - remove a shadow page table completely
+ * @sg: pointer to the shadow guest address space structure
+ *
+ * Called with sg->guest_table_lock
+ */
+static void gmap_unshadow(struct gmap *sg)
+{
+       unsigned long *table;
+
+       BUG_ON(!gmap_is_shadow(sg));
+       if (sg->removed)
+               return;
+       sg->removed = 1;
+       gmap_call_notifier(sg, 0, -1UL);
+       gmap_flush_tlb(sg);
+       table = (unsigned long *)(sg->asce & _ASCE_ORIGIN);
+       switch (sg->asce & _ASCE_TYPE_MASK) {
+       case _ASCE_TYPE_REGION1:
+               __gmap_unshadow_r1t(sg, 0, table);
+               break;
+       case _ASCE_TYPE_REGION2:
+               __gmap_unshadow_r2t(sg, 0, table);
+               break;
+       case _ASCE_TYPE_REGION3:
+               __gmap_unshadow_r3t(sg, 0, table);
+               break;
+       case _ASCE_TYPE_SEGMENT:
+               __gmap_unshadow_sgt(sg, 0, table);
+               break;
+       }
+}
+
+/**
+ * gmap_find_shadow - find a specific asce in the list of shadow tables
+ * @parent: pointer to the parent gmap
+ * @asce: ASCE for which the shadow table is created
+ * @edat_level: edat level to be used for the shadow translation
+ *
+ * Returns the pointer to a gmap if a shadow table with the given asce is
+ * already available, ERR_PTR(-EAGAIN) if another one is just being created,
+ * otherwise NULL
+ */
+static struct gmap *gmap_find_shadow(struct gmap *parent, unsigned long asce,
+                                    int edat_level)
+{
+       struct gmap *sg;
+
+       list_for_each_entry(sg, &parent->children, list) {
+               if (sg->orig_asce != asce || sg->edat_level != edat_level ||
+                   sg->removed)
+                       continue;
+               if (!sg->initialized)
+                       return ERR_PTR(-EAGAIN);
+               atomic_inc(&sg->ref_count);
+               return sg;
+       }
+       return NULL;
+}
+
+/**
+ * gmap_shadow_valid - check if a shadow guest address space matches the
+ *                     given properties and is still valid
+ * @sg: pointer to the shadow guest address space structure
+ * @asce: ASCE for which the shadow table is requested
+ * @edat_level: edat level to be used for the shadow translation
+ *
+ * Returns 1 if the gmap shadow is still valid and matches the given
+ * properties, the caller can continue using it. Returns 0 otherwise, the
+ * caller has to request a new shadow gmap in this case.
+ *
+ */
+int gmap_shadow_valid(struct gmap *sg, unsigned long asce, int edat_level)
+{
+       if (sg->removed)
+               return 0;
+       return sg->orig_asce == asce && sg->edat_level == edat_level;
+}
+EXPORT_SYMBOL_GPL(gmap_shadow_valid);
+
+/**
+ * gmap_shadow - create/find a shadow guest address space
+ * @parent: pointer to the parent gmap
+ * @asce: ASCE for which the shadow table is created
+ * @edat_level: edat level to be used for the shadow translation
+ *
+ * The pages of the top level page table referred by the asce parameter
+ * will be set to read-only and marked in the PGSTEs of the kvm process.
+ * The shadow table will be removed automatically on any change to the
+ * PTE mapping for the source table.
+ *
+ * Returns a guest address space structure, ERR_PTR(-ENOMEM) if out of memory,
+ * ERR_PTR(-EAGAIN) if the caller has to retry and ERR_PTR(-EFAULT) if the
+ * parent gmap table could not be protected.
+ */
+struct gmap *gmap_shadow(struct gmap *parent, unsigned long asce,
+                        int edat_level)
+{
+       struct gmap *sg, *new;
+       unsigned long limit;
+       int rc;
+
+       BUG_ON(gmap_is_shadow(parent));
+       spin_lock(&parent->shadow_lock);
+       sg = gmap_find_shadow(parent, asce, edat_level);
+       spin_unlock(&parent->shadow_lock);
+       if (sg)
+               return sg;
+       /* Create a new shadow gmap */
+       limit = -1UL >> (33 - (((asce & _ASCE_TYPE_MASK) >> 2) * 11));
+       if (asce & _ASCE_REAL_SPACE)
+               limit = -1UL;
+       new = gmap_alloc(limit);
+       if (!new)
+               return ERR_PTR(-ENOMEM);
+       new->mm = parent->mm;
+       new->parent = gmap_get(parent);
+       new->orig_asce = asce;
+       new->edat_level = edat_level;
+       new->initialized = false;
+       spin_lock(&parent->shadow_lock);
+       /* Recheck if another CPU created the same shadow */
+       sg = gmap_find_shadow(parent, asce, edat_level);
+       if (sg) {
+               spin_unlock(&parent->shadow_lock);
+               gmap_free(new);
+               return sg;
+       }
+       if (asce & _ASCE_REAL_SPACE) {
+               /* only allow one real-space gmap shadow */
+               list_for_each_entry(sg, &parent->children, list) {
+                       if (sg->orig_asce & _ASCE_REAL_SPACE) {
+                               spin_lock(&sg->guest_table_lock);
+                               gmap_unshadow(sg);
+                               spin_unlock(&sg->guest_table_lock);
+                               list_del(&sg->list);
+                               gmap_put(sg);
+                               break;
+                       }
+               }
+       }
+       atomic_set(&new->ref_count, 2);
+       list_add(&new->list, &parent->children);
+       if (asce & _ASCE_REAL_SPACE) {
+               /* nothing to protect, return right away */
+               new->initialized = true;
+               spin_unlock(&parent->shadow_lock);
+               return new;
+       }
+       spin_unlock(&parent->shadow_lock);
+       /* protect after insertion, so it will get properly invalidated */
+       down_read(&parent->mm->mmap_sem);
+       rc = gmap_protect_range(parent, asce & _ASCE_ORIGIN,
+                               ((asce & _ASCE_TABLE_LENGTH) + 1) * 4096,
+                               PROT_READ, PGSTE_VSIE_BIT);
+       up_read(&parent->mm->mmap_sem);
+       spin_lock(&parent->shadow_lock);
+       new->initialized = true;
+       if (rc) {
+               list_del(&new->list);
+               gmap_free(new);
+               new = ERR_PTR(rc);
+       }
+       spin_unlock(&parent->shadow_lock);
+       return new;
+}
+EXPORT_SYMBOL_GPL(gmap_shadow);
+
+/**
+ * gmap_shadow_r2t - create an empty shadow region 2 table
+ * @sg: pointer to the shadow guest address space structure
+ * @saddr: faulting address in the shadow gmap
+ * @r2t: parent gmap address of the region 2 table to get shadowed
+ * @fake: r2t references contiguous guest memory block, not a r2t
+ *
+ * The r2t parameter specifies the address of the source table. The
+ * four pages of the source table are made read-only in the parent gmap
+ * address space. A write to the source table area @r2t will automatically
+ * remove the shadow r2 table and all of its decendents.
+ *
+ * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the
+ * shadow table structure is incomplete, -ENOMEM if out of memory and
+ * -EFAULT if an address in the parent gmap could not be resolved.
+ *
+ * Called with sg->mm->mmap_sem in read.
+ */
+int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t,
+                   int fake)
+{
+       unsigned long raddr, origin, offset, len;
+       unsigned long *s_r2t, *table;
+       struct page *page;
+       int rc;
+
+       BUG_ON(!gmap_is_shadow(sg));
+       /* Allocate a shadow region second table */
+       page = alloc_pages(GFP_KERNEL, 2);
+       if (!page)
+               return -ENOMEM;
+       page->index = r2t & _REGION_ENTRY_ORIGIN;
+       if (fake)
+               page->index |= GMAP_SHADOW_FAKE_TABLE;
+       s_r2t = (unsigned long *) page_to_phys(page);
+       /* Install shadow region second table */
+       spin_lock(&sg->guest_table_lock);
+       table = gmap_table_walk(sg, saddr, 4); /* get region-1 pointer */
+       if (!table) {
+               rc = -EAGAIN;           /* Race with unshadow */
+               goto out_free;
+       }
+       if (!(*table & _REGION_ENTRY_INVALID)) {
+               rc = 0;                 /* Already established */
+               goto out_free;
+       } else if (*table & _REGION_ENTRY_ORIGIN) {
+               rc = -EAGAIN;           /* Race with shadow */
+               goto out_free;
+       }
+       crst_table_init(s_r2t, _REGION2_ENTRY_EMPTY);
+       /* mark as invalid as long as the parent table is not protected */
+       *table = (unsigned long) s_r2t | _REGION_ENTRY_LENGTH |
+                _REGION_ENTRY_TYPE_R1 | _REGION_ENTRY_INVALID;
+       if (sg->edat_level >= 1)
+               *table |= (r2t & _REGION_ENTRY_PROTECT);
+       list_add(&page->lru, &sg->crst_list);
+       if (fake) {
+               /* nothing to protect for fake tables */
+               *table &= ~_REGION_ENTRY_INVALID;
+               spin_unlock(&sg->guest_table_lock);
+               return 0;
+       }
+       spin_unlock(&sg->guest_table_lock);
+       /* Make r2t read-only in parent gmap page table */
+       raddr = (saddr & 0xffe0000000000000UL) | _SHADOW_RMAP_REGION1;
+       origin = r2t & _REGION_ENTRY_ORIGIN;
+       offset = ((r2t & _REGION_ENTRY_OFFSET) >> 6) * 4096;
+       len = ((r2t & _REGION_ENTRY_LENGTH) + 1) * 4096 - offset;
+       rc = gmap_protect_rmap(sg, raddr, origin + offset, len, PROT_READ);
+       spin_lock(&sg->guest_table_lock);
+       if (!rc) {
+               table = gmap_table_walk(sg, saddr, 4);
+               if (!table || (*table & _REGION_ENTRY_ORIGIN) !=
+                             (unsigned long) s_r2t)
+                       rc = -EAGAIN;           /* Race with unshadow */
+               else
+                       *table &= ~_REGION_ENTRY_INVALID;
+       } else {
+               gmap_unshadow_r2t(sg, raddr);
+       }
+       spin_unlock(&sg->guest_table_lock);
+       return rc;
+out_free:
+       spin_unlock(&sg->guest_table_lock);
+       __free_pages(page, 2);
+       return rc;
+}
+EXPORT_SYMBOL_GPL(gmap_shadow_r2t);
+
+/**
+ * gmap_shadow_r3t - create a shadow region 3 table
+ * @sg: pointer to the shadow guest address space structure
+ * @saddr: faulting address in the shadow gmap
+ * @r3t: parent gmap address of the region 3 table to get shadowed
+ * @fake: r3t references contiguous guest memory block, not a r3t
+ *
+ * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the
+ * shadow table structure is incomplete, -ENOMEM if out of memory and
+ * -EFAULT if an address in the parent gmap could not be resolved.
+ *
+ * Called with sg->mm->mmap_sem in read.
+ */
+int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t,
+                   int fake)
+{
+       unsigned long raddr, origin, offset, len;
+       unsigned long *s_r3t, *table;
+       struct page *page;
+       int rc;
+
+       BUG_ON(!gmap_is_shadow(sg));
+       /* Allocate a shadow region second table */
+       page = alloc_pages(GFP_KERNEL, 2);
+       if (!page)
+               return -ENOMEM;
+       page->index = r3t & _REGION_ENTRY_ORIGIN;
+       if (fake)
+               page->index |= GMAP_SHADOW_FAKE_TABLE;
+       s_r3t = (unsigned long *) page_to_phys(page);
+       /* Install shadow region second table */
+       spin_lock(&sg->guest_table_lock);
+       table = gmap_table_walk(sg, saddr, 3); /* get region-2 pointer */
+       if (!table) {
+               rc = -EAGAIN;           /* Race with unshadow */
+               goto out_free;
+       }
+       if (!(*table & _REGION_ENTRY_INVALID)) {
+               rc = 0;                 /* Already established */
+               goto out_free;
+       } else if (*table & _REGION_ENTRY_ORIGIN) {
+               rc = -EAGAIN;           /* Race with shadow */
+       }
+       crst_table_init(s_r3t, _REGION3_ENTRY_EMPTY);
+       /* mark as invalid as long as the parent table is not protected */
+       *table = (unsigned long) s_r3t | _REGION_ENTRY_LENGTH |
+                _REGION_ENTRY_TYPE_R2 | _REGION_ENTRY_INVALID;
+       if (sg->edat_level >= 1)
+               *table |= (r3t & _REGION_ENTRY_PROTECT);
+       list_add(&page->lru, &sg->crst_list);
+       if (fake) {
+               /* nothing to protect for fake tables */
+               *table &= ~_REGION_ENTRY_INVALID;
+               spin_unlock(&sg->guest_table_lock);
+               return 0;
+       }
+       spin_unlock(&sg->guest_table_lock);
+       /* Make r3t read-only in parent gmap page table */
+       raddr = (saddr & 0xfffffc0000000000UL) | _SHADOW_RMAP_REGION2;
+       origin = r3t & _REGION_ENTRY_ORIGIN;
+       offset = ((r3t & _REGION_ENTRY_OFFSET) >> 6) * 4096;
+       len = ((r3t & _REGION_ENTRY_LENGTH) + 1) * 4096 - offset;
+       rc = gmap_protect_rmap(sg, raddr, origin + offset, len, PROT_READ);
+       spin_lock(&sg->guest_table_lock);
+       if (!rc) {
+               table = gmap_table_walk(sg, saddr, 3);
+               if (!table || (*table & _REGION_ENTRY_ORIGIN) !=
+                             (unsigned long) s_r3t)
+                       rc = -EAGAIN;           /* Race with unshadow */
+               else
+                       *table &= ~_REGION_ENTRY_INVALID;
+       } else {
+               gmap_unshadow_r3t(sg, raddr);
+       }
+       spin_unlock(&sg->guest_table_lock);
+       return rc;
+out_free:
+       spin_unlock(&sg->guest_table_lock);
+       __free_pages(page, 2);
+       return rc;
+}
+EXPORT_SYMBOL_GPL(gmap_shadow_r3t);
+
+/**
+ * gmap_shadow_sgt - create a shadow segment table
+ * @sg: pointer to the shadow guest address space structure
+ * @saddr: faulting address in the shadow gmap
+ * @sgt: parent gmap address of the segment table to get shadowed
+ * @fake: sgt references contiguous guest memory block, not a sgt
+ *
+ * Returns: 0 if successfully shadowed or already shadowed, -EAGAIN if the
+ * shadow table structure is incomplete, -ENOMEM if out of memory and
+ * -EFAULT if an address in the parent gmap could not be resolved.
+ *
+ * Called with sg->mm->mmap_sem in read.
+ */
+int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt,
+                   int fake)
+{
+       unsigned long raddr, origin, offset, len;
+       unsigned long *s_sgt, *table;
+       struct page *page;
+       int rc;
+
+       BUG_ON(!gmap_is_shadow(sg) || (sgt & _REGION3_ENTRY_LARGE));
+       /* Allocate a shadow segment table */
+       page = alloc_pages(GFP_KERNEL, 2);
+       if (!page)
+               return -ENOMEM;
+       page->index = sgt & _REGION_ENTRY_ORIGIN;
+       if (fake)
+               page->index |= GMAP_SHADOW_FAKE_TABLE;
+       s_sgt = (unsigned long *) page_to_phys(page);
+       /* Install shadow region second table */
+       spin_lock(&sg->guest_table_lock);
+       table = gmap_table_walk(sg, saddr, 2); /* get region-3 pointer */
+       if (!table) {
+               rc = -EAGAIN;           /* Race with unshadow */
+               goto out_free;
+       }
+       if (!(*table & _REGION_ENTRY_INVALID)) {
+               rc = 0;                 /* Already established */
+               goto out_free;
+       } else if (*table & _REGION_ENTRY_ORIGIN) {
+               rc = -EAGAIN;           /* Race with shadow */
+               goto out_free;
+       }
+       crst_table_init(s_sgt, _SEGMENT_ENTRY_EMPTY);
+       /* mark as invalid as long as the parent table is not protected */
+       *table = (unsigned long) s_sgt | _REGION_ENTRY_LENGTH |
+                _REGION_ENTRY_TYPE_R3 | _REGION_ENTRY_INVALID;
+       if (sg->edat_level >= 1)
+               *table |= sgt & _REGION_ENTRY_PROTECT;
+       list_add(&page->lru, &sg->crst_list);
+       if (fake) {
+               /* nothing to protect for fake tables */
+               *table &= ~_REGION_ENTRY_INVALID;
+               spin_unlock(&sg->guest_table_lock);
+               return 0;
+       }
+       spin_unlock(&sg->guest_table_lock);
+       /* Make sgt read-only in parent gmap page table */
+       raddr = (saddr & 0xffffffff80000000UL) | _SHADOW_RMAP_REGION3;
+       origin = sgt & _REGION_ENTRY_ORIGIN;
+       offset = ((sgt & _REGION_ENTRY_OFFSET) >> 6) * 4096;
+       len = ((sgt & _REGION_ENTRY_LENGTH) + 1) * 4096 - offset;
+       rc = gmap_protect_rmap(sg, raddr, origin + offset, len, PROT_READ);
+       spin_lock(&sg->guest_table_lock);
+       if (!rc) {
+               table = gmap_table_walk(sg, saddr, 2);
+               if (!table || (*table & _REGION_ENTRY_ORIGIN) !=
+                             (unsigned long) s_sgt)
+                       rc = -EAGAIN;           /* Race with unshadow */
+               else
+                       *table &= ~_REGION_ENTRY_INVALID;
+       } else {
+               gmap_unshadow_sgt(sg, raddr);
+       }
+       spin_unlock(&sg->guest_table_lock);
+       return rc;
+out_free:
+       spin_unlock(&sg->guest_table_lock);
+       __free_pages(page, 2);
+       return rc;
+}
+EXPORT_SYMBOL_GPL(gmap_shadow_sgt);
+
+/**
+ * gmap_shadow_lookup_pgtable - find a shadow page table
+ * @sg: pointer to the shadow guest address space structure
+ * @saddr: the address in the shadow aguest address space
+ * @pgt: parent gmap address of the page table to get shadowed
+ * @dat_protection: if the pgtable is marked as protected by dat
+ * @fake: pgt references contiguous guest memory block, not a pgtable
+ *
+ * Returns 0 if the shadow page table was found and -EAGAIN if the page
+ * table was not found.
+ *
+ * Called with sg->mm->mmap_sem in read.
+ */
+int gmap_shadow_pgt_lookup(struct gmap *sg, unsigned long saddr,
+                          unsigned long *pgt, int *dat_protection,
+                          int *fake)
+{
+       unsigned long *table;
+       struct page *page;
+       int rc;
+
+       BUG_ON(!gmap_is_shadow(sg));
+       spin_lock(&sg->guest_table_lock);
+       table = gmap_table_walk(sg, saddr, 1); /* get segment pointer */
+       if (table && !(*table & _SEGMENT_ENTRY_INVALID)) {
+               /* Shadow page tables are full pages (pte+pgste) */
+               page = pfn_to_page(*table >> PAGE_SHIFT);
+               *pgt = page->index & ~GMAP_SHADOW_FAKE_TABLE;
+               *dat_protection = !!(*table & _SEGMENT_ENTRY_PROTECT);
+               *fake = !!(page->index & GMAP_SHADOW_FAKE_TABLE);
+               rc = 0;
+       } else  {
+               rc = -EAGAIN;
+       }
+       spin_unlock(&sg->guest_table_lock);
+       return rc;
+
+}
+EXPORT_SYMBOL_GPL(gmap_shadow_pgt_lookup);
+
+/**
+ * gmap_shadow_pgt - instantiate a shadow page table
+ * @sg: pointer to the shadow guest address space structure
+ * @saddr: faulting address in the shadow gmap
+ * @pgt: parent gmap address of the page table to get shadowed
+ * @fake: pgt references contiguous guest memory block, not a pgtable
+ *
+ * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the
+ * shadow table structure is incomplete, -ENOMEM if out of memory,
+ * -EFAULT if an address in the parent gmap could not be resolved and
+ *
+ * Called with gmap->mm->mmap_sem in read
+ */
+int gmap_shadow_pgt(struct gmap *sg, unsigned long saddr, unsigned long pgt,
+                   int fake)
+{
+       unsigned long raddr, origin;
+       unsigned long *s_pgt, *table;
+       struct page *page;
+       int rc;
+
+       BUG_ON(!gmap_is_shadow(sg) || (pgt & _SEGMENT_ENTRY_LARGE));
+       /* Allocate a shadow page table */
+       page = page_table_alloc_pgste(sg->mm);
+       if (!page)
+               return -ENOMEM;
+       page->index = pgt & _SEGMENT_ENTRY_ORIGIN;
+       if (fake)
+               page->index |= GMAP_SHADOW_FAKE_TABLE;
+       s_pgt = (unsigned long *) page_to_phys(page);
+       /* Install shadow page table */
+       spin_lock(&sg->guest_table_lock);
+       table = gmap_table_walk(sg, saddr, 1); /* get segment pointer */
+       if (!table) {
+               rc = -EAGAIN;           /* Race with unshadow */
+               goto out_free;
+       }
+       if (!(*table & _SEGMENT_ENTRY_INVALID)) {
+               rc = 0;                 /* Already established */
+               goto out_free;
+       } else if (*table & _SEGMENT_ENTRY_ORIGIN) {
+               rc = -EAGAIN;           /* Race with shadow */
+               goto out_free;
+       }
+       /* mark as invalid as long as the parent table is not protected */
+       *table = (unsigned long) s_pgt | _SEGMENT_ENTRY |
+                (pgt & _SEGMENT_ENTRY_PROTECT) | _SEGMENT_ENTRY_INVALID;
+       list_add(&page->lru, &sg->pt_list);
+       if (fake) {
+               /* nothing to protect for fake tables */
+               *table &= ~_SEGMENT_ENTRY_INVALID;
+               spin_unlock(&sg->guest_table_lock);
+               return 0;
+       }
+       spin_unlock(&sg->guest_table_lock);
+       /* Make pgt read-only in parent gmap page table (not the pgste) */
+       raddr = (saddr & 0xfffffffffff00000UL) | _SHADOW_RMAP_SEGMENT;
+       origin = pgt & _SEGMENT_ENTRY_ORIGIN & PAGE_MASK;
+       rc = gmap_protect_rmap(sg, raddr, origin, PAGE_SIZE, PROT_READ);
+       spin_lock(&sg->guest_table_lock);
+       if (!rc) {
+               table = gmap_table_walk(sg, saddr, 1);
+               if (!table || (*table & _SEGMENT_ENTRY_ORIGIN) !=
+                             (unsigned long) s_pgt)
+                       rc = -EAGAIN;           /* Race with unshadow */
+               else
+                       *table &= ~_SEGMENT_ENTRY_INVALID;
+       } else {
+               gmap_unshadow_pgt(sg, raddr);
+       }
+       spin_unlock(&sg->guest_table_lock);
+       return rc;
+out_free:
+       spin_unlock(&sg->guest_table_lock);
+       page_table_free_pgste(page);
+       return rc;
+
+}
+EXPORT_SYMBOL_GPL(gmap_shadow_pgt);
+
+/**
+ * gmap_shadow_page - create a shadow page mapping
+ * @sg: pointer to the shadow guest address space structure
+ * @saddr: faulting address in the shadow gmap
+ * @pte: pte in parent gmap address space to get shadowed
+ *
+ * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the
+ * shadow table structure is incomplete, -ENOMEM if out of memory and
+ * -EFAULT if an address in the parent gmap could not be resolved.
+ *
+ * Called with sg->mm->mmap_sem in read.
+ */
+int gmap_shadow_page(struct gmap *sg, unsigned long saddr, pte_t pte)
+{
+       struct gmap *parent;
+       struct gmap_rmap *rmap;
+       unsigned long vmaddr, paddr;
+       spinlock_t *ptl;
+       pte_t *sptep, *tptep;
+       int prot;
+       int rc;
+
+       BUG_ON(!gmap_is_shadow(sg));
+       parent = sg->parent;
+       prot = (pte_val(pte) & _PAGE_PROTECT) ? PROT_READ : PROT_WRITE;
+
+       rmap = kzalloc(sizeof(*rmap), GFP_KERNEL);
+       if (!rmap)
+               return -ENOMEM;
+       rmap->raddr = (saddr & PAGE_MASK) | _SHADOW_RMAP_PGTABLE;
+
+       while (1) {
+               paddr = pte_val(pte) & PAGE_MASK;
+               vmaddr = __gmap_translate(parent, paddr);
+               if (IS_ERR_VALUE(vmaddr)) {
+                       rc = vmaddr;
+                       break;
+               }
+               rc = radix_tree_preload(GFP_KERNEL);
+               if (rc)
+                       break;
+               rc = -EAGAIN;
+               sptep = gmap_pte_op_walk(parent, paddr, &ptl);
+               if (sptep) {
+                       spin_lock(&sg->guest_table_lock);
+                       /* Get page table pointer */
+                       tptep = (pte_t *) gmap_table_walk(sg, saddr, 0);
+                       if (!tptep) {
+                               spin_unlock(&sg->guest_table_lock);
+                               gmap_pte_op_end(ptl);
+                               radix_tree_preload_end();
+                               break;
+                       }
+                       rc = ptep_shadow_pte(sg->mm, saddr, sptep, tptep, pte);
+                       if (rc > 0) {
+                               /* Success and a new mapping */
+                               gmap_insert_rmap(sg, vmaddr, rmap);
+                               rmap = NULL;
+                               rc = 0;
+                       }
+                       gmap_pte_op_end(ptl);
+                       spin_unlock(&sg->guest_table_lock);
+               }
+               radix_tree_preload_end();
+               if (!rc)
+                       break;
+               rc = gmap_pte_op_fixup(parent, paddr, vmaddr, prot);
+               if (rc)
+                       break;
+       }
+       kfree(rmap);
+       return rc;
+}
+EXPORT_SYMBOL_GPL(gmap_shadow_page);
+
+/**
+ * gmap_shadow_notify - handle notifications for shadow gmap
+ *
+ * Called with sg->parent->shadow_lock.
+ */
+static void gmap_shadow_notify(struct gmap *sg, unsigned long vmaddr,
+                              unsigned long offset, pte_t *pte)
+{
+       struct gmap_rmap *rmap, *rnext, *head;
+       unsigned long gaddr, start, end, bits, raddr;
+       unsigned long *table;
+
+       BUG_ON(!gmap_is_shadow(sg));
+       spin_lock(&sg->parent->guest_table_lock);
+       table = radix_tree_lookup(&sg->parent->host_to_guest,
+                                 vmaddr >> PMD_SHIFT);
+       gaddr = table ? __gmap_segment_gaddr(table) + offset : 0;
+       spin_unlock(&sg->parent->guest_table_lock);
+       if (!table)
+               return;
+
+       spin_lock(&sg->guest_table_lock);
+       if (sg->removed) {
+               spin_unlock(&sg->guest_table_lock);
+               return;
+       }
+       /* Check for top level table */
+       start = sg->orig_asce & _ASCE_ORIGIN;
+       end = start + ((sg->orig_asce & _ASCE_TABLE_LENGTH) + 1) * 4096;
+       if (!(sg->orig_asce & _ASCE_REAL_SPACE) && gaddr >= start &&
+           gaddr < end) {
+               /* The complete shadow table has to go */
+               gmap_unshadow(sg);
+               spin_unlock(&sg->guest_table_lock);
+               list_del(&sg->list);
+               gmap_put(sg);
+               return;
+       }
+       /* Remove the page table tree from on specific entry */
+       head = radix_tree_delete(&sg->host_to_rmap, vmaddr >> 12);
+       gmap_for_each_rmap_safe(rmap, rnext, head) {
+               bits = rmap->raddr & _SHADOW_RMAP_MASK;
+               raddr = rmap->raddr ^ bits;
+               switch (bits) {
+               case _SHADOW_RMAP_REGION1:
+                       gmap_unshadow_r2t(sg, raddr);
+                       break;
+               case _SHADOW_RMAP_REGION2:
+                       gmap_unshadow_r3t(sg, raddr);
+                       break;
+               case _SHADOW_RMAP_REGION3:
+                       gmap_unshadow_sgt(sg, raddr);
+                       break;
+               case _SHADOW_RMAP_SEGMENT:
+                       gmap_unshadow_pgt(sg, raddr);
+                       break;
+               case _SHADOW_RMAP_PGTABLE:
+                       gmap_unshadow_page(sg, raddr);
+                       break;
+               }
+               kfree(rmap);
+       }
+       spin_unlock(&sg->guest_table_lock);
+}
+
+/**
+ * ptep_notify - call all invalidation callbacks for a specific pte.
+ * @mm: pointer to the process mm_struct
+ * @addr: virtual address in the process address space
+ * @pte: pointer to the page table entry
+ * @bits: bits from the pgste that caused the notify call
+ *
+ * This function is assumed to be called with the page table lock held
+ * for the pte to notify.
+ */
+void ptep_notify(struct mm_struct *mm, unsigned long vmaddr,
+                pte_t *pte, unsigned long bits)
+{
+       unsigned long offset, gaddr;
+       unsigned long *table;
+       struct gmap *gmap, *sg, *next;
+
+       offset = ((unsigned long) pte) & (255 * sizeof(pte_t));
+       offset = offset * (4096 / sizeof(pte_t));
+       rcu_read_lock();
+       list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) {
+               if (!list_empty(&gmap->children) && (bits & PGSTE_VSIE_BIT)) {
+                       spin_lock(&gmap->shadow_lock);
+                       list_for_each_entry_safe(sg, next,
+                                                &gmap->children, list)
+                               gmap_shadow_notify(sg, vmaddr, offset, pte);
+                       spin_unlock(&gmap->shadow_lock);
+               }
+               if (!(bits & PGSTE_IN_BIT))
+                       continue;
+               spin_lock(&gmap->guest_table_lock);
+               table = radix_tree_lookup(&gmap->host_to_guest,
+                                         vmaddr >> PMD_SHIFT);
+               if (table)
+                       gaddr = __gmap_segment_gaddr(table) + offset;
+               spin_unlock(&gmap->guest_table_lock);
+               if (table)
+                       gmap_call_notifier(gmap, gaddr, gaddr + PAGE_SIZE - 1);
+       }
+       rcu_read_unlock();
  }
  EXPORT_SYMBOL_GPL(ptep_notify);
  
diff --git a/arch/s390/mm/pgalloc.c b/arch/s390/mm/pgalloc.c

index e8b5962..9c57a29 100644 (file)
--- a/arch/s390/mm/pgalloc.c
+++ b/arch/s390/mm/pgalloc.c
@@ -137,6 +137,29 @@ static inline unsigned int atomic_xor_bits(atomic_t *v, unsigned int bits)
         return new;
  }
  
+#ifdef CONFIG_PGSTE
+
+struct page *page_table_alloc_pgste(struct mm_struct *mm)
+{
+       struct page *page;
+       unsigned long *table;
+
+       page = alloc_page(GFP_KERNEL|__GFP_REPEAT);
+       if (page) {
+               table = (unsigned long *) page_to_phys(page);
+               clear_table(table, _PAGE_INVALID, PAGE_SIZE/2);
+               clear_table(table + PTRS_PER_PTE, 0, PAGE_SIZE/2);
+       }
+       return page;
+}
+
+void page_table_free_pgste(struct page *page)
+{
+       __free_page(page);
+}
+
+#endif /* CONFIG_PGSTE */
+
  /*
   * page table entry allocation/free routines.
   */
@@ -149,7 +172,7 @@ unsigned long *page_table_alloc(struct mm_struct *mm)
         /* Try to get a fragment of a 4K page as a 2K page table */
         if (!mm_alloc_pgste(mm)) {
                 table = NULL;
-               spin_lock_bh(&mm->context.list_lock);
+               spin_lock_bh(&mm->context.pgtable_lock);
                 if (!list_empty(&mm->context.pgtable_list)) {
                         page = list_first_entry(&mm->context.pgtable_list,
                                                 struct page, lru);
@@ -164,7 +187,7 @@ unsigned long *page_table_alloc(struct mm_struct *mm)
                                 list_del(&page->lru);
                         }
                 }
-               spin_unlock_bh(&mm->context.list_lock);
+               spin_unlock_bh(&mm->context.pgtable_lock);
                 if (table)
                         return table;
         }
@@ -187,9 +210,9 @@ unsigned long *page_table_alloc(struct mm_struct *mm)
                 /* Return the first 2K fragment of the page */
                 atomic_set(&page->_mapcount, 1);
                 clear_table(table, _PAGE_INVALID, PAGE_SIZE);
-               spin_lock_bh(&mm->context.list_lock);
+               spin_lock_bh(&mm->context.pgtable_lock);
                 list_add(&page->lru, &mm->context.pgtable_list);
-               spin_unlock_bh(&mm->context.list_lock);
+               spin_unlock_bh(&mm->context.pgtable_lock);
         }
         return table;
  }
@@ -203,13 +226,13 @@ void page_table_free(struct mm_struct *mm, unsigned long *table)
         if (!mm_alloc_pgste(mm)) {
                 /* Free 2K page table fragment of a 4K page */
                 bit = (__pa(table) & ~PAGE_MASK)/(PTRS_PER_PTE*sizeof(pte_t));
-               spin_lock_bh(&mm->context.list_lock);
+               spin_lock_bh(&mm->context.pgtable_lock);
                 mask = atomic_xor_bits(&page->_mapcount, 1U << bit);
                 if (mask & 3)
                         list_add(&page->lru, &mm->context.pgtable_list);
                 else
                         list_del(&page->lru);
-               spin_unlock_bh(&mm->context.list_lock);
+               spin_unlock_bh(&mm->context.pgtable_lock);
                 if (mask != 0)
                         return;
         }
@@ -235,13 +258,13 @@ void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table,
                 return;
         }
         bit = (__pa(table) & ~PAGE_MASK) / (PTRS_PER_PTE*sizeof(pte_t));
-       spin_lock_bh(&mm->context.list_lock);
+       spin_lock_bh(&mm->context.pgtable_lock);
         mask = atomic_xor_bits(&page->_mapcount, 0x11U << bit);
         if (mask & 3)
                 list_add_tail(&page->lru, &mm->context.pgtable_list);
         else
                 list_del(&page->lru);
-       spin_unlock_bh(&mm->context.list_lock);
+       spin_unlock_bh(&mm->context.pgtable_lock);
         table = (unsigned long *) (__pa(table) | (1U << bit));
         tlb_remove_table(tlb, table);
  }
diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c

index fa286d0..293130b 100644 (file)
--- a/arch/s390/mm/pgtable.c
+++ b/arch/s390/mm/pgtable.c
@@ -179,14 +179,17 @@ static inline pgste_t pgste_set_pte(pte_t *ptep, pgste_t pgste, pte_t entry)
         return pgste;
  }
  
-static inline pgste_t pgste_ipte_notify(struct mm_struct *mm,
-                                       unsigned long addr,
-                                       pte_t *ptep, pgste_t pgste)
+static inline pgste_t pgste_pte_notify(struct mm_struct *mm,
+                                      unsigned long addr,
+                                      pte_t *ptep, pgste_t pgste)
  {
  #ifdef CONFIG_PGSTE
-       if (pgste_val(pgste) & PGSTE_IN_BIT) {
-               pgste_val(pgste) &= ~PGSTE_IN_BIT;
-               ptep_notify(mm, addr, ptep);
+       unsigned long bits;
+
+       bits = pgste_val(pgste) & (PGSTE_IN_BIT | PGSTE_VSIE_BIT);
+       if (bits) {
+               pgste_val(pgste) ^= bits;
+               ptep_notify(mm, addr, ptep, bits);
         }
  #endif
         return pgste;
@@ -199,7 +202,7 @@ static inline pgste_t ptep_xchg_start(struct mm_struct *mm,
  
         if (mm_has_pgste(mm)) {
                 pgste = pgste_get_lock(ptep);
-               pgste = pgste_ipte_notify(mm, addr, ptep, pgste);
+               pgste = pgste_pte_notify(mm, addr, ptep, pgste);
         }
         return pgste;
  }
@@ -414,6 +417,90 @@ void ptep_set_notify(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
         pgste_set_unlock(ptep, pgste);
  }
  
+/**
+ * ptep_force_prot - change access rights of a locked pte
+ * @mm: pointer to the process mm_struct
+ * @addr: virtual address in the guest address space
+ * @ptep: pointer to the page table entry
+ * @prot: indicates guest access rights: PROT_NONE, PROT_READ or PROT_WRITE
+ * @bit: pgste bit to set (e.g. for notification)
+ *
+ * Returns 0 if the access rights were changed and -EAGAIN if the current
+ * and requested access rights are incompatible.
+ */
+int ptep_force_prot(struct mm_struct *mm, unsigned long addr,
+                   pte_t *ptep, int prot, unsigned long bit)
+{
+       pte_t entry;
+       pgste_t pgste;
+       int pte_i, pte_p;
+
+       pgste = pgste_get_lock(ptep);
+       entry = *ptep;
+       /* Check pte entry after all locks have been acquired */
+       pte_i = pte_val(entry) & _PAGE_INVALID;
+       pte_p = pte_val(entry) & _PAGE_PROTECT;
+       if ((pte_i && (prot != PROT_NONE)) ||
+           (pte_p && (prot & PROT_WRITE))) {
+               pgste_set_unlock(ptep, pgste);
+               return -EAGAIN;
+       }
+       /* Change access rights and set pgste bit */
+       if (prot == PROT_NONE && !pte_i) {
+               ptep_flush_direct(mm, addr, ptep);
+               pgste = pgste_update_all(entry, pgste, mm);
+               pte_val(entry) |= _PAGE_INVALID;
+       }
+       if (prot == PROT_READ && !pte_p) {
+               ptep_flush_direct(mm, addr, ptep);
+               pte_val(entry) &= ~_PAGE_INVALID;
+               pte_val(entry) |= _PAGE_PROTECT;
+       }
+       pgste_val(pgste) |= bit;
+       pgste = pgste_set_pte(ptep, pgste, entry);
+       pgste_set_unlock(ptep, pgste);
+       return 0;
+}
+
+int ptep_shadow_pte(struct mm_struct *mm, unsigned long saddr,
+                   pte_t *sptep, pte_t *tptep, pte_t pte)
+{
+       pgste_t spgste, tpgste;
+       pte_t spte, tpte;
+       int rc = -EAGAIN;
+
+       if (!(pte_val(*tptep) & _PAGE_INVALID))
+               return 0;       /* already shadowed */
+       spgste = pgste_get_lock(sptep);
+       spte = *sptep;
+       if (!(pte_val(spte) & _PAGE_INVALID) &&
+           !((pte_val(spte) & _PAGE_PROTECT) &&
+             !(pte_val(pte) & _PAGE_PROTECT))) {
+               pgste_val(spgste) |= PGSTE_VSIE_BIT;
+               tpgste = pgste_get_lock(tptep);
+               pte_val(tpte) = (pte_val(spte) & PAGE_MASK) |
+                               (pte_val(pte) & _PAGE_PROTECT);
+               /* don't touch the storage key - it belongs to parent pgste */
+               tpgste = pgste_set_pte(tptep, tpgste, tpte);
+               pgste_set_unlock(tptep, tpgste);
+               rc = 1;
+       }
+       pgste_set_unlock(sptep, spgste);
+       return rc;
+}
+
+void ptep_unshadow_pte(struct mm_struct *mm, unsigned long saddr, pte_t *ptep)
+{
+       pgste_t pgste;
+
+       pgste = pgste_get_lock(ptep);
+       /* notifier is called by the caller */
+       ptep_flush_direct(mm, saddr, ptep);
+       /* don't touch the storage key - it belongs to parent pgste */
+       pgste = pgste_set_pte(ptep, pgste, __pte(_PAGE_INVALID));
+       pgste_set_unlock(ptep, pgste);
+}
+
  static void ptep_zap_swap_entry(struct mm_struct *mm, swp_entry_t entry)
  {
         if (!non_swap_entry(entry))
@@ -483,7 +570,7 @@ bool test_and_clear_guest_dirty(struct mm_struct *mm, unsigned long addr)
         pgste_val(pgste) &= ~PGSTE_UC_BIT;
         pte = *ptep;
         if (dirty && (pte_val(pte) & _PAGE_PRESENT)) {
-               pgste = pgste_ipte_notify(mm, addr, ptep, pgste);
+               pgste = pgste_pte_notify(mm, addr, ptep, pgste);
                 __ptep_ipte(addr, ptep);
                 if (MACHINE_HAS_ESOP || !(pte_val(pte) & _PAGE_WRITE))
                         pte_val(pte) |= _PAGE_PROTECT;
diff --git a/include/linux/page_ref.h b/include/linux/page_ref.h

index 8b5e0a9..610e132 100644 (file)
--- a/include/linux/page_ref.h
+++ b/include/linux/page_ref.h
@@ -124,6 +124,15 @@ static inline int page_ref_sub_and_test(struct page *page, int nr)
         return ret;
  }
  
+static inline int page_ref_inc_return(struct page *page)
+{
+       int ret = atomic_inc_return(&page->_refcount);
+
+       if (page_ref_tracepoint_active(__tracepoint_page_ref_mod_and_return))
+               __page_ref_mod_and_return(page, 1, ret);
+       return ret;
+}
+
  static inline int page_ref_dec_and_test(struct page *page)
  {
         int ret = atomic_dec_and_test(&page->_refcount);
author	Paolo Bonzini <pbonzini@redhat.com>
	Tue, 21 Jun 2016 13:21:51 +0000 (15:21 +0200)
committer	Paolo Bonzini <pbonzini@redhat.com>
	Tue, 21 Jun 2016 13:21:51 +0000 (15:21 +0200)
arch/s390/include/asm/gmap.h		patch \| blob \| history
arch/s390/include/asm/kvm_host.h		patch \| blob \| history
arch/s390/include/asm/mmu.h		patch \| blob \| history
arch/s390/include/asm/mmu_context.h		patch \| blob \| history
arch/s390/include/asm/page.h		patch \| blob \| history
arch/s390/include/asm/pgalloc.h		patch \| blob \| history
arch/s390/include/asm/pgtable.h		patch \| blob \| history
arch/s390/include/asm/processor.h		patch \| blob \| history
arch/s390/include/uapi/asm/kvm.h		patch \| blob \| history
arch/s390/kvm/Makefile		patch \| blob \| history
arch/s390/kvm/gaccess.c		patch \| blob \| history
arch/s390/kvm/gaccess.h		patch \| blob \| history
arch/s390/kvm/interrupt.c		patch \| blob \| history
arch/s390/kvm/kvm-s390.c		patch \| blob \| history
arch/s390/kvm/kvm-s390.h		patch \| blob \| history
arch/s390/kvm/priv.c		patch \| blob \| history
arch/s390/kvm/sigp.c		patch \| blob \| history
arch/s390/kvm/vsie.c	[new file with mode: 0644]	patch \| blob
arch/s390/mm/fault.c		patch \| blob \| history
arch/s390/mm/gmap.c		patch \| blob \| history
arch/s390/mm/pgalloc.c		patch \| blob \| history
arch/s390/mm/pgtable.c		patch \| blob \| history
include/linux/page_ref.h		patch \| blob \| history