Merge tag 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/borntraeger...
author     Linus Torvalds <torvalds@linux-foundation.org>
           Sat, 14 Feb 2015 18:54:28 +0000 (10:54 -0800)
committer  Linus Torvalds <torvalds@linux-foundation.org>
           Sat, 14 Feb 2015 18:54:28 +0000 (10:54 -0800)
Pull ACCESS_ONCE() rule tightening from Christian Borntraeger:
 "Tighten rules for ACCESS_ONCE

  This series tightens the rules for ACCESS_ONCE to only work on scalar
  types.  It also contains the necessary fixups as indicated by build
  bots of linux-next.  Now everything is in place to prevent new
  non-scalar users of ACCESS_ONCE and we can continue to convert code to
  READ_ONCE/WRITE_ONCE"

* tag 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/borntraeger/linux:
  kernel: Fix sparse warning for ACCESS_ONCE
  next: sh: Fix compile error
  kernel: tighten rules for ACCESS ONCE
  mm/gup: Replace ACCESS_ONCE with READ_ONCE
  x86/spinlock: Leftover conversion ACCESS_ONCE->READ_ONCE
  x86/xen/p2m: Replace ACCESS_ONCE with READ_ONCE
  ppc/hugetlbfs: Replace ACCESS_ONCE with READ_ONCE
  ppc/kvm: Replace ACCESS_ONCE with READ_ONCE

arch/powerpc/mm/hugetlbpage.c
arch/sh/mm/gup.c
arch/x86/xen/p2m.c
include/linux/compiler.h
mm/gup.c

diff --combined arch/powerpc/mm/hugetlbpage.c
@@@ -714,14 -714,6 +714,14 @@@ follow_huge_pmd(struct mm_struct *mm, u
        return NULL;
  }
  
 +struct page *
 +follow_huge_pud(struct mm_struct *mm, unsigned long address,
 +              pud_t *pud, int write)
 +{
 +      BUG();
 +      return NULL;
 +}
 +
  static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end,
                                      unsigned long sz)
  {
@@@ -986,7 -978,7 +986,7 @@@ pte_t *find_linux_pte_or_hugepte(pgd_t 
                 */
                pdshift = PUD_SHIFT;
                pudp = pud_offset(&pgd, ea);
-               pud  = ACCESS_ONCE(*pudp);
+               pud  = READ_ONCE(*pudp);
  
                if (pud_none(pud))
                        return NULL;
                else {
                        pdshift = PMD_SHIFT;
                        pmdp = pmd_offset(&pud, ea);
-                       pmd  = ACCESS_ONCE(*pmdp);
+                       pmd  = READ_ONCE(*pmdp);
                        /*
                         * A hugepage collapse is captured by pmd_none, because
                         * it marks the pmd none and does an hpte invalidate.
diff --combined arch/sh/mm/gup.c
@@@ -17,7 -17,7 +17,7 @@@
  static inline pte_t gup_get_pte(pte_t *ptep)
  {
  #ifndef CONFIG_X2TLB
-       return ACCESS_ONCE(*ptep);
+       return READ_ONCE(*ptep);
  #else
        /*
         * With get_user_pages_fast, we walk down the pagetables without
@@@ -257,8 -257,10 +257,8 @@@ slow_irqon
                start += nr << PAGE_SHIFT;
                pages += nr;
  
 -              down_read(&mm->mmap_sem);
 -              ret = get_user_pages(current, mm, start,
 -                      (end - start) >> PAGE_SHIFT, write, 0, pages, NULL);
 -              up_read(&mm->mmap_sem);
 +              ret = get_user_pages_unlocked(current, mm, start,
 +                      (end - start) >> PAGE_SHIFT, write, 0, pages);
  
                /* Have to be a bit careful with return values */
                if (nr > 0) {
diff --combined arch/x86/xen/p2m.c
@@@ -84,6 -84,8 +84,6 @@@
  
  #define PMDS_PER_MID_PAGE     (P2M_MID_PER_PAGE / PTRS_PER_PTE)
  
 -static void __init m2p_override_init(void);
 -
  unsigned long *xen_p2m_addr __read_mostly;
  EXPORT_SYMBOL_GPL(xen_p2m_addr);
  unsigned long xen_p2m_size __read_mostly;
@@@ -165,13 -167,10 +165,13 @@@ static void * __ref alloc_p2m_page(void
        return (void *)__get_free_page(GFP_KERNEL | __GFP_REPEAT);
  }
  
 -/* Only to be called in case of a race for a page just allocated! */
 -static void free_p2m_page(void *p)
 +static void __ref free_p2m_page(void *p)
  {
 -      BUG_ON(!slab_is_available());
 +      if (unlikely(!slab_is_available())) {
 +              free_bootmem((unsigned long)p, PAGE_SIZE);
 +              return;
 +      }
 +
        free_page((unsigned long)p);
  }
  
@@@ -376,7 -375,7 +376,7 @@@ static void __init xen_rebuild_p2m_list
                        p2m_missing_pte : p2m_identity_pte;
                for (i = 0; i < PMDS_PER_MID_PAGE; i++) {
                        pmdp = populate_extra_pmd(
 -                              (unsigned long)(p2m + pfn + i * PTRS_PER_PTE));
 +                              (unsigned long)(p2m + pfn) + i * PMD_SIZE);
                        set_pmd(pmdp, __pmd(__pa(ptep) | _KERNPG_TABLE));
                }
        }
@@@ -400,6 -399,8 +400,6 @@@ void __init xen_vmalloc_p2m_tree(void
        xen_p2m_size = xen_max_p2m_pfn;
  
        xen_inv_extra_mem();
 -
 -      m2p_override_init();
  }
  
  unsigned long get_phys_to_machine(unsigned long pfn)
@@@ -435,9 -436,10 +435,9 @@@ EXPORT_SYMBOL_GPL(get_phys_to_machine)
   * a new pmd is to replace p2m_missing_pte or p2m_identity_pte by an individual
   * pmd. In case of PAE/x86-32 there are multiple pmds to allocate!
   */
 -static pte_t *alloc_p2m_pmd(unsigned long addr, pte_t *ptep, pte_t *pte_pg)
 +static pte_t *alloc_p2m_pmd(unsigned long addr, pte_t *pte_pg)
  {
        pte_t *ptechk;
 -      pte_t *pteret = ptep;
        pte_t *pte_newpg[PMDS_PER_MID_PAGE];
        pmd_t *pmdp;
        unsigned int level;
                if (ptechk == pte_pg) {
                        set_pmd(pmdp,
                                __pmd(__pa(pte_newpg[i]) | _KERNPG_TABLE));
 -                      if (vaddr == (addr & ~(PMD_SIZE - 1)))
 -                              pteret = pte_offset_kernel(pmdp, addr);
                        pte_newpg[i] = NULL;
                }
  
                vaddr += PMD_SIZE;
        }
  
 -      return pteret;
 +      return lookup_address(addr, &level);
  }
  
  /*
@@@ -513,7 -517,7 +513,7 @@@ static bool alloc_p2m(unsigned long pfn
  
        if (pte_pg == p2m_missing_pte || pte_pg == p2m_identity_pte) {
                /* PMD level is missing, allocate a new one */
 -              ptep = alloc_p2m_pmd(addr, ptep, pte_pg);
 +              ptep = alloc_p2m_pmd(addr, pte_pg);
                if (!ptep)
                        return false;
        }
                mid_mfn = NULL;
        }
  
-       p2m_pfn = pte_pfn(ACCESS_ONCE(*ptep));
+       p2m_pfn = pte_pfn(READ_ONCE(*ptep));
        if (p2m_pfn == PFN_DOWN(__pa(p2m_identity)) ||
            p2m_pfn == PFN_DOWN(__pa(p2m_missing))) {
                /* p2m leaf page is missing */
@@@ -648,21 -652,100 +648,21 @@@ bool set_phys_to_machine(unsigned long 
        return true;
  }
  
 -#define M2P_OVERRIDE_HASH_SHIFT       10
 -#define M2P_OVERRIDE_HASH     (1 << M2P_OVERRIDE_HASH_SHIFT)
 -
 -static struct list_head *m2p_overrides;
 -static DEFINE_SPINLOCK(m2p_override_lock);
 -
 -static void __init m2p_override_init(void)
 -{
 -      unsigned i;
 -
 -      m2p_overrides = alloc_bootmem_align(
 -                              sizeof(*m2p_overrides) * M2P_OVERRIDE_HASH,
 -                              sizeof(unsigned long));
 -
 -      for (i = 0; i < M2P_OVERRIDE_HASH; i++)
 -              INIT_LIST_HEAD(&m2p_overrides[i]);
 -}
 -
 -static unsigned long mfn_hash(unsigned long mfn)
 -{
 -      return hash_long(mfn, M2P_OVERRIDE_HASH_SHIFT);
 -}
 -
 -/* Add an MFN override for a particular page */
 -static int m2p_add_override(unsigned long mfn, struct page *page,
 -                          struct gnttab_map_grant_ref *kmap_op)
 -{
 -      unsigned long flags;
 -      unsigned long pfn;
 -      unsigned long uninitialized_var(address);
 -      unsigned level;
 -      pte_t *ptep = NULL;
 -
 -      pfn = page_to_pfn(page);
 -      if (!PageHighMem(page)) {
 -              address = (unsigned long)__va(pfn << PAGE_SHIFT);
 -              ptep = lookup_address(address, &level);
 -              if (WARN(ptep == NULL || level != PG_LEVEL_4K,
 -                       "m2p_add_override: pfn %lx not mapped", pfn))
 -                      return -EINVAL;
 -      }
 -
 -      if (kmap_op != NULL) {
 -              if (!PageHighMem(page)) {
 -                      struct multicall_space mcs =
 -                              xen_mc_entry(sizeof(*kmap_op));
 -
 -                      MULTI_grant_table_op(mcs.mc,
 -                                      GNTTABOP_map_grant_ref, kmap_op, 1);
 -
 -                      xen_mc_issue(PARAVIRT_LAZY_MMU);
 -              }
 -      }
 -      spin_lock_irqsave(&m2p_override_lock, flags);
 -      list_add(&page->lru,  &m2p_overrides[mfn_hash(mfn)]);
 -      spin_unlock_irqrestore(&m2p_override_lock, flags);
 -
 -      /* p2m(m2p(mfn)) == mfn: the mfn is already present somewhere in
 -       * this domain. Set the FOREIGN_FRAME_BIT in the p2m for the other
 -       * pfn so that the following mfn_to_pfn(mfn) calls will return the
 -       * pfn from the m2p_override (the backend pfn) instead.
 -       * We need to do this because the pages shared by the frontend
 -       * (xen-blkfront) can be already locked (lock_page, called by
 -       * do_read_cache_page); when the userspace backend tries to use them
 -       * with direct_IO, mfn_to_pfn returns the pfn of the frontend, so
 -       * do_blockdev_direct_IO is going to try to lock the same pages
 -       * again resulting in a deadlock.
 -       * As a side effect get_user_pages_fast might not be safe on the
 -       * frontend pages while they are being shared with the backend,
 -       * because mfn_to_pfn (that ends up being called by GUPF) will
 -       * return the backend pfn rather than the frontend pfn. */
 -      pfn = mfn_to_pfn_no_overrides(mfn);
 -      if (__pfn_to_mfn(pfn) == mfn)
 -              set_phys_to_machine(pfn, FOREIGN_FRAME(mfn));
 -
 -      return 0;
 -}
 -
  int set_foreign_p2m_mapping(struct gnttab_map_grant_ref *map_ops,
                            struct gnttab_map_grant_ref *kmap_ops,
                            struct page **pages, unsigned int count)
  {
        int i, ret = 0;
 -      bool lazy = false;
        pte_t *pte;
  
        if (xen_feature(XENFEAT_auto_translated_physmap))
                return 0;
  
 -      if (kmap_ops &&
 -          !in_interrupt() &&
 -          paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) {
 -              arch_enter_lazy_mmu_mode();
 -              lazy = true;
 +      if (kmap_ops) {
 +              ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref,
 +                                              kmap_ops, count);
 +              if (ret)
 +                      goto out;
        }
  
        for (i = 0; i < count; i++) {
                }
                pfn = page_to_pfn(pages[i]);
  
 -              WARN_ON(PagePrivate(pages[i]));
 -              SetPagePrivate(pages[i]);
 -              set_page_private(pages[i], mfn);
 -              pages[i]->index = pfn_to_mfn(pfn);
 +              WARN(pfn_to_mfn(pfn) != INVALID_P2M_ENTRY, "page must be ballooned");
  
                if (unlikely(!set_phys_to_machine(pfn, FOREIGN_FRAME(mfn)))) {
                        ret = -ENOMEM;
                        goto out;
                }
 -
 -              if (kmap_ops) {
 -                      ret = m2p_add_override(mfn, pages[i], &kmap_ops[i]);
 -                      if (ret)
 -                              goto out;
 -              }
        }
  
  out:
 -      if (lazy)
 -              arch_leave_lazy_mmu_mode();
 -
        return ret;
  }
  EXPORT_SYMBOL_GPL(set_foreign_p2m_mapping);
  
 -static struct page *m2p_find_override(unsigned long mfn)
 -{
 -      unsigned long flags;
 -      struct list_head *bucket;
 -      struct page *p, *ret;
 -
 -      if (unlikely(!m2p_overrides))
 -              return NULL;
 -
 -      ret = NULL;
 -      bucket = &m2p_overrides[mfn_hash(mfn)];
 -
 -      spin_lock_irqsave(&m2p_override_lock, flags);
 -
 -      list_for_each_entry(p, bucket, lru) {
 -              if (page_private(p) == mfn) {
 -                      ret = p;
 -                      break;
 -              }
 -      }
 -
 -      spin_unlock_irqrestore(&m2p_override_lock, flags);
 -
 -      return ret;
 -}
 -
 -static int m2p_remove_override(struct page *page,
 -                             struct gnttab_map_grant_ref *kmap_op,
 -                             unsigned long mfn)
 -{
 -      unsigned long flags;
 -      unsigned long pfn;
 -      unsigned long uninitialized_var(address);
 -      unsigned level;
 -      pte_t *ptep = NULL;
 -
 -      pfn = page_to_pfn(page);
 -
 -      if (!PageHighMem(page)) {
 -              address = (unsigned long)__va(pfn << PAGE_SHIFT);
 -              ptep = lookup_address(address, &level);
 -
 -              if (WARN(ptep == NULL || level != PG_LEVEL_4K,
 -                       "m2p_remove_override: pfn %lx not mapped", pfn))
 -                      return -EINVAL;
 -      }
 -
 -      spin_lock_irqsave(&m2p_override_lock, flags);
 -      list_del(&page->lru);
 -      spin_unlock_irqrestore(&m2p_override_lock, flags);
 -
 -      if (kmap_op != NULL) {
 -              if (!PageHighMem(page)) {
 -                      struct multicall_space mcs;
 -                      struct gnttab_unmap_and_replace *unmap_op;
 -                      struct page *scratch_page = get_balloon_scratch_page();
 -                      unsigned long scratch_page_address = (unsigned long)
 -                              __va(page_to_pfn(scratch_page) << PAGE_SHIFT);
 -
 -                      /*
 -                       * It might be that we queued all the m2p grant table
 -                       * hypercalls in a multicall, then m2p_remove_override
 -                       * get called before the multicall has actually been
 -                       * issued. In this case handle is going to -1 because
 -                       * it hasn't been modified yet.
 -                       */
 -                      if (kmap_op->handle == -1)
 -                              xen_mc_flush();
 -                      /*
 -                       * Now if kmap_op->handle is negative it means that the
 -                       * hypercall actually returned an error.
 -                       */
 -                      if (kmap_op->handle == GNTST_general_error) {
 -                              pr_warn("m2p_remove_override: pfn %lx mfn %lx, failed to modify kernel mappings",
 -                                      pfn, mfn);
 -                              put_balloon_scratch_page();
 -                              return -1;
 -                      }
 -
 -                      xen_mc_batch();
 -
 -                      mcs = __xen_mc_entry(
 -                              sizeof(struct gnttab_unmap_and_replace));
 -                      unmap_op = mcs.args;
 -                      unmap_op->host_addr = kmap_op->host_addr;
 -                      unmap_op->new_addr = scratch_page_address;
 -                      unmap_op->handle = kmap_op->handle;
 -
 -                      MULTI_grant_table_op(mcs.mc,
 -                              GNTTABOP_unmap_and_replace, unmap_op, 1);
 -
 -                      mcs = __xen_mc_entry(0);
 -                      MULTI_update_va_mapping(mcs.mc, scratch_page_address,
 -                                      pfn_pte(page_to_pfn(scratch_page),
 -                                      PAGE_KERNEL_RO), 0);
 -
 -                      xen_mc_issue(PARAVIRT_LAZY_MMU);
 -
 -                      kmap_op->host_addr = 0;
 -                      put_balloon_scratch_page();
 -              }
 -      }
 -
 -      /* p2m(m2p(mfn)) == FOREIGN_FRAME(mfn): the mfn is already present
 -       * somewhere in this domain, even before being added to the
 -       * m2p_override (see comment above in m2p_add_override).
 -       * If there are no other entries in the m2p_override corresponding
 -       * to this mfn, then remove the FOREIGN_FRAME_BIT from the p2m for
 -       * the original pfn (the one shared by the frontend): the backend
 -       * cannot do any IO on this page anymore because it has been
 -       * unshared. Removing the FOREIGN_FRAME_BIT from the p2m entry of
 -       * the original pfn causes mfn_to_pfn(mfn) to return the frontend
 -       * pfn again. */
 -      mfn &= ~FOREIGN_FRAME_BIT;
 -      pfn = mfn_to_pfn_no_overrides(mfn);
 -      if (__pfn_to_mfn(pfn) == FOREIGN_FRAME(mfn) &&
 -                      m2p_find_override(mfn) == NULL)
 -              set_phys_to_machine(pfn, mfn);
 -
 -      return 0;
 -}
 -
  int clear_foreign_p2m_mapping(struct gnttab_unmap_grant_ref *unmap_ops,
 -                            struct gnttab_map_grant_ref *kmap_ops,
 +                            struct gnttab_unmap_grant_ref *kunmap_ops,
                              struct page **pages, unsigned int count)
  {
        int i, ret = 0;
 -      bool lazy = false;
  
        if (xen_feature(XENFEAT_auto_translated_physmap))
                return 0;
  
 -      if (kmap_ops &&
 -          !in_interrupt() &&
 -          paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) {
 -              arch_enter_lazy_mmu_mode();
 -              lazy = true;
 -      }
 -
        for (i = 0; i < count; i++) {
                unsigned long mfn = __pfn_to_mfn(page_to_pfn(pages[i]));
                unsigned long pfn = page_to_pfn(pages[i]);
                        goto out;
                }
  
 -              set_page_private(pages[i], INVALID_P2M_ENTRY);
 -              WARN_ON(!PagePrivate(pages[i]));
 -              ClearPagePrivate(pages[i]);
 -              set_phys_to_machine(pfn, pages[i]->index);
 -
 -              if (kmap_ops)
 -                      ret = m2p_remove_override(pages[i], &kmap_ops[i], mfn);
 -              if (ret)
 -                      goto out;
 +              set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
        }
 -
 +      if (kunmap_ops)
 +              ret = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
 +                                              kunmap_ops, count);
  out:
 -      if (lazy)
 -              arch_leave_lazy_mmu_mode();
        return ret;
  }
  EXPORT_SYMBOL_GPL(clear_foreign_p2m_mapping);
  
 -unsigned long m2p_find_override_pfn(unsigned long mfn, unsigned long pfn)
 -{
 -      struct page *p = m2p_find_override(mfn);
 -      unsigned long ret = pfn;
 -
 -      if (p)
 -              ret = page_to_pfn(p);
 -
 -      return ret;
 -}
 -EXPORT_SYMBOL_GPL(m2p_find_override_pfn);
 -
  #ifdef CONFIG_XEN_DEBUG_FS
  #include <linux/debugfs.h>
  #include "debugfs.h"
diff --combined include/linux/compiler.h
@@@ -54,11 -54,7 +54,11 @@@ extern void __chk_io_ptr(const volatil
  #include <linux/compiler-gcc.h>
  #endif
  
 +#ifdef CC_USING_HOTPATCH
 +#define notrace __attribute__((hotpatch(0,0)))
 +#else
  #define notrace __attribute__((no_instrument_function))
 +#endif
  
  /* Intel compiler defines __GNUC__. So we will overwrite implementations
   * coming from above header files here
@@@ -219,7 -215,7 +219,7 @@@ static __always_inline void __read_once
        }
  }
  
 -static __always_inline void __assign_once_size(volatile void *p, void *res, int size)
 +static __always_inline void __write_once_size(volatile void *p, void *res, int size)
  {
        switch (size) {
        case 1: *(volatile __u8 *)p = *(__u8 *)res; break;
  /*
   * Prevent the compiler from merging or refetching reads or writes. The
   * compiler is also forbidden from reordering successive instances of
 - * READ_ONCE, ASSIGN_ONCE and ACCESS_ONCE (see below), but only when the
 + * READ_ONCE, WRITE_ONCE and ACCESS_ONCE (see below), but only when the
   * compiler is aware of some particular ordering.  One way to make the
   * compiler aware of ordering is to put the two invocations of READ_ONCE,
 - * ASSIGN_ONCE or ACCESS_ONCE() in different C statements.
 + * WRITE_ONCE or ACCESS_ONCE() in different C statements.
   *
   * In contrast to ACCESS_ONCE these two macros will also work on aggregate
   * data types like structs or unions. If the size of the accessed data
   * type exceeds the word size of the machine (e.g., 32 bits or 64 bits)
 - * READ_ONCE() and ASSIGN_ONCE()  will fall back to memcpy and print a
 + * READ_ONCE() and WRITE_ONCE()  will fall back to memcpy and print a
   * compile-time warning.
   *
   * Their two major use cases are: (1) Mediating communication between
  #define READ_ONCE(x) \
        ({ typeof(x) __val; __read_once_size(&x, &__val, sizeof(__val)); __val; })
  
 -#define ASSIGN_ONCE(val, x) \
 -      ({ typeof(x) __val; __val = val; __assign_once_size(&x, &__val, sizeof(__val)); __val; })
 +#define WRITE_ONCE(x, val) \
 +      ({ typeof(x) __val; __val = val; __write_once_size(&x, &__val, sizeof(__val)); __val; })
  
  #endif /* __KERNEL__ */
  
  
  /* Is this type a native word size -- useful for atomic operations */
  #ifndef __native_word
 -# define __native_word(t) (sizeof(t) == sizeof(int) || sizeof(t) == sizeof(long))
 +# define __native_word(t) (sizeof(t) == sizeof(char) || sizeof(t) == sizeof(short) || sizeof(t) == sizeof(int) || sizeof(t) == sizeof(long))
  #endif
  
  /* Compile time object size, -1 for unknown */
   * to make the compiler aware of ordering is to put the two invocations of
   * ACCESS_ONCE() in different C statements.
   *
-  * This macro does absolutely -nothing- to prevent the CPU from reordering,
-  * merging, or refetching absolutely anything at any time.  Its main intended
-  * use is to mediate communication between process-level code and irq/NMI
-  * handlers, all running on the same CPU.
+  * ACCESS_ONCE will only work on scalar types. For union types, ACCESS_ONCE
+  * on a union member will work as long as the size of the member matches the
+  * size of the union and the size is smaller than word size.
+  *
+  * The major use cases of ACCESS_ONCE used to be (1) Mediating communication
+  * between process-level code and irq/NMI handlers, all running on the same CPU,
+  * and (2) Ensuring that the compiler does not  fold, spindle, or otherwise
+  * mutilate accesses that either do not require ordering or that interact
+  * with an explicit memory barrier or atomic instruction that provides the
+  * required ordering.
+  *
+  * If possible use READ_ONCE/WRITE_ONCE instead.
   */
- #define ACCESS_ONCE(x) (*(volatile typeof(x) *)&(x))
+ #define __ACCESS_ONCE(x) ({ \
+        __maybe_unused typeof(x) __var = (__force typeof(x)) 0; \
+       (volatile typeof(x) *)&(x); })
+ #define ACCESS_ONCE(x) (*__ACCESS_ONCE(x))
  
  /* Ignore/forbid kprobes attach on very low level functions marked by this attribute: */
  #ifdef CONFIG_KPROBES
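
[ Editorial sketch of how the new __ACCESS_ONCE() enforces the scalar-only
  rule: the otherwise unused dummy variable is initialised from a literal 0,
  which the compiler only accepts for scalar (arithmetic or pointer) types,
  so aggregate arguments now fail at build time while the volatile access
  itself is unchanged. The macro below is a simplified stand-alone
  re-implementation for illustration, not the kernel's. ]

    #include <stdio.h>

    /* Simplified stand-in for the kernel's __ACCESS_ONCE()/ACCESS_ONCE(). */
    #define MY_ACCESS_ONCE(x) (*({                                          \
            __attribute__((unused)) typeof(x) __dummy = (typeof(x))0;       \
            (volatile typeof(x) *)&(x); }))

    struct pair { int a, b; };

    int main(void)
    {
            int counter = 42;
            struct pair p = { 1, 2 };

            printf("%d\n", MY_ACCESS_ONCE(counter)); /* scalar: builds fine */

            /* struct pair q = MY_ACCESS_ONCE(p);
             * does not build: "conversion to non-scalar type requested" --
             * the same compile-time error that flushes out non-scalar
             * ACCESS_ONCE() users in the kernel. */
            (void)p;
            return 0;
    }
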
diff --combined mm/gup.c
+++ b/mm/gup.c
@@@ -55,7 -55,7 +55,7 @@@ retry
                 */
                if (likely(!(flags & FOLL_MIGRATION)))
                        goto no_page;
 -              if (pte_none(pte) || pte_file(pte))
 +              if (pte_none(pte))
                        goto no_page;
                entry = pte_to_swp_entry(pte);
                if (!is_migration_entry(entry))
@@@ -64,7 -64,7 +64,7 @@@
                migration_entry_wait(mm, pmd, address);
                goto retry;
        }
 -      if ((flags & FOLL_NUMA) && pte_numa(pte))
 +      if ((flags & FOLL_NUMA) && pte_protnone(pte))
                goto no_page;
        if ((flags & FOLL_WRITE) && !pte_write(pte)) {
                pte_unmap_unlock(ptep, ptl);
@@@ -167,10 -167,10 +167,10 @@@ struct page *follow_page_mask(struct vm
        if (pud_none(*pud))
                return no_page_table(vma, flags);
        if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) {
 -              if (flags & FOLL_GET)
 -                      return NULL;
 -              page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE);
 -              return page;
 +              page = follow_huge_pud(mm, address, pud, flags);
 +              if (page)
 +                      return page;
 +              return no_page_table(vma, flags);
        }
        if (unlikely(pud_bad(*pud)))
                return no_page_table(vma, flags);
        if (pmd_none(*pmd))
                return no_page_table(vma, flags);
        if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB) {
 -              page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE);
 -              if (flags & FOLL_GET) {
 -                      /*
 -                       * Refcount on tail pages are not well-defined and
 -                       * shouldn't be taken. The caller should handle a NULL
 -                       * return when trying to follow tail pages.
 -                       */
 -                      if (PageHead(page))
 -                              get_page(page);
 -                      else
 -                              page = NULL;
 -              }
 -              return page;
 +              page = follow_huge_pmd(mm, address, pmd, flags);
 +              if (page)
 +                      return page;
 +              return no_page_table(vma, flags);
        }
 -      if ((flags & FOLL_NUMA) && pmd_numa(*pmd))
 +      if ((flags & FOLL_NUMA) && pmd_protnone(*pmd))
                return no_page_table(vma, flags);
        if (pmd_trans_huge(*pmd)) {
                if (flags & FOLL_SPLIT) {
@@@ -287,7 -296,7 +287,7 @@@ static int faultin_page(struct task_str
                        return -ENOMEM;
                if (ret & (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE))
                        return *flags & FOLL_HWPOISON ? -EHWPOISON : -EFAULT;
 -              if (ret & VM_FAULT_SIGBUS)
 +              if (ret & (VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV))
                        return -EFAULT;
                BUG();
        }
@@@ -562,7 -571,7 +562,7 @@@ int fixup_user_fault(struct task_struc
                        return -ENOMEM;
                if (ret & (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE))
                        return -EHWPOISON;
 -              if (ret & VM_FAULT_SIGBUS)
 +              if (ret & (VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV))
                        return -EFAULT;
                BUG();
        }
        return 0;
  }
  
 +static __always_inline long __get_user_pages_locked(struct task_struct *tsk,
 +                                              struct mm_struct *mm,
 +                                              unsigned long start,
 +                                              unsigned long nr_pages,
 +                                              int write, int force,
 +                                              struct page **pages,
 +                                              struct vm_area_struct **vmas,
 +                                              int *locked, bool notify_drop,
 +                                              unsigned int flags)
 +{
 +      long ret, pages_done;
 +      bool lock_dropped;
 +
 +      if (locked) {
 +              /* if VM_FAULT_RETRY can be returned, vmas become invalid */
 +              BUG_ON(vmas);
 +              /* check caller initialized locked */
 +              BUG_ON(*locked != 1);
 +      }
 +
 +      if (pages)
 +              flags |= FOLL_GET;
 +      if (write)
 +              flags |= FOLL_WRITE;
 +      if (force)
 +              flags |= FOLL_FORCE;
 +
 +      pages_done = 0;
 +      lock_dropped = false;
 +      for (;;) {
 +              ret = __get_user_pages(tsk, mm, start, nr_pages, flags, pages,
 +                                     vmas, locked);
 +              if (!locked)
 +                      /* VM_FAULT_RETRY couldn't trigger, bypass */
 +                      return ret;
 +
 +              /* VM_FAULT_RETRY cannot return errors */
 +              if (!*locked) {
 +                      BUG_ON(ret < 0);
 +                      BUG_ON(ret >= nr_pages);
 +              }
 +
 +              if (!pages)
 +                      /* If it's a prefault don't insist harder */
 +                      return ret;
 +
 +              if (ret > 0) {
 +                      nr_pages -= ret;
 +                      pages_done += ret;
 +                      if (!nr_pages)
 +                              break;
 +              }
 +              if (*locked) {
 +                      /* VM_FAULT_RETRY didn't trigger */
 +                      if (!pages_done)
 +                              pages_done = ret;
 +                      break;
 +              }
 +              /* VM_FAULT_RETRY triggered, so seek to the faulting offset */
 +              pages += ret;
 +              start += ret << PAGE_SHIFT;
 +
 +              /*
 +               * Repeat on the address that fired VM_FAULT_RETRY
 +               * without FAULT_FLAG_ALLOW_RETRY but with
 +               * FAULT_FLAG_TRIED.
 +               */
 +              *locked = 1;
 +              lock_dropped = true;
 +              down_read(&mm->mmap_sem);
 +              ret = __get_user_pages(tsk, mm, start, 1, flags | FOLL_TRIED,
 +                                     pages, NULL, NULL);
 +              if (ret != 1) {
 +                      BUG_ON(ret > 1);
 +                      if (!pages_done)
 +                              pages_done = ret;
 +                      break;
 +              }
 +              nr_pages--;
 +              pages_done++;
 +              if (!nr_pages)
 +                      break;
 +              pages++;
 +              start += PAGE_SIZE;
 +      }
 +      if (notify_drop && lock_dropped && *locked) {
 +              /*
 +               * We must let the caller know we temporarily dropped the lock
 +               * and so the critical section protected by it was lost.
 +               */
 +              up_read(&mm->mmap_sem);
 +              *locked = 0;
 +      }
 +      return pages_done;
 +}
 +
 +/*
 + * We can leverage the VM_FAULT_RETRY functionality in the page fault
 + * paths better by using either get_user_pages_locked() or
 + * get_user_pages_unlocked().
 + *
 + * get_user_pages_locked() is suitable to replace the form:
 + *
 + *      down_read(&mm->mmap_sem);
 + *      do_something()
 + *      get_user_pages(tsk, mm, ..., pages, NULL);
 + *      up_read(&mm->mmap_sem);
 + *
 + *  to:
 + *
 + *      int locked = 1;
 + *      down_read(&mm->mmap_sem);
 + *      do_something()
 + *      get_user_pages_locked(tsk, mm, ..., pages, &locked);
 + *      if (locked)
 + *          up_read(&mm->mmap_sem);
 + */
 +long get_user_pages_locked(struct task_struct *tsk, struct mm_struct *mm,
 +                         unsigned long start, unsigned long nr_pages,
 +                         int write, int force, struct page **pages,
 +                         int *locked)
 +{
 +      return __get_user_pages_locked(tsk, mm, start, nr_pages, write, force,
 +                                     pages, NULL, locked, true, FOLL_TOUCH);
 +}
 +EXPORT_SYMBOL(get_user_pages_locked);
 +
 +/*
 + * Same as get_user_pages_unlocked(...., FOLL_TOUCH) but it allows to
 + * pass additional gup_flags as last parameter (like FOLL_HWPOISON).
 + *
 + * NOTE: here FOLL_TOUCH is not set implicitly and must be set by the
 + * caller if required (just like with __get_user_pages). "FOLL_GET",
 + * "FOLL_WRITE" and "FOLL_FORCE" are set implicitly as needed
 + * according to the parameters "pages", "write", "force"
 + * respectively.
 + */
 +__always_inline long __get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm,
 +                                             unsigned long start, unsigned long nr_pages,
 +                                             int write, int force, struct page **pages,
 +                                             unsigned int gup_flags)
 +{
 +      long ret;
 +      int locked = 1;
 +      down_read(&mm->mmap_sem);
 +      ret = __get_user_pages_locked(tsk, mm, start, nr_pages, write, force,
 +                                    pages, NULL, &locked, false, gup_flags);
 +      if (locked)
 +              up_read(&mm->mmap_sem);
 +      return ret;
 +}
 +EXPORT_SYMBOL(__get_user_pages_unlocked);
 +
 +/*
 + * get_user_pages_unlocked() is suitable to replace the form:
 + *
 + *      down_read(&mm->mmap_sem);
 + *      get_user_pages(tsk, mm, ..., pages, NULL);
 + *      up_read(&mm->mmap_sem);
 + *
 + *  with:
 + *
 + *      get_user_pages_unlocked(tsk, mm, ..., pages);
 + *
 + * It is functionally equivalent to get_user_pages_fast so
 + * get_user_pages_fast should be used instead, if the two parameters
 + * "tsk" and "mm" are respectively equal to current and current->mm,
 + * or if "force" shall be set to 1 (get_user_pages_fast misses the
 + * "force" parameter).
 + */
 +long get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm,
 +                           unsigned long start, unsigned long nr_pages,
 +                           int write, int force, struct page **pages)
 +{
 +      return __get_user_pages_unlocked(tsk, mm, start, nr_pages, write,
 +                                       force, pages, FOLL_TOUCH);
 +}
 +EXPORT_SYMBOL(get_user_pages_unlocked);
 +
  /*
   * get_user_pages() - pin user pages in memory
   * @tsk:      the task_struct to use for page fault accounting, or
   * use the correct cache flushing APIs.
   *
   * See also get_user_pages_fast, for performance critical applications.
 + *
 + * get_user_pages should be phased out in favor of
 + * get_user_pages_locked|unlocked or get_user_pages_fast. Nothing
 + * should use get_user_pages because it cannot pass
 + * FAULT_FLAG_ALLOW_RETRY to handle_mm_fault.
   */
  long get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
                unsigned long start, unsigned long nr_pages, int write,
                int force, struct page **pages, struct vm_area_struct **vmas)
  {
 -      int flags = FOLL_TOUCH;
 -
 -      if (pages)
 -              flags |= FOLL_GET;
 -      if (write)
 -              flags |= FOLL_WRITE;
 -      if (force)
 -              flags |= FOLL_FORCE;
 -
 -      return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas,
 -                              NULL);
 +      return __get_user_pages_locked(tsk, mm, start, nr_pages, write, force,
 +                                     pages, vmas, NULL, false, FOLL_TOUCH);
  }
  EXPORT_SYMBOL(get_user_pages);
  
@@@ -906,10 -740,10 +906,10 @@@ static int gup_pte_range(pmd_t pmd, uns
  
                /*
                 * Similar to the PMD case below, NUMA hinting must take slow
 -               * path
 +               * path using the pte_protnone check.
                 */
                if (!pte_present(pte) || pte_special(pte) ||
 -                      pte_numa(pte) || (write && !pte_write(pte)))
 +                      pte_protnone(pte) || (write && !pte_write(pte)))
                        goto pte_unmap;
  
                VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
@@@ -1092,7 -926,7 +1092,7 @@@ static int gup_pmd_range(pud_t pud, uns
  
        pmdp = pmd_offset(&pud, addr);
        do {
-               pmd_t pmd = ACCESS_ONCE(*pmdp);
+               pmd_t pmd = READ_ONCE(*pmdp);
  
                next = pmd_addr_end(addr, end);
                if (pmd_none(pmd) || pmd_trans_splitting(pmd))
                         * slowpath for accounting purposes and so that they
                         * can be serialised against THP migration.
                         */
 -                      if (pmd_numa(pmd))
 +                      if (pmd_protnone(pmd))
                                return 0;
  
                        if (!gup_huge_pmd(pmd, pmdp, addr, next, write,
@@@ -1243,8 -1077,10 +1243,8 @@@ int get_user_pages_fast(unsigned long s
                start += nr << PAGE_SHIFT;
                pages += nr;
  
 -              down_read(&mm->mmap_sem);
 -              ret = get_user_pages(current, mm, start,
 -                                   nr_pages - nr, write, 0, pages, NULL);
 -              up_read(&mm->mmap_sem);
 +              ret = get_user_pages_unlocked(current, mm, start,
 +                                            nr_pages - nr, write, 0, pages);
  
                /* Have to be a bit careful with return values */
                if (nr > 0) {