mm: numa: add paranoid check around pte_protnone_numa
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 8897131..8e07342 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -761,15 +761,6 @@ static inline gfp_t alloc_hugepage_gfpmask(int defrag, gfp_t extra_gfp)
        return (GFP_TRANSHUGE & ~(defrag ? 0 : __GFP_WAIT)) | extra_gfp;
 }
 
-static inline struct page *alloc_hugepage_vma(int defrag,
-                                             struct vm_area_struct *vma,
-                                             unsigned long haddr, int nd,
-                                             gfp_t extra_gfp)
-{
-       return alloc_pages_vma(alloc_hugepage_gfpmask(defrag, extra_gfp),
-                              HPAGE_PMD_ORDER, vma, haddr, nd);
-}
-
 /* Caller must hold page table lock. */
 static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
                struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd,
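The inline helper removed above is not lost: its callers below now compute the gfp mask themselves via alloc_hugepage_gfpmask() and pass it to a replacement alloc_hugepage_vma() that takes the mask and order directly. The replacement lives outside this file; a minimal sketch of what it plausibly looks like, assuming it still forwards to alloc_pages_vma() on the local node:

        /*
         * Sketch only, not part of this diff: the real definition moved
         * out of huge_memory.c and may differ in detail.
         */
        static inline struct page *alloc_hugepage_vma(gfp_t gfp,
                                                      struct vm_area_struct *vma,
                                                      unsigned long addr, int order)
        {
                return alloc_pages_vma(gfp, order, vma, addr, numa_node_id());
        }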
@@ -790,6 +781,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
                               unsigned long address, pmd_t *pmd,
                               unsigned int flags)
 {
+       gfp_t gfp;
        struct page *page;
        unsigned long haddr = address & HPAGE_PMD_MASK;
 
@@ -824,8 +816,8 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
                }
                return 0;
        }
-       page = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
-                       vma, haddr, numa_node_id(), 0);
+       gfp = alloc_hugepage_gfpmask(transparent_hugepage_defrag(vma), 0);
+       page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER);
        if (unlikely(!page)) {
                count_vm_event(THP_FAULT_FALLBACK);
                return VM_FAULT_FALLBACK;
@@ -1113,10 +1105,12 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
        spin_unlock(ptl);
 alloc:
        if (transparent_hugepage_enabled(vma) &&
-           !transparent_hugepage_debug_cow())
-               new_page = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
-                                             vma, haddr, numa_node_id(), 0);
-       else
+           !transparent_hugepage_debug_cow()) {
+               gfp_t gfp;
+
+               gfp = alloc_hugepage_gfpmask(transparent_hugepage_defrag(vma), 0);
+               new_page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER);
+       } else
                new_page = NULL;
 
        if (unlikely(!new_page)) {
@@ -1217,7 +1211,7 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
                return ERR_PTR(-EFAULT);
 
        /* Full NUMA hinting faults to serialise migration in fault paths */
-       if ((flags & FOLL_NUMA) && pmd_numa(*pmd))
+       if ((flags & FOLL_NUMA) && pmd_protnone(*pmd))
                goto out;
 
        page = pmd_page(*pmd);
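pmd_numa() is gone because NUMA hinting entries are no longer tagged with a special _PAGE_NUMA bit; they are ordinary page table entries whose access permissions have been stripped, PROT_NONE style, and pmd_protnone() detects exactly that. As a sketch (from the x86 side of this series, quoted from memory and possibly inexact):

        /* Entry has the PROT_NONE encoding but is not actually present. */
        static inline int pmd_protnone(pmd_t pmd)
        {
                return (pmd_flags(pmd) & (_PAGE_PROTNONE | _PAGE_PRESENT))
                        == _PAGE_PROTNONE;
        }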
@@ -1268,6 +1262,9 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
        bool migrated = false;
        int flags = 0;
 
+       /* A PROT_NONE fault should not end up here */
+       BUG_ON(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)));
+
        ptl = pmd_lock(mm, pmdp);
        if (unlikely(!pmd_same(pmd, *pmdp)))
                goto out_unlock;
@@ -1278,8 +1275,9 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
         * check_same as the page may no longer be mapped.
         */
        if (unlikely(pmd_trans_migrating(*pmdp))) {
+               page = pmd_page(*pmdp);
                spin_unlock(ptl);
-               wait_migrate_huge_page(vma->anon_vma, pmdp);
+               wait_on_page_locked(page);
                goto out;
        }
 
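The BUG_ON added above is the "paranoid check" of the subject line: a VMA with none of VM_READ, VM_WRITE or VM_EXEC is a genuine PROT_NONE mapping, so a fault on it must never be handled as a NUMA hint; reaching this function with such a VMA means a hinting entry leaked into it. The migration wait also got simpler: THP migration holds the head page lock until it completes, so taking the page pointer under ptl and calling wait_on_page_locked() replaces wait_migrate_huge_page() without touching the anon_vma. pmd_trans_migrating() is, roughly (sketch, defined elsewhere):

        /* Sketch: a THP being migrated is detected by its locked head page. */
        static inline int pmd_trans_migrating(pmd_t pmd)
        {
                struct page *page = pmd_page(pmd);
                return PageLocked(page);
        }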
@@ -1347,7 +1345,7 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
 
        /*
         * Migrate the THP to the requested node, returns with page unlocked
-        * and pmd_numa cleared.
+        * and access rights restored.
         */
        spin_unlock(ptl);
        migrated = migrate_misplaced_transhuge_page(mm, vma,
@@ -1360,9 +1358,8 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
        goto out;
 clear_pmdnuma:
        BUG_ON(!PageLocked(page));
-       pmd = pmd_mknonnuma(pmd);
+       pmd = pmd_modify(pmd, vma->vm_page_prot);
        set_pmd_at(mm, haddr, pmdp, pmd);
-       VM_BUG_ON(pmd_numa(*pmdp));
        update_mmu_cache_pmd(vma, addr, pmdp);
        unlock_page(page);
 out_unlock:
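With hinting entries expressed as plain protections, arming and disarming them is two applications of pmd_modify() rather than the dedicated pmd_mknonnuma()/pmdp_set_numa() helpers. Schematically (not literal code from this diff):

        /* Arm a NUMA hinting fault: strip the access rights. */
        pmd = pmd_modify(pmd, PAGE_NONE);

        /* Disarm it, as clear_pmdnuma now does: restore the VMA protection. */
        pmd = pmd_modify(pmd, vma->vm_page_prot);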
@@ -1418,26 +1415,6 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
        return ret;
 }
 
-int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
-               unsigned long addr, unsigned long end,
-               unsigned char *vec)
-{
-       spinlock_t *ptl;
-       int ret = 0;
-
-       if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
-               /*
-                * All logical pages in the range are present
-                * if backed by a huge page.
-                */
-               spin_unlock(ptl);
-               memset(vec, 1, (end - addr) >> PAGE_SHIFT);
-               ret = 1;
-       }
-
-       return ret;
-}
-
 int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
                  unsigned long old_addr,
                  unsigned long new_addr, unsigned long old_end,
@@ -1505,30 +1482,23 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
 
        if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
                pmd_t entry;
-               ret = 1;
-               if (!prot_numa) {
-                       entry = pmdp_get_and_clear_notify(mm, addr, pmd);
-                       if (pmd_numa(entry))
-                               entry = pmd_mknonnuma(entry);
-                       entry = pmd_modify(entry, newprot);
-                       ret = HPAGE_PMD_NR;
-                       set_pmd_at(mm, addr, pmd, entry);
-                       BUG_ON(pmd_write(entry));
-               } else {
-                       struct page *page = pmd_page(*pmd);
 
-                       /*
-                        * Do not trap faults against the zero page. The
-                        * read-only data is likely to be read-cached on the
-                        * local CPU cache and it is less useful to know about
-                        * local vs remote hits on the zero page.
-                        */
-                       if (!is_huge_zero_page(page) &&
-                           !pmd_numa(*pmd)) {
-                               pmdp_set_numa(mm, addr, pmd);
-                               ret = HPAGE_PMD_NR;
-                       }
+               /*
+                * Avoid trapping faults against the zero page. The read-only
+                * data is likely to be read-cached on the local CPU and
+                * local/remote hits to the zero page are not interesting.
+                */
+               if (prot_numa && is_huge_zero_pmd(*pmd)) {
+                       spin_unlock(ptl);
+                       return 0;
                }
+
+               ret = 1;
+               entry = pmdp_get_and_clear_notify(mm, addr, pmd);
+               entry = pmd_modify(entry, newprot);
+               ret = HPAGE_PMD_NR;
+               set_pmd_at(mm, addr, pmd, entry);
+               BUG_ON(pmd_write(entry));
                spin_unlock(ptl);
        }
 
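The prot_numa path of change_huge_pmd() is driven by the NUMA balancing scanner, which requests PAGE_NONE; bailing out with 0 for the huge zero page means no hinting fault is armed there and nothing is added to the update count. For context, the caller looked roughly like this at the time (sketch of mm/mempolicy.c, quoted from memory):

        /* Sketch: NUMA balancing arms hinting faults via PAGE_NONE. */
        unsigned long change_prot_numa(struct vm_area_struct *vma,
                        unsigned long addr, unsigned long end)
        {
                int nr_updated;

                nr_updated = change_protection(vma, addr, end, PAGE_NONE, 0, 1);
                if (nr_updated)
                        count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);

                return nr_updated;
        }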
@@ -1792,9 +1762,9 @@ static int __split_huge_page_map(struct page *page,
                        pte_t *pte, entry;
                        BUG_ON(PageCompound(page+i));
                        /*
-                        * Note that pmd_numa is not transferred deliberately
-                        * to avoid any possibility that pte_numa leaks to
-                        * a PROT_NONE VMA by accident.
+                        * Note that NUMA hinting access restrictions are not
+                        * transferred to avoid any possibility of altering
+                        * permissions across VMAs.
                         */
                        entry = mk_pte(page + i, vma->vm_page_prot);
                        entry = maybe_mkwrite(pte_mkdirty(entry), vma);
@@ -2143,7 +2113,8 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
 {
        struct page *page;
        pte_t *_pte;
-       int referenced = 0, none = 0;
+       int none = 0;
+       bool referenced = false, writable = false;
        for (_pte = pte; _pte < pte+HPAGE_PMD_NR;
             _pte++, address += PAGE_SIZE) {
                pte_t pteval = *_pte;
@@ -2153,7 +2124,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
                        else
                                goto out;
                }
-               if (!pte_present(pteval) || !pte_write(pteval))
+               if (!pte_present(pteval))
                        goto out;
                page = vm_normal_page(vma, address, pteval);
                if (unlikely(!page))
@@ -2163,9 +2134,6 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
                VM_BUG_ON_PAGE(!PageAnon(page), page);
                VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
 
-               /* cannot use mapcount: can't collapse if there's a gup pin */
-               if (page_count(page) != 1)
-                       goto out;
                /*
                 * We can do it before isolate_lru_page because the
                 * page can't be freed from under us. NOTE: PG_lock
@@ -2174,6 +2142,29 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
                 */
                if (!trylock_page(page))
                        goto out;
+
+               /*
+                * cannot use mapcount: can't collapse if there's a gup pin.
+                * The page must only be referenced by the scanned process
+                * and page swap cache.
+                */
+               if (page_count(page) != 1 + !!PageSwapCache(page)) {
+                       unlock_page(page);
+                       goto out;
+               }
+               if (pte_write(pteval)) {
+                       writable = true;
+               } else {
+                       if (PageSwapCache(page) && !reuse_swap_page(page)) {
+                               unlock_page(page);
+                               goto out;
+                       }
+                       /*
+                        * Page is not in the swap cache. It can be collapsed
+                        * into a THP.
+                        */
+               }
+
                /*
                 * Isolate the page to avoid collapsing an hugepage
                 * currently in use by the VM.
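The reference count check above has simple arithmetic behind it: the scanned process's single mapping accounts for one reference and, if the page is still in swap cache, the swap cache holds exactly one more, so the expected count is 1 + !!PageSwapCache(page). Restating the check with the cases enumerated:

        /*
         * mapped once, not in swap cache:    expect page_count(page) == 1
         * mapped once, still in swap cache:  expect page_count(page) == 2
         * any gup pin or other user raises the count further, so the
         * comparison fails and the collapse backs off.
         */
        if (page_count(page) != 1 + !!PageSwapCache(page))
                goto out;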
@@ -2190,9 +2181,9 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
                /* If there is no mapped pte young don't collapse the page */
                if (pte_young(pteval) || PageReferenced(page) ||
                    mmu_notifier_test_young(vma->vm_mm, address))
-                       referenced = 1;
+                       referenced = true;
        }
-       if (likely(referenced))
+       if (likely(referenced && writable))
                return 1;
 out:
        release_pte_pages(pte, _pte);
@@ -2545,11 +2536,12 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
 {
        pmd_t *pmd;
        pte_t *pte, *_pte;
-       int ret = 0, referenced = 0, none = 0;
+       int ret = 0, none = 0;
        struct page *page;
        unsigned long _address;
        spinlock_t *ptl;
        int node = NUMA_NO_NODE;
+       bool writable = false, referenced = false;
 
        VM_BUG_ON(address & ~HPAGE_PMD_MASK);
 
@@ -2568,8 +2560,11 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
                        else
                                goto out_unmap;
                }
-               if (!pte_present(pteval) || !pte_write(pteval))
+               if (!pte_present(pteval))
                        goto out_unmap;
+               if (pte_write(pteval))
+                       writable = true;
+
                page = vm_normal_page(vma, _address, pteval);
                if (unlikely(!page))
                        goto out_unmap;
@@ -2586,14 +2581,18 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
                VM_BUG_ON_PAGE(PageCompound(page), page);
                if (!PageLRU(page) || PageLocked(page) || !PageAnon(page))
                        goto out_unmap;
-               /* cannot use mapcount: can't collapse if there's a gup pin */
-               if (page_count(page) != 1)
+               /*
+                * cannot use mapcount: can't collapse if there's a gup pin.
+                * The page must only be referenced by the scanned process
+                * and page swap cache.
+                */
+               if (page_count(page) != 1 + !!PageSwapCache(page))
                        goto out_unmap;
                if (pte_young(pteval) || PageReferenced(page) ||
                    mmu_notifier_test_young(vma->vm_mm, address))
-                       referenced = 1;
+                       referenced = true;
        }
-       if (referenced)
+       if (referenced && writable)
                ret = 1;
 out_unmap:
        pte_unmap_unlock(pte, ptl);