thp: rewrite freeze_page()/unfreeze_page() with generic rmap walkers
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 11d1567..4a58fa1 100644
@@ -2977,7 +2977,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
 }
 
 void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
-               unsigned long address)
+               unsigned long address, bool freeze)
 {
        spinlock_t *ptl;
        struct mm_struct *mm = vma->vm_mm;
@@ -2994,7 +2994,7 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
                        page = NULL;
        } else if (!pmd_devmap(*pmd))
                goto out;
-       __split_huge_pmd_locked(vma, pmd, haddr, false);
+       __split_huge_pmd_locked(vma, pmd, haddr, freeze);
 out:
        spin_unlock(ptl);
        mmu_notifier_invalidate_range_end(mm, haddr, haddr + HPAGE_PMD_SIZE);
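
Callers that only need a plain split keep going through the split_huge_pmd() helper, which presumably now supplies freeze=false to __split_huge_pmd(). A sketch of the include/linux/huge_mm.h wrapper assumed here (not part of this per-file diff):

/* sketch of the assumed wrapper in include/linux/huge_mm.h */
#define split_huge_pmd(__vma, __pmd, __address)				\
	do {								\
		pmd_t *____pmd = (__pmd);				\
		if (pmd_trans_huge(*____pmd) || pmd_devmap(*____pmd))	\
			__split_huge_pmd(__vma, __pmd, __address, false); \
	} while (0)
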
@@ -3006,7 +3006,8 @@ out:
        }
 }
 
-void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address)
+void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address,
+               bool freeze, struct page *page)
 {
        pgd_t *pgd;
        pud_t *pud;
@@ -3023,11 +3024,20 @@ void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address)
        pmd = pmd_offset(pud, address);
        if (!pmd_present(*pmd) || (!pmd_trans_huge(*pmd) && !pmd_devmap(*pmd)))
                return;
+
+       /*
+        * If the caller asks to set up migration entries, we need a page to
+        * check the pmd against. Otherwise we can end up replacing the wrong page.
+        */
+       VM_BUG_ON(freeze && !page);
+       if (page && page != pmd_page(*pmd))
+               return;
+
        /*
         * Caller holds the mmap_sem write mode, so a huge pmd cannot
         * materialize from under us.
         */
-       split_huge_pmd(vma, pmd, address);
+       __split_huge_pmd(vma, pmd, address, freeze);
 }
 
 void vma_adjust_trans_huge(struct vm_area_struct *vma,
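
The new freeze and page arguments are meant to be driven from the generic rmap walk rather than from an open-coded VMA scan. A sketch of the companion mm/rmap.c hunk from the same series (an assumption, since this per-file view only shows huge_memory.c), showing how try_to_unmap_one() is expected to call in:

	/* sketch: assumed companion change in try_to_unmap_one(), mm/rmap.c */
	if (flags & TTU_SPLIT_HUGE_PMD) {
		/*
		 * Split the pmd mapping @address; when the walk is a
		 * migration-style unmap (as in freeze_page() below), freeze
		 * the huge pmd straight into migration entries for @page.
		 */
		split_huge_pmd_address(vma, address,
				flags & TTU_MIGRATION, page);
	}
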
@@ -3043,7 +3053,7 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma,
        if (start & ~HPAGE_PMD_MASK &&
            (start & HPAGE_PMD_MASK) >= vma->vm_start &&
            (start & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end)
-               split_huge_pmd_address(vma, start);
+               split_huge_pmd_address(vma, start, false, NULL);
 
        /*
         * If the new end address isn't hpage aligned and it could
@@ -3053,7 +3063,7 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma,
        if (end & ~HPAGE_PMD_MASK &&
            (end & HPAGE_PMD_MASK) >= vma->vm_start &&
            (end & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end)
-               split_huge_pmd_address(vma, end);
+               split_huge_pmd_address(vma, end, false, NULL);
 
        /*
         * If we're also updating the vma->vm_next->vm_start, if the new
@@ -3067,184 +3077,36 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma,
                if (nstart & ~HPAGE_PMD_MASK &&
                    (nstart & HPAGE_PMD_MASK) >= next->vm_start &&
                    (nstart & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= next->vm_end)
-                       split_huge_pmd_address(next, nstart);
+                       split_huge_pmd_address(next, nstart, false, NULL);
        }
 }
 
-static void freeze_page_vma(struct vm_area_struct *vma, struct page *page,
-               unsigned long address)
+static void freeze_page(struct page *page)
 {
-       unsigned long haddr = address & HPAGE_PMD_MASK;
-       spinlock_t *ptl;
-       pgd_t *pgd;
-       pud_t *pud;
-       pmd_t *pmd;
-       pte_t *pte;
-       int i, nr = HPAGE_PMD_NR;
-
-       /* Skip pages which doesn't belong to the VMA */
-       if (address < vma->vm_start) {
-               int off = (vma->vm_start - address) >> PAGE_SHIFT;
-               page += off;
-               nr -= off;
-               address = vma->vm_start;
-       }
-
-       pgd = pgd_offset(vma->vm_mm, address);
-       if (!pgd_present(*pgd))
-               return;
-       pud = pud_offset(pgd, address);
-       if (!pud_present(*pud))
-               return;
-       pmd = pmd_offset(pud, address);
-       ptl = pmd_lock(vma->vm_mm, pmd);
-       if (!pmd_present(*pmd)) {
-               spin_unlock(ptl);
-               return;
-       }
-       if (pmd_trans_huge(*pmd)) {
-               if (page == pmd_page(*pmd))
-                       __split_huge_pmd_locked(vma, pmd, haddr, true);
-               spin_unlock(ptl);
-               return;
-       }
-       spin_unlock(ptl);
-
-       pte = pte_offset_map_lock(vma->vm_mm, pmd, address, &ptl);
-       for (i = 0; i < nr; i++, address += PAGE_SIZE, page++, pte++) {
-               pte_t entry, swp_pte;
-               swp_entry_t swp_entry;
-
-               /*
-                * We've just crossed page table boundary: need to map next one.
-                * It can happen if THP was mremaped to non PMD-aligned address.
-                */
-               if (unlikely(address == haddr + HPAGE_PMD_SIZE)) {
-                       pte_unmap_unlock(pte - 1, ptl);
-                       pmd = mm_find_pmd(vma->vm_mm, address);
-                       if (!pmd)
-                               return;
-                       pte = pte_offset_map_lock(vma->vm_mm, pmd,
-                                       address, &ptl);
-               }
-
-               if (!pte_present(*pte))
-                       continue;
-               if (page_to_pfn(page) != pte_pfn(*pte))
-                       continue;
-               flush_cache_page(vma, address, page_to_pfn(page));
-               entry = ptep_clear_flush(vma, address, pte);
-               if (pte_dirty(entry))
-                       SetPageDirty(page);
-               swp_entry = make_migration_entry(page, pte_write(entry));
-               swp_pte = swp_entry_to_pte(swp_entry);
-               if (pte_soft_dirty(entry))
-                       swp_pte = pte_swp_mksoft_dirty(swp_pte);
-               set_pte_at(vma->vm_mm, address, pte, swp_pte);
-               page_remove_rmap(page, false);
-               put_page(page);
-       }
-       pte_unmap_unlock(pte - 1, ptl);
-}
-
-static void freeze_page(struct anon_vma *anon_vma, struct page *page)
-{
-       struct anon_vma_chain *avc;
-       pgoff_t pgoff = page_to_pgoff(page);
+       enum ttu_flags ttu_flags = TTU_MIGRATION | TTU_IGNORE_MLOCK |
+               TTU_IGNORE_ACCESS | TTU_RMAP_LOCKED;
+       int i, ret;
 
        VM_BUG_ON_PAGE(!PageHead(page), page);
 
-       anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff,
-                       pgoff + HPAGE_PMD_NR - 1) {
-               unsigned long address = __vma_address(page, avc->vma);
-
-               mmu_notifier_invalidate_range_start(avc->vma->vm_mm,
-                               address, address + HPAGE_PMD_SIZE);
-               freeze_page_vma(avc->vma, page, address);
-               mmu_notifier_invalidate_range_end(avc->vma->vm_mm,
-                               address, address + HPAGE_PMD_SIZE);
-       }
-}
-
-static void unfreeze_page_vma(struct vm_area_struct *vma, struct page *page,
-               unsigned long address)
-{
-       spinlock_t *ptl;
-       pmd_t *pmd;
-       pte_t *pte, entry;
-       swp_entry_t swp_entry;
-       unsigned long haddr = address & HPAGE_PMD_MASK;
-       int i, nr = HPAGE_PMD_NR;
-
-       /* Skip pages which doesn't belong to the VMA */
-       if (address < vma->vm_start) {
-               int off = (vma->vm_start - address) >> PAGE_SHIFT;
-               page += off;
-               nr -= off;
-               address = vma->vm_start;
-       }
-
-       pmd = mm_find_pmd(vma->vm_mm, address);
-       if (!pmd)
-               return;
-
-       pte = pte_offset_map_lock(vma->vm_mm, pmd, address, &ptl);
-       for (i = 0; i < nr; i++, address += PAGE_SIZE, page++, pte++) {
-               /*
-                * We've just crossed page table boundary: need to map next one.
-                * It can happen if THP was mremaped to non-PMD aligned address.
-                */
-               if (unlikely(address == haddr + HPAGE_PMD_SIZE)) {
-                       pte_unmap_unlock(pte - 1, ptl);
-                       pmd = mm_find_pmd(vma->vm_mm, address);
-                       if (!pmd)
-                               return;
-                       pte = pte_offset_map_lock(vma->vm_mm, pmd,
-                                       address, &ptl);
-               }
-
-               if (!is_swap_pte(*pte))
-                       continue;
-
-               swp_entry = pte_to_swp_entry(*pte);
-               if (!is_migration_entry(swp_entry))
-                       continue;
-               if (migration_entry_to_page(swp_entry) != page)
-                       continue;
-
-               get_page(page);
-               page_add_anon_rmap(page, vma, address, false);
-
-               entry = pte_mkold(mk_pte(page, vma->vm_page_prot));
-               if (PageDirty(page))
-                       entry = pte_mkdirty(entry);
-               if (is_write_migration_entry(swp_entry))
-                       entry = maybe_mkwrite(entry, vma);
-
-               flush_dcache_page(page);
-               set_pte_at(vma->vm_mm, address, pte, entry);
+       /* We only need TTU_SPLIT_HUGE_PMD once */
+       ret = try_to_unmap(page, ttu_flags | TTU_SPLIT_HUGE_PMD);
+       for (i = 1; !ret && i < HPAGE_PMD_NR; i++) {
+               /* Cut short if the page is unmapped */
+               if (page_count(page) == 1)
+                       return;
 
-               /* No need to invalidate - it was non-present before */
-               update_mmu_cache(vma, address, pte);
+               ret = try_to_unmap(page + i, ttu_flags);
        }
-       pte_unmap_unlock(pte - 1, ptl);
+       VM_BUG_ON(ret);
 }
 
-static void unfreeze_page(struct anon_vma *anon_vma, struct page *page)
+static void unfreeze_page(struct page *page)
 {
-       struct anon_vma_chain *avc;
-       pgoff_t pgoff = page_to_pgoff(page);
-
-       anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root,
-                       pgoff, pgoff + HPAGE_PMD_NR - 1) {
-               unsigned long address = __vma_address(page, avc->vma);
+       int i;
 
-               mmu_notifier_invalidate_range_start(avc->vma->vm_mm,
-                               address, address + HPAGE_PMD_SIZE);
-               unfreeze_page_vma(avc->vma, page, address);
-               mmu_notifier_invalidate_range_end(avc->vma->vm_mm,
-                               address, address + HPAGE_PMD_SIZE);
-       }
+       for (i = 0; i < HPAGE_PMD_NR; i++)
+               remove_migration_ptes(page + i, page + i, true);
 }
 
 static void __split_huge_page_tail(struct page *head, int tail,
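
Both new helpers rely on the rmap locking done by their only caller: TTU_RMAP_LOCKED tells try_to_unmap() that the anon_vma lock is already held, and the final true passed to remove_migration_ptes() means the same on the restore side. A simplified sketch of the assumed calling sequence in split_huge_page_to_list() (pin checks and error handling abbreviated):

	/* simplified sketch of the assumed caller, split_huge_page_to_list() */
	anon_vma = page_get_anon_vma(head);
	anon_vma_lock_write(anon_vma);	/* rmap write lock held from here on */
	freeze_page(head);		/* hence TTU_RMAP_LOCKED above */

	/* ...pin checks under split_queue_lock, then either... */
	__split_huge_page(page, list);	/* success: remaps subpages via unfreeze_page(head) */
	/* ...or, on failure (see the last hunk): */
	unfreeze_page(head);

	anon_vma_unlock_write(anon_vma);
	put_anon_vma(anon_vma);
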
@@ -3322,7 +3184,7 @@ static void __split_huge_page(struct page *page, struct list_head *list)
        ClearPageCompound(head);
        spin_unlock_irq(&zone->lru_lock);
 
-       unfreeze_page(page_anon_vma(head), head);
+       unfreeze_page(head);
 
        for (i = 0; i < HPAGE_PMD_NR; i++) {
                struct page *subpage = head + i;
@@ -3418,7 +3280,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
        }
 
        mlocked = PageMlocked(page);
-       freeze_page(anon_vma, head);
+       freeze_page(head);
        VM_BUG_ON_PAGE(compound_mapcount(head), head);
 
        /* Make sure the page is not on per-CPU pagevec as it takes pin */
@@ -3447,7 +3309,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
                BUG();
        } else {
                spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
-               unfreeze_page(anon_vma, head);
+               unfreeze_page(head);
                ret = -EBUSY;
        }