diff --git a/mm/rmap.c b/mm/rmap.c
index aa68a40..79f3bf0 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
  * inode->i_mutex      (while writing or truncating, not reading or faulting)
  *   mm->mmap_sem
  *     page->flags PG_locked (lock_page)
- *       mapping->i_mmap_rwsem
- *         anon_vma->rwsem
- *           mm->page_table_lock or pte_lock
- *             zone->lru_lock (in mark_page_accessed, isolate_lru_page)
- *             swap_lock (in swap_duplicate, swap_info_get)
- *               mmlist_lock (in mmput, drain_mmlist and others)
- *               mapping->private_lock (in __set_page_dirty_buffers)
- *                 mem_cgroup_{begin,end}_page_stat (memcg->move_lock)
- *                   mapping->tree_lock (widely used)
- *               inode->i_lock (in set_page_dirty's __mark_inode_dirty)
- *               bdi.wb->list_lock (in set_page_dirty's __mark_inode_dirty)
- *                 sb_lock (within inode_lock in fs/fs-writeback.c)
- *                 mapping->tree_lock (widely used, in set_page_dirty,
- *                           in arch-dependent flush_dcache_mmap_lock,
- *                           within bdi.wb->list_lock in __sync_single_inode)
+ *       hugetlbfs_i_mmap_rwsem_key (in huge_pmd_share)
+ *         mapping->i_mmap_rwsem
+ *           anon_vma->rwsem
+ *             mm->page_table_lock or pte_lock
+ *               zone->lru_lock (in mark_page_accessed, isolate_lru_page)
+ *               swap_lock (in swap_duplicate, swap_info_get)
+ *                 mmlist_lock (in mmput, drain_mmlist and others)
+ *                 mapping->private_lock (in __set_page_dirty_buffers)
+ *                   mem_cgroup_{begin,end}_page_stat (memcg->move_lock)
+ *                     mapping->tree_lock (widely used)
+ *                 inode->i_lock (in set_page_dirty's __mark_inode_dirty)
+ *                 bdi.wb->list_lock (in set_page_dirty's __mark_inode_dirty)
+ *                   sb_lock (within inode_lock in fs/fs-writeback.c)
+ *                   mapping->tree_lock (widely used, in set_page_dirty,
+ *                             in arch-dependent flush_dcache_mmap_lock,
+ *                             within bdi.wb->list_lock in __sync_single_inode)
  *
  * anon_vma->rwsem,mapping->i_mutex      (memory_failure, collect_procs_anon)
  *   ->tasklist_lock
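
/*
 * Illustrative sketch (not part of the patch; the locking helpers are real,
 * the function itself is hypothetical): the hierarchy above means a lock may
 * only be taken while everything listed above it is already held, e.g. for
 * the i_mmap_rwsem -> anon_vma->rwsem -> page_table_lock portion:
 */
static void lock_order_sketch(struct address_space *mapping,
                              struct anon_vma *anon_vma,
                              struct mm_struct *mm)
{
        i_mmap_lock_read(mapping);              /* mapping->i_mmap_rwsem */
        anon_vma_lock_read(anon_vma);           /* anon_vma->rwsem */
        spin_lock(&mm->page_table_lock);        /* mm->page_table_lock */

        /* ... page table work ... */

        spin_unlock(&mm->page_table_lock);
        anon_vma_unlock_read(anon_vma);
        i_mmap_unlock_read(mapping);
}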
@@ -567,27 +568,6 @@ void page_unlock_anon_vma_read(struct anon_vma *anon_vma)
        anon_vma_unlock_read(anon_vma);
 }
 
-/*
- * At what user virtual address is page expected in @vma?
- */
-static inline unsigned long
-__vma_address(struct page *page, struct vm_area_struct *vma)
-{
-       pgoff_t pgoff = page_to_pgoff(page);
-       return vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
-}
-
-inline unsigned long
-vma_address(struct page *page, struct vm_area_struct *vma)
-{
-       unsigned long address = __vma_address(page, vma);
-
-       /* page should be within @vma mapping range */
-       VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma);
-
-       return address;
-}
-
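
/*
 * Context for the removal above (assumption, not visible in this diff):
 * __vma_address()/vma_address() are not dropped from the tree -- in the
 * upstream THP refcounting series they move to mm/internal.h so that
 * other mm/ code can share them.
 */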
 #ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
 static void percpu_flush_tlb_batch_pages(void *data)
 {
@@ -819,6 +799,96 @@ int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma)
        return 1;
 }
 
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+/*
+ * Check that @page is mapped at @address into @mm. In contrast to
+ * page_check_address(), this function can handle transparent huge pages.
+ *
+ * On success returns true with pte mapped and locked. For PMD-mapped
+ * transparent huge pages *@ptep is set to NULL.
+ */
+bool page_check_address_transhuge(struct page *page, struct mm_struct *mm,
+                                 unsigned long address, pmd_t **pmdp,
+                                 pte_t **ptep, spinlock_t **ptlp)
+{
+       pgd_t *pgd;
+       pud_t *pud;
+       pmd_t *pmd;
+       pte_t *pte;
+       spinlock_t *ptl;
+
+       if (unlikely(PageHuge(page))) {
+               /* when pud is not present, pte will be NULL */
+               pte = huge_pte_offset(mm, address);
+               if (!pte)
+                       return false;
+
+               ptl = huge_pte_lockptr(page_hstate(page), mm, pte);
+               pmd = NULL;
+               goto check_pte;
+       }
+
+       pgd = pgd_offset(mm, address);
+       if (!pgd_present(*pgd))
+               return false;
+       pud = pud_offset(pgd, address);
+       if (!pud_present(*pud))
+               return false;
+       pmd = pmd_offset(pud, address);
+
+       if (pmd_trans_huge(*pmd)) {
+               ptl = pmd_lock(mm, pmd);
+               if (!pmd_present(*pmd))
+                       goto unlock_pmd;
+               if (unlikely(!pmd_trans_huge(*pmd))) {
+                       spin_unlock(ptl);
+                       goto map_pte;
+               }
+
+               if (pmd_page(*pmd) != page)
+                       goto unlock_pmd;
+
+               pte = NULL;
+               goto found;
+unlock_pmd:
+               spin_unlock(ptl);
+               return false;
+       } else {
+               pmd_t pmde = *pmd;
+
+               barrier();
+               if (!pmd_present(pmde) || pmd_trans_huge(pmde))
+                       return false;
+       }
+map_pte:
+       pte = pte_offset_map(pmd, address);
+       if (!pte_present(*pte)) {
+               pte_unmap(pte);
+               return false;
+       }
+
+       ptl = pte_lockptr(mm, pmd);
+check_pte:
+       spin_lock(ptl);
+
+       if (!pte_present(*pte)) {
+               pte_unmap_unlock(pte, ptl);
+               return false;
+       }
+
+       /* THP can be referenced by any subpage */
+       if (pte_pfn(*pte) - page_to_pfn(page) >= hpage_nr_pages(page)) {
+               pte_unmap_unlock(pte, ptl);
+               return false;
+       }
+found:
+       *ptep = pte;
+       *pmdp = pmd;
+       *ptlp = ptl;
+       return true;
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
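
/*
 * Illustrative caller sketch (hypothetical function, not part of the
 * patch): on success the helper returns with *ptlp locked; *ptep is
 * non-NULL (and mapped) for PTE-mapped pages but NULL for PMD-mapped
 * THPs, so callers must pte_unmap() before dropping the lock:
 */
static bool page_is_young_sketch(struct page *page,
                                 struct vm_area_struct *vma,
                                 unsigned long address)
{
        pmd_t *pmd;
        pte_t *pte;
        spinlock_t *ptl;
        bool young;

        if (!page_check_address_transhuge(page, vma->vm_mm, address,
                                          &pmd, &pte, &ptl))
                return false;

        if (pte) {
                young = ptep_clear_flush_young_notify(vma, address, pte);
                pte_unmap(pte);
        } else {
                young = pmdp_clear_flush_young_notify(vma, address, pmd);
        }
        spin_unlock(ptl);

        return young;
}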
 struct page_referenced_arg {
        int mapcount;
        int referenced;
@@ -832,47 +902,24 @@ static int page_referenced_one(struct page *page, struct vm_area_struct *vma,
                        unsigned long address, void *arg)
 {
        struct mm_struct *mm = vma->vm_mm;
+       struct page_referenced_arg *pra = arg;
+       pmd_t *pmd;
+       pte_t *pte;
        spinlock_t *ptl;
        int referenced = 0;
-       struct page_referenced_arg *pra = arg;
 
-       if (unlikely(PageTransHuge(page))) {
-               pmd_t *pmd;
+       if (!page_check_address_transhuge(page, mm, address, &pmd, &pte, &ptl))
+               return SWAP_AGAIN;
 
-               /*
-                * rmap might return false positives; we must filter
-                * these out using page_check_address_pmd().
-                */
-               pmd = page_check_address_pmd(page, mm, address, &ptl);
-               if (!pmd)
-                       return SWAP_AGAIN;
-
-               if (vma->vm_flags & VM_LOCKED) {
-                       spin_unlock(ptl);
-                       pra->vm_flags |= VM_LOCKED;
-                       return SWAP_FAIL; /* To break the loop */
-               }
-
-               if (pmdp_clear_flush_young_notify(vma, address, pmd))
-                       referenced++;
+       if (vma->vm_flags & VM_LOCKED) {
+               if (pte)
+                       pte_unmap(pte);
                spin_unlock(ptl);
-       } else {
-               pte_t *pte;
-
-               /*
-                * rmap might return false positives; we must filter
-                * these out using page_check_address().
-                */
-               pte = page_check_address(page, mm, address, &ptl, 0);
-               if (!pte)
-                       return SWAP_AGAIN;
-
-               if (vma->vm_flags & VM_LOCKED) {
-                       pte_unmap_unlock(pte, ptl);
-                       pra->vm_flags |= VM_LOCKED;
-                       return SWAP_FAIL; /* To break the loop */
-               }
+               pra->vm_flags |= VM_LOCKED;
+               return SWAP_FAIL; /* To break the loop */
+       }
 
+       if (pte) {
                if (ptep_clear_flush_young_notify(vma, address, pte)) {
                        /*
                         * Don't treat a reference through a sequentially read
@@ -884,8 +931,15 @@ static int page_referenced_one(struct page *page, struct vm_area_struct *vma,
                        if (likely(!(vma->vm_flags & VM_SEQ_READ)))
                                referenced++;
                }
-               pte_unmap_unlock(pte, ptl);
+               pte_unmap(pte);
+       } else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
+               if (pmdp_clear_flush_young_notify(vma, address, pmd))
+                       referenced++;
+       } else {
+               /* unexpected pmd-mapped page? */
+               WARN_ON_ONCE(1);
        }
+       spin_unlock(ptl);
 
        if (referenced)
                clear_page_idle(page);
@@ -933,7 +987,7 @@ int page_referenced(struct page *page,
        int ret;
        int we_locked = 0;
        struct page_referenced_arg pra = {
-               .mapcount = page_mapcount(page),
+               .mapcount = total_mapcount(page),
                .memcg = memcg,
        };
        struct rmap_walk_control rwc = {
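
/*
 * Simplified sketch of total_mapcount() (assumption: modelled on the
 * helper this series adds in mm/huge_memory.c): unlike page_mapcount(),
 * it sums the PMD-level compound mapcount and the per-subpage PTE
 * mapcounts of a THP:
 */
static int total_mapcount_sketch(struct page *page)
{
        int i, ret;

        if (likely(!PageCompound(page)))
                return atomic_read(&page->_mapcount) + 1;

        ret = compound_mapcount(page);
        if (PageHuge(page))
                return ret;
        for (i = 0; i < HPAGE_PMD_NR; i++)
                ret += atomic_read(&page[i]._mapcount) + 1;
        /* with PG_double_map set, each subpage also counts the PMD map */
        if (PageDoubleMap(page))
                ret -= HPAGE_PMD_NR;
        return ret;
}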
@@ -1122,7 +1176,7 @@ static void __page_check_anon_rmap(struct page *page,
         * over the call to page_add_new_anon_rmap.
         */
        BUG_ON(page_anon_vma(page)->root != vma->anon_vma->root);
-       BUG_ON(page->index != linear_page_index(vma, address));
+       BUG_ON(page_to_pgoff(page) != linear_page_index(vma, address));
 #endif
 }
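
/*
 * Why page->index is no longer read directly above (sketch; assumption
 * based on the pagemap.h helper this series relies on): tail pages of a
 * compound page do not carry a valid ->index, so page_to_pgoff() derives
 * the offset from the head page:
 */
static pgoff_t page_to_pgoff_sketch(struct page *page)
{
        if (unlikely(PageHeadHuge(page)))
                return page->index << compound_order(page);

        if (likely(!PageTransTail(page)))
                return page->index;

        /* tail page: head page's offset plus the position within the THP */
        return compound_head(page)->index + (page - compound_head(page));
}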
 
@@ -1152,9 +1206,20 @@ void page_add_anon_rmap(struct page *page,
 void do_page_add_anon_rmap(struct page *page,
        struct vm_area_struct *vma, unsigned long address, int flags)
 {
-       int first = atomic_inc_and_test(&page->_mapcount);
+       bool compound = flags & RMAP_COMPOUND;
+       bool first;
+
+       if (compound) {
+               atomic_t *mapcount;
+               VM_BUG_ON_PAGE(!PageLocked(page), page);
+               VM_BUG_ON_PAGE(!PageTransHuge(page), page);
+               mapcount = compound_mapcount_ptr(page);
+               first = atomic_inc_and_test(mapcount);
+       } else {
+               first = atomic_inc_and_test(&page->_mapcount);
+       }
+
        if (first) {
-               bool compound = flags & RMAP_COMPOUND;
                int nr = compound ? hpage_nr_pages(page) : 1;
                /*
                 * We use the irq-unsafe __{inc|mod}_zone_page_stat because
@@ -1163,7 +1228,6 @@ void do_page_add_anon_rmap(struct page *page,
                 * disabled.
                 */
                if (compound) {
-                       VM_BUG_ON_PAGE(!PageTransHuge(page), page);
                        __inc_zone_page_state(page,
                                              NR_ANON_TRANSPARENT_HUGEPAGES);
                }
@@ -1173,6 +1237,7 @@ void do_page_add_anon_rmap(struct page *page,
                return;
 
        VM_BUG_ON_PAGE(!PageLocked(page), page);
+
        /* address might be in next vma when migration races vma_adjust */
        if (first)
                __page_set_anon_rmap(page, vma, address,
@@ -1199,10 +1264,16 @@ void page_add_new_anon_rmap(struct page *page,
 
        VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma);
        SetPageSwapBacked(page);
-       atomic_set(&page->_mapcount, 0); /* increment count (starts at -1) */
        if (compound) {
                VM_BUG_ON_PAGE(!PageTransHuge(page), page);
+               /* increment count (starts at -1) */
+               atomic_set(compound_mapcount_ptr(page), 0);
                __inc_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES);
+       } else {
+               /* Anon THP always mapped first with PMD */
+               VM_BUG_ON_PAGE(PageTransCompound(page), page);
+               /* increment count (starts at -1) */
+               atomic_set(&page->_mapcount, 0);
        }
        __mod_zone_page_state(page_zone(page), NR_ANON_PAGES, nr);
        __page_set_anon_rmap(page, vma, address, 1);
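
/*
 * For reference, a sketch of compound_mapcount_ptr() (assumption:
 * matches the include/linux/mm.h helper this series introduces): the
 * compound mapcount lives in the first tail page, leaving every
 * subpage's own _mapcount free to track 4k PTE mappings:
 */
static inline atomic_t *compound_mapcount_ptr_sketch(struct page *page)
{
        return &page[1].compound_mapcount;
}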
@@ -1232,12 +1303,15 @@ static void page_remove_file_rmap(struct page *page)
 
        memcg = mem_cgroup_begin_page_stat(page);
 
-       /* page still mapped by someone else? */
-       if (!atomic_add_negative(-1, &page->_mapcount))
+       /* Hugepages are not counted in NR_FILE_MAPPED for now. */
+       if (unlikely(PageHuge(page))) {
+               /* hugetlb pages are always mapped with pmds */
+               atomic_dec(compound_mapcount_ptr(page));
                goto out;
+       }
 
-       /* Hugepages are not counted in NR_FILE_MAPPED for now. */
-       if (unlikely(PageHuge(page)))
+       /* page still mapped by someone else? */
+       if (!atomic_add_negative(-1, &page->_mapcount))
                goto out;
 
        /*
@@ -1254,6 +1328,44 @@ out:
        mem_cgroup_end_page_stat(memcg);
 }
 
+static void page_remove_anon_compound_rmap(struct page *page)
+{
+       int i, nr;
+
+       if (!atomic_add_negative(-1, compound_mapcount_ptr(page)))
+               return;
+
+       /* Hugepages are not counted in NR_ANON_PAGES for now. */
+       if (unlikely(PageHuge(page)))
+               return;
+
+       if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
+               return;
+
+       __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES);
+
+       if (TestClearPageDoubleMap(page)) {
+               /*
+                * Subpages can be mapped with PTEs too. Check how many of
+                * them are still mapped.
+                */
+               for (i = 0, nr = 0; i < HPAGE_PMD_NR; i++) {
+                       if (atomic_add_negative(-1, &page[i]._mapcount))
+                               nr++;
+               }
+       } else {
+               nr = HPAGE_PMD_NR;
+       }
+
+       if (unlikely(PageMlocked(page)))
+               clear_page_mlock(page);
+
+       if (nr) {
+               __mod_zone_page_state(page_zone(page), NR_ANON_PAGES, -nr);
+               deferred_split_huge_page(page);
+       }
+}
+
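
/*
 * Note on PG_double_map as used above (assumption based on the series'
 * design): the flag is set once a PMD-mapped THP also gains PTE
 * mappings, and while it is set every subpage _mapcount carries an
 * extra reference for the PMD mapping.  Clearing it here means the loop
 * counts a subpage toward the NR_ANON_PAGES adjustment only if dropping
 * that extra reference leaves the subpage with no PTE mappings at all.
 */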
 /**
  * page_remove_rmap - take down pte mapping from a page
  * @page:      page to remove mapping from
@@ -1263,37 +1375,32 @@ out:
  */
 void page_remove_rmap(struct page *page, bool compound)
 {
-       int nr = compound ? hpage_nr_pages(page) : 1;
-
        if (!PageAnon(page)) {
                VM_BUG_ON_PAGE(compound && !PageHuge(page), page);
                page_remove_file_rmap(page);
                return;
        }
 
+       if (compound)
+               return page_remove_anon_compound_rmap(page);
+
        /* page still mapped by someone else? */
        if (!atomic_add_negative(-1, &page->_mapcount))
                return;
 
-       /* Hugepages are not counted in NR_ANON_PAGES for now. */
-       if (unlikely(PageHuge(page)))
-               return;
-
        /*
         * We use the irq-unsafe __{inc|mod}_zone_page_stat because
         * these counters are not modified in interrupt context, and
         * pte lock(a spinlock) is held, which implies preemption disabled.
         */
-       if (compound) {
-               VM_BUG_ON_PAGE(!PageTransHuge(page), page);
-               __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES);
-       }
-
-       __mod_zone_page_state(page_zone(page), NR_ANON_PAGES, -nr);
+       __dec_zone_page_state(page, NR_ANON_PAGES);
 
        if (unlikely(PageMlocked(page)))
                clear_page_mlock(page);
 
+       if (PageTransCompound(page))
+               deferred_split_huge_page(compound_head(page));
+
        /*
         * It would be tidy to reset the PageAnon mapping here,
         * but that might overwrite a racing page_add_anon_rmap
@@ -1305,6 +1412,11 @@ void page_remove_rmap(struct page *page, bool compound)
         */
 }
 
+struct rmap_private {
+       enum ttu_flags flags;
+       int lazyfreed;
+};
+
 /*
  * @arg: enum ttu_flags will be passed to this argument
  */
@@ -1316,7 +1428,8 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
        pte_t pteval;
        spinlock_t *ptl;
        int ret = SWAP_AGAIN;
-       enum ttu_flags flags = (enum ttu_flags)arg;
+       struct rmap_private *rp = arg;
+       enum ttu_flags flags = rp->flags;
 
        /* munlock has nothing to gain from examining un-locked vmas */
        if ((flags & TTU_MUNLOCK) && !(vma->vm_flags & VM_LOCKED))
@@ -1408,6 +1521,14 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
                 * See handle_pte_fault() ...
                 */
                VM_BUG_ON_PAGE(!PageSwapCache(page), page);
+
+               if (!PageDirty(page) && (flags & TTU_LZFREE)) {
+                       /* It's a freeable page by MADV_FREE */
+                       dec_mm_counter(mm, MM_ANONPAGES);
+                       rp->lazyfreed++;
+                       goto discard;
+               }
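
/*
 * Background for the TTU_LZFREE branch above (assumption: this is the
 * MADV_FREE series): userspace marks anonymous memory disposable with
 * madvise(buf, len, MADV_FREE); reclaim may then drop clean, undirtied
 * pages outright -- no swap I/O -- which is exactly the case handled
 * here before falling back to swap_duplicate().
 */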
+
                if (swap_duplicate(entry) < 0) {
                        set_pte_at(mm, address, pte, pteval);
                        ret = SWAP_FAIL;
@@ -1428,6 +1549,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
        } else
                dec_mm_counter(mm, mm_counter_file(page));
 
+discard:
        page_remove_rmap(page, PageHuge(page));
        page_cache_release(page);
 
@@ -1480,9 +1602,14 @@ static int page_not_mapped(struct page *page)
 int try_to_unmap(struct page *page, enum ttu_flags flags)
 {
        int ret;
+       struct rmap_private rp = {
+               .flags = flags,
+               .lazyfreed = 0,
+       };
+
        struct rmap_walk_control rwc = {
                .rmap_one = try_to_unmap_one,
-               .arg = (void *)flags,
+               .arg = &rp,
                .done = page_not_mapped,
                .anon_lock = page_lock_anon_vma_read,
        };
@@ -1502,8 +1629,11 @@ int try_to_unmap(struct page *page, enum ttu_flags flags)
 
        ret = rmap_walk(page, &rwc);
 
-       if (ret != SWAP_MLOCK && !page_mapped(page))
+       if (ret != SWAP_MLOCK && !page_mapped(page)) {
                ret = SWAP_SUCCESS;
+               if (rp.lazyfreed && !PageDirty(page))
+                       ret = SWAP_LZFREE;
+       }
        return ret;
 }
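
/*
 * Illustrative consumer sketch (hypothetical function, loosely modelled
 * on what a reclaim path such as shrink_page_list() could do with the
 * new return value): SWAP_LZFREE signals a clean MADV_FREE page that
 * can be freed with no swap I/O:
 */
static bool reclaim_one_sketch(struct page *page, enum ttu_flags flags)
{
        switch (try_to_unmap(page, flags | TTU_LZFREE)) {
        case SWAP_LZFREE:       /* clean MADV_FREE page: free directly */
        case SWAP_SUCCESS:      /* fully unmapped: pageout/swap as usual */
                return true;
        case SWAP_AGAIN:
        case SWAP_FAIL:
        case SWAP_MLOCK:
        default:
                return false;
        }
}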
 
@@ -1525,9 +1655,14 @@ int try_to_unmap(struct page *page, enum ttu_flags flags)
 int try_to_munlock(struct page *page)
 {
        int ret;
+       struct rmap_private rp = {
+               .flags = TTU_MUNLOCK,
+               .lazyfreed = 0,
+       };
+
        struct rmap_walk_control rwc = {
                .rmap_one = try_to_unmap_one,
-               .arg = (void *)TTU_MUNLOCK,
+               .arg = &rp,
                .done = page_not_mapped,
                .anon_lock = page_lock_anon_vma_read,
 
@@ -1710,7 +1845,7 @@ void hugepage_add_anon_rmap(struct page *page,
        BUG_ON(!PageLocked(page));
        BUG_ON(!anon_vma);
        /* address might be in next vma when migration races vma_adjust */
-       first = atomic_inc_and_test(&page->_mapcount);
+       first = atomic_inc_and_test(compound_mapcount_ptr(page));
        if (first)
                __hugepage_set_anon_rmap(page, vma, address, 0);
 }
@@ -1719,7 +1854,7 @@ void hugepage_add_new_anon_rmap(struct page *page,
                        struct vm_area_struct *vma, unsigned long address)
 {
        BUG_ON(address < vma->vm_start || address >= vma->vm_end);
-       atomic_set(&page->_mapcount, 0);
+       atomic_set(compound_mapcount_ptr(page), 0);
        __hugepage_set_anon_rmap(page, vma, address, 1);
 }
 #endif /* CONFIG_HUGETLB_PAGE */