mm: account pmd page tables to the process
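Unlike PTE tables, PMD page tables have never been reflected in any per-mm
counter, so a process could grow a significant amount of page-table memory
without it being visible to the OOM killer or to memory accounting.  Track
them: __pmd_alloc() bumps a per-mm nr_pmds counter under page_table_lock
whenever it actually populates a PUD entry, and free_pmd_range() drops it
when a PMD table is torn down.  The diff below (against cascardo/linux.git)
sits on top of the removal of non-linear mappings, so the remaining
pte_file()/VM_NONLINEAR paths in the copy, zap and fault code disappear
here as well.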
diff --git a/mm/memory.c b/mm/memory.c
index d707c4d..bbe6a73 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -428,6 +428,7 @@ static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
        pmd = pmd_offset(pud, start);
        pud_clear(pud);
        pmd_free_tlb(tlb, pmd, start);
+       mm_dec_nr_pmds(tlb->mm);
 }
 
 static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
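For reference, the counter touched above lives in struct mm_struct and is
manipulated through helpers introduced alongside this change (a sketch of
include/linux/mm.h, assuming an atomic_long_t nr_pmds field):

        static inline unsigned long mm_nr_pmds(struct mm_struct *mm)
        {
                return atomic_long_read(&mm->nr_pmds);
        }

        static inline void mm_inc_nr_pmds(struct mm_struct *mm)
        {
                atomic_long_inc(&mm->nr_pmds);
        }

        static inline void mm_dec_nr_pmds(struct mm_struct *mm)
        {
                atomic_long_dec(&mm->nr_pmds);
        }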
@@ -813,42 +814,40 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 
        /* pte contains position in swap or file, so copy. */
        if (unlikely(!pte_present(pte))) {
-               if (!pte_file(pte)) {
-                       swp_entry_t entry = pte_to_swp_entry(pte);
-
-                       if (likely(!non_swap_entry(entry))) {
-                               if (swap_duplicate(entry) < 0)
-                                       return entry.val;
-
-                               /* make sure dst_mm is on swapoff's mmlist. */
-                               if (unlikely(list_empty(&dst_mm->mmlist))) {
-                                       spin_lock(&mmlist_lock);
-                                       if (list_empty(&dst_mm->mmlist))
-                                               list_add(&dst_mm->mmlist,
-                                                        &src_mm->mmlist);
-                                       spin_unlock(&mmlist_lock);
-                               }
-                               rss[MM_SWAPENTS]++;
-                       } else if (is_migration_entry(entry)) {
-                               page = migration_entry_to_page(entry);
-
-                               if (PageAnon(page))
-                                       rss[MM_ANONPAGES]++;
-                               else
-                                       rss[MM_FILEPAGES]++;
-
-                               if (is_write_migration_entry(entry) &&
-                                   is_cow_mapping(vm_flags)) {
-                                       /*
-                                        * COW mappings require pages in both
-                                        * parent and child to be set to read.
-                                        */
-                                       make_migration_entry_read(&entry);
-                                       pte = swp_entry_to_pte(entry);
-                                       if (pte_swp_soft_dirty(*src_pte))
-                                               pte = pte_swp_mksoft_dirty(pte);
-                                       set_pte_at(src_mm, addr, src_pte, pte);
-                               }
+               swp_entry_t entry = pte_to_swp_entry(pte);
+
+               if (likely(!non_swap_entry(entry))) {
+                       if (swap_duplicate(entry) < 0)
+                               return entry.val;
+
+                       /* make sure dst_mm is on swapoff's mmlist. */
+                       if (unlikely(list_empty(&dst_mm->mmlist))) {
+                               spin_lock(&mmlist_lock);
+                               if (list_empty(&dst_mm->mmlist))
+                                       list_add(&dst_mm->mmlist,
+                                                       &src_mm->mmlist);
+                               spin_unlock(&mmlist_lock);
+                       }
+                       rss[MM_SWAPENTS]++;
+               } else if (is_migration_entry(entry)) {
+                       page = migration_entry_to_page(entry);
+
+                       if (PageAnon(page))
+                               rss[MM_ANONPAGES]++;
+                       else
+                               rss[MM_FILEPAGES]++;
+
+                       if (is_write_migration_entry(entry) &&
+                                       is_cow_mapping(vm_flags)) {
+                               /*
+                                * COW mappings require pages in both
+                                * parent and child to be set to read.
+                                */
+                               make_migration_entry_read(&entry);
+                               pte = swp_entry_to_pte(entry);
+                               if (pte_swp_soft_dirty(*src_pte))
+                                       pte = pte_swp_mksoft_dirty(pte);
+                               set_pte_at(src_mm, addr, src_pte, pte);
                        }
                }
                goto out_set_pte;
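With pte_file() gone, a non-present pte in copy_one_pte() always decodes as
a swp_entry_t, where "types" at or above MAX_SWAPFILES are reserved for
special entries such as migration entries.  The predicates used above are
roughly (see include/linux/swapops.h):

        static inline int non_swap_entry(swp_entry_t entry)
        {
                return swp_type(entry) >= MAX_SWAPFILES;
        }

        static inline int is_migration_entry(swp_entry_t entry)
        {
                return unlikely(swp_type(entry) == SWP_MIGRATION_READ ||
                                swp_type(entry) == SWP_MIGRATION_WRITE);
        }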
@@ -1022,11 +1021,9 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
         * readonly mappings. The tradeoff is that copy_page_range is more
         * efficient than faulting.
         */
-       if (!(vma->vm_flags & (VM_HUGETLB | VM_NONLINEAR |
-                              VM_PFNMAP | VM_MIXEDMAP))) {
-               if (!vma->anon_vma)
-                       return 0;
-       }
+       if (!(vma->vm_flags & (VM_HUGETLB | VM_PFNMAP | VM_MIXEDMAP)) &&
+                       !vma->anon_vma)
+               return 0;
 
        if (is_vm_hugetlb_page(vma))
                return copy_hugetlb_page_range(dst_mm, src_mm, vma);
@@ -1084,6 +1081,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
        spinlock_t *ptl;
        pte_t *start_pte;
        pte_t *pte;
+       swp_entry_t entry;
 
 again:
        init_rss_vec(rss);
@@ -1109,28 +1107,12 @@ again:
                                if (details->check_mapping &&
                                    details->check_mapping != page->mapping)
                                        continue;
-                               /*
-                                * Each page->index must be checked when
-                                * invalidating or truncating nonlinear.
-                                */
-                               if (details->nonlinear_vma &&
-                                   (page->index < details->first_index ||
-                                    page->index > details->last_index))
-                                       continue;
                        }
                        ptent = ptep_get_and_clear_full(mm, addr, pte,
                                                        tlb->fullmm);
                        tlb_remove_tlb_entry(tlb, pte, addr);
                        if (unlikely(!page))
                                continue;
-                       if (unlikely(details) && details->nonlinear_vma
-                           && linear_page_index(details->nonlinear_vma,
-                                               addr) != page->index) {
-                               pte_t ptfile = pgoff_to_pte(page->index);
-                               if (pte_soft_dirty(ptent))
-                                       ptfile = pte_file_mksoft_dirty(ptfile);
-                               set_pte_at(mm, addr, pte, ptfile);
-                       }
                        if (PageAnon(page))
                                rss[MM_ANONPAGES]--;
                        else {
@@ -1153,33 +1135,25 @@ again:
                        }
                        continue;
                }
-               /*
-                * If details->check_mapping, we leave swap entries;
-                * if details->nonlinear_vma, we leave file entries.
-                */
+               /* If details->check_mapping, we leave swap entries. */
                if (unlikely(details))
                        continue;
-               if (pte_file(ptent)) {
-                       if (unlikely(!(vma->vm_flags & VM_NONLINEAR)))
-                               print_bad_pte(vma, addr, ptent, NULL);
-               } else {
-                       swp_entry_t entry = pte_to_swp_entry(ptent);
 
-                       if (!non_swap_entry(entry))
-                               rss[MM_SWAPENTS]--;
-                       else if (is_migration_entry(entry)) {
-                               struct page *page;
+               entry = pte_to_swp_entry(ptent);
+               if (!non_swap_entry(entry))
+                       rss[MM_SWAPENTS]--;
+               else if (is_migration_entry(entry)) {
+                       struct page *page;
 
-                               page = migration_entry_to_page(entry);
+                       page = migration_entry_to_page(entry);
 
-                               if (PageAnon(page))
-                                       rss[MM_ANONPAGES]--;
-                               else
-                                       rss[MM_FILEPAGES]--;
-                       }
-                       if (unlikely(!free_swap_and_cache(entry)))
-                               print_bad_pte(vma, addr, ptent, NULL);
+                       if (PageAnon(page))
+                               rss[MM_ANONPAGES]--;
+                       else
+                               rss[MM_FILEPAGES]--;
                }
+               if (unlikely(!free_swap_and_cache(entry)))
+                       print_bad_pte(vma, addr, ptent, NULL);
                pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
        } while (pte++, addr += PAGE_SIZE, addr != end);
 
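Hoisting free_swap_and_cache() out of the removed else-branch is safe
because it bails out up front for special entries, so migration entries
keep the print_bad_pte() sanity check without any swap-side effects.  The
early-out in mm/swapfile.c reads:

        if (non_swap_entry(entry))
                return 1;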
@@ -1279,7 +1253,7 @@ static void unmap_page_range(struct mmu_gather *tlb,
        pgd_t *pgd;
        unsigned long next;
 
-       if (details && !details->check_mapping && !details->nonlinear_vma)
+       if (details && !details->check_mapping)
                details = NULL;
 
        BUG_ON(addr >= end);
@@ -1373,7 +1347,7 @@ void unmap_vmas(struct mmu_gather *tlb,
  * @vma: vm_area_struct holding the applicable pages
  * @start: starting address of pages to zap
  * @size: number of bytes to zap
- * @details: details of nonlinear truncation or shared cache invalidation
+ * @details: details of shared cache invalidation
  *
  * Caller must protect the VMA list
  */
@@ -1399,7 +1373,7 @@ void zap_page_range(struct vm_area_struct *vma, unsigned long start,
  * @vma: vm_area_struct holding the applicable pages
  * @address: starting address of pages to zap
  * @size: number of bytes to zap
- * @details: details of nonlinear truncation or shared cache invalidation
+ * @details: details of shared cache invalidation
  *
  * The range must fit into one VMA.
  */
@@ -1924,12 +1898,11 @@ int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
 EXPORT_SYMBOL_GPL(apply_to_page_range);
 
 /*
- * handle_pte_fault chooses page fault handler according to an entry
- * which was read non-atomically.  Before making any commitment, on
- * those architectures or configurations (e.g. i386 with PAE) which
- * might give a mix of unmatched parts, do_swap_page and do_nonlinear_fault
- * must check under lock before unmapping the pte and proceeding
- * (but do_wp_page is only called after already making such a check;
+ * handle_pte_fault chooses page fault handler according to an entry which was
+ * read non-atomically.  Before making any commitment, on those architectures
+ * or configurations (e.g. i386 with PAE) which might give a mix of unmatched
+ * parts, do_swap_page must check under lock before unmapping the pte and
+ * proceeding (but do_wp_page is only called after already making such a check;
  * and do_anonymous_page can safely check later on).
  */
 static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd,
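For context, the helper this comment documents is unchanged; it re-checks
the pte under its lock on configurations where a pte cannot be read
atomically, approximately:

        static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd,
                                        pte_t *page_table, pte_t orig_pte)
        {
                int same = 1;
        #if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT)
                if (sizeof(pte_t) > sizeof(unsigned long)) {
                        spinlock_t *ptl = pte_lockptr(mm, pmd);
                        spin_lock(ptl);
                        same = pte_same(*page_table, orig_pte);
                        spin_unlock(ptl);
                }
        #endif
                pte_unmap(page_table);
                return same;
        }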
@@ -2035,7 +2008,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
        pte_t entry;
        int ret = 0;
        int page_mkwrite = 0;
-       struct page *dirty_page = NULL;
+       bool dirty_shared = false;
        unsigned long mmun_start = 0;   /* For mmu_notifiers */
        unsigned long mmun_end = 0;     /* For mmu_notifiers */
        struct mem_cgroup *memcg;
@@ -2086,14 +2059,15 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
                unlock_page(old_page);
        } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
                                        (VM_WRITE|VM_SHARED))) {
+               page_cache_get(old_page);
                /*
                 * Only catch write-faults on shared writable pages,
                 * read-only shared pages can get COWed by
                 * get_user_pages(.write=1, .force=1).
                 */
                if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
                        int tmp;
-                       page_cache_get(old_page);
+
                        pte_unmap_unlock(page_table, ptl);
                        tmp = do_page_mkwrite(vma, old_page, address);
                        if (unlikely(!tmp || (tmp &
@@ -2113,11 +2087,10 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
                                unlock_page(old_page);
                                goto unlock;
                        }
-
                        page_mkwrite = 1;
                }
-               dirty_page = old_page;
-               get_page(dirty_page);
+
+               dirty_shared = true;
 
 reuse:
                /*
@@ -2136,46 +2109,30 @@ reuse:
                pte_unmap_unlock(page_table, ptl);
                ret |= VM_FAULT_WRITE;
 
-               if (!dirty_page)
-                       return ret;
-
-               if (!page_mkwrite) {
+               if (dirty_shared) {
                        struct address_space *mapping;
                        int dirtied;
 
-                       lock_page(dirty_page);
-                       dirtied = set_page_dirty(dirty_page);
-                       VM_BUG_ON_PAGE(PageAnon(dirty_page), dirty_page);
-                       mapping = dirty_page->mapping;
-                       unlock_page(dirty_page);
+                       if (!page_mkwrite)
+                               lock_page(old_page);
+
+                       dirtied = set_page_dirty(old_page);
+                       VM_BUG_ON_PAGE(PageAnon(old_page), old_page);
+                       mapping = old_page->mapping;
+                       unlock_page(old_page);
+                       page_cache_release(old_page);
 
-                       if (dirtied && mapping) {
+                       if ((dirtied || page_mkwrite) && mapping) {
                                /*
                                 * Some device drivers do not set page.mapping
                                 * but still dirty their pages
                                 */
                                balance_dirty_pages_ratelimited(mapping);
                        }
 
-                       /* file_update_time outside page_lock */
-                       if (vma->vm_file)
+                       if (!page_mkwrite)
                                file_update_time(vma->vm_file);
                }
-               put_page(dirty_page);
-               if (page_mkwrite) {
-                       struct address_space *mapping = dirty_page->mapping;
-
-                       set_page_dirty(dirty_page);
-                       unlock_page(dirty_page);
-                       page_cache_release(dirty_page);
-                       if (mapping)    {
-                               /*
-                                * Some device drivers do not set page.mapping
-                                * but still dirty their pages
-                                */
-                               balance_dirty_pages_ratelimited(mapping);
-                       }
-               }
 
                return ret;
        }
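The two old exit paths (the !page_mkwrite dirtying and the page_mkwrite
tail) are now funnelled through a single dirty_shared branch.  The guards
encode the existing ownership rules: a successful ->page_mkwrite returns
with the page locked and the reference taken above, so lock_page() is
skipped for it; "(dirtied || page_mkwrite)" keeps writeback throttling when
the filesystem dirtied the page itself and set_page_dirty() here returns 0;
and, as before, file times are updated here only when the filesystem has no
->page_mkwrite of its own.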
@@ -2333,25 +2290,11 @@ static inline void unmap_mapping_range_tree(struct rb_root *root,
        }
 }
 
-static inline void unmap_mapping_range_list(struct list_head *head,
-                                           struct zap_details *details)
-{
-       struct vm_area_struct *vma;
-
-       /*
-        * In nonlinear VMAs there is no correspondence between virtual address
-        * offset and file offset.  So we must perform an exhaustive search
-        * across *all* the pages in each nonlinear VMA, not just the pages
-        * whose virtual address lies outside the file truncation point.
-        */
-       list_for_each_entry(vma, head, shared.nonlinear) {
-               details->nonlinear_vma = vma;
-               unmap_mapping_range_vma(vma, vma->vm_start, vma->vm_end, details);
-       }
-}
-
 /**
- * unmap_mapping_range - unmap the portion of all mmaps in the specified address_space corresponding to the specified page range in the underlying file.
+ * unmap_mapping_range - unmap the portion of all mmaps in the specified
+ * address_space corresponding to the specified page range in the underlying
+ * file.
+ *
  * @mapping: the address space containing mmaps to be unmapped.
  * @holebegin: byte in first page to unmap, relative to the start of
  * the underlying file.  This will be rounded down to a PAGE_SIZE
@@ -2380,7 +2323,6 @@ void unmap_mapping_range(struct address_space *mapping,
        }
 
        details.check_mapping = even_cows? NULL: mapping;
-       details.nonlinear_vma = NULL;
        details.first_index = hba;
        details.last_index = hba + hlen - 1;
        if (details.last_index < details.first_index)
@@ -2390,8 +2332,6 @@ void unmap_mapping_range(struct address_space *mapping,
        i_mmap_lock_write(mapping);
        if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap)))
                unmap_mapping_range_tree(&mapping->i_mmap, &details);
-       if (unlikely(!list_empty(&mapping->i_mmap_nonlinear)))
-               unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details);
        i_mmap_unlock_write(mapping);
 }
 EXPORT_SYMBOL(unmap_mapping_range);
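With the nonlinear_vma member gone, the zap_details structure passed around
above is left with just the mapping check and the index window, roughly
(include/linux/mm.h):

        struct zap_details {
                struct address_space *check_mapping;    /* Check page->mapping if set */
                pgoff_t first_index;                    /* Lowest page->index to unmap */
                pgoff_t last_index;                     /* Highest page->index to unmap */
        };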
@@ -2752,8 +2692,6 @@ void do_set_pte(struct vm_area_struct *vma, unsigned long address,
        entry = mk_pte(page, vma->vm_page_prot);
        if (write)
                entry = maybe_mkwrite(pte_mkdirty(entry), vma);
-       else if (pte_file(*pte) && pte_file_soft_dirty(*pte))
-               entry = pte_mksoft_dirty(entry);
        if (anon) {
                inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
                page_add_new_anon_rmap(page, vma, address);
@@ -2888,8 +2826,7 @@ static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma,
         * if page by the offset is not ready to be mapped (cold cache or
         * something).
         */
-       if (vma->vm_ops->map_pages && !(flags & FAULT_FLAG_NONLINEAR) &&
-           fault_around_bytes >> PAGE_SHIFT > 1) {
+       if (vma->vm_ops->map_pages && fault_around_bytes >> PAGE_SHIFT > 1) {
                pte = pte_offset_map_lock(mm, pmd, address, &ptl);
                do_fault_around(vma, address, pte, pgoff, flags);
                if (!pte_same(*pte, orig_pte))
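With FAULT_FLAG_NONLINEAR gone, the only remaining gate on fault-around is
whether it is worthwhile at all: fault_around_bytes is runtime-tunable via
debugfs, and once the window shrinks to a single page the
"fault_around_bytes >> PAGE_SHIFT > 1" test fails and the code falls back
to a plain ->fault() call.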
@@ -3021,8 +2958,7 @@ static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma,
                balance_dirty_pages_ratelimited(mapping);
        }
 
-       /* file_update_time outside page_lock */
-       if (vma->vm_file && !vma->vm_ops->page_mkwrite)
+       if (!vma->vm_ops->page_mkwrite)
                file_update_time(vma->vm_file);
 
        return ret;
@@ -3034,7 +2970,7 @@ static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma,
  * The mmap_sem may have been released depending on flags and our
  * return value.  See filemap_fault() and __lock_page_or_retry().
  */
-static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
+static int do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
                unsigned long address, pte_t *page_table, pmd_t *pmd,
                unsigned int flags, pte_t orig_pte)
 {
@@ -3051,46 +2987,6 @@ static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
        return do_shared_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
 }
 
-/*
- * Fault of a previously existing named mapping. Repopulate the pte
- * from the encoded file_pte if possible. This enables swappable
- * nonlinear vmas.
- *
- * We enter with non-exclusive mmap_sem (to exclude vma changes,
- * but allow concurrent faults), and pte mapped but not yet locked.
- * We return with pte unmapped and unlocked.
- * The mmap_sem may have been released depending on flags and our
- * return value.  See filemap_fault() and __lock_page_or_retry().
- */
-static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
-               unsigned long address, pte_t *page_table, pmd_t *pmd,
-               unsigned int flags, pte_t orig_pte)
-{
-       pgoff_t pgoff;
-
-       flags |= FAULT_FLAG_NONLINEAR;
-
-       if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
-               return 0;
-
-       if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) {
-               /*
-                * Page table corrupted: show pte and kill process.
-                */
-               print_bad_pte(vma, address, orig_pte, NULL);
-               return VM_FAULT_SIGBUS;
-       }
-
-       pgoff = pte_to_pgoff(orig_pte);
-       if (!(flags & FAULT_FLAG_WRITE))
-               return do_read_fault(mm, vma, address, pmd, pgoff, flags,
-                               orig_pte);
-       if (!(vma->vm_flags & VM_SHARED))
-               return do_cow_fault(mm, vma, address, pmd, pgoff, flags,
-                               orig_pte);
-       return do_shared_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
-}
-
 static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
                                unsigned long addr, int page_nid,
                                int *flags)
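do_fault() is the old do_linear_fault() minus the now-meaningless "linear"
qualifier: it is the only file-backed fault path left.  Its body (elided
from the diff context above) computes the linear page offset and
dispatches on the fault type, approximately:

        pgoff_t pgoff = (((address & PAGE_MASK)
                        - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;

        pte_unmap(page_table);
        if (!(flags & FAULT_FLAG_WRITE))
                return do_read_fault(mm, vma, address, pmd, pgoff, flags,
                                orig_pte);
        if (!(vma->vm_flags & VM_SHARED))
                return do_cow_fault(mm, vma, address, pmd, pgoff, flags,
                                orig_pte);
        return do_shared_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);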
@@ -3218,15 +3114,12 @@ static int handle_pte_fault(struct mm_struct *mm,
                if (pte_none(entry)) {
                        if (vma->vm_ops) {
                                if (likely(vma->vm_ops->fault))
-                                       return do_linear_fault(mm, vma, address,
-                                               pte, pmd, flags, entry);
+                                       return do_fault(mm, vma, address, pte,
+                                                       pmd, flags, entry);
                        }
                        return do_anonymous_page(mm, vma, address,
                                                 pte, pmd, flags);
                }
-               if (pte_file(entry))
-                       return do_nonlinear_fault(mm, vma, address,
-                                       pte, pmd, flags, entry);
                return do_swap_page(mm, vma, address,
                                        pte, pmd, flags, entry);
        }
@@ -3430,15 +3323,17 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
 
        spin_lock(&mm->page_table_lock);
 #ifndef __ARCH_HAS_4LEVEL_HACK
-       if (pud_present(*pud))          /* Another has populated it */
-               pmd_free(mm, new);
-       else
+       if (!pud_present(*pud)) {
+               mm_inc_nr_pmds(mm);
                pud_populate(mm, pud, new);
-#else
-       if (pgd_present(*pud))          /* Another has populated it */
+       } else  /* Another has populated it */
                pmd_free(mm, new);
-       else
+#else
+       if (!pgd_present(*pud)) {
+               mm_inc_nr_pmds(mm);
                pgd_populate(mm, pud, new);
+       } else /* Another has populated it */
+               pmd_free(mm, new);
 #endif /* __ARCH_HAS_4LEVEL_HACK */
        spin_unlock(&mm->page_table_lock);
        return 0;
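Elsewhere in the same change, the new counter is sanity-checked when an mm
is destroyed, so leaked PMD tables become visible; a sketch of the check in
kernel/fork.c's check_mm():

        if (mm_nr_pmds(mm))
                pr_alert("BUG: non-zero nr_pmds on freeing mm: %ld\n",
                         mm_nr_pmds(mm));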