mm: account pmd page tables to the process
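Unlike PTE tables, PMD page tables have never been reflected in any per-mm
counter, so a process could grow a significant amount of page-table memory
without it being visible to the OOM killer or to memory accounting.  Track
them: __pmd_alloc() bumps a per-mm nr_pmds counter under page_table_lock
whenever it actually populates a PUD entry, and free_pmd_range() drops it
when a PMD table is torn down.  The diff below (against cascardo/linux.git)
sits on top of the removal of non-linear mappings, so the remaining
pte_file()/VM_NONLINEAR paths in the copy, zap and fault code disappear
here as well.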
diff --git a/mm/memory.c b/mm/memory.c
index d707c4d..bbe6a73 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -428,6 +428,7 @@ static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
        pmd = pmd_offset(pud, start);
        pud_clear(pud);
        pmd_free_tlb(tlb, pmd, start);
+       mm_dec_nr_pmds(tlb->mm);
 }
 
 static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
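For reference, the counter touched above lives in struct mm_struct and is
manipulated through helpers introduced alongside this change (a sketch of
include/linux/mm.h, assuming an atomic_long_t nr_pmds field):

        static inline unsigned long mm_nr_pmds(struct mm_struct *mm)
        {
                return atomic_long_read(&mm->nr_pmds);
        }

        static inline void mm_inc_nr_pmds(struct mm_struct *mm)
        {
                atomic_long_inc(&mm->nr_pmds);
        }

        static inline void mm_dec_nr_pmds(struct mm_struct *mm)
        {
                atomic_long_dec(&mm->nr_pmds);
        }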
@@ -813,42 +814,40 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 
        /* pte contains position in swap or file, so copy. */
        if (unlikely(!pte_present(pte))) {
-               if (!pte_file(pte)) {
-                       swp_entry_t entry = pte_to_swp_entry(pte);
-
-                       if (likely(!non_swap_entry(entry))) {
-                               if (swap_duplicate(entry) < 0)
-                                       return entry.val;
-
-                               /* make sure dst_mm is on swapoff's mmlist. */
-                               if (unlikely(list_empty(&dst_mm->mmlist))) {
-                                       spin_lock(&mmlist_lock);
-                                       if (list_empty(&dst_mm->mmlist))
-                                               list_add(&dst_mm->mmlist,
-                                                        &src_mm->mmlist);
-                                       spin_unlock(&mmlist_lock);
-                               }
-                               rss[MM_SWAPENTS]++;
-                       } else if (is_migration_entry(entry)) {
-                               page = migration_entry_to_page(entry);
-
-                               if (PageAnon(page))
-                                       rss[MM_ANONPAGES]++;
-                               else
-                                       rss[MM_FILEPAGES]++;
-
-                               if (is_write_migration_entry(entry) &&
-                                   is_cow_mapping(vm_flags)) {
-                                       /*
-                                        * COW mappings require pages in both
-                                        * parent and child to be set to read.
-                                        */
-                                       make_migration_entry_read(&entry);
-                                       pte = swp_entry_to_pte(entry);
-                                       if (pte_swp_soft_dirty(*src_pte))
-                                               pte = pte_swp_mksoft_dirty(pte);
-                                       set_pte_at(src_mm, addr, src_pte, pte);
-                               }
+               swp_entry_t entry = pte_to_swp_entry(pte);
+
+               if (likely(!non_swap_entry(entry))) {
+                       if (swap_duplicate(entry) < 0)
+                               return entry.val;
+
+                       /* make sure dst_mm is on swapoff's mmlist. */
+                       if (unlikely(list_empty(&dst_mm->mmlist))) {
+                               spin_lock(&mmlist_lock);
+                               if (list_empty(&dst_mm->mmlist))
+                                       list_add(&dst_mm->mmlist,
+                                                       &src_mm->mmlist);
+                               spin_unlock(&mmlist_lock);
+                       }
+                       rss[MM_SWAPENTS]++;
+               } else if (is_migration_entry(entry)) {
+                       page = migration_entry_to_page(entry);
+
+                       if (PageAnon(page))
+                               rss[MM_ANONPAGES]++;
+                       else
+                               rss[MM_FILEPAGES]++;
+
+                       if (is_write_migration_entry(entry) &&
+                                       is_cow_mapping(vm_flags)) {
+                               /*
+                                * COW mappings require pages in both
+                                * parent and child to be set to read.
+                                */
+                               make_migration_entry_read(&entry);
+                               pte = swp_entry_to_pte(entry);
+                               if (pte_swp_soft_dirty(*src_pte))
+                                       pte = pte_swp_mksoft_dirty(pte);
+                               set_pte_at(src_mm, addr, src_pte, pte);
                        }
                }
                goto out_set_pte;
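With pte_file() gone, a non-present pte in copy_one_pte() always decodes as
a swp_entry_t, where "types" at or above MAX_SWAPFILES are reserved for
special entries such as migration entries.  The predicates used above are
roughly (see include/linux/swapops.h):

        static inline int non_swap_entry(swp_entry_t entry)
        {
                return swp_type(entry) >= MAX_SWAPFILES;
        }

        static inline int is_migration_entry(swp_entry_t entry)
        {
                return unlikely(swp_type(entry) == SWP_MIGRATION_READ ||
                                swp_type(entry) == SWP_MIGRATION_WRITE);
        }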
@@ -1022,11 +1021,9 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
         * readonly mappings. The tradeoff is that copy_page_range is more
         * efficient than faulting.
         */
-       if (!(vma->vm_flags & (VM_HUGETLB | VM_NONLINEAR |
-                              VM_PFNMAP | VM_MIXEDMAP))) {
-               if (!vma->anon_vma)
-                       return 0;
-       }
+       if (!(vma->vm_flags & (VM_HUGETLB | VM_PFNMAP | VM_MIXEDMAP)) &&
+                       !vma->anon_vma)
+               return 0;
 
        if (is_vm_hugetlb_page(vma))
                return copy_hugetlb_page_range(dst_mm, src_mm, vma);
@@ -1084,6 +1081,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
        spinlock_t *ptl;
        pte_t *start_pte;
        pte_t *pte;
+       swp_entry_t entry;
 
 again:
        init_rss_vec(rss);
@@ -1109,28 +1107,12 @@ again:
                                if (details->check_mapping &&
                                    details->check_mapping != page->mapping)
                                        continue;
-                               /*
-                                * Each page->index must be checked when
-                                * invalidating or truncating nonlinear.
-                                */
-                               if (details->nonlinear_vma &&
-                                   (page->index < details->first_index ||
-                                    page->index > details->last_index))
-                                       continue;
                        }
                        ptent = ptep_get_and_clear_full(mm, addr, pte,
                                                        tlb->fullmm);
                        tlb_remove_tlb_entry(tlb, pte, addr);
                        if (unlikely(!page))
                                continue;
-                       if (unlikely(details) && details->nonlinear_vma
-                           && linear_page_index(details->nonlinear_vma,
-                                               addr) != page->index) {
-                               pte_t ptfile = pgoff_to_pte(page->index);
-                               if (pte_soft_dirty(ptent))
-                                       ptfile = pte_file_mksoft_dirty(ptfile);
-                               set_pte_at(mm, addr, pte, ptfile);
-                       }
                        if (PageAnon(page))
                                rss[MM_ANONPAGES]--;
                        else {
@@ -1153,33 +1135,25 @@ again:
                        }
                        continue;
                }
-               /*
-                * If details->check_mapping, we leave swap entries;
-                * if details->nonlinear_vma, we leave file entries.
-                */
+               /* If details->check_mapping, we leave swap entries. */
                if (unlikely(details))
                        continue;
-               if (pte_file(ptent)) {
-                       if (unlikely(!(vma->vm_flags & VM_NONLINEAR)))
-                               print_bad_pte(vma, addr, ptent, NULL);
-               } else {
-                       swp_entry_t entry = pte_to_swp_entry(ptent);
 
-                       if (!non_swap_entry(entry))
-                               rss[MM_SWAPENTS]--;
-                       else if (is_migration_entry(entry)) {
-                               struct page *page;
+               entry = pte_to_swp_entry(ptent);
+               if (!non_swap_entry(entry))
+                       rss[MM_SWAPENTS]--;
+               else if (is_migration_entry(entry)) {
+                       struct page *page;
 
-                               page = migration_entry_to_page(entry);
+                       page = migration_entry_to_page(entry);
 
-                               if (PageAnon(page))
-                                       rss[MM_ANONPAGES]--;
-                               else
-                                       rss[MM_FILEPAGES]--;
-                       }
-                       if (unlikely(!free_swap_and_cache(entry)))
-                               print_bad_pte(vma, addr, ptent, NULL);
+                       if (PageAnon(page))
+                               rss[MM_ANONPAGES]--;
+                       else
+                               rss[MM_FILEPAGES]--;
                }
+               if (unlikely(!free_swap_and_cache(entry)))
+                       print_bad_pte(vma, addr, ptent, NULL);
                pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
        } while (pte++, addr += PAGE_SIZE, addr != end);
 
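Hoisting free_swap_and_cache() out of the removed else-branch is safe
because it bails out up front for special entries, so migration entries
keep the print_bad_pte() sanity check without any swap-side effects.  The
early-out in mm/swapfile.c reads:

        if (non_swap_entry(entry))
                return 1;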
@@ -1279,7 +1253,7 @@ static void unmap_page_range(struct mmu_gather *tlb,
        pgd_t *pgd;
        unsigned long next;
 
-       if (details && !details->check_mapping && !details->nonlinear_vma)
+       if (details && !details->check_mapping)
                details = NULL;
 
        BUG_ON(addr >= end);
@@ -1373,7 +1347,7 @@ void unmap_vmas(struct mmu_gather *tlb,
  * @vma: vm_area_struct holding the applicable pages
  * @start: starting address of pages to zap
  * @size: number of bytes to zap
- * @details: details of nonlinear truncation or shared cache invalidation
+ * @details: details of shared cache invalidation
  *
  * Caller must protect the VMA list
  */
@@ -1399,7 +1373,7 @@ void zap_page_range(struct vm_area_struct *vma, unsigned long start,
  * @vma: vm_area_struct holding the applicable pages
  * @address: starting address of pages to zap
  * @size: number of bytes to zap
- * @details: details of nonlinear truncation or shared cache invalidation
+ * @details: details of shared cache invalidation
  *
  * The range must fit into one VMA.
  */
@@ -1924,12 +1898,11 @@ int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
 EXPORT_SYMBOL_GPL(apply_to_page_range);
 
 /*
- * handle_pte_fault chooses page fault handler according to an entry
- * which was read non-atomically.  Before making any commitment, on
- * those architectures or configurations (e.g. i386 with PAE) which
- * might give a mix of unmatched parts, do_swap_page and do_nonlinear_fault
- * must check under lock before unmapping the pte and proceeding
- * (but do_wp_page is only called after already making such a check;
+ * handle_pte_fault chooses page fault handler according to an entry which was
+ * read non-atomically.  Before making any commitment, on those architectures
+ * or configurations (e.g. i386 with PAE) which might give a mix of unmatched
+ * parts, do_swap_page must check under lock before unmapping the pte and
+ * proceeding (but do_wp_page is only called after already making such a check;
  * and do_anonymous_page can safely check later on).
  */
 static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd,
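For context, the helper this comment documents is unchanged; it re-checks
the pte under its lock on configurations where a pte cannot be read
atomically, approximately:

        static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd,
                                        pte_t *page_table, pte_t orig_pte)
        {
                int same = 1;
        #if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT)
                if (sizeof(pte_t) > sizeof(unsigned long)) {
                        spinlock_t *ptl = pte_lockptr(mm, pmd);
                        spin_lock(ptl);
                        same = pte_same(*page_table, orig_pte);
                        spin_unlock(ptl);
                }
        #endif
                pte_unmap(page_table);
                return same;
        }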
@@ -2035,7 +2008,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
        pte_t entry;
        int ret = 0;
        int page_mkwrite = 0;
-       struct page *dirty_page = NULL;
+       bool dirty_shared = false;
        unsigned long mmun_start = 0;   /* For mmu_notifiers */
        unsigned long mmun_end = 0;     /* For mmu_notifiers */
        struct mem_cgroup *memcg;
@@ -2086,14 +2059,15 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
                unlock_page(old_page);
        } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
                                        (VM_WRITE|VM_SHARED))) {
+               page_cache_get(old_page);
                /*
                 * Only catch write-faults on shared writable pages,
                 * read-only shared pages can get COWed by
                 * get_user_pages(.write=1, .force=1).
                 */
                if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
                        int tmp;
-                       page_cache_get(old_page);
+
                        pte_unmap_unlock(page_table, ptl);
                        tmp = do_page_mkwrite(vma, old_page, address);
                        if (unlikely(!tmp || (tmp &
@@ -2113,11 +2087,10 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
                                unlock_page(old_page);
                                goto unlock;
                        }
-
                        page_mkwrite = 1;
                }
-               dirty_page = old_page;
-               get_page(dirty_page);
+
+               dirty_shared = true;
 
 reuse:
                /*
@@ -2136,46 +2109,30 @@ reuse:
                pte_unmap_unlock(page_table, ptl);
                ret |= VM_FAULT_WRITE;
 
-               if (!dirty_page)
-                       return ret;
-
-               if (!page_mkwrite) {
+               if (dirty_shared) {
                        struct address_space *mapping;
                        int dirtied;
 
-                       lock_page(dirty_page);
-                       dirtied = set_page_dirty(dirty_page);
-                       VM_BUG_ON_PAGE(PageAnon(dirty_page), dirty_page);
-                       mapping = dirty_page->mapping;
-                       unlock_page(dirty_page);
+                       if (!page_mkwrite)
+                               lock_page(old_page);
+
+                       dirtied = set_page_dirty(old_page);
+                       VM_BUG_ON_PAGE(PageAnon(old_page), old_page);
+                       mapping = old_page->mapping;
+                       unlock_page(old_page);
+                       page_cache_release(old_page);
 
-                       if (dirtied && mapping) {
+                       if ((dirtied || page_mkwrite) && mapping) {
                                /*
                                 * Some device drivers do not set page.mapping
                                 * but still dirty their pages
                                 */
                                balance_dirty_pages_ratelimited(mapping);
                        }
 
-                       /* file_update_time outside page_lock */
-                       if (vma->vm_file)
+                       if (!page_mkwrite)
                                file_update_time(vma->vm_file);
                }
-               put_page(dirty_page);
-               if (page_mkwrite) {
-                       struct address_space *mapping = dirty_page->mapping;
-
-                       set_page_dirty(dirty_page);
-                       unlock_page(dirty_page);
-                       page_cache_release(dirty_page);
-                       if (mapping)    {
-                               /*
-                                * Some device drivers do not set page.mapping
-                                * but still dirty their pages
-                                */
-                               balance_dirty_pages_ratelimited(mapping);
-                       }
-               }
 
                return ret;
        }
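The two old exit paths (the !page_mkwrite dirtying and the page_mkwrite
tail) are now funnelled through a single dirty_shared branch.  The guards
encode the existing ownership rules: a successful ->page_mkwrite returns
with the page locked and the reference taken above, so lock_page() is
skipped for it; "(dirtied || page_mkwrite)" keeps writeback throttling when
the filesystem dirtied the page itself and set_page_dirty() here returns 0;
and, as before, file times are updated here only when the filesystem has no
->page_mkwrite of its own.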
@@ -2333,25 +2290,11 @@ static inline void unmap_mapping_range_tree(struct rb_root *root,
        }
 }
 
-static inline void unmap_mapping_range_list(struct list_head *head,
-                                           struct zap_details *details)
-{
-       struct vm_area_struct *vma;
-
-       /*
-        * In nonlinear VMAs there is no correspondence between virtual address
-        * offset and file offset.  So we must perform an exhaustive search
-        * across *all* the pages in each nonlinear VMA, not just the pages
-        * whose virtual address lies outside the file truncation point.
-        */
-       list_for_each_entry(vma, head, shared.nonlinear) {
-               details->nonlinear_vma = vma;
-               unmap_mapping_range_vma(vma, vma->vm_start, vma->vm_end, details);
-       }
-}
-
 /**
- * unmap_mapping_range - unmap the portion of all mmaps in the specified address_space corresponding to the specified page range in the underlying file.
+ * unmap_mapping_range - unmap the portion of all mmaps in the specified
+ * address_space corresponding to the specified page range in the underlying
+ * file.
+ *
  * @mapping: the address space containing mmaps to be unmapped.
  * @holebegin: byte in first page to unmap, relative to the start of
  * the underlying file.  This will be rounded down to a PAGE_SIZE
@@ -2380,7 +2323,6 @@ void unmap_mapping_range(struct address_space *mapping,
        }
 
        details.check_mapping = even_cows? NULL: mapping;
-       details.nonlinear_vma = NULL;
        details.first_index = hba;
        details.last_index = hba + hlen - 1;
        if (details.last_index < details.first_index)
@@ -2390,8 +2332,6 @@ void unmap_mapping_range(struct address_space *mapping,
        i_mmap_lock_write(mapping);
        if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap)))
                unmap_mapping_range_tree(&mapping->i_mmap, &details);
-       if (unlikely(!list_empty(&mapping->i_mmap_nonlinear)))
-               unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details);
        i_mmap_unlock_write(mapping);
 }
 EXPORT_SYMBOL(unmap_mapping_range);
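With the nonlinear_vma member gone, the zap_details structure passed around
above is left with just the mapping check and the index window, roughly
(include/linux/mm.h):

        struct zap_details {
                struct address_space *check_mapping;    /* Check page->mapping if set */
                pgoff_t first_index;                    /* Lowest page->index to unmap */
                pgoff_t last_index;                     /* Highest page->index to unmap */
        };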
@@ -2752,8 +2692,6 @@ void do_set_pte(struct vm_area_struct *vma, unsigned long address,
        entry = mk_pte(page, vma->vm_page_prot);
        if (write)
                entry = maybe_mkwrite(pte_mkdirty(entry), vma);
-       else if (pte_file(*pte) && pte_file_soft_dirty(*pte))
-               entry = pte_mksoft_dirty(entry);
        if (anon) {
                inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
                page_add_new_anon_rmap(page, vma, address);
@@ -2888,8 +2826,7 @@ static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma,
         * if page by the offset is not ready to be mapped (cold cache or
         * something).
         */
-       if (vma->vm_ops->map_pages && !(flags & FAULT_FLAG_NONLINEAR) &&
-           fault_around_bytes >> PAGE_SHIFT > 1) {
+       if (vma->vm_ops->map_pages && fault_around_bytes >> PAGE_SHIFT > 1) {
                pte = pte_offset_map_lock(mm, pmd, address, &ptl);
                do_fault_around(vma, address, pte, pgoff, flags);
                if (!pte_same(*pte, orig_pte))
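With FAULT_FLAG_NONLINEAR gone, the only remaining gate on fault-around is
whether it is worthwhile at all: fault_around_bytes is runtime-tunable via
debugfs, and once the window shrinks to a single page the
"fault_around_bytes >> PAGE_SHIFT > 1" test fails and the code falls back
to a plain ->fault() call.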
@@ -3021,8 +2958,7 @@ static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma,
                balance_dirty_pages_ratelimited(mapping);
        }
 
-       /* file_update_time outside page_lock */
-       if (vma->vm_file && !vma->vm_ops->page_mkwrite)
+       if (!vma->vm_ops->page_mkwrite)
                file_update_time(vma->vm_file);
 
        return ret;
@@ -3034,7 +2970,7 @@ static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma,
  * The mmap_sem may have been released depending on flags and our
  * return value.  See filemap_fault() and __lock_page_or_retry().
  */
-static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
+static int do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
                unsigned long address, pte_t *page_table, pmd_t *pmd,
                unsigned int flags, pte_t orig_pte)
 {
@@ -3051,46 +2987,6 @@ static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
        return do_shared_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
 }
 
-/*
- * Fault of a previously existing named mapping. Repopulate the pte
- * from the encoded file_pte if possible. This enables swappable
- * nonlinear vmas.
- *
- * We enter with non-exclusive mmap_sem (to exclude vma changes,
- * but allow concurrent faults), and pte mapped but not yet locked.
- * We return with pte unmapped and unlocked.
- * The mmap_sem may have been released depending on flags and our
- * return value.  See filemap_fault() and __lock_page_or_retry().
- */
-static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
-               unsigned long address, pte_t *page_table, pmd_t *pmd,
-               unsigned int flags, pte_t orig_pte)
-{
-       pgoff_t pgoff;
-
-       flags |= FAULT_FLAG_NONLINEAR;
-
-       if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
-               return 0;
-
-       if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) {
-               /*
-                * Page table corrupted: show pte and kill process.
-                */
-               print_bad_pte(vma, address, orig_pte, NULL);
-               return VM_FAULT_SIGBUS;
-       }
-
-       pgoff = pte_to_pgoff(orig_pte);
-       if (!(flags & FAULT_FLAG_WRITE))
-               return do_read_fault(mm, vma, address, pmd, pgoff, flags,
-                               orig_pte);
-       if (!(vma->vm_flags & VM_SHARED))
-               return do_cow_fault(mm, vma, address, pmd, pgoff, flags,
-                               orig_pte);
-       return do_shared_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
-}
-
 static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
                                unsigned long addr, int page_nid,
                                int *flags)
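do_fault() is the old do_linear_fault() minus the now-meaningless "linear"
qualifier: it is the only file-backed fault path left.  Its body (elided
from the diff context above) computes the linear page offset and
dispatches on the fault type, approximately:

        pgoff_t pgoff = (((address & PAGE_MASK)
                        - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;

        pte_unmap(page_table);
        if (!(flags & FAULT_FLAG_WRITE))
                return do_read_fault(mm, vma, address, pmd, pgoff, flags,
                                orig_pte);
        if (!(vma->vm_flags & VM_SHARED))
                return do_cow_fault(mm, vma, address, pmd, pgoff, flags,
                                orig_pte);
        return do_shared_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);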
@@ -3218,15 +3114,12 @@ static int handle_pte_fault(struct mm_struct *mm,
                if (pte_none(entry)) {
                        if (vma->vm_ops) {
                                if (likely(vma->vm_ops->fault))
-                                       return do_linear_fault(mm, vma, address,
-                                               pte, pmd, flags, entry);
+                                       return do_fault(mm, vma, address, pte,
+                                                       pmd, flags, entry);
                        }
                        return do_anonymous_page(mm, vma, address,
                                                 pte, pmd, flags);
                }
-               if (pte_file(entry))
-                       return do_nonlinear_fault(mm, vma, address,
-                                       pte, pmd, flags, entry);
                return do_swap_page(mm, vma, address,
                                        pte, pmd, flags, entry);
        }
@@ -3430,15 +3323,17 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
 
        spin_lock(&mm->page_table_lock);
 #ifndef __ARCH_HAS_4LEVEL_HACK
-       if (pud_present(*pud))          /* Another has populated it */
-               pmd_free(mm, new);
-       else
+       if (!pud_present(*pud)) {
+               mm_inc_nr_pmds(mm);
                pud_populate(mm, pud, new);
-#else
-       if (pgd_present(*pud))          /* Another has populated it */
+       } else  /* Another has populated it */
                pmd_free(mm, new);
-       else
+#else
+       if (!pgd_present(*pud)) {
+               mm_inc_nr_pmds(mm);
                pgd_populate(mm, pud, new);
+       } else /* Another has populated it */
+               pmd_free(mm, new);
 #endif /* __ARCH_HAS_4LEVEL_HACK */
        spin_unlock(&mm->page_table_lock);
        return 0;
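Elsewhere in the same change, the new counter is sanity-checked when an mm
is destroyed, so leaked PMD tables become visible; a sketch of the check in
kernel/fork.c's check_mm():

        if (mm_nr_pmds(mm))
                pr_alert("BUG: non-zero nr_pmds on freeing mm: %ld\n",
                         mm_nr_pmds(mm));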