mm: record MAP_NORESERVE status on vmas and fix small page mprotect reservations
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index bbf953e..3e873f0 100644
@@ -40,6 +40,125 @@ static int hugetlb_next_nid;
  */
 static DEFINE_SPINLOCK(hugetlb_lock);
 
+/*
+ * Convert the address within this vma to the page offset within
+ * the mapping, in base page units.
+ */
+static pgoff_t vma_page_offset(struct vm_area_struct *vma,
+                               unsigned long address)
+{
+       return ((address - vma->vm_start) >> PAGE_SHIFT) +
+                                       (vma->vm_pgoff >> PAGE_SHIFT);
+}
+
+/*
+ * Convert the address within this vma to the page offset within
+ * the mapping, in pagecache page units; huge pages here.
+ */
+static pgoff_t vma_pagecache_offset(struct vm_area_struct *vma,
+                                       unsigned long address)
+{
+       return ((address - vma->vm_start) >> HPAGE_SHIFT) +
+                       (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
+}
+
+#define HPAGE_RESV_OWNER    (1UL << (BITS_PER_LONG - 1))
+#define HPAGE_RESV_UNMAPPED (1UL << (BITS_PER_LONG - 2))
+#define HPAGE_RESV_MASK (HPAGE_RESV_OWNER | HPAGE_RESV_UNMAPPED)
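+/*
+ * For a private mapping, the reserve count and the HPAGE_RESV_* flags above
+ * are packed together into vm_private_data: the low bits hold the number of
+ * huge pages still reserved for this VMA while the top two bits hold the
+ * flags, so the owner of a fresh 4-page reservation stores
+ * (4 | HPAGE_RESV_OWNER).
+ */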
+/*
+ * These helpers are used to track how many pages are reserved for
+ * faults in a MAP_PRIVATE mapping. Only the process that called mmap()
+ * is guaranteed to have its future faults succeed.
+ *
+ * With the exception of reset_vma_resv_huge_pages() which is called at fork(),
+ * the reserve counters are updated with the hugetlb_lock held. It is safe
+ * to reset the VMA at fork() time as it is not in use yet and there is no
+ * chance of the global counters getting corrupted as a result of the values.
+ */
+static unsigned long get_vma_private_data(struct vm_area_struct *vma)
+{
+       return (unsigned long)vma->vm_private_data;
+}
+
+static void set_vma_private_data(struct vm_area_struct *vma,
+                                                       unsigned long value)
+{
+       vma->vm_private_data = (void *)value;
+}
+
+static unsigned long vma_resv_huge_pages(struct vm_area_struct *vma)
+{
+       VM_BUG_ON(!is_vm_hugetlb_page(vma));
+       if (!(vma->vm_flags & VM_SHARED))
+               return get_vma_private_data(vma) & ~HPAGE_RESV_MASK;
+       return 0;
+}
+
+static void set_vma_resv_huge_pages(struct vm_area_struct *vma,
+                                                       unsigned long reserve)
+{
+       VM_BUG_ON(!is_vm_hugetlb_page(vma));
+       VM_BUG_ON(vma->vm_flags & VM_SHARED);
+
+       set_vma_private_data(vma,
+               (get_vma_private_data(vma) & HPAGE_RESV_MASK) | reserve);
+}
+
+static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags)
+{
+       VM_BUG_ON(!is_vm_hugetlb_page(vma));
+       VM_BUG_ON(vma->vm_flags & VM_SHARED);
+
+       set_vma_private_data(vma, get_vma_private_data(vma) | flags);
+}
+
+static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag)
+{
+       VM_BUG_ON(!is_vm_hugetlb_page(vma));
+
+       return (get_vma_private_data(vma) & flag) != 0;
+}
+
+/* Decrement the reserved pages in the hugepage pool by one */
+static void decrement_hugepage_resv_vma(struct vm_area_struct *vma)
+{
+       if (vma->vm_flags & VM_SHARED) {
+               /* Shared mappings always use reserves */
+               resv_huge_pages--;
+       } else {
+               /*
+                * Only the process that called mmap() has reserves for
+                * private mappings.
+                */
+               if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
+                       unsigned long flags, reserve;
+                       resv_huge_pages--;
+                       flags = (unsigned long)vma->vm_private_data &
+                                                       HPAGE_RESV_MASK;
+                       reserve = (unsigned long)vma->vm_private_data - 1;
+                       vma->vm_private_data = (void *)(reserve | flags);
+               }
+       }
+}
+
+/* Reset counters to 0 and clear all HPAGE_RESV_* flags */
+void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
+{
+       VM_BUG_ON(!is_vm_hugetlb_page(vma));
+       if (!(vma->vm_flags & VM_SHARED))
+               vma->vm_private_data = (void *)0;
+}
+
+/* Returns true if the VMA has associated reserve pages */
+static int vma_has_private_reserves(struct vm_area_struct *vma)
+{
+       if (vma->vm_flags & VM_SHARED)
+               return 0;
+       if (!vma_resv_huge_pages(vma))
+               return 0;
+       return 1;
+}
+
 static void clear_huge_page(struct page *page, unsigned long addr)
 {
        int i;
@@ -90,7 +209,7 @@ static struct page *dequeue_huge_page(void)
 }
 
 static struct page *dequeue_huge_page_vma(struct vm_area_struct *vma,
-                               unsigned long address)
+                               unsigned long address, int avoid_reserve)
 {
        int nid;
        struct page *page = NULL;
@@ -101,6 +220,19 @@ static struct page *dequeue_huge_page_vma(struct vm_area_struct *vma,
        struct zone *zone;
        struct zoneref *z;
 
+       /*
+        * A child process with MAP_PRIVATE mappings created by its parent
+        * has no page reserves. This check ensures that reservations are
+        * not "stolen". The child may still get SIGKILLed.
+        */
+       if (!vma_has_private_reserves(vma) &&
+                       free_huge_pages - resv_huge_pages == 0)
+               return NULL;
+
+       /* If reserves cannot be used, ensure enough pages are in the pool */
+       if (avoid_reserve && free_huge_pages - resv_huge_pages == 0)
+               return NULL;
+
        for_each_zone_zonelist_nodemask(zone, z, zonelist,
                                                MAX_NR_ZONES - 1, nodemask) {
                nid = zone_to_nid(zone);
@@ -111,8 +243,10 @@ static struct page *dequeue_huge_page_vma(struct vm_area_struct *vma,
                        list_del(&page->lru);
                        free_huge_pages--;
                        free_huge_pages_node[nid]--;
-                       if (vma && vma->vm_flags & VM_MAYSHARE)
-                               resv_huge_pages--;
+
+                       if (!avoid_reserve)
+                               decrement_hugepage_resv_vma(vma);
+
                        break;
                }
        }
@@ -461,55 +595,41 @@ static void return_unused_surplus_pages(unsigned long unused_resv_pages)
        }
 }
 
-
-static struct page *alloc_huge_page_shared(struct vm_area_struct *vma,
-                                               unsigned long addr)
+static struct page *alloc_huge_page(struct vm_area_struct *vma,
+                                   unsigned long addr, int avoid_reserve)
 {
        struct page *page;
+       struct address_space *mapping = vma->vm_file->f_mapping;
+       struct inode *inode = mapping->host;
+       unsigned int chg = 0;
 
-       spin_lock(&hugetlb_lock);
-       page = dequeue_huge_page_vma(vma, addr);
-       spin_unlock(&hugetlb_lock);
-       return page ? page : ERR_PTR(-VM_FAULT_OOM);
-}
-
-static struct page *alloc_huge_page_private(struct vm_area_struct *vma,
-                                               unsigned long addr)
-{
-       struct page *page = NULL;
-
-       if (hugetlb_get_quota(vma->vm_file->f_mapping, 1))
-               return ERR_PTR(-VM_FAULT_SIGBUS);
+       /*
+        * Processes that did not create the mapping will have no reserves and
+        * will not have accounted against quota. Check that the quota can be
+        * charged before satisfying the allocation.
+        */
+       if (!(vma->vm_flags & VM_SHARED) &&
+                       !is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
+               chg = 1;
+               if (hugetlb_get_quota(inode->i_mapping, chg))
+                       return ERR_PTR(-ENOSPC);
+       }
 
        spin_lock(&hugetlb_lock);
-       if (free_huge_pages > resv_huge_pages)
-               page = dequeue_huge_page_vma(vma, addr);
+       page = dequeue_huge_page_vma(vma, addr, avoid_reserve);
        spin_unlock(&hugetlb_lock);
+
        if (!page) {
                page = alloc_buddy_huge_page(vma, addr);
                if (!page) {
-                       hugetlb_put_quota(vma->vm_file->f_mapping, 1);
+                       hugetlb_put_quota(inode->i_mapping, chg);
                        return ERR_PTR(-VM_FAULT_OOM);
                }
        }
-       return page;
-}
-
-static struct page *alloc_huge_page(struct vm_area_struct *vma,
-                                   unsigned long addr)
-{
-       struct page *page;
-       struct address_space *mapping = vma->vm_file->f_mapping;
 
-       if (vma->vm_flags & VM_MAYSHARE)
-               page = alloc_huge_page_shared(vma, addr);
-       else
-               page = alloc_huge_page_private(vma, addr);
+       set_page_refcounted(page);
+       set_page_private(page, (unsigned long) mapping);
 
-       if (!IS_ERR(page)) {
-               set_page_refcounted(page);
-               set_page_private(page, (unsigned long) mapping);
-       }
        return page;
 }
 
@@ -603,7 +723,6 @@ static unsigned long set_max_huge_pages(unsigned long count)
        }
 
        while (count > persistent_huge_pages) {
-               int ret;
                /*
                 * If this allocation races such that we no longer need the
                 * page, free_huge_page will handle it by freeing the page
@@ -717,6 +836,54 @@ unsigned long hugetlb_total_pages(void)
        return nr_huge_pages * (HPAGE_SIZE / PAGE_SIZE);
 }
 
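+/*
+ * Adjust the global reservation by delta huge pages: a positive delta tries
+ * to grow the pool with surplus pages from the buddy allocator, a negative
+ * delta releases surplus pages that are no longer needed.
+ */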
+static int hugetlb_acct_memory(long delta)
+{
+       int ret = -ENOMEM;
+
+       spin_lock(&hugetlb_lock);
+       /*
+        * When cpuset is configured, it breaks the strict hugetlb page
+        * reservation as the accounting is done on a global variable. Such
+        * reservation is completely rubbish in the presence of cpuset because
+        * the reservation is not checked against page availability for the
+        * current cpuset. Application can still potentially OOM'ed by kernel
+        * current cpuset. The application can still be OOM'ed by the kernel
+        * if there is a lack of free huge pages in the cpuset that the task
+        * is in. Attempting to enforce strict accounting with cpusets is
+        * almost impossible (or too ugly) because cpusets are so fluid that
+        * a task or memory node can be dynamically moved between cpusets.
+        *
+        * The change of semantics for shared hugetlb mapping with cpuset is
+        * undesirable. However, in order to preserve some of the semantics,
+        * we fall back to checking against the current free page availability
+        * as a best effort, hopefully minimizing the impact of the semantic
+        * change that cpusets cause.
+        */
+       if (delta > 0) {
+               if (gather_surplus_pages(delta) < 0)
+                       goto out;
+
+               if (delta > cpuset_mems_nr(free_huge_pages_node)) {
+                       return_unused_surplus_pages(delta);
+                       goto out;
+               }
+       }
+
+       ret = 0;
+       if (delta < 0)
+               return_unused_surplus_pages((unsigned long) -delta);
+
+out:
+       spin_unlock(&hugetlb_lock);
+       return ret;
+}
+
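+/*
+ * Only private mappings carry a per-VMA reserve (vma_resv_huge_pages()
+ * returns 0 for shared mappings), so this releases whatever reservation the
+ * VMA still holds but never faulted in when the mapping is torn down.
+ */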
+static void hugetlb_vm_op_close(struct vm_area_struct *vma)
+{
+       unsigned long reserve = vma_resv_huge_pages(vma);
+       if (reserve)
+               hugetlb_acct_memory(-reserve);
+}
+
 /*
  * We cannot handle pagefaults against hugetlb pages at all.  They cause
  * handle_mm_fault() to try to instantiate regular-sized pages in the
@@ -731,6 +898,7 @@ static int hugetlb_vm_op_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 
 struct vm_operations_struct hugetlb_vm_ops = {
        .fault = hugetlb_vm_op_fault,
+       .close = hugetlb_vm_op_close,
 };
 
 static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
@@ -785,7 +953,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
                        continue;
 
                spin_lock(&dst->page_table_lock);
-               spin_lock(&src->page_table_lock);
+               spin_lock_nested(&src->page_table_lock, SINGLE_DEPTH_NESTING);
                if (!huge_pte_none(huge_ptep_get(src_pte))) {
                        if (cow)
                                huge_ptep_set_wrprotect(src, addr, src_pte);
@@ -804,7 +972,7 @@ nomem:
 }
 
 void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
-                           unsigned long end)
+                           unsigned long end, struct page *ref_page)
 {
        struct mm_struct *mm = vma->vm_mm;
        unsigned long address;
@@ -832,6 +1000,27 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
                if (huge_pmd_unshare(mm, &address, ptep))
                        continue;
 
+               /*
+                * If a reference page is supplied, it is because a specific
+                * page is being unmapped, not a range. Ensure the page we
+                * are about to unmap is the actual page of interest.
+                */
+               if (ref_page) {
+                       pte = huge_ptep_get(ptep);
+                       if (huge_pte_none(pte))
+                               continue;
+                       page = pte_page(pte);
+                       if (page != ref_page)
+                               continue;
+
+                       /*
+                        * Mark the VMA as having unmapped its page so that
+                        * future faults in this VMA will fail rather than
+                        * looking like data was lost
+                        */
+                       set_vma_resv_flags(vma, HPAGE_RESV_UNMAPPED);
+               }
+
                pte = huge_ptep_get_and_clear(mm, address, ptep);
                if (huge_pte_none(pte))
                        continue;
@@ -850,7 +1039,7 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
 }
 
 void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
-                         unsigned long end)
+                         unsigned long end, struct page *ref_page)
 {
        /*
         * It is undesirable to test vma->vm_file as it should be non-null
@@ -862,19 +1051,68 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
         */
        if (vma->vm_file) {
                spin_lock(&vma->vm_file->f_mapping->i_mmap_lock);
-               __unmap_hugepage_range(vma, start, end);
+               __unmap_hugepage_range(vma, start, end, ref_page);
                spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock);
        }
 }
 
+/*
+ * This is called when the original mapper is failing to COW a MAP_PRIVATE
+ * mapping it owns the reserve page for. The intention is to unmap the page
+ * from other VMAs and let the children be SIGKILLed if they are faulting the
+ * same region.
+ */
+int unmap_ref_private(struct mm_struct *mm,
+                                       struct vm_area_struct *vma,
+                                       struct page *page,
+                                       unsigned long address)
+{
+       struct vm_area_struct *iter_vma;
+       struct address_space *mapping;
+       struct prio_tree_iter iter;
+       pgoff_t pgoff;
+
+       /*
+        * vm_pgoff is in PAGE_SIZE units, hence the different calculation
+        * from page cache lookup which is in HPAGE_SIZE units.
+        */
+       address = address & huge_page_mask(hstate_vma(vma));
+       pgoff = ((address - vma->vm_start) >> PAGE_SHIFT)
+               + (vma->vm_pgoff >> PAGE_SHIFT);
+       mapping = (struct address_space *)page_private(page);
+
+       vma_prio_tree_foreach(iter_vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
+               /* Do not unmap the current VMA */
+               if (iter_vma == vma)
+                       continue;
+
+               /*
+                * Unmap the page from other VMAs without their own reserves.
+                * They get marked to be SIGKILLed if they fault in these
+                * areas. This is because a future no-page fault on this VMA
+                * could insert a zeroed page instead of the data existing
+                * from the time of fork. This would look like data corruption
+                */
+               if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER))
+                       unmap_hugepage_range(iter_vma,
+                               address, address + HPAGE_SIZE,
+                               page);
+       }
+
+       return 1;
+}
+
 static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
-                       unsigned long address, pte_t *ptep, pte_t pte)
+                       unsigned long address, pte_t *ptep, pte_t pte,
+                       struct page *pagecache_page)
 {
        struct page *old_page, *new_page;
        int avoidcopy;
+       int outside_reserve = 0;
 
        old_page = pte_page(pte);
 
+retry_avoidcopy:
        /* If no-one else is actually using this page, avoid the copy
         * and just make the page writable */
        avoidcopy = (page_count(old_page) == 1);
@@ -883,11 +1121,43 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
                return 0;
        }
 
+       /*
+        * If the process that created a MAP_PRIVATE mapping is about to
+        * perform a COW due to a shared page count, attempt to satisfy
+        * the allocation without using the existing reserves. The pagecache
+        * page is used to determine if the reserve at this address was
+        * consumed or not. If reserves were used, a partial faulted mapping
+        * at the time of fork() could consume its reserves on COW instead
+        * of the full address range.
+        */
+       if (!(vma->vm_flags & VM_SHARED) &&
+                       is_vma_resv_set(vma, HPAGE_RESV_OWNER) &&
+                       old_page != pagecache_page)
+               outside_reserve = 1;
+
        page_cache_get(old_page);
-       new_page = alloc_huge_page(vma, address);
+       new_page = alloc_huge_page(vma, address, outside_reserve);
 
        if (IS_ERR(new_page)) {
                page_cache_release(old_page);
+
+               /*
+                * If a process owning a MAP_PRIVATE mapping fails to COW,
+                * it is due to references held by a child and an insufficient
+                * huge page pool. To guarantee the original mapper's
+                * reliability, unmap the page from child processes. The child
+                * may get SIGKILLed if it later faults.
+                */
+               if (outside_reserve) {
+                       BUG_ON(huge_pte_none(pte));
+                       if (unmap_ref_private(mm, vma, old_page, address)) {
+                               BUG_ON(page_count(old_page) != 1);
+                               BUG_ON(huge_pte_none(pte));
+                               goto retry_avoidcopy;
+                       }
+                       WARN_ON_ONCE(1);
+               }
+
                return -PTR_ERR(new_page);
        }
 
@@ -910,19 +1180,43 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
        return 0;
 }
 
+/* Return the pagecache page at a given address within a VMA */
+static struct page *hugetlbfs_pagecache_page(struct vm_area_struct *vma,
+                       unsigned long address)
+{
+       struct address_space *mapping;
+       pgoff_t idx;
+
+       mapping = vma->vm_file->f_mapping;
+       idx = vma_pagecache_offset(vma, address);
+
+       return find_lock_page(mapping, idx);
+}
+
 static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
                        unsigned long address, pte_t *ptep, int write_access)
 {
        int ret = VM_FAULT_SIGBUS;
-       unsigned long idx;
+       pgoff_t idx;
        unsigned long size;
        struct page *page;
        struct address_space *mapping;
        pte_t new_pte;
 
+       /*
+        * Currently, we are forced to kill the process in the event the
+        * original mapper has unmapped pages from the child due to a failed
+        * COW. Warn that such a situation has occurred as it may not be obvious.
+        */
+       if (is_vma_resv_set(vma, HPAGE_RESV_UNMAPPED)) {
+               printk(KERN_WARNING
+                       "PID %d killed due to inadequate hugepage pool\n",
+                       current->pid);
+               return ret;
+       }
+
        mapping = vma->vm_file->f_mapping;
-       idx = ((address - vma->vm_start) >> HPAGE_SHIFT)
-               + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
+       idx = vma_pagecache_offset(vma, address);
 
        /*
         * Use page lock to guard against racing truncation
@@ -934,7 +1228,7 @@ retry:
                size = i_size_read(mapping->host) >> HPAGE_SHIFT;
                if (idx >= size)
                        goto out;
-               page = alloc_huge_page(vma, address);
+               page = alloc_huge_page(vma, address, 0);
                if (IS_ERR(page)) {
                        ret = -PTR_ERR(page);
                        goto out;
@@ -976,7 +1270,7 @@ retry:
 
        if (write_access && !(vma->vm_flags & VM_SHARED)) {
                /* Optimization, do the COW without a second fault */
-               ret = hugetlb_cow(mm, vma, address, ptep, new_pte);
+               ret = hugetlb_cow(mm, vma, address, ptep, new_pte, page);
        }
 
        spin_unlock(&mm->page_table_lock);
@@ -1021,8 +1315,15 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
        spin_lock(&mm->page_table_lock);
        /* Check for a racing update before calling hugetlb_cow */
        if (likely(pte_same(entry, huge_ptep_get(ptep))))
-               if (write_access && !pte_write(entry))
-                       ret = hugetlb_cow(mm, vma, address, ptep, entry);
+               if (write_access && !pte_write(entry)) {
+                       struct page *page;
+                       page = hugetlbfs_pagecache_page(vma, address);
+                       ret = hugetlb_cow(mm, vma, address, ptep, entry, page);
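+                       /*
+                        * hugetlbfs_pagecache_page() returned the page locked
+                        * and with a reference via find_lock_page(); drop both
+                        * now that the COW has been handled.
+                        */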
+                       if (page) {
+                               unlock_page(page);
+                               put_page(page);
+                       }
+               }
        spin_unlock(&mm->page_table_lock);
        mutex_unlock(&hugetlb_instantiation_mutex);
 
@@ -1249,52 +1550,26 @@ static long region_truncate(struct list_head *head, long end)
        return chg;
 }
 
-static int hugetlb_acct_memory(long delta)
+int hugetlb_reserve_pages(struct inode *inode,
+                                       long from, long to,
+                                       struct vm_area_struct *vma)
 {
-       int ret = -ENOMEM;
+       long ret, chg;
 
-       spin_lock(&hugetlb_lock);
        /*
-        * When cpuset is configured, it breaks the strict hugetlb page
-        * reservation as the accounting is done on a global variable. Such
-        * reservation is completely rubbish in the presence of cpuset because
-        * the reservation is not checked against page availability for the
-        * current cpuset. Application can still potentially OOM'ed by kernel
-        * with lack of free htlb page in cpuset that the task is in.
-        * Attempt to enforce strict accounting with cpuset is almost
-        * impossible (or too ugly) because cpuset is too fluid that
-        * task or memory node can be dynamically moved between cpusets.
-        *
-        * The change of semantics for shared hugetlb mapping with cpuset is
-        * undesirable. However, in order to preserve some of the semantics,
-        * we fall back to check against current free page availability as
-        * a best attempt and hopefully to minimize the impact of changing
-        * semantics that cpuset has.
+        * Shared mappings base their reservation on the number of pages that
+        * are already allocated on behalf of the file. Private mappings need
+        * to reserve the full area even if read-only as mprotect() may be
+        * called to make the mapping read-write. Assume !vma is a shm mapping
         */
-       if (delta > 0) {
-               if (gather_surplus_pages(delta) < 0)
-                       goto out;
-
-               if (delta > cpuset_mems_nr(free_huge_pages_node)) {
-                       return_unused_surplus_pages(delta);
-                       goto out;
-               }
+       if (!vma || vma->vm_flags & VM_SHARED)
+               chg = region_chg(&inode->i_mapping->private_list, from, to);
+       else {
+               chg = to - from;
+               set_vma_resv_huge_pages(vma, chg);
+               set_vma_resv_flags(vma, HPAGE_RESV_OWNER);
        }
 
-       ret = 0;
-       if (delta < 0)
-               return_unused_surplus_pages((unsigned long) -delta);
-
-out:
-       spin_unlock(&hugetlb_lock);
-       return ret;
-}
-
-int hugetlb_reserve_pages(struct inode *inode, long from, long to)
-{
-       long ret, chg;
-
-       chg = region_chg(&inode->i_mapping->private_list, from, to);
        if (chg < 0)
                return chg;
 
@@ -1305,7 +1580,8 @@ int hugetlb_reserve_pages(struct inode *inode, long from, long to)
                hugetlb_put_quota(inode->i_mapping, chg);
                return ret;
        }
-       region_add(&inode->i_mapping->private_list, from, to);
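+       /*
+        * Shared mappings record the reserved region against the inode so
+        * that future mappers of the same file see it; a private reservation
+        * lives only in the VMA set up above and is released when the VMA
+        * goes away.
+        */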
+       if (!vma || vma->vm_flags & VM_SHARED)
+               region_add(&inode->i_mapping->private_list, from, to);
        return 0;
 }
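
The user-visible semantics that the reservation tracking above provides can be
sketched with a small hugetlbfs test program. This is illustrative only and
not part of the patch: the hugetlbfs mount point and the 2MB huge page size
are assumptions and need adjusting for the local system.

/*
 * The parent creates a MAP_PRIVATE hugetlbfs mapping and so becomes the
 * reserve owner (HPAGE_RESV_OWNER); its faults are expected to succeed.
 * The forked child has no reserves, so if the pool is exhausted its write
 * (a COW fault) may be SIGKILLed rather than eating into the parent's
 * reservation.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/wait.h>
#include <unistd.h>

#define HUGE_FILE  "/mnt/huge/resv-demo"	/* assumed hugetlbfs mount */
#define MAP_LENGTH (2UL * 1024 * 1024)		/* assumed huge page size */

int main(void)
{
	int fd, status;
	char *addr;

	fd = open(HUGE_FILE, O_CREAT | O_RDWR, 0600);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	addr = mmap(NULL, MAP_LENGTH, PROT_READ | PROT_WRITE,
		    MAP_PRIVATE, fd, 0);
	if (addr == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	/* Parent fault: satisfied from the reserve this mapping owns */
	memset(addr, 0x5a, MAP_LENGTH);

	if (fork() == 0) {
		/* Child COW fault: no reserve, may be killed under pressure */
		addr[0] = 0x42;
		_exit(0);
	}
	wait(&status);
	if (WIFSIGNALED(status))
		printf("child killed by signal %d\n", WTERMSIG(status));

	/* The reserve owner's writes are still expected to succeed */
	addr[0] = 0x24;

	munmap(addr, MAP_LENGTH);
	close(fd);
	unlink(HUGE_FILE);
	return 0;
}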