mm/hugetlb: clear compound_mapcount when freeing gigantic pages
[cascardo/linux.git] / mm / hugetlb.c
index 19d0d08..c1f3c0b 100644 (file)
@@ -51,6 +51,7 @@ __initdata LIST_HEAD(huge_boot_pages);
 static struct hstate * __initdata parsed_hstate;
 static unsigned long __initdata default_hstate_max_huge_pages;
 static unsigned long __initdata default_hstate_size;
+static bool __initdata parsed_valid_hugepagesz = true;
 
 /*
  * Protects updates to hugepage_freelists, hugepage_activelist, nr_huge_pages,
@@ -144,7 +145,8 @@ static long hugepage_subpool_get_pages(struct hugepage_subpool *spool,
                }
        }
 
-       if (spool->min_hpages != -1) {          /* minimum size accounting */
+       /* minimum size accounting */
+       if (spool->min_hpages != -1 && spool->rsv_hpages) {
                if (delta > spool->rsv_hpages) {
                        /*
                         * Asking for more reserves than those already taken on
@@ -182,7 +184,8 @@ static long hugepage_subpool_put_pages(struct hugepage_subpool *spool,
        if (spool->max_hpages != -1)            /* maximum size accounting */
                spool->used_hpages -= delta;
 
-       if (spool->min_hpages != -1) {          /* minimum size accounting */
+       /* minimum size accounting */
+       if (spool->min_hpages != -1 && spool->used_hpages < spool->min_hpages) {
                if (spool->rsv_hpages + delta <= spool->min_hpages)
                        ret = 0;
                else
@@ -624,6 +627,7 @@ pgoff_t linear_hugepage_index(struct vm_area_struct *vma,
 {
        return vma_hugecache_offset(hstate_vma(vma), vma, address);
 }
+EXPORT_SYMBOL_GPL(linear_hugepage_index);
 
 /*
  * Return the size of the pages allocated when backing a VMA. In the majority
@@ -828,8 +832,27 @@ static bool vma_has_reserves(struct vm_area_struct *vma, long chg)
         * Only the process that called mmap() has reserves for
         * private mappings.
         */
-       if (is_vma_resv_set(vma, HPAGE_RESV_OWNER))
-               return true;
+       if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
+               /*
+                * Like the shared case above, a hole punch or truncate
+                * could have been performed on the private mapping.
+                * Examine the value of chg to determine if reserves
+                * actually exist or were previously consumed.
+                * Very Subtle - The value of chg comes from a previous
+                * call to vma_needs_reserves().  The reserve map for
+                * private mappings has different (opposite) semantics
+                * than that of shared mappings.  vma_needs_reserves()
+                * has already taken this difference in semantics into
+                * account.  Therefore, the meaning of chg is the same
+                * as in the shared case above.  Code could easily be
+                * combined, but keeping it separate draws attention to
+                * subtle differences.
+                */
+               if (chg)
+                       return false;
+               else
+                       return true;
+       }
 
        return false;
 }
@@ -937,9 +960,7 @@ err:
  */
 static int next_node_allowed(int nid, nodemask_t *nodes_allowed)
 {
-       nid = next_node(nid, *nodes_allowed);
-       if (nid == MAX_NUMNODES)
-               nid = first_node(*nodes_allowed);
+       nid = next_node_in(nid, *nodes_allowed);
        VM_BUG_ON(nid >= MAX_NUMNODES);
 
        return nid;
@@ -1009,6 +1030,7 @@ static void destroy_compound_gigantic_page(struct page *page,
        int nr_pages = 1 << order;
        struct page *p = page + 1;
 
+       atomic_set(compound_mapcount_ptr(page), 0);
        for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
                clear_compound_head(p);
                set_page_refcounted(p);
@@ -1030,8 +1052,8 @@ static int __alloc_gigantic_page(unsigned long start_pfn,
        return alloc_contig_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
 }
 
-static bool pfn_range_valid_gigantic(unsigned long start_pfn,
-                               unsigned long nr_pages)
+static bool pfn_range_valid_gigantic(struct zone *z,
+                       unsigned long start_pfn, unsigned long nr_pages)
 {
        unsigned long i, end_pfn = start_pfn + nr_pages;
        struct page *page;
@@ -1042,6 +1064,9 @@ static bool pfn_range_valid_gigantic(unsigned long start_pfn,
 
                page = pfn_to_page(i);
 
+               if (page_zone(page) != z)
+                       return false;
+
                if (PageReserved(page))
                        return false;
 
@@ -1074,7 +1099,7 @@ static struct page *alloc_gigantic_page(int nid, unsigned int order)
 
                pfn = ALIGN(z->zone_start_pfn, nr_pages);
                while (zone_spans_last_pfn(z, pfn, nr_pages)) {
-                       if (pfn_range_valid_gigantic(pfn, nr_pages)) {
+                       if (pfn_range_valid_gigantic(z, pfn, nr_pages)) {
                                /*
                                 * We release the zone lock here because
                                 * alloc_contig_range() will also lock the zone
@@ -1811,6 +1836,25 @@ static long __vma_reservation_common(struct hstate *h,
 
        if (vma->vm_flags & VM_MAYSHARE)
                return ret;
+       else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER) && ret >= 0) {
+               /*
+                * In most cases, reserves always exist for private mappings.
+                * However, a file associated with mapping could have been
+                * hole punched or truncated after reserves were consumed.
+                * A subsequent fault on such a range will not use reserves.
+                * Subtle - The reserve map for private mappings has the
+                * opposite meaning than that of shared mappings.  If NO
+                * entry is in the reserve map, it means a reservation exists.
+                * If an entry exists in the reserve map, it means the
+                * reservation has already been consumed.  As a result, the
+                * return value of this routine is the opposite of the
+                * value returned from reserve map manipulation routines above.
+                */
+               if (ret)
+                       return 0;
+               else
+                       return 1;
+       }
        else
                return ret < 0 ? ret : 0;
 }
@@ -2659,6 +2703,11 @@ static int __init hugetlb_init(void)
 subsys_initcall(hugetlb_init);
 
 /* Should be called on processing a hugepagesz=... option */
+void __init hugetlb_bad_size(void)
+{
+       parsed_valid_hugepagesz = false;
+}
+
 void __init hugetlb_add_hstate(unsigned int order)
 {
        struct hstate *h;
@@ -2678,8 +2727,8 @@ void __init hugetlb_add_hstate(unsigned int order)
        for (i = 0; i < MAX_NUMNODES; ++i)
                INIT_LIST_HEAD(&h->hugepage_freelists[i]);
        INIT_LIST_HEAD(&h->hugepage_activelist);
-       h->next_nid_to_alloc = first_node(node_states[N_MEMORY]);
-       h->next_nid_to_free = first_node(node_states[N_MEMORY]);
+       h->next_nid_to_alloc = first_memory_node;
+       h->next_nid_to_free = first_memory_node;
        snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB",
                                        huge_page_size(h)/1024);
 
@@ -2691,11 +2740,17 @@ static int __init hugetlb_nrpages_setup(char *s)
        unsigned long *mhp;
        static unsigned long *last_mhp;
 
+       if (!parsed_valid_hugepagesz) {
+               pr_warn("hugepages = %s preceded by "
+                       "an unsupported hugepagesz, ignoring\n", s);
+               parsed_valid_hugepagesz = true;
+               return 1;
+       }
        /*
         * !hugetlb_max_hstate means we haven't parsed a hugepagesz= parameter yet,
         * so this hugepages= parameter goes to the "default hstate".
         */
-       if (!hugetlb_max_hstate)
+       else if (!hugetlb_max_hstate)
                mhp = &default_hstate_max_huge_pages;
        else
                mhp = &parsed_hstate->max_huge_pages;
@@ -4174,7 +4229,6 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
                if (saddr) {
                        spte = huge_pte_offset(svma->vm_mm, saddr);
                        if (spte) {
-                               mm_inc_nr_pmds(mm);
                                get_page(virt_to_page(spte));
                                break;
                        }
@@ -4189,9 +4243,9 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
        if (pud_none(*pud)) {
                pud_populate(mm, pud,
                                (pmd_t *)((unsigned long)spte & PAGE_MASK));
+               mm_inc_nr_pmds(mm);
        } else {
                put_page(virt_to_page(spte));
-               mm_inc_nr_pmds(mm);
        }
        spin_unlock(ptl);
 out: