#include <asm/page.h>
#include <asm/pgtable.h>
-#include <linux/io.h>
+#include <asm/tlb.h>
+#include <linux/io.h>
#include <linux/hugetlb.h>
+#include <linux/hugetlb_cgroup.h>
#include <linux/node.h>
+#include <linux/hugetlb_cgroup.h>
#include "internal.h"
const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
static gfp_t htlb_alloc_mask = GFP_HIGHUSER;
unsigned long hugepages_treat_as_movable;
-static int max_hstate;
+int hugetlb_max_hstate __read_mostly;
unsigned int default_hstate_idx;
struct hstate hstates[HUGE_MAX_HSTATE];
static unsigned long __initdata default_hstate_max_huge_pages;
static unsigned long __initdata default_hstate_size;
-#define for_each_hstate(h) \
- for ((h) = hstates; (h) < &hstates[max_hstate]; (h)++)
-
/*
* Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages
*/
-static DEFINE_SPINLOCK(hugetlb_lock);
+DEFINE_SPINLOCK(hugetlb_lock);
static inline void unlock_or_release_subpool(struct hugepage_subpool *spool)
{
static void enqueue_huge_page(struct hstate *h, struct page *page)
{
int nid = page_to_nid(page);
- list_add(&page->lru, &h->hugepage_freelists[nid]);
+ list_move(&page->lru, &h->hugepage_freelists[nid]);
h->free_huge_pages++;
h->free_huge_pages_node[nid]++;
}
if (list_empty(&h->hugepage_freelists[nid]))
return NULL;
page = list_entry(h->hugepage_freelists[nid].next, struct page, lru);
- list_del(&page->lru);
+ list_move(&page->lru, &h->hugepage_activelist);
set_page_refcounted(page);
h->free_huge_pages--;
h->free_huge_pages_node[nid]--;
1 << PG_active | 1 << PG_reserved |
1 << PG_private | 1 << PG_writeback);
}
+ VM_BUG_ON(hugetlb_cgroup_from_page(page));
set_compound_page_dtor(page, NULL);
set_page_refcounted(page);
arch_release_hugepage(page);
page->mapping = NULL;
BUG_ON(page_count(page));
BUG_ON(page_mapcount(page));
- INIT_LIST_HEAD(&page->lru);
spin_lock(&hugetlb_lock);
+ hugetlb_cgroup_uncharge_page(hstate_index(h),
+ pages_per_huge_page(h), page);
if (h->surplus_huge_pages_node[nid] && huge_page_order(h) < MAX_ORDER) {
+ /* remove the page from active list */
+ list_del(&page->lru);
update_and_free_page(h, page);
h->surplus_huge_pages--;
h->surplus_huge_pages_node[nid]--;
static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
{
+ INIT_LIST_HEAD(&page->lru);
set_compound_page_dtor(page, free_huge_page);
spin_lock(&hugetlb_lock);
+ set_hugetlb_cgroup(page, NULL);
h->nr_huge_pages++;
h->nr_huge_pages_node[nid]++;
spin_unlock(&hugetlb_lock);
spin_lock(&hugetlb_lock);
if (page) {
+ INIT_LIST_HEAD(&page->lru);
r_nid = page_to_nid(page);
set_compound_page_dtor(page, free_huge_page);
+ set_hugetlb_cgroup(page, NULL);
/*
* We incremented the global counters already
*/
list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
if ((--needed) < 0)
break;
- list_del(&page->lru);
/*
* This page is now managed by the hugetlb allocator and has
* no users -- drop the buddy allocator's reference.
/* Free unnecessary surplus pages to the buddy allocator */
if (!list_empty(&surplus_list)) {
list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
- list_del(&page->lru);
put_page(page);
}
}
struct hstate *h = hstate_vma(vma);
struct page *page;
long chg;
+ int ret, idx;
+ struct hugetlb_cgroup *h_cg;
+ idx = hstate_index(h);
/*
* Processes that did not create the mapping will have no
* reserves and will not have accounted against subpool
*/
chg = vma_needs_reservation(h, vma, addr);
if (chg < 0)
- return ERR_PTR(-VM_FAULT_OOM);
+ return ERR_PTR(-ENOMEM);
if (chg)
if (hugepage_subpool_get_pages(spool, chg))
- return ERR_PTR(-VM_FAULT_SIGBUS);
+ return ERR_PTR(-ENOSPC);
+ ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg);
+ if (ret) {
+ hugepage_subpool_put_pages(spool, chg);
+ return ERR_PTR(-ENOSPC);
+ }
spin_lock(&hugetlb_lock);
page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve);
- spin_unlock(&hugetlb_lock);
-
- if (!page) {
+ if (page) {
+ /* update page cgroup details */
+ hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h),
+ h_cg, page);
+ spin_unlock(&hugetlb_lock);
+ } else {
+ spin_unlock(&hugetlb_lock);
page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
if (!page) {
+ hugetlb_cgroup_uncharge_cgroup(idx,
+ pages_per_huge_page(h),
+ h_cg);
hugepage_subpool_put_pages(spool, chg);
- return ERR_PTR(-VM_FAULT_SIGBUS);
+ return ERR_PTR(-ENOSPC);
}
+ spin_lock(&hugetlb_lock);
+ hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h),
+ h_cg, page);
+ list_move(&page->lru, &h->hugepage_activelist);
+ spin_unlock(&hugetlb_lock);
}
set_page_private(page, (unsigned long)spool);
vma_commit_reservation(h, vma, addr);
-
return page;
}
struct attribute_group *hstate_attr_group)
{
int retval;
- int hi = h - hstates;
+ int hi = hstate_index(h);
hstate_kobjs[hi] = kobject_create_and_add(h->name, parent);
if (!hstate_kobjs[hi])
if (!nhs->hugepages_kobj)
return; /* no hstate attributes */
- for_each_hstate(h)
- if (nhs->hstate_kobjs[h - hstates]) {
- kobject_put(nhs->hstate_kobjs[h - hstates]);
- nhs->hstate_kobjs[h - hstates] = NULL;
+ for_each_hstate(h) {
+ int idx = hstate_index(h);
+ if (nhs->hstate_kobjs[idx]) {
+ kobject_put(nhs->hstate_kobjs[idx]);
+ nhs->hstate_kobjs[idx] = NULL;
}
+ }
kobject_put(nhs->hugepages_kobj);
nhs->hugepages_kobj = NULL;
hugetlb_unregister_all_nodes();
for_each_hstate(h) {
- kobject_put(hstate_kobjs[h - hstates]);
+ kobject_put(hstate_kobjs[hstate_index(h)]);
}
kobject_put(hugepages_kobj);
if (!size_to_hstate(default_hstate_size))
hugetlb_add_hstate(HUGETLB_PAGE_ORDER);
}
- default_hstate_idx = size_to_hstate(default_hstate_size) - hstates;
+ default_hstate_idx = hstate_index(size_to_hstate(default_hstate_size));
if (default_hstate_max_huge_pages)
default_hstate.max_huge_pages = default_hstate_max_huge_pages;
printk(KERN_WARNING "hugepagesz= specified twice, ignoring\n");
return;
}
- BUG_ON(max_hstate >= HUGE_MAX_HSTATE);
+ BUG_ON(hugetlb_max_hstate >= HUGE_MAX_HSTATE);
BUG_ON(order == 0);
- h = &hstates[max_hstate++];
+ h = &hstates[hugetlb_max_hstate++];
h->order = order;
h->mask = ~((1ULL << (order + PAGE_SHIFT)) - 1);
h->nr_huge_pages = 0;
h->free_huge_pages = 0;
for (i = 0; i < MAX_NUMNODES; ++i)
INIT_LIST_HEAD(&h->hugepage_freelists[i]);
+ INIT_LIST_HEAD(&h->hugepage_activelist);
h->next_nid_to_alloc = first_node(node_states[N_HIGH_MEMORY]);
h->next_nid_to_free = first_node(node_states[N_HIGH_MEMORY]);
snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB",
huge_page_size(h)/1024);
+ /*
+ * Add cgroup control files only if the huge page consists
+ * of more than two normal pages. This is because we use
+ * page[2].lru.next for storing cgoup details.
+ */
+ if (order >= HUGETLB_CGROUP_MIN_ORDER)
+ hugetlb_cgroup_file_init(hugetlb_max_hstate - 1);
parsed_hstate = h;
}
static unsigned long *last_mhp;
/*
- * !max_hstate means we haven't parsed a hugepagesz= parameter yet,
+ * !hugetlb_max_hstate means we haven't parsed a hugepagesz= parameter yet,
* so this hugepages= parameter goes to the "default hstate".
*/
- if (!max_hstate)
+ if (!hugetlb_max_hstate)
mhp = &default_hstate_max_huge_pages;
else
mhp = &parsed_hstate->max_huge_pages;
* But we need to allocate >= MAX_ORDER hstates here early to still
* use the bootmem allocator.
*/
- if (max_hstate && parsed_hstate->order >= MAX_ORDER)
+ if (hugetlb_max_hstate && parsed_hstate->order >= MAX_ORDER)
hugetlb_hstate_alloc_pages(parsed_hstate);
last_mhp = mhp;
return 0;
}
-void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
- unsigned long end, struct page *ref_page)
+void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
+ unsigned long start, unsigned long end,
+ struct page *ref_page)
{
+ int force_flush = 0;
struct mm_struct *mm = vma->vm_mm;
unsigned long address;
pte_t *ptep;
pte_t pte;
struct page *page;
- struct page *tmp;
struct hstate *h = hstate_vma(vma);
unsigned long sz = huge_page_size(h);
- /*
- * A page gathering list, protected by per file i_mmap_mutex. The
- * lock is used to avoid list corruption from multiple unmapping
- * of the same page since we are using page->lru.
- */
- LIST_HEAD(page_list);
-
WARN_ON(!is_vm_hugetlb_page(vma));
BUG_ON(start & ~huge_page_mask(h));
BUG_ON(end & ~huge_page_mask(h));
+ tlb_start_vma(tlb, vma);
mmu_notifier_invalidate_range_start(mm, start, end);
+again:
spin_lock(&mm->page_table_lock);
for (address = start; address < end; address += sz) {
ptep = huge_pte_offset(mm, address);
}
pte = huge_ptep_get_and_clear(mm, address, ptep);
+ tlb_remove_tlb_entry(tlb, ptep, address);
if (pte_dirty(pte))
set_page_dirty(page);
- list_add(&page->lru, &page_list);
+ page_remove_rmap(page);
+ force_flush = !__tlb_remove_page(tlb, page);
+ if (force_flush)
+ break;
/* Bail out after unmapping reference page if supplied */
if (ref_page)
break;
}
- flush_tlb_range(vma, start, end);
spin_unlock(&mm->page_table_lock);
- mmu_notifier_invalidate_range_end(mm, start, end);
- list_for_each_entry_safe(page, tmp, &page_list, lru) {
- page_remove_rmap(page);
- list_del(&page->lru);
- put_page(page);
+ /*
+ * mmu_gather ran out of room to batch pages, we break out of
+ * the PTE lock to avoid doing the potential expensive TLB invalidate
+ * and page-free while holding it.
+ */
+ if (force_flush) {
+ force_flush = 0;
+ tlb_flush_mmu(tlb);
+ if (address < end && !ref_page)
+ goto again;
}
+ mmu_notifier_invalidate_range_end(mm, start, end);
+ tlb_end_vma(tlb, vma);
+}
+
+void __unmap_hugepage_range_final(struct mmu_gather *tlb,
+ struct vm_area_struct *vma, unsigned long start,
+ unsigned long end, struct page *ref_page)
+{
+ __unmap_hugepage_range(tlb, vma, start, end, ref_page);
+
+ /*
+ * Clear this flag so that x86's huge_pmd_share page_table_shareable
+ * test will fail on a vma being torn down, and not grab a page table
+ * on its way out. We're lucky that the flag has such an appropriate
+ * name, and can in fact be safely cleared here. We could clear it
+ * before the __unmap_hugepage_range above, but all that's necessary
+ * is to clear it before releasing the i_mmap_mutex. This works
+ * because in the context this is called, the VMA is about to be
+ * destroyed and the i_mmap_mutex is held.
+ */
+ vma->vm_flags &= ~VM_MAYSHARE;
}
void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
unsigned long end, struct page *ref_page)
{
- mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex);
- __unmap_hugepage_range(vma, start, end, ref_page);
- mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex);
+ struct mm_struct *mm;
+ struct mmu_gather tlb;
+
+ mm = vma->vm_mm;
+
+ tlb_gather_mmu(&tlb, mm, 0);
+ __unmap_hugepage_range(&tlb, vma, start, end, ref_page);
+ tlb_finish_mmu(&tlb, start, end);
}
/*
* from the time of fork. This would look like data corruption
*/
if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER))
- __unmap_hugepage_range(iter_vma,
- address, address + huge_page_size(h),
- page);
+ unmap_hugepage_range(iter_vma, address,
+ address + huge_page_size(h), page);
}
mutex_unlock(&mapping->i_mmap_mutex);
new_page = alloc_huge_page(vma, address, outside_reserve);
if (IS_ERR(new_page)) {
+ long err = PTR_ERR(new_page);
page_cache_release(old_page);
/*
/* Caller expects lock to be held */
spin_lock(&mm->page_table_lock);
- return -PTR_ERR(new_page);
+ if (err == -ENOMEM)
+ return VM_FAULT_OOM;
+ else
+ return VM_FAULT_SIGBUS;
}
/*
goto out;
page = alloc_huge_page(vma, address, 0);
if (IS_ERR(page)) {
- ret = -PTR_ERR(page);
+ ret = PTR_ERR(page);
+ if (ret == -ENOMEM)
+ ret = VM_FAULT_OOM;
+ else
+ ret = VM_FAULT_SIGBUS;
goto out;
}
clear_huge_page(page, address, pages_per_huge_page(h));
*/
if (unlikely(PageHWPoison(page))) {
ret = VM_FAULT_HWPOISON |
- VM_FAULT_SET_HINDEX(h - hstates);
+ VM_FAULT_SET_HINDEX(hstate_index(h));
goto backout_unlocked;
}
}
return 0;
} else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
return VM_FAULT_HWPOISON_LARGE |
- VM_FAULT_SET_HINDEX(h - hstates);
+ VM_FAULT_SET_HINDEX(hstate_index(h));
}
ptep = huge_pte_alloc(mm, address, huge_page_size(h));
}
}
spin_unlock(&mm->page_table_lock);
- mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex);
-
+ /*
+ * Must flush TLB before releasing i_mmap_mutex: x86's huge_pmd_unshare
+ * may have cleared our pud entry and done put_page on the page table:
+ * once we release i_mmap_mutex, another task can do the final put_page
+ * and that page table be reused and filled with junk.
+ */
flush_tlb_range(vma, start, end);
+ mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex);
}
int hugetlb_reserve_pages(struct inode *inode,