shmem: add huge pages support
[cascardo/linux.git] / mm / shmem.c
index 24463b6..302ae4a 100644 (file)
@@ -173,10 +173,13 @@ static inline int shmem_reacct_size(unsigned long flags,
  * shmem_getpage reports shmem_acct_block failure as -ENOSPC not -ENOMEM,
  * so that a failure on a sparse tmpfs mapping will give SIGBUS not OOM.
  */
-static inline int shmem_acct_block(unsigned long flags)
+static inline int shmem_acct_block(unsigned long flags, long pages)
 {
-       return (flags & VM_NORESERVE) ?
-               security_vm_enough_memory_mm(current->mm, VM_ACCT(PAGE_SIZE)) : 0;
+       if (!(flags & VM_NORESERVE))
+               return 0;
+
+       return security_vm_enough_memory_mm(current->mm,
+                       pages * VM_ACCT(PAGE_SIZE));
 }
 
 static inline void shmem_unacct_blocks(unsigned long flags, long pages)
@@ -249,6 +252,51 @@ static void shmem_recalc_inode(struct inode *inode)
        }
 }
 
+bool shmem_charge(struct inode *inode, long pages)
+{
+       struct shmem_inode_info *info = SHMEM_I(inode);
+       struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
+
+       if (shmem_acct_block(info->flags, pages))
+               return false;
+       spin_lock(&info->lock);
+       info->alloced += pages;
+       inode->i_blocks += pages * BLOCKS_PER_PAGE;
+       shmem_recalc_inode(inode);
+       spin_unlock(&info->lock);
+       inode->i_mapping->nrpages += pages;
+
+       if (!sbinfo->max_blocks)
+               return true;
+       if (percpu_counter_compare(&sbinfo->used_blocks,
+                               sbinfo->max_blocks - pages) > 0) {
+               inode->i_mapping->nrpages -= pages;
+               spin_lock(&info->lock);
+               info->alloced -= pages;
+               shmem_recalc_inode(inode);
+               spin_unlock(&info->lock);
+
+               return false;
+       }
+       percpu_counter_add(&sbinfo->used_blocks, pages);
+       return true;
+}
+
+void shmem_uncharge(struct inode *inode, long pages)
+{
+       struct shmem_inode_info *info = SHMEM_I(inode);
+       struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
+
+       spin_lock(&info->lock);
+       info->alloced -= pages;
+       inode->i_blocks -= pages * BLOCKS_PER_PAGE;
+       shmem_recalc_inode(inode);
+       spin_unlock(&info->lock);
+
+       if (sbinfo->max_blocks)
+               percpu_counter_sub(&sbinfo->used_blocks, pages);
+}
+
 /*
  * Replace item expected in radix tree by a new item, while holding tree lock.
  */
@@ -288,6 +336,87 @@ static bool shmem_confirm_swap(struct address_space *mapping,
        return item == swp_to_radix_entry(swap);
 }
 
+/*
+ * Definitions for "huge tmpfs": tmpfs mounted with the huge= option
+ *
+ * SHMEM_HUGE_NEVER:
+ *     disables huge pages for the mount;
+ * SHMEM_HUGE_ALWAYS:
+ *     enables huge pages for the mount;
+ * SHMEM_HUGE_WITHIN_SIZE:
+ *     only allocate huge pages if the page will be fully within i_size,
+ *     also respect fadvise()/madvise() hints;
+ * SHMEM_HUGE_ADVISE:
+ *     only allocate huge pages if requested with fadvise()/madvise();
+ */
+
+#define SHMEM_HUGE_NEVER       0
+#define SHMEM_HUGE_ALWAYS      1
+#define SHMEM_HUGE_WITHIN_SIZE 2
+#define SHMEM_HUGE_ADVISE      3
+
+/*
+ * Special values.
+ * Only can be set via /sys/kernel/mm/transparent_hugepage/shmem_enabled:
+ *
+ * SHMEM_HUGE_DENY:
+ *     disables huge on shm_mnt and all mounts, for emergency use;
+ * SHMEM_HUGE_FORCE:
+ *     enables huge on shm_mnt and all mounts, w/o needing option, for testing;
+ *
+ */
+#define SHMEM_HUGE_DENY                (-1)
+#define SHMEM_HUGE_FORCE       (-2)
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+/* ifdef here to avoid bloating shmem.o when not necessary */
+
+int shmem_huge __read_mostly;
+
+static int shmem_parse_huge(const char *str)
+{
+       if (!strcmp(str, "never"))
+               return SHMEM_HUGE_NEVER;
+       if (!strcmp(str, "always"))
+               return SHMEM_HUGE_ALWAYS;
+       if (!strcmp(str, "within_size"))
+               return SHMEM_HUGE_WITHIN_SIZE;
+       if (!strcmp(str, "advise"))
+               return SHMEM_HUGE_ADVISE;
+       if (!strcmp(str, "deny"))
+               return SHMEM_HUGE_DENY;
+       if (!strcmp(str, "force"))
+               return SHMEM_HUGE_FORCE;
+       return -EINVAL;
+}
+
+static const char *shmem_format_huge(int huge)
+{
+       switch (huge) {
+       case SHMEM_HUGE_NEVER:
+               return "never";
+       case SHMEM_HUGE_ALWAYS:
+               return "always";
+       case SHMEM_HUGE_WITHIN_SIZE:
+               return "within_size";
+       case SHMEM_HUGE_ADVISE:
+               return "advise";
+       case SHMEM_HUGE_DENY:
+               return "deny";
+       case SHMEM_HUGE_FORCE:
+               return "force";
+       default:
+               VM_BUG_ON(1);
+               return "bad_val";
+       }
+}
+
+#else /* !CONFIG_TRANSPARENT_HUGEPAGE */
+
+#define shmem_huge SHMEM_HUGE_DENY
+
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
 /*
  * Like add_to_page_cache_locked, but error if expected item has gone.
  */
@@ -295,30 +424,57 @@ static int shmem_add_to_page_cache(struct page *page,
                                   struct address_space *mapping,
                                   pgoff_t index, void *expected)
 {
-       int error;
+       int error, nr = hpage_nr_pages(page);
 
+       VM_BUG_ON_PAGE(PageTail(page), page);
+       VM_BUG_ON_PAGE(index != round_down(index, nr), page);
        VM_BUG_ON_PAGE(!PageLocked(page), page);
        VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
+       VM_BUG_ON(expected && PageTransHuge(page));
 
-       get_page(page);
+       page_ref_add(page, nr);
        page->mapping = mapping;
        page->index = index;
 
        spin_lock_irq(&mapping->tree_lock);
-       if (!expected)
+       if (PageTransHuge(page)) {
+               void __rcu **results;
+               pgoff_t idx;
+               int i;
+
+               error = 0;
+               if (radix_tree_gang_lookup_slot(&mapping->page_tree,
+                                       &results, &idx, index, 1) &&
+                               idx < index + HPAGE_PMD_NR) {
+                       error = -EEXIST;
+               }
+
+               if (!error) {
+                       for (i = 0; i < HPAGE_PMD_NR; i++) {
+                               error = radix_tree_insert(&mapping->page_tree,
+                                               index + i, page + i);
+                               VM_BUG_ON(error);
+                       }
+                       count_vm_event(THP_FILE_ALLOC);
+               }
+       } else if (!expected) {
                error = radix_tree_insert(&mapping->page_tree, index, page);
-       else
+       } else {
                error = shmem_radix_tree_replace(mapping, index, expected,
                                                                 page);
+       }
+
        if (!error) {
-               mapping->nrpages++;
-               __inc_zone_page_state(page, NR_FILE_PAGES);
-               __inc_zone_page_state(page, NR_SHMEM);
+               mapping->nrpages += nr;
+               if (PageTransHuge(page))
+                       __inc_zone_page_state(page, NR_SHMEM_THPS);
+               __mod_zone_page_state(page_zone(page), NR_FILE_PAGES, nr);
+               __mod_zone_page_state(page_zone(page), NR_SHMEM, nr);
                spin_unlock_irq(&mapping->tree_lock);
        } else {
                page->mapping = NULL;
                spin_unlock_irq(&mapping->tree_lock);
-               put_page(page);
+               page_ref_sub(page, nr);
        }
        return error;
 }
@@ -331,6 +487,8 @@ static void shmem_delete_from_page_cache(struct page *page, void *radswap)
        struct address_space *mapping = page->mapping;
        int error;
 
+       VM_BUG_ON_PAGE(PageCompound(page), page);
+
        spin_lock_irq(&mapping->tree_lock);
        error = shmem_radix_tree_replace(mapping, page->index, page, radswap);
        page->mapping = NULL;
@@ -510,10 +668,33 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
                                continue;
                        }
 
+                       VM_BUG_ON_PAGE(page_to_pgoff(page) != index, page);
+
                        if (!trylock_page(page))
                                continue;
+
+                       if (PageTransTail(page)) {
+                               /* Middle of THP: zero out the page */
+                               clear_highpage(page);
+                               unlock_page(page);
+                               continue;
+                       } else if (PageTransHuge(page)) {
+                               if (index == round_down(end, HPAGE_PMD_NR)) {
+                                       /*
+                                        * Range ends in the middle of THP:
+                                        * zero out the page
+                                        */
+                                       clear_highpage(page);
+                                       unlock_page(page);
+                                       continue;
+                               }
+                               index += HPAGE_PMD_NR - 1;
+                               i += HPAGE_PMD_NR - 1;
+                       }
+
                        if (!unfalloc || !PageUptodate(page)) {
-                               if (page->mapping == mapping) {
+                               VM_BUG_ON_PAGE(PageTail(page), page);
+                               if (page_mapping(page) == mapping) {
                                        VM_BUG_ON_PAGE(PageWriteback(page), page);
                                        truncate_inode_page(mapping, page);
                                }
@@ -589,8 +770,36 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
                        }
 
                        lock_page(page);
+
+                       if (PageTransTail(page)) {
+                               /* Middle of THP: zero out the page */
+                               clear_highpage(page);
+                               unlock_page(page);
+                               /*
+                                * Partial thp truncate due 'start' in middle
+                                * of THP: don't need to look on these pages
+                                * again on !pvec.nr restart.
+                                */
+                               if (index != round_down(end, HPAGE_PMD_NR))
+                                       start++;
+                               continue;
+                       } else if (PageTransHuge(page)) {
+                               if (index == round_down(end, HPAGE_PMD_NR)) {
+                                       /*
+                                        * Range ends in the middle of THP:
+                                        * zero out the page
+                                        */
+                                       clear_highpage(page);
+                                       unlock_page(page);
+                                       continue;
+                               }
+                               index += HPAGE_PMD_NR - 1;
+                               i += HPAGE_PMD_NR - 1;
+                       }
+
                        if (!unfalloc || !PageUptodate(page)) {
-                               if (page->mapping == mapping) {
+                               VM_BUG_ON_PAGE(PageTail(page), page);
+                               if (page_mapping(page) == mapping) {
                                        VM_BUG_ON_PAGE(PageWriteback(page), page);
                                        truncate_inode_page(mapping, page);
                                } else {
@@ -848,6 +1057,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
        swp_entry_t swap;
        pgoff_t index;
 
+       VM_BUG_ON_PAGE(PageCompound(page), page);
        BUG_ON(!PageLocked(page));
        mapping = page->mapping;
        index = page->index;
@@ -984,24 +1194,63 @@ static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
 #define vm_policy vm_private_data
 #endif
 
+static void shmem_pseudo_vma_init(struct vm_area_struct *vma,
+               struct shmem_inode_info *info, pgoff_t index)
+{
+       /* Create a pseudo vma that just contains the policy */
+       vma->vm_start = 0;
+       /* Bias interleave by inode number to distribute better across nodes */
+       vma->vm_pgoff = index + info->vfs_inode.i_ino;
+       vma->vm_ops = NULL;
+       vma->vm_policy = mpol_shared_policy_lookup(&info->policy, index);
+}
+
+static void shmem_pseudo_vma_destroy(struct vm_area_struct *vma)
+{
+       /* Drop reference taken by mpol_shared_policy_lookup() */
+       mpol_cond_put(vma->vm_policy);
+}
+
 static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp,
                        struct shmem_inode_info *info, pgoff_t index)
 {
        struct vm_area_struct pvma;
        struct page *page;
 
-       /* Create a pseudo vma that just contains the policy */
-       pvma.vm_start = 0;
-       /* Bias interleave by inode number to distribute better across nodes */
-       pvma.vm_pgoff = index + info->vfs_inode.i_ino;
-       pvma.vm_ops = NULL;
-       pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, index);
-
+       shmem_pseudo_vma_init(&pvma, info, index);
        page = swapin_readahead(swap, gfp, &pvma, 0);
+       shmem_pseudo_vma_destroy(&pvma);
 
-       /* Drop reference taken by mpol_shared_policy_lookup() */
-       mpol_cond_put(pvma.vm_policy);
+       return page;
+}
+
+static struct page *shmem_alloc_hugepage(gfp_t gfp,
+               struct shmem_inode_info *info, pgoff_t index)
+{
+       struct vm_area_struct pvma;
+       struct inode *inode = &info->vfs_inode;
+       struct address_space *mapping = inode->i_mapping;
+       pgoff_t idx, hindex = round_down(index, HPAGE_PMD_NR);
+       void __rcu **results;
+       struct page *page;
+
+       if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
+               return NULL;
+
+       rcu_read_lock();
+       if (radix_tree_gang_lookup_slot(&mapping->page_tree, &results, &idx,
+                               hindex, 1) && idx < hindex + HPAGE_PMD_NR) {
+               rcu_read_unlock();
+               return NULL;
+       }
+       rcu_read_unlock();
 
+       shmem_pseudo_vma_init(&pvma, info, hindex);
+       page = alloc_pages_vma(gfp | __GFP_COMP | __GFP_NORETRY | __GFP_NOWARN,
+                       HPAGE_PMD_ORDER, &pvma, 0, numa_node_id(), true);
+       shmem_pseudo_vma_destroy(&pvma);
+       if (page)
+               prep_transhuge_page(page);
        return page;
 }
 
@@ -1011,23 +1260,51 @@ static struct page *shmem_alloc_page(gfp_t gfp,
        struct vm_area_struct pvma;
        struct page *page;
 
-       /* Create a pseudo vma that just contains the policy */
-       pvma.vm_start = 0;
-       /* Bias interleave by inode number to distribute better across nodes */
-       pvma.vm_pgoff = index + info->vfs_inode.i_ino;
-       pvma.vm_ops = NULL;
-       pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, index);
+       shmem_pseudo_vma_init(&pvma, info, index);
+       page = alloc_page_vma(gfp, &pvma, 0);
+       shmem_pseudo_vma_destroy(&pvma);
+
+       return page;
+}
+
+static struct page *shmem_alloc_and_acct_page(gfp_t gfp,
+               struct shmem_inode_info *info, struct shmem_sb_info *sbinfo,
+               pgoff_t index, bool huge)
+{
+       struct page *page;
+       int nr;
+       int err = -ENOSPC;
+
+       if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
+               huge = false;
+       nr = huge ? HPAGE_PMD_NR : 1;
 
-       page = alloc_pages_vma(gfp, 0, &pvma, 0, numa_node_id(), false);
+       if (shmem_acct_block(info->flags, nr))
+               goto failed;
+       if (sbinfo->max_blocks) {
+               if (percpu_counter_compare(&sbinfo->used_blocks,
+                                       sbinfo->max_blocks - nr) > 0)
+                       goto unacct;
+               percpu_counter_add(&sbinfo->used_blocks, nr);
+       }
+
+       if (huge)
+               page = shmem_alloc_hugepage(gfp, info, index);
+       else
+               page = shmem_alloc_page(gfp, info, index);
        if (page) {
                __SetPageLocked(page);
                __SetPageSwapBacked(page);
+               return page;
        }
 
-       /* Drop reference taken by mpol_shared_policy_lookup() */
-       mpol_cond_put(pvma.vm_policy);
-
-       return page;
+       err = -ENOMEM;
+       if (sbinfo->max_blocks)
+               percpu_counter_add(&sbinfo->used_blocks, -nr);
+unacct:
+       shmem_unacct_blocks(info->flags, nr);
+failed:
+       return ERR_PTR(err);
 }
 
 /*
@@ -1132,6 +1409,7 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
        struct mem_cgroup *memcg;
        struct page *page;
        swp_entry_t swap;
+       pgoff_t hindex = index;
        int error;
        int once = 0;
        int alloced = 0;
@@ -1253,47 +1531,74 @@ repeat:
                swap_free(swap);
 
        } else {
-               if (shmem_acct_block(info->flags)) {
-                       error = -ENOSPC;
-                       goto failed;
-               }
-               if (sbinfo->max_blocks) {
-                       if (percpu_counter_compare(&sbinfo->used_blocks,
-                                               sbinfo->max_blocks) >= 0) {
-                               error = -ENOSPC;
-                               goto unacct;
-                       }
-                       percpu_counter_inc(&sbinfo->used_blocks);
+               /* shmem_symlink() */
+               if (mapping->a_ops != &shmem_aops)
+                       goto alloc_nohuge;
+               if (shmem_huge == SHMEM_HUGE_DENY)
+                       goto alloc_nohuge;
+               if (shmem_huge == SHMEM_HUGE_FORCE)
+                       goto alloc_huge;
+               switch (sbinfo->huge) {
+                       loff_t i_size;
+                       pgoff_t off;
+               case SHMEM_HUGE_NEVER:
+                       goto alloc_nohuge;
+               case SHMEM_HUGE_WITHIN_SIZE:
+                       off = round_up(index, HPAGE_PMD_NR);
+                       i_size = round_up(i_size_read(inode), PAGE_SIZE);
+                       if (i_size >= HPAGE_PMD_SIZE &&
+                                       i_size >> PAGE_SHIFT >= off)
+                               goto alloc_huge;
+                       /* fallthrough */
+               case SHMEM_HUGE_ADVISE:
+                       /* TODO: wire up fadvise()/madvise() */
+                       goto alloc_nohuge;
                }
 
-               page = shmem_alloc_page(gfp, info, index);
-               if (!page) {
-                       error = -ENOMEM;
-                       goto decused;
+alloc_huge:
+               page = shmem_alloc_and_acct_page(gfp, info, sbinfo,
+                               index, true);
+               if (IS_ERR(page)) {
+alloc_nohuge:          page = shmem_alloc_and_acct_page(gfp, info, sbinfo,
+                                       index, false);
+               }
+               if (IS_ERR(page)) {
+                       error = PTR_ERR(page);
+                       page = NULL;
+                       goto failed;
                }
+
+               if (PageTransHuge(page))
+                       hindex = round_down(index, HPAGE_PMD_NR);
+               else
+                       hindex = index;
+
                if (sgp == SGP_WRITE)
                        __SetPageReferenced(page);
 
                error = mem_cgroup_try_charge(page, charge_mm, gfp, &memcg,
-                               false);
+                               PageTransHuge(page));
                if (error)
-                       goto decused;
-               error = radix_tree_maybe_preload(gfp & GFP_RECLAIM_MASK);
+                       goto unacct;
+               error = radix_tree_maybe_preload_order(gfp & GFP_RECLAIM_MASK,
+                               compound_order(page));
                if (!error) {
-                       error = shmem_add_to_page_cache(page, mapping, index,
+                       error = shmem_add_to_page_cache(page, mapping, hindex,
                                                        NULL);
                        radix_tree_preload_end();
                }
                if (error) {
-                       mem_cgroup_cancel_charge(page, memcg, false);
-                       goto decused;
+                       mem_cgroup_cancel_charge(page, memcg,
+                                       PageTransHuge(page));
+                       goto unacct;
                }
-               mem_cgroup_commit_charge(page, memcg, false, false);
+               mem_cgroup_commit_charge(page, memcg, false,
+                               PageTransHuge(page));
                lru_cache_add_anon(page);
 
                spin_lock(&info->lock);
-               info->alloced++;
-               inode->i_blocks += BLOCKS_PER_PAGE;
+               info->alloced += 1 << compound_order(page);
+               inode->i_blocks += BLOCKS_PER_PAGE << compound_order(page);
                shmem_recalc_inode(inode);
                spin_unlock(&info->lock);
                alloced = true;
@@ -1309,10 +1614,15 @@ clear:
                 * but SGP_FALLOC on a page fallocated earlier must initialize
                 * it now, lest undo on failure cancel our earlier guarantee.
                 */
-               if (sgp != SGP_WRITE) {
-                       clear_highpage(page);
-                       flush_dcache_page(page);
-                       SetPageUptodate(page);
+               if (sgp != SGP_WRITE && !PageUptodate(page)) {
+                       struct page *head = compound_head(page);
+                       int i;
+
+                       for (i = 0; i < (1 << compound_order(head)); i++) {
+                               clear_highpage(head + i);
+                               flush_dcache_page(head + i);
+                       }
+                       SetPageUptodate(head);
                }
        }
 
@@ -1329,17 +1639,23 @@ clear:
                error = -EINVAL;
                goto unlock;
        }
-       *pagep = page;
+       *pagep = page + index - hindex;
        return 0;
 
        /*
         * Error recovery.
         */
-decused:
-       if (sbinfo->max_blocks)
-               percpu_counter_add(&sbinfo->used_blocks, -1);
 unacct:
-       shmem_unacct_blocks(info->flags, 1);
+       if (sbinfo->max_blocks)
+               percpu_counter_sub(&sbinfo->used_blocks,
+                               1 << compound_order(page));
+       shmem_unacct_blocks(info->flags, 1 << compound_order(page));
+
+       if (PageTransHuge(page)) {
+               unlock_page(page);
+               put_page(page);
+               goto alloc_nohuge;
+       }
 failed:
        if (swap.val && !shmem_confirm_swap(mapping, index, swap))
                error = -EEXIST;
@@ -1432,6 +1748,94 @@ static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
        return ret;
 }
 
+unsigned long shmem_get_unmapped_area(struct file *file,
+                                     unsigned long uaddr, unsigned long len,
+                                     unsigned long pgoff, unsigned long flags)
+{
+       unsigned long (*get_area)(struct file *,
+               unsigned long, unsigned long, unsigned long, unsigned long);
+       unsigned long addr;
+       unsigned long offset;
+       unsigned long inflated_len;
+       unsigned long inflated_addr;
+       unsigned long inflated_offset;
+
+       if (len > TASK_SIZE)
+               return -ENOMEM;
+
+       get_area = current->mm->get_unmapped_area;
+       addr = get_area(file, uaddr, len, pgoff, flags);
+
+       if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
+               return addr;
+       if (IS_ERR_VALUE(addr))
+               return addr;
+       if (addr & ~PAGE_MASK)
+               return addr;
+       if (addr > TASK_SIZE - len)
+               return addr;
+
+       if (shmem_huge == SHMEM_HUGE_DENY)
+               return addr;
+       if (len < HPAGE_PMD_SIZE)
+               return addr;
+       if (flags & MAP_FIXED)
+               return addr;
+       /*
+        * Our priority is to support MAP_SHARED mapped hugely;
+        * and support MAP_PRIVATE mapped hugely too, until it is COWed.
+        * But if caller specified an address hint, respect that as before.
+        */
+       if (uaddr)
+               return addr;
+
+       if (shmem_huge != SHMEM_HUGE_FORCE) {
+               struct super_block *sb;
+
+               if (file) {
+                       VM_BUG_ON(file->f_op != &shmem_file_operations);
+                       sb = file_inode(file)->i_sb;
+               } else {
+                       /*
+                        * Called directly from mm/mmap.c, or drivers/char/mem.c
+                        * for "/dev/zero", to create a shared anonymous object.
+                        */
+                       if (IS_ERR(shm_mnt))
+                               return addr;
+                       sb = shm_mnt->mnt_sb;
+               }
+               if (SHMEM_SB(sb)->huge != SHMEM_HUGE_NEVER)
+                       return addr;
+       }
+
+       offset = (pgoff << PAGE_SHIFT) & (HPAGE_PMD_SIZE-1);
+       if (offset && offset + len < 2 * HPAGE_PMD_SIZE)
+               return addr;
+       if ((addr & (HPAGE_PMD_SIZE-1)) == offset)
+               return addr;
+
+       inflated_len = len + HPAGE_PMD_SIZE - PAGE_SIZE;
+       if (inflated_len > TASK_SIZE)
+               return addr;
+       if (inflated_len < len)
+               return addr;
+
+       inflated_addr = get_area(NULL, 0, inflated_len, 0, flags);
+       if (IS_ERR_VALUE(inflated_addr))
+               return addr;
+       if (inflated_addr & ~PAGE_MASK)
+               return addr;
+
+       inflated_offset = inflated_addr & (HPAGE_PMD_SIZE-1);
+       inflated_addr += offset - inflated_offset;
+       if (inflated_offset > offset)
+               inflated_addr += HPAGE_PMD_SIZE;
+
+       if (inflated_addr > TASK_SIZE - len)
+               return addr;
+       return inflated_addr;
+}
+
 #ifdef CONFIG_NUMA
 static int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *mpol)
 {
@@ -1589,12 +1993,23 @@ shmem_write_end(struct file *file, struct address_space *mapping,
                i_size_write(inode, pos + copied);
 
        if (!PageUptodate(page)) {
+               struct page *head = compound_head(page);
+               if (PageTransCompound(page)) {
+                       int i;
+
+                       for (i = 0; i < HPAGE_PMD_NR; i++) {
+                               if (head + i == page)
+                                       continue;
+                               clear_highpage(head + i);
+                               flush_dcache_page(head + i);
+                       }
+               }
                if (copied < PAGE_SIZE) {
                        unsigned from = pos & (PAGE_SIZE - 1);
                        zero_user_segments(page, 0, from,
                                        from + copied, PAGE_SIZE);
                }
-               SetPageUptodate(page);
+               SetPageUptodate(head);
        }
        set_page_dirty(page);
        unlock_page(page);
@@ -2225,9 +2640,11 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset,
                        error = shmem_getpage(inode, index, &page, SGP_FALLOC);
                if (error) {
                        /* Remove the !PageUptodate pages we added */
-                       shmem_undo_range(inode,
-                               (loff_t)start << PAGE_SHIFT,
-                               ((loff_t)index << PAGE_SHIFT) - 1, true);
+                       if (index > start) {
+                               shmem_undo_range(inode,
+                                   (loff_t)start << PAGE_SHIFT,
+                                   ((loff_t)index << PAGE_SHIFT) - 1, true);
+                       }
                        goto undone;
                }
 
@@ -2858,11 +3275,24 @@ static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo,
                        sbinfo->gid = make_kgid(current_user_ns(), gid);
                        if (!gid_valid(sbinfo->gid))
                                goto bad_val;
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+               } else if (!strcmp(this_char, "huge")) {
+                       int huge;
+                       huge = shmem_parse_huge(value);
+                       if (huge < 0)
+                               goto bad_val;
+                       if (!has_transparent_hugepage() &&
+                                       huge != SHMEM_HUGE_NEVER)
+                               goto bad_val;
+                       sbinfo->huge = huge;
+#endif
+#ifdef CONFIG_NUMA
                } else if (!strcmp(this_char,"mpol")) {
                        mpol_put(mpol);
                        mpol = NULL;
                        if (mpol_parse_str(value, &mpol))
                                goto bad_val;
+#endif
                } else {
                        pr_err("tmpfs: Bad mount option %s\n", this_char);
                        goto error;
@@ -2908,6 +3338,7 @@ static int shmem_remount_fs(struct super_block *sb, int *flags, char *data)
                goto out;
 
        error = 0;
+       sbinfo->huge = config.huge;
        sbinfo->max_blocks  = config.max_blocks;
        sbinfo->max_inodes  = config.max_inodes;
        sbinfo->free_inodes = config.max_inodes - inodes;
@@ -2941,6 +3372,11 @@ static int shmem_show_options(struct seq_file *seq, struct dentry *root)
        if (!gid_eq(sbinfo->gid, GLOBAL_ROOT_GID))
                seq_printf(seq, ",gid=%u",
                                from_kgid_munged(&init_user_ns, sbinfo->gid));
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+       /* Rightly or wrongly, show huge mount option unmasked by shmem_huge */
+       if (sbinfo->huge)
+               seq_printf(seq, ",huge=%s", shmem_format_huge(sbinfo->huge));
+#endif
        shmem_show_mpol(seq, sbinfo->mpol);
        return 0;
 }
@@ -3159,6 +3595,7 @@ static const struct address_space_operations shmem_aops = {
 
 static const struct file_operations shmem_file_operations = {
        .mmap           = shmem_mmap,
+       .get_unmapped_area = shmem_get_unmapped_area,
 #ifdef CONFIG_TMPFS
        .llseek         = shmem_file_llseek,
        .read_iter      = shmem_file_read_iter,
@@ -3280,6 +3717,13 @@ int __init shmem_init(void)
                pr_err("Could not kern_mount tmpfs\n");
                goto out1;
        }
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+       if (has_transparent_hugepage() && shmem_huge < SHMEM_HUGE_DENY)
+               SHMEM_SB(shm_mnt->mnt_sb)->huge = shmem_huge;
+       else
+               shmem_huge = 0; /* just in case it was patched */
+#endif
        return 0;
 
 out1:
@@ -3291,6 +3735,60 @@ out3:
        return error;
 }
 
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && defined(CONFIG_SYSFS)
+static ssize_t shmem_enabled_show(struct kobject *kobj,
+               struct kobj_attribute *attr, char *buf)
+{
+       int values[] = {
+               SHMEM_HUGE_ALWAYS,
+               SHMEM_HUGE_WITHIN_SIZE,
+               SHMEM_HUGE_ADVISE,
+               SHMEM_HUGE_NEVER,
+               SHMEM_HUGE_DENY,
+               SHMEM_HUGE_FORCE,
+       };
+       int i, count;
+
+       for (i = 0, count = 0; i < ARRAY_SIZE(values); i++) {
+               const char *fmt = shmem_huge == values[i] ? "[%s] " : "%s ";
+
+               count += sprintf(buf + count, fmt,
+                               shmem_format_huge(values[i]));
+       }
+       buf[count - 1] = '\n';
+       return count;
+}
+
+static ssize_t shmem_enabled_store(struct kobject *kobj,
+               struct kobj_attribute *attr, const char *buf, size_t count)
+{
+       char tmp[16];
+       int huge;
+
+       if (count + 1 > sizeof(tmp))
+               return -EINVAL;
+       memcpy(tmp, buf, count);
+       tmp[count] = '\0';
+       if (count && tmp[count - 1] == '\n')
+               tmp[count - 1] = '\0';
+
+       huge = shmem_parse_huge(tmp);
+       if (huge == -EINVAL)
+               return -EINVAL;
+       if (!has_transparent_hugepage() &&
+                       huge != SHMEM_HUGE_NEVER && huge != SHMEM_HUGE_DENY)
+               return -EINVAL;
+
+       shmem_huge = huge;
+       if (shmem_huge < SHMEM_HUGE_DENY)
+               SHMEM_SB(shm_mnt->mnt_sb)->huge = shmem_huge;
+       return count;
+}
+
+struct kobj_attribute shmem_enabled_attr =
+       __ATTR(shmem_enabled, 0644, shmem_enabled_show, shmem_enabled_store);
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE && CONFIG_SYSFS */
+
 #else /* !CONFIG_SHMEM */
 
 /*
@@ -3333,6 +3831,15 @@ void shmem_unlock_mapping(struct address_space *mapping)
 {
 }
 
+#ifdef CONFIG_MMU
+unsigned long shmem_get_unmapped_area(struct file *file,
+                                     unsigned long addr, unsigned long len,
+                                     unsigned long pgoff, unsigned long flags)
+{
+       return current->mm->get_unmapped_area(file, addr, len, pgoff, flags);
+}
+#endif
+
 void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
 {
        truncate_inode_pages_range(inode->i_mapping, lstart, lend);