Merge branch 'linus' into x86/asm, to pick up recent fixes
author Ingo Molnar <mingo@kernel.org>
Thu, 15 Sep 2016 06:24:53 +0000 (08:24 +0200)
committer Ingo Molnar <mingo@kernel.org>
Thu, 15 Sep 2016 06:24:53 +0000 (08:24 +0200)
Signed-off-by: Ingo Molnar <mingo@kernel.org>
arch/Kconfig
arch/x86/Kconfig
arch/x86/kernel/kvmclock.c
arch/x86/kernel/paravirt.c
arch/x86/kvm/vmx.c
arch/x86/mm/kaslr.c
fs/proc/base.c
kernel/fork.c
kernel/sched/core.c

diff --combined arch/Kconfig
@@@ -336,17 -336,6 +336,6 @@@ config HAVE_ARCH_SECCOMP_FILTE
            results in the system call being skipped immediately.
          - seccomp syscall wired up
  
-         For best performance, an arch should use seccomp_phase1 and
-         seccomp_phase2 directly.  It should call seccomp_phase1 for all
-         syscalls if TIF_SECCOMP is set, but seccomp_phase1 does not
-         need to be called from a ptrace-safe context.  It must then
-         call seccomp_phase2 if seccomp_phase1 returns anything other
-         than SECCOMP_PHASE1_OK or SECCOMP_PHASE1_SKIP.
-         As an additional optimization, an arch may provide seccomp_data
-         directly to seccomp_phase1; this avoids multiple calls
-         to the syscall_xyz helpers for every syscall.
  config SECCOMP_FILTER
        def_bool y
        depends on HAVE_ARCH_SECCOMP_FILTER && SECCOMP && NET
@@@ -707,38 -696,4 +696,38 @@@ config ARCH_NO_COHERENT_DMA_MMA
  config CPU_NO_EFFICIENT_FFS
        def_bool n
  
 +config HAVE_ARCH_VMAP_STACK
 +      def_bool n
 +      help
 +        An arch should select this symbol if it can support kernel stacks
 +        in vmalloc space.  This means:
 +
 +        - vmalloc space must be large enough to hold many kernel stacks.
 +          This may rule out many 32-bit architectures.
 +
 +        - Stacks in vmalloc space need to work reliably.  For example, if
 +          vmap page tables are created on demand, either this mechanism
 +          needs to work while the stack points to a virtual address with
 +          unpopulated page tables or arch code (switch_to() and switch_mm(),
 +          most likely) needs to ensure that the stack's page table entries
 +          are populated before running on a possibly unpopulated stack.
 +
 +        - If the stack overflows into a guard page, something reasonable
 +          should happen.  The definition of "reasonable" is flexible, but
 +          instantly rebooting without logging anything would be unfriendly.
 +
 +config VMAP_STACK
 +      default y
 +      bool "Use a virtually-mapped stack"
 +      depends on HAVE_ARCH_VMAP_STACK && !KASAN
 +      ---help---
 +        Enable this if you want to use virtually-mapped kernel stacks
 +        with guard pages.  This causes kernel stack overflows to be
 +        caught immediately rather than causing difficult-to-diagnose
 +        corruption.
 +
 +        This is presently incompatible with KASAN because KASAN expects
 +        the stack to map directly to the KASAN shadow map using a formula
 +        that is incorrect if the stack is in vmalloc space.
 +
  source "kernel/gcov/Kconfig"
diff --combined arch/x86/Kconfig
@@@ -24,7 -24,6 +24,6 @@@ config X8
        select ARCH_DISCARD_MEMBLOCK
        select ARCH_HAS_ACPI_TABLE_UPGRADE if ACPI
        select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE
-       select ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS
        select ARCH_HAS_DEVMEM_IS_ALLOWED
        select ARCH_HAS_ELF_RANDOMIZE
        select ARCH_HAS_FAST_MULTIPLIER
@@@ -94,7 -93,6 +93,7 @@@
        select HAVE_ARCH_TRANSPARENT_HUGEPAGE
        select HAVE_ARCH_WITHIN_STACK_FRAMES
        select HAVE_EBPF_JIT                    if X86_64
 +      select HAVE_ARCH_VMAP_STACK             if X86_64
        select HAVE_CC_STACKPROTECTOR
        select HAVE_CMPXCHG_DOUBLE
        select HAVE_CMPXCHG_LOCAL
        select HAVE_EXIT_THREAD
        select HAVE_FENTRY                      if X86_64
        select HAVE_FTRACE_MCOUNT_RECORD
 -      select HAVE_FUNCTION_GRAPH_FP_TEST
        select HAVE_FUNCTION_GRAPH_TRACER
        select HAVE_FUNCTION_TRACER
        select HAVE_GCC_PLUGINS
@@@ -29,7 -29,7 +29,7 @@@
  #include <asm/x86_init.h>
  #include <asm/reboot.h>
  
 -static int kvmclock = 1;
 +static int kvmclock __ro_after_init = 1;
  static int msr_kvm_system_time = MSR_KVM_SYSTEM_TIME;
  static int msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK;
  static cycle_t kvm_sched_clock_offset;
@@@ -289,6 -289,7 +289,7 @@@ void __init kvmclock_init(void
        put_cpu();
  
        x86_platform.calibrate_tsc = kvm_get_tsc_khz;
+       x86_platform.calibrate_cpu = kvm_get_tsc_khz;
        x86_platform.get_wallclock = kvm_get_wallclock;
        x86_platform.set_wallclock = kvm_set_wallclock;
  #ifdef CONFIG_X86_LOCAL_APIC
@@@ -56,12 -56,12 +56,12 @@@ asm (".pushsection .entry.text, \"ax\"\
       ".popsection");
  
  /* identity function, which can be inlined */
- u32 _paravirt_ident_32(u32 x)
+ u32 notrace _paravirt_ident_32(u32 x)
  {
        return x;
  }
  
- u64 _paravirt_ident_64(u64 x)
+ u64 notrace _paravirt_ident_64(u64 x)
  {
        return x;
  }
@@@ -389,7 -389,7 +389,7 @@@ NOKPROBE_SYMBOL(native_load_idt)
  #define PTE_IDENT     __PV_IS_CALLEE_SAVE(_paravirt_ident_64)
  #endif
  
 -struct pv_mmu_ops pv_mmu_ops = {
 +struct pv_mmu_ops pv_mmu_ops __ro_after_init = {
  
        .read_cr2 = native_read_cr2,
        .write_cr2 = native_write_cr2,
diff --combined arch/x86/kvm/vmx.c
@@@ -422,6 -422,7 +422,7 @@@ struct nested_vmx 
        struct list_head vmcs02_pool;
        int vmcs02_num;
        u64 vmcs01_tsc_offset;
+       bool change_vmcs01_virtual_x2apic_mode;
        /* L2 must run next, and mustn't decide to exit to L1. */
        bool nested_run_pending;
        /*
        bool pi_pending;
        u16 posted_intr_nv;
  
+       unsigned long *msr_bitmap;
        struct hrtimer preemption_timer;
        bool preemption_timer_expired;
  
@@@ -924,7 -927,6 +927,6 @@@ static unsigned long *vmx_msr_bitmap_le
  static unsigned long *vmx_msr_bitmap_longmode;
  static unsigned long *vmx_msr_bitmap_legacy_x2apic;
  static unsigned long *vmx_msr_bitmap_longmode_x2apic;
- static unsigned long *vmx_msr_bitmap_nested;
  static unsigned long *vmx_vmread_bitmap;
  static unsigned long *vmx_vmwrite_bitmap;
  
@@@ -2198,6 -2200,12 +2200,12 @@@ static void vmx_vcpu_pi_load(struct kvm
                        new.control) != old.control);
  }
  
+ static void decache_tsc_multiplier(struct vcpu_vmx *vmx)
+ {
+       vmx->current_tsc_ratio = vmx->vcpu.arch.tsc_scaling_ratio;
+       vmcs_write64(TSC_MULTIPLIER, vmx->current_tsc_ratio);
+ }
  /*
   * Switches to specified vcpu, until a matching vcpu_put(), but assumes
   * vcpu mutex is already taken.
@@@ -2256,10 -2264,8 +2264,8 @@@ static void vmx_vcpu_load(struct kvm_vc
  
        /* Setup TSC multiplier */
        if (kvm_has_tsc_control &&
-           vmx->current_tsc_ratio != vcpu->arch.tsc_scaling_ratio) {
-               vmx->current_tsc_ratio = vcpu->arch.tsc_scaling_ratio;
-               vmcs_write64(TSC_MULTIPLIER, vmx->current_tsc_ratio);
-       }
+           vmx->current_tsc_ratio != vcpu->arch.tsc_scaling_ratio)
+               decache_tsc_multiplier(vmx);
  
        vmx_vcpu_pi_load(vcpu, cpu);
        vmx->host_pkru = read_pkru();
@@@ -2508,7 -2514,7 +2514,7 @@@ static void vmx_set_msr_bitmap(struct k
        unsigned long *msr_bitmap;
  
        if (is_guest_mode(vcpu))
-               msr_bitmap = vmx_msr_bitmap_nested;
+               msr_bitmap = to_vmx(vcpu)->nested.msr_bitmap;
        else if (cpu_has_secondary_exec_ctrls() &&
                 (vmcs_read32(SECONDARY_VM_EXEC_CONTROL) &
                  SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) {
@@@ -6363,13 -6369,6 +6369,6 @@@ static __init int hardware_setup(void
        if (!vmx_msr_bitmap_longmode_x2apic)
                goto out4;
  
-       if (nested) {
-               vmx_msr_bitmap_nested =
-                       (unsigned long *)__get_free_page(GFP_KERNEL);
-               if (!vmx_msr_bitmap_nested)
-                       goto out5;
-       }
        vmx_vmread_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL);
        if (!vmx_vmread_bitmap)
                goto out6;
  
        memset(vmx_msr_bitmap_legacy, 0xff, PAGE_SIZE);
        memset(vmx_msr_bitmap_longmode, 0xff, PAGE_SIZE);
-       if (nested)
-               memset(vmx_msr_bitmap_nested, 0xff, PAGE_SIZE);
  
        if (setup_vmcs_config(&vmcs_config) < 0) {
                r = -EIO;
@@@ -6529,9 -6526,6 +6526,6 @@@ out8
  out7:
        free_page((unsigned long)vmx_vmread_bitmap);
  out6:
-       if (nested)
-               free_page((unsigned long)vmx_msr_bitmap_nested);
- out5:
        free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic);
  out4:
        free_page((unsigned long)vmx_msr_bitmap_longmode);
@@@ -6557,8 -6551,6 +6551,6 @@@ static __exit void hardware_unsetup(voi
        free_page((unsigned long)vmx_io_bitmap_a);
        free_page((unsigned long)vmx_vmwrite_bitmap);
        free_page((unsigned long)vmx_vmread_bitmap);
-       if (nested)
-               free_page((unsigned long)vmx_msr_bitmap_nested);
  
        free_kvm_area();
  }
@@@ -6995,16 -6987,21 +6987,21 @@@ static int handle_vmon(struct kvm_vcpu 
                return 1;
        }
  
+       if (cpu_has_vmx_msr_bitmap()) {
+               vmx->nested.msr_bitmap =
+                               (unsigned long *)__get_free_page(GFP_KERNEL);
+               if (!vmx->nested.msr_bitmap)
+                       goto out_msr_bitmap;
+       }
        vmx->nested.cached_vmcs12 = kmalloc(VMCS12_SIZE, GFP_KERNEL);
        if (!vmx->nested.cached_vmcs12)
-               return -ENOMEM;
+               goto out_cached_vmcs12;
  
        if (enable_shadow_vmcs) {
                shadow_vmcs = alloc_vmcs();
-               if (!shadow_vmcs) {
-                       kfree(vmx->nested.cached_vmcs12);
-                       return -ENOMEM;
-               }
+               if (!shadow_vmcs)
+                       goto out_shadow_vmcs;
                /* mark vmcs as shadow */
                shadow_vmcs->revision_id |= (1u << 31);
                /* init shadow vmcs */
        skip_emulated_instruction(vcpu);
        nested_vmx_succeed(vcpu);
        return 1;
+ out_shadow_vmcs:
+       kfree(vmx->nested.cached_vmcs12);
+ out_cached_vmcs12:
+       free_page((unsigned long)vmx->nested.msr_bitmap);
+ out_msr_bitmap:
+       return -ENOMEM;
  }
  
  /*
@@@ -7098,6 -7104,10 +7104,10 @@@ static void free_nested(struct vcpu_vm
        vmx->nested.vmxon = false;
        free_vpid(vmx->nested.vpid02);
        nested_release_vmcs12(vmx);
+       if (vmx->nested.msr_bitmap) {
+               free_page((unsigned long)vmx->nested.msr_bitmap);
+               vmx->nested.msr_bitmap = NULL;
+       }
        if (enable_shadow_vmcs)
                free_vmcs(vmx->nested.current_shadow_vmcs);
        kfree(vmx->nested.cached_vmcs12);
@@@ -8419,6 -8429,12 +8429,12 @@@ static void vmx_set_virtual_x2apic_mode
  {
        u32 sec_exec_control;
  
+       /* Postpone execution until vmcs01 is the current VMCS. */
+       if (is_guest_mode(vcpu)) {
+               to_vmx(vcpu)->nested.change_vmcs01_virtual_x2apic_mode = true;
+               return;
+       }
        /*
   * There is no point in enabling virtualize x2apic without enabling
         * apicv
@@@ -9472,8 -9488,10 +9488,10 @@@ static inline bool nested_vmx_merge_msr
  {
        int msr;
        struct page *page;
-       unsigned long *msr_bitmap;
+       unsigned long *msr_bitmap_l1;
+       unsigned long *msr_bitmap_l0 = to_vmx(vcpu)->nested.msr_bitmap;
  
+       /* This shortcut is ok because we support only x2APIC MSRs so far. */
        if (!nested_cpu_has_virt_x2apic_mode(vmcs12))
                return false;
  
                WARN_ON(1);
                return false;
        }
-       msr_bitmap = (unsigned long *)kmap(page);
-       if (!msr_bitmap) {
+       msr_bitmap_l1 = (unsigned long *)kmap(page);
+       if (!msr_bitmap_l1) {
                nested_release_page_clean(page);
                WARN_ON(1);
                return false;
        }
  
+       memset(msr_bitmap_l0, 0xff, PAGE_SIZE);
        if (nested_cpu_has_virt_x2apic_mode(vmcs12)) {
                if (nested_cpu_has_apic_reg_virt(vmcs12))
                        for (msr = 0x800; msr <= 0x8ff; msr++)
                                nested_vmx_disable_intercept_for_msr(
-                                       msr_bitmap,
-                                       vmx_msr_bitmap_nested,
+                                       msr_bitmap_l1, msr_bitmap_l0,
                                        msr, MSR_TYPE_R);
-               /* TPR is allowed */
-               nested_vmx_disable_intercept_for_msr(msr_bitmap,
-                               vmx_msr_bitmap_nested,
+               nested_vmx_disable_intercept_for_msr(
+                               msr_bitmap_l1, msr_bitmap_l0,
                                APIC_BASE_MSR + (APIC_TASKPRI >> 4),
                                MSR_TYPE_R | MSR_TYPE_W);
                if (nested_cpu_has_vid(vmcs12)) {
-                       /* EOI and self-IPI are allowed */
                        nested_vmx_disable_intercept_for_msr(
-                               msr_bitmap,
-                               vmx_msr_bitmap_nested,
+                               msr_bitmap_l1, msr_bitmap_l0,
                                APIC_BASE_MSR + (APIC_EOI >> 4),
                                MSR_TYPE_W);
                        nested_vmx_disable_intercept_for_msr(
-                               msr_bitmap,
-                               vmx_msr_bitmap_nested,
+                               msr_bitmap_l1, msr_bitmap_l0,
                                APIC_BASE_MSR + (APIC_SELF_IPI >> 4),
                                MSR_TYPE_W);
                }
-       } else {
-               /*
-                * Enable reading intercept of all the x2apic
-                * MSRs. We should not rely on vmcs12 to do any
-                * optimizations here, it may have been modified
-                * by L1.
-                */
-               for (msr = 0x800; msr <= 0x8ff; msr++)
-                       __vmx_enable_intercept_for_msr(
-                               vmx_msr_bitmap_nested,
-                               msr,
-                               MSR_TYPE_R);
-               __vmx_enable_intercept_for_msr(
-                               vmx_msr_bitmap_nested,
-                               APIC_BASE_MSR + (APIC_TASKPRI >> 4),
-                               MSR_TYPE_W);
-               __vmx_enable_intercept_for_msr(
-                               vmx_msr_bitmap_nested,
-                               APIC_BASE_MSR + (APIC_EOI >> 4),
-                               MSR_TYPE_W);
-               __vmx_enable_intercept_for_msr(
-                               vmx_msr_bitmap_nested,
-                               APIC_BASE_MSR + (APIC_SELF_IPI >> 4),
-                               MSR_TYPE_W);
        }
        kunmap(page);
        nested_release_page_clean(page);
@@@ -9957,10 -9949,10 +9949,10 @@@ static void prepare_vmcs02(struct kvm_v
        }
  
        if (cpu_has_vmx_msr_bitmap() &&
-           exec_control & CPU_BASED_USE_MSR_BITMAPS) {
-               nested_vmx_merge_msr_bitmap(vcpu, vmcs12);
-               /* MSR_BITMAP will be set by following vmx_set_efer. */
-       } else
+           exec_control & CPU_BASED_USE_MSR_BITMAPS &&
+           nested_vmx_merge_msr_bitmap(vcpu, vmcs12))
+               ; /* MSR_BITMAP will be set by following vmx_set_efer. */
+       else
                exec_control &= ~CPU_BASED_USE_MSR_BITMAPS;
  
        /*
                        vmx->nested.vmcs01_tsc_offset + vmcs12->tsc_offset);
        else
                vmcs_write64(TSC_OFFSET, vmx->nested.vmcs01_tsc_offset);
+       if (kvm_has_tsc_control)
+               decache_tsc_multiplier(vmx);
  
        if (enable_vpid) {
                /*
@@@ -10767,6 -10761,14 +10761,14 @@@ static void nested_vmx_vmexit(struct kv
        else
                vmcs_set_bits(PIN_BASED_VM_EXEC_CONTROL,
                              PIN_BASED_VMX_PREEMPTION_TIMER);
+       if (kvm_has_tsc_control)
+               decache_tsc_multiplier(vmx);
+       if (vmx->nested.change_vmcs01_virtual_x2apic_mode) {
+               vmx->nested.change_vmcs01_virtual_x2apic_mode = false;
+               vmx_set_virtual_x2apic_mode(vcpu,
+                               vcpu->arch.apic_base & X2APIC_ENABLE);
+       }
  
        /* This is needed for same reason as it was needed in prepare_vmcs02 */
        vmx->host_rsp = 0;
@@@ -11175,7 -11177,7 +11177,7 @@@ static void vmx_setup_mce(struct kvm_vc
                        ~FEATURE_CONTROL_LMCE;
  }
  
 -static struct kvm_x86_ops vmx_x86_ops = {
 +static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
        .cpu_has_kvm_support = cpu_has_kvm_support,
        .disabled_by_bios = vmx_disabled_by_bios,
        .hardware_setup = hardware_setup,
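
The handle_vmon() change above replaces open-coded error handling with the
usual cascading-label unwind: each allocation gets a label, and a failure
jumps to the label that frees everything allocated before it, in reverse
order.  A minimal user-space sketch of the same pattern (the struct, field
and function names are illustrative, not kernel API):

    #include <stdlib.h>

    struct ctx {
            void *msr_bitmap;
            void *cached_vmcs;
    };

    static int ctx_init(struct ctx *c)
    {
            c->msr_bitmap = malloc(4096);
            if (!c->msr_bitmap)
                    goto out_msr_bitmap;

            c->cached_vmcs = malloc(4096);
            if (!c->cached_vmcs)
                    goto out_cached_vmcs;

            return 0;

    out_cached_vmcs:
            free(c->msr_bitmap);
    out_msr_bitmap:
            return -1;      /* handle_vmon() returns -ENOMEM here */
    }

The benefit over the old code is that each new allocation only adds one label
and one goto, instead of repeating the cleanup of every earlier allocation at
each failure site.
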
diff --combined arch/x86/mm/kaslr.c
   * You need to add an if/def entry if you introduce a new memory region
   * compatible with KASLR. Your entry must be in logical order with memory
   * layout. For example, ESPFIX is before EFI because its virtual address is
 - * before. You also need to add a BUILD_BUG_ON in kernel_randomize_memory to
 + * before. You also need to add a BUILD_BUG_ON() in kernel_randomize_memory() to
   * ensure that this order is correct and won't be changed.
   */
  static const unsigned long vaddr_start = __PAGE_OFFSET_BASE;
 -static const unsigned long vaddr_end = VMEMMAP_START;
 +
 +#if defined(CONFIG_X86_ESPFIX64)
 +static const unsigned long vaddr_end = ESPFIX_BASE_ADDR;
 +#elif defined(CONFIG_EFI)
 +static const unsigned long vaddr_end = EFI_VA_START;
 +#else
 +static const unsigned long vaddr_end = __START_KERNEL_map;
 +#endif
  
  /* Default values */
  unsigned long page_offset_base = __PAGE_OFFSET_BASE;
  EXPORT_SYMBOL(page_offset_base);
  unsigned long vmalloc_base = __VMALLOC_BASE;
  EXPORT_SYMBOL(vmalloc_base);
 +unsigned long vmemmap_base = __VMEMMAP_BASE;
 +EXPORT_SYMBOL(vmemmap_base);
  
  /*
   * Memory regions randomized by KASLR (except modules that use a separate logic
@@@ -72,7 -63,6 +72,7 @@@ static __initdata struct kaslr_memory_r
  } kaslr_regions[] = {
        { &page_offset_base, 64/* Maximum */ },
        { &vmalloc_base, VMALLOC_SIZE_TB },
 +      { &vmemmap_base, 1 },
  };
  
  /* Get size in bytes used by the memory region */
@@@ -87,7 -77,7 +87,7 @@@ static inline unsigned long get_padding
   */
  static inline bool kaslr_memory_enabled(void)
  {
-       return kaslr_enabled() && !config_enabled(CONFIG_KASAN);
+       return kaslr_enabled() && !IS_ENABLED(CONFIG_KASAN);
  }
  
  /* Initialize base and padding for each memory region randomized with KASLR */
@@@ -99,18 -89,6 +99,18 @@@ void __init kernel_randomize_memory(voi
        struct rnd_state rand_state;
        unsigned long remain_entropy;
  
 +      /*
 +       * All these BUILD_BUG_ON checks ensure that the memory layout is
 +       * consistent with the vaddr_start/vaddr_end variables.
 +       */
 +      BUILD_BUG_ON(vaddr_start >= vaddr_end);
 +      BUILD_BUG_ON(config_enabled(CONFIG_X86_ESPFIX64) &&
 +                   vaddr_end >= EFI_VA_START);
 +      BUILD_BUG_ON((config_enabled(CONFIG_X86_ESPFIX64) ||
 +                    config_enabled(CONFIG_EFI)) &&
 +                   vaddr_end >= __START_KERNEL_map);
 +      BUILD_BUG_ON(vaddr_end > __START_KERNEL_map);
 +
        if (!kaslr_memory_enabled())
                return;
  
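
The comment at the top of the kaslr.c hunk spells out the recipe this diff
follows for vmemmap_base: a new randomized region needs a base variable with
its compile-time default, an entry in kaslr_regions[] in memory-layout order,
and a BUILD_BUG_ON() in kernel_randomize_memory() pinning that order.  A
hypothetical follow-up region would look roughly like this; example_base and
__EXAMPLE_BASE are invented names, purely for illustration:

    /* 1) Base variable with its non-randomized default, like vmemmap_base: */
    unsigned long example_base = __EXAMPLE_BASE;
    EXPORT_SYMBOL(example_base);

    /*
     * 2) Entry appended to kaslr_regions[], keeping memory-layout order
     *    (the second field appears to be a size budget in TB):
     *
     *        { &example_base, 1 },
     *
     * 3) Ordering check added to kernel_randomize_memory():
     *
     *        BUILD_BUG_ON(__EXAMPLE_BASE <= __VMEMMAP_BASE);
     */
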
diff --combined fs/proc/base.c
@@@ -483,7 -483,7 +483,7 @@@ static int proc_pid_stack(struct seq_fi
                save_stack_trace_tsk(task, &trace);
  
                for (i = 0; i < trace.nr_entries; i++) {
 -                      seq_printf(m, "[<%pK>] %pS\n",
 +                      seq_printf(m, "[<%pK>] %pB\n",
                                   (void *)entries[i], (void *)entries[i]);
                }
                unlock_trace(task);
@@@ -1556,18 -1556,13 +1556,13 @@@ static const struct file_operations pro
  static int proc_exe_link(struct dentry *dentry, struct path *exe_path)
  {
        struct task_struct *task;
-       struct mm_struct *mm;
        struct file *exe_file;
  
        task = get_proc_task(d_inode(dentry));
        if (!task)
                return -ENOENT;
-       mm = get_task_mm(task);
+       exe_file = get_task_exe_file(task);
        put_task_struct(task);
-       if (!mm)
-               return -ENOENT;
-       exe_file = get_mm_exe_file(mm);
-       mmput(mm);
        if (exe_file) {
                *exe_path = exe_file->f_path;
                path_get(&exe_file->f_path);
diff --combined kernel/fork.c
@@@ -158,39 -158,19 +158,39 @@@ void __weak arch_release_thread_stack(u
   * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a
   * kmemcache based allocator.
   */
 -# if THREAD_SIZE >= PAGE_SIZE
 -static unsigned long *alloc_thread_stack_node(struct task_struct *tsk,
 -                                                int node)
 +# if THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK)
 +static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
  {
 +#ifdef CONFIG_VMAP_STACK
 +      void *stack = __vmalloc_node_range(THREAD_SIZE, THREAD_SIZE,
 +                                         VMALLOC_START, VMALLOC_END,
 +                                         THREADINFO_GFP | __GFP_HIGHMEM,
 +                                         PAGE_KERNEL,
 +                                         0, node,
 +                                         __builtin_return_address(0));
 +
 +      /*
 +       * We can't call find_vm_area() in interrupt context, and
 +       * free_thread_stack() can be called in interrupt context,
 +       * so cache the vm_struct.
 +       */
 +      if (stack)
 +              tsk->stack_vm_area = find_vm_area(stack);
 +      return stack;
 +#else
        struct page *page = alloc_pages_node(node, THREADINFO_GFP,
                                             THREAD_SIZE_ORDER);
  
        return page ? page_address(page) : NULL;
 +#endif
  }
  
 -static inline void free_thread_stack(unsigned long *stack)
 +static inline void free_thread_stack(struct task_struct *tsk)
  {
 -      __free_pages(virt_to_page(stack), THREAD_SIZE_ORDER);
 +      if (task_stack_vm_area(tsk))
 +              vfree(tsk->stack);
 +      else
 +              __free_pages(virt_to_page(tsk->stack), THREAD_SIZE_ORDER);
  }
  # else
  static struct kmem_cache *thread_stack_cache;
@@@ -201,9 -181,9 +201,9 @@@ static unsigned long *alloc_thread_stac
        return kmem_cache_alloc_node(thread_stack_cache, THREADINFO_GFP, node);
  }
  
 -static void free_thread_stack(unsigned long *stack)
 +static void free_thread_stack(struct task_struct *tsk)
  {
 -      kmem_cache_free(thread_stack_cache, stack);
 +      kmem_cache_free(thread_stack_cache, tsk->stack);
  }
  
  void thread_stack_cache_init(void)
@@@ -233,47 -213,24 +233,47 @@@ struct kmem_cache *vm_area_cachep
  /* SLAB cache for mm_struct structures (tsk->mm) */
  static struct kmem_cache *mm_cachep;
  
 -static void account_kernel_stack(unsigned long *stack, int account)
 +static void account_kernel_stack(struct task_struct *tsk, int account)
  {
 -      /* All stack pages are in the same zone and belong to the same memcg. */
 -      struct page *first_page = virt_to_page(stack);
 +      void *stack = task_stack_page(tsk);
 +      struct vm_struct *vm = task_stack_vm_area(tsk);
 +
 +      BUILD_BUG_ON(IS_ENABLED(CONFIG_VMAP_STACK) && PAGE_SIZE % 1024 != 0);
 +
 +      if (vm) {
 +              int i;
 +
 +              BUG_ON(vm->nr_pages != THREAD_SIZE / PAGE_SIZE);
 +
 +              for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) {
 +                      mod_zone_page_state(page_zone(vm->pages[i]),
 +                                          NR_KERNEL_STACK_KB,
 +                                          PAGE_SIZE / 1024 * account);
 +              }
  
 -      mod_zone_page_state(page_zone(first_page), NR_KERNEL_STACK_KB,
 -                          THREAD_SIZE / 1024 * account);
 +              /* All stack pages belong to the same memcg. */
 +              memcg_kmem_update_page_stat(vm->pages[0], MEMCG_KERNEL_STACK_KB,
 +                                          account * (THREAD_SIZE / 1024));
 +      } else {
 +              /*
 +               * All stack pages are in the same zone and belong to the
 +               * same memcg.
 +               */
 +              struct page *first_page = virt_to_page(stack);
  
 -      memcg_kmem_update_page_stat(
 -              first_page, MEMCG_KERNEL_STACK_KB,
 -              account * (THREAD_SIZE / 1024));
 +              mod_zone_page_state(page_zone(first_page), NR_KERNEL_STACK_KB,
 +                                  THREAD_SIZE / 1024 * account);
 +
 +              memcg_kmem_update_page_stat(first_page, MEMCG_KERNEL_STACK_KB,
 +                                          account * (THREAD_SIZE / 1024));
 +      }
  }
  
  void free_task(struct task_struct *tsk)
  {
 -      account_kernel_stack(tsk->stack, -1);
 +      account_kernel_stack(tsk, -1);
        arch_release_thread_stack(tsk->stack);
 -      free_thread_stack(tsk->stack);
 +      free_thread_stack(tsk);
        rt_mutex_debug_task_free(tsk);
        ftrace_graph_exit_task(tsk);
        put_seccomp_filter(tsk);
@@@ -385,7 -342,6 +385,7 @@@ static struct task_struct *dup_task_str
  {
        struct task_struct *tsk;
        unsigned long *stack;
 +      struct vm_struct *stack_vm_area;
        int err;
  
        if (node == NUMA_NO_NODE)
        if (!stack)
                goto free_tsk;
  
 +      stack_vm_area = task_stack_vm_area(tsk);
 +
        err = arch_dup_task_struct(tsk, orig);
 +
 +      /*
 +       * arch_dup_task_struct() clobbers the stack-related fields.  Make
 +       * sure they're properly initialized before using any stack-related
 +       * functions again.
 +       */
 +      tsk->stack = stack;
 +#ifdef CONFIG_VMAP_STACK
 +      tsk->stack_vm_area = stack_vm_area;
 +#endif
 +
        if (err)
                goto free_stack;
  
 -      tsk->stack = stack;
  #ifdef CONFIG_SECCOMP
        /*
         * We must handle setting up seccomp filters once we're under
        tsk->task_frag.page = NULL;
        tsk->wake_q.next = NULL;
  
 -      account_kernel_stack(stack, 1);
 +      account_kernel_stack(tsk, 1);
  
        kcov_task_init(tsk);
  
        return tsk;
  
  free_stack:
 -      free_thread_stack(stack);
 +      free_thread_stack(tsk);
  free_tsk:
        free_task_struct(tsk);
        return NULL;
@@@ -854,6 -798,29 +854,29 @@@ struct file *get_mm_exe_file(struct mm_
  }
  EXPORT_SYMBOL(get_mm_exe_file);
  
+ /**
+  * get_task_exe_file - acquire a reference to the task's executable file
+  *
+  * Returns %NULL if task's mm (if any) has no associated executable file or
+  * this is a kernel thread with borrowed mm (see the comment above get_task_mm).
+  * User must release file via fput().
+  */
+ struct file *get_task_exe_file(struct task_struct *task)
+ {
+       struct file *exe_file = NULL;
+       struct mm_struct *mm;
+       task_lock(task);
+       mm = task->mm;
+       if (mm) {
+               if (!(task->flags & PF_KTHREAD))
+                       exe_file = get_mm_exe_file(mm);
+       }
+       task_unlock(task);
+       return exe_file;
+ }
+ EXPORT_SYMBOL(get_task_exe_file);
  /**
   * get_task_mm - acquire a reference to the task's mm
   *
@@@ -969,14 -936,12 +992,12 @@@ void mm_release(struct task_struct *tsk
        deactivate_mm(tsk, mm);
  
        /*
-        * If we're exiting normally, clear a user-space tid field if
-        * requested.  We leave this alone when dying by signal, to leave
-        * the value intact in a core dump, and to save the unnecessary
-        * trouble, say, a killed vfork parent shouldn't touch this mm.
-        * Userland only wants this done for a sys_exit.
+        * Signal userspace if we're not exiting with a core dump
+        * because we want to leave the value intact for debugging
+        * purposes.
         */
        if (tsk->clear_child_tid) {
-               if (!(tsk->flags & PF_SIGNALED) &&
+               if (!(tsk->signal->flags & SIGNAL_GROUP_COREDUMP) &&
                    atomic_read(&mm->mm_users) > 1) {
                        /*
                         * We don't check the error code - if userspace has
@@@ -1460,7 -1425,6 +1481,6 @@@ static struct task_struct *copy_process
        p->real_start_time = ktime_get_boot_ns();
        p->io_context = NULL;
        p->audit_context = NULL;
-       threadgroup_change_begin(current);
        cgroup_fork(p);
  #ifdef CONFIG_NUMA
        p->mempolicy = mpol_dup(p->mempolicy);
        INIT_LIST_HEAD(&p->thread_group);
        p->task_works = NULL;
  
+       threadgroup_change_begin(current);
        /*
         * Ensure that the cgroup subsystem policies allow the new process to be
        * forked. It should be noted that the new process's css_set can be changed
  bad_fork_cancel_cgroup:
        cgroup_cancel_fork(p);
  bad_fork_free_pid:
+       threadgroup_change_end(current);
        if (pid != &init_struct_pid)
                free_pid(pid);
  bad_fork_cleanup_thread:
@@@ -1744,7 -1710,6 +1766,6 @@@ bad_fork_cleanup_policy
        mpol_put(p->mempolicy);
  bad_fork_cleanup_threadgroup_lock:
  #endif
-       threadgroup_change_end(current);
        delayacct_tsk_free(p);
  bad_fork_cleanup_count:
        atomic_dec(&p->cred->user->processes);
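
The get_task_exe_file() helper added above lets callers reach the exe file
without going through get_task_mm(), and its kernel-doc pins the calling
convention: a non-NULL return carries a file reference the caller must drop
with fput().  A small illustrative caller (the wrapper function name is made
up):

    static void example_use_exe_file(struct task_struct *task)
    {
            struct file *exe_file = get_task_exe_file(task);

            if (!exe_file)
                    return;         /* no mm, or a kernel thread with borrowed mm */

            /* ... inspect exe_file->f_path here ... */

            fput(exe_file);         /* drop the reference the helper took */
    }

Compared with the old get_task_mm()/get_mm_exe_file()/mmput() sequence in
proc_exe_link(), this avoids bumping and dropping an mm reference just to
peek at the exe file.
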
diff --combined kernel/sched/core.c
@@@ -2016,6 -2016,28 +2016,28 @@@ try_to_wake_up(struct task_struct *p, u
        success = 1; /* we're going to change ->state */
        cpu = task_cpu(p);
  
+       /*
+        * Ensure we load p->on_rq _after_ p->state, otherwise it would
+        * be possible to, falsely, observe p->on_rq == 0 and get stuck
+        * in smp_cond_load_acquire() below.
+        *
+        * sched_ttwu_pending()                 try_to_wake_up()
+        *   [S] p->on_rq = 1;                  [L] p->state
+        *       UNLOCK rq->lock  -----.
+        *                              \
+        *                               +---   RMB
+        * schedule()                   /
+        *       LOCK rq->lock    -----'
+        *       UNLOCK rq->lock
+        *
+        * [task p]
+        *   [S] p->state = UNINTERRUPTIBLE     [L] p->on_rq
+        *
+        * Pairs with the UNLOCK+LOCK on rq->lock from the
+        * last wakeup of our task and the schedule that got our task
+        * current.
+        */
+       smp_rmb();
        if (p->on_rq && ttwu_remote(p, wake_flags))
                goto stat;
  
@@@ -3381,6 -3403,7 +3403,6 @@@ static void __sched notrace __schedule(
  
        balance_callback(rq);
  }
 -STACK_FRAME_NON_STANDARD(__schedule); /* switch_to() */
  
  static inline void sched_submit_work(struct task_struct *tsk)
  {