Merge branch 'linus' into x86/asm, to pick up recent fixes
author Ingo Molnar <mingo@kernel.org>
Thu, 15 Sep 2016 06:24:53 +0000 (08:24 +0200)
committer Ingo Molnar <mingo@kernel.org>
Thu, 15 Sep 2016 06:24:53 +0000 (08:24 +0200)
Signed-off-by: Ingo Molnar <mingo@kernel.org>
arch/Kconfig
arch/x86/Kconfig
arch/x86/kernel/kvmclock.c
arch/x86/kernel/paravirt.c
arch/x86/kvm/vmx.c
arch/x86/mm/kaslr.c
fs/proc/base.c
kernel/fork.c
kernel/sched/core.c

diff --combined arch/Kconfig
@@@ -336,17 -336,6 +336,6 @@@ config HAVE_ARCH_SECCOMP_FILTE
            results in the system call being skipped immediately.
          - seccomp syscall wired up
  
-         For best performance, an arch should use seccomp_phase1 and
-         seccomp_phase2 directly.  It should call seccomp_phase1 for all
-         syscalls if TIF_SECCOMP is set, but seccomp_phase1 does not
-         need to be called from a ptrace-safe context.  It must then
-         call seccomp_phase2 if seccomp_phase1 returns anything other
-         than SECCOMP_PHASE1_OK or SECCOMP_PHASE1_SKIP.
-         As an additional optimization, an arch may provide seccomp_data
-         directly to seccomp_phase1; this avoids multiple calls
-         to the syscall_xyz helpers for every syscall.
  config SECCOMP_FILTER
        def_bool y
        depends on HAVE_ARCH_SECCOMP_FILTER && SECCOMP && NET
@@@ -707,38 -696,4 +696,38 @@@ config ARCH_NO_COHERENT_DMA_MMA
  config CPU_NO_EFFICIENT_FFS
        def_bool n
  
 +config HAVE_ARCH_VMAP_STACK
 +      def_bool n
 +      help
 +        An arch should select this symbol if it can support kernel stacks
 +        in vmalloc space.  This means:
 +
 +        - vmalloc space must be large enough to hold many kernel stacks.
 +          This may rule out many 32-bit architectures.
 +
 +        - Stacks in vmalloc space need to work reliably.  For example, if
 +          vmap page tables are created on demand, either this mechanism
 +          needs to work while the stack points to a virtual address with
 +          unpopulated page tables or arch code (switch_to() and switch_mm(),
 +          most likely) needs to ensure that the stack's page table entries
 +          are populated before running on a possibly unpopulated stack.
 +
 +        - If the stack overflows into a guard page, something reasonable
 +          should happen.  The definition of "reasonable" is flexible, but
 +          instantly rebooting without logging anything would be unfriendly.
 +
 +config VMAP_STACK
 +      default y
 +      bool "Use a virtually-mapped stack"
 +      depends on HAVE_ARCH_VMAP_STACK && !KASAN
 +      ---help---
 +        Enable this if you want to use virtually-mapped kernel stacks
 +        with guard pages.  This causes kernel stack overflows to be
 +        caught immediately rather than causing difficult-to-diagnose
 +        corruption.
 +
 +        This is presently incompatible with KASAN because KASAN expects
 +        the stack to map directly to the KASAN shadow map using a formula
 +        that is incorrect if the stack is in vmalloc space.
 +
  source "kernel/gcov/Kconfig"
diff --combined arch/x86/Kconfig
@@@ -24,7 -24,6 +24,6 @@@ config X8
        select ARCH_DISCARD_MEMBLOCK
        select ARCH_HAS_ACPI_TABLE_UPGRADE if ACPI
        select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE
-       select ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS
        select ARCH_HAS_DEVMEM_IS_ALLOWED
        select ARCH_HAS_ELF_RANDOMIZE
        select ARCH_HAS_FAST_MULTIPLIER
@@@ -94,7 -93,6 +93,7 @@@
        select HAVE_ARCH_TRANSPARENT_HUGEPAGE
        select HAVE_ARCH_WITHIN_STACK_FRAMES
        select HAVE_EBPF_JIT                    if X86_64
 +      select HAVE_ARCH_VMAP_STACK             if X86_64
        select HAVE_CC_STACKPROTECTOR
        select HAVE_CMPXCHG_DOUBLE
        select HAVE_CMPXCHG_LOCAL
        select HAVE_EXIT_THREAD
        select HAVE_FENTRY                      if X86_64
        select HAVE_FTRACE_MCOUNT_RECORD
 -      select HAVE_FUNCTION_GRAPH_FP_TEST
        select HAVE_FUNCTION_GRAPH_TRACER
        select HAVE_FUNCTION_TRACER
        select HAVE_GCC_PLUGINS
@@@ -29,7 -29,7 +29,7 @@@
  #include <asm/x86_init.h>
  #include <asm/reboot.h>
  
 -static int kvmclock = 1;
 +static int kvmclock __ro_after_init = 1;
  static int msr_kvm_system_time = MSR_KVM_SYSTEM_TIME;
  static int msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK;
  static cycle_t kvm_sched_clock_offset;
@@@ -289,6 -289,7 +289,7 @@@ void __init kvmclock_init(void
        put_cpu();
  
        x86_platform.calibrate_tsc = kvm_get_tsc_khz;
+       x86_platform.calibrate_cpu = kvm_get_tsc_khz;
        x86_platform.get_wallclock = kvm_get_wallclock;
        x86_platform.set_wallclock = kvm_set_wallclock;
  #ifdef CONFIG_X86_LOCAL_APIC
@@@ -56,12 -56,12 +56,12 @@@ asm (".pushsection .entry.text, \"ax\"\
       ".popsection");
  
  /* identity function, which can be inlined */
- u32 _paravirt_ident_32(u32 x)
+ u32 notrace _paravirt_ident_32(u32 x)
  {
        return x;
  }
  
- u64 _paravirt_ident_64(u64 x)
+ u64 notrace _paravirt_ident_64(u64 x)
  {
        return x;
  }
@@@ -389,7 -389,7 +389,7 @@@ NOKPROBE_SYMBOL(native_load_idt)
  #define PTE_IDENT     __PV_IS_CALLEE_SAVE(_paravirt_ident_64)
  #endif
  
 -struct pv_mmu_ops pv_mmu_ops = {
 +struct pv_mmu_ops pv_mmu_ops __ro_after_init = {
  
        .read_cr2 = native_read_cr2,
        .write_cr2 = native_write_cr2,
diff --combined arch/x86/kvm/vmx.c
@@@ -422,6 -422,7 +422,7 @@@ struct nested_vmx 
        struct list_head vmcs02_pool;
        int vmcs02_num;
        u64 vmcs01_tsc_offset;
+       bool change_vmcs01_virtual_x2apic_mode;
        /* L2 must run next, and mustn't decide to exit to L1. */
        bool nested_run_pending;
        /*
        bool pi_pending;
        u16 posted_intr_nv;
  
+       unsigned long *msr_bitmap;
        struct hrtimer preemption_timer;
        bool preemption_timer_expired;
  
@@@ -924,7 -927,6 +927,6 @@@ static unsigned long *vmx_msr_bitmap_le
  static unsigned long *vmx_msr_bitmap_longmode;
  static unsigned long *vmx_msr_bitmap_legacy_x2apic;
  static unsigned long *vmx_msr_bitmap_longmode_x2apic;
- static unsigned long *vmx_msr_bitmap_nested;
  static unsigned long *vmx_vmread_bitmap;
  static unsigned long *vmx_vmwrite_bitmap;
  
@@@ -2198,6 -2200,12 +2200,12 @@@ static void vmx_vcpu_pi_load(struct kvm
                        new.control) != old.control);
  }
  
+ static void decache_tsc_multiplier(struct vcpu_vmx *vmx)
+ {
+       vmx->current_tsc_ratio = vmx->vcpu.arch.tsc_scaling_ratio;
+       vmcs_write64(TSC_MULTIPLIER, vmx->current_tsc_ratio);
+ }
  /*
   * Switches to specified vcpu, until a matching vcpu_put(), but assumes
   * vcpu mutex is already taken.
@@@ -2256,10 -2264,8 +2264,8 @@@ static void vmx_vcpu_load(struct kvm_vc
  
        /* Setup TSC multiplier */
        if (kvm_has_tsc_control &&
-           vmx->current_tsc_ratio != vcpu->arch.tsc_scaling_ratio) {
-               vmx->current_tsc_ratio = vcpu->arch.tsc_scaling_ratio;
-               vmcs_write64(TSC_MULTIPLIER, vmx->current_tsc_ratio);
-       }
+           vmx->current_tsc_ratio != vcpu->arch.tsc_scaling_ratio)
+               decache_tsc_multiplier(vmx);
  
        vmx_vcpu_pi_load(vcpu, cpu);
        vmx->host_pkru = read_pkru();
@@@ -2508,7 -2514,7 +2514,7 @@@ static void vmx_set_msr_bitmap(struct k
        unsigned long *msr_bitmap;
  
        if (is_guest_mode(vcpu))
-               msr_bitmap = vmx_msr_bitmap_nested;
+               msr_bitmap = to_vmx(vcpu)->nested.msr_bitmap;
        else if (cpu_has_secondary_exec_ctrls() &&
                 (vmcs_read32(SECONDARY_VM_EXEC_CONTROL) &
                  SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) {
@@@ -6363,13 -6369,6 +6369,6 @@@ static __init int hardware_setup(void
        if (!vmx_msr_bitmap_longmode_x2apic)
                goto out4;
  
-       if (nested) {
-               vmx_msr_bitmap_nested =
-                       (unsigned long *)__get_free_page(GFP_KERNEL);
-               if (!vmx_msr_bitmap_nested)
-                       goto out5;
-       }
        vmx_vmread_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL);
        if (!vmx_vmread_bitmap)
                goto out6;
  
        memset(vmx_msr_bitmap_legacy, 0xff, PAGE_SIZE);
        memset(vmx_msr_bitmap_longmode, 0xff, PAGE_SIZE);
-       if (nested)
-               memset(vmx_msr_bitmap_nested, 0xff, PAGE_SIZE);
  
        if (setup_vmcs_config(&vmcs_config) < 0) {
                r = -EIO;
@@@ -6529,9 -6526,6 +6526,6 @@@ out8
  out7:
        free_page((unsigned long)vmx_vmread_bitmap);
  out6:
-       if (nested)
-               free_page((unsigned long)vmx_msr_bitmap_nested);
- out5:
        free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic);
  out4:
        free_page((unsigned long)vmx_msr_bitmap_longmode);
@@@ -6557,8 -6551,6 +6551,6 @@@ static __exit void hardware_unsetup(voi
        free_page((unsigned long)vmx_io_bitmap_a);
        free_page((unsigned long)vmx_vmwrite_bitmap);
        free_page((unsigned long)vmx_vmread_bitmap);
-       if (nested)
-               free_page((unsigned long)vmx_msr_bitmap_nested);
  
        free_kvm_area();
  }
@@@ -6995,16 -6987,21 +6987,21 @@@ static int handle_vmon(struct kvm_vcpu 
                return 1;
        }
  
+       if (cpu_has_vmx_msr_bitmap()) {
+               vmx->nested.msr_bitmap =
+                               (unsigned long *)__get_free_page(GFP_KERNEL);
+               if (!vmx->nested.msr_bitmap)
+                       goto out_msr_bitmap;
+       }
        vmx->nested.cached_vmcs12 = kmalloc(VMCS12_SIZE, GFP_KERNEL);
        if (!vmx->nested.cached_vmcs12)
-               return -ENOMEM;
+               goto out_cached_vmcs12;
  
        if (enable_shadow_vmcs) {
                shadow_vmcs = alloc_vmcs();
-               if (!shadow_vmcs) {
-                       kfree(vmx->nested.cached_vmcs12);
-                       return -ENOMEM;
-               }
+               if (!shadow_vmcs)
+                       goto out_shadow_vmcs;
                /* mark vmcs as shadow */
                shadow_vmcs->revision_id |= (1u << 31);
                /* init shadow vmcs */
        skip_emulated_instruction(vcpu);
        nested_vmx_succeed(vcpu);
        return 1;
+ out_shadow_vmcs:
+       kfree(vmx->nested.cached_vmcs12);
+ out_cached_vmcs12:
+       free_page((unsigned long)vmx->nested.msr_bitmap);
+ out_msr_bitmap:
+       return -ENOMEM;
  }
  
  /*
@@@ -7098,6 -7104,10 +7104,10 @@@ static void free_nested(struct vcpu_vm
        vmx->nested.vmxon = false;
        free_vpid(vmx->nested.vpid02);
        nested_release_vmcs12(vmx);
+       if (vmx->nested.msr_bitmap) {
+               free_page((unsigned long)vmx->nested.msr_bitmap);
+               vmx->nested.msr_bitmap = NULL;
+       }
        if (enable_shadow_vmcs)
                free_vmcs(vmx->nested.current_shadow_vmcs);
        kfree(vmx->nested.cached_vmcs12);
@@@ -8419,6 -8429,12 +8429,12 @@@ static void vmx_set_virtual_x2apic_mode
  {
        u32 sec_exec_control;
  
+       /* Postpone execution until vmcs01 is the current VMCS. */
+       if (is_guest_mode(vcpu)) {
+               to_vmx(vcpu)->nested.change_vmcs01_virtual_x2apic_mode = true;
+               return;
+       }
        /*
   * There is no point in enabling virtualize x2apic without enabling
         * apicv
@@@ -9472,8 -9488,10 +9488,10 @@@ static inline bool nested_vmx_merge_msr
  {
        int msr;
        struct page *page;
-       unsigned long *msr_bitmap;
+       unsigned long *msr_bitmap_l1;
+       unsigned long *msr_bitmap_l0 = to_vmx(vcpu)->nested.msr_bitmap;
  
+       /* This shortcut is ok because we support only x2APIC MSRs so far. */
        if (!nested_cpu_has_virt_x2apic_mode(vmcs12))
                return false;
  
                WARN_ON(1);
                return false;
        }
-       msr_bitmap = (unsigned long *)kmap(page);
-       if (!msr_bitmap) {
+       msr_bitmap_l1 = (unsigned long *)kmap(page);
+       if (!msr_bitmap_l1) {
                nested_release_page_clean(page);
                WARN_ON(1);
                return false;
        }
  
+       memset(msr_bitmap_l0, 0xff, PAGE_SIZE);
        if (nested_cpu_has_virt_x2apic_mode(vmcs12)) {
                if (nested_cpu_has_apic_reg_virt(vmcs12))
                        for (msr = 0x800; msr <= 0x8ff; msr++)
                                nested_vmx_disable_intercept_for_msr(
-                                       msr_bitmap,
-                                       vmx_msr_bitmap_nested,
+                                       msr_bitmap_l1, msr_bitmap_l0,
                                        msr, MSR_TYPE_R);
-               /* TPR is allowed */
-               nested_vmx_disable_intercept_for_msr(msr_bitmap,
-                               vmx_msr_bitmap_nested,
+               nested_vmx_disable_intercept_for_msr(
+                               msr_bitmap_l1, msr_bitmap_l0,
                                APIC_BASE_MSR + (APIC_TASKPRI >> 4),
                                MSR_TYPE_R | MSR_TYPE_W);
                if (nested_cpu_has_vid(vmcs12)) {
-                       /* EOI and self-IPI are allowed */
                        nested_vmx_disable_intercept_for_msr(
-                               msr_bitmap,
-                               vmx_msr_bitmap_nested,
+                               msr_bitmap_l1, msr_bitmap_l0,
                                APIC_BASE_MSR + (APIC_EOI >> 4),
                                MSR_TYPE_W);
                        nested_vmx_disable_intercept_for_msr(
-                               msr_bitmap,
-                               vmx_msr_bitmap_nested,
+                               msr_bitmap_l1, msr_bitmap_l0,
                                APIC_BASE_MSR + (APIC_SELF_IPI >> 4),
                                MSR_TYPE_W);
                }
-       } else {
-               /*
-                * Enable reading intercept of all the x2apic
-                * MSRs. We should not rely on vmcs12 to do any
-                * optimizations here, it may have been modified
-                * by L1.
-                */
-               for (msr = 0x800; msr <= 0x8ff; msr++)
-                       __vmx_enable_intercept_for_msr(
-                               vmx_msr_bitmap_nested,
-                               msr,
-                               MSR_TYPE_R);
-               __vmx_enable_intercept_for_msr(
-                               vmx_msr_bitmap_nested,
-                               APIC_BASE_MSR + (APIC_TASKPRI >> 4),
-                               MSR_TYPE_W);
-               __vmx_enable_intercept_for_msr(
-                               vmx_msr_bitmap_nested,
-                               APIC_BASE_MSR + (APIC_EOI >> 4),
-                               MSR_TYPE_W);
-               __vmx_enable_intercept_for_msr(
-                               vmx_msr_bitmap_nested,
-                               APIC_BASE_MSR + (APIC_SELF_IPI >> 4),
-                               MSR_TYPE_W);
        }
        kunmap(page);
        nested_release_page_clean(page);
@@@ -9957,10 -9949,10 +9949,10 @@@ static void prepare_vmcs02(struct kvm_v
        }
  
        if (cpu_has_vmx_msr_bitmap() &&
-           exec_control & CPU_BASED_USE_MSR_BITMAPS) {
-               nested_vmx_merge_msr_bitmap(vcpu, vmcs12);
-               /* MSR_BITMAP will be set by following vmx_set_efer. */
-       } else
+           exec_control & CPU_BASED_USE_MSR_BITMAPS &&
+           nested_vmx_merge_msr_bitmap(vcpu, vmcs12))
+               ; /* MSR_BITMAP will be set by following vmx_set_efer. */
+       else
                exec_control &= ~CPU_BASED_USE_MSR_BITMAPS;
  
        /*
                        vmx->nested.vmcs01_tsc_offset + vmcs12->tsc_offset);
        else
                vmcs_write64(TSC_OFFSET, vmx->nested.vmcs01_tsc_offset);
+       if (kvm_has_tsc_control)
+               decache_tsc_multiplier(vmx);
  
        if (enable_vpid) {
                /*
@@@ -10767,6 -10761,14 +10761,14 @@@ static void nested_vmx_vmexit(struct kv
        else
                vmcs_set_bits(PIN_BASED_VM_EXEC_CONTROL,
                              PIN_BASED_VMX_PREEMPTION_TIMER);
+       if (kvm_has_tsc_control)
+               decache_tsc_multiplier(vmx);
+       if (vmx->nested.change_vmcs01_virtual_x2apic_mode) {
+               vmx->nested.change_vmcs01_virtual_x2apic_mode = false;
+               vmx_set_virtual_x2apic_mode(vcpu,
+                               vcpu->arch.apic_base & X2APIC_ENABLE);
+       }
  
        /* This is needed for same reason as it was needed in prepare_vmcs02 */
        vmx->host_rsp = 0;
@@@ -11175,7 -11177,7 +11177,7 @@@ static void vmx_setup_mce(struct kvm_vc
                        ~FEATURE_CONTROL_LMCE;
  }
  
 -static struct kvm_x86_ops vmx_x86_ops = {
 +static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
        .cpu_has_kvm_support = cpu_has_kvm_support,
        .disabled_by_bios = vmx_disabled_by_bios,
        .hardware_setup = hardware_setup,
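
The handle_vmon() change above replaces open-coded error handling with the
usual cascading-label unwind: each allocation gets a label, and a failure
jumps to the label that frees everything allocated before it, in reverse
order.  A minimal user-space sketch of the same pattern (the struct, field
and function names are illustrative, not kernel API):

    #include <stdlib.h>

    struct ctx {
            void *msr_bitmap;
            void *cached_vmcs;
    };

    static int ctx_init(struct ctx *c)
    {
            c->msr_bitmap = malloc(4096);
            if (!c->msr_bitmap)
                    goto out_msr_bitmap;

            c->cached_vmcs = malloc(4096);
            if (!c->cached_vmcs)
                    goto out_cached_vmcs;

            return 0;

    out_cached_vmcs:
            free(c->msr_bitmap);
    out_msr_bitmap:
            return -1;      /* handle_vmon() returns -ENOMEM here */
    }

The benefit over the old code is that each new allocation only adds one label
and one goto, instead of repeating the cleanup of every earlier allocation at
each failure site.
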
diff --combined arch/x86/mm/kaslr.c
   * You need to add an if/def entry if you introduce a new memory region
   * compatible with KASLR. Your entry must be in logical order with memory
   * layout. For example, ESPFIX is before EFI because its virtual address is
 - * before. You also need to add a BUILD_BUG_ON in kernel_randomize_memory to
 + * before. You also need to add a BUILD_BUG_ON() in kernel_randomize_memory() to
   * ensure that this order is correct and won't be changed.
   */
  static const unsigned long vaddr_start = __PAGE_OFFSET_BASE;
 -static const unsigned long vaddr_end = VMEMMAP_START;
 +
 +#if defined(CONFIG_X86_ESPFIX64)
 +static const unsigned long vaddr_end = ESPFIX_BASE_ADDR;
 +#elif defined(CONFIG_EFI)
 +static const unsigned long vaddr_end = EFI_VA_START;
 +#else
 +static const unsigned long vaddr_end = __START_KERNEL_map;
 +#endif
  
  /* Default values */
  unsigned long page_offset_base = __PAGE_OFFSET_BASE;
  EXPORT_SYMBOL(page_offset_base);
  unsigned long vmalloc_base = __VMALLOC_BASE;
  EXPORT_SYMBOL(vmalloc_base);
 +unsigned long vmemmap_base = __VMEMMAP_BASE;
 +EXPORT_SYMBOL(vmemmap_base);
  
  /*
   * Memory regions randomized by KASLR (except modules that use a separate logic
@@@ -72,7 -63,6 +72,7 @@@ static __initdata struct kaslr_memory_r
  } kaslr_regions[] = {
        { &page_offset_base, 64/* Maximum */ },
        { &vmalloc_base, VMALLOC_SIZE_TB },
 +      { &vmemmap_base, 1 },
  };
  
  /* Get size in bytes used by the memory region */
@@@ -87,7 -77,7 +87,7 @@@ static inline unsigned long get_padding
   */
  static inline bool kaslr_memory_enabled(void)
  {
-       return kaslr_enabled() && !config_enabled(CONFIG_KASAN);
+       return kaslr_enabled() && !IS_ENABLED(CONFIG_KASAN);
  }
  
  /* Initialize base and padding for each memory region randomized with KASLR */
@@@ -99,18 -89,6 +99,18 @@@ void __init kernel_randomize_memory(voi
        struct rnd_state rand_state;
        unsigned long remain_entropy;
  
 +      /*
 +       * All these BUILD_BUG_ON checks ensure that the memory layout is
 +       * consistent with the vaddr_start/vaddr_end variables.
 +       */
 +      BUILD_BUG_ON(vaddr_start >= vaddr_end);
 +      BUILD_BUG_ON(config_enabled(CONFIG_X86_ESPFIX64) &&
 +                   vaddr_end >= EFI_VA_START);
 +      BUILD_BUG_ON((config_enabled(CONFIG_X86_ESPFIX64) ||
 +                    config_enabled(CONFIG_EFI)) &&
 +                   vaddr_end >= __START_KERNEL_map);
 +      BUILD_BUG_ON(vaddr_end > __START_KERNEL_map);
 +
        if (!kaslr_memory_enabled())
                return;
  
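
The comment at the top of the kaslr.c hunk spells out the recipe this diff
follows for vmemmap_base: a new randomized region needs a base variable with
its compile-time default, an entry in kaslr_regions[] in memory-layout order,
and a BUILD_BUG_ON() in kernel_randomize_memory() pinning that order.  A
hypothetical follow-up region would look roughly like this; example_base and
__EXAMPLE_BASE are invented names, purely for illustration:

    /* 1) Base variable with its non-randomized default, like vmemmap_base: */
    unsigned long example_base = __EXAMPLE_BASE;
    EXPORT_SYMBOL(example_base);

    /*
     * 2) Entry appended to kaslr_regions[], keeping memory-layout order
     *    (the second field appears to be a size budget in TB):
     *
     *        { &example_base, 1 },
     *
     * 3) Ordering check added to kernel_randomize_memory():
     *
     *        BUILD_BUG_ON(__EXAMPLE_BASE <= __VMEMMAP_BASE);
     */
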
diff --combined fs/proc/base.c
@@@ -483,7 -483,7 +483,7 @@@ static int proc_pid_stack(struct seq_fi
                save_stack_trace_tsk(task, &trace);
  
                for (i = 0; i < trace.nr_entries; i++) {
 -                      seq_printf(m, "[<%pK>] %pS\n",
 +                      seq_printf(m, "[<%pK>] %pB\n",
                                   (void *)entries[i], (void *)entries[i]);
                }
                unlock_trace(task);
@@@ -1556,18 -1556,13 +1556,13 @@@ static const struct file_operations pro
  static int proc_exe_link(struct dentry *dentry, struct path *exe_path)
  {
        struct task_struct *task;
-       struct mm_struct *mm;
        struct file *exe_file;
  
        task = get_proc_task(d_inode(dentry));
        if (!task)
                return -ENOENT;
-       mm = get_task_mm(task);
+       exe_file = get_task_exe_file(task);
        put_task_struct(task);
-       if (!mm)
-               return -ENOENT;
-       exe_file = get_mm_exe_file(mm);
-       mmput(mm);
        if (exe_file) {
                *exe_path = exe_file->f_path;
                path_get(&exe_file->f_path);
diff --combined kernel/fork.c
@@@ -158,39 -158,19 +158,39 @@@ void __weak arch_release_thread_stack(u
   * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a
   * kmemcache based allocator.
   */
 -# if THREAD_SIZE >= PAGE_SIZE
 -static unsigned long *alloc_thread_stack_node(struct task_struct *tsk,
 -                                                int node)
 +# if THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK)
 +static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
  {
 +#ifdef CONFIG_VMAP_STACK
 +      void *stack = __vmalloc_node_range(THREAD_SIZE, THREAD_SIZE,
 +                                         VMALLOC_START, VMALLOC_END,
 +                                         THREADINFO_GFP | __GFP_HIGHMEM,
 +                                         PAGE_KERNEL,
 +                                         0, node,
 +                                         __builtin_return_address(0));
 +
 +      /*
 +       * We can't call find_vm_area() in interrupt context, and
 +       * free_thread_stack() can be called in interrupt context,
 +       * so cache the vm_struct.
 +       */
 +      if (stack)
 +              tsk->stack_vm_area = find_vm_area(stack);
 +      return stack;
 +#else
        struct page *page = alloc_pages_node(node, THREADINFO_GFP,
                                             THREAD_SIZE_ORDER);
  
        return page ? page_address(page) : NULL;
 +#endif
  }
  
 -static inline void free_thread_stack(unsigned long *stack)
 +static inline void free_thread_stack(struct task_struct *tsk)
  {
 -      __free_pages(virt_to_page(stack), THREAD_SIZE_ORDER);
 +      if (task_stack_vm_area(tsk))
 +              vfree(tsk->stack);
 +      else
 +              __free_pages(virt_to_page(tsk->stack), THREAD_SIZE_ORDER);
  }
  # else
  static struct kmem_cache *thread_stack_cache;
@@@ -201,9 -181,9 +201,9 @@@ static unsigned long *alloc_thread_stac
        return kmem_cache_alloc_node(thread_stack_cache, THREADINFO_GFP, node);
  }
  
 -static void free_thread_stack(unsigned long *stack)
 +static void free_thread_stack(struct task_struct *tsk)
  {
 -      kmem_cache_free(thread_stack_cache, stack);
 +      kmem_cache_free(thread_stack_cache, tsk->stack);
  }
  
  void thread_stack_cache_init(void)
@@@ -233,47 -213,24 +233,47 @@@ struct kmem_cache *vm_area_cachep
  /* SLAB cache for mm_struct structures (tsk->mm) */
  static struct kmem_cache *mm_cachep;
  
 -static void account_kernel_stack(unsigned long *stack, int account)
 +static void account_kernel_stack(struct task_struct *tsk, int account)
  {
 -      /* All stack pages are in the same zone and belong to the same memcg. */
 -      struct page *first_page = virt_to_page(stack);
 +      void *stack = task_stack_page(tsk);
 +      struct vm_struct *vm = task_stack_vm_area(tsk);
 +
 +      BUILD_BUG_ON(IS_ENABLED(CONFIG_VMAP_STACK) && PAGE_SIZE % 1024 != 0);
 +
 +      if (vm) {
 +              int i;
 +
 +              BUG_ON(vm->nr_pages != THREAD_SIZE / PAGE_SIZE);
 +
 +              for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) {
 +                      mod_zone_page_state(page_zone(vm->pages[i]),
 +                                          NR_KERNEL_STACK_KB,
 +                                          PAGE_SIZE / 1024 * account);
 +              }
  
 -      mod_zone_page_state(page_zone(first_page), NR_KERNEL_STACK_KB,
 -                          THREAD_SIZE / 1024 * account);
 +              /* All stack pages belong to the same memcg. */
 +              memcg_kmem_update_page_stat(vm->pages[0], MEMCG_KERNEL_STACK_KB,
 +                                          account * (THREAD_SIZE / 1024));
 +      } else {
 +              /*
 +               * All stack pages are in the same zone and belong to the
 +               * same memcg.
 +               */
 +              struct page *first_page = virt_to_page(stack);
  
 -      memcg_kmem_update_page_stat(
 -              first_page, MEMCG_KERNEL_STACK_KB,
 -              account * (THREAD_SIZE / 1024));
 +              mod_zone_page_state(page_zone(first_page), NR_KERNEL_STACK_KB,
 +                                  THREAD_SIZE / 1024 * account);
 +
 +              memcg_kmem_update_page_stat(first_page, MEMCG_KERNEL_STACK_KB,
 +                                          account * (THREAD_SIZE / 1024));
 +      }
  }
  
  void free_task(struct task_struct *tsk)
  {
 -      account_kernel_stack(tsk->stack, -1);
 +      account_kernel_stack(tsk, -1);
        arch_release_thread_stack(tsk->stack);
 -      free_thread_stack(tsk->stack);
 +      free_thread_stack(tsk);
        rt_mutex_debug_task_free(tsk);
        ftrace_graph_exit_task(tsk);
        put_seccomp_filter(tsk);
@@@ -385,7 -342,6 +385,7 @@@ static struct task_struct *dup_task_str
  {
        struct task_struct *tsk;
        unsigned long *stack;
 +      struct vm_struct *stack_vm_area;
        int err;
  
        if (node == NUMA_NO_NODE)
        if (!stack)
                goto free_tsk;
  
 +      stack_vm_area = task_stack_vm_area(tsk);
 +
        err = arch_dup_task_struct(tsk, orig);
 +
 +      /*
 +       * arch_dup_task_struct() clobbers the stack-related fields.  Make
 +       * sure they're properly initialized before using any stack-related
 +       * functions again.
 +       */
 +      tsk->stack = stack;
 +#ifdef CONFIG_VMAP_STACK
 +      tsk->stack_vm_area = stack_vm_area;
 +#endif
 +
        if (err)
                goto free_stack;
  
 -      tsk->stack = stack;
  #ifdef CONFIG_SECCOMP
        /*
         * We must handle setting up seccomp filters once we're under
        tsk->task_frag.page = NULL;
        tsk->wake_q.next = NULL;
  
 -      account_kernel_stack(stack, 1);
 +      account_kernel_stack(tsk, 1);
  
        kcov_task_init(tsk);
  
        return tsk;
  
  free_stack:
 -      free_thread_stack(stack);
 +      free_thread_stack(tsk);
  free_tsk:
        free_task_struct(tsk);
        return NULL;
@@@ -854,6 -798,29 +854,29 @@@ struct file *get_mm_exe_file(struct mm_
  }
  EXPORT_SYMBOL(get_mm_exe_file);
  
+ /**
+  * get_task_exe_file - acquire a reference to the task's executable file
+  *
+  * Returns %NULL if task's mm (if any) has no associated executable file or
+  * this is a kernel thread with borrowed mm (see the comment above get_task_mm).
+  * User must release file via fput().
+  */
+ struct file *get_task_exe_file(struct task_struct *task)
+ {
+       struct file *exe_file = NULL;
+       struct mm_struct *mm;
+       task_lock(task);
+       mm = task->mm;
+       if (mm) {
+               if (!(task->flags & PF_KTHREAD))
+                       exe_file = get_mm_exe_file(mm);
+       }
+       task_unlock(task);
+       return exe_file;
+ }
+ EXPORT_SYMBOL(get_task_exe_file);
  /**
   * get_task_mm - acquire a reference to the task's mm
   *
@@@ -969,14 -936,12 +992,12 @@@ void mm_release(struct task_struct *tsk
        deactivate_mm(tsk, mm);
  
        /*
-        * If we're exiting normally, clear a user-space tid field if
-        * requested.  We leave this alone when dying by signal, to leave
-        * the value intact in a core dump, and to save the unnecessary
-        * trouble, say, a killed vfork parent shouldn't touch this mm.
-        * Userland only wants this done for a sys_exit.
+        * Signal userspace if we're not exiting with a core dump
+        * because we want to leave the value intact for debugging
+        * purposes.
         */
        if (tsk->clear_child_tid) {
-               if (!(tsk->flags & PF_SIGNALED) &&
+               if (!(tsk->signal->flags & SIGNAL_GROUP_COREDUMP) &&
                    atomic_read(&mm->mm_users) > 1) {
                        /*
                         * We don't check the error code - if userspace has
@@@ -1460,7 -1425,6 +1481,6 @@@ static struct task_struct *copy_process
        p->real_start_time = ktime_get_boot_ns();
        p->io_context = NULL;
        p->audit_context = NULL;
-       threadgroup_change_begin(current);
        cgroup_fork(p);
  #ifdef CONFIG_NUMA
        p->mempolicy = mpol_dup(p->mempolicy);
        INIT_LIST_HEAD(&p->thread_group);
        p->task_works = NULL;
  
+       threadgroup_change_begin(current);
        /*
         * Ensure that the cgroup subsystem policies allow the new process to be
        * forked. It should be noted that the new process's css_set can be changed
  bad_fork_cancel_cgroup:
        cgroup_cancel_fork(p);
  bad_fork_free_pid:
+       threadgroup_change_end(current);
        if (pid != &init_struct_pid)
                free_pid(pid);
  bad_fork_cleanup_thread:
@@@ -1744,7 -1710,6 +1766,6 @@@ bad_fork_cleanup_policy
        mpol_put(p->mempolicy);
  bad_fork_cleanup_threadgroup_lock:
  #endif
-       threadgroup_change_end(current);
        delayacct_tsk_free(p);
  bad_fork_cleanup_count:
        atomic_dec(&p->cred->user->processes);
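
The get_task_exe_file() helper added above lets callers reach the exe file
without going through get_task_mm(), and its kernel-doc pins the calling
convention: a non-NULL return carries a file reference the caller must drop
with fput().  A small illustrative caller (the wrapper function name is made
up):

    static void example_use_exe_file(struct task_struct *task)
    {
            struct file *exe_file = get_task_exe_file(task);

            if (!exe_file)
                    return;         /* no mm, or a kernel thread with borrowed mm */

            /* ... inspect exe_file->f_path here ... */

            fput(exe_file);         /* drop the reference the helper took */
    }

Compared with the old get_task_mm()/get_mm_exe_file()/mmput() sequence in
proc_exe_link(), this avoids bumping and dropping an mm reference just to
peek at the exe file.
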
diff --combined kernel/sched/core.c
@@@ -2016,6 -2016,28 +2016,28 @@@ try_to_wake_up(struct task_struct *p, u
        success = 1; /* we're going to change ->state */
        cpu = task_cpu(p);
  
+       /*
+        * Ensure we load p->on_rq _after_ p->state, otherwise it would
+        * be possible to, falsely, observe p->on_rq == 0 and get stuck
+        * in smp_cond_load_acquire() below.
+        *
+        * sched_ttwu_pending()                 try_to_wake_up()
+        *   [S] p->on_rq = 1;                  [L] p->state
+        *       UNLOCK rq->lock  -----.
+        *                              \
+        *                               +---   RMB
+        * schedule()                   /
+        *       LOCK rq->lock    -----'
+        *       UNLOCK rq->lock
+        *
+        * [task p]
+        *   [S] p->state = UNINTERRUPTIBLE     [L] p->on_rq
+        *
+        * Pairs with the UNLOCK+LOCK on rq->lock from the
+        * last wakeup of our task and the schedule that got our task
+        * current.
+        */
+       smp_rmb();
        if (p->on_rq && ttwu_remote(p, wake_flags))
                goto stat;
  
@@@ -3381,6 -3403,7 +3403,6 @@@ static void __sched notrace __schedule(
  
        balance_callback(rq);
  }
 -STACK_FRAME_NON_STANDARD(__schedule); /* switch_to() */
  
  static inline void sched_submit_work(struct task_struct *tsk)
  {