i386: move kernel
authorThomas Gleixner <tglx@linutronix.de>
Thu, 11 Oct 2007 09:17:01 +0000 (11:17 +0200)
committerThomas Gleixner <tglx@linutronix.de>
Thu, 11 Oct 2007 09:17:01 +0000 (11:17 +0200)
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
163 files changed:
arch/i386/Makefile
arch/i386/kernel/.gitignore [deleted file]
arch/i386/kernel/Makefile [deleted file]
arch/i386/kernel/Makefile_32 [deleted file]
arch/i386/kernel/alternative.c [deleted file]
arch/i386/kernel/apic_32.c [deleted file]
arch/i386/kernel/apm_32.c [deleted file]
arch/i386/kernel/asm-offsets.c [deleted file]
arch/i386/kernel/asm-offsets_32.c [deleted file]
arch/i386/kernel/bootflag.c [deleted file]
arch/i386/kernel/cpuid.c [deleted file]
arch/i386/kernel/crash_32.c [deleted file]
arch/i386/kernel/crash_dump_32.c [deleted file]
arch/i386/kernel/doublefault_32.c [deleted file]
arch/i386/kernel/e820_32.c [deleted file]
arch/i386/kernel/early_printk.c [deleted file]
arch/i386/kernel/efi_32.c [deleted file]
arch/i386/kernel/efi_stub_32.S [deleted file]
arch/i386/kernel/entry_32.S [deleted file]
arch/i386/kernel/geode_32.c [deleted file]
arch/i386/kernel/head_32.S [deleted file]
arch/i386/kernel/hpet_32.c [deleted file]
arch/i386/kernel/i386_ksyms_32.c [deleted file]
arch/i386/kernel/i387_32.c [deleted file]
arch/i386/kernel/i8237.c [deleted file]
arch/i386/kernel/i8253_32.c [deleted file]
arch/i386/kernel/i8259_32.c [deleted file]
arch/i386/kernel/init_task_32.c [deleted file]
arch/i386/kernel/io_apic_32.c [deleted file]
arch/i386/kernel/ioport_32.c [deleted file]
arch/i386/kernel/irq_32.c [deleted file]
arch/i386/kernel/kprobes_32.c [deleted file]
arch/i386/kernel/ldt_32.c [deleted file]
arch/i386/kernel/machine_kexec_32.c [deleted file]
arch/i386/kernel/mca_32.c [deleted file]
arch/i386/kernel/microcode.c [deleted file]
arch/i386/kernel/module_32.c [deleted file]
arch/i386/kernel/mpparse_32.c [deleted file]
arch/i386/kernel/msr.c [deleted file]
arch/i386/kernel/nmi_32.c [deleted file]
arch/i386/kernel/numaq_32.c [deleted file]
arch/i386/kernel/paravirt_32.c [deleted file]
arch/i386/kernel/pci-dma_32.c [deleted file]
arch/i386/kernel/pcspeaker.c [deleted file]
arch/i386/kernel/process_32.c [deleted file]
arch/i386/kernel/ptrace_32.c [deleted file]
arch/i386/kernel/quirks.c [deleted file]
arch/i386/kernel/reboot_32.c [deleted file]
arch/i386/kernel/reboot_fixups_32.c [deleted file]
arch/i386/kernel/relocate_kernel_32.S [deleted file]
arch/i386/kernel/scx200_32.c [deleted file]
arch/i386/kernel/setup_32.c [deleted file]
arch/i386/kernel/sigframe_32.h [deleted file]
arch/i386/kernel/signal_32.c [deleted file]
arch/i386/kernel/smp_32.c [deleted file]
arch/i386/kernel/smpboot_32.c [deleted file]
arch/i386/kernel/smpcommon_32.c [deleted file]
arch/i386/kernel/srat_32.c [deleted file]
arch/i386/kernel/summit_32.c [deleted file]
arch/i386/kernel/sys_i386_32.c [deleted file]
arch/i386/kernel/syscall_table_32.S [deleted file]
arch/i386/kernel/sysenter_32.c [deleted file]
arch/i386/kernel/time_32.c [deleted file]
arch/i386/kernel/topology.c [deleted file]
arch/i386/kernel/trampoline_32.S [deleted file]
arch/i386/kernel/traps_32.c [deleted file]
arch/i386/kernel/tsc_32.c [deleted file]
arch/i386/kernel/tsc_sync.c [deleted file]
arch/i386/kernel/vm86_32.c [deleted file]
arch/i386/kernel/vmi_32.c [deleted file]
arch/i386/kernel/vmiclock_32.c [deleted file]
arch/i386/kernel/vmlinux.lds.S [deleted file]
arch/i386/kernel/vmlinux_32.lds.S [deleted file]
arch/i386/kernel/vsyscall-int80_32.S [deleted file]
arch/i386/kernel/vsyscall-note_32.S [deleted file]
arch/i386/kernel/vsyscall-sigreturn_32.S [deleted file]
arch/i386/kernel/vsyscall-sysenter_32.S [deleted file]
arch/i386/kernel/vsyscall_32.S [deleted file]
arch/i386/kernel/vsyscall_32.lds.S [deleted file]
arch/um/sys-i386/sys_call_table.S
arch/x86/kernel/.gitignore [new file with mode: 0644]
arch/x86/kernel/Makefile [new file with mode: 0644]
arch/x86/kernel/Makefile_32 [new file with mode: 0644]
arch/x86/kernel/alternative.c [new file with mode: 0644]
arch/x86/kernel/apic_32.c [new file with mode: 0644]
arch/x86/kernel/apm_32.c [new file with mode: 0644]
arch/x86/kernel/asm-offsets.c [new file with mode: 0644]
arch/x86/kernel/asm-offsets_32.c [new file with mode: 0644]
arch/x86/kernel/bootflag.c [new file with mode: 0644]
arch/x86/kernel/cpuid.c [new file with mode: 0644]
arch/x86/kernel/crash_32.c [new file with mode: 0644]
arch/x86/kernel/crash_dump_32.c [new file with mode: 0644]
arch/x86/kernel/doublefault_32.c [new file with mode: 0644]
arch/x86/kernel/e820_32.c [new file with mode: 0644]
arch/x86/kernel/early_printk.c [new file with mode: 0644]
arch/x86/kernel/efi_32.c [new file with mode: 0644]
arch/x86/kernel/efi_stub_32.S [new file with mode: 0644]
arch/x86/kernel/entry_32.S [new file with mode: 0644]
arch/x86/kernel/geode_32.c [new file with mode: 0644]
arch/x86/kernel/head_32.S [new file with mode: 0644]
arch/x86/kernel/hpet_32.c [new file with mode: 0644]
arch/x86/kernel/i386_ksyms_32.c [new file with mode: 0644]
arch/x86/kernel/i387_32.c [new file with mode: 0644]
arch/x86/kernel/i8237.c [new file with mode: 0644]
arch/x86/kernel/i8253_32.c [new file with mode: 0644]
arch/x86/kernel/i8259_32.c [new file with mode: 0644]
arch/x86/kernel/init_task_32.c [new file with mode: 0644]
arch/x86/kernel/io_apic_32.c [new file with mode: 0644]
arch/x86/kernel/ioport_32.c [new file with mode: 0644]
arch/x86/kernel/irq_32.c [new file with mode: 0644]
arch/x86/kernel/kprobes_32.c [new file with mode: 0644]
arch/x86/kernel/ldt_32.c [new file with mode: 0644]
arch/x86/kernel/machine_kexec_32.c [new file with mode: 0644]
arch/x86/kernel/mca_32.c [new file with mode: 0644]
arch/x86/kernel/microcode.c [new file with mode: 0644]
arch/x86/kernel/module_32.c [new file with mode: 0644]
arch/x86/kernel/mpparse_32.c [new file with mode: 0644]
arch/x86/kernel/msr.c [new file with mode: 0644]
arch/x86/kernel/nmi_32.c [new file with mode: 0644]
arch/x86/kernel/numaq_32.c [new file with mode: 0644]
arch/x86/kernel/paravirt_32.c [new file with mode: 0644]
arch/x86/kernel/pci-dma_32.c [new file with mode: 0644]
arch/x86/kernel/pcspeaker.c [new file with mode: 0644]
arch/x86/kernel/process_32.c [new file with mode: 0644]
arch/x86/kernel/ptrace_32.c [new file with mode: 0644]
arch/x86/kernel/quirks.c [new file with mode: 0644]
arch/x86/kernel/reboot_32.c [new file with mode: 0644]
arch/x86/kernel/reboot_fixups_32.c [new file with mode: 0644]
arch/x86/kernel/relocate_kernel_32.S [new file with mode: 0644]
arch/x86/kernel/scx200_32.c [new file with mode: 0644]
arch/x86/kernel/setup_32.c [new file with mode: 0644]
arch/x86/kernel/sigframe_32.h [new file with mode: 0644]
arch/x86/kernel/signal_32.c [new file with mode: 0644]
arch/x86/kernel/smp_32.c [new file with mode: 0644]
arch/x86/kernel/smpboot_32.c [new file with mode: 0644]
arch/x86/kernel/smpcommon_32.c [new file with mode: 0644]
arch/x86/kernel/srat_32.c [new file with mode: 0644]
arch/x86/kernel/summit_32.c [new file with mode: 0644]
arch/x86/kernel/sys_i386_32.c [new file with mode: 0644]
arch/x86/kernel/syscall_table_32.S [new file with mode: 0644]
arch/x86/kernel/sysenter_32.c [new file with mode: 0644]
arch/x86/kernel/time_32.c [new file with mode: 0644]
arch/x86/kernel/topology.c [new file with mode: 0644]
arch/x86/kernel/trampoline_32.S [new file with mode: 0644]
arch/x86/kernel/traps_32.c [new file with mode: 0644]
arch/x86/kernel/tsc_32.c [new file with mode: 0644]
arch/x86/kernel/tsc_sync.c [new file with mode: 0644]
arch/x86/kernel/vm86_32.c [new file with mode: 0644]
arch/x86/kernel/vmi_32.c [new file with mode: 0644]
arch/x86/kernel/vmiclock_32.c [new file with mode: 0644]
arch/x86/kernel/vmlinux.lds.S [new file with mode: 0644]
arch/x86/kernel/vmlinux_32.lds.S [new file with mode: 0644]
arch/x86/kernel/vsyscall-int80_32.S [new file with mode: 0644]
arch/x86/kernel/vsyscall-note_32.S [new file with mode: 0644]
arch/x86/kernel/vsyscall-sigreturn_32.S [new file with mode: 0644]
arch/x86/kernel/vsyscall-sysenter_32.S [new file with mode: 0644]
arch/x86/kernel/vsyscall_32.S [new file with mode: 0644]
arch/x86/kernel/vsyscall_32.lds.S [new file with mode: 0644]
arch/x86/mach-generic/Makefile
arch/x86/mach-voyager/Makefile
arch/x86_64/ia32/vsyscall-sigreturn.S
arch/x86_64/kernel/Makefile
arch/x86_64/kernel/Makefile_64

index 397cfed..9c1da72 100644 (file)
 # 20050320  Kianusch Sayah Karadji <kianusch@sk-tech.net>
 #           Added support for GEODE CPU
 
+# Fill in SRCARCH
+SRCARCH        := x86
+
+archprepare:
+       @mkdir -p ${objtree}/arch/x86/kernel
+
+
 HAS_BIARCH      := $(call cc-option-yn, -m32)
 ifeq ($(HAS_BIARCH),y)
 AS              := $(AS) --32
@@ -99,10 +106,10 @@ core-$(CONFIG_XEN)         += arch/x86/xen/
 # default subarch .h files
 mflags-y += -Iinclude/asm-i386/mach-default
 
-head-y := arch/i386/kernel/head_32.o arch/i386/kernel/init_task_32.o
+head-y := arch/x86/kernel/head_32.o arch/x86/kernel/init_task_32.o
 
 libs-y                                         += arch/x86/lib/
-core-y                                 += arch/i386/kernel/ \
+core-y                                 += arch/x86/kernel/ \
                                           arch/x86/mm/ \
                                           $(mcore-y)/ \
                                           arch/x86/crypto/
diff --git a/arch/i386/kernel/.gitignore b/arch/i386/kernel/.gitignore
deleted file mode 100644 (file)
index 40836ad..0000000
+++ /dev/null
@@ -1 +0,0 @@
-vsyscall.lds
diff --git a/arch/i386/kernel/Makefile b/arch/i386/kernel/Makefile
deleted file mode 100644 (file)
index d3ebd16..0000000
+++ /dev/null
@@ -1,5 +0,0 @@
-ifeq ($(CONFIG_X86_32),y)
-include ${srctree}/arch/i386/kernel/Makefile_32
-else
-include ${srctree}/arch/x86_64/kernel/Makefile_64
-endif
diff --git a/arch/i386/kernel/Makefile_32 b/arch/i386/kernel/Makefile_32
deleted file mode 100644 (file)
index 5096f48..0000000
+++ /dev/null
@@ -1,88 +0,0 @@
-#
-# Makefile for the linux kernel.
-#
-
-extra-y := head_32.o init_task_32.o vmlinux.lds
-
-obj-y  := process_32.o signal_32.o entry_32.o traps_32.o irq_32.o \
-               ptrace_32.o time_32.o ioport_32.o ldt_32.o setup_32.o i8259_32.o sys_i386_32.o \
-               pci-dma_32.o i386_ksyms_32.o i387_32.o bootflag.o e820_32.o\
-               quirks.o i8237.o topology.o alternative.o i8253_32.o tsc_32.o
-
-obj-$(CONFIG_STACKTRACE)       += stacktrace.o
-obj-y                          += ../../x86/kernel/cpu/
-obj-y                          += ../../x86/kernel/acpi/
-obj-$(CONFIG_X86_BIOS_REBOOT)  += reboot_32.o
-obj-$(CONFIG_MCA)              += mca_32.o
-obj-$(CONFIG_X86_MSR)          += msr.o
-obj-$(CONFIG_X86_CPUID)                += cpuid.o
-obj-$(CONFIG_MICROCODE)                += microcode.o
-obj-$(CONFIG_APM)              += apm_32.o
-obj-$(CONFIG_X86_SMP)          += smp_32.o smpboot_32.o tsc_sync.o
-obj-$(CONFIG_SMP)              += smpcommon_32.o
-obj-$(CONFIG_X86_TRAMPOLINE)   += trampoline_32.o
-obj-$(CONFIG_X86_MPPARSE)      += mpparse_32.o
-obj-$(CONFIG_X86_LOCAL_APIC)   += apic_32.o nmi_32.o
-obj-$(CONFIG_X86_IO_APIC)      += io_apic_32.o
-obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o
-obj-$(CONFIG_KEXEC)            += machine_kexec_32.o relocate_kernel_32.o crash_32.o
-obj-$(CONFIG_CRASH_DUMP)       += crash_dump_32.o
-obj-$(CONFIG_X86_NUMAQ)                += numaq_32.o
-obj-$(CONFIG_X86_SUMMIT_NUMA)  += summit_32.o
-obj-$(CONFIG_KPROBES)          += kprobes_32.o
-obj-$(CONFIG_MODULES)          += module_32.o
-obj-y                          += sysenter_32.o vsyscall_32.o
-obj-$(CONFIG_ACPI_SRAT)        += srat_32.o
-obj-$(CONFIG_EFI)              += efi_32.o efi_stub_32.o
-obj-$(CONFIG_DOUBLEFAULT)      += doublefault_32.o
-obj-$(CONFIG_VM86)             += vm86_32.o
-obj-$(CONFIG_EARLY_PRINTK)     += early_printk.o
-obj-$(CONFIG_HPET_TIMER)       += hpet_32.o
-obj-$(CONFIG_K8_NB)            += k8.o
-obj-$(CONFIG_MGEODE_LX)                += geode_32.o
-
-obj-$(CONFIG_VMI)              += vmi_32.o vmiclock_32.o
-obj-$(CONFIG_PARAVIRT)         += paravirt_32.o
-obj-y                          += pcspeaker.o
-
-obj-$(CONFIG_SCx200)           += scx200_32.o
-
-# vsyscall_32.o contains the vsyscall DSO images as __initdata.
-# We must build both images before we can assemble it.
-# Note: kbuild does not track this dependency due to usage of .incbin
-$(obj)/vsyscall_32.o: $(obj)/vsyscall-int80_32.so $(obj)/vsyscall-sysenter_32.so
-targets += $(foreach F,int80 sysenter,vsyscall-$F.o vsyscall-$F.so)
-targets += vsyscall-note_32.o vsyscall_32.lds
-
-# The DSO images are built using a special linker script.
-quiet_cmd_syscall = SYSCALL $@
-      cmd_syscall = $(CC) -m elf_i386 -nostdlib $(SYSCFLAGS_$(@F)) \
-                         -Wl,-T,$(filter-out FORCE,$^) -o $@
-
-export CPPFLAGS_vsyscall_32.lds += -P -C -U$(ARCH)
-
-vsyscall-flags = -shared -s -Wl,-soname=linux-gate.so.1 \
-                $(call ld-option, -Wl$(comma)--hash-style=sysv)
-SYSCFLAGS_vsyscall-sysenter_32.so      = $(vsyscall-flags)
-SYSCFLAGS_vsyscall-int80_32.so = $(vsyscall-flags)
-
-$(obj)/vsyscall-int80_32.so $(obj)/vsyscall-sysenter_32.so: \
-$(obj)/vsyscall-%.so: $(src)/vsyscall_32.lds \
-                     $(obj)/vsyscall-%.o $(obj)/vsyscall-note_32.o FORCE
-       $(call if_changed,syscall)
-
-# We also create a special relocatable object that should mirror the symbol
-# table and layout of the linked DSO.  With ld -R we can then refer to
-# these symbols in the kernel code rather than hand-coded addresses.
-extra-y += vsyscall-syms.o
-$(obj)/built-in.o: $(obj)/vsyscall-syms.o
-$(obj)/built-in.o: ld_flags += -R $(obj)/vsyscall-syms.o
-
-SYSCFLAGS_vsyscall-syms.o = -r
-$(obj)/vsyscall-syms.o: $(src)/vsyscall_32.lds \
-                       $(obj)/vsyscall-sysenter_32.o $(obj)/vsyscall-note_32.o FORCE
-       $(call if_changed,syscall)
-
-k8-y                      += ../../x86_64/kernel/k8.o
-stacktrace-y             += ../../x86_64/kernel/stacktrace.o
-
diff --git a/arch/i386/kernel/alternative.c b/arch/i386/kernel/alternative.c
deleted file mode 100644 (file)
index bd72d94..0000000
+++ /dev/null
@@ -1,450 +0,0 @@
-#include <linux/module.h>
-#include <linux/sched.h>
-#include <linux/spinlock.h>
-#include <linux/list.h>
-#include <linux/kprobes.h>
-#include <linux/mm.h>
-#include <linux/vmalloc.h>
-#include <asm/alternative.h>
-#include <asm/sections.h>
-#include <asm/pgtable.h>
-#include <asm/mce.h>
-#include <asm/nmi.h>
-
-#define MAX_PATCH_LEN (255-1)
-
-#ifdef CONFIG_HOTPLUG_CPU
-static int smp_alt_once;
-
-static int __init bootonly(char *str)
-{
-       smp_alt_once = 1;
-       return 1;
-}
-__setup("smp-alt-boot", bootonly);
-#else
-#define smp_alt_once 1
-#endif
-
-static int debug_alternative;
-
-static int __init debug_alt(char *str)
-{
-       debug_alternative = 1;
-       return 1;
-}
-__setup("debug-alternative", debug_alt);
-
-static int noreplace_smp;
-
-static int __init setup_noreplace_smp(char *str)
-{
-       noreplace_smp = 1;
-       return 1;
-}
-__setup("noreplace-smp", setup_noreplace_smp);
-
-#ifdef CONFIG_PARAVIRT
-static int noreplace_paravirt = 0;
-
-static int __init setup_noreplace_paravirt(char *str)
-{
-       noreplace_paravirt = 1;
-       return 1;
-}
-__setup("noreplace-paravirt", setup_noreplace_paravirt);
-#endif
-
-#define DPRINTK(fmt, args...) if (debug_alternative) \
-       printk(KERN_DEBUG fmt, args)
-
-#ifdef GENERIC_NOP1
-/* Use inline assembly to define this because the nops are defined
-   as inline assembly strings in the include files and we cannot
-   get them easily into strings. */
-asm("\t.data\nintelnops: "
-       GENERIC_NOP1 GENERIC_NOP2 GENERIC_NOP3 GENERIC_NOP4 GENERIC_NOP5 GENERIC_NOP6
-       GENERIC_NOP7 GENERIC_NOP8);
-extern unsigned char intelnops[];
-static unsigned char *intel_nops[ASM_NOP_MAX+1] = {
-       NULL,
-       intelnops,
-       intelnops + 1,
-       intelnops + 1 + 2,
-       intelnops + 1 + 2 + 3,
-       intelnops + 1 + 2 + 3 + 4,
-       intelnops + 1 + 2 + 3 + 4 + 5,
-       intelnops + 1 + 2 + 3 + 4 + 5 + 6,
-       intelnops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
-};
-#endif
-
-#ifdef K8_NOP1
-asm("\t.data\nk8nops: "
-       K8_NOP1 K8_NOP2 K8_NOP3 K8_NOP4 K8_NOP5 K8_NOP6
-       K8_NOP7 K8_NOP8);
-extern unsigned char k8nops[];
-static unsigned char *k8_nops[ASM_NOP_MAX+1] = {
-       NULL,
-       k8nops,
-       k8nops + 1,
-       k8nops + 1 + 2,
-       k8nops + 1 + 2 + 3,
-       k8nops + 1 + 2 + 3 + 4,
-       k8nops + 1 + 2 + 3 + 4 + 5,
-       k8nops + 1 + 2 + 3 + 4 + 5 + 6,
-       k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
-};
-#endif
-
-#ifdef K7_NOP1
-asm("\t.data\nk7nops: "
-       K7_NOP1 K7_NOP2 K7_NOP3 K7_NOP4 K7_NOP5 K7_NOP6
-       K7_NOP7 K7_NOP8);
-extern unsigned char k7nops[];
-static unsigned char *k7_nops[ASM_NOP_MAX+1] = {
-       NULL,
-       k7nops,
-       k7nops + 1,
-       k7nops + 1 + 2,
-       k7nops + 1 + 2 + 3,
-       k7nops + 1 + 2 + 3 + 4,
-       k7nops + 1 + 2 + 3 + 4 + 5,
-       k7nops + 1 + 2 + 3 + 4 + 5 + 6,
-       k7nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
-};
-#endif
-
-#ifdef CONFIG_X86_64
-
-extern char __vsyscall_0;
-static inline unsigned char** find_nop_table(void)
-{
-       return k8_nops;
-}
-
-#else /* CONFIG_X86_64 */
-
-static struct nop {
-       int cpuid;
-       unsigned char **noptable;
-} noptypes[] = {
-       { X86_FEATURE_K8, k8_nops },
-       { X86_FEATURE_K7, k7_nops },
-       { -1, NULL }
-};
-
-static unsigned char** find_nop_table(void)
-{
-       unsigned char **noptable = intel_nops;
-       int i;
-
-       for (i = 0; noptypes[i].cpuid >= 0; i++) {
-               if (boot_cpu_has(noptypes[i].cpuid)) {
-                       noptable = noptypes[i].noptable;
-                       break;
-               }
-       }
-       return noptable;
-}
-
-#endif /* CONFIG_X86_64 */
-
-/* Use this to add nops to a buffer, then text_poke the whole buffer. */
-static void add_nops(void *insns, unsigned int len)
-{
-       unsigned char **noptable = find_nop_table();
-
-       while (len > 0) {
-               unsigned int noplen = len;
-               if (noplen > ASM_NOP_MAX)
-                       noplen = ASM_NOP_MAX;
-               memcpy(insns, noptable[noplen], noplen);
-               insns += noplen;
-               len -= noplen;
-       }
-}
-
-extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
-extern u8 *__smp_locks[], *__smp_locks_end[];
-
-/* Replace instructions with better alternatives for this CPU type.
-   This runs before SMP is initialized to avoid SMP problems with
-   self modifying code. This implies that assymetric systems where
-   APs have less capabilities than the boot processor are not handled.
-   Tough. Make sure you disable such features by hand. */
-
-void apply_alternatives(struct alt_instr *start, struct alt_instr *end)
-{
-       struct alt_instr *a;
-       char insnbuf[MAX_PATCH_LEN];
-
-       DPRINTK("%s: alt table %p -> %p\n", __FUNCTION__, start, end);
-       for (a = start; a < end; a++) {
-               u8 *instr = a->instr;
-               BUG_ON(a->replacementlen > a->instrlen);
-               BUG_ON(a->instrlen > sizeof(insnbuf));
-               if (!boot_cpu_has(a->cpuid))
-                       continue;
-#ifdef CONFIG_X86_64
-               /* vsyscall code is not mapped yet. resolve it manually. */
-               if (instr >= (u8 *)VSYSCALL_START && instr < (u8*)VSYSCALL_END) {
-                       instr = __va(instr - (u8*)VSYSCALL_START + (u8*)__pa_symbol(&__vsyscall_0));
-                       DPRINTK("%s: vsyscall fixup: %p => %p\n",
-                               __FUNCTION__, a->instr, instr);
-               }
-#endif
-               memcpy(insnbuf, a->replacement, a->replacementlen);
-               add_nops(insnbuf + a->replacementlen,
-                        a->instrlen - a->replacementlen);
-               text_poke(instr, insnbuf, a->instrlen);
-       }
-}
-
-#ifdef CONFIG_SMP
-
-static void alternatives_smp_lock(u8 **start, u8 **end, u8 *text, u8 *text_end)
-{
-       u8 **ptr;
-
-       for (ptr = start; ptr < end; ptr++) {
-               if (*ptr < text)
-                       continue;
-               if (*ptr > text_end)
-                       continue;
-               text_poke(*ptr, ((unsigned char []){0xf0}), 1); /* add lock prefix */
-       };
-}
-
-static void alternatives_smp_unlock(u8 **start, u8 **end, u8 *text, u8 *text_end)
-{
-       u8 **ptr;
-       char insn[1];
-
-       if (noreplace_smp)
-               return;
-
-       add_nops(insn, 1);
-       for (ptr = start; ptr < end; ptr++) {
-               if (*ptr < text)
-                       continue;
-               if (*ptr > text_end)
-                       continue;
-               text_poke(*ptr, insn, 1);
-       };
-}
-
-struct smp_alt_module {
-       /* what is this ??? */
-       struct module   *mod;
-       char            *name;
-
-       /* ptrs to lock prefixes */
-       u8              **locks;
-       u8              **locks_end;
-
-       /* .text segment, needed to avoid patching init code ;) */
-       u8              *text;
-       u8              *text_end;
-
-       struct list_head next;
-};
-static LIST_HEAD(smp_alt_modules);
-static DEFINE_SPINLOCK(smp_alt);
-
-void alternatives_smp_module_add(struct module *mod, char *name,
-                                void *locks, void *locks_end,
-                                void *text,  void *text_end)
-{
-       struct smp_alt_module *smp;
-       unsigned long flags;
-
-       if (noreplace_smp)
-               return;
-
-       if (smp_alt_once) {
-               if (boot_cpu_has(X86_FEATURE_UP))
-                       alternatives_smp_unlock(locks, locks_end,
-                                               text, text_end);
-               return;
-       }
-
-       smp = kzalloc(sizeof(*smp), GFP_KERNEL);
-       if (NULL == smp)
-               return; /* we'll run the (safe but slow) SMP code then ... */
-
-       smp->mod        = mod;
-       smp->name       = name;
-       smp->locks      = locks;
-       smp->locks_end  = locks_end;
-       smp->text       = text;
-       smp->text_end   = text_end;
-       DPRINTK("%s: locks %p -> %p, text %p -> %p, name %s\n",
-               __FUNCTION__, smp->locks, smp->locks_end,
-               smp->text, smp->text_end, smp->name);
-
-       spin_lock_irqsave(&smp_alt, flags);
-       list_add_tail(&smp->next, &smp_alt_modules);
-       if (boot_cpu_has(X86_FEATURE_UP))
-               alternatives_smp_unlock(smp->locks, smp->locks_end,
-                                       smp->text, smp->text_end);
-       spin_unlock_irqrestore(&smp_alt, flags);
-}
-
-void alternatives_smp_module_del(struct module *mod)
-{
-       struct smp_alt_module *item;
-       unsigned long flags;
-
-       if (smp_alt_once || noreplace_smp)
-               return;
-
-       spin_lock_irqsave(&smp_alt, flags);
-       list_for_each_entry(item, &smp_alt_modules, next) {
-               if (mod != item->mod)
-                       continue;
-               list_del(&item->next);
-               spin_unlock_irqrestore(&smp_alt, flags);
-               DPRINTK("%s: %s\n", __FUNCTION__, item->name);
-               kfree(item);
-               return;
-       }
-       spin_unlock_irqrestore(&smp_alt, flags);
-}
-
-void alternatives_smp_switch(int smp)
-{
-       struct smp_alt_module *mod;
-       unsigned long flags;
-
-#ifdef CONFIG_LOCKDEP
-       /*
-        * A not yet fixed binutils section handling bug prevents
-        * alternatives-replacement from working reliably, so turn
-        * it off:
-        */
-       printk("lockdep: not fixing up alternatives.\n");
-       return;
-#endif
-
-       if (noreplace_smp || smp_alt_once)
-               return;
-       BUG_ON(!smp && (num_online_cpus() > 1));
-
-       spin_lock_irqsave(&smp_alt, flags);
-       if (smp) {
-               printk(KERN_INFO "SMP alternatives: switching to SMP code\n");
-               clear_bit(X86_FEATURE_UP, boot_cpu_data.x86_capability);
-               clear_bit(X86_FEATURE_UP, cpu_data[0].x86_capability);
-               list_for_each_entry(mod, &smp_alt_modules, next)
-                       alternatives_smp_lock(mod->locks, mod->locks_end,
-                                             mod->text, mod->text_end);
-       } else {
-               printk(KERN_INFO "SMP alternatives: switching to UP code\n");
-               set_bit(X86_FEATURE_UP, boot_cpu_data.x86_capability);
-               set_bit(X86_FEATURE_UP, cpu_data[0].x86_capability);
-               list_for_each_entry(mod, &smp_alt_modules, next)
-                       alternatives_smp_unlock(mod->locks, mod->locks_end,
-                                               mod->text, mod->text_end);
-       }
-       spin_unlock_irqrestore(&smp_alt, flags);
-}
-
-#endif
-
-#ifdef CONFIG_PARAVIRT
-void apply_paravirt(struct paravirt_patch_site *start,
-                   struct paravirt_patch_site *end)
-{
-       struct paravirt_patch_site *p;
-       char insnbuf[MAX_PATCH_LEN];
-
-       if (noreplace_paravirt)
-               return;
-
-       for (p = start; p < end; p++) {
-               unsigned int used;
-
-               BUG_ON(p->len > MAX_PATCH_LEN);
-               /* prep the buffer with the original instructions */
-               memcpy(insnbuf, p->instr, p->len);
-               used = paravirt_ops.patch(p->instrtype, p->clobbers, insnbuf,
-                                         (unsigned long)p->instr, p->len);
-
-               BUG_ON(used > p->len);
-
-               /* Pad the rest with nops */
-               add_nops(insnbuf + used, p->len - used);
-               text_poke(p->instr, insnbuf, p->len);
-       }
-}
-extern struct paravirt_patch_site __start_parainstructions[],
-       __stop_parainstructions[];
-#endif /* CONFIG_PARAVIRT */
-
-void __init alternative_instructions(void)
-{
-       unsigned long flags;
-
-       /* The patching is not fully atomic, so try to avoid local interruptions
-          that might execute the to be patched code.
-          Other CPUs are not running. */
-       stop_nmi();
-#ifdef CONFIG_X86_MCE
-       stop_mce();
-#endif
-
-       local_irq_save(flags);
-       apply_alternatives(__alt_instructions, __alt_instructions_end);
-
-       /* switch to patch-once-at-boottime-only mode and free the
-        * tables in case we know the number of CPUs will never ever
-        * change */
-#ifdef CONFIG_HOTPLUG_CPU
-       if (num_possible_cpus() < 2)
-               smp_alt_once = 1;
-#endif
-
-#ifdef CONFIG_SMP
-       if (smp_alt_once) {
-               if (1 == num_possible_cpus()) {
-                       printk(KERN_INFO "SMP alternatives: switching to UP code\n");
-                       set_bit(X86_FEATURE_UP, boot_cpu_data.x86_capability);
-                       set_bit(X86_FEATURE_UP, cpu_data[0].x86_capability);
-                       alternatives_smp_unlock(__smp_locks, __smp_locks_end,
-                                               _text, _etext);
-               }
-               free_init_pages("SMP alternatives",
-                               (unsigned long)__smp_locks,
-                               (unsigned long)__smp_locks_end);
-       } else {
-               alternatives_smp_module_add(NULL, "core kernel",
-                                           __smp_locks, __smp_locks_end,
-                                           _text, _etext);
-               alternatives_smp_switch(0);
-       }
-#endif
-       apply_paravirt(__parainstructions, __parainstructions_end);
-       local_irq_restore(flags);
-
-       restart_nmi();
-#ifdef CONFIG_X86_MCE
-       restart_mce();
-#endif
-}
-
-/*
- * Warning:
- * When you use this code to patch more than one byte of an instruction
- * you need to make sure that other CPUs cannot execute this code in parallel.
- * Also no thread must be currently preempted in the middle of these instructions.
- * And on the local CPU you need to be protected again NMI or MCE handlers
- * seeing an inconsistent instruction while you patch.
- */
-void __kprobes text_poke(void *addr, unsigned char *opcode, int len)
-{
-       memcpy(addr, opcode, len);
-       sync_core();
-       /* Could also do a CLFLUSH here to speed up CPU recovery; but
-          that causes hangs on some VIA CPUs. */
-}
diff --git a/arch/i386/kernel/apic_32.c b/arch/i386/kernel/apic_32.c
deleted file mode 100644 (file)
index 3d67ae1..0000000
+++ /dev/null
@@ -1,1566 +0,0 @@
-/*
- *     Local APIC handling, local APIC timers
- *
- *     (c) 1999, 2000 Ingo Molnar <mingo@redhat.com>
- *
- *     Fixes
- *     Maciej W. Rozycki       :       Bits for genuine 82489DX APICs;
- *                                     thanks to Eric Gilmore
- *                                     and Rolf G. Tews
- *                                     for testing these extensively.
- *     Maciej W. Rozycki       :       Various updates and fixes.
- *     Mikael Pettersson       :       Power Management for UP-APIC.
- *     Pavel Machek and
- *     Mikael Pettersson       :       PM converted to driver model.
- */
-
-#include <linux/init.h>
-
-#include <linux/mm.h>
-#include <linux/delay.h>
-#include <linux/bootmem.h>
-#include <linux/interrupt.h>
-#include <linux/mc146818rtc.h>
-#include <linux/kernel_stat.h>
-#include <linux/sysdev.h>
-#include <linux/cpu.h>
-#include <linux/clockchips.h>
-#include <linux/acpi_pmtmr.h>
-#include <linux/module.h>
-#include <linux/dmi.h>
-
-#include <asm/atomic.h>
-#include <asm/smp.h>
-#include <asm/mtrr.h>
-#include <asm/mpspec.h>
-#include <asm/desc.h>
-#include <asm/arch_hooks.h>
-#include <asm/hpet.h>
-#include <asm/i8253.h>
-#include <asm/nmi.h>
-
-#include <mach_apic.h>
-#include <mach_apicdef.h>
-#include <mach_ipi.h>
-
-#include "io_ports.h"
-
-/*
- * Sanity check
- */
-#if (SPURIOUS_APIC_VECTOR & 0x0F) != 0x0F
-# error SPURIOUS_APIC_VECTOR definition error
-#endif
-
-/*
- * Knob to control our willingness to enable the local APIC.
- *
- * -1=force-disable, +1=force-enable
- */
-static int enable_local_apic __initdata = 0;
-
-/* Local APIC timer verification ok */
-static int local_apic_timer_verify_ok;
-/* Disable local APIC timer from the kernel commandline or via dmi quirk
-   or using CPU MSR check */
-int local_apic_timer_disabled;
-/* Local APIC timer works in C2 */
-int local_apic_timer_c2_ok;
-EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok);
-
-/*
- * Debug level, exported for io_apic.c
- */
-int apic_verbosity;
-
-static unsigned int calibration_result;
-
-static int lapic_next_event(unsigned long delta,
-                           struct clock_event_device *evt);
-static void lapic_timer_setup(enum clock_event_mode mode,
-                             struct clock_event_device *evt);
-static void lapic_timer_broadcast(cpumask_t mask);
-static void apic_pm_activate(void);
-
-/*
- * The local apic timer can be used for any function which is CPU local.
- */
-static struct clock_event_device lapic_clockevent = {
-       .name           = "lapic",
-       .features       = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT
-                       | CLOCK_EVT_FEAT_C3STOP | CLOCK_EVT_FEAT_DUMMY,
-       .shift          = 32,
-       .set_mode       = lapic_timer_setup,
-       .set_next_event = lapic_next_event,
-       .broadcast      = lapic_timer_broadcast,
-       .rating         = 100,
-       .irq            = -1,
-};
-static DEFINE_PER_CPU(struct clock_event_device, lapic_events);
-
-/* Local APIC was disabled by the BIOS and enabled by the kernel */
-static int enabled_via_apicbase;
-
-/*
- * Get the LAPIC version
- */
-static inline int lapic_get_version(void)
-{
-       return GET_APIC_VERSION(apic_read(APIC_LVR));
-}
-
-/*
- * Check, if the APIC is integrated or a seperate chip
- */
-static inline int lapic_is_integrated(void)
-{
-       return APIC_INTEGRATED(lapic_get_version());
-}
-
-/*
- * Check, whether this is a modern or a first generation APIC
- */
-static int modern_apic(void)
-{
-       /* AMD systems use old APIC versions, so check the CPU */
-       if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
-           boot_cpu_data.x86 >= 0xf)
-               return 1;
-       return lapic_get_version() >= 0x14;
-}
-
-void apic_wait_icr_idle(void)
-{
-       while (apic_read(APIC_ICR) & APIC_ICR_BUSY)
-               cpu_relax();
-}
-
-unsigned long safe_apic_wait_icr_idle(void)
-{
-       unsigned long send_status;
-       int timeout;
-
-       timeout = 0;
-       do {
-               send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY;
-               if (!send_status)
-                       break;
-               udelay(100);
-       } while (timeout++ < 1000);
-
-       return send_status;
-}
-
-/**
- * enable_NMI_through_LVT0 - enable NMI through local vector table 0
- */
-void enable_NMI_through_LVT0 (void * dummy)
-{
-       unsigned int v = APIC_DM_NMI;
-
-       /* Level triggered for 82489DX */
-       if (!lapic_is_integrated())
-               v |= APIC_LVT_LEVEL_TRIGGER;
-       apic_write_around(APIC_LVT0, v);
-}
-
-/**
- * get_physical_broadcast - Get number of physical broadcast IDs
- */
-int get_physical_broadcast(void)
-{
-       return modern_apic() ? 0xff : 0xf;
-}
-
-/**
- * lapic_get_maxlvt - get the maximum number of local vector table entries
- */
-int lapic_get_maxlvt(void)
-{
-       unsigned int v = apic_read(APIC_LVR);
-
-       /* 82489DXs do not report # of LVT entries. */
-       return APIC_INTEGRATED(GET_APIC_VERSION(v)) ? GET_APIC_MAXLVT(v) : 2;
-}
-
-/*
- * Local APIC timer
- */
-
-/* Clock divisor is set to 16 */
-#define APIC_DIVISOR 16
-
-/*
- * This function sets up the local APIC timer, with a timeout of
- * 'clocks' APIC bus clock. During calibration we actually call
- * this function twice on the boot CPU, once with a bogus timeout
- * value, second time for real. The other (noncalibrating) CPUs
- * call this function only once, with the real, calibrated value.
- *
- * We do reads before writes even if unnecessary, to get around the
- * P5 APIC double write bug.
- */
-static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen)
-{
-       unsigned int lvtt_value, tmp_value;
-
-       lvtt_value = LOCAL_TIMER_VECTOR;
-       if (!oneshot)
-               lvtt_value |= APIC_LVT_TIMER_PERIODIC;
-       if (!lapic_is_integrated())
-               lvtt_value |= SET_APIC_TIMER_BASE(APIC_TIMER_BASE_DIV);
-
-       if (!irqen)
-               lvtt_value |= APIC_LVT_MASKED;
-
-       apic_write_around(APIC_LVTT, lvtt_value);
-
-       /*
-        * Divide PICLK by 16
-        */
-       tmp_value = apic_read(APIC_TDCR);
-       apic_write_around(APIC_TDCR, (tmp_value
-                               & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE))
-                               | APIC_TDR_DIV_16);
-
-       if (!oneshot)
-               apic_write_around(APIC_TMICT, clocks/APIC_DIVISOR);
-}
-
-/*
- * Program the next event, relative to now
- */
-static int lapic_next_event(unsigned long delta,
-                           struct clock_event_device *evt)
-{
-       apic_write_around(APIC_TMICT, delta);
-       return 0;
-}
-
-/*
- * Setup the lapic timer in periodic or oneshot mode
- */
-static void lapic_timer_setup(enum clock_event_mode mode,
-                             struct clock_event_device *evt)
-{
-       unsigned long flags;
-       unsigned int v;
-
-       /* Lapic used for broadcast ? */
-       if (!local_apic_timer_verify_ok)
-               return;
-
-       local_irq_save(flags);
-
-       switch (mode) {
-       case CLOCK_EVT_MODE_PERIODIC:
-       case CLOCK_EVT_MODE_ONESHOT:
-               __setup_APIC_LVTT(calibration_result,
-                                 mode != CLOCK_EVT_MODE_PERIODIC, 1);
-               break;
-       case CLOCK_EVT_MODE_UNUSED:
-       case CLOCK_EVT_MODE_SHUTDOWN:
-               v = apic_read(APIC_LVTT);
-               v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR);
-               apic_write_around(APIC_LVTT, v);
-               break;
-       case CLOCK_EVT_MODE_RESUME:
-               /* Nothing to do here */
-               break;
-       }
-
-       local_irq_restore(flags);
-}
-
-/*
- * Local APIC timer broadcast function
- */
-static void lapic_timer_broadcast(cpumask_t mask)
-{
-#ifdef CONFIG_SMP
-       send_IPI_mask(mask, LOCAL_TIMER_VECTOR);
-#endif
-}
-
-/*
- * Setup the local APIC timer for this CPU. Copy the initilized values
- * of the boot CPU and register the clock event in the framework.
- */
-static void __devinit setup_APIC_timer(void)
-{
-       struct clock_event_device *levt = &__get_cpu_var(lapic_events);
-
-       memcpy(levt, &lapic_clockevent, sizeof(*levt));
-       levt->cpumask = cpumask_of_cpu(smp_processor_id());
-
-       clockevents_register_device(levt);
-}
-
-/*
- * In this functions we calibrate APIC bus clocks to the external timer.
- *
- * We want to do the calibration only once since we want to have local timer
- * irqs syncron. CPUs connected by the same APIC bus have the very same bus
- * frequency.
- *
- * This was previously done by reading the PIT/HPET and waiting for a wrap
- * around to find out, that a tick has elapsed. I have a box, where the PIT
- * readout is broken, so it never gets out of the wait loop again. This was
- * also reported by others.
- *
- * Monitoring the jiffies value is inaccurate and the clockevents
- * infrastructure allows us to do a simple substitution of the interrupt
- * handler.
- *
- * The calibration routine also uses the pm_timer when possible, as the PIT
- * happens to run way too slow (factor 2.3 on my VAIO CoreDuo, which goes
- * back to normal later in the boot process).
- */
-
-#define LAPIC_CAL_LOOPS                (HZ/10)
-
-static __initdata int lapic_cal_loops = -1;
-static __initdata long lapic_cal_t1, lapic_cal_t2;
-static __initdata unsigned long long lapic_cal_tsc1, lapic_cal_tsc2;
-static __initdata unsigned long lapic_cal_pm1, lapic_cal_pm2;
-static __initdata unsigned long lapic_cal_j1, lapic_cal_j2;
-
-/*
- * Temporary interrupt handler.
- */
-static void __init lapic_cal_handler(struct clock_event_device *dev)
-{
-       unsigned long long tsc = 0;
-       long tapic = apic_read(APIC_TMCCT);
-       unsigned long pm = acpi_pm_read_early();
-
-       if (cpu_has_tsc)
-               rdtscll(tsc);
-
-       switch (lapic_cal_loops++) {
-       case 0:
-               lapic_cal_t1 = tapic;
-               lapic_cal_tsc1 = tsc;
-               lapic_cal_pm1 = pm;
-               lapic_cal_j1 = jiffies;
-               break;
-
-       case LAPIC_CAL_LOOPS:
-               lapic_cal_t2 = tapic;
-               lapic_cal_tsc2 = tsc;
-               if (pm < lapic_cal_pm1)
-                       pm += ACPI_PM_OVRRUN;
-               lapic_cal_pm2 = pm;
-               lapic_cal_j2 = jiffies;
-               break;
-       }
-}
-
-/*
- * Setup the boot APIC
- *
- * Calibrate and verify the result.
- */
-void __init setup_boot_APIC_clock(void)
-{
-       struct clock_event_device *levt = &__get_cpu_var(lapic_events);
-       const long pm_100ms = PMTMR_TICKS_PER_SEC/10;
-       const long pm_thresh = pm_100ms/100;
-       void (*real_handler)(struct clock_event_device *dev);
-       unsigned long deltaj;
-       long delta, deltapm;
-       int pm_referenced = 0;
-
-       /*
-        * The local apic timer can be disabled via the kernel
-        * commandline or from the CPU detection code. Register the lapic
-        * timer as a dummy clock event source on SMP systems, so the
-        * broadcast mechanism is used. On UP systems simply ignore it.
-        */
-       if (local_apic_timer_disabled) {
-               /* No broadcast on UP ! */
-               if (num_possible_cpus() > 1)
-                       setup_APIC_timer();
-               return;
-       }
-
-       apic_printk(APIC_VERBOSE, "Using local APIC timer interrupts.\n"
-                   "calibrating APIC timer ...\n");
-
-       local_irq_disable();
-
-       /* Replace the global interrupt handler */
-       real_handler = global_clock_event->event_handler;
-       global_clock_event->event_handler = lapic_cal_handler;
-
-       /*
-        * Setup the APIC counter to 1e9. There is no way the lapic
-        * can underflow in the 100ms detection time frame
-        */
-       __setup_APIC_LVTT(1000000000, 0, 0);
-
-       /* Let the interrupts run */
-       local_irq_enable();
-
-       while (lapic_cal_loops <= LAPIC_CAL_LOOPS)
-               cpu_relax();
-
-       local_irq_disable();
-
-       /* Restore the real event handler */
-       global_clock_event->event_handler = real_handler;
-
-       /* Build delta t1-t2 as apic timer counts down */
-       delta = lapic_cal_t1 - lapic_cal_t2;
-       apic_printk(APIC_VERBOSE, "... lapic delta = %ld\n", delta);
-
-       /* Check, if the PM timer is available */
-       deltapm = lapic_cal_pm2 - lapic_cal_pm1;
-       apic_printk(APIC_VERBOSE, "... PM timer delta = %ld\n", deltapm);
-
-       if (deltapm) {
-               unsigned long mult;
-               u64 res;
-
-               mult = clocksource_hz2mult(PMTMR_TICKS_PER_SEC, 22);
-
-               if (deltapm > (pm_100ms - pm_thresh) &&
-                   deltapm < (pm_100ms + pm_thresh)) {
-                       apic_printk(APIC_VERBOSE, "... PM timer result ok\n");
-               } else {
-                       res = (((u64) deltapm) *  mult) >> 22;
-                       do_div(res, 1000000);
-                       printk(KERN_WARNING "APIC calibration not consistent "
-                              "with PM Timer: %ldms instead of 100ms\n",
-                              (long)res);
-                       /* Correct the lapic counter value */
-                       res = (((u64) delta ) * pm_100ms);
-                       do_div(res, deltapm);
-                       printk(KERN_INFO "APIC delta adjusted to PM-Timer: "
-                              "%lu (%ld)\n", (unsigned long) res, delta);
-                       delta = (long) res;
-               }
-               pm_referenced = 1;
-       }
-
-       /* Calculate the scaled math multiplication factor */
-       lapic_clockevent.mult = div_sc(delta, TICK_NSEC * LAPIC_CAL_LOOPS, 32);
-       lapic_clockevent.max_delta_ns =
-               clockevent_delta2ns(0x7FFFFF, &lapic_clockevent);
-       lapic_clockevent.min_delta_ns =
-               clockevent_delta2ns(0xF, &lapic_clockevent);
-
-       calibration_result = (delta * APIC_DIVISOR) / LAPIC_CAL_LOOPS;
-
-       apic_printk(APIC_VERBOSE, "..... delta %ld\n", delta);
-       apic_printk(APIC_VERBOSE, "..... mult: %ld\n", lapic_clockevent.mult);
-       apic_printk(APIC_VERBOSE, "..... calibration result: %u\n",
-                   calibration_result);
-
-       if (cpu_has_tsc) {
-               delta = (long)(lapic_cal_tsc2 - lapic_cal_tsc1);
-               apic_printk(APIC_VERBOSE, "..... CPU clock speed is "
-                           "%ld.%04ld MHz.\n",
-                           (delta / LAPIC_CAL_LOOPS) / (1000000 / HZ),
-                           (delta / LAPIC_CAL_LOOPS) % (1000000 / HZ));
-       }
-
-       apic_printk(APIC_VERBOSE, "..... host bus clock speed is "
-                   "%u.%04u MHz.\n",
-                   calibration_result / (1000000 / HZ),
-                   calibration_result % (1000000 / HZ));
-
-       local_apic_timer_verify_ok = 1;
-
-       /* We trust the pm timer based calibration */
-       if (!pm_referenced) {
-               apic_printk(APIC_VERBOSE, "... verify APIC timer\n");
-
-               /*
-                * Setup the apic timer manually
-                */
-               levt->event_handler = lapic_cal_handler;
-               lapic_timer_setup(CLOCK_EVT_MODE_PERIODIC, levt);
-               lapic_cal_loops = -1;
-
-               /* Let the interrupts run */
-               local_irq_enable();
-
-               while (lapic_cal_loops <= LAPIC_CAL_LOOPS)
-                       cpu_relax();
-
-               local_irq_disable();
-
-               /* Stop the lapic timer */
-               lapic_timer_setup(CLOCK_EVT_MODE_SHUTDOWN, levt);
-
-               local_irq_enable();
-
-               /* Jiffies delta */
-               deltaj = lapic_cal_j2 - lapic_cal_j1;
-               apic_printk(APIC_VERBOSE, "... jiffies delta = %lu\n", deltaj);
-
-               /* Check, if the jiffies result is consistent */
-               if (deltaj >= LAPIC_CAL_LOOPS-2 && deltaj <= LAPIC_CAL_LOOPS+2)
-                       apic_printk(APIC_VERBOSE, "... jiffies result ok\n");
-               else
-                       local_apic_timer_verify_ok = 0;
-       } else
-               local_irq_enable();
-
-       if (!local_apic_timer_verify_ok) {
-               printk(KERN_WARNING
-                      "APIC timer disabled due to verification failure.\n");
-               /* No broadcast on UP ! */
-               if (num_possible_cpus() == 1)
-                       return;
-       } else {
-               /*
-                * If nmi_watchdog is set to IO_APIC, we need the
-                * PIT/HPET going.  Otherwise register lapic as a dummy
-                * device.
-                */
-               if (nmi_watchdog != NMI_IO_APIC)
-                       lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY;
-               else
-                       printk(KERN_WARNING "APIC timer registered as dummy,"
-                              " due to nmi_watchdog=1!\n");
-       }
-
-       /* Setup the lapic or request the broadcast */
-       setup_APIC_timer();
-}
-
-void __devinit setup_secondary_APIC_clock(void)
-{
-       setup_APIC_timer();
-}
-
-/*
- * The guts of the apic timer interrupt
- */
-static void local_apic_timer_interrupt(void)
-{
-       int cpu = smp_processor_id();
-       struct clock_event_device *evt = &per_cpu(lapic_events, cpu);
-
-       /*
-        * Normally we should not be here till LAPIC has been initialized but
-        * in some cases like kdump, its possible that there is a pending LAPIC
-        * timer interrupt from previous kernel's context and is delivered in
-        * new kernel the moment interrupts are enabled.
-        *
-        * Interrupts are enabled early and LAPIC is setup much later, hence
-        * its possible that when we get here evt->event_handler is NULL.
-        * Check for event_handler being NULL and discard the interrupt as
-        * spurious.
-        */
-       if (!evt->event_handler) {
-               printk(KERN_WARNING
-                      "Spurious LAPIC timer interrupt on cpu %d\n", cpu);
-               /* Switch it off */
-               lapic_timer_setup(CLOCK_EVT_MODE_SHUTDOWN, evt);
-               return;
-       }
-
-       per_cpu(irq_stat, cpu).apic_timer_irqs++;
-
-       evt->event_handler(evt);
-}
-
-/*
- * Local APIC timer interrupt. This is the most natural way for doing
- * local interrupts, but local timer interrupts can be emulated by
- * broadcast interrupts too. [in case the hw doesn't support APIC timers]
- *
- * [ if a single-CPU system runs an SMP kernel then we call the local
- *   interrupt as well. Thus we cannot inline the local irq ... ]
- */
-
-void fastcall smp_apic_timer_interrupt(struct pt_regs *regs)
-{
-       struct pt_regs *old_regs = set_irq_regs(regs);
-
-       /*
-        * NOTE! We'd better ACK the irq immediately,
-        * because timer handling can be slow.
-        */
-       ack_APIC_irq();
-       /*
-        * update_process_times() expects us to have done irq_enter().
-        * Besides, if we don't timer interrupts ignore the global
-        * interrupt lock, which is the WrongThing (tm) to do.
-        */
-       irq_enter();
-       local_apic_timer_interrupt();
-       irq_exit();
-
-       set_irq_regs(old_regs);
-}
-
-int setup_profiling_timer(unsigned int multiplier)
-{
-       return -EINVAL;
-}
-
-/*
- * Local APIC start and shutdown
- */
-
-/**
- * clear_local_APIC - shutdown the local APIC
- *
- * This is called, when a CPU is disabled and before rebooting, so the state of
- * the local APIC has no dangling leftovers. Also used to cleanout any BIOS
- * leftovers during boot.
- */
-void clear_local_APIC(void)
-{
-       int maxlvt = lapic_get_maxlvt();
-       unsigned long v;
-
-       /*
-        * Masking an LVT entry can trigger a local APIC error
-        * if the vector is zero. Mask LVTERR first to prevent this.
-        */
-       if (maxlvt >= 3) {
-               v = ERROR_APIC_VECTOR; /* any non-zero vector will do */
-               apic_write_around(APIC_LVTERR, v | APIC_LVT_MASKED);
-       }
-       /*
-        * Careful: we have to set masks only first to deassert
-        * any level-triggered sources.
-        */
-       v = apic_read(APIC_LVTT);
-       apic_write_around(APIC_LVTT, v | APIC_LVT_MASKED);
-       v = apic_read(APIC_LVT0);
-       apic_write_around(APIC_LVT0, v | APIC_LVT_MASKED);
-       v = apic_read(APIC_LVT1);
-       apic_write_around(APIC_LVT1, v | APIC_LVT_MASKED);
-       if (maxlvt >= 4) {
-               v = apic_read(APIC_LVTPC);
-               apic_write_around(APIC_LVTPC, v | APIC_LVT_MASKED);
-       }
-
-       /* lets not touch this if we didn't frob it */
-#ifdef CONFIG_X86_MCE_P4THERMAL
-       if (maxlvt >= 5) {
-               v = apic_read(APIC_LVTTHMR);
-               apic_write_around(APIC_LVTTHMR, v | APIC_LVT_MASKED);
-       }
-#endif
-       /*
-        * Clean APIC state for other OSs:
-        */
-       apic_write_around(APIC_LVTT, APIC_LVT_MASKED);
-       apic_write_around(APIC_LVT0, APIC_LVT_MASKED);
-       apic_write_around(APIC_LVT1, APIC_LVT_MASKED);
-       if (maxlvt >= 3)
-               apic_write_around(APIC_LVTERR, APIC_LVT_MASKED);
-       if (maxlvt >= 4)
-               apic_write_around(APIC_LVTPC, APIC_LVT_MASKED);
-
-#ifdef CONFIG_X86_MCE_P4THERMAL
-       if (maxlvt >= 5)
-               apic_write_around(APIC_LVTTHMR, APIC_LVT_MASKED);
-#endif
-       /* Integrated APIC (!82489DX) ? */
-       if (lapic_is_integrated()) {
-               if (maxlvt > 3)
-                       /* Clear ESR due to Pentium errata 3AP and 11AP */
-                       apic_write(APIC_ESR, 0);
-               apic_read(APIC_ESR);
-       }
-}
-
-/**
- * disable_local_APIC - clear and disable the local APIC
- */
-void disable_local_APIC(void)
-{
-       unsigned long value;
-
-       clear_local_APIC();
-
-       /*
-        * Disable APIC (implies clearing of registers
-        * for 82489DX!).
-        */
-       value = apic_read(APIC_SPIV);
-       value &= ~APIC_SPIV_APIC_ENABLED;
-       apic_write_around(APIC_SPIV, value);
-
-       /*
-        * When LAPIC was disabled by the BIOS and enabled by the kernel,
-        * restore the disabled state.
-        */
-       if (enabled_via_apicbase) {
-               unsigned int l, h;
-
-               rdmsr(MSR_IA32_APICBASE, l, h);
-               l &= ~MSR_IA32_APICBASE_ENABLE;
-               wrmsr(MSR_IA32_APICBASE, l, h);
-       }
-}
-
-/*
- * If Linux enabled the LAPIC against the BIOS default disable it down before
- * re-entering the BIOS on shutdown.  Otherwise the BIOS may get confused and
- * not power-off.  Additionally clear all LVT entries before disable_local_APIC
- * for the case where Linux didn't enable the LAPIC.
- */
-void lapic_shutdown(void)
-{
-       unsigned long flags;
-
-       if (!cpu_has_apic)
-               return;
-
-       local_irq_save(flags);
-       clear_local_APIC();
-
-       if (enabled_via_apicbase)
-               disable_local_APIC();
-
-       local_irq_restore(flags);
-}
-
-/*
- * This is to verify that we're looking at a real local APIC.
- * Check these against your board if the CPUs aren't getting
- * started for no apparent reason.
- */
-int __init verify_local_APIC(void)
-{
-       unsigned int reg0, reg1;
-
-       /*
-        * The version register is read-only in a real APIC.
-        */
-       reg0 = apic_read(APIC_LVR);
-       apic_printk(APIC_DEBUG, "Getting VERSION: %x\n", reg0);
-       apic_write(APIC_LVR, reg0 ^ APIC_LVR_MASK);
-       reg1 = apic_read(APIC_LVR);
-       apic_printk(APIC_DEBUG, "Getting VERSION: %x\n", reg1);
-
-       /*
-        * The two version reads above should print the same
-        * numbers.  If the second one is different, then we
-        * poke at a non-APIC.
-        */
-       if (reg1 != reg0)
-               return 0;
-
-       /*
-        * Check if the version looks reasonably.
-        */
-       reg1 = GET_APIC_VERSION(reg0);
-       if (reg1 == 0x00 || reg1 == 0xff)
-               return 0;
-       reg1 = lapic_get_maxlvt();
-       if (reg1 < 0x02 || reg1 == 0xff)
-               return 0;
-
-       /*
-        * The ID register is read/write in a real APIC.
-        */
-       reg0 = apic_read(APIC_ID);
-       apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg0);
-
-       /*
-        * The next two are just to see if we have sane values.
-        * They're only really relevant if we're in Virtual Wire
-        * compatibility mode, but most boxes are anymore.
-        */
-       reg0 = apic_read(APIC_LVT0);
-       apic_printk(APIC_DEBUG, "Getting LVT0: %x\n", reg0);
-       reg1 = apic_read(APIC_LVT1);
-       apic_printk(APIC_DEBUG, "Getting LVT1: %x\n", reg1);
-
-       return 1;
-}
-
-/**
- * sync_Arb_IDs - synchronize APIC bus arbitration IDs
- */
-void __init sync_Arb_IDs(void)
-{
-       /*
-        * Unsupported on P4 - see Intel Dev. Manual Vol. 3, Ch. 8.6.1 And not
-        * needed on AMD.
-        */
-       if (modern_apic())
-               return;
-       /*
-        * Wait for idle.
-        */
-       apic_wait_icr_idle();
-
-       apic_printk(APIC_DEBUG, "Synchronizing Arb IDs.\n");
-       apic_write_around(APIC_ICR, APIC_DEST_ALLINC | APIC_INT_LEVELTRIG
-                               | APIC_DM_INIT);
-}
-
-/*
- * An initial setup of the virtual wire mode.
- */
-void __init init_bsp_APIC(void)
-{
-       unsigned long value;
-
-       /*
-        * Don't do the setup now if we have a SMP BIOS as the
-        * through-I/O-APIC virtual wire mode might be active.
-        */
-       if (smp_found_config || !cpu_has_apic)
-               return;
-
-       /*
-        * Do not trust the local APIC being empty at bootup.
-        */
-       clear_local_APIC();
-
-       /*
-        * Enable APIC.
-        */
-       value = apic_read(APIC_SPIV);
-       value &= ~APIC_VECTOR_MASK;
-       value |= APIC_SPIV_APIC_ENABLED;
-
-       /* This bit is reserved on P4/Xeon and should be cleared */
-       if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) &&
-           (boot_cpu_data.x86 == 15))
-               value &= ~APIC_SPIV_FOCUS_DISABLED;
-       else
-               value |= APIC_SPIV_FOCUS_DISABLED;
-       value |= SPURIOUS_APIC_VECTOR;
-       apic_write_around(APIC_SPIV, value);
-
-       /*
-        * Set up the virtual wire mode.
-        */
-       apic_write_around(APIC_LVT0, APIC_DM_EXTINT);
-       value = APIC_DM_NMI;
-       if (!lapic_is_integrated())             /* 82489DX */
-               value |= APIC_LVT_LEVEL_TRIGGER;
-       apic_write_around(APIC_LVT1, value);
-}
-
-/**
- * setup_local_APIC - setup the local APIC
- */
-void __devinit setup_local_APIC(void)
-{
-       unsigned long oldvalue, value, maxlvt, integrated;
-       int i, j;
-
-       /* Pound the ESR really hard over the head with a big hammer - mbligh */
-       if (esr_disable) {
-               apic_write(APIC_ESR, 0);
-               apic_write(APIC_ESR, 0);
-               apic_write(APIC_ESR, 0);
-               apic_write(APIC_ESR, 0);
-       }
-
-       integrated = lapic_is_integrated();
-
-       /*
-        * Double-check whether this APIC is really registered.
-        */
-       if (!apic_id_registered())
-               BUG();
-
-       /*
-        * Intel recommends to set DFR, LDR and TPR before enabling
-        * an APIC.  See e.g. "AP-388 82489DX User's Manual" (Intel
-        * document number 292116).  So here it goes...
-        */
-       init_apic_ldr();
-
-       /*
-        * Set Task Priority to 'accept all'. We never change this
-        * later on.
-        */
-       value = apic_read(APIC_TASKPRI);
-       value &= ~APIC_TPRI_MASK;
-       apic_write_around(APIC_TASKPRI, value);
-
-       /*
-        * After a crash, we no longer service the interrupts and a pending
-        * interrupt from previous kernel might still have ISR bit set.
-        *
-        * Most probably by now CPU has serviced that pending interrupt and
-        * it might not have done the ack_APIC_irq() because it thought,
-        * interrupt came from i8259 as ExtInt. LAPIC did not get EOI so it
-        * does not clear the ISR bit and cpu thinks it has already serivced
-        * the interrupt. Hence a vector might get locked. It was noticed
-        * for timer irq (vector 0x31). Issue an extra EOI to clear ISR.
-        */
-       for (i = APIC_ISR_NR - 1; i >= 0; i--) {
-               value = apic_read(APIC_ISR + i*0x10);
-               for (j = 31; j >= 0; j--) {
-                       if (value & (1<<j))
-                               ack_APIC_irq();
-               }
-       }
-
-       /*
-        * Now that we are all set up, enable the APIC
-        */
-       value = apic_read(APIC_SPIV);
-       value &= ~APIC_VECTOR_MASK;
-       /*
-        * Enable APIC
-        */
-       value |= APIC_SPIV_APIC_ENABLED;
-
-       /*
-        * Some unknown Intel IO/APIC (or APIC) errata is biting us with
-        * certain networking cards. If high frequency interrupts are
-        * happening on a particular IOAPIC pin, plus the IOAPIC routing
-        * entry is masked/unmasked at a high rate as well then sooner or
-        * later IOAPIC line gets 'stuck', no more interrupts are received
-        * from the device. If focus CPU is disabled then the hang goes
-        * away, oh well :-(
-        *
-        * [ This bug can be reproduced easily with a level-triggered
-        *   PCI Ne2000 networking cards and PII/PIII processors, dual
-        *   BX chipset. ]
-        */
-       /*
-        * Actually disabling the focus CPU check just makes the hang less
-        * frequent as it makes the interrupt distributon model be more
-        * like LRU than MRU (the short-term load is more even across CPUs).
-        * See also the comment in end_level_ioapic_irq().  --macro
-        */
-
-       /* Enable focus processor (bit==0) */
-       value &= ~APIC_SPIV_FOCUS_DISABLED;
-
-       /*
-        * Set spurious IRQ vector
-        */
-       value |= SPURIOUS_APIC_VECTOR;
-       apic_write_around(APIC_SPIV, value);
-
-       /*
-        * Set up LVT0, LVT1:
-        *
-        * set up through-local-APIC on the BP's LINT0. This is not
-        * strictly necessery in pure symmetric-IO mode, but sometimes
-        * we delegate interrupts to the 8259A.
-        */
-       /*
-        * TODO: set up through-local-APIC from through-I/O-APIC? --macro
-        */
-       value = apic_read(APIC_LVT0) & APIC_LVT_MASKED;
-       if (!smp_processor_id() && (pic_mode || !value)) {
-               value = APIC_DM_EXTINT;
-               apic_printk(APIC_VERBOSE, "enabled ExtINT on CPU#%d\n",
-                               smp_processor_id());
-       } else {
-               value = APIC_DM_EXTINT | APIC_LVT_MASKED;
-               apic_printk(APIC_VERBOSE, "masked ExtINT on CPU#%d\n",
-                               smp_processor_id());
-       }
-       apic_write_around(APIC_LVT0, value);
-
-       /*
-        * only the BP should see the LINT1 NMI signal, obviously.
-        */
-       if (!smp_processor_id())
-               value = APIC_DM_NMI;
-       else
-               value = APIC_DM_NMI | APIC_LVT_MASKED;
-       if (!integrated)                /* 82489DX */
-               value |= APIC_LVT_LEVEL_TRIGGER;
-       apic_write_around(APIC_LVT1, value);
-
-       if (integrated && !esr_disable) {               /* !82489DX */
-               maxlvt = lapic_get_maxlvt();
-               if (maxlvt > 3)         /* Due to the Pentium erratum 3AP. */
-                       apic_write(APIC_ESR, 0);
-               oldvalue = apic_read(APIC_ESR);
-
-               /* enables sending errors */
-               value = ERROR_APIC_VECTOR;
-               apic_write_around(APIC_LVTERR, value);
-               /*
-                * spec says clear errors after enabling vector.
-                */
-               if (maxlvt > 3)
-                       apic_write(APIC_ESR, 0);
-               value = apic_read(APIC_ESR);
-               if (value != oldvalue)
-                       apic_printk(APIC_VERBOSE, "ESR value before enabling "
-                               "vector: 0x%08lx  after: 0x%08lx\n",
-                               oldvalue, value);
-       } else {
-               if (esr_disable)
-                       /*
-                        * Something untraceble is creating bad interrupts on
-                        * secondary quads ... for the moment, just leave the
-                        * ESR disabled - we can't do anything useful with the
-                        * errors anyway - mbligh
-                        */
-                       printk(KERN_INFO "Leaving ESR disabled.\n");
-               else
-                       printk(KERN_INFO "No ESR for 82489DX.\n");
-       }
-
-       /* Disable the local apic timer */
-       value = apic_read(APIC_LVTT);
-       value |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR);
-       apic_write_around(APIC_LVTT, value);
-
-       setup_apic_nmi_watchdog(NULL);
-       apic_pm_activate();
-}
-
-/*
- * Detect and initialize APIC
- */
-static int __init detect_init_APIC (void)
-{
-       u32 h, l, features;
-
-       /* Disabled by kernel option? */
-       if (enable_local_apic < 0)
-               return -1;
-
-       switch (boot_cpu_data.x86_vendor) {
-       case X86_VENDOR_AMD:
-               if ((boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model > 1) ||
-                   (boot_cpu_data.x86 == 15))
-                       break;
-               goto no_apic;
-       case X86_VENDOR_INTEL:
-               if (boot_cpu_data.x86 == 6 || boot_cpu_data.x86 == 15 ||
-                   (boot_cpu_data.x86 == 5 && cpu_has_apic))
-                       break;
-               goto no_apic;
-       default:
-               goto no_apic;
-       }
-
-       if (!cpu_has_apic) {
-               /*
-                * Over-ride BIOS and try to enable the local APIC only if
-                * "lapic" specified.
-                */
-               if (enable_local_apic <= 0) {
-                       printk(KERN_INFO "Local APIC disabled by BIOS -- "
-                              "you can enable it with \"lapic\"\n");
-                       return -1;
-               }
-               /*
-                * Some BIOSes disable the local APIC in the APIC_BASE
-                * MSR. This can only be done in software for Intel P6 or later
-                * and AMD K7 (Model > 1) or later.
-                */
-               rdmsr(MSR_IA32_APICBASE, l, h);
-               if (!(l & MSR_IA32_APICBASE_ENABLE)) {
-                       printk(KERN_INFO
-                              "Local APIC disabled by BIOS -- reenabling.\n");
-                       l &= ~MSR_IA32_APICBASE_BASE;
-                       l |= MSR_IA32_APICBASE_ENABLE | APIC_DEFAULT_PHYS_BASE;
-                       wrmsr(MSR_IA32_APICBASE, l, h);
-                       enabled_via_apicbase = 1;
-               }
-       }
-       /*
-        * The APIC feature bit should now be enabled
-        * in `cpuid'
-        */
-       features = cpuid_edx(1);
-       if (!(features & (1 << X86_FEATURE_APIC))) {
-               printk(KERN_WARNING "Could not enable APIC!\n");
-               return -1;
-       }
-       set_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability);
-       mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
-
-       /* The BIOS may have set up the APIC at some other address */
-       rdmsr(MSR_IA32_APICBASE, l, h);
-       if (l & MSR_IA32_APICBASE_ENABLE)
-               mp_lapic_addr = l & MSR_IA32_APICBASE_BASE;
-
-       if (nmi_watchdog != NMI_NONE && nmi_watchdog != NMI_DISABLED)
-               nmi_watchdog = NMI_LOCAL_APIC;
-
-       printk(KERN_INFO "Found and enabled local APIC!\n");
-
-       apic_pm_activate();
-
-       return 0;
-
-no_apic:
-       printk(KERN_INFO "No local APIC present or hardware disabled\n");
-       return -1;
-}
-
-/**
- * init_apic_mappings - initialize APIC mappings
- */
-void __init init_apic_mappings(void)
-{
-       unsigned long apic_phys;
-
-       /*
-        * If no local APIC can be found then set up a fake all
-        * zeroes page to simulate the local APIC and another
-        * one for the IO-APIC.
-        */
-       if (!smp_found_config && detect_init_APIC()) {
-               apic_phys = (unsigned long) alloc_bootmem_pages(PAGE_SIZE);
-               apic_phys = __pa(apic_phys);
-       } else
-               apic_phys = mp_lapic_addr;
-
-       set_fixmap_nocache(FIX_APIC_BASE, apic_phys);
-       printk(KERN_DEBUG "mapped APIC to %08lx (%08lx)\n", APIC_BASE,
-              apic_phys);
-
-       /*
-        * Fetch the APIC ID of the BSP in case we have a
-        * default configuration (or the MP table is broken).
-        */
-       if (boot_cpu_physical_apicid == -1U)
-               boot_cpu_physical_apicid = GET_APIC_ID(apic_read(APIC_ID));
-
-#ifdef CONFIG_X86_IO_APIC
-       {
-               unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0;
-               int i;
-
-               for (i = 0; i < nr_ioapics; i++) {
-                       if (smp_found_config) {
-                               ioapic_phys = mp_ioapics[i].mpc_apicaddr;
-                               if (!ioapic_phys) {
-                                       printk(KERN_ERR
-                                              "WARNING: bogus zero IO-APIC "
-                                              "address found in MPTABLE, "
-                                              "disabling IO/APIC support!\n");
-                                       smp_found_config = 0;
-                                       skip_ioapic_setup = 1;
-                                       goto fake_ioapic_page;
-                               }
-                       } else {
-fake_ioapic_page:
-                               ioapic_phys = (unsigned long)
-                                             alloc_bootmem_pages(PAGE_SIZE);
-                               ioapic_phys = __pa(ioapic_phys);
-                       }
-                       set_fixmap_nocache(idx, ioapic_phys);
-                       printk(KERN_DEBUG "mapped IOAPIC to %08lx (%08lx)\n",
-                              __fix_to_virt(idx), ioapic_phys);
-                       idx++;
-               }
-       }
-#endif
-}
-
-/*
- * This initializes the IO-APIC and APIC hardware if this is
- * a UP kernel.
- */
-int __init APIC_init_uniprocessor (void)
-{
-       if (enable_local_apic < 0)
-               clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability);
-
-       if (!smp_found_config && !cpu_has_apic)
-               return -1;
-
-       /*
-        * Complain if the BIOS pretends there is one.
-        */
-       if (!cpu_has_apic &&
-           APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) {
-               printk(KERN_ERR "BIOS bug, local APIC #%d not detected!...\n",
-                      boot_cpu_physical_apicid);
-               clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability);
-               return -1;
-       }
-
-       verify_local_APIC();
-
-       connect_bsp_APIC();
-
-       /*
-        * Hack: In case of kdump, after a crash, kernel might be booting
-        * on a cpu with non-zero lapic id. But boot_cpu_physical_apicid
-        * might be zero if read from MP tables. Get it from LAPIC.
-        */
-#ifdef CONFIG_CRASH_DUMP
-       boot_cpu_physical_apicid = GET_APIC_ID(apic_read(APIC_ID));
-#endif
-       phys_cpu_present_map = physid_mask_of_physid(boot_cpu_physical_apicid);
-
-       setup_local_APIC();
-
-#ifdef CONFIG_X86_IO_APIC
-       if (smp_found_config)
-               if (!skip_ioapic_setup && nr_ioapics)
-                       setup_IO_APIC();
-#endif
-       setup_boot_clock();
-
-       return 0;
-}
-
-/*
- * APIC command line parameters
- */
-static int __init parse_lapic(char *arg)
-{
-       enable_local_apic = 1;
-       return 0;
-}
-early_param("lapic", parse_lapic);
-
-static int __init parse_nolapic(char *arg)
-{
-       enable_local_apic = -1;
-       clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability);
-       return 0;
-}
-early_param("nolapic", parse_nolapic);
-
-static int __init parse_disable_lapic_timer(char *arg)
-{
-       local_apic_timer_disabled = 1;
-       return 0;
-}
-early_param("nolapic_timer", parse_disable_lapic_timer);
-
-static int __init parse_lapic_timer_c2_ok(char *arg)
-{
-       local_apic_timer_c2_ok = 1;
-       return 0;
-}
-early_param("lapic_timer_c2_ok", parse_lapic_timer_c2_ok);
-
-static int __init apic_set_verbosity(char *str)
-{
-       if (strcmp("debug", str) == 0)
-               apic_verbosity = APIC_DEBUG;
-       else if (strcmp("verbose", str) == 0)
-               apic_verbosity = APIC_VERBOSE;
-       return 1;
-}
-
-__setup("apic=", apic_set_verbosity);
-
-
-/*
- * Local APIC interrupts
- */
-
-/*
- * This interrupt should _never_ happen with our APIC/SMP architecture
- */
-void smp_spurious_interrupt(struct pt_regs *regs)
-{
-       unsigned long v;
-
-       irq_enter();
-       /*
-        * Check if this really is a spurious interrupt and ACK it
-        * if it is a vectored one.  Just in case...
-        * Spurious interrupts should not be ACKed.
-        */
-       v = apic_read(APIC_ISR + ((SPURIOUS_APIC_VECTOR & ~0x1f) >> 1));
-       if (v & (1 << (SPURIOUS_APIC_VECTOR & 0x1f)))
-               ack_APIC_irq();
-
-       /* see sw-dev-man vol 3, chapter 7.4.13.5 */
-       printk(KERN_INFO "spurious APIC interrupt on CPU#%d, "
-              "should never happen.\n", smp_processor_id());
-       irq_exit();
-}
-
-/*
- * This interrupt should never happen with our APIC/SMP architecture
- */
-void smp_error_interrupt(struct pt_regs *regs)
-{
-       unsigned long v, v1;
-
-       irq_enter();
-       /* First tickle the hardware, only then report what went on. -- REW */
-       v = apic_read(APIC_ESR);
-       apic_write(APIC_ESR, 0);
-       v1 = apic_read(APIC_ESR);
-       ack_APIC_irq();
-       atomic_inc(&irq_err_count);
-
-       /* Here is what the APIC error bits mean:
-          0: Send CS error
-          1: Receive CS error
-          2: Send accept error
-          3: Receive accept error
-          4: Reserved
-          5: Send illegal vector
-          6: Received illegal vector
-          7: Illegal register address
-       */
-       printk (KERN_DEBUG "APIC error on CPU%d: %02lx(%02lx)\n",
-               smp_processor_id(), v , v1);
-       irq_exit();
-}
-
-/*
- * Initialize APIC interrupts
- */
-void __init apic_intr_init(void)
-{
-#ifdef CONFIG_SMP
-       smp_intr_init();
-#endif
-       /* self generated IPI for local APIC timer */
-       set_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
-
-       /* IPI vectors for APIC spurious and error interrupts */
-       set_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
-       set_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
-
-       /* thermal monitor LVT interrupt */
-#ifdef CONFIG_X86_MCE_P4THERMAL
-       set_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
-#endif
-}
-
-/**
- * connect_bsp_APIC - attach the APIC to the interrupt system
- */
-void __init connect_bsp_APIC(void)
-{
-       if (pic_mode) {
-               /*
-                * Do not trust the local APIC being empty at bootup.
-                */
-               clear_local_APIC();
-               /*
-                * PIC mode, enable APIC mode in the IMCR, i.e.  connect BSP's
-                * local APIC to INT and NMI lines.
-                */
-               apic_printk(APIC_VERBOSE, "leaving PIC mode, "
-                               "enabling APIC mode.\n");
-               outb(0x70, 0x22);
-               outb(0x01, 0x23);
-       }
-       enable_apic_mode();
-}
-
-/**
- * disconnect_bsp_APIC - detach the APIC from the interrupt system
- * @virt_wire_setup:   indicates, whether virtual wire mode is selected
- *
- * Virtual wire mode is necessary to deliver legacy interrupts even when the
- * APIC is disabled.
- */
-void disconnect_bsp_APIC(int virt_wire_setup)
-{
-       if (pic_mode) {
-               /*
-                * Put the board back into PIC mode (has an effect only on
-                * certain older boards).  Note that APIC interrupts, including
-                * IPIs, won't work beyond this point!  The only exception are
-                * INIT IPIs.
-                */
-               apic_printk(APIC_VERBOSE, "disabling APIC mode, "
-                               "entering PIC mode.\n");
-               outb(0x70, 0x22);
-               outb(0x00, 0x23);
-       } else {
-               /* Go back to Virtual Wire compatibility mode */
-               unsigned long value;
-
-               /* For the spurious interrupt use vector F, and enable it */
-               value = apic_read(APIC_SPIV);
-               value &= ~APIC_VECTOR_MASK;
-               value |= APIC_SPIV_APIC_ENABLED;
-               value |= 0xf;
-               apic_write_around(APIC_SPIV, value);
-
-               if (!virt_wire_setup) {
-                       /*
-                        * For LVT0 make it edge triggered, active high,
-                        * external and enabled
-                        */
-                       value = apic_read(APIC_LVT0);
-                       value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING |
-                               APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
-                               APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED );
-                       value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
-                       value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT);
-                       apic_write_around(APIC_LVT0, value);
-               } else {
-                       /* Disable LVT0 */
-                       apic_write_around(APIC_LVT0, APIC_LVT_MASKED);
-               }
-
-               /*
-                * For LVT1 make it edge triggered, active high, nmi and
-                * enabled
-                */
-               value = apic_read(APIC_LVT1);
-               value &= ~(
-                       APIC_MODE_MASK | APIC_SEND_PENDING |
-                       APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
-                       APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED);
-               value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
-               value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI);
-               apic_write_around(APIC_LVT1, value);
-       }
-}
-
-/*
- * Power management
- */
-#ifdef CONFIG_PM
-
-static struct {
-       int active;
-       /* r/w apic fields */
-       unsigned int apic_id;
-       unsigned int apic_taskpri;
-       unsigned int apic_ldr;
-       unsigned int apic_dfr;
-       unsigned int apic_spiv;
-       unsigned int apic_lvtt;
-       unsigned int apic_lvtpc;
-       unsigned int apic_lvt0;
-       unsigned int apic_lvt1;
-       unsigned int apic_lvterr;
-       unsigned int apic_tmict;
-       unsigned int apic_tdcr;
-       unsigned int apic_thmr;
-} apic_pm_state;
-
-static int lapic_suspend(struct sys_device *dev, pm_message_t state)
-{
-       unsigned long flags;
-       int maxlvt;
-
-       if (!apic_pm_state.active)
-               return 0;
-
-       maxlvt = lapic_get_maxlvt();
-
-       apic_pm_state.apic_id = apic_read(APIC_ID);
-       apic_pm_state.apic_taskpri = apic_read(APIC_TASKPRI);
-       apic_pm_state.apic_ldr = apic_read(APIC_LDR);
-       apic_pm_state.apic_dfr = apic_read(APIC_DFR);
-       apic_pm_state.apic_spiv = apic_read(APIC_SPIV);
-       apic_pm_state.apic_lvtt = apic_read(APIC_LVTT);
-       if (maxlvt >= 4)
-               apic_pm_state.apic_lvtpc = apic_read(APIC_LVTPC);
-       apic_pm_state.apic_lvt0 = apic_read(APIC_LVT0);
-       apic_pm_state.apic_lvt1 = apic_read(APIC_LVT1);
-       apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR);
-       apic_pm_state.apic_tmict = apic_read(APIC_TMICT);
-       apic_pm_state.apic_tdcr = apic_read(APIC_TDCR);
-#ifdef CONFIG_X86_MCE_P4THERMAL
-       if (maxlvt >= 5)
-               apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR);
-#endif
-
-       local_irq_save(flags);
-       disable_local_APIC();
-       local_irq_restore(flags);
-       return 0;
-}
-
-static int lapic_resume(struct sys_device *dev)
-{
-       unsigned int l, h;
-       unsigned long flags;
-       int maxlvt;
-
-       if (!apic_pm_state.active)
-               return 0;
-
-       maxlvt = lapic_get_maxlvt();
-
-       local_irq_save(flags);
-
-       /*
-        * Make sure the APICBASE points to the right address
-        *
-        * FIXME! This will be wrong if we ever support suspend on
-        * SMP! We'll need to do this as part of the CPU restore!
-        */
-       rdmsr(MSR_IA32_APICBASE, l, h);
-       l &= ~MSR_IA32_APICBASE_BASE;
-       l |= MSR_IA32_APICBASE_ENABLE | mp_lapic_addr;
-       wrmsr(MSR_IA32_APICBASE, l, h);
-
-       apic_write(APIC_LVTERR, ERROR_APIC_VECTOR | APIC_LVT_MASKED);
-       apic_write(APIC_ID, apic_pm_state.apic_id);
-       apic_write(APIC_DFR, apic_pm_state.apic_dfr);
-       apic_write(APIC_LDR, apic_pm_state.apic_ldr);
-       apic_write(APIC_TASKPRI, apic_pm_state.apic_taskpri);
-       apic_write(APIC_SPIV, apic_pm_state.apic_spiv);
-       apic_write(APIC_LVT0, apic_pm_state.apic_lvt0);
-       apic_write(APIC_LVT1, apic_pm_state.apic_lvt1);
-#ifdef CONFIG_X86_MCE_P4THERMAL
-       if (maxlvt >= 5)
-               apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr);
-#endif
-       if (maxlvt >= 4)
-               apic_write(APIC_LVTPC, apic_pm_state.apic_lvtpc);
-       apic_write(APIC_LVTT, apic_pm_state.apic_lvtt);
-       apic_write(APIC_TDCR, apic_pm_state.apic_tdcr);
-       apic_write(APIC_TMICT, apic_pm_state.apic_tmict);
-       apic_write(APIC_ESR, 0);
-       apic_read(APIC_ESR);
-       apic_write(APIC_LVTERR, apic_pm_state.apic_lvterr);
-       apic_write(APIC_ESR, 0);
-       apic_read(APIC_ESR);
-       local_irq_restore(flags);
-       return 0;
-}
-
-/*
- * This device has no shutdown method - fully functioning local APICs
- * are needed on every CPU up until machine_halt/restart/poweroff.
- */
-
-static struct sysdev_class lapic_sysclass = {
-       set_kset_name("lapic"),
-       .resume         = lapic_resume,
-       .suspend        = lapic_suspend,
-};
-
-static struct sys_device device_lapic = {
-       .id     = 0,
-       .cls    = &lapic_sysclass,
-};
-
-static void __devinit apic_pm_activate(void)
-{
-       apic_pm_state.active = 1;
-}
-
-static int __init init_lapic_sysfs(void)
-{
-       int error;
-
-       if (!cpu_has_apic)
-               return 0;
-       /* XXX: remove suspend/resume procs if !apic_pm_state.active? */
-
-       error = sysdev_class_register(&lapic_sysclass);
-       if (!error)
-               error = sysdev_register(&device_lapic);
-       return error;
-}
-device_initcall(init_lapic_sysfs);
-
-#else  /* CONFIG_PM */
-
-static void apic_pm_activate(void) { }
-
-#endif /* CONFIG_PM */
diff --git a/arch/i386/kernel/apm_32.c b/arch/i386/kernel/apm_32.c
deleted file mode 100644 (file)
index f02a8ac..0000000
+++ /dev/null
@@ -1,2403 +0,0 @@
-/* -*- linux-c -*-
- * APM BIOS driver for Linux
- * Copyright 1994-2001 Stephen Rothwell (sfr@canb.auug.org.au)
- *
- * Initial development of this driver was funded by NEC Australia P/L
- *     and NEC Corporation
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by the
- * Free Software Foundation; either version 2, or (at your option) any
- * later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * October 1995, Rik Faith (faith@cs.unc.edu):
- *    Minor enhancements and updates (to the patch set) for 1.3.x
- *    Documentation
- * January 1996, Rik Faith (faith@cs.unc.edu):
- *    Make /proc/apm easy to format (bump driver version)
- * March 1996, Rik Faith (faith@cs.unc.edu):
- *    Prohibit APM BIOS calls unless apm_enabled.
- *    (Thanks to Ulrich Windl <Ulrich.Windl@rz.uni-regensburg.de>)
- * April 1996, Stephen Rothwell (sfr@canb.auug.org.au)
- *    Version 1.0 and 1.1
- * May 1996, Version 1.2
- * Feb 1998, Version 1.3
- * Feb 1998, Version 1.4
- * Aug 1998, Version 1.5
- * Sep 1998, Version 1.6
- * Nov 1998, Version 1.7
- * Jan 1999, Version 1.8
- * Jan 1999, Version 1.9
- * Oct 1999, Version 1.10
- * Nov 1999, Version 1.11
- * Jan 2000, Version 1.12
- * Feb 2000, Version 1.13
- * Nov 2000, Version 1.14
- * Oct 2001, Version 1.15
- * Jan 2002, Version 1.16
- * Oct 2002, Version 1.16ac
- *
- * History:
- *    0.6b: first version in official kernel, Linux 1.3.46
- *    0.7: changed /proc/apm format, Linux 1.3.58
- *    0.8: fixed gcc 2.7.[12] compilation problems, Linux 1.3.59
- *    0.9: only call bios if bios is present, Linux 1.3.72
- *    1.0: use fixed device number, consolidate /proc/apm into this file,
- *         Linux 1.3.85
- *    1.1: support user-space standby and suspend, power off after system
- *         halted, Linux 1.3.98
- *    1.2: When resetting RTC after resume, take care so that the time
- *         is only incorrect by 30-60mS (vs. 1S previously) (Gabor J. Toth
- *         <jtoth@princeton.edu>); improve interaction between
- *         screen-blanking and gpm (Stephen Rothwell); Linux 1.99.4
- *    1.2a:Simple change to stop mysterious bug reports with SMP also added
- *        levels to the printk calls. APM is not defined for SMP machines.
- *         The new replacment for it is, but Linux doesn't yet support this.
- *         Alan Cox Linux 2.1.55
- *    1.3: Set up a valid data descriptor 0x40 for buggy BIOS's
- *    1.4: Upgraded to support APM 1.2. Integrated ThinkPad suspend patch by
- *         Dean Gaudet <dgaudet@arctic.org>.
- *         C. Scott Ananian <cananian@alumni.princeton.edu> Linux 2.1.87
- *    1.5: Fix segment register reloading (in case of bad segments saved
- *         across BIOS call).
- *         Stephen Rothwell
- *    1.6: Cope with complier/assembler differences.
- *         Only try to turn off the first display device.
- *         Fix OOPS at power off with no APM BIOS by Jan Echternach
- *                   <echter@informatik.uni-rostock.de>
- *         Stephen Rothwell
- *    1.7: Modify driver's cached copy of the disabled/disengaged flags
- *         to reflect current state of APM BIOS.
- *         Chris Rankin <rankinc@bellsouth.net>
- *         Reset interrupt 0 timer to 100Hz after suspend
- *         Chad Miller <cmiller@surfsouth.com>
- *         Add CONFIG_APM_IGNORE_SUSPEND_BOUNCE
- *         Richard Gooch <rgooch@atnf.csiro.au>
- *         Allow boot time disabling of APM
- *         Make boot messages far less verbose by default
- *         Make asm safer
- *         Stephen Rothwell
- *    1.8: Add CONFIG_APM_RTC_IS_GMT
- *         Richard Gooch <rgooch@atnf.csiro.au>
- *         change APM_NOINTS to CONFIG_APM_ALLOW_INTS
- *         remove dependency on CONFIG_PROC_FS
- *         Stephen Rothwell
- *    1.9: Fix small typo.  <laslo@wodip.opole.pl>
- *         Try to cope with BIOS's that need to have all display
- *         devices blanked and not just the first one.
- *         Ross Paterson <ross@soi.city.ac.uk>
- *         Fix segment limit setting it has always been wrong as
- *         the segments needed to have byte granularity.
- *         Mark a few things __init.
- *         Add hack to allow power off of SMP systems by popular request.
- *         Use CONFIG_SMP instead of __SMP__
- *         Ignore BOUNCES for three seconds.
- *         Stephen Rothwell
- *   1.10: Fix for Thinkpad return code.
- *         Merge 2.2 and 2.3 drivers.
- *         Remove APM dependencies in arch/i386/kernel/process.c
- *         Remove APM dependencies in drivers/char/sysrq.c
- *         Reset time across standby.
- *         Allow more inititialisation on SMP.
- *         Remove CONFIG_APM_POWER_OFF and make it boot time
- *         configurable (default on).
- *         Make debug only a boot time parameter (remove APM_DEBUG).
- *         Try to blank all devices on any error.
- *   1.11: Remove APM dependencies in drivers/char/console.c
- *         Check nr_running to detect if we are idle (from
- *         Borislav Deianov <borislav@lix.polytechnique.fr>)
- *         Fix for bioses that don't zero the top part of the
- *         entrypoint offset (Mario Sitta <sitta@al.unipmn.it>)
- *         (reported by Panos Katsaloulis <teras@writeme.com>).
- *         Real mode power off patch (Walter Hofmann
- *         <Walter.Hofmann@physik.stud.uni-erlangen.de>).
- *   1.12: Remove CONFIG_SMP as the compiler will optimize
- *         the code away anyway (smp_num_cpus == 1 in UP)
- *         noted by Artur Skawina <skawina@geocities.com>.
- *         Make power off under SMP work again.
- *         Fix thinko with initial engaging of BIOS.
- *         Make sure power off only happens on CPU 0
- *         (Paul "Rusty" Russell <rusty@rustcorp.com.au>).
- *         Do error notification to user mode if BIOS calls fail.
- *         Move entrypoint offset fix to ...boot/setup.S
- *         where it belongs (Cosmos <gis88564@cis.nctu.edu.tw>).
- *         Remove smp-power-off. SMP users must now specify
- *         "apm=power-off" on the kernel command line. Suggested
- *         by Jim Avera <jima@hal.com>, modified by Alan Cox
- *         <alan@lxorguk.ukuu.org.uk>.
- *         Register the /proc/apm entry even on SMP so that
- *         scripts that check for it before doing power off
- *         work (Jim Avera <jima@hal.com>).
- *   1.13: Changes for new pm_ interfaces (Andy Henroid
- *         <andy_henroid@yahoo.com>).
- *         Modularize the code.
- *         Fix the Thinkpad (again) :-( (CONFIG_APM_IGNORE_MULTIPLE_SUSPENDS
- *         is now the way life works).
- *         Fix thinko in suspend() (wrong return).
- *         Notify drivers on critical suspend.
- *         Make kapmd absorb more idle time (Pavel Machek <pavel@suse.cz>
- *         modified by sfr).
- *         Disable interrupts while we are suspended (Andy Henroid
- *         <andy_henroid@yahoo.com> fixed by sfr).
- *         Make power off work on SMP again (Tony Hoyle
- *         <tmh@magenta-logic.com> and <zlatko@iskon.hr>) modified by sfr.
- *         Remove CONFIG_APM_SUSPEND_BOUNCE.  The bounce ignore
- *         interval is now configurable.
- *   1.14: Make connection version persist across module unload/load.
- *         Enable and engage power management earlier.
- *         Disengage power management on module unload.
- *         Changed to use the sysrq-register hack for registering the
- *         power off function called by magic sysrq based upon discussions
- *         in irc://irc.openprojects.net/#kernelnewbies
- *         (Crutcher Dunnavant <crutcher+kernel@datastacks.com>).
- *         Make CONFIG_APM_REAL_MODE_POWER_OFF run time configurable.
- *         (Arjan van de Ven <arjanv@redhat.com>) modified by sfr.
- *         Work around byte swap bug in one of the Vaio's BIOS's
- *         (Marc Boucher <marc@mbsi.ca>).
- *         Exposed the disable flag to dmi so that we can handle known
- *         broken APM (Alan Cox <alan@redhat.com>).
- *   1.14ac: If the BIOS says "I slowed the CPU down" then don't spin
- *         calling it - instead idle. (Alan Cox <alan@redhat.com>)
- *         If an APM idle fails log it and idle sensibly
- *   1.15: Don't queue events to clients who open the device O_WRONLY.
- *         Don't expect replies from clients who open the device O_RDONLY.
- *         (Idea from Thomas Hood)
- *         Minor waitqueue cleanups. (John Fremlin <chief@bandits.org>)
- *   1.16: Fix idle calling. (Andreas Steinmetz <ast@domdv.de> et al.)
- *         Notify listeners of standby or suspend events before notifying
- *         drivers. Return EBUSY to ioctl() if suspend is rejected.
- *         (Russell King <rmk@arm.linux.org.uk> and Thomas Hood)
- *         Ignore first resume after we generate our own resume event
- *         after a suspend (Thomas Hood)
- *         Daemonize now gets rid of our controlling terminal (sfr).
- *         CONFIG_APM_CPU_IDLE now just affects the default value of
- *         idle_threshold (sfr).
- *         Change name of kernel apm daemon (as it no longer idles) (sfr).
- *   1.16ac: Fix up SMP support somewhat. You can now force SMP on and we
- *        make _all_ APM calls on the CPU#0. Fix unsafe sign bug.
- *        TODO: determine if its "boot CPU" or "CPU0" we want to lock to.
- *
- * APM 1.1 Reference:
- *
- *   Intel Corporation, Microsoft Corporation. Advanced Power Management
- *   (APM) BIOS Interface Specification, Revision 1.1, September 1993.
- *   Intel Order Number 241704-001.  Microsoft Part Number 781-110-X01.
- *
- * [This document is available free from Intel by calling 800.628.8686 (fax
- * 916.356.6100) or 800.548.4725; or via anonymous ftp from
- * ftp://ftp.intel.com/pub/IAL/software_specs/apmv11.doc.  It is also
- * available from Microsoft by calling 206.882.8080.]
- *
- * APM 1.2 Reference:
- *   Intel Corporation, Microsoft Corporation. Advanced Power Management
- *   (APM) BIOS Interface Specification, Revision 1.2, February 1996.
- *
- * [This document is available from Microsoft at:
- *    http://www.microsoft.com/whdc/archive/amp_12.mspx]
- */
-
-#include <linux/module.h>
-
-#include <linux/poll.h>
-#include <linux/types.h>
-#include <linux/stddef.h>
-#include <linux/timer.h>
-#include <linux/fcntl.h>
-#include <linux/slab.h>
-#include <linux/stat.h>
-#include <linux/proc_fs.h>
-#include <linux/seq_file.h>
-#include <linux/miscdevice.h>
-#include <linux/apm_bios.h>
-#include <linux/init.h>
-#include <linux/time.h>
-#include <linux/sched.h>
-#include <linux/pm.h>
-#include <linux/pm_legacy.h>
-#include <linux/capability.h>
-#include <linux/device.h>
-#include <linux/kernel.h>
-#include <linux/freezer.h>
-#include <linux/smp.h>
-#include <linux/dmi.h>
-#include <linux/suspend.h>
-#include <linux/kthread.h>
-
-#include <asm/system.h>
-#include <asm/uaccess.h>
-#include <asm/desc.h>
-#include <asm/i8253.h>
-#include <asm/paravirt.h>
-#include <asm/reboot.h>
-
-#include "io_ports.h"
-
-#if defined(CONFIG_APM_DISPLAY_BLANK) && defined(CONFIG_VT)
-extern int (*console_blank_hook)(int);
-#endif
-
-/*
- * The apm_bios device is one of the misc char devices.
- * This is its minor number.
- */
-#define        APM_MINOR_DEV   134
-
-/*
- * See Documentation/Config.help for the configuration options.
- *
- * Various options can be changed at boot time as follows:
- * (We allow underscores for compatibility with the modules code)
- *     apm=on/off                      enable/disable APM
- *         [no-]allow[-_]ints          allow interrupts during BIOS calls
- *         [no-]broken[-_]psr          BIOS has a broken GetPowerStatus call
- *         [no-]realmode[-_]power[-_]off       switch to real mode before
- *                                             powering off
- *         [no-]debug                  log some debugging messages
- *         [no-]power[-_]off           power off on shutdown
- *         [no-]smp                    Use apm even on an SMP box
- *         bounce[-_]interval=<n>      number of ticks to ignore suspend
- *                                     bounces
- *          idle[-_]threshold=<n>       System idle percentage above which to
- *                                      make APM BIOS idle calls. Set it to
- *                                      100 to disable.
- *          idle[-_]period=<n>          Period (in 1/100s of a second) over
- *                                      which the idle percentage is
- *                                      calculated.
- */
-
-/* KNOWN PROBLEM MACHINES:
- *
- * U: TI 4000M TravelMate: BIOS is *NOT* APM compliant
- *                         [Confirmed by TI representative]
- * ?: ACER 486DX4/75: uses dseg 0040, in violation of APM specification
- *                    [Confirmed by BIOS disassembly]
- *                    [This may work now ...]
- * P: Toshiba 1950S: battery life information only gets updated after resume
- * P: Midwest Micro Soundbook Elite DX2/66 monochrome: screen blanking
- *     broken in BIOS [Reported by Garst R. Reese <reese@isn.net>]
- * ?: AcerNote-950: oops on reading /proc/apm - workaround is a WIP
- *     Neale Banks <neale@lowendale.com.au> December 2000
- *
- * Legend: U = unusable with APM patches
- *         P = partially usable with APM patches
- */
-
-/*
- * Define as 1 to make the driver always call the APM BIOS busy
- * routine even if the clock was not reported as slowed by the
- * idle routine.  Otherwise, define as 0.
- */
-#define ALWAYS_CALL_BUSY   1
-
-/*
- * Define to make the APM BIOS calls zero all data segment registers (so
- * that an incorrect BIOS implementation will cause a kernel panic if it
- * tries to write to arbitrary memory).
- */
-#define APM_ZERO_SEGS
-
-#include "apm.h"
-
-/*
- * Define to re-initialize the interrupt 0 timer to 100 Hz after a suspend.
- * This patched by Chad Miller <cmiller@surfsouth.com>, original code by
- * David Chen <chen@ctpa04.mit.edu>
- */
-#undef INIT_TIMER_AFTER_SUSPEND
-
-#ifdef INIT_TIMER_AFTER_SUSPEND
-#include <linux/timex.h>
-#include <asm/io.h>
-#include <linux/delay.h>
-#endif
-
-/*
- * Need to poll the APM BIOS every second
- */
-#define APM_CHECK_TIMEOUT      (HZ)
-
-/*
- * Ignore suspend events for this amount of time after a resume
- */
-#define DEFAULT_BOUNCE_INTERVAL                (3 * HZ)
-
-/*
- * Maximum number of events stored
- */
-#define APM_MAX_EVENTS         20
-
-/*
- * The per-file APM data
- */
-struct apm_user {
-       int             magic;
-       struct apm_user *       next;
-       unsigned int    suser: 1;
-       unsigned int    writer: 1;
-       unsigned int    reader: 1;
-       unsigned int    suspend_wait: 1;
-       int             suspend_result;
-       int             suspends_pending;
-       int             standbys_pending;
-       int             suspends_read;
-       int             standbys_read;
-       int             event_head;
-       int             event_tail;
-       apm_event_t     events[APM_MAX_EVENTS];
-};
-
-/*
- * The magic number in apm_user
- */
-#define APM_BIOS_MAGIC         0x4101
-
-/*
- * idle percentage above which bios idle calls are done
- */
-#ifdef CONFIG_APM_CPU_IDLE
-#define DEFAULT_IDLE_THRESHOLD 95
-#else
-#define DEFAULT_IDLE_THRESHOLD 100
-#endif
-#define DEFAULT_IDLE_PERIOD    (100 / 3)
-
-/*
- * Local variables
- */
-static struct {
-       unsigned long   offset;
-       unsigned short  segment;
-}                              apm_bios_entry;
-static int                     clock_slowed;
-static int                     idle_threshold __read_mostly = DEFAULT_IDLE_THRESHOLD;
-static int                     idle_period __read_mostly = DEFAULT_IDLE_PERIOD;
-static int                     set_pm_idle;
-static int                     suspends_pending;
-static int                     standbys_pending;
-static int                     ignore_sys_suspend;
-static int                     ignore_normal_resume;
-static int                     bounce_interval __read_mostly = DEFAULT_BOUNCE_INTERVAL;
-
-static int                     debug __read_mostly;
-static int                     smp __read_mostly;
-static int                     apm_disabled = -1;
-#ifdef CONFIG_SMP
-static int                     power_off;
-#else
-static int                     power_off = 1;
-#endif
-#ifdef CONFIG_APM_REAL_MODE_POWER_OFF
-static int                     realmode_power_off = 1;
-#else
-static int                     realmode_power_off;
-#endif
-#ifdef CONFIG_APM_ALLOW_INTS
-static int                     allow_ints = 1;
-#else
-static int                     allow_ints;
-#endif
-static int                     broken_psr;
-
-static DECLARE_WAIT_QUEUE_HEAD(apm_waitqueue);
-static DECLARE_WAIT_QUEUE_HEAD(apm_suspend_waitqueue);
-static struct apm_user *       user_list;
-static DEFINE_SPINLOCK(user_list_lock);
-static const struct desc_struct        bad_bios_desc = { 0, 0x00409200 };
-
-static const char              driver_version[] = "1.16ac";    /* no spaces */
-
-static struct task_struct *kapmd_task;
-
-/*
- *     APM event names taken from the APM 1.2 specification. These are
- *     the message codes that the BIOS uses to tell us about events
- */
-static const char *    const apm_event_name[] = {
-       "system standby",
-       "system suspend",
-       "normal resume",
-       "critical resume",
-       "low battery",
-       "power status change",
-       "update time",
-       "critical suspend",
-       "user standby",
-       "user suspend",
-       "system standby resume",
-       "capabilities change"
-};
-#define NR_APM_EVENT_NAME ARRAY_SIZE(apm_event_name)
-
-typedef struct lookup_t {
-       int     key;
-       char *  msg;
-} lookup_t;
-
-/*
- *     The BIOS returns a set of standard error codes in AX when the
- *     carry flag is set.
- */
-static const lookup_t error_table[] = {
-/* N/A { APM_SUCCESS,          "Operation succeeded" }, */
-       { APM_DISABLED,         "Power management disabled" },
-       { APM_CONNECTED,        "Real mode interface already connected" },
-       { APM_NOT_CONNECTED,    "Interface not connected" },
-       { APM_16_CONNECTED,     "16 bit interface already connected" },
-/* N/A { APM_16_UNSUPPORTED,   "16 bit interface not supported" }, */
-       { APM_32_CONNECTED,     "32 bit interface already connected" },
-       { APM_32_UNSUPPORTED,   "32 bit interface not supported" },
-       { APM_BAD_DEVICE,       "Unrecognized device ID" },
-       { APM_BAD_PARAM,        "Parameter out of range" },
-       { APM_NOT_ENGAGED,      "Interface not engaged" },
-       { APM_BAD_FUNCTION,     "Function not supported" },
-       { APM_RESUME_DISABLED,  "Resume timer disabled" },
-       { APM_BAD_STATE,        "Unable to enter requested state" },
-/* N/A { APM_NO_EVENTS,        "No events pending" }, */
-       { APM_NO_ERROR,         "BIOS did not set a return code" },
-       { APM_NOT_PRESENT,      "No APM present" }
-};
-#define ERROR_COUNT    ARRAY_SIZE(error_table)
-
-/**
- *     apm_error       -       display an APM error
- *     @str: information string
- *     @err: APM BIOS return code
- *
- *     Write a meaningful log entry to the kernel log in the event of
- *     an APM error.
- */
-static void apm_error(char *str, int err)
-{
-       int     i;
-
-       for (i = 0; i < ERROR_COUNT; i++)
-               if (error_table[i].key == err) break;
-       if (i < ERROR_COUNT)
-               printk(KERN_NOTICE "apm: %s: %s\n", str, error_table[i].msg);
-       else
-               printk(KERN_NOTICE "apm: %s: unknown error code %#2.2x\n",
-                       str, err);
-}
-
-/*
- * Lock APM functionality to physical CPU 0
- */
-#ifdef CONFIG_SMP
-
-static cpumask_t apm_save_cpus(void)
-{
-       cpumask_t x = current->cpus_allowed;
-       /* Some bioses don't like being called from CPU != 0 */
-       set_cpus_allowed(current, cpumask_of_cpu(0));
-       BUG_ON(smp_processor_id() != 0);
-       return x;
-}
-
-static inline void apm_restore_cpus(cpumask_t mask)
-{
-       set_cpus_allowed(current, mask);
-}
-
-#else
-
-/*
- *     No CPU lockdown needed on a uniprocessor
- */
-#define apm_save_cpus()                (current->cpus_allowed)
-#define apm_restore_cpus(x)    (void)(x)
-
-#endif
-
-/*
- * These are the actual BIOS calls.  Depending on APM_ZERO_SEGS and
- * apm_info.allow_ints, we are being really paranoid here!  Not only
- * are interrupts disabled, but all the segment registers (except SS)
- * are saved and zeroed this means that if the BIOS tries to reference
- * any data without explicitly loading the segment registers, the kernel
- * will fault immediately rather than have some unforeseen circumstances
- * for the rest of the kernel.  And it will be very obvious!  :-) Doing
- * this depends on CS referring to the same physical memory as DS so that
- * DS can be zeroed before the call. Unfortunately, we can't do anything
- * about the stack segment/pointer.  Also, we tell the compiler that
- * everything could change.
- *
- * Also, we KNOW that for the non error case of apm_bios_call, there
- * is no useful data returned in the low order 8 bits of eax.
- */
-
-static inline unsigned long __apm_irq_save(void)
-{
-       unsigned long flags;
-       local_save_flags(flags);
-       if (apm_info.allow_ints) {
-               if (irqs_disabled_flags(flags))
-                       local_irq_enable();
-       } else
-               local_irq_disable();
-
-       return flags;
-}
-
-#define apm_irq_save(flags) \
-       do { flags = __apm_irq_save(); } while (0)
-
-static inline void apm_irq_restore(unsigned long flags)
-{
-       if (irqs_disabled_flags(flags))
-               local_irq_disable();
-       else if (irqs_disabled())
-               local_irq_enable();
-}
-
-#ifdef APM_ZERO_SEGS
-#      define APM_DECL_SEGS \
-               unsigned int saved_fs; unsigned int saved_gs;
-#      define APM_DO_SAVE_SEGS \
-               savesegment(fs, saved_fs); savesegment(gs, saved_gs)
-#      define APM_DO_RESTORE_SEGS \
-               loadsegment(fs, saved_fs); loadsegment(gs, saved_gs)
-#else
-#      define APM_DECL_SEGS
-#      define APM_DO_SAVE_SEGS
-#      define APM_DO_RESTORE_SEGS
-#endif
-
-/**
- *     apm_bios_call   -       Make an APM BIOS 32bit call
- *     @func: APM function to execute
- *     @ebx_in: EBX register for call entry
- *     @ecx_in: ECX register for call entry
- *     @eax: EAX register return
- *     @ebx: EBX register return
- *     @ecx: ECX register return
- *     @edx: EDX register return
- *     @esi: ESI register return
- *
- *     Make an APM call using the 32bit protected mode interface. The
- *     caller is responsible for knowing if APM BIOS is configured and
- *     enabled. This call can disable interrupts for a long period of
- *     time on some laptops.  The return value is in AH and the carry
- *     flag is loaded into AL.  If there is an error, then the error
- *     code is returned in AH (bits 8-15 of eax) and this function
- *     returns non-zero.
- */
-static u8 apm_bios_call(u32 func, u32 ebx_in, u32 ecx_in,
-       u32 *eax, u32 *ebx, u32 *ecx, u32 *edx, u32 *esi)
-{
-       APM_DECL_SEGS
-       unsigned long           flags;
-       cpumask_t               cpus;
-       int                     cpu;
-       struct desc_struct      save_desc_40;
-       struct desc_struct      *gdt;
-
-       cpus = apm_save_cpus();
-       
-       cpu = get_cpu();
-       gdt = get_cpu_gdt_table(cpu);
-       save_desc_40 = gdt[0x40 / 8];
-       gdt[0x40 / 8] = bad_bios_desc;
-
-       apm_irq_save(flags);
-       APM_DO_SAVE_SEGS;
-       apm_bios_call_asm(func, ebx_in, ecx_in, eax, ebx, ecx, edx, esi);
-       APM_DO_RESTORE_SEGS;
-       apm_irq_restore(flags);
-       gdt[0x40 / 8] = save_desc_40;
-       put_cpu();
-       apm_restore_cpus(cpus);
-       
-       return *eax & 0xff;
-}
-
-/**
- *     apm_bios_call_simple    -       make a simple APM BIOS 32bit call
- *     @func: APM function to invoke
- *     @ebx_in: EBX register value for BIOS call
- *     @ecx_in: ECX register value for BIOS call
- *     @eax: EAX register on return from the BIOS call
- *
- *     Make a BIOS call that returns one value only, or just status.
- *     If there is an error, then the error code is returned in AH
- *     (bits 8-15 of eax) and this function returns non-zero. This is
- *     used for simpler BIOS operations. This call may hold interrupts
- *     off for a long time on some laptops.
- */
-
-static u8 apm_bios_call_simple(u32 func, u32 ebx_in, u32 ecx_in, u32 *eax)
-{
-       u8                      error;
-       APM_DECL_SEGS
-       unsigned long           flags;
-       cpumask_t               cpus;
-       int                     cpu;
-       struct desc_struct      save_desc_40;
-       struct desc_struct      *gdt;
-
-       cpus = apm_save_cpus();
-       
-       cpu = get_cpu();
-       gdt = get_cpu_gdt_table(cpu);
-       save_desc_40 = gdt[0x40 / 8];
-       gdt[0x40 / 8] = bad_bios_desc;
-
-       apm_irq_save(flags);
-       APM_DO_SAVE_SEGS;
-       error = apm_bios_call_simple_asm(func, ebx_in, ecx_in, eax);
-       APM_DO_RESTORE_SEGS;
-       apm_irq_restore(flags);
-       gdt[0x40 / 8] = save_desc_40;
-       put_cpu();
-       apm_restore_cpus(cpus);
-       return error;
-}
-
-/**
- *     apm_driver_version      -       APM driver version
- *     @val:   loaded with the APM version on return
- *
- *     Retrieve the APM version supported by the BIOS. This is only
- *     supported for APM 1.1 or higher. An error indicates APM 1.0 is
- *     probably present.
- *
- *     On entry val should point to a value indicating the APM driver
- *     version with the high byte being the major and the low byte the
- *     minor number both in BCD
- *
- *     On return it will hold the BIOS revision supported in the
- *     same format.
- */
-
-static int apm_driver_version(u_short *val)
-{
-       u32     eax;
-
-       if (apm_bios_call_simple(APM_FUNC_VERSION, 0, *val, &eax))
-               return (eax >> 8) & 0xff;
-       *val = eax;
-       return APM_SUCCESS;
-}
-
-/**
- *     apm_get_event   -       get an APM event from the BIOS
- *     @event: pointer to the event
- *     @info: point to the event information
- *
- *     The APM BIOS provides a polled information for event
- *     reporting. The BIOS expects to be polled at least every second
- *     when events are pending. When a message is found the caller should
- *     poll until no more messages are present.  However, this causes
- *     problems on some laptops where a suspend event notification is
- *     not cleared until it is acknowledged.
- *
- *     Additional information is returned in the info pointer, providing
- *     that APM 1.2 is in use. If no messges are pending the value 0x80
- *     is returned (No power management events pending).
- */
-static int apm_get_event(apm_event_t *event, apm_eventinfo_t *info)
-{
-       u32     eax;
-       u32     ebx;
-       u32     ecx;
-       u32     dummy;
-
-       if (apm_bios_call(APM_FUNC_GET_EVENT, 0, 0, &eax, &ebx, &ecx,
-                       &dummy, &dummy))
-               return (eax >> 8) & 0xff;
-       *event = ebx;
-       if (apm_info.connection_version < 0x0102)
-               *info = ~0; /* indicate info not valid */
-       else
-               *info = ecx;
-       return APM_SUCCESS;
-}
-
-/**
- *     set_power_state -       set the power management state
- *     @what: which items to transition
- *     @state: state to transition to
- *
- *     Request an APM change of state for one or more system devices. The
- *     processor state must be transitioned last of all. what holds the
- *     class of device in the upper byte and the device number (0xFF for
- *     all) for the object to be transitioned.
- *
- *     The state holds the state to transition to, which may in fact
- *     be an acceptance of a BIOS requested state change.
- */
-static int set_power_state(u_short what, u_short state)
-{
-       u32     eax;
-
-       if (apm_bios_call_simple(APM_FUNC_SET_STATE, what, state, &eax))
-               return (eax >> 8) & 0xff;
-       return APM_SUCCESS;
-}
-
-/**
- *     set_system_power_state - set system wide power state
- *     @state: which state to enter
- *
- *     Transition the entire system into a new APM power state.
- */
-static int set_system_power_state(u_short state)
-{
-       return set_power_state(APM_DEVICE_ALL, state);
-}
-
-/**
- *     apm_do_idle     -       perform power saving
- *
- *     This function notifies the BIOS that the processor is (in the view
- *     of the OS) idle. It returns -1 in the event that the BIOS refuses
- *     to handle the idle request. On a success the function returns 1
- *     if the BIOS did clock slowing or 0 otherwise.
- */
-static int apm_do_idle(void)
-{
-       u32     eax;
-       u8      ret = 0;
-       int     idled = 0;
-       int     polling;
-
-       polling = !!(current_thread_info()->status & TS_POLLING);
-       if (polling) {
-               current_thread_info()->status &= ~TS_POLLING;
-               /*
-                * TS_POLLING-cleared state must be visible before we
-                * test NEED_RESCHED:
-                */
-               smp_mb();
-       }
-       if (!need_resched()) {
-               idled = 1;
-               ret = apm_bios_call_simple(APM_FUNC_IDLE, 0, 0, &eax);
-       }
-       if (polling)
-               current_thread_info()->status |= TS_POLLING;
-
-       if (!idled)
-               return 0;
-
-       if (ret) {
-               static unsigned long t;
-
-               /* This always fails on some SMP boards running UP kernels.
-                * Only report the failure the first 5 times.
-                */
-               if (++t < 5)
-               {
-                       printk(KERN_DEBUG "apm_do_idle failed (%d)\n",
-                                       (eax >> 8) & 0xff);
-                       t = jiffies;
-               }
-               return -1;
-       }
-       clock_slowed = (apm_info.bios.flags & APM_IDLE_SLOWS_CLOCK) != 0;
-       return clock_slowed;
-}
-
-/**
- *     apm_do_busy     -       inform the BIOS the CPU is busy
- *
- *     Request that the BIOS brings the CPU back to full performance. 
- */
-static void apm_do_busy(void)
-{
-       u32     dummy;
-
-       if (clock_slowed || ALWAYS_CALL_BUSY) {
-               (void) apm_bios_call_simple(APM_FUNC_BUSY, 0, 0, &dummy);
-               clock_slowed = 0;
-       }
-}
-
-/*
- * If no process has really been interested in
- * the CPU for some time, we want to call BIOS
- * power management - we probably want
- * to conserve power.
- */
-#define IDLE_CALC_LIMIT   (HZ * 100)
-#define IDLE_LEAKY_MAX    16
-
-static void (*original_pm_idle)(void) __read_mostly;
-
-/**
- * apm_cpu_idle                -       cpu idling for APM capable Linux
- *
- * This is the idling function the kernel executes when APM is available. It 
- * tries to do BIOS powermanagement based on the average system idle time.
- * Furthermore it calls the system default idle routine.
- */
-
-static void apm_cpu_idle(void)
-{
-       static int use_apm_idle; /* = 0 */
-       static unsigned int last_jiffies; /* = 0 */
-       static unsigned int last_stime; /* = 0 */
-
-       int apm_idle_done = 0;
-       unsigned int jiffies_since_last_check = jiffies - last_jiffies;
-       unsigned int bucket;
-
-recalc:
-       if (jiffies_since_last_check > IDLE_CALC_LIMIT) {
-               use_apm_idle = 0;
-               last_jiffies = jiffies;
-               last_stime = current->stime;
-       } else if (jiffies_since_last_check > idle_period) {
-               unsigned int idle_percentage;
-
-               idle_percentage = current->stime - last_stime;
-               idle_percentage *= 100;
-               idle_percentage /= jiffies_since_last_check;
-               use_apm_idle = (idle_percentage > idle_threshold);
-               if (apm_info.forbid_idle)
-                       use_apm_idle = 0;
-               last_jiffies = jiffies;
-               last_stime = current->stime;
-       }
-
-       bucket = IDLE_LEAKY_MAX;
-
-       while (!need_resched()) {
-               if (use_apm_idle) {
-                       unsigned int t;
-
-                       t = jiffies;
-                       switch (apm_do_idle()) {
-                       case 0: apm_idle_done = 1;
-                               if (t != jiffies) {
-                                       if (bucket) {
-                                               bucket = IDLE_LEAKY_MAX;
-                                               continue;
-                                       }
-                               } else if (bucket) {
-                                       bucket--;
-                                       continue;
-                               }
-                               break;
-                       case 1: apm_idle_done = 1;
-                               break;
-                       default: /* BIOS refused */
-                               break;
-                       }
-               }
-               if (original_pm_idle)
-                       original_pm_idle();
-               else
-                       default_idle();
-               jiffies_since_last_check = jiffies - last_jiffies;
-               if (jiffies_since_last_check > idle_period)
-                       goto recalc;
-       }
-
-       if (apm_idle_done)
-               apm_do_busy();
-}
-
-/**
- *     apm_power_off   -       ask the BIOS to power off
- *
- *     Handle the power off sequence. This is the one piece of code we
- *     will execute even on SMP machines. In order to deal with BIOS
- *     bugs we support real mode APM BIOS power off calls. We also make
- *     the SMP call on CPU0 as some systems will only honour this call
- *     on their first cpu.
- */
-static void apm_power_off(void)
-{
-       unsigned char   po_bios_call[] = {
-               0xb8, 0x00, 0x10,       /* movw  $0x1000,ax  */
-               0x8e, 0xd0,             /* movw  ax,ss       */
-               0xbc, 0x00, 0xf0,       /* movw  $0xf000,sp  */
-               0xb8, 0x07, 0x53,       /* movw  $0x5307,ax  */
-               0xbb, 0x01, 0x00,       /* movw  $0x0001,bx  */
-               0xb9, 0x03, 0x00,       /* movw  $0x0003,cx  */
-               0xcd, 0x15              /* int   $0x15       */
-       };
-
-       /* Some bioses don't like being called from CPU != 0 */
-       if (apm_info.realmode_power_off)
-       {
-               (void)apm_save_cpus();
-               machine_real_restart(po_bios_call, sizeof(po_bios_call));
-       }
-       else
-               (void) set_system_power_state(APM_STATE_OFF);
-}
-
-#ifdef CONFIG_APM_DO_ENABLE
-
-/**
- *     apm_enable_power_management - enable BIOS APM power management
- *     @enable: enable yes/no
- *
- *     Enable or disable the APM BIOS power services. 
- */
-static int apm_enable_power_management(int enable)
-{
-       u32     eax;
-
-       if ((enable == 0) && (apm_info.bios.flags & APM_BIOS_DISENGAGED))
-               return APM_NOT_ENGAGED;
-       if (apm_bios_call_simple(APM_FUNC_ENABLE_PM, APM_DEVICE_BALL,
-                       enable, &eax))
-               return (eax >> 8) & 0xff;
-       if (enable)
-               apm_info.bios.flags &= ~APM_BIOS_DISABLED;
-       else
-               apm_info.bios.flags |= APM_BIOS_DISABLED;
-       return APM_SUCCESS;
-}
-#endif
-
-/**
- *     apm_get_power_status    -       get current power state
- *     @status: returned status
- *     @bat: battery info
- *     @life: estimated life
- *
- *     Obtain the current power status from the APM BIOS. We return a
- *     status which gives the rough battery status, and current power
- *     source. The bat value returned give an estimate as a percentage
- *     of life and a status value for the battery. The estimated life
- *     if reported is a lifetime in secodnds/minutes at current powwer
- *     consumption.
- */
-static int apm_get_power_status(u_short *status, u_short *bat, u_short *life)
-{
-       u32     eax;
-       u32     ebx;
-       u32     ecx;
-       u32     edx;
-       u32     dummy;
-
-       if (apm_info.get_power_status_broken)
-               return APM_32_UNSUPPORTED;
-       if (apm_bios_call(APM_FUNC_GET_STATUS, APM_DEVICE_ALL, 0,
-                       &eax, &ebx, &ecx, &edx, &dummy))
-               return (eax >> 8) & 0xff;
-       *status = ebx;
-       *bat = ecx;
-       if (apm_info.get_power_status_swabinminutes) {
-               *life = swab16((u16)edx);
-               *life |= 0x8000;
-       } else
-               *life = edx;
-       return APM_SUCCESS;
-}
-
-#if 0
-static int apm_get_battery_status(u_short which, u_short *status,
-                                 u_short *bat, u_short *life, u_short *nbat)
-{
-       u32     eax;
-       u32     ebx;
-       u32     ecx;
-       u32     edx;
-       u32     esi;
-
-       if (apm_info.connection_version < 0x0102) {
-               /* pretend we only have one battery. */
-               if (which != 1)
-                       return APM_BAD_DEVICE;
-               *nbat = 1;
-               return apm_get_power_status(status, bat, life);
-       }
-
-       if (apm_bios_call(APM_FUNC_GET_STATUS, (0x8000 | (which)), 0, &eax,
-                       &ebx, &ecx, &edx, &esi))
-               return (eax >> 8) & 0xff;
-       *status = ebx;
-       *bat = ecx;
-       *life = edx;
-       *nbat = esi;
-       return APM_SUCCESS;
-}
-#endif
-
-/**
- *     apm_engage_power_management     -       enable PM on a device
- *     @device: identity of device
- *     @enable: on/off
- *
- *     Activate or deactive power management on either a specific device
- *     or the entire system (%APM_DEVICE_ALL).
- */
-static int apm_engage_power_management(u_short device, int enable)
-{
-       u32     eax;
-
-       if ((enable == 0) && (device == APM_DEVICE_ALL)
-           && (apm_info.bios.flags & APM_BIOS_DISABLED))
-               return APM_DISABLED;
-       if (apm_bios_call_simple(APM_FUNC_ENGAGE_PM, device, enable, &eax))
-               return (eax >> 8) & 0xff;
-       if (device == APM_DEVICE_ALL) {
-               if (enable)
-                       apm_info.bios.flags &= ~APM_BIOS_DISENGAGED;
-               else
-                       apm_info.bios.flags |= APM_BIOS_DISENGAGED;
-       }
-       return APM_SUCCESS;
-}
-
-#if defined(CONFIG_APM_DISPLAY_BLANK) && defined(CONFIG_VT)
-
-/**
- *     apm_console_blank       -       blank the display
- *     @blank: on/off
- *
- *     Attempt to blank the console, firstly by blanking just video device
- *     zero, and if that fails (some BIOSes don't support it) then it blanks
- *     all video devices. Typically the BIOS will do laptop backlight and
- *     monitor powerdown for us.
- */
-static int apm_console_blank(int blank)
-{
-       int error = APM_NOT_ENGAGED; /* silence gcc */
-       int i;
-       u_short state;
-       static const u_short dev[3] = { 0x100, 0x1FF, 0x101 };
-
-       state = blank ? APM_STATE_STANDBY : APM_STATE_READY;
-
-       for (i = 0; i < ARRAY_SIZE(dev); i++) {
-               error = set_power_state(dev[i], state);
-
-               if ((error == APM_SUCCESS) || (error == APM_NO_ERROR))
-                       return 1;
-
-               if (error == APM_NOT_ENGAGED)
-                       break;
-       }
-
-       if (error == APM_NOT_ENGAGED) {
-               static int tried;
-               int eng_error;
-               if (tried++ == 0) {
-                       eng_error = apm_engage_power_management(APM_DEVICE_ALL, 1);
-                       if (eng_error) {
-                               apm_error("set display", error);
-                               apm_error("engage interface", eng_error);
-                               return 0;
-                       } else
-                               return apm_console_blank(blank);
-               }
-       }
-       apm_error("set display", error);
-       return 0;
-}
-#endif
-
-static int queue_empty(struct apm_user *as)
-{
-       return as->event_head == as->event_tail;
-}
-
-static apm_event_t get_queued_event(struct apm_user *as)
-{
-       if (++as->event_tail >= APM_MAX_EVENTS)
-               as->event_tail = 0;
-       return as->events[as->event_tail];
-}
-
-static void queue_event(apm_event_t event, struct apm_user *sender)
-{
-       struct apm_user *       as;
-
-       spin_lock(&user_list_lock);
-       if (user_list == NULL)
-               goto out;
-       for (as = user_list; as != NULL; as = as->next) {
-               if ((as == sender) || (!as->reader))
-                       continue;
-               if (++as->event_head >= APM_MAX_EVENTS)
-                       as->event_head = 0;
-
-               if (as->event_head == as->event_tail) {
-                       static int notified;
-
-                       if (notified++ == 0)
-                           printk(KERN_ERR "apm: an event queue overflowed\n");
-                       if (++as->event_tail >= APM_MAX_EVENTS)
-                               as->event_tail = 0;
-               }
-               as->events[as->event_head] = event;
-               if ((!as->suser) || (!as->writer))
-                       continue;
-               switch (event) {
-               case APM_SYS_SUSPEND:
-               case APM_USER_SUSPEND:
-                       as->suspends_pending++;
-                       suspends_pending++;
-                       break;
-
-               case APM_SYS_STANDBY:
-               case APM_USER_STANDBY:
-                       as->standbys_pending++;
-                       standbys_pending++;
-                       break;
-               }
-       }
-       wake_up_interruptible(&apm_waitqueue);
-out:
-       spin_unlock(&user_list_lock);
-}
-
-static void reinit_timer(void)
-{
-#ifdef INIT_TIMER_AFTER_SUSPEND
-       unsigned long flags;
-
-       spin_lock_irqsave(&i8253_lock, flags);
-       /* set the clock to HZ */
-       outb_p(0x34, PIT_MODE);         /* binary, mode 2, LSB/MSB, ch 0 */
-       udelay(10);
-       outb_p(LATCH & 0xff, PIT_CH0);  /* LSB */
-       udelay(10);
-       outb(LATCH >> 8, PIT_CH0);      /* MSB */
-       udelay(10);
-       spin_unlock_irqrestore(&i8253_lock, flags);
-#endif
-}
-
-static int suspend(int vetoable)
-{
-       int             err;
-       struct apm_user *as;
-
-       if (pm_send_all(PM_SUSPEND, (void *)3)) {
-               /* Vetoed */
-               if (vetoable) {
-                       if (apm_info.connection_version > 0x100)
-                               set_system_power_state(APM_STATE_REJECT);
-                       err = -EBUSY;
-                       ignore_sys_suspend = 0;
-                       printk(KERN_WARNING "apm: suspend was vetoed.\n");
-                       goto out;
-               }
-               printk(KERN_CRIT "apm: suspend was vetoed, but suspending anyway.\n");
-       }
-
-       device_suspend(PMSG_SUSPEND);
-       local_irq_disable();
-       device_power_down(PMSG_SUSPEND);
-
-       local_irq_enable();
-
-       save_processor_state();
-       err = set_system_power_state(APM_STATE_SUSPEND);
-       ignore_normal_resume = 1;
-       restore_processor_state();
-
-       local_irq_disable();
-       reinit_timer();
-
-       if (err == APM_NO_ERROR)
-               err = APM_SUCCESS;
-       if (err != APM_SUCCESS)
-               apm_error("suspend", err);
-       err = (err == APM_SUCCESS) ? 0 : -EIO;
-       device_power_up();
-       local_irq_enable();
-       device_resume();
-       pm_send_all(PM_RESUME, (void *)0);
-       queue_event(APM_NORMAL_RESUME, NULL);
- out:
-       spin_lock(&user_list_lock);
-       for (as = user_list; as != NULL; as = as->next) {
-               as->suspend_wait = 0;
-               as->suspend_result = err;
-       }
-       spin_unlock(&user_list_lock);
-       wake_up_interruptible(&apm_suspend_waitqueue);
-       return err;
-}
-
-static void standby(void)
-{
-       int     err;
-
-       local_irq_disable();
-       device_power_down(PMSG_SUSPEND);
-       local_irq_enable();
-
-       err = set_system_power_state(APM_STATE_STANDBY);
-       if ((err != APM_SUCCESS) && (err != APM_NO_ERROR))
-               apm_error("standby", err);
-
-       local_irq_disable();
-       device_power_up();
-       local_irq_enable();
-}
-
-static apm_event_t get_event(void)
-{
-       int             error;
-       apm_event_t     event = APM_NO_EVENTS; /* silence gcc */
-       apm_eventinfo_t info;
-
-       static int notified;
-
-       /* we don't use the eventinfo */
-       error = apm_get_event(&event, &info);
-       if (error == APM_SUCCESS)
-               return event;
-
-       if ((error != APM_NO_EVENTS) && (notified++ == 0))
-               apm_error("get_event", error);
-
-       return 0;
-}
-
-static void check_events(void)
-{
-       apm_event_t             event;
-       static unsigned long    last_resume;
-       static int              ignore_bounce;
-
-       while ((event = get_event()) != 0) {
-               if (debug) {
-                       if (event <= NR_APM_EVENT_NAME)
-                               printk(KERN_DEBUG "apm: received %s notify\n",
-                                      apm_event_name[event - 1]);
-                       else
-                               printk(KERN_DEBUG "apm: received unknown "
-                                      "event 0x%02x\n", event);
-               }
-               if (ignore_bounce
-                   && ((jiffies - last_resume) > bounce_interval))
-                       ignore_bounce = 0;
-
-               switch (event) {
-               case APM_SYS_STANDBY:
-               case APM_USER_STANDBY:
-                       queue_event(event, NULL);
-                       if (standbys_pending <= 0)
-                               standby();
-                       break;
-
-               case APM_USER_SUSPEND:
-#ifdef CONFIG_APM_IGNORE_USER_SUSPEND
-                       if (apm_info.connection_version > 0x100)
-                               set_system_power_state(APM_STATE_REJECT);
-                       break;
-#endif
-               case APM_SYS_SUSPEND:
-                       if (ignore_bounce) {
-                               if (apm_info.connection_version > 0x100)
-                                       set_system_power_state(APM_STATE_REJECT);
-                               break;
-                       }
-                       /*
-                        * If we are already processing a SUSPEND,
-                        * then further SUSPEND events from the BIOS
-                        * will be ignored.  We also return here to
-                        * cope with the fact that the Thinkpads keep
-                        * sending a SUSPEND event until something else
-                        * happens!
-                        */
-                       if (ignore_sys_suspend)
-                               return;
-                       ignore_sys_suspend = 1;
-                       queue_event(event, NULL);
-                       if (suspends_pending <= 0)
-                               (void) suspend(1);
-                       break;
-
-               case APM_NORMAL_RESUME:
-               case APM_CRITICAL_RESUME:
-               case APM_STANDBY_RESUME:
-                       ignore_sys_suspend = 0;
-                       last_resume = jiffies;
-                       ignore_bounce = 1;
-                       if ((event != APM_NORMAL_RESUME)
-                           || (ignore_normal_resume == 0)) {
-                               device_resume();
-                               pm_send_all(PM_RESUME, (void *)0);
-                               queue_event(event, NULL);
-                       }
-                       ignore_normal_resume = 0;
-                       break;
-
-               case APM_CAPABILITY_CHANGE:
-               case APM_LOW_BATTERY:
-               case APM_POWER_STATUS_CHANGE:
-                       queue_event(event, NULL);
-                       /* If needed, notify drivers here */
-                       break;
-
-               case APM_UPDATE_TIME:
-                       break;
-
-               case APM_CRITICAL_SUSPEND:
-                       /*
-                        * We are not allowed to reject a critical suspend.
-                        */
-                       (void) suspend(0);
-                       break;
-               }
-       }
-}
-
-static void apm_event_handler(void)
-{
-       static int      pending_count = 4;
-       int             err;
-
-       if ((standbys_pending > 0) || (suspends_pending > 0)) {
-               if ((apm_info.connection_version > 0x100) &&
-                               (pending_count-- <= 0)) {
-                       pending_count = 4;
-                       if (debug)
-                               printk(KERN_DEBUG "apm: setting state busy\n");
-                       err = set_system_power_state(APM_STATE_BUSY);
-                       if (err)
-                               apm_error("busy", err);
-               }
-       } else
-               pending_count = 4;
-       check_events();
-}
-
-/*
- * This is the APM thread main loop.
- */
-
-static void apm_mainloop(void)
-{
-       DECLARE_WAITQUEUE(wait, current);
-
-       add_wait_queue(&apm_waitqueue, &wait);
-       set_current_state(TASK_INTERRUPTIBLE);
-       for (;;) {
-               schedule_timeout(APM_CHECK_TIMEOUT);
-               if (kthread_should_stop())
-                       break;
-               /*
-                * Ok, check all events, check for idle (and mark us sleeping
-                * so as not to count towards the load average)..
-                */
-               set_current_state(TASK_INTERRUPTIBLE);
-               apm_event_handler();
-       }
-       remove_wait_queue(&apm_waitqueue, &wait);
-}
-
-static int check_apm_user(struct apm_user *as, const char *func)
-{
-       if ((as == NULL) || (as->magic != APM_BIOS_MAGIC)) {
-               printk(KERN_ERR "apm: %s passed bad filp\n", func);
-               return 1;
-       }
-       return 0;
-}
-
-static ssize_t do_read(struct file *fp, char __user *buf, size_t count, loff_t *ppos)
-{
-       struct apm_user *       as;
-       int                     i;
-       apm_event_t             event;
-
-       as = fp->private_data;
-       if (check_apm_user(as, "read"))
-               return -EIO;
-       if ((int)count < sizeof(apm_event_t))
-               return -EINVAL;
-       if ((queue_empty(as)) && (fp->f_flags & O_NONBLOCK))
-               return -EAGAIN;
-       wait_event_interruptible(apm_waitqueue, !queue_empty(as));
-       i = count;
-       while ((i >= sizeof(event)) && !queue_empty(as)) {
-               event = get_queued_event(as);
-               if (copy_to_user(buf, &event, sizeof(event))) {
-                       if (i < count)
-                               break;
-                       return -EFAULT;
-               }
-               switch (event) {
-               case APM_SYS_SUSPEND:
-               case APM_USER_SUSPEND:
-                       as->suspends_read++;
-                       break;
-
-               case APM_SYS_STANDBY:
-               case APM_USER_STANDBY:
-                       as->standbys_read++;
-                       break;
-               }
-               buf += sizeof(event);
-               i -= sizeof(event);
-       }
-       if (i < count)
-               return count - i;
-       if (signal_pending(current))
-               return -ERESTARTSYS;
-       return 0;
-}
-
-static unsigned int do_poll(struct file *fp, poll_table * wait)
-{
-       struct apm_user * as;
-
-       as = fp->private_data;
-       if (check_apm_user(as, "poll"))
-               return 0;
-       poll_wait(fp, &apm_waitqueue, wait);
-       if (!queue_empty(as))
-               return POLLIN | POLLRDNORM;
-       return 0;
-}
-
-static int do_ioctl(struct inode * inode, struct file *filp,
-                   u_int cmd, u_long arg)
-{
-       struct apm_user *       as;
-
-       as = filp->private_data;
-       if (check_apm_user(as, "ioctl"))
-               return -EIO;
-       if ((!as->suser) || (!as->writer))
-               return -EPERM;
-       switch (cmd) {
-       case APM_IOC_STANDBY:
-               if (as->standbys_read > 0) {
-                       as->standbys_read--;
-                       as->standbys_pending--;
-                       standbys_pending--;
-               } else
-                       queue_event(APM_USER_STANDBY, as);
-               if (standbys_pending <= 0)
-                       standby();
-               break;
-       case APM_IOC_SUSPEND:
-               if (as->suspends_read > 0) {
-                       as->suspends_read--;
-                       as->suspends_pending--;
-                       suspends_pending--;
-               } else
-                       queue_event(APM_USER_SUSPEND, as);
-               if (suspends_pending <= 0) {
-                       return suspend(1);
-               } else {
-                       as->suspend_wait = 1;
-                       wait_event_interruptible(apm_suspend_waitqueue,
-                                       as->suspend_wait == 0);
-                       return as->suspend_result;
-               }
-               break;
-       default:
-               return -EINVAL;
-       }
-       return 0;
-}
-
-static int do_release(struct inode * inode, struct file * filp)
-{
-       struct apm_user *       as;
-
-       as = filp->private_data;
-       if (check_apm_user(as, "release"))
-               return 0;
-       filp->private_data = NULL;
-       if (as->standbys_pending > 0) {
-               standbys_pending -= as->standbys_pending;
-               if (standbys_pending <= 0)
-                       standby();
-       }
-       if (as->suspends_pending > 0) {
-               suspends_pending -= as->suspends_pending;
-               if (suspends_pending <= 0)
-                       (void) suspend(1);
-       }
-       spin_lock(&user_list_lock);
-       if (user_list == as)
-               user_list = as->next;
-       else {
-               struct apm_user *       as1;
-
-               for (as1 = user_list;
-                    (as1 != NULL) && (as1->next != as);
-                    as1 = as1->next)
-                       ;
-               if (as1 == NULL)
-                       printk(KERN_ERR "apm: filp not in user list\n");
-               else
-                       as1->next = as->next;
-       }
-       spin_unlock(&user_list_lock);
-       kfree(as);
-       return 0;
-}
-
-static int do_open(struct inode * inode, struct file * filp)
-{
-       struct apm_user *       as;
-
-       as = kmalloc(sizeof(*as), GFP_KERNEL);
-       if (as == NULL) {
-               printk(KERN_ERR "apm: cannot allocate struct of size %d bytes\n",
-                      sizeof(*as));
-               return -ENOMEM;
-       }
-       as->magic = APM_BIOS_MAGIC;
-       as->event_tail = as->event_head = 0;
-       as->suspends_pending = as->standbys_pending = 0;
-       as->suspends_read = as->standbys_read = 0;
-       /*
-        * XXX - this is a tiny bit broken, when we consider BSD
-         * process accounting. If the device is opened by root, we
-        * instantly flag that we used superuser privs. Who knows,
-        * we might close the device immediately without doing a
-        * privileged operation -- cevans
-        */
-       as->suser = capable(CAP_SYS_ADMIN);
-       as->writer = (filp->f_mode & FMODE_WRITE) == FMODE_WRITE;
-       as->reader = (filp->f_mode & FMODE_READ) == FMODE_READ;
-       spin_lock(&user_list_lock);
-       as->next = user_list;
-       user_list = as;
-       spin_unlock(&user_list_lock);
-       filp->private_data = as;
-       return 0;
-}
-
-static int proc_apm_show(struct seq_file *m, void *v)
-{
-       unsigned short  bx;
-       unsigned short  cx;
-       unsigned short  dx;
-       int             error;
-       unsigned short  ac_line_status = 0xff;
-       unsigned short  battery_status = 0xff;
-       unsigned short  battery_flag   = 0xff;
-       int             percentage     = -1;
-       int             time_units     = -1;
-       char            *units         = "?";
-
-       if ((num_online_cpus() == 1) &&
-           !(error = apm_get_power_status(&bx, &cx, &dx))) {
-               ac_line_status = (bx >> 8) & 0xff;
-               battery_status = bx & 0xff;
-               if ((cx & 0xff) != 0xff)
-                       percentage = cx & 0xff;
-
-               if (apm_info.connection_version > 0x100) {
-                       battery_flag = (cx >> 8) & 0xff;
-                       if (dx != 0xffff) {
-                               units = (dx & 0x8000) ? "min" : "sec";
-                               time_units = dx & 0x7fff;
-                       }
-               }
-       }
-       /* Arguments, with symbols from linux/apm_bios.h.  Information is
-          from the Get Power Status (0x0a) call unless otherwise noted.
-
-          0) Linux driver version (this will change if format changes)
-          1) APM BIOS Version.  Usually 1.0, 1.1 or 1.2.
-          2) APM flags from APM Installation Check (0x00):
-             bit 0: APM_16_BIT_SUPPORT
-             bit 1: APM_32_BIT_SUPPORT
-             bit 2: APM_IDLE_SLOWS_CLOCK
-             bit 3: APM_BIOS_DISABLED
-             bit 4: APM_BIOS_DISENGAGED
-          3) AC line status
-             0x00: Off-line
-             0x01: On-line
-             0x02: On backup power (BIOS >= 1.1 only)
-             0xff: Unknown
-          4) Battery status
-             0x00: High
-             0x01: Low
-             0x02: Critical
-             0x03: Charging
-             0x04: Selected battery not present (BIOS >= 1.2 only)
-             0xff: Unknown
-          5) Battery flag
-             bit 0: High
-             bit 1: Low
-             bit 2: Critical
-             bit 3: Charging
-             bit 7: No system battery
-             0xff: Unknown
-          6) Remaining battery life (percentage of charge):
-             0-100: valid
-             -1: Unknown
-          7) Remaining battery life (time units):
-             Number of remaining minutes or seconds
-             -1: Unknown
-          8) min = minutes; sec = seconds */
-
-       seq_printf(m, "%s %d.%d 0x%02x 0x%02x 0x%02x 0x%02x %d%% %d %s\n",
-                    driver_version,
-                    (apm_info.bios.version >> 8) & 0xff,
-                    apm_info.bios.version & 0xff,
-                    apm_info.bios.flags,
-                    ac_line_status,
-                    battery_status,
-                    battery_flag,
-                    percentage,
-                    time_units,
-                    units);
-       return 0;
-}
-
-static int proc_apm_open(struct inode *inode, struct file *file)
-{
-       return single_open(file, proc_apm_show, NULL);
-}
-
-static const struct file_operations apm_file_ops = {
-       .owner          = THIS_MODULE,
-       .open           = proc_apm_open,
-       .read           = seq_read,
-       .llseek         = seq_lseek,
-       .release        = single_release,
-};
-
-static int apm(void *unused)
-{
-       unsigned short  bx;
-       unsigned short  cx;
-       unsigned short  dx;
-       int             error;
-       char *          power_stat;
-       char *          bat_stat;
-
-#ifdef CONFIG_SMP
-       /* 2002/08/01 - WT
-        * This is to avoid random crashes at boot time during initialization
-        * on SMP systems in case of "apm=power-off" mode. Seen on ASUS A7M266D.
-        * Some bioses don't like being called from CPU != 0.
-        * Method suggested by Ingo Molnar.
-        */
-       set_cpus_allowed(current, cpumask_of_cpu(0));
-       BUG_ON(smp_processor_id() != 0);
-#endif
-
-       if (apm_info.connection_version == 0) {
-               apm_info.connection_version = apm_info.bios.version;
-               if (apm_info.connection_version > 0x100) {
-                       /*
-                        * We only support BIOSs up to version 1.2
-                        */
-                       if (apm_info.connection_version > 0x0102)
-                               apm_info.connection_version = 0x0102;
-                       error = apm_driver_version(&apm_info.connection_version);
-                       if (error != APM_SUCCESS) {
-                               apm_error("driver version", error);
-                               /* Fall back to an APM 1.0 connection. */
-                               apm_info.connection_version = 0x100;
-                       }
-               }
-       }
-
-       if (debug)
-               printk(KERN_INFO "apm: Connection version %d.%d\n",
-                       (apm_info.connection_version >> 8) & 0xff,
-                       apm_info.connection_version & 0xff);
-
-#ifdef CONFIG_APM_DO_ENABLE
-       if (apm_info.bios.flags & APM_BIOS_DISABLED) {
-               /*
-                * This call causes my NEC UltraLite Versa 33/C to hang if it
-                * is booted with PM disabled but not in the docking station.
-                * Unfortunate ...
-                */
-               error = apm_enable_power_management(1);
-               if (error) {
-                       apm_error("enable power management", error);
-                       return -1;
-               }
-       }
-#endif
-
-       if ((apm_info.bios.flags & APM_BIOS_DISENGAGED)
-           && (apm_info.connection_version > 0x0100)) {
-               error = apm_engage_power_management(APM_DEVICE_ALL, 1);
-               if (error) {
-                       apm_error("engage power management", error);
-                       return -1;
-               }
-       }
-
-       if (debug && (num_online_cpus() == 1 || smp )) {
-               error = apm_get_power_status(&bx, &cx, &dx);
-               if (error)
-                       printk(KERN_INFO "apm: power status not available\n");
-               else {
-                       switch ((bx >> 8) & 0xff) {
-                       case 0: power_stat = "off line"; break;
-                       case 1: power_stat = "on line"; break;
-                       case 2: power_stat = "on backup power"; break;
-                       default: power_stat = "unknown"; break;
-                       }
-                       switch (bx & 0xff) {
-                       case 0: bat_stat = "high"; break;
-                       case 1: bat_stat = "low"; break;
-                       case 2: bat_stat = "critical"; break;
-                       case 3: bat_stat = "charging"; break;
-                       default: bat_stat = "unknown"; break;
-                       }
-                       printk(KERN_INFO
-                              "apm: AC %s, battery status %s, battery life ",
-                              power_stat, bat_stat);
-                       if ((cx & 0xff) == 0xff)
-                               printk("unknown\n");
-                       else
-                               printk("%d%%\n", cx & 0xff);
-                       if (apm_info.connection_version > 0x100) {
-                               printk(KERN_INFO
-                                      "apm: battery flag 0x%02x, battery life ",
-                                      (cx >> 8) & 0xff);
-                               if (dx == 0xffff)
-                                       printk("unknown\n");
-                               else
-                                       printk("%d %s\n", dx & 0x7fff,
-                                               (dx & 0x8000) ?
-                                               "minutes" : "seconds");
-                       }
-               }
-       }
-
-       /* Install our power off handler.. */
-       if (power_off)
-               pm_power_off = apm_power_off;
-
-       if (num_online_cpus() == 1 || smp) {
-#if defined(CONFIG_APM_DISPLAY_BLANK) && defined(CONFIG_VT)
-               console_blank_hook = apm_console_blank;
-#endif
-               apm_mainloop();
-#if defined(CONFIG_APM_DISPLAY_BLANK) && defined(CONFIG_VT)
-               console_blank_hook = NULL;
-#endif
-       }
-
-       return 0;
-}
-
-#ifndef MODULE
-static int __init apm_setup(char *str)
-{
-       int     invert;
-
-       while ((str != NULL) && (*str != '\0')) {
-               if (strncmp(str, "off", 3) == 0)
-                       apm_disabled = 1;
-               if (strncmp(str, "on", 2) == 0)
-                       apm_disabled = 0;
-               if ((strncmp(str, "bounce-interval=", 16) == 0) ||
-                   (strncmp(str, "bounce_interval=", 16) == 0))
-                       bounce_interval = simple_strtol(str + 16, NULL, 0);
-               if ((strncmp(str, "idle-threshold=", 15) == 0) ||
-                   (strncmp(str, "idle_threshold=", 15) == 0))
-                       idle_threshold = simple_strtol(str + 15, NULL, 0);
-               if ((strncmp(str, "idle-period=", 12) == 0) ||
-                   (strncmp(str, "idle_period=", 12) == 0))
-                       idle_period = simple_strtol(str + 12, NULL, 0);
-               invert = (strncmp(str, "no-", 3) == 0) ||
-                       (strncmp(str, "no_", 3) == 0);
-               if (invert)
-                       str += 3;
-               if (strncmp(str, "debug", 5) == 0)
-                       debug = !invert;
-               if ((strncmp(str, "power-off", 9) == 0) ||
-                   (strncmp(str, "power_off", 9) == 0))
-                       power_off = !invert;
-               if (strncmp(str, "smp", 3) == 0)
-               {
-                       smp = !invert;
-                       idle_threshold = 100;
-               }
-               if ((strncmp(str, "allow-ints", 10) == 0) ||
-                   (strncmp(str, "allow_ints", 10) == 0))
-                       apm_info.allow_ints = !invert;
-               if ((strncmp(str, "broken-psr", 10) == 0) ||
-                   (strncmp(str, "broken_psr", 10) == 0))
-                       apm_info.get_power_status_broken = !invert;
-               if ((strncmp(str, "realmode-power-off", 18) == 0) ||
-                   (strncmp(str, "realmode_power_off", 18) == 0))
-                       apm_info.realmode_power_off = !invert;
-               str = strchr(str, ',');
-               if (str != NULL)
-                       str += strspn(str, ", \t");
-       }
-       return 1;
-}
-
-__setup("apm=", apm_setup);
-#endif
-
-static const struct file_operations apm_bios_fops = {
-       .owner          = THIS_MODULE,
-       .read           = do_read,
-       .poll           = do_poll,
-       .ioctl          = do_ioctl,
-       .open           = do_open,
-       .release        = do_release,
-};
-
-static struct miscdevice apm_device = {
-       APM_MINOR_DEV,
-       "apm_bios",
-       &apm_bios_fops
-};
-
-
-/* Simple "print if true" callback */
-static int __init print_if_true(struct dmi_system_id *d)
-{
-       printk("%s\n", d->ident);
-       return 0;
-}
-
-/*
- * Some Bioses enable the PS/2 mouse (touchpad) at resume, even if it was
- * disabled before the suspend. Linux used to get terribly confused by that.
- */
-static int __init broken_ps2_resume(struct dmi_system_id *d)
-{
-       printk(KERN_INFO "%s machine detected. Mousepad Resume Bug workaround hopefully not needed.\n", d->ident);
-       return 0;
-}
-
-/* Some bioses have a broken protected mode poweroff and need to use realmode */
-static int __init set_realmode_power_off(struct dmi_system_id *d)
-{
-       if (apm_info.realmode_power_off == 0) {
-               apm_info.realmode_power_off = 1;
-               printk(KERN_INFO "%s bios detected. Using realmode poweroff only.\n", d->ident);
-       }
-       return 0;
-}
-
-/* Some laptops require interrupts to be enabled during APM calls */
-static int __init set_apm_ints(struct dmi_system_id *d)
-{
-       if (apm_info.allow_ints == 0) {
-               apm_info.allow_ints = 1;
-               printk(KERN_INFO "%s machine detected. Enabling interrupts during APM calls.\n", d->ident);
-       }
-       return 0;
-}
-
-/* Some APM bioses corrupt memory or just plain do not work */
-static int __init apm_is_horked(struct dmi_system_id *d)
-{
-       if (apm_info.disabled == 0) {
-               apm_info.disabled = 1;
-               printk(KERN_INFO "%s machine detected. Disabling APM.\n", d->ident);
-       }
-       return 0;
-}
-
-static int __init apm_is_horked_d850md(struct dmi_system_id *d)
-{
-       if (apm_info.disabled == 0) {
-               apm_info.disabled = 1;
-               printk(KERN_INFO "%s machine detected. Disabling APM.\n", d->ident);
-               printk(KERN_INFO "This bug is fixed in bios P15 which is available for \n");
-               printk(KERN_INFO "download from support.intel.com \n");
-       }
-       return 0;
-}
-
-/* Some APM bioses hang on APM idle calls */
-static int __init apm_likes_to_melt(struct dmi_system_id *d)
-{
-       if (apm_info.forbid_idle == 0) {
-               apm_info.forbid_idle = 1;
-               printk(KERN_INFO "%s machine detected. Disabling APM idle calls.\n", d->ident);
-       }
-       return 0;
-}
-
-/*
- *  Check for clue free BIOS implementations who use
- *  the following QA technique
- *
- *      [ Write BIOS Code ]<------
- *               |                ^
- *      < Does it Compile >----N--
- *               |Y               ^
- *     < Does it Boot Win98 >-N--
- *               |Y
- *           [Ship It]
- *
- *     Phoenix A04  08/24/2000 is known bad (Dell Inspiron 5000e)
- *     Phoenix A07  09/29/2000 is known good (Dell Inspiron 5000)
- */
-static int __init broken_apm_power(struct dmi_system_id *d)
-{
-       apm_info.get_power_status_broken = 1;
-       printk(KERN_WARNING "BIOS strings suggest APM bugs, disabling power status reporting.\n");
-       return 0;
-}
-
-/*
- * This bios swaps the APM minute reporting bytes over (Many sony laptops
- * have this problem).
- */
-static int __init swab_apm_power_in_minutes(struct dmi_system_id *d)
-{
-       apm_info.get_power_status_swabinminutes = 1;
-       printk(KERN_WARNING "BIOS strings suggest APM reports battery life in minutes and wrong byte order.\n");
-       return 0;
-}
-
-static struct dmi_system_id __initdata apm_dmi_table[] = {
-       {
-               print_if_true,
-               KERN_WARNING "IBM T23 - BIOS 1.03b+ and controller firmware 1.02+ may be needed for Linux APM.",
-               {       DMI_MATCH(DMI_SYS_VENDOR, "IBM"),
-                       DMI_MATCH(DMI_BIOS_VERSION, "1AET38WW (1.01b)"), },
-       },
-       {       /* Handle problems with APM on the C600 */
-               broken_ps2_resume, "Dell Latitude C600",
-               {       DMI_MATCH(DMI_SYS_VENDOR, "Dell"),
-                       DMI_MATCH(DMI_PRODUCT_NAME, "Latitude C600"), },
-       },
-       {       /* Allow interrupts during suspend on Dell Latitude laptops*/
-               set_apm_ints, "Dell Latitude",
-               {       DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"),
-                       DMI_MATCH(DMI_PRODUCT_NAME, "Latitude C510"), }
-       },
-       {       /* APM crashes */
-               apm_is_horked, "Dell Inspiron 2500",
-               {       DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"),
-                       DMI_MATCH(DMI_PRODUCT_NAME, "Inspiron 2500"),
-                       DMI_MATCH(DMI_BIOS_VENDOR,"Phoenix Technologies LTD"),
-                       DMI_MATCH(DMI_BIOS_VERSION,"A11"), },
-       },
-       {       /* Allow interrupts during suspend on Dell Inspiron laptops*/
-               set_apm_ints, "Dell Inspiron", {
-                       DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"),
-                       DMI_MATCH(DMI_PRODUCT_NAME, "Inspiron 4000"), },
-       },
-       {       /* Handle problems with APM on Inspiron 5000e */
-               broken_apm_power, "Dell Inspiron 5000e",
-               {       DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
-                       DMI_MATCH(DMI_BIOS_VERSION, "A04"),
-                       DMI_MATCH(DMI_BIOS_DATE, "08/24/2000"), },
-       },
-       {       /* Handle problems with APM on Inspiron 2500 */
-               broken_apm_power, "Dell Inspiron 2500",
-               {       DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
-                       DMI_MATCH(DMI_BIOS_VERSION, "A12"),
-                       DMI_MATCH(DMI_BIOS_DATE, "02/04/2002"), },
-       },
-       {       /* APM crashes */
-               apm_is_horked, "Dell Dimension 4100",
-               {       DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"),
-                       DMI_MATCH(DMI_PRODUCT_NAME, "XPS-Z"),
-                       DMI_MATCH(DMI_BIOS_VENDOR,"Intel Corp."),
-                       DMI_MATCH(DMI_BIOS_VERSION,"A11"), },
-       },
-       {       /* Allow interrupts during suspend on Compaq Laptops*/
-               set_apm_ints, "Compaq 12XL125",
-               {       DMI_MATCH(DMI_SYS_VENDOR, "Compaq"),
-                       DMI_MATCH(DMI_PRODUCT_NAME, "Compaq PC"),
-                       DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
-                       DMI_MATCH(DMI_BIOS_VERSION,"4.06"), },
-       },
-       {       /* Allow interrupts during APM or the clock goes slow */
-               set_apm_ints, "ASUSTeK",
-               {       DMI_MATCH(DMI_SYS_VENDOR, "ASUSTeK Computer Inc."),
-                       DMI_MATCH(DMI_PRODUCT_NAME, "L8400K series Notebook PC"), },
-       },
-       {       /* APM blows on shutdown */
-               apm_is_horked, "ABIT KX7-333[R]",
-               {       DMI_MATCH(DMI_BOARD_VENDOR, "ABIT"),
-                       DMI_MATCH(DMI_BOARD_NAME, "VT8367-8233A (KX7-333[R])"), },
-       },
-       {       /* APM crashes */
-               apm_is_horked, "Trigem Delhi3",
-               {       DMI_MATCH(DMI_SYS_VENDOR, "TriGem Computer, Inc"),
-                       DMI_MATCH(DMI_PRODUCT_NAME, "Delhi3"), },
-       },
-       {       /* APM crashes */
-               apm_is_horked, "Fujitsu-Siemens",
-               {       DMI_MATCH(DMI_BIOS_VENDOR, "hoenix/FUJITSU SIEMENS"),
-                       DMI_MATCH(DMI_BIOS_VERSION, "Version1.01"), },
-       },
-       {       /* APM crashes */
-               apm_is_horked_d850md, "Intel D850MD",
-               {       DMI_MATCH(DMI_BIOS_VENDOR, "Intel Corp."),
-                       DMI_MATCH(DMI_BIOS_VERSION, "MV85010A.86A.0016.P07.0201251536"), },
-       },
-       {       /* APM crashes */
-               apm_is_horked, "Intel D810EMO",
-               {       DMI_MATCH(DMI_BIOS_VENDOR, "Intel Corp."),
-                       DMI_MATCH(DMI_BIOS_VERSION, "MO81010A.86A.0008.P04.0004170800"), },
-       },
-       {       /* APM crashes */
-               apm_is_horked, "Dell XPS-Z",
-               {       DMI_MATCH(DMI_BIOS_VENDOR, "Intel Corp."),
-                       DMI_MATCH(DMI_BIOS_VERSION, "A11"),
-                       DMI_MATCH(DMI_PRODUCT_NAME, "XPS-Z"), },
-       },
-       {       /* APM crashes */
-               apm_is_horked, "Sharp PC-PJ/AX",
-               {       DMI_MATCH(DMI_SYS_VENDOR, "SHARP"),
-                       DMI_MATCH(DMI_PRODUCT_NAME, "PC-PJ/AX"),
-                       DMI_MATCH(DMI_BIOS_VENDOR,"SystemSoft"),
-                       DMI_MATCH(DMI_BIOS_VERSION,"Version R2.08"), },
-       },
-       {       /* APM crashes */
-               apm_is_horked, "Dell Inspiron 2500",
-               {       DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"),
-                       DMI_MATCH(DMI_PRODUCT_NAME, "Inspiron 2500"),
-                       DMI_MATCH(DMI_BIOS_VENDOR,"Phoenix Technologies LTD"),
-                       DMI_MATCH(DMI_BIOS_VERSION,"A11"), },
-       },
-       {       /* APM idle hangs */
-               apm_likes_to_melt, "Jabil AMD",
-               {       DMI_MATCH(DMI_BIOS_VENDOR, "American Megatrends Inc."),
-                       DMI_MATCH(DMI_BIOS_VERSION, "0AASNP06"), },
-       },
-       {       /* APM idle hangs */
-               apm_likes_to_melt, "AMI Bios",
-               {       DMI_MATCH(DMI_BIOS_VENDOR, "American Megatrends Inc."),
-                       DMI_MATCH(DMI_BIOS_VERSION, "0AASNP05"), },
-       },
-       {       /* Handle problems with APM on Sony Vaio PCG-N505X(DE) */
-               swab_apm_power_in_minutes, "Sony VAIO",
-               {       DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
-                       DMI_MATCH(DMI_BIOS_VERSION, "R0206H"),
-                       DMI_MATCH(DMI_BIOS_DATE, "08/23/99"), },
-       },
-       {       /* Handle problems with APM on Sony Vaio PCG-N505VX */
-               swab_apm_power_in_minutes, "Sony VAIO",
-               {       DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
-                       DMI_MATCH(DMI_BIOS_VERSION, "W2K06H0"),
-                       DMI_MATCH(DMI_BIOS_DATE, "02/03/00"), },
-       },
-       {       /* Handle problems with APM on Sony Vaio PCG-XG29 */
-               swab_apm_power_in_minutes, "Sony VAIO",
-               {       DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
-                       DMI_MATCH(DMI_BIOS_VERSION, "R0117A0"),
-                       DMI_MATCH(DMI_BIOS_DATE, "04/25/00"), },
-       },
-       {       /* Handle problems with APM on Sony Vaio PCG-Z600NE */
-               swab_apm_power_in_minutes, "Sony VAIO",
-               {       DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
-                       DMI_MATCH(DMI_BIOS_VERSION, "R0121Z1"),
-                       DMI_MATCH(DMI_BIOS_DATE, "05/11/00"), },
-       },
-       {       /* Handle problems with APM on Sony Vaio PCG-Z600NE */
-               swab_apm_power_in_minutes, "Sony VAIO",
-               {       DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
-                       DMI_MATCH(DMI_BIOS_VERSION, "WME01Z1"),
-                       DMI_MATCH(DMI_BIOS_DATE, "08/11/00"), },
-       },
-       {       /* Handle problems with APM on Sony Vaio PCG-Z600LEK(DE) */
-               swab_apm_power_in_minutes, "Sony VAIO",
-               {       DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
-                       DMI_MATCH(DMI_BIOS_VERSION, "R0206Z3"),
-                       DMI_MATCH(DMI_BIOS_DATE, "12/25/00"), },
-       },
-       {       /* Handle problems with APM on Sony Vaio PCG-Z505LS */
-               swab_apm_power_in_minutes, "Sony VAIO",
-               {       DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
-                       DMI_MATCH(DMI_BIOS_VERSION, "R0203D0"),
-                       DMI_MATCH(DMI_BIOS_DATE, "05/12/00"), },
-       },
-       {       /* Handle problems with APM on Sony Vaio PCG-Z505LS */
-               swab_apm_power_in_minutes, "Sony VAIO",
-               {       DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
-                       DMI_MATCH(DMI_BIOS_VERSION, "R0203Z3"),
-                       DMI_MATCH(DMI_BIOS_DATE, "08/25/00"), },
-       },
-       {       /* Handle problems with APM on Sony Vaio PCG-Z505LS (with updated BIOS) */
-               swab_apm_power_in_minutes, "Sony VAIO",
-               {       DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
-                       DMI_MATCH(DMI_BIOS_VERSION, "R0209Z3"),
-                       DMI_MATCH(DMI_BIOS_DATE, "05/12/01"), },
-       },
-       {       /* Handle problems with APM on Sony Vaio PCG-F104K */
-               swab_apm_power_in_minutes, "Sony VAIO",
-               {       DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
-                       DMI_MATCH(DMI_BIOS_VERSION, "R0204K2"),
-                       DMI_MATCH(DMI_BIOS_DATE, "08/28/00"), },
-       },
-
-       {       /* Handle problems with APM on Sony Vaio PCG-C1VN/C1VE */
-               swab_apm_power_in_minutes, "Sony VAIO",
-               {       DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
-                       DMI_MATCH(DMI_BIOS_VERSION, "R0208P1"),
-                       DMI_MATCH(DMI_BIOS_DATE, "11/09/00"), },
-       },
-       {       /* Handle problems with APM on Sony Vaio PCG-C1VE */
-               swab_apm_power_in_minutes, "Sony VAIO",
-               {       DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
-                       DMI_MATCH(DMI_BIOS_VERSION, "R0204P1"),
-                       DMI_MATCH(DMI_BIOS_DATE, "09/12/00"), },
-       },
-       {       /* Handle problems with APM on Sony Vaio PCG-C1VE */
-               swab_apm_power_in_minutes, "Sony VAIO",
-               {       DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
-                       DMI_MATCH(DMI_BIOS_VERSION, "WXPO1Z3"),
-                       DMI_MATCH(DMI_BIOS_DATE, "10/26/01"), },
-       },
-       {       /* broken PM poweroff bios */
-               set_realmode_power_off, "Award Software v4.60 PGMA",
-               {       DMI_MATCH(DMI_BIOS_VENDOR, "Award Software International, Inc."),
-                       DMI_MATCH(DMI_BIOS_VERSION, "4.60 PGMA"),
-                       DMI_MATCH(DMI_BIOS_DATE, "134526184"), },
-       },
-
-       /* Generic per vendor APM settings  */
-
-       {       /* Allow interrupts during suspend on IBM laptops */
-               set_apm_ints, "IBM",
-               {       DMI_MATCH(DMI_SYS_VENDOR, "IBM"), },
-       },
-
-       { }
-};
-
-/*
- * Just start the APM thread. We do NOT want to do APM BIOS
- * calls from anything but the APM thread, if for no other reason
- * than the fact that we don't trust the APM BIOS. This way,
- * most common APM BIOS problems that lead to protection errors
- * etc will have at least some level of being contained...
- *
- * In short, if something bad happens, at least we have a choice
- * of just killing the apm thread..
- */
-static int __init apm_init(void)
-{
-       struct proc_dir_entry *apm_proc;
-       struct desc_struct *gdt;
-       int err;
-
-       dmi_check_system(apm_dmi_table);
-
-       if (apm_info.bios.version == 0 || paravirt_enabled()) {
-               printk(KERN_INFO "apm: BIOS not found.\n");
-               return -ENODEV;
-       }
-       printk(KERN_INFO
-               "apm: BIOS version %d.%d Flags 0x%02x (Driver version %s)\n",
-               ((apm_info.bios.version >> 8) & 0xff),
-               (apm_info.bios.version & 0xff),
-               apm_info.bios.flags,
-               driver_version);
-       if ((apm_info.bios.flags & APM_32_BIT_SUPPORT) == 0) {
-               printk(KERN_INFO "apm: no 32 bit BIOS support\n");
-               return -ENODEV;
-       }
-
-       if (allow_ints)
-               apm_info.allow_ints = 1;
-       if (broken_psr)
-               apm_info.get_power_status_broken = 1;
-       if (realmode_power_off)
-               apm_info.realmode_power_off = 1;
-       /* User can override, but default is to trust DMI */
-       if (apm_disabled != -1)
-               apm_info.disabled = apm_disabled;
-
-       /*
-        * Fix for the Compaq Contura 3/25c which reports BIOS version 0.1
-        * but is reportedly a 1.0 BIOS.
-        */
-       if (apm_info.bios.version == 0x001)
-               apm_info.bios.version = 0x100;
-
-       /* BIOS < 1.2 doesn't set cseg_16_len */
-       if (apm_info.bios.version < 0x102)
-               apm_info.bios.cseg_16_len = 0; /* 64k */
-
-       if (debug) {
-               printk(KERN_INFO "apm: entry %x:%x cseg16 %x dseg %x",
-                       apm_info.bios.cseg, apm_info.bios.offset,
-                       apm_info.bios.cseg_16, apm_info.bios.dseg);
-               if (apm_info.bios.version > 0x100)
-                       printk(" cseg len %x, dseg len %x",
-                               apm_info.bios.cseg_len,
-                               apm_info.bios.dseg_len);
-               if (apm_info.bios.version > 0x101)
-                       printk(" cseg16 len %x", apm_info.bios.cseg_16_len);
-               printk("\n");
-       }
-
-       if (apm_info.disabled) {
-               printk(KERN_NOTICE "apm: disabled on user request.\n");
-               return -ENODEV;
-       }
-       if ((num_online_cpus() > 1) && !power_off && !smp) {
-               printk(KERN_NOTICE "apm: disabled - APM is not SMP safe.\n");
-               apm_info.disabled = 1;
-               return -ENODEV;
-       }
-       if (PM_IS_ACTIVE()) {
-               printk(KERN_NOTICE "apm: overridden by ACPI.\n");
-               apm_info.disabled = 1;
-               return -ENODEV;
-       }
-#ifdef CONFIG_PM_LEGACY
-       pm_active = 1;
-#endif
-
-       /*
-        * Set up a segment that references the real mode segment 0x40
-        * that extends up to the end of page zero (that we have reserved).
-        * This is for buggy BIOS's that refer to (real mode) segment 0x40
-        * even though they are called in protected mode.
-        */
-       set_base(bad_bios_desc, __va((unsigned long)0x40 << 4));
-       _set_limit((char *)&bad_bios_desc, 4095 - (0x40 << 4));
-
-       /*
-        * Set up the long jump entry point to the APM BIOS, which is called
-        * from inline assembly.
-        */
-       apm_bios_entry.offset = apm_info.bios.offset;
-       apm_bios_entry.segment = APM_CS;
-
-       /*
-        * The APM 1.1 BIOS is supposed to provide limit information that it
-        * recognizes.  Many machines do this correctly, but many others do
-        * not restrict themselves to their claimed limit.  When this happens,
-        * they will cause a segmentation violation in the kernel at boot time.
-        * Most BIOS's, however, will respect a 64k limit, so we use that.
-        *
-        * Note we only set APM segments on CPU zero, since we pin the APM
-        * code to that CPU.
-        */
-       gdt = get_cpu_gdt_table(0);
-       set_base(gdt[APM_CS >> 3],
-                __va((unsigned long)apm_info.bios.cseg << 4));
-       set_base(gdt[APM_CS_16 >> 3],
-                __va((unsigned long)apm_info.bios.cseg_16 << 4));
-       set_base(gdt[APM_DS >> 3],
-                __va((unsigned long)apm_info.bios.dseg << 4));
-
-       apm_proc = create_proc_entry("apm", 0, NULL);
-       if (apm_proc)
-               apm_proc->proc_fops = &apm_file_ops;
-
-       kapmd_task = kthread_create(apm, NULL, "kapmd");
-       if (IS_ERR(kapmd_task)) {
-               printk(KERN_ERR "apm: disabled - Unable to start kernel "
-                               "thread.\n");
-               err = PTR_ERR(kapmd_task);
-               kapmd_task = NULL;
-               remove_proc_entry("apm", NULL);
-               return err;
-       }
-       wake_up_process(kapmd_task);
-
-       if (num_online_cpus() > 1 && !smp ) {
-               printk(KERN_NOTICE
-                  "apm: disabled - APM is not SMP safe (power off active).\n");
-               return 0;
-       }
-
-       /*
-        * Note we don't actually care if the misc_device cannot be registered.
-        * this driver can do its job without it, even if userspace can't
-        * control it.  just log the error
-        */
-       if (misc_register(&apm_device))
-               printk(KERN_WARNING "apm: Could not register misc device.\n");
-
-       if (HZ != 100)
-               idle_period = (idle_period * HZ) / 100;
-       if (idle_threshold < 100) {
-               original_pm_idle = pm_idle;
-               pm_idle  = apm_cpu_idle;
-               set_pm_idle = 1;
-       }
-
-       return 0;
-}
-
-static void __exit apm_exit(void)
-{
-       int     error;
-
-       if (set_pm_idle) {
-               pm_idle = original_pm_idle;
-               /*
-                * We are about to unload the current idle thread pm callback
-                * (pm_idle), Wait for all processors to update cached/local
-                * copies of pm_idle before proceeding.
-                */
-               cpu_idle_wait();
-       }
-       if (((apm_info.bios.flags & APM_BIOS_DISENGAGED) == 0)
-           && (apm_info.connection_version > 0x0100)) {
-               error = apm_engage_power_management(APM_DEVICE_ALL, 0);
-               if (error)
-                       apm_error("disengage power management", error);
-       }
-       misc_deregister(&apm_device);
-       remove_proc_entry("apm", NULL);
-       if (power_off)
-               pm_power_off = NULL;
-       if (kapmd_task) {
-               kthread_stop(kapmd_task);
-               kapmd_task = NULL;
-       }
-#ifdef CONFIG_PM_LEGACY
-       pm_active = 0;
-#endif
-}
-
-module_init(apm_init);
-module_exit(apm_exit);
-
-MODULE_AUTHOR("Stephen Rothwell");
-MODULE_DESCRIPTION("Advanced Power Management");
-MODULE_LICENSE("GPL");
-module_param(debug, bool, 0644);
-MODULE_PARM_DESC(debug, "Enable debug mode");
-module_param(power_off, bool, 0444);
-MODULE_PARM_DESC(power_off, "Enable power off");
-module_param(bounce_interval, int, 0444);
-MODULE_PARM_DESC(bounce_interval,
-               "Set the number of ticks to ignore suspend bounces");
-module_param(allow_ints, bool, 0444);
-MODULE_PARM_DESC(allow_ints, "Allow interrupts during BIOS calls");
-module_param(broken_psr, bool, 0444);
-MODULE_PARM_DESC(broken_psr, "BIOS has a broken GetPowerStatus call");
-module_param(realmode_power_off, bool, 0444);
-MODULE_PARM_DESC(realmode_power_off,
-               "Switch to real mode before powering off");
-module_param(idle_threshold, int, 0444);
-MODULE_PARM_DESC(idle_threshold,
-       "System idle percentage above which to make APM BIOS idle calls");
-module_param(idle_period, int, 0444);
-MODULE_PARM_DESC(idle_period,
-       "Period (in sec/100) over which to caculate the idle percentage");
-module_param(smp, bool, 0444);
-MODULE_PARM_DESC(smp,
-       "Set this to enable APM use on an SMP platform. Use with caution on older systems");
-MODULE_ALIAS_MISCDEV(APM_MINOR_DEV);
diff --git a/arch/i386/kernel/asm-offsets.c b/arch/i386/kernel/asm-offsets.c
deleted file mode 100644 (file)
index cfa82c8..0000000
+++ /dev/null
@@ -1,5 +0,0 @@
-#ifdef CONFIG_X86_32
-# include "asm-offsets_32.c"
-#else
-# include "asm-offsets_64.c"
-#endif
diff --git a/arch/i386/kernel/asm-offsets_32.c b/arch/i386/kernel/asm-offsets_32.c
deleted file mode 100644 (file)
index 8029742..0000000
+++ /dev/null
@@ -1,147 +0,0 @@
-/*
- * Generate definitions needed by assembly language modules.
- * This code generates raw asm output which is post-processed
- * to extract and format the required data.
- */
-
-#include <linux/crypto.h>
-#include <linux/sched.h>
-#include <linux/signal.h>
-#include <linux/personality.h>
-#include <linux/suspend.h>
-#include <asm/ucontext.h>
-#include "sigframe_32.h"
-#include <asm/pgtable.h>
-#include <asm/fixmap.h>
-#include <asm/processor.h>
-#include <asm/thread_info.h>
-#include <asm/elf.h>
-
-#include <xen/interface/xen.h>
-
-#ifdef CONFIG_LGUEST_GUEST
-#include <linux/lguest.h>
-#include "../../../drivers/lguest/lg.h"
-#endif
-
-#define DEFINE(sym, val) \
-        asm volatile("\n->" #sym " %0 " #val : : "i" (val))
-
-#define BLANK() asm volatile("\n->" : : )
-
-#define OFFSET(sym, str, mem) \
-       DEFINE(sym, offsetof(struct str, mem));
-
-/* workaround for a warning with -Wmissing-prototypes */
-void foo(void);
-
-void foo(void)
-{
-       OFFSET(SIGCONTEXT_eax, sigcontext, eax);
-       OFFSET(SIGCONTEXT_ebx, sigcontext, ebx);
-       OFFSET(SIGCONTEXT_ecx, sigcontext, ecx);
-       OFFSET(SIGCONTEXT_edx, sigcontext, edx);
-       OFFSET(SIGCONTEXT_esi, sigcontext, esi);
-       OFFSET(SIGCONTEXT_edi, sigcontext, edi);
-       OFFSET(SIGCONTEXT_ebp, sigcontext, ebp);
-       OFFSET(SIGCONTEXT_esp, sigcontext, esp);
-       OFFSET(SIGCONTEXT_eip, sigcontext, eip);
-       BLANK();
-
-       OFFSET(CPUINFO_x86, cpuinfo_x86, x86);
-       OFFSET(CPUINFO_x86_vendor, cpuinfo_x86, x86_vendor);
-       OFFSET(CPUINFO_x86_model, cpuinfo_x86, x86_model);
-       OFFSET(CPUINFO_x86_mask, cpuinfo_x86, x86_mask);
-       OFFSET(CPUINFO_hard_math, cpuinfo_x86, hard_math);
-       OFFSET(CPUINFO_cpuid_level, cpuinfo_x86, cpuid_level);
-       OFFSET(CPUINFO_x86_capability, cpuinfo_x86, x86_capability);
-       OFFSET(CPUINFO_x86_vendor_id, cpuinfo_x86, x86_vendor_id);
-       BLANK();
-
-       OFFSET(TI_task, thread_info, task);
-       OFFSET(TI_exec_domain, thread_info, exec_domain);
-       OFFSET(TI_flags, thread_info, flags);
-       OFFSET(TI_status, thread_info, status);
-       OFFSET(TI_preempt_count, thread_info, preempt_count);
-       OFFSET(TI_addr_limit, thread_info, addr_limit);
-       OFFSET(TI_restart_block, thread_info, restart_block);
-       OFFSET(TI_sysenter_return, thread_info, sysenter_return);
-       OFFSET(TI_cpu, thread_info, cpu);
-       BLANK();
-
-       OFFSET(GDS_size, Xgt_desc_struct, size);
-       OFFSET(GDS_address, Xgt_desc_struct, address);
-       OFFSET(GDS_pad, Xgt_desc_struct, pad);
-       BLANK();
-
-       OFFSET(PT_EBX, pt_regs, ebx);
-       OFFSET(PT_ECX, pt_regs, ecx);
-       OFFSET(PT_EDX, pt_regs, edx);
-       OFFSET(PT_ESI, pt_regs, esi);
-       OFFSET(PT_EDI, pt_regs, edi);
-       OFFSET(PT_EBP, pt_regs, ebp);
-       OFFSET(PT_EAX, pt_regs, eax);
-       OFFSET(PT_DS,  pt_regs, xds);
-       OFFSET(PT_ES,  pt_regs, xes);
-       OFFSET(PT_FS,  pt_regs, xfs);
-       OFFSET(PT_ORIG_EAX, pt_regs, orig_eax);
-       OFFSET(PT_EIP, pt_regs, eip);
-       OFFSET(PT_CS,  pt_regs, xcs);
-       OFFSET(PT_EFLAGS, pt_regs, eflags);
-       OFFSET(PT_OLDESP, pt_regs, esp);
-       OFFSET(PT_OLDSS,  pt_regs, xss);
-       BLANK();
-
-       OFFSET(EXEC_DOMAIN_handler, exec_domain, handler);
-       OFFSET(RT_SIGFRAME_sigcontext, rt_sigframe, uc.uc_mcontext);
-       BLANK();
-
-       OFFSET(pbe_address, pbe, address);
-       OFFSET(pbe_orig_address, pbe, orig_address);
-       OFFSET(pbe_next, pbe, next);
-
-       /* Offset from the sysenter stack to tss.esp0 */
-       DEFINE(TSS_sysenter_esp0, offsetof(struct tss_struct, x86_tss.esp0) -
-                sizeof(struct tss_struct));
-
-       DEFINE(PAGE_SIZE_asm, PAGE_SIZE);
-       DEFINE(PAGE_SHIFT_asm, PAGE_SHIFT);
-       DEFINE(PTRS_PER_PTE, PTRS_PER_PTE);
-       DEFINE(PTRS_PER_PMD, PTRS_PER_PMD);
-       DEFINE(PTRS_PER_PGD, PTRS_PER_PGD);
-
-       DEFINE(VDSO_PRELINK_asm, VDSO_PRELINK);
-
-       OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx);
-
-#ifdef CONFIG_PARAVIRT
-       BLANK();
-       OFFSET(PARAVIRT_enabled, paravirt_ops, paravirt_enabled);
-       OFFSET(PARAVIRT_irq_disable, paravirt_ops, irq_disable);
-       OFFSET(PARAVIRT_irq_enable, paravirt_ops, irq_enable);
-       OFFSET(PARAVIRT_irq_enable_sysexit, paravirt_ops, irq_enable_sysexit);
-       OFFSET(PARAVIRT_iret, paravirt_ops, iret);
-       OFFSET(PARAVIRT_read_cr0, paravirt_ops, read_cr0);
-#endif
-
-#ifdef CONFIG_XEN
-       BLANK();
-       OFFSET(XEN_vcpu_info_mask, vcpu_info, evtchn_upcall_mask);
-       OFFSET(XEN_vcpu_info_pending, vcpu_info, evtchn_upcall_pending);
-#endif
-
-#ifdef CONFIG_LGUEST_GUEST
-       BLANK();
-       OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled);
-       OFFSET(LGUEST_PAGES_host_gdt_desc, lguest_pages, state.host_gdt_desc);
-       OFFSET(LGUEST_PAGES_host_idt_desc, lguest_pages, state.host_idt_desc);
-       OFFSET(LGUEST_PAGES_host_cr3, lguest_pages, state.host_cr3);
-       OFFSET(LGUEST_PAGES_host_sp, lguest_pages, state.host_sp);
-       OFFSET(LGUEST_PAGES_guest_gdt_desc, lguest_pages,state.guest_gdt_desc);
-       OFFSET(LGUEST_PAGES_guest_idt_desc, lguest_pages,state.guest_idt_desc);
-       OFFSET(LGUEST_PAGES_guest_gdt, lguest_pages, state.guest_gdt);
-       OFFSET(LGUEST_PAGES_regs_trapnum, lguest_pages, regs.trapnum);
-       OFFSET(LGUEST_PAGES_regs_errcode, lguest_pages, regs.errcode);
-       OFFSET(LGUEST_PAGES_regs, lguest_pages, regs);
-#endif
-}
diff --git a/arch/i386/kernel/bootflag.c b/arch/i386/kernel/bootflag.c
deleted file mode 100644 (file)
index 0b98605..0000000
+++ /dev/null
@@ -1,98 +0,0 @@
-/*
- *     Implement 'Simple Boot Flag Specification 2.0'
- */
-
-
-#include <linux/types.h>
-#include <linux/kernel.h>
-#include <linux/init.h>
-#include <linux/string.h>
-#include <linux/slab.h>
-#include <linux/spinlock.h>
-#include <linux/acpi.h>
-#include <asm/io.h>
-
-#include <linux/mc146818rtc.h>
-
-
-#define SBF_RESERVED (0x78)
-#define SBF_PNPOS    (1<<0)
-#define SBF_BOOTING  (1<<1)
-#define SBF_DIAG     (1<<2)
-#define SBF_PARITY   (1<<7)
-
-
-int sbf_port __initdata = -1;  /* set via acpi_boot_init() */
-
-
-static int __init parity(u8 v)
-{
-       int x = 0;
-       int i;
-       
-       for(i=0;i<8;i++)
-       {
-               x^=(v&1);
-               v>>=1;
-       }
-       return x;
-}
-
-static void __init sbf_write(u8 v)
-{
-       unsigned long flags;
-       if(sbf_port != -1)
-       {
-               v &= ~SBF_PARITY;
-               if(!parity(v))
-                       v|=SBF_PARITY;
-
-               printk(KERN_INFO "Simple Boot Flag at 0x%x set to 0x%x\n", sbf_port, v);
-
-               spin_lock_irqsave(&rtc_lock, flags);
-               CMOS_WRITE(v, sbf_port);
-               spin_unlock_irqrestore(&rtc_lock, flags);
-       }
-}
-
-static u8 __init sbf_read(void)
-{
-       u8 v;
-       unsigned long flags;
-       if(sbf_port == -1)
-               return 0;
-       spin_lock_irqsave(&rtc_lock, flags);
-       v = CMOS_READ(sbf_port);
-       spin_unlock_irqrestore(&rtc_lock, flags);
-       return v;
-}
-
-static int __init sbf_value_valid(u8 v)
-{
-       if(v&SBF_RESERVED)              /* Reserved bits */
-               return 0;
-       if(!parity(v))
-               return 0;
-       return 1;
-}
-
-static int __init sbf_init(void)
-{
-       u8 v;
-       if(sbf_port == -1)
-               return 0;
-       v = sbf_read();
-       if(!sbf_value_valid(v))
-               printk(KERN_WARNING "Simple Boot Flag value 0x%x read from CMOS RAM was invalid\n",v);
-
-       v &= ~SBF_RESERVED;
-       v &= ~SBF_BOOTING;
-       v &= ~SBF_DIAG;
-#if defined(CONFIG_ISAPNP)
-       v |= SBF_PNPOS;
-#endif
-       sbf_write(v);
-       return 0;
-}
-
-module_init(sbf_init);
diff --git a/arch/i386/kernel/cpuid.c b/arch/i386/kernel/cpuid.c
deleted file mode 100644 (file)
index 5c2faa1..0000000
+++ /dev/null
@@ -1,242 +0,0 @@
-/* ----------------------------------------------------------------------- *
- *   
- *   Copyright 2000 H. Peter Anvin - All Rights Reserved
- *
- *   This program is free software; you can redistribute it and/or modify
- *   it under the terms of the GNU General Public License as published by
- *   the Free Software Foundation, Inc., 675 Mass Ave, Cambridge MA 02139,
- *   USA; either version 2 of the License, or (at your option) any later
- *   version; incorporated herein by reference.
- *
- * ----------------------------------------------------------------------- */
-
-/*
- * cpuid.c
- *
- * x86 CPUID access device
- *
- * This device is accessed by lseek() to the appropriate CPUID level
- * and then read in chunks of 16 bytes.  A larger size means multiple
- * reads of consecutive levels.
- *
- * This driver uses /dev/cpu/%d/cpuid where %d is the minor number, and on
- * an SMP box will direct the access to CPU %d.
- */
-
-#include <linux/module.h>
-
-#include <linux/types.h>
-#include <linux/errno.h>
-#include <linux/fcntl.h>
-#include <linux/init.h>
-#include <linux/poll.h>
-#include <linux/smp.h>
-#include <linux/major.h>
-#include <linux/fs.h>
-#include <linux/smp_lock.h>
-#include <linux/device.h>
-#include <linux/cpu.h>
-#include <linux/notifier.h>
-
-#include <asm/processor.h>
-#include <asm/msr.h>
-#include <asm/uaccess.h>
-#include <asm/system.h>
-
-static struct class *cpuid_class;
-
-#ifdef CONFIG_SMP
-
-struct cpuid_command {
-       u32 reg;
-       u32 *data;
-};
-
-static void cpuid_smp_cpuid(void *cmd_block)
-{
-       struct cpuid_command *cmd = (struct cpuid_command *)cmd_block;
-
-       cpuid(cmd->reg, &cmd->data[0], &cmd->data[1], &cmd->data[2],
-                     &cmd->data[3]);
-}
-
-static inline void do_cpuid(int cpu, u32 reg, u32 * data)
-{
-       struct cpuid_command cmd;
-
-       preempt_disable();
-       if (cpu == smp_processor_id()) {
-               cpuid(reg, &data[0], &data[1], &data[2], &data[3]);
-       } else {
-               cmd.reg = reg;
-               cmd.data = data;
-
-               smp_call_function_single(cpu, cpuid_smp_cpuid, &cmd, 1, 1);
-       }
-       preempt_enable();
-}
-#else                          /* ! CONFIG_SMP */
-
-static inline void do_cpuid(int cpu, u32 reg, u32 * data)
-{
-       cpuid(reg, &data[0], &data[1], &data[2], &data[3]);
-}
-
-#endif                         /* ! CONFIG_SMP */
-
-static loff_t cpuid_seek(struct file *file, loff_t offset, int orig)
-{
-       loff_t ret;
-
-       lock_kernel();
-
-       switch (orig) {
-       case 0:
-               file->f_pos = offset;
-               ret = file->f_pos;
-               break;
-       case 1:
-               file->f_pos += offset;
-               ret = file->f_pos;
-               break;
-       default:
-               ret = -EINVAL;
-       }
-
-       unlock_kernel();
-       return ret;
-}
-
-static ssize_t cpuid_read(struct file *file, char __user *buf,
-                         size_t count, loff_t * ppos)
-{
-       char __user *tmp = buf;
-       u32 data[4];
-       u32 reg = *ppos;
-       int cpu = iminor(file->f_path.dentry->d_inode);
-
-       if (count % 16)
-               return -EINVAL; /* Invalid chunk size */
-
-       for (; count; count -= 16) {
-               do_cpuid(cpu, reg, data);
-               if (copy_to_user(tmp, &data, 16))
-                       return -EFAULT;
-               tmp += 16;
-               *ppos = reg++;
-       }
-
-       return tmp - buf;
-}
-
-static int cpuid_open(struct inode *inode, struct file *file)
-{
-       unsigned int cpu = iminor(file->f_path.dentry->d_inode);
-       struct cpuinfo_x86 *c = &(cpu_data)[cpu];
-
-       if (cpu >= NR_CPUS || !cpu_online(cpu))
-               return -ENXIO;  /* No such CPU */
-       if (c->cpuid_level < 0)
-               return -EIO;    /* CPUID not supported */
-
-       return 0;
-}
-
-/*
- * File operations we support
- */
-static const struct file_operations cpuid_fops = {
-       .owner = THIS_MODULE,
-       .llseek = cpuid_seek,
-       .read = cpuid_read,
-       .open = cpuid_open,
-};
-
-static int cpuid_device_create(int i)
-{
-       int err = 0;
-       struct device *dev;
-
-       dev = device_create(cpuid_class, NULL, MKDEV(CPUID_MAJOR, i), "cpu%d",i);
-       if (IS_ERR(dev))
-               err = PTR_ERR(dev);
-       return err;
-}
-
-static int cpuid_class_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
-{
-       unsigned int cpu = (unsigned long)hcpu;
-
-       switch (action) {
-       case CPU_ONLINE:
-       case CPU_ONLINE_FROZEN:
-               cpuid_device_create(cpu);
-               break;
-       case CPU_DEAD:
-       case CPU_DEAD_FROZEN:
-               device_destroy(cpuid_class, MKDEV(CPUID_MAJOR, cpu));
-               break;
-       }
-       return NOTIFY_OK;
-}
-
-static struct notifier_block __cpuinitdata cpuid_class_cpu_notifier =
-{
-       .notifier_call = cpuid_class_cpu_callback,
-};
-
-static int __init cpuid_init(void)
-{
-       int i, err = 0;
-       i = 0;
-
-       if (register_chrdev(CPUID_MAJOR, "cpu/cpuid", &cpuid_fops)) {
-               printk(KERN_ERR "cpuid: unable to get major %d for cpuid\n",
-                      CPUID_MAJOR);
-               err = -EBUSY;
-               goto out;
-       }
-       cpuid_class = class_create(THIS_MODULE, "cpuid");
-       if (IS_ERR(cpuid_class)) {
-               err = PTR_ERR(cpuid_class);
-               goto out_chrdev;
-       }
-       for_each_online_cpu(i) {
-               err = cpuid_device_create(i);
-               if (err != 0) 
-                       goto out_class;
-       }
-       register_hotcpu_notifier(&cpuid_class_cpu_notifier);
-
-       err = 0;
-       goto out;
-
-out_class:
-       i = 0;
-       for_each_online_cpu(i) {
-               device_destroy(cpuid_class, MKDEV(CPUID_MAJOR, i));
-       }
-       class_destroy(cpuid_class);
-out_chrdev:
-       unregister_chrdev(CPUID_MAJOR, "cpu/cpuid");    
-out:
-       return err;
-}
-
-static void __exit cpuid_exit(void)
-{
-       int cpu = 0;
-
-       for_each_online_cpu(cpu)
-               device_destroy(cpuid_class, MKDEV(CPUID_MAJOR, cpu));
-       class_destroy(cpuid_class);
-       unregister_chrdev(CPUID_MAJOR, "cpu/cpuid");
-       unregister_hotcpu_notifier(&cpuid_class_cpu_notifier);
-}
-
-module_init(cpuid_init);
-module_exit(cpuid_exit);
-
-MODULE_AUTHOR("H. Peter Anvin <hpa@zytor.com>");
-MODULE_DESCRIPTION("x86 generic CPUID driver");
-MODULE_LICENSE("GPL");
diff --git a/arch/i386/kernel/crash_32.c b/arch/i386/kernel/crash_32.c
deleted file mode 100644 (file)
index 53589d1..0000000
+++ /dev/null
@@ -1,137 +0,0 @@
-/*
- * Architecture specific (i386) functions for kexec based crash dumps.
- *
- * Created by: Hariprasad Nellitheertha (hari@in.ibm.com)
- *
- * Copyright (C) IBM Corporation, 2004. All rights reserved.
- *
- */
-
-#include <linux/init.h>
-#include <linux/types.h>
-#include <linux/kernel.h>
-#include <linux/smp.h>
-#include <linux/reboot.h>
-#include <linux/kexec.h>
-#include <linux/delay.h>
-#include <linux/elf.h>
-#include <linux/elfcore.h>
-
-#include <asm/processor.h>
-#include <asm/hardirq.h>
-#include <asm/nmi.h>
-#include <asm/hw_irq.h>
-#include <asm/apic.h>
-#include <linux/kdebug.h>
-#include <asm/smp.h>
-
-#include <mach_ipi.h>
-
-
-/* This keeps a track of which one is crashing cpu. */
-static int crashing_cpu;
-
-#if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC)
-static atomic_t waiting_for_crash_ipi;
-
-static int crash_nmi_callback(struct notifier_block *self,
-                       unsigned long val, void *data)
-{
-       struct pt_regs *regs;
-       struct pt_regs fixed_regs;
-       int cpu;
-
-       if (val != DIE_NMI_IPI)
-               return NOTIFY_OK;
-
-       regs = ((struct die_args *)data)->regs;
-       cpu = raw_smp_processor_id();
-
-       /* Don't do anything if this handler is invoked on crashing cpu.
-        * Otherwise, system will completely hang. Crashing cpu can get
-        * an NMI if system was initially booted with nmi_watchdog parameter.
-        */
-       if (cpu == crashing_cpu)
-               return NOTIFY_STOP;
-       local_irq_disable();
-
-       if (!user_mode_vm(regs)) {
-               crash_fixup_ss_esp(&fixed_regs, regs);
-               regs = &fixed_regs;
-       }
-       crash_save_cpu(regs, cpu);
-       disable_local_APIC();
-       atomic_dec(&waiting_for_crash_ipi);
-       /* Assume hlt works */
-       halt();
-       for (;;)
-               cpu_relax();
-
-       return 1;
-}
-
-static void smp_send_nmi_allbutself(void)
-{
-       cpumask_t mask = cpu_online_map;
-       cpu_clear(safe_smp_processor_id(), mask);
-       if (!cpus_empty(mask))
-               send_IPI_mask(mask, NMI_VECTOR);
-}
-
-static struct notifier_block crash_nmi_nb = {
-       .notifier_call = crash_nmi_callback,
-};
-
-static void nmi_shootdown_cpus(void)
-{
-       unsigned long msecs;
-
-       atomic_set(&waiting_for_crash_ipi, num_online_cpus() - 1);
-       /* Would it be better to replace the trap vector here? */
-       if (register_die_notifier(&crash_nmi_nb))
-               return;         /* return what? */
-       /* Ensure the new callback function is set before sending
-        * out the NMI
-        */
-       wmb();
-
-       smp_send_nmi_allbutself();
-
-       msecs = 1000; /* Wait at most a second for the other cpus to stop */
-       while ((atomic_read(&waiting_for_crash_ipi) > 0) && msecs) {
-               mdelay(1);
-               msecs--;
-       }
-
-       /* Leave the nmi callback set */
-       disable_local_APIC();
-}
-#else
-static void nmi_shootdown_cpus(void)
-{
-       /* There are no cpus to shootdown */
-}
-#endif
-
-void machine_crash_shutdown(struct pt_regs *regs)
-{
-       /* This function is only called after the system
-        * has panicked or is otherwise in a critical state.
-        * The minimum amount of code to allow a kexec'd kernel
-        * to run successfully needs to happen here.
-        *
-        * In practice this means shooting down the other cpus in
-        * an SMP system.
-        */
-       /* The kernel is broken so disable interrupts */
-       local_irq_disable();
-
-       /* Make a note of crashing cpu. Will be used in NMI callback.*/
-       crashing_cpu = safe_smp_processor_id();
-       nmi_shootdown_cpus();
-       lapic_shutdown();
-#if defined(CONFIG_X86_IO_APIC)
-       disable_IO_APIC();
-#endif
-       crash_save_cpu(regs, safe_smp_processor_id());
-}
diff --git a/arch/i386/kernel/crash_dump_32.c b/arch/i386/kernel/crash_dump_32.c
deleted file mode 100644 (file)
index 3f532df..0000000
+++ /dev/null
@@ -1,74 +0,0 @@
-/*
- *     kernel/crash_dump.c - Memory preserving reboot related code.
- *
- *     Created by: Hariprasad Nellitheertha (hari@in.ibm.com)
- *     Copyright (C) IBM Corporation, 2004. All rights reserved
- */
-
-#include <linux/errno.h>
-#include <linux/highmem.h>
-#include <linux/crash_dump.h>
-
-#include <asm/uaccess.h>
-
-static void *kdump_buf_page;
-
-/**
- * copy_oldmem_page - copy one page from "oldmem"
- * @pfn: page frame number to be copied
- * @buf: target memory address for the copy; this can be in kernel address
- *     space or user address space (see @userbuf)
- * @csize: number of bytes to copy
- * @offset: offset in bytes into the page (based on pfn) to begin the copy
- * @userbuf: if set, @buf is in user address space, use copy_to_user(),
- *     otherwise @buf is in kernel address space, use memcpy().
- *
- * Copy a page from "oldmem". For this page, there is no pte mapped
- * in the current kernel. We stitch up a pte, similar to kmap_atomic.
- *
- * Calling copy_to_user() in atomic context is not desirable. Hence first
- * copying the data to a pre-allocated kernel page and then copying to user
- * space in non-atomic context.
- */
-ssize_t copy_oldmem_page(unsigned long pfn, char *buf,
-                               size_t csize, unsigned long offset, int userbuf)
-{
-       void  *vaddr;
-
-       if (!csize)
-               return 0;
-
-       vaddr = kmap_atomic_pfn(pfn, KM_PTE0);
-
-       if (!userbuf) {
-               memcpy(buf, (vaddr + offset), csize);
-               kunmap_atomic(vaddr, KM_PTE0);
-       } else {
-               if (!kdump_buf_page) {
-                       printk(KERN_WARNING "Kdump: Kdump buffer page not"
-                               " allocated\n");
-                       return -EFAULT;
-               }
-               copy_page(kdump_buf_page, vaddr);
-               kunmap_atomic(vaddr, KM_PTE0);
-               if (copy_to_user(buf, (kdump_buf_page + offset), csize))
-                       return -EFAULT;
-       }
-
-       return csize;
-}
-
-static int __init kdump_buf_page_init(void)
-{
-       int ret = 0;
-
-       kdump_buf_page = kmalloc(PAGE_SIZE, GFP_KERNEL);
-       if (!kdump_buf_page) {
-               printk(KERN_WARNING "Kdump: Failed to allocate kdump buffer"
-                        " page\n");
-               ret = -ENOMEM;
-       }
-
-       return ret;
-}
-arch_initcall(kdump_buf_page_init);
diff --git a/arch/i386/kernel/doublefault_32.c b/arch/i386/kernel/doublefault_32.c
deleted file mode 100644 (file)
index 40978af..0000000
+++ /dev/null
@@ -1,70 +0,0 @@
-#include <linux/mm.h>
-#include <linux/sched.h>
-#include <linux/init.h>
-#include <linux/init_task.h>
-#include <linux/fs.h>
-
-#include <asm/uaccess.h>
-#include <asm/pgtable.h>
-#include <asm/processor.h>
-#include <asm/desc.h>
-
-#define DOUBLEFAULT_STACKSIZE (1024)
-static unsigned long doublefault_stack[DOUBLEFAULT_STACKSIZE];
-#define STACK_START (unsigned long)(doublefault_stack+DOUBLEFAULT_STACKSIZE)
-
-#define ptr_ok(x) ((x) > PAGE_OFFSET && (x) < PAGE_OFFSET + MAXMEM)
-
-static void doublefault_fn(void)
-{
-       struct Xgt_desc_struct gdt_desc = {0, 0};
-       unsigned long gdt, tss;
-
-       store_gdt(&gdt_desc);
-       gdt = gdt_desc.address;
-
-       printk(KERN_EMERG "PANIC: double fault, gdt at %08lx [%d bytes]\n", gdt, gdt_desc.size);
-
-       if (ptr_ok(gdt)) {
-               gdt += GDT_ENTRY_TSS << 3;
-               tss = *(u16 *)(gdt+2);
-               tss += *(u8 *)(gdt+4) << 16;
-               tss += *(u8 *)(gdt+7) << 24;
-               printk(KERN_EMERG "double fault, tss at %08lx\n", tss);
-
-               if (ptr_ok(tss)) {
-                       struct i386_hw_tss *t = (struct i386_hw_tss *)tss;
-
-                       printk(KERN_EMERG "eip = %08lx, esp = %08lx\n", t->eip, t->esp);
-
-                       printk(KERN_EMERG "eax = %08lx, ebx = %08lx, ecx = %08lx, edx = %08lx\n",
-                               t->eax, t->ebx, t->ecx, t->edx);
-                       printk(KERN_EMERG "esi = %08lx, edi = %08lx\n",
-                               t->esi, t->edi);
-               }
-       }
-
-       for (;;)
-               cpu_relax();
-}
-
-struct tss_struct doublefault_tss __cacheline_aligned = {
-       .x86_tss = {
-               .esp0           = STACK_START,
-               .ss0            = __KERNEL_DS,
-               .ldt            = 0,
-               .io_bitmap_base = INVALID_IO_BITMAP_OFFSET,
-
-               .eip            = (unsigned long) doublefault_fn,
-               /* 0x2 bit is always set */
-               .eflags         = X86_EFLAGS_SF | 0x2,
-               .esp            = STACK_START,
-               .es             = __USER_DS,
-               .cs             = __KERNEL_CS,
-               .ss             = __KERNEL_DS,
-               .ds             = __USER_DS,
-               .fs             = __KERNEL_PERCPU,
-
-               .__cr3          = __pa(swapper_pg_dir)
-       }
-};
diff --git a/arch/i386/kernel/e820_32.c b/arch/i386/kernel/e820_32.c
deleted file mode 100644 (file)
index 3c86b97..0000000
+++ /dev/null
@@ -1,944 +0,0 @@
-#include <linux/kernel.h>
-#include <linux/types.h>
-#include <linux/init.h>
-#include <linux/bootmem.h>
-#include <linux/ioport.h>
-#include <linux/string.h>
-#include <linux/kexec.h>
-#include <linux/module.h>
-#include <linux/mm.h>
-#include <linux/efi.h>
-#include <linux/pfn.h>
-#include <linux/uaccess.h>
-#include <linux/suspend.h>
-
-#include <asm/pgtable.h>
-#include <asm/page.h>
-#include <asm/e820.h>
-#include <asm/setup.h>
-
-#ifdef CONFIG_EFI
-int efi_enabled = 0;
-EXPORT_SYMBOL(efi_enabled);
-#endif
-
-struct e820map e820;
-struct change_member {
-       struct e820entry *pbios; /* pointer to original bios entry */
-       unsigned long long addr; /* address for this change point */
-};
-static struct change_member change_point_list[2*E820MAX] __initdata;
-static struct change_member *change_point[2*E820MAX] __initdata;
-static struct e820entry *overlap_list[E820MAX] __initdata;
-static struct e820entry new_bios[E820MAX] __initdata;
-/* For PCI or other memory-mapped resources */
-unsigned long pci_mem_start = 0x10000000;
-#ifdef CONFIG_PCI
-EXPORT_SYMBOL(pci_mem_start);
-#endif
-extern int user_defined_memmap;
-struct resource data_resource = {
-       .name   = "Kernel data",
-       .start  = 0,
-       .end    = 0,
-       .flags  = IORESOURCE_BUSY | IORESOURCE_MEM
-};
-
-struct resource code_resource = {
-       .name   = "Kernel code",
-       .start  = 0,
-       .end    = 0,
-       .flags  = IORESOURCE_BUSY | IORESOURCE_MEM
-};
-
-static struct resource system_rom_resource = {
-       .name   = "System ROM",
-       .start  = 0xf0000,
-       .end    = 0xfffff,
-       .flags  = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
-};
-
-static struct resource extension_rom_resource = {
-       .name   = "Extension ROM",
-       .start  = 0xe0000,
-       .end    = 0xeffff,
-       .flags  = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
-};
-
-static struct resource adapter_rom_resources[] = { {
-       .name   = "Adapter ROM",
-       .start  = 0xc8000,
-       .end    = 0,
-       .flags  = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
-}, {
-       .name   = "Adapter ROM",
-       .start  = 0,
-       .end    = 0,
-       .flags  = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
-}, {
-       .name   = "Adapter ROM",
-       .start  = 0,
-       .end    = 0,
-       .flags  = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
-}, {
-       .name   = "Adapter ROM",
-       .start  = 0,
-       .end    = 0,
-       .flags  = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
-}, {
-       .name   = "Adapter ROM",
-       .start  = 0,
-       .end    = 0,
-       .flags  = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
-}, {
-       .name   = "Adapter ROM",
-       .start  = 0,
-       .end    = 0,
-       .flags  = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
-} };
-
-static struct resource video_rom_resource = {
-       .name   = "Video ROM",
-       .start  = 0xc0000,
-       .end    = 0xc7fff,
-       .flags  = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
-};
-
-static struct resource video_ram_resource = {
-       .name   = "Video RAM area",
-       .start  = 0xa0000,
-       .end    = 0xbffff,
-       .flags  = IORESOURCE_BUSY | IORESOURCE_MEM
-};
-
-static struct resource standard_io_resources[] = { {
-       .name   = "dma1",
-       .start  = 0x0000,
-       .end    = 0x001f,
-       .flags  = IORESOURCE_BUSY | IORESOURCE_IO
-}, {
-       .name   = "pic1",
-       .start  = 0x0020,
-       .end    = 0x0021,
-       .flags  = IORESOURCE_BUSY | IORESOURCE_IO
-}, {
-       .name   = "timer0",
-       .start  = 0x0040,
-       .end    = 0x0043,
-       .flags  = IORESOURCE_BUSY | IORESOURCE_IO
-}, {
-       .name   = "timer1",
-       .start  = 0x0050,
-       .end    = 0x0053,
-       .flags  = IORESOURCE_BUSY | IORESOURCE_IO
-}, {
-       .name   = "keyboard",
-       .start  = 0x0060,
-       .end    = 0x006f,
-       .flags  = IORESOURCE_BUSY | IORESOURCE_IO
-}, {
-       .name   = "dma page reg",
-       .start  = 0x0080,
-       .end    = 0x008f,
-       .flags  = IORESOURCE_BUSY | IORESOURCE_IO
-}, {
-       .name   = "pic2",
-       .start  = 0x00a0,
-       .end    = 0x00a1,
-       .flags  = IORESOURCE_BUSY | IORESOURCE_IO
-}, {
-       .name   = "dma2",
-       .start  = 0x00c0,
-       .end    = 0x00df,
-       .flags  = IORESOURCE_BUSY | IORESOURCE_IO
-}, {
-       .name   = "fpu",
-       .start  = 0x00f0,
-       .end    = 0x00ff,
-       .flags  = IORESOURCE_BUSY | IORESOURCE_IO
-} };
-
-#define ROMSIGNATURE 0xaa55
-
-static int __init romsignature(const unsigned char *rom)
-{
-       const unsigned short * const ptr = (const unsigned short *)rom;
-       unsigned short sig;
-
-       return probe_kernel_address(ptr, sig) == 0 && sig == ROMSIGNATURE;
-}
-
-static int __init romchecksum(const unsigned char *rom, unsigned long length)
-{
-       unsigned char sum, c;
-
-       for (sum = 0; length && probe_kernel_address(rom++, c) == 0; length--)
-               sum += c;
-       return !length && !sum;
-}
-
-static void __init probe_roms(void)
-{
-       const unsigned char *rom;
-       unsigned long start, length, upper;
-       unsigned char c;
-       int i;
-
-       /* video rom */
-       upper = adapter_rom_resources[0].start;
-       for (start = video_rom_resource.start; start < upper; start += 2048) {
-               rom = isa_bus_to_virt(start);
-               if (!romsignature(rom))
-                       continue;
-
-               video_rom_resource.start = start;
-
-               if (probe_kernel_address(rom + 2, c) != 0)
-                       continue;
-
-               /* 0 < length <= 0x7f * 512, historically */
-               length = c * 512;
-
-               /* if checksum okay, trust length byte */
-               if (length && romchecksum(rom, length))
-                       video_rom_resource.end = start + length - 1;
-
-               request_resource(&iomem_resource, &video_rom_resource);
-               break;
-       }
-
-       start = (video_rom_resource.end + 1 + 2047) & ~2047UL;
-       if (start < upper)
-               start = upper;
-
-       /* system rom */
-       request_resource(&iomem_resource, &system_rom_resource);
-       upper = system_rom_resource.start;
-
-       /* check for extension rom (ignore length byte!) */
-       rom = isa_bus_to_virt(extension_rom_resource.start);
-       if (romsignature(rom)) {
-               length = extension_rom_resource.end - extension_rom_resource.start + 1;
-               if (romchecksum(rom, length)) {
-                       request_resource(&iomem_resource, &extension_rom_resource);
-                       upper = extension_rom_resource.start;
-               }
-       }
-
-       /* check for adapter roms on 2k boundaries */
-       for (i = 0; i < ARRAY_SIZE(adapter_rom_resources) && start < upper; start += 2048) {
-               rom = isa_bus_to_virt(start);
-               if (!romsignature(rom))
-                       continue;
-
-               if (probe_kernel_address(rom + 2, c) != 0)
-                       continue;
-
-               /* 0 < length <= 0x7f * 512, historically */
-               length = c * 512;
-
-               /* but accept any length that fits if checksum okay */
-               if (!length || start + length > upper || !romchecksum(rom, length))
-                       continue;
-
-               adapter_rom_resources[i].start = start;
-               adapter_rom_resources[i].end = start + length - 1;
-               request_resource(&iomem_resource, &adapter_rom_resources[i]);
-
-               start = adapter_rom_resources[i++].end & ~2047UL;
-       }
-}
-
-/*
- * Request address space for all standard RAM and ROM resources
- * and also for regions reported as reserved by the e820.
- */
-static void __init
-legacy_init_iomem_resources(struct resource *code_resource, struct resource *data_resource)
-{
-       int i;
-
-       probe_roms();
-       for (i = 0; i < e820.nr_map; i++) {
-               struct resource *res;
-#ifndef CONFIG_RESOURCES_64BIT
-               if (e820.map[i].addr + e820.map[i].size > 0x100000000ULL)
-                       continue;
-#endif
-               res = kzalloc(sizeof(struct resource), GFP_ATOMIC);
-               switch (e820.map[i].type) {
-               case E820_RAM:  res->name = "System RAM"; break;
-               case E820_ACPI: res->name = "ACPI Tables"; break;
-               case E820_NVS:  res->name = "ACPI Non-volatile Storage"; break;
-               default:        res->name = "reserved";
-               }
-               res->start = e820.map[i].addr;
-               res->end = res->start + e820.map[i].size - 1;
-               res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
-               if (request_resource(&iomem_resource, res)) {
-                       kfree(res);
-                       continue;
-               }
-               if (e820.map[i].type == E820_RAM) {
-                       /*
-                        *  We don't know which RAM region contains kernel data,
-                        *  so we try it repeatedly and let the resource manager
-                        *  test it.
-                        */
-                       request_resource(res, code_resource);
-                       request_resource(res, data_resource);
-#ifdef CONFIG_KEXEC
-                       request_resource(res, &crashk_res);
-#endif
-               }
-       }
-}
-
-/*
- * Request address space for all standard resources
- *
- * This is called just before pcibios_init(), which is also a
- * subsys_initcall, but is linked in later (in arch/i386/pci/common.c).
- */
-static int __init request_standard_resources(void)
-{
-       int i;
-
-       printk("Setting up standard PCI resources\n");
-       if (efi_enabled)
-               efi_initialize_iomem_resources(&code_resource, &data_resource);
-       else
-               legacy_init_iomem_resources(&code_resource, &data_resource);
-
-       /* EFI systems may still have VGA */
-       request_resource(&iomem_resource, &video_ram_resource);
-
-       /* request I/O space for devices used on all i[345]86 PCs */
-       for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++)
-               request_resource(&ioport_resource, &standard_io_resources[i]);
-       return 0;
-}
-
-subsys_initcall(request_standard_resources);
-
-#if defined(CONFIG_PM) && defined(CONFIG_HIBERNATION)
-/**
- * e820_mark_nosave_regions - Find the ranges of physical addresses that do not
- * correspond to e820 RAM areas and mark the corresponding pages as nosave for
- * hibernation.
- *
- * This function requires the e820 map to be sorted and without any
- * overlapping entries and assumes the first e820 area to be RAM.
- */
-void __init e820_mark_nosave_regions(void)
-{
-       int i;
-       unsigned long pfn;
-
-       pfn = PFN_DOWN(e820.map[0].addr + e820.map[0].size);
-       for (i = 1; i < e820.nr_map; i++) {
-               struct e820entry *ei = &e820.map[i];
-
-               if (pfn < PFN_UP(ei->addr))
-                       register_nosave_region(pfn, PFN_UP(ei->addr));
-
-               pfn = PFN_DOWN(ei->addr + ei->size);
-               if (ei->type != E820_RAM)
-                       register_nosave_region(PFN_UP(ei->addr), pfn);
-
-               if (pfn >= max_low_pfn)
-                       break;
-       }
-}
-#endif
-
-void __init add_memory_region(unsigned long long start,
-                             unsigned long long size, int type)
-{
-       int x;
-
-       if (!efi_enabled) {
-                       x = e820.nr_map;
-
-               if (x == E820MAX) {
-                   printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
-                   return;
-               }
-
-               e820.map[x].addr = start;
-               e820.map[x].size = size;
-               e820.map[x].type = type;
-               e820.nr_map++;
-       }
-} /* add_memory_region */
-
-/*
- * Sanitize the BIOS e820 map.
- *
- * Some e820 responses include overlapping entries.  The following
- * replaces the original e820 map with a new one, removing overlaps.
- *
- */
-int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map)
-{
-       struct change_member *change_tmp;
-       unsigned long current_type, last_type;
-       unsigned long long last_addr;
-       int chgidx, still_changing;
-       int overlap_entries;
-       int new_bios_entry;
-       int old_nr, new_nr, chg_nr;
-       int i;
-
-       /*
-               Visually we're performing the following (1,2,3,4 = memory types)...
-
-               Sample memory map (w/overlaps):
-                  ____22__________________
-                  ______________________4_
-                  ____1111________________
-                  _44_____________________
-                  11111111________________
-                  ____________________33__
-                  ___________44___________
-                  __________33333_________
-                  ______________22________
-                  ___________________2222_
-                  _________111111111______
-                  _____________________11_
-                  _________________4______
-
-               Sanitized equivalent (no overlap):
-                  1_______________________
-                  _44_____________________
-                  ___1____________________
-                  ____22__________________
-                  ______11________________
-                  _________1______________
-                  __________3_____________
-                  ___________44___________
-                  _____________33_________
-                  _______________2________
-                  ________________1_______
-                  _________________4______
-                  ___________________2____
-                  ____________________33__
-                  ______________________4_
-       */
-       /* if there's only one memory region, don't bother */
-       if (*pnr_map < 2) {
-               return -1;
-       }
-
-       old_nr = *pnr_map;
-
-       /* bail out if we find any unreasonable addresses in bios map */
-       for (i=0; i<old_nr; i++)
-               if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr) {
-                       return -1;
-               }
-
-       /* create pointers for initial change-point information (for sorting) */
-       for (i=0; i < 2*old_nr; i++)
-               change_point[i] = &change_point_list[i];
-
-       /* record all known change-points (starting and ending addresses),
-          omitting those that are for empty memory regions */
-       chgidx = 0;
-       for (i=0; i < old_nr; i++)      {
-               if (biosmap[i].size != 0) {
-                       change_point[chgidx]->addr = biosmap[i].addr;
-                       change_point[chgidx++]->pbios = &biosmap[i];
-                       change_point[chgidx]->addr = biosmap[i].addr + biosmap[i].size;
-                       change_point[chgidx++]->pbios = &biosmap[i];
-               }
-       }
-       chg_nr = chgidx;        /* true number of change-points */
-
-       /* sort change-point list by memory addresses (low -> high) */
-       still_changing = 1;
-       while (still_changing)  {
-               still_changing = 0;
-               for (i=1; i < chg_nr; i++)  {
-                       /* if <current_addr> > <last_addr>, swap */
-                       /* or, if current=<start_addr> & last=<end_addr>, swap */
-                       if ((change_point[i]->addr < change_point[i-1]->addr) ||
-                               ((change_point[i]->addr == change_point[i-1]->addr) &&
-                                (change_point[i]->addr == change_point[i]->pbios->addr) &&
-                                (change_point[i-1]->addr != change_point[i-1]->pbios->addr))
-                          )
-                       {
-                               change_tmp = change_point[i];
-                               change_point[i] = change_point[i-1];
-                               change_point[i-1] = change_tmp;
-                               still_changing=1;
-                       }
-               }
-       }
-
-       /* create a new bios memory map, removing overlaps */
-       overlap_entries=0;       /* number of entries in the overlap table */
-       new_bios_entry=0;        /* index for creating new bios map entries */
-       last_type = 0;           /* start with undefined memory type */
-       last_addr = 0;           /* start with 0 as last starting address */
-       /* loop through change-points, determining affect on the new bios map */
-       for (chgidx=0; chgidx < chg_nr; chgidx++)
-       {
-               /* keep track of all overlapping bios entries */
-               if (change_point[chgidx]->addr == change_point[chgidx]->pbios->addr)
-               {
-                       /* add map entry to overlap list (> 1 entry implies an overlap) */
-                       overlap_list[overlap_entries++]=change_point[chgidx]->pbios;
-               }
-               else
-               {
-                       /* remove entry from list (order independent, so swap with last) */
-                       for (i=0; i<overlap_entries; i++)
-                       {
-                               if (overlap_list[i] == change_point[chgidx]->pbios)
-                                       overlap_list[i] = overlap_list[overlap_entries-1];
-                       }
-                       overlap_entries--;
-               }
-               /* if there are overlapping entries, decide which "type" to use */
-               /* (larger value takes precedence -- 1=usable, 2,3,4,4+=unusable) */
-               current_type = 0;
-               for (i=0; i<overlap_entries; i++)
-                       if (overlap_list[i]->type > current_type)
-                               current_type = overlap_list[i]->type;
-               /* continue building up new bios map based on this information */
-               if (current_type != last_type)  {
-                       if (last_type != 0)      {
-                               new_bios[new_bios_entry].size =
-                                       change_point[chgidx]->addr - last_addr;
-                               /* move forward only if the new size was non-zero */
-                               if (new_bios[new_bios_entry].size != 0)
-                                       if (++new_bios_entry >= E820MAX)
-                                               break;  /* no more space left for new bios entries */
-                       }
-                       if (current_type != 0)  {
-                               new_bios[new_bios_entry].addr = change_point[chgidx]->addr;
-                               new_bios[new_bios_entry].type = current_type;
-                               last_addr=change_point[chgidx]->addr;
-                       }
-                       last_type = current_type;
-               }
-       }
-       new_nr = new_bios_entry;   /* retain count for new bios entries */
-
-       /* copy new bios mapping into original location */
-       memcpy(biosmap, new_bios, new_nr*sizeof(struct e820entry));
-       *pnr_map = new_nr;
-
-       return 0;
-}
-
-/*
- * Copy the BIOS e820 map into a safe place.
- *
- * Sanity-check it while we're at it..
- *
- * If we're lucky and live on a modern system, the setup code
- * will have given us a memory map that we can use to properly
- * set up memory.  If we aren't, we'll fake a memory map.
- *
- * We check to see that the memory map contains at least 2 elements
- * before we'll use it, because the detection code in setup.S may
- * not be perfect and most every PC known to man has two memory
- * regions: one from 0 to 640k, and one from 1mb up.  (The IBM
- * thinkpad 560x, for example, does not cooperate with the memory
- * detection code.)
- */
-int __init copy_e820_map(struct e820entry * biosmap, int nr_map)
-{
-       /* Only one memory region (or negative)? Ignore it */
-       if (nr_map < 2)
-               return -1;
-
-       do {
-               unsigned long long start = biosmap->addr;
-               unsigned long long size = biosmap->size;
-               unsigned long long end = start + size;
-               unsigned long type = biosmap->type;
-
-               /* Overflow in 64 bits? Ignore the memory map. */
-               if (start > end)
-                       return -1;
-
-               /*
-                * Some BIOSes claim RAM in the 640k - 1M region.
-                * Not right. Fix it up.
-                */
-               if (type == E820_RAM) {
-                       if (start < 0x100000ULL && end > 0xA0000ULL) {
-                               if (start < 0xA0000ULL)
-                                       add_memory_region(start, 0xA0000ULL-start, type);
-                               if (end <= 0x100000ULL)
-                                       continue;
-                               start = 0x100000ULL;
-                               size = end - start;
-                       }
-               }
-               add_memory_region(start, size, type);
-       } while (biosmap++,--nr_map);
-       return 0;
-}
-
-/*
- * Callback for efi_memory_walk.
- */
-static int __init
-efi_find_max_pfn(unsigned long start, unsigned long end, void *arg)
-{
-       unsigned long *max_pfn = arg, pfn;
-
-       if (start < end) {
-               pfn = PFN_UP(end -1);
-               if (pfn > *max_pfn)
-                       *max_pfn = pfn;
-       }
-       return 0;
-}
-
-static int __init
-efi_memory_present_wrapper(unsigned long start, unsigned long end, void *arg)
-{
-       memory_present(0, PFN_UP(start), PFN_DOWN(end));
-       return 0;
-}
-
-/*
- * Find the highest page frame number we have available
- */
-void __init find_max_pfn(void)
-{
-       int i;
-
-       max_pfn = 0;
-       if (efi_enabled) {
-               efi_memmap_walk(efi_find_max_pfn, &max_pfn);
-               efi_memmap_walk(efi_memory_present_wrapper, NULL);
-               return;
-       }
-
-       for (i = 0; i < e820.nr_map; i++) {
-               unsigned long start, end;
-               /* RAM? */
-               if (e820.map[i].type != E820_RAM)
-                       continue;
-               start = PFN_UP(e820.map[i].addr);
-               end = PFN_DOWN(e820.map[i].addr + e820.map[i].size);
-               if (start >= end)
-                       continue;
-               if (end > max_pfn)
-                       max_pfn = end;
-               memory_present(0, start, end);
-       }
-}
-
-/*
- * Free all available memory for boot time allocation.  Used
- * as a callback function by efi_memory_walk()
- */
-
-static int __init
-free_available_memory(unsigned long start, unsigned long end, void *arg)
-{
-       /* check max_low_pfn */
-       if (start >= (max_low_pfn << PAGE_SHIFT))
-               return 0;
-       if (end >= (max_low_pfn << PAGE_SHIFT))
-               end = max_low_pfn << PAGE_SHIFT;
-       if (start < end)
-               free_bootmem(start, end - start);
-
-       return 0;
-}
-/*
- * Register fully available low RAM pages with the bootmem allocator.
- */
-void __init register_bootmem_low_pages(unsigned long max_low_pfn)
-{
-       int i;
-
-       if (efi_enabled) {
-               efi_memmap_walk(free_available_memory, NULL);
-               return;
-       }
-       for (i = 0; i < e820.nr_map; i++) {
-               unsigned long curr_pfn, last_pfn, size;
-               /*
-                * Reserve usable low memory
-                */
-               if (e820.map[i].type != E820_RAM)
-                       continue;
-               /*
-                * We are rounding up the start address of usable memory:
-                */
-               curr_pfn = PFN_UP(e820.map[i].addr);
-               if (curr_pfn >= max_low_pfn)
-                       continue;
-               /*
-                * ... and at the end of the usable range downwards:
-                */
-               last_pfn = PFN_DOWN(e820.map[i].addr + e820.map[i].size);
-
-               if (last_pfn > max_low_pfn)
-                       last_pfn = max_low_pfn;
-
-               /*
-                * .. finally, did all the rounding and playing
-                * around just make the area go away?
-                */
-               if (last_pfn <= curr_pfn)
-                       continue;
-
-               size = last_pfn - curr_pfn;
-               free_bootmem(PFN_PHYS(curr_pfn), PFN_PHYS(size));
-       }
-}
-
-void __init e820_register_memory(void)
-{
-       unsigned long gapstart, gapsize, round;
-       unsigned long long last;
-       int i;
-
-       /*
-        * Search for the bigest gap in the low 32 bits of the e820
-        * memory space.
-        */
-       last = 0x100000000ull;
-       gapstart = 0x10000000;
-       gapsize = 0x400000;
-       i = e820.nr_map;
-       while (--i >= 0) {
-               unsigned long long start = e820.map[i].addr;
-               unsigned long long end = start + e820.map[i].size;
-
-               /*
-                * Since "last" is at most 4GB, we know we'll
-                * fit in 32 bits if this condition is true
-                */
-               if (last > end) {
-                       unsigned long gap = last - end;
-
-                       if (gap > gapsize) {
-                               gapsize = gap;
-                               gapstart = end;
-                       }
-               }
-               if (start < last)
-                       last = start;
-       }
-
-       /*
-        * See how much we want to round up: start off with
-        * rounding to the next 1MB area.
-        */
-       round = 0x100000;
-       while ((gapsize >> 4) > round)
-               round += round;
-       /* Fun with two's complement */
-       pci_mem_start = (gapstart + round) & -round;
-
-       printk("Allocating PCI resources starting at %08lx (gap: %08lx:%08lx)\n",
-               pci_mem_start, gapstart, gapsize);
-}
-
-void __init print_memory_map(char *who)
-{
-       int i;
-
-       for (i = 0; i < e820.nr_map; i++) {
-               printk(" %s: %016Lx - %016Lx ", who,
-                       e820.map[i].addr,
-                       e820.map[i].addr + e820.map[i].size);
-               switch (e820.map[i].type) {
-               case E820_RAM:  printk("(usable)\n");
-                               break;
-               case E820_RESERVED:
-                               printk("(reserved)\n");
-                               break;
-               case E820_ACPI:
-                               printk("(ACPI data)\n");
-                               break;
-               case E820_NVS:
-                               printk("(ACPI NVS)\n");
-                               break;
-               default:        printk("type %u\n", e820.map[i].type);
-                               break;
-               }
-       }
-}
-
-static __init __always_inline void efi_limit_regions(unsigned long long size)
-{
-       unsigned long long current_addr = 0;
-       efi_memory_desc_t *md, *next_md;
-       void *p, *p1;
-       int i, j;
-
-       j = 0;
-       p1 = memmap.map;
-       for (p = p1, i = 0; p < memmap.map_end; p += memmap.desc_size, i++) {
-               md = p;
-               next_md = p1;
-               current_addr = md->phys_addr +
-                       PFN_PHYS(md->num_pages);
-               if (is_available_memory(md)) {
-                       if (md->phys_addr >= size) continue;
-                       memcpy(next_md, md, memmap.desc_size);
-                       if (current_addr >= size) {
-                               next_md->num_pages -=
-                                       PFN_UP(current_addr-size);
-                       }
-                       p1 += memmap.desc_size;
-                       next_md = p1;
-                       j++;
-               } else if ((md->attribute & EFI_MEMORY_RUNTIME) ==
-                          EFI_MEMORY_RUNTIME) {
-                       /* In order to make runtime services
-                        * available we have to include runtime
-                        * memory regions in memory map */
-                       memcpy(next_md, md, memmap.desc_size);
-                       p1 += memmap.desc_size;
-                       next_md = p1;
-                       j++;
-               }
-       }
-       memmap.nr_map = j;
-       memmap.map_end = memmap.map +
-               (memmap.nr_map * memmap.desc_size);
-}
-
-void __init limit_regions(unsigned long long size)
-{
-       unsigned long long current_addr;
-       int i;
-
-       print_memory_map("limit_regions start");
-       if (efi_enabled) {
-               efi_limit_regions(size);
-               return;
-       }
-       for (i = 0; i < e820.nr_map; i++) {
-               current_addr = e820.map[i].addr + e820.map[i].size;
-               if (current_addr < size)
-                       continue;
-
-               if (e820.map[i].type != E820_RAM)
-                       continue;
-
-               if (e820.map[i].addr >= size) {
-                       /*
-                        * This region starts past the end of the
-                        * requested size, skip it completely.
-                        */
-                       e820.nr_map = i;
-               } else {
-                       e820.nr_map = i + 1;
-                       e820.map[i].size -= current_addr - size;
-               }
-               print_memory_map("limit_regions endfor");
-               return;
-       }
-       print_memory_map("limit_regions endfunc");
-}
-
-/*
- * This function checks if any part of the range <start,end> is mapped
- * with type.
- */
-int
-e820_any_mapped(u64 start, u64 end, unsigned type)
-{
-       int i;
-       for (i = 0; i < e820.nr_map; i++) {
-               const struct e820entry *ei = &e820.map[i];
-               if (type && ei->type != type)
-                       continue;
-               if (ei->addr >= end || ei->addr + ei->size <= start)
-                       continue;
-               return 1;
-       }
-       return 0;
-}
-EXPORT_SYMBOL_GPL(e820_any_mapped);
-
- /*
-  * This function checks if the entire range <start,end> is mapped with type.
-  *
-  * Note: this function only works correct if the e820 table is sorted and
-  * not-overlapping, which is the case
-  */
-int __init
-e820_all_mapped(unsigned long s, unsigned long e, unsigned type)
-{
-       u64 start = s;
-       u64 end = e;
-       int i;
-       for (i = 0; i < e820.nr_map; i++) {
-               struct e820entry *ei = &e820.map[i];
-               if (type && ei->type != type)
-                       continue;
-               /* is the region (part) in overlap with the current region ?*/
-               if (ei->addr >= end || ei->addr + ei->size <= start)
-                       continue;
-               /* if the region is at the beginning of <start,end> we move
-                * start to the end of the region since it's ok until there
-                */
-               if (ei->addr <= start)
-                       start = ei->addr + ei->size;
-               /* if start is now at or beyond end, we're done, full
-                * coverage */
-               if (start >= end)
-                       return 1; /* we're done */
-       }
-       return 0;
-}
-
-static int __init parse_memmap(char *arg)
-{
-       if (!arg)
-               return -EINVAL;
-
-       if (strcmp(arg, "exactmap") == 0) {
-#ifdef CONFIG_CRASH_DUMP
-               /* If we are doing a crash dump, we
-                * still need to know the real mem
-                * size before original memory map is
-                * reset.
-                */
-               find_max_pfn();
-               saved_max_pfn = max_pfn;
-#endif
-               e820.nr_map = 0;
-               user_defined_memmap = 1;
-       } else {
-               /* If the user specifies memory size, we
-                * limit the BIOS-provided memory map to
-                * that size. exactmap can be used to specify
-                * the exact map. mem=number can be used to
-                * trim the existing memory map.
-                */
-               unsigned long long start_at, mem_size;
-
-               mem_size = memparse(arg, &arg);
-               if (*arg == '@') {
-                       start_at = memparse(arg+1, &arg);
-                       add_memory_region(start_at, mem_size, E820_RAM);
-               } else if (*arg == '#') {
-                       start_at = memparse(arg+1, &arg);
-                       add_memory_region(start_at, mem_size, E820_ACPI);
-               } else if (*arg == '$') {
-                       start_at = memparse(arg+1, &arg);
-                       add_memory_region(start_at, mem_size, E820_RESERVED);
-               } else {
-                       limit_regions(mem_size);
-                       user_defined_memmap = 1;
-               }
-       }
-       return 0;
-}
-early_param("memmap", parse_memmap);
diff --git a/arch/i386/kernel/early_printk.c b/arch/i386/kernel/early_printk.c
deleted file mode 100644 (file)
index 92f812b..0000000
+++ /dev/null
@@ -1,2 +0,0 @@
-
-#include "../../x86_64/kernel/early_printk.c"
diff --git a/arch/i386/kernel/efi_32.c b/arch/i386/kernel/efi_32.c
deleted file mode 100644 (file)
index 2452c6f..0000000
+++ /dev/null
@@ -1,712 +0,0 @@
-/*
- * Extensible Firmware Interface
- *
- * Based on Extensible Firmware Interface Specification version 1.0
- *
- * Copyright (C) 1999 VA Linux Systems
- * Copyright (C) 1999 Walt Drummond <drummond@valinux.com>
- * Copyright (C) 1999-2002 Hewlett-Packard Co.
- *     David Mosberger-Tang <davidm@hpl.hp.com>
- *     Stephane Eranian <eranian@hpl.hp.com>
- *
- * All EFI Runtime Services are not implemented yet as EFI only
- * supports physical mode addressing on SoftSDV. This is to be fixed
- * in a future version.  --drummond 1999-07-20
- *
- * Implemented EFI runtime services and virtual mode calls.  --davidm
- *
- * Goutham Rao: <goutham.rao@intel.com>
- *     Skip non-WB memory and ignore empty memory ranges.
- */
-
-#include <linux/kernel.h>
-#include <linux/init.h>
-#include <linux/mm.h>
-#include <linux/types.h>
-#include <linux/time.h>
-#include <linux/spinlock.h>
-#include <linux/bootmem.h>
-#include <linux/ioport.h>
-#include <linux/module.h>
-#include <linux/efi.h>
-#include <linux/kexec.h>
-
-#include <asm/setup.h>
-#include <asm/io.h>
-#include <asm/page.h>
-#include <asm/pgtable.h>
-#include <asm/processor.h>
-#include <asm/desc.h>
-#include <asm/tlbflush.h>
-
-#define EFI_DEBUG      0
-#define PFX            "EFI: "
-
-extern efi_status_t asmlinkage efi_call_phys(void *, ...);
-
-struct efi efi;
-EXPORT_SYMBOL(efi);
-static struct efi efi_phys;
-struct efi_memory_map memmap;
-
-/*
- * We require an early boot_ioremap mapping mechanism initially
- */
-extern void * boot_ioremap(unsigned long, unsigned long);
-
-/*
- * To make EFI call EFI runtime service in physical addressing mode we need
- * prelog/epilog before/after the invocation to disable interrupt, to
- * claim EFI runtime service handler exclusively and to duplicate a memory in
- * low memory space say 0 - 3G.
- */
-
-static unsigned long efi_rt_eflags;
-static DEFINE_SPINLOCK(efi_rt_lock);
-static pgd_t efi_bak_pg_dir_pointer[2];
-
-static void efi_call_phys_prelog(void) __acquires(efi_rt_lock)
-{
-       unsigned long cr4;
-       unsigned long temp;
-       struct Xgt_desc_struct gdt_descr;
-
-       spin_lock(&efi_rt_lock);
-       local_irq_save(efi_rt_eflags);
-
-       /*
-        * If I don't have PSE, I should just duplicate two entries in page
-        * directory. If I have PSE, I just need to duplicate one entry in
-        * page directory.
-        */
-       cr4 = read_cr4();
-
-       if (cr4 & X86_CR4_PSE) {
-               efi_bak_pg_dir_pointer[0].pgd =
-                   swapper_pg_dir[pgd_index(0)].pgd;
-               swapper_pg_dir[0].pgd =
-                   swapper_pg_dir[pgd_index(PAGE_OFFSET)].pgd;
-       } else {
-               efi_bak_pg_dir_pointer[0].pgd =
-                   swapper_pg_dir[pgd_index(0)].pgd;
-               efi_bak_pg_dir_pointer[1].pgd =
-                   swapper_pg_dir[pgd_index(0x400000)].pgd;
-               swapper_pg_dir[pgd_index(0)].pgd =
-                   swapper_pg_dir[pgd_index(PAGE_OFFSET)].pgd;
-               temp = PAGE_OFFSET + 0x400000;
-               swapper_pg_dir[pgd_index(0x400000)].pgd =
-                   swapper_pg_dir[pgd_index(temp)].pgd;
-       }
-
-       /*
-        * After the lock is released, the original page table is restored.
-        */
-       local_flush_tlb();
-
-       gdt_descr.address = __pa(get_cpu_gdt_table(0));
-       gdt_descr.size = GDT_SIZE - 1;
-       load_gdt(&gdt_descr);
-}
-
-static void efi_call_phys_epilog(void) __releases(efi_rt_lock)
-{
-       unsigned long cr4;
-       struct Xgt_desc_struct gdt_descr;
-
-       gdt_descr.address = (unsigned long)get_cpu_gdt_table(0);
-       gdt_descr.size = GDT_SIZE - 1;
-       load_gdt(&gdt_descr);
-
-       cr4 = read_cr4();
-
-       if (cr4 & X86_CR4_PSE) {
-               swapper_pg_dir[pgd_index(0)].pgd =
-                   efi_bak_pg_dir_pointer[0].pgd;
-       } else {
-               swapper_pg_dir[pgd_index(0)].pgd =
-                   efi_bak_pg_dir_pointer[0].pgd;
-               swapper_pg_dir[pgd_index(0x400000)].pgd =
-                   efi_bak_pg_dir_pointer[1].pgd;
-       }
-
-       /*
-        * After the lock is released, the original page table is restored.
-        */
-       local_flush_tlb();
-
-       local_irq_restore(efi_rt_eflags);
-       spin_unlock(&efi_rt_lock);
-}
-
-static efi_status_t
-phys_efi_set_virtual_address_map(unsigned long memory_map_size,
-                                unsigned long descriptor_size,
-                                u32 descriptor_version,
-                                efi_memory_desc_t *virtual_map)
-{
-       efi_status_t status;
-
-       efi_call_phys_prelog();
-       status = efi_call_phys(efi_phys.set_virtual_address_map,
-                                    memory_map_size, descriptor_size,
-                                    descriptor_version, virtual_map);
-       efi_call_phys_epilog();
-       return status;
-}
-
-static efi_status_t
-phys_efi_get_time(efi_time_t *tm, efi_time_cap_t *tc)
-{
-       efi_status_t status;
-
-       efi_call_phys_prelog();
-       status = efi_call_phys(efi_phys.get_time, tm, tc);
-       efi_call_phys_epilog();
-       return status;
-}
-
-inline int efi_set_rtc_mmss(unsigned long nowtime)
-{
-       int real_seconds, real_minutes;
-       efi_status_t    status;
-       efi_time_t      eft;
-       efi_time_cap_t  cap;
-
-       spin_lock(&efi_rt_lock);
-       status = efi.get_time(&eft, &cap);
-       spin_unlock(&efi_rt_lock);
-       if (status != EFI_SUCCESS)
-               panic("Ooops, efitime: can't read time!\n");
-       real_seconds = nowtime % 60;
-       real_minutes = nowtime / 60;
-
-       if (((abs(real_minutes - eft.minute) + 15)/30) & 1)
-               real_minutes += 30;
-       real_minutes %= 60;
-
-       eft.minute = real_minutes;
-       eft.second = real_seconds;
-
-       if (status != EFI_SUCCESS) {
-               printk("Ooops: efitime: can't read time!\n");
-               return -1;
-       }
-       return 0;
-}
-/*
- * This is used during kernel init before runtime
- * services have been remapped and also during suspend, therefore,
- * we'll need to call both in physical and virtual modes.
- */
-inline unsigned long efi_get_time(void)
-{
-       efi_status_t status;
-       efi_time_t eft;
-       efi_time_cap_t cap;
-
-       if (efi.get_time) {
-               /* if we are in virtual mode use remapped function */
-               status = efi.get_time(&eft, &cap);
-       } else {
-               /* we are in physical mode */
-               status = phys_efi_get_time(&eft, &cap);
-       }
-
-       if (status != EFI_SUCCESS)
-               printk("Oops: efitime: can't read time status: 0x%lx\n",status);
-
-       return mktime(eft.year, eft.month, eft.day, eft.hour,
-                       eft.minute, eft.second);
-}
-
-int is_available_memory(efi_memory_desc_t * md)
-{
-       if (!(md->attribute & EFI_MEMORY_WB))
-               return 0;
-
-       switch (md->type) {
-               case EFI_LOADER_CODE:
-               case EFI_LOADER_DATA:
-               case EFI_BOOT_SERVICES_CODE:
-               case EFI_BOOT_SERVICES_DATA:
-               case EFI_CONVENTIONAL_MEMORY:
-                       return 1;
-       }
-       return 0;
-}
-
-/*
- * We need to map the EFI memory map again after paging_init().
- */
-void __init efi_map_memmap(void)
-{
-       memmap.map = NULL;
-
-       memmap.map = bt_ioremap((unsigned long) memmap.phys_map,
-                       (memmap.nr_map * memmap.desc_size));
-       if (memmap.map == NULL)
-               printk(KERN_ERR PFX "Could not remap the EFI memmap!\n");
-
-       memmap.map_end = memmap.map + (memmap.nr_map * memmap.desc_size);
-}
-
-#if EFI_DEBUG
-static void __init print_efi_memmap(void)
-{
-       efi_memory_desc_t *md;
-       void *p;
-       int i;
-
-       for (p = memmap.map, i = 0; p < memmap.map_end; p += memmap.desc_size, i++) {
-               md = p;
-               printk(KERN_INFO "mem%02u: type=%u, attr=0x%llx, "
-                       "range=[0x%016llx-0x%016llx) (%lluMB)\n",
-                       i, md->type, md->attribute, md->phys_addr,
-                       md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT),
-                       (md->num_pages >> (20 - EFI_PAGE_SHIFT)));
-       }
-}
-#endif  /*  EFI_DEBUG  */
-
-/*
- * Walks the EFI memory map and calls CALLBACK once for each EFI
- * memory descriptor that has memory that is available for kernel use.
- */
-void efi_memmap_walk(efi_freemem_callback_t callback, void *arg)
-{
-       int prev_valid = 0;
-       struct range {
-               unsigned long start;
-               unsigned long end;
-       } uninitialized_var(prev), curr;
-       efi_memory_desc_t *md;
-       unsigned long start, end;
-       void *p;
-
-       for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
-               md = p;
-
-               if ((md->num_pages == 0) || (!is_available_memory(md)))
-                       continue;
-
-               curr.start = md->phys_addr;
-               curr.end = curr.start + (md->num_pages << EFI_PAGE_SHIFT);
-
-               if (!prev_valid) {
-                       prev = curr;
-                       prev_valid = 1;
-               } else {
-                       if (curr.start < prev.start)
-                               printk(KERN_INFO PFX "Unordered memory map\n");
-                       if (prev.end == curr.start)
-                               prev.end = curr.end;
-                       else {
-                               start =
-                                   (unsigned long) (PAGE_ALIGN(prev.start));
-                               end = (unsigned long) (prev.end & PAGE_MASK);
-                               if ((end > start)
-                                   && (*callback) (start, end, arg) < 0)
-                                       return;
-                               prev = curr;
-                       }
-               }
-       }
-       if (prev_valid) {
-               start = (unsigned long) PAGE_ALIGN(prev.start);
-               end = (unsigned long) (prev.end & PAGE_MASK);
-               if (end > start)
-                       (*callback) (start, end, arg);
-       }
-}
-
-void __init efi_init(void)
-{
-       efi_config_table_t *config_tables;
-       efi_runtime_services_t *runtime;
-       efi_char16_t *c16;
-       char vendor[100] = "unknown";
-       unsigned long num_config_tables;
-       int i = 0;
-
-       memset(&efi, 0, sizeof(efi) );
-       memset(&efi_phys, 0, sizeof(efi_phys));
-
-       efi_phys.systab = EFI_SYSTAB;
-       memmap.phys_map = EFI_MEMMAP;
-       memmap.nr_map = EFI_MEMMAP_SIZE/EFI_MEMDESC_SIZE;
-       memmap.desc_version = EFI_MEMDESC_VERSION;
-       memmap.desc_size = EFI_MEMDESC_SIZE;
-
-       efi.systab = (efi_system_table_t *)
-               boot_ioremap((unsigned long) efi_phys.systab,
-                       sizeof(efi_system_table_t));
-       /*
-        * Verify the EFI Table
-        */
-       if (efi.systab == NULL)
-               printk(KERN_ERR PFX "Woah! Couldn't map the EFI system table.\n");
-       if (efi.systab->hdr.signature != EFI_SYSTEM_TABLE_SIGNATURE)
-               printk(KERN_ERR PFX "Woah! EFI system table signature incorrect\n");
-       if ((efi.systab->hdr.revision >> 16) == 0)
-               printk(KERN_ERR PFX "Warning: EFI system table version "
-                      "%d.%02d, expected 1.00 or greater\n",
-                      efi.systab->hdr.revision >> 16,
-                      efi.systab->hdr.revision & 0xffff);
-
-       /*
-        * Grab some details from the system table
-        */
-       num_config_tables = efi.systab->nr_tables;
-       config_tables = (efi_config_table_t *)efi.systab->tables;
-       runtime = efi.systab->runtime;
-
-       /*
-        * Show what we know for posterity
-        */
-       c16 = (efi_char16_t *) boot_ioremap(efi.systab->fw_vendor, 2);
-       if (c16) {
-               for (i = 0; i < (sizeof(vendor) - 1) && *c16; ++i)
-                       vendor[i] = *c16++;
-               vendor[i] = '\0';
-       } else
-               printk(KERN_ERR PFX "Could not map the firmware vendor!\n");
-
-       printk(KERN_INFO PFX "EFI v%u.%.02u by %s \n",
-              efi.systab->hdr.revision >> 16,
-              efi.systab->hdr.revision & 0xffff, vendor);
-
-       /*
-        * Let's see what config tables the firmware passed to us.
-        */
-       config_tables = (efi_config_table_t *)
-                               boot_ioremap((unsigned long) config_tables,
-                               num_config_tables * sizeof(efi_config_table_t));
-
-       if (config_tables == NULL)
-               printk(KERN_ERR PFX "Could not map EFI Configuration Table!\n");
-
-       efi.mps        = EFI_INVALID_TABLE_ADDR;
-       efi.acpi       = EFI_INVALID_TABLE_ADDR;
-       efi.acpi20     = EFI_INVALID_TABLE_ADDR;
-       efi.smbios     = EFI_INVALID_TABLE_ADDR;
-       efi.sal_systab = EFI_INVALID_TABLE_ADDR;
-       efi.boot_info  = EFI_INVALID_TABLE_ADDR;
-       efi.hcdp       = EFI_INVALID_TABLE_ADDR;
-       efi.uga        = EFI_INVALID_TABLE_ADDR;
-
-       for (i = 0; i < num_config_tables; i++) {
-               if (efi_guidcmp(config_tables[i].guid, MPS_TABLE_GUID) == 0) {
-                       efi.mps = config_tables[i].table;
-                       printk(KERN_INFO " MPS=0x%lx ", config_tables[i].table);
-               } else
-                   if (efi_guidcmp(config_tables[i].guid, ACPI_20_TABLE_GUID) == 0) {
-                       efi.acpi20 = config_tables[i].table;
-                       printk(KERN_INFO " ACPI 2.0=0x%lx ", config_tables[i].table);
-               } else
-                   if (efi_guidcmp(config_tables[i].guid, ACPI_TABLE_GUID) == 0) {
-                       efi.acpi = config_tables[i].table;
-                       printk(KERN_INFO " ACPI=0x%lx ", config_tables[i].table);
-               } else
-                   if (efi_guidcmp(config_tables[i].guid, SMBIOS_TABLE_GUID) == 0) {
-                       efi.smbios = config_tables[i].table;
-                       printk(KERN_INFO " SMBIOS=0x%lx ", config_tables[i].table);
-               } else
-                   if (efi_guidcmp(config_tables[i].guid, HCDP_TABLE_GUID) == 0) {
-                       efi.hcdp = config_tables[i].table;
-                       printk(KERN_INFO " HCDP=0x%lx ", config_tables[i].table);
-               } else
-                   if (efi_guidcmp(config_tables[i].guid, UGA_IO_PROTOCOL_GUID) == 0) {
-                       efi.uga = config_tables[i].table;
-                       printk(KERN_INFO " UGA=0x%lx ", config_tables[i].table);
-               }
-       }
-       printk("\n");
-
-       /*
-        * Check out the runtime services table. We need to map
-        * the runtime services table so that we can grab the physical
-        * address of several of the EFI runtime functions, needed to
-        * set the firmware into virtual mode.
-        */
-
-       runtime = (efi_runtime_services_t *) boot_ioremap((unsigned long)
-                                               runtime,
-                                               sizeof(efi_runtime_services_t));
-       if (runtime != NULL) {
-               /*
-                * We will only need *early* access to the following
-                * two EFI runtime services before set_virtual_address_map
-                * is invoked.
-                */
-               efi_phys.get_time = (efi_get_time_t *) runtime->get_time;
-               efi_phys.set_virtual_address_map =
-                       (efi_set_virtual_address_map_t *)
-                               runtime->set_virtual_address_map;
-       } else
-               printk(KERN_ERR PFX "Could not map the runtime service table!\n");
-
-       /* Map the EFI memory map for use until paging_init() */
-       memmap.map = boot_ioremap((unsigned long) EFI_MEMMAP, EFI_MEMMAP_SIZE);
-       if (memmap.map == NULL)
-               printk(KERN_ERR PFX "Could not map the EFI memory map!\n");
-
-       memmap.map_end = memmap.map + (memmap.nr_map * memmap.desc_size);
-
-#if EFI_DEBUG
-       print_efi_memmap();
-#endif
-}
-
-static inline void __init check_range_for_systab(efi_memory_desc_t *md)
-{
-       if (((unsigned long)md->phys_addr <= (unsigned long)efi_phys.systab) &&
-               ((unsigned long)efi_phys.systab < md->phys_addr +
-               ((unsigned long)md->num_pages << EFI_PAGE_SHIFT))) {
-               unsigned long addr;
-
-               addr = md->virt_addr - md->phys_addr +
-                       (unsigned long)efi_phys.systab;
-               efi.systab = (efi_system_table_t *)addr;
-       }
-}
-
-/*
- * Wrap all the virtual calls in a way that forces the parameters on the stack.
- */
-
-#define efi_call_virt(f, args...) \
-     ((efi_##f##_t __attribute__((regparm(0)))*)efi.systab->runtime->f)(args)
-
-static efi_status_t virt_efi_get_time(efi_time_t *tm, efi_time_cap_t *tc)
-{
-       return efi_call_virt(get_time, tm, tc);
-}
-
-static efi_status_t virt_efi_set_time (efi_time_t *tm)
-{
-       return efi_call_virt(set_time, tm);
-}
-
-static efi_status_t virt_efi_get_wakeup_time (efi_bool_t *enabled,
-                                             efi_bool_t *pending,
-                                             efi_time_t *tm)
-{
-       return efi_call_virt(get_wakeup_time, enabled, pending, tm);
-}
-
-static efi_status_t virt_efi_set_wakeup_time (efi_bool_t enabled,
-                                             efi_time_t *tm)
-{
-       return efi_call_virt(set_wakeup_time, enabled, tm);
-}
-
-static efi_status_t virt_efi_get_variable (efi_char16_t *name,
-                                          efi_guid_t *vendor, u32 *attr,
-                                          unsigned long *data_size, void *data)
-{
-       return efi_call_virt(get_variable, name, vendor, attr, data_size, data);
-}
-
-static efi_status_t virt_efi_get_next_variable (unsigned long *name_size,
-                                               efi_char16_t *name,
-                                               efi_guid_t *vendor)
-{
-       return efi_call_virt(get_next_variable, name_size, name, vendor);
-}
-
-static efi_status_t virt_efi_set_variable (efi_char16_t *name,
-                                          efi_guid_t *vendor,
-                                          unsigned long attr,
-                                          unsigned long data_size, void *data)
-{
-       return efi_call_virt(set_variable, name, vendor, attr, data_size, data);
-}
-
-static efi_status_t virt_efi_get_next_high_mono_count (u32 *count)
-{
-       return efi_call_virt(get_next_high_mono_count, count);
-}
-
-static void virt_efi_reset_system (int reset_type, efi_status_t status,
-                                  unsigned long data_size,
-                                  efi_char16_t *data)
-{
-       efi_call_virt(reset_system, reset_type, status, data_size, data);
-}
-
-/*
- * This function will switch the EFI runtime services to virtual mode.
- * Essentially, look through the EFI memmap and map every region that
- * has the runtime attribute bit set in its memory descriptor and update
- * that memory descriptor with the virtual address obtained from ioremap().
- * This enables the runtime services to be called without having to
- * thunk back into physical mode for every invocation.
- */
-
-void __init efi_enter_virtual_mode(void)
-{
-       efi_memory_desc_t *md;
-       efi_status_t status;
-       void *p;
-
-       efi.systab = NULL;
-
-       for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
-               md = p;
-
-               if (!(md->attribute & EFI_MEMORY_RUNTIME))
-                       continue;
-
-               md->virt_addr = (unsigned long)ioremap(md->phys_addr,
-                       md->num_pages << EFI_PAGE_SHIFT);
-               if (!(unsigned long)md->virt_addr) {
-                       printk(KERN_ERR PFX "ioremap of 0x%lX failed\n",
-                               (unsigned long)md->phys_addr);
-               }
-               /* update the virtual address of the EFI system table */
-               check_range_for_systab(md);
-       }
-
-       BUG_ON(!efi.systab);
-
-       status = phys_efi_set_virtual_address_map(
-                       memmap.desc_size * memmap.nr_map,
-                       memmap.desc_size,
-                       memmap.desc_version,
-                       memmap.phys_map);
-
-       if (status != EFI_SUCCESS) {
-               printk (KERN_ALERT "You are screwed! "
-                       "Unable to switch EFI into virtual mode "
-                       "(status=%lx)\n", status);
-               panic("EFI call to SetVirtualAddressMap() failed!");
-       }
-
-       /*
-        * Now that EFI is in virtual mode, update the function
-        * pointers in the runtime service table to the new virtual addresses.
-        */
-
-       efi.get_time = virt_efi_get_time;
-       efi.set_time = virt_efi_set_time;
-       efi.get_wakeup_time = virt_efi_get_wakeup_time;
-       efi.set_wakeup_time = virt_efi_set_wakeup_time;
-       efi.get_variable = virt_efi_get_variable;
-       efi.get_next_variable = virt_efi_get_next_variable;
-       efi.set_variable = virt_efi_set_variable;
-       efi.get_next_high_mono_count = virt_efi_get_next_high_mono_count;
-       efi.reset_system = virt_efi_reset_system;
-}
-
-void __init
-efi_initialize_iomem_resources(struct resource *code_resource,
-                              struct resource *data_resource)
-{
-       struct resource *res;
-       efi_memory_desc_t *md;
-       void *p;
-
-       for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
-               md = p;
-
-               if ((md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT)) >
-                   0x100000000ULL)
-                       continue;
-               res = kzalloc(sizeof(struct resource), GFP_ATOMIC);
-               switch (md->type) {
-               case EFI_RESERVED_TYPE:
-                       res->name = "Reserved Memory";
-                       break;
-               case EFI_LOADER_CODE:
-                       res->name = "Loader Code";
-                       break;
-               case EFI_LOADER_DATA:
-                       res->name = "Loader Data";
-                       break;
-               case EFI_BOOT_SERVICES_DATA:
-                       res->name = "BootServices Data";
-                       break;
-               case EFI_BOOT_SERVICES_CODE:
-                       res->name = "BootServices Code";
-                       break;
-               case EFI_RUNTIME_SERVICES_CODE:
-                       res->name = "Runtime Service Code";
-                       break;
-               case EFI_RUNTIME_SERVICES_DATA:
-                       res->name = "Runtime Service Data";
-                       break;
-               case EFI_CONVENTIONAL_MEMORY:
-                       res->name = "Conventional Memory";
-                       break;
-               case EFI_UNUSABLE_MEMORY:
-                       res->name = "Unusable Memory";
-                       break;
-               case EFI_ACPI_RECLAIM_MEMORY:
-                       res->name = "ACPI Reclaim";
-                       break;
-               case EFI_ACPI_MEMORY_NVS:
-                       res->name = "ACPI NVS";
-                       break;
-               case EFI_MEMORY_MAPPED_IO:
-                       res->name = "Memory Mapped IO";
-                       break;
-               case EFI_MEMORY_MAPPED_IO_PORT_SPACE:
-                       res->name = "Memory Mapped IO Port Space";
-                       break;
-               default:
-                       res->name = "Reserved";
-                       break;
-               }
-               res->start = md->phys_addr;
-               res->end = res->start + ((md->num_pages << EFI_PAGE_SHIFT) - 1);
-               res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
-               if (request_resource(&iomem_resource, res) < 0)
-                       printk(KERN_ERR PFX "Failed to allocate res %s : "
-                               "0x%llx-0x%llx\n", res->name,
-                               (unsigned long long)res->start,
-                               (unsigned long long)res->end);
-               /*
-                * We don't know which region contains kernel data so we try
-                * it repeatedly and let the resource manager test it.
-                */
-               if (md->type == EFI_CONVENTIONAL_MEMORY) {
-                       request_resource(res, code_resource);
-                       request_resource(res, data_resource);
-#ifdef CONFIG_KEXEC
-                       request_resource(res, &crashk_res);
-#endif
-               }
-       }
-}
-
-/*
- * Convenience functions to obtain memory types and attributes
- */
-
-u32 efi_mem_type(unsigned long phys_addr)
-{
-       efi_memory_desc_t *md;
-       void *p;
-
-       for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
-               md = p;
-               if ((md->phys_addr <= phys_addr) && (phys_addr <
-                       (md->phys_addr + (md-> num_pages << EFI_PAGE_SHIFT)) ))
-                       return md->type;
-       }
-       return 0;
-}
-
-u64 efi_mem_attributes(unsigned long phys_addr)
-{
-       efi_memory_desc_t *md;
-       void *p;
-
-       for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
-               md = p;
-               if ((md->phys_addr <= phys_addr) && (phys_addr <
-                       (md->phys_addr + (md-> num_pages << EFI_PAGE_SHIFT)) ))
-                       return md->attribute;
-       }
-       return 0;
-}
diff --git a/arch/i386/kernel/efi_stub_32.S b/arch/i386/kernel/efi_stub_32.S
deleted file mode 100644 (file)
index ef00bb7..0000000
+++ /dev/null
@@ -1,122 +0,0 @@
-/*
- * EFI call stub for IA32.
- *
- * This stub allows us to make EFI calls in physical mode with interrupts
- * turned off.
- */
-
-#include <linux/linkage.h>
-#include <asm/page.h>
-
-/*
- * efi_call_phys(void *, ...) is a function with variable parameters.
- * All the callers of this function assure that all the parameters are 4-bytes.
- */
-
-/*
- * In gcc calling convention, EBX, ESP, EBP, ESI and EDI are all callee save.
- * So we'd better save all of them at the beginning of this function and restore
- * at the end no matter how many we use, because we can not assure EFI runtime
- * service functions will comply with gcc calling convention, too.
- */
-
-.text
-ENTRY(efi_call_phys)
-       /*
-        * 0. The function can only be called in Linux kernel. So CS has been
-        * set to 0x0010, DS and SS have been set to 0x0018. In EFI, I found
-        * the values of these registers are the same. And, the corresponding
-        * GDT entries are identical. So I will do nothing about segment reg
-        * and GDT, but change GDT base register in prelog and epilog.
-        */
-
-       /*
-        * 1. Now I am running with EIP = <physical address> + PAGE_OFFSET.
-        * But to make it smoothly switch from virtual mode to flat mode.
-        * The mapping of lower virtual memory has been created in prelog and
-        * epilog.
-        */
-       movl    $1f, %edx
-       subl    $__PAGE_OFFSET, %edx
-       jmp     *%edx
-1:
-
-       /*
-        * 2. Now on the top of stack is the return
-        * address in the caller of efi_call_phys(), then parameter 1,
-        * parameter 2, ..., param n. To make things easy, we save the return
-        * address of efi_call_phys in a global variable.
-        */
-       popl    %edx
-       movl    %edx, saved_return_addr
-       /* get the function pointer into ECX*/
-       popl    %ecx
-       movl    %ecx, efi_rt_function_ptr
-       movl    $2f, %edx
-       subl    $__PAGE_OFFSET, %edx
-       pushl   %edx
-
-       /*
-        * 3. Clear PG bit in %CR0.
-        */
-       movl    %cr0, %edx
-       andl    $0x7fffffff, %edx
-       movl    %edx, %cr0
-       jmp     1f
-1:
-
-       /*
-        * 4. Adjust stack pointer.
-        */
-       subl    $__PAGE_OFFSET, %esp
-
-       /*
-        * 5. Call the physical function.
-        */
-       jmp     *%ecx
-
-2:
-       /*
-        * 6. After EFI runtime service returns, control will return to
-        * following instruction. We'd better readjust stack pointer first.
-        */
-       addl    $__PAGE_OFFSET, %esp
-
-       /*
-        * 7. Restore PG bit
-        */
-       movl    %cr0, %edx
-       orl     $0x80000000, %edx
-       movl    %edx, %cr0
-       jmp     1f
-1:
-       /*
-        * 8. Now restore the virtual mode from flat mode by
-        * adding EIP with PAGE_OFFSET.
-        */
-       movl    $1f, %edx
-       jmp     *%edx
-1:
-
-       /*
-        * 9. Balance the stack. And because EAX contain the return value,
-        * we'd better not clobber it.
-        */
-       leal    efi_rt_function_ptr, %edx
-       movl    (%edx), %ecx
-       pushl   %ecx
-
-       /*
-        * 10. Push the saved return address onto the stack and return.
-        */
-       leal    saved_return_addr, %edx
-       movl    (%edx), %ecx
-       pushl   %ecx
-       ret
-.previous
-
-.data
-saved_return_addr:
-       .long 0
-efi_rt_function_ptr:
-       .long 0
diff --git a/arch/i386/kernel/entry_32.S b/arch/i386/kernel/entry_32.S
deleted file mode 100644 (file)
index 290b7bc..0000000
+++ /dev/null
@@ -1,1112 +0,0 @@
-/*
- *  linux/arch/i386/entry.S
- *
- *  Copyright (C) 1991, 1992  Linus Torvalds
- */
-
-/*
- * entry.S contains the system-call and fault low-level handling routines.
- * This also contains the timer-interrupt handler, as well as all interrupts
- * and faults that can result in a task-switch.
- *
- * NOTE: This code handles signal-recognition, which happens every time
- * after a timer-interrupt and after each system call.
- *
- * I changed all the .align's to 4 (16 byte alignment), as that's faster
- * on a 486.
- *
- * Stack layout in 'syscall_exit':
- *     ptrace needs to have all regs on the stack.
- *     if the order here is changed, it needs to be
- *     updated in fork.c:copy_process, signal.c:do_signal,
- *     ptrace.c and ptrace.h
- *
- *      0(%esp) - %ebx
- *      4(%esp) - %ecx
- *      8(%esp) - %edx
- *       C(%esp) - %esi
- *     10(%esp) - %edi
- *     14(%esp) - %ebp
- *     18(%esp) - %eax
- *     1C(%esp) - %ds
- *     20(%esp) - %es
- *     24(%esp) - %fs
- *     28(%esp) - orig_eax
- *     2C(%esp) - %eip
- *     30(%esp) - %cs
- *     34(%esp) - %eflags
- *     38(%esp) - %oldesp
- *     3C(%esp) - %oldss
- *
- * "current" is in register %ebx during any slow entries.
- */
-
-#include <linux/linkage.h>
-#include <asm/thread_info.h>
-#include <asm/irqflags.h>
-#include <asm/errno.h>
-#include <asm/segment.h>
-#include <asm/smp.h>
-#include <asm/page.h>
-#include <asm/desc.h>
-#include <asm/percpu.h>
-#include <asm/dwarf2.h>
-#include "irq_vectors.h"
-
-/*
- * We use macros for low-level operations which need to be overridden
- * for paravirtualization.  The following will never clobber any registers:
- *   INTERRUPT_RETURN (aka. "iret")
- *   GET_CR0_INTO_EAX (aka. "movl %cr0, %eax")
- *   ENABLE_INTERRUPTS_SYSEXIT (aka "sti; sysexit").
- *
- * For DISABLE_INTERRUPTS/ENABLE_INTERRUPTS (aka "cli"/"sti"), you must
- * specify what registers can be overwritten (CLBR_NONE, CLBR_EAX/EDX/ECX/ANY).
- * Allowing a register to be clobbered can shrink the paravirt replacement
- * enough to patch inline, increasing performance.
- */
-
-#define nr_syscalls ((syscall_table_size)/4)
-
-CF_MASK                = 0x00000001
-TF_MASK                = 0x00000100
-IF_MASK                = 0x00000200
-DF_MASK                = 0x00000400 
-NT_MASK                = 0x00004000
-VM_MASK                = 0x00020000
-
-#ifdef CONFIG_PREEMPT
-#define preempt_stop(clobbers) DISABLE_INTERRUPTS(clobbers); TRACE_IRQS_OFF
-#else
-#define preempt_stop(clobbers)
-#define resume_kernel          restore_nocheck
-#endif
-
-.macro TRACE_IRQS_IRET
-#ifdef CONFIG_TRACE_IRQFLAGS
-       testl $IF_MASK,PT_EFLAGS(%esp)     # interrupts off?
-       jz 1f
-       TRACE_IRQS_ON
-1:
-#endif
-.endm
-
-#ifdef CONFIG_VM86
-#define resume_userspace_sig   check_userspace
-#else
-#define resume_userspace_sig   resume_userspace
-#endif
-
-#define SAVE_ALL \
-       cld; \
-       pushl %fs; \
-       CFI_ADJUST_CFA_OFFSET 4;\
-       /*CFI_REL_OFFSET fs, 0;*/\
-       pushl %es; \
-       CFI_ADJUST_CFA_OFFSET 4;\
-       /*CFI_REL_OFFSET es, 0;*/\
-       pushl %ds; \
-       CFI_ADJUST_CFA_OFFSET 4;\
-       /*CFI_REL_OFFSET ds, 0;*/\
-       pushl %eax; \
-       CFI_ADJUST_CFA_OFFSET 4;\
-       CFI_REL_OFFSET eax, 0;\
-       pushl %ebp; \
-       CFI_ADJUST_CFA_OFFSET 4;\
-       CFI_REL_OFFSET ebp, 0;\
-       pushl %edi; \
-       CFI_ADJUST_CFA_OFFSET 4;\
-       CFI_REL_OFFSET edi, 0;\
-       pushl %esi; \
-       CFI_ADJUST_CFA_OFFSET 4;\
-       CFI_REL_OFFSET esi, 0;\
-       pushl %edx; \
-       CFI_ADJUST_CFA_OFFSET 4;\
-       CFI_REL_OFFSET edx, 0;\
-       pushl %ecx; \
-       CFI_ADJUST_CFA_OFFSET 4;\
-       CFI_REL_OFFSET ecx, 0;\
-       pushl %ebx; \
-       CFI_ADJUST_CFA_OFFSET 4;\
-       CFI_REL_OFFSET ebx, 0;\
-       movl $(__USER_DS), %edx; \
-       movl %edx, %ds; \
-       movl %edx, %es; \
-       movl $(__KERNEL_PERCPU), %edx; \
-       movl %edx, %fs
-
-#define RESTORE_INT_REGS \
-       popl %ebx;      \
-       CFI_ADJUST_CFA_OFFSET -4;\
-       CFI_RESTORE ebx;\
-       popl %ecx;      \
-       CFI_ADJUST_CFA_OFFSET -4;\
-       CFI_RESTORE ecx;\
-       popl %edx;      \
-       CFI_ADJUST_CFA_OFFSET -4;\
-       CFI_RESTORE edx;\
-       popl %esi;      \
-       CFI_ADJUST_CFA_OFFSET -4;\
-       CFI_RESTORE esi;\
-       popl %edi;      \
-       CFI_ADJUST_CFA_OFFSET -4;\
-       CFI_RESTORE edi;\
-       popl %ebp;      \
-       CFI_ADJUST_CFA_OFFSET -4;\
-       CFI_RESTORE ebp;\
-       popl %eax;      \
-       CFI_ADJUST_CFA_OFFSET -4;\
-       CFI_RESTORE eax
-
-#define RESTORE_REGS   \
-       RESTORE_INT_REGS; \
-1:     popl %ds;       \
-       CFI_ADJUST_CFA_OFFSET -4;\
-       /*CFI_RESTORE ds;*/\
-2:     popl %es;       \
-       CFI_ADJUST_CFA_OFFSET -4;\
-       /*CFI_RESTORE es;*/\
-3:     popl %fs;       \
-       CFI_ADJUST_CFA_OFFSET -4;\
-       /*CFI_RESTORE fs;*/\
-.pushsection .fixup,"ax";      \
-4:     movl $0,(%esp); \
-       jmp 1b;         \
-5:     movl $0,(%esp); \
-       jmp 2b;         \
-6:     movl $0,(%esp); \
-       jmp 3b;         \
-.section __ex_table,"a";\
-       .align 4;       \
-       .long 1b,4b;    \
-       .long 2b,5b;    \
-       .long 3b,6b;    \
-.popsection
-
-#define RING0_INT_FRAME \
-       CFI_STARTPROC simple;\
-       CFI_SIGNAL_FRAME;\
-       CFI_DEF_CFA esp, 3*4;\
-       /*CFI_OFFSET cs, -2*4;*/\
-       CFI_OFFSET eip, -3*4
-
-#define RING0_EC_FRAME \
-       CFI_STARTPROC simple;\
-       CFI_SIGNAL_FRAME;\
-       CFI_DEF_CFA esp, 4*4;\
-       /*CFI_OFFSET cs, -2*4;*/\
-       CFI_OFFSET eip, -3*4
-
-#define RING0_PTREGS_FRAME \
-       CFI_STARTPROC simple;\
-       CFI_SIGNAL_FRAME;\
-       CFI_DEF_CFA esp, PT_OLDESP-PT_EBX;\
-       /*CFI_OFFSET cs, PT_CS-PT_OLDESP;*/\
-       CFI_OFFSET eip, PT_EIP-PT_OLDESP;\
-       /*CFI_OFFSET es, PT_ES-PT_OLDESP;*/\
-       /*CFI_OFFSET ds, PT_DS-PT_OLDESP;*/\
-       CFI_OFFSET eax, PT_EAX-PT_OLDESP;\
-       CFI_OFFSET ebp, PT_EBP-PT_OLDESP;\
-       CFI_OFFSET edi, PT_EDI-PT_OLDESP;\
-       CFI_OFFSET esi, PT_ESI-PT_OLDESP;\
-       CFI_OFFSET edx, PT_EDX-PT_OLDESP;\
-       CFI_OFFSET ecx, PT_ECX-PT_OLDESP;\
-       CFI_OFFSET ebx, PT_EBX-PT_OLDESP
-
-ENTRY(ret_from_fork)
-       CFI_STARTPROC
-       pushl %eax
-       CFI_ADJUST_CFA_OFFSET 4
-       call schedule_tail
-       GET_THREAD_INFO(%ebp)
-       popl %eax
-       CFI_ADJUST_CFA_OFFSET -4
-       pushl $0x0202                   # Reset kernel eflags
-       CFI_ADJUST_CFA_OFFSET 4
-       popfl
-       CFI_ADJUST_CFA_OFFSET -4
-       jmp syscall_exit
-       CFI_ENDPROC
-END(ret_from_fork)
-
-/*
- * Return to user mode is not as complex as all this looks,
- * but we want the default path for a system call return to
- * go as quickly as possible which is why some of this is
- * less clear than it otherwise should be.
- */
-
-       # userspace resumption stub bypassing syscall exit tracing
-       ALIGN
-       RING0_PTREGS_FRAME
-ret_from_exception:
-       preempt_stop(CLBR_ANY)
-ret_from_intr:
-       GET_THREAD_INFO(%ebp)
-check_userspace:
-       movl PT_EFLAGS(%esp), %eax      # mix EFLAGS and CS
-       movb PT_CS(%esp), %al
-       andl $(VM_MASK | SEGMENT_RPL_MASK), %eax
-       cmpl $USER_RPL, %eax
-       jb resume_kernel                # not returning to v8086 or userspace
-
-ENTRY(resume_userspace)
-       DISABLE_INTERRUPTS(CLBR_ANY)    # make sure we don't miss an interrupt
-                                       # setting need_resched or sigpending
-                                       # between sampling and the iret
-       movl TI_flags(%ebp), %ecx
-       andl $_TIF_WORK_MASK, %ecx      # is there any work to be done on
-                                       # int/exception return?
-       jne work_pending
-       jmp restore_all
-END(ret_from_exception)
-
-#ifdef CONFIG_PREEMPT
-ENTRY(resume_kernel)
-       DISABLE_INTERRUPTS(CLBR_ANY)
-       cmpl $0,TI_preempt_count(%ebp)  # non-zero preempt_count ?
-       jnz restore_nocheck
-need_resched:
-       movl TI_flags(%ebp), %ecx       # need_resched set ?
-       testb $_TIF_NEED_RESCHED, %cl
-       jz restore_all
-       testl $IF_MASK,PT_EFLAGS(%esp)  # interrupts off (exception path) ?
-       jz restore_all
-       call preempt_schedule_irq
-       jmp need_resched
-END(resume_kernel)
-#endif
-       CFI_ENDPROC
-
-/* SYSENTER_RETURN points to after the "sysenter" instruction in
-   the vsyscall page.  See vsyscall-sysentry.S, which defines the symbol.  */
-
-       # sysenter call handler stub
-ENTRY(sysenter_entry)
-       CFI_STARTPROC simple
-       CFI_SIGNAL_FRAME
-       CFI_DEF_CFA esp, 0
-       CFI_REGISTER esp, ebp
-       movl TSS_sysenter_esp0(%esp),%esp
-sysenter_past_esp:
-       /*
-        * No need to follow this irqs on/off section: the syscall
-        * disabled irqs and here we enable it straight after entry:
-        */
-       ENABLE_INTERRUPTS(CLBR_NONE)
-       pushl $(__USER_DS)
-       CFI_ADJUST_CFA_OFFSET 4
-       /*CFI_REL_OFFSET ss, 0*/
-       pushl %ebp
-       CFI_ADJUST_CFA_OFFSET 4
-       CFI_REL_OFFSET esp, 0
-       pushfl
-       CFI_ADJUST_CFA_OFFSET 4
-       pushl $(__USER_CS)
-       CFI_ADJUST_CFA_OFFSET 4
-       /*CFI_REL_OFFSET cs, 0*/
-       /*
-        * Push current_thread_info()->sysenter_return to the stack.
-        * A tiny bit of offset fixup is necessary - 4*4 means the 4 words
-        * pushed above; +8 corresponds to copy_thread's esp0 setting.
-        */
-       pushl (TI_sysenter_return-THREAD_SIZE+8+4*4)(%esp)
-       CFI_ADJUST_CFA_OFFSET 4
-       CFI_REL_OFFSET eip, 0
-
-/*
- * Load the potential sixth argument from user stack.
- * Careful about security.
- */
-       cmpl $__PAGE_OFFSET-3,%ebp
-       jae syscall_fault
-1:     movl (%ebp),%ebp
-.section __ex_table,"a"
-       .align 4
-       .long 1b,syscall_fault
-.previous
-
-       pushl %eax
-       CFI_ADJUST_CFA_OFFSET 4
-       SAVE_ALL
-       GET_THREAD_INFO(%ebp)
-
-       /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */
-       testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(%ebp)
-       jnz syscall_trace_entry
-       cmpl $(nr_syscalls), %eax
-       jae syscall_badsys
-       call *sys_call_table(,%eax,4)
-       movl %eax,PT_EAX(%esp)
-       DISABLE_INTERRUPTS(CLBR_ANY)
-       TRACE_IRQS_OFF
-       movl TI_flags(%ebp), %ecx
-       testw $_TIF_ALLWORK_MASK, %cx
-       jne syscall_exit_work
-/* if something modifies registers it must also disable sysexit */
-       movl PT_EIP(%esp), %edx
-       movl PT_OLDESP(%esp), %ecx
-       xorl %ebp,%ebp
-       TRACE_IRQS_ON
-1:     mov  PT_FS(%esp), %fs
-       ENABLE_INTERRUPTS_SYSEXIT
-       CFI_ENDPROC
-.pushsection .fixup,"ax"
-2:     movl $0,PT_FS(%esp)
-       jmp 1b
-.section __ex_table,"a"
-       .align 4
-       .long 1b,2b
-.popsection
-ENDPROC(sysenter_entry)
-
-       # system call handler stub
-ENTRY(system_call)
-       RING0_INT_FRAME                 # can't unwind into user space anyway
-       pushl %eax                      # save orig_eax
-       CFI_ADJUST_CFA_OFFSET 4
-       SAVE_ALL
-       GET_THREAD_INFO(%ebp)
-                                       # system call tracing in operation / emulation
-       /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */
-       testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(%ebp)
-       jnz syscall_trace_entry
-       cmpl $(nr_syscalls), %eax
-       jae syscall_badsys
-syscall_call:
-       call *sys_call_table(,%eax,4)
-       movl %eax,PT_EAX(%esp)          # store the return value
-syscall_exit:
-       DISABLE_INTERRUPTS(CLBR_ANY)    # make sure we don't miss an interrupt
-                                       # setting need_resched or sigpending
-                                       # between sampling and the iret
-       TRACE_IRQS_OFF
-       testl $TF_MASK,PT_EFLAGS(%esp)  # If tracing set singlestep flag on exit
-       jz no_singlestep
-       orl $_TIF_SINGLESTEP,TI_flags(%ebp)
-no_singlestep:
-       movl TI_flags(%ebp), %ecx
-       testw $_TIF_ALLWORK_MASK, %cx   # current->work
-       jne syscall_exit_work
-
-restore_all:
-       movl PT_EFLAGS(%esp), %eax      # mix EFLAGS, SS and CS
-       # Warning: PT_OLDSS(%esp) contains the wrong/random values if we
-       # are returning to the kernel.
-       # See comments in process.c:copy_thread() for details.
-       movb PT_OLDSS(%esp), %ah
-       movb PT_CS(%esp), %al
-       andl $(VM_MASK | (SEGMENT_TI_MASK << 8) | SEGMENT_RPL_MASK), %eax
-       cmpl $((SEGMENT_LDT << 8) | USER_RPL), %eax
-       CFI_REMEMBER_STATE
-       je ldt_ss                       # returning to user-space with LDT SS
-restore_nocheck:
-       TRACE_IRQS_IRET
-restore_nocheck_notrace:
-       RESTORE_REGS
-       addl $4, %esp                   # skip orig_eax/error_code
-       CFI_ADJUST_CFA_OFFSET -4
-1:     INTERRUPT_RETURN
-.section .fixup,"ax"
-iret_exc:
-       pushl $0                        # no error code
-       pushl $do_iret_error
-       jmp error_code
-.previous
-.section __ex_table,"a"
-       .align 4
-       .long 1b,iret_exc
-.previous
-
-       CFI_RESTORE_STATE
-ldt_ss:
-       larl PT_OLDSS(%esp), %eax
-       jnz restore_nocheck
-       testl $0x00400000, %eax         # returning to 32bit stack?
-       jnz restore_nocheck             # allright, normal return
-
-#ifdef CONFIG_PARAVIRT
-       /*
-        * The kernel can't run on a non-flat stack if paravirt mode
-        * is active.  Rather than try to fixup the high bits of
-        * ESP, bypass this code entirely.  This may break DOSemu
-        * and/or Wine support in a paravirt VM, although the option
-        * is still available to implement the setting of the high
-        * 16-bits in the INTERRUPT_RETURN paravirt-op.
-        */
-       cmpl $0, paravirt_ops+PARAVIRT_enabled
-       jne restore_nocheck
-#endif
-
-       /* If returning to userspace with 16bit stack,
-        * try to fix the higher word of ESP, as the CPU
-        * won't restore it.
-        * This is an "official" bug of all the x86-compatible
-        * CPUs, which we can try to work around to make
-        * dosemu and wine happy. */
-       movl PT_OLDESP(%esp), %eax
-       movl %esp, %edx
-       call patch_espfix_desc
-       pushl $__ESPFIX_SS
-       CFI_ADJUST_CFA_OFFSET 4
-       pushl %eax
-       CFI_ADJUST_CFA_OFFSET 4
-       DISABLE_INTERRUPTS(CLBR_EAX)
-       TRACE_IRQS_OFF
-       lss (%esp), %esp
-       CFI_ADJUST_CFA_OFFSET -8
-       jmp restore_nocheck
-       CFI_ENDPROC
-ENDPROC(system_call)
-
-       # perform work that needs to be done immediately before resumption
-       ALIGN
-       RING0_PTREGS_FRAME              # can't unwind into user space anyway
-work_pending:
-       testb $_TIF_NEED_RESCHED, %cl
-       jz work_notifysig
-work_resched:
-       call schedule
-       DISABLE_INTERRUPTS(CLBR_ANY)    # make sure we don't miss an interrupt
-                                       # setting need_resched or sigpending
-                                       # between sampling and the iret
-       TRACE_IRQS_OFF
-       movl TI_flags(%ebp), %ecx
-       andl $_TIF_WORK_MASK, %ecx      # is there any work to be done other
-                                       # than syscall tracing?
-       jz restore_all
-       testb $_TIF_NEED_RESCHED, %cl
-       jnz work_resched
-
-work_notifysig:                                # deal with pending signals and
-                                       # notify-resume requests
-#ifdef CONFIG_VM86
-       testl $VM_MASK, PT_EFLAGS(%esp)
-       movl %esp, %eax
-       jne work_notifysig_v86          # returning to kernel-space or
-                                       # vm86-space
-       xorl %edx, %edx
-       call do_notify_resume
-       jmp resume_userspace_sig
-
-       ALIGN
-work_notifysig_v86:
-       pushl %ecx                      # save ti_flags for do_notify_resume
-       CFI_ADJUST_CFA_OFFSET 4
-       call save_v86_state             # %eax contains pt_regs pointer
-       popl %ecx
-       CFI_ADJUST_CFA_OFFSET -4
-       movl %eax, %esp
-#else
-       movl %esp, %eax
-#endif
-       xorl %edx, %edx
-       call do_notify_resume
-       jmp resume_userspace_sig
-END(work_pending)
-
-       # perform syscall exit tracing
-       ALIGN
-syscall_trace_entry:
-       movl $-ENOSYS,PT_EAX(%esp)
-       movl %esp, %eax
-       xorl %edx,%edx
-       call do_syscall_trace
-       cmpl $0, %eax
-       jne resume_userspace            # ret != 0 -> running under PTRACE_SYSEMU,
-                                       # so must skip actual syscall
-       movl PT_ORIG_EAX(%esp), %eax
-       cmpl $(nr_syscalls), %eax
-       jnae syscall_call
-       jmp syscall_exit
-END(syscall_trace_entry)
-
-       # perform syscall exit tracing
-       ALIGN
-syscall_exit_work:
-       testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP), %cl
-       jz work_pending
-       TRACE_IRQS_ON
-       ENABLE_INTERRUPTS(CLBR_ANY)     # could let do_syscall_trace() call
-                                       # schedule() instead
-       movl %esp, %eax
-       movl $1, %edx
-       call do_syscall_trace
-       jmp resume_userspace
-END(syscall_exit_work)
-       CFI_ENDPROC
-
-       RING0_INT_FRAME                 # can't unwind into user space anyway
-syscall_fault:
-       pushl %eax                      # save orig_eax
-       CFI_ADJUST_CFA_OFFSET 4
-       SAVE_ALL
-       GET_THREAD_INFO(%ebp)
-       movl $-EFAULT,PT_EAX(%esp)
-       jmp resume_userspace
-END(syscall_fault)
-
-syscall_badsys:
-       movl $-ENOSYS,PT_EAX(%esp)
-       jmp resume_userspace
-END(syscall_badsys)
-       CFI_ENDPROC
-
-#define FIXUP_ESPFIX_STACK \
-       /* since we are on a wrong stack, we cant make it a C code :( */ \
-       PER_CPU(gdt_page, %ebx); \
-       GET_DESC_BASE(GDT_ENTRY_ESPFIX_SS, %ebx, %eax, %ax, %al, %ah); \
-       addl %esp, %eax; \
-       pushl $__KERNEL_DS; \
-       CFI_ADJUST_CFA_OFFSET 4; \
-       pushl %eax; \
-       CFI_ADJUST_CFA_OFFSET 4; \
-       lss (%esp), %esp; \
-       CFI_ADJUST_CFA_OFFSET -8;
-#define UNWIND_ESPFIX_STACK \
-       movl %ss, %eax; \
-       /* see if on espfix stack */ \
-       cmpw $__ESPFIX_SS, %ax; \
-       jne 27f; \
-       movl $__KERNEL_DS, %eax; \
-       movl %eax, %ds; \
-       movl %eax, %es; \
-       /* switch to normal stack */ \
-       FIXUP_ESPFIX_STACK; \
-27:;
-
-/*
- * Build the entry stubs and pointer table with
- * some assembler magic.
- */
-.data
-ENTRY(interrupt)
-.text
-
-ENTRY(irq_entries_start)
-       RING0_INT_FRAME
-vector=0
-.rept NR_IRQS
-       ALIGN
- .if vector
-       CFI_ADJUST_CFA_OFFSET -4
- .endif
-1:     pushl $~(vector)
-       CFI_ADJUST_CFA_OFFSET 4
-       jmp common_interrupt
- .previous
-       .long 1b
- .text
-vector=vector+1
-.endr
-END(irq_entries_start)
-
-.previous
-END(interrupt)
-.previous
-
-/*
- * the CPU automatically disables interrupts when executing an IRQ vector,
- * so IRQ-flags tracing has to follow that:
- */
-       ALIGN
-common_interrupt:
-       SAVE_ALL
-       TRACE_IRQS_OFF
-       movl %esp,%eax
-       call do_IRQ
-       jmp ret_from_intr
-ENDPROC(common_interrupt)
-       CFI_ENDPROC
-
-#define BUILD_INTERRUPT(name, nr)      \
-ENTRY(name)                            \
-       RING0_INT_FRAME;                \
-       pushl $~(nr);                   \
-       CFI_ADJUST_CFA_OFFSET 4;        \
-       SAVE_ALL;                       \
-       TRACE_IRQS_OFF                  \
-       movl %esp,%eax;                 \
-       call smp_##name;                \
-       jmp ret_from_intr;              \
-       CFI_ENDPROC;                    \
-ENDPROC(name)
-
-/* The include is where all of the SMP etc. interrupts come from */
-#include "entry_arch.h"
-
-KPROBE_ENTRY(page_fault)
-       RING0_EC_FRAME
-       pushl $do_page_fault
-       CFI_ADJUST_CFA_OFFSET 4
-       ALIGN
-error_code:
-       /* the function address is in %fs's slot on the stack */
-       pushl %es
-       CFI_ADJUST_CFA_OFFSET 4
-       /*CFI_REL_OFFSET es, 0*/
-       pushl %ds
-       CFI_ADJUST_CFA_OFFSET 4
-       /*CFI_REL_OFFSET ds, 0*/
-       pushl %eax
-       CFI_ADJUST_CFA_OFFSET 4
-       CFI_REL_OFFSET eax, 0
-       pushl %ebp
-       CFI_ADJUST_CFA_OFFSET 4
-       CFI_REL_OFFSET ebp, 0
-       pushl %edi
-       CFI_ADJUST_CFA_OFFSET 4
-       CFI_REL_OFFSET edi, 0
-       pushl %esi
-       CFI_ADJUST_CFA_OFFSET 4
-       CFI_REL_OFFSET esi, 0
-       pushl %edx
-       CFI_ADJUST_CFA_OFFSET 4
-       CFI_REL_OFFSET edx, 0
-       pushl %ecx
-       CFI_ADJUST_CFA_OFFSET 4
-       CFI_REL_OFFSET ecx, 0
-       pushl %ebx
-       CFI_ADJUST_CFA_OFFSET 4
-       CFI_REL_OFFSET ebx, 0
-       cld
-       pushl %fs
-       CFI_ADJUST_CFA_OFFSET 4
-       /*CFI_REL_OFFSET fs, 0*/
-       movl $(__KERNEL_PERCPU), %ecx
-       movl %ecx, %fs
-       UNWIND_ESPFIX_STACK
-       popl %ecx
-       CFI_ADJUST_CFA_OFFSET -4
-       /*CFI_REGISTER es, ecx*/
-       movl PT_FS(%esp), %edi          # get the function address
-       movl PT_ORIG_EAX(%esp), %edx    # get the error code
-       movl $-1, PT_ORIG_EAX(%esp)     # no syscall to restart
-       mov  %ecx, PT_FS(%esp)
-       /*CFI_REL_OFFSET fs, ES*/
-       movl $(__USER_DS), %ecx
-       movl %ecx, %ds
-       movl %ecx, %es
-       movl %esp,%eax                  # pt_regs pointer
-       call *%edi
-       jmp ret_from_exception
-       CFI_ENDPROC
-KPROBE_END(page_fault)
-
-ENTRY(coprocessor_error)
-       RING0_INT_FRAME
-       pushl $0
-       CFI_ADJUST_CFA_OFFSET 4
-       pushl $do_coprocessor_error
-       CFI_ADJUST_CFA_OFFSET 4
-       jmp error_code
-       CFI_ENDPROC
-END(coprocessor_error)
-
-ENTRY(simd_coprocessor_error)
-       RING0_INT_FRAME
-       pushl $0
-       CFI_ADJUST_CFA_OFFSET 4
-       pushl $do_simd_coprocessor_error
-       CFI_ADJUST_CFA_OFFSET 4
-       jmp error_code
-       CFI_ENDPROC
-END(simd_coprocessor_error)
-
-ENTRY(device_not_available)
-       RING0_INT_FRAME
-       pushl $-1                       # mark this as an int
-       CFI_ADJUST_CFA_OFFSET 4
-       SAVE_ALL
-       GET_CR0_INTO_EAX
-       testl $0x4, %eax                # EM (math emulation bit)
-       jne device_not_available_emulate
-       preempt_stop(CLBR_ANY)
-       call math_state_restore
-       jmp ret_from_exception
-device_not_available_emulate:
-       pushl $0                        # temporary storage for ORIG_EIP
-       CFI_ADJUST_CFA_OFFSET 4
-       call math_emulate
-       addl $4, %esp
-       CFI_ADJUST_CFA_OFFSET -4
-       jmp ret_from_exception
-       CFI_ENDPROC
-END(device_not_available)
-
-/*
- * Debug traps and NMI can happen at the one SYSENTER instruction
- * that sets up the real kernel stack. Check here, since we can't
- * allow the wrong stack to be used.
- *
- * "TSS_sysenter_esp0+12" is because the NMI/debug handler will have
- * already pushed 3 words if it hits on the sysenter instruction:
- * eflags, cs and eip.
- *
- * We just load the right stack, and push the three (known) values
- * by hand onto the new stack - while updating the return eip past
- * the instruction that would have done it for sysenter.
- */
-#define FIX_STACK(offset, ok, label)           \
-       cmpw $__KERNEL_CS,4(%esp);              \
-       jne ok;                                 \
-label:                                         \
-       movl TSS_sysenter_esp0+offset(%esp),%esp;       \
-       CFI_DEF_CFA esp, 0;                     \
-       CFI_UNDEFINED eip;                      \
-       pushfl;                                 \
-       CFI_ADJUST_CFA_OFFSET 4;                \
-       pushl $__KERNEL_CS;                     \
-       CFI_ADJUST_CFA_OFFSET 4;                \
-       pushl $sysenter_past_esp;               \
-       CFI_ADJUST_CFA_OFFSET 4;                \
-       CFI_REL_OFFSET eip, 0
-
-KPROBE_ENTRY(debug)
-       RING0_INT_FRAME
-       cmpl $sysenter_entry,(%esp)
-       jne debug_stack_correct
-       FIX_STACK(12, debug_stack_correct, debug_esp_fix_insn)
-debug_stack_correct:
-       pushl $-1                       # mark this as an int
-       CFI_ADJUST_CFA_OFFSET 4
-       SAVE_ALL
-       xorl %edx,%edx                  # error code 0
-       movl %esp,%eax                  # pt_regs pointer
-       call do_debug
-       jmp ret_from_exception
-       CFI_ENDPROC
-KPROBE_END(debug)
-
-/*
- * NMI is doubly nasty. It can happen _while_ we're handling
- * a debug fault, and the debug fault hasn't yet been able to
- * clear up the stack. So we first check whether we got  an
- * NMI on the sysenter entry path, but after that we need to
- * check whether we got an NMI on the debug path where the debug
- * fault happened on the sysenter path.
- */
-KPROBE_ENTRY(nmi)
-       RING0_INT_FRAME
-       pushl %eax
-       CFI_ADJUST_CFA_OFFSET 4
-       movl %ss, %eax
-       cmpw $__ESPFIX_SS, %ax
-       popl %eax
-       CFI_ADJUST_CFA_OFFSET -4
-       je nmi_espfix_stack
-       cmpl $sysenter_entry,(%esp)
-       je nmi_stack_fixup
-       pushl %eax
-       CFI_ADJUST_CFA_OFFSET 4
-       movl %esp,%eax
-       /* Do not access memory above the end of our stack page,
-        * it might not exist.
-        */
-       andl $(THREAD_SIZE-1),%eax
-       cmpl $(THREAD_SIZE-20),%eax
-       popl %eax
-       CFI_ADJUST_CFA_OFFSET -4
-       jae nmi_stack_correct
-       cmpl $sysenter_entry,12(%esp)
-       je nmi_debug_stack_check
-nmi_stack_correct:
-       /* We have a RING0_INT_FRAME here */
-       pushl %eax
-       CFI_ADJUST_CFA_OFFSET 4
-       SAVE_ALL
-       xorl %edx,%edx          # zero error code
-       movl %esp,%eax          # pt_regs pointer
-       call do_nmi
-       jmp restore_nocheck_notrace
-       CFI_ENDPROC
-
-nmi_stack_fixup:
-       RING0_INT_FRAME
-       FIX_STACK(12,nmi_stack_correct, 1)
-       jmp nmi_stack_correct
-
-nmi_debug_stack_check:
-       /* We have a RING0_INT_FRAME here */
-       cmpw $__KERNEL_CS,16(%esp)
-       jne nmi_stack_correct
-       cmpl $debug,(%esp)
-       jb nmi_stack_correct
-       cmpl $debug_esp_fix_insn,(%esp)
-       ja nmi_stack_correct
-       FIX_STACK(24,nmi_stack_correct, 1)
-       jmp nmi_stack_correct
-
-nmi_espfix_stack:
-       /* We have a RING0_INT_FRAME here.
-        *
-        * create the pointer to lss back
-        */
-       pushl %ss
-       CFI_ADJUST_CFA_OFFSET 4
-       pushl %esp
-       CFI_ADJUST_CFA_OFFSET 4
-       addw $4, (%esp)
-       /* copy the iret frame of 12 bytes */
-       .rept 3
-       pushl 16(%esp)
-       CFI_ADJUST_CFA_OFFSET 4
-       .endr
-       pushl %eax
-       CFI_ADJUST_CFA_OFFSET 4
-       SAVE_ALL
-       FIXUP_ESPFIX_STACK              # %eax == %esp
-       xorl %edx,%edx                  # zero error code
-       call do_nmi
-       RESTORE_REGS
-       lss 12+4(%esp), %esp            # back to espfix stack
-       CFI_ADJUST_CFA_OFFSET -24
-1:     INTERRUPT_RETURN
-       CFI_ENDPROC
-.section __ex_table,"a"
-       .align 4
-       .long 1b,iret_exc
-.previous
-KPROBE_END(nmi)
-
-#ifdef CONFIG_PARAVIRT
-ENTRY(native_iret)
-1:     iret
-.section __ex_table,"a"
-       .align 4
-       .long 1b,iret_exc
-.previous
-END(native_iret)
-
-ENTRY(native_irq_enable_sysexit)
-       sti
-       sysexit
-END(native_irq_enable_sysexit)
-#endif
-
-KPROBE_ENTRY(int3)
-       RING0_INT_FRAME
-       pushl $-1                       # mark this as an int
-       CFI_ADJUST_CFA_OFFSET 4
-       SAVE_ALL
-       xorl %edx,%edx          # zero error code
-       movl %esp,%eax          # pt_regs pointer
-       call do_int3
-       jmp ret_from_exception
-       CFI_ENDPROC
-KPROBE_END(int3)
-
-ENTRY(overflow)
-       RING0_INT_FRAME
-       pushl $0
-       CFI_ADJUST_CFA_OFFSET 4
-       pushl $do_overflow
-       CFI_ADJUST_CFA_OFFSET 4
-       jmp error_code
-       CFI_ENDPROC
-END(overflow)
-
-ENTRY(bounds)
-       RING0_INT_FRAME
-       pushl $0
-       CFI_ADJUST_CFA_OFFSET 4
-       pushl $do_bounds
-       CFI_ADJUST_CFA_OFFSET 4
-       jmp error_code
-       CFI_ENDPROC
-END(bounds)
-
-ENTRY(invalid_op)
-       RING0_INT_FRAME
-       pushl $0
-       CFI_ADJUST_CFA_OFFSET 4
-       pushl $do_invalid_op
-       CFI_ADJUST_CFA_OFFSET 4
-       jmp error_code
-       CFI_ENDPROC
-END(invalid_op)
-
-ENTRY(coprocessor_segment_overrun)
-       RING0_INT_FRAME
-       pushl $0
-       CFI_ADJUST_CFA_OFFSET 4
-       pushl $do_coprocessor_segment_overrun
-       CFI_ADJUST_CFA_OFFSET 4
-       jmp error_code
-       CFI_ENDPROC
-END(coprocessor_segment_overrun)
-
-ENTRY(invalid_TSS)
-       RING0_EC_FRAME
-       pushl $do_invalid_TSS
-       CFI_ADJUST_CFA_OFFSET 4
-       jmp error_code
-       CFI_ENDPROC
-END(invalid_TSS)
-
-ENTRY(segment_not_present)
-       RING0_EC_FRAME
-       pushl $do_segment_not_present
-       CFI_ADJUST_CFA_OFFSET 4
-       jmp error_code
-       CFI_ENDPROC
-END(segment_not_present)
-
-ENTRY(stack_segment)
-       RING0_EC_FRAME
-       pushl $do_stack_segment
-       CFI_ADJUST_CFA_OFFSET 4
-       jmp error_code
-       CFI_ENDPROC
-END(stack_segment)
-
-KPROBE_ENTRY(general_protection)
-       RING0_EC_FRAME
-       pushl $do_general_protection
-       CFI_ADJUST_CFA_OFFSET 4
-       jmp error_code
-       CFI_ENDPROC
-KPROBE_END(general_protection)
-
-ENTRY(alignment_check)
-       RING0_EC_FRAME
-       pushl $do_alignment_check
-       CFI_ADJUST_CFA_OFFSET 4
-       jmp error_code
-       CFI_ENDPROC
-END(alignment_check)
-
-ENTRY(divide_error)
-       RING0_INT_FRAME
-       pushl $0                        # no error code
-       CFI_ADJUST_CFA_OFFSET 4
-       pushl $do_divide_error
-       CFI_ADJUST_CFA_OFFSET 4
-       jmp error_code
-       CFI_ENDPROC
-END(divide_error)
-
-#ifdef CONFIG_X86_MCE
-ENTRY(machine_check)
-       RING0_INT_FRAME
-       pushl $0
-       CFI_ADJUST_CFA_OFFSET 4
-       pushl machine_check_vector
-       CFI_ADJUST_CFA_OFFSET 4
-       jmp error_code
-       CFI_ENDPROC
-END(machine_check)
-#endif
-
-ENTRY(spurious_interrupt_bug)
-       RING0_INT_FRAME
-       pushl $0
-       CFI_ADJUST_CFA_OFFSET 4
-       pushl $do_spurious_interrupt_bug
-       CFI_ADJUST_CFA_OFFSET 4
-       jmp error_code
-       CFI_ENDPROC
-END(spurious_interrupt_bug)
-
-ENTRY(kernel_thread_helper)
-       pushl $0                # fake return address for unwinder
-       CFI_STARTPROC
-       movl %edx,%eax
-       push %edx
-       CFI_ADJUST_CFA_OFFSET 4
-       call *%ebx
-       push %eax
-       CFI_ADJUST_CFA_OFFSET 4
-       call do_exit
-       CFI_ENDPROC
-ENDPROC(kernel_thread_helper)
-
-#ifdef CONFIG_XEN
-ENTRY(xen_hypervisor_callback)
-       CFI_STARTPROC
-       pushl $0
-       CFI_ADJUST_CFA_OFFSET 4
-       SAVE_ALL
-       TRACE_IRQS_OFF
-
-       /* Check to see if we got the event in the critical
-          region in xen_iret_direct, after we've reenabled
-          events and checked for pending events.  This simulates
-          iret instruction's behaviour where it delivers a
-          pending interrupt when enabling interrupts. */
-       movl PT_EIP(%esp),%eax
-       cmpl $xen_iret_start_crit,%eax
-       jb   1f
-       cmpl $xen_iret_end_crit,%eax
-       jae  1f
-
-       call xen_iret_crit_fixup
-
-1:     mov %esp, %eax
-       call xen_evtchn_do_upcall
-       jmp  ret_from_intr
-       CFI_ENDPROC
-ENDPROC(xen_hypervisor_callback)
-
-# Hypervisor uses this for application faults while it executes.
-# We get here for two reasons:
-#  1. Fault while reloading DS, ES, FS or GS
-#  2. Fault while executing IRET
-# Category 1 we fix up by reattempting the load, and zeroing the segment
-# register if the load fails.
-# Category 2 we fix up by jumping to do_iret_error. We cannot use the
-# normal Linux return path in this case because if we use the IRET hypercall
-# to pop the stack frame we end up in an infinite loop of failsafe callbacks.
-# We distinguish between categories by maintaining a status value in EAX.
-ENTRY(xen_failsafe_callback)
-       CFI_STARTPROC
-       pushl %eax
-       CFI_ADJUST_CFA_OFFSET 4
-       movl $1,%eax
-1:     mov 4(%esp),%ds
-2:     mov 8(%esp),%es
-3:     mov 12(%esp),%fs
-4:     mov 16(%esp),%gs
-       testl %eax,%eax
-       popl %eax
-       CFI_ADJUST_CFA_OFFSET -4
-       lea 16(%esp),%esp
-       CFI_ADJUST_CFA_OFFSET -16
-       jz 5f
-       addl $16,%esp
-       jmp iret_exc            # EAX != 0 => Category 2 (Bad IRET)
-5:     pushl $0                # EAX == 0 => Category 1 (Bad segment)
-       CFI_ADJUST_CFA_OFFSET 4
-       SAVE_ALL
-       jmp ret_from_exception
-       CFI_ENDPROC
-
-.section .fixup,"ax"
-6:     xorl %eax,%eax
-       movl %eax,4(%esp)
-       jmp 1b
-7:     xorl %eax,%eax
-       movl %eax,8(%esp)
-       jmp 2b
-8:     xorl %eax,%eax
-       movl %eax,12(%esp)
-       jmp 3b
-9:     xorl %eax,%eax
-       movl %eax,16(%esp)
-       jmp 4b
-.previous
-.section __ex_table,"a"
-       .align 4
-       .long 1b,6b
-       .long 2b,7b
-       .long 3b,8b
-       .long 4b,9b
-.previous
-ENDPROC(xen_failsafe_callback)
-
-#endif /* CONFIG_XEN */
-
-.section .rodata,"a"
-#include "syscall_table_32.S"
-
-syscall_table_size=(.-sys_call_table)
diff --git a/arch/i386/kernel/geode_32.c b/arch/i386/kernel/geode_32.c
deleted file mode 100644 (file)
index 41e8aec..0000000
+++ /dev/null
@@ -1,155 +0,0 @@
-/*
- * AMD Geode southbridge support code
- * Copyright (C) 2006, Advanced Micro Devices, Inc.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public License
- * as published by the Free Software Foundation.
- */
-
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/ioport.h>
-#include <linux/io.h>
-#include <asm/msr.h>
-#include <asm/geode.h>
-
-static struct {
-       char *name;
-       u32 msr;
-       int size;
-       u32 base;
-} lbars[] = {
-       { "geode-pms",   MSR_LBAR_PMS, LBAR_PMS_SIZE, 0 },
-       { "geode-acpi",  MSR_LBAR_ACPI, LBAR_ACPI_SIZE, 0 },
-       { "geode-gpio",  MSR_LBAR_GPIO, LBAR_GPIO_SIZE, 0 },
-       { "geode-mfgpt", MSR_LBAR_MFGPT, LBAR_MFGPT_SIZE, 0 }
-};
-
-static void __init init_lbars(void)
-{
-       u32 lo, hi;
-       int i;
-
-       for (i = 0; i < ARRAY_SIZE(lbars); i++) {
-               rdmsr(lbars[i].msr, lo, hi);
-               if (hi & 0x01)
-                       lbars[i].base = lo & 0x0000ffff;
-
-               if (lbars[i].base == 0)
-                       printk(KERN_ERR "geode:  Couldn't initialize '%s'\n",
-                                       lbars[i].name);
-       }
-}
-
-int geode_get_dev_base(unsigned int dev)
-{
-       BUG_ON(dev >= ARRAY_SIZE(lbars));
-       return lbars[dev].base;
-}
-EXPORT_SYMBOL_GPL(geode_get_dev_base);
-
-/* === GPIO API === */
-
-void geode_gpio_set(unsigned int gpio, unsigned int reg)
-{
-       u32 base = geode_get_dev_base(GEODE_DEV_GPIO);
-
-       if (!base)
-               return;
-
-       if (gpio < 16)
-               outl(1 << gpio, base + reg);
-       else
-               outl(1 << (gpio - 16), base + 0x80 + reg);
-}
-EXPORT_SYMBOL_GPL(geode_gpio_set);
-
-void geode_gpio_clear(unsigned int gpio, unsigned int reg)
-{
-       u32 base = geode_get_dev_base(GEODE_DEV_GPIO);
-
-       if (!base)
-               return;
-
-       if (gpio < 16)
-               outl(1 << (gpio + 16), base + reg);
-       else
-               outl(1 << gpio, base + 0x80 + reg);
-}
-EXPORT_SYMBOL_GPL(geode_gpio_clear);
-
-int geode_gpio_isset(unsigned int gpio, unsigned int reg)
-{
-       u32 base = geode_get_dev_base(GEODE_DEV_GPIO);
-
-       if (!base)
-               return 0;
-
-       if (gpio < 16)
-               return (inl(base + reg) & (1 << gpio)) ? 1 : 0;
-       else
-               return (inl(base + 0x80 + reg) & (1 << (gpio - 16))) ? 1 : 0;
-}
-EXPORT_SYMBOL_GPL(geode_gpio_isset);
-
-void geode_gpio_set_irq(unsigned int group, unsigned int irq)
-{
-       u32 lo, hi;
-
-       if (group > 7 || irq > 15)
-               return;
-
-       rdmsr(MSR_PIC_ZSEL_HIGH, lo, hi);
-
-       lo &= ~(0xF << (group * 4));
-       lo |= (irq & 0xF) << (group * 4);
-
-       wrmsr(MSR_PIC_ZSEL_HIGH, lo, hi);
-}
-EXPORT_SYMBOL_GPL(geode_gpio_set_irq);
-
-void geode_gpio_setup_event(unsigned int gpio, int pair, int pme)
-{
-       u32 base = geode_get_dev_base(GEODE_DEV_GPIO);
-       u32 offset, shift, val;
-
-       if (gpio >= 24)
-               offset = GPIO_MAP_W;
-       else if (gpio >= 16)
-               offset = GPIO_MAP_Z;
-       else if (gpio >= 8)
-               offset = GPIO_MAP_Y;
-       else
-               offset = GPIO_MAP_X;
-
-       shift = (gpio % 8) * 4;
-
-       val = inl(base + offset);
-
-       /* Clear whatever was there before */
-       val &= ~(0xF << shift);
-
-       /* And set the new value */
-
-       val |= ((pair & 7) << shift);
-
-       /* Set the PME bit if this is a PME event */
-
-       if (pme)
-               val |= (1 << (shift + 3));
-
-       outl(val, base + offset);
-}
-EXPORT_SYMBOL_GPL(geode_gpio_setup_event);
-
-static int __init geode_southbridge_init(void)
-{
-       if (!is_geode())
-               return -ENODEV;
-
-       init_lbars();
-       return 0;
-}
-
-postcore_initcall(geode_southbridge_init);
diff --git a/arch/i386/kernel/head_32.S b/arch/i386/kernel/head_32.S
deleted file mode 100644 (file)
index 9150ca9..0000000
+++ /dev/null
@@ -1,578 +0,0 @@
-/*
- *  linux/arch/i386/kernel/head.S -- the 32-bit startup code.
- *
- *  Copyright (C) 1991, 1992  Linus Torvalds
- *
- *  Enhanced CPU detection and feature setting code by Mike Jagdis
- *  and Martin Mares, November 1997.
- */
-
-.text
-#include <linux/threads.h>
-#include <linux/linkage.h>
-#include <asm/segment.h>
-#include <asm/page.h>
-#include <asm/pgtable.h>
-#include <asm/desc.h>
-#include <asm/cache.h>
-#include <asm/thread_info.h>
-#include <asm/asm-offsets.h>
-#include <asm/setup.h>
-
-/*
- * References to members of the new_cpu_data structure.
- */
-
-#define X86            new_cpu_data+CPUINFO_x86
-#define X86_VENDOR     new_cpu_data+CPUINFO_x86_vendor
-#define X86_MODEL      new_cpu_data+CPUINFO_x86_model
-#define X86_MASK       new_cpu_data+CPUINFO_x86_mask
-#define X86_HARD_MATH  new_cpu_data+CPUINFO_hard_math
-#define X86_CPUID      new_cpu_data+CPUINFO_cpuid_level
-#define X86_CAPABILITY new_cpu_data+CPUINFO_x86_capability
-#define X86_VENDOR_ID  new_cpu_data+CPUINFO_x86_vendor_id
-
-/*
- * This is how much memory *in addition to the memory covered up to
- * and including _end* we need mapped initially.
- * We need:
- *  - one bit for each possible page, but only in low memory, which means
- *     2^32/4096/8 = 128K worst case (4G/4G split.)
- *  - enough space to map all low memory, which means
- *     (2^32/4096) / 1024 pages (worst case, non PAE)
- *     (2^32/4096) / 512 + 4 pages (worst case for PAE)
- *  - a few pages for allocator use before the kernel pagetable has
- *     been set up
- *
- * Modulo rounding, each megabyte assigned here requires a kilobyte of
- * memory, which is currently unreclaimed.
- *
- * This should be a multiple of a page.
- */
-LOW_PAGES = 1<<(32-PAGE_SHIFT_asm)
-
-#if PTRS_PER_PMD > 1
-PAGE_TABLE_SIZE = (LOW_PAGES / PTRS_PER_PMD) + PTRS_PER_PGD
-#else
-PAGE_TABLE_SIZE = (LOW_PAGES / PTRS_PER_PGD)
-#endif
-BOOTBITMAP_SIZE = LOW_PAGES / 8
-ALLOCATOR_SLOP = 4
-
-INIT_MAP_BEYOND_END = BOOTBITMAP_SIZE + (PAGE_TABLE_SIZE + ALLOCATOR_SLOP)*PAGE_SIZE_asm
-
-/*
- * 32-bit kernel entrypoint; only used by the boot CPU.  On entry,
- * %esi points to the real-mode code as a 32-bit pointer.
- * CS and DS must be 4 GB flat segments, but we don't depend on
- * any particular GDT layout, because we load our own as soon as we
- * can.
- */
-.section .text.head,"ax",@progbits
-ENTRY(startup_32)
-
-/*
- * Set segments to known values.
- */
-       cld
-       lgdt boot_gdt_descr - __PAGE_OFFSET
-       movl $(__BOOT_DS),%eax
-       movl %eax,%ds
-       movl %eax,%es
-       movl %eax,%fs
-       movl %eax,%gs
-
-/*
- * Clear BSS first so that there are no surprises...
- * No need to cld as DF is already clear from cld above...
- */
-       xorl %eax,%eax
-       movl $__bss_start - __PAGE_OFFSET,%edi
-       movl $__bss_stop - __PAGE_OFFSET,%ecx
-       subl %edi,%ecx
-       shrl $2,%ecx
-       rep ; stosl
-/*
- * Copy bootup parameters out of the way.
- * Note: %esi still has the pointer to the real-mode data.
- * With the kexec as boot loader, parameter segment might be loaded beyond
- * kernel image and might not even be addressable by early boot page tables.
- * (kexec on panic case). Hence copy out the parameters before initializing
- * page tables.
- */
-       movl $(boot_params - __PAGE_OFFSET),%edi
-       movl $(PARAM_SIZE/4),%ecx
-       cld
-       rep
-       movsl
-       movl boot_params - __PAGE_OFFSET + NEW_CL_POINTER,%esi
-       andl %esi,%esi
-       jnz 2f                  # New command line protocol
-       cmpw $(OLD_CL_MAGIC),OLD_CL_MAGIC_ADDR
-       jne 1f
-       movzwl OLD_CL_OFFSET,%esi
-       addl $(OLD_CL_BASE_ADDR),%esi
-2:
-       movl $(boot_command_line - __PAGE_OFFSET),%edi
-       movl $(COMMAND_LINE_SIZE/4),%ecx
-       rep
-       movsl
-1:
-
-/*
- * Initialize page tables.  This creates a PDE and a set of page
- * tables, which are located immediately beyond _end.  The variable
- * init_pg_tables_end is set up to point to the first "safe" location.
- * Mappings are created both at virtual address 0 (identity mapping)
- * and PAGE_OFFSET for up to _end+sizeof(page tables)+INIT_MAP_BEYOND_END.
- *
- * Warning: don't use %esi or the stack in this code.  However, %esp
- * can be used as a GPR if you really need it...
- */
-page_pde_offset = (__PAGE_OFFSET >> 20);
-
-       movl $(pg0 - __PAGE_OFFSET), %edi
-       movl $(swapper_pg_dir - __PAGE_OFFSET), %edx
-       movl $0x007, %eax                       /* 0x007 = PRESENT+RW+USER */
-10:
-       leal 0x007(%edi),%ecx                   /* Create PDE entry */
-       movl %ecx,(%edx)                        /* Store identity PDE entry */
-       movl %ecx,page_pde_offset(%edx)         /* Store kernel PDE entry */
-       addl $4,%edx
-       movl $1024, %ecx
-11:
-       stosl
-       addl $0x1000,%eax
-       loop 11b
-       /* End condition: we must map up to and including INIT_MAP_BEYOND_END */
-       /* bytes beyond the end of our own page tables; the +0x007 is the attribute bits */
-       leal (INIT_MAP_BEYOND_END+0x007)(%edi),%ebp
-       cmpl %ebp,%eax
-       jb 10b
-       movl %edi,(init_pg_tables_end - __PAGE_OFFSET)
-
-       xorl %ebx,%ebx                          /* This is the boot CPU (BSP) */
-       jmp 3f
-/*
- * Non-boot CPU entry point; entered from trampoline.S
- * We can't lgdt here, because lgdt itself uses a data segment, but
- * we know the trampoline has already loaded the boot_gdt for us.
- *
- * If cpu hotplug is not supported then this code can go in init section
- * which will be freed later
- */
-
-#ifndef CONFIG_HOTPLUG_CPU
-.section .init.text,"ax",@progbits
-#endif
-
-       /* Do an early initialization of the fixmap area */
-       movl $(swapper_pg_dir - __PAGE_OFFSET), %edx
-       movl $(swapper_pg_pmd - __PAGE_OFFSET), %eax
-       addl $0x007, %eax                       /* 0x007 = PRESENT+RW+USER */
-       movl %eax, 4092(%edx)
-
-#ifdef CONFIG_SMP
-ENTRY(startup_32_smp)
-       cld
-       movl $(__BOOT_DS),%eax
-       movl %eax,%ds
-       movl %eax,%es
-       movl %eax,%fs
-       movl %eax,%gs
-
-/*
- *     New page tables may be in 4Mbyte page mode and may
- *     be using the global pages. 
- *
- *     NOTE! If we are on a 486 we may have no cr4 at all!
- *     So we do not try to touch it unless we really have
- *     some bits in it to set.  This won't work if the BSP
- *     implements cr4 but this AP does not -- very unlikely
- *     but be warned!  The same applies to the pse feature
- *     if not equally supported. --macro
- *
- *     NOTE! We have to correct for the fact that we're
- *     not yet offset PAGE_OFFSET..
- */
-#define cr4_bits mmu_cr4_features-__PAGE_OFFSET
-       movl cr4_bits,%edx
-       andl %edx,%edx
-       jz 6f
-       movl %cr4,%eax          # Turn on paging options (PSE,PAE,..)
-       orl %edx,%eax
-       movl %eax,%cr4
-
-       btl $5, %eax            # check if PAE is enabled
-       jnc 6f
-
-       /* Check if extended functions are implemented */
-       movl $0x80000000, %eax
-       cpuid
-       cmpl $0x80000000, %eax
-       jbe 6f
-       mov $0x80000001, %eax
-       cpuid
-       /* Execute Disable bit supported? */
-       btl $20, %edx
-       jnc 6f
-
-       /* Setup EFER (Extended Feature Enable Register) */
-       movl $0xc0000080, %ecx
-       rdmsr
-
-       btsl $11, %eax
-       /* Make changes effective */
-       wrmsr
-
-6:
-       /* This is a secondary processor (AP) */
-       xorl %ebx,%ebx
-       incl %ebx
-
-#endif /* CONFIG_SMP */
-3:
-
-/*
- * Enable paging
- */
-       movl $swapper_pg_dir-__PAGE_OFFSET,%eax
-       movl %eax,%cr3          /* set the page table pointer.. */
-       movl %cr0,%eax
-       orl $0x80000000,%eax
-       movl %eax,%cr0          /* ..and set paging (PG) bit */
-       ljmp $__BOOT_CS,$1f     /* Clear prefetch and normalize %eip */
-1:
-       /* Set up the stack pointer */
-       lss stack_start,%esp
-
-/*
- * Initialize eflags.  Some BIOS's leave bits like NT set.  This would
- * confuse the debugger if this code is traced.
- * XXX - best to initialize before switching to protected mode.
- */
-       pushl $0
-       popfl
-
-#ifdef CONFIG_SMP
-       andl %ebx,%ebx
-       jz  1f                          /* Initial CPU cleans BSS */
-       jmp checkCPUtype
-1:
-#endif /* CONFIG_SMP */
-
-/*
- * start system 32-bit setup. We need to re-do some of the things done
- * in 16-bit mode for the "real" operations.
- */
-       call setup_idt
-
-checkCPUtype:
-
-       movl $-1,X86_CPUID              #  -1 for no CPUID initially
-
-/* check if it is 486 or 386. */
-/*
- * XXX - this does a lot of unnecessary setup.  Alignment checks don't
- * apply at our cpl of 0 and the stack ought to be aligned already, and
- * we don't need to preserve eflags.
- */
-
-       movb $3,X86             # at least 386
-       pushfl                  # push EFLAGS
-       popl %eax               # get EFLAGS
-       movl %eax,%ecx          # save original EFLAGS
-       xorl $0x240000,%eax     # flip AC and ID bits in EFLAGS
-       pushl %eax              # copy to EFLAGS
-       popfl                   # set EFLAGS
-       pushfl                  # get new EFLAGS
-       popl %eax               # put it in eax
-       xorl %ecx,%eax          # change in flags
-       pushl %ecx              # restore original EFLAGS
-       popfl
-       testl $0x40000,%eax     # check if AC bit changed
-       je is386
-
-       movb $4,X86             # at least 486
-       testl $0x200000,%eax    # check if ID bit changed
-       je is486
-
-       /* get vendor info */
-       xorl %eax,%eax                  # call CPUID with 0 -> return vendor ID
-       cpuid
-       movl %eax,X86_CPUID             # save CPUID level
-       movl %ebx,X86_VENDOR_ID         # lo 4 chars
-       movl %edx,X86_VENDOR_ID+4       # next 4 chars
-       movl %ecx,X86_VENDOR_ID+8       # last 4 chars
-
-       orl %eax,%eax                   # do we have processor info as well?
-       je is486
-
-       movl $1,%eax            # Use the CPUID instruction to get CPU type
-       cpuid
-       movb %al,%cl            # save reg for future use
-       andb $0x0f,%ah          # mask processor family
-       movb %ah,X86
-       andb $0xf0,%al          # mask model
-       shrb $4,%al
-       movb %al,X86_MODEL
-       andb $0x0f,%cl          # mask mask revision
-       movb %cl,X86_MASK
-       movl %edx,X86_CAPABILITY
-
-is486: movl $0x50022,%ecx      # set AM, WP, NE and MP
-       jmp 2f
-
-is386: movl $2,%ecx            # set MP
-2:     movl %cr0,%eax
-       andl $0x80000011,%eax   # Save PG,PE,ET
-       orl %ecx,%eax
-       movl %eax,%cr0
-
-       call check_x87
-       lgdt early_gdt_descr
-       lidt idt_descr
-       ljmp $(__KERNEL_CS),$1f
-1:     movl $(__KERNEL_DS),%eax        # reload all the segment registers
-       movl %eax,%ss                   # after changing gdt.
-       movl %eax,%fs                   # gets reset once there's real percpu
-
-       movl $(__USER_DS),%eax          # DS/ES contains default USER segment
-       movl %eax,%ds
-       movl %eax,%es
-
-       xorl %eax,%eax                  # Clear GS and LDT
-       movl %eax,%gs
-       lldt %ax
-
-       cld                     # gcc2 wants the direction flag cleared at all times
-       pushl $0                # fake return address for unwinder
-#ifdef CONFIG_SMP
-       movb ready, %cl
-       movb $1, ready
-       cmpb $0,%cl             # the first CPU calls start_kernel
-       je   1f
-       movl $(__KERNEL_PERCPU), %eax
-       movl %eax,%fs           # set this cpu's percpu
-       jmp initialize_secondary # all other CPUs call initialize_secondary
-1:
-#endif /* CONFIG_SMP */
-       jmp start_kernel
-
-/*
- * We depend on ET to be correct. This checks for 287/387.
- */
-check_x87:
-       movb $0,X86_HARD_MATH
-       clts
-       fninit
-       fstsw %ax
-       cmpb $0,%al
-       je 1f
-       movl %cr0,%eax          /* no coprocessor: have to set bits */
-       xorl $4,%eax            /* set EM */
-       movl %eax,%cr0
-       ret
-       ALIGN
-1:     movb $1,X86_HARD_MATH
-       .byte 0xDB,0xE4         /* fsetpm for 287, ignored by 387 */
-       ret
-
-/*
- *  setup_idt
- *
- *  sets up a idt with 256 entries pointing to
- *  ignore_int, interrupt gates. It doesn't actually load
- *  idt - that can be done only after paging has been enabled
- *  and the kernel moved to PAGE_OFFSET. Interrupts
- *  are enabled elsewhere, when we can be relatively
- *  sure everything is ok.
- *
- *  Warning: %esi is live across this function.
- */
-setup_idt:
-       lea ignore_int,%edx
-       movl $(__KERNEL_CS << 16),%eax
-       movw %dx,%ax            /* selector = 0x0010 = cs */
-       movw $0x8E00,%dx        /* interrupt gate - dpl=0, present */
-
-       lea idt_table,%edi
-       mov $256,%ecx
-rp_sidt:
-       movl %eax,(%edi)
-       movl %edx,4(%edi)
-       addl $8,%edi
-       dec %ecx
-       jne rp_sidt
-
-.macro set_early_handler handler,trapno
-       lea \handler,%edx
-       movl $(__KERNEL_CS << 16),%eax
-       movw %dx,%ax
-       movw $0x8E00,%dx        /* interrupt gate - dpl=0, present */
-       lea idt_table,%edi
-       movl %eax,8*\trapno(%edi)
-       movl %edx,8*\trapno+4(%edi)
-.endm
-
-       set_early_handler handler=early_divide_err,trapno=0
-       set_early_handler handler=early_illegal_opcode,trapno=6
-       set_early_handler handler=early_protection_fault,trapno=13
-       set_early_handler handler=early_page_fault,trapno=14
-
-       ret
-
-early_divide_err:
-       xor %edx,%edx
-       pushl $0        /* fake errcode */
-       jmp early_fault
-
-early_illegal_opcode:
-       movl $6,%edx
-       pushl $0        /* fake errcode */
-       jmp early_fault
-
-early_protection_fault:
-       movl $13,%edx
-       jmp early_fault
-
-early_page_fault:
-       movl $14,%edx
-       jmp early_fault
-
-early_fault:
-       cld
-#ifdef CONFIG_PRINTK
-       movl $(__KERNEL_DS),%eax
-       movl %eax,%ds
-       movl %eax,%es
-       cmpl $2,early_recursion_flag
-       je hlt_loop
-       incl early_recursion_flag
-       movl %cr2,%eax
-       pushl %eax
-       pushl %edx              /* trapno */
-       pushl $fault_msg
-#ifdef CONFIG_EARLY_PRINTK
-       call early_printk
-#else
-       call printk
-#endif
-#endif
-hlt_loop:
-       hlt
-       jmp hlt_loop
-
-/* This is the default interrupt "handler" :-) */
-       ALIGN
-ignore_int:
-       cld
-#ifdef CONFIG_PRINTK
-       pushl %eax
-       pushl %ecx
-       pushl %edx
-       pushl %es
-       pushl %ds
-       movl $(__KERNEL_DS),%eax
-       movl %eax,%ds
-       movl %eax,%es
-       cmpl $2,early_recursion_flag
-       je hlt_loop
-       incl early_recursion_flag
-       pushl 16(%esp)
-       pushl 24(%esp)
-       pushl 32(%esp)
-       pushl 40(%esp)
-       pushl $int_msg
-#ifdef CONFIG_EARLY_PRINTK
-       call early_printk
-#else
-       call printk
-#endif
-       addl $(5*4),%esp
-       popl %ds
-       popl %es
-       popl %edx
-       popl %ecx
-       popl %eax
-#endif
-       iret
-
-.section .text
-/*
- * Real beginning of normal "text" segment
- */
-ENTRY(stext)
-ENTRY(_stext)
-
-/*
- * BSS section
- */
-.section ".bss.page_aligned","wa"
-       .align PAGE_SIZE_asm
-ENTRY(swapper_pg_dir)
-       .fill 1024,4,0
-ENTRY(swapper_pg_pmd)
-       .fill 1024,4,0
-ENTRY(empty_zero_page)
-       .fill 4096,1,0
-
-/*
- * This starts the data section.
- */
-.data
-ENTRY(stack_start)
-       .long init_thread_union+THREAD_SIZE
-       .long __BOOT_DS
-
-ready: .byte 0
-
-early_recursion_flag:
-       .long 0
-
-int_msg:
-       .asciz "Unknown interrupt or fault at EIP %p %p %p\n"
-
-fault_msg:
-       .ascii "Int %d: CR2 %p  err %p  EIP %p  CS %p  flags %p\n"
-       .asciz "Stack: %p %p %p %p %p %p %p %p\n"
-
-#include "../../x86/xen/xen-head.S"
-
-/*
- * The IDT and GDT 'descriptors' are a strange 48-bit object
- * only used by the lidt and lgdt instructions. They are not
- * like usual segment descriptors - they consist of a 16-bit
- * segment size, and 32-bit linear address value:
- */
-
-.globl boot_gdt_descr
-.globl idt_descr
-
-       ALIGN
-# early boot GDT descriptor (must use 1:1 address mapping)
-       .word 0                         # 32 bit align gdt_desc.address
-boot_gdt_descr:
-       .word __BOOT_DS+7
-       .long boot_gdt - __PAGE_OFFSET
-
-       .word 0                         # 32-bit align idt_desc.address
-idt_descr:
-       .word IDT_ENTRIES*8-1           # idt contains 256 entries
-       .long idt_table
-
-# boot GDT descriptor (later on used by CPU#0):
-       .word 0                         # 32 bit align gdt_desc.address
-ENTRY(early_gdt_descr)
-       .word GDT_ENTRIES*8-1
-       .long per_cpu__gdt_page         /* Overwritten for secondary CPUs */
-
-/*
- * The boot_gdt must mirror the equivalent in setup.S and is
- * used only for booting.
- */
-       .align L1_CACHE_BYTES
-ENTRY(boot_gdt)
-       .fill GDT_ENTRY_BOOT_CS,8,0
-       .quad 0x00cf9a000000ffff        /* kernel 4GB code at 0x00000000 */
-       .quad 0x00cf92000000ffff        /* kernel 4GB data at 0x00000000 */
diff --git a/arch/i386/kernel/hpet_32.c b/arch/i386/kernel/hpet_32.c
deleted file mode 100644 (file)
index 533d493..0000000
+++ /dev/null
@@ -1,553 +0,0 @@
-#include <linux/clocksource.h>
-#include <linux/clockchips.h>
-#include <linux/errno.h>
-#include <linux/hpet.h>
-#include <linux/init.h>
-#include <linux/sysdev.h>
-#include <linux/pm.h>
-#include <linux/delay.h>
-
-#include <asm/hpet.h>
-#include <asm/io.h>
-
-extern struct clock_event_device *global_clock_event;
-
-#define HPET_MASK      CLOCKSOURCE_MASK(32)
-#define HPET_SHIFT     22
-
-/* FSEC = 10^-15 NSEC = 10^-9 */
-#define FSEC_PER_NSEC  1000000
-
-/*
- * HPET address is set in acpi/boot.c, when an ACPI entry exists
- */
-unsigned long hpet_address;
-static void __iomem * hpet_virt_address;
-
-static inline unsigned long hpet_readl(unsigned long a)
-{
-       return readl(hpet_virt_address + a);
-}
-
-static inline void hpet_writel(unsigned long d, unsigned long a)
-{
-       writel(d, hpet_virt_address + a);
-}
-
-/*
- * HPET command line enable / disable
- */
-static int boot_hpet_disable;
-
-static int __init hpet_setup(char* str)
-{
-       if (str) {
-               if (!strncmp("disable", str, 7))
-                       boot_hpet_disable = 1;
-       }
-       return 1;
-}
-__setup("hpet=", hpet_setup);
-
-static inline int is_hpet_capable(void)
-{
-       return (!boot_hpet_disable && hpet_address);
-}
-
-/*
- * HPET timer interrupt enable / disable
- */
-static int hpet_legacy_int_enabled;
-
-/**
- * is_hpet_enabled - check whether the hpet timer interrupt is enabled
- */
-int is_hpet_enabled(void)
-{
-       return is_hpet_capable() && hpet_legacy_int_enabled;
-}
-
-/*
- * When the hpet driver (/dev/hpet) is enabled, we need to reserve
- * timer 0 and timer 1 in case of RTC emulation.
- */
-#ifdef CONFIG_HPET
-static void hpet_reserve_platform_timers(unsigned long id)
-{
-       struct hpet __iomem *hpet = hpet_virt_address;
-       struct hpet_timer __iomem *timer = &hpet->hpet_timers[2];
-       unsigned int nrtimers, i;
-       struct hpet_data hd;
-
-       nrtimers = ((id & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT) + 1;
-
-       memset(&hd, 0, sizeof (hd));
-       hd.hd_phys_address = hpet_address;
-       hd.hd_address = hpet_virt_address;
-       hd.hd_nirqs = nrtimers;
-       hd.hd_flags = HPET_DATA_PLATFORM;
-       hpet_reserve_timer(&hd, 0);
-
-#ifdef CONFIG_HPET_EMULATE_RTC
-       hpet_reserve_timer(&hd, 1);
-#endif
-
-       hd.hd_irq[0] = HPET_LEGACY_8254;
-       hd.hd_irq[1] = HPET_LEGACY_RTC;
-
-       for (i = 2; i < nrtimers; timer++, i++)
-               hd.hd_irq[i] = (timer->hpet_config & Tn_INT_ROUTE_CNF_MASK) >>
-                       Tn_INT_ROUTE_CNF_SHIFT;
-
-       hpet_alloc(&hd);
-
-}
-#else
-static void hpet_reserve_platform_timers(unsigned long id) { }
-#endif
-
-/*
- * Common hpet info
- */
-static unsigned long hpet_period;
-
-static void hpet_set_mode(enum clock_event_mode mode,
-                         struct clock_event_device *evt);
-static int hpet_next_event(unsigned long delta,
-                          struct clock_event_device *evt);
-
-/*
- * The hpet clock event device
- */
-static struct clock_event_device hpet_clockevent = {
-       .name           = "hpet",
-       .features       = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
-       .set_mode       = hpet_set_mode,
-       .set_next_event = hpet_next_event,
-       .shift          = 32,
-       .irq            = 0,
-};
-
-static void hpet_start_counter(void)
-{
-       unsigned long cfg = hpet_readl(HPET_CFG);
-
-       cfg &= ~HPET_CFG_ENABLE;
-       hpet_writel(cfg, HPET_CFG);
-       hpet_writel(0, HPET_COUNTER);
-       hpet_writel(0, HPET_COUNTER + 4);
-       cfg |= HPET_CFG_ENABLE;
-       hpet_writel(cfg, HPET_CFG);
-}
-
-static void hpet_enable_int(void)
-{
-       unsigned long cfg = hpet_readl(HPET_CFG);
-
-       cfg |= HPET_CFG_LEGACY;
-       hpet_writel(cfg, HPET_CFG);
-       hpet_legacy_int_enabled = 1;
-}
-
-static void hpet_set_mode(enum clock_event_mode mode,
-                         struct clock_event_device *evt)
-{
-       unsigned long cfg, cmp, now;
-       uint64_t delta;
-
-       switch(mode) {
-       case CLOCK_EVT_MODE_PERIODIC:
-               delta = ((uint64_t)(NSEC_PER_SEC/HZ)) * hpet_clockevent.mult;
-               delta >>= hpet_clockevent.shift;
-               now = hpet_readl(HPET_COUNTER);
-               cmp = now + (unsigned long) delta;
-               cfg = hpet_readl(HPET_T0_CFG);
-               cfg |= HPET_TN_ENABLE | HPET_TN_PERIODIC |
-                      HPET_TN_SETVAL | HPET_TN_32BIT;
-               hpet_writel(cfg, HPET_T0_CFG);
-               /*
-                * The first write after writing TN_SETVAL to the
-                * config register sets the counter value, the second
-                * write sets the period.
-                */
-               hpet_writel(cmp, HPET_T0_CMP);
-               udelay(1);
-               hpet_writel((unsigned long) delta, HPET_T0_CMP);
-               break;
-
-       case CLOCK_EVT_MODE_ONESHOT:
-               cfg = hpet_readl(HPET_T0_CFG);
-               cfg &= ~HPET_TN_PERIODIC;
-               cfg |= HPET_TN_ENABLE | HPET_TN_32BIT;
-               hpet_writel(cfg, HPET_T0_CFG);
-               break;
-
-       case CLOCK_EVT_MODE_UNUSED:
-       case CLOCK_EVT_MODE_SHUTDOWN:
-               cfg = hpet_readl(HPET_T0_CFG);
-               cfg &= ~HPET_TN_ENABLE;
-               hpet_writel(cfg, HPET_T0_CFG);
-               break;
-
-       case CLOCK_EVT_MODE_RESUME:
-               hpet_enable_int();
-               break;
-       }
-}
-
-static int hpet_next_event(unsigned long delta,
-                          struct clock_event_device *evt)
-{
-       unsigned long cnt;
-
-       cnt = hpet_readl(HPET_COUNTER);
-       cnt += delta;
-       hpet_writel(cnt, HPET_T0_CMP);
-
-       return ((long)(hpet_readl(HPET_COUNTER) - cnt ) > 0) ? -ETIME : 0;
-}
-
-/*
- * Clock source related code
- */
-static cycle_t read_hpet(void)
-{
-       return (cycle_t)hpet_readl(HPET_COUNTER);
-}
-
-static struct clocksource clocksource_hpet = {
-       .name           = "hpet",
-       .rating         = 250,
-       .read           = read_hpet,
-       .mask           = HPET_MASK,
-       .shift          = HPET_SHIFT,
-       .flags          = CLOCK_SOURCE_IS_CONTINUOUS,
-       .resume         = hpet_start_counter,
-};
-
-/*
- * Try to setup the HPET timer
- */
-int __init hpet_enable(void)
-{
-       unsigned long id;
-       uint64_t hpet_freq;
-       u64 tmp, start, now;
-       cycle_t t1;
-
-       if (!is_hpet_capable())
-               return 0;
-
-       hpet_virt_address = ioremap_nocache(hpet_address, HPET_MMAP_SIZE);
-
-       /*
-        * Read the period and check for a sane value:
-        */
-       hpet_period = hpet_readl(HPET_PERIOD);
-       if (hpet_period < HPET_MIN_PERIOD || hpet_period > HPET_MAX_PERIOD)
-               goto out_nohpet;
-
-       /*
-        * The period is a femto seconds value. We need to calculate the
-        * scaled math multiplication factor for nanosecond to hpet tick
-        * conversion.
-        */
-       hpet_freq = 1000000000000000ULL;
-       do_div(hpet_freq, hpet_period);
-       hpet_clockevent.mult = div_sc((unsigned long) hpet_freq,
-                                     NSEC_PER_SEC, 32);
-       /* Calculate the min / max delta */
-       hpet_clockevent.max_delta_ns = clockevent_delta2ns(0x7FFFFFFF,
-                                                          &hpet_clockevent);
-       hpet_clockevent.min_delta_ns = clockevent_delta2ns(0x30,
-                                                          &hpet_clockevent);
-
-       /*
-        * Read the HPET ID register to retrieve the IRQ routing
-        * information and the number of channels
-        */
-       id = hpet_readl(HPET_ID);
-
-#ifdef CONFIG_HPET_EMULATE_RTC
-       /*
-        * The legacy routing mode needs at least two channels, tick timer
-        * and the rtc emulation channel.
-        */
-       if (!(id & HPET_ID_NUMBER))
-               goto out_nohpet;
-#endif
-
-       /* Start the counter */
-       hpet_start_counter();
-
-       /* Verify whether hpet counter works */
-       t1 = read_hpet();
-       rdtscll(start);
-
-       /*
-        * We don't know the TSC frequency yet, but waiting for
-        * 200000 TSC cycles is safe:
-        * 4 GHz == 50us
-        * 1 GHz == 200us
-        */
-       do {
-               rep_nop();
-               rdtscll(now);
-       } while ((now - start) < 200000UL);
-
-       if (t1 == read_hpet()) {
-               printk(KERN_WARNING
-                      "HPET counter not counting. HPET disabled\n");
-               goto out_nohpet;
-       }
-
-       /* Initialize and register HPET clocksource
-        *
-        * hpet period is in femto seconds per cycle
-        * so we need to convert this to ns/cyc units
-        * aproximated by mult/2^shift
-        *
-        *  fsec/cyc * 1nsec/1000000fsec = nsec/cyc = mult/2^shift
-        *  fsec/cyc * 1ns/1000000fsec * 2^shift = mult
-        *  fsec/cyc * 2^shift * 1nsec/1000000fsec = mult
-        *  (fsec/cyc << shift)/1000000 = mult
-        *  (hpet_period << shift)/FSEC_PER_NSEC = mult
-        */
-       tmp = (u64)hpet_period << HPET_SHIFT;
-       do_div(tmp, FSEC_PER_NSEC);
-       clocksource_hpet.mult = (u32)tmp;
-
-       clocksource_register(&clocksource_hpet);
-
-       if (id & HPET_ID_LEGSUP) {
-               hpet_enable_int();
-               hpet_reserve_platform_timers(id);
-               /*
-                * Start hpet with the boot cpu mask and make it
-                * global after the IO_APIC has been initialized.
-                */
-               hpet_clockevent.cpumask = cpumask_of_cpu(smp_processor_id());
-               clockevents_register_device(&hpet_clockevent);
-               global_clock_event = &hpet_clockevent;
-               return 1;
-       }
-       return 0;
-
-out_nohpet:
-       iounmap(hpet_virt_address);
-       hpet_virt_address = NULL;
-       boot_hpet_disable = 1;
-       return 0;
-}
-
-
-#ifdef CONFIG_HPET_EMULATE_RTC
-
-/* HPET in LegacyReplacement Mode eats up RTC interrupt line. When, HPET
- * is enabled, we support RTC interrupt functionality in software.
- * RTC has 3 kinds of interrupts:
- * 1) Update Interrupt - generate an interrupt, every sec, when RTC clock
- *    is updated
- * 2) Alarm Interrupt - generate an interrupt at a specific time of day
- * 3) Periodic Interrupt - generate periodic interrupt, with frequencies
- *    2Hz-8192Hz (2Hz-64Hz for non-root user) (all freqs in powers of 2)
- * (1) and (2) above are implemented using polling at a frequency of
- * 64 Hz. The exact frequency is a tradeoff between accuracy and interrupt
- * overhead. (DEFAULT_RTC_INT_FREQ)
- * For (3), we use interrupts at 64Hz or user specified periodic
- * frequency, whichever is higher.
- */
-#include <linux/mc146818rtc.h>
-#include <linux/rtc.h>
-
-#define DEFAULT_RTC_INT_FREQ   64
-#define DEFAULT_RTC_SHIFT      6
-#define RTC_NUM_INTS           1
-
-static unsigned long hpet_rtc_flags;
-static unsigned long hpet_prev_update_sec;
-static struct rtc_time hpet_alarm_time;
-static unsigned long hpet_pie_count;
-static unsigned long hpet_t1_cmp;
-static unsigned long hpet_default_delta;
-static unsigned long hpet_pie_delta;
-static unsigned long hpet_pie_limit;
-
-/*
- * Timer 1 for RTC emulation. We use one shot mode, as periodic mode
- * is not supported by all HPET implementations for timer 1.
- *
- * hpet_rtc_timer_init() is called when the rtc is initialized.
- */
-int hpet_rtc_timer_init(void)
-{
-       unsigned long cfg, cnt, delta, flags;
-
-       if (!is_hpet_enabled())
-               return 0;
-
-       if (!hpet_default_delta) {
-               uint64_t clc;
-
-               clc = (uint64_t) hpet_clockevent.mult * NSEC_PER_SEC;
-               clc >>= hpet_clockevent.shift + DEFAULT_RTC_SHIFT;
-               hpet_default_delta = (unsigned long) clc;
-       }
-
-       if (!(hpet_rtc_flags & RTC_PIE) || hpet_pie_limit)
-               delta = hpet_default_delta;
-       else
-               delta = hpet_pie_delta;
-
-       local_irq_save(flags);
-
-       cnt = delta + hpet_readl(HPET_COUNTER);
-       hpet_writel(cnt, HPET_T1_CMP);
-       hpet_t1_cmp = cnt;
-
-       cfg = hpet_readl(HPET_T1_CFG);
-       cfg &= ~HPET_TN_PERIODIC;
-       cfg |= HPET_TN_ENABLE | HPET_TN_32BIT;
-       hpet_writel(cfg, HPET_T1_CFG);
-
-       local_irq_restore(flags);
-
-       return 1;
-}
-
-/*
- * The functions below are called from rtc driver.
- * Return 0 if HPET is not being used.
- * Otherwise do the necessary changes and return 1.
- */
-int hpet_mask_rtc_irq_bit(unsigned long bit_mask)
-{
-       if (!is_hpet_enabled())
-               return 0;
-
-       hpet_rtc_flags &= ~bit_mask;
-       return 1;
-}
-
-int hpet_set_rtc_irq_bit(unsigned long bit_mask)
-{
-       unsigned long oldbits = hpet_rtc_flags;
-
-       if (!is_hpet_enabled())
-               return 0;
-
-       hpet_rtc_flags |= bit_mask;
-
-       if (!oldbits)
-               hpet_rtc_timer_init();
-
-       return 1;
-}
-
-int hpet_set_alarm_time(unsigned char hrs, unsigned char min,
-                       unsigned char sec)
-{
-       if (!is_hpet_enabled())
-               return 0;
-
-       hpet_alarm_time.tm_hour = hrs;
-       hpet_alarm_time.tm_min = min;
-       hpet_alarm_time.tm_sec = sec;
-
-       return 1;
-}
-
-int hpet_set_periodic_freq(unsigned long freq)
-{
-       uint64_t clc;
-
-       if (!is_hpet_enabled())
-               return 0;
-
-       if (freq <= DEFAULT_RTC_INT_FREQ)
-               hpet_pie_limit = DEFAULT_RTC_INT_FREQ / freq;
-       else {
-               clc = (uint64_t) hpet_clockevent.mult * NSEC_PER_SEC;
-               do_div(clc, freq);
-               clc >>= hpet_clockevent.shift;
-               hpet_pie_delta = (unsigned long) clc;
-       }
-       return 1;
-}
-
-int hpet_rtc_dropped_irq(void)
-{
-       return is_hpet_enabled();
-}
-
-static void hpet_rtc_timer_reinit(void)
-{
-       unsigned long cfg, delta;
-       int lost_ints = -1;
-
-       if (unlikely(!hpet_rtc_flags)) {
-               cfg = hpet_readl(HPET_T1_CFG);
-               cfg &= ~HPET_TN_ENABLE;
-               hpet_writel(cfg, HPET_T1_CFG);
-               return;
-       }
-
-       if (!(hpet_rtc_flags & RTC_PIE) || hpet_pie_limit)
-               delta = hpet_default_delta;
-       else
-               delta = hpet_pie_delta;
-
-       /*
-        * Increment the comparator value until we are ahead of the
-        * current count.
-        */
-       do {
-               hpet_t1_cmp += delta;
-               hpet_writel(hpet_t1_cmp, HPET_T1_CMP);
-               lost_ints++;
-       } while ((long)(hpet_readl(HPET_COUNTER) - hpet_t1_cmp) > 0);
-
-       if (lost_ints) {
-               if (hpet_rtc_flags & RTC_PIE)
-                       hpet_pie_count += lost_ints;
-               if (printk_ratelimit())
-                       printk(KERN_WARNING "rtc: lost %d interrupts\n",
-                               lost_ints);
-       }
-}
-
-irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id)
-{
-       struct rtc_time curr_time;
-       unsigned long rtc_int_flag = 0;
-
-       hpet_rtc_timer_reinit();
-
-       if (hpet_rtc_flags & (RTC_UIE | RTC_AIE))
-               rtc_get_rtc_time(&curr_time);
-
-       if (hpet_rtc_flags & RTC_UIE &&
-           curr_time.tm_sec != hpet_prev_update_sec) {
-               rtc_int_flag = RTC_UF;
-               hpet_prev_update_sec = curr_time.tm_sec;
-       }
-
-       if (hpet_rtc_flags & RTC_PIE &&
-           ++hpet_pie_count >= hpet_pie_limit) {
-               rtc_int_flag |= RTC_PF;
-               hpet_pie_count = 0;
-       }
-
-       if (hpet_rtc_flags & RTC_PIE &&
-           (curr_time.tm_sec == hpet_alarm_time.tm_sec) &&
-           (curr_time.tm_min == hpet_alarm_time.tm_min) &&
-           (curr_time.tm_hour == hpet_alarm_time.tm_hour))
-                       rtc_int_flag |= RTC_AF;
-
-       if (rtc_int_flag) {
-               rtc_int_flag |= (RTC_IRQF | (RTC_NUM_INTS << 8));
-               rtc_interrupt(rtc_int_flag, dev_id);
-       }
-       return IRQ_HANDLED;
-}
-#endif
diff --git a/arch/i386/kernel/i386_ksyms_32.c b/arch/i386/kernel/i386_ksyms_32.c
deleted file mode 100644 (file)
index e3d4b73..0000000
+++ /dev/null
@@ -1,30 +0,0 @@
-#include <linux/module.h>
-#include <asm/checksum.h>
-#include <asm/desc.h>
-
-EXPORT_SYMBOL(__down_failed);
-EXPORT_SYMBOL(__down_failed_interruptible);
-EXPORT_SYMBOL(__down_failed_trylock);
-EXPORT_SYMBOL(__up_wakeup);
-/* Networking helper routines. */
-EXPORT_SYMBOL(csum_partial_copy_generic);
-
-EXPORT_SYMBOL(__get_user_1);
-EXPORT_SYMBOL(__get_user_2);
-EXPORT_SYMBOL(__get_user_4);
-
-EXPORT_SYMBOL(__put_user_1);
-EXPORT_SYMBOL(__put_user_2);
-EXPORT_SYMBOL(__put_user_4);
-EXPORT_SYMBOL(__put_user_8);
-
-EXPORT_SYMBOL(strstr);
-
-#ifdef CONFIG_SMP
-extern void FASTCALL( __write_lock_failed(rwlock_t *rw));
-extern void FASTCALL( __read_lock_failed(rwlock_t *rw));
-EXPORT_SYMBOL(__write_lock_failed);
-EXPORT_SYMBOL(__read_lock_failed);
-#endif
-
-EXPORT_SYMBOL(csum_partial);
diff --git a/arch/i386/kernel/i387_32.c b/arch/i386/kernel/i387_32.c
deleted file mode 100644 (file)
index 6658472..0000000
+++ /dev/null
@@ -1,546 +0,0 @@
-/*
- *  linux/arch/i386/kernel/i387.c
- *
- *  Copyright (C) 1994 Linus Torvalds
- *
- *  Pentium III FXSR, SSE support
- *  General FPU state handling cleanups
- *     Gareth Hughes <gareth@valinux.com>, May 2000
- */
-
-#include <linux/sched.h>
-#include <linux/module.h>
-#include <asm/processor.h>
-#include <asm/i387.h>
-#include <asm/math_emu.h>
-#include <asm/sigcontext.h>
-#include <asm/user.h>
-#include <asm/ptrace.h>
-#include <asm/uaccess.h>
-
-#ifdef CONFIG_MATH_EMULATION
-#define HAVE_HWFP (boot_cpu_data.hard_math)
-#else
-#define HAVE_HWFP 1
-#endif
-
-static unsigned long mxcsr_feature_mask __read_mostly = 0xffffffff;
-
-void mxcsr_feature_mask_init(void)
-{
-       unsigned long mask = 0;
-       clts();
-       if (cpu_has_fxsr) {
-               memset(&current->thread.i387.fxsave, 0, sizeof(struct i387_fxsave_struct));
-               asm volatile("fxsave %0" : : "m" (current->thread.i387.fxsave)); 
-               mask = current->thread.i387.fxsave.mxcsr_mask;
-               if (mask == 0) mask = 0x0000ffbf;
-       } 
-       mxcsr_feature_mask &= mask;
-       stts();
-}
-
-/*
- * The _current_ task is using the FPU for the first time
- * so initialize it and set the mxcsr to its default
- * value at reset if we support XMM instructions and then
- * remeber the current task has used the FPU.
- */
-void init_fpu(struct task_struct *tsk)
-{
-       if (cpu_has_fxsr) {
-               memset(&tsk->thread.i387.fxsave, 0, sizeof(struct i387_fxsave_struct));
-               tsk->thread.i387.fxsave.cwd = 0x37f;
-               if (cpu_has_xmm)
-                       tsk->thread.i387.fxsave.mxcsr = 0x1f80;
-       } else {
-               memset(&tsk->thread.i387.fsave, 0, sizeof(struct i387_fsave_struct));
-               tsk->thread.i387.fsave.cwd = 0xffff037fu;
-               tsk->thread.i387.fsave.swd = 0xffff0000u;
-               tsk->thread.i387.fsave.twd = 0xffffffffu;
-               tsk->thread.i387.fsave.fos = 0xffff0000u;
-       }
-       /* only the device not available exception or ptrace can call init_fpu */
-       set_stopped_child_used_math(tsk);
-}
-
-/*
- * FPU lazy state save handling.
- */
-
-void kernel_fpu_begin(void)
-{
-       struct thread_info *thread = current_thread_info();
-
-       preempt_disable();
-       if (thread->status & TS_USEDFPU) {
-               __save_init_fpu(thread->task);
-               return;
-       }
-       clts();
-}
-EXPORT_SYMBOL_GPL(kernel_fpu_begin);
-
-/*
- * FPU tag word conversions.
- */
-
-static inline unsigned short twd_i387_to_fxsr( unsigned short twd )
-{
-       unsigned int tmp; /* to avoid 16 bit prefixes in the code */
-       /* Transform each pair of bits into 01 (valid) or 00 (empty) */
-        tmp = ~twd;
-        tmp = (tmp | (tmp>>1)) & 0x5555; /* 0V0V0V0V0V0V0V0V */
-        /* and move the valid bits to the lower byte. */
-        tmp = (tmp | (tmp >> 1)) & 0x3333; /* 00VV00VV00VV00VV */
-        tmp = (tmp | (tmp >> 2)) & 0x0f0f; /* 0000VVVV0000VVVV */
-        tmp = (tmp | (tmp >> 4)) & 0x00ff; /* 00000000VVVVVVVV */
-        return tmp;
-}
-
-static inline unsigned long twd_fxsr_to_i387( struct i387_fxsave_struct *fxsave )
-{
-       struct _fpxreg *st = NULL;
-       unsigned long tos = (fxsave->swd >> 11) & 7;
-       unsigned long twd = (unsigned long) fxsave->twd;
-       unsigned long tag;
-       unsigned long ret = 0xffff0000u;
-       int i;
-
-#define FPREG_ADDR(f, n)       ((void *)&(f)->st_space + (n) * 16);
-
-       for ( i = 0 ; i < 8 ; i++ ) {
-               if ( twd & 0x1 ) {
-                       st = FPREG_ADDR( fxsave, (i - tos) & 7 );
-
-                       switch ( st->exponent & 0x7fff ) {
-                       case 0x7fff:
-                               tag = 2;                /* Special */
-                               break;
-                       case 0x0000:
-                               if ( !st->significand[0] &&
-                                    !st->significand[1] &&
-                                    !st->significand[2] &&
-                                    !st->significand[3] ) {
-                                       tag = 1;        /* Zero */
-                               } else {
-                                       tag = 2;        /* Special */
-                               }
-                               break;
-                       default:
-                               if ( st->significand[3] & 0x8000 ) {
-                                       tag = 0;        /* Valid */
-                               } else {
-                                       tag = 2;        /* Special */
-                               }
-                               break;
-                       }
-               } else {
-                       tag = 3;                        /* Empty */
-               }
-               ret |= (tag << (2 * i));
-               twd = twd >> 1;
-       }
-       return ret;
-}
-
-/*
- * FPU state interaction.
- */
-
-unsigned short get_fpu_cwd( struct task_struct *tsk )
-{
-       if ( cpu_has_fxsr ) {
-               return tsk->thread.i387.fxsave.cwd;
-       } else {
-               return (unsigned short)tsk->thread.i387.fsave.cwd;
-       }
-}
-
-unsigned short get_fpu_swd( struct task_struct *tsk )
-{
-       if ( cpu_has_fxsr ) {
-               return tsk->thread.i387.fxsave.swd;
-       } else {
-               return (unsigned short)tsk->thread.i387.fsave.swd;
-       }
-}
-
-#if 0
-unsigned short get_fpu_twd( struct task_struct *tsk )
-{
-       if ( cpu_has_fxsr ) {
-               return tsk->thread.i387.fxsave.twd;
-       } else {
-               return (unsigned short)tsk->thread.i387.fsave.twd;
-       }
-}
-#endif  /*  0  */
-
-unsigned short get_fpu_mxcsr( struct task_struct *tsk )
-{
-       if ( cpu_has_xmm ) {
-               return tsk->thread.i387.fxsave.mxcsr;
-       } else {
-               return 0x1f80;
-       }
-}
-
-#if 0
-
-void set_fpu_cwd( struct task_struct *tsk, unsigned short cwd )
-{
-       if ( cpu_has_fxsr ) {
-               tsk->thread.i387.fxsave.cwd = cwd;
-       } else {
-               tsk->thread.i387.fsave.cwd = ((long)cwd | 0xffff0000u);
-       }
-}
-
-void set_fpu_swd( struct task_struct *tsk, unsigned short swd )
-{
-       if ( cpu_has_fxsr ) {
-               tsk->thread.i387.fxsave.swd = swd;
-       } else {
-               tsk->thread.i387.fsave.swd = ((long)swd | 0xffff0000u);
-       }
-}
-
-void set_fpu_twd( struct task_struct *tsk, unsigned short twd )
-{
-       if ( cpu_has_fxsr ) {
-               tsk->thread.i387.fxsave.twd = twd_i387_to_fxsr(twd);
-       } else {
-               tsk->thread.i387.fsave.twd = ((long)twd | 0xffff0000u);
-       }
-}
-
-#endif  /*  0  */
-
-/*
- * FXSR floating point environment conversions.
- */
-
-static int convert_fxsr_to_user( struct _fpstate __user *buf,
-                                       struct i387_fxsave_struct *fxsave )
-{
-       unsigned long env[7];
-       struct _fpreg __user *to;
-       struct _fpxreg *from;
-       int i;
-
-       env[0] = (unsigned long)fxsave->cwd | 0xffff0000ul;
-       env[1] = (unsigned long)fxsave->swd | 0xffff0000ul;
-       env[2] = twd_fxsr_to_i387(fxsave);
-       env[3] = fxsave->fip;
-       env[4] = fxsave->fcs | ((unsigned long)fxsave->fop << 16);
-       env[5] = fxsave->foo;
-       env[6] = fxsave->fos;
-
-       if ( __copy_to_user( buf, env, 7 * sizeof(unsigned long) ) )
-               return 1;
-
-       to = &buf->_st[0];
-       from = (struct _fpxreg *) &fxsave->st_space[0];
-       for ( i = 0 ; i < 8 ; i++, to++, from++ ) {
-               unsigned long __user *t = (unsigned long __user *)to;
-               unsigned long *f = (unsigned long *)from;
-
-               if (__put_user(*f, t) ||
-                               __put_user(*(f + 1), t + 1) ||
-                               __put_user(from->exponent, &to->exponent))
-                       return 1;
-       }
-       return 0;
-}
-
-static int convert_fxsr_from_user( struct i387_fxsave_struct *fxsave,
-                                         struct _fpstate __user *buf )
-{
-       unsigned long env[7];
-       struct _fpxreg *to;
-       struct _fpreg __user *from;
-       int i;
-
-       if ( __copy_from_user( env, buf, 7 * sizeof(long) ) )
-               return 1;
-
-       fxsave->cwd = (unsigned short)(env[0] & 0xffff);
-       fxsave->swd = (unsigned short)(env[1] & 0xffff);
-       fxsave->twd = twd_i387_to_fxsr((unsigned short)(env[2] & 0xffff));
-       fxsave->fip = env[3];
-       fxsave->fop = (unsigned short)((env[4] & 0xffff0000ul) >> 16);
-       fxsave->fcs = (env[4] & 0xffff);
-       fxsave->foo = env[5];
-       fxsave->fos = env[6];
-
-       to = (struct _fpxreg *) &fxsave->st_space[0];
-       from = &buf->_st[0];
-       for ( i = 0 ; i < 8 ; i++, to++, from++ ) {
-               unsigned long *t = (unsigned long *)to;
-               unsigned long __user *f = (unsigned long __user *)from;
-
-               if (__get_user(*t, f) ||
-                               __get_user(*(t + 1), f + 1) ||
-                               __get_user(to->exponent, &from->exponent))
-                       return 1;
-       }
-       return 0;
-}
-
-/*
- * Signal frame handlers.
- */
-
-static inline int save_i387_fsave( struct _fpstate __user *buf )
-{
-       struct task_struct *tsk = current;
-
-       unlazy_fpu( tsk );
-       tsk->thread.i387.fsave.status = tsk->thread.i387.fsave.swd;
-       if ( __copy_to_user( buf, &tsk->thread.i387.fsave,
-                            sizeof(struct i387_fsave_struct) ) )
-               return -1;
-       return 1;
-}
-
-static int save_i387_fxsave( struct _fpstate __user *buf )
-{
-       struct task_struct *tsk = current;
-       int err = 0;
-
-       unlazy_fpu( tsk );
-
-       if ( convert_fxsr_to_user( buf, &tsk->thread.i387.fxsave ) )
-               return -1;
-
-       err |= __put_user( tsk->thread.i387.fxsave.swd, &buf->status );
-       err |= __put_user( X86_FXSR_MAGIC, &buf->magic );
-       if ( err )
-               return -1;
-
-       if ( __copy_to_user( &buf->_fxsr_env[0], &tsk->thread.i387.fxsave,
-                            sizeof(struct i387_fxsave_struct) ) )
-               return -1;
-       return 1;
-}
-
-int save_i387( struct _fpstate __user *buf )
-{
-       if ( !used_math() )
-               return 0;
-
-       /* This will cause a "finit" to be triggered by the next
-        * attempted FPU operation by the 'current' process.
-        */
-       clear_used_math();
-
-       if ( HAVE_HWFP ) {
-               if ( cpu_has_fxsr ) {
-                       return save_i387_fxsave( buf );
-               } else {
-                       return save_i387_fsave( buf );
-               }
-       } else {
-               return save_i387_soft( &current->thread.i387.soft, buf );
-       }
-}
-
-static inline int restore_i387_fsave( struct _fpstate __user *buf )
-{
-       struct task_struct *tsk = current;
-       clear_fpu( tsk );
-       return __copy_from_user( &tsk->thread.i387.fsave, buf,
-                                sizeof(struct i387_fsave_struct) );
-}
-
-static int restore_i387_fxsave( struct _fpstate __user *buf )
-{
-       int err;
-       struct task_struct *tsk = current;
-       clear_fpu( tsk );
-       err = __copy_from_user( &tsk->thread.i387.fxsave, &buf->_fxsr_env[0],
-                               sizeof(struct i387_fxsave_struct) );
-       /* mxcsr reserved bits must be masked to zero for security reasons */
-       tsk->thread.i387.fxsave.mxcsr &= mxcsr_feature_mask;
-       return err ? 1 : convert_fxsr_from_user( &tsk->thread.i387.fxsave, buf );
-}
-
-int restore_i387( struct _fpstate __user *buf )
-{
-       int err;
-
-       if ( HAVE_HWFP ) {
-               if ( cpu_has_fxsr ) {
-                       err = restore_i387_fxsave( buf );
-               } else {
-                       err = restore_i387_fsave( buf );
-               }
-       } else {
-               err = restore_i387_soft( &current->thread.i387.soft, buf );
-       }
-       set_used_math();
-       return err;
-}
-
-/*
- * ptrace request handlers.
- */
-
-static inline int get_fpregs_fsave( struct user_i387_struct __user *buf,
-                                   struct task_struct *tsk )
-{
-       return __copy_to_user( buf, &tsk->thread.i387.fsave,
-                              sizeof(struct user_i387_struct) );
-}
-
-static inline int get_fpregs_fxsave( struct user_i387_struct __user *buf,
-                                    struct task_struct *tsk )
-{
-       return convert_fxsr_to_user( (struct _fpstate __user *)buf,
-                                    &tsk->thread.i387.fxsave );
-}
-
-int get_fpregs( struct user_i387_struct __user *buf, struct task_struct *tsk )
-{
-       if ( HAVE_HWFP ) {
-               if ( cpu_has_fxsr ) {
-                       return get_fpregs_fxsave( buf, tsk );
-               } else {
-                       return get_fpregs_fsave( buf, tsk );
-               }
-       } else {
-               return save_i387_soft( &tsk->thread.i387.soft,
-                                      (struct _fpstate __user *)buf );
-       }
-}
-
-static inline int set_fpregs_fsave( struct task_struct *tsk,
-                                   struct user_i387_struct __user *buf )
-{
-       return __copy_from_user( &tsk->thread.i387.fsave, buf,
-                                sizeof(struct user_i387_struct) );
-}
-
-static inline int set_fpregs_fxsave( struct task_struct *tsk,
-                                    struct user_i387_struct __user *buf )
-{
-       return convert_fxsr_from_user( &tsk->thread.i387.fxsave,
-                                      (struct _fpstate __user *)buf );
-}
-
-int set_fpregs( struct task_struct *tsk, struct user_i387_struct __user *buf )
-{
-       if ( HAVE_HWFP ) {
-               if ( cpu_has_fxsr ) {
-                       return set_fpregs_fxsave( tsk, buf );
-               } else {
-                       return set_fpregs_fsave( tsk, buf );
-               }
-       } else {
-               return restore_i387_soft( &tsk->thread.i387.soft,
-                                         (struct _fpstate __user *)buf );
-       }
-}
-
-int get_fpxregs( struct user_fxsr_struct __user *buf, struct task_struct *tsk )
-{
-       if ( cpu_has_fxsr ) {
-               if (__copy_to_user( buf, &tsk->thread.i387.fxsave,
-                                   sizeof(struct user_fxsr_struct) ))
-                       return -EFAULT;
-               return 0;
-       } else {
-               return -EIO;
-       }
-}
-
-int set_fpxregs( struct task_struct *tsk, struct user_fxsr_struct __user *buf )
-{
-       int ret = 0;
-
-       if ( cpu_has_fxsr ) {
-               if (__copy_from_user( &tsk->thread.i387.fxsave, buf,
-                                 sizeof(struct user_fxsr_struct) ))
-                       ret = -EFAULT;
-               /* mxcsr reserved bits must be masked to zero for security reasons */
-               tsk->thread.i387.fxsave.mxcsr &= mxcsr_feature_mask;
-       } else {
-               ret = -EIO;
-       }
-       return ret;
-}
-
-/*
- * FPU state for core dumps.
- */
-
-static inline void copy_fpu_fsave( struct task_struct *tsk,
-                                  struct user_i387_struct *fpu )
-{
-       memcpy( fpu, &tsk->thread.i387.fsave,
-               sizeof(struct user_i387_struct) );
-}
-
-static inline void copy_fpu_fxsave( struct task_struct *tsk,
-                                  struct user_i387_struct *fpu )
-{
-       unsigned short *to;
-       unsigned short *from;
-       int i;
-
-       memcpy( fpu, &tsk->thread.i387.fxsave, 7 * sizeof(long) );
-
-       to = (unsigned short *)&fpu->st_space[0];
-       from = (unsigned short *)&tsk->thread.i387.fxsave.st_space[0];
-       for ( i = 0 ; i < 8 ; i++, to += 5, from += 8 ) {
-               memcpy( to, from, 5 * sizeof(unsigned short) );
-       }
-}
-
-int dump_fpu( struct pt_regs *regs, struct user_i387_struct *fpu )
-{
-       int fpvalid;
-       struct task_struct *tsk = current;
-
-       fpvalid = !!used_math();
-       if ( fpvalid ) {
-               unlazy_fpu( tsk );
-               if ( cpu_has_fxsr ) {
-                       copy_fpu_fxsave( tsk, fpu );
-               } else {
-                       copy_fpu_fsave( tsk, fpu );
-               }
-       }
-
-       return fpvalid;
-}
-EXPORT_SYMBOL(dump_fpu);
-
-int dump_task_fpu(struct task_struct *tsk, struct user_i387_struct *fpu)
-{
-       int fpvalid = !!tsk_used_math(tsk);
-
-       if (fpvalid) {
-               if (tsk == current)
-                       unlazy_fpu(tsk);
-               if (cpu_has_fxsr)
-                       copy_fpu_fxsave(tsk, fpu);
-               else
-                       copy_fpu_fsave(tsk, fpu);
-       }
-       return fpvalid;
-}
-
-int dump_task_extended_fpu(struct task_struct *tsk, struct user_fxsr_struct *fpu)
-{
-       int fpvalid = tsk_used_math(tsk) && cpu_has_fxsr;
-
-       if (fpvalid) {
-               if (tsk == current)
-                      unlazy_fpu(tsk);
-               memcpy(fpu, &tsk->thread.i387.fxsave, sizeof(*fpu));
-       }
-       return fpvalid;
-}
diff --git a/arch/i386/kernel/i8237.c b/arch/i386/kernel/i8237.c
deleted file mode 100644 (file)
index 6f508e8..0000000
+++ /dev/null
@@ -1,72 +0,0 @@
-/*
- * i8237.c: 8237A DMA controller suspend functions.
- *
- * Written by Pierre Ossman, 2005.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or (at
- * your option) any later version.
- */
-
-#include <linux/init.h>
-#include <linux/sysdev.h>
-
-#include <asm/dma.h>
-
-/*
- * This module just handles suspend/resume issues with the
- * 8237A DMA controller (used for ISA and LPC).
- * Allocation is handled in kernel/dma.c and normal usage is
- * in asm/dma.h.
- */
-
-static int i8237A_resume(struct sys_device *dev)
-{
-       unsigned long flags;
-       int i;
-
-       flags = claim_dma_lock();
-
-       dma_outb(DMA1_RESET_REG, 0);
-       dma_outb(DMA2_RESET_REG, 0);
-
-       for (i = 0;i < 8;i++) {
-               set_dma_addr(i, 0x000000);
-               /* DMA count is a bit weird so this is not 0 */
-               set_dma_count(i, 1);
-       }
-
-       /* Enable cascade DMA or channel 0-3 won't work */
-       enable_dma(4);
-
-       release_dma_lock(flags);
-
-       return 0;
-}
-
-static int i8237A_suspend(struct sys_device *dev, pm_message_t state)
-{
-       return 0;
-}
-
-static struct sysdev_class i8237_sysdev_class = {
-       set_kset_name("i8237"),
-       .suspend = i8237A_suspend,
-       .resume = i8237A_resume,
-};
-
-static struct sys_device device_i8237A = {
-       .id     = 0,
-       .cls    = &i8237_sysdev_class,
-};
-
-static int __init i8237A_init_sysfs(void)
-{
-       int error = sysdev_class_register(&i8237_sysdev_class);
-       if (!error)
-               error = sysdev_register(&device_i8237A);
-       return error;
-}
-
-device_initcall(i8237A_init_sysfs);
diff --git a/arch/i386/kernel/i8253_32.c b/arch/i386/kernel/i8253_32.c
deleted file mode 100644 (file)
index 6d839f2..0000000
+++ /dev/null
@@ -1,206 +0,0 @@
-/*
- * i8253.c  8253/PIT functions
- *
- */
-#include <linux/clockchips.h>
-#include <linux/init.h>
-#include <linux/interrupt.h>
-#include <linux/jiffies.h>
-#include <linux/module.h>
-#include <linux/spinlock.h>
-
-#include <asm/smp.h>
-#include <asm/delay.h>
-#include <asm/i8253.h>
-#include <asm/io.h>
-#include <asm/timer.h>
-
-DEFINE_SPINLOCK(i8253_lock);
-EXPORT_SYMBOL(i8253_lock);
-
-/*
- * HPET replaces the PIT, when enabled. So we need to know, which of
- * the two timers is used
- */
-struct clock_event_device *global_clock_event;
-
-/*
- * Initialize the PIT timer.
- *
- * This is also called after resume to bring the PIT into operation again.
- */
-static void init_pit_timer(enum clock_event_mode mode,
-                          struct clock_event_device *evt)
-{
-       unsigned long flags;
-
-       spin_lock_irqsave(&i8253_lock, flags);
-
-       switch(mode) {
-       case CLOCK_EVT_MODE_PERIODIC:
-               /* binary, mode 2, LSB/MSB, ch 0 */
-               outb_p(0x34, PIT_MODE);
-               outb_p(LATCH & 0xff , PIT_CH0); /* LSB */
-               outb(LATCH >> 8 , PIT_CH0);     /* MSB */
-               break;
-
-       case CLOCK_EVT_MODE_SHUTDOWN:
-       case CLOCK_EVT_MODE_UNUSED:
-               if (evt->mode == CLOCK_EVT_MODE_PERIODIC ||
-                   evt->mode == CLOCK_EVT_MODE_ONESHOT) {
-                       outb_p(0x30, PIT_MODE);
-                       outb_p(0, PIT_CH0);
-                       outb_p(0, PIT_CH0);
-               }
-               break;
-
-       case CLOCK_EVT_MODE_ONESHOT:
-               /* One shot setup */
-               outb_p(0x38, PIT_MODE);
-               break;
-
-       case CLOCK_EVT_MODE_RESUME:
-               /* Nothing to do here */
-               break;
-       }
-       spin_unlock_irqrestore(&i8253_lock, flags);
-}
-
-/*
- * Program the next event in oneshot mode
- *
- * Delta is given in PIT ticks
- */
-static int pit_next_event(unsigned long delta, struct clock_event_device *evt)
-{
-       unsigned long flags;
-
-       spin_lock_irqsave(&i8253_lock, flags);
-       outb_p(delta & 0xff , PIT_CH0); /* LSB */
-       outb(delta >> 8 , PIT_CH0);     /* MSB */
-       spin_unlock_irqrestore(&i8253_lock, flags);
-
-       return 0;
-}
-
-/*
- * On UP the PIT can serve all of the possible timer functions. On SMP systems
- * it can be solely used for the global tick.
- *
- * The profiling and update capabilites are switched off once the local apic is
- * registered. This mechanism replaces the previous #ifdef LOCAL_APIC -
- * !using_apic_timer decisions in do_timer_interrupt_hook()
- */
-struct clock_event_device pit_clockevent = {
-       .name           = "pit",
-       .features       = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
-       .set_mode       = init_pit_timer,
-       .set_next_event = pit_next_event,
-       .shift          = 32,
-       .irq            = 0,
-};
-
-/*
- * Initialize the conversion factor and the min/max deltas of the clock event
- * structure and register the clock event source with the framework.
- */
-void __init setup_pit_timer(void)
-{
-       /*
-        * Start pit with the boot cpu mask and make it global after the
-        * IO_APIC has been initialized.
-        */
-       pit_clockevent.cpumask = cpumask_of_cpu(smp_processor_id());
-       pit_clockevent.mult = div_sc(CLOCK_TICK_RATE, NSEC_PER_SEC, 32);
-       pit_clockevent.max_delta_ns =
-               clockevent_delta2ns(0x7FFF, &pit_clockevent);
-       pit_clockevent.min_delta_ns =
-               clockevent_delta2ns(0xF, &pit_clockevent);
-       clockevents_register_device(&pit_clockevent);
-       global_clock_event = &pit_clockevent;
-}
-
-/*
- * Since the PIT overflows every tick, its not very useful
- * to just read by itself. So use jiffies to emulate a free
- * running counter:
- */
-static cycle_t pit_read(void)
-{
-       unsigned long flags;
-       int count;
-       u32 jifs;
-       static int old_count;
-       static u32 old_jifs;
-
-       spin_lock_irqsave(&i8253_lock, flags);
-       /*
-        * Although our caller may have the read side of xtime_lock,
-        * this is now a seqlock, and we are cheating in this routine
-        * by having side effects on state that we cannot undo if
-        * there is a collision on the seqlock and our caller has to
-        * retry.  (Namely, old_jifs and old_count.)  So we must treat
-        * jiffies as volatile despite the lock.  We read jiffies
-        * before latching the timer count to guarantee that although
-        * the jiffies value might be older than the count (that is,
-        * the counter may underflow between the last point where
-        * jiffies was incremented and the point where we latch the
-        * count), it cannot be newer.
-        */
-       jifs = jiffies;
-       outb_p(0x00, PIT_MODE); /* latch the count ASAP */
-       count = inb_p(PIT_CH0); /* read the latched count */
-       count |= inb_p(PIT_CH0) << 8;
-
-       /* VIA686a test code... reset the latch if count > max + 1 */
-       if (count > LATCH) {
-               outb_p(0x34, PIT_MODE);
-               outb_p(LATCH & 0xff, PIT_CH0);
-               outb(LATCH >> 8, PIT_CH0);
-               count = LATCH - 1;
-       }
-
-       /*
-        * It's possible for count to appear to go the wrong way for a
-        * couple of reasons:
-        *
-        *  1. The timer counter underflows, but we haven't handled the
-        *     resulting interrupt and incremented jiffies yet.
-        *  2. Hardware problem with the timer, not giving us continuous time,
-        *     the counter does small "jumps" upwards on some Pentium systems,
-        *     (see c't 95/10 page 335 for Neptun bug.)
-        *
-        * Previous attempts to handle these cases intelligently were
-        * buggy, so we just do the simple thing now.
-        */
-       if (count > old_count && jifs == old_jifs) {
-               count = old_count;
-       }
-       old_count = count;
-       old_jifs = jifs;
-
-       spin_unlock_irqrestore(&i8253_lock, flags);
-
-       count = (LATCH - 1) - count;
-
-       return (cycle_t)(jifs * LATCH) + count;
-}
-
-static struct clocksource clocksource_pit = {
-       .name   = "pit",
-       .rating = 110,
-       .read   = pit_read,
-       .mask   = CLOCKSOURCE_MASK(32),
-       .mult   = 0,
-       .shift  = 20,
-};
-
-static int __init init_pit_clocksource(void)
-{
-       if (num_possible_cpus() > 1) /* PIT does not scale! */
-               return 0;
-
-       clocksource_pit.mult = clocksource_hz2mult(CLOCK_TICK_RATE, 20);
-       return clocksource_register(&clocksource_pit);
-}
-arch_initcall(init_pit_clocksource);
diff --git a/arch/i386/kernel/i8259_32.c b/arch/i386/kernel/i8259_32.c
deleted file mode 100644 (file)
index 0499cbe..0000000
+++ /dev/null
@@ -1,420 +0,0 @@
-#include <linux/errno.h>
-#include <linux/signal.h>
-#include <linux/sched.h>
-#include <linux/ioport.h>
-#include <linux/interrupt.h>
-#include <linux/slab.h>
-#include <linux/random.h>
-#include <linux/init.h>
-#include <linux/kernel_stat.h>
-#include <linux/sysdev.h>
-#include <linux/bitops.h>
-
-#include <asm/8253pit.h>
-#include <asm/atomic.h>
-#include <asm/system.h>
-#include <asm/io.h>
-#include <asm/timer.h>
-#include <asm/pgtable.h>
-#include <asm/delay.h>
-#include <asm/desc.h>
-#include <asm/apic.h>
-#include <asm/arch_hooks.h>
-#include <asm/i8259.h>
-
-#include <io_ports.h>
-
-/*
- * This is the 'legacy' 8259A Programmable Interrupt Controller,
- * present in the majority of PC/AT boxes.
- * plus some generic x86 specific things if generic specifics makes
- * any sense at all.
- * this file should become arch/i386/kernel/irq.c when the old irq.c
- * moves to arch independent land
- */
-
-static int i8259A_auto_eoi;
-DEFINE_SPINLOCK(i8259A_lock);
-static void mask_and_ack_8259A(unsigned int);
-
-static struct irq_chip i8259A_chip = {
-       .name           = "XT-PIC",
-       .mask           = disable_8259A_irq,
-       .disable        = disable_8259A_irq,
-       .unmask         = enable_8259A_irq,
-       .mask_ack       = mask_and_ack_8259A,
-};
-
-/*
- * 8259A PIC functions to handle ISA devices:
- */
-
-/*
- * This contains the irq mask for both 8259A irq controllers,
- */
-unsigned int cached_irq_mask = 0xffff;
-
-/*
- * Not all IRQs can be routed through the IO-APIC, eg. on certain (older)
- * boards the timer interrupt is not really connected to any IO-APIC pin,
- * it's fed to the master 8259A's IR0 line only.
- *
- * Any '1' bit in this mask means the IRQ is routed through the IO-APIC.
- * this 'mixed mode' IRQ handling costs nothing because it's only used
- * at IRQ setup time.
- */
-unsigned long io_apic_irqs;
-
-void disable_8259A_irq(unsigned int irq)
-{
-       unsigned int mask = 1 << irq;
-       unsigned long flags;
-
-       spin_lock_irqsave(&i8259A_lock, flags);
-       cached_irq_mask |= mask;
-       if (irq & 8)
-               outb(cached_slave_mask, PIC_SLAVE_IMR);
-       else
-               outb(cached_master_mask, PIC_MASTER_IMR);
-       spin_unlock_irqrestore(&i8259A_lock, flags);
-}
-
-void enable_8259A_irq(unsigned int irq)
-{
-       unsigned int mask = ~(1 << irq);
-       unsigned long flags;
-
-       spin_lock_irqsave(&i8259A_lock, flags);
-       cached_irq_mask &= mask;
-       if (irq & 8)
-               outb(cached_slave_mask, PIC_SLAVE_IMR);
-       else
-               outb(cached_master_mask, PIC_MASTER_IMR);
-       spin_unlock_irqrestore(&i8259A_lock, flags);
-}
-
-int i8259A_irq_pending(unsigned int irq)
-{
-       unsigned int mask = 1<<irq;
-       unsigned long flags;
-       int ret;
-
-       spin_lock_irqsave(&i8259A_lock, flags);
-       if (irq < 8)
-               ret = inb(PIC_MASTER_CMD) & mask;
-       else
-               ret = inb(PIC_SLAVE_CMD) & (mask >> 8);
-       spin_unlock_irqrestore(&i8259A_lock, flags);
-
-       return ret;
-}
-
-void make_8259A_irq(unsigned int irq)
-{
-       disable_irq_nosync(irq);
-       io_apic_irqs &= ~(1<<irq);
-       set_irq_chip_and_handler_name(irq, &i8259A_chip, handle_level_irq,
-                                     "XT");
-       enable_irq(irq);
-}
-
-/*
- * This function assumes to be called rarely. Switching between
- * 8259A registers is slow.
- * This has to be protected by the irq controller spinlock
- * before being called.
- */
-static inline int i8259A_irq_real(unsigned int irq)
-{
-       int value;
-       int irqmask = 1<<irq;
-
-       if (irq < 8) {
-               outb(0x0B,PIC_MASTER_CMD);      /* ISR register */
-               value = inb(PIC_MASTER_CMD) & irqmask;
-               outb(0x0A,PIC_MASTER_CMD);      /* back to the IRR register */
-               return value;
-       }
-       outb(0x0B,PIC_SLAVE_CMD);       /* ISR register */
-       value = inb(PIC_SLAVE_CMD) & (irqmask >> 8);
-       outb(0x0A,PIC_SLAVE_CMD);       /* back to the IRR register */
-       return value;
-}
-
-/*
- * Careful! The 8259A is a fragile beast, it pretty
- * much _has_ to be done exactly like this (mask it
- * first, _then_ send the EOI, and the order of EOI
- * to the two 8259s is important!
- */
-static void mask_and_ack_8259A(unsigned int irq)
-{
-       unsigned int irqmask = 1 << irq;
-       unsigned long flags;
-
-       spin_lock_irqsave(&i8259A_lock, flags);
-       /*
-        * Lightweight spurious IRQ detection. We do not want
-        * to overdo spurious IRQ handling - it's usually a sign
-        * of hardware problems, so we only do the checks we can
-        * do without slowing down good hardware unnecessarily.
-        *
-        * Note that IRQ7 and IRQ15 (the two spurious IRQs
-        * usually resulting from the 8259A-1|2 PICs) occur
-        * even if the IRQ is masked in the 8259A. Thus we
-        * can check spurious 8259A IRQs without doing the
-        * quite slow i8259A_irq_real() call for every IRQ.
-        * This does not cover 100% of spurious interrupts,
-        * but should be enough to warn the user that there
-        * is something bad going on ...
-        */
-       if (cached_irq_mask & irqmask)
-               goto spurious_8259A_irq;
-       cached_irq_mask |= irqmask;
-
-handle_real_irq:
-       if (irq & 8) {
-               inb(PIC_SLAVE_IMR);     /* DUMMY - (do we need this?) */
-               outb(cached_slave_mask, PIC_SLAVE_IMR);
-               outb(0x60+(irq&7),PIC_SLAVE_CMD);/* 'Specific EOI' to slave */
-               outb(0x60+PIC_CASCADE_IR,PIC_MASTER_CMD); /* 'Specific EOI' to master-IRQ2 */
-       } else {
-               inb(PIC_MASTER_IMR);    /* DUMMY - (do we need this?) */
-               outb(cached_master_mask, PIC_MASTER_IMR);
-               outb(0x60+irq,PIC_MASTER_CMD);  /* 'Specific EOI to master */
-       }
-       spin_unlock_irqrestore(&i8259A_lock, flags);
-       return;
-
-spurious_8259A_irq:
-       /*
-        * this is the slow path - should happen rarely.
-        */
-       if (i8259A_irq_real(irq))
-               /*
-                * oops, the IRQ _is_ in service according to the
-                * 8259A - not spurious, go handle it.
-                */
-               goto handle_real_irq;
-
-       {
-               static int spurious_irq_mask;
-               /*
-                * At this point we can be sure the IRQ is spurious,
-                * lets ACK and report it. [once per IRQ]
-                */
-               if (!(spurious_irq_mask & irqmask)) {
-                       printk(KERN_DEBUG "spurious 8259A interrupt: IRQ%d.\n", irq);
-                       spurious_irq_mask |= irqmask;
-               }
-               atomic_inc(&irq_err_count);
-               /*
-                * Theoretically we do not have to handle this IRQ,
-                * but in Linux this does not cause problems and is
-                * simpler for us.
-                */
-               goto handle_real_irq;
-       }
-}
-
-static char irq_trigger[2];
-/**
- * ELCR registers (0x4d0, 0x4d1) control edge/level of IRQ
- */
-static void restore_ELCR(char *trigger)
-{
-       outb(trigger[0], 0x4d0);
-       outb(trigger[1], 0x4d1);
-}
-
-static void save_ELCR(char *trigger)
-{
-       /* IRQ 0,1,2,8,13 are marked as reserved */
-       trigger[0] = inb(0x4d0) & 0xF8;
-       trigger[1] = inb(0x4d1) & 0xDE;
-}
-
-static int i8259A_resume(struct sys_device *dev)
-{
-       init_8259A(i8259A_auto_eoi);
-       restore_ELCR(irq_trigger);
-       return 0;
-}
-
-static int i8259A_suspend(struct sys_device *dev, pm_message_t state)
-{
-       save_ELCR(irq_trigger);
-       return 0;
-}
-
-static int i8259A_shutdown(struct sys_device *dev)
-{
-       /* Put the i8259A into a quiescent state that
-        * the kernel initialization code can get it
-        * out of.
-        */
-       outb(0xff, PIC_MASTER_IMR);     /* mask all of 8259A-1 */
-       outb(0xff, PIC_SLAVE_IMR);      /* mask all of 8259A-1 */
-       return 0;
-}
-
-static struct sysdev_class i8259_sysdev_class = {
-       set_kset_name("i8259"),
-       .suspend = i8259A_suspend,
-       .resume = i8259A_resume,
-       .shutdown = i8259A_shutdown,
-};
-
-static struct sys_device device_i8259A = {
-       .id     = 0,
-       .cls    = &i8259_sysdev_class,
-};
-
-static int __init i8259A_init_sysfs(void)
-{
-       int error = sysdev_class_register(&i8259_sysdev_class);
-       if (!error)
-               error = sysdev_register(&device_i8259A);
-       return error;
-}
-
-device_initcall(i8259A_init_sysfs);
-
-void init_8259A(int auto_eoi)
-{
-       unsigned long flags;
-
-       i8259A_auto_eoi = auto_eoi;
-
-       spin_lock_irqsave(&i8259A_lock, flags);
-
-       outb(0xff, PIC_MASTER_IMR);     /* mask all of 8259A-1 */
-       outb(0xff, PIC_SLAVE_IMR);      /* mask all of 8259A-2 */
-
-       /*
-        * outb_p - this has to work on a wide range of PC hardware.
-        */
-       outb_p(0x11, PIC_MASTER_CMD);   /* ICW1: select 8259A-1 init */
-       outb_p(0x20 + 0, PIC_MASTER_IMR);       /* ICW2: 8259A-1 IR0-7 mapped to 0x20-0x27 */
-       outb_p(1U << PIC_CASCADE_IR, PIC_MASTER_IMR);   /* 8259A-1 (the master) has a slave on IR2 */
-       if (auto_eoi)   /* master does Auto EOI */
-               outb_p(MASTER_ICW4_DEFAULT | PIC_ICW4_AEOI, PIC_MASTER_IMR);
-       else            /* master expects normal EOI */
-               outb_p(MASTER_ICW4_DEFAULT, PIC_MASTER_IMR);
-
-       outb_p(0x11, PIC_SLAVE_CMD);    /* ICW1: select 8259A-2 init */
-       outb_p(0x20 + 8, PIC_SLAVE_IMR);        /* ICW2: 8259A-2 IR0-7 mapped to 0x28-0x2f */
-       outb_p(PIC_CASCADE_IR, PIC_SLAVE_IMR);  /* 8259A-2 is a slave on master's IR2 */
-       outb_p(SLAVE_ICW4_DEFAULT, PIC_SLAVE_IMR); /* (slave's support for AEOI in flat mode is to be investigated) */
-       if (auto_eoi)
-               /*
-                * In AEOI mode we just have to mask the interrupt
-                * when acking.
-                */
-               i8259A_chip.mask_ack = disable_8259A_irq;
-       else
-               i8259A_chip.mask_ack = mask_and_ack_8259A;
-
-       udelay(100);            /* wait for 8259A to initialize */
-
-       outb(cached_master_mask, PIC_MASTER_IMR); /* restore master IRQ mask */
-       outb(cached_slave_mask, PIC_SLAVE_IMR);   /* restore slave IRQ mask */
-
-       spin_unlock_irqrestore(&i8259A_lock, flags);
-}
-
-/*
- * Note that on a 486, we don't want to do a SIGFPE on an irq13
- * as the irq is unreliable, and exception 16 works correctly
- * (ie as explained in the intel literature). On a 386, you
- * can't use exception 16 due to bad IBM design, so we have to
- * rely on the less exact irq13.
- *
- * Careful.. Not only is IRQ13 unreliable, but it is also
- * leads to races. IBM designers who came up with it should
- * be shot.
- */
-
-static irqreturn_t math_error_irq(int cpl, void *dev_id)
-{
-       extern void math_error(void __user *);
-       outb(0,0xF0);
-       if (ignore_fpu_irq || !boot_cpu_data.hard_math)
-               return IRQ_NONE;
-       math_error((void __user *)get_irq_regs()->eip);
-       return IRQ_HANDLED;
-}
-
-/*
- * New motherboards sometimes make IRQ 13 be a PCI interrupt,
- * so allow interrupt sharing.
- */
-static struct irqaction fpu_irq = { math_error_irq, 0, CPU_MASK_NONE, "fpu", NULL, NULL };
-
-void __init init_ISA_irqs (void)
-{
-       int i;
-
-#ifdef CONFIG_X86_LOCAL_APIC
-       init_bsp_APIC();
-#endif
-       init_8259A(0);
-
-       for (i = 0; i < NR_IRQS; i++) {
-               irq_desc[i].status = IRQ_DISABLED;
-               irq_desc[i].action = NULL;
-               irq_desc[i].depth = 1;
-
-               if (i < 16) {
-                       /*
-                        * 16 old-style INTA-cycle interrupts:
-                        */
-                       set_irq_chip_and_handler_name(i, &i8259A_chip,
-                                                     handle_level_irq, "XT");
-               } else {
-                       /*
-                        * 'high' PCI IRQs filled in on demand
-                        */
-                       irq_desc[i].chip = &no_irq_chip;
-               }
-       }
-}
-
-/* Overridden in paravirt.c */
-void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ")));
-
-void __init native_init_IRQ(void)
-{
-       int i;
-
-       /* all the set up before the call gates are initialised */
-       pre_intr_init_hook();
-
-       /*
-        * Cover the whole vector space, no vector can escape
-        * us. (some of these will be overridden and become
-        * 'special' SMP interrupts)
-        */
-       for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) {
-               int vector = FIRST_EXTERNAL_VECTOR + i;
-               if (i >= NR_IRQS)
-                       break;
-               if (vector != SYSCALL_VECTOR) 
-                       set_intr_gate(vector, interrupt[i]);
-       }
-
-       /* setup after call gates are initialised (usually add in
-        * the architecture specific gates)
-        */
-       intr_init_hook();
-
-       /*
-        * External FPU? Set up irq13 if so, for
-        * original braindamaged IBM FERR coupling.
-        */
-       if (boot_cpu_data.hard_math && !cpu_has_fpu)
-               setup_irq(FPU_IRQ, &fpu_irq);
-
-       irq_ctx_init(smp_processor_id());
-}
diff --git a/arch/i386/kernel/init_task_32.c b/arch/i386/kernel/init_task_32.c
deleted file mode 100644 (file)
index d26fc06..0000000
+++ /dev/null
@@ -1,46 +0,0 @@
-#include <linux/mm.h>
-#include <linux/module.h>
-#include <linux/sched.h>
-#include <linux/init.h>
-#include <linux/init_task.h>
-#include <linux/fs.h>
-#include <linux/mqueue.h>
-
-#include <asm/uaccess.h>
-#include <asm/pgtable.h>
-#include <asm/desc.h>
-
-static struct fs_struct init_fs = INIT_FS;
-static struct files_struct init_files = INIT_FILES;
-static struct signal_struct init_signals = INIT_SIGNALS(init_signals);
-static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
-struct mm_struct init_mm = INIT_MM(init_mm);
-
-EXPORT_SYMBOL(init_mm);
-
-/*
- * Initial thread structure.
- *
- * We need to make sure that this is THREAD_SIZE aligned due to the
- * way process stacks are handled. This is done by having a special
- * "init_task" linker map entry..
- */
-union thread_union init_thread_union 
-       __attribute__((__section__(".data.init_task"))) =
-               { INIT_THREAD_INFO(init_task) };
-
-/*
- * Initial task structure.
- *
- * All other task structs will be allocated on slabs in fork.c
- */
-struct task_struct init_task = INIT_TASK(init_task);
-
-EXPORT_SYMBOL(init_task);
-
-/*
- * per-CPU TSS segments. Threads are completely 'soft' on Linux,
- * no more per-task TSS's.
- */ 
-DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss) = INIT_TSS;
-
diff --git a/arch/i386/kernel/io_apic_32.c b/arch/i386/kernel/io_apic_32.c
deleted file mode 100644 (file)
index e2f4a1c..0000000
+++ /dev/null
@@ -1,2847 +0,0 @@
-/*
- *     Intel IO-APIC support for multi-Pentium hosts.
- *
- *     Copyright (C) 1997, 1998, 1999, 2000 Ingo Molnar, Hajnalka Szabo
- *
- *     Many thanks to Stig Venaas for trying out countless experimental
- *     patches and reporting/debugging problems patiently!
- *
- *     (c) 1999, Multiple IO-APIC support, developed by
- *     Ken-ichi Yaku <yaku@css1.kbnes.nec.co.jp> and
- *      Hidemi Kishimoto <kisimoto@css1.kbnes.nec.co.jp>,
- *     further tested and cleaned up by Zach Brown <zab@redhat.com>
- *     and Ingo Molnar <mingo@redhat.com>
- *
- *     Fixes
- *     Maciej W. Rozycki       :       Bits for genuine 82489DX APICs;
- *                                     thanks to Eric Gilmore
- *                                     and Rolf G. Tews
- *                                     for testing these extensively
- *     Paul Diefenbaugh        :       Added full ACPI support
- */
-
-#include <linux/mm.h>
-#include <linux/interrupt.h>
-#include <linux/init.h>
-#include <linux/delay.h>
-#include <linux/sched.h>
-#include <linux/mc146818rtc.h>
-#include <linux/compiler.h>
-#include <linux/acpi.h>
-#include <linux/module.h>
-#include <linux/sysdev.h>
-#include <linux/pci.h>
-#include <linux/msi.h>
-#include <linux/htirq.h>
-#include <linux/freezer.h>
-#include <linux/kthread.h>
-
-#include <asm/io.h>
-#include <asm/smp.h>
-#include <asm/desc.h>
-#include <asm/timer.h>
-#include <asm/i8259.h>
-#include <asm/nmi.h>
-#include <asm/msidef.h>
-#include <asm/hypertransport.h>
-
-#include <mach_apic.h>
-#include <mach_apicdef.h>
-
-#include "io_ports.h"
-
-int (*ioapic_renumber_irq)(int ioapic, int irq);
-atomic_t irq_mis_count;
-
-/* Where if anywhere is the i8259 connect in external int mode */
-static struct { int pin, apic; } ioapic_i8259 = { -1, -1 };
-
-static DEFINE_SPINLOCK(ioapic_lock);
-static DEFINE_SPINLOCK(vector_lock);
-
-int timer_over_8254 __initdata = 1;
-
-/*
- *     Is the SiS APIC rmw bug present ?
- *     -1 = don't know, 0 = no, 1 = yes
- */
-int sis_apic_bug = -1;
-
-/*
- * # of IRQ routing registers
- */
-int nr_ioapic_registers[MAX_IO_APICS];
-
-static int disable_timer_pin_1 __initdata;
-
-/*
- * Rough estimation of how many shared IRQs there are, can
- * be changed anytime.
- */
-#define MAX_PLUS_SHARED_IRQS NR_IRQS
-#define PIN_MAP_SIZE (MAX_PLUS_SHARED_IRQS + NR_IRQS)
-
-/*
- * This is performance-critical, we want to do it O(1)
- *
- * the indexing order of this array favors 1:1 mappings
- * between pins and IRQs.
- */
-
-static struct irq_pin_list {
-       int apic, pin, next;
-} irq_2_pin[PIN_MAP_SIZE];
-
-struct io_apic {
-       unsigned int index;
-       unsigned int unused[3];
-       unsigned int data;
-};
-
-static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx)
-{
-       return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx)
-               + (mp_ioapics[idx].mpc_apicaddr & ~PAGE_MASK);
-}
-
-static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg)
-{
-       struct io_apic __iomem *io_apic = io_apic_base(apic);
-       writel(reg, &io_apic->index);
-       return readl(&io_apic->data);
-}
-
-static inline void io_apic_write(unsigned int apic, unsigned int reg, unsigned int value)
-{
-       struct io_apic __iomem *io_apic = io_apic_base(apic);
-       writel(reg, &io_apic->index);
-       writel(value, &io_apic->data);
-}
-
-/*
- * Re-write a value: to be used for read-modify-write
- * cycles where the read already set up the index register.
- *
- * Older SiS APIC requires we rewrite the index register
- */
-static inline void io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value)
-{
-       volatile struct io_apic __iomem *io_apic = io_apic_base(apic);
-       if (sis_apic_bug)
-               writel(reg, &io_apic->index);
-       writel(value, &io_apic->data);
-}
-
-union entry_union {
-       struct { u32 w1, w2; };
-       struct IO_APIC_route_entry entry;
-};
-
-static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin)
-{
-       union entry_union eu;
-       unsigned long flags;
-       spin_lock_irqsave(&ioapic_lock, flags);
-       eu.w1 = io_apic_read(apic, 0x10 + 2 * pin);
-       eu.w2 = io_apic_read(apic, 0x11 + 2 * pin);
-       spin_unlock_irqrestore(&ioapic_lock, flags);
-       return eu.entry;
-}
-
-/*
- * When we write a new IO APIC routing entry, we need to write the high
- * word first! If the mask bit in the low word is clear, we will enable
- * the interrupt, and we need to make sure the entry is fully populated
- * before that happens.
- */
-static void
-__ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
-{
-       union entry_union eu;
-       eu.entry = e;
-       io_apic_write(apic, 0x11 + 2*pin, eu.w2);
-       io_apic_write(apic, 0x10 + 2*pin, eu.w1);
-}
-
-static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
-{
-       unsigned long flags;
-       spin_lock_irqsave(&ioapic_lock, flags);
-       __ioapic_write_entry(apic, pin, e);
-       spin_unlock_irqrestore(&ioapic_lock, flags);
-}
-
-/*
- * When we mask an IO APIC routing entry, we need to write the low
- * word first, in order to set the mask bit before we change the
- * high bits!
- */
-static void ioapic_mask_entry(int apic, int pin)
-{
-       unsigned long flags;
-       union entry_union eu = { .entry.mask = 1 };
-
-       spin_lock_irqsave(&ioapic_lock, flags);
-       io_apic_write(apic, 0x10 + 2*pin, eu.w1);
-       io_apic_write(apic, 0x11 + 2*pin, eu.w2);
-       spin_unlock_irqrestore(&ioapic_lock, flags);
-}
-
-/*
- * The common case is 1:1 IRQ<->pin mappings. Sometimes there are
- * shared ISA-space IRQs, so we have to support them. We are super
- * fast in the common case, and fast for shared ISA-space IRQs.
- */
-static void add_pin_to_irq(unsigned int irq, int apic, int pin)
-{
-       static int first_free_entry = NR_IRQS;
-       struct irq_pin_list *entry = irq_2_pin + irq;
-
-       while (entry->next)
-               entry = irq_2_pin + entry->next;
-
-       if (entry->pin != -1) {
-               entry->next = first_free_entry;
-               entry = irq_2_pin + entry->next;
-               if (++first_free_entry >= PIN_MAP_SIZE)
-                       panic("io_apic.c: whoops");
-       }
-       entry->apic = apic;
-       entry->pin = pin;
-}
-
-/*
- * Reroute an IRQ to a different pin.
- */
-static void __init replace_pin_at_irq(unsigned int irq,
-                                     int oldapic, int oldpin,
-                                     int newapic, int newpin)
-{
-       struct irq_pin_list *entry = irq_2_pin + irq;
-
-       while (1) {
-               if (entry->apic == oldapic && entry->pin == oldpin) {
-                       entry->apic = newapic;
-                       entry->pin = newpin;
-               }
-               if (!entry->next)
-                       break;
-               entry = irq_2_pin + entry->next;
-       }
-}
-
-static void __modify_IO_APIC_irq (unsigned int irq, unsigned long enable, unsigned long disable)
-{
-       struct irq_pin_list *entry = irq_2_pin + irq;
-       unsigned int pin, reg;
-
-       for (;;) {
-               pin = entry->pin;
-               if (pin == -1)
-                       break;
-               reg = io_apic_read(entry->apic, 0x10 + pin*2);
-               reg &= ~disable;
-               reg |= enable;
-               io_apic_modify(entry->apic, 0x10 + pin*2, reg);
-               if (!entry->next)
-                       break;
-               entry = irq_2_pin + entry->next;
-       }
-}
-
-/* mask = 1 */
-static void __mask_IO_APIC_irq (unsigned int irq)
-{
-       __modify_IO_APIC_irq(irq, 0x00010000, 0);
-}
-
-/* mask = 0 */
-static void __unmask_IO_APIC_irq (unsigned int irq)
-{
-       __modify_IO_APIC_irq(irq, 0, 0x00010000);
-}
-
-/* mask = 1, trigger = 0 */
-static void __mask_and_edge_IO_APIC_irq (unsigned int irq)
-{
-       __modify_IO_APIC_irq(irq, 0x00010000, 0x00008000);
-}
-
-/* mask = 0, trigger = 1 */
-static void __unmask_and_level_IO_APIC_irq (unsigned int irq)
-{
-       __modify_IO_APIC_irq(irq, 0x00008000, 0x00010000);
-}
-
-static void mask_IO_APIC_irq (unsigned int irq)
-{
-       unsigned long flags;
-
-       spin_lock_irqsave(&ioapic_lock, flags);
-       __mask_IO_APIC_irq(irq);
-       spin_unlock_irqrestore(&ioapic_lock, flags);
-}
-
-static void unmask_IO_APIC_irq (unsigned int irq)
-{
-       unsigned long flags;
-
-       spin_lock_irqsave(&ioapic_lock, flags);
-       __unmask_IO_APIC_irq(irq);
-       spin_unlock_irqrestore(&ioapic_lock, flags);
-}
-
-static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
-{
-       struct IO_APIC_route_entry entry;
-       
-       /* Check delivery_mode to be sure we're not clearing an SMI pin */
-       entry = ioapic_read_entry(apic, pin);
-       if (entry.delivery_mode == dest_SMI)
-               return;
-
-       /*
-        * Disable it in the IO-APIC irq-routing table:
-        */
-       ioapic_mask_entry(apic, pin);
-}
-
-static void clear_IO_APIC (void)
-{
-       int apic, pin;
-
-       for (apic = 0; apic < nr_ioapics; apic++)
-               for (pin = 0; pin < nr_ioapic_registers[apic]; pin++)
-                       clear_IO_APIC_pin(apic, pin);
-}
-
-#ifdef CONFIG_SMP
-static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t cpumask)
-{
-       unsigned long flags;
-       int pin;
-       struct irq_pin_list *entry = irq_2_pin + irq;
-       unsigned int apicid_value;
-       cpumask_t tmp;
-       
-       cpus_and(tmp, cpumask, cpu_online_map);
-       if (cpus_empty(tmp))
-               tmp = TARGET_CPUS;
-
-       cpus_and(cpumask, tmp, CPU_MASK_ALL);
-
-       apicid_value = cpu_mask_to_apicid(cpumask);
-       /* Prepare to do the io_apic_write */
-       apicid_value = apicid_value << 24;
-       spin_lock_irqsave(&ioapic_lock, flags);
-       for (;;) {
-               pin = entry->pin;
-               if (pin == -1)
-                       break;
-               io_apic_write(entry->apic, 0x10 + 1 + pin*2, apicid_value);
-               if (!entry->next)
-                       break;
-               entry = irq_2_pin + entry->next;
-       }
-       irq_desc[irq].affinity = cpumask;
-       spin_unlock_irqrestore(&ioapic_lock, flags);
-}
-
-#if defined(CONFIG_IRQBALANCE)
-# include <asm/processor.h>    /* kernel_thread() */
-# include <linux/kernel_stat.h>        /* kstat */
-# include <linux/slab.h>               /* kmalloc() */
-# include <linux/timer.h>      /* time_after() */
-#define IRQBALANCE_CHECK_ARCH -999
-#define MAX_BALANCED_IRQ_INTERVAL      (5*HZ)
-#define MIN_BALANCED_IRQ_INTERVAL      (HZ/2)
-#define BALANCED_IRQ_MORE_DELTA                (HZ/10)
-#define BALANCED_IRQ_LESS_DELTA                (HZ)
-
-static int irqbalance_disabled __read_mostly = IRQBALANCE_CHECK_ARCH;
-static int physical_balance __read_mostly;
-static long balanced_irq_interval __read_mostly = MAX_BALANCED_IRQ_INTERVAL;
-
-static struct irq_cpu_info {
-       unsigned long * last_irq;
-       unsigned long * irq_delta;
-       unsigned long irq;
-} irq_cpu_data[NR_CPUS];
-
-#define CPU_IRQ(cpu)           (irq_cpu_data[cpu].irq)
-#define LAST_CPU_IRQ(cpu,irq)   (irq_cpu_data[cpu].last_irq[irq])
-#define IRQ_DELTA(cpu,irq)     (irq_cpu_data[cpu].irq_delta[irq])
-
-#define IDLE_ENOUGH(cpu,now) \
-       (idle_cpu(cpu) && ((now) - per_cpu(irq_stat, (cpu)).idle_timestamp > 1))
-
-#define IRQ_ALLOWED(cpu, allowed_mask) cpu_isset(cpu, allowed_mask)
-
-#define CPU_TO_PACKAGEINDEX(i) (first_cpu(cpu_sibling_map[i]))
-
-static cpumask_t balance_irq_affinity[NR_IRQS] = {
-       [0 ... NR_IRQS-1] = CPU_MASK_ALL
-};
-
-void set_balance_irq_affinity(unsigned int irq, cpumask_t mask)
-{
-       balance_irq_affinity[irq] = mask;
-}
-
-static unsigned long move(int curr_cpu, cpumask_t allowed_mask,
-                       unsigned long now, int direction)
-{
-       int search_idle = 1;
-       int cpu = curr_cpu;
-
-       goto inside;
-
-       do {
-               if (unlikely(cpu == curr_cpu))
-                       search_idle = 0;
-inside:
-               if (direction == 1) {
-                       cpu++;
-                       if (cpu >= NR_CPUS)
-                               cpu = 0;
-               } else {
-                       cpu--;
-                       if (cpu == -1)
-                               cpu = NR_CPUS-1;
-               }
-       } while (!cpu_online(cpu) || !IRQ_ALLOWED(cpu,allowed_mask) ||
-                       (search_idle && !IDLE_ENOUGH(cpu,now)));
-
-       return cpu;
-}
-
-static inline void balance_irq(int cpu, int irq)
-{
-       unsigned long now = jiffies;
-       cpumask_t allowed_mask;
-       unsigned int new_cpu;
-               
-       if (irqbalance_disabled)
-               return; 
-
-       cpus_and(allowed_mask, cpu_online_map, balance_irq_affinity[irq]);
-       new_cpu = move(cpu, allowed_mask, now, 1);
-       if (cpu != new_cpu) {
-               set_pending_irq(irq, cpumask_of_cpu(new_cpu));
-       }
-}
-
-static inline void rotate_irqs_among_cpus(unsigned long useful_load_threshold)
-{
-       int i, j;
-
-       for_each_online_cpu(i) {
-               for (j = 0; j < NR_IRQS; j++) {
-                       if (!irq_desc[j].action)
-                               continue;
-                       /* Is it a significant load ?  */
-                       if (IRQ_DELTA(CPU_TO_PACKAGEINDEX(i),j) <
-                                               useful_load_threshold)
-                               continue;
-                       balance_irq(i, j);
-               }
-       }
-       balanced_irq_interval = max((long)MIN_BALANCED_IRQ_INTERVAL,
-               balanced_irq_interval - BALANCED_IRQ_LESS_DELTA);       
-       return;
-}
-
-static void do_irq_balance(void)
-{
-       int i, j;
-       unsigned long max_cpu_irq = 0, min_cpu_irq = (~0);
-       unsigned long move_this_load = 0;
-       int max_loaded = 0, min_loaded = 0;
-       int load;
-       unsigned long useful_load_threshold = balanced_irq_interval + 10;
-       int selected_irq;
-       int tmp_loaded, first_attempt = 1;
-       unsigned long tmp_cpu_irq;
-       unsigned long imbalance = 0;
-       cpumask_t allowed_mask, target_cpu_mask, tmp;
-
-       for_each_possible_cpu(i) {
-               int package_index;
-               CPU_IRQ(i) = 0;
-               if (!cpu_online(i))
-                       continue;
-               package_index = CPU_TO_PACKAGEINDEX(i);
-               for (j = 0; j < NR_IRQS; j++) {
-                       unsigned long value_now, delta;
-                       /* Is this an active IRQ or balancing disabled ? */
-                       if (!irq_desc[j].action || irq_balancing_disabled(j))
-                               continue;
-                       if ( package_index == i )
-                               IRQ_DELTA(package_index,j) = 0;
-                       /* Determine the total count per processor per IRQ */
-                       value_now = (unsigned long) kstat_cpu(i).irqs[j];
-
-                       /* Determine the activity per processor per IRQ */
-                       delta = value_now - LAST_CPU_IRQ(i,j);
-
-                       /* Update last_cpu_irq[][] for the next time */
-                       LAST_CPU_IRQ(i,j) = value_now;
-
-                       /* Ignore IRQs whose rate is less than the clock */
-                       if (delta < useful_load_threshold)
-                               continue;
-                       /* update the load for the processor or package total */
-                       IRQ_DELTA(package_index,j) += delta;
-
-                       /* Keep track of the higher numbered sibling as well */
-                       if (i != package_index)
-                               CPU_IRQ(i) += delta;
-                       /*
-                        * We have sibling A and sibling B in the package
-                        *
-                        * cpu_irq[A] = load for cpu A + load for cpu B
-                        * cpu_irq[B] = load for cpu B
-                        */
-                       CPU_IRQ(package_index) += delta;
-               }
-       }
-       /* Find the least loaded processor package */
-       for_each_online_cpu(i) {
-               if (i != CPU_TO_PACKAGEINDEX(i))
-                       continue;
-               if (min_cpu_irq > CPU_IRQ(i)) {
-                       min_cpu_irq = CPU_IRQ(i);
-                       min_loaded = i;
-               }
-       }
-       max_cpu_irq = ULONG_MAX;
-
-tryanothercpu:
-       /* Look for heaviest loaded processor.
-        * We may come back to get the next heaviest loaded processor.
-        * Skip processors with trivial loads.
-        */
-       tmp_cpu_irq = 0;
-       tmp_loaded = -1;
-       for_each_online_cpu(i) {
-               if (i != CPU_TO_PACKAGEINDEX(i))
-                       continue;
-               if (max_cpu_irq <= CPU_IRQ(i)) 
-                       continue;
-               if (tmp_cpu_irq < CPU_IRQ(i)) {
-                       tmp_cpu_irq = CPU_IRQ(i);
-                       tmp_loaded = i;
-               }
-       }
-
-       if (tmp_loaded == -1) {
-        /* In the case of small number of heavy interrupt sources, 
-         * loading some of the cpus too much. We use Ingo's original 
-         * approach to rotate them around.
-         */
-               if (!first_attempt && imbalance >= useful_load_threshold) {
-                       rotate_irqs_among_cpus(useful_load_threshold);
-                       return;
-               }
-               goto not_worth_the_effort;
-       }
-       
-       first_attempt = 0;              /* heaviest search */
-       max_cpu_irq = tmp_cpu_irq;      /* load */
-       max_loaded = tmp_loaded;        /* processor */
-       imbalance = (max_cpu_irq - min_cpu_irq) / 2;
-       
-       /* if imbalance is less than approx 10% of max load, then
-        * observe diminishing returns action. - quit
-        */
-       if (imbalance < (max_cpu_irq >> 3))
-               goto not_worth_the_effort;
-
-tryanotherirq:
-       /* if we select an IRQ to move that can't go where we want, then
-        * see if there is another one to try.
-        */
-       move_this_load = 0;
-       selected_irq = -1;
-       for (j = 0; j < NR_IRQS; j++) {
-               /* Is this an active IRQ? */
-               if (!irq_desc[j].action)
-                       continue;
-               if (imbalance <= IRQ_DELTA(max_loaded,j))
-                       continue;
-               /* Try to find the IRQ that is closest to the imbalance
-                * without going over.
-                */
-               if (move_this_load < IRQ_DELTA(max_loaded,j)) {
-                       move_this_load = IRQ_DELTA(max_loaded,j);
-                       selected_irq = j;
-               }
-       }
-       if (selected_irq == -1) {
-               goto tryanothercpu;
-       }
-
-       imbalance = move_this_load;
-       
-       /* For physical_balance case, we accumlated both load
-        * values in the one of the siblings cpu_irq[],
-        * to use the same code for physical and logical processors
-        * as much as possible. 
-        *
-        * NOTE: the cpu_irq[] array holds the sum of the load for
-        * sibling A and sibling B in the slot for the lowest numbered
-        * sibling (A), _AND_ the load for sibling B in the slot for
-        * the higher numbered sibling.
-        *
-        * We seek the least loaded sibling by making the comparison
-        * (A+B)/2 vs B
-        */
-       load = CPU_IRQ(min_loaded) >> 1;
-       for_each_cpu_mask(j, cpu_sibling_map[min_loaded]) {
-               if (load > CPU_IRQ(j)) {
-                       /* This won't change cpu_sibling_map[min_loaded] */
-                       load = CPU_IRQ(j);
-                       min_loaded = j;
-               }
-       }
-
-       cpus_and(allowed_mask,
-               cpu_online_map,
-               balance_irq_affinity[selected_irq]);
-       target_cpu_mask = cpumask_of_cpu(min_loaded);
-       cpus_and(tmp, target_cpu_mask, allowed_mask);
-
-       if (!cpus_empty(tmp)) {
-               /* mark for change destination */
-               set_pending_irq(selected_irq, cpumask_of_cpu(min_loaded));
-
-               /* Since we made a change, come back sooner to 
-                * check for more variation.
-                */
-               balanced_irq_interval = max((long)MIN_BALANCED_IRQ_INTERVAL,
-                       balanced_irq_interval - BALANCED_IRQ_LESS_DELTA);       
-               return;
-       }
-       goto tryanotherirq;
-
-not_worth_the_effort:
-       /*
-        * if we did not find an IRQ to move, then adjust the time interval
-        * upward
-        */
-       balanced_irq_interval = min((long)MAX_BALANCED_IRQ_INTERVAL,
-               balanced_irq_interval + BALANCED_IRQ_MORE_DELTA);       
-       return;
-}
-
-static int balanced_irq(void *unused)
-{
-       int i;
-       unsigned long prev_balance_time = jiffies;
-       long time_remaining = balanced_irq_interval;
-
-       /* push everything to CPU 0 to give us a starting point.  */
-       for (i = 0 ; i < NR_IRQS ; i++) {
-               irq_desc[i].pending_mask = cpumask_of_cpu(0);
-               set_pending_irq(i, cpumask_of_cpu(0));
-       }
-
-       set_freezable();
-       for ( ; ; ) {
-               time_remaining = schedule_timeout_interruptible(time_remaining);
-               try_to_freeze();
-               if (time_after(jiffies,
-                               prev_balance_time+balanced_irq_interval)) {
-                       preempt_disable();
-                       do_irq_balance();
-                       prev_balance_time = jiffies;
-                       time_remaining = balanced_irq_interval;
-                       preempt_enable();
-               }
-       }
-       return 0;
-}
-
-static int __init balanced_irq_init(void)
-{
-       int i;
-       struct cpuinfo_x86 *c;
-       cpumask_t tmp;
-
-       cpus_shift_right(tmp, cpu_online_map, 2);
-        c = &boot_cpu_data;
-       /* When not overwritten by the command line ask subarchitecture. */
-       if (irqbalance_disabled == IRQBALANCE_CHECK_ARCH)
-               irqbalance_disabled = NO_BALANCE_IRQ;
-       if (irqbalance_disabled)
-               return 0;
-       
-        /* disable irqbalance completely if there is only one processor online */
-       if (num_online_cpus() < 2) {
-               irqbalance_disabled = 1;
-               return 0;
-       }
-       /*
-        * Enable physical balance only if more than 1 physical processor
-        * is present
-        */
-       if (smp_num_siblings > 1 && !cpus_empty(tmp))
-               physical_balance = 1;
-
-       for_each_online_cpu(i) {
-               irq_cpu_data[i].irq_delta = kmalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL);
-               irq_cpu_data[i].last_irq = kmalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL);
-               if (irq_cpu_data[i].irq_delta == NULL || irq_cpu_data[i].last_irq == NULL) {
-                       printk(KERN_ERR "balanced_irq_init: out of memory");
-                       goto failed;
-               }
-               memset(irq_cpu_data[i].irq_delta,0,sizeof(unsigned long) * NR_IRQS);
-               memset(irq_cpu_data[i].last_irq,0,sizeof(unsigned long) * NR_IRQS);
-       }
-       
-       printk(KERN_INFO "Starting balanced_irq\n");
-       if (!IS_ERR(kthread_run(balanced_irq, NULL, "kirqd")))
-               return 0;
-       printk(KERN_ERR "balanced_irq_init: failed to spawn balanced_irq");
-failed:
-       for_each_possible_cpu(i) {
-               kfree(irq_cpu_data[i].irq_delta);
-               irq_cpu_data[i].irq_delta = NULL;
-               kfree(irq_cpu_data[i].last_irq);
-               irq_cpu_data[i].last_irq = NULL;
-       }
-       return 0;
-}
-
-int __devinit irqbalance_disable(char *str)
-{
-       irqbalance_disabled = 1;
-       return 1;
-}
-
-__setup("noirqbalance", irqbalance_disable);
-
-late_initcall(balanced_irq_init);
-#endif /* CONFIG_IRQBALANCE */
-#endif /* CONFIG_SMP */
-
-#ifndef CONFIG_SMP
-void fastcall send_IPI_self(int vector)
-{
-       unsigned int cfg;
-
-       /*
-        * Wait for idle.
-        */
-       apic_wait_icr_idle();
-       cfg = APIC_DM_FIXED | APIC_DEST_SELF | vector | APIC_DEST_LOGICAL;
-       /*
-        * Send the IPI. The write to APIC_ICR fires this off.
-        */
-       apic_write_around(APIC_ICR, cfg);
-}
-#endif /* !CONFIG_SMP */
-
-
-/*
- * support for broken MP BIOSs, enables hand-redirection of PIRQ0-7 to
- * specific CPU-side IRQs.
- */
-
-#define MAX_PIRQS 8
-static int pirq_entries [MAX_PIRQS];
-static int pirqs_enabled;
-int skip_ioapic_setup;
-
-static int __init ioapic_pirq_setup(char *str)
-{
-       int i, max;
-       int ints[MAX_PIRQS+1];
-
-       get_options(str, ARRAY_SIZE(ints), ints);
-
-       for (i = 0; i < MAX_PIRQS; i++)
-               pirq_entries[i] = -1;
-
-       pirqs_enabled = 1;
-       apic_printk(APIC_VERBOSE, KERN_INFO
-                       "PIRQ redirection, working around broken MP-BIOS.\n");
-       max = MAX_PIRQS;
-       if (ints[0] < MAX_PIRQS)
-               max = ints[0];
-
-       for (i = 0; i < max; i++) {
-               apic_printk(APIC_VERBOSE, KERN_DEBUG
-                               "... PIRQ%d -> IRQ %d\n", i, ints[i+1]);
-               /*
-                * PIRQs are mapped upside down, usually.
-                */
-               pirq_entries[MAX_PIRQS-i-1] = ints[i+1];
-       }
-       return 1;
-}
-
-__setup("pirq=", ioapic_pirq_setup);
-
-/*
- * Find the IRQ entry number of a certain pin.
- */
-static int find_irq_entry(int apic, int pin, int type)
-{
-       int i;
-
-       for (i = 0; i < mp_irq_entries; i++)
-               if (mp_irqs[i].mpc_irqtype == type &&
-                   (mp_irqs[i].mpc_dstapic == mp_ioapics[apic].mpc_apicid ||
-                    mp_irqs[i].mpc_dstapic == MP_APIC_ALL) &&
-                   mp_irqs[i].mpc_dstirq == pin)
-                       return i;
-
-       return -1;
-}
-
-/*
- * Find the pin to which IRQ[irq] (ISA) is connected
- */
-static int __init find_isa_irq_pin(int irq, int type)
-{
-       int i;
-
-       for (i = 0; i < mp_irq_entries; i++) {
-               int lbus = mp_irqs[i].mpc_srcbus;
-
-               if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA ||
-                    mp_bus_id_to_type[lbus] == MP_BUS_EISA ||
-                    mp_bus_id_to_type[lbus] == MP_BUS_MCA
-                   ) &&
-                   (mp_irqs[i].mpc_irqtype == type) &&
-                   (mp_irqs[i].mpc_srcbusirq == irq))
-
-                       return mp_irqs[i].mpc_dstirq;
-       }
-       return -1;
-}
-
-static int __init find_isa_irq_apic(int irq, int type)
-{
-       int i;
-
-       for (i = 0; i < mp_irq_entries; i++) {
-               int lbus = mp_irqs[i].mpc_srcbus;
-
-               if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA ||
-                    mp_bus_id_to_type[lbus] == MP_BUS_EISA ||
-                    mp_bus_id_to_type[lbus] == MP_BUS_MCA
-                   ) &&
-                   (mp_irqs[i].mpc_irqtype == type) &&
-                   (mp_irqs[i].mpc_srcbusirq == irq))
-                       break;
-       }
-       if (i < mp_irq_entries) {
-               int apic;
-               for(apic = 0; apic < nr_ioapics; apic++) {
-                       if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic)
-                               return apic;
-               }
-       }
-
-       return -1;
-}
-
-/*
- * Find a specific PCI IRQ entry.
- * Not an __init, possibly needed by modules
- */
-static int pin_2_irq(int idx, int apic, int pin);
-
-int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin)
-{
-       int apic, i, best_guess = -1;
-
-       apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, "
-               "slot:%d, pin:%d.\n", bus, slot, pin);
-       if (mp_bus_id_to_pci_bus[bus] == -1) {
-               printk(KERN_WARNING "PCI BIOS passed nonexistent PCI bus %d!\n", bus);
-               return -1;
-       }
-       for (i = 0; i < mp_irq_entries; i++) {
-               int lbus = mp_irqs[i].mpc_srcbus;
-
-               for (apic = 0; apic < nr_ioapics; apic++)
-                       if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic ||
-                           mp_irqs[i].mpc_dstapic == MP_APIC_ALL)
-                               break;
-
-               if ((mp_bus_id_to_type[lbus] == MP_BUS_PCI) &&
-                   !mp_irqs[i].mpc_irqtype &&
-                   (bus == lbus) &&
-                   (slot == ((mp_irqs[i].mpc_srcbusirq >> 2) & 0x1f))) {
-                       int irq = pin_2_irq(i,apic,mp_irqs[i].mpc_dstirq);
-
-                       if (!(apic || IO_APIC_IRQ(irq)))
-                               continue;
-
-                       if (pin == (mp_irqs[i].mpc_srcbusirq & 3))
-                               return irq;
-                       /*
-                        * Use the first all-but-pin matching entry as a
-                        * best-guess fuzzy result for broken mptables.
-                        */
-                       if (best_guess < 0)
-                               best_guess = irq;
-               }
-       }
-       return best_guess;
-}
-EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector);
-
-/*
- * This function currently is only a helper for the i386 smp boot process where 
- * we need to reprogram the ioredtbls to cater for the cpus which have come online
- * so mask in all cases should simply be TARGET_CPUS
- */
-#ifdef CONFIG_SMP
-void __init setup_ioapic_dest(void)
-{
-       int pin, ioapic, irq, irq_entry;
-
-       if (skip_ioapic_setup == 1)
-               return;
-
-       for (ioapic = 0; ioapic < nr_ioapics; ioapic++) {
-               for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) {
-                       irq_entry = find_irq_entry(ioapic, pin, mp_INT);
-                       if (irq_entry == -1)
-                               continue;
-                       irq = pin_2_irq(irq_entry, ioapic, pin);
-                       set_ioapic_affinity_irq(irq, TARGET_CPUS);
-               }
-
-       }
-}
-#endif
-
-/*
- * EISA Edge/Level control register, ELCR
- */
-static int EISA_ELCR(unsigned int irq)
-{
-       if (irq < 16) {
-               unsigned int port = 0x4d0 + (irq >> 3);
-               return (inb(port) >> (irq & 7)) & 1;
-       }
-       apic_printk(APIC_VERBOSE, KERN_INFO
-                       "Broken MPtable reports ISA irq %d\n", irq);
-       return 0;
-}
-
-/* EISA interrupts are always polarity zero and can be edge or level
- * trigger depending on the ELCR value.  If an interrupt is listed as
- * EISA conforming in the MP table, that means its trigger type must
- * be read in from the ELCR */
-
-#define default_EISA_trigger(idx)      (EISA_ELCR(mp_irqs[idx].mpc_srcbusirq))
-#define default_EISA_polarity(idx)     (0)
-
-/* ISA interrupts are always polarity zero edge triggered,
- * when listed as conforming in the MP table. */
-
-#define default_ISA_trigger(idx)       (0)
-#define default_ISA_polarity(idx)      (0)
-
-/* PCI interrupts are always polarity one level triggered,
- * when listed as conforming in the MP table. */
-
-#define default_PCI_trigger(idx)       (1)
-#define default_PCI_polarity(idx)      (1)
-
-/* MCA interrupts are always polarity zero level triggered,
- * when listed as conforming in the MP table. */
-
-#define default_MCA_trigger(idx)       (1)
-#define default_MCA_polarity(idx)      (0)
-
-static int __init MPBIOS_polarity(int idx)
-{
-       int bus = mp_irqs[idx].mpc_srcbus;
-       int polarity;
-
-       /*
-        * Determine IRQ line polarity (high active or low active):
-        */
-       switch (mp_irqs[idx].mpc_irqflag & 3)
-       {
-               case 0: /* conforms, ie. bus-type dependent polarity */
-               {
-                       switch (mp_bus_id_to_type[bus])
-                       {
-                               case MP_BUS_ISA: /* ISA pin */
-                               {
-                                       polarity = default_ISA_polarity(idx);
-                                       break;
-                               }
-                               case MP_BUS_EISA: /* EISA pin */
-                               {
-                                       polarity = default_EISA_polarity(idx);
-                                       break;
-                               }
-                               case MP_BUS_PCI: /* PCI pin */
-                               {
-                                       polarity = default_PCI_polarity(idx);
-                                       break;
-                               }
-                               case MP_BUS_MCA: /* MCA pin */
-                               {
-                                       polarity = default_MCA_polarity(idx);
-                                       break;
-                               }
-                               default:
-                               {
-                                       printk(KERN_WARNING "broken BIOS!!\n");
-                                       polarity = 1;
-                                       break;
-                               }
-                       }
-                       break;
-               }
-               case 1: /* high active */
-               {
-                       polarity = 0;
-                       break;
-               }
-               case 2: /* reserved */
-               {
-                       printk(KERN_WARNING "broken BIOS!!\n");
-                       polarity = 1;
-                       break;
-               }
-               case 3: /* low active */
-               {
-                       polarity = 1;
-                       break;
-               }
-               default: /* invalid */
-               {
-                       printk(KERN_WARNING "broken BIOS!!\n");
-                       polarity = 1;
-                       break;
-               }
-       }
-       return polarity;
-}
-
-static int MPBIOS_trigger(int idx)
-{
-       int bus = mp_irqs[idx].mpc_srcbus;
-       int trigger;
-
-       /*
-        * Determine IRQ trigger mode (edge or level sensitive):
-        */
-       switch ((mp_irqs[idx].mpc_irqflag>>2) & 3)
-       {
-               case 0: /* conforms, ie. bus-type dependent */
-               {
-                       switch (mp_bus_id_to_type[bus])
-                       {
-                               case MP_BUS_ISA: /* ISA pin */
-                               {
-                                       trigger = default_ISA_trigger(idx);
-                                       break;
-                               }
-                               case MP_BUS_EISA: /* EISA pin */
-                               {
-                                       trigger = default_EISA_trigger(idx);
-                                       break;
-                               }
-                               case MP_BUS_PCI: /* PCI pin */
-                               {
-                                       trigger = default_PCI_trigger(idx);
-                                       break;
-                               }
-                               case MP_BUS_MCA: /* MCA pin */
-                               {
-                                       trigger = default_MCA_trigger(idx);
-                                       break;
-                               }
-                               default:
-                               {
-                                       printk(KERN_WARNING "broken BIOS!!\n");
-                                       trigger = 1;
-                                       break;
-                               }
-                       }
-                       break;
-               }
-               case 1: /* edge */
-               {
-                       trigger = 0;
-                       break;
-               }
-               case 2: /* reserved */
-               {
-                       printk(KERN_WARNING "broken BIOS!!\n");
-                       trigger = 1;
-                       break;
-               }
-               case 3: /* level */
-               {
-                       trigger = 1;
-                       break;
-               }
-               default: /* invalid */
-               {
-                       printk(KERN_WARNING "broken BIOS!!\n");
-                       trigger = 0;
-                       break;
-               }
-       }
-       return trigger;
-}
-
-static inline int irq_polarity(int idx)
-{
-       return MPBIOS_polarity(idx);
-}
-
-static inline int irq_trigger(int idx)
-{
-       return MPBIOS_trigger(idx);
-}
-
-static int pin_2_irq(int idx, int apic, int pin)
-{
-       int irq, i;
-       int bus = mp_irqs[idx].mpc_srcbus;
-
-       /*
-        * Debugging check, we are in big trouble if this message pops up!
-        */
-       if (mp_irqs[idx].mpc_dstirq != pin)
-               printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n");
-
-       switch (mp_bus_id_to_type[bus])
-       {
-               case MP_BUS_ISA: /* ISA pin */
-               case MP_BUS_EISA:
-               case MP_BUS_MCA:
-               {
-                       irq = mp_irqs[idx].mpc_srcbusirq;
-                       break;
-               }
-               case MP_BUS_PCI: /* PCI pin */
-               {
-                       /*
-                        * PCI IRQs are mapped in order
-                        */
-                       i = irq = 0;
-                       while (i < apic)
-                               irq += nr_ioapic_registers[i++];
-                       irq += pin;
-
-                       /*
-                        * For MPS mode, so far only needed by ES7000 platform
-                        */
-                       if (ioapic_renumber_irq)
-                               irq = ioapic_renumber_irq(apic, irq);
-
-                       break;
-               }
-               default:
-               {
-                       printk(KERN_ERR "unknown bus type %d.\n",bus); 
-                       irq = 0;
-                       break;
-               }
-       }
-
-       /*
-        * PCI IRQ command line redirection. Yes, limits are hardcoded.
-        */
-       if ((pin >= 16) && (pin <= 23)) {
-               if (pirq_entries[pin-16] != -1) {
-                       if (!pirq_entries[pin-16]) {
-                               apic_printk(APIC_VERBOSE, KERN_DEBUG
-                                               "disabling PIRQ%d\n", pin-16);
-                       } else {
-                               irq = pirq_entries[pin-16];
-                               apic_printk(APIC_VERBOSE, KERN_DEBUG
-                                               "using PIRQ%d -> IRQ %d\n",
-                                               pin-16, irq);
-                       }
-               }
-       }
-       return irq;
-}
-
-static inline int IO_APIC_irq_trigger(int irq)
-{
-       int apic, idx, pin;
-
-       for (apic = 0; apic < nr_ioapics; apic++) {
-               for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
-                       idx = find_irq_entry(apic,pin,mp_INT);
-                       if ((idx != -1) && (irq == pin_2_irq(idx,apic,pin)))
-                               return irq_trigger(idx);
-               }
-       }
-       /*
-        * nonexistent IRQs are edge default
-        */
-       return 0;
-}
-
-/* irq_vectors is indexed by the sum of all RTEs in all I/O APICs. */
-static u8 irq_vector[NR_IRQ_VECTORS] __read_mostly = { FIRST_DEVICE_VECTOR , 0 };
-
-static int __assign_irq_vector(int irq)
-{
-       static int current_vector = FIRST_DEVICE_VECTOR, current_offset = 0;
-       int vector, offset, i;
-
-       BUG_ON((unsigned)irq >= NR_IRQ_VECTORS);
-
-       if (irq_vector[irq] > 0)
-               return irq_vector[irq];
-
-       vector = current_vector;
-       offset = current_offset;
-next:
-       vector += 8;
-       if (vector >= FIRST_SYSTEM_VECTOR) {
-               offset = (offset + 1) % 8;
-               vector = FIRST_DEVICE_VECTOR + offset;
-       }
-       if (vector == current_vector)
-               return -ENOSPC;
-       if (vector == SYSCALL_VECTOR)
-               goto next;
-       for (i = 0; i < NR_IRQ_VECTORS; i++)
-               if (irq_vector[i] == vector)
-                       goto next;
-
-       current_vector = vector;
-       current_offset = offset;
-       irq_vector[irq] = vector;
-
-       return vector;
-}
-
-static int assign_irq_vector(int irq)
-{
-       unsigned long flags;
-       int vector;
-
-       spin_lock_irqsave(&vector_lock, flags);
-       vector = __assign_irq_vector(irq);
-       spin_unlock_irqrestore(&vector_lock, flags);
-
-       return vector;
-}
-static struct irq_chip ioapic_chip;
-
-#define IOAPIC_AUTO    -1
-#define IOAPIC_EDGE    0
-#define IOAPIC_LEVEL   1
-
-static void ioapic_register_intr(int irq, int vector, unsigned long trigger)
-{
-       if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
-           trigger == IOAPIC_LEVEL) {
-               irq_desc[irq].status |= IRQ_LEVEL;
-               set_irq_chip_and_handler_name(irq, &ioapic_chip,
-                                        handle_fasteoi_irq, "fasteoi");
-       } else {
-               irq_desc[irq].status &= ~IRQ_LEVEL;
-               set_irq_chip_and_handler_name(irq, &ioapic_chip,
-                                        handle_edge_irq, "edge");
-       }
-       set_intr_gate(vector, interrupt[irq]);
-}
-
-static void __init setup_IO_APIC_irqs(void)
-{
-       struct IO_APIC_route_entry entry;
-       int apic, pin, idx, irq, first_notcon = 1, vector;
-       unsigned long flags;
-
-       apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
-
-       for (apic = 0; apic < nr_ioapics; apic++) {
-       for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
-
-               /*
-                * add it to the IO-APIC irq-routing table:
-                */
-               memset(&entry,0,sizeof(entry));
-
-               entry.delivery_mode = INT_DELIVERY_MODE;
-               entry.dest_mode = INT_DEST_MODE;
-               entry.mask = 0;                         /* enable IRQ */
-               entry.dest.logical.logical_dest = 
-                                       cpu_mask_to_apicid(TARGET_CPUS);
-
-               idx = find_irq_entry(apic,pin,mp_INT);
-               if (idx == -1) {
-                       if (first_notcon) {
-                               apic_printk(APIC_VERBOSE, KERN_DEBUG
-                                               " IO-APIC (apicid-pin) %d-%d",
-                                               mp_ioapics[apic].mpc_apicid,
-                                               pin);
-                               first_notcon = 0;
-                       } else
-                               apic_printk(APIC_VERBOSE, ", %d-%d",
-                                       mp_ioapics[apic].mpc_apicid, pin);
-                       continue;
-               }
-
-               entry.trigger = irq_trigger(idx);
-               entry.polarity = irq_polarity(idx);
-
-               if (irq_trigger(idx)) {
-                       entry.trigger = 1;
-                       entry.mask = 1;
-               }
-
-               irq = pin_2_irq(idx, apic, pin);
-               /*
-                * skip adding the timer int on secondary nodes, which causes
-                * a small but painful rift in the time-space continuum
-                */
-               if (multi_timer_check(apic, irq))
-                       continue;
-               else
-                       add_pin_to_irq(irq, apic, pin);
-
-               if (!apic && !IO_APIC_IRQ(irq))
-                       continue;
-
-               if (IO_APIC_IRQ(irq)) {
-                       vector = assign_irq_vector(irq);
-                       entry.vector = vector;
-                       ioapic_register_intr(irq, vector, IOAPIC_AUTO);
-               
-                       if (!apic && (irq < 16))
-                               disable_8259A_irq(irq);
-               }
-               spin_lock_irqsave(&ioapic_lock, flags);
-               __ioapic_write_entry(apic, pin, entry);
-               spin_unlock_irqrestore(&ioapic_lock, flags);
-       }
-       }
-
-       if (!first_notcon)
-               apic_printk(APIC_VERBOSE, " not connected.\n");
-}
-
-/*
- * Set up the 8259A-master output pin:
- */
-static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, int vector)
-{
-       struct IO_APIC_route_entry entry;
-
-       memset(&entry,0,sizeof(entry));
-
-       disable_8259A_irq(0);
-
-       /* mask LVT0 */
-       apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
-
-       /*
-        * We use logical delivery to get the timer IRQ
-        * to the first CPU.
-        */
-       entry.dest_mode = INT_DEST_MODE;
-       entry.mask = 0;                                 /* unmask IRQ now */
-       entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
-       entry.delivery_mode = INT_DELIVERY_MODE;
-       entry.polarity = 0;
-       entry.trigger = 0;
-       entry.vector = vector;
-
-       /*
-        * The timer IRQ doesn't have to know that behind the
-        * scene we have a 8259A-master in AEOI mode ...
-        */
-       irq_desc[0].chip = &ioapic_chip;
-       set_irq_handler(0, handle_edge_irq);
-
-       /*
-        * Add it to the IO-APIC irq-routing table:
-        */
-       ioapic_write_entry(apic, pin, entry);
-
-       enable_8259A_irq(0);
-}
-
-void __init print_IO_APIC(void)
-{
-       int apic, i;
-       union IO_APIC_reg_00 reg_00;
-       union IO_APIC_reg_01 reg_01;
-       union IO_APIC_reg_02 reg_02;
-       union IO_APIC_reg_03 reg_03;
-       unsigned long flags;
-
-       if (apic_verbosity == APIC_QUIET)
-               return;
-
-       printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
-       for (i = 0; i < nr_ioapics; i++)
-               printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n",
-                      mp_ioapics[i].mpc_apicid, nr_ioapic_registers[i]);
-
-       /*
-        * We are a bit conservative about what we expect.  We have to
-        * know about every hardware change ASAP.
-        */
-       printk(KERN_INFO "testing the IO APIC.......................\n");
-
-       for (apic = 0; apic < nr_ioapics; apic++) {
-
-       spin_lock_irqsave(&ioapic_lock, flags);
-       reg_00.raw = io_apic_read(apic, 0);
-       reg_01.raw = io_apic_read(apic, 1);
-       if (reg_01.bits.version >= 0x10)
-               reg_02.raw = io_apic_read(apic, 2);
-       if (reg_01.bits.version >= 0x20)
-               reg_03.raw = io_apic_read(apic, 3);
-       spin_unlock_irqrestore(&ioapic_lock, flags);
-
-       printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mpc_apicid);
-       printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw);
-       printk(KERN_DEBUG ".......    : physical APIC id: %02X\n", reg_00.bits.ID);
-       printk(KERN_DEBUG ".......    : Delivery Type: %X\n", reg_00.bits.delivery_type);
-       printk(KERN_DEBUG ".......    : LTS          : %X\n", reg_00.bits.LTS);
-
-       printk(KERN_DEBUG ".... register #01: %08X\n", reg_01.raw);
-       printk(KERN_DEBUG ".......     : max redirection entries: %04X\n", reg_01.bits.entries);
-
-       printk(KERN_DEBUG ".......     : PRQ implemented: %X\n", reg_01.bits.PRQ);
-       printk(KERN_DEBUG ".......     : IO APIC version: %04X\n", reg_01.bits.version);
-
-       /*
-        * Some Intel chipsets with IO APIC VERSION of 0x1? don't have reg_02,
-        * but the value of reg_02 is read as the previous read register
-        * value, so ignore it if reg_02 == reg_01.
-        */
-       if (reg_01.bits.version >= 0x10 && reg_02.raw != reg_01.raw) {
-               printk(KERN_DEBUG ".... register #02: %08X\n", reg_02.raw);
-               printk(KERN_DEBUG ".......     : arbitration: %02X\n", reg_02.bits.arbitration);
-       }
-
-       /*
-        * Some Intel chipsets with IO APIC VERSION of 0x2? don't have reg_02
-        * or reg_03, but the value of reg_0[23] is read as the previous read
-        * register value, so ignore it if reg_03 == reg_0[12].
-        */
-       if (reg_01.bits.version >= 0x20 && reg_03.raw != reg_02.raw &&
-           reg_03.raw != reg_01.raw) {
-               printk(KERN_DEBUG ".... register #03: %08X\n", reg_03.raw);
-               printk(KERN_DEBUG ".......     : Boot DT    : %X\n", reg_03.bits.boot_DT);
-       }
-
-       printk(KERN_DEBUG ".... IRQ redirection table:\n");
-
-       printk(KERN_DEBUG " NR Log Phy Mask Trig IRR Pol"
-                         " Stat Dest Deli Vect:   \n");
-
-       for (i = 0; i <= reg_01.bits.entries; i++) {
-               struct IO_APIC_route_entry entry;
-
-               entry = ioapic_read_entry(apic, i);
-
-               printk(KERN_DEBUG " %02x %03X %02X  ",
-                       i,
-                       entry.dest.logical.logical_dest,
-                       entry.dest.physical.physical_dest
-               );
-
-               printk("%1d    %1d    %1d   %1d   %1d    %1d    %1d    %02X\n",
-                       entry.mask,
-                       entry.trigger,
-                       entry.irr,
-                       entry.polarity,
-                       entry.delivery_status,
-                       entry.dest_mode,
-                       entry.delivery_mode,
-                       entry.vector
-               );
-       }
-       }
-       printk(KERN_DEBUG "IRQ to pin mappings:\n");
-       for (i = 0; i < NR_IRQS; i++) {
-               struct irq_pin_list *entry = irq_2_pin + i;
-               if (entry->pin < 0)
-                       continue;
-               printk(KERN_DEBUG "IRQ%d ", i);
-               for (;;) {
-                       printk("-> %d:%d", entry->apic, entry->pin);
-                       if (!entry->next)
-                               break;
-                       entry = irq_2_pin + entry->next;
-               }
-               printk("\n");
-       }
-
-       printk(KERN_INFO ".................................... done.\n");
-
-       return;
-}
-
-#if 0
-
-static void print_APIC_bitfield (int base)
-{
-       unsigned int v;
-       int i, j;
-
-       if (apic_verbosity == APIC_QUIET)
-               return;
-
-       printk(KERN_DEBUG "0123456789abcdef0123456789abcdef\n" KERN_DEBUG);
-       for (i = 0; i < 8; i++) {
-               v = apic_read(base + i*0x10);
-               for (j = 0; j < 32; j++) {
-                       if (v & (1<<j))
-                               printk("1");
-                       else
-                               printk("0");
-               }
-               printk("\n");
-       }
-}
-
-void /*__init*/ print_local_APIC(void * dummy)
-{
-       unsigned int v, ver, maxlvt;
-
-       if (apic_verbosity == APIC_QUIET)
-               return;
-
-       printk("\n" KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n",
-               smp_processor_id(), hard_smp_processor_id());
-       v = apic_read(APIC_ID);
-       printk(KERN_INFO "... APIC ID:      %08x (%01x)\n", v, GET_APIC_ID(v));
-       v = apic_read(APIC_LVR);
-       printk(KERN_INFO "... APIC VERSION: %08x\n", v);
-       ver = GET_APIC_VERSION(v);
-       maxlvt = lapic_get_maxlvt();
-
-       v = apic_read(APIC_TASKPRI);
-       printk(KERN_DEBUG "... APIC TASKPRI: %08x (%02x)\n", v, v & APIC_TPRI_MASK);
-
-       if (APIC_INTEGRATED(ver)) {                     /* !82489DX */
-               v = apic_read(APIC_ARBPRI);
-               printk(KERN_DEBUG "... APIC ARBPRI: %08x (%02x)\n", v,
-                       v & APIC_ARBPRI_MASK);
-               v = apic_read(APIC_PROCPRI);
-               printk(KERN_DEBUG "... APIC PROCPRI: %08x\n", v);
-       }
-
-       v = apic_read(APIC_EOI);
-       printk(KERN_DEBUG "... APIC EOI: %08x\n", v);
-       v = apic_read(APIC_RRR);
-       printk(KERN_DEBUG "... APIC RRR: %08x\n", v);
-       v = apic_read(APIC_LDR);
-       printk(KERN_DEBUG "... APIC LDR: %08x\n", v);
-       v = apic_read(APIC_DFR);
-       printk(KERN_DEBUG "... APIC DFR: %08x\n", v);
-       v = apic_read(APIC_SPIV);
-       printk(KERN_DEBUG "... APIC SPIV: %08x\n", v);
-
-       printk(KERN_DEBUG "... APIC ISR field:\n");
-       print_APIC_bitfield(APIC_ISR);
-       printk(KERN_DEBUG "... APIC TMR field:\n");
-       print_APIC_bitfield(APIC_TMR);
-       printk(KERN_DEBUG "... APIC IRR field:\n");
-       print_APIC_bitfield(APIC_IRR);
-
-       if (APIC_INTEGRATED(ver)) {             /* !82489DX */
-               if (maxlvt > 3)         /* Due to the Pentium erratum 3AP. */
-                       apic_write(APIC_ESR, 0);
-               v = apic_read(APIC_ESR);
-               printk(KERN_DEBUG "... APIC ESR: %08x\n", v);
-       }
-
-       v = apic_read(APIC_ICR);
-       printk(KERN_DEBUG "... APIC ICR: %08x\n", v);
-       v = apic_read(APIC_ICR2);
-       printk(KERN_DEBUG "... APIC ICR2: %08x\n", v);
-
-       v = apic_read(APIC_LVTT);
-       printk(KERN_DEBUG "... APIC LVTT: %08x\n", v);
-
-       if (maxlvt > 3) {                       /* PC is LVT#4. */
-               v = apic_read(APIC_LVTPC);
-               printk(KERN_DEBUG "... APIC LVTPC: %08x\n", v);
-       }
-       v = apic_read(APIC_LVT0);
-       printk(KERN_DEBUG "... APIC LVT0: %08x\n", v);
-       v = apic_read(APIC_LVT1);
-       printk(KERN_DEBUG "... APIC LVT1: %08x\n", v);
-
-       if (maxlvt > 2) {                       /* ERR is LVT#3. */
-               v = apic_read(APIC_LVTERR);
-               printk(KERN_DEBUG "... APIC LVTERR: %08x\n", v);
-       }
-
-       v = apic_read(APIC_TMICT);
-       printk(KERN_DEBUG "... APIC TMICT: %08x\n", v);
-       v = apic_read(APIC_TMCCT);
-       printk(KERN_DEBUG "... APIC TMCCT: %08x\n", v);
-       v = apic_read(APIC_TDCR);
-       printk(KERN_DEBUG "... APIC TDCR: %08x\n", v);
-       printk("\n");
-}
-
-void print_all_local_APICs (void)
-{
-       on_each_cpu(print_local_APIC, NULL, 1, 1);
-}
-
-void /*__init*/ print_PIC(void)
-{
-       unsigned int v;
-       unsigned long flags;
-
-       if (apic_verbosity == APIC_QUIET)
-               return;
-
-       printk(KERN_DEBUG "\nprinting PIC contents\n");
-
-       spin_lock_irqsave(&i8259A_lock, flags);
-
-       v = inb(0xa1) << 8 | inb(0x21);
-       printk(KERN_DEBUG "... PIC  IMR: %04x\n", v);
-
-       v = inb(0xa0) << 8 | inb(0x20);
-       printk(KERN_DEBUG "... PIC  IRR: %04x\n", v);
-
-       outb(0x0b,0xa0);
-       outb(0x0b,0x20);
-       v = inb(0xa0) << 8 | inb(0x20);
-       outb(0x0a,0xa0);
-       outb(0x0a,0x20);
-
-       spin_unlock_irqrestore(&i8259A_lock, flags);
-
-       printk(KERN_DEBUG "... PIC  ISR: %04x\n", v);
-
-       v = inb(0x4d1) << 8 | inb(0x4d0);
-       printk(KERN_DEBUG "... PIC ELCR: %04x\n", v);
-}
-
-#endif  /*  0  */
-
-static void __init enable_IO_APIC(void)
-{
-       union IO_APIC_reg_01 reg_01;
-       int i8259_apic, i8259_pin;
-       int i, apic;
-       unsigned long flags;
-
-       for (i = 0; i < PIN_MAP_SIZE; i++) {
-               irq_2_pin[i].pin = -1;
-               irq_2_pin[i].next = 0;
-       }
-       if (!pirqs_enabled)
-               for (i = 0; i < MAX_PIRQS; i++)
-                       pirq_entries[i] = -1;
-
-       /*
-        * The number of IO-APIC IRQ registers (== #pins):
-        */
-       for (apic = 0; apic < nr_ioapics; apic++) {
-               spin_lock_irqsave(&ioapic_lock, flags);
-               reg_01.raw = io_apic_read(apic, 1);
-               spin_unlock_irqrestore(&ioapic_lock, flags);
-               nr_ioapic_registers[apic] = reg_01.bits.entries+1;
-       }
-       for(apic = 0; apic < nr_ioapics; apic++) {
-               int pin;
-               /* See if any of the pins is in ExtINT mode */
-               for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
-                       struct IO_APIC_route_entry entry;
-                       entry = ioapic_read_entry(apic, pin);
-
-
-                       /* If the interrupt line is enabled and in ExtInt mode
-                        * I have found the pin where the i8259 is connected.
-                        */
-                       if ((entry.mask == 0) && (entry.delivery_mode == dest_ExtINT)) {
-                               ioapic_i8259.apic = apic;
-                               ioapic_i8259.pin  = pin;
-                               goto found_i8259;
-                       }
-               }
-       }
- found_i8259:
-       /* Look to see what if the MP table has reported the ExtINT */
-       /* If we could not find the appropriate pin by looking at the ioapic
-        * the i8259 probably is not connected the ioapic but give the
-        * mptable a chance anyway.
-        */
-       i8259_pin  = find_isa_irq_pin(0, mp_ExtINT);
-       i8259_apic = find_isa_irq_apic(0, mp_ExtINT);
-       /* Trust the MP table if nothing is setup in the hardware */
-       if ((ioapic_i8259.pin == -1) && (i8259_pin >= 0)) {
-               printk(KERN_WARNING "ExtINT not setup in hardware but reported by MP table\n");
-               ioapic_i8259.pin  = i8259_pin;
-               ioapic_i8259.apic = i8259_apic;
-       }
-       /* Complain if the MP table and the hardware disagree */
-       if (((ioapic_i8259.apic != i8259_apic) || (ioapic_i8259.pin != i8259_pin)) &&
-               (i8259_pin >= 0) && (ioapic_i8259.pin >= 0))
-       {
-               printk(KERN_WARNING "ExtINT in hardware and MP table differ\n");
-       }
-
-       /*
-        * Do not trust the IO-APIC being empty at bootup
-        */
-       clear_IO_APIC();
-}
-
-/*
- * Not an __init, needed by the reboot code
- */
-void disable_IO_APIC(void)
-{
-       /*
-        * Clear the IO-APIC before rebooting:
-        */
-       clear_IO_APIC();
-
-       /*
-        * If the i8259 is routed through an IOAPIC
-        * Put that IOAPIC in virtual wire mode
-        * so legacy interrupts can be delivered.
-        */
-       if (ioapic_i8259.pin != -1) {
-               struct IO_APIC_route_entry entry;
-
-               memset(&entry, 0, sizeof(entry));
-               entry.mask            = 0; /* Enabled */
-               entry.trigger         = 0; /* Edge */
-               entry.irr             = 0;
-               entry.polarity        = 0; /* High */
-               entry.delivery_status = 0;
-               entry.dest_mode       = 0; /* Physical */
-               entry.delivery_mode   = dest_ExtINT; /* ExtInt */
-               entry.vector          = 0;
-               entry.dest.physical.physical_dest =
-                                       GET_APIC_ID(apic_read(APIC_ID));
-
-               /*
-                * Add it to the IO-APIC irq-routing table:
-                */
-               ioapic_write_entry(ioapic_i8259.apic, ioapic_i8259.pin, entry);
-       }
-       disconnect_bsp_APIC(ioapic_i8259.pin != -1);
-}
-
-/*
- * function to set the IO-APIC physical IDs based on the
- * values stored in the MPC table.
- *
- * by Matt Domsch <Matt_Domsch@dell.com>  Tue Dec 21 12:25:05 CST 1999
- */
-
-#ifndef CONFIG_X86_NUMAQ
-static void __init setup_ioapic_ids_from_mpc(void)
-{
-       union IO_APIC_reg_00 reg_00;
-       physid_mask_t phys_id_present_map;
-       int apic;
-       int i;
-       unsigned char old_id;
-       unsigned long flags;
-
-       /*
-        * Don't check I/O APIC IDs for xAPIC systems.  They have
-        * no meaning without the serial APIC bus.
-        */
-       if (!(boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
-               || APIC_XAPIC(apic_version[boot_cpu_physical_apicid]))
-               return;
-       /*
-        * This is broken; anything with a real cpu count has to
-        * circumvent this idiocy regardless.
-        */
-       phys_id_present_map = ioapic_phys_id_map(phys_cpu_present_map);
-
-       /*
-        * Set the IOAPIC ID to the value stored in the MPC table.
-        */
-       for (apic = 0; apic < nr_ioapics; apic++) {
-
-               /* Read the register 0 value */
-               spin_lock_irqsave(&ioapic_lock, flags);
-               reg_00.raw = io_apic_read(apic, 0);
-               spin_unlock_irqrestore(&ioapic_lock, flags);
-               
-               old_id = mp_ioapics[apic].mpc_apicid;
-
-               if (mp_ioapics[apic].mpc_apicid >= get_physical_broadcast()) {
-                       printk(KERN_ERR "BIOS bug, IO-APIC#%d ID is %d in the MPC table!...\n",
-                               apic, mp_ioapics[apic].mpc_apicid);
-                       printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
-                               reg_00.bits.ID);
-                       mp_ioapics[apic].mpc_apicid = reg_00.bits.ID;
-               }
-
-               /*
-                * Sanity check, is the ID really free? Every APIC in a
-                * system must have a unique ID or we get lots of nice
-                * 'stuck on smp_invalidate_needed IPI wait' messages.
-                */
-               if (check_apicid_used(phys_id_present_map,
-                                       mp_ioapics[apic].mpc_apicid)) {
-                       printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n",
-                               apic, mp_ioapics[apic].mpc_apicid);
-                       for (i = 0; i < get_physical_broadcast(); i++)
-                               if (!physid_isset(i, phys_id_present_map))
-                                       break;
-                       if (i >= get_physical_broadcast())
-                               panic("Max APIC ID exceeded!\n");
-                       printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
-                               i);
-                       physid_set(i, phys_id_present_map);
-                       mp_ioapics[apic].mpc_apicid = i;
-               } else {
-                       physid_mask_t tmp;
-                       tmp = apicid_to_cpu_present(mp_ioapics[apic].mpc_apicid);
-                       apic_printk(APIC_VERBOSE, "Setting %d in the "
-                                       "phys_id_present_map\n",
-                                       mp_ioapics[apic].mpc_apicid);
-                       physids_or(phys_id_present_map, phys_id_present_map, tmp);
-               }
-
-
-               /*
-                * We need to adjust the IRQ routing table
-                * if the ID changed.
-                */
-               if (old_id != mp_ioapics[apic].mpc_apicid)
-                       for (i = 0; i < mp_irq_entries; i++)
-                               if (mp_irqs[i].mpc_dstapic == old_id)
-                                       mp_irqs[i].mpc_dstapic
-                                               = mp_ioapics[apic].mpc_apicid;
-
-               /*
-                * Read the right value from the MPC table and
-                * write it into the ID register.
-                */
-               apic_printk(APIC_VERBOSE, KERN_INFO
-                       "...changing IO-APIC physical APIC ID to %d ...",
-                       mp_ioapics[apic].mpc_apicid);
-
-               reg_00.bits.ID = mp_ioapics[apic].mpc_apicid;
-               spin_lock_irqsave(&ioapic_lock, flags);
-               io_apic_write(apic, 0, reg_00.raw);
-               spin_unlock_irqrestore(&ioapic_lock, flags);
-
-               /*
-                * Sanity check
-                */
-               spin_lock_irqsave(&ioapic_lock, flags);
-               reg_00.raw = io_apic_read(apic, 0);
-               spin_unlock_irqrestore(&ioapic_lock, flags);
-               if (reg_00.bits.ID != mp_ioapics[apic].mpc_apicid)
-                       printk("could not set ID!\n");
-               else
-                       apic_printk(APIC_VERBOSE, " ok.\n");
-       }
-}
-#else
-static void __init setup_ioapic_ids_from_mpc(void) { }
-#endif
-
-int no_timer_check __initdata;
-
-static int __init notimercheck(char *s)
-{
-       no_timer_check = 1;
-       return 1;
-}
-__setup("no_timer_check", notimercheck);
-
-/*
- * There is a nasty bug in some older SMP boards, their mptable lies
- * about the timer IRQ. We do the following to work around the situation:
- *
- *     - timer IRQ defaults to IO-APIC IRQ
- *     - if this function detects that timer IRQs are defunct, then we fall
- *       back to ISA timer IRQs
- */
-static int __init timer_irq_works(void)
-{
-       unsigned long t1 = jiffies;
-
-       if (no_timer_check)
-               return 1;
-
-       local_irq_enable();
-       /* Let ten ticks pass... */
-       mdelay((10 * 1000) / HZ);
-
-       /*
-        * Expect a few ticks at least, to be sure some possible
-        * glue logic does not lock up after one or two first
-        * ticks in a non-ExtINT mode.  Also the local APIC
-        * might have cached one ExtINT interrupt.  Finally, at
-        * least one tick may be lost due to delays.
-        */
-       if (jiffies - t1 > 4)
-               return 1;
-
-       return 0;
-}
-
-/*
- * In the SMP+IOAPIC case it might happen that there are an unspecified
- * number of pending IRQ events unhandled. These cases are very rare,
- * so we 'resend' these IRQs via IPIs, to the same CPU. It's much
- * better to do it this way as thus we do not have to be aware of
- * 'pending' interrupts in the IRQ path, except at this point.
- */
-/*
- * Edge triggered needs to resend any interrupt
- * that was delayed but this is now handled in the device
- * independent code.
- */
-
-/*
- * Startup quirk:
- *
- * Starting up a edge-triggered IO-APIC interrupt is
- * nasty - we need to make sure that we get the edge.
- * If it is already asserted for some reason, we need
- * return 1 to indicate that is was pending.
- *
- * This is not complete - we should be able to fake
- * an edge even if it isn't on the 8259A...
- *
- * (We do this for level-triggered IRQs too - it cannot hurt.)
- */
-static unsigned int startup_ioapic_irq(unsigned int irq)
-{
-       int was_pending = 0;
-       unsigned long flags;
-
-       spin_lock_irqsave(&ioapic_lock, flags);
-       if (irq < 16) {
-               disable_8259A_irq(irq);
-               if (i8259A_irq_pending(irq))
-                       was_pending = 1;
-       }
-       __unmask_IO_APIC_irq(irq);
-       spin_unlock_irqrestore(&ioapic_lock, flags);
-
-       return was_pending;
-}
-
-static void ack_ioapic_irq(unsigned int irq)
-{
-       move_native_irq(irq);
-       ack_APIC_irq();
-}
-
-static void ack_ioapic_quirk_irq(unsigned int irq)
-{
-       unsigned long v;
-       int i;
-
-       move_native_irq(irq);
-/*
- * It appears there is an erratum which affects at least version 0x11
- * of I/O APIC (that's the 82093AA and cores integrated into various
- * chipsets).  Under certain conditions a level-triggered interrupt is
- * erroneously delivered as edge-triggered one but the respective IRR
- * bit gets set nevertheless.  As a result the I/O unit expects an EOI
- * message but it will never arrive and further interrupts are blocked
- * from the source.  The exact reason is so far unknown, but the
- * phenomenon was observed when two consecutive interrupt requests
- * from a given source get delivered to the same CPU and the source is
- * temporarily disabled in between.
- *
- * A workaround is to simulate an EOI message manually.  We achieve it
- * by setting the trigger mode to edge and then to level when the edge
- * trigger mode gets detected in the TMR of a local APIC for a
- * level-triggered interrupt.  We mask the source for the time of the
- * operation to prevent an edge-triggered interrupt escaping meanwhile.
- * The idea is from Manfred Spraul.  --macro
- */
-       i = irq_vector[irq];
-
-       v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1));
-
-       ack_APIC_irq();
-
-       if (!(v & (1 << (i & 0x1f)))) {
-               atomic_inc(&irq_mis_count);
-               spin_lock(&ioapic_lock);
-               __mask_and_edge_IO_APIC_irq(irq);
-               __unmask_and_level_IO_APIC_irq(irq);
-               spin_unlock(&ioapic_lock);
-       }
-}
-
-static int ioapic_retrigger_irq(unsigned int irq)
-{
-       send_IPI_self(irq_vector[irq]);
-
-       return 1;
-}
-
-static struct irq_chip ioapic_chip __read_mostly = {
-       .name           = "IO-APIC",
-       .startup        = startup_ioapic_irq,
-       .mask           = mask_IO_APIC_irq,
-       .unmask         = unmask_IO_APIC_irq,
-       .ack            = ack_ioapic_irq,
-       .eoi            = ack_ioapic_quirk_irq,
-#ifdef CONFIG_SMP
-       .set_affinity   = set_ioapic_affinity_irq,
-#endif
-       .retrigger      = ioapic_retrigger_irq,
-};
-
-
-static inline void init_IO_APIC_traps(void)
-{
-       int irq;
-
-       /*
-        * NOTE! The local APIC isn't very good at handling
-        * multiple interrupts at the same interrupt level.
-        * As the interrupt level is determined by taking the
-        * vector number and shifting that right by 4, we
-        * want to spread these out a bit so that they don't
-        * all fall in the same interrupt level.
-        *
-        * Also, we've got to be careful not to trash gate
-        * 0x80, because int 0x80 is hm, kind of importantish. ;)
-        */
-       for (irq = 0; irq < NR_IRQS ; irq++) {
-               int tmp = irq;
-               if (IO_APIC_IRQ(tmp) && !irq_vector[tmp]) {
-                       /*
-                        * Hmm.. We don't have an entry for this,
-                        * so default to an old-fashioned 8259
-                        * interrupt if we can..
-                        */
-                       if (irq < 16)
-                               make_8259A_irq(irq);
-                       else
-                               /* Strange. Oh, well.. */
-                               irq_desc[irq].chip = &no_irq_chip;
-               }
-       }
-}
-
-/*
- * The local APIC irq-chip implementation:
- */
-
-static void ack_apic(unsigned int irq)
-{
-       ack_APIC_irq();
-}
-
-static void mask_lapic_irq (unsigned int irq)
-{
-       unsigned long v;
-
-       v = apic_read(APIC_LVT0);
-       apic_write_around(APIC_LVT0, v | APIC_LVT_MASKED);
-}
-
-static void unmask_lapic_irq (unsigned int irq)
-{
-       unsigned long v;
-
-       v = apic_read(APIC_LVT0);
-       apic_write_around(APIC_LVT0, v & ~APIC_LVT_MASKED);
-}
-
-static struct irq_chip lapic_chip __read_mostly = {
-       .name           = "local-APIC-edge",
-       .mask           = mask_lapic_irq,
-       .unmask         = unmask_lapic_irq,
-       .eoi            = ack_apic,
-};
-
-static void setup_nmi (void)
-{
-       /*
-        * Dirty trick to enable the NMI watchdog ...
-        * We put the 8259A master into AEOI mode and
-        * unmask on all local APICs LVT0 as NMI.
-        *
-        * The idea to use the 8259A in AEOI mode ('8259A Virtual Wire')
-        * is from Maciej W. Rozycki - so we do not have to EOI from
-        * the NMI handler or the timer interrupt.
-        */ 
-       apic_printk(APIC_VERBOSE, KERN_INFO "activating NMI Watchdog ...");
-
-       on_each_cpu(enable_NMI_through_LVT0, NULL, 1, 1);
-
-       apic_printk(APIC_VERBOSE, " done.\n");
-}
-
-/*
- * This looks a bit hackish but it's about the only one way of sending
- * a few INTA cycles to 8259As and any associated glue logic.  ICR does
- * not support the ExtINT mode, unfortunately.  We need to send these
- * cycles as some i82489DX-based boards have glue logic that keeps the
- * 8259A interrupt line asserted until INTA.  --macro
- */
-static inline void unlock_ExtINT_logic(void)
-{
-       int apic, pin, i;
-       struct IO_APIC_route_entry entry0, entry1;
-       unsigned char save_control, save_freq_select;
-
-       pin  = find_isa_irq_pin(8, mp_INT);
-       if (pin == -1) {
-               WARN_ON_ONCE(1);
-               return;
-       }
-       apic = find_isa_irq_apic(8, mp_INT);
-       if (apic == -1) {
-               WARN_ON_ONCE(1);
-               return;
-       }
-
-       entry0 = ioapic_read_entry(apic, pin);
-       clear_IO_APIC_pin(apic, pin);
-
-       memset(&entry1, 0, sizeof(entry1));
-
-       entry1.dest_mode = 0;                   /* physical delivery */
-       entry1.mask = 0;                        /* unmask IRQ now */
-       entry1.dest.physical.physical_dest = hard_smp_processor_id();
-       entry1.delivery_mode = dest_ExtINT;
-       entry1.polarity = entry0.polarity;
-       entry1.trigger = 0;
-       entry1.vector = 0;
-
-       ioapic_write_entry(apic, pin, entry1);
-
-       save_control = CMOS_READ(RTC_CONTROL);
-       save_freq_select = CMOS_READ(RTC_FREQ_SELECT);
-       CMOS_WRITE((save_freq_select & ~RTC_RATE_SELECT) | 0x6,
-                  RTC_FREQ_SELECT);
-       CMOS_WRITE(save_control | RTC_PIE, RTC_CONTROL);
-
-       i = 100;
-       while (i-- > 0) {
-               mdelay(10);
-               if ((CMOS_READ(RTC_INTR_FLAGS) & RTC_PF) == RTC_PF)
-                       i -= 10;
-       }
-
-       CMOS_WRITE(save_control, RTC_CONTROL);
-       CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT);
-       clear_IO_APIC_pin(apic, pin);
-
-       ioapic_write_entry(apic, pin, entry0);
-}
-
-int timer_uses_ioapic_pin_0;
-
-/*
- * This code may look a bit paranoid, but it's supposed to cooperate with
- * a wide range of boards and BIOS bugs.  Fortunately only the timer IRQ
- * is so screwy.  Thanks to Brian Perkins for testing/hacking this beast
- * fanatically on his truly buggy board.
- */
-static inline void __init check_timer(void)
-{
-       int apic1, pin1, apic2, pin2;
-       int vector;
-
-       /*
-        * get/set the timer IRQ vector:
-        */
-       disable_8259A_irq(0);
-       vector = assign_irq_vector(0);
-       set_intr_gate(vector, interrupt[0]);
-
-       /*
-        * Subtle, code in do_timer_interrupt() expects an AEOI
-        * mode for the 8259A whenever interrupts are routed
-        * through I/O APICs.  Also IRQ0 has to be enabled in
-        * the 8259A which implies the virtual wire has to be
-        * disabled in the local APIC.
-        */
-       apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
-       init_8259A(1);
-       timer_ack = 1;
-       if (timer_over_8254 > 0)
-               enable_8259A_irq(0);
-
-       pin1  = find_isa_irq_pin(0, mp_INT);
-       apic1 = find_isa_irq_apic(0, mp_INT);
-       pin2  = ioapic_i8259.pin;
-       apic2 = ioapic_i8259.apic;
-
-       if (pin1 == 0)
-               timer_uses_ioapic_pin_0 = 1;
-
-       printk(KERN_INFO "..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n",
-               vector, apic1, pin1, apic2, pin2);
-
-       if (pin1 != -1) {
-               /*
-                * Ok, does IRQ0 through the IOAPIC work?
-                */
-               unmask_IO_APIC_irq(0);
-               if (timer_irq_works()) {
-                       if (nmi_watchdog == NMI_IO_APIC) {
-                               disable_8259A_irq(0);
-                               setup_nmi();
-                               enable_8259A_irq(0);
-                       }
-                       if (disable_timer_pin_1 > 0)
-                               clear_IO_APIC_pin(0, pin1);
-                       return;
-               }
-               clear_IO_APIC_pin(apic1, pin1);
-               printk(KERN_ERR "..MP-BIOS bug: 8254 timer not connected to "
-                               "IO-APIC\n");
-       }
-
-       printk(KERN_INFO "...trying to set up timer (IRQ0) through the 8259A ... ");
-       if (pin2 != -1) {
-               printk("\n..... (found pin %d) ...", pin2);
-               /*
-                * legacy devices should be connected to IO APIC #0
-                */
-               setup_ExtINT_IRQ0_pin(apic2, pin2, vector);
-               if (timer_irq_works()) {
-                       printk("works.\n");
-                       if (pin1 != -1)
-                               replace_pin_at_irq(0, apic1, pin1, apic2, pin2);
-                       else
-                               add_pin_to_irq(0, apic2, pin2);
-                       if (nmi_watchdog == NMI_IO_APIC) {
-                               setup_nmi();
-                       }
-                       return;
-               }
-               /*
-                * Cleanup, just in case ...
-                */
-               clear_IO_APIC_pin(apic2, pin2);
-       }
-       printk(" failed.\n");
-
-       if (nmi_watchdog == NMI_IO_APIC) {
-               printk(KERN_WARNING "timer doesn't work through the IO-APIC - disabling NMI Watchdog!\n");
-               nmi_watchdog = 0;
-       }
-
-       printk(KERN_INFO "...trying to set up timer as Virtual Wire IRQ...");
-
-       disable_8259A_irq(0);
-       set_irq_chip_and_handler_name(0, &lapic_chip, handle_fasteoi_irq,
-                                     "fasteoi");
-       apic_write_around(APIC_LVT0, APIC_DM_FIXED | vector);   /* Fixed mode */
-       enable_8259A_irq(0);
-
-       if (timer_irq_works()) {
-               printk(" works.\n");
-               return;
-       }
-       apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | vector);
-       printk(" failed.\n");
-
-       printk(KERN_INFO "...trying to set up timer as ExtINT IRQ...");
-
-       timer_ack = 0;
-       init_8259A(0);
-       make_8259A_irq(0);
-       apic_write_around(APIC_LVT0, APIC_DM_EXTINT);
-
-       unlock_ExtINT_logic();
-
-       if (timer_irq_works()) {
-               printk(" works.\n");
-               return;
-       }
-       printk(" failed :(.\n");
-       panic("IO-APIC + timer doesn't work!  Boot with apic=debug and send a "
-               "report.  Then try booting with the 'noapic' option");
-}
-
-/*
- *
- * IRQ's that are handled by the PIC in the MPS IOAPIC case.
- * - IRQ2 is the cascade IRQ, and cannot be a io-apic IRQ.
- *   Linux doesn't really care, as it's not actually used
- *   for any interrupt handling anyway.
- */
-#define PIC_IRQS       (1 << PIC_CASCADE_IR)
-
-void __init setup_IO_APIC(void)
-{
-       enable_IO_APIC();
-
-       if (acpi_ioapic)
-               io_apic_irqs = ~0;      /* all IRQs go through IOAPIC */
-       else
-               io_apic_irqs = ~PIC_IRQS;
-
-       printk("ENABLING IO-APIC IRQs\n");
-
-       /*
-        * Set up IO-APIC IRQ routing.
-        */
-       if (!acpi_ioapic)
-               setup_ioapic_ids_from_mpc();
-       sync_Arb_IDs();
-       setup_IO_APIC_irqs();
-       init_IO_APIC_traps();
-       check_timer();
-       if (!acpi_ioapic)
-               print_IO_APIC();
-}
-
-static int __init setup_disable_8254_timer(char *s)
-{
-       timer_over_8254 = -1;
-       return 1;
-}
-static int __init setup_enable_8254_timer(char *s)
-{
-       timer_over_8254 = 2;
-       return 1;
-}
-
-__setup("disable_8254_timer", setup_disable_8254_timer);
-__setup("enable_8254_timer", setup_enable_8254_timer);
-
-/*
- *     Called after all the initialization is done. If we didnt find any
- *     APIC bugs then we can allow the modify fast path
- */
-static int __init io_apic_bug_finalize(void)
-{
-       if(sis_apic_bug == -1)
-               sis_apic_bug = 0;
-       return 0;
-}
-
-late_initcall(io_apic_bug_finalize);
-
-struct sysfs_ioapic_data {
-       struct sys_device dev;
-       struct IO_APIC_route_entry entry[0];
-};
-static struct sysfs_ioapic_data * mp_ioapic_data[MAX_IO_APICS];
-
-static int ioapic_suspend(struct sys_device *dev, pm_message_t state)
-{
-       struct IO_APIC_route_entry *entry;
-       struct sysfs_ioapic_data *data;
-       int i;
-       
-       data = container_of(dev, struct sysfs_ioapic_data, dev);
-       entry = data->entry;
-       for (i = 0; i < nr_ioapic_registers[dev->id]; i ++)
-               entry[i] = ioapic_read_entry(dev->id, i);
-
-       return 0;
-}
-
-static int ioapic_resume(struct sys_device *dev)
-{
-       struct IO_APIC_route_entry *entry;
-       struct sysfs_ioapic_data *data;
-       unsigned long flags;
-       union IO_APIC_reg_00 reg_00;
-       int i;
-       
-       data = container_of(dev, struct sysfs_ioapic_data, dev);
-       entry = data->entry;
-
-       spin_lock_irqsave(&ioapic_lock, flags);
-       reg_00.raw = io_apic_read(dev->id, 0);
-       if (reg_00.bits.ID != mp_ioapics[dev->id].mpc_apicid) {
-               reg_00.bits.ID = mp_ioapics[dev->id].mpc_apicid;
-               io_apic_write(dev->id, 0, reg_00.raw);
-       }
-       spin_unlock_irqrestore(&ioapic_lock, flags);
-       for (i = 0; i < nr_ioapic_registers[dev->id]; i ++)
-               ioapic_write_entry(dev->id, i, entry[i]);
-
-       return 0;
-}
-
-static struct sysdev_class ioapic_sysdev_class = {
-       set_kset_name("ioapic"),
-       .suspend = ioapic_suspend,
-       .resume = ioapic_resume,
-};
-
-static int __init ioapic_init_sysfs(void)
-{
-       struct sys_device * dev;
-       int i, size, error = 0;
-
-       error = sysdev_class_register(&ioapic_sysdev_class);
-       if (error)
-               return error;
-
-       for (i = 0; i < nr_ioapics; i++ ) {
-               size = sizeof(struct sys_device) + nr_ioapic_registers[i] 
-                       * sizeof(struct IO_APIC_route_entry);
-               mp_ioapic_data[i] = kmalloc(size, GFP_KERNEL);
-               if (!mp_ioapic_data[i]) {
-                       printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i);
-                       continue;
-               }
-               memset(mp_ioapic_data[i], 0, size);
-               dev = &mp_ioapic_data[i]->dev;
-               dev->id = i; 
-               dev->cls = &ioapic_sysdev_class;
-               error = sysdev_register(dev);
-               if (error) {
-                       kfree(mp_ioapic_data[i]);
-                       mp_ioapic_data[i] = NULL;
-                       printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i);
-                       continue;
-               }
-       }
-
-       return 0;
-}
-
-device_initcall(ioapic_init_sysfs);
-
-/*
- * Dynamic irq allocate and deallocation
- */
-int create_irq(void)
-{
-       /* Allocate an unused irq */
-       int irq, new, vector = 0;
-       unsigned long flags;
-
-       irq = -ENOSPC;
-       spin_lock_irqsave(&vector_lock, flags);
-       for (new = (NR_IRQS - 1); new >= 0; new--) {
-               if (platform_legacy_irq(new))
-                       continue;
-               if (irq_vector[new] != 0)
-                       continue;
-               vector = __assign_irq_vector(new);
-               if (likely(vector > 0))
-                       irq = new;
-               break;
-       }
-       spin_unlock_irqrestore(&vector_lock, flags);
-
-       if (irq >= 0) {
-               set_intr_gate(vector, interrupt[irq]);
-               dynamic_irq_init(irq);
-       }
-       return irq;
-}
-
-void destroy_irq(unsigned int irq)
-{
-       unsigned long flags;
-
-       dynamic_irq_cleanup(irq);
-
-       spin_lock_irqsave(&vector_lock, flags);
-       irq_vector[irq] = 0;
-       spin_unlock_irqrestore(&vector_lock, flags);
-}
-
-/*
- * MSI mesage composition
- */
-#ifdef CONFIG_PCI_MSI
-static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_msg *msg)
-{
-       int vector;
-       unsigned dest;
-
-       vector = assign_irq_vector(irq);
-       if (vector >= 0) {
-               dest = cpu_mask_to_apicid(TARGET_CPUS);
-
-               msg->address_hi = MSI_ADDR_BASE_HI;
-               msg->address_lo =
-                       MSI_ADDR_BASE_LO |
-                       ((INT_DEST_MODE == 0) ?
-                               MSI_ADDR_DEST_MODE_PHYSICAL:
-                               MSI_ADDR_DEST_MODE_LOGICAL) |
-                       ((INT_DELIVERY_MODE != dest_LowestPrio) ?
-                               MSI_ADDR_REDIRECTION_CPU:
-                               MSI_ADDR_REDIRECTION_LOWPRI) |
-                       MSI_ADDR_DEST_ID(dest);
-
-               msg->data =
-                       MSI_DATA_TRIGGER_EDGE |
-                       MSI_DATA_LEVEL_ASSERT |
-                       ((INT_DELIVERY_MODE != dest_LowestPrio) ?
-                               MSI_DATA_DELIVERY_FIXED:
-                               MSI_DATA_DELIVERY_LOWPRI) |
-                       MSI_DATA_VECTOR(vector);
-       }
-       return vector;
-}
-
-#ifdef CONFIG_SMP
-static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask)
-{
-       struct msi_msg msg;
-       unsigned int dest;
-       cpumask_t tmp;
-       int vector;
-
-       cpus_and(tmp, mask, cpu_online_map);
-       if (cpus_empty(tmp))
-               tmp = TARGET_CPUS;
-
-       vector = assign_irq_vector(irq);
-       if (vector < 0)
-               return;
-
-       dest = cpu_mask_to_apicid(mask);
-
-       read_msi_msg(irq, &msg);
-
-       msg.data &= ~MSI_DATA_VECTOR_MASK;
-       msg.data |= MSI_DATA_VECTOR(vector);
-       msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK;
-       msg.address_lo |= MSI_ADDR_DEST_ID(dest);
-
-       write_msi_msg(irq, &msg);
-       irq_desc[irq].affinity = mask;
-}
-#endif /* CONFIG_SMP */
-
-/*
- * IRQ Chip for MSI PCI/PCI-X/PCI-Express Devices,
- * which implement the MSI or MSI-X Capability Structure.
- */
-static struct irq_chip msi_chip = {
-       .name           = "PCI-MSI",
-       .unmask         = unmask_msi_irq,
-       .mask           = mask_msi_irq,
-       .ack            = ack_ioapic_irq,
-#ifdef CONFIG_SMP
-       .set_affinity   = set_msi_irq_affinity,
-#endif
-       .retrigger      = ioapic_retrigger_irq,
-};
-
-int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc)
-{
-       struct msi_msg msg;
-       int irq, ret;
-       irq = create_irq();
-       if (irq < 0)
-               return irq;
-
-       ret = msi_compose_msg(dev, irq, &msg);
-       if (ret < 0) {
-               destroy_irq(irq);
-               return ret;
-       }
-
-       set_irq_msi(irq, desc);
-       write_msi_msg(irq, &msg);
-
-       set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq,
-                                     "edge");
-
-       return 0;
-}
-
-void arch_teardown_msi_irq(unsigned int irq)
-{
-       destroy_irq(irq);
-}
-
-#endif /* CONFIG_PCI_MSI */
-
-/*
- * Hypertransport interrupt support
- */
-#ifdef CONFIG_HT_IRQ
-
-#ifdef CONFIG_SMP
-
-static void target_ht_irq(unsigned int irq, unsigned int dest)
-{
-       struct ht_irq_msg msg;
-       fetch_ht_irq_msg(irq, &msg);
-
-       msg.address_lo &= ~(HT_IRQ_LOW_DEST_ID_MASK);
-       msg.address_hi &= ~(HT_IRQ_HIGH_DEST_ID_MASK);
-
-       msg.address_lo |= HT_IRQ_LOW_DEST_ID(dest);
-       msg.address_hi |= HT_IRQ_HIGH_DEST_ID(dest);
-
-       write_ht_irq_msg(irq, &msg);
-}
-
-static void set_ht_irq_affinity(unsigned int irq, cpumask_t mask)
-{
-       unsigned int dest;
-       cpumask_t tmp;
-
-       cpus_and(tmp, mask, cpu_online_map);
-       if (cpus_empty(tmp))
-               tmp = TARGET_CPUS;
-
-       cpus_and(mask, tmp, CPU_MASK_ALL);
-
-       dest = cpu_mask_to_apicid(mask);
-
-       target_ht_irq(irq, dest);
-       irq_desc[irq].affinity = mask;
-}
-#endif
-
-static struct irq_chip ht_irq_chip = {
-       .name           = "PCI-HT",
-       .mask           = mask_ht_irq,
-       .unmask         = unmask_ht_irq,
-       .ack            = ack_ioapic_irq,
-#ifdef CONFIG_SMP
-       .set_affinity   = set_ht_irq_affinity,
-#endif
-       .retrigger      = ioapic_retrigger_irq,
-};
-
-int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
-{
-       int vector;
-
-       vector = assign_irq_vector(irq);
-       if (vector >= 0) {
-               struct ht_irq_msg msg;
-               unsigned dest;
-               cpumask_t tmp;
-
-               cpus_clear(tmp);
-               cpu_set(vector >> 8, tmp);
-               dest = cpu_mask_to_apicid(tmp);
-
-               msg.address_hi = HT_IRQ_HIGH_DEST_ID(dest);
-
-               msg.address_lo =
-                       HT_IRQ_LOW_BASE |
-                       HT_IRQ_LOW_DEST_ID(dest) |
-                       HT_IRQ_LOW_VECTOR(vector) |
-                       ((INT_DEST_MODE == 0) ?
-                               HT_IRQ_LOW_DM_PHYSICAL :
-                               HT_IRQ_LOW_DM_LOGICAL) |
-                       HT_IRQ_LOW_RQEOI_EDGE |
-                       ((INT_DELIVERY_MODE != dest_LowestPrio) ?
-                               HT_IRQ_LOW_MT_FIXED :
-                               HT_IRQ_LOW_MT_ARBITRATED) |
-                       HT_IRQ_LOW_IRQ_MASKED;
-
-               write_ht_irq_msg(irq, &msg);
-
-               set_irq_chip_and_handler_name(irq, &ht_irq_chip,
-                                             handle_edge_irq, "edge");
-       }
-       return vector;
-}
-#endif /* CONFIG_HT_IRQ */
-
-/* --------------------------------------------------------------------------
-                          ACPI-based IOAPIC Configuration
-   -------------------------------------------------------------------------- */
-
-#ifdef CONFIG_ACPI
-
-int __init io_apic_get_unique_id (int ioapic, int apic_id)
-{
-       union IO_APIC_reg_00 reg_00;
-       static physid_mask_t apic_id_map = PHYSID_MASK_NONE;
-       physid_mask_t tmp;
-       unsigned long flags;
-       int i = 0;
-
-       /*
-        * The P4 platform supports up to 256 APIC IDs on two separate APIC 
-        * buses (one for LAPICs, one for IOAPICs), where predecessors only 
-        * supports up to 16 on one shared APIC bus.
-        * 
-        * TBD: Expand LAPIC/IOAPIC support on P4-class systems to take full
-        *      advantage of new APIC bus architecture.
-        */
-
-       if (physids_empty(apic_id_map))
-               apic_id_map = ioapic_phys_id_map(phys_cpu_present_map);
-
-       spin_lock_irqsave(&ioapic_lock, flags);
-       reg_00.raw = io_apic_read(ioapic, 0);
-       spin_unlock_irqrestore(&ioapic_lock, flags);
-
-       if (apic_id >= get_physical_broadcast()) {
-               printk(KERN_WARNING "IOAPIC[%d]: Invalid apic_id %d, trying "
-                       "%d\n", ioapic, apic_id, reg_00.bits.ID);
-               apic_id = reg_00.bits.ID;
-       }
-
-       /*
-        * Every APIC in a system must have a unique ID or we get lots of nice 
-        * 'stuck on smp_invalidate_needed IPI wait' messages.
-        */
-       if (check_apicid_used(apic_id_map, apic_id)) {
-
-               for (i = 0; i < get_physical_broadcast(); i++) {
-                       if (!check_apicid_used(apic_id_map, i))
-                               break;
-               }
-
-               if (i == get_physical_broadcast())
-                       panic("Max apic_id exceeded!\n");
-
-               printk(KERN_WARNING "IOAPIC[%d]: apic_id %d already used, "
-                       "trying %d\n", ioapic, apic_id, i);
-
-               apic_id = i;
-       } 
-
-       tmp = apicid_to_cpu_present(apic_id);
-       physids_or(apic_id_map, apic_id_map, tmp);
-
-       if (reg_00.bits.ID != apic_id) {
-               reg_00.bits.ID = apic_id;
-
-               spin_lock_irqsave(&ioapic_lock, flags);
-               io_apic_write(ioapic, 0, reg_00.raw);
-               reg_00.raw = io_apic_read(ioapic, 0);
-               spin_unlock_irqrestore(&ioapic_lock, flags);
-
-               /* Sanity check */
-               if (reg_00.bits.ID != apic_id) {
-                       printk("IOAPIC[%d]: Unable to change apic_id!\n", ioapic);
-                       return -1;
-               }
-       }
-
-       apic_printk(APIC_VERBOSE, KERN_INFO
-                       "IOAPIC[%d]: Assigned apic_id %d\n", ioapic, apic_id);
-
-       return apic_id;
-}
-
-
-int __init io_apic_get_version (int ioapic)
-{
-       union IO_APIC_reg_01    reg_01;
-       unsigned long flags;
-
-       spin_lock_irqsave(&ioapic_lock, flags);
-       reg_01.raw = io_apic_read(ioapic, 1);
-       spin_unlock_irqrestore(&ioapic_lock, flags);
-
-       return reg_01.bits.version;
-}
-
-
-int __init io_apic_get_redir_entries (int ioapic)
-{
-       union IO_APIC_reg_01    reg_01;
-       unsigned long flags;
-
-       spin_lock_irqsave(&ioapic_lock, flags);
-       reg_01.raw = io_apic_read(ioapic, 1);
-       spin_unlock_irqrestore(&ioapic_lock, flags);
-
-       return reg_01.bits.entries;
-}
-
-
-int io_apic_set_pci_routing (int ioapic, int pin, int irq, int edge_level, int active_high_low)
-{
-       struct IO_APIC_route_entry entry;
-       unsigned long flags;
-
-       if (!IO_APIC_IRQ(irq)) {
-               printk(KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n",
-                       ioapic);
-               return -EINVAL;
-       }
-
-       /*
-        * Generate a PCI IRQ routing entry and program the IOAPIC accordingly.
-        * Note that we mask (disable) IRQs now -- these get enabled when the
-        * corresponding device driver registers for this IRQ.
-        */
-
-       memset(&entry,0,sizeof(entry));
-
-       entry.delivery_mode = INT_DELIVERY_MODE;
-       entry.dest_mode = INT_DEST_MODE;
-       entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
-       entry.trigger = edge_level;
-       entry.polarity = active_high_low;
-       entry.mask  = 1;
-
-       /*
-        * IRQs < 16 are already in the irq_2_pin[] map
-        */
-       if (irq >= 16)
-               add_pin_to_irq(irq, ioapic, pin);
-
-       entry.vector = assign_irq_vector(irq);
-
-       apic_printk(APIC_DEBUG, KERN_DEBUG "IOAPIC[%d]: Set PCI routing entry "
-               "(%d-%d -> 0x%x -> IRQ %d Mode:%i Active:%i)\n", ioapic,
-               mp_ioapics[ioapic].mpc_apicid, pin, entry.vector, irq,
-               edge_level, active_high_low);
-
-       ioapic_register_intr(irq, entry.vector, edge_level);
-
-       if (!ioapic && (irq < 16))
-               disable_8259A_irq(irq);
-
-       spin_lock_irqsave(&ioapic_lock, flags);
-       __ioapic_write_entry(ioapic, pin, entry);
-       spin_unlock_irqrestore(&ioapic_lock, flags);
-
-       return 0;
-}
-
-#endif /* CONFIG_ACPI */
-
-static int __init parse_disable_timer_pin_1(char *arg)
-{
-       disable_timer_pin_1 = 1;
-       return 0;
-}
-early_param("disable_timer_pin_1", parse_disable_timer_pin_1);
-
-static int __init parse_enable_timer_pin_1(char *arg)
-{
-       disable_timer_pin_1 = -1;
-       return 0;
-}
-early_param("enable_timer_pin_1", parse_enable_timer_pin_1);
-
-static int __init parse_noapic(char *arg)
-{
-       /* disable IO-APIC */
-       disable_ioapic_setup();
-       return 0;
-}
-early_param("noapic", parse_noapic);
diff --git a/arch/i386/kernel/ioport_32.c b/arch/i386/kernel/ioport_32.c
deleted file mode 100644 (file)
index 3d310a9..0000000
+++ /dev/null
@@ -1,153 +0,0 @@
-/*
- *     linux/arch/i386/kernel/ioport.c
- *
- * This contains the io-permission bitmap code - written by obz, with changes
- * by Linus.
- */
-
-#include <linux/sched.h>
-#include <linux/kernel.h>
-#include <linux/capability.h>
-#include <linux/errno.h>
-#include <linux/types.h>
-#include <linux/ioport.h>
-#include <linux/smp.h>
-#include <linux/stddef.h>
-#include <linux/slab.h>
-#include <linux/thread_info.h>
-#include <linux/syscalls.h>
-
-/* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */
-static void set_bitmap(unsigned long *bitmap, unsigned int base, unsigned int extent, int new_value)
-{
-       unsigned long mask;
-       unsigned long *bitmap_base = bitmap + (base / BITS_PER_LONG);
-       unsigned int low_index = base & (BITS_PER_LONG-1);
-       int length = low_index + extent;
-
-       if (low_index != 0) {
-               mask = (~0UL << low_index);
-               if (length < BITS_PER_LONG)
-                       mask &= ~(~0UL << length);
-               if (new_value)
-                       *bitmap_base++ |= mask;
-               else
-                       *bitmap_base++ &= ~mask;
-               length -= BITS_PER_LONG;
-       }
-
-       mask = (new_value ? ~0UL : 0UL);
-       while (length >= BITS_PER_LONG) {
-               *bitmap_base++ = mask;
-               length -= BITS_PER_LONG;
-       }
-
-       if (length > 0) {
-               mask = ~(~0UL << length);
-               if (new_value)
-                       *bitmap_base++ |= mask;
-               else
-                       *bitmap_base++ &= ~mask;
-       }
-}
-
-
-/*
- * this changes the io permissions bitmap in the current task.
- */
-asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
-{
-       unsigned long i, max_long, bytes, bytes_updated;
-       struct thread_struct * t = &current->thread;
-       struct tss_struct * tss;
-       unsigned long *bitmap;
-
-       if ((from + num <= from) || (from + num > IO_BITMAP_BITS))
-               return -EINVAL;
-       if (turn_on && !capable(CAP_SYS_RAWIO))
-               return -EPERM;
-
-       /*
-        * If it's the first ioperm() call in this thread's lifetime, set the
-        * IO bitmap up. ioperm() is much less timing critical than clone(),
-        * this is why we delay this operation until now:
-        */
-       if (!t->io_bitmap_ptr) {
-               bitmap = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
-               if (!bitmap)
-                       return -ENOMEM;
-
-               memset(bitmap, 0xff, IO_BITMAP_BYTES);
-               t->io_bitmap_ptr = bitmap;
-               set_thread_flag(TIF_IO_BITMAP);
-       }
-
-       /*
-        * do it in the per-thread copy and in the TSS ...
-        *
-        * Disable preemption via get_cpu() - we must not switch away
-        * because the ->io_bitmap_max value must match the bitmap
-        * contents:
-        */
-       tss = &per_cpu(init_tss, get_cpu());
-
-       set_bitmap(t->io_bitmap_ptr, from, num, !turn_on);
-
-       /*
-        * Search for a (possibly new) maximum. This is simple and stupid,
-        * to keep it obviously correct:
-        */
-       max_long = 0;
-       for (i = 0; i < IO_BITMAP_LONGS; i++)
-               if (t->io_bitmap_ptr[i] != ~0UL)
-                       max_long = i;
-
-       bytes = (max_long + 1) * sizeof(long);
-       bytes_updated = max(bytes, t->io_bitmap_max);
-
-       t->io_bitmap_max = bytes;
-
-       /*
-        * Sets the lazy trigger so that the next I/O operation will
-        * reload the correct bitmap.
-        * Reset the owner so that a process switch will not set
-        * tss->io_bitmap_base to IO_BITMAP_OFFSET.
-        */
-       tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET_LAZY;
-       tss->io_bitmap_owner = NULL;
-
-       put_cpu();
-
-       return 0;
-}
-
-/*
- * sys_iopl has to be used when you want to access the IO ports
- * beyond the 0x3ff range: to get the full 65536 ports bitmapped
- * you'd need 8kB of bitmaps/process, which is a bit excessive.
- *
- * Here we just change the eflags value on the stack: we allow
- * only the super-user to do it. This depends on the stack-layout
- * on system-call entry - see also fork() and the signal handling
- * code.
- */
-
-asmlinkage long sys_iopl(unsigned long unused)
-{
-       volatile struct pt_regs * regs = (struct pt_regs *) &unused;
-       unsigned int level = regs->ebx;
-       unsigned int old = (regs->eflags >> 12) & 3;
-       struct thread_struct *t = &current->thread;
-
-       if (level > 3)
-               return -EINVAL;
-       /* Trying to gain more privileges? */
-       if (level > old) {
-               if (!capable(CAP_SYS_RAWIO))
-                       return -EPERM;
-       }
-       t->iopl = level << 12;
-       regs->eflags = (regs->eflags & ~X86_EFLAGS_IOPL) | t->iopl;
-       set_iopl_mask(t->iopl);
-       return 0;
-}
diff --git a/arch/i386/kernel/irq_32.c b/arch/i386/kernel/irq_32.c
deleted file mode 100644 (file)
index dd2b97f..0000000
+++ /dev/null
@@ -1,343 +0,0 @@
-/*
- *     linux/arch/i386/kernel/irq.c
- *
- *     Copyright (C) 1992, 1998 Linus Torvalds, Ingo Molnar
- *
- * This file contains the lowest level x86-specific interrupt
- * entry, irq-stacks and irq statistics code. All the remaining
- * irq logic is done by the generic kernel/irq/ code and
- * by the x86-specific irq controller code. (e.g. i8259.c and
- * io_apic.c.)
- */
-
-#include <linux/module.h>
-#include <linux/seq_file.h>
-#include <linux/interrupt.h>
-#include <linux/kernel_stat.h>
-#include <linux/notifier.h>
-#include <linux/cpu.h>
-#include <linux/delay.h>
-
-#include <asm/apic.h>
-#include <asm/uaccess.h>
-
-DEFINE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat);
-EXPORT_PER_CPU_SYMBOL(irq_stat);
-
-DEFINE_PER_CPU(struct pt_regs *, irq_regs);
-EXPORT_PER_CPU_SYMBOL(irq_regs);
-
-/*
- * 'what should we do if we get a hw irq event on an illegal vector'.
- * each architecture has to answer this themselves.
- */
-void ack_bad_irq(unsigned int irq)
-{
-       printk(KERN_ERR "unexpected IRQ trap at vector %02x\n", irq);
-
-#ifdef CONFIG_X86_LOCAL_APIC
-       /*
-        * Currently unexpected vectors happen only on SMP and APIC.
-        * We _must_ ack these because every local APIC has only N
-        * irq slots per priority level, and a 'hanging, unacked' IRQ
-        * holds up an irq slot - in excessive cases (when multiple
-        * unexpected vectors occur) that might lock up the APIC
-        * completely.
-        * But only ack when the APIC is enabled -AK
-        */
-       if (cpu_has_apic)
-               ack_APIC_irq();
-#endif
-}
-
-#ifdef CONFIG_4KSTACKS
-/*
- * per-CPU IRQ handling contexts (thread information and stack)
- */
-union irq_ctx {
-       struct thread_info      tinfo;
-       u32                     stack[THREAD_SIZE/sizeof(u32)];
-};
-
-static union irq_ctx *hardirq_ctx[NR_CPUS] __read_mostly;
-static union irq_ctx *softirq_ctx[NR_CPUS] __read_mostly;
-#endif
-
-/*
- * do_IRQ handles all normal device IRQ's (the special
- * SMP cross-CPU interrupts have their own specific
- * handlers).
- */
-fastcall unsigned int do_IRQ(struct pt_regs *regs)
-{      
-       struct pt_regs *old_regs;
-       /* high bit used in ret_from_ code */
-       int irq = ~regs->orig_eax;
-       struct irq_desc *desc = irq_desc + irq;
-#ifdef CONFIG_4KSTACKS
-       union irq_ctx *curctx, *irqctx;
-       u32 *isp;
-#endif
-
-       if (unlikely((unsigned)irq >= NR_IRQS)) {
-               printk(KERN_EMERG "%s: cannot handle IRQ %d\n",
-                                       __FUNCTION__, irq);
-               BUG();
-       }
-
-       old_regs = set_irq_regs(regs);
-       irq_enter();
-#ifdef CONFIG_DEBUG_STACKOVERFLOW
-       /* Debugging check for stack overflow: is there less than 1KB free? */
-       {
-               long esp;
-
-               __asm__ __volatile__("andl %%esp,%0" :
-                                       "=r" (esp) : "0" (THREAD_SIZE - 1));
-               if (unlikely(esp < (sizeof(struct thread_info) + STACK_WARN))) {
-                       printk("do_IRQ: stack overflow: %ld\n",
-                               esp - sizeof(struct thread_info));
-                       dump_stack();
-               }
-       }
-#endif
-
-#ifdef CONFIG_4KSTACKS
-
-       curctx = (union irq_ctx *) current_thread_info();
-       irqctx = hardirq_ctx[smp_processor_id()];
-
-       /*
-        * this is where we switch to the IRQ stack. However, if we are
-        * already using the IRQ stack (because we interrupted a hardirq
-        * handler) we can't do that and just have to keep using the
-        * current stack (which is the irq stack already after all)
-        */
-       if (curctx != irqctx) {
-               int arg1, arg2, ebx;
-
-               /* build the stack frame on the IRQ stack */
-               isp = (u32*) ((char*)irqctx + sizeof(*irqctx));
-               irqctx->tinfo.task = curctx->tinfo.task;
-               irqctx->tinfo.previous_esp = current_stack_pointer;
-
-               /*
-                * Copy the softirq bits in preempt_count so that the
-                * softirq checks work in the hardirq context.
-                */
-               irqctx->tinfo.preempt_count =
-                       (irqctx->tinfo.preempt_count & ~SOFTIRQ_MASK) |
-                       (curctx->tinfo.preempt_count & SOFTIRQ_MASK);
-
-               asm volatile(
-                       "       xchgl  %%ebx,%%esp      \n"
-                       "       call   *%%edi           \n"
-                       "       movl   %%ebx,%%esp      \n"
-                       : "=a" (arg1), "=d" (arg2), "=b" (ebx)
-                       :  "0" (irq),   "1" (desc),  "2" (isp),
-                          "D" (desc->handle_irq)
-                       : "memory", "cc"
-               );
-       } else
-#endif
-               desc->handle_irq(irq, desc);
-
-       irq_exit();
-       set_irq_regs(old_regs);
-       return 1;
-}
-
-#ifdef CONFIG_4KSTACKS
-
-static char softirq_stack[NR_CPUS * THREAD_SIZE]
-               __attribute__((__section__(".bss.page_aligned")));
-
-static char hardirq_stack[NR_CPUS * THREAD_SIZE]
-               __attribute__((__section__(".bss.page_aligned")));
-
-/*
- * allocate per-cpu stacks for hardirq and for softirq processing
- */
-void irq_ctx_init(int cpu)
-{
-       union irq_ctx *irqctx;
-
-       if (hardirq_ctx[cpu])
-               return;
-
-       irqctx = (union irq_ctx*) &hardirq_stack[cpu*THREAD_SIZE];
-       irqctx->tinfo.task              = NULL;
-       irqctx->tinfo.exec_domain       = NULL;
-       irqctx->tinfo.cpu               = cpu;
-       irqctx->tinfo.preempt_count     = HARDIRQ_OFFSET;
-       irqctx->tinfo.addr_limit        = MAKE_MM_SEG(0);
-
-       hardirq_ctx[cpu] = irqctx;
-
-       irqctx = (union irq_ctx*) &softirq_stack[cpu*THREAD_SIZE];
-       irqctx->tinfo.task              = NULL;
-       irqctx->tinfo.exec_domain       = NULL;
-       irqctx->tinfo.cpu               = cpu;
-       irqctx->tinfo.preempt_count     = 0;
-       irqctx->tinfo.addr_limit        = MAKE_MM_SEG(0);
-
-       softirq_ctx[cpu] = irqctx;
-
-       printk("CPU %u irqstacks, hard=%p soft=%p\n",
-               cpu,hardirq_ctx[cpu],softirq_ctx[cpu]);
-}
-
-void irq_ctx_exit(int cpu)
-{
-       hardirq_ctx[cpu] = NULL;
-}
-
-extern asmlinkage void __do_softirq(void);
-
-asmlinkage void do_softirq(void)
-{
-       unsigned long flags;
-       struct thread_info *curctx;
-       union irq_ctx *irqctx;
-       u32 *isp;
-
-       if (in_interrupt())
-               return;
-
-       local_irq_save(flags);
-
-       if (local_softirq_pending()) {
-               curctx = current_thread_info();
-               irqctx = softirq_ctx[smp_processor_id()];
-               irqctx->tinfo.task = curctx->task;
-               irqctx->tinfo.previous_esp = current_stack_pointer;
-
-               /* build the stack frame on the softirq stack */
-               isp = (u32*) ((char*)irqctx + sizeof(*irqctx));
-
-               asm volatile(
-                       "       xchgl   %%ebx,%%esp     \n"
-                       "       call    __do_softirq    \n"
-                       "       movl    %%ebx,%%esp     \n"
-                       : "=b"(isp)
-                       : "0"(isp)
-                       : "memory", "cc", "edx", "ecx", "eax"
-               );
-               /*
-                * Shouldnt happen, we returned above if in_interrupt():
-                */
-               WARN_ON_ONCE(softirq_count());
-       }
-
-       local_irq_restore(flags);
-}
-
-EXPORT_SYMBOL(do_softirq);
-#endif
-
-/*
- * Interrupt statistics:
- */
-
-atomic_t irq_err_count;
-
-/*
- * /proc/interrupts printing:
- */
-
-int show_interrupts(struct seq_file *p, void *v)
-{
-       int i = *(loff_t *) v, j;
-       struct irqaction * action;
-       unsigned long flags;
-
-       if (i == 0) {
-               seq_printf(p, "           ");
-               for_each_online_cpu(j)
-                       seq_printf(p, "CPU%-8d",j);
-               seq_putc(p, '\n');
-       }
-
-       if (i < NR_IRQS) {
-               spin_lock_irqsave(&irq_desc[i].lock, flags);
-               action = irq_desc[i].action;
-               if (!action)
-                       goto skip;
-               seq_printf(p, "%3d: ",i);
-#ifndef CONFIG_SMP
-               seq_printf(p, "%10u ", kstat_irqs(i));
-#else
-               for_each_online_cpu(j)
-                       seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]);
-#endif
-               seq_printf(p, " %8s", irq_desc[i].chip->name);
-               seq_printf(p, "-%-8s", irq_desc[i].name);
-               seq_printf(p, "  %s", action->name);
-
-               for (action=action->next; action; action = action->next)
-                       seq_printf(p, ", %s", action->name);
-
-               seq_putc(p, '\n');
-skip:
-               spin_unlock_irqrestore(&irq_desc[i].lock, flags);
-       } else if (i == NR_IRQS) {
-               seq_printf(p, "NMI: ");
-               for_each_online_cpu(j)
-                       seq_printf(p, "%10u ", nmi_count(j));
-               seq_putc(p, '\n');
-#ifdef CONFIG_X86_LOCAL_APIC
-               seq_printf(p, "LOC: ");
-               for_each_online_cpu(j)
-                       seq_printf(p, "%10u ",
-                               per_cpu(irq_stat,j).apic_timer_irqs);
-               seq_putc(p, '\n');
-#endif
-               seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count));
-#if defined(CONFIG_X86_IO_APIC)
-               seq_printf(p, "MIS: %10u\n", atomic_read(&irq_mis_count));
-#endif
-       }
-       return 0;
-}
-
-#ifdef CONFIG_HOTPLUG_CPU
-#include <mach_apic.h>
-
-void fixup_irqs(cpumask_t map)
-{
-       unsigned int irq;
-       static int warned;
-
-       for (irq = 0; irq < NR_IRQS; irq++) {
-               cpumask_t mask;
-               if (irq == 2)
-                       continue;
-
-               cpus_and(mask, irq_desc[irq].affinity, map);
-               if (any_online_cpu(mask) == NR_CPUS) {
-                       printk("Breaking affinity for irq %i\n", irq);
-                       mask = map;
-               }
-               if (irq_desc[irq].chip->set_affinity)
-                       irq_desc[irq].chip->set_affinity(irq, mask);
-               else if (irq_desc[irq].action && !(warned++))
-                       printk("Cannot set affinity for irq %i\n", irq);
-       }
-
-#if 0
-       barrier();
-       /* Ingo Molnar says: "after the IO-APIC masks have been redirected
-          [note the nop - the interrupt-enable boundary on x86 is two
-          instructions from sti] - to flush out pending hardirqs and
-          IPIs. After this point nothing is supposed to reach this CPU." */
-       __asm__ __volatile__("sti; nop; cli");
-       barrier();
-#else
-       /* That doesn't seem sufficient.  Give it 1ms. */
-       local_irq_enable();
-       mdelay(1);
-       local_irq_disable();
-#endif
-}
-#endif
-
diff --git a/arch/i386/kernel/kprobes_32.c b/arch/i386/kernel/kprobes_32.c
deleted file mode 100644 (file)
index 448a50b..0000000
+++ /dev/null
@@ -1,751 +0,0 @@
-/*
- *  Kernel Probes (KProbes)
- *  arch/i386/kernel/kprobes.c
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
- *
- * Copyright (C) IBM Corporation, 2002, 2004
- *
- * 2002-Oct    Created by Vamsi Krishna S <vamsi_krishna@in.ibm.com> Kernel
- *             Probes initial implementation ( includes contributions from
- *             Rusty Russell).
- * 2004-July   Suparna Bhattacharya <suparna@in.ibm.com> added jumper probes
- *             interface to access function arguments.
- * 2005-May    Hien Nguyen <hien@us.ibm.com>, Jim Keniston
- *             <jkenisto@us.ibm.com> and Prasanna S Panchamukhi
- *             <prasanna@in.ibm.com> added function-return probes.
- */
-
-#include <linux/kprobes.h>
-#include <linux/ptrace.h>
-#include <linux/preempt.h>
-#include <linux/kdebug.h>
-#include <asm/cacheflush.h>
-#include <asm/desc.h>
-#include <asm/uaccess.h>
-#include <asm/alternative.h>
-
-void jprobe_return_end(void);
-
-DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL;
-DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk);
-
-/* insert a jmp code */
-static __always_inline void set_jmp_op(void *from, void *to)
-{
-       struct __arch_jmp_op {
-               char op;
-               long raddr;
-       } __attribute__((packed)) *jop;
-       jop = (struct __arch_jmp_op *)from;
-       jop->raddr = (long)(to) - ((long)(from) + 5);
-       jop->op = RELATIVEJUMP_INSTRUCTION;
-}
-
-/*
- * returns non-zero if opcodes can be boosted.
- */
-static __always_inline int can_boost(kprobe_opcode_t *opcodes)
-{
-#define W(row,b0,b1,b2,b3,b4,b5,b6,b7,b8,b9,ba,bb,bc,bd,be,bf)               \
-       (((b0##UL << 0x0)|(b1##UL << 0x1)|(b2##UL << 0x2)|(b3##UL << 0x3) |   \
-         (b4##UL << 0x4)|(b5##UL << 0x5)|(b6##UL << 0x6)|(b7##UL << 0x7) |   \
-         (b8##UL << 0x8)|(b9##UL << 0x9)|(ba##UL << 0xa)|(bb##UL << 0xb) |   \
-         (bc##UL << 0xc)|(bd##UL << 0xd)|(be##UL << 0xe)|(bf##UL << 0xf))    \
-        << (row % 32))
-       /*
-        * Undefined/reserved opcodes, conditional jump, Opcode Extension
-        * Groups, and some special opcodes can not be boost.
-        */
-       static const unsigned long twobyte_is_boostable[256 / 32] = {
-               /*      0 1 2 3 4 5 6 7 8 9 a b c d e f         */
-               /*      -------------------------------         */
-               W(0x00, 0,0,1,1,0,0,1,0,1,1,0,0,0,0,0,0)| /* 00 */
-               W(0x10, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0), /* 10 */
-               W(0x20, 1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0)| /* 20 */
-               W(0x30, 0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0), /* 30 */
-               W(0x40, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 40 */
-               W(0x50, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0), /* 50 */
-               W(0x60, 1,1,1,1,1,1,1,1,1,1,1,1,0,0,1,1)| /* 60 */
-               W(0x70, 0,0,0,0,1,1,1,1,0,0,0,0,0,0,1,1), /* 70 */
-               W(0x80, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* 80 */
-               W(0x90, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1), /* 90 */
-               W(0xa0, 1,1,0,1,1,1,0,0,1,1,0,1,1,1,0,1)| /* a0 */
-               W(0xb0, 1,1,1,1,1,1,1,1,0,0,0,1,1,1,1,1), /* b0 */
-               W(0xc0, 1,1,0,0,0,0,0,0,1,1,1,1,1,1,1,1)| /* c0 */
-               W(0xd0, 0,1,1,1,0,1,0,0,1,1,0,1,1,1,0,1), /* d0 */
-               W(0xe0, 0,1,1,0,0,1,0,0,1,1,0,1,1,1,0,1)| /* e0 */
-               W(0xf0, 0,1,1,1,0,1,0,0,1,1,1,0,1,1,1,0)  /* f0 */
-               /*      -------------------------------         */
-               /*      0 1 2 3 4 5 6 7 8 9 a b c d e f         */
-       };
-#undef W
-       kprobe_opcode_t opcode;
-       kprobe_opcode_t *orig_opcodes = opcodes;
-retry:
-       if (opcodes - orig_opcodes > MAX_INSN_SIZE - 1)
-               return 0;
-       opcode = *(opcodes++);
-
-       /* 2nd-byte opcode */
-       if (opcode == 0x0f) {
-               if (opcodes - orig_opcodes > MAX_INSN_SIZE - 1)
-                       return 0;
-               return test_bit(*opcodes, twobyte_is_boostable);
-       }
-
-       switch (opcode & 0xf0) {
-       case 0x60:
-               if (0x63 < opcode && opcode < 0x67)
-                       goto retry; /* prefixes */
-               /* can't boost Address-size override and bound */
-               return (opcode != 0x62 && opcode != 0x67);
-       case 0x70:
-               return 0; /* can't boost conditional jump */
-       case 0xc0:
-               /* can't boost software-interruptions */
-               return (0xc1 < opcode && opcode < 0xcc) || opcode == 0xcf;
-       case 0xd0:
-               /* can boost AA* and XLAT */
-               return (opcode == 0xd4 || opcode == 0xd5 || opcode == 0xd7);
-       case 0xe0:
-               /* can boost in/out and absolute jmps */
-               return ((opcode & 0x04) || opcode == 0xea);
-       case 0xf0:
-               if ((opcode & 0x0c) == 0 && opcode != 0xf1)
-                       goto retry; /* lock/rep(ne) prefix */
-               /* clear and set flags can be boost */
-               return (opcode == 0xf5 || (0xf7 < opcode && opcode < 0xfe));
-       default:
-               if (opcode == 0x26 || opcode == 0x36 || opcode == 0x3e)
-                       goto retry; /* prefixes */
-               /* can't boost CS override and call */
-               return (opcode != 0x2e && opcode != 0x9a);
-       }
-}
-
-/*
- * returns non-zero if opcode modifies the interrupt flag.
- */
-static int __kprobes is_IF_modifier(kprobe_opcode_t opcode)
-{
-       switch (opcode) {
-       case 0xfa:              /* cli */
-       case 0xfb:              /* sti */
-       case 0xcf:              /* iret/iretd */
-       case 0x9d:              /* popf/popfd */
-               return 1;
-       }
-       return 0;
-}
-
-int __kprobes arch_prepare_kprobe(struct kprobe *p)
-{
-       /* insn: must be on special executable page on i386. */
-       p->ainsn.insn = get_insn_slot();
-       if (!p->ainsn.insn)
-               return -ENOMEM;
-
-       memcpy(p->ainsn.insn, p->addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t));
-       p->opcode = *p->addr;
-       if (can_boost(p->addr)) {
-               p->ainsn.boostable = 0;
-       } else {
-               p->ainsn.boostable = -1;
-       }
-       return 0;
-}
-
-void __kprobes arch_arm_kprobe(struct kprobe *p)
-{
-       text_poke(p->addr, ((unsigned char []){BREAKPOINT_INSTRUCTION}), 1);
-}
-
-void __kprobes arch_disarm_kprobe(struct kprobe *p)
-{
-       text_poke(p->addr, &p->opcode, 1);
-}
-
-void __kprobes arch_remove_kprobe(struct kprobe *p)
-{
-       mutex_lock(&kprobe_mutex);
-       free_insn_slot(p->ainsn.insn, (p->ainsn.boostable == 1));
-       mutex_unlock(&kprobe_mutex);
-}
-
-static void __kprobes save_previous_kprobe(struct kprobe_ctlblk *kcb)
-{
-       kcb->prev_kprobe.kp = kprobe_running();
-       kcb->prev_kprobe.status = kcb->kprobe_status;
-       kcb->prev_kprobe.old_eflags = kcb->kprobe_old_eflags;
-       kcb->prev_kprobe.saved_eflags = kcb->kprobe_saved_eflags;
-}
-
-static void __kprobes restore_previous_kprobe(struct kprobe_ctlblk *kcb)
-{
-       __get_cpu_var(current_kprobe) = kcb->prev_kprobe.kp;
-       kcb->kprobe_status = kcb->prev_kprobe.status;
-       kcb->kprobe_old_eflags = kcb->prev_kprobe.old_eflags;
-       kcb->kprobe_saved_eflags = kcb->prev_kprobe.saved_eflags;
-}
-
-static void __kprobes set_current_kprobe(struct kprobe *p, struct pt_regs *regs,
-                               struct kprobe_ctlblk *kcb)
-{
-       __get_cpu_var(current_kprobe) = p;
-       kcb->kprobe_saved_eflags = kcb->kprobe_old_eflags
-               = (regs->eflags & (TF_MASK | IF_MASK));
-       if (is_IF_modifier(p->opcode))
-               kcb->kprobe_saved_eflags &= ~IF_MASK;
-}
-
-static void __kprobes prepare_singlestep(struct kprobe *p, struct pt_regs *regs)
-{
-       regs->eflags |= TF_MASK;
-       regs->eflags &= ~IF_MASK;
-       /*single step inline if the instruction is an int3*/
-       if (p->opcode == BREAKPOINT_INSTRUCTION)
-               regs->eip = (unsigned long)p->addr;
-       else
-               regs->eip = (unsigned long)p->ainsn.insn;
-}
-
-/* Called with kretprobe_lock held */
-void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri,
-                                     struct pt_regs *regs)
-{
-       unsigned long *sara = (unsigned long *)&regs->esp;
-
-       ri->ret_addr = (kprobe_opcode_t *) *sara;
-
-       /* Replace the return addr with trampoline addr */
-       *sara = (unsigned long) &kretprobe_trampoline;
-}
-
-/*
- * Interrupts are disabled on entry as trap3 is an interrupt gate and they
- * remain disabled thorough out this function.
- */
-static int __kprobes kprobe_handler(struct pt_regs *regs)
-{
-       struct kprobe *p;
-       int ret = 0;
-       kprobe_opcode_t *addr;
-       struct kprobe_ctlblk *kcb;
-
-       addr = (kprobe_opcode_t *)(regs->eip - sizeof(kprobe_opcode_t));
-
-       /*
-        * We don't want to be preempted for the entire
-        * duration of kprobe processing
-        */
-       preempt_disable();
-       kcb = get_kprobe_ctlblk();
-
-       /* Check we're not actually recursing */
-       if (kprobe_running()) {
-               p = get_kprobe(addr);
-               if (p) {
-                       if (kcb->kprobe_status == KPROBE_HIT_SS &&
-                               *p->ainsn.insn == BREAKPOINT_INSTRUCTION) {
-                               regs->eflags &= ~TF_MASK;
-                               regs->eflags |= kcb->kprobe_saved_eflags;
-                               goto no_kprobe;
-                       }
-                       /* We have reentered the kprobe_handler(), since
-                        * another probe was hit while within the handler.
-                        * We here save the original kprobes variables and
-                        * just single step on the instruction of the new probe
-                        * without calling any user handlers.
-                        */
-                       save_previous_kprobe(kcb);
-                       set_current_kprobe(p, regs, kcb);
-                       kprobes_inc_nmissed_count(p);
-                       prepare_singlestep(p, regs);
-                       kcb->kprobe_status = KPROBE_REENTER;
-                       return 1;
-               } else {
-                       if (*addr != BREAKPOINT_INSTRUCTION) {
-                       /* The breakpoint instruction was removed by
-                        * another cpu right after we hit, no further
-                        * handling of this interrupt is appropriate
-                        */
-                               regs->eip -= sizeof(kprobe_opcode_t);
-                               ret = 1;
-                               goto no_kprobe;
-                       }
-                       p = __get_cpu_var(current_kprobe);
-                       if (p->break_handler && p->break_handler(p, regs)) {
-                               goto ss_probe;
-                       }
-               }
-               goto no_kprobe;
-       }
-
-       p = get_kprobe(addr);
-       if (!p) {
-               if (*addr != BREAKPOINT_INSTRUCTION) {
-                       /*
-                        * The breakpoint instruction was removed right
-                        * after we hit it.  Another cpu has removed
-                        * either a probepoint or a debugger breakpoint
-                        * at this address.  In either case, no further
-                        * handling of this interrupt is appropriate.
-                        * Back up over the (now missing) int3 and run
-                        * the original instruction.
-                        */
-                       regs->eip -= sizeof(kprobe_opcode_t);
-                       ret = 1;
-               }
-               /* Not one of ours: let kernel handle it */
-               goto no_kprobe;
-       }
-
-       set_current_kprobe(p, regs, kcb);
-       kcb->kprobe_status = KPROBE_HIT_ACTIVE;
-
-       if (p->pre_handler && p->pre_handler(p, regs))
-               /* handler has already set things up, so skip ss setup */
-               return 1;
-
-ss_probe:
-#if !defined(CONFIG_PREEMPT) || defined(CONFIG_PM)
-       if (p->ainsn.boostable == 1 && !p->post_handler){
-               /* Boost up -- we can execute copied instructions directly */
-               reset_current_kprobe();
-               regs->eip = (unsigned long)p->ainsn.insn;
-               preempt_enable_no_resched();
-               return 1;
-       }
-#endif
-       prepare_singlestep(p, regs);
-       kcb->kprobe_status = KPROBE_HIT_SS;
-       return 1;
-
-no_kprobe:
-       preempt_enable_no_resched();
-       return ret;
-}
-
-/*
- * For function-return probes, init_kprobes() establishes a probepoint
- * here. When a retprobed function returns, this probe is hit and
- * trampoline_probe_handler() runs, calling the kretprobe's handler.
- */
- void __kprobes kretprobe_trampoline_holder(void)
- {
-       asm volatile ( ".global kretprobe_trampoline\n"
-                       "kretprobe_trampoline: \n"
-                       "       pushf\n"
-                       /* skip cs, eip, orig_eax */
-                       "       subl $12, %esp\n"
-                       "       pushl %fs\n"
-                       "       pushl %ds\n"
-                       "       pushl %es\n"
-                       "       pushl %eax\n"
-                       "       pushl %ebp\n"
-                       "       pushl %edi\n"
-                       "       pushl %esi\n"
-                       "       pushl %edx\n"
-                       "       pushl %ecx\n"
-                       "       pushl %ebx\n"
-                       "       movl %esp, %eax\n"
-                       "       call trampoline_handler\n"
-                       /* move eflags to cs */
-                       "       movl 52(%esp), %edx\n"
-                       "       movl %edx, 48(%esp)\n"
-                       /* save true return address on eflags */
-                       "       movl %eax, 52(%esp)\n"
-                       "       popl %ebx\n"
-                       "       popl %ecx\n"
-                       "       popl %edx\n"
-                       "       popl %esi\n"
-                       "       popl %edi\n"
-                       "       popl %ebp\n"
-                       "       popl %eax\n"
-                       /* skip eip, orig_eax, es, ds, fs */
-                       "       addl $20, %esp\n"
-                       "       popf\n"
-                       "       ret\n");
-}
-
-/*
- * Called from kretprobe_trampoline
- */
-fastcall void *__kprobes trampoline_handler(struct pt_regs *regs)
-{
-       struct kretprobe_instance *ri = NULL;
-       struct hlist_head *head, empty_rp;
-       struct hlist_node *node, *tmp;
-       unsigned long flags, orig_ret_address = 0;
-       unsigned long trampoline_address =(unsigned long)&kretprobe_trampoline;
-
-       INIT_HLIST_HEAD(&empty_rp);
-       spin_lock_irqsave(&kretprobe_lock, flags);
-       head = kretprobe_inst_table_head(current);
-       /* fixup registers */
-       regs->xcs = __KERNEL_CS | get_kernel_rpl();
-       regs->eip = trampoline_address;
-       regs->orig_eax = 0xffffffff;
-
-       /*
-        * It is possible to have multiple instances associated with a given
-        * task either because an multiple functions in the call path
-        * have a return probe installed on them, and/or more then one return
-        * return probe was registered for a target function.
-        *
-        * We can handle this because:
-        *     - instances are always inserted at the head of the list
-        *     - when multiple return probes are registered for the same
-        *       function, the first instance's ret_addr will point to the
-        *       real return address, and all the rest will point to
-        *       kretprobe_trampoline
-        */
-       hlist_for_each_entry_safe(ri, node, tmp, head, hlist) {
-               if (ri->task != current)
-                       /* another task is sharing our hash bucket */
-                       continue;
-
-               if (ri->rp && ri->rp->handler){
-                       __get_cpu_var(current_kprobe) = &ri->rp->kp;
-                       get_kprobe_ctlblk()->kprobe_status = KPROBE_HIT_ACTIVE;
-                       ri->rp->handler(ri, regs);
-                       __get_cpu_var(current_kprobe) = NULL;
-               }
-
-               orig_ret_address = (unsigned long)ri->ret_addr;
-               recycle_rp_inst(ri, &empty_rp);
-
-               if (orig_ret_address != trampoline_address)
-                       /*
-                        * This is the real return address. Any other
-                        * instances associated with this task are for
-                        * other calls deeper on the call stack
-                        */
-                       break;
-       }
-
-       kretprobe_assert(ri, orig_ret_address, trampoline_address);
-       spin_unlock_irqrestore(&kretprobe_lock, flags);
-
-       hlist_for_each_entry_safe(ri, node, tmp, &empty_rp, hlist) {
-               hlist_del(&ri->hlist);
-               kfree(ri);
-       }
-       return (void*)orig_ret_address;
-}
-
-/*
- * Called after single-stepping.  p->addr is the address of the
- * instruction whose first byte has been replaced by the "int 3"
- * instruction.  To avoid the SMP problems that can occur when we
- * temporarily put back the original opcode to single-step, we
- * single-stepped a copy of the instruction.  The address of this
- * copy is p->ainsn.insn.
- *
- * This function prepares to return from the post-single-step
- * interrupt.  We have to fix up the stack as follows:
- *
- * 0) Except in the case of absolute or indirect jump or call instructions,
- * the new eip is relative to the copied instruction.  We need to make
- * it relative to the original instruction.
- *
- * 1) If the single-stepped instruction was pushfl, then the TF and IF
- * flags are set in the just-pushed eflags, and may need to be cleared.
- *
- * 2) If the single-stepped instruction was a call, the return address
- * that is atop the stack is the address following the copied instruction.
- * We need to make it the address following the original instruction.
- *
- * This function also checks instruction size for preparing direct execution.
- */
-static void __kprobes resume_execution(struct kprobe *p,
-               struct pt_regs *regs, struct kprobe_ctlblk *kcb)
-{
-       unsigned long *tos = (unsigned long *)&regs->esp;
-       unsigned long copy_eip = (unsigned long)p->ainsn.insn;
-       unsigned long orig_eip = (unsigned long)p->addr;
-
-       regs->eflags &= ~TF_MASK;
-       switch (p->ainsn.insn[0]) {
-       case 0x9c:              /* pushfl */
-               *tos &= ~(TF_MASK | IF_MASK);
-               *tos |= kcb->kprobe_old_eflags;
-               break;
-       case 0xc2:              /* iret/ret/lret */
-       case 0xc3:
-       case 0xca:
-       case 0xcb:
-       case 0xcf:
-       case 0xea:              /* jmp absolute -- eip is correct */
-               /* eip is already adjusted, no more changes required */
-               p->ainsn.boostable = 1;
-               goto no_change;
-       case 0xe8:              /* call relative - Fix return addr */
-               *tos = orig_eip + (*tos - copy_eip);
-               break;
-       case 0x9a:              /* call absolute -- same as call absolute, indirect */
-               *tos = orig_eip + (*tos - copy_eip);
-               goto no_change;
-       case 0xff:
-               if ((p->ainsn.insn[1] & 0x30) == 0x10) {
-                       /*
-                        * call absolute, indirect
-                        * Fix return addr; eip is correct.
-                        * But this is not boostable
-                        */
-                       *tos = orig_eip + (*tos - copy_eip);
-                       goto no_change;
-               } else if (((p->ainsn.insn[1] & 0x31) == 0x20) ||       /* jmp near, absolute indirect */
-                          ((p->ainsn.insn[1] & 0x31) == 0x21)) {       /* jmp far, absolute indirect */
-                       /* eip is correct. And this is boostable */
-                       p->ainsn.boostable = 1;
-                       goto no_change;
-               }
-       default:
-               break;
-       }
-
-       if (p->ainsn.boostable == 0) {
-               if ((regs->eip > copy_eip) &&
-                   (regs->eip - copy_eip) + 5 < MAX_INSN_SIZE) {
-                       /*
-                        * These instructions can be executed directly if it
-                        * jumps back to correct address.
-                        */
-                       set_jmp_op((void *)regs->eip,
-                                  (void *)orig_eip + (regs->eip - copy_eip));
-                       p->ainsn.boostable = 1;
-               } else {
-                       p->ainsn.boostable = -1;
-               }
-       }
-
-       regs->eip = orig_eip + (regs->eip - copy_eip);
-
-no_change:
-       return;
-}
-
-/*
- * Interrupts are disabled on entry as trap1 is an interrupt gate and they
- * remain disabled thoroughout this function.
- */
-static int __kprobes post_kprobe_handler(struct pt_regs *regs)
-{
-       struct kprobe *cur = kprobe_running();
-       struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
-
-       if (!cur)
-               return 0;
-
-       if ((kcb->kprobe_status != KPROBE_REENTER) && cur->post_handler) {
-               kcb->kprobe_status = KPROBE_HIT_SSDONE;
-               cur->post_handler(cur, regs, 0);
-       }
-
-       resume_execution(cur, regs, kcb);
-       regs->eflags |= kcb->kprobe_saved_eflags;
-
-       /*Restore back the original saved kprobes variables and continue. */
-       if (kcb->kprobe_status == KPROBE_REENTER) {
-               restore_previous_kprobe(kcb);
-               goto out;
-       }
-       reset_current_kprobe();
-out:
-       preempt_enable_no_resched();
-
-       /*
-        * if somebody else is singlestepping across a probe point, eflags
-        * will have TF set, in which case, continue the remaining processing
-        * of do_debug, as if this is not a probe hit.
-        */
-       if (regs->eflags & TF_MASK)
-               return 0;
-
-       return 1;
-}
-
-static int __kprobes kprobe_fault_handler(struct pt_regs *regs, int trapnr)
-{
-       struct kprobe *cur = kprobe_running();
-       struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
-
-       switch(kcb->kprobe_status) {
-       case KPROBE_HIT_SS:
-       case KPROBE_REENTER:
-               /*
-                * We are here because the instruction being single
-                * stepped caused a page fault. We reset the current
-                * kprobe and the eip points back to the probe address
-                * and allow the page fault handler to continue as a
-                * normal page fault.
-                */
-               regs->eip = (unsigned long)cur->addr;
-               regs->eflags |= kcb->kprobe_old_eflags;
-               if (kcb->kprobe_status == KPROBE_REENTER)
-                       restore_previous_kprobe(kcb);
-               else
-                       reset_current_kprobe();
-               preempt_enable_no_resched();
-               break;
-       case KPROBE_HIT_ACTIVE:
-       case KPROBE_HIT_SSDONE:
-               /*
-                * We increment the nmissed count for accounting,
-                * we can also use npre/npostfault count for accouting
-                * these specific fault cases.
-                */
-               kprobes_inc_nmissed_count(cur);
-
-               /*
-                * We come here because instructions in the pre/post
-                * handler caused the page_fault, this could happen
-                * if handler tries to access user space by
-                * copy_from_user(), get_user() etc. Let the
-                * user-specified handler try to fix it first.
-                */
-               if (cur->fault_handler && cur->fault_handler(cur, regs, trapnr))
-                       return 1;
-
-               /*
-                * In case the user-specified fault handler returned
-                * zero, try to fix up.
-                */
-               if (fixup_exception(regs))
-                       return 1;
-
-               /*
-                * fixup_exception() could not handle it,
-                * Let do_page_fault() fix it.
-                */
-               break;
-       default:
-               break;
-       }
-       return 0;
-}
-
-/*
- * Wrapper routine to for handling exceptions.
- */
-int __kprobes kprobe_exceptions_notify(struct notifier_block *self,
-                                      unsigned long val, void *data)
-{
-       struct die_args *args = (struct die_args *)data;
-       int ret = NOTIFY_DONE;
-
-       if (args->regs && user_mode_vm(args->regs))
-               return ret;
-
-       switch (val) {
-       case DIE_INT3:
-               if (kprobe_handler(args->regs))
-                       ret = NOTIFY_STOP;
-               break;
-       case DIE_DEBUG:
-               if (post_kprobe_handler(args->regs))
-                       ret = NOTIFY_STOP;
-               break;
-       case DIE_GPF:
-       case DIE_PAGE_FAULT:
-               /* kprobe_running() needs smp_processor_id() */
-               preempt_disable();
-               if (kprobe_running() &&
-                   kprobe_fault_handler(args->regs, args->trapnr))
-                       ret = NOTIFY_STOP;
-               preempt_enable();
-               break;
-       default:
-               break;
-       }
-       return ret;
-}
-
-int __kprobes setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs)
-{
-       struct jprobe *jp = container_of(p, struct jprobe, kp);
-       unsigned long addr;
-       struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
-
-       kcb->jprobe_saved_regs = *regs;
-       kcb->jprobe_saved_esp = &regs->esp;
-       addr = (unsigned long)(kcb->jprobe_saved_esp);
-
-       /*
-        * TBD: As Linus pointed out, gcc assumes that the callee
-        * owns the argument space and could overwrite it, e.g.
-        * tailcall optimization. So, to be absolutely safe
-        * we also save and restore enough stack bytes to cover
-        * the argument area.
-        */
-       memcpy(kcb->jprobes_stack, (kprobe_opcode_t *)addr,
-                       MIN_STACK_SIZE(addr));
-       regs->eflags &= ~IF_MASK;
-       regs->eip = (unsigned long)(jp->entry);
-       return 1;
-}
-
-void __kprobes jprobe_return(void)
-{
-       struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
-
-       asm volatile ("       xchgl   %%ebx,%%esp     \n"
-                     "       int3                      \n"
-                     "       .globl jprobe_return_end  \n"
-                     "       jprobe_return_end:        \n"
-                     "       nop                       \n"::"b"
-                     (kcb->jprobe_saved_esp):"memory");
-}
-
-int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs)
-{
-       struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
-       u8 *addr = (u8 *) (regs->eip - 1);
-       unsigned long stack_addr = (unsigned long)(kcb->jprobe_saved_esp);
-       struct jprobe *jp = container_of(p, struct jprobe, kp);
-
-       if ((addr > (u8 *) jprobe_return) && (addr < (u8 *) jprobe_return_end)) {
-               if (&regs->esp != kcb->jprobe_saved_esp) {
-                       struct pt_regs *saved_regs =
-                           container_of(kcb->jprobe_saved_esp,
-                                           struct pt_regs, esp);
-                       printk("current esp %p does not match saved esp %p\n",
-                              &regs->esp, kcb->jprobe_saved_esp);
-                       printk("Saved registers for jprobe %p\n", jp);
-                       show_registers(saved_regs);
-                       printk("Current registers\n");
-                       show_registers(regs);
-                       BUG();
-               }
-               *regs = kcb->jprobe_saved_regs;
-               memcpy((kprobe_opcode_t *) stack_addr, kcb->jprobes_stack,
-                      MIN_STACK_SIZE(stack_addr));
-               preempt_enable_no_resched();
-               return 1;
-       }
-       return 0;
-}
-
-int __kprobes arch_trampoline_kprobe(struct kprobe *p)
-{
-       return 0;
-}
-
-int __init arch_init_kprobes(void)
-{
-       return 0;
-}
diff --git a/arch/i386/kernel/ldt_32.c b/arch/i386/kernel/ldt_32.c
deleted file mode 100644 (file)
index e0b2d17..0000000
+++ /dev/null
@@ -1,250 +0,0 @@
-/*
- * linux/arch/i386/kernel/ldt.c
- *
- * Copyright (C) 1992 Krishna Balasubramanian and Linus Torvalds
- * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
- */
-
-#include <linux/errno.h>
-#include <linux/sched.h>
-#include <linux/string.h>
-#include <linux/mm.h>
-#include <linux/smp.h>
-#include <linux/vmalloc.h>
-#include <linux/slab.h>
-
-#include <asm/uaccess.h>
-#include <asm/system.h>
-#include <asm/ldt.h>
-#include <asm/desc.h>
-#include <asm/mmu_context.h>
-
-#ifdef CONFIG_SMP /* avoids "defined but not used" warnig */
-static void flush_ldt(void *null)
-{
-       if (current->active_mm)
-               load_LDT(&current->active_mm->context);
-}
-#endif
-
-static int alloc_ldt(mm_context_t *pc, int mincount, int reload)
-{
-       void *oldldt;
-       void *newldt;
-       int oldsize;
-
-       if (mincount <= pc->size)
-               return 0;
-       oldsize = pc->size;
-       mincount = (mincount+511)&(~511);
-       if (mincount*LDT_ENTRY_SIZE > PAGE_SIZE)
-               newldt = vmalloc(mincount*LDT_ENTRY_SIZE);
-       else
-               newldt = kmalloc(mincount*LDT_ENTRY_SIZE, GFP_KERNEL);
-
-       if (!newldt)
-               return -ENOMEM;
-
-       if (oldsize)
-               memcpy(newldt, pc->ldt, oldsize*LDT_ENTRY_SIZE);
-       oldldt = pc->ldt;
-       memset(newldt+oldsize*LDT_ENTRY_SIZE, 0, (mincount-oldsize)*LDT_ENTRY_SIZE);
-       pc->ldt = newldt;
-       wmb();
-       pc->size = mincount;
-       wmb();
-
-       if (reload) {
-#ifdef CONFIG_SMP
-               cpumask_t mask;
-               preempt_disable();
-               load_LDT(pc);
-               mask = cpumask_of_cpu(smp_processor_id());
-               if (!cpus_equal(current->mm->cpu_vm_mask, mask))
-                       smp_call_function(flush_ldt, NULL, 1, 1);
-               preempt_enable();
-#else
-               load_LDT(pc);
-#endif
-       }
-       if (oldsize) {
-               if (oldsize*LDT_ENTRY_SIZE > PAGE_SIZE)
-                       vfree(oldldt);
-               else
-                       kfree(oldldt);
-       }
-       return 0;
-}
-
-static inline int copy_ldt(mm_context_t *new, mm_context_t *old)
-{
-       int err = alloc_ldt(new, old->size, 0);
-       if (err < 0)
-               return err;
-       memcpy(new->ldt, old->ldt, old->size*LDT_ENTRY_SIZE);
-       return 0;
-}
-
-/*
- * we do not have to muck with descriptors here, that is
- * done in switch_mm() as needed.
- */
-int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
-{
-       struct mm_struct * old_mm;
-       int retval = 0;
-
-       init_MUTEX(&mm->context.sem);
-       mm->context.size = 0;
-       old_mm = current->mm;
-       if (old_mm && old_mm->context.size > 0) {
-               down(&old_mm->context.sem);
-               retval = copy_ldt(&mm->context, &old_mm->context);
-               up(&old_mm->context.sem);
-       }
-       return retval;
-}
-
-/*
- * No need to lock the MM as we are the last user
- */
-void destroy_context(struct mm_struct *mm)
-{
-       if (mm->context.size) {
-               if (mm == current->active_mm)
-                       clear_LDT();
-               if (mm->context.size*LDT_ENTRY_SIZE > PAGE_SIZE)
-                       vfree(mm->context.ldt);
-               else
-                       kfree(mm->context.ldt);
-               mm->context.size = 0;
-       }
-}
-
-static int read_ldt(void __user * ptr, unsigned long bytecount)
-{
-       int err;
-       unsigned long size;
-       struct mm_struct * mm = current->mm;
-
-       if (!mm->context.size)
-               return 0;
-       if (bytecount > LDT_ENTRY_SIZE*LDT_ENTRIES)
-               bytecount = LDT_ENTRY_SIZE*LDT_ENTRIES;
-
-       down(&mm->context.sem);
-       size = mm->context.size*LDT_ENTRY_SIZE;
-       if (size > bytecount)
-               size = bytecount;
-
-       err = 0;
-       if (copy_to_user(ptr, mm->context.ldt, size))
-               err = -EFAULT;
-       up(&mm->context.sem);
-       if (err < 0)
-               goto error_return;
-       if (size != bytecount) {
-               /* zero-fill the rest */
-               if (clear_user(ptr+size, bytecount-size) != 0) {
-                       err = -EFAULT;
-                       goto error_return;
-               }
-       }
-       return bytecount;
-error_return:
-       return err;
-}
-
-static int read_default_ldt(void __user * ptr, unsigned long bytecount)
-{
-       int err;
-       unsigned long size;
-
-       err = 0;
-       size = 5*sizeof(struct desc_struct);
-       if (size > bytecount)
-               size = bytecount;
-
-       err = size;
-       if (clear_user(ptr, size))
-               err = -EFAULT;
-
-       return err;
-}
-
-static int write_ldt(void __user * ptr, unsigned long bytecount, int oldmode)
-{
-       struct mm_struct * mm = current->mm;
-       __u32 entry_1, entry_2;
-       int error;
-       struct user_desc ldt_info;
-
-       error = -EINVAL;
-       if (bytecount != sizeof(ldt_info))
-               goto out;
-       error = -EFAULT;        
-       if (copy_from_user(&ldt_info, ptr, sizeof(ldt_info)))
-               goto out;
-
-       error = -EINVAL;
-       if (ldt_info.entry_number >= LDT_ENTRIES)
-               goto out;
-       if (ldt_info.contents == 3) {
-               if (oldmode)
-                       goto out;
-               if (ldt_info.seg_not_present == 0)
-                       goto out;
-       }
-
-       down(&mm->context.sem);
-       if (ldt_info.entry_number >= mm->context.size) {
-               error = alloc_ldt(&current->mm->context, ldt_info.entry_number+1, 1);
-               if (error < 0)
-                       goto out_unlock;
-       }
-
-       /* Allow LDTs to be cleared by the user. */
-       if (ldt_info.base_addr == 0 && ldt_info.limit == 0) {
-               if (oldmode || LDT_empty(&ldt_info)) {
-                       entry_1 = 0;
-                       entry_2 = 0;
-                       goto install;
-               }
-       }
-
-       entry_1 = LDT_entry_a(&ldt_info);
-       entry_2 = LDT_entry_b(&ldt_info);
-       if (oldmode)
-               entry_2 &= ~(1 << 20);
-
-       /* Install the new entry ...  */
-install:
-       write_ldt_entry(mm->context.ldt, ldt_info.entry_number, entry_1, entry_2);
-       error = 0;
-
-out_unlock:
-       up(&mm->context.sem);
-out:
-       return error;
-}
-
-asmlinkage int sys_modify_ldt(int func, void __user *ptr, unsigned long bytecount)
-{
-       int ret = -ENOSYS;
-
-       switch (func) {
-       case 0:
-               ret = read_ldt(ptr, bytecount);
-               break;
-       case 1:
-               ret = write_ldt(ptr, bytecount, 1);
-               break;
-       case 2:
-               ret = read_default_ldt(ptr, bytecount);
-               break;
-       case 0x11:
-               ret = write_ldt(ptr, bytecount, 0);
-               break;
-       }
-       return ret;
-}
diff --git a/arch/i386/kernel/machine_kexec_32.c b/arch/i386/kernel/machine_kexec_32.c
deleted file mode 100644 (file)
index 91966ba..0000000
+++ /dev/null
@@ -1,171 +0,0 @@
-/*
- * machine_kexec.c - handle transition of Linux booting another kernel
- * Copyright (C) 2002-2005 Eric Biederman  <ebiederm@xmission.com>
- *
- * This source code is licensed under the GNU General Public License,
- * Version 2.  See the file COPYING for more details.
- */
-
-#include <linux/mm.h>
-#include <linux/kexec.h>
-#include <linux/delay.h>
-#include <linux/init.h>
-#include <asm/pgtable.h>
-#include <asm/pgalloc.h>
-#include <asm/tlbflush.h>
-#include <asm/mmu_context.h>
-#include <asm/io.h>
-#include <asm/apic.h>
-#include <asm/cpufeature.h>
-#include <asm/desc.h>
-#include <asm/system.h>
-
-#define PAGE_ALIGNED __attribute__ ((__aligned__(PAGE_SIZE)))
-static u32 kexec_pgd[1024] PAGE_ALIGNED;
-#ifdef CONFIG_X86_PAE
-static u32 kexec_pmd0[1024] PAGE_ALIGNED;
-static u32 kexec_pmd1[1024] PAGE_ALIGNED;
-#endif
-static u32 kexec_pte0[1024] PAGE_ALIGNED;
-static u32 kexec_pte1[1024] PAGE_ALIGNED;
-
-static void set_idt(void *newidt, __u16 limit)
-{
-       struct Xgt_desc_struct curidt;
-
-       /* ia32 supports unaliged loads & stores */
-       curidt.size    = limit;
-       curidt.address = (unsigned long)newidt;
-
-       load_idt(&curidt);
-};
-
-
-static void set_gdt(void *newgdt, __u16 limit)
-{
-       struct Xgt_desc_struct curgdt;
-
-       /* ia32 supports unaligned loads & stores */
-       curgdt.size    = limit;
-       curgdt.address = (unsigned long)newgdt;
-
-       load_gdt(&curgdt);
-};
-
-static void load_segments(void)
-{
-#define __STR(X) #X
-#define STR(X) __STR(X)
-
-       __asm__ __volatile__ (
-               "\tljmp $"STR(__KERNEL_CS)",$1f\n"
-               "\t1:\n"
-               "\tmovl $"STR(__KERNEL_DS)",%%eax\n"
-               "\tmovl %%eax,%%ds\n"
-               "\tmovl %%eax,%%es\n"
-               "\tmovl %%eax,%%fs\n"
-               "\tmovl %%eax,%%gs\n"
-               "\tmovl %%eax,%%ss\n"
-               ::: "eax", "memory");
-#undef STR
-#undef __STR
-}
-
-/*
- * A architecture hook called to validate the
- * proposed image and prepare the control pages
- * as needed.  The pages for KEXEC_CONTROL_CODE_SIZE
- * have been allocated, but the segments have yet
- * been copied into the kernel.
- *
- * Do what every setup is needed on image and the
- * reboot code buffer to allow us to avoid allocations
- * later.
- *
- * Currently nothing.
- */
-int machine_kexec_prepare(struct kimage *image)
-{
-       return 0;
-}
-
-/*
- * Undo anything leftover by machine_kexec_prepare
- * when an image is freed.
- */
-void machine_kexec_cleanup(struct kimage *image)
-{
-}
-
-/*
- * Do not allocate memory (or fail in any way) in machine_kexec().
- * We are past the point of no return, committed to rebooting now.
- */
-NORET_TYPE void machine_kexec(struct kimage *image)
-{
-       unsigned long page_list[PAGES_NR];
-       void *control_page;
-
-       /* Interrupts aren't acceptable while we reboot */
-       local_irq_disable();
-
-       control_page = page_address(image->control_code_page);
-       memcpy(control_page, relocate_kernel, PAGE_SIZE);
-
-       page_list[PA_CONTROL_PAGE] = __pa(control_page);
-       page_list[VA_CONTROL_PAGE] = (unsigned long)relocate_kernel;
-       page_list[PA_PGD] = __pa(kexec_pgd);
-       page_list[VA_PGD] = (unsigned long)kexec_pgd;
-#ifdef CONFIG_X86_PAE
-       page_list[PA_PMD_0] = __pa(kexec_pmd0);
-       page_list[VA_PMD_0] = (unsigned long)kexec_pmd0;
-       page_list[PA_PMD_1] = __pa(kexec_pmd1);
-       page_list[VA_PMD_1] = (unsigned long)kexec_pmd1;
-#endif
-       page_list[PA_PTE_0] = __pa(kexec_pte0);
-       page_list[VA_PTE_0] = (unsigned long)kexec_pte0;
-       page_list[PA_PTE_1] = __pa(kexec_pte1);
-       page_list[VA_PTE_1] = (unsigned long)kexec_pte1;
-
-       /* The segment registers are funny things, they have both a
-        * visible and an invisible part.  Whenever the visible part is
-        * set to a specific selector, the invisible part is loaded
-        * with from a table in memory.  At no other time is the
-        * descriptor table in memory accessed.
-        *
-        * I take advantage of this here by force loading the
-        * segments, before I zap the gdt with an invalid value.
-        */
-       load_segments();
-       /* The gdt & idt are now invalid.
-        * If you want to load them you must set up your own idt & gdt.
-        */
-       set_gdt(phys_to_virt(0),0);
-       set_idt(phys_to_virt(0),0);
-
-       /* now call it */
-       relocate_kernel((unsigned long)image->head, (unsigned long)page_list,
-                       image->start, cpu_has_pae);
-}
-
-/* crashkernel=size@addr specifies the location to reserve for
- * a crash kernel.  By reserving this memory we guarantee
- * that linux never sets it up as a DMA target.
- * Useful for holding code to do something appropriate
- * after a kernel panic.
- */
-static int __init parse_crashkernel(char *arg)
-{
-       unsigned long size, base;
-       size = memparse(arg, &arg);
-       if (*arg == '@') {
-               base = memparse(arg+1, &arg);
-               /* FIXME: Do I want a sanity check
-                * to validate the memory range?
-                */
-               crashk_res.start = base;
-               crashk_res.end   = base + size - 1;
-       }
-       return 0;
-}
-early_param("crashkernel", parse_crashkernel);
diff --git a/arch/i386/kernel/mca_32.c b/arch/i386/kernel/mca_32.c
deleted file mode 100644 (file)
index b83672b..0000000
+++ /dev/null
@@ -1,470 +0,0 @@
-/*
- *  linux/arch/i386/kernel/mca.c
- *  Written by Martin Kolinek, February 1996
- *
- * Changes:
- *
- *     Chris Beauregard July 28th, 1996
- *     - Fixed up integrated SCSI detection
- *
- *     Chris Beauregard August 3rd, 1996
- *     - Made mca_info local
- *     - Made integrated registers accessible through standard function calls
- *     - Added name field
- *     - More sanity checking
- *
- *     Chris Beauregard August 9th, 1996
- *     - Rewrote /proc/mca
- *
- *     Chris Beauregard January 7th, 1997
- *     - Added basic NMI-processing
- *     - Added more information to mca_info structure
- *
- *     David Weinehall October 12th, 1998
- *     - Made a lot of cleaning up in the source
- *     - Added use of save_flags / restore_flags
- *     - Added the 'driver_loaded' flag in MCA_adapter
- *     - Added an alternative implemention of ZP Gu's mca_find_unused_adapter
- *
- *     David Weinehall March 24th, 1999
- *     - Fixed the output of 'Driver Installed' in /proc/mca/pos
- *     - Made the Integrated Video & SCSI show up even if they have id 0000
- *
- *     Alexander Viro November 9th, 1999
- *     - Switched to regular procfs methods
- *
- *     Alfred Arnold & David Weinehall August 23rd, 2000
- *     - Added support for Planar POS-registers
- */
-
-#include <linux/module.h>
-#include <linux/types.h>
-#include <linux/errno.h>
-#include <linux/kernel.h>
-#include <linux/mca.h>
-#include <linux/kprobes.h>
-#include <asm/system.h>
-#include <asm/io.h>
-#include <linux/proc_fs.h>
-#include <linux/mman.h>
-#include <linux/mm.h>
-#include <linux/pagemap.h>
-#include <linux/ioport.h>
-#include <asm/uaccess.h>
-#include <linux/init.h>
-#include <asm/arch_hooks.h>
-
-static unsigned char which_scsi = 0;
-
-int MCA_bus = 0;
-EXPORT_SYMBOL(MCA_bus);
-
-/*
- * Motherboard register spinlock. Untested on SMP at the moment, but
- * are there any MCA SMP boxes?
- *
- * Yes - Alan
- */
-static DEFINE_SPINLOCK(mca_lock);
-
-/* Build the status info for the adapter */
-
-static void mca_configure_adapter_status(struct mca_device *mca_dev) {
-       mca_dev->status = MCA_ADAPTER_NONE;
-
-       mca_dev->pos_id = mca_dev->pos[0]
-               + (mca_dev->pos[1] << 8);
-
-       if(!mca_dev->pos_id && mca_dev->slot < MCA_MAX_SLOT_NR) {
-
-               /* id = 0x0000 usually indicates hardware failure,
-                * however, ZP Gu (zpg@castle.net> reports that his 9556
-                * has 0x0000 as id and everything still works. There
-                * also seem to be an adapter with id = 0x0000; the
-                * NCR Parallel Bus Memory Card. Until this is confirmed,
-                * however, this code will stay.
-                */
-
-               mca_dev->status = MCA_ADAPTER_ERROR;
-
-               return;
-       } else if(mca_dev->pos_id != 0xffff) {
-
-               /* 0xffff usually indicates that there's no adapter,
-                * however, some integrated adapters may have 0xffff as
-                * their id and still be valid. Examples are on-board
-                * VGA of the 55sx, the integrated SCSI of the 56 & 57,
-                * and possibly also the 95 ULTIMEDIA.
-                */
-
-               mca_dev->status = MCA_ADAPTER_NORMAL;
-       }
-
-       if((mca_dev->pos_id == 0xffff ||
-           mca_dev->pos_id == 0x0000) && mca_dev->slot >= MCA_MAX_SLOT_NR) {
-               int j;
-
-               for(j = 2; j < 8; j++) {
-                       if(mca_dev->pos[j] != 0xff) {
-                               mca_dev->status = MCA_ADAPTER_NORMAL;
-                               break;
-                       }
-               }
-       }
-
-       if(!(mca_dev->pos[2] & MCA_ENABLED)) {
-
-               /* enabled bit is in POS 2 */
-
-               mca_dev->status = MCA_ADAPTER_DISABLED;
-       }
-} /* mca_configure_adapter_status */
-
-/*--------------------------------------------------------------------*/
-
-static struct resource mca_standard_resources[] = {
-       { .start = 0x60, .end = 0x60, .name = "system control port B (MCA)" },
-       { .start = 0x90, .end = 0x90, .name = "arbitration (MCA)" },
-       { .start = 0x91, .end = 0x91, .name = "card Select Feedback (MCA)" },
-       { .start = 0x92, .end = 0x92, .name = "system Control port A (MCA)" },
-       { .start = 0x94, .end = 0x94, .name = "system board setup (MCA)" },
-       { .start = 0x96, .end = 0x97, .name = "POS (MCA)" },
-       { .start = 0x100, .end = 0x107, .name = "POS (MCA)" }
-};
-
-#define MCA_STANDARD_RESOURCES ARRAY_SIZE(mca_standard_resources)
-
-/**
- *     mca_read_and_store_pos - read the POS registers into a memory buffer
- *      @pos: a char pointer to 8 bytes, contains the POS register value on
- *            successful return
- *
- *     Returns 1 if a card actually exists (i.e. the pos isn't
- *     all 0xff) or 0 otherwise
- */
-static int mca_read_and_store_pos(unsigned char *pos) {
-       int j;
-       int found = 0;
-
-       for(j=0; j<8; j++) {
-               if((pos[j] = inb_p(MCA_POS_REG(j))) != 0xff) {
-                       /* 0xff all across means no device. 0x00 means
-                        * something's broken, but a device is
-                        * probably there.  However, if you get 0x00
-                        * from a motherboard register it won't matter
-                        * what we find.  For the record, on the
-                        * 57SLC, the integrated SCSI adapter has
-                        * 0xffff for the adapter ID, but nonzero for
-                        * other registers.  */
-
-                       found = 1;
-               }
-       }
-       return found;
-}
-
-static unsigned char mca_pc_read_pos(struct mca_device *mca_dev, int reg)
-{
-       unsigned char byte;
-       unsigned long flags;
-
-       if(reg < 0 || reg >= 8)
-               return 0;
-
-       spin_lock_irqsave(&mca_lock, flags);
-       if(mca_dev->pos_register) {
-               /* Disable adapter setup, enable motherboard setup */
-
-               outb_p(0, MCA_ADAPTER_SETUP_REG);
-               outb_p(mca_dev->pos_register, MCA_MOTHERBOARD_SETUP_REG);
-
-               byte = inb_p(MCA_POS_REG(reg));
-               outb_p(0xff, MCA_MOTHERBOARD_SETUP_REG);
-       } else {
-
-               /* Make sure motherboard setup is off */
-
-               outb_p(0xff, MCA_MOTHERBOARD_SETUP_REG);
-
-               /* Read the appropriate register */
-
-               outb_p(0x8|(mca_dev->slot & 0xf), MCA_ADAPTER_SETUP_REG);
-               byte = inb_p(MCA_POS_REG(reg));
-               outb_p(0, MCA_ADAPTER_SETUP_REG);
-       }
-       spin_unlock_irqrestore(&mca_lock, flags);
-
-       mca_dev->pos[reg] = byte;
-
-       return byte;
-}
-
-static void mca_pc_write_pos(struct mca_device *mca_dev, int reg,
-                            unsigned char byte)
-{
-       unsigned long flags;
-
-       if(reg < 0 || reg >= 8)
-               return;
-
-       spin_lock_irqsave(&mca_lock, flags);
-
-       /* Make sure motherboard setup is off */
-
-       outb_p(0xff, MCA_MOTHERBOARD_SETUP_REG);
-
-       /* Read in the appropriate register */
-
-       outb_p(0x8|(mca_dev->slot&0xf), MCA_ADAPTER_SETUP_REG);
-       outb_p(byte, MCA_POS_REG(reg));
-       outb_p(0, MCA_ADAPTER_SETUP_REG);
-
-       spin_unlock_irqrestore(&mca_lock, flags);
-
-       /* Update the global register list, while we have the byte */
-
-       mca_dev->pos[reg] = byte;
-
-}
-
-/* for the primary MCA bus, we have identity transforms */
-static int mca_dummy_transform_irq(struct mca_device * mca_dev, int irq)
-{
-       return irq;
-}
-
-static int mca_dummy_transform_ioport(struct mca_device * mca_dev, int port)
-{
-       return port;
-}
-
-static void *mca_dummy_transform_memory(struct mca_device * mca_dev, void *mem)
-{
-       return mem;
-}
-
-
-static int __init mca_init(void)
-{
-       unsigned int i, j;
-       struct mca_device *mca_dev;
-       unsigned char pos[8];
-       short mca_builtin_scsi_ports[] = {0xf7, 0xfd, 0x00};
-       struct mca_bus *bus;
-
-       /* WARNING: Be careful when making changes here. Putting an adapter
-        * and the motherboard simultaneously into setup mode may result in
-        * damage to chips (according to The Indispensible PC Hardware Book
-        * by Hans-Peter Messmer). Also, we disable system interrupts (so
-        * that we are not disturbed in the middle of this).
-        */
-
-       /* Make sure the MCA bus is present */
-
-       if (mca_system_init()) {
-               printk(KERN_ERR "MCA bus system initialisation failed\n");
-               return -ENODEV;
-       }
-
-       if (!MCA_bus)
-               return -ENODEV;
-
-       printk(KERN_INFO "Micro Channel bus detected.\n");
-
-       /* All MCA systems have at least a primary bus */
-       bus = mca_attach_bus(MCA_PRIMARY_BUS);
-       if (!bus)
-               goto out_nomem;
-       bus->default_dma_mask = 0xffffffffLL;
-       bus->f.mca_write_pos = mca_pc_write_pos;
-       bus->f.mca_read_pos = mca_pc_read_pos;
-       bus->f.mca_transform_irq = mca_dummy_transform_irq;
-       bus->f.mca_transform_ioport = mca_dummy_transform_ioport;
-       bus->f.mca_transform_memory = mca_dummy_transform_memory;
-
-       /* get the motherboard device */
-       mca_dev = kzalloc(sizeof(struct mca_device), GFP_KERNEL);
-       if(unlikely(!mca_dev))
-               goto out_nomem;
-
-       /*
-        * We do not expect many MCA interrupts during initialization,
-        * but let us be safe:
-        */
-       spin_lock_irq(&mca_lock);
-
-       /* Make sure adapter setup is off */
-
-       outb_p(0, MCA_ADAPTER_SETUP_REG);
-
-       /* Read motherboard POS registers */
-
-       mca_dev->pos_register = 0x7f;
-       outb_p(mca_dev->pos_register, MCA_MOTHERBOARD_SETUP_REG);
-       mca_dev->name[0] = 0;
-       mca_read_and_store_pos(mca_dev->pos);
-       mca_configure_adapter_status(mca_dev);
-       /* fake POS and slot for a motherboard */
-       mca_dev->pos_id = MCA_MOTHERBOARD_POS;
-       mca_dev->slot = MCA_MOTHERBOARD;
-       mca_register_device(MCA_PRIMARY_BUS, mca_dev);
-
-       mca_dev = kzalloc(sizeof(struct mca_device), GFP_ATOMIC);
-       if(unlikely(!mca_dev))
-               goto out_unlock_nomem;
-
-       /* Put motherboard into video setup mode, read integrated video
-        * POS registers, and turn motherboard setup off.
-        */
-
-       mca_dev->pos_register = 0xdf;
-       outb_p(mca_dev->pos_register, MCA_MOTHERBOARD_SETUP_REG);
-       mca_dev->name[0] = 0;
-       mca_read_and_store_pos(mca_dev->pos);
-       mca_configure_adapter_status(mca_dev);
-       /* fake POS and slot for the integrated video */
-       mca_dev->pos_id = MCA_INTEGVIDEO_POS;
-       mca_dev->slot = MCA_INTEGVIDEO;
-       mca_register_device(MCA_PRIMARY_BUS, mca_dev);
-
-       /* Put motherboard into scsi setup mode, read integrated scsi
-        * POS registers, and turn motherboard setup off.
-        *
-        * It seems there are two possible SCSI registers. Martin says that
-        * for the 56,57, 0xf7 is the one, but fails on the 76.
-        * Alfredo (apena@vnet.ibm.com) says
-        * 0xfd works on his machine. We'll try both of them. I figure it's
-        * a good bet that only one could be valid at a time. This could
-        * screw up though if one is used for something else on the other
-        * machine.
-        */
-
-       for(i = 0; (which_scsi = mca_builtin_scsi_ports[i]) != 0; i++) {
-               outb_p(which_scsi, MCA_MOTHERBOARD_SETUP_REG);
-               if(mca_read_and_store_pos(pos))
-                       break;
-       }
-       if(which_scsi) {
-               /* found a scsi card */
-               mca_dev = kzalloc(sizeof(struct mca_device), GFP_ATOMIC);
-               if(unlikely(!mca_dev))
-                       goto out_unlock_nomem;
-
-               for(j = 0; j < 8; j++)
-                       mca_dev->pos[j] = pos[j];
-
-               mca_configure_adapter_status(mca_dev);
-               /* fake POS and slot for integrated SCSI controller */
-               mca_dev->pos_id = MCA_INTEGSCSI_POS;
-               mca_dev->slot = MCA_INTEGSCSI;
-               mca_dev->pos_register = which_scsi;
-               mca_register_device(MCA_PRIMARY_BUS, mca_dev);
-       }
-
-       /* Turn off motherboard setup */
-
-       outb_p(0xff, MCA_MOTHERBOARD_SETUP_REG);
-
-       /* Now loop over MCA slots: put each adapter into setup mode, and
-        * read its POS registers. Then put adapter setup off.
-        */
-
-       for(i=0; i<MCA_MAX_SLOT_NR; i++) {
-               outb_p(0x8|(i&0xf), MCA_ADAPTER_SETUP_REG);
-               if(!mca_read_and_store_pos(pos))
-                       continue;
-
-               mca_dev = kzalloc(sizeof(struct mca_device), GFP_ATOMIC);
-               if(unlikely(!mca_dev))
-                       goto out_unlock_nomem;
-
-               for(j=0; j<8; j++)
-                       mca_dev->pos[j]=pos[j];
-
-               mca_dev->driver_loaded = 0;
-               mca_dev->slot = i;
-               mca_dev->pos_register = 0;
-               mca_configure_adapter_status(mca_dev);
-               mca_register_device(MCA_PRIMARY_BUS, mca_dev);
-       }
-       outb_p(0, MCA_ADAPTER_SETUP_REG);
-
-       /* Enable interrupts and return memory start */
-       spin_unlock_irq(&mca_lock);
-
-       for (i = 0; i < MCA_STANDARD_RESOURCES; i++)
-               request_resource(&ioport_resource, mca_standard_resources + i);
-
-       mca_do_proc_init();
-
-       return 0;
-
- out_unlock_nomem:
-       spin_unlock_irq(&mca_lock);
- out_nomem:
-       printk(KERN_EMERG "Failed memory allocation in MCA setup!\n");
-       return -ENOMEM;
-}
-
-subsys_initcall(mca_init);
-
-/*--------------------------------------------------------------------*/
-
-static __kprobes void
-mca_handle_nmi_device(struct mca_device *mca_dev, int check_flag)
-{
-       int slot = mca_dev->slot;
-
-       if(slot == MCA_INTEGSCSI) {
-               printk(KERN_CRIT "NMI: caused by MCA integrated SCSI adapter (%s)\n",
-                       mca_dev->name);
-       } else if(slot == MCA_INTEGVIDEO) {
-               printk(KERN_CRIT "NMI: caused by MCA integrated video adapter (%s)\n",
-                       mca_dev->name);
-       } else if(slot == MCA_MOTHERBOARD) {
-               printk(KERN_CRIT "NMI: caused by motherboard (%s)\n",
-                       mca_dev->name);
-       }
-
-       /* More info available in POS 6 and 7? */
-
-       if(check_flag) {
-               unsigned char pos6, pos7;
-
-               pos6 = mca_device_read_pos(mca_dev, 6);
-               pos7 = mca_device_read_pos(mca_dev, 7);
-
-               printk(KERN_CRIT "NMI: POS 6 = 0x%x, POS 7 = 0x%x\n", pos6, pos7);
-       }
-
-} /* mca_handle_nmi_slot */
-
-/*--------------------------------------------------------------------*/
-
-static int __kprobes mca_handle_nmi_callback(struct device *dev, void *data)
-{
-       struct mca_device *mca_dev = to_mca_device(dev);
-       unsigned char pos5;
-
-       pos5 = mca_device_read_pos(mca_dev, 5);
-
-       if(!(pos5 & 0x80)) {
-               /* Bit 7 of POS 5 is reset when this adapter has a hardware
-                * error. Bit 7 it reset if there's error information
-                * available in POS 6 and 7.
-                */
-               mca_handle_nmi_device(mca_dev, !(pos5 & 0x40));
-               return 1;
-       }
-       return 0;
-}
-
-void __kprobes mca_handle_nmi(void)
-{
-       /* First try - scan the various adapters and see if a specific
-        * adapter was responsible for the error.
-        */
-       bus_for_each_dev(&mca_bus_type, NULL, NULL, mca_handle_nmi_callback);
-
-       mca_nmi_hook();
-} /* mca_handle_nmi */
diff --git a/arch/i386/kernel/microcode.c b/arch/i386/kernel/microcode.c
deleted file mode 100644 (file)
index 09cf781..0000000
+++ /dev/null
@@ -1,850 +0,0 @@
-/*
- *     Intel CPU Microcode Update Driver for Linux
- *
- *     Copyright (C) 2000-2006 Tigran Aivazian <tigran@aivazian.fsnet.co.uk>
- *                   2006      Shaohua Li <shaohua.li@intel.com>
- *
- *     This driver allows to upgrade microcode on Intel processors
- *     belonging to IA-32 family - PentiumPro, Pentium II, 
- *     Pentium III, Xeon, Pentium 4, etc.
- *
- *     Reference: Section 8.10 of Volume III, Intel Pentium 4 Manual, 
- *     Order Number 245472 or free download from:
- *             
- *     http://developer.intel.com/design/pentium4/manuals/245472.htm
- *
- *     For more information, go to http://www.urbanmyth.org/microcode
- *
- *     This program is free software; you can redistribute it and/or
- *     modify it under the terms of the GNU General Public License
- *     as published by the Free Software Foundation; either version
- *     2 of the License, or (at your option) any later version.
- *
- *     1.0     16 Feb 2000, Tigran Aivazian <tigran@sco.com>
- *             Initial release.
- *     1.01    18 Feb 2000, Tigran Aivazian <tigran@sco.com>
- *             Added read() support + cleanups.
- *     1.02    21 Feb 2000, Tigran Aivazian <tigran@sco.com>
- *             Added 'device trimming' support. open(O_WRONLY) zeroes
- *             and frees the saved copy of applied microcode.
- *     1.03    29 Feb 2000, Tigran Aivazian <tigran@sco.com>
- *             Made to use devfs (/dev/cpu/microcode) + cleanups.
- *     1.04    06 Jun 2000, Simon Trimmer <simon@veritas.com>
- *             Added misc device support (now uses both devfs and misc).
- *             Added MICROCODE_IOCFREE ioctl to clear memory.
- *     1.05    09 Jun 2000, Simon Trimmer <simon@veritas.com>
- *             Messages for error cases (non Intel & no suitable microcode).
- *     1.06    03 Aug 2000, Tigran Aivazian <tigran@veritas.com>
- *             Removed ->release(). Removed exclusive open and status bitmap.
- *             Added microcode_rwsem to serialize read()/write()/ioctl().
- *             Removed global kernel lock usage.
- *     1.07    07 Sep 2000, Tigran Aivazian <tigran@veritas.com>
- *             Write 0 to 0x8B msr and then cpuid before reading revision,
- *             so that it works even if there were no update done by the
- *             BIOS. Otherwise, reading from 0x8B gives junk (which happened
- *             to be 0 on my machine which is why it worked even when I
- *             disabled update by the BIOS)
- *             Thanks to Eric W. Biederman <ebiederman@lnxi.com> for the fix.
- *     1.08    11 Dec 2000, Richard Schaal <richard.schaal@intel.com> and
- *                          Tigran Aivazian <tigran@veritas.com>
- *             Intel Pentium 4 processor support and bugfixes.
- *     1.09    30 Oct 2001, Tigran Aivazian <tigran@veritas.com>
- *             Bugfix for HT (Hyper-Threading) enabled processors
- *             whereby processor resources are shared by all logical processors
- *             in a single CPU package.
- *     1.10    28 Feb 2002 Asit K Mallick <asit.k.mallick@intel.com> and
- *             Tigran Aivazian <tigran@veritas.com>,
- *             Serialize updates as required on HT processors due to speculative
- *             nature of implementation.
- *     1.11    22 Mar 2002 Tigran Aivazian <tigran@veritas.com>
- *             Fix the panic when writing zero-length microcode chunk.
- *     1.12    29 Sep 2003 Nitin Kamble <nitin.a.kamble@intel.com>, 
- *             Jun Nakajima <jun.nakajima@intel.com>
- *             Support for the microcode updates in the new format.
- *     1.13    10 Oct 2003 Tigran Aivazian <tigran@veritas.com>
- *             Removed ->read() method and obsoleted MICROCODE_IOCFREE ioctl
- *             because we no longer hold a copy of applied microcode 
- *             in kernel memory.
- *     1.14    25 Jun 2004 Tigran Aivazian <tigran@veritas.com>
- *             Fix sigmatch() macro to handle old CPUs with pf == 0.
- *             Thanks to Stuart Swales for pointing out this bug.
- */
-
-//#define DEBUG /* pr_debug */
-#include <linux/capability.h>
-#include <linux/kernel.h>
-#include <linux/init.h>
-#include <linux/sched.h>
-#include <linux/cpumask.h>
-#include <linux/module.h>
-#include <linux/slab.h>
-#include <linux/vmalloc.h>
-#include <linux/miscdevice.h>
-#include <linux/spinlock.h>
-#include <linux/mm.h>
-#include <linux/fs.h>
-#include <linux/mutex.h>
-#include <linux/cpu.h>
-#include <linux/firmware.h>
-#include <linux/platform_device.h>
-
-#include <asm/msr.h>
-#include <asm/uaccess.h>
-#include <asm/processor.h>
-
-MODULE_DESCRIPTION("Intel CPU (IA-32) Microcode Update Driver");
-MODULE_AUTHOR("Tigran Aivazian <tigran@aivazian.fsnet.co.uk>");
-MODULE_LICENSE("GPL");
-
-#define MICROCODE_VERSION      "1.14a"
-
-#define DEFAULT_UCODE_DATASIZE         (2000)    /* 2000 bytes */
-#define MC_HEADER_SIZE         (sizeof (microcode_header_t))     /* 48 bytes */
-#define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE) /* 2048 bytes */
-#define EXT_HEADER_SIZE                (sizeof (struct extended_sigtable)) /* 20 bytes */
-#define EXT_SIGNATURE_SIZE     (sizeof (struct extended_signature)) /* 12 bytes */
-#define DWSIZE                 (sizeof (u32))
-#define get_totalsize(mc) \
-       (((microcode_t *)mc)->hdr.totalsize ? \
-        ((microcode_t *)mc)->hdr.totalsize : DEFAULT_UCODE_TOTALSIZE)
-#define get_datasize(mc) \
-       (((microcode_t *)mc)->hdr.datasize ? \
-        ((microcode_t *)mc)->hdr.datasize : DEFAULT_UCODE_DATASIZE)
-
-#define sigmatch(s1, s2, p1, p2) \
-       (((s1) == (s2)) && (((p1) & (p2)) || (((p1) == 0) && ((p2) == 0))))
-
-#define exttable_size(et) ((et)->count * EXT_SIGNATURE_SIZE + EXT_HEADER_SIZE)
-
-/* serialize access to the physical write to MSR 0x79 */
-static DEFINE_SPINLOCK(microcode_update_lock);
-
-/* no concurrent ->write()s are allowed on /dev/cpu/microcode */
-static DEFINE_MUTEX(microcode_mutex);
-
-static struct ucode_cpu_info {
-       int valid;
-       unsigned int sig;
-       unsigned int pf;
-       unsigned int rev;
-       microcode_t *mc;
-} ucode_cpu_info[NR_CPUS];
-
-static void collect_cpu_info(int cpu_num)
-{
-       struct cpuinfo_x86 *c = cpu_data + cpu_num;
-       struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num;
-       unsigned int val[2];
-
-       /* We should bind the task to the CPU */
-       BUG_ON(raw_smp_processor_id() != cpu_num);
-       uci->pf = uci->rev = 0;
-       uci->mc = NULL;
-       uci->valid = 1;
-
-       if (c->x86_vendor != X86_VENDOR_INTEL || c->x86 < 6 ||
-               cpu_has(c, X86_FEATURE_IA64)) {
-               printk(KERN_ERR "microcode: CPU%d not a capable Intel "
-                       "processor\n", cpu_num);
-               uci->valid = 0;
-               return;
-       }
-
-       uci->sig = cpuid_eax(0x00000001);
-
-       if ((c->x86_model >= 5) || (c->x86 > 6)) {
-               /* get processor flags from MSR 0x17 */
-               rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]);
-               uci->pf = 1 << ((val[1] >> 18) & 7);
-       }
-
-       wrmsr(MSR_IA32_UCODE_REV, 0, 0);
-       /* see notes above for revision 1.07.  Apparent chip bug */
-       sync_core();
-       /* get the current revision from MSR 0x8B */
-       rdmsr(MSR_IA32_UCODE_REV, val[0], uci->rev);
-       pr_debug("microcode: collect_cpu_info : sig=0x%x, pf=0x%x, rev=0x%x\n",
-                       uci->sig, uci->pf, uci->rev);
-}
-
-static inline int microcode_update_match(int cpu_num,
-       microcode_header_t *mc_header, int sig, int pf)
-{
-       struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num;
-
-       if (!sigmatch(sig, uci->sig, pf, uci->pf)
-               || mc_header->rev <= uci->rev)
-               return 0;
-       return 1;
-}
-
-static int microcode_sanity_check(void *mc)
-{
-       microcode_header_t *mc_header = mc;
-       struct extended_sigtable *ext_header = NULL;
-       struct extended_signature *ext_sig;
-       unsigned long total_size, data_size, ext_table_size;
-       int sum, orig_sum, ext_sigcount = 0, i;
-
-       total_size = get_totalsize(mc_header);
-       data_size = get_datasize(mc_header);
-       if (data_size + MC_HEADER_SIZE > total_size) {
-               printk(KERN_ERR "microcode: error! "
-                       "Bad data size in microcode data file\n");
-               return -EINVAL;
-       }
-
-       if (mc_header->ldrver != 1 || mc_header->hdrver != 1) {
-               printk(KERN_ERR "microcode: error! "
-                       "Unknown microcode update format\n");
-               return -EINVAL;
-       }
-       ext_table_size = total_size - (MC_HEADER_SIZE + data_size);
-       if (ext_table_size) {
-               if ((ext_table_size < EXT_HEADER_SIZE)
-                || ((ext_table_size - EXT_HEADER_SIZE) % EXT_SIGNATURE_SIZE)) {
-                       printk(KERN_ERR "microcode: error! "
-                               "Small exttable size in microcode data file\n");
-                       return -EINVAL;
-               }
-               ext_header = mc + MC_HEADER_SIZE + data_size;
-               if (ext_table_size != exttable_size(ext_header)) {
-                       printk(KERN_ERR "microcode: error! "
-                               "Bad exttable size in microcode data file\n");
-                       return -EFAULT;
-               }
-               ext_sigcount = ext_header->count;
-       }
-
-       /* check extended table checksum */
-       if (ext_table_size) {
-               int ext_table_sum = 0;
-               int *ext_tablep = (int *)ext_header;
-
-               i = ext_table_size / DWSIZE;
-               while (i--)
-                       ext_table_sum += ext_tablep[i];
-               if (ext_table_sum) {
-                       printk(KERN_WARNING "microcode: aborting, "
-                               "bad extended signature table checksum\n");
-                       return -EINVAL;
-               }
-       }
-
-       /* calculate the checksum */
-       orig_sum = 0;
-       i = (MC_HEADER_SIZE + data_size) / DWSIZE;
-       while (i--)
-               orig_sum += ((int *)mc)[i];
-       if (orig_sum) {
-               printk(KERN_ERR "microcode: aborting, bad checksum\n");
-               return -EINVAL;
-       }
-       if (!ext_table_size)
-               return 0;
-       /* check extended signature checksum */
-       for (i = 0; i < ext_sigcount; i++) {
-               ext_sig = (struct extended_signature *)((void *)ext_header
-                       + EXT_HEADER_SIZE + EXT_SIGNATURE_SIZE * i);
-               sum = orig_sum
-                       - (mc_header->sig + mc_header->pf + mc_header->cksum)
-                       + (ext_sig->sig + ext_sig->pf + ext_sig->cksum);
-               if (sum) {
-                       printk(KERN_ERR "microcode: aborting, bad checksum\n");
-                       return -EINVAL;
-               }
-       }
-       return 0;
-}
-
-/*
- * return 0 - no update found
- * return 1 - found update
- * return < 0 - error
- */
-static int get_maching_microcode(void *mc, int cpu)
-{
-       struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
-       microcode_header_t *mc_header = mc;
-       struct extended_sigtable *ext_header;
-       unsigned long total_size = get_totalsize(mc_header);
-       int ext_sigcount, i;
-       struct extended_signature *ext_sig;
-       void *new_mc;
-
-       if (microcode_update_match(cpu, mc_header,
-                       mc_header->sig, mc_header->pf))
-               goto find;
-
-       if (total_size <= get_datasize(mc_header) + MC_HEADER_SIZE)
-               return 0;
-
-       ext_header = (struct extended_sigtable *)(mc +
-                       get_datasize(mc_header) + MC_HEADER_SIZE);
-       ext_sigcount = ext_header->count;
-       ext_sig = (struct extended_signature *)((void *)ext_header
-                       + EXT_HEADER_SIZE);
-       for (i = 0; i < ext_sigcount; i++) {
-               if (microcode_update_match(cpu, mc_header,
-                               ext_sig->sig, ext_sig->pf))
-                       goto find;
-               ext_sig++;
-       }
-       return 0;
-find:
-       pr_debug("microcode: CPU %d found a matching microcode update with"
-               " version 0x%x (current=0x%x)\n", cpu, mc_header->rev,uci->rev);
-       new_mc = vmalloc(total_size);
-       if (!new_mc) {
-               printk(KERN_ERR "microcode: error! Can not allocate memory\n");
-               return -ENOMEM;
-       }
-
-       /* free previous update file */
-       vfree(uci->mc);
-
-       memcpy(new_mc, mc, total_size);
-       uci->mc = new_mc;
-       return 1;
-}
-
-static void apply_microcode(int cpu)
-{
-       unsigned long flags;
-       unsigned int val[2];
-       int cpu_num = raw_smp_processor_id();
-       struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num;
-
-       /* We should bind the task to the CPU */
-       BUG_ON(cpu_num != cpu);
-
-       if (uci->mc == NULL)
-               return;
-
-       /* serialize access to the physical write to MSR 0x79 */
-       spin_lock_irqsave(&microcode_update_lock, flags);          
-
-       /* write microcode via MSR 0x79 */
-       wrmsr(MSR_IA32_UCODE_WRITE,
-               (unsigned long) uci->mc->bits, 
-               (unsigned long) uci->mc->bits >> 16 >> 16);
-       wrmsr(MSR_IA32_UCODE_REV, 0, 0);
-
-       /* see notes above for revision 1.07.  Apparent chip bug */
-       sync_core();
-
-       /* get the current revision from MSR 0x8B */
-       rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]);
-
-       spin_unlock_irqrestore(&microcode_update_lock, flags);
-       if (val[1] != uci->mc->hdr.rev) {
-               printk(KERN_ERR "microcode: CPU%d updated from revision "
-                       "0x%x to 0x%x failed\n", cpu_num, uci->rev, val[1]);
-               return;
-       }
-       pr_debug("microcode: CPU%d updated from revision "
-              "0x%x to 0x%x, date = %08x \n", 
-              cpu_num, uci->rev, val[1], uci->mc->hdr.date);
-       uci->rev = val[1];
-}
-
-#ifdef CONFIG_MICROCODE_OLD_INTERFACE
-static void __user *user_buffer;       /* user area microcode data buffer */
-static unsigned int user_buffer_size;  /* it's size */
-
-static long get_next_ucode(void **mc, long offset)
-{
-       microcode_header_t mc_header;
-       unsigned long total_size;
-
-       /* No more data */
-       if (offset >= user_buffer_size)
-               return 0;
-       if (copy_from_user(&mc_header, user_buffer + offset, MC_HEADER_SIZE)) {
-               printk(KERN_ERR "microcode: error! Can not read user data\n");
-               return -EFAULT;
-       }
-       total_size = get_totalsize(&mc_header);
-       if (offset + total_size > user_buffer_size) {
-               printk(KERN_ERR "microcode: error! Bad total size in microcode "
-                               "data file\n");
-               return -EINVAL;
-       }
-       *mc = vmalloc(total_size);
-       if (!*mc)
-               return -ENOMEM;
-       if (copy_from_user(*mc, user_buffer + offset, total_size)) {
-               printk(KERN_ERR "microcode: error! Can not read user data\n");
-               vfree(*mc);
-               return -EFAULT;
-       }
-       return offset + total_size;
-}
-
-static int do_microcode_update (void)
-{
-       long cursor = 0;
-       int error = 0;
-       void *new_mc = NULL;
-       int cpu;
-       cpumask_t old;
-
-       old = current->cpus_allowed;
-
-       while ((cursor = get_next_ucode(&new_mc, cursor)) > 0) {
-               error = microcode_sanity_check(new_mc);
-               if (error)
-                       goto out;
-               /*
-                * It's possible the data file has multiple matching ucode,
-                * lets keep searching till the latest version
-                */
-               for_each_online_cpu(cpu) {
-                       struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
-
-                       if (!uci->valid)
-                               continue;
-                       set_cpus_allowed(current, cpumask_of_cpu(cpu));
-                       error = get_maching_microcode(new_mc, cpu);
-                       if (error < 0)
-                               goto out;
-                       if (error == 1)
-                               apply_microcode(cpu);
-               }
-               vfree(new_mc);
-       }
-out:
-       if (cursor > 0)
-               vfree(new_mc);
-       if (cursor < 0)
-               error = cursor;
-       set_cpus_allowed(current, old);
-       return error;
-}
-
-static int microcode_open (struct inode *unused1, struct file *unused2)
-{
-       return capable(CAP_SYS_RAWIO) ? 0 : -EPERM;
-}
-
-static ssize_t microcode_write (struct file *file, const char __user *buf, size_t len, loff_t *ppos)
-{
-       ssize_t ret;
-
-       if ((len >> PAGE_SHIFT) > num_physpages) {
-               printk(KERN_ERR "microcode: too much data (max %ld pages)\n", num_physpages);
-               return -EINVAL;
-       }
-
-       lock_cpu_hotplug();
-       mutex_lock(&microcode_mutex);
-
-       user_buffer = (void __user *) buf;
-       user_buffer_size = (int) len;
-
-       ret = do_microcode_update();
-       if (!ret)
-               ret = (ssize_t)len;
-
-       mutex_unlock(&microcode_mutex);
-       unlock_cpu_hotplug();
-
-       return ret;
-}
-
-static const struct file_operations microcode_fops = {
-       .owner          = THIS_MODULE,
-       .write          = microcode_write,
-       .open           = microcode_open,
-};
-
-static struct miscdevice microcode_dev = {
-       .minor          = MICROCODE_MINOR,
-       .name           = "microcode",
-       .fops           = &microcode_fops,
-};
-
-static int __init microcode_dev_init (void)
-{
-       int error;
-
-       error = misc_register(&microcode_dev);
-       if (error) {
-               printk(KERN_ERR
-                       "microcode: can't misc_register on minor=%d\n",
-                       MICROCODE_MINOR);
-               return error;
-       }
-
-       return 0;
-}
-
-static void microcode_dev_exit (void)
-{
-       misc_deregister(&microcode_dev);
-}
-
-MODULE_ALIAS_MISCDEV(MICROCODE_MINOR);
-#else
-#define microcode_dev_init() 0
-#define microcode_dev_exit() do { } while(0)
-#endif
-
-static long get_next_ucode_from_buffer(void **mc, void *buf,
-       unsigned long size, long offset)
-{
-       microcode_header_t *mc_header;
-       unsigned long total_size;
-
-       /* No more data */
-       if (offset >= size)
-               return 0;
-       mc_header = (microcode_header_t *)(buf + offset);
-       total_size = get_totalsize(mc_header);
-
-       if (offset + total_size > size) {
-               printk(KERN_ERR "microcode: error! Bad data in microcode data file\n");
-               return -EINVAL;
-       }
-
-       *mc = vmalloc(total_size);
-       if (!*mc) {
-               printk(KERN_ERR "microcode: error! Can not allocate memory\n");
-               return -ENOMEM;
-       }
-       memcpy(*mc, buf + offset, total_size);
-       return offset + total_size;
-}
-
-/* fake device for request_firmware */
-static struct platform_device *microcode_pdev;
-
-static int cpu_request_microcode(int cpu)
-{
-       char name[30];
-       struct cpuinfo_x86 *c = cpu_data + cpu;
-       const struct firmware *firmware;
-       void *buf;
-       unsigned long size;
-       long offset = 0;
-       int error;
-       void *mc;
-
-       /* We should bind the task to the CPU */
-       BUG_ON(cpu != raw_smp_processor_id());
-       sprintf(name,"intel-ucode/%02x-%02x-%02x",
-               c->x86, c->x86_model, c->x86_mask);
-       error = request_firmware(&firmware, name, &microcode_pdev->dev);
-       if (error) {
-               pr_debug("ucode data file %s load failed\n", name);
-               return error;
-       }
-       buf = (void *)firmware->data;
-       size = firmware->size;
-       while ((offset = get_next_ucode_from_buffer(&mc, buf, size, offset))
-                       > 0) {
-               error = microcode_sanity_check(mc);
-               if (error)
-                       break;
-               error = get_maching_microcode(mc, cpu);
-               if (error < 0)
-                       break;
-               /*
-                * It's possible the data file has multiple matching ucode,
-                * lets keep searching till the latest version
-                */
-               if (error == 1) {
-                       apply_microcode(cpu);
-                       error = 0;
-               }
-               vfree(mc);
-       }
-       if (offset > 0)
-               vfree(mc);
-       if (offset < 0)
-               error = offset;
-       release_firmware(firmware);
-
-       return error;
-}
-
-static int apply_microcode_check_cpu(int cpu)
-{
-       struct cpuinfo_x86 *c = cpu_data + cpu;
-       struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
-       cpumask_t old;
-       unsigned int val[2];
-       int err = 0;
-
-       /* Check if the microcode is available */
-       if (!uci->mc)
-               return 0;
-
-       old = current->cpus_allowed;
-       set_cpus_allowed(current, cpumask_of_cpu(cpu));
-
-       /* Check if the microcode we have in memory matches the CPU */
-       if (c->x86_vendor != X86_VENDOR_INTEL || c->x86 < 6 ||
-           cpu_has(c, X86_FEATURE_IA64) || uci->sig != cpuid_eax(0x00000001))
-               err = -EINVAL;
-
-       if (!err && ((c->x86_model >= 5) || (c->x86 > 6))) {
-               /* get processor flags from MSR 0x17 */
-               rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]);
-               if (uci->pf != (1 << ((val[1] >> 18) & 7)))
-                       err = -EINVAL;
-       }
-
-       if (!err) {
-               wrmsr(MSR_IA32_UCODE_REV, 0, 0);
-               /* see notes above for revision 1.07.  Apparent chip bug */
-               sync_core();
-               /* get the current revision from MSR 0x8B */
-               rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]);
-               if (uci->rev != val[1])
-                       err = -EINVAL;
-       }
-
-       if (!err)
-               apply_microcode(cpu);
-       else
-               printk(KERN_ERR "microcode: Could not apply microcode to CPU%d:"
-                       " sig=0x%x, pf=0x%x, rev=0x%x\n",
-                       cpu, uci->sig, uci->pf, uci->rev);
-
-       set_cpus_allowed(current, old);
-       return err;
-}
-
-static void microcode_init_cpu(int cpu, int resume)
-{
-       cpumask_t old;
-       struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
-
-       old = current->cpus_allowed;
-
-       set_cpus_allowed(current, cpumask_of_cpu(cpu));
-       mutex_lock(&microcode_mutex);
-       collect_cpu_info(cpu);
-       if (uci->valid && system_state == SYSTEM_RUNNING && !resume)
-               cpu_request_microcode(cpu);
-       mutex_unlock(&microcode_mutex);
-       set_cpus_allowed(current, old);
-}
-
-static void microcode_fini_cpu(int cpu)
-{
-       struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
-
-       mutex_lock(&microcode_mutex);
-       uci->valid = 0;
-       vfree(uci->mc);
-       uci->mc = NULL;
-       mutex_unlock(&microcode_mutex);
-}
-
-static ssize_t reload_store(struct sys_device *dev, const char *buf, size_t sz)
-{
-       struct ucode_cpu_info *uci = ucode_cpu_info + dev->id;
-       char *end;
-       unsigned long val = simple_strtoul(buf, &end, 0);
-       int err = 0;
-       int cpu = dev->id;
-
-       if (end == buf)
-               return -EINVAL;
-       if (val == 1) {
-               cpumask_t old;
-
-               old = current->cpus_allowed;
-
-               lock_cpu_hotplug();
-               set_cpus_allowed(current, cpumask_of_cpu(cpu));
-
-               mutex_lock(&microcode_mutex);
-               if (uci->valid)
-                       err = cpu_request_microcode(cpu);
-               mutex_unlock(&microcode_mutex);
-               unlock_cpu_hotplug();
-               set_cpus_allowed(current, old);
-       }
-       if (err)
-               return err;
-       return sz;
-}
-
-static ssize_t version_show(struct sys_device *dev, char *buf)
-{
-       struct ucode_cpu_info *uci = ucode_cpu_info + dev->id;
-
-       return sprintf(buf, "0x%x\n", uci->rev);
-}
-
-static ssize_t pf_show(struct sys_device *dev, char *buf)
-{
-       struct ucode_cpu_info *uci = ucode_cpu_info + dev->id;
-
-       return sprintf(buf, "0x%x\n", uci->pf);
-}
-
-static SYSDEV_ATTR(reload, 0200, NULL, reload_store);
-static SYSDEV_ATTR(version, 0400, version_show, NULL);
-static SYSDEV_ATTR(processor_flags, 0400, pf_show, NULL);
-
-static struct attribute *mc_default_attrs[] = {
-       &attr_reload.attr,
-       &attr_version.attr,
-       &attr_processor_flags.attr,
-       NULL
-};
-
-static struct attribute_group mc_attr_group = {
-       .attrs = mc_default_attrs,
-       .name = "microcode",
-};
-
-static int __mc_sysdev_add(struct sys_device *sys_dev, int resume)
-{
-       int err, cpu = sys_dev->id;
-       struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
-
-       if (!cpu_online(cpu))
-               return 0;
-
-       pr_debug("Microcode:CPU %d added\n", cpu);
-       memset(uci, 0, sizeof(*uci));
-
-       err = sysfs_create_group(&sys_dev->kobj, &mc_attr_group);
-       if (err)
-               return err;
-
-       microcode_init_cpu(cpu, resume);
-
-       return 0;
-}
-
-static int mc_sysdev_add(struct sys_device *sys_dev)
-{
-       return __mc_sysdev_add(sys_dev, 0);
-}
-
-static int mc_sysdev_remove(struct sys_device *sys_dev)
-{
-       int cpu = sys_dev->id;
-
-       if (!cpu_online(cpu))
-               return 0;
-
-       pr_debug("Microcode:CPU %d removed\n", cpu);
-       microcode_fini_cpu(cpu);
-       sysfs_remove_group(&sys_dev->kobj, &mc_attr_group);
-       return 0;
-}
-
-static int mc_sysdev_resume(struct sys_device *dev)
-{
-       int cpu = dev->id;
-
-       if (!cpu_online(cpu))
-               return 0;
-       pr_debug("Microcode:CPU %d resumed\n", cpu);
-       /* only CPU 0 will apply ucode here */
-       apply_microcode(0);
-       return 0;
-}
-
-static struct sysdev_driver mc_sysdev_driver = {
-       .add = mc_sysdev_add,
-       .remove = mc_sysdev_remove,
-       .resume = mc_sysdev_resume,
-};
-
-static __cpuinit int
-mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu)
-{
-       unsigned int cpu = (unsigned long)hcpu;
-       struct sys_device *sys_dev;
-
-       sys_dev = get_cpu_sysdev(cpu);
-       switch (action) {
-       case CPU_UP_CANCELED_FROZEN:
-               /* The CPU refused to come up during a system resume */
-               microcode_fini_cpu(cpu);
-               break;
-       case CPU_ONLINE:
-       case CPU_DOWN_FAILED:
-               mc_sysdev_add(sys_dev);
-               break;
-       case CPU_ONLINE_FROZEN:
-               /* System-wide resume is in progress, try to apply microcode */
-               if (apply_microcode_check_cpu(cpu)) {
-                       /* The application of microcode failed */
-                       microcode_fini_cpu(cpu);
-                       __mc_sysdev_add(sys_dev, 1);
-                       break;
-               }
-       case CPU_DOWN_FAILED_FROZEN:
-               if (sysfs_create_group(&sys_dev->kobj, &mc_attr_group))
-                       printk(KERN_ERR "Microcode: Failed to create the sysfs "
-                               "group for CPU%d\n", cpu);
-               break;
-       case CPU_DOWN_PREPARE:
-               mc_sysdev_remove(sys_dev);
-               break;
-       case CPU_DOWN_PREPARE_FROZEN:
-               /* Suspend is in progress, only remove the interface */
-               sysfs_remove_group(&sys_dev->kobj, &mc_attr_group);
-               break;
-       }
-       return NOTIFY_OK;
-}
-
-static struct notifier_block __cpuinitdata mc_cpu_notifier = {
-       .notifier_call = mc_cpu_callback,
-};
-
-static int __init microcode_init (void)
-{
-       int error;
-
-       error = microcode_dev_init();
-       if (error)
-               return error;
-       microcode_pdev = platform_device_register_simple("microcode", -1,
-                                                        NULL, 0);
-       if (IS_ERR(microcode_pdev)) {
-               microcode_dev_exit();
-               return PTR_ERR(microcode_pdev);
-       }
-
-       lock_cpu_hotplug();
-       error = sysdev_driver_register(&cpu_sysdev_class, &mc_sysdev_driver);
-       unlock_cpu_hotplug();
-       if (error) {
-               microcode_dev_exit();
-               platform_device_unregister(microcode_pdev);
-               return error;
-       }
-
-       register_hotcpu_notifier(&mc_cpu_notifier);
-
-       printk(KERN_INFO 
-               "IA-32 Microcode Update Driver: v" MICROCODE_VERSION " <tigran@aivazian.fsnet.co.uk>\n");
-       return 0;
-}
-
-static void __exit microcode_exit (void)
-{
-       microcode_dev_exit();
-
-       unregister_hotcpu_notifier(&mc_cpu_notifier);
-
-       lock_cpu_hotplug();
-       sysdev_driver_unregister(&cpu_sysdev_class, &mc_sysdev_driver);
-       unlock_cpu_hotplug();
-
-       platform_device_unregister(microcode_pdev);
-}
-
-module_init(microcode_init)
-module_exit(microcode_exit)
diff --git a/arch/i386/kernel/module_32.c b/arch/i386/kernel/module_32.c
deleted file mode 100644 (file)
index 3db0a54..0000000
+++ /dev/null
@@ -1,152 +0,0 @@
-/*  Kernel module help for i386.
-    Copyright (C) 2001 Rusty Russell.
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
-*/
-#include <linux/moduleloader.h>
-#include <linux/elf.h>
-#include <linux/vmalloc.h>
-#include <linux/fs.h>
-#include <linux/string.h>
-#include <linux/kernel.h>
-#include <linux/bug.h>
-
-#if 0
-#define DEBUGP printk
-#else
-#define DEBUGP(fmt...)
-#endif
-
-void *module_alloc(unsigned long size)
-{
-       if (size == 0)
-               return NULL;
-       return vmalloc_exec(size);
-}
-
-
-/* Free memory returned from module_alloc */
-void module_free(struct module *mod, void *module_region)
-{
-       vfree(module_region);
-       /* FIXME: If module_region == mod->init_region, trim exception
-           table entries. */
-}
-
-/* We don't need anything special. */
-int module_frob_arch_sections(Elf_Ehdr *hdr,
-                             Elf_Shdr *sechdrs,
-                             char *secstrings,
-                             struct module *mod)
-{
-       return 0;
-}
-
-int apply_relocate(Elf32_Shdr *sechdrs,
-                  const char *strtab,
-                  unsigned int symindex,
-                  unsigned int relsec,
-                  struct module *me)
-{
-       unsigned int i;
-       Elf32_Rel *rel = (void *)sechdrs[relsec].sh_addr;
-       Elf32_Sym *sym;
-       uint32_t *location;
-
-       DEBUGP("Applying relocate section %u to %u\n", relsec,
-              sechdrs[relsec].sh_info);
-       for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rel); i++) {
-               /* This is where to make the change */
-               location = (void *)sechdrs[sechdrs[relsec].sh_info].sh_addr
-                       + rel[i].r_offset;
-               /* This is the symbol it is referring to.  Note that all
-                  undefined symbols have been resolved.  */
-               sym = (Elf32_Sym *)sechdrs[symindex].sh_addr
-                       + ELF32_R_SYM(rel[i].r_info);
-
-               switch (ELF32_R_TYPE(rel[i].r_info)) {
-               case R_386_32:
-                       /* We add the value into the location given */
-                       *location += sym->st_value;
-                       break;
-               case R_386_PC32:
-                       /* Add the value, subtract its postition */
-                       *location += sym->st_value - (uint32_t)location;
-                       break;
-               default:
-                       printk(KERN_ERR "module %s: Unknown relocation: %u\n",
-                              me->name, ELF32_R_TYPE(rel[i].r_info));
-                       return -ENOEXEC;
-               }
-       }
-       return 0;
-}
-
-int apply_relocate_add(Elf32_Shdr *sechdrs,
-                      const char *strtab,
-                      unsigned int symindex,
-                      unsigned int relsec,
-                      struct module *me)
-{
-       printk(KERN_ERR "module %s: ADD RELOCATION unsupported\n",
-              me->name);
-       return -ENOEXEC;
-}
-
-int module_finalize(const Elf_Ehdr *hdr,
-                   const Elf_Shdr *sechdrs,
-                   struct module *me)
-{
-       const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL,
-               *para = NULL;
-       char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset;
-
-       for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) { 
-               if (!strcmp(".text", secstrings + s->sh_name))
-                       text = s;
-               if (!strcmp(".altinstructions", secstrings + s->sh_name))
-                       alt = s;
-               if (!strcmp(".smp_locks", secstrings + s->sh_name))
-                       locks= s;
-               if (!strcmp(".parainstructions", secstrings + s->sh_name))
-                       para = s;
-       }
-
-       if (alt) {
-               /* patch .altinstructions */
-               void *aseg = (void *)alt->sh_addr;
-               apply_alternatives(aseg, aseg + alt->sh_size);
-       }
-       if (locks && text) {
-               void *lseg = (void *)locks->sh_addr;
-               void *tseg = (void *)text->sh_addr;
-               alternatives_smp_module_add(me, me->name,
-                                           lseg, lseg + locks->sh_size,
-                                           tseg, tseg + text->sh_size);
-       }
-
-       if (para) {
-               void *pseg = (void *)para->sh_addr;
-               apply_paravirt(pseg, pseg + para->sh_size);
-       }
-
-       return module_bug_finalize(hdr, sechdrs, me);
-}
-
-void module_arch_cleanup(struct module *mod)
-{
-       alternatives_smp_module_del(mod);
-       module_bug_cleanup(mod);
-}
diff --git a/arch/i386/kernel/mpparse_32.c b/arch/i386/kernel/mpparse_32.c
deleted file mode 100644 (file)
index 13abb4e..0000000
+++ /dev/null
@@ -1,1132 +0,0 @@
-/*
- *     Intel Multiprocessor Specification 1.1 and 1.4
- *     compliant MP-table parsing routines.
- *
- *     (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
- *     (c) 1998, 1999, 2000 Ingo Molnar <mingo@redhat.com>
- *
- *     Fixes
- *             Erich Boleyn    :       MP v1.4 and additional changes.
- *             Alan Cox        :       Added EBDA scanning
- *             Ingo Molnar     :       various cleanups and rewrites
- *             Maciej W. Rozycki:      Bits for default MP configurations
- *             Paul Diefenbaugh:       Added full ACPI support
- */
-
-#include <linux/mm.h>
-#include <linux/init.h>
-#include <linux/acpi.h>
-#include <linux/delay.h>
-#include <linux/bootmem.h>
-#include <linux/kernel_stat.h>
-#include <linux/mc146818rtc.h>
-#include <linux/bitops.h>
-
-#include <asm/smp.h>
-#include <asm/acpi.h>
-#include <asm/mtrr.h>
-#include <asm/mpspec.h>
-#include <asm/io_apic.h>
-
-#include <mach_apic.h>
-#include <mach_apicdef.h>
-#include <mach_mpparse.h>
-#include <bios_ebda.h>
-
-/* Have we found an MP table */
-int smp_found_config;
-unsigned int __cpuinitdata maxcpus = NR_CPUS;
-
-/*
- * Various Linux-internal data structures created from the
- * MP-table.
- */
-int apic_version [MAX_APICS];
-int mp_bus_id_to_type [MAX_MP_BUSSES];
-int mp_bus_id_to_node [MAX_MP_BUSSES];
-int mp_bus_id_to_local [MAX_MP_BUSSES];
-int quad_local_to_mp_bus_id [NR_CPUS/4][4];
-int mp_bus_id_to_pci_bus [MAX_MP_BUSSES] = { [0 ... MAX_MP_BUSSES-1] = -1 };
-static int mp_current_pci_id;
-
-/* I/O APIC entries */
-struct mpc_config_ioapic mp_ioapics[MAX_IO_APICS];
-
-/* # of MP IRQ source entries */
-struct mpc_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
-
-/* MP IRQ source entries */
-int mp_irq_entries;
-
-int nr_ioapics;
-
-int pic_mode;
-unsigned long mp_lapic_addr;
-
-unsigned int def_to_bigsmp = 0;
-
-/* Processor that is doing the boot up */
-unsigned int boot_cpu_physical_apicid = -1U;
-/* Internal processor count */
-unsigned int __cpuinitdata num_processors;
-
-/* Bitmask of physically existing CPUs */
-physid_mask_t phys_cpu_present_map;
-
-u8 bios_cpu_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };
-
-/*
- * Intel MP BIOS table parsing routines:
- */
-
-
-/*
- * Checksum an MP configuration block.
- */
-
-static int __init mpf_checksum(unsigned char *mp, int len)
-{
-       int sum = 0;
-
-       while (len--)
-               sum += *mp++;
-
-       return sum & 0xFF;
-}
-
-/*
- * Have to match translation table entries to main table entries by counter
- * hence the mpc_record variable .... can't see a less disgusting way of
- * doing this ....
- */
-
-static int mpc_record; 
-static struct mpc_config_translation *translation_table[MAX_MPC_ENTRY] __cpuinitdata;
-
-static void __cpuinit MP_processor_info (struct mpc_config_processor *m)
-{
-       int ver, apicid;
-       physid_mask_t phys_cpu;
-       
-       if (!(m->mpc_cpuflag & CPU_ENABLED))
-               return;
-
-       apicid = mpc_apic_id(m, translation_table[mpc_record]);
-
-       if (m->mpc_featureflag&(1<<0))
-               Dprintk("    Floating point unit present.\n");
-       if (m->mpc_featureflag&(1<<7))
-               Dprintk("    Machine Exception supported.\n");
-       if (m->mpc_featureflag&(1<<8))
-               Dprintk("    64 bit compare & exchange supported.\n");
-       if (m->mpc_featureflag&(1<<9))
-               Dprintk("    Internal APIC present.\n");
-       if (m->mpc_featureflag&(1<<11))
-               Dprintk("    SEP present.\n");
-       if (m->mpc_featureflag&(1<<12))
-               Dprintk("    MTRR  present.\n");
-       if (m->mpc_featureflag&(1<<13))
-               Dprintk("    PGE  present.\n");
-       if (m->mpc_featureflag&(1<<14))
-               Dprintk("    MCA  present.\n");
-       if (m->mpc_featureflag&(1<<15))
-               Dprintk("    CMOV  present.\n");
-       if (m->mpc_featureflag&(1<<16))
-               Dprintk("    PAT  present.\n");
-       if (m->mpc_featureflag&(1<<17))
-               Dprintk("    PSE  present.\n");
-       if (m->mpc_featureflag&(1<<18))
-               Dprintk("    PSN  present.\n");
-       if (m->mpc_featureflag&(1<<19))
-               Dprintk("    Cache Line Flush Instruction present.\n");
-       /* 20 Reserved */
-       if (m->mpc_featureflag&(1<<21))
-               Dprintk("    Debug Trace and EMON Store present.\n");
-       if (m->mpc_featureflag&(1<<22))
-               Dprintk("    ACPI Thermal Throttle Registers  present.\n");
-       if (m->mpc_featureflag&(1<<23))
-               Dprintk("    MMX  present.\n");
-       if (m->mpc_featureflag&(1<<24))
-               Dprintk("    FXSR  present.\n");
-       if (m->mpc_featureflag&(1<<25))
-               Dprintk("    XMM  present.\n");
-       if (m->mpc_featureflag&(1<<26))
-               Dprintk("    Willamette New Instructions  present.\n");
-       if (m->mpc_featureflag&(1<<27))
-               Dprintk("    Self Snoop  present.\n");
-       if (m->mpc_featureflag&(1<<28))
-               Dprintk("    HT  present.\n");
-       if (m->mpc_featureflag&(1<<29))
-               Dprintk("    Thermal Monitor present.\n");
-       /* 30, 31 Reserved */
-
-
-       if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) {
-               Dprintk("    Bootup CPU\n");
-               boot_cpu_physical_apicid = m->mpc_apicid;
-       }
-
-       ver = m->mpc_apicver;
-
-       /*
-        * Validate version
-        */
-       if (ver == 0x0) {
-               printk(KERN_WARNING "BIOS bug, APIC version is 0 for CPU#%d! "
-                               "fixing up to 0x10. (tell your hw vendor)\n",
-                               m->mpc_apicid);
-               ver = 0x10;
-       }
-       apic_version[m->mpc_apicid] = ver;
-
-       phys_cpu = apicid_to_cpu_present(apicid);
-       physids_or(phys_cpu_present_map, phys_cpu_present_map, phys_cpu);
-
-       if (num_processors >= NR_CPUS) {
-               printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached."
-                       "  Processor ignored.\n", NR_CPUS);
-               return;
-       }
-
-       if (num_processors >= maxcpus) {
-               printk(KERN_WARNING "WARNING: maxcpus limit of %i reached."
-                       " Processor ignored.\n", maxcpus);
-               return;
-       }
-
-       cpu_set(num_processors, cpu_possible_map);
-       num_processors++;
-
-       /*
-        * Would be preferable to switch to bigsmp when CONFIG_HOTPLUG_CPU=y
-        * but we need to work other dependencies like SMP_SUSPEND etc
-        * before this can be done without some confusion.
-        * if (CPU_HOTPLUG_ENABLED || num_processors > 8)
-        *       - Ashok Raj <ashok.raj@intel.com>
-        */
-       if (num_processors > 8) {
-               switch (boot_cpu_data.x86_vendor) {
-               case X86_VENDOR_INTEL:
-                       if (!APIC_XAPIC(ver)) {
-                               def_to_bigsmp = 0;
-                               break;
-                       }
-                       /* If P4 and above fall through */
-               case X86_VENDOR_AMD:
-                       def_to_bigsmp = 1;
-               }
-       }
-       bios_cpu_apicid[num_processors - 1] = m->mpc_apicid;
-}
-
-static void __init MP_bus_info (struct mpc_config_bus *m)
-{
-       char str[7];
-
-       memcpy(str, m->mpc_bustype, 6);
-       str[6] = 0;
-
-       mpc_oem_bus_info(m, str, translation_table[mpc_record]);
-
-#if MAX_MP_BUSSES < 256
-       if (m->mpc_busid >= MAX_MP_BUSSES) {
-               printk(KERN_WARNING "MP table busid value (%d) for bustype %s "
-                       " is too large, max. supported is %d\n",
-                       m->mpc_busid, str, MAX_MP_BUSSES - 1);
-               return;
-       }
-#endif
-
-       if (strncmp(str, BUSTYPE_ISA, sizeof(BUSTYPE_ISA)-1) == 0) {
-               mp_bus_id_to_type[m->mpc_busid] = MP_BUS_ISA;
-       } else if (strncmp(str, BUSTYPE_EISA, sizeof(BUSTYPE_EISA)-1) == 0) {
-               mp_bus_id_to_type[m->mpc_busid] = MP_BUS_EISA;
-       } else if (strncmp(str, BUSTYPE_PCI, sizeof(BUSTYPE_PCI)-1) == 0) {
-               mpc_oem_pci_bus(m, translation_table[mpc_record]);
-               mp_bus_id_to_type[m->mpc_busid] = MP_BUS_PCI;
-               mp_bus_id_to_pci_bus[m->mpc_busid] = mp_current_pci_id;
-               mp_current_pci_id++;
-       } else if (strncmp(str, BUSTYPE_MCA, sizeof(BUSTYPE_MCA)-1) == 0) {
-               mp_bus_id_to_type[m->mpc_busid] = MP_BUS_MCA;
-       } else {
-               printk(KERN_WARNING "Unknown bustype %s - ignoring\n", str);
-       }
-}
-
-static void __init MP_ioapic_info (struct mpc_config_ioapic *m)
-{
-       if (!(m->mpc_flags & MPC_APIC_USABLE))
-               return;
-
-       printk(KERN_INFO "I/O APIC #%d Version %d at 0x%lX.\n",
-               m->mpc_apicid, m->mpc_apicver, m->mpc_apicaddr);
-       if (nr_ioapics >= MAX_IO_APICS) {
-               printk(KERN_CRIT "Max # of I/O APICs (%d) exceeded (found %d).\n",
-                       MAX_IO_APICS, nr_ioapics);
-               panic("Recompile kernel with bigger MAX_IO_APICS!.\n");
-       }
-       if (!m->mpc_apicaddr) {
-               printk(KERN_ERR "WARNING: bogus zero I/O APIC address"
-                       " found in MP table, skipping!\n");
-               return;
-       }
-       mp_ioapics[nr_ioapics] = *m;
-       nr_ioapics++;
-}
-
-static void __init MP_intsrc_info (struct mpc_config_intsrc *m)
-{
-       mp_irqs [mp_irq_entries] = *m;
-       Dprintk("Int: type %d, pol %d, trig %d, bus %d,"
-               " IRQ %02x, APIC ID %x, APIC INT %02x\n",
-                       m->mpc_irqtype, m->mpc_irqflag & 3,
-                       (m->mpc_irqflag >> 2) & 3, m->mpc_srcbus,
-                       m->mpc_srcbusirq, m->mpc_dstapic, m->mpc_dstirq);
-       if (++mp_irq_entries == MAX_IRQ_SOURCES)
-               panic("Max # of irq sources exceeded!!\n");
-}
-
-static void __init MP_lintsrc_info (struct mpc_config_lintsrc *m)
-{
-       Dprintk("Lint: type %d, pol %d, trig %d, bus %d,"
-               " IRQ %02x, APIC ID %x, APIC LINT %02x\n",
-                       m->mpc_irqtype, m->mpc_irqflag & 3,
-                       (m->mpc_irqflag >> 2) &3, m->mpc_srcbusid,
-                       m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint);
-}
-
-#ifdef CONFIG_X86_NUMAQ
-static void __init MP_translation_info (struct mpc_config_translation *m)
-{
-       printk(KERN_INFO "Translation: record %d, type %d, quad %d, global %d, local %d\n", mpc_record, m->trans_type, m->trans_quad, m->trans_global, m->trans_local);
-
-       if (mpc_record >= MAX_MPC_ENTRY) 
-               printk(KERN_ERR "MAX_MPC_ENTRY exceeded!\n");
-       else
-               translation_table[mpc_record] = m; /* stash this for later */
-       if (m->trans_quad < MAX_NUMNODES && !node_online(m->trans_quad))
-               node_set_online(m->trans_quad);
-}
-
-/*
- * Read/parse the MPC oem tables
- */
-
-static void __init smp_read_mpc_oem(struct mp_config_oemtable *oemtable, \
-       unsigned short oemsize)
-{
-       int count = sizeof (*oemtable); /* the header size */
-       unsigned char *oemptr = ((unsigned char *)oemtable)+count;
-       
-       mpc_record = 0;
-       printk(KERN_INFO "Found an OEM MPC table at %8p - parsing it ... \n", oemtable);
-       if (memcmp(oemtable->oem_signature,MPC_OEM_SIGNATURE,4))
-       {
-               printk(KERN_WARNING "SMP mpc oemtable: bad signature [%c%c%c%c]!\n",
-                       oemtable->oem_signature[0],
-                       oemtable->oem_signature[1],
-                       oemtable->oem_signature[2],
-                       oemtable->oem_signature[3]);
-               return;
-       }
-       if (mpf_checksum((unsigned char *)oemtable,oemtable->oem_length))
-       {
-               printk(KERN_WARNING "SMP oem mptable: checksum error!\n");
-               return;
-       }
-       while (count < oemtable->oem_length) {
-               switch (*oemptr) {
-                       case MP_TRANSLATION:
-                       {
-                               struct mpc_config_translation *m=
-                                       (struct mpc_config_translation *)oemptr;
-                               MP_translation_info(m);
-                               oemptr += sizeof(*m);
-                               count += sizeof(*m);
-                               ++mpc_record;
-                               break;
-                       }
-                       default:
-                       {
-                               printk(KERN_WARNING "Unrecognised OEM table entry type! - %d\n", (int) *oemptr);
-                               return;
-                       }
-               }
-       }
-}
-
-static inline void mps_oem_check(struct mp_config_table *mpc, char *oem,
-               char *productid)
-{
-       if (strncmp(oem, "IBM NUMA", 8))
-               printk("Warning!  May not be a NUMA-Q system!\n");
-       if (mpc->mpc_oemptr)
-               smp_read_mpc_oem((struct mp_config_oemtable *) mpc->mpc_oemptr,
-                               mpc->mpc_oemsize);
-}
-#endif /* CONFIG_X86_NUMAQ */
-
-/*
- * Read/parse the MPC
- */
-
-static int __init smp_read_mpc(struct mp_config_table *mpc)
-{
-       char str[16];
-       char oem[10];
-       int count=sizeof(*mpc);
-       unsigned char *mpt=((unsigned char *)mpc)+count;
-
-       if (memcmp(mpc->mpc_signature,MPC_SIGNATURE,4)) {
-               printk(KERN_ERR "SMP mptable: bad signature [0x%x]!\n",
-                       *(u32 *)mpc->mpc_signature);
-               return 0;
-       }
-       if (mpf_checksum((unsigned char *)mpc,mpc->mpc_length)) {
-               printk(KERN_ERR "SMP mptable: checksum error!\n");
-               return 0;
-       }
-       if (mpc->mpc_spec!=0x01 && mpc->mpc_spec!=0x04) {
-               printk(KERN_ERR "SMP mptable: bad table version (%d)!!\n",
-                       mpc->mpc_spec);
-               return 0;
-       }
-       if (!mpc->mpc_lapic) {
-               printk(KERN_ERR "SMP mptable: null local APIC address!\n");
-               return 0;
-       }
-       memcpy(oem,mpc->mpc_oem,8);
-       oem[8]=0;
-       printk(KERN_INFO "OEM ID: %s ",oem);
-
-       memcpy(str,mpc->mpc_productid,12);
-       str[12]=0;
-       printk("Product ID: %s ",str);
-
-       mps_oem_check(mpc, oem, str);
-
-       printk("APIC at: 0x%lX\n",mpc->mpc_lapic);
-
-       /* 
-        * Save the local APIC address (it might be non-default) -- but only
-        * if we're not using ACPI.
-        */
-       if (!acpi_lapic)
-               mp_lapic_addr = mpc->mpc_lapic;
-
-       /*
-        *      Now process the configuration blocks.
-        */
-       mpc_record = 0;
-       while (count < mpc->mpc_length) {
-               switch(*mpt) {
-                       case MP_PROCESSOR:
-                       {
-                               struct mpc_config_processor *m=
-                                       (struct mpc_config_processor *)mpt;
-                               /* ACPI may have already provided this data */
-                               if (!acpi_lapic)
-                                       MP_processor_info(m);
-                               mpt += sizeof(*m);
-                               count += sizeof(*m);
-                               break;
-                       }
-                       case MP_BUS:
-                       {
-                               struct mpc_config_bus *m=
-                                       (struct mpc_config_bus *)mpt;
-                               MP_bus_info(m);
-                               mpt += sizeof(*m);
-                               count += sizeof(*m);
-                               break;
-                       }
-                       case MP_IOAPIC:
-                       {
-                               struct mpc_config_ioapic *m=
-                                       (struct mpc_config_ioapic *)mpt;
-                               MP_ioapic_info(m);
-                               mpt+=sizeof(*m);
-                               count+=sizeof(*m);
-                               break;
-                       }
-                       case MP_INTSRC:
-                       {
-                               struct mpc_config_intsrc *m=
-                                       (struct mpc_config_intsrc *)mpt;
-
-                               MP_intsrc_info(m);
-                               mpt+=sizeof(*m);
-                               count+=sizeof(*m);
-                               break;
-                       }
-                       case MP_LINTSRC:
-                       {
-                               struct mpc_config_lintsrc *m=
-                                       (struct mpc_config_lintsrc *)mpt;
-                               MP_lintsrc_info(m);
-                               mpt+=sizeof(*m);
-                               count+=sizeof(*m);
-                               break;
-                       }
-                       default:
-                       {
-                               count = mpc->mpc_length;
-                               break;
-                       }
-               }
-               ++mpc_record;
-       }
-       setup_apic_routing();
-       if (!num_processors)
-               printk(KERN_ERR "SMP mptable: no processors registered!\n");
-       return num_processors;
-}
-
-static int __init ELCR_trigger(unsigned int irq)
-{
-       unsigned int port;
-
-       port = 0x4d0 + (irq >> 3);
-       return (inb(port) >> (irq & 7)) & 1;
-}
-
-static void __init construct_default_ioirq_mptable(int mpc_default_type)
-{
-       struct mpc_config_intsrc intsrc;
-       int i;
-       int ELCR_fallback = 0;
-
-       intsrc.mpc_type = MP_INTSRC;
-       intsrc.mpc_irqflag = 0;                 /* conforming */
-       intsrc.mpc_srcbus = 0;
-       intsrc.mpc_dstapic = mp_ioapics[0].mpc_apicid;
-
-       intsrc.mpc_irqtype = mp_INT;
-
-       /*
-        *  If true, we have an ISA/PCI system with no IRQ entries
-        *  in the MP table. To prevent the PCI interrupts from being set up
-        *  incorrectly, we try to use the ELCR. The sanity check to see if
-        *  there is good ELCR data is very simple - IRQ0, 1, 2 and 13 can
-        *  never be level sensitive, so we simply see if the ELCR agrees.
-        *  If it does, we assume it's valid.
-        */
-       if (mpc_default_type == 5) {
-               printk(KERN_INFO "ISA/PCI bus type with no IRQ information... falling back to ELCR\n");
-
-               if (ELCR_trigger(0) || ELCR_trigger(1) || ELCR_trigger(2) || ELCR_trigger(13))
-                       printk(KERN_WARNING "ELCR contains invalid data... not using ELCR\n");
-               else {
-                       printk(KERN_INFO "Using ELCR to identify PCI interrupts\n");
-                       ELCR_fallback = 1;
-               }
-       }
-
-       for (i = 0; i < 16; i++) {
-               switch (mpc_default_type) {
-               case 2:
-                       if (i == 0 || i == 13)
-                               continue;       /* IRQ0 & IRQ13 not connected */
-                       /* fall through */
-               default:
-                       if (i == 2)
-                               continue;       /* IRQ2 is never connected */
-               }
-
-               if (ELCR_fallback) {
-                       /*
-                        *  If the ELCR indicates a level-sensitive interrupt, we
-                        *  copy that information over to the MP table in the
-                        *  irqflag field (level sensitive, active high polarity).
-                        */
-                       if (ELCR_trigger(i))
-                               intsrc.mpc_irqflag = 13;
-                       else
-                               intsrc.mpc_irqflag = 0;
-               }
-
-               intsrc.mpc_srcbusirq = i;
-               intsrc.mpc_dstirq = i ? i : 2;          /* IRQ0 to INTIN2 */
-               MP_intsrc_info(&intsrc);
-       }
-
-       intsrc.mpc_irqtype = mp_ExtINT;
-       intsrc.mpc_srcbusirq = 0;
-       intsrc.mpc_dstirq = 0;                          /* 8259A to INTIN0 */
-       MP_intsrc_info(&intsrc);
-}
-
-static inline void __init construct_default_ISA_mptable(int mpc_default_type)
-{
-       struct mpc_config_processor processor;
-       struct mpc_config_bus bus;
-       struct mpc_config_ioapic ioapic;
-       struct mpc_config_lintsrc lintsrc;
-       int linttypes[2] = { mp_ExtINT, mp_NMI };
-       int i;
-
-       /*
-        * local APIC has default address
-        */
-       mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
-
-       /*
-        * 2 CPUs, numbered 0 & 1.
-        */
-       processor.mpc_type = MP_PROCESSOR;
-       /* Either an integrated APIC or a discrete 82489DX. */
-       processor.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
-       processor.mpc_cpuflag = CPU_ENABLED;
-       processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) |
-                                  (boot_cpu_data.x86_model << 4) |
-                                  boot_cpu_data.x86_mask;
-       processor.mpc_featureflag = boot_cpu_data.x86_capability[0];
-       processor.mpc_reserved[0] = 0;
-       processor.mpc_reserved[1] = 0;
-       for (i = 0; i < 2; i++) {
-               processor.mpc_apicid = i;
-               MP_processor_info(&processor);
-       }
-
-       bus.mpc_type = MP_BUS;
-       bus.mpc_busid = 0;
-       switch (mpc_default_type) {
-               default:
-                       printk("???\n");
-                       printk(KERN_ERR "Unknown standard configuration %d\n",
-                               mpc_default_type);
-                       /* fall through */
-               case 1:
-               case 5:
-                       memcpy(bus.mpc_bustype, "ISA   ", 6);
-                       break;
-               case 2:
-               case 6:
-               case 3:
-                       memcpy(bus.mpc_bustype, "EISA  ", 6);
-                       break;
-               case 4:
-               case 7:
-                       memcpy(bus.mpc_bustype, "MCA   ", 6);
-       }
-       MP_bus_info(&bus);
-       if (mpc_default_type > 4) {
-               bus.mpc_busid = 1;
-               memcpy(bus.mpc_bustype, "PCI   ", 6);
-               MP_bus_info(&bus);
-       }
-
-       ioapic.mpc_type = MP_IOAPIC;
-       ioapic.mpc_apicid = 2;
-       ioapic.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
-       ioapic.mpc_flags = MPC_APIC_USABLE;
-       ioapic.mpc_apicaddr = 0xFEC00000;
-       MP_ioapic_info(&ioapic);
-
-       /*
-        * We set up most of the low 16 IO-APIC pins according to MPS rules.
-        */
-       construct_default_ioirq_mptable(mpc_default_type);
-
-       lintsrc.mpc_type = MP_LINTSRC;
-       lintsrc.mpc_irqflag = 0;                /* conforming */
-       lintsrc.mpc_srcbusid = 0;
-       lintsrc.mpc_srcbusirq = 0;
-       lintsrc.mpc_destapic = MP_APIC_ALL;
-       for (i = 0; i < 2; i++) {
-               lintsrc.mpc_irqtype = linttypes[i];
-               lintsrc.mpc_destapiclint = i;
-               MP_lintsrc_info(&lintsrc);
-       }
-}
-
-static struct intel_mp_floating *mpf_found;
-
-/*
- * Scan the memory blocks for an SMP configuration block.
- */
-void __init get_smp_config (void)
-{
-       struct intel_mp_floating *mpf = mpf_found;
-
-       /*
-        * ACPI supports both logical (e.g. Hyper-Threading) and physical 
-        * processors, where MPS only supports physical.
-        */
-       if (acpi_lapic && acpi_ioapic) {
-               printk(KERN_INFO "Using ACPI (MADT) for SMP configuration information\n");
-               return;
-       }
-       else if (acpi_lapic)
-               printk(KERN_INFO "Using ACPI for processor (LAPIC) configuration information\n");
-
-       printk(KERN_INFO "Intel MultiProcessor Specification v1.%d\n", mpf->mpf_specification);
-       if (mpf->mpf_feature2 & (1<<7)) {
-               printk(KERN_INFO "    IMCR and PIC compatibility mode.\n");
-               pic_mode = 1;
-       } else {
-               printk(KERN_INFO "    Virtual Wire compatibility mode.\n");
-               pic_mode = 0;
-       }
-
-       /*
-        * Now see if we need to read further.
-        */
-       if (mpf->mpf_feature1 != 0) {
-
-               printk(KERN_INFO "Default MP configuration #%d\n", mpf->mpf_feature1);
-               construct_default_ISA_mptable(mpf->mpf_feature1);
-
-       } else if (mpf->mpf_physptr) {
-
-               /*
-                * Read the physical hardware table.  Anything here will
-                * override the defaults.
-                */
-               if (!smp_read_mpc(phys_to_virt(mpf->mpf_physptr))) {
-                       smp_found_config = 0;
-                       printk(KERN_ERR "BIOS bug, MP table errors detected!...\n");
-                       printk(KERN_ERR "... disabling SMP support. (tell your hw vendor)\n");
-                       return;
-               }
-               /*
-                * If there are no explicit MP IRQ entries, then we are
-                * broken.  We set up most of the low 16 IO-APIC pins to
-                * ISA defaults and hope it will work.
-                */
-               if (!mp_irq_entries) {
-                       struct mpc_config_bus bus;
-
-                       printk(KERN_ERR "BIOS bug, no explicit IRQ entries, using default mptable. (tell your hw vendor)\n");
-
-                       bus.mpc_type = MP_BUS;
-                       bus.mpc_busid = 0;
-                       memcpy(bus.mpc_bustype, "ISA   ", 6);
-                       MP_bus_info(&bus);
-
-                       construct_default_ioirq_mptable(0);
-               }
-
-       } else
-               BUG();
-
-       printk(KERN_INFO "Processors: %d\n", num_processors);
-       /*
-        * Only use the first configuration found.
-        */
-}
-
-static int __init smp_scan_config (unsigned long base, unsigned long length)
-{
-       unsigned long *bp = phys_to_virt(base);
-       struct intel_mp_floating *mpf;
-
-       Dprintk("Scan SMP from %p for %ld bytes.\n", bp,length);
-       if (sizeof(*mpf) != 16)
-               printk("Error: MPF size\n");
-
-       while (length > 0) {
-               mpf = (struct intel_mp_floating *)bp;
-               if ((*bp == SMP_MAGIC_IDENT) &&
-                       (mpf->mpf_length == 1) &&
-                       !mpf_checksum((unsigned char *)bp, 16) &&
-                       ((mpf->mpf_specification == 1)
-                               || (mpf->mpf_specification == 4)) ) {
-
-                       smp_found_config = 1;
-                       printk(KERN_INFO "found SMP MP-table at %08lx\n",
-                                               virt_to_phys(mpf));
-                       reserve_bootmem(virt_to_phys(mpf), PAGE_SIZE);
-                       if (mpf->mpf_physptr) {
-                               /*
-                                * We cannot access to MPC table to compute
-                                * table size yet, as only few megabytes from
-                                * the bottom is mapped now.
-                                * PC-9800's MPC table places on the very last
-                                * of physical memory; so that simply reserving
-                                * PAGE_SIZE from mpg->mpf_physptr yields BUG()
-                                * in reserve_bootmem.
-                                */
-                               unsigned long size = PAGE_SIZE;
-                               unsigned long end = max_low_pfn * PAGE_SIZE;
-                               if (mpf->mpf_physptr + size > end)
-                                       size = end - mpf->mpf_physptr;
-                               reserve_bootmem(mpf->mpf_physptr, size);
-                       }
-
-                       mpf_found = mpf;
-                       return 1;
-               }
-               bp += 4;
-               length -= 16;
-       }
-       return 0;
-}
-
-void __init find_smp_config (void)
-{
-       unsigned int address;
-
-       /*
-        * FIXME: Linux assumes you have 640K of base ram..
-        * this continues the error...
-        *
-        * 1) Scan the bottom 1K for a signature
-        * 2) Scan the top 1K of base RAM
-        * 3) Scan the 64K of bios
-        */
-       if (smp_scan_config(0x0,0x400) ||
-               smp_scan_config(639*0x400,0x400) ||
-                       smp_scan_config(0xF0000,0x10000))
-               return;
-       /*
-        * If it is an SMP machine we should know now, unless the
-        * configuration is in an EISA/MCA bus machine with an
-        * extended bios data area.
-        *
-        * there is a real-mode segmented pointer pointing to the
-        * 4K EBDA area at 0x40E, calculate and scan it here.
-        *
-        * NOTE! There are Linux loaders that will corrupt the EBDA
-        * area, and as such this kind of SMP config may be less
-        * trustworthy, simply because the SMP table may have been
-        * stomped on during early boot. These loaders are buggy and
-        * should be fixed.
-        *
-        * MP1.4 SPEC states to only scan first 1K of 4K EBDA.
-        */
-
-       address = get_bios_ebda();
-       if (address)
-               smp_scan_config(address, 0x400);
-}
-
-int es7000_plat;
-
-/* --------------------------------------------------------------------------
-                            ACPI-based MP Configuration
-   -------------------------------------------------------------------------- */
-
-#ifdef CONFIG_ACPI
-
-void __init mp_register_lapic_address(u64 address)
-{
-       mp_lapic_addr = (unsigned long) address;
-
-       set_fixmap_nocache(FIX_APIC_BASE, mp_lapic_addr);
-
-       if (boot_cpu_physical_apicid == -1U)
-               boot_cpu_physical_apicid = GET_APIC_ID(apic_read(APIC_ID));
-
-       Dprintk("Boot CPU = %d\n", boot_cpu_physical_apicid);
-}
-
-void __cpuinit mp_register_lapic (u8 id, u8 enabled)
-{
-       struct mpc_config_processor processor;
-       int boot_cpu = 0;
-       
-       if (MAX_APICS - id <= 0) {
-               printk(KERN_WARNING "Processor #%d invalid (max %d)\n",
-                       id, MAX_APICS);
-               return;
-       }
-
-       if (id == boot_cpu_physical_apicid)
-               boot_cpu = 1;
-
-       processor.mpc_type = MP_PROCESSOR;
-       processor.mpc_apicid = id;
-       processor.mpc_apicver = GET_APIC_VERSION(apic_read(APIC_LVR));
-       processor.mpc_cpuflag = (enabled ? CPU_ENABLED : 0);
-       processor.mpc_cpuflag |= (boot_cpu ? CPU_BOOTPROCESSOR : 0);
-       processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) | 
-               (boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_mask;
-       processor.mpc_featureflag = boot_cpu_data.x86_capability[0];
-       processor.mpc_reserved[0] = 0;
-       processor.mpc_reserved[1] = 0;
-
-       MP_processor_info(&processor);
-}
-
-#ifdef CONFIG_X86_IO_APIC
-
-#define MP_ISA_BUS             0
-#define MP_MAX_IOAPIC_PIN      127
-
-static struct mp_ioapic_routing {
-       int                     apic_id;
-       int                     gsi_base;
-       int                     gsi_end;
-       u32                     pin_programmed[4];
-} mp_ioapic_routing[MAX_IO_APICS];
-
-static int mp_find_ioapic (int gsi)
-{
-       int i = 0;
-
-       /* Find the IOAPIC that manages this GSI. */
-       for (i = 0; i < nr_ioapics; i++) {
-               if ((gsi >= mp_ioapic_routing[i].gsi_base)
-                       && (gsi <= mp_ioapic_routing[i].gsi_end))
-                       return i;
-       }
-
-       printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi);
-
-       return -1;
-}
-
-void __init mp_register_ioapic(u8 id, u32 address, u32 gsi_base)
-{
-       int idx = 0;
-       int tmpid;
-
-       if (nr_ioapics >= MAX_IO_APICS) {
-               printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded "
-                       "(found %d)\n", MAX_IO_APICS, nr_ioapics);
-               panic("Recompile kernel with bigger MAX_IO_APICS!\n");
-       }
-       if (!address) {
-               printk(KERN_ERR "WARNING: Bogus (zero) I/O APIC address"
-                       " found in MADT table, skipping!\n");
-               return;
-       }
-
-       idx = nr_ioapics++;
-
-       mp_ioapics[idx].mpc_type = MP_IOAPIC;
-       mp_ioapics[idx].mpc_flags = MPC_APIC_USABLE;
-       mp_ioapics[idx].mpc_apicaddr = address;
-
-       set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
-       if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
-               && !APIC_XAPIC(apic_version[boot_cpu_physical_apicid]))
-               tmpid = io_apic_get_unique_id(idx, id);
-       else
-               tmpid = id;
-       if (tmpid == -1) {
-               nr_ioapics--;
-               return;
-       }
-       mp_ioapics[idx].mpc_apicid = tmpid;
-       mp_ioapics[idx].mpc_apicver = io_apic_get_version(idx);
-       
-       /* 
-        * Build basic GSI lookup table to facilitate gsi->io_apic lookups
-        * and to prevent reprogramming of IOAPIC pins (PCI GSIs).
-        */
-       mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].mpc_apicid;
-       mp_ioapic_routing[idx].gsi_base = gsi_base;
-       mp_ioapic_routing[idx].gsi_end = gsi_base + 
-               io_apic_get_redir_entries(idx);
-
-       printk("IOAPIC[%d]: apic_id %d, version %d, address 0x%lx, "
-               "GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid, 
-               mp_ioapics[idx].mpc_apicver, mp_ioapics[idx].mpc_apicaddr,
-               mp_ioapic_routing[idx].gsi_base,
-               mp_ioapic_routing[idx].gsi_end);
-}
-
-void __init
-mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi)
-{
-       struct mpc_config_intsrc intsrc;
-       int                     ioapic = -1;
-       int                     pin = -1;
-
-       /* 
-        * Convert 'gsi' to 'ioapic.pin'.
-        */
-       ioapic = mp_find_ioapic(gsi);
-       if (ioapic < 0)
-               return;
-       pin = gsi - mp_ioapic_routing[ioapic].gsi_base;
-
-       /*
-        * TBD: This check is for faulty timer entries, where the override
-        *      erroneously sets the trigger to level, resulting in a HUGE 
-        *      increase of timer interrupts!
-        */
-       if ((bus_irq == 0) && (trigger == 3))
-               trigger = 1;
-
-       intsrc.mpc_type = MP_INTSRC;
-       intsrc.mpc_irqtype = mp_INT;
-       intsrc.mpc_irqflag = (trigger << 2) | polarity;
-       intsrc.mpc_srcbus = MP_ISA_BUS;
-       intsrc.mpc_srcbusirq = bus_irq;                                /* IRQ */
-       intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid;        /* APIC ID */
-       intsrc.mpc_dstirq = pin;                                    /* INTIN# */
-
-       Dprintk("Int: type %d, pol %d, trig %d, bus %d, irq %d, %d-%d\n",
-               intsrc.mpc_irqtype, intsrc.mpc_irqflag & 3, 
-               (intsrc.mpc_irqflag >> 2) & 3, intsrc.mpc_srcbus, 
-               intsrc.mpc_srcbusirq, intsrc.mpc_dstapic, intsrc.mpc_dstirq);
-
-       mp_irqs[mp_irq_entries] = intsrc;
-       if (++mp_irq_entries == MAX_IRQ_SOURCES)
-               panic("Max # of irq sources exceeded!\n");
-}
-
-void __init mp_config_acpi_legacy_irqs (void)
-{
-       struct mpc_config_intsrc intsrc;
-       int i = 0;
-       int ioapic = -1;
-
-       /* 
-        * Fabricate the legacy ISA bus (bus #31).
-        */
-       mp_bus_id_to_type[MP_ISA_BUS] = MP_BUS_ISA;
-       Dprintk("Bus #%d is ISA\n", MP_ISA_BUS);
-
-       /*
-        * Older generations of ES7000 have no legacy identity mappings
-        */
-       if (es7000_plat == 1)
-               return;
-
-       /* 
-        * Locate the IOAPIC that manages the ISA IRQs (0-15). 
-        */
-       ioapic = mp_find_ioapic(0);
-       if (ioapic < 0)
-               return;
-
-       intsrc.mpc_type = MP_INTSRC;
-       intsrc.mpc_irqflag = 0;                                 /* Conforming */
-       intsrc.mpc_srcbus = MP_ISA_BUS;
-       intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid;
-
-       /* 
-        * Use the default configuration for the IRQs 0-15.  Unless
-        * overriden by (MADT) interrupt source override entries.
-        */
-       for (i = 0; i < 16; i++) {
-               int idx;
-
-               for (idx = 0; idx < mp_irq_entries; idx++) {
-                       struct mpc_config_intsrc *irq = mp_irqs + idx;
-
-                       /* Do we already have a mapping for this ISA IRQ? */
-                       if (irq->mpc_srcbus == MP_ISA_BUS && irq->mpc_srcbusirq == i)
-                               break;
-
-                       /* Do we already have a mapping for this IOAPIC pin */
-                       if ((irq->mpc_dstapic == intsrc.mpc_dstapic) &&
-                               (irq->mpc_dstirq == i))
-                               break;
-               }
-
-               if (idx != mp_irq_entries) {
-                       printk(KERN_DEBUG "ACPI: IRQ%d used by override.\n", i);
-                       continue;                       /* IRQ already used */
-               }
-
-               intsrc.mpc_irqtype = mp_INT;
-               intsrc.mpc_srcbusirq = i;                  /* Identity mapped */
-               intsrc.mpc_dstirq = i;
-
-               Dprintk("Int: type %d, pol %d, trig %d, bus %d, irq %d, "
-                       "%d-%d\n", intsrc.mpc_irqtype, intsrc.mpc_irqflag & 3, 
-                       (intsrc.mpc_irqflag >> 2) & 3, intsrc.mpc_srcbus, 
-                       intsrc.mpc_srcbusirq, intsrc.mpc_dstapic, 
-                       intsrc.mpc_dstirq);
-
-               mp_irqs[mp_irq_entries] = intsrc;
-               if (++mp_irq_entries == MAX_IRQ_SOURCES)
-                       panic("Max # of irq sources exceeded!\n");
-       }
-}
-
-#define MAX_GSI_NUM    4096
-
-int mp_register_gsi(u32 gsi, int triggering, int polarity)
-{
-       int ioapic = -1;
-       int ioapic_pin = 0;
-       int idx, bit = 0;
-       static int pci_irq = 16;
-       /*
-        * Mapping between Global System Interrups, which
-        * represent all possible interrupts, and IRQs
-        * assigned to actual devices.
-        */
-       static int              gsi_to_irq[MAX_GSI_NUM];
-
-       /* Don't set up the ACPI SCI because it's already set up */
-       if (acpi_gbl_FADT.sci_interrupt == gsi)
-               return gsi;
-
-       ioapic = mp_find_ioapic(gsi);
-       if (ioapic < 0) {
-               printk(KERN_WARNING "No IOAPIC for GSI %u\n", gsi);
-               return gsi;
-       }
-
-       ioapic_pin = gsi - mp_ioapic_routing[ioapic].gsi_base;
-
-       if (ioapic_renumber_irq)
-               gsi = ioapic_renumber_irq(ioapic, gsi);
-
-       /* 
-        * Avoid pin reprogramming.  PRTs typically include entries  
-        * with redundant pin->gsi mappings (but unique PCI devices);
-        * we only program the IOAPIC on the first.
-        */
-       bit = ioapic_pin % 32;
-       idx = (ioapic_pin < 32) ? 0 : (ioapic_pin / 32);
-       if (idx > 3) {
-               printk(KERN_ERR "Invalid reference to IOAPIC pin "
-                       "%d-%d\n", mp_ioapic_routing[ioapic].apic_id, 
-                       ioapic_pin);
-               return gsi;
-       }
-       if ((1<<bit) & mp_ioapic_routing[ioapic].pin_programmed[idx]) {
-               Dprintk(KERN_DEBUG "Pin %d-%d already programmed\n",
-                       mp_ioapic_routing[ioapic].apic_id, ioapic_pin);
-               return gsi_to_irq[gsi];
-       }
-
-       mp_ioapic_routing[ioapic].pin_programmed[idx] |= (1<<bit);
-
-       if (triggering == ACPI_LEVEL_SENSITIVE) {
-               /*
-                * For PCI devices assign IRQs in order, avoiding gaps
-                * due to unused I/O APIC pins.
-                */
-               int irq = gsi;
-               if (gsi < MAX_GSI_NUM) {
-                       /*
-                        * Retain the VIA chipset work-around (gsi > 15), but
-                        * avoid a problem where the 8254 timer (IRQ0) is setup
-                        * via an override (so it's not on pin 0 of the ioapic),
-                        * and at the same time, the pin 0 interrupt is a PCI
-                        * type.  The gsi > 15 test could cause these two pins
-                        * to be shared as IRQ0, and they are not shareable.
-                        * So test for this condition, and if necessary, avoid
-                        * the pin collision.
-                        */
-                       if (gsi > 15 || (gsi == 0 && !timer_uses_ioapic_pin_0))
-                               gsi = pci_irq++;
-                       /*
-                        * Don't assign IRQ used by ACPI SCI
-                        */
-                       if (gsi == acpi_gbl_FADT.sci_interrupt)
-                               gsi = pci_irq++;
-                       gsi_to_irq[irq] = gsi;
-               } else {
-                       printk(KERN_ERR "GSI %u is too high\n", gsi);
-                       return gsi;
-               }
-       }
-
-       io_apic_set_pci_routing(ioapic, ioapic_pin, gsi,
-                   triggering == ACPI_EDGE_SENSITIVE ? 0 : 1,
-                   polarity == ACPI_ACTIVE_HIGH ? 0 : 1);
-       return gsi;
-}
-
-#endif /* CONFIG_X86_IO_APIC */
-#endif /* CONFIG_ACPI */
diff --git a/arch/i386/kernel/msr.c b/arch/i386/kernel/msr.c
deleted file mode 100644 (file)
index 0c1069b..0000000
+++ /dev/null
@@ -1,224 +0,0 @@
-/* ----------------------------------------------------------------------- *
- *   
- *   Copyright 2000 H. Peter Anvin - All Rights Reserved
- *
- *   This program is free software; you can redistribute it and/or modify
- *   it under the terms of the GNU General Public License as published by
- *   the Free Software Foundation, Inc., 675 Mass Ave, Cambridge MA 02139,
- *   USA; either version 2 of the License, or (at your option) any later
- *   version; incorporated herein by reference.
- *
- * ----------------------------------------------------------------------- */
-
-/*
- * msr.c
- *
- * x86 MSR access device
- *
- * This device is accessed by lseek() to the appropriate register number
- * and then read/write in chunks of 8 bytes.  A larger size means multiple
- * reads or writes of the same register.
- *
- * This driver uses /dev/cpu/%d/msr where %d is the minor number, and on
- * an SMP box will direct the access to CPU %d.
- */
-
-#include <linux/module.h>
-
-#include <linux/types.h>
-#include <linux/errno.h>
-#include <linux/fcntl.h>
-#include <linux/init.h>
-#include <linux/poll.h>
-#include <linux/smp.h>
-#include <linux/smp_lock.h>
-#include <linux/major.h>
-#include <linux/fs.h>
-#include <linux/device.h>
-#include <linux/cpu.h>
-#include <linux/notifier.h>
-
-#include <asm/processor.h>
-#include <asm/msr.h>
-#include <asm/uaccess.h>
-#include <asm/system.h>
-
-static struct class *msr_class;
-
-static loff_t msr_seek(struct file *file, loff_t offset, int orig)
-{
-       loff_t ret = -EINVAL;
-
-       lock_kernel();
-       switch (orig) {
-       case 0:
-               file->f_pos = offset;
-               ret = file->f_pos;
-               break;
-       case 1:
-               file->f_pos += offset;
-               ret = file->f_pos;
-       }
-       unlock_kernel();
-       return ret;
-}
-
-static ssize_t msr_read(struct file *file, char __user * buf,
-                       size_t count, loff_t * ppos)
-{
-       u32 __user *tmp = (u32 __user *) buf;
-       u32 data[2];
-       u32 reg = *ppos;
-       int cpu = iminor(file->f_path.dentry->d_inode);
-       int err;
-
-       if (count % 8)
-               return -EINVAL; /* Invalid chunk size */
-
-       for (; count; count -= 8) {
-               err = rdmsr_safe_on_cpu(cpu, reg, &data[0], &data[1]);
-               if (err)
-                       return -EIO;
-               if (copy_to_user(tmp, &data, 8))
-                       return -EFAULT;
-               tmp += 2;
-       }
-
-       return ((char __user *)tmp) - buf;
-}
-
-static ssize_t msr_write(struct file *file, const char __user *buf,
-                        size_t count, loff_t *ppos)
-{
-       const u32 __user *tmp = (const u32 __user *)buf;
-       u32 data[2];
-       u32 reg = *ppos;
-       int cpu = iminor(file->f_path.dentry->d_inode);
-       int err;
-
-       if (count % 8)
-               return -EINVAL; /* Invalid chunk size */
-
-       for (; count; count -= 8) {
-               if (copy_from_user(&data, tmp, 8))
-                       return -EFAULT;
-               err = wrmsr_safe_on_cpu(cpu, reg, data[0], data[1]);
-               if (err)
-                       return -EIO;
-               tmp += 2;
-       }
-
-       return ((char __user *)tmp) - buf;
-}
-
-static int msr_open(struct inode *inode, struct file *file)
-{
-       unsigned int cpu = iminor(file->f_path.dentry->d_inode);
-       struct cpuinfo_x86 *c = &(cpu_data)[cpu];
-
-       if (cpu >= NR_CPUS || !cpu_online(cpu))
-               return -ENXIO;  /* No such CPU */
-       if (!cpu_has(c, X86_FEATURE_MSR))
-               return -EIO;    /* MSR not supported */
-
-       return 0;
-}
-
-/*
- * File operations we support
- */
-static const struct file_operations msr_fops = {
-       .owner = THIS_MODULE,
-       .llseek = msr_seek,
-       .read = msr_read,
-       .write = msr_write,
-       .open = msr_open,
-};
-
-static int msr_device_create(int i)
-{
-       int err = 0;
-       struct device *dev;
-
-       dev = device_create(msr_class, NULL, MKDEV(MSR_MAJOR, i), "msr%d",i);
-       if (IS_ERR(dev))
-               err = PTR_ERR(dev);
-       return err;
-}
-
-static int msr_class_cpu_callback(struct notifier_block *nfb,
-                               unsigned long action, void *hcpu)
-{
-       unsigned int cpu = (unsigned long)hcpu;
-
-       switch (action) {
-       case CPU_ONLINE:
-       case CPU_ONLINE_FROZEN:
-               msr_device_create(cpu);
-               break;
-       case CPU_DEAD:
-       case CPU_DEAD_FROZEN:
-               device_destroy(msr_class, MKDEV(MSR_MAJOR, cpu));
-               break;
-       }
-       return NOTIFY_OK;
-}
-
-static struct notifier_block __cpuinitdata msr_class_cpu_notifier =
-{
-       .notifier_call = msr_class_cpu_callback,
-};
-
-static int __init msr_init(void)
-{
-       int i, err = 0;
-       i = 0;
-
-       if (register_chrdev(MSR_MAJOR, "cpu/msr", &msr_fops)) {
-               printk(KERN_ERR "msr: unable to get major %d for msr\n",
-                      MSR_MAJOR);
-               err = -EBUSY;
-               goto out;
-       }
-       msr_class = class_create(THIS_MODULE, "msr");
-       if (IS_ERR(msr_class)) {
-               err = PTR_ERR(msr_class);
-               goto out_chrdev;
-       }
-       for_each_online_cpu(i) {
-               err = msr_device_create(i);
-               if (err != 0)
-                       goto out_class;
-       }
-       register_hotcpu_notifier(&msr_class_cpu_notifier);
-
-       err = 0;
-       goto out;
-
-out_class:
-       i = 0;
-       for_each_online_cpu(i)
-               device_destroy(msr_class, MKDEV(MSR_MAJOR, i));
-       class_destroy(msr_class);
-out_chrdev:
-       unregister_chrdev(MSR_MAJOR, "cpu/msr");
-out:
-       return err;
-}
-
-static void __exit msr_exit(void)
-{
-       int cpu = 0;
-       for_each_online_cpu(cpu)
-               device_destroy(msr_class, MKDEV(MSR_MAJOR, cpu));
-       class_destroy(msr_class);
-       unregister_chrdev(MSR_MAJOR, "cpu/msr");
-       unregister_hotcpu_notifier(&msr_class_cpu_notifier);
-}
-
-module_init(msr_init);
-module_exit(msr_exit)
-
-MODULE_AUTHOR("H. Peter Anvin <hpa@zytor.com>");
-MODULE_DESCRIPTION("x86 generic MSR driver");
-MODULE_LICENSE("GPL");
diff --git a/arch/i386/kernel/nmi_32.c b/arch/i386/kernel/nmi_32.c
deleted file mode 100644 (file)
index c7227e2..0000000
+++ /dev/null
@@ -1,468 +0,0 @@
-/*
- *  linux/arch/i386/nmi.c
- *
- *  NMI watchdog support on APIC systems
- *
- *  Started by Ingo Molnar <mingo@redhat.com>
- *
- *  Fixes:
- *  Mikael Pettersson  : AMD K7 support for local APIC NMI watchdog.
- *  Mikael Pettersson  : Power Management for local APIC NMI watchdog.
- *  Mikael Pettersson  : Pentium 4 support for local APIC NMI watchdog.
- *  Pavel Machek and
- *  Mikael Pettersson  : PM converted to driver model. Disable/enable API.
- */
-
-#include <linux/delay.h>
-#include <linux/interrupt.h>
-#include <linux/module.h>
-#include <linux/nmi.h>
-#include <linux/sysdev.h>
-#include <linux/sysctl.h>
-#include <linux/percpu.h>
-#include <linux/kprobes.h>
-#include <linux/cpumask.h>
-#include <linux/kernel_stat.h>
-#include <linux/kdebug.h>
-
-#include <asm/smp.h>
-#include <asm/nmi.h>
-
-#include "mach_traps.h"
-
-int unknown_nmi_panic;
-int nmi_watchdog_enabled;
-
-static cpumask_t backtrace_mask = CPU_MASK_NONE;
-
-/* nmi_active:
- * >0: the lapic NMI watchdog is active, but can be disabled
- * <0: the lapic NMI watchdog has not been set up, and cannot
- *     be enabled
- *  0: the lapic NMI watchdog is disabled, but can be enabled
- */
-atomic_t nmi_active = ATOMIC_INIT(0);          /* oprofile uses this */
-
-unsigned int nmi_watchdog = NMI_DEFAULT;
-static unsigned int nmi_hz = HZ;
-
-static DEFINE_PER_CPU(short, wd_enabled);
-
-/* local prototypes */
-static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu);
-
-static int endflag __initdata = 0;
-
-#ifdef CONFIG_SMP
-/* The performance counters used by NMI_LOCAL_APIC don't trigger when
- * the CPU is idle. To make sure the NMI watchdog really ticks on all
- * CPUs during the test make them busy.
- */
-static __init void nmi_cpu_busy(void *data)
-{
-       local_irq_enable_in_hardirq();
-       /* Intentionally don't use cpu_relax here. This is
-          to make sure that the performance counter really ticks,
-          even if there is a simulator or similar that catches the
-          pause instruction. On a real HT machine this is fine because
-          all other CPUs are busy with "useless" delay loops and don't
-          care if they get somewhat less cycles. */
-       while (endflag == 0)
-               mb();
-}
-#endif
-
-static int __init check_nmi_watchdog(void)
-{
-       unsigned int *prev_nmi_count;
-       int cpu;
-
-       if ((nmi_watchdog == NMI_NONE) || (nmi_watchdog == NMI_DISABLED))
-               return 0;
-
-       if (!atomic_read(&nmi_active))
-               return 0;
-
-       prev_nmi_count = kmalloc(NR_CPUS * sizeof(int), GFP_KERNEL);
-       if (!prev_nmi_count)
-               return -1;
-
-       printk(KERN_INFO "Testing NMI watchdog ... ");
-
-       if (nmi_watchdog == NMI_LOCAL_APIC)
-               smp_call_function(nmi_cpu_busy, (void *)&endflag, 0, 0);
-
-       for_each_possible_cpu(cpu)
-               prev_nmi_count[cpu] = per_cpu(irq_stat, cpu).__nmi_count;
-       local_irq_enable();
-       mdelay((20*1000)/nmi_hz); // wait 20 ticks
-
-       for_each_possible_cpu(cpu) {
-#ifdef CONFIG_SMP
-               /* Check cpu_callin_map here because that is set
-                  after the timer is started. */
-               if (!cpu_isset(cpu, cpu_callin_map))
-                       continue;
-#endif
-               if (!per_cpu(wd_enabled, cpu))
-                       continue;
-               if (nmi_count(cpu) - prev_nmi_count[cpu] <= 5) {
-                       printk("CPU#%d: NMI appears to be stuck (%d->%d)!\n",
-                               cpu,
-                               prev_nmi_count[cpu],
-                               nmi_count(cpu));
-                       per_cpu(wd_enabled, cpu) = 0;
-                       atomic_dec(&nmi_active);
-               }
-       }
-       endflag = 1;
-       if (!atomic_read(&nmi_active)) {
-               kfree(prev_nmi_count);
-               atomic_set(&nmi_active, -1);
-               return -1;
-       }
-       printk("OK.\n");
-
-       /* now that we know it works we can reduce NMI frequency to
-          something more reasonable; makes a difference in some configs */
-       if (nmi_watchdog == NMI_LOCAL_APIC)
-               nmi_hz = lapic_adjust_nmi_hz(1);
-
-       kfree(prev_nmi_count);
-       return 0;
-}
-/* This needs to happen later in boot so counters are working */
-late_initcall(check_nmi_watchdog);
-
-static int __init setup_nmi_watchdog(char *str)
-{
-       int nmi;
-
-       get_option(&str, &nmi);
-
-       if ((nmi >= NMI_INVALID) || (nmi < NMI_NONE))
-               return 0;
-
-       nmi_watchdog = nmi;
-       return 1;
-}
-
-__setup("nmi_watchdog=", setup_nmi_watchdog);
-
-
-/* Suspend/resume support */
-
-#ifdef CONFIG_PM
-
-static int nmi_pm_active; /* nmi_active before suspend */
-
-static int lapic_nmi_suspend(struct sys_device *dev, pm_message_t state)
-{
-       /* only CPU0 goes here, other CPUs should be offline */
-       nmi_pm_active = atomic_read(&nmi_active);
-       stop_apic_nmi_watchdog(NULL);
-       BUG_ON(atomic_read(&nmi_active) != 0);
-       return 0;
-}
-
-static int lapic_nmi_resume(struct sys_device *dev)
-{
-       /* only CPU0 goes here, other CPUs should be offline */
-       if (nmi_pm_active > 0) {
-               setup_apic_nmi_watchdog(NULL);
-               touch_nmi_watchdog();
-       }
-       return 0;
-}
-
-
-static struct sysdev_class nmi_sysclass = {
-       set_kset_name("lapic_nmi"),
-       .resume         = lapic_nmi_resume,
-       .suspend        = lapic_nmi_suspend,
-};
-
-static struct sys_device device_lapic_nmi = {
-       .id     = 0,
-       .cls    = &nmi_sysclass,
-};
-
-static int __init init_lapic_nmi_sysfs(void)
-{
-       int error;
-
-       /* should really be a BUG_ON but b/c this is an
-        * init call, it just doesn't work.  -dcz
-        */
-       if (nmi_watchdog != NMI_LOCAL_APIC)
-               return 0;
-
-       if (atomic_read(&nmi_active) < 0)
-               return 0;
-
-       error = sysdev_class_register(&nmi_sysclass);
-       if (!error)
-               error = sysdev_register(&device_lapic_nmi);
-       return error;
-}
-/* must come after the local APIC's device_initcall() */
-late_initcall(init_lapic_nmi_sysfs);
-
-#endif /* CONFIG_PM */
-
-static void __acpi_nmi_enable(void *__unused)
-{
-       apic_write_around(APIC_LVT0, APIC_DM_NMI);
-}
-
-/*
- * Enable timer based NMIs on all CPUs:
- */
-void acpi_nmi_enable(void)
-{
-       if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC)
-               on_each_cpu(__acpi_nmi_enable, NULL, 0, 1);
-}
-
-static void __acpi_nmi_disable(void *__unused)
-{
-       apic_write(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED);
-}
-
-/*
- * Disable timer based NMIs on all CPUs:
- */
-void acpi_nmi_disable(void)
-{
-       if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC)
-               on_each_cpu(__acpi_nmi_disable, NULL, 0, 1);
-}
-
-void setup_apic_nmi_watchdog (void *unused)
-{
-       if (__get_cpu_var(wd_enabled))
-               return;
-
-       /* cheap hack to support suspend/resume */
-       /* if cpu0 is not active neither should the other cpus */
-       if ((smp_processor_id() != 0) && (atomic_read(&nmi_active) <= 0))
-               return;
-
-       switch (nmi_watchdog) {
-       case NMI_LOCAL_APIC:
-               __get_cpu_var(wd_enabled) = 1; /* enable it before to avoid race with handler */
-               if (lapic_watchdog_init(nmi_hz) < 0) {
-                       __get_cpu_var(wd_enabled) = 0;
-                       return;
-               }
-               /* FALL THROUGH */
-       case NMI_IO_APIC:
-               __get_cpu_var(wd_enabled) = 1;
-               atomic_inc(&nmi_active);
-       }
-}
-
-void stop_apic_nmi_watchdog(void *unused)
-{
-       /* only support LOCAL and IO APICs for now */
-       if ((nmi_watchdog != NMI_LOCAL_APIC) &&
-           (nmi_watchdog != NMI_IO_APIC))
-               return;
-       if (__get_cpu_var(wd_enabled) == 0)
-               return;
-       if (nmi_watchdog == NMI_LOCAL_APIC)
-               lapic_watchdog_stop();
-       __get_cpu_var(wd_enabled) = 0;
-       atomic_dec(&nmi_active);
-}
-
-/*
- * the best way to detect whether a CPU has a 'hard lockup' problem
- * is to check it's local APIC timer IRQ counts. If they are not
- * changing then that CPU has some problem.
- *
- * as these watchdog NMI IRQs are generated on every CPU, we only
- * have to check the current processor.
- *
- * since NMIs don't listen to _any_ locks, we have to be extremely
- * careful not to rely on unsafe variables. The printk might lock
- * up though, so we have to break up any console locks first ...
- * [when there will be more tty-related locks, break them up
- *  here too!]
- */
-
-static unsigned int
-       last_irq_sums [NR_CPUS],
-       alert_counter [NR_CPUS];
-
-void touch_nmi_watchdog(void)
-{
-       if (nmi_watchdog > 0) {
-               unsigned cpu;
-
-               /*
-                * Just reset the alert counters, (other CPUs might be
-                * spinning on locks we hold):
-                */
-               for_each_present_cpu(cpu) {
-                       if (alert_counter[cpu])
-                               alert_counter[cpu] = 0;
-               }
-       }
-
-       /*
-        * Tickle the softlockup detector too:
-        */
-       touch_softlockup_watchdog();
-}
-EXPORT_SYMBOL(touch_nmi_watchdog);
-
-extern void die_nmi(struct pt_regs *, const char *msg);
-
-__kprobes int nmi_watchdog_tick(struct pt_regs * regs, unsigned reason)
-{
-
-       /*
-        * Since current_thread_info()-> is always on the stack, and we
-        * always switch the stack NMI-atomically, it's safe to use
-        * smp_processor_id().
-        */
-       unsigned int sum;
-       int touched = 0;
-       int cpu = smp_processor_id();
-       int rc=0;
-
-       /* check for other users first */
-       if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT)
-                       == NOTIFY_STOP) {
-               rc = 1;
-               touched = 1;
-       }
-
-       if (cpu_isset(cpu, backtrace_mask)) {
-               static DEFINE_SPINLOCK(lock);   /* Serialise the printks */
-
-               spin_lock(&lock);
-               printk("NMI backtrace for cpu %d\n", cpu);
-               dump_stack();
-               spin_unlock(&lock);
-               cpu_clear(cpu, backtrace_mask);
-       }
-
-       /*
-        * Take the local apic timer and PIT/HPET into account. We don't
-        * know which one is active, when we have highres/dyntick on
-        */
-       sum = per_cpu(irq_stat, cpu).apic_timer_irqs + kstat_cpu(cpu).irqs[0];
-
-       /* if the none of the timers isn't firing, this cpu isn't doing much */
-       if (!touched && last_irq_sums[cpu] == sum) {
-               /*
-                * Ayiee, looks like this CPU is stuck ...
-                * wait a few IRQs (5 seconds) before doing the oops ...
-                */
-               alert_counter[cpu]++;
-               if (alert_counter[cpu] == 5*nmi_hz)
-                       /*
-                        * die_nmi will return ONLY if NOTIFY_STOP happens..
-                        */
-                       die_nmi(regs, "BUG: NMI Watchdog detected LOCKUP");
-       } else {
-               last_irq_sums[cpu] = sum;
-               alert_counter[cpu] = 0;
-       }
-       /* see if the nmi watchdog went off */
-       if (!__get_cpu_var(wd_enabled))
-               return rc;
-       switch (nmi_watchdog) {
-       case NMI_LOCAL_APIC:
-               rc |= lapic_wd_event(nmi_hz);
-               break;
-       case NMI_IO_APIC:
-               /* don't know how to accurately check for this.
-                * just assume it was a watchdog timer interrupt
-                * This matches the old behaviour.
-                */
-               rc = 1;
-               break;
-       }
-       return rc;
-}
-
-int do_nmi_callback(struct pt_regs * regs, int cpu)
-{
-#ifdef CONFIG_SYSCTL
-       if (unknown_nmi_panic)
-               return unknown_nmi_panic_callback(regs, cpu);
-#endif
-       return 0;
-}
-
-#ifdef CONFIG_SYSCTL
-
-static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu)
-{
-       unsigned char reason = get_nmi_reason();
-       char buf[64];
-
-       sprintf(buf, "NMI received for unknown reason %02x\n", reason);
-       die_nmi(regs, buf);
-       return 0;
-}
-
-/*
- * proc handler for /proc/sys/kernel/nmi
- */
-int proc_nmi_enabled(struct ctl_table *table, int write, struct file *file,
-                       void __user *buffer, size_t *length, loff_t *ppos)
-{
-       int old_state;
-
-       nmi_watchdog_enabled = (atomic_read(&nmi_active) > 0) ? 1 : 0;
-       old_state = nmi_watchdog_enabled;
-       proc_dointvec(table, write, file, buffer, length, ppos);
-       if (!!old_state == !!nmi_watchdog_enabled)
-               return 0;
-
-       if (atomic_read(&nmi_active) < 0 || nmi_watchdog == NMI_DISABLED) {
-               printk( KERN_WARNING "NMI watchdog is permanently disabled\n");
-               return -EIO;
-       }
-
-       if (nmi_watchdog == NMI_DEFAULT) {
-               if (lapic_watchdog_ok())
-                       nmi_watchdog = NMI_LOCAL_APIC;
-               else
-                       nmi_watchdog = NMI_IO_APIC;
-       }
-
-       if (nmi_watchdog == NMI_LOCAL_APIC) {
-               if (nmi_watchdog_enabled)
-                       enable_lapic_nmi_watchdog();
-               else
-                       disable_lapic_nmi_watchdog();
-       } else {
-               printk( KERN_WARNING
-                       "NMI watchdog doesn't know what hardware to touch\n");
-               return -EIO;
-       }
-       return 0;
-}
-
-#endif
-
-void __trigger_all_cpu_backtrace(void)
-{
-       int i;
-
-       backtrace_mask = cpu_online_map;
-       /* Wait for up to 10 seconds for all CPUs to do the backtrace */
-       for (i = 0; i < 10 * 1000; i++) {
-               if (cpus_empty(backtrace_mask))
-                       break;
-               mdelay(1);
-       }
-}
-
-EXPORT_SYMBOL(nmi_active);
-EXPORT_SYMBOL(nmi_watchdog);
diff --git a/arch/i386/kernel/numaq_32.c b/arch/i386/kernel/numaq_32.c
deleted file mode 100644 (file)
index 9000d82..0000000
+++ /dev/null
@@ -1,89 +0,0 @@
-/*
- * Written by: Patricia Gaughen, IBM Corporation
- *
- * Copyright (C) 2002, IBM Corp.
- *
- * All rights reserved.          
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
- * NON INFRINGEMENT.  See the GNU General Public License for more
- * details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- *
- * Send feedback to <gone@us.ibm.com>
- */
-
-#include <linux/mm.h>
-#include <linux/bootmem.h>
-#include <linux/mmzone.h>
-#include <linux/module.h>
-#include <linux/nodemask.h>
-#include <asm/numaq.h>
-#include <asm/topology.h>
-#include <asm/processor.h>
-
-#define        MB_TO_PAGES(addr) ((addr) << (20 - PAGE_SHIFT))
-
-/*
- * Function: smp_dump_qct()
- *
- * Description: gets memory layout from the quad config table.  This
- * function also updates node_online_map with the nodes (quads) present.
- */
-static void __init smp_dump_qct(void)
-{
-       int node;
-       struct eachquadmem *eq;
-       struct sys_cfg_data *scd =
-               (struct sys_cfg_data *)__va(SYS_CFG_DATA_PRIV_ADDR);
-
-       nodes_clear(node_online_map);
-       for_each_node(node) {
-               if (scd->quads_present31_0 & (1 << node)) {
-                       node_set_online(node);
-                       eq = &scd->eq[node];
-                       /* Convert to pages */
-                       node_start_pfn[node] = MB_TO_PAGES(
-                               eq->hi_shrd_mem_start - eq->priv_mem_size);
-                       node_end_pfn[node] = MB_TO_PAGES(
-                               eq->hi_shrd_mem_start + eq->hi_shrd_mem_size);
-
-                       memory_present(node,
-                               node_start_pfn[node], node_end_pfn[node]);
-                       node_remap_size[node] = node_memmap_size_bytes(node,
-                                                       node_start_pfn[node],
-                                                       node_end_pfn[node]);
-               }
-       }
-}
-
-/*
- * Unlike Summit, we don't really care to let the NUMA-Q
- * fall back to flat mode.  Don't compile for NUMA-Q
- * unless you really need it!
- */
-int __init get_memcfg_numaq(void)
-{
-       smp_dump_qct();
-       return 1;
-}
-
-static int __init numaq_tsc_disable(void)
-{
-       if (num_online_nodes() > 1) {
-               printk(KERN_DEBUG "NUMAQ: disabling TSC\n");
-               tsc_disable = 1;
-       }
-       return 0;
-}
-arch_initcall(numaq_tsc_disable);
diff --git a/arch/i386/kernel/paravirt_32.c b/arch/i386/kernel/paravirt_32.c
deleted file mode 100644 (file)
index 739cfb2..0000000
+++ /dev/null
@@ -1,392 +0,0 @@
-/*  Paravirtualization interfaces
-    Copyright (C) 2006 Rusty Russell IBM Corporation
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
-*/
-#include <linux/errno.h>
-#include <linux/module.h>
-#include <linux/efi.h>
-#include <linux/bcd.h>
-#include <linux/highmem.h>
-
-#include <asm/bug.h>
-#include <asm/paravirt.h>
-#include <asm/desc.h>
-#include <asm/setup.h>
-#include <asm/arch_hooks.h>
-#include <asm/time.h>
-#include <asm/irq.h>
-#include <asm/delay.h>
-#include <asm/fixmap.h>
-#include <asm/apic.h>
-#include <asm/tlbflush.h>
-#include <asm/timer.h>
-
-/* nop stub */
-void _paravirt_nop(void)
-{
-}
-
-static void __init default_banner(void)
-{
-       printk(KERN_INFO "Booting paravirtualized kernel on %s\n",
-              paravirt_ops.name);
-}
-
-char *memory_setup(void)
-{
-       return paravirt_ops.memory_setup();
-}
-
-/* Simple instruction patching code. */
-#define DEF_NATIVE(name, code)                                 \
-       extern const char start_##name[], end_##name[];         \
-       asm("start_" #name ": " code "; end_" #name ":")
-
-DEF_NATIVE(irq_disable, "cli");
-DEF_NATIVE(irq_enable, "sti");
-DEF_NATIVE(restore_fl, "push %eax; popf");
-DEF_NATIVE(save_fl, "pushf; pop %eax");
-DEF_NATIVE(iret, "iret");
-DEF_NATIVE(irq_enable_sysexit, "sti; sysexit");
-DEF_NATIVE(read_cr2, "mov %cr2, %eax");
-DEF_NATIVE(write_cr3, "mov %eax, %cr3");
-DEF_NATIVE(read_cr3, "mov %cr3, %eax");
-DEF_NATIVE(clts, "clts");
-DEF_NATIVE(read_tsc, "rdtsc");
-
-DEF_NATIVE(ud2a, "ud2a");
-
-static unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
-                            unsigned long addr, unsigned len)
-{
-       const unsigned char *start, *end;
-       unsigned ret;
-
-       switch(type) {
-#define SITE(x)        case PARAVIRT_PATCH(x): start = start_##x; end = end_##x; goto patch_site
-               SITE(irq_disable);
-               SITE(irq_enable);
-               SITE(restore_fl);
-               SITE(save_fl);
-               SITE(iret);
-               SITE(irq_enable_sysexit);
-               SITE(read_cr2);
-               SITE(read_cr3);
-               SITE(write_cr3);
-               SITE(clts);
-               SITE(read_tsc);
-#undef SITE
-
-       patch_site:
-               ret = paravirt_patch_insns(ibuf, len, start, end);
-               break;
-
-       case PARAVIRT_PATCH(make_pgd):
-       case PARAVIRT_PATCH(make_pte):
-       case PARAVIRT_PATCH(pgd_val):
-       case PARAVIRT_PATCH(pte_val):
-#ifdef CONFIG_X86_PAE
-       case PARAVIRT_PATCH(make_pmd):
-       case PARAVIRT_PATCH(pmd_val):
-#endif
-               /* These functions end up returning exactly what
-                  they're passed, in the same registers. */
-               ret = paravirt_patch_nop();
-               break;
-
-       default:
-               ret = paravirt_patch_default(type, clobbers, ibuf, addr, len);
-               break;
-       }
-
-       return ret;
-}
-
-unsigned paravirt_patch_nop(void)
-{
-       return 0;
-}
-
-unsigned paravirt_patch_ignore(unsigned len)
-{
-       return len;
-}
-
-struct branch {
-       unsigned char opcode;
-       u32 delta;
-} __attribute__((packed));
-
-unsigned paravirt_patch_call(void *insnbuf,
-                            const void *target, u16 tgt_clobbers,
-                            unsigned long addr, u16 site_clobbers,
-                            unsigned len)
-{
-       struct branch *b = insnbuf;
-       unsigned long delta = (unsigned long)target - (addr+5);
-
-       if (tgt_clobbers & ~site_clobbers)
-               return len;     /* target would clobber too much for this site */
-       if (len < 5)
-               return len;     /* call too long for patch site */
-
-       b->opcode = 0xe8; /* call */
-       b->delta = delta;
-       BUILD_BUG_ON(sizeof(*b) != 5);
-
-       return 5;
-}
-
-unsigned paravirt_patch_jmp(const void *target, void *insnbuf,
-                           unsigned long addr, unsigned len)
-{
-       struct branch *b = insnbuf;
-       unsigned long delta = (unsigned long)target - (addr+5);
-
-       if (len < 5)
-               return len;     /* call too long for patch site */
-
-       b->opcode = 0xe9;       /* jmp */
-       b->delta = delta;
-
-       return 5;
-}
-
-unsigned paravirt_patch_default(u8 type, u16 clobbers, void *insnbuf,
-                               unsigned long addr, unsigned len)
-{
-       void *opfunc = *((void **)&paravirt_ops + type);
-       unsigned ret;
-
-       if (opfunc == NULL)
-               /* If there's no function, patch it with a ud2a (BUG) */
-               ret = paravirt_patch_insns(insnbuf, len, start_ud2a, end_ud2a);
-       else if (opfunc == paravirt_nop)
-               /* If the operation is a nop, then nop the callsite */
-               ret = paravirt_patch_nop();
-       else if (type == PARAVIRT_PATCH(iret) ||
-                type == PARAVIRT_PATCH(irq_enable_sysexit))
-               /* If operation requires a jmp, then jmp */
-               ret = paravirt_patch_jmp(opfunc, insnbuf, addr, len);
-       else
-               /* Otherwise call the function; assume target could
-                  clobber any caller-save reg */
-               ret = paravirt_patch_call(insnbuf, opfunc, CLBR_ANY,
-                                         addr, clobbers, len);
-
-       return ret;
-}
-
-unsigned paravirt_patch_insns(void *insnbuf, unsigned len,
-                             const char *start, const char *end)
-{
-       unsigned insn_len = end - start;
-
-       if (insn_len > len || start == NULL)
-               insn_len = len;
-       else
-               memcpy(insnbuf, start, insn_len);
-
-       return insn_len;
-}
-
-void init_IRQ(void)
-{
-       paravirt_ops.init_IRQ();
-}
-
-static void native_flush_tlb(void)
-{
-       __native_flush_tlb();
-}
-
-/*
- * Global pages have to be flushed a bit differently. Not a real
- * performance problem because this does not happen often.
- */
-static void native_flush_tlb_global(void)
-{
-       __native_flush_tlb_global();
-}
-
-static void native_flush_tlb_single(unsigned long addr)
-{
-       __native_flush_tlb_single(addr);
-}
-
-/* These are in entry.S */
-extern void native_iret(void);
-extern void native_irq_enable_sysexit(void);
-
-static int __init print_banner(void)
-{
-       paravirt_ops.banner();
-       return 0;
-}
-core_initcall(print_banner);
-
-static struct resource reserve_ioports = {
-       .start = 0,
-       .end = IO_SPACE_LIMIT,
-       .name = "paravirt-ioport",
-       .flags = IORESOURCE_IO | IORESOURCE_BUSY,
-};
-
-static struct resource reserve_iomem = {
-       .start = 0,
-       .end = -1,
-       .name = "paravirt-iomem",
-       .flags = IORESOURCE_MEM | IORESOURCE_BUSY,
-};
-
-/*
- * Reserve the whole legacy IO space to prevent any legacy drivers
- * from wasting time probing for their hardware.  This is a fairly
- * brute-force approach to disabling all non-virtual drivers.
- *
- * Note that this must be called very early to have any effect.
- */
-int paravirt_disable_iospace(void)
-{
-       int ret;
-
-       ret = request_resource(&ioport_resource, &reserve_ioports);
-       if (ret == 0) {
-               ret = request_resource(&iomem_resource, &reserve_iomem);
-               if (ret)
-                       release_resource(&reserve_ioports);
-       }
-
-       return ret;
-}
-
-struct paravirt_ops paravirt_ops = {
-       .name = "bare hardware",
-       .paravirt_enabled = 0,
-       .kernel_rpl = 0,
-       .shared_kernel_pmd = 1, /* Only used when CONFIG_X86_PAE is set */
-
-       .patch = native_patch,
-       .banner = default_banner,
-       .arch_setup = paravirt_nop,
-       .memory_setup = machine_specific_memory_setup,
-       .get_wallclock = native_get_wallclock,
-       .set_wallclock = native_set_wallclock,
-       .time_init = hpet_time_init,
-       .init_IRQ = native_init_IRQ,
-
-       .cpuid = native_cpuid,
-       .get_debugreg = native_get_debugreg,
-       .set_debugreg = native_set_debugreg,
-       .clts = native_clts,
-       .read_cr0 = native_read_cr0,
-       .write_cr0 = native_write_cr0,
-       .read_cr2 = native_read_cr2,
-       .write_cr2 = native_write_cr2,
-       .read_cr3 = native_read_cr3,
-       .write_cr3 = native_write_cr3,
-       .read_cr4 = native_read_cr4,
-       .read_cr4_safe = native_read_cr4_safe,
-       .write_cr4 = native_write_cr4,
-       .save_fl = native_save_fl,
-       .restore_fl = native_restore_fl,
-       .irq_disable = native_irq_disable,
-       .irq_enable = native_irq_enable,
-       .safe_halt = native_safe_halt,
-       .halt = native_halt,
-       .wbinvd = native_wbinvd,
-       .read_msr = native_read_msr_safe,
-       .write_msr = native_write_msr_safe,
-       .read_tsc = native_read_tsc,
-       .read_pmc = native_read_pmc,
-       .sched_clock = native_sched_clock,
-       .get_cpu_khz = native_calculate_cpu_khz,
-       .load_tr_desc = native_load_tr_desc,
-       .set_ldt = native_set_ldt,
-       .load_gdt = native_load_gdt,
-       .load_idt = native_load_idt,
-       .store_gdt = native_store_gdt,
-       .store_idt = native_store_idt,
-       .store_tr = native_store_tr,
-       .load_tls = native_load_tls,
-       .write_ldt_entry = write_dt_entry,
-       .write_gdt_entry = write_dt_entry,
-       .write_idt_entry = write_dt_entry,
-       .load_esp0 = native_load_esp0,
-
-       .set_iopl_mask = native_set_iopl_mask,
-       .io_delay = native_io_delay,
-
-#ifdef CONFIG_X86_LOCAL_APIC
-       .apic_write = native_apic_write,
-       .apic_write_atomic = native_apic_write_atomic,
-       .apic_read = native_apic_read,
-       .setup_boot_clock = setup_boot_APIC_clock,
-       .setup_secondary_clock = setup_secondary_APIC_clock,
-       .startup_ipi_hook = paravirt_nop,
-#endif
-       .set_lazy_mode = paravirt_nop,
-
-       .pagetable_setup_start = native_pagetable_setup_start,
-       .pagetable_setup_done = native_pagetable_setup_done,
-
-       .flush_tlb_user = native_flush_tlb,
-       .flush_tlb_kernel = native_flush_tlb_global,
-       .flush_tlb_single = native_flush_tlb_single,
-       .flush_tlb_others = native_flush_tlb_others,
-
-       .alloc_pt = paravirt_nop,
-       .alloc_pd = paravirt_nop,
-       .alloc_pd_clone = paravirt_nop,
-       .release_pt = paravirt_nop,
-       .release_pd = paravirt_nop,
-
-       .set_pte = native_set_pte,
-       .set_pte_at = native_set_pte_at,
-       .set_pmd = native_set_pmd,
-       .pte_update = paravirt_nop,
-       .pte_update_defer = paravirt_nop,
-
-#ifdef CONFIG_HIGHPTE
-       .kmap_atomic_pte = kmap_atomic,
-#endif
-
-#ifdef CONFIG_X86_PAE
-       .set_pte_atomic = native_set_pte_atomic,
-       .set_pte_present = native_set_pte_present,
-       .set_pud = native_set_pud,
-       .pte_clear = native_pte_clear,
-       .pmd_clear = native_pmd_clear,
-
-       .pmd_val = native_pmd_val,
-       .make_pmd = native_make_pmd,
-#endif
-
-       .pte_val = native_pte_val,
-       .pgd_val = native_pgd_val,
-
-       .make_pte = native_make_pte,
-       .make_pgd = native_make_pgd,
-
-       .irq_enable_sysexit = native_irq_enable_sysexit,
-       .iret = native_iret,
-
-       .dup_mmap = paravirt_nop,
-       .exit_mmap = paravirt_nop,
-       .activate_mm = paravirt_nop,
-};
-
-EXPORT_SYMBOL(paravirt_ops);
diff --git a/arch/i386/kernel/pci-dma_32.c b/arch/i386/kernel/pci-dma_32.c
deleted file mode 100644 (file)
index 048f09b..0000000
+++ /dev/null
@@ -1,177 +0,0 @@
-/*
- * Dynamic DMA mapping support.
- *
- * On i386 there is no hardware dynamic DMA address translation,
- * so consistent alloc/free are merely page allocation/freeing.
- * The rest of the dynamic DMA mapping interface is implemented
- * in asm/pci.h.
- */
-
-#include <linux/types.h>
-#include <linux/mm.h>
-#include <linux/string.h>
-#include <linux/pci.h>
-#include <linux/module.h>
-#include <linux/pci.h>
-#include <asm/io.h>
-
-struct dma_coherent_mem {
-       void            *virt_base;
-       u32             device_base;
-       int             size;
-       int             flags;
-       unsigned long   *bitmap;
-};
-
-void *dma_alloc_coherent(struct device *dev, size_t size,
-                          dma_addr_t *dma_handle, gfp_t gfp)
-{
-       void *ret;
-       struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL;
-       int order = get_order(size);
-       /* ignore region specifiers */
-       gfp &= ~(__GFP_DMA | __GFP_HIGHMEM);
-
-       if (mem) {
-               int page = bitmap_find_free_region(mem->bitmap, mem->size,
-                                                    order);
-               if (page >= 0) {
-                       *dma_handle = mem->device_base + (page << PAGE_SHIFT);
-                       ret = mem->virt_base + (page << PAGE_SHIFT);
-                       memset(ret, 0, size);
-                       return ret;
-               }
-               if (mem->flags & DMA_MEMORY_EXCLUSIVE)
-                       return NULL;
-       }
-
-       if (dev == NULL || (dev->coherent_dma_mask < 0xffffffff))
-               gfp |= GFP_DMA;
-
-       ret = (void *)__get_free_pages(gfp, order);
-
-       if (ret != NULL) {
-               memset(ret, 0, size);
-               *dma_handle = virt_to_phys(ret);
-       }
-       return ret;
-}
-EXPORT_SYMBOL(dma_alloc_coherent);
-
-void dma_free_coherent(struct device *dev, size_t size,
-                        void *vaddr, dma_addr_t dma_handle)
-{
-       struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL;
-       int order = get_order(size);
-       
-       if (mem && vaddr >= mem->virt_base && vaddr < (mem->virt_base + (mem->size << PAGE_SHIFT))) {
-               int page = (vaddr - mem->virt_base) >> PAGE_SHIFT;
-
-               bitmap_release_region(mem->bitmap, page, order);
-       } else
-               free_pages((unsigned long)vaddr, order);
-}
-EXPORT_SYMBOL(dma_free_coherent);
-
-int dma_declare_coherent_memory(struct device *dev, dma_addr_t bus_addr,
-                               dma_addr_t device_addr, size_t size, int flags)
-{
-       void __iomem *mem_base = NULL;
-       int pages = size >> PAGE_SHIFT;
-       int bitmap_size = BITS_TO_LONGS(pages) * sizeof(long);
-
-       if ((flags & (DMA_MEMORY_MAP | DMA_MEMORY_IO)) == 0)
-               goto out;
-       if (!size)
-               goto out;
-       if (dev->dma_mem)
-               goto out;
-
-       /* FIXME: this routine just ignores DMA_MEMORY_INCLUDES_CHILDREN */
-
-       mem_base = ioremap(bus_addr, size);
-       if (!mem_base)
-               goto out;
-
-       dev->dma_mem = kzalloc(sizeof(struct dma_coherent_mem), GFP_KERNEL);
-       if (!dev->dma_mem)
-               goto out;
-       dev->dma_mem->bitmap = kzalloc(bitmap_size, GFP_KERNEL);
-       if (!dev->dma_mem->bitmap)
-               goto free1_out;
-
-       dev->dma_mem->virt_base = mem_base;
-       dev->dma_mem->device_base = device_addr;
-       dev->dma_mem->size = pages;
-       dev->dma_mem->flags = flags;
-
-       if (flags & DMA_MEMORY_MAP)
-               return DMA_MEMORY_MAP;
-
-       return DMA_MEMORY_IO;
-
- free1_out:
-       kfree(dev->dma_mem);
- out:
-       if (mem_base)
-               iounmap(mem_base);
-       return 0;
-}
-EXPORT_SYMBOL(dma_declare_coherent_memory);
-
-void dma_release_declared_memory(struct device *dev)
-{
-       struct dma_coherent_mem *mem = dev->dma_mem;
-       
-       if(!mem)
-               return;
-       dev->dma_mem = NULL;
-       iounmap(mem->virt_base);
-       kfree(mem->bitmap);
-       kfree(mem);
-}
-EXPORT_SYMBOL(dma_release_declared_memory);
-
-void *dma_mark_declared_memory_occupied(struct device *dev,
-                                       dma_addr_t device_addr, size_t size)
-{
-       struct dma_coherent_mem *mem = dev->dma_mem;
-       int pages = (size + (device_addr & ~PAGE_MASK) + PAGE_SIZE - 1) >> PAGE_SHIFT;
-       int pos, err;
-
-       if (!mem)
-               return ERR_PTR(-EINVAL);
-
-       pos = (device_addr - mem->device_base) >> PAGE_SHIFT;
-       err = bitmap_allocate_region(mem->bitmap, pos, get_order(pages));
-       if (err != 0)
-               return ERR_PTR(err);
-       return mem->virt_base + (pos << PAGE_SHIFT);
-}
-EXPORT_SYMBOL(dma_mark_declared_memory_occupied);
-
-#ifdef CONFIG_PCI
-/* Many VIA bridges seem to corrupt data for DAC. Disable it here */
-
-int forbid_dac;
-EXPORT_SYMBOL(forbid_dac);
-
-static __devinit void via_no_dac(struct pci_dev *dev)
-{
-       if ((dev->class >> 8) == PCI_CLASS_BRIDGE_PCI && forbid_dac == 0) {
-               printk(KERN_INFO "PCI: VIA PCI bridge detected. Disabling DAC.\n");
-               forbid_dac = 1;
-       }
-}
-DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_VIA, PCI_ANY_ID, via_no_dac);
-
-static int check_iommu(char *s)
-{
-       if (!strcmp(s, "usedac")) {
-               forbid_dac = -1;
-               return 1;
-       }
-       return 0;
-}
-__setup("iommu=", check_iommu);
-#endif
diff --git a/arch/i386/kernel/pcspeaker.c b/arch/i386/kernel/pcspeaker.c
deleted file mode 100644 (file)
index bc1f2d3..0000000
+++ /dev/null
@@ -1,20 +0,0 @@
-#include <linux/platform_device.h>
-#include <linux/errno.h>
-#include <linux/init.h>
-
-static __init int add_pcspkr(void)
-{
-       struct platform_device *pd;
-       int ret;
-
-       pd = platform_device_alloc("pcspkr", -1);
-       if (!pd)
-               return -ENOMEM;
-
-       ret = platform_device_add(pd);
-       if (ret)
-               platform_device_put(pd);
-
-       return ret;
-}
-device_initcall(add_pcspkr);
diff --git a/arch/i386/kernel/process_32.c b/arch/i386/kernel/process_32.c
deleted file mode 100644 (file)
index 8466471..0000000
+++ /dev/null
@@ -1,951 +0,0 @@
-/*
- *  linux/arch/i386/kernel/process.c
- *
- *  Copyright (C) 1995  Linus Torvalds
- *
- *  Pentium III FXSR, SSE support
- *     Gareth Hughes <gareth@valinux.com>, May 2000
- */
-
-/*
- * This file handles the architecture-dependent parts of process handling..
- */
-
-#include <stdarg.h>
-
-#include <linux/cpu.h>
-#include <linux/errno.h>
-#include <linux/sched.h>
-#include <linux/fs.h>
-#include <linux/kernel.h>
-#include <linux/mm.h>
-#include <linux/elfcore.h>
-#include <linux/smp.h>
-#include <linux/stddef.h>
-#include <linux/slab.h>
-#include <linux/vmalloc.h>
-#include <linux/user.h>
-#include <linux/a.out.h>
-#include <linux/interrupt.h>
-#include <linux/utsname.h>
-#include <linux/delay.h>
-#include <linux/reboot.h>
-#include <linux/init.h>
-#include <linux/mc146818rtc.h>
-#include <linux/module.h>
-#include <linux/kallsyms.h>
-#include <linux/ptrace.h>
-#include <linux/random.h>
-#include <linux/personality.h>
-#include <linux/tick.h>
-#include <linux/percpu.h>
-
-#include <asm/uaccess.h>
-#include <asm/pgtable.h>
-#include <asm/system.h>
-#include <asm/io.h>
-#include <asm/ldt.h>
-#include <asm/processor.h>
-#include <asm/i387.h>
-#include <asm/desc.h>
-#include <asm/vm86.h>
-#ifdef CONFIG_MATH_EMULATION
-#include <asm/math_emu.h>
-#endif
-
-#include <linux/err.h>
-
-#include <asm/tlbflush.h>
-#include <asm/cpu.h>
-
-asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
-
-static int hlt_counter;
-
-unsigned long boot_option_idle_override = 0;
-EXPORT_SYMBOL(boot_option_idle_override);
-
-DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task;
-EXPORT_PER_CPU_SYMBOL(current_task);
-
-DEFINE_PER_CPU(int, cpu_number);
-EXPORT_PER_CPU_SYMBOL(cpu_number);
-
-/*
- * Return saved PC of a blocked thread.
- */
-unsigned long thread_saved_pc(struct task_struct *tsk)
-{
-       return ((unsigned long *)tsk->thread.esp)[3];
-}
-
-/*
- * Powermanagement idle function, if any..
- */
-void (*pm_idle)(void);
-EXPORT_SYMBOL(pm_idle);
-static DEFINE_PER_CPU(unsigned int, cpu_idle_state);
-
-void disable_hlt(void)
-{
-       hlt_counter++;
-}
-
-EXPORT_SYMBOL(disable_hlt);
-
-void enable_hlt(void)
-{
-       hlt_counter--;
-}
-
-EXPORT_SYMBOL(enable_hlt);
-
-/*
- * We use this if we don't have any better
- * idle routine..
- */
-void default_idle(void)
-{
-       if (!hlt_counter && boot_cpu_data.hlt_works_ok) {
-               current_thread_info()->status &= ~TS_POLLING;
-               /*
-                * TS_POLLING-cleared state must be visible before we
-                * test NEED_RESCHED:
-                */
-               smp_mb();
-
-               local_irq_disable();
-               if (!need_resched())
-                       safe_halt();    /* enables interrupts racelessly */
-               else
-                       local_irq_enable();
-               current_thread_info()->status |= TS_POLLING;
-       } else {
-               /* loop is done by the caller */
-               cpu_relax();
-       }
-}
-#ifdef CONFIG_APM_MODULE
-EXPORT_SYMBOL(default_idle);
-#endif
-
-/*
- * On SMP it's slightly faster (but much more power-consuming!)
- * to poll the ->work.need_resched flag instead of waiting for the
- * cross-CPU IPI to arrive. Use this option with caution.
- */
-static void poll_idle (void)
-{
-       cpu_relax();
-}
-
-#ifdef CONFIG_HOTPLUG_CPU
-#include <asm/nmi.h>
-/* We don't actually take CPU down, just spin without interrupts. */
-static inline void play_dead(void)
-{
-       /* This must be done before dead CPU ack */
-       cpu_exit_clear();
-       wbinvd();
-       mb();
-       /* Ack it */
-       __get_cpu_var(cpu_state) = CPU_DEAD;
-
-       /*
-        * With physical CPU hotplug, we should halt the cpu
-        */
-       local_irq_disable();
-       while (1)
-               halt();
-}
-#else
-static inline void play_dead(void)
-{
-       BUG();
-}
-#endif /* CONFIG_HOTPLUG_CPU */
-
-/*
- * The idle thread. There's no useful work to be
- * done, so just try to conserve power and have a
- * low exit latency (ie sit in a loop waiting for
- * somebody to say that they'd like to reschedule)
- */
-void cpu_idle(void)
-{
-       int cpu = smp_processor_id();
-
-       current_thread_info()->status |= TS_POLLING;
-
-       /* endless idle loop with no priority at all */
-       while (1) {
-               tick_nohz_stop_sched_tick();
-               while (!need_resched()) {
-                       void (*idle)(void);
-
-                       if (__get_cpu_var(cpu_idle_state))
-                               __get_cpu_var(cpu_idle_state) = 0;
-
-                       check_pgt_cache();
-                       rmb();
-                       idle = pm_idle;
-
-                       if (!idle)
-                               idle = default_idle;
-
-                       if (cpu_is_offline(cpu))
-                               play_dead();
-
-                       __get_cpu_var(irq_stat).idle_timestamp = jiffies;
-                       idle();
-               }
-               tick_nohz_restart_sched_tick();
-               preempt_enable_no_resched();
-               schedule();
-               preempt_disable();
-       }
-}
-
-void cpu_idle_wait(void)
-{
-       unsigned int cpu, this_cpu = get_cpu();
-       cpumask_t map, tmp = current->cpus_allowed;
-
-       set_cpus_allowed(current, cpumask_of_cpu(this_cpu));
-       put_cpu();
-
-       cpus_clear(map);
-       for_each_online_cpu(cpu) {
-               per_cpu(cpu_idle_state, cpu) = 1;
-               cpu_set(cpu, map);
-       }
-
-       __get_cpu_var(cpu_idle_state) = 0;
-
-       wmb();
-       do {
-               ssleep(1);
-               for_each_online_cpu(cpu) {
-                       if (cpu_isset(cpu, map) && !per_cpu(cpu_idle_state, cpu))
-                               cpu_clear(cpu, map);
-               }
-               cpus_and(map, map, cpu_online_map);
-       } while (!cpus_empty(map));
-
-       set_cpus_allowed(current, tmp);
-}
-EXPORT_SYMBOL_GPL(cpu_idle_wait);
-
-/*
- * This uses new MONITOR/MWAIT instructions on P4 processors with PNI,
- * which can obviate IPI to trigger checking of need_resched.
- * We execute MONITOR against need_resched and enter optimized wait state
- * through MWAIT. Whenever someone changes need_resched, we would be woken
- * up from MWAIT (without an IPI).
- *
- * New with Core Duo processors, MWAIT can take some hints based on CPU
- * capability.
- */
-void mwait_idle_with_hints(unsigned long eax, unsigned long ecx)
-{
-       if (!need_resched()) {
-               __monitor((void *)&current_thread_info()->flags, 0, 0);
-               smp_mb();
-               if (!need_resched())
-                       __mwait(eax, ecx);
-       }
-}
-
-/* Default MONITOR/MWAIT with no hints, used for default C1 state */
-static void mwait_idle(void)
-{
-       local_irq_enable();
-       mwait_idle_with_hints(0, 0);
-}
-
-void __devinit select_idle_routine(const struct cpuinfo_x86 *c)
-{
-       if (cpu_has(c, X86_FEATURE_MWAIT)) {
-               printk("monitor/mwait feature present.\n");
-               /*
-                * Skip, if setup has overridden idle.
-                * One CPU supports mwait => All CPUs supports mwait
-                */
-               if (!pm_idle) {
-                       printk("using mwait in idle threads.\n");
-                       pm_idle = mwait_idle;
-               }
-       }
-}
-
-static int __init idle_setup(char *str)
-{
-       if (!strcmp(str, "poll")) {
-               printk("using polling idle threads.\n");
-               pm_idle = poll_idle;
-#ifdef CONFIG_X86_SMP
-               if (smp_num_siblings > 1)
-                       printk("WARNING: polling idle and HT enabled, performance may degrade.\n");
-#endif
-       } else if (!strcmp(str, "mwait"))
-               force_mwait = 1;
-       else
-               return -1;
-
-       boot_option_idle_override = 1;
-       return 0;
-}
-early_param("idle", idle_setup);
-
-void show_regs(struct pt_regs * regs)
-{
-       unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L;
-       unsigned long d0, d1, d2, d3, d6, d7;
-
-       printk("\n");
-       printk("Pid: %d, comm: %20s\n", current->pid, current->comm);
-       printk("EIP: %04x:[<%08lx>] CPU: %d\n",0xffff & regs->xcs,regs->eip, smp_processor_id());
-       print_symbol("EIP is at %s\n", regs->eip);
-
-       if (user_mode_vm(regs))
-               printk(" ESP: %04x:%08lx",0xffff & regs->xss,regs->esp);
-       printk(" EFLAGS: %08lx    %s  (%s %.*s)\n",
-              regs->eflags, print_tainted(), init_utsname()->release,
-              (int)strcspn(init_utsname()->version, " "),
-              init_utsname()->version);
-       printk("EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n",
-               regs->eax,regs->ebx,regs->ecx,regs->edx);
-       printk("ESI: %08lx EDI: %08lx EBP: %08lx",
-               regs->esi, regs->edi, regs->ebp);
-       printk(" DS: %04x ES: %04x FS: %04x\n",
-              0xffff & regs->xds,0xffff & regs->xes, 0xffff & regs->xfs);
-
-       cr0 = read_cr0();
-       cr2 = read_cr2();
-       cr3 = read_cr3();
-       cr4 = read_cr4_safe();
-       printk("CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n", cr0, cr2, cr3, cr4);
-
-       get_debugreg(d0, 0);
-       get_debugreg(d1, 1);
-       get_debugreg(d2, 2);
-       get_debugreg(d3, 3);
-       printk("DR0: %08lx DR1: %08lx DR2: %08lx DR3: %08lx\n",
-                       d0, d1, d2, d3);
-       get_debugreg(d6, 6);
-       get_debugreg(d7, 7);
-       printk("DR6: %08lx DR7: %08lx\n", d6, d7);
-
-       show_trace(NULL, regs, &regs->esp);
-}
-
-/*
- * This gets run with %ebx containing the
- * function to call, and %edx containing
- * the "args".
- */
-extern void kernel_thread_helper(void);
-
-/*
- * Create a kernel thread
- */
-int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags)
-{
-       struct pt_regs regs;
-
-       memset(&regs, 0, sizeof(regs));
-
-       regs.ebx = (unsigned long) fn;
-       regs.edx = (unsigned long) arg;
-
-       regs.xds = __USER_DS;
-       regs.xes = __USER_DS;
-       regs.xfs = __KERNEL_PERCPU;
-       regs.orig_eax = -1;
-       regs.eip = (unsigned long) kernel_thread_helper;
-       regs.xcs = __KERNEL_CS | get_kernel_rpl();
-       regs.eflags = X86_EFLAGS_IF | X86_EFLAGS_SF | X86_EFLAGS_PF | 0x2;
-
-       /* Ok, create the new process.. */
-       return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, &regs, 0, NULL, NULL);
-}
-EXPORT_SYMBOL(kernel_thread);
-
-/*
- * Free current thread data structures etc..
- */
-void exit_thread(void)
-{
-       /* The process may have allocated an io port bitmap... nuke it. */
-       if (unlikely(test_thread_flag(TIF_IO_BITMAP))) {
-               struct task_struct *tsk = current;
-               struct thread_struct *t = &tsk->thread;
-               int cpu = get_cpu();
-               struct tss_struct *tss = &per_cpu(init_tss, cpu);
-
-               kfree(t->io_bitmap_ptr);
-               t->io_bitmap_ptr = NULL;
-               clear_thread_flag(TIF_IO_BITMAP);
-               /*
-                * Careful, clear this in the TSS too:
-                */
-               memset(tss->io_bitmap, 0xff, tss->io_bitmap_max);
-               t->io_bitmap_max = 0;
-               tss->io_bitmap_owner = NULL;
-               tss->io_bitmap_max = 0;
-               tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET;
-               put_cpu();
-       }
-}
-
-void flush_thread(void)
-{
-       struct task_struct *tsk = current;
-
-       memset(tsk->thread.debugreg, 0, sizeof(unsigned long)*8);
-       memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));        
-       clear_tsk_thread_flag(tsk, TIF_DEBUG);
-       /*
-        * Forget coprocessor state..
-        */
-       clear_fpu(tsk);
-       clear_used_math();
-}
-
-void release_thread(struct task_struct *dead_task)
-{
-       BUG_ON(dead_task->mm);
-       release_vm86_irqs(dead_task);
-}
-
-/*
- * This gets called before we allocate a new thread and copy
- * the current task into it.
- */
-void prepare_to_copy(struct task_struct *tsk)
-{
-       unlazy_fpu(tsk);
-}
-
-int copy_thread(int nr, unsigned long clone_flags, unsigned long esp,
-       unsigned long unused,
-       struct task_struct * p, struct pt_regs * regs)
-{
-       struct pt_regs * childregs;
-       struct task_struct *tsk;
-       int err;
-
-       childregs = task_pt_regs(p);
-       *childregs = *regs;
-       childregs->eax = 0;
-       childregs->esp = esp;
-
-       p->thread.esp = (unsigned long) childregs;
-       p->thread.esp0 = (unsigned long) (childregs+1);
-
-       p->thread.eip = (unsigned long) ret_from_fork;
-
-       savesegment(gs,p->thread.gs);
-
-       tsk = current;
-       if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) {
-               p->thread.io_bitmap_ptr = kmemdup(tsk->thread.io_bitmap_ptr,
-                                               IO_BITMAP_BYTES, GFP_KERNEL);
-               if (!p->thread.io_bitmap_ptr) {
-                       p->thread.io_bitmap_max = 0;
-                       return -ENOMEM;
-               }
-               set_tsk_thread_flag(p, TIF_IO_BITMAP);
-       }
-
-       /*
-        * Set a new TLS for the child thread?
-        */
-       if (clone_flags & CLONE_SETTLS) {
-               struct desc_struct *desc;
-               struct user_desc info;
-               int idx;
-
-               err = -EFAULT;
-               if (copy_from_user(&info, (void __user *)childregs->esi, sizeof(info)))
-                       goto out;
-               err = -EINVAL;
-               if (LDT_empty(&info))
-                       goto out;
-
-               idx = info.entry_number;
-               if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
-                       goto out;
-
-               desc = p->thread.tls_array + idx - GDT_ENTRY_TLS_MIN;
-               desc->a = LDT_entry_a(&info);
-               desc->b = LDT_entry_b(&info);
-       }
-
-       err = 0;
- out:
-       if (err && p->thread.io_bitmap_ptr) {
-               kfree(p->thread.io_bitmap_ptr);
-               p->thread.io_bitmap_max = 0;
-       }
-       return err;
-}
-
-/*
- * fill in the user structure for a core dump..
- */
-void dump_thread(struct pt_regs * regs, struct user * dump)
-{
-       int i;
-
-/* changed the size calculations - should hopefully work better. lbt */
-       dump->magic = CMAGIC;
-       dump->start_code = 0;
-       dump->start_stack = regs->esp & ~(PAGE_SIZE - 1);
-       dump->u_tsize = ((unsigned long) current->mm->end_code) >> PAGE_SHIFT;
-       dump->u_dsize = ((unsigned long) (current->mm->brk + (PAGE_SIZE-1))) >> PAGE_SHIFT;
-       dump->u_dsize -= dump->u_tsize;
-       dump->u_ssize = 0;
-       for (i = 0; i < 8; i++)
-               dump->u_debugreg[i] = current->thread.debugreg[i];  
-
-       if (dump->start_stack < TASK_SIZE)
-               dump->u_ssize = ((unsigned long) (TASK_SIZE - dump->start_stack)) >> PAGE_SHIFT;
-
-       dump->regs.ebx = regs->ebx;
-       dump->regs.ecx = regs->ecx;
-       dump->regs.edx = regs->edx;
-       dump->regs.esi = regs->esi;
-       dump->regs.edi = regs->edi;
-       dump->regs.ebp = regs->ebp;
-       dump->regs.eax = regs->eax;
-       dump->regs.ds = regs->xds;
-       dump->regs.es = regs->xes;
-       dump->regs.fs = regs->xfs;
-       savesegment(gs,dump->regs.gs);
-       dump->regs.orig_eax = regs->orig_eax;
-       dump->regs.eip = regs->eip;
-       dump->regs.cs = regs->xcs;
-       dump->regs.eflags = regs->eflags;
-       dump->regs.esp = regs->esp;
-       dump->regs.ss = regs->xss;
-
-       dump->u_fpvalid = dump_fpu (regs, &dump->i387);
-}
-EXPORT_SYMBOL(dump_thread);
-
-/* 
- * Capture the user space registers if the task is not running (in user space)
- */
-int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs)
-{
-       struct pt_regs ptregs = *task_pt_regs(tsk);
-       ptregs.xcs &= 0xffff;
-       ptregs.xds &= 0xffff;
-       ptregs.xes &= 0xffff;
-       ptregs.xss &= 0xffff;
-
-       elf_core_copy_regs(regs, &ptregs);
-
-       return 1;
-}
-
-#ifdef CONFIG_SECCOMP
-void hard_disable_TSC(void)
-{
-       write_cr4(read_cr4() | X86_CR4_TSD);
-}
-void disable_TSC(void)
-{
-       preempt_disable();
-       if (!test_and_set_thread_flag(TIF_NOTSC))
-               /*
-                * Must flip the CPU state synchronously with
-                * TIF_NOTSC in the current running context.
-                */
-               hard_disable_TSC();
-       preempt_enable();
-}
-void hard_enable_TSC(void)
-{
-       write_cr4(read_cr4() & ~X86_CR4_TSD);
-}
-#endif /* CONFIG_SECCOMP */
-
-static noinline void
-__switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
-                struct tss_struct *tss)
-{
-       struct thread_struct *next;
-
-       next = &next_p->thread;
-
-       if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
-               set_debugreg(next->debugreg[0], 0);
-               set_debugreg(next->debugreg[1], 1);
-               set_debugreg(next->debugreg[2], 2);
-               set_debugreg(next->debugreg[3], 3);
-               /* no 4 and 5 */
-               set_debugreg(next->debugreg[6], 6);
-               set_debugreg(next->debugreg[7], 7);
-       }
-
-#ifdef CONFIG_SECCOMP
-       if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^
-           test_tsk_thread_flag(next_p, TIF_NOTSC)) {
-               /* prev and next are different */
-               if (test_tsk_thread_flag(next_p, TIF_NOTSC))
-                       hard_disable_TSC();
-               else
-                       hard_enable_TSC();
-       }
-#endif
-
-       if (!test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
-               /*
-                * Disable the bitmap via an invalid offset. We still cache
-                * the previous bitmap owner and the IO bitmap contents:
-                */
-               tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET;
-               return;
-       }
-
-       if (likely(next == tss->io_bitmap_owner)) {
-               /*
-                * Previous owner of the bitmap (hence the bitmap content)
-                * matches the next task, we dont have to do anything but
-                * to set a valid offset in the TSS:
-                */
-               tss->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET;
-               return;
-       }
-       /*
-        * Lazy TSS's I/O bitmap copy. We set an invalid offset here
-        * and we let the task to get a GPF in case an I/O instruction
-        * is performed.  The handler of the GPF will verify that the
-        * faulting task has a valid I/O bitmap and, it true, does the
-        * real copy and restart the instruction.  This will save us
-        * redundant copies when the currently switched task does not
-        * perform any I/O during its timeslice.
-        */
-       tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET_LAZY;
-}
-
-/*
- *     switch_to(x,yn) should switch tasks from x to y.
- *
- * We fsave/fwait so that an exception goes off at the right time
- * (as a call from the fsave or fwait in effect) rather than to
- * the wrong process. Lazy FP saving no longer makes any sense
- * with modern CPU's, and this simplifies a lot of things (SMP
- * and UP become the same).
- *
- * NOTE! We used to use the x86 hardware context switching. The
- * reason for not using it any more becomes apparent when you
- * try to recover gracefully from saved state that is no longer
- * valid (stale segment register values in particular). With the
- * hardware task-switch, there is no way to fix up bad state in
- * a reasonable manner.
- *
- * The fact that Intel documents the hardware task-switching to
- * be slow is a fairly red herring - this code is not noticeably
- * faster. However, there _is_ some room for improvement here,
- * so the performance issues may eventually be a valid point.
- * More important, however, is the fact that this allows us much
- * more flexibility.
- *
- * The return value (in %eax) will be the "prev" task after
- * the task-switch, and shows up in ret_from_fork in entry.S,
- * for example.
- */
-struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
-{
-       struct thread_struct *prev = &prev_p->thread,
-                                *next = &next_p->thread;
-       int cpu = smp_processor_id();
-       struct tss_struct *tss = &per_cpu(init_tss, cpu);
-
-       /* never put a printk in __switch_to... printk() calls wake_up*() indirectly */
-
-       __unlazy_fpu(prev_p);
-
-
-       /* we're going to use this soon, after a few expensive things */
-       if (next_p->fpu_counter > 5)
-               prefetch(&next->i387.fxsave);
-
-       /*
-        * Reload esp0.
-        */
-       load_esp0(tss, next);
-
-       /*
-        * Save away %gs. No need to save %fs, as it was saved on the
-        * stack on entry.  No need to save %es and %ds, as those are
-        * always kernel segments while inside the kernel.  Doing this
-        * before setting the new TLS descriptors avoids the situation
-        * where we temporarily have non-reloadable segments in %fs
-        * and %gs.  This could be an issue if the NMI handler ever
-        * used %fs or %gs (it does not today), or if the kernel is
-        * running inside of a hypervisor layer.
-        */
-       savesegment(gs, prev->gs);
-
-       /*
-        * Load the per-thread Thread-Local Storage descriptor.
-        */
-       load_TLS(next, cpu);
-
-       /*
-        * Restore IOPL if needed.  In normal use, the flags restore
-        * in the switch assembly will handle this.  But if the kernel
-        * is running virtualized at a non-zero CPL, the popf will
-        * not restore flags, so it must be done in a separate step.
-        */
-       if (get_kernel_rpl() && unlikely(prev->iopl != next->iopl))
-               set_iopl_mask(next->iopl);
-
-       /*
-        * Now maybe handle debug registers and/or IO bitmaps
-        */
-       if (unlikely(task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV ||
-                    task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT))
-               __switch_to_xtra(prev_p, next_p, tss);
-
-       /*
-        * Leave lazy mode, flushing any hypercalls made here.
-        * This must be done before restoring TLS segments so
-        * the GDT and LDT are properly updated, and must be
-        * done before math_state_restore, so the TS bit is up
-        * to date.
-        */
-       arch_leave_lazy_cpu_mode();
-
-       /* If the task has used fpu the last 5 timeslices, just do a full
-        * restore of the math state immediately to avoid the trap; the
-        * chances of needing FPU soon are obviously high now
-        */
-       if (next_p->fpu_counter > 5)
-               math_state_restore();
-
-       /*
-        * Restore %gs if needed (which is common)
-        */
-       if (prev->gs | next->gs)
-               loadsegment(gs, next->gs);
-
-       x86_write_percpu(current_task, next_p);
-
-       return prev_p;
-}
-
-asmlinkage int sys_fork(struct pt_regs regs)
-{
-       return do_fork(SIGCHLD, regs.esp, &regs, 0, NULL, NULL);
-}
-
-asmlinkage int sys_clone(struct pt_regs regs)
-{
-       unsigned long clone_flags;
-       unsigned long newsp;
-       int __user *parent_tidptr, *child_tidptr;
-
-       clone_flags = regs.ebx;
-       newsp = regs.ecx;
-       parent_tidptr = (int __user *)regs.edx;
-       child_tidptr = (int __user *)regs.edi;
-       if (!newsp)
-               newsp = regs.esp;
-       return do_fork(clone_flags, newsp, &regs, 0, parent_tidptr, child_tidptr);
-}
-
-/*
- * This is trivial, and on the face of it looks like it
- * could equally well be done in user mode.
- *
- * Not so, for quite unobvious reasons - register pressure.
- * In user mode vfork() cannot have a stack frame, and if
- * done by calling the "clone()" system call directly, you
- * do not have enough call-clobbered registers to hold all
- * the information you need.
- */
-asmlinkage int sys_vfork(struct pt_regs regs)
-{
-       return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs.esp, &regs, 0, NULL, NULL);
-}
-
-/*
- * sys_execve() executes a new program.
- */
-asmlinkage int sys_execve(struct pt_regs regs)
-{
-       int error;
-       char * filename;
-
-       filename = getname((char __user *) regs.ebx);
-       error = PTR_ERR(filename);
-       if (IS_ERR(filename))
-               goto out;
-       error = do_execve(filename,
-                       (char __user * __user *) regs.ecx,
-                       (char __user * __user *) regs.edx,
-                       &regs);
-       if (error == 0) {
-               task_lock(current);
-               current->ptrace &= ~PT_DTRACE;
-               task_unlock(current);
-               /* Make sure we don't return using sysenter.. */
-               set_thread_flag(TIF_IRET);
-       }
-       putname(filename);
-out:
-       return error;
-}
-
-#define top_esp                (THREAD_SIZE - sizeof(unsigned long))
-#define top_ebp                (THREAD_SIZE - 2*sizeof(unsigned long))
-
-unsigned long get_wchan(struct task_struct *p)
-{
-       unsigned long ebp, esp, eip;
-       unsigned long stack_page;
-       int count = 0;
-       if (!p || p == current || p->state == TASK_RUNNING)
-               return 0;
-       stack_page = (unsigned long)task_stack_page(p);
-       esp = p->thread.esp;
-       if (!stack_page || esp < stack_page || esp > top_esp+stack_page)
-               return 0;
-       /* include/asm-i386/system.h:switch_to() pushes ebp last. */
-       ebp = *(unsigned long *) esp;
-       do {
-               if (ebp < stack_page || ebp > top_ebp+stack_page)
-                       return 0;
-               eip = *(unsigned long *) (ebp+4);
-               if (!in_sched_functions(eip))
-                       return eip;
-               ebp = *(unsigned long *) ebp;
-       } while (count++ < 16);
-       return 0;
-}
-
-/*
- * sys_alloc_thread_area: get a yet unused TLS descriptor index.
- */
-static int get_free_idx(void)
-{
-       struct thread_struct *t = &current->thread;
-       int idx;
-
-       for (idx = 0; idx < GDT_ENTRY_TLS_ENTRIES; idx++)
-               if (desc_empty(t->tls_array + idx))
-                       return idx + GDT_ENTRY_TLS_MIN;
-       return -ESRCH;
-}
-
-/*
- * Set a given TLS descriptor:
- */
-asmlinkage int sys_set_thread_area(struct user_desc __user *u_info)
-{
-       struct thread_struct *t = &current->thread;
-       struct user_desc info;
-       struct desc_struct *desc;
-       int cpu, idx;
-
-       if (copy_from_user(&info, u_info, sizeof(info)))
-               return -EFAULT;
-       idx = info.entry_number;
-
-       /*
-        * index -1 means the kernel should try to find and
-        * allocate an empty descriptor:
-        */
-       if (idx == -1) {
-               idx = get_free_idx();
-               if (idx < 0)
-                       return idx;
-               if (put_user(idx, &u_info->entry_number))
-                       return -EFAULT;
-       }
-
-       if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
-               return -EINVAL;
-
-       desc = t->tls_array + idx - GDT_ENTRY_TLS_MIN;
-
-       /*
-        * We must not get preempted while modifying the TLS.
-        */
-       cpu = get_cpu();
-
-       if (LDT_empty(&info)) {
-               desc->a = 0;
-               desc->b = 0;
-       } else {
-               desc->a = LDT_entry_a(&info);
-               desc->b = LDT_entry_b(&info);
-       }
-       load_TLS(t, cpu);
-
-       put_cpu();
-
-       return 0;
-}
-
-/*
- * Get the current Thread-Local Storage area:
- */
-
-#define GET_BASE(desc) ( \
-       (((desc)->a >> 16) & 0x0000ffff) | \
-       (((desc)->b << 16) & 0x00ff0000) | \
-       ( (desc)->b        & 0xff000000)   )
-
-#define GET_LIMIT(desc) ( \
-       ((desc)->a & 0x0ffff) | \
-        ((desc)->b & 0xf0000) )
-       
-#define GET_32BIT(desc)                (((desc)->b >> 22) & 1)
-#define GET_CONTENTS(desc)     (((desc)->b >> 10) & 3)
-#define GET_WRITABLE(desc)     (((desc)->b >>  9) & 1)
-#define GET_LIMIT_PAGES(desc)  (((desc)->b >> 23) & 1)
-#define GET_PRESENT(desc)      (((desc)->b >> 15) & 1)
-#define GET_USEABLE(desc)      (((desc)->b >> 20) & 1)
-
-asmlinkage int sys_get_thread_area(struct user_desc __user *u_info)
-{
-       struct user_desc info;
-       struct desc_struct *desc;
-       int idx;
-
-       if (get_user(idx, &u_info->entry_number))
-               return -EFAULT;
-       if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
-               return -EINVAL;
-
-       memset(&info, 0, sizeof(info));
-
-       desc = current->thread.tls_array + idx - GDT_ENTRY_TLS_MIN;
-
-       info.entry_number = idx;
-       info.base_addr = GET_BASE(desc);
-       info.limit = GET_LIMIT(desc);
-       info.seg_32bit = GET_32BIT(desc);
-       info.contents = GET_CONTENTS(desc);
-       info.read_exec_only = !GET_WRITABLE(desc);
-       info.limit_in_pages = GET_LIMIT_PAGES(desc);
-       info.seg_not_present = !GET_PRESENT(desc);
-       info.useable = GET_USEABLE(desc);
-
-       if (copy_to_user(u_info, &info, sizeof(info)))
-               return -EFAULT;
-       return 0;
-}
-
-unsigned long arch_align_stack(unsigned long sp)
-{
-       if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
-               sp -= get_random_int() % 8192;
-       return sp & ~0xf;
-}
diff --git a/arch/i386/kernel/ptrace_32.c b/arch/i386/kernel/ptrace_32.c
deleted file mode 100644 (file)
index 7c1b925..0000000
+++ /dev/null
@@ -1,723 +0,0 @@
-/* ptrace.c */
-/* By Ross Biro 1/23/92 */
-/*
- * Pentium III FXSR, SSE support
- *     Gareth Hughes <gareth@valinux.com>, May 2000
- */
-
-#include <linux/kernel.h>
-#include <linux/sched.h>
-#include <linux/mm.h>
-#include <linux/smp.h>
-#include <linux/errno.h>
-#include <linux/ptrace.h>
-#include <linux/user.h>
-#include <linux/security.h>
-#include <linux/audit.h>
-#include <linux/seccomp.h>
-#include <linux/signal.h>
-
-#include <asm/uaccess.h>
-#include <asm/pgtable.h>
-#include <asm/system.h>
-#include <asm/processor.h>
-#include <asm/i387.h>
-#include <asm/debugreg.h>
-#include <asm/ldt.h>
-#include <asm/desc.h>
-
-/*
- * does not yet catch signals sent when the child dies.
- * in exit.c or in signal.c.
- */
-
-/*
- * Determines which flags the user has access to [1 = access, 0 = no access].
- * Prohibits changing ID(21), VIP(20), VIF(19), VM(17), NT(14), IOPL(12-13), IF(9).
- * Also masks reserved bits (31-22, 15, 5, 3, 1).
- */
-#define FLAG_MASK 0x00050dd5
-
-/* set's the trap flag. */
-#define TRAP_FLAG 0x100
-
-/*
- * Offset of eflags on child stack..
- */
-#define EFL_OFFSET offsetof(struct pt_regs, eflags)
-
-static inline struct pt_regs *get_child_regs(struct task_struct *task)
-{
-       void *stack_top = (void *)task->thread.esp0;
-       return stack_top - sizeof(struct pt_regs);
-}
-
-/*
- * This routine will get a word off of the processes privileged stack.
- * the offset is bytes into the pt_regs structure on the stack.
- * This routine assumes that all the privileged stacks are in our
- * data space.
- */   
-static inline int get_stack_long(struct task_struct *task, int offset)
-{
-       unsigned char *stack;
-
-       stack = (unsigned char *)task->thread.esp0 - sizeof(struct pt_regs);
-       stack += offset;
-       return (*((int *)stack));
-}
-
-/*
- * This routine will put a word on the processes privileged stack.
- * the offset is bytes into the pt_regs structure on the stack.
- * This routine assumes that all the privileged stacks are in our
- * data space.
- */
-static inline int put_stack_long(struct task_struct *task, int offset,
-       unsigned long data)
-{
-       unsigned char * stack;
-
-       stack = (unsigned char *)task->thread.esp0 - sizeof(struct pt_regs);
-       stack += offset;
-       *(unsigned long *) stack = data;
-       return 0;
-}
-
-static int putreg(struct task_struct *child,
-       unsigned long regno, unsigned long value)
-{
-       switch (regno >> 2) {
-               case GS:
-                       if (value && (value & 3) != 3)
-                               return -EIO;
-                       child->thread.gs = value;
-                       return 0;
-               case DS:
-               case ES:
-               case FS:
-                       if (value && (value & 3) != 3)
-                               return -EIO;
-                       value &= 0xffff;
-                       break;
-               case SS:
-               case CS:
-                       if ((value & 3) != 3)
-                               return -EIO;
-                       value &= 0xffff;
-                       break;
-               case EFL:
-                       value &= FLAG_MASK;
-                       value |= get_stack_long(child, EFL_OFFSET) & ~FLAG_MASK;
-                       break;
-       }
-       if (regno > FS*4)
-               regno -= 1*4;
-       put_stack_long(child, regno, value);
-       return 0;
-}
-
-static unsigned long getreg(struct task_struct *child,
-       unsigned long regno)
-{
-       unsigned long retval = ~0UL;
-
-       switch (regno >> 2) {
-               case GS:
-                       retval = child->thread.gs;
-                       break;
-               case DS:
-               case ES:
-               case FS:
-               case SS:
-               case CS:
-                       retval = 0xffff;
-                       /* fall through */
-               default:
-                       if (regno > FS*4)
-                               regno -= 1*4;
-                       retval &= get_stack_long(child, regno);
-       }
-       return retval;
-}
-
-#define LDT_SEGMENT 4
-
-static unsigned long convert_eip_to_linear(struct task_struct *child, struct pt_regs *regs)
-{
-       unsigned long addr, seg;
-
-       addr = regs->eip;
-       seg = regs->xcs & 0xffff;
-       if (regs->eflags & VM_MASK) {
-               addr = (addr & 0xffff) + (seg << 4);
-               return addr;
-       }
-
-       /*
-        * We'll assume that the code segments in the GDT
-        * are all zero-based. That is largely true: the
-        * TLS segments are used for data, and the PNPBIOS
-        * and APM bios ones we just ignore here.
-        */
-       if (seg & LDT_SEGMENT) {
-               u32 *desc;
-               unsigned long base;
-
-               seg &= ~7UL;
-
-               down(&child->mm->context.sem);
-               if (unlikely((seg >> 3) >= child->mm->context.size))
-                       addr = -1L; /* bogus selector, access would fault */
-               else {
-                       desc = child->mm->context.ldt + seg;
-                       base = ((desc[0] >> 16) |
-                               ((desc[1] & 0xff) << 16) |
-                               (desc[1] & 0xff000000));
-
-                       /* 16-bit code segment? */
-                       if (!((desc[1] >> 22) & 1))
-                               addr &= 0xffff;
-                       addr += base;
-               }
-               up(&child->mm->context.sem);
-       }
-       return addr;
-}
-
-static inline int is_setting_trap_flag(struct task_struct *child, struct pt_regs *regs)
-{
-       int i, copied;
-       unsigned char opcode[15];
-       unsigned long addr = convert_eip_to_linear(child, regs);
-
-       copied = access_process_vm(child, addr, opcode, sizeof(opcode), 0);
-       for (i = 0; i < copied; i++) {
-               switch (opcode[i]) {
-               /* popf and iret */
-               case 0x9d: case 0xcf:
-                       return 1;
-               /* opcode and address size prefixes */
-               case 0x66: case 0x67:
-                       continue;
-               /* irrelevant prefixes (segment overrides and repeats) */
-               case 0x26: case 0x2e:
-               case 0x36: case 0x3e:
-               case 0x64: case 0x65:
-               case 0xf0: case 0xf2: case 0xf3:
-                       continue;
-
-               /*
-                * pushf: NOTE! We should probably not let
-                * the user see the TF bit being set. But
-                * it's more pain than it's worth to avoid
-                * it, and a debugger could emulate this
-                * all in user space if it _really_ cares.
-                */
-               case 0x9c:
-               default:
-                       return 0;
-               }
-       }
-       return 0;
-}
-
-static void set_singlestep(struct task_struct *child)
-{
-       struct pt_regs *regs = get_child_regs(child);
-
-       /*
-        * Always set TIF_SINGLESTEP - this guarantees that 
-        * we single-step system calls etc..  This will also
-        * cause us to set TF when returning to user mode.
-        */
-       set_tsk_thread_flag(child, TIF_SINGLESTEP);
-
-       /*
-        * If TF was already set, don't do anything else
-        */
-       if (regs->eflags & TRAP_FLAG)
-               return;
-
-       /* Set TF on the kernel stack.. */
-       regs->eflags |= TRAP_FLAG;
-
-       /*
-        * ..but if TF is changed by the instruction we will trace,
-        * don't mark it as being "us" that set it, so that we
-        * won't clear it by hand later.
-        */
-       if (is_setting_trap_flag(child, regs))
-               return;
-       
-       child->ptrace |= PT_DTRACE;
-}
-
-static void clear_singlestep(struct task_struct *child)
-{
-       /* Always clear TIF_SINGLESTEP... */
-       clear_tsk_thread_flag(child, TIF_SINGLESTEP);
-
-       /* But touch TF only if it was set by us.. */
-       if (child->ptrace & PT_DTRACE) {
-               struct pt_regs *regs = get_child_regs(child);
-               regs->eflags &= ~TRAP_FLAG;
-               child->ptrace &= ~PT_DTRACE;
-       }
-}
-
-/*
- * Called by kernel/ptrace.c when detaching..
- *
- * Make sure the single step bit is not set.
- */
-void ptrace_disable(struct task_struct *child)
-{ 
-       clear_singlestep(child);
-       clear_tsk_thread_flag(child, TIF_SYSCALL_EMU);
-}
-
-/*
- * Perform get_thread_area on behalf of the traced child.
- */
-static int
-ptrace_get_thread_area(struct task_struct *child,
-                      int idx, struct user_desc __user *user_desc)
-{
-       struct user_desc info;
-       struct desc_struct *desc;
-
-/*
- * Get the current Thread-Local Storage area:
- */
-
-#define GET_BASE(desc) ( \
-       (((desc)->a >> 16) & 0x0000ffff) | \
-       (((desc)->b << 16) & 0x00ff0000) | \
-       ( (desc)->b        & 0xff000000)   )
-
-#define GET_LIMIT(desc) ( \
-       ((desc)->a & 0x0ffff) | \
-        ((desc)->b & 0xf0000) )
-
-#define GET_32BIT(desc)                (((desc)->b >> 22) & 1)
-#define GET_CONTENTS(desc)     (((desc)->b >> 10) & 3)
-#define GET_WRITABLE(desc)     (((desc)->b >>  9) & 1)
-#define GET_LIMIT_PAGES(desc)  (((desc)->b >> 23) & 1)
-#define GET_PRESENT(desc)      (((desc)->b >> 15) & 1)
-#define GET_USEABLE(desc)      (((desc)->b >> 20) & 1)
-
-       if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
-               return -EINVAL;
-
-       desc = child->thread.tls_array + idx - GDT_ENTRY_TLS_MIN;
-
-       info.entry_number = idx;
-       info.base_addr = GET_BASE(desc);
-       info.limit = GET_LIMIT(desc);
-       info.seg_32bit = GET_32BIT(desc);
-       info.contents = GET_CONTENTS(desc);
-       info.read_exec_only = !GET_WRITABLE(desc);
-       info.limit_in_pages = GET_LIMIT_PAGES(desc);
-       info.seg_not_present = !GET_PRESENT(desc);
-       info.useable = GET_USEABLE(desc);
-
-       if (copy_to_user(user_desc, &info, sizeof(info)))
-               return -EFAULT;
-
-       return 0;
-}
-
-/*
- * Perform set_thread_area on behalf of the traced child.
- */
-static int
-ptrace_set_thread_area(struct task_struct *child,
-                      int idx, struct user_desc __user *user_desc)
-{
-       struct user_desc info;
-       struct desc_struct *desc;
-
-       if (copy_from_user(&info, user_desc, sizeof(info)))
-               return -EFAULT;
-
-       if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
-               return -EINVAL;
-
-       desc = child->thread.tls_array + idx - GDT_ENTRY_TLS_MIN;
-       if (LDT_empty(&info)) {
-               desc->a = 0;
-               desc->b = 0;
-       } else {
-               desc->a = LDT_entry_a(&info);
-               desc->b = LDT_entry_b(&info);
-       }
-
-       return 0;
-}
-
-long arch_ptrace(struct task_struct *child, long request, long addr, long data)
-{
-       struct user * dummy = NULL;
-       int i, ret;
-       unsigned long __user *datap = (unsigned long __user *)data;
-
-       switch (request) {
-       /* when I and D space are separate, these will need to be fixed. */
-       case PTRACE_PEEKTEXT: /* read word at location addr. */ 
-       case PTRACE_PEEKDATA:
-               ret = generic_ptrace_peekdata(child, addr, data);
-               break;
-
-       /* read the word at location addr in the USER area. */
-       case PTRACE_PEEKUSR: {
-               unsigned long tmp;
-
-               ret = -EIO;
-               if ((addr & 3) || addr < 0 || 
-                   addr > sizeof(struct user) - 3)
-                       break;
-
-               tmp = 0;  /* Default return condition */
-               if(addr < FRAME_SIZE*sizeof(long))
-                       tmp = getreg(child, addr);
-               if(addr >= (long) &dummy->u_debugreg[0] &&
-                  addr <= (long) &dummy->u_debugreg[7]){
-                       addr -= (long) &dummy->u_debugreg[0];
-                       addr = addr >> 2;
-                       tmp = child->thread.debugreg[addr];
-               }
-               ret = put_user(tmp, datap);
-               break;
-       }
-
-       /* when I and D space are separate, this will have to be fixed. */
-       case PTRACE_POKETEXT: /* write the word at location addr. */
-       case PTRACE_POKEDATA:
-               ret = generic_ptrace_pokedata(child, addr, data);
-               break;
-
-       case PTRACE_POKEUSR: /* write the word at location addr in the USER area */
-               ret = -EIO;
-               if ((addr & 3) || addr < 0 || 
-                   addr > sizeof(struct user) - 3)
-                       break;
-
-               if (addr < FRAME_SIZE*sizeof(long)) {
-                       ret = putreg(child, addr, data);
-                       break;
-               }
-               /* We need to be very careful here.  We implicitly
-                  want to modify a portion of the task_struct, and we
-                  have to be selective about what portions we allow someone
-                  to modify. */
-
-                 ret = -EIO;
-                 if(addr >= (long) &dummy->u_debugreg[0] &&
-                    addr <= (long) &dummy->u_debugreg[7]){
-
-                         if(addr == (long) &dummy->u_debugreg[4]) break;
-                         if(addr == (long) &dummy->u_debugreg[5]) break;
-                         if(addr < (long) &dummy->u_debugreg[4] &&
-                            ((unsigned long) data) >= TASK_SIZE-3) break;
-                         
-                         /* Sanity-check data. Take one half-byte at once with
-                          * check = (val >> (16 + 4*i)) & 0xf. It contains the
-                          * R/Wi and LENi bits; bits 0 and 1 are R/Wi, and bits
-                          * 2 and 3 are LENi. Given a list of invalid values,
-                          * we do mask |= 1 << invalid_value, so that
-                          * (mask >> check) & 1 is a correct test for invalid
-                          * values.
-                          *
-                          * R/Wi contains the type of the breakpoint /
-                          * watchpoint, LENi contains the length of the watched
-                          * data in the watchpoint case.
-                          *
-                          * The invalid values are:
-                          * - LENi == 0x10 (undefined), so mask |= 0x0f00.
-                          * - R/Wi == 0x10 (break on I/O reads or writes), so
-                          *   mask |= 0x4444.
-                          * - R/Wi == 0x00 && LENi != 0x00, so we have mask |=
-                          *   0x1110.
-                          *
-                          * Finally, mask = 0x0f00 | 0x4444 | 0x1110 == 0x5f54.
-                          *
-                          * See the Intel Manual "System Programming Guide",
-                          * 15.2.4
-                          *
-                          * Note that LENi == 0x10 is defined on x86_64 in long
-                          * mode (i.e. even for 32-bit userspace software, but
-                          * 64-bit kernel), so the x86_64 mask value is 0x5454.
-                          * See the AMD manual no. 24593 (AMD64 System
-                          * Programming)*/
-
-                         if(addr == (long) &dummy->u_debugreg[7]) {
-                                 data &= ~DR_CONTROL_RESERVED;
-                                 for(i=0; i<4; i++)
-                                         if ((0x5f54 >> ((data >> (16 + 4*i)) & 0xf)) & 1)
-                                                 goto out_tsk;
-                                 if (data)
-                                         set_tsk_thread_flag(child, TIF_DEBUG);
-                                 else
-                                         clear_tsk_thread_flag(child, TIF_DEBUG);
-                         }
-                         addr -= (long) &dummy->u_debugreg;
-                         addr = addr >> 2;
-                         child->thread.debugreg[addr] = data;
-                         ret = 0;
-                 }
-                 break;
-
-       case PTRACE_SYSEMU: /* continue and stop at next syscall, which will not be executed */
-       case PTRACE_SYSCALL:    /* continue and stop at next (return from) syscall */
-       case PTRACE_CONT:       /* restart after signal. */
-               ret = -EIO;
-               if (!valid_signal(data))
-                       break;
-               if (request == PTRACE_SYSEMU) {
-                       set_tsk_thread_flag(child, TIF_SYSCALL_EMU);
-                       clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
-               } else if (request == PTRACE_SYSCALL) {
-                       set_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
-                       clear_tsk_thread_flag(child, TIF_SYSCALL_EMU);
-               } else {
-                       clear_tsk_thread_flag(child, TIF_SYSCALL_EMU);
-                       clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
-               }
-               child->exit_code = data;
-               /* make sure the single step bit is not set. */
-               clear_singlestep(child);
-               wake_up_process(child);
-               ret = 0;
-               break;
-
-/*
- * make the child exit.  Best I can do is send it a sigkill. 
- * perhaps it should be put in the status that it wants to 
- * exit.
- */
-       case PTRACE_KILL:
-               ret = 0;
-               if (child->exit_state == EXIT_ZOMBIE)   /* already dead */
-                       break;
-               child->exit_code = SIGKILL;
-               /* make sure the single step bit is not set. */
-               clear_singlestep(child);
-               wake_up_process(child);
-               break;
-
-       case PTRACE_SYSEMU_SINGLESTEP: /* Same as SYSEMU, but singlestep if not syscall */
-       case PTRACE_SINGLESTEP: /* set the trap flag. */
-               ret = -EIO;
-               if (!valid_signal(data))
-                       break;
-
-               if (request == PTRACE_SYSEMU_SINGLESTEP)
-                       set_tsk_thread_flag(child, TIF_SYSCALL_EMU);
-               else
-                       clear_tsk_thread_flag(child, TIF_SYSCALL_EMU);
-
-               clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
-               set_singlestep(child);
-               child->exit_code = data;
-               /* give it a chance to run. */
-               wake_up_process(child);
-               ret = 0;
-               break;
-
-       case PTRACE_DETACH:
-               /* detach a process that was attached. */
-               ret = ptrace_detach(child, data);
-               break;
-
-       case PTRACE_GETREGS: { /* Get all gp regs from the child. */
-               if (!access_ok(VERIFY_WRITE, datap, FRAME_SIZE*sizeof(long))) {
-                       ret = -EIO;
-                       break;
-               }
-               for ( i = 0; i < FRAME_SIZE*sizeof(long); i += sizeof(long) ) {
-                       __put_user(getreg(child, i), datap);
-                       datap++;
-               }
-               ret = 0;
-               break;
-       }
-
-       case PTRACE_SETREGS: { /* Set all gp regs in the child. */
-               unsigned long tmp;
-               if (!access_ok(VERIFY_READ, datap, FRAME_SIZE*sizeof(long))) {
-                       ret = -EIO;
-                       break;
-               }
-               for ( i = 0; i < FRAME_SIZE*sizeof(long); i += sizeof(long) ) {
-                       __get_user(tmp, datap);
-                       putreg(child, i, tmp);
-                       datap++;
-               }
-               ret = 0;
-               break;
-       }
-
-       case PTRACE_GETFPREGS: { /* Get the child FPU state. */
-               if (!access_ok(VERIFY_WRITE, datap,
-                              sizeof(struct user_i387_struct))) {
-                       ret = -EIO;
-                       break;
-               }
-               ret = 0;
-               if (!tsk_used_math(child))
-                       init_fpu(child);
-               get_fpregs((struct user_i387_struct __user *)data, child);
-               break;
-       }
-
-       case PTRACE_SETFPREGS: { /* Set the child FPU state. */
-               if (!access_ok(VERIFY_READ, datap,
-                              sizeof(struct user_i387_struct))) {
-                       ret = -EIO;
-                       break;
-               }
-               set_stopped_child_used_math(child);
-               set_fpregs(child, (struct user_i387_struct __user *)data);
-               ret = 0;
-               break;
-       }
-
-       case PTRACE_GETFPXREGS: { /* Get the child extended FPU state. */
-               if (!access_ok(VERIFY_WRITE, datap,
-                              sizeof(struct user_fxsr_struct))) {
-                       ret = -EIO;
-                       break;
-               }
-               if (!tsk_used_math(child))
-                       init_fpu(child);
-               ret = get_fpxregs((struct user_fxsr_struct __user *)data, child);
-               break;
-       }
-
-       case PTRACE_SETFPXREGS: { /* Set the child extended FPU state. */
-               if (!access_ok(VERIFY_READ, datap,
-                              sizeof(struct user_fxsr_struct))) {
-                       ret = -EIO;
-                       break;
-               }
-               set_stopped_child_used_math(child);
-               ret = set_fpxregs(child, (struct user_fxsr_struct __user *)data);
-               break;
-       }
-
-       case PTRACE_GET_THREAD_AREA:
-               ret = ptrace_get_thread_area(child, addr,
-                                       (struct user_desc __user *) data);
-               break;
-
-       case PTRACE_SET_THREAD_AREA:
-               ret = ptrace_set_thread_area(child, addr,
-                                       (struct user_desc __user *) data);
-               break;
-
-       default:
-               ret = ptrace_request(child, request, addr, data);
-               break;
-       }
- out_tsk:
-       return ret;
-}
-
-void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code)
-{
-       struct siginfo info;
-
-       tsk->thread.trap_no = 1;
-       tsk->thread.error_code = error_code;
-
-       memset(&info, 0, sizeof(info));
-       info.si_signo = SIGTRAP;
-       info.si_code = TRAP_BRKPT;
-
-       /* User-mode eip? */
-       info.si_addr = user_mode_vm(regs) ? (void __user *) regs->eip : NULL;
-
-       /* Send us the fakey SIGTRAP */
-       force_sig_info(SIGTRAP, &info, tsk);
-}
-
-/* notification of system call entry/exit
- * - triggered by current->work.syscall_trace
- */
-__attribute__((regparm(3)))
-int do_syscall_trace(struct pt_regs *regs, int entryexit)
-{
-       int is_sysemu = test_thread_flag(TIF_SYSCALL_EMU);
-       /*
-        * With TIF_SYSCALL_EMU set we want to ignore TIF_SINGLESTEP for syscall
-        * interception
-        */
-       int is_singlestep = !is_sysemu && test_thread_flag(TIF_SINGLESTEP);
-       int ret = 0;
-
-       /* do the secure computing check first */
-       if (!entryexit)
-               secure_computing(regs->orig_eax);
-
-       if (unlikely(current->audit_context)) {
-               if (entryexit)
-                       audit_syscall_exit(AUDITSC_RESULT(regs->eax),
-                                               regs->eax);
-               /* Debug traps, when using PTRACE_SINGLESTEP, must be sent only
-                * on the syscall exit path. Normally, when TIF_SYSCALL_AUDIT is
-                * not used, entry.S will call us only on syscall exit, not
-                * entry; so when TIF_SYSCALL_AUDIT is used we must avoid
-                * calling send_sigtrap() on syscall entry.
-                *
-                * Note that when PTRACE_SYSEMU_SINGLESTEP is used,
-                * is_singlestep is false, despite his name, so we will still do
-                * the correct thing.
-                */
-               else if (is_singlestep)
-                       goto out;
-       }
-
-       if (!(current->ptrace & PT_PTRACED))
-               goto out;
-
-       /* If a process stops on the 1st tracepoint with SYSCALL_TRACE
-        * and then is resumed with SYSEMU_SINGLESTEP, it will come in
-        * here. We have to check this and return */
-       if (is_sysemu && entryexit)
-               return 0;
-
-       /* Fake a debug trap */
-       if (is_singlestep)
-               send_sigtrap(current, regs, 0);
-
-       if (!test_thread_flag(TIF_SYSCALL_TRACE) && !is_sysemu)
-               goto out;
-
-       /* the 0x80 provides a way for the tracing parent to distinguish
-          between a syscall stop and SIGTRAP delivery */
-       /* Note that the debugger could change the result of test_thread_flag!*/
-       ptrace_notify(SIGTRAP | ((current->ptrace & PT_TRACESYSGOOD) ? 0x80:0));
-
-       /*
-        * this isn't the same as continuing with a signal, but it will do
-        * for normal use.  strace only continues with a signal if the
-        * stopping signal is not SIGTRAP.  -brl
-        */
-       if (current->exit_code) {
-               send_sig(current->exit_code, current, 1);
-               current->exit_code = 0;
-       }
-       ret = is_sysemu;
-out:
-       if (unlikely(current->audit_context) && !entryexit)
-               audit_syscall_entry(AUDIT_ARCH_I386, regs->orig_eax,
-                                   regs->ebx, regs->ecx, regs->edx, regs->esi);
-       if (ret == 0)
-               return 0;
-
-       regs->orig_eax = -1; /* force skip of syscall restarting */
-       if (unlikely(current->audit_context))
-               audit_syscall_exit(AUDITSC_RESULT(regs->eax), regs->eax);
-       return 1;
-}
diff --git a/arch/i386/kernel/quirks.c b/arch/i386/kernel/quirks.c
deleted file mode 100644 (file)
index 6722469..0000000
+++ /dev/null
@@ -1,49 +0,0 @@
-/*
- * This file contains work-arounds for x86 and x86_64 platform bugs.
- */
-#include <linux/pci.h>
-#include <linux/irq.h>
-
-#if defined(CONFIG_X86_IO_APIC) && defined(CONFIG_SMP) && defined(CONFIG_PCI)
-
-static void __devinit quirk_intel_irqbalance(struct pci_dev *dev)
-{
-       u8 config, rev;
-       u32 word;
-
-       /* BIOS may enable hardware IRQ balancing for
-        * E7520/E7320/E7525(revision ID 0x9 and below)
-        * based platforms.
-        * Disable SW irqbalance/affinity on those platforms.
-        */
-       pci_read_config_byte(dev, PCI_CLASS_REVISION, &rev);
-       if (rev > 0x9)
-               return;
-
-       /* enable access to config space*/
-       pci_read_config_byte(dev, 0xf4, &config);
-       pci_write_config_byte(dev, 0xf4, config|0x2);
-
-       /* read xTPR register */
-       raw_pci_ops->read(0, 0, 0x40, 0x4c, 2, &word);
-
-       if (!(word & (1 << 13))) {
-               printk(KERN_INFO "Intel E7520/7320/7525 detected. "
-                       "Disabling irq balancing and affinity\n");
-#ifdef CONFIG_IRQBALANCE
-               irqbalance_disable("");
-#endif
-               noirqdebug_setup("");
-#ifdef CONFIG_PROC_FS
-               no_irq_affinity = 1;
-#endif
-       }
-
-       /* put back the original value for config space*/
-       if (!(config & 0x2))
-               pci_write_config_byte(dev, 0xf4, config);
-}
-DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL,   PCI_DEVICE_ID_INTEL_E7320_MCH,  quirk_intel_irqbalance);
-DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL,   PCI_DEVICE_ID_INTEL_E7525_MCH,  quirk_intel_irqbalance);
-DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL,   PCI_DEVICE_ID_INTEL_E7520_MCH,  quirk_intel_irqbalance);
-#endif
diff --git a/arch/i386/kernel/reboot_32.c b/arch/i386/kernel/reboot_32.c
deleted file mode 100644 (file)
index 0d79624..0000000
+++ /dev/null
@@ -1,413 +0,0 @@
-/*
- *  linux/arch/i386/kernel/reboot.c
- */
-
-#include <linux/mm.h>
-#include <linux/module.h>
-#include <linux/delay.h>
-#include <linux/init.h>
-#include <linux/interrupt.h>
-#include <linux/mc146818rtc.h>
-#include <linux/efi.h>
-#include <linux/dmi.h>
-#include <linux/ctype.h>
-#include <linux/pm.h>
-#include <linux/reboot.h>
-#include <asm/uaccess.h>
-#include <asm/apic.h>
-#include <asm/desc.h>
-#include "mach_reboot.h"
-#include <asm/reboot_fixups.h>
-#include <asm/reboot.h>
-
-/*
- * Power off function, if any
- */
-void (*pm_power_off)(void);
-EXPORT_SYMBOL(pm_power_off);
-
-static int reboot_mode;
-static int reboot_thru_bios;
-
-#ifdef CONFIG_SMP
-static int reboot_cpu = -1;
-#endif
-static int __init reboot_setup(char *str)
-{
-       while(1) {
-               switch (*str) {
-               case 'w': /* "warm" reboot (no memory testing etc) */
-                       reboot_mode = 0x1234;
-                       break;
-               case 'c': /* "cold" reboot (with memory testing etc) */
-                       reboot_mode = 0x0;
-                       break;
-               case 'b': /* "bios" reboot by jumping through the BIOS */
-                       reboot_thru_bios = 1;
-                       break;
-               case 'h': /* "hard" reboot by toggling RESET and/or crashing the CPU */
-                       reboot_thru_bios = 0;
-                       break;
-#ifdef CONFIG_SMP
-               case 's': /* "smp" reboot by executing reset on BSP or other CPU*/
-                       if (isdigit(*(str+1))) {
-                               reboot_cpu = (int) (*(str+1) - '0');
-                               if (isdigit(*(str+2)))
-                                       reboot_cpu = reboot_cpu*10 + (int)(*(str+2) - '0');
-                       }
-                               /* we will leave sorting out the final value 
-                               when we are ready to reboot, since we might not
-                               have set up boot_cpu_id or smp_num_cpu */
-                       break;
-#endif
-               }
-               if((str = strchr(str,',')) != NULL)
-                       str++;
-               else
-                       break;
-       }
-       return 1;
-}
-
-__setup("reboot=", reboot_setup);
-
-/*
- * Reboot options and system auto-detection code provided by
- * Dell Inc. so their systems "just work". :-)
- */
-
-/*
- * Some machines require the "reboot=b"  commandline option, this quirk makes that automatic.
- */
-static int __init set_bios_reboot(struct dmi_system_id *d)
-{
-       if (!reboot_thru_bios) {
-               reboot_thru_bios = 1;
-               printk(KERN_INFO "%s series board detected. Selecting BIOS-method for reboots.\n", d->ident);
-       }
-       return 0;
-}
-
-static struct dmi_system_id __initdata reboot_dmi_table[] = {
-       {       /* Handle problems with rebooting on Dell E520's */
-               .callback = set_bios_reboot,
-               .ident = "Dell E520",
-               .matches = {
-                       DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
-                       DMI_MATCH(DMI_PRODUCT_NAME, "Dell DM061"),
-               },
-       },
-       {       /* Handle problems with rebooting on Dell 1300's */
-               .callback = set_bios_reboot,
-               .ident = "Dell PowerEdge 1300",
-               .matches = {
-                       DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"),
-                       DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 1300/"),
-               },
-       },
-       {       /* Handle problems with rebooting on Dell 300's */
-               .callback = set_bios_reboot,
-               .ident = "Dell PowerEdge 300",
-               .matches = {
-                       DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"),
-                       DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 300/"),
-               },
-       },
-       {       /* Handle problems with rebooting on Dell Optiplex 745's SFF*/
-               .callback = set_bios_reboot,
-               .ident = "Dell OptiPlex 745",
-               .matches = {
-                       DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
-                       DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 745"),
-                       DMI_MATCH(DMI_BOARD_NAME, "0WF810"),
-               },
-       },
-       {       /* Handle problems with rebooting on Dell 2400's */
-               .callback = set_bios_reboot,
-               .ident = "Dell PowerEdge 2400",
-               .matches = {
-                       DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"),
-                       DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 2400"),
-               },
-       },
-       {       /* Handle problems with rebooting on HP laptops */
-               .callback = set_bios_reboot,
-               .ident = "HP Compaq Laptop",
-               .matches = {
-                       DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"),
-                       DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq"),
-               },
-       },
-       { }
-};
-
-static int __init reboot_init(void)
-{
-       dmi_check_system(reboot_dmi_table);
-       return 0;
-}
-
-core_initcall(reboot_init);
-
-/* The following code and data reboots the machine by switching to real
-   mode and jumping to the BIOS reset entry point, as if the CPU has
-   really been reset.  The previous version asked the keyboard
-   controller to pulse the CPU reset line, which is more thorough, but
-   doesn't work with at least one type of 486 motherboard.  It is easy
-   to stop this code working; hence the copious comments. */
-
-static unsigned long long
-real_mode_gdt_entries [3] =
-{
-       0x0000000000000000ULL,  /* Null descriptor */
-       0x00009a000000ffffULL,  /* 16-bit real-mode 64k code at 0x00000000 */
-       0x000092000100ffffULL   /* 16-bit real-mode 64k data at 0x00000100 */
-};
-
-static struct Xgt_desc_struct
-real_mode_gdt = { sizeof (real_mode_gdt_entries) - 1, (long)real_mode_gdt_entries },
-real_mode_idt = { 0x3ff, 0 },
-no_idt = { 0, 0 };
-
-
-/* This is 16-bit protected mode code to disable paging and the cache,
-   switch to real mode and jump to the BIOS reset code.
-
-   The instruction that switches to real mode by writing to CR0 must be
-   followed immediately by a far jump instruction, which set CS to a
-   valid value for real mode, and flushes the prefetch queue to avoid
-   running instructions that have already been decoded in protected
-   mode.
-
-   Clears all the flags except ET, especially PG (paging), PE
-   (protected-mode enable) and TS (task switch for coprocessor state
-   save).  Flushes the TLB after paging has been disabled.  Sets CD and
-   NW, to disable the cache on a 486, and invalidates the cache.  This
-   is more like the state of a 486 after reset.  I don't know if
-   something else should be done for other chips.
-
-   More could be done here to set up the registers as if a CPU reset had
-   occurred; hopefully real BIOSs don't assume much. */
-
-static unsigned char real_mode_switch [] =
-{
-       0x66, 0x0f, 0x20, 0xc0,                 /*    movl  %cr0,%eax        */
-       0x66, 0x83, 0xe0, 0x11,                 /*    andl  $0x00000011,%eax */
-       0x66, 0x0d, 0x00, 0x00, 0x00, 0x60,     /*    orl   $0x60000000,%eax */
-       0x66, 0x0f, 0x22, 0xc0,                 /*    movl  %eax,%cr0        */
-       0x66, 0x0f, 0x22, 0xd8,                 /*    movl  %eax,%cr3        */
-       0x66, 0x0f, 0x20, 0xc3,                 /*    movl  %cr0,%ebx        */
-       0x66, 0x81, 0xe3, 0x00, 0x00, 0x00, 0x60,       /*    andl  $0x60000000,%ebx */
-       0x74, 0x02,                             /*    jz    f                */
-       0x0f, 0x09,                             /*    wbinvd                 */
-       0x24, 0x10,                             /* f: andb  $0x10,al         */
-       0x66, 0x0f, 0x22, 0xc0                  /*    movl  %eax,%cr0        */
-};
-static unsigned char jump_to_bios [] =
-{
-       0xea, 0x00, 0x00, 0xff, 0xff            /*    ljmp  $0xffff,$0x0000  */
-};
-
-/*
- * Switch to real mode and then execute the code
- * specified by the code and length parameters.
- * We assume that length will aways be less that 100!
- */
-void machine_real_restart(unsigned char *code, int length)
-{
-       local_irq_disable();
-
-       /* Write zero to CMOS register number 0x0f, which the BIOS POST
-          routine will recognize as telling it to do a proper reboot.  (Well
-          that's what this book in front of me says -- it may only apply to
-          the Phoenix BIOS though, it's not clear).  At the same time,
-          disable NMIs by setting the top bit in the CMOS address register,
-          as we're about to do peculiar things to the CPU.  I'm not sure if
-          `outb_p' is needed instead of just `outb'.  Use it to be on the
-          safe side.  (Yes, CMOS_WRITE does outb_p's. -  Paul G.)
-        */
-
-       spin_lock(&rtc_lock);
-       CMOS_WRITE(0x00, 0x8f);
-       spin_unlock(&rtc_lock);
-
-       /* Remap the kernel at virtual address zero, as well as offset zero
-          from the kernel segment.  This assumes the kernel segment starts at
-          virtual address PAGE_OFFSET. */
-
-       memcpy (swapper_pg_dir, swapper_pg_dir + USER_PGD_PTRS,
-               sizeof (swapper_pg_dir [0]) * KERNEL_PGD_PTRS);
-
-       /*
-        * Use `swapper_pg_dir' as our page directory.
-        */
-       load_cr3(swapper_pg_dir);
-
-       /* Write 0x1234 to absolute memory location 0x472.  The BIOS reads
-          this on booting to tell it to "Bypass memory test (also warm
-          boot)".  This seems like a fairly standard thing that gets set by
-          REBOOT.COM programs, and the previous reset routine did this
-          too. */
-
-       *((unsigned short *)0x472) = reboot_mode;
-
-       /* For the switch to real mode, copy some code to low memory.  It has
-          to be in the first 64k because it is running in 16-bit mode, and it
-          has to have the same physical and virtual address, because it turns
-          off paging.  Copy it near the end of the first page, out of the way
-          of BIOS variables. */
-
-       memcpy ((void *) (0x1000 - sizeof (real_mode_switch) - 100),
-               real_mode_switch, sizeof (real_mode_switch));
-       memcpy ((void *) (0x1000 - 100), code, length);
-
-       /* Set up the IDT for real mode. */
-
-       load_idt(&real_mode_idt);
-
-       /* Set up a GDT from which we can load segment descriptors for real
-          mode.  The GDT is not used in real mode; it is just needed here to
-          prepare the descriptors. */
-
-       load_gdt(&real_mode_gdt);
-
-       /* Load the data segment registers, and thus the descriptors ready for
-          real mode.  The base address of each segment is 0x100, 16 times the
-          selector value being loaded here.  This is so that the segment
-          registers don't have to be reloaded after switching to real mode:
-          the values are consistent for real mode operation already. */
-
-       __asm__ __volatile__ ("movl $0x0010,%%eax\n"
-                               "\tmovl %%eax,%%ds\n"
-                               "\tmovl %%eax,%%es\n"
-                               "\tmovl %%eax,%%fs\n"
-                               "\tmovl %%eax,%%gs\n"
-                               "\tmovl %%eax,%%ss" : : : "eax");
-
-       /* Jump to the 16-bit code that we copied earlier.  It disables paging
-          and the cache, switches to real mode, and jumps to the BIOS reset
-          entry point. */
-
-       __asm__ __volatile__ ("ljmp $0x0008,%0"
-                               :
-                               : "i" ((void *) (0x1000 - sizeof (real_mode_switch) - 100)));
-}
-#ifdef CONFIG_APM_MODULE
-EXPORT_SYMBOL(machine_real_restart);
-#endif
-
-static void native_machine_shutdown(void)
-{
-#ifdef CONFIG_SMP
-       int reboot_cpu_id;
-
-       /* The boot cpu is always logical cpu 0 */
-       reboot_cpu_id = 0;
-
-       /* See if there has been given a command line override */
-       if ((reboot_cpu != -1) && (reboot_cpu < NR_CPUS) &&
-               cpu_isset(reboot_cpu, cpu_online_map)) {
-               reboot_cpu_id = reboot_cpu;
-       }
-
-       /* Make certain the cpu I'm rebooting on is online */
-       if (!cpu_isset(reboot_cpu_id, cpu_online_map)) {
-               reboot_cpu_id = smp_processor_id();
-       }
-
-       /* Make certain I only run on the appropriate processor */
-       set_cpus_allowed(current, cpumask_of_cpu(reboot_cpu_id));
-
-       /* O.K. Now that I'm on the appropriate processor, stop
-        * all of the others, and disable their local APICs.
-        */
-
-       smp_send_stop();
-#endif /* CONFIG_SMP */
-
-       lapic_shutdown();
-
-#ifdef CONFIG_X86_IO_APIC
-       disable_IO_APIC();
-#endif
-}
-
-void __attribute__((weak)) mach_reboot_fixups(void)
-{
-}
-
-static void native_machine_emergency_restart(void)
-{
-       if (!reboot_thru_bios) {
-               if (efi_enabled) {
-                       efi.reset_system(EFI_RESET_COLD, EFI_SUCCESS, 0, NULL);
-                       load_idt(&no_idt);
-                       __asm__ __volatile__("int3");
-               }
-               /* rebooting needs to touch the page at absolute addr 0 */
-               *((unsigned short *)__va(0x472)) = reboot_mode;
-               for (;;) {
-                       mach_reboot_fixups(); /* for board specific fixups */
-                       mach_reboot();
-                       /* That didn't work - force a triple fault.. */
-                       load_idt(&no_idt);
-                       __asm__ __volatile__("int3");
-               }
-       }
-       if (efi_enabled)
-               efi.reset_system(EFI_RESET_WARM, EFI_SUCCESS, 0, NULL);
-
-       machine_real_restart(jump_to_bios, sizeof(jump_to_bios));
-}
-
-static void native_machine_restart(char * __unused)
-{
-       machine_shutdown();
-       machine_emergency_restart();
-}
-
-static void native_machine_halt(void)
-{
-}
-
-static void native_machine_power_off(void)
-{
-       if (pm_power_off) {
-               machine_shutdown();
-               pm_power_off();
-       }
-}
-
-
-struct machine_ops machine_ops = {
-       .power_off = native_machine_power_off,
-       .shutdown = native_machine_shutdown,
-       .emergency_restart = native_machine_emergency_restart,
-       .restart = native_machine_restart,
-       .halt = native_machine_halt,
-};
-
-void machine_power_off(void)
-{
-       machine_ops.power_off();
-}
-
-void machine_shutdown(void)
-{
-       machine_ops.shutdown();
-}
-
-void machine_emergency_restart(void)
-{
-       machine_ops.emergency_restart();
-}
-
-void machine_restart(char *cmd)
-{
-       machine_ops.restart(cmd);
-}
-
-void machine_halt(void)
-{
-       machine_ops.halt();
-}
diff --git a/arch/i386/kernel/reboot_fixups_32.c b/arch/i386/kernel/reboot_fixups_32.c
deleted file mode 100644 (file)
index 03e1cce..0000000
+++ /dev/null
@@ -1,68 +0,0 @@
-/*
- * linux/arch/i386/kernel/reboot_fixups.c
- *
- * This is a good place to put board specific reboot fixups.
- *
- * List of supported fixups:
- * geode-gx1/cs5530a - Jaya Kumar <jayalk@intworks.biz>
- * geode-gx/lx/cs5536 - Andres Salomon <dilinger@debian.org>
- *
- */
-
-#include <asm/delay.h>
-#include <linux/pci.h>
-#include <asm/reboot_fixups.h>
-#include <asm/msr.h>
-
-static void cs5530a_warm_reset(struct pci_dev *dev)
-{
-       /* writing 1 to the reset control register, 0x44 causes the
-       cs5530a to perform a system warm reset */
-       pci_write_config_byte(dev, 0x44, 0x1);
-       udelay(50); /* shouldn't get here but be safe and spin-a-while */
-       return;
-}
-
-static void cs5536_warm_reset(struct pci_dev *dev)
-{
-       /*
-        * 6.6.2.12 Soft Reset (DIVIL_SOFT_RESET)
-        * writing 1 to the LSB of this MSR causes a hard reset.
-        */
-       wrmsrl(0x51400017, 1ULL);
-       udelay(50); /* shouldn't get here but be safe and spin a while */
-}
-
-struct device_fixup {
-       unsigned int vendor;
-       unsigned int device;
-       void (*reboot_fixup)(struct pci_dev *);
-};
-
-static struct device_fixup fixups_table[] = {
-{ PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5530_LEGACY, cs5530a_warm_reset },
-{ PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CS5536_ISA, cs5536_warm_reset },
-};
-
-/*
- * we see if any fixup is available for our current hardware. if there
- * is a fixup, we call it and we expect to never return from it. if we
- * do return, we keep looking and then eventually fall back to the
- * standard mach_reboot on return.
- */
-void mach_reboot_fixups(void)
-{
-       struct device_fixup *cur;
-       struct pci_dev *dev;
-       int i;
-
-       for (i=0; i < ARRAY_SIZE(fixups_table); i++) {
-               cur = &(fixups_table[i]);
-               dev = pci_get_device(cur->vendor, cur->device, NULL);
-               if (!dev)
-                       continue;
-
-               cur->reboot_fixup(dev);
-       }
-}
-
diff --git a/arch/i386/kernel/relocate_kernel_32.S b/arch/i386/kernel/relocate_kernel_32.S
deleted file mode 100644 (file)
index f151d6f..0000000
+++ /dev/null
@@ -1,252 +0,0 @@
-/*
- * relocate_kernel.S - put the kernel image in place to boot
- * Copyright (C) 2002-2004 Eric Biederman  <ebiederm@xmission.com>
- *
- * This source code is licensed under the GNU General Public License,
- * Version 2.  See the file COPYING for more details.
- */
-
-#include <linux/linkage.h>
-#include <asm/page.h>
-#include <asm/kexec.h>
-
-/*
- * Must be relocatable PIC code callable as a C function
- */
-
-#define PTR(x) (x << 2)
-#define PAGE_ALIGNED (1 << PAGE_SHIFT)
-#define PAGE_ATTR 0x63 /* _PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY */
-#define PAE_PGD_ATTR 0x01 /* _PAGE_PRESENT */
-
-       .text
-       .align PAGE_ALIGNED
-       .globl relocate_kernel
-relocate_kernel:
-       movl    8(%esp), %ebp /* list of pages */
-
-#ifdef CONFIG_X86_PAE
-       /* map the control page at its virtual address */
-
-       movl    PTR(VA_PGD)(%ebp), %edi
-       movl    PTR(VA_CONTROL_PAGE)(%ebp), %eax
-       andl    $0xc0000000, %eax
-       shrl    $27, %eax
-       addl    %edi, %eax
-
-       movl    PTR(PA_PMD_0)(%ebp), %edx
-       orl     $PAE_PGD_ATTR, %edx
-       movl    %edx, (%eax)
-
-       movl    PTR(VA_PMD_0)(%ebp), %edi
-       movl    PTR(VA_CONTROL_PAGE)(%ebp), %eax
-       andl    $0x3fe00000, %eax
-       shrl    $18, %eax
-       addl    %edi, %eax
-
-       movl    PTR(PA_PTE_0)(%ebp), %edx
-       orl     $PAGE_ATTR, %edx
-       movl    %edx, (%eax)
-
-       movl    PTR(VA_PTE_0)(%ebp), %edi
-       movl    PTR(VA_CONTROL_PAGE)(%ebp), %eax
-       andl    $0x001ff000, %eax
-       shrl    $9, %eax
-       addl    %edi, %eax
-
-       movl    PTR(PA_CONTROL_PAGE)(%ebp), %edx
-       orl     $PAGE_ATTR, %edx
-       movl    %edx, (%eax)
-
-       /* identity map the control page at its physical address */
-
-       movl    PTR(VA_PGD)(%ebp), %edi
-       movl    PTR(PA_CONTROL_PAGE)(%ebp), %eax
-       andl    $0xc0000000, %eax
-       shrl    $27, %eax
-       addl    %edi, %eax
-
-       movl    PTR(PA_PMD_1)(%ebp), %edx
-       orl     $PAE_PGD_ATTR, %edx
-       movl    %edx, (%eax)
-
-       movl    PTR(VA_PMD_1)(%ebp), %edi
-       movl    PTR(PA_CONTROL_PAGE)(%ebp), %eax
-       andl    $0x3fe00000, %eax
-       shrl    $18, %eax
-       addl    %edi, %eax
-
-       movl    PTR(PA_PTE_1)(%ebp), %edx
-       orl     $PAGE_ATTR, %edx
-       movl    %edx, (%eax)
-
-       movl    PTR(VA_PTE_1)(%ebp), %edi
-       movl    PTR(PA_CONTROL_PAGE)(%ebp), %eax
-       andl    $0x001ff000, %eax
-       shrl    $9, %eax
-       addl    %edi, %eax
-
-       movl    PTR(PA_CONTROL_PAGE)(%ebp), %edx
-       orl     $PAGE_ATTR, %edx
-       movl    %edx, (%eax)
-#else
-       /* map the control page at its virtual address */
-
-       movl    PTR(VA_PGD)(%ebp), %edi
-       movl    PTR(VA_CONTROL_PAGE)(%ebp), %eax
-       andl    $0xffc00000, %eax
-       shrl    $20, %eax
-       addl    %edi, %eax
-
-       movl    PTR(PA_PTE_0)(%ebp), %edx
-       orl     $PAGE_ATTR, %edx
-       movl    %edx, (%eax)
-
-       movl    PTR(VA_PTE_0)(%ebp), %edi
-       movl    PTR(VA_CONTROL_PAGE)(%ebp), %eax
-       andl    $0x003ff000, %eax
-       shrl    $10, %eax
-       addl    %edi, %eax
-
-       movl    PTR(PA_CONTROL_PAGE)(%ebp), %edx
-       orl     $PAGE_ATTR, %edx
-       movl    %edx, (%eax)
-
-       /* identity map the control page at its physical address */
-
-       movl    PTR(VA_PGD)(%ebp), %edi
-       movl    PTR(PA_CONTROL_PAGE)(%ebp), %eax
-       andl    $0xffc00000, %eax
-       shrl    $20, %eax
-       addl    %edi, %eax
-
-       movl    PTR(PA_PTE_1)(%ebp), %edx
-       orl     $PAGE_ATTR, %edx
-       movl    %edx, (%eax)
-
-       movl    PTR(VA_PTE_1)(%ebp), %edi
-       movl    PTR(PA_CONTROL_PAGE)(%ebp), %eax
-       andl    $0x003ff000, %eax
-       shrl    $10, %eax
-       addl    %edi, %eax
-
-       movl    PTR(PA_CONTROL_PAGE)(%ebp), %edx
-       orl     $PAGE_ATTR, %edx
-       movl    %edx, (%eax)
-#endif
-
-relocate_new_kernel:
-       /* read the arguments and say goodbye to the stack */
-       movl  4(%esp), %ebx /* page_list */
-       movl  8(%esp), %ebp /* list of pages */
-       movl  12(%esp), %edx /* start address */
-       movl  16(%esp), %ecx /* cpu_has_pae */
-
-       /* zero out flags, and disable interrupts */
-       pushl $0
-       popfl
-
-       /* get physical address of control page now */
-       /* this is impossible after page table switch */
-       movl    PTR(PA_CONTROL_PAGE)(%ebp), %edi
-
-       /* switch to new set of page tables */
-       movl    PTR(PA_PGD)(%ebp), %eax
-       movl    %eax, %cr3
-
-       /* setup a new stack at the end of the physical control page */
-       lea     4096(%edi), %esp
-
-       /* jump to identity mapped page */
-       movl    %edi, %eax
-       addl    $(identity_mapped - relocate_kernel), %eax
-       pushl   %eax
-       ret
-
-identity_mapped:
-       /* store the start address on the stack */
-       pushl   %edx
-
-       /* Set cr0 to a known state:
-        * 31 0 == Paging disabled
-        * 18 0 == Alignment check disabled
-        * 16 0 == Write protect disabled
-        * 3  0 == No task switch
-        * 2  0 == Don't do FP software emulation.
-        * 0  1 == Proctected mode enabled
-        */
-       movl    %cr0, %eax
-       andl    $~((1<<31)|(1<<18)|(1<<16)|(1<<3)|(1<<2)), %eax
-       orl     $(1<<0), %eax
-       movl    %eax, %cr0
-
-       /* clear cr4 if applicable */
-       testl   %ecx, %ecx
-       jz      1f
-       /* Set cr4 to a known state:
-        * Setting everything to zero seems safe.
-        */
-       movl    %cr4, %eax
-       andl    $0, %eax
-       movl    %eax, %cr4
-
-       jmp 1f
-1:
-
-       /* Flush the TLB (needed?) */
-       xorl    %eax, %eax
-       movl    %eax, %cr3
-
-       /* Do the copies */
-       movl    %ebx, %ecx
-       jmp     1f
-
-0:     /* top, read another word from the indirection page */
-       movl    (%ebx), %ecx
-       addl    $4, %ebx
-1:
-       testl   $0x1,   %ecx  /* is it a destination page */
-       jz      2f
-       movl    %ecx,   %edi
-       andl    $0xfffff000, %edi
-       jmp     0b
-2:
-       testl   $0x2,   %ecx  /* is it an indirection page */
-       jz      2f
-       movl    %ecx,   %ebx
-       andl    $0xfffff000, %ebx
-       jmp     0b
-2:
-       testl   $0x4,   %ecx /* is it the done indicator */
-       jz      2f
-       jmp     3f
-2:
-       testl   $0x8,   %ecx /* is it the source indicator */
-       jz      0b           /* Ignore it otherwise */
-       movl    %ecx,   %esi /* For every source page do a copy */
-       andl    $0xfffff000, %esi
-
-       movl    $1024, %ecx
-       rep ; movsl
-       jmp     0b
-
-3:
-
-       /* To be certain of avoiding problems with self-modifying code
-        * I need to execute a serializing instruction here.
-        * So I flush the TLB, it's handy, and not processor dependent.
-        */
-       xorl    %eax, %eax
-       movl    %eax, %cr3
-
-       /* set all of the registers to known values */
-       /* leave %esp alone */
-
-       xorl    %eax, %eax
-       xorl    %ebx, %ebx
-       xorl    %ecx, %ecx
-       xorl    %edx, %edx
-       xorl    %esi, %esi
-       xorl    %edi, %edi
-       xorl    %ebp, %ebp
-       ret
diff --git a/arch/i386/kernel/scx200_32.c b/arch/i386/kernel/scx200_32.c
deleted file mode 100644 (file)
index c7d3df2..0000000
+++ /dev/null
@@ -1,131 +0,0 @@
-/* linux/arch/i386/kernel/scx200.c 
-
-   Copyright (c) 2001,2002 Christer Weinigel <wingel@nano-system.com>
-
-   National Semiconductor SCx200 support. */
-
-#include <linux/module.h>
-#include <linux/errno.h>
-#include <linux/kernel.h>
-#include <linux/init.h>
-#include <linux/mutex.h>
-#include <linux/pci.h>
-
-#include <linux/scx200.h>
-#include <linux/scx200_gpio.h>
-
-/* Verify that the configuration block really is there */
-#define scx200_cb_probe(base) (inw((base) + SCx200_CBA) == (base))
-
-#define NAME "scx200"
-
-MODULE_AUTHOR("Christer Weinigel <wingel@nano-system.com>");
-MODULE_DESCRIPTION("NatSemi SCx200 Driver");
-MODULE_LICENSE("GPL");
-
-unsigned scx200_gpio_base = 0;
-long scx200_gpio_shadow[2];
-
-unsigned scx200_cb_base = 0;
-
-static struct pci_device_id scx200_tbl[] = {
-       { PCI_DEVICE(PCI_VENDOR_ID_NS, PCI_DEVICE_ID_NS_SCx200_BRIDGE) },
-       { PCI_DEVICE(PCI_VENDOR_ID_NS, PCI_DEVICE_ID_NS_SC1100_BRIDGE) },
-       { PCI_DEVICE(PCI_VENDOR_ID_NS, PCI_DEVICE_ID_NS_SCx200_XBUS)   },
-       { PCI_DEVICE(PCI_VENDOR_ID_NS, PCI_DEVICE_ID_NS_SC1100_XBUS)   },
-       { },
-};
-MODULE_DEVICE_TABLE(pci,scx200_tbl);
-
-static int __devinit scx200_probe(struct pci_dev *, const struct pci_device_id *);
-
-static struct pci_driver scx200_pci_driver = {
-       .name = "scx200",
-       .id_table = scx200_tbl,
-       .probe = scx200_probe,
-};
-
-static DEFINE_MUTEX(scx200_gpio_config_lock);
-
-static void __devinit scx200_init_shadow(void)
-{
-       int bank;
-
-       /* read the current values driven on the GPIO signals */
-       for (bank = 0; bank < 2; ++bank)
-               scx200_gpio_shadow[bank] = inl(scx200_gpio_base + 0x10 * bank);
-}
-
-static int __devinit scx200_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
-{
-       unsigned base;
-
-       if (pdev->device == PCI_DEVICE_ID_NS_SCx200_BRIDGE ||
-           pdev->device == PCI_DEVICE_ID_NS_SC1100_BRIDGE) {
-               base = pci_resource_start(pdev, 0);
-               printk(KERN_INFO NAME ": GPIO base 0x%x\n", base);
-
-               if (request_region(base, SCx200_GPIO_SIZE, "NatSemi SCx200 GPIO") == 0) {
-                       printk(KERN_ERR NAME ": can't allocate I/O for GPIOs\n");
-                       return -EBUSY;
-               }
-
-               scx200_gpio_base = base;
-               scx200_init_shadow();
-
-       } else {
-               /* find the base of the Configuration Block */
-               if (scx200_cb_probe(SCx200_CB_BASE_FIXED)) {
-                       scx200_cb_base = SCx200_CB_BASE_FIXED;
-               } else {
-                       pci_read_config_dword(pdev, SCx200_CBA_SCRATCH, &base);
-                       if (scx200_cb_probe(base)) {
-                               scx200_cb_base = base;
-                       } else {
-                               printk(KERN_WARNING NAME ": Configuration Block not found\n");
-                               return -ENODEV;
-                       }
-               }
-               printk(KERN_INFO NAME ": Configuration Block base 0x%x\n", scx200_cb_base);
-       }
-
-       return 0;
-}
-
-u32 scx200_gpio_configure(unsigned index, u32 mask, u32 bits)
-{
-       u32 config, new_config;
-
-       mutex_lock(&scx200_gpio_config_lock);
-
-       outl(index, scx200_gpio_base + 0x20);
-       config = inl(scx200_gpio_base + 0x24);
-
-       new_config = (config & mask) | bits;
-       outl(new_config, scx200_gpio_base + 0x24);
-
-       mutex_unlock(&scx200_gpio_config_lock);
-
-       return config;
-}
-
-static int __init scx200_init(void)
-{
-       printk(KERN_INFO NAME ": NatSemi SCx200 Driver\n");
-
-       return pci_register_driver(&scx200_pci_driver);
-}
-
-static void __exit scx200_cleanup(void)
-{
-       pci_unregister_driver(&scx200_pci_driver);
-       release_region(scx200_gpio_base, SCx200_GPIO_SIZE);
-}
-
-module_init(scx200_init);
-module_exit(scx200_cleanup);
-
-EXPORT_SYMBOL(scx200_gpio_base);
-EXPORT_SYMBOL(scx200_gpio_shadow);
-EXPORT_SYMBOL(scx200_gpio_configure);
-EXPORT_SYMBOL(scx200_cb_base);
diff --git a/arch/i386/kernel/setup_32.c b/arch/i386/kernel/setup_32.c
deleted file mode 100644 (file)
index d474cd6..0000000
+++ /dev/null
@@ -1,653 +0,0 @@
-/*
- *  linux/arch/i386/kernel/setup.c
- *
- *  Copyright (C) 1995  Linus Torvalds
- *
- *  Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
- *
- *  Memory region support
- *     David Parsons <orc@pell.chi.il.us>, July-August 1999
- *
- *  Added E820 sanitization routine (removes overlapping memory regions);
- *  Brian Moyle <bmoyle@mvista.com>, February 2001
- *
- * Moved CPU detection code to cpu/${cpu}.c
- *    Patrick Mochel <mochel@osdl.org>, March 2002
- *
- *  Provisions for empty E820 memory regions (reported by certain BIOSes).
- *  Alex Achenbach <xela@slit.de>, December 2002.
- *
- */
-
-/*
- * This file handles the architecture-dependent parts of initialization
- */
-
-#include <linux/sched.h>
-#include <linux/mm.h>
-#include <linux/mmzone.h>
-#include <linux/screen_info.h>
-#include <linux/ioport.h>
-#include <linux/acpi.h>
-#include <linux/apm_bios.h>
-#include <linux/initrd.h>
-#include <linux/bootmem.h>
-#include <linux/seq_file.h>
-#include <linux/console.h>
-#include <linux/mca.h>
-#include <linux/root_dev.h>
-#include <linux/highmem.h>
-#include <linux/module.h>
-#include <linux/efi.h>
-#include <linux/init.h>
-#include <linux/edd.h>
-#include <linux/nodemask.h>
-#include <linux/kexec.h>
-#include <linux/crash_dump.h>
-#include <linux/dmi.h>
-#include <linux/pfn.h>
-
-#include <video/edid.h>
-
-#include <asm/apic.h>
-#include <asm/e820.h>
-#include <asm/mpspec.h>
-#include <asm/mmzone.h>
-#include <asm/setup.h>
-#include <asm/arch_hooks.h>
-#include <asm/sections.h>
-#include <asm/io_apic.h>
-#include <asm/ist.h>
-#include <asm/io.h>
-#include <asm/vmi.h>
-#include <setup_arch.h>
-#include <bios_ebda.h>
-
-/* This value is set up by the early boot code to point to the value
-   immediately after the boot time page tables.  It contains a *physical*
-   address, and must not be in the .bss segment! */
-unsigned long init_pg_tables_end __initdata = ~0UL;
-
-int disable_pse __devinitdata = 0;
-
-/*
- * Machine setup..
- */
-extern struct resource code_resource;
-extern struct resource data_resource;
-
-/* cpu data as detected by the assembly code in head.S */
-struct cpuinfo_x86 new_cpu_data __cpuinitdata = { 0, 0, 0, 0, -1, 1, 0, 0, -1 };
-/* common cpu data for all cpus */
-struct cpuinfo_x86 boot_cpu_data __read_mostly = { 0, 0, 0, 0, -1, 1, 0, 0, -1 };
-EXPORT_SYMBOL(boot_cpu_data);
-
-unsigned long mmu_cr4_features;
-
-/* for MCA, but anyone else can use it if they want */
-unsigned int machine_id;
-#ifdef CONFIG_MCA
-EXPORT_SYMBOL(machine_id);
-#endif
-unsigned int machine_submodel_id;
-unsigned int BIOS_revision;
-unsigned int mca_pentium_flag;
-
-/* Boot loader ID as an integer, for the benefit of proc_dointvec */
-int bootloader_type;
-
-/* user-defined highmem size */
-static unsigned int highmem_pages = -1;
-
-/*
- * Setup options
- */
-struct screen_info screen_info;
-EXPORT_SYMBOL(screen_info);
-struct apm_info apm_info;
-EXPORT_SYMBOL(apm_info);
-struct edid_info edid_info;
-EXPORT_SYMBOL_GPL(edid_info);
-struct ist_info ist_info;
-#if defined(CONFIG_X86_SPEEDSTEP_SMI) || \
-       defined(CONFIG_X86_SPEEDSTEP_SMI_MODULE)
-EXPORT_SYMBOL(ist_info);
-#endif
-
-extern void early_cpu_init(void);
-extern int root_mountflags;
-
-unsigned long saved_videomode;
-
-#define RAMDISK_IMAGE_START_MASK       0x07FF
-#define RAMDISK_PROMPT_FLAG            0x8000
-#define RAMDISK_LOAD_FLAG              0x4000  
-
-static char __initdata command_line[COMMAND_LINE_SIZE];
-
-struct boot_params __initdata boot_params;
-
-#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
-struct edd edd;
-#ifdef CONFIG_EDD_MODULE
-EXPORT_SYMBOL(edd);
-#endif
-/**
- * copy_edd() - Copy the BIOS EDD information
- *              from boot_params into a safe place.
- *
- */
-static inline void copy_edd(void)
-{
-     memcpy(edd.mbr_signature, EDD_MBR_SIGNATURE, sizeof(edd.mbr_signature));
-     memcpy(edd.edd_info, EDD_BUF, sizeof(edd.edd_info));
-     edd.mbr_signature_nr = EDD_MBR_SIG_NR;
-     edd.edd_info_nr = EDD_NR;
-}
-#else
-static inline void copy_edd(void)
-{
-}
-#endif
-
-int __initdata user_defined_memmap = 0;
-
-/*
- * "mem=nopentium" disables the 4MB page tables.
- * "mem=XXX[kKmM]" defines a memory region from HIGH_MEM
- * to <mem>, overriding the bios size.
- * "memmap=XXX[KkmM]@XXX[KkmM]" defines a memory region from
- * <start> to <start>+<mem>, overriding the bios size.
- *
- * HPA tells me bootloaders need to parse mem=, so no new
- * option should be mem=  [also see Documentation/i386/boot.txt]
- */
-static int __init parse_mem(char *arg)
-{
-       if (!arg)
-               return -EINVAL;
-
-       if (strcmp(arg, "nopentium") == 0) {
-               clear_bit(X86_FEATURE_PSE, boot_cpu_data.x86_capability);
-               disable_pse = 1;
-       } else {
-               /* If the user specifies memory size, we
-                * limit the BIOS-provided memory map to
-                * that size. exactmap can be used to specify
-                * the exact map. mem=number can be used to
-                * trim the existing memory map.
-                */
-               unsigned long long mem_size;
-               mem_size = memparse(arg, &arg);
-               limit_regions(mem_size);
-               user_defined_memmap = 1;
-       }
-       return 0;
-}
-early_param("mem", parse_mem);
-
-#ifdef CONFIG_PROC_VMCORE
-/* elfcorehdr= specifies the location of elf core header
- * stored by the crashed kernel.
- */
-static int __init parse_elfcorehdr(char *arg)
-{
-       if (!arg)
-               return -EINVAL;
-
-       elfcorehdr_addr = memparse(arg, &arg);
-       return 0;
-}
-early_param("elfcorehdr", parse_elfcorehdr);
-#endif /* CONFIG_PROC_VMCORE */
-
-/*
- * highmem=size forces highmem to be exactly 'size' bytes.
- * This works even on boxes that have no highmem otherwise.
- * This also works to reduce highmem size on bigger boxes.
- */
-static int __init parse_highmem(char *arg)
-{
-       if (!arg)
-               return -EINVAL;
-
-       highmem_pages = memparse(arg, &arg) >> PAGE_SHIFT;
-       return 0;
-}
-early_param("highmem", parse_highmem);
-
-/*
- * vmalloc=size forces the vmalloc area to be exactly 'size'
- * bytes. This can be used to increase (or decrease) the
- * vmalloc area - the default is 128m.
- */
-static int __init parse_vmalloc(char *arg)
-{
-       if (!arg)
-               return -EINVAL;
-
-       __VMALLOC_RESERVE = memparse(arg, &arg);
-       return 0;
-}
-early_param("vmalloc", parse_vmalloc);
-
-/*
- * reservetop=size reserves a hole at the top of the kernel address space which
- * a hypervisor can load into later.  Needed for dynamically loaded hypervisors,
- * so relocating the fixmap can be done before paging initialization.
- */
-static int __init parse_reservetop(char *arg)
-{
-       unsigned long address;
-
-       if (!arg)
-               return -EINVAL;
-
-       address = memparse(arg, &arg);
-       reserve_top_address(address);
-       return 0;
-}
-early_param("reservetop", parse_reservetop);
-
-/*
- * Determine low and high memory ranges:
- */
-unsigned long __init find_max_low_pfn(void)
-{
-       unsigned long max_low_pfn;
-
-       max_low_pfn = max_pfn;
-       if (max_low_pfn > MAXMEM_PFN) {
-               if (highmem_pages == -1)
-                       highmem_pages = max_pfn - MAXMEM_PFN;
-               if (highmem_pages + MAXMEM_PFN < max_pfn)
-                       max_pfn = MAXMEM_PFN + highmem_pages;
-               if (highmem_pages + MAXMEM_PFN > max_pfn) {
-                       printk("only %luMB highmem pages available, ignoring highmem size of %uMB.\n", pages_to_mb(max_pfn - MAXMEM_PFN), pages_to_mb(highmem_pages));
-                       highmem_pages = 0;
-               }
-               max_low_pfn = MAXMEM_PFN;
-#ifndef CONFIG_HIGHMEM
-               /* Maximum memory usable is what is directly addressable */
-               printk(KERN_WARNING "Warning only %ldMB will be used.\n",
-                                       MAXMEM>>20);
-               if (max_pfn > MAX_NONPAE_PFN)
-                       printk(KERN_WARNING "Use a HIGHMEM64G enabled kernel.\n");
-               else
-                       printk(KERN_WARNING "Use a HIGHMEM enabled kernel.\n");
-               max_pfn = MAXMEM_PFN;
-#else /* !CONFIG_HIGHMEM */
-#ifndef CONFIG_HIGHMEM64G
-               if (max_pfn > MAX_NONPAE_PFN) {
-                       max_pfn = MAX_NONPAE_PFN;
-                       printk(KERN_WARNING "Warning only 4GB will be used.\n");
-                       printk(KERN_WARNING "Use a HIGHMEM64G enabled kernel.\n");
-               }
-#endif /* !CONFIG_HIGHMEM64G */
-#endif /* !CONFIG_HIGHMEM */
-       } else {
-               if (highmem_pages == -1)
-                       highmem_pages = 0;
-#ifdef CONFIG_HIGHMEM
-               if (highmem_pages >= max_pfn) {
-                       printk(KERN_ERR "highmem size specified (%uMB) is bigger than pages available (%luMB)!.\n", pages_to_mb(highmem_pages), pages_to_mb(max_pfn));
-                       highmem_pages = 0;
-               }
-               if (highmem_pages) {
-                       if (max_low_pfn-highmem_pages < 64*1024*1024/PAGE_SIZE){
-                               printk(KERN_ERR "highmem size %uMB results in smaller than 64MB lowmem, ignoring it.\n", pages_to_mb(highmem_pages));
-                               highmem_pages = 0;
-                       }
-                       max_low_pfn -= highmem_pages;
-               }
-#else
-               if (highmem_pages)
-                       printk(KERN_ERR "ignoring highmem size on non-highmem kernel!\n");
-#endif
-       }
-       return max_low_pfn;
-}
-
-/*
- * workaround for Dell systems that neglect to reserve EBDA
- */
-static void __init reserve_ebda_region(void)
-{
-       unsigned int addr;
-       addr = get_bios_ebda();
-       if (addr)
-               reserve_bootmem(addr, PAGE_SIZE);       
-}
-
-#ifndef CONFIG_NEED_MULTIPLE_NODES
-void __init setup_bootmem_allocator(void);
-static unsigned long __init setup_memory(void)
-{
-       /*
-        * partially used pages are not usable - thus
-        * we are rounding upwards:
-        */
-       min_low_pfn = PFN_UP(init_pg_tables_end);
-
-       find_max_pfn();
-
-       max_low_pfn = find_max_low_pfn();
-
-#ifdef CONFIG_HIGHMEM
-       highstart_pfn = highend_pfn = max_pfn;
-       if (max_pfn > max_low_pfn) {
-               highstart_pfn = max_low_pfn;
-       }
-       printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
-               pages_to_mb(highend_pfn - highstart_pfn));
-       num_physpages = highend_pfn;
-       high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1;
-#else
-       num_physpages = max_low_pfn;
-       high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1;
-#endif
-#ifdef CONFIG_FLATMEM
-       max_mapnr = num_physpages;
-#endif
-       printk(KERN_NOTICE "%ldMB LOWMEM available.\n",
-                       pages_to_mb(max_low_pfn));
-
-       setup_bootmem_allocator();
-
-       return max_low_pfn;
-}
-
-void __init zone_sizes_init(void)
-{
-       unsigned long max_zone_pfns[MAX_NR_ZONES];
-       memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
-       max_zone_pfns[ZONE_DMA] =
-               virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
-       max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
-#ifdef CONFIG_HIGHMEM
-       max_zone_pfns[ZONE_HIGHMEM] = highend_pfn;
-       add_active_range(0, 0, highend_pfn);
-#else
-       add_active_range(0, 0, max_low_pfn);
-#endif
-
-       free_area_init_nodes(max_zone_pfns);
-}
-#else
-extern unsigned long __init setup_memory(void);
-extern void zone_sizes_init(void);
-#endif /* !CONFIG_NEED_MULTIPLE_NODES */
-
-void __init setup_bootmem_allocator(void)
-{
-       unsigned long bootmap_size;
-       /*
-        * Initialize the boot-time allocator (with low memory only):
-        */
-       bootmap_size = init_bootmem(min_low_pfn, max_low_pfn);
-
-       register_bootmem_low_pages(max_low_pfn);
-
-       /*
-        * Reserve the bootmem bitmap itself as well. We do this in two
-        * steps (first step was init_bootmem()) because this catches
-        * the (very unlikely) case of us accidentally initializing the
-        * bootmem allocator with an invalid RAM area.
-        */
-       reserve_bootmem(__pa_symbol(_text), (PFN_PHYS(min_low_pfn) +
-                        bootmap_size + PAGE_SIZE-1) - __pa_symbol(_text));
-
-       /*
-        * reserve physical page 0 - it's a special BIOS page on many boxes,
-        * enabling clean reboots, SMP operation, laptop functions.
-        */
-       reserve_bootmem(0, PAGE_SIZE);
-
-       /* reserve EBDA region, it's a 4K region */
-       reserve_ebda_region();
-
-    /* could be an AMD 768MPX chipset. Reserve a page  before VGA to prevent
-       PCI prefetch into it (errata #56). Usually the page is reserved anyways,
-       unless you have no PS/2 mouse plugged in. */
-       if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
-           boot_cpu_data.x86 == 6)
-            reserve_bootmem(0xa0000 - 4096, 4096);
-
-#ifdef CONFIG_SMP
-       /*
-        * But first pinch a few for the stack/trampoline stuff
-        * FIXME: Don't need the extra page at 4K, but need to fix
-        * trampoline before removing it. (see the GDT stuff)
-        */
-       reserve_bootmem(PAGE_SIZE, PAGE_SIZE);
-#endif
-#ifdef CONFIG_ACPI_SLEEP
-       /*
-        * Reserve low memory region for sleep support.
-        */
-       acpi_reserve_bootmem();
-#endif
-#ifdef CONFIG_X86_FIND_SMP_CONFIG
-       /*
-        * Find and reserve possible boot-time SMP configuration:
-        */
-       find_smp_config();
-#endif
-       numa_kva_reserve();
-#ifdef CONFIG_BLK_DEV_INITRD
-       if (LOADER_TYPE && INITRD_START) {
-               if (INITRD_START + INITRD_SIZE <= (max_low_pfn << PAGE_SHIFT)) {
-                       reserve_bootmem(INITRD_START, INITRD_SIZE);
-                       initrd_start = INITRD_START + PAGE_OFFSET;
-                       initrd_end = initrd_start+INITRD_SIZE;
-               }
-               else {
-                       printk(KERN_ERR "initrd extends beyond end of memory "
-                           "(0x%08lx > 0x%08lx)\ndisabling initrd\n",
-                           INITRD_START + INITRD_SIZE,
-                           max_low_pfn << PAGE_SHIFT);
-                       initrd_start = 0;
-               }
-       }
-#endif
-#ifdef CONFIG_KEXEC
-       if (crashk_res.start != crashk_res.end)
-               reserve_bootmem(crashk_res.start,
-                       crashk_res.end - crashk_res.start + 1);
-#endif
-}
-
-/*
- * The node 0 pgdat is initialized before all of these because
- * it's needed for bootmem.  node>0 pgdats have their virtual
- * space allocated before the pagetables are in place to access
- * them, so they can't be cleared then.
- *
- * This should all compile down to nothing when NUMA is off.
- */
-static void __init remapped_pgdat_init(void)
-{
-       int nid;
-
-       for_each_online_node(nid) {
-               if (nid != 0)
-                       memset(NODE_DATA(nid), 0, sizeof(struct pglist_data));
-       }
-}
-
-#ifdef CONFIG_MCA
-static void set_mca_bus(int x)
-{
-       MCA_bus = x;
-}
-#else
-static void set_mca_bus(int x) { }
-#endif
-
-/* Overridden in paravirt.c if CONFIG_PARAVIRT */
-char * __init __attribute__((weak)) memory_setup(void)
-{
-       return machine_specific_memory_setup();
-}
-
-/*
- * Determine if we were loaded by an EFI loader.  If so, then we have also been
- * passed the efi memmap, systab, etc., so we should use these data structures
- * for initialization.  Note, the efi init code path is determined by the
- * global efi_enabled. This allows the same kernel image to be used on existing
- * systems (with a traditional BIOS) as well as on EFI systems.
- */
-void __init setup_arch(char **cmdline_p)
-{
-       unsigned long max_low_pfn;
-
-       memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data));
-       pre_setup_arch_hook();
-       early_cpu_init();
-
-       /*
-        * FIXME: This isn't an official loader_type right
-        * now but does currently work with elilo.
-        * If we were configured as an EFI kernel, check to make
-        * sure that we were loaded correctly from elilo and that
-        * the system table is valid.  If not, then initialize normally.
-        */
-#ifdef CONFIG_EFI
-       if ((LOADER_TYPE == 0x50) && EFI_SYSTAB)
-               efi_enabled = 1;
-#endif
-
-       ROOT_DEV = old_decode_dev(ORIG_ROOT_DEV);
-       screen_info = SCREEN_INFO;
-       edid_info = EDID_INFO;
-       apm_info.bios = APM_BIOS_INFO;
-       ist_info = IST_INFO;
-       saved_videomode = VIDEO_MODE;
-       if( SYS_DESC_TABLE.length != 0 ) {
-               set_mca_bus(SYS_DESC_TABLE.table[3] & 0x2);
-               machine_id = SYS_DESC_TABLE.table[0];
-               machine_submodel_id = SYS_DESC_TABLE.table[1];
-               BIOS_revision = SYS_DESC_TABLE.table[2];
-       }
-       bootloader_type = LOADER_TYPE;
-
-#ifdef CONFIG_BLK_DEV_RAM
-       rd_image_start = RAMDISK_FLAGS & RAMDISK_IMAGE_START_MASK;
-       rd_prompt = ((RAMDISK_FLAGS & RAMDISK_PROMPT_FLAG) != 0);
-       rd_doload = ((RAMDISK_FLAGS & RAMDISK_LOAD_FLAG) != 0);
-#endif
-       ARCH_SETUP
-       if (efi_enabled)
-               efi_init();
-       else {
-               printk(KERN_INFO "BIOS-provided physical RAM map:\n");
-               print_memory_map(memory_setup());
-       }
-
-       copy_edd();
-
-       if (!MOUNT_ROOT_RDONLY)
-               root_mountflags &= ~MS_RDONLY;
-       init_mm.start_code = (unsigned long) _text;
-       init_mm.end_code = (unsigned long) _etext;
-       init_mm.end_data = (unsigned long) _edata;
-       init_mm.brk = init_pg_tables_end + PAGE_OFFSET;
-
-       code_resource.start = virt_to_phys(_text);
-       code_resource.end = virt_to_phys(_etext)-1;
-       data_resource.start = virt_to_phys(_etext);
-       data_resource.end = virt_to_phys(_edata)-1;
-
-       parse_early_param();
-
-       if (user_defined_memmap) {
-               printk(KERN_INFO "user-defined physical RAM map:\n");
-               print_memory_map("user");
-       }
-
-       strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
-       *cmdline_p = command_line;
-
-       max_low_pfn = setup_memory();
-
-#ifdef CONFIG_VMI
-       /*
-        * Must be after max_low_pfn is determined, and before kernel
-        * pagetables are setup.
-        */
-       vmi_init();
-#endif
-
-       /*
-        * NOTE: before this point _nobody_ is allowed to allocate
-        * any memory using the bootmem allocator.  Although the
-        * alloctor is now initialised only the first 8Mb of the kernel
-        * virtual address space has been mapped.  All allocations before
-        * paging_init() has completed must use the alloc_bootmem_low_pages()
-        * variant (which allocates DMA'able memory) and care must be taken
-        * not to exceed the 8Mb limit.
-        */
-
-#ifdef CONFIG_SMP
-       smp_alloc_memory(); /* AP processor realmode stacks in low memory*/
-#endif
-       paging_init();
-       remapped_pgdat_init();
-       sparse_init();
-       zone_sizes_init();
-
-       /*
-        * NOTE: at this point the bootmem allocator is fully available.
-        */
-
-       paravirt_post_allocator_init();
-
-       dmi_scan_machine();
-
-#ifdef CONFIG_X86_GENERICARCH
-       generic_apic_probe();
-#endif 
-       if (efi_enabled)
-               efi_map_memmap();
-
-#ifdef CONFIG_ACPI
-       /*
-        * Parse the ACPI tables for possible boot-time SMP configuration.
-        */
-       acpi_boot_table_init();
-#endif
-
-#ifdef CONFIG_PCI
-#ifdef CONFIG_X86_IO_APIC
-       check_acpi_pci();       /* Checks more than just ACPI actually */
-#endif
-#endif
-
-#ifdef CONFIG_ACPI
-       acpi_boot_init();
-
-#if defined(CONFIG_SMP) && defined(CONFIG_X86_PC)
-       if (def_to_bigsmp)
-               printk(KERN_WARNING "More than 8 CPUs detected and "
-                       "CONFIG_X86_PC cannot handle it.\nUse "
-                       "CONFIG_X86_GENERICARCH or CONFIG_X86_BIGSMP.\n");
-#endif
-#endif
-#ifdef CONFIG_X86_LOCAL_APIC
-       if (smp_found_config)
-               get_smp_config();
-#endif
-
-       e820_register_memory();
-       e820_mark_nosave_regions();
-
-#ifdef CONFIG_VT
-#if defined(CONFIG_VGA_CONSOLE)
-       if (!efi_enabled || (efi_mem_type(0xa0000) != EFI_CONVENTIONAL_MEMORY))
-               conswitchp = &vga_con;
-#elif defined(CONFIG_DUMMY_CONSOLE)
-       conswitchp = &dummy_con;
-#endif
-#endif
-}
diff --git a/arch/i386/kernel/sigframe_32.h b/arch/i386/kernel/sigframe_32.h
deleted file mode 100644 (file)
index 0b22217..0000000
+++ /dev/null
@@ -1,21 +0,0 @@
-struct sigframe
-{
-       char __user *pretcode;
-       int sig;
-       struct sigcontext sc;
-       struct _fpstate fpstate;
-       unsigned long extramask[_NSIG_WORDS-1];
-       char retcode[8];
-};
-
-struct rt_sigframe
-{
-       char __user *pretcode;
-       int sig;
-       struct siginfo __user *pinfo;
-       void __user *puc;
-       struct siginfo info;
-       struct ucontext uc;
-       struct _fpstate fpstate;
-       char retcode[8];
-};
diff --git a/arch/i386/kernel/signal_32.c b/arch/i386/kernel/signal_32.c
deleted file mode 100644 (file)
index c03570f..0000000
+++ /dev/null
@@ -1,667 +0,0 @@
-/*
- *  linux/arch/i386/kernel/signal.c
- *
- *  Copyright (C) 1991, 1992  Linus Torvalds
- *
- *  1997-11-28  Modified for POSIX.1b signals by Richard Henderson
- *  2000-06-20  Pentium III FXSR, SSE support by Gareth Hughes
- */
-
-#include <linux/sched.h>
-#include <linux/mm.h>
-#include <linux/smp.h>
-#include <linux/kernel.h>
-#include <linux/signal.h>
-#include <linux/errno.h>
-#include <linux/wait.h>
-#include <linux/unistd.h>
-#include <linux/stddef.h>
-#include <linux/personality.h>
-#include <linux/suspend.h>
-#include <linux/ptrace.h>
-#include <linux/elf.h>
-#include <linux/binfmts.h>
-#include <asm/processor.h>
-#include <asm/ucontext.h>
-#include <asm/uaccess.h>
-#include <asm/i387.h>
-#include "sigframe_32.h"
-
-#define DEBUG_SIG 0
-
-#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP)))
-
-/*
- * Atomically swap in the new signal mask, and wait for a signal.
- */
-asmlinkage int
-sys_sigsuspend(int history0, int history1, old_sigset_t mask)
-{
-       mask &= _BLOCKABLE;
-       spin_lock_irq(&current->sighand->siglock);
-       current->saved_sigmask = current->blocked;
-       siginitset(&current->blocked, mask);
-       recalc_sigpending();
-       spin_unlock_irq(&current->sighand->siglock);
-
-       current->state = TASK_INTERRUPTIBLE;
-       schedule();
-       set_thread_flag(TIF_RESTORE_SIGMASK);
-       return -ERESTARTNOHAND;
-}
-
-asmlinkage int 
-sys_sigaction(int sig, const struct old_sigaction __user *act,
-             struct old_sigaction __user *oact)
-{
-       struct k_sigaction new_ka, old_ka;
-       int ret;
-
-       if (act) {
-               old_sigset_t mask;
-               if (!access_ok(VERIFY_READ, act, sizeof(*act)) ||
-                   __get_user(new_ka.sa.sa_handler, &act->sa_handler) ||
-                   __get_user(new_ka.sa.sa_restorer, &act->sa_restorer))
-                       return -EFAULT;
-               __get_user(new_ka.sa.sa_flags, &act->sa_flags);
-               __get_user(mask, &act->sa_mask);
-               siginitset(&new_ka.sa.sa_mask, mask);
-       }
-
-       ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL);
-
-       if (!ret && oact) {
-               if (!access_ok(VERIFY_WRITE, oact, sizeof(*oact)) ||
-                   __put_user(old_ka.sa.sa_handler, &oact->sa_handler) ||
-                   __put_user(old_ka.sa.sa_restorer, &oact->sa_restorer))
-                       return -EFAULT;
-               __put_user(old_ka.sa.sa_flags, &oact->sa_flags);
-               __put_user(old_ka.sa.sa_mask.sig[0], &oact->sa_mask);
-       }
-
-       return ret;
-}
-
-asmlinkage int
-sys_sigaltstack(unsigned long ebx)
-{
-       /* This is needed to make gcc realize it doesn't own the "struct pt_regs" */
-       struct pt_regs *regs = (struct pt_regs *)&ebx;
-       const stack_t __user *uss = (const stack_t __user *)ebx;
-       stack_t __user *uoss = (stack_t __user *)regs->ecx;
-
-       return do_sigaltstack(uss, uoss, regs->esp);
-}
-
-
-/*
- * Do a signal return; undo the signal stack.
- */
-
-static int
-restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, int *peax)
-{
-       unsigned int err = 0;
-
-       /* Always make any pending restarted system calls return -EINTR */
-       current_thread_info()->restart_block.fn = do_no_restart_syscall;
-
-#define COPY(x)                err |= __get_user(regs->x, &sc->x)
-
-#define COPY_SEG(seg)                                                  \
-       { unsigned short tmp;                                           \
-         err |= __get_user(tmp, &sc->seg);                             \
-         regs->x##seg = tmp; }
-
-#define COPY_SEG_STRICT(seg)                                           \
-       { unsigned short tmp;                                           \
-         err |= __get_user(tmp, &sc->seg);                             \
-         regs->x##seg = tmp|3; }
-
-#define GET_SEG(seg)                                                   \
-       { unsigned short tmp;                                           \
-         err |= __get_user(tmp, &sc->seg);                             \
-         loadsegment(seg,tmp); }
-
-#define        FIX_EFLAGS      (X86_EFLAGS_AC | X86_EFLAGS_RF |                 \
-                        X86_EFLAGS_OF | X86_EFLAGS_DF |                 \
-                        X86_EFLAGS_TF | X86_EFLAGS_SF | X86_EFLAGS_ZF | \
-                        X86_EFLAGS_AF | X86_EFLAGS_PF | X86_EFLAGS_CF)
-
-       GET_SEG(gs);
-       COPY_SEG(fs);
-       COPY_SEG(es);
-       COPY_SEG(ds);
-       COPY(edi);
-       COPY(esi);
-       COPY(ebp);
-       COPY(esp);
-       COPY(ebx);
-       COPY(edx);
-       COPY(ecx);
-       COPY(eip);
-       COPY_SEG_STRICT(cs);
-       COPY_SEG_STRICT(ss);
-       
-       {
-               unsigned int tmpflags;
-               err |= __get_user(tmpflags, &sc->eflags);
-               regs->eflags = (regs->eflags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS);
-               regs->orig_eax = -1;            /* disable syscall checks */
-       }
-
-       {
-               struct _fpstate __user * buf;
-               err |= __get_user(buf, &sc->fpstate);
-               if (buf) {
-                       if (!access_ok(VERIFY_READ, buf, sizeof(*buf)))
-                               goto badframe;
-                       err |= restore_i387(buf);
-               } else {
-                       struct task_struct *me = current;
-                       if (used_math()) {
-                               clear_fpu(me);
-                               clear_used_math();
-                       }
-               }
-       }
-
-       err |= __get_user(*peax, &sc->eax);
-       return err;
-
-badframe:
-       return 1;
-}
-
-asmlinkage int sys_sigreturn(unsigned long __unused)
-{
-       struct pt_regs *regs = (struct pt_regs *) &__unused;
-       struct sigframe __user *frame = (struct sigframe __user *)(regs->esp - 8);
-       sigset_t set;
-       int eax;
-
-       if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
-               goto badframe;
-       if (__get_user(set.sig[0], &frame->sc.oldmask)
-           || (_NSIG_WORDS > 1
-               && __copy_from_user(&set.sig[1], &frame->extramask,
-                                   sizeof(frame->extramask))))
-               goto badframe;
-
-       sigdelsetmask(&set, ~_BLOCKABLE);
-       spin_lock_irq(&current->sighand->siglock);
-       current->blocked = set;
-       recalc_sigpending();
-       spin_unlock_irq(&current->sighand->siglock);
-       
-       if (restore_sigcontext(regs, &frame->sc, &eax))
-               goto badframe;
-       return eax;
-
-badframe:
-       if (show_unhandled_signals && printk_ratelimit())
-               printk("%s%s[%d] bad frame in sigreturn frame:%p eip:%lx"
-                      " esp:%lx oeax:%lx\n",
-                   current->pid > 1 ? KERN_INFO : KERN_EMERG,
-                   current->comm, current->pid, frame, regs->eip,
-                   regs->esp, regs->orig_eax);
-
-       force_sig(SIGSEGV, current);
-       return 0;
-}      
-
-asmlinkage int sys_rt_sigreturn(unsigned long __unused)
-{
-       struct pt_regs *regs = (struct pt_regs *) &__unused;
-       struct rt_sigframe __user *frame = (struct rt_sigframe __user *)(regs->esp - 4);
-       sigset_t set;
-       int eax;
-
-       if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
-               goto badframe;
-       if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set)))
-               goto badframe;
-
-       sigdelsetmask(&set, ~_BLOCKABLE);
-       spin_lock_irq(&current->sighand->siglock);
-       current->blocked = set;
-       recalc_sigpending();
-       spin_unlock_irq(&current->sighand->siglock);
-       
-       if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &eax))
-               goto badframe;
-
-       if (do_sigaltstack(&frame->uc.uc_stack, NULL, regs->esp) == -EFAULT)
-               goto badframe;
-
-       return eax;
-
-badframe:
-       force_sig(SIGSEGV, current);
-       return 0;
-}      
-
-/*
- * Set up a signal frame.
- */
-
-static int
-setup_sigcontext(struct sigcontext __user *sc, struct _fpstate __user *fpstate,
-                struct pt_regs *regs, unsigned long mask)
-{
-       int tmp, err = 0;
-
-       err |= __put_user(regs->xfs, (unsigned int __user *)&sc->fs);
-       savesegment(gs, tmp);
-       err |= __put_user(tmp, (unsigned int __user *)&sc->gs);
-
-       err |= __put_user(regs->xes, (unsigned int __user *)&sc->es);
-       err |= __put_user(regs->xds, (unsigned int __user *)&sc->ds);
-       err |= __put_user(regs->edi, &sc->edi);
-       err |= __put_user(regs->esi, &sc->esi);
-       err |= __put_user(regs->ebp, &sc->ebp);
-       err |= __put_user(regs->esp, &sc->esp);
-       err |= __put_user(regs->ebx, &sc->ebx);
-       err |= __put_user(regs->edx, &sc->edx);
-       err |= __put_user(regs->ecx, &sc->ecx);
-       err |= __put_user(regs->eax, &sc->eax);
-       err |= __put_user(current->thread.trap_no, &sc->trapno);
-       err |= __put_user(current->thread.error_code, &sc->err);
-       err |= __put_user(regs->eip, &sc->eip);
-       err |= __put_user(regs->xcs, (unsigned int __user *)&sc->cs);
-       err |= __put_user(regs->eflags, &sc->eflags);
-       err |= __put_user(regs->esp, &sc->esp_at_signal);
-       err |= __put_user(regs->xss, (unsigned int __user *)&sc->ss);
-
-       tmp = save_i387(fpstate);
-       if (tmp < 0)
-         err = 1;
-       else
-         err |= __put_user(tmp ? fpstate : NULL, &sc->fpstate);
-
-       /* non-iBCS2 extensions.. */
-       err |= __put_user(mask, &sc->oldmask);
-       err |= __put_user(current->thread.cr2, &sc->cr2);
-
-       return err;
-}
-
-/*
- * Determine which stack to use..
- */
-static inline void __user *
-get_sigframe(struct k_sigaction *ka, struct pt_regs * regs, size_t frame_size)
-{
-       unsigned long esp;
-
-       /* Default to using normal stack */
-       esp = regs->esp;
-
-       /* This is the X/Open sanctioned signal stack switching.  */
-       if (ka->sa.sa_flags & SA_ONSTACK) {
-               if (sas_ss_flags(esp) == 0)
-                       esp = current->sas_ss_sp + current->sas_ss_size;
-       }
-
-       /* This is the legacy signal stack switching. */
-       else if ((regs->xss & 0xffff) != __USER_DS &&
-                !(ka->sa.sa_flags & SA_RESTORER) &&
-                ka->sa.sa_restorer) {
-               esp = (unsigned long) ka->sa.sa_restorer;
-       }
-
-       esp -= frame_size;
-       /* Align the stack pointer according to the i386 ABI,
-        * i.e. so that on function entry ((sp + 4) & 15) == 0. */
-       esp = ((esp + 4) & -16ul) - 4;
-       return (void __user *) esp;
-}
-
-/* These symbols are defined with the addresses in the vsyscall page.
-   See vsyscall-sigreturn.S.  */
-extern void __user __kernel_sigreturn;
-extern void __user __kernel_rt_sigreturn;
-
-static int setup_frame(int sig, struct k_sigaction *ka,
-                      sigset_t *set, struct pt_regs * regs)
-{
-       void __user *restorer;
-       struct sigframe __user *frame;
-       int err = 0;
-       int usig;
-
-       frame = get_sigframe(ka, regs, sizeof(*frame));
-
-       if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
-               goto give_sigsegv;
-
-       usig = current_thread_info()->exec_domain
-               && current_thread_info()->exec_domain->signal_invmap
-               && sig < 32
-               ? current_thread_info()->exec_domain->signal_invmap[sig]
-               : sig;
-
-       err = __put_user(usig, &frame->sig);
-       if (err)
-               goto give_sigsegv;
-
-       err = setup_sigcontext(&frame->sc, &frame->fpstate, regs, set->sig[0]);
-       if (err)
-               goto give_sigsegv;
-
-       if (_NSIG_WORDS > 1) {
-               err = __copy_to_user(&frame->extramask, &set->sig[1],
-                                     sizeof(frame->extramask));
-               if (err)
-                       goto give_sigsegv;
-       }
-
-       if (current->binfmt->hasvdso)
-               restorer = (void *)VDSO_SYM(&__kernel_sigreturn);
-       else
-               restorer = (void *)&frame->retcode;
-       if (ka->sa.sa_flags & SA_RESTORER)
-               restorer = ka->sa.sa_restorer;
-
-       /* Set up to return from userspace.  */
-       err |= __put_user(restorer, &frame->pretcode);
-        
-       /*
-        * This is popl %eax ; movl $,%eax ; int $0x80
-        *
-        * WE DO NOT USE IT ANY MORE! It's only left here for historical
-        * reasons and because gdb uses it as a signature to notice
-        * signal handler stack frames.
-        */
-       err |= __put_user(0xb858, (short __user *)(frame->retcode+0));
-       err |= __put_user(__NR_sigreturn, (int __user *)(frame->retcode+2));
-       err |= __put_user(0x80cd, (short __user *)(frame->retcode+6));
-
-       if (err)
-               goto give_sigsegv;
-
-       /* Set up registers for signal handler */
-       regs->esp = (unsigned long) frame;
-       regs->eip = (unsigned long) ka->sa.sa_handler;
-       regs->eax = (unsigned long) sig;
-       regs->edx = (unsigned long) 0;
-       regs->ecx = (unsigned long) 0;
-
-       set_fs(USER_DS);
-       regs->xds = __USER_DS;
-       regs->xes = __USER_DS;
-       regs->xss = __USER_DS;
-       regs->xcs = __USER_CS;
-
-       /*
-        * Clear TF when entering the signal handler, but
-        * notify any tracer that was single-stepping it.
-        * The tracer may want to single-step inside the
-        * handler too.
-        */
-       regs->eflags &= ~TF_MASK;
-       if (test_thread_flag(TIF_SINGLESTEP))
-               ptrace_notify(SIGTRAP);
-
-#if DEBUG_SIG
-       printk("SIG deliver (%s:%d): sp=%p pc=%p ra=%p\n",
-               current->comm, current->pid, frame, regs->eip, frame->pretcode);
-#endif
-
-       return 0;
-
-give_sigsegv:
-       force_sigsegv(sig, current);
-       return -EFAULT;
-}
-
-static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
-                          sigset_t *set, struct pt_regs * regs)
-{
-       void __user *restorer;
-       struct rt_sigframe __user *frame;
-       int err = 0;
-       int usig;
-
-       frame = get_sigframe(ka, regs, sizeof(*frame));
-
-       if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
-               goto give_sigsegv;
-
-       usig = current_thread_info()->exec_domain
-               && current_thread_info()->exec_domain->signal_invmap
-               && sig < 32
-               ? current_thread_info()->exec_domain->signal_invmap[sig]
-               : sig;
-
-       err |= __put_user(usig, &frame->sig);
-       err |= __put_user(&frame->info, &frame->pinfo);
-       err |= __put_user(&frame->uc, &frame->puc);
-       err |= copy_siginfo_to_user(&frame->info, info);
-       if (err)
-               goto give_sigsegv;
-
-       /* Create the ucontext.  */
-       err |= __put_user(0, &frame->uc.uc_flags);
-       err |= __put_user(0, &frame->uc.uc_link);
-       err |= __put_user(current->sas_ss_sp, &frame->uc.uc_stack.ss_sp);
-       err |= __put_user(sas_ss_flags(regs->esp),
-                         &frame->uc.uc_stack.ss_flags);
-       err |= __put_user(current->sas_ss_size, &frame->uc.uc_stack.ss_size);
-       err |= setup_sigcontext(&frame->uc.uc_mcontext, &frame->fpstate,
-                               regs, set->sig[0]);
-       err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));
-       if (err)
-               goto give_sigsegv;
-
-       /* Set up to return from userspace.  */
-       restorer = (void *)VDSO_SYM(&__kernel_rt_sigreturn);
-       if (ka->sa.sa_flags & SA_RESTORER)
-               restorer = ka->sa.sa_restorer;
-       err |= __put_user(restorer, &frame->pretcode);
-        
-       /*
-        * This is movl $,%eax ; int $0x80
-        *
-        * WE DO NOT USE IT ANY MORE! It's only left here for historical
-        * reasons and because gdb uses it as a signature to notice
-        * signal handler stack frames.
-        */
-       err |= __put_user(0xb8, (char __user *)(frame->retcode+0));
-       err |= __put_user(__NR_rt_sigreturn, (int __user *)(frame->retcode+1));
-       err |= __put_user(0x80cd, (short __user *)(frame->retcode+5));
-
-       if (err)
-               goto give_sigsegv;
-
-       /* Set up registers for signal handler */
-       regs->esp = (unsigned long) frame;
-       regs->eip = (unsigned long) ka->sa.sa_handler;
-       regs->eax = (unsigned long) usig;
-       regs->edx = (unsigned long) &frame->info;
-       regs->ecx = (unsigned long) &frame->uc;
-
-       set_fs(USER_DS);
-       regs->xds = __USER_DS;
-       regs->xes = __USER_DS;
-       regs->xss = __USER_DS;
-       regs->xcs = __USER_CS;
-
-       /*
-        * Clear TF when entering the signal handler, but
-        * notify any tracer that was single-stepping it.
-        * The tracer may want to single-step inside the
-        * handler too.
-        */
-       regs->eflags &= ~TF_MASK;
-       if (test_thread_flag(TIF_SINGLESTEP))
-               ptrace_notify(SIGTRAP);
-
-#if DEBUG_SIG
-       printk("SIG deliver (%s:%d): sp=%p pc=%p ra=%p\n",
-               current->comm, current->pid, frame, regs->eip, frame->pretcode);
-#endif
-
-       return 0;
-
-give_sigsegv:
-       force_sigsegv(sig, current);
-       return -EFAULT;
-}
-
-/*
- * OK, we're invoking a handler
- */    
-
-static int
-handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
-             sigset_t *oldset, struct pt_regs * regs)
-{
-       int ret;
-
-       /* Are we from a system call? */
-       if (regs->orig_eax >= 0) {
-               /* If so, check system call restarting.. */
-               switch (regs->eax) {
-                       case -ERESTART_RESTARTBLOCK:
-                       case -ERESTARTNOHAND:
-                               regs->eax = -EINTR;
-                               break;
-
-                       case -ERESTARTSYS:
-                               if (!(ka->sa.sa_flags & SA_RESTART)) {
-                                       regs->eax = -EINTR;
-                                       break;
-                               }
-                       /* fallthrough */
-                       case -ERESTARTNOINTR:
-                               regs->eax = regs->orig_eax;
-                               regs->eip -= 2;
-               }
-       }
-
-       /*
-        * If TF is set due to a debugger (PT_DTRACE), clear the TF flag so
-        * that register information in the sigcontext is correct.
-        */
-       if (unlikely(regs->eflags & TF_MASK)
-           && likely(current->ptrace & PT_DTRACE)) {
-               current->ptrace &= ~PT_DTRACE;
-               regs->eflags &= ~TF_MASK;
-       }
-
-       /* Set up the stack frame */
-       if (ka->sa.sa_flags & SA_SIGINFO)
-               ret = setup_rt_frame(sig, ka, info, oldset, regs);
-       else
-               ret = setup_frame(sig, ka, oldset, regs);
-
-       if (ret == 0) {
-               spin_lock_irq(&current->sighand->siglock);
-               sigorsets(&current->blocked,&current->blocked,&ka->sa.sa_mask);
-               if (!(ka->sa.sa_flags & SA_NODEFER))
-                       sigaddset(&current->blocked,sig);
-               recalc_sigpending();
-               spin_unlock_irq(&current->sighand->siglock);
-       }
-
-       return ret;
-}
-
-/*
- * Note that 'init' is a special process: it doesn't get signals it doesn't
- * want to handle. Thus you cannot kill init even with a SIGKILL even by
- * mistake.
- */
-static void fastcall do_signal(struct pt_regs *regs)
-{
-       siginfo_t info;
-       int signr;
-       struct k_sigaction ka;
-       sigset_t *oldset;
-
-       /*
-        * We want the common case to go fast, which
-        * is why we may in certain cases get here from
-        * kernel mode. Just return without doing anything
-        * if so.  vm86 regs switched out by assembly code
-        * before reaching here, so testing against kernel
-        * CS suffices.
-        */
-       if (!user_mode(regs))
-               return;
-
-       if (test_thread_flag(TIF_RESTORE_SIGMASK))
-               oldset = &current->saved_sigmask;
-       else
-               oldset = &current->blocked;
-
-       signr = get_signal_to_deliver(&info, &ka, regs, NULL);
-       if (signr > 0) {
-               /* Reenable any watchpoints before delivering the
-                * signal to user space. The processor register will
-                * have been cleared if the watchpoint triggered
-                * inside the kernel.
-                */
-               if (unlikely(current->thread.debugreg[7]))
-                       set_debugreg(current->thread.debugreg[7], 7);
-
-               /* Whee!  Actually deliver the signal.  */
-               if (handle_signal(signr, &info, &ka, oldset, regs) == 0) {
-                       /* a signal was successfully delivered; the saved
-                        * sigmask will have been stored in the signal frame,
-                        * and will be restored by sigreturn, so we can simply
-                        * clear the TIF_RESTORE_SIGMASK flag */
-                       if (test_thread_flag(TIF_RESTORE_SIGMASK))
-                               clear_thread_flag(TIF_RESTORE_SIGMASK);
-               }
-
-               return;
-       }
-
-       /* Did we come from a system call? */
-       if (regs->orig_eax >= 0) {
-               /* Restart the system call - no handlers present */
-               switch (regs->eax) {
-               case -ERESTARTNOHAND:
-               case -ERESTARTSYS:
-               case -ERESTARTNOINTR:
-                       regs->eax = regs->orig_eax;
-                       regs->eip -= 2;
-                       break;
-
-               case -ERESTART_RESTARTBLOCK:
-                       regs->eax = __NR_restart_syscall;
-                       regs->eip -= 2;
-                       break;
-               }
-       }
-
-       /* if there's no signal to deliver, we just put the saved sigmask
-        * back */
-       if (test_thread_flag(TIF_RESTORE_SIGMASK)) {
-               clear_thread_flag(TIF_RESTORE_SIGMASK);
-               sigprocmask(SIG_SETMASK, &current->saved_sigmask, NULL);
-       }
-}
-
-/*
- * notification of userspace execution resumption
- * - triggered by the TIF_WORK_MASK flags
- */
-__attribute__((regparm(3)))
-void do_notify_resume(struct pt_regs *regs, void *_unused,
-                     __u32 thread_info_flags)
-{
-       /* Pending single-step? */
-       if (thread_info_flags & _TIF_SINGLESTEP) {
-               regs->eflags |= TF_MASK;
-               clear_thread_flag(TIF_SINGLESTEP);
-       }
-
-       /* deal with pending signal delivery */
-       if (thread_info_flags & (_TIF_SIGPENDING | _TIF_RESTORE_SIGMASK))
-               do_signal(regs);
-       
-       clear_thread_flag(TIF_IRET);
-}
diff --git a/arch/i386/kernel/smp_32.c b/arch/i386/kernel/smp_32.c
deleted file mode 100644 (file)
index 2d35d85..0000000
+++ /dev/null
@@ -1,707 +0,0 @@
-/*
- *     Intel SMP support routines.
- *
- *     (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
- *     (c) 1998-99, 2000 Ingo Molnar <mingo@redhat.com>
- *
- *     This code is released under the GNU General Public License version 2 or
- *     later.
- */
-
-#include <linux/init.h>
-
-#include <linux/mm.h>
-#include <linux/delay.h>
-#include <linux/spinlock.h>
-#include <linux/kernel_stat.h>
-#include <linux/mc146818rtc.h>
-#include <linux/cache.h>
-#include <linux/interrupt.h>
-#include <linux/cpu.h>
-#include <linux/module.h>
-
-#include <asm/mtrr.h>
-#include <asm/tlbflush.h>
-#include <asm/mmu_context.h>
-#include <mach_apic.h>
-
-/*
- *     Some notes on x86 processor bugs affecting SMP operation:
- *
- *     Pentium, Pentium Pro, II, III (and all CPUs) have bugs.
- *     The Linux implications for SMP are handled as follows:
- *
- *     Pentium III / [Xeon]
- *             None of the E1AP-E3AP errata are visible to the user.
- *
- *     E1AP.   see PII A1AP
- *     E2AP.   see PII A2AP
- *     E3AP.   see PII A3AP
- *
- *     Pentium II / [Xeon]
- *             None of the A1AP-A3AP errata are visible to the user.
- *
- *     A1AP.   see PPro 1AP
- *     A2AP.   see PPro 2AP
- *     A3AP.   see PPro 7AP
- *
- *     Pentium Pro
- *             None of 1AP-9AP errata are visible to the normal user,
- *     except occasional delivery of 'spurious interrupt' as trap #15.
- *     This is very rare and a non-problem.
- *
- *     1AP.    Linux maps APIC as non-cacheable
- *     2AP.    worked around in hardware
- *     3AP.    fixed in C0 and above steppings microcode update.
- *             Linux does not use excessive STARTUP_IPIs.
- *     4AP.    worked around in hardware
- *     5AP.    symmetric IO mode (normal Linux operation) not affected.
- *             'noapic' mode has vector 0xf filled out properly.
- *     6AP.    'noapic' mode might be affected - fixed in later steppings
- *     7AP.    We do not assume writes to the LVT deassering IRQs
- *     8AP.    We do not enable low power mode (deep sleep) during MP bootup
- *     9AP.    We do not use mixed mode
- *
- *     Pentium
- *             There is a marginal case where REP MOVS on 100MHz SMP
- *     machines with B stepping processors can fail. XXX should provide
- *     an L1cache=Writethrough or L1cache=off option.
- *
- *             B stepping CPUs may hang. There are hardware work arounds
- *     for this. We warn about it in case your board doesn't have the work
- *     arounds. Basically thats so I can tell anyone with a B stepping
- *     CPU and SMP problems "tough".
- *
- *     Specific items [From Pentium Processor Specification Update]
- *
- *     1AP.    Linux doesn't use remote read
- *     2AP.    Linux doesn't trust APIC errors
- *     3AP.    We work around this
- *     4AP.    Linux never generated 3 interrupts of the same priority
- *             to cause a lost local interrupt.
- *     5AP.    Remote read is never used
- *     6AP.    not affected - worked around in hardware
- *     7AP.    not affected - worked around in hardware
- *     8AP.    worked around in hardware - we get explicit CS errors if not
- *     9AP.    only 'noapic' mode affected. Might generate spurious
- *             interrupts, we log only the first one and count the
- *             rest silently.
- *     10AP.   not affected - worked around in hardware
- *     11AP.   Linux reads the APIC between writes to avoid this, as per
- *             the documentation. Make sure you preserve this as it affects
- *             the C stepping chips too.
- *     12AP.   not affected - worked around in hardware
- *     13AP.   not affected - worked around in hardware
- *     14AP.   we always deassert INIT during bootup
- *     15AP.   not affected - worked around in hardware
- *     16AP.   not affected - worked around in hardware
- *     17AP.   not affected - worked around in hardware
- *     18AP.   not affected - worked around in hardware
- *     19AP.   not affected - worked around in BIOS
- *
- *     If this sounds worrying believe me these bugs are either ___RARE___,
- *     or are signal timing bugs worked around in hardware and there's
- *     about nothing of note with C stepping upwards.
- */
-
-DEFINE_PER_CPU(struct tlb_state, cpu_tlbstate) ____cacheline_aligned = { &init_mm, 0, };
-
-/*
- * the following functions deal with sending IPIs between CPUs.
- *
- * We use 'broadcast', CPU->CPU IPIs and self-IPIs too.
- */
-
-static inline int __prepare_ICR (unsigned int shortcut, int vector)
-{
-       unsigned int icr = shortcut | APIC_DEST_LOGICAL;
-
-       switch (vector) {
-       default:
-               icr |= APIC_DM_FIXED | vector;
-               break;
-       case NMI_VECTOR:
-               icr |= APIC_DM_NMI;
-               break;
-       }
-       return icr;
-}
-
-static inline int __prepare_ICR2 (unsigned int mask)
-{
-       return SET_APIC_DEST_FIELD(mask);
-}
-
-void __send_IPI_shortcut(unsigned int shortcut, int vector)
-{
-       /*
-        * Subtle. In the case of the 'never do double writes' workaround
-        * we have to lock out interrupts to be safe.  As we don't care
-        * of the value read we use an atomic rmw access to avoid costly
-        * cli/sti.  Otherwise we use an even cheaper single atomic write
-        * to the APIC.
-        */
-       unsigned int cfg;
-
-       /*
-        * Wait for idle.
-        */
-       apic_wait_icr_idle();
-
-       /*
-        * No need to touch the target chip field
-        */
-       cfg = __prepare_ICR(shortcut, vector);
-
-       /*
-        * Send the IPI. The write to APIC_ICR fires this off.
-        */
-       apic_write_around(APIC_ICR, cfg);
-}
-
-void fastcall send_IPI_self(int vector)
-{
-       __send_IPI_shortcut(APIC_DEST_SELF, vector);
-}
-
-/*
- * This is used to send an IPI with no shorthand notation (the destination is
- * specified in bits 56 to 63 of the ICR).
- */
-static inline void __send_IPI_dest_field(unsigned long mask, int vector)
-{
-       unsigned long cfg;
-
-       /*
-        * Wait for idle.
-        */
-       if (unlikely(vector == NMI_VECTOR))
-               safe_apic_wait_icr_idle();
-       else
-               apic_wait_icr_idle();
-               
-       /*
-        * prepare target chip field
-        */
-       cfg = __prepare_ICR2(mask);
-       apic_write_around(APIC_ICR2, cfg);
-               
-       /*
-        * program the ICR 
-        */
-       cfg = __prepare_ICR(0, vector);
-                       
-       /*
-        * Send the IPI. The write to APIC_ICR fires this off.
-        */
-       apic_write_around(APIC_ICR, cfg);
-}
-
-/*
- * This is only used on smaller machines.
- */
-void send_IPI_mask_bitmask(cpumask_t cpumask, int vector)
-{
-       unsigned long mask = cpus_addr(cpumask)[0];
-       unsigned long flags;
-
-       local_irq_save(flags);
-       WARN_ON(mask & ~cpus_addr(cpu_online_map)[0]);
-       __send_IPI_dest_field(mask, vector);
-       local_irq_restore(flags);
-}
-
-void send_IPI_mask_sequence(cpumask_t mask, int vector)
-{
-       unsigned long flags;
-       unsigned int query_cpu;
-
-       /*
-        * Hack. The clustered APIC addressing mode doesn't allow us to send 
-        * to an arbitrary mask, so I do a unicasts to each CPU instead. This 
-        * should be modified to do 1 message per cluster ID - mbligh
-        */ 
-
-       local_irq_save(flags);
-       for (query_cpu = 0; query_cpu < NR_CPUS; ++query_cpu) {
-               if (cpu_isset(query_cpu, mask)) {
-                       __send_IPI_dest_field(cpu_to_logical_apicid(query_cpu),
-                                             vector);
-               }
-       }
-       local_irq_restore(flags);
-}
-
-#include <mach_ipi.h> /* must come after the send_IPI functions above for inlining */
-
-/*
- *     Smarter SMP flushing macros. 
- *             c/o Linus Torvalds.
- *
- *     These mean you can really definitely utterly forget about
- *     writing to user space from interrupts. (Its not allowed anyway).
- *
- *     Optimizations Manfred Spraul <manfred@colorfullife.com>
- */
-
-static cpumask_t flush_cpumask;
-static struct mm_struct * flush_mm;
-static unsigned long flush_va;
-static DEFINE_SPINLOCK(tlbstate_lock);
-
-/*
- * We cannot call mmdrop() because we are in interrupt context,
- * instead update mm->cpu_vm_mask.
- *
- * We need to reload %cr3 since the page tables may be going
- * away from under us..
- */
-void leave_mm(unsigned long cpu)
-{
-       if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK)
-               BUG();
-       cpu_clear(cpu, per_cpu(cpu_tlbstate, cpu).active_mm->cpu_vm_mask);
-       load_cr3(swapper_pg_dir);
-}
-
-/*
- *
- * The flush IPI assumes that a thread switch happens in this order:
- * [cpu0: the cpu that switches]
- * 1) switch_mm() either 1a) or 1b)
- * 1a) thread switch to a different mm
- * 1a1) cpu_clear(cpu, old_mm->cpu_vm_mask);
- *     Stop ipi delivery for the old mm. This is not synchronized with
- *     the other cpus, but smp_invalidate_interrupt ignore flush ipis
- *     for the wrong mm, and in the worst case we perform a superflous
- *     tlb flush.
- * 1a2) set cpu_tlbstate to TLBSTATE_OK
- *     Now the smp_invalidate_interrupt won't call leave_mm if cpu0
- *     was in lazy tlb mode.
- * 1a3) update cpu_tlbstate[].active_mm
- *     Now cpu0 accepts tlb flushes for the new mm.
- * 1a4) cpu_set(cpu, new_mm->cpu_vm_mask);
- *     Now the other cpus will send tlb flush ipis.
- * 1a4) change cr3.
- * 1b) thread switch without mm change
- *     cpu_tlbstate[].active_mm is correct, cpu0 already handles
- *     flush ipis.
- * 1b1) set cpu_tlbstate to TLBSTATE_OK
- * 1b2) test_and_set the cpu bit in cpu_vm_mask.
- *     Atomically set the bit [other cpus will start sending flush ipis],
- *     and test the bit.
- * 1b3) if the bit was 0: leave_mm was called, flush the tlb.
- * 2) switch %%esp, ie current
- *
- * The interrupt must handle 2 special cases:
- * - cr3 is changed before %%esp, ie. it cannot use current->{active_,}mm.
- * - the cpu performs speculative tlb reads, i.e. even if the cpu only
- *   runs in kernel space, the cpu could load tlb entries for user space
- *   pages.
- *
- * The good news is that cpu_tlbstate is local to each cpu, no
- * write/read ordering problems.
- */
-
-/*
- * TLB flush IPI:
- *
- * 1) Flush the tlb entries if the cpu uses the mm that's being flushed.
- * 2) Leave the mm if we are in the lazy tlb mode.
- */
-
-fastcall void smp_invalidate_interrupt(struct pt_regs *regs)
-{
-       unsigned long cpu;
-
-       cpu = get_cpu();
-
-       if (!cpu_isset(cpu, flush_cpumask))
-               goto out;
-               /* 
-                * This was a BUG() but until someone can quote me the
-                * line from the intel manual that guarantees an IPI to
-                * multiple CPUs is retried _only_ on the erroring CPUs
-                * its staying as a return
-                *
-                * BUG();
-                */
-                
-       if (flush_mm == per_cpu(cpu_tlbstate, cpu).active_mm) {
-               if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK) {
-                       if (flush_va == TLB_FLUSH_ALL)
-                               local_flush_tlb();
-                       else
-                               __flush_tlb_one(flush_va);
-               } else
-                       leave_mm(cpu);
-       }
-       ack_APIC_irq();
-       smp_mb__before_clear_bit();
-       cpu_clear(cpu, flush_cpumask);
-       smp_mb__after_clear_bit();
-out:
-       put_cpu_no_resched();
-}
-
-void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm,
-                            unsigned long va)
-{
-       cpumask_t cpumask = *cpumaskp;
-
-       /*
-        * A couple of (to be removed) sanity checks:
-        *
-        * - current CPU must not be in mask
-        * - mask must exist :)
-        */
-       BUG_ON(cpus_empty(cpumask));
-       BUG_ON(cpu_isset(smp_processor_id(), cpumask));
-       BUG_ON(!mm);
-
-#ifdef CONFIG_HOTPLUG_CPU
-       /* If a CPU which we ran on has gone down, OK. */
-       cpus_and(cpumask, cpumask, cpu_online_map);
-       if (unlikely(cpus_empty(cpumask)))
-               return;
-#endif
-
-       /*
-        * i'm not happy about this global shared spinlock in the
-        * MM hot path, but we'll see how contended it is.
-        * AK: x86-64 has a faster method that could be ported.
-        */
-       spin_lock(&tlbstate_lock);
-       
-       flush_mm = mm;
-       flush_va = va;
-       cpus_or(flush_cpumask, cpumask, flush_cpumask);
-       /*
-        * We have to send the IPI only to
-        * CPUs affected.
-        */
-       send_IPI_mask(cpumask, INVALIDATE_TLB_VECTOR);
-
-       while (!cpus_empty(flush_cpumask))
-               /* nothing. lockup detection does not belong here */
-               cpu_relax();
-
-       flush_mm = NULL;
-       flush_va = 0;
-       spin_unlock(&tlbstate_lock);
-}
-       
-void flush_tlb_current_task(void)
-{
-       struct mm_struct *mm = current->mm;
-       cpumask_t cpu_mask;
-
-       preempt_disable();
-       cpu_mask = mm->cpu_vm_mask;
-       cpu_clear(smp_processor_id(), cpu_mask);
-
-       local_flush_tlb();
-       if (!cpus_empty(cpu_mask))
-               flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL);
-       preempt_enable();
-}
-
-void flush_tlb_mm (struct mm_struct * mm)
-{
-       cpumask_t cpu_mask;
-
-       preempt_disable();
-       cpu_mask = mm->cpu_vm_mask;
-       cpu_clear(smp_processor_id(), cpu_mask);
-
-       if (current->active_mm == mm) {
-               if (current->mm)
-                       local_flush_tlb();
-               else
-                       leave_mm(smp_processor_id());
-       }
-       if (!cpus_empty(cpu_mask))
-               flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL);
-
-       preempt_enable();
-}
-
-void flush_tlb_page(struct vm_area_struct * vma, unsigned long va)
-{
-       struct mm_struct *mm = vma->vm_mm;
-       cpumask_t cpu_mask;
-
-       preempt_disable();
-       cpu_mask = mm->cpu_vm_mask;
-       cpu_clear(smp_processor_id(), cpu_mask);
-
-       if (current->active_mm == mm) {
-               if(current->mm)
-                       __flush_tlb_one(va);
-                else
-                       leave_mm(smp_processor_id());
-       }
-
-       if (!cpus_empty(cpu_mask))
-               flush_tlb_others(cpu_mask, mm, va);
-
-       preempt_enable();
-}
-EXPORT_SYMBOL(flush_tlb_page);
-
-static void do_flush_tlb_all(void* info)
-{
-       unsigned long cpu = smp_processor_id();
-
-       __flush_tlb_all();
-       if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_LAZY)
-               leave_mm(cpu);
-}
-
-void flush_tlb_all(void)
-{
-       on_each_cpu(do_flush_tlb_all, NULL, 1, 1);
-}
-
-/*
- * this function sends a 'reschedule' IPI to another CPU.
- * it goes straight through and wastes no time serializing
- * anything. Worst case is that we lose a reschedule ...
- */
-static void native_smp_send_reschedule(int cpu)
-{
-       WARN_ON(cpu_is_offline(cpu));
-       send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR);
-}
-
-/*
- * Structure and data for smp_call_function(). This is designed to minimise
- * static memory requirements. It also looks cleaner.
- */
-static DEFINE_SPINLOCK(call_lock);
-
-struct call_data_struct {
-       void (*func) (void *info);
-       void *info;
-       atomic_t started;
-       atomic_t finished;
-       int wait;
-};
-
-void lock_ipi_call_lock(void)
-{
-       spin_lock_irq(&call_lock);
-}
-
-void unlock_ipi_call_lock(void)
-{
-       spin_unlock_irq(&call_lock);
-}
-
-static struct call_data_struct *call_data;
-
-static void __smp_call_function(void (*func) (void *info), void *info,
-                               int nonatomic, int wait)
-{
-       struct call_data_struct data;
-       int cpus = num_online_cpus() - 1;
-
-       if (!cpus)
-               return;
-
-       data.func = func;
-       data.info = info;
-       atomic_set(&data.started, 0);
-       data.wait = wait;
-       if (wait)
-               atomic_set(&data.finished, 0);
-
-       call_data = &data;
-       mb();
-       
-       /* Send a message to all other CPUs and wait for them to respond */
-       send_IPI_allbutself(CALL_FUNCTION_VECTOR);
-
-       /* Wait for response */
-       while (atomic_read(&data.started) != cpus)
-               cpu_relax();
-
-       if (wait)
-               while (atomic_read(&data.finished) != cpus)
-                       cpu_relax();
-}
-
-
-/**
- * smp_call_function_mask(): Run a function on a set of other CPUs.
- * @mask: The set of cpus to run on.  Must not include the current cpu.
- * @func: The function to run. This must be fast and non-blocking.
- * @info: An arbitrary pointer to pass to the function.
- * @wait: If true, wait (atomically) until function has completed on other CPUs.
- *
-  * Returns 0 on success, else a negative status code.
- *
- * If @wait is true, then returns once @func has returned; otherwise
- * it returns just before the target cpu calls @func.
- *
- * You must not call this function with disabled interrupts or from a
- * hardware interrupt handler or from a bottom half handler.
- */
-static int
-native_smp_call_function_mask(cpumask_t mask,
-                             void (*func)(void *), void *info,
-                             int wait)
-{
-       struct call_data_struct data;
-       cpumask_t allbutself;
-       int cpus;
-
-       /* Can deadlock when called with interrupts disabled */
-       WARN_ON(irqs_disabled());
-
-       /* Holding any lock stops cpus from going down. */
-       spin_lock(&call_lock);
-
-       allbutself = cpu_online_map;
-       cpu_clear(smp_processor_id(), allbutself);
-
-       cpus_and(mask, mask, allbutself);
-       cpus = cpus_weight(mask);
-
-       if (!cpus) {
-               spin_unlock(&call_lock);
-               return 0;
-       }
-
-       data.func = func;
-       data.info = info;
-       atomic_set(&data.started, 0);
-       data.wait = wait;
-       if (wait)
-               atomic_set(&data.finished, 0);
-
-       call_data = &data;
-       mb();
-
-       /* Send a message to other CPUs */
-       if (cpus_equal(mask, allbutself))
-               send_IPI_allbutself(CALL_FUNCTION_VECTOR);
-       else
-               send_IPI_mask(mask, CALL_FUNCTION_VECTOR);
-
-       /* Wait for response */
-       while (atomic_read(&data.started) != cpus)
-               cpu_relax();
-
-       if (wait)
-               while (atomic_read(&data.finished) != cpus)
-                       cpu_relax();
-       spin_unlock(&call_lock);
-
-       return 0;
-}
-
-static void stop_this_cpu (void * dummy)
-{
-       local_irq_disable();
-       /*
-        * Remove this CPU:
-        */
-       cpu_clear(smp_processor_id(), cpu_online_map);
-       disable_local_APIC();
-       if (cpu_data[smp_processor_id()].hlt_works_ok)
-               for(;;) halt();
-       for (;;);
-}
-
-/*
- * this function calls the 'stop' function on all other CPUs in the system.
- */
-
-static void native_smp_send_stop(void)
-{
-       /* Don't deadlock on the call lock in panic */
-       int nolock = !spin_trylock(&call_lock);
-       unsigned long flags;
-
-       local_irq_save(flags);
-       __smp_call_function(stop_this_cpu, NULL, 0, 0);
-       if (!nolock)
-               spin_unlock(&call_lock);
-       disable_local_APIC();
-       local_irq_restore(flags);
-}
-
-/*
- * Reschedule call back. Nothing to do,
- * all the work is done automatically when
- * we return from the interrupt.
- */
-fastcall void smp_reschedule_interrupt(struct pt_regs *regs)
-{
-       ack_APIC_irq();
-}
-
-fastcall void smp_call_function_interrupt(struct pt_regs *regs)
-{
-       void (*func) (void *info) = call_data->func;
-       void *info = call_data->info;
-       int wait = call_data->wait;
-
-       ack_APIC_irq();
-       /*
-        * Notify initiating CPU that I've grabbed the data and am
-        * about to execute the function
-        */
-       mb();
-       atomic_inc(&call_data->started);
-       /*
-        * At this point the info structure may be out of scope unless wait==1
-        */
-       irq_enter();
-       (*func)(info);
-       irq_exit();
-
-       if (wait) {
-               mb();
-               atomic_inc(&call_data->finished);
-       }
-}
-
-static int convert_apicid_to_cpu(int apic_id)
-{
-       int i;
-
-       for (i = 0; i < NR_CPUS; i++) {
-               if (x86_cpu_to_apicid[i] == apic_id)
-                       return i;
-       }
-       return -1;
-}
-
-int safe_smp_processor_id(void)
-{
-       int apicid, cpuid;
-
-       if (!boot_cpu_has(X86_FEATURE_APIC))
-               return 0;
-
-       apicid = hard_smp_processor_id();
-       if (apicid == BAD_APICID)
-               return 0;
-
-       cpuid = convert_apicid_to_cpu(apicid);
-
-       return cpuid >= 0 ? cpuid : 0;
-}
-
-struct smp_ops smp_ops = {
-       .smp_prepare_boot_cpu = native_smp_prepare_boot_cpu,
-       .smp_prepare_cpus = native_smp_prepare_cpus,
-       .cpu_up = native_cpu_up,
-       .smp_cpus_done = native_smp_cpus_done,
-
-       .smp_send_stop = native_smp_send_stop,
-       .smp_send_reschedule = native_smp_send_reschedule,
-       .smp_call_function_mask = native_smp_call_function_mask,
-};
diff --git a/arch/i386/kernel/smpboot_32.c b/arch/i386/kernel/smpboot_32.c
deleted file mode 100644 (file)
index e4f61d1..0000000
+++ /dev/null
@@ -1,1322 +0,0 @@
-/*
- *     x86 SMP booting functions
- *
- *     (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
- *     (c) 1998, 1999, 2000 Ingo Molnar <mingo@redhat.com>
- *
- *     Much of the core SMP work is based on previous work by Thomas Radke, to
- *     whom a great many thanks are extended.
- *
- *     Thanks to Intel for making available several different Pentium,
- *     Pentium Pro and Pentium-II/Xeon MP machines.
- *     Original development of Linux SMP code supported by Caldera.
- *
- *     This code is released under the GNU General Public License version 2 or
- *     later.
- *
- *     Fixes
- *             Felix Koop      :       NR_CPUS used properly
- *             Jose Renau      :       Handle single CPU case.
- *             Alan Cox        :       By repeated request 8) - Total BogoMIPS report.
- *             Greg Wright     :       Fix for kernel stacks panic.
- *             Erich Boleyn    :       MP v1.4 and additional changes.
- *     Matthias Sattler        :       Changes for 2.1 kernel map.
- *     Michel Lespinasse       :       Changes for 2.1 kernel map.
- *     Michael Chastain        :       Change trampoline.S to gnu as.
- *             Alan Cox        :       Dumb bug: 'B' step PPro's are fine
- *             Ingo Molnar     :       Added APIC timers, based on code
- *                                     from Jose Renau
- *             Ingo Molnar     :       various cleanups and rewrites
- *             Tigran Aivazian :       fixed "0.00 in /proc/uptime on SMP" bug.
- *     Maciej W. Rozycki       :       Bits for genuine 82489DX APICs
- *             Martin J. Bligh :       Added support for multi-quad systems
- *             Dave Jones      :       Report invalid combinations of Athlon CPUs.
-*              Rusty Russell   :       Hacked into shape for new "hotplug" boot process. */
-
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/kernel.h>
-
-#include <linux/mm.h>
-#include <linux/sched.h>
-#include <linux/kernel_stat.h>
-#include <linux/bootmem.h>
-#include <linux/notifier.h>
-#include <linux/cpu.h>
-#include <linux/percpu.h>
-#include <linux/nmi.h>
-
-#include <linux/delay.h>
-#include <linux/mc146818rtc.h>
-#include <asm/tlbflush.h>
-#include <asm/desc.h>
-#include <asm/arch_hooks.h>
-#include <asm/nmi.h>
-
-#include <mach_apic.h>
-#include <mach_wakecpu.h>
-#include <smpboot_hooks.h>
-#include <asm/vmi.h>
-#include <asm/mtrr.h>
-
-/* Set if we find a B stepping CPU */
-static int __devinitdata smp_b_stepping;
-
-/* Number of siblings per CPU package */
-int smp_num_siblings = 1;
-EXPORT_SYMBOL(smp_num_siblings);
-
-/* Last level cache ID of each logical CPU */
-int cpu_llc_id[NR_CPUS] __cpuinitdata = {[0 ... NR_CPUS-1] = BAD_APICID};
-
-/* representing HT siblings of each logical CPU */
-cpumask_t cpu_sibling_map[NR_CPUS] __read_mostly;
-EXPORT_SYMBOL(cpu_sibling_map);
-
-/* representing HT and core siblings of each logical CPU */
-cpumask_t cpu_core_map[NR_CPUS] __read_mostly;
-EXPORT_SYMBOL(cpu_core_map);
-
-/* bitmap of online cpus */
-cpumask_t cpu_online_map __read_mostly;
-EXPORT_SYMBOL(cpu_online_map);
-
-cpumask_t cpu_callin_map;
-cpumask_t cpu_callout_map;
-EXPORT_SYMBOL(cpu_callout_map);
-cpumask_t cpu_possible_map;
-EXPORT_SYMBOL(cpu_possible_map);
-static cpumask_t smp_commenced_mask;
-
-/* Per CPU bogomips and other parameters */
-struct cpuinfo_x86 cpu_data[NR_CPUS] __cacheline_aligned;
-EXPORT_SYMBOL(cpu_data);
-
-u8 x86_cpu_to_apicid[NR_CPUS] __read_mostly =
-                       { [0 ... NR_CPUS-1] = 0xff };
-EXPORT_SYMBOL(x86_cpu_to_apicid);
-
-u8 apicid_2_node[MAX_APICID];
-
-/*
- * Trampoline 80x86 program as an array.
- */
-
-extern unsigned char trampoline_data [];
-extern unsigned char trampoline_end  [];
-static unsigned char *trampoline_base;
-static int trampoline_exec;
-
-static void map_cpu_to_logical_apicid(void);
-
-/* State of each CPU. */
-DEFINE_PER_CPU(int, cpu_state) = { 0 };
-
-/*
- * Currently trivial. Write the real->protected mode
- * bootstrap into the page concerned. The caller
- * has made sure it's suitably aligned.
- */
-
-static unsigned long __devinit setup_trampoline(void)
-{
-       memcpy(trampoline_base, trampoline_data, trampoline_end - trampoline_data);
-       return virt_to_phys(trampoline_base);
-}
-
-/*
- * We are called very early to get the low memory for the
- * SMP bootup trampoline page.
- */
-void __init smp_alloc_memory(void)
-{
-       trampoline_base = (void *) alloc_bootmem_low_pages(PAGE_SIZE);
-       /*
-        * Has to be in very low memory so we can execute
-        * real-mode AP code.
-        */
-       if (__pa(trampoline_base) >= 0x9F000)
-               BUG();
-       /*
-        * Make the SMP trampoline executable:
-        */
-       trampoline_exec = set_kernel_exec((unsigned long)trampoline_base, 1);
-}
-
-/*
- * The bootstrap kernel entry code has set these up. Save them for
- * a given CPU
- */
-
-void __cpuinit smp_store_cpu_info(int id)
-{
-       struct cpuinfo_x86 *c = cpu_data + id;
-
-       *c = boot_cpu_data;
-       if (id!=0)
-               identify_secondary_cpu(c);
-       /*
-        * Mask B, Pentium, but not Pentium MMX
-        */
-       if (c->x86_vendor == X86_VENDOR_INTEL &&
-           c->x86 == 5 &&
-           c->x86_mask >= 1 && c->x86_mask <= 4 &&
-           c->x86_model <= 3)
-               /*
-                * Remember we have B step Pentia with bugs
-                */
-               smp_b_stepping = 1;
-
-       /*
-        * Certain Athlons might work (for various values of 'work') in SMP
-        * but they are not certified as MP capable.
-        */
-       if ((c->x86_vendor == X86_VENDOR_AMD) && (c->x86 == 6)) {
-
-               if (num_possible_cpus() == 1)
-                       goto valid_k7;
-
-               /* Athlon 660/661 is valid. */  
-               if ((c->x86_model==6) && ((c->x86_mask==0) || (c->x86_mask==1)))
-                       goto valid_k7;
-
-               /* Duron 670 is valid */
-               if ((c->x86_model==7) && (c->x86_mask==0))
-                       goto valid_k7;
-
-               /*
-                * Athlon 662, Duron 671, and Athlon >model 7 have capability bit.
-                * It's worth noting that the A5 stepping (662) of some Athlon XP's
-                * have the MP bit set.
-                * See http://www.heise.de/newsticker/data/jow-18.10.01-000 for more.
-                */
-               if (((c->x86_model==6) && (c->x86_mask>=2)) ||
-                   ((c->x86_model==7) && (c->x86_mask>=1)) ||
-                    (c->x86_model> 7))
-                       if (cpu_has_mp)
-                               goto valid_k7;
-
-               /* If we get here, it's not a certified SMP capable AMD system. */
-               add_taint(TAINT_UNSAFE_SMP);
-       }
-
-valid_k7:
-       ;
-}
-
-extern void calibrate_delay(void);
-
-static atomic_t init_deasserted;
-
-static void __cpuinit smp_callin(void)
-{
-       int cpuid, phys_id;
-       unsigned long timeout;
-
-       /*
-        * If waken up by an INIT in an 82489DX configuration
-        * we may get here before an INIT-deassert IPI reaches
-        * our local APIC.  We have to wait for the IPI or we'll
-        * lock up on an APIC access.
-        */
-       wait_for_init_deassert(&init_deasserted);
-
-       /*
-        * (This works even if the APIC is not enabled.)
-        */
-       phys_id = GET_APIC_ID(apic_read(APIC_ID));
-       cpuid = smp_processor_id();
-       if (cpu_isset(cpuid, cpu_callin_map)) {
-               printk("huh, phys CPU#%d, CPU#%d already present??\n",
-                                       phys_id, cpuid);
-               BUG();
-       }
-       Dprintk("CPU#%d (phys ID: %d) waiting for CALLOUT\n", cpuid, phys_id);
-
-       /*
-        * STARTUP IPIs are fragile beasts as they might sometimes
-        * trigger some glue motherboard logic. Complete APIC bus
-        * silence for 1 second, this overestimates the time the
-        * boot CPU is spending to send the up to 2 STARTUP IPIs
-        * by a factor of two. This should be enough.
-        */
-
-       /*
-        * Waiting 2s total for startup (udelay is not yet working)
-        */
-       timeout = jiffies + 2*HZ;
-       while (time_before(jiffies, timeout)) {
-               /*
-                * Has the boot CPU finished it's STARTUP sequence?
-                */
-               if (cpu_isset(cpuid, cpu_callout_map))
-                       break;
-               rep_nop();
-       }
-
-       if (!time_before(jiffies, timeout)) {
-               printk("BUG: CPU%d started up but did not get a callout!\n",
-                       cpuid);
-               BUG();
-       }
-
-       /*
-        * the boot CPU has finished the init stage and is spinning
-        * on callin_map until we finish. We are free to set up this
-        * CPU, first the APIC. (this is probably redundant on most
-        * boards)
-        */
-
-       Dprintk("CALLIN, before setup_local_APIC().\n");
-       smp_callin_clear_local_apic();
-       setup_local_APIC();
-       map_cpu_to_logical_apicid();
-
-       /*
-        * Get our bogomips.
-        */
-       calibrate_delay();
-       Dprintk("Stack at about %p\n",&cpuid);
-
-       /*
-        * Save our processor parameters
-        */
-       smp_store_cpu_info(cpuid);
-
-       /*
-        * Allow the master to continue.
-        */
-       cpu_set(cpuid, cpu_callin_map);
-}
-
-static int cpucount;
-
-/* maps the cpu to the sched domain representing multi-core */
-cpumask_t cpu_coregroup_map(int cpu)
-{
-       struct cpuinfo_x86 *c = cpu_data + cpu;
-       /*
-        * For perf, we return last level cache shared map.
-        * And for power savings, we return cpu_core_map
-        */
-       if (sched_mc_power_savings || sched_smt_power_savings)
-               return cpu_core_map[cpu];
-       else
-               return c->llc_shared_map;
-}
-
-/* representing cpus for which sibling maps can be computed */
-static cpumask_t cpu_sibling_setup_map;
-
-void __cpuinit set_cpu_sibling_map(int cpu)
-{
-       int i;
-       struct cpuinfo_x86 *c = cpu_data;
-
-       cpu_set(cpu, cpu_sibling_setup_map);
-
-       if (smp_num_siblings > 1) {
-               for_each_cpu_mask(i, cpu_sibling_setup_map) {
-                       if (c[cpu].phys_proc_id == c[i].phys_proc_id &&
-                           c[cpu].cpu_core_id == c[i].cpu_core_id) {
-                               cpu_set(i, cpu_sibling_map[cpu]);
-                               cpu_set(cpu, cpu_sibling_map[i]);
-                               cpu_set(i, cpu_core_map[cpu]);
-                               cpu_set(cpu, cpu_core_map[i]);
-                               cpu_set(i, c[cpu].llc_shared_map);
-                               cpu_set(cpu, c[i].llc_shared_map);
-                       }
-               }
-       } else {
-               cpu_set(cpu, cpu_sibling_map[cpu]);
-       }
-
-       cpu_set(cpu, c[cpu].llc_shared_map);
-
-       if (current_cpu_data.x86_max_cores == 1) {
-               cpu_core_map[cpu] = cpu_sibling_map[cpu];
-               c[cpu].booted_cores = 1;
-               return;
-       }
-
-       for_each_cpu_mask(i, cpu_sibling_setup_map) {
-               if (cpu_llc_id[cpu] != BAD_APICID &&
-                   cpu_llc_id[cpu] == cpu_llc_id[i]) {
-                       cpu_set(i, c[cpu].llc_shared_map);
-                       cpu_set(cpu, c[i].llc_shared_map);
-               }
-               if (c[cpu].phys_proc_id == c[i].phys_proc_id) {
-                       cpu_set(i, cpu_core_map[cpu]);
-                       cpu_set(cpu, cpu_core_map[i]);
-                       /*
-                        *  Does this new cpu bringup a new core?
-                        */
-                       if (cpus_weight(cpu_sibling_map[cpu]) == 1) {
-                               /*
-                                * for each core in package, increment
-                                * the booted_cores for this new cpu
-                                */
-                               if (first_cpu(cpu_sibling_map[i]) == i)
-                                       c[cpu].booted_cores++;
-                               /*
-                                * increment the core count for all
-                                * the other cpus in this package
-                                */
-                               if (i != cpu)
-                                       c[i].booted_cores++;
-                       } else if (i != cpu && !c[cpu].booted_cores)
-                               c[cpu].booted_cores = c[i].booted_cores;
-               }
-       }
-}
-
-/*
- * Activate a secondary processor.
- */
-static void __cpuinit start_secondary(void *unused)
-{
-       /*
-        * Don't put *anything* before cpu_init(), SMP booting is too
-        * fragile that we want to limit the things done here to the
-        * most necessary things.
-        */
-#ifdef CONFIG_VMI
-       vmi_bringup();
-#endif
-       cpu_init();
-       preempt_disable();
-       smp_callin();
-       while (!cpu_isset(smp_processor_id(), smp_commenced_mask))
-               rep_nop();
-       /*
-        * Check TSC synchronization with the BP:
-        */
-       check_tsc_sync_target();
-
-       setup_secondary_clock();
-       if (nmi_watchdog == NMI_IO_APIC) {
-               disable_8259A_irq(0);
-               enable_NMI_through_LVT0(NULL);
-               enable_8259A_irq(0);
-       }
-       /*
-        * low-memory mappings have been cleared, flush them from
-        * the local TLBs too.
-        */
-       local_flush_tlb();
-
-       /* This must be done before setting cpu_online_map */
-       set_cpu_sibling_map(raw_smp_processor_id());
-       wmb();
-
-       /*
-        * We need to hold call_lock, so there is no inconsistency
-        * between the time smp_call_function() determines number of
-        * IPI receipients, and the time when the determination is made
-        * for which cpus receive the IPI. Holding this
-        * lock helps us to not include this cpu in a currently in progress
-        * smp_call_function().
-        */
-       lock_ipi_call_lock();
-       cpu_set(smp_processor_id(), cpu_online_map);
-       unlock_ipi_call_lock();
-       per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;
-
-       /* We can take interrupts now: we're officially "up". */
-       local_irq_enable();
-
-       wmb();
-       cpu_idle();
-}
-
-/*
- * Everything has been set up for the secondary
- * CPUs - they just need to reload everything
- * from the task structure
- * This function must not return.
- */
-void __devinit initialize_secondary(void)
-{
-       /*
-        * We don't actually need to load the full TSS,
-        * basically just the stack pointer and the eip.
-        */
-
-       asm volatile(
-               "movl %0,%%esp\n\t"
-               "jmp *%1"
-               :
-               :"m" (current->thread.esp),"m" (current->thread.eip));
-}
-
-/* Static state in head.S used to set up a CPU */
-extern struct {
-       void * esp;
-       unsigned short ss;
-} stack_start;
-
-#ifdef CONFIG_NUMA
-
-/* which logical CPUs are on which nodes */
-cpumask_t node_2_cpu_mask[MAX_NUMNODES] __read_mostly =
-                               { [0 ... MAX_NUMNODES-1] = CPU_MASK_NONE };
-EXPORT_SYMBOL(node_2_cpu_mask);
-/* which node each logical CPU is on */
-int cpu_2_node[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = 0 };
-EXPORT_SYMBOL(cpu_2_node);
-
-/* set up a mapping between cpu and node. */
-static inline void map_cpu_to_node(int cpu, int node)
-{
-       printk("Mapping cpu %d to node %d\n", cpu, node);
-       cpu_set(cpu, node_2_cpu_mask[node]);
-       cpu_2_node[cpu] = node;
-}
-
-/* undo a mapping between cpu and node. */
-static inline void unmap_cpu_to_node(int cpu)
-{
-       int node;
-
-       printk("Unmapping cpu %d from all nodes\n", cpu);
-       for (node = 0; node < MAX_NUMNODES; node ++)
-               cpu_clear(cpu, node_2_cpu_mask[node]);
-       cpu_2_node[cpu] = 0;
-}
-#else /* !CONFIG_NUMA */
-
-#define map_cpu_to_node(cpu, node)     ({})
-#define unmap_cpu_to_node(cpu) ({})
-
-#endif /* CONFIG_NUMA */
-
-u8 cpu_2_logical_apicid[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = BAD_APICID };
-
-static void map_cpu_to_logical_apicid(void)
-{
-       int cpu = smp_processor_id();
-       int apicid = logical_smp_processor_id();
-       int node = apicid_to_node(apicid);
-
-       if (!node_online(node))
-               node = first_online_node;
-
-       cpu_2_logical_apicid[cpu] = apicid;
-       map_cpu_to_node(cpu, node);
-}
-
-static void unmap_cpu_to_logical_apicid(int cpu)
-{
-       cpu_2_logical_apicid[cpu] = BAD_APICID;
-       unmap_cpu_to_node(cpu);
-}
-
-static inline void __inquire_remote_apic(int apicid)
-{
-       int i, regs[] = { APIC_ID >> 4, APIC_LVR >> 4, APIC_SPIV >> 4 };
-       char *names[] = { "ID", "VERSION", "SPIV" };
-       int timeout;
-       unsigned long status;
-
-       printk("Inquiring remote APIC #%d...\n", apicid);
-
-       for (i = 0; i < ARRAY_SIZE(regs); i++) {
-               printk("... APIC #%d %s: ", apicid, names[i]);
-
-               /*
-                * Wait for idle.
-                */
-               status = safe_apic_wait_icr_idle();
-               if (status)
-                       printk("a previous APIC delivery may have failed\n");
-
-               apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(apicid));
-               apic_write_around(APIC_ICR, APIC_DM_REMRD | regs[i]);
-
-               timeout = 0;
-               do {
-                       udelay(100);
-                       status = apic_read(APIC_ICR) & APIC_ICR_RR_MASK;
-               } while (status == APIC_ICR_RR_INPROG && timeout++ < 1000);
-
-               switch (status) {
-               case APIC_ICR_RR_VALID:
-                       status = apic_read(APIC_RRR);
-                       printk("%lx\n", status);
-                       break;
-               default:
-                       printk("failed\n");
-               }
-       }
-}
-
-#ifdef WAKE_SECONDARY_VIA_NMI
-/* 
- * Poke the other CPU in the eye via NMI to wake it up. Remember that the normal
- * INIT, INIT, STARTUP sequence will reset the chip hard for us, and this
- * won't ... remember to clear down the APIC, etc later.
- */
-static int __devinit
-wakeup_secondary_cpu(int logical_apicid, unsigned long start_eip)
-{
-       unsigned long send_status, accept_status = 0;
-       int maxlvt;
-
-       /* Target chip */
-       apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(logical_apicid));
-
-       /* Boot on the stack */
-       /* Kick the second */
-       apic_write_around(APIC_ICR, APIC_DM_NMI | APIC_DEST_LOGICAL);
-
-       Dprintk("Waiting for send to finish...\n");
-       send_status = safe_apic_wait_icr_idle();
-
-       /*
-        * Give the other CPU some time to accept the IPI.
-        */
-       udelay(200);
-       /*
-        * Due to the Pentium erratum 3AP.
-        */
-       maxlvt = lapic_get_maxlvt();
-       if (maxlvt > 3) {
-               apic_read_around(APIC_SPIV);
-               apic_write(APIC_ESR, 0);
-       }
-       accept_status = (apic_read(APIC_ESR) & 0xEF);
-       Dprintk("NMI sent.\n");
-
-       if (send_status)
-               printk("APIC never delivered???\n");
-       if (accept_status)
-               printk("APIC delivery error (%lx).\n", accept_status);
-
-       return (send_status | accept_status);
-}
-#endif /* WAKE_SECONDARY_VIA_NMI */
-
-#ifdef WAKE_SECONDARY_VIA_INIT
-static int __devinit
-wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip)
-{
-       unsigned long send_status, accept_status = 0;
-       int maxlvt, num_starts, j;
-
-       /*
-        * Be paranoid about clearing APIC errors.
-        */
-       if (APIC_INTEGRATED(apic_version[phys_apicid])) {
-               apic_read_around(APIC_SPIV);
-               apic_write(APIC_ESR, 0);
-               apic_read(APIC_ESR);
-       }
-
-       Dprintk("Asserting INIT.\n");
-
-       /*
-        * Turn INIT on target chip
-        */
-       apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));
-
-       /*
-        * Send IPI
-        */
-       apic_write_around(APIC_ICR, APIC_INT_LEVELTRIG | APIC_INT_ASSERT
-                               | APIC_DM_INIT);
-
-       Dprintk("Waiting for send to finish...\n");
-       send_status = safe_apic_wait_icr_idle();
-
-       mdelay(10);
-
-       Dprintk("Deasserting INIT.\n");
-
-       /* Target chip */
-       apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));
-
-       /* Send IPI */
-       apic_write_around(APIC_ICR, APIC_INT_LEVELTRIG | APIC_DM_INIT);
-
-       Dprintk("Waiting for send to finish...\n");
-       send_status = safe_apic_wait_icr_idle();
-
-       atomic_set(&init_deasserted, 1);
-
-       /*
-        * Should we send STARTUP IPIs ?
-        *
-        * Determine this based on the APIC version.
-        * If we don't have an integrated APIC, don't send the STARTUP IPIs.
-        */
-       if (APIC_INTEGRATED(apic_version[phys_apicid]))
-               num_starts = 2;
-       else
-               num_starts = 0;
-
-       /*
-        * Paravirt / VMI wants a startup IPI hook here to set up the
-        * target processor state.
-        */
-       startup_ipi_hook(phys_apicid, (unsigned long) start_secondary,
-                        (unsigned long) stack_start.esp);
-
-       /*
-        * Run STARTUP IPI loop.
-        */
-       Dprintk("#startup loops: %d.\n", num_starts);
-
-       maxlvt = lapic_get_maxlvt();
-
-       for (j = 1; j <= num_starts; j++) {
-               Dprintk("Sending STARTUP #%d.\n",j);
-               apic_read_around(APIC_SPIV);
-               apic_write(APIC_ESR, 0);
-               apic_read(APIC_ESR);
-               Dprintk("After apic_write.\n");
-
-               /*
-                * STARTUP IPI
-                */
-
-               /* Target chip */
-               apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));
-
-               /* Boot on the stack */
-               /* Kick the second */
-               apic_write_around(APIC_ICR, APIC_DM_STARTUP
-                                       | (start_eip >> 12));
-
-               /*
-                * Give the other CPU some time to accept the IPI.
-                */
-               udelay(300);
-
-               Dprintk("Startup point 1.\n");
-
-               Dprintk("Waiting for send to finish...\n");
-               send_status = safe_apic_wait_icr_idle();
-
-               /*
-                * Give the other CPU some time to accept the IPI.
-                */
-               udelay(200);
-               /*
-                * Due to the Pentium erratum 3AP.
-                */
-               if (maxlvt > 3) {
-                       apic_read_around(APIC_SPIV);
-                       apic_write(APIC_ESR, 0);
-               }
-               accept_status = (apic_read(APIC_ESR) & 0xEF);
-               if (send_status || accept_status)
-                       break;
-       }
-       Dprintk("After Startup.\n");
-
-       if (send_status)
-               printk("APIC never delivered???\n");
-       if (accept_status)
-               printk("APIC delivery error (%lx).\n", accept_status);
-
-       return (send_status | accept_status);
-}
-#endif /* WAKE_SECONDARY_VIA_INIT */
-
-extern cpumask_t cpu_initialized;
-static inline int alloc_cpu_id(void)
-{
-       cpumask_t       tmp_map;
-       int cpu;
-       cpus_complement(tmp_map, cpu_present_map);
-       cpu = first_cpu(tmp_map);
-       if (cpu >= NR_CPUS)
-               return -ENODEV;
-       return cpu;
-}
-
-#ifdef CONFIG_HOTPLUG_CPU
-static struct task_struct * __devinitdata cpu_idle_tasks[NR_CPUS];
-static inline struct task_struct * alloc_idle_task(int cpu)
-{
-       struct task_struct *idle;
-
-       if ((idle = cpu_idle_tasks[cpu]) != NULL) {
-               /* initialize thread_struct.  we really want to avoid destroy
-                * idle tread
-                */
-               idle->thread.esp = (unsigned long)task_pt_regs(idle);
-               init_idle(idle, cpu);
-               return idle;
-       }
-       idle = fork_idle(cpu);
-
-       if (!IS_ERR(idle))
-               cpu_idle_tasks[cpu] = idle;
-       return idle;
-}
-#else
-#define alloc_idle_task(cpu) fork_idle(cpu)
-#endif
-
-static int __cpuinit do_boot_cpu(int apicid, int cpu)
-/*
- * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad
- * (ie clustered apic addressing mode), this is a LOGICAL apic ID.
- * Returns zero if CPU booted OK, else error code from wakeup_secondary_cpu.
- */
-{
-       struct task_struct *idle;
-       unsigned long boot_error;
-       int timeout;
-       unsigned long start_eip;
-       unsigned short nmi_high = 0, nmi_low = 0;
-
-       /*
-        * Save current MTRR state in case it was changed since early boot
-        * (e.g. by the ACPI SMI) to initialize new CPUs with MTRRs in sync:
-        */
-       mtrr_save_state();
-
-       /*
-        * We can't use kernel_thread since we must avoid to
-        * reschedule the child.
-        */
-       idle = alloc_idle_task(cpu);
-       if (IS_ERR(idle))
-               panic("failed fork for CPU %d", cpu);
-
-       init_gdt(cpu);
-       per_cpu(current_task, cpu) = idle;
-       early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu);
-
-       idle->thread.eip = (unsigned long) start_secondary;
-       /* start_eip had better be page-aligned! */
-       start_eip = setup_trampoline();
-
-       ++cpucount;
-       alternatives_smp_switch(1);
-
-       /* So we see what's up   */
-       printk("Booting processor %d/%d eip %lx\n", cpu, apicid, start_eip);
-       /* Stack for startup_32 can be just as for start_secondary onwards */
-       stack_start.esp = (void *) idle->thread.esp;
-
-       irq_ctx_init(cpu);
-
-       x86_cpu_to_apicid[cpu] = apicid;
-       /*
-        * This grunge runs the startup process for
-        * the targeted processor.
-        */
-
-       atomic_set(&init_deasserted, 0);
-
-       Dprintk("Setting warm reset code and vector.\n");
-
-       store_NMI_vector(&nmi_high, &nmi_low);
-
-       smpboot_setup_warm_reset_vector(start_eip);
-
-       /*
-        * Starting actual IPI sequence...
-        */
-       boot_error = wakeup_secondary_cpu(apicid, start_eip);
-
-       if (!boot_error) {
-               /*
-                * allow APs to start initializing.
-                */
-               Dprintk("Before Callout %d.\n", cpu);
-               cpu_set(cpu, cpu_callout_map);
-               Dprintk("After Callout %d.\n", cpu);
-
-               /*
-                * Wait 5s total for a response
-                */
-               for (timeout = 0; timeout < 50000; timeout++) {
-                       if (cpu_isset(cpu, cpu_callin_map))
-                               break;  /* It has booted */
-                       udelay(100);
-               }
-
-               if (cpu_isset(cpu, cpu_callin_map)) {
-                       /* number CPUs logically, starting from 1 (BSP is 0) */
-                       Dprintk("OK.\n");
-                       printk("CPU%d: ", cpu);
-                       print_cpu_info(&cpu_data[cpu]);
-                       Dprintk("CPU has booted.\n");
-               } else {
-                       boot_error= 1;
-                       if (*((volatile unsigned char *)trampoline_base)
-                                       == 0xA5)
-                               /* trampoline started but...? */
-                               printk("Stuck ??\n");
-                       else
-                               /* trampoline code not run */
-                               printk("Not responding.\n");
-                       inquire_remote_apic(apicid);
-               }
-       }
-
-       if (boot_error) {
-               /* Try to put things back the way they were before ... */
-               unmap_cpu_to_logical_apicid(cpu);
-               cpu_clear(cpu, cpu_callout_map); /* was set here (do_boot_cpu()) */
-               cpu_clear(cpu, cpu_initialized); /* was set by cpu_init() */
-               cpucount--;
-       } else {
-               x86_cpu_to_apicid[cpu] = apicid;
-               cpu_set(cpu, cpu_present_map);
-       }
-
-       /* mark "stuck" area as not stuck */
-       *((volatile unsigned long *)trampoline_base) = 0;
-
-       return boot_error;
-}
-
-#ifdef CONFIG_HOTPLUG_CPU
-void cpu_exit_clear(void)
-{
-       int cpu = raw_smp_processor_id();
-
-       idle_task_exit();
-
-       cpucount --;
-       cpu_uninit();
-       irq_ctx_exit(cpu);
-
-       cpu_clear(cpu, cpu_callout_map);
-       cpu_clear(cpu, cpu_callin_map);
-
-       cpu_clear(cpu, smp_commenced_mask);
-       unmap_cpu_to_logical_apicid(cpu);
-}
-
-struct warm_boot_cpu_info {
-       struct completion *complete;
-       struct work_struct task;
-       int apicid;
-       int cpu;
-};
-
-static void __cpuinit do_warm_boot_cpu(struct work_struct *work)
-{
-       struct warm_boot_cpu_info *info =
-               container_of(work, struct warm_boot_cpu_info, task);
-       do_boot_cpu(info->apicid, info->cpu);
-       complete(info->complete);
-}
-
-static int __cpuinit __smp_prepare_cpu(int cpu)
-{
-       DECLARE_COMPLETION_ONSTACK(done);
-       struct warm_boot_cpu_info info;
-       int     apicid, ret;
-
-       apicid = x86_cpu_to_apicid[cpu];
-       if (apicid == BAD_APICID) {
-               ret = -ENODEV;
-               goto exit;
-       }
-
-       info.complete = &done;
-       info.apicid = apicid;
-       info.cpu = cpu;
-       INIT_WORK(&info.task, do_warm_boot_cpu);
-
-       /* init low mem mapping */
-       clone_pgd_range(swapper_pg_dir, swapper_pg_dir + USER_PGD_PTRS,
-                       min_t(unsigned long, KERNEL_PGD_PTRS, USER_PGD_PTRS));
-       flush_tlb_all();
-       schedule_work(&info.task);
-       wait_for_completion(&done);
-
-       zap_low_mappings();
-       ret = 0;
-exit:
-       return ret;
-}
-#endif
-
-/*
- * Cycle through the processors sending APIC IPIs to boot each.
- */
-
-static int boot_cpu_logical_apicid;
-/* Where the IO area was mapped on multiquad, always 0 otherwise */
-void *xquad_portio;
-#ifdef CONFIG_X86_NUMAQ
-EXPORT_SYMBOL(xquad_portio);
-#endif
-
-static void __init smp_boot_cpus(unsigned int max_cpus)
-{
-       int apicid, cpu, bit, kicked;
-       unsigned long bogosum = 0;
-
-       /*
-        * Setup boot CPU information
-        */
-       smp_store_cpu_info(0); /* Final full version of the data */
-       printk("CPU%d: ", 0);
-       print_cpu_info(&cpu_data[0]);
-
-       boot_cpu_physical_apicid = GET_APIC_ID(apic_read(APIC_ID));
-       boot_cpu_logical_apicid = logical_smp_processor_id();
-       x86_cpu_to_apicid[0] = boot_cpu_physical_apicid;
-
-       current_thread_info()->cpu = 0;
-
-       set_cpu_sibling_map(0);
-
-       /*
-        * If we couldn't find an SMP configuration at boot time,
-        * get out of here now!
-        */
-       if (!smp_found_config && !acpi_lapic) {
-               printk(KERN_NOTICE "SMP motherboard not detected.\n");
-               smpboot_clear_io_apic_irqs();
-               phys_cpu_present_map = physid_mask_of_physid(0);
-               if (APIC_init_uniprocessor())
-                       printk(KERN_NOTICE "Local APIC not detected."
-                                          " Using dummy APIC emulation.\n");
-               map_cpu_to_logical_apicid();
-               cpu_set(0, cpu_sibling_map[0]);
-               cpu_set(0, cpu_core_map[0]);
-               return;
-       }
-
-       /*
-        * Should not be necessary because the MP table should list the boot
-        * CPU too, but we do it for the sake of robustness anyway.
-        * Makes no sense to do this check in clustered apic mode, so skip it
-        */
-       if (!check_phys_apicid_present(boot_cpu_physical_apicid)) {
-               printk("weird, boot CPU (#%d) not listed by the BIOS.\n",
-                               boot_cpu_physical_apicid);
-               physid_set(hard_smp_processor_id(), phys_cpu_present_map);
-       }
-
-       /*
-        * If we couldn't find a local APIC, then get out of here now!
-        */
-       if (APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid]) && !cpu_has_apic) {
-               printk(KERN_ERR "BIOS bug, local APIC #%d not detected!...\n",
-                       boot_cpu_physical_apicid);
-               printk(KERN_ERR "... forcing use of dummy APIC emulation. (tell your hw vendor)\n");
-               smpboot_clear_io_apic_irqs();
-               phys_cpu_present_map = physid_mask_of_physid(0);
-               cpu_set(0, cpu_sibling_map[0]);
-               cpu_set(0, cpu_core_map[0]);
-               return;
-       }
-
-       verify_local_APIC();
-
-       /*
-        * If SMP should be disabled, then really disable it!
-        */
-       if (!max_cpus) {
-               smp_found_config = 0;
-               printk(KERN_INFO "SMP mode deactivated, forcing use of dummy APIC emulation.\n");
-               smpboot_clear_io_apic_irqs();
-               phys_cpu_present_map = physid_mask_of_physid(0);
-               cpu_set(0, cpu_sibling_map[0]);
-               cpu_set(0, cpu_core_map[0]);
-               return;
-       }
-
-       connect_bsp_APIC();
-       setup_local_APIC();
-       map_cpu_to_logical_apicid();
-
-
-       setup_portio_remap();
-
-       /*
-        * Scan the CPU present map and fire up the other CPUs via do_boot_cpu
-        *
-        * In clustered apic mode, phys_cpu_present_map is a constructed thus:
-        * bits 0-3 are quad0, 4-7 are quad1, etc. A perverse twist on the 
-        * clustered apic ID.
-        */
-       Dprintk("CPU present map: %lx\n", physids_coerce(phys_cpu_present_map));
-
-       kicked = 1;
-       for (bit = 0; kicked < NR_CPUS && bit < MAX_APICS; bit++) {
-               apicid = cpu_present_to_apicid(bit);
-               /*
-                * Don't even attempt to start the boot CPU!
-                */
-               if ((apicid == boot_cpu_apicid) || (apicid == BAD_APICID))
-                       continue;
-
-               if (!check_apicid_present(bit))
-                       continue;
-               if (max_cpus <= cpucount+1)
-                       continue;
-
-               if (((cpu = alloc_cpu_id()) <= 0) || do_boot_cpu(apicid, cpu))
-                       printk("CPU #%d not responding - cannot use it.\n",
-                                                               apicid);
-               else
-                       ++kicked;
-       }
-
-       /*
-        * Cleanup possible dangling ends...
-        */
-       smpboot_restore_warm_reset_vector();
-
-       /*
-        * Allow the user to impress friends.
-        */
-       Dprintk("Before bogomips.\n");
-       for (cpu = 0; cpu < NR_CPUS; cpu++)
-               if (cpu_isset(cpu, cpu_callout_map))
-                       bogosum += cpu_data[cpu].loops_per_jiffy;
-       printk(KERN_INFO
-               "Total of %d processors activated (%lu.%02lu BogoMIPS).\n",
-               cpucount+1,
-               bogosum/(500000/HZ),
-               (bogosum/(5000/HZ))%100);
-       
-       Dprintk("Before bogocount - setting activated=1.\n");
-
-       if (smp_b_stepping)
-               printk(KERN_WARNING "WARNING: SMP operation may be unreliable with B stepping processors.\n");
-
-       /*
-        * Don't taint if we are running SMP kernel on a single non-MP
-        * approved Athlon
-        */
-       if (tainted & TAINT_UNSAFE_SMP) {
-               if (cpucount)
-                       printk (KERN_INFO "WARNING: This combination of AMD processors is not suitable for SMP.\n");
-               else
-                       tainted &= ~TAINT_UNSAFE_SMP;
-       }
-
-       Dprintk("Boot done.\n");
-
-       /*
-        * construct cpu_sibling_map[], so that we can tell sibling CPUs
-        * efficiently.
-        */
-       for (cpu = 0; cpu < NR_CPUS; cpu++) {
-               cpus_clear(cpu_sibling_map[cpu]);
-               cpus_clear(cpu_core_map[cpu]);
-       }
-
-       cpu_set(0, cpu_sibling_map[0]);
-       cpu_set(0, cpu_core_map[0]);
-
-       smpboot_setup_io_apic();
-
-       setup_boot_clock();
-}
-
-/* These are wrappers to interface to the new boot process.  Someone
-   who understands all this stuff should rewrite it properly. --RR 15/Jul/02 */
-void __init native_smp_prepare_cpus(unsigned int max_cpus)
-{
-       smp_commenced_mask = cpumask_of_cpu(0);
-       cpu_callin_map = cpumask_of_cpu(0);
-       mb();
-       smp_boot_cpus(max_cpus);
-}
-
-void __init native_smp_prepare_boot_cpu(void)
-{
-       unsigned int cpu = smp_processor_id();
-
-       init_gdt(cpu);
-       switch_to_new_gdt();
-
-       cpu_set(cpu, cpu_online_map);
-       cpu_set(cpu, cpu_callout_map);
-       cpu_set(cpu, cpu_present_map);
-       cpu_set(cpu, cpu_possible_map);
-       __get_cpu_var(cpu_state) = CPU_ONLINE;
-}
-
-#ifdef CONFIG_HOTPLUG_CPU
-void remove_siblinginfo(int cpu)
-{
-       int sibling;
-       struct cpuinfo_x86 *c = cpu_data;
-
-       for_each_cpu_mask(sibling, cpu_core_map[cpu]) {
-               cpu_clear(cpu, cpu_core_map[sibling]);
-               /*
-                * last thread sibling in this cpu core going down
-                */
-               if (cpus_weight(cpu_sibling_map[cpu]) == 1)
-                       c[sibling].booted_cores--;
-       }
-                       
-       for_each_cpu_mask(sibling, cpu_sibling_map[cpu])
-               cpu_clear(cpu, cpu_sibling_map[sibling]);
-       cpus_clear(cpu_sibling_map[cpu]);
-       cpus_clear(cpu_core_map[cpu]);
-       c[cpu].phys_proc_id = 0;
-       c[cpu].cpu_core_id = 0;
-       cpu_clear(cpu, cpu_sibling_setup_map);
-}
-
-int __cpu_disable(void)
-{
-       cpumask_t map = cpu_online_map;
-       int cpu = smp_processor_id();
-
-       /*
-        * Perhaps use cpufreq to drop frequency, but that could go
-        * into generic code.
-        *
-        * We won't take down the boot processor on i386 due to some
-        * interrupts only being able to be serviced by the BSP.
-        * Especially so if we're not using an IOAPIC   -zwane
-        */
-       if (cpu == 0)
-               return -EBUSY;
-       if (nmi_watchdog == NMI_LOCAL_APIC)
-               stop_apic_nmi_watchdog(NULL);
-       clear_local_APIC();
-       /* Allow any queued timer interrupts to get serviced */
-       local_irq_enable();
-       mdelay(1);
-       local_irq_disable();
-
-       remove_siblinginfo(cpu);
-
-       cpu_clear(cpu, map);
-       fixup_irqs(map);
-       /* It's now safe to remove this processor from the online map */
-       cpu_clear(cpu, cpu_online_map);
-       return 0;
-}
-
-void __cpu_die(unsigned int cpu)
-{
-       /* We don't do anything here: idle task is faking death itself. */
-       unsigned int i;
-
-       for (i = 0; i < 10; i++) {
-               /* They ack this in play_dead by setting CPU_DEAD */
-               if (per_cpu(cpu_state, cpu) == CPU_DEAD) {
-                       printk ("CPU %d is now offline\n", cpu);
-                       if (1 == num_online_cpus())
-                               alternatives_smp_switch(0);
-                       return;
-               }
-               msleep(100);
-       }
-       printk(KERN_ERR "CPU %u didn't die...\n", cpu);
-}
-#else /* ... !CONFIG_HOTPLUG_CPU */
-int __cpu_disable(void)
-{
-       return -ENOSYS;
-}
-
-void __cpu_die(unsigned int cpu)
-{
-       /* We said "no" in __cpu_disable */
-       BUG();
-}
-#endif /* CONFIG_HOTPLUG_CPU */
-
-int __cpuinit native_cpu_up(unsigned int cpu)
-{
-       unsigned long flags;
-#ifdef CONFIG_HOTPLUG_CPU
-       int ret = 0;
-
-       /*
-        * We do warm boot only on cpus that had booted earlier
-        * Otherwise cold boot is all handled from smp_boot_cpus().
-        * cpu_callin_map is set during AP kickstart process. Its reset
-        * when a cpu is taken offline from cpu_exit_clear().
-        */
-       if (!cpu_isset(cpu, cpu_callin_map))
-               ret = __smp_prepare_cpu(cpu);
-
-       if (ret)
-               return -EIO;
-#endif
-
-       /* In case one didn't come up */
-       if (!cpu_isset(cpu, cpu_callin_map)) {
-               printk(KERN_DEBUG "skipping cpu%d, didn't come online\n", cpu);
-               return -EIO;
-       }
-
-       per_cpu(cpu_state, cpu) = CPU_UP_PREPARE;
-       /* Unleash the CPU! */
-       cpu_set(cpu, smp_commenced_mask);
-
-       /*
-        * Check TSC synchronization with the AP (keep irqs disabled
-        * while doing so):
-        */
-       local_irq_save(flags);
-       check_tsc_sync_source(cpu);
-       local_irq_restore(flags);
-
-       while (!cpu_isset(cpu, cpu_online_map)) {
-               cpu_relax();
-               touch_nmi_watchdog();
-       }
-
-       return 0;
-}
-
-void __init native_smp_cpus_done(unsigned int max_cpus)
-{
-#ifdef CONFIG_X86_IO_APIC
-       setup_ioapic_dest();
-#endif
-       zap_low_mappings();
-#ifndef CONFIG_HOTPLUG_CPU
-       /*
-        * Disable executability of the SMP trampoline:
-        */
-       set_kernel_exec((unsigned long)trampoline_base, trampoline_exec);
-#endif
-}
-
-void __init smp_intr_init(void)
-{
-       /*
-        * IRQ0 must be given a fixed assignment and initialized,
-        * because it's used before the IO-APIC is set up.
-        */
-       set_intr_gate(FIRST_DEVICE_VECTOR, interrupt[0]);
-
-       /*
-        * The reschedule interrupt is a CPU-to-CPU reschedule-helper
-        * IPI, driven by wakeup.
-        */
-       set_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt);
-
-       /* IPI for invalidation */
-       set_intr_gate(INVALIDATE_TLB_VECTOR, invalidate_interrupt);
-
-       /* IPI for generic function call */
-       set_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
-}
-
-/*
- * If the BIOS enumerates physical processors before logical,
- * maxcpus=N at enumeration-time can be used to disable HT.
- */
-static int __init parse_maxcpus(char *arg)
-{
-       extern unsigned int maxcpus;
-
-       maxcpus = simple_strtoul(arg, NULL, 0);
-       return 0;
-}
-early_param("maxcpus", parse_maxcpus);
diff --git a/arch/i386/kernel/smpcommon_32.c b/arch/i386/kernel/smpcommon_32.c
deleted file mode 100644 (file)
index bbfe85a..0000000
+++ /dev/null
@@ -1,81 +0,0 @@
-/*
- * SMP stuff which is common to all sub-architectures.
- */
-#include <linux/module.h>
-#include <asm/smp.h>
-
-DEFINE_PER_CPU(unsigned long, this_cpu_off);
-EXPORT_PER_CPU_SYMBOL(this_cpu_off);
-
-/* Initialize the CPU's GDT.  This is either the boot CPU doing itself
-   (still using the master per-cpu area), or a CPU doing it for a
-   secondary which will soon come up. */
-__cpuinit void init_gdt(int cpu)
-{
-       struct desc_struct *gdt = get_cpu_gdt_table(cpu);
-
-       pack_descriptor((u32 *)&gdt[GDT_ENTRY_PERCPU].a,
-                       (u32 *)&gdt[GDT_ENTRY_PERCPU].b,
-                       __per_cpu_offset[cpu], 0xFFFFF,
-                       0x80 | DESCTYPE_S | 0x2, 0x8);
-
-       per_cpu(this_cpu_off, cpu) = __per_cpu_offset[cpu];
-       per_cpu(cpu_number, cpu) = cpu;
-}
-
-
-/**
- * smp_call_function(): Run a function on all other CPUs.
- * @func: The function to run. This must be fast and non-blocking.
- * @info: An arbitrary pointer to pass to the function.
- * @nonatomic: Unused.
- * @wait: If true, wait (atomically) until function has completed on other CPUs.
- *
- * Returns 0 on success, else a negative status code.
- *
- * If @wait is true, then returns once @func has returned; otherwise
- * it returns just before the target cpu calls @func.
- *
- * You must not call this function with disabled interrupts or from a
- * hardware interrupt handler or from a bottom half handler.
- */
-int smp_call_function(void (*func) (void *info), void *info, int nonatomic,
-                     int wait)
-{
-       return smp_call_function_mask(cpu_online_map, func, info, wait);
-}
-EXPORT_SYMBOL(smp_call_function);
-
-/**
- * smp_call_function_single - Run a function on a specific CPU
- * @cpu: The target CPU.  Cannot be the calling CPU.
- * @func: The function to run. This must be fast and non-blocking.
- * @info: An arbitrary pointer to pass to the function.
- * @nonatomic: Unused.
- * @wait: If true, wait until function has completed on other CPUs.
- *
- * Returns 0 on success, else a negative status code.
- *
- * If @wait is true, then returns once @func has returned; otherwise
- * it returns just before the target cpu calls @func.
- */
-int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
-                            int nonatomic, int wait)
-{
-       /* prevent preemption and reschedule on another processor */
-       int ret;
-       int me = get_cpu();
-       if (cpu == me) {
-               local_irq_disable();
-               func(info);
-               local_irq_enable();
-               put_cpu();
-               return 0;
-       }
-
-       ret = smp_call_function_mask(cpumask_of_cpu(cpu), func, info, wait);
-
-       put_cpu();
-       return ret;
-}
-EXPORT_SYMBOL(smp_call_function_single);
diff --git a/arch/i386/kernel/srat_32.c b/arch/i386/kernel/srat_32.c
deleted file mode 100644 (file)
index 2a8713e..0000000
+++ /dev/null
@@ -1,360 +0,0 @@
-/*
- * Some of the code in this file has been gleaned from the 64 bit 
- * discontigmem support code base.
- *
- * Copyright (C) 2002, IBM Corp.
- *
- * All rights reserved.          
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
- * NON INFRINGEMENT.  See the GNU General Public License for more
- * details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- *
- * Send feedback to Pat Gaughen <gone@us.ibm.com>
- */
-#include <linux/mm.h>
-#include <linux/bootmem.h>
-#include <linux/mmzone.h>
-#include <linux/acpi.h>
-#include <linux/nodemask.h>
-#include <asm/srat.h>
-#include <asm/topology.h>
-#include <asm/smp.h>
-
-/*
- * proximity macros and definitions
- */
-#define NODE_ARRAY_INDEX(x)    ((x) / 8)       /* 8 bits/char */
-#define NODE_ARRAY_OFFSET(x)   ((x) % 8)       /* 8 bits/char */
-#define BMAP_SET(bmap, bit)    ((bmap)[NODE_ARRAY_INDEX(bit)] |= 1 << NODE_ARRAY_OFFSET(bit))
-#define BMAP_TEST(bmap, bit)   ((bmap)[NODE_ARRAY_INDEX(bit)] & (1 << NODE_ARRAY_OFFSET(bit)))
-/* bitmap length; _PXM is at most 255 */
-#define PXM_BITMAP_LEN (MAX_PXM_DOMAINS / 8) 
-static u8 pxm_bitmap[PXM_BITMAP_LEN];  /* bitmap of proximity domains */
-
-#define MAX_CHUNKS_PER_NODE    3
-#define MAXCHUNKS              (MAX_CHUNKS_PER_NODE * MAX_NUMNODES)
-struct node_memory_chunk_s {
-       unsigned long   start_pfn;
-       unsigned long   end_pfn;
-       u8      pxm;            // proximity domain of node
-       u8      nid;            // which cnode contains this chunk?
-       u8      bank;           // which mem bank on this node
-};
-static struct node_memory_chunk_s node_memory_chunk[MAXCHUNKS];
-
-static int num_memory_chunks;          /* total number of memory chunks */
-static u8 __initdata apicid_to_pxm[MAX_APICID];
-
-extern void * boot_ioremap(unsigned long, unsigned long);
-
-/* Identify CPU proximity domains */
-static void __init parse_cpu_affinity_structure(char *p)
-{
-       struct acpi_srat_cpu_affinity *cpu_affinity =
-                               (struct acpi_srat_cpu_affinity *) p;
-
-       if ((cpu_affinity->flags & ACPI_SRAT_CPU_ENABLED) == 0)
-               return;         /* empty entry */
-
-       /* mark this node as "seen" in node bitmap */
-       BMAP_SET(pxm_bitmap, cpu_affinity->proximity_domain_lo);
-
-       apicid_to_pxm[cpu_affinity->apic_id] = cpu_affinity->proximity_domain_lo;
-
-       printk("CPU 0x%02X in proximity domain 0x%02X\n",
-               cpu_affinity->apic_id, cpu_affinity->proximity_domain_lo);
-}
-
-/*
- * Identify memory proximity domains and hot-remove capabilities.
- * Fill node memory chunk list structure.
- */
-static void __init parse_memory_affinity_structure (char *sratp)
-{
-       unsigned long long paddr, size;
-       unsigned long start_pfn, end_pfn;
-       u8 pxm;
-       struct node_memory_chunk_s *p, *q, *pend;
-       struct acpi_srat_mem_affinity *memory_affinity =
-                       (struct acpi_srat_mem_affinity *) sratp;
-
-       if ((memory_affinity->flags & ACPI_SRAT_MEM_ENABLED) == 0)
-               return;         /* empty entry */
-
-       pxm = memory_affinity->proximity_domain & 0xff;
-
-       /* mark this node as "seen" in node bitmap */
-       BMAP_SET(pxm_bitmap, pxm);
-
-       /* calculate info for memory chunk structure */
-       paddr = memory_affinity->base_address;
-       size = memory_affinity->length;
-
-       start_pfn = paddr >> PAGE_SHIFT;
-       end_pfn = (paddr + size) >> PAGE_SHIFT;
-
-
-       if (num_memory_chunks >= MAXCHUNKS) {
-               printk("Too many mem chunks in SRAT. Ignoring %lld MBytes at %llx\n",
-                       size/(1024*1024), paddr);
-               return;
-       }
-
-       /* Insertion sort based on base address */
-       pend = &node_memory_chunk[num_memory_chunks];
-       for (p = &node_memory_chunk[0]; p < pend; p++) {
-               if (start_pfn < p->start_pfn)
-                       break;
-       }
-       if (p < pend) {
-               for (q = pend; q >= p; q--)
-                       *(q + 1) = *q;
-       }
-       p->start_pfn = start_pfn;
-       p->end_pfn = end_pfn;
-       p->pxm = pxm;
-
-       num_memory_chunks++;
-
-       printk("Memory range 0x%lX to 0x%lX (type 0x%X) in proximity domain 0x%02X %s\n",
-               start_pfn, end_pfn,
-               memory_affinity->memory_type,
-               pxm,
-               ((memory_affinity->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) ?
-                "enabled and removable" : "enabled" ) );
-}
-
-/*
- * The SRAT table always lists ascending addresses, so can always
- * assume that the first "start" address that you see is the real
- * start of the node, and that the current "end" address is after
- * the previous one.
- */
-static __init void node_read_chunk(int nid, struct node_memory_chunk_s *memory_chunk)
-{
-       /*
-        * Only add present memory as told by the e820.
-        * There is no guarantee from the SRAT that the memory it
-        * enumerates is present at boot time because it represents
-        * *possible* memory hotplug areas the same as normal RAM.
-        */
-       if (memory_chunk->start_pfn >= max_pfn) {
-               printk (KERN_INFO "Ignoring SRAT pfns: 0x%08lx -> %08lx\n",
-                       memory_chunk->start_pfn, memory_chunk->end_pfn);
-               return;
-       }
-       if (memory_chunk->nid != nid)
-               return;
-
-       if (!node_has_online_mem(nid))
-               node_start_pfn[nid] = memory_chunk->start_pfn;
-
-       if (node_start_pfn[nid] > memory_chunk->start_pfn)
-               node_start_pfn[nid] = memory_chunk->start_pfn;
-
-       if (node_end_pfn[nid] < memory_chunk->end_pfn)
-               node_end_pfn[nid] = memory_chunk->end_pfn;
-}
-
-/* Parse the ACPI Static Resource Affinity Table */
-static int __init acpi20_parse_srat(struct acpi_table_srat *sratp)
-{
-       u8 *start, *end, *p;
-       int i, j, nid;
-
-       start = (u8 *)(&(sratp->reserved) + 1); /* skip header */
-       p = start;
-       end = (u8 *)sratp + sratp->header.length;
-
-       memset(pxm_bitmap, 0, sizeof(pxm_bitmap));      /* init proximity domain bitmap */
-       memset(node_memory_chunk, 0, sizeof(node_memory_chunk));
-
-       num_memory_chunks = 0;
-       while (p < end) {
-               switch (*p) {
-               case ACPI_SRAT_TYPE_CPU_AFFINITY:
-                       parse_cpu_affinity_structure(p);
-                       break;
-               case ACPI_SRAT_TYPE_MEMORY_AFFINITY:
-                       parse_memory_affinity_structure(p);
-                       break;
-               default:
-                       printk("ACPI 2.0 SRAT: unknown entry skipped: type=0x%02X, len=%d\n", p[0], p[1]);
-                       break;
-               }
-               p += p[1];
-               if (p[1] == 0) {
-                       printk("acpi20_parse_srat: Entry length value is zero;"
-                               " can't parse any further!\n");
-                       break;
-               }
-       }
-
-       if (num_memory_chunks == 0) {
-               printk("could not finy any ACPI SRAT memory areas.\n");
-               goto out_fail;
-       }
-
-       /* Calculate total number of nodes in system from PXM bitmap and create
-        * a set of sequential node IDs starting at zero.  (ACPI doesn't seem
-        * to specify the range of _PXM values.)
-        */
-       /*
-        * MCD - we no longer HAVE to number nodes sequentially.  PXM domain
-        * numbers could go as high as 256, and MAX_NUMNODES for i386 is typically
-        * 32, so we will continue numbering them in this manner until MAX_NUMNODES
-        * approaches MAX_PXM_DOMAINS for i386.
-        */
-       nodes_clear(node_online_map);
-       for (i = 0; i < MAX_PXM_DOMAINS; i++) {
-               if (BMAP_TEST(pxm_bitmap, i)) {
-                       int nid = acpi_map_pxm_to_node(i);
-                       node_set_online(nid);
-               }
-       }
-       BUG_ON(num_online_nodes() == 0);
-
-       /* set cnode id in memory chunk structure */
-       for (i = 0; i < num_memory_chunks; i++)
-               node_memory_chunk[i].nid = pxm_to_node(node_memory_chunk[i].pxm);
-
-       printk("pxm bitmap: ");
-       for (i = 0; i < sizeof(pxm_bitmap); i++) {
-               printk("%02X ", pxm_bitmap[i]);
-       }
-       printk("\n");
-       printk("Number of logical nodes in system = %d\n", num_online_nodes());
-       printk("Number of memory chunks in system = %d\n", num_memory_chunks);
-
-       for (i = 0; i < MAX_APICID; i++)
-               apicid_2_node[i] = pxm_to_node(apicid_to_pxm[i]);
-
-       for (j = 0; j < num_memory_chunks; j++){
-               struct node_memory_chunk_s * chunk = &node_memory_chunk[j];
-               printk("chunk %d nid %d start_pfn %08lx end_pfn %08lx\n",
-                      j, chunk->nid, chunk->start_pfn, chunk->end_pfn);
-               node_read_chunk(chunk->nid, chunk);
-               add_active_range(chunk->nid, chunk->start_pfn, chunk->end_pfn);
-       }
-       for_each_online_node(nid) {
-               unsigned long start = node_start_pfn[nid];
-               unsigned long end = node_end_pfn[nid];
-
-               memory_present(nid, start, end);
-               node_remap_size[nid] = node_memmap_size_bytes(nid, start, end);
-       }
-       return 1;
-out_fail:
-       return 0;
-}
-
-struct acpi_static_rsdt {
-       struct acpi_table_rsdt table;
-       u32 padding[7]; /* Allow for 7 more table entries */
-};
-
-int __init get_memcfg_from_srat(void)
-{
-       struct acpi_table_header *header = NULL;
-       struct acpi_table_rsdp *rsdp = NULL;
-       struct acpi_table_rsdt *rsdt = NULL;
-       acpi_native_uint rsdp_address = 0;
-       struct acpi_static_rsdt saved_rsdt;
-       int tables = 0;
-       int i = 0;
-
-       rsdp_address = acpi_find_rsdp();
-       if (!rsdp_address) {
-               printk("%s: System description tables not found\n",
-                      __FUNCTION__);
-               goto out_err;
-       }
-
-       printk("%s: assigning address to rsdp\n", __FUNCTION__);
-       rsdp = (struct acpi_table_rsdp *)(u32)rsdp_address;
-       if (!rsdp) {
-               printk("%s: Didn't find ACPI root!\n", __FUNCTION__);
-               goto out_err;
-       }
-
-       printk(KERN_INFO "%.8s v%d [%.6s]\n", rsdp->signature, rsdp->revision,
-               rsdp->oem_id);
-
-       if (strncmp(rsdp->signature, ACPI_SIG_RSDP,strlen(ACPI_SIG_RSDP))) {
-               printk(KERN_WARNING "%s: RSDP table signature incorrect\n", __FUNCTION__);
-               goto out_err;
-       }
-
-       rsdt = (struct acpi_table_rsdt *)
-           boot_ioremap(rsdp->rsdt_physical_address, sizeof(struct acpi_table_rsdt));
-
-       if (!rsdt) {
-               printk(KERN_WARNING
-                      "%s: ACPI: Invalid root system description tables (RSDT)\n",
-                      __FUNCTION__);
-               goto out_err;
-       }
-
-       header = &rsdt->header;
-
-       if (strncmp(header->signature, ACPI_SIG_RSDT, strlen(ACPI_SIG_RSDT))) {
-               printk(KERN_WARNING "ACPI: RSDT signature incorrect\n");
-               goto out_err;
-       }
-
-       /* 
-        * The number of tables is computed by taking the 
-        * size of all entries (header size minus total 
-        * size of RSDT) divided by the size of each entry
-        * (4-byte table pointers).
-        */
-       tables = (header->length - sizeof(struct acpi_table_header)) / 4;
-
-       if (!tables)
-               goto out_err;
-
-       memcpy(&saved_rsdt, rsdt, sizeof(saved_rsdt));
-
-       if (saved_rsdt.table.header.length > sizeof(saved_rsdt)) {
-               printk(KERN_WARNING "ACPI: Too big length in RSDT: %d\n",
-                      saved_rsdt.table.header.length);
-               goto out_err;
-       }
-
-       printk("Begin SRAT table scan....\n");
-
-       for (i = 0; i < tables; i++) {
-               /* Map in header, then map in full table length. */
-               header = (struct acpi_table_header *)
-                       boot_ioremap(saved_rsdt.table.table_offset_entry[i], sizeof(struct acpi_table_header));
-               if (!header)
-                       break;
-               header = (struct acpi_table_header *)
-                       boot_ioremap(saved_rsdt.table.table_offset_entry[i], header->length);
-               if (!header)
-                       break;
-
-               if (strncmp((char *) &header->signature, ACPI_SIG_SRAT, 4))
-                       continue;
-
-               /* we've found the srat table. don't need to look at any more tables */
-               return acpi20_parse_srat((struct acpi_table_srat *)header);
-       }
-out_err:
-       remove_all_active_ranges();
-       printk("failed to get NUMA memory information from SRAT table\n");
-       return 0;
-}
diff --git a/arch/i386/kernel/summit_32.c b/arch/i386/kernel/summit_32.c
deleted file mode 100644 (file)
index d0e01a3..0000000
+++ /dev/null
@@ -1,180 +0,0 @@
-/*
- * arch/i386/kernel/summit.c - IBM Summit-Specific Code
- *
- * Written By: Matthew Dobson, IBM Corporation
- *
- * Copyright (c) 2003 IBM Corp.
- *
- * All rights reserved.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or (at
- * your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
- * NON INFRINGEMENT.  See the GNU General Public License for more
- * details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- *
- * Send feedback to <colpatch@us.ibm.com>
- *
- */
-
-#include <linux/mm.h>
-#include <linux/init.h>
-#include <asm/io.h>
-#include <asm/mach-summit/mach_mpparse.h>
-
-static struct rio_table_hdr *rio_table_hdr __initdata;
-static struct scal_detail   *scal_devs[MAX_NUMNODES] __initdata;
-static struct rio_detail    *rio_devs[MAX_NUMNODES*4] __initdata;
-
-static int __init setup_pci_node_map_for_wpeg(int wpeg_num, int last_bus)
-{
-       int twister = 0, node = 0;
-       int i, bus, num_buses;
-
-       for(i = 0; i < rio_table_hdr->num_rio_dev; i++){
-               if (rio_devs[i]->node_id == rio_devs[wpeg_num]->owner_id){
-                       twister = rio_devs[i]->owner_id;
-                       break;
-               }
-       }
-       if (i == rio_table_hdr->num_rio_dev){
-               printk(KERN_ERR "%s: Couldn't find owner Cyclone for Winnipeg!\n", __FUNCTION__);
-               return last_bus;
-       }
-
-       for(i = 0; i < rio_table_hdr->num_scal_dev; i++){
-               if (scal_devs[i]->node_id == twister){
-                       node = scal_devs[i]->node_id;
-                       break;
-               }
-       }
-       if (i == rio_table_hdr->num_scal_dev){
-               printk(KERN_ERR "%s: Couldn't find owner Twister for Cyclone!\n", __FUNCTION__);
-               return last_bus;
-       }
-
-       switch (rio_devs[wpeg_num]->type){
-       case CompatWPEG:
-               /* The Compatability Winnipeg controls the 2 legacy buses,
-                * the 66MHz PCI bus [2 slots] and the 2 "extra" buses in case
-                * a PCI-PCI bridge card is used in either slot: total 5 buses.
-                */
-               num_buses = 5;
-               break;
-       case AltWPEG:
-               /* The Alternate Winnipeg controls the 2 133MHz buses [1 slot
-                * each], their 2 "extra" buses, the 100MHz bus [2 slots] and
-                * the "extra" buses for each of those slots: total 7 buses.
-                */
-               num_buses = 7;
-               break;
-       case LookOutAWPEG:
-       case LookOutBWPEG:
-               /* A Lookout Winnipeg controls 3 100MHz buses [2 slots each]
-                * & the "extra" buses for each of those slots: total 9 buses.
-                */
-               num_buses = 9;
-               break;
-       default:
-               printk(KERN_INFO "%s: Unsupported Winnipeg type!\n", __FUNCTION__);
-               return last_bus;
-       }
-
-       for(bus = last_bus; bus < last_bus + num_buses; bus++)
-               mp_bus_id_to_node[bus] = node;
-       return bus;
-}
-
-static int __init build_detail_arrays(void)
-{
-       unsigned long ptr;
-       int i, scal_detail_size, rio_detail_size;
-
-       if (rio_table_hdr->num_scal_dev > MAX_NUMNODES){
-               printk(KERN_WARNING "%s: MAX_NUMNODES too low!  Defined as %d, but system has %d nodes.\n", __FUNCTION__, MAX_NUMNODES, rio_table_hdr->num_scal_dev);
-               return 0;
-       }
-
-       switch (rio_table_hdr->version){
-       default:
-               printk(KERN_WARNING "%s: Invalid Rio Grande Table Version: %d\n", __FUNCTION__, rio_table_hdr->version);
-               return 0;
-       case 2:
-               scal_detail_size = 11;
-               rio_detail_size = 13;
-               break;
-       case 3:
-               scal_detail_size = 12;
-               rio_detail_size = 15;
-               break;
-       }
-
-       ptr = (unsigned long)rio_table_hdr + 3;
-       for(i = 0; i < rio_table_hdr->num_scal_dev; i++, ptr += scal_detail_size)
-               scal_devs[i] = (struct scal_detail *)ptr;
-
-       for(i = 0; i < rio_table_hdr->num_rio_dev; i++, ptr += rio_detail_size)
-               rio_devs[i] = (struct rio_detail *)ptr;
-
-       return 1;
-}
-
-void __init setup_summit(void)
-{
-       unsigned long           ptr;
-       unsigned short          offset;
-       int                     i, next_wpeg, next_bus = 0;
-
-       /* The pointer to the EBDA is stored in the word @ phys 0x40E(40:0E) */
-       ptr = *(unsigned short *)phys_to_virt(0x40Eul);
-       ptr = (unsigned long)phys_to_virt(ptr << 4);
-
-       rio_table_hdr = NULL;
-       offset = 0x180;
-       while (offset){
-               /* The block id is stored in the 2nd word */
-               if (*((unsigned short *)(ptr + offset + 2)) == 0x4752){
-                       /* set the pointer past the offset & block id */
-                       rio_table_hdr = (struct rio_table_hdr *)(ptr + offset + 4);
-                       break;
-               }
-               /* The next offset is stored in the 1st word.  0 means no more */
-               offset = *((unsigned short *)(ptr + offset));
-       }
-       if (!rio_table_hdr){
-               printk(KERN_ERR "%s: Unable to locate Rio Grande Table in EBDA - bailing!\n", __FUNCTION__);
-               return;
-       }
-
-       if (!build_detail_arrays())
-               return;
-
-       /* The first Winnipeg we're looking for has an index of 0 */
-       next_wpeg = 0;
-       do {
-               for(i = 0; i < rio_table_hdr->num_rio_dev; i++){
-                       if (is_WPEG(rio_devs[i]) && rio_devs[i]->WP_index == next_wpeg){
-                               /* It's the Winnipeg we're looking for! */
-                               next_bus = setup_pci_node_map_for_wpeg(i, next_bus);
-                               next_wpeg++;
-                               break;
-                       }
-               }
-               /*
-                * If we go through all Rio devices and don't find one with
-                * the next index, it means we've found all the Winnipegs,
-                * and thus all the PCI buses.
-                */
-               if (i == rio_table_hdr->num_rio_dev)
-                       next_wpeg = 0;
-       } while (next_wpeg != 0);
-}
diff --git a/arch/i386/kernel/sys_i386_32.c b/arch/i386/kernel/sys_i386_32.c
deleted file mode 100644 (file)
index 4214730..0000000
+++ /dev/null
@@ -1,265 +0,0 @@
-/*
- * linux/arch/i386/kernel/sys_i386.c
- *
- * This file contains various random system calls that
- * have a non-standard calling sequence on the Linux/i386
- * platform.
- */
-
-#include <linux/errno.h>
-#include <linux/sched.h>
-#include <linux/mm.h>
-#include <linux/fs.h>
-#include <linux/smp.h>
-#include <linux/sem.h>
-#include <linux/msg.h>
-#include <linux/shm.h>
-#include <linux/stat.h>
-#include <linux/syscalls.h>
-#include <linux/mman.h>
-#include <linux/file.h>
-#include <linux/utsname.h>
-
-#include <asm/uaccess.h>
-#include <asm/unistd.h>
-#include <asm/ipc.h>
-
-/*
- * sys_pipe() is the normal C calling standard for creating
- * a pipe. It's not the way Unix traditionally does this, though.
- */
-asmlinkage int sys_pipe(unsigned long __user * fildes)
-{
-       int fd[2];
-       int error;
-
-       error = do_pipe(fd);
-       if (!error) {
-               if (copy_to_user(fildes, fd, 2*sizeof(int)))
-                       error = -EFAULT;
-       }
-       return error;
-}
-
-asmlinkage long sys_mmap2(unsigned long addr, unsigned long len,
-                         unsigned long prot, unsigned long flags,
-                         unsigned long fd, unsigned long pgoff)
-{
-       int error = -EBADF;
-       struct file *file = NULL;
-       struct mm_struct *mm = current->mm;
-
-       flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
-       if (!(flags & MAP_ANONYMOUS)) {
-               file = fget(fd);
-               if (!file)
-                       goto out;
-       }
-
-       down_write(&mm->mmap_sem);
-       error = do_mmap_pgoff(file, addr, len, prot, flags, pgoff);
-       up_write(&mm->mmap_sem);
-
-       if (file)
-               fput(file);
-out:
-       return error;
-}
-
-/*
- * Perform the select(nd, in, out, ex, tv) and mmap() system
- * calls. Linux/i386 didn't use to be able to handle more than
- * 4 system call parameters, so these system calls used a memory
- * block for parameter passing..
- */
-
-struct mmap_arg_struct {
-       unsigned long addr;
-       unsigned long len;
-       unsigned long prot;
-       unsigned long flags;
-       unsigned long fd;
-       unsigned long offset;
-};
-
-asmlinkage int old_mmap(struct mmap_arg_struct __user *arg)
-{
-       struct mmap_arg_struct a;
-       int err = -EFAULT;
-
-       if (copy_from_user(&a, arg, sizeof(a)))
-               goto out;
-
-       err = -EINVAL;
-       if (a.offset & ~PAGE_MASK)
-               goto out;
-
-       err = sys_mmap2(a.addr, a.len, a.prot, a.flags,
-                       a.fd, a.offset >> PAGE_SHIFT);
-out:
-       return err;
-}
-
-
-struct sel_arg_struct {
-       unsigned long n;
-       fd_set __user *inp, *outp, *exp;
-       struct timeval __user *tvp;
-};
-
-asmlinkage int old_select(struct sel_arg_struct __user *arg)
-{
-       struct sel_arg_struct a;
-
-       if (copy_from_user(&a, arg, sizeof(a)))
-               return -EFAULT;
-       /* sys_select() does the appropriate kernel locking */
-       return sys_select(a.n, a.inp, a.outp, a.exp, a.tvp);
-}
-
-/*
- * sys_ipc() is the de-multiplexer for the SysV IPC calls..
- *
- * This is really horribly ugly.
- */
-asmlinkage int sys_ipc (uint call, int first, int second,
-                       int third, void __user *ptr, long fifth)
-{
-       int version, ret;
-
-       version = call >> 16; /* hack for backward compatibility */
-       call &= 0xffff;
-
-       switch (call) {
-       case SEMOP:
-               return sys_semtimedop (first, (struct sembuf __user *)ptr, second, NULL);
-       case SEMTIMEDOP:
-               return sys_semtimedop(first, (struct sembuf __user *)ptr, second,
-                                       (const struct timespec __user *)fifth);
-
-       case SEMGET:
-               return sys_semget (first, second, third);
-       case SEMCTL: {
-               union semun fourth;
-               if (!ptr)
-                       return -EINVAL;
-               if (get_user(fourth.__pad, (void __user * __user *) ptr))
-                       return -EFAULT;
-               return sys_semctl (first, second, third, fourth);
-       }
-
-       case MSGSND:
-               return sys_msgsnd (first, (struct msgbuf __user *) ptr, 
-                                  second, third);
-       case MSGRCV:
-               switch (version) {
-               case 0: {
-                       struct ipc_kludge tmp;
-                       if (!ptr)
-                               return -EINVAL;
-                       
-                       if (copy_from_user(&tmp,
-                                          (struct ipc_kludge __user *) ptr, 
-                                          sizeof (tmp)))
-                               return -EFAULT;
-                       return sys_msgrcv (first, tmp.msgp, second,
-                                          tmp.msgtyp, third);
-               }
-               default:
-                       return sys_msgrcv (first,
-                                          (struct msgbuf __user *) ptr,
-                                          second, fifth, third);
-               }
-       case MSGGET:
-               return sys_msgget ((key_t) first, second);
-       case MSGCTL:
-               return sys_msgctl (first, second, (struct msqid_ds __user *) ptr);
-
-       case SHMAT:
-               switch (version) {
-               default: {
-                       ulong raddr;
-                       ret = do_shmat (first, (char __user *) ptr, second, &raddr);
-                       if (ret)
-                               return ret;
-                       return put_user (raddr, (ulong __user *) third);
-               }
-               case 1: /* iBCS2 emulator entry point */
-                       if (!segment_eq(get_fs(), get_ds()))
-                               return -EINVAL;
-                       /* The "(ulong *) third" is valid _only_ because of the kernel segment thing */
-                       return do_shmat (first, (char __user *) ptr, second, (ulong *) third);
-               }
-       case SHMDT: 
-               return sys_shmdt ((char __user *)ptr);
-       case SHMGET:
-               return sys_shmget (first, second, third);
-       case SHMCTL:
-               return sys_shmctl (first, second,
-                                  (struct shmid_ds __user *) ptr);
-       default:
-               return -ENOSYS;
-       }
-}
-
-/*
- * Old cruft
- */
-asmlinkage int sys_uname(struct old_utsname __user * name)
-{
-       int err;
-       if (!name)
-               return -EFAULT;
-       down_read(&uts_sem);
-       err = copy_to_user(name, utsname(), sizeof (*name));
-       up_read(&uts_sem);
-       return err?-EFAULT:0;
-}
-
-asmlinkage int sys_olduname(struct oldold_utsname __user * name)
-{
-       int error;
-
-       if (!name)
-               return -EFAULT;
-       if (!access_ok(VERIFY_WRITE,name,sizeof(struct oldold_utsname)))
-               return -EFAULT;
-  
-       down_read(&uts_sem);
-       
-       error = __copy_to_user(&name->sysname, &utsname()->sysname,
-                              __OLD_UTS_LEN);
-       error |= __put_user(0, name->sysname + __OLD_UTS_LEN);
-       error |= __copy_to_user(&name->nodename, &utsname()->nodename,
-                               __OLD_UTS_LEN);
-       error |= __put_user(0, name->nodename + __OLD_UTS_LEN);
-       error |= __copy_to_user(&name->release, &utsname()->release,
-                               __OLD_UTS_LEN);
-       error |= __put_user(0, name->release + __OLD_UTS_LEN);
-       error |= __copy_to_user(&name->version, &utsname()->version,
-                               __OLD_UTS_LEN);
-       error |= __put_user(0, name->version + __OLD_UTS_LEN);
-       error |= __copy_to_user(&name->machine, &utsname()->machine,
-                               __OLD_UTS_LEN);
-       error |= __put_user(0, name->machine + __OLD_UTS_LEN);
-       
-       up_read(&uts_sem);
-       
-       error = error ? -EFAULT : 0;
-
-       return error;
-}
-
-
-/*
- * Do a system call from kernel instead of calling sys_execve so we
- * end up with proper pt_regs.
- */
-int kernel_execve(const char *filename, char *const argv[], char *const envp[])
-{
-       long __res;
-       asm volatile ("push %%ebx ; movl %2,%%ebx ; int $0x80 ; pop %%ebx"
-       : "=a" (__res)
-       : "0" (__NR_execve),"ri" (filename),"c" (argv), "d" (envp) : "memory");
-       return __res;
-}
diff --git a/arch/i386/kernel/syscall_table_32.S b/arch/i386/kernel/syscall_table_32.S
deleted file mode 100644 (file)
index 8344c70..0000000
+++ /dev/null
@@ -1,326 +0,0 @@
-ENTRY(sys_call_table)
-       .long sys_restart_syscall       /* 0 - old "setup()" system call, used for restarting */
-       .long sys_exit
-       .long sys_fork
-       .long sys_read
-       .long sys_write
-       .long sys_open          /* 5 */
-       .long sys_close
-       .long sys_waitpid
-       .long sys_creat
-       .long sys_link
-       .long sys_unlink        /* 10 */
-       .long sys_execve
-       .long sys_chdir
-       .long sys_time
-       .long sys_mknod
-       .long sys_chmod         /* 15 */
-       .long sys_lchown16
-       .long sys_ni_syscall    /* old break syscall holder */
-       .long sys_stat
-       .long sys_lseek
-       .long sys_getpid        /* 20 */
-       .long sys_mount
-       .long sys_oldumount
-       .long sys_setuid16
-       .long sys_getuid16
-       .long sys_stime         /* 25 */
-       .long sys_ptrace
-       .long sys_alarm
-       .long sys_fstat
-       .long sys_pause
-       .long sys_utime         /* 30 */
-       .long sys_ni_syscall    /* old stty syscall holder */
-       .long sys_ni_syscall    /* old gtty syscall holder */
-       .long sys_access
-       .long sys_nice
-       .long sys_ni_syscall    /* 35 - old ftime syscall holder */
-       .long sys_sync
-       .long sys_kill
-       .long sys_rename
-       .long sys_mkdir
-       .long sys_rmdir         /* 40 */
-       .long sys_dup
-       .long sys_pipe
-       .long sys_times
-       .long sys_ni_syscall    /* old prof syscall holder */
-       .long sys_brk           /* 45 */
-       .long sys_setgid16
-       .long sys_getgid16
-       .long sys_signal
-       .long sys_geteuid16
-       .long sys_getegid16     /* 50 */
-       .long sys_acct
-       .long sys_umount        /* recycled never used phys() */
-       .long sys_ni_syscall    /* old lock syscall holder */
-       .long sys_ioctl
-       .long sys_fcntl         /* 55 */
-       .long sys_ni_syscall    /* old mpx syscall holder */
-       .long sys_setpgid
-       .long sys_ni_syscall    /* old ulimit syscall holder */
-       .long sys_olduname
-       .long sys_umask         /* 60 */
-       .long sys_chroot
-       .long sys_ustat
-       .long sys_dup2
-       .long sys_getppid
-       .long sys_getpgrp       /* 65 */
-       .long sys_setsid
-       .long sys_sigaction
-       .long sys_sgetmask
-       .long sys_ssetmask
-       .long sys_setreuid16    /* 70 */
-       .long sys_setregid16
-       .long sys_sigsuspend
-       .long sys_sigpending
-       .long sys_sethostname
-       .long sys_setrlimit     /* 75 */
-       .long sys_old_getrlimit
-       .long sys_getrusage
-       .long sys_gettimeofday
-       .long sys_settimeofday
-       .long sys_getgroups16   /* 80 */
-       .long sys_setgroups16
-       .long old_select
-       .long sys_symlink
-       .long sys_lstat
-       .long sys_readlink      /* 85 */
-       .long sys_uselib
-       .long sys_swapon
-       .long sys_reboot
-       .long old_readdir
-       .long old_mmap          /* 90 */
-       .long sys_munmap
-       .long sys_truncate
-       .long sys_ftruncate
-       .long sys_fchmod
-       .long sys_fchown16      /* 95 */
-       .long sys_getpriority
-       .long sys_setpriority
-       .long sys_ni_syscall    /* old profil syscall holder */
-       .long sys_statfs
-       .long sys_fstatfs       /* 100 */
-       .long sys_ioperm
-       .long sys_socketcall
-       .long sys_syslog
-       .long sys_setitimer
-       .long sys_getitimer     /* 105 */
-       .long sys_newstat
-       .long sys_newlstat
-       .long sys_newfstat
-       .long sys_uname
-       .long sys_iopl          /* 110 */
-       .long sys_vhangup
-       .long sys_ni_syscall    /* old "idle" system call */
-       .long sys_vm86old
-       .long sys_wait4
-       .long sys_swapoff       /* 115 */
-       .long sys_sysinfo
-       .long sys_ipc
-       .long sys_fsync
-       .long sys_sigreturn
-       .long sys_clone         /* 120 */
-       .long sys_setdomainname
-       .long sys_newuname
-       .long sys_modify_ldt
-       .long sys_adjtimex
-       .long sys_mprotect      /* 125 */
-       .long sys_sigprocmask
-       .long sys_ni_syscall    /* old "create_module" */
-       .long sys_init_module
-       .long sys_delete_module
-       .long sys_ni_syscall    /* 130: old "get_kernel_syms" */
-       .long sys_quotactl
-       .long sys_getpgid
-       .long sys_fchdir
-       .long sys_bdflush
-       .long sys_sysfs         /* 135 */
-       .long sys_personality
-       .long sys_ni_syscall    /* reserved for afs_syscall */
-       .long sys_setfsuid16
-       .long sys_setfsgid16
-       .long sys_llseek        /* 140 */
-       .long sys_getdents
-       .long sys_select
-       .long sys_flock
-       .long sys_msync
-       .long sys_readv         /* 145 */
-       .long sys_writev
-       .long sys_getsid
-       .long sys_fdatasync
-       .long sys_sysctl
-       .long sys_mlock         /* 150 */
-       .long sys_munlock
-       .long sys_mlockall
-       .long sys_munlockall
-       .long sys_sched_setparam
-       .long sys_sched_getparam   /* 155 */
-       .long sys_sched_setscheduler
-       .long sys_sched_getscheduler
-       .long sys_sched_yield
-       .long sys_sched_get_priority_max
-       .long sys_sched_get_priority_min  /* 160 */
-       .long sys_sched_rr_get_interval
-       .long sys_nanosleep
-       .long sys_mremap
-       .long sys_setresuid16
-       .long sys_getresuid16   /* 165 */
-       .long sys_vm86
-       .long sys_ni_syscall    /* Old sys_query_module */
-       .long sys_poll
-       .long sys_nfsservctl
-       .long sys_setresgid16   /* 170 */
-       .long sys_getresgid16
-       .long sys_prctl
-       .long sys_rt_sigreturn
-       .long sys_rt_sigaction
-       .long sys_rt_sigprocmask        /* 175 */
-       .long sys_rt_sigpending
-       .long sys_rt_sigtimedwait
-       .long sys_rt_sigqueueinfo
-       .long sys_rt_sigsuspend
-       .long sys_pread64       /* 180 */
-       .long sys_pwrite64
-       .long sys_chown16
-       .long sys_getcwd
-       .long sys_capget
-       .long sys_capset        /* 185 */
-       .long sys_sigaltstack
-       .long sys_sendfile
-       .long sys_ni_syscall    /* reserved for streams1 */
-       .long sys_ni_syscall    /* reserved for streams2 */
-       .long sys_vfork         /* 190 */
-       .long sys_getrlimit
-       .long sys_mmap2
-       .long sys_truncate64
-       .long sys_ftruncate64
-       .long sys_stat64        /* 195 */
-       .long sys_lstat64
-       .long sys_fstat64
-       .long sys_lchown
-       .long sys_getuid
-       .long sys_getgid        /* 200 */
-       .long sys_geteuid
-       .long sys_getegid
-       .long sys_setreuid
-       .long sys_setregid
-       .long sys_getgroups     /* 205 */
-       .long sys_setgroups
-       .long sys_fchown
-       .long sys_setresuid
-       .long sys_getresuid
-       .long sys_setresgid     /* 210 */
-       .long sys_getresgid
-       .long sys_chown
-       .long sys_setuid
-       .long sys_setgid
-       .long sys_setfsuid      /* 215 */
-       .long sys_setfsgid
-       .long sys_pivot_root
-       .long sys_mincore
-       .long sys_madvise
-       .long sys_getdents64    /* 220 */
-       .long sys_fcntl64
-       .long sys_ni_syscall    /* reserved for TUX */
-       .long sys_ni_syscall
-       .long sys_gettid
-       .long sys_readahead     /* 225 */
-       .long sys_setxattr
-       .long sys_lsetxattr
-       .long sys_fsetxattr
-       .long sys_getxattr
-       .long sys_lgetxattr     /* 230 */
-       .long sys_fgetxattr
-       .long sys_listxattr
-       .long sys_llistxattr
-       .long sys_flistxattr
-       .long sys_removexattr   /* 235 */
-       .long sys_lremovexattr
-       .long sys_fremovexattr
-       .long sys_tkill
-       .long sys_sendfile64
-       .long sys_futex         /* 240 */
-       .long sys_sched_setaffinity
-       .long sys_sched_getaffinity
-       .long sys_set_thread_area
-       .long sys_get_thread_area
-       .long sys_io_setup      /* 245 */
-       .long sys_io_destroy
-       .long sys_io_getevents
-       .long sys_io_submit
-       .long sys_io_cancel
-       .long sys_fadvise64     /* 250 */
-       .long sys_ni_syscall
-       .long sys_exit_group
-       .long sys_lookup_dcookie
-       .long sys_epoll_create
-       .long sys_epoll_ctl     /* 255 */
-       .long sys_epoll_wait
-       .long sys_remap_file_pages
-       .long sys_set_tid_address
-       .long sys_timer_create
-       .long sys_timer_settime         /* 260 */
-       .long sys_timer_gettime
-       .long sys_timer_getoverrun
-       .long sys_timer_delete
-       .long sys_clock_settime
-       .long sys_clock_gettime         /* 265 */
-       .long sys_clock_getres
-       .long sys_clock_nanosleep
-       .long sys_statfs64
-       .long sys_fstatfs64
-       .long sys_tgkill        /* 270 */
-       .long sys_utimes
-       .long sys_fadvise64_64
-       .long sys_ni_syscall    /* sys_vserver */
-       .long sys_mbind
-       .long sys_get_mempolicy
-       .long sys_set_mempolicy
-       .long sys_mq_open
-       .long sys_mq_unlink
-       .long sys_mq_timedsend
-       .long sys_mq_timedreceive       /* 280 */
-       .long sys_mq_notify
-       .long sys_mq_getsetattr
-       .long sys_kexec_load
-       .long sys_waitid
-       .long sys_ni_syscall            /* 285 */ /* available */
-       .long sys_add_key
-       .long sys_request_key
-       .long sys_keyctl
-       .long sys_ioprio_set
-       .long sys_ioprio_get            /* 290 */
-       .long sys_inotify_init
-       .long sys_inotify_add_watch
-       .long sys_inotify_rm_watch
-       .long sys_migrate_pages
-       .long sys_openat                /* 295 */
-       .long sys_mkdirat
-       .long sys_mknodat
-       .long sys_fchownat
-       .long sys_futimesat
-       .long sys_fstatat64             /* 300 */
-       .long sys_unlinkat
-       .long sys_renameat
-       .long sys_linkat
-       .long sys_symlinkat
-       .long sys_readlinkat            /* 305 */
-       .long sys_fchmodat
-       .long sys_faccessat
-       .long sys_pselect6
-       .long sys_ppoll
-       .long sys_unshare               /* 310 */
-       .long sys_set_robust_list
-       .long sys_get_robust_list
-       .long sys_splice
-       .long sys_sync_file_range
-       .long sys_tee                   /* 315 */
-       .long sys_vmsplice
-       .long sys_move_pages
-       .long sys_getcpu
-       .long sys_epoll_pwait
-       .long sys_utimensat             /* 320 */
-       .long sys_signalfd
-       .long sys_timerfd
-       .long sys_eventfd
-       .long sys_fallocate
diff --git a/arch/i386/kernel/sysenter_32.c b/arch/i386/kernel/sysenter_32.c
deleted file mode 100644 (file)
index 4eb2e40..0000000
+++ /dev/null
@@ -1,348 +0,0 @@
-/*
- * linux/arch/i386/kernel/sysenter.c
- *
- * (C) Copyright 2002 Linus Torvalds
- * Portions based on the vdso-randomization code from exec-shield:
- * Copyright(C) 2005-2006, Red Hat, Inc., Ingo Molnar
- *
- * This file contains the needed initializations to support sysenter.
- */
-
-#include <linux/init.h>
-#include <linux/smp.h>
-#include <linux/thread_info.h>
-#include <linux/sched.h>
-#include <linux/gfp.h>
-#include <linux/string.h>
-#include <linux/elf.h>
-#include <linux/mm.h>
-#include <linux/err.h>
-#include <linux/module.h>
-
-#include <asm/cpufeature.h>
-#include <asm/msr.h>
-#include <asm/pgtable.h>
-#include <asm/unistd.h>
-#include <asm/elf.h>
-#include <asm/tlbflush.h>
-
-enum {
-       VDSO_DISABLED = 0,
-       VDSO_ENABLED = 1,
-       VDSO_COMPAT = 2,
-};
-
-#ifdef CONFIG_COMPAT_VDSO
-#define VDSO_DEFAULT   VDSO_COMPAT
-#else
-#define VDSO_DEFAULT   VDSO_ENABLED
-#endif
-
-/*
- * Should the kernel map a VDSO page into processes and pass its
- * address down to glibc upon exec()?
- */
-unsigned int __read_mostly vdso_enabled = VDSO_DEFAULT;
-
-EXPORT_SYMBOL_GPL(vdso_enabled);
-
-static int __init vdso_setup(char *s)
-{
-       vdso_enabled = simple_strtoul(s, NULL, 0);
-
-       return 1;
-}
-
-__setup("vdso=", vdso_setup);
-
-extern asmlinkage void sysenter_entry(void);
-
-static __init void reloc_symtab(Elf32_Ehdr *ehdr,
-                               unsigned offset, unsigned size)
-{
-       Elf32_Sym *sym = (void *)ehdr + offset;
-       unsigned nsym = size / sizeof(*sym);
-       unsigned i;
-
-       for(i = 0; i < nsym; i++, sym++) {
-               if (sym->st_shndx == SHN_UNDEF ||
-                   sym->st_shndx == SHN_ABS)
-                       continue;  /* skip */
-
-               if (sym->st_shndx > SHN_LORESERVE) {
-                       printk(KERN_INFO "VDSO: unexpected st_shndx %x\n",
-                              sym->st_shndx);
-                       continue;
-               }
-
-               switch(ELF_ST_TYPE(sym->st_info)) {
-               case STT_OBJECT:
-               case STT_FUNC:
-               case STT_SECTION:
-               case STT_FILE:
-                       sym->st_value += VDSO_HIGH_BASE;
-               }
-       }
-}
-
-static __init void reloc_dyn(Elf32_Ehdr *ehdr, unsigned offset)
-{
-       Elf32_Dyn *dyn = (void *)ehdr + offset;
-
-       for(; dyn->d_tag != DT_NULL; dyn++)
-               switch(dyn->d_tag) {
-               case DT_PLTGOT:
-               case DT_HASH:
-               case DT_STRTAB:
-               case DT_SYMTAB:
-               case DT_RELA:
-               case DT_INIT:
-               case DT_FINI:
-               case DT_REL:
-               case DT_DEBUG:
-               case DT_JMPREL:
-               case DT_VERSYM:
-               case DT_VERDEF:
-               case DT_VERNEED:
-               case DT_ADDRRNGLO ... DT_ADDRRNGHI:
-                       /* definitely pointers needing relocation */
-                       dyn->d_un.d_ptr += VDSO_HIGH_BASE;
-                       break;
-
-               case DT_ENCODING ... OLD_DT_LOOS-1:
-               case DT_LOOS ... DT_HIOS-1:
-                       /* Tags above DT_ENCODING are pointers if
-                          they're even */
-                       if (dyn->d_tag >= DT_ENCODING &&
-                           (dyn->d_tag & 1) == 0)
-                               dyn->d_un.d_ptr += VDSO_HIGH_BASE;
-                       break;
-
-               case DT_VERDEFNUM:
-               case DT_VERNEEDNUM:
-               case DT_FLAGS_1:
-               case DT_RELACOUNT:
-               case DT_RELCOUNT:
-               case DT_VALRNGLO ... DT_VALRNGHI:
-                       /* definitely not pointers */
-                       break;
-
-               case OLD_DT_LOOS ... DT_LOOS-1:
-               case DT_HIOS ... DT_VALRNGLO-1:
-               default:
-                       if (dyn->d_tag > DT_ENCODING)
-                               printk(KERN_INFO "VDSO: unexpected DT_tag %x\n",
-                                      dyn->d_tag);
-                       break;
-               }
-}
-
-static __init void relocate_vdso(Elf32_Ehdr *ehdr)
-{
-       Elf32_Phdr *phdr;
-       Elf32_Shdr *shdr;
-       int i;
-
-       BUG_ON(memcmp(ehdr->e_ident, ELFMAG, 4) != 0 ||
-              !elf_check_arch(ehdr) ||
-              ehdr->e_type != ET_DYN);
-
-       ehdr->e_entry += VDSO_HIGH_BASE;
-
-       /* rebase phdrs */
-       phdr = (void *)ehdr + ehdr->e_phoff;
-       for (i = 0; i < ehdr->e_phnum; i++) {
-               phdr[i].p_vaddr += VDSO_HIGH_BASE;
-
-               /* relocate dynamic stuff */
-               if (phdr[i].p_type == PT_DYNAMIC)
-                       reloc_dyn(ehdr, phdr[i].p_offset);
-       }
-
-       /* rebase sections */
-       shdr = (void *)ehdr + ehdr->e_shoff;
-       for(i = 0; i < ehdr->e_shnum; i++) {
-               if (!(shdr[i].sh_flags & SHF_ALLOC))
-                       continue;
-
-               shdr[i].sh_addr += VDSO_HIGH_BASE;
-
-               if (shdr[i].sh_type == SHT_SYMTAB ||
-                   shdr[i].sh_type == SHT_DYNSYM)
-                       reloc_symtab(ehdr, shdr[i].sh_offset,
-                                    shdr[i].sh_size);
-       }
-}
-
-void enable_sep_cpu(void)
-{
-       int cpu = get_cpu();
-       struct tss_struct *tss = &per_cpu(init_tss, cpu);
-
-       if (!boot_cpu_has(X86_FEATURE_SEP)) {
-               put_cpu();
-               return;
-       }
-
-       tss->x86_tss.ss1 = __KERNEL_CS;
-       tss->x86_tss.esp1 = sizeof(struct tss_struct) + (unsigned long) tss;
-       wrmsr(MSR_IA32_SYSENTER_CS, __KERNEL_CS, 0);
-       wrmsr(MSR_IA32_SYSENTER_ESP, tss->x86_tss.esp1, 0);
-       wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long) sysenter_entry, 0);
-       put_cpu();      
-}
-
-static struct vm_area_struct gate_vma;
-
-static int __init gate_vma_init(void)
-{
-       gate_vma.vm_mm = NULL;
-       gate_vma.vm_start = FIXADDR_USER_START;
-       gate_vma.vm_end = FIXADDR_USER_END;
-       gate_vma.vm_flags = VM_READ | VM_MAYREAD | VM_EXEC | VM_MAYEXEC;
-       gate_vma.vm_page_prot = __P101;
-       /*
-        * Make sure the vDSO gets into every core dump.
-        * Dumping its contents makes post-mortem fully interpretable later
-        * without matching up the same kernel and hardware config to see
-        * what PC values meant.
-        */
-       gate_vma.vm_flags |= VM_ALWAYSDUMP;
-       return 0;
-}
-
-/*
- * These symbols are defined by vsyscall.o to mark the bounds
- * of the ELF DSO images included therein.
- */
-extern const char vsyscall_int80_start, vsyscall_int80_end;
-extern const char vsyscall_sysenter_start, vsyscall_sysenter_end;
-static struct page *syscall_pages[1];
-
-static void map_compat_vdso(int map)
-{
-       static int vdso_mapped;
-
-       if (map == vdso_mapped)
-               return;
-
-       vdso_mapped = map;
-
-       __set_fixmap(FIX_VDSO, page_to_pfn(syscall_pages[0]) << PAGE_SHIFT,
-                    map ? PAGE_READONLY_EXEC : PAGE_NONE);
-
-       /* flush stray tlbs */
-       flush_tlb_all();
-}
-
-int __init sysenter_setup(void)
-{
-       void *syscall_page = (void *)get_zeroed_page(GFP_ATOMIC);
-       const void *vsyscall;
-       size_t vsyscall_len;
-
-       syscall_pages[0] = virt_to_page(syscall_page);
-
-       gate_vma_init();
-
-       printk("Compat vDSO mapped to %08lx.\n", __fix_to_virt(FIX_VDSO));
-
-       if (!boot_cpu_has(X86_FEATURE_SEP)) {
-               vsyscall = &vsyscall_int80_start;
-               vsyscall_len = &vsyscall_int80_end - &vsyscall_int80_start;
-       } else {
-               vsyscall = &vsyscall_sysenter_start;
-               vsyscall_len = &vsyscall_sysenter_end - &vsyscall_sysenter_start;
-       }
-
-       memcpy(syscall_page, vsyscall, vsyscall_len);
-       relocate_vdso(syscall_page);
-
-       return 0;
-}
-
-/* Defined in vsyscall-sysenter.S */
-extern void SYSENTER_RETURN;
-
-/* Setup a VMA at program startup for the vsyscall page */
-int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack)
-{
-       struct mm_struct *mm = current->mm;
-       unsigned long addr;
-       int ret = 0;
-       bool compat;
-
-       down_write(&mm->mmap_sem);
-
-       /* Test compat mode once here, in case someone
-          changes it via sysctl */
-       compat = (vdso_enabled == VDSO_COMPAT);
-
-       map_compat_vdso(compat);
-
-       if (compat)
-               addr = VDSO_HIGH_BASE;
-       else {
-               addr = get_unmapped_area(NULL, 0, PAGE_SIZE, 0, 0);
-               if (IS_ERR_VALUE(addr)) {
-                       ret = addr;
-                       goto up_fail;
-               }
-
-               /*
-                * MAYWRITE to allow gdb to COW and set breakpoints
-                *
-                * Make sure the vDSO gets into every core dump.
-                * Dumping its contents makes post-mortem fully
-                * interpretable later without matching up the same
-                * kernel and hardware config to see what PC values
-                * meant.
-                */
-               ret = install_special_mapping(mm, addr, PAGE_SIZE,
-                                             VM_READ|VM_EXEC|
-                                             VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC|
-                                             VM_ALWAYSDUMP,
-                                             syscall_pages);
-
-               if (ret)
-                       goto up_fail;
-       }
-
-       current->mm->context.vdso = (void *)addr;
-       current_thread_info()->sysenter_return =
-               (void *)VDSO_SYM(&SYSENTER_RETURN);
-
-  up_fail:
-       up_write(&mm->mmap_sem);
-
-       return ret;
-}
-
-const char *arch_vma_name(struct vm_area_struct *vma)
-{
-       if (vma->vm_mm && vma->vm_start == (long)vma->vm_mm->context.vdso)
-               return "[vdso]";
-       return NULL;
-}
-
-struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
-{
-       struct mm_struct *mm = tsk->mm;
-
-       /* Check to see if this task was created in compat vdso mode */
-       if (mm && mm->context.vdso == (void *)VDSO_HIGH_BASE)
-               return &gate_vma;
-       return NULL;
-}
-
-int in_gate_area(struct task_struct *task, unsigned long addr)
-{
-       const struct vm_area_struct *vma = get_gate_vma(task);
-
-       return vma && addr >= vma->vm_start && addr < vma->vm_end;
-}
-
-int in_gate_area_no_task(unsigned long addr)
-{
-       return 0;
-}
diff --git a/arch/i386/kernel/time_32.c b/arch/i386/kernel/time_32.c
deleted file mode 100644 (file)
index 19a6c67..0000000
+++ /dev/null
@@ -1,236 +0,0 @@
-/*
- *  linux/arch/i386/kernel/time.c
- *
- *  Copyright (C) 1991, 1992, 1995  Linus Torvalds
- *
- * This file contains the PC-specific time handling details:
- * reading the RTC at bootup, etc..
- * 1994-07-02    Alan Modra
- *     fixed set_rtc_mmss, fixed time.year for >= 2000, new mktime
- * 1995-03-26    Markus Kuhn
- *      fixed 500 ms bug at call to set_rtc_mmss, fixed DS12887
- *      precision CMOS clock update
- * 1996-05-03    Ingo Molnar
- *      fixed time warps in do_[slow|fast]_gettimeoffset()
- * 1997-09-10  Updated NTP code according to technical memorandum Jan '96
- *             "A Kernel Model for Precision Timekeeping" by Dave Mills
- * 1998-09-05    (Various)
- *     More robust do_fast_gettimeoffset() algorithm implemented
- *     (works with APM, Cyrix 6x86MX and Centaur C6),
- *     monotonic gettimeofday() with fast_get_timeoffset(),
- *     drift-proof precision TSC calibration on boot
- *     (C. Scott Ananian <cananian@alumni.princeton.edu>, Andrew D.
- *     Balsa <andrebalsa@altern.org>, Philip Gladstone <philip@raptor.com>;
- *     ported from 2.0.35 Jumbo-9 by Michael Krause <m.krause@tu-harburg.de>).
- * 1998-12-16    Andrea Arcangeli
- *     Fixed Jumbo-9 code in 2.1.131: do_gettimeofday was missing 1 jiffy
- *     because was not accounting lost_ticks.
- * 1998-12-24 Copyright (C) 1998  Andrea Arcangeli
- *     Fixed a xtime SMP race (we need the xtime_lock rw spinlock to
- *     serialize accesses to xtime/lost_ticks).
- */
-
-#include <linux/errno.h>
-#include <linux/sched.h>
-#include <linux/kernel.h>
-#include <linux/param.h>
-#include <linux/string.h>
-#include <linux/mm.h>
-#include <linux/interrupt.h>
-#include <linux/time.h>
-#include <linux/delay.h>
-#include <linux/init.h>
-#include <linux/smp.h>
-#include <linux/module.h>
-#include <linux/sysdev.h>
-#include <linux/bcd.h>
-#include <linux/efi.h>
-#include <linux/mca.h>
-
-#include <asm/io.h>
-#include <asm/smp.h>
-#include <asm/irq.h>
-#include <asm/msr.h>
-#include <asm/delay.h>
-#include <asm/mpspec.h>
-#include <asm/uaccess.h>
-#include <asm/processor.h>
-#include <asm/timer.h>
-#include <asm/time.h>
-
-#include "mach_time.h"
-
-#include <linux/timex.h>
-
-#include <asm/hpet.h>
-
-#include <asm/arch_hooks.h>
-
-#include "io_ports.h"
-
-#include <asm/i8259.h>
-
-#include "do_timer.h"
-
-unsigned int cpu_khz;  /* Detected as we calibrate the TSC */
-EXPORT_SYMBOL(cpu_khz);
-
-DEFINE_SPINLOCK(rtc_lock);
-EXPORT_SYMBOL(rtc_lock);
-
-/*
- * This is a special lock that is owned by the CPU and holds the index
- * register we are working with.  It is required for NMI access to the
- * CMOS/RTC registers.  See include/asm-i386/mc146818rtc.h for details.
- */
-volatile unsigned long cmos_lock = 0;
-EXPORT_SYMBOL(cmos_lock);
-
-/* Routines for accessing the CMOS RAM/RTC. */
-unsigned char rtc_cmos_read(unsigned char addr)
-{
-       unsigned char val;
-       lock_cmos_prefix(addr);
-       outb_p(addr, RTC_PORT(0));
-       val = inb_p(RTC_PORT(1));
-       lock_cmos_suffix(addr);
-       return val;
-}
-EXPORT_SYMBOL(rtc_cmos_read);
-
-void rtc_cmos_write(unsigned char val, unsigned char addr)
-{
-       lock_cmos_prefix(addr);
-       outb_p(addr, RTC_PORT(0));
-       outb_p(val, RTC_PORT(1));
-       lock_cmos_suffix(addr);
-}
-EXPORT_SYMBOL(rtc_cmos_write);
-
-static int set_rtc_mmss(unsigned long nowtime)
-{
-       int retval;
-       unsigned long flags;
-
-       /* gets recalled with irq locally disabled */
-       /* XXX - does irqsave resolve this? -johnstul */
-       spin_lock_irqsave(&rtc_lock, flags);
-       retval = set_wallclock(nowtime);
-       spin_unlock_irqrestore(&rtc_lock, flags);
-
-       return retval;
-}
-
-
-int timer_ack;
-
-unsigned long profile_pc(struct pt_regs *regs)
-{
-       unsigned long pc = instruction_pointer(regs);
-
-#ifdef CONFIG_SMP
-       if (!v8086_mode(regs) && SEGMENT_IS_KERNEL_CODE(regs->xcs) &&
-           in_lock_functions(pc)) {
-#ifdef CONFIG_FRAME_POINTER
-               return *(unsigned long *)(regs->ebp + 4);
-#else
-               unsigned long *sp = (unsigned long *)&regs->esp;
-
-               /* Return address is either directly at stack pointer
-                  or above a saved eflags. Eflags has bits 22-31 zero,
-                  kernel addresses don't. */
-               if (sp[0] >> 22)
-                       return sp[0];
-               if (sp[1] >> 22)
-                       return sp[1];
-#endif
-       }
-#endif
-       return pc;
-}
-EXPORT_SYMBOL(profile_pc);
-
-/*
- * This is the same as the above, except we _also_ save the current
- * Time Stamp Counter value at the time of the timer interrupt, so that
- * we later on can estimate the time of day more exactly.
- */
-irqreturn_t timer_interrupt(int irq, void *dev_id)
-{
-#ifdef CONFIG_X86_IO_APIC
-       if (timer_ack) {
-               /*
-                * Subtle, when I/O APICs are used we have to ack timer IRQ
-                * manually to reset the IRR bit for do_slow_gettimeoffset().
-                * This will also deassert NMI lines for the watchdog if run
-                * on an 82489DX-based system.
-                */
-               spin_lock(&i8259A_lock);
-               outb(0x0c, PIC_MASTER_OCW3);
-               /* Ack the IRQ; AEOI will end it automatically. */
-               inb(PIC_MASTER_POLL);
-               spin_unlock(&i8259A_lock);
-       }
-#endif
-
-       do_timer_interrupt_hook();
-
-       if (MCA_bus) {
-               /* The PS/2 uses level-triggered interrupts.  You can't
-               turn them off, nor would you want to (any attempt to
-               enable edge-triggered interrupts usually gets intercepted by a
-               special hardware circuit).  Hence we have to acknowledge
-               the timer interrupt.  Through some incredibly stupid
-               design idea, the reset for IRQ 0 is done by setting the
-               high bit of the PPI port B (0x61).  Note that some PS/2s,
-               notably the 55SX, work fine if this is removed.  */
-
-               u8 irq_v = inb_p( 0x61 );       /* read the current state */
-               outb_p( irq_v|0x80, 0x61 );     /* reset the IRQ */
-       }
-
-       return IRQ_HANDLED;
-}
-
-/* not static: needed by APM */
-unsigned long read_persistent_clock(void)
-{
-       unsigned long retval;
-       unsigned long flags;
-
-       spin_lock_irqsave(&rtc_lock, flags);
-
-       retval = get_wallclock();
-
-       spin_unlock_irqrestore(&rtc_lock, flags);
-
-       return retval;
-}
-
-int update_persistent_clock(struct timespec now)
-{
-       return set_rtc_mmss(now.tv_sec);
-}
-
-extern void (*late_time_init)(void);
-/* Duplicate of time_init() below, with hpet_enable part added */
-void __init hpet_time_init(void)
-{
-       if (!hpet_enable())
-               setup_pit_timer();
-       time_init_hook();
-}
-
-/*
- * This is called directly from init code; we must delay timer setup in the
- * HPET case as we can't make the decision to turn on HPET this early in the
- * boot process.
- *
- * The chosen time_init function will usually be hpet_time_init, above, but
- * in the case of virtual hardware, an alternative function may be substituted.
- */
-void __init time_init(void)
-{
-       tsc_init();
-       late_time_init = choose_time_init();
-}
diff --git a/arch/i386/kernel/topology.c b/arch/i386/kernel/topology.c
deleted file mode 100644 (file)
index 4578235..0000000
+++ /dev/null
@@ -1,77 +0,0 @@
-/*
- * arch/i386/kernel/topology.c - Populate sysfs with topology information
- *
- * Written by: Matthew Dobson, IBM Corporation
- * Original Code: Paul Dorwin, IBM Corporation, Patrick Mochel, OSDL
- *
- * Copyright (C) 2002, IBM Corp.
- *
- * All rights reserved.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
- * NON INFRINGEMENT.  See the GNU General Public License for more
- * details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- *
- * Send feedback to <colpatch@us.ibm.com>
- */
-#include <linux/init.h>
-#include <linux/smp.h>
-#include <linux/nodemask.h>
-#include <linux/mmzone.h>
-#include <asm/cpu.h>
-
-static struct i386_cpu cpu_devices[NR_CPUS];
-
-int arch_register_cpu(int num)
-{
-       /*
-        * CPU0 cannot be offlined due to several
-        * restrictions and assumptions in kernel. This basically
-        * doesnt add a control file, one cannot attempt to offline
-        * BSP.
-        *
-        * Also certain PCI quirks require not to enable hotplug control
-        * for all CPU's.
-        */
-       if (num && enable_cpu_hotplug)
-               cpu_devices[num].cpu.hotpluggable = 1;
-
-       return register_cpu(&cpu_devices[num].cpu, num);
-}
-
-#ifdef CONFIG_HOTPLUG_CPU
-int enable_cpu_hotplug = 1;
-
-void arch_unregister_cpu(int num) {
-       return unregister_cpu(&cpu_devices[num].cpu);
-}
-EXPORT_SYMBOL(arch_register_cpu);
-EXPORT_SYMBOL(arch_unregister_cpu);
-#endif /*CONFIG_HOTPLUG_CPU*/
-
-static int __init topology_init(void)
-{
-       int i;
-
-#ifdef CONFIG_NUMA
-       for_each_online_node(i)
-               register_one_node(i);
-#endif /* CONFIG_NUMA */
-
-       for_each_present_cpu(i)
-               arch_register_cpu(i);
-       return 0;
-}
-
-subsys_initcall(topology_init);
diff --git a/arch/i386/kernel/trampoline_32.S b/arch/i386/kernel/trampoline_32.S
deleted file mode 100644 (file)
index f62815f..0000000
+++ /dev/null
@@ -1,85 +0,0 @@
-/*
- *
- *     Trampoline.S    Derived from Setup.S by Linus Torvalds
- *
- *     4 Jan 1997 Michael Chastain: changed to gnu as.
- *
- *     This is only used for booting secondary CPUs in SMP machine
- *
- *     Entry: CS:IP point to the start of our code, we are 
- *     in real mode with no stack, but the rest of the 
- *     trampoline page to make our stack and everything else
- *     is a mystery.
- *
- *     In fact we don't actually need a stack so we don't
- *     set one up.
- *
- *     We jump into the boot/compressed/head.S code. So you'd
- *     better be running a compressed kernel image or you
- *     won't get very far.
- *
- *     On entry to trampoline_data, the processor is in real mode
- *     with 16-bit addressing and 16-bit data.  CS has some value
- *     and IP is zero.  Thus, data addresses need to be absolute
- *     (no relocation) and are taken with regard to r_base.
- *
- *     If you work on this file, check the object module with
- *     objdump --reloc to make sure there are no relocation
- *     entries except for:
- *
- *     TYPE              VALUE
- *     R_386_32          startup_32_smp
- *     R_386_32          boot_gdt
- */
-
-#include <linux/linkage.h>
-#include <asm/segment.h>
-#include <asm/page.h>
-
-.data
-
-/* We can free up trampoline after bootup if cpu hotplug is not supported. */
-#ifndef CONFIG_HOTPLUG_CPU
-.section ".init.data","aw",@progbits
-#endif
-
-.code16
-
-ENTRY(trampoline_data)
-r_base = .
-       wbinvd                  # Needed for NUMA-Q should be harmless for others
-       mov     %cs, %ax        # Code and data in the same place
-       mov     %ax, %ds
-
-       cli                     # We should be safe anyway
-
-       movl    $0xA5A5A5A5, trampoline_data - r_base
-                               # write marker for master knows we're running
-
-       /* GDT tables in non default location kernel can be beyond 16MB and
-        * lgdt will not be able to load the address as in real mode default
-        * operand size is 16bit. Use lgdtl instead to force operand size
-        * to 32 bit.
-        */
-
-       lidtl   boot_idt_descr - r_base # load idt with 0, 0
-       lgdtl   boot_gdt_descr - r_base # load gdt with whatever is appropriate
-
-       xor     %ax, %ax
-       inc     %ax             # protected mode (PE) bit
-       lmsw    %ax             # into protected mode
-       # flush prefetch and jump to startup_32_smp in arch/i386/kernel/head.S
-       ljmpl   $__BOOT_CS, $(startup_32_smp-__PAGE_OFFSET)
-
-       # These need to be in the same 64K segment as the above;
-       # hence we don't use the boot_gdt_descr defined in head.S
-boot_gdt_descr:
-       .word   __BOOT_DS + 7                   # gdt limit
-       .long   boot_gdt - __PAGE_OFFSET        # gdt base
-
-boot_idt_descr:
-       .word   0                               # idt limit = 0
-       .long   0                               # idt base = 0L
-
-.globl trampoline_end
-trampoline_end:
diff --git a/arch/i386/kernel/traps_32.c b/arch/i386/kernel/traps_32.c
deleted file mode 100644 (file)
index 47b0bef..0000000
+++ /dev/null
@@ -1,1250 +0,0 @@
-/*
- *  linux/arch/i386/traps.c
- *
- *  Copyright (C) 1991, 1992  Linus Torvalds
- *
- *  Pentium III FXSR, SSE support
- *     Gareth Hughes <gareth@valinux.com>, May 2000
- */
-
-/*
- * 'Traps.c' handles hardware traps and faults after we have saved some
- * state in 'asm.s'.
- */
-#include <linux/sched.h>
-#include <linux/kernel.h>
-#include <linux/string.h>
-#include <linux/errno.h>
-#include <linux/timer.h>
-#include <linux/mm.h>
-#include <linux/init.h>
-#include <linux/delay.h>
-#include <linux/spinlock.h>
-#include <linux/interrupt.h>
-#include <linux/highmem.h>
-#include <linux/kallsyms.h>
-#include <linux/ptrace.h>
-#include <linux/utsname.h>
-#include <linux/kprobes.h>
-#include <linux/kexec.h>
-#include <linux/unwind.h>
-#include <linux/uaccess.h>
-#include <linux/nmi.h>
-#include <linux/bug.h>
-
-#ifdef CONFIG_EISA
-#include <linux/ioport.h>
-#include <linux/eisa.h>
-#endif
-
-#ifdef CONFIG_MCA
-#include <linux/mca.h>
-#endif
-
-#if defined(CONFIG_EDAC)
-#include <linux/edac.h>
-#endif
-
-#include <asm/processor.h>
-#include <asm/system.h>
-#include <asm/io.h>
-#include <asm/atomic.h>
-#include <asm/debugreg.h>
-#include <asm/desc.h>
-#include <asm/i387.h>
-#include <asm/nmi.h>
-#include <asm/unwind.h>
-#include <asm/smp.h>
-#include <asm/arch_hooks.h>
-#include <linux/kdebug.h>
-#include <asm/stacktrace.h>
-
-#include <linux/module.h>
-
-#include "mach_traps.h"
-
-int panic_on_unrecovered_nmi;
-
-asmlinkage int system_call(void);
-
-/* Do we ignore FPU interrupts ? */
-char ignore_fpu_irq = 0;
-
-/*
- * The IDT has to be page-aligned to simplify the Pentium
- * F0 0F bug workaround.. We have a special link segment
- * for this.
- */
-struct desc_struct idt_table[256] __attribute__((__section__(".data.idt"))) = { {0, 0}, };
-
-asmlinkage void divide_error(void);
-asmlinkage void debug(void);
-asmlinkage void nmi(void);
-asmlinkage void int3(void);
-asmlinkage void overflow(void);
-asmlinkage void bounds(void);
-asmlinkage void invalid_op(void);
-asmlinkage void device_not_available(void);
-asmlinkage void coprocessor_segment_overrun(void);
-asmlinkage void invalid_TSS(void);
-asmlinkage void segment_not_present(void);
-asmlinkage void stack_segment(void);
-asmlinkage void general_protection(void);
-asmlinkage void page_fault(void);
-asmlinkage void coprocessor_error(void);
-asmlinkage void simd_coprocessor_error(void);
-asmlinkage void alignment_check(void);
-asmlinkage void spurious_interrupt_bug(void);
-asmlinkage void machine_check(void);
-
-int kstack_depth_to_print = 24;
-static unsigned int code_bytes = 64;
-
-static inline int valid_stack_ptr(struct thread_info *tinfo, void *p, unsigned size)
-{
-       return  p > (void *)tinfo &&
-               p <= (void *)tinfo + THREAD_SIZE - size;
-}
-
-/* The form of the top of the frame on the stack */
-struct stack_frame {
-       struct stack_frame *next_frame;
-       unsigned long return_address;
-};
-
-static inline unsigned long print_context_stack(struct thread_info *tinfo,
-                               unsigned long *stack, unsigned long ebp,
-                               struct stacktrace_ops *ops, void *data)
-{
-#ifdef CONFIG_FRAME_POINTER
-       struct stack_frame *frame = (struct stack_frame *)ebp;
-       while (valid_stack_ptr(tinfo, frame, sizeof(*frame))) {
-               struct stack_frame *next;
-               unsigned long addr;
-
-               addr = frame->return_address;
-               ops->address(data, addr);
-               /*
-                * break out of recursive entries (such as
-                * end_of_stack_stop_unwind_function). Also,
-                * we can never allow a frame pointer to
-                * move downwards!
-                */
-               next = frame->next_frame;
-               if (next <= frame)
-                       break;
-               frame = next;
-       }
-#else
-       while (valid_stack_ptr(tinfo, stack, sizeof(*stack))) {
-               unsigned long addr;
-
-               addr = *stack++;
-               if (__kernel_text_address(addr))
-                       ops->address(data, addr);
-       }
-#endif
-       return ebp;
-}
-
-#define MSG(msg) ops->warning(data, msg)
-
-void dump_trace(struct task_struct *task, struct pt_regs *regs,
-               unsigned long *stack,
-               struct stacktrace_ops *ops, void *data)
-{
-       unsigned long ebp = 0;
-
-       if (!task)
-               task = current;
-
-       if (!stack) {
-               unsigned long dummy;
-               stack = &dummy;
-               if (task != current)
-                       stack = (unsigned long *)task->thread.esp;
-       }
-
-#ifdef CONFIG_FRAME_POINTER
-       if (!ebp) {
-               if (task == current) {
-                       /* Grab ebp right from our regs */
-                       asm ("movl %%ebp, %0" : "=r" (ebp) : );
-               } else {
-                       /* ebp is the last reg pushed by switch_to */
-                       ebp = *(unsigned long *) task->thread.esp;
-               }
-       }
-#endif
-
-       while (1) {
-               struct thread_info *context;
-               context = (struct thread_info *)
-                       ((unsigned long)stack & (~(THREAD_SIZE - 1)));
-               ebp = print_context_stack(context, stack, ebp, ops, data);
-               /* Should be after the line below, but somewhere
-                  in early boot context comes out corrupted and we
-                  can't reference it -AK */
-               if (ops->stack(data, "IRQ") < 0)
-                       break;
-               stack = (unsigned long*)context->previous_esp;
-               if (!stack)
-                       break;
-               touch_nmi_watchdog();
-       }
-}
-EXPORT_SYMBOL(dump_trace);
-
-static void
-print_trace_warning_symbol(void *data, char *msg, unsigned long symbol)
-{
-       printk(data);
-       print_symbol(msg, symbol);
-       printk("\n");
-}
-
-static void print_trace_warning(void *data, char *msg)
-{
-       printk("%s%s\n", (char *)data, msg);
-}
-
-static int print_trace_stack(void *data, char *name)
-{
-       return 0;
-}
-
-/*
- * Print one address/symbol entries per line.
- */
-static void print_trace_address(void *data, unsigned long addr)
-{
-       printk("%s [<%08lx>] ", (char *)data, addr);
-       print_symbol("%s\n", addr);
-       touch_nmi_watchdog();
-}
-
-static struct stacktrace_ops print_trace_ops = {
-       .warning = print_trace_warning,
-       .warning_symbol = print_trace_warning_symbol,
-       .stack = print_trace_stack,
-       .address = print_trace_address,
-};
-
-static void
-show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
-                  unsigned long * stack, char *log_lvl)
-{
-       dump_trace(task, regs, stack, &print_trace_ops, log_lvl);
-       printk("%s =======================\n", log_lvl);
-}
-
-void show_trace(struct task_struct *task, struct pt_regs *regs,
-               unsigned long * stack)
-{
-       show_trace_log_lvl(task, regs, stack, "");
-}
-
-static void show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
-                              unsigned long *esp, char *log_lvl)
-{
-       unsigned long *stack;
-       int i;
-
-       if (esp == NULL) {
-               if (task)
-                       esp = (unsigned long*)task->thread.esp;
-               else
-                       esp = (unsigned long *)&esp;
-       }
-
-       stack = esp;
-       for(i = 0; i < kstack_depth_to_print; i++) {
-               if (kstack_end(stack))
-                       break;
-               if (i && ((i % 8) == 0))
-                       printk("\n%s       ", log_lvl);
-               printk("%08lx ", *stack++);
-       }
-       printk("\n%sCall Trace:\n", log_lvl);
-       show_trace_log_lvl(task, regs, esp, log_lvl);
-}
-
-void show_stack(struct task_struct *task, unsigned long *esp)
-{
-       printk("       ");
-       show_stack_log_lvl(task, NULL, esp, "");
-}
-
-/*
- * The architecture-independent dump_stack generator
- */
-void dump_stack(void)
-{
-       unsigned long stack;
-
-       show_trace(current, NULL, &stack);
-}
-
-EXPORT_SYMBOL(dump_stack);
-
-void show_registers(struct pt_regs *regs)
-{
-       int i;
-       int in_kernel = 1;
-       unsigned long esp;
-       unsigned short ss, gs;
-
-       esp = (unsigned long) (&regs->esp);
-       savesegment(ss, ss);
-       savesegment(gs, gs);
-       if (user_mode_vm(regs)) {
-               in_kernel = 0;
-               esp = regs->esp;
-               ss = regs->xss & 0xffff;
-       }
-       print_modules();
-       printk(KERN_EMERG "CPU:    %d\n"
-               KERN_EMERG "EIP:    %04x:[<%08lx>]    %s VLI\n"
-               KERN_EMERG "EFLAGS: %08lx   (%s %.*s)\n",
-               smp_processor_id(), 0xffff & regs->xcs, regs->eip,
-               print_tainted(), regs->eflags, init_utsname()->release,
-               (int)strcspn(init_utsname()->version, " "),
-               init_utsname()->version);
-       print_symbol(KERN_EMERG "EIP is at %s\n", regs->eip);
-       printk(KERN_EMERG "eax: %08lx   ebx: %08lx   ecx: %08lx   edx: %08lx\n",
-               regs->eax, regs->ebx, regs->ecx, regs->edx);
-       printk(KERN_EMERG "esi: %08lx   edi: %08lx   ebp: %08lx   esp: %08lx\n",
-               regs->esi, regs->edi, regs->ebp, esp);
-       printk(KERN_EMERG "ds: %04x   es: %04x   fs: %04x  gs: %04x  ss: %04x\n",
-              regs->xds & 0xffff, regs->xes & 0xffff, regs->xfs & 0xffff, gs, ss);
-       printk(KERN_EMERG "Process %.*s (pid: %d, ti=%p task=%p task.ti=%p)",
-               TASK_COMM_LEN, current->comm, current->pid,
-               current_thread_info(), current, task_thread_info(current));
-       /*
-        * When in-kernel, we also print out the stack and code at the
-        * time of the fault..
-        */
-       if (in_kernel) {
-               u8 *eip;
-               unsigned int code_prologue = code_bytes * 43 / 64;
-               unsigned int code_len = code_bytes;
-               unsigned char c;
-
-               printk("\n" KERN_EMERG "Stack: ");
-               show_stack_log_lvl(NULL, regs, (unsigned long *)esp, KERN_EMERG);
-
-               printk(KERN_EMERG "Code: ");
-
-               eip = (u8 *)regs->eip - code_prologue;
-               if (eip < (u8 *)PAGE_OFFSET ||
-                       probe_kernel_address(eip, c)) {
-                       /* try starting at EIP */
-                       eip = (u8 *)regs->eip;
-                       code_len = code_len - code_prologue + 1;
-               }
-               for (i = 0; i < code_len; i++, eip++) {
-                       if (eip < (u8 *)PAGE_OFFSET ||
-                               probe_kernel_address(eip, c)) {
-                               printk(" Bad EIP value.");
-                               break;
-                       }
-                       if (eip == (u8 *)regs->eip)
-                               printk("<%02x> ", c);
-                       else
-                               printk("%02x ", c);
-               }
-       }
-       printk("\n");
-}      
-
-int is_valid_bugaddr(unsigned long eip)
-{
-       unsigned short ud2;
-
-       if (eip < PAGE_OFFSET)
-               return 0;
-       if (probe_kernel_address((unsigned short *)eip, ud2))
-               return 0;
-
-       return ud2 == 0x0b0f;
-}
-
-/*
- * This is gone through when something in the kernel has done something bad and
- * is about to be terminated.
- */
-void die(const char * str, struct pt_regs * regs, long err)
-{
-       static struct {
-               spinlock_t lock;
-               u32 lock_owner;
-               int lock_owner_depth;
-       } die = {
-               .lock =                 __SPIN_LOCK_UNLOCKED(die.lock),
-               .lock_owner =           -1,
-               .lock_owner_depth =     0
-       };
-       static int die_counter;
-       unsigned long flags;
-
-       oops_enter();
-
-       if (die.lock_owner != raw_smp_processor_id()) {
-               console_verbose();
-               spin_lock_irqsave(&die.lock, flags);
-               die.lock_owner = smp_processor_id();
-               die.lock_owner_depth = 0;
-               bust_spinlocks(1);
-       }
-       else
-               local_save_flags(flags);
-
-       if (++die.lock_owner_depth < 3) {
-               int nl = 0;
-               unsigned long esp;
-               unsigned short ss;
-
-               report_bug(regs->eip, regs);
-
-               printk(KERN_EMERG "%s: %04lx [#%d]\n", str, err & 0xffff, ++die_counter);
-#ifdef CONFIG_PREEMPT
-               printk(KERN_EMERG "PREEMPT ");
-               nl = 1;
-#endif
-#ifdef CONFIG_SMP
-               if (!nl)
-                       printk(KERN_EMERG);
-               printk("SMP ");
-               nl = 1;
-#endif
-#ifdef CONFIG_DEBUG_PAGEALLOC
-               if (!nl)
-                       printk(KERN_EMERG);
-               printk("DEBUG_PAGEALLOC");
-               nl = 1;
-#endif
-               if (nl)
-                       printk("\n");
-               if (notify_die(DIE_OOPS, str, regs, err,
-                                       current->thread.trap_no, SIGSEGV) !=
-                               NOTIFY_STOP) {
-                       show_registers(regs);
-                       /* Executive summary in case the oops scrolled away */
-                       esp = (unsigned long) (&regs->esp);
-                       savesegment(ss, ss);
-                       if (user_mode(regs)) {
-                               esp = regs->esp;
-                               ss = regs->xss & 0xffff;
-                       }
-                       printk(KERN_EMERG "EIP: [<%08lx>] ", regs->eip);
-                       print_symbol("%s", regs->eip);
-                       printk(" SS:ESP %04x:%08lx\n", ss, esp);
-               }
-               else
-                       regs = NULL;
-       } else
-               printk(KERN_EMERG "Recursive die() failure, output suppressed\n");
-
-       bust_spinlocks(0);
-       die.lock_owner = -1;
-       add_taint(TAINT_DIE);
-       spin_unlock_irqrestore(&die.lock, flags);
-
-       if (!regs)
-               return;
-
-       if (kexec_should_crash(current))
-               crash_kexec(regs);
-
-       if (in_interrupt())
-               panic("Fatal exception in interrupt");
-
-       if (panic_on_oops)
-               panic("Fatal exception");
-
-       oops_exit();
-       do_exit(SIGSEGV);
-}
-
-static inline void die_if_kernel(const char * str, struct pt_regs * regs, long err)
-{
-       if (!user_mode_vm(regs))
-               die(str, regs, err);
-}
-
-static void __kprobes do_trap(int trapnr, int signr, char *str, int vm86,
-                             struct pt_regs * regs, long error_code,
-                             siginfo_t *info)
-{
-       struct task_struct *tsk = current;
-
-       if (regs->eflags & VM_MASK) {
-               if (vm86)
-                       goto vm86_trap;
-               goto trap_signal;
-       }
-
-       if (!user_mode(regs))
-               goto kernel_trap;
-
-       trap_signal: {
-               /*
-                * We want error_code and trap_no set for userspace faults and
-                * kernelspace faults which result in die(), but not
-                * kernelspace faults which are fixed up.  die() gives the
-                * process no chance to handle the signal and notice the
-                * kernel fault information, so that won't result in polluting
-                * the information about previously queued, but not yet
-                * delivered, faults.  See also do_general_protection below.
-                */
-               tsk->thread.error_code = error_code;
-               tsk->thread.trap_no = trapnr;
-
-               if (info)
-                       force_sig_info(signr, info, tsk);
-               else
-                       force_sig(signr, tsk);
-               return;
-       }
-
-       kernel_trap: {
-               if (!fixup_exception(regs)) {
-                       tsk->thread.error_code = error_code;
-                       tsk->thread.trap_no = trapnr;
-                       die(str, regs, error_code);
-               }
-               return;
-       }
-
-       vm86_trap: {
-               int ret = handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, trapnr);
-               if (ret) goto trap_signal;
-               return;
-       }
-}
-
-#define DO_ERROR(trapnr, signr, str, name) \
-fastcall void do_##name(struct pt_regs * regs, long error_code) \
-{ \
-       if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
-                                               == NOTIFY_STOP) \
-               return; \
-       do_trap(trapnr, signr, str, 0, regs, error_code, NULL); \
-}
-
-#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr, irq) \
-fastcall void do_##name(struct pt_regs * regs, long error_code) \
-{ \
-       siginfo_t info; \
-       if (irq) \
-               local_irq_enable(); \
-       info.si_signo = signr; \
-       info.si_errno = 0; \
-       info.si_code = sicode; \
-       info.si_addr = (void __user *)siaddr; \
-       if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
-                                               == NOTIFY_STOP) \
-               return; \
-       do_trap(trapnr, signr, str, 0, regs, error_code, &info); \
-}
-
-#define DO_VM86_ERROR(trapnr, signr, str, name) \
-fastcall void do_##name(struct pt_regs * regs, long error_code) \
-{ \
-       if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
-                                               == NOTIFY_STOP) \
-               return; \
-       do_trap(trapnr, signr, str, 1, regs, error_code, NULL); \
-}
-
-#define DO_VM86_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \
-fastcall void do_##name(struct pt_regs * regs, long error_code) \
-{ \
-       siginfo_t info; \
-       info.si_signo = signr; \
-       info.si_errno = 0; \
-       info.si_code = sicode; \
-       info.si_addr = (void __user *)siaddr; \
-       if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
-                                               == NOTIFY_STOP) \
-               return; \
-       do_trap(trapnr, signr, str, 1, regs, error_code, &info); \
-}
-
-DO_VM86_ERROR_INFO( 0, SIGFPE,  "divide error", divide_error, FPE_INTDIV, regs->eip)
-#ifndef CONFIG_KPROBES
-DO_VM86_ERROR( 3, SIGTRAP, "int3", int3)
-#endif
-DO_VM86_ERROR( 4, SIGSEGV, "overflow", overflow)
-DO_VM86_ERROR( 5, SIGSEGV, "bounds", bounds)
-DO_ERROR_INFO( 6, SIGILL,  "invalid opcode", invalid_op, ILL_ILLOPN, regs->eip, 0)
-DO_ERROR( 9, SIGFPE,  "coprocessor segment overrun", coprocessor_segment_overrun)
-DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS)
-DO_ERROR(11, SIGBUS,  "segment not present", segment_not_present)
-DO_ERROR(12, SIGBUS,  "stack segment", stack_segment)
-DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0, 0)
-DO_ERROR_INFO(32, SIGSEGV, "iret exception", iret_error, ILL_BADSTK, 0, 1)
-
-fastcall void __kprobes do_general_protection(struct pt_regs * regs,
-                                             long error_code)
-{
-       int cpu = get_cpu();
-       struct tss_struct *tss = &per_cpu(init_tss, cpu);
-       struct thread_struct *thread = &current->thread;
-
-       /*
-        * Perform the lazy TSS's I/O bitmap copy. If the TSS has an
-        * invalid offset set (the LAZY one) and the faulting thread has
-        * a valid I/O bitmap pointer, we copy the I/O bitmap in the TSS
-        * and we set the offset field correctly. Then we let the CPU to
-        * restart the faulting instruction.
-        */
-       if (tss->x86_tss.io_bitmap_base == INVALID_IO_BITMAP_OFFSET_LAZY &&
-           thread->io_bitmap_ptr) {
-               memcpy(tss->io_bitmap, thread->io_bitmap_ptr,
-                      thread->io_bitmap_max);
-               /*
-                * If the previously set map was extending to higher ports
-                * than the current one, pad extra space with 0xff (no access).
-                */
-               if (thread->io_bitmap_max < tss->io_bitmap_max)
-                       memset((char *) tss->io_bitmap +
-                               thread->io_bitmap_max, 0xff,
-                               tss->io_bitmap_max - thread->io_bitmap_max);
-               tss->io_bitmap_max = thread->io_bitmap_max;
-               tss->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET;
-               tss->io_bitmap_owner = thread;
-               put_cpu();
-               return;
-       }
-       put_cpu();
-
-       if (regs->eflags & VM_MASK)
-               goto gp_in_vm86;
-
-       if (!user_mode(regs))
-               goto gp_in_kernel;
-
-       current->thread.error_code = error_code;
-       current->thread.trap_no = 13;
-       if (show_unhandled_signals && unhandled_signal(current, SIGSEGV) &&
-           printk_ratelimit())
-               printk(KERN_INFO
-                   "%s[%d] general protection eip:%lx esp:%lx error:%lx\n",
-                   current->comm, current->pid,
-                   regs->eip, regs->esp, error_code);
-
-       force_sig(SIGSEGV, current);
-       return;
-
-gp_in_vm86:
-       local_irq_enable();
-       handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code);
-       return;
-
-gp_in_kernel:
-       if (!fixup_exception(regs)) {
-               current->thread.error_code = error_code;
-               current->thread.trap_no = 13;
-               if (notify_die(DIE_GPF, "general protection fault", regs,
-                               error_code, 13, SIGSEGV) == NOTIFY_STOP)
-                       return;
-               die("general protection fault", regs, error_code);
-       }
-}
-
-static __kprobes void
-mem_parity_error(unsigned char reason, struct pt_regs * regs)
-{
-       printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x on "
-               "CPU %d.\n", reason, smp_processor_id());
-       printk(KERN_EMERG "You have some hardware problem, likely on the PCI bus.\n");
-
-#if defined(CONFIG_EDAC)
-       if(edac_handler_set()) {
-               edac_atomic_assert_error();
-               return;
-       }
-#endif
-
-       if (panic_on_unrecovered_nmi)
-                panic("NMI: Not continuing");
-
-       printk(KERN_EMERG "Dazed and confused, but trying to continue\n");
-
-       /* Clear and disable the memory parity error line. */
-       clear_mem_error(reason);
-}
-
-static __kprobes void
-io_check_error(unsigned char reason, struct pt_regs * regs)
-{
-       unsigned long i;
-
-       printk(KERN_EMERG "NMI: IOCK error (debug interrupt?)\n");
-       show_registers(regs);
-
-       /* Re-enable the IOCK line, wait for a few seconds */
-       reason = (reason & 0xf) | 8;
-       outb(reason, 0x61);
-       i = 2000;
-       while (--i) udelay(1000);
-       reason &= ~8;
-       outb(reason, 0x61);
-}
-
-static __kprobes void
-unknown_nmi_error(unsigned char reason, struct pt_regs * regs)
-{
-#ifdef CONFIG_MCA
-       /* Might actually be able to figure out what the guilty party
-       * is. */
-       if( MCA_bus ) {
-               mca_handle_nmi();
-               return;
-       }
-#endif
-       printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x on "
-               "CPU %d.\n", reason, smp_processor_id());
-       printk(KERN_EMERG "Do you have a strange power saving mode enabled?\n");
-       if (panic_on_unrecovered_nmi)
-                panic("NMI: Not continuing");
-
-       printk(KERN_EMERG "Dazed and confused, but trying to continue\n");
-}
-
-static DEFINE_SPINLOCK(nmi_print_lock);
-
-void __kprobes die_nmi(struct pt_regs *regs, const char *msg)
-{
-       if (notify_die(DIE_NMIWATCHDOG, msg, regs, 0, 2, SIGINT) ==
-           NOTIFY_STOP)
-               return;
-
-       spin_lock(&nmi_print_lock);
-       /*
-       * We are in trouble anyway, lets at least try
-       * to get a message out.
-       */
-       bust_spinlocks(1);
-       printk(KERN_EMERG "%s", msg);
-       printk(" on CPU%d, eip %08lx, registers:\n",
-               smp_processor_id(), regs->eip);
-       show_registers(regs);
-       console_silent();
-       spin_unlock(&nmi_print_lock);
-       bust_spinlocks(0);
-
-       /* If we are in kernel we are probably nested up pretty bad
-        * and might aswell get out now while we still can.
-       */
-       if (!user_mode_vm(regs)) {
-               current->thread.trap_no = 2;
-               crash_kexec(regs);
-       }
-
-       do_exit(SIGSEGV);
-}
-
-static __kprobes void default_do_nmi(struct pt_regs * regs)
-{
-       unsigned char reason = 0;
-
-       /* Only the BSP gets external NMIs from the system.  */
-       if (!smp_processor_id())
-               reason = get_nmi_reason();
-       if (!(reason & 0xc0)) {
-               if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 2, SIGINT)
-                                                       == NOTIFY_STOP)
-                       return;
-#ifdef CONFIG_X86_LOCAL_APIC
-               /*
-                * Ok, so this is none of the documented NMI sources,
-                * so it must be the NMI watchdog.
-                */
-               if (nmi_watchdog_tick(regs, reason))
-                       return;
-               if (!do_nmi_callback(regs, smp_processor_id()))
-#endif
-                       unknown_nmi_error(reason, regs);
-
-               return;
-       }
-       if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP)
-               return;
-       if (reason & 0x80)
-               mem_parity_error(reason, regs);
-       if (reason & 0x40)
-               io_check_error(reason, regs);
-       /*
-        * Reassert NMI in case it became active meanwhile
-        * as it's edge-triggered.
-        */
-       reassert_nmi();
-}
-
-static int ignore_nmis;
-
-fastcall __kprobes void do_nmi(struct pt_regs * regs, long error_code)
-{
-       int cpu;
-
-       nmi_enter();
-
-       cpu = smp_processor_id();
-
-       ++nmi_count(cpu);
-
-       if (!ignore_nmis)
-               default_do_nmi(regs);
-
-       nmi_exit();
-}
-
-void stop_nmi(void)
-{
-       acpi_nmi_disable();
-       ignore_nmis++;
-}
-
-void restart_nmi(void)
-{
-       ignore_nmis--;
-       acpi_nmi_enable();
-}
-
-#ifdef CONFIG_KPROBES
-fastcall void __kprobes do_int3(struct pt_regs *regs, long error_code)
-{
-       if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP)
-                       == NOTIFY_STOP)
-               return;
-       /* This is an interrupt gate, because kprobes wants interrupts
-       disabled.  Normal trap handlers don't. */
-       restore_interrupts(regs);
-       do_trap(3, SIGTRAP, "int3", 1, regs, error_code, NULL);
-}
-#endif
-
-/*
- * Our handling of the processor debug registers is non-trivial.
- * We do not clear them on entry and exit from the kernel. Therefore
- * it is possible to get a watchpoint trap here from inside the kernel.
- * However, the code in ./ptrace.c has ensured that the user can
- * only set watchpoints on userspace addresses. Therefore the in-kernel
- * watchpoint trap can only occur in code which is reading/writing
- * from user space. Such code must not hold kernel locks (since it
- * can equally take a page fault), therefore it is safe to call
- * force_sig_info even though that claims and releases locks.
- * 
- * Code in ./signal.c ensures that the debug control register
- * is restored before we deliver any signal, and therefore that
- * user code runs with the correct debug control register even though
- * we clear it here.
- *
- * Being careful here means that we don't have to be as careful in a
- * lot of more complicated places (task switching can be a bit lazy
- * about restoring all the debug state, and ptrace doesn't have to
- * find every occurrence of the TF bit that could be saved away even
- * by user code)
- */
-fastcall void __kprobes do_debug(struct pt_regs * regs, long error_code)
-{
-       unsigned int condition;
-       struct task_struct *tsk = current;
-
-       get_debugreg(condition, 6);
-
-       if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code,
-                                       SIGTRAP) == NOTIFY_STOP)
-               return;
-       /* It's safe to allow irq's after DR6 has been saved */
-       if (regs->eflags & X86_EFLAGS_IF)
-               local_irq_enable();
-
-       /* Mask out spurious debug traps due to lazy DR7 setting */
-       if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) {
-               if (!tsk->thread.debugreg[7])
-                       goto clear_dr7;
-       }
-
-       if (regs->eflags & VM_MASK)
-               goto debug_vm86;
-
-       /* Save debug status register where ptrace can see it */
-       tsk->thread.debugreg[6] = condition;
-
-       /*
-        * Single-stepping through TF: make sure we ignore any events in
-        * kernel space (but re-enable TF when returning to user mode).
-        */
-       if (condition & DR_STEP) {
-               /*
-                * We already checked v86 mode above, so we can
-                * check for kernel mode by just checking the CPL
-                * of CS.
-                */
-               if (!user_mode(regs))
-                       goto clear_TF_reenable;
-       }
-
-       /* Ok, finally something we can handle */
-       send_sigtrap(tsk, regs, error_code);
-
-       /* Disable additional traps. They'll be re-enabled when
-        * the signal is delivered.
-        */
-clear_dr7:
-       set_debugreg(0, 7);
-       return;
-
-debug_vm86:
-       handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, 1);
-       return;
-
-clear_TF_reenable:
-       set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
-       regs->eflags &= ~TF_MASK;
-       return;
-}
-
-/*
- * Note that we play around with the 'TS' bit in an attempt to get
- * the correct behaviour even in the presence of the asynchronous
- * IRQ13 behaviour
- */
-void math_error(void __user *eip)
-{
-       struct task_struct * task;
-       siginfo_t info;
-       unsigned short cwd, swd;
-
-       /*
-        * Save the info for the exception handler and clear the error.
-        */
-       task = current;
-       save_init_fpu(task);
-       task->thread.trap_no = 16;
-       task->thread.error_code = 0;
-       info.si_signo = SIGFPE;
-       info.si_errno = 0;
-       info.si_code = __SI_FAULT;
-       info.si_addr = eip;
-       /*
-        * (~cwd & swd) will mask out exceptions that are not set to unmasked
-        * status.  0x3f is the exception bits in these regs, 0x200 is the
-        * C1 reg you need in case of a stack fault, 0x040 is the stack
-        * fault bit.  We should only be taking one exception at a time,
-        * so if this combination doesn't produce any single exception,
-        * then we have a bad program that isn't syncronizing its FPU usage
-        * and it will suffer the consequences since we won't be able to
-        * fully reproduce the context of the exception
-        */
-       cwd = get_fpu_cwd(task);
-       swd = get_fpu_swd(task);
-       switch (swd & ~cwd & 0x3f) {
-               case 0x000: /* No unmasked exception */
-                       return;
-               default:    /* Multiple exceptions */
-                       break;
-               case 0x001: /* Invalid Op */
-                       /*
-                        * swd & 0x240 == 0x040: Stack Underflow
-                        * swd & 0x240 == 0x240: Stack Overflow
-                        * User must clear the SF bit (0x40) if set
-                        */
-                       info.si_code = FPE_FLTINV;
-                       break;
-               case 0x002: /* Denormalize */
-               case 0x010: /* Underflow */
-                       info.si_code = FPE_FLTUND;
-                       break;
-               case 0x004: /* Zero Divide */
-                       info.si_code = FPE_FLTDIV;
-                       break;
-               case 0x008: /* Overflow */
-                       info.si_code = FPE_FLTOVF;
-                       break;
-               case 0x020: /* Precision */
-                       info.si_code = FPE_FLTRES;
-                       break;
-       }
-       force_sig_info(SIGFPE, &info, task);
-}
-
-fastcall void do_coprocessor_error(struct pt_regs * regs, long error_code)
-{
-       ignore_fpu_irq = 1;
-       math_error((void __user *)regs->eip);
-}
-
-static void simd_math_error(void __user *eip)
-{
-       struct task_struct * task;
-       siginfo_t info;
-       unsigned short mxcsr;
-
-       /*
-        * Save the info for the exception handler and clear the error.
-        */
-       task = current;
-       save_init_fpu(task);
-       task->thread.trap_no = 19;
-       task->thread.error_code = 0;
-       info.si_signo = SIGFPE;
-       info.si_errno = 0;
-       info.si_code = __SI_FAULT;
-       info.si_addr = eip;
-       /*
-        * The SIMD FPU exceptions are handled a little differently, as there
-        * is only a single status/control register.  Thus, to determine which
-        * unmasked exception was caught we must mask the exception mask bits
-        * at 0x1f80, and then use these to mask the exception bits at 0x3f.
-        */
-       mxcsr = get_fpu_mxcsr(task);
-       switch (~((mxcsr & 0x1f80) >> 7) & (mxcsr & 0x3f)) {
-               case 0x000:
-               default:
-                       break;
-               case 0x001: /* Invalid Op */
-                       info.si_code = FPE_FLTINV;
-                       break;
-               case 0x002: /* Denormalize */
-               case 0x010: /* Underflow */
-                       info.si_code = FPE_FLTUND;
-                       break;
-               case 0x004: /* Zero Divide */
-                       info.si_code = FPE_FLTDIV;
-                       break;
-               case 0x008: /* Overflow */
-                       info.si_code = FPE_FLTOVF;
-                       break;
-               case 0x020: /* Precision */
-                       info.si_code = FPE_FLTRES;
-                       break;
-       }
-       force_sig_info(SIGFPE, &info, task);
-}
-
-fastcall void do_simd_coprocessor_error(struct pt_regs * regs,
-                                         long error_code)
-{
-       if (cpu_has_xmm) {
-               /* Handle SIMD FPU exceptions on PIII+ processors. */
-               ignore_fpu_irq = 1;
-               simd_math_error((void __user *)regs->eip);
-       } else {
-               /*
-                * Handle strange cache flush from user space exception
-                * in all other cases.  This is undocumented behaviour.
-                */
-               if (regs->eflags & VM_MASK) {
-                       handle_vm86_fault((struct kernel_vm86_regs *)regs,
-                                         error_code);
-                       return;
-               }
-               current->thread.trap_no = 19;
-               current->thread.error_code = error_code;
-               die_if_kernel("cache flush denied", regs, error_code);
-               force_sig(SIGSEGV, current);
-       }
-}
-
-fastcall void do_spurious_interrupt_bug(struct pt_regs * regs,
-                                         long error_code)
-{
-#if 0
-       /* No need to warn about this any longer. */
-       printk("Ignoring P6 Local APIC Spurious Interrupt Bug...\n");
-#endif
-}
-
-fastcall unsigned long patch_espfix_desc(unsigned long uesp,
-                                         unsigned long kesp)
-{
-       struct desc_struct *gdt = __get_cpu_var(gdt_page).gdt;
-       unsigned long base = (kesp - uesp) & -THREAD_SIZE;
-       unsigned long new_kesp = kesp - base;
-       unsigned long lim_pages = (new_kesp | (THREAD_SIZE - 1)) >> PAGE_SHIFT;
-       __u64 desc = *(__u64 *)&gdt[GDT_ENTRY_ESPFIX_SS];
-       /* Set up base for espfix segment */
-       desc &= 0x00f0ff0000000000ULL;
-       desc |= ((((__u64)base) << 16) & 0x000000ffffff0000ULL) |
-               ((((__u64)base) << 32) & 0xff00000000000000ULL) |
-               ((((__u64)lim_pages) << 32) & 0x000f000000000000ULL) |
-               (lim_pages & 0xffff);
-       *(__u64 *)&gdt[GDT_ENTRY_ESPFIX_SS] = desc;
-       return new_kesp;
-}
-
-/*
- *  'math_state_restore()' saves the current math information in the
- * old math state array, and gets the new ones from the current task
- *
- * Careful.. There are problems with IBM-designed IRQ13 behaviour.
- * Don't touch unless you *really* know how it works.
- *
- * Must be called with kernel preemption disabled (in this case,
- * local interrupts are disabled at the call-site in entry.S).
- */
-asmlinkage void math_state_restore(void)
-{
-       struct thread_info *thread = current_thread_info();
-       struct task_struct *tsk = thread->task;
-
-       clts();         /* Allow maths ops (or we recurse) */
-       if (!tsk_used_math(tsk))
-               init_fpu(tsk);
-       restore_fpu(tsk);
-       thread->status |= TS_USEDFPU;   /* So we fnsave on switch_to() */
-       tsk->fpu_counter++;
-}
-EXPORT_SYMBOL_GPL(math_state_restore);
-
-#ifndef CONFIG_MATH_EMULATION
-
-asmlinkage void math_emulate(long arg)
-{
-       printk(KERN_EMERG "math-emulation not enabled and no coprocessor found.\n");
-       printk(KERN_EMERG "killing %s.\n",current->comm);
-       force_sig(SIGFPE,current);
-       schedule();
-}
-
-#endif /* CONFIG_MATH_EMULATION */
-
-#ifdef CONFIG_X86_F00F_BUG
-void __init trap_init_f00f_bug(void)
-{
-       __set_fixmap(FIX_F00F_IDT, __pa(&idt_table), PAGE_KERNEL_RO);
-
-       /*
-        * Update the IDT descriptor and reload the IDT so that
-        * it uses the read-only mapped virtual address.
-        */
-       idt_descr.address = fix_to_virt(FIX_F00F_IDT);
-       load_idt(&idt_descr);
-}
-#endif
-
-/*
- * This needs to use 'idt_table' rather than 'idt', and
- * thus use the _nonmapped_ version of the IDT, as the
- * Pentium F0 0F bugfix can have resulted in the mapped
- * IDT being write-protected.
- */
-void set_intr_gate(unsigned int n, void *addr)
-{
-       _set_gate(n, DESCTYPE_INT, addr, __KERNEL_CS);
-}
-
-/*
- * This routine sets up an interrupt gate at directory privilege level 3.
- */
-static inline void set_system_intr_gate(unsigned int n, void *addr)
-{
-       _set_gate(n, DESCTYPE_INT | DESCTYPE_DPL3, addr, __KERNEL_CS);
-}
-
-static void __init set_trap_gate(unsigned int n, void *addr)
-{
-       _set_gate(n, DESCTYPE_TRAP, addr, __KERNEL_CS);
-}
-
-static void __init set_system_gate(unsigned int n, void *addr)
-{
-       _set_gate(n, DESCTYPE_TRAP | DESCTYPE_DPL3, addr, __KERNEL_CS);
-}
-
-static void __init set_task_gate(unsigned int n, unsigned int gdt_entry)
-{
-       _set_gate(n, DESCTYPE_TASK, (void *)0, (gdt_entry<<3));
-}
-
-
-void __init trap_init(void)
-{
-#ifdef CONFIG_EISA
-       void __iomem *p = ioremap(0x0FFFD9, 4);
-       if (readl(p) == 'E'+('I'<<8)+('S'<<16)+('A'<<24)) {
-               EISA_bus = 1;
-       }
-       iounmap(p);
-#endif
-
-#ifdef CONFIG_X86_LOCAL_APIC
-       init_apic_mappings();
-#endif
-
-       set_trap_gate(0,&divide_error);
-       set_intr_gate(1,&debug);
-       set_intr_gate(2,&nmi);
-       set_system_intr_gate(3, &int3); /* int3/4 can be called from all */
-       set_system_gate(4,&overflow);
-       set_trap_gate(5,&bounds);
-       set_trap_gate(6,&invalid_op);
-       set_trap_gate(7,&device_not_available);
-       set_task_gate(8,GDT_ENTRY_DOUBLEFAULT_TSS);
-       set_trap_gate(9,&coprocessor_segment_overrun);
-       set_trap_gate(10,&invalid_TSS);
-       set_trap_gate(11,&segment_not_present);
-       set_trap_gate(12,&stack_segment);
-       set_trap_gate(13,&general_protection);
-       set_intr_gate(14,&page_fault);
-       set_trap_gate(15,&spurious_interrupt_bug);
-       set_trap_gate(16,&coprocessor_error);
-       set_trap_gate(17,&alignment_check);
-#ifdef CONFIG_X86_MCE
-       set_trap_gate(18,&machine_check);
-#endif
-       set_trap_gate(19,&simd_coprocessor_error);
-
-       if (cpu_has_fxsr) {
-               /*
-                * Verify that the FXSAVE/FXRSTOR data will be 16-byte aligned.
-                * Generates a compile-time "error: zero width for bit-field" if
-                * the alignment is wrong.
-                */
-               struct fxsrAlignAssert {
-                       int _:!(offsetof(struct task_struct,
-                                       thread.i387.fxsave) & 15);
-               };
-
-               printk(KERN_INFO "Enabling fast FPU save and restore... ");
-               set_in_cr4(X86_CR4_OSFXSR);
-               printk("done.\n");
-       }
-       if (cpu_has_xmm) {
-               printk(KERN_INFO "Enabling unmasked SIMD FPU exception "
-                               "support... ");
-               set_in_cr4(X86_CR4_OSXMMEXCPT);
-               printk("done.\n");
-       }
-
-       set_system_gate(SYSCALL_VECTOR,&system_call);
-
-       /*
-        * Should be a barrier for any external CPU state.
-        */
-       cpu_init();
-
-       trap_init_hook();
-}
-
-static int __init kstack_setup(char *s)
-{
-       kstack_depth_to_print = simple_strtoul(s, NULL, 0);
-       return 1;
-}
-__setup("kstack=", kstack_setup);
-
-static int __init code_bytes_setup(char *s)
-{
-       code_bytes = simple_strtoul(s, NULL, 0);
-       if (code_bytes > 8192)
-               code_bytes = 8192;
-
-       return 1;
-}
-__setup("code_bytes=", code_bytes_setup);
diff --git a/arch/i386/kernel/tsc_32.c b/arch/i386/kernel/tsc_32.c
deleted file mode 100644 (file)
index a39280b..0000000
+++ /dev/null
@@ -1,413 +0,0 @@
-/*
- * This code largely moved from arch/i386/kernel/timer/timer_tsc.c
- * which was originally moved from arch/i386/kernel/time.c.
- * See comments there for proper credits.
- */
-
-#include <linux/sched.h>
-#include <linux/clocksource.h>
-#include <linux/workqueue.h>
-#include <linux/cpufreq.h>
-#include <linux/jiffies.h>
-#include <linux/init.h>
-#include <linux/dmi.h>
-
-#include <asm/delay.h>
-#include <asm/tsc.h>
-#include <asm/io.h>
-#include <asm/timer.h>
-
-#include "mach_timer.h"
-
-static int tsc_enabled;
-
-/*
- * On some systems the TSC frequency does not
- * change with the cpu frequency. So we need
- * an extra value to store the TSC freq
- */
-unsigned int tsc_khz;
-EXPORT_SYMBOL_GPL(tsc_khz);
-
-int tsc_disable;
-
-#ifdef CONFIG_X86_TSC
-static int __init tsc_setup(char *str)
-{
-       printk(KERN_WARNING "notsc: Kernel compiled with CONFIG_X86_TSC, "
-                               "cannot disable TSC.\n");
-       return 1;
-}
-#else
-/*
- * disable flag for tsc. Takes effect by clearing the TSC cpu flag
- * in cpu/common.c
- */
-static int __init tsc_setup(char *str)
-{
-       tsc_disable = 1;
-
-       return 1;
-}
-#endif
-
-__setup("notsc", tsc_setup);
-
-/*
- * code to mark and check if the TSC is unstable
- * due to cpufreq or due to unsynced TSCs
- */
-static int tsc_unstable;
-
-int check_tsc_unstable(void)
-{
-       return tsc_unstable;
-}
-EXPORT_SYMBOL_GPL(check_tsc_unstable);
-
-/* Accellerators for sched_clock()
- * convert from cycles(64bits) => nanoseconds (64bits)
- *  basic equation:
- *             ns = cycles / (freq / ns_per_sec)
- *             ns = cycles * (ns_per_sec / freq)
- *             ns = cycles * (10^9 / (cpu_khz * 10^3))
- *             ns = cycles * (10^6 / cpu_khz)
- *
- *     Then we use scaling math (suggested by george@mvista.com) to get:
- *             ns = cycles * (10^6 * SC / cpu_khz) / SC
- *             ns = cycles * cyc2ns_scale / SC
- *
- *     And since SC is a constant power of two, we can convert the div
- *  into a shift.
- *
- *  We can use khz divisor instead of mhz to keep a better percision, since
- *  cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits.
- *  (mathieu.desnoyers@polymtl.ca)
- *
- *                     -johnstul@us.ibm.com "math is hard, lets go shopping!"
- */
-unsigned long cyc2ns_scale __read_mostly;
-
-#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */
-
-static inline void set_cyc2ns_scale(unsigned long cpu_khz)
-{
-       cyc2ns_scale = (1000000 << CYC2NS_SCALE_FACTOR)/cpu_khz;
-}
-
-/*
- * Scheduler clock - returns current time in nanosec units.
- */
-unsigned long long native_sched_clock(void)
-{
-       unsigned long long this_offset;
-
-       /*
-        * Fall back to jiffies if there's no TSC available:
-        * ( But note that we still use it if the TSC is marked
-        *   unstable. We do this because unlike Time Of Day,
-        *   the scheduler clock tolerates small errors and it's
-        *   very important for it to be as fast as the platform
-        *   can achive it. )
-        */
-       if (unlikely(!tsc_enabled && !tsc_unstable))
-               /* No locking but a rare wrong value is not a big deal: */
-               return (jiffies_64 - INITIAL_JIFFIES) * (1000000000 / HZ);
-
-       /* read the Time Stamp Counter: */
-       rdtscll(this_offset);
-
-       /* return the value in ns */
-       return cycles_2_ns(this_offset);
-}
-
-/* We need to define a real function for sched_clock, to override the
-   weak default version */
-#ifdef CONFIG_PARAVIRT
-unsigned long long sched_clock(void)
-{
-       return paravirt_sched_clock();
-}
-#else
-unsigned long long sched_clock(void)
-       __attribute__((alias("native_sched_clock")));
-#endif
-
-unsigned long native_calculate_cpu_khz(void)
-{
-       unsigned long long start, end;
-       unsigned long count;
-       u64 delta64;
-       int i;
-       unsigned long flags;
-
-       local_irq_save(flags);
-
-       /* run 3 times to ensure the cache is warm */
-       for (i = 0; i < 3; i++) {
-               mach_prepare_counter();
-               rdtscll(start);
-               mach_countup(&count);
-               rdtscll(end);
-       }
-       /*
-        * Error: ECTCNEVERSET
-        * The CTC wasn't reliable: we got a hit on the very first read,
-        * or the CPU was so fast/slow that the quotient wouldn't fit in
-        * 32 bits..
-        */
-       if (count <= 1)
-               goto err;
-
-       delta64 = end - start;
-
-       /* cpu freq too fast: */
-       if (delta64 > (1ULL<<32))
-               goto err;
-
-       /* cpu freq too slow: */
-       if (delta64 <= CALIBRATE_TIME_MSEC)
-               goto err;
-
-       delta64 += CALIBRATE_TIME_MSEC/2; /* round for do_div */
-       do_div(delta64,CALIBRATE_TIME_MSEC);
-
-       local_irq_restore(flags);
-       return (unsigned long)delta64;
-err:
-       local_irq_restore(flags);
-       return 0;
-}
-
-int recalibrate_cpu_khz(void)
-{
-#ifndef CONFIG_SMP
-       unsigned long cpu_khz_old = cpu_khz;
-
-       if (cpu_has_tsc) {
-               cpu_khz = calculate_cpu_khz();
-               tsc_khz = cpu_khz;
-               cpu_data[0].loops_per_jiffy =
-                       cpufreq_scale(cpu_data[0].loops_per_jiffy,
-                                       cpu_khz_old, cpu_khz);
-               return 0;
-       } else
-               return -ENODEV;
-#else
-       return -ENODEV;
-#endif
-}
-
-EXPORT_SYMBOL(recalibrate_cpu_khz);
-
-#ifdef CONFIG_CPU_FREQ
-
-/*
- * if the CPU frequency is scaled, TSC-based delays will need a different
- * loops_per_jiffy value to function properly.
- */
-static unsigned int ref_freq = 0;
-static unsigned long loops_per_jiffy_ref = 0;
-static unsigned long cpu_khz_ref = 0;
-
-static int
-time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, void *data)
-{
-       struct cpufreq_freqs *freq = data;
-
-       if (!ref_freq) {
-               if (!freq->old){
-                       ref_freq = freq->new;
-                       return 0;
-               }
-               ref_freq = freq->old;
-               loops_per_jiffy_ref = cpu_data[freq->cpu].loops_per_jiffy;
-               cpu_khz_ref = cpu_khz;
-       }
-
-       if ((val == CPUFREQ_PRECHANGE  && freq->old < freq->new) ||
-           (val == CPUFREQ_POSTCHANGE && freq->old > freq->new) ||
-           (val == CPUFREQ_RESUMECHANGE)) {
-               if (!(freq->flags & CPUFREQ_CONST_LOOPS))
-                       cpu_data[freq->cpu].loops_per_jiffy =
-                               cpufreq_scale(loops_per_jiffy_ref,
-                                               ref_freq, freq->new);
-
-               if (cpu_khz) {
-
-                       if (num_online_cpus() == 1)
-                               cpu_khz = cpufreq_scale(cpu_khz_ref,
-                                               ref_freq, freq->new);
-                       if (!(freq->flags & CPUFREQ_CONST_LOOPS)) {
-                               tsc_khz = cpu_khz;
-                               set_cyc2ns_scale(cpu_khz);
-                               /*
-                                * TSC based sched_clock turns
-                                * to junk w/ cpufreq
-                                */
-                               mark_tsc_unstable("cpufreq changes");
-                       }
-               }
-       }
-
-       return 0;
-}
-
-static struct notifier_block time_cpufreq_notifier_block = {
-       .notifier_call  = time_cpufreq_notifier
-};
-
-static int __init cpufreq_tsc(void)
-{
-       return cpufreq_register_notifier(&time_cpufreq_notifier_block,
-                                        CPUFREQ_TRANSITION_NOTIFIER);
-}
-core_initcall(cpufreq_tsc);
-
-#endif
-
-/* clock source code */
-
-static unsigned long current_tsc_khz = 0;
-
-static cycle_t read_tsc(void)
-{
-       cycle_t ret;
-
-       rdtscll(ret);
-
-       return ret;
-}
-
-static struct clocksource clocksource_tsc = {
-       .name                   = "tsc",
-       .rating                 = 300,
-       .read                   = read_tsc,
-       .mask                   = CLOCKSOURCE_MASK(64),
-       .mult                   = 0, /* to be set */
-       .shift                  = 22,
-       .flags                  = CLOCK_SOURCE_IS_CONTINUOUS |
-                                 CLOCK_SOURCE_MUST_VERIFY,
-};
-
-void mark_tsc_unstable(char *reason)
-{
-       if (!tsc_unstable) {
-               tsc_unstable = 1;
-               tsc_enabled = 0;
-               printk("Marking TSC unstable due to: %s.\n", reason);
-               /* Can be called before registration */
-               if (clocksource_tsc.mult)
-                       clocksource_change_rating(&clocksource_tsc, 0);
-               else
-                       clocksource_tsc.rating = 0;
-       }
-}
-EXPORT_SYMBOL_GPL(mark_tsc_unstable);
-
-static int __init dmi_mark_tsc_unstable(struct dmi_system_id *d)
-{
-       printk(KERN_NOTICE "%s detected: marking TSC unstable.\n",
-                      d->ident);
-       tsc_unstable = 1;
-       return 0;
-}
-
-/* List of systems that have known TSC problems */
-static struct dmi_system_id __initdata bad_tsc_dmi_table[] = {
-       {
-        .callback = dmi_mark_tsc_unstable,
-        .ident = "IBM Thinkpad 380XD",
-        .matches = {
-                    DMI_MATCH(DMI_BOARD_VENDOR, "IBM"),
-                    DMI_MATCH(DMI_BOARD_NAME, "2635FA0"),
-                    },
-        },
-        {}
-};
-
-/*
- * Make an educated guess if the TSC is trustworthy and synchronized
- * over all CPUs.
- */
-__cpuinit int unsynchronized_tsc(void)
-{
-       if (!cpu_has_tsc || tsc_unstable)
-               return 1;
-       /*
-        * Intel systems are normally all synchronized.
-        * Exceptions must mark TSC as unstable:
-        */
-       if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) {
-               /* assume multi socket systems are not synchronized: */
-               if (num_possible_cpus() > 1)
-                       tsc_unstable = 1;
-       }
-       return tsc_unstable;
-}
-
-/*
- * Geode_LX - the OLPC CPU has a possibly a very reliable TSC
- */
-#ifdef CONFIG_MGEODE_LX
-/* RTSC counts during suspend */
-#define RTSC_SUSP 0x100
-
-static void __init check_geode_tsc_reliable(void)
-{
-       unsigned long val;
-
-       rdmsrl(MSR_GEODE_BUSCONT_CONF0, val);
-       if ((val & RTSC_SUSP))
-               clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY;
-}
-#else
-static inline void check_geode_tsc_reliable(void) { }
-#endif
-
-
-void __init tsc_init(void)
-{
-       if (!cpu_has_tsc || tsc_disable)
-               goto out_no_tsc;
-
-       cpu_khz = calculate_cpu_khz();
-       tsc_khz = cpu_khz;
-
-       if (!cpu_khz)
-               goto out_no_tsc;
-
-       printk("Detected %lu.%03lu MHz processor.\n",
-                               (unsigned long)cpu_khz / 1000,
-                               (unsigned long)cpu_khz % 1000);
-
-       set_cyc2ns_scale(cpu_khz);
-       use_tsc_delay();
-
-       /* Check and install the TSC clocksource */
-       dmi_check_system(bad_tsc_dmi_table);
-
-       unsynchronized_tsc();
-       check_geode_tsc_reliable();
-       current_tsc_khz = tsc_khz;
-       clocksource_tsc.mult = clocksource_khz2mult(current_tsc_khz,
-                                                       clocksource_tsc.shift);
-       /* lower the rating if we already know its unstable: */
-       if (check_tsc_unstable()) {
-               clocksource_tsc.rating = 0;
-               clocksource_tsc.flags &= ~CLOCK_SOURCE_IS_CONTINUOUS;
-       } else
-               tsc_enabled = 1;
-
-       clocksource_register(&clocksource_tsc);
-
-       return;
-
-out_no_tsc:
-       /*
-        * Set the tsc_disable flag if there's no TSC support, this
-        * makes it a fast flag for the kernel to see whether it
-        * should be using the TSC.
-        */
-       tsc_disable = 1;
-}
diff --git a/arch/i386/kernel/tsc_sync.c b/arch/i386/kernel/tsc_sync.c
deleted file mode 100644 (file)
index 1242462..0000000
+++ /dev/null
@@ -1 +0,0 @@
-#include "../../x86_64/kernel/tsc_sync.c"
diff --git a/arch/i386/kernel/vm86_32.c b/arch/i386/kernel/vm86_32.c
deleted file mode 100644 (file)
index f2dcd1d..0000000
+++ /dev/null
@@ -1,843 +0,0 @@
-/*
- *  linux/kernel/vm86.c
- *
- *  Copyright (C) 1994  Linus Torvalds
- *
- *  29 dec 2001 - Fixed oopses caused by unchecked access to the vm86
- *                stack - Manfred Spraul <manfred@colorfullife.com>
- *
- *  22 mar 2002 - Manfred detected the stackfaults, but didn't handle
- *                them correctly. Now the emulation will be in a
- *                consistent state after stackfaults - Kasper Dupont
- *                <kasperd@daimi.au.dk>
- *
- *  22 mar 2002 - Added missing clear_IF in set_vflags_* Kasper Dupont
- *                <kasperd@daimi.au.dk>
- *
- *  ?? ??? 2002 - Fixed premature returns from handle_vm86_fault
- *                caused by Kasper Dupont's changes - Stas Sergeev
- *
- *   4 apr 2002 - Fixed CHECK_IF_IN_TRAP broken by Stas' changes.
- *                Kasper Dupont <kasperd@daimi.au.dk>
- *
- *   9 apr 2002 - Changed syntax of macros in handle_vm86_fault.
- *                Kasper Dupont <kasperd@daimi.au.dk>
- *
- *   9 apr 2002 - Changed stack access macros to jump to a label
- *                instead of returning to userspace. This simplifies
- *                do_int, and is needed by handle_vm6_fault. Kasper
- *                Dupont <kasperd@daimi.au.dk>
- *
- */
-
-#include <linux/capability.h>
-#include <linux/errno.h>
-#include <linux/interrupt.h>
-#include <linux/sched.h>
-#include <linux/kernel.h>
-#include <linux/signal.h>
-#include <linux/string.h>
-#include <linux/mm.h>
-#include <linux/smp.h>
-#include <linux/highmem.h>
-#include <linux/ptrace.h>
-#include <linux/audit.h>
-#include <linux/stddef.h>
-
-#include <asm/uaccess.h>
-#include <asm/io.h>
-#include <asm/tlbflush.h>
-#include <asm/irq.h>
-
-/*
- * Known problems:
- *
- * Interrupt handling is not guaranteed:
- * - a real x86 will disable all interrupts for one instruction
- *   after a "mov ss,xx" to make stack handling atomic even without
- *   the 'lss' instruction. We can't guarantee this in v86 mode,
- *   as the next instruction might result in a page fault or similar.
- * - a real x86 will have interrupts disabled for one instruction
- *   past the 'sti' that enables them. We don't bother with all the
- *   details yet.
- *
- * Let's hope these problems do not actually matter for anything.
- */
-
-
-#define KVM86  ((struct kernel_vm86_struct *)regs)
-#define VMPI   KVM86->vm86plus
-
-
-/*
- * 8- and 16-bit register defines..
- */
-#define AL(regs)       (((unsigned char *)&((regs)->pt.eax))[0])
-#define AH(regs)       (((unsigned char *)&((regs)->pt.eax))[1])
-#define IP(regs)       (*(unsigned short *)&((regs)->pt.eip))
-#define SP(regs)       (*(unsigned short *)&((regs)->pt.esp))
-
-/*
- * virtual flags (16 and 32-bit versions)
- */
-#define VFLAGS (*(unsigned short *)&(current->thread.v86flags))
-#define VEFLAGS        (current->thread.v86flags)
-
-#define set_flags(X,new,mask) \
-((X) = ((X) & ~(mask)) | ((new) & (mask)))
-
-#define SAFE_MASK      (0xDD5)
-#define RETURN_MASK    (0xDFF)
-
-/* convert kernel_vm86_regs to vm86_regs */
-static int copy_vm86_regs_to_user(struct vm86_regs __user *user,
-                                 const struct kernel_vm86_regs *regs)
-{
-       int ret = 0;
-
-       /* kernel_vm86_regs is missing xgs, so copy everything up to
-          (but not including) orig_eax, and then rest including orig_eax. */
-       ret += copy_to_user(user, regs, offsetof(struct kernel_vm86_regs, pt.orig_eax));
-       ret += copy_to_user(&user->orig_eax, &regs->pt.orig_eax,
-                           sizeof(struct kernel_vm86_regs) -
-                           offsetof(struct kernel_vm86_regs, pt.orig_eax));
-
-       return ret;
-}
-
-/* convert vm86_regs to kernel_vm86_regs */
-static int copy_vm86_regs_from_user(struct kernel_vm86_regs *regs,
-                                   const struct vm86_regs __user *user,
-                                   unsigned extra)
-{
-       int ret = 0;
-
-       /* copy eax-xfs inclusive */
-       ret += copy_from_user(regs, user, offsetof(struct kernel_vm86_regs, pt.orig_eax));
-       /* copy orig_eax-__gsh+extra */
-       ret += copy_from_user(&regs->pt.orig_eax, &user->orig_eax,
-                             sizeof(struct kernel_vm86_regs) -
-                             offsetof(struct kernel_vm86_regs, pt.orig_eax) +
-                             extra);
-       return ret;
-}
-
-struct pt_regs * FASTCALL(save_v86_state(struct kernel_vm86_regs * regs));
-struct pt_regs * fastcall save_v86_state(struct kernel_vm86_regs * regs)
-{
-       struct tss_struct *tss;
-       struct pt_regs *ret;
-       unsigned long tmp;
-
-       /*
-        * This gets called from entry.S with interrupts disabled, but
-        * from process context. Enable interrupts here, before trying
-        * to access user space.
-        */
-       local_irq_enable();
-
-       if (!current->thread.vm86_info) {
-               printk("no vm86_info: BAD\n");
-               do_exit(SIGSEGV);
-       }
-       set_flags(regs->pt.eflags, VEFLAGS, VIF_MASK | current->thread.v86mask);
-       tmp = copy_vm86_regs_to_user(&current->thread.vm86_info->regs,regs);
-       tmp += put_user(current->thread.screen_bitmap,&current->thread.vm86_info->screen_bitmap);
-       if (tmp) {
-               printk("vm86: could not access userspace vm86_info\n");
-               do_exit(SIGSEGV);
-       }
-
-       tss = &per_cpu(init_tss, get_cpu());
-       current->thread.esp0 = current->thread.saved_esp0;
-       current->thread.sysenter_cs = __KERNEL_CS;
-       load_esp0(tss, &current->thread);
-       current->thread.saved_esp0 = 0;
-       put_cpu();
-
-       ret = KVM86->regs32;
-
-       ret->xfs = current->thread.saved_fs;
-       loadsegment(gs, current->thread.saved_gs);
-
-       return ret;
-}
-
-static void mark_screen_rdonly(struct mm_struct *mm)
-{
-       pgd_t *pgd;
-       pud_t *pud;
-       pmd_t *pmd;
-       pte_t *pte;
-       spinlock_t *ptl;
-       int i;
-
-       pgd = pgd_offset(mm, 0xA0000);
-       if (pgd_none_or_clear_bad(pgd))
-               goto out;
-       pud = pud_offset(pgd, 0xA0000);
-       if (pud_none_or_clear_bad(pud))
-               goto out;
-       pmd = pmd_offset(pud, 0xA0000);
-       if (pmd_none_or_clear_bad(pmd))
-               goto out;
-       pte = pte_offset_map_lock(mm, pmd, 0xA0000, &ptl);
-       for (i = 0; i < 32; i++) {
-               if (pte_present(*pte))
-                       set_pte(pte, pte_wrprotect(*pte));
-               pte++;
-       }
-       pte_unmap_unlock(pte, ptl);
-out:
-       flush_tlb();
-}
-
-
-
-static int do_vm86_irq_handling(int subfunction, int irqnumber);
-static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk);
-
-asmlinkage int sys_vm86old(struct pt_regs regs)
-{
-       struct vm86_struct __user *v86 = (struct vm86_struct __user *)regs.ebx;
-       struct kernel_vm86_struct info; /* declare this _on top_,
-                                        * this avoids wasting of stack space.
-                                        * This remains on the stack until we
-                                        * return to 32 bit user space.
-                                        */
-       struct task_struct *tsk;
-       int tmp, ret = -EPERM;
-
-       tsk = current;
-       if (tsk->thread.saved_esp0)
-               goto out;
-       tmp = copy_vm86_regs_from_user(&info.regs, &v86->regs,
-                                      offsetof(struct kernel_vm86_struct, vm86plus) -
-                                      sizeof(info.regs));
-       ret = -EFAULT;
-       if (tmp)
-               goto out;
-       memset(&info.vm86plus, 0, (int)&info.regs32 - (int)&info.vm86plus);
-       info.regs32 = &regs;
-       tsk->thread.vm86_info = v86;
-       do_sys_vm86(&info, tsk);
-       ret = 0;        /* we never return here */
-out:
-       return ret;
-}
-
-
-asmlinkage int sys_vm86(struct pt_regs regs)
-{
-       struct kernel_vm86_struct info; /* declare this _on top_,
-                                        * this avoids wasting of stack space.
-                                        * This remains on the stack until we
-                                        * return to 32 bit user space.
-                                        */
-       struct task_struct *tsk;
-       int tmp, ret;
-       struct vm86plus_struct __user *v86;
-
-       tsk = current;
-       switch (regs.ebx) {
-               case VM86_REQUEST_IRQ:
-               case VM86_FREE_IRQ:
-               case VM86_GET_IRQ_BITS:
-               case VM86_GET_AND_RESET_IRQ:
-                       ret = do_vm86_irq_handling(regs.ebx, (int)regs.ecx);
-                       goto out;
-               case VM86_PLUS_INSTALL_CHECK:
-                       /* NOTE: on old vm86 stuff this will return the error
-                          from access_ok(), because the subfunction is
-                          interpreted as (invalid) address to vm86_struct.
-                          So the installation check works.
-                        */
-                       ret = 0;
-                       goto out;
-       }
-
-       /* we come here only for functions VM86_ENTER, VM86_ENTER_NO_BYPASS */
-       ret = -EPERM;
-       if (tsk->thread.saved_esp0)
-               goto out;
-       v86 = (struct vm86plus_struct __user *)regs.ecx;
-       tmp = copy_vm86_regs_from_user(&info.regs, &v86->regs,
-                                      offsetof(struct kernel_vm86_struct, regs32) -
-                                      sizeof(info.regs));
-       ret = -EFAULT;
-       if (tmp)
-               goto out;
-       info.regs32 = &regs;
-       info.vm86plus.is_vm86pus = 1;
-       tsk->thread.vm86_info = (struct vm86_struct __user *)v86;
-       do_sys_vm86(&info, tsk);
-       ret = 0;        /* we never return here */
-out:
-       return ret;
-}
-
-
-static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk)
-{
-       struct tss_struct *tss;
-/*
- * make sure the vm86() system call doesn't try to do anything silly
- */
-       info->regs.pt.xds = 0;
-       info->regs.pt.xes = 0;
-       info->regs.pt.xfs = 0;
-
-/* we are clearing gs later just before "jmp resume_userspace",
- * because it is not saved/restored.
- */
-
-/*
- * The eflags register is also special: we cannot trust that the user
- * has set it up safely, so this makes sure interrupt etc flags are
- * inherited from protected mode.
- */
-       VEFLAGS = info->regs.pt.eflags;
-       info->regs.pt.eflags &= SAFE_MASK;
-       info->regs.pt.eflags |= info->regs32->eflags & ~SAFE_MASK;
-       info->regs.pt.eflags |= VM_MASK;
-
-       switch (info->cpu_type) {
-               case CPU_286:
-                       tsk->thread.v86mask = 0;
-                       break;
-               case CPU_386:
-                       tsk->thread.v86mask = NT_MASK | IOPL_MASK;
-                       break;
-               case CPU_486:
-                       tsk->thread.v86mask = AC_MASK | NT_MASK | IOPL_MASK;
-                       break;
-               default:
-                       tsk->thread.v86mask = ID_MASK | AC_MASK | NT_MASK | IOPL_MASK;
-                       break;
-       }
-
-/*
- * Save old state, set default return value (%eax) to 0
- */
-       info->regs32->eax = 0;
-       tsk->thread.saved_esp0 = tsk->thread.esp0;
-       tsk->thread.saved_fs = info->regs32->xfs;
-       savesegment(gs, tsk->thread.saved_gs);
-
-       tss = &per_cpu(init_tss, get_cpu());
-       tsk->thread.esp0 = (unsigned long) &info->VM86_TSS_ESP0;
-       if (cpu_has_sep)
-               tsk->thread.sysenter_cs = 0;
-       load_esp0(tss, &tsk->thread);
-       put_cpu();
-
-       tsk->thread.screen_bitmap = info->screen_bitmap;
-       if (info->flags & VM86_SCREEN_BITMAP)
-               mark_screen_rdonly(tsk->mm);
-
-       /*call audit_syscall_exit since we do not exit via the normal paths */
-       if (unlikely(current->audit_context))
-               audit_syscall_exit(AUDITSC_RESULT(0), 0);
-
-       __asm__ __volatile__(
-               "movl %0,%%esp\n\t"
-               "movl %1,%%ebp\n\t"
-               "mov  %2, %%gs\n\t"
-               "jmp resume_userspace"
-               : /* no outputs */
-               :"r" (&info->regs), "r" (task_thread_info(tsk)), "r" (0));
-       /* we never return here */
-}
-
-static inline void return_to_32bit(struct kernel_vm86_regs * regs16, int retval)
-{
-       struct pt_regs * regs32;
-
-       regs32 = save_v86_state(regs16);
-       regs32->eax = retval;
-       __asm__ __volatile__("movl %0,%%esp\n\t"
-               "movl %1,%%ebp\n\t"
-               "jmp resume_userspace"
-               : : "r" (regs32), "r" (current_thread_info()));
-}
-
-static inline void set_IF(struct kernel_vm86_regs * regs)
-{
-       VEFLAGS |= VIF_MASK;
-       if (VEFLAGS & VIP_MASK)
-               return_to_32bit(regs, VM86_STI);
-}
-
-static inline void clear_IF(struct kernel_vm86_regs * regs)
-{
-       VEFLAGS &= ~VIF_MASK;
-}
-
-static inline void clear_TF(struct kernel_vm86_regs * regs)
-{
-       regs->pt.eflags &= ~TF_MASK;
-}
-
-static inline void clear_AC(struct kernel_vm86_regs * regs)
-{
-       regs->pt.eflags &= ~AC_MASK;
-}
-
-/* It is correct to call set_IF(regs) from the set_vflags_*
- * functions. However someone forgot to call clear_IF(regs)
- * in the opposite case.
- * After the command sequence CLI PUSHF STI POPF you should
- * end up with interrups disabled, but you ended up with
- * interrupts enabled.
- *  ( I was testing my own changes, but the only bug I
- *    could find was in a function I had not changed. )
- * [KD]
- */
-
-static inline void set_vflags_long(unsigned long eflags, struct kernel_vm86_regs * regs)
-{
-       set_flags(VEFLAGS, eflags, current->thread.v86mask);
-       set_flags(regs->pt.eflags, eflags, SAFE_MASK);
-       if (eflags & IF_MASK)
-               set_IF(regs);
-       else
-               clear_IF(regs);
-}
-
-static inline void set_vflags_short(unsigned short flags, struct kernel_vm86_regs * regs)
-{
-       set_flags(VFLAGS, flags, current->thread.v86mask);
-       set_flags(regs->pt.eflags, flags, SAFE_MASK);
-       if (flags & IF_MASK)
-               set_IF(regs);
-       else
-               clear_IF(regs);
-}
-
-static inline unsigned long get_vflags(struct kernel_vm86_regs * regs)
-{
-       unsigned long flags = regs->pt.eflags & RETURN_MASK;
-
-       if (VEFLAGS & VIF_MASK)
-               flags |= IF_MASK;
-       flags |= IOPL_MASK;
-       return flags | (VEFLAGS & current->thread.v86mask);
-}
-
-static inline int is_revectored(int nr, struct revectored_struct * bitmap)
-{
-       __asm__ __volatile__("btl %2,%1\n\tsbbl %0,%0"
-               :"=r" (nr)
-               :"m" (*bitmap),"r" (nr));
-       return nr;
-}
-
-#define val_byte(val, n) (((__u8 *)&val)[n])
-
-#define pushb(base, ptr, val, err_label) \
-       do { \
-               __u8 __val = val; \
-               ptr--; \
-               if (put_user(__val, base + ptr) < 0) \
-                       goto err_label; \
-       } while(0)
-
-#define pushw(base, ptr, val, err_label) \
-       do { \
-               __u16 __val = val; \
-               ptr--; \
-               if (put_user(val_byte(__val, 1), base + ptr) < 0) \
-                       goto err_label; \
-               ptr--; \
-               if (put_user(val_byte(__val, 0), base + ptr) < 0) \
-                       goto err_label; \
-       } while(0)
-
-#define pushl(base, ptr, val, err_label) \
-       do { \
-               __u32 __val = val; \
-               ptr--; \
-               if (put_user(val_byte(__val, 3), base + ptr) < 0) \
-                       goto err_label; \
-               ptr--; \
-               if (put_user(val_byte(__val, 2), base + ptr) < 0) \
-                       goto err_label; \
-               ptr--; \
-               if (put_user(val_byte(__val, 1), base + ptr) < 0) \
-                       goto err_label; \
-               ptr--; \
-               if (put_user(val_byte(__val, 0), base + ptr) < 0) \
-                       goto err_label; \
-       } while(0)
-
-#define popb(base, ptr, err_label) \
-       ({ \
-               __u8 __res; \
-               if (get_user(__res, base + ptr) < 0) \
-                       goto err_label; \
-               ptr++; \
-               __res; \
-       })
-
-#define popw(base, ptr, err_label) \
-       ({ \
-               __u16 __res; \
-               if (get_user(val_byte(__res, 0), base + ptr) < 0) \
-                       goto err_label; \
-               ptr++; \
-               if (get_user(val_byte(__res, 1), base + ptr) < 0) \
-                       goto err_label; \
-               ptr++; \
-               __res; \
-       })
-
-#define popl(base, ptr, err_label) \
-       ({ \
-               __u32 __res; \
-               if (get_user(val_byte(__res, 0), base + ptr) < 0) \
-                       goto err_label; \
-               ptr++; \
-               if (get_user(val_byte(__res, 1), base + ptr) < 0) \
-                       goto err_label; \
-               ptr++; \
-               if (get_user(val_byte(__res, 2), base + ptr) < 0) \
-                       goto err_label; \
-               ptr++; \
-               if (get_user(val_byte(__res, 3), base + ptr) < 0) \
-                       goto err_label; \
-               ptr++; \
-               __res; \
-       })
-
-/* There are so many possible reasons for this function to return
- * VM86_INTx, so adding another doesn't bother me. We can expect
- * userspace programs to be able to handle it. (Getting a problem
- * in userspace is always better than an Oops anyway.) [KD]
- */
-static void do_int(struct kernel_vm86_regs *regs, int i,
-    unsigned char __user * ssp, unsigned short sp)
-{
-       unsigned long __user *intr_ptr;
-       unsigned long segoffs;
-
-       if (regs->pt.xcs == BIOSSEG)
-               goto cannot_handle;
-       if (is_revectored(i, &KVM86->int_revectored))
-               goto cannot_handle;
-       if (i==0x21 && is_revectored(AH(regs),&KVM86->int21_revectored))
-               goto cannot_handle;
-       intr_ptr = (unsigned long __user *) (i << 2);
-       if (get_user(segoffs, intr_ptr))
-               goto cannot_handle;
-       if ((segoffs >> 16) == BIOSSEG)
-               goto cannot_handle;
-       pushw(ssp, sp, get_vflags(regs), cannot_handle);
-       pushw(ssp, sp, regs->pt.xcs, cannot_handle);
-       pushw(ssp, sp, IP(regs), cannot_handle);
-       regs->pt.xcs = segoffs >> 16;
-       SP(regs) -= 6;
-       IP(regs) = segoffs & 0xffff;
-       clear_TF(regs);
-       clear_IF(regs);
-       clear_AC(regs);
-       return;
-
-cannot_handle:
-       return_to_32bit(regs, VM86_INTx + (i << 8));
-}
-
-int handle_vm86_trap(struct kernel_vm86_regs * regs, long error_code, int trapno)
-{
-       if (VMPI.is_vm86pus) {
-               if ( (trapno==3) || (trapno==1) )
-                       return_to_32bit(regs, VM86_TRAP + (trapno << 8));
-               do_int(regs, trapno, (unsigned char __user *) (regs->pt.xss << 4), SP(regs));
-               return 0;
-       }
-       if (trapno !=1)
-               return 1; /* we let this handle by the calling routine */
-       if (current->ptrace & PT_PTRACED) {
-               unsigned long flags;
-               spin_lock_irqsave(&current->sighand->siglock, flags);
-               sigdelset(&current->blocked, SIGTRAP);
-               recalc_sigpending();
-               spin_unlock_irqrestore(&current->sighand->siglock, flags);
-       }
-       send_sig(SIGTRAP, current, 1);
-       current->thread.trap_no = trapno;
-       current->thread.error_code = error_code;
-       return 0;
-}
-
-void handle_vm86_fault(struct kernel_vm86_regs * regs, long error_code)
-{
-       unsigned char opcode;
-       unsigned char __user *csp;
-       unsigned char __user *ssp;
-       unsigned short ip, sp, orig_flags;
-       int data32, pref_done;
-
-#define CHECK_IF_IN_TRAP \
-       if (VMPI.vm86dbg_active && VMPI.vm86dbg_TFpendig) \
-               newflags |= TF_MASK
-#define VM86_FAULT_RETURN do { \
-       if (VMPI.force_return_for_pic  && (VEFLAGS & (IF_MASK | VIF_MASK))) \
-               return_to_32bit(regs, VM86_PICRETURN); \
-       if (orig_flags & TF_MASK) \
-               handle_vm86_trap(regs, 0, 1); \
-       return; } while (0)
-
-       orig_flags = *(unsigned short *)&regs->pt.eflags;
-
-       csp = (unsigned char __user *) (regs->pt.xcs << 4);
-       ssp = (unsigned char __user *) (regs->pt.xss << 4);
-       sp = SP(regs);
-       ip = IP(regs);
-
-       data32 = 0;
-       pref_done = 0;
-       do {
-               switch (opcode = popb(csp, ip, simulate_sigsegv)) {
-                       case 0x66:      /* 32-bit data */     data32=1; break;
-                       case 0x67:      /* 32-bit address */  break;
-                       case 0x2e:      /* CS */              break;
-                       case 0x3e:      /* DS */              break;
-                       case 0x26:      /* ES */              break;
-                       case 0x36:      /* SS */              break;
-                       case 0x65:      /* GS */              break;
-                       case 0x64:      /* FS */              break;
-                       case 0xf2:      /* repnz */       break;
-                       case 0xf3:      /* rep */             break;
-                       default: pref_done = 1;
-               }
-       } while (!pref_done);
-
-       switch (opcode) {
-
-       /* pushf */
-       case 0x9c:
-               if (data32) {
-                       pushl(ssp, sp, get_vflags(regs), simulate_sigsegv);
-                       SP(regs) -= 4;
-               } else {
-                       pushw(ssp, sp, get_vflags(regs), simulate_sigsegv);
-                       SP(regs) -= 2;
-               }
-               IP(regs) = ip;
-               VM86_FAULT_RETURN;
-
-       /* popf */
-       case 0x9d:
-               {
-               unsigned long newflags;
-               if (data32) {
-                       newflags=popl(ssp, sp, simulate_sigsegv);
-                       SP(regs) += 4;
-               } else {
-                       newflags = popw(ssp, sp, simulate_sigsegv);
-                       SP(regs) += 2;
-               }
-               IP(regs) = ip;
-               CHECK_IF_IN_TRAP;
-               if (data32) {
-                       set_vflags_long(newflags, regs);
-               } else {
-                       set_vflags_short(newflags, regs);
-               }
-               VM86_FAULT_RETURN;
-               }
-
-       /* int xx */
-       case 0xcd: {
-               int intno=popb(csp, ip, simulate_sigsegv);
-               IP(regs) = ip;
-               if (VMPI.vm86dbg_active) {
-                       if ( (1 << (intno &7)) & VMPI.vm86dbg_intxxtab[intno >> 3] )
-                               return_to_32bit(regs, VM86_INTx + (intno << 8));
-               }
-               do_int(regs, intno, ssp, sp);
-               return;
-       }
-
-       /* iret */
-       case 0xcf:
-               {
-               unsigned long newip;
-               unsigned long newcs;
-               unsigned long newflags;
-               if (data32) {
-                       newip=popl(ssp, sp, simulate_sigsegv);
-                       newcs=popl(ssp, sp, simulate_sigsegv);
-                       newflags=popl(ssp, sp, simulate_sigsegv);
-                       SP(regs) += 12;
-               } else {
-                       newip = popw(ssp, sp, simulate_sigsegv);
-                       newcs = popw(ssp, sp, simulate_sigsegv);
-                       newflags = popw(ssp, sp, simulate_sigsegv);
-                       SP(regs) += 6;
-               }
-               IP(regs) = newip;
-               regs->pt.xcs = newcs;
-               CHECK_IF_IN_TRAP;
-               if (data32) {
-                       set_vflags_long(newflags, regs);
-               } else {
-                       set_vflags_short(newflags, regs);
-               }
-               VM86_FAULT_RETURN;
-               }
-
-       /* cli */
-       case 0xfa:
-               IP(regs) = ip;
-               clear_IF(regs);
-               VM86_FAULT_RETURN;
-
-       /* sti */
-       /*
-        * Damn. This is incorrect: the 'sti' instruction should actually
-        * enable interrupts after the /next/ instruction. Not good.
-        *
-        * Probably needs some horsing around with the TF flag. Aiee..
-        */
-       case 0xfb:
-               IP(regs) = ip;
-               set_IF(regs);
-               VM86_FAULT_RETURN;
-
-       default:
-               return_to_32bit(regs, VM86_UNKNOWN);
-       }
-
-       return;
-
-simulate_sigsegv:
-       /* FIXME: After a long discussion with Stas we finally
-        *        agreed, that this is wrong. Here we should
-        *        really send a SIGSEGV to the user program.
-        *        But how do we create the correct context? We
-        *        are inside a general protection fault handler
-        *        and has just returned from a page fault handler.
-        *        The correct context for the signal handler
-        *        should be a mixture of the two, but how do we
-        *        get the information? [KD]
-        */
-       return_to_32bit(regs, VM86_UNKNOWN);
-}
-
-/* ---------------- vm86 special IRQ passing stuff ----------------- */
-
-#define VM86_IRQNAME           "vm86irq"
-
-static struct vm86_irqs {
-       struct task_struct *tsk;
-       int sig;
-} vm86_irqs[16];
-
-static DEFINE_SPINLOCK(irqbits_lock);
-static int irqbits;
-
-#define ALLOWED_SIGS ( 1 /* 0 = don't send a signal */ \
-       | (1 << SIGUSR1) | (1 << SIGUSR2) | (1 << SIGIO)  | (1 << SIGURG) \
-       | (1 << SIGUNUSED) )
-       
-static irqreturn_t irq_handler(int intno, void *dev_id)
-{
-       int irq_bit;
-       unsigned long flags;
-
-       spin_lock_irqsave(&irqbits_lock, flags);        
-       irq_bit = 1 << intno;
-       if ((irqbits & irq_bit) || ! vm86_irqs[intno].tsk)
-               goto out;
-       irqbits |= irq_bit;
-       if (vm86_irqs[intno].sig)
-               send_sig(vm86_irqs[intno].sig, vm86_irqs[intno].tsk, 1);
-       /*
-        * IRQ will be re-enabled when user asks for the irq (whether
-        * polling or as a result of the signal)
-        */
-       disable_irq_nosync(intno);
-       spin_unlock_irqrestore(&irqbits_lock, flags);
-       return IRQ_HANDLED;
-
-out:
-       spin_unlock_irqrestore(&irqbits_lock, flags);   
-       return IRQ_NONE;
-}
-
-static inline void free_vm86_irq(int irqnumber)
-{
-       unsigned long flags;
-
-       free_irq(irqnumber, NULL);
-       vm86_irqs[irqnumber].tsk = NULL;
-
-       spin_lock_irqsave(&irqbits_lock, flags);        
-       irqbits &= ~(1 << irqnumber);
-       spin_unlock_irqrestore(&irqbits_lock, flags);   
-}
-
-void release_vm86_irqs(struct task_struct *task)
-{
-       int i;
-       for (i = FIRST_VM86_IRQ ; i <= LAST_VM86_IRQ; i++)
-           if (vm86_irqs[i].tsk == task)
-               free_vm86_irq(i);
-}
-
-static inline int get_and_reset_irq(int irqnumber)
-{
-       int bit;
-       unsigned long flags;
-       int ret = 0;
-       
-       if (invalid_vm86_irq(irqnumber)) return 0;
-       if (vm86_irqs[irqnumber].tsk != current) return 0;
-       spin_lock_irqsave(&irqbits_lock, flags);        
-       bit = irqbits & (1 << irqnumber);
-       irqbits &= ~bit;
-       if (bit) {
-               enable_irq(irqnumber);
-               ret = 1;
-       }
-
-       spin_unlock_irqrestore(&irqbits_lock, flags);   
-       return ret;
-}
-
-
-static int do_vm86_irq_handling(int subfunction, int irqnumber)
-{
-       int ret;
-       switch (subfunction) {
-               case VM86_GET_AND_RESET_IRQ: {
-                       return get_and_reset_irq(irqnumber);
-               }
-               case VM86_GET_IRQ_BITS: {
-                       return irqbits;
-               }
-               case VM86_REQUEST_IRQ: {
-                       int sig = irqnumber >> 8;
-                       int irq = irqnumber & 255;
-                       if (!capable(CAP_SYS_ADMIN)) return -EPERM;
-                       if (!((1 << sig) & ALLOWED_SIGS)) return -EPERM;
-                       if (invalid_vm86_irq(irq)) return -EPERM;
-                       if (vm86_irqs[irq].tsk) return -EPERM;
-                       ret = request_irq(irq, &irq_handler, 0, VM86_IRQNAME, NULL);
-                       if (ret) return ret;
-                       vm86_irqs[irq].sig = sig;
-                       vm86_irqs[irq].tsk = current;
-                       return irq;
-               }
-               case  VM86_FREE_IRQ: {
-                       if (invalid_vm86_irq(irqnumber)) return -EPERM;
-                       if (!vm86_irqs[irqnumber].tsk) return 0;
-                       if (vm86_irqs[irqnumber].tsk != current) return -EPERM;
-                       free_vm86_irq(irqnumber);
-                       return 0;
-               }
-       }
-       return -EINVAL;
-}
-
diff --git a/arch/i386/kernel/vmi_32.c b/arch/i386/kernel/vmi_32.c
deleted file mode 100644 (file)
index 18673e0..0000000
+++ /dev/null
@@ -1,981 +0,0 @@
-/*
- * VMI specific paravirt-ops implementation
- *
- * Copyright (C) 2005, VMware, Inc.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
- * NON INFRINGEMENT.  See the GNU General Public License for more
- * details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- *
- * Send feedback to zach@vmware.com
- *
- */
-
-#include <linux/module.h>
-#include <linux/cpu.h>
-#include <linux/bootmem.h>
-#include <linux/mm.h>
-#include <linux/highmem.h>
-#include <linux/sched.h>
-#include <asm/vmi.h>
-#include <asm/io.h>
-#include <asm/fixmap.h>
-#include <asm/apicdef.h>
-#include <asm/apic.h>
-#include <asm/processor.h>
-#include <asm/timer.h>
-#include <asm/vmi_time.h>
-#include <asm/kmap_types.h>
-
-/* Convenient for calling VMI functions indirectly in the ROM */
-typedef u32 __attribute__((regparm(1))) (VROMFUNC)(void);
-typedef u64 __attribute__((regparm(2))) (VROMLONGFUNC)(int);
-
-#define call_vrom_func(rom,func) \
-   (((VROMFUNC *)(rom->func))())
-
-#define call_vrom_long_func(rom,func,arg) \
-   (((VROMLONGFUNC *)(rom->func)) (arg))
-
-static struct vrom_header *vmi_rom;
-static int disable_pge;
-static int disable_pse;
-static int disable_sep;
-static int disable_tsc;
-static int disable_mtrr;
-static int disable_noidle;
-static int disable_vmi_timer;
-
-/* Cached VMI operations */
-static struct {
-       void (*cpuid)(void /* non-c */);
-       void (*_set_ldt)(u32 selector);
-       void (*set_tr)(u32 selector);
-       void (*set_kernel_stack)(u32 selector, u32 esp0);
-       void (*allocate_page)(u32, u32, u32, u32, u32);
-       void (*release_page)(u32, u32);
-       void (*set_pte)(pte_t, pte_t *, unsigned);
-       void (*update_pte)(pte_t *, unsigned);
-       void (*set_linear_mapping)(int, void *, u32, u32);
-       void (*_flush_tlb)(int);
-       void (*set_initial_ap_state)(int, int);
-       void (*halt)(void);
-       void (*set_lazy_mode)(int mode);
-} vmi_ops;
-
-/* Cached VMI operations */
-struct vmi_timer_ops vmi_timer_ops;
-
-/*
- * VMI patching routines.
- */
-#define MNEM_CALL 0xe8
-#define MNEM_JMP  0xe9
-#define MNEM_RET  0xc3
-
-#define IRQ_PATCH_INT_MASK 0
-#define IRQ_PATCH_DISABLE  5
-
-static inline void patch_offset(void *insnbuf,
-                               unsigned long eip, unsigned long dest)
-{
-        *(unsigned long *)(insnbuf+1) = dest-eip-5;
-}
-
-static unsigned patch_internal(int call, unsigned len, void *insnbuf,
-                              unsigned long eip)
-{
-       u64 reloc;
-       struct vmi_relocation_info *const rel = (struct vmi_relocation_info *)&reloc;
-       reloc = call_vrom_long_func(vmi_rom, get_reloc, call);
-       switch(rel->type) {
-               case VMI_RELOCATION_CALL_REL:
-                       BUG_ON(len < 5);
-                       *(char *)insnbuf = MNEM_CALL;
-                       patch_offset(insnbuf, eip, (unsigned long)rel->eip);
-                       return 5;
-
-               case VMI_RELOCATION_JUMP_REL:
-                       BUG_ON(len < 5);
-                       *(char *)insnbuf = MNEM_JMP;
-                       patch_offset(insnbuf, eip, (unsigned long)rel->eip);
-                       return 5;
-
-               case VMI_RELOCATION_NOP:
-                       /* obliterate the whole thing */
-                       return 0;
-
-               case VMI_RELOCATION_NONE:
-                       /* leave native code in place */
-                       break;
-
-               default:
-                       BUG();
-       }
-       return len;
-}
-
-/*
- * Apply patch if appropriate, return length of new instruction
- * sequence.  The callee does nop padding for us.
- */
-static unsigned vmi_patch(u8 type, u16 clobbers, void *insns,
-                         unsigned long eip, unsigned len)
-{
-       switch (type) {
-               case PARAVIRT_PATCH(irq_disable):
-                       return patch_internal(VMI_CALL_DisableInterrupts, len,
-                                             insns, eip);
-               case PARAVIRT_PATCH(irq_enable):
-                       return patch_internal(VMI_CALL_EnableInterrupts, len,
-                                             insns, eip);
-               case PARAVIRT_PATCH(restore_fl):
-                       return patch_internal(VMI_CALL_SetInterruptMask, len,
-                                             insns, eip);
-               case PARAVIRT_PATCH(save_fl):
-                       return patch_internal(VMI_CALL_GetInterruptMask, len,
-                                             insns, eip);
-               case PARAVIRT_PATCH(iret):
-                       return patch_internal(VMI_CALL_IRET, len, insns, eip);
-               case PARAVIRT_PATCH(irq_enable_sysexit):
-                       return patch_internal(VMI_CALL_SYSEXIT, len, insns, eip);
-               default:
-                       break;
-       }
-       return len;
-}
-
-/* CPUID has non-C semantics, and paravirt-ops API doesn't match hardware ISA */
-static void vmi_cpuid(unsigned int *eax, unsigned int *ebx,
-                               unsigned int *ecx, unsigned int *edx)
-{
-       int override = 0;
-       if (*eax == 1)
-               override = 1;
-        asm volatile ("call *%6"
-                      : "=a" (*eax),
-                        "=b" (*ebx),
-                        "=c" (*ecx),
-                        "=d" (*edx)
-                      : "0" (*eax), "2" (*ecx), "r" (vmi_ops.cpuid));
-       if (override) {
-               if (disable_pse)
-                       *edx &= ~X86_FEATURE_PSE;
-               if (disable_pge)
-                       *edx &= ~X86_FEATURE_PGE;
-               if (disable_sep)
-                       *edx &= ~X86_FEATURE_SEP;
-               if (disable_tsc)
-                       *edx &= ~X86_FEATURE_TSC;
-               if (disable_mtrr)
-                       *edx &= ~X86_FEATURE_MTRR;
-       }
-}
-
-static inline void vmi_maybe_load_tls(struct desc_struct *gdt, int nr, struct desc_struct *new)
-{
-       if (gdt[nr].a != new->a || gdt[nr].b != new->b)
-               write_gdt_entry(gdt, nr, new->a, new->b);
-}
-
-static void vmi_load_tls(struct thread_struct *t, unsigned int cpu)
-{
-       struct desc_struct *gdt = get_cpu_gdt_table(cpu);
-       vmi_maybe_load_tls(gdt, GDT_ENTRY_TLS_MIN + 0, &t->tls_array[0]);
-       vmi_maybe_load_tls(gdt, GDT_ENTRY_TLS_MIN + 1, &t->tls_array[1]);
-       vmi_maybe_load_tls(gdt, GDT_ENTRY_TLS_MIN + 2, &t->tls_array[2]);
-}
-
-static void vmi_set_ldt(const void *addr, unsigned entries)
-{
-       unsigned cpu = smp_processor_id();
-       u32 low, high;
-
-       pack_descriptor(&low, &high, (unsigned long)addr,
-                       entries * sizeof(struct desc_struct) - 1,
-                       DESCTYPE_LDT, 0);
-       write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_LDT, low, high);
-       vmi_ops._set_ldt(entries ? GDT_ENTRY_LDT*sizeof(struct desc_struct) : 0);
-}
-
-static void vmi_set_tr(void)
-{
-       vmi_ops.set_tr(GDT_ENTRY_TSS*sizeof(struct desc_struct));
-}
-
-static void vmi_load_esp0(struct tss_struct *tss,
-                                  struct thread_struct *thread)
-{
-       tss->x86_tss.esp0 = thread->esp0;
-
-       /* This can only happen when SEP is enabled, no need to test "SEP"arately */
-       if (unlikely(tss->x86_tss.ss1 != thread->sysenter_cs)) {
-               tss->x86_tss.ss1 = thread->sysenter_cs;
-               wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0);
-       }
-       vmi_ops.set_kernel_stack(__KERNEL_DS, tss->x86_tss.esp0);
-}
-
-static void vmi_flush_tlb_user(void)
-{
-       vmi_ops._flush_tlb(VMI_FLUSH_TLB);
-}
-
-static void vmi_flush_tlb_kernel(void)
-{
-       vmi_ops._flush_tlb(VMI_FLUSH_TLB | VMI_FLUSH_GLOBAL);
-}
-
-/* Stub to do nothing at all; used for delays and unimplemented calls */
-static void vmi_nop(void)
-{
-}
-
-#ifdef CONFIG_DEBUG_PAGE_TYPE
-
-#ifdef CONFIG_X86_PAE
-#define MAX_BOOT_PTS (2048+4+1)
-#else
-#define MAX_BOOT_PTS (1024+1)
-#endif
-
-/*
- * During boot, mem_map is not yet available in paging_init, so stash
- * all the boot page allocations here.
- */
-static struct {
-       u32 pfn;
-       int type;
-} boot_page_allocations[MAX_BOOT_PTS];
-static int num_boot_page_allocations;
-static int boot_allocations_applied;
-
-void vmi_apply_boot_page_allocations(void)
-{
-       int i;
-       BUG_ON(!mem_map);
-       for (i = 0; i < num_boot_page_allocations; i++) {
-               struct page *page = pfn_to_page(boot_page_allocations[i].pfn);
-               page->type = boot_page_allocations[i].type;
-               page->type = boot_page_allocations[i].type &
-                               ~(VMI_PAGE_ZEROED | VMI_PAGE_CLONE);
-       }
-       boot_allocations_applied = 1;
-}
-
-static void record_page_type(u32 pfn, int type)
-{
-       BUG_ON(num_boot_page_allocations >= MAX_BOOT_PTS);
-       boot_page_allocations[num_boot_page_allocations].pfn = pfn;
-       boot_page_allocations[num_boot_page_allocations].type = type;
-       num_boot_page_allocations++;
-}
-
-static void check_zeroed_page(u32 pfn, int type, struct page *page)
-{
-       u32 *ptr;
-       int i;
-       int limit = PAGE_SIZE / sizeof(int);
-
-       if (page_address(page))
-               ptr = (u32 *)page_address(page);
-       else
-               ptr = (u32 *)__va(pfn << PAGE_SHIFT);
-       /*
-        * When cloning the root in non-PAE mode, only the userspace
-        * pdes need to be zeroed.
-        */
-       if (type & VMI_PAGE_CLONE)
-               limit = USER_PTRS_PER_PGD;
-       for (i = 0; i < limit; i++)
-               BUG_ON(ptr[i]);
-}
-
-/*
- * We stash the page type into struct page so we can verify the page
- * types are used properly.
- */
-static void vmi_set_page_type(u32 pfn, int type)
-{
-       /* PAE can have multiple roots per page - don't track */
-       if (PTRS_PER_PMD > 1 && (type & VMI_PAGE_PDP))
-               return;
-
-       if (boot_allocations_applied) {
-               struct page *page = pfn_to_page(pfn);
-               if (type != VMI_PAGE_NORMAL)
-                       BUG_ON(page->type);
-               else
-                       BUG_ON(page->type == VMI_PAGE_NORMAL);
-               page->type = type & ~(VMI_PAGE_ZEROED | VMI_PAGE_CLONE);
-               if (type & VMI_PAGE_ZEROED)
-                       check_zeroed_page(pfn, type, page);
-       } else {
-               record_page_type(pfn, type);
-       }
-}
-
-static void vmi_check_page_type(u32 pfn, int type)
-{
-       /* PAE can have multiple roots per page - skip checks */
-       if (PTRS_PER_PMD > 1 && (type & VMI_PAGE_PDP))
-               return;
-
-       type &= ~(VMI_PAGE_ZEROED | VMI_PAGE_CLONE);
-       if (boot_allocations_applied) {
-               struct page *page = pfn_to_page(pfn);
-               BUG_ON((page->type ^ type) & VMI_PAGE_PAE);
-               BUG_ON(type == VMI_PAGE_NORMAL && page->type);
-               BUG_ON((type & page->type) == 0);
-       }
-}
-#else
-#define vmi_set_page_type(p,t) do { } while (0)
-#define vmi_check_page_type(p,t) do { } while (0)
-#endif
-
-#ifdef CONFIG_HIGHPTE
-static void *vmi_kmap_atomic_pte(struct page *page, enum km_type type)
-{
-       void *va = kmap_atomic(page, type);
-
-       /*
-        * Internally, the VMI ROM must map virtual addresses to physical
-        * addresses for processing MMU updates.  By the time MMU updates
-        * are issued, this information is typically already lost.
-        * Fortunately, the VMI provides a cache of mapping slots for active
-        * page tables.
-        *
-        * We use slot zero for the linear mapping of physical memory, and
-        * in HIGHPTE kernels, slot 1 and 2 for KM_PTE0 and KM_PTE1.
-        *
-        *  args:                 SLOT                 VA    COUNT PFN
-        */
-       BUG_ON(type != KM_PTE0 && type != KM_PTE1);
-       vmi_ops.set_linear_mapping((type - KM_PTE0)+1, va, 1, page_to_pfn(page));
-
-       return va;
-}
-#endif
-
-static void vmi_allocate_pt(struct mm_struct *mm, u32 pfn)
-{
-       vmi_set_page_type(pfn, VMI_PAGE_L1);
-       vmi_ops.allocate_page(pfn, VMI_PAGE_L1, 0, 0, 0);
-}
-
-static void vmi_allocate_pd(u32 pfn)
-{
-       /*
-        * This call comes in very early, before mem_map is setup.
-        * It is called only for swapper_pg_dir, which already has
-        * data on it.
-        */
-       vmi_set_page_type(pfn, VMI_PAGE_L2);
-       vmi_ops.allocate_page(pfn, VMI_PAGE_L2, 0, 0, 0);
-}
-
-static void vmi_allocate_pd_clone(u32 pfn, u32 clonepfn, u32 start, u32 count)
-{
-       vmi_set_page_type(pfn, VMI_PAGE_L2 | VMI_PAGE_CLONE);
-       vmi_check_page_type(clonepfn, VMI_PAGE_L2);
-       vmi_ops.allocate_page(pfn, VMI_PAGE_L2 | VMI_PAGE_CLONE, clonepfn, start, count);
-}
-
-static void vmi_release_pt(u32 pfn)
-{
-       vmi_ops.release_page(pfn, VMI_PAGE_L1);
-       vmi_set_page_type(pfn, VMI_PAGE_NORMAL);
-}
-
-static void vmi_release_pd(u32 pfn)
-{
-       vmi_ops.release_page(pfn, VMI_PAGE_L2);
-       vmi_set_page_type(pfn, VMI_PAGE_NORMAL);
-}
-
-/*
- * Helper macros for MMU update flags.  We can defer updates until a flush
- * or page invalidation only if the update is to the current address space
- * (otherwise, there is no flush).  We must check against init_mm, since
- * this could be a kernel update, which usually passes init_mm, although
- * sometimes this check can be skipped if we know the particular function
- * is only called on user mode PTEs.  We could change the kernel to pass
- * current->active_mm here, but in particular, I was unsure if changing
- * mm/highmem.c to do this would still be correct on other architectures.
- */
-#define is_current_as(mm, mustbeuser) ((mm) == current->active_mm ||    \
-                                       (!mustbeuser && (mm) == &init_mm))
-#define vmi_flags_addr(mm, addr, level, user)                           \
-        ((level) | (is_current_as(mm, user) ?                           \
-                (VMI_PAGE_CURRENT_AS | ((addr) & VMI_PAGE_VA_MASK)) : 0))
-#define vmi_flags_addr_defer(mm, addr, level, user)                     \
-        ((level) | (is_current_as(mm, user) ?                           \
-                (VMI_PAGE_DEFER | VMI_PAGE_CURRENT_AS | ((addr) & VMI_PAGE_VA_MASK)) : 0))
-
-static void vmi_update_pte(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
-{
-       vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE);
-       vmi_ops.update_pte(ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0));
-}
-
-static void vmi_update_pte_defer(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
-{
-       vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE);
-       vmi_ops.update_pte(ptep, vmi_flags_addr_defer(mm, addr, VMI_PAGE_PT, 0));
-}
-
-static void vmi_set_pte(pte_t *ptep, pte_t pte)
-{
-       /* XXX because of set_pmd_pte, this can be called on PT or PD layers */
-       vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE | VMI_PAGE_PD);
-       vmi_ops.set_pte(pte, ptep, VMI_PAGE_PT);
-}
-
-static void vmi_set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte)
-{
-       vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE);
-       vmi_ops.set_pte(pte, ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0));
-}
-
-static void vmi_set_pmd(pmd_t *pmdp, pmd_t pmdval)
-{
-#ifdef CONFIG_X86_PAE
-       const pte_t pte = { pmdval.pmd, pmdval.pmd >> 32 };
-       vmi_check_page_type(__pa(pmdp) >> PAGE_SHIFT, VMI_PAGE_PMD);
-#else
-       const pte_t pte = { pmdval.pud.pgd.pgd };
-       vmi_check_page_type(__pa(pmdp) >> PAGE_SHIFT, VMI_PAGE_PGD);
-#endif
-       vmi_ops.set_pte(pte, (pte_t *)pmdp, VMI_PAGE_PD);
-}
-
-#ifdef CONFIG_X86_PAE
-
-static void vmi_set_pte_atomic(pte_t *ptep, pte_t pteval)
-{
-       /*
-        * XXX This is called from set_pmd_pte, but at both PT
-        * and PD layers so the VMI_PAGE_PT flag is wrong.  But
-        * it is only called for large page mapping changes,
-        * the Xen backend, doesn't support large pages, and the
-        * ESX backend doesn't depend on the flag.
-        */
-       set_64bit((unsigned long long *)ptep,pte_val(pteval));
-       vmi_ops.update_pte(ptep, VMI_PAGE_PT);
-}
-
-static void vmi_set_pte_present(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte)
-{
-       vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE);
-       vmi_ops.set_pte(pte, ptep, vmi_flags_addr_defer(mm, addr, VMI_PAGE_PT, 1));
-}
-
-static void vmi_set_pud(pud_t *pudp, pud_t pudval)
-{
-       /* Um, eww */
-       const pte_t pte = { pudval.pgd.pgd, pudval.pgd.pgd >> 32 };
-       vmi_check_page_type(__pa(pudp) >> PAGE_SHIFT, VMI_PAGE_PGD);
-       vmi_ops.set_pte(pte, (pte_t *)pudp, VMI_PAGE_PDP);
-}
-
-static void vmi_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
-{
-       const pte_t pte = { 0 };
-       vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE);
-       vmi_ops.set_pte(pte, ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0));
-}
-
-static void vmi_pmd_clear(pmd_t *pmd)
-{
-       const pte_t pte = { 0 };
-       vmi_check_page_type(__pa(pmd) >> PAGE_SHIFT, VMI_PAGE_PMD);
-       vmi_ops.set_pte(pte, (pte_t *)pmd, VMI_PAGE_PD);
-}
-#endif
-
-#ifdef CONFIG_SMP
-static void __devinit
-vmi_startup_ipi_hook(int phys_apicid, unsigned long start_eip,
-                    unsigned long start_esp)
-{
-       struct vmi_ap_state ap;
-
-       /* Default everything to zero.  This is fine for most GPRs. */
-       memset(&ap, 0, sizeof(struct vmi_ap_state));
-
-       ap.gdtr_limit = GDT_SIZE - 1;
-       ap.gdtr_base = (unsigned long) get_cpu_gdt_table(phys_apicid);
-
-       ap.idtr_limit = IDT_ENTRIES * 8 - 1;
-       ap.idtr_base = (unsigned long) idt_table;
-
-       ap.ldtr = 0;
-
-       ap.cs = __KERNEL_CS;
-       ap.eip = (unsigned long) start_eip;
-       ap.ss = __KERNEL_DS;
-       ap.esp = (unsigned long) start_esp;
-
-       ap.ds = __USER_DS;
-       ap.es = __USER_DS;
-       ap.fs = __KERNEL_PERCPU;
-       ap.gs = 0;
-
-       ap.eflags = 0;
-
-#ifdef CONFIG_X86_PAE
-       /* efer should match BSP efer. */
-       if (cpu_has_nx) {
-               unsigned l, h;
-               rdmsr(MSR_EFER, l, h);
-               ap.efer = (unsigned long long) h << 32 | l;
-       }
-#endif
-
-       ap.cr3 = __pa(swapper_pg_dir);
-       /* Protected mode, paging, AM, WP, NE, MP. */
-       ap.cr0 = 0x80050023;
-       ap.cr4 = mmu_cr4_features;
-       vmi_ops.set_initial_ap_state((u32)&ap, phys_apicid);
-}
-#endif
-
-static void vmi_set_lazy_mode(enum paravirt_lazy_mode mode)
-{
-       static DEFINE_PER_CPU(enum paravirt_lazy_mode, lazy_mode);
-
-       if (!vmi_ops.set_lazy_mode)
-               return;
-
-       /* Modes should never nest or overlap */
-       BUG_ON(__get_cpu_var(lazy_mode) && !(mode == PARAVIRT_LAZY_NONE ||
-                                            mode == PARAVIRT_LAZY_FLUSH));
-
-       if (mode == PARAVIRT_LAZY_FLUSH) {
-               vmi_ops.set_lazy_mode(0);
-               vmi_ops.set_lazy_mode(__get_cpu_var(lazy_mode));
-       } else {
-               vmi_ops.set_lazy_mode(mode);
-               __get_cpu_var(lazy_mode) = mode;
-       }
-}
-
-static inline int __init check_vmi_rom(struct vrom_header *rom)
-{
-       struct pci_header *pci;
-       struct pnp_header *pnp;
-       const char *manufacturer = "UNKNOWN";
-       const char *product = "UNKNOWN";
-       const char *license = "unspecified";
-
-       if (rom->rom_signature != 0xaa55)
-               return 0;
-       if (rom->vrom_signature != VMI_SIGNATURE)
-               return 0;
-       if (rom->api_version_maj != VMI_API_REV_MAJOR ||
-           rom->api_version_min+1 < VMI_API_REV_MINOR+1) {
-               printk(KERN_WARNING "VMI: Found mismatched rom version %d.%d\n",
-                               rom->api_version_maj,
-                               rom->api_version_min);
-               return 0;
-       }
-
-       /*
-        * Relying on the VMI_SIGNATURE field is not 100% safe, so check
-        * the PCI header and device type to make sure this is really a
-        * VMI device.
-        */
-       if (!rom->pci_header_offs) {
-               printk(KERN_WARNING "VMI: ROM does not contain PCI header.\n");
-               return 0;
-       }
-
-       pci = (struct pci_header *)((char *)rom+rom->pci_header_offs);
-       if (pci->vendorID != PCI_VENDOR_ID_VMWARE ||
-           pci->deviceID != PCI_DEVICE_ID_VMWARE_VMI) {
-               /* Allow it to run... anyways, but warn */
-               printk(KERN_WARNING "VMI: ROM from unknown manufacturer\n");
-       }
-
-       if (rom->pnp_header_offs) {
-               pnp = (struct pnp_header *)((char *)rom+rom->pnp_header_offs);
-               if (pnp->manufacturer_offset)
-                       manufacturer = (const char *)rom+pnp->manufacturer_offset;
-               if (pnp->product_offset)
-                       product = (const char *)rom+pnp->product_offset;
-       }
-
-       if (rom->license_offs)
-               license = (char *)rom+rom->license_offs;
-
-       printk(KERN_INFO "VMI: Found %s %s, API version %d.%d, ROM version %d.%d\n",
-               manufacturer, product,
-               rom->api_version_maj, rom->api_version_min,
-               pci->rom_version_maj, pci->rom_version_min);
-
-       /* Don't allow BSD/MIT here for now because we don't want to end up
-          with any binary only shim layers */
-       if (strcmp(license, "GPL") && strcmp(license, "GPL v2")) {
-               printk(KERN_WARNING "VMI: Non GPL license `%s' found for ROM. Not used.\n",
-                       license);
-               return 0;
-       }
-
-       return 1;
-}
-
-/*
- * Probe for the VMI option ROM
- */
-static inline int __init probe_vmi_rom(void)
-{
-       unsigned long base;
-
-       /* VMI ROM is in option ROM area, check signature */
-       for (base = 0xC0000; base < 0xE0000; base += 2048) {
-               struct vrom_header *romstart;
-               romstart = (struct vrom_header *)isa_bus_to_virt(base);
-               if (check_vmi_rom(romstart)) {
-                       vmi_rom = romstart;
-                       return 1;
-               }
-       }
-       return 0;
-}
-
-/*
- * VMI setup common to all processors
- */
-void vmi_bringup(void)
-{
-       /* We must establish the lowmem mapping for MMU ops to work */
-       if (vmi_ops.set_linear_mapping)
-               vmi_ops.set_linear_mapping(0, (void *)__PAGE_OFFSET, max_low_pfn, 0);
-}
-
-/*
- * Return a pointer to a VMI function or NULL if unimplemented
- */
-static void *vmi_get_function(int vmicall)
-{
-       u64 reloc;
-       const struct vmi_relocation_info *rel = (struct vmi_relocation_info *)&reloc;
-       reloc = call_vrom_long_func(vmi_rom, get_reloc, vmicall);
-       BUG_ON(rel->type == VMI_RELOCATION_JUMP_REL);
-       if (rel->type == VMI_RELOCATION_CALL_REL)
-               return (void *)rel->eip;
-       else
-               return NULL;
-}
-
-/*
- * Helper macro for making the VMI paravirt-ops fill code readable.
- * For unimplemented operations, fall back to default, unless nop
- * is returned by the ROM.
- */
-#define para_fill(opname, vmicall)                             \
-do {                                                           \
-       reloc = call_vrom_long_func(vmi_rom, get_reloc,         \
-                                   VMI_CALL_##vmicall);        \
-       if (rel->type == VMI_RELOCATION_CALL_REL)               \
-               paravirt_ops.opname = (void *)rel->eip;         \
-       else if (rel->type == VMI_RELOCATION_NOP)               \
-               paravirt_ops.opname = (void *)vmi_nop;          \
-       else if (rel->type != VMI_RELOCATION_NONE)              \
-               printk(KERN_WARNING "VMI: Unknown relocation "  \
-                                   "type %d for " #vmicall"\n",\
-                                       rel->type);             \
-} while (0)
-
-/*
- * Helper macro for making the VMI paravirt-ops fill code readable.
- * For cached operations which do not match the VMI ROM ABI and must
- * go through a tranlation stub.  Ignore NOPs, since it is not clear
- * a NOP * VMI function corresponds to a NOP paravirt-op when the
- * functions are not in 1-1 correspondence.
- */
-#define para_wrap(opname, wrapper, cache, vmicall)             \
-do {                                                           \
-       reloc = call_vrom_long_func(vmi_rom, get_reloc,         \
-                                   VMI_CALL_##vmicall);        \
-       BUG_ON(rel->type == VMI_RELOCATION_JUMP_REL);           \
-       if (rel->type == VMI_RELOCATION_CALL_REL) {             \
-               paravirt_ops.opname = wrapper;                  \
-               vmi_ops.cache = (void *)rel->eip;               \
-       }                                                       \
-} while (0)
-
-/*
- * Activate the VMI interface and switch into paravirtualized mode
- */
-static inline int __init activate_vmi(void)
-{
-       short kernel_cs;
-       u64 reloc;
-       const struct vmi_relocation_info *rel = (struct vmi_relocation_info *)&reloc;
-
-       if (call_vrom_func(vmi_rom, vmi_init) != 0) {
-               printk(KERN_ERR "VMI ROM failed to initialize!");
-               return 0;
-       }
-       savesegment(cs, kernel_cs);
-
-       paravirt_ops.paravirt_enabled = 1;
-       paravirt_ops.kernel_rpl = kernel_cs & SEGMENT_RPL_MASK;
-
-       paravirt_ops.patch = vmi_patch;
-       paravirt_ops.name = "vmi";
-
-       /*
-        * Many of these operations are ABI compatible with VMI.
-        * This means we can fill in the paravirt-ops with direct
-        * pointers into the VMI ROM.  If the calling convention for
-        * these operations changes, this code needs to be updated.
-        *
-        * Exceptions
-        *  CPUID paravirt-op uses pointers, not the native ISA
-        *  halt has no VMI equivalent; all VMI halts are "safe"
-        *  no MSR support yet - just trap and emulate.  VMI uses the
-        *    same ABI as the native ISA, but Linux wants exceptions
-        *    from bogus MSR read / write handled
-        *  rdpmc is not yet used in Linux
-        */
-
-       /* CPUID is special, so very special it gets wrapped like a present */
-       para_wrap(cpuid, vmi_cpuid, cpuid, CPUID);
-
-       para_fill(clts, CLTS);
-       para_fill(get_debugreg, GetDR);
-       para_fill(set_debugreg, SetDR);
-       para_fill(read_cr0, GetCR0);
-       para_fill(read_cr2, GetCR2);
-       para_fill(read_cr3, GetCR3);
-       para_fill(read_cr4, GetCR4);
-       para_fill(write_cr0, SetCR0);
-       para_fill(write_cr2, SetCR2);
-       para_fill(write_cr3, SetCR3);
-       para_fill(write_cr4, SetCR4);
-       para_fill(save_fl, GetInterruptMask);
-       para_fill(restore_fl, SetInterruptMask);
-       para_fill(irq_disable, DisableInterrupts);
-       para_fill(irq_enable, EnableInterrupts);
-
-       para_fill(wbinvd, WBINVD);
-       para_fill(read_tsc, RDTSC);
-
-       /* The following we emulate with trap and emulate for now */
-       /* paravirt_ops.read_msr = vmi_rdmsr */
-       /* paravirt_ops.write_msr = vmi_wrmsr */
-       /* paravirt_ops.rdpmc = vmi_rdpmc */
-
-       /* TR interface doesn't pass TR value, wrap */
-       para_wrap(load_tr_desc, vmi_set_tr, set_tr, SetTR);
-
-       /* LDT is special, too */
-       para_wrap(set_ldt, vmi_set_ldt, _set_ldt, SetLDT);
-
-       para_fill(load_gdt, SetGDT);
-       para_fill(load_idt, SetIDT);
-       para_fill(store_gdt, GetGDT);
-       para_fill(store_idt, GetIDT);
-       para_fill(store_tr, GetTR);
-       paravirt_ops.load_tls = vmi_load_tls;
-       para_fill(write_ldt_entry, WriteLDTEntry);
-       para_fill(write_gdt_entry, WriteGDTEntry);
-       para_fill(write_idt_entry, WriteIDTEntry);
-       para_wrap(load_esp0, vmi_load_esp0, set_kernel_stack, UpdateKernelStack);
-       para_fill(set_iopl_mask, SetIOPLMask);
-       para_fill(io_delay, IODelay);
-       para_wrap(set_lazy_mode, vmi_set_lazy_mode, set_lazy_mode, SetLazyMode);
-
-       /* user and kernel flush are just handled with different flags to FlushTLB */
-       para_wrap(flush_tlb_user, vmi_flush_tlb_user, _flush_tlb, FlushTLB);
-       para_wrap(flush_tlb_kernel, vmi_flush_tlb_kernel, _flush_tlb, FlushTLB);
-       para_fill(flush_tlb_single, InvalPage);
-
-       /*
-        * Until a standard flag format can be agreed on, we need to
-        * implement these as wrappers in Linux.  Get the VMI ROM
-        * function pointers for the two backend calls.
-        */
-#ifdef CONFIG_X86_PAE
-       vmi_ops.set_pte = vmi_get_function(VMI_CALL_SetPxELong);
-       vmi_ops.update_pte = vmi_get_function(VMI_CALL_UpdatePxELong);
-#else
-       vmi_ops.set_pte = vmi_get_function(VMI_CALL_SetPxE);
-       vmi_ops.update_pte = vmi_get_function(VMI_CALL_UpdatePxE);
-#endif
-
-       if (vmi_ops.set_pte) {
-               paravirt_ops.set_pte = vmi_set_pte;
-               paravirt_ops.set_pte_at = vmi_set_pte_at;
-               paravirt_ops.set_pmd = vmi_set_pmd;
-#ifdef CONFIG_X86_PAE
-               paravirt_ops.set_pte_atomic = vmi_set_pte_atomic;
-               paravirt_ops.set_pte_present = vmi_set_pte_present;
-               paravirt_ops.set_pud = vmi_set_pud;
-               paravirt_ops.pte_clear = vmi_pte_clear;
-               paravirt_ops.pmd_clear = vmi_pmd_clear;
-#endif
-       }
-
-       if (vmi_ops.update_pte) {
-               paravirt_ops.pte_update = vmi_update_pte;
-               paravirt_ops.pte_update_defer = vmi_update_pte_defer;
-       }
-
-       vmi_ops.allocate_page = vmi_get_function(VMI_CALL_AllocatePage);
-       if (vmi_ops.allocate_page) {
-               paravirt_ops.alloc_pt = vmi_allocate_pt;
-               paravirt_ops.alloc_pd = vmi_allocate_pd;
-               paravirt_ops.alloc_pd_clone = vmi_allocate_pd_clone;
-       }
-
-       vmi_ops.release_page = vmi_get_function(VMI_CALL_ReleasePage);
-       if (vmi_ops.release_page) {
-               paravirt_ops.release_pt = vmi_release_pt;
-               paravirt_ops.release_pd = vmi_release_pd;
-       }
-
-       /* Set linear is needed in all cases */
-       vmi_ops.set_linear_mapping = vmi_get_function(VMI_CALL_SetLinearMapping);
-#ifdef CONFIG_HIGHPTE
-       if (vmi_ops.set_linear_mapping)
-               paravirt_ops.kmap_atomic_pte = vmi_kmap_atomic_pte;
-#endif
-
-       /*
-        * These MUST always be patched.  Don't support indirect jumps
-        * through these operations, as the VMI interface may use either
-        * a jump or a call to get to these operations, depending on
-        * the backend.  They are performance critical anyway, so requiring
-        * a patch is not a big problem.
-        */
-       paravirt_ops.irq_enable_sysexit = (void *)0xfeedbab0;
-       paravirt_ops.iret = (void *)0xbadbab0;
-
-#ifdef CONFIG_SMP
-       para_wrap(startup_ipi_hook, vmi_startup_ipi_hook, set_initial_ap_state, SetInitialAPState);
-#endif
-
-#ifdef CONFIG_X86_LOCAL_APIC
-       para_fill(apic_read, APICRead);
-       para_fill(apic_write, APICWrite);
-       para_fill(apic_write_atomic, APICWrite);
-#endif
-
-       /*
-        * Check for VMI timer functionality by probing for a cycle frequency method
-        */
-       reloc = call_vrom_long_func(vmi_rom, get_reloc, VMI_CALL_GetCycleFrequency);
-       if (!disable_vmi_timer && rel->type != VMI_RELOCATION_NONE) {
-               vmi_timer_ops.get_cycle_frequency = (void *)rel->eip;
-               vmi_timer_ops.get_cycle_counter =
-                       vmi_get_function(VMI_CALL_GetCycleCounter);
-               vmi_timer_ops.get_wallclock =
-                       vmi_get_function(VMI_CALL_GetWallclockTime);
-               vmi_timer_ops.wallclock_updated =
-                       vmi_get_function(VMI_CALL_WallclockUpdated);
-               vmi_timer_ops.set_alarm = vmi_get_function(VMI_CALL_SetAlarm);
-               vmi_timer_ops.cancel_alarm =
-                        vmi_get_function(VMI_CALL_CancelAlarm);
-               paravirt_ops.time_init = vmi_time_init;
-               paravirt_ops.get_wallclock = vmi_get_wallclock;
-               paravirt_ops.set_wallclock = vmi_set_wallclock;
-#ifdef CONFIG_X86_LOCAL_APIC
-               paravirt_ops.setup_boot_clock = vmi_time_bsp_init;
-               paravirt_ops.setup_secondary_clock = vmi_time_ap_init;
-#endif
-               paravirt_ops.sched_clock = vmi_sched_clock;
-               paravirt_ops.get_cpu_khz = vmi_cpu_khz;
-
-               /* We have true wallclock functions; disable CMOS clock sync */
-               no_sync_cmos_clock = 1;
-       } else {
-               disable_noidle = 1;
-               disable_vmi_timer = 1;
-       }
-
-       para_fill(safe_halt, Halt);
-
-       /*
-        * Alternative instruction rewriting doesn't happen soon enough
-        * to convert VMI_IRET to a call instead of a jump; so we have
-        * to do this before IRQs get reenabled.  Fortunately, it is
-        * idempotent.
-        */
-       apply_paravirt(__parainstructions, __parainstructions_end);
-
-       vmi_bringup();
-
-       return 1;
-}
-
-#undef para_fill
-
-void __init vmi_init(void)
-{
-       unsigned long flags;
-
-       if (!vmi_rom)
-               probe_vmi_rom();
-       else
-               check_vmi_rom(vmi_rom);
-
-       /* In case probing for or validating the ROM failed, basil */
-       if (!vmi_rom)
-               return;
-
-       reserve_top_address(-vmi_rom->virtual_top);
-
-       local_irq_save(flags);
-       activate_vmi();
-
-#ifdef CONFIG_X86_IO_APIC
-       /* This is virtual hardware; timer routing is wired correctly */
-       no_timer_check = 1;
-#endif
-       local_irq_restore(flags & X86_EFLAGS_IF);
-}
-
-static int __init parse_vmi(char *arg)
-{
-       if (!arg)
-               return -EINVAL;
-
-       if (!strcmp(arg, "disable_pge")) {
-               clear_bit(X86_FEATURE_PGE, boot_cpu_data.x86_capability);
-               disable_pge = 1;
-       } else if (!strcmp(arg, "disable_pse")) {
-               clear_bit(X86_FEATURE_PSE, boot_cpu_data.x86_capability);
-               disable_pse = 1;
-       } else if (!strcmp(arg, "disable_sep")) {
-               clear_bit(X86_FEATURE_SEP, boot_cpu_data.x86_capability);
-               disable_sep = 1;
-       } else if (!strcmp(arg, "disable_tsc")) {
-               clear_bit(X86_FEATURE_TSC, boot_cpu_data.x86_capability);
-               disable_tsc = 1;
-       } else if (!strcmp(arg, "disable_mtrr")) {
-               clear_bit(X86_FEATURE_MTRR, boot_cpu_data.x86_capability);
-               disable_mtrr = 1;
-       } else if (!strcmp(arg, "disable_timer")) {
-               disable_vmi_timer = 1;
-               disable_noidle = 1;
-       } else if (!strcmp(arg, "disable_noidle"))
-               disable_noidle = 1;
-       return 0;
-}
-
-early_param("vmi", parse_vmi);
diff --git a/arch/i386/kernel/vmiclock_32.c b/arch/i386/kernel/vmiclock_32.c
deleted file mode 100644 (file)
index b1b5ab0..0000000
+++ /dev/null
@@ -1,320 +0,0 @@
-/*
- * VMI paravirtual timer support routines.
- *
- * Copyright (C) 2007, VMware, Inc.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
- * NON INFRINGEMENT.  See the GNU General Public License for more
- * details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- *
- */
-
-#include <linux/smp.h>
-#include <linux/interrupt.h>
-#include <linux/cpumask.h>
-#include <linux/clocksource.h>
-#include <linux/clockchips.h>
-
-#include <asm/vmi.h>
-#include <asm/vmi_time.h>
-#include <asm/arch_hooks.h>
-#include <asm/apicdef.h>
-#include <asm/apic.h>
-#include <asm/timer.h>
-#include <asm/i8253.h>
-
-#include <irq_vectors.h>
-#include "io_ports.h"
-
-#define VMI_ONESHOT  (VMI_ALARM_IS_ONESHOT  | VMI_CYCLES_REAL | vmi_get_alarm_wiring())
-#define VMI_PERIODIC (VMI_ALARM_IS_PERIODIC | VMI_CYCLES_REAL | vmi_get_alarm_wiring())
-
-static DEFINE_PER_CPU(struct clock_event_device, local_events);
-
-static inline u32 vmi_counter(u32 flags)
-{
-       /* Given VMI_ONESHOT or VMI_PERIODIC, return the corresponding
-        * cycle counter. */
-       return flags & VMI_ALARM_COUNTER_MASK;
-}
-
-/* paravirt_ops.get_wallclock = vmi_get_wallclock */
-unsigned long vmi_get_wallclock(void)
-{
-       unsigned long long wallclock;
-       wallclock = vmi_timer_ops.get_wallclock(); // nsec
-       (void)do_div(wallclock, 1000000000);       // sec
-
-       return wallclock;
-}
-
-/* paravirt_ops.set_wallclock = vmi_set_wallclock */
-int vmi_set_wallclock(unsigned long now)
-{
-       return 0;
-}
-
-/* paravirt_ops.sched_clock = vmi_sched_clock */
-unsigned long long vmi_sched_clock(void)
-{
-       return cycles_2_ns(vmi_timer_ops.get_cycle_counter(VMI_CYCLES_AVAILABLE));
-}
-
-/* paravirt_ops.get_cpu_khz = vmi_cpu_khz */
-unsigned long vmi_cpu_khz(void)
-{
-       unsigned long long khz;
-       khz = vmi_timer_ops.get_cycle_frequency();
-       (void)do_div(khz, 1000);
-       return khz;
-}
-
-static inline unsigned int vmi_get_timer_vector(void)
-{
-#ifdef CONFIG_X86_IO_APIC
-       return FIRST_DEVICE_VECTOR;
-#else
-       return FIRST_EXTERNAL_VECTOR;
-#endif
-}
-
-/** vmi clockchip */
-#ifdef CONFIG_X86_LOCAL_APIC
-static unsigned int startup_timer_irq(unsigned int irq)
-{
-       unsigned long val = apic_read(APIC_LVTT);
-       apic_write(APIC_LVTT, vmi_get_timer_vector());
-
-       return (val & APIC_SEND_PENDING);
-}
-
-static void mask_timer_irq(unsigned int irq)
-{
-       unsigned long val = apic_read(APIC_LVTT);
-       apic_write(APIC_LVTT, val | APIC_LVT_MASKED);
-}
-
-static void unmask_timer_irq(unsigned int irq)
-{
-       unsigned long val = apic_read(APIC_LVTT);
-       apic_write(APIC_LVTT, val & ~APIC_LVT_MASKED);
-}
-
-static void ack_timer_irq(unsigned int irq)
-{
-       ack_APIC_irq();
-}
-
-static struct irq_chip vmi_chip __read_mostly = {
-       .name           = "VMI-LOCAL",
-       .startup        = startup_timer_irq,
-       .mask           = mask_timer_irq,
-       .unmask         = unmask_timer_irq,
-       .ack            = ack_timer_irq
-};
-#endif
-
-/** vmi clockevent */
-#define VMI_ALARM_WIRED_IRQ0    0x00000000
-#define VMI_ALARM_WIRED_LVTT    0x00010000
-static int vmi_wiring = VMI_ALARM_WIRED_IRQ0;
-
-static inline int vmi_get_alarm_wiring(void)
-{
-       return vmi_wiring;
-}
-
-static void vmi_timer_set_mode(enum clock_event_mode mode,
-                              struct clock_event_device *evt)
-{
-       cycle_t now, cycles_per_hz;
-       BUG_ON(!irqs_disabled());
-
-       switch (mode) {
-       case CLOCK_EVT_MODE_ONESHOT:
-       case CLOCK_EVT_MODE_RESUME:
-               break;
-       case CLOCK_EVT_MODE_PERIODIC:
-               cycles_per_hz = vmi_timer_ops.get_cycle_frequency();
-               (void)do_div(cycles_per_hz, HZ);
-               now = vmi_timer_ops.get_cycle_counter(vmi_counter(VMI_PERIODIC));
-               vmi_timer_ops.set_alarm(VMI_PERIODIC, now, cycles_per_hz);
-               break;
-       case CLOCK_EVT_MODE_UNUSED:
-       case CLOCK_EVT_MODE_SHUTDOWN:
-               switch (evt->mode) {
-               case CLOCK_EVT_MODE_ONESHOT:
-                       vmi_timer_ops.cancel_alarm(VMI_ONESHOT);
-                       break;
-               case CLOCK_EVT_MODE_PERIODIC:
-                       vmi_timer_ops.cancel_alarm(VMI_PERIODIC);
-                       break;
-               default:
-                       break;
-               }
-               break;
-       default:
-               break;
-       }
-}
-
-static int vmi_timer_next_event(unsigned long delta,
-                               struct clock_event_device *evt)
-{
-       /* Unfortunately, set_next_event interface only passes relative
-        * expiry, but we want absolute expiry.  It'd be better if were
-        * were passed an aboslute expiry, since a bunch of time may
-        * have been stolen between the time the delta is computed and
-        * when we set the alarm below. */
-       cycle_t now = vmi_timer_ops.get_cycle_counter(vmi_counter(VMI_ONESHOT));
-
-       BUG_ON(evt->mode != CLOCK_EVT_MODE_ONESHOT);
-       vmi_timer_ops.set_alarm(VMI_ONESHOT, now + delta, 0);
-       return 0;
-}
-
-static struct clock_event_device vmi_clockevent = {
-       .name           = "vmi-timer",
-       .features       = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
-       .shift          = 22,
-       .set_mode       = vmi_timer_set_mode,
-       .set_next_event = vmi_timer_next_event,
-       .rating         = 1000,
-       .irq            = 0,
-};
-
-static irqreturn_t vmi_timer_interrupt(int irq, void *dev_id)
-{
-       struct clock_event_device *evt = &__get_cpu_var(local_events);
-       evt->event_handler(evt);
-       return IRQ_HANDLED;
-}
-
-static struct irqaction vmi_clock_action  = {
-       .name           = "vmi-timer",
-       .handler        = vmi_timer_interrupt,
-       .flags          = IRQF_DISABLED | IRQF_NOBALANCING,
-       .mask           = CPU_MASK_ALL,
-};
-
-static void __devinit vmi_time_init_clockevent(void)
-{
-       cycle_t cycles_per_msec;
-       struct clock_event_device *evt;
-
-       int cpu = smp_processor_id();
-       evt = &__get_cpu_var(local_events);
-
-       /* Use cycles_per_msec since div_sc params are 32-bits. */
-       cycles_per_msec = vmi_timer_ops.get_cycle_frequency();
-       (void)do_div(cycles_per_msec, 1000);
-
-       memcpy(evt, &vmi_clockevent, sizeof(*evt));
-       /* Must pick .shift such that .mult fits in 32-bits.  Choosing
-        * .shift to be 22 allows 2^(32-22) cycles per nano-seconds
-        * before overflow. */
-       evt->mult = div_sc(cycles_per_msec, NSEC_PER_MSEC, evt->shift);
-       /* Upper bound is clockevent's use of ulong for cycle deltas. */
-       evt->max_delta_ns = clockevent_delta2ns(ULONG_MAX, evt);
-       evt->min_delta_ns = clockevent_delta2ns(1, evt);
-       evt->cpumask = cpumask_of_cpu(cpu);
-
-       printk(KERN_WARNING "vmi: registering clock event %s. mult=%lu shift=%u\n",
-              evt->name, evt->mult, evt->shift);
-       clockevents_register_device(evt);
-}
-
-void __init vmi_time_init(void)
-{
-       /* Disable PIT: BIOSes start PIT CH0 with 18.2hz peridic. */
-       outb_p(0x3a, PIT_MODE); /* binary, mode 5, LSB/MSB, ch 0 */
-
-       vmi_time_init_clockevent();
-       setup_irq(0, &vmi_clock_action);
-}
-
-#ifdef CONFIG_X86_LOCAL_APIC
-void __devinit vmi_time_bsp_init(void)
-{
-       /*
-        * On APIC systems, we want local timers to fire on each cpu.  We do
-        * this by programming LVTT to deliver timer events to the IRQ handler
-        * for IRQ-0, since we can't re-use the APIC local timer handler
-        * without interfering with that code.
-        */
-       clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL);
-       local_irq_disable();
-#ifdef CONFIG_X86_SMP
-       /*
-        * XXX handle_percpu_irq only defined for SMP; we need to switch over
-        * to using it, since this is a local interrupt, which each CPU must
-        * handle individually without locking out or dropping simultaneous
-        * local timers on other CPUs.  We also don't want to trigger the
-        * quirk workaround code for interrupts which gets invoked from
-        * handle_percpu_irq via eoi, so we use our own IRQ chip.
-        */
-       set_irq_chip_and_handler_name(0, &vmi_chip, handle_percpu_irq, "lvtt");
-#else
-       set_irq_chip_and_handler_name(0, &vmi_chip, handle_edge_irq, "lvtt");
-#endif
-       vmi_wiring = VMI_ALARM_WIRED_LVTT;
-       apic_write(APIC_LVTT, vmi_get_timer_vector());
-       local_irq_enable();
-       clockevents_notify(CLOCK_EVT_NOTIFY_RESUME, NULL);
-}
-
-void __devinit vmi_time_ap_init(void)
-{
-       vmi_time_init_clockevent();
-       apic_write(APIC_LVTT, vmi_get_timer_vector());
-}
-#endif
-
-/** vmi clocksource */
-
-static cycle_t read_real_cycles(void)
-{
-       return vmi_timer_ops.get_cycle_counter(VMI_CYCLES_REAL);
-}
-
-static struct clocksource clocksource_vmi = {
-       .name                   = "vmi-timer",
-       .rating                 = 450,
-       .read                   = read_real_cycles,
-       .mask                   = CLOCKSOURCE_MASK(64),
-       .mult                   = 0, /* to be set */
-       .shift                  = 22,
-       .flags                  = CLOCK_SOURCE_IS_CONTINUOUS,
-};
-
-static int __init init_vmi_clocksource(void)
-{
-       cycle_t cycles_per_msec;
-
-       if (!vmi_timer_ops.get_cycle_frequency)
-               return 0;
-       /* Use khz2mult rather than hz2mult since hz arg is only 32-bits. */
-       cycles_per_msec = vmi_timer_ops.get_cycle_frequency();
-       (void)do_div(cycles_per_msec, 1000);
-
-       /* Note that clocksource.{mult, shift} converts in the opposite direction
-        * as clockevents.  */
-       clocksource_vmi.mult = clocksource_khz2mult(cycles_per_msec,
-                                                   clocksource_vmi.shift);
-
-       printk(KERN_WARNING "vmi: registering clock source khz=%lld\n", cycles_per_msec);
-       return clocksource_register(&clocksource_vmi);
-
-}
-module_init(init_vmi_clocksource);
diff --git a/arch/i386/kernel/vmlinux.lds.S b/arch/i386/kernel/vmlinux.lds.S
deleted file mode 100644 (file)
index 849ee61..0000000
+++ /dev/null
@@ -1,5 +0,0 @@
-#ifdef CONFIG_X86_32
-# include "vmlinux_32.lds.S"
-#else
-# include "vmlinux_64.lds.S"
-#endif
diff --git a/arch/i386/kernel/vmlinux_32.lds.S b/arch/i386/kernel/vmlinux_32.lds.S
deleted file mode 100644 (file)
index 7d72cce..0000000
+++ /dev/null
@@ -1,213 +0,0 @@
-/* ld script to make i386 Linux kernel
- * Written by Martin Mares <mj@atrey.karlin.mff.cuni.cz>;
- *
- * Don't define absolute symbols until and unless you know that symbol
- * value is should remain constant even if kernel image is relocated
- * at run time. Absolute symbols are not relocated. If symbol value should
- * change if kernel is relocated, make the symbol section relative and
- * put it inside the section definition.
- */
-
-/* Don't define absolute symbols until and unless you know that symbol
- * value is should remain constant even if kernel image is relocated
- * at run time. Absolute symbols are not relocated. If symbol value should
- * change if kernel is relocated, make the symbol section relative and
- * put it inside the section definition.
- */
-#define LOAD_OFFSET __PAGE_OFFSET
-
-#include <asm-generic/vmlinux.lds.h>
-#include <asm/thread_info.h>
-#include <asm/page.h>
-#include <asm/cache.h>
-#include <asm/boot.h>
-
-OUTPUT_FORMAT("elf32-i386", "elf32-i386", "elf32-i386")
-OUTPUT_ARCH(i386)
-ENTRY(phys_startup_32)
-jiffies = jiffies_64;
-
-PHDRS {
-       text PT_LOAD FLAGS(5);  /* R_E */
-       data PT_LOAD FLAGS(7);  /* RWE */
-       note PT_NOTE FLAGS(0);  /* ___ */
-}
-SECTIONS
-{
-  . = LOAD_OFFSET + LOAD_PHYSICAL_ADDR;
-  phys_startup_32 = startup_32 - LOAD_OFFSET;
-
-  .text.head : AT(ADDR(.text.head) - LOAD_OFFSET) {
-       _text = .;                      /* Text and read-only data */
-       *(.text.head)
-  } :text = 0x9090
-
-  /* read-only */
-  .text : AT(ADDR(.text) - LOAD_OFFSET) {
-       TEXT_TEXT
-       SCHED_TEXT
-       LOCK_TEXT
-       KPROBES_TEXT
-       *(.fixup)
-       *(.gnu.warning)
-       _etext = .;                     /* End of text section */
-  } :text = 0x9090
-
-  . = ALIGN(16);               /* Exception table */
-  __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) {
-       __start___ex_table = .;
-        *(__ex_table)
-       __stop___ex_table = .;
-  }
-
-  NOTES :text :note
-
-  BUG_TABLE :text
-
-  . = ALIGN(4);
-  .tracedata : AT(ADDR(.tracedata) - LOAD_OFFSET) {
-       __tracedata_start = .;
-       *(.tracedata)
-       __tracedata_end = .;
-  }
-
-  RODATA
-
-  /* writeable */
-  . = ALIGN(4096);
-  .data : AT(ADDR(.data) - LOAD_OFFSET) {      /* Data */
-       DATA_DATA
-       CONSTRUCTORS
-       } :data
-
-  . = ALIGN(4096);
-  .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) {
-       __nosave_begin = .;
-       *(.data.nosave)
-       . = ALIGN(4096);
-       __nosave_end = .;
-  }
-
-  . = ALIGN(4096);
-  .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) {
-       *(.data.page_aligned)
-       *(.data.idt)
-  }
-
-  . = ALIGN(32);
-  .data.cacheline_aligned : AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) {
-       *(.data.cacheline_aligned)
-  }
-
-  /* rarely changed data like cpu maps */
-  . = ALIGN(32);
-  .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) {
-       *(.data.read_mostly)
-       _edata = .;             /* End of data section */
-  }
-
-  . = ALIGN(THREAD_SIZE);      /* init_task */
-  .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) {
-       *(.data.init_task)
-  }
-
-  /* might get freed after init */
-  . = ALIGN(4096);
-  .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) {
-       __smp_locks = .;
-       *(.smp_locks)
-       __smp_locks_end = .;
-  }
-  /* will be freed after init
-   * Following ALIGN() is required to make sure no other data falls on the
-   * same page where __smp_alt_end is pointing as that page might be freed
-   * after boot. Always make sure that ALIGN() directive is present after
-   * the section which contains __smp_alt_end.
-   */
-  . = ALIGN(4096);
-
-  /* will be freed after init */
-  . = ALIGN(4096);             /* Init code and data */
-  .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) {
-       __init_begin = .;
-       _sinittext = .;
-       *(.init.text)
-       _einittext = .;
-  }
-  .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) { *(.init.data) }
-  . = ALIGN(16);
-  .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) {
-       __setup_start = .;
-       *(.init.setup)
-       __setup_end = .;
-   }
-  .initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) {
-       __initcall_start = .;
-       INITCALLS
-       __initcall_end = .;
-  }
-  .con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) {
-       __con_initcall_start = .;
-       *(.con_initcall.init)
-       __con_initcall_end = .;
-  }
-  SECURITY_INIT
-  . = ALIGN(4);
-  .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) {
-       __alt_instructions = .;
-       *(.altinstructions)
-       __alt_instructions_end = .;
-  }
-  .altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) {
-       *(.altinstr_replacement)
-  }
-  . = ALIGN(4);
-  .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) {
-       __parainstructions = .;
-       *(.parainstructions)
-       __parainstructions_end = .;
-  }
-  /* .exit.text is discard at runtime, not link time, to deal with references
-     from .altinstructions and .eh_frame */
-  .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) { *(.exit.text) }
-  .exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) { *(.exit.data) }
-#if defined(CONFIG_BLK_DEV_INITRD)
-  . = ALIGN(4096);
-  .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) {
-       __initramfs_start = .;
-       *(.init.ramfs)
-       __initramfs_end = .;
-  }
-#endif
-  . = ALIGN(4096);
-  .data.percpu  : AT(ADDR(.data.percpu) - LOAD_OFFSET) {
-       __per_cpu_start = .;
-       *(.data.percpu)
-       *(.data.percpu.shared_aligned)
-       __per_cpu_end = .;
-  }
-  . = ALIGN(4096);
-  /* freed after init ends here */
-       
-  .bss : AT(ADDR(.bss) - LOAD_OFFSET) {
-       __init_end = .;
-       __bss_start = .;                /* BSS */
-       *(.bss.page_aligned)
-       *(.bss)
-       . = ALIGN(4);
-       __bss_stop = .;
-       _end = . ;
-       /* This is where the kernel creates the early boot page tables */
-       . = ALIGN(4096);
-       pg0 = . ;
-  }
-
-  /* Sections to be discarded */
-  /DISCARD/ : {
-       *(.exitcall.exit)
-       }
-
-  STABS_DEBUG
-
-  DWARF_DEBUG
-}
diff --git a/arch/i386/kernel/vsyscall-int80_32.S b/arch/i386/kernel/vsyscall-int80_32.S
deleted file mode 100644 (file)
index 103cab6..0000000
+++ /dev/null
@@ -1,53 +0,0 @@
-/*
- * Code for the vsyscall page.  This version uses the old int $0x80 method.
- *
- * NOTE:
- * 1) __kernel_vsyscall _must_ be first in this page.
- * 2) there are alignment constraints on this stub, see vsyscall-sigreturn.S
- *    for details.
- */
-
-       .text
-       .globl __kernel_vsyscall
-       .type __kernel_vsyscall,@function
-__kernel_vsyscall:
-.LSTART_vsyscall:
-       int $0x80
-       ret
-.LEND_vsyscall:
-       .size __kernel_vsyscall,.-.LSTART_vsyscall
-       .previous
-
-       .section .eh_frame,"a",@progbits
-.LSTARTFRAMEDLSI:
-       .long .LENDCIEDLSI-.LSTARTCIEDLSI
-.LSTARTCIEDLSI:
-       .long 0                 /* CIE ID */
-       .byte 1                 /* Version number */
-       .string "zR"            /* NUL-terminated augmentation string */
-       .uleb128 1              /* Code alignment factor */
-       .sleb128 -4             /* Data alignment factor */
-       .byte 8                 /* Return address register column */
-       .uleb128 1              /* Augmentation value length */
-       .byte 0x1b              /* DW_EH_PE_pcrel|DW_EH_PE_sdata4. */
-       .byte 0x0c              /* DW_CFA_def_cfa */
-       .uleb128 4
-       .uleb128 4
-       .byte 0x88              /* DW_CFA_offset, column 0x8 */
-       .uleb128 1
-       .align 4
-.LENDCIEDLSI:
-       .long .LENDFDEDLSI-.LSTARTFDEDLSI /* Length FDE */
-.LSTARTFDEDLSI:
-       .long .LSTARTFDEDLSI-.LSTARTFRAMEDLSI /* CIE pointer */
-       .long .LSTART_vsyscall-.        /* PC-relative start address */
-       .long .LEND_vsyscall-.LSTART_vsyscall
-       .uleb128 0
-       .align 4
-.LENDFDEDLSI:
-       .previous
-
-/*
- * Get the common code for the sigreturn entry points.
- */
-#include "vsyscall-sigreturn_32.S"
diff --git a/arch/i386/kernel/vsyscall-note_32.S b/arch/i386/kernel/vsyscall-note_32.S
deleted file mode 100644 (file)
index fcf376a..0000000
+++ /dev/null
@@ -1,45 +0,0 @@
-/*
- * This supplies .note.* sections to go into the PT_NOTE inside the vDSO text.
- * Here we can supply some information useful to userland.
- */
-
-#include <linux/version.h>
-#include <linux/elfnote.h>
-
-/* Ideally this would use UTS_NAME, but using a quoted string here
-   doesn't work. Remember to change this when changing the
-   kernel's name. */
-ELFNOTE_START(Linux, 0, "a")
-       .long LINUX_VERSION_CODE
-ELFNOTE_END
-
-#ifdef CONFIG_XEN
-/*
- * Add a special note telling glibc's dynamic linker a fake hardware
- * flavor that it will use to choose the search path for libraries in the
- * same way it uses real hardware capabilities like "mmx".
- * We supply "nosegneg" as the fake capability, to indicate that we
- * do not like negative offsets in instructions using segment overrides,
- * since we implement those inefficiently.  This makes it possible to
- * install libraries optimized to avoid those access patterns in someplace
- * like /lib/i686/tls/nosegneg.  Note that an /etc/ld.so.conf.d/file
- * corresponding to the bits here is needed to make ldconfig work right.
- * It should contain:
- *     hwcap 1 nosegneg
- * to match the mapping of bit to name that we give here.
- *
- * At runtime, the fake hardware feature will be considered to be present
- * if its bit is set in the mask word.  So, we start with the mask 0, and
- * at boot time we set VDSO_NOTE_NONEGSEG_BIT if running under Xen.
- */
-
-#include "../../x86/xen/vdso.h"        /* Defines VDSO_NOTE_NONEGSEG_BIT.  */
-
-       .globl VDSO_NOTE_MASK
-ELFNOTE_START(GNU, 2, "a")
-       .long 1                 /* ncaps */
-VDSO_NOTE_MASK:
-       .long 0                 /* mask */
-       .byte VDSO_NOTE_NONEGSEG_BIT; .asciz "nosegneg" /* bit, name */
-ELFNOTE_END
-#endif
diff --git a/arch/i386/kernel/vsyscall-sigreturn_32.S b/arch/i386/kernel/vsyscall-sigreturn_32.S
deleted file mode 100644 (file)
index a92262f..0000000
+++ /dev/null
@@ -1,143 +0,0 @@
-/*
- * Common code for the sigreturn entry points on the vsyscall page.
- * So far this code is the same for both int80 and sysenter versions.
- * This file is #include'd by vsyscall-*.S to define them after the
- * vsyscall entry point.  The kernel assumes that the addresses of these
- * routines are constant for all vsyscall implementations.
- */
-
-#include <asm/unistd.h>
-#include <asm/asm-offsets.h>
-
-
-/* XXX
-   Should these be named "_sigtramp" or something?
-*/
-
-       .text
-       .org __kernel_vsyscall+32,0x90
-       .globl __kernel_sigreturn
-       .type __kernel_sigreturn,@function
-__kernel_sigreturn:
-.LSTART_sigreturn:
-       popl %eax               /* XXX does this mean it needs unwind info? */
-       movl $__NR_sigreturn, %eax
-       int $0x80
-.LEND_sigreturn:
-       .size __kernel_sigreturn,.-.LSTART_sigreturn
-
-       .balign 32
-       .globl __kernel_rt_sigreturn
-       .type __kernel_rt_sigreturn,@function
-__kernel_rt_sigreturn:
-.LSTART_rt_sigreturn:
-       movl $__NR_rt_sigreturn, %eax
-       int $0x80
-.LEND_rt_sigreturn:
-       .size __kernel_rt_sigreturn,.-.LSTART_rt_sigreturn
-       .balign 32
-       .previous
-
-       .section .eh_frame,"a",@progbits
-.LSTARTFRAMEDLSI1:
-       .long .LENDCIEDLSI1-.LSTARTCIEDLSI1
-.LSTARTCIEDLSI1:
-       .long 0                 /* CIE ID */
-       .byte 1                 /* Version number */
-       .string "zRS"           /* NUL-terminated augmentation string */
-       .uleb128 1              /* Code alignment factor */
-       .sleb128 -4             /* Data alignment factor */
-       .byte 8                 /* Return address register column */
-       .uleb128 1              /* Augmentation value length */
-       .byte 0x1b              /* DW_EH_PE_pcrel|DW_EH_PE_sdata4. */
-       .byte 0                 /* DW_CFA_nop */
-       .align 4
-.LENDCIEDLSI1:
-       .long .LENDFDEDLSI1-.LSTARTFDEDLSI1 /* Length FDE */
-.LSTARTFDEDLSI1:
-       .long .LSTARTFDEDLSI1-.LSTARTFRAMEDLSI1 /* CIE pointer */
-       /* HACK: The dwarf2 unwind routines will subtract 1 from the
-          return address to get an address in the middle of the
-          presumed call instruction.  Since we didn't get here via
-          a call, we need to include the nop before the real start
-          to make up for it.  */
-       .long .LSTART_sigreturn-1-.     /* PC-relative start address */
-       .long .LEND_sigreturn-.LSTART_sigreturn+1
-       .uleb128 0                      /* Augmentation */
-       /* What follows are the instructions for the table generation.
-          We record the locations of each register saved.  This is
-          complicated by the fact that the "CFA" is always assumed to
-          be the value of the stack pointer in the caller.  This means
-          that we must define the CFA of this body of code to be the
-          saved value of the stack pointer in the sigcontext.  Which
-          also means that there is no fixed relation to the other 
-          saved registers, which means that we must use DW_CFA_expression
-          to compute their addresses.  It also means that when we 
-          adjust the stack with the popl, we have to do it all over again.  */
-
-#define do_cfa_expr(offset)                                            \
-       .byte 0x0f;                     /* DW_CFA_def_cfa_expression */ \
-       .uleb128 1f-0f;                 /*   length */                  \
-0:     .byte 0x74;                     /*     DW_OP_breg4 */           \
-       .sleb128 offset;                /*      offset */               \
-       .byte 0x06;                     /*     DW_OP_deref */           \
-1:
-
-#define do_expr(regno, offset)                                         \
-       .byte 0x10;                     /* DW_CFA_expression */         \
-       .uleb128 regno;                 /*   regno */                   \
-       .uleb128 1f-0f;                 /*   length */                  \
-0:     .byte 0x74;                     /*     DW_OP_breg4 */           \
-       .sleb128 offset;                /*       offset */              \
-1:
-
-       do_cfa_expr(SIGCONTEXT_esp+4)
-       do_expr(0, SIGCONTEXT_eax+4)
-       do_expr(1, SIGCONTEXT_ecx+4)
-       do_expr(2, SIGCONTEXT_edx+4)
-       do_expr(3, SIGCONTEXT_ebx+4)
-       do_expr(5, SIGCONTEXT_ebp+4)
-       do_expr(6, SIGCONTEXT_esi+4)
-       do_expr(7, SIGCONTEXT_edi+4)
-       do_expr(8, SIGCONTEXT_eip+4)
-
-       .byte 0x42      /* DW_CFA_advance_loc 2 -- nop; popl eax. */
-
-       do_cfa_expr(SIGCONTEXT_esp)
-       do_expr(0, SIGCONTEXT_eax)
-       do_expr(1, SIGCONTEXT_ecx)
-       do_expr(2, SIGCONTEXT_edx)
-       do_expr(3, SIGCONTEXT_ebx)
-       do_expr(5, SIGCONTEXT_ebp)
-       do_expr(6, SIGCONTEXT_esi)
-       do_expr(7, SIGCONTEXT_edi)
-       do_expr(8, SIGCONTEXT_eip)
-
-       .align 4
-.LENDFDEDLSI1:
-
-       .long .LENDFDEDLSI2-.LSTARTFDEDLSI2 /* Length FDE */
-.LSTARTFDEDLSI2:
-       .long .LSTARTFDEDLSI2-.LSTARTFRAMEDLSI1 /* CIE pointer */
-       /* HACK: See above wrt unwind library assumptions.  */
-       .long .LSTART_rt_sigreturn-1-.  /* PC-relative start address */
-       .long .LEND_rt_sigreturn-.LSTART_rt_sigreturn+1
-       .uleb128 0                      /* Augmentation */
-       /* What follows are the instructions for the table generation.
-          We record the locations of each register saved.  This is
-          slightly less complicated than the above, since we don't
-          modify the stack pointer in the process.  */
-
-       do_cfa_expr(RT_SIGFRAME_sigcontext-4 + SIGCONTEXT_esp)
-       do_expr(0, RT_SIGFRAME_sigcontext-4 + SIGCONTEXT_eax)
-       do_expr(1, RT_SIGFRAME_sigcontext-4 + SIGCONTEXT_ecx)
-       do_expr(2, RT_SIGFRAME_sigcontext-4 + SIGCONTEXT_edx)
-       do_expr(3, RT_SIGFRAME_sigcontext-4 + SIGCONTEXT_ebx)
-       do_expr(5, RT_SIGFRAME_sigcontext-4 + SIGCONTEXT_ebp)
-       do_expr(6, RT_SIGFRAME_sigcontext-4 + SIGCONTEXT_esi)
-       do_expr(7, RT_SIGFRAME_sigcontext-4 + SIGCONTEXT_edi)
-       do_expr(8, RT_SIGFRAME_sigcontext-4 + SIGCONTEXT_eip)
-
-       .align 4
-.LENDFDEDLSI2:
-       .previous
diff --git a/arch/i386/kernel/vsyscall-sysenter_32.S b/arch/i386/kernel/vsyscall-sysenter_32.S
deleted file mode 100644 (file)
index ed879bf..0000000
+++ /dev/null
@@ -1,122 +0,0 @@
-/*
- * Code for the vsyscall page.  This version uses the sysenter instruction.
- *
- * NOTE:
- * 1) __kernel_vsyscall _must_ be first in this page.
- * 2) there are alignment constraints on this stub, see vsyscall-sigreturn.S
- *    for details.
- */
-
-/*
- * The caller puts arg2 in %ecx, which gets pushed. The kernel will use
- * %ecx itself for arg2. The pushing is because the sysexit instruction
- * (found in entry.S) requires that we clobber %ecx with the desired %esp.
- * User code might expect that %ecx is unclobbered though, as it would be
- * for returning via the iret instruction, so we must push and pop.
- *
- * The caller puts arg3 in %edx, which the sysexit instruction requires
- * for %eip. Thus, exactly as for arg2, we must push and pop.
- *
- * Arg6 is different. The caller puts arg6 in %ebp. Since the sysenter
- * instruction clobbers %esp, the user's %esp won't even survive entry
- * into the kernel. We store %esp in %ebp. Code in entry.S must fetch
- * arg6 from the stack.
- *
- * You can not use this vsyscall for the clone() syscall because the
- * three dwords on the parent stack do not get copied to the child.
- */
-       .text
-       .globl __kernel_vsyscall
-       .type __kernel_vsyscall,@function
-__kernel_vsyscall:
-.LSTART_vsyscall:
-       push %ecx
-.Lpush_ecx:
-       push %edx
-.Lpush_edx:
-       push %ebp
-.Lenter_kernel:
-       movl %esp,%ebp
-       sysenter
-
-       /* 7: align return point with nop's to make disassembly easier */
-       .space 7,0x90
-
-       /* 14: System call restart point is here! (SYSENTER_RETURN-2) */
-       jmp .Lenter_kernel
-       /* 16: System call normal return point is here! */
-       .globl SYSENTER_RETURN  /* Symbol used by sysenter.c  */
-SYSENTER_RETURN:
-       pop %ebp
-.Lpop_ebp:
-       pop %edx
-.Lpop_edx:
-       pop %ecx
-.Lpop_ecx:
-       ret
-.LEND_vsyscall:
-       .size __kernel_vsyscall,.-.LSTART_vsyscall
-       .previous
-
-       .section .eh_frame,"a",@progbits
-.LSTARTFRAMEDLSI:
-       .long .LENDCIEDLSI-.LSTARTCIEDLSI
-.LSTARTCIEDLSI:
-       .long 0                 /* CIE ID */
-       .byte 1                 /* Version number */
-       .string "zR"            /* NUL-terminated augmentation string */
-       .uleb128 1              /* Code alignment factor */
-       .sleb128 -4             /* Data alignment factor */
-       .byte 8                 /* Return address register column */
-       .uleb128 1              /* Augmentation value length */
-       .byte 0x1b              /* DW_EH_PE_pcrel|DW_EH_PE_sdata4. */
-       .byte 0x0c              /* DW_CFA_def_cfa */
-       .uleb128 4
-       .uleb128 4
-       .byte 0x88              /* DW_CFA_offset, column 0x8 */
-       .uleb128 1
-       .align 4
-.LENDCIEDLSI:
-       .long .LENDFDEDLSI-.LSTARTFDEDLSI /* Length FDE */
-.LSTARTFDEDLSI:
-       .long .LSTARTFDEDLSI-.LSTARTFRAMEDLSI /* CIE pointer */
-       .long .LSTART_vsyscall-.        /* PC-relative start address */
-       .long .LEND_vsyscall-.LSTART_vsyscall
-       .uleb128 0
-       /* What follows are the instructions for the table generation.
-          We have to record all changes of the stack pointer.  */
-       .byte 0x04              /* DW_CFA_advance_loc4 */
-       .long .Lpush_ecx-.LSTART_vsyscall
-       .byte 0x0e              /* DW_CFA_def_cfa_offset */
-       .byte 0x08              /* RA at offset 8 now */
-       .byte 0x04              /* DW_CFA_advance_loc4 */
-       .long .Lpush_edx-.Lpush_ecx
-       .byte 0x0e              /* DW_CFA_def_cfa_offset */
-       .byte 0x0c              /* RA at offset 12 now */
-       .byte 0x04              /* DW_CFA_advance_loc4 */
-       .long .Lenter_kernel-.Lpush_edx
-       .byte 0x0e              /* DW_CFA_def_cfa_offset */
-       .byte 0x10              /* RA at offset 16 now */
-       .byte 0x85, 0x04        /* DW_CFA_offset %ebp -16 */
-       /* Finally the epilogue.  */
-       .byte 0x04              /* DW_CFA_advance_loc4 */
-       .long .Lpop_ebp-.Lenter_kernel
-       .byte 0x0e              /* DW_CFA_def_cfa_offset */
-       .byte 0x0c              /* RA at offset 12 now */
-       .byte 0xc5              /* DW_CFA_restore %ebp */
-       .byte 0x04              /* DW_CFA_advance_loc4 */
-       .long .Lpop_edx-.Lpop_ebp
-       .byte 0x0e              /* DW_CFA_def_cfa_offset */
-       .byte 0x08              /* RA at offset 8 now */
-       .byte 0x04              /* DW_CFA_advance_loc4 */
-       .long .Lpop_ecx-.Lpop_edx
-       .byte 0x0e              /* DW_CFA_def_cfa_offset */
-       .byte 0x04              /* RA at offset 4 now */
-       .align 4
-.LENDFDEDLSI:
-       .previous
-
-/*
- * Get the common code for the sigreturn entry points.
- */
-#include "vsyscall-sigreturn_32.S"
diff --git a/arch/i386/kernel/vsyscall_32.S b/arch/i386/kernel/vsyscall_32.S
deleted file mode 100644 (file)
index 0330744..0000000
+++ /dev/null
@@ -1,15 +0,0 @@
-#include <linux/init.h>
-
-__INITDATA
-
-       .globl vsyscall_int80_start, vsyscall_int80_end
-vsyscall_int80_start:
-       .incbin "arch/i386/kernel/vsyscall-int80_32.so"
-vsyscall_int80_end:
-
-       .globl vsyscall_sysenter_start, vsyscall_sysenter_end
-vsyscall_sysenter_start:
-       .incbin "arch/i386/kernel/vsyscall-sysenter_32.so"
-vsyscall_sysenter_end:
-
-__FINIT
diff --git a/arch/i386/kernel/vsyscall_32.lds.S b/arch/i386/kernel/vsyscall_32.lds.S
deleted file mode 100644 (file)
index 4a8b0ed..0000000
+++ /dev/null
@@ -1,67 +0,0 @@
-/*
- * Linker script for vsyscall DSO.  The vsyscall page is an ELF shared
- * object prelinked to its virtual address, and with only one read-only
- * segment (that fits in one page).  This script controls its layout.
- */
-#include <asm/asm-offsets.h>
-
-SECTIONS
-{
-  . = VDSO_PRELINK_asm + SIZEOF_HEADERS;
-
-  .hash           : { *(.hash) }               :text
-  .gnu.hash       : { *(.gnu.hash) }
-  .dynsym         : { *(.dynsym) }
-  .dynstr         : { *(.dynstr) }
-  .gnu.version    : { *(.gnu.version) }
-  .gnu.version_d  : { *(.gnu.version_d) }
-  .gnu.version_r  : { *(.gnu.version_r) }
-
-  /* This linker script is used both with -r and with -shared.
-     For the layouts to match, we need to skip more than enough
-     space for the dynamic symbol table et al.  If this amount
-     is insufficient, ld -shared will barf.  Just increase it here.  */
-  . = VDSO_PRELINK_asm + 0x400;
-
-  .text           : { *(.text) }               :text =0x90909090
-  .note                  : { *(.note.*) }              :text :note
-  .eh_frame_hdr   : { *(.eh_frame_hdr) }       :text :eh_frame_hdr
-  .eh_frame       : { KEEP (*(.eh_frame)) }    :text
-  .dynamic        : { *(.dynamic) }            :text :dynamic
-  .useless        : {
-       *(.got.plt) *(.got)
-       *(.data .data.* .gnu.linkonce.d.*)
-       *(.dynbss)
-       *(.bss .bss.* .gnu.linkonce.b.*)
-  }                                            :text
-}
-
-/*
- * We must supply the ELF program headers explicitly to get just one
- * PT_LOAD segment, and set the flags explicitly to make segments read-only.
- */
-PHDRS
-{
-  text PT_LOAD FILEHDR PHDRS FLAGS(5); /* PF_R|PF_X */
-  dynamic PT_DYNAMIC FLAGS(4); /* PF_R */
-  note PT_NOTE FLAGS(4); /* PF_R */
-  eh_frame_hdr 0x6474e550; /* PT_GNU_EH_FRAME, but ld doesn't match the name */
-}
-
-/*
- * This controls what symbols we export from the DSO.
- */
-VERSION
-{
-  LINUX_2.5 {
-    global:
-       __kernel_vsyscall;
-       __kernel_sigreturn;
-       __kernel_rt_sigreturn;
-
-    local: *;
-  };
-}
-
-/* The ELF entry point can be used to set the AT_SYSINFO value.  */
-ENTRY(__kernel_vsyscall);
index 4c1f17d..12d4148 100644 (file)
@@ -9,4 +9,4 @@
 
 #define old_mmap old_mmap_i386
 
-#include "../../i386/kernel/syscall_table_32.S"
+#include "../../x86/kernel/syscall_table_32.S"
diff --git a/arch/x86/kernel/.gitignore b/arch/x86/kernel/.gitignore
new file mode 100644 (file)
index 0000000..40836ad
--- /dev/null
@@ -0,0 +1 @@
+vsyscall.lds
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
new file mode 100644 (file)
index 0000000..577d08f
--- /dev/null
@@ -0,0 +1,5 @@
+ifeq ($(CONFIG_X86_32),y)
+include ${srctree}/arch/x86/kernel/Makefile_32
+else
+include ${srctree}/arch/x86_64/kernel/Makefile_64
+endif
diff --git a/arch/x86/kernel/Makefile_32 b/arch/x86/kernel/Makefile_32
new file mode 100644 (file)
index 0000000..5096f48
--- /dev/null
@@ -0,0 +1,88 @@
+#
+# Makefile for the linux kernel.
+#
+
+extra-y := head_32.o init_task_32.o vmlinux.lds
+
+obj-y  := process_32.o signal_32.o entry_32.o traps_32.o irq_32.o \
+               ptrace_32.o time_32.o ioport_32.o ldt_32.o setup_32.o i8259_32.o sys_i386_32.o \
+               pci-dma_32.o i386_ksyms_32.o i387_32.o bootflag.o e820_32.o\
+               quirks.o i8237.o topology.o alternative.o i8253_32.o tsc_32.o
+
+obj-$(CONFIG_STACKTRACE)       += stacktrace.o
+obj-y                          += ../../x86/kernel/cpu/
+obj-y                          += ../../x86/kernel/acpi/
+obj-$(CONFIG_X86_BIOS_REBOOT)  += reboot_32.o
+obj-$(CONFIG_MCA)              += mca_32.o
+obj-$(CONFIG_X86_MSR)          += msr.o
+obj-$(CONFIG_X86_CPUID)                += cpuid.o
+obj-$(CONFIG_MICROCODE)                += microcode.o
+obj-$(CONFIG_APM)              += apm_32.o
+obj-$(CONFIG_X86_SMP)          += smp_32.o smpboot_32.o tsc_sync.o
+obj-$(CONFIG_SMP)              += smpcommon_32.o
+obj-$(CONFIG_X86_TRAMPOLINE)   += trampoline_32.o
+obj-$(CONFIG_X86_MPPARSE)      += mpparse_32.o
+obj-$(CONFIG_X86_LOCAL_APIC)   += apic_32.o nmi_32.o
+obj-$(CONFIG_X86_IO_APIC)      += io_apic_32.o
+obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o
+obj-$(CONFIG_KEXEC)            += machine_kexec_32.o relocate_kernel_32.o crash_32.o
+obj-$(CONFIG_CRASH_DUMP)       += crash_dump_32.o
+obj-$(CONFIG_X86_NUMAQ)                += numaq_32.o
+obj-$(CONFIG_X86_SUMMIT_NUMA)  += summit_32.o
+obj-$(CONFIG_KPROBES)          += kprobes_32.o
+obj-$(CONFIG_MODULES)          += module_32.o
+obj-y                          += sysenter_32.o vsyscall_32.o
+obj-$(CONFIG_ACPI_SRAT)        += srat_32.o
+obj-$(CONFIG_EFI)              += efi_32.o efi_stub_32.o
+obj-$(CONFIG_DOUBLEFAULT)      += doublefault_32.o
+obj-$(CONFIG_VM86)             += vm86_32.o
+obj-$(CONFIG_EARLY_PRINTK)     += early_printk.o
+obj-$(CONFIG_HPET_TIMER)       += hpet_32.o
+obj-$(CONFIG_K8_NB)            += k8.o
+obj-$(CONFIG_MGEODE_LX)                += geode_32.o
+
+obj-$(CONFIG_VMI)              += vmi_32.o vmiclock_32.o
+obj-$(CONFIG_PARAVIRT)         += paravirt_32.o
+obj-y                          += pcspeaker.o
+
+obj-$(CONFIG_SCx200)           += scx200_32.o
+
+# vsyscall_32.o contains the vsyscall DSO images as __initdata.
+# We must build both images before we can assemble it.
+# Note: kbuild does not track this dependency due to usage of .incbin
+$(obj)/vsyscall_32.o: $(obj)/vsyscall-int80_32.so $(obj)/vsyscall-sysenter_32.so
+targets += $(foreach F,int80 sysenter,vsyscall-$F.o vsyscall-$F.so)
+targets += vsyscall-note_32.o vsyscall_32.lds
+
+# The DSO images are built using a special linker script.
+quiet_cmd_syscall = SYSCALL $@
+      cmd_syscall = $(CC) -m elf_i386 -nostdlib $(SYSCFLAGS_$(@F)) \
+                         -Wl,-T,$(filter-out FORCE,$^) -o $@
+
+export CPPFLAGS_vsyscall_32.lds += -P -C -U$(ARCH)
+
+vsyscall-flags = -shared -s -Wl,-soname=linux-gate.so.1 \
+                $(call ld-option, -Wl$(comma)--hash-style=sysv)
+SYSCFLAGS_vsyscall-sysenter_32.so      = $(vsyscall-flags)
+SYSCFLAGS_vsyscall-int80_32.so = $(vsyscall-flags)
+
+$(obj)/vsyscall-int80_32.so $(obj)/vsyscall-sysenter_32.so: \
+$(obj)/vsyscall-%.so: $(src)/vsyscall_32.lds \
+                     $(obj)/vsyscall-%.o $(obj)/vsyscall-note_32.o FORCE
+       $(call if_changed,syscall)
+
+# We also create a special relocatable object that should mirror the symbol
+# table and layout of the linked DSO.  With ld -R we can then refer to
+# these symbols in the kernel code rather than hand-coded addresses.
+extra-y += vsyscall-syms.o
+$(obj)/built-in.o: $(obj)/vsyscall-syms.o
+$(obj)/built-in.o: ld_flags += -R $(obj)/vsyscall-syms.o
+
+SYSCFLAGS_vsyscall-syms.o = -r
+$(obj)/vsyscall-syms.o: $(src)/vsyscall_32.lds \
+                       $(obj)/vsyscall-sysenter_32.o $(obj)/vsyscall-note_32.o FORCE
+       $(call if_changed,syscall)
+
+k8-y                      += ../../x86_64/kernel/k8.o
+stacktrace-y             += ../../x86_64/kernel/stacktrace.o
+
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
new file mode 100644 (file)
index 0000000..bd72d94
--- /dev/null
@@ -0,0 +1,450 @@
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/spinlock.h>
+#include <linux/list.h>
+#include <linux/kprobes.h>
+#include <linux/mm.h>
+#include <linux/vmalloc.h>
+#include <asm/alternative.h>
+#include <asm/sections.h>
+#include <asm/pgtable.h>
+#include <asm/mce.h>
+#include <asm/nmi.h>
+
+#define MAX_PATCH_LEN (255-1)
+
+#ifdef CONFIG_HOTPLUG_CPU
+static int smp_alt_once;
+
+static int __init bootonly(char *str)
+{
+       smp_alt_once = 1;
+       return 1;
+}
+__setup("smp-alt-boot", bootonly);
+#else
+#define smp_alt_once 1
+#endif
+
+static int debug_alternative;
+
+static int __init debug_alt(char *str)
+{
+       debug_alternative = 1;
+       return 1;
+}
+__setup("debug-alternative", debug_alt);
+
+static int noreplace_smp;
+
+static int __init setup_noreplace_smp(char *str)
+{
+       noreplace_smp = 1;
+       return 1;
+}
+__setup("noreplace-smp", setup_noreplace_smp);
+
+#ifdef CONFIG_PARAVIRT
+static int noreplace_paravirt = 0;
+
+static int __init setup_noreplace_paravirt(char *str)
+{
+       noreplace_paravirt = 1;
+       return 1;
+}
+__setup("noreplace-paravirt", setup_noreplace_paravirt);
+#endif
+
+#define DPRINTK(fmt, args...) if (debug_alternative) \
+       printk(KERN_DEBUG fmt, args)
+
+#ifdef GENERIC_NOP1
+/* Use inline assembly to define this because the nops are defined
+   as inline assembly strings in the include files and we cannot
+   get them easily into strings. */
+asm("\t.data\nintelnops: "
+       GENERIC_NOP1 GENERIC_NOP2 GENERIC_NOP3 GENERIC_NOP4 GENERIC_NOP5 GENERIC_NOP6
+       GENERIC_NOP7 GENERIC_NOP8);
+extern unsigned char intelnops[];
+static unsigned char *intel_nops[ASM_NOP_MAX+1] = {
+       NULL,
+       intelnops,
+       intelnops + 1,
+       intelnops + 1 + 2,
+       intelnops + 1 + 2 + 3,
+       intelnops + 1 + 2 + 3 + 4,
+       intelnops + 1 + 2 + 3 + 4 + 5,
+       intelnops + 1 + 2 + 3 + 4 + 5 + 6,
+       intelnops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
+};
+#endif
+
+#ifdef K8_NOP1
+asm("\t.data\nk8nops: "
+       K8_NOP1 K8_NOP2 K8_NOP3 K8_NOP4 K8_NOP5 K8_NOP6
+       K8_NOP7 K8_NOP8);
+extern unsigned char k8nops[];
+static unsigned char *k8_nops[ASM_NOP_MAX+1] = {
+       NULL,
+       k8nops,
+       k8nops + 1,
+       k8nops + 1 + 2,
+       k8nops + 1 + 2 + 3,
+       k8nops + 1 + 2 + 3 + 4,
+       k8nops + 1 + 2 + 3 + 4 + 5,
+       k8nops + 1 + 2 + 3 + 4 + 5 + 6,
+       k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
+};
+#endif
+
+#ifdef K7_NOP1
+asm("\t.data\nk7nops: "
+       K7_NOP1 K7_NOP2 K7_NOP3 K7_NOP4 K7_NOP5 K7_NOP6
+       K7_NOP7 K7_NOP8);
+extern unsigned char k7nops[];
+static unsigned char *k7_nops[ASM_NOP_MAX+1] = {
+       NULL,
+       k7nops,
+       k7nops + 1,
+       k7nops + 1 + 2,
+       k7nops + 1 + 2 + 3,
+       k7nops + 1 + 2 + 3 + 4,
+       k7nops + 1 + 2 + 3 + 4 + 5,
+       k7nops + 1 + 2 + 3 + 4 + 5 + 6,
+       k7nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
+};
+#endif
+
+#ifdef CONFIG_X86_64
+
+extern char __vsyscall_0;
+static inline unsigned char** find_nop_table(void)
+{
+       return k8_nops;
+}
+
+#else /* CONFIG_X86_64 */
+
+static struct nop {
+       int cpuid;
+       unsigned char **noptable;
+} noptypes[] = {
+       { X86_FEATURE_K8, k8_nops },
+       { X86_FEATURE_K7, k7_nops },
+       { -1, NULL }
+};
+
+static unsigned char** find_nop_table(void)
+{
+       unsigned char **noptable = intel_nops;
+       int i;
+
+       for (i = 0; noptypes[i].cpuid >= 0; i++) {
+               if (boot_cpu_has(noptypes[i].cpuid)) {
+                       noptable = noptypes[i].noptable;
+                       break;
+               }
+       }
+       return noptable;
+}
+
+#endif /* CONFIG_X86_64 */
+
+/* Use this to add nops to a buffer, then text_poke the whole buffer. */
+static void add_nops(void *insns, unsigned int len)
+{
+       unsigned char **noptable = find_nop_table();
+
+       while (len > 0) {
+               unsigned int noplen = len;
+               if (noplen > ASM_NOP_MAX)
+                       noplen = ASM_NOP_MAX;
+               memcpy(insns, noptable[noplen], noplen);
+               insns += noplen;
+               len -= noplen;
+       }
+}
+
+extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
+extern u8 *__smp_locks[], *__smp_locks_end[];
+
+/* Replace instructions with better alternatives for this CPU type.
+   This runs before SMP is initialized to avoid SMP problems with
+   self modifying code. This implies that assymetric systems where
+   APs have less capabilities than the boot processor are not handled.
+   Tough. Make sure you disable such features by hand. */
+
+void apply_alternatives(struct alt_instr *start, struct alt_instr *end)
+{
+       struct alt_instr *a;
+       char insnbuf[MAX_PATCH_LEN];
+
+       DPRINTK("%s: alt table %p -> %p\n", __FUNCTION__, start, end);
+       for (a = start; a < end; a++) {
+               u8 *instr = a->instr;
+               BUG_ON(a->replacementlen > a->instrlen);
+               BUG_ON(a->instrlen > sizeof(insnbuf));
+               if (!boot_cpu_has(a->cpuid))
+                       continue;
+#ifdef CONFIG_X86_64
+               /* vsyscall code is not mapped yet. resolve it manually. */
+               if (instr >= (u8 *)VSYSCALL_START && instr < (u8*)VSYSCALL_END) {
+                       instr = __va(instr - (u8*)VSYSCALL_START + (u8*)__pa_symbol(&__vsyscall_0));
+                       DPRINTK("%s: vsyscall fixup: %p => %p\n",
+                               __FUNCTION__, a->instr, instr);
+               }
+#endif
+               memcpy(insnbuf, a->replacement, a->replacementlen);
+               add_nops(insnbuf + a->replacementlen,
+                        a->instrlen - a->replacementlen);
+               text_poke(instr, insnbuf, a->instrlen);
+       }
+}
+
+#ifdef CONFIG_SMP
+
+static void alternatives_smp_lock(u8 **start, u8 **end, u8 *text, u8 *text_end)
+{
+       u8 **ptr;
+
+       for (ptr = start; ptr < end; ptr++) {
+               if (*ptr < text)
+                       continue;
+               if (*ptr > text_end)
+                       continue;
+               text_poke(*ptr, ((unsigned char []){0xf0}), 1); /* add lock prefix */
+       };
+}
+
+static void alternatives_smp_unlock(u8 **start, u8 **end, u8 *text, u8 *text_end)
+{
+       u8 **ptr;
+       char insn[1];
+
+       if (noreplace_smp)
+               return;
+
+       add_nops(insn, 1);
+       for (ptr = start; ptr < end; ptr++) {
+               if (*ptr < text)
+                       continue;
+               if (*ptr > text_end)
+                       continue;
+               text_poke(*ptr, insn, 1);
+       };
+}
+
+struct smp_alt_module {
+       /* what is this ??? */
+       struct module   *mod;
+       char            *name;
+
+       /* ptrs to lock prefixes */
+       u8              **locks;
+       u8              **locks_end;
+
+       /* .text segment, needed to avoid patching init code ;) */
+       u8              *text;
+       u8              *text_end;
+
+       struct list_head next;
+};
+static LIST_HEAD(smp_alt_modules);
+static DEFINE_SPINLOCK(smp_alt);
+
+void alternatives_smp_module_add(struct module *mod, char *name,
+                                void *locks, void *locks_end,
+                                void *text,  void *text_end)
+{
+       struct smp_alt_module *smp;
+       unsigned long flags;
+
+       if (noreplace_smp)
+               return;
+
+       if (smp_alt_once) {
+               if (boot_cpu_has(X86_FEATURE_UP))
+                       alternatives_smp_unlock(locks, locks_end,
+                                               text, text_end);
+               return;
+       }
+
+       smp = kzalloc(sizeof(*smp), GFP_KERNEL);
+       if (NULL == smp)
+               return; /* we'll run the (safe but slow) SMP code then ... */
+
+       smp->mod        = mod;
+       smp->name       = name;
+       smp->locks      = locks;
+       smp->locks_end  = locks_end;
+       smp->text       = text;
+       smp->text_end   = text_end;
+       DPRINTK("%s: locks %p -> %p, text %p -> %p, name %s\n",
+               __FUNCTION__, smp->locks, smp->locks_end,
+               smp->text, smp->text_end, smp->name);
+
+       spin_lock_irqsave(&smp_alt, flags);
+       list_add_tail(&smp->next, &smp_alt_modules);
+       if (boot_cpu_has(X86_FEATURE_UP))
+               alternatives_smp_unlock(smp->locks, smp->locks_end,
+                                       smp->text, smp->text_end);
+       spin_unlock_irqrestore(&smp_alt, flags);
+}
+
+void alternatives_smp_module_del(struct module *mod)
+{
+       struct smp_alt_module *item;
+       unsigned long flags;
+
+       if (smp_alt_once || noreplace_smp)
+               return;
+
+       spin_lock_irqsave(&smp_alt, flags);
+       list_for_each_entry(item, &smp_alt_modules, next) {
+               if (mod != item->mod)
+                       continue;
+               list_del(&item->next);
+               spin_unlock_irqrestore(&smp_alt, flags);
+               DPRINTK("%s: %s\n", __FUNCTION__, item->name);
+               kfree(item);
+               return;
+       }
+       spin_unlock_irqrestore(&smp_alt, flags);
+}
+
+void alternatives_smp_switch(int smp)
+{
+       struct smp_alt_module *mod;
+       unsigned long flags;
+
+#ifdef CONFIG_LOCKDEP
+       /*
+        * A not yet fixed binutils section handling bug prevents
+        * alternatives-replacement from working reliably, so turn
+        * it off:
+        */
+       printk("lockdep: not fixing up alternatives.\n");
+       return;
+#endif
+
+       if (noreplace_smp || smp_alt_once)
+               return;
+       BUG_ON(!smp && (num_online_cpus() > 1));
+
+       spin_lock_irqsave(&smp_alt, flags);
+       if (smp) {
+               printk(KERN_INFO "SMP alternatives: switching to SMP code\n");
+               clear_bit(X86_FEATURE_UP, boot_cpu_data.x86_capability);
+               clear_bit(X86_FEATURE_UP, cpu_data[0].x86_capability);
+               list_for_each_entry(mod, &smp_alt_modules, next)
+                       alternatives_smp_lock(mod->locks, mod->locks_end,
+                                             mod->text, mod->text_end);
+       } else {
+               printk(KERN_INFO "SMP alternatives: switching to UP code\n");
+               set_bit(X86_FEATURE_UP, boot_cpu_data.x86_capability);
+               set_bit(X86_FEATURE_UP, cpu_data[0].x86_capability);
+               list_for_each_entry(mod, &smp_alt_modules, next)
+                       alternatives_smp_unlock(mod->locks, mod->locks_end,
+                                               mod->text, mod->text_end);
+       }
+       spin_unlock_irqrestore(&smp_alt, flags);
+}
+
+#endif
+
+#ifdef CONFIG_PARAVIRT
+void apply_paravirt(struct paravirt_patch_site *start,
+                   struct paravirt_patch_site *end)
+{
+       struct paravirt_patch_site *p;
+       char insnbuf[MAX_PATCH_LEN];
+
+       if (noreplace_paravirt)
+               return;
+
+       for (p = start; p < end; p++) {
+               unsigned int used;
+
+               BUG_ON(p->len > MAX_PATCH_LEN);
+               /* prep the buffer with the original instructions */
+               memcpy(insnbuf, p->instr, p->len);
+               used = paravirt_ops.patch(p->instrtype, p->clobbers, insnbuf,
+                                         (unsigned long)p->instr, p->len);
+
+               BUG_ON(used > p->len);
+
+               /* Pad the rest with nops */
+               add_nops(insnbuf + used, p->len - used);
+               text_poke(p->instr, insnbuf, p->len);
+       }
+}
+extern struct paravirt_patch_site __start_parainstructions[],
+       __stop_parainstructions[];
+#endif /* CONFIG_PARAVIRT */
+
+void __init alternative_instructions(void)
+{
+       unsigned long flags;
+
+       /* The patching is not fully atomic, so try to avoid local interruptions
+          that might execute the to be patched code.
+          Other CPUs are not running. */
+       stop_nmi();
+#ifdef CONFIG_X86_MCE
+       stop_mce();
+#endif
+
+       local_irq_save(flags);
+       apply_alternatives(__alt_instructions, __alt_instructions_end);
+
+       /* switch to patch-once-at-boottime-only mode and free the
+        * tables in case we know the number of CPUs will never ever
+        * change */
+#ifdef CONFIG_HOTPLUG_CPU
+       if (num_possible_cpus() < 2)
+               smp_alt_once = 1;
+#endif
+
+#ifdef CONFIG_SMP
+       if (smp_alt_once) {
+               if (1 == num_possible_cpus()) {
+                       printk(KERN_INFO "SMP alternatives: switching to UP code\n");
+                       set_bit(X86_FEATURE_UP, boot_cpu_data.x86_capability);
+                       set_bit(X86_FEATURE_UP, cpu_data[0].x86_capability);
+                       alternatives_smp_unlock(__smp_locks, __smp_locks_end,
+                                               _text, _etext);
+               }
+               free_init_pages("SMP alternatives",
+                               (unsigned long)__smp_locks,
+                               (unsigned long)__smp_locks_end);
+       } else {
+               alternatives_smp_module_add(NULL, "core kernel",
+                                           __smp_locks, __smp_locks_end,
+                                           _text, _etext);
+               alternatives_smp_switch(0);
+       }
+#endif
+       apply_paravirt(__parainstructions, __parainstructions_end);
+       local_irq_restore(flags);
+
+       restart_nmi();
+#ifdef CONFIG_X86_MCE
+       restart_mce();
+#endif
+}
+
+/*
+ * Warning:
+ * When you use this code to patch more than one byte of an instruction
+ * you need to make sure that other CPUs cannot execute this code in parallel.
+ * Also no thread must be currently preempted in the middle of these instructions.
+ * And on the local CPU you need to be protected again NMI or MCE handlers
+ * seeing an inconsistent instruction while you patch.
+ */
+void __kprobes text_poke(void *addr, unsigned char *opcode, int len)
+{
+       memcpy(addr, opcode, len);
+       sync_core();
+       /* Could also do a CLFLUSH here to speed up CPU recovery; but
+          that causes hangs on some VIA CPUs. */
+}
diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c
new file mode 100644 (file)
index 0000000..3d67ae1
--- /dev/null
@@ -0,0 +1,1566 @@
+/*
+ *     Local APIC handling, local APIC timers
+ *
+ *     (c) 1999, 2000 Ingo Molnar <mingo@redhat.com>
+ *
+ *     Fixes
+ *     Maciej W. Rozycki       :       Bits for genuine 82489DX APICs;
+ *                                     thanks to Eric Gilmore
+ *                                     and Rolf G. Tews
+ *                                     for testing these extensively.
+ *     Maciej W. Rozycki       :       Various updates and fixes.
+ *     Mikael Pettersson       :       Power Management for UP-APIC.
+ *     Pavel Machek and
+ *     Mikael Pettersson       :       PM converted to driver model.
+ */
+
+#include <linux/init.h>
+
+#include <linux/mm.h>
+#include <linux/delay.h>
+#include <linux/bootmem.h>
+#include <linux/interrupt.h>
+#include <linux/mc146818rtc.h>
+#include <linux/kernel_stat.h>
+#include <linux/sysdev.h>
+#include <linux/cpu.h>
+#include <linux/clockchips.h>
+#include <linux/acpi_pmtmr.h>
+#include <linux/module.h>
+#include <linux/dmi.h>
+
+#include <asm/atomic.h>
+#include <asm/smp.h>
+#include <asm/mtrr.h>
+#include <asm/mpspec.h>
+#include <asm/desc.h>
+#include <asm/arch_hooks.h>
+#include <asm/hpet.h>
+#include <asm/i8253.h>
+#include <asm/nmi.h>
+
+#include <mach_apic.h>
+#include <mach_apicdef.h>
+#include <mach_ipi.h>
+
+#include "io_ports.h"
+
+/*
+ * Sanity check
+ */
+#if (SPURIOUS_APIC_VECTOR & 0x0F) != 0x0F
+# error SPURIOUS_APIC_VECTOR definition error
+#endif
+
+/*
+ * Knob to control our willingness to enable the local APIC.
+ *
+ * -1=force-disable, +1=force-enable
+ */
+static int enable_local_apic __initdata = 0;
+
+/* Local APIC timer verification ok */
+static int local_apic_timer_verify_ok;
+/* Disable local APIC timer from the kernel commandline or via dmi quirk
+   or using CPU MSR check */
+int local_apic_timer_disabled;
+/* Local APIC timer works in C2 */
+int local_apic_timer_c2_ok;
+EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok);
+
+/*
+ * Debug level, exported for io_apic.c
+ */
+int apic_verbosity;
+
+static unsigned int calibration_result;
+
+static int lapic_next_event(unsigned long delta,
+                           struct clock_event_device *evt);
+static void lapic_timer_setup(enum clock_event_mode mode,
+                             struct clock_event_device *evt);
+static void lapic_timer_broadcast(cpumask_t mask);
+static void apic_pm_activate(void);
+
+/*
+ * The local apic timer can be used for any function which is CPU local.
+ */
+static struct clock_event_device lapic_clockevent = {
+       .name           = "lapic",
+       .features       = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT
+                       | CLOCK_EVT_FEAT_C3STOP | CLOCK_EVT_FEAT_DUMMY,
+       .shift          = 32,
+       .set_mode       = lapic_timer_setup,
+       .set_next_event = lapic_next_event,
+       .broadcast      = lapic_timer_broadcast,
+       .rating         = 100,
+       .irq            = -1,
+};
+static DEFINE_PER_CPU(struct clock_event_device, lapic_events);
+
+/* Local APIC was disabled by the BIOS and enabled by the kernel */
+static int enabled_via_apicbase;
+
+/*
+ * Get the LAPIC version
+ */
+static inline int lapic_get_version(void)
+{
+       return GET_APIC_VERSION(apic_read(APIC_LVR));
+}
+
+/*
+ * Check, if the APIC is integrated or a seperate chip
+ */
+static inline int lapic_is_integrated(void)
+{
+       return APIC_INTEGRATED(lapic_get_version());
+}
+
+/*
+ * Check, whether this is a modern or a first generation APIC
+ */
+static int modern_apic(void)
+{
+       /* AMD systems use old APIC versions, so check the CPU */
+       if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
+           boot_cpu_data.x86 >= 0xf)
+               return 1;
+       return lapic_get_version() >= 0x14;
+}
+
+void apic_wait_icr_idle(void)
+{
+       while (apic_read(APIC_ICR) & APIC_ICR_BUSY)
+               cpu_relax();
+}
+
+unsigned long safe_apic_wait_icr_idle(void)
+{
+       unsigned long send_status;
+       int timeout;
+
+       timeout = 0;
+       do {
+               send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY;
+               if (!send_status)
+                       break;
+               udelay(100);
+       } while (timeout++ < 1000);
+
+       return send_status;
+}
+
+/**
+ * enable_NMI_through_LVT0 - enable NMI through local vector table 0
+ */
+void enable_NMI_through_LVT0 (void * dummy)
+{
+       unsigned int v = APIC_DM_NMI;
+
+       /* Level triggered for 82489DX */
+       if (!lapic_is_integrated())
+               v |= APIC_LVT_LEVEL_TRIGGER;
+       apic_write_around(APIC_LVT0, v);
+}
+
+/**
+ * get_physical_broadcast - Get number of physical broadcast IDs
+ */
+int get_physical_broadcast(void)
+{
+       return modern_apic() ? 0xff : 0xf;
+}
+
+/**
+ * lapic_get_maxlvt - get the maximum number of local vector table entries
+ */
+int lapic_get_maxlvt(void)
+{
+       unsigned int v = apic_read(APIC_LVR);
+
+       /* 82489DXs do not report # of LVT entries. */
+       return APIC_INTEGRATED(GET_APIC_VERSION(v)) ? GET_APIC_MAXLVT(v) : 2;
+}
+
+/*
+ * Local APIC timer
+ */
+
+/* Clock divisor is set to 16 */
+#define APIC_DIVISOR 16
+
+/*
+ * This function sets up the local APIC timer, with a timeout of
+ * 'clocks' APIC bus clock. During calibration we actually call
+ * this function twice on the boot CPU, once with a bogus timeout
+ * value, second time for real. The other (noncalibrating) CPUs
+ * call this function only once, with the real, calibrated value.
+ *
+ * We do reads before writes even if unnecessary, to get around the
+ * P5 APIC double write bug.
+ */
+static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen)
+{
+       unsigned int lvtt_value, tmp_value;
+
+       lvtt_value = LOCAL_TIMER_VECTOR;
+       if (!oneshot)
+               lvtt_value |= APIC_LVT_TIMER_PERIODIC;
+       if (!lapic_is_integrated())
+               lvtt_value |= SET_APIC_TIMER_BASE(APIC_TIMER_BASE_DIV);
+
+       if (!irqen)
+               lvtt_value |= APIC_LVT_MASKED;
+
+       apic_write_around(APIC_LVTT, lvtt_value);
+
+       /*
+        * Divide PICLK by 16
+        */
+       tmp_value = apic_read(APIC_TDCR);
+       apic_write_around(APIC_TDCR, (tmp_value
+                               & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE))
+                               | APIC_TDR_DIV_16);
+
+       if (!oneshot)
+               apic_write_around(APIC_TMICT, clocks/APIC_DIVISOR);
+}
+
+/*
+ * Program the next event, relative to now
+ */
+static int lapic_next_event(unsigned long delta,
+                           struct clock_event_device *evt)
+{
+       apic_write_around(APIC_TMICT, delta);
+       return 0;
+}
+
+/*
+ * Setup the lapic timer in periodic or oneshot mode
+ */
+static void lapic_timer_setup(enum clock_event_mode mode,
+                             struct clock_event_device *evt)
+{
+       unsigned long flags;
+       unsigned int v;
+
+       /* Lapic used for broadcast ? */
+       if (!local_apic_timer_verify_ok)
+               return;
+
+       local_irq_save(flags);
+
+       switch (mode) {
+       case CLOCK_EVT_MODE_PERIODIC:
+       case CLOCK_EVT_MODE_ONESHOT:
+               __setup_APIC_LVTT(calibration_result,
+                                 mode != CLOCK_EVT_MODE_PERIODIC, 1);
+               break;
+       case CLOCK_EVT_MODE_UNUSED:
+       case CLOCK_EVT_MODE_SHUTDOWN:
+               v = apic_read(APIC_LVTT);
+               v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR);
+               apic_write_around(APIC_LVTT, v);
+               break;
+       case CLOCK_EVT_MODE_RESUME:
+               /* Nothing to do here */
+               break;
+       }
+
+       local_irq_restore(flags);
+}
+
+/*
+ * Local APIC timer broadcast function
+ */
+static void lapic_timer_broadcast(cpumask_t mask)
+{
+#ifdef CONFIG_SMP
+       send_IPI_mask(mask, LOCAL_TIMER_VECTOR);
+#endif
+}
+
+/*
+ * Setup the local APIC timer for this CPU. Copy the initilized values
+ * of the boot CPU and register the clock event in the framework.
+ */
+static void __devinit setup_APIC_timer(void)
+{
+       struct clock_event_device *levt = &__get_cpu_var(lapic_events);
+
+       memcpy(levt, &lapic_clockevent, sizeof(*levt));
+       levt->cpumask = cpumask_of_cpu(smp_processor_id());
+
+       clockevents_register_device(levt);
+}
+
+/*
+ * In this functions we calibrate APIC bus clocks to the external timer.
+ *
+ * We want to do the calibration only once since we want to have local timer
+ * irqs syncron. CPUs connected by the same APIC bus have the very same bus
+ * frequency.
+ *
+ * This was previously done by reading the PIT/HPET and waiting for a wrap
+ * around to find out, that a tick has elapsed. I have a box, where the PIT
+ * readout is broken, so it never gets out of the wait loop again. This was
+ * also reported by others.
+ *
+ * Monitoring the jiffies value is inaccurate and the clockevents
+ * infrastructure allows us to do a simple substitution of the interrupt
+ * handler.
+ *
+ * The calibration routine also uses the pm_timer when possible, as the PIT
+ * happens to run way too slow (factor 2.3 on my VAIO CoreDuo, which goes
+ * back to normal later in the boot process).
+ */
+
+#define LAPIC_CAL_LOOPS                (HZ/10)
+
+static __initdata int lapic_cal_loops = -1;
+static __initdata long lapic_cal_t1, lapic_cal_t2;
+static __initdata unsigned long long lapic_cal_tsc1, lapic_cal_tsc2;
+static __initdata unsigned long lapic_cal_pm1, lapic_cal_pm2;
+static __initdata unsigned long lapic_cal_j1, lapic_cal_j2;
+
+/*
+ * Temporary interrupt handler.
+ */
+static void __init lapic_cal_handler(struct clock_event_device *dev)
+{
+       unsigned long long tsc = 0;
+       long tapic = apic_read(APIC_TMCCT);
+       unsigned long pm = acpi_pm_read_early();
+
+       if (cpu_has_tsc)
+               rdtscll(tsc);
+
+       switch (lapic_cal_loops++) {
+       case 0:
+               lapic_cal_t1 = tapic;
+               lapic_cal_tsc1 = tsc;
+               lapic_cal_pm1 = pm;
+               lapic_cal_j1 = jiffies;
+               break;
+
+       case LAPIC_CAL_LOOPS:
+               lapic_cal_t2 = tapic;
+               lapic_cal_tsc2 = tsc;
+               if (pm < lapic_cal_pm1)
+                       pm += ACPI_PM_OVRRUN;
+               lapic_cal_pm2 = pm;
+               lapic_cal_j2 = jiffies;
+               break;
+       }
+}
+
+/*
+ * Setup the boot APIC
+ *
+ * Calibrate and verify the result.
+ */
+void __init setup_boot_APIC_clock(void)
+{
+       struct clock_event_device *levt = &__get_cpu_var(lapic_events);
+       const long pm_100ms = PMTMR_TICKS_PER_SEC/10;
+       const long pm_thresh = pm_100ms/100;
+       void (*real_handler)(struct clock_event_device *dev);
+       unsigned long deltaj;
+       long delta, deltapm;
+       int pm_referenced = 0;
+
+       /*
+        * The local apic timer can be disabled via the kernel
+        * commandline or from the CPU detection code. Register the lapic
+        * timer as a dummy clock event source on SMP systems, so the
+        * broadcast mechanism is used. On UP systems simply ignore it.
+        */
+       if (local_apic_timer_disabled) {
+               /* No broadcast on UP ! */
+               if (num_possible_cpus() > 1)
+                       setup_APIC_timer();
+               return;
+       }
+
+       apic_printk(APIC_VERBOSE, "Using local APIC timer interrupts.\n"
+                   "calibrating APIC timer ...\n");
+
+       local_irq_disable();
+
+       /* Replace the global interrupt handler */
+       real_handler = global_clock_event->event_handler;
+       global_clock_event->event_handler = lapic_cal_handler;
+
+       /*
+        * Setup the APIC counter to 1e9. There is no way the lapic
+        * can underflow in the 100ms detection time frame
+        */
+       __setup_APIC_LVTT(1000000000, 0, 0);
+
+       /* Let the interrupts run */
+       local_irq_enable();
+
+       while (lapic_cal_loops <= LAPIC_CAL_LOOPS)
+               cpu_relax();
+
+       local_irq_disable();
+
+       /* Restore the real event handler */
+       global_clock_event->event_handler = real_handler;
+
+       /* Build delta t1-t2 as apic timer counts down */
+       delta = lapic_cal_t1 - lapic_cal_t2;
+       apic_printk(APIC_VERBOSE, "... lapic delta = %ld\n", delta);
+
+       /* Check, if the PM timer is available */
+       deltapm = lapic_cal_pm2 - lapic_cal_pm1;
+       apic_printk(APIC_VERBOSE, "... PM timer delta = %ld\n", deltapm);
+
+       if (deltapm) {
+               unsigned long mult;
+               u64 res;
+
+               mult = clocksource_hz2mult(PMTMR_TICKS_PER_SEC, 22);
+
+               if (deltapm > (pm_100ms - pm_thresh) &&
+                   deltapm < (pm_100ms + pm_thresh)) {
+                       apic_printk(APIC_VERBOSE, "... PM timer result ok\n");
+               } else {
+                       res = (((u64) deltapm) *  mult) >> 22;
+                       do_div(res, 1000000);
+                       printk(KERN_WARNING "APIC calibration not consistent "
+                              "with PM Timer: %ldms instead of 100ms\n",
+                              (long)res);
+                       /* Correct the lapic counter value */
+                       res = (((u64) delta ) * pm_100ms);
+                       do_div(res, deltapm);
+                       printk(KERN_INFO "APIC delta adjusted to PM-Timer: "
+                              "%lu (%ld)\n", (unsigned long) res, delta);
+                       delta = (long) res;
+               }
+               pm_referenced = 1;
+       }
+
+       /* Calculate the scaled math multiplication factor */
+       lapic_clockevent.mult = div_sc(delta, TICK_NSEC * LAPIC_CAL_LOOPS, 32);
+       lapic_clockevent.max_delta_ns =
+               clockevent_delta2ns(0x7FFFFF, &lapic_clockevent);
+       lapic_clockevent.min_delta_ns =
+               clockevent_delta2ns(0xF, &lapic_clockevent);
+
+       calibration_result = (delta * APIC_DIVISOR) / LAPIC_CAL_LOOPS;
+
+       apic_printk(APIC_VERBOSE, "..... delta %ld\n", delta);
+       apic_printk(APIC_VERBOSE, "..... mult: %ld\n", lapic_clockevent.mult);
+       apic_printk(APIC_VERBOSE, "..... calibration result: %u\n",
+                   calibration_result);
+
+       if (cpu_has_tsc) {
+               delta = (long)(lapic_cal_tsc2 - lapic_cal_tsc1);
+               apic_printk(APIC_VERBOSE, "..... CPU clock speed is "
+                           "%ld.%04ld MHz.\n",
+                           (delta / LAPIC_CAL_LOOPS) / (1000000 / HZ),
+                           (delta / LAPIC_CAL_LOOPS) % (1000000 / HZ));
+       }
+
+       apic_printk(APIC_VERBOSE, "..... host bus clock speed is "
+                   "%u.%04u MHz.\n",
+                   calibration_result / (1000000 / HZ),
+                   calibration_result % (1000000 / HZ));
+
+       local_apic_timer_verify_ok = 1;
+
+       /* We trust the pm timer based calibration */
+       if (!pm_referenced) {
+               apic_printk(APIC_VERBOSE, "... verify APIC timer\n");
+
+               /*
+                * Setup the apic timer manually
+                */
+               levt->event_handler = lapic_cal_handler;
+               lapic_timer_setup(CLOCK_EVT_MODE_PERIODIC, levt);
+               lapic_cal_loops = -1;
+
+               /* Let the interrupts run */
+               local_irq_enable();
+
+               while (lapic_cal_loops <= LAPIC_CAL_LOOPS)
+                       cpu_relax();
+
+               local_irq_disable();
+
+               /* Stop the lapic timer */
+               lapic_timer_setup(CLOCK_EVT_MODE_SHUTDOWN, levt);
+
+               local_irq_enable();
+
+               /* Jiffies delta */
+               deltaj = lapic_cal_j2 - lapic_cal_j1;
+               apic_printk(APIC_VERBOSE, "... jiffies delta = %lu\n", deltaj);
+
+               /* Check, if the jiffies result is consistent */
+               if (deltaj >= LAPIC_CAL_LOOPS-2 && deltaj <= LAPIC_CAL_LOOPS+2)
+                       apic_printk(APIC_VERBOSE, "... jiffies result ok\n");
+               else
+                       local_apic_timer_verify_ok = 0;
+       } else
+               local_irq_enable();
+
+       if (!local_apic_timer_verify_ok) {
+               printk(KERN_WARNING
+                      "APIC timer disabled due to verification failure.\n");
+               /* No broadcast on UP ! */
+               if (num_possible_cpus() == 1)
+                       return;
+       } else {
+               /*
+                * If nmi_watchdog is set to IO_APIC, we need the
+                * PIT/HPET going.  Otherwise register lapic as a dummy
+                * device.
+                */
+               if (nmi_watchdog != NMI_IO_APIC)
+                       lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY;
+               else
+                       printk(KERN_WARNING "APIC timer registered as dummy,"
+                              " due to nmi_watchdog=1!\n");
+       }
+
+       /* Setup the lapic or request the broadcast */
+       setup_APIC_timer();
+}
+
+void __devinit setup_secondary_APIC_clock(void)
+{
+       setup_APIC_timer();
+}
+
+/*
+ * The guts of the apic timer interrupt
+ */
+static void local_apic_timer_interrupt(void)
+{
+       int cpu = smp_processor_id();
+       struct clock_event_device *evt = &per_cpu(lapic_events, cpu);
+
+       /*
+        * Normally we should not be here till LAPIC has been initialized but
+        * in some cases like kdump, its possible that there is a pending LAPIC
+        * timer interrupt from previous kernel's context and is delivered in
+        * new kernel the moment interrupts are enabled.
+        *
+        * Interrupts are enabled early and LAPIC is setup much later, hence
+        * its possible that when we get here evt->event_handler is NULL.
+        * Check for event_handler being NULL and discard the interrupt as
+        * spurious.
+        */
+       if (!evt->event_handler) {
+               printk(KERN_WARNING
+                      "Spurious LAPIC timer interrupt on cpu %d\n", cpu);
+               /* Switch it off */
+               lapic_timer_setup(CLOCK_EVT_MODE_SHUTDOWN, evt);
+               return;
+       }
+
+       per_cpu(irq_stat, cpu).apic_timer_irqs++;
+
+       evt->event_handler(evt);
+}
+
+/*
+ * Local APIC timer interrupt. This is the most natural way for doing
+ * local interrupts, but local timer interrupts can be emulated by
+ * broadcast interrupts too. [in case the hw doesn't support APIC timers]
+ *
+ * [ if a single-CPU system runs an SMP kernel then we call the local
+ *   interrupt as well. Thus we cannot inline the local irq ... ]
+ */
+
+void fastcall smp_apic_timer_interrupt(struct pt_regs *regs)
+{
+       struct pt_regs *old_regs = set_irq_regs(regs);
+
+       /*
+        * NOTE! We'd better ACK the irq immediately,
+        * because timer handling can be slow.
+        */
+       ack_APIC_irq();
+       /*
+        * update_process_times() expects us to have done irq_enter().
+        * Besides, if we don't timer interrupts ignore the global
+        * interrupt lock, which is the WrongThing (tm) to do.
+        */
+       irq_enter();
+       local_apic_timer_interrupt();
+       irq_exit();
+
+       set_irq_regs(old_regs);
+}
+
+int setup_profiling_timer(unsigned int multiplier)
+{
+       return -EINVAL;
+}
+
+/*
+ * Local APIC start and shutdown
+ */
+
+/**
+ * clear_local_APIC - shutdown the local APIC
+ *
+ * This is called, when a CPU is disabled and before rebooting, so the state of
+ * the local APIC has no dangling leftovers. Also used to cleanout any BIOS
+ * leftovers during boot.
+ */
+void clear_local_APIC(void)
+{
+       int maxlvt = lapic_get_maxlvt();
+       unsigned long v;
+
+       /*
+        * Masking an LVT entry can trigger a local APIC error
+        * if the vector is zero. Mask LVTERR first to prevent this.
+        */
+       if (maxlvt >= 3) {
+               v = ERROR_APIC_VECTOR; /* any non-zero vector will do */
+               apic_write_around(APIC_LVTERR, v | APIC_LVT_MASKED);
+       }
+       /*
+        * Careful: we have to set masks only first to deassert
+        * any level-triggered sources.
+        */
+       v = apic_read(APIC_LVTT);
+       apic_write_around(APIC_LVTT, v | APIC_LVT_MASKED);
+       v = apic_read(APIC_LVT0);
+       apic_write_around(APIC_LVT0, v | APIC_LVT_MASKED);
+       v = apic_read(APIC_LVT1);
+       apic_write_around(APIC_LVT1, v | APIC_LVT_MASKED);
+       if (maxlvt >= 4) {
+               v = apic_read(APIC_LVTPC);
+               apic_write_around(APIC_LVTPC, v | APIC_LVT_MASKED);
+       }
+
+       /* lets not touch this if we didn't frob it */
+#ifdef CONFIG_X86_MCE_P4THERMAL
+       if (maxlvt >= 5) {
+               v = apic_read(APIC_LVTTHMR);
+               apic_write_around(APIC_LVTTHMR, v | APIC_LVT_MASKED);
+       }
+#endif
+       /*
+        * Clean APIC state for other OSs:
+        */
+       apic_write_around(APIC_LVTT, APIC_LVT_MASKED);
+       apic_write_around(APIC_LVT0, APIC_LVT_MASKED);
+       apic_write_around(APIC_LVT1, APIC_LVT_MASKED);
+       if (maxlvt >= 3)
+               apic_write_around(APIC_LVTERR, APIC_LVT_MASKED);
+       if (maxlvt >= 4)
+               apic_write_around(APIC_LVTPC, APIC_LVT_MASKED);
+
+#ifdef CONFIG_X86_MCE_P4THERMAL
+       if (maxlvt >= 5)
+               apic_write_around(APIC_LVTTHMR, APIC_LVT_MASKED);
+#endif
+       /* Integrated APIC (!82489DX) ? */
+       if (lapic_is_integrated()) {
+               if (maxlvt > 3)
+                       /* Clear ESR due to Pentium errata 3AP and 11AP */
+                       apic_write(APIC_ESR, 0);
+               apic_read(APIC_ESR);
+       }
+}
+
+/**
+ * disable_local_APIC - clear and disable the local APIC
+ */
+void disable_local_APIC(void)
+{
+       unsigned long value;
+
+       clear_local_APIC();
+
+       /*
+        * Disable APIC (implies clearing of registers
+        * for 82489DX!).
+        */
+       value = apic_read(APIC_SPIV);
+       value &= ~APIC_SPIV_APIC_ENABLED;
+       apic_write_around(APIC_SPIV, value);
+
+       /*
+        * When LAPIC was disabled by the BIOS and enabled by the kernel,
+        * restore the disabled state.
+        */
+       if (enabled_via_apicbase) {
+               unsigned int l, h;
+
+               rdmsr(MSR_IA32_APICBASE, l, h);
+               l &= ~MSR_IA32_APICBASE_ENABLE;
+               wrmsr(MSR_IA32_APICBASE, l, h);
+       }
+}
+
+/*
+ * If Linux enabled the LAPIC against the BIOS default disable it down before
+ * re-entering the BIOS on shutdown.  Otherwise the BIOS may get confused and
+ * not power-off.  Additionally clear all LVT entries before disable_local_APIC
+ * for the case where Linux didn't enable the LAPIC.
+ */
+void lapic_shutdown(void)
+{
+       unsigned long flags;
+
+       if (!cpu_has_apic)
+               return;
+
+       local_irq_save(flags);
+       clear_local_APIC();
+
+       if (enabled_via_apicbase)
+               disable_local_APIC();
+
+       local_irq_restore(flags);
+}
+
+/*
+ * This is to verify that we're looking at a real local APIC.
+ * Check these against your board if the CPUs aren't getting
+ * started for no apparent reason.
+ */
+int __init verify_local_APIC(void)
+{
+       unsigned int reg0, reg1;
+
+       /*
+        * The version register is read-only in a real APIC.
+        */
+       reg0 = apic_read(APIC_LVR);
+       apic_printk(APIC_DEBUG, "Getting VERSION: %x\n", reg0);
+       apic_write(APIC_LVR, reg0 ^ APIC_LVR_MASK);
+       reg1 = apic_read(APIC_LVR);
+       apic_printk(APIC_DEBUG, "Getting VERSION: %x\n", reg1);
+
+       /*
+        * The two version reads above should print the same
+        * numbers.  If the second one is different, then we
+        * poke at a non-APIC.
+        */
+       if (reg1 != reg0)
+               return 0;
+
+       /*
+        * Check if the version looks reasonably.
+        */
+       reg1 = GET_APIC_VERSION(reg0);
+       if (reg1 == 0x00 || reg1 == 0xff)
+               return 0;
+       reg1 = lapic_get_maxlvt();
+       if (reg1 < 0x02 || reg1 == 0xff)
+               return 0;
+
+       /*
+        * The ID register is read/write in a real APIC.
+        */
+       reg0 = apic_read(APIC_ID);
+       apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg0);
+
+       /*
+        * The next two are just to see if we have sane values.
+        * They're only really relevant if we're in Virtual Wire
+        * compatibility mode, but most boxes are anymore.
+        */
+       reg0 = apic_read(APIC_LVT0);
+       apic_printk(APIC_DEBUG, "Getting LVT0: %x\n", reg0);
+       reg1 = apic_read(APIC_LVT1);
+       apic_printk(APIC_DEBUG, "Getting LVT1: %x\n", reg1);
+
+       return 1;
+}
+
+/**
+ * sync_Arb_IDs - synchronize APIC bus arbitration IDs
+ */
+void __init sync_Arb_IDs(void)
+{
+       /*
+        * Unsupported on P4 - see Intel Dev. Manual Vol. 3, Ch. 8.6.1 And not
+        * needed on AMD.
+        */
+       if (modern_apic())
+               return;
+       /*
+        * Wait for idle.
+        */
+       apic_wait_icr_idle();
+
+       apic_printk(APIC_DEBUG, "Synchronizing Arb IDs.\n");
+       apic_write_around(APIC_ICR, APIC_DEST_ALLINC | APIC_INT_LEVELTRIG
+                               | APIC_DM_INIT);
+}
+
+/*
+ * An initial setup of the virtual wire mode.
+ */
+void __init init_bsp_APIC(void)
+{
+       unsigned long value;
+
+       /*
+        * Don't do the setup now if we have a SMP BIOS as the
+        * through-I/O-APIC virtual wire mode might be active.
+        */
+       if (smp_found_config || !cpu_has_apic)
+               return;
+
+       /*
+        * Do not trust the local APIC being empty at bootup.
+        */
+       clear_local_APIC();
+
+       /*
+        * Enable APIC.
+        */
+       value = apic_read(APIC_SPIV);
+       value &= ~APIC_VECTOR_MASK;
+       value |= APIC_SPIV_APIC_ENABLED;
+
+       /* This bit is reserved on P4/Xeon and should be cleared */
+       if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) &&
+           (boot_cpu_data.x86 == 15))
+               value &= ~APIC_SPIV_FOCUS_DISABLED;
+       else
+               value |= APIC_SPIV_FOCUS_DISABLED;
+       value |= SPURIOUS_APIC_VECTOR;
+       apic_write_around(APIC_SPIV, value);
+
+       /*
+        * Set up the virtual wire mode.
+        */
+       apic_write_around(APIC_LVT0, APIC_DM_EXTINT);
+       value = APIC_DM_NMI;
+       if (!lapic_is_integrated())             /* 82489DX */
+               value |= APIC_LVT_LEVEL_TRIGGER;
+       apic_write_around(APIC_LVT1, value);
+}
+
+/**
+ * setup_local_APIC - setup the local APIC
+ */
+void __devinit setup_local_APIC(void)
+{
+       unsigned long oldvalue, value, maxlvt, integrated;
+       int i, j;
+
+       /* Pound the ESR really hard over the head with a big hammer - mbligh */
+       if (esr_disable) {
+               apic_write(APIC_ESR, 0);
+               apic_write(APIC_ESR, 0);
+               apic_write(APIC_ESR, 0);
+               apic_write(APIC_ESR, 0);
+       }
+
+       integrated = lapic_is_integrated();
+
+       /*
+        * Double-check whether this APIC is really registered.
+        */
+       if (!apic_id_registered())
+               BUG();
+
+       /*
+        * Intel recommends to set DFR, LDR and TPR before enabling
+        * an APIC.  See e.g. "AP-388 82489DX User's Manual" (Intel
+        * document number 292116).  So here it goes...
+        */
+       init_apic_ldr();
+
+       /*
+        * Set Task Priority to 'accept all'. We never change this
+        * later on.
+        */
+       value = apic_read(APIC_TASKPRI);
+       value &= ~APIC_TPRI_MASK;
+       apic_write_around(APIC_TASKPRI, value);
+
+       /*
+        * After a crash, we no longer service the interrupts and a pending
+        * interrupt from previous kernel might still have ISR bit set.
+        *
+        * Most probably by now CPU has serviced that pending interrupt and
+        * it might not have done the ack_APIC_irq() because it thought,
+        * interrupt came from i8259 as ExtInt. LAPIC did not get EOI so it
+        * does not clear the ISR bit and cpu thinks it has already serivced
+        * the interrupt. Hence a vector might get locked. It was noticed
+        * for timer irq (vector 0x31). Issue an extra EOI to clear ISR.
+        */
+       for (i = APIC_ISR_NR - 1; i >= 0; i--) {
+               value = apic_read(APIC_ISR + i*0x10);
+               for (j = 31; j >= 0; j--) {
+                       if (value & (1<<j))
+                               ack_APIC_irq();
+               }
+       }
+
+       /*
+        * Now that we are all set up, enable the APIC
+        */
+       value = apic_read(APIC_SPIV);
+       value &= ~APIC_VECTOR_MASK;
+       /*
+        * Enable APIC
+        */
+       value |= APIC_SPIV_APIC_ENABLED;
+
+       /*
+        * Some unknown Intel IO/APIC (or APIC) errata is biting us with
+        * certain networking cards. If high frequency interrupts are
+        * happening on a particular IOAPIC pin, plus the IOAPIC routing
+        * entry is masked/unmasked at a high rate as well then sooner or
+        * later IOAPIC line gets 'stuck', no more interrupts are received
+        * from the device. If focus CPU is disabled then the hang goes
+        * away, oh well :-(
+        *
+        * [ This bug can be reproduced easily with a level-triggered
+        *   PCI Ne2000 networking cards and PII/PIII processors, dual
+        *   BX chipset. ]
+        */
+       /*
+        * Actually disabling the focus CPU check just makes the hang less
+        * frequent as it makes the interrupt distributon model be more
+        * like LRU than MRU (the short-term load is more even across CPUs).
+        * See also the comment in end_level_ioapic_irq().  --macro
+        */
+
+       /* Enable focus processor (bit==0) */
+       value &= ~APIC_SPIV_FOCUS_DISABLED;
+
+       /*
+        * Set spurious IRQ vector
+        */
+       value |= SPURIOUS_APIC_VECTOR;
+       apic_write_around(APIC_SPIV, value);
+
+       /*
+        * Set up LVT0, LVT1:
+        *
+        * set up through-local-APIC on the BP's LINT0. This is not
+        * strictly necessery in pure symmetric-IO mode, but sometimes
+        * we delegate interrupts to the 8259A.
+        */
+       /*
+        * TODO: set up through-local-APIC from through-I/O-APIC? --macro
+        */
+       value = apic_read(APIC_LVT0) & APIC_LVT_MASKED;
+       if (!smp_processor_id() && (pic_mode || !value)) {
+               value = APIC_DM_EXTINT;
+               apic_printk(APIC_VERBOSE, "enabled ExtINT on CPU#%d\n",
+                               smp_processor_id());
+       } else {
+               value = APIC_DM_EXTINT | APIC_LVT_MASKED;
+               apic_printk(APIC_VERBOSE, "masked ExtINT on CPU#%d\n",
+                               smp_processor_id());
+       }
+       apic_write_around(APIC_LVT0, value);
+
+       /*
+        * only the BP should see the LINT1 NMI signal, obviously.
+        */
+       if (!smp_processor_id())
+               value = APIC_DM_NMI;
+       else
+               value = APIC_DM_NMI | APIC_LVT_MASKED;
+       if (!integrated)                /* 82489DX */
+               value |= APIC_LVT_LEVEL_TRIGGER;
+       apic_write_around(APIC_LVT1, value);
+
+       if (integrated && !esr_disable) {               /* !82489DX */
+               maxlvt = lapic_get_maxlvt();
+               if (maxlvt > 3)         /* Due to the Pentium erratum 3AP. */
+                       apic_write(APIC_ESR, 0);
+               oldvalue = apic_read(APIC_ESR);
+
+               /* enables sending errors */
+               value = ERROR_APIC_VECTOR;
+               apic_write_around(APIC_LVTERR, value);
+               /*
+                * spec says clear errors after enabling vector.
+                */
+               if (maxlvt > 3)
+                       apic_write(APIC_ESR, 0);
+               value = apic_read(APIC_ESR);
+               if (value != oldvalue)
+                       apic_printk(APIC_VERBOSE, "ESR value before enabling "
+                               "vector: 0x%08lx  after: 0x%08lx\n",
+                               oldvalue, value);
+       } else {
+               if (esr_disable)
+                       /*
+                        * Something untraceble is creating bad interrupts on
+                        * secondary quads ... for the moment, just leave the
+                        * ESR disabled - we can't do anything useful with the
+                        * errors anyway - mbligh
+                        */
+                       printk(KERN_INFO "Leaving ESR disabled.\n");
+               else
+                       printk(KERN_INFO "No ESR for 82489DX.\n");
+       }
+
+       /* Disable the local apic timer */
+       value = apic_read(APIC_LVTT);
+       value |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR);
+       apic_write_around(APIC_LVTT, value);
+
+       setup_apic_nmi_watchdog(NULL);
+       apic_pm_activate();
+}
+
+/*
+ * Detect and initialize APIC
+ */
+static int __init detect_init_APIC (void)
+{
+       u32 h, l, features;
+
+       /* Disabled by kernel option? */
+       if (enable_local_apic < 0)
+               return -1;
+
+       switch (boot_cpu_data.x86_vendor) {
+       case X86_VENDOR_AMD:
+               if ((boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model > 1) ||
+                   (boot_cpu_data.x86 == 15))
+                       break;
+               goto no_apic;
+       case X86_VENDOR_INTEL:
+               if (boot_cpu_data.x86 == 6 || boot_cpu_data.x86 == 15 ||
+                   (boot_cpu_data.x86 == 5 && cpu_has_apic))
+                       break;
+               goto no_apic;
+       default:
+               goto no_apic;
+       }
+
+       if (!cpu_has_apic) {
+               /*
+                * Over-ride BIOS and try to enable the local APIC only if
+                * "lapic" specified.
+                */
+               if (enable_local_apic <= 0) {
+                       printk(KERN_INFO "Local APIC disabled by BIOS -- "
+                              "you can enable it with \"lapic\"\n");
+                       return -1;
+               }
+               /*
+                * Some BIOSes disable the local APIC in the APIC_BASE
+                * MSR. This can only be done in software for Intel P6 or later
+                * and AMD K7 (Model > 1) or later.
+                */
+               rdmsr(MSR_IA32_APICBASE, l, h);
+               if (!(l & MSR_IA32_APICBASE_ENABLE)) {
+                       printk(KERN_INFO
+                              "Local APIC disabled by BIOS -- reenabling.\n");
+                       l &= ~MSR_IA32_APICBASE_BASE;
+                       l |= MSR_IA32_APICBASE_ENABLE | APIC_DEFAULT_PHYS_BASE;
+                       wrmsr(MSR_IA32_APICBASE, l, h);
+                       enabled_via_apicbase = 1;
+               }
+       }
+       /*
+        * The APIC feature bit should now be enabled
+        * in `cpuid'
+        */
+       features = cpuid_edx(1);
+       if (!(features & (1 << X86_FEATURE_APIC))) {
+               printk(KERN_WARNING "Could not enable APIC!\n");
+               return -1;
+       }
+       set_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability);
+       mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
+
+       /* The BIOS may have set up the APIC at some other address */
+       rdmsr(MSR_IA32_APICBASE, l, h);
+       if (l & MSR_IA32_APICBASE_ENABLE)
+               mp_lapic_addr = l & MSR_IA32_APICBASE_BASE;
+
+       if (nmi_watchdog != NMI_NONE && nmi_watchdog != NMI_DISABLED)
+               nmi_watchdog = NMI_LOCAL_APIC;
+
+       printk(KERN_INFO "Found and enabled local APIC!\n");
+
+       apic_pm_activate();
+
+       return 0;
+
+no_apic:
+       printk(KERN_INFO "No local APIC present or hardware disabled\n");
+       return -1;
+}
+
+/**
+ * init_apic_mappings - initialize APIC mappings
+ */
+void __init init_apic_mappings(void)
+{
+       unsigned long apic_phys;
+
+       /*
+        * If no local APIC can be found then set up a fake all
+        * zeroes page to simulate the local APIC and another
+        * one for the IO-APIC.
+        */
+       if (!smp_found_config && detect_init_APIC()) {
+               apic_phys = (unsigned long) alloc_bootmem_pages(PAGE_SIZE);
+               apic_phys = __pa(apic_phys);
+       } else
+               apic_phys = mp_lapic_addr;
+
+       set_fixmap_nocache(FIX_APIC_BASE, apic_phys);
+       printk(KERN_DEBUG "mapped APIC to %08lx (%08lx)\n", APIC_BASE,
+              apic_phys);
+
+       /*
+        * Fetch the APIC ID of the BSP in case we have a
+        * default configuration (or the MP table is broken).
+        */
+       if (boot_cpu_physical_apicid == -1U)
+               boot_cpu_physical_apicid = GET_APIC_ID(apic_read(APIC_ID));
+
+#ifdef CONFIG_X86_IO_APIC
+       {
+               unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0;
+               int i;
+
+               for (i = 0; i < nr_ioapics; i++) {
+                       if (smp_found_config) {
+                               ioapic_phys = mp_ioapics[i].mpc_apicaddr;
+                               if (!ioapic_phys) {
+                                       printk(KERN_ERR
+                                              "WARNING: bogus zero IO-APIC "
+                                              "address found in MPTABLE, "
+                                              "disabling IO/APIC support!\n");
+                                       smp_found_config = 0;
+                                       skip_ioapic_setup = 1;
+                                       goto fake_ioapic_page;
+                               }
+                       } else {
+fake_ioapic_page:
+                               ioapic_phys = (unsigned long)
+                                             alloc_bootmem_pages(PAGE_SIZE);
+                               ioapic_phys = __pa(ioapic_phys);
+                       }
+                       set_fixmap_nocache(idx, ioapic_phys);
+                       printk(KERN_DEBUG "mapped IOAPIC to %08lx (%08lx)\n",
+                              __fix_to_virt(idx), ioapic_phys);
+                       idx++;
+               }
+       }
+#endif
+}
+
+/*
+ * This initializes the IO-APIC and APIC hardware if this is
+ * a UP kernel.
+ */
+int __init APIC_init_uniprocessor (void)
+{
+       if (enable_local_apic < 0)
+               clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability);
+
+       if (!smp_found_config && !cpu_has_apic)
+               return -1;
+
+       /*
+        * Complain if the BIOS pretends there is one.
+        */
+       if (!cpu_has_apic &&
+           APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) {
+               printk(KERN_ERR "BIOS bug, local APIC #%d not detected!...\n",
+                      boot_cpu_physical_apicid);
+               clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability);
+               return -1;
+       }
+
+       verify_local_APIC();
+
+       connect_bsp_APIC();
+
+       /*
+        * Hack: In case of kdump, after a crash, kernel might be booting
+        * on a cpu with non-zero lapic id. But boot_cpu_physical_apicid
+        * might be zero if read from MP tables. Get it from LAPIC.
+        */
+#ifdef CONFIG_CRASH_DUMP
+       boot_cpu_physical_apicid = GET_APIC_ID(apic_read(APIC_ID));
+#endif
+       phys_cpu_present_map = physid_mask_of_physid(boot_cpu_physical_apicid);
+
+       setup_local_APIC();
+
+#ifdef CONFIG_X86_IO_APIC
+       if (smp_found_config)
+               if (!skip_ioapic_setup && nr_ioapics)
+                       setup_IO_APIC();
+#endif
+       setup_boot_clock();
+
+       return 0;
+}
+
+/*
+ * APIC command line parameters
+ */
+static int __init parse_lapic(char *arg)
+{
+       enable_local_apic = 1;
+       return 0;
+}
+early_param("lapic", parse_lapic);
+
+static int __init parse_nolapic(char *arg)
+{
+       enable_local_apic = -1;
+       clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability);
+       return 0;
+}
+early_param("nolapic", parse_nolapic);
+
+static int __init parse_disable_lapic_timer(char *arg)
+{
+       local_apic_timer_disabled = 1;
+       return 0;
+}
+early_param("nolapic_timer", parse_disable_lapic_timer);
+
+static int __init parse_lapic_timer_c2_ok(char *arg)
+{
+       local_apic_timer_c2_ok = 1;
+       return 0;
+}
+early_param("lapic_timer_c2_ok", parse_lapic_timer_c2_ok);
+
+static int __init apic_set_verbosity(char *str)
+{
+       if (strcmp("debug", str) == 0)
+               apic_verbosity = APIC_DEBUG;
+       else if (strcmp("verbose", str) == 0)
+               apic_verbosity = APIC_VERBOSE;
+       return 1;
+}
+
+__setup("apic=", apic_set_verbosity);
+
+
+/*
+ * Local APIC interrupts
+ */
+
+/*
+ * This interrupt should _never_ happen with our APIC/SMP architecture
+ */
+void smp_spurious_interrupt(struct pt_regs *regs)
+{
+       unsigned long v;
+
+       irq_enter();
+       /*
+        * Check if this really is a spurious interrupt and ACK it
+        * if it is a vectored one.  Just in case...
+        * Spurious interrupts should not be ACKed.
+        */
+       v = apic_read(APIC_ISR + ((SPURIOUS_APIC_VECTOR & ~0x1f) >> 1));
+       if (v & (1 << (SPURIOUS_APIC_VECTOR & 0x1f)))
+               ack_APIC_irq();
+
+       /* see sw-dev-man vol 3, chapter 7.4.13.5 */
+       printk(KERN_INFO "spurious APIC interrupt on CPU#%d, "
+              "should never happen.\n", smp_processor_id());
+       irq_exit();
+}
+
+/*
+ * This interrupt should never happen with our APIC/SMP architecture
+ */
+void smp_error_interrupt(struct pt_regs *regs)
+{
+       unsigned long v, v1;
+
+       irq_enter();
+       /* First tickle the hardware, only then report what went on. -- REW */
+       v = apic_read(APIC_ESR);
+       apic_write(APIC_ESR, 0);
+       v1 = apic_read(APIC_ESR);
+       ack_APIC_irq();
+       atomic_inc(&irq_err_count);
+
+       /* Here is what the APIC error bits mean:
+          0: Send CS error
+          1: Receive CS error
+          2: Send accept error
+          3: Receive accept error
+          4: Reserved
+          5: Send illegal vector
+          6: Received illegal vector
+          7: Illegal register address
+       */
+       printk (KERN_DEBUG "APIC error on CPU%d: %02lx(%02lx)\n",
+               smp_processor_id(), v , v1);
+       irq_exit();
+}
+
+/*
+ * Initialize APIC interrupts
+ */
+void __init apic_intr_init(void)
+{
+#ifdef CONFIG_SMP
+       smp_intr_init();
+#endif
+       /* self generated IPI for local APIC timer */
+       set_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
+
+       /* IPI vectors for APIC spurious and error interrupts */
+       set_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
+       set_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
+
+       /* thermal monitor LVT interrupt */
+#ifdef CONFIG_X86_MCE_P4THERMAL
+       set_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
+#endif
+}
+
+/**
+ * connect_bsp_APIC - attach the APIC to the interrupt system
+ */
+void __init connect_bsp_APIC(void)
+{
+       if (pic_mode) {
+               /*
+                * Do not trust the local APIC being empty at bootup.
+                */
+               clear_local_APIC();
+               /*
+                * PIC mode, enable APIC mode in the IMCR, i.e.  connect BSP's
+                * local APIC to INT and NMI lines.
+                */
+               apic_printk(APIC_VERBOSE, "leaving PIC mode, "
+                               "enabling APIC mode.\n");
+               outb(0x70, 0x22);
+               outb(0x01, 0x23);
+       }
+       enable_apic_mode();
+}
+
+/**
+ * disconnect_bsp_APIC - detach the APIC from the interrupt system
+ * @virt_wire_setup:   indicates, whether virtual wire mode is selected
+ *
+ * Virtual wire mode is necessary to deliver legacy interrupts even when the
+ * APIC is disabled.
+ */
+void disconnect_bsp_APIC(int virt_wire_setup)
+{
+       if (pic_mode) {
+               /*
+                * Put the board back into PIC mode (has an effect only on
+                * certain older boards).  Note that APIC interrupts, including
+                * IPIs, won't work beyond this point!  The only exception are
+                * INIT IPIs.
+                */
+               apic_printk(APIC_VERBOSE, "disabling APIC mode, "
+                               "entering PIC mode.\n");
+               outb(0x70, 0x22);
+               outb(0x00, 0x23);
+       } else {
+               /* Go back to Virtual Wire compatibility mode */
+               unsigned long value;
+
+               /* For the spurious interrupt use vector F, and enable it */
+               value = apic_read(APIC_SPIV);
+               value &= ~APIC_VECTOR_MASK;
+               value |= APIC_SPIV_APIC_ENABLED;
+               value |= 0xf;
+               apic_write_around(APIC_SPIV, value);
+
+               if (!virt_wire_setup) {
+                       /*
+                        * For LVT0 make it edge triggered, active high,
+                        * external and enabled
+                        */
+                       value = apic_read(APIC_LVT0);
+                       value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING |
+                               APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
+                               APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED );
+                       value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
+                       value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT);
+                       apic_write_around(APIC_LVT0, value);
+               } else {
+                       /* Disable LVT0 */
+                       apic_write_around(APIC_LVT0, APIC_LVT_MASKED);
+               }
+
+               /*
+                * For LVT1 make it edge triggered, active high, nmi and
+                * enabled
+                */
+               value = apic_read(APIC_LVT1);
+               value &= ~(
+                       APIC_MODE_MASK | APIC_SEND_PENDING |
+                       APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
+                       APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED);
+               value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
+               value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI);
+               apic_write_around(APIC_LVT1, value);
+       }
+}
+
+/*
+ * Power management
+ */
+#ifdef CONFIG_PM
+
+static struct {
+       int active;
+       /* r/w apic fields */
+       unsigned int apic_id;
+       unsigned int apic_taskpri;
+       unsigned int apic_ldr;
+       unsigned int apic_dfr;
+       unsigned int apic_spiv;
+       unsigned int apic_lvtt;
+       unsigned int apic_lvtpc;
+       unsigned int apic_lvt0;
+       unsigned int apic_lvt1;
+       unsigned int apic_lvterr;
+       unsigned int apic_tmict;
+       unsigned int apic_tdcr;
+       unsigned int apic_thmr;
+} apic_pm_state;
+
+static int lapic_suspend(struct sys_device *dev, pm_message_t state)
+{
+       unsigned long flags;
+       int maxlvt;
+
+       if (!apic_pm_state.active)
+               return 0;
+
+       maxlvt = lapic_get_maxlvt();
+
+       apic_pm_state.apic_id = apic_read(APIC_ID);
+       apic_pm_state.apic_taskpri = apic_read(APIC_TASKPRI);
+       apic_pm_state.apic_ldr = apic_read(APIC_LDR);
+       apic_pm_state.apic_dfr = apic_read(APIC_DFR);
+       apic_pm_state.apic_spiv = apic_read(APIC_SPIV);
+       apic_pm_state.apic_lvtt = apic_read(APIC_LVTT);
+       if (maxlvt >= 4)
+               apic_pm_state.apic_lvtpc = apic_read(APIC_LVTPC);
+       apic_pm_state.apic_lvt0 = apic_read(APIC_LVT0);
+       apic_pm_state.apic_lvt1 = apic_read(APIC_LVT1);
+       apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR);
+       apic_pm_state.apic_tmict = apic_read(APIC_TMICT);
+       apic_pm_state.apic_tdcr = apic_read(APIC_TDCR);
+#ifdef CONFIG_X86_MCE_P4THERMAL
+       if (maxlvt >= 5)
+               apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR);
+#endif
+
+       local_irq_save(flags);
+       disable_local_APIC();
+       local_irq_restore(flags);
+       return 0;
+}
+
+static int lapic_resume(struct sys_device *dev)
+{
+       unsigned int l, h;
+       unsigned long flags;
+       int maxlvt;
+
+       if (!apic_pm_state.active)
+               return 0;
+
+       maxlvt = lapic_get_maxlvt();
+
+       local_irq_save(flags);
+
+       /*
+        * Make sure the APICBASE points to the right address
+        *
+        * FIXME! This will be wrong if we ever support suspend on
+        * SMP! We'll need to do this as part of the CPU restore!
+        */
+       rdmsr(MSR_IA32_APICBASE, l, h);
+       l &= ~MSR_IA32_APICBASE_BASE;
+       l |= MSR_IA32_APICBASE_ENABLE | mp_lapic_addr;
+       wrmsr(MSR_IA32_APICBASE, l, h);
+
+       apic_write(APIC_LVTERR, ERROR_APIC_VECTOR | APIC_LVT_MASKED);
+       apic_write(APIC_ID, apic_pm_state.apic_id);
+       apic_write(APIC_DFR, apic_pm_state.apic_dfr);
+       apic_write(APIC_LDR, apic_pm_state.apic_ldr);
+       apic_write(APIC_TASKPRI, apic_pm_state.apic_taskpri);
+       apic_write(APIC_SPIV, apic_pm_state.apic_spiv);
+       apic_write(APIC_LVT0, apic_pm_state.apic_lvt0);
+       apic_write(APIC_LVT1, apic_pm_state.apic_lvt1);
+#ifdef CONFIG_X86_MCE_P4THERMAL
+       if (maxlvt >= 5)
+               apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr);
+#endif
+       if (maxlvt >= 4)
+               apic_write(APIC_LVTPC, apic_pm_state.apic_lvtpc);
+       apic_write(APIC_LVTT, apic_pm_state.apic_lvtt);
+       apic_write(APIC_TDCR, apic_pm_state.apic_tdcr);
+       apic_write(APIC_TMICT, apic_pm_state.apic_tmict);
+       apic_write(APIC_ESR, 0);
+       apic_read(APIC_ESR);
+       apic_write(APIC_LVTERR, apic_pm_state.apic_lvterr);
+       apic_write(APIC_ESR, 0);
+       apic_read(APIC_ESR);
+       local_irq_restore(flags);
+       return 0;
+}
+
+/*
+ * This device has no shutdown method - fully functioning local APICs
+ * are needed on every CPU up until machine_halt/restart/poweroff.
+ */
+
+static struct sysdev_class lapic_sysclass = {
+       set_kset_name("lapic"),
+       .resume         = lapic_resume,
+       .suspend        = lapic_suspend,
+};
+
+static struct sys_device device_lapic = {
+       .id     = 0,
+       .cls    = &lapic_sysclass,
+};
+
+static void __devinit apic_pm_activate(void)
+{
+       apic_pm_state.active = 1;
+}
+
+static int __init init_lapic_sysfs(void)
+{
+       int error;
+
+       if (!cpu_has_apic)
+               return 0;
+       /* XXX: remove suspend/resume procs if !apic_pm_state.active? */
+
+       error = sysdev_class_register(&lapic_sysclass);
+       if (!error)
+               error = sysdev_register(&device_lapic);
+       return error;
+}
+device_initcall(init_lapic_sysfs);
+
+#else  /* CONFIG_PM */
+
+static void apic_pm_activate(void) { }
+
+#endif /* CONFIG_PM */
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
new file mode 100644 (file)
index 0000000..f02a8ac
--- /dev/null
@@ -0,0 +1,2403 @@
+/* -*- linux-c -*-
+ * APM BIOS driver for Linux
+ * Copyright 1994-2001 Stephen Rothwell (sfr@canb.auug.org.au)
+ *
+ * Initial development of this driver was funded by NEC Australia P/L
+ *     and NEC Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * October 1995, Rik Faith (faith@cs.unc.edu):
+ *    Minor enhancements and updates (to the patch set) for 1.3.x
+ *    Documentation
+ * January 1996, Rik Faith (faith@cs.unc.edu):
+ *    Make /proc/apm easy to format (bump driver version)
+ * March 1996, Rik Faith (faith@cs.unc.edu):
+ *    Prohibit APM BIOS calls unless apm_enabled.
+ *    (Thanks to Ulrich Windl <Ulrich.Windl@rz.uni-regensburg.de>)
+ * April 1996, Stephen Rothwell (sfr@canb.auug.org.au)
+ *    Version 1.0 and 1.1
+ * May 1996, Version 1.2
+ * Feb 1998, Version 1.3
+ * Feb 1998, Version 1.4
+ * Aug 1998, Version 1.5
+ * Sep 1998, Version 1.6
+ * Nov 1998, Version 1.7
+ * Jan 1999, Version 1.8
+ * Jan 1999, Version 1.9
+ * Oct 1999, Version 1.10
+ * Nov 1999, Version 1.11
+ * Jan 2000, Version 1.12
+ * Feb 2000, Version 1.13
+ * Nov 2000, Version 1.14
+ * Oct 2001, Version 1.15
+ * Jan 2002, Version 1.16
+ * Oct 2002, Version 1.16ac
+ *
+ * History:
+ *    0.6b: first version in official kernel, Linux 1.3.46
+ *    0.7: changed /proc/apm format, Linux 1.3.58
+ *    0.8: fixed gcc 2.7.[12] compilation problems, Linux 1.3.59
+ *    0.9: only call bios if bios is present, Linux 1.3.72
+ *    1.0: use fixed device number, consolidate /proc/apm into this file,
+ *         Linux 1.3.85
+ *    1.1: support user-space standby and suspend, power off after system
+ *         halted, Linux 1.3.98
+ *    1.2: When resetting RTC after resume, take care so that the time
+ *         is only incorrect by 30-60mS (vs. 1S previously) (Gabor J. Toth
+ *         <jtoth@princeton.edu>); improve interaction between
+ *         screen-blanking and gpm (Stephen Rothwell); Linux 1.99.4
+ *    1.2a:Simple change to stop mysterious bug reports with SMP also added
+ *        levels to the printk calls. APM is not defined for SMP machines.
+ *         The new replacment for it is, but Linux doesn't yet support this.
+ *         Alan Cox Linux 2.1.55
+ *    1.3: Set up a valid data descriptor 0x40 for buggy BIOS's
+ *    1.4: Upgraded to support APM 1.2. Integrated ThinkPad suspend patch by
+ *         Dean Gaudet <dgaudet@arctic.org>.
+ *         C. Scott Ananian <cananian@alumni.princeton.edu> Linux 2.1.87
+ *    1.5: Fix segment register reloading (in case of bad segments saved
+ *         across BIOS call).
+ *         Stephen Rothwell
+ *    1.6: Cope with complier/assembler differences.
+ *         Only try to turn off the first display device.
+ *         Fix OOPS at power off with no APM BIOS by Jan Echternach
+ *                   <echter@informatik.uni-rostock.de>
+ *         Stephen Rothwell
+ *    1.7: Modify driver's cached copy of the disabled/disengaged flags
+ *         to reflect current state of APM BIOS.
+ *         Chris Rankin <rankinc@bellsouth.net>
+ *         Reset interrupt 0 timer to 100Hz after suspend
+ *         Chad Miller <cmiller@surfsouth.com>
+ *         Add CONFIG_APM_IGNORE_SUSPEND_BOUNCE
+ *         Richard Gooch <rgooch@atnf.csiro.au>
+ *         Allow boot time disabling of APM
+ *         Make boot messages far less verbose by default
+ *         Make asm safer
+ *         Stephen Rothwell
+ *    1.8: Add CONFIG_APM_RTC_IS_GMT
+ *         Richard Gooch <rgooch@atnf.csiro.au>
+ *         change APM_NOINTS to CONFIG_APM_ALLOW_INTS
+ *         remove dependency on CONFIG_PROC_FS
+ *         Stephen Rothwell
+ *    1.9: Fix small typo.  <laslo@wodip.opole.pl>
+ *         Try to cope with BIOS's that need to have all display
+ *         devices blanked and not just the first one.
+ *         Ross Paterson <ross@soi.city.ac.uk>
+ *         Fix segment limit setting it has always been wrong as
+ *         the segments needed to have byte granularity.
+ *         Mark a few things __init.
+ *         Add hack to allow power off of SMP systems by popular request.
+ *         Use CONFIG_SMP instead of __SMP__
+ *         Ignore BOUNCES for three seconds.
+ *         Stephen Rothwell
+ *   1.10: Fix for Thinkpad return code.
+ *         Merge 2.2 and 2.3 drivers.
+ *         Remove APM dependencies in arch/i386/kernel/process.c
+ *         Remove APM dependencies in drivers/char/sysrq.c
+ *         Reset time across standby.
+ *         Allow more inititialisation on SMP.
+ *         Remove CONFIG_APM_POWER_OFF and make it boot time
+ *         configurable (default on).
+ *         Make debug only a boot time parameter (remove APM_DEBUG).
+ *         Try to blank all devices on any error.
+ *   1.11: Remove APM dependencies in drivers/char/console.c
+ *         Check nr_running to detect if we are idle (from
+ *         Borislav Deianov <borislav@lix.polytechnique.fr>)
+ *         Fix for bioses that don't zero the top part of the
+ *         entrypoint offset (Mario Sitta <sitta@al.unipmn.it>)
+ *         (reported by Panos Katsaloulis <teras@writeme.com>).
+ *         Real mode power off patch (Walter Hofmann
+ *         <Walter.Hofmann@physik.stud.uni-erlangen.de>).
+ *   1.12: Remove CONFIG_SMP as the compiler will optimize
+ *         the code away anyway (smp_num_cpus == 1 in UP)
+ *         noted by Artur Skawina <skawina@geocities.com>.
+ *         Make power off under SMP work again.
+ *         Fix thinko with initial engaging of BIOS.
+ *         Make sure power off only happens on CPU 0
+ *         (Paul "Rusty" Russell <rusty@rustcorp.com.au>).
+ *         Do error notification to user mode if BIOS calls fail.
+ *         Move entrypoint offset fix to ...boot/setup.S
+ *         where it belongs (Cosmos <gis88564@cis.nctu.edu.tw>).
+ *         Remove smp-power-off. SMP users must now specify
+ *         "apm=power-off" on the kernel command line. Suggested
+ *         by Jim Avera <jima@hal.com>, modified by Alan Cox
+ *         <alan@lxorguk.ukuu.org.uk>.
+ *         Register the /proc/apm entry even on SMP so that
+ *         scripts that check for it before doing power off
+ *         work (Jim Avera <jima@hal.com>).
+ *   1.13: Changes for new pm_ interfaces (Andy Henroid
+ *         <andy_henroid@yahoo.com>).
+ *         Modularize the code.
+ *         Fix the Thinkpad (again) :-( (CONFIG_APM_IGNORE_MULTIPLE_SUSPENDS
+ *         is now the way life works).
+ *         Fix thinko in suspend() (wrong return).
+ *         Notify drivers on critical suspend.
+ *         Make kapmd absorb more idle time (Pavel Machek <pavel@suse.cz>
+ *         modified by sfr).
+ *         Disable interrupts while we are suspended (Andy Henroid
+ *         <andy_henroid@yahoo.com> fixed by sfr).
+ *         Make power off work on SMP again (Tony Hoyle
+ *         <tmh@magenta-logic.com> and <zlatko@iskon.hr>) modified by sfr.
+ *         Remove CONFIG_APM_SUSPEND_BOUNCE.  The bounce ignore
+ *         interval is now configurable.
+ *   1.14: Make connection version persist across module unload/load.
+ *         Enable and engage power management earlier.
+ *         Disengage power management on module unload.
+ *         Changed to use the sysrq-register hack for registering the
+ *         power off function called by magic sysrq based upon discussions
+ *         in irc://irc.openprojects.net/#kernelnewbies
+ *         (Crutcher Dunnavant <crutcher+kernel@datastacks.com>).
+ *         Make CONFIG_APM_REAL_MODE_POWER_OFF run time configurable.
+ *         (Arjan van de Ven <arjanv@redhat.com>) modified by sfr.
+ *         Work around byte swap bug in one of the Vaio's BIOS's
+ *         (Marc Boucher <marc@mbsi.ca>).
+ *         Exposed the disable flag to dmi so that we can handle known
+ *         broken APM (Alan Cox <alan@redhat.com>).
+ *   1.14ac: If the BIOS says "I slowed the CPU down" then don't spin
+ *         calling it - instead idle. (Alan Cox <alan@redhat.com>)
+ *         If an APM idle fails log it and idle sensibly
+ *   1.15: Don't queue events to clients who open the device O_WRONLY.
+ *         Don't expect replies from clients who open the device O_RDONLY.
+ *         (Idea from Thomas Hood)
+ *         Minor waitqueue cleanups. (John Fremlin <chief@bandits.org>)
+ *   1.16: Fix idle calling. (Andreas Steinmetz <ast@domdv.de> et al.)
+ *         Notify listeners of standby or suspend events before notifying
+ *         drivers. Return EBUSY to ioctl() if suspend is rejected.
+ *         (Russell King <rmk@arm.linux.org.uk> and Thomas Hood)
+ *         Ignore first resume after we generate our own resume event
+ *         after a suspend (Thomas Hood)
+ *         Daemonize now gets rid of our controlling terminal (sfr).
+ *         CONFIG_APM_CPU_IDLE now just affects the default value of
+ *         idle_threshold (sfr).
+ *         Change name of kernel apm daemon (as it no longer idles) (sfr).
+ *   1.16ac: Fix up SMP support somewhat. You can now force SMP on and we
+ *        make _all_ APM calls on the CPU#0. Fix unsafe sign bug.
+ *        TODO: determine if its "boot CPU" or "CPU0" we want to lock to.
+ *
+ * APM 1.1 Reference:
+ *
+ *   Intel Corporation, Microsoft Corporation. Advanced Power Management
+ *   (APM) BIOS Interface Specification, Revision 1.1, September 1993.
+ *   Intel Order Number 241704-001.  Microsoft Part Number 781-110-X01.
+ *
+ * [This document is available free from Intel by calling 800.628.8686 (fax
+ * 916.356.6100) or 800.548.4725; or via anonymous ftp from
+ * ftp://ftp.intel.com/pub/IAL/software_specs/apmv11.doc.  It is also
+ * available from Microsoft by calling 206.882.8080.]
+ *
+ * APM 1.2 Reference:
+ *   Intel Corporation, Microsoft Corporation. Advanced Power Management
+ *   (APM) BIOS Interface Specification, Revision 1.2, February 1996.
+ *
+ * [This document is available from Microsoft at:
+ *    http://www.microsoft.com/whdc/archive/amp_12.mspx]
+ */
+
+#include <linux/module.h>
+
+#include <linux/poll.h>
+#include <linux/types.h>
+#include <linux/stddef.h>
+#include <linux/timer.h>
+#include <linux/fcntl.h>
+#include <linux/slab.h>
+#include <linux/stat.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/miscdevice.h>
+#include <linux/apm_bios.h>
+#include <linux/init.h>
+#include <linux/time.h>
+#include <linux/sched.h>
+#include <linux/pm.h>
+#include <linux/pm_legacy.h>
+#include <linux/capability.h>
+#include <linux/device.h>
+#include <linux/kernel.h>
+#include <linux/freezer.h>
+#include <linux/smp.h>
+#include <linux/dmi.h>
+#include <linux/suspend.h>
+#include <linux/kthread.h>
+
+#include <asm/system.h>
+#include <asm/uaccess.h>
+#include <asm/desc.h>
+#include <asm/i8253.h>
+#include <asm/paravirt.h>
+#include <asm/reboot.h>
+
+#include "io_ports.h"
+
+#if defined(CONFIG_APM_DISPLAY_BLANK) && defined(CONFIG_VT)
+extern int (*console_blank_hook)(int);
+#endif
+
+/*
+ * The apm_bios device is one of the misc char devices.
+ * This is its minor number.
+ */
+#define        APM_MINOR_DEV   134
+
+/*
+ * See Documentation/Config.help for the configuration options.
+ *
+ * Various options can be changed at boot time as follows:
+ * (We allow underscores for compatibility with the modules code)
+ *     apm=on/off                      enable/disable APM
+ *         [no-]allow[-_]ints          allow interrupts during BIOS calls
+ *         [no-]broken[-_]psr          BIOS has a broken GetPowerStatus call
+ *         [no-]realmode[-_]power[-_]off       switch to real mode before
+ *                                             powering off
+ *         [no-]debug                  log some debugging messages
+ *         [no-]power[-_]off           power off on shutdown
+ *         [no-]smp                    Use apm even on an SMP box
+ *         bounce[-_]interval=<n>      number of ticks to ignore suspend
+ *                                     bounces
+ *          idle[-_]threshold=<n>       System idle percentage above which to
+ *                                      make APM BIOS idle calls. Set it to
+ *                                      100 to disable.
+ *          idle[-_]period=<n>          Period (in 1/100s of a second) over
+ *                                      which the idle percentage is
+ *                                      calculated.
+ */
+
+/* KNOWN PROBLEM MACHINES:
+ *
+ * U: TI 4000M TravelMate: BIOS is *NOT* APM compliant
+ *                         [Confirmed by TI representative]
+ * ?: ACER 486DX4/75: uses dseg 0040, in violation of APM specification
+ *                    [Confirmed by BIOS disassembly]
+ *                    [This may work now ...]
+ * P: Toshiba 1950S: battery life information only gets updated after resume
+ * P: Midwest Micro Soundbook Elite DX2/66 monochrome: screen blanking
+ *     broken in BIOS [Reported by Garst R. Reese <reese@isn.net>]
+ * ?: AcerNote-950: oops on reading /proc/apm - workaround is a WIP
+ *     Neale Banks <neale@lowendale.com.au> December 2000
+ *
+ * Legend: U = unusable with APM patches
+ *         P = partially usable with APM patches
+ */
+
+/*
+ * Define as 1 to make the driver always call the APM BIOS busy
+ * routine even if the clock was not reported as slowed by the
+ * idle routine.  Otherwise, define as 0.
+ */
+#define ALWAYS_CALL_BUSY   1
+
+/*
+ * Define to make the APM BIOS calls zero all data segment registers (so
+ * that an incorrect BIOS implementation will cause a kernel panic if it
+ * tries to write to arbitrary memory).
+ */
+#define APM_ZERO_SEGS
+
+#include "apm.h"
+
+/*
+ * Define to re-initialize the interrupt 0 timer to 100 Hz after a suspend.
+ * This patched by Chad Miller <cmiller@surfsouth.com>, original code by
+ * David Chen <chen@ctpa04.mit.edu>
+ */
+#undef INIT_TIMER_AFTER_SUSPEND
+
+#ifdef INIT_TIMER_AFTER_SUSPEND
+#include <linux/timex.h>
+#include <asm/io.h>
+#include <linux/delay.h>
+#endif
+
+/*
+ * Need to poll the APM BIOS every second
+ */
+#define APM_CHECK_TIMEOUT      (HZ)
+
+/*
+ * Ignore suspend events for this amount of time after a resume
+ */
+#define DEFAULT_BOUNCE_INTERVAL                (3 * HZ)
+
+/*
+ * Maximum number of events stored
+ */
+#define APM_MAX_EVENTS         20
+
+/*
+ * The per-file APM data
+ */
+struct apm_user {
+       int             magic;
+       struct apm_user *       next;
+       unsigned int    suser: 1;
+       unsigned int    writer: 1;
+       unsigned int    reader: 1;
+       unsigned int    suspend_wait: 1;
+       int             suspend_result;
+       int             suspends_pending;
+       int             standbys_pending;
+       int             suspends_read;
+       int             standbys_read;
+       int             event_head;
+       int             event_tail;
+       apm_event_t     events[APM_MAX_EVENTS];
+};
+
+/*
+ * The magic number in apm_user
+ */
+#define APM_BIOS_MAGIC         0x4101
+
+/*
+ * idle percentage above which bios idle calls are done
+ */
+#ifdef CONFIG_APM_CPU_IDLE
+#define DEFAULT_IDLE_THRESHOLD 95
+#else
+#define DEFAULT_IDLE_THRESHOLD 100
+#endif
+#define DEFAULT_IDLE_PERIOD    (100 / 3)
+
+/*
+ * Local variables
+ */
+static struct {
+       unsigned long   offset;
+       unsigned short  segment;
+}                              apm_bios_entry;
+static int                     clock_slowed;
+static int                     idle_threshold __read_mostly = DEFAULT_IDLE_THRESHOLD;
+static int                     idle_period __read_mostly = DEFAULT_IDLE_PERIOD;
+static int                     set_pm_idle;
+static int                     suspends_pending;
+static int                     standbys_pending;
+static int                     ignore_sys_suspend;
+static int                     ignore_normal_resume;
+static int                     bounce_interval __read_mostly = DEFAULT_BOUNCE_INTERVAL;
+
+static int                     debug __read_mostly;
+static int                     smp __read_mostly;
+static int                     apm_disabled = -1;
+#ifdef CONFIG_SMP
+static int                     power_off;
+#else
+static int                     power_off = 1;
+#endif
+#ifdef CONFIG_APM_REAL_MODE_POWER_OFF
+static int                     realmode_power_off = 1;
+#else
+static int                     realmode_power_off;
+#endif
+#ifdef CONFIG_APM_ALLOW_INTS
+static int                     allow_ints = 1;
+#else
+static int                     allow_ints;
+#endif
+static int                     broken_psr;
+
+static DECLARE_WAIT_QUEUE_HEAD(apm_waitqueue);
+static DECLARE_WAIT_QUEUE_HEAD(apm_suspend_waitqueue);
+static struct apm_user *       user_list;
+static DEFINE_SPINLOCK(user_list_lock);
+static const struct desc_struct        bad_bios_desc = { 0, 0x00409200 };
+
+static const char              driver_version[] = "1.16ac";    /* no spaces */
+
+static struct task_struct *kapmd_task;
+
+/*
+ *     APM event names taken from the APM 1.2 specification. These are
+ *     the message codes that the BIOS uses to tell us about events
+ */
+static const char *    const apm_event_name[] = {
+       "system standby",
+       "system suspend",
+       "normal resume",
+       "critical resume",
+       "low battery",
+       "power status change",
+       "update time",
+       "critical suspend",
+       "user standby",
+       "user suspend",
+       "system standby resume",
+       "capabilities change"
+};
+#define NR_APM_EVENT_NAME ARRAY_SIZE(apm_event_name)
+
+typedef struct lookup_t {
+       int     key;
+       char *  msg;
+} lookup_t;
+
+/*
+ *     The BIOS returns a set of standard error codes in AX when the
+ *     carry flag is set.
+ */
+static const lookup_t error_table[] = {
+/* N/A { APM_SUCCESS,          "Operation succeeded" }, */
+       { APM_DISABLED,         "Power management disabled" },
+       { APM_CONNECTED,        "Real mode interface already connected" },
+       { APM_NOT_CONNECTED,    "Interface not connected" },
+       { APM_16_CONNECTED,     "16 bit interface already connected" },
+/* N/A { APM_16_UNSUPPORTED,   "16 bit interface not supported" }, */
+       { APM_32_CONNECTED,     "32 bit interface already connected" },
+       { APM_32_UNSUPPORTED,   "32 bit interface not supported" },
+       { APM_BAD_DEVICE,       "Unrecognized device ID" },
+       { APM_BAD_PARAM,        "Parameter out of range" },
+       { APM_NOT_ENGAGED,      "Interface not engaged" },
+       { APM_BAD_FUNCTION,     "Function not supported" },
+       { APM_RESUME_DISABLED,  "Resume timer disabled" },
+       { APM_BAD_STATE,        "Unable to enter requested state" },
+/* N/A { APM_NO_EVENTS,        "No events pending" }, */
+       { APM_NO_ERROR,         "BIOS did not set a return code" },
+       { APM_NOT_PRESENT,      "No APM present" }
+};
+#define ERROR_COUNT    ARRAY_SIZE(error_table)
+
+/**
+ *     apm_error       -       display an APM error
+ *     @str: information string
+ *     @err: APM BIOS return code
+ *
+ *     Write a meaningful log entry to the kernel log in the event of
+ *     an APM error.
+ */
+static void apm_error(char *str, int err)
+{
+       int     i;
+
+       for (i = 0; i < ERROR_COUNT; i++)
+               if (error_table[i].key == err) break;
+       if (i < ERROR_COUNT)
+               printk(KERN_NOTICE "apm: %s: %s\n", str, error_table[i].msg);
+       else
+               printk(KERN_NOTICE "apm: %s: unknown error code %#2.2x\n",
+                       str, err);
+}
+
+/*
+ * Lock APM functionality to physical CPU 0
+ */
+#ifdef CONFIG_SMP
+
+static cpumask_t apm_save_cpus(void)
+{
+       cpumask_t x = current->cpus_allowed;
+       /* Some bioses don't like being called from CPU != 0 */
+       set_cpus_allowed(current, cpumask_of_cpu(0));
+       BUG_ON(smp_processor_id() != 0);
+       return x;
+}
+
+static inline void apm_restore_cpus(cpumask_t mask)
+{
+       set_cpus_allowed(current, mask);
+}
+
+#else
+
+/*
+ *     No CPU lockdown needed on a uniprocessor
+ */
+#define apm_save_cpus()                (current->cpus_allowed)
+#define apm_restore_cpus(x)    (void)(x)
+
+#endif
+
+/*
+ * These are the actual BIOS calls.  Depending on APM_ZERO_SEGS and
+ * apm_info.allow_ints, we are being really paranoid here!  Not only
+ * are interrupts disabled, but all the segment registers (except SS)
+ * are saved and zeroed this means that if the BIOS tries to reference
+ * any data without explicitly loading the segment registers, the kernel
+ * will fault immediately rather than have some unforeseen circumstances
+ * for the rest of the kernel.  And it will be very obvious!  :-) Doing
+ * this depends on CS referring to the same physical memory as DS so that
+ * DS can be zeroed before the call. Unfortunately, we can't do anything
+ * about the stack segment/pointer.  Also, we tell the compiler that
+ * everything could change.
+ *
+ * Also, we KNOW that for the non error case of apm_bios_call, there
+ * is no useful data returned in the low order 8 bits of eax.
+ */
+
+static inline unsigned long __apm_irq_save(void)
+{
+       unsigned long flags;
+       local_save_flags(flags);
+       if (apm_info.allow_ints) {
+               if (irqs_disabled_flags(flags))
+                       local_irq_enable();
+       } else
+               local_irq_disable();
+
+       return flags;
+}
+
+#define apm_irq_save(flags) \
+       do { flags = __apm_irq_save(); } while (0)
+
+static inline void apm_irq_restore(unsigned long flags)
+{
+       if (irqs_disabled_flags(flags))
+               local_irq_disable();
+       else if (irqs_disabled())
+               local_irq_enable();
+}
+
+#ifdef APM_ZERO_SEGS
+#      define APM_DECL_SEGS \
+               unsigned int saved_fs; unsigned int saved_gs;
+#      define APM_DO_SAVE_SEGS \
+               savesegment(fs, saved_fs); savesegment(gs, saved_gs)
+#      define APM_DO_RESTORE_SEGS \
+               loadsegment(fs, saved_fs); loadsegment(gs, saved_gs)
+#else
+#      define APM_DECL_SEGS
+#      define APM_DO_SAVE_SEGS
+#      define APM_DO_RESTORE_SEGS
+#endif
+
+/**
+ *     apm_bios_call   -       Make an APM BIOS 32bit call
+ *     @func: APM function to execute
+ *     @ebx_in: EBX register for call entry
+ *     @ecx_in: ECX register for call entry
+ *     @eax: EAX register return
+ *     @ebx: EBX register return
+ *     @ecx: ECX register return
+ *     @edx: EDX register return
+ *     @esi: ESI register return
+ *
+ *     Make an APM call using the 32bit protected mode interface. The
+ *     caller is responsible for knowing if APM BIOS is configured and
+ *     enabled. This call can disable interrupts for a long period of
+ *     time on some laptops.  The return value is in AH and the carry
+ *     flag is loaded into AL.  If there is an error, then the error
+ *     code is returned in AH (bits 8-15 of eax) and this function
+ *     returns non-zero.
+ */
+static u8 apm_bios_call(u32 func, u32 ebx_in, u32 ecx_in,
+       u32 *eax, u32 *ebx, u32 *ecx, u32 *edx, u32 *esi)
+{
+       APM_DECL_SEGS
+       unsigned long           flags;
+       cpumask_t               cpus;
+       int                     cpu;
+       struct desc_struct      save_desc_40;
+       struct desc_struct      *gdt;
+
+       cpus = apm_save_cpus();
+       
+       cpu = get_cpu();
+       gdt = get_cpu_gdt_table(cpu);
+       save_desc_40 = gdt[0x40 / 8];
+       gdt[0x40 / 8] = bad_bios_desc;
+
+       apm_irq_save(flags);
+       APM_DO_SAVE_SEGS;
+       apm_bios_call_asm(func, ebx_in, ecx_in, eax, ebx, ecx, edx, esi);
+       APM_DO_RESTORE_SEGS;
+       apm_irq_restore(flags);
+       gdt[0x40 / 8] = save_desc_40;
+       put_cpu();
+       apm_restore_cpus(cpus);
+       
+       return *eax & 0xff;
+}
+
+/**
+ *     apm_bios_call_simple    -       make a simple APM BIOS 32bit call
+ *     @func: APM function to invoke
+ *     @ebx_in: EBX register value for BIOS call
+ *     @ecx_in: ECX register value for BIOS call
+ *     @eax: EAX register on return from the BIOS call
+ *
+ *     Make a BIOS call that returns one value only, or just status.
+ *     If there is an error, then the error code is returned in AH
+ *     (bits 8-15 of eax) and this function returns non-zero. This is
+ *     used for simpler BIOS operations. This call may hold interrupts
+ *     off for a long time on some laptops.
+ */
+
+static u8 apm_bios_call_simple(u32 func, u32 ebx_in, u32 ecx_in, u32 *eax)
+{
+       u8                      error;
+       APM_DECL_SEGS
+       unsigned long           flags;
+       cpumask_t               cpus;
+       int                     cpu;
+       struct desc_struct      save_desc_40;
+       struct desc_struct      *gdt;
+
+       cpus = apm_save_cpus();
+       
+       cpu = get_cpu();
+       gdt = get_cpu_gdt_table(cpu);
+       save_desc_40 = gdt[0x40 / 8];
+       gdt[0x40 / 8] = bad_bios_desc;
+
+       apm_irq_save(flags);
+       APM_DO_SAVE_SEGS;
+       error = apm_bios_call_simple_asm(func, ebx_in, ecx_in, eax);
+       APM_DO_RESTORE_SEGS;
+       apm_irq_restore(flags);
+       gdt[0x40 / 8] = save_desc_40;
+       put_cpu();
+       apm_restore_cpus(cpus);
+       return error;
+}
+
+/**
+ *     apm_driver_version      -       APM driver version
+ *     @val:   loaded with the APM version on return
+ *
+ *     Retrieve the APM version supported by the BIOS. This is only
+ *     supported for APM 1.1 or higher. An error indicates APM 1.0 is
+ *     probably present.
+ *
+ *     On entry val should point to a value indicating the APM driver
+ *     version with the high byte being the major and the low byte the
+ *     minor number both in BCD
+ *
+ *     On return it will hold the BIOS revision supported in the
+ *     same format.
+ */
+
+static int apm_driver_version(u_short *val)
+{
+       u32     eax;
+
+       if (apm_bios_call_simple(APM_FUNC_VERSION, 0, *val, &eax))
+               return (eax >> 8) & 0xff;
+       *val = eax;
+       return APM_SUCCESS;
+}
+
+/**
+ *     apm_get_event   -       get an APM event from the BIOS
+ *     @event: pointer to the event
+ *     @info: point to the event information
+ *
+ *     The APM BIOS provides a polled information for event
+ *     reporting. The BIOS expects to be polled at least every second
+ *     when events are pending. When a message is found the caller should
+ *     poll until no more messages are present.  However, this causes
+ *     problems on some laptops where a suspend event notification is
+ *     not cleared until it is acknowledged.
+ *
+ *     Additional information is returned in the info pointer, providing
+ *     that APM 1.2 is in use. If no messges are pending the value 0x80
+ *     is returned (No power management events pending).
+ */
+static int apm_get_event(apm_event_t *event, apm_eventinfo_t *info)
+{
+       u32     eax;
+       u32     ebx;
+       u32     ecx;
+       u32     dummy;
+
+       if (apm_bios_call(APM_FUNC_GET_EVENT, 0, 0, &eax, &ebx, &ecx,
+                       &dummy, &dummy))
+               return (eax >> 8) & 0xff;
+       *event = ebx;
+       if (apm_info.connection_version < 0x0102)
+               *info = ~0; /* indicate info not valid */
+       else
+               *info = ecx;
+       return APM_SUCCESS;
+}
+
+/**
+ *     set_power_state -       set the power management state
+ *     @what: which items to transition
+ *     @state: state to transition to
+ *
+ *     Request an APM change of state for one or more system devices. The
+ *     processor state must be transitioned last of all. what holds the
+ *     class of device in the upper byte and the device number (0xFF for
+ *     all) for the object to be transitioned.
+ *
+ *     The state holds the state to transition to, which may in fact
+ *     be an acceptance of a BIOS requested state change.
+ */
+static int set_power_state(u_short what, u_short state)
+{
+       u32     eax;
+
+       if (apm_bios_call_simple(APM_FUNC_SET_STATE, what, state, &eax))
+               return (eax >> 8) & 0xff;
+       return APM_SUCCESS;
+}
+
+/**
+ *     set_system_power_state - set system wide power state
+ *     @state: which state to enter
+ *
+ *     Transition the entire system into a new APM power state.
+ */
+static int set_system_power_state(u_short state)
+{
+       return set_power_state(APM_DEVICE_ALL, state);
+}
+
+/**
+ *     apm_do_idle     -       perform power saving
+ *
+ *     This function notifies the BIOS that the processor is (in the view
+ *     of the OS) idle. It returns -1 in the event that the BIOS refuses
+ *     to handle the idle request. On a success the function returns 1
+ *     if the BIOS did clock slowing or 0 otherwise.
+ */
+static int apm_do_idle(void)
+{
+       u32     eax;
+       u8      ret = 0;
+       int     idled = 0;
+       int     polling;
+
+       polling = !!(current_thread_info()->status & TS_POLLING);
+       if (polling) {
+               current_thread_info()->status &= ~TS_POLLING;
+               /*
+                * TS_POLLING-cleared state must be visible before we
+                * test NEED_RESCHED:
+                */
+               smp_mb();
+       }
+       if (!need_resched()) {
+               idled = 1;
+               ret = apm_bios_call_simple(APM_FUNC_IDLE, 0, 0, &eax);
+       }
+       if (polling)
+               current_thread_info()->status |= TS_POLLING;
+
+       if (!idled)
+               return 0;
+
+       if (ret) {
+               static unsigned long t;
+
+               /* This always fails on some SMP boards running UP kernels.
+                * Only report the failure the first 5 times.
+                */
+               if (++t < 5)
+               {
+                       printk(KERN_DEBUG "apm_do_idle failed (%d)\n",
+                                       (eax >> 8) & 0xff);
+                       t = jiffies;
+               }
+               return -1;
+       }
+       clock_slowed = (apm_info.bios.flags & APM_IDLE_SLOWS_CLOCK) != 0;
+       return clock_slowed;
+}
+
+/**
+ *     apm_do_busy     -       inform the BIOS the CPU is busy
+ *
+ *     Request that the BIOS brings the CPU back to full performance. 
+ */
+static void apm_do_busy(void)
+{
+       u32     dummy;
+
+       if (clock_slowed || ALWAYS_CALL_BUSY) {
+               (void) apm_bios_call_simple(APM_FUNC_BUSY, 0, 0, &dummy);
+               clock_slowed = 0;
+       }
+}
+
+/*
+ * If no process has really been interested in
+ * the CPU for some time, we want to call BIOS
+ * power management - we probably want
+ * to conserve power.
+ */
+#define IDLE_CALC_LIMIT   (HZ * 100)
+#define IDLE_LEAKY_MAX    16
+
+static void (*original_pm_idle)(void) __read_mostly;
+
+/**
+ * apm_cpu_idle                -       cpu idling for APM capable Linux
+ *
+ * This is the idling function the kernel executes when APM is available. It 
+ * tries to do BIOS powermanagement based on the average system idle time.
+ * Furthermore it calls the system default idle routine.
+ */
+
+static void apm_cpu_idle(void)
+{
+       static int use_apm_idle; /* = 0 */
+       static unsigned int last_jiffies; /* = 0 */
+       static unsigned int last_stime; /* = 0 */
+
+       int apm_idle_done = 0;
+       unsigned int jiffies_since_last_check = jiffies - last_jiffies;
+       unsigned int bucket;
+
+recalc:
+       if (jiffies_since_last_check > IDLE_CALC_LIMIT) {
+               use_apm_idle = 0;
+               last_jiffies = jiffies;
+               last_stime = current->stime;
+       } else if (jiffies_since_last_check > idle_period) {
+               unsigned int idle_percentage;
+
+               idle_percentage = current->stime - last_stime;
+               idle_percentage *= 100;
+               idle_percentage /= jiffies_since_last_check;
+               use_apm_idle = (idle_percentage > idle_threshold);
+               if (apm_info.forbid_idle)
+                       use_apm_idle = 0;
+               last_jiffies = jiffies;
+               last_stime = current->stime;
+       }
+
+       bucket = IDLE_LEAKY_MAX;
+
+       while (!need_resched()) {
+               if (use_apm_idle) {
+                       unsigned int t;
+
+                       t = jiffies;
+                       switch (apm_do_idle()) {
+                       case 0: apm_idle_done = 1;
+                               if (t != jiffies) {
+                                       if (bucket) {
+                                               bucket = IDLE_LEAKY_MAX;
+                                               continue;
+                                       }
+                               } else if (bucket) {
+                                       bucket--;
+                                       continue;
+                               }
+                               break;
+                       case 1: apm_idle_done = 1;
+                               break;
+                       default: /* BIOS refused */
+                               break;
+                       }
+               }
+               if (original_pm_idle)
+                       original_pm_idle();
+               else
+                       default_idle();
+               jiffies_since_last_check = jiffies - last_jiffies;
+               if (jiffies_since_last_check > idle_period)
+                       goto recalc;
+       }
+
+       if (apm_idle_done)
+               apm_do_busy();
+}
+
+/**
+ *     apm_power_off   -       ask the BIOS to power off
+ *
+ *     Handle the power off sequence. This is the one piece of code we
+ *     will execute even on SMP machines. In order to deal with BIOS
+ *     bugs we support real mode APM BIOS power off calls. We also make
+ *     the SMP call on CPU0 as some systems will only honour this call
+ *     on their first cpu.
+ */
+static void apm_power_off(void)
+{
+       unsigned char   po_bios_call[] = {
+               0xb8, 0x00, 0x10,       /* movw  $0x1000,ax  */
+               0x8e, 0xd0,             /* movw  ax,ss       */
+               0xbc, 0x00, 0xf0,       /* movw  $0xf000,sp  */
+               0xb8, 0x07, 0x53,       /* movw  $0x5307,ax  */
+               0xbb, 0x01, 0x00,       /* movw  $0x0001,bx  */
+               0xb9, 0x03, 0x00,       /* movw  $0x0003,cx  */
+               0xcd, 0x15              /* int   $0x15       */
+       };
+
+       /* Some bioses don't like being called from CPU != 0 */
+       if (apm_info.realmode_power_off)
+       {
+               (void)apm_save_cpus();
+               machine_real_restart(po_bios_call, sizeof(po_bios_call));
+       }
+       else
+               (void) set_system_power_state(APM_STATE_OFF);
+}
+
+#ifdef CONFIG_APM_DO_ENABLE
+
+/**
+ *     apm_enable_power_management - enable BIOS APM power management
+ *     @enable: enable yes/no
+ *
+ *     Enable or disable the APM BIOS power services. 
+ */
+static int apm_enable_power_management(int enable)
+{
+       u32     eax;
+
+       if ((enable == 0) && (apm_info.bios.flags & APM_BIOS_DISENGAGED))
+               return APM_NOT_ENGAGED;
+       if (apm_bios_call_simple(APM_FUNC_ENABLE_PM, APM_DEVICE_BALL,
+                       enable, &eax))
+               return (eax >> 8) & 0xff;
+       if (enable)
+               apm_info.bios.flags &= ~APM_BIOS_DISABLED;
+       else
+               apm_info.bios.flags |= APM_BIOS_DISABLED;
+       return APM_SUCCESS;
+}
+#endif
+
+/**
+ *     apm_get_power_status    -       get current power state
+ *     @status: returned status
+ *     @bat: battery info
+ *     @life: estimated life
+ *
+ *     Obtain the current power status from the APM BIOS. We return a
+ *     status which gives the rough battery status, and current power
+ *     source. The bat value returned give an estimate as a percentage
+ *     of life and a status value for the battery. The estimated life
+ *     if reported is a lifetime in secodnds/minutes at current powwer
+ *     consumption.
+ */
+static int apm_get_power_status(u_short *status, u_short *bat, u_short *life)
+{
+       u32     eax;
+       u32     ebx;
+       u32     ecx;
+       u32     edx;
+       u32     dummy;
+
+       if (apm_info.get_power_status_broken)
+               return APM_32_UNSUPPORTED;
+       if (apm_bios_call(APM_FUNC_GET_STATUS, APM_DEVICE_ALL, 0,
+                       &eax, &ebx, &ecx, &edx, &dummy))
+               return (eax >> 8) & 0xff;
+       *status = ebx;
+       *bat = ecx;
+       if (apm_info.get_power_status_swabinminutes) {
+               *life = swab16((u16)edx);
+               *life |= 0x8000;
+       } else
+               *life = edx;
+       return APM_SUCCESS;
+}
+
+#if 0
+static int apm_get_battery_status(u_short which, u_short *status,
+                                 u_short *bat, u_short *life, u_short *nbat)
+{
+       u32     eax;
+       u32     ebx;
+       u32     ecx;
+       u32     edx;
+       u32     esi;
+
+       if (apm_info.connection_version < 0x0102) {
+               /* pretend we only have one battery. */
+               if (which != 1)
+                       return APM_BAD_DEVICE;
+               *nbat = 1;
+               return apm_get_power_status(status, bat, life);
+       }
+
+       if (apm_bios_call(APM_FUNC_GET_STATUS, (0x8000 | (which)), 0, &eax,
+                       &ebx, &ecx, &edx, &esi))
+               return (eax >> 8) & 0xff;
+       *status = ebx;
+       *bat = ecx;
+       *life = edx;
+       *nbat = esi;
+       return APM_SUCCESS;
+}
+#endif
+
+/**
+ *     apm_engage_power_management     -       enable PM on a device
+ *     @device: identity of device
+ *     @enable: on/off
+ *
+ *     Activate or deactive power management on either a specific device
+ *     or the entire system (%APM_DEVICE_ALL).
+ */
+static int apm_engage_power_management(u_short device, int enable)
+{
+       u32     eax;
+
+       if ((enable == 0) && (device == APM_DEVICE_ALL)
+           && (apm_info.bios.flags & APM_BIOS_DISABLED))
+               return APM_DISABLED;
+       if (apm_bios_call_simple(APM_FUNC_ENGAGE_PM, device, enable, &eax))
+               return (eax >> 8) & 0xff;
+       if (device == APM_DEVICE_ALL) {
+               if (enable)
+                       apm_info.bios.flags &= ~APM_BIOS_DISENGAGED;
+               else
+                       apm_info.bios.flags |= APM_BIOS_DISENGAGED;
+       }
+       return APM_SUCCESS;
+}
+
+#if defined(CONFIG_APM_DISPLAY_BLANK) && defined(CONFIG_VT)
+
+/**
+ *     apm_console_blank       -       blank the display
+ *     @blank: on/off
+ *
+ *     Attempt to blank the console, firstly by blanking just video device
+ *     zero, and if that fails (some BIOSes don't support it) then it blanks
+ *     all video devices. Typically the BIOS will do laptop backlight and
+ *     monitor powerdown for us.
+ */
+static int apm_console_blank(int blank)
+{
+       int error = APM_NOT_ENGAGED; /* silence gcc */
+       int i;
+       u_short state;
+       static const u_short dev[3] = { 0x100, 0x1FF, 0x101 };
+
+       state = blank ? APM_STATE_STANDBY : APM_STATE_READY;
+
+       for (i = 0; i < ARRAY_SIZE(dev); i++) {
+               error = set_power_state(dev[i], state);
+
+               if ((error == APM_SUCCESS) || (error == APM_NO_ERROR))
+                       return 1;
+
+               if (error == APM_NOT_ENGAGED)
+                       break;
+       }
+
+       if (error == APM_NOT_ENGAGED) {
+               static int tried;
+               int eng_error;
+               if (tried++ == 0) {
+                       eng_error = apm_engage_power_management(APM_DEVICE_ALL, 1);
+                       if (eng_error) {
+                               apm_error("set display", error);
+                               apm_error("engage interface", eng_error);
+                               return 0;
+                       } else
+                               return apm_console_blank(blank);
+               }
+       }
+       apm_error("set display", error);
+       return 0;
+}
+#endif
+
+static int queue_empty(struct apm_user *as)
+{
+       return as->event_head == as->event_tail;
+}
+
+static apm_event_t get_queued_event(struct apm_user *as)
+{
+       if (++as->event_tail >= APM_MAX_EVENTS)
+               as->event_tail = 0;
+       return as->events[as->event_tail];
+}
+
+static void queue_event(apm_event_t event, struct apm_user *sender)
+{
+       struct apm_user *       as;
+
+       spin_lock(&user_list_lock);
+       if (user_list == NULL)
+               goto out;
+       for (as = user_list; as != NULL; as = as->next) {
+               if ((as == sender) || (!as->reader))
+                       continue;
+               if (++as->event_head >= APM_MAX_EVENTS)
+                       as->event_head = 0;
+
+               if (as->event_head == as->event_tail) {
+                       static int notified;
+
+                       if (notified++ == 0)
+                           printk(KERN_ERR "apm: an event queue overflowed\n");
+                       if (++as->event_tail >= APM_MAX_EVENTS)
+                               as->event_tail = 0;
+               }
+               as->events[as->event_head] = event;
+               if ((!as->suser) || (!as->writer))
+                       continue;
+               switch (event) {
+               case APM_SYS_SUSPEND:
+               case APM_USER_SUSPEND:
+                       as->suspends_pending++;
+                       suspends_pending++;
+                       break;
+
+               case APM_SYS_STANDBY:
+               case APM_USER_STANDBY:
+                       as->standbys_pending++;
+                       standbys_pending++;
+                       break;
+               }
+       }
+       wake_up_interruptible(&apm_waitqueue);
+out:
+       spin_unlock(&user_list_lock);
+}
+
+static void reinit_timer(void)
+{
+#ifdef INIT_TIMER_AFTER_SUSPEND
+       unsigned long flags;
+
+       spin_lock_irqsave(&i8253_lock, flags);
+       /* set the clock to HZ */
+       outb_p(0x34, PIT_MODE);         /* binary, mode 2, LSB/MSB, ch 0 */
+       udelay(10);
+       outb_p(LATCH & 0xff, PIT_CH0);  /* LSB */
+       udelay(10);
+       outb(LATCH >> 8, PIT_CH0);      /* MSB */
+       udelay(10);
+       spin_unlock_irqrestore(&i8253_lock, flags);
+#endif
+}
+
+static int suspend(int vetoable)
+{
+       int             err;
+       struct apm_user *as;
+
+       if (pm_send_all(PM_SUSPEND, (void *)3)) {
+               /* Vetoed */
+               if (vetoable) {
+                       if (apm_info.connection_version > 0x100)
+                               set_system_power_state(APM_STATE_REJECT);
+                       err = -EBUSY;
+                       ignore_sys_suspend = 0;
+                       printk(KERN_WARNING "apm: suspend was vetoed.\n");
+                       goto out;
+               }
+               printk(KERN_CRIT "apm: suspend was vetoed, but suspending anyway.\n");
+       }
+
+       device_suspend(PMSG_SUSPEND);
+       local_irq_disable();
+       device_power_down(PMSG_SUSPEND);
+
+       local_irq_enable();
+
+       save_processor_state();
+       err = set_system_power_state(APM_STATE_SUSPEND);
+       ignore_normal_resume = 1;
+       restore_processor_state();
+
+       local_irq_disable();
+       reinit_timer();
+
+       if (err == APM_NO_ERROR)
+               err = APM_SUCCESS;
+       if (err != APM_SUCCESS)
+               apm_error("suspend", err);
+       err = (err == APM_SUCCESS) ? 0 : -EIO;
+       device_power_up();
+       local_irq_enable();
+       device_resume();
+       pm_send_all(PM_RESUME, (void *)0);
+       queue_event(APM_NORMAL_RESUME, NULL);
+ out:
+       spin_lock(&user_list_lock);
+       for (as = user_list; as != NULL; as = as->next) {
+               as->suspend_wait = 0;
+               as->suspend_result = err;
+       }
+       spin_unlock(&user_list_lock);
+       wake_up_interruptible(&apm_suspend_waitqueue);
+       return err;
+}
+
+static void standby(void)
+{
+       int     err;
+
+       local_irq_disable();
+       device_power_down(PMSG_SUSPEND);
+       local_irq_enable();
+
+       err = set_system_power_state(APM_STATE_STANDBY);
+       if ((err != APM_SUCCESS) && (err != APM_NO_ERROR))
+               apm_error("standby", err);
+
+       local_irq_disable();
+       device_power_up();
+       local_irq_enable();
+}
+
+static apm_event_t get_event(void)
+{
+       int             error;
+       apm_event_t     event = APM_NO_EVENTS; /* silence gcc */
+       apm_eventinfo_t info;
+
+       static int notified;
+
+       /* we don't use the eventinfo */
+       error = apm_get_event(&event, &info);
+       if (error == APM_SUCCESS)
+               return event;
+
+       if ((error != APM_NO_EVENTS) && (notified++ == 0))
+               apm_error("get_event", error);
+
+       return 0;
+}
+
+static void check_events(void)
+{
+       apm_event_t             event;
+       static unsigned long    last_resume;
+       static int              ignore_bounce;
+
+       while ((event = get_event()) != 0) {
+               if (debug) {
+                       if (event <= NR_APM_EVENT_NAME)
+                               printk(KERN_DEBUG "apm: received %s notify\n",
+                                      apm_event_name[event - 1]);
+                       else
+                               printk(KERN_DEBUG "apm: received unknown "
+                                      "event 0x%02x\n", event);
+               }
+               if (ignore_bounce
+                   && ((jiffies - last_resume) > bounce_interval))
+                       ignore_bounce = 0;
+
+               switch (event) {
+               case APM_SYS_STANDBY:
+               case APM_USER_STANDBY:
+                       queue_event(event, NULL);
+                       if (standbys_pending <= 0)
+                               standby();
+                       break;
+
+               case APM_USER_SUSPEND:
+#ifdef CONFIG_APM_IGNORE_USER_SUSPEND
+                       if (apm_info.connection_version > 0x100)
+                               set_system_power_state(APM_STATE_REJECT);
+                       break;
+#endif
+               case APM_SYS_SUSPEND:
+                       if (ignore_bounce) {
+                               if (apm_info.connection_version > 0x100)
+                                       set_system_power_state(APM_STATE_REJECT);
+                               break;
+                       }
+                       /*
+                        * If we are already processing a SUSPEND,
+                        * then further SUSPEND events from the BIOS
+                        * will be ignored.  We also return here to
+                        * cope with the fact that the Thinkpads keep
+                        * sending a SUSPEND event until something else
+                        * happens!
+                        */
+                       if (ignore_sys_suspend)
+                               return;
+                       ignore_sys_suspend = 1;
+                       queue_event(event, NULL);
+                       if (suspends_pending <= 0)
+                               (void) suspend(1);
+                       break;
+
+               case APM_NORMAL_RESUME:
+               case APM_CRITICAL_RESUME:
+               case APM_STANDBY_RESUME:
+                       ignore_sys_suspend = 0;
+                       last_resume = jiffies;
+                       ignore_bounce = 1;
+                       if ((event != APM_NORMAL_RESUME)
+                           || (ignore_normal_resume == 0)) {
+                               device_resume();
+                               pm_send_all(PM_RESUME, (void *)0);
+                               queue_event(event, NULL);
+                       }
+                       ignore_normal_resume = 0;
+                       break;
+
+               case APM_CAPABILITY_CHANGE:
+               case APM_LOW_BATTERY:
+               case APM_POWER_STATUS_CHANGE:
+                       queue_event(event, NULL);
+                       /* If needed, notify drivers here */
+                       break;
+
+               case APM_UPDATE_TIME:
+                       break;
+
+               case APM_CRITICAL_SUSPEND:
+                       /*
+                        * We are not allowed to reject a critical suspend.
+                        */
+                       (void) suspend(0);
+                       break;
+               }
+       }
+}
+
+static void apm_event_handler(void)
+{
+       static int      pending_count = 4;
+       int             err;
+
+       if ((standbys_pending > 0) || (suspends_pending > 0)) {
+               if ((apm_info.connection_version > 0x100) &&
+                               (pending_count-- <= 0)) {
+                       pending_count = 4;
+                       if (debug)
+                               printk(KERN_DEBUG "apm: setting state busy\n");
+                       err = set_system_power_state(APM_STATE_BUSY);
+                       if (err)
+                               apm_error("busy", err);
+               }
+       } else
+               pending_count = 4;
+       check_events();
+}
+
+/*
+ * This is the APM thread main loop.
+ */
+
+static void apm_mainloop(void)
+{
+       DECLARE_WAITQUEUE(wait, current);
+
+       add_wait_queue(&apm_waitqueue, &wait);
+       set_current_state(TASK_INTERRUPTIBLE);
+       for (;;) {
+               schedule_timeout(APM_CHECK_TIMEOUT);
+               if (kthread_should_stop())
+                       break;
+               /*
+                * Ok, check all events, check for idle (and mark us sleeping
+                * so as not to count towards the load average)..
+                */
+               set_current_state(TASK_INTERRUPTIBLE);
+               apm_event_handler();
+       }
+       remove_wait_queue(&apm_waitqueue, &wait);
+}
+
+static int check_apm_user(struct apm_user *as, const char *func)
+{
+       if ((as == NULL) || (as->magic != APM_BIOS_MAGIC)) {
+               printk(KERN_ERR "apm: %s passed bad filp\n", func);
+               return 1;
+       }
+       return 0;
+}
+
+static ssize_t do_read(struct file *fp, char __user *buf, size_t count, loff_t *ppos)
+{
+       struct apm_user *       as;
+       int                     i;
+       apm_event_t             event;
+
+       as = fp->private_data;
+       if (check_apm_user(as, "read"))
+               return -EIO;
+       if ((int)count < sizeof(apm_event_t))
+               return -EINVAL;
+       if ((queue_empty(as)) && (fp->f_flags & O_NONBLOCK))
+               return -EAGAIN;
+       wait_event_interruptible(apm_waitqueue, !queue_empty(as));
+       i = count;
+       while ((i >= sizeof(event)) && !queue_empty(as)) {
+               event = get_queued_event(as);
+               if (copy_to_user(buf, &event, sizeof(event))) {
+                       if (i < count)
+                               break;
+                       return -EFAULT;
+               }
+               switch (event) {
+               case APM_SYS_SUSPEND:
+               case APM_USER_SUSPEND:
+                       as->suspends_read++;
+                       break;
+
+               case APM_SYS_STANDBY:
+               case APM_USER_STANDBY:
+                       as->standbys_read++;
+                       break;
+               }
+               buf += sizeof(event);
+               i -= sizeof(event);
+       }
+       if (i < count)
+               return count - i;
+       if (signal_pending(current))
+               return -ERESTARTSYS;
+       return 0;
+}
+
+static unsigned int do_poll(struct file *fp, poll_table * wait)
+{
+       struct apm_user * as;
+
+       as = fp->private_data;
+       if (check_apm_user(as, "poll"))
+               return 0;
+       poll_wait(fp, &apm_waitqueue, wait);
+       if (!queue_empty(as))
+               return POLLIN | POLLRDNORM;
+       return 0;
+}
+
+static int do_ioctl(struct inode * inode, struct file *filp,
+                   u_int cmd, u_long arg)
+{
+       struct apm_user *       as;
+
+       as = filp->private_data;
+       if (check_apm_user(as, "ioctl"))
+               return -EIO;
+       if ((!as->suser) || (!as->writer))
+               return -EPERM;
+       switch (cmd) {
+       case APM_IOC_STANDBY:
+               if (as->standbys_read > 0) {
+                       as->standbys_read--;
+                       as->standbys_pending--;
+                       standbys_pending--;
+               } else
+                       queue_event(APM_USER_STANDBY, as);
+               if (standbys_pending <= 0)
+                       standby();
+               break;
+       case APM_IOC_SUSPEND:
+               if (as->suspends_read > 0) {
+                       as->suspends_read--;
+                       as->suspends_pending--;
+                       suspends_pending--;
+               } else
+                       queue_event(APM_USER_SUSPEND, as);
+               if (suspends_pending <= 0) {
+                       return suspend(1);
+               } else {
+                       as->suspend_wait = 1;
+                       wait_event_interruptible(apm_suspend_waitqueue,
+                                       as->suspend_wait == 0);
+                       return as->suspend_result;
+               }
+               break;
+       default:
+               return -EINVAL;
+       }
+       return 0;
+}
+
+static int do_release(struct inode * inode, struct file * filp)
+{
+       struct apm_user *       as;
+
+       as = filp->private_data;
+       if (check_apm_user(as, "release"))
+               return 0;
+       filp->private_data = NULL;
+       if (as->standbys_pending > 0) {
+               standbys_pending -= as->standbys_pending;
+               if (standbys_pending <= 0)
+                       standby();
+       }
+       if (as->suspends_pending > 0) {
+               suspends_pending -= as->suspends_pending;
+               if (suspends_pending <= 0)
+                       (void) suspend(1);
+       }
+       spin_lock(&user_list_lock);
+       if (user_list == as)
+               user_list = as->next;
+       else {
+               struct apm_user *       as1;
+
+               for (as1 = user_list;
+                    (as1 != NULL) && (as1->next != as);
+                    as1 = as1->next)
+                       ;
+               if (as1 == NULL)
+                       printk(KERN_ERR "apm: filp not in user list\n");
+               else
+                       as1->next = as->next;
+       }
+       spin_unlock(&user_list_lock);
+       kfree(as);
+       return 0;
+}
+
+static int do_open(struct inode * inode, struct file * filp)
+{
+       struct apm_user *       as;
+
+       as = kmalloc(sizeof(*as), GFP_KERNEL);
+       if (as == NULL) {
+               printk(KERN_ERR "apm: cannot allocate struct of size %d bytes\n",
+                      sizeof(*as));
+               return -ENOMEM;
+       }
+       as->magic = APM_BIOS_MAGIC;
+       as->event_tail = as->event_head = 0;
+       as->suspends_pending = as->standbys_pending = 0;
+       as->suspends_read = as->standbys_read = 0;
+       /*
+        * XXX - this is a tiny bit broken, when we consider BSD
+         * process accounting. If the device is opened by root, we
+        * instantly flag that we used superuser privs. Who knows,
+        * we might close the device immediately without doing a
+        * privileged operation -- cevans
+        */
+       as->suser = capable(CAP_SYS_ADMIN);
+       as->writer = (filp->f_mode & FMODE_WRITE) == FMODE_WRITE;
+       as->reader = (filp->f_mode & FMODE_READ) == FMODE_READ;
+       spin_lock(&user_list_lock);
+       as->next = user_list;
+       user_list = as;
+       spin_unlock(&user_list_lock);
+       filp->private_data = as;
+       return 0;
+}
+
+static int proc_apm_show(struct seq_file *m, void *v)
+{
+       unsigned short  bx;
+       unsigned short  cx;
+       unsigned short  dx;
+       int             error;
+       unsigned short  ac_line_status = 0xff;
+       unsigned short  battery_status = 0xff;
+       unsigned short  battery_flag   = 0xff;
+       int             percentage     = -1;
+       int             time_units     = -1;
+       char            *units         = "?";
+
+       if ((num_online_cpus() == 1) &&
+           !(error = apm_get_power_status(&bx, &cx, &dx))) {
+               ac_line_status = (bx >> 8) & 0xff;
+               battery_status = bx & 0xff;
+               if ((cx & 0xff) != 0xff)
+                       percentage = cx & 0xff;
+
+               if (apm_info.connection_version > 0x100) {
+                       battery_flag = (cx >> 8) & 0xff;
+                       if (dx != 0xffff) {
+                               units = (dx & 0x8000) ? "min" : "sec";
+                               time_units = dx & 0x7fff;
+                       }
+               }
+       }
+       /* Arguments, with symbols from linux/apm_bios.h.  Information is
+          from the Get Power Status (0x0a) call unless otherwise noted.
+
+          0) Linux driver version (this will change if format changes)
+          1) APM BIOS Version.  Usually 1.0, 1.1 or 1.2.
+          2) APM flags from APM Installation Check (0x00):
+             bit 0: APM_16_BIT_SUPPORT
+             bit 1: APM_32_BIT_SUPPORT
+             bit 2: APM_IDLE_SLOWS_CLOCK
+             bit 3: APM_BIOS_DISABLED
+             bit 4: APM_BIOS_DISENGAGED
+          3) AC line status
+             0x00: Off-line
+             0x01: On-line
+             0x02: On backup power (BIOS >= 1.1 only)
+             0xff: Unknown
+          4) Battery status
+             0x00: High
+             0x01: Low
+             0x02: Critical
+             0x03: Charging
+             0x04: Selected battery not present (BIOS >= 1.2 only)
+             0xff: Unknown
+          5) Battery flag
+             bit 0: High
+             bit 1: Low
+             bit 2: Critical
+             bit 3: Charging
+             bit 7: No system battery
+             0xff: Unknown
+          6) Remaining battery life (percentage of charge):
+             0-100: valid
+             -1: Unknown
+          7) Remaining battery life (time units):
+             Number of remaining minutes or seconds
+             -1: Unknown
+          8) min = minutes; sec = seconds */
+
+       seq_printf(m, "%s %d.%d 0x%02x 0x%02x 0x%02x 0x%02x %d%% %d %s\n",
+                    driver_version,
+                    (apm_info.bios.version >> 8) & 0xff,
+                    apm_info.bios.version & 0xff,
+                    apm_info.bios.flags,
+                    ac_line_status,
+                    battery_status,
+                    battery_flag,
+                    percentage,
+                    time_units,
+                    units);
+       return 0;
+}
+
+static int proc_apm_open(struct inode *inode, struct file *file)
+{
+       return single_open(file, proc_apm_show, NULL);
+}
+
+static const struct file_operations apm_file_ops = {
+       .owner          = THIS_MODULE,
+       .open           = proc_apm_open,
+       .read           = seq_read,
+       .llseek         = seq_lseek,
+       .release        = single_release,
+};
+
+static int apm(void *unused)
+{
+       unsigned short  bx;
+       unsigned short  cx;
+       unsigned short  dx;
+       int             error;
+       char *          power_stat;
+       char *          bat_stat;
+
+#ifdef CONFIG_SMP
+       /* 2002/08/01 - WT
+        * This is to avoid random crashes at boot time during initialization
+        * on SMP systems in case of "apm=power-off" mode. Seen on ASUS A7M266D.
+        * Some bioses don't like being called from CPU != 0.
+        * Method suggested by Ingo Molnar.
+        */
+       set_cpus_allowed(current, cpumask_of_cpu(0));
+       BUG_ON(smp_processor_id() != 0);
+#endif
+
+       if (apm_info.connection_version == 0) {
+               apm_info.connection_version = apm_info.bios.version;
+               if (apm_info.connection_version > 0x100) {
+                       /*
+                        * We only support BIOSs up to version 1.2
+                        */
+                       if (apm_info.connection_version > 0x0102)
+                               apm_info.connection_version = 0x0102;
+                       error = apm_driver_version(&apm_info.connection_version);
+                       if (error != APM_SUCCESS) {
+                               apm_error("driver version", error);
+                               /* Fall back to an APM 1.0 connection. */
+                               apm_info.connection_version = 0x100;
+                       }
+               }
+       }
+
+       if (debug)
+               printk(KERN_INFO "apm: Connection version %d.%d\n",
+                       (apm_info.connection_version >> 8) & 0xff,
+                       apm_info.connection_version & 0xff);
+
+#ifdef CONFIG_APM_DO_ENABLE
+       if (apm_info.bios.flags & APM_BIOS_DISABLED) {
+               /*
+                * This call causes my NEC UltraLite Versa 33/C to hang if it
+                * is booted with PM disabled but not in the docking station.
+                * Unfortunate ...
+                */
+               error = apm_enable_power_management(1);
+               if (error) {
+                       apm_error("enable power management", error);
+                       return -1;
+               }
+       }
+#endif
+
+       if ((apm_info.bios.flags & APM_BIOS_DISENGAGED)
+           && (apm_info.connection_version > 0x0100)) {
+               error = apm_engage_power_management(APM_DEVICE_ALL, 1);
+               if (error) {
+                       apm_error("engage power management", error);
+                       return -1;
+               }
+       }
+
+       if (debug && (num_online_cpus() == 1 || smp )) {
+               error = apm_get_power_status(&bx, &cx, &dx);
+               if (error)
+                       printk(KERN_INFO "apm: power status not available\n");
+               else {
+                       switch ((bx >> 8) & 0xff) {
+                       case 0: power_stat = "off line"; break;
+                       case 1: power_stat = "on line"; break;
+                       case 2: power_stat = "on backup power"; break;
+                       default: power_stat = "unknown"; break;
+                       }
+                       switch (bx & 0xff) {
+                       case 0: bat_stat = "high"; break;
+                       case 1: bat_stat = "low"; break;
+                       case 2: bat_stat = "critical"; break;
+                       case 3: bat_stat = "charging"; break;
+                       default: bat_stat = "unknown"; break;
+                       }
+                       printk(KERN_INFO
+                              "apm: AC %s, battery status %s, battery life ",
+                              power_stat, bat_stat);
+                       if ((cx & 0xff) == 0xff)
+                               printk("unknown\n");
+                       else
+                               printk("%d%%\n", cx & 0xff);
+                       if (apm_info.connection_version > 0x100) {
+                               printk(KERN_INFO
+                                      "apm: battery flag 0x%02x, battery life ",
+                                      (cx >> 8) & 0xff);
+                               if (dx == 0xffff)
+                                       printk("unknown\n");
+                               else
+                                       printk("%d %s\n", dx & 0x7fff,
+                                               (dx & 0x8000) ?
+                                               "minutes" : "seconds");
+                       }
+               }
+       }
+
+       /* Install our power off handler.. */
+       if (power_off)
+               pm_power_off = apm_power_off;
+
+       if (num_online_cpus() == 1 || smp) {
+#if defined(CONFIG_APM_DISPLAY_BLANK) && defined(CONFIG_VT)
+               console_blank_hook = apm_console_blank;
+#endif
+               apm_mainloop();
+#if defined(CONFIG_APM_DISPLAY_BLANK) && defined(CONFIG_VT)
+               console_blank_hook = NULL;
+#endif
+       }
+
+       return 0;
+}
+
+#ifndef MODULE
+static int __init apm_setup(char *str)
+{
+       int     invert;
+
+       while ((str != NULL) && (*str != '\0')) {
+               if (strncmp(str, "off", 3) == 0)
+                       apm_disabled = 1;
+               if (strncmp(str, "on", 2) == 0)
+                       apm_disabled = 0;
+               if ((strncmp(str, "bounce-interval=", 16) == 0) ||
+                   (strncmp(str, "bounce_interval=", 16) == 0))
+                       bounce_interval = simple_strtol(str + 16, NULL, 0);
+               if ((strncmp(str, "idle-threshold=", 15) == 0) ||
+                   (strncmp(str, "idle_threshold=", 15) == 0))
+                       idle_threshold = simple_strtol(str + 15, NULL, 0);
+               if ((strncmp(str, "idle-period=", 12) == 0) ||
+                   (strncmp(str, "idle_period=", 12) == 0))
+                       idle_period = simple_strtol(str + 12, NULL, 0);
+               invert = (strncmp(str, "no-", 3) == 0) ||
+                       (strncmp(str, "no_", 3) == 0);
+               if (invert)
+                       str += 3;
+               if (strncmp(str, "debug", 5) == 0)
+                       debug = !invert;
+               if ((strncmp(str, "power-off", 9) == 0) ||
+                   (strncmp(str, "power_off", 9) == 0))
+                       power_off = !invert;
+               if (strncmp(str, "smp", 3) == 0)
+               {
+                       smp = !invert;
+                       idle_threshold = 100;
+               }
+               if ((strncmp(str, "allow-ints", 10) == 0) ||
+                   (strncmp(str, "allow_ints", 10) == 0))
+                       apm_info.allow_ints = !invert;
+               if ((strncmp(str, "broken-psr", 10) == 0) ||
+                   (strncmp(str, "broken_psr", 10) == 0))
+                       apm_info.get_power_status_broken = !invert;
+               if ((strncmp(str, "realmode-power-off", 18) == 0) ||
+                   (strncmp(str, "realmode_power_off", 18) == 0))
+                       apm_info.realmode_power_off = !invert;
+               str = strchr(str, ',');
+               if (str != NULL)
+                       str += strspn(str, ", \t");
+       }
+       return 1;
+}
+
+__setup("apm=", apm_setup);
+#endif
+
+static const struct file_operations apm_bios_fops = {
+       .owner          = THIS_MODULE,
+       .read           = do_read,
+       .poll           = do_poll,
+       .ioctl          = do_ioctl,
+       .open           = do_open,
+       .release        = do_release,
+};
+
+static struct miscdevice apm_device = {
+       APM_MINOR_DEV,
+       "apm_bios",
+       &apm_bios_fops
+};
+
+
+/* Simple "print if true" callback */
+static int __init print_if_true(struct dmi_system_id *d)
+{
+       printk("%s\n", d->ident);
+       return 0;
+}
+
+/*
+ * Some Bioses enable the PS/2 mouse (touchpad) at resume, even if it was
+ * disabled before the suspend. Linux used to get terribly confused by that.
+ */
+static int __init broken_ps2_resume(struct dmi_system_id *d)
+{
+       printk(KERN_INFO "%s machine detected. Mousepad Resume Bug workaround hopefully not needed.\n", d->ident);
+       return 0;
+}
+
+/* Some bioses have a broken protected mode poweroff and need to use realmode */
+static int __init set_realmode_power_off(struct dmi_system_id *d)
+{
+       if (apm_info.realmode_power_off == 0) {
+               apm_info.realmode_power_off = 1;
+               printk(KERN_INFO "%s bios detected. Using realmode poweroff only.\n", d->ident);
+       }
+       return 0;
+}
+
+/* Some laptops require interrupts to be enabled during APM calls */
+static int __init set_apm_ints(struct dmi_system_id *d)
+{
+       if (apm_info.allow_ints == 0) {
+               apm_info.allow_ints = 1;
+               printk(KERN_INFO "%s machine detected. Enabling interrupts during APM calls.\n", d->ident);
+       }
+       return 0;
+}
+
+/* Some APM bioses corrupt memory or just plain do not work */
+static int __init apm_is_horked(struct dmi_system_id *d)
+{
+       if (apm_info.disabled == 0) {
+               apm_info.disabled = 1;
+               printk(KERN_INFO "%s machine detected. Disabling APM.\n", d->ident);
+       }
+       return 0;
+}
+
+static int __init apm_is_horked_d850md(struct dmi_system_id *d)
+{
+       if (apm_info.disabled == 0) {
+               apm_info.disabled = 1;
+               printk(KERN_INFO "%s machine detected. Disabling APM.\n", d->ident);
+               printk(KERN_INFO "This bug is fixed in bios P15 which is available for \n");
+               printk(KERN_INFO "download from support.intel.com \n");
+       }
+       return 0;
+}
+
+/* Some APM bioses hang on APM idle calls */
+static int __init apm_likes_to_melt(struct dmi_system_id *d)
+{
+       if (apm_info.forbid_idle == 0) {
+               apm_info.forbid_idle = 1;
+               printk(KERN_INFO "%s machine detected. Disabling APM idle calls.\n", d->ident);
+       }
+       return 0;
+}
+
+/*
+ *  Check for clue free BIOS implementations who use
+ *  the following QA technique
+ *
+ *      [ Write BIOS Code ]<------
+ *               |                ^
+ *      < Does it Compile >----N--
+ *               |Y               ^
+ *     < Does it Boot Win98 >-N--
+ *               |Y
+ *           [Ship It]
+ *
+ *     Phoenix A04  08/24/2000 is known bad (Dell Inspiron 5000e)
+ *     Phoenix A07  09/29/2000 is known good (Dell Inspiron 5000)
+ */
+static int __init broken_apm_power(struct dmi_system_id *d)
+{
+       apm_info.get_power_status_broken = 1;
+       printk(KERN_WARNING "BIOS strings suggest APM bugs, disabling power status reporting.\n");
+       return 0;
+}
+
+/*
+ * This bios swaps the APM minute reporting bytes over (Many sony laptops
+ * have this problem).
+ */
+static int __init swab_apm_power_in_minutes(struct dmi_system_id *d)
+{
+       apm_info.get_power_status_swabinminutes = 1;
+       printk(KERN_WARNING "BIOS strings suggest APM reports battery life in minutes and wrong byte order.\n");
+       return 0;
+}
+
+static struct dmi_system_id __initdata apm_dmi_table[] = {
+       {
+               print_if_true,
+               KERN_WARNING "IBM T23 - BIOS 1.03b+ and controller firmware 1.02+ may be needed for Linux APM.",
+               {       DMI_MATCH(DMI_SYS_VENDOR, "IBM"),
+                       DMI_MATCH(DMI_BIOS_VERSION, "1AET38WW (1.01b)"), },
+       },
+       {       /* Handle problems with APM on the C600 */
+               broken_ps2_resume, "Dell Latitude C600",
+               {       DMI_MATCH(DMI_SYS_VENDOR, "Dell"),
+                       DMI_MATCH(DMI_PRODUCT_NAME, "Latitude C600"), },
+       },
+       {       /* Allow interrupts during suspend on Dell Latitude laptops*/
+               set_apm_ints, "Dell Latitude",
+               {       DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"),
+                       DMI_MATCH(DMI_PRODUCT_NAME, "Latitude C510"), }
+       },
+       {       /* APM crashes */
+               apm_is_horked, "Dell Inspiron 2500",
+               {       DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"),
+                       DMI_MATCH(DMI_PRODUCT_NAME, "Inspiron 2500"),
+                       DMI_MATCH(DMI_BIOS_VENDOR,"Phoenix Technologies LTD"),
+                       DMI_MATCH(DMI_BIOS_VERSION,"A11"), },
+       },
+       {       /* Allow interrupts during suspend on Dell Inspiron laptops*/
+               set_apm_ints, "Dell Inspiron", {
+                       DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"),
+                       DMI_MATCH(DMI_PRODUCT_NAME, "Inspiron 4000"), },
+       },
+       {       /* Handle problems with APM on Inspiron 5000e */
+               broken_apm_power, "Dell Inspiron 5000e",
+               {       DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
+                       DMI_MATCH(DMI_BIOS_VERSION, "A04"),
+                       DMI_MATCH(DMI_BIOS_DATE, "08/24/2000"), },
+       },
+       {       /* Handle problems with APM on Inspiron 2500 */
+               broken_apm_power, "Dell Inspiron 2500",
+               {       DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
+                       DMI_MATCH(DMI_BIOS_VERSION, "A12"),
+                       DMI_MATCH(DMI_BIOS_DATE, "02/04/2002"), },
+       },
+       {       /* APM crashes */
+               apm_is_horked, "Dell Dimension 4100",
+               {       DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"),
+                       DMI_MATCH(DMI_PRODUCT_NAME, "XPS-Z"),
+                       DMI_MATCH(DMI_BIOS_VENDOR,"Intel Corp."),
+                       DMI_MATCH(DMI_BIOS_VERSION,"A11"), },
+       },
+       {       /* Allow interrupts during suspend on Compaq Laptops*/
+               set_apm_ints, "Compaq 12XL125",
+               {       DMI_MATCH(DMI_SYS_VENDOR, "Compaq"),
+                       DMI_MATCH(DMI_PRODUCT_NAME, "Compaq PC"),
+                       DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
+                       DMI_MATCH(DMI_BIOS_VERSION,"4.06"), },
+       },
+       {       /* Allow interrupts during APM or the clock goes slow */
+               set_apm_ints, "ASUSTeK",
+               {       DMI_MATCH(DMI_SYS_VENDOR, "ASUSTeK Computer Inc."),
+                       DMI_MATCH(DMI_PRODUCT_NAME, "L8400K series Notebook PC"), },
+       },
+       {       /* APM blows on shutdown */
+               apm_is_horked, "ABIT KX7-333[R]",
+               {       DMI_MATCH(DMI_BOARD_VENDOR, "ABIT"),
+                       DMI_MATCH(DMI_BOARD_NAME, "VT8367-8233A (KX7-333[R])"), },
+       },
+       {       /* APM crashes */
+               apm_is_horked, "Trigem Delhi3",
+               {       DMI_MATCH(DMI_SYS_VENDOR, "TriGem Computer, Inc"),
+                       DMI_MATCH(DMI_PRODUCT_NAME, "Delhi3"), },
+       },
+       {       /* APM crashes */
+               apm_is_horked, "Fujitsu-Siemens",
+               {       DMI_MATCH(DMI_BIOS_VENDOR, "hoenix/FUJITSU SIEMENS"),
+                       DMI_MATCH(DMI_BIOS_VERSION, "Version1.01"), },
+       },
+       {       /* APM crashes */
+               apm_is_horked_d850md, "Intel D850MD",
+               {       DMI_MATCH(DMI_BIOS_VENDOR, "Intel Corp."),
+                       DMI_MATCH(DMI_BIOS_VERSION, "MV85010A.86A.0016.P07.0201251536"), },
+       },
+       {       /* APM crashes */
+               apm_is_horked, "Intel D810EMO",
+               {       DMI_MATCH(DMI_BIOS_VENDOR, "Intel Corp."),
+                       DMI_MATCH(DMI_BIOS_VERSION, "MO81010A.86A.0008.P04.0004170800"), },
+       },
+       {       /* APM crashes */
+               apm_is_horked, "Dell XPS-Z",
+               {       DMI_MATCH(DMI_BIOS_VENDOR, "Intel Corp."),
+                       DMI_MATCH(DMI_BIOS_VERSION, "A11"),
+                       DMI_MATCH(DMI_PRODUCT_NAME, "XPS-Z"), },
+       },
+       {       /* APM crashes */
+               apm_is_horked, "Sharp PC-PJ/AX",
+               {       DMI_MATCH(DMI_SYS_VENDOR, "SHARP"),
+                       DMI_MATCH(DMI_PRODUCT_NAME, "PC-PJ/AX"),
+                       DMI_MATCH(DMI_BIOS_VENDOR,"SystemSoft"),
+                       DMI_MATCH(DMI_BIOS_VERSION,"Version R2.08"), },
+       },
+       {       /* APM crashes */
+               apm_is_horked, "Dell Inspiron 2500",
+               {       DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"),
+                       DMI_MATCH(DMI_PRODUCT_NAME, "Inspiron 2500"),
+                       DMI_MATCH(DMI_BIOS_VENDOR,"Phoenix Technologies LTD"),
+                       DMI_MATCH(DMI_BIOS_VERSION,"A11"), },
+       },
+       {       /* APM idle hangs */
+               apm_likes_to_melt, "Jabil AMD",
+               {       DMI_MATCH(DMI_BIOS_VENDOR, "American Megatrends Inc."),
+                       DMI_MATCH(DMI_BIOS_VERSION, "0AASNP06"), },
+       },
+       {       /* APM idle hangs */
+               apm_likes_to_melt, "AMI Bios",
+               {       DMI_MATCH(DMI_BIOS_VENDOR, "American Megatrends Inc."),
+                       DMI_MATCH(DMI_BIOS_VERSION, "0AASNP05"), },
+       },
+       {       /* Handle problems with APM on Sony Vaio PCG-N505X(DE) */
+               swab_apm_power_in_minutes, "Sony VAIO",
+               {       DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
+                       DMI_MATCH(DMI_BIOS_VERSION, "R0206H"),
+                       DMI_MATCH(DMI_BIOS_DATE, "08/23/99"), },
+       },
+       {       /* Handle problems with APM on Sony Vaio PCG-N505VX */
+               swab_apm_power_in_minutes, "Sony VAIO",
+               {       DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
+                       DMI_MATCH(DMI_BIOS_VERSION, "W2K06H0"),
+                       DMI_MATCH(DMI_BIOS_DATE, "02/03/00"), },
+       },
+       {       /* Handle problems with APM on Sony Vaio PCG-XG29 */
+               swab_apm_power_in_minutes, "Sony VAIO",
+               {       DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
+                       DMI_MATCH(DMI_BIOS_VERSION, "R0117A0"),
+                       DMI_MATCH(DMI_BIOS_DATE, "04/25/00"), },
+       },
+       {       /* Handle problems with APM on Sony Vaio PCG-Z600NE */
+               swab_apm_power_in_minutes, "Sony VAIO",
+               {       DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
+                       DMI_MATCH(DMI_BIOS_VERSION, "R0121Z1"),
+                       DMI_MATCH(DMI_BIOS_DATE, "05/11/00"), },
+       },
+       {       /* Handle problems with APM on Sony Vaio PCG-Z600NE */
+               swab_apm_power_in_minutes, "Sony VAIO",
+               {       DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
+                       DMI_MATCH(DMI_BIOS_VERSION, "WME01Z1"),
+                       DMI_MATCH(DMI_BIOS_DATE, "08/11/00"), },
+       },
+       {       /* Handle problems with APM on Sony Vaio PCG-Z600LEK(DE) */
+               swab_apm_power_in_minutes, "Sony VAIO",
+               {       DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
+                       DMI_MATCH(DMI_BIOS_VERSION, "R0206Z3"),
+                       DMI_MATCH(DMI_BIOS_DATE, "12/25/00"), },
+       },
+       {       /* Handle problems with APM on Sony Vaio PCG-Z505LS */
+               swab_apm_power_in_minutes, "Sony VAIO",
+               {       DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
+                       DMI_MATCH(DMI_BIOS_VERSION, "R0203D0"),
+                       DMI_MATCH(DMI_BIOS_DATE, "05/12/00"), },
+       },
+       {       /* Handle problems with APM on Sony Vaio PCG-Z505LS */
+               swab_apm_power_in_minutes, "Sony VAIO",
+               {       DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
+                       DMI_MATCH(DMI_BIOS_VERSION, "R0203Z3"),
+                       DMI_MATCH(DMI_BIOS_DATE, "08/25/00"), },
+       },
+       {       /* Handle problems with APM on Sony Vaio PCG-Z505LS (with updated BIOS) */
+               swab_apm_power_in_minutes, "Sony VAIO",
+               {       DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
+                       DMI_MATCH(DMI_BIOS_VERSION, "R0209Z3"),
+                       DMI_MATCH(DMI_BIOS_DATE, "05/12/01"), },
+       },
+       {       /* Handle problems with APM on Sony Vaio PCG-F104K */
+               swab_apm_power_in_minutes, "Sony VAIO",
+               {       DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
+                       DMI_MATCH(DMI_BIOS_VERSION, "R0204K2"),
+                       DMI_MATCH(DMI_BIOS_DATE, "08/28/00"), },
+       },
+
+       {       /* Handle problems with APM on Sony Vaio PCG-C1VN/C1VE */
+               swab_apm_power_in_minutes, "Sony VAIO",
+               {       DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
+                       DMI_MATCH(DMI_BIOS_VERSION, "R0208P1"),
+                       DMI_MATCH(DMI_BIOS_DATE, "11/09/00"), },
+       },
+       {       /* Handle problems with APM on Sony Vaio PCG-C1VE */
+               swab_apm_power_in_minutes, "Sony VAIO",
+               {       DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
+                       DMI_MATCH(DMI_BIOS_VERSION, "R0204P1"),
+                       DMI_MATCH(DMI_BIOS_DATE, "09/12/00"), },
+       },
+       {       /* Handle problems with APM on Sony Vaio PCG-C1VE */
+               swab_apm_power_in_minutes, "Sony VAIO",
+               {       DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
+                       DMI_MATCH(DMI_BIOS_VERSION, "WXPO1Z3"),
+                       DMI_MATCH(DMI_BIOS_DATE, "10/26/01"), },
+       },
+       {       /* broken PM poweroff bios */
+               set_realmode_power_off, "Award Software v4.60 PGMA",
+               {       DMI_MATCH(DMI_BIOS_VENDOR, "Award Software International, Inc."),
+                       DMI_MATCH(DMI_BIOS_VERSION, "4.60 PGMA"),
+                       DMI_MATCH(DMI_BIOS_DATE, "134526184"), },
+       },
+
+       /* Generic per vendor APM settings  */
+
+       {       /* Allow interrupts during suspend on IBM laptops */
+               set_apm_ints, "IBM",
+               {       DMI_MATCH(DMI_SYS_VENDOR, "IBM"), },
+       },
+
+       { }
+};
+
+/*
+ * Just start the APM thread. We do NOT want to do APM BIOS
+ * calls from anything but the APM thread, if for no other reason
+ * than the fact that we don't trust the APM BIOS. This way,
+ * most common APM BIOS problems that lead to protection errors
+ * etc will have at least some level of being contained...
+ *
+ * In short, if something bad happens, at least we have a choice
+ * of just killing the apm thread..
+ */
+static int __init apm_init(void)
+{
+       struct proc_dir_entry *apm_proc;
+       struct desc_struct *gdt;
+       int err;
+
+       dmi_check_system(apm_dmi_table);
+
+       if (apm_info.bios.version == 0 || paravirt_enabled()) {
+               printk(KERN_INFO "apm: BIOS not found.\n");
+               return -ENODEV;
+       }
+       printk(KERN_INFO
+               "apm: BIOS version %d.%d Flags 0x%02x (Driver version %s)\n",
+               ((apm_info.bios.version >> 8) & 0xff),
+               (apm_info.bios.version & 0xff),
+               apm_info.bios.flags,
+               driver_version);
+       if ((apm_info.bios.flags & APM_32_BIT_SUPPORT) == 0) {
+               printk(KERN_INFO "apm: no 32 bit BIOS support\n");
+               return -ENODEV;
+       }
+
+       if (allow_ints)
+               apm_info.allow_ints = 1;
+       if (broken_psr)
+               apm_info.get_power_status_broken = 1;
+       if (realmode_power_off)
+               apm_info.realmode_power_off = 1;
+       /* User can override, but default is to trust DMI */
+       if (apm_disabled != -1)
+               apm_info.disabled = apm_disabled;
+
+       /*
+        * Fix for the Compaq Contura 3/25c which reports BIOS version 0.1
+        * but is reportedly a 1.0 BIOS.
+        */
+       if (apm_info.bios.version == 0x001)
+               apm_info.bios.version = 0x100;
+
+       /* BIOS < 1.2 doesn't set cseg_16_len */
+       if (apm_info.bios.version < 0x102)
+               apm_info.bios.cseg_16_len = 0; /* 64k */
+
+       if (debug) {
+               printk(KERN_INFO "apm: entry %x:%x cseg16 %x dseg %x",
+                       apm_info.bios.cseg, apm_info.bios.offset,
+                       apm_info.bios.cseg_16, apm_info.bios.dseg);
+               if (apm_info.bios.version > 0x100)
+                       printk(" cseg len %x, dseg len %x",
+                               apm_info.bios.cseg_len,
+                               apm_info.bios.dseg_len);
+               if (apm_info.bios.version > 0x101)
+                       printk(" cseg16 len %x", apm_info.bios.cseg_16_len);
+               printk("\n");
+       }
+
+       if (apm_info.disabled) {
+               printk(KERN_NOTICE "apm: disabled on user request.\n");
+               return -ENODEV;
+       }
+       if ((num_online_cpus() > 1) && !power_off && !smp) {
+               printk(KERN_NOTICE "apm: disabled - APM is not SMP safe.\n");
+               apm_info.disabled = 1;
+               return -ENODEV;
+       }
+       if (PM_IS_ACTIVE()) {
+               printk(KERN_NOTICE "apm: overridden by ACPI.\n");
+               apm_info.disabled = 1;
+               return -ENODEV;
+       }
+#ifdef CONFIG_PM_LEGACY
+       pm_active = 1;
+#endif
+
+       /*
+        * Set up a segment that references the real mode segment 0x40
+        * that extends up to the end of page zero (that we have reserved).
+        * This is for buggy BIOS's that refer to (real mode) segment 0x40
+        * even though they are called in protected mode.
+        */
+       set_base(bad_bios_desc, __va((unsigned long)0x40 << 4));
+       _set_limit((char *)&bad_bios_desc, 4095 - (0x40 << 4));
+
+       /*
+        * Set up the long jump entry point to the APM BIOS, which is called
+        * from inline assembly.
+        */
+       apm_bios_entry.offset = apm_info.bios.offset;
+       apm_bios_entry.segment = APM_CS;
+
+       /*
+        * The APM 1.1 BIOS is supposed to provide limit information that it
+        * recognizes.  Many machines do this correctly, but many others do
+        * not restrict themselves to their claimed limit.  When this happens,
+        * they will cause a segmentation violation in the kernel at boot time.
+        * Most BIOS's, however, will respect a 64k limit, so we use that.
+        *
+        * Note we only set APM segments on CPU zero, since we pin the APM
+        * code to that CPU.
+        */
+       gdt = get_cpu_gdt_table(0);
+       set_base(gdt[APM_CS >> 3],
+                __va((unsigned long)apm_info.bios.cseg << 4));
+       set_base(gdt[APM_CS_16 >> 3],
+                __va((unsigned long)apm_info.bios.cseg_16 << 4));
+       set_base(gdt[APM_DS >> 3],
+                __va((unsigned long)apm_info.bios.dseg << 4));
+
+       apm_proc = create_proc_entry("apm", 0, NULL);
+       if (apm_proc)
+               apm_proc->proc_fops = &apm_file_ops;
+
+       kapmd_task = kthread_create(apm, NULL, "kapmd");
+       if (IS_ERR(kapmd_task)) {
+               printk(KERN_ERR "apm: disabled - Unable to start kernel "
+                               "thread.\n");
+               err = PTR_ERR(kapmd_task);
+               kapmd_task = NULL;
+               remove_proc_entry("apm", NULL);
+               return err;
+       }
+       wake_up_process(kapmd_task);
+
+       if (num_online_cpus() > 1 && !smp ) {
+               printk(KERN_NOTICE
+                  "apm: disabled - APM is not SMP safe (power off active).\n");
+               return 0;
+       }
+
+       /*
+        * Note we don't actually care if the misc_device cannot be registered.
+        * this driver can do its job without it, even if userspace can't
+        * control it.  just log the error
+        */
+       if (misc_register(&apm_device))
+               printk(KERN_WARNING "apm: Could not register misc device.\n");
+
+       if (HZ != 100)
+               idle_period = (idle_period * HZ) / 100;
+       if (idle_threshold < 100) {
+               original_pm_idle = pm_idle;
+               pm_idle  = apm_cpu_idle;
+               set_pm_idle = 1;
+       }
+
+       return 0;
+}
+
+static void __exit apm_exit(void)
+{
+       int     error;
+
+       if (set_pm_idle) {
+               pm_idle = original_pm_idle;
+               /*
+                * We are about to unload the current idle thread pm callback
+                * (pm_idle), Wait for all processors to update cached/local
+                * copies of pm_idle before proceeding.
+                */
+               cpu_idle_wait();
+       }
+       if (((apm_info.bios.flags & APM_BIOS_DISENGAGED) == 0)
+           && (apm_info.connection_version > 0x0100)) {
+               error = apm_engage_power_management(APM_DEVICE_ALL, 0);
+               if (error)
+                       apm_error("disengage power management", error);
+       }
+       misc_deregister(&apm_device);
+       remove_proc_entry("apm", NULL);
+       if (power_off)
+               pm_power_off = NULL;
+       if (kapmd_task) {
+               kthread_stop(kapmd_task);
+               kapmd_task = NULL;
+       }
+#ifdef CONFIG_PM_LEGACY
+       pm_active = 0;
+#endif
+}
+
+module_init(apm_init);
+module_exit(apm_exit);
+
+MODULE_AUTHOR("Stephen Rothwell");
+MODULE_DESCRIPTION("Advanced Power Management");
+MODULE_LICENSE("GPL");
+module_param(debug, bool, 0644);
+MODULE_PARM_DESC(debug, "Enable debug mode");
+module_param(power_off, bool, 0444);
+MODULE_PARM_DESC(power_off, "Enable power off");
+module_param(bounce_interval, int, 0444);
+MODULE_PARM_DESC(bounce_interval,
+               "Set the number of ticks to ignore suspend bounces");
+module_param(allow_ints, bool, 0444);
+MODULE_PARM_DESC(allow_ints, "Allow interrupts during BIOS calls");
+module_param(broken_psr, bool, 0444);
+MODULE_PARM_DESC(broken_psr, "BIOS has a broken GetPowerStatus call");
+module_param(realmode_power_off, bool, 0444);
+MODULE_PARM_DESC(realmode_power_off,
+               "Switch to real mode before powering off");
+module_param(idle_threshold, int, 0444);
+MODULE_PARM_DESC(idle_threshold,
+       "System idle percentage above which to make APM BIOS idle calls");
+module_param(idle_period, int, 0444);
+MODULE_PARM_DESC(idle_period,
+       "Period (in sec/100) over which to caculate the idle percentage");
+module_param(smp, bool, 0444);
+MODULE_PARM_DESC(smp,
+       "Set this to enable APM use on an SMP platform. Use with caution on older systems");
+MODULE_ALIAS_MISCDEV(APM_MINOR_DEV);
diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c
new file mode 100644 (file)
index 0000000..cfa82c8
--- /dev/null
@@ -0,0 +1,5 @@
+#ifdef CONFIG_X86_32
+# include "asm-offsets_32.c"
+#else
+# include "asm-offsets_64.c"
+#endif
diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c
new file mode 100644 (file)
index 0000000..8029742
--- /dev/null
@@ -0,0 +1,147 @@
+/*
+ * Generate definitions needed by assembly language modules.
+ * This code generates raw asm output which is post-processed
+ * to extract and format the required data.
+ */
+
+#include <linux/crypto.h>
+#include <linux/sched.h>
+#include <linux/signal.h>
+#include <linux/personality.h>
+#include <linux/suspend.h>
+#include <asm/ucontext.h>
+#include "sigframe_32.h"
+#include <asm/pgtable.h>
+#include <asm/fixmap.h>
+#include <asm/processor.h>
+#include <asm/thread_info.h>
+#include <asm/elf.h>
+
+#include <xen/interface/xen.h>
+
+#ifdef CONFIG_LGUEST_GUEST
+#include <linux/lguest.h>
+#include "../../../drivers/lguest/lg.h"
+#endif
+
+#define DEFINE(sym, val) \
+        asm volatile("\n->" #sym " %0 " #val : : "i" (val))
+
+#define BLANK() asm volatile("\n->" : : )
+
+#define OFFSET(sym, str, mem) \
+       DEFINE(sym, offsetof(struct str, mem));
+
+/* workaround for a warning with -Wmissing-prototypes */
+void foo(void);
+
+void foo(void)
+{
+       OFFSET(SIGCONTEXT_eax, sigcontext, eax);
+       OFFSET(SIGCONTEXT_ebx, sigcontext, ebx);
+       OFFSET(SIGCONTEXT_ecx, sigcontext, ecx);
+       OFFSET(SIGCONTEXT_edx, sigcontext, edx);
+       OFFSET(SIGCONTEXT_esi, sigcontext, esi);
+       OFFSET(SIGCONTEXT_edi, sigcontext, edi);
+       OFFSET(SIGCONTEXT_ebp, sigcontext, ebp);
+       OFFSET(SIGCONTEXT_esp, sigcontext, esp);
+       OFFSET(SIGCONTEXT_eip, sigcontext, eip);
+       BLANK();
+
+       OFFSET(CPUINFO_x86, cpuinfo_x86, x86);
+       OFFSET(CPUINFO_x86_vendor, cpuinfo_x86, x86_vendor);
+       OFFSET(CPUINFO_x86_model, cpuinfo_x86, x86_model);
+       OFFSET(CPUINFO_x86_mask, cpuinfo_x86, x86_mask);
+       OFFSET(CPUINFO_hard_math, cpuinfo_x86, hard_math);
+       OFFSET(CPUINFO_cpuid_level, cpuinfo_x86, cpuid_level);
+       OFFSET(CPUINFO_x86_capability, cpuinfo_x86, x86_capability);
+       OFFSET(CPUINFO_x86_vendor_id, cpuinfo_x86, x86_vendor_id);
+       BLANK();
+
+       OFFSET(TI_task, thread_info, task);
+       OFFSET(TI_exec_domain, thread_info, exec_domain);
+       OFFSET(TI_flags, thread_info, flags);
+       OFFSET(TI_status, thread_info, status);
+       OFFSET(TI_preempt_count, thread_info, preempt_count);
+       OFFSET(TI_addr_limit, thread_info, addr_limit);
+       OFFSET(TI_restart_block, thread_info, restart_block);
+       OFFSET(TI_sysenter_return, thread_info, sysenter_return);
+       OFFSET(TI_cpu, thread_info, cpu);
+       BLANK();
+
+       OFFSET(GDS_size, Xgt_desc_struct, size);
+       OFFSET(GDS_address, Xgt_desc_struct, address);
+       OFFSET(GDS_pad, Xgt_desc_struct, pad);
+       BLANK();
+
+       OFFSET(PT_EBX, pt_regs, ebx);
+       OFFSET(PT_ECX, pt_regs, ecx);
+       OFFSET(PT_EDX, pt_regs, edx);
+       OFFSET(PT_ESI, pt_regs, esi);
+       OFFSET(PT_EDI, pt_regs, edi);
+       OFFSET(PT_EBP, pt_regs, ebp);
+       OFFSET(PT_EAX, pt_regs, eax);
+       OFFSET(PT_DS,  pt_regs, xds);
+       OFFSET(PT_ES,  pt_regs, xes);
+       OFFSET(PT_FS,  pt_regs, xfs);
+       OFFSET(PT_ORIG_EAX, pt_regs, orig_eax);
+       OFFSET(PT_EIP, pt_regs, eip);
+       OFFSET(PT_CS,  pt_regs, xcs);
+       OFFSET(PT_EFLAGS, pt_regs, eflags);
+       OFFSET(PT_OLDESP, pt_regs, esp);
+       OFFSET(PT_OLDSS,  pt_regs, xss);
+       BLANK();
+
+       OFFSET(EXEC_DOMAIN_handler, exec_domain, handler);
+       OFFSET(RT_SIGFRAME_sigcontext, rt_sigframe, uc.uc_mcontext);
+       BLANK();
+
+       OFFSET(pbe_address, pbe, address);
+       OFFSET(pbe_orig_address, pbe, orig_address);
+       OFFSET(pbe_next, pbe, next);
+
+       /* Offset from the sysenter stack to tss.esp0 */
+       DEFINE(TSS_sysenter_esp0, offsetof(struct tss_struct, x86_tss.esp0) -
+                sizeof(struct tss_struct));
+
+       DEFINE(PAGE_SIZE_asm, PAGE_SIZE);
+       DEFINE(PAGE_SHIFT_asm, PAGE_SHIFT);
+       DEFINE(PTRS_PER_PTE, PTRS_PER_PTE);
+       DEFINE(PTRS_PER_PMD, PTRS_PER_PMD);
+       DEFINE(PTRS_PER_PGD, PTRS_PER_PGD);
+
+       DEFINE(VDSO_PRELINK_asm, VDSO_PRELINK);
+
+       OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx);
+
+#ifdef CONFIG_PARAVIRT
+       BLANK();
+       OFFSET(PARAVIRT_enabled, paravirt_ops, paravirt_enabled);
+       OFFSET(PARAVIRT_irq_disable, paravirt_ops, irq_disable);
+       OFFSET(PARAVIRT_irq_enable, paravirt_ops, irq_enable);
+       OFFSET(PARAVIRT_irq_enable_sysexit, paravirt_ops, irq_enable_sysexit);
+       OFFSET(PARAVIRT_iret, paravirt_ops, iret);
+       OFFSET(PARAVIRT_read_cr0, paravirt_ops, read_cr0);
+#endif
+
+#ifdef CONFIG_XEN
+       BLANK();
+       OFFSET(XEN_vcpu_info_mask, vcpu_info, evtchn_upcall_mask);
+       OFFSET(XEN_vcpu_info_pending, vcpu_info, evtchn_upcall_pending);
+#endif
+
+#ifdef CONFIG_LGUEST_GUEST
+       BLANK();
+       OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled);
+       OFFSET(LGUEST_PAGES_host_gdt_desc, lguest_pages, state.host_gdt_desc);
+       OFFSET(LGUEST_PAGES_host_idt_desc, lguest_pages, state.host_idt_desc);
+       OFFSET(LGUEST_PAGES_host_cr3, lguest_pages, state.host_cr3);
+       OFFSET(LGUEST_PAGES_host_sp, lguest_pages, state.host_sp);
+       OFFSET(LGUEST_PAGES_guest_gdt_desc, lguest_pages,state.guest_gdt_desc);
+       OFFSET(LGUEST_PAGES_guest_idt_desc, lguest_pages,state.guest_idt_desc);
+       OFFSET(LGUEST_PAGES_guest_gdt, lguest_pages, state.guest_gdt);
+       OFFSET(LGUEST_PAGES_regs_trapnum, lguest_pages, regs.trapnum);
+       OFFSET(LGUEST_PAGES_regs_errcode, lguest_pages, regs.errcode);
+       OFFSET(LGUEST_PAGES_regs, lguest_pages, regs);
+#endif
+}
diff --git a/arch/x86/kernel/bootflag.c b/arch/x86/kernel/bootflag.c
new file mode 100644 (file)
index 0000000..0b98605
--- /dev/null
@@ -0,0 +1,98 @@
+/*
+ *     Implement 'Simple Boot Flag Specification 2.0'
+ */
+
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/string.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/acpi.h>
+#include <asm/io.h>
+
+#include <linux/mc146818rtc.h>
+
+
+#define SBF_RESERVED (0x78)
+#define SBF_PNPOS    (1<<0)
+#define SBF_BOOTING  (1<<1)
+#define SBF_DIAG     (1<<2)
+#define SBF_PARITY   (1<<7)
+
+
+int sbf_port __initdata = -1;  /* set via acpi_boot_init() */
+
+
+static int __init parity(u8 v)
+{
+       int x = 0;
+       int i;
+       
+       for(i=0;i<8;i++)
+       {
+               x^=(v&1);
+               v>>=1;
+       }
+       return x;
+}
+
+static void __init sbf_write(u8 v)
+{
+       unsigned long flags;
+       if(sbf_port != -1)
+       {
+               v &= ~SBF_PARITY;
+               if(!parity(v))
+                       v|=SBF_PARITY;
+
+               printk(KERN_INFO "Simple Boot Flag at 0x%x set to 0x%x\n", sbf_port, v);
+
+               spin_lock_irqsave(&rtc_lock, flags);
+               CMOS_WRITE(v, sbf_port);
+               spin_unlock_irqrestore(&rtc_lock, flags);
+       }
+}
+
+static u8 __init sbf_read(void)
+{
+       u8 v;
+       unsigned long flags;
+       if(sbf_port == -1)
+               return 0;
+       spin_lock_irqsave(&rtc_lock, flags);
+       v = CMOS_READ(sbf_port);
+       spin_unlock_irqrestore(&rtc_lock, flags);
+       return v;
+}
+
+static int __init sbf_value_valid(u8 v)
+{
+       if(v&SBF_RESERVED)              /* Reserved bits */
+               return 0;
+       if(!parity(v))
+               return 0;
+       return 1;
+}
+
+static int __init sbf_init(void)
+{
+       u8 v;
+       if(sbf_port == -1)
+               return 0;
+       v = sbf_read();
+       if(!sbf_value_valid(v))
+               printk(KERN_WARNING "Simple Boot Flag value 0x%x read from CMOS RAM was invalid\n",v);
+
+       v &= ~SBF_RESERVED;
+       v &= ~SBF_BOOTING;
+       v &= ~SBF_DIAG;
+#if defined(CONFIG_ISAPNP)
+       v |= SBF_PNPOS;
+#endif
+       sbf_write(v);
+       return 0;
+}
+
+module_init(sbf_init);
diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c
new file mode 100644 (file)
index 0000000..5c2faa1
--- /dev/null
@@ -0,0 +1,242 @@
+/* ----------------------------------------------------------------------- *
+ *   
+ *   Copyright 2000 H. Peter Anvin - All Rights Reserved
+ *
+ *   This program is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation, Inc., 675 Mass Ave, Cambridge MA 02139,
+ *   USA; either version 2 of the License, or (at your option) any later
+ *   version; incorporated herein by reference.
+ *
+ * ----------------------------------------------------------------------- */
+
+/*
+ * cpuid.c
+ *
+ * x86 CPUID access device
+ *
+ * This device is accessed by lseek() to the appropriate CPUID level
+ * and then read in chunks of 16 bytes.  A larger size means multiple
+ * reads of consecutive levels.
+ *
+ * This driver uses /dev/cpu/%d/cpuid where %d is the minor number, and on
+ * an SMP box will direct the access to CPU %d.
+ */
+
+#include <linux/module.h>
+
+#include <linux/types.h>
+#include <linux/errno.h>
+#include <linux/fcntl.h>
+#include <linux/init.h>
+#include <linux/poll.h>
+#include <linux/smp.h>
+#include <linux/major.h>
+#include <linux/fs.h>
+#include <linux/smp_lock.h>
+#include <linux/device.h>
+#include <linux/cpu.h>
+#include <linux/notifier.h>
+
+#include <asm/processor.h>
+#include <asm/msr.h>
+#include <asm/uaccess.h>
+#include <asm/system.h>
+
+static struct class *cpuid_class;
+
+#ifdef CONFIG_SMP
+
+struct cpuid_command {
+       u32 reg;
+       u32 *data;
+};
+
+static void cpuid_smp_cpuid(void *cmd_block)
+{
+       struct cpuid_command *cmd = (struct cpuid_command *)cmd_block;
+
+       cpuid(cmd->reg, &cmd->data[0], &cmd->data[1], &cmd->data[2],
+                     &cmd->data[3]);
+}
+
+static inline void do_cpuid(int cpu, u32 reg, u32 * data)
+{
+       struct cpuid_command cmd;
+
+       preempt_disable();
+       if (cpu == smp_processor_id()) {
+               cpuid(reg, &data[0], &data[1], &data[2], &data[3]);
+       } else {
+               cmd.reg = reg;
+               cmd.data = data;
+
+               smp_call_function_single(cpu, cpuid_smp_cpuid, &cmd, 1, 1);
+       }
+       preempt_enable();
+}
+#else                          /* ! CONFIG_SMP */
+
+static inline void do_cpuid(int cpu, u32 reg, u32 * data)
+{
+       cpuid(reg, &data[0], &data[1], &data[2], &data[3]);
+}
+
+#endif                         /* ! CONFIG_SMP */
+
+static loff_t cpuid_seek(struct file *file, loff_t offset, int orig)
+{
+       loff_t ret;
+
+       lock_kernel();
+
+       switch (orig) {
+       case 0:
+               file->f_pos = offset;
+               ret = file->f_pos;
+               break;
+       case 1:
+               file->f_pos += offset;
+               ret = file->f_pos;
+               break;
+       default:
+               ret = -EINVAL;
+       }
+
+       unlock_kernel();
+       return ret;
+}
+
+static ssize_t cpuid_read(struct file *file, char __user *buf,
+                         size_t count, loff_t * ppos)
+{
+       char __user *tmp = buf;
+       u32 data[4];
+       u32 reg = *ppos;
+       int cpu = iminor(file->f_path.dentry->d_inode);
+
+       if (count % 16)
+               return -EINVAL; /* Invalid chunk size */
+
+       for (; count; count -= 16) {
+               do_cpuid(cpu, reg, data);
+               if (copy_to_user(tmp, &data, 16))
+                       return -EFAULT;
+               tmp += 16;
+               *ppos = reg++;
+       }
+
+       return tmp - buf;
+}
+
+static int cpuid_open(struct inode *inode, struct file *file)
+{
+       unsigned int cpu = iminor(file->f_path.dentry->d_inode);
+       struct cpuinfo_x86 *c = &(cpu_data)[cpu];
+
+       if (cpu >= NR_CPUS || !cpu_online(cpu))
+               return -ENXIO;  /* No such CPU */
+       if (c->cpuid_level < 0)
+               return -EIO;    /* CPUID not supported */
+
+       return 0;
+}
+
+/*
+ * File operations we support
+ */
+static const struct file_operations cpuid_fops = {
+       .owner = THIS_MODULE,
+       .llseek = cpuid_seek,
+       .read = cpuid_read,
+       .open = cpuid_open,
+};
+
+static int cpuid_device_create(int i)
+{
+       int err = 0;
+       struct device *dev;
+
+       dev = device_create(cpuid_class, NULL, MKDEV(CPUID_MAJOR, i), "cpu%d",i);
+       if (IS_ERR(dev))
+               err = PTR_ERR(dev);
+       return err;
+}
+
+static int cpuid_class_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
+{
+       unsigned int cpu = (unsigned long)hcpu;
+
+       switch (action) {
+       case CPU_ONLINE:
+       case CPU_ONLINE_FROZEN:
+               cpuid_device_create(cpu);
+               break;
+       case CPU_DEAD:
+       case CPU_DEAD_FROZEN:
+               device_destroy(cpuid_class, MKDEV(CPUID_MAJOR, cpu));
+               break;
+       }
+       return NOTIFY_OK;
+}
+
+static struct notifier_block __cpuinitdata cpuid_class_cpu_notifier =
+{
+       .notifier_call = cpuid_class_cpu_callback,
+};
+
+static int __init cpuid_init(void)
+{
+       int i, err = 0;
+       i = 0;
+
+       if (register_chrdev(CPUID_MAJOR, "cpu/cpuid", &cpuid_fops)) {
+               printk(KERN_ERR "cpuid: unable to get major %d for cpuid\n",
+                      CPUID_MAJOR);
+               err = -EBUSY;
+               goto out;
+       }
+       cpuid_class = class_create(THIS_MODULE, "cpuid");
+       if (IS_ERR(cpuid_class)) {
+               err = PTR_ERR(cpuid_class);
+               goto out_chrdev;
+       }
+       for_each_online_cpu(i) {
+               err = cpuid_device_create(i);
+               if (err != 0) 
+                       goto out_class;
+       }
+       register_hotcpu_notifier(&cpuid_class_cpu_notifier);
+
+       err = 0;
+       goto out;
+
+out_class:
+       i = 0;
+       for_each_online_cpu(i) {
+               device_destroy(cpuid_class, MKDEV(CPUID_MAJOR, i));
+       }
+       class_destroy(cpuid_class);
+out_chrdev:
+       unregister_chrdev(CPUID_MAJOR, "cpu/cpuid");    
+out:
+       return err;
+}
+
+static void __exit cpuid_exit(void)
+{
+       int cpu = 0;
+
+       for_each_online_cpu(cpu)
+               device_destroy(cpuid_class, MKDEV(CPUID_MAJOR, cpu));
+       class_destroy(cpuid_class);
+       unregister_chrdev(CPUID_MAJOR, "cpu/cpuid");
+       unregister_hotcpu_notifier(&cpuid_class_cpu_notifier);
+}
+
+module_init(cpuid_init);
+module_exit(cpuid_exit);
+
+MODULE_AUTHOR("H. Peter Anvin <hpa@zytor.com>");
+MODULE_DESCRIPTION("x86 generic CPUID driver");
+MODULE_LICENSE("GPL");
diff --git a/arch/x86/kernel/crash_32.c b/arch/x86/kernel/crash_32.c
new file mode 100644 (file)
index 0000000..53589d1
--- /dev/null
@@ -0,0 +1,137 @@
+/*
+ * Architecture specific (i386) functions for kexec based crash dumps.
+ *
+ * Created by: Hariprasad Nellitheertha (hari@in.ibm.com)
+ *
+ * Copyright (C) IBM Corporation, 2004. All rights reserved.
+ *
+ */
+
+#include <linux/init.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/smp.h>
+#include <linux/reboot.h>
+#include <linux/kexec.h>
+#include <linux/delay.h>
+#include <linux/elf.h>
+#include <linux/elfcore.h>
+
+#include <asm/processor.h>
+#include <asm/hardirq.h>
+#include <asm/nmi.h>
+#include <asm/hw_irq.h>
+#include <asm/apic.h>
+#include <linux/kdebug.h>
+#include <asm/smp.h>
+
+#include <mach_ipi.h>
+
+
+/* This keeps a track of which one is crashing cpu. */
+static int crashing_cpu;
+
+#if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC)
+static atomic_t waiting_for_crash_ipi;
+
+static int crash_nmi_callback(struct notifier_block *self,
+                       unsigned long val, void *data)
+{
+       struct pt_regs *regs;
+       struct pt_regs fixed_regs;
+       int cpu;
+
+       if (val != DIE_NMI_IPI)
+               return NOTIFY_OK;
+
+       regs = ((struct die_args *)data)->regs;
+       cpu = raw_smp_processor_id();
+
+       /* Don't do anything if this handler is invoked on crashing cpu.
+        * Otherwise, system will completely hang. Crashing cpu can get
+        * an NMI if system was initially booted with nmi_watchdog parameter.
+        */
+       if (cpu == crashing_cpu)
+               return NOTIFY_STOP;
+       local_irq_disable();
+
+       if (!user_mode_vm(regs)) {
+               crash_fixup_ss_esp(&fixed_regs, regs);
+               regs = &fixed_regs;
+       }
+       crash_save_cpu(regs, cpu);
+       disable_local_APIC();
+       atomic_dec(&waiting_for_crash_ipi);
+       /* Assume hlt works */
+       halt();
+       for (;;)
+               cpu_relax();
+
+       return 1;
+}
+
+static void smp_send_nmi_allbutself(void)
+{
+       cpumask_t mask = cpu_online_map;
+       cpu_clear(safe_smp_processor_id(), mask);
+       if (!cpus_empty(mask))
+               send_IPI_mask(mask, NMI_VECTOR);
+}
+
+static struct notifier_block crash_nmi_nb = {
+       .notifier_call = crash_nmi_callback,
+};
+
+static void nmi_shootdown_cpus(void)
+{
+       unsigned long msecs;
+
+       atomic_set(&waiting_for_crash_ipi, num_online_cpus() - 1);
+       /* Would it be better to replace the trap vector here? */
+       if (register_die_notifier(&crash_nmi_nb))
+               return;         /* return what? */
+       /* Ensure the new callback function is set before sending
+        * out the NMI
+        */
+       wmb();
+
+       smp_send_nmi_allbutself();
+
+       msecs = 1000; /* Wait at most a second for the other cpus to stop */
+       while ((atomic_read(&waiting_for_crash_ipi) > 0) && msecs) {
+               mdelay(1);
+               msecs--;
+       }
+
+       /* Leave the nmi callback set */
+       disable_local_APIC();
+}
+#else
+static void nmi_shootdown_cpus(void)
+{
+       /* There are no cpus to shootdown */
+}
+#endif
+
+void machine_crash_shutdown(struct pt_regs *regs)
+{
+       /* This function is only called after the system
+        * has panicked or is otherwise in a critical state.
+        * The minimum amount of code to allow a kexec'd kernel
+        * to run successfully needs to happen here.
+        *
+        * In practice this means shooting down the other cpus in
+        * an SMP system.
+        */
+       /* The kernel is broken so disable interrupts */
+       local_irq_disable();
+
+       /* Make a note of crashing cpu. Will be used in NMI callback.*/
+       crashing_cpu = safe_smp_processor_id();
+       nmi_shootdown_cpus();
+       lapic_shutdown();
+#if defined(CONFIG_X86_IO_APIC)
+       disable_IO_APIC();
+#endif
+       crash_save_cpu(regs, safe_smp_processor_id());
+}
diff --git a/arch/x86/kernel/crash_dump_32.c b/arch/x86/kernel/crash_dump_32.c
new file mode 100644 (file)
index 0000000..3f532df
--- /dev/null
@@ -0,0 +1,74 @@
+/*
+ *     kernel/crash_dump.c - Memory preserving reboot related code.
+ *
+ *     Created by: Hariprasad Nellitheertha (hari@in.ibm.com)
+ *     Copyright (C) IBM Corporation, 2004. All rights reserved
+ */
+
+#include <linux/errno.h>
+#include <linux/highmem.h>
+#include <linux/crash_dump.h>
+
+#include <asm/uaccess.h>
+
+static void *kdump_buf_page;
+
+/**
+ * copy_oldmem_page - copy one page from "oldmem"
+ * @pfn: page frame number to be copied
+ * @buf: target memory address for the copy; this can be in kernel address
+ *     space or user address space (see @userbuf)
+ * @csize: number of bytes to copy
+ * @offset: offset in bytes into the page (based on pfn) to begin the copy
+ * @userbuf: if set, @buf is in user address space, use copy_to_user(),
+ *     otherwise @buf is in kernel address space, use memcpy().
+ *
+ * Copy a page from "oldmem". For this page, there is no pte mapped
+ * in the current kernel. We stitch up a pte, similar to kmap_atomic.
+ *
+ * Calling copy_to_user() in atomic context is not desirable. Hence first
+ * copying the data to a pre-allocated kernel page and then copying to user
+ * space in non-atomic context.
+ */
+ssize_t copy_oldmem_page(unsigned long pfn, char *buf,
+                               size_t csize, unsigned long offset, int userbuf)
+{
+       void  *vaddr;
+
+       if (!csize)
+               return 0;
+
+       vaddr = kmap_atomic_pfn(pfn, KM_PTE0);
+
+       if (!userbuf) {
+               memcpy(buf, (vaddr + offset), csize);
+               kunmap_atomic(vaddr, KM_PTE0);
+       } else {
+               if (!kdump_buf_page) {
+                       printk(KERN_WARNING "Kdump: Kdump buffer page not"
+                               " allocated\n");
+                       return -EFAULT;
+               }
+               copy_page(kdump_buf_page, vaddr);
+               kunmap_atomic(vaddr, KM_PTE0);
+               if (copy_to_user(buf, (kdump_buf_page + offset), csize))
+                       return -EFAULT;
+       }
+
+       return csize;
+}
+
+static int __init kdump_buf_page_init(void)
+{
+       int ret = 0;
+
+       kdump_buf_page = kmalloc(PAGE_SIZE, GFP_KERNEL);
+       if (!kdump_buf_page) {
+               printk(KERN_WARNING "Kdump: Failed to allocate kdump buffer"
+                        " page\n");
+               ret = -ENOMEM;
+       }
+
+       return ret;
+}
+arch_initcall(kdump_buf_page_init);
diff --git a/arch/x86/kernel/doublefault_32.c b/arch/x86/kernel/doublefault_32.c
new file mode 100644 (file)
index 0000000..40978af
--- /dev/null
@@ -0,0 +1,70 @@
+#include <linux/mm.h>
+#include <linux/sched.h>
+#include <linux/init.h>
+#include <linux/init_task.h>
+#include <linux/fs.h>
+
+#include <asm/uaccess.h>
+#include <asm/pgtable.h>
+#include <asm/processor.h>
+#include <asm/desc.h>
+
+#define DOUBLEFAULT_STACKSIZE (1024)
+static unsigned long doublefault_stack[DOUBLEFAULT_STACKSIZE];
+#define STACK_START (unsigned long)(doublefault_stack+DOUBLEFAULT_STACKSIZE)
+
+#define ptr_ok(x) ((x) > PAGE_OFFSET && (x) < PAGE_OFFSET + MAXMEM)
+
+static void doublefault_fn(void)
+{
+       struct Xgt_desc_struct gdt_desc = {0, 0};
+       unsigned long gdt, tss;
+
+       store_gdt(&gdt_desc);
+       gdt = gdt_desc.address;
+
+       printk(KERN_EMERG "PANIC: double fault, gdt at %08lx [%d bytes]\n", gdt, gdt_desc.size);
+
+       if (ptr_ok(gdt)) {
+               gdt += GDT_ENTRY_TSS << 3;
+               tss = *(u16 *)(gdt+2);
+               tss += *(u8 *)(gdt+4) << 16;
+               tss += *(u8 *)(gdt+7) << 24;
+               printk(KERN_EMERG "double fault, tss at %08lx\n", tss);
+
+               if (ptr_ok(tss)) {
+                       struct i386_hw_tss *t = (struct i386_hw_tss *)tss;
+
+                       printk(KERN_EMERG "eip = %08lx, esp = %08lx\n", t->eip, t->esp);
+
+                       printk(KERN_EMERG "eax = %08lx, ebx = %08lx, ecx = %08lx, edx = %08lx\n",
+                               t->eax, t->ebx, t->ecx, t->edx);
+                       printk(KERN_EMERG "esi = %08lx, edi = %08lx\n",
+                               t->esi, t->edi);
+               }
+       }
+
+       for (;;)
+               cpu_relax();
+}
+
+struct tss_struct doublefault_tss __cacheline_aligned = {
+       .x86_tss = {
+               .esp0           = STACK_START,
+               .ss0            = __KERNEL_DS,
+               .ldt            = 0,
+               .io_bitmap_base = INVALID_IO_BITMAP_OFFSET,
+
+               .eip            = (unsigned long) doublefault_fn,
+               /* 0x2 bit is always set */
+               .eflags         = X86_EFLAGS_SF | 0x2,
+               .esp            = STACK_START,
+               .es             = __USER_DS,
+               .cs             = __KERNEL_CS,
+               .ss             = __KERNEL_DS,
+               .ds             = __USER_DS,
+               .fs             = __KERNEL_PERCPU,
+
+               .__cr3          = __pa(swapper_pg_dir)
+       }
+};
diff --git a/arch/x86/kernel/e820_32.c b/arch/x86/kernel/e820_32.c
new file mode 100644 (file)
index 0000000..3c86b97
--- /dev/null
@@ -0,0 +1,944 @@
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/bootmem.h>
+#include <linux/ioport.h>
+#include <linux/string.h>
+#include <linux/kexec.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/efi.h>
+#include <linux/pfn.h>
+#include <linux/uaccess.h>
+#include <linux/suspend.h>
+
+#include <asm/pgtable.h>
+#include <asm/page.h>
+#include <asm/e820.h>
+#include <asm/setup.h>
+
+#ifdef CONFIG_EFI
+int efi_enabled = 0;
+EXPORT_SYMBOL(efi_enabled);
+#endif
+
+struct e820map e820;
+struct change_member {
+       struct e820entry *pbios; /* pointer to original bios entry */
+       unsigned long long addr; /* address for this change point */
+};
+static struct change_member change_point_list[2*E820MAX] __initdata;
+static struct change_member *change_point[2*E820MAX] __initdata;
+static struct e820entry *overlap_list[E820MAX] __initdata;
+static struct e820entry new_bios[E820MAX] __initdata;
+/* For PCI or other memory-mapped resources */
+unsigned long pci_mem_start = 0x10000000;
+#ifdef CONFIG_PCI
+EXPORT_SYMBOL(pci_mem_start);
+#endif
+extern int user_defined_memmap;
+struct resource data_resource = {
+       .name   = "Kernel data",
+       .start  = 0,
+       .end    = 0,
+       .flags  = IORESOURCE_BUSY | IORESOURCE_MEM
+};
+
+struct resource code_resource = {
+       .name   = "Kernel code",
+       .start  = 0,
+       .end    = 0,
+       .flags  = IORESOURCE_BUSY | IORESOURCE_MEM
+};
+
+static struct resource system_rom_resource = {
+       .name   = "System ROM",
+       .start  = 0xf0000,
+       .end    = 0xfffff,
+       .flags  = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
+};
+
+static struct resource extension_rom_resource = {
+       .name   = "Extension ROM",
+       .start  = 0xe0000,
+       .end    = 0xeffff,
+       .flags  = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
+};
+
+static struct resource adapter_rom_resources[] = { {
+       .name   = "Adapter ROM",
+       .start  = 0xc8000,
+       .end    = 0,
+       .flags  = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
+}, {
+       .name   = "Adapter ROM",
+       .start  = 0,
+       .end    = 0,
+       .flags  = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
+}, {
+       .name   = "Adapter ROM",
+       .start  = 0,
+       .end    = 0,
+       .flags  = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
+}, {
+       .name   = "Adapter ROM",
+       .start  = 0,
+       .end    = 0,
+       .flags  = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
+}, {
+       .name   = "Adapter ROM",
+       .start  = 0,
+       .end    = 0,
+       .flags  = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
+}, {
+       .name   = "Adapter ROM",
+       .start  = 0,
+       .end    = 0,
+       .flags  = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
+} };
+
+static struct resource video_rom_resource = {
+       .name   = "Video ROM",
+       .start  = 0xc0000,
+       .end    = 0xc7fff,
+       .flags  = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
+};
+
+static struct resource video_ram_resource = {
+       .name   = "Video RAM area",
+       .start  = 0xa0000,
+       .end    = 0xbffff,
+       .flags  = IORESOURCE_BUSY | IORESOURCE_MEM
+};
+
+static struct resource standard_io_resources[] = { {
+       .name   = "dma1",
+       .start  = 0x0000,
+       .end    = 0x001f,
+       .flags  = IORESOURCE_BUSY | IORESOURCE_IO
+}, {
+       .name   = "pic1",
+       .start  = 0x0020,
+       .end    = 0x0021,
+       .flags  = IORESOURCE_BUSY | IORESOURCE_IO
+}, {
+       .name   = "timer0",
+       .start  = 0x0040,
+       .end    = 0x0043,
+       .flags  = IORESOURCE_BUSY | IORESOURCE_IO
+}, {
+       .name   = "timer1",
+       .start  = 0x0050,
+       .end    = 0x0053,
+       .flags  = IORESOURCE_BUSY | IORESOURCE_IO
+}, {
+       .name   = "keyboard",
+       .start  = 0x0060,
+       .end    = 0x006f,
+       .flags  = IORESOURCE_BUSY | IORESOURCE_IO
+}, {
+       .name   = "dma page reg",
+       .start  = 0x0080,
+       .end    = 0x008f,
+       .flags  = IORESOURCE_BUSY | IORESOURCE_IO
+}, {
+       .name   = "pic2",
+       .start  = 0x00a0,
+       .end    = 0x00a1,
+       .flags  = IORESOURCE_BUSY | IORESOURCE_IO
+}, {
+       .name   = "dma2",
+       .start  = 0x00c0,
+       .end    = 0x00df,
+       .flags  = IORESOURCE_BUSY | IORESOURCE_IO
+}, {
+       .name   = "fpu",
+       .start  = 0x00f0,
+       .end    = 0x00ff,
+       .flags  = IORESOURCE_BUSY | IORESOURCE_IO
+} };
+
+#define ROMSIGNATURE 0xaa55
+
+static int __init romsignature(const unsigned char *rom)
+{
+       const unsigned short * const ptr = (const unsigned short *)rom;
+       unsigned short sig;
+
+       return probe_kernel_address(ptr, sig) == 0 && sig == ROMSIGNATURE;
+}
+
+static int __init romchecksum(const unsigned char *rom, unsigned long length)
+{
+       unsigned char sum, c;
+
+       for (sum = 0; length && probe_kernel_address(rom++, c) == 0; length--)
+               sum += c;
+       return !length && !sum;
+}
+
+static void __init probe_roms(void)
+{
+       const unsigned char *rom;
+       unsigned long start, length, upper;
+       unsigned char c;
+       int i;
+
+       /* video rom */
+       upper = adapter_rom_resources[0].start;
+       for (start = video_rom_resource.start; start < upper; start += 2048) {
+               rom = isa_bus_to_virt(start);
+               if (!romsignature(rom))
+                       continue;
+
+               video_rom_resource.start = start;
+
+               if (probe_kernel_address(rom + 2, c) != 0)
+                       continue;
+
+               /* 0 < length <= 0x7f * 512, historically */
+               length = c * 512;
+
+               /* if checksum okay, trust length byte */
+               if (length && romchecksum(rom, length))
+                       video_rom_resource.end = start + length - 1;
+
+               request_resource(&iomem_resource, &video_rom_resource);
+               break;
+       }
+
+       start = (video_rom_resource.end + 1 + 2047) & ~2047UL;
+       if (start < upper)
+               start = upper;
+
+       /* system rom */
+       request_resource(&iomem_resource, &system_rom_resource);
+       upper = system_rom_resource.start;
+
+       /* check for extension rom (ignore length byte!) */
+       rom = isa_bus_to_virt(extension_rom_resource.start);
+       if (romsignature(rom)) {
+               length = extension_rom_resource.end - extension_rom_resource.start + 1;
+               if (romchecksum(rom, length)) {
+                       request_resource(&iomem_resource, &extension_rom_resource);
+                       upper = extension_rom_resource.start;
+               }
+       }
+
+       /* check for adapter roms on 2k boundaries */
+       for (i = 0; i < ARRAY_SIZE(adapter_rom_resources) && start < upper; start += 2048) {
+               rom = isa_bus_to_virt(start);
+               if (!romsignature(rom))
+                       continue;
+
+               if (probe_kernel_address(rom + 2, c) != 0)
+                       continue;
+
+               /* 0 < length <= 0x7f * 512, historically */
+               length = c * 512;
+
+               /* but accept any length that fits if checksum okay */
+               if (!length || start + length > upper || !romchecksum(rom, length))
+                       continue;
+
+               adapter_rom_resources[i].start = start;
+               adapter_rom_resources[i].end = start + length - 1;
+               request_resource(&iomem_resource, &adapter_rom_resources[i]);
+
+               start = adapter_rom_resources[i++].end & ~2047UL;
+       }
+}
+
+/*
+ * Request address space for all standard RAM and ROM resources
+ * and also for regions reported as reserved by the e820.
+ */
+static void __init
+legacy_init_iomem_resources(struct resource *code_resource, struct resource *data_resource)
+{
+       int i;
+
+       probe_roms();
+       for (i = 0; i < e820.nr_map; i++) {
+               struct resource *res;
+#ifndef CONFIG_RESOURCES_64BIT
+               if (e820.map[i].addr + e820.map[i].size > 0x100000000ULL)
+                       continue;
+#endif
+               res = kzalloc(sizeof(struct resource), GFP_ATOMIC);
+               switch (e820.map[i].type) {
+               case E820_RAM:  res->name = "System RAM"; break;
+               case E820_ACPI: res->name = "ACPI Tables"; break;
+               case E820_NVS:  res->name = "ACPI Non-volatile Storage"; break;
+               default:        res->name = "reserved";
+               }
+               res->start = e820.map[i].addr;
+               res->end = res->start + e820.map[i].size - 1;
+               res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
+               if (request_resource(&iomem_resource, res)) {
+                       kfree(res);
+                       continue;
+               }
+               if (e820.map[i].type == E820_RAM) {
+                       /*
+                        *  We don't know which RAM region contains kernel data,
+                        *  so we try it repeatedly and let the resource manager
+                        *  test it.
+                        */
+                       request_resource(res, code_resource);
+                       request_resource(res, data_resource);
+#ifdef CONFIG_KEXEC
+                       request_resource(res, &crashk_res);
+#endif
+               }
+       }
+}
+
+/*
+ * Request address space for all standard resources
+ *
+ * This is called just before pcibios_init(), which is also a
+ * subsys_initcall, but is linked in later (in arch/i386/pci/common.c).
+ */
+static int __init request_standard_resources(void)
+{
+       int i;
+
+       printk("Setting up standard PCI resources\n");
+       if (efi_enabled)
+               efi_initialize_iomem_resources(&code_resource, &data_resource);
+       else
+               legacy_init_iomem_resources(&code_resource, &data_resource);
+
+       /* EFI systems may still have VGA */
+       request_resource(&iomem_resource, &video_ram_resource);
+
+       /* request I/O space for devices used on all i[345]86 PCs */
+       for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++)
+               request_resource(&ioport_resource, &standard_io_resources[i]);
+       return 0;
+}
+
+subsys_initcall(request_standard_resources);
+
+#if defined(CONFIG_PM) && defined(CONFIG_HIBERNATION)
+/**
+ * e820_mark_nosave_regions - Find the ranges of physical addresses that do not
+ * correspond to e820 RAM areas and mark the corresponding pages as nosave for
+ * hibernation.
+ *
+ * This function requires the e820 map to be sorted and without any
+ * overlapping entries and assumes the first e820 area to be RAM.
+ */
+void __init e820_mark_nosave_regions(void)
+{
+       int i;
+       unsigned long pfn;
+
+       pfn = PFN_DOWN(e820.map[0].addr + e820.map[0].size);
+       for (i = 1; i < e820.nr_map; i++) {
+               struct e820entry *ei = &e820.map[i];
+
+               if (pfn < PFN_UP(ei->addr))
+                       register_nosave_region(pfn, PFN_UP(ei->addr));
+
+               pfn = PFN_DOWN(ei->addr + ei->size);
+               if (ei->type != E820_RAM)
+                       register_nosave_region(PFN_UP(ei->addr), pfn);
+
+               if (pfn >= max_low_pfn)
+                       break;
+       }
+}
+#endif
+
+void __init add_memory_region(unsigned long long start,
+                             unsigned long long size, int type)
+{
+       int x;
+
+       if (!efi_enabled) {
+                       x = e820.nr_map;
+
+               if (x == E820MAX) {
+                   printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
+                   return;
+               }
+
+               e820.map[x].addr = start;
+               e820.map[x].size = size;
+               e820.map[x].type = type;
+               e820.nr_map++;
+       }
+} /* add_memory_region */
+
+/*
+ * Sanitize the BIOS e820 map.
+ *
+ * Some e820 responses include overlapping entries.  The following
+ * replaces the original e820 map with a new one, removing overlaps.
+ *
+ */
+int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map)
+{
+       struct change_member *change_tmp;
+       unsigned long current_type, last_type;
+       unsigned long long last_addr;
+       int chgidx, still_changing;
+       int overlap_entries;
+       int new_bios_entry;
+       int old_nr, new_nr, chg_nr;
+       int i;
+
+       /*
+               Visually we're performing the following (1,2,3,4 = memory types)...
+
+               Sample memory map (w/overlaps):
+                  ____22__________________
+                  ______________________4_
+                  ____1111________________
+                  _44_____________________
+                  11111111________________
+                  ____________________33__
+                  ___________44___________
+                  __________33333_________
+                  ______________22________
+                  ___________________2222_
+                  _________111111111______
+                  _____________________11_
+                  _________________4______
+
+               Sanitized equivalent (no overlap):
+                  1_______________________
+                  _44_____________________
+                  ___1____________________
+                  ____22__________________
+                  ______11________________
+                  _________1______________
+                  __________3_____________
+                  ___________44___________
+                  _____________33_________
+                  _______________2________
+                  ________________1_______
+                  _________________4______
+                  ___________________2____
+                  ____________________33__
+                  ______________________4_
+       */
+       /* if there's only one memory region, don't bother */
+       if (*pnr_map < 2) {
+               return -1;
+       }
+
+       old_nr = *pnr_map;
+
+       /* bail out if we find any unreasonable addresses in bios map */
+       for (i=0; i<old_nr; i++)
+               if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr) {
+                       return -1;
+               }
+
+       /* create pointers for initial change-point information (for sorting) */
+       for (i=0; i < 2*old_nr; i++)
+               change_point[i] = &change_point_list[i];
+
+       /* record all known change-points (starting and ending addresses),
+          omitting those that are for empty memory regions */
+       chgidx = 0;
+       for (i=0; i < old_nr; i++)      {
+               if (biosmap[i].size != 0) {
+                       change_point[chgidx]->addr = biosmap[i].addr;
+                       change_point[chgidx++]->pbios = &biosmap[i];
+                       change_point[chgidx]->addr = biosmap[i].addr + biosmap[i].size;
+                       change_point[chgidx++]->pbios = &biosmap[i];
+               }
+       }
+       chg_nr = chgidx;        /* true number of change-points */
+
+       /* sort change-point list by memory addresses (low -> high) */
+       still_changing = 1;
+       while (still_changing)  {
+               still_changing = 0;
+               for (i=1; i < chg_nr; i++)  {
+                       /* if <current_addr> > <last_addr>, swap */
+                       /* or, if current=<start_addr> & last=<end_addr>, swap */
+                       if ((change_point[i]->addr < change_point[i-1]->addr) ||
+                               ((change_point[i]->addr == change_point[i-1]->addr) &&
+                                (change_point[i]->addr == change_point[i]->pbios->addr) &&
+                                (change_point[i-1]->addr != change_point[i-1]->pbios->addr))
+                          )
+                       {
+                               change_tmp = change_point[i];
+                               change_point[i] = change_point[i-1];
+                               change_point[i-1] = change_tmp;
+                               still_changing=1;
+                       }
+               }
+       }
+
+       /* create a new bios memory map, removing overlaps */
+       overlap_entries=0;       /* number of entries in the overlap table */
+       new_bios_entry=0;        /* index for creating new bios map entries */
+       last_type = 0;           /* start with undefined memory type */
+       last_addr = 0;           /* start with 0 as last starting address */
+       /* loop through change-points, determining affect on the new bios map */
+       for (chgidx=0; chgidx < chg_nr; chgidx++)
+       {
+               /* keep track of all overlapping bios entries */
+               if (change_point[chgidx]->addr == change_point[chgidx]->pbios->addr)
+               {
+                       /* add map entry to overlap list (> 1 entry implies an overlap) */
+                       overlap_list[overlap_entries++]=change_point[chgidx]->pbios;
+               }
+               else
+               {
+                       /* remove entry from list (order independent, so swap with last) */
+                       for (i=0; i<overlap_entries; i++)
+                       {
+                               if (overlap_list[i] == change_point[chgidx]->pbios)
+                                       overlap_list[i] = overlap_list[overlap_entries-1];
+                       }
+                       overlap_entries--;
+               }
+               /* if there are overlapping entries, decide which "type" to use */
+               /* (larger value takes precedence -- 1=usable, 2,3,4,4+=unusable) */
+               current_type = 0;
+               for (i=0; i<overlap_entries; i++)
+                       if (overlap_list[i]->type > current_type)
+                               current_type = overlap_list[i]->type;
+               /* continue building up new bios map based on this information */
+               if (current_type != last_type)  {
+                       if (last_type != 0)      {
+                               new_bios[new_bios_entry].size =
+                                       change_point[chgidx]->addr - last_addr;
+                               /* move forward only if the new size was non-zero */
+                               if (new_bios[new_bios_entry].size != 0)
+                                       if (++new_bios_entry >= E820MAX)
+                                               break;  /* no more space left for new bios entries */
+                       }
+                       if (current_type != 0)  {
+                               new_bios[new_bios_entry].addr = change_point[chgidx]->addr;
+                               new_bios[new_bios_entry].type = current_type;
+                               last_addr=change_point[chgidx]->addr;
+                       }
+                       last_type = current_type;
+               }
+       }
+       new_nr = new_bios_entry;   /* retain count for new bios entries */
+
+       /* copy new bios mapping into original location */
+       memcpy(biosmap, new_bios, new_nr*sizeof(struct e820entry));
+       *pnr_map = new_nr;
+
+       return 0;
+}
+
+/*
+ * Copy the BIOS e820 map into a safe place.
+ *
+ * Sanity-check it while we're at it..
+ *
+ * If we're lucky and live on a modern system, the setup code
+ * will have given us a memory map that we can use to properly
+ * set up memory.  If we aren't, we'll fake a memory map.
+ *
+ * We check to see that the memory map contains at least 2 elements
+ * before we'll use it, because the detection code in setup.S may
+ * not be perfect and most every PC known to man has two memory
+ * regions: one from 0 to 640k, and one from 1mb up.  (The IBM
+ * thinkpad 560x, for example, does not cooperate with the memory
+ * detection code.)
+ */
+int __init copy_e820_map(struct e820entry * biosmap, int nr_map)
+{
+       /* Only one memory region (or negative)? Ignore it */
+       if (nr_map < 2)
+               return -1;
+
+       do {
+               unsigned long long start = biosmap->addr;
+               unsigned long long size = biosmap->size;
+               unsigned long long end = start + size;
+               unsigned long type = biosmap->type;
+
+               /* Overflow in 64 bits? Ignore the memory map. */
+               if (start > end)
+                       return -1;
+
+               /*
+                * Some BIOSes claim RAM in the 640k - 1M region.
+                * Not right. Fix it up.
+                */
+               if (type == E820_RAM) {
+                       if (start < 0x100000ULL && end > 0xA0000ULL) {
+                               if (start < 0xA0000ULL)
+                                       add_memory_region(start, 0xA0000ULL-start, type);
+                               if (end <= 0x100000ULL)
+                                       continue;
+                               start = 0x100000ULL;
+                               size = end - start;
+                       }
+               }
+               add_memory_region(start, size, type);
+       } while (biosmap++,--nr_map);
+       return 0;
+}
+
+/*
+ * Callback for efi_memory_walk.
+ */
+static int __init
+efi_find_max_pfn(unsigned long start, unsigned long end, void *arg)
+{
+       unsigned long *max_pfn = arg, pfn;
+
+       if (start < end) {
+               pfn = PFN_UP(end -1);
+               if (pfn > *max_pfn)
+                       *max_pfn = pfn;
+       }
+       return 0;
+}
+
+static int __init
+efi_memory_present_wrapper(unsigned long start, unsigned long end, void *arg)
+{
+       memory_present(0, PFN_UP(start), PFN_DOWN(end));
+       return 0;
+}
+
+/*
+ * Find the highest page frame number we have available
+ */
+void __init find_max_pfn(void)
+{
+       int i;
+
+       max_pfn = 0;
+       if (efi_enabled) {
+               efi_memmap_walk(efi_find_max_pfn, &max_pfn);
+               efi_memmap_walk(efi_memory_present_wrapper, NULL);
+               return;
+       }
+
+       for (i = 0; i < e820.nr_map; i++) {
+               unsigned long start, end;
+               /* RAM? */
+               if (e820.map[i].type != E820_RAM)
+                       continue;
+               start = PFN_UP(e820.map[i].addr);
+               end = PFN_DOWN(e820.map[i].addr + e820.map[i].size);
+               if (start >= end)
+                       continue;
+               if (end > max_pfn)
+                       max_pfn = end;
+               memory_present(0, start, end);
+       }
+}
+
+/*
+ * Free all available memory for boot time allocation.  Used
+ * as a callback function by efi_memory_walk()
+ */
+
+static int __init
+free_available_memory(unsigned long start, unsigned long end, void *arg)
+{
+       /* check max_low_pfn */
+       if (start >= (max_low_pfn << PAGE_SHIFT))
+               return 0;
+       if (end >= (max_low_pfn << PAGE_SHIFT))
+               end = max_low_pfn << PAGE_SHIFT;
+       if (start < end)
+               free_bootmem(start, end - start);
+
+       return 0;
+}
+/*
+ * Register fully available low RAM pages with the bootmem allocator.
+ */
+void __init register_bootmem_low_pages(unsigned long max_low_pfn)
+{
+       int i;
+
+       if (efi_enabled) {
+               efi_memmap_walk(free_available_memory, NULL);
+               return;
+       }
+       for (i = 0; i < e820.nr_map; i++) {
+               unsigned long curr_pfn, last_pfn, size;
+               /*
+                * Reserve usable low memory
+                */
+               if (e820.map[i].type != E820_RAM)
+                       continue;
+               /*
+                * We are rounding up the start address of usable memory:
+                */
+               curr_pfn = PFN_UP(e820.map[i].addr);
+               if (curr_pfn >= max_low_pfn)
+                       continue;
+               /*
+                * ... and at the end of the usable range downwards:
+                */
+               last_pfn = PFN_DOWN(e820.map[i].addr + e820.map[i].size);
+
+               if (last_pfn > max_low_pfn)
+                       last_pfn = max_low_pfn;
+
+               /*
+                * .. finally, did all the rounding and playing
+                * around just make the area go away?
+                */
+               if (last_pfn <= curr_pfn)
+                       continue;
+
+               size = last_pfn - curr_pfn;
+               free_bootmem(PFN_PHYS(curr_pfn), PFN_PHYS(size));
+       }
+}
+
+void __init e820_register_memory(void)
+{
+       unsigned long gapstart, gapsize, round;
+       unsigned long long last;
+       int i;
+
+       /*
+        * Search for the bigest gap in the low 32 bits of the e820
+        * memory space.
+        */
+       last = 0x100000000ull;
+       gapstart = 0x10000000;
+       gapsize = 0x400000;
+       i = e820.nr_map;
+       while (--i >= 0) {
+               unsigned long long start = e820.map[i].addr;
+               unsigned long long end = start + e820.map[i].size;
+
+               /*
+                * Since "last" is at most 4GB, we know we'll
+                * fit in 32 bits if this condition is true
+                */
+               if (last > end) {
+                       unsigned long gap = last - end;
+
+                       if (gap > gapsize) {
+                               gapsize = gap;
+                               gapstart = end;
+                       }
+               }
+               if (start < last)
+                       last = start;
+       }
+
+       /*
+        * See how much we want to round up: start off with
+        * rounding to the next 1MB area.
+        */
+       round = 0x100000;
+       while ((gapsize >> 4) > round)
+               round += round;
+       /* Fun with two's complement */
+       pci_mem_start = (gapstart + round) & -round;
+
+       printk("Allocating PCI resources starting at %08lx (gap: %08lx:%08lx)\n",
+               pci_mem_start, gapstart, gapsize);
+}
+
+void __init print_memory_map(char *who)
+{
+       int i;
+
+       for (i = 0; i < e820.nr_map; i++) {
+               printk(" %s: %016Lx - %016Lx ", who,
+                       e820.map[i].addr,
+                       e820.map[i].addr + e820.map[i].size);
+               switch (e820.map[i].type) {
+               case E820_RAM:  printk("(usable)\n");
+                               break;
+               case E820_RESERVED:
+                               printk("(reserved)\n");
+                               break;
+               case E820_ACPI:
+                               printk("(ACPI data)\n");
+                               break;
+               case E820_NVS:
+                               printk("(ACPI NVS)\n");
+                               break;
+               default:        printk("type %u\n", e820.map[i].type);
+                               break;
+               }
+       }
+}
+
+static __init __always_inline void efi_limit_regions(unsigned long long size)
+{
+       unsigned long long current_addr = 0;
+       efi_memory_desc_t *md, *next_md;
+       void *p, *p1;
+       int i, j;
+
+       j = 0;
+       p1 = memmap.map;
+       for (p = p1, i = 0; p < memmap.map_end; p += memmap.desc_size, i++) {
+               md = p;
+               next_md = p1;
+               current_addr = md->phys_addr +
+                       PFN_PHYS(md->num_pages);
+               if (is_available_memory(md)) {
+                       if (md->phys_addr >= size) continue;
+                       memcpy(next_md, md, memmap.desc_size);
+                       if (current_addr >= size) {
+                               next_md->num_pages -=
+                                       PFN_UP(current_addr-size);
+                       }
+                       p1 += memmap.desc_size;
+                       next_md = p1;
+                       j++;
+               } else if ((md->attribute & EFI_MEMORY_RUNTIME) ==
+                          EFI_MEMORY_RUNTIME) {
+                       /* In order to make runtime services
+                        * available we have to include runtime
+                        * memory regions in memory map */
+                       memcpy(next_md, md, memmap.desc_size);
+                       p1 += memmap.desc_size;
+                       next_md = p1;
+                       j++;
+               }
+       }
+       memmap.nr_map = j;
+       memmap.map_end = memmap.map +
+               (memmap.nr_map * memmap.desc_size);
+}
+
+void __init limit_regions(unsigned long long size)
+{
+       unsigned long long current_addr;
+       int i;
+
+       print_memory_map("limit_regions start");
+       if (efi_enabled) {
+               efi_limit_regions(size);
+               return;
+       }
+       for (i = 0; i < e820.nr_map; i++) {
+               current_addr = e820.map[i].addr + e820.map[i].size;
+               if (current_addr < size)
+                       continue;
+
+               if (e820.map[i].type != E820_RAM)
+                       continue;
+
+               if (e820.map[i].addr >= size) {
+                       /*
+                        * This region starts past the end of the
+                        * requested size, skip it completely.
+                        */
+                       e820.nr_map = i;
+               } else {
+                       e820.nr_map = i + 1;
+                       e820.map[i].size -= current_addr - size;
+               }
+               print_memory_map("limit_regions endfor");
+               return;
+       }
+       print_memory_map("limit_regions endfunc");
+}
+
+/*
+ * This function checks if any part of the range <start,end> is mapped
+ * with type.
+ */
+int
+e820_any_mapped(u64 start, u64 end, unsigned type)
+{
+       int i;
+       for (i = 0; i < e820.nr_map; i++) {
+               const struct e820entry *ei = &e820.map[i];
+               if (type && ei->type != type)
+                       continue;
+               if (ei->addr >= end || ei->addr + ei->size <= start)
+                       continue;
+               return 1;
+       }
+       return 0;
+}
+EXPORT_SYMBOL_GPL(e820_any_mapped);
+
+ /*
+  * This function checks if the entire range <start,end> is mapped with type.
+  *
+  * Note: this function only works correct if the e820 table is sorted and
+  * not-overlapping, which is the case
+  */
+int __init
+e820_all_mapped(unsigned long s, unsigned long e, unsigned type)
+{
+       u64 start = s;
+       u64 end = e;
+       int i;
+       for (i = 0; i < e820.nr_map; i++) {
+               struct e820entry *ei = &e820.map[i];
+               if (type && ei->type != type)
+                       continue;
+               /* is the region (part) in overlap with the current region ?*/
+               if (ei->addr >= end || ei->addr + ei->size <= start)
+                       continue;
+               /* if the region is at the beginning of <start,end> we move
+                * start to the end of the region since it's ok until there
+                */
+               if (ei->addr <= start)
+                       start = ei->addr + ei->size;
+               /* if start is now at or beyond end, we're done, full
+                * coverage */
+               if (start >= end)
+                       return 1; /* we're done */
+       }
+       return 0;
+}
+
+static int __init parse_memmap(char *arg)
+{
+       if (!arg)
+               return -EINVAL;
+
+       if (strcmp(arg, "exactmap") == 0) {
+#ifdef CONFIG_CRASH_DUMP
+               /* If we are doing a crash dump, we
+                * still need to know the real mem
+                * size before original memory map is
+                * reset.
+                */
+               find_max_pfn();
+               saved_max_pfn = max_pfn;
+#endif
+               e820.nr_map = 0;
+               user_defined_memmap = 1;
+       } else {
+               /* If the user specifies memory size, we
+                * limit the BIOS-provided memory map to
+                * that size. exactmap can be used to specify
+                * the exact map. mem=number can be used to
+                * trim the existing memory map.
+                */
+               unsigned long long start_at, mem_size;
+
+               mem_size = memparse(arg, &arg);
+               if (*arg == '@') {
+                       start_at = memparse(arg+1, &arg);
+                       add_memory_region(start_at, mem_size, E820_RAM);
+               } else if (*arg == '#') {
+                       start_at = memparse(arg+1, &arg);
+                       add_memory_region(start_at, mem_size, E820_ACPI);
+               } else if (*arg == '$') {
+                       start_at = memparse(arg+1, &arg);
+                       add_memory_region(start_at, mem_size, E820_RESERVED);
+               } else {
+                       limit_regions(mem_size);
+                       user_defined_memmap = 1;
+               }
+       }
+       return 0;
+}
+early_param("memmap", parse_memmap);
diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c
new file mode 100644 (file)
index 0000000..92f812b
--- /dev/null
@@ -0,0 +1,2 @@
+
+#include "../../x86_64/kernel/early_printk.c"
diff --git a/arch/x86/kernel/efi_32.c b/arch/x86/kernel/efi_32.c
new file mode 100644 (file)
index 0000000..2452c6f
--- /dev/null
@@ -0,0 +1,712 @@
+/*
+ * Extensible Firmware Interface
+ *
+ * Based on Extensible Firmware Interface Specification version 1.0
+ *
+ * Copyright (C) 1999 VA Linux Systems
+ * Copyright (C) 1999 Walt Drummond <drummond@valinux.com>
+ * Copyright (C) 1999-2002 Hewlett-Packard Co.
+ *     David Mosberger-Tang <davidm@hpl.hp.com>
+ *     Stephane Eranian <eranian@hpl.hp.com>
+ *
+ * All EFI Runtime Services are not implemented yet as EFI only
+ * supports physical mode addressing on SoftSDV. This is to be fixed
+ * in a future version.  --drummond 1999-07-20
+ *
+ * Implemented EFI runtime services and virtual mode calls.  --davidm
+ *
+ * Goutham Rao: <goutham.rao@intel.com>
+ *     Skip non-WB memory and ignore empty memory ranges.
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/mm.h>
+#include <linux/types.h>
+#include <linux/time.h>
+#include <linux/spinlock.h>
+#include <linux/bootmem.h>
+#include <linux/ioport.h>
+#include <linux/module.h>
+#include <linux/efi.h>
+#include <linux/kexec.h>
+
+#include <asm/setup.h>
+#include <asm/io.h>
+#include <asm/page.h>
+#include <asm/pgtable.h>
+#include <asm/processor.h>
+#include <asm/desc.h>
+#include <asm/tlbflush.h>
+
+#define EFI_DEBUG      0
+#define PFX            "EFI: "
+
+extern efi_status_t asmlinkage efi_call_phys(void *, ...);
+
+struct efi efi;
+EXPORT_SYMBOL(efi);
+static struct efi efi_phys;
+struct efi_memory_map memmap;
+
+/*
+ * We require an early boot_ioremap mapping mechanism initially
+ */
+extern void * boot_ioremap(unsigned long, unsigned long);
+
+/*
+ * To make EFI call EFI runtime service in physical addressing mode we need
+ * prelog/epilog before/after the invocation to disable interrupt, to
+ * claim EFI runtime service handler exclusively and to duplicate a memory in
+ * low memory space say 0 - 3G.
+ */
+
+static unsigned long efi_rt_eflags;
+static DEFINE_SPINLOCK(efi_rt_lock);
+static pgd_t efi_bak_pg_dir_pointer[2];
+
+static void efi_call_phys_prelog(void) __acquires(efi_rt_lock)
+{
+       unsigned long cr4;
+       unsigned long temp;
+       struct Xgt_desc_struct gdt_descr;
+
+       spin_lock(&efi_rt_lock);
+       local_irq_save(efi_rt_eflags);
+
+       /*
+        * If I don't have PSE, I should just duplicate two entries in page
+        * directory. If I have PSE, I just need to duplicate one entry in
+        * page directory.
+        */
+       cr4 = read_cr4();
+
+       if (cr4 & X86_CR4_PSE) {
+               efi_bak_pg_dir_pointer[0].pgd =
+                   swapper_pg_dir[pgd_index(0)].pgd;
+               swapper_pg_dir[0].pgd =
+                   swapper_pg_dir[pgd_index(PAGE_OFFSET)].pgd;
+       } else {
+               efi_bak_pg_dir_pointer[0].pgd =
+                   swapper_pg_dir[pgd_index(0)].pgd;
+               efi_bak_pg_dir_pointer[1].pgd =
+                   swapper_pg_dir[pgd_index(0x400000)].pgd;
+               swapper_pg_dir[pgd_index(0)].pgd =
+                   swapper_pg_dir[pgd_index(PAGE_OFFSET)].pgd;
+               temp = PAGE_OFFSET + 0x400000;
+               swapper_pg_dir[pgd_index(0x400000)].pgd =
+                   swapper_pg_dir[pgd_index(temp)].pgd;
+       }
+
+       /*
+        * After the lock is released, the original page table is restored.
+        */
+       local_flush_tlb();
+
+       gdt_descr.address = __pa(get_cpu_gdt_table(0));
+       gdt_descr.size = GDT_SIZE - 1;
+       load_gdt(&gdt_descr);
+}
+
+static void efi_call_phys_epilog(void) __releases(efi_rt_lock)
+{
+       unsigned long cr4;
+       struct Xgt_desc_struct gdt_descr;
+
+       gdt_descr.address = (unsigned long)get_cpu_gdt_table(0);
+       gdt_descr.size = GDT_SIZE - 1;
+       load_gdt(&gdt_descr);
+
+       cr4 = read_cr4();
+
+       if (cr4 & X86_CR4_PSE) {
+               swapper_pg_dir[pgd_index(0)].pgd =
+                   efi_bak_pg_dir_pointer[0].pgd;
+       } else {
+               swapper_pg_dir[pgd_index(0)].pgd =
+                   efi_bak_pg_dir_pointer[0].pgd;
+               swapper_pg_dir[pgd_index(0x400000)].pgd =
+                   efi_bak_pg_dir_pointer[1].pgd;
+       }
+
+       /*
+        * After the lock is released, the original page table is restored.
+        */
+       local_flush_tlb();
+
+       local_irq_restore(efi_rt_eflags);
+       spin_unlock(&efi_rt_lock);
+}
+
+static efi_status_t
+phys_efi_set_virtual_address_map(unsigned long memory_map_size,
+                                unsigned long descriptor_size,
+                                u32 descriptor_version,
+                                efi_memory_desc_t *virtual_map)
+{
+       efi_status_t status;
+
+       efi_call_phys_prelog();
+       status = efi_call_phys(efi_phys.set_virtual_address_map,
+                                    memory_map_size, descriptor_size,
+                                    descriptor_version, virtual_map);
+       efi_call_phys_epilog();
+       return status;
+}
+
+static efi_status_t
+phys_efi_get_time(efi_time_t *tm, efi_time_cap_t *tc)
+{
+       efi_status_t status;
+
+       efi_call_phys_prelog();
+       status = efi_call_phys(efi_phys.get_time, tm, tc);
+       efi_call_phys_epilog();
+       return status;
+}
+
+inline int efi_set_rtc_mmss(unsigned long nowtime)
+{
+       int real_seconds, real_minutes;
+       efi_status_t    status;
+       efi_time_t      eft;
+       efi_time_cap_t  cap;
+
+       spin_lock(&efi_rt_lock);
+       status = efi.get_time(&eft, &cap);
+       spin_unlock(&efi_rt_lock);
+       if (status != EFI_SUCCESS)
+               panic("Ooops, efitime: can't read time!\n");
+       real_seconds = nowtime % 60;
+       real_minutes = nowtime / 60;
+
+       if (((abs(real_minutes - eft.minute) + 15)/30) & 1)
+               real_minutes += 30;
+       real_minutes %= 60;
+
+       eft.minute = real_minutes;
+       eft.second = real_seconds;
+
+       if (status != EFI_SUCCESS) {
+               printk("Ooops: efitime: can't read time!\n");
+               return -1;
+       }
+       return 0;
+}
+/*
+ * This is used during kernel init before runtime
+ * services have been remapped and also during suspend, therefore,
+ * we'll need to call both in physical and virtual modes.
+ */
+inline unsigned long efi_get_time(void)
+{
+       efi_status_t status;
+       efi_time_t eft;
+       efi_time_cap_t cap;
+
+       if (efi.get_time) {
+               /* if we are in virtual mode use remapped function */
+               status = efi.get_time(&eft, &cap);
+       } else {
+               /* we are in physical mode */
+               status = phys_efi_get_time(&eft, &cap);
+       }
+
+       if (status != EFI_SUCCESS)
+               printk("Oops: efitime: can't read time status: 0x%lx\n",status);
+
+       return mktime(eft.year, eft.month, eft.day, eft.hour,
+                       eft.minute, eft.second);
+}
+
+int is_available_memory(efi_memory_desc_t * md)
+{
+       if (!(md->attribute & EFI_MEMORY_WB))
+               return 0;
+
+       switch (md->type) {
+               case EFI_LOADER_CODE:
+               case EFI_LOADER_DATA:
+               case EFI_BOOT_SERVICES_CODE:
+               case EFI_BOOT_SERVICES_DATA:
+               case EFI_CONVENTIONAL_MEMORY:
+                       return 1;
+       }
+       return 0;
+}
+
+/*
+ * We need to map the EFI memory map again after paging_init().
+ */
+void __init efi_map_memmap(void)
+{
+       memmap.map = NULL;
+
+       memmap.map = bt_ioremap((unsigned long) memmap.phys_map,
+                       (memmap.nr_map * memmap.desc_size));
+       if (memmap.map == NULL)
+               printk(KERN_ERR PFX "Could not remap the EFI memmap!\n");
+
+       memmap.map_end = memmap.map + (memmap.nr_map * memmap.desc_size);
+}
+
+#if EFI_DEBUG
+static void __init print_efi_memmap(void)
+{
+       efi_memory_desc_t *md;
+       void *p;
+       int i;
+
+       for (p = memmap.map, i = 0; p < memmap.map_end; p += memmap.desc_size, i++) {
+               md = p;
+               printk(KERN_INFO "mem%02u: type=%u, attr=0x%llx, "
+                       "range=[0x%016llx-0x%016llx) (%lluMB)\n",
+                       i, md->type, md->attribute, md->phys_addr,
+                       md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT),
+                       (md->num_pages >> (20 - EFI_PAGE_SHIFT)));
+       }
+}
+#endif  /*  EFI_DEBUG  */
+
+/*
+ * Walks the EFI memory map and calls CALLBACK once for each EFI
+ * memory descriptor that has memory that is available for kernel use.
+ */
+void efi_memmap_walk(efi_freemem_callback_t callback, void *arg)
+{
+       int prev_valid = 0;
+       struct range {
+               unsigned long start;
+               unsigned long end;
+       } uninitialized_var(prev), curr;
+       efi_memory_desc_t *md;
+       unsigned long start, end;
+       void *p;
+
+       for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
+               md = p;
+
+               if ((md->num_pages == 0) || (!is_available_memory(md)))
+                       continue;
+
+               curr.start = md->phys_addr;
+               curr.end = curr.start + (md->num_pages << EFI_PAGE_SHIFT);
+
+               if (!prev_valid) {
+                       prev = curr;
+                       prev_valid = 1;
+               } else {
+                       if (curr.start < prev.start)
+                               printk(KERN_INFO PFX "Unordered memory map\n");
+                       if (prev.end == curr.start)
+                               prev.end = curr.end;
+                       else {
+                               start =
+                                   (unsigned long) (PAGE_ALIGN(prev.start));
+                               end = (unsigned long) (prev.end & PAGE_MASK);
+                               if ((end > start)
+                                   && (*callback) (start, end, arg) < 0)
+                                       return;
+                               prev = curr;
+                       }
+               }
+       }
+       if (prev_valid) {
+               start = (unsigned long) PAGE_ALIGN(prev.start);
+               end = (unsigned long) (prev.end & PAGE_MASK);
+               if (end > start)
+                       (*callback) (start, end, arg);
+       }
+}
+
+void __init efi_init(void)
+{
+       efi_config_table_t *config_tables;
+       efi_runtime_services_t *runtime;
+       efi_char16_t *c16;
+       char vendor[100] = "unknown";
+       unsigned long num_config_tables;
+       int i = 0;
+
+       memset(&efi, 0, sizeof(efi) );
+       memset(&efi_phys, 0, sizeof(efi_phys));
+
+       efi_phys.systab = EFI_SYSTAB;
+       memmap.phys_map = EFI_MEMMAP;
+       memmap.nr_map = EFI_MEMMAP_SIZE/EFI_MEMDESC_SIZE;
+       memmap.desc_version = EFI_MEMDESC_VERSION;
+       memmap.desc_size = EFI_MEMDESC_SIZE;
+
+       efi.systab = (efi_system_table_t *)
+               boot_ioremap((unsigned long) efi_phys.systab,
+                       sizeof(efi_system_table_t));
+       /*
+        * Verify the EFI Table
+        */
+       if (efi.systab == NULL)
+               printk(KERN_ERR PFX "Woah! Couldn't map the EFI system table.\n");
+       if (efi.systab->hdr.signature != EFI_SYSTEM_TABLE_SIGNATURE)
+               printk(KERN_ERR PFX "Woah! EFI system table signature incorrect\n");
+       if ((efi.systab->hdr.revision >> 16) == 0)
+               printk(KERN_ERR PFX "Warning: EFI system table version "
+                      "%d.%02d, expected 1.00 or greater\n",
+                      efi.systab->hdr.revision >> 16,
+                      efi.systab->hdr.revision & 0xffff);
+
+       /*
+        * Grab some details from the system table
+        */
+       num_config_tables = efi.systab->nr_tables;
+       config_tables = (efi_config_table_t *)efi.systab->tables;
+       runtime = efi.systab->runtime;
+
+       /*
+        * Show what we know for posterity
+        */
+       c16 = (efi_char16_t *) boot_ioremap(efi.systab->fw_vendor, 2);
+       if (c16) {
+               for (i = 0; i < (sizeof(vendor) - 1) && *c16; ++i)
+                       vendor[i] = *c16++;
+               vendor[i] = '\0';
+       } else
+               printk(KERN_ERR PFX "Could not map the firmware vendor!\n");
+
+       printk(KERN_INFO PFX "EFI v%u.%.02u by %s \n",
+              efi.systab->hdr.revision >> 16,
+              efi.systab->hdr.revision & 0xffff, vendor);
+
+       /*
+        * Let's see what config tables the firmware passed to us.
+        */
+       config_tables = (efi_config_table_t *)
+                               boot_ioremap((unsigned long) config_tables,
+                               num_config_tables * sizeof(efi_config_table_t));
+
+       if (config_tables == NULL)
+               printk(KERN_ERR PFX "Could not map EFI Configuration Table!\n");
+
+       efi.mps        = EFI_INVALID_TABLE_ADDR;
+       efi.acpi       = EFI_INVALID_TABLE_ADDR;
+       efi.acpi20     = EFI_INVALID_TABLE_ADDR;
+       efi.smbios     = EFI_INVALID_TABLE_ADDR;
+       efi.sal_systab = EFI_INVALID_TABLE_ADDR;
+       efi.boot_info  = EFI_INVALID_TABLE_ADDR;
+       efi.hcdp       = EFI_INVALID_TABLE_ADDR;
+       efi.uga        = EFI_INVALID_TABLE_ADDR;
+
+       for (i = 0; i < num_config_tables; i++) {
+               if (efi_guidcmp(config_tables[i].guid, MPS_TABLE_GUID) == 0) {
+                       efi.mps = config_tables[i].table;
+                       printk(KERN_INFO " MPS=0x%lx ", config_tables[i].table);
+               } else
+                   if (efi_guidcmp(config_tables[i].guid, ACPI_20_TABLE_GUID) == 0) {
+                       efi.acpi20 = config_tables[i].table;
+                       printk(KERN_INFO " ACPI 2.0=0x%lx ", config_tables[i].table);
+               } else
+                   if (efi_guidcmp(config_tables[i].guid, ACPI_TABLE_GUID) == 0) {
+                       efi.acpi = config_tables[i].table;
+                       printk(KERN_INFO " ACPI=0x%lx ", config_tables[i].table);
+               } else
+                   if (efi_guidcmp(config_tables[i].guid, SMBIOS_TABLE_GUID) == 0) {
+                       efi.smbios = config_tables[i].table;
+                       printk(KERN_INFO " SMBIOS=0x%lx ", config_tables[i].table);
+               } else
+                   if (efi_guidcmp(config_tables[i].guid, HCDP_TABLE_GUID) == 0) {
+                       efi.hcdp = config_tables[i].table;
+                       printk(KERN_INFO " HCDP=0x%lx ", config_tables[i].table);
+               } else
+                   if (efi_guidcmp(config_tables[i].guid, UGA_IO_PROTOCOL_GUID) == 0) {
+                       efi.uga = config_tables[i].table;
+                       printk(KERN_INFO " UGA=0x%lx ", config_tables[i].table);
+               }
+       }
+       printk("\n");
+
+       /*
+        * Check out the runtime services table. We need to map
+        * the runtime services table so that we can grab the physical
+        * address of several of the EFI runtime functions, needed to
+        * set the firmware into virtual mode.
+        */
+
+       runtime = (efi_runtime_services_t *) boot_ioremap((unsigned long)
+                                               runtime,
+                                               sizeof(efi_runtime_services_t));
+       if (runtime != NULL) {
+               /*
+                * We will only need *early* access to the following
+                * two EFI runtime services before set_virtual_address_map
+                * is invoked.
+                */
+               efi_phys.get_time = (efi_get_time_t *) runtime->get_time;
+               efi_phys.set_virtual_address_map =
+                       (efi_set_virtual_address_map_t *)
+                               runtime->set_virtual_address_map;
+       } else
+               printk(KERN_ERR PFX "Could not map the runtime service table!\n");
+
+       /* Map the EFI memory map for use until paging_init() */
+       memmap.map = boot_ioremap((unsigned long) EFI_MEMMAP, EFI_MEMMAP_SIZE);
+       if (memmap.map == NULL)
+               printk(KERN_ERR PFX "Could not map the EFI memory map!\n");
+
+       memmap.map_end = memmap.map + (memmap.nr_map * memmap.desc_size);
+
+#if EFI_DEBUG
+       print_efi_memmap();
+#endif
+}
+
+static inline void __init check_range_for_systab(efi_memory_desc_t *md)
+{
+       if (((unsigned long)md->phys_addr <= (unsigned long)efi_phys.systab) &&
+               ((unsigned long)efi_phys.systab < md->phys_addr +
+               ((unsigned long)md->num_pages << EFI_PAGE_SHIFT))) {
+               unsigned long addr;
+
+               addr = md->virt_addr - md->phys_addr +
+                       (unsigned long)efi_phys.systab;
+               efi.systab = (efi_system_table_t *)addr;
+       }
+}
+
+/*
+ * Wrap all the virtual calls in a way that forces the parameters on the stack.
+ */
+
+#define efi_call_virt(f, args...) \
+     ((efi_##f##_t __attribute__((regparm(0)))*)efi.systab->runtime->f)(args)
+
+static efi_status_t virt_efi_get_time(efi_time_t *tm, efi_time_cap_t *tc)
+{
+       return efi_call_virt(get_time, tm, tc);
+}
+
+static efi_status_t virt_efi_set_time (efi_time_t *tm)
+{
+       return efi_call_virt(set_time, tm);
+}
+
+static efi_status_t virt_efi_get_wakeup_time (efi_bool_t *enabled,
+                                             efi_bool_t *pending,
+                                             efi_time_t *tm)
+{
+       return efi_call_virt(get_wakeup_time, enabled, pending, tm);
+}
+
+static efi_status_t virt_efi_set_wakeup_time (efi_bool_t enabled,
+                                             efi_time_t *tm)
+{
+       return efi_call_virt(set_wakeup_time, enabled, tm);
+}
+
+static efi_status_t virt_efi_get_variable (efi_char16_t *name,
+                                          efi_guid_t *vendor, u32 *attr,
+                                          unsigned long *data_size, void *data)
+{
+       return efi_call_virt(get_variable, name, vendor, attr, data_size, data);
+}
+
+static efi_status_t virt_efi_get_next_variable (unsigned long *name_size,
+                                               efi_char16_t *name,
+                                               efi_guid_t *vendor)
+{
+       return efi_call_virt(get_next_variable, name_size, name, vendor);
+}
+
+static efi_status_t virt_efi_set_variable (efi_char16_t *name,
+                                          efi_guid_t *vendor,
+                                          unsigned long attr,
+                                          unsigned long data_size, void *data)
+{
+       return efi_call_virt(set_variable, name, vendor, attr, data_size, data);
+}
+
+static efi_status_t virt_efi_get_next_high_mono_count (u32 *count)
+{
+       return efi_call_virt(get_next_high_mono_count, count);
+}
+
+static void virt_efi_reset_system (int reset_type, efi_status_t status,
+                                  unsigned long data_size,
+                                  efi_char16_t *data)
+{
+       efi_call_virt(reset_system, reset_type, status, data_size, data);
+}
+
+/*
+ * This function will switch the EFI runtime services to virtual mode.
+ * Essentially, look through the EFI memmap and map every region that
+ * has the runtime attribute bit set in its memory descriptor and update
+ * that memory descriptor with the virtual address obtained from ioremap().
+ * This enables the runtime services to be called without having to
+ * thunk back into physical mode for every invocation.
+ */
+
+void __init efi_enter_virtual_mode(void)
+{
+       efi_memory_desc_t *md;
+       efi_status_t status;
+       void *p;
+
+       efi.systab = NULL;
+
+       for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
+               md = p;
+
+               if (!(md->attribute & EFI_MEMORY_RUNTIME))
+                       continue;
+
+               md->virt_addr = (unsigned long)ioremap(md->phys_addr,
+                       md->num_pages << EFI_PAGE_SHIFT);
+               if (!(unsigned long)md->virt_addr) {
+                       printk(KERN_ERR PFX "ioremap of 0x%lX failed\n",
+                               (unsigned long)md->phys_addr);
+               }
+               /* update the virtual address of the EFI system table */
+               check_range_for_systab(md);
+       }
+
+       BUG_ON(!efi.systab);
+
+       status = phys_efi_set_virtual_address_map(
+                       memmap.desc_size * memmap.nr_map,
+                       memmap.desc_size,
+                       memmap.desc_version,
+                       memmap.phys_map);
+
+       if (status != EFI_SUCCESS) {
+               printk (KERN_ALERT "You are screwed! "
+                       "Unable to switch EFI into virtual mode "
+                       "(status=%lx)\n", status);
+               panic("EFI call to SetVirtualAddressMap() failed!");
+       }
+
+       /*
+        * Now that EFI is in virtual mode, update the function
+        * pointers in the runtime service table to the new virtual addresses.
+        */
+
+       efi.get_time = virt_efi_get_time;
+       efi.set_time = virt_efi_set_time;
+       efi.get_wakeup_time = virt_efi_get_wakeup_time;
+       efi.set_wakeup_time = virt_efi_set_wakeup_time;
+       efi.get_variable = virt_efi_get_variable;
+       efi.get_next_variable = virt_efi_get_next_variable;
+       efi.set_variable = virt_efi_set_variable;
+       efi.get_next_high_mono_count = virt_efi_get_next_high_mono_count;
+       efi.reset_system = virt_efi_reset_system;
+}
+
+void __init
+efi_initialize_iomem_resources(struct resource *code_resource,
+                              struct resource *data_resource)
+{
+       struct resource *res;
+       efi_memory_desc_t *md;
+       void *p;
+
+       for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
+               md = p;
+
+               if ((md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT)) >
+                   0x100000000ULL)
+                       continue;
+               res = kzalloc(sizeof(struct resource), GFP_ATOMIC);
+               switch (md->type) {
+               case EFI_RESERVED_TYPE:
+                       res->name = "Reserved Memory";
+                       break;
+               case EFI_LOADER_CODE:
+                       res->name = "Loader Code";
+                       break;
+               case EFI_LOADER_DATA:
+                       res->name = "Loader Data";
+                       break;
+               case EFI_BOOT_SERVICES_DATA:
+                       res->name = "BootServices Data";
+                       break;
+               case EFI_BOOT_SERVICES_CODE:
+                       res->name = "BootServices Code";
+                       break;
+               case EFI_RUNTIME_SERVICES_CODE:
+                       res->name = "Runtime Service Code";
+                       break;
+               case EFI_RUNTIME_SERVICES_DATA:
+                       res->name = "Runtime Service Data";
+                       break;
+               case EFI_CONVENTIONAL_MEMORY:
+                       res->name = "Conventional Memory";
+                       break;
+               case EFI_UNUSABLE_MEMORY:
+                       res->name = "Unusable Memory";
+                       break;
+               case EFI_ACPI_RECLAIM_MEMORY:
+                       res->name = "ACPI Reclaim";
+                       break;
+               case EFI_ACPI_MEMORY_NVS:
+                       res->name = "ACPI NVS";
+                       break;
+               case EFI_MEMORY_MAPPED_IO:
+                       res->name = "Memory Mapped IO";
+                       break;
+               case EFI_MEMORY_MAPPED_IO_PORT_SPACE:
+                       res->name = "Memory Mapped IO Port Space";
+                       break;
+               default:
+                       res->name = "Reserved";
+                       break;
+               }
+               res->start = md->phys_addr;
+               res->end = res->start + ((md->num_pages << EFI_PAGE_SHIFT) - 1);
+               res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
+               if (request_resource(&iomem_resource, res) < 0)
+                       printk(KERN_ERR PFX "Failed to allocate res %s : "
+                               "0x%llx-0x%llx\n", res->name,
+                               (unsigned long long)res->start,
+                               (unsigned long long)res->end);
+               /*
+                * We don't know which region contains kernel data so we try
+                * it repeatedly and let the resource manager test it.
+                */
+               if (md->type == EFI_CONVENTIONAL_MEMORY) {
+                       request_resource(res, code_resource);
+                       request_resource(res, data_resource);
+#ifdef CONFIG_KEXEC
+                       request_resource(res, &crashk_res);
+#endif
+               }
+       }
+}
+
+/*
+ * Convenience functions to obtain memory types and attributes
+ */
+
+u32 efi_mem_type(unsigned long phys_addr)
+{
+       efi_memory_desc_t *md;
+       void *p;
+
+       for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
+               md = p;
+               if ((md->phys_addr <= phys_addr) && (phys_addr <
+                       (md->phys_addr + (md-> num_pages << EFI_PAGE_SHIFT)) ))
+                       return md->type;
+       }
+       return 0;
+}
+
+u64 efi_mem_attributes(unsigned long phys_addr)
+{
+       efi_memory_desc_t *md;
+       void *p;
+
+       for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
+               md = p;
+               if ((md->phys_addr <= phys_addr) && (phys_addr <
+                       (md->phys_addr + (md-> num_pages << EFI_PAGE_SHIFT)) ))
+                       return md->attribute;
+       }
+       return 0;
+}
diff --git a/arch/x86/kernel/efi_stub_32.S b/arch/x86/kernel/efi_stub_32.S
new file mode 100644 (file)
index 0000000..ef00bb7
--- /dev/null
@@ -0,0 +1,122 @@
+/*
+ * EFI call stub for IA32.
+ *
+ * This stub allows us to make EFI calls in physical mode with interrupts
+ * turned off.
+ */
+
+#include <linux/linkage.h>
+#include <asm/page.h>
+
+/*
+ * efi_call_phys(void *, ...) is a function with variable parameters.
+ * All the callers of this function assure that all the parameters are 4-bytes.
+ */
+
+/*
+ * In gcc calling convention, EBX, ESP, EBP, ESI and EDI are all callee save.
+ * So we'd better save all of them at the beginning of this function and restore
+ * at the end no matter how many we use, because we can not assure EFI runtime
+ * service functions will comply with gcc calling convention, too.
+ */
+
+.text
+ENTRY(efi_call_phys)
+       /*
+        * 0. The function can only be called in Linux kernel. So CS has been
+        * set to 0x0010, DS and SS have been set to 0x0018. In EFI, I found
+        * the values of these registers are the same. And, the corresponding
+        * GDT entries are identical. So I will do nothing about segment reg
+        * and GDT, but change GDT base register in prelog and epilog.
+        */
+
+       /*
+        * 1. Now I am running with EIP = <physical address> + PAGE_OFFSET.
+        * But to make it smoothly switch from virtual mode to flat mode.
+        * The mapping of lower virtual memory has been created in prelog and
+        * epilog.
+        */
+       movl    $1f, %edx
+       subl    $__PAGE_OFFSET, %edx
+       jmp     *%edx
+1:
+
+       /*
+        * 2. Now on the top of stack is the return
+        * address in the caller of efi_call_phys(), then parameter 1,
+        * parameter 2, ..., param n. To make things easy, we save the return
+        * address of efi_call_phys in a global variable.
+        */
+       popl    %edx
+       movl    %edx, saved_return_addr
+       /* get the function pointer into ECX*/
+       popl    %ecx
+       movl    %ecx, efi_rt_function_ptr
+       movl    $2f, %edx
+       subl    $__PAGE_OFFSET, %edx
+       pushl   %edx
+
+       /*
+        * 3. Clear PG bit in %CR0.
+        */
+       movl    %cr0, %edx
+       andl    $0x7fffffff, %edx
+       movl    %edx, %cr0
+       jmp     1f
+1:
+
+       /*
+        * 4. Adjust stack pointer.
+        */
+       subl    $__PAGE_OFFSET, %esp
+
+       /*
+        * 5. Call the physical function.
+        */
+       jmp     *%ecx
+
+2:
+       /*
+        * 6. After EFI runtime service returns, control will return to
+        * following instruction. We'd better readjust stack pointer first.
+        */
+       addl    $__PAGE_OFFSET, %esp
+
+       /*
+        * 7. Restore PG bit
+        */
+       movl    %cr0, %edx
+       orl     $0x80000000, %edx
+       movl    %edx, %cr0
+       jmp     1f
+1:
+       /*
+        * 8. Now restore the virtual mode from flat mode by
+        * adding EIP with PAGE_OFFSET.
+        */
+       movl    $1f, %edx
+       jmp     *%edx
+1:
+
+       /*
+        * 9. Balance the stack. And because EAX contain the return value,
+        * we'd better not clobber it.
+        */
+       leal    efi_rt_function_ptr, %edx
+       movl    (%edx), %ecx
+       pushl   %ecx
+
+       /*
+        * 10. Push the saved return address onto the stack and return.
+        */
+       leal    saved_return_addr, %edx
+       movl    (%edx), %ecx
+       pushl   %ecx
+       ret
+.previous
+
+.data
+saved_return_addr:
+       .long 0
+efi_rt_function_ptr:
+       .long 0
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
new file mode 100644 (file)
index 0000000..290b7bc
--- /dev/null
@@ -0,0 +1,1112 @@
+/*
+ *  linux/arch/i386/entry.S
+ *
+ *  Copyright (C) 1991, 1992  Linus Torvalds
+ */
+
+/*
+ * entry.S contains the system-call and fault low-level handling routines.
+ * This also contains the timer-interrupt handler, as well as all interrupts
+ * and faults that can result in a task-switch.
+ *
+ * NOTE: This code handles signal-recognition, which happens every time
+ * after a timer-interrupt and after each system call.
+ *
+ * I changed all the .align's to 4 (16 byte alignment), as that's faster
+ * on a 486.
+ *
+ * Stack layout in 'syscall_exit':
+ *     ptrace needs to have all regs on the stack.
+ *     if the order here is changed, it needs to be
+ *     updated in fork.c:copy_process, signal.c:do_signal,
+ *     ptrace.c and ptrace.h
+ *
+ *      0(%esp) - %ebx
+ *      4(%esp) - %ecx
+ *      8(%esp) - %edx
+ *       C(%esp) - %esi
+ *     10(%esp) - %edi
+ *     14(%esp) - %ebp
+ *     18(%esp) - %eax
+ *     1C(%esp) - %ds
+ *     20(%esp) - %es
+ *     24(%esp) - %fs
+ *     28(%esp) - orig_eax
+ *     2C(%esp) - %eip
+ *     30(%esp) - %cs
+ *     34(%esp) - %eflags
+ *     38(%esp) - %oldesp
+ *     3C(%esp) - %oldss
+ *
+ * "current" is in register %ebx during any slow entries.
+ */
+
+#include <linux/linkage.h>
+#include <asm/thread_info.h>
+#include <asm/irqflags.h>
+#include <asm/errno.h>
+#include <asm/segment.h>
+#include <asm/smp.h>
+#include <asm/page.h>
+#include <asm/desc.h>
+#include <asm/percpu.h>
+#include <asm/dwarf2.h>
+#include "irq_vectors.h"
+
+/*
+ * We use macros for low-level operations which need to be overridden
+ * for paravirtualization.  The following will never clobber any registers:
+ *   INTERRUPT_RETURN (aka. "iret")
+ *   GET_CR0_INTO_EAX (aka. "movl %cr0, %eax")
+ *   ENABLE_INTERRUPTS_SYSEXIT (aka "sti; sysexit").
+ *
+ * For DISABLE_INTERRUPTS/ENABLE_INTERRUPTS (aka "cli"/"sti"), you must
+ * specify what registers can be overwritten (CLBR_NONE, CLBR_EAX/EDX/ECX/ANY).
+ * Allowing a register to be clobbered can shrink the paravirt replacement
+ * enough to patch inline, increasing performance.
+ */
+
+#define nr_syscalls ((syscall_table_size)/4)
+
+CF_MASK                = 0x00000001
+TF_MASK                = 0x00000100
+IF_MASK                = 0x00000200
+DF_MASK                = 0x00000400 
+NT_MASK                = 0x00004000
+VM_MASK                = 0x00020000
+
+#ifdef CONFIG_PREEMPT
+#define preempt_stop(clobbers) DISABLE_INTERRUPTS(clobbers); TRACE_IRQS_OFF
+#else
+#define preempt_stop(clobbers)
+#define resume_kernel          restore_nocheck
+#endif
+
+.macro TRACE_IRQS_IRET
+#ifdef CONFIG_TRACE_IRQFLAGS
+       testl $IF_MASK,PT_EFLAGS(%esp)     # interrupts off?
+       jz 1f
+       TRACE_IRQS_ON
+1:
+#endif
+.endm
+
+#ifdef CONFIG_VM86
+#define resume_userspace_sig   check_userspace
+#else
+#define resume_userspace_sig   resume_userspace
+#endif
+
+#define SAVE_ALL \
+       cld; \
+       pushl %fs; \
+       CFI_ADJUST_CFA_OFFSET 4;\
+       /*CFI_REL_OFFSET fs, 0;*/\
+       pushl %es; \
+       CFI_ADJUST_CFA_OFFSET 4;\
+       /*CFI_REL_OFFSET es, 0;*/\
+       pushl %ds; \
+       CFI_ADJUST_CFA_OFFSET 4;\
+       /*CFI_REL_OFFSET ds, 0;*/\
+       pushl %eax; \
+       CFI_ADJUST_CFA_OFFSET 4;\
+       CFI_REL_OFFSET eax, 0;\
+       pushl %ebp; \
+       CFI_ADJUST_CFA_OFFSET 4;\
+       CFI_REL_OFFSET ebp, 0;\
+       pushl %edi; \
+       CFI_ADJUST_CFA_OFFSET 4;\
+       CFI_REL_OFFSET edi, 0;\
+       pushl %esi; \
+       CFI_ADJUST_CFA_OFFSET 4;\
+       CFI_REL_OFFSET esi, 0;\
+       pushl %edx; \
+       CFI_ADJUST_CFA_OFFSET 4;\
+       CFI_REL_OFFSET edx, 0;\
+       pushl %ecx; \
+       CFI_ADJUST_CFA_OFFSET 4;\
+       CFI_REL_OFFSET ecx, 0;\
+       pushl %ebx; \
+       CFI_ADJUST_CFA_OFFSET 4;\
+       CFI_REL_OFFSET ebx, 0;\
+       movl $(__USER_DS), %edx; \
+       movl %edx, %ds; \
+       movl %edx, %es; \
+       movl $(__KERNEL_PERCPU), %edx; \
+       movl %edx, %fs
+
+#define RESTORE_INT_REGS \
+       popl %ebx;      \
+       CFI_ADJUST_CFA_OFFSET -4;\
+       CFI_RESTORE ebx;\
+       popl %ecx;      \
+       CFI_ADJUST_CFA_OFFSET -4;\
+       CFI_RESTORE ecx;\
+       popl %edx;      \
+       CFI_ADJUST_CFA_OFFSET -4;\
+       CFI_RESTORE edx;\
+       popl %esi;      \
+       CFI_ADJUST_CFA_OFFSET -4;\
+       CFI_RESTORE esi;\
+       popl %edi;      \
+       CFI_ADJUST_CFA_OFFSET -4;\
+       CFI_RESTORE edi;\
+       popl %ebp;      \
+       CFI_ADJUST_CFA_OFFSET -4;\
+       CFI_RESTORE ebp;\
+       popl %eax;      \
+       CFI_ADJUST_CFA_OFFSET -4;\
+       CFI_RESTORE eax
+
+#define RESTORE_REGS   \
+       RESTORE_INT_REGS; \
+1:     popl %ds;       \
+       CFI_ADJUST_CFA_OFFSET -4;\
+       /*CFI_RESTORE ds;*/\
+2:     popl %es;       \
+       CFI_ADJUST_CFA_OFFSET -4;\
+       /*CFI_RESTORE es;*/\
+3:     popl %fs;       \
+       CFI_ADJUST_CFA_OFFSET -4;\
+       /*CFI_RESTORE fs;*/\
+.pushsection .fixup,"ax";      \
+4:     movl $0,(%esp); \
+       jmp 1b;         \
+5:     movl $0,(%esp); \
+       jmp 2b;         \
+6:     movl $0,(%esp); \
+       jmp 3b;         \
+.section __ex_table,"a";\
+       .align 4;       \
+       .long 1b,4b;    \
+       .long 2b,5b;    \
+       .long 3b,6b;    \
+.popsection
+
+#define RING0_INT_FRAME \
+       CFI_STARTPROC simple;\
+       CFI_SIGNAL_FRAME;\
+       CFI_DEF_CFA esp, 3*4;\
+       /*CFI_OFFSET cs, -2*4;*/\
+       CFI_OFFSET eip, -3*4
+
+#define RING0_EC_FRAME \
+       CFI_STARTPROC simple;\
+       CFI_SIGNAL_FRAME;\
+       CFI_DEF_CFA esp, 4*4;\
+       /*CFI_OFFSET cs, -2*4;*/\
+       CFI_OFFSET eip, -3*4
+
+#define RING0_PTREGS_FRAME \
+       CFI_STARTPROC simple;\
+       CFI_SIGNAL_FRAME;\
+       CFI_DEF_CFA esp, PT_OLDESP-PT_EBX;\
+       /*CFI_OFFSET cs, PT_CS-PT_OLDESP;*/\
+       CFI_OFFSET eip, PT_EIP-PT_OLDESP;\
+       /*CFI_OFFSET es, PT_ES-PT_OLDESP;*/\
+       /*CFI_OFFSET ds, PT_DS-PT_OLDESP;*/\
+       CFI_OFFSET eax, PT_EAX-PT_OLDESP;\
+       CFI_OFFSET ebp, PT_EBP-PT_OLDESP;\
+       CFI_OFFSET edi, PT_EDI-PT_OLDESP;\
+       CFI_OFFSET esi, PT_ESI-PT_OLDESP;\
+       CFI_OFFSET edx, PT_EDX-PT_OLDESP;\
+       CFI_OFFSET ecx, PT_ECX-PT_OLDESP;\
+       CFI_OFFSET ebx, PT_EBX-PT_OLDESP
+
+ENTRY(ret_from_fork)
+       CFI_STARTPROC
+       pushl %eax
+       CFI_ADJUST_CFA_OFFSET 4
+       call schedule_tail
+       GET_THREAD_INFO(%ebp)
+       popl %eax
+       CFI_ADJUST_CFA_OFFSET -4
+       pushl $0x0202                   # Reset kernel eflags
+       CFI_ADJUST_CFA_OFFSET 4
+       popfl
+       CFI_ADJUST_CFA_OFFSET -4
+       jmp syscall_exit
+       CFI_ENDPROC
+END(ret_from_fork)
+
+/*
+ * Return to user mode is not as complex as all this looks,
+ * but we want the default path for a system call return to
+ * go as quickly as possible which is why some of this is
+ * less clear than it otherwise should be.
+ */
+
+       # userspace resumption stub bypassing syscall exit tracing
+       ALIGN
+       RING0_PTREGS_FRAME
+ret_from_exception:
+       preempt_stop(CLBR_ANY)
+ret_from_intr:
+       GET_THREAD_INFO(%ebp)
+check_userspace:
+       movl PT_EFLAGS(%esp), %eax      # mix EFLAGS and CS
+       movb PT_CS(%esp), %al
+       andl $(VM_MASK | SEGMENT_RPL_MASK), %eax
+       cmpl $USER_RPL, %eax
+       jb resume_kernel                # not returning to v8086 or userspace
+
+ENTRY(resume_userspace)
+       DISABLE_INTERRUPTS(CLBR_ANY)    # make sure we don't miss an interrupt
+                                       # setting need_resched or sigpending
+                                       # between sampling and the iret
+       movl TI_flags(%ebp), %ecx
+       andl $_TIF_WORK_MASK, %ecx      # is there any work to be done on
+                                       # int/exception return?
+       jne work_pending
+       jmp restore_all
+END(ret_from_exception)
+
+#ifdef CONFIG_PREEMPT
+ENTRY(resume_kernel)
+       DISABLE_INTERRUPTS(CLBR_ANY)
+       cmpl $0,TI_preempt_count(%ebp)  # non-zero preempt_count ?
+       jnz restore_nocheck
+need_resched:
+       movl TI_flags(%ebp), %ecx       # need_resched set ?
+       testb $_TIF_NEED_RESCHED, %cl
+       jz restore_all
+       testl $IF_MASK,PT_EFLAGS(%esp)  # interrupts off (exception path) ?
+       jz restore_all
+       call preempt_schedule_irq
+       jmp need_resched
+END(resume_kernel)
+#endif
+       CFI_ENDPROC
+
+/* SYSENTER_RETURN points to after the "sysenter" instruction in
+   the vsyscall page.  See vsyscall-sysentry.S, which defines the symbol.  */
+
+       # sysenter call handler stub
+ENTRY(sysenter_entry)
+       CFI_STARTPROC simple
+       CFI_SIGNAL_FRAME
+       CFI_DEF_CFA esp, 0
+       CFI_REGISTER esp, ebp
+       movl TSS_sysenter_esp0(%esp),%esp
+sysenter_past_esp:
+       /*
+        * No need to follow this irqs on/off section: the syscall
+        * disabled irqs and here we enable it straight after entry:
+        */
+       ENABLE_INTERRUPTS(CLBR_NONE)
+       pushl $(__USER_DS)
+       CFI_ADJUST_CFA_OFFSET 4
+       /*CFI_REL_OFFSET ss, 0*/
+       pushl %ebp
+       CFI_ADJUST_CFA_OFFSET 4
+       CFI_REL_OFFSET esp, 0
+       pushfl
+       CFI_ADJUST_CFA_OFFSET 4
+       pushl $(__USER_CS)
+       CFI_ADJUST_CFA_OFFSET 4
+       /*CFI_REL_OFFSET cs, 0*/
+       /*
+        * Push current_thread_info()->sysenter_return to the stack.
+        * A tiny bit of offset fixup is necessary - 4*4 means the 4 words
+        * pushed above; +8 corresponds to copy_thread's esp0 setting.
+        */
+       pushl (TI_sysenter_return-THREAD_SIZE+8+4*4)(%esp)
+       CFI_ADJUST_CFA_OFFSET 4
+       CFI_REL_OFFSET eip, 0
+
+/*
+ * Load the potential sixth argument from user stack.
+ * Careful about security.
+ */
+       cmpl $__PAGE_OFFSET-3,%ebp
+       jae syscall_fault
+1:     movl (%ebp),%ebp
+.section __ex_table,"a"
+       .align 4
+       .long 1b,syscall_fault
+.previous
+
+       pushl %eax
+       CFI_ADJUST_CFA_OFFSET 4
+       SAVE_ALL
+       GET_THREAD_INFO(%ebp)
+
+       /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */
+       testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(%ebp)
+       jnz syscall_trace_entry
+       cmpl $(nr_syscalls), %eax
+       jae syscall_badsys
+       call *sys_call_table(,%eax,4)
+       movl %eax,PT_EAX(%esp)
+       DISABLE_INTERRUPTS(CLBR_ANY)
+       TRACE_IRQS_OFF
+       movl TI_flags(%ebp), %ecx
+       testw $_TIF_ALLWORK_MASK, %cx
+       jne syscall_exit_work
+/* if something modifies registers it must also disable sysexit */
+       movl PT_EIP(%esp), %edx
+       movl PT_OLDESP(%esp), %ecx
+       xorl %ebp,%ebp
+       TRACE_IRQS_ON
+1:     mov  PT_FS(%esp), %fs
+       ENABLE_INTERRUPTS_SYSEXIT
+       CFI_ENDPROC
+.pushsection .fixup,"ax"
+2:     movl $0,PT_FS(%esp)
+       jmp 1b
+.section __ex_table,"a"
+       .align 4
+       .long 1b,2b
+.popsection
+ENDPROC(sysenter_entry)
+
+       # system call handler stub
+ENTRY(system_call)
+       RING0_INT_FRAME                 # can't unwind into user space anyway
+       pushl %eax                      # save orig_eax
+       CFI_ADJUST_CFA_OFFSET 4
+       SAVE_ALL
+       GET_THREAD_INFO(%ebp)
+                                       # system call tracing in operation / emulation
+       /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */
+       testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(%ebp)
+       jnz syscall_trace_entry
+       cmpl $(nr_syscalls), %eax
+       jae syscall_badsys
+syscall_call:
+       call *sys_call_table(,%eax,4)
+       movl %eax,PT_EAX(%esp)          # store the return value
+syscall_exit:
+       DISABLE_INTERRUPTS(CLBR_ANY)    # make sure we don't miss an interrupt
+                                       # setting need_resched or sigpending
+                                       # between sampling and the iret
+       TRACE_IRQS_OFF
+       testl $TF_MASK,PT_EFLAGS(%esp)  # If tracing set singlestep flag on exit
+       jz no_singlestep
+       orl $_TIF_SINGLESTEP,TI_flags(%ebp)
+no_singlestep:
+       movl TI_flags(%ebp), %ecx
+       testw $_TIF_ALLWORK_MASK, %cx   # current->work
+       jne syscall_exit_work
+
+restore_all:
+       movl PT_EFLAGS(%esp), %eax      # mix EFLAGS, SS and CS
+       # Warning: PT_OLDSS(%esp) contains the wrong/random values if we
+       # are returning to the kernel.
+       # See comments in process.c:copy_thread() for details.
+       movb PT_OLDSS(%esp), %ah
+       movb PT_CS(%esp), %al
+       andl $(VM_MASK | (SEGMENT_TI_MASK << 8) | SEGMENT_RPL_MASK), %eax
+       cmpl $((SEGMENT_LDT << 8) | USER_RPL), %eax
+       CFI_REMEMBER_STATE
+       je ldt_ss                       # returning to user-space with LDT SS
+restore_nocheck:
+       TRACE_IRQS_IRET
+restore_nocheck_notrace:
+       RESTORE_REGS
+       addl $4, %esp                   # skip orig_eax/error_code
+       CFI_ADJUST_CFA_OFFSET -4
+1:     INTERRUPT_RETURN
+.section .fixup,"ax"
+iret_exc:
+       pushl $0                        # no error code
+       pushl $do_iret_error
+       jmp error_code
+.previous
+.section __ex_table,"a"
+       .align 4
+       .long 1b,iret_exc
+.previous
+
+       CFI_RESTORE_STATE
+ldt_ss:
+       larl PT_OLDSS(%esp), %eax
+       jnz restore_nocheck
+       testl $0x00400000, %eax         # returning to 32bit stack?
+       jnz restore_nocheck             # allright, normal return
+
+#ifdef CONFIG_PARAVIRT
+       /*
+        * The kernel can't run on a non-flat stack if paravirt mode
+        * is active.  Rather than try to fixup the high bits of
+        * ESP, bypass this code entirely.  This may break DOSemu
+        * and/or Wine support in a paravirt VM, although the option
+        * is still available to implement the setting of the high
+        * 16-bits in the INTERRUPT_RETURN paravirt-op.
+        */
+       cmpl $0, paravirt_ops+PARAVIRT_enabled
+       jne restore_nocheck
+#endif
+
+       /* If returning to userspace with 16bit stack,
+        * try to fix the higher word of ESP, as the CPU
+        * won't restore it.
+        * This is an "official" bug of all the x86-compatible
+        * CPUs, which we can try to work around to make
+        * dosemu and wine happy. */
+       movl PT_OLDESP(%esp), %eax
+       movl %esp, %edx
+       call patch_espfix_desc
+       pushl $__ESPFIX_SS
+       CFI_ADJUST_CFA_OFFSET 4
+       pushl %eax
+       CFI_ADJUST_CFA_OFFSET 4
+       DISABLE_INTERRUPTS(CLBR_EAX)
+       TRACE_IRQS_OFF
+       lss (%esp), %esp
+       CFI_ADJUST_CFA_OFFSET -8
+       jmp restore_nocheck
+       CFI_ENDPROC
+ENDPROC(system_call)
+
+       # perform work that needs to be done immediately before resumption
+       ALIGN
+       RING0_PTREGS_FRAME              # can't unwind into user space anyway
+work_pending:
+       testb $_TIF_NEED_RESCHED, %cl
+       jz work_notifysig
+work_resched:
+       call schedule
+       DISABLE_INTERRUPTS(CLBR_ANY)    # make sure we don't miss an interrupt
+                                       # setting need_resched or sigpending
+                                       # between sampling and the iret
+       TRACE_IRQS_OFF
+       movl TI_flags(%ebp), %ecx
+       andl $_TIF_WORK_MASK, %ecx      # is there any work to be done other
+                                       # than syscall tracing?
+       jz restore_all
+       testb $_TIF_NEED_RESCHED, %cl
+       jnz work_resched
+
+work_notifysig:                                # deal with pending signals and
+                                       # notify-resume requests
+#ifdef CONFIG_VM86
+       testl $VM_MASK, PT_EFLAGS(%esp)
+       movl %esp, %eax
+       jne work_notifysig_v86          # returning to kernel-space or
+                                       # vm86-space
+       xorl %edx, %edx
+       call do_notify_resume
+       jmp resume_userspace_sig
+
+       ALIGN
+work_notifysig_v86:
+       pushl %ecx                      # save ti_flags for do_notify_resume
+       CFI_ADJUST_CFA_OFFSET 4
+       call save_v86_state             # %eax contains pt_regs pointer
+       popl %ecx
+       CFI_ADJUST_CFA_OFFSET -4
+       movl %eax, %esp
+#else
+       movl %esp, %eax
+#endif
+       xorl %edx, %edx
+       call do_notify_resume
+       jmp resume_userspace_sig
+END(work_pending)
+
+       # perform syscall exit tracing
+       ALIGN
+syscall_trace_entry:
+       movl $-ENOSYS,PT_EAX(%esp)
+       movl %esp, %eax
+       xorl %edx,%edx
+       call do_syscall_trace
+       cmpl $0, %eax
+       jne resume_userspace            # ret != 0 -> running under PTRACE_SYSEMU,
+                                       # so must skip actual syscall
+       movl PT_ORIG_EAX(%esp), %eax
+       cmpl $(nr_syscalls), %eax
+       jnae syscall_call
+       jmp syscall_exit
+END(syscall_trace_entry)
+
+       # perform syscall exit tracing
+       ALIGN
+syscall_exit_work:
+       testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP), %cl
+       jz work_pending
+       TRACE_IRQS_ON
+       ENABLE_INTERRUPTS(CLBR_ANY)     # could let do_syscall_trace() call
+                                       # schedule() instead
+       movl %esp, %eax
+       movl $1, %edx
+       call do_syscall_trace
+       jmp resume_userspace
+END(syscall_exit_work)
+       CFI_ENDPROC
+
+       RING0_INT_FRAME                 # can't unwind into user space anyway
+syscall_fault:
+       pushl %eax                      # save orig_eax
+       CFI_ADJUST_CFA_OFFSET 4
+       SAVE_ALL
+       GET_THREAD_INFO(%ebp)
+       movl $-EFAULT,PT_EAX(%esp)
+       jmp resume_userspace
+END(syscall_fault)
+
+syscall_badsys:
+       movl $-ENOSYS,PT_EAX(%esp)
+       jmp resume_userspace
+END(syscall_badsys)
+       CFI_ENDPROC
+
+#define FIXUP_ESPFIX_STACK \
+       /* since we are on a wrong stack, we cant make it a C code :( */ \
+       PER_CPU(gdt_page, %ebx); \
+       GET_DESC_BASE(GDT_ENTRY_ESPFIX_SS, %ebx, %eax, %ax, %al, %ah); \
+       addl %esp, %eax; \
+       pushl $__KERNEL_DS; \
+       CFI_ADJUST_CFA_OFFSET 4; \
+       pushl %eax; \
+       CFI_ADJUST_CFA_OFFSET 4; \
+       lss (%esp), %esp; \
+       CFI_ADJUST_CFA_OFFSET -8;
+#define UNWIND_ESPFIX_STACK \
+       movl %ss, %eax; \
+       /* see if on espfix stack */ \
+       cmpw $__ESPFIX_SS, %ax; \
+       jne 27f; \
+       movl $__KERNEL_DS, %eax; \
+       movl %eax, %ds; \
+       movl %eax, %es; \
+       /* switch to normal stack */ \
+       FIXUP_ESPFIX_STACK; \
+27:;
+
+/*
+ * Build the entry stubs and pointer table with
+ * some assembler magic.
+ */
+.data
+ENTRY(interrupt)
+.text
+
+ENTRY(irq_entries_start)
+       RING0_INT_FRAME
+vector=0
+.rept NR_IRQS
+       ALIGN
+ .if vector
+       CFI_ADJUST_CFA_OFFSET -4
+ .endif
+1:     pushl $~(vector)
+       CFI_ADJUST_CFA_OFFSET 4
+       jmp common_interrupt
+ .previous
+       .long 1b
+ .text
+vector=vector+1
+.endr
+END(irq_entries_start)
+
+.previous
+END(interrupt)
+.previous
+
+/*
+ * the CPU automatically disables interrupts when executing an IRQ vector,
+ * so IRQ-flags tracing has to follow that:
+ */
+       ALIGN
+common_interrupt:
+       SAVE_ALL
+       TRACE_IRQS_OFF
+       movl %esp,%eax
+       call do_IRQ
+       jmp ret_from_intr
+ENDPROC(common_interrupt)
+       CFI_ENDPROC
+
+#define BUILD_INTERRUPT(name, nr)      \
+ENTRY(name)                            \
+       RING0_INT_FRAME;                \
+       pushl $~(nr);                   \
+       CFI_ADJUST_CFA_OFFSET 4;        \
+       SAVE_ALL;                       \
+       TRACE_IRQS_OFF                  \
+       movl %esp,%eax;                 \
+       call smp_##name;                \
+       jmp ret_from_intr;              \
+       CFI_ENDPROC;                    \
+ENDPROC(name)
+
+/* The include is where all of the SMP etc. interrupts come from */
+#include "entry_arch.h"
+
+KPROBE_ENTRY(page_fault)
+       RING0_EC_FRAME
+       pushl $do_page_fault
+       CFI_ADJUST_CFA_OFFSET 4
+       ALIGN
+error_code:
+       /* the function address is in %fs's slot on the stack */
+       pushl %es
+       CFI_ADJUST_CFA_OFFSET 4
+       /*CFI_REL_OFFSET es, 0*/
+       pushl %ds
+       CFI_ADJUST_CFA_OFFSET 4
+       /*CFI_REL_OFFSET ds, 0*/
+       pushl %eax
+       CFI_ADJUST_CFA_OFFSET 4
+       CFI_REL_OFFSET eax, 0
+       pushl %ebp
+       CFI_ADJUST_CFA_OFFSET 4
+       CFI_REL_OFFSET ebp, 0
+       pushl %edi
+       CFI_ADJUST_CFA_OFFSET 4
+       CFI_REL_OFFSET edi, 0
+       pushl %esi
+       CFI_ADJUST_CFA_OFFSET 4
+       CFI_REL_OFFSET esi, 0
+       pushl %edx
+       CFI_ADJUST_CFA_OFFSET 4
+       CFI_REL_OFFSET edx, 0
+       pushl %ecx
+       CFI_ADJUST_CFA_OFFSET 4
+       CFI_REL_OFFSET ecx, 0
+       pushl %ebx
+       CFI_ADJUST_CFA_OFFSET 4
+       CFI_REL_OFFSET ebx, 0
+       cld
+       pushl %fs
+       CFI_ADJUST_CFA_OFFSET 4
+       /*CFI_REL_OFFSET fs, 0*/
+       movl $(__KERNEL_PERCPU), %ecx
+       movl %ecx, %fs
+       UNWIND_ESPFIX_STACK
+       popl %ecx
+       CFI_ADJUST_CFA_OFFSET -4
+       /*CFI_REGISTER es, ecx*/
+       movl PT_FS(%esp), %edi          # get the function address
+       movl PT_ORIG_EAX(%esp), %edx    # get the error code
+       movl $-1, PT_ORIG_EAX(%esp)     # no syscall to restart
+       mov  %ecx, PT_FS(%esp)
+       /*CFI_REL_OFFSET fs, ES*/
+       movl $(__USER_DS), %ecx
+       movl %ecx, %ds
+       movl %ecx, %es
+       movl %esp,%eax                  # pt_regs pointer
+       call *%edi
+       jmp ret_from_exception
+       CFI_ENDPROC
+KPROBE_END(page_fault)
+
+ENTRY(coprocessor_error)
+       RING0_INT_FRAME
+       pushl $0
+       CFI_ADJUST_CFA_OFFSET 4
+       pushl $do_coprocessor_error
+       CFI_ADJUST_CFA_OFFSET 4
+       jmp error_code
+       CFI_ENDPROC
+END(coprocessor_error)
+
+ENTRY(simd_coprocessor_error)
+       RING0_INT_FRAME
+       pushl $0
+       CFI_ADJUST_CFA_OFFSET 4
+       pushl $do_simd_coprocessor_error
+       CFI_ADJUST_CFA_OFFSET 4
+       jmp error_code
+       CFI_ENDPROC
+END(simd_coprocessor_error)
+
+ENTRY(device_not_available)
+       RING0_INT_FRAME
+       pushl $-1                       # mark this as an int
+       CFI_ADJUST_CFA_OFFSET 4
+       SAVE_ALL
+       GET_CR0_INTO_EAX
+       testl $0x4, %eax                # EM (math emulation bit)
+       jne device_not_available_emulate
+       preempt_stop(CLBR_ANY)
+       call math_state_restore
+       jmp ret_from_exception
+device_not_available_emulate:
+       pushl $0                        # temporary storage for ORIG_EIP
+       CFI_ADJUST_CFA_OFFSET 4
+       call math_emulate
+       addl $4, %esp
+       CFI_ADJUST_CFA_OFFSET -4
+       jmp ret_from_exception
+       CFI_ENDPROC
+END(device_not_available)
+
+/*
+ * Debug traps and NMI can happen at the one SYSENTER instruction
+ * that sets up the real kernel stack. Check here, since we can't
+ * allow the wrong stack to be used.
+ *
+ * "TSS_sysenter_esp0+12" is because the NMI/debug handler will have
+ * already pushed 3 words if it hits on the sysenter instruction:
+ * eflags, cs and eip.
+ *
+ * We just load the right stack, and push the three (known) values
+ * by hand onto the new stack - while updating the return eip past
+ * the instruction that would have done it for sysenter.
+ */
+#define FIX_STACK(offset, ok, label)           \
+       cmpw $__KERNEL_CS,4(%esp);              \
+       jne ok;                                 \
+label:                                         \
+       movl TSS_sysenter_esp0+offset(%esp),%esp;       \
+       CFI_DEF_CFA esp, 0;                     \
+       CFI_UNDEFINED eip;                      \
+       pushfl;                                 \
+       CFI_ADJUST_CFA_OFFSET 4;                \
+       pushl $__KERNEL_CS;                     \
+       CFI_ADJUST_CFA_OFFSET 4;                \
+       pushl $sysenter_past_esp;               \
+       CFI_ADJUST_CFA_OFFSET 4;                \
+       CFI_REL_OFFSET eip, 0
+
+KPROBE_ENTRY(debug)
+       RING0_INT_FRAME
+       cmpl $sysenter_entry,(%esp)
+       jne debug_stack_correct
+       FIX_STACK(12, debug_stack_correct, debug_esp_fix_insn)
+debug_stack_correct:
+       pushl $-1                       # mark this as an int
+       CFI_ADJUST_CFA_OFFSET 4
+       SAVE_ALL
+       xorl %edx,%edx                  # error code 0
+       movl %esp,%eax                  # pt_regs pointer
+       call do_debug
+       jmp ret_from_exception
+       CFI_ENDPROC
+KPROBE_END(debug)
+
+/*
+ * NMI is doubly nasty. It can happen _while_ we're handling
+ * a debug fault, and the debug fault hasn't yet been able to
+ * clear up the stack. So we first check whether we got  an
+ * NMI on the sysenter entry path, but after that we need to
+ * check whether we got an NMI on the debug path where the debug
+ * fault happened on the sysenter path.
+ */
+KPROBE_ENTRY(nmi)
+       RING0_INT_FRAME
+       pushl %eax
+       CFI_ADJUST_CFA_OFFSET 4
+       movl %ss, %eax
+       cmpw $__ESPFIX_SS, %ax
+       popl %eax
+       CFI_ADJUST_CFA_OFFSET -4
+       je nmi_espfix_stack
+       cmpl $sysenter_entry,(%esp)
+       je nmi_stack_fixup
+       pushl %eax
+       CFI_ADJUST_CFA_OFFSET 4
+       movl %esp,%eax
+       /* Do not access memory above the end of our stack page,
+        * it might not exist.
+        */
+       andl $(THREAD_SIZE-1),%eax
+       cmpl $(THREAD_SIZE-20),%eax
+       popl %eax
+       CFI_ADJUST_CFA_OFFSET -4
+       jae nmi_stack_correct
+       cmpl $sysenter_entry,12(%esp)
+       je nmi_debug_stack_check
+nmi_stack_correct:
+       /* We have a RING0_INT_FRAME here */
+       pushl %eax
+       CFI_ADJUST_CFA_OFFSET 4
+       SAVE_ALL
+       xorl %edx,%edx          # zero error code
+       movl %esp,%eax          # pt_regs pointer
+       call do_nmi
+       jmp restore_nocheck_notrace
+       CFI_ENDPROC
+
+nmi_stack_fixup:
+       RING0_INT_FRAME
+       FIX_STACK(12,nmi_stack_correct, 1)
+       jmp nmi_stack_correct
+
+nmi_debug_stack_check:
+       /* We have a RING0_INT_FRAME here */
+       cmpw $__KERNEL_CS,16(%esp)
+       jne nmi_stack_correct
+       cmpl $debug,(%esp)
+       jb nmi_stack_correct
+       cmpl $debug_esp_fix_insn,(%esp)
+       ja nmi_stack_correct
+       FIX_STACK(24,nmi_stack_correct, 1)
+       jmp nmi_stack_correct
+
+nmi_espfix_stack:
+       /* We have a RING0_INT_FRAME here.
+        *
+        * create the pointer to lss back
+        */
+       pushl %ss
+       CFI_ADJUST_CFA_OFFSET 4
+       pushl %esp
+       CFI_ADJUST_CFA_OFFSET 4
+       addw $4, (%esp)
+       /* copy the iret frame of 12 bytes */
+       .rept 3
+       pushl 16(%esp)
+       CFI_ADJUST_CFA_OFFSET 4
+       .endr
+       pushl %eax
+       CFI_ADJUST_CFA_OFFSET 4
+       SAVE_ALL
+       FIXUP_ESPFIX_STACK              # %eax == %esp
+       xorl %edx,%edx                  # zero error code
+       call do_nmi
+       RESTORE_REGS
+       lss 12+4(%esp), %esp            # back to espfix stack
+       CFI_ADJUST_CFA_OFFSET -24
+1:     INTERRUPT_RETURN
+       CFI_ENDPROC
+.section __ex_table,"a"
+       .align 4
+       .long 1b,iret_exc
+.previous
+KPROBE_END(nmi)
+
+#ifdef CONFIG_PARAVIRT
+ENTRY(native_iret)
+1:     iret
+.section __ex_table,"a"
+       .align 4
+       .long 1b,iret_exc
+.previous
+END(native_iret)
+
+ENTRY(native_irq_enable_sysexit)
+       sti
+       sysexit
+END(native_irq_enable_sysexit)
+#endif
+
+KPROBE_ENTRY(int3)
+       RING0_INT_FRAME
+       pushl $-1                       # mark this as an int
+       CFI_ADJUST_CFA_OFFSET 4
+       SAVE_ALL
+       xorl %edx,%edx          # zero error code
+       movl %esp,%eax          # pt_regs pointer
+       call do_int3
+       jmp ret_from_exception
+       CFI_ENDPROC
+KPROBE_END(int3)
+
+ENTRY(overflow)
+       RING0_INT_FRAME
+       pushl $0
+       CFI_ADJUST_CFA_OFFSET 4
+       pushl $do_overflow
+       CFI_ADJUST_CFA_OFFSET 4
+       jmp error_code
+       CFI_ENDPROC
+END(overflow)
+
+ENTRY(bounds)
+       RING0_INT_FRAME
+       pushl $0
+       CFI_ADJUST_CFA_OFFSET 4
+       pushl $do_bounds
+       CFI_ADJUST_CFA_OFFSET 4
+       jmp error_code
+       CFI_ENDPROC
+END(bounds)
+
+ENTRY(invalid_op)
+       RING0_INT_FRAME
+       pushl $0
+       CFI_ADJUST_CFA_OFFSET 4
+       pushl $do_invalid_op
+       CFI_ADJUST_CFA_OFFSET 4
+       jmp error_code
+       CFI_ENDPROC
+END(invalid_op)
+
+ENTRY(coprocessor_segment_overrun)
+       RING0_INT_FRAME
+       pushl $0
+       CFI_ADJUST_CFA_OFFSET 4
+       pushl $do_coprocessor_segment_overrun
+       CFI_ADJUST_CFA_OFFSET 4
+       jmp error_code
+       CFI_ENDPROC
+END(coprocessor_segment_overrun)
+
+ENTRY(invalid_TSS)
+       RING0_EC_FRAME
+       pushl $do_invalid_TSS
+       CFI_ADJUST_CFA_OFFSET 4
+       jmp error_code
+       CFI_ENDPROC
+END(invalid_TSS)
+
+ENTRY(segment_not_present)
+       RING0_EC_FRAME
+       pushl $do_segment_not_present
+       CFI_ADJUST_CFA_OFFSET 4
+       jmp error_code
+       CFI_ENDPROC
+END(segment_not_present)
+
+ENTRY(stack_segment)
+       RING0_EC_FRAME
+       pushl $do_stack_segment
+       CFI_ADJUST_CFA_OFFSET 4
+       jmp error_code
+       CFI_ENDPROC
+END(stack_segment)
+
+KPROBE_ENTRY(general_protection)
+       RING0_EC_FRAME
+       pushl $do_general_protection
+       CFI_ADJUST_CFA_OFFSET 4
+       jmp error_code
+       CFI_ENDPROC
+KPROBE_END(general_protection)
+
+ENTRY(alignment_check)
+       RING0_EC_FRAME
+       pushl $do_alignment_check
+       CFI_ADJUST_CFA_OFFSET 4
+       jmp error_code
+       CFI_ENDPROC
+END(alignment_check)
+
+ENTRY(divide_error)
+       RING0_INT_FRAME
+       pushl $0                        # no error code
+       CFI_ADJUST_CFA_OFFSET 4
+       pushl $do_divide_error
+       CFI_ADJUST_CFA_OFFSET 4
+       jmp error_code
+       CFI_ENDPROC
+END(divide_error)
+
+#ifdef CONFIG_X86_MCE
+ENTRY(machine_check)
+       RING0_INT_FRAME
+       pushl $0
+       CFI_ADJUST_CFA_OFFSET 4
+       pushl machine_check_vector
+       CFI_ADJUST_CFA_OFFSET 4
+       jmp error_code
+       CFI_ENDPROC
+END(machine_check)
+#endif
+
+ENTRY(spurious_interrupt_bug)
+       RING0_INT_FRAME
+       pushl $0
+       CFI_ADJUST_CFA_OFFSET 4
+       pushl $do_spurious_interrupt_bug
+       CFI_ADJUST_CFA_OFFSET 4
+       jmp error_code
+       CFI_ENDPROC
+END(spurious_interrupt_bug)
+
+ENTRY(kernel_thread_helper)
+       pushl $0                # fake return address for unwinder
+       CFI_STARTPROC
+       movl %edx,%eax
+       push %edx
+       CFI_ADJUST_CFA_OFFSET 4
+       call *%ebx
+       push %eax
+       CFI_ADJUST_CFA_OFFSET 4
+       call do_exit
+       CFI_ENDPROC
+ENDPROC(kernel_thread_helper)
+
+#ifdef CONFIG_XEN
+ENTRY(xen_hypervisor_callback)
+       CFI_STARTPROC
+       pushl $0
+       CFI_ADJUST_CFA_OFFSET 4
+       SAVE_ALL
+       TRACE_IRQS_OFF
+
+       /* Check to see if we got the event in the critical
+          region in xen_iret_direct, after we've reenabled
+          events and checked for pending events.  This simulates
+          iret instruction's behaviour where it delivers a
+          pending interrupt when enabling interrupts. */
+       movl PT_EIP(%esp),%eax
+       cmpl $xen_iret_start_crit,%eax
+       jb   1f
+       cmpl $xen_iret_end_crit,%eax
+       jae  1f
+
+       call xen_iret_crit_fixup
+
+1:     mov %esp, %eax
+       call xen_evtchn_do_upcall
+       jmp  ret_from_intr
+       CFI_ENDPROC
+ENDPROC(xen_hypervisor_callback)
+
+# Hypervisor uses this for application faults while it executes.
+# We get here for two reasons:
+#  1. Fault while reloading DS, ES, FS or GS
+#  2. Fault while executing IRET
+# Category 1 we fix up by reattempting the load, and zeroing the segment
+# register if the load fails.
+# Category 2 we fix up by jumping to do_iret_error. We cannot use the
+# normal Linux return path in this case because if we use the IRET hypercall
+# to pop the stack frame we end up in an infinite loop of failsafe callbacks.
+# We distinguish between categories by maintaining a status value in EAX.
+ENTRY(xen_failsafe_callback)
+       CFI_STARTPROC
+       pushl %eax
+       CFI_ADJUST_CFA_OFFSET 4
+       movl $1,%eax
+1:     mov 4(%esp),%ds
+2:     mov 8(%esp),%es
+3:     mov 12(%esp),%fs
+4:     mov 16(%esp),%gs
+       testl %eax,%eax
+       popl %eax
+       CFI_ADJUST_CFA_OFFSET -4
+       lea 16(%esp),%esp
+       CFI_ADJUST_CFA_OFFSET -16
+       jz 5f
+       addl $16,%esp
+       jmp iret_exc            # EAX != 0 => Category 2 (Bad IRET)
+5:     pushl $0                # EAX == 0 => Category 1 (Bad segment)
+       CFI_ADJUST_CFA_OFFSET 4
+       SAVE_ALL
+       jmp ret_from_exception
+       CFI_ENDPROC
+
+.section .fixup,"ax"
+6:     xorl %eax,%eax
+       movl %eax,4(%esp)
+       jmp 1b
+7:     xorl %eax,%eax
+       movl %eax,8(%esp)
+       jmp 2b
+8:     xorl %eax,%eax
+       movl %eax,12(%esp)
+       jmp 3b
+9:     xorl %eax,%eax
+       movl %eax,16(%esp)
+       jmp 4b
+.previous
+.section __ex_table,"a"
+       .align 4
+       .long 1b,6b
+       .long 2b,7b
+       .long 3b,8b
+       .long 4b,9b
+.previous
+ENDPROC(xen_failsafe_callback)
+
+#endif /* CONFIG_XEN */
+
+.section .rodata,"a"
+#include "syscall_table_32.S"
+
+syscall_table_size=(.-sys_call_table)
diff --git a/arch/x86/kernel/geode_32.c b/arch/x86/kernel/geode_32.c
new file mode 100644 (file)
index 0000000..41e8aec
--- /dev/null
@@ -0,0 +1,155 @@
+/*
+ * AMD Geode southbridge support code
+ * Copyright (C) 2006, Advanced Micro Devices, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public License
+ * as published by the Free Software Foundation.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/ioport.h>
+#include <linux/io.h>
+#include <asm/msr.h>
+#include <asm/geode.h>
+
+static struct {
+       char *name;
+       u32 msr;
+       int size;
+       u32 base;
+} lbars[] = {
+       { "geode-pms",   MSR_LBAR_PMS, LBAR_PMS_SIZE, 0 },
+       { "geode-acpi",  MSR_LBAR_ACPI, LBAR_ACPI_SIZE, 0 },
+       { "geode-gpio",  MSR_LBAR_GPIO, LBAR_GPIO_SIZE, 0 },
+       { "geode-mfgpt", MSR_LBAR_MFGPT, LBAR_MFGPT_SIZE, 0 }
+};
+
+static void __init init_lbars(void)
+{
+       u32 lo, hi;
+       int i;
+
+       for (i = 0; i < ARRAY_SIZE(lbars); i++) {
+               rdmsr(lbars[i].msr, lo, hi);
+               if (hi & 0x01)
+                       lbars[i].base = lo & 0x0000ffff;
+
+               if (lbars[i].base == 0)
+                       printk(KERN_ERR "geode:  Couldn't initialize '%s'\n",
+                                       lbars[i].name);
+       }
+}
+
+int geode_get_dev_base(unsigned int dev)
+{
+       BUG_ON(dev >= ARRAY_SIZE(lbars));
+       return lbars[dev].base;
+}
+EXPORT_SYMBOL_GPL(geode_get_dev_base);
+
+/* === GPIO API === */
+
+void geode_gpio_set(unsigned int gpio, unsigned int reg)
+{
+       u32 base = geode_get_dev_base(GEODE_DEV_GPIO);
+
+       if (!base)
+               return;
+
+       if (gpio < 16)
+               outl(1 << gpio, base + reg);
+       else
+               outl(1 << (gpio - 16), base + 0x80 + reg);
+}
+EXPORT_SYMBOL_GPL(geode_gpio_set);
+
+void geode_gpio_clear(unsigned int gpio, unsigned int reg)
+{
+       u32 base = geode_get_dev_base(GEODE_DEV_GPIO);
+
+       if (!base)
+               return;
+
+       if (gpio < 16)
+               outl(1 << (gpio + 16), base + reg);
+       else
+               outl(1 << gpio, base + 0x80 + reg);
+}
+EXPORT_SYMBOL_GPL(geode_gpio_clear);
+
+int geode_gpio_isset(unsigned int gpio, unsigned int reg)
+{
+       u32 base = geode_get_dev_base(GEODE_DEV_GPIO);
+
+       if (!base)
+               return 0;
+
+       if (gpio < 16)
+               return (inl(base + reg) & (1 << gpio)) ? 1 : 0;
+       else
+               return (inl(base + 0x80 + reg) & (1 << (gpio - 16))) ? 1 : 0;
+}
+EXPORT_SYMBOL_GPL(geode_gpio_isset);
+
+void geode_gpio_set_irq(unsigned int group, unsigned int irq)
+{
+       u32 lo, hi;
+
+       if (group > 7 || irq > 15)
+               return;
+
+       rdmsr(MSR_PIC_ZSEL_HIGH, lo, hi);
+
+       lo &= ~(0xF << (group * 4));
+       lo |= (irq & 0xF) << (group * 4);
+
+       wrmsr(MSR_PIC_ZSEL_HIGH, lo, hi);
+}
+EXPORT_SYMBOL_GPL(geode_gpio_set_irq);
+
+void geode_gpio_setup_event(unsigned int gpio, int pair, int pme)
+{
+       u32 base = geode_get_dev_base(GEODE_DEV_GPIO);
+       u32 offset, shift, val;
+
+       if (gpio >= 24)
+               offset = GPIO_MAP_W;
+       else if (gpio >= 16)
+               offset = GPIO_MAP_Z;
+       else if (gpio >= 8)
+               offset = GPIO_MAP_Y;
+       else
+               offset = GPIO_MAP_X;
+
+       shift = (gpio % 8) * 4;
+
+       val = inl(base + offset);
+
+       /* Clear whatever was there before */
+       val &= ~(0xF << shift);
+
+       /* And set the new value */
+
+       val |= ((pair & 7) << shift);
+
+       /* Set the PME bit if this is a PME event */
+
+       if (pme)
+               val |= (1 << (shift + 3));
+
+       outl(val, base + offset);
+}
+EXPORT_SYMBOL_GPL(geode_gpio_setup_event);
+
+static int __init geode_southbridge_init(void)
+{
+       if (!is_geode())
+               return -ENODEV;
+
+       init_lbars();
+       return 0;
+}
+
+postcore_initcall(geode_southbridge_init);
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
new file mode 100644 (file)
index 0000000..9150ca9
--- /dev/null
@@ -0,0 +1,578 @@
+/*
+ *  linux/arch/i386/kernel/head.S -- the 32-bit startup code.
+ *
+ *  Copyright (C) 1991, 1992  Linus Torvalds
+ *
+ *  Enhanced CPU detection and feature setting code by Mike Jagdis
+ *  and Martin Mares, November 1997.
+ */
+
+.text
+#include <linux/threads.h>
+#include <linux/linkage.h>
+#include <asm/segment.h>
+#include <asm/page.h>
+#include <asm/pgtable.h>
+#include <asm/desc.h>
+#include <asm/cache.h>
+#include <asm/thread_info.h>
+#include <asm/asm-offsets.h>
+#include <asm/setup.h>
+
+/*
+ * References to members of the new_cpu_data structure.
+ */
+
+#define X86            new_cpu_data+CPUINFO_x86
+#define X86_VENDOR     new_cpu_data+CPUINFO_x86_vendor
+#define X86_MODEL      new_cpu_data+CPUINFO_x86_model
+#define X86_MASK       new_cpu_data+CPUINFO_x86_mask
+#define X86_HARD_MATH  new_cpu_data+CPUINFO_hard_math
+#define X86_CPUID      new_cpu_data+CPUINFO_cpuid_level
+#define X86_CAPABILITY new_cpu_data+CPUINFO_x86_capability
+#define X86_VENDOR_ID  new_cpu_data+CPUINFO_x86_vendor_id
+
+/*
+ * This is how much memory *in addition to the memory covered up to
+ * and including _end* we need mapped initially.
+ * We need:
+ *  - one bit for each possible page, but only in low memory, which means
+ *     2^32/4096/8 = 128K worst case (4G/4G split.)
+ *  - enough space to map all low memory, which means
+ *     (2^32/4096) / 1024 pages (worst case, non PAE)
+ *     (2^32/4096) / 512 + 4 pages (worst case for PAE)
+ *  - a few pages for allocator use before the kernel pagetable has
+ *     been set up
+ *
+ * Modulo rounding, each megabyte assigned here requires a kilobyte of
+ * memory, which is currently unreclaimed.
+ *
+ * This should be a multiple of a page.
+ */
+LOW_PAGES = 1<<(32-PAGE_SHIFT_asm)
+
+#if PTRS_PER_PMD > 1
+PAGE_TABLE_SIZE = (LOW_PAGES / PTRS_PER_PMD) + PTRS_PER_PGD
+#else
+PAGE_TABLE_SIZE = (LOW_PAGES / PTRS_PER_PGD)
+#endif
+BOOTBITMAP_SIZE = LOW_PAGES / 8
+ALLOCATOR_SLOP = 4
+
+INIT_MAP_BEYOND_END = BOOTBITMAP_SIZE + (PAGE_TABLE_SIZE + ALLOCATOR_SLOP)*PAGE_SIZE_asm
+
+/*
+ * 32-bit kernel entrypoint; only used by the boot CPU.  On entry,
+ * %esi points to the real-mode code as a 32-bit pointer.
+ * CS and DS must be 4 GB flat segments, but we don't depend on
+ * any particular GDT layout, because we load our own as soon as we
+ * can.
+ */
+.section .text.head,"ax",@progbits
+ENTRY(startup_32)
+
+/*
+ * Set segments to known values.
+ */
+       cld
+       lgdt boot_gdt_descr - __PAGE_OFFSET
+       movl $(__BOOT_DS),%eax
+       movl %eax,%ds
+       movl %eax,%es
+       movl %eax,%fs
+       movl %eax,%gs
+
+/*
+ * Clear BSS first so that there are no surprises...
+ * No need to cld as DF is already clear from cld above...
+ */
+       xorl %eax,%eax
+       movl $__bss_start - __PAGE_OFFSET,%edi
+       movl $__bss_stop - __PAGE_OFFSET,%ecx
+       subl %edi,%ecx
+       shrl $2,%ecx
+       rep ; stosl
+/*
+ * Copy bootup parameters out of the way.
+ * Note: %esi still has the pointer to the real-mode data.
+ * With the kexec as boot loader, parameter segment might be loaded beyond
+ * kernel image and might not even be addressable by early boot page tables.
+ * (kexec on panic case). Hence copy out the parameters before initializing
+ * page tables.
+ */
+       movl $(boot_params - __PAGE_OFFSET),%edi
+       movl $(PARAM_SIZE/4),%ecx
+       cld
+       rep
+       movsl
+       movl boot_params - __PAGE_OFFSET + NEW_CL_POINTER,%esi
+       andl %esi,%esi
+       jnz 2f                  # New command line protocol
+       cmpw $(OLD_CL_MAGIC),OLD_CL_MAGIC_ADDR
+       jne 1f
+       movzwl OLD_CL_OFFSET,%esi
+       addl $(OLD_CL_BASE_ADDR),%esi
+2:
+       movl $(boot_command_line - __PAGE_OFFSET),%edi
+       movl $(COMMAND_LINE_SIZE/4),%ecx
+       rep
+       movsl
+1:
+
+/*
+ * Initialize page tables.  This creates a PDE and a set of page
+ * tables, which are located immediately beyond _end.  The variable
+ * init_pg_tables_end is set up to point to the first "safe" location.
+ * Mappings are created both at virtual address 0 (identity mapping)
+ * and PAGE_OFFSET for up to _end+sizeof(page tables)+INIT_MAP_BEYOND_END.
+ *
+ * Warning: don't use %esi or the stack in this code.  However, %esp
+ * can be used as a GPR if you really need it...
+ */
+page_pde_offset = (__PAGE_OFFSET >> 20);
+
+       movl $(pg0 - __PAGE_OFFSET), %edi
+       movl $(swapper_pg_dir - __PAGE_OFFSET), %edx
+       movl $0x007, %eax                       /* 0x007 = PRESENT+RW+USER */
+10:
+       leal 0x007(%edi),%ecx                   /* Create PDE entry */
+       movl %ecx,(%edx)                        /* Store identity PDE entry */
+       movl %ecx,page_pde_offset(%edx)         /* Store kernel PDE entry */
+       addl $4,%edx
+       movl $1024, %ecx
+11:
+       stosl
+       addl $0x1000,%eax
+       loop 11b
+       /* End condition: we must map up to and including INIT_MAP_BEYOND_END */
+       /* bytes beyond the end of our own page tables; the +0x007 is the attribute bits */
+       leal (INIT_MAP_BEYOND_END+0x007)(%edi),%ebp
+       cmpl %ebp,%eax
+       jb 10b
+       movl %edi,(init_pg_tables_end - __PAGE_OFFSET)
+
+       xorl %ebx,%ebx                          /* This is the boot CPU (BSP) */
+       jmp 3f
+/*
+ * Non-boot CPU entry point; entered from trampoline.S
+ * We can't lgdt here, because lgdt itself uses a data segment, but
+ * we know the trampoline has already loaded the boot_gdt for us.
+ *
+ * If cpu hotplug is not supported then this code can go in init section
+ * which will be freed later
+ */
+
+#ifndef CONFIG_HOTPLUG_CPU
+.section .init.text,"ax",@progbits
+#endif
+
+       /* Do an early initialization of the fixmap area */
+       movl $(swapper_pg_dir - __PAGE_OFFSET), %edx
+       movl $(swapper_pg_pmd - __PAGE_OFFSET), %eax
+       addl $0x007, %eax                       /* 0x007 = PRESENT+RW+USER */
+       movl %eax, 4092(%edx)
+
+#ifdef CONFIG_SMP
+ENTRY(startup_32_smp)
+       cld
+       movl $(__BOOT_DS),%eax
+       movl %eax,%ds
+       movl %eax,%es
+       movl %eax,%fs
+       movl %eax,%gs
+
+/*
+ *     New page tables may be in 4Mbyte page mode and may
+ *     be using the global pages. 
+ *
+ *     NOTE! If we are on a 486 we may have no cr4 at all!
+ *     So we do not try to touch it unless we really have
+ *     some bits in it to set.  This won't work if the BSP
+ *     implements cr4 but this AP does not -- very unlikely
+ *     but be warned!  The same applies to the pse feature
+ *     if not equally supported. --macro
+ *
+ *     NOTE! We have to correct for the fact that we're
+ *     not yet offset PAGE_OFFSET..
+ */
+#define cr4_bits mmu_cr4_features-__PAGE_OFFSET
+       movl cr4_bits,%edx
+       andl %edx,%edx
+       jz 6f
+       movl %cr4,%eax          # Turn on paging options (PSE,PAE,..)
+       orl %edx,%eax
+       movl %eax,%cr4
+
+       btl $5, %eax            # check if PAE is enabled
+       jnc 6f
+
+       /* Check if extended functions are implemented */
+       movl $0x80000000, %eax
+       cpuid
+       cmpl $0x80000000, %eax
+       jbe 6f
+       mov $0x80000001, %eax
+       cpuid
+       /* Execute Disable bit supported? */
+       btl $20, %edx
+       jnc 6f
+
+       /* Setup EFER (Extended Feature Enable Register) */
+       movl $0xc0000080, %ecx
+       rdmsr
+
+       btsl $11, %eax
+       /* Make changes effective */
+       wrmsr
+
+6:
+       /* This is a secondary processor (AP) */
+       xorl %ebx,%ebx
+       incl %ebx
+
+#endif /* CONFIG_SMP */
+3:
+
+/*
+ * Enable paging
+ */
+       movl $swapper_pg_dir-__PAGE_OFFSET,%eax
+       movl %eax,%cr3          /* set the page table pointer.. */
+       movl %cr0,%eax
+       orl $0x80000000,%eax
+       movl %eax,%cr0          /* ..and set paging (PG) bit */
+       ljmp $__BOOT_CS,$1f     /* Clear prefetch and normalize %eip */
+1:
+       /* Set up the stack pointer */
+       lss stack_start,%esp
+
+/*
+ * Initialize eflags.  Some BIOS's leave bits like NT set.  This would
+ * confuse the debugger if this code is traced.
+ * XXX - best to initialize before switching to protected mode.
+ */
+       pushl $0
+       popfl
+
+#ifdef CONFIG_SMP
+       andl %ebx,%ebx
+       jz  1f                          /* Initial CPU cleans BSS */
+       jmp checkCPUtype
+1:
+#endif /* CONFIG_SMP */
+
+/*
+ * start system 32-bit setup. We need to re-do some of the things done
+ * in 16-bit mode for the "real" operations.
+ */
+       call setup_idt
+
+checkCPUtype:
+
+       movl $-1,X86_CPUID              #  -1 for no CPUID initially
+
+/* check if it is 486 or 386. */
+/*
+ * XXX - this does a lot of unnecessary setup.  Alignment checks don't
+ * apply at our cpl of 0 and the stack ought to be aligned already, and
+ * we don't need to preserve eflags.
+ */
+
+       movb $3,X86             # at least 386
+       pushfl                  # push EFLAGS
+       popl %eax               # get EFLAGS
+       movl %eax,%ecx          # save original EFLAGS
+       xorl $0x240000,%eax     # flip AC and ID bits in EFLAGS
+       pushl %eax              # copy to EFLAGS
+       popfl                   # set EFLAGS
+       pushfl                  # get new EFLAGS
+       popl %eax               # put it in eax
+       xorl %ecx,%eax          # change in flags
+       pushl %ecx              # restore original EFLAGS
+       popfl
+       testl $0x40000,%eax     # check if AC bit changed
+       je is386
+
+       movb $4,X86             # at least 486
+       testl $0x200000,%eax    # check if ID bit changed
+       je is486
+
+       /* get vendor info */
+       xorl %eax,%eax                  # call CPUID with 0 -> return vendor ID
+       cpuid
+       movl %eax,X86_CPUID             # save CPUID level
+       movl %ebx,X86_VENDOR_ID         # lo 4 chars
+       movl %edx,X86_VENDOR_ID+4       # next 4 chars
+       movl %ecx,X86_VENDOR_ID+8       # last 4 chars
+
+       orl %eax,%eax                   # do we have processor info as well?
+       je is486
+
+       movl $1,%eax            # Use the CPUID instruction to get CPU type
+       cpuid
+       movb %al,%cl            # save reg for future use
+       andb $0x0f,%ah          # mask processor family
+       movb %ah,X86
+       andb $0xf0,%al          # mask model
+       shrb $4,%al
+       movb %al,X86_MODEL
+       andb $0x0f,%cl          # mask mask revision
+       movb %cl,X86_MASK
+       movl %edx,X86_CAPABILITY
+
+is486: movl $0x50022,%ecx      # set AM, WP, NE and MP
+       jmp 2f
+
+is386: movl $2,%ecx            # set MP
+2:     movl %cr0,%eax
+       andl $0x80000011,%eax   # Save PG,PE,ET
+       orl %ecx,%eax
+       movl %eax,%cr0
+
+       call check_x87
+       lgdt early_gdt_descr
+       lidt idt_descr
+       ljmp $(__KERNEL_CS),$1f
+1:     movl $(__KERNEL_DS),%eax        # reload all the segment registers
+       movl %eax,%ss                   # after changing gdt.
+       movl %eax,%fs                   # gets reset once there's real percpu
+
+       movl $(__USER_DS),%eax          # DS/ES contains default USER segment
+       movl %eax,%ds
+       movl %eax,%es
+
+       xorl %eax,%eax                  # Clear GS and LDT
+       movl %eax,%gs
+       lldt %ax
+
+       cld                     # gcc2 wants the direction flag cleared at all times
+       pushl $0                # fake return address for unwinder
+#ifdef CONFIG_SMP
+       movb ready, %cl
+       movb $1, ready
+       cmpb $0,%cl             # the first CPU calls start_kernel
+       je   1f
+       movl $(__KERNEL_PERCPU), %eax
+       movl %eax,%fs           # set this cpu's percpu
+       jmp initialize_secondary # all other CPUs call initialize_secondary
+1:
+#endif /* CONFIG_SMP */
+       jmp start_kernel
+
+/*
+ * We depend on ET to be correct. This checks for 287/387.
+ */
+check_x87:
+       movb $0,X86_HARD_MATH
+       clts
+       fninit
+       fstsw %ax
+       cmpb $0,%al
+       je 1f
+       movl %cr0,%eax          /* no coprocessor: have to set bits */
+       xorl $4,%eax            /* set EM */
+       movl %eax,%cr0
+       ret
+       ALIGN
+1:     movb $1,X86_HARD_MATH
+       .byte 0xDB,0xE4         /* fsetpm for 287, ignored by 387 */
+       ret
+
+/*
+ *  setup_idt
+ *
+ *  sets up a idt with 256 entries pointing to
+ *  ignore_int, interrupt gates. It doesn't actually load
+ *  idt - that can be done only after paging has been enabled
+ *  and the kernel moved to PAGE_OFFSET. Interrupts
+ *  are enabled elsewhere, when we can be relatively
+ *  sure everything is ok.
+ *
+ *  Warning: %esi is live across this function.
+ */
+setup_idt:
+       lea ignore_int,%edx
+       movl $(__KERNEL_CS << 16),%eax
+       movw %dx,%ax            /* selector = 0x0010 = cs */
+       movw $0x8E00,%dx        /* interrupt gate - dpl=0, present */
+
+       lea idt_table,%edi
+       mov $256,%ecx
+rp_sidt:
+       movl %eax,(%edi)
+       movl %edx,4(%edi)
+       addl $8,%edi
+       dec %ecx
+       jne rp_sidt
+
+.macro set_early_handler handler,trapno
+       lea \handler,%edx
+       movl $(__KERNEL_CS << 16),%eax
+       movw %dx,%ax
+       movw $0x8E00,%dx        /* interrupt gate - dpl=0, present */
+       lea idt_table,%edi
+       movl %eax,8*\trapno(%edi)
+       movl %edx,8*\trapno+4(%edi)
+.endm
+
+       set_early_handler handler=early_divide_err,trapno=0
+       set_early_handler handler=early_illegal_opcode,trapno=6
+       set_early_handler handler=early_protection_fault,trapno=13
+       set_early_handler handler=early_page_fault,trapno=14
+
+       ret
+
+early_divide_err:
+       xor %edx,%edx
+       pushl $0        /* fake errcode */
+       jmp early_fault
+
+early_illegal_opcode:
+       movl $6,%edx
+       pushl $0        /* fake errcode */
+       jmp early_fault
+
+early_protection_fault:
+       movl $13,%edx
+       jmp early_fault
+
+early_page_fault:
+       movl $14,%edx
+       jmp early_fault
+
+early_fault:
+       cld
+#ifdef CONFIG_PRINTK
+       movl $(__KERNEL_DS),%eax
+       movl %eax,%ds
+       movl %eax,%es
+       cmpl $2,early_recursion_flag
+       je hlt_loop
+       incl early_recursion_flag
+       movl %cr2,%eax
+       pushl %eax
+       pushl %edx              /* trapno */
+       pushl $fault_msg
+#ifdef CONFIG_EARLY_PRINTK
+       call early_printk
+#else
+       call printk
+#endif
+#endif
+hlt_loop:
+       hlt
+       jmp hlt_loop
+
+/* This is the default interrupt "handler" :-) */
+       ALIGN
+ignore_int:
+       cld
+#ifdef CONFIG_PRINTK
+       pushl %eax
+       pushl %ecx
+       pushl %edx
+       pushl %es
+       pushl %ds
+       movl $(__KERNEL_DS),%eax
+       movl %eax,%ds
+       movl %eax,%es
+       cmpl $2,early_recursion_flag
+       je hlt_loop
+       incl early_recursion_flag
+       pushl 16(%esp)
+       pushl 24(%esp)
+       pushl 32(%esp)
+       pushl 40(%esp)
+       pushl $int_msg
+#ifdef CONFIG_EARLY_PRINTK
+       call early_printk
+#else
+       call printk
+#endif
+       addl $(5*4),%esp
+       popl %ds
+       popl %es
+       popl %edx
+       popl %ecx
+       popl %eax
+#endif
+       iret
+
+.section .text
+/*
+ * Real beginning of normal "text" segment
+ */
+ENTRY(stext)
+ENTRY(_stext)
+
+/*
+ * BSS section
+ */
+.section ".bss.page_aligned","wa"
+       .align PAGE_SIZE_asm
+ENTRY(swapper_pg_dir)
+       .fill 1024,4,0
+ENTRY(swapper_pg_pmd)
+       .fill 1024,4,0
+ENTRY(empty_zero_page)
+       .fill 4096,1,0
+
+/*
+ * This starts the data section.
+ */
+.data
+ENTRY(stack_start)
+       .long init_thread_union+THREAD_SIZE
+       .long __BOOT_DS
+
+ready: .byte 0
+
+early_recursion_flag:
+       .long 0
+
+int_msg:
+       .asciz "Unknown interrupt or fault at EIP %p %p %p\n"
+
+fault_msg:
+       .ascii "Int %d: CR2 %p  err %p  EIP %p  CS %p  flags %p\n"
+       .asciz "Stack: %p %p %p %p %p %p %p %p\n"
+
+#include "../../x86/xen/xen-head.S"
+
+/*
+ * The IDT and GDT 'descriptors' are a strange 48-bit object
+ * only used by the lidt and lgdt instructions. They are not
+ * like usual segment descriptors - they consist of a 16-bit
+ * segment size, and 32-bit linear address value:
+ */
+
+.globl boot_gdt_descr
+.globl idt_descr
+
+       ALIGN
+# early boot GDT descriptor (must use 1:1 address mapping)
+       .word 0                         # 32 bit align gdt_desc.address
+boot_gdt_descr:
+       .word __BOOT_DS+7
+       .long boot_gdt - __PAGE_OFFSET
+
+       .word 0                         # 32-bit align idt_desc.address
+idt_descr:
+       .word IDT_ENTRIES*8-1           # idt contains 256 entries
+       .long idt_table
+
+# boot GDT descriptor (later on used by CPU#0):
+       .word 0                         # 32 bit align gdt_desc.address
+ENTRY(early_gdt_descr)
+       .word GDT_ENTRIES*8-1
+       .long per_cpu__gdt_page         /* Overwritten for secondary CPUs */
+
+/*
+ * The boot_gdt must mirror the equivalent in setup.S and is
+ * used only for booting.
+ */
+       .align L1_CACHE_BYTES
+ENTRY(boot_gdt)
+       .fill GDT_ENTRY_BOOT_CS,8,0
+       .quad 0x00cf9a000000ffff        /* kernel 4GB code at 0x00000000 */
+       .quad 0x00cf92000000ffff        /* kernel 4GB data at 0x00000000 */
diff --git a/arch/x86/kernel/hpet_32.c b/arch/x86/kernel/hpet_32.c
new file mode 100644 (file)
index 0000000..533d493
--- /dev/null
@@ -0,0 +1,553 @@
+#include <linux/clocksource.h>
+#include <linux/clockchips.h>
+#include <linux/errno.h>
+#include <linux/hpet.h>
+#include <linux/init.h>
+#include <linux/sysdev.h>
+#include <linux/pm.h>
+#include <linux/delay.h>
+
+#include <asm/hpet.h>
+#include <asm/io.h>
+
+extern struct clock_event_device *global_clock_event;
+
+#define HPET_MASK      CLOCKSOURCE_MASK(32)
+#define HPET_SHIFT     22
+
+/* FSEC = 10^-15 NSEC = 10^-9 */
+#define FSEC_PER_NSEC  1000000
+
+/*
+ * HPET address is set in acpi/boot.c, when an ACPI entry exists
+ */
+unsigned long hpet_address;
+static void __iomem * hpet_virt_address;
+
+static inline unsigned long hpet_readl(unsigned long a)
+{
+       return readl(hpet_virt_address + a);
+}
+
+static inline void hpet_writel(unsigned long d, unsigned long a)
+{
+       writel(d, hpet_virt_address + a);
+}
+
+/*
+ * HPET command line enable / disable
+ */
+static int boot_hpet_disable;
+
+static int __init hpet_setup(char* str)
+{
+       if (str) {
+               if (!strncmp("disable", str, 7))
+                       boot_hpet_disable = 1;
+       }
+       return 1;
+}
+__setup("hpet=", hpet_setup);
+
+static inline int is_hpet_capable(void)
+{
+       return (!boot_hpet_disable && hpet_address);
+}
+
+/*
+ * HPET timer interrupt enable / disable
+ */
+static int hpet_legacy_int_enabled;
+
+/**
+ * is_hpet_enabled - check whether the hpet timer interrupt is enabled
+ */
+int is_hpet_enabled(void)
+{
+       return is_hpet_capable() && hpet_legacy_int_enabled;
+}
+
+/*
+ * When the hpet driver (/dev/hpet) is enabled, we need to reserve
+ * timer 0 and timer 1 in case of RTC emulation.
+ */
+#ifdef CONFIG_HPET
+static void hpet_reserve_platform_timers(unsigned long id)
+{
+       struct hpet __iomem *hpet = hpet_virt_address;
+       struct hpet_timer __iomem *timer = &hpet->hpet_timers[2];
+       unsigned int nrtimers, i;
+       struct hpet_data hd;
+
+       nrtimers = ((id & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT) + 1;
+
+       memset(&hd, 0, sizeof (hd));
+       hd.hd_phys_address = hpet_address;
+       hd.hd_address = hpet_virt_address;
+       hd.hd_nirqs = nrtimers;
+       hd.hd_flags = HPET_DATA_PLATFORM;
+       hpet_reserve_timer(&hd, 0);
+
+#ifdef CONFIG_HPET_EMULATE_RTC
+       hpet_reserve_timer(&hd, 1);
+#endif
+
+       hd.hd_irq[0] = HPET_LEGACY_8254;
+       hd.hd_irq[1] = HPET_LEGACY_RTC;
+
+       for (i = 2; i < nrtimers; timer++, i++)
+               hd.hd_irq[i] = (timer->hpet_config & Tn_INT_ROUTE_CNF_MASK) >>
+                       Tn_INT_ROUTE_CNF_SHIFT;
+
+       hpet_alloc(&hd);
+
+}
+#else
+static void hpet_reserve_platform_timers(unsigned long id) { }
+#endif
+
+/*
+ * Common hpet info
+ */
+static unsigned long hpet_period;
+
+static void hpet_set_mode(enum clock_event_mode mode,
+                         struct clock_event_device *evt);
+static int hpet_next_event(unsigned long delta,
+                          struct clock_event_device *evt);
+
+/*
+ * The hpet clock event device
+ */
+static struct clock_event_device hpet_clockevent = {
+       .name           = "hpet",
+       .features       = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
+       .set_mode       = hpet_set_mode,
+       .set_next_event = hpet_next_event,
+       .shift          = 32,
+       .irq            = 0,
+};
+
+static void hpet_start_counter(void)
+{
+       unsigned long cfg = hpet_readl(HPET_CFG);
+
+       cfg &= ~HPET_CFG_ENABLE;
+       hpet_writel(cfg, HPET_CFG);
+       hpet_writel(0, HPET_COUNTER);
+       hpet_writel(0, HPET_COUNTER + 4);
+       cfg |= HPET_CFG_ENABLE;
+       hpet_writel(cfg, HPET_CFG);
+}
+
+static void hpet_enable_int(void)
+{
+       unsigned long cfg = hpet_readl(HPET_CFG);
+
+       cfg |= HPET_CFG_LEGACY;
+       hpet_writel(cfg, HPET_CFG);
+       hpet_legacy_int_enabled = 1;
+}
+
+static void hpet_set_mode(enum clock_event_mode mode,
+                         struct clock_event_device *evt)
+{
+       unsigned long cfg, cmp, now;
+       uint64_t delta;
+
+       switch(mode) {
+       case CLOCK_EVT_MODE_PERIODIC:
+               delta = ((uint64_t)(NSEC_PER_SEC/HZ)) * hpet_clockevent.mult;
+               delta >>= hpet_clockevent.shift;
+               now = hpet_readl(HPET_COUNTER);
+               cmp = now + (unsigned long) delta;
+               cfg = hpet_readl(HPET_T0_CFG);
+               cfg |= HPET_TN_ENABLE | HPET_TN_PERIODIC |
+                      HPET_TN_SETVAL | HPET_TN_32BIT;
+               hpet_writel(cfg, HPET_T0_CFG);
+               /*
+                * The first write after writing TN_SETVAL to the
+                * config register sets the counter value, the second
+                * write sets the period.
+                */
+               hpet_writel(cmp, HPET_T0_CMP);
+               udelay(1);
+               hpet_writel((unsigned long) delta, HPET_T0_CMP);
+               break;
+
+       case CLOCK_EVT_MODE_ONESHOT:
+               cfg = hpet_readl(HPET_T0_CFG);
+               cfg &= ~HPET_TN_PERIODIC;
+               cfg |= HPET_TN_ENABLE | HPET_TN_32BIT;
+               hpet_writel(cfg, HPET_T0_CFG);
+               break;
+
+       case CLOCK_EVT_MODE_UNUSED:
+       case CLOCK_EVT_MODE_SHUTDOWN:
+               cfg = hpet_readl(HPET_T0_CFG);
+               cfg &= ~HPET_TN_ENABLE;
+               hpet_writel(cfg, HPET_T0_CFG);
+               break;
+
+       case CLOCK_EVT_MODE_RESUME:
+               hpet_enable_int();
+               break;
+       }
+}
+
+static int hpet_next_event(unsigned long delta,
+                          struct clock_event_device *evt)
+{
+       unsigned long cnt;
+
+       cnt = hpet_readl(HPET_COUNTER);
+       cnt += delta;
+       hpet_writel(cnt, HPET_T0_CMP);
+
+       return ((long)(hpet_readl(HPET_COUNTER) - cnt ) > 0) ? -ETIME : 0;
+}
+
+/*
+ * Clock source related code
+ */
+static cycle_t read_hpet(void)
+{
+       return (cycle_t)hpet_readl(HPET_COUNTER);
+}
+
+static struct clocksource clocksource_hpet = {
+       .name           = "hpet",
+       .rating         = 250,
+       .read           = read_hpet,
+       .mask           = HPET_MASK,
+       .shift          = HPET_SHIFT,
+       .flags          = CLOCK_SOURCE_IS_CONTINUOUS,
+       .resume         = hpet_start_counter,
+};
+
+/*
+ * Try to setup the HPET timer
+ */
+int __init hpet_enable(void)
+{
+       unsigned long id;
+       uint64_t hpet_freq;
+       u64 tmp, start, now;
+       cycle_t t1;
+
+       if (!is_hpet_capable())
+               return 0;
+
+       hpet_virt_address = ioremap_nocache(hpet_address, HPET_MMAP_SIZE);
+
+       /*
+        * Read the period and check for a sane value:
+        */
+       hpet_period = hpet_readl(HPET_PERIOD);
+       if (hpet_period < HPET_MIN_PERIOD || hpet_period > HPET_MAX_PERIOD)
+               goto out_nohpet;
+
+       /*
+        * The period is a femto seconds value. We need to calculate the
+        * scaled math multiplication factor for nanosecond to hpet tick
+        * conversion.
+        */
+       hpet_freq = 1000000000000000ULL;
+       do_div(hpet_freq, hpet_period);
+       hpet_clockevent.mult = div_sc((unsigned long) hpet_freq,
+                                     NSEC_PER_SEC, 32);
+       /* Calculate the min / max delta */
+       hpet_clockevent.max_delta_ns = clockevent_delta2ns(0x7FFFFFFF,
+                                                          &hpet_clockevent);
+       hpet_clockevent.min_delta_ns = clockevent_delta2ns(0x30,
+                                                          &hpet_clockevent);
+
+       /*
+        * Read the HPET ID register to retrieve the IRQ routing
+        * information and the number of channels
+        */
+       id = hpet_readl(HPET_ID);
+
+#ifdef CONFIG_HPET_EMULATE_RTC
+       /*
+        * The legacy routing mode needs at least two channels, tick timer
+        * and the rtc emulation channel.
+        */
+       if (!(id & HPET_ID_NUMBER))
+               goto out_nohpet;
+#endif
+
+       /* Start the counter */
+       hpet_start_counter();
+
+       /* Verify whether hpet counter works */
+       t1 = read_hpet();
+       rdtscll(start);
+
+       /*
+        * We don't know the TSC frequency yet, but waiting for
+        * 200000 TSC cycles is safe:
+        * 4 GHz == 50us
+        * 1 GHz == 200us
+        */
+       do {
+               rep_nop();
+               rdtscll(now);
+       } while ((now - start) < 200000UL);
+
+       if (t1 == read_hpet()) {
+               printk(KERN_WARNING
+                      "HPET counter not counting. HPET disabled\n");
+               goto out_nohpet;
+       }
+
+       /* Initialize and register HPET clocksource
+        *
+        * hpet period is in femto seconds per cycle
+        * so we need to convert this to ns/cyc units
+        * aproximated by mult/2^shift
+        *
+        *  fsec/cyc * 1nsec/1000000fsec = nsec/cyc = mult/2^shift
+        *  fsec/cyc * 1ns/1000000fsec * 2^shift = mult
+        *  fsec/cyc * 2^shift * 1nsec/1000000fsec = mult
+        *  (fsec/cyc << shift)/1000000 = mult
+        *  (hpet_period << shift)/FSEC_PER_NSEC = mult
+        */
+       tmp = (u64)hpet_period << HPET_SHIFT;
+       do_div(tmp, FSEC_PER_NSEC);
+       clocksource_hpet.mult = (u32)tmp;
+
+       clocksource_register(&clocksource_hpet);
+
+       if (id & HPET_ID_LEGSUP) {
+               hpet_enable_int();
+               hpet_reserve_platform_timers(id);
+               /*
+                * Start hpet with the boot cpu mask and make it
+                * global after the IO_APIC has been initialized.
+                */
+               hpet_clockevent.cpumask = cpumask_of_cpu(smp_processor_id());
+               clockevents_register_device(&hpet_clockevent);
+               global_clock_event = &hpet_clockevent;
+               return 1;
+       }
+       return 0;
+
+out_nohpet:
+       iounmap(hpet_virt_address);
+       hpet_virt_address = NULL;
+       boot_hpet_disable = 1;
+       return 0;
+}
+
+
+#ifdef CONFIG_HPET_EMULATE_RTC
+
+/* HPET in LegacyReplacement Mode eats up RTC interrupt line. When, HPET
+ * is enabled, we support RTC interrupt functionality in software.
+ * RTC has 3 kinds of interrupts:
+ * 1) Update Interrupt - generate an interrupt, every sec, when RTC clock
+ *    is updated
+ * 2) Alarm Interrupt - generate an interrupt at a specific time of day
+ * 3) Periodic Interrupt - generate periodic interrupt, with frequencies
+ *    2Hz-8192Hz (2Hz-64Hz for non-root user) (all freqs in powers of 2)
+ * (1) and (2) above are implemented using polling at a frequency of
+ * 64 Hz. The exact frequency is a tradeoff between accuracy and interrupt
+ * overhead. (DEFAULT_RTC_INT_FREQ)
+ * For (3), we use interrupts at 64Hz or user specified periodic
+ * frequency, whichever is higher.
+ */
+#include <linux/mc146818rtc.h>
+#include <linux/rtc.h>
+
+#define DEFAULT_RTC_INT_FREQ   64
+#define DEFAULT_RTC_SHIFT      6
+#define RTC_NUM_INTS           1
+
+static unsigned long hpet_rtc_flags;
+static unsigned long hpet_prev_update_sec;
+static struct rtc_time hpet_alarm_time;
+static unsigned long hpet_pie_count;
+static unsigned long hpet_t1_cmp;
+static unsigned long hpet_default_delta;
+static unsigned long hpet_pie_delta;
+static unsigned long hpet_pie_limit;
+
+/*
+ * Timer 1 for RTC emulation. We use one shot mode, as periodic mode
+ * is not supported by all HPET implementations for timer 1.
+ *
+ * hpet_rtc_timer_init() is called when the rtc is initialized.
+ */
+int hpet_rtc_timer_init(void)
+{
+       unsigned long cfg, cnt, delta, flags;
+
+       if (!is_hpet_enabled())
+               return 0;
+
+       if (!hpet_default_delta) {
+               uint64_t clc;
+
+               clc = (uint64_t) hpet_clockevent.mult * NSEC_PER_SEC;
+               clc >>= hpet_clockevent.shift + DEFAULT_RTC_SHIFT;
+               hpet_default_delta = (unsigned long) clc;
+       }
+
+       if (!(hpet_rtc_flags & RTC_PIE) || hpet_pie_limit)
+               delta = hpet_default_delta;
+       else
+               delta = hpet_pie_delta;
+
+       local_irq_save(flags);
+
+       cnt = delta + hpet_readl(HPET_COUNTER);
+       hpet_writel(cnt, HPET_T1_CMP);
+       hpet_t1_cmp = cnt;
+
+       cfg = hpet_readl(HPET_T1_CFG);
+       cfg &= ~HPET_TN_PERIODIC;
+       cfg |= HPET_TN_ENABLE | HPET_TN_32BIT;
+       hpet_writel(cfg, HPET_T1_CFG);
+
+       local_irq_restore(flags);
+
+       return 1;
+}
+
+/*
+ * The functions below are called from rtc driver.
+ * Return 0 if HPET is not being used.
+ * Otherwise do the necessary changes and return 1.
+ */
+int hpet_mask_rtc_irq_bit(unsigned long bit_mask)
+{
+       if (!is_hpet_enabled())
+               return 0;
+
+       hpet_rtc_flags &= ~bit_mask;
+       return 1;
+}
+
+int hpet_set_rtc_irq_bit(unsigned long bit_mask)
+{
+       unsigned long oldbits = hpet_rtc_flags;
+
+       if (!is_hpet_enabled())
+               return 0;
+
+       hpet_rtc_flags |= bit_mask;
+
+       if (!oldbits)
+               hpet_rtc_timer_init();
+
+       return 1;
+}
+
+int hpet_set_alarm_time(unsigned char hrs, unsigned char min,
+                       unsigned char sec)
+{
+       if (!is_hpet_enabled())
+               return 0;
+
+       hpet_alarm_time.tm_hour = hrs;
+       hpet_alarm_time.tm_min = min;
+       hpet_alarm_time.tm_sec = sec;
+
+       return 1;
+}
+
+int hpet_set_periodic_freq(unsigned long freq)
+{
+       uint64_t clc;
+
+       if (!is_hpet_enabled())
+               return 0;
+
+       if (freq <= DEFAULT_RTC_INT_FREQ)
+               hpet_pie_limit = DEFAULT_RTC_INT_FREQ / freq;
+       else {
+               clc = (uint64_t) hpet_clockevent.mult * NSEC_PER_SEC;
+               do_div(clc, freq);
+               clc >>= hpet_clockevent.shift;
+               hpet_pie_delta = (unsigned long) clc;
+       }
+       return 1;
+}
+
+int hpet_rtc_dropped_irq(void)
+{
+       return is_hpet_enabled();
+}
+
+static void hpet_rtc_timer_reinit(void)
+{
+       unsigned long cfg, delta;
+       int lost_ints = -1;
+
+       if (unlikely(!hpet_rtc_flags)) {
+               cfg = hpet_readl(HPET_T1_CFG);
+               cfg &= ~HPET_TN_ENABLE;
+               hpet_writel(cfg, HPET_T1_CFG);
+               return;
+       }
+
+       if (!(hpet_rtc_flags & RTC_PIE) || hpet_pie_limit)
+               delta = hpet_default_delta;
+       else
+               delta = hpet_pie_delta;
+
+       /*
+        * Increment the comparator value until we are ahead of the
+        * current count.
+        */
+       do {
+               hpet_t1_cmp += delta;
+               hpet_writel(hpet_t1_cmp, HPET_T1_CMP);
+               lost_ints++;
+       } while ((long)(hpet_readl(HPET_COUNTER) - hpet_t1_cmp) > 0);
+
+       if (lost_ints) {
+               if (hpet_rtc_flags & RTC_PIE)
+                       hpet_pie_count += lost_ints;
+               if (printk_ratelimit())
+                       printk(KERN_WARNING "rtc: lost %d interrupts\n",
+                               lost_ints);
+       }
+}
+
+irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id)
+{
+       struct rtc_time curr_time;
+       unsigned long rtc_int_flag = 0;
+
+       hpet_rtc_timer_reinit();
+
+       if (hpet_rtc_flags & (RTC_UIE | RTC_AIE))
+               rtc_get_rtc_time(&curr_time);
+
+       if (hpet_rtc_flags & RTC_UIE &&
+           curr_time.tm_sec != hpet_prev_update_sec) {
+               rtc_int_flag = RTC_UF;
+               hpet_prev_update_sec = curr_time.tm_sec;
+       }
+
+       if (hpet_rtc_flags & RTC_PIE &&
+           ++hpet_pie_count >= hpet_pie_limit) {
+               rtc_int_flag |= RTC_PF;
+               hpet_pie_count = 0;
+       }
+
+       if (hpet_rtc_flags & RTC_PIE &&
+           (curr_time.tm_sec == hpet_alarm_time.tm_sec) &&
+           (curr_time.tm_min == hpet_alarm_time.tm_min) &&
+           (curr_time.tm_hour == hpet_alarm_time.tm_hour))
+                       rtc_int_flag |= RTC_AF;
+
+       if (rtc_int_flag) {
+               rtc_int_flag |= (RTC_IRQF | (RTC_NUM_INTS << 8));
+               rtc_interrupt(rtc_int_flag, dev_id);
+       }
+       return IRQ_HANDLED;
+}
+#endif
diff --git a/arch/x86/kernel/i386_ksyms_32.c b/arch/x86/kernel/i386_ksyms_32.c
new file mode 100644 (file)
index 0000000..e3d4b73
--- /dev/null
@@ -0,0 +1,30 @@
+#include <linux/module.h>
+#include <asm/checksum.h>
+#include <asm/desc.h>
+
+EXPORT_SYMBOL(__down_failed);
+EXPORT_SYMBOL(__down_failed_interruptible);
+EXPORT_SYMBOL(__down_failed_trylock);
+EXPORT_SYMBOL(__up_wakeup);
+/* Networking helper routines. */
+EXPORT_SYMBOL(csum_partial_copy_generic);
+
+EXPORT_SYMBOL(__get_user_1);
+EXPORT_SYMBOL(__get_user_2);
+EXPORT_SYMBOL(__get_user_4);
+
+EXPORT_SYMBOL(__put_user_1);
+EXPORT_SYMBOL(__put_user_2);
+EXPORT_SYMBOL(__put_user_4);
+EXPORT_SYMBOL(__put_user_8);
+
+EXPORT_SYMBOL(strstr);
+
+#ifdef CONFIG_SMP
+extern void FASTCALL( __write_lock_failed(rwlock_t *rw));
+extern void FASTCALL( __read_lock_failed(rwlock_t *rw));
+EXPORT_SYMBOL(__write_lock_failed);
+EXPORT_SYMBOL(__read_lock_failed);
+#endif
+
+EXPORT_SYMBOL(csum_partial);
diff --git a/arch/x86/kernel/i387_32.c b/arch/x86/kernel/i387_32.c
new file mode 100644 (file)
index 0000000..6658472
--- /dev/null
@@ -0,0 +1,546 @@
+/*
+ *  linux/arch/i386/kernel/i387.c
+ *
+ *  Copyright (C) 1994 Linus Torvalds
+ *
+ *  Pentium III FXSR, SSE support
+ *  General FPU state handling cleanups
+ *     Gareth Hughes <gareth@valinux.com>, May 2000
+ */
+
+#include <linux/sched.h>
+#include <linux/module.h>
+#include <asm/processor.h>
+#include <asm/i387.h>
+#include <asm/math_emu.h>
+#include <asm/sigcontext.h>
+#include <asm/user.h>
+#include <asm/ptrace.h>
+#include <asm/uaccess.h>
+
+#ifdef CONFIG_MATH_EMULATION
+#define HAVE_HWFP (boot_cpu_data.hard_math)
+#else
+#define HAVE_HWFP 1
+#endif
+
+static unsigned long mxcsr_feature_mask __read_mostly = 0xffffffff;
+
+void mxcsr_feature_mask_init(void)
+{
+       unsigned long mask = 0;
+       clts();
+       if (cpu_has_fxsr) {
+               memset(&current->thread.i387.fxsave, 0, sizeof(struct i387_fxsave_struct));
+               asm volatile("fxsave %0" : : "m" (current->thread.i387.fxsave)); 
+               mask = current->thread.i387.fxsave.mxcsr_mask;
+               if (mask == 0) mask = 0x0000ffbf;
+       } 
+       mxcsr_feature_mask &= mask;
+       stts();
+}
+
+/*
+ * The _current_ task is using the FPU for the first time
+ * so initialize it and set the mxcsr to its default
+ * value at reset if we support XMM instructions and then
+ * remeber the current task has used the FPU.
+ */
+void init_fpu(struct task_struct *tsk)
+{
+       if (cpu_has_fxsr) {
+               memset(&tsk->thread.i387.fxsave, 0, sizeof(struct i387_fxsave_struct));
+               tsk->thread.i387.fxsave.cwd = 0x37f;
+               if (cpu_has_xmm)
+                       tsk->thread.i387.fxsave.mxcsr = 0x1f80;
+       } else {
+               memset(&tsk->thread.i387.fsave, 0, sizeof(struct i387_fsave_struct));
+               tsk->thread.i387.fsave.cwd = 0xffff037fu;
+               tsk->thread.i387.fsave.swd = 0xffff0000u;
+               tsk->thread.i387.fsave.twd = 0xffffffffu;
+               tsk->thread.i387.fsave.fos = 0xffff0000u;
+       }
+       /* only the device not available exception or ptrace can call init_fpu */
+       set_stopped_child_used_math(tsk);
+}
+
+/*
+ * FPU lazy state save handling.
+ */
+
+void kernel_fpu_begin(void)
+{
+       struct thread_info *thread = current_thread_info();
+
+       preempt_disable();
+       if (thread->status & TS_USEDFPU) {
+               __save_init_fpu(thread->task);
+               return;
+       }
+       clts();
+}
+EXPORT_SYMBOL_GPL(kernel_fpu_begin);
+
+/*
+ * FPU tag word conversions.
+ */
+
+static inline unsigned short twd_i387_to_fxsr( unsigned short twd )
+{
+       unsigned int tmp; /* to avoid 16 bit prefixes in the code */
+       /* Transform each pair of bits into 01 (valid) or 00 (empty) */
+        tmp = ~twd;
+        tmp = (tmp | (tmp>>1)) & 0x5555; /* 0V0V0V0V0V0V0V0V */
+        /* and move the valid bits to the lower byte. */
+        tmp = (tmp | (tmp >> 1)) & 0x3333; /* 00VV00VV00VV00VV */
+        tmp = (tmp | (tmp >> 2)) & 0x0f0f; /* 0000VVVV0000VVVV */
+        tmp = (tmp | (tmp >> 4)) & 0x00ff; /* 00000000VVVVVVVV */
+        return tmp;
+}
+
+static inline unsigned long twd_fxsr_to_i387( struct i387_fxsave_struct *fxsave )
+{
+       struct _fpxreg *st = NULL;
+       unsigned long tos = (fxsave->swd >> 11) & 7;
+       unsigned long twd = (unsigned long) fxsave->twd;
+       unsigned long tag;
+       unsigned long ret = 0xffff0000u;
+       int i;
+
+#define FPREG_ADDR(f, n)       ((void *)&(f)->st_space + (n) * 16);
+
+       for ( i = 0 ; i < 8 ; i++ ) {
+               if ( twd & 0x1 ) {
+                       st = FPREG_ADDR( fxsave, (i - tos) & 7 );
+
+                       switch ( st->exponent & 0x7fff ) {
+                       case 0x7fff:
+                               tag = 2;                /* Special */
+                               break;
+                       case 0x0000:
+                               if ( !st->significand[0] &&
+                                    !st->significand[1] &&
+                                    !st->significand[2] &&
+                                    !st->significand[3] ) {
+                                       tag = 1;        /* Zero */
+                               } else {
+                                       tag = 2;        /* Special */
+                               }
+                               break;
+                       default:
+                               if ( st->significand[3] & 0x8000 ) {
+                                       tag = 0;        /* Valid */
+                               } else {
+                                       tag = 2;        /* Special */
+                               }
+                               break;
+                       }
+               } else {
+                       tag = 3;                        /* Empty */
+               }
+               ret |= (tag << (2 * i));
+               twd = twd >> 1;
+       }
+       return ret;
+}
+
+/*
+ * FPU state interaction.
+ */
+
+unsigned short get_fpu_cwd( struct task_struct *tsk )
+{
+       if ( cpu_has_fxsr ) {
+               return tsk->thread.i387.fxsave.cwd;
+       } else {
+               return (unsigned short)tsk->thread.i387.fsave.cwd;
+       }
+}
+
+unsigned short get_fpu_swd( struct task_struct *tsk )
+{
+       if ( cpu_has_fxsr ) {
+               return tsk->thread.i387.fxsave.swd;
+       } else {
+               return (unsigned short)tsk->thread.i387.fsave.swd;
+       }
+}
+
+#if 0
+unsigned short get_fpu_twd( struct task_struct *tsk )
+{
+       if ( cpu_has_fxsr ) {
+               return tsk->thread.i387.fxsave.twd;
+       } else {
+               return (unsigned short)tsk->thread.i387.fsave.twd;
+       }
+}
+#endif  /*  0  */
+
+unsigned short get_fpu_mxcsr( struct task_struct *tsk )
+{
+       if ( cpu_has_xmm ) {
+               return tsk->thread.i387.fxsave.mxcsr;
+       } else {
+               return 0x1f80;
+       }
+}
+
+#if 0
+
+void set_fpu_cwd( struct task_struct *tsk, unsigned short cwd )
+{
+       if ( cpu_has_fxsr ) {
+               tsk->thread.i387.fxsave.cwd = cwd;
+       } else {
+               tsk->thread.i387.fsave.cwd = ((long)cwd | 0xffff0000u);
+       }
+}
+
+void set_fpu_swd( struct task_struct *tsk, unsigned short swd )
+{
+       if ( cpu_has_fxsr ) {
+               tsk->thread.i387.fxsave.swd = swd;
+       } else {
+               tsk->thread.i387.fsave.swd = ((long)swd | 0xffff0000u);
+       }
+}
+
+void set_fpu_twd( struct task_struct *tsk, unsigned short twd )
+{
+       if ( cpu_has_fxsr ) {
+               tsk->thread.i387.fxsave.twd = twd_i387_to_fxsr(twd);
+       } else {
+               tsk->thread.i387.fsave.twd = ((long)twd | 0xffff0000u);
+       }
+}
+
+#endif  /*  0  */
+
+/*
+ * FXSR floating point environment conversions.
+ */
+
+static int convert_fxsr_to_user( struct _fpstate __user *buf,
+                                       struct i387_fxsave_struct *fxsave )
+{
+       unsigned long env[7];
+       struct _fpreg __user *to;
+       struct _fpxreg *from;
+       int i;
+
+       env[0] = (unsigned long)fxsave->cwd | 0xffff0000ul;
+       env[1] = (unsigned long)fxsave->swd | 0xffff0000ul;
+       env[2] = twd_fxsr_to_i387(fxsave);
+       env[3] = fxsave->fip;
+       env[4] = fxsave->fcs | ((unsigned long)fxsave->fop << 16);
+       env[5] = fxsave->foo;
+       env[6] = fxsave->fos;
+
+       if ( __copy_to_user( buf, env, 7 * sizeof(unsigned long) ) )
+               return 1;
+
+       to = &buf->_st[0];
+       from = (struct _fpxreg *) &fxsave->st_space[0];
+       for ( i = 0 ; i < 8 ; i++, to++, from++ ) {
+               unsigned long __user *t = (unsigned long __user *)to;
+               unsigned long *f = (unsigned long *)from;
+
+               if (__put_user(*f, t) ||
+                               __put_user(*(f + 1), t + 1) ||
+                               __put_user(from->exponent, &to->exponent))
+                       return 1;
+       }
+       return 0;
+}
+
+static int convert_fxsr_from_user( struct i387_fxsave_struct *fxsave,
+                                         struct _fpstate __user *buf )
+{
+       unsigned long env[7];
+       struct _fpxreg *to;
+       struct _fpreg __user *from;
+       int i;
+
+       if ( __copy_from_user( env, buf, 7 * sizeof(long) ) )
+               return 1;
+
+       fxsave->cwd = (unsigned short)(env[0] & 0xffff);
+       fxsave->swd = (unsigned short)(env[1] & 0xffff);
+       fxsave->twd = twd_i387_to_fxsr((unsigned short)(env[2] & 0xffff));
+       fxsave->fip = env[3];
+       fxsave->fop = (unsigned short)((env[4] & 0xffff0000ul) >> 16);
+       fxsave->fcs = (env[4] & 0xffff);
+       fxsave->foo = env[5];
+       fxsave->fos = env[6];
+
+       to = (struct _fpxreg *) &fxsave->st_space[0];
+       from = &buf->_st[0];
+       for ( i = 0 ; i < 8 ; i++, to++, from++ ) {
+               unsigned long *t = (unsigned long *)to;
+               unsigned long __user *f = (unsigned long __user *)from;
+
+               if (__get_user(*t, f) ||
+                               __get_user(*(t + 1), f + 1) ||
+                               __get_user(to->exponent, &from->exponent))
+                       return 1;
+       }
+       return 0;
+}
+
+/*
+ * Signal frame handlers.
+ */
+
+static inline int save_i387_fsave( struct _fpstate __user *buf )
+{
+       struct task_struct *tsk = current;
+
+       unlazy_fpu( tsk );
+       tsk->thread.i387.fsave.status = tsk->thread.i387.fsave.swd;
+       if ( __copy_to_user( buf, &tsk->thread.i387.fsave,
+                            sizeof(struct i387_fsave_struct) ) )
+               return -1;
+       return 1;
+}
+
+static int save_i387_fxsave( struct _fpstate __user *buf )
+{
+       struct task_struct *tsk = current;
+       int err = 0;
+
+       unlazy_fpu( tsk );
+
+       if ( convert_fxsr_to_user( buf, &tsk->thread.i387.fxsave ) )
+               return -1;
+
+       err |= __put_user( tsk->thread.i387.fxsave.swd, &buf->status );
+       err |= __put_user( X86_FXSR_MAGIC, &buf->magic );
+       if ( err )
+               return -1;
+
+       if ( __copy_to_user( &buf->_fxsr_env[0], &tsk->thread.i387.fxsave,
+                            sizeof(struct i387_fxsave_struct) ) )
+               return -1;
+       return 1;
+}
+
+int save_i387( struct _fpstate __user *buf )
+{
+       if ( !used_math() )
+               return 0;
+
+       /* This will cause a "finit" to be triggered by the next
+        * attempted FPU operation by the 'current' process.
+        */
+       clear_used_math();
+
+       if ( HAVE_HWFP ) {
+               if ( cpu_has_fxsr ) {
+                       return save_i387_fxsave( buf );
+               } else {
+                       return save_i387_fsave( buf );
+               }
+       } else {
+               return save_i387_soft( &current->thread.i387.soft, buf );
+       }
+}
+
+static inline int restore_i387_fsave( struct _fpstate __user *buf )
+{
+       struct task_struct *tsk = current;
+       clear_fpu( tsk );
+       return __copy_from_user( &tsk->thread.i387.fsave, buf,
+                                sizeof(struct i387_fsave_struct) );
+}
+
+static int restore_i387_fxsave( struct _fpstate __user *buf )
+{
+       int err;
+       struct task_struct *tsk = current;
+       clear_fpu( tsk );
+       err = __copy_from_user( &tsk->thread.i387.fxsave, &buf->_fxsr_env[0],
+                               sizeof(struct i387_fxsave_struct) );
+       /* mxcsr reserved bits must be masked to zero for security reasons */
+       tsk->thread.i387.fxsave.mxcsr &= mxcsr_feature_mask;
+       return err ? 1 : convert_fxsr_from_user( &tsk->thread.i387.fxsave, buf );
+}
+
+int restore_i387( struct _fpstate __user *buf )
+{
+       int err;
+
+       if ( HAVE_HWFP ) {
+               if ( cpu_has_fxsr ) {
+                       err = restore_i387_fxsave( buf );
+               } else {
+                       err = restore_i387_fsave( buf );
+               }
+       } else {
+               err = restore_i387_soft( &current->thread.i387.soft, buf );
+       }
+       set_used_math();
+       return err;
+}
+
+/*
+ * ptrace request handlers.
+ */
+
+static inline int get_fpregs_fsave( struct user_i387_struct __user *buf,
+                                   struct task_struct *tsk )
+{
+       return __copy_to_user( buf, &tsk->thread.i387.fsave,
+                              sizeof(struct user_i387_struct) );
+}
+
+static inline int get_fpregs_fxsave( struct user_i387_struct __user *buf,
+                                    struct task_struct *tsk )
+{
+       return convert_fxsr_to_user( (struct _fpstate __user *)buf,
+                                    &tsk->thread.i387.fxsave );
+}
+
+int get_fpregs( struct user_i387_struct __user *buf, struct task_struct *tsk )
+{
+       if ( HAVE_HWFP ) {
+               if ( cpu_has_fxsr ) {
+                       return get_fpregs_fxsave( buf, tsk );
+               } else {
+                       return get_fpregs_fsave( buf, tsk );
+               }
+       } else {
+               return save_i387_soft( &tsk->thread.i387.soft,
+                                      (struct _fpstate __user *)buf );
+       }
+}
+
+static inline int set_fpregs_fsave( struct task_struct *tsk,
+                                   struct user_i387_struct __user *buf )
+{
+       return __copy_from_user( &tsk->thread.i387.fsave, buf,
+                                sizeof(struct user_i387_struct) );
+}
+
+static inline int set_fpregs_fxsave( struct task_struct *tsk,
+                                    struct user_i387_struct __user *buf )
+{
+       return convert_fxsr_from_user( &tsk->thread.i387.fxsave,
+                                      (struct _fpstate __user *)buf );
+}
+
+int set_fpregs( struct task_struct *tsk, struct user_i387_struct __user *buf )
+{
+       if ( HAVE_HWFP ) {
+               if ( cpu_has_fxsr ) {
+                       return set_fpregs_fxsave( tsk, buf );
+               } else {
+                       return set_fpregs_fsave( tsk, buf );
+               }
+       } else {
+               return restore_i387_soft( &tsk->thread.i387.soft,
+                                         (struct _fpstate __user *)buf );
+       }
+}
+
+int get_fpxregs( struct user_fxsr_struct __user *buf, struct task_struct *tsk )
+{
+       if ( cpu_has_fxsr ) {
+               if (__copy_to_user( buf, &tsk->thread.i387.fxsave,
+                                   sizeof(struct user_fxsr_struct) ))
+                       return -EFAULT;
+               return 0;
+       } else {
+               return -EIO;
+       }
+}
+
+int set_fpxregs( struct task_struct *tsk, struct user_fxsr_struct __user *buf )
+{
+       int ret = 0;
+
+       if ( cpu_has_fxsr ) {
+               if (__copy_from_user( &tsk->thread.i387.fxsave, buf,
+                                 sizeof(struct user_fxsr_struct) ))
+                       ret = -EFAULT;
+               /* mxcsr reserved bits must be masked to zero for security reasons */
+               tsk->thread.i387.fxsave.mxcsr &= mxcsr_feature_mask;
+       } else {
+               ret = -EIO;
+       }
+       return ret;
+}
+
+/*
+ * FPU state for core dumps.
+ */
+
+static inline void copy_fpu_fsave( struct task_struct *tsk,
+                                  struct user_i387_struct *fpu )
+{
+       memcpy( fpu, &tsk->thread.i387.fsave,
+               sizeof(struct user_i387_struct) );
+}
+
+static inline void copy_fpu_fxsave( struct task_struct *tsk,
+                                  struct user_i387_struct *fpu )
+{
+       unsigned short *to;
+       unsigned short *from;
+       int i;
+
+       memcpy( fpu, &tsk->thread.i387.fxsave, 7 * sizeof(long) );
+
+       to = (unsigned short *)&fpu->st_space[0];
+       from = (unsigned short *)&tsk->thread.i387.fxsave.st_space[0];
+       for ( i = 0 ; i < 8 ; i++, to += 5, from += 8 ) {
+               memcpy( to, from, 5 * sizeof(unsigned short) );
+       }
+}
+
+int dump_fpu( struct pt_regs *regs, struct user_i387_struct *fpu )
+{
+       int fpvalid;
+       struct task_struct *tsk = current;
+
+       fpvalid = !!used_math();
+       if ( fpvalid ) {
+               unlazy_fpu( tsk );
+               if ( cpu_has_fxsr ) {
+                       copy_fpu_fxsave( tsk, fpu );
+               } else {
+                       copy_fpu_fsave( tsk, fpu );
+               }
+       }
+
+       return fpvalid;
+}
+EXPORT_SYMBOL(dump_fpu);
+
+int dump_task_fpu(struct task_struct *tsk, struct user_i387_struct *fpu)
+{
+       int fpvalid = !!tsk_used_math(tsk);
+
+       if (fpvalid) {
+               if (tsk == current)
+                       unlazy_fpu(tsk);
+               if (cpu_has_fxsr)
+                       copy_fpu_fxsave(tsk, fpu);
+               else
+                       copy_fpu_fsave(tsk, fpu);
+       }
+       return fpvalid;
+}
+
+int dump_task_extended_fpu(struct task_struct *tsk, struct user_fxsr_struct *fpu)
+{
+       int fpvalid = tsk_used_math(tsk) && cpu_has_fxsr;
+
+       if (fpvalid) {
+               if (tsk == current)
+                      unlazy_fpu(tsk);
+               memcpy(fpu, &tsk->thread.i387.fxsave, sizeof(*fpu));
+       }
+       return fpvalid;
+}
diff --git a/arch/x86/kernel/i8237.c b/arch/x86/kernel/i8237.c
new file mode 100644 (file)
index 0000000..6f508e8
--- /dev/null
@@ -0,0 +1,72 @@
+/*
+ * i8237.c: 8237A DMA controller suspend functions.
+ *
+ * Written by Pierre Ossman, 2005.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or (at
+ * your option) any later version.
+ */
+
+#include <linux/init.h>
+#include <linux/sysdev.h>
+
+#include <asm/dma.h>
+
+/*
+ * This module just handles suspend/resume issues with the
+ * 8237A DMA controller (used for ISA and LPC).
+ * Allocation is handled in kernel/dma.c and normal usage is
+ * in asm/dma.h.
+ */
+
+static int i8237A_resume(struct sys_device *dev)
+{
+       unsigned long flags;
+       int i;
+
+       flags = claim_dma_lock();
+
+       dma_outb(DMA1_RESET_REG, 0);
+       dma_outb(DMA2_RESET_REG, 0);
+
+       for (i = 0;i < 8;i++) {
+               set_dma_addr(i, 0x000000);
+               /* DMA count is a bit weird so this is not 0 */
+               set_dma_count(i, 1);
+       }
+
+       /* Enable cascade DMA or channel 0-3 won't work */
+       enable_dma(4);
+
+       release_dma_lock(flags);
+
+       return 0;
+}
+
+static int i8237A_suspend(struct sys_device *dev, pm_message_t state)
+{
+       return 0;
+}
+
+static struct sysdev_class i8237_sysdev_class = {
+       set_kset_name("i8237"),
+       .suspend = i8237A_suspend,
+       .resume = i8237A_resume,
+};
+
+static struct sys_device device_i8237A = {
+       .id     = 0,
+       .cls    = &i8237_sysdev_class,
+};
+
+static int __init i8237A_init_sysfs(void)
+{
+       int error = sysdev_class_register(&i8237_sysdev_class);
+       if (!error)
+               error = sysdev_register(&device_i8237A);
+       return error;
+}
+
+device_initcall(i8237A_init_sysfs);
diff --git a/arch/x86/kernel/i8253_32.c b/arch/x86/kernel/i8253_32.c
new file mode 100644 (file)
index 0000000..6d839f2
--- /dev/null
@@ -0,0 +1,206 @@
+/*
+ * i8253.c  8253/PIT functions
+ *
+ */
+#include <linux/clockchips.h>
+#include <linux/init.h>
+#include <linux/interrupt.h>
+#include <linux/jiffies.h>
+#include <linux/module.h>
+#include <linux/spinlock.h>
+
+#include <asm/smp.h>
+#include <asm/delay.h>
+#include <asm/i8253.h>
+#include <asm/io.h>
+#include <asm/timer.h>
+
+DEFINE_SPINLOCK(i8253_lock);
+EXPORT_SYMBOL(i8253_lock);
+
+/*
+ * HPET replaces the PIT, when enabled. So we need to know, which of
+ * the two timers is used
+ */
+struct clock_event_device *global_clock_event;
+
+/*
+ * Initialize the PIT timer.
+ *
+ * This is also called after resume to bring the PIT into operation again.
+ */
+static void init_pit_timer(enum clock_event_mode mode,
+                          struct clock_event_device *evt)
+{
+       unsigned long flags;
+
+       spin_lock_irqsave(&i8253_lock, flags);
+
+       switch(mode) {
+       case CLOCK_EVT_MODE_PERIODIC:
+               /* binary, mode 2, LSB/MSB, ch 0 */
+               outb_p(0x34, PIT_MODE);
+               outb_p(LATCH & 0xff , PIT_CH0); /* LSB */
+               outb(LATCH >> 8 , PIT_CH0);     /* MSB */
+               break;
+
+       case CLOCK_EVT_MODE_SHUTDOWN:
+       case CLOCK_EVT_MODE_UNUSED:
+               if (evt->mode == CLOCK_EVT_MODE_PERIODIC ||
+                   evt->mode == CLOCK_EVT_MODE_ONESHOT) {
+                       outb_p(0x30, PIT_MODE);
+                       outb_p(0, PIT_CH0);
+                       outb_p(0, PIT_CH0);
+               }
+               break;
+
+       case CLOCK_EVT_MODE_ONESHOT:
+               /* One shot setup */
+               outb_p(0x38, PIT_MODE);
+               break;
+
+       case CLOCK_EVT_MODE_RESUME:
+               /* Nothing to do here */
+               break;
+       }
+       spin_unlock_irqrestore(&i8253_lock, flags);
+}
+
+/*
+ * Program the next event in oneshot mode
+ *
+ * Delta is given in PIT ticks
+ */
+static int pit_next_event(unsigned long delta, struct clock_event_device *evt)
+{
+       unsigned long flags;
+
+       spin_lock_irqsave(&i8253_lock, flags);
+       outb_p(delta & 0xff , PIT_CH0); /* LSB */
+       outb(delta >> 8 , PIT_CH0);     /* MSB */
+       spin_unlock_irqrestore(&i8253_lock, flags);
+
+       return 0;
+}
+
+/*
+ * On UP the PIT can serve all of the possible timer functions. On SMP systems
+ * it can be solely used for the global tick.
+ *
+ * The profiling and update capabilites are switched off once the local apic is
+ * registered. This mechanism replaces the previous #ifdef LOCAL_APIC -
+ * !using_apic_timer decisions in do_timer_interrupt_hook()
+ */
+struct clock_event_device pit_clockevent = {
+       .name           = "pit",
+       .features       = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
+       .set_mode       = init_pit_timer,
+       .set_next_event = pit_next_event,
+       .shift          = 32,
+       .irq            = 0,
+};
+
+/*
+ * Initialize the conversion factor and the min/max deltas of the clock event
+ * structure and register the clock event source with the framework.
+ */
+void __init setup_pit_timer(void)
+{
+       /*
+        * Start pit with the boot cpu mask and make it global after the
+        * IO_APIC has been initialized.
+        */
+       pit_clockevent.cpumask = cpumask_of_cpu(smp_processor_id());
+       pit_clockevent.mult = div_sc(CLOCK_TICK_RATE, NSEC_PER_SEC, 32);
+       pit_clockevent.max_delta_ns =
+               clockevent_delta2ns(0x7FFF, &pit_clockevent);
+       pit_clockevent.min_delta_ns =
+               clockevent_delta2ns(0xF, &pit_clockevent);
+       clockevents_register_device(&pit_clockevent);
+       global_clock_event = &pit_clockevent;
+}
+
+/*
+ * Since the PIT overflows every tick, its not very useful
+ * to just read by itself. So use jiffies to emulate a free
+ * running counter:
+ */
+static cycle_t pit_read(void)
+{
+       unsigned long flags;
+       int count;
+       u32 jifs;
+       static int old_count;
+       static u32 old_jifs;
+
+       spin_lock_irqsave(&i8253_lock, flags);
+       /*
+        * Although our caller may have the read side of xtime_lock,
+        * this is now a seqlock, and we are cheating in this routine
+        * by having side effects on state that we cannot undo if
+        * there is a collision on the seqlock and our caller has to
+        * retry.  (Namely, old_jifs and old_count.)  So we must treat
+        * jiffies as volatile despite the lock.  We read jiffies
+        * before latching the timer count to guarantee that although
+        * the jiffies value might be older than the count (that is,
+        * the counter may underflow between the last point where
+        * jiffies was incremented and the point where we latch the
+        * count), it cannot be newer.
+        */
+       jifs = jiffies;
+       outb_p(0x00, PIT_MODE); /* latch the count ASAP */
+       count = inb_p(PIT_CH0); /* read the latched count */
+       count |= inb_p(PIT_CH0) << 8;
+
+       /* VIA686a test code... reset the latch if count > max + 1 */
+       if (count > LATCH) {
+               outb_p(0x34, PIT_MODE);
+               outb_p(LATCH & 0xff, PIT_CH0);
+               outb(LATCH >> 8, PIT_CH0);
+               count = LATCH - 1;
+       }
+
+       /*
+        * It's possible for count to appear to go the wrong way for a
+        * couple of reasons:
+        *
+        *  1. The timer counter underflows, but we haven't handled the
+        *     resulting interrupt and incremented jiffies yet.
+        *  2. Hardware problem with the timer, not giving us continuous time,
+        *     the counter does small "jumps" upwards on some Pentium systems,
+        *     (see c't 95/10 page 335 for Neptun bug.)
+        *
+        * Previous attempts to handle these cases intelligently were
+        * buggy, so we just do the simple thing now.
+        */
+       if (count > old_count && jifs == old_jifs) {
+               count = old_count;
+       }
+       old_count = count;
+       old_jifs = jifs;
+
+       spin_unlock_irqrestore(&i8253_lock, flags);
+
+       count = (LATCH - 1) - count;
+
+       return (cycle_t)(jifs * LATCH) + count;
+}
+
+static struct clocksource clocksource_pit = {
+       .name   = "pit",
+       .rating = 110,
+       .read   = pit_read,
+       .mask   = CLOCKSOURCE_MASK(32),
+       .mult   = 0,
+       .shift  = 20,
+};
+
+static int __init init_pit_clocksource(void)
+{
+       if (num_possible_cpus() > 1) /* PIT does not scale! */
+               return 0;
+
+       clocksource_pit.mult = clocksource_hz2mult(CLOCK_TICK_RATE, 20);
+       return clocksource_register(&clocksource_pit);
+}
+arch_initcall(init_pit_clocksource);
diff --git a/arch/x86/kernel/i8259_32.c b/arch/x86/kernel/i8259_32.c
new file mode 100644 (file)
index 0000000..0499cbe
--- /dev/null
@@ -0,0 +1,420 @@
+#include <linux/errno.h>
+#include <linux/signal.h>
+#include <linux/sched.h>
+#include <linux/ioport.h>
+#include <linux/interrupt.h>
+#include <linux/slab.h>
+#include <linux/random.h>
+#include <linux/init.h>
+#include <linux/kernel_stat.h>
+#include <linux/sysdev.h>
+#include <linux/bitops.h>
+
+#include <asm/8253pit.h>
+#include <asm/atomic.h>
+#include <asm/system.h>
+#include <asm/io.h>
+#include <asm/timer.h>
+#include <asm/pgtable.h>
+#include <asm/delay.h>
+#include <asm/desc.h>
+#include <asm/apic.h>
+#include <asm/arch_hooks.h>
+#include <asm/i8259.h>
+
+#include <io_ports.h>
+
+/*
+ * This is the 'legacy' 8259A Programmable Interrupt Controller,
+ * present in the majority of PC/AT boxes.
+ * plus some generic x86 specific things if generic specifics makes
+ * any sense at all.
+ * this file should become arch/i386/kernel/irq.c when the old irq.c
+ * moves to arch independent land
+ */
+
+static int i8259A_auto_eoi;
+DEFINE_SPINLOCK(i8259A_lock);
+static void mask_and_ack_8259A(unsigned int);
+
+static struct irq_chip i8259A_chip = {
+       .name           = "XT-PIC",
+       .mask           = disable_8259A_irq,
+       .disable        = disable_8259A_irq,
+       .unmask         = enable_8259A_irq,
+       .mask_ack       = mask_and_ack_8259A,
+};
+
+/*
+ * 8259A PIC functions to handle ISA devices:
+ */
+
+/*
+ * This contains the irq mask for both 8259A irq controllers,
+ */
+unsigned int cached_irq_mask = 0xffff;
+
+/*
+ * Not all IRQs can be routed through the IO-APIC, eg. on certain (older)
+ * boards the timer interrupt is not really connected to any IO-APIC pin,
+ * it's fed to the master 8259A's IR0 line only.
+ *
+ * Any '1' bit in this mask means the IRQ is routed through the IO-APIC.
+ * this 'mixed mode' IRQ handling costs nothing because it's only used
+ * at IRQ setup time.
+ */
+unsigned long io_apic_irqs;
+
+void disable_8259A_irq(unsigned int irq)
+{
+       unsigned int mask = 1 << irq;
+       unsigned long flags;
+
+       spin_lock_irqsave(&i8259A_lock, flags);
+       cached_irq_mask |= mask;
+       if (irq & 8)
+               outb(cached_slave_mask, PIC_SLAVE_IMR);
+       else
+               outb(cached_master_mask, PIC_MASTER_IMR);
+       spin_unlock_irqrestore(&i8259A_lock, flags);
+}
+
+void enable_8259A_irq(unsigned int irq)
+{
+       unsigned int mask = ~(1 << irq);
+       unsigned long flags;
+
+       spin_lock_irqsave(&i8259A_lock, flags);
+       cached_irq_mask &= mask;
+       if (irq & 8)
+               outb(cached_slave_mask, PIC_SLAVE_IMR);
+       else
+               outb(cached_master_mask, PIC_MASTER_IMR);
+       spin_unlock_irqrestore(&i8259A_lock, flags);
+}
+
+int i8259A_irq_pending(unsigned int irq)
+{
+       unsigned int mask = 1<<irq;
+       unsigned long flags;
+       int ret;
+
+       spin_lock_irqsave(&i8259A_lock, flags);
+       if (irq < 8)
+               ret = inb(PIC_MASTER_CMD) & mask;
+       else
+               ret = inb(PIC_SLAVE_CMD) & (mask >> 8);
+       spin_unlock_irqrestore(&i8259A_lock, flags);
+
+       return ret;
+}
+
+void make_8259A_irq(unsigned int irq)
+{
+       disable_irq_nosync(irq);
+       io_apic_irqs &= ~(1<<irq);
+       set_irq_chip_and_handler_name(irq, &i8259A_chip, handle_level_irq,
+                                     "XT");
+       enable_irq(irq);
+}
+
+/*
+ * This function assumes to be called rarely. Switching between
+ * 8259A registers is slow.
+ * This has to be protected by the irq controller spinlock
+ * before being called.
+ */
+static inline int i8259A_irq_real(unsigned int irq)
+{
+       int value;
+       int irqmask = 1<<irq;
+
+       if (irq < 8) {
+               outb(0x0B,PIC_MASTER_CMD);      /* ISR register */
+               value = inb(PIC_MASTER_CMD) & irqmask;
+               outb(0x0A,PIC_MASTER_CMD);      /* back to the IRR register */
+               return value;
+       }
+       outb(0x0B,PIC_SLAVE_CMD);       /* ISR register */
+       value = inb(PIC_SLAVE_CMD) & (irqmask >> 8);
+       outb(0x0A,PIC_SLAVE_CMD);       /* back to the IRR register */
+       return value;
+}
+
+/*
+ * Careful! The 8259A is a fragile beast, it pretty
+ * much _has_ to be done exactly like this (mask it
+ * first, _then_ send the EOI, and the order of EOI
+ * to the two 8259s is important!
+ */
+static void mask_and_ack_8259A(unsigned int irq)
+{
+       unsigned int irqmask = 1 << irq;
+       unsigned long flags;
+
+       spin_lock_irqsave(&i8259A_lock, flags);
+       /*
+        * Lightweight spurious IRQ detection. We do not want
+        * to overdo spurious IRQ handling - it's usually a sign
+        * of hardware problems, so we only do the checks we can
+        * do without slowing down good hardware unnecessarily.
+        *
+        * Note that IRQ7 and IRQ15 (the two spurious IRQs
+        * usually resulting from the 8259A-1|2 PICs) occur
+        * even if the IRQ is masked in the 8259A. Thus we
+        * can check spurious 8259A IRQs without doing the
+        * quite slow i8259A_irq_real() call for every IRQ.
+        * This does not cover 100% of spurious interrupts,
+        * but should be enough to warn the user that there
+        * is something bad going on ...
+        */
+       if (cached_irq_mask & irqmask)
+               goto spurious_8259A_irq;
+       cached_irq_mask |= irqmask;
+
+handle_real_irq:
+       if (irq & 8) {
+               inb(PIC_SLAVE_IMR);     /* DUMMY - (do we need this?) */
+               outb(cached_slave_mask, PIC_SLAVE_IMR);
+               outb(0x60+(irq&7),PIC_SLAVE_CMD);/* 'Specific EOI' to slave */
+               outb(0x60+PIC_CASCADE_IR,PIC_MASTER_CMD); /* 'Specific EOI' to master-IRQ2 */
+       } else {
+               inb(PIC_MASTER_IMR);    /* DUMMY - (do we need this?) */
+               outb(cached_master_mask, PIC_MASTER_IMR);
+               outb(0x60+irq,PIC_MASTER_CMD);  /* 'Specific EOI to master */
+       }
+       spin_unlock_irqrestore(&i8259A_lock, flags);
+       return;
+
+spurious_8259A_irq:
+       /*
+        * this is the slow path - should happen rarely.
+        */
+       if (i8259A_irq_real(irq))
+               /*
+                * oops, the IRQ _is_ in service according to the
+                * 8259A - not spurious, go handle it.
+                */
+               goto handle_real_irq;
+
+       {
+               static int spurious_irq_mask;
+               /*
+                * At this point we can be sure the IRQ is spurious,
+                * lets ACK and report it. [once per IRQ]
+                */
+               if (!(spurious_irq_mask & irqmask)) {
+                       printk(KERN_DEBUG "spurious 8259A interrupt: IRQ%d.\n", irq);
+                       spurious_irq_mask |= irqmask;
+               }
+               atomic_inc(&irq_err_count);
+               /*
+                * Theoretically we do not have to handle this IRQ,
+                * but in Linux this does not cause problems and is
+                * simpler for us.
+                */
+               goto handle_real_irq;
+       }
+}
+
+static char irq_trigger[2];
+/**
+ * ELCR registers (0x4d0, 0x4d1) control edge/level of IRQ
+ */
+static void restore_ELCR(char *trigger)
+{
+       outb(trigger[0], 0x4d0);
+       outb(trigger[1], 0x4d1);
+}
+
+static void save_ELCR(char *trigger)
+{
+       /* IRQ 0,1,2,8,13 are marked as reserved */
+       trigger[0] = inb(0x4d0) & 0xF8;
+       trigger[1] = inb(0x4d1) & 0xDE;
+}
+
+static int i8259A_resume(struct sys_device *dev)
+{
+       init_8259A(i8259A_auto_eoi);
+       restore_ELCR(irq_trigger);
+       return 0;
+}
+
+static int i8259A_suspend(struct sys_device *dev, pm_message_t state)
+{
+       save_ELCR(irq_trigger);
+       return 0;
+}
+
+static int i8259A_shutdown(struct sys_device *dev)
+{
+       /* Put the i8259A into a quiescent state that
+        * the kernel initialization code can get it
+        * out of.
+        */
+       outb(0xff, PIC_MASTER_IMR);     /* mask all of 8259A-1 */
+       outb(0xff, PIC_SLAVE_IMR);      /* mask all of 8259A-1 */
+       return 0;
+}
+
+static struct sysdev_class i8259_sysdev_class = {
+       set_kset_name("i8259"),
+       .suspend = i8259A_suspend,
+       .resume = i8259A_resume,
+       .shutdown = i8259A_shutdown,
+};
+
+static struct sys_device device_i8259A = {
+       .id     = 0,
+       .cls    = &i8259_sysdev_class,
+};
+
+static int __init i8259A_init_sysfs(void)
+{
+       int error = sysdev_class_register(&i8259_sysdev_class);
+       if (!error)
+               error = sysdev_register(&device_i8259A);
+       return error;
+}
+
+device_initcall(i8259A_init_sysfs);
+
+void init_8259A(int auto_eoi)
+{
+       unsigned long flags;
+
+       i8259A_auto_eoi = auto_eoi;
+
+       spin_lock_irqsave(&i8259A_lock, flags);
+
+       outb(0xff, PIC_MASTER_IMR);     /* mask all of 8259A-1 */
+       outb(0xff, PIC_SLAVE_IMR);      /* mask all of 8259A-2 */
+
+       /*
+        * outb_p - this has to work on a wide range of PC hardware.
+        */
+       outb_p(0x11, PIC_MASTER_CMD);   /* ICW1: select 8259A-1 init */
+       outb_p(0x20 + 0, PIC_MASTER_IMR);       /* ICW2: 8259A-1 IR0-7 mapped to 0x20-0x27 */
+       outb_p(1U << PIC_CASCADE_IR, PIC_MASTER_IMR);   /* 8259A-1 (the master) has a slave on IR2 */
+       if (auto_eoi)   /* master does Auto EOI */
+               outb_p(MASTER_ICW4_DEFAULT | PIC_ICW4_AEOI, PIC_MASTER_IMR);
+       else            /* master expects normal EOI */
+               outb_p(MASTER_ICW4_DEFAULT, PIC_MASTER_IMR);
+
+       outb_p(0x11, PIC_SLAVE_CMD);    /* ICW1: select 8259A-2 init */
+       outb_p(0x20 + 8, PIC_SLAVE_IMR);        /* ICW2: 8259A-2 IR0-7 mapped to 0x28-0x2f */
+       outb_p(PIC_CASCADE_IR, PIC_SLAVE_IMR);  /* 8259A-2 is a slave on master's IR2 */
+       outb_p(SLAVE_ICW4_DEFAULT, PIC_SLAVE_IMR); /* (slave's support for AEOI in flat mode is to be investigated) */
+       if (auto_eoi)
+               /*
+                * In AEOI mode we just have to mask the interrupt
+                * when acking.
+                */
+               i8259A_chip.mask_ack = disable_8259A_irq;
+       else
+               i8259A_chip.mask_ack = mask_and_ack_8259A;
+
+       udelay(100);            /* wait for 8259A to initialize */
+
+       outb(cached_master_mask, PIC_MASTER_IMR); /* restore master IRQ mask */
+       outb(cached_slave_mask, PIC_SLAVE_IMR);   /* restore slave IRQ mask */
+
+       spin_unlock_irqrestore(&i8259A_lock, flags);
+}
+
+/*
+ * Note that on a 486, we don't want to do a SIGFPE on an irq13
+ * as the irq is unreliable, and exception 16 works correctly
+ * (ie as explained in the intel literature). On a 386, you
+ * can't use exception 16 due to bad IBM design, so we have to
+ * rely on the less exact irq13.
+ *
+ * Careful.. Not only is IRQ13 unreliable, but it is also
+ * leads to races. IBM designers who came up with it should
+ * be shot.
+ */
+
+static irqreturn_t math_error_irq(int cpl, void *dev_id)
+{
+       extern void math_error(void __user *);
+       outb(0,0xF0);
+       if (ignore_fpu_irq || !boot_cpu_data.hard_math)
+               return IRQ_NONE;
+       math_error((void __user *)get_irq_regs()->eip);
+       return IRQ_HANDLED;
+}
+
+/*
+ * New motherboards sometimes make IRQ 13 be a PCI interrupt,
+ * so allow interrupt sharing.
+ */
+static struct irqaction fpu_irq = { math_error_irq, 0, CPU_MASK_NONE, "fpu", NULL, NULL };
+
+void __init init_ISA_irqs (void)
+{
+       int i;
+
+#ifdef CONFIG_X86_LOCAL_APIC
+       init_bsp_APIC();
+#endif
+       init_8259A(0);
+
+       for (i = 0; i < NR_IRQS; i++) {
+               irq_desc[i].status = IRQ_DISABLED;
+               irq_desc[i].action = NULL;
+               irq_desc[i].depth = 1;
+
+               if (i < 16) {
+                       /*
+                        * 16 old-style INTA-cycle interrupts:
+                        */
+                       set_irq_chip_and_handler_name(i, &i8259A_chip,
+                                                     handle_level_irq, "XT");
+               } else {
+                       /*
+                        * 'high' PCI IRQs filled in on demand
+                        */
+                       irq_desc[i].chip = &no_irq_chip;
+               }
+       }
+}
+
+/* Overridden in paravirt.c */
+void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ")));
+
+void __init native_init_IRQ(void)
+{
+       int i;
+
+       /* all the set up before the call gates are initialised */
+       pre_intr_init_hook();
+
+       /*
+        * Cover the whole vector space, no vector can escape
+        * us. (some of these will be overridden and become
+        * 'special' SMP interrupts)
+        */
+       for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) {
+               int vector = FIRST_EXTERNAL_VECTOR + i;
+               if (i >= NR_IRQS)
+                       break;
+               if (vector != SYSCALL_VECTOR) 
+                       set_intr_gate(vector, interrupt[i]);
+       }
+
+       /* setup after call gates are initialised (usually add in
+        * the architecture specific gates)
+        */
+       intr_init_hook();
+
+       /*
+        * External FPU? Set up irq13 if so, for
+        * original braindamaged IBM FERR coupling.
+        */
+       if (boot_cpu_data.hard_math && !cpu_has_fpu)
+               setup_irq(FPU_IRQ, &fpu_irq);
+
+       irq_ctx_init(smp_processor_id());
+}
diff --git a/arch/x86/kernel/init_task_32.c b/arch/x86/kernel/init_task_32.c
new file mode 100644 (file)
index 0000000..d26fc06
--- /dev/null
@@ -0,0 +1,46 @@
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/init.h>
+#include <linux/init_task.h>
+#include <linux/fs.h>
+#include <linux/mqueue.h>
+
+#include <asm/uaccess.h>
+#include <asm/pgtable.h>
+#include <asm/desc.h>
+
+static struct fs_struct init_fs = INIT_FS;
+static struct files_struct init_files = INIT_FILES;
+static struct signal_struct init_signals = INIT_SIGNALS(init_signals);
+static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
+struct mm_struct init_mm = INIT_MM(init_mm);
+
+EXPORT_SYMBOL(init_mm);
+
+/*
+ * Initial thread structure.
+ *
+ * We need to make sure that this is THREAD_SIZE aligned due to the
+ * way process stacks are handled. This is done by having a special
+ * "init_task" linker map entry..
+ */
+union thread_union init_thread_union 
+       __attribute__((__section__(".data.init_task"))) =
+               { INIT_THREAD_INFO(init_task) };
+
+/*
+ * Initial task structure.
+ *
+ * All other task structs will be allocated on slabs in fork.c
+ */
+struct task_struct init_task = INIT_TASK(init_task);
+
+EXPORT_SYMBOL(init_task);
+
+/*
+ * per-CPU TSS segments. Threads are completely 'soft' on Linux,
+ * no more per-task TSS's.
+ */ 
+DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss) = INIT_TSS;
+
diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c
new file mode 100644 (file)
index 0000000..e2f4a1c
--- /dev/null
@@ -0,0 +1,2847 @@
+/*
+ *     Intel IO-APIC support for multi-Pentium hosts.
+ *
+ *     Copyright (C) 1997, 1998, 1999, 2000 Ingo Molnar, Hajnalka Szabo
+ *
+ *     Many thanks to Stig Venaas for trying out countless experimental
+ *     patches and reporting/debugging problems patiently!
+ *
+ *     (c) 1999, Multiple IO-APIC support, developed by
+ *     Ken-ichi Yaku <yaku@css1.kbnes.nec.co.jp> and
+ *      Hidemi Kishimoto <kisimoto@css1.kbnes.nec.co.jp>,
+ *     further tested and cleaned up by Zach Brown <zab@redhat.com>
+ *     and Ingo Molnar <mingo@redhat.com>
+ *
+ *     Fixes
+ *     Maciej W. Rozycki       :       Bits for genuine 82489DX APICs;
+ *                                     thanks to Eric Gilmore
+ *                                     and Rolf G. Tews
+ *                                     for testing these extensively
+ *     Paul Diefenbaugh        :       Added full ACPI support
+ */
+
+#include <linux/mm.h>
+#include <linux/interrupt.h>
+#include <linux/init.h>
+#include <linux/delay.h>
+#include <linux/sched.h>
+#include <linux/mc146818rtc.h>
+#include <linux/compiler.h>
+#include <linux/acpi.h>
+#include <linux/module.h>
+#include <linux/sysdev.h>
+#include <linux/pci.h>
+#include <linux/msi.h>
+#include <linux/htirq.h>
+#include <linux/freezer.h>
+#include <linux/kthread.h>
+
+#include <asm/io.h>
+#include <asm/smp.h>
+#include <asm/desc.h>
+#include <asm/timer.h>
+#include <asm/i8259.h>
+#include <asm/nmi.h>
+#include <asm/msidef.h>
+#include <asm/hypertransport.h>
+
+#include <mach_apic.h>
+#include <mach_apicdef.h>
+
+#include "io_ports.h"
+
+int (*ioapic_renumber_irq)(int ioapic, int irq);
+atomic_t irq_mis_count;
+
+/* Where if anywhere is the i8259 connect in external int mode */
+static struct { int pin, apic; } ioapic_i8259 = { -1, -1 };
+
+static DEFINE_SPINLOCK(ioapic_lock);
+static DEFINE_SPINLOCK(vector_lock);
+
+int timer_over_8254 __initdata = 1;
+
+/*
+ *     Is the SiS APIC rmw bug present ?
+ *     -1 = don't know, 0 = no, 1 = yes
+ */
+int sis_apic_bug = -1;
+
+/*
+ * # of IRQ routing registers
+ */
+int nr_ioapic_registers[MAX_IO_APICS];
+
+static int disable_timer_pin_1 __initdata;
+
+/*
+ * Rough estimation of how many shared IRQs there are, can
+ * be changed anytime.
+ */
+#define MAX_PLUS_SHARED_IRQS NR_IRQS
+#define PIN_MAP_SIZE (MAX_PLUS_SHARED_IRQS + NR_IRQS)
+
+/*
+ * This is performance-critical, we want to do it O(1)
+ *
+ * the indexing order of this array favors 1:1 mappings
+ * between pins and IRQs.
+ */
+
+static struct irq_pin_list {
+       int apic, pin, next;
+} irq_2_pin[PIN_MAP_SIZE];
+
+struct io_apic {
+       unsigned int index;
+       unsigned int unused[3];
+       unsigned int data;
+};
+
+static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx)
+{
+       return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx)
+               + (mp_ioapics[idx].mpc_apicaddr & ~PAGE_MASK);
+}
+
+static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg)
+{
+       struct io_apic __iomem *io_apic = io_apic_base(apic);
+       writel(reg, &io_apic->index);
+       return readl(&io_apic->data);
+}
+
+static inline void io_apic_write(unsigned int apic, unsigned int reg, unsigned int value)
+{
+       struct io_apic __iomem *io_apic = io_apic_base(apic);
+       writel(reg, &io_apic->index);
+       writel(value, &io_apic->data);
+}
+
+/*
+ * Re-write a value: to be used for read-modify-write
+ * cycles where the read already set up the index register.
+ *
+ * Older SiS APIC requires we rewrite the index register
+ */
+static inline void io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value)
+{
+       volatile struct io_apic __iomem *io_apic = io_apic_base(apic);
+       if (sis_apic_bug)
+               writel(reg, &io_apic->index);
+       writel(value, &io_apic->data);
+}
+
+union entry_union {
+       struct { u32 w1, w2; };
+       struct IO_APIC_route_entry entry;
+};
+
+static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin)
+{
+       union entry_union eu;
+       unsigned long flags;
+       spin_lock_irqsave(&ioapic_lock, flags);
+       eu.w1 = io_apic_read(apic, 0x10 + 2 * pin);
+       eu.w2 = io_apic_read(apic, 0x11 + 2 * pin);
+       spin_unlock_irqrestore(&ioapic_lock, flags);
+       return eu.entry;
+}
+
+/*
+ * When we write a new IO APIC routing entry, we need to write the high
+ * word first! If the mask bit in the low word is clear, we will enable
+ * the interrupt, and we need to make sure the entry is fully populated
+ * before that happens.
+ */
+static void
+__ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
+{
+       union entry_union eu;
+       eu.entry = e;
+       io_apic_write(apic, 0x11 + 2*pin, eu.w2);
+       io_apic_write(apic, 0x10 + 2*pin, eu.w1);
+}
+
+static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
+{
+       unsigned long flags;
+       spin_lock_irqsave(&ioapic_lock, flags);
+       __ioapic_write_entry(apic, pin, e);
+       spin_unlock_irqrestore(&ioapic_lock, flags);
+}
+
+/*
+ * When we mask an IO APIC routing entry, we need to write the low
+ * word first, in order to set the mask bit before we change the
+ * high bits!
+ */
+static void ioapic_mask_entry(int apic, int pin)
+{
+       unsigned long flags;
+       union entry_union eu = { .entry.mask = 1 };
+
+       spin_lock_irqsave(&ioapic_lock, flags);
+       io_apic_write(apic, 0x10 + 2*pin, eu.w1);
+       io_apic_write(apic, 0x11 + 2*pin, eu.w2);
+       spin_unlock_irqrestore(&ioapic_lock, flags);
+}
+
+/*
+ * The common case is 1:1 IRQ<->pin mappings. Sometimes there are
+ * shared ISA-space IRQs, so we have to support them. We are super
+ * fast in the common case, and fast for shared ISA-space IRQs.
+ */
+static void add_pin_to_irq(unsigned int irq, int apic, int pin)
+{
+       static int first_free_entry = NR_IRQS;
+       struct irq_pin_list *entry = irq_2_pin + irq;
+
+       while (entry->next)
+               entry = irq_2_pin + entry->next;
+
+       if (entry->pin != -1) {
+               entry->next = first_free_entry;
+               entry = irq_2_pin + entry->next;
+               if (++first_free_entry >= PIN_MAP_SIZE)
+                       panic("io_apic.c: whoops");
+       }
+       entry->apic = apic;
+       entry->pin = pin;
+}
+
+/*
+ * Reroute an IRQ to a different pin.
+ */
+static void __init replace_pin_at_irq(unsigned int irq,
+                                     int oldapic, int oldpin,
+                                     int newapic, int newpin)
+{
+       struct irq_pin_list *entry = irq_2_pin + irq;
+
+       while (1) {
+               if (entry->apic == oldapic && entry->pin == oldpin) {
+                       entry->apic = newapic;
+                       entry->pin = newpin;
+               }
+               if (!entry->next)
+                       break;
+               entry = irq_2_pin + entry->next;
+       }
+}
+
+static void __modify_IO_APIC_irq (unsigned int irq, unsigned long enable, unsigned long disable)
+{
+       struct irq_pin_list *entry = irq_2_pin + irq;
+       unsigned int pin, reg;
+
+       for (;;) {
+               pin = entry->pin;
+               if (pin == -1)
+                       break;
+               reg = io_apic_read(entry->apic, 0x10 + pin*2);
+               reg &= ~disable;
+               reg |= enable;
+               io_apic_modify(entry->apic, 0x10 + pin*2, reg);
+               if (!entry->next)
+                       break;
+               entry = irq_2_pin + entry->next;
+       }
+}
+
+/* mask = 1 */
+static void __mask_IO_APIC_irq (unsigned int irq)
+{
+       __modify_IO_APIC_irq(irq, 0x00010000, 0);
+}
+
+/* mask = 0 */
+static void __unmask_IO_APIC_irq (unsigned int irq)
+{
+       __modify_IO_APIC_irq(irq, 0, 0x00010000);
+}
+
+/* mask = 1, trigger = 0 */
+static void __mask_and_edge_IO_APIC_irq (unsigned int irq)
+{
+       __modify_IO_APIC_irq(irq, 0x00010000, 0x00008000);
+}
+
+/* mask = 0, trigger = 1 */
+static void __unmask_and_level_IO_APIC_irq (unsigned int irq)
+{
+       __modify_IO_APIC_irq(irq, 0x00008000, 0x00010000);
+}
+
+static void mask_IO_APIC_irq (unsigned int irq)
+{
+       unsigned long flags;
+
+       spin_lock_irqsave(&ioapic_lock, flags);
+       __mask_IO_APIC_irq(irq);
+       spin_unlock_irqrestore(&ioapic_lock, flags);
+}
+
+static void unmask_IO_APIC_irq (unsigned int irq)
+{
+       unsigned long flags;
+
+       spin_lock_irqsave(&ioapic_lock, flags);
+       __unmask_IO_APIC_irq(irq);
+       spin_unlock_irqrestore(&ioapic_lock, flags);
+}
+
+static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
+{
+       struct IO_APIC_route_entry entry;
+       
+       /* Check delivery_mode to be sure we're not clearing an SMI pin */
+       entry = ioapic_read_entry(apic, pin);
+       if (entry.delivery_mode == dest_SMI)
+               return;
+
+       /*
+        * Disable it in the IO-APIC irq-routing table:
+        */
+       ioapic_mask_entry(apic, pin);
+}
+
+static void clear_IO_APIC (void)
+{
+       int apic, pin;
+
+       for (apic = 0; apic < nr_ioapics; apic++)
+               for (pin = 0; pin < nr_ioapic_registers[apic]; pin++)
+                       clear_IO_APIC_pin(apic, pin);
+}
+
+#ifdef CONFIG_SMP
+static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t cpumask)
+{
+       unsigned long flags;
+       int pin;
+       struct irq_pin_list *entry = irq_2_pin + irq;
+       unsigned int apicid_value;
+       cpumask_t tmp;
+       
+       cpus_and(tmp, cpumask, cpu_online_map);
+       if (cpus_empty(tmp))
+               tmp = TARGET_CPUS;
+
+       cpus_and(cpumask, tmp, CPU_MASK_ALL);
+
+       apicid_value = cpu_mask_to_apicid(cpumask);
+       /* Prepare to do the io_apic_write */
+       apicid_value = apicid_value << 24;
+       spin_lock_irqsave(&ioapic_lock, flags);
+       for (;;) {
+               pin = entry->pin;
+               if (pin == -1)
+                       break;
+               io_apic_write(entry->apic, 0x10 + 1 + pin*2, apicid_value);
+               if (!entry->next)
+                       break;
+               entry = irq_2_pin + entry->next;
+       }
+       irq_desc[irq].affinity = cpumask;
+       spin_unlock_irqrestore(&ioapic_lock, flags);
+}
+
+#if defined(CONFIG_IRQBALANCE)
+# include <asm/processor.h>    /* kernel_thread() */
+# include <linux/kernel_stat.h>        /* kstat */
+# include <linux/slab.h>               /* kmalloc() */
+# include <linux/timer.h>      /* time_after() */
+#define IRQBALANCE_CHECK_ARCH -999
+#define MAX_BALANCED_IRQ_INTERVAL      (5*HZ)
+#define MIN_BALANCED_IRQ_INTERVAL      (HZ/2)
+#define BALANCED_IRQ_MORE_DELTA                (HZ/10)
+#define BALANCED_IRQ_LESS_DELTA                (HZ)
+
+static int irqbalance_disabled __read_mostly = IRQBALANCE_CHECK_ARCH;
+static int physical_balance __read_mostly;
+static long balanced_irq_interval __read_mostly = MAX_BALANCED_IRQ_INTERVAL;
+
+static struct irq_cpu_info {
+       unsigned long * last_irq;
+       unsigned long * irq_delta;
+       unsigned long irq;
+} irq_cpu_data[NR_CPUS];
+
+#define CPU_IRQ(cpu)           (irq_cpu_data[cpu].irq)
+#define LAST_CPU_IRQ(cpu,irq)   (irq_cpu_data[cpu].last_irq[irq])
+#define IRQ_DELTA(cpu,irq)     (irq_cpu_data[cpu].irq_delta[irq])
+
+#define IDLE_ENOUGH(cpu,now) \
+       (idle_cpu(cpu) && ((now) - per_cpu(irq_stat, (cpu)).idle_timestamp > 1))
+
+#define IRQ_ALLOWED(cpu, allowed_mask) cpu_isset(cpu, allowed_mask)
+
+#define CPU_TO_PACKAGEINDEX(i) (first_cpu(cpu_sibling_map[i]))
+
+static cpumask_t balance_irq_affinity[NR_IRQS] = {
+       [0 ... NR_IRQS-1] = CPU_MASK_ALL
+};
+
+void set_balance_irq_affinity(unsigned int irq, cpumask_t mask)
+{
+       balance_irq_affinity[irq] = mask;
+}
+
+static unsigned long move(int curr_cpu, cpumask_t allowed_mask,
+                       unsigned long now, int direction)
+{
+       int search_idle = 1;
+       int cpu = curr_cpu;
+
+       goto inside;
+
+       do {
+               if (unlikely(cpu == curr_cpu))
+                       search_idle = 0;
+inside:
+               if (direction == 1) {
+                       cpu++;
+                       if (cpu >= NR_CPUS)
+                               cpu = 0;
+               } else {
+                       cpu--;
+                       if (cpu == -1)
+                               cpu = NR_CPUS-1;
+               }
+       } while (!cpu_online(cpu) || !IRQ_ALLOWED(cpu,allowed_mask) ||
+                       (search_idle && !IDLE_ENOUGH(cpu,now)));
+
+       return cpu;
+}
+
+static inline void balance_irq(int cpu, int irq)
+{
+       unsigned long now = jiffies;
+       cpumask_t allowed_mask;
+       unsigned int new_cpu;
+               
+       if (irqbalance_disabled)
+               return; 
+
+       cpus_and(allowed_mask, cpu_online_map, balance_irq_affinity[irq]);
+       new_cpu = move(cpu, allowed_mask, now, 1);
+       if (cpu != new_cpu) {
+               set_pending_irq(irq, cpumask_of_cpu(new_cpu));
+       }
+}
+
+static inline void rotate_irqs_among_cpus(unsigned long useful_load_threshold)
+{
+       int i, j;
+
+       for_each_online_cpu(i) {
+               for (j = 0; j < NR_IRQS; j++) {
+                       if (!irq_desc[j].action)
+                               continue;
+                       /* Is it a significant load ?  */
+                       if (IRQ_DELTA(CPU_TO_PACKAGEINDEX(i),j) <
+                                               useful_load_threshold)
+                               continue;
+                       balance_irq(i, j);
+               }
+       }
+       balanced_irq_interval = max((long)MIN_BALANCED_IRQ_INTERVAL,
+               balanced_irq_interval - BALANCED_IRQ_LESS_DELTA);       
+       return;
+}
+
+static void do_irq_balance(void)
+{
+       int i, j;
+       unsigned long max_cpu_irq = 0, min_cpu_irq = (~0);
+       unsigned long move_this_load = 0;
+       int max_loaded = 0, min_loaded = 0;
+       int load;
+       unsigned long useful_load_threshold = balanced_irq_interval + 10;
+       int selected_irq;
+       int tmp_loaded, first_attempt = 1;
+       unsigned long tmp_cpu_irq;
+       unsigned long imbalance = 0;
+       cpumask_t allowed_mask, target_cpu_mask, tmp;
+
+       for_each_possible_cpu(i) {
+               int package_index;
+               CPU_IRQ(i) = 0;
+               if (!cpu_online(i))
+                       continue;
+               package_index = CPU_TO_PACKAGEINDEX(i);
+               for (j = 0; j < NR_IRQS; j++) {
+                       unsigned long value_now, delta;
+                       /* Is this an active IRQ or balancing disabled ? */
+                       if (!irq_desc[j].action || irq_balancing_disabled(j))
+                               continue;
+                       if ( package_index == i )
+                               IRQ_DELTA(package_index,j) = 0;
+                       /* Determine the total count per processor per IRQ */
+                       value_now = (unsigned long) kstat_cpu(i).irqs[j];
+
+                       /* Determine the activity per processor per IRQ */
+                       delta = value_now - LAST_CPU_IRQ(i,j);
+
+                       /* Update last_cpu_irq[][] for the next time */
+                       LAST_CPU_IRQ(i,j) = value_now;
+
+                       /* Ignore IRQs whose rate is less than the clock */
+                       if (delta < useful_load_threshold)
+                               continue;
+                       /* update the load for the processor or package total */
+                       IRQ_DELTA(package_index,j) += delta;
+
+                       /* Keep track of the higher numbered sibling as well */
+                       if (i != package_index)
+                               CPU_IRQ(i) += delta;
+                       /*
+                        * We have sibling A and sibling B in the package
+                        *
+                        * cpu_irq[A] = load for cpu A + load for cpu B
+                        * cpu_irq[B] = load for cpu B
+                        */
+                       CPU_IRQ(package_index) += delta;
+               }
+       }
+       /* Find the least loaded processor package */
+       for_each_online_cpu(i) {
+               if (i != CPU_TO_PACKAGEINDEX(i))
+                       continue;
+               if (min_cpu_irq > CPU_IRQ(i)) {
+                       min_cpu_irq = CPU_IRQ(i);
+                       min_loaded = i;
+               }
+       }
+       max_cpu_irq = ULONG_MAX;
+
+tryanothercpu:
+       /* Look for heaviest loaded processor.
+        * We may come back to get the next heaviest loaded processor.
+        * Skip processors with trivial loads.
+        */
+       tmp_cpu_irq = 0;
+       tmp_loaded = -1;
+       for_each_online_cpu(i) {
+               if (i != CPU_TO_PACKAGEINDEX(i))
+                       continue;
+               if (max_cpu_irq <= CPU_IRQ(i)) 
+                       continue;
+               if (tmp_cpu_irq < CPU_IRQ(i)) {
+                       tmp_cpu_irq = CPU_IRQ(i);
+                       tmp_loaded = i;
+               }
+       }
+
+       if (tmp_loaded == -1) {
+        /* In the case of small number of heavy interrupt sources, 
+         * loading some of the cpus too much. We use Ingo's original 
+         * approach to rotate them around.
+         */
+               if (!first_attempt && imbalance >= useful_load_threshold) {
+                       rotate_irqs_among_cpus(useful_load_threshold);
+                       return;
+               }
+               goto not_worth_the_effort;
+       }
+       
+       first_attempt = 0;              /* heaviest search */
+       max_cpu_irq = tmp_cpu_irq;      /* load */
+       max_loaded = tmp_loaded;        /* processor */
+       imbalance = (max_cpu_irq - min_cpu_irq) / 2;
+       
+       /* if imbalance is less than approx 10% of max load, then
+        * observe diminishing returns action. - quit
+        */
+       if (imbalance < (max_cpu_irq >> 3))
+               goto not_worth_the_effort;
+
+tryanotherirq:
+       /* if we select an IRQ to move that can't go where we want, then
+        * see if there is another one to try.
+        */
+       move_this_load = 0;
+       selected_irq = -1;
+       for (j = 0; j < NR_IRQS; j++) {
+               /* Is this an active IRQ? */
+               if (!irq_desc[j].action)
+                       continue;
+               if (imbalance <= IRQ_DELTA(max_loaded,j))
+                       continue;
+               /* Try to find the IRQ that is closest to the imbalance
+                * without going over.
+                */
+               if (move_this_load < IRQ_DELTA(max_loaded,j)) {
+                       move_this_load = IRQ_DELTA(max_loaded,j);
+                       selected_irq = j;
+               }
+       }
+       if (selected_irq == -1) {
+               goto tryanothercpu;
+       }
+
+       imbalance = move_this_load;
+       
+       /* For physical_balance case, we accumlated both load
+        * values in the one of the siblings cpu_irq[],
+        * to use the same code for physical and logical processors
+        * as much as possible. 
+        *
+        * NOTE: the cpu_irq[] array holds the sum of the load for
+        * sibling A and sibling B in the slot for the lowest numbered
+        * sibling (A), _AND_ the load for sibling B in the slot for
+        * the higher numbered sibling.
+        *
+        * We seek the least loaded sibling by making the comparison
+        * (A+B)/2 vs B
+        */
+       load = CPU_IRQ(min_loaded) >> 1;
+       for_each_cpu_mask(j, cpu_sibling_map[min_loaded]) {
+               if (load > CPU_IRQ(j)) {
+                       /* This won't change cpu_sibling_map[min_loaded] */
+                       load = CPU_IRQ(j);
+                       min_loaded = j;
+               }
+       }
+
+       cpus_and(allowed_mask,
+               cpu_online_map,
+               balance_irq_affinity[selected_irq]);
+       target_cpu_mask = cpumask_of_cpu(min_loaded);
+       cpus_and(tmp, target_cpu_mask, allowed_mask);
+
+       if (!cpus_empty(tmp)) {
+               /* mark for change destination */
+               set_pending_irq(selected_irq, cpumask_of_cpu(min_loaded));
+
+               /* Since we made a change, come back sooner to 
+                * check for more variation.
+                */
+               balanced_irq_interval = max((long)MIN_BALANCED_IRQ_INTERVAL,
+                       balanced_irq_interval - BALANCED_IRQ_LESS_DELTA);       
+               return;
+       }
+       goto tryanotherirq;
+
+not_worth_the_effort:
+       /*
+        * if we did not find an IRQ to move, then adjust the time interval
+        * upward
+        */
+       balanced_irq_interval = min((long)MAX_BALANCED_IRQ_INTERVAL,
+               balanced_irq_interval + BALANCED_IRQ_MORE_DELTA);       
+       return;
+}
+
+static int balanced_irq(void *unused)
+{
+       int i;
+       unsigned long prev_balance_time = jiffies;
+       long time_remaining = balanced_irq_interval;
+
+       /* push everything to CPU 0 to give us a starting point.  */
+       for (i = 0 ; i < NR_IRQS ; i++) {
+               irq_desc[i].pending_mask = cpumask_of_cpu(0);
+               set_pending_irq(i, cpumask_of_cpu(0));
+       }
+
+       set_freezable();
+       for ( ; ; ) {
+               time_remaining = schedule_timeout_interruptible(time_remaining);
+               try_to_freeze();
+               if (time_after(jiffies,
+                               prev_balance_time+balanced_irq_interval)) {
+                       preempt_disable();
+                       do_irq_balance();
+                       prev_balance_time = jiffies;
+                       time_remaining = balanced_irq_interval;
+                       preempt_enable();
+               }
+       }
+       return 0;
+}
+
+static int __init balanced_irq_init(void)
+{
+       int i;
+       struct cpuinfo_x86 *c;
+       cpumask_t tmp;
+
+       cpus_shift_right(tmp, cpu_online_map, 2);
+        c = &boot_cpu_data;
+       /* When not overwritten by the command line ask subarchitecture. */
+       if (irqbalance_disabled == IRQBALANCE_CHECK_ARCH)
+               irqbalance_disabled = NO_BALANCE_IRQ;
+       if (irqbalance_disabled)
+               return 0;
+       
+        /* disable irqbalance completely if there is only one processor online */
+       if (num_online_cpus() < 2) {
+               irqbalance_disabled = 1;
+               return 0;
+       }
+       /*
+        * Enable physical balance only if more than 1 physical processor
+        * is present
+        */
+       if (smp_num_siblings > 1 && !cpus_empty(tmp))
+               physical_balance = 1;
+
+       for_each_online_cpu(i) {
+               irq_cpu_data[i].irq_delta = kmalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL);
+               irq_cpu_data[i].last_irq = kmalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL);
+               if (irq_cpu_data[i].irq_delta == NULL || irq_cpu_data[i].last_irq == NULL) {
+                       printk(KERN_ERR "balanced_irq_init: out of memory");
+                       goto failed;
+               }
+               memset(irq_cpu_data[i].irq_delta,0,sizeof(unsigned long) * NR_IRQS);
+               memset(irq_cpu_data[i].last_irq,0,sizeof(unsigned long) * NR_IRQS);
+       }
+       
+       printk(KERN_INFO "Starting balanced_irq\n");
+       if (!IS_ERR(kthread_run(balanced_irq, NULL, "kirqd")))
+               return 0;
+       printk(KERN_ERR "balanced_irq_init: failed to spawn balanced_irq");
+failed:
+       for_each_possible_cpu(i) {
+               kfree(irq_cpu_data[i].irq_delta);
+               irq_cpu_data[i].irq_delta = NULL;
+               kfree(irq_cpu_data[i].last_irq);
+               irq_cpu_data[i].last_irq = NULL;
+       }
+       return 0;
+}
+
+int __devinit irqbalance_disable(char *str)
+{
+       irqbalance_disabled = 1;
+       return 1;
+}
+
+__setup("noirqbalance", irqbalance_disable);
+
+late_initcall(balanced_irq_init);
+#endif /* CONFIG_IRQBALANCE */
+#endif /* CONFIG_SMP */
+
+#ifndef CONFIG_SMP
+void fastcall send_IPI_self(int vector)
+{
+       unsigned int cfg;
+
+       /*
+        * Wait for idle.
+        */
+       apic_wait_icr_idle();
+       cfg = APIC_DM_FIXED | APIC_DEST_SELF | vector | APIC_DEST_LOGICAL;
+       /*
+        * Send the IPI. The write to APIC_ICR fires this off.
+        */
+       apic_write_around(APIC_ICR, cfg);
+}
+#endif /* !CONFIG_SMP */
+
+
+/*
+ * support for broken MP BIOSs, enables hand-redirection of PIRQ0-7 to
+ * specific CPU-side IRQs.
+ */
+
+#define MAX_PIRQS 8
+static int pirq_entries [MAX_PIRQS];
+static int pirqs_enabled;
+int skip_ioapic_setup;
+
+static int __init ioapic_pirq_setup(char *str)
+{
+       int i, max;
+       int ints[MAX_PIRQS+1];
+
+       get_options(str, ARRAY_SIZE(ints), ints);
+
+       for (i = 0; i < MAX_PIRQS; i++)
+               pirq_entries[i] = -1;
+
+       pirqs_enabled = 1;
+       apic_printk(APIC_VERBOSE, KERN_INFO
+                       "PIRQ redirection, working around broken MP-BIOS.\n");
+       max = MAX_PIRQS;
+       if (ints[0] < MAX_PIRQS)
+               max = ints[0];
+
+       for (i = 0; i < max; i++) {
+               apic_printk(APIC_VERBOSE, KERN_DEBUG
+                               "... PIRQ%d -> IRQ %d\n", i, ints[i+1]);
+               /*
+                * PIRQs are mapped upside down, usually.
+                */
+               pirq_entries[MAX_PIRQS-i-1] = ints[i+1];
+       }
+       return 1;
+}
+
+__setup("pirq=", ioapic_pirq_setup);
+
+/*
+ * Find the IRQ entry number of a certain pin.
+ */
+static int find_irq_entry(int apic, int pin, int type)
+{
+       int i;
+
+       for (i = 0; i < mp_irq_entries; i++)
+               if (mp_irqs[i].mpc_irqtype == type &&
+                   (mp_irqs[i].mpc_dstapic == mp_ioapics[apic].mpc_apicid ||
+                    mp_irqs[i].mpc_dstapic == MP_APIC_ALL) &&
+                   mp_irqs[i].mpc_dstirq == pin)
+                       return i;
+
+       return -1;
+}
+
+/*
+ * Find the pin to which IRQ[irq] (ISA) is connected
+ */
+static int __init find_isa_irq_pin(int irq, int type)
+{
+       int i;
+
+       for (i = 0; i < mp_irq_entries; i++) {
+               int lbus = mp_irqs[i].mpc_srcbus;
+
+               if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA ||
+                    mp_bus_id_to_type[lbus] == MP_BUS_EISA ||
+                    mp_bus_id_to_type[lbus] == MP_BUS_MCA
+                   ) &&
+                   (mp_irqs[i].mpc_irqtype == type) &&
+                   (mp_irqs[i].mpc_srcbusirq == irq))
+
+                       return mp_irqs[i].mpc_dstirq;
+       }
+       return -1;
+}
+
+static int __init find_isa_irq_apic(int irq, int type)
+{
+       int i;
+
+       for (i = 0; i < mp_irq_entries; i++) {
+               int lbus = mp_irqs[i].mpc_srcbus;
+
+               if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA ||
+                    mp_bus_id_to_type[lbus] == MP_BUS_EISA ||
+                    mp_bus_id_to_type[lbus] == MP_BUS_MCA
+                   ) &&
+                   (mp_irqs[i].mpc_irqtype == type) &&
+                   (mp_irqs[i].mpc_srcbusirq == irq))
+                       break;
+       }
+       if (i < mp_irq_entries) {
+               int apic;
+               for(apic = 0; apic < nr_ioapics; apic++) {
+                       if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic)
+                               return apic;
+               }
+       }
+
+       return -1;
+}
+
+/*
+ * Find a specific PCI IRQ entry.
+ * Not an __init, possibly needed by modules
+ */
+static int pin_2_irq(int idx, int apic, int pin);
+
+int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin)
+{
+       int apic, i, best_guess = -1;
+
+       apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, "
+               "slot:%d, pin:%d.\n", bus, slot, pin);
+       if (mp_bus_id_to_pci_bus[bus] == -1) {
+               printk(KERN_WARNING "PCI BIOS passed nonexistent PCI bus %d!\n", bus);
+               return -1;
+       }
+       for (i = 0; i < mp_irq_entries; i++) {
+               int lbus = mp_irqs[i].mpc_srcbus;
+
+               for (apic = 0; apic < nr_ioapics; apic++)
+                       if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic ||
+                           mp_irqs[i].mpc_dstapic == MP_APIC_ALL)
+                               break;
+
+               if ((mp_bus_id_to_type[lbus] == MP_BUS_PCI) &&
+                   !mp_irqs[i].mpc_irqtype &&
+                   (bus == lbus) &&
+                   (slot == ((mp_irqs[i].mpc_srcbusirq >> 2) & 0x1f))) {
+                       int irq = pin_2_irq(i,apic,mp_irqs[i].mpc_dstirq);
+
+                       if (!(apic || IO_APIC_IRQ(irq)))
+                               continue;
+
+                       if (pin == (mp_irqs[i].mpc_srcbusirq & 3))
+                               return irq;
+                       /*
+                        * Use the first all-but-pin matching entry as a
+                        * best-guess fuzzy result for broken mptables.
+                        */
+                       if (best_guess < 0)
+                               best_guess = irq;
+               }
+       }
+       return best_guess;
+}
+EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector);
+
+/*
+ * This function currently is only a helper for the i386 smp boot process where 
+ * we need to reprogram the ioredtbls to cater for the cpus which have come online
+ * so mask in all cases should simply be TARGET_CPUS
+ */
+#ifdef CONFIG_SMP
+void __init setup_ioapic_dest(void)
+{
+       int pin, ioapic, irq, irq_entry;
+
+       if (skip_ioapic_setup == 1)
+               return;
+
+       for (ioapic = 0; ioapic < nr_ioapics; ioapic++) {
+               for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) {
+                       irq_entry = find_irq_entry(ioapic, pin, mp_INT);
+                       if (irq_entry == -1)
+                               continue;
+                       irq = pin_2_irq(irq_entry, ioapic, pin);
+                       set_ioapic_affinity_irq(irq, TARGET_CPUS);
+               }
+
+       }
+}
+#endif
+
+/*
+ * EISA Edge/Level control register, ELCR
+ */
+static int EISA_ELCR(unsigned int irq)
+{
+       if (irq < 16) {
+               unsigned int port = 0x4d0 + (irq >> 3);
+               return (inb(port) >> (irq & 7)) & 1;
+       }
+       apic_printk(APIC_VERBOSE, KERN_INFO
+                       "Broken MPtable reports ISA irq %d\n", irq);
+       return 0;
+}
+
+/* EISA interrupts are always polarity zero and can be edge or level
+ * trigger depending on the ELCR value.  If an interrupt is listed as
+ * EISA conforming in the MP table, that means its trigger type must
+ * be read in from the ELCR */
+
+#define default_EISA_trigger(idx)      (EISA_ELCR(mp_irqs[idx].mpc_srcbusirq))
+#define default_EISA_polarity(idx)     (0)
+
+/* ISA interrupts are always polarity zero edge triggered,
+ * when listed as conforming in the MP table. */
+
+#define default_ISA_trigger(idx)       (0)
+#define default_ISA_polarity(idx)      (0)
+
+/* PCI interrupts are always polarity one level triggered,
+ * when listed as conforming in the MP table. */
+
+#define default_PCI_trigger(idx)       (1)
+#define default_PCI_polarity(idx)      (1)
+
+/* MCA interrupts are always polarity zero level triggered,
+ * when listed as conforming in the MP table. */
+
+#define default_MCA_trigger(idx)       (1)
+#define default_MCA_polarity(idx)      (0)
+
+static int __init MPBIOS_polarity(int idx)
+{
+       int bus = mp_irqs[idx].mpc_srcbus;
+       int polarity;
+
+       /*
+        * Determine IRQ line polarity (high active or low active):
+        */
+       switch (mp_irqs[idx].mpc_irqflag & 3)
+       {
+               case 0: /* conforms, ie. bus-type dependent polarity */
+               {
+                       switch (mp_bus_id_to_type[bus])
+                       {
+                               case MP_BUS_ISA: /* ISA pin */
+                               {
+                                       polarity = default_ISA_polarity(idx);
+                                       break;
+                               }
+                               case MP_BUS_EISA: /* EISA pin */
+                               {
+                                       polarity = default_EISA_polarity(idx);
+                                       break;
+                               }
+                               case MP_BUS_PCI: /* PCI pin */
+                               {
+                                       polarity = default_PCI_polarity(idx);
+                                       break;
+                               }
+                               case MP_BUS_MCA: /* MCA pin */
+                               {
+                                       polarity = default_MCA_polarity(idx);
+                                       break;
+                               }
+                               default:
+                               {
+                                       printk(KERN_WARNING "broken BIOS!!\n");
+                                       polarity = 1;
+                                       break;
+                               }
+                       }
+                       break;
+               }
+               case 1: /* high active */
+               {
+                       polarity = 0;
+                       break;
+               }
+               case 2: /* reserved */
+               {
+                       printk(KERN_WARNING "broken BIOS!!\n");
+                       polarity = 1;
+                       break;
+               }
+               case 3: /* low active */
+               {
+                       polarity = 1;
+                       break;
+               }
+               default: /* invalid */
+               {
+                       printk(KERN_WARNING "broken BIOS!!\n");
+                       polarity = 1;
+                       break;
+               }
+       }
+       return polarity;
+}
+
+static int MPBIOS_trigger(int idx)
+{
+       int bus = mp_irqs[idx].mpc_srcbus;
+       int trigger;
+
+       /*
+        * Determine IRQ trigger mode (edge or level sensitive):
+        */
+       switch ((mp_irqs[idx].mpc_irqflag>>2) & 3)
+       {
+               case 0: /* conforms, ie. bus-type dependent */
+               {
+                       switch (mp_bus_id_to_type[bus])
+                       {
+                               case MP_BUS_ISA: /* ISA pin */
+                               {
+                                       trigger = default_ISA_trigger(idx);
+                                       break;
+                               }
+                               case MP_BUS_EISA: /* EISA pin */
+                               {
+                                       trigger = default_EISA_trigger(idx);
+                                       break;
+                               }
+                               case MP_BUS_PCI: /* PCI pin */
+                               {
+                                       trigger = default_PCI_trigger(idx);
+                                       break;
+                               }
+                               case MP_BUS_MCA: /* MCA pin */
+                               {
+                                       trigger = default_MCA_trigger(idx);
+                                       break;
+                               }
+                               default:
+                               {
+                                       printk(KERN_WARNING "broken BIOS!!\n");
+                                       trigger = 1;
+                                       break;
+                               }
+                       }
+                       break;
+               }
+               case 1: /* edge */
+               {
+                       trigger = 0;
+                       break;
+               }
+               case 2: /* reserved */
+               {
+                       printk(KERN_WARNING "broken BIOS!!\n");
+                       trigger = 1;
+                       break;
+               }
+               case 3: /* level */
+               {
+                       trigger = 1;
+                       break;
+               }
+               default: /* invalid */
+               {
+                       printk(KERN_WARNING "broken BIOS!!\n");
+                       trigger = 0;
+                       break;
+               }
+       }
+       return trigger;
+}
+
+static inline int irq_polarity(int idx)
+{
+       return MPBIOS_polarity(idx);
+}
+
+static inline int irq_trigger(int idx)
+{
+       return MPBIOS_trigger(idx);
+}
+
+static int pin_2_irq(int idx, int apic, int pin)
+{
+       int irq, i;
+       int bus = mp_irqs[idx].mpc_srcbus;
+
+       /*
+        * Debugging check, we are in big trouble if this message pops up!
+        */
+       if (mp_irqs[idx].mpc_dstirq != pin)
+               printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n");
+
+       switch (mp_bus_id_to_type[bus])
+       {
+               case MP_BUS_ISA: /* ISA pin */
+               case MP_BUS_EISA:
+               case MP_BUS_MCA:
+               {
+                       irq = mp_irqs[idx].mpc_srcbusirq;
+                       break;
+               }
+               case MP_BUS_PCI: /* PCI pin */
+               {
+                       /*
+                        * PCI IRQs are mapped in order
+                        */
+                       i = irq = 0;
+                       while (i < apic)
+                               irq += nr_ioapic_registers[i++];
+                       irq += pin;
+
+                       /*
+                        * For MPS mode, so far only needed by ES7000 platform
+                        */
+                       if (ioapic_renumber_irq)
+                               irq = ioapic_renumber_irq(apic, irq);
+
+                       break;
+               }
+               default:
+               {
+                       printk(KERN_ERR "unknown bus type %d.\n",bus); 
+                       irq = 0;
+                       break;
+               }
+       }
+
+       /*
+        * PCI IRQ command line redirection. Yes, limits are hardcoded.
+        */
+       if ((pin >= 16) && (pin <= 23)) {
+               if (pirq_entries[pin-16] != -1) {
+                       if (!pirq_entries[pin-16]) {
+                               apic_printk(APIC_VERBOSE, KERN_DEBUG
+                                               "disabling PIRQ%d\n", pin-16);
+                       } else {
+                               irq = pirq_entries[pin-16];
+                               apic_printk(APIC_VERBOSE, KERN_DEBUG
+                                               "using PIRQ%d -> IRQ %d\n",
+                                               pin-16, irq);
+                       }
+               }
+       }
+       return irq;
+}
+
+static inline int IO_APIC_irq_trigger(int irq)
+{
+       int apic, idx, pin;
+
+       for (apic = 0; apic < nr_ioapics; apic++) {
+               for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
+                       idx = find_irq_entry(apic,pin,mp_INT);
+                       if ((idx != -1) && (irq == pin_2_irq(idx,apic,pin)))
+                               return irq_trigger(idx);
+               }
+       }
+       /*
+        * nonexistent IRQs are edge default
+        */
+       return 0;
+}
+
+/* irq_vectors is indexed by the sum of all RTEs in all I/O APICs. */
+static u8 irq_vector[NR_IRQ_VECTORS] __read_mostly = { FIRST_DEVICE_VECTOR , 0 };
+
+static int __assign_irq_vector(int irq)
+{
+       static int current_vector = FIRST_DEVICE_VECTOR, current_offset = 0;
+       int vector, offset, i;
+
+       BUG_ON((unsigned)irq >= NR_IRQ_VECTORS);
+
+       if (irq_vector[irq] > 0)
+               return irq_vector[irq];
+
+       vector = current_vector;
+       offset = current_offset;
+next:
+       vector += 8;
+       if (vector >= FIRST_SYSTEM_VECTOR) {
+               offset = (offset + 1) % 8;
+               vector = FIRST_DEVICE_VECTOR + offset;
+       }
+       if (vector == current_vector)
+               return -ENOSPC;
+       if (vector == SYSCALL_VECTOR)
+               goto next;
+       for (i = 0; i < NR_IRQ_VECTORS; i++)
+               if (irq_vector[i] == vector)
+                       goto next;
+
+       current_vector = vector;
+       current_offset = offset;
+       irq_vector[irq] = vector;
+
+       return vector;
+}
+
+static int assign_irq_vector(int irq)
+{
+       unsigned long flags;
+       int vector;
+
+       spin_lock_irqsave(&vector_lock, flags);
+       vector = __assign_irq_vector(irq);
+       spin_unlock_irqrestore(&vector_lock, flags);
+
+       return vector;
+}
+static struct irq_chip ioapic_chip;
+
+#define IOAPIC_AUTO    -1
+#define IOAPIC_EDGE    0
+#define IOAPIC_LEVEL   1
+
+static void ioapic_register_intr(int irq, int vector, unsigned long trigger)
+{
+       if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
+           trigger == IOAPIC_LEVEL) {
+               irq_desc[irq].status |= IRQ_LEVEL;
+               set_irq_chip_and_handler_name(irq, &ioapic_chip,
+                                        handle_fasteoi_irq, "fasteoi");
+       } else {
+               irq_desc[irq].status &= ~IRQ_LEVEL;
+               set_irq_chip_and_handler_name(irq, &ioapic_chip,
+                                        handle_edge_irq, "edge");
+       }
+       set_intr_gate(vector, interrupt[irq]);
+}
+
+static void __init setup_IO_APIC_irqs(void)
+{
+       struct IO_APIC_route_entry entry;
+       int apic, pin, idx, irq, first_notcon = 1, vector;
+       unsigned long flags;
+
+       apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
+
+       for (apic = 0; apic < nr_ioapics; apic++) {
+       for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
+
+               /*
+                * add it to the IO-APIC irq-routing table:
+                */
+               memset(&entry,0,sizeof(entry));
+
+               entry.delivery_mode = INT_DELIVERY_MODE;
+               entry.dest_mode = INT_DEST_MODE;
+               entry.mask = 0;                         /* enable IRQ */
+               entry.dest.logical.logical_dest = 
+                                       cpu_mask_to_apicid(TARGET_CPUS);
+
+               idx = find_irq_entry(apic,pin,mp_INT);
+               if (idx == -1) {
+                       if (first_notcon) {
+                               apic_printk(APIC_VERBOSE, KERN_DEBUG
+                                               " IO-APIC (apicid-pin) %d-%d",
+                                               mp_ioapics[apic].mpc_apicid,
+                                               pin);
+                               first_notcon = 0;
+                       } else
+                               apic_printk(APIC_VERBOSE, ", %d-%d",
+                                       mp_ioapics[apic].mpc_apicid, pin);
+                       continue;
+               }
+
+               entry.trigger = irq_trigger(idx);
+               entry.polarity = irq_polarity(idx);
+
+               if (irq_trigger(idx)) {
+                       entry.trigger = 1;
+                       entry.mask = 1;
+               }
+
+               irq = pin_2_irq(idx, apic, pin);
+               /*
+                * skip adding the timer int on secondary nodes, which causes
+                * a small but painful rift in the time-space continuum
+                */
+               if (multi_timer_check(apic, irq))
+                       continue;
+               else
+                       add_pin_to_irq(irq, apic, pin);
+
+               if (!apic && !IO_APIC_IRQ(irq))
+                       continue;
+
+               if (IO_APIC_IRQ(irq)) {
+                       vector = assign_irq_vector(irq);
+                       entry.vector = vector;
+                       ioapic_register_intr(irq, vector, IOAPIC_AUTO);
+               
+                       if (!apic && (irq < 16))
+                               disable_8259A_irq(irq);
+               }
+               spin_lock_irqsave(&ioapic_lock, flags);
+               __ioapic_write_entry(apic, pin, entry);
+               spin_unlock_irqrestore(&ioapic_lock, flags);
+       }
+       }
+
+       if (!first_notcon)
+               apic_printk(APIC_VERBOSE, " not connected.\n");
+}
+
+/*
+ * Set up the 8259A-master output pin:
+ */
+static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, int vector)
+{
+       struct IO_APIC_route_entry entry;
+
+       memset(&entry,0,sizeof(entry));
+
+       disable_8259A_irq(0);
+
+       /* mask LVT0 */
+       apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
+
+       /*
+        * We use logical delivery to get the timer IRQ
+        * to the first CPU.
+        */
+       entry.dest_mode = INT_DEST_MODE;
+       entry.mask = 0;                                 /* unmask IRQ now */
+       entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
+       entry.delivery_mode = INT_DELIVERY_MODE;
+       entry.polarity = 0;
+       entry.trigger = 0;
+       entry.vector = vector;
+
+       /*
+        * The timer IRQ doesn't have to know that behind the
+        * scene we have a 8259A-master in AEOI mode ...
+        */
+       irq_desc[0].chip = &ioapic_chip;
+       set_irq_handler(0, handle_edge_irq);
+
+       /*
+        * Add it to the IO-APIC irq-routing table:
+        */
+       ioapic_write_entry(apic, pin, entry);
+
+       enable_8259A_irq(0);
+}
+
+void __init print_IO_APIC(void)
+{
+       int apic, i;
+       union IO_APIC_reg_00 reg_00;
+       union IO_APIC_reg_01 reg_01;
+       union IO_APIC_reg_02 reg_02;
+       union IO_APIC_reg_03 reg_03;
+       unsigned long flags;
+
+       if (apic_verbosity == APIC_QUIET)
+               return;
+
+       printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
+       for (i = 0; i < nr_ioapics; i++)
+               printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n",
+                      mp_ioapics[i].mpc_apicid, nr_ioapic_registers[i]);
+
+       /*
+        * We are a bit conservative about what we expect.  We have to
+        * know about every hardware change ASAP.
+        */
+       printk(KERN_INFO "testing the IO APIC.......................\n");
+
+       for (apic = 0; apic < nr_ioapics; apic++) {
+
+       spin_lock_irqsave(&ioapic_lock, flags);
+       reg_00.raw = io_apic_read(apic, 0);
+       reg_01.raw = io_apic_read(apic, 1);
+       if (reg_01.bits.version >= 0x10)
+               reg_02.raw = io_apic_read(apic, 2);
+       if (reg_01.bits.version >= 0x20)
+               reg_03.raw = io_apic_read(apic, 3);
+       spin_unlock_irqrestore(&ioapic_lock, flags);
+
+       printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mpc_apicid);
+       printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw);
+       printk(KERN_DEBUG ".......    : physical APIC id: %02X\n", reg_00.bits.ID);
+       printk(KERN_DEBUG ".......    : Delivery Type: %X\n", reg_00.bits.delivery_type);
+       printk(KERN_DEBUG ".......    : LTS          : %X\n", reg_00.bits.LTS);
+
+       printk(KERN_DEBUG ".... register #01: %08X\n", reg_01.raw);
+       printk(KERN_DEBUG ".......     : max redirection entries: %04X\n", reg_01.bits.entries);
+
+       printk(KERN_DEBUG ".......     : PRQ implemented: %X\n", reg_01.bits.PRQ);
+       printk(KERN_DEBUG ".......     : IO APIC version: %04X\n", reg_01.bits.version);
+
+       /*
+        * Some Intel chipsets with IO APIC VERSION of 0x1? don't have reg_02,
+        * but the value of reg_02 is read as the previous read register
+        * value, so ignore it if reg_02 == reg_01.
+        */
+       if (reg_01.bits.version >= 0x10 && reg_02.raw != reg_01.raw) {
+               printk(KERN_DEBUG ".... register #02: %08X\n", reg_02.raw);
+               printk(KERN_DEBUG ".......     : arbitration: %02X\n", reg_02.bits.arbitration);
+       }
+
+       /*
+        * Some Intel chipsets with IO APIC VERSION of 0x2? don't have reg_02
+        * or reg_03, but the value of reg_0[23] is read as the previous read
+        * register value, so ignore it if reg_03 == reg_0[12].
+        */
+       if (reg_01.bits.version >= 0x20 && reg_03.raw != reg_02.raw &&
+           reg_03.raw != reg_01.raw) {
+               printk(KERN_DEBUG ".... register #03: %08X\n", reg_03.raw);
+               printk(KERN_DEBUG ".......     : Boot DT    : %X\n", reg_03.bits.boot_DT);
+       }
+
+       printk(KERN_DEBUG ".... IRQ redirection table:\n");
+
+       printk(KERN_DEBUG " NR Log Phy Mask Trig IRR Pol"
+                         " Stat Dest Deli Vect:   \n");
+
+       for (i = 0; i <= reg_01.bits.entries; i++) {
+               struct IO_APIC_route_entry entry;
+
+               entry = ioapic_read_entry(apic, i);
+
+               printk(KERN_DEBUG " %02x %03X %02X  ",
+                       i,
+                       entry.dest.logical.logical_dest,
+                       entry.dest.physical.physical_dest
+               );
+
+               printk("%1d    %1d    %1d   %1d   %1d    %1d    %1d    %02X\n",
+                       entry.mask,
+                       entry.trigger,
+                       entry.irr,
+                       entry.polarity,
+                       entry.delivery_status,
+                       entry.dest_mode,
+                       entry.delivery_mode,
+                       entry.vector
+               );
+       }
+       }
+       printk(KERN_DEBUG "IRQ to pin mappings:\n");
+       for (i = 0; i < NR_IRQS; i++) {
+               struct irq_pin_list *entry = irq_2_pin + i;
+               if (entry->pin < 0)
+                       continue;
+               printk(KERN_DEBUG "IRQ%d ", i);
+               for (;;) {
+                       printk("-> %d:%d", entry->apic, entry->pin);
+                       if (!entry->next)
+                               break;
+                       entry = irq_2_pin + entry->next;
+               }
+               printk("\n");
+       }
+
+       printk(KERN_INFO ".................................... done.\n");
+
+       return;
+}
+
+#if 0
+
+static void print_APIC_bitfield (int base)
+{
+       unsigned int v;
+       int i, j;
+
+       if (apic_verbosity == APIC_QUIET)
+               return;
+
+       printk(KERN_DEBUG "0123456789abcdef0123456789abcdef\n" KERN_DEBUG);
+       for (i = 0; i < 8; i++) {
+               v = apic_read(base + i*0x10);
+               for (j = 0; j < 32; j++) {
+                       if (v & (1<<j))
+                               printk("1");
+                       else
+                               printk("0");
+               }
+               printk("\n");
+       }
+}
+
+void /*__init*/ print_local_APIC(void * dummy)
+{
+       unsigned int v, ver, maxlvt;
+
+       if (apic_verbosity == APIC_QUIET)
+               return;
+
+       printk("\n" KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n",
+               smp_processor_id(), hard_smp_processor_id());
+       v = apic_read(APIC_ID);
+       printk(KERN_INFO "... APIC ID:      %08x (%01x)\n", v, GET_APIC_ID(v));
+       v = apic_read(APIC_LVR);
+       printk(KERN_INFO "... APIC VERSION: %08x\n", v);
+       ver = GET_APIC_VERSION(v);
+       maxlvt = lapic_get_maxlvt();
+
+       v = apic_read(APIC_TASKPRI);
+       printk(KERN_DEBUG "... APIC TASKPRI: %08x (%02x)\n", v, v & APIC_TPRI_MASK);
+
+       if (APIC_INTEGRATED(ver)) {                     /* !82489DX */
+               v = apic_read(APIC_ARBPRI);
+               printk(KERN_DEBUG "... APIC ARBPRI: %08x (%02x)\n", v,
+                       v & APIC_ARBPRI_MASK);
+               v = apic_read(APIC_PROCPRI);
+               printk(KERN_DEBUG "... APIC PROCPRI: %08x\n", v);
+       }
+
+       v = apic_read(APIC_EOI);
+       printk(KERN_DEBUG "... APIC EOI: %08x\n", v);
+       v = apic_read(APIC_RRR);
+       printk(KERN_DEBUG "... APIC RRR: %08x\n", v);
+       v = apic_read(APIC_LDR);
+       printk(KERN_DEBUG "... APIC LDR: %08x\n", v);
+       v = apic_read(APIC_DFR);
+       printk(KERN_DEBUG "... APIC DFR: %08x\n", v);
+       v = apic_read(APIC_SPIV);
+       printk(KERN_DEBUG "... APIC SPIV: %08x\n", v);
+
+       printk(KERN_DEBUG "... APIC ISR field:\n");
+       print_APIC_bitfield(APIC_ISR);
+       printk(KERN_DEBUG "... APIC TMR field:\n");
+       print_APIC_bitfield(APIC_TMR);
+       printk(KERN_DEBUG "... APIC IRR field:\n");
+       print_APIC_bitfield(APIC_IRR);
+
+       if (APIC_INTEGRATED(ver)) {             /* !82489DX */
+               if (maxlvt > 3)         /* Due to the Pentium erratum 3AP. */
+                       apic_write(APIC_ESR, 0);
+               v = apic_read(APIC_ESR);
+               printk(KERN_DEBUG "... APIC ESR: %08x\n", v);
+       }
+
+       v = apic_read(APIC_ICR);
+       printk(KERN_DEBUG "... APIC ICR: %08x\n", v);
+       v = apic_read(APIC_ICR2);
+       printk(KERN_DEBUG "... APIC ICR2: %08x\n", v);
+
+       v = apic_read(APIC_LVTT);
+       printk(KERN_DEBUG "... APIC LVTT: %08x\n", v);
+
+       if (maxlvt > 3) {                       /* PC is LVT#4. */
+               v = apic_read(APIC_LVTPC);
+               printk(KERN_DEBUG "... APIC LVTPC: %08x\n", v);
+       }
+       v = apic_read(APIC_LVT0);
+       printk(KERN_DEBUG "... APIC LVT0: %08x\n", v);
+       v = apic_read(APIC_LVT1);
+       printk(KERN_DEBUG "... APIC LVT1: %08x\n", v);
+
+       if (maxlvt > 2) {                       /* ERR is LVT#3. */
+               v = apic_read(APIC_LVTERR);
+               printk(KERN_DEBUG "... APIC LVTERR: %08x\n", v);
+       }
+
+       v = apic_read(APIC_TMICT);
+       printk(KERN_DEBUG "... APIC TMICT: %08x\n", v);
+       v = apic_read(APIC_TMCCT);
+       printk(KERN_DEBUG "... APIC TMCCT: %08x\n", v);
+       v = apic_read(APIC_TDCR);
+       printk(KERN_DEBUG "... APIC TDCR: %08x\n", v);
+       printk("\n");
+}
+
+void print_all_local_APICs (void)
+{
+       on_each_cpu(print_local_APIC, NULL, 1, 1);
+}
+
+void /*__init*/ print_PIC(void)
+{
+       unsigned int v;
+       unsigned long flags;
+
+       if (apic_verbosity == APIC_QUIET)
+               return;
+
+       printk(KERN_DEBUG "\nprinting PIC contents\n");
+
+       spin_lock_irqsave(&i8259A_lock, flags);
+
+       v = inb(0xa1) << 8 | inb(0x21);
+       printk(KERN_DEBUG "... PIC  IMR: %04x\n", v);
+
+       v = inb(0xa0) << 8 | inb(0x20);
+       printk(KERN_DEBUG "... PIC  IRR: %04x\n", v);
+
+       outb(0x0b,0xa0);
+       outb(0x0b,0x20);
+       v = inb(0xa0) << 8 | inb(0x20);
+       outb(0x0a,0xa0);
+       outb(0x0a,0x20);
+
+       spin_unlock_irqrestore(&i8259A_lock, flags);
+
+       printk(KERN_DEBUG "... PIC  ISR: %04x\n", v);
+
+       v = inb(0x4d1) << 8 | inb(0x4d0);
+       printk(KERN_DEBUG "... PIC ELCR: %04x\n", v);
+}
+
+#endif  /*  0  */
+
+static void __init enable_IO_APIC(void)
+{
+       union IO_APIC_reg_01 reg_01;
+       int i8259_apic, i8259_pin;
+       int i, apic;
+       unsigned long flags;
+
+       for (i = 0; i < PIN_MAP_SIZE; i++) {
+               irq_2_pin[i].pin = -1;
+               irq_2_pin[i].next = 0;
+       }
+       if (!pirqs_enabled)
+               for (i = 0; i < MAX_PIRQS; i++)
+                       pirq_entries[i] = -1;
+
+       /*
+        * The number of IO-APIC IRQ registers (== #pins):
+        */
+       for (apic = 0; apic < nr_ioapics; apic++) {
+               spin_lock_irqsave(&ioapic_lock, flags);
+               reg_01.raw = io_apic_read(apic, 1);
+               spin_unlock_irqrestore(&ioapic_lock, flags);
+               nr_ioapic_registers[apic] = reg_01.bits.entries+1;
+       }
+       for(apic = 0; apic < nr_ioapics; apic++) {
+               int pin;
+               /* See if any of the pins is in ExtINT mode */
+               for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
+                       struct IO_APIC_route_entry entry;
+                       entry = ioapic_read_entry(apic, pin);
+
+
+                       /* If the interrupt line is enabled and in ExtInt mode
+                        * I have found the pin where the i8259 is connected.
+                        */
+                       if ((entry.mask == 0) && (entry.delivery_mode == dest_ExtINT)) {
+                               ioapic_i8259.apic = apic;
+                               ioapic_i8259.pin  = pin;
+                               goto found_i8259;
+                       }
+               }
+       }
+ found_i8259:
+       /* Look to see what if the MP table has reported the ExtINT */
+       /* If we could not find the appropriate pin by looking at the ioapic
+        * the i8259 probably is not connected the ioapic but give the
+        * mptable a chance anyway.
+        */
+       i8259_pin  = find_isa_irq_pin(0, mp_ExtINT);
+       i8259_apic = find_isa_irq_apic(0, mp_ExtINT);
+       /* Trust the MP table if nothing is setup in the hardware */
+       if ((ioapic_i8259.pin == -1) && (i8259_pin >= 0)) {
+               printk(KERN_WARNING "ExtINT not setup in hardware but reported by MP table\n");
+               ioapic_i8259.pin  = i8259_pin;
+               ioapic_i8259.apic = i8259_apic;
+       }
+       /* Complain if the MP table and the hardware disagree */
+       if (((ioapic_i8259.apic != i8259_apic) || (ioapic_i8259.pin != i8259_pin)) &&
+               (i8259_pin >= 0) && (ioapic_i8259.pin >= 0))
+       {
+               printk(KERN_WARNING "ExtINT in hardware and MP table differ\n");
+       }
+
+       /*
+        * Do not trust the IO-APIC being empty at bootup
+        */
+       clear_IO_APIC();
+}
+
+/*
+ * Not an __init, needed by the reboot code
+ */
+void disable_IO_APIC(void)
+{
+       /*
+        * Clear the IO-APIC before rebooting:
+        */
+       clear_IO_APIC();
+
+       /*
+        * If the i8259 is routed through an IOAPIC
+        * Put that IOAPIC in virtual wire mode
+        * so legacy interrupts can be delivered.
+        */
+       if (ioapic_i8259.pin != -1) {
+               struct IO_APIC_route_entry entry;
+
+               memset(&entry, 0, sizeof(entry));
+               entry.mask            = 0; /* Enabled */
+               entry.trigger         = 0; /* Edge */
+               entry.irr             = 0;
+               entry.polarity        = 0; /* High */
+               entry.delivery_status = 0;
+               entry.dest_mode       = 0; /* Physical */
+               entry.delivery_mode   = dest_ExtINT; /* ExtInt */
+               entry.vector          = 0;
+               entry.dest.physical.physical_dest =
+                                       GET_APIC_ID(apic_read(APIC_ID));
+
+               /*
+                * Add it to the IO-APIC irq-routing table:
+                */
+               ioapic_write_entry(ioapic_i8259.apic, ioapic_i8259.pin, entry);
+       }
+       disconnect_bsp_APIC(ioapic_i8259.pin != -1);
+}
+
+/*
+ * function to set the IO-APIC physical IDs based on the
+ * values stored in the MPC table.
+ *
+ * by Matt Domsch <Matt_Domsch@dell.com>  Tue Dec 21 12:25:05 CST 1999
+ */
+
+#ifndef CONFIG_X86_NUMAQ
+static void __init setup_ioapic_ids_from_mpc(void)
+{
+       union IO_APIC_reg_00 reg_00;
+       physid_mask_t phys_id_present_map;
+       int apic;
+       int i;
+       unsigned char old_id;
+       unsigned long flags;
+
+       /*
+        * Don't check I/O APIC IDs for xAPIC systems.  They have
+        * no meaning without the serial APIC bus.
+        */
+       if (!(boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
+               || APIC_XAPIC(apic_version[boot_cpu_physical_apicid]))
+               return;
+       /*
+        * This is broken; anything with a real cpu count has to
+        * circumvent this idiocy regardless.
+        */
+       phys_id_present_map = ioapic_phys_id_map(phys_cpu_present_map);
+
+       /*
+        * Set the IOAPIC ID to the value stored in the MPC table.
+        */
+       for (apic = 0; apic < nr_ioapics; apic++) {
+
+               /* Read the register 0 value */
+               spin_lock_irqsave(&ioapic_lock, flags);
+               reg_00.raw = io_apic_read(apic, 0);
+               spin_unlock_irqrestore(&ioapic_lock, flags);
+               
+               old_id = mp_ioapics[apic].mpc_apicid;
+
+               if (mp_ioapics[apic].mpc_apicid >= get_physical_broadcast()) {
+                       printk(KERN_ERR "BIOS bug, IO-APIC#%d ID is %d in the MPC table!...\n",
+                               apic, mp_ioapics[apic].mpc_apicid);
+                       printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
+                               reg_00.bits.ID);
+                       mp_ioapics[apic].mpc_apicid = reg_00.bits.ID;
+               }
+
+               /*
+                * Sanity check, is the ID really free? Every APIC in a
+                * system must have a unique ID or we get lots of nice
+                * 'stuck on smp_invalidate_needed IPI wait' messages.
+                */
+               if (check_apicid_used(phys_id_present_map,
+                                       mp_ioapics[apic].mpc_apicid)) {
+                       printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n",
+                               apic, mp_ioapics[apic].mpc_apicid);
+                       for (i = 0; i < get_physical_broadcast(); i++)
+                               if (!physid_isset(i, phys_id_present_map))
+                                       break;
+                       if (i >= get_physical_broadcast())
+                               panic("Max APIC ID exceeded!\n");
+                       printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
+                               i);
+                       physid_set(i, phys_id_present_map);
+                       mp_ioapics[apic].mpc_apicid = i;
+               } else {
+                       physid_mask_t tmp;
+                       tmp = apicid_to_cpu_present(mp_ioapics[apic].mpc_apicid);
+                       apic_printk(APIC_VERBOSE, "Setting %d in the "
+                                       "phys_id_present_map\n",
+                                       mp_ioapics[apic].mpc_apicid);
+                       physids_or(phys_id_present_map, phys_id_present_map, tmp);
+               }
+
+
+               /*
+                * We need to adjust the IRQ routing table
+                * if the ID changed.
+                */
+               if (old_id != mp_ioapics[apic].mpc_apicid)
+                       for (i = 0; i < mp_irq_entries; i++)
+                               if (mp_irqs[i].mpc_dstapic == old_id)
+                                       mp_irqs[i].mpc_dstapic
+                                               = mp_ioapics[apic].mpc_apicid;
+
+               /*
+                * Read the right value from the MPC table and
+                * write it into the ID register.
+                */
+               apic_printk(APIC_VERBOSE, KERN_INFO
+                       "...changing IO-APIC physical APIC ID to %d ...",
+                       mp_ioapics[apic].mpc_apicid);
+
+               reg_00.bits.ID = mp_ioapics[apic].mpc_apicid;
+               spin_lock_irqsave(&ioapic_lock, flags);
+               io_apic_write(apic, 0, reg_00.raw);
+               spin_unlock_irqrestore(&ioapic_lock, flags);
+
+               /*
+                * Sanity check
+                */
+               spin_lock_irqsave(&ioapic_lock, flags);
+               reg_00.raw = io_apic_read(apic, 0);
+               spin_unlock_irqrestore(&ioapic_lock, flags);
+               if (reg_00.bits.ID != mp_ioapics[apic].mpc_apicid)
+                       printk("could not set ID!\n");
+               else
+                       apic_printk(APIC_VERBOSE, " ok.\n");
+       }
+}
+#else
+static void __init setup_ioapic_ids_from_mpc(void) { }
+#endif
+
+int no_timer_check __initdata;
+
+static int __init notimercheck(char *s)
+{
+       no_timer_check = 1;
+       return 1;
+}
+__setup("no_timer_check", notimercheck);
+
+/*
+ * There is a nasty bug in some older SMP boards, their mptable lies
+ * about the timer IRQ. We do the following to work around the situation:
+ *
+ *     - timer IRQ defaults to IO-APIC IRQ
+ *     - if this function detects that timer IRQs are defunct, then we fall
+ *       back to ISA timer IRQs
+ */
+static int __init timer_irq_works(void)
+{
+       unsigned long t1 = jiffies;
+
+       if (no_timer_check)
+               return 1;
+
+       local_irq_enable();
+       /* Let ten ticks pass... */
+       mdelay((10 * 1000) / HZ);
+
+       /*
+        * Expect a few ticks at least, to be sure some possible
+        * glue logic does not lock up after one or two first
+        * ticks in a non-ExtINT mode.  Also the local APIC
+        * might have cached one ExtINT interrupt.  Finally, at
+        * least one tick may be lost due to delays.
+        */
+       if (jiffies - t1 > 4)
+               return 1;
+
+       return 0;
+}
+
+/*
+ * In the SMP+IOAPIC case it might happen that there are an unspecified
+ * number of pending IRQ events unhandled. These cases are very rare,
+ * so we 'resend' these IRQs via IPIs, to the same CPU. It's much
+ * better to do it this way as thus we do not have to be aware of
+ * 'pending' interrupts in the IRQ path, except at this point.
+ */
+/*
+ * Edge triggered needs to resend any interrupt
+ * that was delayed but this is now handled in the device
+ * independent code.
+ */
+
+/*
+ * Startup quirk:
+ *
+ * Starting up a edge-triggered IO-APIC interrupt is
+ * nasty - we need to make sure that we get the edge.
+ * If it is already asserted for some reason, we need
+ * return 1 to indicate that is was pending.
+ *
+ * This is not complete - we should be able to fake
+ * an edge even if it isn't on the 8259A...
+ *
+ * (We do this for level-triggered IRQs too - it cannot hurt.)
+ */
+static unsigned int startup_ioapic_irq(unsigned int irq)
+{
+       int was_pending = 0;
+       unsigned long flags;
+
+       spin_lock_irqsave(&ioapic_lock, flags);
+       if (irq < 16) {
+               disable_8259A_irq(irq);
+               if (i8259A_irq_pending(irq))
+                       was_pending = 1;
+       }
+       __unmask_IO_APIC_irq(irq);
+       spin_unlock_irqrestore(&ioapic_lock, flags);
+
+       return was_pending;
+}
+
+static void ack_ioapic_irq(unsigned int irq)
+{
+       move_native_irq(irq);
+       ack_APIC_irq();
+}
+
+static void ack_ioapic_quirk_irq(unsigned int irq)
+{
+       unsigned long v;
+       int i;
+
+       move_native_irq(irq);
+/*
+ * It appears there is an erratum which affects at least version 0x11
+ * of I/O APIC (that's the 82093AA and cores integrated into various
+ * chipsets).  Under certain conditions a level-triggered interrupt is
+ * erroneously delivered as edge-triggered one but the respective IRR
+ * bit gets set nevertheless.  As a result the I/O unit expects an EOI
+ * message but it will never arrive and further interrupts are blocked
+ * from the source.  The exact reason is so far unknown, but the
+ * phenomenon was observed when two consecutive interrupt requests
+ * from a given source get delivered to the same CPU and the source is
+ * temporarily disabled in between.
+ *
+ * A workaround is to simulate an EOI message manually.  We achieve it
+ * by setting the trigger mode to edge and then to level when the edge
+ * trigger mode gets detected in the TMR of a local APIC for a
+ * level-triggered interrupt.  We mask the source for the time of the
+ * operation to prevent an edge-triggered interrupt escaping meanwhile.
+ * The idea is from Manfred Spraul.  --macro
+ */
+       i = irq_vector[irq];
+
+       v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1));
+
+       ack_APIC_irq();
+
+       if (!(v & (1 << (i & 0x1f)))) {
+               atomic_inc(&irq_mis_count);
+               spin_lock(&ioapic_lock);
+               __mask_and_edge_IO_APIC_irq(irq);
+               __unmask_and_level_IO_APIC_irq(irq);
+               spin_unlock(&ioapic_lock);
+       }
+}
+
+static int ioapic_retrigger_irq(unsigned int irq)
+{
+       send_IPI_self(irq_vector[irq]);
+
+       return 1;
+}
+
+static struct irq_chip ioapic_chip __read_mostly = {
+       .name           = "IO-APIC",
+       .startup        = startup_ioapic_irq,
+       .mask           = mask_IO_APIC_irq,
+       .unmask         = unmask_IO_APIC_irq,
+       .ack            = ack_ioapic_irq,
+       .eoi            = ack_ioapic_quirk_irq,
+#ifdef CONFIG_SMP
+       .set_affinity   = set_ioapic_affinity_irq,
+#endif
+       .retrigger      = ioapic_retrigger_irq,
+};
+
+
+static inline void init_IO_APIC_traps(void)
+{
+       int irq;
+
+       /*
+        * NOTE! The local APIC isn't very good at handling
+        * multiple interrupts at the same interrupt level.
+        * As the interrupt level is determined by taking the
+        * vector number and shifting that right by 4, we
+        * want to spread these out a bit so that they don't
+        * all fall in the same interrupt level.
+        *
+        * Also, we've got to be careful not to trash gate
+        * 0x80, because int 0x80 is hm, kind of importantish. ;)
+        */
+       for (irq = 0; irq < NR_IRQS ; irq++) {
+               int tmp = irq;
+               if (IO_APIC_IRQ(tmp) && !irq_vector[tmp]) {
+                       /*
+                        * Hmm.. We don't have an entry for this,
+                        * so default to an old-fashioned 8259
+                        * interrupt if we can..
+                        */
+                       if (irq < 16)
+                               make_8259A_irq(irq);
+                       else
+                               /* Strange. Oh, well.. */
+                               irq_desc[irq].chip = &no_irq_chip;
+               }
+       }
+}
+
+/*
+ * The local APIC irq-chip implementation:
+ */
+
+static void ack_apic(unsigned int irq)
+{
+       ack_APIC_irq();
+}
+
+static void mask_lapic_irq (unsigned int irq)
+{
+       unsigned long v;
+
+       v = apic_read(APIC_LVT0);
+       apic_write_around(APIC_LVT0, v | APIC_LVT_MASKED);
+}
+
+static void unmask_lapic_irq (unsigned int irq)
+{
+       unsigned long v;
+
+       v = apic_read(APIC_LVT0);
+       apic_write_around(APIC_LVT0, v & ~APIC_LVT_MASKED);
+}
+
+static struct irq_chip lapic_chip __read_mostly = {
+       .name           = "local-APIC-edge",
+       .mask           = mask_lapic_irq,
+       .unmask         = unmask_lapic_irq,
+       .eoi            = ack_apic,
+};
+
+static void setup_nmi (void)
+{
+       /*
+        * Dirty trick to enable the NMI watchdog ...
+        * We put the 8259A master into AEOI mode and
+        * unmask on all local APICs LVT0 as NMI.
+        *
+        * The idea to use the 8259A in AEOI mode ('8259A Virtual Wire')
+        * is from Maciej W. Rozycki - so we do not have to EOI from
+        * the NMI handler or the timer interrupt.
+        */ 
+       apic_printk(APIC_VERBOSE, KERN_INFO "activating NMI Watchdog ...");
+
+       on_each_cpu(enable_NMI_through_LVT0, NULL, 1, 1);
+
+       apic_printk(APIC_VERBOSE, " done.\n");
+}
+
+/*
+ * This looks a bit hackish but it's about the only one way of sending
+ * a few INTA cycles to 8259As and any associated glue logic.  ICR does
+ * not support the ExtINT mode, unfortunately.  We need to send these
+ * cycles as some i82489DX-based boards have glue logic that keeps the
+ * 8259A interrupt line asserted until INTA.  --macro
+ */
+static inline void unlock_ExtINT_logic(void)
+{
+       int apic, pin, i;
+       struct IO_APIC_route_entry entry0, entry1;
+       unsigned char save_control, save_freq_select;
+
+       pin  = find_isa_irq_pin(8, mp_INT);
+       if (pin == -1) {
+               WARN_ON_ONCE(1);
+               return;
+       }
+       apic = find_isa_irq_apic(8, mp_INT);
+       if (apic == -1) {
+               WARN_ON_ONCE(1);
+               return;
+       }
+
+       entry0 = ioapic_read_entry(apic, pin);
+       clear_IO_APIC_pin(apic, pin);
+
+       memset(&entry1, 0, sizeof(entry1));
+
+       entry1.dest_mode = 0;                   /* physical delivery */
+       entry1.mask = 0;                        /* unmask IRQ now */
+       entry1.dest.physical.physical_dest = hard_smp_processor_id();
+       entry1.delivery_mode = dest_ExtINT;
+       entry1.polarity = entry0.polarity;
+       entry1.trigger = 0;
+       entry1.vector = 0;
+
+       ioapic_write_entry(apic, pin, entry1);
+
+       save_control = CMOS_READ(RTC_CONTROL);
+       save_freq_select = CMOS_READ(RTC_FREQ_SELECT);
+       CMOS_WRITE((save_freq_select & ~RTC_RATE_SELECT) | 0x6,
+                  RTC_FREQ_SELECT);
+       CMOS_WRITE(save_control | RTC_PIE, RTC_CONTROL);
+
+       i = 100;
+       while (i-- > 0) {
+               mdelay(10);
+               if ((CMOS_READ(RTC_INTR_FLAGS) & RTC_PF) == RTC_PF)
+                       i -= 10;
+       }
+
+       CMOS_WRITE(save_control, RTC_CONTROL);
+       CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT);
+       clear_IO_APIC_pin(apic, pin);
+
+       ioapic_write_entry(apic, pin, entry0);
+}
+
+int timer_uses_ioapic_pin_0;
+
+/*
+ * This code may look a bit paranoid, but it's supposed to cooperate with
+ * a wide range of boards and BIOS bugs.  Fortunately only the timer IRQ
+ * is so screwy.  Thanks to Brian Perkins for testing/hacking this beast
+ * fanatically on his truly buggy board.
+ */
+static inline void __init check_timer(void)
+{
+       int apic1, pin1, apic2, pin2;
+       int vector;
+
+       /*
+        * get/set the timer IRQ vector:
+        */
+       disable_8259A_irq(0);
+       vector = assign_irq_vector(0);
+       set_intr_gate(vector, interrupt[0]);
+
+       /*
+        * Subtle, code in do_timer_interrupt() expects an AEOI
+        * mode for the 8259A whenever interrupts are routed
+        * through I/O APICs.  Also IRQ0 has to be enabled in
+        * the 8259A which implies the virtual wire has to be
+        * disabled in the local APIC.
+        */
+       apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
+       init_8259A(1);
+       timer_ack = 1;
+       if (timer_over_8254 > 0)
+               enable_8259A_irq(0);
+
+       pin1  = find_isa_irq_pin(0, mp_INT);
+       apic1 = find_isa_irq_apic(0, mp_INT);
+       pin2  = ioapic_i8259.pin;
+       apic2 = ioapic_i8259.apic;
+
+       if (pin1 == 0)
+               timer_uses_ioapic_pin_0 = 1;
+
+       printk(KERN_INFO "..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n",
+               vector, apic1, pin1, apic2, pin2);
+
+       if (pin1 != -1) {
+               /*
+                * Ok, does IRQ0 through the IOAPIC work?
+                */
+               unmask_IO_APIC_irq(0);
+               if (timer_irq_works()) {
+                       if (nmi_watchdog == NMI_IO_APIC) {
+                               disable_8259A_irq(0);
+                               setup_nmi();
+                               enable_8259A_irq(0);
+                       }
+                       if (disable_timer_pin_1 > 0)
+                               clear_IO_APIC_pin(0, pin1);
+                       return;
+               }
+               clear_IO_APIC_pin(apic1, pin1);
+               printk(KERN_ERR "..MP-BIOS bug: 8254 timer not connected to "
+                               "IO-APIC\n");
+       }
+
+       printk(KERN_INFO "...trying to set up timer (IRQ0) through the 8259A ... ");
+       if (pin2 != -1) {
+               printk("\n..... (found pin %d) ...", pin2);
+               /*
+                * legacy devices should be connected to IO APIC #0
+                */
+               setup_ExtINT_IRQ0_pin(apic2, pin2, vector);
+               if (timer_irq_works()) {
+                       printk("works.\n");
+                       if (pin1 != -1)
+                               replace_pin_at_irq(0, apic1, pin1, apic2, pin2);
+                       else
+                               add_pin_to_irq(0, apic2, pin2);
+                       if (nmi_watchdog == NMI_IO_APIC) {
+                               setup_nmi();
+                       }
+                       return;
+               }
+               /*
+                * Cleanup, just in case ...
+                */
+               clear_IO_APIC_pin(apic2, pin2);
+       }
+       printk(" failed.\n");
+
+       if (nmi_watchdog == NMI_IO_APIC) {
+               printk(KERN_WARNING "timer doesn't work through the IO-APIC - disabling NMI Watchdog!\n");
+               nmi_watchdog = 0;
+       }
+
+       printk(KERN_INFO "...trying to set up timer as Virtual Wire IRQ...");
+
+       disable_8259A_irq(0);
+       set_irq_chip_and_handler_name(0, &lapic_chip, handle_fasteoi_irq,
+                                     "fasteoi");
+       apic_write_around(APIC_LVT0, APIC_DM_FIXED | vector);   /* Fixed mode */
+       enable_8259A_irq(0);
+
+       if (timer_irq_works()) {
+               printk(" works.\n");
+               return;
+       }
+       apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | vector);
+       printk(" failed.\n");
+
+       printk(KERN_INFO "...trying to set up timer as ExtINT IRQ...");
+
+       timer_ack = 0;
+       init_8259A(0);
+       make_8259A_irq(0);
+       apic_write_around(APIC_LVT0, APIC_DM_EXTINT);
+
+       unlock_ExtINT_logic();
+
+       if (timer_irq_works()) {
+               printk(" works.\n");
+               return;
+       }
+       printk(" failed :(.\n");
+       panic("IO-APIC + timer doesn't work!  Boot with apic=debug and send a "
+               "report.  Then try booting with the 'noapic' option");
+}
+
+/*
+ *
+ * IRQ's that are handled by the PIC in the MPS IOAPIC case.
+ * - IRQ2 is the cascade IRQ, and cannot be a io-apic IRQ.
+ *   Linux doesn't really care, as it's not actually used
+ *   for any interrupt handling anyway.
+ */
+#define PIC_IRQS       (1 << PIC_CASCADE_IR)
+
+void __init setup_IO_APIC(void)
+{
+       enable_IO_APIC();
+
+       if (acpi_ioapic)
+               io_apic_irqs = ~0;      /* all IRQs go through IOAPIC */
+       else
+               io_apic_irqs = ~PIC_IRQS;
+
+       printk("ENABLING IO-APIC IRQs\n");
+
+       /*
+        * Set up IO-APIC IRQ routing.
+        */
+       if (!acpi_ioapic)
+               setup_ioapic_ids_from_mpc();
+       sync_Arb_IDs();
+       setup_IO_APIC_irqs();
+       init_IO_APIC_traps();
+       check_timer();
+       if (!acpi_ioapic)
+               print_IO_APIC();
+}
+
+static int __init setup_disable_8254_timer(char *s)
+{
+       timer_over_8254 = -1;
+       return 1;
+}
+static int __init setup_enable_8254_timer(char *s)
+{
+       timer_over_8254 = 2;
+       return 1;
+}
+
+__setup("disable_8254_timer", setup_disable_8254_timer);
+__setup("enable_8254_timer", setup_enable_8254_timer);
+
+/*
+ *     Called after all the initialization is done. If we didnt find any
+ *     APIC bugs then we can allow the modify fast path
+ */
+static int __init io_apic_bug_finalize(void)
+{
+       if(sis_apic_bug == -1)
+               sis_apic_bug = 0;
+       return 0;
+}
+
+late_initcall(io_apic_bug_finalize);
+
+struct sysfs_ioapic_data {
+       struct sys_device dev;
+       struct IO_APIC_route_entry entry[0];
+};
+static struct sysfs_ioapic_data * mp_ioapic_data[MAX_IO_APICS];
+
+static int ioapic_suspend(struct sys_device *dev, pm_message_t state)
+{
+       struct IO_APIC_route_entry *entry;
+       struct sysfs_ioapic_data *data;
+       int i;
+       
+       data = container_of(dev, struct sysfs_ioapic_data, dev);
+       entry = data->entry;
+       for (i = 0; i < nr_ioapic_registers[dev->id]; i ++)
+               entry[i] = ioapic_read_entry(dev->id, i);
+
+       return 0;
+}
+
+static int ioapic_resume(struct sys_device *dev)
+{
+       struct IO_APIC_route_entry *entry;
+       struct sysfs_ioapic_data *data;
+       unsigned long flags;
+       union IO_APIC_reg_00 reg_00;
+       int i;
+       
+       data = container_of(dev, struct sysfs_ioapic_data, dev);
+       entry = data->entry;
+
+       spin_lock_irqsave(&ioapic_lock, flags);
+       reg_00.raw = io_apic_read(dev->id, 0);
+       if (reg_00.bits.ID != mp_ioapics[dev->id].mpc_apicid) {
+               reg_00.bits.ID = mp_ioapics[dev->id].mpc_apicid;
+               io_apic_write(dev->id, 0, reg_00.raw);
+       }
+       spin_unlock_irqrestore(&ioapic_lock, flags);
+       for (i = 0; i < nr_ioapic_registers[dev->id]; i ++)
+               ioapic_write_entry(dev->id, i, entry[i]);
+
+       return 0;
+}
+
+static struct sysdev_class ioapic_sysdev_class = {
+       set_kset_name("ioapic"),
+       .suspend = ioapic_suspend,
+       .resume = ioapic_resume,
+};
+
+static int __init ioapic_init_sysfs(void)
+{
+       struct sys_device * dev;
+       int i, size, error = 0;
+
+       error = sysdev_class_register(&ioapic_sysdev_class);
+       if (error)
+               return error;
+
+       for (i = 0; i < nr_ioapics; i++ ) {
+               size = sizeof(struct sys_device) + nr_ioapic_registers[i] 
+                       * sizeof(struct IO_APIC_route_entry);
+               mp_ioapic_data[i] = kmalloc(size, GFP_KERNEL);
+               if (!mp_ioapic_data[i]) {
+                       printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i);
+                       continue;
+               }
+               memset(mp_ioapic_data[i], 0, size);
+               dev = &mp_ioapic_data[i]->dev;
+               dev->id = i; 
+               dev->cls = &ioapic_sysdev_class;
+               error = sysdev_register(dev);
+               if (error) {
+                       kfree(mp_ioapic_data[i]);
+                       mp_ioapic_data[i] = NULL;
+                       printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i);
+                       continue;
+               }
+       }
+
+       return 0;
+}
+
+device_initcall(ioapic_init_sysfs);
+
+/*
+ * Dynamic irq allocate and deallocation
+ */
+int create_irq(void)
+{
+       /* Allocate an unused irq */
+       int irq, new, vector = 0;
+       unsigned long flags;
+
+       irq = -ENOSPC;
+       spin_lock_irqsave(&vector_lock, flags);
+       for (new = (NR_IRQS - 1); new >= 0; new--) {
+               if (platform_legacy_irq(new))
+                       continue;
+               if (irq_vector[new] != 0)
+                       continue;
+               vector = __assign_irq_vector(new);
+               if (likely(vector > 0))
+                       irq = new;
+               break;
+       }
+       spin_unlock_irqrestore(&vector_lock, flags);
+
+       if (irq >= 0) {
+               set_intr_gate(vector, interrupt[irq]);
+               dynamic_irq_init(irq);
+       }
+       return irq;
+}
+
+void destroy_irq(unsigned int irq)
+{
+       unsigned long flags;
+
+       dynamic_irq_cleanup(irq);
+
+       spin_lock_irqsave(&vector_lock, flags);
+       irq_vector[irq] = 0;
+       spin_unlock_irqrestore(&vector_lock, flags);
+}
+
+/*
+ * MSI mesage composition
+ */
+#ifdef CONFIG_PCI_MSI
+static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_msg *msg)
+{
+       int vector;
+       unsigned dest;
+
+       vector = assign_irq_vector(irq);
+       if (vector >= 0) {
+               dest = cpu_mask_to_apicid(TARGET_CPUS);
+
+               msg->address_hi = MSI_ADDR_BASE_HI;
+               msg->address_lo =
+                       MSI_ADDR_BASE_LO |
+                       ((INT_DEST_MODE == 0) ?
+                               MSI_ADDR_DEST_MODE_PHYSICAL:
+                               MSI_ADDR_DEST_MODE_LOGICAL) |
+                       ((INT_DELIVERY_MODE != dest_LowestPrio) ?
+                               MSI_ADDR_REDIRECTION_CPU:
+                               MSI_ADDR_REDIRECTION_LOWPRI) |
+                       MSI_ADDR_DEST_ID(dest);
+
+               msg->data =
+                       MSI_DATA_TRIGGER_EDGE |
+                       MSI_DATA_LEVEL_ASSERT |
+                       ((INT_DELIVERY_MODE != dest_LowestPrio) ?
+                               MSI_DATA_DELIVERY_FIXED:
+                               MSI_DATA_DELIVERY_LOWPRI) |
+                       MSI_DATA_VECTOR(vector);
+       }
+       return vector;
+}
+
+#ifdef CONFIG_SMP
+static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask)
+{
+       struct msi_msg msg;
+       unsigned int dest;
+       cpumask_t tmp;
+       int vector;
+
+       cpus_and(tmp, mask, cpu_online_map);
+       if (cpus_empty(tmp))
+               tmp = TARGET_CPUS;
+
+       vector = assign_irq_vector(irq);
+       if (vector < 0)
+               return;
+
+       dest = cpu_mask_to_apicid(mask);
+
+       read_msi_msg(irq, &msg);
+
+       msg.data &= ~MSI_DATA_VECTOR_MASK;
+       msg.data |= MSI_DATA_VECTOR(vector);
+       msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK;
+       msg.address_lo |= MSI_ADDR_DEST_ID(dest);
+
+       write_msi_msg(irq, &msg);
+       irq_desc[irq].affinity = mask;
+}
+#endif /* CONFIG_SMP */
+
+/*
+ * IRQ Chip for MSI PCI/PCI-X/PCI-Express Devices,
+ * which implement the MSI or MSI-X Capability Structure.
+ */
+static struct irq_chip msi_chip = {
+       .name           = "PCI-MSI",
+       .unmask         = unmask_msi_irq,
+       .mask           = mask_msi_irq,
+       .ack            = ack_ioapic_irq,
+#ifdef CONFIG_SMP
+       .set_affinity   = set_msi_irq_affinity,
+#endif
+       .retrigger      = ioapic_retrigger_irq,
+};
+
+int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc)
+{
+       struct msi_msg msg;
+       int irq, ret;
+       irq = create_irq();
+       if (irq < 0)
+               return irq;
+
+       ret = msi_compose_msg(dev, irq, &msg);
+       if (ret < 0) {
+               destroy_irq(irq);
+               return ret;
+       }
+
+       set_irq_msi(irq, desc);
+       write_msi_msg(irq, &msg);
+
+       set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq,
+                                     "edge");
+
+       return 0;
+}
+
+void arch_teardown_msi_irq(unsigned int irq)
+{
+       destroy_irq(irq);
+}
+
+#endif /* CONFIG_PCI_MSI */
+
+/*
+ * Hypertransport interrupt support
+ */
+#ifdef CONFIG_HT_IRQ
+
+#ifdef CONFIG_SMP
+
+static void target_ht_irq(unsigned int irq, unsigned int dest)
+{
+       struct ht_irq_msg msg;
+       fetch_ht_irq_msg(irq, &msg);
+
+       msg.address_lo &= ~(HT_IRQ_LOW_DEST_ID_MASK);
+       msg.address_hi &= ~(HT_IRQ_HIGH_DEST_ID_MASK);
+
+       msg.address_lo |= HT_IRQ_LOW_DEST_ID(dest);
+       msg.address_hi |= HT_IRQ_HIGH_DEST_ID(dest);
+
+       write_ht_irq_msg(irq, &msg);
+}
+
+static void set_ht_irq_affinity(unsigned int irq, cpumask_t mask)
+{
+       unsigned int dest;
+       cpumask_t tmp;
+
+       cpus_and(tmp, mask, cpu_online_map);
+       if (cpus_empty(tmp))
+               tmp = TARGET_CPUS;
+
+       cpus_and(mask, tmp, CPU_MASK_ALL);
+
+       dest = cpu_mask_to_apicid(mask);
+
+       target_ht_irq(irq, dest);
+       irq_desc[irq].affinity = mask;
+}
+#endif
+
+static struct irq_chip ht_irq_chip = {
+       .name           = "PCI-HT",
+       .mask           = mask_ht_irq,
+       .unmask         = unmask_ht_irq,
+       .ack            = ack_ioapic_irq,
+#ifdef CONFIG_SMP
+       .set_affinity   = set_ht_irq_affinity,
+#endif
+       .retrigger      = ioapic_retrigger_irq,
+};
+
+int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
+{
+       int vector;
+
+       vector = assign_irq_vector(irq);
+       if (vector >= 0) {
+               struct ht_irq_msg msg;
+               unsigned dest;
+               cpumask_t tmp;
+
+               cpus_clear(tmp);
+               cpu_set(vector >> 8, tmp);
+               dest = cpu_mask_to_apicid(tmp);
+
+               msg.address_hi = HT_IRQ_HIGH_DEST_ID(dest);
+
+               msg.address_lo =
+                       HT_IRQ_LOW_BASE |
+                       HT_IRQ_LOW_DEST_ID(dest) |
+                       HT_IRQ_LOW_VECTOR(vector) |
+                       ((INT_DEST_MODE == 0) ?
+                               HT_IRQ_LOW_DM_PHYSICAL :
+                               HT_IRQ_LOW_DM_LOGICAL) |
+                       HT_IRQ_LOW_RQEOI_EDGE |
+                       ((INT_DELIVERY_MODE != dest_LowestPrio) ?
+                               HT_IRQ_LOW_MT_FIXED :
+                               HT_IRQ_LOW_MT_ARBITRATED) |
+                       HT_IRQ_LOW_IRQ_MASKED;
+
+               write_ht_irq_msg(irq, &msg);
+
+               set_irq_chip_and_handler_name(irq, &ht_irq_chip,
+                                             handle_edge_irq, "edge");
+       }
+       return vector;
+}
+#endif /* CONFIG_HT_IRQ */
+
+/* --------------------------------------------------------------------------
+                          ACPI-based IOAPIC Configuration
+   -------------------------------------------------------------------------- */
+
+#ifdef CONFIG_ACPI
+
+int __init io_apic_get_unique_id (int ioapic, int apic_id)
+{
+       union IO_APIC_reg_00 reg_00;
+       static physid_mask_t apic_id_map = PHYSID_MASK_NONE;
+       physid_mask_t tmp;
+       unsigned long flags;
+       int i = 0;
+
+       /*
+        * The P4 platform supports up to 256 APIC IDs on two separate APIC 
+        * buses (one for LAPICs, one for IOAPICs), where predecessors only 
+        * supports up to 16 on one shared APIC bus.
+        * 
+        * TBD: Expand LAPIC/IOAPIC support on P4-class systems to take full
+        *      advantage of new APIC bus architecture.
+        */
+
+       if (physids_empty(apic_id_map))
+               apic_id_map = ioapic_phys_id_map(phys_cpu_present_map);
+
+       spin_lock_irqsave(&ioapic_lock, flags);
+       reg_00.raw = io_apic_read(ioapic, 0);
+       spin_unlock_irqrestore(&ioapic_lock, flags);
+
+       if (apic_id >= get_physical_broadcast()) {
+               printk(KERN_WARNING "IOAPIC[%d]: Invalid apic_id %d, trying "
+                       "%d\n", ioapic, apic_id, reg_00.bits.ID);
+               apic_id = reg_00.bits.ID;
+       }
+
+       /*
+        * Every APIC in a system must have a unique ID or we get lots of nice 
+        * 'stuck on smp_invalidate_needed IPI wait' messages.
+        */
+       if (check_apicid_used(apic_id_map, apic_id)) {
+
+               for (i = 0; i < get_physical_broadcast(); i++) {
+                       if (!check_apicid_used(apic_id_map, i))
+                               break;
+               }
+
+               if (i == get_physical_broadcast())
+                       panic("Max apic_id exceeded!\n");
+
+               printk(KERN_WARNING "IOAPIC[%d]: apic_id %d already used, "
+                       "trying %d\n", ioapic, apic_id, i);
+
+               apic_id = i;
+       } 
+
+       tmp = apicid_to_cpu_present(apic_id);
+       physids_or(apic_id_map, apic_id_map, tmp);
+
+       if (reg_00.bits.ID != apic_id) {
+               reg_00.bits.ID = apic_id;
+
+               spin_lock_irqsave(&ioapic_lock, flags);
+               io_apic_write(ioapic, 0, reg_00.raw);
+               reg_00.raw = io_apic_read(ioapic, 0);
+               spin_unlock_irqrestore(&ioapic_lock, flags);
+
+               /* Sanity check */
+               if (reg_00.bits.ID != apic_id) {
+                       printk("IOAPIC[%d]: Unable to change apic_id!\n", ioapic);
+                       return -1;
+               }
+       }
+
+       apic_printk(APIC_VERBOSE, KERN_INFO
+                       "IOAPIC[%d]: Assigned apic_id %d\n", ioapic, apic_id);
+
+       return apic_id;
+}
+
+
+int __init io_apic_get_version (int ioapic)
+{
+       union IO_APIC_reg_01    reg_01;
+       unsigned long flags;
+
+       spin_lock_irqsave(&ioapic_lock, flags);
+       reg_01.raw = io_apic_read(ioapic, 1);
+       spin_unlock_irqrestore(&ioapic_lock, flags);
+
+       return reg_01.bits.version;
+}
+
+
+int __init io_apic_get_redir_entries (int ioapic)
+{
+       union IO_APIC_reg_01    reg_01;
+       unsigned long flags;
+
+       spin_lock_irqsave(&ioapic_lock, flags);
+       reg_01.raw = io_apic_read(ioapic, 1);
+       spin_unlock_irqrestore(&ioapic_lock, flags);
+
+       return reg_01.bits.entries;
+}
+
+
+int io_apic_set_pci_routing (int ioapic, int pin, int irq, int edge_level, int active_high_low)
+{
+       struct IO_APIC_route_entry entry;
+       unsigned long flags;
+
+       if (!IO_APIC_IRQ(irq)) {
+               printk(KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n",
+                       ioapic);
+               return -EINVAL;
+       }
+
+       /*
+        * Generate a PCI IRQ routing entry and program the IOAPIC accordingly.
+        * Note that we mask (disable) IRQs now -- these get enabled when the
+        * corresponding device driver registers for this IRQ.
+        */
+
+       memset(&entry,0,sizeof(entry));
+
+       entry.delivery_mode = INT_DELIVERY_MODE;
+       entry.dest_mode = INT_DEST_MODE;
+       entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
+       entry.trigger = edge_level;
+       entry.polarity = active_high_low;
+       entry.mask  = 1;
+
+       /*
+        * IRQs < 16 are already in the irq_2_pin[] map
+        */
+       if (irq >= 16)
+               add_pin_to_irq(irq, ioapic, pin);
+
+       entry.vector = assign_irq_vector(irq);
+
+       apic_printk(APIC_DEBUG, KERN_DEBUG "IOAPIC[%d]: Set PCI routing entry "
+               "(%d-%d -> 0x%x -> IRQ %d Mode:%i Active:%i)\n", ioapic,
+               mp_ioapics[ioapic].mpc_apicid, pin, entry.vector, irq,
+               edge_level, active_high_low);
+
+       ioapic_register_intr(irq, entry.vector, edge_level);
+
+       if (!ioapic && (irq < 16))
+               disable_8259A_irq(irq);
+
+       spin_lock_irqsave(&ioapic_lock, flags);
+       __ioapic_write_entry(ioapic, pin, entry);
+       spin_unlock_irqrestore(&ioapic_lock, flags);
+
+       return 0;
+}
+
+#endif /* CONFIG_ACPI */
+
+static int __init parse_disable_timer_pin_1(char *arg)
+{
+       disable_timer_pin_1 = 1;
+       return 0;
+}
+early_param("disable_timer_pin_1", parse_disable_timer_pin_1);
+
+static int __init parse_enable_timer_pin_1(char *arg)
+{
+       disable_timer_pin_1 = -1;
+       return 0;
+}
+early_param("enable_timer_pin_1", parse_enable_timer_pin_1);
+
+static int __init parse_noapic(char *arg)
+{
+       /* disable IO-APIC */
+       disable_ioapic_setup();
+       return 0;
+}
+early_param("noapic", parse_noapic);
diff --git a/arch/x86/kernel/ioport_32.c b/arch/x86/kernel/ioport_32.c
new file mode 100644 (file)
index 0000000..3d310a9
--- /dev/null
@@ -0,0 +1,153 @@
+/*
+ *     linux/arch/i386/kernel/ioport.c
+ *
+ * This contains the io-permission bitmap code - written by obz, with changes
+ * by Linus.
+ */
+
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/capability.h>
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/ioport.h>
+#include <linux/smp.h>
+#include <linux/stddef.h>
+#include <linux/slab.h>
+#include <linux/thread_info.h>
+#include <linux/syscalls.h>
+
+/* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */
+static void set_bitmap(unsigned long *bitmap, unsigned int base, unsigned int extent, int new_value)
+{
+       unsigned long mask;
+       unsigned long *bitmap_base = bitmap + (base / BITS_PER_LONG);
+       unsigned int low_index = base & (BITS_PER_LONG-1);
+       int length = low_index + extent;
+
+       if (low_index != 0) {
+               mask = (~0UL << low_index);
+               if (length < BITS_PER_LONG)
+                       mask &= ~(~0UL << length);
+               if (new_value)
+                       *bitmap_base++ |= mask;
+               else
+                       *bitmap_base++ &= ~mask;
+               length -= BITS_PER_LONG;
+       }
+
+       mask = (new_value ? ~0UL : 0UL);
+       while (length >= BITS_PER_LONG) {
+               *bitmap_base++ = mask;
+               length -= BITS_PER_LONG;
+       }
+
+       if (length > 0) {
+               mask = ~(~0UL << length);
+               if (new_value)
+                       *bitmap_base++ |= mask;
+               else
+                       *bitmap_base++ &= ~mask;
+       }
+}
+
+
+/*
+ * this changes the io permissions bitmap in the current task.
+ */
+asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
+{
+       unsigned long i, max_long, bytes, bytes_updated;
+       struct thread_struct * t = &current->thread;
+       struct tss_struct * tss;
+       unsigned long *bitmap;
+
+       if ((from + num <= from) || (from + num > IO_BITMAP_BITS))
+               return -EINVAL;
+       if (turn_on && !capable(CAP_SYS_RAWIO))
+               return -EPERM;
+
+       /*
+        * If it's the first ioperm() call in this thread's lifetime, set the
+        * IO bitmap up. ioperm() is much less timing critical than clone(),
+        * this is why we delay this operation until now:
+        */
+       if (!t->io_bitmap_ptr) {
+               bitmap = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
+               if (!bitmap)
+                       return -ENOMEM;
+
+               memset(bitmap, 0xff, IO_BITMAP_BYTES);
+               t->io_bitmap_ptr = bitmap;
+               set_thread_flag(TIF_IO_BITMAP);
+       }
+
+       /*
+        * do it in the per-thread copy and in the TSS ...
+        *
+        * Disable preemption via get_cpu() - we must not switch away
+        * because the ->io_bitmap_max value must match the bitmap
+        * contents:
+        */
+       tss = &per_cpu(init_tss, get_cpu());
+
+       set_bitmap(t->io_bitmap_ptr, from, num, !turn_on);
+
+       /*
+        * Search for a (possibly new) maximum. This is simple and stupid,
+        * to keep it obviously correct:
+        */
+       max_long = 0;
+       for (i = 0; i < IO_BITMAP_LONGS; i++)
+               if (t->io_bitmap_ptr[i] != ~0UL)
+                       max_long = i;
+
+       bytes = (max_long + 1) * sizeof(long);
+       bytes_updated = max(bytes, t->io_bitmap_max);
+
+       t->io_bitmap_max = bytes;
+
+       /*
+        * Sets the lazy trigger so that the next I/O operation will
+        * reload the correct bitmap.
+        * Reset the owner so that a process switch will not set
+        * tss->io_bitmap_base to IO_BITMAP_OFFSET.
+        */
+       tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET_LAZY;
+       tss->io_bitmap_owner = NULL;
+
+       put_cpu();
+
+       return 0;
+}
+
+/*
+ * sys_iopl has to be used when you want to access the IO ports
+ * beyond the 0x3ff range: to get the full 65536 ports bitmapped
+ * you'd need 8kB of bitmaps/process, which is a bit excessive.
+ *
+ * Here we just change the eflags value on the stack: we allow
+ * only the super-user to do it. This depends on the stack-layout
+ * on system-call entry - see also fork() and the signal handling
+ * code.
+ */
+
+asmlinkage long sys_iopl(unsigned long unused)
+{
+       volatile struct pt_regs * regs = (struct pt_regs *) &unused;
+       unsigned int level = regs->ebx;
+       unsigned int old = (regs->eflags >> 12) & 3;
+       struct thread_struct *t = &current->thread;
+
+       if (level > 3)
+               return -EINVAL;
+       /* Trying to gain more privileges? */
+       if (level > old) {
+               if (!capable(CAP_SYS_RAWIO))
+                       return -EPERM;
+       }
+       t->iopl = level << 12;
+       regs->eflags = (regs->eflags & ~X86_EFLAGS_IOPL) | t->iopl;
+       set_iopl_mask(t->iopl);
+       return 0;
+}
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
new file mode 100644 (file)
index 0000000..dd2b97f
--- /dev/null
@@ -0,0 +1,343 @@
+/*
+ *     linux/arch/i386/kernel/irq.c
+ *
+ *     Copyright (C) 1992, 1998 Linus Torvalds, Ingo Molnar
+ *
+ * This file contains the lowest level x86-specific interrupt
+ * entry, irq-stacks and irq statistics code. All the remaining
+ * irq logic is done by the generic kernel/irq/ code and
+ * by the x86-specific irq controller code. (e.g. i8259.c and
+ * io_apic.c.)
+ */
+
+#include <linux/module.h>
+#include <linux/seq_file.h>
+#include <linux/interrupt.h>
+#include <linux/kernel_stat.h>
+#include <linux/notifier.h>
+#include <linux/cpu.h>
+#include <linux/delay.h>
+
+#include <asm/apic.h>
+#include <asm/uaccess.h>
+
+DEFINE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat);
+EXPORT_PER_CPU_SYMBOL(irq_stat);
+
+DEFINE_PER_CPU(struct pt_regs *, irq_regs);
+EXPORT_PER_CPU_SYMBOL(irq_regs);
+
+/*
+ * 'what should we do if we get a hw irq event on an illegal vector'.
+ * each architecture has to answer this themselves.
+ */
+void ack_bad_irq(unsigned int irq)
+{
+       printk(KERN_ERR "unexpected IRQ trap at vector %02x\n", irq);
+
+#ifdef CONFIG_X86_LOCAL_APIC
+       /*
+        * Currently unexpected vectors happen only on SMP and APIC.
+        * We _must_ ack these because every local APIC has only N
+        * irq slots per priority level, and a 'hanging, unacked' IRQ
+        * holds up an irq slot - in excessive cases (when multiple
+        * unexpected vectors occur) that might lock up the APIC
+        * completely.
+        * But only ack when the APIC is enabled -AK
+        */
+       if (cpu_has_apic)
+               ack_APIC_irq();
+#endif
+}
+
+#ifdef CONFIG_4KSTACKS
+/*
+ * per-CPU IRQ handling contexts (thread information and stack)
+ */
+union irq_ctx {
+       struct thread_info      tinfo;
+       u32                     stack[THREAD_SIZE/sizeof(u32)];
+};
+
+static union irq_ctx *hardirq_ctx[NR_CPUS] __read_mostly;
+static union irq_ctx *softirq_ctx[NR_CPUS] __read_mostly;
+#endif
+
+/*
+ * do_IRQ handles all normal device IRQ's (the special
+ * SMP cross-CPU interrupts have their own specific
+ * handlers).
+ */
+fastcall unsigned int do_IRQ(struct pt_regs *regs)
+{      
+       struct pt_regs *old_regs;
+       /* high bit used in ret_from_ code */
+       int irq = ~regs->orig_eax;
+       struct irq_desc *desc = irq_desc + irq;
+#ifdef CONFIG_4KSTACKS
+       union irq_ctx *curctx, *irqctx;
+       u32 *isp;
+#endif
+
+       if (unlikely((unsigned)irq >= NR_IRQS)) {
+               printk(KERN_EMERG "%s: cannot handle IRQ %d\n",
+                                       __FUNCTION__, irq);
+               BUG();
+       }
+
+       old_regs = set_irq_regs(regs);
+       irq_enter();
+#ifdef CONFIG_DEBUG_STACKOVERFLOW
+       /* Debugging check for stack overflow: is there less than 1KB free? */
+       {
+               long esp;
+
+               __asm__ __volatile__("andl %%esp,%0" :
+                                       "=r" (esp) : "0" (THREAD_SIZE - 1));
+               if (unlikely(esp < (sizeof(struct thread_info) + STACK_WARN))) {
+                       printk("do_IRQ: stack overflow: %ld\n",
+                               esp - sizeof(struct thread_info));
+                       dump_stack();
+               }
+       }
+#endif
+
+#ifdef CONFIG_4KSTACKS
+
+       curctx = (union irq_ctx *) current_thread_info();
+       irqctx = hardirq_ctx[smp_processor_id()];
+
+       /*
+        * this is where we switch to the IRQ stack. However, if we are
+        * already using the IRQ stack (because we interrupted a hardirq
+        * handler) we can't do that and just have to keep using the
+        * current stack (which is the irq stack already after all)
+        */
+       if (curctx != irqctx) {
+               int arg1, arg2, ebx;
+
+               /* build the stack frame on the IRQ stack */
+               isp = (u32*) ((char*)irqctx + sizeof(*irqctx));
+               irqctx->tinfo.task = curctx->tinfo.task;
+               irqctx->tinfo.previous_esp = current_stack_pointer;
+
+               /*
+                * Copy the softirq bits in preempt_count so that the
+                * softirq checks work in the hardirq context.
+                */
+               irqctx->tinfo.preempt_count =
+                       (irqctx->tinfo.preempt_count & ~SOFTIRQ_MASK) |
+                       (curctx->tinfo.preempt_count & SOFTIRQ_MASK);
+
+               asm volatile(
+                       "       xchgl  %%ebx,%%esp      \n"
+                       "       call   *%%edi           \n"
+                       "       movl   %%ebx,%%esp      \n"
+                       : "=a" (arg1), "=d" (arg2), "=b" (ebx)
+                       :  "0" (irq),   "1" (desc),  "2" (isp),
+                          "D" (desc->handle_irq)
+                       : "memory", "cc"
+               );
+       } else
+#endif
+               desc->handle_irq(irq, desc);
+
+       irq_exit();
+       set_irq_regs(old_regs);
+       return 1;
+}
+
+#ifdef CONFIG_4KSTACKS
+
+static char softirq_stack[NR_CPUS * THREAD_SIZE]
+               __attribute__((__section__(".bss.page_aligned")));
+
+static char hardirq_stack[NR_CPUS * THREAD_SIZE]
+               __attribute__((__section__(".bss.page_aligned")));
+
+/*
+ * allocate per-cpu stacks for hardirq and for softirq processing
+ */
+void irq_ctx_init(int cpu)
+{
+       union irq_ctx *irqctx;
+
+       if (hardirq_ctx[cpu])
+               return;
+
+       irqctx = (union irq_ctx*) &hardirq_stack[cpu*THREAD_SIZE];
+       irqctx->tinfo.task              = NULL;
+       irqctx->tinfo.exec_domain       = NULL;
+       irqctx->tinfo.cpu               = cpu;
+       irqctx->tinfo.preempt_count     = HARDIRQ_OFFSET;
+       irqctx->tinfo.addr_limit        = MAKE_MM_SEG(0);
+
+       hardirq_ctx[cpu] = irqctx;
+
+       irqctx = (union irq_ctx*) &softirq_stack[cpu*THREAD_SIZE];
+       irqctx->tinfo.task              = NULL;
+       irqctx->tinfo.exec_domain       = NULL;
+       irqctx->tinfo.cpu               = cpu;
+       irqctx->tinfo.preempt_count     = 0;
+       irqctx->tinfo.addr_limit        = MAKE_MM_SEG(0);
+
+       softirq_ctx[cpu] = irqctx;
+
+       printk("CPU %u irqstacks, hard=%p soft=%p\n",
+               cpu,hardirq_ctx[cpu],softirq_ctx[cpu]);
+}
+
+void irq_ctx_exit(int cpu)
+{
+       hardirq_ctx[cpu] = NULL;
+}
+
+extern asmlinkage void __do_softirq(void);
+
+asmlinkage void do_softirq(void)
+{
+       unsigned long flags;
+       struct thread_info *curctx;
+       union irq_ctx *irqctx;
+       u32 *isp;
+
+       if (in_interrupt())
+               return;
+
+       local_irq_save(flags);
+
+       if (local_softirq_pending()) {
+               curctx = current_thread_info();
+               irqctx = softirq_ctx[smp_processor_id()];
+               irqctx->tinfo.task = curctx->task;
+               irqctx->tinfo.previous_esp = current_stack_pointer;
+
+               /* build the stack frame on the softirq stack */
+               isp = (u32*) ((char*)irqctx + sizeof(*irqctx));
+
+               asm volatile(
+                       "       xchgl   %%ebx,%%esp     \n"
+                       "       call    __do_softirq    \n"
+                       "       movl    %%ebx,%%esp     \n"
+                       : "=b"(isp)
+                       : "0"(isp)
+                       : "memory", "cc", "edx", "ecx", "eax"
+               );
+               /*
+                * Shouldnt happen, we returned above if in_interrupt():
+                */
+               WARN_ON_ONCE(softirq_count());
+       }
+
+       local_irq_restore(flags);
+}
+
+EXPORT_SYMBOL(do_softirq);
+#endif
+
+/*
+ * Interrupt statistics:
+ */
+
+atomic_t irq_err_count;
+
+/*
+ * /proc/interrupts printing:
+ */
+
+int show_interrupts(struct seq_file *p, void *v)
+{
+       int i = *(loff_t *) v, j;
+       struct irqaction * action;
+       unsigned long flags;
+
+       if (i == 0) {
+               seq_printf(p, "           ");
+               for_each_online_cpu(j)
+                       seq_printf(p, "CPU%-8d",j);
+               seq_putc(p, '\n');
+       }
+
+       if (i < NR_IRQS) {
+               spin_lock_irqsave(&irq_desc[i].lock, flags);
+               action = irq_desc[i].action;
+               if (!action)
+                       goto skip;
+               seq_printf(p, "%3d: ",i);
+#ifndef CONFIG_SMP
+               seq_printf(p, "%10u ", kstat_irqs(i));
+#else
+               for_each_online_cpu(j)
+                       seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]);
+#endif
+               seq_printf(p, " %8s", irq_desc[i].chip->name);
+               seq_printf(p, "-%-8s", irq_desc[i].name);
+               seq_printf(p, "  %s", action->name);
+
+               for (action=action->next; action; action = action->next)
+                       seq_printf(p, ", %s", action->name);
+
+               seq_putc(p, '\n');
+skip:
+               spin_unlock_irqrestore(&irq_desc[i].lock, flags);
+       } else if (i == NR_IRQS) {
+               seq_printf(p, "NMI: ");
+               for_each_online_cpu(j)
+                       seq_printf(p, "%10u ", nmi_count(j));
+               seq_putc(p, '\n');
+#ifdef CONFIG_X86_LOCAL_APIC
+               seq_printf(p, "LOC: ");
+               for_each_online_cpu(j)
+                       seq_printf(p, "%10u ",
+                               per_cpu(irq_stat,j).apic_timer_irqs);
+               seq_putc(p, '\n');
+#endif
+               seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count));
+#if defined(CONFIG_X86_IO_APIC)
+               seq_printf(p, "MIS: %10u\n", atomic_read(&irq_mis_count));
+#endif
+       }
+       return 0;
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+#include <mach_apic.h>
+
+void fixup_irqs(cpumask_t map)
+{
+       unsigned int irq;
+       static int warned;
+
+       for (irq = 0; irq < NR_IRQS; irq++) {
+               cpumask_t mask;
+               if (irq == 2)
+                       continue;
+
+               cpus_and(mask, irq_desc[irq].affinity, map);
+               if (any_online_cpu(mask) == NR_CPUS) {
+                       printk("Breaking affinity for irq %i\n", irq);
+                       mask = map;
+               }
+               if (irq_desc[irq].chip->set_affinity)
+                       irq_desc[irq].chip->set_affinity(irq, mask);
+               else if (irq_desc[irq].action && !(warned++))
+                       printk("Cannot set affinity for irq %i\n", irq);
+       }
+
+#if 0
+       barrier();
+       /* Ingo Molnar says: "after the IO-APIC masks have been redirected
+          [note the nop - the interrupt-enable boundary on x86 is two
+          instructions from sti] - to flush out pending hardirqs and
+          IPIs. After this point nothing is supposed to reach this CPU." */
+       __asm__ __volatile__("sti; nop; cli");
+       barrier();
+#else
+       /* That doesn't seem sufficient.  Give it 1ms. */
+       local_irq_enable();
+       mdelay(1);
+       local_irq_disable();
+#endif
+}
+#endif
+
diff --git a/arch/x86/kernel/kprobes_32.c b/arch/x86/kernel/kprobes_32.c
new file mode 100644 (file)
index 0000000..448a50b
--- /dev/null
@@ -0,0 +1,751 @@
+/*
+ *  Kernel Probes (KProbes)
+ *  arch/i386/kernel/kprobes.c
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) IBM Corporation, 2002, 2004
+ *
+ * 2002-Oct    Created by Vamsi Krishna S <vamsi_krishna@in.ibm.com> Kernel
+ *             Probes initial implementation ( includes contributions from
+ *             Rusty Russell).
+ * 2004-July   Suparna Bhattacharya <suparna@in.ibm.com> added jumper probes
+ *             interface to access function arguments.
+ * 2005-May    Hien Nguyen <hien@us.ibm.com>, Jim Keniston
+ *             <jkenisto@us.ibm.com> and Prasanna S Panchamukhi
+ *             <prasanna@in.ibm.com> added function-return probes.
+ */
+
+#include <linux/kprobes.h>
+#include <linux/ptrace.h>
+#include <linux/preempt.h>
+#include <linux/kdebug.h>
+#include <asm/cacheflush.h>
+#include <asm/desc.h>
+#include <asm/uaccess.h>
+#include <asm/alternative.h>
+
+void jprobe_return_end(void);
+
+DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL;
+DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk);
+
+/* insert a jmp code */
+static __always_inline void set_jmp_op(void *from, void *to)
+{
+       struct __arch_jmp_op {
+               char op;
+               long raddr;
+       } __attribute__((packed)) *jop;
+       jop = (struct __arch_jmp_op *)from;
+       jop->raddr = (long)(to) - ((long)(from) + 5);
+       jop->op = RELATIVEJUMP_INSTRUCTION;
+}
+
+/*
+ * returns non-zero if opcodes can be boosted.
+ */
+static __always_inline int can_boost(kprobe_opcode_t *opcodes)
+{
+#define W(row,b0,b1,b2,b3,b4,b5,b6,b7,b8,b9,ba,bb,bc,bd,be,bf)               \
+       (((b0##UL << 0x0)|(b1##UL << 0x1)|(b2##UL << 0x2)|(b3##UL << 0x3) |   \
+         (b4##UL << 0x4)|(b5##UL << 0x5)|(b6##UL << 0x6)|(b7##UL << 0x7) |   \
+         (b8##UL << 0x8)|(b9##UL << 0x9)|(ba##UL << 0xa)|(bb##UL << 0xb) |   \
+         (bc##UL << 0xc)|(bd##UL << 0xd)|(be##UL << 0xe)|(bf##UL << 0xf))    \
+        << (row % 32))
+       /*
+        * Undefined/reserved opcodes, conditional jump, Opcode Extension
+        * Groups, and some special opcodes can not be boost.
+        */
+       static const unsigned long twobyte_is_boostable[256 / 32] = {
+               /*      0 1 2 3 4 5 6 7 8 9 a b c d e f         */
+               /*      -------------------------------         */
+               W(0x00, 0,0,1,1,0,0,1,0,1,1,0,0,0,0,0,0)| /* 00 */
+               W(0x10, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0), /* 10 */
+               W(0x20, 1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0)| /* 20 */
+               W(0x30, 0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0), /* 30 */
+               W(0x40, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 40 */
+               W(0x50, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0), /* 50 */
+               W(0x60, 1,1,1,1,1,1,1,1,1,1,1,1,0,0,1,1)| /* 60 */
+               W(0x70, 0,0,0,0,1,1,1,1,0,0,0,0,0,0,1,1), /* 70 */
+               W(0x80, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* 80 */
+               W(0x90, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1), /* 90 */
+               W(0xa0, 1,1,0,1,1,1,0,0,1,1,0,1,1,1,0,1)| /* a0 */
+               W(0xb0, 1,1,1,1,1,1,1,1,0,0,0,1,1,1,1,1), /* b0 */
+               W(0xc0, 1,1,0,0,0,0,0,0,1,1,1,1,1,1,1,1)| /* c0 */
+               W(0xd0, 0,1,1,1,0,1,0,0,1,1,0,1,1,1,0,1), /* d0 */
+               W(0xe0, 0,1,1,0,0,1,0,0,1,1,0,1,1,1,0,1)| /* e0 */
+               W(0xf0, 0,1,1,1,0,1,0,0,1,1,1,0,1,1,1,0)  /* f0 */
+               /*      -------------------------------         */
+               /*      0 1 2 3 4 5 6 7 8 9 a b c d e f         */
+       };
+#undef W
+       kprobe_opcode_t opcode;
+       kprobe_opcode_t *orig_opcodes = opcodes;
+retry:
+       if (opcodes - orig_opcodes > MAX_INSN_SIZE - 1)
+               return 0;
+       opcode = *(opcodes++);
+
+       /* 2nd-byte opcode */
+       if (opcode == 0x0f) {
+               if (opcodes - orig_opcodes > MAX_INSN_SIZE - 1)
+                       return 0;
+               return test_bit(*opcodes, twobyte_is_boostable);
+       }
+
+       switch (opcode & 0xf0) {
+       case 0x60:
+               if (0x63 < opcode && opcode < 0x67)
+                       goto retry; /* prefixes */
+               /* can't boost Address-size override and bound */
+               return (opcode != 0x62 && opcode != 0x67);
+       case 0x70:
+               return 0; /* can't boost conditional jump */
+       case 0xc0:
+               /* can't boost software-interruptions */
+               return (0xc1 < opcode && opcode < 0xcc) || opcode == 0xcf;
+       case 0xd0:
+               /* can boost AA* and XLAT */
+               return (opcode == 0xd4 || opcode == 0xd5 || opcode == 0xd7);
+       case 0xe0:
+               /* can boost in/out and absolute jmps */
+               return ((opcode & 0x04) || opcode == 0xea);
+       case 0xf0:
+               if ((opcode & 0x0c) == 0 && opcode != 0xf1)
+                       goto retry; /* lock/rep(ne) prefix */
+               /* clear and set flags can be boost */
+               return (opcode == 0xf5 || (0xf7 < opcode && opcode < 0xfe));
+       default:
+               if (opcode == 0x26 || opcode == 0x36 || opcode == 0x3e)
+                       goto retry; /* prefixes */
+               /* can't boost CS override and call */
+               return (opcode != 0x2e && opcode != 0x9a);
+       }
+}
+
+/*
+ * returns non-zero if opcode modifies the interrupt flag.
+ */
+static int __kprobes is_IF_modifier(kprobe_opcode_t opcode)
+{
+       switch (opcode) {
+       case 0xfa:              /* cli */
+       case 0xfb:              /* sti */
+       case 0xcf:              /* iret/iretd */
+       case 0x9d:              /* popf/popfd */
+               return 1;
+       }
+       return 0;
+}
+
+int __kprobes arch_prepare_kprobe(struct kprobe *p)
+{
+       /* insn: must be on special executable page on i386. */
+       p->ainsn.insn = get_insn_slot();
+       if (!p->ainsn.insn)
+               return -ENOMEM;
+
+       memcpy(p->ainsn.insn, p->addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t));
+       p->opcode = *p->addr;
+       if (can_boost(p->addr)) {
+               p->ainsn.boostable = 0;
+       } else {
+               p->ainsn.boostable = -1;
+       }
+       return 0;
+}
+
+void __kprobes arch_arm_kprobe(struct kprobe *p)
+{
+       text_poke(p->addr, ((unsigned char []){BREAKPOINT_INSTRUCTION}), 1);
+}
+
+void __kprobes arch_disarm_kprobe(struct kprobe *p)
+{
+       text_poke(p->addr, &p->opcode, 1);
+}
+
+void __kprobes arch_remove_kprobe(struct kprobe *p)
+{
+       mutex_lock(&kprobe_mutex);
+       free_insn_slot(p->ainsn.insn, (p->ainsn.boostable == 1));
+       mutex_unlock(&kprobe_mutex);
+}
+
+static void __kprobes save_previous_kprobe(struct kprobe_ctlblk *kcb)
+{
+       kcb->prev_kprobe.kp = kprobe_running();
+       kcb->prev_kprobe.status = kcb->kprobe_status;
+       kcb->prev_kprobe.old_eflags = kcb->kprobe_old_eflags;
+       kcb->prev_kprobe.saved_eflags = kcb->kprobe_saved_eflags;
+}
+
+static void __kprobes restore_previous_kprobe(struct kprobe_ctlblk *kcb)
+{
+       __get_cpu_var(current_kprobe) = kcb->prev_kprobe.kp;
+       kcb->kprobe_status = kcb->prev_kprobe.status;
+       kcb->kprobe_old_eflags = kcb->prev_kprobe.old_eflags;
+       kcb->kprobe_saved_eflags = kcb->prev_kprobe.saved_eflags;
+}
+
+static void __kprobes set_current_kprobe(struct kprobe *p, struct pt_regs *regs,
+                               struct kprobe_ctlblk *kcb)
+{
+       __get_cpu_var(current_kprobe) = p;
+       kcb->kprobe_saved_eflags = kcb->kprobe_old_eflags
+               = (regs->eflags & (TF_MASK | IF_MASK));
+       if (is_IF_modifier(p->opcode))
+               kcb->kprobe_saved_eflags &= ~IF_MASK;
+}
+
+static void __kprobes prepare_singlestep(struct kprobe *p, struct pt_regs *regs)
+{
+       regs->eflags |= TF_MASK;
+       regs->eflags &= ~IF_MASK;
+       /*single step inline if the instruction is an int3*/
+       if (p->opcode == BREAKPOINT_INSTRUCTION)
+               regs->eip = (unsigned long)p->addr;
+       else
+               regs->eip = (unsigned long)p->ainsn.insn;
+}
+
+/* Called with kretprobe_lock held */
+void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri,
+                                     struct pt_regs *regs)
+{
+       unsigned long *sara = (unsigned long *)&regs->esp;
+
+       ri->ret_addr = (kprobe_opcode_t *) *sara;
+
+       /* Replace the return addr with trampoline addr */
+       *sara = (unsigned long) &kretprobe_trampoline;
+}
+
+/*
+ * Interrupts are disabled on entry as trap3 is an interrupt gate and they
+ * remain disabled thorough out this function.
+ */
+static int __kprobes kprobe_handler(struct pt_regs *regs)
+{
+       struct kprobe *p;
+       int ret = 0;
+       kprobe_opcode_t *addr;
+       struct kprobe_ctlblk *kcb;
+
+       addr = (kprobe_opcode_t *)(regs->eip - sizeof(kprobe_opcode_t));
+
+       /*
+        * We don't want to be preempted for the entire
+        * duration of kprobe processing
+        */
+       preempt_disable();
+       kcb = get_kprobe_ctlblk();
+
+       /* Check we're not actually recursing */
+       if (kprobe_running()) {
+               p = get_kprobe(addr);
+               if (p) {
+                       if (kcb->kprobe_status == KPROBE_HIT_SS &&
+                               *p->ainsn.insn == BREAKPOINT_INSTRUCTION) {
+                               regs->eflags &= ~TF_MASK;
+                               regs->eflags |= kcb->kprobe_saved_eflags;
+                               goto no_kprobe;
+                       }
+                       /* We have reentered the kprobe_handler(), since
+                        * another probe was hit while within the handler.
+                        * We here save the original kprobes variables and
+                        * just single step on the instruction of the new probe
+                        * without calling any user handlers.
+                        */
+                       save_previous_kprobe(kcb);
+                       set_current_kprobe(p, regs, kcb);
+                       kprobes_inc_nmissed_count(p);
+                       prepare_singlestep(p, regs);
+                       kcb->kprobe_status = KPROBE_REENTER;
+                       return 1;
+               } else {
+                       if (*addr != BREAKPOINT_INSTRUCTION) {
+                       /* The breakpoint instruction was removed by
+                        * another cpu right after we hit, no further
+                        * handling of this interrupt is appropriate
+                        */
+                               regs->eip -= sizeof(kprobe_opcode_t);
+                               ret = 1;
+                               goto no_kprobe;
+                       }
+                       p = __get_cpu_var(current_kprobe);
+                       if (p->break_handler && p->break_handler(p, regs)) {
+                               goto ss_probe;
+                       }
+               }
+               goto no_kprobe;
+       }
+
+       p = get_kprobe(addr);
+       if (!p) {
+               if (*addr != BREAKPOINT_INSTRUCTION) {
+                       /*
+                        * The breakpoint instruction was removed right
+                        * after we hit it.  Another cpu has removed
+                        * either a probepoint or a debugger breakpoint
+                        * at this address.  In either case, no further
+                        * handling of this interrupt is appropriate.
+                        * Back up over the (now missing) int3 and run
+                        * the original instruction.
+                        */
+                       regs->eip -= sizeof(kprobe_opcode_t);
+                       ret = 1;
+               }
+               /* Not one of ours: let kernel handle it */
+               goto no_kprobe;
+       }
+
+       set_current_kprobe(p, regs, kcb);
+       kcb->kprobe_status = KPROBE_HIT_ACTIVE;
+
+       if (p->pre_handler && p->pre_handler(p, regs))
+               /* handler has already set things up, so skip ss setup */
+               return 1;
+
+ss_probe:
+#if !defined(CONFIG_PREEMPT) || defined(CONFIG_PM)
+       if (p->ainsn.boostable == 1 && !p->post_handler){
+               /* Boost up -- we can execute copied instructions directly */
+               reset_current_kprobe();
+               regs->eip = (unsigned long)p->ainsn.insn;
+               preempt_enable_no_resched();
+               return 1;
+       }
+#endif
+       prepare_singlestep(p, regs);
+       kcb->kprobe_status = KPROBE_HIT_SS;
+       return 1;
+
+no_kprobe:
+       preempt_enable_no_resched();
+       return ret;
+}
+
+/*
+ * For function-return probes, init_kprobes() establishes a probepoint
+ * here. When a retprobed function returns, this probe is hit and
+ * trampoline_probe_handler() runs, calling the kretprobe's handler.
+ */
+ void __kprobes kretprobe_trampoline_holder(void)
+ {
+       asm volatile ( ".global kretprobe_trampoline\n"
+                       "kretprobe_trampoline: \n"
+                       "       pushf\n"
+                       /* skip cs, eip, orig_eax */
+                       "       subl $12, %esp\n"
+                       "       pushl %fs\n"
+                       "       pushl %ds\n"
+                       "       pushl %es\n"
+                       "       pushl %eax\n"
+                       "       pushl %ebp\n"
+                       "       pushl %edi\n"
+                       "       pushl %esi\n"
+                       "       pushl %edx\n"
+                       "       pushl %ecx\n"
+                       "       pushl %ebx\n"
+                       "       movl %esp, %eax\n"
+                       "       call trampoline_handler\n"
+                       /* move eflags to cs */
+                       "       movl 52(%esp), %edx\n"
+                       "       movl %edx, 48(%esp)\n"
+                       /* save true return address on eflags */
+                       "       movl %eax, 52(%esp)\n"
+                       "       popl %ebx\n"
+                       "       popl %ecx\n"
+                       "       popl %edx\n"
+                       "       popl %esi\n"
+                       "       popl %edi\n"
+                       "       popl %ebp\n"
+                       "       popl %eax\n"
+                       /* skip eip, orig_eax, es, ds, fs */
+                       "       addl $20, %esp\n"
+                       "       popf\n"
+                       "       ret\n");
+}
+
+/*
+ * Called from kretprobe_trampoline
+ */
+fastcall void *__kprobes trampoline_handler(struct pt_regs *regs)
+{
+       struct kretprobe_instance *ri = NULL;
+       struct hlist_head *head, empty_rp;
+       struct hlist_node *node, *tmp;
+       unsigned long flags, orig_ret_address = 0;
+       unsigned long trampoline_address =(unsigned long)&kretprobe_trampoline;
+
+       INIT_HLIST_HEAD(&empty_rp);
+       spin_lock_irqsave(&kretprobe_lock, flags);
+       head = kretprobe_inst_table_head(current);
+       /* fixup registers */
+       regs->xcs = __KERNEL_CS | get_kernel_rpl();
+       regs->eip = trampoline_address;
+       regs->orig_eax = 0xffffffff;
+
+       /*
+        * It is possible to have multiple instances associated with a given
+        * task either because an multiple functions in the call path
+        * have a return probe installed on them, and/or more then one return
+        * return probe was registered for a target function.
+        *
+        * We can handle this because:
+        *     - instances are always inserted at the head of the list
+        *     - when multiple return probes are registered for the same
+        *       function, the first instance's ret_addr will point to the
+        *       real return address, and all the rest will point to
+        *       kretprobe_trampoline
+        */
+       hlist_for_each_entry_safe(ri, node, tmp, head, hlist) {
+               if (ri->task != current)
+                       /* another task is sharing our hash bucket */
+                       continue;
+
+               if (ri->rp && ri->rp->handler){
+                       __get_cpu_var(current_kprobe) = &ri->rp->kp;
+                       get_kprobe_ctlblk()->kprobe_status = KPROBE_HIT_ACTIVE;
+                       ri->rp->handler(ri, regs);
+                       __get_cpu_var(current_kprobe) = NULL;
+               }
+
+               orig_ret_address = (unsigned long)ri->ret_addr;
+               recycle_rp_inst(ri, &empty_rp);
+
+               if (orig_ret_address != trampoline_address)
+                       /*
+                        * This is the real return address. Any other
+                        * instances associated with this task are for
+                        * other calls deeper on the call stack
+                        */
+                       break;
+       }
+
+       kretprobe_assert(ri, orig_ret_address, trampoline_address);
+       spin_unlock_irqrestore(&kretprobe_lock, flags);
+
+       hlist_for_each_entry_safe(ri, node, tmp, &empty_rp, hlist) {
+               hlist_del(&ri->hlist);
+               kfree(ri);
+       }
+       return (void*)orig_ret_address;
+}
+
+/*
+ * Called after single-stepping.  p->addr is the address of the
+ * instruction whose first byte has been replaced by the "int 3"
+ * instruction.  To avoid the SMP problems that can occur when we
+ * temporarily put back the original opcode to single-step, we
+ * single-stepped a copy of the instruction.  The address of this
+ * copy is p->ainsn.insn.
+ *
+ * This function prepares to return from the post-single-step
+ * interrupt.  We have to fix up the stack as follows:
+ *
+ * 0) Except in the case of absolute or indirect jump or call instructions,
+ * the new eip is relative to the copied instruction.  We need to make
+ * it relative to the original instruction.
+ *
+ * 1) If the single-stepped instruction was pushfl, then the TF and IF
+ * flags are set in the just-pushed eflags, and may need to be cleared.
+ *
+ * 2) If the single-stepped instruction was a call, the return address
+ * that is atop the stack is the address following the copied instruction.
+ * We need to make it the address following the original instruction.
+ *
+ * This function also checks instruction size for preparing direct execution.
+ */
+static void __kprobes resume_execution(struct kprobe *p,
+               struct pt_regs *regs, struct kprobe_ctlblk *kcb)
+{
+       unsigned long *tos = (unsigned long *)&regs->esp;
+       unsigned long copy_eip = (unsigned long)p->ainsn.insn;
+       unsigned long orig_eip = (unsigned long)p->addr;
+
+       regs->eflags &= ~TF_MASK;
+       switch (p->ainsn.insn[0]) {
+       case 0x9c:              /* pushfl */
+               *tos &= ~(TF_MASK | IF_MASK);
+               *tos |= kcb->kprobe_old_eflags;
+               break;
+       case 0xc2:              /* iret/ret/lret */
+       case 0xc3:
+       case 0xca:
+       case 0xcb:
+       case 0xcf:
+       case 0xea:              /* jmp absolute -- eip is correct */
+               /* eip is already adjusted, no more changes required */
+               p->ainsn.boostable = 1;
+               goto no_change;
+       case 0xe8:              /* call relative - Fix return addr */
+               *tos = orig_eip + (*tos - copy_eip);
+               break;
+       case 0x9a:              /* call absolute -- same as call absolute, indirect */
+               *tos = orig_eip + (*tos - copy_eip);
+               goto no_change;
+       case 0xff:
+               if ((p->ainsn.insn[1] & 0x30) == 0x10) {
+                       /*
+                        * call absolute, indirect
+                        * Fix return addr; eip is correct.
+                        * But this is not boostable
+                        */
+                       *tos = orig_eip + (*tos - copy_eip);
+                       goto no_change;
+               } else if (((p->ainsn.insn[1] & 0x31) == 0x20) ||       /* jmp near, absolute indirect */
+                          ((p->ainsn.insn[1] & 0x31) == 0x21)) {       /* jmp far, absolute indirect */
+                       /* eip is correct. And this is boostable */
+                       p->ainsn.boostable = 1;
+                       goto no_change;
+               }
+       default:
+               break;
+       }
+
+       if (p->ainsn.boostable == 0) {
+               if ((regs->eip > copy_eip) &&
+                   (regs->eip - copy_eip) + 5 < MAX_INSN_SIZE) {
+                       /*
+                        * These instructions can be executed directly if it
+                        * jumps back to correct address.
+                        */
+                       set_jmp_op((void *)regs->eip,
+                                  (void *)orig_eip + (regs->eip - copy_eip));
+                       p->ainsn.boostable = 1;
+               } else {
+                       p->ainsn.boostable = -1;
+               }
+       }
+
+       regs->eip = orig_eip + (regs->eip - copy_eip);
+
+no_change:
+       return;
+}
+
+/*
+ * Interrupts are disabled on entry as trap1 is an interrupt gate and they
+ * remain disabled thoroughout this function.
+ */
+static int __kprobes post_kprobe_handler(struct pt_regs *regs)
+{
+       struct kprobe *cur = kprobe_running();
+       struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
+
+       if (!cur)
+               return 0;
+
+       if ((kcb->kprobe_status != KPROBE_REENTER) && cur->post_handler) {
+               kcb->kprobe_status = KPROBE_HIT_SSDONE;
+               cur->post_handler(cur, regs, 0);
+       }
+
+       resume_execution(cur, regs, kcb);
+       regs->eflags |= kcb->kprobe_saved_eflags;
+
+       /*Restore back the original saved kprobes variables and continue. */
+       if (kcb->kprobe_status == KPROBE_REENTER) {
+               restore_previous_kprobe(kcb);
+               goto out;
+       }
+       reset_current_kprobe();
+out:
+       preempt_enable_no_resched();
+
+       /*
+        * if somebody else is singlestepping across a probe point, eflags
+        * will have TF set, in which case, continue the remaining processing
+        * of do_debug, as if this is not a probe hit.
+        */
+       if (regs->eflags & TF_MASK)
+               return 0;
+
+       return 1;
+}
+
+static int __kprobes kprobe_fault_handler(struct pt_regs *regs, int trapnr)
+{
+       struct kprobe *cur = kprobe_running();
+       struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
+
+       switch(kcb->kprobe_status) {
+       case KPROBE_HIT_SS:
+       case KPROBE_REENTER:
+               /*
+                * We are here because the instruction being single
+                * stepped caused a page fault. We reset the current
+                * kprobe and the eip points back to the probe address
+                * and allow the page fault handler to continue as a
+                * normal page fault.
+                */
+               regs->eip = (unsigned long)cur->addr;
+               regs->eflags |= kcb->kprobe_old_eflags;
+               if (kcb->kprobe_status == KPROBE_REENTER)
+                       restore_previous_kprobe(kcb);
+               else
+                       reset_current_kprobe();
+               preempt_enable_no_resched();
+               break;
+       case KPROBE_HIT_ACTIVE:
+       case KPROBE_HIT_SSDONE:
+               /*
+                * We increment the nmissed count for accounting,
+                * we can also use npre/npostfault count for accouting
+                * these specific fault cases.
+                */
+               kprobes_inc_nmissed_count(cur);
+
+               /*
+                * We come here because instructions in the pre/post
+                * handler caused the page_fault, this could happen
+                * if handler tries to access user space by
+                * copy_from_user(), get_user() etc. Let the
+                * user-specified handler try to fix it first.
+                */
+               if (cur->fault_handler && cur->fault_handler(cur, regs, trapnr))
+                       return 1;
+
+               /*
+                * In case the user-specified fault handler returned
+                * zero, try to fix up.
+                */
+               if (fixup_exception(regs))
+                       return 1;
+
+               /*
+                * fixup_exception() could not handle it,
+                * Let do_page_fault() fix it.
+                */
+               break;
+       default:
+               break;
+       }
+       return 0;
+}
+
+/*
+ * Wrapper routine to for handling exceptions.
+ */
+int __kprobes kprobe_exceptions_notify(struct notifier_block *self,
+                                      unsigned long val, void *data)
+{
+       struct die_args *args = (struct die_args *)data;
+       int ret = NOTIFY_DONE;
+
+       if (args->regs && user_mode_vm(args->regs))
+               return ret;
+
+       switch (val) {
+       case DIE_INT3:
+               if (kprobe_handler(args->regs))
+                       ret = NOTIFY_STOP;
+               break;
+       case DIE_DEBUG:
+               if (post_kprobe_handler(args->regs))
+                       ret = NOTIFY_STOP;
+               break;
+       case DIE_GPF:
+       case DIE_PAGE_FAULT:
+               /* kprobe_running() needs smp_processor_id() */
+               preempt_disable();
+               if (kprobe_running() &&
+                   kprobe_fault_handler(args->regs, args->trapnr))
+                       ret = NOTIFY_STOP;
+               preempt_enable();
+               break;
+       default:
+               break;
+       }
+       return ret;
+}
+
+int __kprobes setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs)
+{
+       struct jprobe *jp = container_of(p, struct jprobe, kp);
+       unsigned long addr;
+       struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
+
+       kcb->jprobe_saved_regs = *regs;
+       kcb->jprobe_saved_esp = &regs->esp;
+       addr = (unsigned long)(kcb->jprobe_saved_esp);
+
+       /*
+        * TBD: As Linus pointed out, gcc assumes that the callee
+        * owns the argument space and could overwrite it, e.g.
+        * tailcall optimization. So, to be absolutely safe
+        * we also save and restore enough stack bytes to cover
+        * the argument area.
+        */
+       memcpy(kcb->jprobes_stack, (kprobe_opcode_t *)addr,
+                       MIN_STACK_SIZE(addr));
+       regs->eflags &= ~IF_MASK;
+       regs->eip = (unsigned long)(jp->entry);
+       return 1;
+}
+
+void __kprobes jprobe_return(void)
+{
+       struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
+
+       asm volatile ("       xchgl   %%ebx,%%esp     \n"
+                     "       int3                      \n"
+                     "       .globl jprobe_return_end  \n"
+                     "       jprobe_return_end:        \n"
+                     "       nop                       \n"::"b"
+                     (kcb->jprobe_saved_esp):"memory");
+}
+
+int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs)
+{
+       struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
+       u8 *addr = (u8 *) (regs->eip - 1);
+       unsigned long stack_addr = (unsigned long)(kcb->jprobe_saved_esp);
+       struct jprobe *jp = container_of(p, struct jprobe, kp);
+
+       if ((addr > (u8 *) jprobe_return) && (addr < (u8 *) jprobe_return_end)) {
+               if (&regs->esp != kcb->jprobe_saved_esp) {
+                       struct pt_regs *saved_regs =
+                           container_of(kcb->jprobe_saved_esp,
+                                           struct pt_regs, esp);
+                       printk("current esp %p does not match saved esp %p\n",
+                              &regs->esp, kcb->jprobe_saved_esp);
+                       printk("Saved registers for jprobe %p\n", jp);
+                       show_registers(saved_regs);
+                       printk("Current registers\n");
+                       show_registers(regs);
+                       BUG();
+               }
+               *regs = kcb->jprobe_saved_regs;
+               memcpy((kprobe_opcode_t *) stack_addr, kcb->jprobes_stack,
+                      MIN_STACK_SIZE(stack_addr));
+               preempt_enable_no_resched();
+               return 1;
+       }
+       return 0;
+}
+
+int __kprobes arch_trampoline_kprobe(struct kprobe *p)
+{
+       return 0;
+}
+
+int __init arch_init_kprobes(void)
+{
+       return 0;
+}
diff --git a/arch/x86/kernel/ldt_32.c b/arch/x86/kernel/ldt_32.c
new file mode 100644 (file)
index 0000000..e0b2d17
--- /dev/null
@@ -0,0 +1,250 @@
+/*
+ * linux/arch/i386/kernel/ldt.c
+ *
+ * Copyright (C) 1992 Krishna Balasubramanian and Linus Torvalds
+ * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
+ */
+
+#include <linux/errno.h>
+#include <linux/sched.h>
+#include <linux/string.h>
+#include <linux/mm.h>
+#include <linux/smp.h>
+#include <linux/vmalloc.h>
+#include <linux/slab.h>
+
+#include <asm/uaccess.h>
+#include <asm/system.h>
+#include <asm/ldt.h>
+#include <asm/desc.h>
+#include <asm/mmu_context.h>
+
+#ifdef CONFIG_SMP /* avoids "defined but not used" warnig */
+static void flush_ldt(void *null)
+{
+       if (current->active_mm)
+               load_LDT(&current->active_mm->context);
+}
+#endif
+
+static int alloc_ldt(mm_context_t *pc, int mincount, int reload)
+{
+       void *oldldt;
+       void *newldt;
+       int oldsize;
+
+       if (mincount <= pc->size)
+               return 0;
+       oldsize = pc->size;
+       mincount = (mincount+511)&(~511);
+       if (mincount*LDT_ENTRY_SIZE > PAGE_SIZE)
+               newldt = vmalloc(mincount*LDT_ENTRY_SIZE);
+       else
+               newldt = kmalloc(mincount*LDT_ENTRY_SIZE, GFP_KERNEL);
+
+       if (!newldt)
+               return -ENOMEM;
+
+       if (oldsize)
+               memcpy(newldt, pc->ldt, oldsize*LDT_ENTRY_SIZE);
+       oldldt = pc->ldt;
+       memset(newldt+oldsize*LDT_ENTRY_SIZE, 0, (mincount-oldsize)*LDT_ENTRY_SIZE);
+       pc->ldt = newldt;
+       wmb();
+       pc->size = mincount;
+       wmb();
+
+       if (reload) {
+#ifdef CONFIG_SMP
+               cpumask_t mask;
+               preempt_disable();
+               load_LDT(pc);
+               mask = cpumask_of_cpu(smp_processor_id());
+               if (!cpus_equal(current->mm->cpu_vm_mask, mask))
+                       smp_call_function(flush_ldt, NULL, 1, 1);
+               preempt_enable();
+#else
+               load_LDT(pc);
+#endif
+       }
+       if (oldsize) {
+               if (oldsize*LDT_ENTRY_SIZE > PAGE_SIZE)
+                       vfree(oldldt);
+               else
+                       kfree(oldldt);
+       }
+       return 0;
+}
+
+static inline int copy_ldt(mm_context_t *new, mm_context_t *old)
+{
+       int err = alloc_ldt(new, old->size, 0);
+       if (err < 0)
+               return err;
+       memcpy(new->ldt, old->ldt, old->size*LDT_ENTRY_SIZE);
+       return 0;
+}
+
+/*
+ * we do not have to muck with descriptors here, that is
+ * done in switch_mm() as needed.
+ */
+int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
+{
+       struct mm_struct * old_mm;
+       int retval = 0;
+
+       init_MUTEX(&mm->context.sem);
+       mm->context.size = 0;
+       old_mm = current->mm;
+       if (old_mm && old_mm->context.size > 0) {
+               down(&old_mm->context.sem);
+               retval = copy_ldt(&mm->context, &old_mm->context);
+               up(&old_mm->context.sem);
+       }
+       return retval;
+}
+
+/*
+ * No need to lock the MM as we are the last user
+ */
+void destroy_context(struct mm_struct *mm)
+{
+       if (mm->context.size) {
+               if (mm == current->active_mm)
+                       clear_LDT();
+               if (mm->context.size*LDT_ENTRY_SIZE > PAGE_SIZE)
+                       vfree(mm->context.ldt);
+               else
+                       kfree(mm->context.ldt);
+               mm->context.size = 0;
+       }
+}
+
+static int read_ldt(void __user * ptr, unsigned long bytecount)
+{
+       int err;
+       unsigned long size;
+       struct mm_struct * mm = current->mm;
+
+       if (!mm->context.size)
+               return 0;
+       if (bytecount > LDT_ENTRY_SIZE*LDT_ENTRIES)
+               bytecount = LDT_ENTRY_SIZE*LDT_ENTRIES;
+
+       down(&mm->context.sem);
+       size = mm->context.size*LDT_ENTRY_SIZE;
+       if (size > bytecount)
+               size = bytecount;
+
+       err = 0;
+       if (copy_to_user(ptr, mm->context.ldt, size))
+               err = -EFAULT;
+       up(&mm->context.sem);
+       if (err < 0)
+               goto error_return;
+       if (size != bytecount) {
+               /* zero-fill the rest */
+               if (clear_user(ptr+size, bytecount-size) != 0) {
+                       err = -EFAULT;
+                       goto error_return;
+               }
+       }
+       return bytecount;
+error_return:
+       return err;
+}
+
+static int read_default_ldt(void __user * ptr, unsigned long bytecount)
+{
+       int err;
+       unsigned long size;
+
+       err = 0;
+       size = 5*sizeof(struct desc_struct);
+       if (size > bytecount)
+               size = bytecount;
+
+       err = size;
+       if (clear_user(ptr, size))
+               err = -EFAULT;
+
+       return err;
+}
+
+static int write_ldt(void __user * ptr, unsigned long bytecount, int oldmode)
+{
+       struct mm_struct * mm = current->mm;
+       __u32 entry_1, entry_2;
+       int error;
+       struct user_desc ldt_info;
+
+       error = -EINVAL;
+       if (bytecount != sizeof(ldt_info))
+               goto out;
+       error = -EFAULT;        
+       if (copy_from_user(&ldt_info, ptr, sizeof(ldt_info)))
+               goto out;
+
+       error = -EINVAL;
+       if (ldt_info.entry_number >= LDT_ENTRIES)
+               goto out;
+       if (ldt_info.contents == 3) {
+               if (oldmode)
+                       goto out;
+               if (ldt_info.seg_not_present == 0)
+                       goto out;
+       }
+
+       down(&mm->context.sem);
+       if (ldt_info.entry_number >= mm->context.size) {
+               error = alloc_ldt(&current->mm->context, ldt_info.entry_number+1, 1);
+               if (error < 0)
+                       goto out_unlock;
+       }
+
+       /* Allow LDTs to be cleared by the user. */
+       if (ldt_info.base_addr == 0 && ldt_info.limit == 0) {
+               if (oldmode || LDT_empty(&ldt_info)) {
+                       entry_1 = 0;
+                       entry_2 = 0;
+                       goto install;
+               }
+       }
+
+       entry_1 = LDT_entry_a(&ldt_info);
+       entry_2 = LDT_entry_b(&ldt_info);
+       if (oldmode)
+               entry_2 &= ~(1 << 20);
+
+       /* Install the new entry ...  */
+install:
+       write_ldt_entry(mm->context.ldt, ldt_info.entry_number, entry_1, entry_2);
+       error = 0;
+
+out_unlock:
+       up(&mm->context.sem);
+out:
+       return error;
+}
+
+asmlinkage int sys_modify_ldt(int func, void __user *ptr, unsigned long bytecount)
+{
+       int ret = -ENOSYS;
+
+       switch (func) {
+       case 0:
+               ret = read_ldt(ptr, bytecount);
+               break;
+       case 1:
+               ret = write_ldt(ptr, bytecount, 1);
+               break;
+       case 2:
+               ret = read_default_ldt(ptr, bytecount);
+               break;
+       case 0x11:
+               ret = write_ldt(ptr, bytecount, 0);
+               break;
+       }
+       return ret;
+}
diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c
new file mode 100644 (file)
index 0000000..91966ba
--- /dev/null
@@ -0,0 +1,171 @@
+/*
+ * machine_kexec.c - handle transition of Linux booting another kernel
+ * Copyright (C) 2002-2005 Eric Biederman  <ebiederm@xmission.com>
+ *
+ * This source code is licensed under the GNU General Public License,
+ * Version 2.  See the file COPYING for more details.
+ */
+
+#include <linux/mm.h>
+#include <linux/kexec.h>
+#include <linux/delay.h>
+#include <linux/init.h>
+#include <asm/pgtable.h>
+#include <asm/pgalloc.h>
+#include <asm/tlbflush.h>
+#include <asm/mmu_context.h>
+#include <asm/io.h>
+#include <asm/apic.h>
+#include <asm/cpufeature.h>
+#include <asm/desc.h>
+#include <asm/system.h>
+
+#define PAGE_ALIGNED __attribute__ ((__aligned__(PAGE_SIZE)))
+static u32 kexec_pgd[1024] PAGE_ALIGNED;
+#ifdef CONFIG_X86_PAE
+static u32 kexec_pmd0[1024] PAGE_ALIGNED;
+static u32 kexec_pmd1[1024] PAGE_ALIGNED;
+#endif
+static u32 kexec_pte0[1024] PAGE_ALIGNED;
+static u32 kexec_pte1[1024] PAGE_ALIGNED;
+
+static void set_idt(void *newidt, __u16 limit)
+{
+       struct Xgt_desc_struct curidt;
+
+       /* ia32 supports unaliged loads & stores */
+       curidt.size    = limit;
+       curidt.address = (unsigned long)newidt;
+
+       load_idt(&curidt);
+};
+
+
+static void set_gdt(void *newgdt, __u16 limit)
+{
+       struct Xgt_desc_struct curgdt;
+
+       /* ia32 supports unaligned loads & stores */
+       curgdt.size    = limit;
+       curgdt.address = (unsigned long)newgdt;
+
+       load_gdt(&curgdt);
+};
+
+static void load_segments(void)
+{
+#define __STR(X) #X
+#define STR(X) __STR(X)
+
+       __asm__ __volatile__ (
+               "\tljmp $"STR(__KERNEL_CS)",$1f\n"
+               "\t1:\n"
+               "\tmovl $"STR(__KERNEL_DS)",%%eax\n"
+               "\tmovl %%eax,%%ds\n"
+               "\tmovl %%eax,%%es\n"
+               "\tmovl %%eax,%%fs\n"
+               "\tmovl %%eax,%%gs\n"
+               "\tmovl %%eax,%%ss\n"
+               ::: "eax", "memory");
+#undef STR
+#undef __STR
+}
+
+/*
+ * A architecture hook called to validate the
+ * proposed image and prepare the control pages
+ * as needed.  The pages for KEXEC_CONTROL_CODE_SIZE
+ * have been allocated, but the segments have yet
+ * been copied into the kernel.
+ *
+ * Do what every setup is needed on image and the
+ * reboot code buffer to allow us to avoid allocations
+ * later.
+ *
+ * Currently nothing.
+ */
+int machine_kexec_prepare(struct kimage *image)
+{
+       return 0;
+}
+
+/*
+ * Undo anything leftover by machine_kexec_prepare
+ * when an image is freed.
+ */
+void machine_kexec_cleanup(struct kimage *image)
+{
+}
+
+/*
+ * Do not allocate memory (or fail in any way) in machine_kexec().
+ * We are past the point of no return, committed to rebooting now.
+ */
+NORET_TYPE void machine_kexec(struct kimage *image)
+{
+       unsigned long page_list[PAGES_NR];
+       void *control_page;
+
+       /* Interrupts aren't acceptable while we reboot */
+       local_irq_disable();
+
+       control_page = page_address(image->control_code_page);
+       memcpy(control_page, relocate_kernel, PAGE_SIZE);
+
+       page_list[PA_CONTROL_PAGE] = __pa(control_page);
+       page_list[VA_CONTROL_PAGE] = (unsigned long)relocate_kernel;
+       page_list[PA_PGD] = __pa(kexec_pgd);
+       page_list[VA_PGD] = (unsigned long)kexec_pgd;
+#ifdef CONFIG_X86_PAE
+       page_list[PA_PMD_0] = __pa(kexec_pmd0);
+       page_list[VA_PMD_0] = (unsigned long)kexec_pmd0;
+       page_list[PA_PMD_1] = __pa(kexec_pmd1);
+       page_list[VA_PMD_1] = (unsigned long)kexec_pmd1;
+#endif
+       page_list[PA_PTE_0] = __pa(kexec_pte0);
+       page_list[VA_PTE_0] = (unsigned long)kexec_pte0;
+       page_list[PA_PTE_1] = __pa(kexec_pte1);
+       page_list[VA_PTE_1] = (unsigned long)kexec_pte1;
+
+       /* The segment registers are funny things, they have both a
+        * visible and an invisible part.  Whenever the visible part is
+        * set to a specific selector, the invisible part is loaded
+        * with from a table in memory.  At no other time is the
+        * descriptor table in memory accessed.
+        *
+        * I take advantage of this here by force loading the
+        * segments, before I zap the gdt with an invalid value.
+        */
+       load_segments();
+       /* The gdt & idt are now invalid.
+        * If you want to load them you must set up your own idt & gdt.
+        */
+       set_gdt(phys_to_virt(0),0);
+       set_idt(phys_to_virt(0),0);
+
+       /* now call it */
+       relocate_kernel((unsigned long)image->head, (unsigned long)page_list,
+                       image->start, cpu_has_pae);
+}
+
+/* crashkernel=size@addr specifies the location to reserve for
+ * a crash kernel.  By reserving this memory we guarantee
+ * that linux never sets it up as a DMA target.
+ * Useful for holding code to do something appropriate
+ * after a kernel panic.
+ */
+static int __init parse_crashkernel(char *arg)
+{
+       unsigned long size, base;
+       size = memparse(arg, &arg);
+       if (*arg == '@') {
+               base = memparse(arg+1, &arg);
+               /* FIXME: Do I want a sanity check
+                * to validate the memory range?
+                */
+               crashk_res.start = base;
+               crashk_res.end   = base + size - 1;
+       }
+       return 0;
+}
+early_param("crashkernel", parse_crashkernel);
diff --git a/arch/x86/kernel/mca_32.c b/arch/x86/kernel/mca_32.c
new file mode 100644 (file)
index 0000000..b83672b
--- /dev/null
@@ -0,0 +1,470 @@
+/*
+ *  linux/arch/i386/kernel/mca.c
+ *  Written by Martin Kolinek, February 1996
+ *
+ * Changes:
+ *
+ *     Chris Beauregard July 28th, 1996
+ *     - Fixed up integrated SCSI detection
+ *
+ *     Chris Beauregard August 3rd, 1996
+ *     - Made mca_info local
+ *     - Made integrated registers accessible through standard function calls
+ *     - Added name field
+ *     - More sanity checking
+ *
+ *     Chris Beauregard August 9th, 1996
+ *     - Rewrote /proc/mca
+ *
+ *     Chris Beauregard January 7th, 1997
+ *     - Added basic NMI-processing
+ *     - Added more information to mca_info structure
+ *
+ *     David Weinehall October 12th, 1998
+ *     - Made a lot of cleaning up in the source
+ *     - Added use of save_flags / restore_flags
+ *     - Added the 'driver_loaded' flag in MCA_adapter
+ *     - Added an alternative implemention of ZP Gu's mca_find_unused_adapter
+ *
+ *     David Weinehall March 24th, 1999
+ *     - Fixed the output of 'Driver Installed' in /proc/mca/pos
+ *     - Made the Integrated Video & SCSI show up even if they have id 0000
+ *
+ *     Alexander Viro November 9th, 1999
+ *     - Switched to regular procfs methods
+ *
+ *     Alfred Arnold & David Weinehall August 23rd, 2000
+ *     - Added support for Planar POS-registers
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/mca.h>
+#include <linux/kprobes.h>
+#include <asm/system.h>
+#include <asm/io.h>
+#include <linux/proc_fs.h>
+#include <linux/mman.h>
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <linux/ioport.h>
+#include <asm/uaccess.h>
+#include <linux/init.h>
+#include <asm/arch_hooks.h>
+
+static unsigned char which_scsi = 0;
+
+int MCA_bus = 0;
+EXPORT_SYMBOL(MCA_bus);
+
+/*
+ * Motherboard register spinlock. Untested on SMP at the moment, but
+ * are there any MCA SMP boxes?
+ *
+ * Yes - Alan
+ */
+static DEFINE_SPINLOCK(mca_lock);
+
+/* Build the status info for the adapter */
+
+static void mca_configure_adapter_status(struct mca_device *mca_dev) {
+       mca_dev->status = MCA_ADAPTER_NONE;
+
+       mca_dev->pos_id = mca_dev->pos[0]
+               + (mca_dev->pos[1] << 8);
+
+       if(!mca_dev->pos_id && mca_dev->slot < MCA_MAX_SLOT_NR) {
+
+               /* id = 0x0000 usually indicates hardware failure,
+                * however, ZP Gu (zpg@castle.net> reports that his 9556
+                * has 0x0000 as id and everything still works. There
+                * also seem to be an adapter with id = 0x0000; the
+                * NCR Parallel Bus Memory Card. Until this is confirmed,
+                * however, this code will stay.
+                */
+
+               mca_dev->status = MCA_ADAPTER_ERROR;
+
+               return;
+       } else if(mca_dev->pos_id != 0xffff) {
+
+               /* 0xffff usually indicates that there's no adapter,
+                * however, some integrated adapters may have 0xffff as
+                * their id and still be valid. Examples are on-board
+                * VGA of the 55sx, the integrated SCSI of the 56 & 57,
+                * and possibly also the 95 ULTIMEDIA.
+                */
+
+               mca_dev->status = MCA_ADAPTER_NORMAL;
+       }
+
+       if((mca_dev->pos_id == 0xffff ||
+           mca_dev->pos_id == 0x0000) && mca_dev->slot >= MCA_MAX_SLOT_NR) {
+               int j;
+
+               for(j = 2; j < 8; j++) {
+                       if(mca_dev->pos[j] != 0xff) {
+                               mca_dev->status = MCA_ADAPTER_NORMAL;
+                               break;
+                       }
+               }
+       }
+
+       if(!(mca_dev->pos[2] & MCA_ENABLED)) {
+
+               /* enabled bit is in POS 2 */
+
+               mca_dev->status = MCA_ADAPTER_DISABLED;
+       }
+} /* mca_configure_adapter_status */
+
+/*--------------------------------------------------------------------*/
+
+static struct resource mca_standard_resources[] = {
+       { .start = 0x60, .end = 0x60, .name = "system control port B (MCA)" },
+       { .start = 0x90, .end = 0x90, .name = "arbitration (MCA)" },
+       { .start = 0x91, .end = 0x91, .name = "card Select Feedback (MCA)" },
+       { .start = 0x92, .end = 0x92, .name = "system Control port A (MCA)" },
+       { .start = 0x94, .end = 0x94, .name = "system board setup (MCA)" },
+       { .start = 0x96, .end = 0x97, .name = "POS (MCA)" },
+       { .start = 0x100, .end = 0x107, .name = "POS (MCA)" }
+};
+
+#define MCA_STANDARD_RESOURCES ARRAY_SIZE(mca_standard_resources)
+
+/**
+ *     mca_read_and_store_pos - read the POS registers into a memory buffer
+ *      @pos: a char pointer to 8 bytes, contains the POS register value on
+ *            successful return
+ *
+ *     Returns 1 if a card actually exists (i.e. the pos isn't
+ *     all 0xff) or 0 otherwise
+ */
+static int mca_read_and_store_pos(unsigned char *pos) {
+       int j;
+       int found = 0;
+
+       for(j=0; j<8; j++) {
+               if((pos[j] = inb_p(MCA_POS_REG(j))) != 0xff) {
+                       /* 0xff all across means no device. 0x00 means
+                        * something's broken, but a device is
+                        * probably there.  However, if you get 0x00
+                        * from a motherboard register it won't matter
+                        * what we find.  For the record, on the
+                        * 57SLC, the integrated SCSI adapter has
+                        * 0xffff for the adapter ID, but nonzero for
+                        * other registers.  */
+
+                       found = 1;
+               }
+       }
+       return found;
+}
+
+static unsigned char mca_pc_read_pos(struct mca_device *mca_dev, int reg)
+{
+       unsigned char byte;
+       unsigned long flags;
+
+       if(reg < 0 || reg >= 8)
+               return 0;
+
+       spin_lock_irqsave(&mca_lock, flags);
+       if(mca_dev->pos_register) {
+               /* Disable adapter setup, enable motherboard setup */
+
+               outb_p(0, MCA_ADAPTER_SETUP_REG);
+               outb_p(mca_dev->pos_register, MCA_MOTHERBOARD_SETUP_REG);
+
+               byte = inb_p(MCA_POS_REG(reg));
+               outb_p(0xff, MCA_MOTHERBOARD_SETUP_REG);
+       } else {
+
+               /* Make sure motherboard setup is off */
+
+               outb_p(0xff, MCA_MOTHERBOARD_SETUP_REG);
+
+               /* Read the appropriate register */
+
+               outb_p(0x8|(mca_dev->slot & 0xf), MCA_ADAPTER_SETUP_REG);
+               byte = inb_p(MCA_POS_REG(reg));
+               outb_p(0, MCA_ADAPTER_SETUP_REG);
+       }
+       spin_unlock_irqrestore(&mca_lock, flags);
+
+       mca_dev->pos[reg] = byte;
+
+       return byte;
+}
+
+static void mca_pc_write_pos(struct mca_device *mca_dev, int reg,
+                            unsigned char byte)
+{
+       unsigned long flags;
+
+       if(reg < 0 || reg >= 8)
+               return;
+
+       spin_lock_irqsave(&mca_lock, flags);
+
+       /* Make sure motherboard setup is off */
+
+       outb_p(0xff, MCA_MOTHERBOARD_SETUP_REG);
+
+       /* Read in the appropriate register */
+
+       outb_p(0x8|(mca_dev->slot&0xf), MCA_ADAPTER_SETUP_REG);
+       outb_p(byte, MCA_POS_REG(reg));
+       outb_p(0, MCA_ADAPTER_SETUP_REG);
+
+       spin_unlock_irqrestore(&mca_lock, flags);
+
+       /* Update the global register list, while we have the byte */
+
+       mca_dev->pos[reg] = byte;
+
+}
+
+/* for the primary MCA bus, we have identity transforms */
+static int mca_dummy_transform_irq(struct mca_device * mca_dev, int irq)
+{
+       return irq;
+}
+
+static int mca_dummy_transform_ioport(struct mca_device * mca_dev, int port)
+{
+       return port;
+}
+
+static void *mca_dummy_transform_memory(struct mca_device * mca_dev, void *mem)
+{
+       return mem;
+}
+
+
+static int __init mca_init(void)
+{
+       unsigned int i, j;
+       struct mca_device *mca_dev;
+       unsigned char pos[8];
+       short mca_builtin_scsi_ports[] = {0xf7, 0xfd, 0x00};
+       struct mca_bus *bus;
+
+       /* WARNING: Be careful when making changes here. Putting an adapter
+        * and the motherboard simultaneously into setup mode may result in
+        * damage to chips (according to The Indispensible PC Hardware Book
+        * by Hans-Peter Messmer). Also, we disable system interrupts (so
+        * that we are not disturbed in the middle of this).
+        */
+
+       /* Make sure the MCA bus is present */
+
+       if (mca_system_init()) {
+               printk(KERN_ERR "MCA bus system initialisation failed\n");
+               return -ENODEV;
+       }
+
+       if (!MCA_bus)
+               return -ENODEV;
+
+       printk(KERN_INFO "Micro Channel bus detected.\n");
+
+       /* All MCA systems have at least a primary bus */
+       bus = mca_attach_bus(MCA_PRIMARY_BUS);
+       if (!bus)
+               goto out_nomem;
+       bus->default_dma_mask = 0xffffffffLL;
+       bus->f.mca_write_pos = mca_pc_write_pos;
+       bus->f.mca_read_pos = mca_pc_read_pos;
+       bus->f.mca_transform_irq = mca_dummy_transform_irq;
+       bus->f.mca_transform_ioport = mca_dummy_transform_ioport;
+       bus->f.mca_transform_memory = mca_dummy_transform_memory;
+
+       /* get the motherboard device */
+       mca_dev = kzalloc(sizeof(struct mca_device), GFP_KERNEL);
+       if(unlikely(!mca_dev))
+               goto out_nomem;
+
+       /*
+        * We do not expect many MCA interrupts during initialization,
+        * but let us be safe:
+        */
+       spin_lock_irq(&mca_lock);
+
+       /* Make sure adapter setup is off */
+
+       outb_p(0, MCA_ADAPTER_SETUP_REG);
+
+       /* Read motherboard POS registers */
+
+       mca_dev->pos_register = 0x7f;
+       outb_p(mca_dev->pos_register, MCA_MOTHERBOARD_SETUP_REG);
+       mca_dev->name[0] = 0;
+       mca_read_and_store_pos(mca_dev->pos);
+       mca_configure_adapter_status(mca_dev);
+       /* fake POS and slot for a motherboard */
+       mca_dev->pos_id = MCA_MOTHERBOARD_POS;
+       mca_dev->slot = MCA_MOTHERBOARD;
+       mca_register_device(MCA_PRIMARY_BUS, mca_dev);
+
+       mca_dev = kzalloc(sizeof(struct mca_device), GFP_ATOMIC);
+       if(unlikely(!mca_dev))
+               goto out_unlock_nomem;
+
+       /* Put motherboard into video setup mode, read integrated video
+        * POS registers, and turn motherboard setup off.
+        */
+
+       mca_dev->pos_register = 0xdf;
+       outb_p(mca_dev->pos_register, MCA_MOTHERBOARD_SETUP_REG);
+       mca_dev->name[0] = 0;
+       mca_read_and_store_pos(mca_dev->pos);
+       mca_configure_adapter_status(mca_dev);
+       /* fake POS and slot for the integrated video */
+       mca_dev->pos_id = MCA_INTEGVIDEO_POS;
+       mca_dev->slot = MCA_INTEGVIDEO;
+       mca_register_device(MCA_PRIMARY_BUS, mca_dev);
+
+       /* Put motherboard into scsi setup mode, read integrated scsi
+        * POS registers, and turn motherboard setup off.
+        *
+        * It seems there are two possible SCSI registers. Martin says that
+        * for the 56,57, 0xf7 is the one, but fails on the 76.
+        * Alfredo (apena@vnet.ibm.com) says
+        * 0xfd works on his machine. We'll try both of them. I figure it's
+        * a good bet that only one could be valid at a time. This could
+        * screw up though if one is used for something else on the other
+        * machine.
+        */
+
+       for(i = 0; (which_scsi = mca_builtin_scsi_ports[i]) != 0; i++) {
+               outb_p(which_scsi, MCA_MOTHERBOARD_SETUP_REG);
+               if(mca_read_and_store_pos(pos))
+                       break;
+       }
+       if(which_scsi) {
+               /* found a scsi card */
+               mca_dev = kzalloc(sizeof(struct mca_device), GFP_ATOMIC);
+               if(unlikely(!mca_dev))
+                       goto out_unlock_nomem;
+
+               for(j = 0; j < 8; j++)
+                       mca_dev->pos[j] = pos[j];
+
+               mca_configure_adapter_status(mca_dev);
+               /* fake POS and slot for integrated SCSI controller */
+               mca_dev->pos_id = MCA_INTEGSCSI_POS;
+               mca_dev->slot = MCA_INTEGSCSI;
+               mca_dev->pos_register = which_scsi;
+               mca_register_device(MCA_PRIMARY_BUS, mca_dev);
+       }
+
+       /* Turn off motherboard setup */
+
+       outb_p(0xff, MCA_MOTHERBOARD_SETUP_REG);
+
+       /* Now loop over MCA slots: put each adapter into setup mode, and
+        * read its POS registers. Then put adapter setup off.
+        */
+
+       for(i=0; i<MCA_MAX_SLOT_NR; i++) {
+               outb_p(0x8|(i&0xf), MCA_ADAPTER_SETUP_REG);
+               if(!mca_read_and_store_pos(pos))
+                       continue;
+
+               mca_dev = kzalloc(sizeof(struct mca_device), GFP_ATOMIC);
+               if(unlikely(!mca_dev))
+                       goto out_unlock_nomem;
+
+               for(j=0; j<8; j++)
+                       mca_dev->pos[j]=pos[j];
+
+               mca_dev->driver_loaded = 0;
+               mca_dev->slot = i;
+               mca_dev->pos_register = 0;
+               mca_configure_adapter_status(mca_dev);
+               mca_register_device(MCA_PRIMARY_BUS, mca_dev);
+       }
+       outb_p(0, MCA_ADAPTER_SETUP_REG);
+
+       /* Enable interrupts and return memory start */
+       spin_unlock_irq(&mca_lock);
+
+       for (i = 0; i < MCA_STANDARD_RESOURCES; i++)
+               request_resource(&ioport_resource, mca_standard_resources + i);
+
+       mca_do_proc_init();
+
+       return 0;
+
+ out_unlock_nomem:
+       spin_unlock_irq(&mca_lock);
+ out_nomem:
+       printk(KERN_EMERG "Failed memory allocation in MCA setup!\n");
+       return -ENOMEM;
+}
+
+subsys_initcall(mca_init);
+
+/*--------------------------------------------------------------------*/
+
+static __kprobes void
+mca_handle_nmi_device(struct mca_device *mca_dev, int check_flag)
+{
+       int slot = mca_dev->slot;
+
+       if(slot == MCA_INTEGSCSI) {
+               printk(KERN_CRIT "NMI: caused by MCA integrated SCSI adapter (%s)\n",
+                       mca_dev->name);
+       } else if(slot == MCA_INTEGVIDEO) {
+               printk(KERN_CRIT "NMI: caused by MCA integrated video adapter (%s)\n",
+                       mca_dev->name);
+       } else if(slot == MCA_MOTHERBOARD) {
+               printk(KERN_CRIT "NMI: caused by motherboard (%s)\n",
+                       mca_dev->name);
+       }
+
+       /* More info available in POS 6 and 7? */
+
+       if(check_flag) {
+               unsigned char pos6, pos7;
+
+               pos6 = mca_device_read_pos(mca_dev, 6);
+               pos7 = mca_device_read_pos(mca_dev, 7);
+
+               printk(KERN_CRIT "NMI: POS 6 = 0x%x, POS 7 = 0x%x\n", pos6, pos7);
+       }
+
+} /* mca_handle_nmi_slot */
+
+/*--------------------------------------------------------------------*/
+
+static int __kprobes mca_handle_nmi_callback(struct device *dev, void *data)
+{
+       struct mca_device *mca_dev = to_mca_device(dev);
+       unsigned char pos5;
+
+       pos5 = mca_device_read_pos(mca_dev, 5);
+
+       if(!(pos5 & 0x80)) {
+               /* Bit 7 of POS 5 is reset when this adapter has a hardware
+                * error. Bit 7 it reset if there's error information
+                * available in POS 6 and 7.
+                */
+               mca_handle_nmi_device(mca_dev, !(pos5 & 0x40));
+               return 1;
+       }
+       return 0;
+}
+
+void __kprobes mca_handle_nmi(void)
+{
+       /* First try - scan the various adapters and see if a specific
+        * adapter was responsible for the error.
+        */
+       bus_for_each_dev(&mca_bus_type, NULL, NULL, mca_handle_nmi_callback);
+
+       mca_nmi_hook();
+} /* mca_handle_nmi */
diff --git a/arch/x86/kernel/microcode.c b/arch/x86/kernel/microcode.c
new file mode 100644 (file)
index 0000000..09cf781
--- /dev/null
@@ -0,0 +1,850 @@
+/*
+ *     Intel CPU Microcode Update Driver for Linux
+ *
+ *     Copyright (C) 2000-2006 Tigran Aivazian <tigran@aivazian.fsnet.co.uk>
+ *                   2006      Shaohua Li <shaohua.li@intel.com>
+ *
+ *     This driver allows to upgrade microcode on Intel processors
+ *     belonging to IA-32 family - PentiumPro, Pentium II, 
+ *     Pentium III, Xeon, Pentium 4, etc.
+ *
+ *     Reference: Section 8.10 of Volume III, Intel Pentium 4 Manual, 
+ *     Order Number 245472 or free download from:
+ *             
+ *     http://developer.intel.com/design/pentium4/manuals/245472.htm
+ *
+ *     For more information, go to http://www.urbanmyth.org/microcode
+ *
+ *     This program is free software; you can redistribute it and/or
+ *     modify it under the terms of the GNU General Public License
+ *     as published by the Free Software Foundation; either version
+ *     2 of the License, or (at your option) any later version.
+ *
+ *     1.0     16 Feb 2000, Tigran Aivazian <tigran@sco.com>
+ *             Initial release.
+ *     1.01    18 Feb 2000, Tigran Aivazian <tigran@sco.com>
+ *             Added read() support + cleanups.
+ *     1.02    21 Feb 2000, Tigran Aivazian <tigran@sco.com>
+ *             Added 'device trimming' support. open(O_WRONLY) zeroes
+ *             and frees the saved copy of applied microcode.
+ *     1.03    29 Feb 2000, Tigran Aivazian <tigran@sco.com>
+ *             Made to use devfs (/dev/cpu/microcode) + cleanups.
+ *     1.04    06 Jun 2000, Simon Trimmer <simon@veritas.com>
+ *             Added misc device support (now uses both devfs and misc).
+ *             Added MICROCODE_IOCFREE ioctl to clear memory.
+ *     1.05    09 Jun 2000, Simon Trimmer <simon@veritas.com>
+ *             Messages for error cases (non Intel & no suitable microcode).
+ *     1.06    03 Aug 2000, Tigran Aivazian <tigran@veritas.com>
+ *             Removed ->release(). Removed exclusive open and status bitmap.
+ *             Added microcode_rwsem to serialize read()/write()/ioctl().
+ *             Removed global kernel lock usage.
+ *     1.07    07 Sep 2000, Tigran Aivazian <tigran@veritas.com>
+ *             Write 0 to 0x8B msr and then cpuid before reading revision,
+ *             so that it works even if there were no update done by the
+ *             BIOS. Otherwise, reading from 0x8B gives junk (which happened
+ *             to be 0 on my machine which is why it worked even when I
+ *             disabled update by the BIOS)
+ *             Thanks to Eric W. Biederman <ebiederman@lnxi.com> for the fix.
+ *     1.08    11 Dec 2000, Richard Schaal <richard.schaal@intel.com> and
+ *                          Tigran Aivazian <tigran@veritas.com>
+ *             Intel Pentium 4 processor support and bugfixes.
+ *     1.09    30 Oct 2001, Tigran Aivazian <tigran@veritas.com>
+ *             Bugfix for HT (Hyper-Threading) enabled processors
+ *             whereby processor resources are shared by all logical processors
+ *             in a single CPU package.
+ *     1.10    28 Feb 2002 Asit K Mallick <asit.k.mallick@intel.com> and
+ *             Tigran Aivazian <tigran@veritas.com>,
+ *             Serialize updates as required on HT processors due to speculative
+ *             nature of implementation.
+ *     1.11    22 Mar 2002 Tigran Aivazian <tigran@veritas.com>
+ *             Fix the panic when writing zero-length microcode chunk.
+ *     1.12    29 Sep 2003 Nitin Kamble <nitin.a.kamble@intel.com>, 
+ *             Jun Nakajima <jun.nakajima@intel.com>
+ *             Support for the microcode updates in the new format.
+ *     1.13    10 Oct 2003 Tigran Aivazian <tigran@veritas.com>
+ *             Removed ->read() method and obsoleted MICROCODE_IOCFREE ioctl
+ *             because we no longer hold a copy of applied microcode 
+ *             in kernel memory.
+ *     1.14    25 Jun 2004 Tigran Aivazian <tigran@veritas.com>
+ *             Fix sigmatch() macro to handle old CPUs with pf == 0.
+ *             Thanks to Stuart Swales for pointing out this bug.
+ */
+
+//#define DEBUG /* pr_debug */
+#include <linux/capability.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/sched.h>
+#include <linux/cpumask.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/miscdevice.h>
+#include <linux/spinlock.h>
+#include <linux/mm.h>
+#include <linux/fs.h>
+#include <linux/mutex.h>
+#include <linux/cpu.h>
+#include <linux/firmware.h>
+#include <linux/platform_device.h>
+
+#include <asm/msr.h>
+#include <asm/uaccess.h>
+#include <asm/processor.h>
+
+MODULE_DESCRIPTION("Intel CPU (IA-32) Microcode Update Driver");
+MODULE_AUTHOR("Tigran Aivazian <tigran@aivazian.fsnet.co.uk>");
+MODULE_LICENSE("GPL");
+
+#define MICROCODE_VERSION      "1.14a"
+
+#define DEFAULT_UCODE_DATASIZE         (2000)    /* 2000 bytes */
+#define MC_HEADER_SIZE         (sizeof (microcode_header_t))     /* 48 bytes */
+#define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE) /* 2048 bytes */
+#define EXT_HEADER_SIZE                (sizeof (struct extended_sigtable)) /* 20 bytes */
+#define EXT_SIGNATURE_SIZE     (sizeof (struct extended_signature)) /* 12 bytes */
+#define DWSIZE                 (sizeof (u32))
+#define get_totalsize(mc) \
+       (((microcode_t *)mc)->hdr.totalsize ? \
+        ((microcode_t *)mc)->hdr.totalsize : DEFAULT_UCODE_TOTALSIZE)
+#define get_datasize(mc) \
+       (((microcode_t *)mc)->hdr.datasize ? \
+        ((microcode_t *)mc)->hdr.datasize : DEFAULT_UCODE_DATASIZE)
+
+#define sigmatch(s1, s2, p1, p2) \
+       (((s1) == (s2)) && (((p1) & (p2)) || (((p1) == 0) && ((p2) == 0))))
+
+#define exttable_size(et) ((et)->count * EXT_SIGNATURE_SIZE + EXT_HEADER_SIZE)
+
+/* serialize access to the physical write to MSR 0x79 */
+static DEFINE_SPINLOCK(microcode_update_lock);
+
+/* no concurrent ->write()s are allowed on /dev/cpu/microcode */
+static DEFINE_MUTEX(microcode_mutex);
+
+static struct ucode_cpu_info {
+       int valid;
+       unsigned int sig;
+       unsigned int pf;
+       unsigned int rev;
+       microcode_t *mc;
+} ucode_cpu_info[NR_CPUS];
+
+static void collect_cpu_info(int cpu_num)
+{
+       struct cpuinfo_x86 *c = cpu_data + cpu_num;
+       struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num;
+       unsigned int val[2];
+
+       /* We should bind the task to the CPU */
+       BUG_ON(raw_smp_processor_id() != cpu_num);
+       uci->pf = uci->rev = 0;
+       uci->mc = NULL;
+       uci->valid = 1;
+
+       if (c->x86_vendor != X86_VENDOR_INTEL || c->x86 < 6 ||
+               cpu_has(c, X86_FEATURE_IA64)) {
+               printk(KERN_ERR "microcode: CPU%d not a capable Intel "
+                       "processor\n", cpu_num);
+               uci->valid = 0;
+               return;
+       }
+
+       uci->sig = cpuid_eax(0x00000001);
+
+       if ((c->x86_model >= 5) || (c->x86 > 6)) {
+               /* get processor flags from MSR 0x17 */
+               rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]);
+               uci->pf = 1 << ((val[1] >> 18) & 7);
+       }
+
+       wrmsr(MSR_IA32_UCODE_REV, 0, 0);
+       /* see notes above for revision 1.07.  Apparent chip bug */
+       sync_core();
+       /* get the current revision from MSR 0x8B */
+       rdmsr(MSR_IA32_UCODE_REV, val[0], uci->rev);
+       pr_debug("microcode: collect_cpu_info : sig=0x%x, pf=0x%x, rev=0x%x\n",
+                       uci->sig, uci->pf, uci->rev);
+}
+
+static inline int microcode_update_match(int cpu_num,
+       microcode_header_t *mc_header, int sig, int pf)
+{
+       struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num;
+
+       if (!sigmatch(sig, uci->sig, pf, uci->pf)
+               || mc_header->rev <= uci->rev)
+               return 0;
+       return 1;
+}
+
+static int microcode_sanity_check(void *mc)
+{
+       microcode_header_t *mc_header = mc;
+       struct extended_sigtable *ext_header = NULL;
+       struct extended_signature *ext_sig;
+       unsigned long total_size, data_size, ext_table_size;
+       int sum, orig_sum, ext_sigcount = 0, i;
+
+       total_size = get_totalsize(mc_header);
+       data_size = get_datasize(mc_header);
+       if (data_size + MC_HEADER_SIZE > total_size) {
+               printk(KERN_ERR "microcode: error! "
+                       "Bad data size in microcode data file\n");
+               return -EINVAL;
+       }
+
+       if (mc_header->ldrver != 1 || mc_header->hdrver != 1) {
+               printk(KERN_ERR "microcode: error! "
+                       "Unknown microcode update format\n");
+               return -EINVAL;
+       }
+       ext_table_size = total_size - (MC_HEADER_SIZE + data_size);
+       if (ext_table_size) {
+               if ((ext_table_size < EXT_HEADER_SIZE)
+                || ((ext_table_size - EXT_HEADER_SIZE) % EXT_SIGNATURE_SIZE)) {
+                       printk(KERN_ERR "microcode: error! "
+                               "Small exttable size in microcode data file\n");
+                       return -EINVAL;
+               }
+               ext_header = mc + MC_HEADER_SIZE + data_size;
+               if (ext_table_size != exttable_size(ext_header)) {
+                       printk(KERN_ERR "microcode: error! "
+                               "Bad exttable size in microcode data file\n");
+                       return -EFAULT;
+               }
+               ext_sigcount = ext_header->count;
+       }
+
+       /* check extended table checksum */
+       if (ext_table_size) {
+               int ext_table_sum = 0;
+               int *ext_tablep = (int *)ext_header;
+
+               i = ext_table_size / DWSIZE;
+               while (i--)
+                       ext_table_sum += ext_tablep[i];
+               if (ext_table_sum) {
+                       printk(KERN_WARNING "microcode: aborting, "
+                               "bad extended signature table checksum\n");
+                       return -EINVAL;
+               }
+       }
+
+       /* calculate the checksum */
+       orig_sum = 0;
+       i = (MC_HEADER_SIZE + data_size) / DWSIZE;
+       while (i--)
+               orig_sum += ((int *)mc)[i];
+       if (orig_sum) {
+               printk(KERN_ERR "microcode: aborting, bad checksum\n");
+               return -EINVAL;
+       }
+       if (!ext_table_size)
+               return 0;
+       /* check extended signature checksum */
+       for (i = 0; i < ext_sigcount; i++) {
+               ext_sig = (struct extended_signature *)((void *)ext_header
+                       + EXT_HEADER_SIZE + EXT_SIGNATURE_SIZE * i);
+               sum = orig_sum
+                       - (mc_header->sig + mc_header->pf + mc_header->cksum)
+                       + (ext_sig->sig + ext_sig->pf + ext_sig->cksum);
+               if (sum) {
+                       printk(KERN_ERR "microcode: aborting, bad checksum\n");
+                       return -EINVAL;
+               }
+       }
+       return 0;
+}
+
+/*
+ * return 0 - no update found
+ * return 1 - found update
+ * return < 0 - error
+ */
+static int get_maching_microcode(void *mc, int cpu)
+{
+       struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+       microcode_header_t *mc_header = mc;
+       struct extended_sigtable *ext_header;
+       unsigned long total_size = get_totalsize(mc_header);
+       int ext_sigcount, i;
+       struct extended_signature *ext_sig;
+       void *new_mc;
+
+       if (microcode_update_match(cpu, mc_header,
+                       mc_header->sig, mc_header->pf))
+               goto find;
+
+       if (total_size <= get_datasize(mc_header) + MC_HEADER_SIZE)
+               return 0;
+
+       ext_header = (struct extended_sigtable *)(mc +
+                       get_datasize(mc_header) + MC_HEADER_SIZE);
+       ext_sigcount = ext_header->count;
+       ext_sig = (struct extended_signature *)((void *)ext_header
+                       + EXT_HEADER_SIZE);
+       for (i = 0; i < ext_sigcount; i++) {
+               if (microcode_update_match(cpu, mc_header,
+                               ext_sig->sig, ext_sig->pf))
+                       goto find;
+               ext_sig++;
+       }
+       return 0;
+find:
+       pr_debug("microcode: CPU %d found a matching microcode update with"
+               " version 0x%x (current=0x%x)\n", cpu, mc_header->rev,uci->rev);
+       new_mc = vmalloc(total_size);
+       if (!new_mc) {
+               printk(KERN_ERR "microcode: error! Can not allocate memory\n");
+               return -ENOMEM;
+       }
+
+       /* free previous update file */
+       vfree(uci->mc);
+
+       memcpy(new_mc, mc, total_size);
+       uci->mc = new_mc;
+       return 1;
+}
+
+static void apply_microcode(int cpu)
+{
+       unsigned long flags;
+       unsigned int val[2];
+       int cpu_num = raw_smp_processor_id();
+       struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num;
+
+       /* We should bind the task to the CPU */
+       BUG_ON(cpu_num != cpu);
+
+       if (uci->mc == NULL)
+               return;
+
+       /* serialize access to the physical write to MSR 0x79 */
+       spin_lock_irqsave(&microcode_update_lock, flags);          
+
+       /* write microcode via MSR 0x79 */
+       wrmsr(MSR_IA32_UCODE_WRITE,
+               (unsigned long) uci->mc->bits, 
+               (unsigned long) uci->mc->bits >> 16 >> 16);
+       wrmsr(MSR_IA32_UCODE_REV, 0, 0);
+
+       /* see notes above for revision 1.07.  Apparent chip bug */
+       sync_core();
+
+       /* get the current revision from MSR 0x8B */
+       rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]);
+
+       spin_unlock_irqrestore(&microcode_update_lock, flags);
+       if (val[1] != uci->mc->hdr.rev) {
+               printk(KERN_ERR "microcode: CPU%d updated from revision "
+                       "0x%x to 0x%x failed\n", cpu_num, uci->rev, val[1]);
+               return;
+       }
+       pr_debug("microcode: CPU%d updated from revision "
+              "0x%x to 0x%x, date = %08x \n", 
+              cpu_num, uci->rev, val[1], uci->mc->hdr.date);
+       uci->rev = val[1];
+}
+
+#ifdef CONFIG_MICROCODE_OLD_INTERFACE
+static void __user *user_buffer;       /* user area microcode data buffer */
+static unsigned int user_buffer_size;  /* it's size */
+
+static long get_next_ucode(void **mc, long offset)
+{
+       microcode_header_t mc_header;
+       unsigned long total_size;
+
+       /* No more data */
+       if (offset >= user_buffer_size)
+               return 0;
+       if (copy_from_user(&mc_header, user_buffer + offset, MC_HEADER_SIZE)) {
+               printk(KERN_ERR "microcode: error! Can not read user data\n");
+               return -EFAULT;
+       }
+       total_size = get_totalsize(&mc_header);
+       if (offset + total_size > user_buffer_size) {
+               printk(KERN_ERR "microcode: error! Bad total size in microcode "
+                               "data file\n");
+               return -EINVAL;
+       }
+       *mc = vmalloc(total_size);
+       if (!*mc)
+               return -ENOMEM;
+       if (copy_from_user(*mc, user_buffer + offset, total_size)) {
+               printk(KERN_ERR "microcode: error! Can not read user data\n");
+               vfree(*mc);
+               return -EFAULT;
+       }
+       return offset + total_size;
+}
+
+static int do_microcode_update (void)
+{
+       long cursor = 0;
+       int error = 0;
+       void *new_mc = NULL;
+       int cpu;
+       cpumask_t old;
+
+       old = current->cpus_allowed;
+
+       while ((cursor = get_next_ucode(&new_mc, cursor)) > 0) {
+               error = microcode_sanity_check(new_mc);
+               if (error)
+                       goto out;
+               /*
+                * It's possible the data file has multiple matching ucode,
+                * lets keep searching till the latest version
+                */
+               for_each_online_cpu(cpu) {
+                       struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+
+                       if (!uci->valid)
+                               continue;
+                       set_cpus_allowed(current, cpumask_of_cpu(cpu));
+                       error = get_maching_microcode(new_mc, cpu);
+                       if (error < 0)
+                               goto out;
+                       if (error == 1)
+                               apply_microcode(cpu);
+               }
+               vfree(new_mc);
+       }
+out:
+       if (cursor > 0)
+               vfree(new_mc);
+       if (cursor < 0)
+               error = cursor;
+       set_cpus_allowed(current, old);
+       return error;
+}
+
+static int microcode_open (struct inode *unused1, struct file *unused2)
+{
+       return capable(CAP_SYS_RAWIO) ? 0 : -EPERM;
+}
+
+static ssize_t microcode_write (struct file *file, const char __user *buf, size_t len, loff_t *ppos)
+{
+       ssize_t ret;
+
+       if ((len >> PAGE_SHIFT) > num_physpages) {
+               printk(KERN_ERR "microcode: too much data (max %ld pages)\n", num_physpages);
+               return -EINVAL;
+       }
+
+       lock_cpu_hotplug();
+       mutex_lock(&microcode_mutex);
+
+       user_buffer = (void __user *) buf;
+       user_buffer_size = (int) len;
+
+       ret = do_microcode_update();
+       if (!ret)
+               ret = (ssize_t)len;
+
+       mutex_unlock(&microcode_mutex);
+       unlock_cpu_hotplug();
+
+       return ret;
+}
+
+static const struct file_operations microcode_fops = {
+       .owner          = THIS_MODULE,
+       .write          = microcode_write,
+       .open           = microcode_open,
+};
+
+static struct miscdevice microcode_dev = {
+       .minor          = MICROCODE_MINOR,
+       .name           = "microcode",
+       .fops           = &microcode_fops,
+};
+
+static int __init microcode_dev_init (void)
+{
+       int error;
+
+       error = misc_register(&microcode_dev);
+       if (error) {
+               printk(KERN_ERR
+                       "microcode: can't misc_register on minor=%d\n",
+                       MICROCODE_MINOR);
+               return error;
+       }
+
+       return 0;
+}
+
+static void microcode_dev_exit (void)
+{
+       misc_deregister(&microcode_dev);
+}
+
+MODULE_ALIAS_MISCDEV(MICROCODE_MINOR);
+#else
+#define microcode_dev_init() 0
+#define microcode_dev_exit() do { } while(0)
+#endif
+
+static long get_next_ucode_from_buffer(void **mc, void *buf,
+       unsigned long size, long offset)
+{
+       microcode_header_t *mc_header;
+       unsigned long total_size;
+
+       /* No more data */
+       if (offset >= size)
+               return 0;
+       mc_header = (microcode_header_t *)(buf + offset);
+       total_size = get_totalsize(mc_header);
+
+       if (offset + total_size > size) {
+               printk(KERN_ERR "microcode: error! Bad data in microcode data file\n");
+               return -EINVAL;
+       }
+
+       *mc = vmalloc(total_size);
+       if (!*mc) {
+               printk(KERN_ERR "microcode: error! Can not allocate memory\n");
+               return -ENOMEM;
+       }
+       memcpy(*mc, buf + offset, total_size);
+       return offset + total_size;
+}
+
+/* fake device for request_firmware */
+static struct platform_device *microcode_pdev;
+
+static int cpu_request_microcode(int cpu)
+{
+       char name[30];
+       struct cpuinfo_x86 *c = cpu_data + cpu;
+       const struct firmware *firmware;
+       void *buf;
+       unsigned long size;
+       long offset = 0;
+       int error;
+       void *mc;
+
+       /* We should bind the task to the CPU */
+       BUG_ON(cpu != raw_smp_processor_id());
+       sprintf(name,"intel-ucode/%02x-%02x-%02x",
+               c->x86, c->x86_model, c->x86_mask);
+       error = request_firmware(&firmware, name, &microcode_pdev->dev);
+       if (error) {
+               pr_debug("ucode data file %s load failed\n", name);
+               return error;
+       }
+       buf = (void *)firmware->data;
+       size = firmware->size;
+       while ((offset = get_next_ucode_from_buffer(&mc, buf, size, offset))
+                       > 0) {
+               error = microcode_sanity_check(mc);
+               if (error)
+                       break;
+               error = get_maching_microcode(mc, cpu);
+               if (error < 0)
+                       break;
+               /*
+                * It's possible the data file has multiple matching ucode,
+                * lets keep searching till the latest version
+                */
+               if (error == 1) {
+                       apply_microcode(cpu);
+                       error = 0;
+               }
+               vfree(mc);
+       }
+       if (offset > 0)
+               vfree(mc);
+       if (offset < 0)
+               error = offset;
+       release_firmware(firmware);
+
+       return error;
+}
+
+static int apply_microcode_check_cpu(int cpu)
+{
+       struct cpuinfo_x86 *c = cpu_data + cpu;
+       struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+       cpumask_t old;
+       unsigned int val[2];
+       int err = 0;
+
+       /* Check if the microcode is available */
+       if (!uci->mc)
+               return 0;
+
+       old = current->cpus_allowed;
+       set_cpus_allowed(current, cpumask_of_cpu(cpu));
+
+       /* Check if the microcode we have in memory matches the CPU */
+       if (c->x86_vendor != X86_VENDOR_INTEL || c->x86 < 6 ||
+           cpu_has(c, X86_FEATURE_IA64) || uci->sig != cpuid_eax(0x00000001))
+               err = -EINVAL;
+
+       if (!err && ((c->x86_model >= 5) || (c->x86 > 6))) {
+               /* get processor flags from MSR 0x17 */
+               rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]);
+               if (uci->pf != (1 << ((val[1] >> 18) & 7)))
+                       err = -EINVAL;
+       }
+
+       if (!err) {
+               wrmsr(MSR_IA32_UCODE_REV, 0, 0);
+               /* see notes above for revision 1.07.  Apparent chip bug */
+               sync_core();
+               /* get the current revision from MSR 0x8B */
+               rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]);
+               if (uci->rev != val[1])
+                       err = -EINVAL;
+       }
+
+       if (!err)
+               apply_microcode(cpu);
+       else
+               printk(KERN_ERR "microcode: Could not apply microcode to CPU%d:"
+                       " sig=0x%x, pf=0x%x, rev=0x%x\n",
+                       cpu, uci->sig, uci->pf, uci->rev);
+
+       set_cpus_allowed(current, old);
+       return err;
+}
+
+static void microcode_init_cpu(int cpu, int resume)
+{
+       cpumask_t old;
+       struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+
+       old = current->cpus_allowed;
+
+       set_cpus_allowed(current, cpumask_of_cpu(cpu));
+       mutex_lock(&microcode_mutex);
+       collect_cpu_info(cpu);
+       if (uci->valid && system_state == SYSTEM_RUNNING && !resume)
+               cpu_request_microcode(cpu);
+       mutex_unlock(&microcode_mutex);
+       set_cpus_allowed(current, old);
+}
+
+static void microcode_fini_cpu(int cpu)
+{
+       struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+
+       mutex_lock(&microcode_mutex);
+       uci->valid = 0;
+       vfree(uci->mc);
+       uci->mc = NULL;
+       mutex_unlock(&microcode_mutex);
+}
+
+static ssize_t reload_store(struct sys_device *dev, const char *buf, size_t sz)
+{
+       struct ucode_cpu_info *uci = ucode_cpu_info + dev->id;
+       char *end;
+       unsigned long val = simple_strtoul(buf, &end, 0);
+       int err = 0;
+       int cpu = dev->id;
+
+       if (end == buf)
+               return -EINVAL;
+       if (val == 1) {
+               cpumask_t old;
+
+               old = current->cpus_allowed;
+
+               lock_cpu_hotplug();
+               set_cpus_allowed(current, cpumask_of_cpu(cpu));
+
+               mutex_lock(&microcode_mutex);
+               if (uci->valid)
+                       err = cpu_request_microcode(cpu);
+               mutex_unlock(&microcode_mutex);
+               unlock_cpu_hotplug();
+               set_cpus_allowed(current, old);
+       }
+       if (err)
+               return err;
+       return sz;
+}
+
+static ssize_t version_show(struct sys_device *dev, char *buf)
+{
+       struct ucode_cpu_info *uci = ucode_cpu_info + dev->id;
+
+       return sprintf(buf, "0x%x\n", uci->rev);
+}
+
+static ssize_t pf_show(struct sys_device *dev, char *buf)
+{
+       struct ucode_cpu_info *uci = ucode_cpu_info + dev->id;
+
+       return sprintf(buf, "0x%x\n", uci->pf);
+}
+
+static SYSDEV_ATTR(reload, 0200, NULL, reload_store);
+static SYSDEV_ATTR(version, 0400, version_show, NULL);
+static SYSDEV_ATTR(processor_flags, 0400, pf_show, NULL);
+
+static struct attribute *mc_default_attrs[] = {
+       &attr_reload.attr,
+       &attr_version.attr,
+       &attr_processor_flags.attr,
+       NULL
+};
+
+static struct attribute_group mc_attr_group = {
+       .attrs = mc_default_attrs,
+       .name = "microcode",
+};
+
+static int __mc_sysdev_add(struct sys_device *sys_dev, int resume)
+{
+       int err, cpu = sys_dev->id;
+       struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+
+       if (!cpu_online(cpu))
+               return 0;
+
+       pr_debug("Microcode:CPU %d added\n", cpu);
+       memset(uci, 0, sizeof(*uci));
+
+       err = sysfs_create_group(&sys_dev->kobj, &mc_attr_group);
+       if (err)
+               return err;
+
+       microcode_init_cpu(cpu, resume);
+
+       return 0;
+}
+
+static int mc_sysdev_add(struct sys_device *sys_dev)
+{
+       return __mc_sysdev_add(sys_dev, 0);
+}
+
+static int mc_sysdev_remove(struct sys_device *sys_dev)
+{
+       int cpu = sys_dev->id;
+
+       if (!cpu_online(cpu))
+               return 0;
+
+       pr_debug("Microcode:CPU %d removed\n", cpu);
+       microcode_fini_cpu(cpu);
+       sysfs_remove_group(&sys_dev->kobj, &mc_attr_group);
+       return 0;
+}
+
+static int mc_sysdev_resume(struct sys_device *dev)
+{
+       int cpu = dev->id;
+
+       if (!cpu_online(cpu))
+               return 0;
+       pr_debug("Microcode:CPU %d resumed\n", cpu);
+       /* only CPU 0 will apply ucode here */
+       apply_microcode(0);
+       return 0;
+}
+
+static struct sysdev_driver mc_sysdev_driver = {
+       .add = mc_sysdev_add,
+       .remove = mc_sysdev_remove,
+       .resume = mc_sysdev_resume,
+};
+
+static __cpuinit int
+mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu)
+{
+       unsigned int cpu = (unsigned long)hcpu;
+       struct sys_device *sys_dev;
+
+       sys_dev = get_cpu_sysdev(cpu);
+       switch (action) {
+       case CPU_UP_CANCELED_FROZEN:
+               /* The CPU refused to come up during a system resume */
+               microcode_fini_cpu(cpu);
+               break;
+       case CPU_ONLINE:
+       case CPU_DOWN_FAILED:
+               mc_sysdev_add(sys_dev);
+               break;
+       case CPU_ONLINE_FROZEN:
+               /* System-wide resume is in progress, try to apply microcode */
+               if (apply_microcode_check_cpu(cpu)) {
+                       /* The application of microcode failed */
+                       microcode_fini_cpu(cpu);
+                       __mc_sysdev_add(sys_dev, 1);
+                       break;
+               }
+       case CPU_DOWN_FAILED_FROZEN:
+               if (sysfs_create_group(&sys_dev->kobj, &mc_attr_group))
+                       printk(KERN_ERR "Microcode: Failed to create the sysfs "
+                               "group for CPU%d\n", cpu);
+               break;
+       case CPU_DOWN_PREPARE:
+               mc_sysdev_remove(sys_dev);
+               break;
+       case CPU_DOWN_PREPARE_FROZEN:
+               /* Suspend is in progress, only remove the interface */
+               sysfs_remove_group(&sys_dev->kobj, &mc_attr_group);
+               break;
+       }
+       return NOTIFY_OK;
+}
+
+static struct notifier_block __cpuinitdata mc_cpu_notifier = {
+       .notifier_call = mc_cpu_callback,
+};
+
+static int __init microcode_init (void)
+{
+       int error;
+
+       error = microcode_dev_init();
+       if (error)
+               return error;
+       microcode_pdev = platform_device_register_simple("microcode", -1,
+                                                        NULL, 0);
+       if (IS_ERR(microcode_pdev)) {
+               microcode_dev_exit();
+               return PTR_ERR(microcode_pdev);
+       }
+
+       lock_cpu_hotplug();
+       error = sysdev_driver_register(&cpu_sysdev_class, &mc_sysdev_driver);
+       unlock_cpu_hotplug();
+       if (error) {
+               microcode_dev_exit();
+               platform_device_unregister(microcode_pdev);
+               return error;
+       }
+
+       register_hotcpu_notifier(&mc_cpu_notifier);
+
+       printk(KERN_INFO 
+               "IA-32 Microcode Update Driver: v" MICROCODE_VERSION " <tigran@aivazian.fsnet.co.uk>\n");
+       return 0;
+}
+
+static void __exit microcode_exit (void)
+{
+       microcode_dev_exit();
+
+       unregister_hotcpu_notifier(&mc_cpu_notifier);
+
+       lock_cpu_hotplug();
+       sysdev_driver_unregister(&cpu_sysdev_class, &mc_sysdev_driver);
+       unlock_cpu_hotplug();
+
+       platform_device_unregister(microcode_pdev);
+}
+
+module_init(microcode_init)
+module_exit(microcode_exit)
diff --git a/arch/x86/kernel/module_32.c b/arch/x86/kernel/module_32.c
new file mode 100644 (file)
index 0000000..3db0a54
--- /dev/null
@@ -0,0 +1,152 @@
+/*  Kernel module help for i386.
+    Copyright (C) 2001 Rusty Russell.
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program; if not, write to the Free Software
+    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+*/
+#include <linux/moduleloader.h>
+#include <linux/elf.h>
+#include <linux/vmalloc.h>
+#include <linux/fs.h>
+#include <linux/string.h>
+#include <linux/kernel.h>
+#include <linux/bug.h>
+
+#if 0
+#define DEBUGP printk
+#else
+#define DEBUGP(fmt...)
+#endif
+
+void *module_alloc(unsigned long size)
+{
+       if (size == 0)
+               return NULL;
+       return vmalloc_exec(size);
+}
+
+
+/* Free memory returned from module_alloc */
+void module_free(struct module *mod, void *module_region)
+{
+       vfree(module_region);
+       /* FIXME: If module_region == mod->init_region, trim exception
+           table entries. */
+}
+
+/* We don't need anything special. */
+int module_frob_arch_sections(Elf_Ehdr *hdr,
+                             Elf_Shdr *sechdrs,
+                             char *secstrings,
+                             struct module *mod)
+{
+       return 0;
+}
+
+int apply_relocate(Elf32_Shdr *sechdrs,
+                  const char *strtab,
+                  unsigned int symindex,
+                  unsigned int relsec,
+                  struct module *me)
+{
+       unsigned int i;
+       Elf32_Rel *rel = (void *)sechdrs[relsec].sh_addr;
+       Elf32_Sym *sym;
+       uint32_t *location;
+
+       DEBUGP("Applying relocate section %u to %u\n", relsec,
+              sechdrs[relsec].sh_info);
+       for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rel); i++) {
+               /* This is where to make the change */
+               location = (void *)sechdrs[sechdrs[relsec].sh_info].sh_addr
+                       + rel[i].r_offset;
+               /* This is the symbol it is referring to.  Note that all
+                  undefined symbols have been resolved.  */
+               sym = (Elf32_Sym *)sechdrs[symindex].sh_addr
+                       + ELF32_R_SYM(rel[i].r_info);
+
+               switch (ELF32_R_TYPE(rel[i].r_info)) {
+               case R_386_32:
+                       /* We add the value into the location given */
+                       *location += sym->st_value;
+                       break;
+               case R_386_PC32:
+                       /* Add the value, subtract its postition */
+                       *location += sym->st_value - (uint32_t)location;
+                       break;
+               default:
+                       printk(KERN_ERR "module %s: Unknown relocation: %u\n",
+                              me->name, ELF32_R_TYPE(rel[i].r_info));
+                       return -ENOEXEC;
+               }
+       }
+       return 0;
+}
+
+int apply_relocate_add(Elf32_Shdr *sechdrs,
+                      const char *strtab,
+                      unsigned int symindex,
+                      unsigned int relsec,
+                      struct module *me)
+{
+       printk(KERN_ERR "module %s: ADD RELOCATION unsupported\n",
+              me->name);
+       return -ENOEXEC;
+}
+
+int module_finalize(const Elf_Ehdr *hdr,
+                   const Elf_Shdr *sechdrs,
+                   struct module *me)
+{
+       const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL,
+               *para = NULL;
+       char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset;
+
+       for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) { 
+               if (!strcmp(".text", secstrings + s->sh_name))
+                       text = s;
+               if (!strcmp(".altinstructions", secstrings + s->sh_name))
+                       alt = s;
+               if (!strcmp(".smp_locks", secstrings + s->sh_name))
+                       locks= s;
+               if (!strcmp(".parainstructions", secstrings + s->sh_name))
+                       para = s;
+       }
+
+       if (alt) {
+               /* patch .altinstructions */
+               void *aseg = (void *)alt->sh_addr;
+               apply_alternatives(aseg, aseg + alt->sh_size);
+       }
+       if (locks && text) {
+               void *lseg = (void *)locks->sh_addr;
+               void *tseg = (void *)text->sh_addr;
+               alternatives_smp_module_add(me, me->name,
+                                           lseg, lseg + locks->sh_size,
+                                           tseg, tseg + text->sh_size);
+       }
+
+       if (para) {
+               void *pseg = (void *)para->sh_addr;
+               apply_paravirt(pseg, pseg + para->sh_size);
+       }
+
+       return module_bug_finalize(hdr, sechdrs, me);
+}
+
+void module_arch_cleanup(struct module *mod)
+{
+       alternatives_smp_module_del(mod);
+       module_bug_cleanup(mod);
+}
diff --git a/arch/x86/kernel/mpparse_32.c b/arch/x86/kernel/mpparse_32.c
new file mode 100644 (file)
index 0000000..13abb4e
--- /dev/null
@@ -0,0 +1,1132 @@
+/*
+ *     Intel Multiprocessor Specification 1.1 and 1.4
+ *     compliant MP-table parsing routines.
+ *
+ *     (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
+ *     (c) 1998, 1999, 2000 Ingo Molnar <mingo@redhat.com>
+ *
+ *     Fixes
+ *             Erich Boleyn    :       MP v1.4 and additional changes.
+ *             Alan Cox        :       Added EBDA scanning
+ *             Ingo Molnar     :       various cleanups and rewrites
+ *             Maciej W. Rozycki:      Bits for default MP configurations
+ *             Paul Diefenbaugh:       Added full ACPI support
+ */
+
+#include <linux/mm.h>
+#include <linux/init.h>
+#include <linux/acpi.h>
+#include <linux/delay.h>
+#include <linux/bootmem.h>
+#include <linux/kernel_stat.h>
+#include <linux/mc146818rtc.h>
+#include <linux/bitops.h>
+
+#include <asm/smp.h>
+#include <asm/acpi.h>
+#include <asm/mtrr.h>
+#include <asm/mpspec.h>
+#include <asm/io_apic.h>
+
+#include <mach_apic.h>
+#include <mach_apicdef.h>
+#include <mach_mpparse.h>
+#include <bios_ebda.h>
+
+/* Have we found an MP table */
+int smp_found_config;
+unsigned int __cpuinitdata maxcpus = NR_CPUS;
+
+/*
+ * Various Linux-internal data structures created from the
+ * MP-table.
+ */
+int apic_version [MAX_APICS];
+int mp_bus_id_to_type [MAX_MP_BUSSES];
+int mp_bus_id_to_node [MAX_MP_BUSSES];
+int mp_bus_id_to_local [MAX_MP_BUSSES];
+int quad_local_to_mp_bus_id [NR_CPUS/4][4];
+int mp_bus_id_to_pci_bus [MAX_MP_BUSSES] = { [0 ... MAX_MP_BUSSES-1] = -1 };
+static int mp_current_pci_id;
+
+/* I/O APIC entries */
+struct mpc_config_ioapic mp_ioapics[MAX_IO_APICS];
+
+/* # of MP IRQ source entries */
+struct mpc_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
+
+/* MP IRQ source entries */
+int mp_irq_entries;
+
+int nr_ioapics;
+
+int pic_mode;
+unsigned long mp_lapic_addr;
+
+unsigned int def_to_bigsmp = 0;
+
+/* Processor that is doing the boot up */
+unsigned int boot_cpu_physical_apicid = -1U;
+/* Internal processor count */
+unsigned int __cpuinitdata num_processors;
+
+/* Bitmask of physically existing CPUs */
+physid_mask_t phys_cpu_present_map;
+
+u8 bios_cpu_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };
+
+/*
+ * Intel MP BIOS table parsing routines:
+ */
+
+
+/*
+ * Checksum an MP configuration block.
+ */
+
+static int __init mpf_checksum(unsigned char *mp, int len)
+{
+       int sum = 0;
+
+       while (len--)
+               sum += *mp++;
+
+       return sum & 0xFF;
+}
+
+/*
+ * Have to match translation table entries to main table entries by counter
+ * hence the mpc_record variable .... can't see a less disgusting way of
+ * doing this ....
+ */
+
+static int mpc_record; 
+static struct mpc_config_translation *translation_table[MAX_MPC_ENTRY] __cpuinitdata;
+
+static void __cpuinit MP_processor_info (struct mpc_config_processor *m)
+{
+       int ver, apicid;
+       physid_mask_t phys_cpu;
+       
+       if (!(m->mpc_cpuflag & CPU_ENABLED))
+               return;
+
+       apicid = mpc_apic_id(m, translation_table[mpc_record]);
+
+       if (m->mpc_featureflag&(1<<0))
+               Dprintk("    Floating point unit present.\n");
+       if (m->mpc_featureflag&(1<<7))
+               Dprintk("    Machine Exception supported.\n");
+       if (m->mpc_featureflag&(1<<8))
+               Dprintk("    64 bit compare & exchange supported.\n");
+       if (m->mpc_featureflag&(1<<9))
+               Dprintk("    Internal APIC present.\n");
+       if (m->mpc_featureflag&(1<<11))
+               Dprintk("    SEP present.\n");
+       if (m->mpc_featureflag&(1<<12))
+               Dprintk("    MTRR  present.\n");
+       if (m->mpc_featureflag&(1<<13))
+               Dprintk("    PGE  present.\n");
+       if (m->mpc_featureflag&(1<<14))
+               Dprintk("    MCA  present.\n");
+       if (m->mpc_featureflag&(1<<15))
+               Dprintk("    CMOV  present.\n");
+       if (m->mpc_featureflag&(1<<16))
+               Dprintk("    PAT  present.\n");
+       if (m->mpc_featureflag&(1<<17))
+               Dprintk("    PSE  present.\n");
+       if (m->mpc_featureflag&(1<<18))
+               Dprintk("    PSN  present.\n");
+       if (m->mpc_featureflag&(1<<19))
+               Dprintk("    Cache Line Flush Instruction present.\n");
+       /* 20 Reserved */
+       if (m->mpc_featureflag&(1<<21))
+               Dprintk("    Debug Trace and EMON Store present.\n");
+       if (m->mpc_featureflag&(1<<22))
+               Dprintk("    ACPI Thermal Throttle Registers  present.\n");
+       if (m->mpc_featureflag&(1<<23))
+               Dprintk("    MMX  present.\n");
+       if (m->mpc_featureflag&(1<<24))
+               Dprintk("    FXSR  present.\n");
+       if (m->mpc_featureflag&(1<<25))
+               Dprintk("    XMM  present.\n");
+       if (m->mpc_featureflag&(1<<26))
+               Dprintk("    Willamette New Instructions  present.\n");
+       if (m->mpc_featureflag&(1<<27))
+               Dprintk("    Self Snoop  present.\n");
+       if (m->mpc_featureflag&(1<<28))
+               Dprintk("    HT  present.\n");
+       if (m->mpc_featureflag&(1<<29))
+               Dprintk("    Thermal Monitor present.\n");
+       /* 30, 31 Reserved */
+
+
+       if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) {
+               Dprintk("    Bootup CPU\n");
+               boot_cpu_physical_apicid = m->mpc_apicid;
+       }
+
+       ver = m->mpc_apicver;
+
+       /*
+        * Validate version
+        */
+       if (ver == 0x0) {
+               printk(KERN_WARNING "BIOS bug, APIC version is 0 for CPU#%d! "
+                               "fixing up to 0x10. (tell your hw vendor)\n",
+                               m->mpc_apicid);
+               ver = 0x10;
+       }
+       apic_version[m->mpc_apicid] = ver;
+
+       phys_cpu = apicid_to_cpu_present(apicid);
+       physids_or(phys_cpu_present_map, phys_cpu_present_map, phys_cpu);
+
+       if (num_processors >= NR_CPUS) {
+               printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached."
+                       "  Processor ignored.\n", NR_CPUS);
+               return;
+       }
+
+       if (num_processors >= maxcpus) {
+               printk(KERN_WARNING "WARNING: maxcpus limit of %i reached."
+                       " Processor ignored.\n", maxcpus);
+               return;
+       }
+
+       cpu_set(num_processors, cpu_possible_map);
+       num_processors++;
+
+       /*
+        * Would be preferable to switch to bigsmp when CONFIG_HOTPLUG_CPU=y
+        * but we need to work other dependencies like SMP_SUSPEND etc
+        * before this can be done without some confusion.
+        * if (CPU_HOTPLUG_ENABLED || num_processors > 8)
+        *       - Ashok Raj <ashok.raj@intel.com>
+        */
+       if (num_processors > 8) {
+               switch (boot_cpu_data.x86_vendor) {
+               case X86_VENDOR_INTEL:
+                       if (!APIC_XAPIC(ver)) {
+                               def_to_bigsmp = 0;
+                               break;
+                       }
+                       /* If P4 and above fall through */
+               case X86_VENDOR_AMD:
+                       def_to_bigsmp = 1;
+               }
+       }
+       bios_cpu_apicid[num_processors - 1] = m->mpc_apicid;
+}
+
+static void __init MP_bus_info (struct mpc_config_bus *m)
+{
+       char str[7];
+
+       memcpy(str, m->mpc_bustype, 6);
+       str[6] = 0;
+
+       mpc_oem_bus_info(m, str, translation_table[mpc_record]);
+
+#if MAX_MP_BUSSES < 256
+       if (m->mpc_busid >= MAX_MP_BUSSES) {
+               printk(KERN_WARNING "MP table busid value (%d) for bustype %s "
+                       " is too large, max. supported is %d\n",
+                       m->mpc_busid, str, MAX_MP_BUSSES - 1);
+               return;
+       }
+#endif
+
+       if (strncmp(str, BUSTYPE_ISA, sizeof(BUSTYPE_ISA)-1) == 0) {
+               mp_bus_id_to_type[m->mpc_busid] = MP_BUS_ISA;
+       } else if (strncmp(str, BUSTYPE_EISA, sizeof(BUSTYPE_EISA)-1) == 0) {
+               mp_bus_id_to_type[m->mpc_busid] = MP_BUS_EISA;
+       } else if (strncmp(str, BUSTYPE_PCI, sizeof(BUSTYPE_PCI)-1) == 0) {
+               mpc_oem_pci_bus(m, translation_table[mpc_record]);
+               mp_bus_id_to_type[m->mpc_busid] = MP_BUS_PCI;
+               mp_bus_id_to_pci_bus[m->mpc_busid] = mp_current_pci_id;
+               mp_current_pci_id++;
+       } else if (strncmp(str, BUSTYPE_MCA, sizeof(BUSTYPE_MCA)-1) == 0) {
+               mp_bus_id_to_type[m->mpc_busid] = MP_BUS_MCA;
+       } else {
+               printk(KERN_WARNING "Unknown bustype %s - ignoring\n", str);
+       }
+}
+
+static void __init MP_ioapic_info (struct mpc_config_ioapic *m)
+{
+       if (!(m->mpc_flags & MPC_APIC_USABLE))
+               return;
+
+       printk(KERN_INFO "I/O APIC #%d Version %d at 0x%lX.\n",
+               m->mpc_apicid, m->mpc_apicver, m->mpc_apicaddr);
+       if (nr_ioapics >= MAX_IO_APICS) {
+               printk(KERN_CRIT "Max # of I/O APICs (%d) exceeded (found %d).\n",
+                       MAX_IO_APICS, nr_ioapics);
+               panic("Recompile kernel with bigger MAX_IO_APICS!.\n");
+       }
+       if (!m->mpc_apicaddr) {
+               printk(KERN_ERR "WARNING: bogus zero I/O APIC address"
+                       " found in MP table, skipping!\n");
+               return;
+       }
+       mp_ioapics[nr_ioapics] = *m;
+       nr_ioapics++;
+}
+
+static void __init MP_intsrc_info (struct mpc_config_intsrc *m)
+{
+       mp_irqs [mp_irq_entries] = *m;
+       Dprintk("Int: type %d, pol %d, trig %d, bus %d,"
+               " IRQ %02x, APIC ID %x, APIC INT %02x\n",
+                       m->mpc_irqtype, m->mpc_irqflag & 3,
+                       (m->mpc_irqflag >> 2) & 3, m->mpc_srcbus,
+                       m->mpc_srcbusirq, m->mpc_dstapic, m->mpc_dstirq);
+       if (++mp_irq_entries == MAX_IRQ_SOURCES)
+               panic("Max # of irq sources exceeded!!\n");
+}
+
+static void __init MP_lintsrc_info (struct mpc_config_lintsrc *m)
+{
+       Dprintk("Lint: type %d, pol %d, trig %d, bus %d,"
+               " IRQ %02x, APIC ID %x, APIC LINT %02x\n",
+                       m->mpc_irqtype, m->mpc_irqflag & 3,
+                       (m->mpc_irqflag >> 2) &3, m->mpc_srcbusid,
+                       m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint);
+}
+
+#ifdef CONFIG_X86_NUMAQ
+static void __init MP_translation_info (struct mpc_config_translation *m)
+{
+       printk(KERN_INFO "Translation: record %d, type %d, quad %d, global %d, local %d\n", mpc_record, m->trans_type, m->trans_quad, m->trans_global, m->trans_local);
+
+       if (mpc_record >= MAX_MPC_ENTRY) 
+               printk(KERN_ERR "MAX_MPC_ENTRY exceeded!\n");
+       else
+               translation_table[mpc_record] = m; /* stash this for later */
+       if (m->trans_quad < MAX_NUMNODES && !node_online(m->trans_quad))
+               node_set_online(m->trans_quad);
+}
+
+/*
+ * Read/parse the MPC oem tables
+ */
+
+static void __init smp_read_mpc_oem(struct mp_config_oemtable *oemtable, \
+       unsigned short oemsize)
+{
+       int count = sizeof (*oemtable); /* the header size */
+       unsigned char *oemptr = ((unsigned char *)oemtable)+count;
+       
+       mpc_record = 0;
+       printk(KERN_INFO "Found an OEM MPC table at %8p - parsing it ... \n", oemtable);
+       if (memcmp(oemtable->oem_signature,MPC_OEM_SIGNATURE,4))
+       {
+               printk(KERN_WARNING "SMP mpc oemtable: bad signature [%c%c%c%c]!\n",
+                       oemtable->oem_signature[0],
+                       oemtable->oem_signature[1],
+                       oemtable->oem_signature[2],
+                       oemtable->oem_signature[3]);
+               return;
+       }
+       if (mpf_checksum((unsigned char *)oemtable,oemtable->oem_length))
+       {
+               printk(KERN_WARNING "SMP oem mptable: checksum error!\n");
+               return;
+       }
+       while (count < oemtable->oem_length) {
+               switch (*oemptr) {
+                       case MP_TRANSLATION:
+                       {
+                               struct mpc_config_translation *m=
+                                       (struct mpc_config_translation *)oemptr;
+                               MP_translation_info(m);
+                               oemptr += sizeof(*m);
+                               count += sizeof(*m);
+                               ++mpc_record;
+                               break;
+                       }
+                       default:
+                       {
+                               printk(KERN_WARNING "Unrecognised OEM table entry type! - %d\n", (int) *oemptr);
+                               return;
+                       }
+               }
+       }
+}
+
+static inline void mps_oem_check(struct mp_config_table *mpc, char *oem,
+               char *productid)
+{
+       if (strncmp(oem, "IBM NUMA", 8))
+               printk("Warning!  May not be a NUMA-Q system!\n");
+       if (mpc->mpc_oemptr)
+               smp_read_mpc_oem((struct mp_config_oemtable *) mpc->mpc_oemptr,
+                               mpc->mpc_oemsize);
+}
+#endif /* CONFIG_X86_NUMAQ */
+
+/*
+ * Read/parse the MPC
+ */
+
+static int __init smp_read_mpc(struct mp_config_table *mpc)
+{
+       char str[16];
+       char oem[10];
+       int count=sizeof(*mpc);
+       unsigned char *mpt=((unsigned char *)mpc)+count;
+
+       if (memcmp(mpc->mpc_signature,MPC_SIGNATURE,4)) {
+               printk(KERN_ERR "SMP mptable: bad signature [0x%x]!\n",
+                       *(u32 *)mpc->mpc_signature);
+               return 0;
+       }
+       if (mpf_checksum((unsigned char *)mpc,mpc->mpc_length)) {
+               printk(KERN_ERR "SMP mptable: checksum error!\n");
+               return 0;
+       }
+       if (mpc->mpc_spec!=0x01 && mpc->mpc_spec!=0x04) {
+               printk(KERN_ERR "SMP mptable: bad table version (%d)!!\n",
+                       mpc->mpc_spec);
+               return 0;
+       }
+       if (!mpc->mpc_lapic) {
+               printk(KERN_ERR "SMP mptable: null local APIC address!\n");
+               return 0;
+       }
+       memcpy(oem,mpc->mpc_oem,8);
+       oem[8]=0;
+       printk(KERN_INFO "OEM ID: %s ",oem);
+
+       memcpy(str,mpc->mpc_productid,12);
+       str[12]=0;
+       printk("Product ID: %s ",str);
+
+       mps_oem_check(mpc, oem, str);
+
+       printk("APIC at: 0x%lX\n",mpc->mpc_lapic);
+
+       /* 
+        * Save the local APIC address (it might be non-default) -- but only
+        * if we're not using ACPI.
+        */
+       if (!acpi_lapic)
+               mp_lapic_addr = mpc->mpc_lapic;
+
+       /*
+        *      Now process the configuration blocks.
+        */
+       mpc_record = 0;
+       while (count < mpc->mpc_length) {
+               switch(*mpt) {
+                       case MP_PROCESSOR:
+                       {
+                               struct mpc_config_processor *m=
+                                       (struct mpc_config_processor *)mpt;
+                               /* ACPI may have already provided this data */
+                               if (!acpi_lapic)
+                                       MP_processor_info(m);
+                               mpt += sizeof(*m);
+                               count += sizeof(*m);
+                               break;
+                       }
+                       case MP_BUS:
+                       {
+                               struct mpc_config_bus *m=
+                                       (struct mpc_config_bus *)mpt;
+                               MP_bus_info(m);
+                               mpt += sizeof(*m);
+                               count += sizeof(*m);
+                               break;
+                       }
+                       case MP_IOAPIC:
+                       {
+                               struct mpc_config_ioapic *m=
+                                       (struct mpc_config_ioapic *)mpt;
+                               MP_ioapic_info(m);
+                               mpt+=sizeof(*m);
+                               count+=sizeof(*m);
+                               break;
+                       }
+                       case MP_INTSRC:
+                       {
+                               struct mpc_config_intsrc *m=
+                                       (struct mpc_config_intsrc *)mpt;
+
+                               MP_intsrc_info(m);
+                               mpt+=sizeof(*m);
+                               count+=sizeof(*m);
+                               break;
+                       }
+                       case MP_LINTSRC:
+                       {
+                               struct mpc_config_lintsrc *m=
+                                       (struct mpc_config_lintsrc *)mpt;
+                               MP_lintsrc_info(m);
+                               mpt+=sizeof(*m);
+                               count+=sizeof(*m);
+                               break;
+                       }
+                       default:
+                       {
+                               count = mpc->mpc_length;
+                               break;
+                       }
+               }
+               ++mpc_record;
+       }
+       setup_apic_routing();
+       if (!num_processors)
+               printk(KERN_ERR "SMP mptable: no processors registered!\n");
+       return num_processors;
+}
+
+static int __init ELCR_trigger(unsigned int irq)
+{
+       unsigned int port;
+
+       port = 0x4d0 + (irq >> 3);
+       return (inb(port) >> (irq & 7)) & 1;
+}
+
+static void __init construct_default_ioirq_mptable(int mpc_default_type)
+{
+       struct mpc_config_intsrc intsrc;
+       int i;
+       int ELCR_fallback = 0;
+
+       intsrc.mpc_type = MP_INTSRC;
+       intsrc.mpc_irqflag = 0;                 /* conforming */
+       intsrc.mpc_srcbus = 0;
+       intsrc.mpc_dstapic = mp_ioapics[0].mpc_apicid;
+
+       intsrc.mpc_irqtype = mp_INT;
+
+       /*
+        *  If true, we have an ISA/PCI system with no IRQ entries
+        *  in the MP table. To prevent the PCI interrupts from being set up
+        *  incorrectly, we try to use the ELCR. The sanity check to see if
+        *  there is good ELCR data is very simple - IRQ0, 1, 2 and 13 can
+        *  never be level sensitive, so we simply see if the ELCR agrees.
+        *  If it does, we assume it's valid.
+        */
+       if (mpc_default_type == 5) {
+               printk(KERN_INFO "ISA/PCI bus type with no IRQ information... falling back to ELCR\n");
+
+               if (ELCR_trigger(0) || ELCR_trigger(1) || ELCR_trigger(2) || ELCR_trigger(13))
+                       printk(KERN_WARNING "ELCR contains invalid data... not using ELCR\n");
+               else {
+                       printk(KERN_INFO "Using ELCR to identify PCI interrupts\n");
+                       ELCR_fallback = 1;
+               }
+       }
+
+       for (i = 0; i < 16; i++) {
+               switch (mpc_default_type) {
+               case 2:
+                       if (i == 0 || i == 13)
+                               continue;       /* IRQ0 & IRQ13 not connected */
+                       /* fall through */
+               default:
+                       if (i == 2)
+                               continue;       /* IRQ2 is never connected */
+               }
+
+               if (ELCR_fallback) {
+                       /*
+                        *  If the ELCR indicates a level-sensitive interrupt, we
+                        *  copy that information over to the MP table in the
+                        *  irqflag field (level sensitive, active high polarity).
+                        */
+                       if (ELCR_trigger(i))
+                               intsrc.mpc_irqflag = 13;
+                       else
+                               intsrc.mpc_irqflag = 0;
+               }
+
+               intsrc.mpc_srcbusirq = i;
+               intsrc.mpc_dstirq = i ? i : 2;          /* IRQ0 to INTIN2 */
+               MP_intsrc_info(&intsrc);
+       }
+
+       intsrc.mpc_irqtype = mp_ExtINT;
+       intsrc.mpc_srcbusirq = 0;
+       intsrc.mpc_dstirq = 0;                          /* 8259A to INTIN0 */
+       MP_intsrc_info(&intsrc);
+}
+
+static inline void __init construct_default_ISA_mptable(int mpc_default_type)
+{
+       struct mpc_config_processor processor;
+       struct mpc_config_bus bus;
+       struct mpc_config_ioapic ioapic;
+       struct mpc_config_lintsrc lintsrc;
+       int linttypes[2] = { mp_ExtINT, mp_NMI };
+       int i;
+
+       /*
+        * local APIC has default address
+        */
+       mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
+
+       /*
+        * 2 CPUs, numbered 0 & 1.
+        */
+       processor.mpc_type = MP_PROCESSOR;
+       /* Either an integrated APIC or a discrete 82489DX. */
+       processor.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
+       processor.mpc_cpuflag = CPU_ENABLED;
+       processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) |
+                                  (boot_cpu_data.x86_model << 4) |
+                                  boot_cpu_data.x86_mask;
+       processor.mpc_featureflag = boot_cpu_data.x86_capability[0];
+       processor.mpc_reserved[0] = 0;
+       processor.mpc_reserved[1] = 0;
+       for (i = 0; i < 2; i++) {
+               processor.mpc_apicid = i;
+               MP_processor_info(&processor);
+       }
+
+       bus.mpc_type = MP_BUS;
+       bus.mpc_busid = 0;
+       switch (mpc_default_type) {
+               default:
+                       printk("???\n");
+                       printk(KERN_ERR "Unknown standard configuration %d\n",
+                               mpc_default_type);
+                       /* fall through */
+               case 1:
+               case 5:
+                       memcpy(bus.mpc_bustype, "ISA   ", 6);
+                       break;
+               case 2:
+               case 6:
+               case 3:
+                       memcpy(bus.mpc_bustype, "EISA  ", 6);
+                       break;
+               case 4:
+               case 7:
+                       memcpy(bus.mpc_bustype, "MCA   ", 6);
+       }
+       MP_bus_info(&bus);
+       if (mpc_default_type > 4) {
+               bus.mpc_busid = 1;
+               memcpy(bus.mpc_bustype, "PCI   ", 6);
+               MP_bus_info(&bus);
+       }
+
+       ioapic.mpc_type = MP_IOAPIC;
+       ioapic.mpc_apicid = 2;
+       ioapic.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
+       ioapic.mpc_flags = MPC_APIC_USABLE;
+       ioapic.mpc_apicaddr = 0xFEC00000;
+       MP_ioapic_info(&ioapic);
+
+       /*
+        * We set up most of the low 16 IO-APIC pins according to MPS rules.
+        */
+       construct_default_ioirq_mptable(mpc_default_type);
+
+       lintsrc.mpc_type = MP_LINTSRC;
+       lintsrc.mpc_irqflag = 0;                /* conforming */
+       lintsrc.mpc_srcbusid = 0;
+       lintsrc.mpc_srcbusirq = 0;
+       lintsrc.mpc_destapic = MP_APIC_ALL;
+       for (i = 0; i < 2; i++) {
+               lintsrc.mpc_irqtype = linttypes[i];
+               lintsrc.mpc_destapiclint = i;
+               MP_lintsrc_info(&lintsrc);
+       }
+}
+
+static struct intel_mp_floating *mpf_found;
+
+/*
+ * Scan the memory blocks for an SMP configuration block.
+ */
+void __init get_smp_config (void)
+{
+       struct intel_mp_floating *mpf = mpf_found;
+
+       /*
+        * ACPI supports both logical (e.g. Hyper-Threading) and physical 
+        * processors, where MPS only supports physical.
+        */
+       if (acpi_lapic && acpi_ioapic) {
+               printk(KERN_INFO "Using ACPI (MADT) for SMP configuration information\n");
+               return;
+       }
+       else if (acpi_lapic)
+               printk(KERN_INFO "Using ACPI for processor (LAPIC) configuration information\n");
+
+       printk(KERN_INFO "Intel MultiProcessor Specification v1.%d\n", mpf->mpf_specification);
+       if (mpf->mpf_feature2 & (1<<7)) {
+               printk(KERN_INFO "    IMCR and PIC compatibility mode.\n");
+               pic_mode = 1;
+       } else {
+               printk(KERN_INFO "    Virtual Wire compatibility mode.\n");
+               pic_mode = 0;
+       }
+
+       /*
+        * Now see if we need to read further.
+        */
+       if (mpf->mpf_feature1 != 0) {
+
+               printk(KERN_INFO "Default MP configuration #%d\n", mpf->mpf_feature1);
+               construct_default_ISA_mptable(mpf->mpf_feature1);
+
+       } else if (mpf->mpf_physptr) {
+
+               /*
+                * Read the physical hardware table.  Anything here will
+                * override the defaults.
+                */
+               if (!smp_read_mpc(phys_to_virt(mpf->mpf_physptr))) {
+                       smp_found_config = 0;
+                       printk(KERN_ERR "BIOS bug, MP table errors detected!...\n");
+                       printk(KERN_ERR "... disabling SMP support. (tell your hw vendor)\n");
+                       return;
+               }
+               /*
+                * If there are no explicit MP IRQ entries, then we are
+                * broken.  We set up most of the low 16 IO-APIC pins to
+                * ISA defaults and hope it will work.
+                */
+               if (!mp_irq_entries) {
+                       struct mpc_config_bus bus;
+
+                       printk(KERN_ERR "BIOS bug, no explicit IRQ entries, using default mptable. (tell your hw vendor)\n");
+
+                       bus.mpc_type = MP_BUS;
+                       bus.mpc_busid = 0;
+                       memcpy(bus.mpc_bustype, "ISA   ", 6);
+                       MP_bus_info(&bus);
+
+                       construct_default_ioirq_mptable(0);
+               }
+
+       } else
+               BUG();
+
+       printk(KERN_INFO "Processors: %d\n", num_processors);
+       /*
+        * Only use the first configuration found.
+        */
+}
+
+static int __init smp_scan_config (unsigned long base, unsigned long length)
+{
+       unsigned long *bp = phys_to_virt(base);
+       struct intel_mp_floating *mpf;
+
+       Dprintk("Scan SMP from %p for %ld bytes.\n", bp,length);
+       if (sizeof(*mpf) != 16)
+               printk("Error: MPF size\n");
+
+       while (length > 0) {
+               mpf = (struct intel_mp_floating *)bp;
+               if ((*bp == SMP_MAGIC_IDENT) &&
+                       (mpf->mpf_length == 1) &&
+                       !mpf_checksum((unsigned char *)bp, 16) &&
+                       ((mpf->mpf_specification == 1)
+                               || (mpf->mpf_specification == 4)) ) {
+
+                       smp_found_config = 1;
+                       printk(KERN_INFO "found SMP MP-table at %08lx\n",
+                                               virt_to_phys(mpf));
+                       reserve_bootmem(virt_to_phys(mpf), PAGE_SIZE);
+                       if (mpf->mpf_physptr) {
+                               /*
+                                * We cannot access to MPC table to compute
+                                * table size yet, as only few megabytes from
+                                * the bottom is mapped now.
+                                * PC-9800's MPC table places on the very last
+                                * of physical memory; so that simply reserving
+                                * PAGE_SIZE from mpg->mpf_physptr yields BUG()
+                                * in reserve_bootmem.
+                                */
+                               unsigned long size = PAGE_SIZE;
+                               unsigned long end = max_low_pfn * PAGE_SIZE;
+                               if (mpf->mpf_physptr + size > end)
+                                       size = end - mpf->mpf_physptr;
+                               reserve_bootmem(mpf->mpf_physptr, size);
+                       }
+
+                       mpf_found = mpf;
+                       return 1;
+               }
+               bp += 4;
+               length -= 16;
+       }
+       return 0;
+}
+
+void __init find_smp_config (void)
+{
+       unsigned int address;
+
+       /*
+        * FIXME: Linux assumes you have 640K of base ram..
+        * this continues the error...
+        *
+        * 1) Scan the bottom 1K for a signature
+        * 2) Scan the top 1K of base RAM
+        * 3) Scan the 64K of bios
+        */
+       if (smp_scan_config(0x0,0x400) ||
+               smp_scan_config(639*0x400,0x400) ||
+                       smp_scan_config(0xF0000,0x10000))
+               return;
+       /*
+        * If it is an SMP machine we should know now, unless the
+        * configuration is in an EISA/MCA bus machine with an
+        * extended bios data area.
+        *
+        * there is a real-mode segmented pointer pointing to the
+        * 4K EBDA area at 0x40E, calculate and scan it here.
+        *
+        * NOTE! There are Linux loaders that will corrupt the EBDA
+        * area, and as such this kind of SMP config may be less
+        * trustworthy, simply because the SMP table may have been
+        * stomped on during early boot. These loaders are buggy and
+        * should be fixed.
+        *
+        * MP1.4 SPEC states to only scan first 1K of 4K EBDA.
+        */
+
+       address = get_bios_ebda();
+       if (address)
+               smp_scan_config(address, 0x400);
+}
+
+int es7000_plat;
+
+/* --------------------------------------------------------------------------
+                            ACPI-based MP Configuration
+   -------------------------------------------------------------------------- */
+
+#ifdef CONFIG_ACPI
+
+void __init mp_register_lapic_address(u64 address)
+{
+       mp_lapic_addr = (unsigned long) address;
+
+       set_fixmap_nocache(FIX_APIC_BASE, mp_lapic_addr);
+
+       if (boot_cpu_physical_apicid == -1U)
+               boot_cpu_physical_apicid = GET_APIC_ID(apic_read(APIC_ID));
+
+       Dprintk("Boot CPU = %d\n", boot_cpu_physical_apicid);
+}
+
+void __cpuinit mp_register_lapic (u8 id, u8 enabled)
+{
+       struct mpc_config_processor processor;
+       int boot_cpu = 0;
+       
+       if (MAX_APICS - id <= 0) {
+               printk(KERN_WARNING "Processor #%d invalid (max %d)\n",
+                       id, MAX_APICS);
+               return;
+       }
+
+       if (id == boot_cpu_physical_apicid)
+               boot_cpu = 1;
+
+       processor.mpc_type = MP_PROCESSOR;
+       processor.mpc_apicid = id;
+       processor.mpc_apicver = GET_APIC_VERSION(apic_read(APIC_LVR));
+       processor.mpc_cpuflag = (enabled ? CPU_ENABLED : 0);
+       processor.mpc_cpuflag |= (boot_cpu ? CPU_BOOTPROCESSOR : 0);
+       processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) | 
+               (boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_mask;
+       processor.mpc_featureflag = boot_cpu_data.x86_capability[0];
+       processor.mpc_reserved[0] = 0;
+       processor.mpc_reserved[1] = 0;
+
+       MP_processor_info(&processor);
+}
+
+#ifdef CONFIG_X86_IO_APIC
+
+#define MP_ISA_BUS             0
+#define MP_MAX_IOAPIC_PIN      127
+
+static struct mp_ioapic_routing {
+       int                     apic_id;
+       int                     gsi_base;
+       int                     gsi_end;
+       u32                     pin_programmed[4];
+} mp_ioapic_routing[MAX_IO_APICS];
+
+static int mp_find_ioapic (int gsi)
+{
+       int i = 0;
+
+       /* Find the IOAPIC that manages this GSI. */
+       for (i = 0; i < nr_ioapics; i++) {
+               if ((gsi >= mp_ioapic_routing[i].gsi_base)
+                       && (gsi <= mp_ioapic_routing[i].gsi_end))
+                       return i;
+       }
+
+       printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi);
+
+       return -1;
+}
+
+void __init mp_register_ioapic(u8 id, u32 address, u32 gsi_base)
+{
+       int idx = 0;
+       int tmpid;
+
+       if (nr_ioapics >= MAX_IO_APICS) {
+               printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded "
+                       "(found %d)\n", MAX_IO_APICS, nr_ioapics);
+               panic("Recompile kernel with bigger MAX_IO_APICS!\n");
+       }
+       if (!address) {
+               printk(KERN_ERR "WARNING: Bogus (zero) I/O APIC address"
+                       " found in MADT table, skipping!\n");
+               return;
+       }
+
+       idx = nr_ioapics++;
+
+       mp_ioapics[idx].mpc_type = MP_IOAPIC;
+       mp_ioapics[idx].mpc_flags = MPC_APIC_USABLE;
+       mp_ioapics[idx].mpc_apicaddr = address;
+
+       set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
+       if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
+               && !APIC_XAPIC(apic_version[boot_cpu_physical_apicid]))
+               tmpid = io_apic_get_unique_id(idx, id);
+       else
+               tmpid = id;
+       if (tmpid == -1) {
+               nr_ioapics--;
+               return;
+       }
+       mp_ioapics[idx].mpc_apicid = tmpid;
+       mp_ioapics[idx].mpc_apicver = io_apic_get_version(idx);
+       
+       /* 
+        * Build basic GSI lookup table to facilitate gsi->io_apic lookups
+        * and to prevent reprogramming of IOAPIC pins (PCI GSIs).
+        */
+       mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].mpc_apicid;
+       mp_ioapic_routing[idx].gsi_base = gsi_base;
+       mp_ioapic_routing[idx].gsi_end = gsi_base + 
+               io_apic_get_redir_entries(idx);
+
+       printk("IOAPIC[%d]: apic_id %d, version %d, address 0x%lx, "
+               "GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid, 
+               mp_ioapics[idx].mpc_apicver, mp_ioapics[idx].mpc_apicaddr,
+               mp_ioapic_routing[idx].gsi_base,
+               mp_ioapic_routing[idx].gsi_end);
+}
+
+void __init
+mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi)
+{
+       struct mpc_config_intsrc intsrc;
+       int                     ioapic = -1;
+       int                     pin = -1;
+
+       /* 
+        * Convert 'gsi' to 'ioapic.pin'.
+        */
+       ioapic = mp_find_ioapic(gsi);
+       if (ioapic < 0)
+               return;
+       pin = gsi - mp_ioapic_routing[ioapic].gsi_base;
+
+       /*
+        * TBD: This check is for faulty timer entries, where the override
+        *      erroneously sets the trigger to level, resulting in a HUGE 
+        *      increase of timer interrupts!
+        */
+       if ((bus_irq == 0) && (trigger == 3))
+               trigger = 1;
+
+       intsrc.mpc_type = MP_INTSRC;
+       intsrc.mpc_irqtype = mp_INT;
+       intsrc.mpc_irqflag = (trigger << 2) | polarity;
+       intsrc.mpc_srcbus = MP_ISA_BUS;
+       intsrc.mpc_srcbusirq = bus_irq;                                /* IRQ */
+       intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid;        /* APIC ID */
+       intsrc.mpc_dstirq = pin;                                    /* INTIN# */
+
+       Dprintk("Int: type %d, pol %d, trig %d, bus %d, irq %d, %d-%d\n",
+               intsrc.mpc_irqtype, intsrc.mpc_irqflag & 3, 
+               (intsrc.mpc_irqflag >> 2) & 3, intsrc.mpc_srcbus, 
+               intsrc.mpc_srcbusirq, intsrc.mpc_dstapic, intsrc.mpc_dstirq);
+
+       mp_irqs[mp_irq_entries] = intsrc;
+       if (++mp_irq_entries == MAX_IRQ_SOURCES)
+               panic("Max # of irq sources exceeded!\n");
+}
+
+void __init mp_config_acpi_legacy_irqs (void)
+{
+       struct mpc_config_intsrc intsrc;
+       int i = 0;
+       int ioapic = -1;
+
+       /* 
+        * Fabricate the legacy ISA bus (bus #31).
+        */
+       mp_bus_id_to_type[MP_ISA_BUS] = MP_BUS_ISA;
+       Dprintk("Bus #%d is ISA\n", MP_ISA_BUS);
+
+       /*
+        * Older generations of ES7000 have no legacy identity mappings
+        */
+       if (es7000_plat == 1)
+               return;
+
+       /* 
+        * Locate the IOAPIC that manages the ISA IRQs (0-15). 
+        */
+       ioapic = mp_find_ioapic(0);
+       if (ioapic < 0)
+               return;
+
+       intsrc.mpc_type = MP_INTSRC;
+       intsrc.mpc_irqflag = 0;                                 /* Conforming */
+       intsrc.mpc_srcbus = MP_ISA_BUS;
+       intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid;
+
+       /* 
+        * Use the default configuration for the IRQs 0-15.  Unless
+        * overriden by (MADT) interrupt source override entries.
+        */
+       for (i = 0; i < 16; i++) {
+               int idx;
+
+               for (idx = 0; idx < mp_irq_entries; idx++) {
+                       struct mpc_config_intsrc *irq = mp_irqs + idx;
+
+                       /* Do we already have a mapping for this ISA IRQ? */
+                       if (irq->mpc_srcbus == MP_ISA_BUS && irq->mpc_srcbusirq == i)
+                               break;
+
+                       /* Do we already have a mapping for this IOAPIC pin */
+                       if ((irq->mpc_dstapic == intsrc.mpc_dstapic) &&
+                               (irq->mpc_dstirq == i))
+                               break;
+               }
+
+               if (idx != mp_irq_entries) {
+                       printk(KERN_DEBUG "ACPI: IRQ%d used by override.\n", i);
+                       continue;                       /* IRQ already used */
+               }
+
+               intsrc.mpc_irqtype = mp_INT;
+               intsrc.mpc_srcbusirq = i;                  /* Identity mapped */
+               intsrc.mpc_dstirq = i;
+
+               Dprintk("Int: type %d, pol %d, trig %d, bus %d, irq %d, "
+                       "%d-%d\n", intsrc.mpc_irqtype, intsrc.mpc_irqflag & 3, 
+                       (intsrc.mpc_irqflag >> 2) & 3, intsrc.mpc_srcbus, 
+                       intsrc.mpc_srcbusirq, intsrc.mpc_dstapic, 
+                       intsrc.mpc_dstirq);
+
+               mp_irqs[mp_irq_entries] = intsrc;
+               if (++mp_irq_entries == MAX_IRQ_SOURCES)
+                       panic("Max # of irq sources exceeded!\n");
+       }
+}
+
+#define MAX_GSI_NUM    4096
+
+int mp_register_gsi(u32 gsi, int triggering, int polarity)
+{
+       int ioapic = -1;
+       int ioapic_pin = 0;
+       int idx, bit = 0;
+       static int pci_irq = 16;
+       /*
+        * Mapping between Global System Interrups, which
+        * represent all possible interrupts, and IRQs
+        * assigned to actual devices.
+        */
+       static int              gsi_to_irq[MAX_GSI_NUM];
+
+       /* Don't set up the ACPI SCI because it's already set up */
+       if (acpi_gbl_FADT.sci_interrupt == gsi)
+               return gsi;
+
+       ioapic = mp_find_ioapic(gsi);
+       if (ioapic < 0) {
+               printk(KERN_WARNING "No IOAPIC for GSI %u\n", gsi);
+               return gsi;
+       }
+
+       ioapic_pin = gsi - mp_ioapic_routing[ioapic].gsi_base;
+
+       if (ioapic_renumber_irq)
+               gsi = ioapic_renumber_irq(ioapic, gsi);
+
+       /* 
+        * Avoid pin reprogramming.  PRTs typically include entries  
+        * with redundant pin->gsi mappings (but unique PCI devices);
+        * we only program the IOAPIC on the first.
+        */
+       bit = ioapic_pin % 32;
+       idx = (ioapic_pin < 32) ? 0 : (ioapic_pin / 32);
+       if (idx > 3) {
+               printk(KERN_ERR "Invalid reference to IOAPIC pin "
+                       "%d-%d\n", mp_ioapic_routing[ioapic].apic_id, 
+                       ioapic_pin);
+               return gsi;
+       }
+       if ((1<<bit) & mp_ioapic_routing[ioapic].pin_programmed[idx]) {
+               Dprintk(KERN_DEBUG "Pin %d-%d already programmed\n",
+                       mp_ioapic_routing[ioapic].apic_id, ioapic_pin);
+               return gsi_to_irq[gsi];
+       }
+
+       mp_ioapic_routing[ioapic].pin_programmed[idx] |= (1<<bit);
+
+       if (triggering == ACPI_LEVEL_SENSITIVE) {
+               /*
+                * For PCI devices assign IRQs in order, avoiding gaps
+                * due to unused I/O APIC pins.
+                */
+               int irq = gsi;
+               if (gsi < MAX_GSI_NUM) {
+                       /*
+                        * Retain the VIA chipset work-around (gsi > 15), but
+                        * avoid a problem where the 8254 timer (IRQ0) is setup
+                        * via an override (so it's not on pin 0 of the ioapic),
+                        * and at the same time, the pin 0 interrupt is a PCI
+                        * type.  The gsi > 15 test could cause these two pins
+                        * to be shared as IRQ0, and they are not shareable.
+                        * So test for this condition, and if necessary, avoid
+                        * the pin collision.
+                        */
+                       if (gsi > 15 || (gsi == 0 && !timer_uses_ioapic_pin_0))
+                               gsi = pci_irq++;
+                       /*
+                        * Don't assign IRQ used by ACPI SCI
+                        */
+                       if (gsi == acpi_gbl_FADT.sci_interrupt)
+                               gsi = pci_irq++;
+                       gsi_to_irq[irq] = gsi;
+               } else {
+                       printk(KERN_ERR "GSI %u is too high\n", gsi);
+                       return gsi;
+               }
+       }
+
+       io_apic_set_pci_routing(ioapic, ioapic_pin, gsi,
+                   triggering == ACPI_EDGE_SENSITIVE ? 0 : 1,
+                   polarity == ACPI_ACTIVE_HIGH ? 0 : 1);
+       return gsi;
+}
+
+#endif /* CONFIG_X86_IO_APIC */
+#endif /* CONFIG_ACPI */
diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c
new file mode 100644 (file)
index 0000000..0c1069b
--- /dev/null
@@ -0,0 +1,224 @@
+/* ----------------------------------------------------------------------- *
+ *   
+ *   Copyright 2000 H. Peter Anvin - All Rights Reserved
+ *
+ *   This program is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation, Inc., 675 Mass Ave, Cambridge MA 02139,
+ *   USA; either version 2 of the License, or (at your option) any later
+ *   version; incorporated herein by reference.
+ *
+ * ----------------------------------------------------------------------- */
+
+/*
+ * msr.c
+ *
+ * x86 MSR access device
+ *
+ * This device is accessed by lseek() to the appropriate register number
+ * and then read/write in chunks of 8 bytes.  A larger size means multiple
+ * reads or writes of the same register.
+ *
+ * This driver uses /dev/cpu/%d/msr where %d is the minor number, and on
+ * an SMP box will direct the access to CPU %d.
+ */
+
+#include <linux/module.h>
+
+#include <linux/types.h>
+#include <linux/errno.h>
+#include <linux/fcntl.h>
+#include <linux/init.h>
+#include <linux/poll.h>
+#include <linux/smp.h>
+#include <linux/smp_lock.h>
+#include <linux/major.h>
+#include <linux/fs.h>
+#include <linux/device.h>
+#include <linux/cpu.h>
+#include <linux/notifier.h>
+
+#include <asm/processor.h>
+#include <asm/msr.h>
+#include <asm/uaccess.h>
+#include <asm/system.h>
+
+static struct class *msr_class;
+
+static loff_t msr_seek(struct file *file, loff_t offset, int orig)
+{
+       loff_t ret = -EINVAL;
+
+       lock_kernel();
+       switch (orig) {
+       case 0:
+               file->f_pos = offset;
+               ret = file->f_pos;
+               break;
+       case 1:
+               file->f_pos += offset;
+               ret = file->f_pos;
+       }
+       unlock_kernel();
+       return ret;
+}
+
+static ssize_t msr_read(struct file *file, char __user * buf,
+                       size_t count, loff_t * ppos)
+{
+       u32 __user *tmp = (u32 __user *) buf;
+       u32 data[2];
+       u32 reg = *ppos;
+       int cpu = iminor(file->f_path.dentry->d_inode);
+       int err;
+
+       if (count % 8)
+               return -EINVAL; /* Invalid chunk size */
+
+       for (; count; count -= 8) {
+               err = rdmsr_safe_on_cpu(cpu, reg, &data[0], &data[1]);
+               if (err)
+                       return -EIO;
+               if (copy_to_user(tmp, &data, 8))
+                       return -EFAULT;
+               tmp += 2;
+       }
+
+       return ((char __user *)tmp) - buf;
+}
+
+static ssize_t msr_write(struct file *file, const char __user *buf,
+                        size_t count, loff_t *ppos)
+{
+       const u32 __user *tmp = (const u32 __user *)buf;
+       u32 data[2];
+       u32 reg = *ppos;
+       int cpu = iminor(file->f_path.dentry->d_inode);
+       int err;
+
+       if (count % 8)
+               return -EINVAL; /* Invalid chunk size */
+
+       for (; count; count -= 8) {
+               if (copy_from_user(&data, tmp, 8))
+                       return -EFAULT;
+               err = wrmsr_safe_on_cpu(cpu, reg, data[0], data[1]);
+               if (err)
+                       return -EIO;
+               tmp += 2;
+       }
+
+       return ((char __user *)tmp) - buf;
+}
+
+static int msr_open(struct inode *inode, struct file *file)
+{
+       unsigned int cpu = iminor(file->f_path.dentry->d_inode);
+       struct cpuinfo_x86 *c = &(cpu_data)[cpu];
+
+       if (cpu >= NR_CPUS || !cpu_online(cpu))
+               return -ENXIO;  /* No such CPU */
+       if (!cpu_has(c, X86_FEATURE_MSR))
+               return -EIO;    /* MSR not supported */
+
+       return 0;
+}
+
+/*
+ * File operations we support
+ */
+static const struct file_operations msr_fops = {
+       .owner = THIS_MODULE,
+       .llseek = msr_seek,
+       .read = msr_read,
+       .write = msr_write,
+       .open = msr_open,
+};
+
+static int msr_device_create(int i)
+{
+       int err = 0;
+       struct device *dev;
+
+       dev = device_create(msr_class, NULL, MKDEV(MSR_MAJOR, i), "msr%d",i);
+       if (IS_ERR(dev))
+               err = PTR_ERR(dev);
+       return err;
+}
+
+static int msr_class_cpu_callback(struct notifier_block *nfb,
+                               unsigned long action, void *hcpu)
+{
+       unsigned int cpu = (unsigned long)hcpu;
+
+       switch (action) {
+       case CPU_ONLINE:
+       case CPU_ONLINE_FROZEN:
+               msr_device_create(cpu);
+               break;
+       case CPU_DEAD:
+       case CPU_DEAD_FROZEN:
+               device_destroy(msr_class, MKDEV(MSR_MAJOR, cpu));
+               break;
+       }
+       return NOTIFY_OK;
+}
+
+static struct notifier_block __cpuinitdata msr_class_cpu_notifier =
+{
+       .notifier_call = msr_class_cpu_callback,
+};
+
+static int __init msr_init(void)
+{
+       int i, err = 0;
+       i = 0;
+
+       if (register_chrdev(MSR_MAJOR, "cpu/msr", &msr_fops)) {
+               printk(KERN_ERR "msr: unable to get major %d for msr\n",
+                      MSR_MAJOR);
+               err = -EBUSY;
+               goto out;
+       }
+       msr_class = class_create(THIS_MODULE, "msr");
+       if (IS_ERR(msr_class)) {
+               err = PTR_ERR(msr_class);
+               goto out_chrdev;
+       }
+       for_each_online_cpu(i) {
+               err = msr_device_create(i);
+               if (err != 0)
+                       goto out_class;
+       }
+       register_hotcpu_notifier(&msr_class_cpu_notifier);
+
+       err = 0;
+       goto out;
+
+out_class:
+       i = 0;
+       for_each_online_cpu(i)
+               device_destroy(msr_class, MKDEV(MSR_MAJOR, i));
+       class_destroy(msr_class);
+out_chrdev:
+       unregister_chrdev(MSR_MAJOR, "cpu/msr");
+out:
+       return err;
+}
+
+static void __exit msr_exit(void)
+{
+       int cpu = 0;
+       for_each_online_cpu(cpu)
+               device_destroy(msr_class, MKDEV(MSR_MAJOR, cpu));
+       class_destroy(msr_class);
+       unregister_chrdev(MSR_MAJOR, "cpu/msr");
+       unregister_hotcpu_notifier(&msr_class_cpu_notifier);
+}
+
+module_init(msr_init);
+module_exit(msr_exit)
+
+MODULE_AUTHOR("H. Peter Anvin <hpa@zytor.com>");
+MODULE_DESCRIPTION("x86 generic MSR driver");
+MODULE_LICENSE("GPL");
diff --git a/arch/x86/kernel/nmi_32.c b/arch/x86/kernel/nmi_32.c
new file mode 100644 (file)
index 0000000..c7227e2
--- /dev/null
@@ -0,0 +1,468 @@
+/*
+ *  linux/arch/i386/nmi.c
+ *
+ *  NMI watchdog support on APIC systems
+ *
+ *  Started by Ingo Molnar <mingo@redhat.com>
+ *
+ *  Fixes:
+ *  Mikael Pettersson  : AMD K7 support for local APIC NMI watchdog.
+ *  Mikael Pettersson  : Power Management for local APIC NMI watchdog.
+ *  Mikael Pettersson  : Pentium 4 support for local APIC NMI watchdog.
+ *  Pavel Machek and
+ *  Mikael Pettersson  : PM converted to driver model. Disable/enable API.
+ */
+
+#include <linux/delay.h>
+#include <linux/interrupt.h>
+#include <linux/module.h>
+#include <linux/nmi.h>
+#include <linux/sysdev.h>
+#include <linux/sysctl.h>
+#include <linux/percpu.h>
+#include <linux/kprobes.h>
+#include <linux/cpumask.h>
+#include <linux/kernel_stat.h>
+#include <linux/kdebug.h>
+
+#include <asm/smp.h>
+#include <asm/nmi.h>
+
+#include "mach_traps.h"
+
+int unknown_nmi_panic;
+int nmi_watchdog_enabled;
+
+static cpumask_t backtrace_mask = CPU_MASK_NONE;
+
+/* nmi_active:
+ * >0: the lapic NMI watchdog is active, but can be disabled
+ * <0: the lapic NMI watchdog has not been set up, and cannot
+ *     be enabled
+ *  0: the lapic NMI watchdog is disabled, but can be enabled
+ */
+atomic_t nmi_active = ATOMIC_INIT(0);          /* oprofile uses this */
+
+unsigned int nmi_watchdog = NMI_DEFAULT;
+static unsigned int nmi_hz = HZ;
+
+static DEFINE_PER_CPU(short, wd_enabled);
+
+/* local prototypes */
+static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu);
+
+static int endflag __initdata = 0;
+
+#ifdef CONFIG_SMP
+/* The performance counters used by NMI_LOCAL_APIC don't trigger when
+ * the CPU is idle. To make sure the NMI watchdog really ticks on all
+ * CPUs during the test make them busy.
+ */
+static __init void nmi_cpu_busy(void *data)
+{
+       local_irq_enable_in_hardirq();
+       /* Intentionally don't use cpu_relax here. This is
+          to make sure that the performance counter really ticks,
+          even if there is a simulator or similar that catches the
+          pause instruction. On a real HT machine this is fine because
+          all other CPUs are busy with "useless" delay loops and don't
+          care if they get somewhat less cycles. */
+       while (endflag == 0)
+               mb();
+}
+#endif
+
+static int __init check_nmi_watchdog(void)
+{
+       unsigned int *prev_nmi_count;
+       int cpu;
+
+       if ((nmi_watchdog == NMI_NONE) || (nmi_watchdog == NMI_DISABLED))
+               return 0;
+
+       if (!atomic_read(&nmi_active))
+               return 0;
+
+       prev_nmi_count = kmalloc(NR_CPUS * sizeof(int), GFP_KERNEL);
+       if (!prev_nmi_count)
+               return -1;
+
+       printk(KERN_INFO "Testing NMI watchdog ... ");
+
+       if (nmi_watchdog == NMI_LOCAL_APIC)
+               smp_call_function(nmi_cpu_busy, (void *)&endflag, 0, 0);
+
+       for_each_possible_cpu(cpu)
+               prev_nmi_count[cpu] = per_cpu(irq_stat, cpu).__nmi_count;
+       local_irq_enable();
+       mdelay((20*1000)/nmi_hz); // wait 20 ticks
+
+       for_each_possible_cpu(cpu) {
+#ifdef CONFIG_SMP
+               /* Check cpu_callin_map here because that is set
+                  after the timer is started. */
+               if (!cpu_isset(cpu, cpu_callin_map))
+                       continue;
+#endif
+               if (!per_cpu(wd_enabled, cpu))
+                       continue;
+               if (nmi_count(cpu) - prev_nmi_count[cpu] <= 5) {
+                       printk("CPU#%d: NMI appears to be stuck (%d->%d)!\n",
+                               cpu,
+                               prev_nmi_count[cpu],
+                               nmi_count(cpu));
+                       per_cpu(wd_enabled, cpu) = 0;
+                       atomic_dec(&nmi_active);
+               }
+       }
+       endflag = 1;
+       if (!atomic_read(&nmi_active)) {
+               kfree(prev_nmi_count);
+               atomic_set(&nmi_active, -1);
+               return -1;
+       }
+       printk("OK.\n");
+
+       /* now that we know it works we can reduce NMI frequency to
+          something more reasonable; makes a difference in some configs */
+       if (nmi_watchdog == NMI_LOCAL_APIC)
+               nmi_hz = lapic_adjust_nmi_hz(1);
+
+       kfree(prev_nmi_count);
+       return 0;
+}
+/* This needs to happen later in boot so counters are working */
+late_initcall(check_nmi_watchdog);
+
+static int __init setup_nmi_watchdog(char *str)
+{
+       int nmi;
+
+       get_option(&str, &nmi);
+
+       if ((nmi >= NMI_INVALID) || (nmi < NMI_NONE))
+               return 0;
+
+       nmi_watchdog = nmi;
+       return 1;
+}
+
+__setup("nmi_watchdog=", setup_nmi_watchdog);
+
+
+/* Suspend/resume support */
+
+#ifdef CONFIG_PM
+
+static int nmi_pm_active; /* nmi_active before suspend */
+
+static int lapic_nmi_suspend(struct sys_device *dev, pm_message_t state)
+{
+       /* only CPU0 goes here, other CPUs should be offline */
+       nmi_pm_active = atomic_read(&nmi_active);
+       stop_apic_nmi_watchdog(NULL);
+       BUG_ON(atomic_read(&nmi_active) != 0);
+       return 0;
+}
+
+static int lapic_nmi_resume(struct sys_device *dev)
+{
+       /* only CPU0 goes here, other CPUs should be offline */
+       if (nmi_pm_active > 0) {
+               setup_apic_nmi_watchdog(NULL);
+               touch_nmi_watchdog();
+       }
+       return 0;
+}
+
+
+static struct sysdev_class nmi_sysclass = {
+       set_kset_name("lapic_nmi"),
+       .resume         = lapic_nmi_resume,
+       .suspend        = lapic_nmi_suspend,
+};
+
+static struct sys_device device_lapic_nmi = {
+       .id     = 0,
+       .cls    = &nmi_sysclass,
+};
+
+static int __init init_lapic_nmi_sysfs(void)
+{
+       int error;
+
+       /* should really be a BUG_ON but b/c this is an
+        * init call, it just doesn't work.  -dcz
+        */
+       if (nmi_watchdog != NMI_LOCAL_APIC)
+               return 0;
+
+       if (atomic_read(&nmi_active) < 0)
+               return 0;
+
+       error = sysdev_class_register(&nmi_sysclass);
+       if (!error)
+               error = sysdev_register(&device_lapic_nmi);
+       return error;
+}
+/* must come after the local APIC's device_initcall() */
+late_initcall(init_lapic_nmi_sysfs);
+
+#endif /* CONFIG_PM */
+
+static void __acpi_nmi_enable(void *__unused)
+{
+       apic_write_around(APIC_LVT0, APIC_DM_NMI);
+}
+
+/*
+ * Enable timer based NMIs on all CPUs:
+ */
+void acpi_nmi_enable(void)
+{
+       if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC)
+               on_each_cpu(__acpi_nmi_enable, NULL, 0, 1);
+}
+
+static void __acpi_nmi_disable(void *__unused)
+{
+       apic_write(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED);
+}
+
+/*
+ * Disable timer based NMIs on all CPUs:
+ */
+void acpi_nmi_disable(void)
+{
+       if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC)
+               on_each_cpu(__acpi_nmi_disable, NULL, 0, 1);
+}
+
+void setup_apic_nmi_watchdog (void *unused)
+{
+       if (__get_cpu_var(wd_enabled))
+               return;
+
+       /* cheap hack to support suspend/resume */
+       /* if cpu0 is not active neither should the other cpus */
+       if ((smp_processor_id() != 0) && (atomic_read(&nmi_active) <= 0))
+               return;
+
+       switch (nmi_watchdog) {
+       case NMI_LOCAL_APIC:
+               __get_cpu_var(wd_enabled) = 1; /* enable it before to avoid race with handler */
+               if (lapic_watchdog_init(nmi_hz) < 0) {
+                       __get_cpu_var(wd_enabled) = 0;
+                       return;
+               }
+               /* FALL THROUGH */
+       case NMI_IO_APIC:
+               __get_cpu_var(wd_enabled) = 1;
+               atomic_inc(&nmi_active);
+       }
+}
+
+void stop_apic_nmi_watchdog(void *unused)
+{
+       /* only support LOCAL and IO APICs for now */
+       if ((nmi_watchdog != NMI_LOCAL_APIC) &&
+           (nmi_watchdog != NMI_IO_APIC))
+               return;
+       if (__get_cpu_var(wd_enabled) == 0)
+               return;
+       if (nmi_watchdog == NMI_LOCAL_APIC)
+               lapic_watchdog_stop();
+       __get_cpu_var(wd_enabled) = 0;
+       atomic_dec(&nmi_active);
+}
+
+/*
+ * the best way to detect whether a CPU has a 'hard lockup' problem
+ * is to check it's local APIC timer IRQ counts. If they are not
+ * changing then that CPU has some problem.
+ *
+ * as these watchdog NMI IRQs are generated on every CPU, we only
+ * have to check the current processor.
+ *
+ * since NMIs don't listen to _any_ locks, we have to be extremely
+ * careful not to rely on unsafe variables. The printk might lock
+ * up though, so we have to break up any console locks first ...
+ * [when there will be more tty-related locks, break them up
+ *  here too!]
+ */
+
+static unsigned int
+       last_irq_sums [NR_CPUS],
+       alert_counter [NR_CPUS];
+
+void touch_nmi_watchdog(void)
+{
+       if (nmi_watchdog > 0) {
+               unsigned cpu;
+
+               /*
+                * Just reset the alert counters, (other CPUs might be
+                * spinning on locks we hold):
+                */
+               for_each_present_cpu(cpu) {
+                       if (alert_counter[cpu])
+                               alert_counter[cpu] = 0;
+               }
+       }
+
+       /*
+        * Tickle the softlockup detector too:
+        */
+       touch_softlockup_watchdog();
+}
+EXPORT_SYMBOL(touch_nmi_watchdog);
+
+extern void die_nmi(struct pt_regs *, const char *msg);
+
+__kprobes int nmi_watchdog_tick(struct pt_regs * regs, unsigned reason)
+{
+
+       /*
+        * Since current_thread_info()-> is always on the stack, and we
+        * always switch the stack NMI-atomically, it's safe to use
+        * smp_processor_id().
+        */
+       unsigned int sum;
+       int touched = 0;
+       int cpu = smp_processor_id();
+       int rc=0;
+
+       /* check for other users first */
+       if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT)
+                       == NOTIFY_STOP) {
+               rc = 1;
+               touched = 1;
+       }
+
+       if (cpu_isset(cpu, backtrace_mask)) {
+               static DEFINE_SPINLOCK(lock);   /* Serialise the printks */
+
+               spin_lock(&lock);
+               printk("NMI backtrace for cpu %d\n", cpu);
+               dump_stack();
+               spin_unlock(&lock);
+               cpu_clear(cpu, backtrace_mask);
+       }
+
+       /*
+        * Take the local apic timer and PIT/HPET into account. We don't
+        * know which one is active, when we have highres/dyntick on
+        */
+       sum = per_cpu(irq_stat, cpu).apic_timer_irqs + kstat_cpu(cpu).irqs[0];
+
+       /* if the none of the timers isn't firing, this cpu isn't doing much */
+       if (!touched && last_irq_sums[cpu] == sum) {
+               /*
+                * Ayiee, looks like this CPU is stuck ...
+                * wait a few IRQs (5 seconds) before doing the oops ...
+                */
+               alert_counter[cpu]++;
+               if (alert_counter[cpu] == 5*nmi_hz)
+                       /*
+                        * die_nmi will return ONLY if NOTIFY_STOP happens..
+                        */
+                       die_nmi(regs, "BUG: NMI Watchdog detected LOCKUP");
+       } else {
+               last_irq_sums[cpu] = sum;
+               alert_counter[cpu] = 0;
+       }
+       /* see if the nmi watchdog went off */
+       if (!__get_cpu_var(wd_enabled))
+               return rc;
+       switch (nmi_watchdog) {
+       case NMI_LOCAL_APIC:
+               rc |= lapic_wd_event(nmi_hz);
+               break;
+       case NMI_IO_APIC:
+               /* don't know how to accurately check for this.
+                * just assume it was a watchdog timer interrupt
+                * This matches the old behaviour.
+                */
+               rc = 1;
+               break;
+       }
+       return rc;
+}
+
+int do_nmi_callback(struct pt_regs * regs, int cpu)
+{
+#ifdef CONFIG_SYSCTL
+       if (unknown_nmi_panic)
+               return unknown_nmi_panic_callback(regs, cpu);
+#endif
+       return 0;
+}
+
+#ifdef CONFIG_SYSCTL
+
+static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu)
+{
+       unsigned char reason = get_nmi_reason();
+       char buf[64];
+
+       sprintf(buf, "NMI received for unknown reason %02x\n", reason);
+       die_nmi(regs, buf);
+       return 0;
+}
+
+/*
+ * proc handler for /proc/sys/kernel/nmi
+ */
+int proc_nmi_enabled(struct ctl_table *table, int write, struct file *file,
+                       void __user *buffer, size_t *length, loff_t *ppos)
+{
+       int old_state;
+
+       nmi_watchdog_enabled = (atomic_read(&nmi_active) > 0) ? 1 : 0;
+       old_state = nmi_watchdog_enabled;
+       proc_dointvec(table, write, file, buffer, length, ppos);
+       if (!!old_state == !!nmi_watchdog_enabled)
+               return 0;
+
+       if (atomic_read(&nmi_active) < 0 || nmi_watchdog == NMI_DISABLED) {
+               printk( KERN_WARNING "NMI watchdog is permanently disabled\n");
+               return -EIO;
+       }
+
+       if (nmi_watchdog == NMI_DEFAULT) {
+               if (lapic_watchdog_ok())
+                       nmi_watchdog = NMI_LOCAL_APIC;
+               else
+                       nmi_watchdog = NMI_IO_APIC;
+       }
+
+       if (nmi_watchdog == NMI_LOCAL_APIC) {
+               if (nmi_watchdog_enabled)
+                       enable_lapic_nmi_watchdog();
+               else
+                       disable_lapic_nmi_watchdog();
+       } else {
+               printk( KERN_WARNING
+                       "NMI watchdog doesn't know what hardware to touch\n");
+               return -EIO;
+       }
+       return 0;
+}
+
+#endif
+
+void __trigger_all_cpu_backtrace(void)
+{
+       int i;
+
+       backtrace_mask = cpu_online_map;
+       /* Wait for up to 10 seconds for all CPUs to do the backtrace */
+       for (i = 0; i < 10 * 1000; i++) {
+               if (cpus_empty(backtrace_mask))
+                       break;
+               mdelay(1);
+       }
+}
+
+EXPORT_SYMBOL(nmi_active);
+EXPORT_SYMBOL(nmi_watchdog);
diff --git a/arch/x86/kernel/numaq_32.c b/arch/x86/kernel/numaq_32.c
new file mode 100644 (file)
index 0000000..9000d82
--- /dev/null
@@ -0,0 +1,89 @@
+/*
+ * Written by: Patricia Gaughen, IBM Corporation
+ *
+ * Copyright (C) 2002, IBM Corp.
+ *
+ * All rights reserved.          
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT.  See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * Send feedback to <gone@us.ibm.com>
+ */
+
+#include <linux/mm.h>
+#include <linux/bootmem.h>
+#include <linux/mmzone.h>
+#include <linux/module.h>
+#include <linux/nodemask.h>
+#include <asm/numaq.h>
+#include <asm/topology.h>
+#include <asm/processor.h>
+
+#define        MB_TO_PAGES(addr) ((addr) << (20 - PAGE_SHIFT))
+
+/*
+ * Function: smp_dump_qct()
+ *
+ * Description: gets memory layout from the quad config table.  This
+ * function also updates node_online_map with the nodes (quads) present.
+ */
+static void __init smp_dump_qct(void)
+{
+       int node;
+       struct eachquadmem *eq;
+       struct sys_cfg_data *scd =
+               (struct sys_cfg_data *)__va(SYS_CFG_DATA_PRIV_ADDR);
+
+       nodes_clear(node_online_map);
+       for_each_node(node) {
+               if (scd->quads_present31_0 & (1 << node)) {
+                       node_set_online(node);
+                       eq = &scd->eq[node];
+                       /* Convert to pages */
+                       node_start_pfn[node] = MB_TO_PAGES(
+                               eq->hi_shrd_mem_start - eq->priv_mem_size);
+                       node_end_pfn[node] = MB_TO_PAGES(
+                               eq->hi_shrd_mem_start + eq->hi_shrd_mem_size);
+
+                       memory_present(node,
+                               node_start_pfn[node], node_end_pfn[node]);
+                       node_remap_size[node] = node_memmap_size_bytes(node,
+                                                       node_start_pfn[node],
+                                                       node_end_pfn[node]);
+               }
+       }
+}
+
+/*
+ * Unlike Summit, we don't really care to let the NUMA-Q
+ * fall back to flat mode.  Don't compile for NUMA-Q
+ * unless you really need it!
+ */
+int __init get_memcfg_numaq(void)
+{
+       smp_dump_qct();
+       return 1;
+}
+
+static int __init numaq_tsc_disable(void)
+{
+       if (num_online_nodes() > 1) {
+               printk(KERN_DEBUG "NUMAQ: disabling TSC\n");
+               tsc_disable = 1;
+       }
+       return 0;
+}
+arch_initcall(numaq_tsc_disable);
diff --git a/arch/x86/kernel/paravirt_32.c b/arch/x86/kernel/paravirt_32.c
new file mode 100644 (file)
index 0000000..739cfb2
--- /dev/null
@@ -0,0 +1,392 @@
+/*  Paravirtualization interfaces
+    Copyright (C) 2006 Rusty Russell IBM Corporation
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program; if not, write to the Free Software
+    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+*/
+#include <linux/errno.h>
+#include <linux/module.h>
+#include <linux/efi.h>
+#include <linux/bcd.h>
+#include <linux/highmem.h>
+
+#include <asm/bug.h>
+#include <asm/paravirt.h>
+#include <asm/desc.h>
+#include <asm/setup.h>
+#include <asm/arch_hooks.h>
+#include <asm/time.h>
+#include <asm/irq.h>
+#include <asm/delay.h>
+#include <asm/fixmap.h>
+#include <asm/apic.h>
+#include <asm/tlbflush.h>
+#include <asm/timer.h>
+
+/* nop stub */
+void _paravirt_nop(void)
+{
+}
+
+static void __init default_banner(void)
+{
+       printk(KERN_INFO "Booting paravirtualized kernel on %s\n",
+              paravirt_ops.name);
+}
+
+char *memory_setup(void)
+{
+       return paravirt_ops.memory_setup();
+}
+
+/* Simple instruction patching code. */
+#define DEF_NATIVE(name, code)                                 \
+       extern const char start_##name[], end_##name[];         \
+       asm("start_" #name ": " code "; end_" #name ":")
+
+DEF_NATIVE(irq_disable, "cli");
+DEF_NATIVE(irq_enable, "sti");
+DEF_NATIVE(restore_fl, "push %eax; popf");
+DEF_NATIVE(save_fl, "pushf; pop %eax");
+DEF_NATIVE(iret, "iret");
+DEF_NATIVE(irq_enable_sysexit, "sti; sysexit");
+DEF_NATIVE(read_cr2, "mov %cr2, %eax");
+DEF_NATIVE(write_cr3, "mov %eax, %cr3");
+DEF_NATIVE(read_cr3, "mov %cr3, %eax");
+DEF_NATIVE(clts, "clts");
+DEF_NATIVE(read_tsc, "rdtsc");
+
+DEF_NATIVE(ud2a, "ud2a");
+
+static unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
+                            unsigned long addr, unsigned len)
+{
+       const unsigned char *start, *end;
+       unsigned ret;
+
+       switch(type) {
+#define SITE(x)        case PARAVIRT_PATCH(x): start = start_##x; end = end_##x; goto patch_site
+               SITE(irq_disable);
+               SITE(irq_enable);
+               SITE(restore_fl);
+               SITE(save_fl);
+               SITE(iret);
+               SITE(irq_enable_sysexit);
+               SITE(read_cr2);
+               SITE(read_cr3);
+               SITE(write_cr3);
+               SITE(clts);
+               SITE(read_tsc);
+#undef SITE
+
+       patch_site:
+               ret = paravirt_patch_insns(ibuf, len, start, end);
+               break;
+
+       case PARAVIRT_PATCH(make_pgd):
+       case PARAVIRT_PATCH(make_pte):
+       case PARAVIRT_PATCH(pgd_val):
+       case PARAVIRT_PATCH(pte_val):
+#ifdef CONFIG_X86_PAE
+       case PARAVIRT_PATCH(make_pmd):
+       case PARAVIRT_PATCH(pmd_val):
+#endif
+               /* These functions end up returning exactly what
+                  they're passed, in the same registers. */
+               ret = paravirt_patch_nop();
+               break;
+
+       default:
+               ret = paravirt_patch_default(type, clobbers, ibuf, addr, len);
+               break;
+       }
+
+       return ret;
+}
+
+unsigned paravirt_patch_nop(void)
+{
+       return 0;
+}
+
+unsigned paravirt_patch_ignore(unsigned len)
+{
+       return len;
+}
+
+struct branch {
+       unsigned char opcode;
+       u32 delta;
+} __attribute__((packed));
+
+unsigned paravirt_patch_call(void *insnbuf,
+                            const void *target, u16 tgt_clobbers,
+                            unsigned long addr, u16 site_clobbers,
+                            unsigned len)
+{
+       struct branch *b = insnbuf;
+       unsigned long delta = (unsigned long)target - (addr+5);
+
+       if (tgt_clobbers & ~site_clobbers)
+               return len;     /* target would clobber too much for this site */
+       if (len < 5)
+               return len;     /* call too long for patch site */
+
+       b->opcode = 0xe8; /* call */
+       b->delta = delta;
+       BUILD_BUG_ON(sizeof(*b) != 5);
+
+       return 5;
+}
+
+unsigned paravirt_patch_jmp(const void *target, void *insnbuf,
+                           unsigned long addr, unsigned len)
+{
+       struct branch *b = insnbuf;
+       unsigned long delta = (unsigned long)target - (addr+5);
+
+       if (len < 5)
+               return len;     /* call too long for patch site */
+
+       b->opcode = 0xe9;       /* jmp */
+       b->delta = delta;
+
+       return 5;
+}
+
+unsigned paravirt_patch_default(u8 type, u16 clobbers, void *insnbuf,
+                               unsigned long addr, unsigned len)
+{
+       void *opfunc = *((void **)&paravirt_ops + type);
+       unsigned ret;
+
+       if (opfunc == NULL)
+               /* If there's no function, patch it with a ud2a (BUG) */
+               ret = paravirt_patch_insns(insnbuf, len, start_ud2a, end_ud2a);
+       else if (opfunc == paravirt_nop)
+               /* If the operation is a nop, then nop the callsite */
+               ret = paravirt_patch_nop();
+       else if (type == PARAVIRT_PATCH(iret) ||
+                type == PARAVIRT_PATCH(irq_enable_sysexit))
+               /* If operation requires a jmp, then jmp */
+               ret = paravirt_patch_jmp(opfunc, insnbuf, addr, len);
+       else
+               /* Otherwise call the function; assume target could
+                  clobber any caller-save reg */
+               ret = paravirt_patch_call(insnbuf, opfunc, CLBR_ANY,
+                                         addr, clobbers, len);
+
+       return ret;
+}
+
+unsigned paravirt_patch_insns(void *insnbuf, unsigned len,
+                             const char *start, const char *end)
+{
+       unsigned insn_len = end - start;
+
+       if (insn_len > len || start == NULL)
+               insn_len = len;
+       else
+               memcpy(insnbuf, start, insn_len);
+
+       return insn_len;
+}
+
+void init_IRQ(void)
+{
+       paravirt_ops.init_IRQ();
+}
+
+static void native_flush_tlb(void)
+{
+       __native_flush_tlb();
+}
+
+/*
+ * Global pages have to be flushed a bit differently. Not a real
+ * performance problem because this does not happen often.
+ */
+static void native_flush_tlb_global(void)
+{
+       __native_flush_tlb_global();
+}
+
+static void native_flush_tlb_single(unsigned long addr)
+{
+       __native_flush_tlb_single(addr);
+}
+
+/* These are in entry.S */
+extern void native_iret(void);
+extern void native_irq_enable_sysexit(void);
+
+static int __init print_banner(void)
+{
+       paravirt_ops.banner();
+       return 0;
+}
+core_initcall(print_banner);
+
+static struct resource reserve_ioports = {
+       .start = 0,
+       .end = IO_SPACE_LIMIT,
+       .name = "paravirt-ioport",
+       .flags = IORESOURCE_IO | IORESOURCE_BUSY,
+};
+
+static struct resource reserve_iomem = {
+       .start = 0,
+       .end = -1,
+       .name = "paravirt-iomem",
+       .flags = IORESOURCE_MEM | IORESOURCE_BUSY,
+};
+
+/*
+ * Reserve the whole legacy IO space to prevent any legacy drivers
+ * from wasting time probing for their hardware.  This is a fairly
+ * brute-force approach to disabling all non-virtual drivers.
+ *
+ * Note that this must be called very early to have any effect.
+ */
+int paravirt_disable_iospace(void)
+{
+       int ret;
+
+       ret = request_resource(&ioport_resource, &reserve_ioports);
+       if (ret == 0) {
+               ret = request_resource(&iomem_resource, &reserve_iomem);
+               if (ret)
+                       release_resource(&reserve_ioports);
+       }
+
+       return ret;
+}
+
+struct paravirt_ops paravirt_ops = {
+       .name = "bare hardware",
+       .paravirt_enabled = 0,
+       .kernel_rpl = 0,
+       .shared_kernel_pmd = 1, /* Only used when CONFIG_X86_PAE is set */
+
+       .patch = native_patch,
+       .banner = default_banner,
+       .arch_setup = paravirt_nop,
+       .memory_setup = machine_specific_memory_setup,
+       .get_wallclock = native_get_wallclock,
+       .set_wallclock = native_set_wallclock,
+       .time_init = hpet_time_init,
+       .init_IRQ = native_init_IRQ,
+
+       .cpuid = native_cpuid,
+       .get_debugreg = native_get_debugreg,
+       .set_debugreg = native_set_debugreg,
+       .clts = native_clts,
+       .read_cr0 = native_read_cr0,
+       .write_cr0 = native_write_cr0,
+       .read_cr2 = native_read_cr2,
+       .write_cr2 = native_write_cr2,
+       .read_cr3 = native_read_cr3,
+       .write_cr3 = native_write_cr3,
+       .read_cr4 = native_read_cr4,
+       .read_cr4_safe = native_read_cr4_safe,
+       .write_cr4 = native_write_cr4,
+       .save_fl = native_save_fl,
+       .restore_fl = native_restore_fl,
+       .irq_disable = native_irq_disable,
+       .irq_enable = native_irq_enable,
+       .safe_halt = native_safe_halt,
+       .halt = native_halt,
+       .wbinvd = native_wbinvd,
+       .read_msr = native_read_msr_safe,
+       .write_msr = native_write_msr_safe,
+       .read_tsc = native_read_tsc,
+       .read_pmc = native_read_pmc,
+       .sched_clock = native_sched_clock,
+       .get_cpu_khz = native_calculate_cpu_khz,
+       .load_tr_desc = native_load_tr_desc,
+       .set_ldt = native_set_ldt,
+       .load_gdt = native_load_gdt,
+       .load_idt = native_load_idt,
+       .store_gdt = native_store_gdt,
+       .store_idt = native_store_idt,
+       .store_tr = native_store_tr,
+       .load_tls = native_load_tls,
+       .write_ldt_entry = write_dt_entry,
+       .write_gdt_entry = write_dt_entry,
+       .write_idt_entry = write_dt_entry,
+       .load_esp0 = native_load_esp0,
+
+       .set_iopl_mask = native_set_iopl_mask,
+       .io_delay = native_io_delay,
+
+#ifdef CONFIG_X86_LOCAL_APIC
+       .apic_write = native_apic_write,
+       .apic_write_atomic = native_apic_write_atomic,
+       .apic_read = native_apic_read,
+       .setup_boot_clock = setup_boot_APIC_clock,
+       .setup_secondary_clock = setup_secondary_APIC_clock,
+       .startup_ipi_hook = paravirt_nop,
+#endif
+       .set_lazy_mode = paravirt_nop,
+
+       .pagetable_setup_start = native_pagetable_setup_start,
+       .pagetable_setup_done = native_pagetable_setup_done,
+
+       .flush_tlb_user = native_flush_tlb,
+       .flush_tlb_kernel = native_flush_tlb_global,
+       .flush_tlb_single = native_flush_tlb_single,
+       .flush_tlb_others = native_flush_tlb_others,
+
+       .alloc_pt = paravirt_nop,
+       .alloc_pd = paravirt_nop,
+       .alloc_pd_clone = paravirt_nop,
+       .release_pt = paravirt_nop,
+       .release_pd = paravirt_nop,
+
+       .set_pte = native_set_pte,
+       .set_pte_at = native_set_pte_at,
+       .set_pmd = native_set_pmd,
+       .pte_update = paravirt_nop,
+       .pte_update_defer = paravirt_nop,
+
+#ifdef CONFIG_HIGHPTE
+       .kmap_atomic_pte = kmap_atomic,
+#endif
+
+#ifdef CONFIG_X86_PAE
+       .set_pte_atomic = native_set_pte_atomic,
+       .set_pte_present = native_set_pte_present,
+       .set_pud = native_set_pud,
+       .pte_clear = native_pte_clear,
+       .pmd_clear = native_pmd_clear,
+
+       .pmd_val = native_pmd_val,
+       .make_pmd = native_make_pmd,
+#endif
+
+       .pte_val = native_pte_val,
+       .pgd_val = native_pgd_val,
+
+       .make_pte = native_make_pte,
+       .make_pgd = native_make_pgd,
+
+       .irq_enable_sysexit = native_irq_enable_sysexit,
+       .iret = native_iret,
+
+       .dup_mmap = paravirt_nop,
+       .exit_mmap = paravirt_nop,
+       .activate_mm = paravirt_nop,
+};
+
+EXPORT_SYMBOL(paravirt_ops);
diff --git a/arch/x86/kernel/pci-dma_32.c b/arch/x86/kernel/pci-dma_32.c
new file mode 100644 (file)
index 0000000..048f09b
--- /dev/null
@@ -0,0 +1,177 @@
+/*
+ * Dynamic DMA mapping support.
+ *
+ * On i386 there is no hardware dynamic DMA address translation,
+ * so consistent alloc/free are merely page allocation/freeing.
+ * The rest of the dynamic DMA mapping interface is implemented
+ * in asm/pci.h.
+ */
+
+#include <linux/types.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/pci.h>
+#include <linux/module.h>
+#include <linux/pci.h>
+#include <asm/io.h>
+
+struct dma_coherent_mem {
+       void            *virt_base;
+       u32             device_base;
+       int             size;
+       int             flags;
+       unsigned long   *bitmap;
+};
+
+void *dma_alloc_coherent(struct device *dev, size_t size,
+                          dma_addr_t *dma_handle, gfp_t gfp)
+{
+       void *ret;
+       struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL;
+       int order = get_order(size);
+       /* ignore region specifiers */
+       gfp &= ~(__GFP_DMA | __GFP_HIGHMEM);
+
+       if (mem) {
+               int page = bitmap_find_free_region(mem->bitmap, mem->size,
+                                                    order);
+               if (page >= 0) {
+                       *dma_handle = mem->device_base + (page << PAGE_SHIFT);
+                       ret = mem->virt_base + (page << PAGE_SHIFT);
+                       memset(ret, 0, size);
+                       return ret;
+               }
+               if (mem->flags & DMA_MEMORY_EXCLUSIVE)
+                       return NULL;
+       }
+
+       if (dev == NULL || (dev->coherent_dma_mask < 0xffffffff))
+               gfp |= GFP_DMA;
+
+       ret = (void *)__get_free_pages(gfp, order);
+
+       if (ret != NULL) {
+               memset(ret, 0, size);
+               *dma_handle = virt_to_phys(ret);
+       }
+       return ret;
+}
+EXPORT_SYMBOL(dma_alloc_coherent);
+
+void dma_free_coherent(struct device *dev, size_t size,
+                        void *vaddr, dma_addr_t dma_handle)
+{
+       struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL;
+       int order = get_order(size);
+       
+       if (mem && vaddr >= mem->virt_base && vaddr < (mem->virt_base + (mem->size << PAGE_SHIFT))) {
+               int page = (vaddr - mem->virt_base) >> PAGE_SHIFT;
+
+               bitmap_release_region(mem->bitmap, page, order);
+       } else
+               free_pages((unsigned long)vaddr, order);
+}
+EXPORT_SYMBOL(dma_free_coherent);
+
+int dma_declare_coherent_memory(struct device *dev, dma_addr_t bus_addr,
+                               dma_addr_t device_addr, size_t size, int flags)
+{
+       void __iomem *mem_base = NULL;
+       int pages = size >> PAGE_SHIFT;
+       int bitmap_size = BITS_TO_LONGS(pages) * sizeof(long);
+
+       if ((flags & (DMA_MEMORY_MAP | DMA_MEMORY_IO)) == 0)
+               goto out;
+       if (!size)
+               goto out;
+       if (dev->dma_mem)
+               goto out;
+
+       /* FIXME: this routine just ignores DMA_MEMORY_INCLUDES_CHILDREN */
+
+       mem_base = ioremap(bus_addr, size);
+       if (!mem_base)
+               goto out;
+
+       dev->dma_mem = kzalloc(sizeof(struct dma_coherent_mem), GFP_KERNEL);
+       if (!dev->dma_mem)
+               goto out;
+       dev->dma_mem->bitmap = kzalloc(bitmap_size, GFP_KERNEL);
+       if (!dev->dma_mem->bitmap)
+               goto free1_out;
+
+       dev->dma_mem->virt_base = mem_base;
+       dev->dma_mem->device_base = device_addr;
+       dev->dma_mem->size = pages;
+       dev->dma_mem->flags = flags;
+
+       if (flags & DMA_MEMORY_MAP)
+               return DMA_MEMORY_MAP;
+
+       return DMA_MEMORY_IO;
+
+ free1_out:
+       kfree(dev->dma_mem);
+ out:
+       if (mem_base)
+               iounmap(mem_base);
+       return 0;
+}
+EXPORT_SYMBOL(dma_declare_coherent_memory);
+
+void dma_release_declared_memory(struct device *dev)
+{
+       struct dma_coherent_mem *mem = dev->dma_mem;
+       
+       if(!mem)
+               return;
+       dev->dma_mem = NULL;
+       iounmap(mem->virt_base);
+       kfree(mem->bitmap);
+       kfree(mem);
+}
+EXPORT_SYMBOL(dma_release_declared_memory);
+
+void *dma_mark_declared_memory_occupied(struct device *dev,
+                                       dma_addr_t device_addr, size_t size)
+{
+       struct dma_coherent_mem *mem = dev->dma_mem;
+       int pages = (size + (device_addr & ~PAGE_MASK) + PAGE_SIZE - 1) >> PAGE_SHIFT;
+       int pos, err;
+
+       if (!mem)
+               return ERR_PTR(-EINVAL);
+
+       pos = (device_addr - mem->device_base) >> PAGE_SHIFT;
+       err = bitmap_allocate_region(mem->bitmap, pos, get_order(pages));
+       if (err != 0)
+               return ERR_PTR(err);
+       return mem->virt_base + (pos << PAGE_SHIFT);
+}
+EXPORT_SYMBOL(dma_mark_declared_memory_occupied);
+
+#ifdef CONFIG_PCI
+/* Many VIA bridges seem to corrupt data for DAC. Disable it here */
+
+int forbid_dac;
+EXPORT_SYMBOL(forbid_dac);
+
+static __devinit void via_no_dac(struct pci_dev *dev)
+{
+       if ((dev->class >> 8) == PCI_CLASS_BRIDGE_PCI && forbid_dac == 0) {
+               printk(KERN_INFO "PCI: VIA PCI bridge detected. Disabling DAC.\n");
+               forbid_dac = 1;
+       }
+}
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_VIA, PCI_ANY_ID, via_no_dac);
+
+static int check_iommu(char *s)
+{
+       if (!strcmp(s, "usedac")) {
+               forbid_dac = -1;
+               return 1;
+       }
+       return 0;
+}
+__setup("iommu=", check_iommu);
+#endif
diff --git a/arch/x86/kernel/pcspeaker.c b/arch/x86/kernel/pcspeaker.c
new file mode 100644 (file)
index 0000000..bc1f2d3
--- /dev/null
@@ -0,0 +1,20 @@
+#include <linux/platform_device.h>
+#include <linux/errno.h>
+#include <linux/init.h>
+
+static __init int add_pcspkr(void)
+{
+       struct platform_device *pd;
+       int ret;
+
+       pd = platform_device_alloc("pcspkr", -1);
+       if (!pd)
+               return -ENOMEM;
+
+       ret = platform_device_add(pd);
+       if (ret)
+               platform_device_put(pd);
+
+       return ret;
+}
+device_initcall(add_pcspkr);
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
new file mode 100644 (file)
index 0000000..8466471
--- /dev/null
@@ -0,0 +1,951 @@
+/*
+ *  linux/arch/i386/kernel/process.c
+ *
+ *  Copyright (C) 1995  Linus Torvalds
+ *
+ *  Pentium III FXSR, SSE support
+ *     Gareth Hughes <gareth@valinux.com>, May 2000
+ */
+
+/*
+ * This file handles the architecture-dependent parts of process handling..
+ */
+
+#include <stdarg.h>
+
+#include <linux/cpu.h>
+#include <linux/errno.h>
+#include <linux/sched.h>
+#include <linux/fs.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/elfcore.h>
+#include <linux/smp.h>
+#include <linux/stddef.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/user.h>
+#include <linux/a.out.h>
+#include <linux/interrupt.h>
+#include <linux/utsname.h>
+#include <linux/delay.h>
+#include <linux/reboot.h>
+#include <linux/init.h>
+#include <linux/mc146818rtc.h>
+#include <linux/module.h>
+#include <linux/kallsyms.h>
+#include <linux/ptrace.h>
+#include <linux/random.h>
+#include <linux/personality.h>
+#include <linux/tick.h>
+#include <linux/percpu.h>
+
+#include <asm/uaccess.h>
+#include <asm/pgtable.h>
+#include <asm/system.h>
+#include <asm/io.h>
+#include <asm/ldt.h>
+#include <asm/processor.h>
+#include <asm/i387.h>
+#include <asm/desc.h>
+#include <asm/vm86.h>
+#ifdef CONFIG_MATH_EMULATION
+#include <asm/math_emu.h>
+#endif
+
+#include <linux/err.h>
+
+#include <asm/tlbflush.h>
+#include <asm/cpu.h>
+
+asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
+
+static int hlt_counter;
+
+unsigned long boot_option_idle_override = 0;
+EXPORT_SYMBOL(boot_option_idle_override);
+
+DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task;
+EXPORT_PER_CPU_SYMBOL(current_task);
+
+DEFINE_PER_CPU(int, cpu_number);
+EXPORT_PER_CPU_SYMBOL(cpu_number);
+
+/*
+ * Return saved PC of a blocked thread.
+ */
+unsigned long thread_saved_pc(struct task_struct *tsk)
+{
+       return ((unsigned long *)tsk->thread.esp)[3];
+}
+
+/*
+ * Powermanagement idle function, if any..
+ */
+void (*pm_idle)(void);
+EXPORT_SYMBOL(pm_idle);
+static DEFINE_PER_CPU(unsigned int, cpu_idle_state);
+
+void disable_hlt(void)
+{
+       hlt_counter++;
+}
+
+EXPORT_SYMBOL(disable_hlt);
+
+void enable_hlt(void)
+{
+       hlt_counter--;
+}
+
+EXPORT_SYMBOL(enable_hlt);
+
+/*
+ * We use this if we don't have any better
+ * idle routine..
+ */
+void default_idle(void)
+{
+       if (!hlt_counter && boot_cpu_data.hlt_works_ok) {
+               current_thread_info()->status &= ~TS_POLLING;
+               /*
+                * TS_POLLING-cleared state must be visible before we
+                * test NEED_RESCHED:
+                */
+               smp_mb();
+
+               local_irq_disable();
+               if (!need_resched())
+                       safe_halt();    /* enables interrupts racelessly */
+               else
+                       local_irq_enable();
+               current_thread_info()->status |= TS_POLLING;
+       } else {
+               /* loop is done by the caller */
+               cpu_relax();
+       }
+}
+#ifdef CONFIG_APM_MODULE
+EXPORT_SYMBOL(default_idle);
+#endif
+
+/*
+ * On SMP it's slightly faster (but much more power-consuming!)
+ * to poll the ->work.need_resched flag instead of waiting for the
+ * cross-CPU IPI to arrive. Use this option with caution.
+ */
+static void poll_idle (void)
+{
+       cpu_relax();
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+#include <asm/nmi.h>
+/* We don't actually take CPU down, just spin without interrupts. */
+static inline void play_dead(void)
+{
+       /* This must be done before dead CPU ack */
+       cpu_exit_clear();
+       wbinvd();
+       mb();
+       /* Ack it */
+       __get_cpu_var(cpu_state) = CPU_DEAD;
+
+       /*
+        * With physical CPU hotplug, we should halt the cpu
+        */
+       local_irq_disable();
+       while (1)
+               halt();
+}
+#else
+static inline void play_dead(void)
+{
+       BUG();
+}
+#endif /* CONFIG_HOTPLUG_CPU */
+
+/*
+ * The idle thread. There's no useful work to be
+ * done, so just try to conserve power and have a
+ * low exit latency (ie sit in a loop waiting for
+ * somebody to say that they'd like to reschedule)
+ */
+void cpu_idle(void)
+{
+       int cpu = smp_processor_id();
+
+       current_thread_info()->status |= TS_POLLING;
+
+       /* endless idle loop with no priority at all */
+       while (1) {
+               tick_nohz_stop_sched_tick();
+               while (!need_resched()) {
+                       void (*idle)(void);
+
+                       if (__get_cpu_var(cpu_idle_state))
+                               __get_cpu_var(cpu_idle_state) = 0;
+
+                       check_pgt_cache();
+                       rmb();
+                       idle = pm_idle;
+
+                       if (!idle)
+                               idle = default_idle;
+
+                       if (cpu_is_offline(cpu))
+                               play_dead();
+
+                       __get_cpu_var(irq_stat).idle_timestamp = jiffies;
+                       idle();
+               }
+               tick_nohz_restart_sched_tick();
+               preempt_enable_no_resched();
+               schedule();
+               preempt_disable();
+       }
+}
+
+void cpu_idle_wait(void)
+{
+       unsigned int cpu, this_cpu = get_cpu();
+       cpumask_t map, tmp = current->cpus_allowed;
+
+       set_cpus_allowed(current, cpumask_of_cpu(this_cpu));
+       put_cpu();
+
+       cpus_clear(map);
+       for_each_online_cpu(cpu) {
+               per_cpu(cpu_idle_state, cpu) = 1;
+               cpu_set(cpu, map);
+       }
+
+       __get_cpu_var(cpu_idle_state) = 0;
+
+       wmb();
+       do {
+               ssleep(1);
+               for_each_online_cpu(cpu) {
+                       if (cpu_isset(cpu, map) && !per_cpu(cpu_idle_state, cpu))
+                               cpu_clear(cpu, map);
+               }
+               cpus_and(map, map, cpu_online_map);
+       } while (!cpus_empty(map));
+
+       set_cpus_allowed(current, tmp);
+}
+EXPORT_SYMBOL_GPL(cpu_idle_wait);
+
+/*
+ * This uses new MONITOR/MWAIT instructions on P4 processors with PNI,
+ * which can obviate IPI to trigger checking of need_resched.
+ * We execute MONITOR against need_resched and enter optimized wait state
+ * through MWAIT. Whenever someone changes need_resched, we would be woken
+ * up from MWAIT (without an IPI).
+ *
+ * New with Core Duo processors, MWAIT can take some hints based on CPU
+ * capability.
+ */
+void mwait_idle_with_hints(unsigned long eax, unsigned long ecx)
+{
+       if (!need_resched()) {
+               __monitor((void *)&current_thread_info()->flags, 0, 0);
+               smp_mb();
+               if (!need_resched())
+                       __mwait(eax, ecx);
+       }
+}
+
+/* Default MONITOR/MWAIT with no hints, used for default C1 state */
+static void mwait_idle(void)
+{
+       local_irq_enable();
+       mwait_idle_with_hints(0, 0);
+}
+
+void __devinit select_idle_routine(const struct cpuinfo_x86 *c)
+{
+       if (cpu_has(c, X86_FEATURE_MWAIT)) {
+               printk("monitor/mwait feature present.\n");
+               /*
+                * Skip, if setup has overridden idle.
+                * One CPU supports mwait => All CPUs supports mwait
+                */
+               if (!pm_idle) {
+                       printk("using mwait in idle threads.\n");
+                       pm_idle = mwait_idle;
+               }
+       }
+}
+
+static int __init idle_setup(char *str)
+{
+       if (!strcmp(str, "poll")) {
+               printk("using polling idle threads.\n");
+               pm_idle = poll_idle;
+#ifdef CONFIG_X86_SMP
+               if (smp_num_siblings > 1)
+                       printk("WARNING: polling idle and HT enabled, performance may degrade.\n");
+#endif
+       } else if (!strcmp(str, "mwait"))
+               force_mwait = 1;
+       else
+               return -1;
+
+       boot_option_idle_override = 1;
+       return 0;
+}
+early_param("idle", idle_setup);
+
+void show_regs(struct pt_regs * regs)
+{
+       unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L;
+       unsigned long d0, d1, d2, d3, d6, d7;
+
+       printk("\n");
+       printk("Pid: %d, comm: %20s\n", current->pid, current->comm);
+       printk("EIP: %04x:[<%08lx>] CPU: %d\n",0xffff & regs->xcs,regs->eip, smp_processor_id());
+       print_symbol("EIP is at %s\n", regs->eip);
+
+       if (user_mode_vm(regs))
+               printk(" ESP: %04x:%08lx",0xffff & regs->xss,regs->esp);
+       printk(" EFLAGS: %08lx    %s  (%s %.*s)\n",
+              regs->eflags, print_tainted(), init_utsname()->release,
+              (int)strcspn(init_utsname()->version, " "),
+              init_utsname()->version);
+       printk("EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n",
+               regs->eax,regs->ebx,regs->ecx,regs->edx);
+       printk("ESI: %08lx EDI: %08lx EBP: %08lx",
+               regs->esi, regs->edi, regs->ebp);
+       printk(" DS: %04x ES: %04x FS: %04x\n",
+              0xffff & regs->xds,0xffff & regs->xes, 0xffff & regs->xfs);
+
+       cr0 = read_cr0();
+       cr2 = read_cr2();
+       cr3 = read_cr3();
+       cr4 = read_cr4_safe();
+       printk("CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n", cr0, cr2, cr3, cr4);
+
+       get_debugreg(d0, 0);
+       get_debugreg(d1, 1);
+       get_debugreg(d2, 2);
+       get_debugreg(d3, 3);
+       printk("DR0: %08lx DR1: %08lx DR2: %08lx DR3: %08lx\n",
+                       d0, d1, d2, d3);
+       get_debugreg(d6, 6);
+       get_debugreg(d7, 7);
+       printk("DR6: %08lx DR7: %08lx\n", d6, d7);
+
+       show_trace(NULL, regs, &regs->esp);
+}
+
+/*
+ * This gets run with %ebx containing the
+ * function to call, and %edx containing
+ * the "args".
+ */
+extern void kernel_thread_helper(void);
+
+/*
+ * Create a kernel thread
+ */
+int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags)
+{
+       struct pt_regs regs;
+
+       memset(&regs, 0, sizeof(regs));
+
+       regs.ebx = (unsigned long) fn;
+       regs.edx = (unsigned long) arg;
+
+       regs.xds = __USER_DS;
+       regs.xes = __USER_DS;
+       regs.xfs = __KERNEL_PERCPU;
+       regs.orig_eax = -1;
+       regs.eip = (unsigned long) kernel_thread_helper;
+       regs.xcs = __KERNEL_CS | get_kernel_rpl();
+       regs.eflags = X86_EFLAGS_IF | X86_EFLAGS_SF | X86_EFLAGS_PF | 0x2;
+
+       /* Ok, create the new process.. */
+       return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, &regs, 0, NULL, NULL);
+}
+EXPORT_SYMBOL(kernel_thread);
+
+/*
+ * Free current thread data structures etc..
+ */
+void exit_thread(void)
+{
+       /* The process may have allocated an io port bitmap... nuke it. */
+       if (unlikely(test_thread_flag(TIF_IO_BITMAP))) {
+               struct task_struct *tsk = current;
+               struct thread_struct *t = &tsk->thread;
+               int cpu = get_cpu();
+               struct tss_struct *tss = &per_cpu(init_tss, cpu);
+
+               kfree(t->io_bitmap_ptr);
+               t->io_bitmap_ptr = NULL;
+               clear_thread_flag(TIF_IO_BITMAP);
+               /*
+                * Careful, clear this in the TSS too:
+                */
+               memset(tss->io_bitmap, 0xff, tss->io_bitmap_max);
+               t->io_bitmap_max = 0;
+               tss->io_bitmap_owner = NULL;
+               tss->io_bitmap_max = 0;
+               tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET;
+               put_cpu();
+       }
+}
+
+void flush_thread(void)
+{
+       struct task_struct *tsk = current;
+
+       memset(tsk->thread.debugreg, 0, sizeof(unsigned long)*8);
+       memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));        
+       clear_tsk_thread_flag(tsk, TIF_DEBUG);
+       /*
+        * Forget coprocessor state..
+        */
+       clear_fpu(tsk);
+       clear_used_math();
+}
+
+void release_thread(struct task_struct *dead_task)
+{
+       BUG_ON(dead_task->mm);
+       release_vm86_irqs(dead_task);
+}
+
+/*
+ * This gets called before we allocate a new thread and copy
+ * the current task into it.
+ */
+void prepare_to_copy(struct task_struct *tsk)
+{
+       unlazy_fpu(tsk);
+}
+
+int copy_thread(int nr, unsigned long clone_flags, unsigned long esp,
+       unsigned long unused,
+       struct task_struct * p, struct pt_regs * regs)
+{
+       struct pt_regs * childregs;
+       struct task_struct *tsk;
+       int err;
+
+       childregs = task_pt_regs(p);
+       *childregs = *regs;
+       childregs->eax = 0;
+       childregs->esp = esp;
+
+       p->thread.esp = (unsigned long) childregs;
+       p->thread.esp0 = (unsigned long) (childregs+1);
+
+       p->thread.eip = (unsigned long) ret_from_fork;
+
+       savesegment(gs,p->thread.gs);
+
+       tsk = current;
+       if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) {
+               p->thread.io_bitmap_ptr = kmemdup(tsk->thread.io_bitmap_ptr,
+                                               IO_BITMAP_BYTES, GFP_KERNEL);
+               if (!p->thread.io_bitmap_ptr) {
+                       p->thread.io_bitmap_max = 0;
+                       return -ENOMEM;
+               }
+               set_tsk_thread_flag(p, TIF_IO_BITMAP);
+       }
+
+       /*
+        * Set a new TLS for the child thread?
+        */
+       if (clone_flags & CLONE_SETTLS) {
+               struct desc_struct *desc;
+               struct user_desc info;
+               int idx;
+
+               err = -EFAULT;
+               if (copy_from_user(&info, (void __user *)childregs->esi, sizeof(info)))
+                       goto out;
+               err = -EINVAL;
+               if (LDT_empty(&info))
+                       goto out;
+
+               idx = info.entry_number;
+               if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
+                       goto out;
+
+               desc = p->thread.tls_array + idx - GDT_ENTRY_TLS_MIN;
+               desc->a = LDT_entry_a(&info);
+               desc->b = LDT_entry_b(&info);
+       }
+
+       err = 0;
+ out:
+       if (err && p->thread.io_bitmap_ptr) {
+               kfree(p->thread.io_bitmap_ptr);
+               p->thread.io_bitmap_max = 0;
+       }
+       return err;
+}
+
+/*
+ * fill in the user structure for a core dump..
+ */
+void dump_thread(struct pt_regs * regs, struct user * dump)
+{
+       int i;
+
+/* changed the size calculations - should hopefully work better. lbt */
+       dump->magic = CMAGIC;
+       dump->start_code = 0;
+       dump->start_stack = regs->esp & ~(PAGE_SIZE - 1);
+       dump->u_tsize = ((unsigned long) current->mm->end_code) >> PAGE_SHIFT;
+       dump->u_dsize = ((unsigned long) (current->mm->brk + (PAGE_SIZE-1))) >> PAGE_SHIFT;
+       dump->u_dsize -= dump->u_tsize;
+       dump->u_ssize = 0;
+       for (i = 0; i < 8; i++)
+               dump->u_debugreg[i] = current->thread.debugreg[i];  
+
+       if (dump->start_stack < TASK_SIZE)
+               dump->u_ssize = ((unsigned long) (TASK_SIZE - dump->start_stack)) >> PAGE_SHIFT;
+
+       dump->regs.ebx = regs->ebx;
+       dump->regs.ecx = regs->ecx;
+       dump->regs.edx = regs->edx;
+       dump->regs.esi = regs->esi;
+       dump->regs.edi = regs->edi;
+       dump->regs.ebp = regs->ebp;
+       dump->regs.eax = regs->eax;
+       dump->regs.ds = regs->xds;
+       dump->regs.es = regs->xes;
+       dump->regs.fs = regs->xfs;
+       savesegment(gs,dump->regs.gs);
+       dump->regs.orig_eax = regs->orig_eax;
+       dump->regs.eip = regs->eip;
+       dump->regs.cs = regs->xcs;
+       dump->regs.eflags = regs->eflags;
+       dump->regs.esp = regs->esp;
+       dump->regs.ss = regs->xss;
+
+       dump->u_fpvalid = dump_fpu (regs, &dump->i387);
+}
+EXPORT_SYMBOL(dump_thread);
+
+/* 
+ * Capture the user space registers if the task is not running (in user space)
+ */
+int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs)
+{
+       struct pt_regs ptregs = *task_pt_regs(tsk);
+       ptregs.xcs &= 0xffff;
+       ptregs.xds &= 0xffff;
+       ptregs.xes &= 0xffff;
+       ptregs.xss &= 0xffff;
+
+       elf_core_copy_regs(regs, &ptregs);
+
+       return 1;
+}
+
+#ifdef CONFIG_SECCOMP
+void hard_disable_TSC(void)
+{
+       write_cr4(read_cr4() | X86_CR4_TSD);
+}
+void disable_TSC(void)
+{
+       preempt_disable();
+       if (!test_and_set_thread_flag(TIF_NOTSC))
+               /*
+                * Must flip the CPU state synchronously with
+                * TIF_NOTSC in the current running context.
+                */
+               hard_disable_TSC();
+       preempt_enable();
+}
+void hard_enable_TSC(void)
+{
+       write_cr4(read_cr4() & ~X86_CR4_TSD);
+}
+#endif /* CONFIG_SECCOMP */
+
+static noinline void
+__switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
+                struct tss_struct *tss)
+{
+       struct thread_struct *next;
+
+       next = &next_p->thread;
+
+       if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
+               set_debugreg(next->debugreg[0], 0);
+               set_debugreg(next->debugreg[1], 1);
+               set_debugreg(next->debugreg[2], 2);
+               set_debugreg(next->debugreg[3], 3);
+               /* no 4 and 5 */
+               set_debugreg(next->debugreg[6], 6);
+               set_debugreg(next->debugreg[7], 7);
+       }
+
+#ifdef CONFIG_SECCOMP
+       if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^
+           test_tsk_thread_flag(next_p, TIF_NOTSC)) {
+               /* prev and next are different */
+               if (test_tsk_thread_flag(next_p, TIF_NOTSC))
+                       hard_disable_TSC();
+               else
+                       hard_enable_TSC();
+       }
+#endif
+
+       if (!test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
+               /*
+                * Disable the bitmap via an invalid offset. We still cache
+                * the previous bitmap owner and the IO bitmap contents:
+                */
+               tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET;
+               return;
+       }
+
+       if (likely(next == tss->io_bitmap_owner)) {
+               /*
+                * Previous owner of the bitmap (hence the bitmap content)
+                * matches the next task, we dont have to do anything but
+                * to set a valid offset in the TSS:
+                */
+               tss->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET;
+               return;
+       }
+       /*
+        * Lazy TSS's I/O bitmap copy. We set an invalid offset here
+        * and we let the task to get a GPF in case an I/O instruction
+        * is performed.  The handler of the GPF will verify that the
+        * faulting task has a valid I/O bitmap and, it true, does the
+        * real copy and restart the instruction.  This will save us
+        * redundant copies when the currently switched task does not
+        * perform any I/O during its timeslice.
+        */
+       tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET_LAZY;
+}
+
+/*
+ *     switch_to(x,yn) should switch tasks from x to y.
+ *
+ * We fsave/fwait so that an exception goes off at the right time
+ * (as a call from the fsave or fwait in effect) rather than to
+ * the wrong process. Lazy FP saving no longer makes any sense
+ * with modern CPU's, and this simplifies a lot of things (SMP
+ * and UP become the same).
+ *
+ * NOTE! We used to use the x86 hardware context switching. The
+ * reason for not using it any more becomes apparent when you
+ * try to recover gracefully from saved state that is no longer
+ * valid (stale segment register values in particular). With the
+ * hardware task-switch, there is no way to fix up bad state in
+ * a reasonable manner.
+ *
+ * The fact that Intel documents the hardware task-switching to
+ * be slow is a fairly red herring - this code is not noticeably
+ * faster. However, there _is_ some room for improvement here,
+ * so the performance issues may eventually be a valid point.
+ * More important, however, is the fact that this allows us much
+ * more flexibility.
+ *
+ * The return value (in %eax) will be the "prev" task after
+ * the task-switch, and shows up in ret_from_fork in entry.S,
+ * for example.
+ */
+struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
+{
+       struct thread_struct *prev = &prev_p->thread,
+                                *next = &next_p->thread;
+       int cpu = smp_processor_id();
+       struct tss_struct *tss = &per_cpu(init_tss, cpu);
+
+       /* never put a printk in __switch_to... printk() calls wake_up*() indirectly */
+
+       __unlazy_fpu(prev_p);
+
+
+       /* we're going to use this soon, after a few expensive things */
+       if (next_p->fpu_counter > 5)
+               prefetch(&next->i387.fxsave);
+
+       /*
+        * Reload esp0.
+        */
+       load_esp0(tss, next);
+
+       /*
+        * Save away %gs. No need to save %fs, as it was saved on the
+        * stack on entry.  No need to save %es and %ds, as those are
+        * always kernel segments while inside the kernel.  Doing this
+        * before setting the new TLS descriptors avoids the situation
+        * where we temporarily have non-reloadable segments in %fs
+        * and %gs.  This could be an issue if the NMI handler ever
+        * used %fs or %gs (it does not today), or if the kernel is
+        * running inside of a hypervisor layer.
+        */
+       savesegment(gs, prev->gs);
+
+       /*
+        * Load the per-thread Thread-Local Storage descriptor.
+        */
+       load_TLS(next, cpu);
+
+       /*
+        * Restore IOPL if needed.  In normal use, the flags restore
+        * in the switch assembly will handle this.  But if the kernel
+        * is running virtualized at a non-zero CPL, the popf will
+        * not restore flags, so it must be done in a separate step.
+        */
+       if (get_kernel_rpl() && unlikely(prev->iopl != next->iopl))
+               set_iopl_mask(next->iopl);
+
+       /*
+        * Now maybe handle debug registers and/or IO bitmaps
+        */
+       if (unlikely(task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV ||
+                    task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT))
+               __switch_to_xtra(prev_p, next_p, tss);
+
+       /*
+        * Leave lazy mode, flushing any hypercalls made here.
+        * This must be done before restoring TLS segments so
+        * the GDT and LDT are properly updated, and must be
+        * done before math_state_restore, so the TS bit is up
+        * to date.
+        */
+       arch_leave_lazy_cpu_mode();
+
+       /* If the task has used fpu the last 5 timeslices, just do a full
+        * restore of the math state immediately to avoid the trap; the
+        * chances of needing FPU soon are obviously high now
+        */
+       if (next_p->fpu_counter > 5)
+               math_state_restore();
+
+       /*
+        * Restore %gs if needed (which is common)
+        */
+       if (prev->gs | next->gs)
+               loadsegment(gs, next->gs);
+
+       x86_write_percpu(current_task, next_p);
+
+       return prev_p;
+}
+
+asmlinkage int sys_fork(struct pt_regs regs)
+{
+       return do_fork(SIGCHLD, regs.esp, &regs, 0, NULL, NULL);
+}
+
+asmlinkage int sys_clone(struct pt_regs regs)
+{
+       unsigned long clone_flags;
+       unsigned long newsp;
+       int __user *parent_tidptr, *child_tidptr;
+
+       clone_flags = regs.ebx;
+       newsp = regs.ecx;
+       parent_tidptr = (int __user *)regs.edx;
+       child_tidptr = (int __user *)regs.edi;
+       if (!newsp)
+               newsp = regs.esp;
+       return do_fork(clone_flags, newsp, &regs, 0, parent_tidptr, child_tidptr);
+}
+
+/*
+ * This is trivial, and on the face of it looks like it
+ * could equally well be done in user mode.
+ *
+ * Not so, for quite unobvious reasons - register pressure.
+ * In user mode vfork() cannot have a stack frame, and if
+ * done by calling the "clone()" system call directly, you
+ * do not have enough call-clobbered registers to hold all
+ * the information you need.
+ */
+asmlinkage int sys_vfork(struct pt_regs regs)
+{
+       return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs.esp, &regs, 0, NULL, NULL);
+}
+
+/*
+ * sys_execve() executes a new program.
+ */
+asmlinkage int sys_execve(struct pt_regs regs)
+{
+       int error;
+       char * filename;
+
+       filename = getname((char __user *) regs.ebx);
+       error = PTR_ERR(filename);
+       if (IS_ERR(filename))
+               goto out;
+       error = do_execve(filename,
+                       (char __user * __user *) regs.ecx,
+                       (char __user * __user *) regs.edx,
+                       &regs);
+       if (error == 0) {
+               task_lock(current);
+               current->ptrace &= ~PT_DTRACE;
+               task_unlock(current);
+               /* Make sure we don't return using sysenter.. */
+               set_thread_flag(TIF_IRET);
+       }
+       putname(filename);
+out:
+       return error;
+}
+
+#define top_esp                (THREAD_SIZE - sizeof(unsigned long))
+#define top_ebp                (THREAD_SIZE - 2*sizeof(unsigned long))
+
+unsigned long get_wchan(struct task_struct *p)
+{
+       unsigned long ebp, esp, eip;
+       unsigned long stack_page;
+       int count = 0;
+       if (!p || p == current || p->state == TASK_RUNNING)
+               return 0;
+       stack_page = (unsigned long)task_stack_page(p);
+       esp = p->thread.esp;
+       if (!stack_page || esp < stack_page || esp > top_esp+stack_page)
+               return 0;
+       /* include/asm-i386/system.h:switch_to() pushes ebp last. */
+       ebp = *(unsigned long *) esp;
+       do {
+               if (ebp < stack_page || ebp > top_ebp+stack_page)
+                       return 0;
+               eip = *(unsigned long *) (ebp+4);
+               if (!in_sched_functions(eip))
+                       return eip;
+               ebp = *(unsigned long *) ebp;
+       } while (count++ < 16);
+       return 0;
+}
+
+/*
+ * sys_alloc_thread_area: get a yet unused TLS descriptor index.
+ */
+static int get_free_idx(void)
+{
+       struct thread_struct *t = &current->thread;
+       int idx;
+
+       for (idx = 0; idx < GDT_ENTRY_TLS_ENTRIES; idx++)
+               if (desc_empty(t->tls_array + idx))
+                       return idx + GDT_ENTRY_TLS_MIN;
+       return -ESRCH;
+}
+
+/*
+ * Set a given TLS descriptor:
+ */
+asmlinkage int sys_set_thread_area(struct user_desc __user *u_info)
+{
+       struct thread_struct *t = &current->thread;
+       struct user_desc info;
+       struct desc_struct *desc;
+       int cpu, idx;
+
+       if (copy_from_user(&info, u_info, sizeof(info)))
+               return -EFAULT;
+       idx = info.entry_number;
+
+       /*
+        * index -1 means the kernel should try to find and
+        * allocate an empty descriptor:
+        */
+       if (idx == -1) {
+               idx = get_free_idx();
+               if (idx < 0)
+                       return idx;
+               if (put_user(idx, &u_info->entry_number))
+                       return -EFAULT;
+       }
+
+       if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
+               return -EINVAL;
+
+       desc = t->tls_array + idx - GDT_ENTRY_TLS_MIN;
+
+       /*
+        * We must not get preempted while modifying the TLS.
+        */
+       cpu = get_cpu();
+
+       if (LDT_empty(&info)) {
+               desc->a = 0;
+               desc->b = 0;
+       } else {
+               desc->a = LDT_entry_a(&info);
+               desc->b = LDT_entry_b(&info);
+       }
+       load_TLS(t, cpu);
+
+       put_cpu();
+
+       return 0;
+}
+
+/*
+ * Get the current Thread-Local Storage area:
+ */
+
+#define GET_BASE(desc) ( \
+       (((desc)->a >> 16) & 0x0000ffff) | \
+       (((desc)->b << 16) & 0x00ff0000) | \
+       ( (desc)->b        & 0xff000000)   )
+
+#define GET_LIMIT(desc) ( \
+       ((desc)->a & 0x0ffff) | \
+        ((desc)->b & 0xf0000) )
+       
+#define GET_32BIT(desc)                (((desc)->b >> 22) & 1)
+#define GET_CONTENTS(desc)     (((desc)->b >> 10) & 3)
+#define GET_WRITABLE(desc)     (((desc)->b >>  9) & 1)
+#define GET_LIMIT_PAGES(desc)  (((desc)->b >> 23) & 1)
+#define GET_PRESENT(desc)      (((desc)->b >> 15) & 1)
+#define GET_USEABLE(desc)      (((desc)->b >> 20) & 1)
+
+asmlinkage int sys_get_thread_area(struct user_desc __user *u_info)
+{
+       struct user_desc info;
+       struct desc_struct *desc;
+       int idx;
+
+       if (get_user(idx, &u_info->entry_number))
+               return -EFAULT;
+       if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
+               return -EINVAL;
+
+       memset(&info, 0, sizeof(info));
+
+       desc = current->thread.tls_array + idx - GDT_ENTRY_TLS_MIN;
+
+       info.entry_number = idx;
+       info.base_addr = GET_BASE(desc);
+       info.limit = GET_LIMIT(desc);
+       info.seg_32bit = GET_32BIT(desc);
+       info.contents = GET_CONTENTS(desc);
+       info.read_exec_only = !GET_WRITABLE(desc);
+       info.limit_in_pages = GET_LIMIT_PAGES(desc);
+       info.seg_not_present = !GET_PRESENT(desc);
+       info.useable = GET_USEABLE(desc);
+
+       if (copy_to_user(u_info, &info, sizeof(info)))
+               return -EFAULT;
+       return 0;
+}
+
+unsigned long arch_align_stack(unsigned long sp)
+{
+       if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
+               sp -= get_random_int() % 8192;
+       return sp & ~0xf;
+}
diff --git a/arch/x86/kernel/ptrace_32.c b/arch/x86/kernel/ptrace_32.c
new file mode 100644 (file)
index 0000000..7c1b925
--- /dev/null
@@ -0,0 +1,723 @@
+/* ptrace.c */
+/* By Ross Biro 1/23/92 */
+/*
+ * Pentium III FXSR, SSE support
+ *     Gareth Hughes <gareth@valinux.com>, May 2000
+ */
+
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/smp.h>
+#include <linux/errno.h>
+#include <linux/ptrace.h>
+#include <linux/user.h>
+#include <linux/security.h>
+#include <linux/audit.h>
+#include <linux/seccomp.h>
+#include <linux/signal.h>
+
+#include <asm/uaccess.h>
+#include <asm/pgtable.h>
+#include <asm/system.h>
+#include <asm/processor.h>
+#include <asm/i387.h>
+#include <asm/debugreg.h>
+#include <asm/ldt.h>
+#include <asm/desc.h>
+
+/*
+ * does not yet catch signals sent when the child dies.
+ * in exit.c or in signal.c.
+ */
+
+/*
+ * Determines which flags the user has access to [1 = access, 0 = no access].
+ * Prohibits changing ID(21), VIP(20), VIF(19), VM(17), NT(14), IOPL(12-13), IF(9).
+ * Also masks reserved bits (31-22, 15, 5, 3, 1).
+ */
+#define FLAG_MASK 0x00050dd5
+
+/* set's the trap flag. */
+#define TRAP_FLAG 0x100
+
+/*
+ * Offset of eflags on child stack..
+ */
+#define EFL_OFFSET offsetof(struct pt_regs, eflags)
+
+static inline struct pt_regs *get_child_regs(struct task_struct *task)
+{
+       void *stack_top = (void *)task->thread.esp0;
+       return stack_top - sizeof(struct pt_regs);
+}
+
+/*
+ * This routine will get a word off of the processes privileged stack.
+ * the offset is bytes into the pt_regs structure on the stack.
+ * This routine assumes that all the privileged stacks are in our
+ * data space.
+ */   
+static inline int get_stack_long(struct task_struct *task, int offset)
+{
+       unsigned char *stack;
+
+       stack = (unsigned char *)task->thread.esp0 - sizeof(struct pt_regs);
+       stack += offset;
+       return (*((int *)stack));
+}
+
+/*
+ * This routine will put a word on the processes privileged stack.
+ * the offset is bytes into the pt_regs structure on the stack.
+ * This routine assumes that all the privileged stacks are in our
+ * data space.
+ */
+static inline int put_stack_long(struct task_struct *task, int offset,
+       unsigned long data)
+{
+       unsigned char * stack;
+
+       stack = (unsigned char *)task->thread.esp0 - sizeof(struct pt_regs);
+       stack += offset;
+       *(unsigned long *) stack = data;
+       return 0;
+}
+
+static int putreg(struct task_struct *child,
+       unsigned long regno, unsigned long value)
+{
+       switch (regno >> 2) {
+               case GS:
+                       if (value && (value & 3) != 3)
+                               return -EIO;
+                       child->thread.gs = value;
+                       return 0;
+               case DS:
+               case ES:
+               case FS:
+                       if (value && (value & 3) != 3)
+                               return -EIO;
+                       value &= 0xffff;
+                       break;
+               case SS:
+               case CS:
+                       if ((value & 3) != 3)
+                               return -EIO;
+                       value &= 0xffff;
+                       break;
+               case EFL:
+                       value &= FLAG_MASK;
+                       value |= get_stack_long(child, EFL_OFFSET) & ~FLAG_MASK;
+                       break;
+       }
+       if (regno > FS*4)
+               regno -= 1*4;
+       put_stack_long(child, regno, value);
+       return 0;
+}
+
+static unsigned long getreg(struct task_struct *child,
+       unsigned long regno)
+{
+       unsigned long retval = ~0UL;
+
+       switch (regno >> 2) {
+               case GS:
+                       retval = child->thread.gs;
+                       break;
+               case DS:
+               case ES:
+               case FS:
+               case SS:
+               case CS:
+                       retval = 0xffff;
+                       /* fall through */
+               default:
+                       if (regno > FS*4)
+                               regno -= 1*4;
+                       retval &= get_stack_long(child, regno);
+       }
+       return retval;
+}
+
+#define LDT_SEGMENT 4
+
+static unsigned long convert_eip_to_linear(struct task_struct *child, struct pt_regs *regs)
+{
+       unsigned long addr, seg;
+
+       addr = regs->eip;
+       seg = regs->xcs & 0xffff;
+       if (regs->eflags & VM_MASK) {
+               addr = (addr & 0xffff) + (seg << 4);
+               return addr;
+       }
+
+       /*
+        * We'll assume that the code segments in the GDT
+        * are all zero-based. That is largely true: the
+        * TLS segments are used for data, and the PNPBIOS
+        * and APM bios ones we just ignore here.
+        */
+       if (seg & LDT_SEGMENT) {
+               u32 *desc;
+               unsigned long base;
+
+               seg &= ~7UL;
+
+               down(&child->mm->context.sem);
+               if (unlikely((seg >> 3) >= child->mm->context.size))
+                       addr = -1L; /* bogus selector, access would fault */
+               else {
+                       desc = child->mm->context.ldt + seg;
+                       base = ((desc[0] >> 16) |
+                               ((desc[1] & 0xff) << 16) |
+                               (desc[1] & 0xff000000));
+
+                       /* 16-bit code segment? */
+                       if (!((desc[1] >> 22) & 1))
+                               addr &= 0xffff;
+                       addr += base;
+               }
+               up(&child->mm->context.sem);
+       }
+       return addr;
+}
+
+static inline int is_setting_trap_flag(struct task_struct *child, struct pt_regs *regs)
+{
+       int i, copied;
+       unsigned char opcode[15];
+       unsigned long addr = convert_eip_to_linear(child, regs);
+
+       copied = access_process_vm(child, addr, opcode, sizeof(opcode), 0);
+       for (i = 0; i < copied; i++) {
+               switch (opcode[i]) {
+               /* popf and iret */
+               case 0x9d: case 0xcf:
+                       return 1;
+               /* opcode and address size prefixes */
+               case 0x66: case 0x67:
+                       continue;
+               /* irrelevant prefixes (segment overrides and repeats) */
+               case 0x26: case 0x2e:
+               case 0x36: case 0x3e:
+               case 0x64: case 0x65:
+               case 0xf0: case 0xf2: case 0xf3:
+                       continue;
+
+               /*
+                * pushf: NOTE! We should probably not let
+                * the user see the TF bit being set. But
+                * it's more pain than it's worth to avoid
+                * it, and a debugger could emulate this
+                * all in user space if it _really_ cares.
+                */
+               case 0x9c:
+               default:
+                       return 0;
+               }
+       }
+       return 0;
+}
+
+static void set_singlestep(struct task_struct *child)
+{
+       struct pt_regs *regs = get_child_regs(child);
+
+       /*
+        * Always set TIF_SINGLESTEP - this guarantees that 
+        * we single-step system calls etc..  This will also
+        * cause us to set TF when returning to user mode.
+        */
+       set_tsk_thread_flag(child, TIF_SINGLESTEP);
+
+       /*
+        * If TF was already set, don't do anything else
+        */
+       if (regs->eflags & TRAP_FLAG)
+               return;
+
+       /* Set TF on the kernel stack.. */
+       regs->eflags |= TRAP_FLAG;
+
+       /*
+        * ..but if TF is changed by the instruction we will trace,
+        * don't mark it as being "us" that set it, so that we
+        * won't clear it by hand later.
+        */
+       if (is_setting_trap_flag(child, regs))
+               return;
+       
+       child->ptrace |= PT_DTRACE;
+}
+
+static void clear_singlestep(struct task_struct *child)
+{
+       /* Always clear TIF_SINGLESTEP... */
+       clear_tsk_thread_flag(child, TIF_SINGLESTEP);
+
+       /* But touch TF only if it was set by us.. */
+       if (child->ptrace & PT_DTRACE) {
+               struct pt_regs *regs = get_child_regs(child);
+               regs->eflags &= ~TRAP_FLAG;
+               child->ptrace &= ~PT_DTRACE;
+       }
+}
+
+/*
+ * Called by kernel/ptrace.c when detaching..
+ *
+ * Make sure the single step bit is not set.
+ */
+void ptrace_disable(struct task_struct *child)
+{ 
+       clear_singlestep(child);
+       clear_tsk_thread_flag(child, TIF_SYSCALL_EMU);
+}
+
+/*
+ * Perform get_thread_area on behalf of the traced child.
+ */
+static int
+ptrace_get_thread_area(struct task_struct *child,
+                      int idx, struct user_desc __user *user_desc)
+{
+       struct user_desc info;
+       struct desc_struct *desc;
+
+/*
+ * Get the current Thread-Local Storage area:
+ */
+
+#define GET_BASE(desc) ( \
+       (((desc)->a >> 16) & 0x0000ffff) | \
+       (((desc)->b << 16) & 0x00ff0000) | \
+       ( (desc)->b        & 0xff000000)   )
+
+#define GET_LIMIT(desc) ( \
+       ((desc)->a & 0x0ffff) | \
+        ((desc)->b & 0xf0000) )
+
+#define GET_32BIT(desc)                (((desc)->b >> 22) & 1)
+#define GET_CONTENTS(desc)     (((desc)->b >> 10) & 3)
+#define GET_WRITABLE(desc)     (((desc)->b >>  9) & 1)
+#define GET_LIMIT_PAGES(desc)  (((desc)->b >> 23) & 1)
+#define GET_PRESENT(desc)      (((desc)->b >> 15) & 1)
+#define GET_USEABLE(desc)      (((desc)->b >> 20) & 1)
+
+       if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
+               return -EINVAL;
+
+       desc = child->thread.tls_array + idx - GDT_ENTRY_TLS_MIN;
+
+       info.entry_number = idx;
+       info.base_addr = GET_BASE(desc);
+       info.limit = GET_LIMIT(desc);
+       info.seg_32bit = GET_32BIT(desc);
+       info.contents = GET_CONTENTS(desc);
+       info.read_exec_only = !GET_WRITABLE(desc);
+       info.limit_in_pages = GET_LIMIT_PAGES(desc);
+       info.seg_not_present = !GET_PRESENT(desc);
+       info.useable = GET_USEABLE(desc);
+
+       if (copy_to_user(user_desc, &info, sizeof(info)))
+               return -EFAULT;
+
+       return 0;
+}
+
+/*
+ * Perform set_thread_area on behalf of the traced child.
+ */
+static int
+ptrace_set_thread_area(struct task_struct *child,
+                      int idx, struct user_desc __user *user_desc)
+{
+       struct user_desc info;
+       struct desc_struct *desc;
+
+       if (copy_from_user(&info, user_desc, sizeof(info)))
+               return -EFAULT;
+
+       if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
+               return -EINVAL;
+
+       desc = child->thread.tls_array + idx - GDT_ENTRY_TLS_MIN;
+       if (LDT_empty(&info)) {
+               desc->a = 0;
+               desc->b = 0;
+       } else {
+               desc->a = LDT_entry_a(&info);
+               desc->b = LDT_entry_b(&info);
+       }
+
+       return 0;
+}
+
+long arch_ptrace(struct task_struct *child, long request, long addr, long data)
+{
+       struct user * dummy = NULL;
+       int i, ret;
+       unsigned long __user *datap = (unsigned long __user *)data;
+
+       switch (request) {
+       /* when I and D space are separate, these will need to be fixed. */
+       case PTRACE_PEEKTEXT: /* read word at location addr. */ 
+       case PTRACE_PEEKDATA:
+               ret = generic_ptrace_peekdata(child, addr, data);
+               break;
+
+       /* read the word at location addr in the USER area. */
+       case PTRACE_PEEKUSR: {
+               unsigned long tmp;
+
+               ret = -EIO;
+               if ((addr & 3) || addr < 0 || 
+                   addr > sizeof(struct user) - 3)
+                       break;
+
+               tmp = 0;  /* Default return condition */
+               if(addr < FRAME_SIZE*sizeof(long))
+                       tmp = getreg(child, addr);
+               if(addr >= (long) &dummy->u_debugreg[0] &&
+                  addr <= (long) &dummy->u_debugreg[7]){
+                       addr -= (long) &dummy->u_debugreg[0];
+                       addr = addr >> 2;
+                       tmp = child->thread.debugreg[addr];
+               }
+               ret = put_user(tmp, datap);
+               break;
+       }
+
+       /* when I and D space are separate, this will have to be fixed. */
+       case PTRACE_POKETEXT: /* write the word at location addr. */
+       case PTRACE_POKEDATA:
+               ret = generic_ptrace_pokedata(child, addr, data);
+               break;
+
+       case PTRACE_POKEUSR: /* write the word at location addr in the USER area */
+               ret = -EIO;
+               if ((addr & 3) || addr < 0 || 
+                   addr > sizeof(struct user) - 3)
+                       break;
+
+               if (addr < FRAME_SIZE*sizeof(long)) {
+                       ret = putreg(child, addr, data);
+                       break;
+               }
+               /* We need to be very careful here.  We implicitly
+                  want to modify a portion of the task_struct, and we
+                  have to be selective about what portions we allow someone
+                  to modify. */
+
+                 ret = -EIO;
+                 if(addr >= (long) &dummy->u_debugreg[0] &&
+                    addr <= (long) &dummy->u_debugreg[7]){
+
+                         if(addr == (long) &dummy->u_debugreg[4]) break;
+                         if(addr == (long) &dummy->u_debugreg[5]) break;
+                         if(addr < (long) &dummy->u_debugreg[4] &&
+                            ((unsigned long) data) >= TASK_SIZE-3) break;
+                         
+                         /* Sanity-check data. Take one half-byte at once with
+                          * check = (val >> (16 + 4*i)) & 0xf. It contains the
+                          * R/Wi and LENi bits; bits 0 and 1 are R/Wi, and bits
+                          * 2 and 3 are LENi. Given a list of invalid values,
+                          * we do mask |= 1 << invalid_value, so that
+                          * (mask >> check) & 1 is a correct test for invalid
+                          * values.
+                          *
+                          * R/Wi contains the type of the breakpoint /
+                          * watchpoint, LENi contains the length of the watched
+                          * data in the watchpoint case.
+                          *
+                          * The invalid values are:
+                          * - LENi == 0x10 (undefined), so mask |= 0x0f00.
+                          * - R/Wi == 0x10 (break on I/O reads or writes), so
+                          *   mask |= 0x4444.
+                          * - R/Wi == 0x00 && LENi != 0x00, so we have mask |=
+                          *   0x1110.
+                          *
+                          * Finally, mask = 0x0f00 | 0x4444 | 0x1110 == 0x5f54.
+                          *
+                          * See the Intel Manual "System Programming Guide",
+                          * 15.2.4
+                          *
+                          * Note that LENi == 0x10 is defined on x86_64 in long
+                          * mode (i.e. even for 32-bit userspace software, but
+                          * 64-bit kernel), so the x86_64 mask value is 0x5454.
+                          * See the AMD manual no. 24593 (AMD64 System
+                          * Programming)*/
+
+                         if(addr == (long) &dummy->u_debugreg[7]) {
+                                 data &= ~DR_CONTROL_RESERVED;
+                                 for(i=0; i<4; i++)
+                                         if ((0x5f54 >> ((data >> (16 + 4*i)) & 0xf)) & 1)
+                                                 goto out_tsk;
+                                 if (data)
+                                         set_tsk_thread_flag(child, TIF_DEBUG);
+                                 else
+                                         clear_tsk_thread_flag(child, TIF_DEBUG);
+                         }
+                         addr -= (long) &dummy->u_debugreg;
+                         addr = addr >> 2;
+                         child->thread.debugreg[addr] = data;
+                         ret = 0;
+                 }
+                 break;
+
+       case PTRACE_SYSEMU: /* continue and stop at next syscall, which will not be executed */
+       case PTRACE_SYSCALL:    /* continue and stop at next (return from) syscall */
+       case PTRACE_CONT:       /* restart after signal. */
+               ret = -EIO;
+               if (!valid_signal(data))
+                       break;
+               if (request == PTRACE_SYSEMU) {
+                       set_tsk_thread_flag(child, TIF_SYSCALL_EMU);
+                       clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
+               } else if (request == PTRACE_SYSCALL) {
+                       set_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
+                       clear_tsk_thread_flag(child, TIF_SYSCALL_EMU);
+               } else {
+                       clear_tsk_thread_flag(child, TIF_SYSCALL_EMU);
+                       clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
+               }
+               child->exit_code = data;
+               /* make sure the single step bit is not set. */
+               clear_singlestep(child);
+               wake_up_process(child);
+               ret = 0;
+               break;
+
+/*
+ * make the child exit.  Best I can do is send it a sigkill. 
+ * perhaps it should be put in the status that it wants to 
+ * exit.
+ */
+       case PTRACE_KILL:
+               ret = 0;
+               if (child->exit_state == EXIT_ZOMBIE)   /* already dead */
+                       break;
+               child->exit_code = SIGKILL;
+               /* make sure the single step bit is not set. */
+               clear_singlestep(child);
+               wake_up_process(child);
+               break;
+
+       case PTRACE_SYSEMU_SINGLESTEP: /* Same as SYSEMU, but singlestep if not syscall */
+       case PTRACE_SINGLESTEP: /* set the trap flag. */
+               ret = -EIO;
+               if (!valid_signal(data))
+                       break;
+
+               if (request == PTRACE_SYSEMU_SINGLESTEP)
+                       set_tsk_thread_flag(child, TIF_SYSCALL_EMU);
+               else
+                       clear_tsk_thread_flag(child, TIF_SYSCALL_EMU);
+
+               clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
+               set_singlestep(child);
+               child->exit_code = data;
+               /* give it a chance to run. */
+               wake_up_process(child);
+               ret = 0;
+               break;
+
+       case PTRACE_DETACH:
+               /* detach a process that was attached. */
+               ret = ptrace_detach(child, data);
+               break;
+
+       case PTRACE_GETREGS: { /* Get all gp regs from the child. */
+               if (!access_ok(VERIFY_WRITE, datap, FRAME_SIZE*sizeof(long))) {
+                       ret = -EIO;
+                       break;
+               }
+               for ( i = 0; i < FRAME_SIZE*sizeof(long); i += sizeof(long) ) {
+                       __put_user(getreg(child, i), datap);
+                       datap++;
+               }
+               ret = 0;
+               break;
+       }
+
+       case PTRACE_SETREGS: { /* Set all gp regs in the child. */
+               unsigned long tmp;
+               if (!access_ok(VERIFY_READ, datap, FRAME_SIZE*sizeof(long))) {
+                       ret = -EIO;
+                       break;
+               }
+               for ( i = 0; i < FRAME_SIZE*sizeof(long); i += sizeof(long) ) {
+                       __get_user(tmp, datap);
+                       putreg(child, i, tmp);
+                       datap++;
+               }
+               ret = 0;
+               break;
+       }
+
+       case PTRACE_GETFPREGS: { /* Get the child FPU state. */
+               if (!access_ok(VERIFY_WRITE, datap,
+                              sizeof(struct user_i387_struct))) {
+                       ret = -EIO;
+                       break;
+               }
+               ret = 0;
+               if (!tsk_used_math(child))
+                       init_fpu(child);
+               get_fpregs((struct user_i387_struct __user *)data, child);
+               break;
+       }
+
+       case PTRACE_SETFPREGS: { /* Set the child FPU state. */
+               if (!access_ok(VERIFY_READ, datap,
+                              sizeof(struct user_i387_struct))) {
+                       ret = -EIO;
+                       break;
+               }
+               set_stopped_child_used_math(child);
+               set_fpregs(child, (struct user_i387_struct __user *)data);
+               ret = 0;
+               break;
+       }
+
+       case PTRACE_GETFPXREGS: { /* Get the child extended FPU state. */
+               if (!access_ok(VERIFY_WRITE, datap,
+                              sizeof(struct user_fxsr_struct))) {
+                       ret = -EIO;
+                       break;
+               }
+               if (!tsk_used_math(child))
+                       init_fpu(child);
+               ret = get_fpxregs((struct user_fxsr_struct __user *)data, child);
+               break;
+       }
+
+       case PTRACE_SETFPXREGS: { /* Set the child extended FPU state. */
+               if (!access_ok(VERIFY_READ, datap,
+                              sizeof(struct user_fxsr_struct))) {
+                       ret = -EIO;
+                       break;
+               }
+               set_stopped_child_used_math(child);
+               ret = set_fpxregs(child, (struct user_fxsr_struct __user *)data);
+               break;
+       }
+
+       case PTRACE_GET_THREAD_AREA:
+               ret = ptrace_get_thread_area(child, addr,
+                                       (struct user_desc __user *) data);
+               break;
+
+       case PTRACE_SET_THREAD_AREA:
+               ret = ptrace_set_thread_area(child, addr,
+                                       (struct user_desc __user *) data);
+               break;
+
+       default:
+               ret = ptrace_request(child, request, addr, data);
+               break;
+       }
+ out_tsk:
+       return ret;
+}
+
+void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code)
+{
+       struct siginfo info;
+
+       tsk->thread.trap_no = 1;
+       tsk->thread.error_code = error_code;
+
+       memset(&info, 0, sizeof(info));
+       info.si_signo = SIGTRAP;
+       info.si_code = TRAP_BRKPT;
+
+       /* User-mode eip? */
+       info.si_addr = user_mode_vm(regs) ? (void __user *) regs->eip : NULL;
+
+       /* Send us the fakey SIGTRAP */
+       force_sig_info(SIGTRAP, &info, tsk);
+}
+
+/* notification of system call entry/exit
+ * - triggered by current->work.syscall_trace
+ */
+__attribute__((regparm(3)))
+int do_syscall_trace(struct pt_regs *regs, int entryexit)
+{
+       int is_sysemu = test_thread_flag(TIF_SYSCALL_EMU);
+       /*
+        * With TIF_SYSCALL_EMU set we want to ignore TIF_SINGLESTEP for syscall
+        * interception
+        */
+       int is_singlestep = !is_sysemu && test_thread_flag(TIF_SINGLESTEP);
+       int ret = 0;
+
+       /* do the secure computing check first */
+       if (!entryexit)
+               secure_computing(regs->orig_eax);
+
+       if (unlikely(current->audit_context)) {
+               if (entryexit)
+                       audit_syscall_exit(AUDITSC_RESULT(regs->eax),
+                                               regs->eax);
+               /* Debug traps, when using PTRACE_SINGLESTEP, must be sent only
+                * on the syscall exit path. Normally, when TIF_SYSCALL_AUDIT is
+                * not used, entry.S will call us only on syscall exit, not
+                * entry; so when TIF_SYSCALL_AUDIT is used we must avoid
+                * calling send_sigtrap() on syscall entry.
+                *
+                * Note that when PTRACE_SYSEMU_SINGLESTEP is used,
+                * is_singlestep is false, despite his name, so we will still do
+                * the correct thing.
+                */
+               else if (is_singlestep)
+                       goto out;
+       }
+
+       if (!(current->ptrace & PT_PTRACED))
+               goto out;
+
+       /* If a process stops on the 1st tracepoint with SYSCALL_TRACE
+        * and then is resumed with SYSEMU_SINGLESTEP, it will come in
+        * here. We have to check this and return */
+       if (is_sysemu && entryexit)
+               return 0;
+
+       /* Fake a debug trap */
+       if (is_singlestep)
+               send_sigtrap(current, regs, 0);
+
+       if (!test_thread_flag(TIF_SYSCALL_TRACE) && !is_sysemu)
+               goto out;
+
+       /* the 0x80 provides a way for the tracing parent to distinguish
+          between a syscall stop and SIGTRAP delivery */
+       /* Note that the debugger could change the result of test_thread_flag!*/
+       ptrace_notify(SIGTRAP | ((current->ptrace & PT_TRACESYSGOOD) ? 0x80:0));
+
+       /*
+        * this isn't the same as continuing with a signal, but it will do
+        * for normal use.  strace only continues with a signal if the
+        * stopping signal is not SIGTRAP.  -brl
+        */
+       if (current->exit_code) {
+               send_sig(current->exit_code, current, 1);
+               current->exit_code = 0;
+       }
+       ret = is_sysemu;
+out:
+       if (unlikely(current->audit_context) && !entryexit)
+               audit_syscall_entry(AUDIT_ARCH_I386, regs->orig_eax,
+                                   regs->ebx, regs->ecx, regs->edx, regs->esi);
+       if (ret == 0)
+               return 0;
+
+       regs->orig_eax = -1; /* force skip of syscall restarting */
+       if (unlikely(current->audit_context))
+               audit_syscall_exit(AUDITSC_RESULT(regs->eax), regs->eax);
+       return 1;
+}
diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c
new file mode 100644 (file)
index 0000000..6722469
--- /dev/null
@@ -0,0 +1,49 @@
+/*
+ * This file contains work-arounds for x86 and x86_64 platform bugs.
+ */
+#include <linux/pci.h>
+#include <linux/irq.h>
+
+#if defined(CONFIG_X86_IO_APIC) && defined(CONFIG_SMP) && defined(CONFIG_PCI)
+
+static void __devinit quirk_intel_irqbalance(struct pci_dev *dev)
+{
+       u8 config, rev;
+       u32 word;
+
+       /* BIOS may enable hardware IRQ balancing for
+        * E7520/E7320/E7525(revision ID 0x9 and below)
+        * based platforms.
+        * Disable SW irqbalance/affinity on those platforms.
+        */
+       pci_read_config_byte(dev, PCI_CLASS_REVISION, &rev);
+       if (rev > 0x9)
+               return;
+
+       /* enable access to config space*/
+       pci_read_config_byte(dev, 0xf4, &config);
+       pci_write_config_byte(dev, 0xf4, config|0x2);
+
+       /* read xTPR register */
+       raw_pci_ops->read(0, 0, 0x40, 0x4c, 2, &word);
+
+       if (!(word & (1 << 13))) {
+               printk(KERN_INFO "Intel E7520/7320/7525 detected. "
+                       "Disabling irq balancing and affinity\n");
+#ifdef CONFIG_IRQBALANCE
+               irqbalance_disable("");
+#endif
+               noirqdebug_setup("");
+#ifdef CONFIG_PROC_FS
+               no_irq_affinity = 1;
+#endif
+       }
+
+       /* put back the original value for config space*/
+       if (!(config & 0x2))
+               pci_write_config_byte(dev, 0xf4, config);
+}
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL,   PCI_DEVICE_ID_INTEL_E7320_MCH,  quirk_intel_irqbalance);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL,   PCI_DEVICE_ID_INTEL_E7525_MCH,  quirk_intel_irqbalance);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL,   PCI_DEVICE_ID_INTEL_E7520_MCH,  quirk_intel_irqbalance);
+#endif
diff --git a/arch/x86/kernel/reboot_32.c b/arch/x86/kernel/reboot_32.c
new file mode 100644 (file)
index 0000000..0d79624
--- /dev/null
@@ -0,0 +1,413 @@
+/*
+ *  linux/arch/i386/kernel/reboot.c
+ */
+
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/delay.h>
+#include <linux/init.h>
+#include <linux/interrupt.h>
+#include <linux/mc146818rtc.h>
+#include <linux/efi.h>
+#include <linux/dmi.h>
+#include <linux/ctype.h>
+#include <linux/pm.h>
+#include <linux/reboot.h>
+#include <asm/uaccess.h>
+#include <asm/apic.h>
+#include <asm/desc.h>
+#include "mach_reboot.h"
+#include <asm/reboot_fixups.h>
+#include <asm/reboot.h>
+
+/*
+ * Power off function, if any
+ */
+void (*pm_power_off)(void);
+EXPORT_SYMBOL(pm_power_off);
+
+static int reboot_mode;
+static int reboot_thru_bios;
+
+#ifdef CONFIG_SMP
+static int reboot_cpu = -1;
+#endif
+static int __init reboot_setup(char *str)
+{
+       while(1) {
+               switch (*str) {
+               case 'w': /* "warm" reboot (no memory testing etc) */
+                       reboot_mode = 0x1234;
+                       break;
+               case 'c': /* "cold" reboot (with memory testing etc) */
+                       reboot_mode = 0x0;
+                       break;
+               case 'b': /* "bios" reboot by jumping through the BIOS */
+                       reboot_thru_bios = 1;
+                       break;
+               case 'h': /* "hard" reboot by toggling RESET and/or crashing the CPU */
+                       reboot_thru_bios = 0;
+                       break;
+#ifdef CONFIG_SMP
+               case 's': /* "smp" reboot by executing reset on BSP or other CPU*/
+                       if (isdigit(*(str+1))) {
+                               reboot_cpu = (int) (*(str+1) - '0');
+                               if (isdigit(*(str+2)))
+                                       reboot_cpu = reboot_cpu*10 + (int)(*(str+2) - '0');
+                       }
+                               /* we will leave sorting out the final value 
+                               when we are ready to reboot, since we might not
+                               have set up boot_cpu_id or smp_num_cpu */
+                       break;
+#endif
+               }
+               if((str = strchr(str,',')) != NULL)
+                       str++;
+               else
+                       break;
+       }
+       return 1;
+}
+
+__setup("reboot=", reboot_setup);
+
+/*
+ * Reboot options and system auto-detection code provided by
+ * Dell Inc. so their systems "just work". :-)
+ */
+
+/*
+ * Some machines require the "reboot=b"  commandline option, this quirk makes that automatic.
+ */
+static int __init set_bios_reboot(struct dmi_system_id *d)
+{
+       if (!reboot_thru_bios) {
+               reboot_thru_bios = 1;
+               printk(KERN_INFO "%s series board detected. Selecting BIOS-method for reboots.\n", d->ident);
+       }
+       return 0;
+}
+
+static struct dmi_system_id __initdata reboot_dmi_table[] = {
+       {       /* Handle problems with rebooting on Dell E520's */
+               .callback = set_bios_reboot,
+               .ident = "Dell E520",
+               .matches = {
+                       DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+                       DMI_MATCH(DMI_PRODUCT_NAME, "Dell DM061"),
+               },
+       },
+       {       /* Handle problems with rebooting on Dell 1300's */
+               .callback = set_bios_reboot,
+               .ident = "Dell PowerEdge 1300",
+               .matches = {
+                       DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"),
+                       DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 1300/"),
+               },
+       },
+       {       /* Handle problems with rebooting on Dell 300's */
+               .callback = set_bios_reboot,
+               .ident = "Dell PowerEdge 300",
+               .matches = {
+                       DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"),
+                       DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 300/"),
+               },
+       },
+       {       /* Handle problems with rebooting on Dell Optiplex 745's SFF*/
+               .callback = set_bios_reboot,
+               .ident = "Dell OptiPlex 745",
+               .matches = {
+                       DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+                       DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 745"),
+                       DMI_MATCH(DMI_BOARD_NAME, "0WF810"),
+               },
+       },
+       {       /* Handle problems with rebooting on Dell 2400's */
+               .callback = set_bios_reboot,
+               .ident = "Dell PowerEdge 2400",
+               .matches = {
+                       DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"),
+                       DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 2400"),
+               },
+       },
+       {       /* Handle problems with rebooting on HP laptops */
+               .callback = set_bios_reboot,
+               .ident = "HP Compaq Laptop",
+               .matches = {
+                       DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"),
+                       DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq"),
+               },
+       },
+       { }
+};
+
+static int __init reboot_init(void)
+{
+       dmi_check_system(reboot_dmi_table);
+       return 0;
+}
+
+core_initcall(reboot_init);
+
+/* The following code and data reboots the machine by switching to real
+   mode and jumping to the BIOS reset entry point, as if the CPU has
+   really been reset.  The previous version asked the keyboard
+   controller to pulse the CPU reset line, which is more thorough, but
+   doesn't work with at least one type of 486 motherboard.  It is easy
+   to stop this code working; hence the copious comments. */
+
+static unsigned long long
+real_mode_gdt_entries [3] =
+{
+       0x0000000000000000ULL,  /* Null descriptor */
+       0x00009a000000ffffULL,  /* 16-bit real-mode 64k code at 0x00000000 */
+       0x000092000100ffffULL   /* 16-bit real-mode 64k data at 0x00000100 */
+};
+
+static struct Xgt_desc_struct
+real_mode_gdt = { sizeof (real_mode_gdt_entries) - 1, (long)real_mode_gdt_entries },
+real_mode_idt = { 0x3ff, 0 },
+no_idt = { 0, 0 };
+
+
+/* This is 16-bit protected mode code to disable paging and the cache,
+   switch to real mode and jump to the BIOS reset code.
+
+   The instruction that switches to real mode by writing to CR0 must be
+   followed immediately by a far jump instruction, which set CS to a
+   valid value for real mode, and flushes the prefetch queue to avoid
+   running instructions that have already been decoded in protected
+   mode.
+
+   Clears all the flags except ET, especially PG (paging), PE
+   (protected-mode enable) and TS (task switch for coprocessor state
+   save).  Flushes the TLB after paging has been disabled.  Sets CD and
+   NW, to disable the cache on a 486, and invalidates the cache.  This
+   is more like the state of a 486 after reset.  I don't know if
+   something else should be done for other chips.
+
+   More could be done here to set up the registers as if a CPU reset had
+   occurred; hopefully real BIOSs don't assume much. */
+
+static unsigned char real_mode_switch [] =
+{
+       0x66, 0x0f, 0x20, 0xc0,                 /*    movl  %cr0,%eax        */
+       0x66, 0x83, 0xe0, 0x11,                 /*    andl  $0x00000011,%eax */
+       0x66, 0x0d, 0x00, 0x00, 0x00, 0x60,     /*    orl   $0x60000000,%eax */
+       0x66, 0x0f, 0x22, 0xc0,                 /*    movl  %eax,%cr0        */
+       0x66, 0x0f, 0x22, 0xd8,                 /*    movl  %eax,%cr3        */
+       0x66, 0x0f, 0x20, 0xc3,                 /*    movl  %cr0,%ebx        */
+       0x66, 0x81, 0xe3, 0x00, 0x00, 0x00, 0x60,       /*    andl  $0x60000000,%ebx */
+       0x74, 0x02,                             /*    jz    f                */
+       0x0f, 0x09,                             /*    wbinvd                 */
+       0x24, 0x10,                             /* f: andb  $0x10,al         */
+       0x66, 0x0f, 0x22, 0xc0                  /*    movl  %eax,%cr0        */
+};
+static unsigned char jump_to_bios [] =
+{
+       0xea, 0x00, 0x00, 0xff, 0xff            /*    ljmp  $0xffff,$0x0000  */
+};
+
+/*
+ * Switch to real mode and then execute the code
+ * specified by the code and length parameters.
+ * We assume that length will aways be less that 100!
+ */
+void machine_real_restart(unsigned char *code, int length)
+{
+       local_irq_disable();
+
+       /* Write zero to CMOS register number 0x0f, which the BIOS POST
+          routine will recognize as telling it to do a proper reboot.  (Well
+          that's what this book in front of me says -- it may only apply to
+          the Phoenix BIOS though, it's not clear).  At the same time,
+          disable NMIs by setting the top bit in the CMOS address register,
+          as we're about to do peculiar things to the CPU.  I'm not sure if
+          `outb_p' is needed instead of just `outb'.  Use it to be on the
+          safe side.  (Yes, CMOS_WRITE does outb_p's. -  Paul G.)
+        */
+
+       spin_lock(&rtc_lock);
+       CMOS_WRITE(0x00, 0x8f);
+       spin_unlock(&rtc_lock);
+
+       /* Remap the kernel at virtual address zero, as well as offset zero
+          from the kernel segment.  This assumes the kernel segment starts at
+          virtual address PAGE_OFFSET. */
+
+       memcpy (swapper_pg_dir, swapper_pg_dir + USER_PGD_PTRS,
+               sizeof (swapper_pg_dir [0]) * KERNEL_PGD_PTRS);
+
+       /*
+        * Use `swapper_pg_dir' as our page directory.
+        */
+       load_cr3(swapper_pg_dir);
+
+       /* Write 0x1234 to absolute memory location 0x472.  The BIOS reads
+          this on booting to tell it to "Bypass memory test (also warm
+          boot)".  This seems like a fairly standard thing that gets set by
+          REBOOT.COM programs, and the previous reset routine did this
+          too. */
+
+       *((unsigned short *)0x472) = reboot_mode;
+
+       /* For the switch to real mode, copy some code to low memory.  It has
+          to be in the first 64k because it is running in 16-bit mode, and it
+          has to have the same physical and virtual address, because it turns
+          off paging.  Copy it near the end of the first page, out of the way
+          of BIOS variables. */
+
+       memcpy ((void *) (0x1000 - sizeof (real_mode_switch) - 100),
+               real_mode_switch, sizeof (real_mode_switch));
+       memcpy ((void *) (0x1000 - 100), code, length);
+
+       /* Set up the IDT for real mode. */
+
+       load_idt(&real_mode_idt);
+
+       /* Set up a GDT from which we can load segment descriptors for real
+          mode.  The GDT is not used in real mode; it is just needed here to
+          prepare the descriptors. */
+
+       load_gdt(&real_mode_gdt);
+
+       /* Load the data segment registers, and thus the descriptors ready for
+          real mode.  The base address of each segment is 0x100, 16 times the
+          selector value being loaded here.  This is so that the segment
+          registers don't have to be reloaded after switching to real mode:
+          the values are consistent for real mode operation already. */
+
+       __asm__ __volatile__ ("movl $0x0010,%%eax\n"
+                               "\tmovl %%eax,%%ds\n"
+                               "\tmovl %%eax,%%es\n"
+                               "\tmovl %%eax,%%fs\n"
+                               "\tmovl %%eax,%%gs\n"
+                               "\tmovl %%eax,%%ss" : : : "eax");
+
+       /* Jump to the 16-bit code that we copied earlier.  It disables paging
+          and the cache, switches to real mode, and jumps to the BIOS reset
+          entry point. */
+
+       __asm__ __volatile__ ("ljmp $0x0008,%0"
+                               :
+                               : "i" ((void *) (0x1000 - sizeof (real_mode_switch) - 100)));
+}
+#ifdef CONFIG_APM_MODULE
+EXPORT_SYMBOL(machine_real_restart);
+#endif
+
+static void native_machine_shutdown(void)
+{
+#ifdef CONFIG_SMP
+       int reboot_cpu_id;
+
+       /* The boot cpu is always logical cpu 0 */
+       reboot_cpu_id = 0;
+
+       /* See if there has been given a command line override */
+       if ((reboot_cpu != -1) && (reboot_cpu < NR_CPUS) &&
+               cpu_isset(reboot_cpu, cpu_online_map)) {
+               reboot_cpu_id = reboot_cpu;
+       }
+
+       /* Make certain the cpu I'm rebooting on is online */
+       if (!cpu_isset(reboot_cpu_id, cpu_online_map)) {
+               reboot_cpu_id = smp_processor_id();
+       }
+
+       /* Make certain I only run on the appropriate processor */
+       set_cpus_allowed(current, cpumask_of_cpu(reboot_cpu_id));
+
+       /* O.K. Now that I'm on the appropriate processor, stop
+        * all of the others, and disable their local APICs.
+        */
+
+       smp_send_stop();
+#endif /* CONFIG_SMP */
+
+       lapic_shutdown();
+
+#ifdef CONFIG_X86_IO_APIC
+       disable_IO_APIC();
+#endif
+}
+
+void __attribute__((weak)) mach_reboot_fixups(void)
+{
+}
+
+static void native_machine_emergency_restart(void)
+{
+       if (!reboot_thru_bios) {
+               if (efi_enabled) {
+                       efi.reset_system(EFI_RESET_COLD, EFI_SUCCESS, 0, NULL);
+                       load_idt(&no_idt);
+                       __asm__ __volatile__("int3");
+               }
+               /* rebooting needs to touch the page at absolute addr 0 */
+               *((unsigned short *)__va(0x472)) = reboot_mode;
+               for (;;) {
+                       mach_reboot_fixups(); /* for board specific fixups */
+                       mach_reboot();
+                       /* That didn't work - force a triple fault.. */
+                       load_idt(&no_idt);
+                       __asm__ __volatile__("int3");
+               }
+       }
+       if (efi_enabled)
+               efi.reset_system(EFI_RESET_WARM, EFI_SUCCESS, 0, NULL);
+
+       machine_real_restart(jump_to_bios, sizeof(jump_to_bios));
+}
+
+static void native_machine_restart(char * __unused)
+{
+       machine_shutdown();
+       machine_emergency_restart();
+}
+
+static void native_machine_halt(void)
+{
+}
+
+static void native_machine_power_off(void)
+{
+       if (pm_power_off) {
+               machine_shutdown();
+               pm_power_off();
+       }
+}
+
+
+struct machine_ops machine_ops = {
+       .power_off = native_machine_power_off,
+       .shutdown = native_machine_shutdown,
+       .emergency_restart = native_machine_emergency_restart,
+       .restart = native_machine_restart,
+       .halt = native_machine_halt,
+};
+
+void machine_power_off(void)
+{
+       machine_ops.power_off();
+}
+
+void machine_shutdown(void)
+{
+       machine_ops.shutdown();
+}
+
+void machine_emergency_restart(void)
+{
+       machine_ops.emergency_restart();
+}
+
+void machine_restart(char *cmd)
+{
+       machine_ops.restart(cmd);
+}
+
+void machine_halt(void)
+{
+       machine_ops.halt();
+}
diff --git a/arch/x86/kernel/reboot_fixups_32.c b/arch/x86/kernel/reboot_fixups_32.c
new file mode 100644 (file)
index 0000000..03e1cce
--- /dev/null
@@ -0,0 +1,68 @@
+/*
+ * linux/arch/i386/kernel/reboot_fixups.c
+ *
+ * This is a good place to put board specific reboot fixups.
+ *
+ * List of supported fixups:
+ * geode-gx1/cs5530a - Jaya Kumar <jayalk@intworks.biz>
+ * geode-gx/lx/cs5536 - Andres Salomon <dilinger@debian.org>
+ *
+ */
+
+#include <asm/delay.h>
+#include <linux/pci.h>
+#include <asm/reboot_fixups.h>
+#include <asm/msr.h>
+
+static void cs5530a_warm_reset(struct pci_dev *dev)
+{
+       /* writing 1 to the reset control register, 0x44 causes the
+       cs5530a to perform a system warm reset */
+       pci_write_config_byte(dev, 0x44, 0x1);
+       udelay(50); /* shouldn't get here but be safe and spin-a-while */
+       return;
+}
+
+static void cs5536_warm_reset(struct pci_dev *dev)
+{
+       /*
+        * 6.6.2.12 Soft Reset (DIVIL_SOFT_RESET)
+        * writing 1 to the LSB of this MSR causes a hard reset.
+        */
+       wrmsrl(0x51400017, 1ULL);
+       udelay(50); /* shouldn't get here but be safe and spin a while */
+}
+
+struct device_fixup {
+       unsigned int vendor;
+       unsigned int device;
+       void (*reboot_fixup)(struct pci_dev *);
+};
+
+static struct device_fixup fixups_table[] = {
+{ PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5530_LEGACY, cs5530a_warm_reset },
+{ PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CS5536_ISA, cs5536_warm_reset },
+};
+
+/*
+ * we see if any fixup is available for our current hardware. if there
+ * is a fixup, we call it and we expect to never return from it. if we
+ * do return, we keep looking and then eventually fall back to the
+ * standard mach_reboot on return.
+ */
+void mach_reboot_fixups(void)
+{
+       struct device_fixup *cur;
+       struct pci_dev *dev;
+       int i;
+
+       for (i=0; i < ARRAY_SIZE(fixups_table); i++) {
+               cur = &(fixups_table[i]);
+               dev = pci_get_device(cur->vendor, cur->device, NULL);
+               if (!dev)
+                       continue;
+
+               cur->reboot_fixup(dev);
+       }
+}
+
diff --git a/arch/x86/kernel/relocate_kernel_32.S b/arch/x86/kernel/relocate_kernel_32.S
new file mode 100644 (file)
index 0000000..f151d6f
--- /dev/null
@@ -0,0 +1,252 @@
+/*
+ * relocate_kernel.S - put the kernel image in place to boot
+ * Copyright (C) 2002-2004 Eric Biederman  <ebiederm@xmission.com>
+ *
+ * This source code is licensed under the GNU General Public License,
+ * Version 2.  See the file COPYING for more details.
+ */
+
+#include <linux/linkage.h>
+#include <asm/page.h>
+#include <asm/kexec.h>
+
+/*
+ * Must be relocatable PIC code callable as a C function
+ */
+
+#define PTR(x) (x << 2)
+#define PAGE_ALIGNED (1 << PAGE_SHIFT)
+#define PAGE_ATTR 0x63 /* _PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY */
+#define PAE_PGD_ATTR 0x01 /* _PAGE_PRESENT */
+
+       .text
+       .align PAGE_ALIGNED
+       .globl relocate_kernel
+relocate_kernel:
+       movl    8(%esp), %ebp /* list of pages */
+
+#ifdef CONFIG_X86_PAE
+       /* map the control page at its virtual address */
+
+       movl    PTR(VA_PGD)(%ebp), %edi
+       movl    PTR(VA_CONTROL_PAGE)(%ebp), %eax
+       andl    $0xc0000000, %eax
+       shrl    $27, %eax
+       addl    %edi, %eax
+
+       movl    PTR(PA_PMD_0)(%ebp), %edx
+       orl     $PAE_PGD_ATTR, %edx
+       movl    %edx, (%eax)
+
+       movl    PTR(VA_PMD_0)(%ebp), %edi
+       movl    PTR(VA_CONTROL_PAGE)(%ebp), %eax
+       andl    $0x3fe00000, %eax
+       shrl    $18, %eax
+       addl    %edi, %eax
+
+       movl    PTR(PA_PTE_0)(%ebp), %edx
+       orl     $PAGE_ATTR, %edx
+       movl    %edx, (%eax)
+
+       movl    PTR(VA_PTE_0)(%ebp), %edi
+       movl    PTR(VA_CONTROL_PAGE)(%ebp), %eax
+       andl    $0x001ff000, %eax
+       shrl    $9, %eax
+       addl    %edi, %eax
+
+       movl    PTR(PA_CONTROL_PAGE)(%ebp), %edx
+       orl     $PAGE_ATTR, %edx
+       movl    %edx, (%eax)
+
+       /* identity map the control page at its physical address */
+
+       movl    PTR(VA_PGD)(%ebp), %edi
+       movl    PTR(PA_CONTROL_PAGE)(%ebp), %eax
+       andl    $0xc0000000, %eax
+       shrl    $27, %eax
+       addl    %edi, %eax
+
+       movl    PTR(PA_PMD_1)(%ebp), %edx
+       orl     $PAE_PGD_ATTR, %edx
+       movl    %edx, (%eax)
+
+       movl    PTR(VA_PMD_1)(%ebp), %edi
+       movl    PTR(PA_CONTROL_PAGE)(%ebp), %eax
+       andl    $0x3fe00000, %eax
+       shrl    $18, %eax
+       addl    %edi, %eax
+
+       movl    PTR(PA_PTE_1)(%ebp), %edx
+       orl     $PAGE_ATTR, %edx
+       movl    %edx, (%eax)
+
+       movl    PTR(VA_PTE_1)(%ebp), %edi
+       movl    PTR(PA_CONTROL_PAGE)(%ebp), %eax
+       andl    $0x001ff000, %eax
+       shrl    $9, %eax
+       addl    %edi, %eax
+
+       movl    PTR(PA_CONTROL_PAGE)(%ebp), %edx
+       orl     $PAGE_ATTR, %edx
+       movl    %edx, (%eax)
+#else
+       /* map the control page at its virtual address */
+
+       movl    PTR(VA_PGD)(%ebp), %edi
+       movl    PTR(VA_CONTROL_PAGE)(%ebp), %eax
+       andl    $0xffc00000, %eax
+       shrl    $20, %eax
+       addl    %edi, %eax
+
+       movl    PTR(PA_PTE_0)(%ebp), %edx
+       orl     $PAGE_ATTR, %edx
+       movl    %edx, (%eax)
+
+       movl    PTR(VA_PTE_0)(%ebp), %edi
+       movl    PTR(VA_CONTROL_PAGE)(%ebp), %eax
+       andl    $0x003ff000, %eax
+       shrl    $10, %eax
+       addl    %edi, %eax
+
+       movl    PTR(PA_CONTROL_PAGE)(%ebp), %edx
+       orl     $PAGE_ATTR, %edx
+       movl    %edx, (%eax)
+
+       /* identity map the control page at its physical address */
+
+       movl    PTR(VA_PGD)(%ebp), %edi
+       movl    PTR(PA_CONTROL_PAGE)(%ebp), %eax
+       andl    $0xffc00000, %eax
+       shrl    $20, %eax
+       addl    %edi, %eax
+
+       movl    PTR(PA_PTE_1)(%ebp), %edx
+       orl     $PAGE_ATTR, %edx
+       movl    %edx, (%eax)
+
+       movl    PTR(VA_PTE_1)(%ebp), %edi
+       movl    PTR(PA_CONTROL_PAGE)(%ebp), %eax
+       andl    $0x003ff000, %eax
+       shrl    $10, %eax
+       addl    %edi, %eax
+
+       movl    PTR(PA_CONTROL_PAGE)(%ebp), %edx
+       orl     $PAGE_ATTR, %edx
+       movl    %edx, (%eax)
+#endif
+
+relocate_new_kernel:
+       /* read the arguments and say goodbye to the stack */
+       movl  4(%esp), %ebx /* page_list */
+       movl  8(%esp), %ebp /* list of pages */
+       movl  12(%esp), %edx /* start address */
+       movl  16(%esp), %ecx /* cpu_has_pae */
+
+       /* zero out flags, and disable interrupts */
+       pushl $0
+       popfl
+
+       /* get physical address of control page now */
+       /* this is impossible after page table switch */
+       movl    PTR(PA_CONTROL_PAGE)(%ebp), %edi
+
+       /* switch to new set of page tables */
+       movl    PTR(PA_PGD)(%ebp), %eax
+       movl    %eax, %cr3
+
+       /* setup a new stack at the end of the physical control page */
+       lea     4096(%edi), %esp
+
+       /* jump to identity mapped page */
+       movl    %edi, %eax
+       addl    $(identity_mapped - relocate_kernel), %eax
+       pushl   %eax
+       ret
+
+identity_mapped:
+       /* store the start address on the stack */
+       pushl   %edx
+
+       /* Set cr0 to a known state:
+        * 31 0 == Paging disabled
+        * 18 0 == Alignment check disabled
+        * 16 0 == Write protect disabled
+        * 3  0 == No task switch
+        * 2  0 == Don't do FP software emulation.
+        * 0  1 == Proctected mode enabled
+        */
+       movl    %cr0, %eax
+       andl    $~((1<<31)|(1<<18)|(1<<16)|(1<<3)|(1<<2)), %eax
+       orl     $(1<<0), %eax
+       movl    %eax, %cr0
+
+       /* clear cr4 if applicable */
+       testl   %ecx, %ecx
+       jz      1f
+       /* Set cr4 to a known state:
+        * Setting everything to zero seems safe.
+        */
+       movl    %cr4, %eax
+       andl    $0, %eax
+       movl    %eax, %cr4
+
+       jmp 1f
+1:
+
+       /* Flush the TLB (needed?) */
+       xorl    %eax, %eax
+       movl    %eax, %cr3
+
+       /* Do the copies */
+       movl    %ebx, %ecx
+       jmp     1f
+
+0:     /* top, read another word from the indirection page */
+       movl    (%ebx), %ecx
+       addl    $4, %ebx
+1:
+       testl   $0x1,   %ecx  /* is it a destination page */
+       jz      2f
+       movl    %ecx,   %edi
+       andl    $0xfffff000, %edi
+       jmp     0b
+2:
+       testl   $0x2,   %ecx  /* is it an indirection page */
+       jz      2f
+       movl    %ecx,   %ebx
+       andl    $0xfffff000, %ebx
+       jmp     0b
+2:
+       testl   $0x4,   %ecx /* is it the done indicator */
+       jz      2f
+       jmp     3f
+2:
+       testl   $0x8,   %ecx /* is it the source indicator */
+       jz      0b           /* Ignore it otherwise */
+       movl    %ecx,   %esi /* For every source page do a copy */
+       andl    $0xfffff000, %esi
+
+       movl    $1024, %ecx
+       rep ; movsl
+       jmp     0b
+
+3:
+
+       /* To be certain of avoiding problems with self-modifying code
+        * I need to execute a serializing instruction here.
+        * So I flush the TLB, it's handy, and not processor dependent.
+        */
+       xorl    %eax, %eax
+       movl    %eax, %cr3
+
+       /* set all of the registers to known values */
+       /* leave %esp alone */
+
+       xorl    %eax, %eax
+       xorl    %ebx, %ebx
+       xorl    %ecx, %ecx
+       xorl    %edx, %edx
+       xorl    %esi, %esi
+       xorl    %edi, %edi
+       xorl    %ebp, %ebp
+       ret
diff --git a/arch/x86/kernel/scx200_32.c b/arch/x86/kernel/scx200_32.c
new file mode 100644 (file)
index 0000000..c7d3df2
--- /dev/null
@@ -0,0 +1,131 @@
+/* linux/arch/i386/kernel/scx200.c 
+
+   Copyright (c) 2001,2002 Christer Weinigel <wingel@nano-system.com>
+
+   National Semiconductor SCx200 support. */
+
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/mutex.h>
+#include <linux/pci.h>
+
+#include <linux/scx200.h>
+#include <linux/scx200_gpio.h>
+
+/* Verify that the configuration block really is there */
+#define scx200_cb_probe(base) (inw((base) + SCx200_CBA) == (base))
+
+#define NAME "scx200"
+
+MODULE_AUTHOR("Christer Weinigel <wingel@nano-system.com>");
+MODULE_DESCRIPTION("NatSemi SCx200 Driver");
+MODULE_LICENSE("GPL");
+
+unsigned scx200_gpio_base = 0;
+long scx200_gpio_shadow[2];
+
+unsigned scx200_cb_base = 0;
+
+static struct pci_device_id scx200_tbl[] = {
+       { PCI_DEVICE(PCI_VENDOR_ID_NS, PCI_DEVICE_ID_NS_SCx200_BRIDGE) },
+       { PCI_DEVICE(PCI_VENDOR_ID_NS, PCI_DEVICE_ID_NS_SC1100_BRIDGE) },
+       { PCI_DEVICE(PCI_VENDOR_ID_NS, PCI_DEVICE_ID_NS_SCx200_XBUS)   },
+       { PCI_DEVICE(PCI_VENDOR_ID_NS, PCI_DEVICE_ID_NS_SC1100_XBUS)   },
+       { },
+};
+MODULE_DEVICE_TABLE(pci,scx200_tbl);
+
+static int __devinit scx200_probe(struct pci_dev *, const struct pci_device_id *);
+
+static struct pci_driver scx200_pci_driver = {
+       .name = "scx200",
+       .id_table = scx200_tbl,
+       .probe = scx200_probe,
+};
+
+static DEFINE_MUTEX(scx200_gpio_config_lock);
+
+static void __devinit scx200_init_shadow(void)
+{
+       int bank;
+
+       /* read the current values driven on the GPIO signals */
+       for (bank = 0; bank < 2; ++bank)
+               scx200_gpio_shadow[bank] = inl(scx200_gpio_base + 0x10 * bank);
+}
+
+static int __devinit scx200_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
+{
+       unsigned base;
+
+       if (pdev->device == PCI_DEVICE_ID_NS_SCx200_BRIDGE ||
+           pdev->device == PCI_DEVICE_ID_NS_SC1100_BRIDGE) {
+               base = pci_resource_start(pdev, 0);
+               printk(KERN_INFO NAME ": GPIO base 0x%x\n", base);
+
+               if (request_region(base, SCx200_GPIO_SIZE, "NatSemi SCx200 GPIO") == 0) {
+                       printk(KERN_ERR NAME ": can't allocate I/O for GPIOs\n");
+                       return -EBUSY;
+               }
+
+               scx200_gpio_base = base;
+               scx200_init_shadow();
+
+       } else {
+               /* find the base of the Configuration Block */
+               if (scx200_cb_probe(SCx200_CB_BASE_FIXED)) {
+                       scx200_cb_base = SCx200_CB_BASE_FIXED;
+               } else {
+                       pci_read_config_dword(pdev, SCx200_CBA_SCRATCH, &base);
+                       if (scx200_cb_probe(base)) {
+                               scx200_cb_base = base;
+                       } else {
+                               printk(KERN_WARNING NAME ": Configuration Block not found\n");
+                               return -ENODEV;
+                       }
+               }
+               printk(KERN_INFO NAME ": Configuration Block base 0x%x\n", scx200_cb_base);
+       }
+
+       return 0;
+}
+
+u32 scx200_gpio_configure(unsigned index, u32 mask, u32 bits)
+{
+       u32 config, new_config;
+
+       mutex_lock(&scx200_gpio_config_lock);
+
+       outl(index, scx200_gpio_base + 0x20);
+       config = inl(scx200_gpio_base + 0x24);
+
+       new_config = (config & mask) | bits;
+       outl(new_config, scx200_gpio_base + 0x24);
+
+       mutex_unlock(&scx200_gpio_config_lock);
+
+       return config;
+}
+
+static int __init scx200_init(void)
+{
+       printk(KERN_INFO NAME ": NatSemi SCx200 Driver\n");
+
+       return pci_register_driver(&scx200_pci_driver);
+}
+
+static void __exit scx200_cleanup(void)
+{
+       pci_unregister_driver(&scx200_pci_driver);
+       release_region(scx200_gpio_base, SCx200_GPIO_SIZE);
+}
+
+module_init(scx200_init);
+module_exit(scx200_cleanup);
+
+EXPORT_SYMBOL(scx200_gpio_base);
+EXPORT_SYMBOL(scx200_gpio_shadow);
+EXPORT_SYMBOL(scx200_gpio_configure);
+EXPORT_SYMBOL(scx200_cb_base);
diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c
new file mode 100644 (file)
index 0000000..d474cd6
--- /dev/null
@@ -0,0 +1,653 @@
+/*
+ *  linux/arch/i386/kernel/setup.c
+ *
+ *  Copyright (C) 1995  Linus Torvalds
+ *
+ *  Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
+ *
+ *  Memory region support
+ *     David Parsons <orc@pell.chi.il.us>, July-August 1999
+ *
+ *  Added E820 sanitization routine (removes overlapping memory regions);
+ *  Brian Moyle <bmoyle@mvista.com>, February 2001
+ *
+ * Moved CPU detection code to cpu/${cpu}.c
+ *    Patrick Mochel <mochel@osdl.org>, March 2002
+ *
+ *  Provisions for empty E820 memory regions (reported by certain BIOSes).
+ *  Alex Achenbach <xela@slit.de>, December 2002.
+ *
+ */
+
+/*
+ * This file handles the architecture-dependent parts of initialization
+ */
+
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/mmzone.h>
+#include <linux/screen_info.h>
+#include <linux/ioport.h>
+#include <linux/acpi.h>
+#include <linux/apm_bios.h>
+#include <linux/initrd.h>
+#include <linux/bootmem.h>
+#include <linux/seq_file.h>
+#include <linux/console.h>
+#include <linux/mca.h>
+#include <linux/root_dev.h>
+#include <linux/highmem.h>
+#include <linux/module.h>
+#include <linux/efi.h>
+#include <linux/init.h>
+#include <linux/edd.h>
+#include <linux/nodemask.h>
+#include <linux/kexec.h>
+#include <linux/crash_dump.h>
+#include <linux/dmi.h>
+#include <linux/pfn.h>
+
+#include <video/edid.h>
+
+#include <asm/apic.h>
+#include <asm/e820.h>
+#include <asm/mpspec.h>
+#include <asm/mmzone.h>
+#include <asm/setup.h>
+#include <asm/arch_hooks.h>
+#include <asm/sections.h>
+#include <asm/io_apic.h>
+#include <asm/ist.h>
+#include <asm/io.h>
+#include <asm/vmi.h>
+#include <setup_arch.h>
+#include <bios_ebda.h>
+
+/* This value is set up by the early boot code to point to the value
+   immediately after the boot time page tables.  It contains a *physical*
+   address, and must not be in the .bss segment! */
+unsigned long init_pg_tables_end __initdata = ~0UL;
+
+int disable_pse __devinitdata = 0;
+
+/*
+ * Machine setup..
+ */
+extern struct resource code_resource;
+extern struct resource data_resource;
+
+/* cpu data as detected by the assembly code in head.S */
+struct cpuinfo_x86 new_cpu_data __cpuinitdata = { 0, 0, 0, 0, -1, 1, 0, 0, -1 };
+/* common cpu data for all cpus */
+struct cpuinfo_x86 boot_cpu_data __read_mostly = { 0, 0, 0, 0, -1, 1, 0, 0, -1 };
+EXPORT_SYMBOL(boot_cpu_data);
+
+unsigned long mmu_cr4_features;
+
+/* for MCA, but anyone else can use it if they want */
+unsigned int machine_id;
+#ifdef CONFIG_MCA
+EXPORT_SYMBOL(machine_id);
+#endif
+unsigned int machine_submodel_id;
+unsigned int BIOS_revision;
+unsigned int mca_pentium_flag;
+
+/* Boot loader ID as an integer, for the benefit of proc_dointvec */
+int bootloader_type;
+
+/* user-defined highmem size */
+static unsigned int highmem_pages = -1;
+
+/*
+ * Setup options
+ */
+struct screen_info screen_info;
+EXPORT_SYMBOL(screen_info);
+struct apm_info apm_info;
+EXPORT_SYMBOL(apm_info);
+struct edid_info edid_info;
+EXPORT_SYMBOL_GPL(edid_info);
+struct ist_info ist_info;
+#if defined(CONFIG_X86_SPEEDSTEP_SMI) || \
+       defined(CONFIG_X86_SPEEDSTEP_SMI_MODULE)
+EXPORT_SYMBOL(ist_info);
+#endif
+
+extern void early_cpu_init(void);
+extern int root_mountflags;
+
+unsigned long saved_videomode;
+
+#define RAMDISK_IMAGE_START_MASK       0x07FF
+#define RAMDISK_PROMPT_FLAG            0x8000
+#define RAMDISK_LOAD_FLAG              0x4000  
+
+static char __initdata command_line[COMMAND_LINE_SIZE];
+
+struct boot_params __initdata boot_params;
+
+#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
+struct edd edd;
+#ifdef CONFIG_EDD_MODULE
+EXPORT_SYMBOL(edd);
+#endif
+/**
+ * copy_edd() - Copy the BIOS EDD information
+ *              from boot_params into a safe place.
+ *
+ */
+static inline void copy_edd(void)
+{
+     memcpy(edd.mbr_signature, EDD_MBR_SIGNATURE, sizeof(edd.mbr_signature));
+     memcpy(edd.edd_info, EDD_BUF, sizeof(edd.edd_info));
+     edd.mbr_signature_nr = EDD_MBR_SIG_NR;
+     edd.edd_info_nr = EDD_NR;
+}
+#else
+static inline void copy_edd(void)
+{
+}
+#endif
+
+int __initdata user_defined_memmap = 0;
+
+/*
+ * "mem=nopentium" disables the 4MB page tables.
+ * "mem=XXX[kKmM]" defines a memory region from HIGH_MEM
+ * to <mem>, overriding the bios size.
+ * "memmap=XXX[KkmM]@XXX[KkmM]" defines a memory region from
+ * <start> to <start>+<mem>, overriding the bios size.
+ *
+ * HPA tells me bootloaders need to parse mem=, so no new
+ * option should be mem=  [also see Documentation/i386/boot.txt]
+ */
+static int __init parse_mem(char *arg)
+{
+       if (!arg)
+               return -EINVAL;
+
+       if (strcmp(arg, "nopentium") == 0) {
+               clear_bit(X86_FEATURE_PSE, boot_cpu_data.x86_capability);
+               disable_pse = 1;
+       } else {
+               /* If the user specifies memory size, we
+                * limit the BIOS-provided memory map to
+                * that size. exactmap can be used to specify
+                * the exact map. mem=number can be used to
+                * trim the existing memory map.
+                */
+               unsigned long long mem_size;
+               mem_size = memparse(arg, &arg);
+               limit_regions(mem_size);
+               user_defined_memmap = 1;
+       }
+       return 0;
+}
+early_param("mem", parse_mem);
+
+#ifdef CONFIG_PROC_VMCORE
+/* elfcorehdr= specifies the location of elf core header
+ * stored by the crashed kernel.
+ */
+static int __init parse_elfcorehdr(char *arg)
+{
+       if (!arg)
+               return -EINVAL;
+
+       elfcorehdr_addr = memparse(arg, &arg);
+       return 0;
+}
+early_param("elfcorehdr", parse_elfcorehdr);
+#endif /* CONFIG_PROC_VMCORE */
+
+/*
+ * highmem=size forces highmem to be exactly 'size' bytes.
+ * This works even on boxes that have no highmem otherwise.
+ * This also works to reduce highmem size on bigger boxes.
+ */
+static int __init parse_highmem(char *arg)
+{
+       if (!arg)
+               return -EINVAL;
+
+       highmem_pages = memparse(arg, &arg) >> PAGE_SHIFT;
+       return 0;
+}
+early_param("highmem", parse_highmem);
+
+/*
+ * vmalloc=size forces the vmalloc area to be exactly 'size'
+ * bytes. This can be used to increase (or decrease) the
+ * vmalloc area - the default is 128m.
+ */
+static int __init parse_vmalloc(char *arg)
+{
+       if (!arg)
+               return -EINVAL;
+
+       __VMALLOC_RESERVE = memparse(arg, &arg);
+       return 0;
+}
+early_param("vmalloc", parse_vmalloc);
+
+/*
+ * reservetop=size reserves a hole at the top of the kernel address space which
+ * a hypervisor can load into later.  Needed for dynamically loaded hypervisors,
+ * so relocating the fixmap can be done before paging initialization.
+ */
+static int __init parse_reservetop(char *arg)
+{
+       unsigned long address;
+
+       if (!arg)
+               return -EINVAL;
+
+       address = memparse(arg, &arg);
+       reserve_top_address(address);
+       return 0;
+}
+early_param("reservetop", parse_reservetop);
+
+/*
+ * Determine low and high memory ranges:
+ */
+unsigned long __init find_max_low_pfn(void)
+{
+       unsigned long max_low_pfn;
+
+       max_low_pfn = max_pfn;
+       if (max_low_pfn > MAXMEM_PFN) {
+               if (highmem_pages == -1)
+                       highmem_pages = max_pfn - MAXMEM_PFN;
+               if (highmem_pages + MAXMEM_PFN < max_pfn)
+                       max_pfn = MAXMEM_PFN + highmem_pages;
+               if (highmem_pages + MAXMEM_PFN > max_pfn) {
+                       printk("only %luMB highmem pages available, ignoring highmem size of %uMB.\n", pages_to_mb(max_pfn - MAXMEM_PFN), pages_to_mb(highmem_pages));
+                       highmem_pages = 0;
+               }
+               max_low_pfn = MAXMEM_PFN;
+#ifndef CONFIG_HIGHMEM
+               /* Maximum memory usable is what is directly addressable */
+               printk(KERN_WARNING "Warning only %ldMB will be used.\n",
+                                       MAXMEM>>20);
+               if (max_pfn > MAX_NONPAE_PFN)
+                       printk(KERN_WARNING "Use a HIGHMEM64G enabled kernel.\n");
+               else
+                       printk(KERN_WARNING "Use a HIGHMEM enabled kernel.\n");
+               max_pfn = MAXMEM_PFN;
+#else /* !CONFIG_HIGHMEM */
+#ifndef CONFIG_HIGHMEM64G
+               if (max_pfn > MAX_NONPAE_PFN) {
+                       max_pfn = MAX_NONPAE_PFN;
+                       printk(KERN_WARNING "Warning only 4GB will be used.\n");
+                       printk(KERN_WARNING "Use a HIGHMEM64G enabled kernel.\n");
+               }
+#endif /* !CONFIG_HIGHMEM64G */
+#endif /* !CONFIG_HIGHMEM */
+       } else {
+               if (highmem_pages == -1)
+                       highmem_pages = 0;
+#ifdef CONFIG_HIGHMEM
+               if (highmem_pages >= max_pfn) {
+                       printk(KERN_ERR "highmem size specified (%uMB) is bigger than pages available (%luMB)!.\n", pages_to_mb(highmem_pages), pages_to_mb(max_pfn));
+                       highmem_pages = 0;
+               }
+               if (highmem_pages) {
+                       if (max_low_pfn-highmem_pages < 64*1024*1024/PAGE_SIZE){
+                               printk(KERN_ERR "highmem size %uMB results in smaller than 64MB lowmem, ignoring it.\n", pages_to_mb(highmem_pages));
+                               highmem_pages = 0;
+                       }
+                       max_low_pfn -= highmem_pages;
+               }
+#else
+               if (highmem_pages)
+                       printk(KERN_ERR "ignoring highmem size on non-highmem kernel!\n");
+#endif
+       }
+       return max_low_pfn;
+}
+
+/*
+ * workaround for Dell systems that neglect to reserve EBDA
+ */
+static void __init reserve_ebda_region(void)
+{
+       unsigned int addr;
+       addr = get_bios_ebda();
+       if (addr)
+               reserve_bootmem(addr, PAGE_SIZE);       
+}
+
+#ifndef CONFIG_NEED_MULTIPLE_NODES
+void __init setup_bootmem_allocator(void);
+static unsigned long __init setup_memory(void)
+{
+       /*
+        * partially used pages are not usable - thus
+        * we are rounding upwards:
+        */
+       min_low_pfn = PFN_UP(init_pg_tables_end);
+
+       find_max_pfn();
+
+       max_low_pfn = find_max_low_pfn();
+
+#ifdef CONFIG_HIGHMEM
+       highstart_pfn = highend_pfn = max_pfn;
+       if (max_pfn > max_low_pfn) {
+               highstart_pfn = max_low_pfn;
+       }
+       printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
+               pages_to_mb(highend_pfn - highstart_pfn));
+       num_physpages = highend_pfn;
+       high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1;
+#else
+       num_physpages = max_low_pfn;
+       high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1;
+#endif
+#ifdef CONFIG_FLATMEM
+       max_mapnr = num_physpages;
+#endif
+       printk(KERN_NOTICE "%ldMB LOWMEM available.\n",
+                       pages_to_mb(max_low_pfn));
+
+       setup_bootmem_allocator();
+
+       return max_low_pfn;
+}
+
+void __init zone_sizes_init(void)
+{
+       unsigned long max_zone_pfns[MAX_NR_ZONES];
+       memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
+       max_zone_pfns[ZONE_DMA] =
+               virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
+       max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
+#ifdef CONFIG_HIGHMEM
+       max_zone_pfns[ZONE_HIGHMEM] = highend_pfn;
+       add_active_range(0, 0, highend_pfn);
+#else
+       add_active_range(0, 0, max_low_pfn);
+#endif
+
+       free_area_init_nodes(max_zone_pfns);
+}
+#else
+extern unsigned long __init setup_memory(void);
+extern void zone_sizes_init(void);
+#endif /* !CONFIG_NEED_MULTIPLE_NODES */
+
+void __init setup_bootmem_allocator(void)
+{
+       unsigned long bootmap_size;
+       /*
+        * Initialize the boot-time allocator (with low memory only):
+        */
+       bootmap_size = init_bootmem(min_low_pfn, max_low_pfn);
+
+       register_bootmem_low_pages(max_low_pfn);
+
+       /*
+        * Reserve the bootmem bitmap itself as well. We do this in two
+        * steps (first step was init_bootmem()) because this catches
+        * the (very unlikely) case of us accidentally initializing the
+        * bootmem allocator with an invalid RAM area.
+        */
+       reserve_bootmem(__pa_symbol(_text), (PFN_PHYS(min_low_pfn) +
+                        bootmap_size + PAGE_SIZE-1) - __pa_symbol(_text));
+
+       /*
+        * reserve physical page 0 - it's a special BIOS page on many boxes,
+        * enabling clean reboots, SMP operation, laptop functions.
+        */
+       reserve_bootmem(0, PAGE_SIZE);
+
+       /* reserve EBDA region, it's a 4K region */
+       reserve_ebda_region();
+
+    /* could be an AMD 768MPX chipset. Reserve a page  before VGA to prevent
+       PCI prefetch into it (errata #56). Usually the page is reserved anyways,
+       unless you have no PS/2 mouse plugged in. */
+       if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
+           boot_cpu_data.x86 == 6)
+            reserve_bootmem(0xa0000 - 4096, 4096);
+
+#ifdef CONFIG_SMP
+       /*
+        * But first pinch a few for the stack/trampoline stuff
+        * FIXME: Don't need the extra page at 4K, but need to fix
+        * trampoline before removing it. (see the GDT stuff)
+        */
+       reserve_bootmem(PAGE_SIZE, PAGE_SIZE);
+#endif
+#ifdef CONFIG_ACPI_SLEEP
+       /*
+        * Reserve low memory region for sleep support.
+        */
+       acpi_reserve_bootmem();
+#endif
+#ifdef CONFIG_X86_FIND_SMP_CONFIG
+       /*
+        * Find and reserve possible boot-time SMP configuration:
+        */
+       find_smp_config();
+#endif
+       numa_kva_reserve();
+#ifdef CONFIG_BLK_DEV_INITRD
+       if (LOADER_TYPE && INITRD_START) {
+               if (INITRD_START + INITRD_SIZE <= (max_low_pfn << PAGE_SHIFT)) {
+                       reserve_bootmem(INITRD_START, INITRD_SIZE);
+                       initrd_start = INITRD_START + PAGE_OFFSET;
+                       initrd_end = initrd_start+INITRD_SIZE;
+               }
+               else {
+                       printk(KERN_ERR "initrd extends beyond end of memory "
+                           "(0x%08lx > 0x%08lx)\ndisabling initrd\n",
+                           INITRD_START + INITRD_SIZE,
+                           max_low_pfn << PAGE_SHIFT);
+                       initrd_start = 0;
+               }
+       }
+#endif
+#ifdef CONFIG_KEXEC
+       if (crashk_res.start != crashk_res.end)
+               reserve_bootmem(crashk_res.start,
+                       crashk_res.end - crashk_res.start + 1);
+#endif
+}
+
+/*
+ * The node 0 pgdat is initialized before all of these because
+ * it's needed for bootmem.  node>0 pgdats have their virtual
+ * space allocated before the pagetables are in place to access
+ * them, so they can't be cleared then.
+ *
+ * This should all compile down to nothing when NUMA is off.
+ */
+static void __init remapped_pgdat_init(void)
+{
+       int nid;
+
+       for_each_online_node(nid) {
+               if (nid != 0)
+                       memset(NODE_DATA(nid), 0, sizeof(struct pglist_data));
+       }
+}
+
+#ifdef CONFIG_MCA
+static void set_mca_bus(int x)
+{
+       MCA_bus = x;
+}
+#else
+static void set_mca_bus(int x) { }
+#endif
+
+/* Overridden in paravirt.c if CONFIG_PARAVIRT */
+char * __init __attribute__((weak)) memory_setup(void)
+{
+       return machine_specific_memory_setup();
+}
+
+/*
+ * Determine if we were loaded by an EFI loader.  If so, then we have also been
+ * passed the efi memmap, systab, etc., so we should use these data structures
+ * for initialization.  Note, the efi init code path is determined by the
+ * global efi_enabled. This allows the same kernel image to be used on existing
+ * systems (with a traditional BIOS) as well as on EFI systems.
+ */
+void __init setup_arch(char **cmdline_p)
+{
+       unsigned long max_low_pfn;
+
+       memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data));
+       pre_setup_arch_hook();
+       early_cpu_init();
+
+       /*
+        * FIXME: This isn't an official loader_type right
+        * now but does currently work with elilo.
+        * If we were configured as an EFI kernel, check to make
+        * sure that we were loaded correctly from elilo and that
+        * the system table is valid.  If not, then initialize normally.
+        */
+#ifdef CONFIG_EFI
+       if ((LOADER_TYPE == 0x50) && EFI_SYSTAB)
+               efi_enabled = 1;
+#endif
+
+       ROOT_DEV = old_decode_dev(ORIG_ROOT_DEV);
+       screen_info = SCREEN_INFO;
+       edid_info = EDID_INFO;
+       apm_info.bios = APM_BIOS_INFO;
+       ist_info = IST_INFO;
+       saved_videomode = VIDEO_MODE;
+       if( SYS_DESC_TABLE.length != 0 ) {
+               set_mca_bus(SYS_DESC_TABLE.table[3] & 0x2);
+               machine_id = SYS_DESC_TABLE.table[0];
+               machine_submodel_id = SYS_DESC_TABLE.table[1];
+               BIOS_revision = SYS_DESC_TABLE.table[2];
+       }
+       bootloader_type = LOADER_TYPE;
+
+#ifdef CONFIG_BLK_DEV_RAM
+       rd_image_start = RAMDISK_FLAGS & RAMDISK_IMAGE_START_MASK;
+       rd_prompt = ((RAMDISK_FLAGS & RAMDISK_PROMPT_FLAG) != 0);
+       rd_doload = ((RAMDISK_FLAGS & RAMDISK_LOAD_FLAG) != 0);
+#endif
+       ARCH_SETUP
+       if (efi_enabled)
+               efi_init();
+       else {
+               printk(KERN_INFO "BIOS-provided physical RAM map:\n");
+               print_memory_map(memory_setup());
+       }
+
+       copy_edd();
+
+       if (!MOUNT_ROOT_RDONLY)
+               root_mountflags &= ~MS_RDONLY;
+       init_mm.start_code = (unsigned long) _text;
+       init_mm.end_code = (unsigned long) _etext;
+       init_mm.end_data = (unsigned long) _edata;
+       init_mm.brk = init_pg_tables_end + PAGE_OFFSET;
+
+       code_resource.start = virt_to_phys(_text);
+       code_resource.end = virt_to_phys(_etext)-1;
+       data_resource.start = virt_to_phys(_etext);
+       data_resource.end = virt_to_phys(_edata)-1;
+
+       parse_early_param();
+
+       if (user_defined_memmap) {
+               printk(KERN_INFO "user-defined physical RAM map:\n");
+               print_memory_map("user");
+       }
+
+       strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
+       *cmdline_p = command_line;
+
+       max_low_pfn = setup_memory();
+
+#ifdef CONFIG_VMI
+       /*
+        * Must be after max_low_pfn is determined, and before kernel
+        * pagetables are setup.
+        */
+       vmi_init();
+#endif
+
+       /*
+        * NOTE: before this point _nobody_ is allowed to allocate
+        * any memory using the bootmem allocator.  Although the
+        * alloctor is now initialised only the first 8Mb of the kernel
+        * virtual address space has been mapped.  All allocations before
+        * paging_init() has completed must use the alloc_bootmem_low_pages()
+        * variant (which allocates DMA'able memory) and care must be taken
+        * not to exceed the 8Mb limit.
+        */
+
+#ifdef CONFIG_SMP
+       smp_alloc_memory(); /* AP processor realmode stacks in low memory*/
+#endif
+       paging_init();
+       remapped_pgdat_init();
+       sparse_init();
+       zone_sizes_init();
+
+       /*
+        * NOTE: at this point the bootmem allocator is fully available.
+        */
+
+       paravirt_post_allocator_init();
+
+       dmi_scan_machine();
+
+#ifdef CONFIG_X86_GENERICARCH
+       generic_apic_probe();
+#endif 
+       if (efi_enabled)
+               efi_map_memmap();
+
+#ifdef CONFIG_ACPI
+       /*
+        * Parse the ACPI tables for possible boot-time SMP configuration.
+        */
+       acpi_boot_table_init();
+#endif
+
+#ifdef CONFIG_PCI
+#ifdef CONFIG_X86_IO_APIC
+       check_acpi_pci();       /* Checks more than just ACPI actually */
+#endif
+#endif
+
+#ifdef CONFIG_ACPI
+       acpi_boot_init();
+
+#if defined(CONFIG_SMP) && defined(CONFIG_X86_PC)
+       if (def_to_bigsmp)
+               printk(KERN_WARNING "More than 8 CPUs detected and "
+                       "CONFIG_X86_PC cannot handle it.\nUse "
+                       "CONFIG_X86_GENERICARCH or CONFIG_X86_BIGSMP.\n");
+#endif
+#endif
+#ifdef CONFIG_X86_LOCAL_APIC
+       if (smp_found_config)
+               get_smp_config();
+#endif
+
+       e820_register_memory();
+       e820_mark_nosave_regions();
+
+#ifdef CONFIG_VT
+#if defined(CONFIG_VGA_CONSOLE)
+       if (!efi_enabled || (efi_mem_type(0xa0000) != EFI_CONVENTIONAL_MEMORY))
+               conswitchp = &vga_con;
+#elif defined(CONFIG_DUMMY_CONSOLE)
+       conswitchp = &dummy_con;
+#endif
+#endif
+}
diff --git a/arch/x86/kernel/sigframe_32.h b/arch/x86/kernel/sigframe_32.h
new file mode 100644 (file)
index 0000000..0b22217
--- /dev/null
@@ -0,0 +1,21 @@
+struct sigframe
+{
+       char __user *pretcode;
+       int sig;
+       struct sigcontext sc;
+       struct _fpstate fpstate;
+       unsigned long extramask[_NSIG_WORDS-1];
+       char retcode[8];
+};
+
+struct rt_sigframe
+{
+       char __user *pretcode;
+       int sig;
+       struct siginfo __user *pinfo;
+       void __user *puc;
+       struct siginfo info;
+       struct ucontext uc;
+       struct _fpstate fpstate;
+       char retcode[8];
+};
diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal_32.c
new file mode 100644 (file)
index 0000000..c03570f
--- /dev/null
@@ -0,0 +1,667 @@
+/*
+ *  linux/arch/i386/kernel/signal.c
+ *
+ *  Copyright (C) 1991, 1992  Linus Torvalds
+ *
+ *  1997-11-28  Modified for POSIX.1b signals by Richard Henderson
+ *  2000-06-20  Pentium III FXSR, SSE support by Gareth Hughes
+ */
+
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/smp.h>
+#include <linux/kernel.h>
+#include <linux/signal.h>
+#include <linux/errno.h>
+#include <linux/wait.h>
+#include <linux/unistd.h>
+#include <linux/stddef.h>
+#include <linux/personality.h>
+#include <linux/suspend.h>
+#include <linux/ptrace.h>
+#include <linux/elf.h>
+#include <linux/binfmts.h>
+#include <asm/processor.h>
+#include <asm/ucontext.h>
+#include <asm/uaccess.h>
+#include <asm/i387.h>
+#include "sigframe_32.h"
+
+#define DEBUG_SIG 0
+
+#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP)))
+
+/*
+ * Atomically swap in the new signal mask, and wait for a signal.
+ */
+asmlinkage int
+sys_sigsuspend(int history0, int history1, old_sigset_t mask)
+{
+       mask &= _BLOCKABLE;
+       spin_lock_irq(&current->sighand->siglock);
+       current->saved_sigmask = current->blocked;
+       siginitset(&current->blocked, mask);
+       recalc_sigpending();
+       spin_unlock_irq(&current->sighand->siglock);
+
+       current->state = TASK_INTERRUPTIBLE;
+       schedule();
+       set_thread_flag(TIF_RESTORE_SIGMASK);
+       return -ERESTARTNOHAND;
+}
+
+asmlinkage int 
+sys_sigaction(int sig, const struct old_sigaction __user *act,
+             struct old_sigaction __user *oact)
+{
+       struct k_sigaction new_ka, old_ka;
+       int ret;
+
+       if (act) {
+               old_sigset_t mask;
+               if (!access_ok(VERIFY_READ, act, sizeof(*act)) ||
+                   __get_user(new_ka.sa.sa_handler, &act->sa_handler) ||
+                   __get_user(new_ka.sa.sa_restorer, &act->sa_restorer))
+                       return -EFAULT;
+               __get_user(new_ka.sa.sa_flags, &act->sa_flags);
+               __get_user(mask, &act->sa_mask);
+               siginitset(&new_ka.sa.sa_mask, mask);
+       }
+
+       ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL);
+
+       if (!ret && oact) {
+               if (!access_ok(VERIFY_WRITE, oact, sizeof(*oact)) ||
+                   __put_user(old_ka.sa.sa_handler, &oact->sa_handler) ||
+                   __put_user(old_ka.sa.sa_restorer, &oact->sa_restorer))
+                       return -EFAULT;
+               __put_user(old_ka.sa.sa_flags, &oact->sa_flags);
+               __put_user(old_ka.sa.sa_mask.sig[0], &oact->sa_mask);
+       }
+
+       return ret;
+}
+
+asmlinkage int
+sys_sigaltstack(unsigned long ebx)
+{
+       /* This is needed to make gcc realize it doesn't own the "struct pt_regs" */
+       struct pt_regs *regs = (struct pt_regs *)&ebx;
+       const stack_t __user *uss = (const stack_t __user *)ebx;
+       stack_t __user *uoss = (stack_t __user *)regs->ecx;
+
+       return do_sigaltstack(uss, uoss, regs->esp);
+}
+
+
+/*
+ * Do a signal return; undo the signal stack.
+ */
+
+static int
+restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, int *peax)
+{
+       unsigned int err = 0;
+
+       /* Always make any pending restarted system calls return -EINTR */
+       current_thread_info()->restart_block.fn = do_no_restart_syscall;
+
+#define COPY(x)                err |= __get_user(regs->x, &sc->x)
+
+#define COPY_SEG(seg)                                                  \
+       { unsigned short tmp;                                           \
+         err |= __get_user(tmp, &sc->seg);                             \
+         regs->x##seg = tmp; }
+
+#define COPY_SEG_STRICT(seg)                                           \
+       { unsigned short tmp;                                           \
+         err |= __get_user(tmp, &sc->seg);                             \
+         regs->x##seg = tmp|3; }
+
+#define GET_SEG(seg)                                                   \
+       { unsigned short tmp;                                           \
+         err |= __get_user(tmp, &sc->seg);                             \
+         loadsegment(seg,tmp); }
+
+#define        FIX_EFLAGS      (X86_EFLAGS_AC | X86_EFLAGS_RF |                 \
+                        X86_EFLAGS_OF | X86_EFLAGS_DF |                 \
+                        X86_EFLAGS_TF | X86_EFLAGS_SF | X86_EFLAGS_ZF | \
+                        X86_EFLAGS_AF | X86_EFLAGS_PF | X86_EFLAGS_CF)
+
+       GET_SEG(gs);
+       COPY_SEG(fs);
+       COPY_SEG(es);
+       COPY_SEG(ds);
+       COPY(edi);
+       COPY(esi);
+       COPY(ebp);
+       COPY(esp);
+       COPY(ebx);
+       COPY(edx);
+       COPY(ecx);
+       COPY(eip);
+       COPY_SEG_STRICT(cs);
+       COPY_SEG_STRICT(ss);
+       
+       {
+               unsigned int tmpflags;
+               err |= __get_user(tmpflags, &sc->eflags);
+               regs->eflags = (regs->eflags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS);
+               regs->orig_eax = -1;            /* disable syscall checks */
+       }
+
+       {
+               struct _fpstate __user * buf;
+               err |= __get_user(buf, &sc->fpstate);
+               if (buf) {
+                       if (!access_ok(VERIFY_READ, buf, sizeof(*buf)))
+                               goto badframe;
+                       err |= restore_i387(buf);
+               } else {
+                       struct task_struct *me = current;
+                       if (used_math()) {
+                               clear_fpu(me);
+                               clear_used_math();
+                       }
+               }
+       }
+
+       err |= __get_user(*peax, &sc->eax);
+       return err;
+
+badframe:
+       return 1;
+}
+
+asmlinkage int sys_sigreturn(unsigned long __unused)
+{
+       struct pt_regs *regs = (struct pt_regs *) &__unused;
+       struct sigframe __user *frame = (struct sigframe __user *)(regs->esp - 8);
+       sigset_t set;
+       int eax;
+
+       if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
+               goto badframe;
+       if (__get_user(set.sig[0], &frame->sc.oldmask)
+           || (_NSIG_WORDS > 1
+               && __copy_from_user(&set.sig[1], &frame->extramask,
+                                   sizeof(frame->extramask))))
+               goto badframe;
+
+       sigdelsetmask(&set, ~_BLOCKABLE);
+       spin_lock_irq(&current->sighand->siglock);
+       current->blocked = set;
+       recalc_sigpending();
+       spin_unlock_irq(&current->sighand->siglock);
+       
+       if (restore_sigcontext(regs, &frame->sc, &eax))
+               goto badframe;
+       return eax;
+
+badframe:
+       if (show_unhandled_signals && printk_ratelimit())
+               printk("%s%s[%d] bad frame in sigreturn frame:%p eip:%lx"
+                      " esp:%lx oeax:%lx\n",
+                   current->pid > 1 ? KERN_INFO : KERN_EMERG,
+                   current->comm, current->pid, frame, regs->eip,
+                   regs->esp, regs->orig_eax);
+
+       force_sig(SIGSEGV, current);
+       return 0;
+}      
+
+asmlinkage int sys_rt_sigreturn(unsigned long __unused)
+{
+       struct pt_regs *regs = (struct pt_regs *) &__unused;
+       struct rt_sigframe __user *frame = (struct rt_sigframe __user *)(regs->esp - 4);
+       sigset_t set;
+       int eax;
+
+       if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
+               goto badframe;
+       if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set)))
+               goto badframe;
+
+       sigdelsetmask(&set, ~_BLOCKABLE);
+       spin_lock_irq(&current->sighand->siglock);
+       current->blocked = set;
+       recalc_sigpending();
+       spin_unlock_irq(&current->sighand->siglock);
+       
+       if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &eax))
+               goto badframe;
+
+       if (do_sigaltstack(&frame->uc.uc_stack, NULL, regs->esp) == -EFAULT)
+               goto badframe;
+
+       return eax;
+
+badframe:
+       force_sig(SIGSEGV, current);
+       return 0;
+}      
+
+/*
+ * Set up a signal frame.
+ */
+
+static int
+setup_sigcontext(struct sigcontext __user *sc, struct _fpstate __user *fpstate,
+                struct pt_regs *regs, unsigned long mask)
+{
+       int tmp, err = 0;
+
+       err |= __put_user(regs->xfs, (unsigned int __user *)&sc->fs);
+       savesegment(gs, tmp);
+       err |= __put_user(tmp, (unsigned int __user *)&sc->gs);
+
+       err |= __put_user(regs->xes, (unsigned int __user *)&sc->es);
+       err |= __put_user(regs->xds, (unsigned int __user *)&sc->ds);
+       err |= __put_user(regs->edi, &sc->edi);
+       err |= __put_user(regs->esi, &sc->esi);
+       err |= __put_user(regs->ebp, &sc->ebp);
+       err |= __put_user(regs->esp, &sc->esp);
+       err |= __put_user(regs->ebx, &sc->ebx);
+       err |= __put_user(regs->edx, &sc->edx);
+       err |= __put_user(regs->ecx, &sc->ecx);
+       err |= __put_user(regs->eax, &sc->eax);
+       err |= __put_user(current->thread.trap_no, &sc->trapno);
+       err |= __put_user(current->thread.error_code, &sc->err);
+       err |= __put_user(regs->eip, &sc->eip);
+       err |= __put_user(regs->xcs, (unsigned int __user *)&sc->cs);
+       err |= __put_user(regs->eflags, &sc->eflags);
+       err |= __put_user(regs->esp, &sc->esp_at_signal);
+       err |= __put_user(regs->xss, (unsigned int __user *)&sc->ss);
+
+       tmp = save_i387(fpstate);
+       if (tmp < 0)
+         err = 1;
+       else
+         err |= __put_user(tmp ? fpstate : NULL, &sc->fpstate);
+
+       /* non-iBCS2 extensions.. */
+       err |= __put_user(mask, &sc->oldmask);
+       err |= __put_user(current->thread.cr2, &sc->cr2);
+
+       return err;
+}
+
+/*
+ * Determine which stack to use..
+ */
+static inline void __user *
+get_sigframe(struct k_sigaction *ka, struct pt_regs * regs, size_t frame_size)
+{
+       unsigned long esp;
+
+       /* Default to using normal stack */
+       esp = regs->esp;
+
+       /* This is the X/Open sanctioned signal stack switching.  */
+       if (ka->sa.sa_flags & SA_ONSTACK) {
+               if (sas_ss_flags(esp) == 0)
+                       esp = current->sas_ss_sp + current->sas_ss_size;
+       }
+
+       /* This is the legacy signal stack switching. */
+       else if ((regs->xss & 0xffff) != __USER_DS &&
+                !(ka->sa.sa_flags & SA_RESTORER) &&
+                ka->sa.sa_restorer) {
+               esp = (unsigned long) ka->sa.sa_restorer;
+       }
+
+       esp -= frame_size;
+       /* Align the stack pointer according to the i386 ABI,
+        * i.e. so that on function entry ((sp + 4) & 15) == 0. */
+       esp = ((esp + 4) & -16ul) - 4;
+       return (void __user *) esp;
+}
+
+/* These symbols are defined with the addresses in the vsyscall page.
+   See vsyscall-sigreturn.S.  */
+extern void __user __kernel_sigreturn;
+extern void __user __kernel_rt_sigreturn;
+
+static int setup_frame(int sig, struct k_sigaction *ka,
+                      sigset_t *set, struct pt_regs * regs)
+{
+       void __user *restorer;
+       struct sigframe __user *frame;
+       int err = 0;
+       int usig;
+
+       frame = get_sigframe(ka, regs, sizeof(*frame));
+
+       if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
+               goto give_sigsegv;
+
+       usig = current_thread_info()->exec_domain
+               && current_thread_info()->exec_domain->signal_invmap
+               && sig < 32
+               ? current_thread_info()->exec_domain->signal_invmap[sig]
+               : sig;
+
+       err = __put_user(usig, &frame->sig);
+       if (err)
+               goto give_sigsegv;
+
+       err = setup_sigcontext(&frame->sc, &frame->fpstate, regs, set->sig[0]);
+       if (err)
+               goto give_sigsegv;
+
+       if (_NSIG_WORDS > 1) {
+               err = __copy_to_user(&frame->extramask, &set->sig[1],
+                                     sizeof(frame->extramask));
+               if (err)
+                       goto give_sigsegv;
+       }
+
+       if (current->binfmt->hasvdso)
+               restorer = (void *)VDSO_SYM(&__kernel_sigreturn);
+       else
+               restorer = (void *)&frame->retcode;
+       if (ka->sa.sa_flags & SA_RESTORER)
+               restorer = ka->sa.sa_restorer;
+
+       /* Set up to return from userspace.  */
+       err |= __put_user(restorer, &frame->pretcode);
+        
+       /*
+        * This is popl %eax ; movl $,%eax ; int $0x80
+        *
+        * WE DO NOT USE IT ANY MORE! It's only left here for historical
+        * reasons and because gdb uses it as a signature to notice
+        * signal handler stack frames.
+        */
+       err |= __put_user(0xb858, (short __user *)(frame->retcode+0));
+       err |= __put_user(__NR_sigreturn, (int __user *)(frame->retcode+2));
+       err |= __put_user(0x80cd, (short __user *)(frame->retcode+6));
+
+       if (err)
+               goto give_sigsegv;
+
+       /* Set up registers for signal handler */
+       regs->esp = (unsigned long) frame;
+       regs->eip = (unsigned long) ka->sa.sa_handler;
+       regs->eax = (unsigned long) sig;
+       regs->edx = (unsigned long) 0;
+       regs->ecx = (unsigned long) 0;
+
+       set_fs(USER_DS);
+       regs->xds = __USER_DS;
+       regs->xes = __USER_DS;
+       regs->xss = __USER_DS;
+       regs->xcs = __USER_CS;
+
+       /*
+        * Clear TF when entering the signal handler, but
+        * notify any tracer that was single-stepping it.
+        * The tracer may want to single-step inside the
+        * handler too.
+        */
+       regs->eflags &= ~TF_MASK;
+       if (test_thread_flag(TIF_SINGLESTEP))
+               ptrace_notify(SIGTRAP);
+
+#if DEBUG_SIG
+       printk("SIG deliver (%s:%d): sp=%p pc=%p ra=%p\n",
+               current->comm, current->pid, frame, regs->eip, frame->pretcode);
+#endif
+
+       return 0;
+
+give_sigsegv:
+       force_sigsegv(sig, current);
+       return -EFAULT;
+}
+
+static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
+                          sigset_t *set, struct pt_regs * regs)
+{
+       void __user *restorer;
+       struct rt_sigframe __user *frame;
+       int err = 0;
+       int usig;
+
+       frame = get_sigframe(ka, regs, sizeof(*frame));
+
+       if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
+               goto give_sigsegv;
+
+       usig = current_thread_info()->exec_domain
+               && current_thread_info()->exec_domain->signal_invmap
+               && sig < 32
+               ? current_thread_info()->exec_domain->signal_invmap[sig]
+               : sig;
+
+       err |= __put_user(usig, &frame->sig);
+       err |= __put_user(&frame->info, &frame->pinfo);
+       err |= __put_user(&frame->uc, &frame->puc);
+       err |= copy_siginfo_to_user(&frame->info, info);
+       if (err)
+               goto give_sigsegv;
+
+       /* Create the ucontext.  */
+       err |= __put_user(0, &frame->uc.uc_flags);
+       err |= __put_user(0, &frame->uc.uc_link);
+       err |= __put_user(current->sas_ss_sp, &frame->uc.uc_stack.ss_sp);
+       err |= __put_user(sas_ss_flags(regs->esp),
+                         &frame->uc.uc_stack.ss_flags);
+       err |= __put_user(current->sas_ss_size, &frame->uc.uc_stack.ss_size);
+       err |= setup_sigcontext(&frame->uc.uc_mcontext, &frame->fpstate,
+                               regs, set->sig[0]);
+       err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));
+       if (err)
+               goto give_sigsegv;
+
+       /* Set up to return from userspace.  */
+       restorer = (void *)VDSO_SYM(&__kernel_rt_sigreturn);
+       if (ka->sa.sa_flags & SA_RESTORER)
+               restorer = ka->sa.sa_restorer;
+       err |= __put_user(restorer, &frame->pretcode);
+        
+       /*
+        * This is movl $,%eax ; int $0x80
+        *
+        * WE DO NOT USE IT ANY MORE! It's only left here for historical
+        * reasons and because gdb uses it as a signature to notice
+        * signal handler stack frames.
+        */
+       err |= __put_user(0xb8, (char __user *)(frame->retcode+0));
+       err |= __put_user(__NR_rt_sigreturn, (int __user *)(frame->retcode+1));
+       err |= __put_user(0x80cd, (short __user *)(frame->retcode+5));
+
+       if (err)
+               goto give_sigsegv;
+
+       /* Set up registers for signal handler */
+       regs->esp = (unsigned long) frame;
+       regs->eip = (unsigned long) ka->sa.sa_handler;
+       regs->eax = (unsigned long) usig;
+       regs->edx = (unsigned long) &frame->info;
+       regs->ecx = (unsigned long) &frame->uc;
+
+       set_fs(USER_DS);
+       regs->xds = __USER_DS;
+       regs->xes = __USER_DS;
+       regs->xss = __USER_DS;
+       regs->xcs = __USER_CS;
+
+       /*
+        * Clear TF when entering the signal handler, but
+        * notify any tracer that was single-stepping it.
+        * The tracer may want to single-step inside the
+        * handler too.
+        */
+       regs->eflags &= ~TF_MASK;
+       if (test_thread_flag(TIF_SINGLESTEP))
+               ptrace_notify(SIGTRAP);
+
+#if DEBUG_SIG
+       printk("SIG deliver (%s:%d): sp=%p pc=%p ra=%p\n",
+               current->comm, current->pid, frame, regs->eip, frame->pretcode);
+#endif
+
+       return 0;
+
+give_sigsegv:
+       force_sigsegv(sig, current);
+       return -EFAULT;
+}
+
+/*
+ * OK, we're invoking a handler
+ */    
+
+static int
+handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
+             sigset_t *oldset, struct pt_regs * regs)
+{
+       int ret;
+
+       /* Are we from a system call? */
+       if (regs->orig_eax >= 0) {
+               /* If so, check system call restarting.. */
+               switch (regs->eax) {
+                       case -ERESTART_RESTARTBLOCK:
+                       case -ERESTARTNOHAND:
+                               regs->eax = -EINTR;
+                               break;
+
+                       case -ERESTARTSYS:
+                               if (!(ka->sa.sa_flags & SA_RESTART)) {
+                                       regs->eax = -EINTR;
+                                       break;
+                               }
+                       /* fallthrough */
+                       case -ERESTARTNOINTR:
+                               regs->eax = regs->orig_eax;
+                               regs->eip -= 2;
+               }
+       }
+
+       /*
+        * If TF is set due to a debugger (PT_DTRACE), clear the TF flag so
+        * that register information in the sigcontext is correct.
+        */
+       if (unlikely(regs->eflags & TF_MASK)
+           && likely(current->ptrace & PT_DTRACE)) {
+               current->ptrace &= ~PT_DTRACE;
+               regs->eflags &= ~TF_MASK;
+       }
+
+       /* Set up the stack frame */
+       if (ka->sa.sa_flags & SA_SIGINFO)
+               ret = setup_rt_frame(sig, ka, info, oldset, regs);
+       else
+               ret = setup_frame(sig, ka, oldset, regs);
+
+       if (ret == 0) {
+               spin_lock_irq(&current->sighand->siglock);
+               sigorsets(&current->blocked,&current->blocked,&ka->sa.sa_mask);
+               if (!(ka->sa.sa_flags & SA_NODEFER))
+                       sigaddset(&current->blocked,sig);
+               recalc_sigpending();
+               spin_unlock_irq(&current->sighand->siglock);
+       }
+
+       return ret;
+}
+
+/*
+ * Note that 'init' is a special process: it doesn't get signals it doesn't
+ * want to handle. Thus you cannot kill init even with a SIGKILL even by
+ * mistake.
+ */
+static void fastcall do_signal(struct pt_regs *regs)
+{
+       siginfo_t info;
+       int signr;
+       struct k_sigaction ka;
+       sigset_t *oldset;
+
+       /*
+        * We want the common case to go fast, which
+        * is why we may in certain cases get here from
+        * kernel mode. Just return without doing anything
+        * if so.  vm86 regs switched out by assembly code
+        * before reaching here, so testing against kernel
+        * CS suffices.
+        */
+       if (!user_mode(regs))
+               return;
+
+       if (test_thread_flag(TIF_RESTORE_SIGMASK))
+               oldset = &current->saved_sigmask;
+       else
+               oldset = &current->blocked;
+
+       signr = get_signal_to_deliver(&info, &ka, regs, NULL);
+       if (signr > 0) {
+               /* Reenable any watchpoints before delivering the
+                * signal to user space. The processor register will
+                * have been cleared if the watchpoint triggered
+                * inside the kernel.
+                */
+               if (unlikely(current->thread.debugreg[7]))
+                       set_debugreg(current->thread.debugreg[7], 7);
+
+               /* Whee!  Actually deliver the signal.  */
+               if (handle_signal(signr, &info, &ka, oldset, regs) == 0) {
+                       /* a signal was successfully delivered; the saved
+                        * sigmask will have been stored in the signal frame,
+                        * and will be restored by sigreturn, so we can simply
+                        * clear the TIF_RESTORE_SIGMASK flag */
+                       if (test_thread_flag(TIF_RESTORE_SIGMASK))
+                               clear_thread_flag(TIF_RESTORE_SIGMASK);
+               }
+
+               return;
+       }
+
+       /* Did we come from a system call? */
+       if (regs->orig_eax >= 0) {
+               /* Restart the system call - no handlers present */
+               switch (regs->eax) {
+               case -ERESTARTNOHAND:
+               case -ERESTARTSYS:
+               case -ERESTARTNOINTR:
+                       regs->eax = regs->orig_eax;
+                       regs->eip -= 2;
+                       break;
+
+               case -ERESTART_RESTARTBLOCK:
+                       regs->eax = __NR_restart_syscall;
+                       regs->eip -= 2;
+                       break;
+               }
+       }
+
+       /* if there's no signal to deliver, we just put the saved sigmask
+        * back */
+       if (test_thread_flag(TIF_RESTORE_SIGMASK)) {
+               clear_thread_flag(TIF_RESTORE_SIGMASK);
+               sigprocmask(SIG_SETMASK, &current->saved_sigmask, NULL);
+       }
+}
+
+/*
+ * notification of userspace execution resumption
+ * - triggered by the TIF_WORK_MASK flags
+ */
+__attribute__((regparm(3)))
+void do_notify_resume(struct pt_regs *regs, void *_unused,
+                     __u32 thread_info_flags)
+{
+       /* Pending single-step? */
+       if (thread_info_flags & _TIF_SINGLESTEP) {
+               regs->eflags |= TF_MASK;
+               clear_thread_flag(TIF_SINGLESTEP);
+       }
+
+       /* deal with pending signal delivery */
+       if (thread_info_flags & (_TIF_SIGPENDING | _TIF_RESTORE_SIGMASK))
+               do_signal(regs);
+       
+       clear_thread_flag(TIF_IRET);
+}
diff --git a/arch/x86/kernel/smp_32.c b/arch/x86/kernel/smp_32.c
new file mode 100644 (file)
index 0000000..2d35d85
--- /dev/null
@@ -0,0 +1,707 @@
+/*
+ *     Intel SMP support routines.
+ *
+ *     (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
+ *     (c) 1998-99, 2000 Ingo Molnar <mingo@redhat.com>
+ *
+ *     This code is released under the GNU General Public License version 2 or
+ *     later.
+ */
+
+#include <linux/init.h>
+
+#include <linux/mm.h>
+#include <linux/delay.h>
+#include <linux/spinlock.h>
+#include <linux/kernel_stat.h>
+#include <linux/mc146818rtc.h>
+#include <linux/cache.h>
+#include <linux/interrupt.h>
+#include <linux/cpu.h>
+#include <linux/module.h>
+
+#include <asm/mtrr.h>
+#include <asm/tlbflush.h>
+#include <asm/mmu_context.h>
+#include <mach_apic.h>
+
+/*
+ *     Some notes on x86 processor bugs affecting SMP operation:
+ *
+ *     Pentium, Pentium Pro, II, III (and all CPUs) have bugs.
+ *     The Linux implications for SMP are handled as follows:
+ *
+ *     Pentium III / [Xeon]
+ *             None of the E1AP-E3AP errata are visible to the user.
+ *
+ *     E1AP.   see PII A1AP
+ *     E2AP.   see PII A2AP
+ *     E3AP.   see PII A3AP
+ *
+ *     Pentium II / [Xeon]
+ *             None of the A1AP-A3AP errata are visible to the user.
+ *
+ *     A1AP.   see PPro 1AP
+ *     A2AP.   see PPro 2AP
+ *     A3AP.   see PPro 7AP
+ *
+ *     Pentium Pro
+ *             None of 1AP-9AP errata are visible to the normal user,
+ *     except occasional delivery of 'spurious interrupt' as trap #15.
+ *     This is very rare and a non-problem.
+ *
+ *     1AP.    Linux maps APIC as non-cacheable
+ *     2AP.    worked around in hardware
+ *     3AP.    fixed in C0 and above steppings microcode update.
+ *             Linux does not use excessive STARTUP_IPIs.
+ *     4AP.    worked around in hardware
+ *     5AP.    symmetric IO mode (normal Linux operation) not affected.
+ *             'noapic' mode has vector 0xf filled out properly.
+ *     6AP.    'noapic' mode might be affected - fixed in later steppings
+ *     7AP.    We do not assume writes to the LVT deassering IRQs
+ *     8AP.    We do not enable low power mode (deep sleep) during MP bootup
+ *     9AP.    We do not use mixed mode
+ *
+ *     Pentium
+ *             There is a marginal case where REP MOVS on 100MHz SMP
+ *     machines with B stepping processors can fail. XXX should provide
+ *     an L1cache=Writethrough or L1cache=off option.
+ *
+ *             B stepping CPUs may hang. There are hardware work arounds
+ *     for this. We warn about it in case your board doesn't have the work
+ *     arounds. Basically thats so I can tell anyone with a B stepping
+ *     CPU and SMP problems "tough".
+ *
+ *     Specific items [From Pentium Processor Specification Update]
+ *
+ *     1AP.    Linux doesn't use remote read
+ *     2AP.    Linux doesn't trust APIC errors
+ *     3AP.    We work around this
+ *     4AP.    Linux never generated 3 interrupts of the same priority
+ *             to cause a lost local interrupt.
+ *     5AP.    Remote read is never used
+ *     6AP.    not affected - worked around in hardware
+ *     7AP.    not affected - worked around in hardware
+ *     8AP.    worked around in hardware - we get explicit CS errors if not
+ *     9AP.    only 'noapic' mode affected. Might generate spurious
+ *             interrupts, we log only the first one and count the
+ *             rest silently.
+ *     10AP.   not affected - worked around in hardware
+ *     11AP.   Linux reads the APIC between writes to avoid this, as per
+ *             the documentation. Make sure you preserve this as it affects
+ *             the C stepping chips too.
+ *     12AP.   not affected - worked around in hardware
+ *     13AP.   not affected - worked around in hardware
+ *     14AP.   we always deassert INIT during bootup
+ *     15AP.   not affected - worked around in hardware
+ *     16AP.   not affected - worked around in hardware
+ *     17AP.   not affected - worked around in hardware
+ *     18AP.   not affected - worked around in hardware
+ *     19AP.   not affected - worked around in BIOS
+ *
+ *     If this sounds worrying believe me these bugs are either ___RARE___,
+ *     or are signal timing bugs worked around in hardware and there's
+ *     about nothing of note with C stepping upwards.
+ */
+
+DEFINE_PER_CPU(struct tlb_state, cpu_tlbstate) ____cacheline_aligned = { &init_mm, 0, };
+
+/*
+ * the following functions deal with sending IPIs between CPUs.
+ *
+ * We use 'broadcast', CPU->CPU IPIs and self-IPIs too.
+ */
+
+static inline int __prepare_ICR (unsigned int shortcut, int vector)
+{
+       unsigned int icr = shortcut | APIC_DEST_LOGICAL;
+
+       switch (vector) {
+       default:
+               icr |= APIC_DM_FIXED | vector;
+               break;
+       case NMI_VECTOR:
+               icr |= APIC_DM_NMI;
+               break;
+       }
+       return icr;
+}
+
+static inline int __prepare_ICR2 (unsigned int mask)
+{
+       return SET_APIC_DEST_FIELD(mask);
+}
+
+void __send_IPI_shortcut(unsigned int shortcut, int vector)
+{
+       /*
+        * Subtle. In the case of the 'never do double writes' workaround
+        * we have to lock out interrupts to be safe.  As we don't care
+        * of the value read we use an atomic rmw access to avoid costly
+        * cli/sti.  Otherwise we use an even cheaper single atomic write
+        * to the APIC.
+        */
+       unsigned int cfg;
+
+       /*
+        * Wait for idle.
+        */
+       apic_wait_icr_idle();
+
+       /*
+        * No need to touch the target chip field
+        */
+       cfg = __prepare_ICR(shortcut, vector);
+
+       /*
+        * Send the IPI. The write to APIC_ICR fires this off.
+        */
+       apic_write_around(APIC_ICR, cfg);
+}
+
+void fastcall send_IPI_self(int vector)
+{
+       __send_IPI_shortcut(APIC_DEST_SELF, vector);
+}
+
+/*
+ * This is used to send an IPI with no shorthand notation (the destination is
+ * specified in bits 56 to 63 of the ICR).
+ */
+static inline void __send_IPI_dest_field(unsigned long mask, int vector)
+{
+       unsigned long cfg;
+
+       /*
+        * Wait for idle.
+        */
+       if (unlikely(vector == NMI_VECTOR))
+               safe_apic_wait_icr_idle();
+       else
+               apic_wait_icr_idle();
+               
+       /*
+        * prepare target chip field
+        */
+       cfg = __prepare_ICR2(mask);
+       apic_write_around(APIC_ICR2, cfg);
+               
+       /*
+        * program the ICR 
+        */
+       cfg = __prepare_ICR(0, vector);
+                       
+       /*
+        * Send the IPI. The write to APIC_ICR fires this off.
+        */
+       apic_write_around(APIC_ICR, cfg);
+}
+
+/*
+ * This is only used on smaller machines.
+ */
+void send_IPI_mask_bitmask(cpumask_t cpumask, int vector)
+{
+       unsigned long mask = cpus_addr(cpumask)[0];
+       unsigned long flags;
+
+       local_irq_save(flags);
+       WARN_ON(mask & ~cpus_addr(cpu_online_map)[0]);
+       __send_IPI_dest_field(mask, vector);
+       local_irq_restore(flags);
+}
+
+void send_IPI_mask_sequence(cpumask_t mask, int vector)
+{
+       unsigned long flags;
+       unsigned int query_cpu;
+
+       /*
+        * Hack. The clustered APIC addressing mode doesn't allow us to send 
+        * to an arbitrary mask, so I do a unicasts to each CPU instead. This 
+        * should be modified to do 1 message per cluster ID - mbligh
+        */ 
+
+       local_irq_save(flags);
+       for (query_cpu = 0; query_cpu < NR_CPUS; ++query_cpu) {
+               if (cpu_isset(query_cpu, mask)) {
+                       __send_IPI_dest_field(cpu_to_logical_apicid(query_cpu),
+                                             vector);
+               }
+       }
+       local_irq_restore(flags);
+}
+
+#include <mach_ipi.h> /* must come after the send_IPI functions above for inlining */
+
+/*
+ *     Smarter SMP flushing macros. 
+ *             c/o Linus Torvalds.
+ *
+ *     These mean you can really definitely utterly forget about
+ *     writing to user space from interrupts. (Its not allowed anyway).
+ *
+ *     Optimizations Manfred Spraul <manfred@colorfullife.com>
+ */
+
+static cpumask_t flush_cpumask;
+static struct mm_struct * flush_mm;
+static unsigned long flush_va;
+static DEFINE_SPINLOCK(tlbstate_lock);
+
+/*
+ * We cannot call mmdrop() because we are in interrupt context,
+ * instead update mm->cpu_vm_mask.
+ *
+ * We need to reload %cr3 since the page tables may be going
+ * away from under us..
+ */
+void leave_mm(unsigned long cpu)
+{
+       if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK)
+               BUG();
+       cpu_clear(cpu, per_cpu(cpu_tlbstate, cpu).active_mm->cpu_vm_mask);
+       load_cr3(swapper_pg_dir);
+}
+
+/*
+ *
+ * The flush IPI assumes that a thread switch happens in this order:
+ * [cpu0: the cpu that switches]
+ * 1) switch_mm() either 1a) or 1b)
+ * 1a) thread switch to a different mm
+ * 1a1) cpu_clear(cpu, old_mm->cpu_vm_mask);
+ *     Stop ipi delivery for the old mm. This is not synchronized with
+ *     the other cpus, but smp_invalidate_interrupt ignore flush ipis
+ *     for the wrong mm, and in the worst case we perform a superflous
+ *     tlb flush.
+ * 1a2) set cpu_tlbstate to TLBSTATE_OK
+ *     Now the smp_invalidate_interrupt won't call leave_mm if cpu0
+ *     was in lazy tlb mode.
+ * 1a3) update cpu_tlbstate[].active_mm
+ *     Now cpu0 accepts tlb flushes for the new mm.
+ * 1a4) cpu_set(cpu, new_mm->cpu_vm_mask);
+ *     Now the other cpus will send tlb flush ipis.
+ * 1a4) change cr3.
+ * 1b) thread switch without mm change
+ *     cpu_tlbstate[].active_mm is correct, cpu0 already handles
+ *     flush ipis.
+ * 1b1) set cpu_tlbstate to TLBSTATE_OK
+ * 1b2) test_and_set the cpu bit in cpu_vm_mask.
+ *     Atomically set the bit [other cpus will start sending flush ipis],
+ *     and test the bit.
+ * 1b3) if the bit was 0: leave_mm was called, flush the tlb.
+ * 2) switch %%esp, ie current
+ *
+ * The interrupt must handle 2 special cases:
+ * - cr3 is changed before %%esp, ie. it cannot use current->{active_,}mm.
+ * - the cpu performs speculative tlb reads, i.e. even if the cpu only
+ *   runs in kernel space, the cpu could load tlb entries for user space
+ *   pages.
+ *
+ * The good news is that cpu_tlbstate is local to each cpu, no
+ * write/read ordering problems.
+ */
+
+/*
+ * TLB flush IPI:
+ *
+ * 1) Flush the tlb entries if the cpu uses the mm that's being flushed.
+ * 2) Leave the mm if we are in the lazy tlb mode.
+ */
+
+fastcall void smp_invalidate_interrupt(struct pt_regs *regs)
+{
+       unsigned long cpu;
+
+       cpu = get_cpu();
+
+       if (!cpu_isset(cpu, flush_cpumask))
+               goto out;
+               /* 
+                * This was a BUG() but until someone can quote me the
+                * line from the intel manual that guarantees an IPI to
+                * multiple CPUs is retried _only_ on the erroring CPUs
+                * its staying as a return
+                *
+                * BUG();
+                */
+                
+       if (flush_mm == per_cpu(cpu_tlbstate, cpu).active_mm) {
+               if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK) {
+                       if (flush_va == TLB_FLUSH_ALL)
+                               local_flush_tlb();
+                       else
+                               __flush_tlb_one(flush_va);
+               } else
+                       leave_mm(cpu);
+       }
+       ack_APIC_irq();
+       smp_mb__before_clear_bit();
+       cpu_clear(cpu, flush_cpumask);
+       smp_mb__after_clear_bit();
+out:
+       put_cpu_no_resched();
+}
+
+void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm,
+                            unsigned long va)
+{
+       cpumask_t cpumask = *cpumaskp;
+
+       /*
+        * A couple of (to be removed) sanity checks:
+        *
+        * - current CPU must not be in mask
+        * - mask must exist :)
+        */
+       BUG_ON(cpus_empty(cpumask));
+       BUG_ON(cpu_isset(smp_processor_id(), cpumask));
+       BUG_ON(!mm);
+
+#ifdef CONFIG_HOTPLUG_CPU
+       /* If a CPU which we ran on has gone down, OK. */
+       cpus_and(cpumask, cpumask, cpu_online_map);
+       if (unlikely(cpus_empty(cpumask)))
+               return;
+#endif
+
+       /*
+        * i'm not happy about this global shared spinlock in the
+        * MM hot path, but we'll see how contended it is.
+        * AK: x86-64 has a faster method that could be ported.
+        */
+       spin_lock(&tlbstate_lock);
+       
+       flush_mm = mm;
+       flush_va = va;
+       cpus_or(flush_cpumask, cpumask, flush_cpumask);
+       /*
+        * We have to send the IPI only to
+        * CPUs affected.
+        */
+       send_IPI_mask(cpumask, INVALIDATE_TLB_VECTOR);
+
+       while (!cpus_empty(flush_cpumask))
+               /* nothing. lockup detection does not belong here */
+               cpu_relax();
+
+       flush_mm = NULL;
+       flush_va = 0;
+       spin_unlock(&tlbstate_lock);
+}
+       
+void flush_tlb_current_task(void)
+{
+       struct mm_struct *mm = current->mm;
+       cpumask_t cpu_mask;
+
+       preempt_disable();
+       cpu_mask = mm->cpu_vm_mask;
+       cpu_clear(smp_processor_id(), cpu_mask);
+
+       local_flush_tlb();
+       if (!cpus_empty(cpu_mask))
+               flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL);
+       preempt_enable();
+}
+
+void flush_tlb_mm (struct mm_struct * mm)
+{
+       cpumask_t cpu_mask;
+
+       preempt_disable();
+       cpu_mask = mm->cpu_vm_mask;
+       cpu_clear(smp_processor_id(), cpu_mask);
+
+       if (current->active_mm == mm) {
+               if (current->mm)
+                       local_flush_tlb();
+               else
+                       leave_mm(smp_processor_id());
+       }
+       if (!cpus_empty(cpu_mask))
+               flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL);
+
+       preempt_enable();
+}
+
+void flush_tlb_page(struct vm_area_struct * vma, unsigned long va)
+{
+       struct mm_struct *mm = vma->vm_mm;
+       cpumask_t cpu_mask;
+
+       preempt_disable();
+       cpu_mask = mm->cpu_vm_mask;
+       cpu_clear(smp_processor_id(), cpu_mask);
+
+       if (current->active_mm == mm) {
+               if(current->mm)
+                       __flush_tlb_one(va);
+                else
+                       leave_mm(smp_processor_id());
+       }
+
+       if (!cpus_empty(cpu_mask))
+               flush_tlb_others(cpu_mask, mm, va);
+
+       preempt_enable();
+}
+EXPORT_SYMBOL(flush_tlb_page);
+
+static void do_flush_tlb_all(void* info)
+{
+       unsigned long cpu = smp_processor_id();
+
+       __flush_tlb_all();
+       if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_LAZY)
+               leave_mm(cpu);
+}
+
+void flush_tlb_all(void)
+{
+       on_each_cpu(do_flush_tlb_all, NULL, 1, 1);
+}
+
+/*
+ * this function sends a 'reschedule' IPI to another CPU.
+ * it goes straight through and wastes no time serializing
+ * anything. Worst case is that we lose a reschedule ...
+ */
+static void native_smp_send_reschedule(int cpu)
+{
+       WARN_ON(cpu_is_offline(cpu));
+       send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR);
+}
+
+/*
+ * Structure and data for smp_call_function(). This is designed to minimise
+ * static memory requirements. It also looks cleaner.
+ */
+static DEFINE_SPINLOCK(call_lock);
+
+struct call_data_struct {
+       void (*func) (void *info);
+       void *info;
+       atomic_t started;
+       atomic_t finished;
+       int wait;
+};
+
+void lock_ipi_call_lock(void)
+{
+       spin_lock_irq(&call_lock);
+}
+
+void unlock_ipi_call_lock(void)
+{
+       spin_unlock_irq(&call_lock);
+}
+
+static struct call_data_struct *call_data;
+
+static void __smp_call_function(void (*func) (void *info), void *info,
+                               int nonatomic, int wait)
+{
+       struct call_data_struct data;
+       int cpus = num_online_cpus() - 1;
+
+       if (!cpus)
+               return;
+
+       data.func = func;
+       data.info = info;
+       atomic_set(&data.started, 0);
+       data.wait = wait;
+       if (wait)
+               atomic_set(&data.finished, 0);
+
+       call_data = &data;
+       mb();
+       
+       /* Send a message to all other CPUs and wait for them to respond */
+       send_IPI_allbutself(CALL_FUNCTION_VECTOR);
+
+       /* Wait for response */
+       while (atomic_read(&data.started) != cpus)
+               cpu_relax();
+
+       if (wait)
+               while (atomic_read(&data.finished) != cpus)
+                       cpu_relax();
+}
+
+
+/**
+ * smp_call_function_mask(): Run a function on a set of other CPUs.
+ * @mask: The set of cpus to run on.  Must not include the current cpu.
+ * @func: The function to run. This must be fast and non-blocking.
+ * @info: An arbitrary pointer to pass to the function.
+ * @wait: If true, wait (atomically) until function has completed on other CPUs.
+ *
+  * Returns 0 on success, else a negative status code.
+ *
+ * If @wait is true, then returns once @func has returned; otherwise
+ * it returns just before the target cpu calls @func.
+ *
+ * You must not call this function with disabled interrupts or from a
+ * hardware interrupt handler or from a bottom half handler.
+ */
+static int
+native_smp_call_function_mask(cpumask_t mask,
+                             void (*func)(void *), void *info,
+                             int wait)
+{
+       struct call_data_struct data;
+       cpumask_t allbutself;
+       int cpus;
+
+       /* Can deadlock when called with interrupts disabled */
+       WARN_ON(irqs_disabled());
+
+       /* Holding any lock stops cpus from going down. */
+       spin_lock(&call_lock);
+
+       allbutself = cpu_online_map;
+       cpu_clear(smp_processor_id(), allbutself);
+
+       cpus_and(mask, mask, allbutself);
+       cpus = cpus_weight(mask);
+
+       if (!cpus) {
+               spin_unlock(&call_lock);
+               return 0;
+       }
+
+       data.func = func;
+       data.info = info;
+       atomic_set(&data.started, 0);
+       data.wait = wait;
+       if (wait)
+               atomic_set(&data.finished, 0);
+
+       call_data = &data;
+       mb();
+
+       /* Send a message to other CPUs */
+       if (cpus_equal(mask, allbutself))
+               send_IPI_allbutself(CALL_FUNCTION_VECTOR);
+       else
+               send_IPI_mask(mask, CALL_FUNCTION_VECTOR);
+
+       /* Wait for response */
+       while (atomic_read(&data.started) != cpus)
+               cpu_relax();
+
+       if (wait)
+               while (atomic_read(&data.finished) != cpus)
+                       cpu_relax();
+       spin_unlock(&call_lock);
+
+       return 0;
+}
+
+static void stop_this_cpu (void * dummy)
+{
+       local_irq_disable();
+       /*
+        * Remove this CPU:
+        */
+       cpu_clear(smp_processor_id(), cpu_online_map);
+       disable_local_APIC();
+       if (cpu_data[smp_processor_id()].hlt_works_ok)
+               for(;;) halt();
+       for (;;);
+}
+
+/*
+ * this function calls the 'stop' function on all other CPUs in the system.
+ */
+
+static void native_smp_send_stop(void)
+{
+       /* Don't deadlock on the call lock in panic */
+       int nolock = !spin_trylock(&call_lock);
+       unsigned long flags;
+
+       local_irq_save(flags);
+       __smp_call_function(stop_this_cpu, NULL, 0, 0);
+       if (!nolock)
+               spin_unlock(&call_lock);
+       disable_local_APIC();
+       local_irq_restore(flags);
+}
+
+/*
+ * Reschedule call back. Nothing to do,
+ * all the work is done automatically when
+ * we return from the interrupt.
+ */
+fastcall void smp_reschedule_interrupt(struct pt_regs *regs)
+{
+       ack_APIC_irq();
+}
+
+fastcall void smp_call_function_interrupt(struct pt_regs *regs)
+{
+       void (*func) (void *info) = call_data->func;
+       void *info = call_data->info;
+       int wait = call_data->wait;
+
+       ack_APIC_irq();
+       /*
+        * Notify initiating CPU that I've grabbed the data and am
+        * about to execute the function
+        */
+       mb();
+       atomic_inc(&call_data->started);
+       /*
+        * At this point the info structure may be out of scope unless wait==1
+        */
+       irq_enter();
+       (*func)(info);
+       irq_exit();
+
+       if (wait) {
+               mb();
+               atomic_inc(&call_data->finished);
+       }
+}
+
+static int convert_apicid_to_cpu(int apic_id)
+{
+       int i;
+
+       for (i = 0; i < NR_CPUS; i++) {
+               if (x86_cpu_to_apicid[i] == apic_id)
+                       return i;
+       }
+       return -1;
+}
+
+int safe_smp_processor_id(void)
+{
+       int apicid, cpuid;
+
+       if (!boot_cpu_has(X86_FEATURE_APIC))
+               return 0;
+
+       apicid = hard_smp_processor_id();
+       if (apicid == BAD_APICID)
+               return 0;
+
+       cpuid = convert_apicid_to_cpu(apicid);
+
+       return cpuid >= 0 ? cpuid : 0;
+}
+
+struct smp_ops smp_ops = {
+       .smp_prepare_boot_cpu = native_smp_prepare_boot_cpu,
+       .smp_prepare_cpus = native_smp_prepare_cpus,
+       .cpu_up = native_cpu_up,
+       .smp_cpus_done = native_smp_cpus_done,
+
+       .smp_send_stop = native_smp_send_stop,
+       .smp_send_reschedule = native_smp_send_reschedule,
+       .smp_call_function_mask = native_smp_call_function_mask,
+};
diff --git a/arch/x86/kernel/smpboot_32.c b/arch/x86/kernel/smpboot_32.c
new file mode 100644 (file)
index 0000000..e4f61d1
--- /dev/null
@@ -0,0 +1,1322 @@
+/*
+ *     x86 SMP booting functions
+ *
+ *     (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
+ *     (c) 1998, 1999, 2000 Ingo Molnar <mingo@redhat.com>
+ *
+ *     Much of the core SMP work is based on previous work by Thomas Radke, to
+ *     whom a great many thanks are extended.
+ *
+ *     Thanks to Intel for making available several different Pentium,
+ *     Pentium Pro and Pentium-II/Xeon MP machines.
+ *     Original development of Linux SMP code supported by Caldera.
+ *
+ *     This code is released under the GNU General Public License version 2 or
+ *     later.
+ *
+ *     Fixes
+ *             Felix Koop      :       NR_CPUS used properly
+ *             Jose Renau      :       Handle single CPU case.
+ *             Alan Cox        :       By repeated request 8) - Total BogoMIPS report.
+ *             Greg Wright     :       Fix for kernel stacks panic.
+ *             Erich Boleyn    :       MP v1.4 and additional changes.
+ *     Matthias Sattler        :       Changes for 2.1 kernel map.
+ *     Michel Lespinasse       :       Changes for 2.1 kernel map.
+ *     Michael Chastain        :       Change trampoline.S to gnu as.
+ *             Alan Cox        :       Dumb bug: 'B' step PPro's are fine
+ *             Ingo Molnar     :       Added APIC timers, based on code
+ *                                     from Jose Renau
+ *             Ingo Molnar     :       various cleanups and rewrites
+ *             Tigran Aivazian :       fixed "0.00 in /proc/uptime on SMP" bug.
+ *     Maciej W. Rozycki       :       Bits for genuine 82489DX APICs
+ *             Martin J. Bligh :       Added support for multi-quad systems
+ *             Dave Jones      :       Report invalid combinations of Athlon CPUs.
+*              Rusty Russell   :       Hacked into shape for new "hotplug" boot process. */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+
+#include <linux/mm.h>
+#include <linux/sched.h>
+#include <linux/kernel_stat.h>
+#include <linux/bootmem.h>
+#include <linux/notifier.h>
+#include <linux/cpu.h>
+#include <linux/percpu.h>
+#include <linux/nmi.h>
+
+#include <linux/delay.h>
+#include <linux/mc146818rtc.h>
+#include <asm/tlbflush.h>
+#include <asm/desc.h>
+#include <asm/arch_hooks.h>
+#include <asm/nmi.h>
+
+#include <mach_apic.h>
+#include <mach_wakecpu.h>
+#include <smpboot_hooks.h>
+#include <asm/vmi.h>
+#include <asm/mtrr.h>
+
+/* Set if we find a B stepping CPU */
+static int __devinitdata smp_b_stepping;
+
+/* Number of siblings per CPU package */
+int smp_num_siblings = 1;
+EXPORT_SYMBOL(smp_num_siblings);
+
+/* Last level cache ID of each logical CPU */
+int cpu_llc_id[NR_CPUS] __cpuinitdata = {[0 ... NR_CPUS-1] = BAD_APICID};
+
+/* representing HT siblings of each logical CPU */
+cpumask_t cpu_sibling_map[NR_CPUS] __read_mostly;
+EXPORT_SYMBOL(cpu_sibling_map);
+
+/* representing HT and core siblings of each logical CPU */
+cpumask_t cpu_core_map[NR_CPUS] __read_mostly;
+EXPORT_SYMBOL(cpu_core_map);
+
+/* bitmap of online cpus */
+cpumask_t cpu_online_map __read_mostly;
+EXPORT_SYMBOL(cpu_online_map);
+
+cpumask_t cpu_callin_map;
+cpumask_t cpu_callout_map;
+EXPORT_SYMBOL(cpu_callout_map);
+cpumask_t cpu_possible_map;
+EXPORT_SYMBOL(cpu_possible_map);
+static cpumask_t smp_commenced_mask;
+
+/* Per CPU bogomips and other parameters */
+struct cpuinfo_x86 cpu_data[NR_CPUS] __cacheline_aligned;
+EXPORT_SYMBOL(cpu_data);
+
+u8 x86_cpu_to_apicid[NR_CPUS] __read_mostly =
+                       { [0 ... NR_CPUS-1] = 0xff };
+EXPORT_SYMBOL(x86_cpu_to_apicid);
+
+u8 apicid_2_node[MAX_APICID];
+
+/*
+ * Trampoline 80x86 program as an array.
+ */
+
+extern unsigned char trampoline_data [];
+extern unsigned char trampoline_end  [];
+static unsigned char *trampoline_base;
+static int trampoline_exec;
+
+static void map_cpu_to_logical_apicid(void);
+
+/* State of each CPU. */
+DEFINE_PER_CPU(int, cpu_state) = { 0 };
+
+/*
+ * Currently trivial. Write the real->protected mode
+ * bootstrap into the page concerned. The caller
+ * has made sure it's suitably aligned.
+ */
+
+static unsigned long __devinit setup_trampoline(void)
+{
+       memcpy(trampoline_base, trampoline_data, trampoline_end - trampoline_data);
+       return virt_to_phys(trampoline_base);
+}
+
+/*
+ * We are called very early to get the low memory for the
+ * SMP bootup trampoline page.
+ */
+void __init smp_alloc_memory(void)
+{
+       trampoline_base = (void *) alloc_bootmem_low_pages(PAGE_SIZE);
+       /*
+        * Has to be in very low memory so we can execute
+        * real-mode AP code.
+        */
+       if (__pa(trampoline_base) >= 0x9F000)
+               BUG();
+       /*
+        * Make the SMP trampoline executable:
+        */
+       trampoline_exec = set_kernel_exec((unsigned long)trampoline_base, 1);
+}
+
+/*
+ * The bootstrap kernel entry code has set these up. Save them for
+ * a given CPU
+ */
+
+void __cpuinit smp_store_cpu_info(int id)
+{
+       struct cpuinfo_x86 *c = cpu_data + id;
+
+       *c = boot_cpu_data;
+       if (id!=0)
+               identify_secondary_cpu(c);
+       /*
+        * Mask B, Pentium, but not Pentium MMX
+        */
+       if (c->x86_vendor == X86_VENDOR_INTEL &&
+           c->x86 == 5 &&
+           c->x86_mask >= 1 && c->x86_mask <= 4 &&
+           c->x86_model <= 3)
+               /*
+                * Remember we have B step Pentia with bugs
+                */
+               smp_b_stepping = 1;
+
+       /*
+        * Certain Athlons might work (for various values of 'work') in SMP
+        * but they are not certified as MP capable.
+        */
+       if ((c->x86_vendor == X86_VENDOR_AMD) && (c->x86 == 6)) {
+
+               if (num_possible_cpus() == 1)
+                       goto valid_k7;
+
+               /* Athlon 660/661 is valid. */  
+               if ((c->x86_model==6) && ((c->x86_mask==0) || (c->x86_mask==1)))
+                       goto valid_k7;
+
+               /* Duron 670 is valid */
+               if ((c->x86_model==7) && (c->x86_mask==0))
+                       goto valid_k7;
+
+               /*
+                * Athlon 662, Duron 671, and Athlon >model 7 have capability bit.
+                * It's worth noting that the A5 stepping (662) of some Athlon XP's
+                * have the MP bit set.
+                * See http://www.heise.de/newsticker/data/jow-18.10.01-000 for more.
+                */
+               if (((c->x86_model==6) && (c->x86_mask>=2)) ||
+                   ((c->x86_model==7) && (c->x86_mask>=1)) ||
+                    (c->x86_model> 7))
+                       if (cpu_has_mp)
+                               goto valid_k7;
+
+               /* If we get here, it's not a certified SMP capable AMD system. */
+               add_taint(TAINT_UNSAFE_SMP);
+       }
+
+valid_k7:
+       ;
+}
+
+extern void calibrate_delay(void);
+
+static atomic_t init_deasserted;
+
+static void __cpuinit smp_callin(void)
+{
+       int cpuid, phys_id;
+       unsigned long timeout;
+
+       /*
+        * If waken up by an INIT in an 82489DX configuration
+        * we may get here before an INIT-deassert IPI reaches
+        * our local APIC.  We have to wait for the IPI or we'll
+        * lock up on an APIC access.
+        */
+       wait_for_init_deassert(&init_deasserted);
+
+       /*
+        * (This works even if the APIC is not enabled.)
+        */
+       phys_id = GET_APIC_ID(apic_read(APIC_ID));
+       cpuid = smp_processor_id();
+       if (cpu_isset(cpuid, cpu_callin_map)) {
+               printk("huh, phys CPU#%d, CPU#%d already present??\n",
+                                       phys_id, cpuid);
+               BUG();
+       }
+       Dprintk("CPU#%d (phys ID: %d) waiting for CALLOUT\n", cpuid, phys_id);
+
+       /*
+        * STARTUP IPIs are fragile beasts as they might sometimes
+        * trigger some glue motherboard logic. Complete APIC bus
+        * silence for 1 second, this overestimates the time the
+        * boot CPU is spending to send the up to 2 STARTUP IPIs
+        * by a factor of two. This should be enough.
+        */
+
+       /*
+        * Waiting 2s total for startup (udelay is not yet working)
+        */
+       timeout = jiffies + 2*HZ;
+       while (time_before(jiffies, timeout)) {
+               /*
+                * Has the boot CPU finished it's STARTUP sequence?
+                */
+               if (cpu_isset(cpuid, cpu_callout_map))
+                       break;
+               rep_nop();
+       }
+
+       if (!time_before(jiffies, timeout)) {
+               printk("BUG: CPU%d started up but did not get a callout!\n",
+                       cpuid);
+               BUG();
+       }
+
+       /*
+        * the boot CPU has finished the init stage and is spinning
+        * on callin_map until we finish. We are free to set up this
+        * CPU, first the APIC. (this is probably redundant on most
+        * boards)
+        */
+
+       Dprintk("CALLIN, before setup_local_APIC().\n");
+       smp_callin_clear_local_apic();
+       setup_local_APIC();
+       map_cpu_to_logical_apicid();
+
+       /*
+        * Get our bogomips.
+        */
+       calibrate_delay();
+       Dprintk("Stack at about %p\n",&cpuid);
+
+       /*
+        * Save our processor parameters
+        */
+       smp_store_cpu_info(cpuid);
+
+       /*
+        * Allow the master to continue.
+        */
+       cpu_set(cpuid, cpu_callin_map);
+}
+
+static int cpucount;
+
+/* maps the cpu to the sched domain representing multi-core */
+cpumask_t cpu_coregroup_map(int cpu)
+{
+       struct cpuinfo_x86 *c = cpu_data + cpu;
+       /*
+        * For perf, we return last level cache shared map.
+        * And for power savings, we return cpu_core_map
+        */
+       if (sched_mc_power_savings || sched_smt_power_savings)
+               return cpu_core_map[cpu];
+       else
+               return c->llc_shared_map;
+}
+
+/* representing cpus for which sibling maps can be computed */
+static cpumask_t cpu_sibling_setup_map;
+
+void __cpuinit set_cpu_sibling_map(int cpu)
+{
+       int i;
+       struct cpuinfo_x86 *c = cpu_data;
+
+       cpu_set(cpu, cpu_sibling_setup_map);
+
+       if (smp_num_siblings > 1) {
+               for_each_cpu_mask(i, cpu_sibling_setup_map) {
+                       if (c[cpu].phys_proc_id == c[i].phys_proc_id &&
+                           c[cpu].cpu_core_id == c[i].cpu_core_id) {
+                               cpu_set(i, cpu_sibling_map[cpu]);
+                               cpu_set(cpu, cpu_sibling_map[i]);
+                               cpu_set(i, cpu_core_map[cpu]);
+                               cpu_set(cpu, cpu_core_map[i]);
+                               cpu_set(i, c[cpu].llc_shared_map);
+                               cpu_set(cpu, c[i].llc_shared_map);
+                       }
+               }
+       } else {
+               cpu_set(cpu, cpu_sibling_map[cpu]);
+       }
+
+       cpu_set(cpu, c[cpu].llc_shared_map);
+
+       if (current_cpu_data.x86_max_cores == 1) {
+               cpu_core_map[cpu] = cpu_sibling_map[cpu];
+               c[cpu].booted_cores = 1;
+               return;
+       }
+
+       for_each_cpu_mask(i, cpu_sibling_setup_map) {
+               if (cpu_llc_id[cpu] != BAD_APICID &&
+                   cpu_llc_id[cpu] == cpu_llc_id[i]) {
+                       cpu_set(i, c[cpu].llc_shared_map);
+                       cpu_set(cpu, c[i].llc_shared_map);
+               }
+               if (c[cpu].phys_proc_id == c[i].phys_proc_id) {
+                       cpu_set(i, cpu_core_map[cpu]);
+                       cpu_set(cpu, cpu_core_map[i]);
+                       /*
+                        *  Does this new cpu bringup a new core?
+                        */
+                       if (cpus_weight(cpu_sibling_map[cpu]) == 1) {
+                               /*
+                                * for each core in package, increment
+                                * the booted_cores for this new cpu
+                                */
+                               if (first_cpu(cpu_sibling_map[i]) == i)
+                                       c[cpu].booted_cores++;
+                               /*
+                                * increment the core count for all
+                                * the other cpus in this package
+                                */
+                               if (i != cpu)
+                                       c[i].booted_cores++;
+                       } else if (i != cpu && !c[cpu].booted_cores)
+                               c[cpu].booted_cores = c[i].booted_cores;
+               }
+       }
+}
+
+/*
+ * Activate a secondary processor.
+ */
+static void __cpuinit start_secondary(void *unused)
+{
+       /*
+        * Don't put *anything* before cpu_init(), SMP booting is too
+        * fragile that we want to limit the things done here to the
+        * most necessary things.
+        */
+#ifdef CONFIG_VMI
+       vmi_bringup();
+#endif
+       cpu_init();
+       preempt_disable();
+       smp_callin();
+       while (!cpu_isset(smp_processor_id(), smp_commenced_mask))
+               rep_nop();
+       /*
+        * Check TSC synchronization with the BP:
+        */
+       check_tsc_sync_target();
+
+       setup_secondary_clock();
+       if (nmi_watchdog == NMI_IO_APIC) {
+               disable_8259A_irq(0);
+               enable_NMI_through_LVT0(NULL);
+               enable_8259A_irq(0);
+       }
+       /*
+        * low-memory mappings have been cleared, flush them from
+        * the local TLBs too.
+        */
+       local_flush_tlb();
+
+       /* This must be done before setting cpu_online_map */
+       set_cpu_sibling_map(raw_smp_processor_id());
+       wmb();
+
+       /*
+        * We need to hold call_lock, so there is no inconsistency
+        * between the time smp_call_function() determines number of
+        * IPI receipients, and the time when the determination is made
+        * for which cpus receive the IPI. Holding this
+        * lock helps us to not include this cpu in a currently in progress
+        * smp_call_function().
+        */
+       lock_ipi_call_lock();
+       cpu_set(smp_processor_id(), cpu_online_map);
+       unlock_ipi_call_lock();
+       per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;
+
+       /* We can take interrupts now: we're officially "up". */
+       local_irq_enable();
+
+       wmb();
+       cpu_idle();
+}
+
+/*
+ * Everything has been set up for the secondary
+ * CPUs - they just need to reload everything
+ * from the task structure
+ * This function must not return.
+ */
+void __devinit initialize_secondary(void)
+{
+       /*
+        * We don't actually need to load the full TSS,
+        * basically just the stack pointer and the eip.
+        */
+
+       asm volatile(
+               "movl %0,%%esp\n\t"
+               "jmp *%1"
+               :
+               :"m" (current->thread.esp),"m" (current->thread.eip));
+}
+
+/* Static state in head.S used to set up a CPU */
+extern struct {
+       void * esp;
+       unsigned short ss;
+} stack_start;
+
+#ifdef CONFIG_NUMA
+
+/* which logical CPUs are on which nodes */
+cpumask_t node_2_cpu_mask[MAX_NUMNODES] __read_mostly =
+                               { [0 ... MAX_NUMNODES-1] = CPU_MASK_NONE };
+EXPORT_SYMBOL(node_2_cpu_mask);
+/* which node each logical CPU is on */
+int cpu_2_node[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = 0 };
+EXPORT_SYMBOL(cpu_2_node);
+
+/* set up a mapping between cpu and node. */
+static inline void map_cpu_to_node(int cpu, int node)
+{
+       printk("Mapping cpu %d to node %d\n", cpu, node);
+       cpu_set(cpu, node_2_cpu_mask[node]);
+       cpu_2_node[cpu] = node;
+}
+
+/* undo a mapping between cpu and node. */
+static inline void unmap_cpu_to_node(int cpu)
+{
+       int node;
+
+       printk("Unmapping cpu %d from all nodes\n", cpu);
+       for (node = 0; node < MAX_NUMNODES; node ++)
+               cpu_clear(cpu, node_2_cpu_mask[node]);
+       cpu_2_node[cpu] = 0;
+}
+#else /* !CONFIG_NUMA */
+
+#define map_cpu_to_node(cpu, node)     ({})
+#define unmap_cpu_to_node(cpu) ({})
+
+#endif /* CONFIG_NUMA */
+
+u8 cpu_2_logical_apicid[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = BAD_APICID };
+
+static void map_cpu_to_logical_apicid(void)
+{
+       int cpu = smp_processor_id();
+       int apicid = logical_smp_processor_id();
+       int node = apicid_to_node(apicid);
+
+       if (!node_online(node))
+               node = first_online_node;
+
+       cpu_2_logical_apicid[cpu] = apicid;
+       map_cpu_to_node(cpu, node);
+}
+
+static void unmap_cpu_to_logical_apicid(int cpu)
+{
+       cpu_2_logical_apicid[cpu] = BAD_APICID;
+       unmap_cpu_to_node(cpu);
+}
+
+static inline void __inquire_remote_apic(int apicid)
+{
+       int i, regs[] = { APIC_ID >> 4, APIC_LVR >> 4, APIC_SPIV >> 4 };
+       char *names[] = { "ID", "VERSION", "SPIV" };
+       int timeout;
+       unsigned long status;
+
+       printk("Inquiring remote APIC #%d...\n", apicid);
+
+       for (i = 0; i < ARRAY_SIZE(regs); i++) {
+               printk("... APIC #%d %s: ", apicid, names[i]);
+
+               /*
+                * Wait for idle.
+                */
+               status = safe_apic_wait_icr_idle();
+               if (status)
+                       printk("a previous APIC delivery may have failed\n");
+
+               apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(apicid));
+               apic_write_around(APIC_ICR, APIC_DM_REMRD | regs[i]);
+
+               timeout = 0;
+               do {
+                       udelay(100);
+                       status = apic_read(APIC_ICR) & APIC_ICR_RR_MASK;
+               } while (status == APIC_ICR_RR_INPROG && timeout++ < 1000);
+
+               switch (status) {
+               case APIC_ICR_RR_VALID:
+                       status = apic_read(APIC_RRR);
+                       printk("%lx\n", status);
+                       break;
+               default:
+                       printk("failed\n");
+               }
+       }
+}
+
+#ifdef WAKE_SECONDARY_VIA_NMI
+/* 
+ * Poke the other CPU in the eye via NMI to wake it up. Remember that the normal
+ * INIT, INIT, STARTUP sequence will reset the chip hard for us, and this
+ * won't ... remember to clear down the APIC, etc later.
+ */
+static int __devinit
+wakeup_secondary_cpu(int logical_apicid, unsigned long start_eip)
+{
+       unsigned long send_status, accept_status = 0;
+       int maxlvt;
+
+       /* Target chip */
+       apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(logical_apicid));
+
+       /* Boot on the stack */
+       /* Kick the second */
+       apic_write_around(APIC_ICR, APIC_DM_NMI | APIC_DEST_LOGICAL);
+
+       Dprintk("Waiting for send to finish...\n");
+       send_status = safe_apic_wait_icr_idle();
+
+       /*
+        * Give the other CPU some time to accept the IPI.
+        */
+       udelay(200);
+       /*
+        * Due to the Pentium erratum 3AP.
+        */
+       maxlvt = lapic_get_maxlvt();
+       if (maxlvt > 3) {
+               apic_read_around(APIC_SPIV);
+               apic_write(APIC_ESR, 0);
+       }
+       accept_status = (apic_read(APIC_ESR) & 0xEF);
+       Dprintk("NMI sent.\n");
+
+       if (send_status)
+               printk("APIC never delivered???\n");
+       if (accept_status)
+               printk("APIC delivery error (%lx).\n", accept_status);
+
+       return (send_status | accept_status);
+}
+#endif /* WAKE_SECONDARY_VIA_NMI */
+
+#ifdef WAKE_SECONDARY_VIA_INIT
+static int __devinit
+wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip)
+{
+       unsigned long send_status, accept_status = 0;
+       int maxlvt, num_starts, j;
+
+       /*
+        * Be paranoid about clearing APIC errors.
+        */
+       if (APIC_INTEGRATED(apic_version[phys_apicid])) {
+               apic_read_around(APIC_SPIV);
+               apic_write(APIC_ESR, 0);
+               apic_read(APIC_ESR);
+       }
+
+       Dprintk("Asserting INIT.\n");
+
+       /*
+        * Turn INIT on target chip
+        */
+       apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));
+
+       /*
+        * Send IPI
+        */
+       apic_write_around(APIC_ICR, APIC_INT_LEVELTRIG | APIC_INT_ASSERT
+                               | APIC_DM_INIT);
+
+       Dprintk("Waiting for send to finish...\n");
+       send_status = safe_apic_wait_icr_idle();
+
+       mdelay(10);
+
+       Dprintk("Deasserting INIT.\n");
+
+       /* Target chip */
+       apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));
+
+       /* Send IPI */
+       apic_write_around(APIC_ICR, APIC_INT_LEVELTRIG | APIC_DM_INIT);
+
+       Dprintk("Waiting for send to finish...\n");
+       send_status = safe_apic_wait_icr_idle();
+
+       atomic_set(&init_deasserted, 1);
+
+       /*
+        * Should we send STARTUP IPIs ?
+        *
+        * Determine this based on the APIC version.
+        * If we don't have an integrated APIC, don't send the STARTUP IPIs.
+        */
+       if (APIC_INTEGRATED(apic_version[phys_apicid]))
+               num_starts = 2;
+       else
+               num_starts = 0;
+
+       /*
+        * Paravirt / VMI wants a startup IPI hook here to set up the
+        * target processor state.
+        */
+       startup_ipi_hook(phys_apicid, (unsigned long) start_secondary,
+                        (unsigned long) stack_start.esp);
+
+       /*
+        * Run STARTUP IPI loop.
+        */
+       Dprintk("#startup loops: %d.\n", num_starts);
+
+       maxlvt = lapic_get_maxlvt();
+
+       for (j = 1; j <= num_starts; j++) {
+               Dprintk("Sending STARTUP #%d.\n",j);
+               apic_read_around(APIC_SPIV);
+               apic_write(APIC_ESR, 0);
+               apic_read(APIC_ESR);
+               Dprintk("After apic_write.\n");
+
+               /*
+                * STARTUP IPI
+                */
+
+               /* Target chip */
+               apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));
+
+               /* Boot on the stack */
+               /* Kick the second */
+               apic_write_around(APIC_ICR, APIC_DM_STARTUP
+                                       | (start_eip >> 12));
+
+               /*
+                * Give the other CPU some time to accept the IPI.
+                */
+               udelay(300);
+
+               Dprintk("Startup point 1.\n");
+
+               Dprintk("Waiting for send to finish...\n");
+               send_status = safe_apic_wait_icr_idle();
+
+               /*
+                * Give the other CPU some time to accept the IPI.
+                */
+               udelay(200);
+               /*
+                * Due to the Pentium erratum 3AP.
+                */
+               if (maxlvt > 3) {
+                       apic_read_around(APIC_SPIV);
+                       apic_write(APIC_ESR, 0);
+               }
+               accept_status = (apic_read(APIC_ESR) & 0xEF);
+               if (send_status || accept_status)
+                       break;
+       }
+       Dprintk("After Startup.\n");
+
+       if (send_status)
+               printk("APIC never delivered???\n");
+       if (accept_status)
+               printk("APIC delivery error (%lx).\n", accept_status);
+
+       return (send_status | accept_status);
+}
+#endif /* WAKE_SECONDARY_VIA_INIT */
+
+extern cpumask_t cpu_initialized;
+static inline int alloc_cpu_id(void)
+{
+       cpumask_t       tmp_map;
+       int cpu;
+       cpus_complement(tmp_map, cpu_present_map);
+       cpu = first_cpu(tmp_map);
+       if (cpu >= NR_CPUS)
+               return -ENODEV;
+       return cpu;
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+static struct task_struct * __devinitdata cpu_idle_tasks[NR_CPUS];
+static inline struct task_struct * alloc_idle_task(int cpu)
+{
+       struct task_struct *idle;
+
+       if ((idle = cpu_idle_tasks[cpu]) != NULL) {
+               /* initialize thread_struct.  we really want to avoid destroy
+                * idle tread
+                */
+               idle->thread.esp = (unsigned long)task_pt_regs(idle);
+               init_idle(idle, cpu);
+               return idle;
+       }
+       idle = fork_idle(cpu);
+
+       if (!IS_ERR(idle))
+               cpu_idle_tasks[cpu] = idle;
+       return idle;
+}
+#else
+#define alloc_idle_task(cpu) fork_idle(cpu)
+#endif
+
+static int __cpuinit do_boot_cpu(int apicid, int cpu)
+/*
+ * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad
+ * (ie clustered apic addressing mode), this is a LOGICAL apic ID.
+ * Returns zero if CPU booted OK, else error code from wakeup_secondary_cpu.
+ */
+{
+       struct task_struct *idle;
+       unsigned long boot_error;
+       int timeout;
+       unsigned long start_eip;
+       unsigned short nmi_high = 0, nmi_low = 0;
+
+       /*
+        * Save current MTRR state in case it was changed since early boot
+        * (e.g. by the ACPI SMI) to initialize new CPUs with MTRRs in sync:
+        */
+       mtrr_save_state();
+
+       /*
+        * We can't use kernel_thread since we must avoid to
+        * reschedule the child.
+        */
+       idle = alloc_idle_task(cpu);
+       if (IS_ERR(idle))
+               panic("failed fork for CPU %d", cpu);
+
+       init_gdt(cpu);
+       per_cpu(current_task, cpu) = idle;
+       early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu);
+
+       idle->thread.eip = (unsigned long) start_secondary;
+       /* start_eip had better be page-aligned! */
+       start_eip = setup_trampoline();
+
+       ++cpucount;
+       alternatives_smp_switch(1);
+
+       /* So we see what's up   */
+       printk("Booting processor %d/%d eip %lx\n", cpu, apicid, start_eip);
+       /* Stack for startup_32 can be just as for start_secondary onwards */
+       stack_start.esp = (void *) idle->thread.esp;
+
+       irq_ctx_init(cpu);
+
+       x86_cpu_to_apicid[cpu] = apicid;
+       /*
+        * This grunge runs the startup process for
+        * the targeted processor.
+        */
+
+       atomic_set(&init_deasserted, 0);
+
+       Dprintk("Setting warm reset code and vector.\n");
+
+       store_NMI_vector(&nmi_high, &nmi_low);
+
+       smpboot_setup_warm_reset_vector(start_eip);
+
+       /*
+        * Starting actual IPI sequence...
+        */
+       boot_error = wakeup_secondary_cpu(apicid, start_eip);
+
+       if (!boot_error) {
+               /*
+                * allow APs to start initializing.
+                */
+               Dprintk("Before Callout %d.\n", cpu);
+               cpu_set(cpu, cpu_callout_map);
+               Dprintk("After Callout %d.\n", cpu);
+
+               /*
+                * Wait 5s total for a response
+                */
+               for (timeout = 0; timeout < 50000; timeout++) {
+                       if (cpu_isset(cpu, cpu_callin_map))
+                               break;  /* It has booted */
+                       udelay(100);
+               }
+
+               if (cpu_isset(cpu, cpu_callin_map)) {
+                       /* number CPUs logically, starting from 1 (BSP is 0) */
+                       Dprintk("OK.\n");
+                       printk("CPU%d: ", cpu);
+                       print_cpu_info(&cpu_data[cpu]);
+                       Dprintk("CPU has booted.\n");
+               } else {
+                       boot_error= 1;
+                       if (*((volatile unsigned char *)trampoline_base)
+                                       == 0xA5)
+                               /* trampoline started but...? */
+                               printk("Stuck ??\n");
+                       else
+                               /* trampoline code not run */
+                               printk("Not responding.\n");
+                       inquire_remote_apic(apicid);
+               }
+       }
+
+       if (boot_error) {
+               /* Try to put things back the way they were before ... */
+               unmap_cpu_to_logical_apicid(cpu);
+               cpu_clear(cpu, cpu_callout_map); /* was set here (do_boot_cpu()) */
+               cpu_clear(cpu, cpu_initialized); /* was set by cpu_init() */
+               cpucount--;
+       } else {
+               x86_cpu_to_apicid[cpu] = apicid;
+               cpu_set(cpu, cpu_present_map);
+       }
+
+       /* mark "stuck" area as not stuck */
+       *((volatile unsigned long *)trampoline_base) = 0;
+
+       return boot_error;
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+void cpu_exit_clear(void)
+{
+       int cpu = raw_smp_processor_id();
+
+       idle_task_exit();
+
+       cpucount --;
+       cpu_uninit();
+       irq_ctx_exit(cpu);
+
+       cpu_clear(cpu, cpu_callout_map);
+       cpu_clear(cpu, cpu_callin_map);
+
+       cpu_clear(cpu, smp_commenced_mask);
+       unmap_cpu_to_logical_apicid(cpu);
+}
+
+struct warm_boot_cpu_info {
+       struct completion *complete;
+       struct work_struct task;
+       int apicid;
+       int cpu;
+};
+
+static void __cpuinit do_warm_boot_cpu(struct work_struct *work)
+{
+       struct warm_boot_cpu_info *info =
+               container_of(work, struct warm_boot_cpu_info, task);
+       do_boot_cpu(info->apicid, info->cpu);
+       complete(info->complete);
+}
+
+static int __cpuinit __smp_prepare_cpu(int cpu)
+{
+       DECLARE_COMPLETION_ONSTACK(done);
+       struct warm_boot_cpu_info info;
+       int     apicid, ret;
+
+       apicid = x86_cpu_to_apicid[cpu];
+       if (apicid == BAD_APICID) {
+               ret = -ENODEV;
+               goto exit;
+       }
+
+       info.complete = &done;
+       info.apicid = apicid;
+       info.cpu = cpu;
+       INIT_WORK(&info.task, do_warm_boot_cpu);
+
+       /* init low mem mapping */
+       clone_pgd_range(swapper_pg_dir, swapper_pg_dir + USER_PGD_PTRS,
+                       min_t(unsigned long, KERNEL_PGD_PTRS, USER_PGD_PTRS));
+       flush_tlb_all();
+       schedule_work(&info.task);
+       wait_for_completion(&done);
+
+       zap_low_mappings();
+       ret = 0;
+exit:
+       return ret;
+}
+#endif
+
+/*
+ * Cycle through the processors sending APIC IPIs to boot each.
+ */
+
+static int boot_cpu_logical_apicid;
+/* Where the IO area was mapped on multiquad, always 0 otherwise */
+void *xquad_portio;
+#ifdef CONFIG_X86_NUMAQ
+EXPORT_SYMBOL(xquad_portio);
+#endif
+
+static void __init smp_boot_cpus(unsigned int max_cpus)
+{
+       int apicid, cpu, bit, kicked;
+       unsigned long bogosum = 0;
+
+       /*
+        * Setup boot CPU information
+        */
+       smp_store_cpu_info(0); /* Final full version of the data */
+       printk("CPU%d: ", 0);
+       print_cpu_info(&cpu_data[0]);
+
+       boot_cpu_physical_apicid = GET_APIC_ID(apic_read(APIC_ID));
+       boot_cpu_logical_apicid = logical_smp_processor_id();
+       x86_cpu_to_apicid[0] = boot_cpu_physical_apicid;
+
+       current_thread_info()->cpu = 0;
+
+       set_cpu_sibling_map(0);
+
+       /*
+        * If we couldn't find an SMP configuration at boot time,
+        * get out of here now!
+        */
+       if (!smp_found_config && !acpi_lapic) {
+               printk(KERN_NOTICE "SMP motherboard not detected.\n");
+               smpboot_clear_io_apic_irqs();
+               phys_cpu_present_map = physid_mask_of_physid(0);
+               if (APIC_init_uniprocessor())
+                       printk(KERN_NOTICE "Local APIC not detected."
+                                          " Using dummy APIC emulation.\n");
+               map_cpu_to_logical_apicid();
+               cpu_set(0, cpu_sibling_map[0]);
+               cpu_set(0, cpu_core_map[0]);
+               return;
+       }
+
+       /*
+        * Should not be necessary because the MP table should list the boot
+        * CPU too, but we do it for the sake of robustness anyway.
+        * Makes no sense to do this check in clustered apic mode, so skip it
+        */
+       if (!check_phys_apicid_present(boot_cpu_physical_apicid)) {
+               printk("weird, boot CPU (#%d) not listed by the BIOS.\n",
+                               boot_cpu_physical_apicid);
+               physid_set(hard_smp_processor_id(), phys_cpu_present_map);
+       }
+
+       /*
+        * If we couldn't find a local APIC, then get out of here now!
+        */
+       if (APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid]) && !cpu_has_apic) {
+               printk(KERN_ERR "BIOS bug, local APIC #%d not detected!...\n",
+                       boot_cpu_physical_apicid);
+               printk(KERN_ERR "... forcing use of dummy APIC emulation. (tell your hw vendor)\n");
+               smpboot_clear_io_apic_irqs();
+               phys_cpu_present_map = physid_mask_of_physid(0);
+               cpu_set(0, cpu_sibling_map[0]);
+               cpu_set(0, cpu_core_map[0]);
+               return;
+       }
+
+       verify_local_APIC();
+
+       /*
+        * If SMP should be disabled, then really disable it!
+        */
+       if (!max_cpus) {
+               smp_found_config = 0;
+               printk(KERN_INFO "SMP mode deactivated, forcing use of dummy APIC emulation.\n");
+               smpboot_clear_io_apic_irqs();
+               phys_cpu_present_map = physid_mask_of_physid(0);
+               cpu_set(0, cpu_sibling_map[0]);
+               cpu_set(0, cpu_core_map[0]);
+               return;
+       }
+
+       connect_bsp_APIC();
+       setup_local_APIC();
+       map_cpu_to_logical_apicid();
+
+
+       setup_portio_remap();
+
+       /*
+        * Scan the CPU present map and fire up the other CPUs via do_boot_cpu
+        *
+        * In clustered apic mode, phys_cpu_present_map is a constructed thus:
+        * bits 0-3 are quad0, 4-7 are quad1, etc. A perverse twist on the 
+        * clustered apic ID.
+        */
+       Dprintk("CPU present map: %lx\n", physids_coerce(phys_cpu_present_map));
+
+       kicked = 1;
+       for (bit = 0; kicked < NR_CPUS && bit < MAX_APICS; bit++) {
+               apicid = cpu_present_to_apicid(bit);
+               /*
+                * Don't even attempt to start the boot CPU!
+                */
+               if ((apicid == boot_cpu_apicid) || (apicid == BAD_APICID))
+                       continue;
+
+               if (!check_apicid_present(bit))
+                       continue;
+               if (max_cpus <= cpucount+1)
+                       continue;
+
+               if (((cpu = alloc_cpu_id()) <= 0) || do_boot_cpu(apicid, cpu))
+                       printk("CPU #%d not responding - cannot use it.\n",
+                                                               apicid);
+               else
+                       ++kicked;
+       }
+
+       /*
+        * Cleanup possible dangling ends...
+        */
+       smpboot_restore_warm_reset_vector();
+
+       /*
+        * Allow the user to impress friends.
+        */
+       Dprintk("Before bogomips.\n");
+       for (cpu = 0; cpu < NR_CPUS; cpu++)
+               if (cpu_isset(cpu, cpu_callout_map))
+                       bogosum += cpu_data[cpu].loops_per_jiffy;
+       printk(KERN_INFO
+               "Total of %d processors activated (%lu.%02lu BogoMIPS).\n",
+               cpucount+1,
+               bogosum/(500000/HZ),
+               (bogosum/(5000/HZ))%100);
+       
+       Dprintk("Before bogocount - setting activated=1.\n");
+
+       if (smp_b_stepping)
+               printk(KERN_WARNING "WARNING: SMP operation may be unreliable with B stepping processors.\n");
+
+       /*
+        * Don't taint if we are running SMP kernel on a single non-MP
+        * approved Athlon
+        */
+       if (tainted & TAINT_UNSAFE_SMP) {
+               if (cpucount)
+                       printk (KERN_INFO "WARNING: This combination of AMD processors is not suitable for SMP.\n");
+               else
+                       tainted &= ~TAINT_UNSAFE_SMP;
+       }
+
+       Dprintk("Boot done.\n");
+
+       /*
+        * construct cpu_sibling_map[], so that we can tell sibling CPUs
+        * efficiently.
+        */
+       for (cpu = 0; cpu < NR_CPUS; cpu++) {
+               cpus_clear(cpu_sibling_map[cpu]);
+               cpus_clear(cpu_core_map[cpu]);
+       }
+
+       cpu_set(0, cpu_sibling_map[0]);
+       cpu_set(0, cpu_core_map[0]);
+
+       smpboot_setup_io_apic();
+
+       setup_boot_clock();
+}
+
+/* These are wrappers to interface to the new boot process.  Someone
+   who understands all this stuff should rewrite it properly. --RR 15/Jul/02 */
+void __init native_smp_prepare_cpus(unsigned int max_cpus)
+{
+       smp_commenced_mask = cpumask_of_cpu(0);
+       cpu_callin_map = cpumask_of_cpu(0);
+       mb();
+       smp_boot_cpus(max_cpus);
+}
+
+void __init native_smp_prepare_boot_cpu(void)
+{
+       unsigned int cpu = smp_processor_id();
+
+       init_gdt(cpu);
+       switch_to_new_gdt();
+
+       cpu_set(cpu, cpu_online_map);
+       cpu_set(cpu, cpu_callout_map);
+       cpu_set(cpu, cpu_present_map);
+       cpu_set(cpu, cpu_possible_map);
+       __get_cpu_var(cpu_state) = CPU_ONLINE;
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+void remove_siblinginfo(int cpu)
+{
+       int sibling;
+       struct cpuinfo_x86 *c = cpu_data;
+
+       for_each_cpu_mask(sibling, cpu_core_map[cpu]) {
+               cpu_clear(cpu, cpu_core_map[sibling]);
+               /*
+                * last thread sibling in this cpu core going down
+                */
+               if (cpus_weight(cpu_sibling_map[cpu]) == 1)
+                       c[sibling].booted_cores--;
+       }
+                       
+       for_each_cpu_mask(sibling, cpu_sibling_map[cpu])
+               cpu_clear(cpu, cpu_sibling_map[sibling]);
+       cpus_clear(cpu_sibling_map[cpu]);
+       cpus_clear(cpu_core_map[cpu]);
+       c[cpu].phys_proc_id = 0;
+       c[cpu].cpu_core_id = 0;
+       cpu_clear(cpu, cpu_sibling_setup_map);
+}
+
+int __cpu_disable(void)
+{
+       cpumask_t map = cpu_online_map;
+       int cpu = smp_processor_id();
+
+       /*
+        * Perhaps use cpufreq to drop frequency, but that could go
+        * into generic code.
+        *
+        * We won't take down the boot processor on i386 due to some
+        * interrupts only being able to be serviced by the BSP.
+        * Especially so if we're not using an IOAPIC   -zwane
+        */
+       if (cpu == 0)
+               return -EBUSY;
+       if (nmi_watchdog == NMI_LOCAL_APIC)
+               stop_apic_nmi_watchdog(NULL);
+       clear_local_APIC();
+       /* Allow any queued timer interrupts to get serviced */
+       local_irq_enable();
+       mdelay(1);
+       local_irq_disable();
+
+       remove_siblinginfo(cpu);
+
+       cpu_clear(cpu, map);
+       fixup_irqs(map);
+       /* It's now safe to remove this processor from the online map */
+       cpu_clear(cpu, cpu_online_map);
+       return 0;
+}
+
+void __cpu_die(unsigned int cpu)
+{
+       /* We don't do anything here: idle task is faking death itself. */
+       unsigned int i;
+
+       for (i = 0; i < 10; i++) {
+               /* They ack this in play_dead by setting CPU_DEAD */
+               if (per_cpu(cpu_state, cpu) == CPU_DEAD) {
+                       printk ("CPU %d is now offline\n", cpu);
+                       if (1 == num_online_cpus())
+                               alternatives_smp_switch(0);
+                       return;
+               }
+               msleep(100);
+       }
+       printk(KERN_ERR "CPU %u didn't die...\n", cpu);
+}
+#else /* ... !CONFIG_HOTPLUG_CPU */
+int __cpu_disable(void)
+{
+       return -ENOSYS;
+}
+
+void __cpu_die(unsigned int cpu)
+{
+       /* We said "no" in __cpu_disable */
+       BUG();
+}
+#endif /* CONFIG_HOTPLUG_CPU */
+
+int __cpuinit native_cpu_up(unsigned int cpu)
+{
+       unsigned long flags;
+#ifdef CONFIG_HOTPLUG_CPU
+       int ret = 0;
+
+       /*
+        * We do warm boot only on cpus that had booted earlier
+        * Otherwise cold boot is all handled from smp_boot_cpus().
+        * cpu_callin_map is set during AP kickstart process. Its reset
+        * when a cpu is taken offline from cpu_exit_clear().
+        */
+       if (!cpu_isset(cpu, cpu_callin_map))
+               ret = __smp_prepare_cpu(cpu);
+
+       if (ret)
+               return -EIO;
+#endif
+
+       /* In case one didn't come up */
+       if (!cpu_isset(cpu, cpu_callin_map)) {
+               printk(KERN_DEBUG "skipping cpu%d, didn't come online\n", cpu);
+               return -EIO;
+       }
+
+       per_cpu(cpu_state, cpu) = CPU_UP_PREPARE;
+       /* Unleash the CPU! */
+       cpu_set(cpu, smp_commenced_mask);
+
+       /*
+        * Check TSC synchronization with the AP (keep irqs disabled
+        * while doing so):
+        */
+       local_irq_save(flags);
+       check_tsc_sync_source(cpu);
+       local_irq_restore(flags);
+
+       while (!cpu_isset(cpu, cpu_online_map)) {
+               cpu_relax();
+               touch_nmi_watchdog();
+       }
+
+       return 0;
+}
+
+void __init native_smp_cpus_done(unsigned int max_cpus)
+{
+#ifdef CONFIG_X86_IO_APIC
+       setup_ioapic_dest();
+#endif
+       zap_low_mappings();
+#ifndef CONFIG_HOTPLUG_CPU
+       /*
+        * Disable executability of the SMP trampoline:
+        */
+       set_kernel_exec((unsigned long)trampoline_base, trampoline_exec);
+#endif
+}
+
+void __init smp_intr_init(void)
+{
+       /*
+        * IRQ0 must be given a fixed assignment and initialized,
+        * because it's used before the IO-APIC is set up.
+        */
+       set_intr_gate(FIRST_DEVICE_VECTOR, interrupt[0]);
+
+       /*
+        * The reschedule interrupt is a CPU-to-CPU reschedule-helper
+        * IPI, driven by wakeup.
+        */
+       set_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt);
+
+       /* IPI for invalidation */
+       set_intr_gate(INVALIDATE_TLB_VECTOR, invalidate_interrupt);
+
+       /* IPI for generic function call */
+       set_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
+}
+
+/*
+ * If the BIOS enumerates physical processors before logical,
+ * maxcpus=N at enumeration-time can be used to disable HT.
+ */
+static int __init parse_maxcpus(char *arg)
+{
+       extern unsigned int maxcpus;
+
+       maxcpus = simple_strtoul(arg, NULL, 0);
+       return 0;
+}
+early_param("maxcpus", parse_maxcpus);
diff --git a/arch/x86/kernel/smpcommon_32.c b/arch/x86/kernel/smpcommon_32.c
new file mode 100644 (file)
index 0000000..bbfe85a
--- /dev/null
@@ -0,0 +1,81 @@
+/*
+ * SMP stuff which is common to all sub-architectures.
+ */
+#include <linux/module.h>
+#include <asm/smp.h>
+
+DEFINE_PER_CPU(unsigned long, this_cpu_off);
+EXPORT_PER_CPU_SYMBOL(this_cpu_off);
+
+/* Initialize the CPU's GDT.  This is either the boot CPU doing itself
+   (still using the master per-cpu area), or a CPU doing it for a
+   secondary which will soon come up. */
+__cpuinit void init_gdt(int cpu)
+{
+       struct desc_struct *gdt = get_cpu_gdt_table(cpu);
+
+       pack_descriptor((u32 *)&gdt[GDT_ENTRY_PERCPU].a,
+                       (u32 *)&gdt[GDT_ENTRY_PERCPU].b,
+                       __per_cpu_offset[cpu], 0xFFFFF,
+                       0x80 | DESCTYPE_S | 0x2, 0x8);
+
+       per_cpu(this_cpu_off, cpu) = __per_cpu_offset[cpu];
+       per_cpu(cpu_number, cpu) = cpu;
+}
+
+
+/**
+ * smp_call_function(): Run a function on all other CPUs.
+ * @func: The function to run. This must be fast and non-blocking.
+ * @info: An arbitrary pointer to pass to the function.
+ * @nonatomic: Unused.
+ * @wait: If true, wait (atomically) until function has completed on other CPUs.
+ *
+ * Returns 0 on success, else a negative status code.
+ *
+ * If @wait is true, then returns once @func has returned; otherwise
+ * it returns just before the target cpu calls @func.
+ *
+ * You must not call this function with disabled interrupts or from a
+ * hardware interrupt handler or from a bottom half handler.
+ */
+int smp_call_function(void (*func) (void *info), void *info, int nonatomic,
+                     int wait)
+{
+       return smp_call_function_mask(cpu_online_map, func, info, wait);
+}
+EXPORT_SYMBOL(smp_call_function);
+
+/**
+ * smp_call_function_single - Run a function on a specific CPU
+ * @cpu: The target CPU.  Cannot be the calling CPU.
+ * @func: The function to run. This must be fast and non-blocking.
+ * @info: An arbitrary pointer to pass to the function.
+ * @nonatomic: Unused.
+ * @wait: If true, wait until function has completed on other CPUs.
+ *
+ * Returns 0 on success, else a negative status code.
+ *
+ * If @wait is true, then returns once @func has returned; otherwise
+ * it returns just before the target cpu calls @func.
+ */
+int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
+                            int nonatomic, int wait)
+{
+       /* prevent preemption and reschedule on another processor */
+       int ret;
+       int me = get_cpu();
+       if (cpu == me) {
+               local_irq_disable();
+               func(info);
+               local_irq_enable();
+               put_cpu();
+               return 0;
+       }
+
+       ret = smp_call_function_mask(cpumask_of_cpu(cpu), func, info, wait);
+
+       put_cpu();
+       return ret;
+}
+EXPORT_SYMBOL(smp_call_function_single);
diff --git a/arch/x86/kernel/srat_32.c b/arch/x86/kernel/srat_32.c
new file mode 100644 (file)
index 0000000..2a8713e
--- /dev/null
@@ -0,0 +1,360 @@
+/*
+ * Some of the code in this file has been gleaned from the 64 bit 
+ * discontigmem support code base.
+ *
+ * Copyright (C) 2002, IBM Corp.
+ *
+ * All rights reserved.          
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT.  See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * Send feedback to Pat Gaughen <gone@us.ibm.com>
+ */
+#include <linux/mm.h>
+#include <linux/bootmem.h>
+#include <linux/mmzone.h>
+#include <linux/acpi.h>
+#include <linux/nodemask.h>
+#include <asm/srat.h>
+#include <asm/topology.h>
+#include <asm/smp.h>
+
+/*
+ * proximity macros and definitions
+ */
+#define NODE_ARRAY_INDEX(x)    ((x) / 8)       /* 8 bits/char */
+#define NODE_ARRAY_OFFSET(x)   ((x) % 8)       /* 8 bits/char */
+#define BMAP_SET(bmap, bit)    ((bmap)[NODE_ARRAY_INDEX(bit)] |= 1 << NODE_ARRAY_OFFSET(bit))
+#define BMAP_TEST(bmap, bit)   ((bmap)[NODE_ARRAY_INDEX(bit)] & (1 << NODE_ARRAY_OFFSET(bit)))
+/* bitmap length; _PXM is at most 255 */
+#define PXM_BITMAP_LEN (MAX_PXM_DOMAINS / 8) 
+static u8 pxm_bitmap[PXM_BITMAP_LEN];  /* bitmap of proximity domains */
+
+#define MAX_CHUNKS_PER_NODE    3
+#define MAXCHUNKS              (MAX_CHUNKS_PER_NODE * MAX_NUMNODES)
+struct node_memory_chunk_s {
+       unsigned long   start_pfn;
+       unsigned long   end_pfn;
+       u8      pxm;            // proximity domain of node
+       u8      nid;            // which cnode contains this chunk?
+       u8      bank;           // which mem bank on this node
+};
+static struct node_memory_chunk_s node_memory_chunk[MAXCHUNKS];
+
+static int num_memory_chunks;          /* total number of memory chunks */
+static u8 __initdata apicid_to_pxm[MAX_APICID];
+
+extern void * boot_ioremap(unsigned long, unsigned long);
+
+/* Identify CPU proximity domains */
+static void __init parse_cpu_affinity_structure(char *p)
+{
+       struct acpi_srat_cpu_affinity *cpu_affinity =
+                               (struct acpi_srat_cpu_affinity *) p;
+
+       if ((cpu_affinity->flags & ACPI_SRAT_CPU_ENABLED) == 0)
+               return;         /* empty entry */
+
+       /* mark this node as "seen" in node bitmap */
+       BMAP_SET(pxm_bitmap, cpu_affinity->proximity_domain_lo);
+
+       apicid_to_pxm[cpu_affinity->apic_id] = cpu_affinity->proximity_domain_lo;
+
+       printk("CPU 0x%02X in proximity domain 0x%02X\n",
+               cpu_affinity->apic_id, cpu_affinity->proximity_domain_lo);
+}
+
+/*
+ * Identify memory proximity domains and hot-remove capabilities.
+ * Fill node memory chunk list structure.
+ */
+static void __init parse_memory_affinity_structure (char *sratp)
+{
+       unsigned long long paddr, size;
+       unsigned long start_pfn, end_pfn;
+       u8 pxm;
+       struct node_memory_chunk_s *p, *q, *pend;
+       struct acpi_srat_mem_affinity *memory_affinity =
+                       (struct acpi_srat_mem_affinity *) sratp;
+
+       if ((memory_affinity->flags & ACPI_SRAT_MEM_ENABLED) == 0)
+               return;         /* empty entry */
+
+       pxm = memory_affinity->proximity_domain & 0xff;
+
+       /* mark this node as "seen" in node bitmap */
+       BMAP_SET(pxm_bitmap, pxm);
+
+       /* calculate info for memory chunk structure */
+       paddr = memory_affinity->base_address;
+       size = memory_affinity->length;
+
+       start_pfn = paddr >> PAGE_SHIFT;
+       end_pfn = (paddr + size) >> PAGE_SHIFT;
+
+
+       if (num_memory_chunks >= MAXCHUNKS) {
+               printk("Too many mem chunks in SRAT. Ignoring %lld MBytes at %llx\n",
+                       size/(1024*1024), paddr);
+               return;
+       }
+
+       /* Insertion sort based on base address */
+       pend = &node_memory_chunk[num_memory_chunks];
+       for (p = &node_memory_chunk[0]; p < pend; p++) {
+               if (start_pfn < p->start_pfn)
+                       break;
+       }
+       if (p < pend) {
+               for (q = pend; q >= p; q--)
+                       *(q + 1) = *q;
+       }
+       p->start_pfn = start_pfn;
+       p->end_pfn = end_pfn;
+       p->pxm = pxm;
+
+       num_memory_chunks++;
+
+       printk("Memory range 0x%lX to 0x%lX (type 0x%X) in proximity domain 0x%02X %s\n",
+               start_pfn, end_pfn,
+               memory_affinity->memory_type,
+               pxm,
+               ((memory_affinity->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) ?
+                "enabled and removable" : "enabled" ) );
+}
+
+/*
+ * The SRAT table always lists ascending addresses, so can always
+ * assume that the first "start" address that you see is the real
+ * start of the node, and that the current "end" address is after
+ * the previous one.
+ */
+static __init void node_read_chunk(int nid, struct node_memory_chunk_s *memory_chunk)
+{
+       /*
+        * Only add present memory as told by the e820.
+        * There is no guarantee from the SRAT that the memory it
+        * enumerates is present at boot time because it represents
+        * *possible* memory hotplug areas the same as normal RAM.
+        */
+       if (memory_chunk->start_pfn >= max_pfn) {
+               printk (KERN_INFO "Ignoring SRAT pfns: 0x%08lx -> %08lx\n",
+                       memory_chunk->start_pfn, memory_chunk->end_pfn);
+               return;
+       }
+       if (memory_chunk->nid != nid)
+               return;
+
+       if (!node_has_online_mem(nid))
+               node_start_pfn[nid] = memory_chunk->start_pfn;
+
+       if (node_start_pfn[nid] > memory_chunk->start_pfn)
+               node_start_pfn[nid] = memory_chunk->start_pfn;
+
+       if (node_end_pfn[nid] < memory_chunk->end_pfn)
+               node_end_pfn[nid] = memory_chunk->end_pfn;
+}
+
+/* Parse the ACPI Static Resource Affinity Table */
+static int __init acpi20_parse_srat(struct acpi_table_srat *sratp)
+{
+       u8 *start, *end, *p;
+       int i, j, nid;
+
+       start = (u8 *)(&(sratp->reserved) + 1); /* skip header */
+       p = start;
+       end = (u8 *)sratp + sratp->header.length;
+
+       memset(pxm_bitmap, 0, sizeof(pxm_bitmap));      /* init proximity domain bitmap */
+       memset(node_memory_chunk, 0, sizeof(node_memory_chunk));
+
+       num_memory_chunks = 0;
+       while (p < end) {
+               switch (*p) {
+               case ACPI_SRAT_TYPE_CPU_AFFINITY:
+                       parse_cpu_affinity_structure(p);
+                       break;
+               case ACPI_SRAT_TYPE_MEMORY_AFFINITY:
+                       parse_memory_affinity_structure(p);
+                       break;
+               default:
+                       printk("ACPI 2.0 SRAT: unknown entry skipped: type=0x%02X, len=%d\n", p[0], p[1]);
+                       break;
+               }
+               p += p[1];
+               if (p[1] == 0) {
+                       printk("acpi20_parse_srat: Entry length value is zero;"
+                               " can't parse any further!\n");
+                       break;
+               }
+       }
+
+       if (num_memory_chunks == 0) {
+               printk("could not finy any ACPI SRAT memory areas.\n");
+               goto out_fail;
+       }
+
+       /* Calculate total number of nodes in system from PXM bitmap and create
+        * a set of sequential node IDs starting at zero.  (ACPI doesn't seem
+        * to specify the range of _PXM values.)
+        */
+       /*
+        * MCD - we no longer HAVE to number nodes sequentially.  PXM domain
+        * numbers could go as high as 256, and MAX_NUMNODES for i386 is typically
+        * 32, so we will continue numbering them in this manner until MAX_NUMNODES
+        * approaches MAX_PXM_DOMAINS for i386.
+        */
+       nodes_clear(node_online_map);
+       for (i = 0; i < MAX_PXM_DOMAINS; i++) {
+               if (BMAP_TEST(pxm_bitmap, i)) {
+                       int nid = acpi_map_pxm_to_node(i);
+                       node_set_online(nid);
+               }
+       }
+       BUG_ON(num_online_nodes() == 0);
+
+       /* set cnode id in memory chunk structure */
+       for (i = 0; i < num_memory_chunks; i++)
+               node_memory_chunk[i].nid = pxm_to_node(node_memory_chunk[i].pxm);
+
+       printk("pxm bitmap: ");
+       for (i = 0; i < sizeof(pxm_bitmap); i++) {
+               printk("%02X ", pxm_bitmap[i]);
+       }
+       printk("\n");
+       printk("Number of logical nodes in system = %d\n", num_online_nodes());
+       printk("Number of memory chunks in system = %d\n", num_memory_chunks);
+
+       for (i = 0; i < MAX_APICID; i++)
+               apicid_2_node[i] = pxm_to_node(apicid_to_pxm[i]);
+
+       for (j = 0; j < num_memory_chunks; j++){
+               struct node_memory_chunk_s * chunk = &node_memory_chunk[j];
+               printk("chunk %d nid %d start_pfn %08lx end_pfn %08lx\n",
+                      j, chunk->nid, chunk->start_pfn, chunk->end_pfn);
+               node_read_chunk(chunk->nid, chunk);
+               add_active_range(chunk->nid, chunk->start_pfn, chunk->end_pfn);
+       }
+       for_each_online_node(nid) {
+               unsigned long start = node_start_pfn[nid];
+               unsigned long end = node_end_pfn[nid];
+
+               memory_present(nid, start, end);
+               node_remap_size[nid] = node_memmap_size_bytes(nid, start, end);
+       }
+       return 1;
+out_fail:
+       return 0;
+}
+
+struct acpi_static_rsdt {
+       struct acpi_table_rsdt table;
+       u32 padding[7]; /* Allow for 7 more table entries */
+};
+
+int __init get_memcfg_from_srat(void)
+{
+       struct acpi_table_header *header = NULL;
+       struct acpi_table_rsdp *rsdp = NULL;
+       struct acpi_table_rsdt *rsdt = NULL;
+       acpi_native_uint rsdp_address = 0;
+       struct acpi_static_rsdt saved_rsdt;
+       int tables = 0;
+       int i = 0;
+
+       rsdp_address = acpi_find_rsdp();
+       if (!rsdp_address) {
+               printk("%s: System description tables not found\n",
+                      __FUNCTION__);
+               goto out_err;
+       }
+
+       printk("%s: assigning address to rsdp\n", __FUNCTION__);
+       rsdp = (struct acpi_table_rsdp *)(u32)rsdp_address;
+       if (!rsdp) {
+               printk("%s: Didn't find ACPI root!\n", __FUNCTION__);
+               goto out_err;
+       }
+
+       printk(KERN_INFO "%.8s v%d [%.6s]\n", rsdp->signature, rsdp->revision,
+               rsdp->oem_id);
+
+       if (strncmp(rsdp->signature, ACPI_SIG_RSDP,strlen(ACPI_SIG_RSDP))) {
+               printk(KERN_WARNING "%s: RSDP table signature incorrect\n", __FUNCTION__);
+               goto out_err;
+       }
+
+       rsdt = (struct acpi_table_rsdt *)
+           boot_ioremap(rsdp->rsdt_physical_address, sizeof(struct acpi_table_rsdt));
+
+       if (!rsdt) {
+               printk(KERN_WARNING
+                      "%s: ACPI: Invalid root system description tables (RSDT)\n",
+                      __FUNCTION__);
+               goto out_err;
+       }
+
+       header = &rsdt->header;
+
+       if (strncmp(header->signature, ACPI_SIG_RSDT, strlen(ACPI_SIG_RSDT))) {
+               printk(KERN_WARNING "ACPI: RSDT signature incorrect\n");
+               goto out_err;
+       }
+
+       /* 
+        * The number of tables is computed by taking the 
+        * size of all entries (header size minus total 
+        * size of RSDT) divided by the size of each entry
+        * (4-byte table pointers).
+        */
+       tables = (header->length - sizeof(struct acpi_table_header)) / 4;
+
+       if (!tables)
+               goto out_err;
+
+       memcpy(&saved_rsdt, rsdt, sizeof(saved_rsdt));
+
+       if (saved_rsdt.table.header.length > sizeof(saved_rsdt)) {
+               printk(KERN_WARNING "ACPI: Too big length in RSDT: %d\n",
+                      saved_rsdt.table.header.length);
+               goto out_err;
+       }
+
+       printk("Begin SRAT table scan....\n");
+
+       for (i = 0; i < tables; i++) {
+               /* Map in header, then map in full table length. */
+               header = (struct acpi_table_header *)
+                       boot_ioremap(saved_rsdt.table.table_offset_entry[i], sizeof(struct acpi_table_header));
+               if (!header)
+                       break;
+               header = (struct acpi_table_header *)
+                       boot_ioremap(saved_rsdt.table.table_offset_entry[i], header->length);
+               if (!header)
+                       break;
+
+               if (strncmp((char *) &header->signature, ACPI_SIG_SRAT, 4))
+                       continue;
+
+               /* we've found the srat table. don't need to look at any more tables */
+               return acpi20_parse_srat((struct acpi_table_srat *)header);
+       }
+out_err:
+       remove_all_active_ranges();
+       printk("failed to get NUMA memory information from SRAT table\n");
+       return 0;
+}
diff --git a/arch/x86/kernel/summit_32.c b/arch/x86/kernel/summit_32.c
new file mode 100644 (file)
index 0000000..d0e01a3
--- /dev/null
@@ -0,0 +1,180 @@
+/*
+ * arch/i386/kernel/summit.c - IBM Summit-Specific Code
+ *
+ * Written By: Matthew Dobson, IBM Corporation
+ *
+ * Copyright (c) 2003 IBM Corp.
+ *
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT.  See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * Send feedback to <colpatch@us.ibm.com>
+ *
+ */
+
+#include <linux/mm.h>
+#include <linux/init.h>
+#include <asm/io.h>
+#include <asm/mach-summit/mach_mpparse.h>
+
+static struct rio_table_hdr *rio_table_hdr __initdata;
+static struct scal_detail   *scal_devs[MAX_NUMNODES] __initdata;
+static struct rio_detail    *rio_devs[MAX_NUMNODES*4] __initdata;
+
+static int __init setup_pci_node_map_for_wpeg(int wpeg_num, int last_bus)
+{
+       int twister = 0, node = 0;
+       int i, bus, num_buses;
+
+       for(i = 0; i < rio_table_hdr->num_rio_dev; i++){
+               if (rio_devs[i]->node_id == rio_devs[wpeg_num]->owner_id){
+                       twister = rio_devs[i]->owner_id;
+                       break;
+               }
+       }
+       if (i == rio_table_hdr->num_rio_dev){
+               printk(KERN_ERR "%s: Couldn't find owner Cyclone for Winnipeg!\n", __FUNCTION__);
+               return last_bus;
+       }
+
+       for(i = 0; i < rio_table_hdr->num_scal_dev; i++){
+               if (scal_devs[i]->node_id == twister){
+                       node = scal_devs[i]->node_id;
+                       break;
+               }
+       }
+       if (i == rio_table_hdr->num_scal_dev){
+               printk(KERN_ERR "%s: Couldn't find owner Twister for Cyclone!\n", __FUNCTION__);
+               return last_bus;
+       }
+
+       switch (rio_devs[wpeg_num]->type){
+       case CompatWPEG:
+               /* The Compatability Winnipeg controls the 2 legacy buses,
+                * the 66MHz PCI bus [2 slots] and the 2 "extra" buses in case
+                * a PCI-PCI bridge card is used in either slot: total 5 buses.
+                */
+               num_buses = 5;
+               break;
+       case AltWPEG:
+               /* The Alternate Winnipeg controls the 2 133MHz buses [1 slot
+                * each], their 2 "extra" buses, the 100MHz bus [2 slots] and
+                * the "extra" buses for each of those slots: total 7 buses.
+                */
+               num_buses = 7;
+               break;
+       case LookOutAWPEG:
+       case LookOutBWPEG:
+               /* A Lookout Winnipeg controls 3 100MHz buses [2 slots each]
+                * & the "extra" buses for each of those slots: total 9 buses.
+                */
+               num_buses = 9;
+               break;
+       default:
+               printk(KERN_INFO "%s: Unsupported Winnipeg type!\n", __FUNCTION__);
+               return last_bus;
+       }
+
+       for(bus = last_bus; bus < last_bus + num_buses; bus++)
+               mp_bus_id_to_node[bus] = node;
+       return bus;
+}
+
+static int __init build_detail_arrays(void)
+{
+       unsigned long ptr;
+       int i, scal_detail_size, rio_detail_size;
+
+       if (rio_table_hdr->num_scal_dev > MAX_NUMNODES){
+               printk(KERN_WARNING "%s: MAX_NUMNODES too low!  Defined as %d, but system has %d nodes.\n", __FUNCTION__, MAX_NUMNODES, rio_table_hdr->num_scal_dev);
+               return 0;
+       }
+
+       switch (rio_table_hdr->version){
+       default:
+               printk(KERN_WARNING "%s: Invalid Rio Grande Table Version: %d\n", __FUNCTION__, rio_table_hdr->version);
+               return 0;
+       case 2:
+               scal_detail_size = 11;
+               rio_detail_size = 13;
+               break;
+       case 3:
+               scal_detail_size = 12;
+               rio_detail_size = 15;
+               break;
+       }
+
+       ptr = (unsigned long)rio_table_hdr + 3;
+       for(i = 0; i < rio_table_hdr->num_scal_dev; i++, ptr += scal_detail_size)
+               scal_devs[i] = (struct scal_detail *)ptr;
+
+       for(i = 0; i < rio_table_hdr->num_rio_dev; i++, ptr += rio_detail_size)
+               rio_devs[i] = (struct rio_detail *)ptr;
+
+       return 1;
+}
+
+void __init setup_summit(void)
+{
+       unsigned long           ptr;
+       unsigned short          offset;
+       int                     i, next_wpeg, next_bus = 0;
+
+       /* The pointer to the EBDA is stored in the word @ phys 0x40E(40:0E) */
+       ptr = *(unsigned short *)phys_to_virt(0x40Eul);
+       ptr = (unsigned long)phys_to_virt(ptr << 4);
+
+       rio_table_hdr = NULL;
+       offset = 0x180;
+       while (offset){
+               /* The block id is stored in the 2nd word */
+               if (*((unsigned short *)(ptr + offset + 2)) == 0x4752){
+                       /* set the pointer past the offset & block id */
+                       rio_table_hdr = (struct rio_table_hdr *)(ptr + offset + 4);
+                       break;
+               }
+               /* The next offset is stored in the 1st word.  0 means no more */
+               offset = *((unsigned short *)(ptr + offset));
+       }
+       if (!rio_table_hdr){
+               printk(KERN_ERR "%s: Unable to locate Rio Grande Table in EBDA - bailing!\n", __FUNCTION__);
+               return;
+       }
+
+       if (!build_detail_arrays())
+               return;
+
+       /* The first Winnipeg we're looking for has an index of 0 */
+       next_wpeg = 0;
+       do {
+               for(i = 0; i < rio_table_hdr->num_rio_dev; i++){
+                       if (is_WPEG(rio_devs[i]) && rio_devs[i]->WP_index == next_wpeg){
+                               /* It's the Winnipeg we're looking for! */
+                               next_bus = setup_pci_node_map_for_wpeg(i, next_bus);
+                               next_wpeg++;
+                               break;
+                       }
+               }
+               /*
+                * If we go through all Rio devices and don't find one with
+                * the next index, it means we've found all the Winnipegs,
+                * and thus all the PCI buses.
+                */
+               if (i == rio_table_hdr->num_rio_dev)
+                       next_wpeg = 0;
+       } while (next_wpeg != 0);
+}
diff --git a/arch/x86/kernel/sys_i386_32.c b/arch/x86/kernel/sys_i386_32.c
new file mode 100644 (file)
index 0000000..4214730
--- /dev/null
@@ -0,0 +1,265 @@
+/*
+ * linux/arch/i386/kernel/sys_i386.c
+ *
+ * This file contains various random system calls that
+ * have a non-standard calling sequence on the Linux/i386
+ * platform.
+ */
+
+#include <linux/errno.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/fs.h>
+#include <linux/smp.h>
+#include <linux/sem.h>
+#include <linux/msg.h>
+#include <linux/shm.h>
+#include <linux/stat.h>
+#include <linux/syscalls.h>
+#include <linux/mman.h>
+#include <linux/file.h>
+#include <linux/utsname.h>
+
+#include <asm/uaccess.h>
+#include <asm/unistd.h>
+#include <asm/ipc.h>
+
+/*
+ * sys_pipe() is the normal C calling standard for creating
+ * a pipe. It's not the way Unix traditionally does this, though.
+ */
+asmlinkage int sys_pipe(unsigned long __user * fildes)
+{
+       int fd[2];
+       int error;
+
+       error = do_pipe(fd);
+       if (!error) {
+               if (copy_to_user(fildes, fd, 2*sizeof(int)))
+                       error = -EFAULT;
+       }
+       return error;
+}
+
+asmlinkage long sys_mmap2(unsigned long addr, unsigned long len,
+                         unsigned long prot, unsigned long flags,
+                         unsigned long fd, unsigned long pgoff)
+{
+       int error = -EBADF;
+       struct file *file = NULL;
+       struct mm_struct *mm = current->mm;
+
+       flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
+       if (!(flags & MAP_ANONYMOUS)) {
+               file = fget(fd);
+               if (!file)
+                       goto out;
+       }
+
+       down_write(&mm->mmap_sem);
+       error = do_mmap_pgoff(file, addr, len, prot, flags, pgoff);
+       up_write(&mm->mmap_sem);
+
+       if (file)
+               fput(file);
+out:
+       return error;
+}
+
+/*
+ * Perform the select(nd, in, out, ex, tv) and mmap() system
+ * calls. Linux/i386 didn't use to be able to handle more than
+ * 4 system call parameters, so these system calls used a memory
+ * block for parameter passing..
+ */
+
+struct mmap_arg_struct {
+       unsigned long addr;
+       unsigned long len;
+       unsigned long prot;
+       unsigned long flags;
+       unsigned long fd;
+       unsigned long offset;
+};
+
+asmlinkage int old_mmap(struct mmap_arg_struct __user *arg)
+{
+       struct mmap_arg_struct a;
+       int err = -EFAULT;
+
+       if (copy_from_user(&a, arg, sizeof(a)))
+               goto out;
+
+       err = -EINVAL;
+       if (a.offset & ~PAGE_MASK)
+               goto out;
+
+       err = sys_mmap2(a.addr, a.len, a.prot, a.flags,
+                       a.fd, a.offset >> PAGE_SHIFT);
+out:
+       return err;
+}
+
+
+struct sel_arg_struct {
+       unsigned long n;
+       fd_set __user *inp, *outp, *exp;
+       struct timeval __user *tvp;
+};
+
+asmlinkage int old_select(struct sel_arg_struct __user *arg)
+{
+       struct sel_arg_struct a;
+
+       if (copy_from_user(&a, arg, sizeof(a)))
+               return -EFAULT;
+       /* sys_select() does the appropriate kernel locking */
+       return sys_select(a.n, a.inp, a.outp, a.exp, a.tvp);
+}
+
+/*
+ * sys_ipc() is the de-multiplexer for the SysV IPC calls..
+ *
+ * This is really horribly ugly.
+ */
+asmlinkage int sys_ipc (uint call, int first, int second,
+                       int third, void __user *ptr, long fifth)
+{
+       int version, ret;
+
+       version = call >> 16; /* hack for backward compatibility */
+       call &= 0xffff;
+
+       switch (call) {
+       case SEMOP:
+               return sys_semtimedop (first, (struct sembuf __user *)ptr, second, NULL);
+       case SEMTIMEDOP:
+               return sys_semtimedop(first, (struct sembuf __user *)ptr, second,
+                                       (const struct timespec __user *)fifth);
+
+       case SEMGET:
+               return sys_semget (first, second, third);
+       case SEMCTL: {
+               union semun fourth;
+               if (!ptr)
+                       return -EINVAL;
+               if (get_user(fourth.__pad, (void __user * __user *) ptr))
+                       return -EFAULT;
+               return sys_semctl (first, second, third, fourth);
+       }
+
+       case MSGSND:
+               return sys_msgsnd (first, (struct msgbuf __user *) ptr, 
+                                  second, third);
+       case MSGRCV:
+               switch (version) {
+               case 0: {
+                       struct ipc_kludge tmp;
+                       if (!ptr)
+                               return -EINVAL;
+                       
+                       if (copy_from_user(&tmp,
+                                          (struct ipc_kludge __user *) ptr, 
+                                          sizeof (tmp)))
+                               return -EFAULT;
+                       return sys_msgrcv (first, tmp.msgp, second,
+                                          tmp.msgtyp, third);
+               }
+               default:
+                       return sys_msgrcv (first,
+                                          (struct msgbuf __user *) ptr,
+                                          second, fifth, third);
+               }
+       case MSGGET:
+               return sys_msgget ((key_t) first, second);
+       case MSGCTL:
+               return sys_msgctl (first, second, (struct msqid_ds __user *) ptr);
+
+       case SHMAT:
+               switch (version) {
+               default: {
+                       ulong raddr;
+                       ret = do_shmat (first, (char __user *) ptr, second, &raddr);
+                       if (ret)
+                               return ret;
+                       return put_user (raddr, (ulong __user *) third);
+               }
+               case 1: /* iBCS2 emulator entry point */
+                       if (!segment_eq(get_fs(), get_ds()))
+                               return -EINVAL;
+                       /* The "(ulong *) third" is valid _only_ because of the kernel segment thing */
+                       return do_shmat (first, (char __user *) ptr, second, (ulong *) third);
+               }
+       case SHMDT: 
+               return sys_shmdt ((char __user *)ptr);
+       case SHMGET:
+               return sys_shmget (first, second, third);
+       case SHMCTL:
+               return sys_shmctl (first, second,
+                                  (struct shmid_ds __user *) ptr);
+       default:
+               return -ENOSYS;
+       }
+}
+
+/*
+ * Old cruft
+ */
+asmlinkage int sys_uname(struct old_utsname __user * name)
+{
+       int err;
+       if (!name)
+               return -EFAULT;
+       down_read(&uts_sem);
+       err = copy_to_user(name, utsname(), sizeof (*name));
+       up_read(&uts_sem);
+       return err?-EFAULT:0;
+}
+
+asmlinkage int sys_olduname(struct oldold_utsname __user * name)
+{
+       int error;
+
+       if (!name)
+               return -EFAULT;
+       if (!access_ok(VERIFY_WRITE,name,sizeof(struct oldold_utsname)))
+               return -EFAULT;
+  
+       down_read(&uts_sem);
+       
+       error = __copy_to_user(&name->sysname, &utsname()->sysname,
+                              __OLD_UTS_LEN);
+       error |= __put_user(0, name->sysname + __OLD_UTS_LEN);
+       error |= __copy_to_user(&name->nodename, &utsname()->nodename,
+                               __OLD_UTS_LEN);
+       error |= __put_user(0, name->nodename + __OLD_UTS_LEN);
+       error |= __copy_to_user(&name->release, &utsname()->release,
+                               __OLD_UTS_LEN);
+       error |= __put_user(0, name->release + __OLD_UTS_LEN);
+       error |= __copy_to_user(&name->version, &utsname()->version,
+                               __OLD_UTS_LEN);
+       error |= __put_user(0, name->version + __OLD_UTS_LEN);
+       error |= __copy_to_user(&name->machine, &utsname()->machine,
+                               __OLD_UTS_LEN);
+       error |= __put_user(0, name->machine + __OLD_UTS_LEN);
+       
+       up_read(&uts_sem);
+       
+       error = error ? -EFAULT : 0;
+
+       return error;
+}
+
+
+/*
+ * Do a system call from kernel instead of calling sys_execve so we
+ * end up with proper pt_regs.
+ */
+int kernel_execve(const char *filename, char *const argv[], char *const envp[])
+{
+       long __res;
+       asm volatile ("push %%ebx ; movl %2,%%ebx ; int $0x80 ; pop %%ebx"
+       : "=a" (__res)
+       : "0" (__NR_execve),"ri" (filename),"c" (argv), "d" (envp) : "memory");
+       return __res;
+}
diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S
new file mode 100644 (file)
index 0000000..8344c70
--- /dev/null
@@ -0,0 +1,326 @@
+ENTRY(sys_call_table)
+       .long sys_restart_syscall       /* 0 - old "setup()" system call, used for restarting */
+       .long sys_exit
+       .long sys_fork
+       .long sys_read
+       .long sys_write
+       .long sys_open          /* 5 */
+       .long sys_close
+       .long sys_waitpid
+       .long sys_creat
+       .long sys_link
+       .long sys_unlink        /* 10 */
+       .long sys_execve
+       .long sys_chdir
+       .long sys_time
+       .long sys_mknod
+       .long sys_chmod         /* 15 */
+       .long sys_lchown16
+       .long sys_ni_syscall    /* old break syscall holder */
+       .long sys_stat
+       .long sys_lseek
+       .long sys_getpid        /* 20 */
+       .long sys_mount
+       .long sys_oldumount
+       .long sys_setuid16
+       .long sys_getuid16
+       .long sys_stime         /* 25 */
+       .long sys_ptrace
+       .long sys_alarm
+       .long sys_fstat
+       .long sys_pause
+       .long sys_utime         /* 30 */
+       .long sys_ni_syscall    /* old stty syscall holder */
+       .long sys_ni_syscall    /* old gtty syscall holder */
+       .long sys_access
+       .long sys_nice
+       .long sys_ni_syscall    /* 35 - old ftime syscall holder */
+       .long sys_sync
+       .long sys_kill
+       .long sys_rename
+       .long sys_mkdir
+       .long sys_rmdir         /* 40 */
+       .long sys_dup
+       .long sys_pipe
+       .long sys_times
+       .long sys_ni_syscall    /* old prof syscall holder */
+       .long sys_brk           /* 45 */
+       .long sys_setgid16
+       .long sys_getgid16
+       .long sys_signal
+       .long sys_geteuid16
+       .long sys_getegid16     /* 50 */
+       .long sys_acct
+       .long sys_umount        /* recycled never used phys() */
+       .long sys_ni_syscall    /* old lock syscall holder */
+       .long sys_ioctl
+       .long sys_fcntl         /* 55 */
+       .long sys_ni_syscall    /* old mpx syscall holder */
+       .long sys_setpgid
+       .long sys_ni_syscall    /* old ulimit syscall holder */
+       .long sys_olduname
+       .long sys_umask         /* 60 */
+       .long sys_chroot
+       .long sys_ustat
+       .long sys_dup2
+       .long sys_getppid
+       .long sys_getpgrp       /* 65 */
+       .long sys_setsid
+       .long sys_sigaction
+       .long sys_sgetmask
+       .long sys_ssetmask
+       .long sys_setreuid16    /* 70 */
+       .long sys_setregid16
+       .long sys_sigsuspend
+       .long sys_sigpending
+       .long sys_sethostname
+       .long sys_setrlimit     /* 75 */
+       .long sys_old_getrlimit
+       .long sys_getrusage
+       .long sys_gettimeofday
+       .long sys_settimeofday
+       .long sys_getgroups16   /* 80 */
+       .long sys_setgroups16
+       .long old_select
+       .long sys_symlink
+       .long sys_lstat
+       .long sys_readlink      /* 85 */
+       .long sys_uselib
+       .long sys_swapon
+       .long sys_reboot
+       .long old_readdir
+       .long old_mmap          /* 90 */
+       .long sys_munmap
+       .long sys_truncate
+       .long sys_ftruncate
+       .long sys_fchmod
+       .long sys_fchown16      /* 95 */
+       .long sys_getpriority
+       .long sys_setpriority
+       .long sys_ni_syscall    /* old profil syscall holder */
+       .long sys_statfs
+       .long sys_fstatfs       /* 100 */
+       .long sys_ioperm
+       .long sys_socketcall
+       .long sys_syslog
+       .long sys_setitimer
+       .long sys_getitimer     /* 105 */
+       .long sys_newstat
+       .long sys_newlstat
+       .long sys_newfstat
+       .long sys_uname
+       .long sys_iopl          /* 110 */
+       .long sys_vhangup
+       .long sys_ni_syscall    /* old "idle" system call */
+       .long sys_vm86old
+       .long sys_wait4
+       .long sys_swapoff       /* 115 */
+       .long sys_sysinfo
+       .long sys_ipc
+       .long sys_fsync
+       .long sys_sigreturn
+       .long sys_clone         /* 120 */
+       .long sys_setdomainname
+       .long sys_newuname
+       .long sys_modify_ldt
+       .long sys_adjtimex
+       .long sys_mprotect      /* 125 */
+       .long sys_sigprocmask
+       .long sys_ni_syscall    /* old "create_module" */
+       .long sys_init_module
+       .long sys_delete_module
+       .long sys_ni_syscall    /* 130: old "get_kernel_syms" */
+       .long sys_quotactl
+       .long sys_getpgid
+       .long sys_fchdir
+       .long sys_bdflush
+       .long sys_sysfs         /* 135 */
+       .long sys_personality
+       .long sys_ni_syscall    /* reserved for afs_syscall */
+       .long sys_setfsuid16
+       .long sys_setfsgid16
+       .long sys_llseek        /* 140 */
+       .long sys_getdents
+       .long sys_select
+       .long sys_flock
+       .long sys_msync
+       .long sys_readv         /* 145 */
+       .long sys_writev
+       .long sys_getsid
+       .long sys_fdatasync
+       .long sys_sysctl
+       .long sys_mlock         /* 150 */
+       .long sys_munlock
+       .long sys_mlockall
+       .long sys_munlockall
+       .long sys_sched_setparam
+       .long sys_sched_getparam   /* 155 */
+       .long sys_sched_setscheduler
+       .long sys_sched_getscheduler
+       .long sys_sched_yield
+       .long sys_sched_get_priority_max
+       .long sys_sched_get_priority_min  /* 160 */
+       .long sys_sched_rr_get_interval
+       .long sys_nanosleep
+       .long sys_mremap
+       .long sys_setresuid16
+       .long sys_getresuid16   /* 165 */
+       .long sys_vm86
+       .long sys_ni_syscall    /* Old sys_query_module */
+       .long sys_poll
+       .long sys_nfsservctl
+       .long sys_setresgid16   /* 170 */
+       .long sys_getresgid16
+       .long sys_prctl
+       .long sys_rt_sigreturn
+       .long sys_rt_sigaction
+       .long sys_rt_sigprocmask        /* 175 */
+       .long sys_rt_sigpending
+       .long sys_rt_sigtimedwait
+       .long sys_rt_sigqueueinfo
+       .long sys_rt_sigsuspend
+       .long sys_pread64       /* 180 */
+       .long sys_pwrite64
+       .long sys_chown16
+       .long sys_getcwd
+       .long sys_capget
+       .long sys_capset        /* 185 */
+       .long sys_sigaltstack
+       .long sys_sendfile
+       .long sys_ni_syscall    /* reserved for streams1 */
+       .long sys_ni_syscall    /* reserved for streams2 */
+       .long sys_vfork         /* 190 */
+       .long sys_getrlimit
+       .long sys_mmap2
+       .long sys_truncate64
+       .long sys_ftruncate64
+       .long sys_stat64        /* 195 */
+       .long sys_lstat64
+       .long sys_fstat64
+       .long sys_lchown
+       .long sys_getuid
+       .long sys_getgid        /* 200 */
+       .long sys_geteuid
+       .long sys_getegid
+       .long sys_setreuid
+       .long sys_setregid
+       .long sys_getgroups     /* 205 */
+       .long sys_setgroups
+       .long sys_fchown
+       .long sys_setresuid
+       .long sys_getresuid
+       .long sys_setresgid     /* 210 */
+       .long sys_getresgid
+       .long sys_chown
+       .long sys_setuid
+       .long sys_setgid
+       .long sys_setfsuid      /* 215 */
+       .long sys_setfsgid
+       .long sys_pivot_root
+       .long sys_mincore
+       .long sys_madvise
+       .long sys_getdents64    /* 220 */
+       .long sys_fcntl64
+       .long sys_ni_syscall    /* reserved for TUX */
+       .long sys_ni_syscall
+       .long sys_gettid
+       .long sys_readahead     /* 225 */
+       .long sys_setxattr
+       .long sys_lsetxattr
+       .long sys_fsetxattr
+       .long sys_getxattr
+       .long sys_lgetxattr     /* 230 */
+       .long sys_fgetxattr
+       .long sys_listxattr
+       .long sys_llistxattr
+       .long sys_flistxattr
+       .long sys_removexattr   /* 235 */
+       .long sys_lremovexattr
+       .long sys_fremovexattr
+       .long sys_tkill
+       .long sys_sendfile64
+       .long sys_futex         /* 240 */
+       .long sys_sched_setaffinity
+       .long sys_sched_getaffinity
+       .long sys_set_thread_area
+       .long sys_get_thread_area
+       .long sys_io_setup      /* 245 */
+       .long sys_io_destroy
+       .long sys_io_getevents
+       .long sys_io_submit
+       .long sys_io_cancel
+       .long sys_fadvise64     /* 250 */
+       .long sys_ni_syscall
+       .long sys_exit_group
+       .long sys_lookup_dcookie
+       .long sys_epoll_create
+       .long sys_epoll_ctl     /* 255 */
+       .long sys_epoll_wait
+       .long sys_remap_file_pages
+       .long sys_set_tid_address
+       .long sys_timer_create
+       .long sys_timer_settime         /* 260 */
+       .long sys_timer_gettime
+       .long sys_timer_getoverrun
+       .long sys_timer_delete
+       .long sys_clock_settime
+       .long sys_clock_gettime         /* 265 */
+       .long sys_clock_getres
+       .long sys_clock_nanosleep
+       .long sys_statfs64
+       .long sys_fstatfs64
+       .long sys_tgkill        /* 270 */
+       .long sys_utimes
+       .long sys_fadvise64_64
+       .long sys_ni_syscall    /* sys_vserver */
+       .long sys_mbind
+       .long sys_get_mempolicy
+       .long sys_set_mempolicy
+       .long sys_mq_open
+       .long sys_mq_unlink
+       .long sys_mq_timedsend
+       .long sys_mq_timedreceive       /* 280 */
+       .long sys_mq_notify
+       .long sys_mq_getsetattr
+       .long sys_kexec_load
+       .long sys_waitid
+       .long sys_ni_syscall            /* 285 */ /* available */
+       .long sys_add_key
+       .long sys_request_key
+       .long sys_keyctl
+       .long sys_ioprio_set
+       .long sys_ioprio_get            /* 290 */
+       .long sys_inotify_init
+       .long sys_inotify_add_watch
+       .long sys_inotify_rm_watch
+       .long sys_migrate_pages
+       .long sys_openat                /* 295 */
+       .long sys_mkdirat
+       .long sys_mknodat
+       .long sys_fchownat
+       .long sys_futimesat
+       .long sys_fstatat64             /* 300 */
+       .long sys_unlinkat
+       .long sys_renameat
+       .long sys_linkat
+       .long sys_symlinkat
+       .long sys_readlinkat            /* 305 */
+       .long sys_fchmodat
+       .long sys_faccessat
+       .long sys_pselect6
+       .long sys_ppoll
+       .long sys_unshare               /* 310 */
+       .long sys_set_robust_list
+       .long sys_get_robust_list
+       .long sys_splice
+       .long sys_sync_file_range
+       .long sys_tee                   /* 315 */
+       .long sys_vmsplice
+       .long sys_move_pages
+       .long sys_getcpu
+       .long sys_epoll_pwait
+       .long sys_utimensat             /* 320 */
+       .long sys_signalfd
+       .long sys_timerfd
+       .long sys_eventfd
+       .long sys_fallocate
diff --git a/arch/x86/kernel/sysenter_32.c b/arch/x86/kernel/sysenter_32.c
new file mode 100644 (file)
index 0000000..4eb2e40
--- /dev/null
@@ -0,0 +1,348 @@
+/*
+ * linux/arch/i386/kernel/sysenter.c
+ *
+ * (C) Copyright 2002 Linus Torvalds
+ * Portions based on the vdso-randomization code from exec-shield:
+ * Copyright(C) 2005-2006, Red Hat, Inc., Ingo Molnar
+ *
+ * This file contains the needed initializations to support sysenter.
+ */
+
+#include <linux/init.h>
+#include <linux/smp.h>
+#include <linux/thread_info.h>
+#include <linux/sched.h>
+#include <linux/gfp.h>
+#include <linux/string.h>
+#include <linux/elf.h>
+#include <linux/mm.h>
+#include <linux/err.h>
+#include <linux/module.h>
+
+#include <asm/cpufeature.h>
+#include <asm/msr.h>
+#include <asm/pgtable.h>
+#include <asm/unistd.h>
+#include <asm/elf.h>
+#include <asm/tlbflush.h>
+
+enum {
+       VDSO_DISABLED = 0,
+       VDSO_ENABLED = 1,
+       VDSO_COMPAT = 2,
+};
+
+#ifdef CONFIG_COMPAT_VDSO
+#define VDSO_DEFAULT   VDSO_COMPAT
+#else
+#define VDSO_DEFAULT   VDSO_ENABLED
+#endif
+
+/*
+ * Should the kernel map a VDSO page into processes and pass its
+ * address down to glibc upon exec()?
+ */
+unsigned int __read_mostly vdso_enabled = VDSO_DEFAULT;
+
+EXPORT_SYMBOL_GPL(vdso_enabled);
+
+static int __init vdso_setup(char *s)
+{
+       vdso_enabled = simple_strtoul(s, NULL, 0);
+
+       return 1;
+}
+
+__setup("vdso=", vdso_setup);
+
+extern asmlinkage void sysenter_entry(void);
+
+static __init void reloc_symtab(Elf32_Ehdr *ehdr,
+                               unsigned offset, unsigned size)
+{
+       Elf32_Sym *sym = (void *)ehdr + offset;
+       unsigned nsym = size / sizeof(*sym);
+       unsigned i;
+
+       for(i = 0; i < nsym; i++, sym++) {
+               if (sym->st_shndx == SHN_UNDEF ||
+                   sym->st_shndx == SHN_ABS)
+                       continue;  /* skip */
+
+               if (sym->st_shndx > SHN_LORESERVE) {
+                       printk(KERN_INFO "VDSO: unexpected st_shndx %x\n",
+                              sym->st_shndx);
+                       continue;
+               }
+
+               switch(ELF_ST_TYPE(sym->st_info)) {
+               case STT_OBJECT:
+               case STT_FUNC:
+               case STT_SECTION:
+               case STT_FILE:
+                       sym->st_value += VDSO_HIGH_BASE;
+               }
+       }
+}
+
+static __init void reloc_dyn(Elf32_Ehdr *ehdr, unsigned offset)
+{
+       Elf32_Dyn *dyn = (void *)ehdr + offset;
+
+       for(; dyn->d_tag != DT_NULL; dyn++)
+               switch(dyn->d_tag) {
+               case DT_PLTGOT:
+               case DT_HASH:
+               case DT_STRTAB:
+               case DT_SYMTAB:
+               case DT_RELA:
+               case DT_INIT:
+               case DT_FINI:
+               case DT_REL:
+               case DT_DEBUG:
+               case DT_JMPREL:
+               case DT_VERSYM:
+               case DT_VERDEF:
+               case DT_VERNEED:
+               case DT_ADDRRNGLO ... DT_ADDRRNGHI:
+                       /* definitely pointers needing relocation */
+                       dyn->d_un.d_ptr += VDSO_HIGH_BASE;
+                       break;
+
+               case DT_ENCODING ... OLD_DT_LOOS-1:
+               case DT_LOOS ... DT_HIOS-1:
+                       /* Tags above DT_ENCODING are pointers if
+                          they're even */
+                       if (dyn->d_tag >= DT_ENCODING &&
+                           (dyn->d_tag & 1) == 0)
+                               dyn->d_un.d_ptr += VDSO_HIGH_BASE;
+                       break;
+
+               case DT_VERDEFNUM:
+               case DT_VERNEEDNUM:
+               case DT_FLAGS_1:
+               case DT_RELACOUNT:
+               case DT_RELCOUNT:
+               case DT_VALRNGLO ... DT_VALRNGHI:
+                       /* definitely not pointers */
+                       break;
+
+               case OLD_DT_LOOS ... DT_LOOS-1:
+               case DT_HIOS ... DT_VALRNGLO-1:
+               default:
+                       if (dyn->d_tag > DT_ENCODING)
+                               printk(KERN_INFO "VDSO: unexpected DT_tag %x\n",
+                                      dyn->d_tag);
+                       break;
+               }
+}
+
+static __init void relocate_vdso(Elf32_Ehdr *ehdr)
+{
+       Elf32_Phdr *phdr;
+       Elf32_Shdr *shdr;
+       int i;
+
+       BUG_ON(memcmp(ehdr->e_ident, ELFMAG, 4) != 0 ||
+              !elf_check_arch(ehdr) ||
+              ehdr->e_type != ET_DYN);
+
+       ehdr->e_entry += VDSO_HIGH_BASE;
+
+       /* rebase phdrs */
+       phdr = (void *)ehdr + ehdr->e_phoff;
+       for (i = 0; i < ehdr->e_phnum; i++) {
+               phdr[i].p_vaddr += VDSO_HIGH_BASE;
+
+               /* relocate dynamic stuff */
+               if (phdr[i].p_type == PT_DYNAMIC)
+                       reloc_dyn(ehdr, phdr[i].p_offset);
+       }
+
+       /* rebase sections */
+       shdr = (void *)ehdr + ehdr->e_shoff;
+       for(i = 0; i < ehdr->e_shnum; i++) {
+               if (!(shdr[i].sh_flags & SHF_ALLOC))
+                       continue;
+
+               shdr[i].sh_addr += VDSO_HIGH_BASE;
+
+               if (shdr[i].sh_type == SHT_SYMTAB ||
+                   shdr[i].sh_type == SHT_DYNSYM)
+                       reloc_symtab(ehdr, shdr[i].sh_offset,
+                                    shdr[i].sh_size);
+       }
+}
+
+void enable_sep_cpu(void)
+{
+       int cpu = get_cpu();
+       struct tss_struct *tss = &per_cpu(init_tss, cpu);
+
+       if (!boot_cpu_has(X86_FEATURE_SEP)) {
+               put_cpu();
+               return;
+       }
+
+       tss->x86_tss.ss1 = __KERNEL_CS;
+       tss->x86_tss.esp1 = sizeof(struct tss_struct) + (unsigned long) tss;
+       wrmsr(MSR_IA32_SYSENTER_CS, __KERNEL_CS, 0);
+       wrmsr(MSR_IA32_SYSENTER_ESP, tss->x86_tss.esp1, 0);
+       wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long) sysenter_entry, 0);
+       put_cpu();      
+}
+
+static struct vm_area_struct gate_vma;
+
+static int __init gate_vma_init(void)
+{
+       gate_vma.vm_mm = NULL;
+       gate_vma.vm_start = FIXADDR_USER_START;
+       gate_vma.vm_end = FIXADDR_USER_END;
+       gate_vma.vm_flags = VM_READ | VM_MAYREAD | VM_EXEC | VM_MAYEXEC;
+       gate_vma.vm_page_prot = __P101;
+       /*
+        * Make sure the vDSO gets into every core dump.
+        * Dumping its contents makes post-mortem fully interpretable later
+        * without matching up the same kernel and hardware config to see
+        * what PC values meant.
+        */
+       gate_vma.vm_flags |= VM_ALWAYSDUMP;
+       return 0;
+}
+
+/*
+ * These symbols are defined by vsyscall.o to mark the bounds
+ * of the ELF DSO images included therein.
+ */
+extern const char vsyscall_int80_start, vsyscall_int80_end;
+extern const char vsyscall_sysenter_start, vsyscall_sysenter_end;
+static struct page *syscall_pages[1];
+
+static void map_compat_vdso(int map)
+{
+       static int vdso_mapped;
+
+       if (map == vdso_mapped)
+               return;
+
+       vdso_mapped = map;
+
+       __set_fixmap(FIX_VDSO, page_to_pfn(syscall_pages[0]) << PAGE_SHIFT,
+                    map ? PAGE_READONLY_EXEC : PAGE_NONE);
+
+       /* flush stray tlbs */
+       flush_tlb_all();
+}
+
+int __init sysenter_setup(void)
+{
+       void *syscall_page = (void *)get_zeroed_page(GFP_ATOMIC);
+       const void *vsyscall;
+       size_t vsyscall_len;
+
+       syscall_pages[0] = virt_to_page(syscall_page);
+
+       gate_vma_init();
+
+       printk("Compat vDSO mapped to %08lx.\n", __fix_to_virt(FIX_VDSO));
+
+       if (!boot_cpu_has(X86_FEATURE_SEP)) {
+               vsyscall = &vsyscall_int80_start;
+               vsyscall_len = &vsyscall_int80_end - &vsyscall_int80_start;
+       } else {
+               vsyscall = &vsyscall_sysenter_start;
+               vsyscall_len = &vsyscall_sysenter_end - &vsyscall_sysenter_start;
+       }
+
+       memcpy(syscall_page, vsyscall, vsyscall_len);
+       relocate_vdso(syscall_page);
+
+       return 0;
+}
+
+/* Defined in vsyscall-sysenter.S */
+extern void SYSENTER_RETURN;
+
+/* Setup a VMA at program startup for the vsyscall page */
+int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack)
+{
+       struct mm_struct *mm = current->mm;
+       unsigned long addr;
+       int ret = 0;
+       bool compat;
+
+       down_write(&mm->mmap_sem);
+
+       /* Test compat mode once here, in case someone
+          changes it via sysctl */
+       compat = (vdso_enabled == VDSO_COMPAT);
+
+       map_compat_vdso(compat);
+
+       if (compat)
+               addr = VDSO_HIGH_BASE;
+       else {
+               addr = get_unmapped_area(NULL, 0, PAGE_SIZE, 0, 0);
+               if (IS_ERR_VALUE(addr)) {
+                       ret = addr;
+                       goto up_fail;
+               }
+
+               /*
+                * MAYWRITE to allow gdb to COW and set breakpoints
+                *
+                * Make sure the vDSO gets into every core dump.
+                * Dumping its contents makes post-mortem fully
+                * interpretable later without matching up the same
+                * kernel and hardware config to see what PC values
+                * meant.
+                */
+               ret = install_special_mapping(mm, addr, PAGE_SIZE,
+                                             VM_READ|VM_EXEC|
+                                             VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC|
+                                             VM_ALWAYSDUMP,
+                                             syscall_pages);
+
+               if (ret)
+                       goto up_fail;
+       }
+
+       current->mm->context.vdso = (void *)addr;
+       current_thread_info()->sysenter_return =
+               (void *)VDSO_SYM(&SYSENTER_RETURN);
+
+  up_fail:
+       up_write(&mm->mmap_sem);
+
+       return ret;
+}
+
+const char *arch_vma_name(struct vm_area_struct *vma)
+{
+       if (vma->vm_mm && vma->vm_start == (long)vma->vm_mm->context.vdso)
+               return "[vdso]";
+       return NULL;
+}
+
+struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
+{
+       struct mm_struct *mm = tsk->mm;
+
+       /* Check to see if this task was created in compat vdso mode */
+       if (mm && mm->context.vdso == (void *)VDSO_HIGH_BASE)
+               return &gate_vma;
+       return NULL;
+}
+
+int in_gate_area(struct task_struct *task, unsigned long addr)
+{
+       const struct vm_area_struct *vma = get_gate_vma(task);
+
+       return vma && addr >= vma->vm_start && addr < vma->vm_end;
+}
+
+int in_gate_area_no_task(unsigned long addr)
+{
+       return 0;
+}
diff --git a/arch/x86/kernel/time_32.c b/arch/x86/kernel/time_32.c
new file mode 100644 (file)
index 0000000..19a6c67
--- /dev/null
@@ -0,0 +1,236 @@
+/*
+ *  linux/arch/i386/kernel/time.c
+ *
+ *  Copyright (C) 1991, 1992, 1995  Linus Torvalds
+ *
+ * This file contains the PC-specific time handling details:
+ * reading the RTC at bootup, etc..
+ * 1994-07-02    Alan Modra
+ *     fixed set_rtc_mmss, fixed time.year for >= 2000, new mktime
+ * 1995-03-26    Markus Kuhn
+ *      fixed 500 ms bug at call to set_rtc_mmss, fixed DS12887
+ *      precision CMOS clock update
+ * 1996-05-03    Ingo Molnar
+ *      fixed time warps in do_[slow|fast]_gettimeoffset()
+ * 1997-09-10  Updated NTP code according to technical memorandum Jan '96
+ *             "A Kernel Model for Precision Timekeeping" by Dave Mills
+ * 1998-09-05    (Various)
+ *     More robust do_fast_gettimeoffset() algorithm implemented
+ *     (works with APM, Cyrix 6x86MX and Centaur C6),
+ *     monotonic gettimeofday() with fast_get_timeoffset(),
+ *     drift-proof precision TSC calibration on boot
+ *     (C. Scott Ananian <cananian@alumni.princeton.edu>, Andrew D.
+ *     Balsa <andrebalsa@altern.org>, Philip Gladstone <philip@raptor.com>;
+ *     ported from 2.0.35 Jumbo-9 by Michael Krause <m.krause@tu-harburg.de>).
+ * 1998-12-16    Andrea Arcangeli
+ *     Fixed Jumbo-9 code in 2.1.131: do_gettimeofday was missing 1 jiffy
+ *     because was not accounting lost_ticks.
+ * 1998-12-24 Copyright (C) 1998  Andrea Arcangeli
+ *     Fixed a xtime SMP race (we need the xtime_lock rw spinlock to
+ *     serialize accesses to xtime/lost_ticks).
+ */
+
+#include <linux/errno.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/param.h>
+#include <linux/string.h>
+#include <linux/mm.h>
+#include <linux/interrupt.h>
+#include <linux/time.h>
+#include <linux/delay.h>
+#include <linux/init.h>
+#include <linux/smp.h>
+#include <linux/module.h>
+#include <linux/sysdev.h>
+#include <linux/bcd.h>
+#include <linux/efi.h>
+#include <linux/mca.h>
+
+#include <asm/io.h>
+#include <asm/smp.h>
+#include <asm/irq.h>
+#include <asm/msr.h>
+#include <asm/delay.h>
+#include <asm/mpspec.h>
+#include <asm/uaccess.h>
+#include <asm/processor.h>
+#include <asm/timer.h>
+#include <asm/time.h>
+
+#include "mach_time.h"
+
+#include <linux/timex.h>
+
+#include <asm/hpet.h>
+
+#include <asm/arch_hooks.h>
+
+#include "io_ports.h"
+
+#include <asm/i8259.h>
+
+#include "do_timer.h"
+
+unsigned int cpu_khz;  /* Detected as we calibrate the TSC */
+EXPORT_SYMBOL(cpu_khz);
+
+DEFINE_SPINLOCK(rtc_lock);
+EXPORT_SYMBOL(rtc_lock);
+
+/*
+ * This is a special lock that is owned by the CPU and holds the index
+ * register we are working with.  It is required for NMI access to the
+ * CMOS/RTC registers.  See include/asm-i386/mc146818rtc.h for details.
+ */
+volatile unsigned long cmos_lock = 0;
+EXPORT_SYMBOL(cmos_lock);
+
+/* Routines for accessing the CMOS RAM/RTC. */
+unsigned char rtc_cmos_read(unsigned char addr)
+{
+       unsigned char val;
+       lock_cmos_prefix(addr);
+       outb_p(addr, RTC_PORT(0));
+       val = inb_p(RTC_PORT(1));
+       lock_cmos_suffix(addr);
+       return val;
+}
+EXPORT_SYMBOL(rtc_cmos_read);
+
+void rtc_cmos_write(unsigned char val, unsigned char addr)
+{
+       lock_cmos_prefix(addr);
+       outb_p(addr, RTC_PORT(0));
+       outb_p(val, RTC_PORT(1));
+       lock_cmos_suffix(addr);
+}
+EXPORT_SYMBOL(rtc_cmos_write);
+
+static int set_rtc_mmss(unsigned long nowtime)
+{
+       int retval;
+       unsigned long flags;
+
+       /* gets recalled with irq locally disabled */
+       /* XXX - does irqsave resolve this? -johnstul */
+       spin_lock_irqsave(&rtc_lock, flags);
+       retval = set_wallclock(nowtime);
+       spin_unlock_irqrestore(&rtc_lock, flags);
+
+       return retval;
+}
+
+
+int timer_ack;
+
+unsigned long profile_pc(struct pt_regs *regs)
+{
+       unsigned long pc = instruction_pointer(regs);
+
+#ifdef CONFIG_SMP
+       if (!v8086_mode(regs) && SEGMENT_IS_KERNEL_CODE(regs->xcs) &&
+           in_lock_functions(pc)) {
+#ifdef CONFIG_FRAME_POINTER
+               return *(unsigned long *)(regs->ebp + 4);
+#else
+               unsigned long *sp = (unsigned long *)&regs->esp;
+
+               /* Return address is either directly at stack pointer
+                  or above a saved eflags. Eflags has bits 22-31 zero,
+                  kernel addresses don't. */
+               if (sp[0] >> 22)
+                       return sp[0];
+               if (sp[1] >> 22)
+                       return sp[1];
+#endif
+       }
+#endif
+       return pc;
+}
+EXPORT_SYMBOL(profile_pc);
+
+/*
+ * This is the same as the above, except we _also_ save the current
+ * Time Stamp Counter value at the time of the timer interrupt, so that
+ * we later on can estimate the time of day more exactly.
+ */
+irqreturn_t timer_interrupt(int irq, void *dev_id)
+{
+#ifdef CONFIG_X86_IO_APIC
+       if (timer_ack) {
+               /*
+                * Subtle, when I/O APICs are used we have to ack timer IRQ
+                * manually to reset the IRR bit for do_slow_gettimeoffset().
+                * This will also deassert NMI lines for the watchdog if run
+                * on an 82489DX-based system.
+                */
+               spin_lock(&i8259A_lock);
+               outb(0x0c, PIC_MASTER_OCW3);
+               /* Ack the IRQ; AEOI will end it automatically. */
+               inb(PIC_MASTER_POLL);
+               spin_unlock(&i8259A_lock);
+       }
+#endif
+
+       do_timer_interrupt_hook();
+
+       if (MCA_bus) {
+               /* The PS/2 uses level-triggered interrupts.  You can't
+               turn them off, nor would you want to (any attempt to
+               enable edge-triggered interrupts usually gets intercepted by a
+               special hardware circuit).  Hence we have to acknowledge
+               the timer interrupt.  Through some incredibly stupid
+               design idea, the reset for IRQ 0 is done by setting the
+               high bit of the PPI port B (0x61).  Note that some PS/2s,
+               notably the 55SX, work fine if this is removed.  */
+
+               u8 irq_v = inb_p( 0x61 );       /* read the current state */
+               outb_p( irq_v|0x80, 0x61 );     /* reset the IRQ */
+       }
+
+       return IRQ_HANDLED;
+}
+
+/* not static: needed by APM */
+unsigned long read_persistent_clock(void)
+{
+       unsigned long retval;
+       unsigned long flags;
+
+       spin_lock_irqsave(&rtc_lock, flags);
+
+       retval = get_wallclock();
+
+       spin_unlock_irqrestore(&rtc_lock, flags);
+
+       return retval;
+}
+
+int update_persistent_clock(struct timespec now)
+{
+       return set_rtc_mmss(now.tv_sec);
+}
+
+extern void (*late_time_init)(void);
+/* Duplicate of time_init() below, with hpet_enable part added */
+void __init hpet_time_init(void)
+{
+       if (!hpet_enable())
+               setup_pit_timer();
+       time_init_hook();
+}
+
+/*
+ * This is called directly from init code; we must delay timer setup in the
+ * HPET case as we can't make the decision to turn on HPET this early in the
+ * boot process.
+ *
+ * The chosen time_init function will usually be hpet_time_init, above, but
+ * in the case of virtual hardware, an alternative function may be substituted.
+ */
+void __init time_init(void)
+{
+       tsc_init();
+       late_time_init = choose_time_init();
+}
diff --git a/arch/x86/kernel/topology.c b/arch/x86/kernel/topology.c
new file mode 100644 (file)
index 0000000..4578235
--- /dev/null
@@ -0,0 +1,77 @@
+/*
+ * arch/i386/kernel/topology.c - Populate sysfs with topology information
+ *
+ * Written by: Matthew Dobson, IBM Corporation
+ * Original Code: Paul Dorwin, IBM Corporation, Patrick Mochel, OSDL
+ *
+ * Copyright (C) 2002, IBM Corp.
+ *
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT.  See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * Send feedback to <colpatch@us.ibm.com>
+ */
+#include <linux/init.h>
+#include <linux/smp.h>
+#include <linux/nodemask.h>
+#include <linux/mmzone.h>
+#include <asm/cpu.h>
+
+static struct i386_cpu cpu_devices[NR_CPUS];
+
+int arch_register_cpu(int num)
+{
+       /*
+        * CPU0 cannot be offlined due to several
+        * restrictions and assumptions in kernel. This basically
+        * doesnt add a control file, one cannot attempt to offline
+        * BSP.
+        *
+        * Also certain PCI quirks require not to enable hotplug control
+        * for all CPU's.
+        */
+       if (num && enable_cpu_hotplug)
+               cpu_devices[num].cpu.hotpluggable = 1;
+
+       return register_cpu(&cpu_devices[num].cpu, num);
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+int enable_cpu_hotplug = 1;
+
+void arch_unregister_cpu(int num) {
+       return unregister_cpu(&cpu_devices[num].cpu);
+}
+EXPORT_SYMBOL(arch_register_cpu);
+EXPORT_SYMBOL(arch_unregister_cpu);
+#endif /*CONFIG_HOTPLUG_CPU*/
+
+static int __init topology_init(void)
+{
+       int i;
+
+#ifdef CONFIG_NUMA
+       for_each_online_node(i)
+               register_one_node(i);
+#endif /* CONFIG_NUMA */
+
+       for_each_present_cpu(i)
+               arch_register_cpu(i);
+       return 0;
+}
+
+subsys_initcall(topology_init);
diff --git a/arch/x86/kernel/trampoline_32.S b/arch/x86/kernel/trampoline_32.S
new file mode 100644 (file)
index 0000000..f62815f
--- /dev/null
@@ -0,0 +1,85 @@
+/*
+ *
+ *     Trampoline.S    Derived from Setup.S by Linus Torvalds
+ *
+ *     4 Jan 1997 Michael Chastain: changed to gnu as.
+ *
+ *     This is only used for booting secondary CPUs in SMP machine
+ *
+ *     Entry: CS:IP point to the start of our code, we are 
+ *     in real mode with no stack, but the rest of the 
+ *     trampoline page to make our stack and everything else
+ *     is a mystery.
+ *
+ *     In fact we don't actually need a stack so we don't
+ *     set one up.
+ *
+ *     We jump into the boot/compressed/head.S code. So you'd
+ *     better be running a compressed kernel image or you
+ *     won't get very far.
+ *
+ *     On entry to trampoline_data, the processor is in real mode
+ *     with 16-bit addressing and 16-bit data.  CS has some value
+ *     and IP is zero.  Thus, data addresses need to be absolute
+ *     (no relocation) and are taken with regard to r_base.
+ *
+ *     If you work on this file, check the object module with
+ *     objdump --reloc to make sure there are no relocation
+ *     entries except for:
+ *
+ *     TYPE              VALUE
+ *     R_386_32          startup_32_smp
+ *     R_386_32          boot_gdt
+ */
+
+#include <linux/linkage.h>
+#include <asm/segment.h>
+#include <asm/page.h>
+
+.data
+
+/* We can free up trampoline after bootup if cpu hotplug is not supported. */
+#ifndef CONFIG_HOTPLUG_CPU
+.section ".init.data","aw",@progbits
+#endif
+
+.code16
+
+ENTRY(trampoline_data)
+r_base = .
+       wbinvd                  # Needed for NUMA-Q should be harmless for others
+       mov     %cs, %ax        # Code and data in the same place
+       mov     %ax, %ds
+
+       cli                     # We should be safe anyway
+
+       movl    $0xA5A5A5A5, trampoline_data - r_base
+                               # write marker for master knows we're running
+
+       /* GDT tables in non default location kernel can be beyond 16MB and
+        * lgdt will not be able to load the address as in real mode default
+        * operand size is 16bit. Use lgdtl instead to force operand size
+        * to 32 bit.
+        */
+
+       lidtl   boot_idt_descr - r_base # load idt with 0, 0
+       lgdtl   boot_gdt_descr - r_base # load gdt with whatever is appropriate
+
+       xor     %ax, %ax
+       inc     %ax             # protected mode (PE) bit
+       lmsw    %ax             # into protected mode
+       # flush prefetch and jump to startup_32_smp in arch/i386/kernel/head.S
+       ljmpl   $__BOOT_CS, $(startup_32_smp-__PAGE_OFFSET)
+
+       # These need to be in the same 64K segment as the above;
+       # hence we don't use the boot_gdt_descr defined in head.S
+boot_gdt_descr:
+       .word   __BOOT_DS + 7                   # gdt limit
+       .long   boot_gdt - __PAGE_OFFSET        # gdt base
+
+boot_idt_descr:
+       .word   0                               # idt limit = 0
+       .long   0                               # idt base = 0L
+
+.globl trampoline_end
+trampoline_end:
diff --git a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps_32.c
new file mode 100644 (file)
index 0000000..47b0bef
--- /dev/null
@@ -0,0 +1,1250 @@
+/*
+ *  linux/arch/i386/traps.c
+ *
+ *  Copyright (C) 1991, 1992  Linus Torvalds
+ *
+ *  Pentium III FXSR, SSE support
+ *     Gareth Hughes <gareth@valinux.com>, May 2000
+ */
+
+/*
+ * 'Traps.c' handles hardware traps and faults after we have saved some
+ * state in 'asm.s'.
+ */
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/timer.h>
+#include <linux/mm.h>
+#include <linux/init.h>
+#include <linux/delay.h>
+#include <linux/spinlock.h>
+#include <linux/interrupt.h>
+#include <linux/highmem.h>
+#include <linux/kallsyms.h>
+#include <linux/ptrace.h>
+#include <linux/utsname.h>
+#include <linux/kprobes.h>
+#include <linux/kexec.h>
+#include <linux/unwind.h>
+#include <linux/uaccess.h>
+#include <linux/nmi.h>
+#include <linux/bug.h>
+
+#ifdef CONFIG_EISA
+#include <linux/ioport.h>
+#include <linux/eisa.h>
+#endif
+
+#ifdef CONFIG_MCA
+#include <linux/mca.h>
+#endif
+
+#if defined(CONFIG_EDAC)
+#include <linux/edac.h>
+#endif
+
+#include <asm/processor.h>
+#include <asm/system.h>
+#include <asm/io.h>
+#include <asm/atomic.h>
+#include <asm/debugreg.h>
+#include <asm/desc.h>
+#include <asm/i387.h>
+#include <asm/nmi.h>
+#include <asm/unwind.h>
+#include <asm/smp.h>
+#include <asm/arch_hooks.h>
+#include <linux/kdebug.h>
+#include <asm/stacktrace.h>
+
+#include <linux/module.h>
+
+#include "mach_traps.h"
+
+int panic_on_unrecovered_nmi;
+
+asmlinkage int system_call(void);
+
+/* Do we ignore FPU interrupts ? */
+char ignore_fpu_irq = 0;
+
+/*
+ * The IDT has to be page-aligned to simplify the Pentium
+ * F0 0F bug workaround.. We have a special link segment
+ * for this.
+ */
+struct desc_struct idt_table[256] __attribute__((__section__(".data.idt"))) = { {0, 0}, };
+
+asmlinkage void divide_error(void);
+asmlinkage void debug(void);
+asmlinkage void nmi(void);
+asmlinkage void int3(void);
+asmlinkage void overflow(void);
+asmlinkage void bounds(void);
+asmlinkage void invalid_op(void);
+asmlinkage void device_not_available(void);
+asmlinkage void coprocessor_segment_overrun(void);
+asmlinkage void invalid_TSS(void);
+asmlinkage void segment_not_present(void);
+asmlinkage void stack_segment(void);
+asmlinkage void general_protection(void);
+asmlinkage void page_fault(void);
+asmlinkage void coprocessor_error(void);
+asmlinkage void simd_coprocessor_error(void);
+asmlinkage void alignment_check(void);
+asmlinkage void spurious_interrupt_bug(void);
+asmlinkage void machine_check(void);
+
+int kstack_depth_to_print = 24;
+static unsigned int code_bytes = 64;
+
+static inline int valid_stack_ptr(struct thread_info *tinfo, void *p, unsigned size)
+{
+       return  p > (void *)tinfo &&
+               p <= (void *)tinfo + THREAD_SIZE - size;
+}
+
+/* The form of the top of the frame on the stack */
+struct stack_frame {
+       struct stack_frame *next_frame;
+       unsigned long return_address;
+};
+
+static inline unsigned long print_context_stack(struct thread_info *tinfo,
+                               unsigned long *stack, unsigned long ebp,
+                               struct stacktrace_ops *ops, void *data)
+{
+#ifdef CONFIG_FRAME_POINTER
+       struct stack_frame *frame = (struct stack_frame *)ebp;
+       while (valid_stack_ptr(tinfo, frame, sizeof(*frame))) {
+               struct stack_frame *next;
+               unsigned long addr;
+
+               addr = frame->return_address;
+               ops->address(data, addr);
+               /*
+                * break out of recursive entries (such as
+                * end_of_stack_stop_unwind_function). Also,
+                * we can never allow a frame pointer to
+                * move downwards!
+                */
+               next = frame->next_frame;
+               if (next <= frame)
+                       break;
+               frame = next;
+       }
+#else
+       while (valid_stack_ptr(tinfo, stack, sizeof(*stack))) {
+               unsigned long addr;
+
+               addr = *stack++;
+               if (__kernel_text_address(addr))
+                       ops->address(data, addr);
+       }
+#endif
+       return ebp;
+}
+
+#define MSG(msg) ops->warning(data, msg)
+
+void dump_trace(struct task_struct *task, struct pt_regs *regs,
+               unsigned long *stack,
+               struct stacktrace_ops *ops, void *data)
+{
+       unsigned long ebp = 0;
+
+       if (!task)
+               task = current;
+
+       if (!stack) {
+               unsigned long dummy;
+               stack = &dummy;
+               if (task != current)
+                       stack = (unsigned long *)task->thread.esp;
+       }
+
+#ifdef CONFIG_FRAME_POINTER
+       if (!ebp) {
+               if (task == current) {
+                       /* Grab ebp right from our regs */
+                       asm ("movl %%ebp, %0" : "=r" (ebp) : );
+               } else {
+                       /* ebp is the last reg pushed by switch_to */
+                       ebp = *(unsigned long *) task->thread.esp;
+               }
+       }
+#endif
+
+       while (1) {
+               struct thread_info *context;
+               context = (struct thread_info *)
+                       ((unsigned long)stack & (~(THREAD_SIZE - 1)));
+               ebp = print_context_stack(context, stack, ebp, ops, data);
+               /* Should be after the line below, but somewhere
+                  in early boot context comes out corrupted and we
+                  can't reference it -AK */
+               if (ops->stack(data, "IRQ") < 0)
+                       break;
+               stack = (unsigned long*)context->previous_esp;
+               if (!stack)
+                       break;
+               touch_nmi_watchdog();
+       }
+}
+EXPORT_SYMBOL(dump_trace);
+
+static void
+print_trace_warning_symbol(void *data, char *msg, unsigned long symbol)
+{
+       printk(data);
+       print_symbol(msg, symbol);
+       printk("\n");
+}
+
+static void print_trace_warning(void *data, char *msg)
+{
+       printk("%s%s\n", (char *)data, msg);
+}
+
+static int print_trace_stack(void *data, char *name)
+{
+       return 0;
+}
+
+/*
+ * Print one address/symbol entries per line.
+ */
+static void print_trace_address(void *data, unsigned long addr)
+{
+       printk("%s [<%08lx>] ", (char *)data, addr);
+       print_symbol("%s\n", addr);
+       touch_nmi_watchdog();
+}
+
+static struct stacktrace_ops print_trace_ops = {
+       .warning = print_trace_warning,
+       .warning_symbol = print_trace_warning_symbol,
+       .stack = print_trace_stack,
+       .address = print_trace_address,
+};
+
+static void
+show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
+                  unsigned long * stack, char *log_lvl)
+{
+       dump_trace(task, regs, stack, &print_trace_ops, log_lvl);
+       printk("%s =======================\n", log_lvl);
+}
+
+void show_trace(struct task_struct *task, struct pt_regs *regs,
+               unsigned long * stack)
+{
+       show_trace_log_lvl(task, regs, stack, "");
+}
+
+static void show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
+                              unsigned long *esp, char *log_lvl)
+{
+       unsigned long *stack;
+       int i;
+
+       if (esp == NULL) {
+               if (task)
+                       esp = (unsigned long*)task->thread.esp;
+               else
+                       esp = (unsigned long *)&esp;
+       }
+
+       stack = esp;
+       for(i = 0; i < kstack_depth_to_print; i++) {
+               if (kstack_end(stack))
+                       break;
+               if (i && ((i % 8) == 0))
+                       printk("\n%s       ", log_lvl);
+               printk("%08lx ", *stack++);
+       }
+       printk("\n%sCall Trace:\n", log_lvl);
+       show_trace_log_lvl(task, regs, esp, log_lvl);
+}
+
+void show_stack(struct task_struct *task, unsigned long *esp)
+{
+       printk("       ");
+       show_stack_log_lvl(task, NULL, esp, "");
+}
+
+/*
+ * The architecture-independent dump_stack generator
+ */
+void dump_stack(void)
+{
+       unsigned long stack;
+
+       show_trace(current, NULL, &stack);
+}
+
+EXPORT_SYMBOL(dump_stack);
+
+void show_registers(struct pt_regs *regs)
+{
+       int i;
+       int in_kernel = 1;
+       unsigned long esp;
+       unsigned short ss, gs;
+
+       esp = (unsigned long) (&regs->esp);
+       savesegment(ss, ss);
+       savesegment(gs, gs);
+       if (user_mode_vm(regs)) {
+               in_kernel = 0;
+               esp = regs->esp;
+               ss = regs->xss & 0xffff;
+       }
+       print_modules();
+       printk(KERN_EMERG "CPU:    %d\n"
+               KERN_EMERG "EIP:    %04x:[<%08lx>]    %s VLI\n"
+               KERN_EMERG "EFLAGS: %08lx   (%s %.*s)\n",
+               smp_processor_id(), 0xffff & regs->xcs, regs->eip,
+               print_tainted(), regs->eflags, init_utsname()->release,
+               (int)strcspn(init_utsname()->version, " "),
+               init_utsname()->version);
+       print_symbol(KERN_EMERG "EIP is at %s\n", regs->eip);
+       printk(KERN_EMERG "eax: %08lx   ebx: %08lx   ecx: %08lx   edx: %08lx\n",
+               regs->eax, regs->ebx, regs->ecx, regs->edx);
+       printk(KERN_EMERG "esi: %08lx   edi: %08lx   ebp: %08lx   esp: %08lx\n",
+               regs->esi, regs->edi, regs->ebp, esp);
+       printk(KERN_EMERG "ds: %04x   es: %04x   fs: %04x  gs: %04x  ss: %04x\n",
+              regs->xds & 0xffff, regs->xes & 0xffff, regs->xfs & 0xffff, gs, ss);
+       printk(KERN_EMERG "Process %.*s (pid: %d, ti=%p task=%p task.ti=%p)",
+               TASK_COMM_LEN, current->comm, current->pid,
+               current_thread_info(), current, task_thread_info(current));
+       /*
+        * When in-kernel, we also print out the stack and code at the
+        * time of the fault..
+        */
+       if (in_kernel) {
+               u8 *eip;
+               unsigned int code_prologue = code_bytes * 43 / 64;
+               unsigned int code_len = code_bytes;
+               unsigned char c;
+
+               printk("\n" KERN_EMERG "Stack: ");
+               show_stack_log_lvl(NULL, regs, (unsigned long *)esp, KERN_EMERG);
+
+               printk(KERN_EMERG "Code: ");
+
+               eip = (u8 *)regs->eip - code_prologue;
+               if (eip < (u8 *)PAGE_OFFSET ||
+                       probe_kernel_address(eip, c)) {
+                       /* try starting at EIP */
+                       eip = (u8 *)regs->eip;
+                       code_len = code_len - code_prologue + 1;
+               }
+               for (i = 0; i < code_len; i++, eip++) {
+                       if (eip < (u8 *)PAGE_OFFSET ||
+                               probe_kernel_address(eip, c)) {
+                               printk(" Bad EIP value.");
+                               break;
+                       }
+                       if (eip == (u8 *)regs->eip)
+                               printk("<%02x> ", c);
+                       else
+                               printk("%02x ", c);
+               }
+       }
+       printk("\n");
+}      
+
+int is_valid_bugaddr(unsigned long eip)
+{
+       unsigned short ud2;
+
+       if (eip < PAGE_OFFSET)
+               return 0;
+       if (probe_kernel_address((unsigned short *)eip, ud2))
+               return 0;
+
+       return ud2 == 0x0b0f;
+}
+
+/*
+ * This is gone through when something in the kernel has done something bad and
+ * is about to be terminated.
+ */
+void die(const char * str, struct pt_regs * regs, long err)
+{
+       static struct {
+               spinlock_t lock;
+               u32 lock_owner;
+               int lock_owner_depth;
+       } die = {
+               .lock =                 __SPIN_LOCK_UNLOCKED(die.lock),
+               .lock_owner =           -1,
+               .lock_owner_depth =     0
+       };
+       static int die_counter;
+       unsigned long flags;
+
+       oops_enter();
+
+       if (die.lock_owner != raw_smp_processor_id()) {
+               console_verbose();
+               spin_lock_irqsave(&die.lock, flags);
+               die.lock_owner = smp_processor_id();
+               die.lock_owner_depth = 0;
+               bust_spinlocks(1);
+       }
+       else
+               local_save_flags(flags);
+
+       if (++die.lock_owner_depth < 3) {
+               int nl = 0;
+               unsigned long esp;
+               unsigned short ss;
+
+               report_bug(regs->eip, regs);
+
+               printk(KERN_EMERG "%s: %04lx [#%d]\n", str, err & 0xffff, ++die_counter);
+#ifdef CONFIG_PREEMPT
+               printk(KERN_EMERG "PREEMPT ");
+               nl = 1;
+#endif
+#ifdef CONFIG_SMP
+               if (!nl)
+                       printk(KERN_EMERG);
+               printk("SMP ");
+               nl = 1;
+#endif
+#ifdef CONFIG_DEBUG_PAGEALLOC
+               if (!nl)
+                       printk(KERN_EMERG);
+               printk("DEBUG_PAGEALLOC");
+               nl = 1;
+#endif
+               if (nl)
+                       printk("\n");
+               if (notify_die(DIE_OOPS, str, regs, err,
+                                       current->thread.trap_no, SIGSEGV) !=
+                               NOTIFY_STOP) {
+                       show_registers(regs);
+                       /* Executive summary in case the oops scrolled away */
+                       esp = (unsigned long) (&regs->esp);
+                       savesegment(ss, ss);
+                       if (user_mode(regs)) {
+                               esp = regs->esp;
+                               ss = regs->xss & 0xffff;
+                       }
+                       printk(KERN_EMERG "EIP: [<%08lx>] ", regs->eip);
+                       print_symbol("%s", regs->eip);
+                       printk(" SS:ESP %04x:%08lx\n", ss, esp);
+               }
+               else
+                       regs = NULL;
+       } else
+               printk(KERN_EMERG "Recursive die() failure, output suppressed\n");
+
+       bust_spinlocks(0);
+       die.lock_owner = -1;
+       add_taint(TAINT_DIE);
+       spin_unlock_irqrestore(&die.lock, flags);
+
+       if (!regs)
+               return;
+
+       if (kexec_should_crash(current))
+               crash_kexec(regs);
+
+       if (in_interrupt())
+               panic("Fatal exception in interrupt");
+
+       if (panic_on_oops)
+               panic("Fatal exception");
+
+       oops_exit();
+       do_exit(SIGSEGV);
+}
+
+static inline void die_if_kernel(const char * str, struct pt_regs * regs, long err)
+{
+       if (!user_mode_vm(regs))
+               die(str, regs, err);
+}
+
+static void __kprobes do_trap(int trapnr, int signr, char *str, int vm86,
+                             struct pt_regs * regs, long error_code,
+                             siginfo_t *info)
+{
+       struct task_struct *tsk = current;
+
+       if (regs->eflags & VM_MASK) {
+               if (vm86)
+                       goto vm86_trap;
+               goto trap_signal;
+       }
+
+       if (!user_mode(regs))
+               goto kernel_trap;
+
+       trap_signal: {
+               /*
+                * We want error_code and trap_no set for userspace faults and
+                * kernelspace faults which result in die(), but not
+                * kernelspace faults which are fixed up.  die() gives the
+                * process no chance to handle the signal and notice the
+                * kernel fault information, so that won't result in polluting
+                * the information about previously queued, but not yet
+                * delivered, faults.  See also do_general_protection below.
+                */
+               tsk->thread.error_code = error_code;
+               tsk->thread.trap_no = trapnr;
+
+               if (info)
+                       force_sig_info(signr, info, tsk);
+               else
+                       force_sig(signr, tsk);
+               return;
+       }
+
+       kernel_trap: {
+               if (!fixup_exception(regs)) {
+                       tsk->thread.error_code = error_code;
+                       tsk->thread.trap_no = trapnr;
+                       die(str, regs, error_code);
+               }
+               return;
+       }
+
+       vm86_trap: {
+               int ret = handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, trapnr);
+               if (ret) goto trap_signal;
+               return;
+       }
+}
+
+#define DO_ERROR(trapnr, signr, str, name) \
+fastcall void do_##name(struct pt_regs * regs, long error_code) \
+{ \
+       if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
+                                               == NOTIFY_STOP) \
+               return; \
+       do_trap(trapnr, signr, str, 0, regs, error_code, NULL); \
+}
+
+#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr, irq) \
+fastcall void do_##name(struct pt_regs * regs, long error_code) \
+{ \
+       siginfo_t info; \
+       if (irq) \
+               local_irq_enable(); \
+       info.si_signo = signr; \
+       info.si_errno = 0; \
+       info.si_code = sicode; \
+       info.si_addr = (void __user *)siaddr; \
+       if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
+                                               == NOTIFY_STOP) \
+               return; \
+       do_trap(trapnr, signr, str, 0, regs, error_code, &info); \
+}
+
+#define DO_VM86_ERROR(trapnr, signr, str, name) \
+fastcall void do_##name(struct pt_regs * regs, long error_code) \
+{ \
+       if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
+                                               == NOTIFY_STOP) \
+               return; \
+       do_trap(trapnr, signr, str, 1, regs, error_code, NULL); \
+}
+
+#define DO_VM86_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \
+fastcall void do_##name(struct pt_regs * regs, long error_code) \
+{ \
+       siginfo_t info; \
+       info.si_signo = signr; \
+       info.si_errno = 0; \
+       info.si_code = sicode; \
+       info.si_addr = (void __user *)siaddr; \
+       if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
+                                               == NOTIFY_STOP) \
+               return; \
+       do_trap(trapnr, signr, str, 1, regs, error_code, &info); \
+}
+
+DO_VM86_ERROR_INFO( 0, SIGFPE,  "divide error", divide_error, FPE_INTDIV, regs->eip)
+#ifndef CONFIG_KPROBES
+DO_VM86_ERROR( 3, SIGTRAP, "int3", int3)
+#endif
+DO_VM86_ERROR( 4, SIGSEGV, "overflow", overflow)
+DO_VM86_ERROR( 5, SIGSEGV, "bounds", bounds)
+DO_ERROR_INFO( 6, SIGILL,  "invalid opcode", invalid_op, ILL_ILLOPN, regs->eip, 0)
+DO_ERROR( 9, SIGFPE,  "coprocessor segment overrun", coprocessor_segment_overrun)
+DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS)
+DO_ERROR(11, SIGBUS,  "segment not present", segment_not_present)
+DO_ERROR(12, SIGBUS,  "stack segment", stack_segment)
+DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0, 0)
+DO_ERROR_INFO(32, SIGSEGV, "iret exception", iret_error, ILL_BADSTK, 0, 1)
+
+fastcall void __kprobes do_general_protection(struct pt_regs * regs,
+                                             long error_code)
+{
+       int cpu = get_cpu();
+       struct tss_struct *tss = &per_cpu(init_tss, cpu);
+       struct thread_struct *thread = &current->thread;
+
+       /*
+        * Perform the lazy TSS's I/O bitmap copy. If the TSS has an
+        * invalid offset set (the LAZY one) and the faulting thread has
+        * a valid I/O bitmap pointer, we copy the I/O bitmap in the TSS
+        * and we set the offset field correctly. Then we let the CPU to
+        * restart the faulting instruction.
+        */
+       if (tss->x86_tss.io_bitmap_base == INVALID_IO_BITMAP_OFFSET_LAZY &&
+           thread->io_bitmap_ptr) {
+               memcpy(tss->io_bitmap, thread->io_bitmap_ptr,
+                      thread->io_bitmap_max);
+               /*
+                * If the previously set map was extending to higher ports
+                * than the current one, pad extra space with 0xff (no access).
+                */
+               if (thread->io_bitmap_max < tss->io_bitmap_max)
+                       memset((char *) tss->io_bitmap +
+                               thread->io_bitmap_max, 0xff,
+                               tss->io_bitmap_max - thread->io_bitmap_max);
+               tss->io_bitmap_max = thread->io_bitmap_max;
+               tss->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET;
+               tss->io_bitmap_owner = thread;
+               put_cpu();
+               return;
+       }
+       put_cpu();
+
+       if (regs->eflags & VM_MASK)
+               goto gp_in_vm86;
+
+       if (!user_mode(regs))
+               goto gp_in_kernel;
+
+       current->thread.error_code = error_code;
+       current->thread.trap_no = 13;
+       if (show_unhandled_signals && unhandled_signal(current, SIGSEGV) &&
+           printk_ratelimit())
+               printk(KERN_INFO
+                   "%s[%d] general protection eip:%lx esp:%lx error:%lx\n",
+                   current->comm, current->pid,
+                   regs->eip, regs->esp, error_code);
+
+       force_sig(SIGSEGV, current);
+       return;
+
+gp_in_vm86:
+       local_irq_enable();
+       handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code);
+       return;
+
+gp_in_kernel:
+       if (!fixup_exception(regs)) {
+               current->thread.error_code = error_code;
+               current->thread.trap_no = 13;
+               if (notify_die(DIE_GPF, "general protection fault", regs,
+                               error_code, 13, SIGSEGV) == NOTIFY_STOP)
+                       return;
+               die("general protection fault", regs, error_code);
+       }
+}
+
+static __kprobes void
+mem_parity_error(unsigned char reason, struct pt_regs * regs)
+{
+       printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x on "
+               "CPU %d.\n", reason, smp_processor_id());
+       printk(KERN_EMERG "You have some hardware problem, likely on the PCI bus.\n");
+
+#if defined(CONFIG_EDAC)
+       if(edac_handler_set()) {
+               edac_atomic_assert_error();
+               return;
+       }
+#endif
+
+       if (panic_on_unrecovered_nmi)
+                panic("NMI: Not continuing");
+
+       printk(KERN_EMERG "Dazed and confused, but trying to continue\n");
+
+       /* Clear and disable the memory parity error line. */
+       clear_mem_error(reason);
+}
+
+static __kprobes void
+io_check_error(unsigned char reason, struct pt_regs * regs)
+{
+       unsigned long i;
+
+       printk(KERN_EMERG "NMI: IOCK error (debug interrupt?)\n");
+       show_registers(regs);
+
+       /* Re-enable the IOCK line, wait for a few seconds */
+       reason = (reason & 0xf) | 8;
+       outb(reason, 0x61);
+       i = 2000;
+       while (--i) udelay(1000);
+       reason &= ~8;
+       outb(reason, 0x61);
+}
+
+static __kprobes void
+unknown_nmi_error(unsigned char reason, struct pt_regs * regs)
+{
+#ifdef CONFIG_MCA
+       /* Might actually be able to figure out what the guilty party
+       * is. */
+       if( MCA_bus ) {
+               mca_handle_nmi();
+               return;
+       }
+#endif
+       printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x on "
+               "CPU %d.\n", reason, smp_processor_id());
+       printk(KERN_EMERG "Do you have a strange power saving mode enabled?\n");
+       if (panic_on_unrecovered_nmi)
+                panic("NMI: Not continuing");
+
+       printk(KERN_EMERG "Dazed and confused, but trying to continue\n");
+}
+
+static DEFINE_SPINLOCK(nmi_print_lock);
+
+void __kprobes die_nmi(struct pt_regs *regs, const char *msg)
+{
+       if (notify_die(DIE_NMIWATCHDOG, msg, regs, 0, 2, SIGINT) ==
+           NOTIFY_STOP)
+               return;
+
+       spin_lock(&nmi_print_lock);
+       /*
+       * We are in trouble anyway, lets at least try
+       * to get a message out.
+       */
+       bust_spinlocks(1);
+       printk(KERN_EMERG "%s", msg);
+       printk(" on CPU%d, eip %08lx, registers:\n",
+               smp_processor_id(), regs->eip);
+       show_registers(regs);
+       console_silent();
+       spin_unlock(&nmi_print_lock);
+       bust_spinlocks(0);
+
+       /* If we are in kernel we are probably nested up pretty bad
+        * and might aswell get out now while we still can.
+       */
+       if (!user_mode_vm(regs)) {
+               current->thread.trap_no = 2;
+               crash_kexec(regs);
+       }
+
+       do_exit(SIGSEGV);
+}
+
+static __kprobes void default_do_nmi(struct pt_regs * regs)
+{
+       unsigned char reason = 0;
+
+       /* Only the BSP gets external NMIs from the system.  */
+       if (!smp_processor_id())
+               reason = get_nmi_reason();
+       if (!(reason & 0xc0)) {
+               if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 2, SIGINT)
+                                                       == NOTIFY_STOP)
+                       return;
+#ifdef CONFIG_X86_LOCAL_APIC
+               /*
+                * Ok, so this is none of the documented NMI sources,
+                * so it must be the NMI watchdog.
+                */
+               if (nmi_watchdog_tick(regs, reason))
+                       return;
+               if (!do_nmi_callback(regs, smp_processor_id()))
+#endif
+                       unknown_nmi_error(reason, regs);
+
+               return;
+       }
+       if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP)
+               return;
+       if (reason & 0x80)
+               mem_parity_error(reason, regs);
+       if (reason & 0x40)
+               io_check_error(reason, regs);
+       /*
+        * Reassert NMI in case it became active meanwhile
+        * as it's edge-triggered.
+        */
+       reassert_nmi();
+}
+
+static int ignore_nmis;
+
+fastcall __kprobes void do_nmi(struct pt_regs * regs, long error_code)
+{
+       int cpu;
+
+       nmi_enter();
+
+       cpu = smp_processor_id();
+
+       ++nmi_count(cpu);
+
+       if (!ignore_nmis)
+               default_do_nmi(regs);
+
+       nmi_exit();
+}
+
+void stop_nmi(void)
+{
+       acpi_nmi_disable();
+       ignore_nmis++;
+}
+
+void restart_nmi(void)
+{
+       ignore_nmis--;
+       acpi_nmi_enable();
+}
+
+#ifdef CONFIG_KPROBES
+fastcall void __kprobes do_int3(struct pt_regs *regs, long error_code)
+{
+       if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP)
+                       == NOTIFY_STOP)
+               return;
+       /* This is an interrupt gate, because kprobes wants interrupts
+       disabled.  Normal trap handlers don't. */
+       restore_interrupts(regs);
+       do_trap(3, SIGTRAP, "int3", 1, regs, error_code, NULL);
+}
+#endif
+
+/*
+ * Our handling of the processor debug registers is non-trivial.
+ * We do not clear them on entry and exit from the kernel. Therefore
+ * it is possible to get a watchpoint trap here from inside the kernel.
+ * However, the code in ./ptrace.c has ensured that the user can
+ * only set watchpoints on userspace addresses. Therefore the in-kernel
+ * watchpoint trap can only occur in code which is reading/writing
+ * from user space. Such code must not hold kernel locks (since it
+ * can equally take a page fault), therefore it is safe to call
+ * force_sig_info even though that claims and releases locks.
+ * 
+ * Code in ./signal.c ensures that the debug control register
+ * is restored before we deliver any signal, and therefore that
+ * user code runs with the correct debug control register even though
+ * we clear it here.
+ *
+ * Being careful here means that we don't have to be as careful in a
+ * lot of more complicated places (task switching can be a bit lazy
+ * about restoring all the debug state, and ptrace doesn't have to
+ * find every occurrence of the TF bit that could be saved away even
+ * by user code)
+ */
+fastcall void __kprobes do_debug(struct pt_regs * regs, long error_code)
+{
+       unsigned int condition;
+       struct task_struct *tsk = current;
+
+       get_debugreg(condition, 6);
+
+       if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code,
+                                       SIGTRAP) == NOTIFY_STOP)
+               return;
+       /* It's safe to allow irq's after DR6 has been saved */
+       if (regs->eflags & X86_EFLAGS_IF)
+               local_irq_enable();
+
+       /* Mask out spurious debug traps due to lazy DR7 setting */
+       if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) {
+               if (!tsk->thread.debugreg[7])
+                       goto clear_dr7;
+       }
+
+       if (regs->eflags & VM_MASK)
+               goto debug_vm86;
+
+       /* Save debug status register where ptrace can see it */
+       tsk->thread.debugreg[6] = condition;
+
+       /*
+        * Single-stepping through TF: make sure we ignore any events in
+        * kernel space (but re-enable TF when returning to user mode).
+        */
+       if (condition & DR_STEP) {
+               /*
+                * We already checked v86 mode above, so we can
+                * check for kernel mode by just checking the CPL
+                * of CS.
+                */
+               if (!user_mode(regs))
+                       goto clear_TF_reenable;
+       }
+
+       /* Ok, finally something we can handle */
+       send_sigtrap(tsk, regs, error_code);
+
+       /* Disable additional traps. They'll be re-enabled when
+        * the signal is delivered.
+        */
+clear_dr7:
+       set_debugreg(0, 7);
+       return;
+
+debug_vm86:
+       handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, 1);
+       return;
+
+clear_TF_reenable:
+       set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
+       regs->eflags &= ~TF_MASK;
+       return;
+}
+
+/*
+ * Note that we play around with the 'TS' bit in an attempt to get
+ * the correct behaviour even in the presence of the asynchronous
+ * IRQ13 behaviour
+ */
+void math_error(void __user *eip)
+{
+       struct task_struct * task;
+       siginfo_t info;
+       unsigned short cwd, swd;
+
+       /*
+        * Save the info for the exception handler and clear the error.
+        */
+       task = current;
+       save_init_fpu(task);
+       task->thread.trap_no = 16;
+       task->thread.error_code = 0;
+       info.si_signo = SIGFPE;
+       info.si_errno = 0;
+       info.si_code = __SI_FAULT;
+       info.si_addr = eip;
+       /*
+        * (~cwd & swd) will mask out exceptions that are not set to unmasked
+        * status.  0x3f is the exception bits in these regs, 0x200 is the
+        * C1 reg you need in case of a stack fault, 0x040 is the stack
+        * fault bit.  We should only be taking one exception at a time,
+        * so if this combination doesn't produce any single exception,
+        * then we have a bad program that isn't syncronizing its FPU usage
+        * and it will suffer the consequences since we won't be able to
+        * fully reproduce the context of the exception
+        */
+       cwd = get_fpu_cwd(task);
+       swd = get_fpu_swd(task);
+       switch (swd & ~cwd & 0x3f) {
+               case 0x000: /* No unmasked exception */
+                       return;
+               default:    /* Multiple exceptions */
+                       break;
+               case 0x001: /* Invalid Op */
+                       /*
+                        * swd & 0x240 == 0x040: Stack Underflow
+                        * swd & 0x240 == 0x240: Stack Overflow
+                        * User must clear the SF bit (0x40) if set
+                        */
+                       info.si_code = FPE_FLTINV;
+                       break;
+               case 0x002: /* Denormalize */
+               case 0x010: /* Underflow */
+                       info.si_code = FPE_FLTUND;
+                       break;
+               case 0x004: /* Zero Divide */
+                       info.si_code = FPE_FLTDIV;
+                       break;
+               case 0x008: /* Overflow */
+                       info.si_code = FPE_FLTOVF;
+                       break;
+               case 0x020: /* Precision */
+                       info.si_code = FPE_FLTRES;
+                       break;
+       }
+       force_sig_info(SIGFPE, &info, task);
+}
+
+fastcall void do_coprocessor_error(struct pt_regs * regs, long error_code)
+{
+       ignore_fpu_irq = 1;
+       math_error((void __user *)regs->eip);
+}
+
+static void simd_math_error(void __user *eip)
+{
+       struct task_struct * task;
+       siginfo_t info;
+       unsigned short mxcsr;
+
+       /*
+        * Save the info for the exception handler and clear the error.
+        */
+       task = current;
+       save_init_fpu(task);
+       task->thread.trap_no = 19;
+       task->thread.error_code = 0;
+       info.si_signo = SIGFPE;
+       info.si_errno = 0;
+       info.si_code = __SI_FAULT;
+       info.si_addr = eip;
+       /*
+        * The SIMD FPU exceptions are handled a little differently, as there
+        * is only a single status/control register.  Thus, to determine which
+        * unmasked exception was caught we must mask the exception mask bits
+        * at 0x1f80, and then use these to mask the exception bits at 0x3f.
+        */
+       mxcsr = get_fpu_mxcsr(task);
+       switch (~((mxcsr & 0x1f80) >> 7) & (mxcsr & 0x3f)) {
+               case 0x000:
+               default:
+                       break;
+               case 0x001: /* Invalid Op */
+                       info.si_code = FPE_FLTINV;
+                       break;
+               case 0x002: /* Denormalize */
+               case 0x010: /* Underflow */
+                       info.si_code = FPE_FLTUND;
+                       break;
+               case 0x004: /* Zero Divide */
+                       info.si_code = FPE_FLTDIV;
+                       break;
+               case 0x008: /* Overflow */
+                       info.si_code = FPE_FLTOVF;
+                       break;
+               case 0x020: /* Precision */
+                       info.si_code = FPE_FLTRES;
+                       break;
+       }
+       force_sig_info(SIGFPE, &info, task);
+}
+
+fastcall void do_simd_coprocessor_error(struct pt_regs * regs,
+                                         long error_code)
+{
+       if (cpu_has_xmm) {
+               /* Handle SIMD FPU exceptions on PIII+ processors. */
+               ignore_fpu_irq = 1;
+               simd_math_error((void __user *)regs->eip);
+       } else {
+               /*
+                * Handle strange cache flush from user space exception
+                * in all other cases.  This is undocumented behaviour.
+                */
+               if (regs->eflags & VM_MASK) {
+                       handle_vm86_fault((struct kernel_vm86_regs *)regs,
+                                         error_code);
+                       return;
+               }
+               current->thread.trap_no = 19;
+               current->thread.error_code = error_code;
+               die_if_kernel("cache flush denied", regs, error_code);
+               force_sig(SIGSEGV, current);
+       }
+}
+
+fastcall void do_spurious_interrupt_bug(struct pt_regs * regs,
+                                         long error_code)
+{
+#if 0
+       /* No need to warn about this any longer. */
+       printk("Ignoring P6 Local APIC Spurious Interrupt Bug...\n");
+#endif
+}
+
+fastcall unsigned long patch_espfix_desc(unsigned long uesp,
+                                         unsigned long kesp)
+{
+       struct desc_struct *gdt = __get_cpu_var(gdt_page).gdt;
+       unsigned long base = (kesp - uesp) & -THREAD_SIZE;
+       unsigned long new_kesp = kesp - base;
+       unsigned long lim_pages = (new_kesp | (THREAD_SIZE - 1)) >> PAGE_SHIFT;
+       __u64 desc = *(__u64 *)&gdt[GDT_ENTRY_ESPFIX_SS];
+       /* Set up base for espfix segment */
+       desc &= 0x00f0ff0000000000ULL;
+       desc |= ((((__u64)base) << 16) & 0x000000ffffff0000ULL) |
+               ((((__u64)base) << 32) & 0xff00000000000000ULL) |
+               ((((__u64)lim_pages) << 32) & 0x000f000000000000ULL) |
+               (lim_pages & 0xffff);
+       *(__u64 *)&gdt[GDT_ENTRY_ESPFIX_SS] = desc;
+       return new_kesp;
+}
+
+/*
+ *  'math_state_restore()' saves the current math information in the
+ * old math state array, and gets the new ones from the current task
+ *
+ * Careful.. There are problems with IBM-designed IRQ13 behaviour.
+ * Don't touch unless you *really* know how it works.
+ *
+ * Must be called with kernel preemption disabled (in this case,
+ * local interrupts are disabled at the call-site in entry.S).
+ */
+asmlinkage void math_state_restore(void)
+{
+       struct thread_info *thread = current_thread_info();
+       struct task_struct *tsk = thread->task;
+
+       clts();         /* Allow maths ops (or we recurse) */
+       if (!tsk_used_math(tsk))
+               init_fpu(tsk);
+       restore_fpu(tsk);
+       thread->status |= TS_USEDFPU;   /* So we fnsave on switch_to() */
+       tsk->fpu_counter++;
+}
+EXPORT_SYMBOL_GPL(math_state_restore);
+
+#ifndef CONFIG_MATH_EMULATION
+
+asmlinkage void math_emulate(long arg)
+{
+       printk(KERN_EMERG "math-emulation not enabled and no coprocessor found.\n");
+       printk(KERN_EMERG "killing %s.\n",current->comm);
+       force_sig(SIGFPE,current);
+       schedule();
+}
+
+#endif /* CONFIG_MATH_EMULATION */
+
+#ifdef CONFIG_X86_F00F_BUG
+void __init trap_init_f00f_bug(void)
+{
+       __set_fixmap(FIX_F00F_IDT, __pa(&idt_table), PAGE_KERNEL_RO);
+
+       /*
+        * Update the IDT descriptor and reload the IDT so that
+        * it uses the read-only mapped virtual address.
+        */
+       idt_descr.address = fix_to_virt(FIX_F00F_IDT);
+       load_idt(&idt_descr);
+}
+#endif
+
+/*
+ * This needs to use 'idt_table' rather than 'idt', and
+ * thus use the _nonmapped_ version of the IDT, as the
+ * Pentium F0 0F bugfix can have resulted in the mapped
+ * IDT being write-protected.
+ */
+void set_intr_gate(unsigned int n, void *addr)
+{
+       _set_gate(n, DESCTYPE_INT, addr, __KERNEL_CS);
+}
+
+/*
+ * This routine sets up an interrupt gate at directory privilege level 3.
+ */
+static inline void set_system_intr_gate(unsigned int n, void *addr)
+{
+       _set_gate(n, DESCTYPE_INT | DESCTYPE_DPL3, addr, __KERNEL_CS);
+}
+
+static void __init set_trap_gate(unsigned int n, void *addr)
+{
+       _set_gate(n, DESCTYPE_TRAP, addr, __KERNEL_CS);
+}
+
+static void __init set_system_gate(unsigned int n, void *addr)
+{
+       _set_gate(n, DESCTYPE_TRAP | DESCTYPE_DPL3, addr, __KERNEL_CS);
+}
+
+static void __init set_task_gate(unsigned int n, unsigned int gdt_entry)
+{
+       _set_gate(n, DESCTYPE_TASK, (void *)0, (gdt_entry<<3));
+}
+
+
+void __init trap_init(void)
+{
+#ifdef CONFIG_EISA
+       void __iomem *p = ioremap(0x0FFFD9, 4);
+       if (readl(p) == 'E'+('I'<<8)+('S'<<16)+('A'<<24)) {
+               EISA_bus = 1;
+       }
+       iounmap(p);
+#endif
+
+#ifdef CONFIG_X86_LOCAL_APIC
+       init_apic_mappings();
+#endif
+
+       set_trap_gate(0,&divide_error);
+       set_intr_gate(1,&debug);
+       set_intr_gate(2,&nmi);
+       set_system_intr_gate(3, &int3); /* int3/4 can be called from all */
+       set_system_gate(4,&overflow);
+       set_trap_gate(5,&bounds);
+       set_trap_gate(6,&invalid_op);
+       set_trap_gate(7,&device_not_available);
+       set_task_gate(8,GDT_ENTRY_DOUBLEFAULT_TSS);
+       set_trap_gate(9,&coprocessor_segment_overrun);
+       set_trap_gate(10,&invalid_TSS);
+       set_trap_gate(11,&segment_not_present);
+       set_trap_gate(12,&stack_segment);
+       set_trap_gate(13,&general_protection);
+       set_intr_gate(14,&page_fault);
+       set_trap_gate(15,&spurious_interrupt_bug);
+       set_trap_gate(16,&coprocessor_error);
+       set_trap_gate(17,&alignment_check);
+#ifdef CONFIG_X86_MCE
+       set_trap_gate(18,&machine_check);
+#endif
+       set_trap_gate(19,&simd_coprocessor_error);
+
+       if (cpu_has_fxsr) {
+               /*
+                * Verify that the FXSAVE/FXRSTOR data will be 16-byte aligned.
+                * Generates a compile-time "error: zero width for bit-field" if
+                * the alignment is wrong.
+                */
+               struct fxsrAlignAssert {
+                       int _:!(offsetof(struct task_struct,
+                                       thread.i387.fxsave) & 15);
+               };
+
+               printk(KERN_INFO "Enabling fast FPU save and restore... ");
+               set_in_cr4(X86_CR4_OSFXSR);
+               printk("done.\n");
+       }
+       if (cpu_has_xmm) {
+               printk(KERN_INFO "Enabling unmasked SIMD FPU exception "
+                               "support... ");
+               set_in_cr4(X86_CR4_OSXMMEXCPT);
+               printk("done.\n");
+       }
+
+       set_system_gate(SYSCALL_VECTOR,&system_call);
+
+       /*
+        * Should be a barrier for any external CPU state.
+        */
+       cpu_init();
+
+       trap_init_hook();
+}
+
+static int __init kstack_setup(char *s)
+{
+       kstack_depth_to_print = simple_strtoul(s, NULL, 0);
+       return 1;
+}
+__setup("kstack=", kstack_setup);
+
+static int __init code_bytes_setup(char *s)
+{
+       code_bytes = simple_strtoul(s, NULL, 0);
+       if (code_bytes > 8192)
+               code_bytes = 8192;
+
+       return 1;
+}
+__setup("code_bytes=", code_bytes_setup);
diff --git a/arch/x86/kernel/tsc_32.c b/arch/x86/kernel/tsc_32.c
new file mode 100644 (file)
index 0000000..a39280b
--- /dev/null
@@ -0,0 +1,413 @@
+/*
+ * This code largely moved from arch/i386/kernel/timer/timer_tsc.c
+ * which was originally moved from arch/i386/kernel/time.c.
+ * See comments there for proper credits.
+ */
+
+#include <linux/sched.h>
+#include <linux/clocksource.h>
+#include <linux/workqueue.h>
+#include <linux/cpufreq.h>
+#include <linux/jiffies.h>
+#include <linux/init.h>
+#include <linux/dmi.h>
+
+#include <asm/delay.h>
+#include <asm/tsc.h>
+#include <asm/io.h>
+#include <asm/timer.h>
+
+#include "mach_timer.h"
+
+static int tsc_enabled;
+
+/*
+ * On some systems the TSC frequency does not
+ * change with the cpu frequency. So we need
+ * an extra value to store the TSC freq
+ */
+unsigned int tsc_khz;
+EXPORT_SYMBOL_GPL(tsc_khz);
+
+int tsc_disable;
+
+#ifdef CONFIG_X86_TSC
+static int __init tsc_setup(char *str)
+{
+       printk(KERN_WARNING "notsc: Kernel compiled with CONFIG_X86_TSC, "
+                               "cannot disable TSC.\n");
+       return 1;
+}
+#else
+/*
+ * disable flag for tsc. Takes effect by clearing the TSC cpu flag
+ * in cpu/common.c
+ */
+static int __init tsc_setup(char *str)
+{
+       tsc_disable = 1;
+
+       return 1;
+}
+#endif
+
+__setup("notsc", tsc_setup);
+
+/*
+ * code to mark and check if the TSC is unstable
+ * due to cpufreq or due to unsynced TSCs
+ */
+static int tsc_unstable;
+
+int check_tsc_unstable(void)
+{
+       return tsc_unstable;
+}
+EXPORT_SYMBOL_GPL(check_tsc_unstable);
+
+/* Accellerators for sched_clock()
+ * convert from cycles(64bits) => nanoseconds (64bits)
+ *  basic equation:
+ *             ns = cycles / (freq / ns_per_sec)
+ *             ns = cycles * (ns_per_sec / freq)
+ *             ns = cycles * (10^9 / (cpu_khz * 10^3))
+ *             ns = cycles * (10^6 / cpu_khz)
+ *
+ *     Then we use scaling math (suggested by george@mvista.com) to get:
+ *             ns = cycles * (10^6 * SC / cpu_khz) / SC
+ *             ns = cycles * cyc2ns_scale / SC
+ *
+ *     And since SC is a constant power of two, we can convert the div
+ *  into a shift.
+ *
+ *  We can use khz divisor instead of mhz to keep a better percision, since
+ *  cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits.
+ *  (mathieu.desnoyers@polymtl.ca)
+ *
+ *                     -johnstul@us.ibm.com "math is hard, lets go shopping!"
+ */
+unsigned long cyc2ns_scale __read_mostly;
+
+#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */
+
+static inline void set_cyc2ns_scale(unsigned long cpu_khz)
+{
+       cyc2ns_scale = (1000000 << CYC2NS_SCALE_FACTOR)/cpu_khz;
+}
+
+/*
+ * Scheduler clock - returns current time in nanosec units.
+ */
+unsigned long long native_sched_clock(void)
+{
+       unsigned long long this_offset;
+
+       /*
+        * Fall back to jiffies if there's no TSC available:
+        * ( But note that we still use it if the TSC is marked
+        *   unstable. We do this because unlike Time Of Day,
+        *   the scheduler clock tolerates small errors and it's
+        *   very important for it to be as fast as the platform
+        *   can achive it. )
+        */
+       if (unlikely(!tsc_enabled && !tsc_unstable))
+               /* No locking but a rare wrong value is not a big deal: */
+               return (jiffies_64 - INITIAL_JIFFIES) * (1000000000 / HZ);
+
+       /* read the Time Stamp Counter: */
+       rdtscll(this_offset);
+
+       /* return the value in ns */
+       return cycles_2_ns(this_offset);
+}
+
+/* We need to define a real function for sched_clock, to override the
+   weak default version */
+#ifdef CONFIG_PARAVIRT
+unsigned long long sched_clock(void)
+{
+       return paravirt_sched_clock();
+}
+#else
+unsigned long long sched_clock(void)
+       __attribute__((alias("native_sched_clock")));
+#endif
+
+unsigned long native_calculate_cpu_khz(void)
+{
+       unsigned long long start, end;
+       unsigned long count;
+       u64 delta64;
+       int i;
+       unsigned long flags;
+
+       local_irq_save(flags);
+
+       /* run 3 times to ensure the cache is warm */
+       for (i = 0; i < 3; i++) {
+               mach_prepare_counter();
+               rdtscll(start);
+               mach_countup(&count);
+               rdtscll(end);
+       }
+       /*
+        * Error: ECTCNEVERSET
+        * The CTC wasn't reliable: we got a hit on the very first read,
+        * or the CPU was so fast/slow that the quotient wouldn't fit in
+        * 32 bits..
+        */
+       if (count <= 1)
+               goto err;
+
+       delta64 = end - start;
+
+       /* cpu freq too fast: */
+       if (delta64 > (1ULL<<32))
+               goto err;
+
+       /* cpu freq too slow: */
+       if (delta64 <= CALIBRATE_TIME_MSEC)
+               goto err;
+
+       delta64 += CALIBRATE_TIME_MSEC/2; /* round for do_div */
+       do_div(delta64,CALIBRATE_TIME_MSEC);
+
+       local_irq_restore(flags);
+       return (unsigned long)delta64;
+err:
+       local_irq_restore(flags);
+       return 0;
+}
+
+int recalibrate_cpu_khz(void)
+{
+#ifndef CONFIG_SMP
+       unsigned long cpu_khz_old = cpu_khz;
+
+       if (cpu_has_tsc) {
+               cpu_khz = calculate_cpu_khz();
+               tsc_khz = cpu_khz;
+               cpu_data[0].loops_per_jiffy =
+                       cpufreq_scale(cpu_data[0].loops_per_jiffy,
+                                       cpu_khz_old, cpu_khz);
+               return 0;
+       } else
+               return -ENODEV;
+#else
+       return -ENODEV;
+#endif
+}
+
+EXPORT_SYMBOL(recalibrate_cpu_khz);
+
+#ifdef CONFIG_CPU_FREQ
+
+/*
+ * if the CPU frequency is scaled, TSC-based delays will need a different
+ * loops_per_jiffy value to function properly.
+ */
+static unsigned int ref_freq = 0;
+static unsigned long loops_per_jiffy_ref = 0;
+static unsigned long cpu_khz_ref = 0;
+
+static int
+time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, void *data)
+{
+       struct cpufreq_freqs *freq = data;
+
+       if (!ref_freq) {
+               if (!freq->old){
+                       ref_freq = freq->new;
+                       return 0;
+               }
+               ref_freq = freq->old;
+               loops_per_jiffy_ref = cpu_data[freq->cpu].loops_per_jiffy;
+               cpu_khz_ref = cpu_khz;
+       }
+
+       if ((val == CPUFREQ_PRECHANGE  && freq->old < freq->new) ||
+           (val == CPUFREQ_POSTCHANGE && freq->old > freq->new) ||
+           (val == CPUFREQ_RESUMECHANGE)) {
+               if (!(freq->flags & CPUFREQ_CONST_LOOPS))
+                       cpu_data[freq->cpu].loops_per_jiffy =
+                               cpufreq_scale(loops_per_jiffy_ref,
+                                               ref_freq, freq->new);
+
+               if (cpu_khz) {
+
+                       if (num_online_cpus() == 1)
+                               cpu_khz = cpufreq_scale(cpu_khz_ref,
+                                               ref_freq, freq->new);
+                       if (!(freq->flags & CPUFREQ_CONST_LOOPS)) {
+                               tsc_khz = cpu_khz;
+                               set_cyc2ns_scale(cpu_khz);
+                               /*
+                                * TSC based sched_clock turns
+                                * to junk w/ cpufreq
+                                */
+                               mark_tsc_unstable("cpufreq changes");
+                       }
+               }
+       }
+
+       return 0;
+}
+
+static struct notifier_block time_cpufreq_notifier_block = {
+       .notifier_call  = time_cpufreq_notifier
+};
+
+static int __init cpufreq_tsc(void)
+{
+       return cpufreq_register_notifier(&time_cpufreq_notifier_block,
+                                        CPUFREQ_TRANSITION_NOTIFIER);
+}
+core_initcall(cpufreq_tsc);
+
+#endif
+
+/* clock source code */
+
+static unsigned long current_tsc_khz = 0;
+
+static cycle_t read_tsc(void)
+{
+       cycle_t ret;
+
+       rdtscll(ret);
+
+       return ret;
+}
+
+static struct clocksource clocksource_tsc = {
+       .name                   = "tsc",
+       .rating                 = 300,
+       .read                   = read_tsc,
+       .mask                   = CLOCKSOURCE_MASK(64),
+       .mult                   = 0, /* to be set */
+       .shift                  = 22,
+       .flags                  = CLOCK_SOURCE_IS_CONTINUOUS |
+                                 CLOCK_SOURCE_MUST_VERIFY,
+};
+
+void mark_tsc_unstable(char *reason)
+{
+       if (!tsc_unstable) {
+               tsc_unstable = 1;
+               tsc_enabled = 0;
+               printk("Marking TSC unstable due to: %s.\n", reason);
+               /* Can be called before registration */
+               if (clocksource_tsc.mult)
+                       clocksource_change_rating(&clocksource_tsc, 0);
+               else
+                       clocksource_tsc.rating = 0;
+       }
+}
+EXPORT_SYMBOL_GPL(mark_tsc_unstable);
+
+static int __init dmi_mark_tsc_unstable(struct dmi_system_id *d)
+{
+       printk(KERN_NOTICE "%s detected: marking TSC unstable.\n",
+                      d->ident);
+       tsc_unstable = 1;
+       return 0;
+}
+
+/* List of systems that have known TSC problems */
+static struct dmi_system_id __initdata bad_tsc_dmi_table[] = {
+       {
+        .callback = dmi_mark_tsc_unstable,
+        .ident = "IBM Thinkpad 380XD",
+        .matches = {
+                    DMI_MATCH(DMI_BOARD_VENDOR, "IBM"),
+                    DMI_MATCH(DMI_BOARD_NAME, "2635FA0"),
+                    },
+        },
+        {}
+};
+
+/*
+ * Make an educated guess if the TSC is trustworthy and synchronized
+ * over all CPUs.
+ */
+__cpuinit int unsynchronized_tsc(void)
+{
+       if (!cpu_has_tsc || tsc_unstable)
+               return 1;
+       /*
+        * Intel systems are normally all synchronized.
+        * Exceptions must mark TSC as unstable:
+        */
+       if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) {
+               /* assume multi socket systems are not synchronized: */
+               if (num_possible_cpus() > 1)
+                       tsc_unstable = 1;
+       }
+       return tsc_unstable;
+}
+
+/*
+ * Geode_LX - the OLPC CPU has a possibly a very reliable TSC
+ */
+#ifdef CONFIG_MGEODE_LX
+/* RTSC counts during suspend */
+#define RTSC_SUSP 0x100
+
+static void __init check_geode_tsc_reliable(void)
+{
+       unsigned long val;
+
+       rdmsrl(MSR_GEODE_BUSCONT_CONF0, val);
+       if ((val & RTSC_SUSP))
+               clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY;
+}
+#else
+static inline void check_geode_tsc_reliable(void) { }
+#endif
+
+
+void __init tsc_init(void)
+{
+       if (!cpu_has_tsc || tsc_disable)
+               goto out_no_tsc;
+
+       cpu_khz = calculate_cpu_khz();
+       tsc_khz = cpu_khz;
+
+       if (!cpu_khz)
+               goto out_no_tsc;
+
+       printk("Detected %lu.%03lu MHz processor.\n",
+                               (unsigned long)cpu_khz / 1000,
+                               (unsigned long)cpu_khz % 1000);
+
+       set_cyc2ns_scale(cpu_khz);
+       use_tsc_delay();
+
+       /* Check and install the TSC clocksource */
+       dmi_check_system(bad_tsc_dmi_table);
+
+       unsynchronized_tsc();
+       check_geode_tsc_reliable();
+       current_tsc_khz = tsc_khz;
+       clocksource_tsc.mult = clocksource_khz2mult(current_tsc_khz,
+                                                       clocksource_tsc.shift);
+       /* lower the rating if we already know its unstable: */
+       if (check_tsc_unstable()) {
+               clocksource_tsc.rating = 0;
+               clocksource_tsc.flags &= ~CLOCK_SOURCE_IS_CONTINUOUS;
+       } else
+               tsc_enabled = 1;
+
+       clocksource_register(&clocksource_tsc);
+
+       return;
+
+out_no_tsc:
+       /*
+        * Set the tsc_disable flag if there's no TSC support, this
+        * makes it a fast flag for the kernel to see whether it
+        * should be using the TSC.
+        */
+       tsc_disable = 1;
+}
diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c
new file mode 100644 (file)
index 0000000..1242462
--- /dev/null
@@ -0,0 +1 @@
+#include "../../x86_64/kernel/tsc_sync.c"
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
new file mode 100644 (file)
index 0000000..f2dcd1d
--- /dev/null
@@ -0,0 +1,843 @@
+/*
+ *  linux/kernel/vm86.c
+ *
+ *  Copyright (C) 1994  Linus Torvalds
+ *
+ *  29 dec 2001 - Fixed oopses caused by unchecked access to the vm86
+ *                stack - Manfred Spraul <manfred@colorfullife.com>
+ *
+ *  22 mar 2002 - Manfred detected the stackfaults, but didn't handle
+ *                them correctly. Now the emulation will be in a
+ *                consistent state after stackfaults - Kasper Dupont
+ *                <kasperd@daimi.au.dk>
+ *
+ *  22 mar 2002 - Added missing clear_IF in set_vflags_* Kasper Dupont
+ *                <kasperd@daimi.au.dk>
+ *
+ *  ?? ??? 2002 - Fixed premature returns from handle_vm86_fault
+ *                caused by Kasper Dupont's changes - Stas Sergeev
+ *
+ *   4 apr 2002 - Fixed CHECK_IF_IN_TRAP broken by Stas' changes.
+ *                Kasper Dupont <kasperd@daimi.au.dk>
+ *
+ *   9 apr 2002 - Changed syntax of macros in handle_vm86_fault.
+ *                Kasper Dupont <kasperd@daimi.au.dk>
+ *
+ *   9 apr 2002 - Changed stack access macros to jump to a label
+ *                instead of returning to userspace. This simplifies
+ *                do_int, and is needed by handle_vm6_fault. Kasper
+ *                Dupont <kasperd@daimi.au.dk>
+ *
+ */
+
+#include <linux/capability.h>
+#include <linux/errno.h>
+#include <linux/interrupt.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/signal.h>
+#include <linux/string.h>
+#include <linux/mm.h>
+#include <linux/smp.h>
+#include <linux/highmem.h>
+#include <linux/ptrace.h>
+#include <linux/audit.h>
+#include <linux/stddef.h>
+
+#include <asm/uaccess.h>
+#include <asm/io.h>
+#include <asm/tlbflush.h>
+#include <asm/irq.h>
+
+/*
+ * Known problems:
+ *
+ * Interrupt handling is not guaranteed:
+ * - a real x86 will disable all interrupts for one instruction
+ *   after a "mov ss,xx" to make stack handling atomic even without
+ *   the 'lss' instruction. We can't guarantee this in v86 mode,
+ *   as the next instruction might result in a page fault or similar.
+ * - a real x86 will have interrupts disabled for one instruction
+ *   past the 'sti' that enables them. We don't bother with all the
+ *   details yet.
+ *
+ * Let's hope these problems do not actually matter for anything.
+ */
+
+
+#define KVM86  ((struct kernel_vm86_struct *)regs)
+#define VMPI   KVM86->vm86plus
+
+
+/*
+ * 8- and 16-bit register defines..
+ */
+#define AL(regs)       (((unsigned char *)&((regs)->pt.eax))[0])
+#define AH(regs)       (((unsigned char *)&((regs)->pt.eax))[1])
+#define IP(regs)       (*(unsigned short *)&((regs)->pt.eip))
+#define SP(regs)       (*(unsigned short *)&((regs)->pt.esp))
+
+/*
+ * virtual flags (16 and 32-bit versions)
+ */
+#define VFLAGS (*(unsigned short *)&(current->thread.v86flags))
+#define VEFLAGS        (current->thread.v86flags)
+
+#define set_flags(X,new,mask) \
+((X) = ((X) & ~(mask)) | ((new) & (mask)))
+
+#define SAFE_MASK      (0xDD5)
+#define RETURN_MASK    (0xDFF)
+
+/* convert kernel_vm86_regs to vm86_regs */
+static int copy_vm86_regs_to_user(struct vm86_regs __user *user,
+                                 const struct kernel_vm86_regs *regs)
+{
+       int ret = 0;
+
+       /* kernel_vm86_regs is missing xgs, so copy everything up to
+          (but not including) orig_eax, and then rest including orig_eax. */
+       ret += copy_to_user(user, regs, offsetof(struct kernel_vm86_regs, pt.orig_eax));
+       ret += copy_to_user(&user->orig_eax, &regs->pt.orig_eax,
+                           sizeof(struct kernel_vm86_regs) -
+                           offsetof(struct kernel_vm86_regs, pt.orig_eax));
+
+       return ret;
+}
+
+/* convert vm86_regs to kernel_vm86_regs */
+static int copy_vm86_regs_from_user(struct kernel_vm86_regs *regs,
+                                   const struct vm86_regs __user *user,
+                                   unsigned extra)
+{
+       int ret = 0;
+
+       /* copy eax-xfs inclusive */
+       ret += copy_from_user(regs, user, offsetof(struct kernel_vm86_regs, pt.orig_eax));
+       /* copy orig_eax-__gsh+extra */
+       ret += copy_from_user(&regs->pt.orig_eax, &user->orig_eax,
+                             sizeof(struct kernel_vm86_regs) -
+                             offsetof(struct kernel_vm86_regs, pt.orig_eax) +
+                             extra);
+       return ret;
+}
+
+struct pt_regs * FASTCALL(save_v86_state(struct kernel_vm86_regs * regs));
+struct pt_regs * fastcall save_v86_state(struct kernel_vm86_regs * regs)
+{
+       struct tss_struct *tss;
+       struct pt_regs *ret;
+       unsigned long tmp;
+
+       /*
+        * This gets called from entry.S with interrupts disabled, but
+        * from process context. Enable interrupts here, before trying
+        * to access user space.
+        */
+       local_irq_enable();
+
+       if (!current->thread.vm86_info) {
+               printk("no vm86_info: BAD\n");
+               do_exit(SIGSEGV);
+       }
+       set_flags(regs->pt.eflags, VEFLAGS, VIF_MASK | current->thread.v86mask);
+       tmp = copy_vm86_regs_to_user(&current->thread.vm86_info->regs,regs);
+       tmp += put_user(current->thread.screen_bitmap,&current->thread.vm86_info->screen_bitmap);
+       if (tmp) {
+               printk("vm86: could not access userspace vm86_info\n");
+               do_exit(SIGSEGV);
+       }
+
+       tss = &per_cpu(init_tss, get_cpu());
+       current->thread.esp0 = current->thread.saved_esp0;
+       current->thread.sysenter_cs = __KERNEL_CS;
+       load_esp0(tss, &current->thread);
+       current->thread.saved_esp0 = 0;
+       put_cpu();
+
+       ret = KVM86->regs32;
+
+       ret->xfs = current->thread.saved_fs;
+       loadsegment(gs, current->thread.saved_gs);
+
+       return ret;
+}
+
+static void mark_screen_rdonly(struct mm_struct *mm)
+{
+       pgd_t *pgd;
+       pud_t *pud;
+       pmd_t *pmd;
+       pte_t *pte;
+       spinlock_t *ptl;
+       int i;
+
+       pgd = pgd_offset(mm, 0xA0000);
+       if (pgd_none_or_clear_bad(pgd))
+               goto out;
+       pud = pud_offset(pgd, 0xA0000);
+       if (pud_none_or_clear_bad(pud))
+               goto out;
+       pmd = pmd_offset(pud, 0xA0000);
+       if (pmd_none_or_clear_bad(pmd))
+               goto out;
+       pte = pte_offset_map_lock(mm, pmd, 0xA0000, &ptl);
+       for (i = 0; i < 32; i++) {
+               if (pte_present(*pte))
+                       set_pte(pte, pte_wrprotect(*pte));
+               pte++;
+       }
+       pte_unmap_unlock(pte, ptl);
+out:
+       flush_tlb();
+}
+
+
+
+static int do_vm86_irq_handling(int subfunction, int irqnumber);
+static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk);
+
+asmlinkage int sys_vm86old(struct pt_regs regs)
+{
+       struct vm86_struct __user *v86 = (struct vm86_struct __user *)regs.ebx;
+       struct kernel_vm86_struct info; /* declare this _on top_,
+                                        * this avoids wasting of stack space.
+                                        * This remains on the stack until we
+                                        * return to 32 bit user space.
+                                        */
+       struct task_struct *tsk;
+       int tmp, ret = -EPERM;
+
+       tsk = current;
+       if (tsk->thread.saved_esp0)
+               goto out;
+       tmp = copy_vm86_regs_from_user(&info.regs, &v86->regs,
+                                      offsetof(struct kernel_vm86_struct, vm86plus) -
+                                      sizeof(info.regs));
+       ret = -EFAULT;
+       if (tmp)
+               goto out;
+       memset(&info.vm86plus, 0, (int)&info.regs32 - (int)&info.vm86plus);
+       info.regs32 = &regs;
+       tsk->thread.vm86_info = v86;
+       do_sys_vm86(&info, tsk);
+       ret = 0;        /* we never return here */
+out:
+       return ret;
+}
+
+
+asmlinkage int sys_vm86(struct pt_regs regs)
+{
+       struct kernel_vm86_struct info; /* declare this _on top_,
+                                        * this avoids wasting of stack space.
+                                        * This remains on the stack until we
+                                        * return to 32 bit user space.
+                                        */
+       struct task_struct *tsk;
+       int tmp, ret;
+       struct vm86plus_struct __user *v86;
+
+       tsk = current;
+       switch (regs.ebx) {
+               case VM86_REQUEST_IRQ:
+               case VM86_FREE_IRQ:
+               case VM86_GET_IRQ_BITS:
+               case VM86_GET_AND_RESET_IRQ:
+                       ret = do_vm86_irq_handling(regs.ebx, (int)regs.ecx);
+                       goto out;
+               case VM86_PLUS_INSTALL_CHECK:
+                       /* NOTE: on old vm86 stuff this will return the error
+                          from access_ok(), because the subfunction is
+                          interpreted as (invalid) address to vm86_struct.
+                          So the installation check works.
+                        */
+                       ret = 0;
+                       goto out;
+       }
+
+       /* we come here only for functions VM86_ENTER, VM86_ENTER_NO_BYPASS */
+       ret = -EPERM;
+       if (tsk->thread.saved_esp0)
+               goto out;
+       v86 = (struct vm86plus_struct __user *)regs.ecx;
+       tmp = copy_vm86_regs_from_user(&info.regs, &v86->regs,
+                                      offsetof(struct kernel_vm86_struct, regs32) -
+                                      sizeof(info.regs));
+       ret = -EFAULT;
+       if (tmp)
+               goto out;
+       info.regs32 = &regs;
+       info.vm86plus.is_vm86pus = 1;
+       tsk->thread.vm86_info = (struct vm86_struct __user *)v86;
+       do_sys_vm86(&info, tsk);
+       ret = 0;        /* we never return here */
+out:
+       return ret;
+}
+
+
+static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk)
+{
+       struct tss_struct *tss;
+/*
+ * make sure the vm86() system call doesn't try to do anything silly
+ */
+       info->regs.pt.xds = 0;
+       info->regs.pt.xes = 0;
+       info->regs.pt.xfs = 0;
+
+/* we are clearing gs later just before "jmp resume_userspace",
+ * because it is not saved/restored.
+ */
+
+/*
+ * The eflags register is also special: we cannot trust that the user
+ * has set it up safely, so this makes sure interrupt etc flags are
+ * inherited from protected mode.
+ */
+       VEFLAGS = info->regs.pt.eflags;
+       info->regs.pt.eflags &= SAFE_MASK;
+       info->regs.pt.eflags |= info->regs32->eflags & ~SAFE_MASK;
+       info->regs.pt.eflags |= VM_MASK;
+
+       switch (info->cpu_type) {
+               case CPU_286:
+                       tsk->thread.v86mask = 0;
+                       break;
+               case CPU_386:
+                       tsk->thread.v86mask = NT_MASK | IOPL_MASK;
+                       break;
+               case CPU_486:
+                       tsk->thread.v86mask = AC_MASK | NT_MASK | IOPL_MASK;
+                       break;
+               default:
+                       tsk->thread.v86mask = ID_MASK | AC_MASK | NT_MASK | IOPL_MASK;
+                       break;
+       }
+
+/*
+ * Save old state, set default return value (%eax) to 0
+ */
+       info->regs32->eax = 0;
+       tsk->thread.saved_esp0 = tsk->thread.esp0;
+       tsk->thread.saved_fs = info->regs32->xfs;
+       savesegment(gs, tsk->thread.saved_gs);
+
+       tss = &per_cpu(init_tss, get_cpu());
+       tsk->thread.esp0 = (unsigned long) &info->VM86_TSS_ESP0;
+       if (cpu_has_sep)
+               tsk->thread.sysenter_cs = 0;
+       load_esp0(tss, &tsk->thread);
+       put_cpu();
+
+       tsk->thread.screen_bitmap = info->screen_bitmap;
+       if (info->flags & VM86_SCREEN_BITMAP)
+               mark_screen_rdonly(tsk->mm);
+
+       /*call audit_syscall_exit since we do not exit via the normal paths */
+       if (unlikely(current->audit_context))
+               audit_syscall_exit(AUDITSC_RESULT(0), 0);
+
+       __asm__ __volatile__(
+               "movl %0,%%esp\n\t"
+               "movl %1,%%ebp\n\t"
+               "mov  %2, %%gs\n\t"
+               "jmp resume_userspace"
+               : /* no outputs */
+               :"r" (&info->regs), "r" (task_thread_info(tsk)), "r" (0));
+       /* we never return here */
+}
+
+static inline void return_to_32bit(struct kernel_vm86_regs * regs16, int retval)
+{
+       struct pt_regs * regs32;
+
+       regs32 = save_v86_state(regs16);
+       regs32->eax = retval;
+       __asm__ __volatile__("movl %0,%%esp\n\t"
+               "movl %1,%%ebp\n\t"
+               "jmp resume_userspace"
+               : : "r" (regs32), "r" (current_thread_info()));
+}
+
+static inline void set_IF(struct kernel_vm86_regs * regs)
+{
+       VEFLAGS |= VIF_MASK;
+       if (VEFLAGS & VIP_MASK)
+               return_to_32bit(regs, VM86_STI);
+}
+
+static inline void clear_IF(struct kernel_vm86_regs * regs)
+{
+       VEFLAGS &= ~VIF_MASK;
+}
+
+static inline void clear_TF(struct kernel_vm86_regs * regs)
+{
+       regs->pt.eflags &= ~TF_MASK;
+}
+
+static inline void clear_AC(struct kernel_vm86_regs * regs)
+{
+       regs->pt.eflags &= ~AC_MASK;
+}
+
+/* It is correct to call set_IF(regs) from the set_vflags_*
+ * functions. However someone forgot to call clear_IF(regs)
+ * in the opposite case.
+ * After the command sequence CLI PUSHF STI POPF you should
+ * end up with interrups disabled, but you ended up with
+ * interrupts enabled.
+ *  ( I was testing my own changes, but the only bug I
+ *    could find was in a function I had not changed. )
+ * [KD]
+ */
+
+static inline void set_vflags_long(unsigned long eflags, struct kernel_vm86_regs * regs)
+{
+       set_flags(VEFLAGS, eflags, current->thread.v86mask);
+       set_flags(regs->pt.eflags, eflags, SAFE_MASK);
+       if (eflags & IF_MASK)
+               set_IF(regs);
+       else
+               clear_IF(regs);
+}
+
+static inline void set_vflags_short(unsigned short flags, struct kernel_vm86_regs * regs)
+{
+       set_flags(VFLAGS, flags, current->thread.v86mask);
+       set_flags(regs->pt.eflags, flags, SAFE_MASK);
+       if (flags & IF_MASK)
+               set_IF(regs);
+       else
+               clear_IF(regs);
+}
+
+static inline unsigned long get_vflags(struct kernel_vm86_regs * regs)
+{
+       unsigned long flags = regs->pt.eflags & RETURN_MASK;
+
+       if (VEFLAGS & VIF_MASK)
+               flags |= IF_MASK;
+       flags |= IOPL_MASK;
+       return flags | (VEFLAGS & current->thread.v86mask);
+}
+
+static inline int is_revectored(int nr, struct revectored_struct * bitmap)
+{
+       __asm__ __volatile__("btl %2,%1\n\tsbbl %0,%0"
+               :"=r" (nr)
+               :"m" (*bitmap),"r" (nr));
+       return nr;
+}
+
+#define val_byte(val, n) (((__u8 *)&val)[n])
+
+#define pushb(base, ptr, val, err_label) \
+       do { \
+               __u8 __val = val; \
+               ptr--; \
+               if (put_user(__val, base + ptr) < 0) \
+                       goto err_label; \
+       } while(0)
+
+#define pushw(base, ptr, val, err_label) \
+       do { \
+               __u16 __val = val; \
+               ptr--; \
+               if (put_user(val_byte(__val, 1), base + ptr) < 0) \
+                       goto err_label; \
+               ptr--; \
+               if (put_user(val_byte(__val, 0), base + ptr) < 0) \
+                       goto err_label; \
+       } while(0)
+
+#define pushl(base, ptr, val, err_label) \
+       do { \
+               __u32 __val = val; \
+               ptr--; \
+               if (put_user(val_byte(__val, 3), base + ptr) < 0) \
+                       goto err_label; \
+               ptr--; \
+               if (put_user(val_byte(__val, 2), base + ptr) < 0) \
+                       goto err_label; \
+               ptr--; \
+               if (put_user(val_byte(__val, 1), base + ptr) < 0) \
+                       goto err_label; \
+               ptr--; \
+               if (put_user(val_byte(__val, 0), base + ptr) < 0) \
+                       goto err_label; \
+       } while(0)
+
+#define popb(base, ptr, err_label) \
+       ({ \
+               __u8 __res; \
+               if (get_user(__res, base + ptr) < 0) \
+                       goto err_label; \
+               ptr++; \
+               __res; \
+       })
+
+#define popw(base, ptr, err_label) \
+       ({ \
+               __u16 __res; \
+               if (get_user(val_byte(__res, 0), base + ptr) < 0) \
+                       goto err_label; \
+               ptr++; \
+               if (get_user(val_byte(__res, 1), base + ptr) < 0) \
+                       goto err_label; \
+               ptr++; \
+               __res; \
+       })
+
+#define popl(base, ptr, err_label) \
+       ({ \
+               __u32 __res; \
+               if (get_user(val_byte(__res, 0), base + ptr) < 0) \
+                       goto err_label; \
+               ptr++; \
+               if (get_user(val_byte(__res, 1), base + ptr) < 0) \
+                       goto err_label; \
+               ptr++; \
+               if (get_user(val_byte(__res, 2), base + ptr) < 0) \
+                       goto err_label; \
+               ptr++; \
+               if (get_user(val_byte(__res, 3), base + ptr) < 0) \
+                       goto err_label; \
+               ptr++; \
+               __res; \
+       })
+
+/* There are so many possible reasons for this function to return
+ * VM86_INTx, so adding another doesn't bother me. We can expect
+ * userspace programs to be able to handle it. (Getting a problem
+ * in userspace is always better than an Oops anyway.) [KD]
+ */
+static void do_int(struct kernel_vm86_regs *regs, int i,
+    unsigned char __user * ssp, unsigned short sp)
+{
+       unsigned long __user *intr_ptr;
+       unsigned long segoffs;
+
+       if (regs->pt.xcs == BIOSSEG)
+               goto cannot_handle;
+       if (is_revectored(i, &KVM86->int_revectored))
+               goto cannot_handle;
+       if (i==0x21 && is_revectored(AH(regs),&KVM86->int21_revectored))
+               goto cannot_handle;
+       intr_ptr = (unsigned long __user *) (i << 2);
+       if (get_user(segoffs, intr_ptr))
+               goto cannot_handle;
+       if ((segoffs >> 16) == BIOSSEG)
+               goto cannot_handle;
+       pushw(ssp, sp, get_vflags(regs), cannot_handle);
+       pushw(ssp, sp, regs->pt.xcs, cannot_handle);
+       pushw(ssp, sp, IP(regs), cannot_handle);
+       regs->pt.xcs = segoffs >> 16;
+       SP(regs) -= 6;
+       IP(regs) = segoffs & 0xffff;
+       clear_TF(regs);
+       clear_IF(regs);
+       clear_AC(regs);
+       return;
+
+cannot_handle:
+       return_to_32bit(regs, VM86_INTx + (i << 8));
+}
+
+int handle_vm86_trap(struct kernel_vm86_regs * regs, long error_code, int trapno)
+{
+       if (VMPI.is_vm86pus) {
+               if ( (trapno==3) || (trapno==1) )
+                       return_to_32bit(regs, VM86_TRAP + (trapno << 8));
+               do_int(regs, trapno, (unsigned char __user *) (regs->pt.xss << 4), SP(regs));
+               return 0;
+       }
+       if (trapno !=1)
+               return 1; /* we let this handle by the calling routine */
+       if (current->ptrace & PT_PTRACED) {
+               unsigned long flags;
+               spin_lock_irqsave(&current->sighand->siglock, flags);
+               sigdelset(&current->blocked, SIGTRAP);
+               recalc_sigpending();
+               spin_unlock_irqrestore(&current->sighand->siglock, flags);
+       }
+       send_sig(SIGTRAP, current, 1);
+       current->thread.trap_no = trapno;
+       current->thread.error_code = error_code;
+       return 0;
+}
+
+void handle_vm86_fault(struct kernel_vm86_regs * regs, long error_code)
+{
+       unsigned char opcode;
+       unsigned char __user *csp;
+       unsigned char __user *ssp;
+       unsigned short ip, sp, orig_flags;
+       int data32, pref_done;
+
+#define CHECK_IF_IN_TRAP \
+       if (VMPI.vm86dbg_active && VMPI.vm86dbg_TFpendig) \
+               newflags |= TF_MASK
+#define VM86_FAULT_RETURN do { \
+       if (VMPI.force_return_for_pic  && (VEFLAGS & (IF_MASK | VIF_MASK))) \
+               return_to_32bit(regs, VM86_PICRETURN); \
+       if (orig_flags & TF_MASK) \
+               handle_vm86_trap(regs, 0, 1); \
+       return; } while (0)
+
+       orig_flags = *(unsigned short *)&regs->pt.eflags;
+
+       csp = (unsigned char __user *) (regs->pt.xcs << 4);
+       ssp = (unsigned char __user *) (regs->pt.xss << 4);
+       sp = SP(regs);
+       ip = IP(regs);
+
+       data32 = 0;
+       pref_done = 0;
+       do {
+               switch (opcode = popb(csp, ip, simulate_sigsegv)) {
+                       case 0x66:      /* 32-bit data */     data32=1; break;
+                       case 0x67:      /* 32-bit address */  break;
+                       case 0x2e:      /* CS */              break;
+                       case 0x3e:      /* DS */              break;
+                       case 0x26:      /* ES */              break;
+                       case 0x36:      /* SS */              break;
+                       case 0x65:      /* GS */              break;
+                       case 0x64:      /* FS */              break;
+                       case 0xf2:      /* repnz */       break;
+                       case 0xf3:      /* rep */             break;
+                       default: pref_done = 1;
+               }
+       } while (!pref_done);
+
+       switch (opcode) {
+
+       /* pushf */
+       case 0x9c:
+               if (data32) {
+                       pushl(ssp, sp, get_vflags(regs), simulate_sigsegv);
+                       SP(regs) -= 4;
+               } else {
+                       pushw(ssp, sp, get_vflags(regs), simulate_sigsegv);
+                       SP(regs) -= 2;
+               }
+               IP(regs) = ip;
+               VM86_FAULT_RETURN;
+
+       /* popf */
+       case 0x9d:
+               {
+               unsigned long newflags;
+               if (data32) {
+                       newflags=popl(ssp, sp, simulate_sigsegv);
+                       SP(regs) += 4;
+               } else {
+                       newflags = popw(ssp, sp, simulate_sigsegv);
+                       SP(regs) += 2;
+               }
+               IP(regs) = ip;
+               CHECK_IF_IN_TRAP;
+               if (data32) {
+                       set_vflags_long(newflags, regs);
+               } else {
+                       set_vflags_short(newflags, regs);
+               }
+               VM86_FAULT_RETURN;
+               }
+
+       /* int xx */
+       case 0xcd: {
+               int intno=popb(csp, ip, simulate_sigsegv);
+               IP(regs) = ip;
+               if (VMPI.vm86dbg_active) {
+                       if ( (1 << (intno &7)) & VMPI.vm86dbg_intxxtab[intno >> 3] )
+                               return_to_32bit(regs, VM86_INTx + (intno << 8));
+               }
+               do_int(regs, intno, ssp, sp);
+               return;
+       }
+
+       /* iret */
+       case 0xcf:
+               {
+               unsigned long newip;
+               unsigned long newcs;
+               unsigned long newflags;
+               if (data32) {
+                       newip=popl(ssp, sp, simulate_sigsegv);
+                       newcs=popl(ssp, sp, simulate_sigsegv);
+                       newflags=popl(ssp, sp, simulate_sigsegv);
+                       SP(regs) += 12;
+               } else {
+                       newip = popw(ssp, sp, simulate_sigsegv);
+                       newcs = popw(ssp, sp, simulate_sigsegv);
+                       newflags = popw(ssp, sp, simulate_sigsegv);
+                       SP(regs) += 6;
+               }
+               IP(regs) = newip;
+               regs->pt.xcs = newcs;
+               CHECK_IF_IN_TRAP;
+               if (data32) {
+                       set_vflags_long(newflags, regs);
+               } else {
+                       set_vflags_short(newflags, regs);
+               }
+               VM86_FAULT_RETURN;
+               }
+
+       /* cli */
+       case 0xfa:
+               IP(regs) = ip;
+               clear_IF(regs);
+               VM86_FAULT_RETURN;
+
+       /* sti */
+       /*
+        * Damn. This is incorrect: the 'sti' instruction should actually
+        * enable interrupts after the /next/ instruction. Not good.
+        *
+        * Probably needs some horsing around with the TF flag. Aiee..
+        */
+       case 0xfb:
+               IP(regs) = ip;
+               set_IF(regs);
+               VM86_FAULT_RETURN;
+
+       default:
+               return_to_32bit(regs, VM86_UNKNOWN);
+       }
+
+       return;
+
+simulate_sigsegv:
+       /* FIXME: After a long discussion with Stas we finally
+        *        agreed, that this is wrong. Here we should
+        *        really send a SIGSEGV to the user program.
+        *        But how do we create the correct context? We
+        *        are inside a general protection fault handler
+        *        and has just returned from a page fault handler.
+        *        The correct context for the signal handler
+        *        should be a mixture of the two, but how do we
+        *        get the information? [KD]
+        */
+       return_to_32bit(regs, VM86_UNKNOWN);
+}
+
+/* ---------------- vm86 special IRQ passing stuff ----------------- */
+
+#define VM86_IRQNAME           "vm86irq"
+
+static struct vm86_irqs {
+       struct task_struct *tsk;
+       int sig;
+} vm86_irqs[16];
+
+static DEFINE_SPINLOCK(irqbits_lock);
+static int irqbits;
+
+#define ALLOWED_SIGS ( 1 /* 0 = don't send a signal */ \
+       | (1 << SIGUSR1) | (1 << SIGUSR2) | (1 << SIGIO)  | (1 << SIGURG) \
+       | (1 << SIGUNUSED) )
+       
+static irqreturn_t irq_handler(int intno, void *dev_id)
+{
+       int irq_bit;
+       unsigned long flags;
+
+       spin_lock_irqsave(&irqbits_lock, flags);        
+       irq_bit = 1 << intno;
+       if ((irqbits & irq_bit) || ! vm86_irqs[intno].tsk)
+               goto out;
+       irqbits |= irq_bit;
+       if (vm86_irqs[intno].sig)
+               send_sig(vm86_irqs[intno].sig, vm86_irqs[intno].tsk, 1);
+       /*
+        * IRQ will be re-enabled when user asks for the irq (whether
+        * polling or as a result of the signal)
+        */
+       disable_irq_nosync(intno);
+       spin_unlock_irqrestore(&irqbits_lock, flags);
+       return IRQ_HANDLED;
+
+out:
+       spin_unlock_irqrestore(&irqbits_lock, flags);   
+       return IRQ_NONE;
+}
+
+static inline void free_vm86_irq(int irqnumber)
+{
+       unsigned long flags;
+
+       free_irq(irqnumber, NULL);
+       vm86_irqs[irqnumber].tsk = NULL;
+
+       spin_lock_irqsave(&irqbits_lock, flags);        
+       irqbits &= ~(1 << irqnumber);
+       spin_unlock_irqrestore(&irqbits_lock, flags);   
+}
+
+void release_vm86_irqs(struct task_struct *task)
+{
+       int i;
+       for (i = FIRST_VM86_IRQ ; i <= LAST_VM86_IRQ; i++)
+           if (vm86_irqs[i].tsk == task)
+               free_vm86_irq(i);
+}
+
+static inline int get_and_reset_irq(int irqnumber)
+{
+       int bit;
+       unsigned long flags;
+       int ret = 0;
+       
+       if (invalid_vm86_irq(irqnumber)) return 0;
+       if (vm86_irqs[irqnumber].tsk != current) return 0;
+       spin_lock_irqsave(&irqbits_lock, flags);        
+       bit = irqbits & (1 << irqnumber);
+       irqbits &= ~bit;
+       if (bit) {
+               enable_irq(irqnumber);
+               ret = 1;
+       }
+
+       spin_unlock_irqrestore(&irqbits_lock, flags);   
+       return ret;
+}
+
+
+static int do_vm86_irq_handling(int subfunction, int irqnumber)
+{
+       int ret;
+       switch (subfunction) {
+               case VM86_GET_AND_RESET_IRQ: {
+                       return get_and_reset_irq(irqnumber);
+               }
+               case VM86_GET_IRQ_BITS: {
+                       return irqbits;
+               }
+               case VM86_REQUEST_IRQ: {
+                       int sig = irqnumber >> 8;
+                       int irq = irqnumber & 255;
+                       if (!capable(CAP_SYS_ADMIN)) return -EPERM;
+                       if (!((1 << sig) & ALLOWED_SIGS)) return -EPERM;
+                       if (invalid_vm86_irq(irq)) return -EPERM;
+                       if (vm86_irqs[irq].tsk) return -EPERM;
+                       ret = request_irq(irq, &irq_handler, 0, VM86_IRQNAME, NULL);
+                       if (ret) return ret;
+                       vm86_irqs[irq].sig = sig;
+                       vm86_irqs[irq].tsk = current;
+                       return irq;
+               }
+               case  VM86_FREE_IRQ: {
+                       if (invalid_vm86_irq(irqnumber)) return -EPERM;
+                       if (!vm86_irqs[irqnumber].tsk) return 0;
+                       if (vm86_irqs[irqnumber].tsk != current) return -EPERM;
+                       free_vm86_irq(irqnumber);
+                       return 0;
+               }
+       }
+       return -EINVAL;
+}
+
diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c
new file mode 100644 (file)
index 0000000..18673e0
--- /dev/null
@@ -0,0 +1,981 @@
+/*
+ * VMI specific paravirt-ops implementation
+ *
+ * Copyright (C) 2005, VMware, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT.  See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * Send feedback to zach@vmware.com
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/cpu.h>
+#include <linux/bootmem.h>
+#include <linux/mm.h>
+#include <linux/highmem.h>
+#include <linux/sched.h>
+#include <asm/vmi.h>
+#include <asm/io.h>
+#include <asm/fixmap.h>
+#include <asm/apicdef.h>
+#include <asm/apic.h>
+#include <asm/processor.h>
+#include <asm/timer.h>
+#include <asm/vmi_time.h>
+#include <asm/kmap_types.h>
+
+/* Convenient for calling VMI functions indirectly in the ROM */
+typedef u32 __attribute__((regparm(1))) (VROMFUNC)(void);
+typedef u64 __attribute__((regparm(2))) (VROMLONGFUNC)(int);
+
+#define call_vrom_func(rom,func) \
+   (((VROMFUNC *)(rom->func))())
+
+#define call_vrom_long_func(rom,func,arg) \
+   (((VROMLONGFUNC *)(rom->func)) (arg))
+
+static struct vrom_header *vmi_rom;
+static int disable_pge;
+static int disable_pse;
+static int disable_sep;
+static int disable_tsc;
+static int disable_mtrr;
+static int disable_noidle;
+static int disable_vmi_timer;
+
+/* Cached VMI operations */
+static struct {
+       void (*cpuid)(void /* non-c */);
+       void (*_set_ldt)(u32 selector);
+       void (*set_tr)(u32 selector);
+       void (*set_kernel_stack)(u32 selector, u32 esp0);
+       void (*allocate_page)(u32, u32, u32, u32, u32);
+       void (*release_page)(u32, u32);
+       void (*set_pte)(pte_t, pte_t *, unsigned);
+       void (*update_pte)(pte_t *, unsigned);
+       void (*set_linear_mapping)(int, void *, u32, u32);
+       void (*_flush_tlb)(int);
+       void (*set_initial_ap_state)(int, int);
+       void (*halt)(void);
+       void (*set_lazy_mode)(int mode);
+} vmi_ops;
+
+/* Cached VMI operations */
+struct vmi_timer_ops vmi_timer_ops;
+
+/*
+ * VMI patching routines.
+ */
+#define MNEM_CALL 0xe8
+#define MNEM_JMP  0xe9
+#define MNEM_RET  0xc3
+
+#define IRQ_PATCH_INT_MASK 0
+#define IRQ_PATCH_DISABLE  5
+
+static inline void patch_offset(void *insnbuf,
+                               unsigned long eip, unsigned long dest)
+{
+        *(unsigned long *)(insnbuf+1) = dest-eip-5;
+}
+
+static unsigned patch_internal(int call, unsigned len, void *insnbuf,
+                              unsigned long eip)
+{
+       u64 reloc;
+       struct vmi_relocation_info *const rel = (struct vmi_relocation_info *)&reloc;
+       reloc = call_vrom_long_func(vmi_rom, get_reloc, call);
+       switch(rel->type) {
+               case VMI_RELOCATION_CALL_REL:
+                       BUG_ON(len < 5);
+                       *(char *)insnbuf = MNEM_CALL;
+                       patch_offset(insnbuf, eip, (unsigned long)rel->eip);
+                       return 5;
+
+               case VMI_RELOCATION_JUMP_REL:
+                       BUG_ON(len < 5);
+                       *(char *)insnbuf = MNEM_JMP;
+                       patch_offset(insnbuf, eip, (unsigned long)rel->eip);
+                       return 5;
+
+               case VMI_RELOCATION_NOP:
+                       /* obliterate the whole thing */
+                       return 0;
+
+               case VMI_RELOCATION_NONE:
+                       /* leave native code in place */
+                       break;
+
+               default:
+                       BUG();
+       }
+       return len;
+}
+
+/*
+ * Apply patch if appropriate, return length of new instruction
+ * sequence.  The callee does nop padding for us.
+ */
+static unsigned vmi_patch(u8 type, u16 clobbers, void *insns,
+                         unsigned long eip, unsigned len)
+{
+       switch (type) {
+               case PARAVIRT_PATCH(irq_disable):
+                       return patch_internal(VMI_CALL_DisableInterrupts, len,
+                                             insns, eip);
+               case PARAVIRT_PATCH(irq_enable):
+                       return patch_internal(VMI_CALL_EnableInterrupts, len,
+                                             insns, eip);
+               case PARAVIRT_PATCH(restore_fl):
+                       return patch_internal(VMI_CALL_SetInterruptMask, len,
+                                             insns, eip);
+               case PARAVIRT_PATCH(save_fl):
+                       return patch_internal(VMI_CALL_GetInterruptMask, len,
+                                             insns, eip);
+               case PARAVIRT_PATCH(iret):
+                       return patch_internal(VMI_CALL_IRET, len, insns, eip);
+               case PARAVIRT_PATCH(irq_enable_sysexit):
+                       return patch_internal(VMI_CALL_SYSEXIT, len, insns, eip);
+               default:
+                       break;
+       }
+       return len;
+}
+
+/* CPUID has non-C semantics, and paravirt-ops API doesn't match hardware ISA */
+static void vmi_cpuid(unsigned int *eax, unsigned int *ebx,
+                               unsigned int *ecx, unsigned int *edx)
+{
+       int override = 0;
+       if (*eax == 1)
+               override = 1;
+        asm volatile ("call *%6"
+                      : "=a" (*eax),
+                        "=b" (*ebx),
+                        "=c" (*ecx),
+                        "=d" (*edx)
+                      : "0" (*eax), "2" (*ecx), "r" (vmi_ops.cpuid));
+       if (override) {
+               if (disable_pse)
+                       *edx &= ~X86_FEATURE_PSE;
+               if (disable_pge)
+                       *edx &= ~X86_FEATURE_PGE;
+               if (disable_sep)
+                       *edx &= ~X86_FEATURE_SEP;
+               if (disable_tsc)
+                       *edx &= ~X86_FEATURE_TSC;
+               if (disable_mtrr)
+                       *edx &= ~X86_FEATURE_MTRR;
+       }
+}
+
+static inline void vmi_maybe_load_tls(struct desc_struct *gdt, int nr, struct desc_struct *new)
+{
+       if (gdt[nr].a != new->a || gdt[nr].b != new->b)
+               write_gdt_entry(gdt, nr, new->a, new->b);
+}
+
+static void vmi_load_tls(struct thread_struct *t, unsigned int cpu)
+{
+       struct desc_struct *gdt = get_cpu_gdt_table(cpu);
+       vmi_maybe_load_tls(gdt, GDT_ENTRY_TLS_MIN + 0, &t->tls_array[0]);
+       vmi_maybe_load_tls(gdt, GDT_ENTRY_TLS_MIN + 1, &t->tls_array[1]);
+       vmi_maybe_load_tls(gdt, GDT_ENTRY_TLS_MIN + 2, &t->tls_array[2]);
+}
+
+static void vmi_set_ldt(const void *addr, unsigned entries)
+{
+       unsigned cpu = smp_processor_id();
+       u32 low, high;
+
+       pack_descriptor(&low, &high, (unsigned long)addr,
+                       entries * sizeof(struct desc_struct) - 1,
+                       DESCTYPE_LDT, 0);
+       write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_LDT, low, high);
+       vmi_ops._set_ldt(entries ? GDT_ENTRY_LDT*sizeof(struct desc_struct) : 0);
+}
+
+static void vmi_set_tr(void)
+{
+       vmi_ops.set_tr(GDT_ENTRY_TSS*sizeof(struct desc_struct));
+}
+
+static void vmi_load_esp0(struct tss_struct *tss,
+                                  struct thread_struct *thread)
+{
+       tss->x86_tss.esp0 = thread->esp0;
+
+       /* This can only happen when SEP is enabled, no need to test "SEP"arately */
+       if (unlikely(tss->x86_tss.ss1 != thread->sysenter_cs)) {
+               tss->x86_tss.ss1 = thread->sysenter_cs;
+               wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0);
+       }
+       vmi_ops.set_kernel_stack(__KERNEL_DS, tss->x86_tss.esp0);
+}
+
+static void vmi_flush_tlb_user(void)
+{
+       vmi_ops._flush_tlb(VMI_FLUSH_TLB);
+}
+
+static void vmi_flush_tlb_kernel(void)
+{
+       vmi_ops._flush_tlb(VMI_FLUSH_TLB | VMI_FLUSH_GLOBAL);
+}
+
+/* Stub to do nothing at all; used for delays and unimplemented calls */
+static void vmi_nop(void)
+{
+}
+
+#ifdef CONFIG_DEBUG_PAGE_TYPE
+
+#ifdef CONFIG_X86_PAE
+#define MAX_BOOT_PTS (2048+4+1)
+#else
+#define MAX_BOOT_PTS (1024+1)
+#endif
+
+/*
+ * During boot, mem_map is not yet available in paging_init, so stash
+ * all the boot page allocations here.
+ */
+static struct {
+       u32 pfn;
+       int type;
+} boot_page_allocations[MAX_BOOT_PTS];
+static int num_boot_page_allocations;
+static int boot_allocations_applied;
+
+void vmi_apply_boot_page_allocations(void)
+{
+       int i;
+       BUG_ON(!mem_map);
+       for (i = 0; i < num_boot_page_allocations; i++) {
+               struct page *page = pfn_to_page(boot_page_allocations[i].pfn);
+               page->type = boot_page_allocations[i].type;
+               page->type = boot_page_allocations[i].type &
+                               ~(VMI_PAGE_ZEROED | VMI_PAGE_CLONE);
+       }
+       boot_allocations_applied = 1;
+}
+
+static void record_page_type(u32 pfn, int type)
+{
+       BUG_ON(num_boot_page_allocations >= MAX_BOOT_PTS);
+       boot_page_allocations[num_boot_page_allocations].pfn = pfn;
+       boot_page_allocations[num_boot_page_allocations].type = type;
+       num_boot_page_allocations++;
+}
+
+static void check_zeroed_page(u32 pfn, int type, struct page *page)
+{
+       u32 *ptr;
+       int i;
+       int limit = PAGE_SIZE / sizeof(int);
+
+       if (page_address(page))
+               ptr = (u32 *)page_address(page);
+       else
+               ptr = (u32 *)__va(pfn << PAGE_SHIFT);
+       /*
+        * When cloning the root in non-PAE mode, only the userspace
+        * pdes need to be zeroed.
+        */
+       if (type & VMI_PAGE_CLONE)
+               limit = USER_PTRS_PER_PGD;
+       for (i = 0; i < limit; i++)
+               BUG_ON(ptr[i]);
+}
+
+/*
+ * We stash the page type into struct page so we can verify the page
+ * types are used properly.
+ */
+static void vmi_set_page_type(u32 pfn, int type)
+{
+       /* PAE can have multiple roots per page - don't track */
+       if (PTRS_PER_PMD > 1 && (type & VMI_PAGE_PDP))
+               return;
+
+       if (boot_allocations_applied) {
+               struct page *page = pfn_to_page(pfn);
+               if (type != VMI_PAGE_NORMAL)
+                       BUG_ON(page->type);
+               else
+                       BUG_ON(page->type == VMI_PAGE_NORMAL);
+               page->type = type & ~(VMI_PAGE_ZEROED | VMI_PAGE_CLONE);
+               if (type & VMI_PAGE_ZEROED)
+                       check_zeroed_page(pfn, type, page);
+       } else {
+               record_page_type(pfn, type);
+       }
+}
+
+static void vmi_check_page_type(u32 pfn, int type)
+{
+       /* PAE can have multiple roots per page - skip checks */
+       if (PTRS_PER_PMD > 1 && (type & VMI_PAGE_PDP))
+               return;
+
+       type &= ~(VMI_PAGE_ZEROED | VMI_PAGE_CLONE);
+       if (boot_allocations_applied) {
+               struct page *page = pfn_to_page(pfn);
+               BUG_ON((page->type ^ type) & VMI_PAGE_PAE);
+               BUG_ON(type == VMI_PAGE_NORMAL && page->type);
+               BUG_ON((type & page->type) == 0);
+       }
+}
+#else
+#define vmi_set_page_type(p,t) do { } while (0)
+#define vmi_check_page_type(p,t) do { } while (0)
+#endif
+
+#ifdef CONFIG_HIGHPTE
+static void *vmi_kmap_atomic_pte(struct page *page, enum km_type type)
+{
+       void *va = kmap_atomic(page, type);
+
+       /*
+        * Internally, the VMI ROM must map virtual addresses to physical
+        * addresses for processing MMU updates.  By the time MMU updates
+        * are issued, this information is typically already lost.
+        * Fortunately, the VMI provides a cache of mapping slots for active
+        * page tables.
+        *
+        * We use slot zero for the linear mapping of physical memory, and
+        * in HIGHPTE kernels, slot 1 and 2 for KM_PTE0 and KM_PTE1.
+        *
+        *  args:                 SLOT                 VA    COUNT PFN
+        */
+       BUG_ON(type != KM_PTE0 && type != KM_PTE1);
+       vmi_ops.set_linear_mapping((type - KM_PTE0)+1, va, 1, page_to_pfn(page));
+
+       return va;
+}
+#endif
+
+static void vmi_allocate_pt(struct mm_struct *mm, u32 pfn)
+{
+       vmi_set_page_type(pfn, VMI_PAGE_L1);
+       vmi_ops.allocate_page(pfn, VMI_PAGE_L1, 0, 0, 0);
+}
+
+static void vmi_allocate_pd(u32 pfn)
+{
+       /*
+        * This call comes in very early, before mem_map is setup.
+        * It is called only for swapper_pg_dir, which already has
+        * data on it.
+        */
+       vmi_set_page_type(pfn, VMI_PAGE_L2);
+       vmi_ops.allocate_page(pfn, VMI_PAGE_L2, 0, 0, 0);
+}
+
+static void vmi_allocate_pd_clone(u32 pfn, u32 clonepfn, u32 start, u32 count)
+{
+       vmi_set_page_type(pfn, VMI_PAGE_L2 | VMI_PAGE_CLONE);
+       vmi_check_page_type(clonepfn, VMI_PAGE_L2);
+       vmi_ops.allocate_page(pfn, VMI_PAGE_L2 | VMI_PAGE_CLONE, clonepfn, start, count);
+}
+
+static void vmi_release_pt(u32 pfn)
+{
+       vmi_ops.release_page(pfn, VMI_PAGE_L1);
+       vmi_set_page_type(pfn, VMI_PAGE_NORMAL);
+}
+
+static void vmi_release_pd(u32 pfn)
+{
+       vmi_ops.release_page(pfn, VMI_PAGE_L2);
+       vmi_set_page_type(pfn, VMI_PAGE_NORMAL);
+}
+
+/*
+ * Helper macros for MMU update flags.  We can defer updates until a flush
+ * or page invalidation only if the update is to the current address space
+ * (otherwise, there is no flush).  We must check against init_mm, since
+ * this could be a kernel update, which usually passes init_mm, although
+ * sometimes this check can be skipped if we know the particular function
+ * is only called on user mode PTEs.  We could change the kernel to pass
+ * current->active_mm here, but in particular, I was unsure if changing
+ * mm/highmem.c to do this would still be correct on other architectures.
+ */
+#define is_current_as(mm, mustbeuser) ((mm) == current->active_mm ||    \
+                                       (!mustbeuser && (mm) == &init_mm))
+#define vmi_flags_addr(mm, addr, level, user)                           \
+        ((level) | (is_current_as(mm, user) ?                           \
+                (VMI_PAGE_CURRENT_AS | ((addr) & VMI_PAGE_VA_MASK)) : 0))
+#define vmi_flags_addr_defer(mm, addr, level, user)                     \
+        ((level) | (is_current_as(mm, user) ?                           \
+                (VMI_PAGE_DEFER | VMI_PAGE_CURRENT_AS | ((addr) & VMI_PAGE_VA_MASK)) : 0))
+
+static void vmi_update_pte(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
+{
+       vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE);
+       vmi_ops.update_pte(ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0));
+}
+
+static void vmi_update_pte_defer(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
+{
+       vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE);
+       vmi_ops.update_pte(ptep, vmi_flags_addr_defer(mm, addr, VMI_PAGE_PT, 0));
+}
+
+static void vmi_set_pte(pte_t *ptep, pte_t pte)
+{
+       /* XXX because of set_pmd_pte, this can be called on PT or PD layers */
+       vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE | VMI_PAGE_PD);
+       vmi_ops.set_pte(pte, ptep, VMI_PAGE_PT);
+}
+
+static void vmi_set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte)
+{
+       vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE);
+       vmi_ops.set_pte(pte, ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0));
+}
+
+static void vmi_set_pmd(pmd_t *pmdp, pmd_t pmdval)
+{
+#ifdef CONFIG_X86_PAE
+       const pte_t pte = { pmdval.pmd, pmdval.pmd >> 32 };
+       vmi_check_page_type(__pa(pmdp) >> PAGE_SHIFT, VMI_PAGE_PMD);
+#else
+       const pte_t pte = { pmdval.pud.pgd.pgd };
+       vmi_check_page_type(__pa(pmdp) >> PAGE_SHIFT, VMI_PAGE_PGD);
+#endif
+       vmi_ops.set_pte(pte, (pte_t *)pmdp, VMI_PAGE_PD);
+}
+
+#ifdef CONFIG_X86_PAE
+
+static void vmi_set_pte_atomic(pte_t *ptep, pte_t pteval)
+{
+       /*
+        * XXX This is called from set_pmd_pte, but at both PT
+        * and PD layers so the VMI_PAGE_PT flag is wrong.  But
+        * it is only called for large page mapping changes,
+        * the Xen backend, doesn't support large pages, and the
+        * ESX backend doesn't depend on the flag.
+        */
+       set_64bit((unsigned long long *)ptep,pte_val(pteval));
+       vmi_ops.update_pte(ptep, VMI_PAGE_PT);
+}
+
+static void vmi_set_pte_present(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte)
+{
+       vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE);
+       vmi_ops.set_pte(pte, ptep, vmi_flags_addr_defer(mm, addr, VMI_PAGE_PT, 1));
+}
+
+static void vmi_set_pud(pud_t *pudp, pud_t pudval)
+{
+       /* Um, eww */
+       const pte_t pte = { pudval.pgd.pgd, pudval.pgd.pgd >> 32 };
+       vmi_check_page_type(__pa(pudp) >> PAGE_SHIFT, VMI_PAGE_PGD);
+       vmi_ops.set_pte(pte, (pte_t *)pudp, VMI_PAGE_PDP);
+}
+
+static void vmi_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
+{
+       const pte_t pte = { 0 };
+       vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE);
+       vmi_ops.set_pte(pte, ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0));
+}
+
+static void vmi_pmd_clear(pmd_t *pmd)
+{
+       const pte_t pte = { 0 };
+       vmi_check_page_type(__pa(pmd) >> PAGE_SHIFT, VMI_PAGE_PMD);
+       vmi_ops.set_pte(pte, (pte_t *)pmd, VMI_PAGE_PD);
+}
+#endif
+
+#ifdef CONFIG_SMP
+static void __devinit
+vmi_startup_ipi_hook(int phys_apicid, unsigned long start_eip,
+                    unsigned long start_esp)
+{
+       struct vmi_ap_state ap;
+
+       /* Default everything to zero.  This is fine for most GPRs. */
+       memset(&ap, 0, sizeof(struct vmi_ap_state));
+
+       ap.gdtr_limit = GDT_SIZE - 1;
+       ap.gdtr_base = (unsigned long) get_cpu_gdt_table(phys_apicid);
+
+       ap.idtr_limit = IDT_ENTRIES * 8 - 1;
+       ap.idtr_base = (unsigned long) idt_table;
+
+       ap.ldtr = 0;
+
+       ap.cs = __KERNEL_CS;
+       ap.eip = (unsigned long) start_eip;
+       ap.ss = __KERNEL_DS;
+       ap.esp = (unsigned long) start_esp;
+
+       ap.ds = __USER_DS;
+       ap.es = __USER_DS;
+       ap.fs = __KERNEL_PERCPU;
+       ap.gs = 0;
+
+       ap.eflags = 0;
+
+#ifdef CONFIG_X86_PAE
+       /* efer should match BSP efer. */
+       if (cpu_has_nx) {
+               unsigned l, h;
+               rdmsr(MSR_EFER, l, h);
+               ap.efer = (unsigned long long) h << 32 | l;
+       }
+#endif
+
+       ap.cr3 = __pa(swapper_pg_dir);
+       /* Protected mode, paging, AM, WP, NE, MP. */
+       ap.cr0 = 0x80050023;
+       ap.cr4 = mmu_cr4_features;
+       vmi_ops.set_initial_ap_state((u32)&ap, phys_apicid);
+}
+#endif
+
+static void vmi_set_lazy_mode(enum paravirt_lazy_mode mode)
+{
+       static DEFINE_PER_CPU(enum paravirt_lazy_mode, lazy_mode);
+
+       if (!vmi_ops.set_lazy_mode)
+               return;
+
+       /* Modes should never nest or overlap */
+       BUG_ON(__get_cpu_var(lazy_mode) && !(mode == PARAVIRT_LAZY_NONE ||
+                                            mode == PARAVIRT_LAZY_FLUSH));
+
+       if (mode == PARAVIRT_LAZY_FLUSH) {
+               vmi_ops.set_lazy_mode(0);
+               vmi_ops.set_lazy_mode(__get_cpu_var(lazy_mode));
+       } else {
+               vmi_ops.set_lazy_mode(mode);
+               __get_cpu_var(lazy_mode) = mode;
+       }
+}
+
+static inline int __init check_vmi_rom(struct vrom_header *rom)
+{
+       struct pci_header *pci;
+       struct pnp_header *pnp;
+       const char *manufacturer = "UNKNOWN";
+       const char *product = "UNKNOWN";
+       const char *license = "unspecified";
+
+       if (rom->rom_signature != 0xaa55)
+               return 0;
+       if (rom->vrom_signature != VMI_SIGNATURE)
+               return 0;
+       if (rom->api_version_maj != VMI_API_REV_MAJOR ||
+           rom->api_version_min+1 < VMI_API_REV_MINOR+1) {
+               printk(KERN_WARNING "VMI: Found mismatched rom version %d.%d\n",
+                               rom->api_version_maj,
+                               rom->api_version_min);
+               return 0;
+       }
+
+       /*
+        * Relying on the VMI_SIGNATURE field is not 100% safe, so check
+        * the PCI header and device type to make sure this is really a
+        * VMI device.
+        */
+       if (!rom->pci_header_offs) {
+               printk(KERN_WARNING "VMI: ROM does not contain PCI header.\n");
+               return 0;
+       }
+
+       pci = (struct pci_header *)((char *)rom+rom->pci_header_offs);
+       if (pci->vendorID != PCI_VENDOR_ID_VMWARE ||
+           pci->deviceID != PCI_DEVICE_ID_VMWARE_VMI) {
+               /* Allow it to run... anyways, but warn */
+               printk(KERN_WARNING "VMI: ROM from unknown manufacturer\n");
+       }
+
+       if (rom->pnp_header_offs) {
+               pnp = (struct pnp_header *)((char *)rom+rom->pnp_header_offs);
+               if (pnp->manufacturer_offset)
+                       manufacturer = (const char *)rom+pnp->manufacturer_offset;
+               if (pnp->product_offset)
+                       product = (const char *)rom+pnp->product_offset;
+       }
+
+       if (rom->license_offs)
+               license = (char *)rom+rom->license_offs;
+
+       printk(KERN_INFO "VMI: Found %s %s, API version %d.%d, ROM version %d.%d\n",
+               manufacturer, product,
+               rom->api_version_maj, rom->api_version_min,
+               pci->rom_version_maj, pci->rom_version_min);
+
+       /* Don't allow BSD/MIT here for now because we don't want to end up
+          with any binary only shim layers */
+       if (strcmp(license, "GPL") && strcmp(license, "GPL v2")) {
+               printk(KERN_WARNING "VMI: Non GPL license `%s' found for ROM. Not used.\n",
+                       license);
+               return 0;
+       }
+
+       return 1;
+}
+
+/*
+ * Probe for the VMI option ROM
+ */
+static inline int __init probe_vmi_rom(void)
+{
+       unsigned long base;
+
+       /* VMI ROM is in option ROM area, check signature */
+       for (base = 0xC0000; base < 0xE0000; base += 2048) {
+               struct vrom_header *romstart;
+               romstart = (struct vrom_header *)isa_bus_to_virt(base);
+               if (check_vmi_rom(romstart)) {
+                       vmi_rom = romstart;
+                       return 1;
+               }
+       }
+       return 0;
+}
+
+/*
+ * VMI setup common to all processors
+ */
+void vmi_bringup(void)
+{
+       /* We must establish the lowmem mapping for MMU ops to work */
+       if (vmi_ops.set_linear_mapping)
+               vmi_ops.set_linear_mapping(0, (void *)__PAGE_OFFSET, max_low_pfn, 0);
+}
+
+/*
+ * Return a pointer to a VMI function or NULL if unimplemented
+ */
+static void *vmi_get_function(int vmicall)
+{
+       u64 reloc;
+       const struct vmi_relocation_info *rel = (struct vmi_relocation_info *)&reloc;
+       reloc = call_vrom_long_func(vmi_rom, get_reloc, vmicall);
+       BUG_ON(rel->type == VMI_RELOCATION_JUMP_REL);
+       if (rel->type == VMI_RELOCATION_CALL_REL)
+               return (void *)rel->eip;
+       else
+               return NULL;
+}
+
+/*
+ * Helper macro for making the VMI paravirt-ops fill code readable.
+ * For unimplemented operations, fall back to default, unless nop
+ * is returned by the ROM.
+ */
+#define para_fill(opname, vmicall)                             \
+do {                                                           \
+       reloc = call_vrom_long_func(vmi_rom, get_reloc,         \
+                                   VMI_CALL_##vmicall);        \
+       if (rel->type == VMI_RELOCATION_CALL_REL)               \
+               paravirt_ops.opname = (void *)rel->eip;         \
+       else if (rel->type == VMI_RELOCATION_NOP)               \
+               paravirt_ops.opname = (void *)vmi_nop;          \
+       else if (rel->type != VMI_RELOCATION_NONE)              \
+               printk(KERN_WARNING "VMI: Unknown relocation "  \
+                                   "type %d for " #vmicall"\n",\
+                                       rel->type);             \
+} while (0)
+
+/*
+ * Helper macro for making the VMI paravirt-ops fill code readable.
+ * For cached operations which do not match the VMI ROM ABI and must
+ * go through a tranlation stub.  Ignore NOPs, since it is not clear
+ * a NOP * VMI function corresponds to a NOP paravirt-op when the
+ * functions are not in 1-1 correspondence.
+ */
+#define para_wrap(opname, wrapper, cache, vmicall)             \
+do {                                                           \
+       reloc = call_vrom_long_func(vmi_rom, get_reloc,         \
+                                   VMI_CALL_##vmicall);        \
+       BUG_ON(rel->type == VMI_RELOCATION_JUMP_REL);           \
+       if (rel->type == VMI_RELOCATION_CALL_REL) {             \
+               paravirt_ops.opname = wrapper;                  \
+               vmi_ops.cache = (void *)rel->eip;               \
+       }                                                       \
+} while (0)
+
+/*
+ * Activate the VMI interface and switch into paravirtualized mode
+ */
+static inline int __init activate_vmi(void)
+{
+       short kernel_cs;
+       u64 reloc;
+       const struct vmi_relocation_info *rel = (struct vmi_relocation_info *)&reloc;
+
+       if (call_vrom_func(vmi_rom, vmi_init) != 0) {
+               printk(KERN_ERR "VMI ROM failed to initialize!");
+               return 0;
+       }
+       savesegment(cs, kernel_cs);
+
+       paravirt_ops.paravirt_enabled = 1;
+       paravirt_ops.kernel_rpl = kernel_cs & SEGMENT_RPL_MASK;
+
+       paravirt_ops.patch = vmi_patch;
+       paravirt_ops.name = "vmi";
+
+       /*
+        * Many of these operations are ABI compatible with VMI.
+        * This means we can fill in the paravirt-ops with direct
+        * pointers into the VMI ROM.  If the calling convention for
+        * these operations changes, this code needs to be updated.
+        *
+        * Exceptions
+        *  CPUID paravirt-op uses pointers, not the native ISA
+        *  halt has no VMI equivalent; all VMI halts are "safe"
+        *  no MSR support yet - just trap and emulate.  VMI uses the
+        *    same ABI as the native ISA, but Linux wants exceptions
+        *    from bogus MSR read / write handled
+        *  rdpmc is not yet used in Linux
+        */
+
+       /* CPUID is special, so very special it gets wrapped like a present */
+       para_wrap(cpuid, vmi_cpuid, cpuid, CPUID);
+
+       para_fill(clts, CLTS);
+       para_fill(get_debugreg, GetDR);
+       para_fill(set_debugreg, SetDR);
+       para_fill(read_cr0, GetCR0);
+       para_fill(read_cr2, GetCR2);
+       para_fill(read_cr3, GetCR3);
+       para_fill(read_cr4, GetCR4);
+       para_fill(write_cr0, SetCR0);
+       para_fill(write_cr2, SetCR2);
+       para_fill(write_cr3, SetCR3);
+       para_fill(write_cr4, SetCR4);
+       para_fill(save_fl, GetInterruptMask);
+       para_fill(restore_fl, SetInterruptMask);
+       para_fill(irq_disable, DisableInterrupts);
+       para_fill(irq_enable, EnableInterrupts);
+
+       para_fill(wbinvd, WBINVD);
+       para_fill(read_tsc, RDTSC);
+
+       /* The following we emulate with trap and emulate for now */
+       /* paravirt_ops.read_msr = vmi_rdmsr */
+       /* paravirt_ops.write_msr = vmi_wrmsr */
+       /* paravirt_ops.rdpmc = vmi_rdpmc */
+
+       /* TR interface doesn't pass TR value, wrap */
+       para_wrap(load_tr_desc, vmi_set_tr, set_tr, SetTR);
+
+       /* LDT is special, too */
+       para_wrap(set_ldt, vmi_set_ldt, _set_ldt, SetLDT);
+
+       para_fill(load_gdt, SetGDT);
+       para_fill(load_idt, SetIDT);
+       para_fill(store_gdt, GetGDT);
+       para_fill(store_idt, GetIDT);
+       para_fill(store_tr, GetTR);
+       paravirt_ops.load_tls = vmi_load_tls;
+       para_fill(write_ldt_entry, WriteLDTEntry);
+       para_fill(write_gdt_entry, WriteGDTEntry);
+       para_fill(write_idt_entry, WriteIDTEntry);
+       para_wrap(load_esp0, vmi_load_esp0, set_kernel_stack, UpdateKernelStack);
+       para_fill(set_iopl_mask, SetIOPLMask);
+       para_fill(io_delay, IODelay);
+       para_wrap(set_lazy_mode, vmi_set_lazy_mode, set_lazy_mode, SetLazyMode);
+
+       /* user and kernel flush are just handled with different flags to FlushTLB */
+       para_wrap(flush_tlb_user, vmi_flush_tlb_user, _flush_tlb, FlushTLB);
+       para_wrap(flush_tlb_kernel, vmi_flush_tlb_kernel, _flush_tlb, FlushTLB);
+       para_fill(flush_tlb_single, InvalPage);
+
+       /*
+        * Until a standard flag format can be agreed on, we need to
+        * implement these as wrappers in Linux.  Get the VMI ROM
+        * function pointers for the two backend calls.
+        */
+#ifdef CONFIG_X86_PAE
+       vmi_ops.set_pte = vmi_get_function(VMI_CALL_SetPxELong);
+       vmi_ops.update_pte = vmi_get_function(VMI_CALL_UpdatePxELong);
+#else
+       vmi_ops.set_pte = vmi_get_function(VMI_CALL_SetPxE);
+       vmi_ops.update_pte = vmi_get_function(VMI_CALL_UpdatePxE);
+#endif
+
+       if (vmi_ops.set_pte) {
+               paravirt_ops.set_pte = vmi_set_pte;
+               paravirt_ops.set_pte_at = vmi_set_pte_at;
+               paravirt_ops.set_pmd = vmi_set_pmd;
+#ifdef CONFIG_X86_PAE
+               paravirt_ops.set_pte_atomic = vmi_set_pte_atomic;
+               paravirt_ops.set_pte_present = vmi_set_pte_present;
+               paravirt_ops.set_pud = vmi_set_pud;
+               paravirt_ops.pte_clear = vmi_pte_clear;
+               paravirt_ops.pmd_clear = vmi_pmd_clear;
+#endif
+       }
+
+       if (vmi_ops.update_pte) {
+               paravirt_ops.pte_update = vmi_update_pte;
+               paravirt_ops.pte_update_defer = vmi_update_pte_defer;
+       }
+
+       vmi_ops.allocate_page = vmi_get_function(VMI_CALL_AllocatePage);
+       if (vmi_ops.allocate_page) {
+               paravirt_ops.alloc_pt = vmi_allocate_pt;
+               paravirt_ops.alloc_pd = vmi_allocate_pd;
+               paravirt_ops.alloc_pd_clone = vmi_allocate_pd_clone;
+       }
+
+       vmi_ops.release_page = vmi_get_function(VMI_CALL_ReleasePage);
+       if (vmi_ops.release_page) {
+               paravirt_ops.release_pt = vmi_release_pt;
+               paravirt_ops.release_pd = vmi_release_pd;
+       }
+
+       /* Set linear is needed in all cases */
+       vmi_ops.set_linear_mapping = vmi_get_function(VMI_CALL_SetLinearMapping);
+#ifdef CONFIG_HIGHPTE
+       if (vmi_ops.set_linear_mapping)
+               paravirt_ops.kmap_atomic_pte = vmi_kmap_atomic_pte;
+#endif
+
+       /*
+        * These MUST always be patched.  Don't support indirect jumps
+        * through these operations, as the VMI interface may use either
+        * a jump or a call to get to these operations, depending on
+        * the backend.  They are performance critical anyway, so requiring
+        * a patch is not a big problem.
+        */
+       paravirt_ops.irq_enable_sysexit = (void *)0xfeedbab0;
+       paravirt_ops.iret = (void *)0xbadbab0;
+
+#ifdef CONFIG_SMP
+       para_wrap(startup_ipi_hook, vmi_startup_ipi_hook, set_initial_ap_state, SetInitialAPState);
+#endif
+
+#ifdef CONFIG_X86_LOCAL_APIC
+       para_fill(apic_read, APICRead);
+       para_fill(apic_write, APICWrite);
+       para_fill(apic_write_atomic, APICWrite);
+#endif
+
+       /*
+        * Check for VMI timer functionality by probing for a cycle frequency method
+        */
+       reloc = call_vrom_long_func(vmi_rom, get_reloc, VMI_CALL_GetCycleFrequency);
+       if (!disable_vmi_timer && rel->type != VMI_RELOCATION_NONE) {
+               vmi_timer_ops.get_cycle_frequency = (void *)rel->eip;
+               vmi_timer_ops.get_cycle_counter =
+                       vmi_get_function(VMI_CALL_GetCycleCounter);
+               vmi_timer_ops.get_wallclock =
+                       vmi_get_function(VMI_CALL_GetWallclockTime);
+               vmi_timer_ops.wallclock_updated =
+                       vmi_get_function(VMI_CALL_WallclockUpdated);
+               vmi_timer_ops.set_alarm = vmi_get_function(VMI_CALL_SetAlarm);
+               vmi_timer_ops.cancel_alarm =
+                        vmi_get_function(VMI_CALL_CancelAlarm);
+               paravirt_ops.time_init = vmi_time_init;
+               paravirt_ops.get_wallclock = vmi_get_wallclock;
+               paravirt_ops.set_wallclock = vmi_set_wallclock;
+#ifdef CONFIG_X86_LOCAL_APIC
+               paravirt_ops.setup_boot_clock = vmi_time_bsp_init;
+               paravirt_ops.setup_secondary_clock = vmi_time_ap_init;
+#endif
+               paravirt_ops.sched_clock = vmi_sched_clock;
+               paravirt_ops.get_cpu_khz = vmi_cpu_khz;
+
+               /* We have true wallclock functions; disable CMOS clock sync */
+               no_sync_cmos_clock = 1;
+       } else {
+               disable_noidle = 1;
+               disable_vmi_timer = 1;
+       }
+
+       para_fill(safe_halt, Halt);
+
+       /*
+        * Alternative instruction rewriting doesn't happen soon enough
+        * to convert VMI_IRET to a call instead of a jump; so we have
+        * to do this before IRQs get reenabled.  Fortunately, it is
+        * idempotent.
+        */
+       apply_paravirt(__parainstructions, __parainstructions_end);
+
+       vmi_bringup();
+
+       return 1;
+}
+
+#undef para_fill
+
+void __init vmi_init(void)
+{
+       unsigned long flags;
+
+       if (!vmi_rom)
+               probe_vmi_rom();
+       else
+               check_vmi_rom(vmi_rom);
+
+       /* In case probing for or validating the ROM failed, basil */
+       if (!vmi_rom)
+               return;
+
+       reserve_top_address(-vmi_rom->virtual_top);
+
+       local_irq_save(flags);
+       activate_vmi();
+
+#ifdef CONFIG_X86_IO_APIC
+       /* This is virtual hardware; timer routing is wired correctly */
+       no_timer_check = 1;
+#endif
+       local_irq_restore(flags & X86_EFLAGS_IF);
+}
+
+static int __init parse_vmi(char *arg)
+{
+       if (!arg)
+               return -EINVAL;
+
+       if (!strcmp(arg, "disable_pge")) {
+               clear_bit(X86_FEATURE_PGE, boot_cpu_data.x86_capability);
+               disable_pge = 1;
+       } else if (!strcmp(arg, "disable_pse")) {
+               clear_bit(X86_FEATURE_PSE, boot_cpu_data.x86_capability);
+               disable_pse = 1;
+       } else if (!strcmp(arg, "disable_sep")) {
+               clear_bit(X86_FEATURE_SEP, boot_cpu_data.x86_capability);
+               disable_sep = 1;
+       } else if (!strcmp(arg, "disable_tsc")) {
+               clear_bit(X86_FEATURE_TSC, boot_cpu_data.x86_capability);
+               disable_tsc = 1;
+       } else if (!strcmp(arg, "disable_mtrr")) {
+               clear_bit(X86_FEATURE_MTRR, boot_cpu_data.x86_capability);
+               disable_mtrr = 1;
+       } else if (!strcmp(arg, "disable_timer")) {
+               disable_vmi_timer = 1;
+               disable_noidle = 1;
+       } else if (!strcmp(arg, "disable_noidle"))
+               disable_noidle = 1;
+       return 0;
+}
+
+early_param("vmi", parse_vmi);
diff --git a/arch/x86/kernel/vmiclock_32.c b/arch/x86/kernel/vmiclock_32.c
new file mode 100644 (file)
index 0000000..b1b5ab0
--- /dev/null
@@ -0,0 +1,320 @@
+/*
+ * VMI paravirtual timer support routines.
+ *
+ * Copyright (C) 2007, VMware, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT.  See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#include <linux/smp.h>
+#include <linux/interrupt.h>
+#include <linux/cpumask.h>
+#include <linux/clocksource.h>
+#include <linux/clockchips.h>
+
+#include <asm/vmi.h>
+#include <asm/vmi_time.h>
+#include <asm/arch_hooks.h>
+#include <asm/apicdef.h>
+#include <asm/apic.h>
+#include <asm/timer.h>
+#include <asm/i8253.h>
+
+#include <irq_vectors.h>
+#include "io_ports.h"
+
+#define VMI_ONESHOT  (VMI_ALARM_IS_ONESHOT  | VMI_CYCLES_REAL | vmi_get_alarm_wiring())
+#define VMI_PERIODIC (VMI_ALARM_IS_PERIODIC | VMI_CYCLES_REAL | vmi_get_alarm_wiring())
+
+static DEFINE_PER_CPU(struct clock_event_device, local_events);
+
+static inline u32 vmi_counter(u32 flags)
+{
+       /* Given VMI_ONESHOT or VMI_PERIODIC, return the corresponding
+        * cycle counter. */
+       return flags & VMI_ALARM_COUNTER_MASK;
+}
+
+/* paravirt_ops.get_wallclock = vmi_get_wallclock */
+unsigned long vmi_get_wallclock(void)
+{
+       unsigned long long wallclock;
+       wallclock = vmi_timer_ops.get_wallclock(); // nsec
+       (void)do_div(wallclock, 1000000000);       // sec
+
+       return wallclock;
+}
+
+/* paravirt_ops.set_wallclock = vmi_set_wallclock */
+int vmi_set_wallclock(unsigned long now)
+{
+       return 0;
+}
+
+/* paravirt_ops.sched_clock = vmi_sched_clock */
+unsigned long long vmi_sched_clock(void)
+{
+       return cycles_2_ns(vmi_timer_ops.get_cycle_counter(VMI_CYCLES_AVAILABLE));
+}
+
+/* paravirt_ops.get_cpu_khz = vmi_cpu_khz */
+unsigned long vmi_cpu_khz(void)
+{
+       unsigned long long khz;
+       khz = vmi_timer_ops.get_cycle_frequency();
+       (void)do_div(khz, 1000);
+       return khz;
+}
+
+static inline unsigned int vmi_get_timer_vector(void)
+{
+#ifdef CONFIG_X86_IO_APIC
+       return FIRST_DEVICE_VECTOR;
+#else
+       return FIRST_EXTERNAL_VECTOR;
+#endif
+}
+
+/** vmi clockchip */
+#ifdef CONFIG_X86_LOCAL_APIC
+static unsigned int startup_timer_irq(unsigned int irq)
+{
+       unsigned long val = apic_read(APIC_LVTT);
+       apic_write(APIC_LVTT, vmi_get_timer_vector());
+
+       return (val & APIC_SEND_PENDING);
+}
+
+static void mask_timer_irq(unsigned int irq)
+{
+       unsigned long val = apic_read(APIC_LVTT);
+       apic_write(APIC_LVTT, val | APIC_LVT_MASKED);
+}
+
+static void unmask_timer_irq(unsigned int irq)
+{
+       unsigned long val = apic_read(APIC_LVTT);
+       apic_write(APIC_LVTT, val & ~APIC_LVT_MASKED);
+}
+
+static void ack_timer_irq(unsigned int irq)
+{
+       ack_APIC_irq();
+}
+
+static struct irq_chip vmi_chip __read_mostly = {
+       .name           = "VMI-LOCAL",
+       .startup        = startup_timer_irq,
+       .mask           = mask_timer_irq,
+       .unmask         = unmask_timer_irq,
+       .ack            = ack_timer_irq
+};
+#endif
+
+/** vmi clockevent */
+#define VMI_ALARM_WIRED_IRQ0    0x00000000
+#define VMI_ALARM_WIRED_LVTT    0x00010000
+static int vmi_wiring = VMI_ALARM_WIRED_IRQ0;
+
+static inline int vmi_get_alarm_wiring(void)
+{
+       return vmi_wiring;
+}
+
+static void vmi_timer_set_mode(enum clock_event_mode mode,
+                              struct clock_event_device *evt)
+{
+       cycle_t now, cycles_per_hz;
+       BUG_ON(!irqs_disabled());
+
+       switch (mode) {
+       case CLOCK_EVT_MODE_ONESHOT:
+       case CLOCK_EVT_MODE_RESUME:
+               break;
+       case CLOCK_EVT_MODE_PERIODIC:
+               cycles_per_hz = vmi_timer_ops.get_cycle_frequency();
+               (void)do_div(cycles_per_hz, HZ);
+               now = vmi_timer_ops.get_cycle_counter(vmi_counter(VMI_PERIODIC));
+               vmi_timer_ops.set_alarm(VMI_PERIODIC, now, cycles_per_hz);
+               break;
+       case CLOCK_EVT_MODE_UNUSED:
+       case CLOCK_EVT_MODE_SHUTDOWN:
+               switch (evt->mode) {
+               case CLOCK_EVT_MODE_ONESHOT:
+                       vmi_timer_ops.cancel_alarm(VMI_ONESHOT);
+                       break;
+               case CLOCK_EVT_MODE_PERIODIC:
+                       vmi_timer_ops.cancel_alarm(VMI_PERIODIC);
+                       break;
+               default:
+                       break;
+               }
+               break;
+       default:
+               break;
+       }
+}
+
+static int vmi_timer_next_event(unsigned long delta,
+                               struct clock_event_device *evt)
+{
+       /* Unfortunately, set_next_event interface only passes relative
+        * expiry, but we want absolute expiry.  It'd be better if were
+        * were passed an aboslute expiry, since a bunch of time may
+        * have been stolen between the time the delta is computed and
+        * when we set the alarm below. */
+       cycle_t now = vmi_timer_ops.get_cycle_counter(vmi_counter(VMI_ONESHOT));
+
+       BUG_ON(evt->mode != CLOCK_EVT_MODE_ONESHOT);
+       vmi_timer_ops.set_alarm(VMI_ONESHOT, now + delta, 0);
+       return 0;
+}
+
+static struct clock_event_device vmi_clockevent = {
+       .name           = "vmi-timer",
+       .features       = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
+       .shift          = 22,
+       .set_mode       = vmi_timer_set_mode,
+       .set_next_event = vmi_timer_next_event,
+       .rating         = 1000,
+       .irq            = 0,
+};
+
+static irqreturn_t vmi_timer_interrupt(int irq, void *dev_id)
+{
+       struct clock_event_device *evt = &__get_cpu_var(local_events);
+       evt->event_handler(evt);
+       return IRQ_HANDLED;
+}
+
+static struct irqaction vmi_clock_action  = {
+       .name           = "vmi-timer",
+       .handler        = vmi_timer_interrupt,
+       .flags          = IRQF_DISABLED | IRQF_NOBALANCING,
+       .mask           = CPU_MASK_ALL,
+};
+
+static void __devinit vmi_time_init_clockevent(void)
+{
+       cycle_t cycles_per_msec;
+       struct clock_event_device *evt;
+
+       int cpu = smp_processor_id();
+       evt = &__get_cpu_var(local_events);
+
+       /* Use cycles_per_msec since div_sc params are 32-bits. */
+       cycles_per_msec = vmi_timer_ops.get_cycle_frequency();
+       (void)do_div(cycles_per_msec, 1000);
+
+       memcpy(evt, &vmi_clockevent, sizeof(*evt));
+       /* Must pick .shift such that .mult fits in 32-bits.  Choosing
+        * .shift to be 22 allows 2^(32-22) cycles per nano-seconds
+        * before overflow. */
+       evt->mult = div_sc(cycles_per_msec, NSEC_PER_MSEC, evt->shift);
+       /* Upper bound is clockevent's use of ulong for cycle deltas. */
+       evt->max_delta_ns = clockevent_delta2ns(ULONG_MAX, evt);
+       evt->min_delta_ns = clockevent_delta2ns(1, evt);
+       evt->cpumask = cpumask_of_cpu(cpu);
+
+       printk(KERN_WARNING "vmi: registering clock event %s. mult=%lu shift=%u\n",
+              evt->name, evt->mult, evt->shift);
+       clockevents_register_device(evt);
+}
+
+void __init vmi_time_init(void)
+{
+       /* Disable PIT: BIOSes start PIT CH0 with 18.2hz peridic. */
+       outb_p(0x3a, PIT_MODE); /* binary, mode 5, LSB/MSB, ch 0 */
+
+       vmi_time_init_clockevent();
+       setup_irq(0, &vmi_clock_action);
+}
+
+#ifdef CONFIG_X86_LOCAL_APIC
+void __devinit vmi_time_bsp_init(void)
+{
+       /*
+        * On APIC systems, we want local timers to fire on each cpu.  We do
+        * this by programming LVTT to deliver timer events to the IRQ handler
+        * for IRQ-0, since we can't re-use the APIC local timer handler
+        * without interfering with that code.
+        */
+       clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL);
+       local_irq_disable();
+#ifdef CONFIG_X86_SMP
+       /*
+        * XXX handle_percpu_irq only defined for SMP; we need to switch over
+        * to using it, since this is a local interrupt, which each CPU must
+        * handle individually without locking out or dropping simultaneous
+        * local timers on other CPUs.  We also don't want to trigger the
+        * quirk workaround code for interrupts which gets invoked from
+        * handle_percpu_irq via eoi, so we use our own IRQ chip.
+        */
+       set_irq_chip_and_handler_name(0, &vmi_chip, handle_percpu_irq, "lvtt");
+#else
+       set_irq_chip_and_handler_name(0, &vmi_chip, handle_edge_irq, "lvtt");
+#endif
+       vmi_wiring = VMI_ALARM_WIRED_LVTT;
+       apic_write(APIC_LVTT, vmi_get_timer_vector());
+       local_irq_enable();
+       clockevents_notify(CLOCK_EVT_NOTIFY_RESUME, NULL);
+}
+
+void __devinit vmi_time_ap_init(void)
+{
+       vmi_time_init_clockevent();
+       apic_write(APIC_LVTT, vmi_get_timer_vector());
+}
+#endif
+
+/** vmi clocksource */
+
+static cycle_t read_real_cycles(void)
+{
+       return vmi_timer_ops.get_cycle_counter(VMI_CYCLES_REAL);
+}
+
+static struct clocksource clocksource_vmi = {
+       .name                   = "vmi-timer",
+       .rating                 = 450,
+       .read                   = read_real_cycles,
+       .mask                   = CLOCKSOURCE_MASK(64),
+       .mult                   = 0, /* to be set */
+       .shift                  = 22,
+       .flags                  = CLOCK_SOURCE_IS_CONTINUOUS,
+};
+
+static int __init init_vmi_clocksource(void)
+{
+       cycle_t cycles_per_msec;
+
+       if (!vmi_timer_ops.get_cycle_frequency)
+               return 0;
+       /* Use khz2mult rather than hz2mult since hz arg is only 32-bits. */
+       cycles_per_msec = vmi_timer_ops.get_cycle_frequency();
+       (void)do_div(cycles_per_msec, 1000);
+
+       /* Note that clocksource.{mult, shift} converts in the opposite direction
+        * as clockevents.  */
+       clocksource_vmi.mult = clocksource_khz2mult(cycles_per_msec,
+                                                   clocksource_vmi.shift);
+
+       printk(KERN_WARNING "vmi: registering clock source khz=%lld\n", cycles_per_msec);
+       return clocksource_register(&clocksource_vmi);
+
+}
+module_init(init_vmi_clocksource);
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
new file mode 100644 (file)
index 0000000..849ee61
--- /dev/null
@@ -0,0 +1,5 @@
+#ifdef CONFIG_X86_32
+# include "vmlinux_32.lds.S"
+#else
+# include "vmlinux_64.lds.S"
+#endif
diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S
new file mode 100644 (file)
index 0000000..7d72cce
--- /dev/null
@@ -0,0 +1,213 @@
+/* ld script to make i386 Linux kernel
+ * Written by Martin Mares <mj@atrey.karlin.mff.cuni.cz>;
+ *
+ * Don't define absolute symbols until and unless you know that symbol
+ * value is should remain constant even if kernel image is relocated
+ * at run time. Absolute symbols are not relocated. If symbol value should
+ * change if kernel is relocated, make the symbol section relative and
+ * put it inside the section definition.
+ */
+
+/* Don't define absolute symbols until and unless you know that symbol
+ * value is should remain constant even if kernel image is relocated
+ * at run time. Absolute symbols are not relocated. If symbol value should
+ * change if kernel is relocated, make the symbol section relative and
+ * put it inside the section definition.
+ */
+#define LOAD_OFFSET __PAGE_OFFSET
+
+#include <asm-generic/vmlinux.lds.h>
+#include <asm/thread_info.h>
+#include <asm/page.h>
+#include <asm/cache.h>
+#include <asm/boot.h>
+
+OUTPUT_FORMAT("elf32-i386", "elf32-i386", "elf32-i386")
+OUTPUT_ARCH(i386)
+ENTRY(phys_startup_32)
+jiffies = jiffies_64;
+
+PHDRS {
+       text PT_LOAD FLAGS(5);  /* R_E */
+       data PT_LOAD FLAGS(7);  /* RWE */
+       note PT_NOTE FLAGS(0);  /* ___ */
+}
+SECTIONS
+{
+  . = LOAD_OFFSET + LOAD_PHYSICAL_ADDR;
+  phys_startup_32 = startup_32 - LOAD_OFFSET;
+
+  .text.head : AT(ADDR(.text.head) - LOAD_OFFSET) {
+       _text = .;                      /* Text and read-only data */
+       *(.text.head)
+  } :text = 0x9090
+
+  /* read-only */
+  .text : AT(ADDR(.text) - LOAD_OFFSET) {
+       TEXT_TEXT
+       SCHED_TEXT
+       LOCK_TEXT
+       KPROBES_TEXT
+       *(.fixup)
+       *(.gnu.warning)
+       _etext = .;                     /* End of text section */
+  } :text = 0x9090
+
+  . = ALIGN(16);               /* Exception table */
+  __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) {
+       __start___ex_table = .;
+        *(__ex_table)
+       __stop___ex_table = .;
+  }
+
+  NOTES :text :note
+
+  BUG_TABLE :text
+
+  . = ALIGN(4);
+  .tracedata : AT(ADDR(.tracedata) - LOAD_OFFSET) {
+       __tracedata_start = .;
+       *(.tracedata)
+       __tracedata_end = .;
+  }
+
+  RODATA
+
+  /* writeable */
+  . = ALIGN(4096);
+  .data : AT(ADDR(.data) - LOAD_OFFSET) {      /* Data */
+       DATA_DATA
+       CONSTRUCTORS
+       } :data
+
+  . = ALIGN(4096);
+  .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) {
+       __nosave_begin = .;
+       *(.data.nosave)
+       . = ALIGN(4096);
+       __nosave_end = .;
+  }
+
+  . = ALIGN(4096);
+  .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) {
+       *(.data.page_aligned)
+       *(.data.idt)
+  }
+
+  . = ALIGN(32);
+  .data.cacheline_aligned : AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) {
+       *(.data.cacheline_aligned)
+  }
+
+  /* rarely changed data like cpu maps */
+  . = ALIGN(32);
+  .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) {
+       *(.data.read_mostly)
+       _edata = .;             /* End of data section */
+  }
+
+  . = ALIGN(THREAD_SIZE);      /* init_task */
+  .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) {
+       *(.data.init_task)
+  }
+
+  /* might get freed after init */
+  . = ALIGN(4096);
+  .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) {
+       __smp_locks = .;
+       *(.smp_locks)
+       __smp_locks_end = .;
+  }
+  /* will be freed after init
+   * Following ALIGN() is required to make sure no other data falls on the
+   * same page where __smp_alt_end is pointing as that page might be freed
+   * after boot. Always make sure that ALIGN() directive is present after
+   * the section which contains __smp_alt_end.
+   */
+  . = ALIGN(4096);
+
+  /* will be freed after init */
+  . = ALIGN(4096);             /* Init code and data */
+  .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) {
+       __init_begin = .;
+       _sinittext = .;
+       *(.init.text)
+       _einittext = .;
+  }
+  .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) { *(.init.data) }
+  . = ALIGN(16);
+  .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) {
+       __setup_start = .;
+       *(.init.setup)
+       __setup_end = .;
+   }
+  .initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) {
+       __initcall_start = .;
+       INITCALLS
+       __initcall_end = .;
+  }
+  .con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) {
+       __con_initcall_start = .;
+       *(.con_initcall.init)
+       __con_initcall_end = .;
+  }
+  SECURITY_INIT
+  . = ALIGN(4);
+  .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) {
+       __alt_instructions = .;
+       *(.altinstructions)
+       __alt_instructions_end = .;
+  }
+  .altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) {
+       *(.altinstr_replacement)
+  }
+  . = ALIGN(4);
+  .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) {
+       __parainstructions = .;
+       *(.parainstructions)
+       __parainstructions_end = .;
+  }
+  /* .exit.text is discard at runtime, not link time, to deal with references
+     from .altinstructions and .eh_frame */
+  .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) { *(.exit.text) }
+  .exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) { *(.exit.data) }
+#if defined(CONFIG_BLK_DEV_INITRD)
+  . = ALIGN(4096);
+  .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) {
+       __initramfs_start = .;
+       *(.init.ramfs)
+       __initramfs_end = .;
+  }
+#endif
+  . = ALIGN(4096);
+  .data.percpu  : AT(ADDR(.data.percpu) - LOAD_OFFSET) {
+       __per_cpu_start = .;
+       *(.data.percpu)
+       *(.data.percpu.shared_aligned)
+       __per_cpu_end = .;
+  }
+  . = ALIGN(4096);
+  /* freed after init ends here */
+       
+  .bss : AT(ADDR(.bss) - LOAD_OFFSET) {
+       __init_end = .;
+       __bss_start = .;                /* BSS */
+       *(.bss.page_aligned)
+       *(.bss)
+       . = ALIGN(4);
+       __bss_stop = .;
+       _end = . ;
+       /* This is where the kernel creates the early boot page tables */
+       . = ALIGN(4096);
+       pg0 = . ;
+  }
+
+  /* Sections to be discarded */
+  /DISCARD/ : {
+       *(.exitcall.exit)
+       }
+
+  STABS_DEBUG
+
+  DWARF_DEBUG
+}
diff --git a/arch/x86/kernel/vsyscall-int80_32.S b/arch/x86/kernel/vsyscall-int80_32.S
new file mode 100644 (file)
index 0000000..103cab6
--- /dev/null
@@ -0,0 +1,53 @@
+/*
+ * Code for the vsyscall page.  This version uses the old int $0x80 method.
+ *
+ * NOTE:
+ * 1) __kernel_vsyscall _must_ be first in this page.
+ * 2) there are alignment constraints on this stub, see vsyscall-sigreturn.S
+ *    for details.
+ */
+
+       .text
+       .globl __kernel_vsyscall
+       .type __kernel_vsyscall,@function
+__kernel_vsyscall:
+.LSTART_vsyscall:
+       int $0x80
+       ret
+.LEND_vsyscall:
+       .size __kernel_vsyscall,.-.LSTART_vsyscall
+       .previous
+
+       .section .eh_frame,"a",@progbits
+.LSTARTFRAMEDLSI:
+       .long .LENDCIEDLSI-.LSTARTCIEDLSI
+.LSTARTCIEDLSI:
+       .long 0                 /* CIE ID */
+       .byte 1                 /* Version number */
+       .string "zR"            /* NUL-terminated augmentation string */
+       .uleb128 1              /* Code alignment factor */
+       .sleb128 -4             /* Data alignment factor */
+       .byte 8                 /* Return address register column */
+       .uleb128 1              /* Augmentation value length */
+       .byte 0x1b              /* DW_EH_PE_pcrel|DW_EH_PE_sdata4. */
+       .byte 0x0c              /* DW_CFA_def_cfa */
+       .uleb128 4
+       .uleb128 4
+       .byte 0x88              /* DW_CFA_offset, column 0x8 */
+       .uleb128 1
+       .align 4
+.LENDCIEDLSI:
+       .long .LENDFDEDLSI-.LSTARTFDEDLSI /* Length FDE */
+.LSTARTFDEDLSI:
+       .long .LSTARTFDEDLSI-.LSTARTFRAMEDLSI /* CIE pointer */
+       .long .LSTART_vsyscall-.        /* PC-relative start address */
+       .long .LEND_vsyscall-.LSTART_vsyscall
+       .uleb128 0
+       .align 4
+.LENDFDEDLSI:
+       .previous
+
+/*
+ * Get the common code for the sigreturn entry points.
+ */
+#include "vsyscall-sigreturn_32.S"
diff --git a/arch/x86/kernel/vsyscall-note_32.S b/arch/x86/kernel/vsyscall-note_32.S
new file mode 100644 (file)
index 0000000..fcf376a
--- /dev/null
@@ -0,0 +1,45 @@
+/*
+ * This supplies .note.* sections to go into the PT_NOTE inside the vDSO text.
+ * Here we can supply some information useful to userland.
+ */
+
+#include <linux/version.h>
+#include <linux/elfnote.h>
+
+/* Ideally this would use UTS_NAME, but using a quoted string here
+   doesn't work. Remember to change this when changing the
+   kernel's name. */
+ELFNOTE_START(Linux, 0, "a")
+       .long LINUX_VERSION_CODE
+ELFNOTE_END
+
+#ifdef CONFIG_XEN
+/*
+ * Add a special note telling glibc's dynamic linker a fake hardware
+ * flavor that it will use to choose the search path for libraries in the
+ * same way it uses real hardware capabilities like "mmx".
+ * We supply "nosegneg" as the fake capability, to indicate that we
+ * do not like negative offsets in instructions using segment overrides,
+ * since we implement those inefficiently.  This makes it possible to
+ * install libraries optimized to avoid those access patterns in someplace
+ * like /lib/i686/tls/nosegneg.  Note that an /etc/ld.so.conf.d/file
+ * corresponding to the bits here is needed to make ldconfig work right.
+ * It should contain:
+ *     hwcap 1 nosegneg
+ * to match the mapping of bit to name that we give here.
+ *
+ * At runtime, the fake hardware feature will be considered to be present
+ * if its bit is set in the mask word.  So, we start with the mask 0, and
+ * at boot time we set VDSO_NOTE_NONEGSEG_BIT if running under Xen.
+ */
+
+#include "../../x86/xen/vdso.h"        /* Defines VDSO_NOTE_NONEGSEG_BIT.  */
+
+       .globl VDSO_NOTE_MASK
+ELFNOTE_START(GNU, 2, "a")
+       .long 1                 /* ncaps */
+VDSO_NOTE_MASK:
+       .long 0                 /* mask */
+       .byte VDSO_NOTE_NONEGSEG_BIT; .asciz "nosegneg" /* bit, name */
+ELFNOTE_END
+#endif
diff --git a/arch/x86/kernel/vsyscall-sigreturn_32.S b/arch/x86/kernel/vsyscall-sigreturn_32.S
new file mode 100644 (file)
index 0000000..a92262f
--- /dev/null
@@ -0,0 +1,143 @@
+/*
+ * Common code for the sigreturn entry points on the vsyscall page.
+ * So far this code is the same for both int80 and sysenter versions.
+ * This file is #include'd by vsyscall-*.S to define them after the
+ * vsyscall entry point.  The kernel assumes that the addresses of these
+ * routines are constant for all vsyscall implementations.
+ */
+
+#include <asm/unistd.h>
+#include <asm/asm-offsets.h>
+
+
+/* XXX
+   Should these be named "_sigtramp" or something?
+*/
+
+       .text
+       .org __kernel_vsyscall+32,0x90
+       .globl __kernel_sigreturn
+       .type __kernel_sigreturn,@function
+__kernel_sigreturn:
+.LSTART_sigreturn:
+       popl %eax               /* XXX does this mean it needs unwind info? */
+       movl $__NR_sigreturn, %eax
+       int $0x80
+.LEND_sigreturn:
+       .size __kernel_sigreturn,.-.LSTART_sigreturn
+
+       .balign 32
+       .globl __kernel_rt_sigreturn
+       .type __kernel_rt_sigreturn,@function
+__kernel_rt_sigreturn:
+.LSTART_rt_sigreturn:
+       movl $__NR_rt_sigreturn, %eax
+       int $0x80
+.LEND_rt_sigreturn:
+       .size __kernel_rt_sigreturn,.-.LSTART_rt_sigreturn
+       .balign 32
+       .previous
+
+       .section .eh_frame,"a",@progbits
+.LSTARTFRAMEDLSI1:
+       .long .LENDCIEDLSI1-.LSTARTCIEDLSI1
+.LSTARTCIEDLSI1:
+       .long 0                 /* CIE ID */
+       .byte 1                 /* Version number */
+       .string "zRS"           /* NUL-terminated augmentation string */
+       .uleb128 1              /* Code alignment factor */
+       .sleb128 -4             /* Data alignment factor */
+       .byte 8                 /* Return address register column */
+       .uleb128 1              /* Augmentation value length */
+       .byte 0x1b              /* DW_EH_PE_pcrel|DW_EH_PE_sdata4. */
+       .byte 0                 /* DW_CFA_nop */
+       .align 4
+.LENDCIEDLSI1:
+       .long .LENDFDEDLSI1-.LSTARTFDEDLSI1 /* Length FDE */
+.LSTARTFDEDLSI1:
+       .long .LSTARTFDEDLSI1-.LSTARTFRAMEDLSI1 /* CIE pointer */
+       /* HACK: The dwarf2 unwind routines will subtract 1 from the
+          return address to get an address in the middle of the
+          presumed call instruction.  Since we didn't get here via
+          a call, we need to include the nop before the real start
+          to make up for it.  */
+       .long .LSTART_sigreturn-1-.     /* PC-relative start address */
+       .long .LEND_sigreturn-.LSTART_sigreturn+1
+       .uleb128 0                      /* Augmentation */
+       /* What follows are the instructions for the table generation.
+          We record the locations of each register saved.  This is
+          complicated by the fact that the "CFA" is always assumed to
+          be the value of the stack pointer in the caller.  This means
+          that we must define the CFA of this body of code to be the
+          saved value of the stack pointer in the sigcontext.  Which
+          also means that there is no fixed relation to the other 
+          saved registers, which means that we must use DW_CFA_expression
+          to compute their addresses.  It also means that when we 
+          adjust the stack with the popl, we have to do it all over again.  */
+
+#define do_cfa_expr(offset)                                            \
+       .byte 0x0f;                     /* DW_CFA_def_cfa_expression */ \
+       .uleb128 1f-0f;                 /*   length */                  \
+0:     .byte 0x74;                     /*     DW_OP_breg4 */           \
+       .sleb128 offset;                /*      offset */               \
+       .byte 0x06;                     /*     DW_OP_deref */           \
+1:
+
+#define do_expr(regno, offset)                                         \
+       .byte 0x10;                     /* DW_CFA_expression */         \
+       .uleb128 regno;                 /*   regno */                   \
+       .uleb128 1f-0f;                 /*   length */                  \
+0:     .byte 0x74;                     /*     DW_OP_breg4 */           \
+       .sleb128 offset;                /*       offset */              \
+1:
+
+       do_cfa_expr(SIGCONTEXT_esp+4)
+       do_expr(0, SIGCONTEXT_eax+4)
+       do_expr(1, SIGCONTEXT_ecx+4)
+       do_expr(2, SIGCONTEXT_edx+4)
+       do_expr(3, SIGCONTEXT_ebx+4)
+       do_expr(5, SIGCONTEXT_ebp+4)
+       do_expr(6, SIGCONTEXT_esi+4)
+       do_expr(7, SIGCONTEXT_edi+4)
+       do_expr(8, SIGCONTEXT_eip+4)
+
+       .byte 0x42      /* DW_CFA_advance_loc 2 -- nop; popl eax. */
+
+       do_cfa_expr(SIGCONTEXT_esp)
+       do_expr(0, SIGCONTEXT_eax)
+       do_expr(1, SIGCONTEXT_ecx)
+       do_expr(2, SIGCONTEXT_edx)
+       do_expr(3, SIGCONTEXT_ebx)
+       do_expr(5, SIGCONTEXT_ebp)
+       do_expr(6, SIGCONTEXT_esi)
+       do_expr(7, SIGCONTEXT_edi)
+       do_expr(8, SIGCONTEXT_eip)
+
+       .align 4
+.LENDFDEDLSI1:
+
+       .long .LENDFDEDLSI2-.LSTARTFDEDLSI2 /* Length FDE */
+.LSTARTFDEDLSI2:
+       .long .LSTARTFDEDLSI2-.LSTARTFRAMEDLSI1 /* CIE pointer */
+       /* HACK: See above wrt unwind library assumptions.  */
+       .long .LSTART_rt_sigreturn-1-.  /* PC-relative start address */
+       .long .LEND_rt_sigreturn-.LSTART_rt_sigreturn+1
+       .uleb128 0                      /* Augmentation */
+       /* What follows are the instructions for the table generation.
+          We record the locations of each register saved.  This is
+          slightly less complicated than the above, since we don't
+          modify the stack pointer in the process.  */
+
+       do_cfa_expr(RT_SIGFRAME_sigcontext-4 + SIGCONTEXT_esp)
+       do_expr(0, RT_SIGFRAME_sigcontext-4 + SIGCONTEXT_eax)
+       do_expr(1, RT_SIGFRAME_sigcontext-4 + SIGCONTEXT_ecx)
+       do_expr(2, RT_SIGFRAME_sigcontext-4 + SIGCONTEXT_edx)
+       do_expr(3, RT_SIGFRAME_sigcontext-4 + SIGCONTEXT_ebx)
+       do_expr(5, RT_SIGFRAME_sigcontext-4 + SIGCONTEXT_ebp)
+       do_expr(6, RT_SIGFRAME_sigcontext-4 + SIGCONTEXT_esi)
+       do_expr(7, RT_SIGFRAME_sigcontext-4 + SIGCONTEXT_edi)
+       do_expr(8, RT_SIGFRAME_sigcontext-4 + SIGCONTEXT_eip)
+
+       .align 4
+.LENDFDEDLSI2:
+       .previous
diff --git a/arch/x86/kernel/vsyscall-sysenter_32.S b/arch/x86/kernel/vsyscall-sysenter_32.S
new file mode 100644 (file)
index 0000000..ed879bf
--- /dev/null
@@ -0,0 +1,122 @@
+/*
+ * Code for the vsyscall page.  This version uses the sysenter instruction.
+ *
+ * NOTE:
+ * 1) __kernel_vsyscall _must_ be first in this page.
+ * 2) there are alignment constraints on this stub, see vsyscall-sigreturn.S
+ *    for details.
+ */
+
+/*
+ * The caller puts arg2 in %ecx, which gets pushed. The kernel will use
+ * %ecx itself for arg2. The pushing is because the sysexit instruction
+ * (found in entry.S) requires that we clobber %ecx with the desired %esp.
+ * User code might expect that %ecx is unclobbered though, as it would be
+ * for returning via the iret instruction, so we must push and pop.
+ *
+ * The caller puts arg3 in %edx, which the sysexit instruction requires
+ * for %eip. Thus, exactly as for arg2, we must push and pop.
+ *
+ * Arg6 is different. The caller puts arg6 in %ebp. Since the sysenter
+ * instruction clobbers %esp, the user's %esp won't even survive entry
+ * into the kernel. We store %esp in %ebp. Code in entry.S must fetch
+ * arg6 from the stack.
+ *
+ * You can not use this vsyscall for the clone() syscall because the
+ * three dwords on the parent stack do not get copied to the child.
+ */
+       .text
+       .globl __kernel_vsyscall
+       .type __kernel_vsyscall,@function
+__kernel_vsyscall:
+.LSTART_vsyscall:
+       push %ecx
+.Lpush_ecx:
+       push %edx
+.Lpush_edx:
+       push %ebp
+.Lenter_kernel:
+       movl %esp,%ebp
+       sysenter
+
+       /* 7: align return point with nop's to make disassembly easier */
+       .space 7,0x90
+
+       /* 14: System call restart point is here! (SYSENTER_RETURN-2) */
+       jmp .Lenter_kernel
+       /* 16: System call normal return point is here! */
+       .globl SYSENTER_RETURN  /* Symbol used by sysenter.c  */
+SYSENTER_RETURN:
+       pop %ebp
+.Lpop_ebp:
+       pop %edx
+.Lpop_edx:
+       pop %ecx
+.Lpop_ecx:
+       ret
+.LEND_vsyscall:
+       .size __kernel_vsyscall,.-.LSTART_vsyscall
+       .previous
+
+       .section .eh_frame,"a",@progbits
+.LSTARTFRAMEDLSI:
+       .long .LENDCIEDLSI-.LSTARTCIEDLSI
+.LSTARTCIEDLSI:
+       .long 0                 /* CIE ID */
+       .byte 1                 /* Version number */
+       .string "zR"            /* NUL-terminated augmentation string */
+       .uleb128 1              /* Code alignment factor */
+       .sleb128 -4             /* Data alignment factor */
+       .byte 8                 /* Return address register column */
+       .uleb128 1              /* Augmentation value length */
+       .byte 0x1b              /* DW_EH_PE_pcrel|DW_EH_PE_sdata4. */
+       .byte 0x0c              /* DW_CFA_def_cfa */
+       .uleb128 4
+       .uleb128 4
+       .byte 0x88              /* DW_CFA_offset, column 0x8 */
+       .uleb128 1
+       .align 4
+.LENDCIEDLSI:
+       .long .LENDFDEDLSI-.LSTARTFDEDLSI /* Length FDE */
+.LSTARTFDEDLSI:
+       .long .LSTARTFDEDLSI-.LSTARTFRAMEDLSI /* CIE pointer */
+       .long .LSTART_vsyscall-.        /* PC-relative start address */
+       .long .LEND_vsyscall-.LSTART_vsyscall
+       .uleb128 0
+       /* What follows are the instructions for the table generation.
+          We have to record all changes of the stack pointer.  */
+       .byte 0x04              /* DW_CFA_advance_loc4 */
+       .long .Lpush_ecx-.LSTART_vsyscall
+       .byte 0x0e              /* DW_CFA_def_cfa_offset */
+       .byte 0x08              /* RA at offset 8 now */
+       .byte 0x04              /* DW_CFA_advance_loc4 */
+       .long .Lpush_edx-.Lpush_ecx
+       .byte 0x0e              /* DW_CFA_def_cfa_offset */
+       .byte 0x0c              /* RA at offset 12 now */
+       .byte 0x04              /* DW_CFA_advance_loc4 */
+       .long .Lenter_kernel-.Lpush_edx
+       .byte 0x0e              /* DW_CFA_def_cfa_offset */
+       .byte 0x10              /* RA at offset 16 now */
+       .byte 0x85, 0x04        /* DW_CFA_offset %ebp -16 */
+       /* Finally the epilogue.  */
+       .byte 0x04              /* DW_CFA_advance_loc4 */
+       .long .Lpop_ebp-.Lenter_kernel
+       .byte 0x0e              /* DW_CFA_def_cfa_offset */
+       .byte 0x0c              /* RA at offset 12 now */
+       .byte 0xc5              /* DW_CFA_restore %ebp */
+       .byte 0x04              /* DW_CFA_advance_loc4 */
+       .long .Lpop_edx-.Lpop_ebp
+       .byte 0x0e              /* DW_CFA_def_cfa_offset */
+       .byte 0x08              /* RA at offset 8 now */
+       .byte 0x04              /* DW_CFA_advance_loc4 */
+       .long .Lpop_ecx-.Lpop_edx
+       .byte 0x0e              /* DW_CFA_def_cfa_offset */
+       .byte 0x04              /* RA at offset 4 now */
+       .align 4
+.LENDFDEDLSI:
+       .previous
+
+/*
+ * Get the common code for the sigreturn entry points.
+ */
+#include "vsyscall-sigreturn_32.S"
diff --git a/arch/x86/kernel/vsyscall_32.S b/arch/x86/kernel/vsyscall_32.S
new file mode 100644 (file)
index 0000000..a5ab3dc
--- /dev/null
@@ -0,0 +1,15 @@
+#include <linux/init.h>
+
+__INITDATA
+
+       .globl vsyscall_int80_start, vsyscall_int80_end
+vsyscall_int80_start:
+       .incbin "arch/x86/kernel/vsyscall-int80_32.so"
+vsyscall_int80_end:
+
+       .globl vsyscall_sysenter_start, vsyscall_sysenter_end
+vsyscall_sysenter_start:
+       .incbin "arch/x86/kernel/vsyscall-sysenter_32.so"
+vsyscall_sysenter_end:
+
+__FINIT
diff --git a/arch/x86/kernel/vsyscall_32.lds.S b/arch/x86/kernel/vsyscall_32.lds.S
new file mode 100644 (file)
index 0000000..4a8b0ed
--- /dev/null
@@ -0,0 +1,67 @@
+/*
+ * Linker script for vsyscall DSO.  The vsyscall page is an ELF shared
+ * object prelinked to its virtual address, and with only one read-only
+ * segment (that fits in one page).  This script controls its layout.
+ */
+#include <asm/asm-offsets.h>
+
+SECTIONS
+{
+  . = VDSO_PRELINK_asm + SIZEOF_HEADERS;
+
+  .hash           : { *(.hash) }               :text
+  .gnu.hash       : { *(.gnu.hash) }
+  .dynsym         : { *(.dynsym) }
+  .dynstr         : { *(.dynstr) }
+  .gnu.version    : { *(.gnu.version) }
+  .gnu.version_d  : { *(.gnu.version_d) }
+  .gnu.version_r  : { *(.gnu.version_r) }
+
+  /* This linker script is used both with -r and with -shared.
+     For the layouts to match, we need to skip more than enough
+     space for the dynamic symbol table et al.  If this amount
+     is insufficient, ld -shared will barf.  Just increase it here.  */
+  . = VDSO_PRELINK_asm + 0x400;
+
+  .text           : { *(.text) }               :text =0x90909090
+  .note                  : { *(.note.*) }              :text :note
+  .eh_frame_hdr   : { *(.eh_frame_hdr) }       :text :eh_frame_hdr
+  .eh_frame       : { KEEP (*(.eh_frame)) }    :text
+  .dynamic        : { *(.dynamic) }            :text :dynamic
+  .useless        : {
+       *(.got.plt) *(.got)
+       *(.data .data.* .gnu.linkonce.d.*)
+       *(.dynbss)
+       *(.bss .bss.* .gnu.linkonce.b.*)
+  }                                            :text
+}
+
+/*
+ * We must supply the ELF program headers explicitly to get just one
+ * PT_LOAD segment, and set the flags explicitly to make segments read-only.
+ */
+PHDRS
+{
+  text PT_LOAD FILEHDR PHDRS FLAGS(5); /* PF_R|PF_X */
+  dynamic PT_DYNAMIC FLAGS(4); /* PF_R */
+  note PT_NOTE FLAGS(4); /* PF_R */
+  eh_frame_hdr 0x6474e550; /* PT_GNU_EH_FRAME, but ld doesn't match the name */
+}
+
+/*
+ * This controls what symbols we export from the DSO.
+ */
+VERSION
+{
+  LINUX_2.5 {
+    global:
+       __kernel_vsyscall;
+       __kernel_sigreturn;
+       __kernel_rt_sigreturn;
+
+    local: *;
+  };
+}
+
+/* The ELF entry point can be used to set the AT_SYSINFO value.  */
+ENTRY(__kernel_vsyscall);
index 08f489a..19d6d40 100644 (file)
@@ -2,7 +2,7 @@
 # Makefile for the generic architecture
 #
 
-EXTRA_CFLAGS   := -Iarch/i386/kernel
+EXTRA_CFLAGS   := -Iarch/x86/kernel
 
 obj-y          := probe.o summit.o bigsmp.o es7000.o default.o 
 obj-y          += ../../x86/mach-es7000/
index 33b74cf..15c250b 100644 (file)
@@ -2,7 +2,7 @@
 # Makefile for the linux kernel.
 #
 
-EXTRA_CFLAGS   := -Iarch/i386/kernel
+EXTRA_CFLAGS   := -Iarch/x86/kernel
 obj-y                  := setup.o voyager_basic.o voyager_thread.o
 
 obj-$(CONFIG_SMP)      += voyager_smp.o voyager_cat.o
index 83e7fd6..b383be0 100644 (file)
@@ -139,5 +139,5 @@ __kernel_rt_sigreturn:
        .align 4
 .LENDFDE3:
 
-#include "../../i386/kernel/vsyscall-note_32.S"
+#include "../../x86/kernel/vsyscall-note_32.S"
 
index d3ebd16..577d08f 100644 (file)
@@ -1,5 +1,5 @@
 ifeq ($(CONFIG_X86_32),y)
-include ${srctree}/arch/i386/kernel/Makefile_32
+include ${srctree}/arch/x86/kernel/Makefile_32
 else
 include ${srctree}/arch/x86_64/kernel/Makefile_64
 endif
index 690aebf..3a204cc 100644 (file)
@@ -49,15 +49,15 @@ obj-y                               += pcspeaker.o
 CFLAGS_vsyscall_64.o           := $(PROFILING) -g0
 
 therm_throt-y                   += ../../x86/kernel/cpu/mcheck/therm_throt.o
-bootflag-y                     += ../../i386/kernel/bootflag.o
-cpuid-$(subst m,y,$(CONFIG_X86_CPUID))  += ../../i386/kernel/cpuid.o
-topology-y                     += ../../i386/kernel/topology.o
-microcode-$(subst m,y,$(CONFIG_MICROCODE))  += ../../i386/kernel/microcode.o
+bootflag-y                     += ../../x86/kernel/bootflag.o
+cpuid-$(subst m,y,$(CONFIG_X86_CPUID))  += ../../x86/kernel/cpuid.o
+topology-y                     += ../../x86/kernel/topology.o
+microcode-$(subst m,y,$(CONFIG_MICROCODE))  += ../../x86/kernel/microcode.o
 intel_cacheinfo-y              += ../../x86/kernel/cpu/intel_cacheinfo.o
 addon_cpuid_features-y         += ../../x86/kernel/cpu/addon_cpuid_features.o
-quirks-y                       += ../../i386/kernel/quirks.o
-i8237-y                                += ../../i386/kernel/i8237.o
-msr-$(subst m,y,$(CONFIG_X86_MSR))  += ../../i386/kernel/msr.o
-alternative-y                  += ../../i386/kernel/alternative.o
-pcspeaker-y                    += ../../i386/kernel/pcspeaker.o
+quirks-y                       += ../../x86/kernel/quirks.o
+i8237-y                                += ../../x86/kernel/i8237.o
+msr-$(subst m,y,$(CONFIG_X86_MSR))  += ../../x86/kernel/msr.o
+alternative-y                  += ../../x86/kernel/alternative.o
+pcspeaker-y                    += ../../x86/kernel/pcspeaker.o
 perfctr-watchdog-y             += ../../x86/kernel/cpu/perfctr-watchdog.o