From af5f48bdae2a14ea9877d7f64aac1db80c3582eb Mon Sep 17 00:00:00 2001 From: Robert Buchholz Date: Thu, 28 Aug 2008 19:08:00 +0000 Subject: Upgrade patches to Xen 3.3.0 and Debian Security 2.6.18.dfsg.1-22etch2 svn path=/patches/; revision=81 --- trunk/2.6.18/00000_README | 62 +- trunk/2.6.18/10001_xen-3.2.0.patch | 139942 ------------ trunk/2.6.18/10002_xen-3.3.0.patch | 210345 ++++++++++++++++++ trunk/2.6.18/30073_nfs-write-corruption.patch | 76 + ...86-clear-df-before-calling-signal-handler.patch | 57 + trunk/2.6.18/30075_3w-xxxx-bigmem-corruption.patch | 42 + trunk/2.6.18/30076_dnotify-race-locking.patch | 29 + ...sctp-make-sure-n-sizeof-does-not-overflow.patch | 29 + .../30078_esp-iv-in-linear-part-of-skb.patch | 48 + ...fix-zeroing-on-exception-in-copy_user-pre.patch | 798 + ...d64-fix-zeroing-on-exception-in-copy_user.patch | 272 + .../30080_tty-fix-for-tty-operations-bugs.patch | 183 + ...ivileges-before-setting-mount-propagation.patch | 28 + .../30082a_x86-add-copy_user_handle_tail.patch | 56 + trunk/2.6.18/30082b_x86-fix-copy_user.patch | 537 + ...86-wrong-register-was-used-in-align-macro.patch | 29 + trunk/2.6.18/30084_cifs-fix-compiler-warning.patch | 21 + ...tfilter-nf_nat_snmp_basic-fix-range-check.patch | 27 + ...r-is-valid-in-snd_seq_oss_synth_make_info.patch | 30 + ...30087_vfs-fix-lookup-on-deleted-directory.patch | 71 + 20 files changed, 212738 insertions(+), 139944 deletions(-) delete mode 100644 trunk/2.6.18/10001_xen-3.2.0.patch create mode 100644 trunk/2.6.18/10002_xen-3.3.0.patch create mode 100644 trunk/2.6.18/30073_nfs-write-corruption.patch create mode 100644 trunk/2.6.18/30074_x86-clear-df-before-calling-signal-handler.patch create mode 100644 trunk/2.6.18/30075_3w-xxxx-bigmem-corruption.patch create mode 100644 trunk/2.6.18/30076_dnotify-race-locking.patch create mode 100644 trunk/2.6.18/30077_sctp-make-sure-n-sizeof-does-not-overflow.patch create mode 100644 trunk/2.6.18/30078_esp-iv-in-linear-part-of-skb.patch create mode 100644 trunk/2.6.18/30079a_amd64-fix-zeroing-on-exception-in-copy_user-pre.patch create mode 100644 trunk/2.6.18/30079b_amd64-fix-zeroing-on-exception-in-copy_user.patch create mode 100644 trunk/2.6.18/30080_tty-fix-for-tty-operations-bugs.patch create mode 100644 trunk/2.6.18/30081_check-privileges-before-setting-mount-propagation.patch create mode 100644 trunk/2.6.18/30082a_x86-add-copy_user_handle_tail.patch create mode 100644 trunk/2.6.18/30082b_x86-fix-copy_user.patch create mode 100644 trunk/2.6.18/30083_x86-wrong-register-was-used-in-align-macro.patch create mode 100644 trunk/2.6.18/30084_cifs-fix-compiler-warning.patch create mode 100644 trunk/2.6.18/30085_netfilter-nf_nat_snmp_basic-fix-range-check.patch create mode 100644 trunk/2.6.18/30086_sound-ensure-device-number-is-valid-in-snd_seq_oss_synth_make_info.patch create mode 100644 trunk/2.6.18/30087_vfs-fix-lookup-on-deleted-directory.patch diff --git a/trunk/2.6.18/00000_README b/trunk/2.6.18/00000_README index 7ee89f7..c9c4c91 100644 --- a/trunk/2.6.18/00000_README +++ b/trunk/2.6.18/00000_README @@ -19,8 +19,8 @@ Numbering Patches ------- -10001_xen-3.2.0.patch - Upstream 3.2.0 patch +10002_xen-3.3.0.patch + Upstream 3.3.0 patch 30001_nfnetlink_log-null-deref.patch [SECURITY] Fix remotely exploitable NULL pointer dereference in @@ -329,6 +329,64 @@ Patches heap overflow See CVE-2008-1673 +30073_nfs-write-corruption.patch + Fix potential nfs write corruption (closes: #470719) + +30074_x86-clear-df-before-calling-signal-handler.patch + [i386/amd64] Clear DF before calling signal handler. (closes: #469058) + CVE-2008-1367 + +30075_3w-xxxx-bigmem-corruption.patch + 3w-xxxx: Fix data corruption on em64t systems w/ > 2GB of memory + (closes: #464923). + +30076_dnotify-race-locking.patch + Add missing locking for the dnotify-race fix that was included in + the upstream commit + +30077_sctp-make-sure-n-sizeof-does-not-overflow.patch + [SECURITY] Fix potential overflow condition in + sctp_getsockopt_local_addrs_old + See CVE-2008-2826 + +30078_esp-iv-in-linear-part-of-skb.patch + [SECURITY] Avoid tripping BUG() in IPsec code when the first fragment + of an ESP packet does not contain the entire ESP header and IV + See CVE-2007-6282 + +30079a_amd64-fix-zeroing-on-exception-in-copy_user-pre.patch +30079b_amd64-fix-zeroing-on-exception-in-copy_user.patch + [SECURITY] [amd64] Fix potential information leak when a copy + operation fails by properly zeroing out destination memory + See CVE-2008-2729 + +30080_tty-fix-for-tty-operations-bugs.patch + [SECURITY] Fix issues with tty operation handling in various drivers + See CVE-2008-2812 + +30081_check-privileges-before-setting-mount-propagation.patch + [SECURITY] Check CAP_SYS_ADMIN when changing mountpoint type + See CVE-2008-2931 + +30082a_x86-add-copy_user_handle_tail.patch +30082b_x86-fix-copy_user.patch + [SECURITY][amd64] Fix memory leak in the copy_user routine, see #490910. + See CVE-2008-0598 + +30083_x86-wrong-register-was-used-in-align-macro.patch + Fix regression introduced upstream by the fix for CVE-2008-0598 + +30084_cifs-fix-compiler-warning.patch +30085_netfilter-nf_nat_snmp_basic-fix-range-check.patch + Fix regressions introduced upstream by the fixes for CVE-2008-1673 + +30086_sound-ensure-device-number-is-valid-in-snd_seq_oss_synth_make_info.patch + Fix possible information leak in seq_oss_synth.c + See CVE-2008-3272 + +30087_vfs-fix-lookup-on-deleted-directory.patch + Fix potential memory leak in lookup path + See CVE-2008-3275 50009_gentooify-tls-warning.patch Change tls warning instructions to apply directly to Gentoo. diff --git a/trunk/2.6.18/10001_xen-3.2.0.patch b/trunk/2.6.18/10001_xen-3.2.0.patch deleted file mode 100644 index d3d6c2b..0000000 --- a/trunk/2.6.18/10001_xen-3.2.0.patch +++ /dev/null @@ -1,139942 +0,0 @@ -diff -rpuN linux-2.6.18.8/arch/i386/Kconfig linux-2.6.18-xen-3.2.0/arch/i386/Kconfig ---- linux-2.6.18.8/arch/i386/Kconfig 2007-02-23 15:52:30.000000000 -0800 -+++ linux-2.6.18-xen-3.2.0/arch/i386/Kconfig 2008-02-15 16:21:49.000000000 -0800 -@@ -16,6 +16,7 @@ config X86_32 - - config GENERIC_TIME - bool -+ depends on !X86_XEN - default y - - config LOCKDEP_SUPPORT -@@ -103,6 +104,16 @@ config X86_PC - help - Choose this option if your computer is a standard PC or compatible. - -+config X86_XEN -+ bool "Xen-compatible" -+ select XEN -+ select X86_UP_APIC if !SMP && XEN_PRIVILEGED_GUEST -+ select X86_UP_IOAPIC if !SMP && XEN_PRIVILEGED_GUEST -+ select SWIOTLB -+ help -+ Choose this option if you plan to run this kernel on top of the -+ Xen Hypervisor. -+ - config X86_ELAN - bool "AMD Elan" - help -@@ -213,6 +224,7 @@ source "arch/i386/Kconfig.cpu" - - config HPET_TIMER - bool "HPET Timer Support" -+ depends on !X86_XEN - help - This enables the use of the HPET for the kernel's internal timer. - HPET is the next generation timer replacing legacy 8254s. -@@ -263,7 +275,7 @@ source "kernel/Kconfig.preempt" - - config X86_UP_APIC - bool "Local APIC support on uniprocessors" -- depends on !SMP && !(X86_VISWS || X86_VOYAGER) -+ depends on !SMP && !(X86_VISWS || X86_VOYAGER || XEN_UNPRIVILEGED_GUEST) - help - A local APIC (Advanced Programmable Interrupt Controller) is an - integrated interrupt controller in the CPU. If you have a single-CPU -@@ -288,12 +300,12 @@ config X86_UP_IOAPIC - - config X86_LOCAL_APIC - bool -- depends on X86_UP_APIC || ((X86_VISWS || SMP) && !X86_VOYAGER) -+ depends on X86_UP_APIC || ((X86_VISWS || SMP) && !(X86_VOYAGER || XEN_UNPRIVILEGED_GUEST)) - default y - - config X86_IO_APIC - bool -- depends on X86_UP_IOAPIC || (SMP && !(X86_VISWS || X86_VOYAGER)) -+ depends on X86_UP_IOAPIC || (SMP && !(X86_VISWS || X86_VOYAGER || XEN_UNPRIVILEGED_GUEST)) - default y - - config X86_VISWS_APIC -@@ -303,7 +315,7 @@ config X86_VISWS_APIC - - config X86_MCE - bool "Machine Check Exception" -- depends on !X86_VOYAGER -+ depends on !(X86_VOYAGER || X86_XEN) - ---help--- - Machine Check Exception support allows the processor to notify the - kernel if it detects a problem (e.g. overheating, component failure). -@@ -402,6 +414,7 @@ config X86_REBOOTFIXUPS - - config MICROCODE - tristate "/dev/cpu/microcode - Intel IA32 CPU microcode support" -+ depends on !XEN_UNPRIVILEGED_GUEST - ---help--- - If you say Y here and also to "/dev file system support" in the - 'File systems' section, you will be able to update the microcode on -@@ -434,6 +447,10 @@ config X86_CPUID - with major 203 and minors 0 to 31 for /dev/cpu/0/cpuid to - /dev/cpu/31/cpuid. - -+config SWIOTLB -+ bool -+ default n -+ - source "drivers/firmware/Kconfig" - - choice -@@ -616,6 +633,7 @@ config HIGHPTE - - config MATH_EMULATION - bool "Math emulation" -+ depends on !X86_XEN - ---help--- - Linux can emulate a math coprocessor (used for floating point - operations) if you don't have one. 486DX and Pentium processors have -@@ -641,6 +659,8 @@ config MATH_EMULATION - - config MTRR - bool "MTRR (Memory Type Range Register) support" -+ depends on !XEN_UNPRIVILEGED_GUEST -+ default y if X86_XEN - ---help--- - On Intel P6 family processors (Pentium Pro, Pentium II and later) - the Memory Type Range Registers (MTRRs) may be used to control -@@ -675,7 +695,7 @@ config MTRR - - config EFI - bool "Boot from EFI support" -- depends on ACPI -+ depends on ACPI && !X86_XEN - default n - ---help--- - This enables the the kernel to boot on EFI platforms using -@@ -693,7 +713,7 @@ config EFI - - config IRQBALANCE - bool "Enable kernel irq balancing" -- depends on SMP && X86_IO_APIC -+ depends on SMP && X86_IO_APIC && !X86_XEN - default y - help - The default yes will allow the kernel to do irq load balancing. -@@ -741,7 +761,7 @@ source kernel/Kconfig.hz - - config KEXEC - bool "kexec system call (EXPERIMENTAL)" -- depends on EXPERIMENTAL -+ depends on EXPERIMENTAL && !XEN_UNPRIVILEGED_GUEST - help - kexec is a system call that implements the ability to shutdown your - current kernel, and to start another kernel. It is like a reboot -@@ -793,6 +813,7 @@ config HOTPLUG_CPU - - config COMPAT_VDSO - bool "Compat VDSO support" -+ depends on !X86_XEN - default y - help - Map the VDSO to the predictable old-style address too. -@@ -810,18 +831,18 @@ config ARCH_ENABLE_MEMORY_HOTPLUG - depends on HIGHMEM - - menu "Power management options (ACPI, APM)" -- depends on !X86_VOYAGER -+ depends on !(X86_VOYAGER || XEN_UNPRIVILEGED_GUEST) - --source kernel/power/Kconfig -+source "kernel/power/Kconfig" - - source "drivers/acpi/Kconfig" - - menu "APM (Advanced Power Management) BIOS Support" --depends on PM && !X86_VISWS -+depends on PM && !(X86_VISWS || X86_XEN) - - config APM - tristate "APM (Advanced Power Management) BIOS support" -- depends on PM -+ depends on PM && PM_LEGACY - ---help--- - APM is a BIOS specification for saving power using several different - techniques. This is mostly useful for battery powered laptops with -@@ -1006,6 +1027,7 @@ choice - - config PCI_GOBIOS - bool "BIOS" -+ depends on !X86_XEN - - config PCI_GOMMCONFIG - bool "MMConfig" -@@ -1013,6 +1035,13 @@ config PCI_GOMMCONFIG - config PCI_GODIRECT - bool "Direct" - -+config PCI_GOXEN_FE -+ bool "Xen PCI Frontend" -+ depends on X86_XEN -+ help -+ The PCI device frontend driver allows the kernel to import arbitrary -+ PCI devices from a PCI backend to support PCI driver domains. -+ - config PCI_GOANY - bool "Any" - -@@ -1020,7 +1049,7 @@ endchoice - - config PCI_BIOS - bool -- depends on !X86_VISWS && PCI && (PCI_GOBIOS || PCI_GOANY) -+ depends on !(X86_VISWS || X86_XEN) && PCI && (PCI_GOBIOS || PCI_GOANY) - default y - - config PCI_DIRECT -@@ -1033,6 +1062,18 @@ config PCI_MMCONFIG - depends on PCI && ACPI && (PCI_GOMMCONFIG || PCI_GOANY) - default y - -+config XEN_PCIDEV_FRONTEND -+ bool -+ depends on PCI && X86_XEN && (PCI_GOXEN_FE || PCI_GOANY) -+ default y -+ -+config XEN_PCIDEV_FE_DEBUG -+ bool "Xen PCI Frontend Debugging" -+ depends on XEN_PCIDEV_FRONTEND -+ default n -+ help -+ Enables some debug statements within the PCI Frontend. -+ - source "drivers/pci/pcie/Kconfig" - - source "drivers/pci/Kconfig" -@@ -1043,7 +1084,7 @@ config ISA_DMA_API - - config ISA - bool "ISA support" -- depends on !(X86_VOYAGER || X86_VISWS) -+ depends on !(X86_VOYAGER || X86_VISWS || X86_XEN) - help - Find out whether you have ISA slots on your motherboard. ISA is the - name of a bus system, i.e. the way the CPU talks to the other stuff -@@ -1070,7 +1111,7 @@ config EISA - source "drivers/eisa/Kconfig" - - config MCA -- bool "MCA support" if !(X86_VISWS || X86_VOYAGER) -+ bool "MCA support" if !(X86_VISWS || X86_VOYAGER || X86_XEN) - default y if X86_VOYAGER - help - MicroChannel Architecture is found in some IBM PS/2 machines and -@@ -1146,6 +1187,8 @@ source "security/Kconfig" - - source "crypto/Kconfig" - -+source "drivers/xen/Kconfig" -+ - source "lib/Kconfig" - - # -@@ -1171,7 +1214,7 @@ config X86_SMP - - config X86_HT - bool -- depends on SMP && !(X86_VISWS || X86_VOYAGER) -+ depends on SMP && !(X86_VISWS || X86_VOYAGER || X86_XEN) - default y - - config X86_BIOS_REBOOT -@@ -1184,6 +1227,16 @@ config X86_TRAMPOLINE - depends on X86_SMP || (X86_VOYAGER && SMP) - default y - -+config X86_NO_TSS -+ bool -+ depends on X86_XEN -+ default y -+ -+config X86_NO_IDT -+ bool -+ depends on X86_XEN -+ default y -+ - config KTIME_SCALAR - bool - default y -diff -rpuN linux-2.6.18.8/arch/i386/Kconfig.cpu linux-2.6.18-xen-3.2.0/arch/i386/Kconfig.cpu ---- linux-2.6.18.8/arch/i386/Kconfig.cpu 2007-02-23 15:52:30.000000000 -0800 -+++ linux-2.6.18-xen-3.2.0/arch/i386/Kconfig.cpu 2008-02-15 16:21:49.000000000 -0800 -@@ -252,7 +252,7 @@ config X86_PPRO_FENCE - - config X86_F00F_BUG - bool -- depends on M586MMX || M586TSC || M586 || M486 || M386 -+ depends on (M586MMX || M586TSC || M586 || M486 || M386) && !X86_NO_IDT - default y - - config X86_WP_WORKS_OK -@@ -312,5 +312,5 @@ config X86_OOSTORE - - config X86_TSC - bool -- depends on (MWINCHIP3D || MWINCHIP2 || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2 || MGEODEGX1 || MGEODE_LX) && !X86_NUMAQ -+ depends on (MWINCHIP3D || MWINCHIP2 || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2 || MGEODEGX1 || MGEODE_LX) && !X86_NUMAQ && !X86_XEN - default y -diff -rpuN linux-2.6.18.8/arch/i386/Kconfig.debug linux-2.6.18-xen-3.2.0/arch/i386/Kconfig.debug ---- linux-2.6.18.8/arch/i386/Kconfig.debug 2007-02-23 15:52:30.000000000 -0800 -+++ linux-2.6.18-xen-3.2.0/arch/i386/Kconfig.debug 2008-02-15 16:21:49.000000000 -0800 -@@ -79,6 +79,7 @@ config X86_MPPARSE - config DOUBLEFAULT - default y - bool "Enable doublefault exception handler" if EMBEDDED -+ depends on !X86_NO_TSS - help - This option allows trapping of rare doublefault exceptions that - would otherwise cause a system to silently reboot. Disabling this -diff -rpuN linux-2.6.18.8/arch/i386/Makefile linux-2.6.18-xen-3.2.0/arch/i386/Makefile ---- linux-2.6.18.8/arch/i386/Makefile 2007-02-23 15:52:30.000000000 -0800 -+++ linux-2.6.18-xen-3.2.0/arch/i386/Makefile 2008-02-15 16:21:49.000000000 -0800 -@@ -71,6 +71,10 @@ mcore-$(CONFIG_X86_BIGSMP) := mach-defau - mflags-$(CONFIG_X86_SUMMIT) := -Iinclude/asm-i386/mach-summit - mcore-$(CONFIG_X86_SUMMIT) := mach-default - -+# Xen subarch support -+mflags-$(CONFIG_X86_XEN) := -Iinclude/asm-i386/mach-xen -+mcore-$(CONFIG_X86_XEN) := mach-xen -+ - # generic subarchitecture - mflags-$(CONFIG_X86_GENERICARCH) := -Iinclude/asm-i386/mach-generic - mcore-$(CONFIG_X86_GENERICARCH) := mach-default -@@ -102,9 +106,20 @@ AFLAGS += $(mflags-y) - - boot := arch/i386/boot - --PHONY += zImage bzImage compressed zlilo bzlilo \ -+PHONY += zImage bzImage vmlinuz compressed zlilo bzlilo \ - zdisk bzdisk fdimage fdimage144 fdimage288 isoimage install - -+ifdef CONFIG_XEN -+CPPFLAGS := -D__XEN_INTERFACE_VERSION__=$(CONFIG_XEN_INTERFACE_VERSION) \ -+ -Iinclude$(if $(KBUILD_SRC),2)/asm/mach-xen $(CPPFLAGS) -+all: vmlinuz -+ -+# KBUILD_IMAGE specifies the target image being built -+KBUILD_IMAGE := $(boot)/vmlinuz -+ -+vmlinuz: vmlinux -+ $(Q)$(MAKE) $(build)=$(boot) $(KBUILD_IMAGE) -+else - all: bzImage - - # KBUILD_IMAGE specify target image being built -@@ -124,6 +139,7 @@ zdisk bzdisk: vmlinux - - fdimage fdimage144 fdimage288 isoimage: vmlinux - $(Q)$(MAKE) $(build)=$(boot) BOOTIMAGE=$(KBUILD_IMAGE) $@ -+endif - - install: - $(Q)$(MAKE) $(build)=$(boot) BOOTIMAGE=$(KBUILD_IMAGE) install -diff -rpuN linux-2.6.18.8/arch/i386/boot/Makefile linux-2.6.18-xen-3.2.0/arch/i386/boot/Makefile ---- linux-2.6.18.8/arch/i386/boot/Makefile 2007-02-23 15:52:30.000000000 -0800 -+++ linux-2.6.18-xen-3.2.0/arch/i386/boot/Makefile 2008-02-15 16:21:49.000000000 -0800 -@@ -26,7 +26,7 @@ SVGA_MODE := -DSVGA_MODE=NORMAL_VGA - #RAMDISK := -DRAMDISK=512 - - targets := vmlinux.bin bootsect bootsect.o \ -- setup setup.o zImage bzImage -+ setup setup.o zImage bzImage vmlinuz vmlinux-stripped - subdir- := compressed - - hostprogs-y := tools/build -@@ -133,5 +133,13 @@ zlilo: $(BOOTIMAGE) - cp System.map $(INSTALL_PATH)/ - if [ -x /sbin/lilo ]; then /sbin/lilo; else /etc/lilo/install; fi - -+$(obj)/vmlinuz: $(obj)/vmlinux-stripped FORCE -+ $(call if_changed,gzip) -+ @echo 'Kernel: $@ is ready' ' (#'`cat .version`')' -+ -+$(obj)/vmlinux-stripped: OBJCOPYFLAGS := -g --strip-unneeded -+$(obj)/vmlinux-stripped: vmlinux FORCE -+ $(call if_changed,objcopy) -+ - install: - sh $(srctree)/$(src)/install.sh $(KERNELRELEASE) $(BOOTIMAGE) System.map "$(INSTALL_PATH)" -diff -rpuN linux-2.6.18.8/arch/i386/kernel/Makefile linux-2.6.18-xen-3.2.0/arch/i386/kernel/Makefile ---- linux-2.6.18.8/arch/i386/kernel/Makefile 2007-02-23 15:52:30.000000000 -0800 -+++ linux-2.6.18-xen-3.2.0/arch/i386/kernel/Makefile 2008-02-15 16:21:49.000000000 -0800 -@@ -43,6 +43,7 @@ obj-$(CONFIG_K8_NB) += k8.o - EXTRA_AFLAGS := -traditional - - obj-$(CONFIG_SCx200) += scx200.o -+obj-$(CONFIG_XEN) += fixup.o - - # vsyscall.o contains the vsyscall DSO images as __initdata. - # We must build both images before we can assemble it. -@@ -80,5 +81,8 @@ $(obj)/vsyscall-syms.o: $(src)/vsyscall. - $(obj)/vsyscall-sysenter.o $(obj)/vsyscall-note.o FORCE - $(call if_changed,syscall) - -+early_printk-y += ../../x86_64/kernel/early_printk.o - k8-y += ../../x86_64/kernel/k8.o - -+disabled-obj-$(CONFIG_XEN) := i8259.o reboot.o smpboot.o trampoline.o -+%/head.o %/head.s: $(if $(CONFIG_XEN),EXTRA_AFLAGS,dummy) := -diff -rpuN linux-2.6.18.8/arch/i386/kernel/acpi/Makefile linux-2.6.18-xen-3.2.0/arch/i386/kernel/acpi/Makefile ---- linux-2.6.18.8/arch/i386/kernel/acpi/Makefile 2007-02-23 15:52:30.000000000 -0800 -+++ linux-2.6.18-xen-3.2.0/arch/i386/kernel/acpi/Makefile 2008-02-15 16:21:49.000000000 -0800 -@@ -6,3 +6,4 @@ ifneq ($(CONFIG_ACPI_PROCESSOR),) - obj-y += cstate.o processor.o - endif - -+disabled-obj-$(CONFIG_XEN) := cstate.o wakeup.o -diff -rpuN linux-2.6.18.8/arch/i386/kernel/acpi/boot-xen.c linux-2.6.18-xen-3.2.0/arch/i386/kernel/acpi/boot-xen.c ---- linux-2.6.18.8/arch/i386/kernel/acpi/boot-xen.c 1969-12-31 16:00:00.000000000 -0800 -+++ linux-2.6.18-xen-3.2.0/arch/i386/kernel/acpi/boot-xen.c 2008-02-15 16:21:49.000000000 -0800 -@@ -0,0 +1,1168 @@ -+/* -+ * boot.c - Architecture-Specific Low-Level ACPI Boot Support -+ * -+ * Copyright (C) 2001, 2002 Paul Diefenbaugh -+ * Copyright (C) 2001 Jun Nakajima -+ * -+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -+ * -+ * This program is free software; you can redistribute it and/or modify -+ * it under the terms of the GNU General Public License as published by -+ * the Free Software Foundation; either version 2 of the License, or -+ * (at your option) any later version. -+ * -+ * This program is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -+ * GNU General Public License for more details. -+ * -+ * You should have received a copy of the GNU General Public License -+ * along with this program; if not, write to the Free Software -+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -+ * -+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -+ */ -+ -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#include -+#include -+#include -+#include -+#include -+ -+#ifdef CONFIG_X86_64 -+ -+extern void __init clustered_apic_check(void); -+ -+extern int gsi_irq_sharing(int gsi); -+#include -+ -+static inline int acpi_madt_oem_check(char *oem_id, char *oem_table_id) { return 0; } -+ -+ -+#else /* X86 */ -+ -+#ifdef CONFIG_X86_LOCAL_APIC -+#include -+#include -+#endif /* CONFIG_X86_LOCAL_APIC */ -+ -+static inline int gsi_irq_sharing(int gsi) { return gsi; } -+ -+#endif /* X86 */ -+ -+#define BAD_MADT_ENTRY(entry, end) ( \ -+ (!entry) || (unsigned long)entry + sizeof(*entry) > end || \ -+ ((acpi_table_entry_header *)entry)->length < sizeof(*entry)) -+ -+#define PREFIX "ACPI: " -+ -+int acpi_noirq __initdata; /* skip ACPI IRQ initialization */ -+int acpi_pci_disabled __initdata; /* skip ACPI PCI scan and IRQ initialization */ -+int acpi_ht __initdata = 1; /* enable HT */ -+ -+int acpi_lapic; -+int acpi_ioapic; -+int acpi_strict; -+EXPORT_SYMBOL(acpi_strict); -+ -+acpi_interrupt_flags acpi_sci_flags __initdata; -+int acpi_sci_override_gsi __initdata; -+int acpi_skip_timer_override __initdata; -+ -+#ifdef CONFIG_X86_LOCAL_APIC -+static u64 acpi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE; -+#endif -+ -+#ifndef __HAVE_ARCH_CMPXCHG -+#warning ACPI uses CMPXCHG, i486 and later hardware -+#endif -+ -+#define MAX_MADT_ENTRIES 256 -+u8 x86_acpiid_to_apicid[MAX_MADT_ENTRIES] = -+ {[0 ... MAX_MADT_ENTRIES - 1] = 0xff }; -+EXPORT_SYMBOL(x86_acpiid_to_apicid); -+ -+/* -------------------------------------------------------------------------- -+ Boot-time Configuration -+ -------------------------------------------------------------------------- */ -+ -+/* -+ * The default interrupt routing model is PIC (8259). This gets -+ * overriden if IOAPICs are enumerated (below). -+ */ -+enum acpi_irq_model_id acpi_irq_model = ACPI_IRQ_MODEL_PIC; -+ -+#if defined(CONFIG_X86_64) && !defined(CONFIG_XEN) -+ -+/* rely on all ACPI tables being in the direct mapping */ -+char *__acpi_map_table(unsigned long phys_addr, unsigned long size) -+{ -+ if (!phys_addr || !size) -+ return NULL; -+ -+ if (phys_addr+size <= (end_pfn_map << PAGE_SHIFT) + PAGE_SIZE) -+ return __va(phys_addr); -+ -+ return NULL; -+} -+ -+#else -+ -+/* -+ * Temporarily use the virtual area starting from FIX_IO_APIC_BASE_END, -+ * to map the target physical address. The problem is that set_fixmap() -+ * provides a single page, and it is possible that the page is not -+ * sufficient. -+ * By using this area, we can map up to MAX_IO_APICS pages temporarily, -+ * i.e. until the next __va_range() call. -+ * -+ * Important Safety Note: The fixed I/O APIC page numbers are *subtracted* -+ * from the fixed base. That's why we start at FIX_IO_APIC_BASE_END and -+ * count idx down while incrementing the phys address. -+ */ -+char *__acpi_map_table(unsigned long phys, unsigned long size) -+{ -+ unsigned long base, offset, mapped_size; -+ int idx; -+ -+#ifndef CONFIG_XEN -+ if (phys + size < 8 * 1024 * 1024) -+ return __va(phys); -+#endif -+ -+ offset = phys & (PAGE_SIZE - 1); -+ mapped_size = PAGE_SIZE - offset; -+ set_fixmap(FIX_ACPI_END, phys); -+ base = fix_to_virt(FIX_ACPI_END); -+ -+ /* -+ * Most cases can be covered by the below. -+ */ -+ idx = FIX_ACPI_END; -+ while (mapped_size < size) { -+ if (--idx < FIX_ACPI_BEGIN) -+ return NULL; /* cannot handle this */ -+ phys += PAGE_SIZE; -+ set_fixmap(idx, phys); -+ mapped_size += PAGE_SIZE; -+ } -+ -+ return ((unsigned char *)base + offset); -+} -+#endif -+ -+#ifdef CONFIG_PCI_MMCONFIG -+/* The physical address of the MMCONFIG aperture. Set from ACPI tables. */ -+struct acpi_table_mcfg_config *pci_mmcfg_config; -+int pci_mmcfg_config_num; -+ -+int __init acpi_parse_mcfg(unsigned long phys_addr, unsigned long size) -+{ -+ struct acpi_table_mcfg *mcfg; -+ unsigned long i; -+ int config_size; -+ -+ if (!phys_addr || !size) -+ return -EINVAL; -+ -+ mcfg = (struct acpi_table_mcfg *)__acpi_map_table(phys_addr, size); -+ if (!mcfg) { -+ printk(KERN_WARNING PREFIX "Unable to map MCFG\n"); -+ return -ENODEV; -+ } -+ -+ /* how many config structures do we have */ -+ pci_mmcfg_config_num = 0; -+ i = size - sizeof(struct acpi_table_mcfg); -+ while (i >= sizeof(struct acpi_table_mcfg_config)) { -+ ++pci_mmcfg_config_num; -+ i -= sizeof(struct acpi_table_mcfg_config); -+ }; -+ if (pci_mmcfg_config_num == 0) { -+ printk(KERN_ERR PREFIX "MMCONFIG has no entries\n"); -+ return -ENODEV; -+ } -+ -+ config_size = pci_mmcfg_config_num * sizeof(*pci_mmcfg_config); -+ pci_mmcfg_config = kmalloc(config_size, GFP_KERNEL); -+ if (!pci_mmcfg_config) { -+ printk(KERN_WARNING PREFIX -+ "No memory for MCFG config tables\n"); -+ return -ENOMEM; -+ } -+ -+ memcpy(pci_mmcfg_config, &mcfg->config, config_size); -+ for (i = 0; i < pci_mmcfg_config_num; ++i) { -+ if (mcfg->config[i].base_reserved) { -+ printk(KERN_ERR PREFIX -+ "MMCONFIG not in low 4GB of memory\n"); -+ kfree(pci_mmcfg_config); -+ pci_mmcfg_config_num = 0; -+ return -ENODEV; -+ } -+ } -+ -+ return 0; -+} -+#endif /* CONFIG_PCI_MMCONFIG */ -+ -+#ifdef CONFIG_X86_LOCAL_APIC -+static int __init acpi_parse_madt(unsigned long phys_addr, unsigned long size) -+{ -+ struct acpi_table_madt *madt = NULL; -+ -+ if (!phys_addr || !size || !cpu_has_apic) -+ return -EINVAL; -+ -+ madt = (struct acpi_table_madt *)__acpi_map_table(phys_addr, size); -+ if (!madt) { -+ printk(KERN_WARNING PREFIX "Unable to map MADT\n"); -+ return -ENODEV; -+ } -+ -+ if (madt->lapic_address) { -+ acpi_lapic_addr = (u64) madt->lapic_address; -+ -+ printk(KERN_DEBUG PREFIX "Local APIC address 0x%08x\n", -+ madt->lapic_address); -+ } -+ -+ acpi_madt_oem_check(madt->header.oem_id, madt->header.oem_table_id); -+ -+ return 0; -+} -+ -+static int __init -+acpi_parse_lapic(acpi_table_entry_header * header, const unsigned long end) -+{ -+ struct acpi_table_lapic *processor = NULL; -+ -+ processor = (struct acpi_table_lapic *)header; -+ -+ if (BAD_MADT_ENTRY(processor, end)) -+ return -EINVAL; -+ -+ acpi_table_print_madt_entry(header); -+ -+ /* Record local apic id only when enabled */ -+ if (processor->flags.enabled) -+ x86_acpiid_to_apicid[processor->acpi_id] = processor->id; -+ -+ /* -+ * We need to register disabled CPU as well to permit -+ * counting disabled CPUs. This allows us to size -+ * cpus_possible_map more accurately, to permit -+ * to not preallocating memory for all NR_CPUS -+ * when we use CPU hotplug. -+ */ -+ mp_register_lapic(processor->id, /* APIC ID */ -+ processor->flags.enabled); /* Enabled? */ -+ -+ return 0; -+} -+ -+static int __init -+acpi_parse_lapic_addr_ovr(acpi_table_entry_header * header, -+ const unsigned long end) -+{ -+ struct acpi_table_lapic_addr_ovr *lapic_addr_ovr = NULL; -+ -+ lapic_addr_ovr = (struct acpi_table_lapic_addr_ovr *)header; -+ -+ if (BAD_MADT_ENTRY(lapic_addr_ovr, end)) -+ return -EINVAL; -+ -+ acpi_lapic_addr = lapic_addr_ovr->address; -+ -+ return 0; -+} -+ -+static int __init -+acpi_parse_lapic_nmi(acpi_table_entry_header * header, const unsigned long end) -+{ -+ struct acpi_table_lapic_nmi *lapic_nmi = NULL; -+ -+ lapic_nmi = (struct acpi_table_lapic_nmi *)header; -+ -+ if (BAD_MADT_ENTRY(lapic_nmi, end)) -+ return -EINVAL; -+ -+ acpi_table_print_madt_entry(header); -+ -+ if (lapic_nmi->lint != 1) -+ printk(KERN_WARNING PREFIX "NMI not connected to LINT 1!\n"); -+ -+ return 0; -+} -+ -+#endif /*CONFIG_X86_LOCAL_APIC */ -+ -+#ifdef CONFIG_X86_IO_APIC -+ -+static int __init -+acpi_parse_ioapic(acpi_table_entry_header * header, const unsigned long end) -+{ -+ struct acpi_table_ioapic *ioapic = NULL; -+ -+ ioapic = (struct acpi_table_ioapic *)header; -+ -+ if (BAD_MADT_ENTRY(ioapic, end)) -+ return -EINVAL; -+ -+ acpi_table_print_madt_entry(header); -+ -+ mp_register_ioapic(ioapic->id, -+ ioapic->address, ioapic->global_irq_base); -+ -+ return 0; -+} -+ -+/* -+ * Parse Interrupt Source Override for the ACPI SCI -+ */ -+static void acpi_sci_ioapic_setup(u32 gsi, u16 polarity, u16 trigger) -+{ -+ if (trigger == 0) /* compatible SCI trigger is level */ -+ trigger = 3; -+ -+ if (polarity == 0) /* compatible SCI polarity is low */ -+ polarity = 3; -+ -+ /* Command-line over-ride via acpi_sci= */ -+ if (acpi_sci_flags.trigger) -+ trigger = acpi_sci_flags.trigger; -+ -+ if (acpi_sci_flags.polarity) -+ polarity = acpi_sci_flags.polarity; -+ -+ /* -+ * mp_config_acpi_legacy_irqs() already setup IRQs < 16 -+ * If GSI is < 16, this will update its flags, -+ * else it will create a new mp_irqs[] entry. -+ */ -+ mp_override_legacy_irq(gsi, polarity, trigger, gsi); -+ -+ /* -+ * stash over-ride to indicate we've been here -+ * and for later update of acpi_fadt -+ */ -+ acpi_sci_override_gsi = gsi; -+ return; -+} -+ -+static int __init -+acpi_parse_int_src_ovr(acpi_table_entry_header * header, -+ const unsigned long end) -+{ -+ struct acpi_table_int_src_ovr *intsrc = NULL; -+ -+ intsrc = (struct acpi_table_int_src_ovr *)header; -+ -+ if (BAD_MADT_ENTRY(intsrc, end)) -+ return -EINVAL; -+ -+ acpi_table_print_madt_entry(header); -+ -+ if (intsrc->bus_irq == acpi_fadt.sci_int) { -+ acpi_sci_ioapic_setup(intsrc->global_irq, -+ intsrc->flags.polarity, -+ intsrc->flags.trigger); -+ return 0; -+ } -+ -+ if (acpi_skip_timer_override && -+ intsrc->bus_irq == 0 && intsrc->global_irq == 2) { -+ printk(PREFIX "BIOS IRQ0 pin2 override ignored.\n"); -+ return 0; -+ } -+ -+ mp_override_legacy_irq(intsrc->bus_irq, -+ intsrc->flags.polarity, -+ intsrc->flags.trigger, intsrc->global_irq); -+ -+ return 0; -+} -+ -+static int __init -+acpi_parse_nmi_src(acpi_table_entry_header * header, const unsigned long end) -+{ -+ struct acpi_table_nmi_src *nmi_src = NULL; -+ -+ nmi_src = (struct acpi_table_nmi_src *)header; -+ -+ if (BAD_MADT_ENTRY(nmi_src, end)) -+ return -EINVAL; -+ -+ acpi_table_print_madt_entry(header); -+ -+ /* TBD: Support nimsrc entries? */ -+ -+ return 0; -+} -+ -+#endif /* CONFIG_X86_IO_APIC */ -+ -+/* -+ * acpi_pic_sci_set_trigger() -+ * -+ * use ELCR to set PIC-mode trigger type for SCI -+ * -+ * If a PIC-mode SCI is not recognized or gives spurious IRQ7's -+ * it may require Edge Trigger -- use "acpi_sci=edge" -+ * -+ * Port 0x4d0-4d1 are ECLR1 and ECLR2, the Edge/Level Control Registers -+ * for the 8259 PIC. bit[n] = 1 means irq[n] is Level, otherwise Edge. -+ * ECLR1 is IRQ's 0-7 (IRQ 0, 1, 2 must be 0) -+ * ECLR2 is IRQ's 8-15 (IRQ 8, 13 must be 0) -+ */ -+ -+void __init acpi_pic_sci_set_trigger(unsigned int irq, u16 trigger) -+{ -+ unsigned int mask = 1 << irq; -+ unsigned int old, new; -+ -+ /* Real old ELCR mask */ -+ old = inb(0x4d0) | (inb(0x4d1) << 8); -+ -+ /* -+ * If we use ACPI to set PCI irq's, then we should clear ELCR -+ * since we will set it correctly as we enable the PCI irq -+ * routing. -+ */ -+ new = acpi_noirq ? old : 0; -+ -+ /* -+ * Update SCI information in the ELCR, it isn't in the PCI -+ * routing tables.. -+ */ -+ switch (trigger) { -+ case 1: /* Edge - clear */ -+ new &= ~mask; -+ break; -+ case 3: /* Level - set */ -+ new |= mask; -+ break; -+ } -+ -+ if (old == new) -+ return; -+ -+ printk(PREFIX "setting ELCR to %04x (from %04x)\n", new, old); -+ outb(new, 0x4d0); -+ outb(new >> 8, 0x4d1); -+} -+ -+int acpi_gsi_to_irq(u32 gsi, unsigned int *irq) -+{ -+#ifdef CONFIG_X86_IO_APIC -+ if (use_pci_vector() && !platform_legacy_irq(gsi)) -+ *irq = IO_APIC_VECTOR(gsi); -+ else -+#endif -+ *irq = gsi_irq_sharing(gsi); -+ return 0; -+} -+ -+/* -+ * success: return IRQ number (>=0) -+ * failure: return < 0 -+ */ -+int acpi_register_gsi(u32 gsi, int triggering, int polarity) -+{ -+ unsigned int irq; -+ unsigned int plat_gsi = gsi; -+ -+#ifdef CONFIG_PCI -+ /* -+ * Make sure all (legacy) PCI IRQs are set as level-triggered. -+ */ -+ if (acpi_irq_model == ACPI_IRQ_MODEL_PIC) { -+ extern void eisa_set_level_irq(unsigned int irq); -+ -+ if (triggering == ACPI_LEVEL_SENSITIVE) -+ eisa_set_level_irq(gsi); -+ } -+#endif -+ -+#ifdef CONFIG_X86_IO_APIC -+ if (acpi_irq_model == ACPI_IRQ_MODEL_IOAPIC) { -+ plat_gsi = mp_register_gsi(gsi, triggering, polarity); -+ } -+#endif -+ acpi_gsi_to_irq(plat_gsi, &irq); -+ return irq; -+} -+ -+EXPORT_SYMBOL(acpi_register_gsi); -+ -+/* -+ * ACPI based hotplug support for CPU -+ */ -+#ifdef CONFIG_ACPI_HOTPLUG_CPU -+int acpi_map_lsapic(acpi_handle handle, int *pcpu) -+{ -+ /* TBD */ -+ return -EINVAL; -+} -+ -+EXPORT_SYMBOL(acpi_map_lsapic); -+ -+int acpi_unmap_lsapic(int cpu) -+{ -+ /* TBD */ -+ return -EINVAL; -+} -+ -+EXPORT_SYMBOL(acpi_unmap_lsapic); -+#endif /* CONFIG_ACPI_HOTPLUG_CPU */ -+ -+int acpi_register_ioapic(acpi_handle handle, u64 phys_addr, u32 gsi_base) -+{ -+ /* TBD */ -+ return -EINVAL; -+} -+ -+EXPORT_SYMBOL(acpi_register_ioapic); -+ -+int acpi_unregister_ioapic(acpi_handle handle, u32 gsi_base) -+{ -+ /* TBD */ -+ return -EINVAL; -+} -+ -+EXPORT_SYMBOL(acpi_unregister_ioapic); -+ -+static unsigned long __init -+acpi_scan_rsdp(unsigned long start, unsigned long length) -+{ -+ unsigned long offset = 0; -+ unsigned long sig_len = sizeof("RSD PTR ") - 1; -+ unsigned long vstart = (unsigned long)isa_bus_to_virt(start); -+ -+ /* -+ * Scan all 16-byte boundaries of the physical memory region for the -+ * RSDP signature. -+ */ -+ for (offset = 0; offset < length; offset += 16) { -+ if (strncmp((char *)(vstart + offset), "RSD PTR ", sig_len)) -+ continue; -+ return (start + offset); -+ } -+ -+ return 0; -+} -+ -+static int __init acpi_parse_sbf(unsigned long phys_addr, unsigned long size) -+{ -+ struct acpi_table_sbf *sb; -+ -+ if (!phys_addr || !size) -+ return -EINVAL; -+ -+ sb = (struct acpi_table_sbf *)__acpi_map_table(phys_addr, size); -+ if (!sb) { -+ printk(KERN_WARNING PREFIX "Unable to map SBF\n"); -+ return -ENODEV; -+ } -+ -+ sbf_port = sb->sbf_cmos; /* Save CMOS port */ -+ -+ return 0; -+} -+ -+#ifdef CONFIG_HPET_TIMER -+ -+static int __init acpi_parse_hpet(unsigned long phys, unsigned long size) -+{ -+ struct acpi_table_hpet *hpet_tbl; -+ -+ if (!phys || !size) -+ return -EINVAL; -+ -+ hpet_tbl = (struct acpi_table_hpet *)__acpi_map_table(phys, size); -+ if (!hpet_tbl) { -+ printk(KERN_WARNING PREFIX "Unable to map HPET\n"); -+ return -ENODEV; -+ } -+ -+ if (hpet_tbl->addr.space_id != ACPI_SPACE_MEM) { -+ printk(KERN_WARNING PREFIX "HPET timers must be located in " -+ "memory.\n"); -+ return -1; -+ } -+#ifdef CONFIG_X86_64 -+ vxtime.hpet_address = hpet_tbl->addr.addrl | -+ ((long)hpet_tbl->addr.addrh << 32); -+ -+ printk(KERN_INFO PREFIX "HPET id: %#x base: %#lx\n", -+ hpet_tbl->id, vxtime.hpet_address); -+#else /* X86 */ -+ { -+ extern unsigned long hpet_address; -+ -+ hpet_address = hpet_tbl->addr.addrl; -+ printk(KERN_INFO PREFIX "HPET id: %#x base: %#lx\n", -+ hpet_tbl->id, hpet_address); -+ } -+#endif /* X86 */ -+ -+ return 0; -+} -+#else -+#define acpi_parse_hpet NULL -+#endif -+ -+#ifdef CONFIG_X86_PM_TIMER -+extern u32 pmtmr_ioport; -+#endif -+ -+static int __init acpi_parse_fadt(unsigned long phys, unsigned long size) -+{ -+ struct fadt_descriptor *fadt = NULL; -+ -+ fadt = (struct fadt_descriptor *)__acpi_map_table(phys, size); -+ if (!fadt) { -+ printk(KERN_WARNING PREFIX "Unable to map FADT\n"); -+ return 0; -+ } -+ /* initialize sci_int early for INT_SRC_OVR MADT parsing */ -+ acpi_fadt.sci_int = fadt->sci_int; -+ -+ /* initialize rev and apic_phys_dest_mode for x86_64 genapic */ -+ acpi_fadt.revision = fadt->revision; -+ acpi_fadt.force_apic_physical_destination_mode = -+ fadt->force_apic_physical_destination_mode; -+ -+#if defined(CONFIG_X86_PM_TIMER) && !defined(CONFIG_XEN) -+ /* detect the location of the ACPI PM Timer */ -+ if (fadt->revision >= FADT2_REVISION_ID) { -+ /* FADT rev. 2 */ -+ if (fadt->xpm_tmr_blk.address_space_id != -+ ACPI_ADR_SPACE_SYSTEM_IO) -+ return 0; -+ -+ pmtmr_ioport = fadt->xpm_tmr_blk.address; -+ /* -+ * "X" fields are optional extensions to the original V1.0 -+ * fields, so we must selectively expand V1.0 fields if the -+ * corresponding X field is zero. -+ */ -+ if (!pmtmr_ioport) -+ pmtmr_ioport = fadt->V1_pm_tmr_blk; -+ } else { -+ /* FADT rev. 1 */ -+ pmtmr_ioport = fadt->V1_pm_tmr_blk; -+ } -+ if (pmtmr_ioport) -+ printk(KERN_INFO PREFIX "PM-Timer IO Port: %#x\n", -+ pmtmr_ioport); -+#endif -+ return 0; -+} -+ -+unsigned long __init acpi_find_rsdp(void) -+{ -+ unsigned long rsdp_phys = 0; -+ -+ if (efi_enabled) { -+ if (efi.acpi20 != EFI_INVALID_TABLE_ADDR) -+ return efi.acpi20; -+ else if (efi.acpi != EFI_INVALID_TABLE_ADDR) -+ return efi.acpi; -+ } -+ /* -+ * Scan memory looking for the RSDP signature. First search EBDA (low -+ * memory) paragraphs and then search upper memory (E0000-FFFFF). -+ */ -+ rsdp_phys = acpi_scan_rsdp(0, 0x400); -+ if (!rsdp_phys) -+ rsdp_phys = acpi_scan_rsdp(0xE0000, 0x20000); -+ -+ return rsdp_phys; -+} -+ -+#ifdef CONFIG_X86_LOCAL_APIC -+/* -+ * Parse LAPIC entries in MADT -+ * returns 0 on success, < 0 on error -+ */ -+static int __init acpi_parse_madt_lapic_entries(void) -+{ -+ int count; -+ -+ if (!cpu_has_apic) -+ return -ENODEV; -+ -+ /* -+ * Note that the LAPIC address is obtained from the MADT (32-bit value) -+ * and (optionally) overriden by a LAPIC_ADDR_OVR entry (64-bit value). -+ */ -+ -+ count = -+ acpi_table_parse_madt(ACPI_MADT_LAPIC_ADDR_OVR, -+ acpi_parse_lapic_addr_ovr, 0); -+ if (count < 0) { -+ printk(KERN_ERR PREFIX -+ "Error parsing LAPIC address override entry\n"); -+ return count; -+ } -+ -+ mp_register_lapic_address(acpi_lapic_addr); -+ -+ count = acpi_table_parse_madt(ACPI_MADT_LAPIC, acpi_parse_lapic, -+ MAX_APICS); -+ if (!count) { -+ printk(KERN_ERR PREFIX "No LAPIC entries present\n"); -+ /* TBD: Cleanup to allow fallback to MPS */ -+ return -ENODEV; -+ } else if (count < 0) { -+ printk(KERN_ERR PREFIX "Error parsing LAPIC entry\n"); -+ /* TBD: Cleanup to allow fallback to MPS */ -+ return count; -+ } -+ -+ count = -+ acpi_table_parse_madt(ACPI_MADT_LAPIC_NMI, acpi_parse_lapic_nmi, 0); -+ if (count < 0) { -+ printk(KERN_ERR PREFIX "Error parsing LAPIC NMI entry\n"); -+ /* TBD: Cleanup to allow fallback to MPS */ -+ return count; -+ } -+ return 0; -+} -+#endif /* CONFIG_X86_LOCAL_APIC */ -+ -+#ifdef CONFIG_X86_IO_APIC -+/* -+ * Parse IOAPIC related entries in MADT -+ * returns 0 on success, < 0 on error -+ */ -+static int __init acpi_parse_madt_ioapic_entries(void) -+{ -+ int count; -+ -+ /* -+ * ACPI interpreter is required to complete interrupt setup, -+ * so if it is off, don't enumerate the io-apics with ACPI. -+ * If MPS is present, it will handle them, -+ * otherwise the system will stay in PIC mode -+ */ -+ if (acpi_disabled || acpi_noirq) { -+ return -ENODEV; -+ } -+ -+ if (!cpu_has_apic) -+ return -ENODEV; -+ -+ /* -+ * if "noapic" boot option, don't look for IO-APICs -+ */ -+ if (skip_ioapic_setup) { -+ printk(KERN_INFO PREFIX "Skipping IOAPIC probe " -+ "due to 'noapic' option.\n"); -+ return -ENODEV; -+ } -+ -+ count = -+ acpi_table_parse_madt(ACPI_MADT_IOAPIC, acpi_parse_ioapic, -+ MAX_IO_APICS); -+ if (!count) { -+ printk(KERN_ERR PREFIX "No IOAPIC entries present\n"); -+ return -ENODEV; -+ } else if (count < 0) { -+ printk(KERN_ERR PREFIX "Error parsing IOAPIC entry\n"); -+ return count; -+ } -+ -+ count = -+ acpi_table_parse_madt(ACPI_MADT_INT_SRC_OVR, acpi_parse_int_src_ovr, -+ NR_IRQ_VECTORS); -+ if (count < 0) { -+ printk(KERN_ERR PREFIX -+ "Error parsing interrupt source overrides entry\n"); -+ /* TBD: Cleanup to allow fallback to MPS */ -+ return count; -+ } -+ -+ /* -+ * If BIOS did not supply an INT_SRC_OVR for the SCI -+ * pretend we got one so we can set the SCI flags. -+ */ -+ if (!acpi_sci_override_gsi) -+ acpi_sci_ioapic_setup(acpi_fadt.sci_int, 0, 0); -+ -+ /* Fill in identity legacy mapings where no override */ -+ mp_config_acpi_legacy_irqs(); -+ -+ count = -+ acpi_table_parse_madt(ACPI_MADT_NMI_SRC, acpi_parse_nmi_src, -+ NR_IRQ_VECTORS); -+ if (count < 0) { -+ printk(KERN_ERR PREFIX "Error parsing NMI SRC entry\n"); -+ /* TBD: Cleanup to allow fallback to MPS */ -+ return count; -+ } -+ -+ return 0; -+} -+#else -+static inline int acpi_parse_madt_ioapic_entries(void) -+{ -+ return -1; -+} -+#endif /* !CONFIG_X86_IO_APIC */ -+ -+static void __init acpi_process_madt(void) -+{ -+#ifdef CONFIG_X86_LOCAL_APIC -+ int count, error; -+ -+ count = acpi_table_parse(ACPI_APIC, acpi_parse_madt); -+ if (count >= 1) { -+ -+ /* -+ * Parse MADT LAPIC entries -+ */ -+ error = acpi_parse_madt_lapic_entries(); -+ if (!error) { -+ acpi_lapic = 1; -+ -+#ifdef CONFIG_X86_GENERICARCH -+ generic_bigsmp_probe(); -+#endif -+ /* -+ * Parse MADT IO-APIC entries -+ */ -+ error = acpi_parse_madt_ioapic_entries(); -+ if (!error) { -+ acpi_irq_model = ACPI_IRQ_MODEL_IOAPIC; -+ acpi_irq_balance_set(NULL); -+ acpi_ioapic = 1; -+ -+ smp_found_config = 1; -+ clustered_apic_check(); -+ } -+ } -+ if (error == -EINVAL) { -+ /* -+ * Dell Precision Workstation 410, 610 come here. -+ */ -+ printk(KERN_ERR PREFIX -+ "Invalid BIOS MADT, disabling ACPI\n"); -+ disable_acpi(); -+ } -+ } -+#endif -+ return; -+} -+ -+extern int acpi_force; -+ -+#ifdef __i386__ -+ -+static int __init disable_acpi_irq(struct dmi_system_id *d) -+{ -+ if (!acpi_force) { -+ printk(KERN_NOTICE "%s detected: force use of acpi=noirq\n", -+ d->ident); -+ acpi_noirq_set(); -+ } -+ return 0; -+} -+ -+static int __init disable_acpi_pci(struct dmi_system_id *d) -+{ -+ if (!acpi_force) { -+ printk(KERN_NOTICE "%s detected: force use of pci=noacpi\n", -+ d->ident); -+ acpi_disable_pci(); -+ } -+ return 0; -+} -+ -+static int __init dmi_disable_acpi(struct dmi_system_id *d) -+{ -+ if (!acpi_force) { -+ printk(KERN_NOTICE "%s detected: acpi off\n", d->ident); -+ disable_acpi(); -+ } else { -+ printk(KERN_NOTICE -+ "Warning: DMI blacklist says broken, but acpi forced\n"); -+ } -+ return 0; -+} -+ -+/* -+ * Limit ACPI to CPU enumeration for HT -+ */ -+static int __init force_acpi_ht(struct dmi_system_id *d) -+{ -+ if (!acpi_force) { -+ printk(KERN_NOTICE "%s detected: force use of acpi=ht\n", -+ d->ident); -+ disable_acpi(); -+ acpi_ht = 1; -+ } else { -+ printk(KERN_NOTICE -+ "Warning: acpi=force overrules DMI blacklist: acpi=ht\n"); -+ } -+ return 0; -+} -+ -+/* -+ * If your system is blacklisted here, but you find that acpi=force -+ * works for you, please contact acpi-devel@sourceforge.net -+ */ -+static struct dmi_system_id __initdata acpi_dmi_table[] = { -+ /* -+ * Boxes that need ACPI disabled -+ */ -+ { -+ .callback = dmi_disable_acpi, -+ .ident = "IBM Thinkpad", -+ .matches = { -+ DMI_MATCH(DMI_BOARD_VENDOR, "IBM"), -+ DMI_MATCH(DMI_BOARD_NAME, "2629H1G"), -+ }, -+ }, -+ -+ /* -+ * Boxes that need acpi=ht -+ */ -+ { -+ .callback = force_acpi_ht, -+ .ident = "FSC Primergy T850", -+ .matches = { -+ DMI_MATCH(DMI_SYS_VENDOR, "FUJITSU SIEMENS"), -+ DMI_MATCH(DMI_PRODUCT_NAME, "PRIMERGY T850"), -+ }, -+ }, -+ { -+ .callback = force_acpi_ht, -+ .ident = "DELL GX240", -+ .matches = { -+ DMI_MATCH(DMI_BOARD_VENDOR, "Dell Computer Corporation"), -+ DMI_MATCH(DMI_BOARD_NAME, "OptiPlex GX240"), -+ }, -+ }, -+ { -+ .callback = force_acpi_ht, -+ .ident = "HP VISUALIZE NT Workstation", -+ .matches = { -+ DMI_MATCH(DMI_BOARD_VENDOR, "Hewlett-Packard"), -+ DMI_MATCH(DMI_PRODUCT_NAME, "HP VISUALIZE NT Workstation"), -+ }, -+ }, -+ { -+ .callback = force_acpi_ht, -+ .ident = "Compaq Workstation W8000", -+ .matches = { -+ DMI_MATCH(DMI_SYS_VENDOR, "Compaq"), -+ DMI_MATCH(DMI_PRODUCT_NAME, "Workstation W8000"), -+ }, -+ }, -+ { -+ .callback = force_acpi_ht, -+ .ident = "ASUS P4B266", -+ .matches = { -+ DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."), -+ DMI_MATCH(DMI_BOARD_NAME, "P4B266"), -+ }, -+ }, -+ { -+ .callback = force_acpi_ht, -+ .ident = "ASUS P2B-DS", -+ .matches = { -+ DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."), -+ DMI_MATCH(DMI_BOARD_NAME, "P2B-DS"), -+ }, -+ }, -+ { -+ .callback = force_acpi_ht, -+ .ident = "ASUS CUR-DLS", -+ .matches = { -+ DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."), -+ DMI_MATCH(DMI_BOARD_NAME, "CUR-DLS"), -+ }, -+ }, -+ { -+ .callback = force_acpi_ht, -+ .ident = "ABIT i440BX-W83977", -+ .matches = { -+ DMI_MATCH(DMI_BOARD_VENDOR, "ABIT "), -+ DMI_MATCH(DMI_BOARD_NAME, "i440BX-W83977 (BP6)"), -+ }, -+ }, -+ { -+ .callback = force_acpi_ht, -+ .ident = "IBM Bladecenter", -+ .matches = { -+ DMI_MATCH(DMI_BOARD_VENDOR, "IBM"), -+ DMI_MATCH(DMI_BOARD_NAME, "IBM eServer BladeCenter HS20"), -+ }, -+ }, -+ { -+ .callback = force_acpi_ht, -+ .ident = "IBM eServer xSeries 360", -+ .matches = { -+ DMI_MATCH(DMI_BOARD_VENDOR, "IBM"), -+ DMI_MATCH(DMI_BOARD_NAME, "eServer xSeries 360"), -+ }, -+ }, -+ { -+ .callback = force_acpi_ht, -+ .ident = "IBM eserver xSeries 330", -+ .matches = { -+ DMI_MATCH(DMI_BOARD_VENDOR, "IBM"), -+ DMI_MATCH(DMI_BOARD_NAME, "eserver xSeries 330"), -+ }, -+ }, -+ { -+ .callback = force_acpi_ht, -+ .ident = "IBM eserver xSeries 440", -+ .matches = { -+ DMI_MATCH(DMI_BOARD_VENDOR, "IBM"), -+ DMI_MATCH(DMI_PRODUCT_NAME, "eserver xSeries 440"), -+ }, -+ }, -+ -+ /* -+ * Boxes that need ACPI PCI IRQ routing disabled -+ */ -+ { -+ .callback = disable_acpi_irq, -+ .ident = "ASUS A7V", -+ .matches = { -+ DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC"), -+ DMI_MATCH(DMI_BOARD_NAME, ""), -+ /* newer BIOS, Revision 1011, does work */ -+ DMI_MATCH(DMI_BIOS_VERSION, -+ "ASUS A7V ACPI BIOS Revision 1007"), -+ }, -+ }, -+ -+ /* -+ * Boxes that need ACPI PCI IRQ routing and PCI scan disabled -+ */ -+ { /* _BBN 0 bug */ -+ .callback = disable_acpi_pci, -+ .ident = "ASUS PR-DLS", -+ .matches = { -+ DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."), -+ DMI_MATCH(DMI_BOARD_NAME, "PR-DLS"), -+ DMI_MATCH(DMI_BIOS_VERSION, -+ "ASUS PR-DLS ACPI BIOS Revision 1010"), -+ DMI_MATCH(DMI_BIOS_DATE, "03/21/2003") -+ }, -+ }, -+ { -+ .callback = disable_acpi_pci, -+ .ident = "Acer TravelMate 36x Laptop", -+ .matches = { -+ DMI_MATCH(DMI_SYS_VENDOR, "Acer"), -+ DMI_MATCH(DMI_PRODUCT_NAME, "TravelMate 360"), -+ }, -+ }, -+ {} -+}; -+ -+#endif /* __i386__ */ -+ -+/* -+ * acpi_boot_table_init() and acpi_boot_init() -+ * called from setup_arch(), always. -+ * 1. checksums all tables -+ * 2. enumerates lapics -+ * 3. enumerates io-apics -+ * -+ * acpi_table_init() is separate to allow reading SRAT without -+ * other side effects. -+ * -+ * side effects of acpi_boot_init: -+ * acpi_lapic = 1 if LAPIC found -+ * acpi_ioapic = 1 if IOAPIC found -+ * if (acpi_lapic && acpi_ioapic) smp_found_config = 1; -+ * if acpi_blacklisted() acpi_disabled = 1; -+ * acpi_irq_model=... -+ * ... -+ * -+ * return value: (currently ignored) -+ * 0: success -+ * !0: failure -+ */ -+ -+int __init acpi_boot_table_init(void) -+{ -+ int error; -+ -+#ifdef __i386__ -+ dmi_check_system(acpi_dmi_table); -+#endif -+ -+ /* -+ * If acpi_disabled, bail out -+ * One exception: acpi=ht continues far enough to enumerate LAPICs -+ */ -+ if (acpi_disabled && !acpi_ht) -+ return 1; -+ -+ /* -+ * Initialize the ACPI boot-time table parser. -+ */ -+ error = acpi_table_init(); -+ if (error) { -+ disable_acpi(); -+ return error; -+ } -+ -+ acpi_table_parse(ACPI_BOOT, acpi_parse_sbf); -+ -+ /* -+ * blacklist may disable ACPI entirely -+ */ -+ error = acpi_blacklisted(); -+ if (error) { -+ if (acpi_force) { -+ printk(KERN_WARNING PREFIX "acpi=force override\n"); -+ } else { -+ printk(KERN_WARNING PREFIX "Disabling ACPI support\n"); -+ disable_acpi(); -+ return error; -+ } -+ } -+ -+ return 0; -+} -+ -+int __init acpi_boot_init(void) -+{ -+ /* -+ * If acpi_disabled, bail out -+ * One exception: acpi=ht continues far enough to enumerate LAPICs -+ */ -+ if (acpi_disabled && !acpi_ht) -+ return 1; -+ -+ acpi_table_parse(ACPI_BOOT, acpi_parse_sbf); -+ -+ /* -+ * set sci_int and PM timer address -+ */ -+ acpi_table_parse(ACPI_FADT, acpi_parse_fadt); -+ -+ /* -+ * Process the Multiple APIC Description Table (MADT), if present -+ */ -+ acpi_process_madt(); -+ -+ acpi_table_parse(ACPI_HPET, acpi_parse_hpet); -+ -+ return 0; -+} -diff -rpuN linux-2.6.18.8/arch/i386/kernel/acpi/sleep-xen.c linux-2.6.18-xen-3.2.0/arch/i386/kernel/acpi/sleep-xen.c ---- linux-2.6.18.8/arch/i386/kernel/acpi/sleep-xen.c 1969-12-31 16:00:00.000000000 -0800 -+++ linux-2.6.18-xen-3.2.0/arch/i386/kernel/acpi/sleep-xen.c 2008-02-15 16:21:49.000000000 -0800 -@@ -0,0 +1,134 @@ -+/* -+ * sleep.c - x86-specific ACPI sleep support. -+ * -+ * Copyright (C) 2001-2003 Patrick Mochel -+ * Copyright (C) 2001-2003 Pavel Machek -+ */ -+ -+#include -+#include -+#include -+#include -+ -+#include -+ -+#ifndef CONFIG_ACPI_PV_SLEEP -+/* address in low memory of the wakeup routine. */ -+unsigned long acpi_wakeup_address = 0; -+unsigned long acpi_video_flags; -+extern char wakeup_start, wakeup_end; -+ -+extern unsigned long FASTCALL(acpi_copy_wakeup_routine(unsigned long)); -+#endif -+ -+/** -+ * acpi_save_state_mem - save kernel state -+ * -+ * Create an identity mapped page table and copy the wakeup routine to -+ * low memory. -+ */ -+int acpi_save_state_mem(void) -+{ -+#ifndef CONFIG_ACPI_PV_SLEEP -+ if (!acpi_wakeup_address) -+ return 1; -+ memcpy((void *)acpi_wakeup_address, &wakeup_start, -+ &wakeup_end - &wakeup_start); -+ acpi_copy_wakeup_routine(acpi_wakeup_address); -+#endif -+ return 0; -+} -+ -+/* -+ * acpi_restore_state - undo effects of acpi_save_state_mem -+ */ -+void acpi_restore_state_mem(void) -+{ -+} -+ -+/** -+ * acpi_reserve_bootmem - do _very_ early ACPI initialisation -+ * -+ * We allocate a page from the first 1MB of memory for the wakeup -+ * routine for when we come back from a sleep state. The -+ * runtime allocator allows specification of <16MB pages, but not -+ * <1MB pages. -+ */ -+void __init acpi_reserve_bootmem(void) -+{ -+#ifndef CONFIG_ACPI_PV_SLEEP -+ if ((&wakeup_end - &wakeup_start) > PAGE_SIZE) { -+ printk(KERN_ERR -+ "ACPI: Wakeup code way too big, S3 disabled.\n"); -+ return; -+ } -+ -+ acpi_wakeup_address = (unsigned long)alloc_bootmem_low(PAGE_SIZE); -+ if (!acpi_wakeup_address) -+ printk(KERN_ERR "ACPI: Cannot allocate lowmem, S3 disabled.\n"); -+#endif -+} -+ -+#ifndef CONFIG_ACPI_PV_SLEEP -+static int __init acpi_sleep_setup(char *str) -+{ -+ while ((str != NULL) && (*str != '\0')) { -+ if (strncmp(str, "s3_bios", 7) == 0) -+ acpi_video_flags = 1; -+ if (strncmp(str, "s3_mode", 7) == 0) -+ acpi_video_flags |= 2; -+ str = strchr(str, ','); -+ if (str != NULL) -+ str += strspn(str, ", \t"); -+ } -+ return 1; -+} -+ -+__setup("acpi_sleep=", acpi_sleep_setup); -+ -+static __init int reset_videomode_after_s3(struct dmi_system_id *d) -+{ -+ acpi_video_flags |= 2; -+ return 0; -+} -+ -+static __initdata struct dmi_system_id acpisleep_dmi_table[] = { -+ { /* Reset video mode after returning from ACPI S3 sleep */ -+ .callback = reset_videomode_after_s3, -+ .ident = "Toshiba Satellite 4030cdt", -+ .matches = { -+ DMI_MATCH(DMI_PRODUCT_NAME, "S4030CDT/4.3"), -+ }, -+ }, -+ {} -+}; -+ -+static int __init acpisleep_dmi_init(void) -+{ -+ dmi_check_system(acpisleep_dmi_table); -+ return 0; -+} -+ -+core_initcall(acpisleep_dmi_init); -+ -+#else /* CONFIG_ACPI_PV_SLEEP */ -+#include -+#include -+int acpi_notify_hypervisor_state(u8 sleep_state, -+ u32 pm1a_cnt, u32 pm1b_cnt) -+{ -+ struct xen_platform_op op = { -+ .cmd = XENPF_enter_acpi_sleep, -+ .interface_version = XENPF_INTERFACE_VERSION, -+ .u = { -+ .enter_acpi_sleep = { -+ .pm1a_cnt_val = (u16)pm1a_cnt, -+ .pm1b_cnt_val = (u16)pm1b_cnt, -+ .sleep_state = sleep_state, -+ }, -+ }, -+ }; -+ -+ return HYPERVISOR_platform_op(&op); -+} -+#endif /* CONFIG_ACPI_PV_SLEEP */ -diff -rpuN linux-2.6.18.8/arch/i386/kernel/apic-xen.c linux-2.6.18-xen-3.2.0/arch/i386/kernel/apic-xen.c ---- linux-2.6.18.8/arch/i386/kernel/apic-xen.c 1969-12-31 16:00:00.000000000 -0800 -+++ linux-2.6.18-xen-3.2.0/arch/i386/kernel/apic-xen.c 2008-02-15 16:21:49.000000000 -0800 -@@ -0,0 +1,155 @@ -+/* -+ * Local APIC handling, local APIC timers -+ * -+ * (c) 1999, 2000 Ingo Molnar -+ * -+ * Fixes -+ * Maciej W. Rozycki : Bits for genuine 82489DX APICs; -+ * thanks to Eric Gilmore -+ * and Rolf G. Tews -+ * for testing these extensively. -+ * Maciej W. Rozycki : Various updates and fixes. -+ * Mikael Pettersson : Power Management for UP-APIC. -+ * Pavel Machek and -+ * Mikael Pettersson : PM converted to driver model. -+ */ -+ -+#include -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#include -+#include -+#include -+ -+#include "io_ports.h" -+ -+#ifndef CONFIG_XEN -+/* -+ * cpu_mask that denotes the CPUs that needs timer interrupt coming in as -+ * IPIs in place of local APIC timers -+ */ -+static cpumask_t timer_bcast_ipi; -+#endif -+ -+/* -+ * Knob to control our willingness to enable the local APIC. -+ */ -+int enable_local_apic __initdata = 0; /* -1=force-disable, +1=force-enable */ -+ -+/* -+ * Debug level -+ */ -+int apic_verbosity; -+ -+#ifndef CONFIG_XEN -+static int modern_apic(void) -+{ -+ unsigned int lvr, version; -+ /* AMD systems use old APIC versions, so check the CPU */ -+ if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD && -+ boot_cpu_data.x86 >= 0xf) -+ return 1; -+ lvr = apic_read(APIC_LVR); -+ version = GET_APIC_VERSION(lvr); -+ return version >= 0x14; -+} -+#endif /* !CONFIG_XEN */ -+ -+/* -+ * 'what should we do if we get a hw irq event on an illegal vector'. -+ * each architecture has to answer this themselves. -+ */ -+void ack_bad_irq(unsigned int irq) -+{ -+ printk("unexpected IRQ trap at vector %02x\n", irq); -+ /* -+ * Currently unexpected vectors happen only on SMP and APIC. -+ * We _must_ ack these because every local APIC has only N -+ * irq slots per priority level, and a 'hanging, unacked' IRQ -+ * holds up an irq slot - in excessive cases (when multiple -+ * unexpected vectors occur) that might lock up the APIC -+ * completely. -+ * But only ack when the APIC is enabled -AK -+ */ -+ if (cpu_has_apic) -+ ack_APIC_irq(); -+} -+ -+int get_physical_broadcast(void) -+{ -+ return 0xff; -+} -+ -+#ifndef CONFIG_XEN -+#ifndef CONFIG_SMP -+static void up_apic_timer_interrupt_call(struct pt_regs *regs) -+{ -+ int cpu = smp_processor_id(); -+ -+ /* -+ * the NMI deadlock-detector uses this. -+ */ -+ per_cpu(irq_stat, cpu).apic_timer_irqs++; -+ -+ smp_local_timer_interrupt(regs); -+} -+#endif -+ -+void smp_send_timer_broadcast_ipi(struct pt_regs *regs) -+{ -+ cpumask_t mask; -+ -+ cpus_and(mask, cpu_online_map, timer_bcast_ipi); -+ if (!cpus_empty(mask)) { -+#ifdef CONFIG_SMP -+ send_IPI_mask(mask, LOCAL_TIMER_VECTOR); -+#else -+ /* -+ * We can directly call the apic timer interrupt handler -+ * in UP case. Minus all irq related functions -+ */ -+ up_apic_timer_interrupt_call(regs); -+#endif -+ } -+} -+#endif -+ -+int setup_profiling_timer(unsigned int multiplier) -+{ -+ return -EINVAL; -+} -+ -+/* -+ * This initializes the IO-APIC and APIC hardware if this is -+ * a UP kernel. -+ */ -+int __init APIC_init_uniprocessor (void) -+{ -+#ifdef CONFIG_X86_IO_APIC -+ if (smp_found_config) -+ if (!skip_ioapic_setup && nr_ioapics) -+ setup_IO_APIC(); -+#endif -+ -+ return 0; -+} -diff -rpuN linux-2.6.18.8/arch/i386/kernel/asm-offsets.c linux-2.6.18-xen-3.2.0/arch/i386/kernel/asm-offsets.c ---- linux-2.6.18.8/arch/i386/kernel/asm-offsets.c 2007-02-23 15:52:30.000000000 -0800 -+++ linux-2.6.18-xen-3.2.0/arch/i386/kernel/asm-offsets.c 2008-02-15 16:21:49.000000000 -0800 -@@ -66,9 +66,14 @@ void foo(void) - OFFSET(pbe_orig_address, pbe, orig_address); - OFFSET(pbe_next, pbe, next); - -+#ifndef CONFIG_X86_NO_TSS - /* Offset from the sysenter stack to tss.esp0 */ -- DEFINE(TSS_sysenter_esp0, offsetof(struct tss_struct, esp0) - -+ DEFINE(SYSENTER_stack_esp0, offsetof(struct tss_struct, esp0) - - sizeof(struct tss_struct)); -+#else -+ /* sysenter stack points directly to esp0 */ -+ DEFINE(SYSENTER_stack_esp0, 0); -+#endif - - DEFINE(PAGE_SIZE_asm, PAGE_SIZE); - DEFINE(VDSO_PRELINK, VDSO_PRELINK); -diff -rpuN linux-2.6.18.8/arch/i386/kernel/cpu/Makefile linux-2.6.18-xen-3.2.0/arch/i386/kernel/cpu/Makefile ---- linux-2.6.18.8/arch/i386/kernel/cpu/Makefile 2007-02-23 15:52:30.000000000 -0800 -+++ linux-2.6.18-xen-3.2.0/arch/i386/kernel/cpu/Makefile 2008-02-15 16:21:49.000000000 -0800 -@@ -17,3 +17,4 @@ obj-$(CONFIG_X86_MCE) += mcheck/ - - obj-$(CONFIG_MTRR) += mtrr/ - obj-$(CONFIG_CPU_FREQ) += cpufreq/ -+ -diff -rpuN linux-2.6.18.8/arch/i386/kernel/cpu/common-xen.c linux-2.6.18-xen-3.2.0/arch/i386/kernel/cpu/common-xen.c ---- linux-2.6.18.8/arch/i386/kernel/cpu/common-xen.c 1969-12-31 16:00:00.000000000 -0800 -+++ linux-2.6.18-xen-3.2.0/arch/i386/kernel/cpu/common-xen.c 2008-02-15 16:21:49.000000000 -0800 -@@ -0,0 +1,743 @@ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#ifdef CONFIG_X86_LOCAL_APIC -+#include -+#include -+#include -+#else -+#ifdef CONFIG_XEN -+#define phys_pkg_id(a,b) a -+#endif -+#endif -+#include -+ -+#include "cpu.h" -+ -+DEFINE_PER_CPU(struct Xgt_desc_struct, cpu_gdt_descr); -+EXPORT_PER_CPU_SYMBOL(cpu_gdt_descr); -+ -+#ifndef CONFIG_XEN -+DEFINE_PER_CPU(unsigned char, cpu_16bit_stack[CPU_16BIT_STACK_SIZE]); -+EXPORT_PER_CPU_SYMBOL(cpu_16bit_stack); -+#endif -+ -+static int cachesize_override __cpuinitdata = -1; -+static int disable_x86_fxsr __cpuinitdata; -+static int disable_x86_serial_nr __cpuinitdata = 1; -+static int disable_x86_sep __cpuinitdata; -+ -+struct cpu_dev * cpu_devs[X86_VENDOR_NUM] = {}; -+ -+extern int disable_pse; -+ -+static void default_init(struct cpuinfo_x86 * c) -+{ -+ /* Not much we can do here... */ -+ /* Check if at least it has cpuid */ -+ if (c->cpuid_level == -1) { -+ /* No cpuid. It must be an ancient CPU */ -+ if (c->x86 == 4) -+ strcpy(c->x86_model_id, "486"); -+ else if (c->x86 == 3) -+ strcpy(c->x86_model_id, "386"); -+ } -+} -+ -+static struct cpu_dev default_cpu = { -+ .c_init = default_init, -+ .c_vendor = "Unknown", -+}; -+static struct cpu_dev * this_cpu = &default_cpu; -+ -+static int __init cachesize_setup(char *str) -+{ -+ get_option (&str, &cachesize_override); -+ return 1; -+} -+__setup("cachesize=", cachesize_setup); -+ -+int __cpuinit get_model_name(struct cpuinfo_x86 *c) -+{ -+ unsigned int *v; -+ char *p, *q; -+ -+ if (cpuid_eax(0x80000000) < 0x80000004) -+ return 0; -+ -+ v = (unsigned int *) c->x86_model_id; -+ cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]); -+ cpuid(0x80000003, &v[4], &v[5], &v[6], &v[7]); -+ cpuid(0x80000004, &v[8], &v[9], &v[10], &v[11]); -+ c->x86_model_id[48] = 0; -+ -+ /* Intel chips right-justify this string for some dumb reason; -+ undo that brain damage */ -+ p = q = &c->x86_model_id[0]; -+ while ( *p == ' ' ) -+ p++; -+ if ( p != q ) { -+ while ( *p ) -+ *q++ = *p++; -+ while ( q <= &c->x86_model_id[48] ) -+ *q++ = '\0'; /* Zero-pad the rest */ -+ } -+ -+ return 1; -+} -+ -+ -+void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c) -+{ -+ unsigned int n, dummy, ecx, edx, l2size; -+ -+ n = cpuid_eax(0x80000000); -+ -+ if (n >= 0x80000005) { -+ cpuid(0x80000005, &dummy, &dummy, &ecx, &edx); -+ printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), D cache %dK (%d bytes/line)\n", -+ edx>>24, edx&0xFF, ecx>>24, ecx&0xFF); -+ c->x86_cache_size=(ecx>>24)+(edx>>24); -+ } -+ -+ if (n < 0x80000006) /* Some chips just has a large L1. */ -+ return; -+ -+ ecx = cpuid_ecx(0x80000006); -+ l2size = ecx >> 16; -+ -+ /* do processor-specific cache resizing */ -+ if (this_cpu->c_size_cache) -+ l2size = this_cpu->c_size_cache(c,l2size); -+ -+ /* Allow user to override all this if necessary. */ -+ if (cachesize_override != -1) -+ l2size = cachesize_override; -+ -+ if ( l2size == 0 ) -+ return; /* Again, no L2 cache is possible */ -+ -+ c->x86_cache_size = l2size; -+ -+ printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n", -+ l2size, ecx & 0xFF); -+} -+ -+/* Naming convention should be: [()] */ -+/* This table only is used unless init_() below doesn't set it; */ -+/* in particular, if CPUID levels 0x80000002..4 are supported, this isn't used */ -+ -+/* Look up CPU names by table lookup. */ -+static char __cpuinit *table_lookup_model(struct cpuinfo_x86 *c) -+{ -+ struct cpu_model_info *info; -+ -+ if ( c->x86_model >= 16 ) -+ return NULL; /* Range check */ -+ -+ if (!this_cpu) -+ return NULL; -+ -+ info = this_cpu->c_models; -+ -+ while (info && info->family) { -+ if (info->family == c->x86) -+ return info->model_names[c->x86_model]; -+ info++; -+ } -+ return NULL; /* Not found */ -+} -+ -+ -+static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c, int early) -+{ -+ char *v = c->x86_vendor_id; -+ int i; -+ static int printed; -+ -+ for (i = 0; i < X86_VENDOR_NUM; i++) { -+ if (cpu_devs[i]) { -+ if (!strcmp(v,cpu_devs[i]->c_ident[0]) || -+ (cpu_devs[i]->c_ident[1] && -+ !strcmp(v,cpu_devs[i]->c_ident[1]))) { -+ c->x86_vendor = i; -+ if (!early) -+ this_cpu = cpu_devs[i]; -+ return; -+ } -+ } -+ } -+ if (!printed) { -+ printed++; -+ printk(KERN_ERR "CPU: Vendor unknown, using generic init.\n"); -+ printk(KERN_ERR "CPU: Your system may be unstable.\n"); -+ } -+ c->x86_vendor = X86_VENDOR_UNKNOWN; -+ this_cpu = &default_cpu; -+} -+ -+ -+static int __init x86_fxsr_setup(char * s) -+{ -+ disable_x86_fxsr = 1; -+ return 1; -+} -+__setup("nofxsr", x86_fxsr_setup); -+ -+ -+static int __init x86_sep_setup(char * s) -+{ -+ disable_x86_sep = 1; -+ return 1; -+} -+__setup("nosep", x86_sep_setup); -+ -+ -+/* Standard macro to see if a specific flag is changeable */ -+static inline int flag_is_changeable_p(u32 flag) -+{ -+ u32 f1, f2; -+ -+ asm("pushfl\n\t" -+ "pushfl\n\t" -+ "popl %0\n\t" -+ "movl %0,%1\n\t" -+ "xorl %2,%0\n\t" -+ "pushl %0\n\t" -+ "popfl\n\t" -+ "pushfl\n\t" -+ "popl %0\n\t" -+ "popfl\n\t" -+ : "=&r" (f1), "=&r" (f2) -+ : "ir" (flag)); -+ -+ return ((f1^f2) & flag) != 0; -+} -+ -+ -+/* Probe for the CPUID instruction */ -+static int __cpuinit have_cpuid_p(void) -+{ -+ return flag_is_changeable_p(X86_EFLAGS_ID); -+} -+ -+/* Do minimum CPU detection early. -+ Fields really needed: vendor, cpuid_level, family, model, mask, cache alignment. -+ The others are not touched to avoid unwanted side effects. -+ -+ WARNING: this function is only called on the BP. Don't add code here -+ that is supposed to run on all CPUs. */ -+static void __init early_cpu_detect(void) -+{ -+ struct cpuinfo_x86 *c = &boot_cpu_data; -+ -+ c->x86_cache_alignment = 32; -+ -+ if (!have_cpuid_p()) -+ return; -+ -+ /* Get vendor name */ -+ cpuid(0x00000000, &c->cpuid_level, -+ (int *)&c->x86_vendor_id[0], -+ (int *)&c->x86_vendor_id[8], -+ (int *)&c->x86_vendor_id[4]); -+ -+ get_cpu_vendor(c, 1); -+ -+ c->x86 = 4; -+ if (c->cpuid_level >= 0x00000001) { -+ u32 junk, tfms, cap0, misc; -+ cpuid(0x00000001, &tfms, &misc, &junk, &cap0); -+ c->x86 = (tfms >> 8) & 15; -+ c->x86_model = (tfms >> 4) & 15; -+ if (c->x86 == 0xf) -+ c->x86 += (tfms >> 20) & 0xff; -+ if (c->x86 >= 0x6) -+ c->x86_model += ((tfms >> 16) & 0xF) << 4; -+ c->x86_mask = tfms & 15; -+ if (cap0 & (1<<19)) -+ c->x86_cache_alignment = ((misc >> 8) & 0xff) * 8; -+ } -+} -+ -+void __cpuinit generic_identify(struct cpuinfo_x86 * c) -+{ -+ u32 tfms, xlvl; -+ int ebx; -+ -+ if (have_cpuid_p()) { -+ /* Get vendor name */ -+ cpuid(0x00000000, &c->cpuid_level, -+ (int *)&c->x86_vendor_id[0], -+ (int *)&c->x86_vendor_id[8], -+ (int *)&c->x86_vendor_id[4]); -+ -+ get_cpu_vendor(c, 0); -+ /* Initialize the standard set of capabilities */ -+ /* Note that the vendor-specific code below might override */ -+ -+ /* Intel-defined flags: level 0x00000001 */ -+ if ( c->cpuid_level >= 0x00000001 ) { -+ u32 capability, excap; -+ cpuid(0x00000001, &tfms, &ebx, &excap, &capability); -+ c->x86_capability[0] = capability; -+ c->x86_capability[4] = excap; -+ c->x86 = (tfms >> 8) & 15; -+ c->x86_model = (tfms >> 4) & 15; -+ if (c->x86 == 0xf) -+ c->x86 += (tfms >> 20) & 0xff; -+ if (c->x86 >= 0x6) -+ c->x86_model += ((tfms >> 16) & 0xF) << 4; -+ c->x86_mask = tfms & 15; -+#ifdef CONFIG_X86_HT -+ c->apicid = phys_pkg_id((ebx >> 24) & 0xFF, 0); -+#else -+ c->apicid = (ebx >> 24) & 0xFF; -+#endif -+ } else { -+ /* Have CPUID level 0 only - unheard of */ -+ c->x86 = 4; -+ } -+ -+ /* AMD-defined flags: level 0x80000001 */ -+ xlvl = cpuid_eax(0x80000000); -+ if ( (xlvl & 0xffff0000) == 0x80000000 ) { -+ if ( xlvl >= 0x80000001 ) { -+ c->x86_capability[1] = cpuid_edx(0x80000001); -+ c->x86_capability[6] = cpuid_ecx(0x80000001); -+ } -+ if ( xlvl >= 0x80000004 ) -+ get_model_name(c); /* Default name */ -+ } -+ } -+ -+ early_intel_workaround(c); -+ -+#ifdef CONFIG_X86_HT -+ c->phys_proc_id = (cpuid_ebx(1) >> 24) & 0xff; -+#endif -+} -+ -+static void __cpuinit squash_the_stupid_serial_number(struct cpuinfo_x86 *c) -+{ -+ if (cpu_has(c, X86_FEATURE_PN) && disable_x86_serial_nr ) { -+ /* Disable processor serial number */ -+ unsigned long lo,hi; -+ rdmsr(MSR_IA32_BBL_CR_CTL,lo,hi); -+ lo |= 0x200000; -+ wrmsr(MSR_IA32_BBL_CR_CTL,lo,hi); -+ printk(KERN_NOTICE "CPU serial number disabled.\n"); -+ clear_bit(X86_FEATURE_PN, c->x86_capability); -+ -+ /* Disabling the serial number may affect the cpuid level */ -+ c->cpuid_level = cpuid_eax(0); -+ } -+} -+ -+static int __init x86_serial_nr_setup(char *s) -+{ -+ disable_x86_serial_nr = 0; -+ return 1; -+} -+__setup("serialnumber", x86_serial_nr_setup); -+ -+ -+ -+/* -+ * This does the hard work of actually picking apart the CPU stuff... -+ */ -+void __cpuinit identify_cpu(struct cpuinfo_x86 *c) -+{ -+ int i; -+ -+ c->loops_per_jiffy = loops_per_jiffy; -+ c->x86_cache_size = -1; -+ c->x86_vendor = X86_VENDOR_UNKNOWN; -+ c->cpuid_level = -1; /* CPUID not detected */ -+ c->x86_model = c->x86_mask = 0; /* So far unknown... */ -+ c->x86_vendor_id[0] = '\0'; /* Unset */ -+ c->x86_model_id[0] = '\0'; /* Unset */ -+ c->x86_max_cores = 1; -+ memset(&c->x86_capability, 0, sizeof c->x86_capability); -+ -+ if (!have_cpuid_p()) { -+ /* First of all, decide if this is a 486 or higher */ -+ /* It's a 486 if we can modify the AC flag */ -+ if ( flag_is_changeable_p(X86_EFLAGS_AC) ) -+ c->x86 = 4; -+ else -+ c->x86 = 3; -+ } -+ -+ generic_identify(c); -+ -+ printk(KERN_DEBUG "CPU: After generic identify, caps:"); -+ for (i = 0; i < NCAPINTS; i++) -+ printk(" %08lx", c->x86_capability[i]); -+ printk("\n"); -+ -+ if (this_cpu->c_identify) { -+ this_cpu->c_identify(c); -+ -+ printk(KERN_DEBUG "CPU: After vendor identify, caps:"); -+ for (i = 0; i < NCAPINTS; i++) -+ printk(" %08lx", c->x86_capability[i]); -+ printk("\n"); -+ } -+ -+ /* -+ * Vendor-specific initialization. In this section we -+ * canonicalize the feature flags, meaning if there are -+ * features a certain CPU supports which CPUID doesn't -+ * tell us, CPUID claiming incorrect flags, or other bugs, -+ * we handle them here. -+ * -+ * At the end of this section, c->x86_capability better -+ * indicate the features this CPU genuinely supports! -+ */ -+ if (this_cpu->c_init) -+ this_cpu->c_init(c); -+ -+ /* Disable the PN if appropriate */ -+ squash_the_stupid_serial_number(c); -+ -+ /* -+ * The vendor-specific functions might have changed features. Now -+ * we do "generic changes." -+ */ -+ -+ /* TSC disabled? */ -+ if ( tsc_disable ) -+ clear_bit(X86_FEATURE_TSC, c->x86_capability); -+ -+ /* FXSR disabled? */ -+ if (disable_x86_fxsr) { -+ clear_bit(X86_FEATURE_FXSR, c->x86_capability); -+ clear_bit(X86_FEATURE_XMM, c->x86_capability); -+ } -+ -+ /* SEP disabled? */ -+ if (disable_x86_sep) -+ clear_bit(X86_FEATURE_SEP, c->x86_capability); -+ -+ if (disable_pse) -+ clear_bit(X86_FEATURE_PSE, c->x86_capability); -+ -+ /* If the model name is still unset, do table lookup. */ -+ if ( !c->x86_model_id[0] ) { -+ char *p; -+ p = table_lookup_model(c); -+ if ( p ) -+ strcpy(c->x86_model_id, p); -+ else -+ /* Last resort... */ -+ sprintf(c->x86_model_id, "%02x/%02x", -+ c->x86, c->x86_model); -+ } -+ -+ /* Now the feature flags better reflect actual CPU features! */ -+ -+ printk(KERN_DEBUG "CPU: After all inits, caps:"); -+ for (i = 0; i < NCAPINTS; i++) -+ printk(" %08lx", c->x86_capability[i]); -+ printk("\n"); -+ -+ /* -+ * On SMP, boot_cpu_data holds the common feature set between -+ * all CPUs; so make sure that we indicate which features are -+ * common between the CPUs. The first time this routine gets -+ * executed, c == &boot_cpu_data. -+ */ -+ if ( c != &boot_cpu_data ) { -+ /* AND the already accumulated flags with these */ -+ for ( i = 0 ; i < NCAPINTS ; i++ ) -+ boot_cpu_data.x86_capability[i] &= c->x86_capability[i]; -+ } -+ -+ /* Init Machine Check Exception if available. */ -+ mcheck_init(c); -+ -+ if (c == &boot_cpu_data) -+ sysenter_setup(); -+ enable_sep_cpu(); -+ -+ if (c == &boot_cpu_data) -+ mtrr_bp_init(); -+ else -+ mtrr_ap_init(); -+} -+ -+#ifdef CONFIG_X86_HT -+void __cpuinit detect_ht(struct cpuinfo_x86 *c) -+{ -+ u32 eax, ebx, ecx, edx; -+ int index_msb, core_bits; -+ -+ cpuid(1, &eax, &ebx, &ecx, &edx); -+ -+ if (!cpu_has(c, X86_FEATURE_HT) || cpu_has(c, X86_FEATURE_CMP_LEGACY)) -+ return; -+ -+ smp_num_siblings = (ebx & 0xff0000) >> 16; -+ -+ if (smp_num_siblings == 1) { -+ printk(KERN_INFO "CPU: Hyper-Threading is disabled\n"); -+ } else if (smp_num_siblings > 1 ) { -+ -+ if (smp_num_siblings > NR_CPUS) { -+ printk(KERN_WARNING "CPU: Unsupported number of the " -+ "siblings %d", smp_num_siblings); -+ smp_num_siblings = 1; -+ return; -+ } -+ -+ index_msb = get_count_order(smp_num_siblings); -+ c->phys_proc_id = phys_pkg_id((ebx >> 24) & 0xFF, index_msb); -+ -+ printk(KERN_INFO "CPU: Physical Processor ID: %d\n", -+ c->phys_proc_id); -+ -+ smp_num_siblings = smp_num_siblings / c->x86_max_cores; -+ -+ index_msb = get_count_order(smp_num_siblings) ; -+ -+ core_bits = get_count_order(c->x86_max_cores); -+ -+ c->cpu_core_id = phys_pkg_id((ebx >> 24) & 0xFF, index_msb) & -+ ((1 << core_bits) - 1); -+ -+ if (c->x86_max_cores > 1) -+ printk(KERN_INFO "CPU: Processor Core ID: %d\n", -+ c->cpu_core_id); -+ } -+} -+#endif -+ -+void __cpuinit print_cpu_info(struct cpuinfo_x86 *c) -+{ -+ char *vendor = NULL; -+ -+ if (c->x86_vendor < X86_VENDOR_NUM) -+ vendor = this_cpu->c_vendor; -+ else if (c->cpuid_level >= 0) -+ vendor = c->x86_vendor_id; -+ -+ if (vendor && strncmp(c->x86_model_id, vendor, strlen(vendor))) -+ printk("%s ", vendor); -+ -+ if (!c->x86_model_id[0]) -+ printk("%d86", c->x86); -+ else -+ printk("%s", c->x86_model_id); -+ -+ if (c->x86_mask || c->cpuid_level >= 0) -+ printk(" stepping %02x\n", c->x86_mask); -+ else -+ printk("\n"); -+} -+ -+cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE; -+ -+/* This is hacky. :) -+ * We're emulating future behavior. -+ * In the future, the cpu-specific init functions will be called implicitly -+ * via the magic of initcalls. -+ * They will insert themselves into the cpu_devs structure. -+ * Then, when cpu_init() is called, we can just iterate over that array. -+ */ -+ -+extern int intel_cpu_init(void); -+extern int cyrix_init_cpu(void); -+extern int nsc_init_cpu(void); -+extern int amd_init_cpu(void); -+extern int centaur_init_cpu(void); -+extern int transmeta_init_cpu(void); -+extern int rise_init_cpu(void); -+extern int nexgen_init_cpu(void); -+extern int umc_init_cpu(void); -+ -+void __init early_cpu_init(void) -+{ -+ intel_cpu_init(); -+ cyrix_init_cpu(); -+ nsc_init_cpu(); -+ amd_init_cpu(); -+ centaur_init_cpu(); -+ transmeta_init_cpu(); -+ rise_init_cpu(); -+ nexgen_init_cpu(); -+ umc_init_cpu(); -+ early_cpu_detect(); -+ -+#ifdef CONFIG_DEBUG_PAGEALLOC -+ /* pse is not compatible with on-the-fly unmapping, -+ * disable it even if the cpus claim to support it. -+ */ -+ clear_bit(X86_FEATURE_PSE, boot_cpu_data.x86_capability); -+ disable_pse = 1; -+#endif -+} -+ -+static void __cpuinit cpu_gdt_init(const struct Xgt_desc_struct *gdt_descr) -+{ -+ unsigned long frames[16]; -+ unsigned long va; -+ int f; -+ -+ for (va = gdt_descr->address, f = 0; -+ va < gdt_descr->address + gdt_descr->size; -+ va += PAGE_SIZE, f++) { -+ frames[f] = virt_to_mfn(va); -+ make_lowmem_page_readonly( -+ (void *)va, XENFEAT_writable_descriptor_tables); -+ } -+ if (HYPERVISOR_set_gdt(frames, (gdt_descr->size + 1) / 8)) -+ BUG(); -+} -+ -+/* -+ * cpu_init() initializes state that is per-CPU. Some data is already -+ * initialized (naturally) in the bootstrap process, such as the GDT -+ * and IDT. We reload them nevertheless, this function acts as a -+ * 'CPU state barrier', nothing should get across. -+ */ -+void __cpuinit cpu_init(void) -+{ -+ int cpu = smp_processor_id(); -+#ifndef CONFIG_X86_NO_TSS -+ struct tss_struct * t = &per_cpu(init_tss, cpu); -+#endif -+ struct thread_struct *thread = ¤t->thread; -+ struct desc_struct *gdt; -+ struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu); -+ -+ if (cpu_test_and_set(cpu, cpu_initialized)) { -+ printk(KERN_WARNING "CPU#%d already initialized!\n", cpu); -+ for (;;) local_irq_enable(); -+ } -+ printk(KERN_INFO "Initializing CPU#%d\n", cpu); -+ -+ if (cpu_has_vme || cpu_has_de) -+ clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE); -+ if (tsc_disable && cpu_has_tsc) { -+ printk(KERN_NOTICE "Disabling TSC...\n"); -+ /**** FIX-HPA: DOES THIS REALLY BELONG HERE? ****/ -+ clear_bit(X86_FEATURE_TSC, boot_cpu_data.x86_capability); -+ set_in_cr4(X86_CR4_TSD); -+ } -+ -+#ifndef CONFIG_XEN -+ /* The CPU hotplug case */ -+ if (cpu_gdt_descr->address) { -+ gdt = (struct desc_struct *)cpu_gdt_descr->address; -+ memset(gdt, 0, PAGE_SIZE); -+ goto old_gdt; -+ } -+ /* -+ * This is a horrible hack to allocate the GDT. The problem -+ * is that cpu_init() is called really early for the boot CPU -+ * (and hence needs bootmem) but much later for the secondary -+ * CPUs, when bootmem will have gone away -+ */ -+ if (NODE_DATA(0)->bdata->node_bootmem_map) { -+ gdt = (struct desc_struct *)alloc_bootmem_pages(PAGE_SIZE); -+ /* alloc_bootmem_pages panics on failure, so no check */ -+ memset(gdt, 0, PAGE_SIZE); -+ } else { -+ gdt = (struct desc_struct *)get_zeroed_page(GFP_KERNEL); -+ if (unlikely(!gdt)) { -+ printk(KERN_CRIT "CPU%d failed to allocate GDT\n", cpu); -+ for (;;) -+ local_irq_enable(); -+ } -+ } -+old_gdt: -+ /* -+ * Initialize the per-CPU GDT with the boot GDT, -+ * and set up the GDT descriptor: -+ */ -+ memcpy(gdt, cpu_gdt_table, GDT_SIZE); -+ -+ /* Set up GDT entry for 16bit stack */ -+ *(__u64 *)(&gdt[GDT_ENTRY_ESPFIX_SS]) |= -+ ((((__u64)stk16_off) << 16) & 0x000000ffffff0000ULL) | -+ ((((__u64)stk16_off) << 32) & 0xff00000000000000ULL) | -+ (CPU_16BIT_STACK_SIZE - 1); -+ -+ cpu_gdt_descr->size = GDT_SIZE - 1; -+ cpu_gdt_descr->address = (unsigned long)gdt; -+#else -+ if (cpu == 0 && cpu_gdt_descr->address == 0) { -+ gdt = (struct desc_struct *)alloc_bootmem_pages(PAGE_SIZE); -+ /* alloc_bootmem_pages panics on failure, so no check */ -+ memset(gdt, 0, PAGE_SIZE); -+ -+ memcpy(gdt, cpu_gdt_table, GDT_SIZE); -+ -+ cpu_gdt_descr->size = GDT_SIZE; -+ cpu_gdt_descr->address = (unsigned long)gdt; -+ } -+#endif -+ -+ cpu_gdt_init(cpu_gdt_descr); -+ -+ /* -+ * Set up and load the per-CPU TSS and LDT -+ */ -+ atomic_inc(&init_mm.mm_count); -+ current->active_mm = &init_mm; -+ if (current->mm) -+ BUG(); -+ enter_lazy_tlb(&init_mm, current); -+ -+ load_esp0(t, thread); -+ -+ load_LDT(&init_mm.context); -+ -+#ifdef CONFIG_DOUBLEFAULT -+ /* Set up doublefault TSS pointer in the GDT */ -+ __set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS, &doublefault_tss); -+#endif -+ -+ /* Clear %fs and %gs. */ -+ asm volatile ("xorl %eax, %eax; movl %eax, %fs; movl %eax, %gs"); -+ -+ /* Clear all 6 debug registers: */ -+ set_debugreg(0, 0); -+ set_debugreg(0, 1); -+ set_debugreg(0, 2); -+ set_debugreg(0, 3); -+ set_debugreg(0, 6); -+ set_debugreg(0, 7); -+ -+ /* -+ * Force FPU initialization: -+ */ -+ current_thread_info()->status = 0; -+ clear_used_math(); -+ mxcsr_feature_mask_init(); -+} -+ -+#ifdef CONFIG_HOTPLUG_CPU -+void __cpuinit cpu_uninit(void) -+{ -+ int cpu = raw_smp_processor_id(); -+ cpu_clear(cpu, cpu_initialized); -+ -+ /* lazy TLB state */ -+ per_cpu(cpu_tlbstate, cpu).state = 0; -+ per_cpu(cpu_tlbstate, cpu).active_mm = &init_mm; -+} -+#endif -diff -rpuN linux-2.6.18.8/arch/i386/kernel/cpu/cpufreq/powernow-k8.c linux-2.6.18-xen-3.2.0/arch/i386/kernel/cpu/cpufreq/powernow-k8.c ---- linux-2.6.18.8/arch/i386/kernel/cpu/cpufreq/powernow-k8.c 2007-02-23 15:52:30.000000000 -0800 -+++ linux-2.6.18-xen-3.2.0/arch/i386/kernel/cpu/cpufreq/powernow-k8.c 2008-02-15 16:21:49.000000000 -0800 -@@ -46,7 +46,7 @@ - - #define PFX "powernow-k8: " - #define BFX PFX "BIOS error: " --#define VERSION "version 2.00.00" -+#define VERSION "version 2.20.00" - #include "powernow-k8.h" - - /* serialize freq changes */ -@@ -66,36 +66,15 @@ static u32 find_freq_from_fid(u32 fid) - return 800 + (fid * 100); - } - -- - /* Return a frequency in KHz, given an input fid */ - static u32 find_khz_freq_from_fid(u32 fid) - { - return 1000 * find_freq_from_fid(fid); - } - --/* Return a frequency in MHz, given an input fid and did */ --static u32 find_freq_from_fiddid(u32 fid, u32 did) --{ -- return 100 * (fid + 0x10) >> did; --} -- --static u32 find_khz_freq_from_fiddid(u32 fid, u32 did) --{ -- return 1000 * find_freq_from_fiddid(fid, did); --} -- --static u32 find_fid_from_pstate(u32 pstate) --{ -- u32 hi, lo; -- rdmsr(MSR_PSTATE_DEF_BASE + pstate, lo, hi); -- return lo & HW_PSTATE_FID_MASK; --} -- --static u32 find_did_from_pstate(u32 pstate) -+static u32 find_khz_freq_from_pstate(struct cpufreq_frequency_table *data, u32 pstate) - { -- u32 hi, lo; -- rdmsr(MSR_PSTATE_DEF_BASE + pstate, lo, hi); -- return (lo & HW_PSTATE_DID_MASK) >> HW_PSTATE_DID_SHIFT; -+ return data[pstate].frequency; - } - - /* Return the vco fid for an input fid -@@ -139,9 +118,7 @@ static int query_current_values_with_pen - if (cpu_family == CPU_HW_PSTATE) { - rdmsr(MSR_PSTATE_STATUS, lo, hi); - i = lo & HW_PSTATE_MASK; -- rdmsr(MSR_PSTATE_DEF_BASE + i, lo, hi); -- data->currfid = lo & HW_PSTATE_FID_MASK; -- data->currdid = (lo & HW_PSTATE_DID_MASK) >> HW_PSTATE_DID_SHIFT; -+ data->currpstate = i; - return 0; - } - do { -@@ -292,7 +269,7 @@ static int decrease_vid_code_by_step(str - static int transition_pstate(struct powernow_k8_data *data, u32 pstate) - { - wrmsr(MSR_PSTATE_CTRL, pstate, 0); -- data->currfid = find_fid_from_pstate(pstate); -+ data->currpstate = pstate; - return 0; - } - -@@ -738,6 +715,7 @@ static int find_psb_table(struct powerno - - data->numps = psb->numps; - dprintk("numpstates: 0x%x\n", data->numps); -+ data->starting_core_affinity = cpumask_of_cpu(0); - return fill_powernow_table(data, (struct pst_s *)(psb+1), maxvid); - } - /* -@@ -758,15 +736,43 @@ static int find_psb_table(struct powerno - #ifdef CONFIG_X86_POWERNOW_K8_ACPI - static void powernow_k8_acpi_pst_values(struct powernow_k8_data *data, unsigned int index) - { -- if (!data->acpi_data.state_count || (cpu_family == CPU_HW_PSTATE)) -+ if (!data->acpi_data->state_count || (cpu_family == CPU_HW_PSTATE)) - return; - -- data->irt = (data->acpi_data.states[index].control >> IRT_SHIFT) & IRT_MASK; -- data->rvo = (data->acpi_data.states[index].control >> RVO_SHIFT) & RVO_MASK; -- data->exttype = (data->acpi_data.states[index].control >> EXT_TYPE_SHIFT) & EXT_TYPE_MASK; -- data->plllock = (data->acpi_data.states[index].control >> PLL_L_SHIFT) & PLL_L_MASK; -- data->vidmvs = 1 << ((data->acpi_data.states[index].control >> MVS_SHIFT) & MVS_MASK); -- data->vstable = (data->acpi_data.states[index].control >> VST_SHIFT) & VST_MASK; -+ data->irt = (data->acpi_data->states[index].control >> IRT_SHIFT) & IRT_MASK; -+ data->rvo = (data->acpi_data->states[index].control >> RVO_SHIFT) & RVO_MASK; -+ data->exttype = (data->acpi_data->states[index].control >> EXT_TYPE_SHIFT) & EXT_TYPE_MASK; -+ data->plllock = (data->acpi_data->states[index].control >> PLL_L_SHIFT) & PLL_L_MASK; -+ data->vidmvs = 1 << ((data->acpi_data->states[index].control >> MVS_SHIFT) & MVS_MASK); -+ data->vstable = (data->acpi_data->states[index].control >> VST_SHIFT) & VST_MASK; -+} -+ -+static struct acpi_processor_performance *acpi_perf_data[NR_CPUS]; -+static int preregister_valid = 0; -+ -+static int powernow_k8_cpu_preinit_acpi() -+{ -+ int i; -+ struct acpi_processor_performance *data; -+ for_each_possible_cpu(i) { -+ data = kzalloc(sizeof(struct acpi_processor_performance), -+ GFP_KERNEL); -+ if (!data) { -+ int j; -+ for_each_possible_cpu(j) { -+ kfree(acpi_perf_data[j]); -+ acpi_perf_data[j] = NULL; -+ } -+ return -ENODEV; -+ } -+ acpi_perf_data[i] = data; -+ } -+ -+ if (acpi_processor_preregister_performance(acpi_perf_data)) -+ return -ENODEV; -+ else -+ preregister_valid = 1; -+ return 0; - } - - static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data) -@@ -774,28 +780,29 @@ static int powernow_k8_cpu_init_acpi(str - struct cpufreq_frequency_table *powernow_table; - int ret_val; - -- if (acpi_processor_register_performance(&data->acpi_data, data->cpu)) { -+ data->acpi_data = acpi_perf_data[data->cpu]; -+ if (acpi_processor_register_performance(data->acpi_data, data->cpu)) { - dprintk("register performance failed: bad ACPI data\n"); - return -EIO; - } - - /* verify the data contained in the ACPI structures */ -- if (data->acpi_data.state_count <= 1) { -+ if (data->acpi_data->state_count <= 1) { - dprintk("No ACPI P-States\n"); - goto err_out; - } - -- if ((data->acpi_data.control_register.space_id != ACPI_ADR_SPACE_FIXED_HARDWARE) || -- (data->acpi_data.status_register.space_id != ACPI_ADR_SPACE_FIXED_HARDWARE)) { -+ if ((data->acpi_data->control_register.space_id != ACPI_ADR_SPACE_FIXED_HARDWARE) || -+ (data->acpi_data->status_register.space_id != ACPI_ADR_SPACE_FIXED_HARDWARE)) { - dprintk("Invalid control/status registers (%x - %x)\n", -- data->acpi_data.control_register.space_id, -- data->acpi_data.status_register.space_id); -+ data->acpi_data->control_register.space_id, -+ data->acpi_data->status_register.space_id); - goto err_out; - } - - /* fill in data->powernow_table */ - powernow_table = kmalloc((sizeof(struct cpufreq_frequency_table) -- * (data->acpi_data.state_count + 1)), GFP_KERNEL); -+ * (data->acpi_data->state_count + 1)), GFP_KERNEL); - if (!powernow_table) { - dprintk("powernow_table memory alloc failure\n"); - goto err_out; -@@ -808,28 +815,43 @@ static int powernow_k8_cpu_init_acpi(str - if (ret_val) - goto err_out_mem; - -- powernow_table[data->acpi_data.state_count].frequency = CPUFREQ_TABLE_END; -- powernow_table[data->acpi_data.state_count].index = 0; -+ powernow_table[data->acpi_data->state_count].frequency = CPUFREQ_TABLE_END; -+ powernow_table[data->acpi_data->state_count].index = 0; - data->powernow_table = powernow_table; - - /* fill in data */ -- data->numps = data->acpi_data.state_count; -+ data->numps = data->acpi_data->state_count; - print_basics(data); - powernow_k8_acpi_pst_values(data, 0); - - /* notify BIOS that we exist */ - acpi_processor_notify_smm(THIS_MODULE); - -+ /* determine affinity, from ACPI if available */ -+ if (preregister_valid) { -+ if ((data->acpi_data->shared_type == CPUFREQ_SHARED_TYPE_ALL) || -+ (data->acpi_data->shared_type == CPUFREQ_SHARED_TYPE_ANY)) -+ data->starting_core_affinity = data->acpi_data->shared_cpu_map; -+ else -+ data->starting_core_affinity = cpumask_of_cpu(data->cpu); -+ } else { -+ /* best guess from family if not */ -+ if (cpu_family == CPU_HW_PSTATE) -+ data->starting_core_affinity = cpumask_of_cpu(data->cpu); -+ else -+ data->starting_core_affinity = cpu_core_map[data->cpu]; -+ } -+ - return 0; - - err_out_mem: - kfree(powernow_table); - - err_out: -- acpi_processor_unregister_performance(&data->acpi_data, data->cpu); -+ acpi_processor_unregister_performance(data->acpi_data, data->cpu); - -- /* data->acpi_data.state_count informs us at ->exit() whether ACPI was used */ -- data->acpi_data.state_count = 0; -+ /* data->acpi_data->state_count informs us at ->exit() whether ACPI was used */ -+ data->acpi_data->state_count = 0; - - return -ENODEV; - } -@@ -837,41 +859,23 @@ err_out: - static int fill_powernow_table_pstate(struct powernow_k8_data *data, struct cpufreq_frequency_table *powernow_table) - { - int i; -+ u32 hi = 0, lo = 0; -+ rdmsr(MSR_PSTATE_CUR_LIMIT, hi, lo); -+ data->max_hw_pstate = (hi & HW_PSTATE_MAX_MASK) >> HW_PSTATE_MAX_SHIFT; - -- for (i = 0; i < data->acpi_data.state_count; i++) { -+ for (i = 0; i < data->acpi_data->state_count; i++) { - u32 index; -- u32 hi = 0, lo = 0; -- u32 fid; -- u32 did; - -- index = data->acpi_data.states[i].control & HW_PSTATE_MASK; -- if (index > MAX_HW_PSTATE) { -+ index = data->acpi_data->states[i].control & HW_PSTATE_MASK; -+ if (index > data->max_hw_pstate) { - printk(KERN_ERR PFX "invalid pstate %d - bad value %d.\n", i, index); - printk(KERN_ERR PFX "Please report to BIOS manufacturer\n"); -- } -- rdmsr(MSR_PSTATE_DEF_BASE + index, lo, hi); -- if (!(hi & HW_PSTATE_VALID_MASK)) { -- dprintk("invalid pstate %d, ignoring\n", index); -- powernow_table[i].frequency = CPUFREQ_ENTRY_INVALID; - continue; - } - -- fid = lo & HW_PSTATE_FID_MASK; -- did = (lo & HW_PSTATE_DID_MASK) >> HW_PSTATE_DID_SHIFT; -- -- dprintk(" %d : fid 0x%x, did 0x%x\n", index, fid, did); -+ powernow_table[i].index = index; -+ powernow_table[i].frequency = data->acpi_data->states[i].core_frequency * 1000; - -- powernow_table[i].index = index | (fid << HW_FID_INDEX_SHIFT) | (did << HW_DID_INDEX_SHIFT); -- -- powernow_table[i].frequency = find_khz_freq_from_fiddid(fid, did); -- -- if (powernow_table[i].frequency != (data->acpi_data.states[i].core_frequency * 1000)) { -- printk(KERN_INFO PFX "invalid freq entries %u kHz vs. %u kHz\n", -- powernow_table[i].frequency, -- (unsigned int) (data->acpi_data.states[i].core_frequency * 1000)); -- powernow_table[i].frequency = CPUFREQ_ENTRY_INVALID; -- continue; -- } - } - return 0; - } -@@ -880,16 +884,16 @@ static int fill_powernow_table_fidvid(st - { - int i; - int cntlofreq = 0; -- for (i = 0; i < data->acpi_data.state_count; i++) { -+ for (i = 0; i < data->acpi_data->state_count; i++) { - u32 fid; - u32 vid; - - if (data->exttype) { -- fid = data->acpi_data.states[i].status & EXT_FID_MASK; -- vid = (data->acpi_data.states[i].status >> VID_SHIFT) & EXT_VID_MASK; -+ fid = data->acpi_data->states[i].status & EXT_FID_MASK; -+ vid = (data->acpi_data->states[i].status >> VID_SHIFT) & EXT_VID_MASK; - } else { -- fid = data->acpi_data.states[i].control & FID_MASK; -- vid = (data->acpi_data.states[i].control >> VID_SHIFT) & VID_MASK; -+ fid = data->acpi_data->states[i].control & FID_MASK; -+ vid = (data->acpi_data->states[i].control >> VID_SHIFT) & VID_MASK; - } - - dprintk(" %d : fid 0x%x, vid 0x%x\n", i, fid, vid); -@@ -930,10 +934,10 @@ static int fill_powernow_table_fidvid(st - cntlofreq = i; - } - -- if (powernow_table[i].frequency != (data->acpi_data.states[i].core_frequency * 1000)) { -+ if (powernow_table[i].frequency != (data->acpi_data->states[i].core_frequency * 1000)) { - printk(KERN_INFO PFX "invalid freq entries %u kHz vs. %u kHz\n", - powernow_table[i].frequency, -- (unsigned int) (data->acpi_data.states[i].core_frequency * 1000)); -+ (unsigned int) (data->acpi_data->states[i].core_frequency * 1000)); - powernow_table[i].frequency = CPUFREQ_ENTRY_INVALID; - continue; - } -@@ -943,14 +947,15 @@ static int fill_powernow_table_fidvid(st - - static void powernow_k8_cpu_exit_acpi(struct powernow_k8_data *data) - { -- if (data->acpi_data.state_count) -- acpi_processor_unregister_performance(&data->acpi_data, data->cpu); -+ if (data->acpi_data->state_count) -+ acpi_processor_unregister_performance(data->acpi_data, data->cpu); - } - - #else - static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data) { return -ENODEV; } - static void powernow_k8_cpu_exit_acpi(struct powernow_k8_data *data) { return; } - static void powernow_k8_acpi_pst_values(struct powernow_k8_data *data, unsigned int index) { return; } -+static int powernow_k8_cpu_preinit_acpi() { return -ENODEV; } - #endif /* CONFIG_X86_POWERNOW_K8_ACPI */ - - /* Take a frequency, and issue the fid/vid transition command */ -@@ -1012,22 +1017,18 @@ static int transition_frequency_fidvid(s - /* Take a frequency, and issue the hardware pstate transition command */ - static int transition_frequency_pstate(struct powernow_k8_data *data, unsigned int index) - { -- u32 fid = 0; -- u32 did = 0; - u32 pstate = 0; - int res, i; - struct cpufreq_freqs freqs; - - dprintk("cpu %d transition to index %u\n", smp_processor_id(), index); - -- /* get fid did for hardware pstate transition */ -+ /* get MSR index for hardware pstate transition */ - pstate = index & HW_PSTATE_MASK; -- if (pstate > MAX_HW_PSTATE) -+ if (pstate > data->max_hw_pstate) - return 0; -- fid = (index & HW_FID_INDEX_MASK) >> HW_FID_INDEX_SHIFT; -- did = (index & HW_DID_INDEX_MASK) >> HW_DID_INDEX_SHIFT; -- freqs.old = find_khz_freq_from_fiddid(data->currfid, data->currdid); -- freqs.new = find_khz_freq_from_fiddid(fid, did); -+ freqs.old = find_khz_freq_from_pstate(data->powernow_table, data->currpstate); -+ freqs.new = find_khz_freq_from_pstate(data->powernow_table, pstate); - - for_each_cpu_mask(i, *(data->available_cores)) { - freqs.cpu = i; -@@ -1035,9 +1036,7 @@ static int transition_frequency_pstate(s - } - - res = transition_pstate(data, pstate); -- data->currfid = find_fid_from_pstate(pstate); -- data->currdid = find_did_from_pstate(pstate); -- freqs.new = find_khz_freq_from_fiddid(data->currfid, data->currdid); -+ freqs.new = find_khz_freq_from_pstate(data->powernow_table, pstate); - - for_each_cpu_mask(i, *(data->available_cores)) { - freqs.cpu = i; -@@ -1082,10 +1081,7 @@ static int powernowk8_target(struct cpuf - if (query_current_values_with_pending_wait(data)) - goto err_out; - -- if (cpu_family == CPU_HW_PSTATE) -- dprintk("targ: curr fid 0x%x, did 0x%x\n", -- data->currfid, data->currvid); -- else { -+ if (cpu_family != CPU_HW_PSTATE) { - dprintk("targ: curr fid 0x%x, vid 0x%x\n", - data->currfid, data->currvid); - -@@ -1116,7 +1112,7 @@ static int powernowk8_target(struct cpuf - mutex_unlock(&fidvid_mutex); - - if (cpu_family == CPU_HW_PSTATE) -- pol->cur = find_khz_freq_from_fiddid(data->currfid, data->currdid); -+ pol->cur = find_khz_freq_from_pstate(data->powernow_table, newstate); - else - pol->cur = find_khz_freq_from_fid(data->currfid); - ret = 0; -@@ -1164,7 +1160,7 @@ static int __cpuinit powernowk8_cpu_init - * an UP version, and is deprecated by AMD. - */ - if (num_online_cpus() != 1) { -- printk(KERN_ERR PFX "MP systems not supported by PSB BIOS structure\n"); -+ printk(KERN_ERR PFX "Your BIOS does not provide _PSS objects. PowerNow! does not work on SMP systems without _PSS objects. Complain to your BIOS vendor.\n"); - kfree(data); - return -ENODEV; - } -@@ -1204,10 +1200,7 @@ static int __cpuinit powernowk8_cpu_init - set_cpus_allowed(current, oldmask); - - pol->governor = CPUFREQ_DEFAULT_GOVERNOR; -- if (cpu_family == CPU_HW_PSTATE) -- pol->cpus = cpumask_of_cpu(pol->cpu); -- else -- pol->cpus = cpu_core_map[pol->cpu]; -+ pol->cpus = data->starting_core_affinity; - data->available_cores = &(pol->cpus); - - /* Take a crude guess here. -@@ -1216,7 +1209,7 @@ static int __cpuinit powernowk8_cpu_init - + (3 * (1 << data->irt) * 10)) * 1000; - - if (cpu_family == CPU_HW_PSTATE) -- pol->cur = find_khz_freq_from_fiddid(data->currfid, data->currdid); -+ pol->cur = find_khz_freq_from_pstate(data->powernow_table, data->currpstate); - else - pol->cur = find_khz_freq_from_fid(data->currfid); - dprintk("policy current frequency %d kHz\n", pol->cur); -@@ -1233,8 +1226,7 @@ static int __cpuinit powernowk8_cpu_init - cpufreq_frequency_table_get_attr(data->powernow_table, pol->cpu); - - if (cpu_family == CPU_HW_PSTATE) -- dprintk("cpu_init done, current fid 0x%x, did 0x%x\n", -- data->currfid, data->currdid); -+ dprintk("cpu_init done, current pstate 0x%x\n", data->currpstate); - else - dprintk("cpu_init done, current fid 0x%x, vid 0x%x\n", - data->currfid, data->currvid); -@@ -1289,7 +1281,10 @@ static unsigned int powernowk8_get (unsi - if (query_current_values_with_pending_wait(data)) - goto out; - -- khz = find_khz_freq_from_fid(data->currfid); -+ if (cpu_family == CPU_HW_PSTATE) -+ khz = find_khz_freq_from_pstate(data->powernow_table, data->currpstate); -+ else -+ khz = find_khz_freq_from_fid(data->currfid); - - out: - set_cpus_allowed(current, oldmask); -@@ -1323,6 +1318,7 @@ static int __cpuinit powernowk8_init(voi - } - - if (supported_cpus == num_online_cpus()) { -+ powernow_k8_cpu_preinit_acpi(); - printk(KERN_INFO PFX "Found %d %s " - "processors (" VERSION ")\n", supported_cpus, - boot_cpu_data.x86_model_id); -diff -rpuN linux-2.6.18.8/arch/i386/kernel/cpu/cpufreq/powernow-k8.h linux-2.6.18-xen-3.2.0/arch/i386/kernel/cpu/cpufreq/powernow-k8.h ---- linux-2.6.18.8/arch/i386/kernel/cpu/cpufreq/powernow-k8.h 2007-02-23 15:52:30.000000000 -0800 -+++ linux-2.6.18-xen-3.2.0/arch/i386/kernel/cpu/cpufreq/powernow-k8.h 2008-02-15 16:21:49.000000000 -0800 -@@ -1,5 +1,5 @@ - /* -- * (c) 2003-2006 Advanced Micro Devices, Inc. -+#* (c) 2003-2006 Advanced Micro Devices, Inc. - * Your use of this code is subject to the terms and conditions of the - * GNU general public license version 2. See "COPYING" or - * http://www.gnu.org/licenses/gpl.html -@@ -10,6 +10,7 @@ struct powernow_k8_data { - - u32 numps; /* number of p-states */ - u32 batps; /* number of p-states supported on battery */ -+ u32 max_hw_pstate; /* maximum legal hardware pstate */ - - /* these values are constant when the PSB is used to determine - * vid/fid pairings, but are modified during the ->target() call -@@ -21,8 +22,8 @@ struct powernow_k8_data { - u32 plllock; /* pll lock time, units 1 us */ - u32 exttype; /* extended interface = 1 */ - -- /* keep track of the current fid / vid or did */ -- u32 currvid, currfid, currdid; -+ /* keep track of the current fid / vid or pstate */ -+ u32 currvid, currfid, currpstate; - - /* the powernow_table includes all frequency and vid/fid pairings: - * fid are the lower 8 bits of the index, vid are the upper 8 bits. -@@ -32,12 +33,13 @@ struct powernow_k8_data { - #ifdef CONFIG_X86_POWERNOW_K8_ACPI - /* the acpi table needs to be kept. it's only available if ACPI was - * used to determine valid frequency/vid/fid states */ -- struct acpi_processor_performance acpi_data; -+ struct acpi_processor_performance *acpi_data; - #endif - /* we need to keep track of associated cores, but let cpufreq - * handle hotplug events - so just point at cpufreq pol->cpus - * structure */ - cpumask_t *available_cores; -+ cpumask_t starting_core_affinity; - }; - - -@@ -87,23 +89,14 @@ struct powernow_k8_data { - - /* Hardware Pstate _PSS and MSR definitions */ - #define USE_HW_PSTATE 0x00000080 --#define HW_PSTATE_FID_MASK 0x0000003f --#define HW_PSTATE_DID_MASK 0x000001c0 --#define HW_PSTATE_DID_SHIFT 6 --#define HW_PSTATE_MASK 0x00000007 --#define HW_PSTATE_VALID_MASK 0x80000000 --#define HW_FID_INDEX_SHIFT 8 --#define HW_FID_INDEX_MASK 0x0000ff00 --#define HW_DID_INDEX_SHIFT 16 --#define HW_DID_INDEX_MASK 0x00ff0000 --#define HW_WATTS_MASK 0xff --#define HW_PWR_DVR_MASK 0x300 --#define HW_PWR_DVR_SHIFT 8 --#define HW_PWR_MAX_MULT 3 --#define MAX_HW_PSTATE 8 /* hw pstate supports up to 8 */ -+#define HW_PSTATE_MASK 0x00000007 -+#define HW_PSTATE_VALID_MASK 0x80000000 -+#define HW_PSTATE_MAX_MASK 0x000000f0 -+#define HW_PSTATE_MAX_SHIFT 4 - #define MSR_PSTATE_DEF_BASE 0xc0010064 /* base of Pstate MSRs */ - #define MSR_PSTATE_STATUS 0xc0010063 /* Pstate Status MSR */ - #define MSR_PSTATE_CTRL 0xc0010062 /* Pstate control MSR */ -+#define MSR_PSTATE_CUR_LIMIT 0xc0010061 /* pstate current limit MSR */ - - /* define the two driver architectures */ - #define CPU_OPTERON 0 -diff -rpuN linux-2.6.18.8/arch/i386/kernel/cpu/mtrr/Makefile linux-2.6.18-xen-3.2.0/arch/i386/kernel/cpu/mtrr/Makefile ---- linux-2.6.18.8/arch/i386/kernel/cpu/mtrr/Makefile 2007-02-23 15:52:30.000000000 -0800 -+++ linux-2.6.18-xen-3.2.0/arch/i386/kernel/cpu/mtrr/Makefile 2008-02-15 16:21:49.000000000 -0800 -@@ -3,3 +3,4 @@ obj-y += amd.o - obj-y += cyrix.o - obj-y += centaur.o - -+obj-$(CONFIG_XEN) := main.o if.o -diff -rpuN linux-2.6.18.8/arch/i386/kernel/cpu/mtrr/main-xen.c linux-2.6.18-xen-3.2.0/arch/i386/kernel/cpu/mtrr/main-xen.c ---- linux-2.6.18.8/arch/i386/kernel/cpu/mtrr/main-xen.c 1969-12-31 16:00:00.000000000 -0800 -+++ linux-2.6.18-xen-3.2.0/arch/i386/kernel/cpu/mtrr/main-xen.c 2008-02-15 16:21:49.000000000 -0800 -@@ -0,0 +1,198 @@ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#include -+#include "mtrr.h" -+ -+static DEFINE_MUTEX(mtrr_mutex); -+ -+void generic_get_mtrr(unsigned int reg, unsigned long *base, -+ unsigned int *size, mtrr_type * type) -+{ -+ struct xen_platform_op op; -+ -+ op.cmd = XENPF_read_memtype; -+ op.u.read_memtype.reg = reg; -+ if (unlikely(HYPERVISOR_platform_op(&op))) -+ memset(&op.u.read_memtype, 0, sizeof(op.u.read_memtype)); -+ -+ *size = op.u.read_memtype.nr_mfns; -+ *base = op.u.read_memtype.mfn; -+ *type = op.u.read_memtype.type; -+} -+ -+struct mtrr_ops generic_mtrr_ops = { -+ .use_intel_if = 1, -+ .get = generic_get_mtrr, -+}; -+ -+struct mtrr_ops *mtrr_if = &generic_mtrr_ops; -+unsigned int num_var_ranges; -+unsigned int *usage_table; -+ -+static void __init set_num_var_ranges(void) -+{ -+ struct xen_platform_op op; -+ -+ for (num_var_ranges = 0; ; num_var_ranges++) { -+ op.cmd = XENPF_read_memtype; -+ op.u.read_memtype.reg = num_var_ranges; -+ if (HYPERVISOR_platform_op(&op) != 0) -+ break; -+ } -+} -+ -+static void __init init_table(void) -+{ -+ int i, max; -+ -+ max = num_var_ranges; -+ if ((usage_table = kmalloc(max * sizeof *usage_table, GFP_KERNEL)) -+ == NULL) { -+ printk(KERN_ERR "mtrr: could not allocate\n"); -+ return; -+ } -+ for (i = 0; i < max; i++) -+ usage_table[i] = 0; -+} -+ -+int mtrr_add_page(unsigned long base, unsigned long size, -+ unsigned int type, char increment) -+{ -+ int error; -+ struct xen_platform_op op; -+ -+ mutex_lock(&mtrr_mutex); -+ -+ op.cmd = XENPF_add_memtype; -+ op.u.add_memtype.mfn = base; -+ op.u.add_memtype.nr_mfns = size; -+ op.u.add_memtype.type = type; -+ error = HYPERVISOR_platform_op(&op); -+ if (error) { -+ mutex_unlock(&mtrr_mutex); -+ BUG_ON(error > 0); -+ return error; -+ } -+ -+ if (increment) -+ ++usage_table[op.u.add_memtype.reg]; -+ -+ mutex_unlock(&mtrr_mutex); -+ -+ return op.u.add_memtype.reg; -+} -+ -+static int mtrr_check(unsigned long base, unsigned long size) -+{ -+ if ((base & (PAGE_SIZE - 1)) || (size & (PAGE_SIZE - 1))) { -+ printk(KERN_WARNING -+ "mtrr: size and base must be multiples of 4 kiB\n"); -+ printk(KERN_DEBUG -+ "mtrr: size: 0x%lx base: 0x%lx\n", size, base); -+ dump_stack(); -+ return -1; -+ } -+ return 0; -+} -+ -+int -+mtrr_add(unsigned long base, unsigned long size, unsigned int type, -+ char increment) -+{ -+ if (mtrr_check(base, size)) -+ return -EINVAL; -+ return mtrr_add_page(base >> PAGE_SHIFT, size >> PAGE_SHIFT, type, -+ increment); -+} -+ -+int mtrr_del_page(int reg, unsigned long base, unsigned long size) -+{ -+ unsigned i; -+ mtrr_type ltype; -+ unsigned long lbase; -+ unsigned int lsize; -+ int error = -EINVAL; -+ struct xen_platform_op op; -+ -+ mutex_lock(&mtrr_mutex); -+ -+ if (reg < 0) { -+ /* Search for existing MTRR */ -+ for (i = 0; i < num_var_ranges; ++i) { -+ mtrr_if->get(i, &lbase, &lsize, <ype); -+ if (lbase == base && lsize == size) { -+ reg = i; -+ break; -+ } -+ } -+ if (reg < 0) { -+ printk(KERN_DEBUG "mtrr: no MTRR for %lx000,%lx000 found\n", base, -+ size); -+ goto out; -+ } -+ } -+ if (usage_table[reg] < 1) { -+ printk(KERN_WARNING "mtrr: reg: %d has count=0\n", reg); -+ goto out; -+ } -+ if (--usage_table[reg] < 1) { -+ op.cmd = XENPF_del_memtype; -+ op.u.del_memtype.handle = 0; -+ op.u.del_memtype.reg = reg; -+ error = HYPERVISOR_platform_op(&op); -+ if (error) { -+ BUG_ON(error > 0); -+ goto out; -+ } -+ } -+ error = reg; -+ out: -+ mutex_unlock(&mtrr_mutex); -+ return error; -+} -+ -+int -+mtrr_del(int reg, unsigned long base, unsigned long size) -+{ -+ if (mtrr_check(base, size)) -+ return -EINVAL; -+ return mtrr_del_page(reg, base >> PAGE_SHIFT, size >> PAGE_SHIFT); -+} -+ -+EXPORT_SYMBOL(mtrr_add); -+EXPORT_SYMBOL(mtrr_del); -+ -+void __init mtrr_bp_init(void) -+{ -+} -+ -+void mtrr_ap_init(void) -+{ -+} -+ -+static int __init mtrr_init(void) -+{ -+ struct cpuinfo_x86 *c = &boot_cpu_data; -+ -+ if (!is_initial_xendomain()) -+ return -ENODEV; -+ -+ if ((!cpu_has(c, X86_FEATURE_MTRR)) && -+ (!cpu_has(c, X86_FEATURE_K6_MTRR)) && -+ (!cpu_has(c, X86_FEATURE_CYRIX_ARR)) && -+ (!cpu_has(c, X86_FEATURE_CENTAUR_MCR))) -+ return -ENODEV; -+ -+ set_num_var_ranges(); -+ init_table(); -+ -+ return 0; -+} -+ -+subsys_initcall(mtrr_init); -diff -rpuN linux-2.6.18.8/arch/i386/kernel/crash.c linux-2.6.18-xen-3.2.0/arch/i386/kernel/crash.c ---- linux-2.6.18.8/arch/i386/kernel/crash.c 2007-02-23 15:52:30.000000000 -0800 -+++ linux-2.6.18-xen-3.2.0/arch/i386/kernel/crash.c 2008-02-15 16:21:49.000000000 -0800 -@@ -90,6 +90,7 @@ static void crash_save_self(struct pt_re - crash_save_this_cpu(regs, cpu); - } - -+#ifndef CONFIG_XEN - #if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC) - static atomic_t waiting_for_crash_ipi; - -@@ -154,6 +155,7 @@ static void nmi_shootdown_cpus(void) - /* There are no cpus to shootdown */ - } - #endif -+#endif /* CONFIG_XEN */ - - void machine_crash_shutdown(struct pt_regs *regs) - { -@@ -170,10 +172,12 @@ void machine_crash_shutdown(struct pt_re - - /* Make a note of crashing cpu. Will be used in NMI callback.*/ - crashing_cpu = smp_processor_id(); -+#ifndef CONFIG_XEN - nmi_shootdown_cpus(); - lapic_shutdown(); - #if defined(CONFIG_X86_IO_APIC) - disable_IO_APIC(); - #endif -+#endif /* CONFIG_XEN */ - crash_save_self(regs); - } -diff -rpuN linux-2.6.18.8/arch/i386/kernel/early_printk-xen.c linux-2.6.18-xen-3.2.0/arch/i386/kernel/early_printk-xen.c ---- linux-2.6.18.8/arch/i386/kernel/early_printk-xen.c 1969-12-31 16:00:00.000000000 -0800 -+++ linux-2.6.18-xen-3.2.0/arch/i386/kernel/early_printk-xen.c 2008-02-15 16:21:49.000000000 -0800 -@@ -0,0 +1,2 @@ -+ -+#include "../../x86_64/kernel/early_printk-xen.c" -diff -rpuN linux-2.6.18.8/arch/i386/kernel/entry-xen.S linux-2.6.18-xen-3.2.0/arch/i386/kernel/entry-xen.S ---- linux-2.6.18.8/arch/i386/kernel/entry-xen.S 1969-12-31 16:00:00.000000000 -0800 -+++ linux-2.6.18-xen-3.2.0/arch/i386/kernel/entry-xen.S 2008-02-15 16:21:49.000000000 -0800 -@@ -0,0 +1,1238 @@ -+/* -+ * linux/arch/i386/entry.S -+ * -+ * Copyright (C) 1991, 1992 Linus Torvalds -+ */ -+ -+/* -+ * entry.S contains the system-call and fault low-level handling routines. -+ * This also contains the timer-interrupt handler, as well as all interrupts -+ * and faults that can result in a task-switch. -+ * -+ * NOTE: This code handles signal-recognition, which happens every time -+ * after a timer-interrupt and after each system call. -+ * -+ * I changed all the .align's to 4 (16 byte alignment), as that's faster -+ * on a 486. -+ * -+ * Stack layout in 'ret_from_system_call': -+ * ptrace needs to have all regs on the stack. -+ * if the order here is changed, it needs to be -+ * updated in fork.c:copy_process, signal.c:do_signal, -+ * ptrace.c and ptrace.h -+ * -+ * 0(%esp) - %ebx -+ * 4(%esp) - %ecx -+ * 8(%esp) - %edx -+ * C(%esp) - %esi -+ * 10(%esp) - %edi -+ * 14(%esp) - %ebp -+ * 18(%esp) - %eax -+ * 1C(%esp) - %ds -+ * 20(%esp) - %es -+ * 24(%esp) - orig_eax -+ * 28(%esp) - %eip -+ * 2C(%esp) - %cs -+ * 30(%esp) - %eflags -+ * 34(%esp) - %oldesp -+ * 38(%esp) - %oldss -+ * -+ * "current" is in register %ebx during any slow entries. -+ */ -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include "irq_vectors.h" -+#include -+ -+#define nr_syscalls ((syscall_table_size)/4) -+ -+EBX = 0x00 -+ECX = 0x04 -+EDX = 0x08 -+ESI = 0x0C -+EDI = 0x10 -+EBP = 0x14 -+EAX = 0x18 -+DS = 0x1C -+ES = 0x20 -+ORIG_EAX = 0x24 -+EIP = 0x28 -+CS = 0x2C -+EFLAGS = 0x30 -+OLDESP = 0x34 -+OLDSS = 0x38 -+ -+CF_MASK = 0x00000001 -+TF_MASK = 0x00000100 -+IF_MASK = 0x00000200 -+DF_MASK = 0x00000400 -+NT_MASK = 0x00004000 -+VM_MASK = 0x00020000 -+/* Pseudo-eflags. */ -+NMI_MASK = 0x80000000 -+ -+#ifndef CONFIG_XEN -+#define DISABLE_INTERRUPTS cli -+#define ENABLE_INTERRUPTS sti -+#else -+/* Offsets into shared_info_t. */ -+#define evtchn_upcall_pending /* 0 */ -+#define evtchn_upcall_mask 1 -+ -+#define sizeof_vcpu_shift 6 -+ -+#ifdef CONFIG_SMP -+#define GET_VCPU_INFO movl TI_cpu(%ebp),%esi ; \ -+ shl $sizeof_vcpu_shift,%esi ; \ -+ addl HYPERVISOR_shared_info,%esi -+#else -+#define GET_VCPU_INFO movl HYPERVISOR_shared_info,%esi -+#endif -+ -+#define __DISABLE_INTERRUPTS movb $1,evtchn_upcall_mask(%esi) -+#define __ENABLE_INTERRUPTS movb $0,evtchn_upcall_mask(%esi) -+#define DISABLE_INTERRUPTS GET_VCPU_INFO ; \ -+ __DISABLE_INTERRUPTS -+#define ENABLE_INTERRUPTS GET_VCPU_INFO ; \ -+ __ENABLE_INTERRUPTS -+#define __TEST_PENDING testb $0xFF,evtchn_upcall_pending(%esi) -+#endif -+ -+#ifdef CONFIG_PREEMPT -+#define preempt_stop cli; TRACE_IRQS_OFF -+#else -+#define preempt_stop -+#define resume_kernel restore_nocheck -+#endif -+ -+.macro TRACE_IRQS_IRET -+#ifdef CONFIG_TRACE_IRQFLAGS -+ testl $IF_MASK,EFLAGS(%esp) # interrupts off? -+ jz 1f -+ TRACE_IRQS_ON -+1: -+#endif -+.endm -+ -+#ifdef CONFIG_VM86 -+#define resume_userspace_sig check_userspace -+#else -+#define resume_userspace_sig resume_userspace -+#endif -+ -+#define SAVE_ALL \ -+ cld; \ -+ pushl %es; \ -+ CFI_ADJUST_CFA_OFFSET 4;\ -+ /*CFI_REL_OFFSET es, 0;*/\ -+ pushl %ds; \ -+ CFI_ADJUST_CFA_OFFSET 4;\ -+ /*CFI_REL_OFFSET ds, 0;*/\ -+ pushl %eax; \ -+ CFI_ADJUST_CFA_OFFSET 4;\ -+ CFI_REL_OFFSET eax, 0;\ -+ pushl %ebp; \ -+ CFI_ADJUST_CFA_OFFSET 4;\ -+ CFI_REL_OFFSET ebp, 0;\ -+ pushl %edi; \ -+ CFI_ADJUST_CFA_OFFSET 4;\ -+ CFI_REL_OFFSET edi, 0;\ -+ pushl %esi; \ -+ CFI_ADJUST_CFA_OFFSET 4;\ -+ CFI_REL_OFFSET esi, 0;\ -+ pushl %edx; \ -+ CFI_ADJUST_CFA_OFFSET 4;\ -+ CFI_REL_OFFSET edx, 0;\ -+ pushl %ecx; \ -+ CFI_ADJUST_CFA_OFFSET 4;\ -+ CFI_REL_OFFSET ecx, 0;\ -+ pushl %ebx; \ -+ CFI_ADJUST_CFA_OFFSET 4;\ -+ CFI_REL_OFFSET ebx, 0;\ -+ movl $(__USER_DS), %edx; \ -+ movl %edx, %ds; \ -+ movl %edx, %es; -+ -+#define RESTORE_INT_REGS \ -+ popl %ebx; \ -+ CFI_ADJUST_CFA_OFFSET -4;\ -+ CFI_RESTORE ebx;\ -+ popl %ecx; \ -+ CFI_ADJUST_CFA_OFFSET -4;\ -+ CFI_RESTORE ecx;\ -+ popl %edx; \ -+ CFI_ADJUST_CFA_OFFSET -4;\ -+ CFI_RESTORE edx;\ -+ popl %esi; \ -+ CFI_ADJUST_CFA_OFFSET -4;\ -+ CFI_RESTORE esi;\ -+ popl %edi; \ -+ CFI_ADJUST_CFA_OFFSET -4;\ -+ CFI_RESTORE edi;\ -+ popl %ebp; \ -+ CFI_ADJUST_CFA_OFFSET -4;\ -+ CFI_RESTORE ebp;\ -+ popl %eax; \ -+ CFI_ADJUST_CFA_OFFSET -4;\ -+ CFI_RESTORE eax -+ -+#define RESTORE_REGS \ -+ RESTORE_INT_REGS; \ -+1: popl %ds; \ -+ CFI_ADJUST_CFA_OFFSET -4;\ -+ /*CFI_RESTORE ds;*/\ -+2: popl %es; \ -+ CFI_ADJUST_CFA_OFFSET -4;\ -+ /*CFI_RESTORE es;*/\ -+.section .fixup,"ax"; \ -+3: movl $0,(%esp); \ -+ jmp 1b; \ -+4: movl $0,(%esp); \ -+ jmp 2b; \ -+.previous; \ -+.section __ex_table,"a";\ -+ .align 4; \ -+ .long 1b,3b; \ -+ .long 2b,4b; \ -+.previous -+ -+#define RING0_INT_FRAME \ -+ CFI_STARTPROC simple;\ -+ CFI_DEF_CFA esp, 3*4;\ -+ /*CFI_OFFSET cs, -2*4;*/\ -+ CFI_OFFSET eip, -3*4 -+ -+#define RING0_EC_FRAME \ -+ CFI_STARTPROC simple;\ -+ CFI_DEF_CFA esp, 4*4;\ -+ /*CFI_OFFSET cs, -2*4;*/\ -+ CFI_OFFSET eip, -3*4 -+ -+#define RING0_PTREGS_FRAME \ -+ CFI_STARTPROC simple;\ -+ CFI_DEF_CFA esp, OLDESP-EBX;\ -+ /*CFI_OFFSET cs, CS-OLDESP;*/\ -+ CFI_OFFSET eip, EIP-OLDESP;\ -+ /*CFI_OFFSET es, ES-OLDESP;*/\ -+ /*CFI_OFFSET ds, DS-OLDESP;*/\ -+ CFI_OFFSET eax, EAX-OLDESP;\ -+ CFI_OFFSET ebp, EBP-OLDESP;\ -+ CFI_OFFSET edi, EDI-OLDESP;\ -+ CFI_OFFSET esi, ESI-OLDESP;\ -+ CFI_OFFSET edx, EDX-OLDESP;\ -+ CFI_OFFSET ecx, ECX-OLDESP;\ -+ CFI_OFFSET ebx, EBX-OLDESP -+ -+ENTRY(ret_from_fork) -+ CFI_STARTPROC -+ pushl %eax -+ CFI_ADJUST_CFA_OFFSET 4 -+ call schedule_tail -+ GET_THREAD_INFO(%ebp) -+ popl %eax -+ CFI_ADJUST_CFA_OFFSET -4 -+ pushl $0x0202 # Reset kernel eflags -+ CFI_ADJUST_CFA_OFFSET 4 -+ popfl -+ CFI_ADJUST_CFA_OFFSET -4 -+ jmp syscall_exit -+ CFI_ENDPROC -+ -+/* -+ * Return to user mode is not as complex as all this looks, -+ * but we want the default path for a system call return to -+ * go as quickly as possible which is why some of this is -+ * less clear than it otherwise should be. -+ */ -+ -+ # userspace resumption stub bypassing syscall exit tracing -+ ALIGN -+ RING0_PTREGS_FRAME -+ret_from_exception: -+ preempt_stop -+ret_from_intr: -+ GET_THREAD_INFO(%ebp) -+check_userspace: -+ movl EFLAGS(%esp), %eax # mix EFLAGS and CS -+ movb CS(%esp), %al -+ testl $(VM_MASK | 2), %eax -+ jz resume_kernel -+ENTRY(resume_userspace) -+ DISABLE_INTERRUPTS # make sure we don't miss an interrupt -+ # setting need_resched or sigpending -+ # between sampling and the iret -+ movl TI_flags(%ebp), %ecx -+ andl $_TIF_WORK_MASK, %ecx # is there any work to be done on -+ # int/exception return? -+ jne work_pending -+ jmp restore_all -+ -+#ifdef CONFIG_PREEMPT -+ENTRY(resume_kernel) -+ cli -+ cmpl $0,TI_preempt_count(%ebp) # non-zero preempt_count ? -+ jnz restore_nocheck -+need_resched: -+ movl TI_flags(%ebp), %ecx # need_resched set ? -+ testb $_TIF_NEED_RESCHED, %cl -+ jz restore_all -+ testl $IF_MASK,EFLAGS(%esp) # interrupts off (exception path) ? -+ jz restore_all -+ call preempt_schedule_irq -+ jmp need_resched -+#endif -+ CFI_ENDPROC -+ -+/* SYSENTER_RETURN points to after the "sysenter" instruction in -+ the vsyscall page. See vsyscall-sysentry.S, which defines the symbol. */ -+ -+ # sysenter call handler stub -+ENTRY(sysenter_entry) -+ CFI_STARTPROC simple -+ CFI_DEF_CFA esp, 0 -+ CFI_REGISTER esp, ebp -+ movl SYSENTER_stack_esp0(%esp),%esp -+sysenter_past_esp: -+ /* -+ * No need to follow this irqs on/off section: the syscall -+ * disabled irqs and here we enable it straight after entry: -+ */ -+ sti -+ pushl $(__USER_DS) -+ CFI_ADJUST_CFA_OFFSET 4 -+ /*CFI_REL_OFFSET ss, 0*/ -+ pushl %ebp -+ CFI_ADJUST_CFA_OFFSET 4 -+ CFI_REL_OFFSET esp, 0 -+ pushfl -+ CFI_ADJUST_CFA_OFFSET 4 -+ pushl $(__USER_CS) -+ CFI_ADJUST_CFA_OFFSET 4 -+ /*CFI_REL_OFFSET cs, 0*/ -+ /* -+ * Push current_thread_info()->sysenter_return to the stack. -+ * A tiny bit of offset fixup is necessary - 4*4 means the 4 words -+ * pushed above; +8 corresponds to copy_thread's esp0 setting. -+ */ -+ pushl (TI_sysenter_return-THREAD_SIZE+8+4*4)(%esp) -+ CFI_ADJUST_CFA_OFFSET 4 -+ CFI_REL_OFFSET eip, 0 -+ -+/* -+ * Load the potential sixth argument from user stack. -+ * Careful about security. -+ */ -+ cmpl $__PAGE_OFFSET-3,%ebp -+ jae syscall_fault -+1: movl (%ebp),%ebp -+.section __ex_table,"a" -+ .align 4 -+ .long 1b,syscall_fault -+.previous -+ -+ pushl %eax -+ CFI_ADJUST_CFA_OFFSET 4 -+ SAVE_ALL -+ GET_THREAD_INFO(%ebp) -+ -+ /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */ -+ testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(%ebp) -+ jnz syscall_trace_entry -+ cmpl $(nr_syscalls), %eax -+ jae syscall_badsys -+ call *sys_call_table(,%eax,4) -+ movl %eax,EAX(%esp) -+ DISABLE_INTERRUPTS -+ TRACE_IRQS_OFF -+ movl TI_flags(%ebp), %ecx -+ testw $_TIF_ALLWORK_MASK, %cx -+ jne syscall_exit_work -+/* if something modifies registers it must also disable sysexit */ -+ movl EIP(%esp), %edx -+ movl OLDESP(%esp), %ecx -+ xorl %ebp,%ebp -+#ifdef CONFIG_XEN -+ TRACE_IRQS_ON -+ __ENABLE_INTERRUPTS -+sysexit_scrit: /**** START OF SYSEXIT CRITICAL REGION ****/ -+ __TEST_PENDING -+ jnz 14f # process more events if necessary... -+ movl ESI(%esp), %esi -+ sysexit -+14: __DISABLE_INTERRUPTS -+ TRACE_IRQS_OFF -+sysexit_ecrit: /**** END OF SYSEXIT CRITICAL REGION ****/ -+ push %esp -+ call evtchn_do_upcall -+ add $4,%esp -+ jmp ret_from_intr -+#else -+ TRACE_IRQS_ON -+ sti -+ sysexit -+#endif /* !CONFIG_XEN */ -+ CFI_ENDPROC -+ -+ # pv sysenter call handler stub -+ENTRY(sysenter_entry_pv) -+ RING0_INT_FRAME -+ movl $__USER_DS,16(%esp) -+ movl %ebp,12(%esp) -+ movl $__USER_CS,4(%esp) -+ addl $4,%esp -+ /* +5*4 is SS:ESP,EFLAGS,CS:EIP. +8 is esp0 setting. */ -+ pushl (TI_sysenter_return-THREAD_SIZE+8+4*4)(%esp) -+/* -+ * Load the potential sixth argument from user stack. -+ * Careful about security. -+ */ -+ cmpl $__PAGE_OFFSET-3,%ebp -+ jae syscall_fault -+1: movl (%ebp),%ebp -+.section __ex_table,"a" -+ .align 4 -+ .long 1b,syscall_fault -+.previous -+ /* fall through */ -+ CFI_ENDPROC -+ENDPROC(sysenter_entry_pv) -+ -+ # system call handler stub -+ENTRY(system_call) -+ RING0_INT_FRAME # can't unwind into user space anyway -+ pushl %eax # save orig_eax -+ CFI_ADJUST_CFA_OFFSET 4 -+ SAVE_ALL -+ GET_THREAD_INFO(%ebp) -+ testl $TF_MASK,EFLAGS(%esp) -+ jz no_singlestep -+ orl $_TIF_SINGLESTEP,TI_flags(%ebp) -+no_singlestep: -+ # system call tracing in operation / emulation -+ /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */ -+ testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(%ebp) -+ jnz syscall_trace_entry -+ cmpl $(nr_syscalls), %eax -+ jae syscall_badsys -+syscall_call: -+ call *sys_call_table(,%eax,4) -+ movl %eax,EAX(%esp) # store the return value -+syscall_exit: -+ DISABLE_INTERRUPTS # make sure we don't miss an interrupt -+ # setting need_resched or sigpending -+ # between sampling and the iret -+ TRACE_IRQS_OFF -+ movl TI_flags(%ebp), %ecx -+ testw $_TIF_ALLWORK_MASK, %cx # current->work -+ jne syscall_exit_work -+ -+restore_all: -+#ifndef CONFIG_XEN -+ movl EFLAGS(%esp), %eax # mix EFLAGS, SS and CS -+ # Warning: OLDSS(%esp) contains the wrong/random values if we -+ # are returning to the kernel. -+ # See comments in process.c:copy_thread() for details. -+ movb OLDSS(%esp), %ah -+ movb CS(%esp), %al -+ andl $(VM_MASK | (4 << 8) | 3), %eax -+ cmpl $((4 << 8) | 3), %eax -+ CFI_REMEMBER_STATE -+ je ldt_ss # returning to user-space with LDT SS -+restore_nocheck: -+#else -+restore_nocheck: -+ movl EFLAGS(%esp), %eax -+ testl $(VM_MASK|NMI_MASK), %eax -+ CFI_REMEMBER_STATE -+ jnz hypervisor_iret -+ shr $9, %eax # EAX[0] == IRET_EFLAGS.IF -+ GET_VCPU_INFO -+ andb evtchn_upcall_mask(%esi),%al -+ andb $1,%al # EAX[0] == IRET_EFLAGS.IF & event_mask -+ CFI_REMEMBER_STATE -+ jnz restore_all_enable_events # != 0 => enable event delivery -+#endif -+ TRACE_IRQS_IRET -+restore_nocheck_notrace: -+ RESTORE_REGS -+ addl $4, %esp -+ CFI_ADJUST_CFA_OFFSET -4 -+1: iret -+.section .fixup,"ax" -+iret_exc: -+#ifndef CONFIG_XEN -+ TRACE_IRQS_ON -+ sti -+#endif -+ pushl $0 # no error code -+ pushl $do_iret_error -+ jmp error_code -+.previous -+.section __ex_table,"a" -+ .align 4 -+ .long 1b,iret_exc -+.previous -+ -+ CFI_RESTORE_STATE -+#ifndef CONFIG_XEN -+ldt_ss: -+ larl OLDSS(%esp), %eax -+ jnz restore_nocheck -+ testl $0x00400000, %eax # returning to 32bit stack? -+ jnz restore_nocheck # allright, normal return -+ /* If returning to userspace with 16bit stack, -+ * try to fix the higher word of ESP, as the CPU -+ * won't restore it. -+ * This is an "official" bug of all the x86-compatible -+ * CPUs, which we can try to work around to make -+ * dosemu and wine happy. */ -+ subl $8, %esp # reserve space for switch16 pointer -+ CFI_ADJUST_CFA_OFFSET 8 -+ cli -+ TRACE_IRQS_OFF -+ movl %esp, %eax -+ /* Set up the 16bit stack frame with switch32 pointer on top, -+ * and a switch16 pointer on top of the current frame. */ -+ call setup_x86_bogus_stack -+ CFI_ADJUST_CFA_OFFSET -8 # frame has moved -+ TRACE_IRQS_IRET -+ RESTORE_REGS -+ lss 20+4(%esp), %esp # switch to 16bit stack -+1: iret -+.section __ex_table,"a" -+ .align 4 -+ .long 1b,iret_exc -+.previous -+#else -+ ALIGN -+restore_all_enable_events: -+ TRACE_IRQS_ON -+ __ENABLE_INTERRUPTS -+scrit: /**** START OF CRITICAL REGION ****/ -+ __TEST_PENDING -+ jnz 14f # process more events if necessary... -+ RESTORE_REGS -+ addl $4, %esp -+ CFI_ADJUST_CFA_OFFSET -4 -+1: iret -+.section __ex_table,"a" -+ .align 4 -+ .long 1b,iret_exc -+.previous -+14: __DISABLE_INTERRUPTS -+ TRACE_IRQS_OFF -+ jmp 11f -+ecrit: /**** END OF CRITICAL REGION ****/ -+ -+ CFI_RESTORE_STATE -+hypervisor_iret: -+ andl $~NMI_MASK, EFLAGS(%esp) -+ RESTORE_REGS -+ addl $4, %esp -+ CFI_ADJUST_CFA_OFFSET -4 -+ jmp hypercall_page + (__HYPERVISOR_iret * 32) -+#endif -+ CFI_ENDPROC -+ -+ # perform work that needs to be done immediately before resumption -+ ALIGN -+ RING0_PTREGS_FRAME # can't unwind into user space anyway -+work_pending: -+ testb $_TIF_NEED_RESCHED, %cl -+ jz work_notifysig -+work_resched: -+ call schedule -+ DISABLE_INTERRUPTS # make sure we don't miss an interrupt -+ # setting need_resched or sigpending -+ # between sampling and the iret -+ TRACE_IRQS_OFF -+ movl TI_flags(%ebp), %ecx -+ andl $_TIF_WORK_MASK, %ecx # is there any work to be done other -+ # than syscall tracing? -+ jz restore_all -+ testb $_TIF_NEED_RESCHED, %cl -+ jnz work_resched -+ -+work_notifysig: # deal with pending signals and -+ # notify-resume requests -+ testl $VM_MASK, EFLAGS(%esp) -+ movl %esp, %eax -+ jne work_notifysig_v86 # returning to kernel-space or -+ # vm86-space -+ xorl %edx, %edx -+ call do_notify_resume -+ jmp resume_userspace_sig -+ -+ ALIGN -+work_notifysig_v86: -+#ifdef CONFIG_VM86 -+ pushl %ecx # save ti_flags for do_notify_resume -+ CFI_ADJUST_CFA_OFFSET 4 -+ call save_v86_state # %eax contains pt_regs pointer -+ popl %ecx -+ CFI_ADJUST_CFA_OFFSET -4 -+ movl %eax, %esp -+ xorl %edx, %edx -+ call do_notify_resume -+ jmp resume_userspace_sig -+#endif -+ -+ # perform syscall exit tracing -+ ALIGN -+syscall_trace_entry: -+ movl $-ENOSYS,EAX(%esp) -+ movl %esp, %eax -+ xorl %edx,%edx -+ call do_syscall_trace -+ cmpl $0, %eax -+ jne resume_userspace # ret != 0 -> running under PTRACE_SYSEMU, -+ # so must skip actual syscall -+ movl ORIG_EAX(%esp), %eax -+ cmpl $(nr_syscalls), %eax -+ jnae syscall_call -+ jmp syscall_exit -+ -+ # perform syscall exit tracing -+ ALIGN -+syscall_exit_work: -+ testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP), %cl -+ jz work_pending -+ TRACE_IRQS_ON -+ ENABLE_INTERRUPTS # could let do_syscall_trace() call -+ # schedule() instead -+ movl %esp, %eax -+ movl $1, %edx -+ call do_syscall_trace -+ jmp resume_userspace -+ CFI_ENDPROC -+ -+ RING0_INT_FRAME # can't unwind into user space anyway -+syscall_fault: -+ pushl %eax # save orig_eax -+ CFI_ADJUST_CFA_OFFSET 4 -+ SAVE_ALL -+ GET_THREAD_INFO(%ebp) -+ movl $-EFAULT,EAX(%esp) -+ jmp resume_userspace -+ -+syscall_badsys: -+ movl $-ENOSYS,EAX(%esp) -+ jmp resume_userspace -+ CFI_ENDPROC -+ -+#ifndef CONFIG_XEN -+#define FIXUP_ESPFIX_STACK \ -+ movl %esp, %eax; \ -+ /* switch to 32bit stack using the pointer on top of 16bit stack */ \ -+ lss %ss:CPU_16BIT_STACK_SIZE-8, %esp; \ -+ /* copy data from 16bit stack to 32bit stack */ \ -+ call fixup_x86_bogus_stack; \ -+ /* put ESP to the proper location */ \ -+ movl %eax, %esp; -+#define UNWIND_ESPFIX_STACK \ -+ pushl %eax; \ -+ CFI_ADJUST_CFA_OFFSET 4; \ -+ movl %ss, %eax; \ -+ /* see if on 16bit stack */ \ -+ cmpw $__ESPFIX_SS, %ax; \ -+ je 28f; \ -+27: popl %eax; \ -+ CFI_ADJUST_CFA_OFFSET -4; \ -+.section .fixup,"ax"; \ -+28: movl $__KERNEL_DS, %eax; \ -+ movl %eax, %ds; \ -+ movl %eax, %es; \ -+ /* switch to 32bit stack */ \ -+ FIXUP_ESPFIX_STACK; \ -+ jmp 27b; \ -+.previous -+ -+/* -+ * Build the entry stubs and pointer table with -+ * some assembler magic. -+ */ -+.data -+ENTRY(interrupt) -+.text -+ -+vector=0 -+ENTRY(irq_entries_start) -+ RING0_INT_FRAME -+.rept NR_IRQS -+ ALIGN -+ .if vector -+ CFI_ADJUST_CFA_OFFSET -4 -+ .endif -+1: pushl $~(vector) -+ CFI_ADJUST_CFA_OFFSET 4 -+ jmp common_interrupt -+.data -+ .long 1b -+.text -+vector=vector+1 -+.endr -+ -+/* -+ * the CPU automatically disables interrupts when executing an IRQ vector, -+ * so IRQ-flags tracing has to follow that: -+ */ -+ ALIGN -+common_interrupt: -+ SAVE_ALL -+ TRACE_IRQS_OFF -+ movl %esp,%eax -+ call do_IRQ -+ jmp ret_from_intr -+ CFI_ENDPROC -+ -+#define BUILD_INTERRUPT(name, nr) \ -+ENTRY(name) \ -+ RING0_INT_FRAME; \ -+ pushl $~(nr); \ -+ CFI_ADJUST_CFA_OFFSET 4; \ -+ SAVE_ALL; \ -+ TRACE_IRQS_OFF \ -+ movl %esp,%eax; \ -+ call smp_/**/name; \ -+ jmp ret_from_intr; \ -+ CFI_ENDPROC -+ -+/* The include is where all of the SMP etc. interrupts come from */ -+#include "entry_arch.h" -+#else -+#define UNWIND_ESPFIX_STACK -+#endif -+ -+ENTRY(divide_error) -+ RING0_INT_FRAME -+ pushl $0 # no error code -+ CFI_ADJUST_CFA_OFFSET 4 -+ pushl $do_divide_error -+ CFI_ADJUST_CFA_OFFSET 4 -+ ALIGN -+error_code: -+ pushl %ds -+ CFI_ADJUST_CFA_OFFSET 4 -+ /*CFI_REL_OFFSET ds, 0*/ -+ pushl %eax -+ CFI_ADJUST_CFA_OFFSET 4 -+ CFI_REL_OFFSET eax, 0 -+ xorl %eax, %eax -+ pushl %ebp -+ CFI_ADJUST_CFA_OFFSET 4 -+ CFI_REL_OFFSET ebp, 0 -+ pushl %edi -+ CFI_ADJUST_CFA_OFFSET 4 -+ CFI_REL_OFFSET edi, 0 -+ pushl %esi -+ CFI_ADJUST_CFA_OFFSET 4 -+ CFI_REL_OFFSET esi, 0 -+ pushl %edx -+ CFI_ADJUST_CFA_OFFSET 4 -+ CFI_REL_OFFSET edx, 0 -+ decl %eax # eax = -1 -+ pushl %ecx -+ CFI_ADJUST_CFA_OFFSET 4 -+ CFI_REL_OFFSET ecx, 0 -+ pushl %ebx -+ CFI_ADJUST_CFA_OFFSET 4 -+ CFI_REL_OFFSET ebx, 0 -+ cld -+ pushl %es -+ CFI_ADJUST_CFA_OFFSET 4 -+ /*CFI_REL_OFFSET es, 0*/ -+ UNWIND_ESPFIX_STACK -+ popl %ecx -+ CFI_ADJUST_CFA_OFFSET -4 -+ /*CFI_REGISTER es, ecx*/ -+ movl ES(%esp), %edi # get the function address -+ movl ORIG_EAX(%esp), %edx # get the error code -+ movl %eax, ORIG_EAX(%esp) -+ movl %ecx, ES(%esp) -+ /*CFI_REL_OFFSET es, ES*/ -+ movl $(__USER_DS), %ecx -+ movl %ecx, %ds -+ movl %ecx, %es -+ movl %esp,%eax # pt_regs pointer -+ call *%edi -+ jmp ret_from_exception -+ CFI_ENDPROC -+ -+#ifdef CONFIG_XEN -+# A note on the "critical region" in our callback handler. -+# We want to avoid stacking callback handlers due to events occurring -+# during handling of the last event. To do this, we keep events disabled -+# until we've done all processing. HOWEVER, we must enable events before -+# popping the stack frame (can't be done atomically) and so it would still -+# be possible to get enough handler activations to overflow the stack. -+# Although unlikely, bugs of that kind are hard to track down, so we'd -+# like to avoid the possibility. -+# So, on entry to the handler we detect whether we interrupted an -+# existing activation in its critical region -- if so, we pop the current -+# activation and restart the handler using the previous one. -+# -+# The sysexit critical region is slightly different. sysexit -+# atomically removes the entire stack frame. If we interrupt in the -+# critical region we know that the entire frame is present and correct -+# so we can simply throw away the new one. -+ENTRY(hypervisor_callback) -+ RING0_INT_FRAME -+ pushl %eax -+ CFI_ADJUST_CFA_OFFSET 4 -+ SAVE_ALL -+ movl EIP(%esp),%eax -+ cmpl $scrit,%eax -+ jb 11f -+ cmpl $ecrit,%eax -+ jb critical_region_fixup -+ cmpl $sysexit_scrit,%eax -+ jb 11f -+ cmpl $sysexit_ecrit,%eax -+ ja 11f -+ addl $OLDESP,%esp # Remove eflags...ebx from stack frame. -+11: push %esp -+ CFI_ADJUST_CFA_OFFSET 4 -+ call evtchn_do_upcall -+ add $4,%esp -+ CFI_ADJUST_CFA_OFFSET -4 -+ jmp ret_from_intr -+ CFI_ENDPROC -+ -+# [How we do the fixup]. We want to merge the current stack frame with the -+# just-interrupted frame. How we do this depends on where in the critical -+# region the interrupted handler was executing, and so how many saved -+# registers are in each frame. We do this quickly using the lookup table -+# 'critical_fixup_table'. For each byte offset in the critical region, it -+# provides the number of bytes which have already been popped from the -+# interrupted stack frame. -+critical_region_fixup: -+ movzbl critical_fixup_table-scrit(%eax),%ecx # %eax contains num bytes popped -+ cmpb $0xff,%cl # 0xff => vcpu_info critical region -+ jne 15f -+ xorl %ecx,%ecx -+15: leal (%esp,%ecx),%esi # %esi points at end of src region -+ leal OLDESP(%esp),%edi # %edi points at end of dst region -+ shrl $2,%ecx # convert words to bytes -+ je 17f # skip loop if nothing to copy -+16: subl $4,%esi # pre-decrementing copy loop -+ subl $4,%edi -+ movl (%esi),%eax -+ movl %eax,(%edi) -+ loop 16b -+17: movl %edi,%esp # final %edi is top of merged stack -+ jmp 11b -+ -+.section .rodata,"a" -+critical_fixup_table: -+ .byte 0xff,0xff,0xff # testb $0xff,(%esi) = __TEST_PENDING -+ .byte 0xff,0xff # jnz 14f -+ .byte 0x00 # pop %ebx -+ .byte 0x04 # pop %ecx -+ .byte 0x08 # pop %edx -+ .byte 0x0c # pop %esi -+ .byte 0x10 # pop %edi -+ .byte 0x14 # pop %ebp -+ .byte 0x18 # pop %eax -+ .byte 0x1c # pop %ds -+ .byte 0x20 # pop %es -+ .byte 0x24,0x24,0x24 # add $4,%esp -+ .byte 0x28 # iret -+ .byte 0xff,0xff,0xff,0xff # movb $1,1(%esi) -+ .byte 0x00,0x00 # jmp 11b -+.previous -+ -+# Hypervisor uses this for application faults while it executes. -+# We get here for two reasons: -+# 1. Fault while reloading DS, ES, FS or GS -+# 2. Fault while executing IRET -+# Category 1 we fix up by reattempting the load, and zeroing the segment -+# register if the load fails. -+# Category 2 we fix up by jumping to do_iret_error. We cannot use the -+# normal Linux return path in this case because if we use the IRET hypercall -+# to pop the stack frame we end up in an infinite loop of failsafe callbacks. -+# We distinguish between categories by maintaining a status value in EAX. -+ENTRY(failsafe_callback) -+ pushl %eax -+ movl $1,%eax -+1: mov 4(%esp),%ds -+2: mov 8(%esp),%es -+3: mov 12(%esp),%fs -+4: mov 16(%esp),%gs -+ testl %eax,%eax -+ popl %eax -+ jz 5f -+ addl $16,%esp # EAX != 0 => Category 2 (Bad IRET) -+ jmp iret_exc -+5: addl $16,%esp # EAX == 0 => Category 1 (Bad segment) -+ RING0_INT_FRAME -+ pushl $0 -+ SAVE_ALL -+ jmp ret_from_exception -+.section .fixup,"ax"; \ -+6: xorl %eax,%eax; \ -+ movl %eax,4(%esp); \ -+ jmp 1b; \ -+7: xorl %eax,%eax; \ -+ movl %eax,8(%esp); \ -+ jmp 2b; \ -+8: xorl %eax,%eax; \ -+ movl %eax,12(%esp); \ -+ jmp 3b; \ -+9: xorl %eax,%eax; \ -+ movl %eax,16(%esp); \ -+ jmp 4b; \ -+.previous; \ -+.section __ex_table,"a"; \ -+ .align 4; \ -+ .long 1b,6b; \ -+ .long 2b,7b; \ -+ .long 3b,8b; \ -+ .long 4b,9b; \ -+.previous -+#endif -+ CFI_ENDPROC -+ -+ENTRY(coprocessor_error) -+ RING0_INT_FRAME -+ pushl $0 -+ CFI_ADJUST_CFA_OFFSET 4 -+ pushl $do_coprocessor_error -+ CFI_ADJUST_CFA_OFFSET 4 -+ jmp error_code -+ CFI_ENDPROC -+ -+ENTRY(simd_coprocessor_error) -+ RING0_INT_FRAME -+ pushl $0 -+ CFI_ADJUST_CFA_OFFSET 4 -+ pushl $do_simd_coprocessor_error -+ CFI_ADJUST_CFA_OFFSET 4 -+ jmp error_code -+ CFI_ENDPROC -+ -+ENTRY(device_not_available) -+ RING0_INT_FRAME -+ pushl $-1 # mark this as an int -+ CFI_ADJUST_CFA_OFFSET 4 -+ SAVE_ALL -+#ifndef CONFIG_XEN -+ movl %cr0, %eax -+ testl $0x4, %eax # EM (math emulation bit) -+ je device_available_emulate -+ pushl $0 # temporary storage for ORIG_EIP -+ CFI_ADJUST_CFA_OFFSET 4 -+ call math_emulate -+ addl $4, %esp -+ CFI_ADJUST_CFA_OFFSET -4 -+ jmp ret_from_exception -+device_available_emulate: -+#endif -+ preempt_stop -+ call math_state_restore -+ jmp ret_from_exception -+ CFI_ENDPROC -+ -+#ifndef CONFIG_XEN -+/* -+ * Debug traps and NMI can happen at the one SYSENTER instruction -+ * that sets up the real kernel stack. Check here, since we can't -+ * allow the wrong stack to be used. -+ * -+ * "SYSENTER_stack_esp0+12" is because the NMI/debug handler will have -+ * already pushed 3 words if it hits on the sysenter instruction: -+ * eflags, cs and eip. -+ * -+ * We just load the right stack, and push the three (known) values -+ * by hand onto the new stack - while updating the return eip past -+ * the instruction that would have done it for sysenter. -+ */ -+#define FIX_STACK(offset, ok, label) \ -+ cmpw $__KERNEL_CS,4(%esp); \ -+ jne ok; \ -+label: \ -+ movl SYSENTER_stack_esp0+offset(%esp),%esp; \ -+ pushfl; \ -+ pushl $__KERNEL_CS; \ -+ pushl $sysenter_past_esp -+#endif /* CONFIG_XEN */ -+ -+KPROBE_ENTRY(debug) -+ RING0_INT_FRAME -+#ifndef CONFIG_XEN -+ cmpl $sysenter_entry,(%esp) -+ jne debug_stack_correct -+ FIX_STACK(12, debug_stack_correct, debug_esp_fix_insn) -+debug_stack_correct: -+#endif /* !CONFIG_XEN */ -+ pushl $-1 # mark this as an int -+ CFI_ADJUST_CFA_OFFSET 4 -+ SAVE_ALL -+ xorl %edx,%edx # error code 0 -+ movl %esp,%eax # pt_regs pointer -+ call do_debug -+ jmp ret_from_exception -+ CFI_ENDPROC -+ .previous .text -+#ifndef CONFIG_XEN -+/* -+ * NMI is doubly nasty. It can happen _while_ we're handling -+ * a debug fault, and the debug fault hasn't yet been able to -+ * clear up the stack. So we first check whether we got an -+ * NMI on the sysenter entry path, but after that we need to -+ * check whether we got an NMI on the debug path where the debug -+ * fault happened on the sysenter path. -+ */ -+ENTRY(nmi) -+ RING0_INT_FRAME -+ pushl %eax -+ CFI_ADJUST_CFA_OFFSET 4 -+ movl %ss, %eax -+ cmpw $__ESPFIX_SS, %ax -+ popl %eax -+ CFI_ADJUST_CFA_OFFSET -4 -+ je nmi_16bit_stack -+ cmpl $sysenter_entry,(%esp) -+ je nmi_stack_fixup -+ pushl %eax -+ CFI_ADJUST_CFA_OFFSET 4 -+ movl %esp,%eax -+ /* Do not access memory above the end of our stack page, -+ * it might not exist. -+ */ -+ andl $(THREAD_SIZE-1),%eax -+ cmpl $(THREAD_SIZE-20),%eax -+ popl %eax -+ CFI_ADJUST_CFA_OFFSET -4 -+ jae nmi_stack_correct -+ cmpl $sysenter_entry,12(%esp) -+ je nmi_debug_stack_check -+nmi_stack_correct: -+ pushl %eax -+ CFI_ADJUST_CFA_OFFSET 4 -+ SAVE_ALL -+ xorl %edx,%edx # zero error code -+ movl %esp,%eax # pt_regs pointer -+ call do_nmi -+ jmp restore_nocheck_notrace -+ CFI_ENDPROC -+ -+nmi_stack_fixup: -+ FIX_STACK(12,nmi_stack_correct, 1) -+ jmp nmi_stack_correct -+nmi_debug_stack_check: -+ cmpw $__KERNEL_CS,16(%esp) -+ jne nmi_stack_correct -+ cmpl $debug,(%esp) -+ jb nmi_stack_correct -+ cmpl $debug_esp_fix_insn,(%esp) -+ ja nmi_stack_correct -+ FIX_STACK(24,nmi_stack_correct, 1) -+ jmp nmi_stack_correct -+ -+nmi_16bit_stack: -+ RING0_INT_FRAME -+ /* create the pointer to lss back */ -+ pushl %ss -+ CFI_ADJUST_CFA_OFFSET 4 -+ pushl %esp -+ CFI_ADJUST_CFA_OFFSET 4 -+ movzwl %sp, %esp -+ addw $4, (%esp) -+ /* copy the iret frame of 12 bytes */ -+ .rept 3 -+ pushl 16(%esp) -+ CFI_ADJUST_CFA_OFFSET 4 -+ .endr -+ pushl %eax -+ CFI_ADJUST_CFA_OFFSET 4 -+ SAVE_ALL -+ FIXUP_ESPFIX_STACK # %eax == %esp -+ CFI_ADJUST_CFA_OFFSET -20 # the frame has now moved -+ xorl %edx,%edx # zero error code -+ call do_nmi -+ RESTORE_REGS -+ lss 12+4(%esp), %esp # back to 16bit stack -+1: iret -+ CFI_ENDPROC -+.section __ex_table,"a" -+ .align 4 -+ .long 1b,iret_exc -+.previous -+#else -+ENTRY(nmi) -+ RING0_INT_FRAME -+ pushl %eax -+ CFI_ADJUST_CFA_OFFSET 4 -+ SAVE_ALL -+ xorl %edx,%edx # zero error code -+ movl %esp,%eax # pt_regs pointer -+ call do_nmi -+ orl $NMI_MASK, EFLAGS(%esp) -+ jmp restore_all -+ CFI_ENDPROC -+#endif -+ -+KPROBE_ENTRY(int3) -+ RING0_INT_FRAME -+ pushl $-1 # mark this as an int -+ CFI_ADJUST_CFA_OFFSET 4 -+ SAVE_ALL -+ xorl %edx,%edx # zero error code -+ movl %esp,%eax # pt_regs pointer -+ call do_int3 -+ jmp ret_from_exception -+ CFI_ENDPROC -+ .previous .text -+ -+ENTRY(overflow) -+ RING0_INT_FRAME -+ pushl $0 -+ CFI_ADJUST_CFA_OFFSET 4 -+ pushl $do_overflow -+ CFI_ADJUST_CFA_OFFSET 4 -+ jmp error_code -+ CFI_ENDPROC -+ -+ENTRY(bounds) -+ RING0_INT_FRAME -+ pushl $0 -+ CFI_ADJUST_CFA_OFFSET 4 -+ pushl $do_bounds -+ CFI_ADJUST_CFA_OFFSET 4 -+ jmp error_code -+ CFI_ENDPROC -+ -+ENTRY(invalid_op) -+ RING0_INT_FRAME -+ pushl $0 -+ CFI_ADJUST_CFA_OFFSET 4 -+ pushl $do_invalid_op -+ CFI_ADJUST_CFA_OFFSET 4 -+ jmp error_code -+ CFI_ENDPROC -+ -+ENTRY(coprocessor_segment_overrun) -+ RING0_INT_FRAME -+ pushl $0 -+ CFI_ADJUST_CFA_OFFSET 4 -+ pushl $do_coprocessor_segment_overrun -+ CFI_ADJUST_CFA_OFFSET 4 -+ jmp error_code -+ CFI_ENDPROC -+ -+ENTRY(invalid_TSS) -+ RING0_EC_FRAME -+ pushl $do_invalid_TSS -+ CFI_ADJUST_CFA_OFFSET 4 -+ jmp error_code -+ CFI_ENDPROC -+ -+ENTRY(segment_not_present) -+ RING0_EC_FRAME -+ pushl $do_segment_not_present -+ CFI_ADJUST_CFA_OFFSET 4 -+ jmp error_code -+ CFI_ENDPROC -+ -+ENTRY(stack_segment) -+ RING0_EC_FRAME -+ pushl $do_stack_segment -+ CFI_ADJUST_CFA_OFFSET 4 -+ jmp error_code -+ CFI_ENDPROC -+ -+KPROBE_ENTRY(general_protection) -+ RING0_EC_FRAME -+ pushl $do_general_protection -+ CFI_ADJUST_CFA_OFFSET 4 -+ jmp error_code -+ CFI_ENDPROC -+ .previous .text -+ -+ENTRY(alignment_check) -+ RING0_EC_FRAME -+ pushl $do_alignment_check -+ CFI_ADJUST_CFA_OFFSET 4 -+ jmp error_code -+ CFI_ENDPROC -+ -+KPROBE_ENTRY(page_fault) -+ RING0_EC_FRAME -+ pushl $do_page_fault -+ CFI_ADJUST_CFA_OFFSET 4 -+ jmp error_code -+ CFI_ENDPROC -+ .previous .text -+ -+#ifdef CONFIG_X86_MCE -+ENTRY(machine_check) -+ RING0_INT_FRAME -+ pushl $0 -+ CFI_ADJUST_CFA_OFFSET 4 -+ pushl machine_check_vector -+ CFI_ADJUST_CFA_OFFSET 4 -+ jmp error_code -+ CFI_ENDPROC -+#endif -+ -+#ifndef CONFIG_XEN -+ENTRY(spurious_interrupt_bug) -+ RING0_INT_FRAME -+ pushl $0 -+ CFI_ADJUST_CFA_OFFSET 4 -+ pushl $do_spurious_interrupt_bug -+ CFI_ADJUST_CFA_OFFSET 4 -+ jmp error_code -+ CFI_ENDPROC -+#endif /* !CONFIG_XEN */ -+ -+#ifdef CONFIG_STACK_UNWIND -+ENTRY(arch_unwind_init_running) -+ CFI_STARTPROC -+ movl 4(%esp), %edx -+ movl (%esp), %ecx -+ leal 4(%esp), %eax -+ movl %ebx, EBX(%edx) -+ xorl %ebx, %ebx -+ movl %ebx, ECX(%edx) -+ movl %ebx, EDX(%edx) -+ movl %esi, ESI(%edx) -+ movl %edi, EDI(%edx) -+ movl %ebp, EBP(%edx) -+ movl %ebx, EAX(%edx) -+ movl $__USER_DS, DS(%edx) -+ movl $__USER_DS, ES(%edx) -+ movl %ebx, ORIG_EAX(%edx) -+ movl %ecx, EIP(%edx) -+ movl 12(%esp), %ecx -+ movl $__KERNEL_CS, CS(%edx) -+ movl %ebx, EFLAGS(%edx) -+ movl %eax, OLDESP(%edx) -+ movl 8(%esp), %eax -+ movl %ecx, 8(%esp) -+ movl EBX(%edx), %ebx -+ movl $__KERNEL_DS, OLDSS(%edx) -+ jmpl *%eax -+ CFI_ENDPROC -+ENDPROC(arch_unwind_init_running) -+#endif -+ -+ENTRY(fixup_4gb_segment) -+ RING0_EC_FRAME -+ pushl $do_fixup_4gb_segment -+ CFI_ADJUST_CFA_OFFSET 4 -+ jmp error_code -+ CFI_ENDPROC -+ -+.section .rodata,"a" -+#include "syscall_table.S" -+ -+syscall_table_size=(.-sys_call_table) -diff -rpuN linux-2.6.18.8/arch/i386/kernel/entry.S linux-2.6.18-xen-3.2.0/arch/i386/kernel/entry.S ---- linux-2.6.18.8/arch/i386/kernel/entry.S 2007-02-23 15:52:30.000000000 -0800 -+++ linux-2.6.18-xen-3.2.0/arch/i386/kernel/entry.S 2008-02-15 16:21:49.000000000 -0800 -@@ -269,7 +269,7 @@ ENTRY(sysenter_entry) - CFI_STARTPROC simple - CFI_DEF_CFA esp, 0 - CFI_REGISTER esp, ebp -- movl TSS_sysenter_esp0(%esp),%esp -+ movl SYSENTER_stack_esp0(%esp),%esp - sysenter_past_esp: - /* - * No need to follow this irqs on/off section: the syscall -@@ -689,7 +689,7 @@ device_not_available_emulate: - * that sets up the real kernel stack. Check here, since we can't - * allow the wrong stack to be used. - * -- * "TSS_sysenter_esp0+12" is because the NMI/debug handler will have -+ * "SYSENTER_stack_esp0+12" is because the NMI/debug handler will have - * already pushed 3 words if it hits on the sysenter instruction: - * eflags, cs and eip. - * -@@ -701,7 +701,7 @@ device_not_available_emulate: - cmpw $__KERNEL_CS,4(%esp); \ - jne ok; \ - label: \ -- movl TSS_sysenter_esp0+offset(%esp),%esp; \ -+ movl SYSENTER_stack_esp0+offset(%esp),%esp; \ - pushfl; \ - pushl $__KERNEL_CS; \ - pushl $sysenter_past_esp -diff -rpuN linux-2.6.18.8/arch/i386/kernel/fixup.c linux-2.6.18-xen-3.2.0/arch/i386/kernel/fixup.c ---- linux-2.6.18.8/arch/i386/kernel/fixup.c 1969-12-31 16:00:00.000000000 -0800 -+++ linux-2.6.18-xen-3.2.0/arch/i386/kernel/fixup.c 2008-02-15 16:21:49.000000000 -0800 -@@ -0,0 +1,88 @@ -+/****************************************************************************** -+ * fixup.c -+ * -+ * Binary-rewriting of certain IA32 instructions, on notification by Xen. -+ * Used to avoid repeated slow emulation of common instructions used by the -+ * user-space TLS (Thread-Local Storage) libraries. -+ * -+ * **** NOTE **** -+ * Issues with the binary rewriting have caused it to be removed. Instead -+ * we rely on Xen's emulator to boot the kernel, and then print a banner -+ * message recommending that the user disables /lib/tls. -+ * -+ * Copyright (c) 2004, K A Fraser -+ * -+ * This program is free software; you can redistribute it and/or modify -+ * it under the terms of the GNU General Public License as published by -+ * the Free Software Foundation; either version 2 of the License, or -+ * (at your option) any later version. -+ * -+ * This program is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -+ * GNU General Public License for more details. -+ * -+ * You should have received a copy of the GNU General Public License -+ * along with this program; if not, write to the Free Software -+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -+ */ -+ -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#define DP(_f, _args...) printk(KERN_ALERT " " _f "\n" , ## _args ) -+ -+fastcall void do_fixup_4gb_segment(struct pt_regs *regs, long error_code) -+{ -+ static unsigned long printed = 0; -+ char info[100]; -+ int i; -+ -+ /* Ignore statically-linked init. */ -+ if (current->tgid == 1) -+ return; -+ -+ VOID(HYPERVISOR_vm_assist(VMASST_CMD_disable, -+ VMASST_TYPE_4gb_segments_notify)); -+ -+ if (test_and_set_bit(0, &printed)) -+ return; -+ -+ sprintf(info, "%s (pid=%d)", current->comm, current->tgid); -+ -+ DP(""); -+ DP("***************************************************************"); -+ DP("***************************************************************"); -+ DP("** WARNING: Currently emulating unsupported memory accesses **"); -+ DP("** in /lib/tls glibc libraries. The emulation is **"); -+ DP("** slow. To ensure full performance you should **"); -+ DP("** install a 'xen-friendly' (nosegneg) version of **"); -+ DP("** the library, or disable tls support by executing **"); -+ DP("** the following as root: **"); -+ DP("** mv /lib/tls /lib/tls.disabled **"); -+ DP("** Offending process: %-38.38s **", info); -+ DP("***************************************************************"); -+ DP("***************************************************************"); -+ DP(""); -+ -+ for (i = 5; i > 0; i--) { -+ touch_softlockup_watchdog(); -+ printk("Pausing... %d", i); -+ mdelay(1000); -+ printk("\b\b\b\b\b\b\b\b\b\b\b\b"); -+ } -+ -+ printk("Continuing...\n\n"); -+} -+ -+static int __init fixup_init(void) -+{ -+ WARN_ON(HYPERVISOR_vm_assist(VMASST_CMD_enable, -+ VMASST_TYPE_4gb_segments_notify)); -+ return 0; -+} -+__initcall(fixup_init); -diff -rpuN linux-2.6.18.8/arch/i386/kernel/head-xen.S linux-2.6.18-xen-3.2.0/arch/i386/kernel/head-xen.S ---- linux-2.6.18.8/arch/i386/kernel/head-xen.S 1969-12-31 16:00:00.000000000 -0800 -+++ linux-2.6.18-xen-3.2.0/arch/i386/kernel/head-xen.S 2008-02-15 16:21:49.000000000 -0800 -@@ -0,0 +1,207 @@ -+ -+ -+.text -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+/* -+ * References to members of the new_cpu_data structure. -+ */ -+ -+#define X86 new_cpu_data+CPUINFO_x86 -+#define X86_VENDOR new_cpu_data+CPUINFO_x86_vendor -+#define X86_MODEL new_cpu_data+CPUINFO_x86_model -+#define X86_MASK new_cpu_data+CPUINFO_x86_mask -+#define X86_HARD_MATH new_cpu_data+CPUINFO_hard_math -+#define X86_CPUID new_cpu_data+CPUINFO_cpuid_level -+#define X86_CAPABILITY new_cpu_data+CPUINFO_x86_capability -+#define X86_VENDOR_ID new_cpu_data+CPUINFO_x86_vendor_id -+ -+#define VIRT_ENTRY_OFFSET 0x0 -+.org VIRT_ENTRY_OFFSET -+ENTRY(startup_32) -+ movl %esi,xen_start_info -+ cld -+ -+ /* Set up the stack pointer */ -+ movl $(init_thread_union+THREAD_SIZE),%esp -+ -+ /* get vendor info */ -+ xorl %eax,%eax # call CPUID with 0 -> return vendor ID -+ XEN_CPUID -+ movl %eax,X86_CPUID # save CPUID level -+ movl %ebx,X86_VENDOR_ID # lo 4 chars -+ movl %edx,X86_VENDOR_ID+4 # next 4 chars -+ movl %ecx,X86_VENDOR_ID+8 # last 4 chars -+ -+ movl $1,%eax # Use the CPUID instruction to get CPU type -+ XEN_CPUID -+ movb %al,%cl # save reg for future use -+ andb $0x0f,%ah # mask processor family -+ movb %ah,X86 -+ andb $0xf0,%al # mask model -+ shrb $4,%al -+ movb %al,X86_MODEL -+ andb $0x0f,%cl # mask mask revision -+ movb %cl,X86_MASK -+ movl %edx,X86_CAPABILITY -+ -+ movb $1,X86_HARD_MATH -+ -+ xorl %eax,%eax # Clear FS/GS and LDT -+ movl %eax,%fs -+ movl %eax,%gs -+ cld # gcc2 wants the direction flag cleared at all times -+ -+ pushl %eax # fake return address -+ jmp start_kernel -+ -+#define HYPERCALL_PAGE_OFFSET 0x1000 -+.org HYPERCALL_PAGE_OFFSET -+ENTRY(hypercall_page) -+ CFI_STARTPROC -+.skip 0x1000 -+ CFI_ENDPROC -+ -+/* -+ * Real beginning of normal "text" segment -+ */ -+ENTRY(stext) -+ENTRY(_stext) -+ -+/* -+ * BSS section -+ */ -+.section ".bss.page_aligned","w" -+ENTRY(empty_zero_page) -+ .fill 4096,1,0 -+ -+/* -+ * This starts the data section. -+ */ -+.data -+ -+/* -+ * The Global Descriptor Table contains 28 quadwords, per-CPU. -+ */ -+ .align L1_CACHE_BYTES -+ENTRY(cpu_gdt_table) -+ .quad 0x0000000000000000 /* NULL descriptor */ -+ .quad 0x0000000000000000 /* 0x0b reserved */ -+ .quad 0x0000000000000000 /* 0x13 reserved */ -+ .quad 0x0000000000000000 /* 0x1b reserved */ -+ .quad 0x0000000000000000 /* 0x20 unused */ -+ .quad 0x0000000000000000 /* 0x28 unused */ -+ .quad 0x0000000000000000 /* 0x33 TLS entry 1 */ -+ .quad 0x0000000000000000 /* 0x3b TLS entry 2 */ -+ .quad 0x0000000000000000 /* 0x43 TLS entry 3 */ -+ .quad 0x0000000000000000 /* 0x4b reserved */ -+ .quad 0x0000000000000000 /* 0x53 reserved */ -+ .quad 0x0000000000000000 /* 0x5b reserved */ -+ -+ .quad 0x00cf9a000000ffff /* 0x60 kernel 4GB code at 0x00000000 */ -+ .quad 0x00cf92000000ffff /* 0x68 kernel 4GB data at 0x00000000 */ -+ .quad 0x00cffa000000ffff /* 0x73 user 4GB code at 0x00000000 */ -+ .quad 0x00cff2000000ffff /* 0x7b user 4GB data at 0x00000000 */ -+ -+ .quad 0x0000000000000000 /* 0x80 TSS descriptor */ -+ .quad 0x0000000000000000 /* 0x88 LDT descriptor */ -+ -+ /* -+ * Segments used for calling PnP BIOS have byte granularity. -+ * They code segments and data segments have fixed 64k limits, -+ * the transfer segment sizes are set at run time. -+ */ -+ .quad 0x0000000000000000 /* 0x90 32-bit code */ -+ .quad 0x0000000000000000 /* 0x98 16-bit code */ -+ .quad 0x0000000000000000 /* 0xa0 16-bit data */ -+ .quad 0x0000000000000000 /* 0xa8 16-bit data */ -+ .quad 0x0000000000000000 /* 0xb0 16-bit data */ -+ -+ /* -+ * The APM segments have byte granularity and their bases -+ * are set at run time. All have 64k limits. -+ */ -+ .quad 0x0000000000000000 /* 0xb8 APM CS code */ -+ .quad 0x0000000000000000 /* 0xc0 APM CS 16 code (16 bit) */ -+ .quad 0x0000000000000000 /* 0xc8 APM DS data */ -+ -+ .quad 0x0000000000000000 /* 0xd0 - ESPFIX 16-bit SS */ -+ .quad 0x0000000000000000 /* 0xd8 - unused */ -+ .quad 0x0000000000000000 /* 0xe0 - unused */ -+ .quad 0x0000000000000000 /* 0xe8 - unused */ -+ .quad 0x0000000000000000 /* 0xf0 - unused */ -+ .quad 0x0000000000000000 /* 0xf8 - GDT entry 31: double-fault TSS */ -+ -+#if CONFIG_XEN_COMPAT <= 0x030002 -+/* -+ * __xen_guest information -+ */ -+.macro utoa value -+ .if (\value) < 0 || (\value) >= 0x10 -+ utoa (((\value)>>4)&0x0fffffff) -+ .endif -+ .if ((\value) & 0xf) < 10 -+ .byte '0' + ((\value) & 0xf) -+ .else -+ .byte 'A' + ((\value) & 0xf) - 10 -+ .endif -+.endm -+ -+.section __xen_guest -+ .ascii "GUEST_OS=linux,GUEST_VER=2.6" -+ .ascii ",XEN_VER=xen-3.0" -+ .ascii ",VIRT_BASE=0x" -+ utoa __PAGE_OFFSET -+ .ascii ",ELF_PADDR_OFFSET=0x" -+ utoa __PAGE_OFFSET -+ .ascii ",VIRT_ENTRY=0x" -+ utoa (__PAGE_OFFSET + __PHYSICAL_START + VIRT_ENTRY_OFFSET) -+ .ascii ",HYPERCALL_PAGE=0x" -+ utoa ((__PHYSICAL_START+HYPERCALL_PAGE_OFFSET)>>PAGE_SHIFT) -+ .ascii ",FEATURES=writable_page_tables" -+ .ascii "|writable_descriptor_tables" -+ .ascii "|auto_translated_physmap" -+ .ascii "|pae_pgdir_above_4gb" -+ .ascii "|supervisor_mode_kernel" -+#ifdef CONFIG_X86_PAE -+ .ascii ",PAE=yes[extended-cr3]" -+#else -+ .ascii ",PAE=no" -+#endif -+ .ascii ",LOADER=generic" -+ .byte 0 -+#endif /* CONFIG_XEN_COMPAT <= 0x030002 */ -+ -+ -+ ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz, "linux") -+ ELFNOTE(Xen, XEN_ELFNOTE_GUEST_VERSION, .asciz, "2.6") -+ ELFNOTE(Xen, XEN_ELFNOTE_XEN_VERSION, .asciz, "xen-3.0") -+ ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, .long, __PAGE_OFFSET) -+#if CONFIG_XEN_COMPAT <= 0x030002 -+ ELFNOTE(Xen, XEN_ELFNOTE_PADDR_OFFSET, .long, __PAGE_OFFSET) -+#else -+ ELFNOTE(Xen, XEN_ELFNOTE_PADDR_OFFSET, .long, 0) -+#endif -+ ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, .long, startup_32) -+ ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, .long, hypercall_page) -+ ELFNOTE(Xen, XEN_ELFNOTE_HV_START_LOW, .long, HYPERVISOR_VIRT_START) -+ ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .asciz, "writable_page_tables|writable_descriptor_tables|auto_translated_physmap|pae_pgdir_above_4gb|supervisor_mode_kernel") -+#ifdef CONFIG_X86_PAE -+ ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz, "yes") -+ ELFNOTE(Xen, XEN_ELFNOTE_L1_MFN_VALID, .quad, _PAGE_PRESENT,_PAGE_PRESENT) -+#else -+ ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz, "no") -+ ELFNOTE(Xen, XEN_ELFNOTE_L1_MFN_VALID, .long, _PAGE_PRESENT,_PAGE_PRESENT) -+#endif -+ ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz, "generic") -+ ELFNOTE(Xen, XEN_ELFNOTE_SUSPEND_CANCEL, .long, 1) -diff -rpuN linux-2.6.18.8/arch/i386/kernel/init_task-xen.c linux-2.6.18-xen-3.2.0/arch/i386/kernel/init_task-xen.c ---- linux-2.6.18.8/arch/i386/kernel/init_task-xen.c 1969-12-31 16:00:00.000000000 -0800 -+++ linux-2.6.18-xen-3.2.0/arch/i386/kernel/init_task-xen.c 2008-02-15 16:21:49.000000000 -0800 -@@ -0,0 +1,51 @@ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#include -+#include -+#include -+ -+static struct fs_struct init_fs = INIT_FS; -+static struct files_struct init_files = INIT_FILES; -+static struct signal_struct init_signals = INIT_SIGNALS(init_signals); -+static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); -+ -+#define swapper_pg_dir ((pgd_t *)NULL) -+struct mm_struct init_mm = INIT_MM(init_mm); -+#undef swapper_pg_dir -+ -+EXPORT_SYMBOL(init_mm); -+ -+/* -+ * Initial thread structure. -+ * -+ * We need to make sure that this is THREAD_SIZE aligned due to the -+ * way process stacks are handled. This is done by having a special -+ * "init_task" linker map entry.. -+ */ -+union thread_union init_thread_union -+ __attribute__((__section__(".data.init_task"))) = -+ { INIT_THREAD_INFO(init_task) }; -+ -+/* -+ * Initial task structure. -+ * -+ * All other task structs will be allocated on slabs in fork.c -+ */ -+struct task_struct init_task = INIT_TASK(init_task); -+ -+EXPORT_SYMBOL(init_task); -+ -+#ifndef CONFIG_X86_NO_TSS -+/* -+ * per-CPU TSS segments. Threads are completely 'soft' on Linux, -+ * no more per-task TSS's. -+ */ -+DEFINE_PER_CPU(struct tss_struct, init_tss) ____cacheline_internodealigned_in_smp = INIT_TSS; -+#endif -+ -diff -rpuN linux-2.6.18.8/arch/i386/kernel/io_apic-xen.c linux-2.6.18-xen-3.2.0/arch/i386/kernel/io_apic-xen.c ---- linux-2.6.18.8/arch/i386/kernel/io_apic-xen.c 1969-12-31 16:00:00.000000000 -0800 -+++ linux-2.6.18-xen-3.2.0/arch/i386/kernel/io_apic-xen.c 2008-02-15 16:21:49.000000000 -0800 -@@ -0,0 +1,2770 @@ -+/* -+ * Intel IO-APIC support for multi-Pentium hosts. -+ * -+ * Copyright (C) 1997, 1998, 1999, 2000 Ingo Molnar, Hajnalka Szabo -+ * -+ * Many thanks to Stig Venaas for trying out countless experimental -+ * patches and reporting/debugging problems patiently! -+ * -+ * (c) 1999, Multiple IO-APIC support, developed by -+ * Ken-ichi Yaku and -+ * Hidemi Kishimoto , -+ * further tested and cleaned up by Zach Brown -+ * and Ingo Molnar -+ * -+ * Fixes -+ * Maciej W. Rozycki : Bits for genuine 82489DX APICs; -+ * thanks to Eric Gilmore -+ * and Rolf G. Tews -+ * for testing these extensively -+ * Paul Diefenbaugh : Added full ACPI support -+ */ -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#include -+ -+#include "io_ports.h" -+ -+#ifdef CONFIG_XEN -+ -+#include -+#include -+ -+/* Fake i8259 */ -+#define make_8259A_irq(_irq) (io_apic_irqs &= ~(1UL<<(_irq))) -+#define disable_8259A_irq(_irq) ((void)0) -+#define i8259A_irq_pending(_irq) (0) -+ -+unsigned long io_apic_irqs; -+ -+static inline unsigned int xen_io_apic_read(unsigned int apic, unsigned int reg) -+{ -+ struct physdev_apic apic_op; -+ int ret; -+ -+ apic_op.apic_physbase = mp_ioapics[apic].mpc_apicaddr; -+ apic_op.reg = reg; -+ ret = HYPERVISOR_physdev_op(PHYSDEVOP_apic_read, &apic_op); -+ if (ret) -+ return ret; -+ return apic_op.value; -+} -+ -+static inline void xen_io_apic_write(unsigned int apic, unsigned int reg, unsigned int value) -+{ -+ struct physdev_apic apic_op; -+ -+ apic_op.apic_physbase = mp_ioapics[apic].mpc_apicaddr; -+ apic_op.reg = reg; -+ apic_op.value = value; -+ WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_apic_write, &apic_op)); -+} -+ -+#define io_apic_read(a,r) xen_io_apic_read(a,r) -+#define io_apic_write(a,r,v) xen_io_apic_write(a,r,v) -+ -+#endif /* CONFIG_XEN */ -+ -+int (*ioapic_renumber_irq)(int ioapic, int irq); -+atomic_t irq_mis_count; -+ -+/* Where if anywhere is the i8259 connect in external int mode */ -+static struct { int pin, apic; } ioapic_i8259 = { -1, -1 }; -+ -+static DEFINE_SPINLOCK(ioapic_lock); -+static DEFINE_SPINLOCK(vector_lock); -+ -+int timer_over_8254 __initdata = 1; -+ -+/* -+ * Is the SiS APIC rmw bug present ? -+ * -1 = don't know, 0 = no, 1 = yes -+ */ -+int sis_apic_bug = -1; -+ -+/* -+ * # of IRQ routing registers -+ */ -+int nr_ioapic_registers[MAX_IO_APICS]; -+ -+int disable_timer_pin_1 __initdata; -+ -+/* -+ * Rough estimation of how many shared IRQs there are, can -+ * be changed anytime. -+ */ -+#define MAX_PLUS_SHARED_IRQS NR_IRQS -+#define PIN_MAP_SIZE (MAX_PLUS_SHARED_IRQS + NR_IRQS) -+ -+/* -+ * This is performance-critical, we want to do it O(1) -+ * -+ * the indexing order of this array favors 1:1 mappings -+ * between pins and IRQs. -+ */ -+ -+static struct irq_pin_list { -+ int apic, pin, next; -+} irq_2_pin[PIN_MAP_SIZE]; -+ -+int vector_irq[NR_VECTORS] __read_mostly = { [0 ... NR_VECTORS - 1] = -1}; -+#ifdef CONFIG_PCI_MSI -+#define vector_to_irq(vector) \ -+ (platform_legacy_irq(vector) ? vector : vector_irq[vector]) -+#else -+#define vector_to_irq(vector) (vector) -+#endif -+ -+/* -+ * The common case is 1:1 IRQ<->pin mappings. Sometimes there are -+ * shared ISA-space IRQs, so we have to support them. We are super -+ * fast in the common case, and fast for shared ISA-space IRQs. -+ */ -+static void add_pin_to_irq(unsigned int irq, int apic, int pin) -+{ -+ static int first_free_entry = NR_IRQS; -+ struct irq_pin_list *entry = irq_2_pin + irq; -+ -+ while (entry->next) -+ entry = irq_2_pin + entry->next; -+ -+ if (entry->pin != -1) { -+ entry->next = first_free_entry; -+ entry = irq_2_pin + entry->next; -+ if (++first_free_entry >= PIN_MAP_SIZE) -+ panic("io_apic.c: whoops"); -+ } -+ entry->apic = apic; -+ entry->pin = pin; -+} -+ -+#ifdef CONFIG_XEN -+#define clear_IO_APIC() ((void)0) -+#else -+/* -+ * Reroute an IRQ to a different pin. -+ */ -+static void __init replace_pin_at_irq(unsigned int irq, -+ int oldapic, int oldpin, -+ int newapic, int newpin) -+{ -+ struct irq_pin_list *entry = irq_2_pin + irq; -+ -+ while (1) { -+ if (entry->apic == oldapic && entry->pin == oldpin) { -+ entry->apic = newapic; -+ entry->pin = newpin; -+ } -+ if (!entry->next) -+ break; -+ entry = irq_2_pin + entry->next; -+ } -+} -+ -+static void __modify_IO_APIC_irq (unsigned int irq, unsigned long enable, unsigned long disable) -+{ -+ struct irq_pin_list *entry = irq_2_pin + irq; -+ unsigned int pin, reg; -+ -+ for (;;) { -+ pin = entry->pin; -+ if (pin == -1) -+ break; -+ reg = io_apic_read(entry->apic, 0x10 + pin*2); -+ reg &= ~disable; -+ reg |= enable; -+ io_apic_modify(entry->apic, 0x10 + pin*2, reg); -+ if (!entry->next) -+ break; -+ entry = irq_2_pin + entry->next; -+ } -+} -+ -+/* mask = 1 */ -+static void __mask_IO_APIC_irq (unsigned int irq) -+{ -+ __modify_IO_APIC_irq(irq, 0x00010000, 0); -+} -+ -+/* mask = 0 */ -+static void __unmask_IO_APIC_irq (unsigned int irq) -+{ -+ __modify_IO_APIC_irq(irq, 0, 0x00010000); -+} -+ -+/* mask = 1, trigger = 0 */ -+static void __mask_and_edge_IO_APIC_irq (unsigned int irq) -+{ -+ __modify_IO_APIC_irq(irq, 0x00010000, 0x00008000); -+} -+ -+/* mask = 0, trigger = 1 */ -+static void __unmask_and_level_IO_APIC_irq (unsigned int irq) -+{ -+ __modify_IO_APIC_irq(irq, 0x00008000, 0x00010000); -+} -+ -+static void mask_IO_APIC_irq (unsigned int irq) -+{ -+ unsigned long flags; -+ -+ spin_lock_irqsave(&ioapic_lock, flags); -+ __mask_IO_APIC_irq(irq); -+ spin_unlock_irqrestore(&ioapic_lock, flags); -+} -+ -+static void unmask_IO_APIC_irq (unsigned int irq) -+{ -+ unsigned long flags; -+ -+ spin_lock_irqsave(&ioapic_lock, flags); -+ __unmask_IO_APIC_irq(irq); -+ spin_unlock_irqrestore(&ioapic_lock, flags); -+} -+ -+static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) -+{ -+ struct IO_APIC_route_entry entry; -+ unsigned long flags; -+ -+ /* Check delivery_mode to be sure we're not clearing an SMI pin */ -+ spin_lock_irqsave(&ioapic_lock, flags); -+ *(((int*)&entry) + 0) = io_apic_read(apic, 0x10 + 2 * pin); -+ *(((int*)&entry) + 1) = io_apic_read(apic, 0x11 + 2 * pin); -+ spin_unlock_irqrestore(&ioapic_lock, flags); -+ if (entry.delivery_mode == dest_SMI) -+ return; -+ -+ /* -+ * Disable it in the IO-APIC irq-routing table: -+ */ -+ memset(&entry, 0, sizeof(entry)); -+ entry.mask = 1; -+ spin_lock_irqsave(&ioapic_lock, flags); -+ io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry) + 0)); -+ io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry) + 1)); -+ spin_unlock_irqrestore(&ioapic_lock, flags); -+} -+ -+static void clear_IO_APIC (void) -+{ -+ int apic, pin; -+ -+ for (apic = 0; apic < nr_ioapics; apic++) -+ for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) -+ clear_IO_APIC_pin(apic, pin); -+} -+ -+#ifdef CONFIG_SMP -+static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t cpumask) -+{ -+ unsigned long flags; -+ int pin; -+ struct irq_pin_list *entry = irq_2_pin + irq; -+ unsigned int apicid_value; -+ cpumask_t tmp; -+ -+ cpus_and(tmp, cpumask, cpu_online_map); -+ if (cpus_empty(tmp)) -+ tmp = TARGET_CPUS; -+ -+ cpus_and(cpumask, tmp, CPU_MASK_ALL); -+ -+ apicid_value = cpu_mask_to_apicid(cpumask); -+ /* Prepare to do the io_apic_write */ -+ apicid_value = apicid_value << 24; -+ spin_lock_irqsave(&ioapic_lock, flags); -+ for (;;) { -+ pin = entry->pin; -+ if (pin == -1) -+ break; -+ io_apic_write(entry->apic, 0x10 + 1 + pin*2, apicid_value); -+ if (!entry->next) -+ break; -+ entry = irq_2_pin + entry->next; -+ } -+ set_irq_info(irq, cpumask); -+ spin_unlock_irqrestore(&ioapic_lock, flags); -+} -+ -+#if defined(CONFIG_IRQBALANCE) -+# include /* kernel_thread() */ -+# include /* kstat */ -+# include /* kmalloc() */ -+# include /* time_after() */ -+ -+#ifdef CONFIG_BALANCED_IRQ_DEBUG -+# define TDprintk(x...) do { printk("<%ld:%s:%d>: ", jiffies, __FILE__, __LINE__); printk(x); } while (0) -+# define Dprintk(x...) do { TDprintk(x); } while (0) -+# else -+# define TDprintk(x...) -+# define Dprintk(x...) -+# endif -+ -+#define IRQBALANCE_CHECK_ARCH -999 -+#define MAX_BALANCED_IRQ_INTERVAL (5*HZ) -+#define MIN_BALANCED_IRQ_INTERVAL (HZ/2) -+#define BALANCED_IRQ_MORE_DELTA (HZ/10) -+#define BALANCED_IRQ_LESS_DELTA (HZ) -+ -+static int irqbalance_disabled __read_mostly = IRQBALANCE_CHECK_ARCH; -+static int physical_balance __read_mostly; -+static long balanced_irq_interval __read_mostly = MAX_BALANCED_IRQ_INTERVAL; -+ -+static struct irq_cpu_info { -+ unsigned long * last_irq; -+ unsigned long * irq_delta; -+ unsigned long irq; -+} irq_cpu_data[NR_CPUS]; -+ -+#define CPU_IRQ(cpu) (irq_cpu_data[cpu].irq) -+#define LAST_CPU_IRQ(cpu,irq) (irq_cpu_data[cpu].last_irq[irq]) -+#define IRQ_DELTA(cpu,irq) (irq_cpu_data[cpu].irq_delta[irq]) -+ -+#define IDLE_ENOUGH(cpu,now) \ -+ (idle_cpu(cpu) && ((now) - per_cpu(irq_stat, (cpu)).idle_timestamp > 1)) -+ -+#define IRQ_ALLOWED(cpu, allowed_mask) cpu_isset(cpu, allowed_mask) -+ -+#define CPU_TO_PACKAGEINDEX(i) (first_cpu(cpu_sibling_map[i])) -+ -+static cpumask_t balance_irq_affinity[NR_IRQS] = { -+ [0 ... NR_IRQS-1] = CPU_MASK_ALL -+}; -+ -+void set_balance_irq_affinity(unsigned int irq, cpumask_t mask) -+{ -+ balance_irq_affinity[irq] = mask; -+} -+ -+static unsigned long move(int curr_cpu, cpumask_t allowed_mask, -+ unsigned long now, int direction) -+{ -+ int search_idle = 1; -+ int cpu = curr_cpu; -+ -+ goto inside; -+ -+ do { -+ if (unlikely(cpu == curr_cpu)) -+ search_idle = 0; -+inside: -+ if (direction == 1) { -+ cpu++; -+ if (cpu >= NR_CPUS) -+ cpu = 0; -+ } else { -+ cpu--; -+ if (cpu == -1) -+ cpu = NR_CPUS-1; -+ } -+ } while (!cpu_online(cpu) || !IRQ_ALLOWED(cpu,allowed_mask) || -+ (search_idle && !IDLE_ENOUGH(cpu,now))); -+ -+ return cpu; -+} -+ -+static inline void balance_irq(int cpu, int irq) -+{ -+ unsigned long now = jiffies; -+ cpumask_t allowed_mask; -+ unsigned int new_cpu; -+ -+ if (irqbalance_disabled) -+ return; -+ -+ cpus_and(allowed_mask, cpu_online_map, balance_irq_affinity[irq]); -+ new_cpu = move(cpu, allowed_mask, now, 1); -+ if (cpu != new_cpu) { -+ set_pending_irq(irq, cpumask_of_cpu(new_cpu)); -+ } -+} -+ -+static inline void rotate_irqs_among_cpus(unsigned long useful_load_threshold) -+{ -+ int i, j; -+ Dprintk("Rotating IRQs among CPUs.\n"); -+ for_each_online_cpu(i) { -+ for (j = 0; j < NR_IRQS; j++) { -+ if (!irq_desc[j].action) -+ continue; -+ /* Is it a significant load ? */ -+ if (IRQ_DELTA(CPU_TO_PACKAGEINDEX(i),j) < -+ useful_load_threshold) -+ continue; -+ balance_irq(i, j); -+ } -+ } -+ balanced_irq_interval = max((long)MIN_BALANCED_IRQ_INTERVAL, -+ balanced_irq_interval - BALANCED_IRQ_LESS_DELTA); -+ return; -+} -+ -+static void do_irq_balance(void) -+{ -+ int i, j; -+ unsigned long max_cpu_irq = 0, min_cpu_irq = (~0); -+ unsigned long move_this_load = 0; -+ int max_loaded = 0, min_loaded = 0; -+ int load; -+ unsigned long useful_load_threshold = balanced_irq_interval + 10; -+ int selected_irq; -+ int tmp_loaded, first_attempt = 1; -+ unsigned long tmp_cpu_irq; -+ unsigned long imbalance = 0; -+ cpumask_t allowed_mask, target_cpu_mask, tmp; -+ -+ for_each_possible_cpu(i) { -+ int package_index; -+ CPU_IRQ(i) = 0; -+ if (!cpu_online(i)) -+ continue; -+ package_index = CPU_TO_PACKAGEINDEX(i); -+ for (j = 0; j < NR_IRQS; j++) { -+ unsigned long value_now, delta; -+ /* Is this an active IRQ? */ -+ if (!irq_desc[j].action) -+ continue; -+ if ( package_index == i ) -+ IRQ_DELTA(package_index,j) = 0; -+ /* Determine the total count per processor per IRQ */ -+ value_now = (unsigned long) kstat_cpu(i).irqs[j]; -+ -+ /* Determine the activity per processor per IRQ */ -+ delta = value_now - LAST_CPU_IRQ(i,j); -+ -+ /* Update last_cpu_irq[][] for the next time */ -+ LAST_CPU_IRQ(i,j) = value_now; -+ -+ /* Ignore IRQs whose rate is less than the clock */ -+ if (delta < useful_load_threshold) -+ continue; -+ /* update the load for the processor or package total */ -+ IRQ_DELTA(package_index,j) += delta; -+ -+ /* Keep track of the higher numbered sibling as well */ -+ if (i != package_index) -+ CPU_IRQ(i) += delta; -+ /* -+ * We have sibling A and sibling B in the package -+ * -+ * cpu_irq[A] = load for cpu A + load for cpu B -+ * cpu_irq[B] = load for cpu B -+ */ -+ CPU_IRQ(package_index) += delta; -+ } -+ } -+ /* Find the least loaded processor package */ -+ for_each_online_cpu(i) { -+ if (i != CPU_TO_PACKAGEINDEX(i)) -+ continue; -+ if (min_cpu_irq > CPU_IRQ(i)) { -+ min_cpu_irq = CPU_IRQ(i); -+ min_loaded = i; -+ } -+ } -+ max_cpu_irq = ULONG_MAX; -+ -+tryanothercpu: -+ /* Look for heaviest loaded processor. -+ * We may come back to get the next heaviest loaded processor. -+ * Skip processors with trivial loads. -+ */ -+ tmp_cpu_irq = 0; -+ tmp_loaded = -1; -+ for_each_online_cpu(i) { -+ if (i != CPU_TO_PACKAGEINDEX(i)) -+ continue; -+ if (max_cpu_irq <= CPU_IRQ(i)) -+ continue; -+ if (tmp_cpu_irq < CPU_IRQ(i)) { -+ tmp_cpu_irq = CPU_IRQ(i); -+ tmp_loaded = i; -+ } -+ } -+ -+ if (tmp_loaded == -1) { -+ /* In the case of small number of heavy interrupt sources, -+ * loading some of the cpus too much. We use Ingo's original -+ * approach to rotate them around. -+ */ -+ if (!first_attempt && imbalance >= useful_load_threshold) { -+ rotate_irqs_among_cpus(useful_load_threshold); -+ return; -+ } -+ goto not_worth_the_effort; -+ } -+ -+ first_attempt = 0; /* heaviest search */ -+ max_cpu_irq = tmp_cpu_irq; /* load */ -+ max_loaded = tmp_loaded; /* processor */ -+ imbalance = (max_cpu_irq - min_cpu_irq) / 2; -+ -+ Dprintk("max_loaded cpu = %d\n", max_loaded); -+ Dprintk("min_loaded cpu = %d\n", min_loaded); -+ Dprintk("max_cpu_irq load = %ld\n", max_cpu_irq); -+ Dprintk("min_cpu_irq load = %ld\n", min_cpu_irq); -+ Dprintk("load imbalance = %lu\n", imbalance); -+ -+ /* if imbalance is less than approx 10% of max load, then -+ * observe diminishing returns action. - quit -+ */ -+ if (imbalance < (max_cpu_irq >> 3)) { -+ Dprintk("Imbalance too trivial\n"); -+ goto not_worth_the_effort; -+ } -+ -+tryanotherirq: -+ /* if we select an IRQ to move that can't go where we want, then -+ * see if there is another one to try. -+ */ -+ move_this_load = 0; -+ selected_irq = -1; -+ for (j = 0; j < NR_IRQS; j++) { -+ /* Is this an active IRQ? */ -+ if (!irq_desc[j].action) -+ continue; -+ if (imbalance <= IRQ_DELTA(max_loaded,j)) -+ continue; -+ /* Try to find the IRQ that is closest to the imbalance -+ * without going over. -+ */ -+ if (move_this_load < IRQ_DELTA(max_loaded,j)) { -+ move_this_load = IRQ_DELTA(max_loaded,j); -+ selected_irq = j; -+ } -+ } -+ if (selected_irq == -1) { -+ goto tryanothercpu; -+ } -+ -+ imbalance = move_this_load; -+ -+ /* For physical_balance case, we accumlated both load -+ * values in the one of the siblings cpu_irq[], -+ * to use the same code for physical and logical processors -+ * as much as possible. -+ * -+ * NOTE: the cpu_irq[] array holds the sum of the load for -+ * sibling A and sibling B in the slot for the lowest numbered -+ * sibling (A), _AND_ the load for sibling B in the slot for -+ * the higher numbered sibling. -+ * -+ * We seek the least loaded sibling by making the comparison -+ * (A+B)/2 vs B -+ */ -+ load = CPU_IRQ(min_loaded) >> 1; -+ for_each_cpu_mask(j, cpu_sibling_map[min_loaded]) { -+ if (load > CPU_IRQ(j)) { -+ /* This won't change cpu_sibling_map[min_loaded] */ -+ load = CPU_IRQ(j); -+ min_loaded = j; -+ } -+ } -+ -+ cpus_and(allowed_mask, -+ cpu_online_map, -+ balance_irq_affinity[selected_irq]); -+ target_cpu_mask = cpumask_of_cpu(min_loaded); -+ cpus_and(tmp, target_cpu_mask, allowed_mask); -+ -+ if (!cpus_empty(tmp)) { -+ -+ Dprintk("irq = %d moved to cpu = %d\n", -+ selected_irq, min_loaded); -+ /* mark for change destination */ -+ set_pending_irq(selected_irq, cpumask_of_cpu(min_loaded)); -+ -+ /* Since we made a change, come back sooner to -+ * check for more variation. -+ */ -+ balanced_irq_interval = max((long)MIN_BALANCED_IRQ_INTERVAL, -+ balanced_irq_interval - BALANCED_IRQ_LESS_DELTA); -+ return; -+ } -+ goto tryanotherirq; -+ -+not_worth_the_effort: -+ /* -+ * if we did not find an IRQ to move, then adjust the time interval -+ * upward -+ */ -+ balanced_irq_interval = min((long)MAX_BALANCED_IRQ_INTERVAL, -+ balanced_irq_interval + BALANCED_IRQ_MORE_DELTA); -+ Dprintk("IRQ worth rotating not found\n"); -+ return; -+} -+ -+static int balanced_irq(void *unused) -+{ -+ int i; -+ unsigned long prev_balance_time = jiffies; -+ long time_remaining = balanced_irq_interval; -+ -+ daemonize("kirqd"); -+ -+ /* push everything to CPU 0 to give us a starting point. */ -+ for (i = 0 ; i < NR_IRQS ; i++) { -+ irq_desc[i].pending_mask = cpumask_of_cpu(0); -+ set_pending_irq(i, cpumask_of_cpu(0)); -+ } -+ -+ for ( ; ; ) { -+ time_remaining = schedule_timeout_interruptible(time_remaining); -+ try_to_freeze(); -+ if (time_after(jiffies, -+ prev_balance_time+balanced_irq_interval)) { -+ preempt_disable(); -+ do_irq_balance(); -+ prev_balance_time = jiffies; -+ time_remaining = balanced_irq_interval; -+ preempt_enable(); -+ } -+ } -+ return 0; -+} -+ -+static int __init balanced_irq_init(void) -+{ -+ int i; -+ struct cpuinfo_x86 *c; -+ cpumask_t tmp; -+ -+ cpus_shift_right(tmp, cpu_online_map, 2); -+ c = &boot_cpu_data; -+ /* When not overwritten by the command line ask subarchitecture. */ -+ if (irqbalance_disabled == IRQBALANCE_CHECK_ARCH) -+ irqbalance_disabled = NO_BALANCE_IRQ; -+ if (irqbalance_disabled) -+ return 0; -+ -+ /* disable irqbalance completely if there is only one processor online */ -+ if (num_online_cpus() < 2) { -+ irqbalance_disabled = 1; -+ return 0; -+ } -+ /* -+ * Enable physical balance only if more than 1 physical processor -+ * is present -+ */ -+ if (smp_num_siblings > 1 && !cpus_empty(tmp)) -+ physical_balance = 1; -+ -+ for_each_online_cpu(i) { -+ irq_cpu_data[i].irq_delta = kmalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL); -+ irq_cpu_data[i].last_irq = kmalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL); -+ if (irq_cpu_data[i].irq_delta == NULL || irq_cpu_data[i].last_irq == NULL) { -+ printk(KERN_ERR "balanced_irq_init: out of memory"); -+ goto failed; -+ } -+ memset(irq_cpu_data[i].irq_delta,0,sizeof(unsigned long) * NR_IRQS); -+ memset(irq_cpu_data[i].last_irq,0,sizeof(unsigned long) * NR_IRQS); -+ } -+ -+ printk(KERN_INFO "Starting balanced_irq\n"); -+ if (kernel_thread(balanced_irq, NULL, CLONE_KERNEL) >= 0) -+ return 0; -+ else -+ printk(KERN_ERR "balanced_irq_init: failed to spawn balanced_irq"); -+failed: -+ for_each_possible_cpu(i) { -+ kfree(irq_cpu_data[i].irq_delta); -+ irq_cpu_data[i].irq_delta = NULL; -+ kfree(irq_cpu_data[i].last_irq); -+ irq_cpu_data[i].last_irq = NULL; -+ } -+ return 0; -+} -+ -+int __init irqbalance_disable(char *str) -+{ -+ irqbalance_disabled = 1; -+ return 1; -+} -+ -+__setup("noirqbalance", irqbalance_disable); -+ -+late_initcall(balanced_irq_init); -+#endif /* CONFIG_IRQBALANCE */ -+#endif /* CONFIG_SMP */ -+#endif -+ -+#ifndef CONFIG_SMP -+void fastcall send_IPI_self(int vector) -+{ -+#ifndef CONFIG_XEN -+ unsigned int cfg; -+ -+ /* -+ * Wait for idle. -+ */ -+ apic_wait_icr_idle(); -+ cfg = APIC_DM_FIXED | APIC_DEST_SELF | vector | APIC_DEST_LOGICAL; -+ /* -+ * Send the IPI. The write to APIC_ICR fires this off. -+ */ -+ apic_write_around(APIC_ICR, cfg); -+#endif -+} -+#endif /* !CONFIG_SMP */ -+ -+ -+/* -+ * support for broken MP BIOSs, enables hand-redirection of PIRQ0-7 to -+ * specific CPU-side IRQs. -+ */ -+ -+#define MAX_PIRQS 8 -+static int pirq_entries [MAX_PIRQS]; -+static int pirqs_enabled; -+int skip_ioapic_setup; -+ -+static int __init ioapic_setup(char *str) -+{ -+ skip_ioapic_setup = 1; -+ return 1; -+} -+ -+__setup("noapic", ioapic_setup); -+ -+static int __init ioapic_pirq_setup(char *str) -+{ -+ int i, max; -+ int ints[MAX_PIRQS+1]; -+ -+ get_options(str, ARRAY_SIZE(ints), ints); -+ -+ for (i = 0; i < MAX_PIRQS; i++) -+ pirq_entries[i] = -1; -+ -+ pirqs_enabled = 1; -+ apic_printk(APIC_VERBOSE, KERN_INFO -+ "PIRQ redirection, working around broken MP-BIOS.\n"); -+ max = MAX_PIRQS; -+ if (ints[0] < MAX_PIRQS) -+ max = ints[0]; -+ -+ for (i = 0; i < max; i++) { -+ apic_printk(APIC_VERBOSE, KERN_DEBUG -+ "... PIRQ%d -> IRQ %d\n", i, ints[i+1]); -+ /* -+ * PIRQs are mapped upside down, usually. -+ */ -+ pirq_entries[MAX_PIRQS-i-1] = ints[i+1]; -+ } -+ return 1; -+} -+ -+__setup("pirq=", ioapic_pirq_setup); -+ -+/* -+ * Find the IRQ entry number of a certain pin. -+ */ -+static int find_irq_entry(int apic, int pin, int type) -+{ -+ int i; -+ -+ for (i = 0; i < mp_irq_entries; i++) -+ if (mp_irqs[i].mpc_irqtype == type && -+ (mp_irqs[i].mpc_dstapic == mp_ioapics[apic].mpc_apicid || -+ mp_irqs[i].mpc_dstapic == MP_APIC_ALL) && -+ mp_irqs[i].mpc_dstirq == pin) -+ return i; -+ -+ return -1; -+} -+ -+/* -+ * Find the pin to which IRQ[irq] (ISA) is connected -+ */ -+static int __init find_isa_irq_pin(int irq, int type) -+{ -+ int i; -+ -+ for (i = 0; i < mp_irq_entries; i++) { -+ int lbus = mp_irqs[i].mpc_srcbus; -+ -+ if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA || -+ mp_bus_id_to_type[lbus] == MP_BUS_EISA || -+ mp_bus_id_to_type[lbus] == MP_BUS_MCA || -+ mp_bus_id_to_type[lbus] == MP_BUS_NEC98 -+ ) && -+ (mp_irqs[i].mpc_irqtype == type) && -+ (mp_irqs[i].mpc_srcbusirq == irq)) -+ -+ return mp_irqs[i].mpc_dstirq; -+ } -+ return -1; -+} -+ -+static int __init find_isa_irq_apic(int irq, int type) -+{ -+ int i; -+ -+ for (i = 0; i < mp_irq_entries; i++) { -+ int lbus = mp_irqs[i].mpc_srcbus; -+ -+ if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA || -+ mp_bus_id_to_type[lbus] == MP_BUS_EISA || -+ mp_bus_id_to_type[lbus] == MP_BUS_MCA || -+ mp_bus_id_to_type[lbus] == MP_BUS_NEC98 -+ ) && -+ (mp_irqs[i].mpc_irqtype == type) && -+ (mp_irqs[i].mpc_srcbusirq == irq)) -+ break; -+ } -+ if (i < mp_irq_entries) { -+ int apic; -+ for(apic = 0; apic < nr_ioapics; apic++) { -+ if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic) -+ return apic; -+ } -+ } -+ -+ return -1; -+} -+ -+/* -+ * Find a specific PCI IRQ entry. -+ * Not an __init, possibly needed by modules -+ */ -+static int pin_2_irq(int idx, int apic, int pin); -+ -+int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin) -+{ -+ int apic, i, best_guess = -1; -+ -+ apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, " -+ "slot:%d, pin:%d.\n", bus, slot, pin); -+ if (mp_bus_id_to_pci_bus[bus] == -1) { -+ printk(KERN_WARNING "PCI BIOS passed nonexistent PCI bus %d!\n", bus); -+ return -1; -+ } -+ for (i = 0; i < mp_irq_entries; i++) { -+ int lbus = mp_irqs[i].mpc_srcbus; -+ -+ for (apic = 0; apic < nr_ioapics; apic++) -+ if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic || -+ mp_irqs[i].mpc_dstapic == MP_APIC_ALL) -+ break; -+ -+ if ((mp_bus_id_to_type[lbus] == MP_BUS_PCI) && -+ !mp_irqs[i].mpc_irqtype && -+ (bus == lbus) && -+ (slot == ((mp_irqs[i].mpc_srcbusirq >> 2) & 0x1f))) { -+ int irq = pin_2_irq(i,apic,mp_irqs[i].mpc_dstirq); -+ -+ if (!(apic || IO_APIC_IRQ(irq))) -+ continue; -+ -+ if (pin == (mp_irqs[i].mpc_srcbusirq & 3)) -+ return irq; -+ /* -+ * Use the first all-but-pin matching entry as a -+ * best-guess fuzzy result for broken mptables. -+ */ -+ if (best_guess < 0) -+ best_guess = irq; -+ } -+ } -+ return best_guess; -+} -+EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector); -+ -+/* -+ * This function currently is only a helper for the i386 smp boot process where -+ * we need to reprogram the ioredtbls to cater for the cpus which have come online -+ * so mask in all cases should simply be TARGET_CPUS -+ */ -+#ifdef CONFIG_SMP -+#ifndef CONFIG_XEN -+void __init setup_ioapic_dest(void) -+{ -+ int pin, ioapic, irq, irq_entry; -+ -+ if (skip_ioapic_setup == 1) -+ return; -+ -+ for (ioapic = 0; ioapic < nr_ioapics; ioapic++) { -+ for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) { -+ irq_entry = find_irq_entry(ioapic, pin, mp_INT); -+ if (irq_entry == -1) -+ continue; -+ irq = pin_2_irq(irq_entry, ioapic, pin); -+ set_ioapic_affinity_irq(irq, TARGET_CPUS); -+ } -+ -+ } -+} -+#endif /* !CONFIG_XEN */ -+#endif -+ -+/* -+ * EISA Edge/Level control register, ELCR -+ */ -+static int EISA_ELCR(unsigned int irq) -+{ -+ if (irq < 16) { -+ unsigned int port = 0x4d0 + (irq >> 3); -+ return (inb(port) >> (irq & 7)) & 1; -+ } -+ apic_printk(APIC_VERBOSE, KERN_INFO -+ "Broken MPtable reports ISA irq %d\n", irq); -+ return 0; -+} -+ -+/* EISA interrupts are always polarity zero and can be edge or level -+ * trigger depending on the ELCR value. If an interrupt is listed as -+ * EISA conforming in the MP table, that means its trigger type must -+ * be read in from the ELCR */ -+ -+#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].mpc_srcbusirq)) -+#define default_EISA_polarity(idx) (0) -+ -+/* ISA interrupts are always polarity zero edge triggered, -+ * when listed as conforming in the MP table. */ -+ -+#define default_ISA_trigger(idx) (0) -+#define default_ISA_polarity(idx) (0) -+ -+/* PCI interrupts are always polarity one level triggered, -+ * when listed as conforming in the MP table. */ -+ -+#define default_PCI_trigger(idx) (1) -+#define default_PCI_polarity(idx) (1) -+ -+/* MCA interrupts are always polarity zero level triggered, -+ * when listed as conforming in the MP table. */ -+ -+#define default_MCA_trigger(idx) (1) -+#define default_MCA_polarity(idx) (0) -+ -+/* NEC98 interrupts are always polarity zero edge triggered, -+ * when listed as conforming in the MP table. */ -+ -+#define default_NEC98_trigger(idx) (0) -+#define default_NEC98_polarity(idx) (0) -+ -+static int __init MPBIOS_polarity(int idx) -+{ -+ int bus = mp_irqs[idx].mpc_srcbus; -+ int polarity; -+ -+ /* -+ * Determine IRQ line polarity (high active or low active): -+ */ -+ switch (mp_irqs[idx].mpc_irqflag & 3) -+ { -+ case 0: /* conforms, ie. bus-type dependent polarity */ -+ { -+ switch (mp_bus_id_to_type[bus]) -+ { -+ case MP_BUS_ISA: /* ISA pin */ -+ { -+ polarity = default_ISA_polarity(idx); -+ break; -+ } -+ case MP_BUS_EISA: /* EISA pin */ -+ { -+ polarity = default_EISA_polarity(idx); -+ break; -+ } -+ case MP_BUS_PCI: /* PCI pin */ -+ { -+ polarity = default_PCI_polarity(idx); -+ break; -+ } -+ case MP_BUS_MCA: /* MCA pin */ -+ { -+ polarity = default_MCA_polarity(idx); -+ break; -+ } -+ case MP_BUS_NEC98: /* NEC 98 pin */ -+ { -+ polarity = default_NEC98_polarity(idx); -+ break; -+ } -+ default: -+ { -+ printk(KERN_WARNING "broken BIOS!!\n"); -+ polarity = 1; -+ break; -+ } -+ } -+ break; -+ } -+ case 1: /* high active */ -+ { -+ polarity = 0; -+ break; -+ } -+ case 2: /* reserved */ -+ { -+ printk(KERN_WARNING "broken BIOS!!\n"); -+ polarity = 1; -+ break; -+ } -+ case 3: /* low active */ -+ { -+ polarity = 1; -+ break; -+ } -+ default: /* invalid */ -+ { -+ printk(KERN_WARNING "broken BIOS!!\n"); -+ polarity = 1; -+ break; -+ } -+ } -+ return polarity; -+} -+ -+static int MPBIOS_trigger(int idx) -+{ -+ int bus = mp_irqs[idx].mpc_srcbus; -+ int trigger; -+ -+ /* -+ * Determine IRQ trigger mode (edge or level sensitive): -+ */ -+ switch ((mp_irqs[idx].mpc_irqflag>>2) & 3) -+ { -+ case 0: /* conforms, ie. bus-type dependent */ -+ { -+ switch (mp_bus_id_to_type[bus]) -+ { -+ case MP_BUS_ISA: /* ISA pin */ -+ { -+ trigger = default_ISA_trigger(idx); -+ break; -+ } -+ case MP_BUS_EISA: /* EISA pin */ -+ { -+ trigger = default_EISA_trigger(idx); -+ break; -+ } -+ case MP_BUS_PCI: /* PCI pin */ -+ { -+ trigger = default_PCI_trigger(idx); -+ break; -+ } -+ case MP_BUS_MCA: /* MCA pin */ -+ { -+ trigger = default_MCA_trigger(idx); -+ break; -+ } -+ case MP_BUS_NEC98: /* NEC 98 pin */ -+ { -+ trigger = default_NEC98_trigger(idx); -+ break; -+ } -+ default: -+ { -+ printk(KERN_WARNING "broken BIOS!!\n"); -+ trigger = 1; -+ break; -+ } -+ } -+ break; -+ } -+ case 1: /* edge */ -+ { -+ trigger = 0; -+ break; -+ } -+ case 2: /* reserved */ -+ { -+ printk(KERN_WARNING "broken BIOS!!\n"); -+ trigger = 1; -+ break; -+ } -+ case 3: /* level */ -+ { -+ trigger = 1; -+ break; -+ } -+ default: /* invalid */ -+ { -+ printk(KERN_WARNING "broken BIOS!!\n"); -+ trigger = 0; -+ break; -+ } -+ } -+ return trigger; -+} -+ -+static inline int irq_polarity(int idx) -+{ -+ return MPBIOS_polarity(idx); -+} -+ -+static inline int irq_trigger(int idx) -+{ -+ return MPBIOS_trigger(idx); -+} -+ -+static int pin_2_irq(int idx, int apic, int pin) -+{ -+ int irq, i; -+ int bus = mp_irqs[idx].mpc_srcbus; -+ -+ /* -+ * Debugging check, we are in big trouble if this message pops up! -+ */ -+ if (mp_irqs[idx].mpc_dstirq != pin) -+ printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n"); -+ -+ switch (mp_bus_id_to_type[bus]) -+ { -+ case MP_BUS_ISA: /* ISA pin */ -+ case MP_BUS_EISA: -+ case MP_BUS_MCA: -+ case MP_BUS_NEC98: -+ { -+ irq = mp_irqs[idx].mpc_srcbusirq; -+ break; -+ } -+ case MP_BUS_PCI: /* PCI pin */ -+ { -+ /* -+ * PCI IRQs are mapped in order -+ */ -+ i = irq = 0; -+ while (i < apic) -+ irq += nr_ioapic_registers[i++]; -+ irq += pin; -+ -+ /* -+ * For MPS mode, so far only needed by ES7000 platform -+ */ -+ if (ioapic_renumber_irq) -+ irq = ioapic_renumber_irq(apic, irq); -+ -+ break; -+ } -+ default: -+ { -+ printk(KERN_ERR "unknown bus type %d.\n",bus); -+ irq = 0; -+ break; -+ } -+ } -+ -+ /* -+ * PCI IRQ command line redirection. Yes, limits are hardcoded. -+ */ -+ if ((pin >= 16) && (pin <= 23)) { -+ if (pirq_entries[pin-16] != -1) { -+ if (!pirq_entries[pin-16]) { -+ apic_printk(APIC_VERBOSE, KERN_DEBUG -+ "disabling PIRQ%d\n", pin-16); -+ } else { -+ irq = pirq_entries[pin-16]; -+ apic_printk(APIC_VERBOSE, KERN_DEBUG -+ "using PIRQ%d -> IRQ %d\n", -+ pin-16, irq); -+ } -+ } -+ } -+ return irq; -+} -+ -+static inline int IO_APIC_irq_trigger(int irq) -+{ -+ int apic, idx, pin; -+ -+ for (apic = 0; apic < nr_ioapics; apic++) { -+ for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { -+ idx = find_irq_entry(apic,pin,mp_INT); -+ if ((idx != -1) && (irq == pin_2_irq(idx,apic,pin))) -+ return irq_trigger(idx); -+ } -+ } -+ /* -+ * nonexistent IRQs are edge default -+ */ -+ return 0; -+} -+ -+/* irq_vectors is indexed by the sum of all RTEs in all I/O APICs. */ -+u8 irq_vector[NR_IRQ_VECTORS] __read_mostly; /* = { FIRST_DEVICE_VECTOR , 0 }; */ -+ -+int assign_irq_vector(int irq) -+{ -+ unsigned long flags; -+ int vector; -+ struct physdev_irq irq_op; -+ -+ BUG_ON(irq != AUTO_ASSIGN && (unsigned)irq >= NR_IRQ_VECTORS); -+ -+ spin_lock_irqsave(&vector_lock, flags); -+ -+ if (irq != AUTO_ASSIGN && IO_APIC_VECTOR(irq) > 0) { -+ spin_unlock_irqrestore(&vector_lock, flags); -+ return IO_APIC_VECTOR(irq); -+ } -+ -+ irq_op.irq = irq; -+ if (HYPERVISOR_physdev_op(PHYSDEVOP_alloc_irq_vector, &irq_op)) { -+ spin_unlock_irqrestore(&vector_lock, flags); -+ return -ENOSPC; -+ } -+ -+ vector = irq_op.vector; -+ vector_irq[vector] = irq; -+ if (irq != AUTO_ASSIGN) -+ IO_APIC_VECTOR(irq) = vector; -+ -+ spin_unlock_irqrestore(&vector_lock, flags); -+ -+ return vector; -+} -+ -+#ifndef CONFIG_XEN -+static struct hw_interrupt_type ioapic_level_type; -+static struct hw_interrupt_type ioapic_edge_type; -+ -+#define IOAPIC_AUTO -1 -+#define IOAPIC_EDGE 0 -+#define IOAPIC_LEVEL 1 -+ -+static void ioapic_register_intr(int irq, int vector, unsigned long trigger) -+{ -+ unsigned idx; -+ -+ idx = use_pci_vector() && !platform_legacy_irq(irq) ? vector : irq; -+ -+ if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || -+ trigger == IOAPIC_LEVEL) -+ irq_desc[idx].chip = &ioapic_level_type; -+ else -+ irq_desc[idx].chip = &ioapic_edge_type; -+ set_intr_gate(vector, interrupt[idx]); -+} -+#else -+#define ioapic_register_intr(_irq,_vector,_trigger) ((void)0) -+#endif -+ -+static void __init setup_IO_APIC_irqs(void) -+{ -+ struct IO_APIC_route_entry entry; -+ int apic, pin, idx, irq, first_notcon = 1, vector; -+ unsigned long flags; -+ -+ apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n"); -+ -+ for (apic = 0; apic < nr_ioapics; apic++) { -+ for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { -+ -+ /* -+ * add it to the IO-APIC irq-routing table: -+ */ -+ memset(&entry,0,sizeof(entry)); -+ -+ entry.delivery_mode = INT_DELIVERY_MODE; -+ entry.dest_mode = INT_DEST_MODE; -+ entry.mask = 0; /* enable IRQ */ -+ entry.dest.logical.logical_dest = -+ cpu_mask_to_apicid(TARGET_CPUS); -+ -+ idx = find_irq_entry(apic,pin,mp_INT); -+ if (idx == -1) { -+ if (first_notcon) { -+ apic_printk(APIC_VERBOSE, KERN_DEBUG -+ " IO-APIC (apicid-pin) %d-%d", -+ mp_ioapics[apic].mpc_apicid, -+ pin); -+ first_notcon = 0; -+ } else -+ apic_printk(APIC_VERBOSE, ", %d-%d", -+ mp_ioapics[apic].mpc_apicid, pin); -+ continue; -+ } -+ -+ entry.trigger = irq_trigger(idx); -+ entry.polarity = irq_polarity(idx); -+ -+ if (irq_trigger(idx)) { -+ entry.trigger = 1; -+ entry.mask = 1; -+ } -+ -+ irq = pin_2_irq(idx, apic, pin); -+ /* -+ * skip adding the timer int on secondary nodes, which causes -+ * a small but painful rift in the time-space continuum -+ */ -+ if (multi_timer_check(apic, irq)) -+ continue; -+ else -+ add_pin_to_irq(irq, apic, pin); -+ -+ if (/*!apic &&*/ !IO_APIC_IRQ(irq)) -+ continue; -+ -+ if (IO_APIC_IRQ(irq)) { -+ vector = assign_irq_vector(irq); -+ entry.vector = vector; -+ ioapic_register_intr(irq, vector, IOAPIC_AUTO); -+ -+ if (!apic && (irq < 16)) -+ disable_8259A_irq(irq); -+ } -+ spin_lock_irqsave(&ioapic_lock, flags); -+ io_apic_write(apic, 0x11+2*pin, *(((int *)&entry)+1)); -+ io_apic_write(apic, 0x10+2*pin, *(((int *)&entry)+0)); -+ set_native_irq_info(irq, TARGET_CPUS); -+ spin_unlock_irqrestore(&ioapic_lock, flags); -+ } -+ } -+ -+ if (!first_notcon) -+ apic_printk(APIC_VERBOSE, " not connected.\n"); -+} -+ -+/* -+ * Set up the 8259A-master output pin: -+ */ -+#ifndef CONFIG_XEN -+static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, int vector) -+{ -+ struct IO_APIC_route_entry entry; -+ unsigned long flags; -+ -+ memset(&entry,0,sizeof(entry)); -+ -+ disable_8259A_irq(0); -+ -+ /* mask LVT0 */ -+ apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT); -+ -+ /* -+ * We use logical delivery to get the timer IRQ -+ * to the first CPU. -+ */ -+ entry.dest_mode = INT_DEST_MODE; -+ entry.mask = 0; /* unmask IRQ now */ -+ entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS); -+ entry.delivery_mode = INT_DELIVERY_MODE; -+ entry.polarity = 0; -+ entry.trigger = 0; -+ entry.vector = vector; -+ -+ /* -+ * The timer IRQ doesn't have to know that behind the -+ * scene we have a 8259A-master in AEOI mode ... -+ */ -+ irq_desc[0].chip = &ioapic_edge_type; -+ -+ /* -+ * Add it to the IO-APIC irq-routing table: -+ */ -+ spin_lock_irqsave(&ioapic_lock, flags); -+ io_apic_write(apic, 0x11+2*pin, *(((int *)&entry)+1)); -+ io_apic_write(apic, 0x10+2*pin, *(((int *)&entry)+0)); -+ spin_unlock_irqrestore(&ioapic_lock, flags); -+ -+ enable_8259A_irq(0); -+} -+ -+static inline void UNEXPECTED_IO_APIC(void) -+{ -+} -+ -+void __init print_IO_APIC(void) -+{ -+ int apic, i; -+ union IO_APIC_reg_00 reg_00; -+ union IO_APIC_reg_01 reg_01; -+ union IO_APIC_reg_02 reg_02; -+ union IO_APIC_reg_03 reg_03; -+ unsigned long flags; -+ -+ if (apic_verbosity == APIC_QUIET) -+ return; -+ -+ printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries); -+ for (i = 0; i < nr_ioapics; i++) -+ printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n", -+ mp_ioapics[i].mpc_apicid, nr_ioapic_registers[i]); -+ -+ /* -+ * We are a bit conservative about what we expect. We have to -+ * know about every hardware change ASAP. -+ */ -+ printk(KERN_INFO "testing the IO APIC.......................\n"); -+ -+ for (apic = 0; apic < nr_ioapics; apic++) { -+ -+ spin_lock_irqsave(&ioapic_lock, flags); -+ reg_00.raw = io_apic_read(apic, 0); -+ reg_01.raw = io_apic_read(apic, 1); -+ if (reg_01.bits.version >= 0x10) -+ reg_02.raw = io_apic_read(apic, 2); -+ if (reg_01.bits.version >= 0x20) -+ reg_03.raw = io_apic_read(apic, 3); -+ spin_unlock_irqrestore(&ioapic_lock, flags); -+ -+ printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mpc_apicid); -+ printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw); -+ printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID); -+ printk(KERN_DEBUG "....... : Delivery Type: %X\n", reg_00.bits.delivery_type); -+ printk(KERN_DEBUG "....... : LTS : %X\n", reg_00.bits.LTS); -+ if (reg_00.bits.ID >= get_physical_broadcast()) -+ UNEXPECTED_IO_APIC(); -+ if (reg_00.bits.__reserved_1 || reg_00.bits.__reserved_2) -+ UNEXPECTED_IO_APIC(); -+ -+ printk(KERN_DEBUG ".... register #01: %08X\n", reg_01.raw); -+ printk(KERN_DEBUG "....... : max redirection entries: %04X\n", reg_01.bits.entries); -+ if ( (reg_01.bits.entries != 0x0f) && /* older (Neptune) boards */ -+ (reg_01.bits.entries != 0x17) && /* typical ISA+PCI boards */ -+ (reg_01.bits.entries != 0x1b) && /* Compaq Proliant boards */ -+ (reg_01.bits.entries != 0x1f) && /* dual Xeon boards */ -+ (reg_01.bits.entries != 0x22) && /* bigger Xeon boards */ -+ (reg_01.bits.entries != 0x2E) && -+ (reg_01.bits.entries != 0x3F) -+ ) -+ UNEXPECTED_IO_APIC(); -+ -+ printk(KERN_DEBUG "....... : PRQ implemented: %X\n", reg_01.bits.PRQ); -+ printk(KERN_DEBUG "....... : IO APIC version: %04X\n", reg_01.bits.version); -+ if ( (reg_01.bits.version != 0x01) && /* 82489DX IO-APICs */ -+ (reg_01.bits.version != 0x10) && /* oldest IO-APICs */ -+ (reg_01.bits.version != 0x11) && /* Pentium/Pro IO-APICs */ -+ (reg_01.bits.version != 0x13) && /* Xeon IO-APICs */ -+ (reg_01.bits.version != 0x20) /* Intel P64H (82806 AA) */ -+ ) -+ UNEXPECTED_IO_APIC(); -+ if (reg_01.bits.__reserved_1 || reg_01.bits.__reserved_2) -+ UNEXPECTED_IO_APIC(); -+ -+ /* -+ * Some Intel chipsets with IO APIC VERSION of 0x1? don't have reg_02, -+ * but the value of reg_02 is read as the previous read register -+ * value, so ignore it if reg_02 == reg_01. -+ */ -+ if (reg_01.bits.version >= 0x10 && reg_02.raw != reg_01.raw) { -+ printk(KERN_DEBUG ".... register #02: %08X\n", reg_02.raw); -+ printk(KERN_DEBUG "....... : arbitration: %02X\n", reg_02.bits.arbitration); -+ if (reg_02.bits.__reserved_1 || reg_02.bits.__reserved_2) -+ UNEXPECTED_IO_APIC(); -+ } -+ -+ /* -+ * Some Intel chipsets with IO APIC VERSION of 0x2? don't have reg_02 -+ * or reg_03, but the value of reg_0[23] is read as the previous read -+ * register value, so ignore it if reg_03 == reg_0[12]. -+ */ -+ if (reg_01.bits.version >= 0x20 && reg_03.raw != reg_02.raw && -+ reg_03.raw != reg_01.raw) { -+ printk(KERN_DEBUG ".... register #03: %08X\n", reg_03.raw); -+ printk(KERN_DEBUG "....... : Boot DT : %X\n", reg_03.bits.boot_DT); -+ if (reg_03.bits.__reserved_1) -+ UNEXPECTED_IO_APIC(); -+ } -+ -+ printk(KERN_DEBUG ".... IRQ redirection table:\n"); -+ -+ printk(KERN_DEBUG " NR Log Phy Mask Trig IRR Pol" -+ " Stat Dest Deli Vect: \n"); -+ -+ for (i = 0; i <= reg_01.bits.entries; i++) { -+ struct IO_APIC_route_entry entry; -+ -+ spin_lock_irqsave(&ioapic_lock, flags); -+ *(((int *)&entry)+0) = io_apic_read(apic, 0x10+i*2); -+ *(((int *)&entry)+1) = io_apic_read(apic, 0x11+i*2); -+ spin_unlock_irqrestore(&ioapic_lock, flags); -+ -+ printk(KERN_DEBUG " %02x %03X %02X ", -+ i, -+ entry.dest.logical.logical_dest, -+ entry.dest.physical.physical_dest -+ ); -+ -+ printk("%1d %1d %1d %1d %1d %1d %1d %02X\n", -+ entry.mask, -+ entry.trigger, -+ entry.irr, -+ entry.polarity, -+ entry.delivery_status, -+ entry.dest_mode, -+ entry.delivery_mode, -+ entry.vector -+ ); -+ } -+ } -+ if (use_pci_vector()) -+ printk(KERN_INFO "Using vector-based indexing\n"); -+ printk(KERN_DEBUG "IRQ to pin mappings:\n"); -+ for (i = 0; i < NR_IRQS; i++) { -+ struct irq_pin_list *entry = irq_2_pin + i; -+ if (entry->pin < 0) -+ continue; -+ if (use_pci_vector() && !platform_legacy_irq(i)) -+ printk(KERN_DEBUG "IRQ%d ", IO_APIC_VECTOR(i)); -+ else -+ printk(KERN_DEBUG "IRQ%d ", i); -+ for (;;) { -+ printk("-> %d:%d", entry->apic, entry->pin); -+ if (!entry->next) -+ break; -+ entry = irq_2_pin + entry->next; -+ } -+ printk("\n"); -+ } -+ -+ printk(KERN_INFO ".................................... done.\n"); -+ -+ return; -+} -+ -+static void print_APIC_bitfield (int base) -+{ -+ unsigned int v; -+ int i, j; -+ -+ if (apic_verbosity == APIC_QUIET) -+ return; -+ -+ printk(KERN_DEBUG "0123456789abcdef0123456789abcdef\n" KERN_DEBUG); -+ for (i = 0; i < 8; i++) { -+ v = apic_read(base + i*0x10); -+ for (j = 0; j < 32; j++) { -+ if (v & (1< 3) /* Due to the Pentium erratum 3AP. */ -+ apic_write(APIC_ESR, 0); -+ v = apic_read(APIC_ESR); -+ printk(KERN_DEBUG "... APIC ESR: %08x\n", v); -+ } -+ -+ v = apic_read(APIC_ICR); -+ printk(KERN_DEBUG "... APIC ICR: %08x\n", v); -+ v = apic_read(APIC_ICR2); -+ printk(KERN_DEBUG "... APIC ICR2: %08x\n", v); -+ -+ v = apic_read(APIC_LVTT); -+ printk(KERN_DEBUG "... APIC LVTT: %08x\n", v); -+ -+ if (maxlvt > 3) { /* PC is LVT#4. */ -+ v = apic_read(APIC_LVTPC); -+ printk(KERN_DEBUG "... APIC LVTPC: %08x\n", v); -+ } -+ v = apic_read(APIC_LVT0); -+ printk(KERN_DEBUG "... APIC LVT0: %08x\n", v); -+ v = apic_read(APIC_LVT1); -+ printk(KERN_DEBUG "... APIC LVT1: %08x\n", v); -+ -+ if (maxlvt > 2) { /* ERR is LVT#3. */ -+ v = apic_read(APIC_LVTERR); -+ printk(KERN_DEBUG "... APIC LVTERR: %08x\n", v); -+ } -+ -+ v = apic_read(APIC_TMICT); -+ printk(KERN_DEBUG "... APIC TMICT: %08x\n", v); -+ v = apic_read(APIC_TMCCT); -+ printk(KERN_DEBUG "... APIC TMCCT: %08x\n", v); -+ v = apic_read(APIC_TDCR); -+ printk(KERN_DEBUG "... APIC TDCR: %08x\n", v); -+ printk("\n"); -+} -+ -+void print_all_local_APICs (void) -+{ -+ on_each_cpu(print_local_APIC, NULL, 1, 1); -+} -+ -+void /*__init*/ print_PIC(void) -+{ -+ unsigned int v; -+ unsigned long flags; -+ -+ if (apic_verbosity == APIC_QUIET) -+ return; -+ -+ printk(KERN_DEBUG "\nprinting PIC contents\n"); -+ -+ spin_lock_irqsave(&i8259A_lock, flags); -+ -+ v = inb(0xa1) << 8 | inb(0x21); -+ printk(KERN_DEBUG "... PIC IMR: %04x\n", v); -+ -+ v = inb(0xa0) << 8 | inb(0x20); -+ printk(KERN_DEBUG "... PIC IRR: %04x\n", v); -+ -+ outb(0x0b,0xa0); -+ outb(0x0b,0x20); -+ v = inb(0xa0) << 8 | inb(0x20); -+ outb(0x0a,0xa0); -+ outb(0x0a,0x20); -+ -+ spin_unlock_irqrestore(&i8259A_lock, flags); -+ -+ printk(KERN_DEBUG "... PIC ISR: %04x\n", v); -+ -+ v = inb(0x4d1) << 8 | inb(0x4d0); -+ printk(KERN_DEBUG "... PIC ELCR: %04x\n", v); -+} -+#endif /* !CONFIG_XEN */ -+ -+static void __init enable_IO_APIC(void) -+{ -+ union IO_APIC_reg_01 reg_01; -+ int i8259_apic, i8259_pin; -+ int i, apic; -+ unsigned long flags; -+ -+ for (i = 0; i < PIN_MAP_SIZE; i++) { -+ irq_2_pin[i].pin = -1; -+ irq_2_pin[i].next = 0; -+ } -+ if (!pirqs_enabled) -+ for (i = 0; i < MAX_PIRQS; i++) -+ pirq_entries[i] = -1; -+ -+ /* -+ * The number of IO-APIC IRQ registers (== #pins): -+ */ -+ for (apic = 0; apic < nr_ioapics; apic++) { -+ spin_lock_irqsave(&ioapic_lock, flags); -+ reg_01.raw = io_apic_read(apic, 1); -+ spin_unlock_irqrestore(&ioapic_lock, flags); -+ nr_ioapic_registers[apic] = reg_01.bits.entries+1; -+ } -+ for(apic = 0; apic < nr_ioapics; apic++) { -+ int pin; -+ /* See if any of the pins is in ExtINT mode */ -+ for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { -+ struct IO_APIC_route_entry entry; -+ spin_lock_irqsave(&ioapic_lock, flags); -+ *(((int *)&entry) + 0) = io_apic_read(apic, 0x10 + 2 * pin); -+ *(((int *)&entry) + 1) = io_apic_read(apic, 0x11 + 2 * pin); -+ spin_unlock_irqrestore(&ioapic_lock, flags); -+ -+ -+ /* If the interrupt line is enabled and in ExtInt mode -+ * I have found the pin where the i8259 is connected. -+ */ -+ if ((entry.mask == 0) && (entry.delivery_mode == dest_ExtINT)) { -+ ioapic_i8259.apic = apic; -+ ioapic_i8259.pin = pin; -+ goto found_i8259; -+ } -+ } -+ } -+ found_i8259: -+ /* Look to see what if the MP table has reported the ExtINT */ -+ /* If we could not find the appropriate pin by looking at the ioapic -+ * the i8259 probably is not connected the ioapic but give the -+ * mptable a chance anyway. -+ */ -+ i8259_pin = find_isa_irq_pin(0, mp_ExtINT); -+ i8259_apic = find_isa_irq_apic(0, mp_ExtINT); -+ /* Trust the MP table if nothing is setup in the hardware */ -+ if ((ioapic_i8259.pin == -1) && (i8259_pin >= 0)) { -+ printk(KERN_WARNING "ExtINT not setup in hardware but reported by MP table\n"); -+ ioapic_i8259.pin = i8259_pin; -+ ioapic_i8259.apic = i8259_apic; -+ } -+ /* Complain if the MP table and the hardware disagree */ -+ if (((ioapic_i8259.apic != i8259_apic) || (ioapic_i8259.pin != i8259_pin)) && -+ (i8259_pin >= 0) && (ioapic_i8259.pin >= 0)) -+ { -+ printk(KERN_WARNING "ExtINT in hardware and MP table differ\n"); -+ } -+ -+ /* -+ * Do not trust the IO-APIC being empty at bootup -+ */ -+ clear_IO_APIC(); -+} -+ -+/* -+ * Not an __init, needed by the reboot code -+ */ -+void disable_IO_APIC(void) -+{ -+ /* -+ * Clear the IO-APIC before rebooting: -+ */ -+ clear_IO_APIC(); -+ -+#ifndef CONFIG_XEN -+ /* -+ * If the i8259 is routed through an IOAPIC -+ * Put that IOAPIC in virtual wire mode -+ * so legacy interrupts can be delivered. -+ */ -+ if (ioapic_i8259.pin != -1) { -+ struct IO_APIC_route_entry entry; -+ unsigned long flags; -+ -+ memset(&entry, 0, sizeof(entry)); -+ entry.mask = 0; /* Enabled */ -+ entry.trigger = 0; /* Edge */ -+ entry.irr = 0; -+ entry.polarity = 0; /* High */ -+ entry.delivery_status = 0; -+ entry.dest_mode = 0; /* Physical */ -+ entry.delivery_mode = dest_ExtINT; /* ExtInt */ -+ entry.vector = 0; -+ entry.dest.physical.physical_dest = -+ GET_APIC_ID(apic_read(APIC_ID)); -+ -+ /* -+ * Add it to the IO-APIC irq-routing table: -+ */ -+ spin_lock_irqsave(&ioapic_lock, flags); -+ io_apic_write(ioapic_i8259.apic, 0x11+2*ioapic_i8259.pin, -+ *(((int *)&entry)+1)); -+ io_apic_write(ioapic_i8259.apic, 0x10+2*ioapic_i8259.pin, -+ *(((int *)&entry)+0)); -+ spin_unlock_irqrestore(&ioapic_lock, flags); -+ } -+ disconnect_bsp_APIC(ioapic_i8259.pin != -1); -+#endif -+} -+ -+/* -+ * function to set the IO-APIC physical IDs based on the -+ * values stored in the MPC table. -+ * -+ * by Matt Domsch Tue Dec 21 12:25:05 CST 1999 -+ */ -+ -+#if !defined(CONFIG_XEN) && !defined(CONFIG_X86_NUMAQ) -+static void __init setup_ioapic_ids_from_mpc(void) -+{ -+ union IO_APIC_reg_00 reg_00; -+ physid_mask_t phys_id_present_map; -+ int apic; -+ int i; -+ unsigned char old_id; -+ unsigned long flags; -+ -+ /* -+ * Don't check I/O APIC IDs for xAPIC systems. They have -+ * no meaning without the serial APIC bus. -+ */ -+ if (!(boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) -+ || APIC_XAPIC(apic_version[boot_cpu_physical_apicid])) -+ return; -+ /* -+ * This is broken; anything with a real cpu count has to -+ * circumvent this idiocy regardless. -+ */ -+ phys_id_present_map = ioapic_phys_id_map(phys_cpu_present_map); -+ -+ /* -+ * Set the IOAPIC ID to the value stored in the MPC table. -+ */ -+ for (apic = 0; apic < nr_ioapics; apic++) { -+ -+ /* Read the register 0 value */ -+ spin_lock_irqsave(&ioapic_lock, flags); -+ reg_00.raw = io_apic_read(apic, 0); -+ spin_unlock_irqrestore(&ioapic_lock, flags); -+ -+ old_id = mp_ioapics[apic].mpc_apicid; -+ -+ if (mp_ioapics[apic].mpc_apicid >= get_physical_broadcast()) { -+ printk(KERN_ERR "BIOS bug, IO-APIC#%d ID is %d in the MPC table!...\n", -+ apic, mp_ioapics[apic].mpc_apicid); -+ printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n", -+ reg_00.bits.ID); -+ mp_ioapics[apic].mpc_apicid = reg_00.bits.ID; -+ } -+ -+ /* -+ * Sanity check, is the ID really free? Every APIC in a -+ * system must have a unique ID or we get lots of nice -+ * 'stuck on smp_invalidate_needed IPI wait' messages. -+ */ -+ if (check_apicid_used(phys_id_present_map, -+ mp_ioapics[apic].mpc_apicid)) { -+ printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n", -+ apic, mp_ioapics[apic].mpc_apicid); -+ for (i = 0; i < get_physical_broadcast(); i++) -+ if (!physid_isset(i, phys_id_present_map)) -+ break; -+ if (i >= get_physical_broadcast()) -+ panic("Max APIC ID exceeded!\n"); -+ printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n", -+ i); -+ physid_set(i, phys_id_present_map); -+ mp_ioapics[apic].mpc_apicid = i; -+ } else { -+ physid_mask_t tmp; -+ tmp = apicid_to_cpu_present(mp_ioapics[apic].mpc_apicid); -+ apic_printk(APIC_VERBOSE, "Setting %d in the " -+ "phys_id_present_map\n", -+ mp_ioapics[apic].mpc_apicid); -+ physids_or(phys_id_present_map, phys_id_present_map, tmp); -+ } -+ -+ -+ /* -+ * We need to adjust the IRQ routing table -+ * if the ID changed. -+ */ -+ if (old_id != mp_ioapics[apic].mpc_apicid) -+ for (i = 0; i < mp_irq_entries; i++) -+ if (mp_irqs[i].mpc_dstapic == old_id) -+ mp_irqs[i].mpc_dstapic -+ = mp_ioapics[apic].mpc_apicid; -+ -+ /* -+ * Read the right value from the MPC table and -+ * write it into the ID register. -+ */ -+ apic_printk(APIC_VERBOSE, KERN_INFO -+ "...changing IO-APIC physical APIC ID to %d ...", -+ mp_ioapics[apic].mpc_apicid); -+ -+ reg_00.bits.ID = mp_ioapics[apic].mpc_apicid; -+ spin_lock_irqsave(&ioapic_lock, flags); -+ io_apic_write(apic, 0, reg_00.raw); -+ spin_unlock_irqrestore(&ioapic_lock, flags); -+ -+ /* -+ * Sanity check -+ */ -+ spin_lock_irqsave(&ioapic_lock, flags); -+ reg_00.raw = io_apic_read(apic, 0); -+ spin_unlock_irqrestore(&ioapic_lock, flags); -+ if (reg_00.bits.ID != mp_ioapics[apic].mpc_apicid) -+ printk("could not set ID!\n"); -+ else -+ apic_printk(APIC_VERBOSE, " ok.\n"); -+ } -+} -+#else -+static void __init setup_ioapic_ids_from_mpc(void) { } -+#endif -+ -+#ifndef CONFIG_XEN -+/* -+ * There is a nasty bug in some older SMP boards, their mptable lies -+ * about the timer IRQ. We do the following to work around the situation: -+ * -+ * - timer IRQ defaults to IO-APIC IRQ -+ * - if this function detects that timer IRQs are defunct, then we fall -+ * back to ISA timer IRQs -+ */ -+static int __init timer_irq_works(void) -+{ -+ unsigned long t1 = jiffies; -+ -+ local_irq_enable(); -+ /* Let ten ticks pass... */ -+ mdelay((10 * 1000) / HZ); -+ -+ /* -+ * Expect a few ticks at least, to be sure some possible -+ * glue logic does not lock up after one or two first -+ * ticks in a non-ExtINT mode. Also the local APIC -+ * might have cached one ExtINT interrupt. Finally, at -+ * least one tick may be lost due to delays. -+ */ -+ if (jiffies - t1 > 4) -+ return 1; -+ -+ return 0; -+} -+ -+/* -+ * In the SMP+IOAPIC case it might happen that there are an unspecified -+ * number of pending IRQ events unhandled. These cases are very rare, -+ * so we 'resend' these IRQs via IPIs, to the same CPU. It's much -+ * better to do it this way as thus we do not have to be aware of -+ * 'pending' interrupts in the IRQ path, except at this point. -+ */ -+/* -+ * Edge triggered needs to resend any interrupt -+ * that was delayed but this is now handled in the device -+ * independent code. -+ */ -+ -+/* -+ * Starting up a edge-triggered IO-APIC interrupt is -+ * nasty - we need to make sure that we get the edge. -+ * If it is already asserted for some reason, we need -+ * return 1 to indicate that is was pending. -+ * -+ * This is not complete - we should be able to fake -+ * an edge even if it isn't on the 8259A... -+ */ -+static unsigned int startup_edge_ioapic_irq(unsigned int irq) -+{ -+ int was_pending = 0; -+ unsigned long flags; -+ -+ spin_lock_irqsave(&ioapic_lock, flags); -+ if (irq < 16) { -+ disable_8259A_irq(irq); -+ if (i8259A_irq_pending(irq)) -+ was_pending = 1; -+ } -+ __unmask_IO_APIC_irq(irq); -+ spin_unlock_irqrestore(&ioapic_lock, flags); -+ -+ return was_pending; -+} -+ -+/* -+ * Once we have recorded IRQ_PENDING already, we can mask the -+ * interrupt for real. This prevents IRQ storms from unhandled -+ * devices. -+ */ -+static void ack_edge_ioapic_irq(unsigned int irq) -+{ -+ move_irq(irq); -+ if ((irq_desc[irq].status & (IRQ_PENDING | IRQ_DISABLED)) -+ == (IRQ_PENDING | IRQ_DISABLED)) -+ mask_IO_APIC_irq(irq); -+ ack_APIC_irq(); -+} -+ -+/* -+ * Level triggered interrupts can just be masked, -+ * and shutting down and starting up the interrupt -+ * is the same as enabling and disabling them -- except -+ * with a startup need to return a "was pending" value. -+ * -+ * Level triggered interrupts are special because we -+ * do not touch any IO-APIC register while handling -+ * them. We ack the APIC in the end-IRQ handler, not -+ * in the start-IRQ-handler. Protection against reentrance -+ * from the same interrupt is still provided, both by the -+ * generic IRQ layer and by the fact that an unacked local -+ * APIC does not accept IRQs. -+ */ -+static unsigned int startup_level_ioapic_irq (unsigned int irq) -+{ -+ unmask_IO_APIC_irq(irq); -+ -+ return 0; /* don't check for pending */ -+} -+ -+static void end_level_ioapic_irq (unsigned int irq) -+{ -+ unsigned long v; -+ int i; -+ -+ move_irq(irq); -+/* -+ * It appears there is an erratum which affects at least version 0x11 -+ * of I/O APIC (that's the 82093AA and cores integrated into various -+ * chipsets). Under certain conditions a level-triggered interrupt is -+ * erroneously delivered as edge-triggered one but the respective IRR -+ * bit gets set nevertheless. As a result the I/O unit expects an EOI -+ * message but it will never arrive and further interrupts are blocked -+ * from the source. The exact reason is so far unknown, but the -+ * phenomenon was observed when two consecutive interrupt requests -+ * from a given source get delivered to the same CPU and the source is -+ * temporarily disabled in between. -+ * -+ * A workaround is to simulate an EOI message manually. We achieve it -+ * by setting the trigger mode to edge and then to level when the edge -+ * trigger mode gets detected in the TMR of a local APIC for a -+ * level-triggered interrupt. We mask the source for the time of the -+ * operation to prevent an edge-triggered interrupt escaping meanwhile. -+ * The idea is from Manfred Spraul. --macro -+ */ -+ i = IO_APIC_VECTOR(irq); -+ -+ v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1)); -+ -+ ack_APIC_irq(); -+ -+ if (!(v & (1 << (i & 0x1f)))) { -+ atomic_inc(&irq_mis_count); -+ spin_lock(&ioapic_lock); -+ __mask_and_edge_IO_APIC_irq(irq); -+ __unmask_and_level_IO_APIC_irq(irq); -+ spin_unlock(&ioapic_lock); -+ } -+} -+ -+#ifdef CONFIG_PCI_MSI -+static unsigned int startup_edge_ioapic_vector(unsigned int vector) -+{ -+ int irq = vector_to_irq(vector); -+ -+ return startup_edge_ioapic_irq(irq); -+} -+ -+static void ack_edge_ioapic_vector(unsigned int vector) -+{ -+ int irq = vector_to_irq(vector); -+ -+ move_native_irq(vector); -+ ack_edge_ioapic_irq(irq); -+} -+ -+static unsigned int startup_level_ioapic_vector (unsigned int vector) -+{ -+ int irq = vector_to_irq(vector); -+ -+ return startup_level_ioapic_irq (irq); -+} -+ -+static void end_level_ioapic_vector (unsigned int vector) -+{ -+ int irq = vector_to_irq(vector); -+ -+ move_native_irq(vector); -+ end_level_ioapic_irq(irq); -+} -+ -+static void mask_IO_APIC_vector (unsigned int vector) -+{ -+ int irq = vector_to_irq(vector); -+ -+ mask_IO_APIC_irq(irq); -+} -+ -+static void unmask_IO_APIC_vector (unsigned int vector) -+{ -+ int irq = vector_to_irq(vector); -+ -+ unmask_IO_APIC_irq(irq); -+} -+ -+#ifdef CONFIG_SMP -+static void set_ioapic_affinity_vector (unsigned int vector, -+ cpumask_t cpu_mask) -+{ -+ int irq = vector_to_irq(vector); -+ -+ set_native_irq_info(vector, cpu_mask); -+ set_ioapic_affinity_irq(irq, cpu_mask); -+} -+#endif -+#endif -+ -+static int ioapic_retrigger(unsigned int irq) -+{ -+ send_IPI_self(IO_APIC_VECTOR(irq)); -+ -+ return 1; -+} -+ -+/* -+ * Level and edge triggered IO-APIC interrupts need different handling, -+ * so we use two separate IRQ descriptors. Edge triggered IRQs can be -+ * handled with the level-triggered descriptor, but that one has slightly -+ * more overhead. Level-triggered interrupts cannot be handled with the -+ * edge-triggered handler, without risking IRQ storms and other ugly -+ * races. -+ */ -+static struct hw_interrupt_type ioapic_edge_type __read_mostly = { -+ .typename = "IO-APIC-edge", -+ .startup = startup_edge_ioapic, -+ .shutdown = shutdown_edge_ioapic, -+ .enable = enable_edge_ioapic, -+ .disable = disable_edge_ioapic, -+ .ack = ack_edge_ioapic, -+ .end = end_edge_ioapic, -+#ifdef CONFIG_SMP -+ .set_affinity = set_ioapic_affinity, -+#endif -+ .retrigger = ioapic_retrigger, -+}; -+ -+static struct hw_interrupt_type ioapic_level_type __read_mostly = { -+ .typename = "IO-APIC-level", -+ .startup = startup_level_ioapic, -+ .shutdown = shutdown_level_ioapic, -+ .enable = enable_level_ioapic, -+ .disable = disable_level_ioapic, -+ .ack = mask_and_ack_level_ioapic, -+ .end = end_level_ioapic, -+#ifdef CONFIG_SMP -+ .set_affinity = set_ioapic_affinity, -+#endif -+ .retrigger = ioapic_retrigger, -+}; -+#endif /* !CONFIG_XEN */ -+ -+static inline void init_IO_APIC_traps(void) -+{ -+ int irq; -+ -+ /* -+ * NOTE! The local APIC isn't very good at handling -+ * multiple interrupts at the same interrupt level. -+ * As the interrupt level is determined by taking the -+ * vector number and shifting that right by 4, we -+ * want to spread these out a bit so that they don't -+ * all fall in the same interrupt level. -+ * -+ * Also, we've got to be careful not to trash gate -+ * 0x80, because int 0x80 is hm, kind of importantish. ;) -+ */ -+ for (irq = 0; irq < NR_IRQS ; irq++) { -+ int tmp = irq; -+ if (use_pci_vector()) { -+ if (!platform_legacy_irq(tmp)) -+ if ((tmp = vector_to_irq(tmp)) == -1) -+ continue; -+ } -+ if (IO_APIC_IRQ(tmp) && !IO_APIC_VECTOR(tmp)) { -+ /* -+ * Hmm.. We don't have an entry for this, -+ * so default to an old-fashioned 8259 -+ * interrupt if we can.. -+ */ -+ if (irq < 16) -+ make_8259A_irq(irq); -+#ifndef CONFIG_XEN -+ else -+ /* Strange. Oh, well.. */ -+ irq_desc[irq].chip = &no_irq_type; -+#endif -+ } -+ } -+} -+ -+#ifndef CONFIG_XEN -+static void enable_lapic_irq (unsigned int irq) -+{ -+ unsigned long v; -+ -+ v = apic_read(APIC_LVT0); -+ apic_write_around(APIC_LVT0, v & ~APIC_LVT_MASKED); -+} -+ -+static void disable_lapic_irq (unsigned int irq) -+{ -+ unsigned long v; -+ -+ v = apic_read(APIC_LVT0); -+ apic_write_around(APIC_LVT0, v | APIC_LVT_MASKED); -+} -+ -+static void ack_lapic_irq (unsigned int irq) -+{ -+ ack_APIC_irq(); -+} -+ -+static void end_lapic_irq (unsigned int i) { /* nothing */ } -+ -+static struct hw_interrupt_type lapic_irq_type __read_mostly = { -+ .typename = "local-APIC-edge", -+ .startup = NULL, /* startup_irq() not used for IRQ0 */ -+ .shutdown = NULL, /* shutdown_irq() not used for IRQ0 */ -+ .enable = enable_lapic_irq, -+ .disable = disable_lapic_irq, -+ .ack = ack_lapic_irq, -+ .end = end_lapic_irq -+}; -+ -+static void setup_nmi (void) -+{ -+ /* -+ * Dirty trick to enable the NMI watchdog ... -+ * We put the 8259A master into AEOI mode and -+ * unmask on all local APICs LVT0 as NMI. -+ * -+ * The idea to use the 8259A in AEOI mode ('8259A Virtual Wire') -+ * is from Maciej W. Rozycki - so we do not have to EOI from -+ * the NMI handler or the timer interrupt. -+ */ -+ apic_printk(APIC_VERBOSE, KERN_INFO "activating NMI Watchdog ..."); -+ -+ on_each_cpu(enable_NMI_through_LVT0, NULL, 1, 1); -+ -+ apic_printk(APIC_VERBOSE, " done.\n"); -+} -+ -+/* -+ * This looks a bit hackish but it's about the only one way of sending -+ * a few INTA cycles to 8259As and any associated glue logic. ICR does -+ * not support the ExtINT mode, unfortunately. We need to send these -+ * cycles as some i82489DX-based boards have glue logic that keeps the -+ * 8259A interrupt line asserted until INTA. --macro -+ */ -+static inline void unlock_ExtINT_logic(void) -+{ -+ int apic, pin, i; -+ struct IO_APIC_route_entry entry0, entry1; -+ unsigned char save_control, save_freq_select; -+ unsigned long flags; -+ -+ pin = find_isa_irq_pin(8, mp_INT); -+ apic = find_isa_irq_apic(8, mp_INT); -+ if (pin == -1) -+ return; -+ -+ spin_lock_irqsave(&ioapic_lock, flags); -+ *(((int *)&entry0) + 1) = io_apic_read(apic, 0x11 + 2 * pin); -+ *(((int *)&entry0) + 0) = io_apic_read(apic, 0x10 + 2 * pin); -+ spin_unlock_irqrestore(&ioapic_lock, flags); -+ clear_IO_APIC_pin(apic, pin); -+ -+ memset(&entry1, 0, sizeof(entry1)); -+ -+ entry1.dest_mode = 0; /* physical delivery */ -+ entry1.mask = 0; /* unmask IRQ now */ -+ entry1.dest.physical.physical_dest = hard_smp_processor_id(); -+ entry1.delivery_mode = dest_ExtINT; -+ entry1.polarity = entry0.polarity; -+ entry1.trigger = 0; -+ entry1.vector = 0; -+ -+ spin_lock_irqsave(&ioapic_lock, flags); -+ io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry1) + 1)); -+ io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry1) + 0)); -+ spin_unlock_irqrestore(&ioapic_lock, flags); -+ -+ save_control = CMOS_READ(RTC_CONTROL); -+ save_freq_select = CMOS_READ(RTC_FREQ_SELECT); -+ CMOS_WRITE((save_freq_select & ~RTC_RATE_SELECT) | 0x6, -+ RTC_FREQ_SELECT); -+ CMOS_WRITE(save_control | RTC_PIE, RTC_CONTROL); -+ -+ i = 100; -+ while (i-- > 0) { -+ mdelay(10); -+ if ((CMOS_READ(RTC_INTR_FLAGS) & RTC_PF) == RTC_PF) -+ i -= 10; -+ } -+ -+ CMOS_WRITE(save_control, RTC_CONTROL); -+ CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT); -+ clear_IO_APIC_pin(apic, pin); -+ -+ spin_lock_irqsave(&ioapic_lock, flags); -+ io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry0) + 1)); -+ io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry0) + 0)); -+ spin_unlock_irqrestore(&ioapic_lock, flags); -+} -+ -+int timer_uses_ioapic_pin_0; -+ -+/* -+ * This code may look a bit paranoid, but it's supposed to cooperate with -+ * a wide range of boards and BIOS bugs. Fortunately only the timer IRQ -+ * is so screwy. Thanks to Brian Perkins for testing/hacking this beast -+ * fanatically on his truly buggy board. -+ */ -+static inline void check_timer(void) -+{ -+ int apic1, pin1, apic2, pin2; -+ int vector; -+ -+ /* -+ * get/set the timer IRQ vector: -+ */ -+ disable_8259A_irq(0); -+ vector = assign_irq_vector(0); -+ set_intr_gate(vector, interrupt[0]); -+ -+ /* -+ * Subtle, code in do_timer_interrupt() expects an AEOI -+ * mode for the 8259A whenever interrupts are routed -+ * through I/O APICs. Also IRQ0 has to be enabled in -+ * the 8259A which implies the virtual wire has to be -+ * disabled in the local APIC. -+ */ -+ apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT); -+ init_8259A(1); -+ timer_ack = 1; -+ if (timer_over_8254 > 0) -+ enable_8259A_irq(0); -+ -+ pin1 = find_isa_irq_pin(0, mp_INT); -+ apic1 = find_isa_irq_apic(0, mp_INT); -+ pin2 = ioapic_i8259.pin; -+ apic2 = ioapic_i8259.apic; -+ -+ if (pin1 == 0) -+ timer_uses_ioapic_pin_0 = 1; -+ -+ printk(KERN_INFO "..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n", -+ vector, apic1, pin1, apic2, pin2); -+ -+ if (pin1 != -1) { -+ /* -+ * Ok, does IRQ0 through the IOAPIC work? -+ */ -+ unmask_IO_APIC_irq(0); -+ if (timer_irq_works()) { -+ if (nmi_watchdog == NMI_IO_APIC) { -+ disable_8259A_irq(0); -+ setup_nmi(); -+ enable_8259A_irq(0); -+ } -+ if (disable_timer_pin_1 > 0) -+ clear_IO_APIC_pin(0, pin1); -+ return; -+ } -+ clear_IO_APIC_pin(apic1, pin1); -+ printk(KERN_ERR "..MP-BIOS bug: 8254 timer not connected to " -+ "IO-APIC\n"); -+ } -+ -+ printk(KERN_INFO "...trying to set up timer (IRQ0) through the 8259A ... "); -+ if (pin2 != -1) { -+ printk("\n..... (found pin %d) ...", pin2); -+ /* -+ * legacy devices should be connected to IO APIC #0 -+ */ -+ setup_ExtINT_IRQ0_pin(apic2, pin2, vector); -+ if (timer_irq_works()) { -+ printk("works.\n"); -+ if (pin1 != -1) -+ replace_pin_at_irq(0, apic1, pin1, apic2, pin2); -+ else -+ add_pin_to_irq(0, apic2, pin2); -+ if (nmi_watchdog == NMI_IO_APIC) { -+ setup_nmi(); -+ } -+ return; -+ } -+ /* -+ * Cleanup, just in case ... -+ */ -+ clear_IO_APIC_pin(apic2, pin2); -+ } -+ printk(" failed.\n"); -+ -+ if (nmi_watchdog == NMI_IO_APIC) { -+ printk(KERN_WARNING "timer doesn't work through the IO-APIC - disabling NMI Watchdog!\n"); -+ nmi_watchdog = 0; -+ } -+ -+ printk(KERN_INFO "...trying to set up timer as Virtual Wire IRQ..."); -+ -+ disable_8259A_irq(0); -+ irq_desc[0].chip = &lapic_irq_type; -+ apic_write_around(APIC_LVT0, APIC_DM_FIXED | vector); /* Fixed mode */ -+ enable_8259A_irq(0); -+ -+ if (timer_irq_works()) { -+ printk(" works.\n"); -+ return; -+ } -+ apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | vector); -+ printk(" failed.\n"); -+ -+ printk(KERN_INFO "...trying to set up timer as ExtINT IRQ..."); -+ -+ timer_ack = 0; -+ init_8259A(0); -+ make_8259A_irq(0); -+ apic_write_around(APIC_LVT0, APIC_DM_EXTINT); -+ -+ unlock_ExtINT_logic(); -+ -+ if (timer_irq_works()) { -+ printk(" works.\n"); -+ return; -+ } -+ printk(" failed :(.\n"); -+ panic("IO-APIC + timer doesn't work! Boot with apic=debug and send a " -+ "report. Then try booting with the 'noapic' option"); -+} -+#else -+int timer_uses_ioapic_pin_0 = 0; -+#define check_timer() ((void)0) -+#endif -+ -+/* -+ * -+ * IRQ's that are handled by the PIC in the MPS IOAPIC case. -+ * - IRQ2 is the cascade IRQ, and cannot be a io-apic IRQ. -+ * Linux doesn't really care, as it's not actually used -+ * for any interrupt handling anyway. -+ */ -+#define PIC_IRQS (1 << PIC_CASCADE_IR) -+ -+void __init setup_IO_APIC(void) -+{ -+ enable_IO_APIC(); -+ -+ if (acpi_ioapic) -+ io_apic_irqs = ~0; /* all IRQs go through IOAPIC */ -+ else -+ io_apic_irqs = ~PIC_IRQS; -+ -+ printk("ENABLING IO-APIC IRQs\n"); -+ -+ /* -+ * Set up IO-APIC IRQ routing. -+ */ -+ if (!acpi_ioapic) -+ setup_ioapic_ids_from_mpc(); -+#ifndef CONFIG_XEN -+ sync_Arb_IDs(); -+#endif -+ setup_IO_APIC_irqs(); -+ init_IO_APIC_traps(); -+ check_timer(); -+ if (!acpi_ioapic) -+ print_IO_APIC(); -+} -+ -+static int __init setup_disable_8254_timer(char *s) -+{ -+ timer_over_8254 = -1; -+ return 1; -+} -+static int __init setup_enable_8254_timer(char *s) -+{ -+ timer_over_8254 = 2; -+ return 1; -+} -+ -+__setup("disable_8254_timer", setup_disable_8254_timer); -+__setup("enable_8254_timer", setup_enable_8254_timer); -+ -+/* -+ * Called after all the initialization is done. If we didnt find any -+ * APIC bugs then we can allow the modify fast path -+ */ -+ -+static int __init io_apic_bug_finalize(void) -+{ -+ if(sis_apic_bug == -1) -+ sis_apic_bug = 0; -+ if (is_initial_xendomain()) { -+ struct xen_platform_op op = { .cmd = XENPF_platform_quirk }; -+ op.u.platform_quirk.quirk_id = sis_apic_bug ? -+ QUIRK_IOAPIC_BAD_REGSEL : QUIRK_IOAPIC_GOOD_REGSEL; -+ VOID(HYPERVISOR_platform_op(&op)); -+ } -+ return 0; -+} -+ -+late_initcall(io_apic_bug_finalize); -+ -+struct sysfs_ioapic_data { -+ struct sys_device dev; -+ struct IO_APIC_route_entry entry[0]; -+}; -+static struct sysfs_ioapic_data * mp_ioapic_data[MAX_IO_APICS]; -+ -+static int ioapic_suspend(struct sys_device *dev, pm_message_t state) -+{ -+ struct IO_APIC_route_entry *entry; -+ struct sysfs_ioapic_data *data; -+ unsigned long flags; -+ int i; -+ -+ data = container_of(dev, struct sysfs_ioapic_data, dev); -+ entry = data->entry; -+ spin_lock_irqsave(&ioapic_lock, flags); -+ for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ ) { -+ *(((int *)entry) + 1) = io_apic_read(dev->id, 0x11 + 2 * i); -+ *(((int *)entry) + 0) = io_apic_read(dev->id, 0x10 + 2 * i); -+ } -+ spin_unlock_irqrestore(&ioapic_lock, flags); -+ -+ return 0; -+} -+ -+static int ioapic_resume(struct sys_device *dev) -+{ -+ struct IO_APIC_route_entry *entry; -+ struct sysfs_ioapic_data *data; -+ unsigned long flags; -+ union IO_APIC_reg_00 reg_00; -+ int i; -+ -+ data = container_of(dev, struct sysfs_ioapic_data, dev); -+ entry = data->entry; -+ -+ spin_lock_irqsave(&ioapic_lock, flags); -+ reg_00.raw = io_apic_read(dev->id, 0); -+ if (reg_00.bits.ID != mp_ioapics[dev->id].mpc_apicid) { -+ reg_00.bits.ID = mp_ioapics[dev->id].mpc_apicid; -+ io_apic_write(dev->id, 0, reg_00.raw); -+ } -+ for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ ) { -+ io_apic_write(dev->id, 0x11+2*i, *(((int *)entry)+1)); -+ io_apic_write(dev->id, 0x10+2*i, *(((int *)entry)+0)); -+ } -+ spin_unlock_irqrestore(&ioapic_lock, flags); -+ -+ return 0; -+} -+ -+static struct sysdev_class ioapic_sysdev_class = { -+ set_kset_name("ioapic"), -+ .suspend = ioapic_suspend, -+ .resume = ioapic_resume, -+}; -+ -+static int __init ioapic_init_sysfs(void) -+{ -+ struct sys_device * dev; -+ int i, size, error = 0; -+ -+ error = sysdev_class_register(&ioapic_sysdev_class); -+ if (error) -+ return error; -+ -+ for (i = 0; i < nr_ioapics; i++ ) { -+ size = sizeof(struct sys_device) + nr_ioapic_registers[i] -+ * sizeof(struct IO_APIC_route_entry); -+ mp_ioapic_data[i] = kmalloc(size, GFP_KERNEL); -+ if (!mp_ioapic_data[i]) { -+ printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i); -+ continue; -+ } -+ memset(mp_ioapic_data[i], 0, size); -+ dev = &mp_ioapic_data[i]->dev; -+ dev->id = i; -+ dev->cls = &ioapic_sysdev_class; -+ error = sysdev_register(dev); -+ if (error) { -+ kfree(mp_ioapic_data[i]); -+ mp_ioapic_data[i] = NULL; -+ printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i); -+ continue; -+ } -+ } -+ -+ return 0; -+} -+ -+device_initcall(ioapic_init_sysfs); -+ -+/* -------------------------------------------------------------------------- -+ ACPI-based IOAPIC Configuration -+ -------------------------------------------------------------------------- */ -+ -+#ifdef CONFIG_ACPI -+ -+int __init io_apic_get_unique_id (int ioapic, int apic_id) -+{ -+#ifndef CONFIG_XEN -+ union IO_APIC_reg_00 reg_00; -+ static physid_mask_t apic_id_map = PHYSID_MASK_NONE; -+ physid_mask_t tmp; -+ unsigned long flags; -+ int i = 0; -+ -+ /* -+ * The P4 platform supports up to 256 APIC IDs on two separate APIC -+ * buses (one for LAPICs, one for IOAPICs), where predecessors only -+ * supports up to 16 on one shared APIC bus. -+ * -+ * TBD: Expand LAPIC/IOAPIC support on P4-class systems to take full -+ * advantage of new APIC bus architecture. -+ */ -+ -+ if (physids_empty(apic_id_map)) -+ apic_id_map = ioapic_phys_id_map(phys_cpu_present_map); -+ -+ spin_lock_irqsave(&ioapic_lock, flags); -+ reg_00.raw = io_apic_read(ioapic, 0); -+ spin_unlock_irqrestore(&ioapic_lock, flags); -+ -+ if (apic_id >= get_physical_broadcast()) { -+ printk(KERN_WARNING "IOAPIC[%d]: Invalid apic_id %d, trying " -+ "%d\n", ioapic, apic_id, reg_00.bits.ID); -+ apic_id = reg_00.bits.ID; -+ } -+ -+ /* -+ * Every APIC in a system must have a unique ID or we get lots of nice -+ * 'stuck on smp_invalidate_needed IPI wait' messages. -+ */ -+ if (check_apicid_used(apic_id_map, apic_id)) { -+ -+ for (i = 0; i < get_physical_broadcast(); i++) { -+ if (!check_apicid_used(apic_id_map, i)) -+ break; -+ } -+ -+ if (i == get_physical_broadcast()) -+ panic("Max apic_id exceeded!\n"); -+ -+ printk(KERN_WARNING "IOAPIC[%d]: apic_id %d already used, " -+ "trying %d\n", ioapic, apic_id, i); -+ -+ apic_id = i; -+ } -+ -+ tmp = apicid_to_cpu_present(apic_id); -+ physids_or(apic_id_map, apic_id_map, tmp); -+ -+ if (reg_00.bits.ID != apic_id) { -+ reg_00.bits.ID = apic_id; -+ -+ spin_lock_irqsave(&ioapic_lock, flags); -+ io_apic_write(ioapic, 0, reg_00.raw); -+ reg_00.raw = io_apic_read(ioapic, 0); -+ spin_unlock_irqrestore(&ioapic_lock, flags); -+ -+ /* Sanity check */ -+ if (reg_00.bits.ID != apic_id) { -+ printk("IOAPIC[%d]: Unable to change apic_id!\n", ioapic); -+ return -1; -+ } -+ } -+ -+ apic_printk(APIC_VERBOSE, KERN_INFO -+ "IOAPIC[%d]: Assigned apic_id %d\n", ioapic, apic_id); -+#endif /* !CONFIG_XEN */ -+ -+ return apic_id; -+} -+ -+ -+int __init io_apic_get_version (int ioapic) -+{ -+ union IO_APIC_reg_01 reg_01; -+ unsigned long flags; -+ -+ spin_lock_irqsave(&ioapic_lock, flags); -+ reg_01.raw = io_apic_read(ioapic, 1); -+ spin_unlock_irqrestore(&ioapic_lock, flags); -+ -+ return reg_01.bits.version; -+} -+ -+ -+int __init io_apic_get_redir_entries (int ioapic) -+{ -+ union IO_APIC_reg_01 reg_01; -+ unsigned long flags; -+ -+ spin_lock_irqsave(&ioapic_lock, flags); -+ reg_01.raw = io_apic_read(ioapic, 1); -+ spin_unlock_irqrestore(&ioapic_lock, flags); -+ -+ return reg_01.bits.entries; -+} -+ -+ -+int io_apic_set_pci_routing (int ioapic, int pin, int irq, int edge_level, int active_high_low) -+{ -+ struct IO_APIC_route_entry entry; -+ unsigned long flags; -+ -+ if (!IO_APIC_IRQ(irq)) { -+ printk(KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n", -+ ioapic); -+ return -EINVAL; -+ } -+ -+ /* -+ * Generate a PCI IRQ routing entry and program the IOAPIC accordingly. -+ * Note that we mask (disable) IRQs now -- these get enabled when the -+ * corresponding device driver registers for this IRQ. -+ */ -+ -+ memset(&entry,0,sizeof(entry)); -+ -+ entry.delivery_mode = INT_DELIVERY_MODE; -+ entry.dest_mode = INT_DEST_MODE; -+ entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS); -+ entry.trigger = edge_level; -+ entry.polarity = active_high_low; -+ entry.mask = 1; -+ -+ /* -+ * IRQs < 16 are already in the irq_2_pin[] map -+ */ -+ if (irq >= 16) -+ add_pin_to_irq(irq, ioapic, pin); -+ -+ entry.vector = assign_irq_vector(irq); -+ -+ apic_printk(APIC_DEBUG, KERN_DEBUG "IOAPIC[%d]: Set PCI routing entry " -+ "(%d-%d -> 0x%x -> IRQ %d Mode:%i Active:%i)\n", ioapic, -+ mp_ioapics[ioapic].mpc_apicid, pin, entry.vector, irq, -+ edge_level, active_high_low); -+ -+ ioapic_register_intr(irq, entry.vector, edge_level); -+ -+ if (!ioapic && (irq < 16)) -+ disable_8259A_irq(irq); -+ -+ spin_lock_irqsave(&ioapic_lock, flags); -+ io_apic_write(ioapic, 0x11+2*pin, *(((int *)&entry)+1)); -+ io_apic_write(ioapic, 0x10+2*pin, *(((int *)&entry)+0)); -+ set_native_irq_info(use_pci_vector() ? entry.vector : irq, TARGET_CPUS); -+ spin_unlock_irqrestore(&ioapic_lock, flags); -+ -+ return 0; -+} -+ -+#endif /* CONFIG_ACPI */ -diff -rpuN linux-2.6.18.8/arch/i386/kernel/ioport-xen.c linux-2.6.18-xen-3.2.0/arch/i386/kernel/ioport-xen.c ---- linux-2.6.18.8/arch/i386/kernel/ioport-xen.c 1969-12-31 16:00:00.000000000 -0800 -+++ linux-2.6.18-xen-3.2.0/arch/i386/kernel/ioport-xen.c 2008-02-15 16:21:49.000000000 -0800 -@@ -0,0 +1,123 @@ -+/* -+ * linux/arch/i386/kernel/ioport.c -+ * -+ * This contains the io-permission bitmap code - written by obz, with changes -+ * by Linus. -+ */ -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+/* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */ -+static void set_bitmap(unsigned long *bitmap, unsigned int base, unsigned int extent, int new_value) -+{ -+ unsigned long mask; -+ unsigned long *bitmap_base = bitmap + (base / BITS_PER_LONG); -+ unsigned int low_index = base & (BITS_PER_LONG-1); -+ int length = low_index + extent; -+ -+ if (low_index != 0) { -+ mask = (~0UL << low_index); -+ if (length < BITS_PER_LONG) -+ mask &= ~(~0UL << length); -+ if (new_value) -+ *bitmap_base++ |= mask; -+ else -+ *bitmap_base++ &= ~mask; -+ length -= BITS_PER_LONG; -+ } -+ -+ mask = (new_value ? ~0UL : 0UL); -+ while (length >= BITS_PER_LONG) { -+ *bitmap_base++ = mask; -+ length -= BITS_PER_LONG; -+ } -+ -+ if (length > 0) { -+ mask = ~(~0UL << length); -+ if (new_value) -+ *bitmap_base++ |= mask; -+ else -+ *bitmap_base++ &= ~mask; -+ } -+} -+ -+ -+/* -+ * this changes the io permissions bitmap in the current task. -+ */ -+asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on) -+{ -+ struct thread_struct * t = ¤t->thread; -+ unsigned long *bitmap; -+ struct physdev_set_iobitmap set_iobitmap; -+ -+ if ((from + num <= from) || (from + num > IO_BITMAP_BITS)) -+ return -EINVAL; -+ if (turn_on && !capable(CAP_SYS_RAWIO)) -+ return -EPERM; -+ -+ /* -+ * If it's the first ioperm() call in this thread's lifetime, set the -+ * IO bitmap up. ioperm() is much less timing critical than clone(), -+ * this is why we delay this operation until now: -+ */ -+ if (!t->io_bitmap_ptr) { -+ bitmap = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL); -+ if (!bitmap) -+ return -ENOMEM; -+ -+ memset(bitmap, 0xff, IO_BITMAP_BYTES); -+ t->io_bitmap_ptr = bitmap; -+ set_thread_flag(TIF_IO_BITMAP); -+ -+ set_xen_guest_handle(set_iobitmap.bitmap, (char *)bitmap); -+ set_iobitmap.nr_ports = IO_BITMAP_BITS; -+ WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iobitmap, -+ &set_iobitmap)); -+ } -+ -+ set_bitmap(t->io_bitmap_ptr, from, num, !turn_on); -+ -+ return 0; -+} -+ -+/* -+ * sys_iopl has to be used when you want to access the IO ports -+ * beyond the 0x3ff range: to get the full 65536 ports bitmapped -+ * you'd need 8kB of bitmaps/process, which is a bit excessive. -+ * -+ * Here we just change the eflags value on the stack: we allow -+ * only the super-user to do it. This depends on the stack-layout -+ * on system-call entry - see also fork() and the signal handling -+ * code. -+ */ -+ -+asmlinkage long sys_iopl(unsigned long unused) -+{ -+ volatile struct pt_regs * regs = (struct pt_regs *) &unused; -+ unsigned int level = regs->ebx; -+ struct thread_struct *t = ¤t->thread; -+ unsigned int old = (t->iopl >> 12) & 3; -+ -+ if (level > 3) -+ return -EINVAL; -+ /* Trying to gain more privileges? */ -+ if (level > old) { -+ if (!capable(CAP_SYS_RAWIO)) -+ return -EPERM; -+ } -+ t->iopl = level << 12; -+ set_iopl_mask(t->iopl); -+ return 0; -+} -diff -rpuN linux-2.6.18.8/arch/i386/kernel/irq-xen.c linux-2.6.18-xen-3.2.0/arch/i386/kernel/irq-xen.c ---- linux-2.6.18.8/arch/i386/kernel/irq-xen.c 1969-12-31 16:00:00.000000000 -0800 -+++ linux-2.6.18-xen-3.2.0/arch/i386/kernel/irq-xen.c 2008-02-15 16:21:49.000000000 -0800 -@@ -0,0 +1,324 @@ -+/* -+ * linux/arch/i386/kernel/irq.c -+ * -+ * Copyright (C) 1992, 1998 Linus Torvalds, Ingo Molnar -+ * -+ * This file contains the lowest level x86-specific interrupt -+ * entry, irq-stacks and irq statistics code. All the remaining -+ * irq logic is done by the generic kernel/irq/ code and -+ * by the x86-specific irq controller code. (e.g. i8259.c and -+ * io_apic.c.) -+ */ -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+DEFINE_PER_CPU(irq_cpustat_t, irq_stat) ____cacheline_internodealigned_in_smp; -+EXPORT_PER_CPU_SYMBOL(irq_stat); -+ -+#ifndef CONFIG_X86_LOCAL_APIC -+/* -+ * 'what should we do if we get a hw irq event on an illegal vector'. -+ * each architecture has to answer this themselves. -+ */ -+void ack_bad_irq(unsigned int irq) -+{ -+ printk("unexpected IRQ trap at vector %02x\n", irq); -+} -+#endif -+ -+#ifdef CONFIG_4KSTACKS -+/* -+ * per-CPU IRQ handling contexts (thread information and stack) -+ */ -+union irq_ctx { -+ struct thread_info tinfo; -+ u32 stack[THREAD_SIZE/sizeof(u32)]; -+}; -+ -+static union irq_ctx *hardirq_ctx[NR_CPUS] __read_mostly; -+static union irq_ctx *softirq_ctx[NR_CPUS] __read_mostly; -+#endif -+ -+/* -+ * do_IRQ handles all normal device IRQ's (the special -+ * SMP cross-CPU interrupts have their own specific -+ * handlers). -+ */ -+fastcall unsigned int do_IRQ(struct pt_regs *regs) -+{ -+ /* high bit used in ret_from_ code */ -+ int irq = ~regs->orig_eax; -+#ifdef CONFIG_4KSTACKS -+ union irq_ctx *curctx, *irqctx; -+ u32 *isp; -+#endif -+ -+ if (unlikely((unsigned)irq >= NR_IRQS)) { -+ printk(KERN_EMERG "%s: cannot handle IRQ %d\n", -+ __FUNCTION__, irq); -+ BUG(); -+ } -+ -+ irq_enter(); -+#ifdef CONFIG_DEBUG_STACKOVERFLOW -+ /* Debugging check for stack overflow: is there less than 1KB free? */ -+ { -+ long esp; -+ -+ __asm__ __volatile__("andl %%esp,%0" : -+ "=r" (esp) : "0" (THREAD_SIZE - 1)); -+ if (unlikely(esp < (sizeof(struct thread_info) + STACK_WARN))) { -+ printk("do_IRQ: stack overflow: %ld\n", -+ esp - sizeof(struct thread_info)); -+ dump_stack(); -+ } -+ } -+#endif -+ -+#ifdef CONFIG_4KSTACKS -+ -+ curctx = (union irq_ctx *) current_thread_info(); -+ irqctx = hardirq_ctx[smp_processor_id()]; -+ -+ /* -+ * this is where we switch to the IRQ stack. However, if we are -+ * already using the IRQ stack (because we interrupted a hardirq -+ * handler) we can't do that and just have to keep using the -+ * current stack (which is the irq stack already after all) -+ */ -+ if (curctx != irqctx) { -+ int arg1, arg2, ebx; -+ -+ /* build the stack frame on the IRQ stack */ -+ isp = (u32*) ((char*)irqctx + sizeof(*irqctx)); -+ irqctx->tinfo.task = curctx->tinfo.task; -+ irqctx->tinfo.previous_esp = current_stack_pointer; -+ -+ /* -+ * Copy the softirq bits in preempt_count so that the -+ * softirq checks work in the hardirq context. -+ */ -+ irqctx->tinfo.preempt_count = -+ (irqctx->tinfo.preempt_count & ~SOFTIRQ_MASK) | -+ (curctx->tinfo.preempt_count & SOFTIRQ_MASK); -+ -+ asm volatile( -+ " xchgl %%ebx,%%esp \n" -+ " call __do_IRQ \n" -+ " movl %%ebx,%%esp \n" -+ : "=a" (arg1), "=d" (arg2), "=b" (ebx) -+ : "0" (irq), "1" (regs), "2" (isp) -+ : "memory", "cc", "ecx" -+ ); -+ } else -+#endif -+ __do_IRQ(irq, regs); -+ -+ irq_exit(); -+ -+ return 1; -+} -+ -+#ifdef CONFIG_4KSTACKS -+ -+/* -+ * These should really be __section__(".bss.page_aligned") as well, but -+ * gcc's 3.0 and earlier don't handle that correctly. -+ */ -+static char softirq_stack[NR_CPUS * THREAD_SIZE] -+ __attribute__((__aligned__(THREAD_SIZE))); -+ -+static char hardirq_stack[NR_CPUS * THREAD_SIZE] -+ __attribute__((__aligned__(THREAD_SIZE))); -+ -+/* -+ * allocate per-cpu stacks for hardirq and for softirq processing -+ */ -+void irq_ctx_init(int cpu) -+{ -+ union irq_ctx *irqctx; -+ -+ if (hardirq_ctx[cpu]) -+ return; -+ -+ irqctx = (union irq_ctx*) &hardirq_stack[cpu*THREAD_SIZE]; -+ irqctx->tinfo.task = NULL; -+ irqctx->tinfo.exec_domain = NULL; -+ irqctx->tinfo.cpu = cpu; -+ irqctx->tinfo.preempt_count = HARDIRQ_OFFSET; -+ irqctx->tinfo.addr_limit = MAKE_MM_SEG(0); -+ -+ hardirq_ctx[cpu] = irqctx; -+ -+ irqctx = (union irq_ctx*) &softirq_stack[cpu*THREAD_SIZE]; -+ irqctx->tinfo.task = NULL; -+ irqctx->tinfo.exec_domain = NULL; -+ irqctx->tinfo.cpu = cpu; -+ irqctx->tinfo.preempt_count = 0; -+ irqctx->tinfo.addr_limit = MAKE_MM_SEG(0); -+ -+ softirq_ctx[cpu] = irqctx; -+ -+ printk("CPU %u irqstacks, hard=%p soft=%p\n", -+ cpu,hardirq_ctx[cpu],softirq_ctx[cpu]); -+} -+ -+void irq_ctx_exit(int cpu) -+{ -+ hardirq_ctx[cpu] = NULL; -+} -+ -+extern asmlinkage void __do_softirq(void); -+ -+asmlinkage void do_softirq(void) -+{ -+ unsigned long flags; -+ struct thread_info *curctx; -+ union irq_ctx *irqctx; -+ u32 *isp; -+ -+ if (in_interrupt()) -+ return; -+ -+ local_irq_save(flags); -+ -+ if (local_softirq_pending()) { -+ curctx = current_thread_info(); -+ irqctx = softirq_ctx[smp_processor_id()]; -+ irqctx->tinfo.task = curctx->task; -+ irqctx->tinfo.previous_esp = current_stack_pointer; -+ -+ /* build the stack frame on the softirq stack */ -+ isp = (u32*) ((char*)irqctx + sizeof(*irqctx)); -+ -+ asm volatile( -+ " xchgl %%ebx,%%esp \n" -+ " call __do_softirq \n" -+ " movl %%ebx,%%esp \n" -+ : "=b"(isp) -+ : "0"(isp) -+ : "memory", "cc", "edx", "ecx", "eax" -+ ); -+ /* -+ * Shouldnt happen, we returned above if in_interrupt(): -+ */ -+ WARN_ON_ONCE(softirq_count()); -+ } -+ -+ local_irq_restore(flags); -+} -+ -+EXPORT_SYMBOL(do_softirq); -+#endif -+ -+/* -+ * Interrupt statistics: -+ */ -+ -+atomic_t irq_err_count; -+ -+/* -+ * /proc/interrupts printing: -+ */ -+ -+int show_interrupts(struct seq_file *p, void *v) -+{ -+ int i = *(loff_t *) v, j; -+ struct irqaction * action; -+ unsigned long flags; -+ -+ if (i == 0) { -+ seq_printf(p, " "); -+ for_each_online_cpu(j) -+ seq_printf(p, "CPU%-8d",j); -+ seq_putc(p, '\n'); -+ } -+ -+ if (i < NR_IRQS) { -+ spin_lock_irqsave(&irq_desc[i].lock, flags); -+ action = irq_desc[i].action; -+ if (!action) -+ goto skip; -+ seq_printf(p, "%3d: ",i); -+#ifndef CONFIG_SMP -+ seq_printf(p, "%10u ", kstat_irqs(i)); -+#else -+ for_each_online_cpu(j) -+ seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]); -+#endif -+ seq_printf(p, " %14s", irq_desc[i].chip->typename); -+ seq_printf(p, " %s", action->name); -+ -+ for (action=action->next; action; action = action->next) -+ seq_printf(p, ", %s", action->name); -+ -+ seq_putc(p, '\n'); -+skip: -+ spin_unlock_irqrestore(&irq_desc[i].lock, flags); -+ } else if (i == NR_IRQS) { -+ seq_printf(p, "NMI: "); -+ for_each_online_cpu(j) -+ seq_printf(p, "%10u ", nmi_count(j)); -+ seq_putc(p, '\n'); -+#ifdef CONFIG_X86_LOCAL_APIC -+ seq_printf(p, "LOC: "); -+ for_each_online_cpu(j) -+ seq_printf(p, "%10u ", -+ per_cpu(irq_stat,j).apic_timer_irqs); -+ seq_putc(p, '\n'); -+#endif -+ seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count)); -+#if defined(CONFIG_X86_IO_APIC) -+ seq_printf(p, "MIS: %10u\n", atomic_read(&irq_mis_count)); -+#endif -+ } -+ return 0; -+} -+ -+#ifdef CONFIG_HOTPLUG_CPU -+ -+void fixup_irqs(cpumask_t map) -+{ -+ unsigned int irq; -+ static int warned; -+ -+ for (irq = 0; irq < NR_IRQS; irq++) { -+ cpumask_t mask; -+ if (irq == 2) -+ continue; -+ -+ cpus_and(mask, irq_desc[irq].affinity, map); -+ if (any_online_cpu(mask) == NR_CPUS) { -+ /*printk("Breaking affinity for irq %i\n", irq);*/ -+ mask = map; -+ } -+ if (irq_desc[irq].chip->set_affinity) -+ irq_desc[irq].chip->set_affinity(irq, mask); -+ else if (irq_desc[irq].action && !(warned++)) -+ printk("Cannot set affinity for irq %i\n", irq); -+ } -+ -+#if 0 -+ barrier(); -+ /* Ingo Molnar says: "after the IO-APIC masks have been redirected -+ [note the nop - the interrupt-enable boundary on x86 is two -+ instructions from sti] - to flush out pending hardirqs and -+ IPIs. After this point nothing is supposed to reach this CPU." */ -+ __asm__ __volatile__("sti; nop; cli"); -+ barrier(); -+#else -+ /* That doesn't seem sufficient. Give it 1ms. */ -+ local_irq_enable(); -+ mdelay(1); -+ local_irq_disable(); -+#endif -+} -+#endif -+ -diff -rpuN linux-2.6.18.8/arch/i386/kernel/ldt-xen.c linux-2.6.18-xen-3.2.0/arch/i386/kernel/ldt-xen.c ---- linux-2.6.18.8/arch/i386/kernel/ldt-xen.c 1969-12-31 16:00:00.000000000 -0800 -+++ linux-2.6.18-xen-3.2.0/arch/i386/kernel/ldt-xen.c 2008-02-15 16:21:49.000000000 -0800 -@@ -0,0 +1,270 @@ -+/* -+ * linux/kernel/ldt.c -+ * -+ * Copyright (C) 1992 Krishna Balasubramanian and Linus Torvalds -+ * Copyright (C) 1999 Ingo Molnar -+ */ -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#include -+#include -+#include -+#include -+#include -+ -+#ifdef CONFIG_SMP /* avoids "defined but not used" warnig */ -+static void flush_ldt(void *null) -+{ -+ if (current->active_mm) -+ load_LDT(¤t->active_mm->context); -+} -+#endif -+ -+static int alloc_ldt(mm_context_t *pc, int mincount, int reload) -+{ -+ void *oldldt; -+ void *newldt; -+ int oldsize; -+ -+ if (mincount <= pc->size) -+ return 0; -+ oldsize = pc->size; -+ mincount = (mincount+511)&(~511); -+ if (mincount*LDT_ENTRY_SIZE > PAGE_SIZE) -+ newldt = vmalloc(mincount*LDT_ENTRY_SIZE); -+ else -+ newldt = kmalloc(mincount*LDT_ENTRY_SIZE, GFP_KERNEL); -+ -+ if (!newldt) -+ return -ENOMEM; -+ -+ if (oldsize) -+ memcpy(newldt, pc->ldt, oldsize*LDT_ENTRY_SIZE); -+ oldldt = pc->ldt; -+ memset(newldt+oldsize*LDT_ENTRY_SIZE, 0, (mincount-oldsize)*LDT_ENTRY_SIZE); -+ pc->ldt = newldt; -+ wmb(); -+ pc->size = mincount; -+ wmb(); -+ -+ if (reload) { -+#ifdef CONFIG_SMP -+ cpumask_t mask; -+ preempt_disable(); -+#endif -+ make_pages_readonly( -+ pc->ldt, -+ (pc->size * LDT_ENTRY_SIZE) / PAGE_SIZE, -+ XENFEAT_writable_descriptor_tables); -+ load_LDT(pc); -+#ifdef CONFIG_SMP -+ mask = cpumask_of_cpu(smp_processor_id()); -+ if (!cpus_equal(current->mm->cpu_vm_mask, mask)) -+ smp_call_function(flush_ldt, NULL, 1, 1); -+ preempt_enable(); -+#endif -+ } -+ if (oldsize) { -+ make_pages_writable( -+ oldldt, -+ (oldsize * LDT_ENTRY_SIZE) / PAGE_SIZE, -+ XENFEAT_writable_descriptor_tables); -+ if (oldsize*LDT_ENTRY_SIZE > PAGE_SIZE) -+ vfree(oldldt); -+ else -+ kfree(oldldt); -+ } -+ return 0; -+} -+ -+static inline int copy_ldt(mm_context_t *new, mm_context_t *old) -+{ -+ int err = alloc_ldt(new, old->size, 0); -+ if (err < 0) -+ return err; -+ memcpy(new->ldt, old->ldt, old->size*LDT_ENTRY_SIZE); -+ make_pages_readonly( -+ new->ldt, -+ (new->size * LDT_ENTRY_SIZE) / PAGE_SIZE, -+ XENFEAT_writable_descriptor_tables); -+ return 0; -+} -+ -+/* -+ * we do not have to muck with descriptors here, that is -+ * done in switch_mm() as needed. -+ */ -+int init_new_context(struct task_struct *tsk, struct mm_struct *mm) -+{ -+ struct mm_struct * old_mm; -+ int retval = 0; -+ -+ init_MUTEX(&mm->context.sem); -+ mm->context.size = 0; -+ mm->context.has_foreign_mappings = 0; -+ old_mm = current->mm; -+ if (old_mm && old_mm->context.size > 0) { -+ down(&old_mm->context.sem); -+ retval = copy_ldt(&mm->context, &old_mm->context); -+ up(&old_mm->context.sem); -+ } -+ return retval; -+} -+ -+/* -+ * No need to lock the MM as we are the last user -+ */ -+void destroy_context(struct mm_struct *mm) -+{ -+ if (mm->context.size) { -+ if (mm == current->active_mm) -+ clear_LDT(); -+ make_pages_writable( -+ mm->context.ldt, -+ (mm->context.size * LDT_ENTRY_SIZE) / PAGE_SIZE, -+ XENFEAT_writable_descriptor_tables); -+ if (mm->context.size*LDT_ENTRY_SIZE > PAGE_SIZE) -+ vfree(mm->context.ldt); -+ else -+ kfree(mm->context.ldt); -+ mm->context.size = 0; -+ } -+} -+ -+static int read_ldt(void __user * ptr, unsigned long bytecount) -+{ -+ int err; -+ unsigned long size; -+ struct mm_struct * mm = current->mm; -+ -+ if (!mm->context.size) -+ return 0; -+ if (bytecount > LDT_ENTRY_SIZE*LDT_ENTRIES) -+ bytecount = LDT_ENTRY_SIZE*LDT_ENTRIES; -+ -+ down(&mm->context.sem); -+ size = mm->context.size*LDT_ENTRY_SIZE; -+ if (size > bytecount) -+ size = bytecount; -+ -+ err = 0; -+ if (copy_to_user(ptr, mm->context.ldt, size)) -+ err = -EFAULT; -+ up(&mm->context.sem); -+ if (err < 0) -+ goto error_return; -+ if (size != bytecount) { -+ /* zero-fill the rest */ -+ if (clear_user(ptr+size, bytecount-size) != 0) { -+ err = -EFAULT; -+ goto error_return; -+ } -+ } -+ return bytecount; -+error_return: -+ return err; -+} -+ -+static int read_default_ldt(void __user * ptr, unsigned long bytecount) -+{ -+ int err; -+ unsigned long size; -+ void *address; -+ -+ err = 0; -+ address = &default_ldt[0]; -+ size = 5*sizeof(struct desc_struct); -+ if (size > bytecount) -+ size = bytecount; -+ -+ err = size; -+ if (copy_to_user(ptr, address, size)) -+ err = -EFAULT; -+ -+ return err; -+} -+ -+static int write_ldt(void __user * ptr, unsigned long bytecount, int oldmode) -+{ -+ struct mm_struct * mm = current->mm; -+ __u32 entry_1, entry_2; -+ int error; -+ struct user_desc ldt_info; -+ -+ error = -EINVAL; -+ if (bytecount != sizeof(ldt_info)) -+ goto out; -+ error = -EFAULT; -+ if (copy_from_user(&ldt_info, ptr, sizeof(ldt_info))) -+ goto out; -+ -+ error = -EINVAL; -+ if (ldt_info.entry_number >= LDT_ENTRIES) -+ goto out; -+ if (ldt_info.contents == 3) { -+ if (oldmode) -+ goto out; -+ if (ldt_info.seg_not_present == 0) -+ goto out; -+ } -+ -+ down(&mm->context.sem); -+ if (ldt_info.entry_number >= mm->context.size) { -+ error = alloc_ldt(¤t->mm->context, ldt_info.entry_number+1, 1); -+ if (error < 0) -+ goto out_unlock; -+ } -+ -+ /* Allow LDTs to be cleared by the user. */ -+ if (ldt_info.base_addr == 0 && ldt_info.limit == 0) { -+ if (oldmode || LDT_empty(&ldt_info)) { -+ entry_1 = 0; -+ entry_2 = 0; -+ goto install; -+ } -+ } -+ -+ entry_1 = LDT_entry_a(&ldt_info); -+ entry_2 = LDT_entry_b(&ldt_info); -+ if (oldmode) -+ entry_2 &= ~(1 << 20); -+ -+ /* Install the new entry ... */ -+install: -+ error = write_ldt_entry(mm->context.ldt, ldt_info.entry_number, -+ entry_1, entry_2); -+ -+out_unlock: -+ up(&mm->context.sem); -+out: -+ return error; -+} -+ -+asmlinkage int sys_modify_ldt(int func, void __user *ptr, unsigned long bytecount) -+{ -+ int ret = -ENOSYS; -+ -+ switch (func) { -+ case 0: -+ ret = read_ldt(ptr, bytecount); -+ break; -+ case 1: -+ ret = write_ldt(ptr, bytecount, 1); -+ break; -+ case 2: -+ ret = read_default_ldt(ptr, bytecount); -+ break; -+ case 0x11: -+ ret = write_ldt(ptr, bytecount, 0); -+ break; -+ } -+ return ret; -+} -diff -rpuN linux-2.6.18.8/arch/i386/kernel/machine_kexec.c linux-2.6.18-xen-3.2.0/arch/i386/kernel/machine_kexec.c ---- linux-2.6.18.8/arch/i386/kernel/machine_kexec.c 2007-02-23 15:52:30.000000000 -0800 -+++ linux-2.6.18-xen-3.2.0/arch/i386/kernel/machine_kexec.c 2008-02-15 16:21:49.000000000 -0800 -@@ -19,123 +19,52 @@ - #include - #include - --#define PAGE_ALIGNED __attribute__ ((__aligned__(PAGE_SIZE))) -- --#define L0_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY) --#define L1_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY) --#define L2_ATTR (_PAGE_PRESENT) -- --#define LEVEL0_SIZE (1UL << 12UL) -- --#ifndef CONFIG_X86_PAE --#define LEVEL1_SIZE (1UL << 22UL) --static u32 pgtable_level1[1024] PAGE_ALIGNED; -- --static void identity_map_page(unsigned long address) --{ -- unsigned long level1_index, level2_index; -- u32 *pgtable_level2; -- -- /* Find the current page table */ -- pgtable_level2 = __va(read_cr3()); -- -- /* Find the indexes of the physical address to identity map */ -- level1_index = (address % LEVEL1_SIZE)/LEVEL0_SIZE; -- level2_index = address / LEVEL1_SIZE; -- -- /* Identity map the page table entry */ -- pgtable_level1[level1_index] = address | L0_ATTR; -- pgtable_level2[level2_index] = __pa(pgtable_level1) | L1_ATTR; -- -- /* Flush the tlb so the new mapping takes effect. -- * Global tlb entries are not flushed but that is not an issue. -- */ -- load_cr3(pgtable_level2); --} -- --#else --#define LEVEL1_SIZE (1UL << 21UL) --#define LEVEL2_SIZE (1UL << 30UL) --static u64 pgtable_level1[512] PAGE_ALIGNED; --static u64 pgtable_level2[512] PAGE_ALIGNED; -- --static void identity_map_page(unsigned long address) --{ -- unsigned long level1_index, level2_index, level3_index; -- u64 *pgtable_level3; -- -- /* Find the current page table */ -- pgtable_level3 = __va(read_cr3()); -+#ifdef CONFIG_XEN -+#include -+#endif - -- /* Find the indexes of the physical address to identity map */ -- level1_index = (address % LEVEL1_SIZE)/LEVEL0_SIZE; -- level2_index = (address % LEVEL2_SIZE)/LEVEL1_SIZE; -- level3_index = address / LEVEL2_SIZE; -- -- /* Identity map the page table entry */ -- pgtable_level1[level1_index] = address | L0_ATTR; -- pgtable_level2[level2_index] = __pa(pgtable_level1) | L1_ATTR; -- set_64bit(&pgtable_level3[level3_index], -- __pa(pgtable_level2) | L2_ATTR); -- -- /* Flush the tlb so the new mapping takes effect. -- * Global tlb entries are not flushed but that is not an issue. -- */ -- load_cr3(pgtable_level3); --} -+#define PAGE_ALIGNED __attribute__ ((__aligned__(PAGE_SIZE))) -+static u32 kexec_pgd[1024] PAGE_ALIGNED; -+#ifdef CONFIG_X86_PAE -+static u32 kexec_pmd0[1024] PAGE_ALIGNED; -+static u32 kexec_pmd1[1024] PAGE_ALIGNED; - #endif -+static u32 kexec_pte0[1024] PAGE_ALIGNED; -+static u32 kexec_pte1[1024] PAGE_ALIGNED; - --static void set_idt(void *newidt, __u16 limit) --{ -- struct Xgt_desc_struct curidt; -+#ifdef CONFIG_XEN - -- /* ia32 supports unaliged loads & stores */ -- curidt.size = limit; -- curidt.address = (unsigned long)newidt; -+#define __ma(x) (pfn_to_mfn(__pa((x)) >> PAGE_SHIFT) << PAGE_SHIFT) - -- load_idt(&curidt); --}; -+#if PAGES_NR > KEXEC_XEN_NO_PAGES -+#error PAGES_NR is greater than KEXEC_XEN_NO_PAGES - Xen support will break -+#endif - -+#if PA_CONTROL_PAGE != 0 -+#error PA_CONTROL_PAGE is non zero - Xen support will break -+#endif - --static void set_gdt(void *newgdt, __u16 limit) -+void machine_kexec_setup_load_arg(xen_kexec_image_t *xki, struct kimage *image) - { -- struct Xgt_desc_struct curgdt; -+ void *control_page; - -- /* ia32 supports unaligned loads & stores */ -- curgdt.size = limit; -- curgdt.address = (unsigned long)newgdt; -+ memset(xki->page_list, 0, sizeof(xki->page_list)); - -- load_gdt(&curgdt); --}; -+ control_page = page_address(image->control_code_page); -+ memcpy(control_page, relocate_kernel, PAGE_SIZE); - --static void load_segments(void) --{ --#define __STR(X) #X --#define STR(X) __STR(X) -+ xki->page_list[PA_CONTROL_PAGE] = __ma(control_page); -+ xki->page_list[PA_PGD] = __ma(kexec_pgd); -+#ifdef CONFIG_X86_PAE -+ xki->page_list[PA_PMD_0] = __ma(kexec_pmd0); -+ xki->page_list[PA_PMD_1] = __ma(kexec_pmd1); -+#endif -+ xki->page_list[PA_PTE_0] = __ma(kexec_pte0); -+ xki->page_list[PA_PTE_1] = __ma(kexec_pte1); - -- __asm__ __volatile__ ( -- "\tljmp $"STR(__KERNEL_CS)",$1f\n" -- "\t1:\n" -- "\tmovl $"STR(__KERNEL_DS)",%%eax\n" -- "\tmovl %%eax,%%ds\n" -- "\tmovl %%eax,%%es\n" -- "\tmovl %%eax,%%fs\n" -- "\tmovl %%eax,%%gs\n" -- "\tmovl %%eax,%%ss\n" -- ::: "eax", "memory"); --#undef STR --#undef __STR - } - --typedef asmlinkage NORET_TYPE void (*relocate_new_kernel_t)( -- unsigned long indirection_page, -- unsigned long reboot_code_buffer, -- unsigned long start_address, -- unsigned int has_pae) ATTRIB_NORET; -- --extern const unsigned char relocate_new_kernel[]; --extern void relocate_new_kernel_end(void); --extern const unsigned int relocate_new_kernel_size; -+#endif /* CONFIG_XEN */ - - /* - * A architecture hook called to validate the -@@ -163,49 +92,38 @@ void machine_kexec_cleanup(struct kimage - { - } - -+#ifndef CONFIG_XEN - /* - * Do not allocate memory (or fail in any way) in machine_kexec(). - * We are past the point of no return, committed to rebooting now. - */ - NORET_TYPE void machine_kexec(struct kimage *image) - { -- unsigned long page_list; -- unsigned long reboot_code_buffer; -- -- relocate_new_kernel_t rnk; -+ unsigned long page_list[PAGES_NR]; -+ void *control_page; - - /* Interrupts aren't acceptable while we reboot */ - local_irq_disable(); - -- /* Compute some offsets */ -- reboot_code_buffer = page_to_pfn(image->control_code_page) -- << PAGE_SHIFT; -- page_list = image->head; -- -- /* Set up an identity mapping for the reboot_code_buffer */ -- identity_map_page(reboot_code_buffer); -- -- /* copy it out */ -- memcpy((void *)reboot_code_buffer, relocate_new_kernel, -- relocate_new_kernel_size); -- -- /* The segment registers are funny things, they have both a -- * visible and an invisible part. Whenever the visible part is -- * set to a specific selector, the invisible part is loaded -- * with from a table in memory. At no other time is the -- * descriptor table in memory accessed. -- * -- * I take advantage of this here by force loading the -- * segments, before I zap the gdt with an invalid value. -- */ -- load_segments(); -- /* The gdt & idt are now invalid. -- * If you want to load them you must set up your own idt & gdt. -- */ -- set_gdt(phys_to_virt(0),0); -- set_idt(phys_to_virt(0),0); -- -- /* now call it */ -- rnk = (relocate_new_kernel_t) reboot_code_buffer; -- (*rnk)(page_list, reboot_code_buffer, image->start, cpu_has_pae); -+ control_page = page_address(image->control_code_page); -+ memcpy(control_page, relocate_kernel, PAGE_SIZE); -+ -+ page_list[PA_CONTROL_PAGE] = __pa(control_page); -+ page_list[VA_CONTROL_PAGE] = (unsigned long)relocate_kernel; -+ page_list[PA_PGD] = __pa(kexec_pgd); -+ page_list[VA_PGD] = (unsigned long)kexec_pgd; -+#ifdef CONFIG_X86_PAE -+ page_list[PA_PMD_0] = __pa(kexec_pmd0); -+ page_list[VA_PMD_0] = (unsigned long)kexec_pmd0; -+ page_list[PA_PMD_1] = __pa(kexec_pmd1); -+ page_list[VA_PMD_1] = (unsigned long)kexec_pmd1; -+#endif -+ page_list[PA_PTE_0] = __pa(kexec_pte0); -+ page_list[VA_PTE_0] = (unsigned long)kexec_pte0; -+ page_list[PA_PTE_1] = __pa(kexec_pte1); -+ page_list[VA_PTE_1] = (unsigned long)kexec_pte1; -+ -+ relocate_kernel((unsigned long)image->head, (unsigned long)page_list, -+ image->start, cpu_has_pae); - } -+#endif -diff -rpuN linux-2.6.18.8/arch/i386/kernel/microcode-xen.c linux-2.6.18-xen-3.2.0/arch/i386/kernel/microcode-xen.c ---- linux-2.6.18.8/arch/i386/kernel/microcode-xen.c 1969-12-31 16:00:00.000000000 -0800 -+++ linux-2.6.18-xen-3.2.0/arch/i386/kernel/microcode-xen.c 2008-02-15 16:21:49.000000000 -0800 -@@ -0,0 +1,144 @@ -+/* -+ * Intel CPU Microcode Update Driver for Linux -+ * -+ * Copyright (C) 2000-2004 Tigran Aivazian -+ * -+ * This driver allows to upgrade microcode on Intel processors -+ * belonging to IA-32 family - PentiumPro, Pentium II, -+ * Pentium III, Xeon, Pentium 4, etc. -+ * -+ * Reference: Section 8.10 of Volume III, Intel Pentium 4 Manual, -+ * Order Number 245472 or free download from: -+ * -+ * http://developer.intel.com/design/pentium4/manuals/245472.htm -+ * -+ * For more information, go to http://www.urbanmyth.org/microcode -+ * -+ * This program is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU General Public License -+ * as published by the Free Software Foundation; either version -+ * 2 of the License, or (at your option) any later version. -+ */ -+ -+//#define DEBUG /* pr_debug */ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#include -+#include -+#include -+ -+MODULE_DESCRIPTION("Intel CPU (IA-32) Microcode Update Driver"); -+MODULE_AUTHOR("Tigran Aivazian "); -+MODULE_LICENSE("GPL"); -+ -+static int verbose; -+module_param(verbose, int, 0644); -+ -+#define MICROCODE_VERSION "1.14a-xen" -+ -+#define DEFAULT_UCODE_DATASIZE (2000) /* 2000 bytes */ -+#define MC_HEADER_SIZE (sizeof (microcode_header_t)) /* 48 bytes */ -+#define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE) /* 2048 bytes */ -+ -+/* no concurrent ->write()s are allowed on /dev/cpu/microcode */ -+static DEFINE_MUTEX(microcode_mutex); -+ -+static int microcode_open (struct inode *unused1, struct file *unused2) -+{ -+ return capable(CAP_SYS_RAWIO) ? 0 : -EPERM; -+} -+ -+ -+static int do_microcode_update (const void __user *ubuf, size_t len) -+{ -+ int err; -+ void *kbuf; -+ -+ kbuf = vmalloc(len); -+ if (!kbuf) -+ return -ENOMEM; -+ -+ if (copy_from_user(kbuf, ubuf, len) == 0) { -+ struct xen_platform_op op; -+ -+ op.cmd = XENPF_microcode_update; -+ set_xen_guest_handle(op.u.microcode.data, kbuf); -+ op.u.microcode.length = len; -+ err = HYPERVISOR_platform_op(&op); -+ } else -+ err = -EFAULT; -+ -+ vfree(kbuf); -+ -+ return err; -+} -+ -+static ssize_t microcode_write (struct file *file, const char __user *buf, size_t len, loff_t *ppos) -+{ -+ ssize_t ret; -+ -+ if (len < MC_HEADER_SIZE) { -+ printk(KERN_ERR "microcode: not enough data\n"); -+ return -EINVAL; -+ } -+ -+ mutex_lock(µcode_mutex); -+ -+ ret = do_microcode_update(buf, len); -+ if (!ret) -+ ret = (ssize_t)len; -+ -+ mutex_unlock(µcode_mutex); -+ -+ return ret; -+} -+ -+static struct file_operations microcode_fops = { -+ .owner = THIS_MODULE, -+ .write = microcode_write, -+ .open = microcode_open, -+}; -+ -+static struct miscdevice microcode_dev = { -+ .minor = MICROCODE_MINOR, -+ .name = "microcode", -+ .fops = µcode_fops, -+}; -+ -+static int __init microcode_init (void) -+{ -+ int error; -+ -+ error = misc_register(µcode_dev); -+ if (error) { -+ printk(KERN_ERR -+ "microcode: can't misc_register on minor=%d\n", -+ MICROCODE_MINOR); -+ return error; -+ } -+ -+ printk(KERN_INFO -+ "IA-32 Microcode Update Driver: v" MICROCODE_VERSION " \n"); -+ return 0; -+} -+ -+static void __exit microcode_exit (void) -+{ -+ misc_deregister(µcode_dev); -+} -+ -+module_init(microcode_init) -+module_exit(microcode_exit) -+MODULE_ALIAS_MISCDEV(MICROCODE_MINOR); -diff -rpuN linux-2.6.18.8/arch/i386/kernel/mpparse-xen.c linux-2.6.18-xen-3.2.0/arch/i386/kernel/mpparse-xen.c ---- linux-2.6.18.8/arch/i386/kernel/mpparse-xen.c 1969-12-31 16:00:00.000000000 -0800 -+++ linux-2.6.18-xen-3.2.0/arch/i386/kernel/mpparse-xen.c 2008-02-15 16:21:49.000000000 -0800 -@@ -0,0 +1,1185 @@ -+/* -+ * Intel Multiprocessor Specification 1.1 and 1.4 -+ * compliant MP-table parsing routines. -+ * -+ * (c) 1995 Alan Cox, Building #3 -+ * (c) 1998, 1999, 2000 Ingo Molnar -+ * -+ * Fixes -+ * Erich Boleyn : MP v1.4 and additional changes. -+ * Alan Cox : Added EBDA scanning -+ * Ingo Molnar : various cleanups and rewrites -+ * Maciej W. Rozycki: Bits for default MP configurations -+ * Paul Diefenbaugh: Added full ACPI support -+ */ -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#include -+#include -+#include -+#include -+#include -+ -+#include -+#include -+#include -+ -+/* Have we found an MP table */ -+int smp_found_config; -+unsigned int __initdata maxcpus = NR_CPUS; -+ -+/* -+ * Various Linux-internal data structures created from the -+ * MP-table. -+ */ -+int apic_version [MAX_APICS]; -+int mp_bus_id_to_type [MAX_MP_BUSSES]; -+int mp_bus_id_to_node [MAX_MP_BUSSES]; -+int mp_bus_id_to_local [MAX_MP_BUSSES]; -+int quad_local_to_mp_bus_id [NR_CPUS/4][4]; -+int mp_bus_id_to_pci_bus [MAX_MP_BUSSES] = { [0 ... MAX_MP_BUSSES-1] = -1 }; -+static int mp_current_pci_id; -+ -+/* I/O APIC entries */ -+struct mpc_config_ioapic mp_ioapics[MAX_IO_APICS]; -+ -+/* # of MP IRQ source entries */ -+struct mpc_config_intsrc mp_irqs[MAX_IRQ_SOURCES]; -+ -+/* MP IRQ source entries */ -+int mp_irq_entries; -+ -+int nr_ioapics; -+ -+int pic_mode; -+unsigned long mp_lapic_addr; -+ -+unsigned int def_to_bigsmp = 0; -+ -+/* Processor that is doing the boot up */ -+unsigned int boot_cpu_physical_apicid = -1U; -+/* Internal processor count */ -+static unsigned int __devinitdata num_processors; -+ -+/* Bitmask of physically existing CPUs */ -+physid_mask_t phys_cpu_present_map; -+ -+u8 bios_cpu_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID }; -+ -+/* -+ * Intel MP BIOS table parsing routines: -+ */ -+ -+ -+/* -+ * Checksum an MP configuration block. -+ */ -+ -+static int __init mpf_checksum(unsigned char *mp, int len) -+{ -+ int sum = 0; -+ -+ while (len--) -+ sum += *mp++; -+ -+ return sum & 0xFF; -+} -+ -+/* -+ * Have to match translation table entries to main table entries by counter -+ * hence the mpc_record variable .... can't see a less disgusting way of -+ * doing this .... -+ */ -+ -+static int mpc_record; -+static struct mpc_config_translation *translation_table[MAX_MPC_ENTRY] __initdata; -+ -+#ifndef CONFIG_XEN -+static void __devinit MP_processor_info (struct mpc_config_processor *m) -+{ -+ int ver, apicid; -+ physid_mask_t phys_cpu; -+ -+ if (!(m->mpc_cpuflag & CPU_ENABLED)) -+ return; -+ -+ apicid = mpc_apic_id(m, translation_table[mpc_record]); -+ -+ if (m->mpc_featureflag&(1<<0)) -+ Dprintk(" Floating point unit present.\n"); -+ if (m->mpc_featureflag&(1<<7)) -+ Dprintk(" Machine Exception supported.\n"); -+ if (m->mpc_featureflag&(1<<8)) -+ Dprintk(" 64 bit compare & exchange supported.\n"); -+ if (m->mpc_featureflag&(1<<9)) -+ Dprintk(" Internal APIC present.\n"); -+ if (m->mpc_featureflag&(1<<11)) -+ Dprintk(" SEP present.\n"); -+ if (m->mpc_featureflag&(1<<12)) -+ Dprintk(" MTRR present.\n"); -+ if (m->mpc_featureflag&(1<<13)) -+ Dprintk(" PGE present.\n"); -+ if (m->mpc_featureflag&(1<<14)) -+ Dprintk(" MCA present.\n"); -+ if (m->mpc_featureflag&(1<<15)) -+ Dprintk(" CMOV present.\n"); -+ if (m->mpc_featureflag&(1<<16)) -+ Dprintk(" PAT present.\n"); -+ if (m->mpc_featureflag&(1<<17)) -+ Dprintk(" PSE present.\n"); -+ if (m->mpc_featureflag&(1<<18)) -+ Dprintk(" PSN present.\n"); -+ if (m->mpc_featureflag&(1<<19)) -+ Dprintk(" Cache Line Flush Instruction present.\n"); -+ /* 20 Reserved */ -+ if (m->mpc_featureflag&(1<<21)) -+ Dprintk(" Debug Trace and EMON Store present.\n"); -+ if (m->mpc_featureflag&(1<<22)) -+ Dprintk(" ACPI Thermal Throttle Registers present.\n"); -+ if (m->mpc_featureflag&(1<<23)) -+ Dprintk(" MMX present.\n"); -+ if (m->mpc_featureflag&(1<<24)) -+ Dprintk(" FXSR present.\n"); -+ if (m->mpc_featureflag&(1<<25)) -+ Dprintk(" XMM present.\n"); -+ if (m->mpc_featureflag&(1<<26)) -+ Dprintk(" Willamette New Instructions present.\n"); -+ if (m->mpc_featureflag&(1<<27)) -+ Dprintk(" Self Snoop present.\n"); -+ if (m->mpc_featureflag&(1<<28)) -+ Dprintk(" HT present.\n"); -+ if (m->mpc_featureflag&(1<<29)) -+ Dprintk(" Thermal Monitor present.\n"); -+ /* 30, 31 Reserved */ -+ -+ -+ if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) { -+ Dprintk(" Bootup CPU\n"); -+ boot_cpu_physical_apicid = m->mpc_apicid; -+ } -+ -+ ver = m->mpc_apicver; -+ -+ /* -+ * Validate version -+ */ -+ if (ver == 0x0) { -+ printk(KERN_WARNING "BIOS bug, APIC version is 0 for CPU#%d! " -+ "fixing up to 0x10. (tell your hw vendor)\n", -+ m->mpc_apicid); -+ ver = 0x10; -+ } -+ apic_version[m->mpc_apicid] = ver; -+ -+ phys_cpu = apicid_to_cpu_present(apicid); -+ physids_or(phys_cpu_present_map, phys_cpu_present_map, phys_cpu); -+ -+ if (num_processors >= NR_CPUS) { -+ printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached." -+ " Processor ignored.\n", NR_CPUS); -+ return; -+ } -+ -+ if (num_processors >= maxcpus) { -+ printk(KERN_WARNING "WARNING: maxcpus limit of %i reached." -+ " Processor ignored.\n", maxcpus); -+ return; -+ } -+ -+ cpu_set(num_processors, cpu_possible_map); -+ num_processors++; -+ -+ /* -+ * Would be preferable to switch to bigsmp when CONFIG_HOTPLUG_CPU=y -+ * but we need to work other dependencies like SMP_SUSPEND etc -+ * before this can be done without some confusion. -+ * if (CPU_HOTPLUG_ENABLED || num_processors > 8) -+ * - Ashok Raj -+ */ -+ if (num_processors > 8) { -+ switch (boot_cpu_data.x86_vendor) { -+ case X86_VENDOR_INTEL: -+ if (!APIC_XAPIC(ver)) { -+ def_to_bigsmp = 0; -+ break; -+ } -+ /* If P4 and above fall through */ -+ case X86_VENDOR_AMD: -+ def_to_bigsmp = 1; -+ } -+ } -+ bios_cpu_apicid[num_processors - 1] = m->mpc_apicid; -+} -+#else -+void __init MP_processor_info (struct mpc_config_processor *m) -+{ -+ num_processors++; -+} -+#endif /* CONFIG_XEN */ -+ -+static void __init MP_bus_info (struct mpc_config_bus *m) -+{ -+ char str[7]; -+ -+ memcpy(str, m->mpc_bustype, 6); -+ str[6] = 0; -+ -+ mpc_oem_bus_info(m, str, translation_table[mpc_record]); -+ -+ if (m->mpc_busid >= MAX_MP_BUSSES) { -+ printk(KERN_WARNING "MP table busid value (%d) for bustype %s " -+ " is too large, max. supported is %d\n", -+ m->mpc_busid, str, MAX_MP_BUSSES - 1); -+ return; -+ } -+ -+ if (strncmp(str, BUSTYPE_ISA, sizeof(BUSTYPE_ISA)-1) == 0) { -+ mp_bus_id_to_type[m->mpc_busid] = MP_BUS_ISA; -+ } else if (strncmp(str, BUSTYPE_EISA, sizeof(BUSTYPE_EISA)-1) == 0) { -+ mp_bus_id_to_type[m->mpc_busid] = MP_BUS_EISA; -+ } else if (strncmp(str, BUSTYPE_PCI, sizeof(BUSTYPE_PCI)-1) == 0) { -+ mpc_oem_pci_bus(m, translation_table[mpc_record]); -+ mp_bus_id_to_type[m->mpc_busid] = MP_BUS_PCI; -+ mp_bus_id_to_pci_bus[m->mpc_busid] = mp_current_pci_id; -+ mp_current_pci_id++; -+ } else if (strncmp(str, BUSTYPE_MCA, sizeof(BUSTYPE_MCA)-1) == 0) { -+ mp_bus_id_to_type[m->mpc_busid] = MP_BUS_MCA; -+ } else if (strncmp(str, BUSTYPE_NEC98, sizeof(BUSTYPE_NEC98)-1) == 0) { -+ mp_bus_id_to_type[m->mpc_busid] = MP_BUS_NEC98; -+ } else { -+ printk(KERN_WARNING "Unknown bustype %s - ignoring\n", str); -+ } -+} -+ -+static void __init MP_ioapic_info (struct mpc_config_ioapic *m) -+{ -+ if (!(m->mpc_flags & MPC_APIC_USABLE)) -+ return; -+ -+ printk(KERN_INFO "I/O APIC #%d Version %d at 0x%lX.\n", -+ m->mpc_apicid, m->mpc_apicver, m->mpc_apicaddr); -+ if (nr_ioapics >= MAX_IO_APICS) { -+ printk(KERN_CRIT "Max # of I/O APICs (%d) exceeded (found %d).\n", -+ MAX_IO_APICS, nr_ioapics); -+ panic("Recompile kernel with bigger MAX_IO_APICS!.\n"); -+ } -+ if (!m->mpc_apicaddr) { -+ printk(KERN_ERR "WARNING: bogus zero I/O APIC address" -+ " found in MP table, skipping!\n"); -+ return; -+ } -+ mp_ioapics[nr_ioapics] = *m; -+ nr_ioapics++; -+} -+ -+static void __init MP_intsrc_info (struct mpc_config_intsrc *m) -+{ -+ mp_irqs [mp_irq_entries] = *m; -+ Dprintk("Int: type %d, pol %d, trig %d, bus %d," -+ " IRQ %02x, APIC ID %x, APIC INT %02x\n", -+ m->mpc_irqtype, m->mpc_irqflag & 3, -+ (m->mpc_irqflag >> 2) & 3, m->mpc_srcbus, -+ m->mpc_srcbusirq, m->mpc_dstapic, m->mpc_dstirq); -+ if (++mp_irq_entries == MAX_IRQ_SOURCES) -+ panic("Max # of irq sources exceeded!!\n"); -+} -+ -+static void __init MP_lintsrc_info (struct mpc_config_lintsrc *m) -+{ -+ Dprintk("Lint: type %d, pol %d, trig %d, bus %d," -+ " IRQ %02x, APIC ID %x, APIC LINT %02x\n", -+ m->mpc_irqtype, m->mpc_irqflag & 3, -+ (m->mpc_irqflag >> 2) &3, m->mpc_srcbusid, -+ m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint); -+ /* -+ * Well it seems all SMP boards in existence -+ * use ExtINT/LVT1 == LINT0 and -+ * NMI/LVT2 == LINT1 - the following check -+ * will show us if this assumptions is false. -+ * Until then we do not have to add baggage. -+ */ -+ if ((m->mpc_irqtype == mp_ExtINT) && -+ (m->mpc_destapiclint != 0)) -+ BUG(); -+ if ((m->mpc_irqtype == mp_NMI) && -+ (m->mpc_destapiclint != 1)) -+ BUG(); -+} -+ -+#ifdef CONFIG_X86_NUMAQ -+static void __init MP_translation_info (struct mpc_config_translation *m) -+{ -+ printk(KERN_INFO "Translation: record %d, type %d, quad %d, global %d, local %d\n", mpc_record, m->trans_type, m->trans_quad, m->trans_global, m->trans_local); -+ -+ if (mpc_record >= MAX_MPC_ENTRY) -+ printk(KERN_ERR "MAX_MPC_ENTRY exceeded!\n"); -+ else -+ translation_table[mpc_record] = m; /* stash this for later */ -+ if (m->trans_quad < MAX_NUMNODES && !node_online(m->trans_quad)) -+ node_set_online(m->trans_quad); -+} -+ -+/* -+ * Read/parse the MPC oem tables -+ */ -+ -+static void __init smp_read_mpc_oem(struct mp_config_oemtable *oemtable, \ -+ unsigned short oemsize) -+{ -+ int count = sizeof (*oemtable); /* the header size */ -+ unsigned char *oemptr = ((unsigned char *)oemtable)+count; -+ -+ mpc_record = 0; -+ printk(KERN_INFO "Found an OEM MPC table at %8p - parsing it ... \n", oemtable); -+ if (memcmp(oemtable->oem_signature,MPC_OEM_SIGNATURE,4)) -+ { -+ printk(KERN_WARNING "SMP mpc oemtable: bad signature [%c%c%c%c]!\n", -+ oemtable->oem_signature[0], -+ oemtable->oem_signature[1], -+ oemtable->oem_signature[2], -+ oemtable->oem_signature[3]); -+ return; -+ } -+ if (mpf_checksum((unsigned char *)oemtable,oemtable->oem_length)) -+ { -+ printk(KERN_WARNING "SMP oem mptable: checksum error!\n"); -+ return; -+ } -+ while (count < oemtable->oem_length) { -+ switch (*oemptr) { -+ case MP_TRANSLATION: -+ { -+ struct mpc_config_translation *m= -+ (struct mpc_config_translation *)oemptr; -+ MP_translation_info(m); -+ oemptr += sizeof(*m); -+ count += sizeof(*m); -+ ++mpc_record; -+ break; -+ } -+ default: -+ { -+ printk(KERN_WARNING "Unrecognised OEM table entry type! - %d\n", (int) *oemptr); -+ return; -+ } -+ } -+ } -+} -+ -+static inline void mps_oem_check(struct mp_config_table *mpc, char *oem, -+ char *productid) -+{ -+ if (strncmp(oem, "IBM NUMA", 8)) -+ printk("Warning! May not be a NUMA-Q system!\n"); -+ if (mpc->mpc_oemptr) -+ smp_read_mpc_oem((struct mp_config_oemtable *) mpc->mpc_oemptr, -+ mpc->mpc_oemsize); -+} -+#endif /* CONFIG_X86_NUMAQ */ -+ -+/* -+ * Read/parse the MPC -+ */ -+ -+static int __init smp_read_mpc(struct mp_config_table *mpc) -+{ -+ char str[16]; -+ char oem[10]; -+ int count=sizeof(*mpc); -+ unsigned char *mpt=((unsigned char *)mpc)+count; -+ -+ if (memcmp(mpc->mpc_signature,MPC_SIGNATURE,4)) { -+ printk(KERN_ERR "SMP mptable: bad signature [0x%x]!\n", -+ *(u32 *)mpc->mpc_signature); -+ return 0; -+ } -+ if (mpf_checksum((unsigned char *)mpc,mpc->mpc_length)) { -+ printk(KERN_ERR "SMP mptable: checksum error!\n"); -+ return 0; -+ } -+ if (mpc->mpc_spec!=0x01 && mpc->mpc_spec!=0x04) { -+ printk(KERN_ERR "SMP mptable: bad table version (%d)!!\n", -+ mpc->mpc_spec); -+ return 0; -+ } -+ if (!mpc->mpc_lapic) { -+ printk(KERN_ERR "SMP mptable: null local APIC address!\n"); -+ return 0; -+ } -+ memcpy(oem,mpc->mpc_oem,8); -+ oem[8]=0; -+ printk(KERN_INFO "OEM ID: %s ",oem); -+ -+ memcpy(str,mpc->mpc_productid,12); -+ str[12]=0; -+ printk("Product ID: %s ",str); -+ -+ mps_oem_check(mpc, oem, str); -+ -+ printk("APIC at: 0x%lX\n",mpc->mpc_lapic); -+ -+ /* -+ * Save the local APIC address (it might be non-default) -- but only -+ * if we're not using ACPI. -+ */ -+ if (!acpi_lapic) -+ mp_lapic_addr = mpc->mpc_lapic; -+ -+ /* -+ * Now process the configuration blocks. -+ */ -+ mpc_record = 0; -+ while (count < mpc->mpc_length) { -+ switch(*mpt) { -+ case MP_PROCESSOR: -+ { -+ struct mpc_config_processor *m= -+ (struct mpc_config_processor *)mpt; -+ /* ACPI may have already provided this data */ -+ if (!acpi_lapic) -+ MP_processor_info(m); -+ mpt += sizeof(*m); -+ count += sizeof(*m); -+ break; -+ } -+ case MP_BUS: -+ { -+ struct mpc_config_bus *m= -+ (struct mpc_config_bus *)mpt; -+ MP_bus_info(m); -+ mpt += sizeof(*m); -+ count += sizeof(*m); -+ break; -+ } -+ case MP_IOAPIC: -+ { -+ struct mpc_config_ioapic *m= -+ (struct mpc_config_ioapic *)mpt; -+ MP_ioapic_info(m); -+ mpt+=sizeof(*m); -+ count+=sizeof(*m); -+ break; -+ } -+ case MP_INTSRC: -+ { -+ struct mpc_config_intsrc *m= -+ (struct mpc_config_intsrc *)mpt; -+ -+ MP_intsrc_info(m); -+ mpt+=sizeof(*m); -+ count+=sizeof(*m); -+ break; -+ } -+ case MP_LINTSRC: -+ { -+ struct mpc_config_lintsrc *m= -+ (struct mpc_config_lintsrc *)mpt; -+ MP_lintsrc_info(m); -+ mpt+=sizeof(*m); -+ count+=sizeof(*m); -+ break; -+ } -+ default: -+ { -+ count = mpc->mpc_length; -+ break; -+ } -+ } -+ ++mpc_record; -+ } -+ clustered_apic_check(); -+ if (!num_processors) -+ printk(KERN_ERR "SMP mptable: no processors registered!\n"); -+ return num_processors; -+} -+ -+static int __init ELCR_trigger(unsigned int irq) -+{ -+ unsigned int port; -+ -+ port = 0x4d0 + (irq >> 3); -+ return (inb(port) >> (irq & 7)) & 1; -+} -+ -+static void __init construct_default_ioirq_mptable(int mpc_default_type) -+{ -+ struct mpc_config_intsrc intsrc; -+ int i; -+ int ELCR_fallback = 0; -+ -+ intsrc.mpc_type = MP_INTSRC; -+ intsrc.mpc_irqflag = 0; /* conforming */ -+ intsrc.mpc_srcbus = 0; -+ intsrc.mpc_dstapic = mp_ioapics[0].mpc_apicid; -+ -+ intsrc.mpc_irqtype = mp_INT; -+ -+ /* -+ * If true, we have an ISA/PCI system with no IRQ entries -+ * in the MP table. To prevent the PCI interrupts from being set up -+ * incorrectly, we try to use the ELCR. The sanity check to see if -+ * there is good ELCR data is very simple - IRQ0, 1, 2 and 13 can -+ * never be level sensitive, so we simply see if the ELCR agrees. -+ * If it does, we assume it's valid. -+ */ -+ if (mpc_default_type == 5) { -+ printk(KERN_INFO "ISA/PCI bus type with no IRQ information... falling back to ELCR\n"); -+ -+ if (ELCR_trigger(0) || ELCR_trigger(1) || ELCR_trigger(2) || ELCR_trigger(13)) -+ printk(KERN_WARNING "ELCR contains invalid data... not using ELCR\n"); -+ else { -+ printk(KERN_INFO "Using ELCR to identify PCI interrupts\n"); -+ ELCR_fallback = 1; -+ } -+ } -+ -+ for (i = 0; i < 16; i++) { -+ switch (mpc_default_type) { -+ case 2: -+ if (i == 0 || i == 13) -+ continue; /* IRQ0 & IRQ13 not connected */ -+ /* fall through */ -+ default: -+ if (i == 2) -+ continue; /* IRQ2 is never connected */ -+ } -+ -+ if (ELCR_fallback) { -+ /* -+ * If the ELCR indicates a level-sensitive interrupt, we -+ * copy that information over to the MP table in the -+ * irqflag field (level sensitive, active high polarity). -+ */ -+ if (ELCR_trigger(i)) -+ intsrc.mpc_irqflag = 13; -+ else -+ intsrc.mpc_irqflag = 0; -+ } -+ -+ intsrc.mpc_srcbusirq = i; -+ intsrc.mpc_dstirq = i ? i : 2; /* IRQ0 to INTIN2 */ -+ MP_intsrc_info(&intsrc); -+ } -+ -+ intsrc.mpc_irqtype = mp_ExtINT; -+ intsrc.mpc_srcbusirq = 0; -+ intsrc.mpc_dstirq = 0; /* 8259A to INTIN0 */ -+ MP_intsrc_info(&intsrc); -+} -+ -+static inline void __init construct_default_ISA_mptable(int mpc_default_type) -+{ -+ struct mpc_config_processor processor; -+ struct mpc_config_bus bus; -+ struct mpc_config_ioapic ioapic; -+ struct mpc_config_lintsrc lintsrc; -+ int linttypes[2] = { mp_ExtINT, mp_NMI }; -+ int i; -+ -+ /* -+ * local APIC has default address -+ */ -+ mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; -+ -+ /* -+ * 2 CPUs, numbered 0 & 1. -+ */ -+ processor.mpc_type = MP_PROCESSOR; -+ /* Either an integrated APIC or a discrete 82489DX. */ -+ processor.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01; -+ processor.mpc_cpuflag = CPU_ENABLED; -+ processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) | -+ (boot_cpu_data.x86_model << 4) | -+ boot_cpu_data.x86_mask; -+ processor.mpc_featureflag = boot_cpu_data.x86_capability[0]; -+ processor.mpc_reserved[0] = 0; -+ processor.mpc_reserved[1] = 0; -+ for (i = 0; i < 2; i++) { -+ processor.mpc_apicid = i; -+ MP_processor_info(&processor); -+ } -+ -+ bus.mpc_type = MP_BUS; -+ bus.mpc_busid = 0; -+ switch (mpc_default_type) { -+ default: -+ printk("???\n"); -+ printk(KERN_ERR "Unknown standard configuration %d\n", -+ mpc_default_type); -+ /* fall through */ -+ case 1: -+ case 5: -+ memcpy(bus.mpc_bustype, "ISA ", 6); -+ break; -+ case 2: -+ case 6: -+ case 3: -+ memcpy(bus.mpc_bustype, "EISA ", 6); -+ break; -+ case 4: -+ case 7: -+ memcpy(bus.mpc_bustype, "MCA ", 6); -+ } -+ MP_bus_info(&bus); -+ if (mpc_default_type > 4) { -+ bus.mpc_busid = 1; -+ memcpy(bus.mpc_bustype, "PCI ", 6); -+ MP_bus_info(&bus); -+ } -+ -+ ioapic.mpc_type = MP_IOAPIC; -+ ioapic.mpc_apicid = 2; -+ ioapic.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01; -+ ioapic.mpc_flags = MPC_APIC_USABLE; -+ ioapic.mpc_apicaddr = 0xFEC00000; -+ MP_ioapic_info(&ioapic); -+ -+ /* -+ * We set up most of the low 16 IO-APIC pins according to MPS rules. -+ */ -+ construct_default_ioirq_mptable(mpc_default_type); -+ -+ lintsrc.mpc_type = MP_LINTSRC; -+ lintsrc.mpc_irqflag = 0; /* conforming */ -+ lintsrc.mpc_srcbusid = 0; -+ lintsrc.mpc_srcbusirq = 0; -+ lintsrc.mpc_destapic = MP_APIC_ALL; -+ for (i = 0; i < 2; i++) { -+ lintsrc.mpc_irqtype = linttypes[i]; -+ lintsrc.mpc_destapiclint = i; -+ MP_lintsrc_info(&lintsrc); -+ } -+} -+ -+static struct intel_mp_floating *mpf_found; -+ -+/* -+ * Scan the memory blocks for an SMP configuration block. -+ */ -+void __init get_smp_config (void) -+{ -+ struct intel_mp_floating *mpf = mpf_found; -+ -+ /* -+ * ACPI supports both logical (e.g. Hyper-Threading) and physical -+ * processors, where MPS only supports physical. -+ */ -+ if (acpi_lapic && acpi_ioapic) { -+ printk(KERN_INFO "Using ACPI (MADT) for SMP configuration information\n"); -+ return; -+ } -+ else if (acpi_lapic) -+ printk(KERN_INFO "Using ACPI for processor (LAPIC) configuration information\n"); -+ -+ printk(KERN_INFO "Intel MultiProcessor Specification v1.%d\n", mpf->mpf_specification); -+ if (mpf->mpf_feature2 & (1<<7)) { -+ printk(KERN_INFO " IMCR and PIC compatibility mode.\n"); -+ pic_mode = 1; -+ } else { -+ printk(KERN_INFO " Virtual Wire compatibility mode.\n"); -+ pic_mode = 0; -+ } -+ -+ /* -+ * Now see if we need to read further. -+ */ -+ if (mpf->mpf_feature1 != 0) { -+ -+ printk(KERN_INFO "Default MP configuration #%d\n", mpf->mpf_feature1); -+ construct_default_ISA_mptable(mpf->mpf_feature1); -+ -+ } else if (mpf->mpf_physptr) { -+ -+ /* -+ * Read the physical hardware table. Anything here will -+ * override the defaults. -+ */ -+ if (!smp_read_mpc(isa_bus_to_virt(mpf->mpf_physptr))) { -+ smp_found_config = 0; -+ printk(KERN_ERR "BIOS bug, MP table errors detected!...\n"); -+ printk(KERN_ERR "... disabling SMP support. (tell your hw vendor)\n"); -+ return; -+ } -+ /* -+ * If there are no explicit MP IRQ entries, then we are -+ * broken. We set up most of the low 16 IO-APIC pins to -+ * ISA defaults and hope it will work. -+ */ -+ if (!mp_irq_entries) { -+ struct mpc_config_bus bus; -+ -+ printk(KERN_ERR "BIOS bug, no explicit IRQ entries, using default mptable. (tell your hw vendor)\n"); -+ -+ bus.mpc_type = MP_BUS; -+ bus.mpc_busid = 0; -+ memcpy(bus.mpc_bustype, "ISA ", 6); -+ MP_bus_info(&bus); -+ -+ construct_default_ioirq_mptable(0); -+ } -+ -+ } else -+ BUG(); -+ -+ printk(KERN_INFO "Processors: %d\n", num_processors); -+ /* -+ * Only use the first configuration found. -+ */ -+} -+ -+static int __init smp_scan_config (unsigned long base, unsigned long length) -+{ -+ unsigned long *bp = isa_bus_to_virt(base); -+ struct intel_mp_floating *mpf; -+ -+ Dprintk("Scan SMP from %p for %ld bytes.\n", bp,length); -+ if (sizeof(*mpf) != 16) -+ printk("Error: MPF size\n"); -+ -+ while (length > 0) { -+ mpf = (struct intel_mp_floating *)bp; -+ if ((*bp == SMP_MAGIC_IDENT) && -+ (mpf->mpf_length == 1) && -+ !mpf_checksum((unsigned char *)bp, 16) && -+ ((mpf->mpf_specification == 1) -+ || (mpf->mpf_specification == 4)) ) { -+ -+ smp_found_config = 1; -+#ifndef CONFIG_XEN -+ printk(KERN_INFO "found SMP MP-table at %08lx\n", -+ virt_to_phys(mpf)); -+ reserve_bootmem(virt_to_phys(mpf), PAGE_SIZE); -+ if (mpf->mpf_physptr) { -+ /* -+ * We cannot access to MPC table to compute -+ * table size yet, as only few megabytes from -+ * the bottom is mapped now. -+ * PC-9800's MPC table places on the very last -+ * of physical memory; so that simply reserving -+ * PAGE_SIZE from mpg->mpf_physptr yields BUG() -+ * in reserve_bootmem. -+ */ -+ unsigned long size = PAGE_SIZE; -+ unsigned long end = max_low_pfn * PAGE_SIZE; -+ if (mpf->mpf_physptr + size > end) -+ size = end - mpf->mpf_physptr; -+ reserve_bootmem(mpf->mpf_physptr, size); -+ } -+#else -+ printk(KERN_INFO "found SMP MP-table at %08lx\n", -+ ((unsigned long)bp - (unsigned long)isa_bus_to_virt(base)) + base); -+#endif -+ -+ mpf_found = mpf; -+ return 1; -+ } -+ bp += 4; -+ length -= 16; -+ } -+ return 0; -+} -+ -+void __init find_smp_config (void) -+{ -+#ifndef CONFIG_XEN -+ unsigned int address; -+#endif -+ -+ /* -+ * FIXME: Linux assumes you have 640K of base ram.. -+ * this continues the error... -+ * -+ * 1) Scan the bottom 1K for a signature -+ * 2) Scan the top 1K of base RAM -+ * 3) Scan the 64K of bios -+ */ -+ if (smp_scan_config(0x0,0x400) || -+ smp_scan_config(639*0x400,0x400) || -+ smp_scan_config(0xF0000,0x10000)) -+ return; -+ /* -+ * If it is an SMP machine we should know now, unless the -+ * configuration is in an EISA/MCA bus machine with an -+ * extended bios data area. -+ * -+ * there is a real-mode segmented pointer pointing to the -+ * 4K EBDA area at 0x40E, calculate and scan it here. -+ * -+ * NOTE! There are Linux loaders that will corrupt the EBDA -+ * area, and as such this kind of SMP config may be less -+ * trustworthy, simply because the SMP table may have been -+ * stomped on during early boot. These loaders are buggy and -+ * should be fixed. -+ * -+ * MP1.4 SPEC states to only scan first 1K of 4K EBDA. -+ */ -+ -+#ifndef CONFIG_XEN -+ address = get_bios_ebda(); -+ if (address) -+ smp_scan_config(address, 0x400); -+#endif -+} -+ -+int es7000_plat; -+ -+/* -------------------------------------------------------------------------- -+ ACPI-based MP Configuration -+ -------------------------------------------------------------------------- */ -+ -+#ifdef CONFIG_ACPI -+ -+void __init mp_register_lapic_address ( -+ u64 address) -+{ -+#ifndef CONFIG_XEN -+ mp_lapic_addr = (unsigned long) address; -+ -+ set_fixmap_nocache(FIX_APIC_BASE, mp_lapic_addr); -+ -+ if (boot_cpu_physical_apicid == -1U) -+ boot_cpu_physical_apicid = GET_APIC_ID(apic_read(APIC_ID)); -+ -+ Dprintk("Boot CPU = %d\n", boot_cpu_physical_apicid); -+#endif -+} -+ -+ -+void __devinit mp_register_lapic ( -+ u8 id, -+ u8 enabled) -+{ -+ struct mpc_config_processor processor; -+ int boot_cpu = 0; -+ -+ if (MAX_APICS - id <= 0) { -+ printk(KERN_WARNING "Processor #%d invalid (max %d)\n", -+ id, MAX_APICS); -+ return; -+ } -+ -+ if (id == boot_cpu_physical_apicid) -+ boot_cpu = 1; -+ -+#ifndef CONFIG_XEN -+ processor.mpc_type = MP_PROCESSOR; -+ processor.mpc_apicid = id; -+ processor.mpc_apicver = GET_APIC_VERSION(apic_read(APIC_LVR)); -+ processor.mpc_cpuflag = (enabled ? CPU_ENABLED : 0); -+ processor.mpc_cpuflag |= (boot_cpu ? CPU_BOOTPROCESSOR : 0); -+ processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) | -+ (boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_mask; -+ processor.mpc_featureflag = boot_cpu_data.x86_capability[0]; -+ processor.mpc_reserved[0] = 0; -+ processor.mpc_reserved[1] = 0; -+#endif -+ -+ MP_processor_info(&processor); -+} -+ -+#ifdef CONFIG_X86_IO_APIC -+ -+#define MP_ISA_BUS 0 -+#define MP_MAX_IOAPIC_PIN 127 -+ -+static struct mp_ioapic_routing { -+ int apic_id; -+ int gsi_base; -+ int gsi_end; -+ u32 pin_programmed[4]; -+} mp_ioapic_routing[MAX_IO_APICS]; -+ -+ -+static int mp_find_ioapic ( -+ int gsi) -+{ -+ int i = 0; -+ -+ /* Find the IOAPIC that manages this GSI. */ -+ for (i = 0; i < nr_ioapics; i++) { -+ if ((gsi >= mp_ioapic_routing[i].gsi_base) -+ && (gsi <= mp_ioapic_routing[i].gsi_end)) -+ return i; -+ } -+ -+ printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi); -+ -+ return -1; -+} -+ -+ -+void __init mp_register_ioapic ( -+ u8 id, -+ u32 address, -+ u32 gsi_base) -+{ -+ int idx = 0; -+ int tmpid; -+ -+ if (nr_ioapics >= MAX_IO_APICS) { -+ printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded " -+ "(found %d)\n", MAX_IO_APICS, nr_ioapics); -+ panic("Recompile kernel with bigger MAX_IO_APICS!\n"); -+ } -+ if (!address) { -+ printk(KERN_ERR "WARNING: Bogus (zero) I/O APIC address" -+ " found in MADT table, skipping!\n"); -+ return; -+ } -+ -+ idx = nr_ioapics++; -+ -+ mp_ioapics[idx].mpc_type = MP_IOAPIC; -+ mp_ioapics[idx].mpc_flags = MPC_APIC_USABLE; -+ mp_ioapics[idx].mpc_apicaddr = address; -+ -+#ifndef CONFIG_XEN -+ set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address); -+#endif -+ if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) -+ && !APIC_XAPIC(apic_version[boot_cpu_physical_apicid])) -+ tmpid = io_apic_get_unique_id(idx, id); -+ else -+ tmpid = id; -+ if (tmpid == -1) { -+ nr_ioapics--; -+ return; -+ } -+ mp_ioapics[idx].mpc_apicid = tmpid; -+ mp_ioapics[idx].mpc_apicver = io_apic_get_version(idx); -+ -+ /* -+ * Build basic GSI lookup table to facilitate gsi->io_apic lookups -+ * and to prevent reprogramming of IOAPIC pins (PCI GSIs). -+ */ -+ mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].mpc_apicid; -+ mp_ioapic_routing[idx].gsi_base = gsi_base; -+ mp_ioapic_routing[idx].gsi_end = gsi_base + -+ io_apic_get_redir_entries(idx); -+ -+ printk("IOAPIC[%d]: apic_id %d, version %d, address 0x%lx, " -+ "GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid, -+ mp_ioapics[idx].mpc_apicver, mp_ioapics[idx].mpc_apicaddr, -+ mp_ioapic_routing[idx].gsi_base, -+ mp_ioapic_routing[idx].gsi_end); -+ -+ return; -+} -+ -+ -+void __init mp_override_legacy_irq ( -+ u8 bus_irq, -+ u8 polarity, -+ u8 trigger, -+ u32 gsi) -+{ -+ struct mpc_config_intsrc intsrc; -+ int ioapic = -1; -+ int pin = -1; -+ -+ /* -+ * Convert 'gsi' to 'ioapic.pin'. -+ */ -+ ioapic = mp_find_ioapic(gsi); -+ if (ioapic < 0) -+ return; -+ pin = gsi - mp_ioapic_routing[ioapic].gsi_base; -+ -+ /* -+ * TBD: This check is for faulty timer entries, where the override -+ * erroneously sets the trigger to level, resulting in a HUGE -+ * increase of timer interrupts! -+ */ -+ if ((bus_irq == 0) && (trigger == 3)) -+ trigger = 1; -+ -+ intsrc.mpc_type = MP_INTSRC; -+ intsrc.mpc_irqtype = mp_INT; -+ intsrc.mpc_irqflag = (trigger << 2) | polarity; -+ intsrc.mpc_srcbus = MP_ISA_BUS; -+ intsrc.mpc_srcbusirq = bus_irq; /* IRQ */ -+ intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid; /* APIC ID */ -+ intsrc.mpc_dstirq = pin; /* INTIN# */ -+ -+ Dprintk("Int: type %d, pol %d, trig %d, bus %d, irq %d, %d-%d\n", -+ intsrc.mpc_irqtype, intsrc.mpc_irqflag & 3, -+ (intsrc.mpc_irqflag >> 2) & 3, intsrc.mpc_srcbus, -+ intsrc.mpc_srcbusirq, intsrc.mpc_dstapic, intsrc.mpc_dstirq); -+ -+ mp_irqs[mp_irq_entries] = intsrc; -+ if (++mp_irq_entries == MAX_IRQ_SOURCES) -+ panic("Max # of irq sources exceeded!\n"); -+ -+ return; -+} -+ -+void __init mp_config_acpi_legacy_irqs (void) -+{ -+ struct mpc_config_intsrc intsrc; -+ int i = 0; -+ int ioapic = -1; -+ -+ /* -+ * Fabricate the legacy ISA bus (bus #31). -+ */ -+ mp_bus_id_to_type[MP_ISA_BUS] = MP_BUS_ISA; -+ Dprintk("Bus #%d is ISA\n", MP_ISA_BUS); -+ -+ /* -+ * Older generations of ES7000 have no legacy identity mappings -+ */ -+ if (es7000_plat == 1) -+ return; -+ -+ /* -+ * Locate the IOAPIC that manages the ISA IRQs (0-15). -+ */ -+ ioapic = mp_find_ioapic(0); -+ if (ioapic < 0) -+ return; -+ -+ intsrc.mpc_type = MP_INTSRC; -+ intsrc.mpc_irqflag = 0; /* Conforming */ -+ intsrc.mpc_srcbus = MP_ISA_BUS; -+ intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid; -+ -+ /* -+ * Use the default configuration for the IRQs 0-15. Unless -+ * overriden by (MADT) interrupt source override entries. -+ */ -+ for (i = 0; i < 16; i++) { -+ int idx; -+ -+ for (idx = 0; idx < mp_irq_entries; idx++) { -+ struct mpc_config_intsrc *irq = mp_irqs + idx; -+ -+ /* Do we already have a mapping for this ISA IRQ? */ -+ if (irq->mpc_srcbus == MP_ISA_BUS && irq->mpc_srcbusirq == i) -+ break; -+ -+ /* Do we already have a mapping for this IOAPIC pin */ -+ if ((irq->mpc_dstapic == intsrc.mpc_dstapic) && -+ (irq->mpc_dstirq == i)) -+ break; -+ } -+ -+ if (idx != mp_irq_entries) { -+ printk(KERN_DEBUG "ACPI: IRQ%d used by override.\n", i); -+ continue; /* IRQ already used */ -+ } -+ -+ intsrc.mpc_irqtype = mp_INT; -+ intsrc.mpc_srcbusirq = i; /* Identity mapped */ -+ intsrc.mpc_dstirq = i; -+ -+ Dprintk("Int: type %d, pol %d, trig %d, bus %d, irq %d, " -+ "%d-%d\n", intsrc.mpc_irqtype, intsrc.mpc_irqflag & 3, -+ (intsrc.mpc_irqflag >> 2) & 3, intsrc.mpc_srcbus, -+ intsrc.mpc_srcbusirq, intsrc.mpc_dstapic, -+ intsrc.mpc_dstirq); -+ -+ mp_irqs[mp_irq_entries] = intsrc; -+ if (++mp_irq_entries == MAX_IRQ_SOURCES) -+ panic("Max # of irq sources exceeded!\n"); -+ } -+} -+ -+#define MAX_GSI_NUM 4096 -+ -+int mp_register_gsi (u32 gsi, int triggering, int polarity) -+{ -+ int ioapic = -1; -+ int ioapic_pin = 0; -+ int idx, bit = 0; -+ static int pci_irq = 16; -+ /* -+ * Mapping between Global System Interrups, which -+ * represent all possible interrupts, and IRQs -+ * assigned to actual devices. -+ */ -+ static int gsi_to_irq[MAX_GSI_NUM]; -+ -+ /* Don't set up the ACPI SCI because it's already set up */ -+ if (acpi_fadt.sci_int == gsi) -+ return gsi; -+ -+ ioapic = mp_find_ioapic(gsi); -+ if (ioapic < 0) { -+ printk(KERN_WARNING "No IOAPIC for GSI %u\n", gsi); -+ return gsi; -+ } -+ -+ ioapic_pin = gsi - mp_ioapic_routing[ioapic].gsi_base; -+ -+ if (ioapic_renumber_irq) -+ gsi = ioapic_renumber_irq(ioapic, gsi); -+ -+ /* -+ * Avoid pin reprogramming. PRTs typically include entries -+ * with redundant pin->gsi mappings (but unique PCI devices); -+ * we only program the IOAPIC on the first. -+ */ -+ bit = ioapic_pin % 32; -+ idx = (ioapic_pin < 32) ? 0 : (ioapic_pin / 32); -+ if (idx > 3) { -+ printk(KERN_ERR "Invalid reference to IOAPIC pin " -+ "%d-%d\n", mp_ioapic_routing[ioapic].apic_id, -+ ioapic_pin); -+ return gsi; -+ } -+ if ((1< 15), but -+ * avoid a problem where the 8254 timer (IRQ0) is setup -+ * via an override (so it's not on pin 0 of the ioapic), -+ * and at the same time, the pin 0 interrupt is a PCI -+ * type. The gsi > 15 test could cause these two pins -+ * to be shared as IRQ0, and they are not shareable. -+ * So test for this condition, and if necessary, avoid -+ * the pin collision. -+ */ -+ if (gsi > 15 || (gsi == 0 && !timer_uses_ioapic_pin_0)) -+ gsi = pci_irq++; -+ /* -+ * Don't assign IRQ used by ACPI SCI -+ */ -+ if (gsi == acpi_fadt.sci_int) -+ gsi = pci_irq++; -+ gsi_to_irq[irq] = gsi; -+ } else { -+ printk(KERN_ERR "GSI %u is too high\n", gsi); -+ return gsi; -+ } -+ } -+ -+ io_apic_set_pci_routing(ioapic, ioapic_pin, gsi, -+ triggering == ACPI_EDGE_SENSITIVE ? 0 : 1, -+ polarity == ACPI_ACTIVE_HIGH ? 0 : 1); -+ return gsi; -+} -+ -+#endif /* CONFIG_X86_IO_APIC */ -+#endif /* CONFIG_ACPI */ -diff -rpuN linux-2.6.18.8/arch/i386/kernel/pci-dma-xen.c linux-2.6.18-xen-3.2.0/arch/i386/kernel/pci-dma-xen.c ---- linux-2.6.18.8/arch/i386/kernel/pci-dma-xen.c 1969-12-31 16:00:00.000000000 -0800 -+++ linux-2.6.18-xen-3.2.0/arch/i386/kernel/pci-dma-xen.c 2008-02-15 16:21:49.000000000 -0800 -@@ -0,0 +1,382 @@ -+/* -+ * Dynamic DMA mapping support. -+ * -+ * On i386 there is no hardware dynamic DMA address translation, -+ * so consistent alloc/free are merely page allocation/freeing. -+ * The rest of the dynamic DMA mapping interface is implemented -+ * in asm/pci.h. -+ */ -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#ifdef __x86_64__ -+#include -+ -+int iommu_merge __read_mostly = 0; -+EXPORT_SYMBOL(iommu_merge); -+ -+dma_addr_t bad_dma_address __read_mostly; -+EXPORT_SYMBOL(bad_dma_address); -+ -+/* This tells the BIO block layer to assume merging. Default to off -+ because we cannot guarantee merging later. */ -+int iommu_bio_merge __read_mostly = 0; -+EXPORT_SYMBOL(iommu_bio_merge); -+ -+int force_iommu __read_mostly= 0; -+ -+__init int iommu_setup(char *p) -+{ -+ return 1; -+} -+ -+void __init pci_iommu_alloc(void) -+{ -+#ifdef CONFIG_SWIOTLB -+ pci_swiotlb_init(); -+#endif -+} -+ -+static int __init pci_iommu_init(void) -+{ -+ no_iommu_init(); -+ return 0; -+} -+ -+/* Must execute after PCI subsystem */ -+fs_initcall(pci_iommu_init); -+#endif -+ -+struct dma_coherent_mem { -+ void *virt_base; -+ u32 device_base; -+ int size; -+ int flags; -+ unsigned long *bitmap; -+}; -+ -+#define IOMMU_BUG_ON(test) \ -+do { \ -+ if (unlikely(test)) { \ -+ printk(KERN_ALERT "Fatal DMA error! " \ -+ "Please use 'swiotlb=force'\n"); \ -+ BUG(); \ -+ } \ -+} while (0) -+ -+int -+dma_map_sg(struct device *hwdev, struct scatterlist *sg, int nents, -+ enum dma_data_direction direction) -+{ -+ int i, rc; -+ -+ if (direction == DMA_NONE) -+ BUG(); -+ WARN_ON(nents == 0 || sg[0].length == 0); -+ -+ if (swiotlb) { -+ rc = swiotlb_map_sg(hwdev, sg, nents, direction); -+ } else { -+ for (i = 0; i < nents; i++ ) { -+ BUG_ON(!sg[i].page); -+ sg[i].dma_address = -+ gnttab_dma_map_page(sg[i].page) + sg[i].offset; -+ sg[i].dma_length = sg[i].length; -+ IOMMU_BUG_ON(address_needs_mapping( -+ hwdev, sg[i].dma_address)); -+ IOMMU_BUG_ON(range_straddles_page_boundary( -+ page_to_pseudophys(sg[i].page) + sg[i].offset, -+ sg[i].length)); -+ } -+ rc = nents; -+ } -+ -+ flush_write_buffers(); -+ return rc; -+} -+EXPORT_SYMBOL(dma_map_sg); -+ -+void -+dma_unmap_sg(struct device *hwdev, struct scatterlist *sg, int nents, -+ enum dma_data_direction direction) -+{ -+ int i; -+ -+ BUG_ON(direction == DMA_NONE); -+ if (swiotlb) -+ swiotlb_unmap_sg(hwdev, sg, nents, direction); -+ else { -+ for (i = 0; i < nents; i++ ) -+ gnttab_dma_unmap_page(sg[i].dma_address); -+ } -+} -+EXPORT_SYMBOL(dma_unmap_sg); -+ -+#ifdef CONFIG_HIGHMEM -+dma_addr_t -+dma_map_page(struct device *dev, struct page *page, unsigned long offset, -+ size_t size, enum dma_data_direction direction) -+{ -+ dma_addr_t dma_addr; -+ -+ BUG_ON(direction == DMA_NONE); -+ -+ if (swiotlb) { -+ dma_addr = swiotlb_map_page( -+ dev, page, offset, size, direction); -+ } else { -+ dma_addr = gnttab_dma_map_page(page) + offset; -+ IOMMU_BUG_ON(address_needs_mapping(dev, dma_addr)); -+ } -+ -+ return dma_addr; -+} -+EXPORT_SYMBOL(dma_map_page); -+ -+void -+dma_unmap_page(struct device *dev, dma_addr_t dma_address, size_t size, -+ enum dma_data_direction direction) -+{ -+ BUG_ON(direction == DMA_NONE); -+ if (swiotlb) -+ swiotlb_unmap_page(dev, dma_address, size, direction); -+ else -+ gnttab_dma_unmap_page(dma_address); -+} -+EXPORT_SYMBOL(dma_unmap_page); -+#endif /* CONFIG_HIGHMEM */ -+ -+int -+dma_mapping_error(dma_addr_t dma_addr) -+{ -+ if (swiotlb) -+ return swiotlb_dma_mapping_error(dma_addr); -+ return 0; -+} -+EXPORT_SYMBOL(dma_mapping_error); -+ -+int -+dma_supported(struct device *dev, u64 mask) -+{ -+ if (swiotlb) -+ return swiotlb_dma_supported(dev, mask); -+ /* -+ * By default we'll BUG when an infeasible DMA is requested, and -+ * request swiotlb=force (see IOMMU_BUG_ON). -+ */ -+ return 1; -+} -+EXPORT_SYMBOL(dma_supported); -+ -+void *dma_alloc_coherent(struct device *dev, size_t size, -+ dma_addr_t *dma_handle, gfp_t gfp) -+{ -+ void *ret; -+ struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL; -+ unsigned int order = get_order(size); -+ unsigned long vstart; -+ u64 mask; -+ -+ /* ignore region specifiers */ -+ gfp &= ~(__GFP_DMA | __GFP_HIGHMEM); -+ -+ if (mem) { -+ int page = bitmap_find_free_region(mem->bitmap, mem->size, -+ order); -+ if (page >= 0) { -+ *dma_handle = mem->device_base + (page << PAGE_SHIFT); -+ ret = mem->virt_base + (page << PAGE_SHIFT); -+ memset(ret, 0, size); -+ return ret; -+ } -+ if (mem->flags & DMA_MEMORY_EXCLUSIVE) -+ return NULL; -+ } -+ -+ if (dev == NULL || (dev->coherent_dma_mask < 0xffffffff)) -+ gfp |= GFP_DMA; -+ -+ vstart = __get_free_pages(gfp, order); -+ ret = (void *)vstart; -+ -+ if (dev != NULL && dev->coherent_dma_mask) -+ mask = dev->coherent_dma_mask; -+ else -+ mask = 0xffffffff; -+ -+ if (ret != NULL) { -+ if (xen_create_contiguous_region(vstart, order, -+ fls64(mask)) != 0) { -+ free_pages(vstart, order); -+ return NULL; -+ } -+ memset(ret, 0, size); -+ *dma_handle = virt_to_bus(ret); -+ } -+ return ret; -+} -+EXPORT_SYMBOL(dma_alloc_coherent); -+ -+void dma_free_coherent(struct device *dev, size_t size, -+ void *vaddr, dma_addr_t dma_handle) -+{ -+ struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL; -+ int order = get_order(size); -+ -+ if (mem && vaddr >= mem->virt_base && vaddr < (mem->virt_base + (mem->size << PAGE_SHIFT))) { -+ int page = (vaddr - mem->virt_base) >> PAGE_SHIFT; -+ -+ bitmap_release_region(mem->bitmap, page, order); -+ } else { -+ xen_destroy_contiguous_region((unsigned long)vaddr, order); -+ free_pages((unsigned long)vaddr, order); -+ } -+} -+EXPORT_SYMBOL(dma_free_coherent); -+ -+#ifdef ARCH_HAS_DMA_DECLARE_COHERENT_MEMORY -+int dma_declare_coherent_memory(struct device *dev, dma_addr_t bus_addr, -+ dma_addr_t device_addr, size_t size, int flags) -+{ -+ void __iomem *mem_base; -+ int pages = size >> PAGE_SHIFT; -+ int bitmap_size = (pages + 31)/32; -+ -+ if ((flags & (DMA_MEMORY_MAP | DMA_MEMORY_IO)) == 0) -+ goto out; -+ if (!size) -+ goto out; -+ if (dev->dma_mem) -+ goto out; -+ -+ /* FIXME: this routine just ignores DMA_MEMORY_INCLUDES_CHILDREN */ -+ -+ mem_base = ioremap(bus_addr, size); -+ if (!mem_base) -+ goto out; -+ -+ dev->dma_mem = kmalloc(sizeof(struct dma_coherent_mem), GFP_KERNEL); -+ if (!dev->dma_mem) -+ goto out; -+ memset(dev->dma_mem, 0, sizeof(struct dma_coherent_mem)); -+ dev->dma_mem->bitmap = kmalloc(bitmap_size, GFP_KERNEL); -+ if (!dev->dma_mem->bitmap) -+ goto free1_out; -+ memset(dev->dma_mem->bitmap, 0, bitmap_size); -+ -+ dev->dma_mem->virt_base = mem_base; -+ dev->dma_mem->device_base = device_addr; -+ dev->dma_mem->size = pages; -+ dev->dma_mem->flags = flags; -+ -+ if (flags & DMA_MEMORY_MAP) -+ return DMA_MEMORY_MAP; -+ -+ return DMA_MEMORY_IO; -+ -+ free1_out: -+ kfree(dev->dma_mem->bitmap); -+ out: -+ return 0; -+} -+EXPORT_SYMBOL(dma_declare_coherent_memory); -+ -+void dma_release_declared_memory(struct device *dev) -+{ -+ struct dma_coherent_mem *mem = dev->dma_mem; -+ -+ if(!mem) -+ return; -+ dev->dma_mem = NULL; -+ iounmap(mem->virt_base); -+ kfree(mem->bitmap); -+ kfree(mem); -+} -+EXPORT_SYMBOL(dma_release_declared_memory); -+ -+void *dma_mark_declared_memory_occupied(struct device *dev, -+ dma_addr_t device_addr, size_t size) -+{ -+ struct dma_coherent_mem *mem = dev->dma_mem; -+ int pages = (size + (device_addr & ~PAGE_MASK) + PAGE_SIZE - 1) >> PAGE_SHIFT; -+ int pos, err; -+ -+ if (!mem) -+ return ERR_PTR(-EINVAL); -+ -+ pos = (device_addr - mem->device_base) >> PAGE_SHIFT; -+ err = bitmap_allocate_region(mem->bitmap, pos, get_order(pages)); -+ if (err != 0) -+ return ERR_PTR(err); -+ return mem->virt_base + (pos << PAGE_SHIFT); -+} -+EXPORT_SYMBOL(dma_mark_declared_memory_occupied); -+#endif /* ARCH_HAS_DMA_DECLARE_COHERENT_MEMORY */ -+ -+dma_addr_t -+dma_map_single(struct device *dev, void *ptr, size_t size, -+ enum dma_data_direction direction) -+{ -+ dma_addr_t dma; -+ -+ if (direction == DMA_NONE) -+ BUG(); -+ WARN_ON(size == 0); -+ -+ if (swiotlb) { -+ dma = swiotlb_map_single(dev, ptr, size, direction); -+ } else { -+ dma = gnttab_dma_map_page(virt_to_page(ptr)) + -+ offset_in_page(ptr); -+ IOMMU_BUG_ON(range_straddles_page_boundary(__pa(ptr), size)); -+ IOMMU_BUG_ON(address_needs_mapping(dev, dma)); -+ } -+ -+ flush_write_buffers(); -+ return dma; -+} -+EXPORT_SYMBOL(dma_map_single); -+ -+void -+dma_unmap_single(struct device *dev, dma_addr_t dma_addr, size_t size, -+ enum dma_data_direction direction) -+{ -+ if (direction == DMA_NONE) -+ BUG(); -+ if (swiotlb) -+ swiotlb_unmap_single(dev, dma_addr, size, direction); -+ else -+ gnttab_dma_unmap_page(dma_addr); -+} -+EXPORT_SYMBOL(dma_unmap_single); -+ -+void -+dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle, size_t size, -+ enum dma_data_direction direction) -+{ -+ if (swiotlb) -+ swiotlb_sync_single_for_cpu(dev, dma_handle, size, direction); -+} -+EXPORT_SYMBOL(dma_sync_single_for_cpu); -+ -+void -+dma_sync_single_for_device(struct device *dev, dma_addr_t dma_handle, size_t size, -+ enum dma_data_direction direction) -+{ -+ if (swiotlb) -+ swiotlb_sync_single_for_device(dev, dma_handle, size, direction); -+} -+EXPORT_SYMBOL(dma_sync_single_for_device); -diff -rpuN linux-2.6.18.8/arch/i386/kernel/process-xen.c linux-2.6.18-xen-3.2.0/arch/i386/kernel/process-xen.c ---- linux-2.6.18.8/arch/i386/kernel/process-xen.c 1969-12-31 16:00:00.000000000 -0800 -+++ linux-2.6.18-xen-3.2.0/arch/i386/kernel/process-xen.c 2008-02-15 16:21:49.000000000 -0800 -@@ -0,0 +1,856 @@ -+/* -+ * linux/arch/i386/kernel/process.c -+ * -+ * Copyright (C) 1995 Linus Torvalds -+ * -+ * Pentium III FXSR, SSE support -+ * Gareth Hughes , May 2000 -+ */ -+ -+/* -+ * This file handles the architecture-dependent parts of process handling.. -+ */ -+ -+#include -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#ifdef CONFIG_MATH_EMULATION -+#include -+#endif -+ -+#include -+#include -+#include -+ -+#include -+ -+#include -+#include -+ -+asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); -+ -+static int hlt_counter; -+ -+unsigned long boot_option_idle_override = 0; -+EXPORT_SYMBOL(boot_option_idle_override); -+ -+/* -+ * Return saved PC of a blocked thread. -+ */ -+unsigned long thread_saved_pc(struct task_struct *tsk) -+{ -+ return ((unsigned long *)tsk->thread.esp)[3]; -+} -+ -+/* -+ * Powermanagement idle function, if any.. -+ */ -+void (*pm_idle)(void); -+EXPORT_SYMBOL(pm_idle); -+static DEFINE_PER_CPU(unsigned int, cpu_idle_state); -+ -+void disable_hlt(void) -+{ -+ hlt_counter++; -+} -+ -+EXPORT_SYMBOL(disable_hlt); -+ -+void enable_hlt(void) -+{ -+ hlt_counter--; -+} -+ -+EXPORT_SYMBOL(enable_hlt); -+ -+/* -+ * On SMP it's slightly faster (but much more power-consuming!) -+ * to poll the ->work.need_resched flag instead of waiting for the -+ * cross-CPU IPI to arrive. Use this option with caution. -+ */ -+static void poll_idle (void) -+{ -+ local_irq_enable(); -+ -+ asm volatile( -+ "2:" -+ "testl %0, %1;" -+ "rep; nop;" -+ "je 2b;" -+ : : "i"(_TIF_NEED_RESCHED), "m" (current_thread_info()->flags)); -+} -+ -+static void xen_idle(void) -+{ -+ local_irq_disable(); -+ -+ if (need_resched()) -+ local_irq_enable(); -+ else { -+ current_thread_info()->status &= ~TS_POLLING; -+ smp_mb__after_clear_bit(); -+ safe_halt(); -+ current_thread_info()->status |= TS_POLLING; -+ } -+} -+#ifdef CONFIG_APM_MODULE -+EXPORT_SYMBOL(default_idle); -+#endif -+ -+#ifdef CONFIG_HOTPLUG_CPU -+extern cpumask_t cpu_initialized; -+static inline void play_dead(void) -+{ -+ idle_task_exit(); -+ local_irq_disable(); -+ cpu_clear(smp_processor_id(), cpu_initialized); -+ preempt_enable_no_resched(); -+ VOID(HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL)); -+ cpu_bringup(); -+} -+#else -+static inline void play_dead(void) -+{ -+ BUG(); -+} -+#endif /* CONFIG_HOTPLUG_CPU */ -+ -+/* -+ * The idle thread. There's no useful work to be -+ * done, so just try to conserve power and have a -+ * low exit latency (ie sit in a loop waiting for -+ * somebody to say that they'd like to reschedule) -+ */ -+void cpu_idle(void) -+{ -+ int cpu = smp_processor_id(); -+ -+ current_thread_info()->status |= TS_POLLING; -+ -+ /* endless idle loop with no priority at all */ -+ while (1) { -+ while (!need_resched()) { -+ void (*idle)(void); -+ -+ if (__get_cpu_var(cpu_idle_state)) -+ __get_cpu_var(cpu_idle_state) = 0; -+ -+ rmb(); -+ idle = xen_idle; /* no alternatives */ -+ -+ if (cpu_is_offline(cpu)) -+ play_dead(); -+ -+ __get_cpu_var(irq_stat).idle_timestamp = jiffies; -+ idle(); -+ } -+ preempt_enable_no_resched(); -+ schedule(); -+ preempt_disable(); -+ } -+} -+ -+void cpu_idle_wait(void) -+{ -+ unsigned int cpu, this_cpu = get_cpu(); -+ cpumask_t map; -+ -+ set_cpus_allowed(current, cpumask_of_cpu(this_cpu)); -+ put_cpu(); -+ -+ cpus_clear(map); -+ for_each_online_cpu(cpu) { -+ per_cpu(cpu_idle_state, cpu) = 1; -+ cpu_set(cpu, map); -+ } -+ -+ __get_cpu_var(cpu_idle_state) = 0; -+ -+ wmb(); -+ do { -+ ssleep(1); -+ for_each_online_cpu(cpu) { -+ if (cpu_isset(cpu, map) && !per_cpu(cpu_idle_state, cpu)) -+ cpu_clear(cpu, map); -+ } -+ cpus_and(map, map, cpu_online_map); -+ } while (!cpus_empty(map)); -+} -+EXPORT_SYMBOL_GPL(cpu_idle_wait); -+ -+void __devinit select_idle_routine(const struct cpuinfo_x86 *c) -+{ -+} -+ -+static int __init idle_setup (char *str) -+{ -+ if (!strncmp(str, "poll", 4)) { -+ printk("using polling idle threads.\n"); -+ pm_idle = poll_idle; -+ } -+ -+ boot_option_idle_override = 1; -+ return 1; -+} -+ -+__setup("idle=", idle_setup); -+ -+void show_regs(struct pt_regs * regs) -+{ -+ unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L; -+ -+ printk("\n"); -+ printk("Pid: %d, comm: %20s\n", current->pid, current->comm); -+ printk("EIP: %04x:[<%08lx>] CPU: %d\n",0xffff & regs->xcs,regs->eip, smp_processor_id()); -+ print_symbol("EIP is at %s\n", regs->eip); -+ -+ if (user_mode_vm(regs)) -+ printk(" ESP: %04x:%08lx",0xffff & regs->xss,regs->esp); -+ printk(" EFLAGS: %08lx %s (%s %.*s)\n", -+ regs->eflags, print_tainted(), system_utsname.release, -+ (int)strcspn(system_utsname.version, " "), -+ system_utsname.version); -+ printk("EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n", -+ regs->eax,regs->ebx,regs->ecx,regs->edx); -+ printk("ESI: %08lx EDI: %08lx EBP: %08lx", -+ regs->esi, regs->edi, regs->ebp); -+ printk(" DS: %04x ES: %04x\n", -+ 0xffff & regs->xds,0xffff & regs->xes); -+ -+ cr0 = read_cr0(); -+ cr2 = read_cr2(); -+ cr3 = read_cr3(); -+ cr4 = read_cr4_safe(); -+ printk("CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n", cr0, cr2, cr3, cr4); -+ show_trace(NULL, regs, ®s->esp); -+} -+ -+/* -+ * This gets run with %ebx containing the -+ * function to call, and %edx containing -+ * the "args". -+ */ -+extern void kernel_thread_helper(void); -+__asm__(".section .text\n" -+ ".align 4\n" -+ "kernel_thread_helper:\n\t" -+ "movl %edx,%eax\n\t" -+ "pushl %edx\n\t" -+ "call *%ebx\n\t" -+ "pushl %eax\n\t" -+ "call do_exit\n" -+ ".previous"); -+ -+/* -+ * Create a kernel thread -+ */ -+int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags) -+{ -+ struct pt_regs regs; -+ -+ memset(®s, 0, sizeof(regs)); -+ -+ regs.ebx = (unsigned long) fn; -+ regs.edx = (unsigned long) arg; -+ -+ regs.xds = __USER_DS; -+ regs.xes = __USER_DS; -+ regs.orig_eax = -1; -+ regs.eip = (unsigned long) kernel_thread_helper; -+ regs.xcs = GET_KERNEL_CS(); -+ regs.eflags = X86_EFLAGS_IF | X86_EFLAGS_SF | X86_EFLAGS_PF | 0x2; -+ -+ /* Ok, create the new process.. */ -+ return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, ®s, 0, NULL, NULL); -+} -+EXPORT_SYMBOL(kernel_thread); -+ -+/* -+ * Free current thread data structures etc.. -+ */ -+void exit_thread(void) -+{ -+ /* The process may have allocated an io port bitmap... nuke it. */ -+ if (unlikely(test_thread_flag(TIF_IO_BITMAP))) { -+ struct task_struct *tsk = current; -+ struct thread_struct *t = &tsk->thread; -+ struct physdev_set_iobitmap set_iobitmap; -+ memset(&set_iobitmap, 0, sizeof(set_iobitmap)); -+ WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iobitmap, -+ &set_iobitmap)); -+ kfree(t->io_bitmap_ptr); -+ t->io_bitmap_ptr = NULL; -+ clear_thread_flag(TIF_IO_BITMAP); -+ } -+} -+ -+void flush_thread(void) -+{ -+ struct task_struct *tsk = current; -+ -+ memset(tsk->thread.debugreg, 0, sizeof(unsigned long)*8); -+ memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); -+ clear_tsk_thread_flag(tsk, TIF_DEBUG); -+ /* -+ * Forget coprocessor state.. -+ */ -+ clear_fpu(tsk); -+ clear_used_math(); -+} -+ -+void release_thread(struct task_struct *dead_task) -+{ -+ BUG_ON(dead_task->mm); -+ release_vm86_irqs(dead_task); -+} -+ -+/* -+ * This gets called before we allocate a new thread and copy -+ * the current task into it. -+ */ -+void prepare_to_copy(struct task_struct *tsk) -+{ -+ unlazy_fpu(tsk); -+} -+ -+int copy_thread(int nr, unsigned long clone_flags, unsigned long esp, -+ unsigned long unused, -+ struct task_struct * p, struct pt_regs * regs) -+{ -+ struct pt_regs * childregs; -+ struct task_struct *tsk; -+ int err; -+ -+ childregs = task_pt_regs(p); -+ *childregs = *regs; -+ childregs->eax = 0; -+ childregs->esp = esp; -+ -+ p->thread.esp = (unsigned long) childregs; -+ p->thread.esp0 = (unsigned long) (childregs+1); -+ -+ p->thread.eip = (unsigned long) ret_from_fork; -+ -+ savesegment(fs,p->thread.fs); -+ savesegment(gs,p->thread.gs); -+ -+ tsk = current; -+ if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) { -+ p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL); -+ if (!p->thread.io_bitmap_ptr) { -+ p->thread.io_bitmap_max = 0; -+ return -ENOMEM; -+ } -+ memcpy(p->thread.io_bitmap_ptr, tsk->thread.io_bitmap_ptr, -+ IO_BITMAP_BYTES); -+ set_tsk_thread_flag(p, TIF_IO_BITMAP); -+ } -+ -+ /* -+ * Set a new TLS for the child thread? -+ */ -+ if (clone_flags & CLONE_SETTLS) { -+ struct desc_struct *desc; -+ struct user_desc info; -+ int idx; -+ -+ err = -EFAULT; -+ if (copy_from_user(&info, (void __user *)childregs->esi, sizeof(info))) -+ goto out; -+ err = -EINVAL; -+ if (LDT_empty(&info)) -+ goto out; -+ -+ idx = info.entry_number; -+ if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX) -+ goto out; -+ -+ desc = p->thread.tls_array + idx - GDT_ENTRY_TLS_MIN; -+ desc->a = LDT_entry_a(&info); -+ desc->b = LDT_entry_b(&info); -+ } -+ -+ p->thread.iopl = current->thread.iopl; -+ -+ err = 0; -+ out: -+ if (err && p->thread.io_bitmap_ptr) { -+ kfree(p->thread.io_bitmap_ptr); -+ p->thread.io_bitmap_max = 0; -+ } -+ return err; -+} -+ -+/* -+ * fill in the user structure for a core dump.. -+ */ -+void dump_thread(struct pt_regs * regs, struct user * dump) -+{ -+ int i; -+ -+/* changed the size calculations - should hopefully work better. lbt */ -+ dump->magic = CMAGIC; -+ dump->start_code = 0; -+ dump->start_stack = regs->esp & ~(PAGE_SIZE - 1); -+ dump->u_tsize = ((unsigned long) current->mm->end_code) >> PAGE_SHIFT; -+ dump->u_dsize = ((unsigned long) (current->mm->brk + (PAGE_SIZE-1))) >> PAGE_SHIFT; -+ dump->u_dsize -= dump->u_tsize; -+ dump->u_ssize = 0; -+ for (i = 0; i < 8; i++) -+ dump->u_debugreg[i] = current->thread.debugreg[i]; -+ -+ if (dump->start_stack < TASK_SIZE) -+ dump->u_ssize = ((unsigned long) (TASK_SIZE - dump->start_stack)) >> PAGE_SHIFT; -+ -+ dump->regs.ebx = regs->ebx; -+ dump->regs.ecx = regs->ecx; -+ dump->regs.edx = regs->edx; -+ dump->regs.esi = regs->esi; -+ dump->regs.edi = regs->edi; -+ dump->regs.ebp = regs->ebp; -+ dump->regs.eax = regs->eax; -+ dump->regs.ds = regs->xds; -+ dump->regs.es = regs->xes; -+ savesegment(fs,dump->regs.fs); -+ savesegment(gs,dump->regs.gs); -+ dump->regs.orig_eax = regs->orig_eax; -+ dump->regs.eip = regs->eip; -+ dump->regs.cs = regs->xcs; -+ dump->regs.eflags = regs->eflags; -+ dump->regs.esp = regs->esp; -+ dump->regs.ss = regs->xss; -+ -+ dump->u_fpvalid = dump_fpu (regs, &dump->i387); -+} -+EXPORT_SYMBOL(dump_thread); -+ -+/* -+ * Capture the user space registers if the task is not running (in user space) -+ */ -+int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs) -+{ -+ struct pt_regs ptregs = *task_pt_regs(tsk); -+ ptregs.xcs &= 0xffff; -+ ptregs.xds &= 0xffff; -+ ptregs.xes &= 0xffff; -+ ptregs.xss &= 0xffff; -+ -+ elf_core_copy_regs(regs, &ptregs); -+ -+ return 1; -+} -+ -+static noinline void __switch_to_xtra(struct task_struct *next_p) -+{ -+ struct thread_struct *next; -+ -+ next = &next_p->thread; -+ -+ if (test_tsk_thread_flag(next_p, TIF_DEBUG)) { -+ set_debugreg(next->debugreg[0], 0); -+ set_debugreg(next->debugreg[1], 1); -+ set_debugreg(next->debugreg[2], 2); -+ set_debugreg(next->debugreg[3], 3); -+ /* no 4 and 5 */ -+ set_debugreg(next->debugreg[6], 6); -+ set_debugreg(next->debugreg[7], 7); -+ } -+} -+ -+/* -+ * This function selects if the context switch from prev to next -+ * has to tweak the TSC disable bit in the cr4. -+ */ -+static inline void disable_tsc(struct task_struct *prev_p, -+ struct task_struct *next_p) -+{ -+ struct thread_info *prev, *next; -+ -+ /* -+ * gcc should eliminate the ->thread_info dereference if -+ * has_secure_computing returns 0 at compile time (SECCOMP=n). -+ */ -+ prev = task_thread_info(prev_p); -+ next = task_thread_info(next_p); -+ -+ if (has_secure_computing(prev) || has_secure_computing(next)) { -+ /* slow path here */ -+ if (has_secure_computing(prev) && -+ !has_secure_computing(next)) { -+ write_cr4(read_cr4() & ~X86_CR4_TSD); -+ } else if (!has_secure_computing(prev) && -+ has_secure_computing(next)) -+ write_cr4(read_cr4() | X86_CR4_TSD); -+ } -+} -+ -+/* -+ * switch_to(x,yn) should switch tasks from x to y. -+ * -+ * We fsave/fwait so that an exception goes off at the right time -+ * (as a call from the fsave or fwait in effect) rather than to -+ * the wrong process. Lazy FP saving no longer makes any sense -+ * with modern CPU's, and this simplifies a lot of things (SMP -+ * and UP become the same). -+ * -+ * NOTE! We used to use the x86 hardware context switching. The -+ * reason for not using it any more becomes apparent when you -+ * try to recover gracefully from saved state that is no longer -+ * valid (stale segment register values in particular). With the -+ * hardware task-switch, there is no way to fix up bad state in -+ * a reasonable manner. -+ * -+ * The fact that Intel documents the hardware task-switching to -+ * be slow is a fairly red herring - this code is not noticeably -+ * faster. However, there _is_ some room for improvement here, -+ * so the performance issues may eventually be a valid point. -+ * More important, however, is the fact that this allows us much -+ * more flexibility. -+ * -+ * The return value (in %eax) will be the "prev" task after -+ * the task-switch, and shows up in ret_from_fork in entry.S, -+ * for example. -+ */ -+struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct task_struct *next_p) -+{ -+ struct thread_struct *prev = &prev_p->thread, -+ *next = &next_p->thread; -+ int cpu = smp_processor_id(); -+#ifndef CONFIG_X86_NO_TSS -+ struct tss_struct *tss = &per_cpu(init_tss, cpu); -+#endif -+ struct physdev_set_iopl iopl_op; -+ struct physdev_set_iobitmap iobmp_op; -+ multicall_entry_t _mcl[8], *mcl = _mcl; -+ -+ /* XEN NOTE: FS/GS saved in switch_mm(), not here. */ -+ -+ /* -+ * This is basically '__unlazy_fpu', except that we queue a -+ * multicall to indicate FPU task switch, rather than -+ * synchronously trapping to Xen. -+ */ -+ if (prev_p->thread_info->status & TS_USEDFPU) { -+ __save_init_fpu(prev_p); /* _not_ save_init_fpu() */ -+ mcl->op = __HYPERVISOR_fpu_taskswitch; -+ mcl->args[0] = 1; -+ mcl++; -+ } -+#if 0 /* lazy fpu sanity check */ -+ else BUG_ON(!(read_cr0() & 8)); -+#endif -+ -+ /* -+ * Reload esp0. -+ * This is load_esp0(tss, next) with a multicall. -+ */ -+ mcl->op = __HYPERVISOR_stack_switch; -+ mcl->args[0] = __KERNEL_DS; -+ mcl->args[1] = next->esp0; -+ mcl++; -+ -+ /* -+ * Load the per-thread Thread-Local Storage descriptor. -+ * This is load_TLS(next, cpu) with multicalls. -+ */ -+#define C(i) do { \ -+ if (unlikely(next->tls_array[i].a != prev->tls_array[i].a || \ -+ next->tls_array[i].b != prev->tls_array[i].b)) { \ -+ mcl->op = __HYPERVISOR_update_descriptor; \ -+ *(u64 *)&mcl->args[0] = virt_to_machine( \ -+ &get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN + i]);\ -+ *(u64 *)&mcl->args[2] = *(u64 *)&next->tls_array[i]; \ -+ mcl++; \ -+ } \ -+} while (0) -+ C(0); C(1); C(2); -+#undef C -+ -+ if (unlikely(prev->iopl != next->iopl)) { -+ iopl_op.iopl = (next->iopl == 0) ? 1 : (next->iopl >> 12) & 3; -+ mcl->op = __HYPERVISOR_physdev_op; -+ mcl->args[0] = PHYSDEVOP_set_iopl; -+ mcl->args[1] = (unsigned long)&iopl_op; -+ mcl++; -+ } -+ -+ if (unlikely(prev->io_bitmap_ptr || next->io_bitmap_ptr)) { -+ set_xen_guest_handle(iobmp_op.bitmap, -+ (char *)next->io_bitmap_ptr); -+ iobmp_op.nr_ports = next->io_bitmap_ptr ? IO_BITMAP_BITS : 0; -+ mcl->op = __HYPERVISOR_physdev_op; -+ mcl->args[0] = PHYSDEVOP_set_iobitmap; -+ mcl->args[1] = (unsigned long)&iobmp_op; -+ mcl++; -+ } -+ -+ BUG_ON(mcl > _mcl + ARRAY_SIZE(_mcl)); -+ if (unlikely(HYPERVISOR_multicall_check(_mcl, mcl - _mcl, NULL))) -+ BUG(); -+ -+ /* -+ * Restore %fs and %gs if needed. -+ * -+ * Glibc normally makes %fs be zero, and %gs is one of -+ * the TLS segments. -+ */ -+ if (unlikely(next->fs)) -+ loadsegment(fs, next->fs); -+ -+ if (next->gs) -+ loadsegment(gs, next->gs); -+ -+ /* -+ * Now maybe handle debug registers -+ */ -+ if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW)) -+ __switch_to_xtra(next_p); -+ -+ disable_tsc(prev_p, next_p); -+ -+ return prev_p; -+} -+ -+asmlinkage int sys_fork(struct pt_regs regs) -+{ -+ return do_fork(SIGCHLD, regs.esp, ®s, 0, NULL, NULL); -+} -+ -+asmlinkage int sys_clone(struct pt_regs regs) -+{ -+ unsigned long clone_flags; -+ unsigned long newsp; -+ int __user *parent_tidptr, *child_tidptr; -+ -+ clone_flags = regs.ebx; -+ newsp = regs.ecx; -+ parent_tidptr = (int __user *)regs.edx; -+ child_tidptr = (int __user *)regs.edi; -+ if (!newsp) -+ newsp = regs.esp; -+ return do_fork(clone_flags, newsp, ®s, 0, parent_tidptr, child_tidptr); -+} -+ -+/* -+ * This is trivial, and on the face of it looks like it -+ * could equally well be done in user mode. -+ * -+ * Not so, for quite unobvious reasons - register pressure. -+ * In user mode vfork() cannot have a stack frame, and if -+ * done by calling the "clone()" system call directly, you -+ * do not have enough call-clobbered registers to hold all -+ * the information you need. -+ */ -+asmlinkage int sys_vfork(struct pt_regs regs) -+{ -+ return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs.esp, ®s, 0, NULL, NULL); -+} -+ -+/* -+ * sys_execve() executes a new program. -+ */ -+asmlinkage int sys_execve(struct pt_regs regs) -+{ -+ int error; -+ char * filename; -+ -+ filename = getname((char __user *) regs.ebx); -+ error = PTR_ERR(filename); -+ if (IS_ERR(filename)) -+ goto out; -+ error = do_execve(filename, -+ (char __user * __user *) regs.ecx, -+ (char __user * __user *) regs.edx, -+ ®s); -+ if (error == 0) { -+ task_lock(current); -+ current->ptrace &= ~PT_DTRACE; -+ task_unlock(current); -+ /* Make sure we don't return using sysenter.. */ -+ set_thread_flag(TIF_IRET); -+ } -+ putname(filename); -+out: -+ return error; -+} -+ -+#define top_esp (THREAD_SIZE - sizeof(unsigned long)) -+#define top_ebp (THREAD_SIZE - 2*sizeof(unsigned long)) -+ -+unsigned long get_wchan(struct task_struct *p) -+{ -+ unsigned long ebp, esp, eip; -+ unsigned long stack_page; -+ int count = 0; -+ if (!p || p == current || p->state == TASK_RUNNING) -+ return 0; -+ stack_page = (unsigned long)task_stack_page(p); -+ esp = p->thread.esp; -+ if (!stack_page || esp < stack_page || esp > top_esp+stack_page) -+ return 0; -+ /* include/asm-i386/system.h:switch_to() pushes ebp last. */ -+ ebp = *(unsigned long *) esp; -+ do { -+ if (ebp < stack_page || ebp > top_ebp+stack_page) -+ return 0; -+ eip = *(unsigned long *) (ebp+4); -+ if (!in_sched_functions(eip)) -+ return eip; -+ ebp = *(unsigned long *) ebp; -+ } while (count++ < 16); -+ return 0; -+} -+ -+/* -+ * sys_alloc_thread_area: get a yet unused TLS descriptor index. -+ */ -+static int get_free_idx(void) -+{ -+ struct thread_struct *t = ¤t->thread; -+ int idx; -+ -+ for (idx = 0; idx < GDT_ENTRY_TLS_ENTRIES; idx++) -+ if (desc_empty(t->tls_array + idx)) -+ return idx + GDT_ENTRY_TLS_MIN; -+ return -ESRCH; -+} -+ -+/* -+ * Set a given TLS descriptor: -+ */ -+asmlinkage int sys_set_thread_area(struct user_desc __user *u_info) -+{ -+ struct thread_struct *t = ¤t->thread; -+ struct user_desc info; -+ struct desc_struct *desc; -+ int cpu, idx; -+ -+ if (copy_from_user(&info, u_info, sizeof(info))) -+ return -EFAULT; -+ idx = info.entry_number; -+ -+ /* -+ * index -1 means the kernel should try to find and -+ * allocate an empty descriptor: -+ */ -+ if (idx == -1) { -+ idx = get_free_idx(); -+ if (idx < 0) -+ return idx; -+ if (put_user(idx, &u_info->entry_number)) -+ return -EFAULT; -+ } -+ -+ if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX) -+ return -EINVAL; -+ -+ desc = t->tls_array + idx - GDT_ENTRY_TLS_MIN; -+ -+ /* -+ * We must not get preempted while modifying the TLS. -+ */ -+ cpu = get_cpu(); -+ -+ if (LDT_empty(&info)) { -+ desc->a = 0; -+ desc->b = 0; -+ } else { -+ desc->a = LDT_entry_a(&info); -+ desc->b = LDT_entry_b(&info); -+ } -+ load_TLS(t, cpu); -+ -+ put_cpu(); -+ -+ return 0; -+} -+ -+/* -+ * Get the current Thread-Local Storage area: -+ */ -+ -+#define GET_BASE(desc) ( \ -+ (((desc)->a >> 16) & 0x0000ffff) | \ -+ (((desc)->b << 16) & 0x00ff0000) | \ -+ ( (desc)->b & 0xff000000) ) -+ -+#define GET_LIMIT(desc) ( \ -+ ((desc)->a & 0x0ffff) | \ -+ ((desc)->b & 0xf0000) ) -+ -+#define GET_32BIT(desc) (((desc)->b >> 22) & 1) -+#define GET_CONTENTS(desc) (((desc)->b >> 10) & 3) -+#define GET_WRITABLE(desc) (((desc)->b >> 9) & 1) -+#define GET_LIMIT_PAGES(desc) (((desc)->b >> 23) & 1) -+#define GET_PRESENT(desc) (((desc)->b >> 15) & 1) -+#define GET_USEABLE(desc) (((desc)->b >> 20) & 1) -+ -+asmlinkage int sys_get_thread_area(struct user_desc __user *u_info) -+{ -+ struct user_desc info; -+ struct desc_struct *desc; -+ int idx; -+ -+ if (get_user(idx, &u_info->entry_number)) -+ return -EFAULT; -+ if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX) -+ return -EINVAL; -+ -+ memset(&info, 0, sizeof(info)); -+ -+ desc = current->thread.tls_array + idx - GDT_ENTRY_TLS_MIN; -+ -+ info.entry_number = idx; -+ info.base_addr = GET_BASE(desc); -+ info.limit = GET_LIMIT(desc); -+ info.seg_32bit = GET_32BIT(desc); -+ info.contents = GET_CONTENTS(desc); -+ info.read_exec_only = !GET_WRITABLE(desc); -+ info.limit_in_pages = GET_LIMIT_PAGES(desc); -+ info.seg_not_present = !GET_PRESENT(desc); -+ info.useable = GET_USEABLE(desc); -+ -+ if (copy_to_user(u_info, &info, sizeof(info))) -+ return -EFAULT; -+ return 0; -+} -+ -+unsigned long arch_align_stack(unsigned long sp) -+{ -+ if (randomize_va_space) -+ sp -= get_random_int() % 8192; -+ return sp & ~0xf; -+} -diff -rpuN linux-2.6.18.8/arch/i386/kernel/quirks-xen.c linux-2.6.18-xen-3.2.0/arch/i386/kernel/quirks-xen.c ---- linux-2.6.18.8/arch/i386/kernel/quirks-xen.c 1969-12-31 16:00:00.000000000 -0800 -+++ linux-2.6.18-xen-3.2.0/arch/i386/kernel/quirks-xen.c 2008-02-15 16:21:49.000000000 -0800 -@@ -0,0 +1,47 @@ -+/* -+ * This file contains work-arounds for x86 and x86_64 platform bugs. -+ */ -+#include -+#include -+ -+#if defined(CONFIG_X86_IO_APIC) && (defined(CONFIG_SMP) || defined(CONFIG_XEN)) && defined(CONFIG_PCI) -+ -+static void __devinit quirk_intel_irqbalance(struct pci_dev *dev) -+{ -+ u8 config, rev; -+ u32 word; -+ -+ /* BIOS may enable hardware IRQ balancing for -+ * E7520/E7320/E7525(revision ID 0x9 and below) -+ * based platforms. -+ * Disable SW irqbalance/affinity on those platforms. -+ */ -+ pci_read_config_byte(dev, PCI_CLASS_REVISION, &rev); -+ if (rev > 0x9) -+ return; -+ -+ printk(KERN_INFO "Intel E7520/7320/7525 detected."); -+ -+ /* enable access to config space*/ -+ pci_read_config_byte(dev, 0xf4, &config); -+ pci_write_config_byte(dev, 0xf4, config|0x2); -+ -+ /* read xTPR register */ -+ raw_pci_ops->read(0, 0, 0x40, 0x4c, 2, &word); -+ -+ if (!(word & (1 << 13))) { -+ struct xen_platform_op op; -+ printk(KERN_INFO "Disabling irq balancing and affinity\n"); -+ op.cmd = XENPF_platform_quirk; -+ op.u.platform_quirk.quirk_id = QUIRK_NOIRQBALANCING; -+ WARN_ON(HYPERVISOR_platform_op(&op)); -+ } -+ -+ /* put back the original value for config space*/ -+ if (!(config & 0x2)) -+ pci_write_config_byte(dev, 0xf4, config); -+} -+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7320_MCH, quirk_intel_irqbalance); -+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7525_MCH, quirk_intel_irqbalance); -+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7520_MCH, quirk_intel_irqbalance); -+#endif -diff -rpuN linux-2.6.18.8/arch/i386/kernel/relocate_kernel.S linux-2.6.18-xen-3.2.0/arch/i386/kernel/relocate_kernel.S ---- linux-2.6.18.8/arch/i386/kernel/relocate_kernel.S 2007-02-23 15:52:30.000000000 -0800 -+++ linux-2.6.18-xen-3.2.0/arch/i386/kernel/relocate_kernel.S 2008-02-15 16:21:49.000000000 -0800 -@@ -7,16 +7,138 @@ - */ - - #include -+#include -+#include -+ -+/* -+ * Must be relocatable PIC code callable as a C function -+ */ -+ -+#define PTR(x) (x << 2) -+#define PAGE_ALIGNED (1 << PAGE_SHIFT) -+#define PAGE_ATTR 0x63 /* _PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY */ -+#define PAE_PGD_ATTR 0x01 /* _PAGE_PRESENT */ -+ -+ .text -+ .align PAGE_ALIGNED -+ .globl relocate_kernel -+relocate_kernel: -+ movl 8(%esp), %ebp /* list of pages */ -+ -+#ifdef CONFIG_X86_PAE -+ /* map the control page at its virtual address */ -+ -+ movl PTR(VA_PGD)(%ebp), %edi -+ movl PTR(VA_CONTROL_PAGE)(%ebp), %eax -+ andl $0xc0000000, %eax -+ shrl $27, %eax -+ addl %edi, %eax -+ -+ movl PTR(PA_PMD_0)(%ebp), %edx -+ orl $PAE_PGD_ATTR, %edx -+ movl %edx, (%eax) -+ -+ movl PTR(VA_PMD_0)(%ebp), %edi -+ movl PTR(VA_CONTROL_PAGE)(%ebp), %eax -+ andl $0x3fe00000, %eax -+ shrl $18, %eax -+ addl %edi, %eax -+ -+ movl PTR(PA_PTE_0)(%ebp), %edx -+ orl $PAGE_ATTR, %edx -+ movl %edx, (%eax) -+ -+ movl PTR(VA_PTE_0)(%ebp), %edi -+ movl PTR(VA_CONTROL_PAGE)(%ebp), %eax -+ andl $0x001ff000, %eax -+ shrl $9, %eax -+ addl %edi, %eax -+ -+ movl PTR(PA_CONTROL_PAGE)(%ebp), %edx -+ orl $PAGE_ATTR, %edx -+ movl %edx, (%eax) -+ -+ /* identity map the control page at its physical address */ -+ -+ movl PTR(VA_PGD)(%ebp), %edi -+ movl PTR(PA_CONTROL_PAGE)(%ebp), %eax -+ andl $0xc0000000, %eax -+ shrl $27, %eax -+ addl %edi, %eax -+ -+ movl PTR(PA_PMD_1)(%ebp), %edx -+ orl $PAE_PGD_ATTR, %edx -+ movl %edx, (%eax) -+ -+ movl PTR(VA_PMD_1)(%ebp), %edi -+ movl PTR(PA_CONTROL_PAGE)(%ebp), %eax -+ andl $0x3fe00000, %eax -+ shrl $18, %eax -+ addl %edi, %eax -+ -+ movl PTR(PA_PTE_1)(%ebp), %edx -+ orl $PAGE_ATTR, %edx -+ movl %edx, (%eax) -+ -+ movl PTR(VA_PTE_1)(%ebp), %edi -+ movl PTR(PA_CONTROL_PAGE)(%ebp), %eax -+ andl $0x001ff000, %eax -+ shrl $9, %eax -+ addl %edi, %eax -+ -+ movl PTR(PA_CONTROL_PAGE)(%ebp), %edx -+ orl $PAGE_ATTR, %edx -+ movl %edx, (%eax) -+#else -+ /* map the control page at its virtual address */ -+ -+ movl PTR(VA_PGD)(%ebp), %edi -+ movl PTR(VA_CONTROL_PAGE)(%ebp), %eax -+ andl $0xffc00000, %eax -+ shrl $20, %eax -+ addl %edi, %eax -+ -+ movl PTR(PA_PTE_0)(%ebp), %edx -+ orl $PAGE_ATTR, %edx -+ movl %edx, (%eax) -+ -+ movl PTR(VA_PTE_0)(%ebp), %edi -+ movl PTR(VA_CONTROL_PAGE)(%ebp), %eax -+ andl $0x003ff000, %eax -+ shrl $10, %eax -+ addl %edi, %eax -+ -+ movl PTR(PA_CONTROL_PAGE)(%ebp), %edx -+ orl $PAGE_ATTR, %edx -+ movl %edx, (%eax) -+ -+ /* identity map the control page at its physical address */ -+ -+ movl PTR(VA_PGD)(%ebp), %edi -+ movl PTR(PA_CONTROL_PAGE)(%ebp), %eax -+ andl $0xffc00000, %eax -+ shrl $20, %eax -+ addl %edi, %eax -+ -+ movl PTR(PA_PTE_1)(%ebp), %edx -+ orl $PAGE_ATTR, %edx -+ movl %edx, (%eax) -+ -+ movl PTR(VA_PTE_1)(%ebp), %edi -+ movl PTR(PA_CONTROL_PAGE)(%ebp), %eax -+ andl $0x003ff000, %eax -+ shrl $10, %eax -+ addl %edi, %eax -+ -+ movl PTR(PA_CONTROL_PAGE)(%ebp), %edx -+ orl $PAGE_ATTR, %edx -+ movl %edx, (%eax) -+#endif - -- /* -- * Must be relocatable PIC code callable as a C function, that once -- * it starts can not use the previous processes stack. -- */ -- .globl relocate_new_kernel - relocate_new_kernel: - /* read the arguments and say goodbye to the stack */ - movl 4(%esp), %ebx /* page_list */ -- movl 8(%esp), %ebp /* reboot_code_buffer */ -+ movl 8(%esp), %ebp /* list of pages */ - movl 12(%esp), %edx /* start address */ - movl 16(%esp), %ecx /* cpu_has_pae */ - -@@ -24,11 +146,57 @@ relocate_new_kernel: - pushl $0 - popfl - -- /* set a new stack at the bottom of our page... */ -- lea 4096(%ebp), %esp -+ /* get physical address of control page now */ -+ /* this is impossible after page table switch */ -+ movl PTR(PA_CONTROL_PAGE)(%ebp), %edi - -- /* store the parameters back on the stack */ -- pushl %edx /* store the start address */ -+ /* switch to new set of page tables */ -+ movl PTR(PA_PGD)(%ebp), %eax -+ movl %eax, %cr3 -+ -+ /* setup idt */ -+ movl %edi, %eax -+ addl $(idt_48 - relocate_kernel), %eax -+ lidtl (%eax) -+ -+ /* setup gdt */ -+ movl %edi, %eax -+ addl $(gdt - relocate_kernel), %eax -+ movl %edi, %esi -+ addl $((gdt_48 - relocate_kernel) + 2), %esi -+ movl %eax, (%esi) -+ -+ movl %edi, %eax -+ addl $(gdt_48 - relocate_kernel), %eax -+ lgdtl (%eax) -+ -+ /* setup data segment registers */ -+ mov $(gdt_ds - gdt), %eax -+ mov %eax, %ds -+ mov %eax, %es -+ mov %eax, %fs -+ mov %eax, %gs -+ mov %eax, %ss -+ -+ /* setup a new stack at the end of the physical control page */ -+ lea 4096(%edi), %esp -+ -+ /* load new code segment and jump to identity mapped page */ -+ movl %edi, %esi -+ xorl %eax, %eax -+ pushl %eax -+ pushl %esi -+ pushl %eax -+ movl $(gdt_cs - gdt), %eax -+ pushl %eax -+ movl %edi, %eax -+ addl $(identity_mapped - relocate_kernel),%eax -+ pushl %eax -+ iretl -+ -+identity_mapped: -+ /* store the start address on the stack */ -+ pushl %edx - - /* Set cr0 to a known state: - * 31 0 == Paging disabled -@@ -113,8 +281,20 @@ relocate_new_kernel: - xorl %edi, %edi - xorl %ebp, %ebp - ret --relocate_new_kernel_end: - -- .globl relocate_new_kernel_size --relocate_new_kernel_size: -- .long relocate_new_kernel_end - relocate_new_kernel -+ .align 16 -+gdt: -+ .quad 0x0000000000000000 /* NULL descriptor */ -+gdt_cs: -+ .quad 0x00cf9a000000ffff /* kernel 4GB code at 0x00000000 */ -+gdt_ds: -+ .quad 0x00cf92000000ffff /* kernel 4GB data at 0x00000000 */ -+gdt_end: -+ -+gdt_48: -+ .word gdt_end - gdt - 1 /* limit */ -+ .long 0 /* base - filled in by code above */ -+ -+idt_48: -+ .word 0 /* limit */ -+ .long 0 /* base */ -diff -rpuN linux-2.6.18.8/arch/i386/kernel/setup-xen.c linux-2.6.18-xen-3.2.0/arch/i386/kernel/setup-xen.c ---- linux-2.6.18.8/arch/i386/kernel/setup-xen.c 1969-12-31 16:00:00.000000000 -0800 -+++ linux-2.6.18-xen-3.2.0/arch/i386/kernel/setup-xen.c 2008-02-15 16:21:49.000000000 -0800 -@@ -0,0 +1,1919 @@ -+/* -+ * linux/arch/i386/kernel/setup.c -+ * -+ * Copyright (C) 1995 Linus Torvalds -+ * -+ * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999 -+ * -+ * Memory region support -+ * David Parsons , July-August 1999 -+ * -+ * Added E820 sanitization routine (removes overlapping memory regions); -+ * Brian Moyle , February 2001 -+ * -+ * Moved CPU detection code to cpu/${cpu}.c -+ * Patrick Mochel , March 2002 -+ * -+ * Provisions for empty E820 memory regions (reported by certain BIOSes). -+ * Alex Achenbach , December 2002. -+ * -+ */ -+ -+/* -+ * This file handles the architecture-dependent parts of initialization -+ */ -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#include