author		Jan Beulich <jbeulich@suse.com>		2013-06-05 15:41:12 +0200
committer	Jan Beulich <jbeulich@suse.com>		2013-06-05 15:41:12 +0200
commit		7c3a112e6566f9151694db2ba0d3850e641359b4 (patch)
tree		4665e406f7c5b40b2a037a2a69ae102b94c13af5
parent		bb74f54b0741c619717cd6668bdb7f05f83d80f8 (diff)
- Update Xen patches to 3.10-rc4.
- Use vCPU time info registration to support vread_pvclock().
- Delete patches.xen/xen-netback-nr-irqs.
- config.conf: Re-enable Xen flavors.
- Update x86 config files.

suse-commit: 4967b19cee8107ad36c1d4384a771a17b8f3c022
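For context: the vread_pvclock() item refers to having the hypervisor maintain per-vCPU pvclock time info at a guest-chosen address, so the vDSO can compute clock_gettime() entirely in userspace. A minimal sketch of such a registration, assuming the VCPUOP_register_vcpu_time_memory_area interface from include/xen/interface/vcpu.h (the actual wiring in this tree may differ, and error handling is trimmed):

    /* Sketch only: ask the hypervisor to keep pvclock time info for
     * one vCPU up to date at a guest-chosen address (e.g. a fixmap
     * page later exposed to the vDSO). */
    static int register_vcpu_time_info(unsigned int cpu,
                                       struct pvclock_vcpu_time_info *pvti)
    {
            struct vcpu_register_time_memory_area area;

            area.addr.v = pvti;
            return HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_time_memory_area,
                                      cpu, &area);
    }

Much of the diff below is mechanical: upstream's pvops CONFIG_XEN becomes CONFIG_PARAVIRT_XEN so that the forward-ported XenoLinux code in this tree can use CONFIG_XEN (via the new X86_XEN/X86_64_XEN options) itself.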
-rw-r--r--Documentation/filesystems/proc.txt1
-rw-r--r--Documentation/kernel-parameters.txt29
-rw-r--r--arch/arm/Kconfig4
-rw-r--r--arch/arm/Makefile2
-rw-r--r--arch/arm/include/asm/xen/interface.h18
-rw-r--r--arch/ia64/Kconfig2
-rw-r--r--arch/ia64/Makefile2
-rw-r--r--arch/ia64/include/asm/xen/hypervisor.h4
-rw-r--r--arch/ia64/include/asm/xen/interface.h25
-rw-r--r--arch/ia64/kernel/asm-offsets.c2
-rw-r--r--arch/ia64/kernel/vmlinux.lds.S2
-rw-r--r--arch/ia64/xen/Kconfig8
-rw-r--r--arch/ia64/xen/xcom_hcall.c2
-rw-r--r--arch/x86/Kbuild4
-rw-r--r--arch/x86/Kconfig212
-rw-r--r--arch/x86/Kconfig.cpu21
-rw-r--r--arch/x86/Kconfig.debug7
-rw-r--r--arch/x86/Makefile24
-rw-r--r--arch/x86/boot/Makefile19
-rw-r--r--arch/x86/boot/boot.h4
-rw-r--r--arch/x86/boot/compressed/Makefile6
-rw-r--r--arch/x86/boot/header.S5
-rw-r--r--arch/x86/boot/tools/build.c8
-rw-r--r--arch/x86/ia32/ia32entry-xen.S388
-rw-r--r--arch/x86/include/asm/acpi.h8
-rw-r--r--arch/x86/include/asm/agp.h3
-rw-r--r--arch/x86/include/asm/apic.h26
-rw-r--r--arch/x86/include/asm/apicdef.h16
-rw-r--r--arch/x86/include/asm/boot.h2
-rw-r--r--arch/x86/include/asm/compat.h4
-rw-r--r--arch/x86/include/asm/cpufeature.h4
-rw-r--r--arch/x86/include/asm/debugreg.h10
-rw-r--r--arch/x86/include/asm/dwarf2.h2
-rw-r--r--arch/x86/include/asm/efi.h4
-rw-r--r--arch/x86/include/asm/hardirq.h4
-rw-r--r--arch/x86/include/asm/hw_irq.h10
-rw-r--r--arch/x86/include/asm/hypervisor.h5
-rw-r--r--arch/x86/include/asm/i8259.h2
-rw-r--r--arch/x86/include/asm/io.h4
-rw-r--r--arch/x86/include/asm/io_apic.h2
-rw-r--r--arch/x86/include/asm/kexec.h29
-rw-r--r--arch/x86/include/asm/mach_traps.h25
-rw-r--r--arch/x86/include/asm/mc146818rtc.h2
-rw-r--r--arch/x86/include/asm/mmu.h5
-rw-r--r--arch/x86/include/asm/nmi.h3
-rw-r--r--arch/x86/include/asm/page_64.h15
-rw-r--r--arch/x86/include/asm/processor.h2
-rw-r--r--arch/x86/include/asm/ptrace.h16
-rw-r--r--arch/x86/include/asm/pvclock.h4
-rw-r--r--arch/x86/include/asm/required-features.h2
-rw-r--r--arch/x86/include/asm/segment.h4
-rw-r--r--arch/x86/include/asm/thread_info.h8
-rw-r--r--arch/x86/include/asm/topology.h2
-rw-r--r--arch/x86/include/asm/traps.h6
-rw-r--r--arch/x86/include/asm/uv/uv_hub.h2
-rw-r--r--arch/x86/include/asm/xen/hypercall.h1
-rw-r--r--arch/x86/include/asm/xen/hypervisor.h2
-rw-r--r--arch/x86/include/asm/xen/interface.h25
-rw-r--r--arch/x86/include/mach-xen/asm/agp.h58
-rw-r--r--arch/x86/include/mach-xen/asm/cmpxchg_32.h26
-rw-r--r--arch/x86/include/mach-xen/asm/cmpxchg_64.h13
-rw-r--r--arch/x86/include/mach-xen/asm/desc.h434
-rw-r--r--arch/x86/include/mach-xen/asm/dma-mapping.h23
-rw-r--r--arch/x86/include/mach-xen/asm/fixmap.h242
-rw-r--r--arch/x86/include/mach-xen/asm/fpu-internal.h66
-rw-r--r--arch/x86/include/mach-xen/asm/gnttab_dma.h49
-rw-r--r--arch/x86/include/mach-xen/asm/highmem.h98
-rw-r--r--arch/x86/include/mach-xen/asm/hypercall.h434
-rw-r--r--arch/x86/include/mach-xen/asm/hypercall_32.h62
-rw-r--r--arch/x86/include/mach-xen/asm/hypercall_64.h54
-rw-r--r--arch/x86/include/mach-xen/asm/hypervisor.h403
-rw-r--r--arch/x86/include/mach-xen/asm/io.h343
-rw-r--r--arch/x86/include/mach-xen/asm/ipi.h13
-rw-r--r--arch/x86/include/mach-xen/asm/irq_vectors.h98
-rw-r--r--arch/x86/include/mach-xen/asm/irqflags.h213
-rw-r--r--arch/x86/include/mach-xen/asm/kbdleds.h16
-rw-r--r--arch/x86/include/mach-xen/asm/mach_traps.h37
-rw-r--r--arch/x86/include/mach-xen/asm/maddr.h155
-rw-r--r--arch/x86/include/mach-xen/asm/maddr_32.h35
-rw-r--r--arch/x86/include/mach-xen/asm/maddr_64.h21
-rw-r--r--arch/x86/include/mach-xen/asm/mmu_context.h165
-rw-r--r--arch/x86/include/mach-xen/asm/mutex.h3
-rw-r--r--arch/x86/include/mach-xen/asm/pci.h198
-rw-r--r--arch/x86/include/mach-xen/asm/percpu.h75
-rw-r--r--arch/x86/include/mach-xen/asm/perf_event.h44
-rw-r--r--arch/x86/include/mach-xen/asm/pgalloc.h159
-rw-r--r--arch/x86/include/mach-xen/asm/pgtable-3level.h206
-rw-r--r--arch/x86/include/mach-xen/asm/pgtable-3level_types.h44
-rw-r--r--arch/x86/include/mach-xen/asm/pgtable.h939
-rw-r--r--arch/x86/include/mach-xen/asm/pgtable_32.h83
-rw-r--r--arch/x86/include/mach-xen/asm/pgtable_64.h206
-rw-r--r--arch/x86/include/mach-xen/asm/pgtable_64_types.h68
-rw-r--r--arch/x86/include/mach-xen/asm/pgtable_types.h413
-rw-r--r--arch/x86/include/mach-xen/asm/probe_roms.h10
-rw-r--r--arch/x86/include/mach-xen/asm/processor.h984
-rw-r--r--arch/x86/include/mach-xen/asm/pvclock-abi.h12
-rw-r--r--arch/x86/include/mach-xen/asm/setup.h16
-rw-r--r--arch/x86/include/mach-xen/asm/smp-processor-id.h36
-rw-r--r--arch/x86/include/mach-xen/asm/smp.h240
-rw-r--r--arch/x86/include/mach-xen/asm/special_insns.h247
-rw-r--r--arch/x86/include/mach-xen/asm/spinlock.h367
-rw-r--r--arch/x86/include/mach-xen/asm/spinlock_types.h62
-rw-r--r--arch/x86/include/mach-xen/asm/swiotlb.h8
-rw-r--r--arch/x86/include/mach-xen/asm/switch_to.h9
-rw-r--r--arch/x86/include/mach-xen/asm/time.h18
-rw-r--r--arch/x86/include/mach-xen/asm/tlbflush.h114
-rw-r--r--arch/x86/include/mach-xen/asm/tsc.h5
-rw-r--r--arch/x86/include/mach-xen/asm/vga.h26
-rw-r--r--arch/x86/include/mach-xen/asm/xenoprof.h48
-rw-r--r--arch/x86/include/mach-xen/asm/xor.h28
-rw-r--r--arch/x86/include/uapi/asm/e820.h4
-rw-r--r--arch/x86/kernel/Makefile6
-rw-r--r--arch/x86/kernel/acpi/Makefile4
-rw-r--r--arch/x86/kernel/acpi/boot.c27
-rw-r--r--arch/x86/kernel/acpi/processor_extcntl_xen.c310
-rw-r--r--arch/x86/kernel/amd_nb.c6
-rw-r--r--arch/x86/kernel/apic/Makefile4
-rw-r--r--arch/x86/kernel/apic/apic-xen.c69
-rw-r--r--arch/x86/kernel/apic/hw_nmi.c8
-rw-r--r--arch/x86/kernel/apic/io_apic-xen.c4031
-rw-r--r--arch/x86/kernel/apic/ipi-xen.c43
-rw-r--r--arch/x86/kernel/apic/probe_32-xen.c57
-rw-r--r--arch/x86/kernel/asm-offsets.c4
-rw-r--r--arch/x86/kernel/asm-offsets_32.c14
-rw-r--r--arch/x86/kernel/asm-offsets_64.c2
-rw-r--r--arch/x86/kernel/cpu/Makefile2
-rw-r--r--arch/x86/kernel/cpu/amd.c23
-rw-r--r--arch/x86/kernel/cpu/bugs.c2
-rw-r--r--arch/x86/kernel/cpu/bugs_64.c2
-rw-r--r--arch/x86/kernel/cpu/common-xen.c1482
-rw-r--r--arch/x86/kernel/cpu/intel.c15
-rw-r--r--arch/x86/kernel/cpu/intel_cacheinfo.c17
-rw-r--r--arch/x86/kernel/cpu/mcheck/Makefile1
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce-inject.c4
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce.c28
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_dom0.c185
-rw-r--r--arch/x86/kernel/cpu/mtrr/Makefile1
-rw-r--r--arch/x86/kernel/cpu/mtrr/main-xen.c326
-rw-r--r--arch/x86/kernel/cpu/proc.c8
-rw-r--r--arch/x86/kernel/cpu/scattered.c2
-rw-r--r--arch/x86/kernel/cpu/topology.c2
-rw-r--r--arch/x86/kernel/dumpstack_64.c4
-rw-r--r--arch/x86/kernel/e820-xen.c1308
-rw-r--r--arch/x86/kernel/early_printk-xen.c274
-rw-r--r--arch/x86/kernel/entry_32-xen.S1731
-rw-r--r--arch/x86/kernel/entry_32.S10
-rw-r--r--arch/x86/kernel/entry_64-xen.S1465
-rw-r--r--arch/x86/kernel/entry_64.S6
-rw-r--r--arch/x86/kernel/fixup.c89
-rw-r--r--arch/x86/kernel/head-xen.c244
-rw-r--r--arch/x86/kernel/head32-xen.c84
-rw-r--r--arch/x86/kernel/head64-xen.c227
-rw-r--r--arch/x86/kernel/head_32-xen.S238
-rw-r--r--arch/x86/kernel/head_64-xen.S203
-rw-r--r--arch/x86/kernel/head_64.S2
-rw-r--r--arch/x86/kernel/ioport-xen.c84
-rw-r--r--arch/x86/kernel/irq-xen.c375
-rw-r--r--arch/x86/kernel/irq_64.c6
-rw-r--r--arch/x86/kernel/irq_work-xen.c21
-rw-r--r--arch/x86/kernel/ldt-xen.c271
-rw-r--r--arch/x86/kernel/machine_kexec_32.c107
-rw-r--r--arch/x86/kernel/machine_kexec_64.c101
-rw-r--r--arch/x86/kernel/machine_kexec_xen.c29
-rw-r--r--arch/x86/kernel/microcode_core-xen.c303
-rw-r--r--arch/x86/kernel/mmconf-fam10h_64.c8
-rw-r--r--arch/x86/kernel/mpparse-xen.c960
-rw-r--r--arch/x86/kernel/msr-xen.c339
-rw-r--r--arch/x86/kernel/nmi.c17
-rw-r--r--arch/x86/kernel/pci-dma-xen.c362
-rw-r--r--arch/x86/kernel/pci-nommu-xen.c108
-rw-r--r--arch/x86/kernel/pcspeaker.c5
-rw-r--r--arch/x86/kernel/probe_roms.c7
-rw-r--r--arch/x86/kernel/process-xen.c482
-rw-r--r--arch/x86/kernel/process.c2
-rw-r--r--arch/x86/kernel/process_32-xen.c407
-rw-r--r--arch/x86/kernel/process_64-xen.c607
-rw-r--r--arch/x86/kernel/pvclock.c2
-rw-r--r--arch/x86/kernel/quirks.c17
-rw-r--r--arch/x86/kernel/relocate_kernel_32.S39
-rw-r--r--arch/x86/kernel/relocate_kernel_64.S36
-rw-r--r--arch/x86/kernel/resource.c8
-rw-r--r--arch/x86/kernel/rtc.c9
-rw-r--r--arch/x86/kernel/setup-xen.c1608
-rw-r--r--arch/x86/kernel/setup_percpu.c4
-rw-r--r--arch/x86/kernel/smp-xen.c254
-rw-r--r--arch/x86/kernel/syscall_32-xen.c20
-rw-r--r--arch/x86/kernel/tboot.c6
-rw-r--r--arch/x86/kernel/time-xen.c668
-rw-r--r--arch/x86/kernel/topology.c6
-rw-r--r--arch/x86/kernel/traps-xen.c806
-rw-r--r--arch/x86/kernel/vm86_32.c12
-rw-r--r--arch/x86/kernel/vmlinux.lds.S10
-rw-r--r--arch/x86/kernel/vsyscall_64-xen.c408
-rw-r--r--arch/x86/kernel/x8664_ksyms_64.c2
-rw-r--r--arch/x86/kernel/x86_init-xen.c100
-rw-r--r--arch/x86/kvm/Kconfig1
-rw-r--r--arch/x86/lib/Makefile3
-rw-r--r--arch/x86/lib/cache-smp-xen.c27
-rw-r--r--arch/x86/lib/scrub.c21
-rw-r--r--arch/x86/mm/Makefile3
-rw-r--r--arch/x86/mm/dump_pagetables-xen.c392
-rw-r--r--arch/x86/mm/fault-xen.c1262
-rw-r--r--arch/x86/mm/highmem_32-xen.c195
-rw-r--r--arch/x86/mm/hypervisor.c1303
-rw-r--r--arch/x86/mm/init-xen.c624
-rw-r--r--arch/x86/mm/init_32-xen.c980
-rw-r--r--arch/x86/mm/init_64-xen.c1767
-rw-r--r--arch/x86/mm/iomap_32-xen.c121
-rw-r--r--arch/x86/mm/ioremap-xen.c821
-rw-r--r--arch/x86/mm/mm_internal.h2
-rw-r--r--arch/x86/mm/pageattr-xen.c1611
-rw-r--r--arch/x86/mm/pat-xen.c874
-rw-r--r--arch/x86/mm/pat_internal.h4
-rw-r--r--arch/x86/mm/pgtable-xen.c988
-rw-r--r--arch/x86/mm/pgtable_32-xen.c178
-rw-r--r--arch/x86/mm/tlb-xen.c155
-rw-r--r--arch/x86/oprofile/Makefile7
-rw-r--r--arch/x86/oprofile/xenoprof.c179
-rw-r--r--arch/x86/pci/Makefile3
-rw-r--r--arch/x86/pci/amd_bus.c15
-rw-r--r--arch/x86/pci/common.c22
-rw-r--r--arch/x86/pci/i386.c2
-rw-r--r--arch/x86/pci/irq.c9
-rw-r--r--arch/x86/pci/mmconfig-shared.c38
-rw-r--r--arch/x86/pci/pcifront.c59
-rw-r--r--arch/x86/platform/efi/Makefile2
-rw-r--r--arch/x86/platform/efi/efi-xen.c714
-rw-r--r--arch/x86/platform/sfi/sfi.c7
-rw-r--r--arch/x86/power/Makefile2
-rw-r--r--arch/x86/syscalls/Makefile10
-rw-r--r--arch/x86/vdso/Makefile1
-rw-r--r--arch/x86/vdso/vclock_gettime.c21
-rw-r--r--arch/x86/vdso/vdso32-setup-xen.c483
-rw-r--r--arch/x86/vdso/vdso32.S2
-rw-r--r--arch/x86/vdso/vdso32/note.S6
-rw-r--r--arch/x86/vdso/vdso32/syscall.S2
-rw-r--r--arch/x86/xen/Kconfig19
-rw-r--r--arch/x86/xen/enlighten.c10
-rw-r--r--arch/x86/xen/xen-head.S4
-rw-r--r--drivers/Makefile3
-rw-r--r--drivers/acpi/Kconfig14
-rw-r--r--drivers/acpi/Makefile1
-rw-r--r--drivers/acpi/acpi_memhotplug.c17
-rw-r--r--drivers/acpi/acpi_pad-xen.c234
-rw-r--r--drivers/acpi/acpica/hwesleep.c8
-rw-r--r--drivers/acpi/acpica/hwsleep.c2
-rw-r--r--drivers/acpi/acpica/hwxfsleep.c2
-rw-r--r--drivers/acpi/apei/ghes.c4
-rw-r--r--drivers/acpi/osl.c20
-rw-r--r--drivers/acpi/pci_irq.c77
-rw-r--r--drivers/acpi/pci_root.c70
-rw-r--r--drivers/acpi/processor_core.c51
-rw-r--r--drivers/acpi/processor_driver.c167
-rw-r--r--drivers/acpi/processor_extcntl.c214
-rw-r--r--drivers/acpi/processor_idle.c51
-rw-r--r--drivers/acpi/processor_perflib.c27
-rw-r--r--drivers/acpi/sleep.c2
-rw-r--r--drivers/base/cpu.c4
-rw-r--r--drivers/block/Kconfig10
-rw-r--r--drivers/block/Makefile4
-rw-r--r--drivers/block/floppy.c2
-rw-r--r--drivers/block/xen-blkback/Makefile2
-rw-r--r--drivers/cdrom/Makefile1
-rw-r--r--drivers/char/Kconfig2
-rw-r--r--drivers/char/agp/agp.h4
-rw-r--r--drivers/char/agp/amd-k7-agp.c4
-rw-r--r--drivers/char/agp/amd64-agp.c4
-rw-r--r--drivers/char/agp/ati-agp.c4
-rw-r--r--drivers/char/agp/efficeon-agp.c2
-rw-r--r--drivers/char/agp/generic.c8
-rw-r--r--drivers/char/agp/intel-gtt.c22
-rw-r--r--drivers/char/agp/sworks-agp.c6
-rw-r--r--drivers/char/mem.c16
-rw-r--r--drivers/char/tpm/Kconfig9
-rw-r--r--drivers/char/tpm/Makefile2
-rw-r--r--drivers/char/tpm/tpm.h10
-rw-r--r--drivers/char/tpm/tpm_vtpm.c548
-rw-r--r--drivers/char/tpm/tpm_vtpm.h55
-rw-r--r--drivers/char/tpm/tpm_xen.c718
-rw-r--r--drivers/cpufreq/Kconfig1
-rw-r--r--drivers/cpuidle/Kconfig1
-rw-r--r--drivers/dma/Kconfig2
-rw-r--r--drivers/dma/ioat/Makefile3
-rw-r--r--drivers/dma/ioat/dca.c12
-rw-r--r--drivers/dma/ioat/dma.h17
-rw-r--r--drivers/dma/ioat/dma_v2.h6
-rw-r--r--drivers/dma/ioat/hw.h4
-rw-r--r--drivers/dma/ioat/pci.c7
-rw-r--r--drivers/edac/Kconfig7
-rw-r--r--drivers/edac/edac_mc.c4
-rw-r--r--drivers/edac/sb_edac.c4
-rw-r--r--drivers/firmware/Kconfig3
-rw-r--r--drivers/firmware/dcdbas.c28
-rw-r--r--drivers/firmware/dell_rbu.c45
-rw-r--r--drivers/firmware/dmi_scan.c5
-rw-r--r--drivers/gpu/drm/i915/i915_drv.c4
-rw-r--r--drivers/gpu/drm/i915/i915_drv.h5
-rw-r--r--drivers/gpu/drm/i915/i915_gem.c11
-rw-r--r--drivers/gpu/drm/i915/i915_gem_gtt.c7
-rw-r--r--drivers/gpu/drm/i915/intel_pm.c4
-rw-r--r--drivers/gpu/drm/radeon/radeon_device.c12
-rw-r--r--drivers/gpu/drm/ttm/ttm_bo.c8
-rw-r--r--drivers/gpu/drm/ttm/ttm_bo_vm.c6
-rw-r--r--drivers/gpu/drm/ttm/ttm_page_alloc.c27
-rw-r--r--drivers/gpu/drm/vmwgfx/Kconfig2
-rw-r--r--drivers/hwmon/Kconfig6
-rw-r--r--drivers/hwmon/coretemp-xen.c922
-rw-r--r--drivers/hwmon/via-cputemp-xen.c397
-rw-r--r--drivers/ide/ide-lib.c11
-rw-r--r--drivers/idle/Kconfig2
-rw-r--r--drivers/input/misc/Kconfig2
-rw-r--r--drivers/iommu/Kconfig1
-rw-r--r--drivers/misc/vmw_vmci/Kconfig2
-rw-r--r--drivers/net/Kconfig14
-rw-r--r--drivers/net/Makefile4
-rw-r--r--drivers/net/caif/Kconfig2
-rw-r--r--drivers/net/ethernet/chelsio/cxgb3/cxgb3_main.c10
-rw-r--r--drivers/net/ethernet/chelsio/cxgb3/sge.c62
-rw-r--r--drivers/net/ethernet/chelsio/cxgb3/version.h4
-rw-r--r--drivers/net/xen-netback/Makefile2
-rw-r--r--drivers/oprofile/buffer_sync.c81
-rw-r--r--drivers/oprofile/cpu_buffer.c73
-rw-r--r--drivers/oprofile/cpu_buffer.h12
-rw-r--r--drivers/oprofile/event_buffer.h3
-rw-r--r--drivers/oprofile/oprof.c32
-rw-r--r--drivers/oprofile/oprof.h3
-rw-r--r--drivers/oprofile/oprofile_files.c145
-rw-r--r--drivers/pci/Kconfig54
-rw-r--r--drivers/pci/Makefile7
-rw-r--r--drivers/pci/guestdev.c882
-rw-r--r--drivers/pci/iomulti.c899
-rw-r--r--drivers/pci/iomulti.h122
-rw-r--r--drivers/pci/msi-xen.c1018
-rw-r--r--drivers/pci/pci-iomul.c440
-rw-r--r--drivers/pci/pci.c7
-rw-r--r--drivers/pci/pci.h23
-rw-r--r--drivers/pci/probe.c16
-rw-r--r--drivers/pci/reserve.c137
-rw-r--r--drivers/pci/setup-bus.c8
-rw-r--r--drivers/platform/x86/Kconfig2
-rw-r--r--drivers/remoteproc/Kconfig1
-rw-r--r--drivers/rtc/Kconfig2
-rw-r--r--drivers/scsi/Kconfig2
-rw-r--r--drivers/scsi/arcmsr/arcmsr.h2
-rw-r--r--drivers/scsi/lpfc/lpfc_init.c7
-rw-r--r--drivers/scsi/scsi_error.c25
-rw-r--r--drivers/sfi/sfi_core.c5
-rw-r--r--drivers/thermal/Kconfig2
-rw-r--r--drivers/tty/hvc/Kconfig2
-rw-r--r--drivers/tty/serial/8250/Kconfig1
-rw-r--r--drivers/tty/tty_io.c9
-rw-r--r--drivers/video/Kconfig4
-rw-r--r--drivers/virtio/Kconfig1
-rw-r--r--drivers/watchdog/Kconfig2
-rw-r--r--drivers/watchdog/xen_wdt.c11
-rw-r--r--drivers/xen/Kconfig526
-rw-r--r--drivers/xen/Makefile63
-rw-r--r--drivers/xen/acpi.c26
-rw-r--r--drivers/xen/balloon/Makefile2
-rw-r--r--drivers/xen/balloon/balloon.c801
-rw-r--r--drivers/xen/balloon/common.h57
-rw-r--r--drivers/xen/balloon/sysfs.c209
-rw-r--r--drivers/xen/blkback/Makefile4
-rw-r--r--drivers/xen/blkback/blkback-pagemap.c97
-rw-r--r--drivers/xen/blkback/blkback-pagemap.h38
-rw-r--r--drivers/xen/blkback/blkback.c781
-rw-r--r--drivers/xen/blkback/cdrom.c154
-rw-r--r--drivers/xen/blkback/common.h161
-rw-r--r--drivers/xen/blkback/interface.c161
-rw-r--r--drivers/xen/blkback/vbd.c212
-rw-r--r--drivers/xen/blkback/xenbus.c680
-rw-r--r--drivers/xen/blkfront/Makefile5
-rw-r--r--drivers/xen/blkfront/blkfront.c1408
-rw-r--r--drivers/xen/blkfront/block.h181
-rw-r--r--drivers/xen/blkfront/vbd.c612
-rw-r--r--drivers/xen/blkfront/vcd.c504
-rw-r--r--drivers/xen/blktap/Makefile5
-rw-r--r--drivers/xen/blktap/blktap.c1795
-rw-r--r--drivers/xen/blktap/blocktap.c1
-rw-r--r--drivers/xen/blktap/common.h114
-rw-r--r--drivers/xen/blktap/interface.c133
-rw-r--r--drivers/xen/blktap/xenbus.c520
-rw-r--r--drivers/xen/blktap2-new/Makefile4
-rw-r--r--drivers/xen/blktap2-new/blktap.h219
-rw-r--r--drivers/xen/blktap2-new/control.c317
-rw-r--r--drivers/xen/blktap2-new/device.c569
-rw-r--r--drivers/xen/blktap2-new/request.c418
-rw-r--r--drivers/xen/blktap2-new/ring.c546
-rw-r--r--drivers/xen/blktap2-new/sysfs.c299
-rw-r--r--drivers/xen/blktap2/Makefile4
-rw-r--r--drivers/xen/blktap2/blktap.h265
-rw-r--r--drivers/xen/blktap2/control.c286
-rw-r--r--drivers/xen/blktap2/device.c1174
-rw-r--r--drivers/xen/blktap2/request.c296
-rw-r--r--drivers/xen/blktap2/ring.c608
-rw-r--r--drivers/xen/blktap2/sysfs.c475
-rw-r--r--drivers/xen/blktap2/wait_queue.c40
-rw-r--r--drivers/xen/char/Makefile1
-rw-r--r--drivers/xen/char/mem.c222
-rw-r--r--drivers/xen/console/Makefile2
-rw-r--r--drivers/xen/console/console.c778
-rw-r--r--drivers/xen/console/xencons.h12
-rw-r--r--drivers/xen/console/xencons_ring.c129
-rw-r--r--drivers/xen/core/Makefile16
-rw-r--r--drivers/xen/core/acpi_memhotplug.c185
-rw-r--r--drivers/xen/core/clockevents.c304
-rw-r--r--drivers/xen/core/cpu_hotplug.c182
-rw-r--r--drivers/xen/core/domctl.c579
-rw-r--r--drivers/xen/core/domctl.h4
-rw-r--r--drivers/xen/core/evtchn.c2017
-rw-r--r--drivers/xen/core/firmware.c103
-rw-r--r--drivers/xen/core/gnttab.c1006
-rw-r--r--drivers/xen/core/machine_kexec.c403
-rw-r--r--drivers/xen/core/machine_reboot.c293
-rw-r--r--drivers/xen/core/pcpu.c447
-rw-r--r--drivers/xen/core/reboot.c349
-rw-r--r--drivers/xen/core/smpboot.c406
-rw-r--r--drivers/xen/core/spinlock.c420
-rw-r--r--drivers/xen/core/xen_proc.c31
-rw-r--r--drivers/xen/dbgp.c4
-rw-r--r--drivers/xen/evtchn.c18
-rw-r--r--drivers/xen/fallback.c11
-rw-r--r--drivers/xen/fbfront/Makefile2
-rw-r--r--drivers/xen/fbfront/xenfb.c912
-rw-r--r--drivers/xen/fbfront/xenkbd.c368
-rw-r--r--drivers/xen/features.c9
-rw-r--r--drivers/xen/gntdev/Makefile1
-rw-r--r--drivers/xen/gntdev/gntdev.c978
-rw-r--r--drivers/xen/netback/Makefile5
-rw-r--r--drivers/xen/netback/accel.c263
-rw-r--r--drivers/xen/netback/common.h289
-rw-r--r--drivers/xen/netback/interface.c366
-rw-r--r--drivers/xen/netback/loopback.c279
-rw-r--r--drivers/xen/netback/netback.c2025
-rw-r--r--drivers/xen/netback/xenbus.c489
-rw-r--r--drivers/xen/netfront/Makefile4
-rw-r--r--drivers/xen/netfront/accel.c828
-rw-r--r--drivers/xen/netfront/netfront.c2300
-rw-r--r--drivers/xen/netfront/netfront.h289
-rw-r--r--drivers/xen/pci.c11
-rw-r--r--drivers/xen/pcifront/Makefile5
-rw-r--r--drivers/xen/pcifront/pci.c44
-rw-r--r--drivers/xen/pcifront/pci_op.c657
-rw-r--r--drivers/xen/pcifront/pcifront.h89
-rw-r--r--drivers/xen/pcifront/xenbus.c471
-rw-r--r--drivers/xen/privcmd/Makefile3
-rw-r--r--drivers/xen/privcmd/compat_privcmd.c140
-rw-r--r--drivers/xen/privcmd/privcmd.c475
-rw-r--r--drivers/xen/scsiback/Makefile4
-rw-r--r--drivers/xen/scsiback/common.h174
-rw-r--r--drivers/xen/scsiback/emulate.c480
-rw-r--r--drivers/xen/scsiback/interface.c141
-rw-r--r--drivers/xen/scsiback/scsiback.c729
-rw-r--r--drivers/xen/scsiback/translate.c168
-rw-r--r--drivers/xen/scsiback/xenbus.c375
-rw-r--r--drivers/xen/scsifront/Makefile3
-rw-r--r--drivers/xen/scsifront/common.h130
-rw-r--r--drivers/xen/scsifront/scsifront.c510
-rw-r--r--drivers/xen/scsifront/xenbus.c424
-rw-r--r--drivers/xen/sfc_netback/Makefile12
-rw-r--r--drivers/xen/sfc_netback/accel.c147
-rw-r--r--drivers/xen/sfc_netback/accel.h392
-rw-r--r--drivers/xen/sfc_netback/accel_debugfs.c148
-rw-r--r--drivers/xen/sfc_netback/accel_fwd.c420
-rw-r--r--drivers/xen/sfc_netback/accel_msg.c391
-rw-r--r--drivers/xen/sfc_netback/accel_solarflare.c1292
-rw-r--r--drivers/xen/sfc_netback/accel_solarflare.h88
-rw-r--r--drivers/xen/sfc_netback/accel_xenbus.c831
-rw-r--r--drivers/xen/sfc_netback/ci/compat.h53
-rw-r--r--drivers/xen/sfc_netback/ci/compat/gcc.h158
-rw-r--r--drivers/xen/sfc_netback/ci/compat/gcc_x86.h115
-rw-r--r--drivers/xen/sfc_netback/ci/compat/primitive.h77
-rw-r--r--drivers/xen/sfc_netback/ci/compat/sysdep.h166
-rw-r--r--drivers/xen/sfc_netback/ci/compat/utils.h269
-rw-r--r--drivers/xen/sfc_netback/ci/compat/x86.h48
-rw-r--r--drivers/xen/sfc_netback/ci/compat/x86_64.h54
-rw-r--r--drivers/xen/sfc_netback/ci/tools/config.h49
-rw-r--r--drivers/xen/sfc_netback/ci/tools/debug.h336
-rw-r--r--drivers/xen/sfc_netback/ci/tools/log.h269
-rw-r--r--drivers/xen/sfc_netback/ci/tools/platform/gcc_x86.h370
-rw-r--r--drivers/xen/sfc_netback/ci/tools/platform/linux_kernel.h361
-rw-r--r--drivers/xen/sfc_netback/ci/tools/sysdep.h132
-rw-r--r--drivers/xen/sfc_netfront/Makefile11
-rw-r--r--drivers/xen/sfc_netfront/accel.h495
-rw-r--r--drivers/xen/sfc_netfront/accel_bufs.c393
-rw-r--r--drivers/xen/sfc_netfront/accel_bufs.h181
-rw-r--r--drivers/xen/sfc_netfront/accel_debugfs.c227
-rw-r--r--drivers/xen/sfc_netfront/accel_msg.c567
-rw-r--r--drivers/xen/sfc_netfront/accel_netfront.c330
-rw-r--r--drivers/xen/sfc_netfront/accel_ssr.c308
-rw-r--r--drivers/xen/sfc_netfront/accel_ssr.h88
-rw-r--r--drivers/xen/sfc_netfront/accel_tso.c509
-rw-r--r--drivers/xen/sfc_netfront/accel_tso.h57
-rw-r--r--drivers/xen/sfc_netfront/accel_vi.c1203
-rw-r--r--drivers/xen/sfc_netfront/accel_xenbus.c775
-rw-r--r--drivers/xen/sfc_netfront/ef_vi_falcon.h172
-rw-r--r--drivers/xen/sfc_netfront/ef_vi_falcon_core.h1075
-rw-r--r--drivers/xen/sfc_netfront/ef_vi_falcon_desc.h43
-rw-r--r--drivers/xen/sfc_netfront/ef_vi_falcon_event.h123
-rw-r--r--drivers/xen/sfc_netfront/ef_vi_internal.h256
-rw-r--r--drivers/xen/sfc_netfront/etherfabric/ef_vi.h647
-rw-r--r--drivers/xen/sfc_netfront/falcon_event.c346
-rw-r--r--drivers/xen/sfc_netfront/falcon_vi.c473
-rw-r--r--drivers/xen/sfc_netfront/pt_tx.c91
-rw-r--r--drivers/xen/sfc_netfront/sysdep.h185
-rw-r--r--drivers/xen/sfc_netfront/vi_init.c183
-rw-r--r--drivers/xen/sfc_netutil/Makefile11
-rw-r--r--drivers/xen/sfc_netutil/accel_cuckoo_hash.c649
-rw-r--r--drivers/xen/sfc_netutil/accel_cuckoo_hash.h227
-rw-r--r--drivers/xen/sfc_netutil/accel_msg_iface.c301
-rw-r--r--drivers/xen/sfc_netutil/accel_msg_iface.h415
-rw-r--r--drivers/xen/sfc_netutil/accel_shared_fifo.h127
-rw-r--r--drivers/xen/sfc_netutil/accel_util.c336
-rw-r--r--drivers/xen/sfc_netutil/accel_util.h124
-rw-r--r--drivers/xen/sys-hypervisor.c49
-rw-r--r--drivers/xen/tmem.c35
-rw-r--r--drivers/xen/tpmback/Makefile4
-rw-r--r--drivers/xen/tpmback/common.h93
-rw-r--r--drivers/xen/tpmback/interface.c133
-rw-r--r--drivers/xen/tpmback/tpmback.c947
-rw-r--r--drivers/xen/tpmback/xenbus.c268
-rw-r--r--drivers/xen/usbback/Makefile4
-rw-r--r--drivers/xen/usbback/interface.c190
-rw-r--r--drivers/xen/usbback/usbback.c1199
-rw-r--r--drivers/xen/usbback/usbback.h171
-rw-r--r--drivers/xen/usbback/usbstub.c324
-rw-r--r--drivers/xen/usbback/xenbus.c334
-rw-r--r--drivers/xen/usbfront/Makefile11
-rw-r--r--drivers/xen/usbfront/usbfront-dbg.c101
-rw-r--r--drivers/xen/usbfront/usbfront-hcd.c232
-rw-r--r--drivers/xen/usbfront/usbfront-hub.c471
-rw-r--r--drivers/xen/usbfront/usbfront-q.c542
-rw-r--r--drivers/xen/usbfront/usbfront.h198
-rw-r--r--drivers/xen/usbfront/xenbus.c415
-rw-r--r--drivers/xen/util.c74
-rw-r--r--drivers/xen/xen-pciback/Makefile16
-rw-r--r--drivers/xen/xen-pciback/conf_space_capability.c15
-rw-r--r--drivers/xen/xen-pciback/conf_space_header.c9
-rw-r--r--drivers/xen/xen-pciback/controller.c450
-rw-r--r--drivers/xen/xen-pciback/pci_stub.c53
-rw-r--r--drivers/xen/xen-pciback/pciback.h21
-rw-r--r--drivers/xen/xen-pciback/pciback_ops.c52
-rw-r--r--drivers/xen/xen-pciback/slot.c200
-rw-r--r--drivers/xen/xen-pciback/xenbus.c122
-rw-r--r--drivers/xen/xen-selfballoon.c5
-rw-r--r--drivers/xen/xenbus/Makefile23
-rw-r--r--drivers/xen/xenbus/xenbus_backend_client.c145
-rw-r--r--drivers/xen/xenbus/xenbus_client.c137
-rw-r--r--drivers/xen/xenbus/xenbus_comms.c73
-rw-r--r--drivers/xen/xenbus/xenbus_comms.h23
-rw-r--r--drivers/xen/xenbus/xenbus_dev.c507
-rw-r--r--drivers/xen/xenbus/xenbus_dev_backend.c12
-rw-r--r--drivers/xen/xenbus/xenbus_probe.c963
-rw-r--r--drivers/xen/xenbus/xenbus_probe.h32
-rw-r--r--drivers/xen/xenbus/xenbus_probe_backend.c80
-rw-r--r--drivers/xen/xenbus/xenbus_xs.c153
-rw-r--r--drivers/xen/xenoprof/xenoprofile.c585
-rw-r--r--fs/Kconfig1
-rw-r--r--fs/aio.c120
-rw-r--r--fs/compat_ioctl.c23
-rw-r--r--fs/proc/kcore.c6
-rw-r--r--fs/proc/task_mmu.c4
-rw-r--r--include/acpi/processor.h141
-rw-r--r--include/drm/intel-gtt.h2
-rw-r--r--include/linux/acpi.h12
-rw-r--r--include/linux/clocksource.h2
-rw-r--r--include/linux/console.h1
-rw-r--r--include/linux/cpufreq.h2
-rw-r--r--include/linux/efi.h4
-rw-r--r--include/linux/elfnote.h2
-rw-r--r--include/linux/highmem.h6
-rw-r--r--include/linux/interrupt.h5
-rw-r--r--include/linux/kexec.h20
-rw-r--r--include/linux/mm.h21
-rw-r--r--include/linux/msi.h7
-rw-r--r--include/linux/nmi.h3
-rw-r--r--include/linux/oprofile.h20
-rw-r--r--include/linux/page-flags.h46
-rw-r--r--include/linux/pci.h17
-rw-r--r--include/linux/pci_ids.h1
-rw-r--r--include/linux/timex.h3
-rw-r--r--include/linux/usb/ehci_def.h2
-rw-r--r--include/linux/vermagic.h7
-rw-r--r--include/uapi/linux/sysctl.h1
-rw-r--r--include/uapi/xen/Kbuild3
-rw-r--r--include/uapi/xen/evtchn.h89
-rw-r--r--include/uapi/xen/gntdev.h1
-rw-r--r--include/uapi/xen/privcmd.h99
-rw-r--r--include/uapi/xen/public/Kbuild5
-rw-r--r--include/uapi/xen/public/evtchn.h88
-rw-r--r--include/uapi/xen/public/gntdev.h (renamed from include/xen/gntdev.h)0
-rw-r--r--include/uapi/xen/public/iomulti.h50
-rw-r--r--include/uapi/xen/public/privcmd.h100
-rw-r--r--include/uapi/xen/public/xenbus.h52
-rw-r--r--include/xen/acpi.h4
-rw-r--r--include/xen/balloon.h67
-rw-r--r--include/xen/barrier.h10
-rw-r--r--include/xen/blkif.h160
-rw-r--r--include/xen/clock.h21
-rw-r--r--include/xen/compat_ioctl.h75
-rw-r--r--include/xen/cpu_hotplug.h39
-rw-r--r--include/xen/driver_util.h14
-rw-r--r--include/xen/evtchn.h235
-rw-r--r--include/xen/features.h3
-rw-r--r--include/xen/firmware.h16
-rw-r--r--include/xen/gnttab.h212
-rw-r--r--include/xen/hvm.h9
-rw-r--r--include/xen/hypercall.h30
-rw-r--r--include/xen/interface/COPYING38
-rw-r--r--include/xen/interface/arch-x86/cpuid.h68
-rw-r--r--include/xen/interface/arch-x86/hvm/save.h600
-rw-r--r--include/xen/interface/arch-x86/xen-mca.h440
-rw-r--r--include/xen/interface/arch-x86/xen-x86_32.h171
-rw-r--r--include/xen/interface/arch-x86/xen-x86_64.h202
-rw-r--r--include/xen/interface/arch-x86/xen.h263
-rw-r--r--include/xen/interface/arch-x86_32.h27
-rw-r--r--include/xen/interface/arch-x86_64.h43
-rw-r--r--include/xen/interface/callback.h11
-rw-r--r--include/xen/interface/dom0_ops.h120
-rw-r--r--include/xen/interface/domctl.h1008
-rw-r--r--include/xen/interface/elfnote.h120
-rw-r--r--include/xen/interface/event_channel.h119
-rw-r--r--include/xen/interface/features.h29
-rw-r--r--include/xen/interface/gcov.h115
-rw-r--r--include/xen/interface/grant_table.h180
-rw-r--r--include/xen/interface/hvm/e820.h34
-rw-r--r--include/xen/interface/hvm/hvm_info_table.h72
-rw-r--r--include/xen/interface/hvm/hvm_op.h229
-rw-r--r--include/xen/interface/hvm/hvm_xs_strings.h79
-rw-r--r--include/xen/interface/hvm/ioreq.h140
-rw-r--r--include/xen/interface/hvm/params.h63
-rw-r--r--include/xen/interface/hvm/pvdrivers.h47
-rw-r--r--include/xen/interface/hvm/save.h113
-rw-r--r--include/xen/interface/io/blkif.h589
-rw-r--r--include/xen/interface/io/cdromif.h120
-rw-r--r--include/xen/interface/io/console.h18
-rw-r--r--include/xen/interface/io/fbif.h26
-rw-r--r--include/xen/interface/io/fsif.h192
-rw-r--r--include/xen/interface/io/libxenvchan.h97
-rw-r--r--include/xen/interface/io/netif.h77
-rw-r--r--include/xen/interface/io/protocols.h22
-rw-r--r--include/xen/interface/io/ring.h94
-rw-r--r--include/xen/interface/io/tpmif.h77
-rw-r--r--include/xen/interface/io/usbif.h150
-rw-r--r--include/xen/interface/io/vscsiif.h117
-rw-r--r--include/xen/interface/io/xenbus.h1
-rw-r--r--include/xen/interface/io/xs_wire.h34
-rw-r--r--include/xen/interface/kexec.h168
-rw-r--r--include/xen/interface/mem_event.h82
-rw-r--r--include/xen/interface/memory.h295
-rw-r--r--include/xen/interface/nmi.h85
-rw-r--r--include/xen/interface/physdev.h69
-rw-r--r--include/xen/interface/platform.h252
-rw-r--r--include/xen/interface/sched.h99
-rw-r--r--include/xen/interface/sysctl.h693
-rw-r--r--include/xen/interface/tmem.h148
-rw-r--r--include/xen/interface/trace.h310
-rw-r--r--include/xen/interface/vcpu.h62
-rw-r--r--include/xen/interface/version.h45
-rw-r--r--include/xen/interface/xen-compat.h44
-rw-r--r--include/xen/interface/xen-mca.h4
-rw-r--r--include/xen/interface/xen.h567
-rw-r--r--include/xen/interface/xenoprof.h152
-rw-r--r--include/xen/interface/xsm/acm.h223
-rw-r--r--include/xen/interface/xsm/acm_ops.h159
-rw-r--r--include/xen/interface/xsm/flask_op.h201
-rw-r--r--include/xen/net-util.h69
-rw-r--r--include/xen/pcifront.h31
-rw-r--r--include/xen/pcpu.h19
-rw-r--r--include/xen/sysctl.h11
-rw-r--r--include/xen/xen.h6
-rw-r--r--include/xen/xen_proc.h12
-rw-r--r--include/xen/xenbus.h150
-rw-r--r--include/xen/xencons.h12
-rw-r--r--include/xen/xenoprof.h42
-rw-r--r--kernel/Kconfig.preempt1
-rw-r--r--kernel/context_tracking.c7
-rw-r--r--kernel/irq/chip.c2
-rw-r--r--kernel/irq/spurious.c2
-rw-r--r--kernel/kexec.c95
-rw-r--r--kernel/ksysfs.c4
-rw-r--r--kernel/mutex.c6
-rw-r--r--kernel/power/Kconfig4
-rw-r--r--kernel/sched/cputime.c54
-rw-r--r--kernel/sysctl.c2
-rw-r--r--kernel/sysctl_binary.c12
-rw-r--r--kernel/time/ntp.c5
-rw-r--r--kernel/time/timekeeping.c7
-rw-r--r--lib/swiotlb-xen.c841
-rw-r--r--mm/init-mm.c4
-rw-r--r--mm/memory.c46
-rw-r--r--mm/mmap.c14
-rw-r--r--mm/page_alloc.c34
-rw-r--r--mm/tmem-xen.c56
-rw-r--r--mm/vmalloc.c30
-rw-r--r--net/ipv6/addrconf.c2
-rw-r--r--scripts/Makefile.build15
-rw-r--r--scripts/Makefile.lib6
-rw-r--r--scripts/Makefile.xen.awk34
-rw-r--r--scripts/xen-hypercalls.sh9
700 files changed, 126359 insertions, 1498 deletions
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index fd8d0d594fc7..42bb9b779527 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -463,6 +463,7 @@ manner. The codes are the following:
mm - mixed map area
hg - huge page advise flag
nh - no-huge page advise flag
+ fo - area has foreign pages
mg - mergable advise flag
Note that there is no guarantee that every flag and associated mnemonic will
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index d49250e87383..796cafeb89ab 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -978,6 +978,24 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
Format: <unsigned int> such that (rxsize & ~0x1fffc0) == 0.
Default: 1024
+ guestdev= [PCI,ACPI,XEN]
+ Format: {<device path>|<sbdf>}[,{<device path>|<sbdf>}[,...]]
+ Format of device path: <hid>[:<uid>]-<dev>.<func>[-<dev>.<func>[,...]][+iomul]
+ Format of sbdf: [<segment>:]<bus>:<dev>.<func>[+iomul]
+ Specifies a PCI device to reserve for a guest domain.
+ If a PCI-PCI bridge is specified, all PCI devices
+ behind that bridge are reserved.
+ +iomul means that this PCI function will share
+ IO ports with other +iomul functions under the same
+ switch. NOTE: if +iomul is specified, all functions
+ of the device will share IO ports.
+
+ guestiomuldev= [PCI,ACPI,XEN]
+ Format: <sbd>[,<sbd>][,...]
+ Format of sbd: [<segment>:]<bus>:<dev>
+ Note: the function number must not be specified.
+ Specifies a PCI device for the IO port multiplexing driver.
+
hashdist= [KNL,NUMA] Large hashes allocated during boot
are distributed across NUMA nodes. Defaults on
for 64-bit NUMA, off otherwise.
@@ -2362,6 +2380,13 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
only look for one device below a PCIe downstream
port.
+ pci_reserve= [PCI]
+ Format: [<sbdf>[+IO<size>][+MEM<size>]][,<sbdf>...]
+ Format of sbdf: [<segment>:]<bus>:<dev>.<func>
+ Specifies the minimum I/O and/or memory window size
+ to reserve for a PCI bridge even when no child
+ PCI device exists. This is useful with PCI hotplug.
+
pcie_aspm= [PCIE] Forcibly enable or disable PCIe Active State Power
Management.
off Disable ASPM.
@@ -2679,6 +2704,10 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
Run specified binary instead of /init from the ramdisk,
used for early userspace startup. See initrd.
+ reassign_resources [PCI,ACPI,XEN]
+ Reassign the resources of the devices named by the
+ guestdev= parameter, or of all devices if =all is
+ specified here.
+
reboot= [BUGS=X86-32,BUGS=ARM,BUGS=IA-64] Rebooting mode
Format: <reboot_mode>[,<reboot_mode2>[,...]]
See arch/*/kernel/reboot.c or arch/*/kernel/process.c
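For illustration, the passthrough-related parameters documented in this file combine on the kernel command line roughly as follows. Every segment/bus/device/function number and size below is a placeholder, not a value taken from this patch:

    guestdev=0000:0a:10.0+iomul,PNP0A08:0-3.0
    guestiomuldev=0000:0a:10
    pci_reserve=0000:02:00.0+IO100+MEM200000
    reassign_resources=all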
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 2b4b39ce70b4..702f96928d21 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -1783,9 +1783,9 @@ config CC_STACKPROTECTOR
config XEN_DOM0
def_bool y
- depends on XEN
+ depends on PARAVIRT_XEN
-config XEN
+config PARAVIRT_XEN
bool "Xen guest support on ARM (EXPERIMENTAL)"
depends on ARM && AEABI && OF
depends on CPU_V7 && !CPU_V6
diff --git a/arch/arm/Makefile b/arch/arm/Makefile
index 1ba358ba16b8..59d58867b1e3 100644
--- a/arch/arm/Makefile
+++ b/arch/arm/Makefile
@@ -248,7 +248,7 @@ endif
core-$(CONFIG_FPE_NWFPE) += arch/arm/nwfpe/
core-$(CONFIG_FPE_FASTFPE) += $(FASTFPE_OBJ)
core-$(CONFIG_VFP) += arch/arm/vfp/
-core-$(CONFIG_XEN) += arch/arm/xen/
+core-$(CONFIG_PARAVIRT_XEN) += arch/arm/xen/
core-$(CONFIG_KVM_ARM_HOST) += arch/arm/kvm/
# If we have a machine-specific directory, then include it in the build.
diff --git a/arch/arm/include/asm/xen/interface.h b/arch/arm/include/asm/xen/interface.h
index 44b35d4c3fab..89e6ef358295 100644
--- a/arch/arm/include/asm/xen/interface.h
+++ b/arch/arm/include/asm/xen/interface.h
@@ -11,14 +11,14 @@
#define uint64_aligned_t uint64_t __attribute__((aligned(8)))
-#define __DEFINE_GUEST_HANDLE(name, type) \
+#define __DEFINE_XEN_GUEST_HANDLE(name, type) \
typedef struct { union { type *p; uint64_aligned_t q; }; } \
__guest_handle_ ## name
#define DEFINE_GUEST_HANDLE_STRUCT(name) \
- __DEFINE_GUEST_HANDLE(name, struct name)
-#define DEFINE_GUEST_HANDLE(name) __DEFINE_GUEST_HANDLE(name, name)
-#define GUEST_HANDLE(name) __guest_handle_ ## name
+ __DEFINE_XEN_GUEST_HANDLE(name, struct name)
+#define DEFINE_XEN_GUEST_HANDLE(name) __DEFINE_XEN_GUEST_HANDLE(name, name)
+#define XEN_GUEST_HANDLE(name) __guest_handle_ ## name
#define set_xen_guest_handle(hnd, val) \
do { \
@@ -40,16 +40,6 @@ typedef uint64_t xen_pfn_t;
#define PRI_xen_pfn "llx"
typedef uint64_t xen_ulong_t;
#define PRI_xen_ulong "llx"
-/* Guest handles for primitive C types. */
-__DEFINE_GUEST_HANDLE(uchar, unsigned char);
-__DEFINE_GUEST_HANDLE(uint, unsigned int);
-DEFINE_GUEST_HANDLE(char);
-DEFINE_GUEST_HANDLE(int);
-DEFINE_GUEST_HANDLE(void);
-DEFINE_GUEST_HANDLE(uint64_t);
-DEFINE_GUEST_HANDLE(uint32_t);
-DEFINE_GUEST_HANDLE(xen_pfn_t);
-DEFINE_GUEST_HANDLE(xen_ulong_t);
/* Maximum number of virtual CPUs in multi-processor guests. */
#define MAX_VIRT_CPUS 1
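The rename above only prefixes the guest-handle macros with XEN_; their expansion is unchanged, and it keeps them from colliding with the identically named macros carried by the forward-ported Xen headers. A minimal illustration of what the renamed macros produce on ARM (the type name foo is hypothetical):

    typedef unsigned long foo;      /* hypothetical example type */

    /* DEFINE_XEN_GUEST_HANDLE(foo) expands, on ARM, to: */
    typedef struct { union { foo *p; uint64_aligned_t q; }; }
            __guest_handle_foo;

    static void example(foo *ptr)
    {
            /* XEN_GUEST_HANDLE(foo) names the typedef above. */
            XEN_GUEST_HANDLE(foo) handle;

            set_xen_guest_handle(handle, ptr);      /* handle.p = ptr */
    }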
diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig
index 1a2b7749b047..c81332577288 100644
--- a/arch/ia64/Kconfig
+++ b/arch/ia64/Kconfig
@@ -234,7 +234,7 @@ config IA64_HP_SIM
config IA64_XEN_GUEST
bool "Xen guest"
select SWIOTLB
- depends on XEN
+ depends on PARAVIRT_XEN
help
Build a kernel that runs on Xen guest domain. At this moment only
16KB page size is supported.
diff --git a/arch/ia64/Makefile b/arch/ia64/Makefile
index be7bfa12b705..342907d1abeb 100644
--- a/arch/ia64/Makefile
+++ b/arch/ia64/Makefile
@@ -55,7 +55,7 @@ core-$(CONFIG_IA64_XEN_GUEST) += arch/ia64/dig/
core-$(CONFIG_IA64_SGI_SN2) += arch/ia64/sn/
core-$(CONFIG_IA64_SGI_UV) += arch/ia64/uv/
core-$(CONFIG_KVM) += arch/ia64/kvm/
-core-$(CONFIG_XEN) += arch/ia64/xen/
+core-$(CONFIG_PARAVIRT_XEN) += arch/ia64/xen/
drivers-$(CONFIG_PCI) += arch/ia64/pci/
drivers-$(CONFIG_IA64_HP_SIM) += arch/ia64/hp/sim/
diff --git a/arch/ia64/include/asm/xen/hypervisor.h b/arch/ia64/include/asm/xen/hypervisor.h
index 67455c2ed2b1..aacad12a83f7 100644
--- a/arch/ia64/include/asm/xen/hypervisor.h
+++ b/arch/ia64/include/asm/xen/hypervisor.h
@@ -34,13 +34,13 @@
#define _ASM_IA64_XEN_HYPERVISOR_H
#include <linux/err.h>
+#include <xen/xen.h>
+#ifdef CONFIG_PARAVIRT_XEN
#include <xen/interface/xen.h>
#include <xen/interface/version.h> /* to compile feature.c */
#include <xen/features.h> /* to compile xen-netfront.c */
-#include <xen/xen.h>
#include <asm/xen/hypercall.h>
-#ifdef CONFIG_XEN
extern struct shared_info *HYPERVISOR_shared_info;
extern struct start_info *xen_start_info;
diff --git a/arch/ia64/include/asm/xen/interface.h b/arch/ia64/include/asm/xen/interface.h
index e88c5de27410..edc2a4d8a4cb 100644
--- a/arch/ia64/include/asm/xen/interface.h
+++ b/arch/ia64/include/asm/xen/interface.h
@@ -56,14 +56,14 @@
#ifndef _ASM_IA64_XEN_INTERFACE_H
#define _ASM_IA64_XEN_INTERFACE_H
-#define __DEFINE_GUEST_HANDLE(name, type) \
+#define __DEFINE_XEN_GUEST_HANDLE(name, type) \
typedef struct { type *p; } __guest_handle_ ## name
#define DEFINE_GUEST_HANDLE_STRUCT(name) \
- __DEFINE_GUEST_HANDLE(name, struct name)
-#define DEFINE_GUEST_HANDLE(name) __DEFINE_GUEST_HANDLE(name, name)
-#define GUEST_HANDLE(name) __guest_handle_ ## name
-#define GUEST_HANDLE_64(name) GUEST_HANDLE(name)
+ __DEFINE_XEN_GUEST_HANDLE(name, struct name)
+#define DEFINE_XEN_GUEST_HANDLE(name) __DEFINE_XEN_GUEST_HANDLE(name, name)
+#define XEN_GUEST_HANDLE(name) __guest_handle_ ## name
+#define XEN_GUEST_HANDLE_64(name) XEN_GUEST_HANDLE(name)
#define set_xen_guest_handle(hnd, val) do { (hnd).p = val; } while (0)
#ifndef __ASSEMBLY__
@@ -72,19 +72,6 @@
* guests. */
typedef unsigned long xen_pfn_t;
typedef unsigned long xen_ulong_t;
-/* Guest handles for primitive C types. */
-__DEFINE_GUEST_HANDLE(uchar, unsigned char);
-__DEFINE_GUEST_HANDLE(uint, unsigned int);
-__DEFINE_GUEST_HANDLE(ulong, unsigned long);
-
-DEFINE_GUEST_HANDLE(char);
-DEFINE_GUEST_HANDLE(int);
-DEFINE_GUEST_HANDLE(long);
-DEFINE_GUEST_HANDLE(void);
-DEFINE_GUEST_HANDLE(uint64_t);
-DEFINE_GUEST_HANDLE(uint32_t);
-
-DEFINE_GUEST_HANDLE(xen_pfn_t);
#define PRI_xen_pfn "lx"
#endif
@@ -96,7 +83,7 @@ DEFINE_GUEST_HANDLE(xen_pfn_t);
/* Maximum number of virtual CPUs in multi-processor guests. */
/* keep sizeof(struct shared_page) <= PAGE_SIZE.
* this is checked in arch/ia64/xen/hypervisor.c. */
-#define MAX_VIRT_CPUS 64
+#define XEN_LEGACY_MAX_VCPUS 64
#ifndef __ASSEMBLY__
diff --git a/arch/ia64/kernel/asm-offsets.c b/arch/ia64/kernel/asm-offsets.c
index 46c9e3007315..27d7aa3afffc 100644
--- a/arch/ia64/kernel/asm-offsets.c
+++ b/arch/ia64/kernel/asm-offsets.c
@@ -290,7 +290,7 @@ void foo(void)
DEFINE(IA64_ITC_LASTCYCLE_OFFSET,
offsetof (struct itc_jitter_data_t, itc_lastcycle));
-#ifdef CONFIG_XEN
+#ifdef CONFIG_PARAVIRT_XEN
BLANK();
DEFINE(XEN_NATIVE_ASM, XEN_NATIVE);
diff --git a/arch/ia64/kernel/vmlinux.lds.S b/arch/ia64/kernel/vmlinux.lds.S
index 0ccb28fab27e..3d3f305e3342 100644
--- a/arch/ia64/kernel/vmlinux.lds.S
+++ b/arch/ia64/kernel/vmlinux.lds.S
@@ -182,7 +182,7 @@ SECTIONS {
__start_gate_section = .;
*(.data..gate)
__stop_gate_section = .;
-#ifdef CONFIG_XEN
+#ifdef CONFIG_PARAVIRT_XEN
. = ALIGN(PAGE_SIZE);
__xen_start_gate_section = .;
*(.data..gate.xen)
diff --git a/arch/ia64/xen/Kconfig b/arch/ia64/xen/Kconfig
index 5d8a06b0ddf7..950ea4001117 100644
--- a/arch/ia64/xen/Kconfig
+++ b/arch/ia64/xen/Kconfig
@@ -2,7 +2,7 @@
# This Kconfig describes xen/ia64 options
#
-config XEN
+config PARAVIRT_XEN
bool "Xen hypervisor support"
default y
depends on PARAVIRT && MCKINLEY && IA64_PAGE_SIZE_16KB
@@ -16,10 +16,6 @@ config XEN
Enable Xen hypervisor support. Resulting kernel runs
both as a guest OS on Xen and natively on hardware.
-config XEN_XENCOMM
- depends on XEN
- bool
-
config NO_IDLE_HZ
- depends on XEN
+ depends on PARAVIRT_XEN
bool
diff --git a/arch/ia64/xen/xcom_hcall.c b/arch/ia64/xen/xcom_hcall.c
index ccaf7431f7c8..7690fc36dd27 100644
--- a/arch/ia64/xen/xcom_hcall.c
+++ b/arch/ia64/xen/xcom_hcall.c
@@ -343,7 +343,7 @@ xencommize_memory_reservation(struct xencomm_mini *xc_area,
int
xencomm_hypercall_memory_op(unsigned int cmd, void *arg)
{
- GUEST_HANDLE(xen_pfn_t) extent_start_va[2] = { {NULL}, {NULL} };
+ XEN_GUEST_HANDLE(xen_pfn_t) extent_start_va[2] = { {NULL}, {NULL} };
struct xen_memory_reservation *xmr = NULL;
int rc;
struct xencomm_handle *desc;
diff --git a/arch/x86/Kbuild b/arch/x86/Kbuild
index e5287d8517aa..4de36f91dfb3 100644
--- a/arch/x86/Kbuild
+++ b/arch/x86/Kbuild
@@ -1,12 +1,14 @@
obj-$(CONFIG_KVM) += kvm/
# Xen paravirtualization support
-obj-$(CONFIG_XEN) += xen/
+obj-$(CONFIG_PARAVIRT_XEN) += xen/
# lguest paravirtualization support
obj-$(CONFIG_LGUEST_GUEST) += lguest/
+ifneq ($(CONFIG_XEN),y)
obj-y += realmode/
+endif
obj-y += kernel/
obj-y += mm/
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 4a5932c9d7a1..e724b425218c 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -9,7 +9,7 @@ config 64BIT
config X86_32
def_bool y
depends on !64BIT
- select CLKSRC_I8253
+ select CLKSRC_I8253 if !XEN
select HAVE_UID16
config X86_64
@@ -27,7 +27,7 @@ config X86
select ARCH_WANTS_PROT_NUMA_PROT_NONE
select HAVE_IDE
select HAVE_OPROFILE
- select HAVE_PCSPKR_PLATFORM
+ select HAVE_PCSPKR_PLATFORM if !XEN_UNPRIVILEGED_GUEST
select HAVE_PERF_EVENTS
select HAVE_IOREMAP_PROT
select HAVE_KPROBES
@@ -37,7 +37,7 @@ config X86
select ARCH_WANT_OPTIONAL_GPIOLIB
select ARCH_WANT_FRAME_POINTERS
select HAVE_DMA_ATTRS
- select HAVE_DMA_CONTIGUOUS if !SWIOTLB
+ select HAVE_DMA_CONTIGUOUS if !SWIOTLB && !XEN
select HAVE_KRETPROBES
select HAVE_OPTPROBES
select HAVE_KPROBES_ON_FTRACE
@@ -52,8 +52,8 @@ config X86
select HAVE_FUNCTION_TRACE_MCOUNT_TEST
select HAVE_SYSCALL_TRACEPOINTS
select SYSCTL_EXCEPTION_TRACE
- select HAVE_KVM
- select HAVE_ARCH_KGDB
+ select HAVE_KVM if !XEN
+ select HAVE_ARCH_KGDB if !XEN
select HAVE_ARCH_TRACEHOOK
select HAVE_GENERIC_DMA_COHERENT if X86_32
select HAVE_EFFICIENT_UNALIGNED_ACCESS
@@ -61,14 +61,14 @@ config X86
select HAVE_REGS_AND_STACK_ACCESS_API
select HAVE_DMA_API_DEBUG
select HAVE_KERNEL_GZIP
- select HAVE_KERNEL_BZIP2
- select HAVE_KERNEL_LZMA
- select HAVE_KERNEL_XZ
- select HAVE_KERNEL_LZO
+ select HAVE_KERNEL_BZIP2 if !XEN
+ select HAVE_KERNEL_LZMA if !XEN
+ select HAVE_KERNEL_XZ if !XEN
+ select HAVE_KERNEL_LZO if !XEN
select HAVE_HW_BREAKPOINT
select HAVE_MIXED_BREAKPOINTS_REGS
select PERF_EVENTS
- select HAVE_PERF_EVENTS_NMI
+ select HAVE_PERF_EVENTS_NMI if !XEN
select HAVE_PERF_REGS
select HAVE_PERF_USER_STACK_DUMP
select HAVE_DEBUG_KMEMLEAK
@@ -92,8 +92,8 @@ config X86
select IRQ_FORCED_THREADING
select USE_GENERIC_SMP_HELPERS if SMP
select HAVE_BPF_JIT if X86_64
- select HAVE_ARCH_TRANSPARENT_HUGEPAGE
- select CLKEVT_I8253
+ select HAVE_ARCH_TRANSPARENT_HUGEPAGE if !XEN
+ select CLKEVT_I8253 if !XEN
select ARCH_HAVE_NMI_SAFE_CMPXCHG
select GENERIC_IOMAP
select DCACHE_WORD_ACCESS
@@ -102,10 +102,10 @@ config X86
select HAVE_ARCH_SECCOMP_FILTER
select BUILDTIME_EXTABLE_SORT
select GENERIC_CMOS_UPDATE
- select CLOCKSOURCE_WATCHDOG
+ select CLOCKSOURCE_WATCHDOG if !XEN
select GENERIC_CLOCKEVENTS
select ARCH_CLOCKSOURCE_DATA if X86_64
- select GENERIC_CLOCKEVENTS_BROADCAST if X86_64 || (X86_32 && X86_LOCAL_APIC)
+ select GENERIC_CLOCKEVENTS_BROADCAST if (X86_64 || (X86_32 && X86_LOCAL_APIC)) && !XEN
select GENERIC_TIME_VSYSCALL if X86_64
select KTIME_SCALAR if X86_32
select GENERIC_STRNCPY_FROM_USER
@@ -153,7 +153,7 @@ config SBUS
config NEED_DMA_MAP_STATE
def_bool y
- depends on X86_64 || INTEL_IOMMU || DMA_API_DEBUG
+ depends on X86_64 || INTEL_IOMMU || DMA_API_DEBUG || SWIOTLB
config NEED_SG_DMA_LENGTH
def_bool y
@@ -203,6 +203,7 @@ config NEED_PER_CPU_PAGE_FIRST_CHUNK
config ARCH_HIBERNATION_POSSIBLE
def_bool y
+ depends on !XEN
config ARCH_SUSPEND_POSSIBLE
def_bool y
@@ -235,7 +236,15 @@ config X86_64_SMP
config X86_HT
def_bool y
- depends on SMP
+ depends on SMP && !XEN
+
+config X86_NO_TSS
+ def_bool y
+ depends on XEN
+
+config X86_NO_IDT
+ def_bool y
+ depends on XEN
config X86_32_LAZY_GS
def_bool y
@@ -248,7 +257,7 @@ config ARCH_HWEIGHT_CFLAGS
config ARCH_CPU_PROBE_RELEASE
def_bool y
- depends on HOTPLUG_CPU
+ depends on HOTPLUG_CPU && !XEN
config ARCH_SUPPORTS_UPROBES
def_bool y
@@ -315,9 +324,18 @@ config X86_MPPARSE
For old smp systems that do not have proper acpi support. Newer systems
(esp with 64bit cpus) with acpi support, MADT and DSDT will override it
+config X86_XEN
+ bool "Xen-compatible"
+ depends on X86_32
+ select XEN
+ select X86_PAE
+ help
+ Choose this option if you plan to run this kernel on top of the
+ Xen Hypervisor.
+
config X86_BIGSMP
bool "Support for big SMP systems with more than 8 CPUs"
- depends on X86_32 && SMP
+ depends on X86_32 && SMP && !XEN
---help---
This option is needed for the systems that have more than 8 CPUs
@@ -325,7 +343,7 @@ config GOLDFISH
def_bool y
depends on X86_GOLDFISH
-if X86_32
+if X86_32 && !XEN
config X86_EXTENDED_PLATFORM
bool "Support for extended (non-PC) x86 platforms"
default y
@@ -349,7 +367,15 @@ config X86_EXTENDED_PLATFORM
generic distribution kernel, say Y here - otherwise say N.
endif
-if X86_64
+config X86_64_XEN
+ bool "Enable Xen compatible kernel"
+ depends on X86_64
+ select XEN
+ select PARAVIRT_CLOCK
+ help
+ This option will compile a kernel compatible with the Xen hypervisor.
+
+if X86_64 && !XEN
config X86_EXTENDED_PLATFORM
bool "Support for extended (non-PC) x86 platforms"
default y
@@ -409,7 +435,7 @@ config X86_UV
config X86_GOLDFISH
bool "Goldfish (Virtual Platform)"
- depends on X86_32
+ depends on X86_32 && !XEN
---help---
Enable support for the Goldfish virtual platform used primarily
for Android development. Unless you are building for the Android
@@ -568,7 +594,7 @@ config X86_ES7000
config X86_32_IRIS
tristate "Eurobraille/Iris poweroff module"
- depends on X86_32
+ depends on X86_32 && !XEN
---help---
The Iris machines from EuroBraille do not have APM or ACPI support
to shut themselves down properly. A special I/O sequence is
@@ -593,6 +619,7 @@ config SCHED_OMIT_FRAME_POINTER
menuconfig HYPERVISOR_GUEST
bool "Linux guest support"
+ depends on !XEN
---help---
Say Y here to enable options for running Linux under various hyper-
visors. This option enables basic hypervisor detection and platform
@@ -659,16 +686,17 @@ config PARAVIRT_TIME_ACCOUNTING
If in doubt, say N here.
+endif #HYPERVISOR_GUEST
+
config PARAVIRT_CLOCK
bool
-endif #HYPERVISOR_GUEST
-
config NO_BOOTMEM
def_bool y
config MEMTEST
bool "Memtest"
+ depends on !XEN
---help---
This option adds a kernel parameter 'memtest', which allows memtest
to be set.
@@ -691,6 +719,7 @@ source "arch/x86/Kconfig.cpu"
config HPET_TIMER
def_bool X86_64
prompt "HPET Timer Support" if X86_32
+ depends on !XEN
---help---
Use the IA-PC HPET (High Precision Event Timer) to manage
time in preference to the PIT and RTC, if a HPET is
@@ -728,6 +757,7 @@ config APB_TIMER
config DMI
default y
bool "Enable DMI scanning" if EXPERT
+ depends on !XEN_UNPRIVILEGED_GUEST
---help---
Enabled scanning of DMI to identify machine quirks. Say Y
here unless you have verified that your setup is not
@@ -738,7 +768,7 @@ config GART_IOMMU
bool "GART IOMMU support" if EXPERT
default y
select SWIOTLB
- depends on X86_64 && PCI && AMD_NB
+ depends on X86_64 && PCI && AMD_NB && !X86_64_XEN
---help---
Support for full DMA access of devices with 32bit memory access only
on systems with more than 3GB. This is usually needed for USB,
@@ -753,7 +783,7 @@ config GART_IOMMU
config CALGARY_IOMMU
bool "IBM Calgary IOMMU support"
select SWIOTLB
- depends on X86_64 && PCI
+ depends on X86_64 && PCI && !X86_64_XEN
---help---
Support for hardware IOMMUs in IBM's xSeries x366 and x460
systems. Needed to run systems with more than 3GB of memory
@@ -781,7 +811,8 @@ config CALGARY_IOMMU_ENABLED_BY_DEFAULT
# need this always selected by IOMMU for the VIA workaround
config SWIOTLB
- def_bool y if X86_64
+ def_bool y if X86_64 || XEN
+ prompt "Software I/O TLB" if XEN_UNPRIVILEGED_GUEST && !XEN_PCIDEV_FRONTEND
---help---
Support for software bounce buffers used on x86-64 systems
which don't have a hardware IOMMU. Using this PCI devices
@@ -803,11 +834,12 @@ config MAXSMP
config NR_CPUS
int "Maximum number of CPUs" if SMP && !MAXSMP
- range 2 8 if SMP && X86_32 && !X86_BIGSMP
+ range 2 8 if SMP && X86_32 && !X86_BIGSMP && !X86_XEN
range 2 512 if SMP && !MAXSMP
default "1" if !SMP
default "4096" if MAXSMP
default "32" if SMP && (X86_NUMAQ || X86_SUMMIT || X86_BIGSMP || X86_ES7000)
+ default "16" if X86_64_XEN
default "8" if SMP
---help---
This allows you to specify the maximum number of CPUs which this
@@ -839,7 +871,7 @@ source "kernel/Kconfig.preempt"
config X86_UP_APIC
bool "Local APIC support on uniprocessors"
- depends on X86_32 && !SMP && !X86_32_NON_STANDARD
+ depends on X86_32 && !SMP && !X86_32_NON_STANDARD && !XEN_UNPRIVILEGED_GUEST
---help---
A local APIC (Advanced Programmable Interrupt Controller) is an
integrated interrupt controller in the CPU. If you have a single-CPU
@@ -865,10 +897,12 @@ config X86_UP_IOAPIC
config X86_LOCAL_APIC
def_bool y
depends on X86_64 || SMP || X86_32_NON_STANDARD || X86_UP_APIC
+ depends on !XEN_UNPRIVILEGED_GUEST
config X86_IO_APIC
def_bool y
depends on X86_64 || SMP || X86_32_NON_STANDARD || X86_UP_IOAPIC
+ depends on !XEN_UNPRIVILEGED_GUEST
config X86_VISWS_APIC
def_bool y
@@ -876,7 +910,7 @@ config X86_VISWS_APIC
config X86_REROUTE_FOR_BROKEN_BOOT_IRQS
bool "Reroute for broken boot IRQs"
- depends on X86_IO_APIC
+ depends on X86_IO_APIC && !XEN
---help---
This option enables a workaround that fixes a source of
spurious interrupts. This is recommended when threaded
@@ -900,6 +934,7 @@ config X86_REROUTE_FOR_BROKEN_BOOT_IRQS
config X86_MCE
bool "Machine Check / overheating reporting"
default y
+ depends on !XEN_UNPRIVILEGED_GUEST
---help---
Machine Check support allows the processor to notify the
kernel if it detects a problem (e.g. overheating, data corruption).
@@ -909,7 +944,7 @@ config X86_MCE
config X86_MCE_INTEL
def_bool y
prompt "Intel MCE features"
- depends on X86_MCE && X86_LOCAL_APIC
+ depends on X86_MCE && X86_LOCAL_APIC && !XEN
---help---
Additional support for intel specific MCE features such as
the thermal monitor.
@@ -917,14 +952,14 @@ config X86_MCE_INTEL
config X86_MCE_AMD
def_bool y
prompt "AMD MCE features"
- depends on X86_MCE && X86_LOCAL_APIC
+ depends on X86_MCE && X86_LOCAL_APIC && !XEN
---help---
Additional support for AMD specific MCE features such as
the DRAM Error Threshold.
config X86_ANCIENT_MCE
bool "Support for old Pentium 5 / WinChip machine checks"
- depends on X86_32 && X86_MCE
+ depends on X86_32 && X86_MCE && !XEN
---help---
Include support for machine check handling on old Pentium 5 or WinChip
systems. These typically need to be enabled explicitly on the command
@@ -942,6 +977,10 @@ config X86_MCE_INJECT
If you don't know what a machine check is and you don't do kernel
QA it is safe to say n.
+config X86_XEN_MCE
+ def_bool y
+ depends on XEN && X86_MCE
+
config X86_THERMAL_VECTOR
def_bool y
depends on X86_MCE_INTEL
@@ -995,7 +1034,7 @@ config I8K
config X86_REBOOTFIXUPS
bool "Enable X86 board specific fixups for reboot"
- depends on X86_32
+ depends on X86_32 && !XEN
---help---
This enables chipset and/or board specific fixups to be done
in order to get reboot to work correctly. This is only needed on
@@ -1012,6 +1051,7 @@ config X86_REBOOTFIXUPS
config MICROCODE
tristate "CPU microcode loading support"
+ depends on !XEN_UNPRIVILEGED_GUEST
select FW_LOADER
---help---
@@ -1030,7 +1070,7 @@ config MICROCODE
config MICROCODE_INTEL
bool "Intel microcode loading support"
- depends on MICROCODE
+ depends on MICROCODE && !XEN
default MICROCODE
select FW_LOADER
---help---
@@ -1043,7 +1083,7 @@ config MICROCODE_INTEL
config MICROCODE_AMD
bool "AMD microcode loading support"
- depends on MICROCODE
+ depends on MICROCODE && !XEN
select FW_LOADER
---help---
If you select this option, microcode patch loading support for AMD
@@ -1073,6 +1113,7 @@ config MICROCODE_EARLY
config X86_MSR
tristate "/dev/cpu/*/msr - Model-specific register support"
+ select XEN_DOMCTL if XEN_PRIVILEGED_GUEST
---help---
This device gives privileged processes access to the x86
Model-Specific Registers (MSRs). It is a character device with
@@ -1090,7 +1131,7 @@ config X86_CPUID
choice
prompt "High Memory Support"
- default HIGHMEM64G if X86_NUMAQ
+ default HIGHMEM64G if X86_NUMAQ || XEN
default HIGHMEM4G
depends on X86_32
@@ -1133,7 +1174,7 @@ config NOHIGHMEM
config HIGHMEM4G
bool "4GB"
- depends on !X86_NUMAQ
+ depends on !X86_NUMAQ && !XEN
---help---
Select this if you have a 32-bit processor and between 1 and 4
gigabytes of physical RAM.
@@ -1210,12 +1251,12 @@ config ARCH_PHYS_ADDR_T_64BIT
config ARCH_DMA_ADDR_T_64BIT
def_bool y
- depends on X86_64 || HIGHMEM64G
+ depends on X86_64 || XEN || HIGHMEM64G
config DIRECT_GBPAGES
bool "Enable 1GB pages for kernel pagetables" if EXPERT
default y
- depends on X86_64
+ depends on X86_64 && !XEN
---help---
Allow the kernel linear mapping to use 1GB pages on CPUs that
support it. This can improve the kernel's performance a tiny bit by
@@ -1224,7 +1265,7 @@ config DIRECT_GBPAGES
# Common NUMA Features
config NUMA
bool "Numa Memory Allocation and Scheduler Support"
- depends on SMP
+ depends on SMP && !XEN
depends on X86_64 || (X86_32 && HIGHMEM64G && (X86_NUMAQ || X86_BIGSMP || X86_SUMMIT && ACPI))
default y if (X86_NUMAQ || X86_SUMMIT || X86_BIGSMP)
---help---
@@ -1317,12 +1358,13 @@ config ARCH_DISCONTIGMEM_DEFAULT
config ARCH_SPARSEMEM_ENABLE
def_bool y
depends on X86_64 || NUMA || X86_32 || X86_32_NON_STANDARD
+ depends on !XEN
select SPARSEMEM_STATIC if X86_32
select SPARSEMEM_VMEMMAP_ENABLE if X86_64
config ARCH_SPARSEMEM_DEFAULT
def_bool y
- depends on X86_64
+ depends on X86_64 && !X86_64_XEN
config ARCH_SELECT_MEMORY_MODEL
def_bool y
@@ -1354,6 +1396,7 @@ config HIGHPTE
config X86_CHECK_BIOS_CORRUPTION
bool "Check for low memory corruption"
+ depends on !XEN
---help---
Periodically check for memory corruption in low memory, which
is suspected to be caused by BIOS. Even when enabled in the
@@ -1384,6 +1427,7 @@ config X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK
config X86_RESERVE_LOW
int "Amount of low memory, in kilobytes, to reserve for the BIOS"
+ depends on !XEN
default 64
range 4 640
---help---
@@ -1414,6 +1458,7 @@ config X86_RESERVE_LOW
config MATH_EMULATION
bool
prompt "Math emulation" if X86_32
+ depends on !XEN
---help---
Linux can emulate a math coprocessor (used for floating point
operations) if you don't have one. 486DX and Pentium processors have
@@ -1440,6 +1485,7 @@ config MATH_EMULATION
config MTRR
def_bool y
prompt "MTRR (Memory Type Range Register) support" if EXPERT
+ depends on !XEN_UNPRIVILEGED_GUEST
---help---
On Intel P6 family processors (Pentium Pro, Pentium II and later)
the Memory Type Range Registers (MTRRs) may be used to control
@@ -1475,7 +1521,7 @@ config MTRR
config MTRR_SANITIZER
def_bool y
prompt "MTRR cleanup support"
- depends on MTRR
+ depends on MTRR && !XEN
---help---
Convert MTRR layout from continuous to discrete, so X drivers can
add writeback entries.
@@ -1505,8 +1551,8 @@ config MTRR_SANITIZER_SPARE_REG_NR_DEFAULT
config X86_PAT
def_bool y
- prompt "x86 PAT support" if EXPERT
- depends on MTRR
+ prompt "x86 PAT support" if EXPERT || XEN_UNPRIVILEGED_GUEST
+ depends on MTRR || (XEN_UNPRIVILEGED_GUEST && XEN_PCIDEV_FRONTEND)
---help---
Use PAT attributes to setup page level cache control.
@@ -1534,6 +1580,7 @@ config ARCH_RANDOM
config X86_SMAP
def_bool y
prompt "Supervisor Mode Access Prevention" if EXPERT
+ depends on !XEN
---help---
Supervisor Mode Access Prevention (SMAP) is a security
feature in newer Intel processors. There is a small
@@ -1544,7 +1591,7 @@ config X86_SMAP
config EFI
bool "EFI runtime service support"
- depends on ACPI
+ depends on ACPI && !XEN_UNPRIVILEGED_GUEST
select UCS2_STRING
---help---
This enables the kernel to use EFI runtime services that are
@@ -1559,7 +1606,7 @@ config EFI
config EFI_STUB
bool "EFI stub support"
- depends on EFI
+ depends on EFI && (!XEN || XEN_BZIMAGE)
---help---
This kernel feature allows a bzImage to be loaded directly
by EFI firmware without the use of a bootloader.
@@ -1602,6 +1649,7 @@ source kernel/Kconfig.hz
config KEXEC
bool "kexec system call"
+ depends on !XEN_UNPRIVILEGED_GUEST
---help---
kexec is a system call that implements the ability to shutdown your
current kernel, and to start another kernel. It is like a reboot
@@ -1619,6 +1667,7 @@ config KEXEC
config CRASH_DUMP
bool "kernel crash dumps"
depends on X86_64 || (X86_32 && HIGHMEM)
+ depends on !XEN
---help---
Generate crash dump after being started by kexec.
This should be normally only set in special crash dump kernels
@@ -1638,7 +1687,8 @@ config KEXEC_JUMP
code in physical address mode via KEXEC
config PHYSICAL_START
- hex "Physical address where the kernel is loaded" if (EXPERT || CRASH_DUMP)
+ hex "Physical address where the kernel is loaded" if (EXPERT || CRASH_DUMP || XEN)
+ default 0x100000 if XEN
default "0x1000000"
---help---
This gives the physical address where the kernel is loaded.
@@ -1680,6 +1730,7 @@ config PHYSICAL_START
config RELOCATABLE
bool "Build a relocatable kernel"
+ depends on !XEN
default y
---help---
This builds a kernel image that retains relocation information
@@ -1701,7 +1752,8 @@ config X86_NEED_RELOCS
depends on X86_32 && RELOCATABLE
config PHYSICAL_ALIGN
- hex "Alignment value to which kernel should be aligned" if X86_32
+ hex "Alignment value to which kernel should be aligned" if X86_32 && !XEN
+ default 0x2000 if XEN
default "0x1000000"
range 0x2000 0x1000000
---help---
@@ -1723,6 +1775,30 @@ config PHYSICAL_ALIGN
Don't change this unless you know what you are doing.
+config XEN_BZIMAGE
+ bool "Produce bzImage (rather than ELF) format executable"
+# depends on XEN && (XEN_COMPAT >= (XEN_UNPRIVILEGED_GUEST ? 0x030402 : 0x040000))
+ depends on XEN
+ depends on !XEN_COMPAT_030002_AND_LATER
+ depends on !XEN_COMPAT_030004_AND_LATER
+ depends on !XEN_COMPAT_030100_AND_LATER
+ depends on !XEN_COMPAT_030200_AND_LATER
+ depends on !XEN_COMPAT_030300_AND_LATER
+ depends on !XEN_COMPAT_030400_AND_LATER
+# depends on XEN_UNPRIVILEGED_GUEST || !XEN_COMPAT_030402_AND_LATER
+ select HAVE_KERNEL_BZIP2 if XEN_ADVANCED_COMPRESSION
+ select HAVE_KERNEL_LZMA if XEN_ADVANCED_COMPRESSION
+# select HAVE_KERNEL_XZ if XEN_ADVANCED_COMPRESSION && (XEN_COMPAT > 0x040100)
+ select HAVE_KERNEL_XZ if XEN_ADVANCED_COMPRESSION && !XEN_COMPAT_040000_AND_LATER && !XEN_COMPAT_040100_AND_LATER
+ select HAVE_KERNEL_LZO if XEN_ADVANCED_COMPRESSION && !XEN_COMPAT_040000_AND_LATER
+ ---help---
+ Select whether, at the price of being incompatible with pre-3.4
+ (pre-4.0 for Dom0) hypervisor versions, you want the final image
+ to be in bzImage format, including the option to compress its
+ embedded ELF image with methods better than gzip.
+ Note that this is a prerequisite for building a kernel that can be
+ used for secure boot from UEFI.
+
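+# The !XEN_COMPAT_*_AND_LATER chain above unrolls the commented-out numeric
+# test: this Kconfig cannot compare hex version values, so "XEN_COMPAT >=
+# 3.4.2 for DomU / 4.0.0 for Dom0" has to be expressed through the
+# per-version compatibility symbols instead.
+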
config HOTPLUG_CPU
bool "Support for hot-pluggable CPUs"
depends on SMP && HOTPLUG
@@ -1736,7 +1812,7 @@ config HOTPLUG_CPU
config BOOTPARAM_HOTPLUG_CPU0
bool "Set default setting of cpu0_hotpluggable"
default n
- depends on HOTPLUG_CPU
+ depends on HOTPLUG_CPU && !XEN
---help---
Set whether default state of cpu0_hotpluggable is on or off.
@@ -1765,7 +1841,7 @@ config BOOTPARAM_HOTPLUG_CPU0
config DEBUG_HOTPLUG_CPU0
def_bool n
prompt "Debug CPU0 hotplug"
- depends on HOTPLUG_CPU
+ depends on HOTPLUG_CPU && !XEN
---help---
Enabling this option offlines CPU0 (if CPU0 can be offlined) as
soon as possible and boots up userspace with CPU0 offlined. User
@@ -1838,6 +1914,7 @@ endmenu
config ARCH_ENABLE_MEMORY_HOTPLUG
def_bool y
depends on X86_64 || (X86_32 && HIGHMEM)
+ depends on !XEN
config ARCH_ENABLE_MEMORY_HOTREMOVE
def_bool y
@@ -1855,6 +1932,8 @@ config ARCH_HIBERNATION_HEADER
source "kernel/power/Kconfig"
+if !XEN_UNPRIVILEGED_GUEST
+
source "drivers/acpi/Kconfig"
source "drivers/sfi/Kconfig"
@@ -1865,7 +1944,7 @@ config X86_APM_BOOT
menuconfig APM
tristate "APM (Advanced Power Management) BIOS support"
- depends on X86_32 && PM_SLEEP
+ depends on X86_32 && PM_SLEEP && !XEN
---help---
APM is a BIOS specification for saving power using several different
techniques. This is mostly useful for battery powered laptops with
@@ -1991,6 +2070,8 @@ source "drivers/cpuidle/Kconfig"
source "drivers/idle/Kconfig"
+endif # !XEN_UNPRIVILEGED_GUEST
+
endmenu
@@ -2000,6 +2081,7 @@ config PCI
bool "PCI support"
default y
select ARCH_SUPPORTS_MSI if (X86_LOCAL_APIC && X86_IO_APIC)
+ select ARCH_SUPPORTS_MSI if (XEN_UNPRIVILEGED_GUEST && XEN_PCIDEV_FRONTEND)
---help---
Find out whether you have a PCI motherboard. PCI is the name of a
bus system, i.e. the way the CPU talks to the other stuff inside
@@ -2027,25 +2109,36 @@ choice
config PCI_GOBIOS
bool "BIOS"
+ depends on !XEN
config PCI_GOMMCONFIG
bool "MMConfig"
+ depends on !XEN_UNPRIVILEGED_GUEST
config PCI_GODIRECT
bool "Direct"
+ depends on !XEN_UNPRIVILEGED_GUEST
config PCI_GOOLPC
bool "OLPC XO-1"
- depends on OLPC
+ depends on OLPC && !XEN_UNPRIVILEGED_GUEST
+
+config PCI_GOXEN_FE
+ bool "Xen PCI Frontend"
+ depends on X86_XEN
+ ---help---
+ The PCI device frontend driver allows the kernel to import arbitrary
+ PCI devices from a PCI backend to support PCI driver domains.
config PCI_GOANY
bool "Any"
+ depends on !XEN_UNPRIVILEGED_GUEST
endchoice
config PCI_BIOS
def_bool y
- depends on X86_32 && PCI && (PCI_GOBIOS || PCI_GOANY)
+ depends on X86_32 && PCI && !XEN && (PCI_GOBIOS || PCI_GOANY)
# x86-64 doesn't support PCI BIOS access from long mode so always go direct.
config PCI_DIRECT
@@ -2062,7 +2155,7 @@ config PCI_OLPC
config PCI_XEN
def_bool y
- depends on PCI && XEN
+ depends on PCI && PARAVIRT_XEN
select SWIOTLB_XEN
config PCI_DOMAINS
@@ -2092,7 +2185,7 @@ source "drivers/pci/Kconfig"
# x86_64 have no ISA slots, but can have ISA-style DMA.
config ISA_DMA_API
- bool "ISA-style DMA support" if (X86_64 && EXPERT)
+ bool "ISA-style DMA support" if ((X86_64 || XEN) && EXPERT) || XEN_UNPRIVILEGED_GUEST
default y
help
Enables ISA-style DMA support for devices requiring such controllers.
@@ -2102,6 +2195,7 @@ if X86_32
config ISA
bool "ISA support"
+ depends on !XEN
---help---
Find out whether you have ISA slots on your motherboard. ISA is the
name of a bus system, i.e. the way the CPU talks to the other stuff
@@ -2150,7 +2244,7 @@ config SCx200HR_TIMER
config OLPC
bool "One Laptop Per Child support"
- depends on !X86_PAE
+ depends on !X86_PAE && !XEN
select GPIOLIB
select OF
select OF_PROMTREE
@@ -2239,7 +2333,7 @@ endif # X86_32
config AMD_NB
def_bool y
- depends on CPU_SUP_AMD && PCI
+ depends on CPU_SUP_AMD && PCI && !XEN_UNPRIVILEGED_GUEST
source "drivers/pcmcia/Kconfig"
@@ -2332,7 +2426,9 @@ source "net/Kconfig"
source "drivers/Kconfig"
+if !XEN_UNPRIVILEGED_GUEST
source "drivers/firmware/Kconfig"
+endif
source "fs/Kconfig"
diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu
index c026cca5602c..b7f0f3b36c5c 100644
--- a/arch/x86/Kconfig.cpu
+++ b/arch/x86/Kconfig.cpu
@@ -6,7 +6,7 @@ choice
config M486
bool "486"
- depends on X86_32
+ depends on X86_32 && !XEN
---help---
This is the processor type of your CPU. This information is
used for optimizing purposes. In order to compile a kernel
@@ -48,7 +48,7 @@ config M486
config M586
bool "586/K5/5x86/6x86/6x86MX"
- depends on X86_32
+ depends on X86_32 && !XEN
---help---
Select this for a 586 or 686 series processor such as the AMD K5,
the Cyrix 5x86, 6x86 and 6x86MX. This choice does not
@@ -56,14 +56,14 @@ config M586
config M586TSC
bool "Pentium-Classic"
- depends on X86_32
+ depends on X86_32 && !XEN
---help---
Select this for a Pentium Classic processor with the RDTSC (Read
Time Stamp Counter) instruction for benchmarking.
config M586MMX
bool "Pentium-MMX"
- depends on X86_32
+ depends on X86_32 && !XEN
---help---
Select this for a Pentium with the MMX graphics/multimedia
extended instructions.
@@ -177,7 +177,7 @@ config MEFFICEON
config MWINCHIPC6
bool "Winchip-C6"
- depends on X86_32
+ depends on X86_32 && !XEN
---help---
Select this for an IDT Winchip C6 chip. Linux and GCC
treat this chip as a 586TSC with some extended instructions
@@ -185,7 +185,7 @@ config MWINCHIPC6
config MWINCHIP3D
bool "Winchip-2/Winchip-2A/Winchip-3"
- depends on X86_32
+ depends on X86_32 && !XEN
---help---
Select this for an IDT Winchip-2, 2A or 3. Linux and GCC
treat this chip as a 586TSC with some extended instructions
@@ -195,7 +195,7 @@ config MWINCHIP3D
config MELAN
bool "AMD Elan"
- depends on X86_32
+ depends on X86_32 && !XEN
---help---
Select this for an AMD Elan processor.
@@ -203,19 +203,19 @@ config MELAN
config MGEODEGX1
bool "GeodeGX1"
- depends on X86_32
+ depends on X86_32 && !XEN
---help---
Select this for a Geode GX1 (Cyrix MediaGX) chip.
config MGEODE_LX
bool "Geode GX/LX"
- depends on X86_32
+ depends on X86_32 && !XEN
---help---
Select this for AMD Geode GX and LX processors.
config MCYRIXIII
bool "CyrixIII/VIA-C3"
- depends on X86_32
+ depends on X86_32 && !XEN
---help---
Select this for a Cyrix III or C3 chip. Presently Linux and GCC
treat this chip as a generic 586. Whilst the CPU is 686 class,
@@ -364,6 +364,7 @@ config X86_P6_NOP
config X86_TSC
def_bool y
depends on ((MWINCHIP3D || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2 || MVIAC7 || MGEODEGX1 || MGEODE_LX || MCORE2 || MATOM) && !X86_NUMAQ) || X86_64
+ depends on !XEN
config X86_CMPXCHG64
def_bool y
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index c198b7e13e7b..201e3d883e79 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -25,6 +25,7 @@ config STRICT_DEVMEM
config X86_VERBOSE_BOOTUP
bool "Enable verbose x86 bootup info messages"
default y
+ depends on !XEN
---help---
Enables the informational output from the decompression stage
(e.g. bzImage) of the boot. If you disable this you will still
@@ -32,6 +33,7 @@ config X86_VERBOSE_BOOTUP
config EARLY_PRINTK
bool "Early printk" if EXPERT
+ depends on !XEN_UNPRIVILEGED_GUEST
default y
---help---
Write kernel log output directly into the VGA buffer or to a serial
@@ -122,7 +124,7 @@ config DEBUG_NX_TEST
config DOUBLEFAULT
default y
bool "Enable doublefault exception handler" if EXPERT
- depends on X86_32
+ depends on X86_32 && !X86_NO_TSS
---help---
This option allows trapping of rare doublefault exceptions that
would otherwise cause a system to silently reboot. Disabling this
@@ -166,6 +168,7 @@ config IOMMU_DEBUG
config IOMMU_STRESS
bool "Enable IOMMU stress-test mode"
+ depends on !XEN
---help---
This option disables various optimizations in IOMMU related
code to do real stress testing of the IOMMU code. This option
@@ -181,6 +184,7 @@ config IOMMU_LEAK
config HAVE_MMIOTRACE_SUPPORT
def_bool y
+ depends on !XEN
config X86_DECODER_SELFTEST
bool "x86 instruction decoder selftest"
@@ -269,6 +273,7 @@ config DEBUG_BOOT_PARAMS
bool "Debug boot parameters"
depends on DEBUG_KERNEL
depends on DEBUG_FS
+ depends on !XEN
---help---
This option will cause struct boot_params to be exported via debugfs.
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 3b3690f620e5..6058a119fec3 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -189,8 +189,29 @@ boot := arch/x86/boot
BOOT_TARGETS = bzlilo bzdisk fdimage fdimage144 fdimage288 isoimage
-PHONY += bzImage $(BOOT_TARGETS)
+PHONY += bzImage vmlinuz $(BOOT_TARGETS)
+ifdef CONFIG_XEN
+LINUXINCLUDE := -D__XEN_INTERFACE_VERSION__=$(CONFIG_XEN_INTERFACE_VERSION) \
+ -I$(srctree)/arch/x86/include/mach-xen $(LINUXINCLUDE)
+
+ifdef CONFIG_X86_64
+LDFLAGS_vmlinux := -e startup_64
+endif
+endif
+
+ifeq ($(CONFIG_XEN)-$(CONFIG_XEN_BZIMAGE),y-)
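+# $(CONFIG_XEN)-$(CONFIG_XEN_BZIMAGE) concatenates to "y-" exactly when
+# CONFIG_XEN=y and CONFIG_XEN_BZIMAGE is unset, i.e. when the plain ELF
+# vmlinuz image is wanted instead of a bzImage.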
+# Default kernel to build
+all: vmlinuz
+
+# KBUILD_IMAGE specifies the target image being built
+KBUILD_IMAGE := $(boot)/vmlinuz
+
+vmlinuz: vmlinux
+ $(Q)$(MAKE) $(build)=$(boot) $(KBUILD_IMAGE)
+ $(Q)mkdir -p $(objtree)/arch/$(UTS_MACHINE)/boot
+ $(Q)ln -fsn ../../x86/boot/$@ $(objtree)/arch/$(UTS_MACHINE)/boot/$@
+else
# Default kernel to build
all: bzImage
@@ -204,6 +225,7 @@ endif
$(Q)$(MAKE) $(build)=$(boot) $(KBUILD_IMAGE)
$(Q)mkdir -p $(objtree)/arch/$(UTS_MACHINE)/boot
$(Q)ln -fsn ../../x86/boot/bzImage $(objtree)/arch/$(UTS_MACHINE)/boot/$@
+endif
$(BOOT_TARGETS): vmlinux
$(Q)$(MAKE) $(build)=$(boot) $@
diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile
index 379814bc41e3..917c4fae2f66 100644
--- a/arch/x86/boot/Makefile
+++ b/arch/x86/boot/Makefile
@@ -17,6 +17,7 @@
SVGA_MODE := -DSVGA_MODE=NORMAL_VGA
targets := vmlinux.bin setup.bin setup.elf bzImage
+targets += vmlinuz vmlinux-stripped
targets += fdimage fdimage144 fdimage288 image.iso mtools.conf
subdir- := compressed
@@ -34,6 +35,8 @@ setup-y += video-vga.o
setup-y += video-vesa.o
setup-y += video-bios.o
+setup-$(CONFIG_XEN) := header.o version.o
+
targets += $(setup-y)
hostprogs-y := mkcpustr tools/build
@@ -190,6 +193,20 @@ bzlilo: $(obj)/bzImage
cp System.map $(INSTALL_PATH)/
if [ -x /sbin/lilo ]; then /sbin/lilo; else /etc/lilo/install; fi
+$(obj)/vmlinuz: $(obj)/vmlinux-stripped FORCE
+ $(call if_changed,gzip)
+ @echo 'Kernel: $@ is ready' ' (#'`cat .version`')'
+
+$(obj)/vmlinux-stripped: OBJCOPYFLAGS := -g --strip-unneeded
+$(obj)/vmlinux-stripped: vmlinux FORCE
+ $(call if_changed,objcopy)
+
+ifneq ($(CONFIG_XEN)-$(CONFIG_XEN_BZIMAGE),y-)
+bzImage := bzImage
+else
+bzImage := vmlinuz
+endif
+
install:
- sh $(srctree)/$(src)/install.sh $(KERNELRELEASE) $(obj)/bzImage \
+ sh $(srctree)/$(src)/install.sh $(KERNELRELEASE) $(obj)/$(bzImage) \
System.map "$(INSTALL_PATH)"
diff --git a/arch/x86/boot/boot.h b/arch/x86/boot/boot.h
index 5b7531966b84..ebb3427f69a4 100644
--- a/arch/x86/boot/boot.h
+++ b/arch/x86/boot/boot.h
@@ -23,8 +23,12 @@
#include <stdarg.h>
#include <linux/types.h>
#include <linux/edd.h>
+#ifndef CONFIG_XEN
#include <asm/boot.h>
#include <asm/setup.h>
+#else
+#include <asm/bootparam.h>
+#endif
#include "bitops.h"
#include <asm/cpufeature.h>
#include <asm/processor-flags.h>
diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile
index 5ef205c5f37b..aada0945162e 100644
--- a/arch/x86/boot/compressed/Makefile
+++ b/arch/x86/boot/compressed/Makefile
@@ -19,6 +19,9 @@ KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__
GCOV_PROFILE := n
LDFLAGS := -m elf_$(UTS_MACHINE)
+ifeq ($(CONFIG_XEN),y)
+LDFLAGS += -e 0
+endif
LDFLAGS_vmlinux := -T
hostprogs-y := mkpiggy
@@ -33,6 +36,9 @@ $(obj)/eboot.o: KBUILD_CFLAGS += -fshort-wchar -mno-red-zone
ifeq ($(CONFIG_EFI_STUB), y)
VMLINUX_OBJS += $(obj)/eboot.o $(obj)/efi_stub_$(BITS).o
endif
+ifeq ($(CONFIG_XEN), y)
+ VMLINUX_OBJS = $(obj)/vmlinux.lds $(obj)/piggy.o
+endif
$(obj)/vmlinux: $(VMLINUX_OBJS) FORCE
$(call if_changed,ld)
diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S
index 9ec06a1f6d61..2f060c140865 100644
--- a/arch/x86/boot/header.S
+++ b/arch/x86/boot/header.S
@@ -411,6 +411,9 @@ setup_data: .quad 0 # 64-bit physical pointer to
pref_address: .quad LOAD_PHYSICAL_ADDR # preferred load addr
+#ifdef CONFIG_XEN
+#define ZO_startup_32 0
+#endif
#define ZO_INIT_SIZE (ZO__end - ZO_startup_32 + ZO_z_extract_offset)
#define VO_INIT_SIZE (VO__end - VO__text)
#if ZO_INIT_SIZE > VO_INIT_SIZE
@@ -431,6 +434,7 @@ handover_offset:
.section ".entrytext", "ax"
start_of_setup:
+#ifndef CONFIG_XEN
# Force %es = %ds
movw %ds, %ax
movw %ax, %es
@@ -503,3 +507,4 @@ die:
setup_corrupt:
.byte 7
.string "No setup signature found...\n"
+#endif /* CONFIG_XEN */
diff --git a/arch/x86/boot/tools/build.c b/arch/x86/boot/tools/build.c
index 94c544650020..274d1f0bf989 100644
--- a/arch/x86/boot/tools/build.c
+++ b/arch/x86/boot/tools/build.c
@@ -42,8 +42,10 @@ typedef unsigned int u32;
#define DEFAULT_MINOR_ROOT 0
#define DEFAULT_ROOT_DEV (DEFAULT_MAJOR_ROOT << 8 | DEFAULT_MINOR_ROOT)
+#ifndef CONFIG_XEN
/* Minimal number of setup sectors */
#define SETUP_SECT_MIN 5
+#endif
#define SETUP_SECT_MAX 64
/* This must be large enough to hold the entire setup */
@@ -291,8 +293,8 @@ int main(int argc, char ** argv)
c = fread(buf, 1, sizeof(buf), file);
if (ferror(file))
die("read-error on `setup'");
- if (c < 1024)
- die("The setup must be at least 1024 bytes");
+ if (c <= 512)
+ die("The setup must be more than 512 bytes");
if (get_unaligned_le16(&buf[510]) != 0xAA55)
die("Boot block hasn't got boot flag (0xAA55)");
fclose(file);
@@ -305,8 +307,10 @@ int main(int argc, char ** argv)
/* Pad unused space with zeros */
setup_sectors = (c + 511) / 512;
+#ifdef SETUP_SECT_MIN
if (setup_sectors < SETUP_SECT_MIN)
setup_sectors = SETUP_SECT_MIN;
+#endif
i = setup_sectors*512;
memset(buf+c, 0, i-c);
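
Aside: the rounding above is the usual round-up-to-sector-and-pad pattern.
A minimal standalone C sketch of the same arithmetic, using an illustrative
byte count rather than real boot data:

	#include <stdio.h>
	#include <string.h>

	int main(void)
	{
		char buf[4096];
		unsigned int c = 1300;                  /* bytes of setup read */
		unsigned int sectors = (c + 511) / 512; /* round up -> 3 */
		unsigned int i = sectors * 512;         /* padded size -> 1536 */

		memset(buf + c, 0, i - c);              /* zero trailing 236 bytes */
		printf("%u sectors, %u bytes\n", sectors, i);
		return 0;
	}
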
diff --git a/arch/x86/ia32/ia32entry-xen.S b/arch/x86/ia32/ia32entry-xen.S
new file mode 100644
index 000000000000..edb4b08fc3f7
--- /dev/null
+++ b/arch/x86/ia32/ia32entry-xen.S
@@ -0,0 +1,388 @@
+/*
+ * Compatibility mode system call entry point for x86-64.
+ *
+ * Copyright 2000-2002 Andi Kleen, SuSE Labs.
+ */
+
+#include <asm/dwarf2.h>
+#include <asm/calling.h>
+#include <asm/asm-offsets.h>
+#include <asm/current.h>
+#include <asm/errno.h>
+#include <asm/ia32_unistd.h>
+#include <asm/thread_info.h>
+#include <asm/segment.h>
+#include <asm/irqflags.h>
+#include <asm/asm.h>
+#include <asm/smap.h>
+#include <linux/linkage.h>
+#include <linux/err.h>
+
+/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
+#include <linux/elf-em.h>
+#define AUDIT_ARCH_I386 (EM_386|__AUDIT_ARCH_LE)
+#define __AUDIT_ARCH_LE 0x40000000
+
+ .section .entry.text, "ax"
+
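+/*
+ * IA32_ARG_FIXUP below moves the i386 syscall arguments (%ebx, %ecx,
+ * %edx, %esi, %edi, %ebp) into the x86-64 C calling convention slots
+ * (%rdi, %rsi, %rdx, %rcx, %r8, %r9) before indirecting through the
+ * 32-bit syscall table.
+ */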
+ .macro IA32_ARG_FIXUP noebp=0
+ movl %edi,%r8d
+ .if \noebp
+ jmp .Lia32_common
+ .else
+ movl %ebp,%r9d
+.Lia32_common:
+ .endif
+ xchg %ecx,%esi
+ movl %ebx,%edi
+ movl %edx,%edx /* zero extension */
+ .endm
+
+ /* clobbers %eax */
+ .macro CLEAR_RREGS offset=0, _r9=rax
+ xorl %eax,%eax
+ movq %rax,\offset+R11(%rsp)
+ movq %rax,\offset+R10(%rsp)
+ movq %\_r9,\offset+R9(%rsp)
+ movq %rax,\offset+R8(%rsp)
+ .endm
+
+ /*
+ * Reload arg registers from stack in case ptrace changed them.
+ * We don't reload %eax because syscall_trace_enter() returned
+ * the %rax value we should see. Instead, we just truncate that
+ * value to 32 bits again as we did on entry from user mode.
+ * If it's a new value set by user_regset during entry tracing,
+ * this matches the normal truncation of the user-mode value.
+ * If it's -1 to make us punt the syscall, then (u32)-1 is still
+ * an appropriately invalid value.
+ */
+ .macro LOAD_ARGS32 offset, _r9=0
+ .if \_r9
+ movl \offset+16(%rsp),%r9d
+ .endif
+ movl \offset+40(%rsp),%ecx
+ movl \offset+48(%rsp),%edx
+ movl \offset+56(%rsp),%esi
+ movl \offset+64(%rsp),%edi
+ movl %eax,%eax /* zero extension */
+ .endm
+
+ .macro CFI_STARTPROC32 simple
+ CFI_STARTPROC \simple
+ CFI_UNDEFINED r8
+ CFI_UNDEFINED r9
+ CFI_UNDEFINED r10
+ CFI_UNDEFINED r11
+ CFI_UNDEFINED r12
+ CFI_UNDEFINED r13
+ CFI_UNDEFINED r14
+ CFI_UNDEFINED r15
+ .endm
+
+#ifdef CONFIG_PARAVIRT
+ENTRY(native_usergs_sysret32)
+ swapgs
+ sysretl
+ENDPROC(native_usergs_sysret32)
+
+ENTRY(native_irq_enable_sysexit)
+ swapgs
+ sti
+ sysexit
+ENDPROC(native_irq_enable_sysexit)
+#endif
+
+/*
+ * 32bit SYSENTER instruction entry.
+ *
+ * Arguments:
+ * %eax System call number.
+ * %ebx Arg1
+ * %ecx Arg2
+ * %edx Arg3
+ * %esi Arg4
+ * %edi Arg5
+ * %ebp user stack
+ * 0(%ebp) Arg6
+ *
+ * Interrupts on.
+ *
+ * This is purely a fast path. For anything complicated we use the int 0x80
+ * path below. Set up a complete hardware stack frame to share code
+ * with the int 0x80 path.
+ */
+ENTRY(ia32_sysenter_target)
+ CFI_STARTPROC32 simple
+ CFI_SIGNAL_FRAME
+ CFI_DEF_CFA rsp,SS+8-RIP+16
+ /*CFI_REL_OFFSET ss,SS-RIP+16*/
+ CFI_REL_OFFSET rsp,RSP-RIP+16
+ /*CFI_REL_OFFSET rflags,EFLAGS-RIP+16*/
+ /*CFI_REL_OFFSET cs,CS-RIP+16*/
+ CFI_REL_OFFSET rip,RIP-RIP+16
+ CFI_REL_OFFSET r11,8
+ CFI_REL_OFFSET rcx,0
+ movq 8(%rsp),%r11
+ CFI_RESTORE r11
+ popq_cfi %rcx
+ CFI_RESTORE rcx
+ movl %ebp,%ebp /* zero extension */
+ movl %eax,%eax
+ movl TI_sysenter_return+THREAD_INFO(%rsp,8*6-KERNEL_STACK_OFFSET),%r10d
+ movl $__USER32_DS,40(%rsp)
+ movq %rbp,32(%rsp)
+ movl $__USER32_CS,16(%rsp)
+ movq %r10,8(%rsp)
+ movq %rax,(%rsp)
+ cld
+ SAVE_ARGS 0,1,0
+ /* no need to do an access_ok check here because rbp has been
+ 32bit zero extended */
+ ASM_STAC
+1: movl (%rbp),%ebp
+ _ASM_EXTABLE(1b,ia32_badarg)
+ ASM_CLAC
+ orl $TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET)
+ testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
+ jnz sysenter_tracesys
+ jmp .Lia32_check_call
+
+#ifdef CONFIG_AUDITSYSCALL
+ .macro auditsys_entry_common
+ movl %esi,%r9d /* 6th arg: 4th syscall arg */
+ movl %edx,%r8d /* 5th arg: 3rd syscall arg */
+ /* (already in %ecx) 4th arg: 2nd syscall arg */
+ movl %ebx,%edx /* 3rd arg: 1st syscall arg */
+ movl %eax,%esi /* 2nd arg: syscall number */
+ movl $AUDIT_ARCH_I386,%edi /* 1st arg: audit arch */
+ call __audit_syscall_entry
+ movl RAX-ARGOFFSET(%rsp),%eax /* reload syscall number */
+ cmpq $(IA32_NR_syscalls-1),%rax
+ ja ia32_badsys
+ movl %ebx,%edi /* reload 1st syscall arg */
+ movl RCX-ARGOFFSET(%rsp),%esi /* reload 2nd syscall arg */
+ movl RDX-ARGOFFSET(%rsp),%edx /* reload 3rd syscall arg */
+ movl RSI-ARGOFFSET(%rsp),%ecx /* reload 4th syscall arg */
+ movl RDI-ARGOFFSET(%rsp),%r8d /* reload 5th syscall arg */
+ .endm
+
+sysenter_auditsys:
+ auditsys_entry_common
+ movl %ebp,%r9d /* reload 6th syscall arg */
+ jmp .Lia32_dispatch
+#endif
+ CFI_ENDPROC
+ENDPROC(ia32_sysenter_target)
+
+/*
+ * 32bit SYSCALL instruction entry.
+ *
+ * Arguments:
+ * %eax System call number.
+ * %ebx Arg1
+ * %ecx return EIP
+ * %edx Arg3
+ * %esi Arg4
+ * %edi Arg5
+ * %ebp Arg2 [note: not saved in the stack frame, should not be touched]
+ * %esp user stack
+ * 0(%esp) Arg6
+ *
+ * Interrupts on.
+ *
+ * This is purely a fast path. For anything complicated we use the int 0x80
+ * path below. Set up a complete hardware stack frame to share code
+ * with the int 0x80 path.
+ */
+ENTRY(ia32_cstar_target)
+ CFI_STARTPROC32 simple
+ CFI_SIGNAL_FRAME
+ CFI_DEF_CFA rsp,SS+8-RIP+16
+ /*CFI_REL_OFFSET ss,SS-RIP+16*/
+ CFI_REL_OFFSET rsp,RSP-RIP+16
+ /*CFI_REL_OFFSET rflags,EFLAGS-RIP+16*/
+ /*CFI_REL_OFFSET cs,CS-RIP+16*/
+ CFI_REL_OFFSET rip,RIP-RIP+16
+ movl %eax,%eax /* zero extension */
+ movl RSP-RIP+16(%rsp),%r8d
+ SAVE_ARGS -8,0,0
+ movq %rax,ORIG_RAX-ARGOFFSET(%rsp)
+ movq %rbp,RCX-ARGOFFSET(%rsp) /* this lies slightly to ptrace */
+ movl %ebp,%ecx
+ movl $__USER32_CS,CS-ARGOFFSET(%rsp)
+ movl $__USER32_DS,SS-ARGOFFSET(%rsp)
+ /* no need to do an access_ok check here because r8 has been
+ 32bit zero extended */
+ /* hardware stack frame is complete now */
+ ASM_STAC
+1: movl (%r8),%r9d
+ _ASM_EXTABLE(1b,ia32_badarg)
+ ASM_CLAC
+ orl $TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET)
+ testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
+ jnz cstar_tracesys
+ cmpq $IA32_NR_syscalls-1,%rax
+ ja ia32_badsys
+cstar_do_call:
+ IA32_ARG_FIXUP 1
+
+#ifdef CONFIG_AUDITSYSCALL
+cstar_auditsys:
+ movl %r9d,R9-ARGOFFSET(%rsp) /* register to be clobbered by call */
+ auditsys_entry_common
+ movl R9-ARGOFFSET(%rsp),%r9d /* reload 6th syscall arg */
+ jmp .Lia32_dispatch
+#endif
+
+cstar_tracesys:
+#ifdef CONFIG_AUDITSYSCALL
+ testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
+ jz cstar_auditsys
+#endif
+ xchgl %r9d,%ebp
+ SAVE_REST
+ CLEAR_RREGS 0, r9
+ movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */
+ movq %rsp,%rdi /* &pt_regs -> arg1 */
+ call syscall_trace_enter
+ LOAD_ARGS32 ARGOFFSET, 1 /* reload args from stack in case ptrace changed it */
+ RESTORE_REST
+ xchgl %ebp,%r9d
+ cmpq $(IA32_NR_syscalls-1),%rax
+ ja int_ret_from_sys_call /* cstar_tracesys has set RAX(%rsp) */
+ jmp cstar_do_call
+END(ia32_cstar_target)
+
+ia32_badarg:
+ ASM_CLAC
+ movq $-EFAULT,%rax
+ jmp ia32_sysret
+ CFI_ENDPROC
+
+/*
+ * Emulated IA32 system calls via int 0x80.
+ *
+ * Arguments:
+ * %eax System call number.
+ * %ebx Arg1
+ * %ecx Arg2
+ * %edx Arg3
+ * %esi Arg4
+ * %edi Arg5
+ * %ebp Arg6 [note: not saved in the stack frame, should not be touched]
+ *
+ * Notes:
+ * Uses the same stack frame as the x86-64 version.
+ * All registers except %eax must be saved (but ptrace may violate that)
+ * Arguments are zero extended. For system calls that want sign extension and
+ * take long arguments a wrapper is needed. Most calls can just be called
+ * directly.
+ * Assumes it is only called from user space and entered with interrupts on.
+ */
+
+ENTRY(ia32_syscall)
+ CFI_STARTPROC32 simple
+ CFI_SIGNAL_FRAME
+ CFI_DEF_CFA rsp,SS+8-RIP+16
+ /*CFI_REL_OFFSET ss,SS-RIP+16*/
+ CFI_REL_OFFSET rsp,RSP-RIP+16
+ /*CFI_REL_OFFSET rflags,EFLAGS-RIP+16*/
+ /*CFI_REL_OFFSET cs,CS-RIP+16*/
+ CFI_REL_OFFSET rip,RIP-RIP+16
+ CFI_REL_OFFSET r11,8
+ CFI_REL_OFFSET rcx,0
+ movq 8(%rsp),%r11
+ CFI_RESTORE r11
+ popq_cfi %rcx
+ CFI_RESTORE rcx
+ movl %eax,%eax
+ movq %rax,(%rsp)
+ cld
+ /* note the registers are not zero extended to the stack frame.
+ this could be a problem. */
+ SAVE_ARGS 0,1,0
+ orl $TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET)
+ testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
+ jnz ia32_tracesys
+.Lia32_check_call:
+ cmpq $(IA32_NR_syscalls-1),%rax
+ ja ia32_badsys
+ia32_do_call:
+ IA32_ARG_FIXUP
+.Lia32_dispatch:
+ call *ia32_sys_call_table(,%rax,8) # xxx: rip relative
+ia32_sysret:
+ movq %rax,RAX-ARGOFFSET(%rsp)
+ CLEAR_RREGS -ARGOFFSET
+ jmp int_ret_from_sys_call
+
+sysenter_tracesys:
+#ifdef CONFIG_AUDITSYSCALL
+ testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
+ jz sysenter_auditsys
+#endif
+ia32_tracesys:
+ SAVE_REST
+ CLEAR_RREGS
+ movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */
+ movq %rsp,%rdi /* &pt_regs -> arg1 */
+ call syscall_trace_enter
+ LOAD_ARGS32 ARGOFFSET /* reload args from stack in case ptrace changed it */
+ RESTORE_REST
+ cmpq $(IA32_NR_syscalls-1),%rax
+ ja int_ret_from_sys_call /* ia32_tracesys has set RAX(%rsp) */
+ jmp ia32_do_call
+END(ia32_syscall)
+
+ia32_badsys:
+ movq $0,ORIG_RAX-ARGOFFSET(%rsp)
+ movq $-ENOSYS,%rax
+ jmp ia32_sysret
+
+ CFI_ENDPROC
+
+ .macro PTREGSCALL label, func, arg
+ ALIGN
+GLOBAL(\label)
+ leaq \func(%rip),%rax
+ jmp ia32_ptregs_common
+ .endm
+
+ CFI_STARTPROC32
+
+ PTREGSCALL stub32_rt_sigreturn, sys32_rt_sigreturn
+ PTREGSCALL stub32_sigreturn, sys32_sigreturn
+ PTREGSCALL stub32_execve, compat_sys_execve
+ PTREGSCALL stub32_fork, sys_fork
+ PTREGSCALL stub32_vfork, sys_vfork
+
+ ALIGN
+GLOBAL(stub32_clone)
+ leaq sys_clone(%rip),%rax
+ mov %r8, %rcx
+ jmp ia32_ptregs_common
+
+ ALIGN
+ia32_ptregs_common:
+ popq %r11
+ CFI_ENDPROC
+ CFI_STARTPROC32 simple
+ CFI_SIGNAL_FRAME
+ CFI_DEF_CFA rsp,SS+8-ARGOFFSET
+ CFI_REL_OFFSET rax,RAX-ARGOFFSET
+ CFI_REL_OFFSET rcx,RCX-ARGOFFSET
+ CFI_REL_OFFSET rdx,RDX-ARGOFFSET
+ CFI_REL_OFFSET rsi,RSI-ARGOFFSET
+ CFI_REL_OFFSET rdi,RDI-ARGOFFSET
+ CFI_REL_OFFSET rip,RIP-ARGOFFSET
+/* CFI_REL_OFFSET cs,CS-ARGOFFSET*/
+/* CFI_REL_OFFSET rflags,EFLAGS-ARGOFFSET*/
+ CFI_REL_OFFSET rsp,RSP-ARGOFFSET
+/* CFI_REL_OFFSET ss,SS-ARGOFFSET*/
+ SAVE_REST
+ call *%rax
+ RESTORE_REST
+ jmp ia32_sysret /* misbalances the return cache */
+ CFI_ENDPROC
+END(ia32_ptregs_common)
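
Aside: each "cmpq $(IA32_NR_syscalls-1),%rax" / "ja" pair above is the usual
unsigned bounds check performed before indexing ia32_sys_call_table. A rough
standalone C equivalent, as a sketch only (the table size and the handler
signature here are illustrative, not the kernel's):

	#include <errno.h>

	#define NR_SYSCALLS 350	/* illustrative value only */

	typedef long (*sys_call_ptr_t)(unsigned long, unsigned long,
				       unsigned long, unsigned long,
				       unsigned long, unsigned long);

	long dispatch(sys_call_ptr_t table[], unsigned long nr,
		      unsigned long a1, unsigned long a2, unsigned long a3,
		      unsigned long a4, unsigned long a5, unsigned long a6)
	{
		if (nr > NR_SYSCALLS - 1)	/* unsigned compare, like "ja" */
			return -ENOSYS;
		return table[nr](a1, a2, a3, a4, a5, a6);
	}
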
diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h
index b31bf97775fc..f782411ff45e 100644
--- a/arch/x86/include/asm/acpi.h
+++ b/arch/x86/include/asm/acpi.h
@@ -111,7 +111,11 @@ static inline void acpi_disable_pci(void)
}
/* Low-level suspend routine. */
+#ifdef CONFIG_ACPI_PV_SLEEP
+#define acpi_suspend_lowlevel() acpi_enter_sleep_state(ACPI_STATE_S3)
+#else
extern int acpi_suspend_lowlevel(void);
+#endif
/* Physical address to resume after wakeup */
#define acpi_wakeup_address ((unsigned long)(real_mode_header->wakeup_start))
@@ -121,6 +125,7 @@ extern int acpi_suspend_lowlevel(void);
*/
static inline unsigned int acpi_processor_cstate_check(unsigned int max_cstate)
{
+#ifndef CONFIG_PROCESSOR_EXTERNAL_CONTROL
/*
* Early models (<=5) of AMD Opterons are not supposed to go into
* C2 state.
@@ -135,6 +140,7 @@ static inline unsigned int acpi_processor_cstate_check(unsigned int max_cstate)
else if (amd_e400_c1e_detected)
return 1;
else
+#endif
return max_cstate;
}
@@ -174,7 +180,9 @@ static inline void disable_acpi(void) { }
#endif /* !CONFIG_ACPI */
+#ifndef CONFIG_XEN
#define ARCH_HAS_POWER_INIT 1
+#endif
#ifdef CONFIG_ACPI_NUMA
extern int acpi_numa;
diff --git a/arch/x86/include/asm/agp.h b/arch/x86/include/asm/agp.h
index eec2a70d4376..91e72c039eb2 100644
--- a/arch/x86/include/asm/agp.h
+++ b/arch/x86/include/asm/agp.h
@@ -15,6 +15,9 @@
#define map_page_into_agp(page) set_pages_uc(page, 1)
#define unmap_page_from_agp(page) set_pages_wb(page, 1)
+#define map_pages_into_agp set_pages_array_uc
+#define unmap_pages_from_agp set_pages_array_wb
+
/*
* Could use CLFLUSH here if the cpu supports it. But then it would
* need to be called for each cacheline of the whole page so it may
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index 338803422239..f1f68a73b7c8 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -9,11 +9,15 @@
#include <asm/processor.h>
#include <asm/apicdef.h>
#include <linux/atomic.h>
+#ifndef CONFIG_XEN
#include <asm/fixmap.h>
+#endif
#include <asm/mpspec.h>
#include <asm/msr.h>
+#ifndef CONFIG_XEN
#define ARCH_APICTIMER_STOPS_ON_C3 1
+#endif
/*
* Debugging macros
@@ -45,6 +49,7 @@ static inline void generic_apic_probe(void)
#ifdef CONFIG_X86_LOCAL_APIC
extern unsigned int apic_verbosity;
+#ifndef CONFIG_XEN
extern int local_apic_timer_c2_ok;
extern int disable_apic;
@@ -118,6 +123,8 @@ extern u64 native_apic_icr_read(void);
extern int x2apic_mode;
+#endif /* CONFIG_XEN */
+
#ifdef CONFIG_X86_X2APIC
/*
* Make previous memory operations globally visible before
@@ -242,7 +249,11 @@ extern void setup_local_APIC(void);
extern void end_local_APIC_setup(void);
extern void bsp_end_local_APIC_setup(void);
extern void init_apic_mappings(void);
+#ifndef CONFIG_XEN
void register_lapic_address(unsigned long address);
+#else
+#define register_lapic_address(address)
+#endif
extern void setup_boot_APIC_clock(void);
extern void setup_secondary_APIC_clock(void);
extern int APIC_init_uniprocessor(void);
@@ -290,16 +301,19 @@ static inline void disable_local_APIC(void) { }
struct apic {
char *name;
+#ifndef CONFIG_XEN
int (*probe)(void);
int (*acpi_madt_oem_check)(char *oem_id, char *oem_table_id);
int (*apic_id_valid)(int apicid);
int (*apic_id_registered)(void);
+#endif
u32 irq_delivery_mode;
u32 irq_dest_mode;
const struct cpumask *(*target_cpus)(void);
+#ifndef CONFIG_XEN
int disable_esr;
int dest_logical;
@@ -319,8 +333,10 @@ struct apic {
void (*setup_portio_remap)(void);
int (*check_phys_apicid_present)(int phys_apicid);
void (*enable_apic_mode)(void);
+#endif
int (*phys_pkg_id)(int cpuid_apic, int index_msb);
+#ifndef CONFIG_XEN
/*
* When one of the next two hooks returns 1 the apic
* is switched to this. Essentially they are additional
@@ -335,6 +351,7 @@ struct apic {
int (*cpu_mask_to_apicid_and)(const struct cpumask *cpumask,
const struct cpumask *andmask,
unsigned int *apicid);
+#endif
/* ipi */
void (*send_IPI_mask)(const struct cpumask *mask, int vector);
@@ -344,6 +361,7 @@ struct apic {
void (*send_IPI_all)(int vector);
void (*send_IPI_self)(int vector);
+#ifndef CONFIG_XEN
/* wakeup_secondary_cpu */
int (*wakeup_secondary_cpu)(int apicid, unsigned long start_eip);
@@ -391,6 +409,7 @@ struct apic {
*/
int (*x86_32_numa_cpu_node)(int cpu);
#endif
+#endif /* CONFIG_XEN */
};
/*
@@ -400,6 +419,8 @@ struct apic {
*/
extern struct apic *apic;
+#ifndef CONFIG_XEN
+
/*
* APIC drivers are probed based on how they are listed in the .apicdrivers
* section. So the order is important and enforced by the ordering
@@ -525,6 +546,7 @@ static inline void default_wait_for_init_deassert(atomic_t *deassert)
extern void generic_bigsmp_probe(void);
+#endif /* CONFIG_XEN */
#ifdef CONFIG_X86_LOCAL_APIC
@@ -546,6 +568,8 @@ static inline const struct cpumask *online_target_cpus(void)
return cpu_online_mask;
}
+#ifndef CONFIG_XEN
+
DECLARE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_bios_cpu_apicid);
@@ -686,6 +710,8 @@ extern int default_cpu_present_to_apicid(int mps_cpu);
extern int default_check_phys_apicid_present(int phys_apicid);
#endif
+#endif /* CONFIG_XEN */
+
#endif /* CONFIG_X86_LOCAL_APIC */
#endif /* _ASM_X86_APIC_H */
diff --git a/arch/x86/include/asm/apicdef.h b/arch/x86/include/asm/apicdef.h
index c46bb99d5fb2..2be8ade07662 100644
--- a/arch/x86/include/asm/apicdef.h
+++ b/arch/x86/include/asm/apicdef.h
@@ -17,6 +17,8 @@
*/
#define IO_APIC_SLOT_SIZE 1024
+#ifndef CONFIG_XEN
+
#define APIC_ID 0x20
#define APIC_LVR 0x30
@@ -147,6 +149,16 @@
#define XAPIC_ENABLE (1UL << 11)
#define X2APIC_ENABLE (1UL << 10)
+#else /* CONFIG_XEN */
+
+enum {
+ APIC_DEST_ALLBUT = 0x1,
+ APIC_DEST_SELF,
+ APIC_DEST_ALLINC
+};
+
+#endif /* CONFIG_XEN */
+
#ifdef CONFIG_X86_32
# define MAX_IO_APICS 64
# define MAX_LOCAL_APIC 256
@@ -155,6 +167,8 @@
# define MAX_LOCAL_APIC 32768
#endif
+#ifndef CONFIG_XEN
+
/*
* All x86-64 systems are xAPIC compatible.
* In the following, "apicid" is a physical APIC ID.
@@ -425,6 +439,8 @@ struct local_apic {
#undef u32
+#endif /* CONFIG_XEN */
+
#ifdef CONFIG_X86_32
#define BAD_APICID 0xFFu
#else
diff --git a/arch/x86/include/asm/boot.h b/arch/x86/include/asm/boot.h
index 4fa687a47a62..1ac8aec48e6f 100644
--- a/arch/x86/include/asm/boot.h
+++ b/arch/x86/include/asm/boot.h
@@ -11,7 +11,7 @@
& ~(CONFIG_PHYSICAL_ALIGN - 1))
/* Minimum kernel alignment, as a power of two */
-#ifdef CONFIG_X86_64
+#if defined(CONFIG_X86_64) && !defined(CONFIG_XEN)
#define MIN_KERNEL_ALIGN_LG2 PMD_SHIFT
#else
#define MIN_KERNEL_ALIGN_LG2 (PAGE_SHIFT + THREAD_SIZE_ORDER)
diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h
index 59c6c401f79f..1ac182ffdb69 100644
--- a/arch/x86/include/asm/compat.h
+++ b/arch/x86/include/asm/compat.h
@@ -301,7 +301,11 @@ static inline void __user *arch_compat_alloc_user_space(long len)
sp = task_pt_regs(current)->sp;
} else {
/* -128 for the x32 ABI redzone */
+#ifndef CONFIG_XEN
sp = this_cpu_read(old_rsp) - 128;
+#else
+ sp = task_pt_regs(current)->sp - 128;
+#endif
}
return (void __user *)round_down(sp - len, 16);
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index e99ac27f95b2..1f2eca7ec6fb 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -320,7 +320,11 @@ extern const char * const x86_power_flags[32];
#define cpu_has_xmm4_1 boot_cpu_has(X86_FEATURE_XMM4_1)
#define cpu_has_xmm4_2 boot_cpu_has(X86_FEATURE_XMM4_2)
#define cpu_has_x2apic boot_cpu_has(X86_FEATURE_X2APIC)
+#ifndef CONFIG_XEN
#define cpu_has_xsave boot_cpu_has(X86_FEATURE_XSAVE)
+#else
+#define cpu_has_xsave boot_cpu_has(X86_FEATURE_OSXSAVE)
+#endif
#define cpu_has_xsaveopt boot_cpu_has(X86_FEATURE_XSAVEOPT)
#define cpu_has_osxsave boot_cpu_has(X86_FEATURE_OSXSAVE)
#define cpu_has_hypervisor boot_cpu_has(X86_FEATURE_HYPERVISOR)
diff --git a/arch/x86/include/asm/debugreg.h b/arch/x86/include/asm/debugreg.h
index 4b528a970bd4..deb6469e2b61 100644
--- a/arch/x86/include/asm/debugreg.h
+++ b/arch/x86/include/asm/debugreg.h
@@ -19,6 +19,7 @@ DECLARE_PER_CPU(unsigned long, cpu_dr7);
static inline unsigned long native_get_debugreg(int regno)
{
+#ifndef CONFIG_XEN
unsigned long val = 0; /* Damn you, gcc! */
switch (regno) {
@@ -44,10 +45,14 @@ static inline unsigned long native_get_debugreg(int regno)
BUG();
}
return val;
+#else
+ return HYPERVISOR_get_debugreg(regno);
+#endif
}
static inline void native_set_debugreg(int regno, unsigned long value)
{
+#ifndef CONFIG_XEN
switch (regno) {
case 0:
asm("mov %0, %%db0" ::"r" (value));
@@ -70,6 +75,9 @@ static inline void native_set_debugreg(int regno, unsigned long value)
default:
BUG();
}
+#else
+ WARN_ON(HYPERVISOR_set_debugreg(regno, value));
+#endif
}
static inline void hw_breakpoint_disable(void)
@@ -93,7 +101,7 @@ extern void aout_dump_debugregs(struct user *dump);
extern void hw_breakpoint_restore(void);
-#ifdef CONFIG_X86_64
+#if defined(CONFIG_X86_64) && !defined(CONFIG_X86_NO_IDT)
DECLARE_PER_CPU(int, debug_stack_usage);
static inline void debug_stack_usage_inc(void)
{
diff --git a/arch/x86/include/asm/dwarf2.h b/arch/x86/include/asm/dwarf2.h
index 699a973974ac..d6ae375d1303 100644
--- a/arch/x86/include/asm/dwarf2.h
+++ b/arch/x86/include/asm/dwarf2.h
@@ -27,6 +27,7 @@
#define CFI_REMEMBER_STATE .cfi_remember_state
#define CFI_RESTORE_STATE .cfi_restore_state
#define CFI_UNDEFINED .cfi_undefined
+#define CFI_SAME_VALUE .cfi_same_value
#define CFI_ESCAPE .cfi_escape
#ifdef CONFIG_AS_CFI_SIGNAL_FRAME
@@ -70,6 +71,7 @@
#define CFI_REMEMBER_STATE cfi_ignore
#define CFI_RESTORE_STATE cfi_ignore
#define CFI_UNDEFINED cfi_ignore
+#define CFI_SAME_VALUE cfi_ignore
#define CFI_ESCAPE cfi_ignore
#define CFI_SIGNAL_FRAME cfi_ignore
diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h
index 2fb5d5884e23..7dc4b95bc2db 100644
--- a/arch/x86/include/asm/efi.h
+++ b/arch/x86/include/asm/efi.h
@@ -113,7 +113,11 @@ struct efi_var_bootdata {
static inline bool efi_is_native(void)
{
+#ifndef CONFIG_XEN
return IS_ENABLED(CONFIG_X86_64) == efi_enabled(EFI_64BIT);
+#else
+ return 1; /* Hypervisor handles the mismatch quite fine. */
+#endif
}
#else
diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h
index ab0ae1aa6d0a..389ba80830f7 100644
--- a/arch/x86/include/asm/hardirq.h
+++ b/arch/x86/include/asm/hardirq.h
@@ -21,11 +21,15 @@ typedef struct {
#ifdef CONFIG_SMP
unsigned int irq_resched_count;
unsigned int irq_call_count;
+#ifndef CONFIG_XEN
/*
* irq_tlb_count is double-counted in irq_call_count, so it must be
* subtracted from irq_call_count when displaying irq_call_count
*/
unsigned int irq_tlb_count;
+#else
+ unsigned int irq_lock_count;
+#endif
#endif
#ifdef CONFIG_X86_THERMAL_VECTOR
unsigned int irq_thermal_count;
diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index 1da97efad08a..dd2454240e43 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -102,6 +102,7 @@ static inline void set_io_apic_irq_attr(struct io_apic_irq_attr *irq_attr,
irq_attr->polarity = polarity;
}
+#ifndef CONFIG_XEN
/* Intel specific interrupt remapping information */
struct irq_2_iommu {
struct intel_iommu *iommu;
@@ -135,6 +136,9 @@ struct irq_cfg {
};
#endif
};
+#else
+struct irq_cfg;
+#endif
extern int assign_irq_vector(int, struct irq_cfg *, const struct cpumask *);
extern void send_cleanup_vector(struct irq_cfg *);
@@ -171,9 +175,15 @@ extern void smp_invalidate_interrupt(struct pt_regs *);
#else
extern asmlinkage void smp_invalidate_interrupt(struct pt_regs *);
#endif
+extern void smp_irq_work_interrupt(struct pt_regs *);
+#ifdef CONFIG_XEN
+extern void smp_reboot_interrupt(struct pt_regs *);
+#endif
#endif
+#ifndef CONFIG_XEN
extern void (*__initconst interrupt[NR_VECTORS-FIRST_EXTERNAL_VECTOR])(void);
+#endif
typedef int vector_irq_t[NR_VECTORS];
DECLARE_PER_CPU(vector_irq_t, vector_irq);
diff --git a/arch/x86/include/asm/hypervisor.h b/arch/x86/include/asm/hypervisor.h
index 2d4b5e6107cd..c80988ddce1d 100644
--- a/arch/x86/include/asm/hypervisor.h
+++ b/arch/x86/include/asm/hypervisor.h
@@ -62,3 +62,8 @@ static inline void init_hypervisor_platform(void) { }
static inline bool hypervisor_x2apic_available(void) { return false; }
#endif /* CONFIG_HYPERVISOR_GUEST */
#endif /* _ASM_X86_HYPERVISOR_H */
+
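+/*
+ * #include_next continues the search for <asm/hypervisor.h> in the include
+ * paths after the one where this header was found; the guard below makes
+ * this chaining happen only when the Xen platform compat headers are in use.
+ */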
+#ifdef HAVE_XEN_PLATFORM_COMPAT_H
+#include <asm/xen/hypervisor.h>
+#include_next <asm/hypervisor.h>
+#endif
diff --git a/arch/x86/include/asm/i8259.h b/arch/x86/include/asm/i8259.h
index a20365953bf8..b9daf61f5cab 100644
--- a/arch/x86/include/asm/i8259.h
+++ b/arch/x86/include/asm/i8259.h
@@ -54,6 +54,7 @@ extern struct irq_chip i8259A_chip;
struct legacy_pic {
int nr_legacy_irqs;
+#ifndef CONFIG_XEN
struct irq_chip *chip;
void (*mask)(unsigned int irq);
void (*unmask)(unsigned int irq);
@@ -61,6 +62,7 @@ struct legacy_pic {
void (*restore_mask)(void);
void (*init)(int auto_eoi);
int (*irq_pending)(unsigned int irq);
+#endif
void (*make_irq)(unsigned int irq);
};
diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h
index d8e8eefbe24c..51cd71d411cf 100644
--- a/arch/x86/include/asm/io.h
+++ b/arch/x86/include/asm/io.h
@@ -331,7 +331,7 @@ extern void early_iounmap(void __iomem *addr, unsigned long size);
extern void fixup_early_ioremap(void);
extern bool is_early_ioremap_ptep(pte_t *ptep);
-#ifdef CONFIG_XEN
+#ifdef CONFIG_PARAVIRT_XEN
#include <xen/xen.h>
struct bio_vec;
@@ -341,7 +341,7 @@ extern bool xen_biovec_phys_mergeable(const struct bio_vec *vec1,
#define BIOVEC_PHYS_MERGEABLE(vec1, vec2) \
(__BIOVEC_PHYS_MERGEABLE(vec1, vec2) && \
(!xen_domain() || xen_biovec_phys_mergeable(vec1, vec2)))
-#endif /* CONFIG_XEN */
+#endif /* CONFIG_PARAVIRT_XEN */
#define IO_SPACE_LIMIT 0xffff
diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h
index 459e50a424d1..881accc25e74 100644
--- a/arch/x86/include/asm/io_apic.h
+++ b/arch/x86/include/asm/io_apic.h
@@ -188,6 +188,7 @@ extern void mp_save_irq(struct mpc_intsrc *m);
extern void disable_ioapic_support(void);
+#ifndef CONFIG_XEN
extern void __init native_io_apic_init_mappings(void);
extern unsigned int native_io_apic_read(unsigned int apic, unsigned int reg);
extern void native_io_apic_write(unsigned int apic, unsigned int reg, unsigned int val);
@@ -214,6 +215,7 @@ static inline void io_apic_modify(unsigned int apic, unsigned int reg, unsigned
}
extern void io_apic_eoi(unsigned int apic, unsigned int vector);
+#endif /* !CONFIG_XEN */
#else /* !CONFIG_X86_IO_APIC */
diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h
index 17483a492f18..3fb7a6bf28a3 100644
--- a/arch/x86/include/asm/kexec.h
+++ b/arch/x86/include/asm/kexec.h
@@ -5,14 +5,30 @@
# define PA_CONTROL_PAGE 0
# define VA_CONTROL_PAGE 1
# define PA_PGD 2
+# ifndef CONFIG_XEN
# define PA_SWAP_PAGE 3
# define PAGES_NR 4
+# else /* CONFIG_XEN */
+/*
+ * The hypervisor interface implicitly requires that all entries (except
+ * for possibly the final one) are arranged in matching PA_/VA_ pairs.
+# define VA_PGD 3
+ */
+# define PA_SWAP_PAGE 4
+# define PAGES_NR 5
+# endif /* CONFIG_XEN */
#else
# define PA_CONTROL_PAGE 0
# define VA_CONTROL_PAGE 1
# define PA_TABLE_PAGE 2
+# ifndef CONFIG_XEN
# define PA_SWAP_PAGE 3
# define PAGES_NR 4
+# else /* CONFIG_XEN, see comment above
+# define VA_TABLE_PAGE 3 */
+# define PA_SWAP_PAGE 4
+# define PAGES_NR 5
+# endif /* CONFIG_XEN */
#endif
# define KEXEC_CONTROL_CODE_MAX_SIZE 2048
@@ -166,6 +182,19 @@ struct kimage_arch {
typedef void crash_vmclear_fn(void);
extern crash_vmclear_fn __rcu *crash_vmclear_loaded_vmcss;
+/* Under Xen we need to work with machine addresses. These macros give the
+ * machine address of a certain page to the generic kexec code instead of
+ * the pseudo-physical address which would be given by the default macros.
+ */
+
+#ifdef CONFIG_XEN
+#define KEXEC_ARCH_HAS_PAGE_MACROS
+#define kexec_page_to_pfn(page) pfn_to_mfn(page_to_pfn(page))
+#define kexec_pfn_to_page(pfn) pfn_to_page(mfn_to_pfn(pfn))
+#define kexec_virt_to_phys(addr) virt_to_machine(addr)
+#define kexec_phys_to_virt(addr) phys_to_virt(machine_to_phys(addr))
+#endif
+
#endif /* __ASSEMBLY__ */
#endif /* _ASM_X86_KEXEC_H */
diff --git a/arch/x86/include/asm/mach_traps.h b/arch/x86/include/asm/mach_traps.h
index a01e7ec7d237..a39d3e17abe8 100644
--- a/arch/x86/include/asm/mach_traps.h
+++ b/arch/x86/include/asm/mach_traps.h
@@ -5,6 +5,8 @@
#ifndef _ASM_X86_MACH_DEFAULT_MACH_TRAPS_H
#define _ASM_X86_MACH_DEFAULT_MACH_TRAPS_H
+#include <linux/nmi.h>
+#include <asm/delay.h>
#include <asm/mc146818rtc.h>
#define NMI_REASON_PORT 0x61
@@ -22,6 +24,29 @@ static inline unsigned char default_get_nmi_reason(void)
return inb(NMI_REASON_PORT);
}
+static inline void clear_serr_error(unsigned char reason)
+{
+ reason = (reason & NMI_REASON_CLEAR_MASK) | NMI_REASON_CLEAR_SERR;
+ outb(reason, NMI_REASON_PORT);
+}
+
+static inline void clear_io_check_error(unsigned char reason)
+{
+ unsigned long i;
+
+ reason = (reason & NMI_REASON_CLEAR_MASK) | NMI_REASON_CLEAR_IOCHK;
+ outb(reason, NMI_REASON_PORT);
+
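+ /* give the IOCHK line ~2s to settle: 20000 iterations of udelay(100) */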
+ i = 20000;
+ while (--i) {
+ touch_nmi_watchdog();
+ udelay(100);
+ }
+
+ reason &= ~NMI_REASON_CLEAR_IOCHK;
+ outb(reason, NMI_REASON_PORT);
+}
+
static inline void reassert_nmi(void)
{
int old_reg = -1;
diff --git a/arch/x86/include/asm/mc146818rtc.h b/arch/x86/include/asm/mc146818rtc.h
index d354fb781c57..9198ee489b6d 100644
--- a/arch/x86/include/asm/mc146818rtc.h
+++ b/arch/x86/include/asm/mc146818rtc.h
@@ -13,7 +13,7 @@
#define RTC_ALWAYS_BCD 1 /* RTC operates in binary mode */
#endif
-#if defined(CONFIG_X86_32) && defined(__HAVE_ARCH_CMPXCHG)
+#if defined(CONFIG_X86_32) && !defined(CONFIG_XEN) && defined(__HAVE_ARCH_CMPXCHG)
/*
* This lock provides nmi access to the CMOS/RTC registers. It has some
* special properties. It is owned by a CPU and stores the index register
diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h
index 5f55e6962769..e9b162ac1415 100644
--- a/arch/x86/include/asm/mmu.h
+++ b/arch/x86/include/asm/mmu.h
@@ -16,12 +16,15 @@ typedef struct {
/* True if mm supports a task running in 32 bit compatibility mode. */
unsigned short ia32_compat;
#endif
+#ifdef CONFIG_XEN
+ bool has_foreign_mappings:1;
+#endif
struct mutex lock;
void *vdso;
} mm_context_t;
-#ifdef CONFIG_SMP
+#if defined(CONFIG_SMP) && !defined(CONFIG_XEN)
void leave_mm(int cpu);
#else
static inline void leave_mm(int cpu)
diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h
index c0fa356e90de..623f905c1c68 100644
--- a/arch/x86/include/asm/nmi.h
+++ b/arch/x86/include/asm/nmi.h
@@ -17,7 +17,10 @@ struct ctl_table;
extern int proc_nmi_enabled(struct ctl_table *, int ,
void __user *, size_t *, loff_t *);
extern int unknown_nmi_panic;
+#endif
+#if (defined(CONFIG_X86_LOCAL_APIC) && !defined(CONFIG_XEN)) || \
+ (defined(CONFIG_XEN_SMPBOOT) && CONFIG_XEN_COMPAT >= 0x030200)
void arch_trigger_all_cpu_backtrace(void);
#define arch_trigger_all_cpu_backtrace arch_trigger_all_cpu_backtrace
#endif
diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h
index 0f1ddee6a0ce..33c9206162ea 100644
--- a/arch/x86/include/asm/page_64.h
+++ b/arch/x86/include/asm/page_64.h
@@ -7,7 +7,14 @@
/* duplicated to the one in bootmem.h */
extern unsigned long max_pfn;
+#ifndef CONFIG_XEN
extern unsigned long phys_base;
+#else
+/* This would be nice, but the symbol name is too generic:
+#define phys_base 0
+*/
+extern const unsigned long phys_base;
+#endif
static inline unsigned long __phys_addr_nodebug(unsigned long x)
{
@@ -31,7 +38,15 @@ extern unsigned long __phys_addr_symbol(unsigned long);
#define __phys_reloc_hide(x) (x)
#ifdef CONFIG_FLATMEM
+/*
+ * Use max_mapnr here: max_pfn is not exported, and on non-Xen max_mapnr
+ * never gets initialized except for hotplugged memory.
+ */
+#ifndef CONFIG_XEN
#define pfn_valid(pfn) ((pfn) < max_pfn)
+#else
+#define pfn_valid(pfn) ((pfn) < max_mapnr)
+#endif
#endif
void clear_page(void *page);
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 22224b3b43bb..e914373221e7 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -974,7 +974,7 @@ extern unsigned long arch_align_stack(unsigned long sp);
extern void free_init_pages(char *what, unsigned long begin, unsigned long end);
void default_idle(void);
-#ifdef CONFIG_XEN
+#ifdef CONFIG_PARAVIRT_XEN
bool xen_set_default_idle(void);
#else
#define xen_set_default_idle 0
diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
index 942a08623a1a..e9efa8f66039 100644
--- a/arch/x86/include/asm/ptrace.h
+++ b/arch/x86/include/asm/ptrace.h
@@ -63,6 +63,8 @@ struct pt_regs {
#include <linux/init.h>
#ifdef CONFIG_PARAVIRT
#include <asm/paravirt_types.h>
+#elif defined(CONFIG_X86_64_XEN)
+#include <xen/interface/xen.h>
#endif
struct cpuinfo_x86;
@@ -122,7 +124,13 @@ static inline int v8086_mode(struct pt_regs *regs)
#ifdef CONFIG_X86_64
static inline bool user_64bit_mode(struct pt_regs *regs)
{
-#ifndef CONFIG_PARAVIRT
+#if defined(CONFIG_XEN)
+ /*
+ * On Xen, these are the only long mode CPL 3 selectors.
+ * We do not allow long mode selectors in the LDT.
+ */
+ return regs->cs == __USER_CS || regs->cs == FLAT_USER_CS64;
+#elif !defined(CONFIG_PARAVIRT)
/*
* On non-paravirt systems, this is the only long mode CPL 3
* selector. We do not allow long mode selectors in the LDT.
@@ -134,6 +142,7 @@ static inline bool user_64bit_mode(struct pt_regs *regs)
#endif
}
+#ifndef CONFIG_XEN
#define current_user_stack_pointer() this_cpu_read(old_rsp)
/* ia32 vs. x32 difference */
#define compat_user_stack_pointer() \
@@ -141,6 +150,7 @@ static inline bool user_64bit_mode(struct pt_regs *regs)
? current_pt_regs()->sp \
: this_cpu_read(old_rsp))
#endif
+#endif
#ifdef CONFIG_X86_32
extern unsigned long kernel_stack_pointer(struct pt_regs *regs);
@@ -224,7 +234,9 @@ static inline unsigned long regs_get_kernel_stack_nth(struct pt_regs *regs,
}
#define arch_has_single_step() (1)
-#ifdef CONFIG_X86_DEBUGCTLMSR
+#if defined(CONFIG_XEN)
+#define arch_has_block_step() (0)
+#elif defined(CONFIG_X86_DEBUGCTLMSR)
#define arch_has_block_step() (1)
#else
#define arch_has_block_step() (boot_cpu_data.x86 >= 6)
diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h
index 109a9dd5d454..5a568350a47b 100644
--- a/arch/x86/include/asm/pvclock.h
+++ b/arch/x86/include/asm/pvclock.h
@@ -83,7 +83,11 @@ unsigned __pvclock_read_cycles(const struct pvclock_vcpu_time_info *src,
rdtsc_barrier();
offset = pvclock_get_nsec_offset(src);
ret = src->system_time + offset;
+#ifndef CONFIG_XEN
ret_flags = src->flags;
+#else
+ ret_flags = PVCLOCK_TSC_STABLE_BIT;
+#endif
rdtsc_barrier();
*cycles = ret;
diff --git a/arch/x86/include/asm/required-features.h b/arch/x86/include/asm/required-features.h
index 5c6e4fb370f5..24255a1ce8ed 100644
--- a/arch/x86/include/asm/required-features.h
+++ b/arch/x86/include/asm/required-features.h
@@ -54,7 +54,7 @@
#endif
#ifdef CONFIG_X86_64
-#ifdef CONFIG_PARAVIRT
+#if defined(CONFIG_PARAVIRT) || defined(CONFIG_XEN)
/* Paravirtualized systems may not have PSE or PGE available */
#define NEED_PSE 0
#define NEED_PGE 0
diff --git a/arch/x86/include/asm/segment.h b/arch/x86/include/asm/segment.h
index c48a95035a77..ffbf4d85473b 100644
--- a/arch/x86/include/asm/segment.h
+++ b/arch/x86/include/asm/segment.h
@@ -188,7 +188,9 @@
#define __KERNEL_DS (GDT_ENTRY_KERNEL_DS*8)
#define __USER_DS (GDT_ENTRY_DEFAULT_USER_DS*8+3)
#define __USER_CS (GDT_ENTRY_DEFAULT_USER_CS*8+3)
-#ifndef CONFIG_PARAVIRT
+#if defined(CONFIG_X86_XEN)
+#define get_kernel_rpl() (xen_feature(XENFEAT_supervisor_mode_kernel)?0:1)
+#elif !defined(CONFIG_PARAVIRT)
#define get_kernel_rpl() 0
#endif
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index a1df6e84691f..9df514be6e5f 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -97,6 +97,9 @@ struct thread_info {
#define TIF_SYSCALL_TRACEPOINT 28 /* syscall tracepoint instrumentation */
#define TIF_ADDR32 29 /* 32-bit address space on 64 bits */
#define TIF_X32 30 /* 32-bit native x86-64 binary */
+#if defined(CONFIG_X86_XEN) && defined(CONFIG_CPU_SUP_AMD)
+#define TIF_CSTAR 31 /* cstar-based syscall (special handling) */
+#endif
#define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE)
#define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME)
@@ -121,6 +124,7 @@ struct thread_info {
#define _TIF_SYSCALL_TRACEPOINT (1 << TIF_SYSCALL_TRACEPOINT)
#define _TIF_ADDR32 (1 << TIF_ADDR32)
#define _TIF_X32 (1 << TIF_X32)
+#define _TIF_CSTAR (1 << TIF_CSTAR)
/* work to do in syscall_trace_enter() */
#define _TIF_WORK_SYSCALL_ENTRY \
@@ -150,9 +154,13 @@ struct thread_info {
_TIF_USER_RETURN_NOTIFY)
/* flags to check in __switch_to() */
+#ifndef CONFIG_XEN
#define _TIF_WORK_CTXSW \
(_TIF_IO_BITMAP|_TIF_NOTSC|_TIF_BLOCKSTEP)
+#else
+#define _TIF_WORK_CTXSW (_TIF_NOTSC /*todo | _TIF_BLOCKSTEP */)
+#endif
#define _TIF_WORK_CTXSW_PREV (_TIF_WORK_CTXSW|_TIF_USER_RETURN_NOTIFY)
#define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW|_TIF_DEBUG)
diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
index 095b21507b6a..a00aa31da493 100644
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -30,7 +30,7 @@
# define ENABLE_TOPO_DEFINES
# endif
#else
-# ifdef CONFIG_SMP
+# if defined(CONFIG_SMP) && !defined(CONFIG_XEN)
# define ENABLE_TOPO_DEFINES
# endif
#endif
diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h
index 88eae2aec619..6d16bc7c9e13 100644
--- a/arch/x86/include/asm/traps.h
+++ b/arch/x86/include/asm/traps.h
@@ -40,6 +40,9 @@ asmlinkage void alignment_check(void);
asmlinkage void machine_check(void);
#endif /* CONFIG_X86_MCE */
asmlinkage void simd_coprocessor_error(void);
+#ifdef CONFIG_X86_XEN
+asmlinkage void fixup_4gb_segment(void);
+#endif
dotraplinkage void do_divide_error(struct pt_regs *, long);
dotraplinkage void do_debug(struct pt_regs *, long);
@@ -68,6 +71,9 @@ dotraplinkage void do_machine_check(struct pt_regs *, long);
dotraplinkage void do_simd_coprocessor_error(struct pt_regs *, long);
#ifdef CONFIG_X86_32
dotraplinkage void do_iret_error(struct pt_regs *, long);
+#ifdef CONFIG_XEN
+void do_fixup_4gb_segment(struct pt_regs *, long);
+#endif
#endif
static inline int get_si_code(unsigned long condition)
diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h
index 2c32df95bb78..5be85734c325 100644
--- a/arch/x86/include/asm/uv/uv_hub.h
+++ b/arch/x86/include/asm/uv/uv_hub.h
@@ -11,7 +11,7 @@
#ifndef _ASM_X86_UV_UV_HUB_H
#define _ASM_X86_UV_UV_HUB_H
-#ifdef CONFIG_X86_64
+#ifdef CONFIG_X86_UV
#include <linux/numa.h>
#include <linux/percpu.h>
#include <linux/timer.h>
diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h
index e709884d0ef9..6f786887890c 100644
--- a/arch/x86/include/asm/xen/hypercall.h
+++ b/arch/x86/include/asm/xen/hypercall.h
@@ -48,6 +48,7 @@
#include <xen/interface/sched.h>
#include <xen/interface/physdev.h>
#include <xen/interface/platform.h>
+#include <xen/interface/tmem.h>
#include <xen/interface/xen-mca.h>
/*
diff --git a/arch/x86/include/asm/xen/hypervisor.h b/arch/x86/include/asm/xen/hypervisor.h
index 125f344f06a9..1301679df1f2 100644
--- a/arch/x86/include/asm/xen/hypervisor.h
+++ b/arch/x86/include/asm/xen/hypervisor.h
@@ -57,7 +57,7 @@ static inline uint32_t xen_cpuid_base(void)
return 0;
}
-#ifdef CONFIG_XEN
+#ifdef CONFIG_PARAVIRT_XEN
extern bool xen_hvm_need_lapic(void);
static inline bool xen_x2apic_para_available(void)
diff --git a/arch/x86/include/asm/xen/interface.h b/arch/x86/include/asm/xen/interface.h
index fd9cb7695b5f..e24bc4d39860 100644
--- a/arch/x86/include/asm/xen/interface.h
+++ b/arch/x86/include/asm/xen/interface.h
@@ -10,17 +10,20 @@
#define _ASM_X86_XEN_INTERFACE_H
#ifdef __XEN__
-#define __DEFINE_GUEST_HANDLE(name, type) \
+#define ___DEFINE_XEN_GUEST_HANDLE(name, type) \
typedef struct { type *p; } __guest_handle_ ## name
#else
-#define __DEFINE_GUEST_HANDLE(name, type) \
+#define ___DEFINE_XEN_GUEST_HANDLE(name, type) \
typedef type * __guest_handle_ ## name
#endif
+#define __DEFINE_XEN_GUEST_HANDLE(name, type) \
+ ___DEFINE_XEN_GUEST_HANDLE(name, type); \
+ ___DEFINE_XEN_GUEST_HANDLE(const_##name, const type)
#define DEFINE_GUEST_HANDLE_STRUCT(name) \
- __DEFINE_GUEST_HANDLE(name, struct name)
-#define DEFINE_GUEST_HANDLE(name) __DEFINE_GUEST_HANDLE(name, name)
-#define GUEST_HANDLE(name) __guest_handle_ ## name
+ __DEFINE_XEN_GUEST_HANDLE(name, struct name)
+#define DEFINE_XEN_GUEST_HANDLE(name) __DEFINE_XEN_GUEST_HANDLE(name, name)
+#define XEN_GUEST_HANDLE(name) __guest_handle_ ## name
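+/*
+ * e.g. outside __XEN__, DEFINE_XEN_GUEST_HANDLE(int) now yields both
+ * "typedef int *__guest_handle_int;" and
+ * "typedef const int *__guest_handle_const_int;".
+ */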
#ifdef __XEN__
#if defined(__i386__)
@@ -54,16 +57,6 @@ typedef unsigned long xen_pfn_t;
#define PRI_xen_pfn "lx"
typedef unsigned long xen_ulong_t;
#define PRI_xen_ulong "lx"
-/* Guest handles for primitive C types. */
-__DEFINE_GUEST_HANDLE(uchar, unsigned char);
-__DEFINE_GUEST_HANDLE(uint, unsigned int);
-DEFINE_GUEST_HANDLE(char);
-DEFINE_GUEST_HANDLE(int);
-DEFINE_GUEST_HANDLE(void);
-DEFINE_GUEST_HANDLE(uint64_t);
-DEFINE_GUEST_HANDLE(uint32_t);
-DEFINE_GUEST_HANDLE(xen_pfn_t);
-DEFINE_GUEST_HANDLE(xen_ulong_t);
#endif
#ifndef HYPERVISOR_VIRT_START
@@ -75,7 +68,7 @@ DEFINE_GUEST_HANDLE(xen_ulong_t);
#define MACH2PHYS_NR_ENTRIES ((MACH2PHYS_VIRT_END-MACH2PHYS_VIRT_START)>>__MACH2PHYS_SHIFT)
/* Maximum number of virtual CPUs in multi-processor guests. */
-#define MAX_VIRT_CPUS 32
+#define XEN_LEGACY_MAX_VCPUS 32
/*
* SEGMENT DESCRIPTOR TABLES
diff --git a/arch/x86/include/mach-xen/asm/agp.h b/arch/x86/include/mach-xen/asm/agp.h
new file mode 100644
index 000000000000..45ba49dfa8d0
--- /dev/null
+++ b/arch/x86/include/mach-xen/asm/agp.h
@@ -0,0 +1,58 @@
+#ifndef _ASM_X86_AGP_H
+#define _ASM_X86_AGP_H
+
+#include <asm/pgtable.h>
+#include <asm/cacheflush.h>
+#include <asm/special_insns.h>
+
+/*
+ * Functions to keep the agpgart mappings coherent with the MMU. The
+ * GART gives the CPU a physical alias of pages in memory. The alias
+ * region is mapped uncacheable. Make sure there are no conflicting
+ * mappings with different cacheability attributes for the same
+ * page. This avoids data corruption on some CPUs.
+ */
+
+#define map_page_into_agp(page) ( \
+ xen_create_contiguous_region((unsigned long)page_address(page), 0, 32) \
+ ?: set_pages_uc(page, 1))
+#define unmap_page_from_agp(page) ( \
+ xen_destroy_contiguous_region((unsigned long)page_address(page), 0), \
+ /* only a fallback: xen_destroy_contiguous_region uses PAGE_KERNEL */ \
+ set_pages_wb(page, 1))
+
+#define map_pages_into_agp(pages, nr) ({ \
+ __typeof__(nr) n__; \
+ int rc__ = 0; \
+ for (n__ = 0; n__ < (nr) && !rc__; ++n__) \
+ rc__ = xen_create_contiguous_region( \
+ (unsigned long)page_address((pages)[n__]), 0, 32); \
+ rc__ ?: set_pages_array_uc(pages, nr); \
+})
+#define unmap_pages_from_agp(pages, nr) ({ \
+ __typeof__(nr) n__; \
+ for (n__ = 0; n__ < nr; ++n__) \
+ xen_destroy_contiguous_region( \
+ (unsigned long)page_address((pages)[n__]), 0); \
+ /* only a fallback: xen_destroy_contiguous_region uses PAGE_KERNEL */ \
+ set_pages_array_wb(pages, nr); \
+})
+
+/*
+ * Could use CLFLUSH here if the CPU supports it. But then it would
+ * need to be called for each cacheline of the whole page, so it may
+ * not be worth it. Would need a page for it.
+ */
+#define flush_agp_cache() wbinvd()
+
+#define virt_to_gart virt_to_machine
+
+/* GATT allocation. Returns/accepts GATT kernel virtual address. */
+#define alloc_gatt_pages(order) ({ \
+ char *_t; dma_addr_t _d; \
+ _t = dma_alloc_coherent(NULL,PAGE_SIZE<<(order),&_d,GFP_KERNEL); \
+ _t; })
+#define free_gatt_pages(table, order) \
+ dma_free_coherent(NULL,PAGE_SIZE<<(order),(table),virt_to_bus(table))
+
+#endif /* _ASM_X86_AGP_H */
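Unlike the native asm/agp.h, map_page_into_agp() here can fail, since xen_create_contiguous_region() may be unable to exchange the page for a machine-contiguous one, so callers have to check its result. A minimal caller sketch under that assumption; the function name is illustrative:

/* Illustrative sketch, not in-tree code: map one page for AGP use and
 * propagate the error instead of assuming success, since under Xen the
 * backing page may first need to be exchanged with the hypervisor. */
static int example_agp_map_one(struct page *page)
{
	int rc = map_page_into_agp(page);

	if (rc)
		return rc;	/* page is left in its original state */

	/* ... program the GATT with virt_to_gart(page_address(page)) ... */
	return 0;
}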
diff --git a/arch/x86/include/mach-xen/asm/cmpxchg_32.h b/arch/x86/include/mach-xen/asm/cmpxchg_32.h
new file mode 100644
index 000000000000..97a16fd06efe
--- /dev/null
+++ b/arch/x86/include/mach-xen/asm/cmpxchg_32.h
@@ -0,0 +1,26 @@
+#ifndef _ASM_X86_XEN_CMPXCHG_32_H
+#define _ASM_X86_XEN_CMPXCHG_32_H
+
+#include_next <asm/cmpxchg_32.h>
+
+static inline u64 get_64bit(const volatile u64 *ptr)
+{
+ u64 res;
+ __asm__("movl %%ebx,%%eax\n"
+ "movl %%ecx,%%edx\n"
+ LOCK_PREFIX "cmpxchg8b %1"
+ : "=&A" (res) : "m" (*ptr));
+ return res;
+}
+
+static inline u64 get_64bit_local(const volatile u64 *ptr)
+{
+ u64 res;
+ __asm__("movl %%ebx,%%eax\n"
+ "movl %%ecx,%%edx\n"
+ "cmpxchg8b %1"
+ : "=&A" (res) : "m" (*ptr));
+ return res;
+}
+
+#endif /* _ASM_X86_XEN_CMPXCHG_32_H */
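get_64bit() uses a classic cmpxchg8b idiom for an atomic 64-bit read on 32-bit x86: edx:eax is preloaded with whatever ecx:ebx happen to hold, so the compare either succeeds and rewrites the identical value or fails and fetches the current value into edx:eax; both paths leave memory unchanged and return the full quadword atomically. The 64-bit counterpart in the next hunk is a plain load, since naturally aligned 64-bit loads are atomic there. A hypothetical usage sketch:

/* Hypothetical sketch: sample a 64-bit counter that other CPUs update
 * concurrently. On a 32-bit kernel a plain load would be two
 * instructions and could observe a torn value; get_64bit() cannot. */
static u64 example_sample_counter(const volatile u64 *counter)
{
	return get_64bit(counter);	/* single LOCK cmpxchg8b */
}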
diff --git a/arch/x86/include/mach-xen/asm/cmpxchg_64.h b/arch/x86/include/mach-xen/asm/cmpxchg_64.h
new file mode 100644
index 000000000000..ef61aa9abde8
--- /dev/null
+++ b/arch/x86/include/mach-xen/asm/cmpxchg_64.h
@@ -0,0 +1,13 @@
+#ifndef _ASM_X86_XEN_CMPXCHG_64_H
+#define _ASM_X86_XEN_CMPXCHG_64_H
+
+#include_next <asm/cmpxchg_64.h>
+
+static inline u64 get_64bit(const volatile u64 *ptr)
+{
+ return *ptr;
+}
+
+#define get_64bit_local get_64bit
+
+#endif /* _ASM_X86_XEN_CMPXCHG_64_H */
diff --git a/arch/x86/include/mach-xen/asm/desc.h b/arch/x86/include/mach-xen/asm/desc.h
new file mode 100644
index 000000000000..3461903f8455
--- /dev/null
+++ b/arch/x86/include/mach-xen/asm/desc.h
@@ -0,0 +1,434 @@
+#ifndef _ASM_X86_DESC_H
+#define _ASM_X86_DESC_H
+
+#include <asm/desc_defs.h>
+#include <asm/ldt.h>
+#include <asm/mmu.h>
+
+#include <linux/smp.h>
+#include <linux/percpu.h>
+
+static inline void fill_ldt(struct desc_struct *desc, const struct user_desc *info)
+{
+ desc->limit0 = info->limit & 0x0ffff;
+
+ desc->base0 = (info->base_addr & 0x0000ffff);
+ desc->base1 = (info->base_addr & 0x00ff0000) >> 16;
+
+ desc->type = (info->read_exec_only ^ 1) << 1;
+ desc->type |= info->contents << 2;
+
+ desc->s = 1;
+ desc->dpl = 0x3;
+ desc->p = info->seg_not_present ^ 1;
+ desc->limit = (info->limit & 0xf0000) >> 16;
+ desc->avl = info->useable;
+ desc->d = info->seg_32bit;
+ desc->g = info->limit_in_pages;
+
+ desc->base2 = (info->base_addr & 0xff000000) >> 24;
+ /*
+ * Don't allow setting of the lm bit. It would confuse
+ * user_64bit_mode and would get overridden by sysret anyway.
+ */
+ desc->l = 0;
+}
+
+#ifndef CONFIG_X86_NO_IDT
+extern struct desc_ptr idt_descr;
+extern gate_desc idt_table[];
+extern struct desc_ptr nmi_idt_descr;
+extern gate_desc nmi_idt_table[];
+#endif
+
+struct gdt_page {
+ struct desc_struct gdt[GDT_ENTRIES];
+} __attribute__((aligned(PAGE_SIZE)));
+
+DECLARE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page);
+
+static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu)
+{
+ return per_cpu(gdt_page, cpu).gdt;
+}
+
+#ifdef CONFIG_X86_64
+
+static inline void pack_gate(gate_desc *gate, unsigned type, unsigned long func,
+ unsigned dpl, unsigned ist, unsigned seg)
+{
+ gate->offset_low = PTR_LOW(func);
+ gate->segment = __KERNEL_CS;
+ gate->ist = ist;
+ gate->p = 1;
+ gate->dpl = dpl;
+ gate->zero0 = 0;
+ gate->zero1 = 0;
+ gate->type = type;
+ gate->offset_middle = PTR_MIDDLE(func);
+ gate->offset_high = PTR_HIGH(func);
+}
+
+#else
+static inline void pack_gate(gate_desc *gate, unsigned char type,
+ unsigned long base, unsigned dpl, unsigned flags,
+ unsigned short seg)
+{
+ gate->a = (seg << 16) | (base & 0xffff);
+ gate->b = (base & 0xffff0000) | (((0x80 | type | (dpl << 5)) & 0xff) << 8);
+}
+
+#endif
+
+static inline int desc_empty(const void *ptr)
+{
+ const u32 *desc = ptr;
+
+ return !(desc[0] | desc[1]);
+}
+
+#ifndef CONFIG_XEN
+#define load_TR_desc() native_load_tr_desc()
+#define load_gdt(dtr) native_load_gdt(dtr)
+#define load_idt(dtr) native_load_idt(dtr)
+#define load_tr(tr) asm volatile("ltr %0"::"m" (tr))
+#define load_ldt(ldt) asm volatile("lldt %0"::"m" (ldt))
+
+#define store_gdt(dtr) native_store_gdt(dtr)
+#define store_idt(dtr) native_store_idt(dtr)
+#define store_tr(tr) (tr = native_store_tr())
+
+#define load_TLS(t, cpu) native_load_tls(t, cpu)
+#define set_ldt native_set_ldt
+
+#define write_ldt_entry(dt, entry, desc) native_write_ldt_entry(dt, entry, desc)
+#define write_gdt_entry(dt, entry, desc, type) native_write_gdt_entry(dt, entry, desc, type)
+#define write_idt_entry(dt, entry, g) native_write_idt_entry(dt, entry, g)
+
+static inline void paravirt_alloc_ldt(struct desc_struct *ldt, unsigned entries)
+{
+}
+
+static inline void paravirt_free_ldt(struct desc_struct *ldt, unsigned entries)
+{
+}
+
+#define store_ldt(ldt) asm("sldt %0" : "=m"(ldt))
+
+static inline void native_write_idt_entry(gate_desc *idt, int entry, const gate_desc *gate)
+{
+ memcpy(&idt[entry], gate, sizeof(*gate));
+}
+
+static inline void native_write_ldt_entry(struct desc_struct *ldt, int entry, const void *desc)
+{
+ memcpy(&ldt[entry], desc, 8);
+}
+
+static inline void
+native_write_gdt_entry(struct desc_struct *gdt, int entry, const void *desc, int type)
+{
+ unsigned int size;
+
+ switch (type) {
+ case DESC_TSS: size = sizeof(tss_desc); break;
+ case DESC_LDT: size = sizeof(ldt_desc); break;
+ default: size = sizeof(*gdt); break;
+ }
+
+ memcpy(&gdt[entry], desc, size);
+}
+#endif
+
+static inline void pack_descriptor(struct desc_struct *desc, unsigned long base,
+ unsigned long limit, unsigned char type,
+ unsigned char flags)
+{
+ desc->a = ((base & 0xffff) << 16) | (limit & 0xffff);
+ desc->b = (base & 0xff000000) | ((base & 0xff0000) >> 16) |
+ (limit & 0x000f0000) | ((type & 0xff) << 8) |
+ ((flags & 0xf) << 20);
+ desc->p = 1;
+}
+
+
+#ifndef CONFIG_XEN
+static inline void set_tssldt_descriptor(void *d, unsigned long addr, unsigned type, unsigned size)
+{
+#ifdef CONFIG_X86_64
+ struct ldttss_desc64 *desc = d;
+
+ memset(desc, 0, sizeof(*desc));
+
+ desc->limit0 = size & 0xFFFF;
+ desc->base0 = PTR_LOW(addr);
+ desc->base1 = PTR_MIDDLE(addr) & 0xFF;
+ desc->type = type;
+ desc->p = 1;
+ desc->limit1 = (size >> 16) & 0xF;
+ desc->base2 = (PTR_MIDDLE(addr) >> 8) & 0xFF;
+ desc->base3 = PTR_HIGH(addr);
+#else
+ pack_descriptor((struct desc_struct *)d, addr, size, 0x80 | type, 0);
+#endif
+}
+
+static inline void __set_tss_desc(unsigned cpu, unsigned int entry, void *addr)
+{
+ struct desc_struct *d = get_cpu_gdt_table(cpu);
+ tss_desc tss;
+
+ /*
+ * sizeof(unsigned long) coming from an extra "long" at the end
+ * of the iobitmap. See tss_struct definition in processor.h
+ *
+ * -1 because the segment base+limit should point to the address
+ * of the last valid byte.
+ */
+ set_tssldt_descriptor(&tss, (unsigned long)addr, DESC_TSS,
+ IO_BITMAP_OFFSET + IO_BITMAP_BYTES +
+ sizeof(unsigned long) - 1);
+ write_gdt_entry(d, entry, &tss, DESC_TSS);
+}
+
+#define set_tss_desc(cpu, addr) __set_tss_desc(cpu, GDT_ENTRY_TSS, addr)
+
+static inline void native_set_ldt(const void *addr, unsigned int entries)
+{
+ if (likely(entries == 0))
+ asm volatile("lldt %w0"::"q" (0));
+ else {
+ unsigned cpu = smp_processor_id();
+ ldt_desc ldt;
+
+ set_tssldt_descriptor(&ldt, (unsigned long)addr, DESC_LDT,
+ entries * LDT_ENTRY_SIZE - 1);
+ write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_LDT,
+ &ldt, DESC_LDT);
+ asm volatile("lldt %w0"::"q" (GDT_ENTRY_LDT*8));
+ }
+}
+
+static inline void native_load_tr_desc(void)
+{
+ asm volatile("ltr %w0"::"q" (GDT_ENTRY_TSS*8));
+}
+
+static inline void native_load_gdt(const struct desc_ptr *dtr)
+{
+ asm volatile("lgdt %0"::"m" (*dtr));
+}
+
+static inline void native_load_idt(const struct desc_ptr *dtr)
+{
+ asm volatile("lidt %0"::"m" (*dtr));
+}
+
+static inline void native_store_gdt(struct desc_ptr *dtr)
+{
+ asm volatile("sgdt %0":"=m" (*dtr));
+}
+
+static inline void native_store_idt(struct desc_ptr *dtr)
+{
+ asm volatile("sidt %0":"=m" (*dtr));
+}
+
+static inline unsigned long native_store_tr(void)
+{
+ unsigned long tr;
+
+ asm volatile("str %0":"=r" (tr));
+
+ return tr;
+}
+
+static inline void native_load_tls(struct thread_struct *t, unsigned int cpu)
+{
+ struct desc_struct *gdt = get_cpu_gdt_table(cpu);
+ unsigned int i;
+
+ for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++)
+ gdt[GDT_ENTRY_TLS_MIN + i] = t->tls_array[i];
+}
+#else
+#include <asm/pgtable.h>
+
+#define load_TLS(t, cpu) xen_load_tls(t, cpu)
+#define set_ldt xen_set_ldt
+
+extern int write_ldt_entry(struct desc_struct *ldt, int entry,
+ const void *desc);
+extern int write_gdt_entry(struct desc_struct *gdt, int entry,
+ const void *desc, int type);
+
+static inline void xen_load_tls(struct thread_struct *t, unsigned int cpu)
+{
+ unsigned int i;
+ struct desc_struct *gdt = get_cpu_gdt_table(cpu) + GDT_ENTRY_TLS_MIN;
+
+ for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++)
+ if (HYPERVISOR_update_descriptor(
+ arbitrary_virt_to_machine(&gdt[i]),
+ *(u64 *)&t->tls_array[i]))
+ BUG();
+}
+#endif
+
+#define _LDT_empty(info) \
+ ((info)->base_addr == 0 && \
+ (info)->limit == 0 && \
+ (info)->contents == 0 && \
+ (info)->read_exec_only == 1 && \
+ (info)->seg_32bit == 0 && \
+ (info)->limit_in_pages == 0 && \
+ (info)->seg_not_present == 1 && \
+ (info)->useable == 0)
+
+#ifdef CONFIG_X86_64
+#define LDT_empty(info) (_LDT_empty(info) && ((info)->lm == 0))
+#else
+#define LDT_empty(info) (_LDT_empty(info))
+#endif
+
+static inline void clear_LDT(void)
+{
+ set_ldt(NULL, 0);
+}
+
+/*
+ * load one particular LDT into the current CPU
+ */
+static inline void load_LDT_nolock(mm_context_t *pc)
+{
+ set_ldt(pc->ldt, pc->size);
+}
+
+static inline void load_LDT(mm_context_t *pc)
+{
+ preempt_disable();
+ load_LDT_nolock(pc);
+ preempt_enable();
+}
+
+static inline unsigned long get_desc_base(const struct desc_struct *desc)
+{
+ return (unsigned)(desc->base0 | ((desc->base1) << 16) | ((desc->base2) << 24));
+}
+
+static inline void set_desc_base(struct desc_struct *desc, unsigned long base)
+{
+ desc->base0 = base & 0xffff;
+ desc->base1 = (base >> 16) & 0xff;
+ desc->base2 = (base >> 24) & 0xff;
+}
+
+static inline unsigned long get_desc_limit(const struct desc_struct *desc)
+{
+ return desc->limit0 | (desc->limit << 16);
+}
+
+static inline void set_desc_limit(struct desc_struct *desc, unsigned long limit)
+{
+ desc->limit0 = limit & 0xffff;
+ desc->limit = (limit >> 16) & 0xf;
+}
+
+#ifndef CONFIG_X86_NO_IDT
+#ifdef CONFIG_X86_64
+static inline void set_nmi_gate(int gate, void *addr)
+{
+ gate_desc s;
+
+ pack_gate(&s, GATE_INTERRUPT, (unsigned long)addr, 0, 0, __KERNEL_CS);
+ write_idt_entry(nmi_idt_table, gate, &s);
+}
+#endif
+
+static inline void _set_gate(int gate, unsigned type, void *addr,
+ unsigned dpl, unsigned ist, unsigned seg)
+{
+ gate_desc s;
+
+ pack_gate(&s, type, (unsigned long)addr, dpl, ist, seg);
+ /*
+ * does not need to be atomic because it is only done once at
+ * setup time
+ */
+ write_idt_entry(idt_table, gate, &s);
+}
+
+/*
+ * This needs to use 'idt_table' rather than 'idt', and
+ * thus use the _nonmapped_ version of the IDT, as the
+ * Pentium F0 0F bugfix may have resulted in the mapped
+ * IDT being write-protected.
+ */
+static inline void set_intr_gate(unsigned int n, void *addr)
+{
+ BUG_ON((unsigned)n > 0xFF);
+ _set_gate(n, GATE_INTERRUPT, addr, 0, 0, __KERNEL_CS);
+}
+
+extern int first_system_vector;
+/* used_vectors is a bitmap of IRQs not managed by the percpu vector_irq */
+extern unsigned long used_vectors[];
+
+static inline void alloc_system_vector(int vector)
+{
+ if (!test_bit(vector, used_vectors)) {
+ set_bit(vector, used_vectors);
+ if (first_system_vector > vector)
+ first_system_vector = vector;
+ } else {
+ BUG();
+ }
+}
+
+static inline void alloc_intr_gate(unsigned int n, void *addr)
+{
+ alloc_system_vector(n);
+ set_intr_gate(n, addr);
+}
+
+/*
+ * This routine sets up an interrupt gate at descriptor privilege level 3.
+ */
+static inline void set_system_intr_gate(unsigned int n, void *addr)
+{
+ BUG_ON((unsigned)n > 0xFF);
+ _set_gate(n, GATE_INTERRUPT, addr, 0x3, 0, __KERNEL_CS);
+}
+
+static inline void set_system_trap_gate(unsigned int n, void *addr)
+{
+ BUG_ON((unsigned)n > 0xFF);
+ _set_gate(n, GATE_TRAP, addr, 0x3, 0, __KERNEL_CS);
+}
+
+static inline void set_trap_gate(unsigned int n, void *addr)
+{
+ BUG_ON((unsigned)n > 0xFF);
+ _set_gate(n, GATE_TRAP, addr, 0, 0, __KERNEL_CS);
+}
+
+static inline void set_task_gate(unsigned int n, unsigned int gdt_entry)
+{
+ BUG_ON((unsigned)n > 0xFF);
+ _set_gate(n, GATE_TASK, (void *)0, 0, 0, (gdt_entry<<3));
+}
+
+static inline void set_intr_gate_ist(int n, void *addr, unsigned ist)
+{
+ BUG_ON((unsigned)n > 0xFF);
+ _set_gate(n, GATE_INTERRUPT, addr, 0, ist, __KERNEL_CS);
+}
+
+static inline void set_system_intr_gate_ist(int n, void *addr, unsigned ist)
+{
+ BUG_ON((unsigned)n > 0xFF);
+ _set_gate(n, GATE_INTERRUPT, addr, 0x3, ist, __KERNEL_CS);
+}
+#endif
+
+#endif /* _ASM_X86_DESC_H */
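On Xen, write_gdt_entry()/write_ldt_entry() are out-of-line and return an error code because pinned descriptor-table pages are mapped read-only; updates must trap to the hypervisor. A sketch of installing one TLS descriptor along the same lines as xen_load_tls() above (the function name is illustrative):

/* Illustrative sketch mirroring xen_load_tls(): pack a descriptor with
 * fill_ldt() and install it via the hypervisor, because the pinned GDT
 * page cannot be written directly. */
static int example_install_tls(unsigned int cpu, unsigned int idx,
			       const struct user_desc *info)
{
	struct desc_struct *gdt = get_cpu_gdt_table(cpu);
	struct desc_struct d;

	fill_ldt(&d, info);
	return HYPERVISOR_update_descriptor(
		arbitrary_virt_to_machine(&gdt[GDT_ENTRY_TLS_MIN + idx]),
		*(u64 *)&d);
}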
diff --git a/arch/x86/include/mach-xen/asm/dma-mapping.h b/arch/x86/include/mach-xen/asm/dma-mapping.h
new file mode 100644
index 000000000000..cffca1854022
--- /dev/null
+++ b/arch/x86/include/mach-xen/asm/dma-mapping.h
@@ -0,0 +1,23 @@
+#ifndef _ASM_X86_DMA_MAPPING_H_
+
+#define phys_to_dma _phys_to_dma_
+#define dma_to_phys _dma_to_phys_
+
+#include_next <asm/dma-mapping.h>
+
+#undef phys_to_dma
+#undef dma_to_phys
+
+static inline dma_addr_t phys_to_dma(struct device *dev, phys_addr_t paddr)
+{
+ return phys_to_machine(paddr);
+}
+
+static inline phys_addr_t dma_to_phys(struct device *dev, dma_addr_t daddr)
+{
+ return machine_to_phys(daddr);
+}
+
+extern int range_straddles_page_boundary(paddr_t p, size_t size);
+
+#endif /* _ASM_X86_DMA_MAPPING_H_ */
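The #define/include_next dance above hides the generic phys_to_dma()/dma_to_phys() and substitutes translations through the phys-to-machine map: under Xen a guest-physical address is not what a device must be programmed with. A sketch of what this means for a driver (names illustrative):

/* Illustrative sketch: the address written to a device's DMA registers
 * must be the machine (bus) address, which phys_to_dma() derives from
 * the pfn-to-mfn mapping, not the guest-physical address itself. */
static void example_program_dma(struct device *dev, void *buf)
{
	phys_addr_t phys = virt_to_phys(buf);
	dma_addr_t bus = phys_to_dma(dev, phys);

	/* ... hand 'bus' (not 'phys') to the device ... */
	(void)bus;
}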
diff --git a/arch/x86/include/mach-xen/asm/fixmap.h b/arch/x86/include/mach-xen/asm/fixmap.h
new file mode 100644
index 000000000000..2faee6acf0b3
--- /dev/null
+++ b/arch/x86/include/mach-xen/asm/fixmap.h
@@ -0,0 +1,242 @@
+/*
+ * fixmap.h: compile-time virtual memory allocation
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * Copyright (C) 1998 Ingo Molnar
+ *
+ * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
+ * x86_32 and x86_64 integration by Gustavo F. Padovan, February 2009
+ */
+
+#ifndef _ASM_X86_FIXMAP_H
+#define _ASM_X86_FIXMAP_H
+
+#ifndef __ASSEMBLY__
+#include <linux/kernel.h>
+#include <asm/acpi.h>
+#include <asm/page.h>
+#ifdef CONFIG_X86_32
+#include <linux/threads.h>
+#include <asm/kmap_types.h>
+#else
+#include <asm/pvclock.h>
+#include <asm/vsyscall.h>
+#endif
+
+/*
+ * We can't declare FIXADDR_TOP as a variable for x86_64 because vsyscall
+ * uses fixmaps that rely on FIXADDR_TOP for proper address calculation.
+ * Because of this, FIXADDR_TOP x86 integration was left as later work.
+ */
+#ifdef CONFIG_X86_32
+/* used by vmalloc.c, vsyscall.lds.S.
+ *
+ * Leave one empty page between vmalloc'ed areas and
+ * the start of the fixmap.
+ */
+extern unsigned long __FIXADDR_TOP;
+#define FIXADDR_TOP ((unsigned long)__FIXADDR_TOP)
+
+#define FIXADDR_USER_START __fix_to_virt(FIX_VDSO)
+#define FIXADDR_USER_END __fix_to_virt(FIX_VDSO - 1)
+#else
+#define FIXADDR_TOP (VSYSCALL_END-PAGE_SIZE)
+
+/* Only covers 32bit vsyscalls currently. Need another set for 64bit. */
+#define FIXADDR_USER_START ((unsigned long)VSYSCALL32_VSYSCALL)
+#define FIXADDR_USER_END (FIXADDR_USER_START + PAGE_SIZE)
+#endif
+
+
+/*
+ * Here we define all the compile-time 'special' virtual
+ * addresses. The point is to have a constant address at
+ * compile time, but to set the physical address only
+ * in the boot process.
+ * For x86_32: we allocate these special addresses
+ * from the end of virtual memory (0xfffff000) backwards.
+ * This also lets us do fail-safe vmalloc(): we
+ * can guarantee that these special addresses and
+ * vmalloc()-ed addresses never overlap.
+ *
+ * These 'compile-time allocated' memory buffers are
+ * fixed-size 4k pages (or larger if used with an increment
+ * higher than 1). Use set_fixmap(idx,phys) to associate
+ * physical memory with fixmap indices.
+ *
+ * TLB entries of such buffers will not be flushed across
+ * task switches.
+ */
+enum fixed_addresses {
+#ifdef CONFIG_X86_32
+ FIX_HOLE,
+ FIX_VDSO,
+#else
+ VSYSCALL_LAST_PAGE,
+ VSYSCALL_FIRST_PAGE = VSYSCALL_LAST_PAGE
+ + ((VSYSCALL_END-VSYSCALL_START) >> PAGE_SHIFT) - 1,
+ VVAR_PAGE,
+ VSYSCALL_HPET,
+#endif
+#ifdef CONFIG_PARAVIRT_CLOCK
+ PVCLOCK_FIXMAP_BEGIN,
+ PVCLOCK_FIXMAP_END = PVCLOCK_FIXMAP_BEGIN+PVCLOCK_VSYSCALL_NR_PAGES-1,
+#endif
+ FIX_DBGP_BASE,
+ FIX_EARLYCON_MEM_BASE,
+#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
+ FIX_OHCI1394_BASE,
+#endif
+#ifndef CONFIG_XEN
+#ifdef CONFIG_X86_LOCAL_APIC
+ FIX_APIC_BASE, /* local (CPU) APIC -- required for SMP or not */
+#endif
+#ifdef CONFIG_X86_IO_APIC
+ FIX_IO_APIC_BASE_0,
+ FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS - 1,
+#endif
+#else
+ FIX_SHARED_INFO,
+#define NR_FIX_ISAMAPS 256
+ FIX_ISAMAP_END,
+ FIX_ISAMAP_BEGIN = FIX_ISAMAP_END + NR_FIX_ISAMAPS - 1,
+#endif
+#ifdef CONFIG_X86_VISWS_APIC
+ FIX_CO_CPU, /* Cobalt timer */
+ FIX_CO_APIC, /* Cobalt APIC Redirection Table */
+ FIX_LI_PCIA, /* Lithium PCI Bridge A */
+ FIX_LI_PCIB, /* Lithium PCI Bridge B */
+#endif
+#ifndef CONFIG_X86_NO_IDT
+ FIX_RO_IDT, /* Virtual mapping for read-only IDT */
+#endif
+#ifdef CONFIG_X86_32
+ FIX_KMAP_BEGIN, /* reserved pte's for temporary kernel mappings */
+ FIX_KMAP_END = FIX_KMAP_BEGIN+(KM_TYPE_NR*NR_CPUS)-1,
+#ifdef CONFIG_PCI_MMCONFIG
+ FIX_PCIE_MCFG,
+#endif
+#endif
+#ifdef CONFIG_PARAVIRT
+ FIX_PARAVIRT_BOOTMAP,
+#endif
+ FIX_TEXT_POKE1, /* reserve 2 pages for text_poke() */
+ FIX_TEXT_POKE0, /* first page is last, because allocation is backward */
+#ifdef CONFIG_X86_INTEL_MID
+ FIX_LNW_VRTC,
+#endif
+ __end_of_permanent_fixed_addresses,
+
+ /*
+ * 256 temporary boot-time mappings, used by early_ioremap(),
+ * before ioremap() is functional.
+ *
+ * If necessary we round it up to the next 256-page boundary so
+ * that we can have a single pgd entry and a single pte table:
+ */
+#define NR_FIX_BTMAPS 64
+#define FIX_BTMAPS_SLOTS 4
+#define TOTAL_FIX_BTMAPS (NR_FIX_BTMAPS * FIX_BTMAPS_SLOTS)
+ FIX_BTMAP_END =
+ (__end_of_permanent_fixed_addresses ^
+ (__end_of_permanent_fixed_addresses + TOTAL_FIX_BTMAPS - 1)) &
+ -PTRS_PER_PTE
+ ? __end_of_permanent_fixed_addresses + TOTAL_FIX_BTMAPS -
+ (__end_of_permanent_fixed_addresses & (TOTAL_FIX_BTMAPS - 1))
+ : __end_of_permanent_fixed_addresses,
+ FIX_BTMAP_BEGIN = FIX_BTMAP_END + TOTAL_FIX_BTMAPS - 1,
+#ifdef CONFIG_X86_32
+ FIX_WP_TEST,
+#endif
+#ifdef CONFIG_INTEL_TXT
+ FIX_TBOOT_BASE,
+#endif
+ __end_of_fixed_addresses
+};
+
+
+extern void reserve_top_address(unsigned long reserve);
+
+#define FIXADDR_SIZE (__end_of_permanent_fixed_addresses << PAGE_SHIFT)
+#define FIXADDR_BOOT_SIZE (__end_of_fixed_addresses << PAGE_SHIFT)
+#define FIXADDR_START (FIXADDR_TOP - FIXADDR_SIZE)
+#define FIXADDR_BOOT_START (FIXADDR_TOP - FIXADDR_BOOT_SIZE)
+
+extern int fixmaps_set;
+
+extern pte_t *kmap_pte;
+extern pgprot_t kmap_prot;
+extern pte_t *pkmap_page_table;
+
+void xen_set_fixmap(enum fixed_addresses, phys_addr_t, pgprot_t);
+
+static inline void __set_fixmap(enum fixed_addresses idx,
+ phys_addr_t phys, pgprot_t flags)
+{
+ xen_set_fixmap(idx, phys, flags);
+}
+
+#define set_fixmap(idx, phys) \
+ __set_fixmap(idx, phys, PAGE_KERNEL)
+
+/*
+ * Some hardware wants to get fixmapped without caching.
+ */
+#define set_fixmap_nocache(idx, phys) \
+ __set_fixmap(idx, phys, PAGE_KERNEL_NOCACHE)
+
+#define clear_fixmap(idx) \
+ __set_fixmap(idx, 0, __pgprot(0))
+
+#define __fix_to_virt(x) (FIXADDR_TOP - ((x) << PAGE_SHIFT))
+#define __virt_to_fix(x) ((FIXADDR_TOP - ((x)&PAGE_MASK)) >> PAGE_SHIFT)
+
+extern void __this_fixmap_does_not_exist(void);
+
+/*
+ * 'index to address' translation. If anyone tries to use the idx
+ * directly without translation, we catch the bug with a NULL-dereference
+ * kernel oops. Illegal ranges of incoming indices are caught too.
+ */
+static __always_inline unsigned long fix_to_virt(const unsigned int idx)
+{
+ /*
+ * this branch gets completely eliminated after inlining,
+ * except when someone tries to use fixaddr indices in an
+ * illegal way (such as mixing up address types or using
+ * out-of-range indices).
+ *
+ * If it doesn't get removed, the linker will complain
+ * loudly with a reasonably clear error message.
+ */
+ if (idx >= __end_of_fixed_addresses)
+ __this_fixmap_does_not_exist();
+
+ return __fix_to_virt(idx);
+}
+
+static inline unsigned long virt_to_fix(const unsigned long vaddr)
+{
+ BUG_ON(vaddr >= FIXADDR_TOP || vaddr < FIXADDR_START);
+ return __virt_to_fix(vaddr);
+}
+
+/* Return a pointer with the offset applied */
+static __always_inline unsigned long
+__set_fixmap_offset(enum fixed_addresses idx, phys_addr_t phys, pgprot_t flags)
+{
+ __set_fixmap(idx, phys, flags);
+ return fix_to_virt(idx) + (phys & (PAGE_SIZE - 1));
+}
+
+#define set_fixmap_offset(idx, phys) \
+ __set_fixmap_offset(idx, phys, PAGE_KERNEL)
+
+#define set_fixmap_offset_nocache(idx, phys) \
+ __set_fixmap_offset(idx, phys, PAGE_KERNEL_NOCACHE)
+
+#endif /* !__ASSEMBLY__ */
+#endif /* _ASM_X86_FIXMAP_H */
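Since __set_fixmap() funnels into xen_set_fixmap() here, every fixmap update is hypervisor-aware. A short sketch of the usual consumption pattern via set_fixmap_offset_nocache(); FIX_EXAMPLE is a hypothetical slot a real user would add to enum fixed_addresses:

/* Hypothetical sketch: map a (possibly unaligned) register block at a
 * compile-time fixmap slot. FIX_EXAMPLE is illustrative only. */
static void __iomem *example_map_regs(phys_addr_t phys)
{
	/* xen_set_fixmap() rewrites the pte; the returned address
	 * already includes phys's offset within the page. */
	unsigned long va = set_fixmap_offset_nocache(FIX_EXAMPLE, phys);

	return (void __iomem *)va;
}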
diff --git a/arch/x86/include/mach-xen/asm/fpu-internal.h b/arch/x86/include/mach-xen/asm/fpu-internal.h
new file mode 100644
index 000000000000..86475e8d79b5
--- /dev/null
+++ b/arch/x86/include/mach-xen/asm/fpu-internal.h
@@ -0,0 +1,66 @@
+#ifndef _FPU_INTERNAL_H
+#include <asm/i387.h>
+#define switch_fpu_prepare native_switch_fpu_prepare
+#include_next <asm/fpu-internal.h>
+#undef switch_fpu_prepare
+
+static inline bool xen_thread_fpu_begin(struct task_struct *tsk,
+ multicall_entry_t *mcl)
+{
+ bool ret = false;
+
+ if (mcl && !use_eager_fpu()) {
+ mcl->op = __HYPERVISOR_fpu_taskswitch;
+ mcl->args[0] = 0;
+ ret = true;
+ }
+ __thread_set_has_fpu(tsk);
+
+ return ret;
+}
+
+static inline fpu_switch_t xen_switch_fpu_prepare(struct task_struct *old,
+ struct task_struct *new,
+ int cpu,
+ multicall_entry_t **mcl)
+{
+ fpu_switch_t fpu;
+
+ /*
+ * If the task has used the FPU, pre-load it on xsave-capable processors,
+ * or if the past 5 consecutive context switches used it.
+ */
+ fpu.preload = tsk_used_math(new) && (use_eager_fpu() ||
+ new->fpu_counter > 5);
+ if (__thread_has_fpu(old)) {
+ if (!__save_init_fpu(old))
+ cpu = ~0;
+ old->thread.fpu.last_cpu = cpu;
+ old->thread.fpu.has_fpu = 0; /* But leave fpu_owner_task! */
+
+ /* Don't change CR0.TS if we just switch! */
+ if (fpu.preload) {
+ new->fpu_counter++;
+ __thread_set_has_fpu(new);
+ prefetch(new->thread.fpu.state);
+ } else if (!use_eager_fpu()) {
+ (*mcl)->op = __HYPERVISOR_fpu_taskswitch;
+ (*mcl)++->args[0] = 1;
+ }
+ } else {
+ old->fpu_counter = 0;
+ old->thread.fpu.last_cpu = ~0;
+ if (fpu.preload) {
+ new->fpu_counter++;
+ if (!use_eager_fpu() && fpu_lazy_restore(new, cpu))
+ fpu.preload = 0;
+ else
+ prefetch(new->thread.fpu.state);
+ if (xen_thread_fpu_begin(new, *mcl))
+ ++*mcl;
+ }
+ }
+ return fpu;
+}
+
+#endif
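xen_switch_fpu_prepare() replaces CR0.TS manipulation (clts/stts) with queued fpu_taskswitch multicall entries, advancing the caller's cursor for each entry it emits. A hedged sketch of how a context-switch path might consume it; the surrounding names are illustrative:

/* Illustrative sketch only: the helper appends at most one multicall
 * entry and bumps the cursor; the caller batches whatever was queued
 * into a single hypervisor trap. */
static fpu_switch_t example_fpu_switch(struct task_struct *prev,
				       struct task_struct *next, int cpu)
{
	multicall_entry_t mcl[2], *cur = mcl;
	fpu_switch_t fpu = xen_switch_fpu_prepare(prev, next, cpu, &cur);

	if (cur != mcl && HYPERVISOR_multicall(mcl, cur - mcl))
		BUG();
	return fpu;
}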
diff --git a/arch/x86/include/mach-xen/asm/gnttab_dma.h b/arch/x86/include/mach-xen/asm/gnttab_dma.h
new file mode 100644
index 000000000000..b2b38d9e325a
--- /dev/null
+++ b/arch/x86/include/mach-xen/asm/gnttab_dma.h
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2007 Herbert Xu <herbert@gondor.apana.org.au>
+ * Copyright (c) 2007 Isaku Yamahata <yamahata at valinux co jp>
+ * VA Linux Systems Japan K.K.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#ifndef _ASM_I386_GNTTAB_DMA_H
+#define _ASM_I386_GNTTAB_DMA_H
+
+#include <asm/bug.h>
+
+static inline int gnttab_dma_local_pfn(struct page *page)
+{
+ /* Has it become a local MFN? */
+ return pfn_valid(mfn_to_local_pfn(pfn_to_mfn(page_to_pfn(page))));
+}
+
+static inline maddr_t gnttab_dma_map_page(struct page *page,
+ unsigned long offset)
+{
+ unsigned int pgnr = offset >> PAGE_SHIFT;
+ unsigned int order = compound_order(page);
+
+ BUG_ON(pgnr >> order);
+ __gnttab_dma_map_page(page);
+ return ((maddr_t)pfn_to_mfn(page_to_pfn(page) + pgnr) << PAGE_SHIFT)
+ + (offset & ~PAGE_MASK);
+}
+
+static inline void gnttab_dma_unmap_page(maddr_t maddr)
+{
+ __gnttab_dma_unmap_page(virt_to_page(bus_to_virt(maddr)));
+}
+
+#endif /* _ASM_I386_GNTTAB_DMA_H */
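gnttab_dma_map_page() copes with compound pages: offset may exceed PAGE_SIZE, in which case pgnr selects the constituent page, and the BUG_ON() rejects offsets beyond the compound order. A trivial usage sketch (names hypothetical):

/* Hypothetical sketch: obtain the machine address for one I/O segment.
 * The helper takes a grant-table DMA reference on the page and returns
 * the machine address of (page + offset), where offset may reach into
 * the tail pages of a compound page. */
static maddr_t example_segment_maddr(struct page *page, unsigned long offset)
{
	return gnttab_dma_map_page(page, offset);
}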
diff --git a/arch/x86/include/mach-xen/asm/highmem.h b/arch/x86/include/mach-xen/asm/highmem.h
new file mode 100644
index 000000000000..0b43fd4e5f2f
--- /dev/null
+++ b/arch/x86/include/mach-xen/asm/highmem.h
@@ -0,0 +1,98 @@
+/*
+ * highmem.h: virtual kernel memory mappings for high memory
+ *
+ * Used in CONFIG_HIGHMEM systems for memory pages which
+ * are not addressable by direct kernel virtual addresses.
+ *
+ * Copyright (C) 1999 Gerhard Wichert, Siemens AG
+ * Gerhard.Wichert@pdb.siemens.de
+ *
+ *
+ * Redesigned the x86 32-bit VM architecture to deal with
+ * up to 16 Terabyte physical memory. With current x86 CPUs
+ * we now support up to 64 Gigabytes physical RAM.
+ *
+ * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
+ */
+
+#ifndef _ASM_X86_HIGHMEM_H
+#define _ASM_X86_HIGHMEM_H
+
+#ifdef __KERNEL__
+
+#include <linux/interrupt.h>
+#include <linux/threads.h>
+#include <asm/kmap_types.h>
+#include <asm/tlbflush.h>
+#include <asm/fixmap.h>
+
+/* declarations for highmem.c */
+extern unsigned long highstart_pfn, highend_pfn;
+
+/*
+ * Right now we initialize only a single pte table. It can be extended
+ * easily; subsequent pte tables have to be allocated in one physical
+ * chunk of RAM.
+ */
+/*
+ * Ordering is:
+ *
+ * FIXADDR_TOP
+ * fixed_addresses
+ * FIXADDR_START
+ * temp fixed addresses
+ * FIXADDR_BOOT_START
+ * Persistent kmap area
+ * PKMAP_BASE
+ * VMALLOC_END
+ * Vmalloc area
+ * VMALLOC_START
+ * high_memory
+ */
+#define LAST_PKMAP_MASK (LAST_PKMAP-1)
+#define PKMAP_NR(virt) ((virt-PKMAP_BASE) >> PAGE_SHIFT)
+#define PKMAP_ADDR(nr) (PKMAP_BASE + ((nr) << PAGE_SHIFT))
+
+extern void *kmap_high(struct page *page);
+extern void kunmap_high(struct page *page);
+
+void *kmap(struct page *page);
+void kunmap(struct page *page);
+
+void *kmap_atomic_prot(struct page *page, pgprot_t prot);
+void *kmap_atomic(struct page *page);
+void __kunmap_atomic(void *kvaddr);
+void *kmap_atomic_pfn(unsigned long pfn);
+void *kmap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot);
+struct page *kmap_atomic_to_page(void *ptr);
+
+#define kmap_atomic_pte(page) \
+ kmap_atomic_prot(page, \
+ PagePinned(page) ? PAGE_KERNEL_RO : kmap_prot)
+
+#define flush_cache_kmaps() do { } while (0)
+
+extern void add_highpages_with_active_regions(int nid, unsigned long start_pfn,
+ unsigned long end_pfn);
+
+void clear_highpage(struct page *);
+static inline void clear_user_highpage(struct page *page, unsigned long vaddr)
+{
+ clear_highpage(page);
+}
+#define __HAVE_ARCH_CLEAR_HIGHPAGE
+#define clear_user_highpage clear_user_highpage
+#define __HAVE_ARCH_CLEAR_USER_HIGHPAGE
+
+void copy_highpage(struct page *to, struct page *from);
+static inline void copy_user_highpage(struct page *to, struct page *from,
+ unsigned long vaddr, struct vm_area_struct *vma)
+{
+ copy_highpage(to, from);
+}
+#define __HAVE_ARCH_COPY_HIGHPAGE
+#define __HAVE_ARCH_COPY_USER_HIGHPAGE
+
+#endif /* __KERNEL__ */
+
+#endif /* _ASM_X86_HIGHMEM_H */
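The kmap_atomic_pte() wrapper above is the Xen-specific part: pinned page-table pages must be mapped read-only, since the hypervisor refuses writable mappings of live page tables. The rest follows the standard atomic-kmap pattern, sketched here (kunmap_atomic() being the usual wrapper around __kunmap_atomic() from linux/highmem.h):

/* Sketch of the standard pattern these declarations support: map a
 * highmem page, touch it, unmap. Preemption stays disabled between
 * the two calls, so nothing here may sleep. */
static void example_zero_page(struct page *page)
{
	void *addr = kmap_atomic(page);

	memset(addr, 0, PAGE_SIZE);
	kunmap_atomic(addr);
}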
diff --git a/arch/x86/include/mach-xen/asm/hypercall.h b/arch/x86/include/mach-xen/asm/hypercall.h
new file mode 100644
index 000000000000..e19cf1068c13
--- /dev/null
+++ b/arch/x86/include/mach-xen/asm/hypercall.h
@@ -0,0 +1,434 @@
+/******************************************************************************
+ * hypercall.h
+ *
+ * Linux-specific hypervisor handling.
+ *
+ * Copyright (c) 2002-2004, K A Fraser
+ *
+ * 64-bit updates:
+ * Benjamin Liu <benjamin.liu@intel.com>
+ * Jun Nakajima <jun.nakajima@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef __HYPERCALL_H__
+#define __HYPERCALL_H__
+
+#ifndef __HYPERVISOR_H__
+# error "please don't include this file directly"
+#endif
+
+#ifdef CONFIG_XEN_PRIVILEGED_GUEST
+# include <xen/interface/platform.h>
+# include <xen/interface/arch-x86/xen-mca.h>
+#endif
+
+#ifdef CONFIG_XEN
+#define HYPERCALL_ASM_OPERAND "%c"
+#define HYPERCALL_LOCATION(op) (hypercall_page + (op) * 32)
+#define HYPERCALL_C_OPERAND(name) "i" (HYPERCALL_LOCATION(__HYPERVISOR_##name))
+#else
+#define HYPERCALL_ASM_OPERAND "*%"
+#define HYPERCALL_LOCATION(op) (hypercall_stubs + (op) * 32)
+#define HYPERCALL_C_OPERAND(name) "g" (HYPERCALL_LOCATION(__HYPERVISOR_##name))
+#endif
+
+#define HYPERCALL_ARG(arg, n) \
+ register typeof((arg)+0) __arg##n asm(HYPERCALL_arg##n) = (arg)
+
+#define _hypercall0(type, name) \
+({ \
+ type __res; \
+ asm volatile ( \
+ "call " HYPERCALL_ASM_OPERAND "1" \
+ : "=a" (__res) \
+ : HYPERCALL_C_OPERAND(name) \
+ : "memory" ); \
+ __res; \
+})
+
+#define _hypercall1(type, name, arg) \
+({ \
+ type __res; \
+ HYPERCALL_ARG(arg, 1); \
+ asm volatile ( \
+ "call " HYPERCALL_ASM_OPERAND "2" \
+ : "=a" (__res), "+r" (__arg1) \
+ : HYPERCALL_C_OPERAND(name) \
+ : "memory" ); \
+ __res; \
+})
+
+#define _hypercall2(type, name, a1, a2) \
+({ \
+ type __res; \
+ HYPERCALL_ARG(a1, 1); \
+ HYPERCALL_ARG(a2, 2); \
+ asm volatile ( \
+ "call " HYPERCALL_ASM_OPERAND "3" \
+ : "=a" (__res), "+r" (__arg1), "+r" (__arg2) \
+ : HYPERCALL_C_OPERAND(name) \
+ : "memory" ); \
+ __res; \
+})
+
+#define _hypercall3(type, name, a1, a2, a3) \
+({ \
+ type __res; \
+ HYPERCALL_ARG(a1, 1); \
+ HYPERCALL_ARG(a2, 2); \
+ HYPERCALL_ARG(a3, 3); \
+ asm volatile ( \
+ "call " HYPERCALL_ASM_OPERAND "4" \
+ : "=a" (__res), "+r" (__arg1), \
+ "+r" (__arg2), "+r" (__arg3) \
+ : HYPERCALL_C_OPERAND(name) \
+ : "memory" ); \
+ __res; \
+})
+
+#define _hypercall4(type, name, a1, a2, a3, a4) \
+({ \
+ type __res; \
+ HYPERCALL_ARG(a1, 1); \
+ HYPERCALL_ARG(a2, 2); \
+ HYPERCALL_ARG(a3, 3); \
+ HYPERCALL_ARG(a4, 4); \
+ asm volatile ( \
+ "call " HYPERCALL_ASM_OPERAND "5" \
+ : "=a" (__res), "+r" (__arg1), "+r" (__arg2), \
+ "+r" (__arg3), "+r" (__arg4) \
+ : HYPERCALL_C_OPERAND(name) \
+ : "memory" ); \
+ __res; \
+})
+
+#define _hypercall5(type, name, a1, a2, a3, a4, a5) \
+({ \
+ type __res; \
+ HYPERCALL_ARG(a1, 1); \
+ HYPERCALL_ARG(a2, 2); \
+ HYPERCALL_ARG(a3, 3); \
+ HYPERCALL_ARG(a4, 4); \
+ HYPERCALL_ARG(a5, 5); \
+ asm volatile ( \
+ "call " HYPERCALL_ASM_OPERAND "6" \
+ : "=a" (__res), "+r" (__arg1), "+r" (__arg2), \
+ "+r" (__arg3), "+r" (__arg4), "+r" (__arg5) \
+ : HYPERCALL_C_OPERAND(name) \
+ : "memory" ); \
+ __res; \
+})
+
+#define _hypercall(type, op, a1, a2, a3, a4, a5) \
+({ \
+ type __res; \
+ HYPERCALL_ARG(a1, 1); \
+ HYPERCALL_ARG(a2, 2); \
+ HYPERCALL_ARG(a3, 3); \
+ HYPERCALL_ARG(a4, 4); \
+ HYPERCALL_ARG(a5, 5); \
+ asm volatile ( \
+ "call *%6" \
+ : "=a" (__res), "+r" (__arg1), "+r" (__arg2), \
+ "+r" (__arg3), "+r" (__arg4), "+r" (__arg5) \
+ : "g" (HYPERCALL_LOCATION(op)) \
+ : "memory" ); \
+ __res; \
+})
+
+#ifdef CONFIG_X86_32
+# include "hypercall_32.h"
+#else
+# include "hypercall_64.h"
+#endif
+
+static inline int __must_check
+HYPERVISOR_set_trap_table(
+ const trap_info_t *table)
+{
+ return _hypercall1(int, set_trap_table, table);
+}
+
+static inline int __must_check
+HYPERVISOR_mmu_update(
+ mmu_update_t *req, unsigned int count, unsigned int *success_count,
+ domid_t domid)
+{
+ if (arch_use_lazy_mmu_mode())
+ return xen_multi_mmu_update(req, count, success_count, domid);
+ return _hypercall4(int, mmu_update, req, count, success_count, domid);
+}
+
+static inline int __must_check
+HYPERVISOR_mmuext_op(
+ struct mmuext_op *op, unsigned int count, unsigned int *success_count,
+ domid_t domid)
+{
+ if (arch_use_lazy_mmu_mode())
+ return xen_multi_mmuext_op(op, count, success_count, domid);
+ return _hypercall4(int, mmuext_op, op, count, success_count, domid);
+}
+
+static inline int __must_check
+HYPERVISOR_set_gdt(
+ unsigned long *frame_list, unsigned int entries)
+{
+ return _hypercall2(int, set_gdt, frame_list, entries);
+}
+
+static inline int __must_check
+HYPERVISOR_stack_switch(
+ unsigned long ss, unsigned long esp)
+{
+ return _hypercall2(int, stack_switch, ss, esp);
+}
+
+static inline int
+HYPERVISOR_fpu_taskswitch(
+ int set)
+{
+ return _hypercall1(int, fpu_taskswitch, set);
+}
+
+#if CONFIG_XEN_COMPAT <= 0x030002
+static inline int __must_check
+HYPERVISOR_sched_op_compat(
+ int cmd, unsigned long arg)
+{
+ return _hypercall2(int, sched_op_compat, cmd, arg);
+}
+#endif
+
+static inline int __must_check
+HYPERVISOR_sched_op(
+ int cmd, void *arg)
+{
+ return _hypercall2(int, sched_op, cmd, arg);
+}
+
+#ifdef CONFIG_XEN_PRIVILEGED_GUEST
+static inline int __must_check
+HYPERVISOR_platform_op(
+ struct xen_platform_op *platform_op)
+{
+ platform_op->interface_version = XENPF_INTERFACE_VERSION;
+ return _hypercall1(int, platform_op, platform_op);
+}
+
+static inline int __must_check
+HYPERVISOR_mca(
+ struct xen_mc *mc_op)
+{
+ mc_op->interface_version = XEN_MCA_INTERFACE_VERSION;
+ return _hypercall1(int, mca, mc_op);
+}
+#endif
+
+static inline int __must_check
+HYPERVISOR_set_debugreg(
+ unsigned int reg, unsigned long value)
+{
+ return _hypercall2(int, set_debugreg, reg, value);
+}
+
+static inline unsigned long __must_check
+HYPERVISOR_get_debugreg(
+ unsigned int reg)
+{
+ return _hypercall1(unsigned long, get_debugreg, reg);
+}
+
+static inline int __must_check
+HYPERVISOR_memory_op(
+ unsigned int cmd, void *arg)
+{
+ if (arch_use_lazy_mmu_mode())
+ xen_multicall_flush();
+ return _hypercall2(int, memory_op, cmd, arg);
+}
+
+static inline int __must_check
+HYPERVISOR_multicall(
+ multicall_entry_t *call_list, unsigned int nr_calls)
+{
+ return _hypercall2(int, multicall, call_list, nr_calls);
+}
+
+static inline int __must_check
+HYPERVISOR_event_channel_op(
+ int cmd, void *arg)
+{
+ int rc = _hypercall2(int, event_channel_op, cmd, arg);
+
+#if CONFIG_XEN_COMPAT <= 0x030002
+ if (unlikely(rc == -ENOSYS)) {
+ struct evtchn_op op;
+ op.cmd = cmd;
+ memcpy(&op.u, arg, sizeof(op.u));
+ rc = _hypercall1(int, event_channel_op_compat, &op);
+ memcpy(arg, &op.u, sizeof(op.u));
+ }
+#endif
+
+ return rc;
+}
+
+static inline int __must_check
+HYPERVISOR_xen_version(
+ int cmd, void *arg)
+{
+ return _hypercall2(int, xen_version, cmd, arg);
+}
+
+static inline int __must_check
+HYPERVISOR_console_io(
+ int cmd, unsigned int count, char *str)
+{
+ return _hypercall3(int, console_io, cmd, count, str);
+}
+
+static inline int __must_check
+HYPERVISOR_physdev_op(
+ int cmd, void *arg)
+{
+ int rc = _hypercall2(int, physdev_op, cmd, arg);
+
+#if CONFIG_XEN_COMPAT <= 0x030002
+ if (unlikely(rc == -ENOSYS)) {
+ struct physdev_op op;
+ op.cmd = cmd;
+ memcpy(&op.u, arg, sizeof(op.u));
+ rc = _hypercall1(int, physdev_op_compat, &op);
+ memcpy(arg, &op.u, sizeof(op.u));
+ }
+#endif
+
+ return rc;
+}
+
+static inline int __must_check
+HYPERVISOR_grant_table_op(
+ unsigned int cmd, void *uop, unsigned int count)
+{
+ bool fixup = false;
+ int rc;
+
+ if (arch_use_lazy_mmu_mode())
+ xen_multicall_flush();
+#ifdef GNTTABOP_map_grant_ref
+ if (cmd == GNTTABOP_map_grant_ref)
+#endif
+ fixup = gnttab_pre_map_adjust(cmd, uop, count);
+ rc = _hypercall3(int, grant_table_op, cmd, uop, count);
+ if (rc == 0 && fixup)
+ rc = gnttab_post_map_adjust(uop, count);
+ return rc;
+}
+
+static inline int __must_check
+HYPERVISOR_vm_assist(
+ unsigned int cmd, unsigned int type)
+{
+ return _hypercall2(int, vm_assist, cmd, type);
+}
+
+static inline int __must_check
+HYPERVISOR_vcpu_op(
+ int cmd, unsigned int vcpuid, void *extra_args)
+{
+ return _hypercall3(int, vcpu_op, cmd, vcpuid, extra_args);
+}
+
+static inline int __must_check
+HYPERVISOR_suspend(
+ unsigned long srec)
+{
+ struct sched_shutdown sched_shutdown = {
+ .reason = SHUTDOWN_suspend
+ };
+
+ int rc = _hypercall3(int, sched_op, SCHEDOP_shutdown,
+ &sched_shutdown, srec);
+
+#if CONFIG_XEN_COMPAT <= 0x030002
+ if (rc == -ENOSYS)
+ rc = _hypercall3(int, sched_op_compat, SCHEDOP_shutdown,
+ SHUTDOWN_suspend, srec);
+#endif
+
+ return rc;
+}
+
+#if CONFIG_XEN_COMPAT <= 0x030002
+static inline int
+HYPERVISOR_nmi_op(
+ unsigned long op, void *arg)
+{
+ return _hypercall2(int, nmi_op, op, arg);
+}
+#endif
+
+#ifndef CONFIG_XEN
+static inline unsigned long __must_check
+HYPERVISOR_hvm_op(
+ int op, void *arg)
+{
+ return _hypercall2(unsigned long, hvm_op, op, arg);
+}
+#endif
+
+static inline int __must_check
+HYPERVISOR_callback_op(
+ int cmd, const void *arg)
+{
+ return _hypercall2(int, callback_op, cmd, arg);
+}
+
+static inline int __must_check
+HYPERVISOR_xenoprof_op(
+ int op, void *arg)
+{
+ return _hypercall2(int, xenoprof_op, op, arg);
+}
+
+static inline int __must_check
+HYPERVISOR_kexec_op(
+ unsigned long op, void *args)
+{
+ return _hypercall2(int, kexec_op, op, args);
+}
+
+struct tmem_op;
+
+static inline int __must_check
+HYPERVISOR_tmem_op(
+ struct tmem_op *op)
+{
+ return _hypercall1(int, tmem_op, (void *)op);
+}
+
+#endif /* __HYPERCALL_H__ */
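All wrappers above follow one pattern: bind each argument to its fixed ABI register via HYPERCALL_ARG(), then call the per-hypercall stub at HYPERCALL_LOCATION(op), i.e. 32 bytes into the hypercall page per hypercall number. A hypothetical wrapper for some new two-argument hypercall would look like this (__HYPERVISOR_example_op is illustrative, not a real hypercall):

/* Hypothetical wrapper, for illustration only -- example_op is not a
 * real hypercall. The macro places cmd/arg in HYPERCALL_arg1/arg2 and
 * calls hypercall_page + __HYPERVISOR_example_op * 32. */
static inline int __must_check
HYPERVISOR_example_op(
	unsigned int cmd, void *arg)
{
	return _hypercall2(int, example_op, cmd, arg);
}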
diff --git a/arch/x86/include/mach-xen/asm/hypercall_32.h b/arch/x86/include/mach-xen/asm/hypercall_32.h
new file mode 100644
index 000000000000..3987b2eddf0e
--- /dev/null
+++ b/arch/x86/include/mach-xen/asm/hypercall_32.h
@@ -0,0 +1,62 @@
+#define HYPERCALL_arg1 "ebx"
+#define HYPERCALL_arg2 "ecx"
+#define HYPERCALL_arg3 "edx"
+#define HYPERCALL_arg4 "esi"
+#define HYPERCALL_arg5 "edi"
+
+#if CONFIG_XEN_COMPAT <= 0x030002
+static inline int __must_check
+HYPERVISOR_set_callbacks(
+ unsigned long event_selector, unsigned long event_address,
+ unsigned long failsafe_selector, unsigned long failsafe_address)
+{
+ return _hypercall4(int, set_callbacks,
+ event_selector, event_address,
+ failsafe_selector, failsafe_address);
+}
+#endif
+
+static inline long __must_check
+HYPERVISOR_set_timer_op(
+ u64 timeout)
+{
+ return _hypercall2(long, set_timer_op,
+ (unsigned long)timeout,
+ (unsigned long)(timeout>>32));
+}
+
+static inline int __must_check
+HYPERVISOR_update_descriptor(
+ u64 ma, u64 desc)
+{
+ return _hypercall4(int, update_descriptor,
+ (unsigned long)ma, (unsigned long)(ma>>32),
+ (unsigned long)desc, (unsigned long)(desc>>32));
+}
+
+static inline int __must_check
+HYPERVISOR_update_va_mapping(
+ unsigned long va, pte_t new_val, unsigned long flags)
+{
+ unsigned long pte_hi = 0;
+
+ if (arch_use_lazy_mmu_mode())
+ return xen_multi_update_va_mapping(va, new_val, flags);
+#ifdef CONFIG_X86_PAE
+ pte_hi = new_val.pte_high;
+#endif
+ return _hypercall4(int, update_va_mapping, va,
+ new_val.pte_low, pte_hi, flags);
+}
+
+static inline int __must_check
+HYPERVISOR_update_va_mapping_otherdomain(
+ unsigned long va, pte_t new_val, unsigned long flags, domid_t domid)
+{
+ unsigned long pte_hi = 0;
+#ifdef CONFIG_X86_PAE
+ pte_hi = new_val.pte_high;
+#endif
+ return _hypercall5(int, update_va_mapping_otherdomain, va,
+ new_val.pte_low, pte_hi, flags, domid);
+}
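On 32-bit, a 64-bit hypercall argument occupies two consecutive argument registers, low half first, as set_timer_op() and update_descriptor() above show. A one-liner sketch of the split:

/* Sketch: the low/high split used by the 64-bit-argument wrappers
 * above; the halves land in HYPERCALL_arg1 (ebx) and arg2 (ecx). */
static inline void example_split_u64(u64 v, unsigned long *lo,
				     unsigned long *hi)
{
	*lo = (unsigned long)v;
	*hi = (unsigned long)(v >> 32);
}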
diff --git a/arch/x86/include/mach-xen/asm/hypercall_64.h b/arch/x86/include/mach-xen/asm/hypercall_64.h
new file mode 100644
index 000000000000..97d944572b8b
--- /dev/null
+++ b/arch/x86/include/mach-xen/asm/hypercall_64.h
@@ -0,0 +1,54 @@
+#define HYPERCALL_arg1 "rdi"
+#define HYPERCALL_arg2 "rsi"
+#define HYPERCALL_arg3 "rdx"
+#define HYPERCALL_arg4 "r10"
+#define HYPERCALL_arg5 "r8"
+
+#if CONFIG_XEN_COMPAT <= 0x030002
+static inline int __must_check
+HYPERVISOR_set_callbacks(
+ unsigned long event_address, unsigned long failsafe_address,
+ unsigned long syscall_address)
+{
+ return _hypercall3(int, set_callbacks,
+ event_address, failsafe_address, syscall_address);
+}
+#endif
+
+static inline long __must_check
+HYPERVISOR_set_timer_op(
+ u64 timeout)
+{
+ return _hypercall1(long, set_timer_op, timeout);
+}
+
+static inline int __must_check
+HYPERVISOR_update_descriptor(
+ unsigned long ma, unsigned long word)
+{
+ return _hypercall2(int, update_descriptor, ma, word);
+}
+
+static inline int __must_check
+HYPERVISOR_update_va_mapping(
+ unsigned long va, pte_t new_val, unsigned long flags)
+{
+ if (arch_use_lazy_mmu_mode())
+ return xen_multi_update_va_mapping(va, new_val, flags);
+ return _hypercall3(int, update_va_mapping, va, new_val.pte, flags);
+}
+
+static inline int __must_check
+HYPERVISOR_update_va_mapping_otherdomain(
+ unsigned long va, pte_t new_val, unsigned long flags, domid_t domid)
+{
+ return _hypercall4(int, update_va_mapping_otherdomain, va,
+ new_val.pte, flags, domid);
+}
+
+static inline int __must_check
+HYPERVISOR_set_segment_base(
+ int reg, unsigned long value)
+{
+ return _hypercall2(int, set_segment_base, reg, value);
+}
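Putting the pieces together, here is roughly what _hypercall2(int, xen_version, cmd, arg) expands to on x86-64 under CONFIG_XEN, using the argument registers from this header; a sketch for illustration, not the literal preprocessor output:

/* Approximate expansion sketch of _hypercall2() on x86-64: arguments
 * pinned to rdi/rsi, result in rax, direct call into the stub at
 * hypercall_page + 32 * __HYPERVISOR_xen_version. */
static inline int example_xen_version(int cmd, void *arg)
{
	int res;
	register int __arg1 asm("rdi") = cmd;
	register void *__arg2 asm("rsi") = arg;

	asm volatile("call %c3"
		     : "=a" (res), "+r" (__arg1), "+r" (__arg2)
		     : "i" (hypercall_page + __HYPERVISOR_xen_version * 32)
		     : "memory");
	return res;
}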
diff --git a/arch/x86/include/mach-xen/asm/hypervisor.h b/arch/x86/include/mach-xen/asm/hypervisor.h
new file mode 100644
index 000000000000..d01075908f83
--- /dev/null
+++ b/arch/x86/include/mach-xen/asm/hypervisor.h
@@ -0,0 +1,403 @@
+/******************************************************************************
+ * hypervisor.h
+ *
+ * Linux-specific hypervisor handling.
+ *
+ * Copyright (c) 2002-2004, K A Fraser
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef __HYPERVISOR_H__
+#define __HYPERVISOR_H__
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <xen/interface/xen.h>
+#include <xen/interface/sched.h>
+#include <xen/interface/vcpu.h>
+#include <asm/percpu.h>
+#include <asm/ptrace.h>
+#include <asm/pgtable_types.h>
+
+extern shared_info_t *HYPERVISOR_shared_info;
+
+#if defined(CONFIG_XEN_VCPU_INFO_PLACEMENT)
+DECLARE_PER_CPU(struct vcpu_info, vcpu_info);
+# define vcpu_info(cpu) (&per_cpu(vcpu_info, cpu))
+# define current_vcpu_info() (&__get_cpu_var(vcpu_info))
+void setup_vcpu_info(unsigned int cpu);
+void adjust_boot_vcpu_info(void);
+#elif defined(CONFIG_XEN)
+# define vcpu_info(cpu) (HYPERVISOR_shared_info->vcpu_info + (cpu))
+# ifdef CONFIG_SMP
+# include <asm/smp-processor-id.h>
+# define current_vcpu_info() vcpu_info(smp_processor_id())
+# else
+# define current_vcpu_info() vcpu_info(0)
+# endif
+static inline void setup_vcpu_info(unsigned int cpu) {}
+#endif
+
+#ifdef CONFIG_X86_32
+extern unsigned long hypervisor_virt_start;
+#endif
+
+/* arch/xen/i386/kernel/setup.c */
+extern start_info_t *xen_start_info;
+#ifdef CONFIG_XEN_PRIVILEGED_GUEST
+#define is_initial_xendomain() (xen_start_info->flags & SIF_INITDOMAIN)
+#else
+#define is_initial_xendomain() 0
+#endif
+
+#define init_hypervisor(c) ((void)(c))
+#define init_hypervisor_platform() init_hypervisor(&boot_cpu_data)
+
+DECLARE_PER_CPU(struct vcpu_runstate_info, runstate);
+#define vcpu_running(cpu) (per_cpu(runstate.state, cpu) == RUNSTATE_running)
+
+/* arch/xen/kernel/evtchn.c */
+/* Force a proper event-channel callback from Xen. */
+void force_evtchn_callback(void);
+
+/* arch/xen/kernel/process.c */
+void xen_cpu_idle (void);
+
+/* arch/xen/i386/kernel/hypervisor.c */
+void do_hypervisor_callback(struct pt_regs *regs);
+
+/* arch/xen/i386/mm/hypervisor.c */
+/*
+ * NB. ptr values should be PHYSICAL, not MACHINE. 'vals' should already
+ * be MACHINE addresses.
+ */
+
+void xen_pt_switch(pgd_t *);
+void xen_new_user_pt(pgd_t *); /* x86_64 only */
+void xen_load_gs(unsigned int selector); /* x86_64 only */
+void xen_tlb_flush(void);
+void xen_invlpg(unsigned long ptr);
+
+void xen_l1_entry_update(pte_t *ptr, pte_t val);
+void xen_l2_entry_update(pmd_t *ptr, pmd_t val);
+void xen_l3_entry_update(pud_t *ptr, pud_t val); /* x86_64/PAE */
+void xen_l4_entry_update(pgd_t *ptr, pgd_t val); /* x86_64 only */
+void xen_pgd_pin(pgd_t *);
+void xen_pgd_unpin(pgd_t *);
+
+void xen_init_pgd_pin(void);
+#ifdef CONFIG_PM_SLEEP
+void setup_pfn_to_mfn_frame_list(void *(*)(unsigned long, unsigned long,
+ unsigned long));
+#endif
+
+void xen_set_ldt(const void *ptr, unsigned int ents);
+
+#ifdef CONFIG_SMP
+#include <linux/cpumask.h>
+void xen_tlb_flush_all(void);
+void xen_invlpg_all(unsigned long ptr);
+void xen_tlb_flush_mask(const cpumask_t *mask);
+void xen_invlpg_mask(const cpumask_t *mask, unsigned long ptr);
+#else
+#define xen_tlb_flush_all xen_tlb_flush
+#define xen_invlpg_all xen_invlpg
+#endif
+
+/* Returns zero on success else negative errno. */
+int xen_create_contiguous_region(
+ unsigned long vstart, unsigned int order, unsigned int address_bits);
+void xen_destroy_contiguous_region(
+ unsigned long vstart, unsigned int order);
+int early_create_contiguous_region(unsigned long pfn, unsigned int order,
+ unsigned int address_bits);
+
+struct page;
+
+int xen_limit_pages_to_max_mfn(
+ struct page *pages, unsigned int order, unsigned int address_bits);
+
+bool __cold hypervisor_oom(void);
+
+/* Turn jiffies into Xen system time. */
+u64 jiffies_to_st(unsigned long jiffies);
+
+#ifdef CONFIG_XEN_SCRUB_PAGES
+void xen_scrub_pages(void *, unsigned int);
+#else
+#define xen_scrub_pages(_p,_n) ((void)0)
+#endif
+
+#if defined(CONFIG_XEN) && !defined(MODULE)
+
+DECLARE_PER_CPU(bool, xen_lazy_mmu);
+
+void xen_multicall_flush(void);
+
+int __must_check xen_multi_update_va_mapping(unsigned long va, pte_t,
+ unsigned long flags);
+int __must_check xen_multi_mmu_update(mmu_update_t *, unsigned int count,
+ unsigned int *success_count, domid_t);
+int __must_check xen_multi_mmuext_op(struct mmuext_op *, unsigned int count,
+ unsigned int *success_count, domid_t);
+
+#define __HAVE_ARCH_ENTER_LAZY_MMU_MODE
+static inline void arch_enter_lazy_mmu_mode(void)
+{
+ this_cpu_write_1(xen_lazy_mmu, true);
+}
+
+static inline void arch_leave_lazy_mmu_mode(void)
+{
+ this_cpu_write_1(xen_lazy_mmu, false);
+ xen_multicall_flush();
+}
+
+#define arch_use_lazy_mmu_mode() unlikely(this_cpu_read_1(xen_lazy_mmu))
+
+#if 0 /* All uses are in places potentially called asynchronously, but
+ * asynchronous code should rather not make use of lazy mode at all.
+ * Therefore, all uses of this function get commented out, proper
+ * detection of asynchronous invocations is added wherever needed,
+ * and this function is disabled to catch any new (improper) uses.
+ */
+static inline void arch_flush_lazy_mmu_mode(void)
+{
+ if (arch_use_lazy_mmu_mode())
+ xen_multicall_flush();
+}
+#endif
+
+#else /* !CONFIG_XEN || MODULE */
+
+static inline void xen_multicall_flush(void) {}
+#define arch_use_lazy_mmu_mode() false
+#define xen_multi_update_va_mapping(...) ({ BUG(); -ENOSYS; })
+#define xen_multi_mmu_update(...) ({ BUG(); -ENOSYS; })
+#define xen_multi_mmuext_op(...) ({ BUG(); -ENOSYS; })
+
+#endif /* CONFIG_XEN && !MODULE */
+
+#ifdef CONFIG_XEN
+
+struct gnttab_map_grant_ref;
+bool gnttab_pre_map_adjust(unsigned int cmd, struct gnttab_map_grant_ref *,
+ unsigned int count);
+#if CONFIG_XEN_COMPAT < 0x030400
+int gnttab_post_map_adjust(const struct gnttab_map_grant_ref *, unsigned int);
+#else
+static inline int gnttab_post_map_adjust(const struct gnttab_map_grant_ref *m,
+ unsigned int count)
+{
+ BUG();
+ return -ENOSYS;
+}
+#endif
+
+#else /* !CONFIG_XEN */
+
+#define gnttab_pre_map_adjust(...) false
+#define gnttab_post_map_adjust(...) ({ BUG(); -ENOSYS; })
+
+#endif /* CONFIG_XEN */
+
+#if defined(CONFIG_X86_64)
+#define MULTI_UVMFLAGS_INDEX 2
+#define MULTI_UVMDOMID_INDEX 3
+#else
+#define MULTI_UVMFLAGS_INDEX 3
+#define MULTI_UVMDOMID_INDEX 4
+#endif
+
+#ifdef CONFIG_XEN
+#define is_running_on_xen() 1
+extern char hypercall_page[PAGE_SIZE];
+#else
+extern char *hypercall_stubs;
+#define is_running_on_xen() (!!hypercall_stubs)
+#endif
+
+#include <xen/hypercall.h>
+
+static inline int
+HYPERVISOR_yield(
+ void)
+{
+ int rc = HYPERVISOR_sched_op(SCHEDOP_yield, NULL);
+
+#if CONFIG_XEN_COMPAT <= 0x030002
+ if (rc == -ENOSYS)
+ rc = HYPERVISOR_sched_op_compat(SCHEDOP_yield, 0);
+#endif
+
+ return rc;
+}
+
+static inline int
+HYPERVISOR_block(
+ void)
+{
+ int rc = HYPERVISOR_sched_op(SCHEDOP_block, NULL);
+
+#if CONFIG_XEN_COMPAT <= 0x030002
+ if (rc == -ENOSYS)
+ rc = HYPERVISOR_sched_op_compat(SCHEDOP_block, 0);
+#endif
+
+ return rc;
+}
+
+static inline void __noreturn
+HYPERVISOR_shutdown(
+ unsigned int reason)
+{
+ struct sched_shutdown sched_shutdown = {
+ .reason = reason
+ };
+
+ VOID(HYPERVISOR_sched_op(SCHEDOP_shutdown, &sched_shutdown));
+#if CONFIG_XEN_COMPAT <= 0x030002
+ VOID(HYPERVISOR_sched_op_compat(SCHEDOP_shutdown, reason));
+#endif
+ /* Don't recurse needlessly. */
+ BUG_ON(reason != SHUTDOWN_crash);
+ for(;;);
+}
+
+static inline int __must_check
+HYPERVISOR_poll(
+ evtchn_port_t *ports, unsigned int nr_ports, u64 timeout)
+{
+ int rc;
+ struct sched_poll sched_poll = {
+ .nr_ports = nr_ports,
+ .timeout = jiffies_to_st(timeout)
+ };
+ set_xen_guest_handle(sched_poll.ports, ports);
+
+ rc = HYPERVISOR_sched_op(SCHEDOP_poll, &sched_poll);
+#if CONFIG_XEN_COMPAT <= 0x030002
+ if (rc == -ENOSYS)
+ rc = HYPERVISOR_sched_op_compat(SCHEDOP_yield, 0);
+#endif
+
+ return rc;
+}
+
+static inline int __must_check
+HYPERVISOR_poll_no_timeout(
+ evtchn_port_t *ports, unsigned int nr_ports)
+{
+ int rc;
+ struct sched_poll sched_poll = {
+ .nr_ports = nr_ports
+ };
+ set_xen_guest_handle(sched_poll.ports, ports);
+
+ rc = HYPERVISOR_sched_op(SCHEDOP_poll, &sched_poll);
+#if CONFIG_XEN_COMPAT <= 0x030002
+ if (rc == -ENOSYS)
+ rc = HYPERVISOR_sched_op_compat(SCHEDOP_yield, 0);
+#endif
+
+ return rc;
+}
+
+#ifdef CONFIG_XEN
+
+static inline void
+MULTI_update_va_mapping(
+ multicall_entry_t *mcl, unsigned long va,
+ pte_t new_val, unsigned long flags)
+{
+ mcl->op = __HYPERVISOR_update_va_mapping;
+ mcl->args[0] = va;
+#if defined(CONFIG_X86_64)
+ mcl->args[1] = new_val.pte;
+#elif defined(CONFIG_X86_PAE)
+ mcl->args[1] = new_val.pte_low;
+ mcl->args[2] = new_val.pte_high;
+#else
+ mcl->args[1] = new_val.pte_low;
+ mcl->args[2] = 0;
+#endif
+ mcl->args[MULTI_UVMFLAGS_INDEX] = flags;
+}
+
+static inline void
+MULTI_mmu_update(multicall_entry_t *mcl, mmu_update_t *req,
+ unsigned int count, unsigned int *success_count,
+ domid_t domid)
+{
+ mcl->op = __HYPERVISOR_mmu_update;
+ mcl->args[0] = (unsigned long)req;
+ mcl->args[1] = count;
+ mcl->args[2] = (unsigned long)success_count;
+ mcl->args[3] = domid;
+}
+
+static inline void
+MULTI_memory_op(multicall_entry_t *mcl, unsigned int cmd, void *arg)
+{
+ mcl->op = __HYPERVISOR_memory_op;
+ mcl->args[0] = cmd;
+ mcl->args[1] = (unsigned long)arg;
+}
+
+static inline void
+MULTI_grant_table_op(multicall_entry_t *mcl, unsigned int cmd,
+ void *uop, unsigned int count)
+{
+ mcl->op = __HYPERVISOR_grant_table_op;
+ mcl->args[0] = cmd;
+ mcl->args[1] = (unsigned long)uop;
+ mcl->args[2] = count;
+}
+
+#else /* !defined(CONFIG_XEN) */
+
+/* Multicalls not supported for HVM guests. */
+static inline void MULTI_bug(multicall_entry_t *mcl, ...)
+{
+ BUG_ON(mcl);
+}
+
+#define MULTI_update_va_mapping MULTI_bug
+#define MULTI_mmu_update MULTI_bug
+#define MULTI_memory_op MULTI_bug
+#define MULTI_grant_table_op MULTI_bug
+
+#endif
+
+#define uvm_multi(cpumask) ((unsigned long)cpumask_bits(cpumask) | UVMF_MULTI)
+
+#ifdef LINUX
+/* drivers/staging/ use Windows-style types, including VOID */
+#undef VOID
+#endif
+
+#endif /* __HYPERVISOR_H__ */
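The lazy-MMU hooks above implement batching: while the per-CPU xen_lazy_mmu flag is set, the HYPERVISOR_mmu_update()-style wrappers divert into the xen_multi_*() queues, and arch_leave_lazy_mmu_mode() flushes everything in one multicall. A hedged sketch of the resulting pattern:

/* Sketch, assuming xen_l1_entry_update() honours lazy mode the way the
 * wrappers in hypercall.h do: n pte updates collapse into multicall
 * batches instead of n separate hypervisor traps. */
static void example_batched_updates(pte_t *ptep, pte_t val, unsigned int n)
{
	unsigned int i;

	arch_enter_lazy_mmu_mode();
	for (i = 0; i < n; i++)
		xen_l1_entry_update(ptep + i, val);	/* queued */
	arch_leave_lazy_mmu_mode();	/* xen_multicall_flush() runs here */
}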
diff --git a/arch/x86/include/mach-xen/asm/io.h b/arch/x86/include/mach-xen/asm/io.h
new file mode 100644
index 000000000000..2d07f8a8ac83
--- /dev/null
+++ b/arch/x86/include/mach-xen/asm/io.h
@@ -0,0 +1,343 @@
+#ifndef _ASM_X86_IO_H
+#define _ASM_X86_IO_H
+
+/*
+ * This file contains the definitions for the x86 IO instructions
+ * inb/inw/inl/outb/outw/outl and the "string versions" of the same
+ * (insb/insw/insl/outsb/outsw/outsl). You can also use "pausing"
+ * versions of the single-IO instructions (inb_p/inw_p/..).
+ *
+ * This file is not meant to be obfuscating: it's just complicated
+ * to (a) handle it all in a way that makes gcc able to optimize it
+ * as well as possible and (b) trying to avoid writing the same thing
+ * over and over again with slight variations and possibly making a
+ * mistake somewhere.
+ */
+
+/*
+ * Thanks to James van Artsdalen for a better timing-fix than
+ * the two short jumps: using outb's to a nonexistent port seems
+ * to guarantee better timings even on fast machines.
+ *
+ * On the other hand, I'd like to be sure of a non-existent port:
+ * I feel a bit unsafe about using 0x80 (should be safe, though)
+ *
+ * Linus
+ */
+
+ /*
+ * Bit simplified and optimized by Jan Hubicka
+ * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999.
+ *
+ * isa_memset_io, isa_memcpy_fromio, isa_memcpy_toio added,
+ * isa_read[wl] and isa_write[wl] fixed
+ * - Arnaldo Carvalho de Melo <acme@conectiva.com.br>
+ */
+
+#define ARCH_HAS_IOREMAP_WC
+
+#include <linux/string.h>
+#include <linux/compiler.h>
+#include <asm/page.h>
+#ifdef __KERNEL__
+#include <asm/fixmap.h>
+#endif
+
+#define build_mmio_read(name, size, type, reg, barrier) \
+static inline type name(const volatile void __iomem *addr) \
+{ type ret; asm volatile("mov" size " %1,%0":reg (ret) \
+:"m" (*(volatile type __force *)addr) barrier); return ret; }
+
+#define build_mmio_write(name, size, type, reg, barrier) \
+static inline void name(type val, volatile void __iomem *addr) \
+{ asm volatile("mov" size " %0,%1": :reg (val), \
+"m" (*(volatile type __force *)addr) barrier); }
+
+build_mmio_read(readb, "b", unsigned char, "=q", :"memory")
+build_mmio_read(readw, "w", unsigned short, "=r", :"memory")
+build_mmio_read(readl, "l", unsigned int, "=r", :"memory")
+
+build_mmio_read(__readb, "b", unsigned char, "=q", )
+build_mmio_read(__readw, "w", unsigned short, "=r", )
+build_mmio_read(__readl, "l", unsigned int, "=r", )
+
+build_mmio_write(writeb, "b", unsigned char, "q", :"memory")
+build_mmio_write(writew, "w", unsigned short, "r", :"memory")
+build_mmio_write(writel, "l", unsigned int, "r", :"memory")
+
+build_mmio_write(__writeb, "b", unsigned char, "q", )
+build_mmio_write(__writew, "w", unsigned short, "r", )
+build_mmio_write(__writel, "l", unsigned int, "r", )
+
+#define readb_relaxed(a) __readb(a)
+#define readw_relaxed(a) __readw(a)
+#define readl_relaxed(a) __readl(a)
+#define __raw_readb __readb
+#define __raw_readw __readw
+#define __raw_readl __readl
+
+#define __raw_writeb __writeb
+#define __raw_writew __writew
+#define __raw_writel __writel
+
+#define mmiowb() barrier()
+
+#ifdef CONFIG_X86_64
+
+build_mmio_read(readq, "q", unsigned long, "=r", :"memory")
+build_mmio_write(writeq, "q", unsigned long, "r", :"memory")
+
+#define readq_relaxed(a) readq(a)
+
+#define __raw_readq(a) readq(a)
+#define __raw_writeq(val, addr) writeq(val, addr)
+
+/* Let people know that we have them */
+#define readq readq
+#define writeq writeq
+
+#endif
+
+/**
+ * virt_to_phys - map virtual addresses to physical
+ * @address: address to remap
+ *
+ * The returned physical address is the physical (CPU) mapping for
+ * the memory address given. It is only valid to use this function on
+ * addresses directly mapped or allocated via kmalloc.
+ *
+ * This function does not give bus mappings for DMA transfers. In
+ * almost all conceivable cases a device driver should not be using
+ * this function
+ */
+
+static inline phys_addr_t virt_to_phys(volatile void *address)
+{
+ return __pa(address);
+}
+
+/**
+ * phys_to_virt - map physical address to virtual
+ * @address: address to remap
+ *
+ * The returned virtual address is a current CPU mapping for
+ * the memory address given. It is only valid to use this function on
+ * addresses that have a kernel mapping
+ *
+ * This function does not handle bus mappings for DMA transfers. In
+ * almost all conceivable cases a device driver should not be using
+ * this function
+ */
+
+static inline void *phys_to_virt(phys_addr_t address)
+{
+ return __va(address);
+}
+
+/*
+ * Change "struct page" to physical address.
+ */
+#define page_to_pseudophys(page) ((dma_addr_t)page_to_pfn(page) << PAGE_SHIFT)
+#undef page_to_phys
+#define page_to_phys(page) (phys_to_machine(page_to_pseudophys(page)))
+#define page_to_bus(page) (phys_to_machine(page_to_pseudophys(page)))
+
+/*
+ * ISA I/O bus memory addresses are 1:1 with the physical address.
+ * However, we truncate the address to unsigned int to avoid undesirable
+ * promotions in legacy drivers.
+ */
+#define isa_virt_to_bus(_x) ({ \
+ unsigned long _va_ = (unsigned long)(_x); \
+ _va_ - fix_to_virt(FIX_ISAMAP_BEGIN) < (NR_FIX_ISAMAPS << PAGE_SHIFT) \
+ ? _va_ - fix_to_virt(FIX_ISAMAP_BEGIN) \
+ : ({ BUG(); (unsigned long)virt_to_bus(_va_); }); })
+#define isa_bus_to_virt(_x) ((void *)fix_to_virt(FIX_ISAMAP_BEGIN) + (_x))
+
+/*
+ * However PCI ones are not necessarily 1:1 and therefore these interfaces
+ * are forbidden in portable PCI drivers.
+ *
+ * Allow them on x86 for legacy drivers, though.
+ */
+#define virt_to_bus(_x) phys_to_machine(__pa(_x))
+#define bus_to_virt(_x) __va(machine_to_phys(_x))
+
+/**
+ * ioremap - map bus memory into CPU space
+ * @offset: bus address of the memory
+ * @size: size of the resource to map
+ *
+ * ioremap performs a platform specific sequence of operations to
+ * make bus memory CPU accessible via the readb/readw/readl/writeb/
+ * writew/writel functions and the other mmio helpers. The returned
+ * address is not guaranteed to be usable directly as a virtual
+ * address.
+ *
+ * If the area you are trying to map is a PCI BAR you should have a
+ * look at pci_iomap().
+ */
+extern void __iomem *ioremap_nocache(resource_size_t offset, unsigned long size);
+extern void __iomem *ioremap_cache(resource_size_t offset, unsigned long size);
+extern void __iomem *ioremap_prot(resource_size_t offset, unsigned long size,
+ unsigned long prot_val);
+
+/*
+ * The default ioremap() behavior is non-cached:
+ */
+static inline void __iomem *ioremap(resource_size_t offset, unsigned long size)
+{
+ return ioremap_nocache(offset, size);
+}
+
+extern void iounmap(volatile void __iomem *addr);
+
+extern void set_iounmap_nonlazy(void);
+
+#ifdef __KERNEL__
+
+#include <asm-generic/iomap.h>
+
+#include <linux/vmalloc.h>
+
+/*
+ * Convert a virtual cached pointer to an uncached pointer
+ */
+#define xlate_dev_kmem_ptr(p) p
+
+static inline void
+memset_io(volatile void __iomem *addr, unsigned char val, size_t count)
+{
+ memset((void __force *)addr, val, count);
+}
+
+static inline void
+memcpy_fromio(void *dst, const volatile void __iomem *src, size_t count)
+{
+ memcpy(dst, (const void __force *)src, count);
+}
+
+static inline void
+memcpy_toio(volatile void __iomem *dst, const void *src, size_t count)
+{
+ memcpy((void __force *)dst, src, count);
+}
+
+/*
+ * Cache management
+ *
+ * This is needed for two cases:
+ * 1. Out of order aware processors
+ * 2. Accidentally out of order processors (PPro errata #51)
+ */
+
+static inline void flush_write_buffers(void)
+{
+#if defined(CONFIG_X86_OOSTORE) || defined(CONFIG_X86_PPRO_FENCE)
+ asm volatile("lock; addl $0,0(%%esp)": : :"memory");
+#endif
+}
+
+#endif /* __KERNEL__ */
+
+extern void native_io_delay(void);
+
+extern int io_delay_type;
+extern void io_delay_init(void);
+
+static inline void slow_down_io(void)
+{
+ native_io_delay();
+#ifdef REALLY_SLOW_IO
+ native_io_delay();
+ native_io_delay();
+ native_io_delay();
+#endif
+}
+
+#define BUILDIO(bwl, bw, type) \
+static inline void out##bwl(unsigned type value, int port) \
+{ \
+ asm volatile("out" #bwl " %" #bw "0, %w1" \
+ : : "a"(value), "Nd"(port)); \
+} \
+ \
+static inline unsigned type in##bwl(int port) \
+{ \
+ unsigned type value; \
+ asm volatile("in" #bwl " %w1, %" #bw "0" \
+ : "=a"(value) : "Nd"(port)); \
+ return value; \
+} \
+ \
+static inline void out##bwl##_p(unsigned type value, int port) \
+{ \
+ out##bwl(value, port); \
+ slow_down_io(); \
+} \
+ \
+static inline unsigned type in##bwl##_p(int port) \
+{ \
+ unsigned type value = in##bwl(port); \
+ slow_down_io(); \
+ return value; \
+} \
+ \
+static inline void outs##bwl(int port, const void *addr, unsigned long count) \
+{ \
+ asm volatile("rep; outs" #bwl \
+ : "+S"(addr), "+c"(count) : "d"(port)); \
+} \
+ \
+static inline void ins##bwl(int port, void *addr, unsigned long count) \
+{ \
+ asm volatile("rep; ins" #bwl \
+ : "+D"(addr), "+c"(count) : "d"(port)); \
+}
+
+BUILDIO(b, b, char)
+BUILDIO(w, w, short)
+BUILDIO(l, , int)
+
+#if defined(__KERNEL__) && !defined(__ASSEMBLY__)
+
+/* We will be supplying our own /dev/mem implementation */
+#define ARCH_HAS_DEV_MEM
+
+#define bvec_to_pseudophys(bv) (page_to_pseudophys((bv)->bv_page) + \
+ (unsigned long)(bv)->bv_offset)
+
+#define BIOVEC_PHYS_MERGEABLE(vec1, vec2) \
+ (__BIOVEC_PHYS_MERGEABLE(vec1, vec2) \
+ && bvec_to_pseudophys(vec1) + (vec1)->bv_len \
+ == bvec_to_pseudophys(vec2))
+
+#endif
+
+extern void *xlate_dev_mem_ptr(unsigned long phys);
+extern void unxlate_dev_mem_ptr(unsigned long phys, void *addr);
+
+extern int ioremap_check_change_attr(unsigned long mfn, unsigned long size,
+ unsigned long prot_val);
+extern void __iomem *ioremap_wc(resource_size_t offset, unsigned long size);
+
+/*
+ * early_ioremap() and early_iounmap() are for temporary early boot-time
+ * mappings, before the real ioremap() is functional.
+ * A boot-time mapping is currently limited to at most 16 pages.
+ */
+extern void early_ioremap_init(void);
+extern void early_ioremap_reset(void);
+extern void __iomem *early_ioremap(resource_size_t phys_addr,
+ unsigned long size);
+extern void __iomem *early_memremap(resource_size_t phys_addr,
+ unsigned long size);
+extern void __iomem *early_memremap_ro(resource_size_t phys_addr,
+ unsigned long size);
+extern void early_iounmap(void __iomem *addr, unsigned long size);
+extern void fixup_early_ioremap(void);
+extern bool is_early_ioremap_ptep(pte_t *ptep);
+
+#define IO_SPACE_LIMIT 0xffff
+
+#endif /* _ASM_X86_IO_H */
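
Note how page_to_phys() and virt_to_bus() above route through
phys_to_machine(): a paravirtualized guest's pseudophysical frames are
scattered across machine memory, so only the machine address may be handed
to a device.  A minimal sketch (not part of the patch; the example_ name is
hypothetical, and real drivers should prefer the DMA API over virt_to_bus()):

#include <linux/slab.h>

static dma_addr_t example_bus_addr_of_buffer(void)
{
        void *buf = kmalloc(PAGE_SIZE, GFP_KERNEL);

        if (!buf)
                return 0;

        /*
         * __pa(buf) is the guest-physical (pseudophysical) address;
         * phys_to_machine() turns it into the machine address that
         * the hardware actually sees on the bus.
         */
        return virt_to_bus(buf);        /* == phys_to_machine(__pa(buf)) */
}
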
diff --git a/arch/x86/include/mach-xen/asm/ipi.h b/arch/x86/include/mach-xen/asm/ipi.h
new file mode 100644
index 000000000000..4bdda1d7870e
--- /dev/null
+++ b/arch/x86/include/mach-xen/asm/ipi.h
@@ -0,0 +1,13 @@
+#ifndef _ASM_X86_IPI_H
+#define _ASM_X86_IPI_H
+
+#include <asm/hw_irq.h>
+#include <asm/smp.h>
+
+void xen_send_IPI_mask(const struct cpumask *, int vector);
+void xen_send_IPI_mask_allbutself(const struct cpumask *, int vector);
+void xen_send_IPI_allbutself(int vector);
+void xen_send_IPI_all(int vector);
+void xen_send_IPI_self(int vector);
+
+#endif /* _ASM_X86_IPI_H */
diff --git a/arch/x86/include/mach-xen/asm/irq_vectors.h b/arch/x86/include/mach-xen/asm/irq_vectors.h
new file mode 100644
index 000000000000..779873184643
--- /dev/null
+++ b/arch/x86/include/mach-xen/asm/irq_vectors.h
@@ -0,0 +1,98 @@
+#ifndef _ASM_X86_IRQ_VECTORS_H
+#define _ASM_X86_IRQ_VECTORS_H
+
+#define MCE_VECTOR 0x12
+
+#define IA32_SYSCALL_VECTOR 0x80
+#ifdef CONFIG_X86_32
+# define SYSCALL_VECTOR 0x80
+#endif
+
+#define RESCHEDULE_VECTOR 0
+#define CALL_FUNCTION_VECTOR 1
+#define NMI_VECTOR 0x02
+#define CALL_FUNC_SINGLE_VECTOR 3
+#define REBOOT_VECTOR 4
+#ifdef CONFIG_IRQ_WORK
+#define IRQ_WORK_VECTOR 5
+#define NR_IPIS 6
+#else
+#define NR_IPIS 5
+#endif
+
+/*
+ * The maximum number of vectors supported by i386 processors
+ * is limited to 256. For processors other than i386, NR_VECTORS
+ * should be changed accordingly.
+ */
+#define NR_VECTORS 256
+
+#define FIRST_VM86_IRQ 3
+#define LAST_VM86_IRQ 15
+
+#ifndef __ASSEMBLY__
+static inline int invalid_vm86_irq(int irq)
+{
+ return irq < FIRST_VM86_IRQ || irq > LAST_VM86_IRQ;
+}
+#endif
+
+/*
+ * Size the maximum number of interrupts.
+ *
+ * If the irq_desc[] array has a sparse layout, we can size things
+ * generously - it scales up linearly with the maximum number of CPUs,
+ * and the maximum number of IO-APICs, whichever is higher.
+ *
+ * In other cases we size more conservatively, to not create too large
+ * static arrays.
+ */
+
+#define NR_IRQS_LEGACY 16
+
+/*
+ * The flat IRQ space is divided into two regions:
+ * 1. A one-to-one mapping of real physical IRQs. This space is only used
+ * if we have physical device-access privilege. This region is at the
+ * start of the IRQ space so that existing device drivers do not need
+ * to be modified to translate physical IRQ numbers into our IRQ space.
+ * 2. A dynamic mapping of inter-domain and Xen-sourced virtual IRQs. These
+ * are bound using the provided bind/unbind functions.
+ */
+#define PIRQ_BASE 0
+/* PHYSDEVOP_pirq_eoi_gmfn restriction: */
+#define PIRQ_MAX(n) ((n) < (1 << (PAGE_SHIFT + 3)) - NR_VECTORS \
+ ? (n) : (1 << (PAGE_SHIFT + 3)) - NR_VECTORS)
+
+#define IO_APIC_VECTOR_LIMIT PIRQ_MAX(32 * MAX_IO_APICS)
+#define CPU_VECTOR_LIMIT PIRQ_MAX(64 * NR_CPUS)
+
+#if defined(CONFIG_X86_IO_APIC)
+# define NR_PIRQS \
+ (CPU_VECTOR_LIMIT > IO_APIC_VECTOR_LIMIT ? \
+ (NR_VECTORS + CPU_VECTOR_LIMIT) : \
+ (NR_VECTORS + IO_APIC_VECTOR_LIMIT))
+#elif defined(CONFIG_XEN_PCIDEV_FRONTEND)
+# define NR_PIRQS (NR_VECTORS + CPU_VECTOR_LIMIT)
+#else /* !CONFIG_X86_IO_APIC: */
+# define NR_PIRQS NR_IRQS_LEGACY
+#endif
+
+#ifndef __ASSEMBLY__
+#ifdef CONFIG_SPARSE_IRQ
+extern int nr_pirqs;
+#else
+# define nr_pirqs NR_PIRQS
+#endif
+#endif
+
+#define DYNIRQ_BASE (PIRQ_BASE + nr_pirqs)
+#ifdef CONFIG_SPARSE_IRQ
+#define NR_DYNIRQS (CPU_VECTOR_LIMIT + CONFIG_XEN_NR_GUEST_DEVICES)
+#else
+#define NR_DYNIRQS (64 + CONFIG_XEN_NR_GUEST_DEVICES)
+#endif
+
+#define NR_IRQS (NR_PIRQS + NR_DYNIRQS)
+
+#endif /* _ASM_X86_IRQ_VECTORS_H */
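
The layout above yields two contiguous ranges: physical IRQs in
[PIRQ_BASE, PIRQ_BASE + nr_pirqs) followed by dynamic event-channel IRQs in
[DYNIRQ_BASE, DYNIRQ_BASE + NR_DYNIRQS).  A minimal sketch of classifying an
IRQ number under that layout (not part of the patch; the example_ helpers
are hypothetical):

static inline int example_irq_is_pirq(int irq)
{
        return irq >= PIRQ_BASE && irq < PIRQ_BASE + nr_pirqs;
}

static inline int example_irq_is_dynirq(int irq)
{
        return irq >= DYNIRQ_BASE && irq < DYNIRQ_BASE + NR_DYNIRQS;
}
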
diff --git a/arch/x86/include/mach-xen/asm/irqflags.h b/arch/x86/include/mach-xen/asm/irqflags.h
new file mode 100644
index 000000000000..5db69693e500
--- /dev/null
+++ b/arch/x86/include/mach-xen/asm/irqflags.h
@@ -0,0 +1,213 @@
+#ifndef _X86_IRQFLAGS_H_
+#define _X86_IRQFLAGS_H_
+
+#include <asm/smp-processor-id.h>
+
+#ifndef __ASSEMBLY__
+#include <linux/types.h>
+#include <xen/interface/vcpu.h>
+/*
+ * The use of 'barrier' in the following reflects their use as local-lock
+ * operations. Reentrancy must be prevented (e.g., __cli()) /before/ following
+ * critical operations are executed. All critical operations must complete
+ * /before/ reentrancy is permitted (e.g., __sti()). Alpha architecture also
+ * includes these barriers, for example.
+ */
+
+#define xen_save_fl(void) vcpu_info_read(evtchn_upcall_mask)
+
+#ifdef CONFIG_XEN_VCPU_INFO_PLACEMENT
+#define xen_restore_fl(f) ({ \
+ typeof(vcpu_info(0)->evtchn_upcall_mask) f__ = (f); \
+ barrier(); \
+ vcpu_info_write(evtchn_upcall_mask, f__); \
+ barrier(); /* unmask then check (avoid races) */ \
+ if (likely(!f__) \
+ && unlikely(vcpu_info_read(evtchn_upcall_pending))) \
+ force_evtchn_callback(); \
+})
+#else
+#define xen_restore_fl(f) ({ \
+ vcpu_info_t *_vcpu; \
+ barrier(); \
+ _vcpu = current_vcpu_info(); \
+ if ((_vcpu->evtchn_upcall_mask = (f)) == 0) { \
+ barrier(); /* unmask then check (avoid races) */\
+ if (unlikely(_vcpu->evtchn_upcall_pending)) \
+ force_evtchn_callback(); \
+ } \
+})
+#endif
+
+#define xen_irq_disable() ({ \
+ vcpu_info_write(evtchn_upcall_mask, 1); \
+ barrier(); \
+})
+
+#define xen_irq_enable() xen_restore_fl(0)
+
+#define arch_local_save_flags() xen_save_fl()
+
+#define arch_local_irq_restore(flags) xen_restore_fl(flags)
+
+#define arch_local_irq_disable() xen_irq_disable()
+
+#define arch_local_irq_enable() xen_irq_enable()
+
+/*
+ * Used in the idle loop; sti takes one instruction cycle
+ * to complete:
+ */
+#define arch_safe_halt HYPERVISOR_block
+
+/*
+ * Used when interrupts are already enabled or to
+ * shutdown the processor:
+ */
+#define halt() VOID(irqs_disabled() \
+ ? HYPERVISOR_vcpu_op(VCPUOP_down, \
+ smp_processor_id(), NULL) \
+ : 0)
+
+/*
+ * For spinlocks, etc:
+ */
+#define arch_local_irq_save() \
+({ \
+ unsigned long flags = arch_local_save_flags(); \
+ \
+ arch_local_irq_disable(); \
+ \
+ flags; \
+})
+#else
+
+/* Offsets into vcpu_info_t. */
+#define evtchn_upcall_pending /* 0 */
+#define evtchn_upcall_mask 1
+
+#ifdef CONFIG_X86_64
+# define __REG_si %rsi
+# define __CPU_num PER_CPU_VAR(cpu_number)
+#else
+# define __REG_si %esi
+# define __CPU_num TI_cpu(%ebp)
+#endif
+
+#ifdef CONFIG_XEN_VCPU_INFO_PLACEMENT
+
+#define GET_VCPU_INFO PER_CPU(vcpu_info, __REG_si)
+#define __DISABLE_INTERRUPTS movb $1,PER_CPU_VAR(vcpu_info+evtchn_upcall_mask)
+#define __ENABLE_INTERRUPTS movb $0,PER_CPU_VAR(vcpu_info+evtchn_upcall_mask)
+#define __TEST_PENDING cmpb $0,PER_CPU_VAR(vcpu_info+evtchn_upcall_pending+0)
+#define DISABLE_INTERRUPTS(clb) __DISABLE_INTERRUPTS
+#define ENABLE_INTERRUPTS(clb) __ENABLE_INTERRUPTS
+
+#define __SIZEOF_DISABLE_INTERRUPTS 8
+#define __SIZEOF_TEST_PENDING 8
+
+#else /* CONFIG_XEN_VCPU_INFO_PLACEMENT */
+
+#define sizeof_vcpu_shift 6
+
+#ifdef CONFIG_SMP
+#define GET_VCPU_INFO movl __CPU_num,%esi ; \
+ shl $sizeof_vcpu_shift,%esi ; \
+ add HYPERVISOR_shared_info,__REG_si
+#else
+#define GET_VCPU_INFO mov HYPERVISOR_shared_info,__REG_si
+#endif
+
+#define __DISABLE_INTERRUPTS movb $1,evtchn_upcall_mask(__REG_si)
+#define __ENABLE_INTERRUPTS movb $0,evtchn_upcall_mask(__REG_si)
+#define __TEST_PENDING testb $0xFF,evtchn_upcall_pending(__REG_si)
+#define DISABLE_INTERRUPTS(clb) GET_VCPU_INFO ; \
+ __DISABLE_INTERRUPTS
+#define ENABLE_INTERRUPTS(clb) GET_VCPU_INFO ; \
+ __ENABLE_INTERRUPTS
+
+#define __SIZEOF_DISABLE_INTERRUPTS 4
+#define __SIZEOF_TEST_PENDING 3
+
+#endif /* CONFIG_XEN_VCPU_INFO_PLACEMENT */
+
+#ifndef CONFIG_X86_64
+#define INTERRUPT_RETURN iret
+#define ENABLE_INTERRUPTS_SYSEXIT \
+ movb $0,evtchn_upcall_mask(%esi) /* __ENABLE_INTERRUPTS */ ; \
+sysexit_scrit: /**** START OF SYSEXIT CRITICAL REGION ****/ ; \
+ cmpb $0,evtchn_upcall_pending(%esi) /* __TEST_PENDING */ ; \
+ jnz 14f /* process more events if necessary... */ ; \
+ movl PT_ESI(%esp), %esi ; \
+ sysexit ; \
+14: movb $1,evtchn_upcall_mask(%esi) /* __DISABLE_INTERRUPTS */ ; \
+ TRACE_IRQS_OFF ; \
+sysexit_ecrit: /**** END OF SYSEXIT CRITICAL REGION ****/ ; \
+ mov $__KERNEL_PERCPU, %ecx ; \
+ push %esp ; \
+ mov %ecx, %fs ; \
+ SET_KERNEL_GS %ecx ; \
+ call evtchn_do_upcall ; \
+ add $4,%esp ; \
+ jmp ret_from_intr
+#endif
+
+
+#endif /* __ASSEMBLY__ */
+
+#ifndef __ASSEMBLY__
+static inline int arch_irqs_disabled_flags(unsigned long flags)
+{
+ return (flags != 0);
+}
+
+#define arch_irqs_disabled() \
+({ \
+ unsigned long flags = arch_local_save_flags(); \
+ \
+ arch_irqs_disabled_flags(flags); \
+})
+
+#else
+
+#ifdef CONFIG_X86_64
+#define ARCH_LOCKDEP_SYS_EXIT call lockdep_sys_exit_thunk
+#define ARCH_LOCKDEP_SYS_EXIT_IRQ \
+ TRACE_IRQS_ON; \
+ ENABLE_INTERRUPTS(CLBR_NONE); \
+ SAVE_REST; \
+ LOCKDEP_SYS_EXIT; \
+ RESTORE_REST; \
+ __DISABLE_INTERRUPTS; \
+ TRACE_IRQS_OFF;
+
+#else
+#define ARCH_LOCKDEP_SYS_EXIT \
+ pushl %eax; \
+ pushl %ecx; \
+ pushl %edx; \
+ call lockdep_sys_exit; \
+ popl %edx; \
+ popl %ecx; \
+ popl %eax;
+
+#define ARCH_LOCKDEP_SYS_EXIT_IRQ
+#endif
+
+#ifdef CONFIG_TRACE_IRQFLAGS
+# define TRACE_IRQS_ON call trace_hardirqs_on_thunk;
+# define TRACE_IRQS_OFF call trace_hardirqs_off_thunk;
+#else
+# define TRACE_IRQS_ON
+# define TRACE_IRQS_OFF
+#endif
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+# define LOCKDEP_SYS_EXIT ARCH_LOCKDEP_SYS_EXIT
+# define LOCKDEP_SYS_EXIT_IRQ ARCH_LOCKDEP_SYS_EXIT_IRQ
+# else
+# define LOCKDEP_SYS_EXIT
+# define LOCKDEP_SYS_EXIT_IRQ
+# endif
+
+#endif /* __ASSEMBLY__ */
+#endif
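
Under this scheme the saved "flags" value is simply the previous
evtchn_upcall_mask byte (non-zero means events were masked), and re-enabling
must deliver any upcall that became pending while masked, which
xen_restore_fl() takes care of.  A minimal critical-section sketch built on
the macros above (not part of the patch; the example_ name is hypothetical):

static void example_critical_section(void)
{
        unsigned long flags;

        flags = arch_local_irq_save();  /* sets evtchn_upcall_mask to 1 */

        /* ... work that event upcalls must not interrupt ... */

        /* May call force_evtchn_callback() if an event arrived meanwhile. */
        arch_local_irq_restore(flags);
}
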
diff --git a/arch/x86/include/mach-xen/asm/kbdleds.h b/arch/x86/include/mach-xen/asm/kbdleds.h
new file mode 100644
index 000000000000..2fc9a8086a70
--- /dev/null
+++ b/arch/x86/include/mach-xen/asm/kbdleds.h
@@ -0,0 +1,16 @@
+#ifndef _ASM_X86_KBDLEDS_H
+#define _ASM_X86_KBDLEDS_H
+
+/*
+ * Some laptops take the 789uiojklm,. keys as number pad when NumLock is on.
+ * This seems a good reason to start with NumLock off. That's why on X86 we
+ * ask the bios for the correct state.
+ */
+
+#ifdef CONFIG_XEN_PRIVILEGED_GUEST
+int kbd_defleds(void);
+#else
+static inline int kbd_defleds(void) { return 0; }
+#endif
+
+#endif /* _ASM_X86_KBDLEDS_H */
diff --git a/arch/x86/include/mach-xen/asm/mach_traps.h b/arch/x86/include/mach-xen/asm/mach_traps.h
new file mode 100644
index 000000000000..99314d328be3
--- /dev/null
+++ b/arch/x86/include/mach-xen/asm/mach_traps.h
@@ -0,0 +1,37 @@
+/*
+ * include/asm-xen/asm-i386/mach-xen/mach_traps.h
+ *
+ * Machine specific NMI handling for Xen
+ */
+#ifndef _MACH_TRAPS_H
+#define _MACH_TRAPS_H
+
+#include <linux/bitops.h>
+#include <xen/interface/nmi.h>
+
+#define NMI_REASON_SERR 0x80
+#define NMI_REASON_IOCHK 0x40
+#define NMI_REASON_MASK (NMI_REASON_SERR | NMI_REASON_IOCHK)
+
+static inline void clear_serr_error(unsigned char reason) {}
+static inline void clear_io_check_error(unsigned char reason) {}
+
+static inline unsigned char xen_get_nmi_reason(void)
+{
+ shared_info_t *s = HYPERVISOR_shared_info;
+ unsigned char reason = 0;
+
+ /* construct a value which looks like it came from
+ * port 0x61.
+ */
+ if (test_bit(_XEN_NMIREASON_io_error, &s->arch.nmi_reason))
+ reason |= NMI_REASON_IOCHK;
+ if (test_bit(_XEN_NMIREASON_parity_error, &s->arch.nmi_reason))
+ reason |= NMI_REASON_SERR;
+
+ return reason;
+}
+
+static inline void reassert_nmi(void) {}
+
+#endif /* !_MACH_TRAPS_H */
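
xen_get_nmi_reason() synthesizes the legacy port-0x61 layout from the
hypervisor's nmi_reason bits, so generic NMI code can keep testing
NMI_REASON_SERR and NMI_REASON_IOCHK.  A minimal consumer sketch (not part
of the patch; the example_ name is hypothetical):

#include <linux/printk.h>

static void example_report_nmi(void)
{
        unsigned char reason = xen_get_nmi_reason();

        if (reason & NMI_REASON_SERR)
                pr_emerg("NMI: PCI SERR (parity error)\n");
        if (reason & NMI_REASON_IOCHK)
                pr_emerg("NMI: ISA IOCHK (I/O check error)\n");
}
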
diff --git a/arch/x86/include/mach-xen/asm/maddr.h b/arch/x86/include/mach-xen/asm/maddr.h
new file mode 100644
index 000000000000..24088ab7c6c6
--- /dev/null
+++ b/arch/x86/include/mach-xen/asm/maddr.h
@@ -0,0 +1,155 @@
+#ifndef _X86_MADDR_H
+#define _X86_MADDR_H
+
+#include <asm/asm.h>
+#include <asm/bug.h>
+#include <xen/features.h>
+#include <xen/interface/xen.h>
+
+/**** MACHINE <-> PHYSICAL CONVERSION MACROS ****/
+#define INVALID_P2M_ENTRY (~0UL)
+#define FOREIGN_FRAME_BIT (1UL << (BITS_PER_LONG - 1))
+#define FOREIGN_FRAME(m) ((m) | FOREIGN_FRAME_BIT)
+
+/* Definitions for machine and pseudophysical addresses. */
+#ifdef CONFIG_X86_PAE
+typedef unsigned long long paddr_t;
+typedef unsigned long long maddr_t;
+#else
+typedef unsigned long paddr_t;
+typedef unsigned long maddr_t;
+#endif
+
+#ifdef CONFIG_XEN
+
+extern unsigned long *phys_to_machine_mapping;
+extern unsigned long max_mapnr;
+
+#undef machine_to_phys_mapping
+extern const unsigned long *machine_to_phys_mapping;
+extern unsigned long machine_to_phys_nr;
+
+static inline unsigned long pfn_to_mfn(unsigned long pfn)
+{
+ if (unlikely(xen_feature(XENFEAT_auto_translated_physmap)))
+ return pfn;
+ if (likely(max_mapnr))
+ BUG_ON(pfn >= max_mapnr);
+ return phys_to_machine_mapping[pfn] & ~FOREIGN_FRAME_BIT;
+}
+
+static inline int phys_to_machine_mapping_valid(unsigned long pfn)
+{
+ if (unlikely(xen_feature(XENFEAT_auto_translated_physmap)))
+ return 1;
+ if (likely(max_mapnr))
+ BUG_ON(pfn >= max_mapnr);
+ return (phys_to_machine_mapping[pfn] != INVALID_P2M_ENTRY);
+}
+
+static inline unsigned long mfn_to_pfn(unsigned long mfn)
+{
+ unsigned long pfn;
+
+ if (unlikely(xen_feature(XENFEAT_auto_translated_physmap)))
+ return mfn;
+
+ if (unlikely(mfn >= machine_to_phys_nr))
+ return max_mapnr;
+
+ /* The array access can fail (e.g., device space beyond end of RAM). */
+ asm (
+ "1: "_ASM_MOV" %1,%0\n"
+ "2:\n"
+ ".section .fixup,\"ax\"\n"
+ "3: "_ASM_MOV" %2,%0\n"
+ " jmp 2b\n"
+ ".previous\n"
+ _ASM_EXTABLE(1b,3b)
+ : "=r" (pfn)
+ : "m" (machine_to_phys_mapping[mfn]), "m" (max_mapnr) );
+
+ return pfn;
+}
+
+/*
+ * We detect special mappings in one of two ways:
+ * 1. If the MFN is an I/O page then Xen will set the m2p entry
+ * to be outside our maximum possible pseudophys range.
+ * 2. If the MFN belongs to a different domain then we will certainly
+ * not have MFN in our p2m table. Conversely, if the page is ours,
+ * then we'll have p2m(m2p(MFN))==MFN.
+ * If we detect a special mapping then it doesn't have a 'struct page'.
+ * We force !pfn_valid() by returning an out-of-range PFN.
+ *
+ * NB. These checks require that, for any MFN that is not in our reservation,
+ * there is no PFN such that p2m(PFN) == MFN. Otherwise we can get confused if
+ * we are foreign-mapping the MFN, and the other domain has m2p(MFN) == PFN.
+ * Yikes! Various places must poke in INVALID_P2M_ENTRY for safety.
+ *
+ * NB2. When deliberately mapping foreign pages into the p2m table, you *must*
+ * use FOREIGN_FRAME(). This will cause pte_pfn() to choke on it, as we
+ * require. In all the cases we care about, the FOREIGN_FRAME bit is
+ * masked (e.g., pfn_to_mfn()) so behaviour there is correct.
+ */
+static inline unsigned long mfn_to_local_pfn(phys_addr_t mfn)
+{
+ unsigned long pfn = mfn_to_pfn(mfn);
+ if (likely(pfn < max_mapnr)
+ && likely(!xen_feature(XENFEAT_auto_translated_physmap))
+ && unlikely(phys_to_machine_mapping[pfn] != mfn))
+ return max_mapnr; /* force !pfn_valid() */
+ return pfn;
+}
+
+static inline void set_phys_to_machine(unsigned long pfn, unsigned long mfn)
+{
+ if (likely(max_mapnr))
+ BUG_ON(pfn >= max_mapnr);
+ if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) {
+ BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY);
+ return;
+ }
+ phys_to_machine_mapping[pfn] = mfn;
+}
+
+static inline maddr_t phys_to_machine(paddr_t phys)
+{
+ maddr_t machine = pfn_to_mfn(phys >> PAGE_SHIFT);
+ machine = (machine << PAGE_SHIFT) | (phys & ~PAGE_MASK);
+ return machine;
+}
+
+static inline paddr_t machine_to_phys(maddr_t machine)
+{
+ paddr_t phys = mfn_to_pfn(machine >> PAGE_SHIFT);
+ phys = (phys << PAGE_SHIFT) | (machine & ~PAGE_MASK);
+ return phys;
+}
+
+#ifdef CONFIG_X86_32
+# include "maddr_32.h"
+#else
+# include "maddr_64.h"
+#endif
+
+#else /* !CONFIG_XEN */
+
+#define pfn_to_mfn(pfn) (pfn)
+#define mfn_to_pfn(mfn) (mfn)
+#define mfn_to_local_pfn(mfn) (mfn)
+#define set_phys_to_machine(pfn, mfn) ((void)0)
+#define phys_to_machine_mapping_valid(pfn) 1
+#define phys_to_machine(phys) ((maddr_t)(phys))
+#define machine_to_phys(mach) ((paddr_t)(mach))
+#define pfn_pte_ma(pfn, prot) pfn_pte(pfn, prot)
+#define __pte_ma(x) __pte(x)
+
+#endif /* !CONFIG_XEN */
+
+/* VIRT <-> MACHINE conversion */
+#define virt_to_machine(v) phys_to_machine(__pa(v))
+#define virt_to_mfn(v) pfn_to_mfn(__pa(v) >> PAGE_SHIFT)
+#define mfn_to_virt(m) __va(mfn_to_pfn(m) << PAGE_SHIFT)
+
+#endif /* _X86_MADDR_H */
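
mfn_to_local_pfn() implements the ownership test from the comment above: a
frame is ours only if p2m(m2p(MFN)) == MFN, and anything else yields an
out-of-range PFN that fails pfn_valid().  A minimal sketch (not part of the
patch; the example_ name is hypothetical):

static inline bool example_mfn_is_local(unsigned long mfn)
{
        /*
         * Foreign and I/O frames come back as max_mapnr, which is
         * deliberately out of range so that pfn_valid() fails.
         */
        return mfn_to_local_pfn(mfn) < max_mapnr;
}
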
diff --git a/arch/x86/include/mach-xen/asm/maddr_32.h b/arch/x86/include/mach-xen/asm/maddr_32.h
new file mode 100644
index 000000000000..de34d8727c17
--- /dev/null
+++ b/arch/x86/include/mach-xen/asm/maddr_32.h
@@ -0,0 +1,35 @@
+#ifndef _I386_MADDR_H
+#define _I386_MADDR_H
+
+#ifdef CONFIG_X86_PAE
+static inline paddr_t pte_phys_to_machine(paddr_t phys)
+{
+ /*
+ * In PAE mode, the NX bit needs to be dealt with in the value
+ * passed to pfn_to_mfn(). On x86_64, we need to mask it off,
+ * but for i386 the conversion to ulong for the argument will
+ * clip it off.
+ */
+ maddr_t machine = pfn_to_mfn(phys >> PAGE_SHIFT);
+ machine = (machine << PAGE_SHIFT) | (phys & ~PHYSICAL_PAGE_MASK);
+ return machine;
+}
+
+static inline paddr_t pte_machine_to_phys(maddr_t machine)
+{
+ /*
+ * In PAE mode, the NX bit needs to be dealt with in the value
+ * passed to mfn_to_pfn(). On x86_64, we need to mask it off,
+ * but for i386 the conversion to ulong for the argument will
+ * clip it off.
+ */
+ paddr_t phys = mfn_to_pfn(machine >> PAGE_SHIFT);
+ phys = (phys << PAGE_SHIFT) | (machine & ~PHYSICAL_PAGE_MASK);
+ return phys;
+}
+#else
+#define pte_phys_to_machine phys_to_machine
+#define pte_machine_to_phys machine_to_phys
+#endif
+
+#endif /* _I386_MADDR_H */
diff --git a/arch/x86/include/mach-xen/asm/maddr_64.h b/arch/x86/include/mach-xen/asm/maddr_64.h
new file mode 100644
index 000000000000..e2c271e81a6d
--- /dev/null
+++ b/arch/x86/include/mach-xen/asm/maddr_64.h
@@ -0,0 +1,21 @@
+#ifndef _X86_64_MADDR_H
+#define _X86_64_MADDR_H
+
+static inline paddr_t pte_phys_to_machine(paddr_t phys)
+{
+ maddr_t machine;
+ machine = pfn_to_mfn((phys & PHYSICAL_PAGE_MASK) >> PAGE_SHIFT);
+ machine = (machine << PAGE_SHIFT) | (phys & ~PHYSICAL_PAGE_MASK);
+ return machine;
+}
+
+static inline paddr_t pte_machine_to_phys(maddr_t machine)
+{
+ paddr_t phys;
+ phys = mfn_to_pfn((machine & PHYSICAL_PAGE_MASK) >> PAGE_SHIFT);
+ phys = (phys << PAGE_SHIFT) | (machine & ~PHYSICAL_PAGE_MASK);
+ return phys;
+}
+
+#endif /* _X86_64_MADDR_H */
+
diff --git a/arch/x86/include/mach-xen/asm/mmu_context.h b/arch/x86/include/mach-xen/asm/mmu_context.h
new file mode 100644
index 000000000000..549e75960927
--- /dev/null
+++ b/arch/x86/include/mach-xen/asm/mmu_context.h
@@ -0,0 +1,165 @@
+#ifndef _ASM_X86_MMU_CONTEXT_H
+#define _ASM_X86_MMU_CONTEXT_H
+
+#include <asm/desc.h>
+#include <linux/atomic.h>
+#include <asm/pgalloc.h>
+#include <asm/tlbflush.h>
+
+void arch_exit_mmap(struct mm_struct *mm);
+void arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm);
+
+void mm_pin(struct mm_struct *mm);
+void mm_unpin(struct mm_struct *mm);
+void mm_pin_all(void);
+
+static inline void xen_activate_mm(struct mm_struct *prev,
+ struct mm_struct *next)
+{
+ if (!PagePinned(virt_to_page(next->pgd)))
+ mm_pin(next);
+}
+
+/*
+ * Used for LDT copy/destruction.
+ */
+int init_new_context(struct task_struct *tsk, struct mm_struct *mm);
+void destroy_context(struct mm_struct *mm);
+
+
+static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
+{
+#if defined(CONFIG_SMP) && !defined(CONFIG_XEN) /* XEN: no lazy tlb */
+ if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK)
+ this_cpu_write(cpu_tlbstate.state, TLBSTATE_LAZY);
+#endif
+}
+
+#define prepare_arch_switch(next) __prepare_arch_switch()
+
+static inline void __prepare_arch_switch(void)
+{
+#ifdef CONFIG_X86_32
+ /*
+ * Save away %gs. No need to save %fs, as it was saved on the
+ * stack on entry. No need to save %es and %ds, as those are
+ * always kernel segments while inside the kernel.
+ */
+ lazy_save_gs(current->thread.gs);
+ lazy_load_gs(__KERNEL_STACK_CANARY);
+#else
+ /*
+ * Save away %es, %ds, %fs and %gs. Must happen before reload
+ * of cr3/ldt (i.e., not in __switch_to).
+ */
+ __asm__ __volatile__ (
+ "mov %%es,%0 ; mov %%ds,%1 ; mov %%fs,%2 ; mov %%gs,%3"
+ : "=m" (current->thread.es),
+ "=m" (current->thread.ds),
+ "=m" (current->thread.fsindex),
+ "=m" (current->thread.gsindex) );
+
+ if (current->thread.ds)
+ __asm__ __volatile__ ( "movl %0,%%ds" : : "r" (0) );
+
+ if (current->thread.es)
+ __asm__ __volatile__ ( "movl %0,%%es" : : "r" (0) );
+
+ if (current->thread.fsindex) {
+ __asm__ __volatile__ ( "movl %0,%%fs" : : "r" (0) );
+ current->thread.fs = 0;
+ }
+
+ if (current->thread.gsindex) {
+ load_gs_index(0);
+ current->thread.gs = 0;
+ }
+#endif
+}
+
+static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
+ struct task_struct *tsk)
+{
+ unsigned cpu = smp_processor_id();
+ struct mmuext_op _op[2 + (sizeof(long) > 4)], *op = _op;
+#ifdef CONFIG_X86_64
+ pgd_t *upgd;
+#endif
+
+ if (likely(prev != next)) {
+ BUG_ON(!xen_feature(XENFEAT_writable_page_tables) &&
+ !PagePinned(virt_to_page(next->pgd)));
+
+#if defined(CONFIG_SMP) && !defined(CONFIG_XEN) /* XEN: no lazy tlb */
+ this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK);
+ this_cpu_write(cpu_tlbstate.active_mm, next);
+#endif
+ cpumask_set_cpu(cpu, mm_cpumask(next));
+
+ /* Re-load page tables: load_cr3(next->pgd) */
+ op->cmd = MMUEXT_NEW_BASEPTR;
+ op->arg1.mfn = virt_to_mfn(next->pgd);
+ op++;
+
+ /* xen_new_user_pt(next->pgd) */
+#ifdef CONFIG_X86_64
+ op->cmd = MMUEXT_NEW_USER_BASEPTR;
+ upgd = __user_pgd(next->pgd);
+ op->arg1.mfn = likely(upgd) ? virt_to_mfn(upgd) : 0;
+ op++;
+#endif
+
+ /*
+ * load the LDT, if the LDT is different:
+ */
+ if (unlikely(prev->context.ldt != next->context.ldt)) {
+ /* load_LDT_nolock(&next->context) */
+ op->cmd = MMUEXT_SET_LDT;
+ op->arg1.linear_addr = (unsigned long)next->context.ldt;
+ op->arg2.nr_ents = next->context.size;
+ op++;
+ }
+
+ BUG_ON(HYPERVISOR_mmuext_op(_op, op-_op, NULL, DOMID_SELF));
+
+ /* stop TLB flushes for the previous mm */
+ cpumask_clear_cpu(cpu, mm_cpumask(prev));
+ }
+#if defined(CONFIG_SMP) && !defined(CONFIG_XEN) /* XEN: no lazy tlb */
+ else {
+ this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK);
+ BUG_ON(this_cpu_read(cpu_tlbstate.active_mm) != next);
+
+ if (!cpumask_test_and_set_cpu(cpu, mm_cpumask(next))) {
+ /* We were in lazy tlb mode and leave_mm disabled
+ * tlb flush IPI delivery. We must reload CR3
+ * to make sure to use no freed page tables.
+ */
+ load_cr3(next->pgd);
+ xen_new_user_pt(next->pgd);
+ load_LDT_nolock(&next->context);
+ }
+ }
+#endif
+}
+
+#define activate_mm(prev, next) \
+do { \
+ xen_activate_mm(prev, next); \
+ switch_mm((prev), (next), NULL); \
+} while (0)
+
+#ifdef CONFIG_X86_32
+#define deactivate_mm(tsk, mm) \
+do { \
+ lazy_load_gs(0); \
+} while (0)
+#else
+#define deactivate_mm(tsk, mm) \
+do { \
+ load_gs_index(0); \
+ loadsegment(fs, 0); \
+} while (0)
+#endif
+
+#endif /* _ASM_X86_MMU_CONTEXT_H */
diff --git a/arch/x86/include/mach-xen/asm/mutex.h b/arch/x86/include/mach-xen/asm/mutex.h
new file mode 100644
index 000000000000..ee9126e5d9a6
--- /dev/null
+++ b/arch/x86/include/mach-xen/asm/mutex.h
@@ -0,0 +1,3 @@
+#define arch_cpu_is_running(cpu) vcpu_running(cpu)
+
+#include_next <asm/mutex.h>
diff --git a/arch/x86/include/mach-xen/asm/pci.h b/arch/x86/include/mach-xen/asm/pci.h
new file mode 100644
index 000000000000..d26be5159e6b
--- /dev/null
+++ b/arch/x86/include/mach-xen/asm/pci.h
@@ -0,0 +1,198 @@
+#ifndef _ASM_X86_PCI_H
+#define _ASM_X86_PCI_H
+
+#include <linux/mm.h> /* for struct page */
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <asm/scatterlist.h>
+#include <asm/io.h>
+#include <asm/x86_init.h>
+
+#ifdef __KERNEL__
+
+struct pci_sysdata {
+ int domain; /* PCI domain */
+ int node; /* NUMA node */
+#ifdef CONFIG_ACPI
+ void *acpi; /* ACPI-specific data */
+#endif
+#ifdef CONFIG_X86_64
+ void *iommu; /* IOMMU private data */
+#endif
+#ifdef CONFIG_XEN_PCIDEV_FRONTEND
+ struct pcifront_device *pdev;
+#endif
+};
+
+extern int pci_routeirq;
+extern int noioapicquirk;
+extern int noioapicreroute;
+
+/* scan a bus after allocating a pci_sysdata for it */
+extern struct pci_bus *pci_scan_bus_on_node(int busno, struct pci_ops *ops,
+ int node);
+extern struct pci_bus *pci_scan_bus_with_sysdata(int busno);
+
+#ifdef CONFIG_PCI
+
+#ifdef CONFIG_PCI_DOMAINS
+static inline int pci_domain_nr(struct pci_bus *bus)
+{
+ struct pci_sysdata *sd = bus->sysdata;
+ return sd->domain;
+}
+
+static inline int pci_proc_domain(struct pci_bus *bus)
+{
+ return pci_domain_nr(bus);
+}
+#endif
+
+/* Can be used to override the logic in pci_scan_bus for skipping
+ already-configured bus numbers - to be used for buggy BIOSes
+ or architectures with incomplete PCI setup by the loader */
+
+extern unsigned int pcibios_assign_all_busses(void);
+extern int pci_legacy_init(void);
+# ifdef CONFIG_ACPI
+# define x86_default_pci_init pci_acpi_init
+# else
+# define x86_default_pci_init pci_legacy_init
+# endif
+#else
+# define pcibios_assign_all_busses() 0
+# define x86_default_pci_init NULL
+#endif
+
+#include <asm/hypervisor.h>
+#define pcibios_scan_all_fns(a, b) (!is_initial_xendomain())
+
+extern unsigned long pci_mem_start;
+#define PCIBIOS_MIN_IO 0x1000
+#define PCIBIOS_MIN_MEM (pci_mem_start)
+
+#define PCIBIOS_MIN_CARDBUS_IO 0x4000
+
+extern int pcibios_enabled;
+void pcibios_config_init(void);
+struct pci_bus *pcibios_scan_root(int bus);
+
+void pcibios_set_master(struct pci_dev *dev);
+void pcibios_penalize_isa_irq(int irq, int active);
+struct irq_routing_table *pcibios_get_irq_routing_table(void);
+int pcibios_set_irq_routing(struct pci_dev *dev, int pin, int irq);
+
+
+#define HAVE_PCI_MMAP
+extern int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma,
+ enum pci_mmap_state mmap_state,
+ int write_combine);
+
+
+#ifdef CONFIG_PCI
+extern void early_quirks(void);
+static inline void pci_dma_burst_advice(struct pci_dev *pdev,
+ enum pci_dma_burst_strategy *strat,
+ unsigned long *strategy_parameter)
+{
+ *strat = PCI_DMA_BURST_INFINITY;
+ *strategy_parameter = ~0UL;
+}
+#else
+static inline void early_quirks(void) { }
+#endif
+
+extern void pci_iommu_alloc(void);
+
+#if defined(CONFIG_PCI_MSI) && !defined(CONFIG_XEN)
+/* MSI arch specific hooks */
+static inline int x86_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
+{
+ return x86_msi.setup_msi_irqs(dev, nvec, type);
+}
+
+static inline void x86_teardown_msi_irqs(struct pci_dev *dev)
+{
+ x86_msi.teardown_msi_irqs(dev);
+}
+
+static inline void x86_teardown_msi_irq(unsigned int irq)
+{
+ x86_msi.teardown_msi_irq(irq);
+}
+static inline void x86_restore_msi_irqs(struct pci_dev *dev, int irq)
+{
+ x86_msi.restore_msi_irqs(dev, irq);
+}
+#define arch_setup_msi_irqs x86_setup_msi_irqs
+#define arch_teardown_msi_irqs x86_teardown_msi_irqs
+#define arch_teardown_msi_irq x86_teardown_msi_irq
+#define arch_restore_msi_irqs x86_restore_msi_irqs
+/* implemented in arch/x86/kernel/apic/io_apic.c */
+struct msi_desc;
+int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type);
+void native_teardown_msi_irq(unsigned int irq);
+void native_restore_msi_irqs(struct pci_dev *dev, int irq);
+int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc,
+ unsigned int irq_base, unsigned int irq_offset);
+/* default to the implementation in drivers/pci/msi.c */
+#define HAVE_DEFAULT_MSI_TEARDOWN_IRQS
+#define HAVE_DEFAULT_MSI_RESTORE_IRQS
+void default_teardown_msi_irqs(struct pci_dev *dev);
+void default_restore_msi_irqs(struct pci_dev *dev, int irq);
+#else
+#define native_setup_msi_irqs NULL
+#define native_teardown_msi_irq NULL
+#define default_teardown_msi_irqs NULL
+#define default_restore_msi_irqs NULL
+#endif
+
+#define PCI_DMA_BUS_IS_PHYS 0
+
+#endif /* __KERNEL__ */
+
+#ifdef CONFIG_X86_64
+#include <asm/pci_64.h>
+#endif
+
+/* implement the pci_ DMA API in terms of the generic device dma_ one */
+#include <asm-generic/pci-dma-compat.h>
+
+/* generic pci stuff */
+#include <asm-generic/pci.h>
+#define PCIBIOS_MAX_MEM_32 0xffffffff
+
+#ifdef CONFIG_NUMA
+/* Returns the node based on pci bus */
+static inline int __pcibus_to_node(const struct pci_bus *bus)
+{
+ const struct pci_sysdata *sd = bus->sysdata;
+
+ return sd->node;
+}
+
+static inline const struct cpumask *
+cpumask_of_pcibus(const struct pci_bus *bus)
+{
+ int node;
+
+ node = __pcibus_to_node(bus);
+ return (node == -1) ? cpu_online_mask :
+ cpumask_of_node(node);
+}
+#endif
+
+struct pci_setup_rom {
+ struct setup_data data;
+ uint16_t vendor;
+ uint16_t devid;
+ uint64_t pcilen;
+ unsigned long segment;
+ unsigned long bus;
+ unsigned long device;
+ unsigned long function;
+ uint8_t romdata[0];
+};
+
+#endif /* _ASM_X86_PCI_H */
diff --git a/arch/x86/include/mach-xen/asm/percpu.h b/arch/x86/include/mach-xen/asm/percpu.h
new file mode 100644
index 000000000000..72c587c2c72d
--- /dev/null
+++ b/arch/x86/include/mach-xen/asm/percpu.h
@@ -0,0 +1,75 @@
+#ifndef _ASM_X86_XEN_PERCPU_H
+#define _ASM_X86_XEN_PERCPU_H
+
+#include_next <asm/percpu.h>
+
+#ifdef CONFIG_64BIT
+# define __this_cpu_read_l __this_cpu_read_8
+# define __this_cpu_write_l __this_cpu_write_8
+#else
+# define __this_cpu_read_l __this_cpu_read_4
+# define __this_cpu_write_l __this_cpu_write_4
+#endif
+
+#define this_vcpu_read_1 this_cpu_read_1
+#define this_vcpu_read_2 this_cpu_read_2
+#define this_vcpu_read_4 this_cpu_read_4
+
+#ifdef CONFIG_64BIT
+# define this_vcpu_read_8 this_cpu_read_8
+#else
+# define this_vcpu_read_8(pcp) ({ \
+ typeof(pcp) res__; \
+ __asm__ ("movl %%ebx,%%eax\n" \
+ "movl %%ecx,%%edx\n" \
+ "cmpxchg8b " __percpu_arg(1) \
+ : "=&A" (res__) : "m" (pcp)); \
+ res__; })
+#endif
+
+#define this_vcpu_read(pcp) __pcpu_size_call_return(this_vcpu_read_, pcp)
+
+#define percpu_exchange_op(op, var, val) \
+({ \
+ typedef typeof(var) pxo_T__; \
+ pxo_T__ pxo_ret__; \
+ if (0) { \
+ pxo_ret__ = (val); \
+ (void)pxo_ret__; \
+ } \
+ switch (sizeof(var)) { \
+ case 1: \
+ asm(op "b %0,"__percpu_arg(1) \
+ : "=q" (pxo_ret__), "+m" (var) \
+ : "0" ((pxo_T__)(val))); \
+ break; \
+ case 2: \
+ asm(op "w %0,"__percpu_arg(1) \
+ : "=r" (pxo_ret__), "+m" (var) \
+ : "0" ((pxo_T__)(val))); \
+ break; \
+ case 4: \
+ asm(op "l %0,"__percpu_arg(1) \
+ : "=r" (pxo_ret__), "+m" (var) \
+ : "0" ((pxo_T__)(val))); \
+ break; \
+ case 8: \
+ asm(op "q %0,"__percpu_arg(1) \
+ : "=r" (pxo_ret__), "+m" (var) \
+ : "0" ((pxo_T__)(val))); \
+ break; \
+ default: __bad_percpu_size(); \
+ } \
+ pxo_ret__; \
+})
+
+#if defined(CONFIG_XEN_VCPU_INFO_PLACEMENT)
+# define vcpu_info_read(fld) percpu_from_op("mov", vcpu_info.fld, "m" (vcpu_info.fld))
+# define vcpu_info_write(fld, val) percpu_to_op("mov", vcpu_info.fld, val)
+# define vcpu_info_xchg(fld, val) percpu_exchange_op("xchg", vcpu_info.fld, val)
+#elif defined(CONFIG_XEN)
+# define vcpu_info_read(fld) (current_vcpu_info()->fld)
+# define vcpu_info_write(fld, val) (current_vcpu_info()->fld = (val))
+#endif
+
+#endif /* _ASM_X86_XEN_PERCPU_H */
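
These accessors are what allow the irqflags code earlier in this patch to
compile down to a single %gs-relative instruction when
CONFIG_XEN_VCPU_INFO_PLACEMENT puts vcpu_info into percpu space, while
transparently falling back to current_vcpu_info() otherwise.  A minimal
sketch (not part of the patch; the example_ names are hypothetical):

static inline unsigned int example_events_masked(void)
{
        /* A single mov from percpu space when placement is enabled. */
        return vcpu_info_read(evtchn_upcall_mask);
}

static inline void example_mask_events(void)
{
        vcpu_info_write(evtchn_upcall_mask, 1);
        barrier();      /* mask before entering the critical region */
}
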
diff --git a/arch/x86/include/mach-xen/asm/perf_event.h b/arch/x86/include/mach-xen/asm/perf_event.h
new file mode 100644
index 000000000000..0987d1b90db2
--- /dev/null
+++ b/arch/x86/include/mach-xen/asm/perf_event.h
@@ -0,0 +1,44 @@
+#ifndef _ASM_X86_PERF_EVENT_H
+#define _ASM_X86_PERF_EVENT_H
+
+#ifdef CONFIG_PERF_EVENTS
+
+/*
+ * Abuse bit 3 of the cpu eflags register to indicate proper PEBS IP fixups.
+ * This flag is otherwise unused and ABI specified to be 0, so nobody should
+ * care what we do with it.
+ */
+#define PERF_EFLAGS_EXACT (1UL << 3)
+
+#define perf_instruction_pointer(regs) instruction_pointer(regs)
+
+#define perf_misc_flags(regs) ({ \
+ struct pt_regs *_r_ = (regs); \
+ unsigned long _f_ = user_mode(_r_) ? PERF_RECORD_MISC_USER \
+ : PERF_RECORD_MISC_KERNEL; \
+ _r_->flags & PERF_EFLAGS_EXACT ? _f_ | PERF_RECORD_MISC_EXACT_IP : _f_; \
+})
+
+#include <asm/stacktrace.h>
+
+/*
+ * We abuse bit 3 from flags to pass exact information, see perf_misc_flags
+ * and the comment with PERF_EFLAGS_EXACT.
+ */
+#define perf_arch_fetch_caller_regs(regs, __ip) { \
+ (regs)->ip = (__ip); \
+ (regs)->bp = caller_frame_pointer(); \
+ (regs)->cs = __KERNEL_CS; \
+	(regs)->flags = 0;					\
+ asm volatile( \
+ _ASM_MOV "%%"_ASM_SP ", %0\n" \
+ : "=m" ((regs)->sp) \
+ :: "memory" \
+ ); \
+}
+
+#endif
+
+#define arch_perf_out_copy_user copy_from_user_nmi
+
+#endif /* _ASM_X86_PERF_EVENT_H */
diff --git a/arch/x86/include/mach-xen/asm/pgalloc.h b/arch/x86/include/mach-xen/asm/pgalloc.h
new file mode 100644
index 000000000000..3879075ab233
--- /dev/null
+++ b/arch/x86/include/mach-xen/asm/pgalloc.h
@@ -0,0 +1,159 @@
+#ifndef _ASM_X86_PGALLOC_H
+#define _ASM_X86_PGALLOC_H
+
+#include <linux/threads.h>
+#include <linux/mm.h> /* for struct page */
+#include <linux/pagemap.h>
+
+#include <asm/io.h> /* for phys_to_virt and page_to_pseudophys */
+
+static inline int paravirt_pgd_alloc(struct mm_struct *mm) { return 0; }
+static inline void paravirt_pgd_free(struct mm_struct *mm, pgd_t *pgd) {}
+
+static inline void paravirt_alloc_pte(struct mm_struct *mm, unsigned long pfn) {}
+static inline void paravirt_alloc_pmd(struct mm_struct *mm, unsigned long pfn) {}
+static inline void paravirt_alloc_pmd_clone(unsigned long pfn, unsigned long clonepfn,
+ unsigned long start, unsigned long count) {}
+static inline void paravirt_alloc_pud(struct mm_struct *mm, unsigned long pfn) {}
+static inline void paravirt_release_pte(unsigned long pfn) {}
+static inline void paravirt_release_pmd(unsigned long pfn) {}
+static inline void paravirt_release_pud(unsigned long pfn) {}
+
+#ifdef CONFIG_X86_64
+void early_make_page_readonly(void *va, unsigned int feature);
+pmd_t *early_get_pmd(unsigned long va);
+#define make_lowmem_page_readonly make_page_readonly
+#define make_lowmem_page_writable make_page_writable
+#endif
+
+/*
+ * Flags to use when allocating a user page table page.
+ */
+extern gfp_t __userpte_alloc_gfp;
+
+/*
+ * Allocate and free page tables.
+ */
+extern pgd_t *pgd_alloc(struct mm_struct *);
+extern void pgd_free(struct mm_struct *mm, pgd_t *pgd);
+
+extern pte_t *pte_alloc_one_kernel(struct mm_struct *, unsigned long);
+extern pgtable_t pte_alloc_one(struct mm_struct *, unsigned long);
+
+/* Should really implement gc for free page table pages. This could be
+ done with a reference count in struct page. */
+
+static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
+{
+ BUG_ON((unsigned long)pte & (PAGE_SIZE-1));
+ make_lowmem_page_writable(pte, XENFEAT_writable_page_tables);
+ free_page((unsigned long)pte);
+}
+
+extern void __pte_free(pgtable_t);
+static inline void pte_free(struct mm_struct *mm, struct page *pte)
+{
+ __pte_free(pte);
+}
+
+extern void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte);
+
+static inline void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte,
+ unsigned long address)
+{
+ ___pte_free_tlb(tlb, pte);
+}
+
+static inline void pmd_populate_kernel(struct mm_struct *mm,
+ pmd_t *pmd, pte_t *pte)
+{
+ paravirt_alloc_pte(mm, __pa(pte) >> PAGE_SHIFT);
+ set_pmd(pmd, __pmd(__pa(pte) | _PAGE_TABLE));
+}
+
+static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd,
+ struct page *pte)
+{
+ unsigned long pfn = page_to_pfn(pte);
+ pmd_t ent = __pmd(((pmdval_t)pfn << PAGE_SHIFT) | _PAGE_TABLE);
+
+ paravirt_alloc_pte(mm, pfn);
+ if (PagePinned(virt_to_page(pmd))) {
+#ifndef CONFIG_HIGHPTE
+ BUG_ON(PageHighMem(pte));
+#endif
+ set_pmd(pmd, ent);
+ } else
+ *pmd = ent;
+}
+
+#define pmd_pgtable(pmd) pmd_page(pmd)
+
+#if PAGETABLE_LEVELS > 2
+extern pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr);
+extern void __pmd_free(pgtable_t);
+
+static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd)
+{
+ BUG_ON((unsigned long)pmd & (PAGE_SIZE-1));
+ __pmd_free(virt_to_page(pmd));
+}
+
+extern void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd);
+
+static inline void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd,
+ unsigned long address)
+{
+ ___pmd_free_tlb(tlb, pmd);
+}
+
+#ifdef CONFIG_X86_PAE
+extern void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd);
+#else /* !CONFIG_X86_PAE */
+static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
+{
+ pud_t ent = __pud(_PAGE_TABLE | __pa(pmd));
+
+ paravirt_alloc_pmd(mm, __pa(pmd) >> PAGE_SHIFT);
+ if (PagePinned(virt_to_page(pud)))
+ set_pud(pud, ent);
+ else
+ *pud = ent;
+}
+#endif /* CONFIG_X86_PAE */
+
+#if PAGETABLE_LEVELS > 3
+static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud)
+{
+ pgd_t ent = __pgd(_PAGE_TABLE | __pa(pud));
+
+ paravirt_alloc_pud(mm, __pa(pud) >> PAGE_SHIFT);
+ if (unlikely(PagePinned(virt_to_page(pgd))))
+ xen_l4_entry_update(pgd, ent);
+ else
+ *__user_pgd(pgd) = *pgd = ent;
+}
+
+static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
+{
+ return (pud_t *)pmd_alloc_one(mm, addr);
+}
+
+static inline void pud_free(struct mm_struct *mm, pud_t *pud)
+{
+ BUG_ON((unsigned long)pud & (PAGE_SIZE-1));
+ __pmd_free(virt_to_page(pud));
+}
+
+extern void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud);
+
+static inline void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud,
+ unsigned long address)
+{
+ ___pud_free_tlb(tlb, pud);
+}
+
+#endif /* PAGETABLE_LEVELS > 3 */
+#endif /* PAGETABLE_LEVELS > 2 */
+
+#endif /* _ASM_X86_PGALLOC_H */
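
The PagePinned() tests above encode the core paravirtual page-table rule:
once a page-table page has been pinned (registered read-only with the
hypervisor), it may only be changed through validated setters such as
set_pmd(), while an unpinned page can still be written directly.  A minimal
sketch of that pattern (not part of the patch; the example_ name is
hypothetical):

static void example_install_pmd(pmd_t *pmd, pmd_t ent)
{
        if (PagePinned(virt_to_page(pmd)))
                set_pmd(pmd, ent);      /* validated update via hypercall */
        else
                *pmd = ent;             /* not pinned yet: plain store */
}
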
diff --git a/arch/x86/include/mach-xen/asm/pgtable-3level.h b/arch/x86/include/mach-xen/asm/pgtable-3level.h
new file mode 100644
index 000000000000..56d22b6f9a95
--- /dev/null
+++ b/arch/x86/include/mach-xen/asm/pgtable-3level.h
@@ -0,0 +1,206 @@
+#ifndef _ASM_X86_PGTABLE_3LEVEL_H
+#define _ASM_X86_PGTABLE_3LEVEL_H
+
+/*
+ * Intel Physical Address Extension (PAE) Mode - three-level page
+ * tables on PPro+ CPUs.
+ *
+ * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
+ */
+
+#define pte_ERROR(e) \
+ pr_err("%s:%d: bad pte %p(%016Lx pfn %08lx).\n", \
+ __FILE__, __LINE__, &(e), __pte_val(e), pte_pfn(e))
+#define pmd_ERROR(e) \
+ pr_err("%s:%d: bad pmd %p(%016Lx pfn %08Lx).\n", \
+ __FILE__, __LINE__, &(e), __pmd_val(e), \
+ (pmd_val(e) & PTE_PFN_MASK) >> PAGE_SHIFT)
+#define pgd_ERROR(e) \
+ pr_err("%s:%d: bad pgd %p(%016Lx pfn %08Lx).\n", \
+ __FILE__, __LINE__, &(e), __pgd_val(e), \
+ (pgd_val(e) & PTE_PFN_MASK) >> PAGE_SHIFT)
+
+/* Rules for using set_pte: the pte being assigned *must* be
+ * either not present or in a state where the hardware will
+ * not attempt to update the pte. In places where this is
+ * not possible, use pte_get_and_clear to obtain the old pte
+ * value and then use set_pte to update it. -ben
+ */
+
+static inline void xen_set_pte(pte_t *ptep, pte_t pte)
+{
+ ptep->pte_high = pte.pte_high;
+ smp_wmb();
+ ptep->pte_low = pte.pte_low;
+}
+
+#define pmd_read_atomic pmd_read_atomic
+/*
+ * pte_offset_map_lock() on 32-bit PAE kernels was reading the pmd_t with
+ * a plain "*pmdp" dereference generated by gcc. The problem is that in
+ * certain places where pte_offset_map_lock() is called, concurrent page
+ * faults are allowed if the mmap_sem is held for reading. An example is
+ * mincore vs page faults vs MADV_DONTNEED. On the page fault side
+ * pmd_populate() rightfully does a set_64bit(), but if we're reading the
+ * pmd_t with a plain "*pmdp" on the mincore side, an SMP race can happen
+ * because gcc will not read the 64 bits of the pmd atomically. To fix
+ * this, all places running pte_offset_map_lock() while holding the
+ * mmap_sem in read mode shall read the pmdp pointer using this
+ * function, to know whether the pmd is null or not, and in turn whether
+ * they can run pte_offset_map_lock(), pmd_trans_huge() or other pmd
+ * operations.
+ *
+ * Without THP, if the mmap_sem is held for reading, the pmd can only
+ * transition from null to not null while pmd_read_atomic() runs, so
+ * we can always return atomic pmd values with this function.
+ *
+ * With THP, if the mmap_sem is held for reading, the pmd can become
+ * trans_huge or none or point to a pte (and in turn become "stable")
+ * at any time under pmd_read_atomic(). We could read it truly
+ * atomically here with an atomic64_read() for the THP-enabled case (and
+ * it would be a whole lot simpler), but to avoid using cmpxchg8b we
+ * only return an atomic pmdval if the low part of the pmdval is later
+ * found stable (i.e. pointing to a pte), and we return a none
+ * pmdval if the low part of the pmd is none. In some cases the high
+ * and low parts of the pmdval returned may not be consistent if THP is
+ * enabled (the low part may point to a previously mapped hugepage,
+ * while the high part may point to a more recently mapped hugepage),
+ * but pmd_none_or_trans_huge_or_clear_bad() only needs the low part
+ * of the pmd to be read atomically to decide whether the pmd is
+ * unstable or not, with the only exception of when the low part of
+ * the pmd is zero, in which case we return a none pmd.
+ */
+static inline pmd_t pmd_read_atomic(pmd_t *pmdp)
+{
+ pmdval_t ret;
+ u32 *tmp = (u32 *)pmdp;
+
+ ret = (pmdval_t) (*tmp);
+ if (ret) {
+ /*
+ * If the low part is null, we must not read the high part
+ * or we can end up with a partial pmd.
+ */
+ smp_rmb();
+ ret |= ((pmdval_t)*(tmp + 1)) << 32;
+ }
+
+ return (pmd_t) { ret };
+}
+
+static inline void xen_set_pmd(pmd_t *pmdp, pmd_t pmd)
+{
+ xen_l2_entry_update(pmdp, pmd);
+}
+
+static inline void xen_set_pud(pud_t *pudp, pud_t pud)
+{
+ xen_l3_entry_update(pudp, pud);
+}
+
+/*
+ * For PTEs and PDEs, we must clear the P-bit first when clearing a page table
+ * entry, so clear the bottom half first and enforce ordering with a compiler
+ * barrier.
+ */
+static inline void __xen_pte_clear(pte_t *ptep)
+{
+ ptep->pte_low = 0;
+ smp_wmb();
+ ptep->pte_high = 0;
+}
+
+#define xen_pmd_clear(pmd) \
+({ \
+ pmd_t *__pmdp = (pmd); \
+ PagePinned(virt_to_page(__pmdp)) \
+ ? set_pmd(__pmdp, __pmd(0)) \
+ : (void)(*__pmdp = __pmd(0)); \
+})
+
+static inline void __xen_pud_clear(pud_t *pudp)
+{
+ set_pud(pudp, __pud(0));
+
+ /*
+ * According to Intel App note "TLBs, Paging-Structure Caches,
+ * and Their Invalidation", April 2007, document 317080-001,
+ * section 8.1: in PAE mode we explicitly have to flush the
+ * TLB via cr3 if the top-level pgd is changed...
+ *
+ * Currently all places where pud_clear() is called either have
+ * flush_tlb_mm() followed or don't need TLB flush (x86_64 code or
+ * pud_clear_bad()), so we don't need TLB flush here.
+ */
+}
+
+#define xen_pud_clear(pudp) \
+({ \
+ pud_t *__pudp = (pudp); \
+ PagePinned(virt_to_page(__pudp)) \
+ ? __xen_pud_clear(__pudp) \
+ : (void)(*__pudp = __pud(0)); \
+})
+
+#ifdef CONFIG_SMP
+static inline pte_t xen_ptep_get_and_clear(pte_t *ptep, pte_t res)
+{
+ uint64_t val = __pte_val(res);
+ if (__cmpxchg64(&ptep->pte, val, 0) != val) {
+ /* xchg acts as a barrier before the setting of the high bits */
+ res.pte_low = xchg(&ptep->pte_low, 0);
+ res.pte_high = ptep->pte_high;
+ ptep->pte_high = 0;
+ }
+ return res;
+}
+#else
+#define xen_ptep_get_and_clear(xp, pte) xen_local_ptep_get_and_clear(xp, pte)
+#endif
+
+#define __pte_mfn(_pte) (((_pte).pte_low >> PAGE_SHIFT) | \
+ ((_pte).pte_high << (32-PAGE_SHIFT)))
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+#ifdef CONFIG_SMP
+union split_pmd {
+ struct {
+ u32 pmd_low;
+ u32 pmd_high;
+ };
+ pmd_t pmd;
+};
+static inline pmd_t xen_pmdp_get_and_clear(pmd_t *pmdp)
+{
+ union split_pmd res, *orig = (union split_pmd *)pmdp;
+
+ /* xchg acts as a barrier before setting of the high bits */
+ res.pmd_low = xchg(&orig->pmd_low, 0);
+ res.pmd_high = orig->pmd_high;
+ orig->pmd_high = 0;
+
+ return res.pmd;
+}
+#else
+#define xen_pmdp_get_and_clear(xp) xen_local_pmdp_get_and_clear(xp)
+#endif
+#endif
+
+/*
+ * Bits 0, 6 and 7 are taken in the low part of the pte,
+ * put the 32 bits of offset into the high part.
+ */
+#define pte_to_pgoff(pte) ((pte).pte_high)
+#define pgoff_to_pte(off) \
+ ((pte_t) { { .pte_low = _PAGE_FILE, .pte_high = (off) } })
+#define PTE_FILE_MAX_BITS 32
+
+/* Encode and de-code a swap entry */
+#define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > 5)
+#define __swp_type(x) (((x).val) & 0x1f)
+#define __swp_offset(x) ((x).val >> 5)
+#define __swp_entry(type, offset) ((swp_entry_t){(type) | (offset) << 5})
+#define __pte_to_swp_entry(pte) ((swp_entry_t){ (pte).pte_high })
+#define __swp_entry_to_pte(x) ((pte_t){ { .pte_high = (x).val } })
+
+#endif /* _ASM_X86_PGTABLE_3LEVEL_H */
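
With PAE the whole swap entry lives in pte_high while pte_low stays zero, so
a swap PTE can never be mistaken for a present mapping.  A minimal
round-trip sketch of the encoding above (not part of the patch; the
example_ name is hypothetical, and type must fit in 5 bits per
MAX_SWAPFILES_CHECK()):

static pte_t example_encode_swap(unsigned int type, unsigned long offset)
{
        swp_entry_t ent = __swp_entry(type, offset);    /* type in bits 0-4 */
        pte_t pte = __swp_entry_to_pte(ent);            /* stored in pte_high */

        BUG_ON(__swp_type(__pte_to_swp_entry(pte)) != type);
        return pte;
}
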
diff --git a/arch/x86/include/mach-xen/asm/pgtable-3level_types.h b/arch/x86/include/mach-xen/asm/pgtable-3level_types.h
new file mode 100644
index 000000000000..36d6f2b9594f
--- /dev/null
+++ b/arch/x86/include/mach-xen/asm/pgtable-3level_types.h
@@ -0,0 +1,44 @@
+#ifndef _ASM_X86_PGTABLE_3LEVEL_DEFS_H
+#define _ASM_X86_PGTABLE_3LEVEL_DEFS_H
+
+#ifndef __ASSEMBLY__
+#include <linux/types.h>
+
+typedef u64 pteval_t;
+typedef u64 pmdval_t;
+typedef u64 pudval_t;
+typedef u64 pgdval_t;
+typedef u64 pgprotval_t;
+
+typedef union {
+ struct {
+ unsigned long pte_low, pte_high;
+ };
+ pteval_t pte;
+} pte_t;
+#endif /* !__ASSEMBLY__ */
+
+#define SHARED_KERNEL_PMD 0
+
+#define PAGETABLE_LEVELS 3
+
+/*
+ * PGDIR_SHIFT determines what a top-level page table entry can map
+ */
+#define PGDIR_SHIFT 30
+#define PTRS_PER_PGD 4
+
+/*
+ * PMD_SHIFT determines the size of the area a middle-level
+ * page table can map
+ */
+#define PMD_SHIFT 21
+#define PTRS_PER_PMD 512
+
+/*
+ * entries per page directory level
+ */
+#define PTRS_PER_PTE 512
+
+
+#endif /* _ASM_X86_PGTABLE_3LEVEL_DEFS_H */
diff --git a/arch/x86/include/mach-xen/asm/pgtable.h b/arch/x86/include/mach-xen/asm/pgtable.h
new file mode 100644
index 000000000000..865dc84437bb
--- /dev/null
+++ b/arch/x86/include/mach-xen/asm/pgtable.h
@@ -0,0 +1,939 @@
+#ifndef _ASM_X86_PGTABLE_H
+#define _ASM_X86_PGTABLE_H
+
+#include <asm/page.h>
+#include <asm/e820.h>
+
+#include <asm/pgtable_types.h>
+
+/*
+ * Macro to mark a page protection value as UC-
+ */
+#define pgprot_noncached(prot) \
+ ((boot_cpu_data.x86 > 3) \
+ ? (__pgprot(pgprot_val(prot) | _PAGE_CACHE_UC_MINUS)) \
+ : (prot))
+
+#ifndef __ASSEMBLY__
+
+#include <asm/x86_init.h>
+
+/*
+ * ZERO_PAGE is a global shared page that is always zero: used
+ * for zero-mapped memory areas etc..
+ */
+extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)];
+#define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page))
+
+extern spinlock_t pgd_lock;
+extern struct list_head pgd_list;
+
+extern struct mm_struct *pgd_page_get_mm(struct page *page);
+
+#define set_pte(ptep, pte) xen_set_pte(ptep, pte)
+#define set_pte_at(mm, addr, ptep, pte) xen_set_pte_at(mm, addr, ptep, pte)
+#define set_pmd_at(mm, addr, pmdp, pmd) xen_set_pmd_at(mm, addr, pmdp, pmd)
+
+#define set_pmd(pmdp, pmd) xen_set_pmd(pmdp, pmd)
+
+#ifndef __PAGETABLE_PUD_FOLDED
+#define set_pgd(pgdp, pgd) xen_set_pgd(pgdp, pgd)
+#define pgd_clear(pgd) xen_pgd_clear(pgd)
+#endif
+
+#ifndef set_pud
+# define set_pud(pudp, pud) xen_set_pud(pudp, pud)
+#endif
+
+#ifndef __PAGETABLE_PMD_FOLDED
+#define pud_clear(pud) xen_pud_clear(pud)
+#endif
+
+#define pte_clear(mm, addr, ptep) xen_pte_clear(mm, addr, ptep)
+#define pmd_clear(pmd) xen_pmd_clear(pmd)
+
+#define pte_update(mm, addr, ptep) do { } while (0)
+#define pte_update_defer(mm, addr, ptep) do { } while (0)
+#define pmd_update(mm, addr, ptep) do { } while (0)
+#define pmd_update_defer(mm, addr, ptep) do { } while (0)
+
+#define pgd_val(x) xen_pgd_val(x)
+#define __pgd(x) xen_make_pgd(x)
+
+#ifndef __PAGETABLE_PUD_FOLDED
+#define pud_val(x) xen_pud_val(x)
+#define __pud(x) xen_make_pud(x)
+#endif
+
+#ifndef __PAGETABLE_PMD_FOLDED
+#define pmd_val(x) xen_pmd_val(x)
+#define __pmd(x) xen_make_pmd(x)
+#endif
+
+#define pte_val(x) xen_pte_val(x)
+#define __pte(x) xen_make_pte(x)
+
+#define arch_end_context_switch(prev) do {} while(0)
+
+/*
+ * The following only work if pte_present() is true.
+ * Undefined behaviour if not..
+ */
+static inline int pte_dirty(pte_t pte)
+{
+ return pte_flags(pte) & _PAGE_DIRTY;
+}
+
+static inline int pte_young(pte_t pte)
+{
+ return pte_flags(pte) & _PAGE_ACCESSED;
+}
+
+static inline int pmd_young(pmd_t pmd)
+{
+ return pmd_flags(pmd) & _PAGE_ACCESSED;
+}
+
+static inline int pte_write(pte_t pte)
+{
+ return pte_flags(pte) & _PAGE_RW;
+}
+
+static inline int pte_file(pte_t pte)
+{
+ return pte_flags(pte) & _PAGE_FILE;
+}
+
+static inline int pte_huge(pte_t pte)
+{
+ return pte_flags(pte) & _PAGE_PSE;
+}
+
+static inline int pte_global(pte_t pte)
+{
+ return 0;
+}
+
+static inline int pte_exec(pte_t pte)
+{
+ return !(pte_flags(pte) & _PAGE_NX);
+}
+
+static inline int pte_special(pte_t pte)
+{
+ return pte_flags(pte) & _PAGE_SPECIAL;
+}
+
+#define pte_mfn(_pte) ((_pte).pte_low & _PAGE_PRESENT ? \
+ __pte_mfn(_pte) : pfn_to_mfn(__pte_mfn(_pte)))
+#define pte_pfn(_pte) ((_pte).pte_low & _PAGE_IOMAP ? max_mapnr : \
+ (_pte).pte_low & _PAGE_PRESENT ? \
+ mfn_to_local_pfn(__pte_mfn(_pte)) : \
+ __pte_mfn(_pte))
+
+#define pte_page(pte) pfn_to_page(pte_pfn(pte))
+
+static inline unsigned long pmd_pfn(pmd_t pmd)
+{
+ return (pmd_val(pmd) & PTE_PFN_MASK) >> PAGE_SHIFT;
+}
+
+static inline unsigned long pud_pfn(pud_t pud)
+{
+ return (pud_val(pud) & PTE_PFN_MASK) >> PAGE_SHIFT;
+}
+
+static inline int pmd_large(pmd_t pte)
+{
+ return pmd_flags(pte) & _PAGE_PSE;
+}
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static inline int pmd_trans_splitting(pmd_t pmd)
+{
+ return pmd_val(pmd) & _PAGE_SPLITTING;
+}
+
+static inline int pmd_trans_huge(pmd_t pmd)
+{
+ return pmd_val(pmd) & _PAGE_PSE;
+}
+
+static inline int has_transparent_hugepage(void)
+{
+ return cpu_has_pse;
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
+static inline pte_t pte_set_flags(pte_t pte, pteval_t set)
+{
+ pteval_t v = __pte_val(pte);
+
+ return __pte_ma(v | set);
+}
+
+static inline pte_t pte_clear_flags(pte_t pte, pteval_t clear)
+{
+ pteval_t v = __pte_val(pte);
+
+ return __pte_ma(v & ~clear);
+}
+
+static inline pte_t pte_mkclean(pte_t pte)
+{
+ return pte_clear_flags(pte, _PAGE_DIRTY);
+}
+
+static inline pte_t pte_mkold(pte_t pte)
+{
+ return pte_clear_flags(pte, _PAGE_ACCESSED);
+}
+
+static inline pte_t pte_wrprotect(pte_t pte)
+{
+ return pte_clear_flags(pte, _PAGE_RW);
+}
+
+static inline pte_t pte_mkexec(pte_t pte)
+{
+ return pte_clear_flags(pte, _PAGE_NX);
+}
+
+static inline pte_t pte_mkdirty(pte_t pte)
+{
+ return pte_set_flags(pte, _PAGE_DIRTY);
+}
+
+static inline pte_t pte_mkyoung(pte_t pte)
+{
+ return pte_set_flags(pte, _PAGE_ACCESSED);
+}
+
+static inline pte_t pte_mkwrite(pte_t pte)
+{
+ return pte_set_flags(pte, _PAGE_RW);
+}
+
+static inline pte_t pte_mkhuge(pte_t pte)
+{
+ return pte_set_flags(pte, _PAGE_PSE);
+}
+
+static inline pte_t pte_clrhuge(pte_t pte)
+{
+ return pte_clear_flags(pte, _PAGE_PSE);
+}
+
+static inline pte_t pte_mkglobal(pte_t pte)
+{
+ return pte;
+}
+
+static inline pte_t pte_clrglobal(pte_t pte)
+{
+ return pte;
+}
+
+static inline pte_t pte_mkspecial(pte_t pte)
+{
+ return pte_set_flags(pte, _PAGE_SPECIAL);
+}
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static inline pmd_t pmd_set_flags(pmd_t pmd, pmdval_t set)
+{
+ pmdval_t v = native_pmd_val(pmd);
+
+ return __pmd(v | set);
+}
+
+static inline pmd_t pmd_clear_flags(pmd_t pmd, pmdval_t clear)
+{
+ pmdval_t v = native_pmd_val(pmd);
+
+ return __pmd(v & ~clear);
+}
+
+static inline pmd_t pmd_mkold(pmd_t pmd)
+{
+ return pmd_clear_flags(pmd, _PAGE_ACCESSED);
+}
+
+static inline pmd_t pmd_wrprotect(pmd_t pmd)
+{
+ return pmd_clear_flags(pmd, _PAGE_RW);
+}
+
+static inline pmd_t pmd_mkdirty(pmd_t pmd)
+{
+ return pmd_set_flags(pmd, _PAGE_DIRTY);
+}
+
+static inline pmd_t pmd_mkhuge(pmd_t pmd)
+{
+ return pmd_set_flags(pmd, _PAGE_PSE);
+}
+
+static inline pmd_t pmd_mkyoung(pmd_t pmd)
+{
+ return pmd_set_flags(pmd, _PAGE_ACCESSED);
+}
+
+static inline pmd_t pmd_mkwrite(pmd_t pmd)
+{
+ return pmd_set_flags(pmd, _PAGE_RW);
+}
+
+static inline pmd_t pmd_mknotpresent(pmd_t pmd)
+{
+ return pmd_clear_flags(pmd, _PAGE_PRESENT);
+}
+#endif
+
+/*
+ * Mask out unsupported bits in a present pgprot. Non-present pgprots
+ * can use those bits for other purposes, so leave them be.
+ */
+static inline pgprotval_t massage_pgprot(pgprot_t pgprot)
+{
+ pgprotval_t protval = pgprot_val(pgprot);
+
+ if (protval & _PAGE_PRESENT)
+ protval &= __supported_pte_mask;
+
+ return protval;
+}
+
+static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot)
+{
+ return __pte(((phys_addr_t)page_nr << PAGE_SHIFT) |
+ massage_pgprot(pgprot));
+}
+
+static inline pte_t pfn_pte_ma(phys_addr_t page_nr, pgprot_t pgprot)
+{
+ return __pte_ma((page_nr << PAGE_SHIFT) | massage_pgprot(pgprot));
+}
+
+static inline pmd_t pfn_pmd(unsigned long page_nr, pgprot_t pgprot)
+{
+ return __pmd(((phys_addr_t)page_nr << PAGE_SHIFT) |
+ massage_pgprot(pgprot));
+}
+
+static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
+{
+ pteval_t val = pte_val(pte) & _PAGE_CHG_MASK;
+
+ val |= massage_pgprot(newprot) & ~_PAGE_CHG_MASK;
+
+ return __pte(val);
+}
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot)
+{
+ pmdval_t val = pmd_val(pmd);
+
+ val &= _HPAGE_CHG_MASK;
+ val |= massage_pgprot(newprot) & ~_HPAGE_CHG_MASK;
+
+ return __pmd(val);
+}
+#endif
+
+/* mprotect needs to preserve PAT bits when updating vm_page_prot */
+#define pgprot_modify pgprot_modify
+static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot)
+{
+ pgprotval_t preservebits = pgprot_val(oldprot) & _PAGE_CHG_MASK;
+ pgprotval_t addbits = pgprot_val(newprot);
+ return __pgprot(preservebits | addbits);
+}
+
+#define pte_pgprot(x) __pgprot(pte_flags(x) & PTE_FLAGS_MASK)
+
+#define canon_pgprot(p) __pgprot(massage_pgprot(p))
+
+static inline int is_new_memtype_allowed(u64 paddr, unsigned long size,
+ unsigned long flags,
+ unsigned long new_flags)
+{
+ /*
+ * PAT type is always WB for untracked ranges, so no need to check.
+ */
+ if (x86_platform.is_untracked_pat_range(paddr, paddr + size))
+ return 1;
+
+ /*
+ * Certain new memtypes are not allowed with certain
+ * requested memtype:
+ * - request is uncached, return cannot be write-back
+ * - request is write-combine, return cannot be write-back
+ */
+ if ((flags == _PAGE_CACHE_UC_MINUS &&
+ new_flags == _PAGE_CACHE_WB) ||
+ (flags == _PAGE_CACHE_WC &&
+ new_flags == _PAGE_CACHE_WB)) {
+ return 0;
+ }
+
+ return 1;
+}
+
+pmd_t *populate_extra_pmd(unsigned long vaddr);
+pte_t *populate_extra_pte(unsigned long vaddr);
+#endif /* __ASSEMBLY__ */
+
+#ifdef CONFIG_X86_32
+# include <asm/pgtable_32.h>
+#else
+# include <asm/pgtable_64.h>
+#endif
+
+#ifndef __ASSEMBLY__
+#include <linux/mm_types.h>
+#include <linux/log2.h>
+
+static inline int pte_none(pte_t pte)
+{
+ return !pte.pte;
+}
+
+#define __HAVE_ARCH_PTE_SAME
+static inline int pte_same(pte_t a, pte_t b)
+{
+ return a.pte == b.pte;
+}
+
+static inline int pte_present(pte_t a)
+{
+ return pte_flags(a) & (_PAGE_PRESENT | _PAGE_PROTNONE |
+ _PAGE_NUMA);
+}
+
+#define pte_accessible pte_accessible
+static inline int pte_accessible(pte_t a)
+{
+ return pte_flags(a) & _PAGE_PRESENT;
+}
+
+static inline int pte_hidden(pte_t pte)
+{
+ return pte_flags(pte) & _PAGE_HIDDEN;
+}
+
+static inline int pmd_present(pmd_t pmd)
+{
+#if CONFIG_XEN_COMPAT <= 0x030002
+/* pmd_present() doesn't just test the _PAGE_PRESENT bit, since writable
+   page tables (wr.p.t.) can temporarily clear it. */
+ return __pmd_val(pmd) != 0;
+#else
+ /*
+ * Checking for _PAGE_PSE is needed too because
+ * split_huge_page will temporarily clear the present bit (but
+ * the _PAGE_PSE flag will remain set at all times while the
+ * _PAGE_PRESENT bit is clear).
+ */
+ return pmd_flags(pmd) & (_PAGE_PRESENT | _PAGE_PROTNONE | _PAGE_PSE |
+ _PAGE_NUMA);
+#endif
+}
+
+static inline int pmd_none(pmd_t pmd)
+{
+ /* Only check low word on 32-bit platforms, since it might be
+ out of sync with upper half. */
+ return (unsigned long)__pmd_val(pmd) == 0;
+}
+
+static inline unsigned long pmd_page_vaddr(pmd_t pmd)
+{
+ return (unsigned long)__va(pmd_val(pmd) & PTE_PFN_MASK);
+}
+
+/*
+ * Currently stuck as a macro due to indirect forward reference to
+ * linux/mmzone.h's __section_mem_map_addr() definition:
+ */
+#define pmd_page(pmd) pfn_to_page((pmd_val(pmd) & PTE_PFN_MASK) >> PAGE_SHIFT)
+
+/*
+ * the pmd page can be thought of as an array like this: pmd_t[PTRS_PER_PMD]
+ *
+ * this macro returns the index of the entry in the pmd page which would
+ * control the given virtual address
+ */
+static inline unsigned long pmd_index(unsigned long address)
+{
+ return (address >> PMD_SHIFT) & (PTRS_PER_PMD - 1);
+}
+
+/*
+ * Conversion functions: convert a page and protection to a page entry,
+ * and a page entry and page directory to the page they refer to.
+ *
+ * (Currently stuck as a macro because of indirect forward reference
+ * to linux/mm.h:page_to_nid())
+ */
+#define mk_pte(page, pgprot) pfn_pte(page_to_pfn(page), (pgprot))
+
+/*
+ * the pte page can be thought of as an array like this: pte_t[PTRS_PER_PTE]
+ *
+ * this function returns the index of the entry in the pte page which would
+ * control the given virtual address
+ */
+static inline unsigned long pte_index(unsigned long address)
+{
+ return (address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
+}
+
+static inline pte_t *pte_offset_kernel(pmd_t *pmd, unsigned long address)
+{
+ return (pte_t *)pmd_page_vaddr(*pmd) + pte_index(address);
+}
+
+static inline int pmd_bad(pmd_t pmd)
+{
+#ifdef CONFIG_NUMA_BALANCING
+ /* pmd_numa check */
+ if ((pmd_flags(pmd) & (_PAGE_NUMA|_PAGE_PRESENT)) == _PAGE_NUMA)
+ return 0;
+#endif
+#if CONFIG_XEN_COMPAT <= 0x030002
+ return (pmd_flags(pmd) & ~_PAGE_USER & ~_PAGE_PRESENT)
+ != (_KERNPG_TABLE & ~_PAGE_PRESENT);
+#else
+ return (pmd_flags(pmd) & ~_PAGE_USER) != _KERNPG_TABLE;
+#endif
+}
+
+static inline unsigned long pages_to_mb(unsigned long npg)
+{
+ return npg >> (20 - PAGE_SHIFT);
+}
+
+#define io_remap_pfn_range(vma, vaddr, pfn, size, prot) \
+ direct_remap_pfn_range(vma, vaddr, pfn, size, prot, DOMID_IO)
+
+#if PAGETABLE_LEVELS > 2
+static inline int pud_none(pud_t pud)
+{
+ return __pud_val(pud) == 0;
+}
+
+static inline int pud_present(pud_t pud)
+{
+ return pud_flags(pud) & _PAGE_PRESENT;
+}
+
+static inline unsigned long pud_page_vaddr(pud_t pud)
+{
+ return (unsigned long)__va((unsigned long)pud_val(pud) & PTE_PFN_MASK);
+}
+
+/*
+ * Currently stuck as a macro due to indirect forward reference to
+ * linux/mmzone.h's __section_mem_map_addr() definition:
+ */
+#define pud_page(pud) pfn_to_page(pud_val(pud) >> PAGE_SHIFT)
+
+/* Find an entry in the second-level page table.. */
+static inline pmd_t *pmd_offset(pud_t *pud, unsigned long address)
+{
+ return (pmd_t *)pud_page_vaddr(*pud) + pmd_index(address);
+}
+
+static inline int pud_large(pud_t pud)
+{
+ return (__pud_val(pud) & (_PAGE_PSE | _PAGE_PRESENT)) ==
+ (_PAGE_PSE | _PAGE_PRESENT);
+}
+
+static inline int pud_bad(pud_t pud)
+{
+ return (pud_flags(pud) & ~(_KERNPG_TABLE | _PAGE_USER)) != 0;
+}
+#else
+static inline int pud_large(pud_t pud)
+{
+ return 0;
+}
+#endif /* PAGETABLE_LEVELS > 2 */
+
+#if PAGETABLE_LEVELS > 3
+static inline int pgd_present(pgd_t pgd)
+{
+ return pgd_flags(pgd) & _PAGE_PRESENT;
+}
+
+static inline unsigned long pgd_page_vaddr(pgd_t pgd)
+{
+ return (unsigned long)__va((unsigned long)pgd_val(pgd) & PTE_PFN_MASK);
+}
+
+/*
+ * Currently stuck as a macro due to indirect forward reference to
+ * linux/mmzone.h's __section_mem_map_addr() definition:
+ */
+#define pgd_page(pgd) pfn_to_page(pgd_val(pgd) >> PAGE_SHIFT)
+
+/* to find an entry in a page-table-directory. */
+static inline unsigned long pud_index(unsigned long address)
+{
+ return (address >> PUD_SHIFT) & (PTRS_PER_PUD - 1);
+}
+
+static inline pud_t *pud_offset(pgd_t *pgd, unsigned long address)
+{
+ return (pud_t *)pgd_page_vaddr(*pgd) + pud_index(address);
+}
+
+static inline int pgd_bad(pgd_t pgd)
+{
+ return (pgd_flags(pgd) & ~_PAGE_USER) != _KERNPG_TABLE;
+}
+
+static inline int pgd_none(pgd_t pgd)
+{
+ return !__pgd_val(pgd);
+}
+#endif /* PAGETABLE_LEVELS > 3 */
+
+#endif /* __ASSEMBLY__ */
+
+/*
+ * the pgd page can be thought of as an array like this: pgd_t[PTRS_PER_PGD]
+ *
+ * this macro returns the index of the entry in the pgd page which would
+ * control the given virtual address
+ */
+#define pgd_index(address) (((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1))
+
+/*
+ * pgd_offset() returns a (pgd_t *)
+ * pgd_index() is used to get the offset into the pgd page's array of pgd_t's;
+ */
+#define pgd_offset(mm, address) ((mm)->pgd + pgd_index((address)))
+/*
+ * a shortcut which implies the use of the kernel's pgd, instead
+ * of a process's
+ */
+#define pgd_offset_k(address) pgd_offset(&init_mm, (address))
+
+
+#define KERNEL_PGD_BOUNDARY pgd_index(PAGE_OFFSET)
+#define KERNEL_PGD_PTRS (PTRS_PER_PGD - KERNEL_PGD_BOUNDARY)
+
+#ifndef __ASSEMBLY__
+#include <asm/tlbflush.h>
+
+#define direct_gbpages 0
+void init_mem_mapping(void);
+void early_alloc_pgt_buf(void);
+
+/* local pte updates need not use xchg for locking */
+static inline pte_t xen_local_ptep_get_and_clear(pte_t *ptep, pte_t res)
+{
+ xen_set_pte(ptep, __pte(0));
+ return res;
+}
+
+static inline pmd_t xen_local_pmdp_get_and_clear(pmd_t *pmdp)
+{
+ pmd_t res = *pmdp;
+
+ xen_set_pmd(pmdp, __pmd(0));
+ return res;
+}
+
+static inline void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep , pte_t pte)
+{
+ if ((mm != current->mm && mm != &init_mm) ||
+ HYPERVISOR_update_va_mapping(addr, pte, 0))
+ xen_set_pte(ptep, pte);
+}
+
+static inline void xen_set_pmd_at(struct mm_struct *mm, unsigned long addr,
+ pmd_t *pmdp , pmd_t pmd)
+{
+ xen_set_pmd(pmdp, pmd);
+}
+
+static inline void xen_pte_clear(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep)
+{
+ if ((mm != current->mm && mm != &init_mm)
+ || HYPERVISOR_update_va_mapping(addr, __pte(0), 0))
+ __xen_pte_clear(ptep);
+}
+
+#ifndef CONFIG_PARAVIRT
+/*
+ * Rules for using pte_update - it must be called after any PTE update which
+ * has not been done using the set_pte / clear_pte interfaces. It is used by
+ * shadow mode hypervisors to resynchronize the shadow page tables. Kernel PTE
+ * updates should either be sets, clears, or set_pte_atomic for P->P
+ * transitions, which means this hook should only be called for user PTEs.
+ * This hook implies a P->P protection or access change has taken place, which
+ * requires a subsequent TLB flush. The notification can optionally be delayed
+ * until the TLB flush event by using the pte_update_defer form of the
+ * interface, but care must be taken to assure that the flush happens while
+ * still holding the same page table lock so that the shadow and primary pages
+ * do not become out of sync on SMP.
+ */
+#define pte_update(mm, addr, ptep) do { } while (0)
+#define pte_update_defer(mm, addr, ptep) do { } while (0)
+#endif
+
+/*
+ * We only update the dirty/accessed state if we set
+ * the dirty bit by hand in the kernel, since the hardware
+ * will do the accessed bit for us, and we don't want to
+ * race with other CPU's that might be updating the dirty
+ * bit at the same time.
+ */
+struct vm_area_struct;
+
+#define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
+extern int ptep_set_access_flags(struct vm_area_struct *vma,
+ unsigned long address, pte_t *ptep,
+ pte_t entry, int dirty);
+
+#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
+extern int ptep_test_and_clear_young(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep);
+
+#define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
+extern int ptep_clear_flush_young(struct vm_area_struct *vma,
+ unsigned long address, pte_t *ptep);
+
+#define __HAVE_ARCH_PTEP_CLEAR_FLUSH
+#define ptep_clear_flush(vma, addr, ptep) \
+({ \
+ pte_t *__ptep = (ptep); \
+ pte_t __res = *__ptep; \
+ if (!pte_none(__res) && \
+ ((vma)->vm_mm != current->mm || \
+ HYPERVISOR_update_va_mapping(addr, __pte(0), \
+ uvm_multi(mm_cpumask((vma)->vm_mm)) | \
+ UVMF_INVLPG))) { \
+ __xen_pte_clear(__ptep); \
+ flush_tlb_page(vma, addr); \
+ } \
+ __res; \
+})
+
+#define __HAVE_ARCH_PTEP_GET_AND_CLEAR
+static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep)
+{
+ pte_t pte = *ptep;
+ if (!pte_none(pte)
+ && (mm != &init_mm
+ || HYPERVISOR_update_va_mapping(addr, __pte(0), 0))) {
+ pte = xen_ptep_get_and_clear(ptep, pte);
+ pte_update(mm, addr, ptep);
+ }
+ return pte;
+}
+
+#define __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL
+#define ptep_get_and_clear_full(mm, addr, ptep, full) \
+ ((full) ? ({ \
+ pte_t *__ptep = (ptep); \
+ pte_t __res = *__ptep; \
+ if (!PagePinned(virt_to_page((mm)->pgd))) \
+ __xen_pte_clear(__ptep); \
+ else if (!pte_none(__res)) \
+ xen_l1_entry_update(__ptep, __pte(0)); \
+ __res; \
+ }) : \
+ ptep_get_and_clear(mm, addr, ptep))
+
+pte_t xen_ptep_get_and_clear_full(struct vm_area_struct *, unsigned long, pte_t *, int);
+
+#define __HAVE_ARCH_PTEP_SET_WRPROTECT
+static inline void ptep_set_wrprotect(struct mm_struct *mm,
+ unsigned long addr, pte_t *ptep)
+{
+ pte_t pte = *ptep;
+ if (pte_write(pte))
+ set_pte_at(mm, addr, ptep, pte_wrprotect(pte));
+}
+
+#define flush_tlb_fix_spurious_fault(vma, address) do { } while (0)
+
+#define mk_pmd(page, pgprot) pfn_pmd(page_to_pfn(page), (pgprot))
+
+#define __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS
+extern int pmdp_set_access_flags(struct vm_area_struct *vma,
+ unsigned long address, pmd_t *pmdp,
+ pmd_t entry, int dirty);
+
+#define __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG
+extern int pmdp_test_and_clear_young(struct vm_area_struct *vma,
+ unsigned long addr, pmd_t *pmdp);
+
+#define __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH
+extern int pmdp_clear_flush_young(struct vm_area_struct *vma,
+ unsigned long address, pmd_t *pmdp);
+
+
+#define __HAVE_ARCH_PMDP_SPLITTING_FLUSH
+extern void pmdp_splitting_flush(struct vm_area_struct *vma,
+ unsigned long addr, pmd_t *pmdp);
+
+#define __HAVE_ARCH_PMD_WRITE
+static inline int pmd_write(pmd_t pmd)
+{
+ return pmd_flags(pmd) & _PAGE_RW;
+}
+
+#define __HAVE_ARCH_PMDP_GET_AND_CLEAR
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static inline pmd_t pmdp_get_and_clear(struct mm_struct *mm, unsigned long addr,
+ pmd_t *pmdp)
+{
+ pmd_t pmd = xen_pmdp_get_and_clear(pmdp);
+ pmd_update(mm, addr, pmdp);
+ return pmd;
+}
+#endif
+
+#define __HAVE_ARCH_PMDP_SET_WRPROTECT
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static inline void pmdp_set_wrprotect(struct mm_struct *mm,
+ unsigned long addr, pmd_t *pmdp)
+{
+ clear_bit(_PAGE_BIT_RW, (unsigned long *)pmdp);
+ pmd_update(mm, addr, pmdp);
+}
+#endif
+
+/*
+ * clone_pgd_range(pgd_t *dst, pgd_t *src, int count);
+ *
+ * dst - pointer to pgd range anywhere on a pgd page
+ * src - ""
+ * count - the number of pgds to copy.
+ *
+ * dst and src can be on the same page, but the range must not overlap,
+ * and must not cross a page boundary.
+ */
+static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count)
+{
+ memcpy(dst, src, count * sizeof(pgd_t));
+}
+
+#define PTE_SHIFT ilog2(PTRS_PER_PTE)
+static inline int page_level_shift(enum pg_level level)
+{
+ return (PAGE_SHIFT - PTE_SHIFT) + level * PTE_SHIFT;
+}
+static inline unsigned long page_level_size(enum pg_level level)
+{
+ return 1UL << page_level_shift(level);
+}
+static inline unsigned long page_level_mask(enum pg_level level)
+{
+ return ~(page_level_size(level) - 1);
+}
+
+/*
+ * The x86 doesn't have any external MMU info: the kernel page
+ * tables contain all the necessary information.
+ */
+static inline void update_mmu_cache(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep)
+{
+}
+static inline void update_mmu_cache_pmd(struct vm_area_struct *vma,
+ unsigned long addr, pmd_t *pmd)
+{
+}
+
+#define arbitrary_virt_to_mfn(va) \
+({ \
+ unsigned int __lvl; \
+ pte_t *__ptep = lookup_address((unsigned long)(va), &__lvl); \
+ BUG_ON(!__ptep || __lvl != PG_LEVEL_4K || !pte_present(*__ptep));\
+ pte_mfn(*__ptep); \
+})
+
+#define arbitrary_virt_to_machine(va) \
+ (((maddr_t)arbitrary_virt_to_mfn(va) << PAGE_SHIFT) \
+ | ((unsigned long)(va) & (PAGE_SIZE - 1)))
+
+#ifdef CONFIG_HIGHPTE
+#include <asm/io.h>
+struct page *kmap_atomic_to_page(void *);
+#define ptep_to_machine(ptep) \
+({ \
+ pte_t *__ptep = (ptep); \
+ page_to_phys(kmap_atomic_to_page(__ptep)) \
+ | ((unsigned long)__ptep & (PAGE_SIZE - 1)); \
+})
+#else
+#define ptep_to_machine(ptep) virt_to_machine(ptep)
+#endif
+
+#define __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION
+static inline pte_t ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep)
+{
+#if CONFIG_XEN_COMPAT < 0x030300
+ if (unlikely(!xen_feature(XENFEAT_mmu_pt_update_preserve_ad)))
+ return ptep_get_and_clear(mm, addr, ptep);
+#endif
+ return *ptep;
+}
+
+static inline void ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, pte_t pte)
+{
+ mmu_update_t u;
+
+#if CONFIG_XEN_COMPAT < 0x030300
+ if (unlikely(!xen_feature(XENFEAT_mmu_pt_update_preserve_ad))) {
+ set_pte_at(mm, addr, ptep, pte);
+ return;
+ }
+#endif
+ u.ptr = ptep_to_machine(ptep) | MMU_PT_UPDATE_PRESERVE_AD;
+ u.val = __pte_val(pte);
+ if (HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF))
+ BUG();
+}
+
+#include <asm-generic/pgtable.h>
+
+#include <xen/features.h>
+void make_page_readonly(void *va, unsigned int feature);
+void make_page_writable(void *va, unsigned int feature);
+void make_pages_readonly(void *va, unsigned int nr, unsigned int feature);
+void make_pages_writable(void *va, unsigned int nr, unsigned int feature);
+
+struct vm_area_struct;
+
+int direct_remap_pfn_range(struct vm_area_struct *,
+ unsigned long address,
+ unsigned long mfn,
+ unsigned long size,
+ pgprot_t,
+ domid_t);
+int direct_kernel_remap_pfn_range(unsigned long address,
+ unsigned long mfn,
+ unsigned long size,
+ pgprot_t,
+ domid_t);
+int create_lookup_pte_addr(struct mm_struct *,
+ unsigned long address,
+ uint64_t *ptep);
+
+#endif /* __ASSEMBLY__ */
+
+#endif /* _ASM_X86_PGTABLE_H */
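
The pte_mk*()/pte_wrprotect() helpers in this header all reduce to the two
primitives pte_set_flags()/pte_clear_flags(). A standalone sketch of that
pattern, with a plain uint64_t standing in for pte_t and locally defined
flag values (assumed here only for illustration, though they match the
_PAGE_BIT_RW/_PAGE_BIT_DIRTY positions):

#include <stdint.h>
#include <stdio.h>

#define MY_PAGE_RW    (1ULL << 1)	/* local stand-in for _PAGE_RW */
#define MY_PAGE_DIRTY (1ULL << 6)	/* local stand-in for _PAGE_DIRTY */

static uint64_t pte_set_flags(uint64_t pte, uint64_t set)
{
	return pte | set;		/* cf. pte_mkdirty(), pte_mkwrite() */
}

static uint64_t pte_clear_flags(uint64_t pte, uint64_t clear)
{
	return pte & ~clear;		/* cf. pte_wrprotect(), pte_mkclean() */
}

int main(void)
{
	uint64_t pte = 0;

	pte = pte_set_flags(pte, MY_PAGE_RW | MY_PAGE_DIRTY);
	pte = pte_clear_flags(pte, MY_PAGE_RW);
	printf("pte=%#llx\n", (unsigned long long)pte);	/* 0x40: dirty only */
	return 0;
}
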
diff --git a/arch/x86/include/mach-xen/asm/pgtable_32.h b/arch/x86/include/mach-xen/asm/pgtable_32.h
new file mode 100644
index 000000000000..f00300875543
--- /dev/null
+++ b/arch/x86/include/mach-xen/asm/pgtable_32.h
@@ -0,0 +1,83 @@
+#ifndef _ASM_X86_PGTABLE_32_H
+#define _ASM_X86_PGTABLE_32_H
+
+#include <asm/pgtable_32_types.h>
+
+/*
+ * The Linux memory management assumes a three-level page table setup. On
+ * the i386, we use that, but "fold" the mid level into the top-level page
+ * table, so that we physically have the same two-level page table as the
+ * i386 mmu expects.
+ *
+ * This file contains the functions and defines necessary to modify and use
+ * the i386 page table tree.
+ */
+#ifndef __ASSEMBLY__
+#include <asm/processor.h>
+#include <asm/fixmap.h>
+#include <linux/threads.h>
+
+#include <linux/bitops.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/sched.h>
+
+struct vm_area_struct;
+
+extern pgd_t *swapper_pg_dir;
+extern pgd_t initial_page_table[1024];
+
+static inline void pgtable_cache_init(void) { }
+static inline void check_pgt_cache(void) { }
+void paging_init(void);
+
+extern void set_pmd_pfn(unsigned long, unsigned long, pgprot_t);
+
+
+/*
+ * Define this if things work differently on an i386 and an i486:
+ * it will (on an i486) warn about kernel memory accesses that are
+ * done without an 'access_ok(VERIFY_WRITE,..)'
+ */
+#undef TEST_ACCESS_OK
+
+#ifdef CONFIG_X86_PAE
+# include <asm/pgtable-3level.h>
+#else
+# include <asm/pgtable-2level.h>
+#endif
+
+#if defined(CONFIG_HIGHPTE)
+#define pte_offset_map(dir, address) \
+ ((pte_t *)kmap_atomic_pte(pmd_page(*(dir))) + \
+ pte_index((address)))
+#define pte_unmap(pte) kunmap_atomic((pte))
+#else
+#define pte_offset_map(dir, address) \
+ ((pte_t *)page_address(pmd_page(*(dir))) + pte_index((address)))
+#define pte_unmap(pte) do { } while (0)
+#endif
+
+/* Clear a kernel PTE and flush it from the TLB */
+#define kpte_clear_flush(ptep, vaddr) \
+do { \
+ if (HYPERVISOR_update_va_mapping(vaddr, __pte(0), UVMF_INVLPG)) \
+ BUG(); \
+} while (0)
+
+void make_lowmem_page_readonly(void *va, unsigned int feature);
+void make_lowmem_page_writable(void *va, unsigned int feature);
+
+#endif /* !__ASSEMBLY__ */
+
+/*
+ * kern_addr_valid() is (1) for FLATMEM and (0) for
+ * SPARSEMEM and DISCONTIGMEM
+ */
+#ifdef CONFIG_FLATMEM
+#define kern_addr_valid(addr) (1)
+#else
+#define kern_addr_valid(kaddr) (0)
+#endif
+
+#endif /* _ASM_X86_PGTABLE_32_H */
diff --git a/arch/x86/include/mach-xen/asm/pgtable_64.h b/arch/x86/include/mach-xen/asm/pgtable_64.h
new file mode 100644
index 000000000000..f360898af0eb
--- /dev/null
+++ b/arch/x86/include/mach-xen/asm/pgtable_64.h
@@ -0,0 +1,206 @@
+#ifndef _ASM_X86_PGTABLE_64_H
+#define _ASM_X86_PGTABLE_64_H
+
+#include <linux/const.h>
+#include <asm/pgtable_64_types.h>
+
+#ifndef __ASSEMBLY__
+
+/*
+ * This file contains the functions and defines necessary to modify and use
+ * the x86-64 page table tree.
+ */
+#include <asm/processor.h>
+#include <linux/bitops.h>
+#include <linux/threads.h>
+#include <linux/sched.h>
+
+#ifdef CONFIG_XEN
+extern pud_t level3_user_pgt[512];
+
+extern void xen_init_pt(void);
+extern void xen_switch_pt(void);
+#endif
+
+extern pud_t level3_kernel_pgt[512];
+extern pud_t level3_ident_pgt[512];
+extern pmd_t level2_kernel_pgt[512];
+extern pmd_t level2_fixmap_pgt[512];
+extern pmd_t level2_ident_pgt[512];
+extern pgd_t init_level4_pgt[];
+
+#define swapper_pg_dir init_level4_pgt
+
+extern void paging_init(void);
+
+#define pte_ERROR(e) \
+ pr_err("%s:%d: bad pte %p(%016lx pfn %010lx).\n", \
+ __FILE__, __LINE__, &(e), __pte_val(e), pte_pfn(e))
+#define pmd_ERROR(e) \
+ pr_err("%s:%d: bad pmd %p(%016lx pfn %010lx).\n", \
+ __FILE__, __LINE__, &(e), __pmd_val(e), pmd_pfn(e))
+#define pud_ERROR(e) \
+ pr_err("%s:%d: bad pud %p(%016lx pfn %010Lx).\n", \
+ __FILE__, __LINE__, &(e), __pud_val(e), \
+ (pud_val(e) & __PHYSICAL_MASK) >> PAGE_SHIFT)
+#define pgd_ERROR(e) \
+ pr_err("%s:%d: bad pgd %p(%016lx pfn %010Lx).\n", \
+ __FILE__, __LINE__, &(e), __pgd_val(e), \
+ (pgd_val(e) & __PHYSICAL_MASK) >> PAGE_SHIFT)
+
+struct mm_struct;
+
+void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte);
+
+
+#define __xen_pte_clear(ptep) xen_set_pte(ptep, __pte(0))
+
+static inline void xen_set_pte(pte_t *ptep, pte_t pte)
+{
+ *ptep = pte;
+}
+
+static inline void xen_set_pmd(pmd_t *pmdp, pmd_t pmd)
+{
+ xen_l2_entry_update(pmdp, pmd);
+}
+
+#define xen_pmd_clear(pmd) \
+({ \
+ pmd_t *__pmdp = (pmd); \
+ PagePinned(virt_to_page(__pmdp)) \
+ ? set_pmd(__pmdp, xen_make_pmd(0)) \
+ : (void)(*__pmdp = xen_make_pmd(0)); \
+})
+
+#ifdef CONFIG_SMP
+static inline pte_t xen_ptep_get_and_clear(pte_t *xp, pte_t ret)
+{
+ return __pte_ma(xchg(&xp->pte, 0));
+}
+#else
+#define xen_ptep_get_and_clear(xp, pte) xen_local_ptep_get_and_clear(xp, pte)
+#endif
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+#ifdef CONFIG_SMP
+static inline pmd_t xen_pmdp_get_and_clear(pmd_t *xp)
+{
+ return xen_make_pmd(xchg(&xp->pmd, 0));
+}
+#else
+#define xen_pmdp_get_and_clear(xp) xen_local_pmdp_get_and_clear(xp)
+#endif
+#endif
+
+static inline void xen_set_pud(pud_t *pudp, pud_t pud)
+{
+ xen_l3_entry_update(pudp, pud);
+}
+
+#define xen_pud_clear(pud) \
+({ \
+ pud_t *__pudp = (pud); \
+ PagePinned(virt_to_page(__pudp)) \
+ ? set_pud(__pudp, xen_make_pud(0)) \
+ : (void)(*__pudp = xen_make_pud(0)); \
+})
+
+static inline pgd_t *__user_pgd(pgd_t *pgd)
+{
+ if (unlikely(((unsigned long)pgd & PAGE_MASK)
+ == (unsigned long)init_level4_pgt))
+ return NULL;
+ return (pgd_t *)(virt_to_page(pgd)->private
+ + ((unsigned long)pgd & ~PAGE_MASK));
+}
+
+static inline void xen_set_pgd(pgd_t *pgdp, pgd_t pgd)
+{
+ xen_l4_entry_update(pgdp, pgd);
+}
+
+#define xen_pgd_clear(pgd) \
+({ \
+ pgd_t *__pgdp = (pgd); \
+ PagePinned(virt_to_page(__pgdp)) \
+ ? xen_l4_entry_update(__pgdp, xen_make_pgd(0)) \
+ : (void)(*__user_pgd(__pgdp) = *__pgdp = xen_make_pgd(0)); \
+})
+
+#define __pte_mfn(_pte) (((_pte).pte & PTE_PFN_MASK) >> PAGE_SHIFT)
+
+extern unsigned long early_arbitrary_virt_to_mfn(void *va);
+
+extern void sync_global_pgds(unsigned long start, unsigned long end);
+
+/*
+ * Conversion functions: convert a page and protection to a page entry,
+ * and a page entry and page directory to the page they refer to.
+ */
+
+/*
+ * Level 4 access.
+ */
+static inline int pgd_large(pgd_t pgd) { return 0; }
+#define mk_kernel_pgd(address) __pgd((address) | _KERNPG_TABLE)
+
+/* PUD - Level3 access */
+
+/* PMD - Level 2 access */
+#define pte_to_pgoff(pte) ((__pte_val(pte) & PHYSICAL_PAGE_MASK) >> PAGE_SHIFT)
+#define pgoff_to_pte(off) ((pte_t) { .pte = ((off) << PAGE_SHIFT) | \
+ _PAGE_FILE })
+#define PTE_FILE_MAX_BITS __PHYSICAL_MASK_SHIFT
+
+/* PTE - Level 1 access. */
+
+/* x86-64 always has all page tables mapped. */
+#define pte_offset_map(dir, address) pte_offset_kernel((dir), (address))
+#define pte_unmap(pte) ((void)(pte))/* NOP */
+
+/* Encode and de-code a swap entry */
+#if _PAGE_BIT_FILE < _PAGE_BIT_PROTNONE
+#define SWP_TYPE_BITS (_PAGE_BIT_FILE - _PAGE_BIT_PRESENT - 1)
+#define SWP_OFFSET_SHIFT (_PAGE_BIT_PROTNONE + 1)
+#else
+#define SWP_TYPE_BITS (_PAGE_BIT_PROTNONE - _PAGE_BIT_PRESENT - 1)
+#define SWP_OFFSET_SHIFT (_PAGE_BIT_FILE + 1)
+#endif
+
+#define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > SWP_TYPE_BITS)
+
+#define __swp_type(x) (((x).val >> (_PAGE_BIT_PRESENT + 1)) \
+ & ((1U << SWP_TYPE_BITS) - 1))
+#define __swp_offset(x) ((x).val >> SWP_OFFSET_SHIFT)
+#define __swp_entry(type, offset) ((swp_entry_t) { \
+ ((type) << (_PAGE_BIT_PRESENT + 1)) \
+ | ((offset) << SWP_OFFSET_SHIFT) })
+#define __pte_to_swp_entry(pte) ((swp_entry_t) { __pte_val(pte) })
+#define __swp_entry_to_pte(x) ((pte_t) { .pte = (x).val })
+
+extern int kern_addr_valid(unsigned long addr);
+
+#define HAVE_ARCH_UNMAPPED_AREA
+#define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN
+
+#define pgtable_cache_init() do { } while (0)
+#define check_pgt_cache() do { } while (0)
+
+#define PAGE_AGP PAGE_KERNEL_NOCACHE
+#define HAVE_PAGE_AGP 1
+
+/* fs/proc/kcore.c */
+#define kc_vaddr_to_offset(v) ((v) & __VIRTUAL_MASK)
+#define kc_offset_to_vaddr(o) ((o) | ~__VIRTUAL_MASK)
+
+#define __HAVE_ARCH_PTE_SAME
+
+#define vmemmap ((struct page *)VMEMMAP_START)
+
+extern void init_extra_mapping_uc(unsigned long phys, unsigned long size);
+extern void init_extra_mapping_wb(unsigned long phys, unsigned long size);
+
+#endif /* !__ASSEMBLY__ */
+
+#endif /* _ASM_X86_PGTABLE_64_H */
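
Worked example of the 64-bit swap-entry encoding above: with
_PAGE_BIT_PRESENT = 0, _PAGE_BIT_FILE = 6 and _PAGE_BIT_PROTNONE = 8 (see
pgtable_types.h below), SWP_TYPE_BITS is 5 and SWP_OFFSET_SHIFT is 9, so
the type sits in bits 1-5 and the offset starts at bit 9, leaving the
present bit clear. A userspace sketch of that arithmetic (local
re-declarations, not the kernel's):

#include <assert.h>
#include <stdint.h>

#define SWP_TYPE_BITS    5	/* _PAGE_BIT_FILE - _PAGE_BIT_PRESENT - 1 */
#define SWP_OFFSET_SHIFT 9	/* _PAGE_BIT_PROTNONE + 1 */

static uint64_t swp_entry(unsigned int type, uint64_t offset)
{
	/* type in bits 1-5, offset from bit 9; bit 0 (present) stays clear */
	return ((uint64_t)type << 1) | (offset << SWP_OFFSET_SHIFT);
}

int main(void)
{
	uint64_t e = swp_entry(7, 0xabcd);

	assert(((e >> 1) & ((1U << SWP_TYPE_BITS) - 1)) == 7);	/* __swp_type()   */
	assert((e >> SWP_OFFSET_SHIFT) == 0xabcd);		/* __swp_offset() */
	return 0;
}
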
diff --git a/arch/x86/include/mach-xen/asm/pgtable_64_types.h b/arch/x86/include/mach-xen/asm/pgtable_64_types.h
new file mode 100644
index 000000000000..3ea3da7aaded
--- /dev/null
+++ b/arch/x86/include/mach-xen/asm/pgtable_64_types.h
@@ -0,0 +1,68 @@
+#ifndef _ASM_X86_PGTABLE_64_DEFS_H
+#define _ASM_X86_PGTABLE_64_DEFS_H
+
+#include <asm/sparsemem.h>
+
+#ifndef __ASSEMBLY__
+#include <linux/types.h>
+
+/*
+ * These are used to make use of C type-checking.
+ */
+typedef unsigned long pteval_t;
+typedef unsigned long pmdval_t;
+typedef unsigned long pudval_t;
+typedef unsigned long pgdval_t;
+typedef unsigned long pgprotval_t;
+
+typedef union { pteval_t pte; unsigned int pte_low; } pte_t;
+
+#endif /* !__ASSEMBLY__ */
+
+#define SHARED_KERNEL_PMD 0
+#define PAGETABLE_LEVELS 4
+
+/*
+ * PGDIR_SHIFT determines what a top-level page table entry can map
+ */
+#define PGDIR_SHIFT 39
+#define PTRS_PER_PGD 512
+
+/*
+ * 3rd level page
+ */
+#define PUD_SHIFT 30
+#define PTRS_PER_PUD 512
+
+/*
+ * PMD_SHIFT determines the size of the area a middle-level
+ * page table can map
+ */
+#define PMD_SHIFT 21
+#define PTRS_PER_PMD 512
+
+/*
+ * entries per page directory level
+ */
+#define PTRS_PER_PTE 512
+
+#define PMD_SIZE (_AC(1, UL) << PMD_SHIFT)
+#define PMD_MASK (~(PMD_SIZE - 1))
+#define PUD_SIZE (_AC(1, UL) << PUD_SHIFT)
+#define PUD_MASK (~(PUD_SIZE - 1))
+#define PGDIR_SIZE (_AC(1, UL) << PGDIR_SHIFT)
+#define PGDIR_MASK (~(PGDIR_SIZE - 1))
+
+/* See Documentation/x86/x86_64/mm.txt for a description of the memory map. */
+#define MAX_PHYSMEM_BITS 43
+#define MAXMEM _AC(__AC(1, UL) << MAX_PHYSMEM_BITS, UL)
+#define VMALLOC_START _AC(0xffffc90000000000, UL)
+#define VMALLOC_END _AC(0xffffe8ffffffffff, UL)
+#define VMEMMAP_START _AC(0xffffea0000000000, UL)
+#define MODULES_VADDR _AC(0xffffffffa0000000, UL)
+#define MODULES_END _AC(0xffffffffff000000, UL)
+#define MODULES_LEN (MODULES_END - MODULES_VADDR)
+
+#define EARLY_DYNAMIC_PAGE_TABLES 64
+
+#endif /* _ASM_X86_PGTABLE_64_DEFS_H */
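
The shifts above split a virtual address into four 9-bit table indexes plus
a 12-bit page offset. A small sketch mirroring pgd_index()/pud_index()/
pmd_index()/pte_index() from pgtable.h (the sample address is only an
assumption, picked from the VMALLOC range listed above):

#include <stdio.h>
#include <stdint.h>

#define PGDIR_SHIFT 39
#define PUD_SHIFT   30
#define PMD_SHIFT   21
#define PAGE_SHIFT  12
#define PTRS_MASK   511		/* 512 entries per level */

int main(void)
{
	uint64_t va = 0xffffc90000123456ULL;	/* hypothetical vmalloc address */

	printf("pgd=%llu pud=%llu pmd=%llu pte=%llu off=%llu\n",
	       (unsigned long long)((va >> PGDIR_SHIFT) & PTRS_MASK),
	       (unsigned long long)((va >> PUD_SHIFT) & PTRS_MASK),
	       (unsigned long long)((va >> PMD_SHIFT) & PTRS_MASK),
	       (unsigned long long)((va >> PAGE_SHIFT) & PTRS_MASK),
	       (unsigned long long)(va & 0xfff));
	return 0;
}
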
diff --git a/arch/x86/include/mach-xen/asm/pgtable_types.h b/arch/x86/include/mach-xen/asm/pgtable_types.h
new file mode 100644
index 000000000000..14f57ab2d9a9
--- /dev/null
+++ b/arch/x86/include/mach-xen/asm/pgtable_types.h
@@ -0,0 +1,413 @@
+#ifndef _ASM_X86_PGTABLE_DEFS_H
+#define _ASM_X86_PGTABLE_DEFS_H
+
+#include <linux/const.h>
+#include <asm/page_types.h>
+
+#define FIRST_USER_ADDRESS 0
+
+#define _PAGE_BIT_PRESENT 0 /* is present */
+#define _PAGE_BIT_RW 1 /* writeable */
+#define _PAGE_BIT_USER 2 /* userspace addressable */
+#define _PAGE_BIT_PWT 3 /* page write through */
+#define _PAGE_BIT_PCD 4 /* page cache disabled */
+#define _PAGE_BIT_ACCESSED 5 /* was accessed (raised by CPU) */
+#define _PAGE_BIT_DIRTY 6 /* was written to (raised by CPU) */
+#define _PAGE_BIT_PSE 7 /* 4 MB (or 2MB) page */
+#define _PAGE_BIT_PAT 7 /* on 4KB pages */
+#define _PAGE_BIT_GLOBAL 8 /* Global TLB entry PPro+ */
+#define _PAGE_BIT_UNUSED1 9 /* available for programmer */
+#define _PAGE_BIT_IOMAP 10 /* flag used to indicate IO mapping */
+#define _PAGE_BIT_HIDDEN 11 /* hidden by kmemcheck */
+#define _PAGE_BIT_PAT_LARGE 12 /* On 2MB or 1GB pages */
+#define _PAGE_BIT_SPECIAL _PAGE_BIT_UNUSED1
+#define _PAGE_BIT_CPA_TEST _PAGE_BIT_UNUSED1
+#define _PAGE_BIT_SPLITTING _PAGE_BIT_UNUSED1 /* only valid on a PSE pmd */
+#define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */
+
+/* If _PAGE_BIT_PRESENT is clear, we use these: */
+/* - if the user mapped it with PROT_NONE; pte_present gives true */
+#define _PAGE_BIT_PROTNONE _PAGE_BIT_GLOBAL
+/* - set: nonlinear file mapping, saved PTE; unset:swap */
+#define _PAGE_BIT_FILE _PAGE_BIT_DIRTY
+
+#define _PAGE_PRESENT (_AT(pteval_t, 1) << _PAGE_BIT_PRESENT)
+#define _PAGE_RW (_AT(pteval_t, 1) << _PAGE_BIT_RW)
+#define _PAGE_USER (_AT(pteval_t, 1) << _PAGE_BIT_USER)
+#define _PAGE_PWT (_AT(pteval_t, 1) << _PAGE_BIT_PWT)
+#define _PAGE_PCD (_AT(pteval_t, 1) << _PAGE_BIT_PCD)
+#define _PAGE_ACCESSED (_AT(pteval_t, 1) << _PAGE_BIT_ACCESSED)
+#define _PAGE_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_DIRTY)
+#define _PAGE_PSE (_AT(pteval_t, 1) << _PAGE_BIT_PSE)
+#define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL)
+#define _PAGE_UNUSED1 (_AT(pteval_t, 1) << _PAGE_BIT_UNUSED1)
+#define _PAGE_IOMAP (_AT(pteval_t, 1) << _PAGE_BIT_IOMAP)
+#define _PAGE_PAT (_AT(pteval_t, 1) << _PAGE_BIT_PAT)
+#define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE)
+#define _PAGE_SPECIAL (_AT(pteval_t, 1) << _PAGE_BIT_SPECIAL)
+#define _PAGE_CPA_TEST (_AT(pteval_t, 1) << _PAGE_BIT_CPA_TEST)
+#define _PAGE_SPLITTING (_AT(pteval_t, 1) << _PAGE_BIT_SPLITTING)
+#define __HAVE_ARCH_PTE_SPECIAL
+
+#ifdef CONFIG_KMEMCHECK
+#define _PAGE_HIDDEN (_AT(pteval_t, 1) << _PAGE_BIT_HIDDEN)
+#else
+#define _PAGE_HIDDEN (_AT(pteval_t, 0))
+#endif
+
+#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
+#define _PAGE_NX (_AT(pteval_t, 1) << _PAGE_BIT_NX)
+#else
+#define _PAGE_NX (_AT(pteval_t, 0))
+#endif
+
+#define _PAGE_FILE (_AT(pteval_t, 1) << _PAGE_BIT_FILE)
+#define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE)
+
+/*
+ * _PAGE_NUMA indicates that this page will trigger a numa hinting
+ * minor page fault to gather numa placement statistics (see
+ * pte_numa()). The bit picked (8) is within the range between
+ * _PAGE_FILE (6) and _PAGE_PROTNONE (8) bits. Therefore, it doesn't
+ * require changes to the swp entry format because that bit is always
+ * zero when the pte is not present.
+ *
+ * The bit picked must always be zero both when the pmd is present and
+ * when it is not, so that we don't lose information when we set it
+ * while atomically clearing the present bit.
+ *
+ * Because we share the same bit (8) with _PAGE_PROTNONE, this can be
+ * interpreted as _PAGE_NUMA only in places that _PAGE_PROTNONE
+ * couldn't reach, like handle_mm_fault() (see access_error in
+ * arch/x86/mm/fault.c; the vma protection must not be PROT_NONE for
+ * handle_mm_fault() to be invoked).
+ */
+#define _PAGE_NUMA _PAGE_PROTNONE
+
+#ifndef __ASSEMBLY__
+#if defined(CONFIG_X86_64) && CONFIG_XEN_COMPAT <= 0x030002
+extern unsigned int __kernel_page_user;
+#else
+#define __kernel_page_user 0
+#endif
+#endif
+
+#define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \
+ _PAGE_ACCESSED | _PAGE_DIRTY)
+#define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | \
+ _PAGE_DIRTY | __kernel_page_user)
+
+/* Set of bits not changed in pte_modify */
+#define _PAGE_CHG_MASK (PTE_PFN_MASK | _PAGE_CACHE_MASK | _PAGE_IOMAP | \
+ _PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY)
+#define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE)
+
+/*
+ * PAT settings are part of the hypervisor interface, which sets the
+ * MSR to 0x050100070406 (i.e. WB, WT, UC-, UC, WC, WP [, UC, UC]).
+ */
+#define _PAGE_CACHE_MASK (_PAGE_PCD | _PAGE_PWT | _PAGE_PAT)
+#define _PAGE_CACHE_WB (0)
+#define _PAGE_CACHE_WT (_PAGE_PWT)
+#define _PAGE_CACHE_WC (_PAGE_PAT)
+#define _PAGE_CACHE_WP (_PAGE_PAT | _PAGE_PWT)
+#define _PAGE_CACHE_UC_MINUS (_PAGE_PCD)
+#define _PAGE_CACHE_UC (_PAGE_PCD | _PAGE_PWT)
+
+#define PAGE_NONE __pgprot(_PAGE_PROTNONE | _PAGE_ACCESSED)
+#define PAGE_SHARED __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \
+ _PAGE_ACCESSED | _PAGE_NX)
+
+#define PAGE_SHARED_EXEC __pgprot(_PAGE_PRESENT | _PAGE_RW | \
+ _PAGE_USER | _PAGE_ACCESSED)
+#define PAGE_COPY_NOEXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | \
+ _PAGE_ACCESSED | _PAGE_NX)
+#define PAGE_COPY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | \
+ _PAGE_ACCESSED)
+#define PAGE_COPY PAGE_COPY_NOEXEC
+#define PAGE_READONLY __pgprot(_PAGE_PRESENT | _PAGE_USER | \
+ _PAGE_ACCESSED | _PAGE_NX)
+#define PAGE_READONLY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | \
+ _PAGE_ACCESSED)
+
+#define __PAGE_KERNEL_EXEC \
+ (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | __kernel_page_user)
+#define __PAGE_KERNEL (__PAGE_KERNEL_EXEC | _PAGE_NX)
+
+#define __PAGE_KERNEL_RO (__PAGE_KERNEL & ~_PAGE_RW)
+#define __PAGE_KERNEL_RX (__PAGE_KERNEL_EXEC & ~_PAGE_RW)
+#define __PAGE_KERNEL_EXEC_NOCACHE (__PAGE_KERNEL_EXEC | _PAGE_PCD | _PAGE_PWT)
+#define __PAGE_KERNEL_WC (__PAGE_KERNEL | _PAGE_CACHE_WC)
+#define __PAGE_KERNEL_NOCACHE (__PAGE_KERNEL | _PAGE_PCD | _PAGE_PWT)
+#define __PAGE_KERNEL_UC_MINUS (__PAGE_KERNEL | _PAGE_PCD)
+#define __PAGE_KERNEL_VSYSCALL (__PAGE_KERNEL_RX | _PAGE_USER)
+#define __PAGE_KERNEL_VVAR (__PAGE_KERNEL_RO | _PAGE_USER)
+#define __PAGE_KERNEL_VVAR_NOCACHE (__PAGE_KERNEL_VVAR | _PAGE_PCD | _PAGE_PWT)
+#define __PAGE_KERNEL_LARGE (__PAGE_KERNEL | _PAGE_PSE)
+#define __PAGE_KERNEL_LARGE_NOCACHE (__PAGE_KERNEL | _PAGE_CACHE_UC | _PAGE_PSE)
+#define __PAGE_KERNEL_LARGE_EXEC (__PAGE_KERNEL_EXEC | _PAGE_PSE)
+
+#define __PAGE_KERNEL_IO (__PAGE_KERNEL | _PAGE_IOMAP)
+#define __PAGE_KERNEL_IO_NOCACHE (__PAGE_KERNEL_NOCACHE | _PAGE_IOMAP)
+#define __PAGE_KERNEL_IO_UC_MINUS (__PAGE_KERNEL_UC_MINUS | _PAGE_IOMAP)
+#define __PAGE_KERNEL_IO_WC (__PAGE_KERNEL_WC | _PAGE_IOMAP)
+
+#define PAGE_KERNEL __pgprot(__PAGE_KERNEL)
+#define PAGE_KERNEL_RO __pgprot(__PAGE_KERNEL_RO)
+#define PAGE_KERNEL_EXEC __pgprot(__PAGE_KERNEL_EXEC)
+#define PAGE_KERNEL_RX __pgprot(__PAGE_KERNEL_RX)
+#define PAGE_KERNEL_WC __pgprot(__PAGE_KERNEL_WC)
+#define PAGE_KERNEL_NOCACHE __pgprot(__PAGE_KERNEL_NOCACHE)
+#define PAGE_KERNEL_UC_MINUS __pgprot(__PAGE_KERNEL_UC_MINUS)
+#define PAGE_KERNEL_EXEC_NOCACHE __pgprot(__PAGE_KERNEL_EXEC_NOCACHE)
+#define PAGE_KERNEL_LARGE __pgprot(__PAGE_KERNEL_LARGE)
+#define PAGE_KERNEL_LARGE_NOCACHE __pgprot(__PAGE_KERNEL_LARGE_NOCACHE)
+#define PAGE_KERNEL_LARGE_EXEC __pgprot(__PAGE_KERNEL_LARGE_EXEC)
+#define PAGE_KERNEL_VSYSCALL __pgprot(__PAGE_KERNEL_VSYSCALL)
+#define PAGE_KERNEL_VVAR __pgprot(__PAGE_KERNEL_VVAR)
+#define PAGE_KERNEL_VVAR_NOCACHE __pgprot(__PAGE_KERNEL_VVAR_NOCACHE)
+
+#define PAGE_KERNEL_IO __pgprot(__PAGE_KERNEL_IO)
+#define PAGE_KERNEL_IO_NOCACHE __pgprot(__PAGE_KERNEL_IO_NOCACHE)
+#define PAGE_KERNEL_IO_UC_MINUS __pgprot(__PAGE_KERNEL_IO_UC_MINUS)
+#define PAGE_KERNEL_IO_WC __pgprot(__PAGE_KERNEL_IO_WC)
+
+/* xwr */
+#define __P000 PAGE_NONE
+#define __P001 PAGE_READONLY
+#define __P010 PAGE_COPY
+#define __P011 PAGE_COPY
+#define __P100 PAGE_READONLY_EXEC
+#define __P101 PAGE_READONLY_EXEC
+#define __P110 PAGE_COPY_EXEC
+#define __P111 PAGE_COPY_EXEC
+
+#define __S000 PAGE_NONE
+#define __S001 PAGE_READONLY
+#define __S010 PAGE_SHARED
+#define __S011 PAGE_SHARED
+#define __S100 PAGE_READONLY_EXEC
+#define __S101 PAGE_READONLY_EXEC
+#define __S110 PAGE_SHARED_EXEC
+#define __S111 PAGE_SHARED_EXEC
+
+/*
+ * early identity mapping pte attrib macros.
+ */
+#ifdef CONFIG_X86_64
+#define __PAGE_KERNEL_IDENT_LARGE_EXEC __PAGE_KERNEL_LARGE_EXEC
+#else
+/*
+ * PDE_IDENT_ATTR includes the USER bit. As the PDE and PTE protection
+ * bits are combined, this allows the user to access the high-address
+ * mapped VDSO in the presence of CONFIG_COMPAT_VDSO.
+ */
+#define PTE_IDENT_ATTR 0x003 /* PRESENT+RW */
+#define PDE_IDENT_ATTR 0x067 /* PRESENT+RW+USER+DIRTY+ACCESSED */
+#define PGD_IDENT_ATTR 0x001 /* PRESENT (no other attributes) */
+#endif
+
+#ifdef CONFIG_X86_32
+# include <asm/pgtable_32_types.h>
+#else
+# include <asm/pgtable_64_types.h>
+#endif
+
+#ifndef __ASSEMBLY__
+
+#include <linux/types.h>
+
+/* PTE_PFN_MASK extracts the PFN from a (pte|pmd|pud|pgd)val_t */
+#define PTE_PFN_MASK ((pteval_t)PHYSICAL_PAGE_MASK)
+
+/* PTE_FLAGS_MASK extracts the flags from a (pte|pmd|pud|pgd)val_t */
+#define PTE_FLAGS_MASK (~PTE_PFN_MASK)
+
+typedef struct pgprot { pgprotval_t pgprot; } pgprot_t;
+
+#include <asm/maddr.h>
+
+typedef struct { pgdval_t pgd; } pgd_t;
+
+#define __pgd_ma(x) ((pgd_t) { (x) } )
+static inline pgd_t xen_make_pgd(pgdval_t val)
+{
+ if (likely(val & _PAGE_PRESENT))
+ val = pte_phys_to_machine(val);
+ return (pgd_t) { val };
+}
+
+#define __pgd_val(x) ((x).pgd)
+static inline pgdval_t xen_pgd_val(pgd_t pgd)
+{
+ pgdval_t ret = __pgd_val(pgd);
+#if PAGETABLE_LEVELS == 2 && CONFIG_XEN_COMPAT <= 0x030002
+ if (likely(ret))
+ ret = machine_to_phys(ret) | _PAGE_PRESENT;
+#else
+ if (likely(ret & _PAGE_PRESENT))
+ ret = pte_machine_to_phys(ret);
+#endif
+ return ret;
+}
+
+static inline pgdval_t pgd_flags(pgd_t pgd)
+{
+ return __pgd_val(pgd) & PTE_FLAGS_MASK;
+}
+
+#if PAGETABLE_LEVELS > 3
+typedef struct { pudval_t pud; } pud_t;
+
+#define __pud_ma(x) ((pud_t) { (x) } )
+static inline pud_t xen_make_pud(pudval_t val)
+{
+ if (likely(val & _PAGE_PRESENT))
+ val = pte_phys_to_machine(val);
+ return (pud_t) { val };
+}
+
+#define __pud_val(x) ((x).pud)
+static inline pudval_t xen_pud_val(pud_t pud)
+{
+ pudval_t ret = __pud_val(pud);
+ if (likely(ret & _PAGE_PRESENT))
+ ret = pte_machine_to_phys(ret);
+ return ret;
+}
+#else
+#include <asm-generic/pgtable-nopud.h>
+
+#define __pud_val(x) __pgd_val((x).pgd)
+static inline pudval_t xen_pud_val(pud_t pud)
+{
+ return xen_pgd_val(pud.pgd);
+}
+#endif
+
+#if PAGETABLE_LEVELS > 2
+typedef struct { pmdval_t pmd; } pmd_t;
+
+#define __pmd_ma(x) ((pmd_t) { (x) } )
+static inline pmd_t xen_make_pmd(pmdval_t val)
+{
+ if (likely(val & _PAGE_PRESENT))
+ val = pte_phys_to_machine(val);
+ return (pmd_t) { val };
+}
+
+#define __pmd_val(x) ((x).pmd)
+static inline pmdval_t xen_pmd_val(pmd_t pmd)
+{
+ pmdval_t ret = __pmd_val(pmd);
+#if CONFIG_XEN_COMPAT <= 0x030002
+ if (likely(ret))
+ ret = pte_machine_to_phys(ret) | _PAGE_PRESENT;
+#else
+ if (likely(ret & _PAGE_PRESENT))
+ ret = pte_machine_to_phys(ret);
+#endif
+ return ret;
+}
+#else
+#include <asm-generic/pgtable-nopmd.h>
+
+#define __pmd_ma(x) ((pmd_t) { .pud.pgd = __pgd_ma(x) } )
+#define __pmd_val(x) __pgd_val((x).pud.pgd)
+static inline pmdval_t xen_pmd_val(pmd_t pmd)
+{
+ return xen_pgd_val(pmd.pud.pgd);
+}
+#endif
+
+static inline pudval_t pud_flags(pud_t pud)
+{
+ return __pud_val(pud) & PTE_FLAGS_MASK;
+}
+
+static inline pmdval_t pmd_flags(pmd_t pmd)
+{
+ return __pmd_val(pmd) & PTE_FLAGS_MASK;
+}
+
+#define __pte_ma(x) ((pte_t) { .pte = (x) } )
+static inline pte_t xen_make_pte(pteval_t val)
+{
+ if (likely((val & (_PAGE_PRESENT|_PAGE_IOMAP)) == _PAGE_PRESENT))
+ val = pte_phys_to_machine(val);
+ return (pte_t) { .pte = val };
+}
+
+#define __pte_val(x) ((x).pte)
+static inline pteval_t xen_pte_val(pte_t pte)
+{
+ pteval_t ret = __pte_val(pte);
+ if (likely((pte.pte_low & (_PAGE_PRESENT|_PAGE_IOMAP)) == _PAGE_PRESENT))
+ ret = pte_machine_to_phys(ret);
+ return ret;
+}
+
+static inline pteval_t pte_flags(pte_t pte)
+{
+ return __pte_val(pte) & PTE_FLAGS_MASK;
+}
+
+#define pgprot_val(x) ((x).pgprot)
+#define __pgprot(x) ((pgprot_t) { (x) } )
+
+
+typedef struct page *pgtable_t;
+
+extern pteval_t __supported_pte_mask;
+extern void set_nx(void);
+extern int nx_enabled;
+
+#define pgprot_writecombine pgprot_writecombine
+extern pgprot_t pgprot_writecombine(pgprot_t prot);
+
+#ifndef CONFIG_XEN
+/* Indicate that x86 has its own track and untrack pfn vma functions */
+#define __HAVE_PFNMAP_TRACKING
+#endif
+
+#define __HAVE_PHYS_MEM_ACCESS_PROT
+struct file;
+pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
+ unsigned long size, pgprot_t vma_prot);
+int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
+ unsigned long size, pgprot_t *vma_prot);
+
+/* Install a pte for a particular vaddr in kernel space. */
+void set_pte_vaddr(unsigned long vaddr, pte_t pte);
+
+#define xen_pagetable_init paging_init
+
+struct seq_file;
+extern void arch_report_meminfo(struct seq_file *m);
+
+enum pg_level {
+ PG_LEVEL_NONE,
+ PG_LEVEL_4K,
+ PG_LEVEL_2M,
+ PG_LEVEL_1G,
+ PG_LEVEL_NUM
+};
+
+#ifdef CONFIG_PROC_FS
+extern void update_page_count(int level, unsigned long pages);
+#else
+static inline void update_page_count(int level, unsigned long pages) { }
+#endif
+
+/*
+ * Helper function that returns the kernel pagetable entry controlling
+ * the virtual address 'address'. NULL means no pagetable entry present.
+ * NOTE: the return type is pte_t but if the pmd is PSE then we return it
+ * as a pte too.
+ */
+extern pte_t *lookup_address(unsigned long address, unsigned int *level);
+extern phys_addr_t slow_virt_to_phys(void *__address);
+
+#endif /* !__ASSEMBLY__ */
+
+#endif /* _ASM_X86_PGTABLE_DEFS_H */
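
The _PAGE_CACHE_* combinations above encode the memory type through the
PWT (bit 3), PCD (bit 4) and PAT (bit 7) attribute bits, following the
hypervisor's fixed PAT layout (WB, WT, UC-, UC, WC, WP). A sketch decoding
a pte value into its memory type, using locally defined bit masks for
illustration:

#include <stdio.h>
#include <stdint.h>

#define PG_PWT (1ULL << 3)	/* local stand-in for _PAGE_PWT */
#define PG_PCD (1ULL << 4)	/* local stand-in for _PAGE_PCD */
#define PG_PAT (1ULL << 7)	/* local stand-in for _PAGE_PAT */

static const char *memtype(uint64_t pte)
{
	switch (pte & (PG_PWT | PG_PCD | PG_PAT)) {
	case 0:               return "WB";	/* _PAGE_CACHE_WB       */
	case PG_PWT:          return "WT";	/* _PAGE_CACHE_WT       */
	case PG_PCD:          return "UC-";	/* _PAGE_CACHE_UC_MINUS */
	case PG_PCD | PG_PWT: return "UC";	/* _PAGE_CACHE_UC       */
	case PG_PAT:          return "WC";	/* _PAGE_CACHE_WC       */
	case PG_PAT | PG_PWT: return "WP";	/* _PAGE_CACHE_WP       */
	default:              return "undefined";
	}
}

int main(void)
{
	printf("%s %s\n", memtype(PG_PCD), memtype(PG_PAT));	/* "UC- WC" */
	return 0;
}
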
diff --git a/arch/x86/include/mach-xen/asm/probe_roms.h b/arch/x86/include/mach-xen/asm/probe_roms.h
new file mode 100644
index 000000000000..da90d01fb149
--- /dev/null
+++ b/arch/x86/include/mach-xen/asm/probe_roms.h
@@ -0,0 +1,10 @@
+#if !defined(CONFIG_XEN_UNPRIVILEGED_GUEST)
+# include_next <asm/probe_roms.h>
+#elif !defined(_PROBE_ROMS_H_)
+# define _PROBE_ROMS_H_
+struct pci_dev;
+
+static inline void __iomem *pci_map_biosrom(struct pci_dev *pdev) { return NULL; }
+static inline void pci_unmap_biosrom(void __iomem *rom) { }
+static inline size_t pci_biosrom_size(struct pci_dev *pdev) { return 0; }
+#endif
diff --git a/arch/x86/include/mach-xen/asm/processor.h b/arch/x86/include/mach-xen/asm/processor.h
new file mode 100644
index 000000000000..257ddc8ee18e
--- /dev/null
+++ b/arch/x86/include/mach-xen/asm/processor.h
@@ -0,0 +1,984 @@
+#ifndef _ASM_X86_PROCESSOR_H
+#define _ASM_X86_PROCESSOR_H
+
+#include <asm/processor-flags.h>
+
+/* Forward declaration, a strange C thing */
+struct task_struct;
+struct mm_struct;
+
+#include <asm/vm86.h>
+#include <asm/math_emu.h>
+#include <asm/segment.h>
+#include <asm/types.h>
+#include <asm/sigcontext.h>
+#include <asm/current.h>
+#include <asm/cpufeature.h>
+#include <asm/page.h>
+#include <asm/pgtable_types.h>
+#include <asm/percpu.h>
+#include <asm/msr.h>
+#include <asm/desc_defs.h>
+#include <asm/nops.h>
+#include <asm/special_insns.h>
+
+#include <linux/personality.h>
+#include <linux/cpumask.h>
+#include <linux/cache.h>
+#include <linux/threads.h>
+#include <linux/math64.h>
+#include <linux/init.h>
+#include <linux/err.h>
+#include <linux/irqflags.h>
+
+#include <xen/interface/physdev.h>
+
+/*
+ * We handle most unaligned accesses in hardware. On the other hand,
+ * unaligned DMA can be quite expensive on some Nehalem processors.
+ *
+ * Based on this we disable the IP header alignment in network drivers.
+ */
+#define NET_IP_ALIGN 0
+
+#define HBP_NUM 4
+/*
+ * Default implementation of macro that returns current
+ * instruction pointer ("program counter").
+ */
+static inline void *current_text_addr(void)
+{
+ void *pc;
+
+ asm volatile("mov $1f, %0; 1:":"=r" (pc));
+
+ return pc;
+}
+
+#ifdef CONFIG_X86_VSMP
+# define ARCH_MIN_TASKALIGN (1 << INTERNODE_CACHE_SHIFT)
+# define ARCH_MIN_MMSTRUCT_ALIGN (1 << INTERNODE_CACHE_SHIFT)
+#else
+# define ARCH_MIN_TASKALIGN 16
+# define ARCH_MIN_MMSTRUCT_ALIGN 0
+#endif
+
+enum tlb_infos {
+ ENTRIES,
+ NR_INFO
+};
+
+extern u16 __read_mostly tlb_lli_4k[NR_INFO];
+extern u16 __read_mostly tlb_lli_2m[NR_INFO];
+extern u16 __read_mostly tlb_lli_4m[NR_INFO];
+extern u16 __read_mostly tlb_lld_4k[NR_INFO];
+extern u16 __read_mostly tlb_lld_2m[NR_INFO];
+extern u16 __read_mostly tlb_lld_4m[NR_INFO];
+extern s8 __read_mostly tlb_flushall_shift;
+
+/*
+ * CPU type and hardware bug flags. Kept separately for each CPU.
+ * Members of this structure are referenced in head.S, so think twice
+ * before touching them. [mj]
+ */
+
+struct cpuinfo_x86 {
+ __u8 x86; /* CPU family */
+ __u8 x86_vendor; /* CPU vendor */
+ __u8 x86_model;
+ __u8 x86_mask;
+#ifdef CONFIG_X86_32
+ char wp_works_ok; /* It doesn't on 386's */
+
+ /* Problems on some 486Dx4's and old 386's: */
+ char hard_math;
+#ifndef CONFIG_XEN
+ char rfu;
+ char pad0;
+#endif
+#else
+ /* Number of 4K pages in DTLB/ITLB combined(in pages): */
+ int x86_tlbsize;
+#endif
+ __u8 x86_virt_bits;
+ __u8 x86_phys_bits;
+#ifndef CONFIG_XEN
+ /* CPUID returned core id bits: */
+ __u8 x86_coreid_bits;
+#endif
+ /* Max extended CPUID function supported: */
+ __u32 extended_cpuid_level;
+ /* Maximum supported CPUID level, -1=no CPUID: */
+ int cpuid_level;
+ __u32 x86_capability[NCAPINTS + NBUGINTS];
+ char x86_vendor_id[16];
+ char x86_model_id[64];
+ /* in KB - valid for CPUS which support this call: */
+ int x86_cache_size;
+ int x86_cache_alignment; /* In bytes */
+ int x86_power;
+ unsigned long loops_per_jiffy;
+#ifndef CONFIG_XEN
+ /* cpuid returned max cores value: */
+ u16 x86_max_cores;
+ u16 apicid;
+ u16 initial_apicid;
+#endif
+ u16 x86_clflush_size;
+#ifndef CONFIG_XEN
+ /* number of cores as seen by the OS: */
+ u16 booted_cores;
+ /* Physical processor id: */
+ u16 phys_proc_id;
+ /* Core id: */
+ u16 cpu_core_id;
+ /* Compute unit id */
+ u8 compute_unit_id;
+#endif
+ /* Index into per_cpu list: */
+ u16 cpu_index;
+#ifndef CONFIG_XEN
+ u32 microcode;
+#endif
+} __attribute__((__aligned__(SMP_CACHE_BYTES)));
+
+#define X86_VENDOR_INTEL 0
+#define X86_VENDOR_CYRIX 1
+#define X86_VENDOR_AMD 2
+#define X86_VENDOR_UMC 3
+#define X86_VENDOR_CENTAUR 5
+#define X86_VENDOR_TRANSMETA 7
+#define X86_VENDOR_NSC 8
+#define X86_VENDOR_NUM 9
+
+#define X86_VENDOR_UNKNOWN 0xff
+
+/*
+ * capabilities of CPUs
+ */
+extern struct cpuinfo_x86 boot_cpu_data;
+extern struct cpuinfo_x86 new_cpu_data;
+
+extern __u32 cpu_caps_cleared[NCAPINTS];
+extern __u32 cpu_caps_set[NCAPINTS];
+
+#ifdef CONFIG_SMP
+DECLARE_PER_CPU_SHARED_ALIGNED(struct cpuinfo_x86, cpu_info);
+#define cpu_data(cpu) per_cpu(cpu_info, cpu)
+#else
+#define cpu_info boot_cpu_data
+#define cpu_data(cpu) boot_cpu_data
+#endif
+
+extern const struct seq_operations cpuinfo_op;
+
+#define cache_line_size() (boot_cpu_data.x86_cache_alignment)
+
+extern void cpu_detect(struct cpuinfo_x86 *c);
+
+extern void early_cpu_init(void);
+extern void identify_boot_cpu(void);
+extern void identify_secondary_cpu(struct cpuinfo_x86 *);
+extern void print_cpu_info(struct cpuinfo_x86 *);
+void print_cpu_msr(struct cpuinfo_x86 *);
+extern void init_scattered_cpuid_features(struct cpuinfo_x86 *c);
+extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c);
+extern void init_amd_cacheinfo(struct cpuinfo_x86 *c);
+
+extern void detect_extended_topology(struct cpuinfo_x86 *c);
+extern void detect_ht(struct cpuinfo_x86 *c);
+
+#if defined(CONFIG_X86_32) && !defined(CONFIG_XEN)
+extern int have_cpuid_p(void);
+#else
+static inline int have_cpuid_p(void)
+{
+ return 1;
+}
+#endif
+static inline void xen_cpuid(unsigned int *eax, unsigned int *ebx,
+ unsigned int *ecx, unsigned int *edx)
+{
+ /* ecx is often an input as well as an output. */
+ asm volatile(XEN_CPUID
+ : "=a" (*eax),
+ "=b" (*ebx),
+ "=c" (*ecx),
+ "=d" (*edx)
+ : "0" (*eax), "2" (*ecx)
+ : "memory");
+}
+
+static inline void load_cr3(pgd_t *pgdir)
+{
+ write_cr3(__pa(pgdir));
+}
+
+#ifndef CONFIG_X86_NO_TSS
+#ifdef CONFIG_X86_32
+/* This is the TSS defined by the hardware. */
+struct x86_hw_tss {
+ unsigned short back_link, __blh;
+ unsigned long sp0;
+ unsigned short ss0, __ss0h;
+ unsigned long sp1;
+ /* ss1 caches MSR_IA32_SYSENTER_CS: */
+ unsigned short ss1, __ss1h;
+ unsigned long sp2;
+ unsigned short ss2, __ss2h;
+ unsigned long __cr3;
+ unsigned long ip;
+ unsigned long flags;
+ unsigned long ax;
+ unsigned long cx;
+ unsigned long dx;
+ unsigned long bx;
+ unsigned long sp;
+ unsigned long bp;
+ unsigned long si;
+ unsigned long di;
+ unsigned short es, __esh;
+ unsigned short cs, __csh;
+ unsigned short ss, __ssh;
+ unsigned short ds, __dsh;
+ unsigned short fs, __fsh;
+ unsigned short gs, __gsh;
+ unsigned short ldt, __ldth;
+ unsigned short trace;
+ unsigned short io_bitmap_base;
+
+} __attribute__((packed));
+extern struct tss_struct doublefault_tss;
+#else
+struct x86_hw_tss {
+ u32 reserved1;
+ u64 sp0;
+ u64 sp1;
+ u64 sp2;
+ u64 reserved2;
+ u64 ist[7];
+ u32 reserved3;
+ u32 reserved4;
+ u16 reserved5;
+ u16 io_bitmap_base;
+
+} __attribute__((packed)) ____cacheline_aligned;
+#endif
+#endif /* CONFIG_X86_NO_TSS */
+
+/*
+ * IO-bitmap sizes:
+ */
+#define IO_BITMAP_BITS 65536
+#define IO_BITMAP_BYTES (IO_BITMAP_BITS/8)
+#define IO_BITMAP_LONGS (IO_BITMAP_BYTES/sizeof(long))
+#define IO_BITMAP_OFFSET offsetof(struct tss_struct, io_bitmap)
+#define INVALID_IO_BITMAP_OFFSET 0x8000
+
+#ifndef CONFIG_X86_NO_TSS
+struct tss_struct {
+ /*
+ * The hardware state:
+ */
+ struct x86_hw_tss x86_tss;
+
+ /*
+ * The extra 1 is there because the CPU will access an
+ * additional byte beyond the end of the IO permission
+ * bitmap. The extra byte must be all 1 bits, and must
+ * be within the limit.
+ */
+ unsigned long io_bitmap[IO_BITMAP_LONGS + 1];
+
+ /*
+ * .. and then another 0x100 bytes for the emergency kernel stack:
+ */
+ unsigned long stack[64];
+
+} ____cacheline_aligned;
+
+DECLARE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss);
+
+/*
+ * Save the original ist values for checking stack pointers during debugging
+ */
+struct orig_ist {
+ unsigned long ist[7];
+};
+#endif /* CONFIG_X86_NO_TSS */
+
+#define MXCSR_DEFAULT 0x1f80
+
+struct i387_fsave_struct {
+ u32 cwd; /* FPU Control Word */
+ u32 swd; /* FPU Status Word */
+ u32 twd; /* FPU Tag Word */
+ u32 fip; /* FPU IP Offset */
+ u32 fcs; /* FPU IP Selector */
+ u32 foo; /* FPU Operand Pointer Offset */
+ u32 fos; /* FPU Operand Pointer Selector */
+
+ /* 8*10 bytes for each FP-reg = 80 bytes: */
+ u32 st_space[20];
+
+ /* Software status information [not touched by FSAVE ]: */
+ u32 status;
+};
+
+struct i387_fxsave_struct {
+ u16 cwd; /* Control Word */
+ u16 swd; /* Status Word */
+ u16 twd; /* Tag Word */
+ u16 fop; /* Last Instruction Opcode */
+ union {
+ struct {
+ u64 rip; /* Instruction Pointer */
+ u64 rdp; /* Data Pointer */
+ };
+ struct {
+ u32 fip; /* FPU IP Offset */
+ u32 fcs; /* FPU IP Selector */
+ u32 foo; /* FPU Operand Offset */
+ u32 fos; /* FPU Operand Selector */
+ };
+ };
+ u32 mxcsr; /* MXCSR Register State */
+ u32 mxcsr_mask; /* MXCSR Mask */
+
+ /* 8*16 bytes for each FP-reg = 128 bytes: */
+ u32 st_space[32];
+
+ /* 16*16 bytes for each XMM-reg = 256 bytes: */
+ u32 xmm_space[64];
+
+ u32 padding[12];
+
+ union {
+ u32 padding1[12];
+ u32 sw_reserved[12];
+ };
+
+} __attribute__((aligned(16)));
+
+struct i387_soft_struct {
+ u32 cwd;
+ u32 swd;
+ u32 twd;
+ u32 fip;
+ u32 fcs;
+ u32 foo;
+ u32 fos;
+ /* 8*10 bytes for each FP-reg = 80 bytes: */
+ u32 st_space[20];
+ u8 ftop;
+ u8 changed;
+ u8 lookahead;
+ u8 no_update;
+ u8 rm;
+ u8 alimit;
+ struct math_emu_info *info;
+ u32 entry_eip;
+};
+
+struct ymmh_struct {
+ /* 16 * 16 bytes for each YMMH-reg = 256 bytes */
+ u32 ymmh_space[64];
+};
+
+struct xsave_hdr_struct {
+ u64 xstate_bv;
+ u64 reserved1[2];
+ u64 reserved2[5];
+} __attribute__((packed));
+
+struct xsave_struct {
+ struct i387_fxsave_struct i387;
+ struct xsave_hdr_struct xsave_hdr;
+ struct ymmh_struct ymmh;
+ /* new processor state extensions will go here */
+} __attribute__ ((packed, aligned (64)));
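[Editor's note: the offsets here are fixed by the XSAVE architecture: the legacy FXSAVE image fills the first 512 bytes, the 64-byte XSAVE header follows, and the YMM high halves land at offset 576. A hedged compile-time sketch, assuming BUILD_BUG_ON and offsetof are in scope:]

    /* Illustrative sketch only: check the architected XSAVE layout. */
    static void __maybe_unused xsave_layout_check(void)
    {
            BUILD_BUG_ON(sizeof(struct i387_fxsave_struct) != 512);
            BUILD_BUG_ON(offsetof(struct xsave_struct, xsave_hdr) != 512);
            BUILD_BUG_ON(offsetof(struct xsave_struct, ymmh) != 576);
    }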
+
+union thread_xstate {
+ struct i387_fsave_struct fsave;
+ struct i387_fxsave_struct fxsave;
+ struct i387_soft_struct soft;
+ struct xsave_struct xsave;
+};
+
+struct fpu {
+ unsigned int last_cpu;
+ unsigned int has_fpu;
+ union thread_xstate *state;
+};
+
+#ifdef CONFIG_X86_64
+#ifndef CONFIG_X86_NO_TSS
+DECLARE_PER_CPU(struct orig_ist, orig_ist);
+#endif
+
+union irq_stack_union {
+ char irq_stack[IRQ_STACK_SIZE];
+ /*
+ * GCC hardcodes the stack canary as %gs:40. Since the
+ * irq_stack is the object at %gs:0, we reserve the bottom
+ * 48 bytes of the irq stack for the canary.
+ */
+ struct {
+ char gs_base[40];
+ unsigned long stack_canary;
+ };
+};
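[Editor's note: since GCC hardcodes the canary load as %gs:40, the layout above only works if stack_canary really lands 40 bytes into the per-cpu object. A one-line sketch of the invariant, assuming BUILD_BUG_ON is in scope:]

    /* Illustrative sketch only: the canary must sit at the %gs:40 slot. */
    static void __maybe_unused canary_offset_check(void)
    {
            BUILD_BUG_ON(offsetof(union irq_stack_union, stack_canary) != 40);
    }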
+
+DECLARE_PER_CPU_FIRST(union irq_stack_union, irq_stack_union);
+DECLARE_INIT_PER_CPU(irq_stack_union);
+
+DECLARE_PER_CPU(char *, irq_stack_ptr);
+DECLARE_PER_CPU(unsigned int, irq_count);
+extern asmlinkage void ignore_sysret(void);
+#else /* X86_64 */
+#ifdef CONFIG_CC_STACKPROTECTOR
+/*
+ * Make sure the stack canary segment base is cache-line aligned:
+ * "For Intel Atom processors, avoid non zero segment base address
+ * that is not aligned to cache line boundary at all cost."
+ * (Optim Ref Manual Assembly/Compiler Coding Rule 15.)
+ */
+struct stack_canary {
+ char __pad[20]; /* canary at %gs:20 */
+ unsigned long canary;
+};
+DECLARE_PER_CPU_ALIGNED(struct stack_canary, stack_canary);
+#endif
+#endif /* X86_64 */
+
+extern unsigned int xstate_size;
+extern void free_thread_xstate(struct task_struct *);
+extern struct kmem_cache *task_xstate_cachep;
+
+struct perf_event;
+
+struct thread_struct {
+ /* Cached TLS descriptors: */
+ struct desc_struct tls_array[GDT_ENTRY_TLS_ENTRIES];
+ unsigned long sp0;
+ unsigned long sp;
+#ifdef CONFIG_X86_32
+ unsigned long sysenter_cs;
+#else
+ unsigned short es;
+ unsigned short ds;
+ unsigned short fsindex;
+ unsigned short gsindex;
+#endif
+#ifdef CONFIG_X86_32
+ unsigned long ip;
+#endif
+#ifdef CONFIG_X86_64
+ unsigned long fs;
+#endif
+ unsigned long gs;
+ /* Save middle states of ptrace breakpoints */
+ struct perf_event *ptrace_bps[HBP_NUM];
+ /* Debug status used for traps, single steps, etc... */
+ unsigned long debugreg6;
+ /* Keep track of the exact dr7 value set by the user */
+ unsigned long ptrace_dr7;
+ /* Fault info: */
+ unsigned long cr2;
+ unsigned long trap_nr;
+ unsigned long error_code;
+ /* floating point and extended processor state */
+ struct fpu fpu;
+#ifdef CONFIG_X86_32
+ /* Virtual 86 mode info */
+ struct vm86_struct __user *vm86_info;
+ unsigned long screen_bitmap;
+ unsigned long v86flags, v86mask, saved_sp0;
+ unsigned int saved_fs, saved_gs;
+#endif
+ /* IO permissions: */
+ unsigned long *io_bitmap_ptr;
+ unsigned long iopl;
+ /* Max allowed port in the bitmap, in bytes: */
+ unsigned io_bitmap_max;
+};
+
+/*
+ * Set IOPL bits in EFLAGS from given mask
+ */
+static inline void xen_set_iopl_mask(unsigned mask)
+{
+ struct physdev_set_iopl set_iopl;
+
+ /* Force the change at ring 0. */
+ set_iopl.iopl = (mask == 0) ? 1 : (mask >> 12) & 3;
+ WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl));
+}
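[Editor's note: to restate the conversion: EFLAGS keeps IOPL in bits 12-13, so the shift-and-mask recovers a 0..3 privilege level, and a zero mask is promoted to 1. A hypothetical helper (iopl_from_mask_demo is not part of the header) showing the mapping:]

    /* Illustrative sketch only:
     *   mask 0x0000 -> iopl 1    mask 0x1000 -> iopl 1
     *   mask 0x2000 -> iopl 2    mask 0x3000 -> iopl 3
     */
    static inline unsigned int iopl_from_mask_demo(unsigned int mask)
    {
            return (mask == 0) ? 1 : (mask >> 12) & 3;
    }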
+
+#ifndef CONFIG_X86_NO_TSS
+static inline void
+native_load_sp0(struct tss_struct *tss, struct thread_struct *thread)
+{
+ tss->x86_tss.sp0 = thread->sp0;
+#ifdef CONFIG_X86_32
+ /* Only happens when SEP is enabled, no need to test "SEP"arately: */
+ if (unlikely(tss->x86_tss.ss1 != thread->sysenter_cs)) {
+ tss->x86_tss.ss1 = thread->sysenter_cs;
+ wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0);
+ }
+#endif
+}
+#else
+#define xen_load_sp0(tss, thread) do { \
+ if (HYPERVISOR_stack_switch(__KERNEL_DS, (thread)->sp0)) \
+ BUG(); \
+} while (0)
+#endif
+
+#define __cpuid xen_cpuid
+#define paravirt_enabled() 1
+
+#define load_sp0 xen_load_sp0
+
+#define set_iopl_mask xen_set_iopl_mask
+
+/*
+ * Save the cr4 feature set we're using (i.e. Pentium 4MB enable and
+ * PPro Global page enable), so that any CPUs that boot up after us
+ * can get the correct flags.
+ */
+extern unsigned long mmu_cr4_features;
+#define trampoline_cr4_features ((u32 *)NULL)
+
+static inline void set_in_cr4(unsigned long mask)
+{
+ unsigned long cr4;
+
+ mmu_cr4_features |= mask;
+ if (trampoline_cr4_features)
+ *trampoline_cr4_features = mmu_cr4_features;
+ cr4 = read_cr4();
+ cr4 |= mask;
+ write_cr4(cr4);
+}
+
+static inline void clear_in_cr4(unsigned long mask)
+{
+ unsigned long cr4;
+
+ mmu_cr4_features &= ~mask;
+ if (trampoline_cr4_features)
+ *trampoline_cr4_features = mmu_cr4_features;
+ cr4 = read_cr4();
+ cr4 &= ~mask;
+ write_cr4(cr4);
+}
+
+typedef struct {
+ unsigned long seg;
+} mm_segment_t;
+
+
+/* Free all resources held by a thread. */
+extern void release_thread(struct task_struct *);
+
+unsigned long get_wchan(struct task_struct *p);
+
+/*
+ * Generic CPUID function.
+ * Clear %ecx, since some CPUs (Cyrix MII) do not set or clear %ecx,
+ * resulting in stale register contents being returned.
+ */
+static inline void cpuid(unsigned int op,
+ unsigned int *eax, unsigned int *ebx,
+ unsigned int *ecx, unsigned int *edx)
+{
+ *eax = op;
+ *ecx = 0;
+ __cpuid(eax, ebx, ecx, edx);
+}
+
+/* Some CPUID calls want 'count' to be placed in ecx */
+static inline void cpuid_count(unsigned int op, int count,
+ unsigned int *eax, unsigned int *ebx,
+ unsigned int *ecx, unsigned int *edx)
+{
+ *eax = op;
+ *ecx = count;
+ __cpuid(eax, ebx, ecx, edx);
+}
+
+/*
+ * CPUID functions returning a single datum
+ */
+static inline unsigned int cpuid_eax(unsigned int op)
+{
+ unsigned int eax, ebx, ecx, edx;
+
+ cpuid(op, &eax, &ebx, &ecx, &edx);
+
+ return eax;
+}
+
+static inline unsigned int cpuid_ebx(unsigned int op)
+{
+ unsigned int eax, ebx, ecx, edx;
+
+ cpuid(op, &eax, &ebx, &ecx, &edx);
+
+ return ebx;
+}
+
+static inline unsigned int cpuid_ecx(unsigned int op)
+{
+ unsigned int eax, ebx, ecx, edx;
+
+ cpuid(op, &eax, &ebx, &ecx, &edx);
+
+ return ecx;
+}
+
+static inline unsigned int cpuid_edx(unsigned int op)
+{
+ unsigned int eax, ebx, ecx, edx;
+
+ cpuid(op, &eax, &ebx, &ecx, &edx);
+
+ return edx;
+}
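[Editor's note: a typical use of these helpers, sketched under the assumption that the usual kernel headers (<linux/string.h>, printk) are available: leaf 0 returns the vendor string spread across %ebx, %edx, %ecx.]

    /* Illustrative sketch only: recover the 12-byte CPU vendor string. */
    static void __maybe_unused print_vendor(void)
    {
            unsigned int eax, ebx, ecx, edx;
            char vendor[13];

            cpuid(0, &eax, &ebx, &ecx, &edx);
            memcpy(vendor + 0, &ebx, 4);    /* e.g. "Genu" */
            memcpy(vendor + 4, &edx, 4);    /* e.g. "ineI" */
            memcpy(vendor + 8, &ecx, 4);    /* e.g. "ntel" */
            vendor[12] = '\0';
            printk(KERN_DEBUG "CPU vendor: %s\n", vendor);
    }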
+
+/* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. */
+static inline void rep_nop(void)
+{
+ asm volatile("rep; nop" ::: "memory");
+}
+
+static inline void cpu_relax(void)
+{
+ rep_nop();
+}
+
+/* Stop speculative execution and prefetching of modified code. */
+static inline void sync_core(void)
+{
+ int tmp;
+
+#ifdef CONFIG_M486
+ /*
+ * Do a CPUID if available, otherwise do a jump. The jump
+ * can conveniently enough be the jump around CPUID.
+ */
+ asm volatile("cmpl %2,%1\n\t"
+ "jl 1f\n\t"
+ "cpuid\n"
+ "1:"
+ : "=a" (tmp)
+ : "rm" (boot_cpu_data.cpuid_level), "ri" (0), "0" (1)
+ : "ebx", "ecx", "edx", "memory");
+#else
+ /*
+ * CPUID is a barrier to speculative execution.
+ * Prefetched instructions are automatically
+ * invalidated when modified.
+ */
+ asm volatile("cpuid"
+ : "=a" (tmp)
+ : "0" (1)
+ : "ebx", "ecx", "edx", "memory");
+#endif
+}
+
+static inline void __monitor(const void *eax, unsigned long ecx,
+ unsigned long edx)
+{
+ /* "monitor %eax, %ecx, %edx;" */
+ asm volatile(".byte 0x0f, 0x01, 0xc8;"
+ :: "a" (eax), "c" (ecx), "d"(edx));
+}
+
+static inline void __mwait(unsigned long eax, unsigned long ecx)
+{
+ /* "mwait %eax, %ecx;" */
+ asm volatile(".byte 0x0f, 0x01, 0xc9;"
+ :: "a" (eax), "c" (ecx));
+}
+
+static inline void __sti_mwait(unsigned long eax, unsigned long ecx)
+{
+ trace_hardirqs_on();
+ /* "mwait %eax, %ecx;" */
+ asm volatile("sti; .byte 0x0f, 0x01, 0xc9;"
+ :: "a" (eax), "c" (ecx));
+}
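[Editor's note: the canonical use of these primitives is a wait-for-store loop: arm the monitor on the flag's cache line, re-check to close the race window, then MWAIT until the line is written or an interrupt arrives. A hedged sketch; real callers check X86_FEATURE_MWAIT first:]

    /* Illustrative sketch only: MONITOR/MWAIT wait-for-flag pattern. */
    static inline void wait_for_flag(volatile int *flag)
    {
            while (!*flag) {
                    __monitor((const void *)flag, 0, 0); /* arm the monitor */
                    if (*flag)          /* re-check to avoid a missed wakeup */
                            break;
                    __mwait(0, 0);      /* sleep until the line is written */
            }
    }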
+
+extern void select_idle_routine(const struct cpuinfo_x86 *c);
+extern void init_amd_e400_c1e_mask(void);
+
+extern unsigned long boot_option_idle_override;
+extern bool amd_e400_c1e_detected;
+
+enum idle_boot_override {IDLE_NO_OVERRIDE=0, IDLE_HALT, IDLE_NOMWAIT,
+ IDLE_POLL};
+
+extern void enable_sep_cpu(void);
+extern int sysenter_setup(void);
+
+extern void early_trap_init(void);
+void early_trap_pf_init(void);
+
+/* Defined in head.S */
+extern struct desc_ptr early_gdt_descr;
+
+extern void cpu_set_gdt(int);
+extern void switch_to_new_gdt(int);
+extern void load_percpu_segment(int);
+extern void cpu_init(void);
+
+static inline unsigned long get_debugctlmsr(void)
+{
+ unsigned long debugctlmsr = 0;
+
+#ifndef CONFIG_X86_DEBUGCTLMSR
+ if (boot_cpu_data.x86 < 6)
+ return 0;
+#endif
+ rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctlmsr);
+
+ return debugctlmsr;
+}
+
+static inline void update_debugctlmsr(unsigned long debugctlmsr)
+{
+#ifndef CONFIG_X86_DEBUGCTLMSR
+ if (boot_cpu_data.x86 < 6)
+ return;
+#endif
+ wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctlmsr);
+}
+
+extern void set_task_blockstep(struct task_struct *task, bool on);
+
+/*
+ * Machine identification from the system description table in the BIOS.
+ * Mostly for MCA use, but others may find it useful:
+ */
+extern unsigned int machine_id;
+extern unsigned int machine_submodel_id;
+extern unsigned int BIOS_revision;
+
+/* Boot loader type from the setup header: */
+extern int bootloader_type;
+extern int bootloader_version;
+
+extern char ignore_fpu_irq;
+
+#define HAVE_ARCH_PICK_MMAP_LAYOUT 1
+#define ARCH_HAS_PREFETCHW
+#define ARCH_HAS_SPINLOCK_PREFETCH
+
+#ifdef CONFIG_X86_32
+# define BASE_PREFETCH ASM_NOP4
+# define ARCH_HAS_PREFETCH
+#else
+# define BASE_PREFETCH "prefetcht0 (%1)"
+#endif
+
+/*
+ * Prefetch instructions for Pentium III (+) and AMD Athlon (+)
+ *
+ * It's not worth caring about 3DNow! prefetches for the K6,
+ * because they are microcoded there and very slow.
+ */
+static inline void prefetch(const void *x)
+{
+ alternative_input(BASE_PREFETCH,
+ "prefetchnta (%1)",
+ X86_FEATURE_XMM,
+ "r" (x));
+}
+
+/*
+ * 3dnow prefetch to get an exclusive cache line.
+ * Useful for spinlocks to avoid one state transition in the
+ * cache coherency protocol:
+ */
+static inline void prefetchw(const void *x)
+{
+ alternative_input(BASE_PREFETCH,
+ "prefetchw (%1)",
+ X86_FEATURE_3DNOW,
+ "r" (x));
+}
+
+static inline void spin_lock_prefetch(const void *x)
+{
+ prefetchw(x);
+}
+
+#ifdef CONFIG_X86_32
+/*
+ * User space process size: 3GB (default).
+ */
+#define TASK_SIZE PAGE_OFFSET
+#define TASK_SIZE_MAX TASK_SIZE
+#define STACK_TOP TASK_SIZE
+#define STACK_TOP_MAX STACK_TOP
+
+#define INIT_THREAD { \
+ .sp0 = sizeof(init_stack) + (long)&init_stack, \
+ .vm86_info = NULL, \
+ .sysenter_cs = __KERNEL_CS, \
+ .io_bitmap_ptr = NULL, \
+}
+
+/*
+ * Note that the .io_bitmap member must be extra-big. This is because
+ * the CPU will access an additional byte beyond the end of the IO
+ * permission bitmap. The extra byte must be all 1 bits, and must
+ * be within the limit.
+ */
+#define INIT_TSS { \
+ .x86_tss = { \
+ .sp0 = sizeof(init_stack) + (long)&init_stack, \
+ .ss0 = __KERNEL_DS, \
+ .ss1 = __KERNEL_CS, \
+ .io_bitmap_base = INVALID_IO_BITMAP_OFFSET, \
+ }, \
+ .io_bitmap = { [0 ... IO_BITMAP_LONGS] = ~0 }, \
+}
+
+extern unsigned long thread_saved_pc(struct task_struct *tsk);
+
+#define THREAD_SIZE_LONGS (THREAD_SIZE/sizeof(unsigned long))
+#define KSTK_TOP(info) \
+({ \
+ unsigned long *__ptr = (unsigned long *)(info); \
+ (unsigned long)(&__ptr[THREAD_SIZE_LONGS]); \
+})
+
+/*
+ * The below -8 is to reserve 8 bytes on top of the ring0 stack.
+ * This is necessary to guarantee that the entire "struct pt_regs"
+ * is accessible even if the CPU hasn't stored the SS/ESP registers
+ * on the stack (an interrupt gate does not save these registers
+ * when switching to the same privilege ring).
+ * Therefore beware: accessing the ss/esp fields of the
+ * "struct pt_regs" is possible, but they may contain
+ * completely wrong values.
+ */
+#define task_pt_regs(task) \
+({ \
+ struct pt_regs *__regs__; \
+ __regs__ = (struct pt_regs *)(KSTK_TOP(task_stack_page(task))-8); \
+ __regs__ - 1; \
+})
+
+#else
+/*
+ * User space process size: 47 bits minus one guard page.
+ */
+#define TASK_SIZE_MAX ((1UL << 47) - PAGE_SIZE)
+
+/* This decides where the kernel will search for a free chunk of vm
+ * space during mmaps.
+ */
+#define IA32_PAGE_OFFSET ((current->personality & ADDR_LIMIT_3GB) ? \
+ 0xc0000000 : 0xFFFFe000)
+
+#define TASK_SIZE (test_thread_flag(TIF_ADDR32) ? \
+ IA32_PAGE_OFFSET : TASK_SIZE_MAX)
+#define TASK_SIZE_OF(child) ((test_tsk_thread_flag(child, TIF_ADDR32)) ? \
+ IA32_PAGE_OFFSET : TASK_SIZE_MAX)
+
+#define STACK_TOP TASK_SIZE
+#define STACK_TOP_MAX TASK_SIZE_MAX
+
+#define INIT_THREAD { \
+ .sp0 = (unsigned long)&init_stack + sizeof(init_stack) \
+}
+
+#define INIT_TSS { \
+ .x86_tss.sp0 = (unsigned long)&init_stack + sizeof(init_stack) \
+}
+
+/*
+ * Return the saved PC of a blocked thread.
+ * What is this good for? It will always be the scheduler or ret_from_fork.
+ */
+#define thread_saved_pc(t) (*(unsigned long *)((t)->thread.sp - 8))
+
+#define task_pt_regs(tsk) ((struct pt_regs *)(tsk)->thread.sp0 - 1)
+
+/*
+ * User space RSP while inside the SYSCALL fast path
+ */
+DECLARE_PER_CPU(unsigned long, old_rsp);
+
+#endif /* CONFIG_X86_64 */
+
+extern void start_thread(struct pt_regs *regs, unsigned long new_ip,
+ unsigned long new_sp);
+
+/*
+ * This decides where the kernel will search for a free chunk of vm
+ * space during mmaps.
+ */
+#define TASK_UNMAPPED_BASE (PAGE_ALIGN(TASK_SIZE / 3))
+
+#define KSTK_EIP(task) (task_pt_regs(task)->ip)
+#define KSTK_ESP(task) (task_pt_regs(task)->sp)
+
+/* Get/set a process' ability to use the timestamp counter instruction */
+#define GET_TSC_CTL(adr) get_tsc_mode((adr))
+#define SET_TSC_CTL(val) set_tsc_mode((val))
+
+extern int get_tsc_mode(unsigned long adr);
+extern int set_tsc_mode(unsigned int val);
+
+extern u16 amd_get_nb_id(int cpu);
+
+#ifndef CONFIG_XEN
+struct aperfmperf {
+ u64 aperf, mperf;
+};
+
+static inline void get_aperfmperf(struct aperfmperf *am)
+{
+ WARN_ON_ONCE(!boot_cpu_has(X86_FEATURE_APERFMPERF));
+
+ rdmsrl(MSR_IA32_APERF, am->aperf);
+ rdmsrl(MSR_IA32_MPERF, am->mperf);
+}
+
+#define APERFMPERF_SHIFT 10
+
+static inline
+unsigned long calc_aperfmperf_ratio(struct aperfmperf *old,
+ struct aperfmperf *new)
+{
+ u64 aperf = new->aperf - old->aperf;
+ u64 mperf = new->mperf - old->mperf;
+ unsigned long ratio = aperf;
+
+ mperf >>= APERFMPERF_SHIFT;
+ if (mperf)
+ ratio = div64_u64(aperf, mperf);
+
+ return ratio;
+}
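[Editor's note: a worked example of the fixed-point arithmetic: APERF/MPERF deltas of 15M and 10M over the same window yield 15000000 / (10000000 >> 10) = 1536, i.e. roughly 1.5 in 10-bit fixed point, meaning the CPU averaged about 150% of its base clock. As a sketch:]

    /* Illustrative sketch only: feed sample deltas through the helper. */
    static inline unsigned long demo_ratio(void)
    {
            struct aperfmperf old = { .aperf = 0, .mperf = 0 };
            struct aperfmperf new = { .aperf = 15000000, .mperf = 10000000 };

            return calc_aperfmperf_ratio(&old, &new); /* ~1536 == ~1.5 << 10 */
    }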
+#endif
+
+extern unsigned long arch_align_stack(unsigned long sp);
+extern void free_init_pages(char *what, unsigned long begin, unsigned long end);
+
+void xen_idle(void);
+#ifdef CONFIG_PARAVIRT_XEN
+bool xen_set_default_idle(void);
+#else
+#define xen_set_default_idle 0
+#endif
+
+void stop_this_cpu(void *dummy);
+
+#endif /* _ASM_X86_PROCESSOR_H */
diff --git a/arch/x86/include/mach-xen/asm/pvclock-abi.h b/arch/x86/include/mach-xen/asm/pvclock-abi.h
new file mode 100644
index 000000000000..0f547456a47c
--- /dev/null
+++ b/arch/x86/include/mach-xen/asm/pvclock-abi.h
@@ -0,0 +1,12 @@
+#ifndef _ASM_X86_PVCLOCK_ABI_H
+#define _ASM_X86_PVCLOCK_ABI_H
+#ifndef __ASSEMBLY__
+
+#include <xen/interface/xen.h>
+
+#define pvclock_vcpu_time_info vcpu_time_info
+struct pvclock_wall_clock; /* not used */
+
+#define PVCLOCK_TSC_STABLE_BIT (1 << 0)
+#endif /* __ASSEMBLY__ */
+#endif /* _ASM_X86_PVCLOCK_ABI_H */
diff --git a/arch/x86/include/mach-xen/asm/setup.h b/arch/x86/include/mach-xen/asm/setup.h
new file mode 100644
index 000000000000..7a174aac3550
--- /dev/null
+++ b/arch/x86/include/mach-xen/asm/setup.h
@@ -0,0 +1,16 @@
+#ifndef __ASSEMBLY__
+
+void xen_start_kernel(void);
+void xen_arch_setup(void);
+
+extern unsigned long xen_initrd_start;
+
+#ifdef CONFIG_EFI
+void efi_probe(void);
+#else
+#define efi_probe() ((void)0)
+#endif
+
+#endif
+
+#include_next <asm/setup.h>
diff --git a/arch/x86/include/mach-xen/asm/smp-processor-id.h b/arch/x86/include/mach-xen/asm/smp-processor-id.h
new file mode 100644
index 000000000000..8a45974719e0
--- /dev/null
+++ b/arch/x86/include/mach-xen/asm/smp-processor-id.h
@@ -0,0 +1,36 @@
+#ifndef _ASM_X86_SMP_PROCESSOR_ID_H
+#define _ASM_X86_SMP_PROCESSOR_ID_H
+
+#if defined(CONFIG_SMP) && !defined(__ASSEMBLY__)
+
+#include <asm/percpu.h>
+
+DECLARE_PER_CPU(int, cpu_number);
+
+/*
+ * This function is needed by all SMP systems. It must _always_ be valid
+ * from the initial startup. We map APIC_BASE very early in page_setup(),
+ * so this is correct in the x86 case.
+ */
+#define raw_smp_processor_id() this_cpu_read_4(cpu_number)
+#define safe_smp_processor_id() smp_processor_id()
+
+#ifdef CONFIG_X86_64_SMP
+#define stack_smp_processor_id() \
+({ \
+ struct thread_info *ti; \
+ __asm__("andq %%rsp,%0; ":"=r" (ti) : "0" (CURRENT_MASK)); \
+ ti->cpu; \
+})
+#endif
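[Editor's note: the masking trick works because kernel stacks are THREAD_SIZE-aligned with struct thread_info at the base, so clearing the low bits of %rsp yields the thread_info pointer. The same derivation in plain C, as a sketch assuming those layout properties hold:]

    /* Illustrative sketch only: derive the CPU number from the stack. */
    static inline int stack_cpu_demo(void)
    {
            unsigned long sp = (unsigned long)&sp; /* stands in for %rsp */
            struct thread_info *ti;

            ti = (struct thread_info *)(sp & ~(THREAD_SIZE - 1));
            return ti->cpu;
    }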
+
+#ifdef CONFIG_DEBUG_PREEMPT
+extern unsigned int debug_smp_processor_id(void);
+# define smp_processor_id() debug_smp_processor_id()
+#else
+# define smp_processor_id() raw_smp_processor_id()
+#endif
+
+#endif /* SMP && !__ASSEMBLY__ */
+
+#endif /* _ASM_X86_SMP_PROCESSOR_ID_H */
diff --git a/arch/x86/include/mach-xen/asm/smp.h b/arch/x86/include/mach-xen/asm/smp.h
new file mode 100644
index 000000000000..4e76b0ea15e9
--- /dev/null
+++ b/arch/x86/include/mach-xen/asm/smp.h
@@ -0,0 +1,240 @@
+#ifndef _ASM_X86_SMP_H
+#define _ASM_X86_SMP_H
+#ifndef __ASSEMBLY__
+#include <linux/cpumask.h>
+#include <linux/init.h>
+#include <asm/percpu.h>
+
+/*
+ * We need the APIC definitions automatically as part of 'smp.h'
+ */
+#ifdef CONFIG_X86_LOCAL_APIC
+# include <asm/mpspec.h>
+# include <asm/apic.h>
+# ifdef CONFIG_X86_IO_APIC
+# include <asm/io_apic.h>
+# endif
+#endif
+#include <linux/thread_info.h>
+#include <asm/cpumask.h>
+#include <asm/cpufeature.h>
+
+extern unsigned int num_processors;
+
+#ifndef CONFIG_XEN
+static inline bool cpu_has_ht_siblings(void)
+{
+ bool has_siblings = false;
+#ifdef CONFIG_SMP
+ has_siblings = cpu_has_ht && smp_num_siblings > 1;
+#endif
+ return has_siblings;
+}
+
+DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_sibling_map);
+DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_core_map);
+/* cpus sharing the last level cache: */
+DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_llc_shared_map);
+DECLARE_PER_CPU_READ_MOSTLY(u16, cpu_llc_id);
+DECLARE_PER_CPU_READ_MOSTLY(int, cpu_number);
+#endif
+
+static inline const struct cpumask *cpu_sibling_mask(int cpu)
+{
+ return cpumask_of(cpu);
+}
+
+static inline const struct cpumask *cpu_core_mask(int cpu)
+{
+ return cpumask_of(cpu);
+}
+
+#ifndef CONFIG_XEN
+static inline struct cpumask *cpu_llc_shared_mask(int cpu)
+{
+ return per_cpu(cpu_llc_shared_map, cpu);
+}
+
+DECLARE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_cpu_to_apicid);
+DECLARE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_bios_cpu_apicid);
+#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_32)
+DECLARE_EARLY_PER_CPU_READ_MOSTLY(int, x86_cpu_to_logical_apicid);
+#endif
+#endif
+
+#ifdef CONFIG_SMP
+
+#ifndef CONFIG_XEN
+
+/* Static state in head.S used to set up a CPU */
+extern unsigned long stack_start; /* Initial stack pointer address */
+
+struct task_struct;
+
+struct smp_ops {
+ void (*smp_prepare_boot_cpu)(void);
+ void (*smp_prepare_cpus)(unsigned max_cpus);
+ void (*smp_cpus_done)(unsigned max_cpus);
+
+ void (*stop_other_cpus)(int wait);
+ void (*smp_send_reschedule)(int cpu);
+
+ int (*cpu_up)(unsigned cpu, struct task_struct *tidle);
+ int (*cpu_disable)(void);
+ void (*cpu_die)(unsigned int cpu);
+ void (*play_dead)(void);
+
+ void (*send_call_func_ipi)(const struct cpumask *mask);
+ void (*send_call_func_single_ipi)(int cpu);
+};
+
+/* Globals due to paravirt */
+extern void set_cpu_sibling_map(int cpu);
+
+extern struct smp_ops smp_ops;
+
+static inline void smp_send_stop(void)
+{
+ smp_ops.stop_other_cpus(0);
+}
+
+static inline void stop_other_cpus(void)
+{
+ smp_ops.stop_other_cpus(1);
+}
+
+static inline void smp_prepare_boot_cpu(void)
+{
+ smp_ops.smp_prepare_boot_cpu();
+}
+
+static inline void smp_prepare_cpus(unsigned int max_cpus)
+{
+ smp_ops.smp_prepare_cpus(max_cpus);
+}
+
+static inline void smp_cpus_done(unsigned int max_cpus)
+{
+ smp_ops.smp_cpus_done(max_cpus);
+}
+
+static inline int __cpu_up(unsigned int cpu, struct task_struct *tidle)
+{
+ return smp_ops.cpu_up(cpu, tidle);
+}
+
+static inline int __cpu_disable(void)
+{
+ return smp_ops.cpu_disable();
+}
+
+static inline void __cpu_die(unsigned int cpu)
+{
+ smp_ops.cpu_die(cpu);
+}
+
+static inline void play_dead(void)
+{
+ smp_ops.play_dead();
+}
+
+static inline void smp_send_reschedule(int cpu)
+{
+ smp_ops.smp_send_reschedule(cpu);
+}
+
+static inline void arch_send_call_function_single_ipi(int cpu)
+{
+ smp_ops.send_call_func_single_ipi(cpu);
+}
+
+static inline void arch_send_call_function_ipi_mask(const struct cpumask *mask)
+{
+ smp_ops.send_call_func_ipi(mask);
+}
+
+void cpu_disable_common(void);
+void native_smp_prepare_boot_cpu(void);
+void native_smp_prepare_cpus(unsigned int max_cpus);
+void native_smp_cpus_done(unsigned int max_cpus);
+int native_cpu_up(unsigned int cpunum, struct task_struct *tidle);
+int native_cpu_disable(void);
+void native_cpu_die(unsigned int cpu);
+void native_play_dead(void);
+void play_dead_common(void);
+void wbinvd_on_cpu(int cpu);
+int wbinvd_on_all_cpus(void);
+void x86_idle_thread_init(unsigned int cpu, struct task_struct *idle);
+
+void smp_store_boot_cpu_info(void);
+void smp_store_cpu_info(int id);
+#define cpu_physical_id(cpu) per_cpu(x86_cpu_to_apicid, cpu)
+
+#else /* CONFIG_XEN */
+
+extern int __cpu_disable(void);
+extern void __cpu_die(unsigned int cpu);
+void xen_stop_other_cpus(int wait);
+void xen_smp_send_reschedule(int cpu);
+void xen_send_call_func_ipi(const struct cpumask *mask);
+void xen_send_call_func_single_ipi(int cpu);
+
+static inline void smp_send_stop(void)
+{
+ xen_stop_other_cpus(0);
+}
+
+#define smp_send_reschedule xen_smp_send_reschedule
+#define arch_send_call_function_single_ipi xen_send_call_func_single_ipi
+#define arch_send_call_function_ipi_mask xen_send_call_func_ipi
+
+void play_dead(void);
+
+#endif /* CONFIG_XEN */
+
+#elif /* !CONFIG_SMP && */ !defined(CONFIG_XEN)
+#define wbinvd_on_cpu(cpu) wbinvd()
+static inline int wbinvd_on_all_cpus(void)
+{
+ wbinvd();
+ return 0;
+}
+#endif /* CONFIG_SMP */
+
+#ifdef CONFIG_XEN
+int wbinvd_on_all_cpus(void);
+#endif
+
+extern unsigned disabled_cpus __cpuinitdata;
+
+#include <asm/smp-processor-id.h>
+
+#if defined(CONFIG_X86_LOCAL_APIC) && !defined(CONFIG_XEN)
+
+#ifndef CONFIG_X86_64
+static inline int logical_smp_processor_id(void)
+{
+ /* we don't want to mark this access volatile - bad code generation */
+ return GET_APIC_LOGICAL_ID(apic_read(APIC_LDR));
+}
+
+#endif
+
+extern int hard_smp_processor_id(void);
+
+#else /* CONFIG_X86_LOCAL_APIC */
+
+# ifndef CONFIG_SMP
+# define hard_smp_processor_id() 0
+# endif
+
+#endif /* CONFIG_X86_LOCAL_APIC */
+
+#ifdef CONFIG_DEBUG_NMI_SELFTEST
+extern void nmi_selftest(void);
+#else
+#define nmi_selftest() do { } while (0)
+#endif
+
+#endif /* __ASSEMBLY__ */
+#endif /* _ASM_X86_SMP_H */
diff --git a/arch/x86/include/mach-xen/asm/special_insns.h b/arch/x86/include/mach-xen/asm/special_insns.h
new file mode 100644
index 000000000000..e84d99c30296
--- /dev/null
+++ b/arch/x86/include/mach-xen/asm/special_insns.h
@@ -0,0 +1,247 @@
+#ifndef _ASM_X86_SPECIAL_INSNS_H
+#define _ASM_X86_SPECIAL_INSNS_H
+
+
+#ifdef __KERNEL__
+
+#include <asm/barrier.h>
+#include <asm/hypervisor.h>
+#include <asm/maddr.h>
+
+DECLARE_PER_CPU(unsigned long, xen_x86_cr0);
+DECLARE_PER_CPU(unsigned long, xen_x86_cr0_upd);
+
+static inline unsigned long xen_read_cr0_upd(void)
+{
+ unsigned long upd = __this_cpu_read_l(xen_x86_cr0_upd);
+ rmb();
+ return upd;
+}
+
+static inline void xen_clear_cr0_upd(void)
+{
+ wmb();
+ __this_cpu_write_l(xen_x86_cr0_upd, 0);
+}
+
+static inline void xen_clts(void)
+{
+ if (unlikely(xen_read_cr0_upd()))
+ HYPERVISOR_fpu_taskswitch(0);
+ else if (__this_cpu_read_4(xen_x86_cr0) & X86_CR0_TS) {
+ __this_cpu_write_4(xen_x86_cr0_upd, X86_CR0_TS);
+ HYPERVISOR_fpu_taskswitch(0);
+ __this_cpu_and_4(xen_x86_cr0, ~X86_CR0_TS);
+ xen_clear_cr0_upd();
+ }
+}
+
+static inline void xen_stts(void)
+{
+ if (unlikely(xen_read_cr0_upd()))
+ HYPERVISOR_fpu_taskswitch(1);
+ else if (!(__this_cpu_read_4(xen_x86_cr0) & X86_CR0_TS)) {
+ __this_cpu_write_4(xen_x86_cr0_upd, X86_CR0_TS);
+ HYPERVISOR_fpu_taskswitch(1);
+ __this_cpu_or_4(xen_x86_cr0, X86_CR0_TS);
+ xen_clear_cr0_upd();
+ }
+}
+
+/*
+ * Volatile isn't enough to prevent the compiler from reordering the
+ * read/write functions for the control registers and messing everything up.
+ * A memory clobber would solve the problem, but would prevent reordering of
+ * all loads/stores around it, which can hurt performance. The solution is to
+ * use a variable and mimic reads and writes to it to enforce serialization.
+ */
+#define __force_order machine_to_phys_nr
+
+static inline unsigned long native_read_cr0(void)
+{
+ unsigned long val;
+ asm volatile("mov %%cr0,%0\n\t" : "=r" (val), "=m" (__force_order));
+ return val;
+}
+
+static inline unsigned long xen_read_cr0(void)
+{
+ return likely(!xen_read_cr0_upd()) ?
+ __this_cpu_read_l(xen_x86_cr0) : native_read_cr0();
+}
+
+static inline void native_write_cr0(unsigned long val)
+{
+ asm volatile("mov %0,%%cr0": : "r" (val), "m" (__force_order));
+}
+
+static inline void xen_write_cr0(unsigned long val)
+{
+ unsigned long upd = val ^ __this_cpu_read_l(xen_x86_cr0);
+
+ if (unlikely(percpu_cmpxchg_op(xen_x86_cr0_upd, 0, upd))) {
+ native_write_cr0(val);
+ return;
+ }
+ switch (upd) {
+ case 0:
+ return;
+ case X86_CR0_TS:
+ HYPERVISOR_fpu_taskswitch(!!(val & X86_CR0_TS));
+ break;
+ default:
+ native_write_cr0(val);
+ break;
+ }
+ __this_cpu_write_l(xen_x86_cr0, val);
+ xen_clear_cr0_upd();
+}
+
+#define xen_read_cr2() vcpu_info_read(arch.cr2)
+#define xen_write_cr2(val) vcpu_info_write(arch.cr2, val)
+
+static inline unsigned long xen_read_cr3(void)
+{
+ unsigned long val;
+ asm volatile("mov %%cr3,%0\n\t" : "=r" (val), "=m" (__force_order));
+#ifdef CONFIG_X86_32
+ return mfn_to_pfn(xen_cr3_to_pfn(val)) << PAGE_SHIFT;
+#else
+ return machine_to_phys(val);
+#endif
+}
+
+static inline void xen_write_cr3(unsigned long val)
+{
+#ifdef CONFIG_X86_32
+ val = xen_pfn_to_cr3(pfn_to_mfn(val >> PAGE_SHIFT));
+#else
+ val = phys_to_machine(val);
+#endif
+ asm volatile("mov %0,%%cr3": : "r" (val), "m" (__force_order));
+}
+
+static inline unsigned long xen_read_cr4(void)
+{
+ unsigned long val;
+ asm volatile("mov %%cr4,%0\n\t" : "=r" (val), "=m" (__force_order));
+ return val;
+}
+
+#define xen_read_cr4_safe() xen_read_cr4()
+
+static inline void xen_write_cr4(unsigned long val)
+{
+ asm volatile("mov %0,%%cr4": : "r" (val), "m" (__force_order));
+}
+
+#ifdef CONFIG_X86_64
+static inline unsigned long xen_read_cr8(void)
+{
+ return 0;
+}
+
+static inline void xen_write_cr8(unsigned long val)
+{
+ BUG_ON(val);
+}
+#endif
+
+static inline void native_wbinvd(void)
+{
+ asm volatile("wbinvd": : :"memory");
+}
+
+extern void xen_load_gs_index(unsigned);
+
+static inline unsigned long read_cr0(void)
+{
+ return xen_read_cr0();
+}
+
+static inline void write_cr0(unsigned long x)
+{
+ xen_write_cr0(x);
+}
+
+static inline unsigned long read_cr2(void)
+{
+ return xen_read_cr2();
+}
+
+static inline void write_cr2(unsigned long x)
+{
+ xen_write_cr2(x);
+}
+
+static inline unsigned long read_cr3(void)
+{
+ return xen_read_cr3();
+}
+
+static inline void write_cr3(unsigned long x)
+{
+ xen_write_cr3(x);
+}
+
+static inline unsigned long read_cr4(void)
+{
+ return xen_read_cr4();
+}
+
+static inline unsigned long read_cr4_safe(void)
+{
+ return xen_read_cr4_safe();
+}
+
+static inline void write_cr4(unsigned long x)
+{
+ xen_write_cr4(x);
+}
+
+static inline void wbinvd(void)
+{
+ native_wbinvd();
+}
+
+#ifdef CONFIG_X86_64
+
+static inline unsigned long read_cr8(void)
+{
+ return xen_read_cr8();
+}
+
+static inline void write_cr8(unsigned long x)
+{
+ xen_write_cr8(x);
+}
+
+static inline void load_gs_index(unsigned selector)
+{
+ xen_load_gs_index(selector);
+}
+
+#endif
+
+/* Clear the 'TS' bit */
+static inline void clts(void)
+{
+ xen_clts();
+}
+
+static inline void stts(void)
+{
+ xen_stts();
+}
+
+static inline void clflush(volatile void *__p)
+{
+ asm volatile("clflush %0" : "+m" (*(volatile char __force *)__p));
+}
+
+#define nop() asm volatile ("nop")
+
+
+#endif /* __KERNEL__ */
+
+#endif /* _ASM_X86_SPECIAL_INSNS_H */
diff --git a/arch/x86/include/mach-xen/asm/spinlock.h b/arch/x86/include/mach-xen/asm/spinlock.h
new file mode 100644
index 000000000000..76800f8c1a92
--- /dev/null
+++ b/arch/x86/include/mach-xen/asm/spinlock.h
@@ -0,0 +1,367 @@
+#ifndef _ASM_X86_SPINLOCK_H
+#define _ASM_X86_SPINLOCK_H
+
+#include <linux/atomic.h>
+#include <asm/page.h>
+#include <asm/processor.h>
+#include <linux/compiler.h>
+
+/*
+ * Your basic SMP spinlocks, allowing only a single CPU anywhere
+ *
+ * Simple spin lock operations. There are two variants: one clears IRQs
+ * on the local processor, one does not.
+ *
+ * These are fair FIFO ticket locks, which support up to 2^16 CPUs.
+ *
+ * (the type definitions are in asm/spinlock_types.h)
+ */
+
+#ifdef CONFIG_X86_32
+# define LOCK_PTR_REG "a"
+#else
+# define LOCK_PTR_REG "D"
+#endif
+
+#if defined(CONFIG_XEN) || (defined(CONFIG_X86_32) && \
+ (defined(CONFIG_X86_OOSTORE) || defined(CONFIG_X86_PPRO_FENCE)))
+/*
+ * On Xen, as we read back the result of the unlocking increment, we must use
+ * a locked access (or insert a full memory barrier) in all cases (so that we
+ * read what is globally visible).
+ *
+ * On PPro SMP or if we are using OOSTORE, we use a locked operation to unlock
+ * (PPro errata 66, 92)
+ */
+# define UNLOCK_LOCK_PREFIX LOCK_PREFIX
+#else
+# define UNLOCK_LOCK_PREFIX
+#endif
+
+#ifdef TICKET_SHIFT
+
+#include <asm/irqflags.h>
+#include <asm/smp-processor-id.h>
+
+int xen_spinlock_init(unsigned int cpu);
+void xen_spinlock_cleanup(unsigned int cpu);
+#if CONFIG_XEN_SPINLOCK_ACQUIRE_NESTING
+struct __raw_tickets xen_spin_adjust(const arch_spinlock_t *,
+ struct __raw_tickets);
+#else
+#define xen_spin_adjust(lock, raw_tickets) (raw_tickets)
+#define xen_spin_wait(l, t, f) xen_spin_wait(l, t)
+#endif
+unsigned int xen_spin_wait(arch_spinlock_t *, struct __raw_tickets *,
+ unsigned int flags);
+void xen_spin_kick(const arch_spinlock_t *, unsigned int ticket);
+
+/*
+ * Ticket locks are conceptually two parts, one indicating the current head of
+ * the queue, and the other indicating the current tail. The lock is acquired
+ * by atomically noting the tail and incrementing it by one (thus adding
+ * ourselves to the queue and noting our position), then waiting until the head
+ * becomes equal to the initial value of the tail.
+ *
+ * We use an xadd covering *both* parts of the lock, to increment the tail and
+ * also load the position of the head, which takes care of memory ordering
+ * issues and should be optimal for the uncontended case. Note the tail must be
+ * in the high part, because a wide xadd increment of the low part would carry
+ * up and contaminate the high part.
+ */
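[Editor's note: for readers new to ticket locks, a minimal userspace rendering of the scheme, hedged and simplified: it xadds only the tail (the kernel's xadd covers head and tail in one access) and omits the Xen sleep/kick slow path entirely.]

    /* Illustrative sketch only: a toy ticket lock with GCC builtins. */
    #include <stdint.h>

    struct tkt { uint8_t head, tail; };

    static void tkt_lock(struct tkt *l)
    {
            /* Take a ticket; the old tail is our queue position. */
            uint8_t me = __atomic_fetch_add(&l->tail, 1, __ATOMIC_ACQUIRE);

            while (__atomic_load_n(&l->head, __ATOMIC_ACQUIRE) != me)
                    __builtin_ia32_pause();  /* rep;nop, as in cpu_relax() */
    }

    static void tkt_unlock(struct tkt *l)
    {
            __atomic_fetch_add(&l->head, 1, __ATOMIC_RELEASE);
    }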
+#define __spin_count_dec(c, l) (vcpu_running((l)->owner) ? --(c) : ((c) >>= 1))
+
+#if CONFIG_XEN_SPINLOCK_ACQUIRE_NESTING
+static __always_inline void __ticket_spin_lock(arch_spinlock_t *lock)
+{
+ struct __raw_tickets inc = { .tail = 1 };
+ unsigned int count, flags = arch_local_irq_save();
+
+ inc = xadd(&lock->tickets, inc);
+ if (likely(inc.head == inc.tail))
+ arch_local_irq_restore(flags);
+ else {
+ inc = xen_spin_adjust(lock, inc);
+ arch_local_irq_restore(flags);
+ count = 1 << 12;
+ do {
+ while (inc.head != inc.tail
+ && __spin_count_dec(count, lock)) {
+ cpu_relax();
+ inc.head = ACCESS_ONCE(lock->tickets.head);
+ }
+ } while (unlikely(!count)
+ && (count = xen_spin_wait(lock, &inc, flags)));
+ }
+ barrier(); /* make sure nothing creeps before the lock is taken */
+ lock->owner = raw_smp_processor_id();
+}
+#else
+#define __ticket_spin_lock(lock) __ticket_spin_lock_flags(lock, -1)
+#endif
+
+static __always_inline void __ticket_spin_lock_flags(arch_spinlock_t *lock,
+ unsigned long flags)
+{
+ struct __raw_tickets inc = { .tail = 1 };
+
+ inc = xadd(&lock->tickets, inc);
+ if (unlikely(inc.head != inc.tail)) {
+ unsigned int count = 1 << 12;
+
+ inc = xen_spin_adjust(lock, inc);
+ do {
+ while (inc.head != inc.tail
+ && __spin_count_dec(count, lock)) {
+ cpu_relax();
+ inc.head = ACCESS_ONCE(lock->tickets.head);
+ }
+ } while (unlikely(!count)
+ && (count = xen_spin_wait(lock, &inc, flags)));
+ }
+ barrier(); /* make sure nothing creeps before the lock is taken */
+ lock->owner = raw_smp_processor_id();
+}
+
+#undef __spin_count_dec
+
+static __always_inline int __ticket_spin_trylock(arch_spinlock_t *lock)
+{
+ arch_spinlock_t old;
+
+ old.tickets = ACCESS_ONCE(lock->tickets);
+ if (old.tickets.head != old.tickets.tail)
+ return 0;
+
+ /* cmpxchg is a full barrier, so nothing can move before it */
+ if (cmpxchg(&lock->head_tail, old.head_tail,
+ old.head_tail + (1 << TICKET_SHIFT)) != old.head_tail)
+ return 0;
+ lock->owner = raw_smp_processor_id();
+ return 1;
+}
+
+static __always_inline void __ticket_spin_unlock(arch_spinlock_t *lock)
+{
+ register struct __raw_tickets new;
+
+ __add(&lock->tickets.head, 1, UNLOCK_LOCK_PREFIX);
+#if !defined(XEN_SPINLOCK_SOURCE) || !CONFIG_XEN_SPINLOCK_ACQUIRE_NESTING
+# undef UNLOCK_LOCK_PREFIX
+#endif
+ new = ACCESS_ONCE(lock->tickets);
+ if (new.head != new.tail)
+ xen_spin_kick(lock, new.head);
+}
+
+static inline int __ticket_spin_is_locked(arch_spinlock_t *lock)
+{
+ struct __raw_tickets tmp = ACCESS_ONCE(lock->tickets);
+
+ return tmp.tail != tmp.head;
+}
+
+static inline int __ticket_spin_is_contended(arch_spinlock_t *lock)
+{
+ struct __raw_tickets tmp = ACCESS_ONCE(lock->tickets);
+
+ return (__ticket_t)(tmp.tail - tmp.head) > 1;
+}
+
+#define __arch_spin(n) __ticket_spin_##n
+
+#else /* TICKET_SHIFT */
+
+static inline int xen_spinlock_init(unsigned int cpu) { return 0; }
+static inline void xen_spinlock_cleanup(unsigned int cpu) {}
+
+static inline int __byte_spin_is_locked(arch_spinlock_t *lock)
+{
+ return lock->lock != 0;
+}
+
+static inline int __byte_spin_is_contended(arch_spinlock_t *lock)
+{
+ return lock->spinners != 0;
+}
+
+static inline void __byte_spin_lock(arch_spinlock_t *lock)
+{
+ s8 val = 1;
+
+ asm("1: xchgb %1, %0\n"
+ " test %1,%1\n"
+ " jz 3f\n"
+ " " LOCK_PREFIX "incb %2\n"
+ "2: rep;nop\n"
+ " cmpb $1, %0\n"
+ " je 2b\n"
+ " " LOCK_PREFIX "decb %2\n"
+ " jmp 1b\n"
+ "3:"
+ : "+m" (lock->lock), "+q" (val), "+m" (lock->spinners): : "memory");
+}
+
+#define __byte_spin_lock_flags(lock, flags) __byte_spin_lock(lock)
+
+static inline int __byte_spin_trylock(arch_spinlock_t *lock)
+{
+ u8 old = 1;
+
+ asm("xchgb %1,%0"
+ : "+m" (lock->lock), "+q" (old) : : "memory");
+
+ return old == 0;
+}
+
+static inline void __byte_spin_unlock(arch_spinlock_t *lock)
+{
+ smp_wmb();
+ lock->lock = 0;
+}
+
+#define __arch_spin(n) __byte_spin_##n
+
+#endif /* TICKET_SHIFT */
+
+static inline int arch_spin_is_locked(arch_spinlock_t *lock)
+{
+ return __arch_spin(is_locked)(lock);
+}
+
+static inline int arch_spin_is_contended(arch_spinlock_t *lock)
+{
+ return __arch_spin(is_contended)(lock);
+}
+#define arch_spin_is_contended arch_spin_is_contended
+
+static __always_inline void arch_spin_lock(arch_spinlock_t *lock)
+{
+ __arch_spin(lock)(lock);
+}
+
+static __always_inline int arch_spin_trylock(arch_spinlock_t *lock)
+{
+ return __arch_spin(trylock)(lock);
+}
+
+static __always_inline void arch_spin_unlock(arch_spinlock_t *lock)
+{
+ __arch_spin(unlock)(lock);
+}
+
+static __always_inline void arch_spin_lock_flags(arch_spinlock_t *lock,
+ unsigned long flags)
+{
+ __arch_spin(lock_flags)(lock, flags);
+}
+
+#undef __arch_spin
+
+static inline void arch_spin_unlock_wait(arch_spinlock_t *lock)
+{
+ while (arch_spin_is_locked(lock))
+ cpu_relax();
+}
+
+/*
+ * Read-write spinlocks, allowing multiple readers
+ * but only one writer.
+ *
+ * NOTE! It is quite common to have readers in interrupts
+ * but no interrupt writers. For those circumstances we
+ * can "mix" irq-safe locks - any writer needs to get an
+ * irq-safe write-lock, but readers can get non-irqsafe
+ * read-locks.
+ *
+ * On x86, we implement read-write locks as a 32-bit counter
+ * with the high bit (sign) being the "contended" bit.
+ */
+
+/**
+ * read_can_lock - would read_trylock() succeed?
+ * @lock: the rwlock in question.
+ */
+static inline int arch_read_can_lock(arch_rwlock_t *lock)
+{
+ return lock->lock > 0;
+}
+
+/**
+ * write_can_lock - would write_trylock() succeed?
+ * @lock: the rwlock in question.
+ */
+static inline int arch_write_can_lock(arch_rwlock_t *lock)
+{
+ return lock->write == WRITE_LOCK_CMP;
+}
+
+static inline void arch_read_lock(arch_rwlock_t *rw)
+{
+ asm volatile(LOCK_PREFIX READ_LOCK_SIZE(dec) " (%0)\n\t"
+ "jns 1f\n"
+ "call __read_lock_failed\n\t"
+ "1:\n"
+ ::LOCK_PTR_REG (rw) : "memory");
+}
+
+static inline void arch_write_lock(arch_rwlock_t *rw)
+{
+ asm volatile(LOCK_PREFIX WRITE_LOCK_SUB(%1) "(%0)\n\t"
+ "jz 1f\n"
+ "call __write_lock_failed\n\t"
+ "1:\n"
+ ::LOCK_PTR_REG (&rw->write), "i" (RW_LOCK_BIAS)
+ : "memory");
+}
+
+static inline int arch_read_trylock(arch_rwlock_t *lock)
+{
+ READ_LOCK_ATOMIC(t) *count = (READ_LOCK_ATOMIC(t) *)lock;
+
+ if (READ_LOCK_ATOMIC(dec_return)(count) >= 0)
+ return 1;
+ READ_LOCK_ATOMIC(inc)(count);
+ return 0;
+}
+
+static inline int arch_write_trylock(arch_rwlock_t *lock)
+{
+ atomic_t *count = (atomic_t *)&lock->write;
+
+ if (atomic_sub_and_test(WRITE_LOCK_CMP, count))
+ return 1;
+ atomic_add(WRITE_LOCK_CMP, count);
+ return 0;
+}
+
+static inline void arch_read_unlock(arch_rwlock_t *rw)
+{
+ asm volatile(LOCK_PREFIX READ_LOCK_SIZE(inc) " %0"
+ :"+m" (rw->lock) : : "memory");
+}
+
+static inline void arch_write_unlock(arch_rwlock_t *rw)
+{
+ asm volatile(LOCK_PREFIX WRITE_LOCK_ADD(%1) "%0"
+ : "+m" (rw->write) : "i" (RW_LOCK_BIAS) : "memory");
+}
+
+#define arch_read_lock_flags(lock, flags) arch_read_lock(lock)
+#define arch_write_lock_flags(lock, flags) arch_write_lock(lock)
+
+#undef READ_LOCK_SIZE
+#undef READ_LOCK_ATOMIC
+#undef WRITE_LOCK_ADD
+#undef WRITE_LOCK_SUB
+#undef WRITE_LOCK_CMP
+
+#define arch_spin_relax(lock) cpu_relax()
+#define arch_read_relax(lock) cpu_relax()
+#define arch_write_relax(lock) cpu_relax()
+
+/* The {read|write|spin}_lock() on x86 are full memory barriers. */
+static inline void smp_mb__after_lock(void) { }
+#define ARCH_HAS_SMP_MB_AFTER_LOCK
+
+#endif /* _ASM_X86_SPINLOCK_H */
diff --git a/arch/x86/include/mach-xen/asm/spinlock_types.h b/arch/x86/include/mach-xen/asm/spinlock_types.h
new file mode 100644
index 000000000000..d78bbc0f828e
--- /dev/null
+++ b/arch/x86/include/mach-xen/asm/spinlock_types.h
@@ -0,0 +1,62 @@
+#ifndef _ASM_X86_SPINLOCK_TYPES_H
+#define _ASM_X86_SPINLOCK_TYPES_H
+
+#ifndef __LINUX_SPINLOCK_TYPES_H
+# error "please don't include this file directly"
+#endif
+
+#include <linux/types.h>
+
+#ifdef CONFIG_XEN_SPINLOCK_ACQUIRE_NESTING
+/*
+ * On Xen we support CONFIG_XEN_SPINLOCK_ACQUIRE_NESTING levels of
+ * interrupt re-enabling per IRQ-safe lock. Hence we can have
+ * (CONFIG_XEN_SPINLOCK_ACQUIRE_NESTING + 1) times as many outstanding
+ * tickets. Thus the cut-off for using byte register pairs must be at
+ * a sufficiently smaller number of CPUs.
+ */
+#if (CONFIG_XEN_SPINLOCK_ACQUIRE_NESTING + 1) * CONFIG_NR_CPUS < 256
+typedef u8 __ticket_t;
+# define TICKET_SHIFT 8
+typedef u16 __ticketpair_t;
+#else
+typedef u16 __ticket_t;
+# define TICKET_SHIFT 16
+typedef u32 __ticketpair_t;
+#endif
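[Editor's note: to make the cut-off concrete: with CONFIG_XEN_SPINLOCK_ACQUIRE_NESTING = 1, NR_CPUS = 100 gives (1+1)*100 = 200 < 256, so u8 tickets still fit, while NR_CPUS = 128 gives exactly 256 and forces the 16-bit variants. A sketch with the numbers inlined:]

    /* Illustrative sketch only: how the arithmetic resolves. */
    #if (1 + 1) * 128 < 256
    typedef u8 demo_ticket_t;       /* not taken for NR_CPUS = 128 */
    #else
    typedef u16 demo_ticket_t;      /* 256 outstanding tickets need 16 bits */
    #endif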
+
+typedef union {
+ __ticketpair_t head_tail;
+ struct {
+ struct __raw_tickets {
+ __ticket_t head, tail;
+ } tickets;
+#if CONFIG_NR_CPUS <= 256
+ u8 owner;
+#else
+ u16 owner;
+#endif
+ };
+#else /* ndef CONFIG_XEN_SPINLOCK_ACQUIRE_NESTING */
+typedef struct {
+/*
+ * This differs from the pre-2.6.24 spinlock by always using xchgb
+ * rather than decb to take the lock; this allows it to use a
+ * zero-initialized lock structure. It also maintains a 1-byte
+ * contention counter, so that we can implement
+ * __byte_spin_is_contended.
+ */
+ u8 lock;
+#if CONFIG_NR_CPUS < 256
+ u8 spinners;
+#else
+# error NR_CPUS >= 256 not implemented
+#endif
+#endif /* def CONFIG_XEN_SPINLOCK_ACQUIRE_NESTING */
+} arch_spinlock_t;
+
+#define __ARCH_SPIN_LOCK_UNLOCKED { 0 }
+
+#include <asm/rwlock.h>
+
+#endif /* _ASM_X86_SPINLOCK_TYPES_H */
diff --git a/arch/x86/include/mach-xen/asm/swiotlb.h b/arch/x86/include/mach-xen/asm/swiotlb.h
new file mode 100644
index 000000000000..e82aad1ac108
--- /dev/null
+++ b/arch/x86/include/mach-xen/asm/swiotlb.h
@@ -0,0 +1,8 @@
+#include_next <asm/swiotlb.h>
+
+#ifndef CONFIG_SWIOTLB
+#define swiotlb_init(verbose) ((void)(verbose))
+#endif
+
+dma_addr_t swiotlb_map_single_phys(struct device *, phys_addr_t, size_t size,
+ int dir);
diff --git a/arch/x86/include/mach-xen/asm/switch_to.h b/arch/x86/include/mach-xen/asm/switch_to.h
new file mode 100644
index 000000000000..4c1d7294782d
--- /dev/null
+++ b/arch/x86/include/mach-xen/asm/switch_to.h
@@ -0,0 +1,9 @@
+#ifndef _ASM_X86_SWITCH_TO_H
+
+#define __switch_to_xtra(prev, next, tss) __switch_to_xtra(prev, next)
+
+#include_next <asm/switch_to.h>
+
+#undef __switch_to_xtra
+
+#endif /* _ASM_X86_SWITCH_TO_H */
diff --git a/arch/x86/include/mach-xen/asm/time.h b/arch/x86/include/mach-xen/asm/time.h
new file mode 100644
index 000000000000..d898756acbc7
--- /dev/null
+++ b/arch/x86/include/mach-xen/asm/time.h
@@ -0,0 +1,18 @@
+#ifndef _XEN_ASM_TIME_H
+#define _XEN_ASM_TIME_H
+
+unsigned long xen_read_wallclock(void);
+int xen_write_wallclock(unsigned long);
+
+struct timespec;
+#ifdef CONFIG_XEN_PRIVILEGED_GUEST
+int xen_update_wallclock(const struct timespec *);
+#else
+static inline int xen_update_wallclock(const struct timespec *tv)
+{
+ return -EPERM;
+}
+#endif
+
+#endif /* _XEN_ASM_TIME_H */
+
+#include_next <asm/time.h>
diff --git a/arch/x86/include/mach-xen/asm/tlbflush.h b/arch/x86/include/mach-xen/asm/tlbflush.h
new file mode 100644
index 000000000000..8af6cda95c0b
--- /dev/null
+++ b/arch/x86/include/mach-xen/asm/tlbflush.h
@@ -0,0 +1,114 @@
+#ifndef _ASM_X86_TLBFLUSH_H
+#define _ASM_X86_TLBFLUSH_H
+
+#include <linux/mm.h>
+#include <linux/sched.h>
+
+#include <asm/processor.h>
+#include <asm/special_insns.h>
+
+#define __flush_tlb() xen_tlb_flush()
+#define __flush_tlb_global() xen_tlb_flush()
+#define __flush_tlb_single(addr) xen_invlpg(addr)
+#define __flush_tlb_all() xen_tlb_flush()
+#define __flush_tlb_one(addr) xen_invlpg(addr)
+
+#define TLB_FLUSH_ALL -1UL
+
+/*
+ * TLB flushing:
+ *
+ * - flush_tlb() flushes the current mm struct TLBs
+ * - flush_tlb_all() flushes all processes' TLBs
+ * - flush_tlb_mm(mm) flushes the specified mm context TLBs
+ * - flush_tlb_page(vma, vmaddr) flushes one page
+ * - flush_tlb_range(vma, start, end) flushes a range of pages
+ * - flush_tlb_kernel_range(start, end) flushes a range of kernel pages
+ *
+ * ..but the i386 has somewhat limited tlb flushing capabilities,
+ * and page-granular flushes are available only on i486 and up.
+ */
+
+#ifndef CONFIG_SMP
+
+#define flush_tlb() __flush_tlb()
+#define flush_tlb_all() __flush_tlb_all()
+#define local_flush_tlb() __flush_tlb()
+
+static inline void flush_tlb_mm(struct mm_struct *mm)
+{
+ if (mm == current->active_mm)
+ __flush_tlb();
+}
+
+static inline void flush_tlb_page(struct vm_area_struct *vma,
+ unsigned long addr)
+{
+ if (vma->vm_mm == current->active_mm)
+ __flush_tlb_one(addr);
+}
+
+static inline void flush_tlb_range(struct vm_area_struct *vma,
+ unsigned long start, unsigned long end)
+{
+ if (vma->vm_mm == current->active_mm)
+ __flush_tlb();
+}
+
+static inline void flush_tlb_mm_range(struct mm_struct *mm,
+ unsigned long start, unsigned long end, unsigned long vmflag)
+{
+ if (mm == current->active_mm)
+ __flush_tlb();
+}
+
+static inline void reset_lazy_tlbstate(void)
+{
+}
+
+static inline void flush_tlb_kernel_range(unsigned long start,
+ unsigned long end)
+{
+ flush_tlb_all();
+}
+
+#else /* SMP */
+
+#include <asm/smp.h>
+
+#define local_flush_tlb() __flush_tlb()
+
+#define flush_tlb_mm(mm) flush_tlb_mm_range(mm, 0UL, TLB_FLUSH_ALL, 0UL)
+
+#define flush_tlb_range(vma, start, end) \
+ flush_tlb_mm_range((vma)->vm_mm, start, end, (vma)->vm_flags)
+
+#define flush_tlb_all xen_tlb_flush_all
+#define flush_tlb_current_task() xen_tlb_flush_mask(mm_cpumask(current->mm))
+#define flush_tlb_page(vma, va) xen_invlpg_mask(mm_cpumask((vma)->vm_mm), va)
+extern void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
+ unsigned long end, unsigned long vmflag);
+extern void flush_tlb_kernel_range(unsigned long start, unsigned long end);
+
+#define flush_tlb() flush_tlb_current_task()
+
+#ifndef CONFIG_XEN
+#define TLBSTATE_OK 1
+#define TLBSTATE_LAZY 2
+
+struct tlb_state {
+ struct mm_struct *active_mm;
+ int state;
+};
+DECLARE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate);
+
+static inline void reset_lazy_tlbstate(void)
+{
+ this_cpu_write(cpu_tlbstate.state, 0);
+ this_cpu_write(cpu_tlbstate.active_mm, &init_mm);
+}
+#endif
+
+#endif /* SMP */
+
+#endif /* _ASM_X86_TLBFLUSH_H */
diff --git a/arch/x86/include/mach-xen/asm/tsc.h b/arch/x86/include/mach-xen/asm/tsc.h
new file mode 100644
index 000000000000..1ef18c25eb3d
--- /dev/null
+++ b/arch/x86/include/mach-xen/asm/tsc.h
@@ -0,0 +1,5 @@
+#undef check_tsc_unstable
+#define check_tsc_unstable _check_tsc_unstable_
+#include_next <asm/tsc.h>
+#undef check_tsc_unstable
+#define check_tsc_unstable() WARN_ON(true)
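[Editor's note: the pattern above is the wrapper-header idiom: rename the symbol before pulling in the generic header with #include_next, then rebind the name so callers get the override. A sketch with invented names (wrapper.h, real_func, do_something_else are all hypothetical):]

    /* Illustrative sketch only: shadow real_func() from the next header. */
    #define real_func _real_func_hidden   /* generic header defines this name */
    #include_next <wrapper.h>
    #undef real_func
    #define real_func() do_something_else()  /* callers now get the override */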
diff --git a/arch/x86/include/mach-xen/asm/vga.h b/arch/x86/include/mach-xen/asm/vga.h
new file mode 100644
index 000000000000..eee9832ebb20
--- /dev/null
+++ b/arch/x86/include/mach-xen/asm/vga.h
@@ -0,0 +1,26 @@
+/*
+ * Access to VGA videoram
+ *
+ * (c) 1998 Martin Mares <mj@ucw.cz>
+ */
+
+#ifndef _ASM_X86_VGA_H
+#define _ASM_X86_VGA_H
+
+/*
+ * On the PC, we can just recalculate addresses and then
+ * access the videoram directly without any black magic.
+ */
+
+#define VGA_MAP_MEM(x, s) (unsigned long)isa_bus_to_virt(x)
+
+#define vga_readb(x) (*(x))
+#define vga_writeb(x, y) (*(y) = (x))
+
+#ifdef CONFIG_FB_EFI
+#define __ARCH_HAS_VGA_DEFAULT_DEVICE
+extern struct pci_dev *vga_default_device(void);
+extern void vga_set_default_device(struct pci_dev *pdev);
+#endif
+
+#endif /* _ASM_X86_VGA_H */
diff --git a/arch/x86/include/mach-xen/asm/xenoprof.h b/arch/x86/include/mach-xen/asm/xenoprof.h
new file mode 100644
index 000000000000..2733e00ee46b
--- /dev/null
+++ b/arch/x86/include/mach-xen/asm/xenoprof.h
@@ -0,0 +1,48 @@
+/******************************************************************************
+ * asm-i386/mach-xen/asm/xenoprof.h
+ *
+ * Copyright (c) 2006 Isaku Yamahata <yamahata at valinux co jp>
+ * VA Linux Systems Japan K.K.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ */
+#ifndef __ASM_XENOPROF_H__
+#define __ASM_XENOPROF_H__
+#ifdef CONFIG_XEN
+
+struct super_block;
+struct dentry;
+int xenoprof_create_files(struct super_block * sb, struct dentry * root);
+#define HAVE_XENOPROF_CREATE_FILES
+
+struct xenoprof_init;
+void xenoprof_arch_init_counter(struct xenoprof_init *init);
+void xenoprof_arch_counter(void);
+void xenoprof_arch_start(void);
+void xenoprof_arch_stop(void);
+
+struct xenoprof_arch_shared_buffer {
+ /* nothing */
+};
+struct xenoprof_shared_buffer;
+void xenoprof_arch_unmap_shared_buffer(struct xenoprof_shared_buffer* sbuf);
+struct xenoprof_get_buffer;
+int xenoprof_arch_map_shared_buffer(struct xenoprof_get_buffer* get_buffer, struct xenoprof_shared_buffer* sbuf);
+struct xenoprof_passive;
+int xenoprof_arch_set_passive(struct xenoprof_passive* pdomain, struct xenoprof_shared_buffer* sbuf);
+
+#endif /* CONFIG_XEN */
+#endif /* __ASM_XENOPROF_H__ */
diff --git a/arch/x86/include/mach-xen/asm/xor.h b/arch/x86/include/mach-xen/asm/xor.h
new file mode 100644
index 000000000000..094381dce006
--- /dev/null
+++ b/arch/x86/include/mach-xen/asm/xor.h
@@ -0,0 +1,28 @@
+#ifndef _ASM_X86_XEN_XOR_H
+#define _ASM_X86_XEN_XOR_H
+
+#include_next <asm/xor.h>
+
+#undef XOR_SELECT_TEMPLATE
+
+#ifdef CONFIG_X86_64
+
+/* Also try the generic routines. */
+#undef XOR_TRY_TEMPLATES
+#include <asm-generic/xor.h>
+
+#undef XOR_TRY_TEMPLATES
+#define XOR_TRY_TEMPLATES \
+do { \
+ xor_speed(&xor_block_8regs); \
+ xor_speed(&xor_block_8regs_p); \
+ xor_speed(&xor_block_32regs); \
+ xor_speed(&xor_block_32regs_p); \
+ xor_speed(&xor_block_sse); \
+ xor_speed(&xor_block_sse_pf64); \
+ AVX_XOR_SPEED; \
+} while (0)
+
+#endif
+
+#endif /* _ASM_X86_XEN_XOR_H */
diff --git a/arch/x86/include/uapi/asm/e820.h b/arch/x86/include/uapi/asm/e820.h
index bbae02470701..28d20702b377 100644
--- a/arch/x86/include/uapi/asm/e820.h
+++ b/arch/x86/include/uapi/asm/e820.h
@@ -60,7 +60,11 @@ struct e820map {
struct e820entry map[E820_X_MAX];
};
+#ifndef CONFIG_XEN
#define ISA_START_ADDRESS 0xa0000
+#else
+#define ISA_START_ADDRESS 0
+#endif
#define ISA_END_ADDRESS 0x100000
#define BIOS_BEGIN 0x000a0000
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 7bd3bd310106..49f603761b3d 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -103,6 +103,8 @@ obj-$(CONFIG_UPROBES) += uprobes.o
obj-$(CONFIG_PERF_EVENTS) += perf_regs.o
+obj-$(CONFIG_X86_XEN) += fixup.o
+
###
# 64 bit specific files
ifeq ($(CONFIG_X86_64),y)
@@ -114,3 +116,7 @@ ifeq ($(CONFIG_X86_64),y)
obj-$(CONFIG_PCI_MMCONFIG) += mmconf-fam10h_64.o
obj-y += vsmp_64.o
endif
+
+disabled-obj-$(CONFIG_XEN) := crash.o early-quirks.o i8237.o i8253.o i8259.o \
+ irqinit.o pci-swiotlb.o reboot.o smpboot.o trampoline%.o tsc%.o vsmp%.o
+disabled-obj-$(CONFIG_XEN_UNPRIVILEGED_GUEST) += probe_roms.o
diff --git a/arch/x86/kernel/acpi/Makefile b/arch/x86/kernel/acpi/Makefile
index 163b22581472..8ea66684ca0c 100644
--- a/arch/x86/kernel/acpi/Makefile
+++ b/arch/x86/kernel/acpi/Makefile
@@ -3,5 +3,9 @@ obj-$(CONFIG_ACPI_SLEEP) += sleep.o wakeup_$(BITS).o
ifneq ($(CONFIG_ACPI_PROCESSOR),)
obj-y += cstate.o
+ifneq ($(CONFIG_PROCESSOR_EXTERNAL_CONTROL),)
+obj-$(CONFIG_XEN) += processor_extcntl_xen.o
+endif
endif
+disabled-obj-$(CONFIG_XEN) := cstate.o sleep.o wakeup_%.o
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index c85342b763c7..8954ba036e4f 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -69,6 +69,7 @@ int acpi_strict;
u8 acpi_sci_flags __initdata;
int acpi_sci_override_gsi __initdata;
+#ifndef CONFIG_XEN
int acpi_skip_timer_override __initdata;
int acpi_use_timer_override __initdata;
int acpi_fix_pin2_polarity __initdata;
@@ -76,6 +77,10 @@ int acpi_fix_pin2_polarity __initdata;
#ifdef CONFIG_X86_LOCAL_APIC
static u64 acpi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE;
#endif
+#else
+#define acpi_skip_timer_override 0
+#define acpi_fix_pin2_polarity 0
+#endif
#ifndef __HAVE_ARCH_CMPXCHG
#warning ACPI uses CMPXCHG, i486 and later hardware
@@ -181,6 +186,7 @@ static int __init acpi_parse_madt(struct acpi_table_header *table)
return -ENODEV;
}
+#ifndef CONFIG_XEN
if (madt->address) {
acpi_lapic_addr = (u64) madt->address;
@@ -190,12 +196,14 @@ static int __init acpi_parse_madt(struct acpi_table_header *table)
default_acpi_madt_oem_check(madt->header.oem_id,
madt->header.oem_table_id);
+#endif
return 0;
}
static void __cpuinit acpi_register_lapic(int id, u8 enabled)
{
+#ifndef CONFIG_XEN
unsigned int ver = 0;
if (id >= (MAX_LOCAL_APIC-1)) {
@@ -212,6 +220,7 @@ static void __cpuinit acpi_register_lapic(int id, u8 enabled)
ver = apic_version[boot_cpu_physical_apicid];
generic_processor_info(id, ver);
+#endif
}
static int __init
@@ -242,7 +251,7 @@ acpi_parse_x2apic(struct acpi_subtable_header *header, const unsigned long end)
printk(KERN_WARNING PREFIX "x2apic entry ignored\n");
else
acpi_register_lapic(apic_id, enabled);
-#else
+#elif !defined(CONFIG_XEN)
printk(KERN_WARNING PREFIX "x2apic entry ignored\n");
#endif
@@ -296,6 +305,7 @@ static int __init
acpi_parse_lapic_addr_ovr(struct acpi_subtable_header * header,
const unsigned long end)
{
+#ifndef CONFIG_XEN
struct acpi_madt_local_apic_override *lapic_addr_ovr = NULL;
lapic_addr_ovr = (struct acpi_madt_local_apic_override *)header;
@@ -304,6 +314,7 @@ acpi_parse_lapic_addr_ovr(struct acpi_subtable_header * header,
return -EINVAL;
acpi_lapic_addr = lapic_addr_ovr->address;
+#endif
return 0;
}
@@ -600,6 +611,7 @@ void __init acpi_set_irq_model_ioapic(void)
#ifdef CONFIG_ACPI_HOTPLUG_CPU
#include <acpi/processor.h>
+#ifndef CONFIG_XEN
static void __cpuinit acpi_map_cpu2node(acpi_handle handle, int cpu, int physid)
{
#ifdef CONFIG_ACPI_NUMA
@@ -686,6 +698,9 @@ free_tmp_map:
out:
return retval;
}
+#else
+#define _acpi_map_lsapic(h, p) (-EINVAL)
+#endif
/* wrapper to silence section mismatch warning */
int __ref acpi_map_lsapic(acpi_handle handle, int *pcpu)
@@ -696,6 +711,7 @@ EXPORT_SYMBOL(acpi_map_lsapic);
int acpi_unmap_lsapic(int cpu)
{
+#ifndef CONFIG_XEN
#ifdef CONFIG_ACPI_NUMA
set_apicid_to_node(per_cpu(x86_cpu_to_apicid, cpu), NUMA_NO_NODE);
#endif
@@ -703,6 +719,7 @@ int acpi_unmap_lsapic(int cpu)
per_cpu(x86_cpu_to_apicid, cpu) = -1;
set_cpu_present(cpu, false);
num_processors--;
+#endif
return (0);
}
@@ -1344,6 +1361,7 @@ static int __init dmi_disable_acpi(const struct dmi_system_id *d)
return 0;
}
+#ifndef CONFIG_XEN
/*
* Force ignoring BIOS IRQ0 override
*/
@@ -1356,6 +1374,7 @@ static int __init dmi_ignore_irq0_timer_override(const struct dmi_system_id *d)
}
return 0;
}
+#endif
static int __init force_acpi_rsdt(const struct dmi_system_id *d)
{
@@ -1476,6 +1495,7 @@ static struct dmi_system_id __initdata acpi_dmi_table[] = {
{}
};
+#ifndef CONFIG_XEN
/* second table for DMI checks that should run after early-quirks */
static struct dmi_system_id __initdata acpi_dmi_table_late[] = {
/*
@@ -1530,6 +1550,7 @@ static struct dmi_system_id __initdata acpi_dmi_table_late[] = {
},
{}
};
+#endif
/*
* acpi_boot_table_init() and acpi_boot_init()
@@ -1602,8 +1623,10 @@ int __init early_acpi_boot_init(void)
int __init acpi_boot_init(void)
{
+#ifndef CONFIG_XEN
/* those are executed after early-quirks are executed */
dmi_check_system(acpi_dmi_table_late);
+#endif
/*
* If acpi_disabled, bail out
@@ -1703,7 +1726,7 @@ int __init acpi_mps_check(void)
return 0;
}
-#ifdef CONFIG_X86_IO_APIC
+#if defined(CONFIG_X86_IO_APIC) && !defined(CONFIG_XEN)
static int __init parse_acpi_skip_timer_override(char *arg)
{
acpi_skip_timer_override = 1;
diff --git a/arch/x86/kernel/acpi/processor_extcntl_xen.c b/arch/x86/kernel/acpi/processor_extcntl_xen.c
new file mode 100644
index 000000000000..a870a3d5fcae
--- /dev/null
+++ b/arch/x86/kernel/acpi/processor_extcntl_xen.c
@@ -0,0 +1,310 @@
+/*
+ * processor_extcntl_xen.c - interface to notify Xen
+ *
+ * Copyright (C) 2008, Intel corporation
+ *
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/types.h>
+#include <linux/acpi.h>
+#include <linux/pm.h>
+#include <linux/cpu.h>
+#include <linux/export.h>
+#include <linux/cpufreq.h>
+#include <acpi/processor.h>
+#include <asm/hypercall.h>
+
+static int xen_cx_notifier(struct acpi_processor *pr, int action)
+{
+ int ret, count = 0, i;
+ xen_platform_op_t op = {
+ .cmd = XENPF_set_processor_pminfo,
+ .interface_version = XENPF_INTERFACE_VERSION,
+ .u.set_pminfo.id = pr->acpi_id,
+ .u.set_pminfo.type = XEN_PM_CX,
+ };
+ struct xen_processor_cx *data, *buf;
+ struct acpi_processor_cx *cx;
+
+ /* Convert to Xen defined structure and hypercall */
+ buf = kzalloc(pr->power.count * sizeof(struct xen_processor_cx),
+ GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
+
+ data = buf;
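+	/*
+	 * The ACPI cstate array is 1-based (states[0] is unused), hence
+	 * the loop runs from 1 to pr->power.count inclusive.
+	 */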
+ for (i = 1; i <= pr->power.count; i++) {
+ cx = &pr->power.states[i];
+ /* Skip invalid cstate entry */
+ if (!cx->valid)
+ continue;
+
+ data->type = cx->type;
+ data->latency = cx->latency;
+ /* data->power = cx->power; */
+ data->reg.space_id = cx->reg.space_id;
+ data->reg.bit_width = cx->reg.bit_width;
+ data->reg.bit_offset = cx->reg.bit_offset;
+ data->reg.access_size = cx->reg.access_size;
+ data->reg.address = cx->reg.address;
+
+ /* Get dependency relationships */
+ if (cx->csd_count) {
+ pr_warning("_CSD found: Not supported for now!\n");
+ kfree(buf);
+ return -EINVAL;
+ } else {
+ data->dpcnt = 0;
+ set_xen_guest_handle(data->dp, NULL);
+ }
+
+ data++;
+ count++;
+ }
+
+ if (!count) {
+ pr_info("No available Cx info for cpu %d\n", pr->acpi_id);
+ kfree(buf);
+ return -EINVAL;
+ }
+
+ op.u.set_pminfo.u.power.count = count;
+ op.u.set_pminfo.u.power.flags.bm_control = pr->flags.bm_control;
+ op.u.set_pminfo.u.power.flags.bm_check = pr->flags.bm_check;
+ op.u.set_pminfo.u.power.flags.has_cst = pr->flags.has_cst;
+ op.u.set_pminfo.u.power.flags.power_setup_done = pr->flags.power_setup_done;
+
+ set_xen_guest_handle(op.u.set_pminfo.u.power.states, buf);
+ ret = HYPERVISOR_platform_op(&op);
+ kfree(buf);
+ return ret;
+}
+
+static int xen_px_notifier(struct acpi_processor *pr, int action)
+{
+ int ret = -EINVAL;
+ xen_platform_op_t op = {
+ .cmd = XENPF_set_processor_pminfo,
+ .interface_version = XENPF_INTERFACE_VERSION,
+ .u.set_pminfo.id = pr->acpi_id,
+ .u.set_pminfo.type = XEN_PM_PX,
+ };
+ struct xen_processor_performance *perf;
+ struct xen_processor_px *states = NULL;
+ struct acpi_processor_performance *px;
+ struct acpi_psd_package *pdomain;
+
+ if (!pr)
+ return -EINVAL;
+
+ perf = &op.u.set_pminfo.u.perf;
+ px = pr->performance;
+ if (!px)
+ return -EINVAL;
+
+	switch (action) {
+ case PROCESSOR_PM_CHANGE:
+ /* ppc dynamic handle */
+ perf->flags = XEN_PX_PPC;
+ perf->platform_limit = pr->performance_platform_limit;
+
+ ret = HYPERVISOR_platform_op(&op);
+ break;
+
+ case PROCESSOR_PM_INIT:
+ /* px normal init */
+ perf->flags = XEN_PX_PPC |
+ XEN_PX_PCT |
+ XEN_PX_PSS |
+ XEN_PX_PSD;
+
+ /* ppc */
+ perf->platform_limit = pr->performance_platform_limit;
+
+ /* pct */
+ xen_convert_pct_reg(&perf->control_register, &px->control_register);
+ xen_convert_pct_reg(&perf->status_register, &px->status_register);
+
+ /* pss */
+ perf->state_count = px->state_count;
+		states = kzalloc(px->state_count * sizeof(xen_processor_px_t), GFP_KERNEL);
+ if (!states)
+ return -ENOMEM;
+ xen_convert_pss_states(states, px->states, px->state_count);
+ set_xen_guest_handle(perf->states, states);
+
+ /* psd */
+ pdomain = &px->domain_info;
+ xen_convert_psd_pack(&perf->domain_info, pdomain);
+ if (pdomain->coord_type == DOMAIN_COORD_TYPE_SW_ALL)
+ perf->shared_type = CPUFREQ_SHARED_TYPE_ALL;
+ else if (pdomain->coord_type == DOMAIN_COORD_TYPE_SW_ANY)
+ perf->shared_type = CPUFREQ_SHARED_TYPE_ANY;
+ else if (pdomain->coord_type == DOMAIN_COORD_TYPE_HW_ALL)
+ perf->shared_type = CPUFREQ_SHARED_TYPE_HW;
+ else {
+ ret = -ENODEV;
+ kfree(states);
+ break;
+ }
+
+ ret = HYPERVISOR_platform_op(&op);
+ kfree(states);
+ break;
+
+ default:
+ break;
+ }
+
+ return ret;
+}
+
+static int xen_tx_notifier(struct acpi_processor *pr, int action)
+{
+ return -EINVAL;
+}
+
+static int xen_hotplug_notifier(struct acpi_processor *pr, int event)
+{
+ int ret = -EINVAL;
+#ifdef CONFIG_ACPI_HOTPLUG_CPU
+ acpi_status status = 0;
+ acpi_object_type type;
+	int apic_id;
+ int device_decl = 0;
+ unsigned long long pxm;
+ xen_platform_op_t op;
+
+ status = acpi_get_type(pr->handle, &type);
+ if (ACPI_FAILURE(status)) {
+ pr_warn("can't get object type for acpi_id %#x\n",
+ pr->acpi_id);
+ return -ENXIO;
+ }
+
+ switch (type) {
+ case ACPI_TYPE_PROCESSOR:
+ break;
+ case ACPI_TYPE_DEVICE:
+ device_decl = 1;
+ break;
+ default:
+ pr_warn("unsupported object type %#x for acpi_id %#x\n",
+ type, pr->acpi_id);
+ return -EOPNOTSUPP;
+ }
+
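+	/* acpi_get_cpuid() returns -1 when no matching MADT entry exists. */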
+ apic_id = acpi_get_cpuid(pr->handle, ~device_decl, pr->acpi_id);
+ if (apic_id < 0) {
+ pr_warn("can't get apic_id for acpi_id %#x\n", pr->acpi_id);
+ return -ENODATA;
+ }
+
+ status = acpi_evaluate_integer(pr->handle, "_PXM", NULL, &pxm);
+ if (ACPI_FAILURE(status)) {
+ pr_warn("can't get pxm for acpi_id %#x\n", pr->acpi_id);
+ return -ENODATA;
+ }
+
+ switch (event) {
+ case HOTPLUG_TYPE_ADD:
+ op.cmd = XENPF_cpu_hotadd;
+ op.u.cpu_add.apic_id = apic_id;
+ op.u.cpu_add.acpi_id = pr->acpi_id;
+ op.u.cpu_add.pxm = pxm;
+ ret = HYPERVISOR_platform_op(&op);
+ break;
+ case HOTPLUG_TYPE_REMOVE:
+ pr_warn("Xen doesn't support CPU hot remove\n");
+ ret = -EOPNOTSUPP;
+ break;
+ }
+#endif
+
+ return ret;
+}
+
+static struct processor_extcntl_ops xen_extcntl_ops = {
+ .hotplug = xen_hotplug_notifier,
+};
+
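+/*
+ * Hand the final ACPI sleep register writes to Xen.  A positive return
+ * tells the ACPI core that the hypervisor carried out the transition
+ * itself, so it must not write the PM1a/PM1b control registers; a
+ * negative return aborts the suspend.
+ */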
+static int xen_sleep(u8 sleep_state, u32 val_a, u32 val_b, bool extended)
+{
+ struct xen_platform_op op = {
+ .cmd = XENPF_enter_acpi_sleep,
+ .interface_version = XENPF_INTERFACE_VERSION,
+ .u.enter_acpi_sleep = {
+ .pm1a_cnt_val = val_a,
+ .pm1b_cnt_val = val_b,
+ .sleep_state = sleep_state,
+ .flags = extended ? XENPF_ACPI_SLEEP_EXTENDED : 0,
+ },
+ };
+ int err = HYPERVISOR_platform_op(&op);
+
+ if (!err)
+ return 1;
+
+ pr_err("ACPI: Hypervisor failure [%d]\n", err);
+ return -1;
+}
+
+static int __init init_extcntl(void)
+{
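+	/*
+	 * Bits 8-15 of the start_info flags advertise which processor
+	 * power management classes (Cx/Px/Tx) the hypervisor wants the
+	 * kernel to forward to it.
+	 */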
+ unsigned int pmbits = (xen_start_info->flags & SIF_PM_MASK) >> 8;
+
+#ifndef CONFIG_ACPI_HOTPLUG_CPU
+ if (!pmbits)
+ return 0;
+#endif
+ if (pmbits & XEN_PROCESSOR_PM_CX)
+ xen_extcntl_ops.pm_ops[PM_TYPE_IDLE] = xen_cx_notifier;
+ if (pmbits & XEN_PROCESSOR_PM_PX)
+ xen_extcntl_ops.pm_ops[PM_TYPE_PERF] = xen_px_notifier;
+ if (pmbits & XEN_PROCESSOR_PM_TX)
+ xen_extcntl_ops.pm_ops[PM_TYPE_THR] = xen_tx_notifier;
+
+ processor_extcntl_ops = &xen_extcntl_ops;
+
+ acpi_os_set_prepare_sleep(xen_sleep);
+
+ return 0;
+}
+arch_initcall(init_extcntl);
+
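+/*
+ * With Xen owning frequency scaling, the kernel's cpufreq core has
+ * nothing useful to report; query the hypervisor for the current and
+ * maximum frequency of the underlying (v)CPU instead.
+ */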
+unsigned int cpufreq_quick_get(unsigned int cpu)
+{
+ xen_platform_op_t op;
+
+ op.cmd = XENPF_get_cpu_freq;
+ op.u.get_cpu_freq.vcpu = cpu;
+ return HYPERVISOR_platform_op(&op) == 0 ? op.u.get_cpu_freq.freq : 0;
+}
+
+unsigned int cpufreq_quick_get_max(unsigned int cpu)
+{
+ xen_platform_op_t op;
+
+ op.cmd = XENPF_get_cpu_freq_max;
+ op.u.get_cpu_freq.vcpu = cpu;
+ return HYPERVISOR_platform_op(&op) == 0 ? op.u.get_cpu_freq.freq : 0;
+}
+EXPORT_SYMBOL(cpufreq_quick_get_max);
diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c
index 3048ded1b598..664012bd5eda 100644
--- a/arch/x86/kernel/amd_nb.c
+++ b/arch/x86/kernel/amd_nb.c
@@ -18,6 +18,10 @@ static u32 *flush_words;
const struct pci_device_id amd_nb_misc_ids[] = {
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MISC) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC) },
+#ifdef CONFIG_XEN
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_11H_NB_MISC) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CNB17H_F3) }, /* Fam12, Fam14 */
+#endif
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F3) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_M10H_F3) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_NB_F3) },
@@ -155,6 +159,7 @@ struct resource *amd_get_mmconfig_range(struct resource *res)
return res;
}
+#ifndef CONFIG_XEN
int amd_get_subcaches(int cpu)
{
struct pci_dev *link = node_to_amd_nb(amd_get_nb_id(cpu))->link;
@@ -209,6 +214,7 @@ int amd_set_subcaches(int cpu, int mask)
return 0;
}
+#endif
static int amd_cache_gart(void)
{
diff --git a/arch/x86/kernel/apic/Makefile b/arch/x86/kernel/apic/Makefile
index 0ae0323b1f9c..f30b9020d941 100644
--- a/arch/x86/kernel/apic/Makefile
+++ b/arch/x86/kernel/apic/Makefile
@@ -25,3 +25,7 @@ obj-$(CONFIG_X86_ES7000) += es7000_32.o
# For 32bit, probe_32 need to be listed last
obj-$(CONFIG_X86_LOCAL_APIC) += probe_$(BITS).o
+
+probe_64-$(CONFIG_XEN) := probe_32.o
+
+disabled-obj-$(CONFIG_XEN) := apic_%.o
diff --git a/arch/x86/kernel/apic/apic-xen.c b/arch/x86/kernel/apic/apic-xen.c
new file mode 100644
index 000000000000..6b0603c77b5e
--- /dev/null
+++ b/arch/x86/kernel/apic/apic-xen.c
@@ -0,0 +1,69 @@
+/*
+ * Local APIC handling stubs
+ */
+
+#include <linux/init.h>
+#include <linux/interrupt.h>
+
+#include <asm/smp.h>
+#include <asm/proto.h>
+#include <asm/apic.h>
+
+unsigned int num_processors;
+
+/*
+ * Debug level, exported for io_apic.c
+ */
+unsigned int apic_verbosity;
+
+/* Have we found an MP table */
+int smp_found_config;
+
+static int __init apic_set_verbosity(char *arg)
+{
+ if (!arg) {
+#ifdef CONFIG_X86_64
+ skip_ioapic_setup = 0;
+ return 0;
+#endif
+ return -EINVAL;
+ }
+
+ if (strcmp("debug", arg) == 0)
+ apic_verbosity = APIC_DEBUG;
+ else if (strcmp("verbose", arg) == 0)
+ apic_verbosity = APIC_VERBOSE;
+ else {
+		pr_warning("APIC Verbosity level %s not recognised,"
+			   " use apic=verbose or apic=debug\n", arg);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+early_param("apic", apic_set_verbosity);
+
+int setup_profiling_timer(unsigned int multiplier)
+{
+ return -EINVAL;
+}
+
+#ifndef CONFIG_SMP
+/*
+ * This initializes the IO-APIC and APIC hardware if this is
+ * a UP kernel.
+ */
+int __init APIC_init_uniprocessor(void)
+{
+#ifdef CONFIG_X86_IO_APIC
+ if (smp_found_config && !skip_ioapic_setup && nr_ioapics)
+ setup_IO_APIC();
+# ifdef CONFIG_X86_64
+ else
+ nr_ioapics = 0;
+# endif
+#endif
+
+ return 0;
+}
+#endif
diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c
index 31cb9ae992b7..8773f2c65723 100644
--- a/arch/x86/kernel/apic/hw_nmi.c
+++ b/arch/x86/kernel/apic/hw_nmi.c
@@ -26,6 +26,10 @@ u64 hw_nmi_get_sample_period(int watchdog_thresh)
#endif
#ifdef arch_trigger_all_cpu_backtrace
+#ifdef CONFIG_XEN
+#include <asm/ipi.h>
+#endif
+
/* For reliability, we're prepared to waste bits here. */
static DECLARE_BITMAP(backtrace_mask, NR_CPUS) __read_mostly;
@@ -46,7 +50,11 @@ void arch_trigger_all_cpu_backtrace(void)
cpumask_copy(to_cpumask(backtrace_mask), cpu_online_mask);
printk(KERN_INFO "sending NMI to all CPUs:\n");
+#ifndef CONFIG_XEN
apic->send_IPI_all(NMI_VECTOR);
+#else /* this works even without CONFIG_X86_LOCAL_APIC */
+ xen_send_IPI_all(NMI_VECTOR);
+#endif
/* Wait for up to 10 seconds for all CPUs to do the backtrace */
for (i = 0; i < 10 * 1000; i++) {
diff --git a/arch/x86/kernel/apic/io_apic-xen.c b/arch/x86/kernel/apic/io_apic-xen.c
new file mode 100644
index 000000000000..e954525af69f
--- /dev/null
+++ b/arch/x86/kernel/apic/io_apic-xen.c
@@ -0,0 +1,4031 @@
+/*
+ * Intel IO-APIC support for multi-Pentium hosts.
+ *
+ * Copyright (C) 1997, 1998, 1999, 2000, 2009 Ingo Molnar, Hajnalka Szabo
+ *
+ * Many thanks to Stig Venaas for trying out countless experimental
+ * patches and reporting/debugging problems patiently!
+ *
+ * (c) 1999, Multiple IO-APIC support, developed by
+ * Ken-ichi Yaku <yaku@css1.kbnes.nec.co.jp> and
+ * Hidemi Kishimoto <kisimoto@css1.kbnes.nec.co.jp>,
+ * further tested and cleaned up by Zach Brown <zab@redhat.com>
+ * and Ingo Molnar <mingo@redhat.com>
+ *
+ * Fixes
+ * Maciej W. Rozycki : Bits for genuine 82489DX APICs;
+ * thanks to Eric Gilmore
+ * and Rolf G. Tews
+ * for testing these extensively
+ * Paul Diefenbaugh : Added full ACPI support
+ */
+
+#include <linux/mm.h>
+#include <linux/interrupt.h>
+#include <linux/init.h>
+#include <linux/delay.h>
+#include <linux/sched.h>
+#include <linux/pci.h>
+#include <linux/mc146818rtc.h>
+#include <linux/compiler.h>
+#include <linux/acpi.h>
+#include <linux/module.h>
+#include <linux/syscore_ops.h>
+struct msi_msg; /* #include <linux/msi.h> */
+#include <linux/freezer.h>
+#include <linux/kthread.h>
+#include <linux/jiffies.h> /* time_after() */
+#include <linux/slab.h>
+#ifdef CONFIG_ACPI
+#include <acpi/acpi_bus.h>
+#endif
+#include <linux/bootmem.h>
+
+#include <asm/idle.h>
+#include <asm/io.h>
+#include <asm/smp.h>
+#include <asm/cpu.h>
+#include <asm/desc.h>
+#include <asm/proto.h>
+#include <asm/acpi.h>
+#include <asm/dma.h>
+#include <asm/timer.h>
+#include <asm/i8259.h>
+#include <asm/setup.h>
+#include <asm/irq_remapping.h>
+#include <asm/hw_irq.h>
+
+#include <asm/apic.h>
+
+#ifdef CONFIG_XEN
+#include <xen/interface/xen.h>
+#include <xen/interface/physdev.h>
+#include <xen/evtchn.h>
+
+/* Fake i8259 */
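+/* Clearing an IRQ's bit in io_apic_irqs marks it as routed via the
+ * (emulated) 8259 rather than the IO-APIC, matching what the native
+ * make_8259A_irq() does. */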
+static void make_8259A_irq(unsigned int irq) { io_apic_irqs &= ~(1UL<<irq); }
+static const struct legacy_pic xen_legacy_pic = {
+ .nr_legacy_irqs = NR_IRQS_LEGACY,
+ .make_irq = make_8259A_irq
+};
+#define legacy_pic (&xen_legacy_pic)
+
+unsigned long io_apic_irqs;
+#endif /* CONFIG_XEN */
+
+#define __apicdebuginit(type) static type __init
+
+#define for_each_irq_pin(entry, head) \
+ for (entry = head; entry; entry = entry->next)
+
+/*
+ * Is the SiS APIC rmw bug present ?
+ * -1 = don't know, 0 = no, 1 = yes
+ */
+int sis_apic_bug = -1;
+
+static DEFINE_RAW_SPINLOCK(ioapic_lock);
+#ifndef CONFIG_XEN
+static DEFINE_RAW_SPINLOCK(vector_lock);
+#endif
+
+static struct ioapic {
+ /*
+ * # of IRQ routing registers
+ */
+ int nr_registers;
+#ifndef CONFIG_XEN
+ /*
+ * Saved state during suspend/resume, or while enabling intr-remap.
+ */
+ struct IO_APIC_route_entry *saved_registers;
+#endif
+ /* I/O APIC config */
+ struct mpc_ioapic mp_config;
+ /* IO APIC gsi routing info */
+ struct mp_ioapic_gsi gsi_config;
+ DECLARE_BITMAP(pin_programmed, MP_MAX_IOAPIC_PIN + 1);
+} ioapics[MAX_IO_APICS];
+
+#define mpc_ioapic_ver(ioapic_idx) ioapics[ioapic_idx].mp_config.apicver
+
+int mpc_ioapic_id(int ioapic_idx)
+{
+ return ioapics[ioapic_idx].mp_config.apicid;
+}
+
+unsigned int mpc_ioapic_addr(int ioapic_idx)
+{
+ return ioapics[ioapic_idx].mp_config.apicaddr;
+}
+
+struct mp_ioapic_gsi *mp_ioapic_gsi_routing(int ioapic_idx)
+{
+ return &ioapics[ioapic_idx].gsi_config;
+}
+
+int nr_ioapics;
+
+/* The one past the highest gsi number used */
+u32 gsi_top;
+
+/* MP IRQ source entries */
+struct mpc_intsrc mp_irqs[MAX_IRQ_SOURCES];
+
+/* # of MP IRQ source entries */
+int mp_irq_entries;
+
+#ifndef CONFIG_XEN
+/* GSI interrupts */
+static int nr_irqs_gsi = NR_IRQS_LEGACY;
+#endif
+
+#ifdef CONFIG_EISA
+int mp_bus_id_to_type[MAX_MP_BUSSES];
+#endif
+
+DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
+
+int skip_ioapic_setup;
+
+/**
+ * _disable_ioapic_support() - disables IO-APIC support at runtime
+ */
+static void __init _disable_ioapic_support(void)
+{
+#ifdef CONFIG_PCI
+ noioapicquirk = 1;
+ noioapicreroute = -1;
+#endif
+ skip_ioapic_setup = 1;
+}
+
+static int __init parse_noapic(char *str)
+{
+ /* disable IO-APIC */
+ _disable_ioapic_support();
+ return 0;
+}
+early_param("noapic", parse_noapic);
+
+static int io_apic_setup_irq_pin(unsigned int irq, int node,
+ struct io_apic_irq_attr *attr);
+
+/* Will be called in mpparse/acpi/sfi codes for saving IRQ info */
+void mp_save_irq(struct mpc_intsrc *m)
+{
+ int i;
+
+ apic_printk(APIC_VERBOSE, "Int: type %d, pol %d, trig %d, bus %02x,"
+ " IRQ %02x, APIC ID %x, APIC INT %02x\n",
+ m->irqtype, m->irqflag & 3, (m->irqflag >> 2) & 3, m->srcbus,
+ m->srcbusirq, m->dstapic, m->dstirq);
+
+ for (i = 0; i < mp_irq_entries; i++) {
+ if (!memcmp(&mp_irqs[i], m, sizeof(*m)))
+ return;
+ }
+
+ memcpy(&mp_irqs[mp_irq_entries], m, sizeof(*m));
+ if (++mp_irq_entries == MAX_IRQ_SOURCES)
+ panic("Max # of irq sources exceeded!!\n");
+}
+
+#ifndef CONFIG_XEN
+struct irq_pin_list {
+ int apic, pin;
+ struct irq_pin_list *next;
+};
+
+static struct irq_pin_list *alloc_irq_pin_list(int node)
+{
+ return kzalloc_node(sizeof(struct irq_pin_list), GFP_KERNEL, node);
+}
+
+
+/* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */
+static struct irq_cfg irq_cfgx[NR_IRQS_LEGACY];
+
+int __init arch_early_irq_init(void)
+{
+ struct irq_cfg *cfg;
+ int count, node, i;
+
+ if (!legacy_pic->nr_legacy_irqs)
+ io_apic_irqs = ~0UL;
+
+ for (i = 0; i < nr_ioapics; i++) {
+ ioapics[i].saved_registers =
+ kzalloc(sizeof(struct IO_APIC_route_entry) *
+ ioapics[i].nr_registers, GFP_KERNEL);
+ if (!ioapics[i].saved_registers)
+ pr_err("IOAPIC %d: suspend/resume impossible!\n", i);
+ }
+
+ cfg = irq_cfgx;
+ count = ARRAY_SIZE(irq_cfgx);
+ node = cpu_to_node(0);
+
+ /* Make sure the legacy interrupts are marked in the bitmap */
+ irq_reserve_irqs(0, legacy_pic->nr_legacy_irqs);
+
+ for (i = 0; i < count; i++) {
+ irq_set_chip_data(i, &cfg[i]);
+ zalloc_cpumask_var_node(&cfg[i].domain, GFP_KERNEL, node);
+ zalloc_cpumask_var_node(&cfg[i].old_domain, GFP_KERNEL, node);
+ /*
+		 * For legacy IRQs, start by assigning irq0-irq15 to
+		 * IRQ0_VECTOR-IRQ15_VECTOR on all CPUs.
+ */
+ if (i < legacy_pic->nr_legacy_irqs) {
+ cfg[i].vector = IRQ0_VECTOR + i;
+ cpumask_setall(cfg[i].domain);
+ }
+ }
+
+ return 0;
+}
+
+static struct irq_cfg *irq_cfg(unsigned int irq)
+{
+ return irq_get_chip_data(irq);
+}
+
+static struct irq_cfg *alloc_irq_cfg(unsigned int irq, int node)
+{
+ struct irq_cfg *cfg;
+
+ cfg = kzalloc_node(sizeof(*cfg), GFP_KERNEL, node);
+ if (!cfg)
+ return NULL;
+ if (!zalloc_cpumask_var_node(&cfg->domain, GFP_KERNEL, node))
+ goto out_cfg;
+ if (!zalloc_cpumask_var_node(&cfg->old_domain, GFP_KERNEL, node))
+ goto out_domain;
+ return cfg;
+out_domain:
+ free_cpumask_var(cfg->domain);
+out_cfg:
+ kfree(cfg);
+ return NULL;
+}
+
+static void free_irq_cfg(unsigned int at, struct irq_cfg *cfg)
+{
+ if (!cfg)
+ return;
+ irq_set_chip_data(at, NULL);
+ free_cpumask_var(cfg->domain);
+ free_cpumask_var(cfg->old_domain);
+ kfree(cfg);
+}
+
+static struct irq_cfg *alloc_irq_and_cfg_at(unsigned int at, int node)
+{
+ int res = irq_alloc_desc_at(at, node);
+ struct irq_cfg *cfg;
+
+ if (res < 0) {
+ if (res != -EEXIST)
+ return NULL;
+ cfg = irq_get_chip_data(at);
+ if (cfg)
+ return cfg;
+ }
+
+ cfg = alloc_irq_cfg(at, node);
+ if (cfg)
+ irq_set_chip_data(at, cfg);
+ else
+ irq_free_desc(at);
+ return cfg;
+}
+
+static int alloc_irqs_from(unsigned int from, unsigned int count, int node)
+{
+ return irq_alloc_descs_from(from, count, node);
+}
+
+static void free_irq_at(unsigned int at, struct irq_cfg *cfg)
+{
+ free_irq_cfg(at, cfg);
+ irq_free_desc(at);
+}
+
+
+struct io_apic {
+ unsigned int index;
+ unsigned int unused[3];
+ unsigned int data;
+ unsigned int unused2[11];
+ unsigned int eoi;
+};
+
+static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx)
+{
+ return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx)
+ + (mpc_ioapic_addr(idx) & ~PAGE_MASK);
+}
+
+void io_apic_eoi(unsigned int apic, unsigned int vector)
+{
+ struct io_apic __iomem *io_apic = io_apic_base(apic);
+ writel(vector, &io_apic->eoi);
+}
+
+unsigned int native_io_apic_read(unsigned int apic, unsigned int reg)
+{
+ struct io_apic __iomem *io_apic = io_apic_base(apic);
+ writel(reg, &io_apic->index);
+ return readl(&io_apic->data);
+}
+
+void native_io_apic_write(unsigned int apic, unsigned int reg, unsigned int value)
+{
+ struct io_apic __iomem *io_apic = io_apic_base(apic);
+
+ writel(reg, &io_apic->index);
+ writel(value, &io_apic->data);
+}
+
+/*
+ * Re-write a value: to be used for read-modify-write
+ * cycles where the read already set up the index register.
+ *
+ * Older SiS APICs require that we rewrite the index register.
+ */
+void native_io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value)
+{
+ struct io_apic __iomem *io_apic = io_apic_base(apic);
+
+ if (sis_apic_bug)
+ writel(reg, &io_apic->index);
+ writel(value, &io_apic->data);
+}
+#else /* !CONFIG_XEN */
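+/*
+ * Under Xen, dom0 has no direct mapping of the IO-APIC registers:
+ * accesses are forwarded to the hypervisor via the PHYSDEVOP_apic_read
+ * and PHYSDEVOP_apic_write operations below.
+ */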
+static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg)
+{
+ struct physdev_apic apic_op;
+ int ret;
+
+ apic_op.apic_physbase = mpc_ioapic_addr(apic);
+ apic_op.reg = reg;
+ ret = HYPERVISOR_physdev_op(PHYSDEVOP_apic_read, &apic_op);
+ if (ret)
+ return ret;
+ return apic_op.value;
+}
+
+static inline void io_apic_write(unsigned int apic, unsigned int reg, unsigned int value)
+{
+ struct physdev_apic apic_op;
+
+ apic_op.apic_physbase = mpc_ioapic_addr(apic);
+ apic_op.reg = reg;
+ apic_op.value = value;
+ WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_apic_write, &apic_op));
+}
+
+#define io_apic_modify io_apic_write
+#endif /* !CONFIG_XEN */
+
+union entry_union {
+ struct { u32 w1, w2; };
+ struct IO_APIC_route_entry entry;
+};
+
+#ifndef CONFIG_XEN
+static struct IO_APIC_route_entry __ioapic_read_entry(int apic, int pin)
+{
+ union entry_union eu;
+
+ eu.w1 = io_apic_read(apic, 0x10 + 2 * pin);
+ eu.w2 = io_apic_read(apic, 0x11 + 2 * pin);
+
+ return eu.entry;
+}
+
+static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin)
+{
+ union entry_union eu;
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&ioapic_lock, flags);
+ eu.entry = __ioapic_read_entry(apic, pin);
+ raw_spin_unlock_irqrestore(&ioapic_lock, flags);
+
+ return eu.entry;
+}
+#endif
+
+/*
+ * When we write a new IO APIC routing entry, we need to write the high
+ * word first! If the mask bit in the low word is clear, we will enable
+ * the interrupt, and we need to make sure the entry is fully populated
+ * before that happens.
+ */
+static void __ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
+{
+ union entry_union eu = {{0, 0}};
+
+ eu.entry = e;
+ io_apic_write(apic, 0x11 + 2*pin, eu.w2);
+ io_apic_write(apic, 0x10 + 2*pin, eu.w1);
+}
+
+static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
+{
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&ioapic_lock, flags);
+ __ioapic_write_entry(apic, pin, e);
+ raw_spin_unlock_irqrestore(&ioapic_lock, flags);
+}
+
+#ifndef CONFIG_XEN
+/*
+ * When we mask an IO APIC routing entry, we need to write the low
+ * word first, in order to set the mask bit before we change the
+ * high bits!
+ */
+static void ioapic_mask_entry(int apic, int pin)
+{
+ unsigned long flags;
+ union entry_union eu = { .entry.mask = 1 };
+
+ raw_spin_lock_irqsave(&ioapic_lock, flags);
+ io_apic_write(apic, 0x10 + 2*pin, eu.w1);
+ io_apic_write(apic, 0x11 + 2*pin, eu.w2);
+ raw_spin_unlock_irqrestore(&ioapic_lock, flags);
+}
+
+/*
+ * The common case is 1:1 IRQ<->pin mappings. Sometimes there are
+ * shared ISA-space IRQs, so we have to support them. We are super
+ * fast in the common case, and fast for shared ISA-space IRQs.
+ */
+static int __add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pin)
+{
+ struct irq_pin_list **last, *entry;
+
+ /* don't allow duplicates */
+ last = &cfg->irq_2_pin;
+ for_each_irq_pin(entry, cfg->irq_2_pin) {
+ if (entry->apic == apic && entry->pin == pin)
+ return 0;
+ last = &entry->next;
+ }
+
+ entry = alloc_irq_pin_list(node);
+ if (!entry) {
+ pr_err("can not alloc irq_pin_list (%d,%d,%d)\n",
+ node, apic, pin);
+ return -ENOMEM;
+ }
+ entry->apic = apic;
+ entry->pin = pin;
+
+ *last = entry;
+ return 0;
+}
+
+static void add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pin)
+{
+ if (__add_pin_to_irq_node(cfg, node, apic, pin))
+ panic("IO-APIC: failed to add irq-pin. Can not proceed\n");
+}
+
+/*
+ * Reroute an IRQ to a different pin.
+ */
+static void __init replace_pin_at_irq_node(struct irq_cfg *cfg, int node,
+ int oldapic, int oldpin,
+ int newapic, int newpin)
+{
+ struct irq_pin_list *entry;
+
+ for_each_irq_pin(entry, cfg->irq_2_pin) {
+ if (entry->apic == oldapic && entry->pin == oldpin) {
+ entry->apic = newapic;
+ entry->pin = newpin;
+ /* every one is different, right? */
+ return;
+ }
+ }
+
+ /* old apic/pin didn't exist, so just add new ones */
+ add_pin_to_irq_node(cfg, node, newapic, newpin);
+}
+
+static void __io_apic_modify_irq(struct irq_pin_list *entry,
+ int mask_and, int mask_or,
+ void (*final)(struct irq_pin_list *entry))
+{
+ unsigned int reg, pin;
+
+ pin = entry->pin;
+ reg = io_apic_read(entry->apic, 0x10 + pin * 2);
+ reg &= mask_and;
+ reg |= mask_or;
+ io_apic_modify(entry->apic, 0x10 + pin * 2, reg);
+ if (final)
+ final(entry);
+}
+
+static void io_apic_modify_irq(struct irq_cfg *cfg,
+ int mask_and, int mask_or,
+ void (*final)(struct irq_pin_list *entry))
+{
+ struct irq_pin_list *entry;
+
+ for_each_irq_pin(entry, cfg->irq_2_pin)
+ __io_apic_modify_irq(entry, mask_and, mask_or, final);
+}
+
+static void io_apic_sync(struct irq_pin_list *entry)
+{
+ /*
+ * Synchronize the IO-APIC and the CPU by doing
+ * a dummy read from the IO-APIC
+ */
+ struct io_apic __iomem *io_apic;
+
+ io_apic = io_apic_base(entry->apic);
+ readl(&io_apic->data);
+}
+
+static void mask_ioapic(struct irq_cfg *cfg)
+{
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&ioapic_lock, flags);
+ io_apic_modify_irq(cfg, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync);
+ raw_spin_unlock_irqrestore(&ioapic_lock, flags);
+}
+
+static void mask_ioapic_irq(struct irq_data *data)
+{
+ mask_ioapic(data->chip_data);
+}
+
+static void __unmask_ioapic(struct irq_cfg *cfg)
+{
+ io_apic_modify_irq(cfg, ~IO_APIC_REDIR_MASKED, 0, NULL);
+}
+
+static void unmask_ioapic(struct irq_cfg *cfg)
+{
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&ioapic_lock, flags);
+ __unmask_ioapic(cfg);
+ raw_spin_unlock_irqrestore(&ioapic_lock, flags);
+}
+
+static void unmask_ioapic_irq(struct irq_data *data)
+{
+ unmask_ioapic(data->chip_data);
+}
+
+/*
+ * IO-APIC versions below 0x20 don't support the EOI register.
+ * For the record, here is the information about various versions:
+ * 0Xh 82489DX
+ * 1Xh I/OAPIC or I/O(x)APIC which are not PCI 2.2 Compliant
+ * 2Xh I/O(x)APIC which is PCI 2.2 Compliant
+ * 30h-FFh Reserved
+ *
+ * Some of the Intel ICH specs (ICH2 to ICH5) document the io-apic
+ * version as 0x2. This is a documentation error: these ICH chips
+ * actually use io-apics of version 0x20.
+ *
+ * For IO-APIC's with EOI register, we use that to do an explicit EOI.
+ * Otherwise, we simulate the EOI message manually by changing the trigger
+ * mode to edge and then back to level, with RTE being masked during this.
+ */
+void native_eoi_ioapic_pin(int apic, int pin, int vector)
+{
+ if (mpc_ioapic_ver(apic) >= 0x20) {
+ io_apic_eoi(apic, vector);
+ } else {
+ struct IO_APIC_route_entry entry, entry1;
+
+ entry = entry1 = __ioapic_read_entry(apic, pin);
+
+ /*
+ * Mask the entry and change the trigger mode to edge.
+ */
+ entry1.mask = 1;
+ entry1.trigger = IOAPIC_EDGE;
+
+ __ioapic_write_entry(apic, pin, entry1);
+
+ /*
+ * Restore the previous level triggered entry.
+ */
+ __ioapic_write_entry(apic, pin, entry);
+ }
+}
+
+void eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg)
+{
+ struct irq_pin_list *entry;
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&ioapic_lock, flags);
+ for_each_irq_pin(entry, cfg->irq_2_pin)
+ x86_io_apic_ops.eoi_ioapic_pin(entry->apic, entry->pin,
+ cfg->vector);
+ raw_spin_unlock_irqrestore(&ioapic_lock, flags);
+}
+
+static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
+{
+ struct IO_APIC_route_entry entry;
+
+ /* Check delivery_mode to be sure we're not clearing an SMI pin */
+ entry = ioapic_read_entry(apic, pin);
+ if (entry.delivery_mode == dest_SMI)
+ return;
+
+ /*
+ * Make sure the entry is masked and re-read the contents to check
+ * if it is a level triggered pin and if the remote-IRR is set.
+ */
+ if (!entry.mask) {
+ entry.mask = 1;
+ ioapic_write_entry(apic, pin, entry);
+ entry = ioapic_read_entry(apic, pin);
+ }
+
+ if (entry.irr) {
+ unsigned long flags;
+
+ /*
+ * Make sure the trigger mode is set to level. Explicit EOI
+ * doesn't clear the remote-IRR if the trigger mode is not
+ * set to level.
+ */
+ if (!entry.trigger) {
+ entry.trigger = IOAPIC_LEVEL;
+ ioapic_write_entry(apic, pin, entry);
+ }
+
+ raw_spin_lock_irqsave(&ioapic_lock, flags);
+ x86_io_apic_ops.eoi_ioapic_pin(apic, pin, entry.vector);
+ raw_spin_unlock_irqrestore(&ioapic_lock, flags);
+ }
+
+ /*
+ * Clear the rest of the bits in the IO-APIC RTE except for the mask
+ * bit.
+ */
+ ioapic_mask_entry(apic, pin);
+ entry = ioapic_read_entry(apic, pin);
+ if (entry.irr)
+ pr_err("Unable to reset IRR for apic: %d, pin :%d\n",
+ mpc_ioapic_id(apic), pin);
+}
+
+static void clear_IO_APIC (void)
+{
+ int apic, pin;
+
+ for (apic = 0; apic < nr_ioapics; apic++)
+ for (pin = 0; pin < ioapics[apic].nr_registers; pin++)
+ clear_IO_APIC_pin(apic, pin);
+}
+#else
+#define add_pin_to_irq_node(cfg, node, apic, pin)
+#define __add_pin_to_irq_node(cfg, node, apic, pin) 0
+#endif /* !CONFIG_XEN */
+
+#ifdef CONFIG_X86_32
+/*
+ * support for broken MP BIOSs, enables hand-redirection of PIRQ0-7 to
+ * specific CPU-side IRQs.
+ */
+
+#define MAX_PIRQS 8
+static int pirq_entries[MAX_PIRQS] = {
+ [0 ... MAX_PIRQS - 1] = -1
+};
+
+static int __init ioapic_pirq_setup(char *str)
+{
+ int i, max;
+ int ints[MAX_PIRQS+1];
+
+ get_options(str, ARRAY_SIZE(ints), ints);
+
+ apic_printk(APIC_VERBOSE, KERN_INFO
+ "PIRQ redirection, working around broken MP-BIOS.\n");
+ max = MAX_PIRQS;
+ if (ints[0] < MAX_PIRQS)
+ max = ints[0];
+
+ for (i = 0; i < max; i++) {
+ apic_printk(APIC_VERBOSE, KERN_DEBUG
+ "... PIRQ%d -> IRQ %d\n", i, ints[i+1]);
+ /*
+ * PIRQs are mapped upside down, usually.
+ */
+ pirq_entries[MAX_PIRQS-i-1] = ints[i+1];
+ }
+ return 1;
+}
+
+__setup("pirq=", ioapic_pirq_setup);
+#endif /* CONFIG_X86_32 */
+
+#ifndef CONFIG_XEN
+/*
+ * Saves all the IO-APIC RTE's
+ */
+int save_ioapic_entries(void)
+{
+ int apic, pin;
+ int err = 0;
+
+ for (apic = 0; apic < nr_ioapics; apic++) {
+ if (!ioapics[apic].saved_registers) {
+ err = -ENOMEM;
+ continue;
+ }
+
+ for (pin = 0; pin < ioapics[apic].nr_registers; pin++)
+ ioapics[apic].saved_registers[pin] =
+ ioapic_read_entry(apic, pin);
+ }
+
+ return err;
+}
+
+/*
+ * Mask all IO APIC entries.
+ */
+void mask_ioapic_entries(void)
+{
+ int apic, pin;
+
+ for (apic = 0; apic < nr_ioapics; apic++) {
+ if (!ioapics[apic].saved_registers)
+ continue;
+
+ for (pin = 0; pin < ioapics[apic].nr_registers; pin++) {
+ struct IO_APIC_route_entry entry;
+
+ entry = ioapics[apic].saved_registers[pin];
+ if (!entry.mask) {
+ entry.mask = 1;
+ ioapic_write_entry(apic, pin, entry);
+ }
+ }
+ }
+}
+
+/*
+ * Restore IO APIC entries which were saved in the ioapic structure.
+ */
+int restore_ioapic_entries(void)
+{
+ int apic, pin;
+
+ for (apic = 0; apic < nr_ioapics; apic++) {
+ if (!ioapics[apic].saved_registers)
+ continue;
+
+ for (pin = 0; pin < ioapics[apic].nr_registers; pin++)
+ ioapic_write_entry(apic, pin,
+ ioapics[apic].saved_registers[pin]);
+ }
+ return 0;
+}
+#endif /* !CONFIG_XEN */
+
+/*
+ * Find the IRQ entry number of a certain pin.
+ */
+static int find_irq_entry(int ioapic_idx, int pin, int type)
+{
+ int i;
+
+ for (i = 0; i < mp_irq_entries; i++)
+ if (mp_irqs[i].irqtype == type &&
+ (mp_irqs[i].dstapic == mpc_ioapic_id(ioapic_idx) ||
+ mp_irqs[i].dstapic == MP_APIC_ALL) &&
+ mp_irqs[i].dstirq == pin)
+ return i;
+
+ return -1;
+}
+
+#ifndef CONFIG_XEN
+/*
+ * Find the pin to which IRQ[irq] (ISA) is connected
+ */
+static int __init find_isa_irq_pin(int irq, int type)
+{
+ int i;
+
+ for (i = 0; i < mp_irq_entries; i++) {
+ int lbus = mp_irqs[i].srcbus;
+
+ if (test_bit(lbus, mp_bus_not_pci) &&
+ (mp_irqs[i].irqtype == type) &&
+ (mp_irqs[i].srcbusirq == irq))
+
+ return mp_irqs[i].dstirq;
+ }
+ return -1;
+}
+
+static int __init find_isa_irq_apic(int irq, int type)
+{
+ int i;
+
+ for (i = 0; i < mp_irq_entries; i++) {
+ int lbus = mp_irqs[i].srcbus;
+
+ if (test_bit(lbus, mp_bus_not_pci) &&
+ (mp_irqs[i].irqtype == type) &&
+ (mp_irqs[i].srcbusirq == irq))
+ break;
+ }
+
+ if (i < mp_irq_entries) {
+ int ioapic_idx;
+
+ for (ioapic_idx = 0; ioapic_idx < nr_ioapics; ioapic_idx++)
+ if (mpc_ioapic_id(ioapic_idx) == mp_irqs[i].dstapic)
+ return ioapic_idx;
+ }
+
+ return -1;
+}
+#endif
+
+#ifdef CONFIG_EISA
+/*
+ * EISA Edge/Level control register, ELCR
+ */
+static int EISA_ELCR(unsigned int irq)
+{
+ if (irq < legacy_pic->nr_legacy_irqs) {
+ unsigned int port = 0x4d0 + (irq >> 3);
+ return (inb(port) >> (irq & 7)) & 1;
+ }
+ apic_printk(APIC_VERBOSE, KERN_INFO
+ "Broken MPtable reports ISA irq %d\n", irq);
+ return 0;
+}
+
+#endif
+
+/* ISA interrupts are always polarity zero edge triggered,
+ * when listed as conforming in the MP table. */
+
+#define default_ISA_trigger(idx) (0)
+#define default_ISA_polarity(idx) (0)
+
+/* EISA interrupts are always polarity zero and can be edge or level
+ * trigger depending on the ELCR value. If an interrupt is listed as
+ * EISA conforming in the MP table, that means its trigger type must
+ * be read in from the ELCR */
+
+#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].srcbusirq))
+#define default_EISA_polarity(idx) default_ISA_polarity(idx)
+
+/* PCI interrupts are always polarity one level triggered,
+ * when listed as conforming in the MP table. */
+
+#define default_PCI_trigger(idx) (1)
+#define default_PCI_polarity(idx) (1)
+
+static int irq_polarity(int idx)
+{
+ int bus = mp_irqs[idx].srcbus;
+ int polarity;
+
+ /*
+ * Determine IRQ line polarity (high active or low active):
+ */
+ switch (mp_irqs[idx].irqflag & 3)
+ {
+ case 0: /* conforms, ie. bus-type dependent polarity */
+ if (test_bit(bus, mp_bus_not_pci))
+ polarity = default_ISA_polarity(idx);
+ else
+ polarity = default_PCI_polarity(idx);
+ break;
+ case 1: /* high active */
+ {
+ polarity = 0;
+ break;
+ }
+ case 2: /* reserved */
+ {
+ pr_warn("broken BIOS!!\n");
+ polarity = 1;
+ break;
+ }
+ case 3: /* low active */
+ {
+ polarity = 1;
+ break;
+ }
+ default: /* invalid */
+ {
+ pr_warn("broken BIOS!!\n");
+ polarity = 1;
+ break;
+ }
+ }
+ return polarity;
+}
+
+static int irq_trigger(int idx)
+{
+ int bus = mp_irqs[idx].srcbus;
+ int trigger;
+
+ /*
+ * Determine IRQ trigger mode (edge or level sensitive):
+ */
+ switch ((mp_irqs[idx].irqflag>>2) & 3)
+ {
+ case 0: /* conforms, ie. bus-type dependent */
+ if (test_bit(bus, mp_bus_not_pci))
+ trigger = default_ISA_trigger(idx);
+ else
+ trigger = default_PCI_trigger(idx);
+#ifdef CONFIG_EISA
+ switch (mp_bus_id_to_type[bus]) {
+ case MP_BUS_ISA: /* ISA pin */
+ {
+ /* set before the switch */
+ break;
+ }
+ case MP_BUS_EISA: /* EISA pin */
+ {
+ trigger = default_EISA_trigger(idx);
+ break;
+ }
+ case MP_BUS_PCI: /* PCI pin */
+ {
+ /* set before the switch */
+ break;
+ }
+ default:
+ {
+ pr_warn("broken BIOS!!\n");
+ trigger = 1;
+ break;
+ }
+ }
+#endif
+ break;
+ case 1: /* edge */
+ {
+ trigger = 0;
+ break;
+ }
+ case 2: /* reserved */
+ {
+ pr_warn("broken BIOS!!\n");
+ trigger = 1;
+ break;
+ }
+ case 3: /* level */
+ {
+ trigger = 1;
+ break;
+ }
+ default: /* invalid */
+ {
+ pr_warn("broken BIOS!!\n");
+ trigger = 0;
+ break;
+ }
+ }
+ return trigger;
+}
+
+static int pin_2_irq(int idx, int apic, int pin)
+{
+ int irq;
+ int bus = mp_irqs[idx].srcbus;
+ struct mp_ioapic_gsi *gsi_cfg = mp_ioapic_gsi_routing(apic);
+
+ /*
+ * Debugging check, we are in big trouble if this message pops up!
+ */
+ if (mp_irqs[idx].dstirq != pin)
+ pr_err("broken BIOS or MPTABLE parser, ayiee!!\n");
+
+ if (test_bit(bus, mp_bus_not_pci)) {
+ irq = mp_irqs[idx].srcbusirq;
+ } else {
+ u32 gsi = gsi_cfg->gsi_base + pin;
+
+ if (gsi >= NR_IRQS_LEGACY)
+ irq = gsi;
+ else
+ irq = gsi_top + gsi;
+ }
+
+#ifdef CONFIG_X86_32
+ /*
+ * PCI IRQ command line redirection. Yes, limits are hardcoded.
+ */
+ if ((pin >= 16) && (pin <= 23)) {
+ if (pirq_entries[pin-16] != -1) {
+ if (!pirq_entries[pin-16]) {
+ apic_printk(APIC_VERBOSE, KERN_DEBUG
+ "disabling PIRQ%d\n", pin-16);
+ } else {
+ irq = pirq_entries[pin-16];
+ apic_printk(APIC_VERBOSE, KERN_DEBUG
+ "using PIRQ%d -> IRQ %d\n",
+ pin-16, irq);
+ }
+ }
+ }
+#endif
+
+ return irq;
+}
+
+/*
+ * Find a specific PCI IRQ entry.
+ * Not an __init, possibly needed by modules
+ */
+int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin,
+ struct io_apic_irq_attr *irq_attr)
+{
+ int ioapic_idx, i, best_guess = -1;
+
+ apic_printk(APIC_DEBUG,
+ "querying PCI -> IRQ mapping bus:%d, slot:%d, pin:%d.\n",
+ bus, slot, pin);
+ if (test_bit(bus, mp_bus_not_pci)) {
+ apic_printk(APIC_VERBOSE,
+ "PCI BIOS passed nonexistent PCI bus %d!\n", bus);
+ return -1;
+ }
+ for (i = 0; i < mp_irq_entries; i++) {
+ int lbus = mp_irqs[i].srcbus;
+
+ for (ioapic_idx = 0; ioapic_idx < nr_ioapics; ioapic_idx++)
+ if (mpc_ioapic_id(ioapic_idx) == mp_irqs[i].dstapic ||
+ mp_irqs[i].dstapic == MP_APIC_ALL)
+ break;
+
+ if (!test_bit(lbus, mp_bus_not_pci) &&
+ !mp_irqs[i].irqtype &&
+ (bus == lbus) &&
+ (slot == ((mp_irqs[i].srcbusirq >> 2) & 0x1f))) {
+ int irq = pin_2_irq(i, ioapic_idx, mp_irqs[i].dstirq);
+
+ if (!(ioapic_idx || IO_APIC_IRQ(irq)))
+ continue;
+
+ if (pin == (mp_irqs[i].srcbusirq & 3)) {
+ set_io_apic_irq_attr(irq_attr, ioapic_idx,
+ mp_irqs[i].dstirq,
+ irq_trigger(i),
+ irq_polarity(i));
+ return irq;
+ }
+ /*
+ * Use the first all-but-pin matching entry as a
+ * best-guess fuzzy result for broken mptables.
+ */
+ if (best_guess < 0) {
+ set_io_apic_irq_attr(irq_attr, ioapic_idx,
+ mp_irqs[i].dstirq,
+ irq_trigger(i),
+ irq_polarity(i));
+ best_guess = irq;
+ }
+ }
+ }
+ return best_guess;
+}
+EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector);
+
+#ifndef CONFIG_XEN
+void lock_vector_lock(void)
+{
+	/* Used so that the online set of cpus does not change
+ * during assign_irq_vector.
+ */
+ raw_spin_lock(&vector_lock);
+}
+
+void unlock_vector_lock(void)
+{
+ raw_spin_unlock(&vector_lock);
+}
+
+static int
+__assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)
+{
+ /*
+ * NOTE! The local APIC isn't very good at handling
+ * multiple interrupts at the same interrupt level.
+ * As the interrupt level is determined by taking the
+ * vector number and shifting that right by 4, we
+ * want to spread these out a bit so that they don't
+ * all fall in the same interrupt level.
+ *
+ * Also, we've got to be careful not to trash gate
+ * 0x80, because int 0x80 is hm, kind of importantish. ;)
+ */
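+	/*
+	 * Hence the allocator below advances in steps of 16: consecutively
+	 * allocated vectors differ in (vector >> 4) and so land on
+	 * different priority levels, until the offset wraps and the next
+	 * column of the vector space is scanned.
+	 */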
+ static int current_vector = FIRST_EXTERNAL_VECTOR + VECTOR_OFFSET_START;
+ static int current_offset = VECTOR_OFFSET_START % 16;
+ int cpu, err;
+ cpumask_var_t tmp_mask;
+
+ if (cfg->move_in_progress)
+ return -EBUSY;
+
+ if (!alloc_cpumask_var(&tmp_mask, GFP_ATOMIC))
+ return -ENOMEM;
+
+ /* Only try and allocate irqs on cpus that are present */
+ err = -ENOSPC;
+ cpumask_clear(cfg->old_domain);
+ cpu = cpumask_first_and(mask, cpu_online_mask);
+ while (cpu < nr_cpu_ids) {
+ int new_cpu, vector, offset;
+
+ apic->vector_allocation_domain(cpu, tmp_mask, mask);
+
+ if (cpumask_subset(tmp_mask, cfg->domain)) {
+ err = 0;
+ if (cpumask_equal(tmp_mask, cfg->domain))
+ break;
+ /*
+ * New cpumask using the vector is a proper subset of
+ * the current in use mask. So cleanup the vector
+ * allocation for the members that are not used anymore.
+ */
+ cpumask_andnot(cfg->old_domain, cfg->domain, tmp_mask);
+ cfg->move_in_progress =
+ cpumask_intersects(cfg->old_domain, cpu_online_mask);
+ cpumask_and(cfg->domain, cfg->domain, tmp_mask);
+ break;
+ }
+
+ vector = current_vector;
+ offset = current_offset;
+next:
+ vector += 16;
+ if (vector >= first_system_vector) {
+ offset = (offset + 1) % 16;
+ vector = FIRST_EXTERNAL_VECTOR + offset;
+ }
+
+ if (unlikely(current_vector == vector)) {
+ cpumask_or(cfg->old_domain, cfg->old_domain, tmp_mask);
+ cpumask_andnot(tmp_mask, mask, cfg->old_domain);
+ cpu = cpumask_first_and(tmp_mask, cpu_online_mask);
+ continue;
+ }
+
+ if (test_bit(vector, used_vectors))
+ goto next;
+
+#ifdef CONFIG_KDB
+ if (vector == KDBENTER_VECTOR)
+ goto next;
+#endif /* CONFIG_KDB */
+ for_each_cpu_and(new_cpu, tmp_mask, cpu_online_mask)
+ if (per_cpu(vector_irq, new_cpu)[vector] != -1)
+ goto next;
+ /* Found one! */
+ current_vector = vector;
+ current_offset = offset;
+ if (cfg->vector) {
+ cpumask_copy(cfg->old_domain, cfg->domain);
+ cfg->move_in_progress =
+ cpumask_intersects(cfg->old_domain, cpu_online_mask);
+ }
+ for_each_cpu_and(new_cpu, tmp_mask, cpu_online_mask)
+ per_cpu(vector_irq, new_cpu)[vector] = irq;
+ cfg->vector = vector;
+ cpumask_copy(cfg->domain, tmp_mask);
+ err = 0;
+ break;
+ }
+ free_cpumask_var(tmp_mask);
+ return err;
+}
+
+int assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)
+{
+ int err;
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&vector_lock, flags);
+ err = __assign_irq_vector(irq, cfg, mask);
+ raw_spin_unlock_irqrestore(&vector_lock, flags);
+ return err;
+}
+
+static void __clear_irq_vector(int irq, struct irq_cfg *cfg)
+{
+ int cpu, vector;
+
+ BUG_ON(!cfg->vector);
+
+ vector = cfg->vector;
+ for_each_cpu_and(cpu, cfg->domain, cpu_online_mask)
+ per_cpu(vector_irq, cpu)[vector] = -1;
+
+ cfg->vector = 0;
+ cpumask_clear(cfg->domain);
+
+ if (likely(!cfg->move_in_progress))
+ return;
+ for_each_cpu_and(cpu, cfg->old_domain, cpu_online_mask) {
+ for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS;
+ vector++) {
+ if (per_cpu(vector_irq, cpu)[vector] != irq)
+ continue;
+ per_cpu(vector_irq, cpu)[vector] = -1;
+ break;
+ }
+ }
+ cfg->move_in_progress = 0;
+}
+
+void __setup_vector_irq(int cpu)
+{
+ /* Initialize vector_irq on a new cpu */
+ int irq, vector;
+ struct irq_cfg *cfg;
+
+ /*
+ * vector_lock will make sure that we don't run into irq vector
+ * assignments that might be happening on another cpu in parallel,
+ * while we setup our initial vector to irq mappings.
+ */
+ raw_spin_lock(&vector_lock);
+ /* Mark the inuse vectors */
+ for_each_active_irq(irq) {
+ cfg = irq_get_chip_data(irq);
+ if (!cfg)
+ continue;
+
+ if (!cpumask_test_cpu(cpu, cfg->domain))
+ continue;
+ vector = cfg->vector;
+ per_cpu(vector_irq, cpu)[vector] = irq;
+ }
+ /* Mark the free vectors */
+ for (vector = 0; vector < NR_VECTORS; ++vector) {
+ irq = per_cpu(vector_irq, cpu)[vector];
+ if (irq < 0)
+ continue;
+
+ cfg = irq_cfg(irq);
+ if (!cpumask_test_cpu(cpu, cfg->domain))
+ per_cpu(vector_irq, cpu)[vector] = -1;
+ }
+ raw_spin_unlock(&vector_lock);
+}
+
+static struct irq_chip ioapic_chip;
+
+#ifdef CONFIG_X86_32
+static inline int IO_APIC_irq_trigger(int irq)
+{
+ int apic, idx, pin;
+
+ for (apic = 0; apic < nr_ioapics; apic++) {
+ for (pin = 0; pin < ioapics[apic].nr_registers; pin++) {
+ idx = find_irq_entry(apic, pin, mp_INT);
+ if ((idx != -1) && (irq == pin_2_irq(idx, apic, pin)))
+ return irq_trigger(idx);
+ }
+ }
+ /*
+ * nonexistent IRQs are edge default
+ */
+ return 0;
+}
+#else
+static inline int IO_APIC_irq_trigger(int irq)
+{
+ return 1;
+}
+#endif
+
+static void ioapic_register_intr(unsigned int irq, struct irq_cfg *cfg,
+ unsigned long trigger)
+{
+ struct irq_chip *chip = &ioapic_chip;
+ irq_flow_handler_t hdl;
+ bool fasteoi;
+
+ if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
+ trigger == IOAPIC_LEVEL) {
+ irq_set_status_flags(irq, IRQ_LEVEL);
+ fasteoi = true;
+ } else {
+ irq_clear_status_flags(irq, IRQ_LEVEL);
+ fasteoi = false;
+ }
+
+ if (setup_remapped_irq(irq, cfg, chip))
+ fasteoi = trigger != 0;
+
+ hdl = fasteoi ? handle_fasteoi_irq : handle_edge_irq;
+ irq_set_chip_and_handler_name(irq, chip, hdl,
+ fasteoi ? "fasteoi" : "edge");
+}
+#else /* !CONFIG_XEN */
+#define __clear_irq_vector(irq, cfg) ((void)0)
+#define ioapic_register_intr(irq, cfg, trigger) evtchn_register_pirq(irq)
+#endif
+
+#ifndef CONFIG_XEN
+int native_setup_ioapic_entry(int irq, struct IO_APIC_route_entry *entry,
+#else
+static int setup_ioapic_entry(int irq, struct IO_APIC_route_entry *entry,
+#endif
+ unsigned int destination, int vector,
+ struct io_apic_irq_attr *attr)
+{
+ memset(entry, 0, sizeof(*entry));
+
+ entry->delivery_mode = apic->irq_delivery_mode;
+ entry->dest_mode = apic->irq_dest_mode;
+ entry->dest = destination;
+ entry->vector = vector;
+ entry->mask = 0; /* enable IRQ */
+ entry->trigger = attr->trigger;
+ entry->polarity = attr->polarity;
+
+ /*
+ * Mask level triggered irqs.
+ * Use IRQ_DELAYED_DISABLE for edge triggered irqs.
+ */
+ if (attr->trigger)
+ entry->mask = 1;
+
+ return 0;
+}
+
+static void setup_ioapic_irq(unsigned int irq, struct irq_cfg *cfg,
+ struct io_apic_irq_attr *attr)
+{
+ struct IO_APIC_route_entry entry;
+ unsigned int dest;
+
+ if (!IO_APIC_IRQ(irq))
+ return;
+
+#ifdef CONFIG_XEN
+ /*
+ * For legacy IRQs we may get here before trigger mode and polarity
+	 * have been obtained, but Xen refuses to set those through
+ * PHYSDEVOP_setup_gsi more than once (perhaps even at all).
+ */
+ if (irq >= legacy_pic->nr_legacy_irqs
+ || test_bit(attr->ioapic_pin,
+ ioapics[attr->ioapic].pin_programmed)) {
+ struct physdev_setup_gsi setup_gsi = {
+ .gsi = irq,
+ .triggering = attr->trigger,
+ .polarity = attr->polarity
+ };
+ struct physdev_map_pirq map_pirq = {
+ .domid = DOMID_SELF,
+ .type = MAP_PIRQ_TYPE_GSI,
+ .index = irq,
+ .pirq = irq
+ };
+
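+		/*
+		 * Register the GSI's trigger mode and polarity with Xen,
+		 * then map it to a pirq for this domain.  -EEXIST means
+		 * the GSI was set up earlier: legacy IRQs then fall back
+		 * to the normal path below, others still get the pirq
+		 * mapped.
+		 */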
+ switch (HYPERVISOR_physdev_op(PHYSDEVOP_setup_gsi,
+ &setup_gsi)) {
+ case -EEXIST:
+ if (irq < legacy_pic->nr_legacy_irqs)
+ break;
+ /* fall through */
+ case 0:
+ evtchn_register_pirq(irq);
+ if (HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq,
+ &map_pirq) == 0) {
+ /* fake (for init_IO_APIC_traps()): */
+ cfg->vector = irq;
+ return;
+ }
+ }
+ }
+#endif
+
+ if (assign_irq_vector(irq, cfg, apic->target_cpus()))
+ return;
+
+#ifndef CONFIG_XEN
+ if (apic->cpu_mask_to_apicid_and(cfg->domain, apic->target_cpus(),
+ &dest)) {
+ pr_warn("Failed to obtain apicid for ioapic %d, pin %d\n",
+ mpc_ioapic_id(attr->ioapic), attr->ioapic_pin);
+ __clear_irq_vector(irq, cfg);
+
+ return;
+ }
+#else
+ dest = 0; /* meaningless */
+#endif
+
+	apic_printk(APIC_VERBOSE, KERN_DEBUG
+ "IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> "
+ "IRQ %d Mode:%i Active:%i Dest:%d)\n",
+ attr->ioapic, mpc_ioapic_id(attr->ioapic), attr->ioapic_pin,
+ cfg->vector, irq, attr->trigger, attr->polarity, dest);
+
+#ifndef CONFIG_XEN
+ if (x86_io_apic_ops.setup_entry(irq, &entry, dest, cfg->vector, attr)) {
+#else
+ if (setup_ioapic_entry(irq, &entry, dest, cfg->vector, attr)) {
+#endif
+ pr_warn("Failed to setup ioapic entry for ioapic %d, pin %d\n",
+ mpc_ioapic_id(attr->ioapic), attr->ioapic_pin);
+ __clear_irq_vector(irq, cfg);
+
+ return;
+ }
+
+ ioapic_register_intr(irq, cfg, attr->trigger);
+#ifndef CONFIG_XEN
+ if (irq < legacy_pic->nr_legacy_irqs)
+ legacy_pic->mask(irq);
+#endif
+
+ ioapic_write_entry(attr->ioapic, attr->ioapic_pin, entry);
+}
+
+static bool __init io_apic_pin_not_connected(int idx, int ioapic_idx, int pin)
+{
+ if (idx != -1)
+ return false;
+
+ apic_printk(APIC_VERBOSE, KERN_DEBUG " apic %d pin %d not connected\n",
+ mpc_ioapic_id(ioapic_idx), pin);
+ return true;
+}
+
+static void __init __io_apic_setup_irqs(unsigned int ioapic_idx)
+{
+ int idx, node = cpu_to_node(0);
+ struct io_apic_irq_attr attr;
+ unsigned int pin, irq;
+
+ for (pin = 0; pin < ioapics[ioapic_idx].nr_registers; pin++) {
+ idx = find_irq_entry(ioapic_idx, pin, mp_INT);
+ if (io_apic_pin_not_connected(idx, ioapic_idx, pin))
+ continue;
+
+ irq = pin_2_irq(idx, ioapic_idx, pin);
+
+ if ((ioapic_idx > 0) && (irq > 16))
+ continue;
+
+#ifdef CONFIG_XEN
+ if (irq < PIRQ_BASE || irq >= PIRQ_BASE + nr_pirqs)
+ continue;
+#else
+ /*
+ * Skip the timer IRQ if there's a quirk handler
+ * installed and if it returns 1:
+ */
+ if (apic->multi_timer_check &&
+ apic->multi_timer_check(ioapic_idx, irq))
+ continue;
+#endif
+
+ set_io_apic_irq_attr(&attr, ioapic_idx, pin, irq_trigger(idx),
+ irq_polarity(idx));
+
+ io_apic_setup_irq_pin(irq, node, &attr);
+ }
+}
+
+static void __init setup_IO_APIC_irqs(void)
+{
+ unsigned int ioapic_idx;
+
+ apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
+
+ for (ioapic_idx = 0; ioapic_idx < nr_ioapics; ioapic_idx++)
+ __io_apic_setup_irqs(ioapic_idx);
+}
+
+/*
+ * For GSIs that are not on the first ioapic and could not be set up
+ * via acpi_register_gsi(), like some special SCIs on the IBM x3330.
+ */
+void setup_IO_APIC_irq_extra(u32 gsi)
+{
+ int ioapic_idx = 0, pin, idx, irq, node = cpu_to_node(0);
+ struct io_apic_irq_attr attr;
+
+ /*
+ * Convert 'gsi' to 'ioapic.pin'.
+ */
+ ioapic_idx = mp_find_ioapic(gsi);
+ if (ioapic_idx < 0)
+ return;
+
+ pin = mp_find_ioapic_pin(ioapic_idx, gsi);
+ idx = find_irq_entry(ioapic_idx, pin, mp_INT);
+ if (idx == -1)
+ return;
+
+ irq = pin_2_irq(idx, ioapic_idx, pin);
+#ifdef CONFIG_XEN
+ if (irq < PIRQ_BASE || irq >= PIRQ_BASE + nr_pirqs)
+ return;
+#endif
+
+ /* Only handle the non legacy irqs on secondary ioapics */
+ if (ioapic_idx == 0 || irq < NR_IRQS_LEGACY)
+ return;
+
+ set_io_apic_irq_attr(&attr, ioapic_idx, pin, irq_trigger(idx),
+ irq_polarity(idx));
+
+ io_apic_setup_irq_pin_once(irq, node, &attr);
+}
+
+#ifndef CONFIG_XEN
+/*
+ * Set up the timer pin, possibly with the 8259A-master behind.
+ */
+static void __init setup_timer_IRQ0_pin(unsigned int ioapic_idx,
+ unsigned int pin, int vector)
+{
+ struct IO_APIC_route_entry entry;
+ unsigned int dest;
+
+ memset(&entry, 0, sizeof(entry));
+
+ /*
+ * We use logical delivery to get the timer IRQ
+ * to the first CPU.
+ */
+ if (unlikely(apic->cpu_mask_to_apicid_and(apic->target_cpus(),
+ apic->target_cpus(), &dest)))
+ dest = BAD_APICID;
+
+ entry.dest_mode = apic->irq_dest_mode;
+ entry.mask = 0; /* don't mask IRQ for edge */
+ entry.dest = dest;
+ entry.delivery_mode = apic->irq_delivery_mode;
+ entry.polarity = 0;
+ entry.trigger = 0;
+ entry.vector = vector;
+
+ /*
+ * The timer IRQ doesn't have to know that behind the
+	 * scenes we may have an 8259A-master in AEOI mode ...
+ */
+ irq_set_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq,
+ "edge");
+
+ /*
+ * Add it to the IO-APIC irq-routing table:
+ */
+ ioapic_write_entry(ioapic_idx, pin, entry);
+}
+
+void native_io_apic_print_entries(unsigned int apic, unsigned int nr_entries)
+{
+ int i;
+
+ pr_debug(" NR Dst Mask Trig IRR Pol Stat Dmod Deli Vect:\n");
+
+ for (i = 0; i <= nr_entries; i++) {
+ struct IO_APIC_route_entry entry;
+
+ entry = ioapic_read_entry(apic, i);
+
+ pr_debug(" %02x %02X ", i, entry.dest);
+ pr_cont("%1d %1d %1d %1d %1d "
+ "%1d %1d %02X\n",
+ entry.mask,
+ entry.trigger,
+ entry.irr,
+ entry.polarity,
+ entry.delivery_status,
+ entry.dest_mode,
+ entry.delivery_mode,
+ entry.vector);
+ }
+}
+
+void intel_ir_io_apic_print_entries(unsigned int apic,
+ unsigned int nr_entries)
+{
+ int i;
+
+ pr_debug(" NR Indx Fmt Mask Trig IRR Pol Stat Indx2 Zero Vect:\n");
+
+ for (i = 0; i <= nr_entries; i++) {
+ struct IR_IO_APIC_route_entry *ir_entry;
+ struct IO_APIC_route_entry entry;
+
+ entry = ioapic_read_entry(apic, i);
+
+ ir_entry = (struct IR_IO_APIC_route_entry *)&entry;
+
+ pr_debug(" %02x %04X ", i, ir_entry->index);
+ pr_cont("%1d %1d %1d %1d %1d "
+ "%1d %1d %X %02X\n",
+ ir_entry->format,
+ ir_entry->mask,
+ ir_entry->trigger,
+ ir_entry->irr,
+ ir_entry->polarity,
+ ir_entry->delivery_status,
+ ir_entry->index2,
+ ir_entry->zero,
+ ir_entry->vector);
+ }
+}
+
+__apicdebuginit(void) print_IO_APIC(int ioapic_idx)
+{
+ union IO_APIC_reg_00 reg_00;
+ union IO_APIC_reg_01 reg_01;
+ union IO_APIC_reg_02 reg_02;
+ union IO_APIC_reg_03 reg_03;
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&ioapic_lock, flags);
+ reg_00.raw = io_apic_read(ioapic_idx, 0);
+ reg_01.raw = io_apic_read(ioapic_idx, 1);
+ if (reg_01.bits.version >= 0x10)
+ reg_02.raw = io_apic_read(ioapic_idx, 2);
+ if (reg_01.bits.version >= 0x20)
+ reg_03.raw = io_apic_read(ioapic_idx, 3);
+ raw_spin_unlock_irqrestore(&ioapic_lock, flags);
+
+ printk(KERN_DEBUG "IO APIC #%d......\n", mpc_ioapic_id(ioapic_idx));
+ printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw);
+ printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID);
+ printk(KERN_DEBUG "....... : Delivery Type: %X\n", reg_00.bits.delivery_type);
+ printk(KERN_DEBUG "....... : LTS : %X\n", reg_00.bits.LTS);
+
+ printk(KERN_DEBUG ".... register #01: %08X\n", *(int *)&reg_01);
+ printk(KERN_DEBUG "....... : max redirection entries: %02X\n",
+ reg_01.bits.entries);
+
+ printk(KERN_DEBUG "....... : PRQ implemented: %X\n", reg_01.bits.PRQ);
+ printk(KERN_DEBUG "....... : IO APIC version: %02X\n",
+ reg_01.bits.version);
+
+ /*
+ * Some Intel chipsets with IO APIC VERSION of 0x1? don't have reg_02,
+ * but the value of reg_02 is read as the previous read register
+ * value, so ignore it if reg_02 == reg_01.
+ */
+ if (reg_01.bits.version >= 0x10 && reg_02.raw != reg_01.raw) {
+ printk(KERN_DEBUG ".... register #02: %08X\n", reg_02.raw);
+ printk(KERN_DEBUG "....... : arbitration: %02X\n", reg_02.bits.arbitration);
+ }
+
+ /*
+ * Some Intel chipsets with IO APIC VERSION of 0x2? don't have reg_02
+ * or reg_03, but the value of reg_0[23] is read as the previous read
+ * register value, so ignore it if reg_03 == reg_0[12].
+ */
+ if (reg_01.bits.version >= 0x20 && reg_03.raw != reg_02.raw &&
+ reg_03.raw != reg_01.raw) {
+ printk(KERN_DEBUG ".... register #03: %08X\n", reg_03.raw);
+ printk(KERN_DEBUG "....... : Boot DT : %X\n", reg_03.bits.boot_DT);
+ }
+
+ printk(KERN_DEBUG ".... IRQ redirection table:\n");
+
+ x86_io_apic_ops.print_entries(ioapic_idx, reg_01.bits.entries);
+}
+
+__apicdebuginit(void) print_IO_APICs(void)
+{
+ int ioapic_idx;
+ struct irq_cfg *cfg;
+ unsigned int irq;
+ struct irq_chip *chip;
+
+ printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
+ for (ioapic_idx = 0; ioapic_idx < nr_ioapics; ioapic_idx++)
+ printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n",
+ mpc_ioapic_id(ioapic_idx),
+ ioapics[ioapic_idx].nr_registers);
+
+ /*
+ * We are a bit conservative about what we expect. We have to
+ * know about every hardware change ASAP.
+ */
+ printk(KERN_INFO "testing the IO APIC.......................\n");
+
+ for (ioapic_idx = 0; ioapic_idx < nr_ioapics; ioapic_idx++)
+ print_IO_APIC(ioapic_idx);
+
+ printk(KERN_DEBUG "IRQ to pin mappings:\n");
+ for_each_active_irq(irq) {
+ struct irq_pin_list *entry;
+
+ chip = irq_get_chip(irq);
+ if (chip != &ioapic_chip)
+ continue;
+
+ cfg = irq_get_chip_data(irq);
+ if (!cfg)
+ continue;
+ entry = cfg->irq_2_pin;
+ if (!entry)
+ continue;
+ printk(KERN_DEBUG "IRQ%d ", irq);
+ for_each_irq_pin(entry, cfg->irq_2_pin)
+ pr_cont("-> %d:%d", entry->apic, entry->pin);
+ pr_cont("\n");
+ }
+
+ printk(KERN_INFO ".................................... done.\n");
+}
+
+__apicdebuginit(void) print_APIC_field(int base)
+{
+ int i;
+
+ printk(KERN_DEBUG);
+
+ for (i = 0; i < 8; i++)
+ pr_cont("%08x", apic_read(base + i*0x10));
+
+ pr_cont("\n");
+}
+
+__apicdebuginit(void) print_local_APIC(void *dummy)
+{
+ unsigned int i, v, ver, maxlvt;
+ u64 icr;
+
+ printk(KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n",
+ smp_processor_id(), hard_smp_processor_id());
+ v = apic_read(APIC_ID);
+ printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v, read_apic_id());
+ v = apic_read(APIC_LVR);
+ printk(KERN_INFO "... APIC VERSION: %08x\n", v);
+ ver = GET_APIC_VERSION(v);
+ maxlvt = lapic_get_maxlvt();
+
+ v = apic_read(APIC_TASKPRI);
+ printk(KERN_DEBUG "... APIC TASKPRI: %08x (%02x)\n", v, v & APIC_TPRI_MASK);
+
+ if (APIC_INTEGRATED(ver)) { /* !82489DX */
+ if (!APIC_XAPIC(ver)) {
+ v = apic_read(APIC_ARBPRI);
+ printk(KERN_DEBUG "... APIC ARBPRI: %08x (%02x)\n", v,
+ v & APIC_ARBPRI_MASK);
+ }
+ v = apic_read(APIC_PROCPRI);
+ printk(KERN_DEBUG "... APIC PROCPRI: %08x\n", v);
+ }
+
+ /*
+ * Remote read supported only in the 82489DX and local APIC for
+ * Pentium processors.
+ */
+ if (!APIC_INTEGRATED(ver) || maxlvt == 3) {
+ v = apic_read(APIC_RRR);
+ printk(KERN_DEBUG "... APIC RRR: %08x\n", v);
+ }
+
+ v = apic_read(APIC_LDR);
+ printk(KERN_DEBUG "... APIC LDR: %08x\n", v);
+ if (!x2apic_enabled()) {
+ v = apic_read(APIC_DFR);
+ printk(KERN_DEBUG "... APIC DFR: %08x\n", v);
+ }
+ v = apic_read(APIC_SPIV);
+ printk(KERN_DEBUG "... APIC SPIV: %08x\n", v);
+
+ printk(KERN_DEBUG "... APIC ISR field:\n");
+ print_APIC_field(APIC_ISR);
+ printk(KERN_DEBUG "... APIC TMR field:\n");
+ print_APIC_field(APIC_TMR);
+ printk(KERN_DEBUG "... APIC IRR field:\n");
+ print_APIC_field(APIC_IRR);
+
+ if (APIC_INTEGRATED(ver)) { /* !82489DX */
+ if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
+ apic_write(APIC_ESR, 0);
+
+ v = apic_read(APIC_ESR);
+ printk(KERN_DEBUG "... APIC ESR: %08x\n", v);
+ }
+
+ icr = apic_icr_read();
+ printk(KERN_DEBUG "... APIC ICR: %08x\n", (u32)icr);
+ printk(KERN_DEBUG "... APIC ICR2: %08x\n", (u32)(icr >> 32));
+
+ v = apic_read(APIC_LVTT);
+ printk(KERN_DEBUG "... APIC LVTT: %08x\n", v);
+
+ if (maxlvt > 3) { /* PC is LVT#4. */
+ v = apic_read(APIC_LVTPC);
+ printk(KERN_DEBUG "... APIC LVTPC: %08x\n", v);
+ }
+ v = apic_read(APIC_LVT0);
+ printk(KERN_DEBUG "... APIC LVT0: %08x\n", v);
+ v = apic_read(APIC_LVT1);
+ printk(KERN_DEBUG "... APIC LVT1: %08x\n", v);
+
+ if (maxlvt > 2) { /* ERR is LVT#3. */
+ v = apic_read(APIC_LVTERR);
+ printk(KERN_DEBUG "... APIC LVTERR: %08x\n", v);
+ }
+
+ v = apic_read(APIC_TMICT);
+ printk(KERN_DEBUG "... APIC TMICT: %08x\n", v);
+ v = apic_read(APIC_TMCCT);
+ printk(KERN_DEBUG "... APIC TMCCT: %08x\n", v);
+ v = apic_read(APIC_TDCR);
+ printk(KERN_DEBUG "... APIC TDCR: %08x\n", v);
+
+ if (boot_cpu_has(X86_FEATURE_EXTAPIC)) {
+ v = apic_read(APIC_EFEAT);
+ maxlvt = (v >> 16) & 0xff;
+ printk(KERN_DEBUG "... APIC EFEAT: %08x\n", v);
+ v = apic_read(APIC_ECTRL);
+ printk(KERN_DEBUG "... APIC ECTRL: %08x\n", v);
+ for (i = 0; i < maxlvt; i++) {
+ v = apic_read(APIC_EILVTn(i));
+ printk(KERN_DEBUG "... APIC EILVT%d: %08x\n", i, v);
+ }
+ }
+ pr_cont("\n");
+}
+
+__apicdebuginit(void) print_local_APICs(int maxcpu)
+{
+ int cpu;
+
+ if (!maxcpu)
+ return;
+
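+	/*
+	 * Disable preemption so this walk stays on one CPU while each
+	 * online CPU is IPI'd to dump its own local APIC.
+	 */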
+ preempt_disable();
+ for_each_online_cpu(cpu) {
+ if (cpu >= maxcpu)
+ break;
+ smp_call_function_single(cpu, print_local_APIC, NULL, 1);
+ }
+ preempt_enable();
+}
+
+__apicdebuginit(void) print_PIC(void)
+{
+ unsigned int v;
+ unsigned long flags;
+
+ if (!legacy_pic->nr_legacy_irqs)
+ return;
+
+ printk(KERN_DEBUG "\nprinting PIC contents\n");
+
+ raw_spin_lock_irqsave(&i8259A_lock, flags);
+
+ v = inb(0xa1) << 8 | inb(0x21);
+ printk(KERN_DEBUG "... PIC IMR: %04x\n", v);
+
+ v = inb(0xa0) << 8 | inb(0x20);
+ printk(KERN_DEBUG "... PIC IRR: %04x\n", v);
+
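+	/*
+	 * OCW3 value 0x0b selects the ISR for the next register read;
+	 * 0x0a restores the default IRR read mode afterwards.
+	 */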
+	outb(0x0b, 0xa0);
+	outb(0x0b, 0x20);
+	v = inb(0xa0) << 8 | inb(0x20);
+	outb(0x0a, 0xa0);
+	outb(0x0a, 0x20);
+
+ raw_spin_unlock_irqrestore(&i8259A_lock, flags);
+
+ printk(KERN_DEBUG "... PIC ISR: %04x\n", v);
+
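+	/* The ELCR at ports 0x4d0/0x4d1 holds the per-IRQ edge/level trigger bits. */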
+ v = inb(0x4d1) << 8 | inb(0x4d0);
+ printk(KERN_DEBUG "... PIC ELCR: %04x\n", v);
+}
+
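+/*
+ * "show_lapic=<n>" limits the dump to the first <n> CPUs;
+ * "show_lapic=all" dumps all of them (the default is 1).
+ */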
+static int __initdata show_lapic = 1;
+static __init int setup_show_lapic(char *arg)
+{
+ int num = -1;
+
+ if (strcmp(arg, "all") == 0) {
+ show_lapic = CONFIG_NR_CPUS;
+ } else {
+ get_option(&arg, &num);
+ if (num >= 0)
+ show_lapic = num;
+ }
+
+ return 1;
+}
+__setup("show_lapic=", setup_show_lapic);
+
+__apicdebuginit(int) print_ICs(void)
+{
+ if (apic_verbosity == APIC_QUIET)
+ return 0;
+
+ print_PIC();
+
+	/* don't print out if the apic is not there */
+ if (!cpu_has_apic && !apic_from_smp_config())
+ return 0;
+
+ print_local_APICs(show_lapic);
+ print_IO_APICs();
+
+ return 0;
+}
+
+late_initcall(print_ICs);
+
+/* Where, if anywhere, is the i8259 connected in ExtINT mode? */
+static struct { int pin, apic; } ioapic_i8259 = { -1, -1 };
+
+void __init enable_IO_APIC(void)
+{
+ int i8259_apic, i8259_pin;
+ int apic;
+
+ if (!legacy_pic->nr_legacy_irqs)
+ return;
+
+	for (apic = 0; apic < nr_ioapics; apic++) {
+ int pin;
+ /* See if any of the pins is in ExtINT mode */
+ for (pin = 0; pin < ioapics[apic].nr_registers; pin++) {
+ struct IO_APIC_route_entry entry;
+ entry = ioapic_read_entry(apic, pin);
+
+			/* If the interrupt line is enabled and in ExtINT mode,
+			 * we have found the pin where the i8259 is connected.
+			 */
+ if ((entry.mask == 0) && (entry.delivery_mode == dest_ExtINT)) {
+ ioapic_i8259.apic = apic;
+ ioapic_i8259.pin = pin;
+ goto found_i8259;
+ }
+ }
+ }
+ found_i8259:
+	/* Look to see if the MP table has reported the ExtINT */
+	/* If we could not find the appropriate pin by looking at the ioapic,
+	 * the i8259 probably is not connected to the ioapic, but give the
+	 * mptable a chance anyway.
+	 */
+ i8259_pin = find_isa_irq_pin(0, mp_ExtINT);
+ i8259_apic = find_isa_irq_apic(0, mp_ExtINT);
+	/* Trust the MP table if nothing is set up in the hardware */
+ if ((ioapic_i8259.pin == -1) && (i8259_pin >= 0)) {
+ printk(KERN_WARNING "ExtINT not setup in hardware but reported by MP table\n");
+ ioapic_i8259.pin = i8259_pin;
+ ioapic_i8259.apic = i8259_apic;
+ }
+ /* Complain if the MP table and the hardware disagree */
+ if (((ioapic_i8259.apic != i8259_apic) || (ioapic_i8259.pin != i8259_pin)) &&
+ (i8259_pin >= 0) && (ioapic_i8259.pin >= 0))
+ {
+ printk(KERN_WARNING "ExtINT in hardware and MP table differ\n");
+ }
+
+ /*
+ * Do not trust the IO-APIC being empty at bootup
+ */
+ clear_IO_APIC();
+}
+
+void native_disable_io_apic(void)
+{
+	/*
+	 * If the i8259 is routed through an IOAPIC, put that IOAPIC in
+	 * virtual wire mode so that legacy interrupts can be delivered.
+	 */
+ if (ioapic_i8259.pin != -1) {
+ struct IO_APIC_route_entry entry;
+
+ memset(&entry, 0, sizeof(entry));
+ entry.mask = 0; /* Enabled */
+ entry.trigger = 0; /* Edge */
+ entry.irr = 0;
+ entry.polarity = 0; /* High */
+ entry.delivery_status = 0;
+ entry.dest_mode = 0; /* Physical */
+ entry.delivery_mode = dest_ExtINT; /* ExtInt */
+ entry.vector = 0;
+ entry.dest = read_apic_id();
+
+ /*
+ * Add it to the IO-APIC irq-routing table:
+ */
+ ioapic_write_entry(ioapic_i8259.apic, ioapic_i8259.pin, entry);
+ }
+
+ if (cpu_has_apic || apic_from_smp_config())
+ disconnect_bsp_APIC(ioapic_i8259.pin != -1);
+}
+
+/*
+ * Not an __init, needed by the reboot code
+ */
+void disable_IO_APIC(void)
+{
+ /*
+ * Clear the IO-APIC before rebooting:
+ */
+ clear_IO_APIC();
+
+ if (!legacy_pic->nr_legacy_irqs)
+ return;
+
+ x86_io_apic_ops.disable();
+}
+
+#ifdef CONFIG_X86_32
+/*
+ * Function to set the IO-APIC physical IDs based on the
+ * values stored in the MPC table.
+ *
+ * by Matt Domsch <Matt_Domsch@dell.com> Tue Dec 21 12:25:05 CST 1999
+ */
+void __init setup_ioapic_ids_from_mpc_nocheck(void)
+{
+ union IO_APIC_reg_00 reg_00;
+ physid_mask_t phys_id_present_map;
+ int ioapic_idx;
+ int i;
+ unsigned char old_id;
+ unsigned long flags;
+
+ /*
+ * This is broken; anything with a real cpu count has to
+ * circumvent this idiocy regardless.
+ */
+ apic->ioapic_phys_id_map(&phys_cpu_present_map, &phys_id_present_map);
+
+ /*
+ * Set the IOAPIC ID to the value stored in the MPC table.
+ */
+ for (ioapic_idx = 0; ioapic_idx < nr_ioapics; ioapic_idx++) {
+ /* Read the register 0 value */
+ raw_spin_lock_irqsave(&ioapic_lock, flags);
+ reg_00.raw = io_apic_read(ioapic_idx, 0);
+ raw_spin_unlock_irqrestore(&ioapic_lock, flags);
+
+ old_id = mpc_ioapic_id(ioapic_idx);
+
+ if (mpc_ioapic_id(ioapic_idx) >= get_physical_broadcast()) {
+ printk(KERN_ERR "BIOS bug, IO-APIC#%d ID is %d in the MPC table!...\n",
+ ioapic_idx, mpc_ioapic_id(ioapic_idx));
+ printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
+ reg_00.bits.ID);
+ ioapics[ioapic_idx].mp_config.apicid = reg_00.bits.ID;
+ }
+
+ /*
+		 * Sanity check: is the ID really free? Every APIC in a
+ * system must have a unique ID or we get lots of nice
+ * 'stuck on smp_invalidate_needed IPI wait' messages.
+ */
+ if (apic->check_apicid_used(&phys_id_present_map,
+ mpc_ioapic_id(ioapic_idx))) {
+ printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n",
+ ioapic_idx, mpc_ioapic_id(ioapic_idx));
+ for (i = 0; i < get_physical_broadcast(); i++)
+ if (!physid_isset(i, phys_id_present_map))
+ break;
+ if (i >= get_physical_broadcast())
+ panic("Max APIC ID exceeded!\n");
+ printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
+ i);
+ physid_set(i, phys_id_present_map);
+ ioapics[ioapic_idx].mp_config.apicid = i;
+ } else {
+ physid_mask_t tmp;
+ apic->apicid_to_cpu_present(mpc_ioapic_id(ioapic_idx),
+ &tmp);
+ apic_printk(APIC_VERBOSE, "Setting %d in the "
+ "phys_id_present_map\n",
+ mpc_ioapic_id(ioapic_idx));
+ physids_or(phys_id_present_map, phys_id_present_map, tmp);
+ }
+
+ /*
+ * We need to adjust the IRQ routing table
+ * if the ID changed.
+ */
+ if (old_id != mpc_ioapic_id(ioapic_idx))
+ for (i = 0; i < mp_irq_entries; i++)
+ if (mp_irqs[i].dstapic == old_id)
+ mp_irqs[i].dstapic
+ = mpc_ioapic_id(ioapic_idx);
+
+ /*
+ * Update the ID register according to the right value
+ * from the MPC table if they are different.
+ */
+ if (mpc_ioapic_id(ioapic_idx) == reg_00.bits.ID)
+ continue;
+
+ apic_printk(APIC_VERBOSE, KERN_INFO
+ "...changing IO-APIC physical APIC ID to %d ...",
+ mpc_ioapic_id(ioapic_idx));
+
+ reg_00.bits.ID = mpc_ioapic_id(ioapic_idx);
+ raw_spin_lock_irqsave(&ioapic_lock, flags);
+ io_apic_write(ioapic_idx, 0, reg_00.raw);
+ raw_spin_unlock_irqrestore(&ioapic_lock, flags);
+
+ /*
+ * Sanity check
+ */
+ raw_spin_lock_irqsave(&ioapic_lock, flags);
+ reg_00.raw = io_apic_read(ioapic_idx, 0);
+ raw_spin_unlock_irqrestore(&ioapic_lock, flags);
+ if (reg_00.bits.ID != mpc_ioapic_id(ioapic_idx))
+ pr_cont("could not set ID!\n");
+ else
+ apic_printk(APIC_VERBOSE, " ok.\n");
+ }
+}
+
+void __init setup_ioapic_ids_from_mpc(void)
+{
+ if (acpi_ioapic)
+ return;
+ /*
+ * Don't check I/O APIC IDs for xAPIC systems. They have
+ * no meaning without the serial APIC bus.
+ */
+ if (!(boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
+ || APIC_XAPIC(apic_version[boot_cpu_physical_apicid]))
+ return;
+ setup_ioapic_ids_from_mpc_nocheck();
+}
+#endif
+
+int no_timer_check __initdata;
+
+static int __init notimercheck(char *s)
+{
+ no_timer_check = 1;
+ return 1;
+}
+__setup("no_timer_check", notimercheck);
+
+/*
+ * There is a nasty bug in some older SMP boards: their mptable lies
+ * about the timer IRQ. We do the following to work around the situation:
+ *
+ * - timer IRQ defaults to IO-APIC IRQ
+ * - if this function detects that timer IRQs are defunct, then we fall
+ * back to ISA timer IRQs
+ */
+static int __init timer_irq_works(void)
+{
+ unsigned long t1 = jiffies;
+ unsigned long flags;
+
+ if (no_timer_check)
+ return 1;
+
+ local_save_flags(flags);
+ local_irq_enable();
+ /* Let ten ticks pass... */
+ mdelay((10 * 1000) / HZ);
+ local_irq_restore(flags);
+
+ /*
+	 * Expect a few ticks at least, to be sure some possible
+	 * glue logic does not lock up after the first one or two
+	 * ticks in a non-ExtINT mode. Also the local APIC
+ * might have cached one ExtINT interrupt. Finally, at
+ * least one tick may be lost due to delays.
+ */
+
+ /* jiffies wrap? */
+ if (time_after(jiffies, t1 + 4))
+ return 1;
+ return 0;
+}
+
+/*
+ * In the SMP+IOAPIC case it might happen that there are an unspecified
+ * number of pending IRQ events unhandled. These cases are very rare,
+ * so we 'resend' these IRQs via IPIs to the same CPU. It's much
+ * better to do it this way, as we then do not have to be aware of
+ * 'pending' interrupts in the IRQ path, except at this point.
+ */
+/*
+ * Edge-triggered IRQs need to resend any interrupt that was
+ * delayed, but this is now handled in the device-independent
+ * code.
+ */
+
+/*
+ * Starting up an edge-triggered IO-APIC interrupt is
+ * nasty - we need to make sure that we get the edge.
+ * If it is already asserted for some reason, we need to
+ * return 1 to indicate that it was pending.
+ *
+ * This is not complete - we should be able to fake
+ * an edge even if it isn't on the 8259A...
+ */
+
+static unsigned int startup_ioapic_irq(struct irq_data *data)
+{
+ int was_pending = 0, irq = data->irq;
+ unsigned long flags;
+
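+	/*
+	 * For legacy IRQs, mask the 8259 first and note whether an
+	 * edge is already pending there before unmasking the IO-APIC
+	 * entry.
+	 */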
+ raw_spin_lock_irqsave(&ioapic_lock, flags);
+ if (irq < legacy_pic->nr_legacy_irqs) {
+ legacy_pic->mask(irq);
+ if (legacy_pic->irq_pending(irq))
+ was_pending = 1;
+ }
+ __unmask_ioapic(data->chip_data);
+ raw_spin_unlock_irqrestore(&ioapic_lock, flags);
+
+ return was_pending;
+}
+
+static int ioapic_retrigger_irq(struct irq_data *data)
+{
+ struct irq_cfg *cfg = data->chip_data;
+ unsigned long flags;
+ int cpu;
+
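+	/*
+	 * Retrigger by sending the irq's own vector as an IPI to one
+	 * online CPU in the irq's destination domain.
+	 */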
+ raw_spin_lock_irqsave(&vector_lock, flags);
+ cpu = cpumask_first_and(cfg->domain, cpu_online_mask);
+ apic->send_IPI_mask(cpumask_of(cpu), cfg->vector);
+ raw_spin_unlock_irqrestore(&vector_lock, flags);
+
+ return 1;
+}
+
+/*
+ * Level and edge triggered IO-APIC interrupts need different handling,
+ * so we use two separate IRQ descriptors. Edge triggered IRQs can be
+ * handled with the level-triggered descriptor, but that one has slightly
+ * more overhead. Level-triggered interrupts cannot be handled with the
+ * edge-triggered handler, without risking IRQ storms and other ugly
+ * races.
+ */
+
+#ifdef CONFIG_SMP
+void send_cleanup_vector(struct irq_cfg *cfg)
+{
+ cpumask_var_t cleanup_mask;
+
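+	/*
+	 * If no temporary cpumask can be allocated, fall back to
+	 * IPI-ing each CPU of the old domain individually.
+	 */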
+ if (unlikely(!alloc_cpumask_var(&cleanup_mask, GFP_ATOMIC))) {
+ unsigned int i;
+ for_each_cpu_and(i, cfg->old_domain, cpu_online_mask)
+ apic->send_IPI_mask(cpumask_of(i), IRQ_MOVE_CLEANUP_VECTOR);
+ } else {
+ cpumask_and(cleanup_mask, cfg->old_domain, cpu_online_mask);
+ apic->send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
+ free_cpumask_var(cleanup_mask);
+ }
+ cfg->move_in_progress = 0;
+}
+
+asmlinkage void smp_irq_move_cleanup_interrupt(void)
+{
+ unsigned vector, me;
+
+ ack_APIC_irq();
+ irq_enter();
+ exit_idle();
+
+ me = smp_processor_id();
+ for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) {
+ unsigned int irq;
+ unsigned int irr;
+ struct irq_desc *desc;
+ struct irq_cfg *cfg;
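+		/*
+		 * vector_irq is the per-cpu vector-to-irq translation
+		 * table; -1 marks a vector unused on this CPU.
+		 */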
+ irq = __this_cpu_read(vector_irq[vector]);
+
+ if (irq == -1)
+ continue;
+
+ desc = irq_to_desc(irq);
+ if (!desc)
+ continue;
+
+ cfg = irq_cfg(irq);
+ if (!cfg)
+ continue;
+
+ raw_spin_lock(&desc->lock);
+
+ /*
+ * Check if the irq migration is in progress. If so, we
+ * haven't received the cleanup request yet for this irq.
+ */
+ if (cfg->move_in_progress)
+ goto unlock;
+
+ if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain))
+ goto unlock;
+
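+		/*
+		 * The IRR is eight 32-bit registers spaced 0x10 apart:
+		 * vector/32 selects the register, vector%32 the bit.
+		 */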
+ irr = apic_read(APIC_IRR + (vector / 32 * 0x10));
+ /*
+		 * Check if the vector that needs to be cleaned up is
+		 * registered in this cpu's IRR. If so, then this is not
+		 * the best time to clean it up. Let's clean it up in the
+		 * next attempt by sending another IRQ_MOVE_CLEANUP_VECTOR
+		 * to myself.
+ */
+ if (irr & (1 << (vector % 32))) {
+ apic->send_IPI_self(IRQ_MOVE_CLEANUP_VECTOR);
+ goto unlock;
+ }
+ __this_cpu_write(vector_irq[vector], -1);
+unlock:
+ raw_spin_unlock(&desc->lock);
+ }
+
+ irq_exit();
+}
+
+static void __irq_complete_move(struct irq_cfg *cfg, unsigned vector)
+{
+ unsigned me;
+
+ if (likely(!cfg->move_in_progress))
+ return;
+
+ me = smp_processor_id();
+
+ if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain))
+ send_cleanup_vector(cfg);
+}
+
+static void irq_complete_move(struct irq_cfg *cfg)
+{
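+	/*
+	 * On interrupt entry the stub stores the inverted vector number
+	 * in orig_ax, so ~orig_ax recovers the vector being serviced.
+	 */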
+ __irq_complete_move(cfg, ~get_irq_regs()->orig_ax);
+}
+
+void irq_force_complete_move(int irq)
+{
+ struct irq_cfg *cfg = irq_get_chip_data(irq);
+
+ if (!cfg)
+ return;
+
+ __irq_complete_move(cfg, cfg->vector);
+}
+#else
+static inline void irq_complete_move(struct irq_cfg *cfg) { }
+#endif
+
+static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq_cfg *cfg)
+{
+ int apic, pin;
+ struct irq_pin_list *entry;
+ u8 vector = cfg->vector;
+
+ for_each_irq_pin(entry, cfg->irq_2_pin) {
+ unsigned int reg;
+
+ apic = entry->apic;
+ pin = entry->pin;
+
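+		/*
+		 * Each redirection entry is 64 bits wide, at register
+		 * indices 0x10 + pin*2 (low dword, vector in bits 0-7)
+		 * and 0x11 + pin*2 (high dword, destination field).
+		 */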
+ io_apic_write(apic, 0x11 + pin*2, dest);
+ reg = io_apic_read(apic, 0x10 + pin*2);
+ reg &= ~IO_APIC_REDIR_VECTOR_MASK;
+ reg |= vector;
+ io_apic_modify(apic, 0x10 + pin*2, reg);
+ }
+}
+
+/*
+ * Either sets data->affinity to a valid value and returns the
+ * ->cpu_mask_to_apicid of it in dest_id, or returns an error and
+ * leaves data->affinity untouched.
+ */
+int __ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask,
+ unsigned int *dest_id)
+{
+ struct irq_cfg *cfg = data->chip_data;
+ unsigned int irq = data->irq;
+ int err;
+
+ if (!config_enabled(CONFIG_SMP))
+ return -1;
+
+ if (!cpumask_intersects(mask, cpu_online_mask))
+ return -EINVAL;
+
+ err = assign_irq_vector(irq, cfg, mask);
+ if (err)
+ return err;
+
+ err = apic->cpu_mask_to_apicid_and(mask, cfg->domain, dest_id);
+ if (err) {
+ if (assign_irq_vector(irq, cfg, data->affinity))
+ pr_err("Failed to recover vector for irq %d\n", irq);
+ return err;
+ }
+
+ cpumask_copy(data->affinity, mask);
+
+ return 0;
+}
+
+
+int native_ioapic_set_affinity(struct irq_data *data,
+ const struct cpumask *mask,
+ bool force)
+{
+ unsigned int dest, irq = data->irq;
+ unsigned long flags;
+ int ret;
+
+ if (!config_enabled(CONFIG_SMP))
+ return -1;
+
+ raw_spin_lock_irqsave(&ioapic_lock, flags);
+ ret = __ioapic_set_affinity(data, mask, &dest);
+ if (!ret) {
+ /* Only the high 8 bits are valid. */
+ dest = SET_APIC_LOGICAL_ID(dest);
+ __target_IO_APIC_irq(irq, dest, data->chip_data);
+ ret = IRQ_SET_MASK_OK_NOCOPY;
+ }
+ raw_spin_unlock_irqrestore(&ioapic_lock, flags);
+ return ret;
+}
+
+static void ack_apic_edge(struct irq_data *data)
+{
+ irq_complete_move(data->chip_data);
+ irq_move_irq(data);
+ ack_APIC_irq();
+}
+
+atomic_t irq_mis_count;
+
+#ifdef CONFIG_GENERIC_PENDING_IRQ
+#ifndef CONFIG_XEN
+static bool io_apic_level_ack_pending(struct irq_cfg *cfg)
+{
+ struct irq_pin_list *entry;
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&ioapic_lock, flags);
+ for_each_irq_pin(entry, cfg->irq_2_pin) {
+ unsigned int reg;
+ int pin;
+
+ pin = entry->pin;
+ reg = io_apic_read(entry->apic, 0x10 + pin*2);
+ /* Is the remote IRR bit set? */
+ if (reg & IO_APIC_REDIR_REMOTE_IRR) {
+ raw_spin_unlock_irqrestore(&ioapic_lock, flags);
+ return true;
+ }
+ }
+ raw_spin_unlock_irqrestore(&ioapic_lock, flags);
+
+ return false;
+}
+#endif
+
+static inline bool ioapic_irqd_mask(struct irq_data *data, struct irq_cfg *cfg)
+{
+	/* If we are moving the irq, we need to mask it */
+ if (unlikely(irqd_is_setaffinity_pending(data))) {
+ mask_ioapic(cfg);
+ return true;
+ }
+ return false;
+}
+
+static inline void ioapic_irqd_unmask(struct irq_data *data,
+ struct irq_cfg *cfg, bool masked)
+{
+ if (unlikely(masked)) {
+ /* Only migrate the irq if the ack has been received.
+ *
+ * On rare occasions the broadcast level triggered ack gets
+ * delayed going to ioapics, and if we reprogram the
+ * vector while Remote IRR is still set the irq will never
+ * fire again.
+ *
+ * To prevent this scenario we read the Remote IRR bit
+ * of the ioapic. This has two effects.
+ * - On any sane system the read of the ioapic will
+ * flush writes (and acks) going to the ioapic from
+ * this cpu.
+ * - We get to see if the ACK has actually been delivered.
+ *
+		 * Based on failed experiments of reprogramming the
+		 * ioapic entry from outside of irq context, starting
+		 * with masking the ioapic entry and then polling until
+		 * Remote IRR was clear before reprogramming the
+		 * ioapic, I don't trust the Remote IRR bit to be
+		 * completely accurate.
+		 *
+		 * However, there appears to be no other way to plug
+		 * this race, so if the Remote IRR bit is not accurate
+		 * and is causing problems, then it is a hardware bug
+		 * and you can go talk to the chipset vendor about it.
+ */
+ if (!io_apic_level_ack_pending(cfg))
+ irq_move_masked_irq(data);
+ unmask_ioapic(cfg);
+ }
+}
+#else
+static inline bool ioapic_irqd_mask(struct irq_data *data, struct irq_cfg *cfg)
+{
+ return false;
+}
+static inline void ioapic_irqd_unmask(struct irq_data *data,
+ struct irq_cfg *cfg, bool masked)